treat 0.1.2 → 0.1.3

Files changed (100)
  1. data/LICENSE +7 -8
  2. data/TODO +16 -13
  3. data/examples/keywords.rb +89 -1
  4. data/lib/treat/buildable.rb +1 -8
  5. data/lib/treat/categories.rb +3 -4
  6. data/lib/treat/category.rb +1 -1
  7. data/lib/treat/delegatable.rb +1 -1
  8. data/lib/treat/detectors/encoding/native.rb +5 -0
  9. data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
  10. data/lib/treat/detectors/language/language_detector.rb +4 -0
  11. data/lib/treat/detectors/language/what_language.rb +4 -4
  12. data/lib/treat/detectors.rb +1 -1
  13. data/lib/treat/entities/entity.rb +5 -3
  14. data/lib/treat/entities/tokens.rb +14 -5
  15. data/lib/treat/entities/zones.rb +4 -0
  16. data/lib/treat/entities.rb +7 -5
  17. data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
  18. data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
  19. data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
  20. data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
  21. data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
  22. data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
  23. data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
  24. data/lib/treat/extractors/time/chronic.rb +8 -0
  25. data/lib/treat/extractors/time/native.rb +6 -0
  26. data/lib/treat/extractors/time/nickel.rb +31 -23
  27. data/lib/treat/extractors/topic_words/lda.rb +21 -16
  28. data/lib/treat/extractors/topics/reuters.rb +6 -4
  29. data/lib/treat/extractors.rb +7 -7
  30. data/lib/treat/formatters/readers/abw.rb +32 -0
  31. data/lib/treat/formatters/readers/autoselect.rb +13 -11
  32. data/lib/treat/formatters/readers/doc.rb +13 -0
  33. data/lib/treat/formatters/readers/gocr.rb +2 -0
  34. data/lib/treat/formatters/readers/html.rb +21 -1
  35. data/lib/treat/formatters/readers/ocropus.rb +3 -3
  36. data/lib/treat/formatters/readers/odt.rb +41 -0
  37. data/lib/treat/formatters/readers/pdf.rb +5 -2
  38. data/lib/treat/formatters/readers/txt.rb +2 -0
  39. data/lib/treat/formatters/serializers/xml.rb +3 -2
  40. data/lib/treat/formatters/serializers/yaml.rb +2 -0
  41. data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
  42. data/lib/treat/formatters/unserializers/xml.rb +6 -1
  43. data/lib/treat/formatters/unserializers/yaml.rb +5 -1
  44. data/lib/treat/formatters/visualizers/dot.rb +35 -37
  45. data/lib/treat/formatters/visualizers/html.rb +1 -0
  46. data/lib/treat/formatters/visualizers/inspect.rb +4 -0
  47. data/lib/treat/formatters/visualizers/short_value.rb +18 -3
  48. data/lib/treat/formatters/visualizers/standoff.rb +11 -6
  49. data/lib/treat/formatters/visualizers/tree.rb +5 -1
  50. data/lib/treat/formatters/visualizers/txt.rb +6 -1
  51. data/lib/treat/formatters.rb +1 -1
  52. data/lib/treat/group.rb +4 -3
  53. data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
  54. data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
  55. data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
  56. data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
  57. data/lib/treat/inflectors/stem/porter.rb +6 -2
  58. data/lib/treat/inflectors/stem/porter_c.rb +4 -1
  59. data/lib/treat/inflectors/stem/uea.rb +4 -4
  60. data/lib/treat/languages/english/tags.rb +16 -0
  61. data/lib/treat/languages/english.rb +4 -1
  62. data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
  63. data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
  64. data/lib/treat/lexicalizers/tag/brill.rb +3 -11
  65. data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
  66. data/lib/treat/lexicalizers.rb +0 -2
  67. data/lib/treat/processors/chunkers/txt.rb +4 -4
  68. data/lib/treat/processors/parsers/enju.rb +3 -17
  69. data/lib/treat/processors/parsers/stanford.rb +4 -0
  70. data/lib/treat/processors/segmenters/punkt.rb +1 -0
  71. data/lib/treat/processors/segmenters/stanford.rb +4 -0
  72. data/lib/treat/processors/segmenters/tactful.rb +4 -1
  73. data/lib/treat/processors/tokenizers/punkt.rb +1 -2
  74. data/lib/treat/processors/tokenizers/stanford.rb +4 -0
  75. data/lib/treat/processors/tokenizers/tactful.rb +1 -1
  76. data/lib/treat/processors.rb +4 -4
  77. data/lib/treat/proxies.rb +18 -11
  78. data/lib/treat/registrable.rb +12 -5
  79. data/lib/treat/sugar.rb +8 -3
  80. data/lib/treat/tree.rb +10 -3
  81. data/lib/treat.rb +55 -55
  82. data/test/tc_entity.rb +7 -7
  83. data/test/tc_extractors.rb +6 -4
  84. data/test/tc_formatters.rb +0 -4
  85. data/test/tests.rb +2 -0
  86. data/test/texts.rb +4 -4
  87. metadata +48 -56
  88. data/examples/texts/bugged_out.txt +0 -26
  89. data/examples/texts/half_cocked_basel.txt +0 -16
  90. data/examples/texts/hedge_funds.txt +0 -24
  91. data/examples/texts/hose_and_dry.txt +0 -19
  92. data/examples/texts/hungarys_troubles.txt +0 -46
  93. data/examples/texts/indias_slowdown.txt +0 -15
  94. data/examples/texts/merkozy_rides_again.txt +0 -24
  95. data/examples/texts/prada_is_not_walmart.txt +0 -9
  96. data/examples/texts/republican_nomination.txt +0 -26
  97. data/examples/texts/to_infinity_and_beyond.txt +0 -15
  98. data/lib/treat/entities/text.rb +0 -7
  99. data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
  100. data/lib/treat/formatters/cleaners/html.rb +0 -17
@@ -29,7 +29,6 @@ module Treat
  extend Group
  self.type = :annotator
  self.targets = [:word, :number]
-
  def self.synonyms(entity, synsets)
  synsets.collect { |ss| ss.synonyms }.flatten - [entity.value]
  end
@@ -42,7 +41,6 @@ module Treat
  def self.hypernyms(entity, synsets)
  synsets.collect { |ss| ss.hypernyms }.flatten
  end
-
  end
  extend Treat::Category
  end
@@ -5,18 +5,18 @@ module Treat
  # zones based on a very naive analysis of the
  # file.
  class Txt
- # Return an array of Zone objects found in the text.
+ # Split a document into Zone objects.
  def self.chunk(text, options = {})
  zones = text.to_s.split("\n")
  zones.each do |zone|
  next if zone.strip == ''
  if false # fix
- text << Entities::List.new(zone)
+ text << Treat::Entities::List.new(zone)
  end
  if zone.length < 60
- text << Entities::Title.new(zone)
+ text << Treat::Entities::Title.new(zone)
  else
- text << Entities::Paragraph.new(zone)
+ text << Treat::Entities::Paragraph.new(zone)
  end
  end
  text
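
Note (illustration, not part of the diff): the chunker hunk above keeps the naive zone rule unchanged: non-empty lines shorter than 60 characters become titles, everything else a paragraph. A standalone sketch of that rule; the classify_zone helper and the sample text are made up for illustration.

    # Mirrors the naive rule used by the Txt chunker above.
    def classify_zone(zone)
      return :skip if zone.strip.empty?
      zone.length < 60 ? :title : :paragraph
    end

    sample = "A Short Title\nThis line is long enough that it comfortably crosses the sixty-character threshold."
    sample.split("\n").each { |zone| puts "#{classify_zone(zone)}: #{zone}" }
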
@@ -20,21 +20,6 @@ module Treat
  @@i = 0
  # Require the Nokogiri XML parser.
  require 'nokogiri'
- # Maps Enju categories to Treat categories.
- CategoryMap = {
- 'ADJ' => :adjective,
- 'ADV' => :adverb,
- 'CONJ' => :conjunction,
- 'COOD' => :conjunction,
- 'C' => :complementizer,
- 'D' => :determiner,
- 'N' => :noun,
- 'P' => :preposition,
- 'PN' => :punctuation,
- 'SC' => :conjunction,
- 'V' => :verb,
- 'PRT' => :particle
- }
  # Return the process running Enju.
  def self.proc
  if @@parsers.size < @@options[:processes]
@@ -55,7 +40,8 @@ module Treat
  text = entity.to_s + '.'
  else
  remove_last = false
- text = entity.to_s.gsub('.', '') + '.' # Fix
+ text = entity.to_s.gsub('.', '')
+ text += '.' unless ['!', '?'].include?(text[-1])
  end
  stdin.puts(text + "\n")
  parsed = build(stdout.gets, remove_last)
@@ -114,7 +100,7 @@ module Treat
  new_attributes[:saturated] = (value[-1] == 'P')
  value = value[0..-2]
  end
- cat = CategoryMap[value]
+ cat = Treat::Languages::English::EnjuCatToCategory[value]
  new_attributes[:cat] = cat
  else
  new_attributes[:enju_cat] = value
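
Note (illustration, not part of the diff): the second Enju hunk replaces the blanket trailing-period fix with a terminator check. A standalone sketch of that logic on made-up strings:

    # Strip periods from the input, then restore a final period only when
    # the text does not already end in '!' or '?'.
    def prepare_for_enju(raw)
      text = raw.to_s.gsub('.', '')
      text += '.' unless ['!', '?'].include?(text[-1])
      text
    end

    puts prepare_for_enju('The U.S. economy grew')  # => "The US economy grew."
    puts prepare_for_enju('Did it really grow?')    # => "Did it really grow?"
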
@@ -1,6 +1,7 @@
  module Treat
  module Processors
  module Parsers
+ # A wrapper class for the Stanford parser.
  class Stanford
  # Require the Ruby-Java bridge.
  silence_warnings { require 'rjb' }
@@ -13,6 +14,7 @@ module Treat
  Rjb::load(jars[0], ['-Xms256M', '-Xmx512M'])
  LexicalizedParser = ::Rjb::import('edu.stanford.nlp.parser.lexparser.LexicalizedParser')
  @@parsers = {}
+ # Parse the entity using the Stanford parser.
  def self.parse(entity, options = {})
  lang = Treat::Languages.describe(entity.language).to_s.upcase
  pcfg = "#{Treat.bin}/stanford-parser*/grammar/#{lang.upcase}PCFG.ser.gz"
@@ -26,6 +28,8 @@ module Treat
  recurse(parse, entity)
  entity
  end
+ # Helper method which recurses the tree supplied by
+ # the Stanford parser.
  def self.recurse(java_node, ruby_node)
  # Leaf
  if java_node.num_children == 0
@@ -21,6 +21,7 @@ module Treat
  # Segment a text using the Punkt segmenter gem.
  #
  # Options:
+ #
  # :training_text => (String) Text to train the segmenter on.
  def self.segment(entity, options = {})
  lang = entity.language
@@ -1,6 +1,8 @@
  module Treat
  module Processors
  module Segmenters
+ # A wrapper for the sentence splitter supplied by
+ # the Stanford parser.
  class Stanford
  # Require the Ruby-Java bridge.
  silence_warnings do
@@ -16,6 +18,8 @@ module Treat
  ::Rjb::import('edu.stanford.nlp.process.DocumentPreprocessor')
  StringReader = ::Rjb::import('java.io.StringReader')
  end
+ # Segment sentences using the sentence splitter supplied by
+ # the Stanford parser.
  def self.segment(entity, options = {})
  sr = StringReader.new(entity.to_s)
  sit = DocumentPreprocessor.new(sr).iterator
@@ -8,7 +8,10 @@ module Treat
  # based on Splitta, but has support for ‘?’ and ‘!’
  # as well as primitive handling of XHTML markup.
  #
- # Project website:
+ # Project website: https://github.com/SlyShy/Tackful-Tokenizer
+ # Original paper: Dan Gillick. 2009. Sentence Boundary Detection
+ # and the Problem with the U.S. University of California, Berkeley.
+ # http://dgillick.com/resource/sbd_naacl_2009.pdf
  class Tactful
  # Require the 'tactful_tokenizer' gem.
  silence_warnings { require 'tactful_tokenizer' }
@@ -26,12 +26,11 @@ module Treat
  ReWordTokenizer = /#{ReMultiCharPunct}|(?=#{ReWordStart})\S+?(?=\s|$|#{ReNonWordChars}|#{ReMultiCharPunct}|,(?=$|\s|#{ReNonWordChars}|#{ReMultiCharPunct}))|\S/
  RePeriodContext = /\S*#{ReSentEndChars}(?=(?<after_tok>#{ReNonWordChars}|\s+(?<next_tok>\S+)))/
  # Tokenize the text using the algorithm lifted from
- # the Punkt tokenizer.
+ # the Punkt tokenizer gem.
  #
  # Options: none.
  def self.tokenize(entity, options = {})
  entity.to_s.scan(ReWordTokenizer).each do |token|
- puts token
  entity << Treat::Entities::Entity.from_string(token)
  end
  entity
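
Note (illustration, not part of the diff): the Punkt-style tokenizer hunk above only drops a stray debugging puts from the scan loop. A simplified sketch of the scan-and-append pattern, using a toy regex instead of the gem's ReWordTokenizer:

    # Toy stand-in for ReWordTokenizer: runs of alphanumerics or single punctuation marks.
    TOY_TOKENIZER = /[[:alnum:]]+|[[:punct:]]/

    tokens = []
    'That is not a sentence.'.scan(TOY_TOKENIZER).each do |token|
      tokens << token  # the gem appends Treat::Entities::Entity.from_string(token) instead
    end
    p tokens  # => ["That", "is", "not", "a", "sentence", "."]
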
@@ -1,6 +1,8 @@
  module Treat
  module Processors
  module Tokenizers
+ # A wrapper for the Stanford parser's Penn-Treebank
+ # style tokenizer.
  class Stanford
  # Require the Ruby-Java bridge.
  silence_warnings do
@@ -18,6 +20,8 @@ module Treat
  CoreLabelTokenFactory = ::Rjb::import('edu.stanford.nlp.process.CoreLabelTokenFactory')
  StringReader = ::Rjb::import('java.io.StringReader')
  end
+ # Tokenize the entity using a Penn-Treebank style tokenizer
+ # included with the Stanford Parser.
  def self.tokenize(entity, options = {})
  ptbt = PTBTokenizer.new(
  StringReader.new(entity.to_s),
@@ -41,7 +41,7 @@ module Treat
  [/([Cc])annot/, '\1an not']
  ]
  # Tokenize the entity using a rule-based algorithm
- # which has been lifted from the 'tactful-tokenizer'
+ # that has been lifted from the 'tactful-tokenizer'
  # gem.
  def self.tokenize(entity, options = {})
  s = entity.to_s
@@ -17,19 +17,19 @@ module Treat
  module Chunkers
  extend Group
  self.type = :transformer
- self.targets = [:document, :text]
+ self.targets = [:document, :zone]
  end
  # Segmenters split a text or zone into sentences.
  module Segmenters
  extend Group
  self.type = :transformer
- self.targets = [:document, :text, :zone]
+ self.targets = [:document, :zone]
  end
  # Tokenizers splits a sentence into Token objects.
  module Tokenizers
  extend Group
  self.type = :transformer
- self.targets = [:document, :text, :zone, :sentence, :constituent]
+ self.targets = [:document, :zone, :sentence, :constituent]
  end
  # Parsers split a sentence into constituent objects
  # representing its syntactic structure, with the
@@ -37,7 +37,7 @@ module Treat
  module Parsers
  extend Group
  self.type = :transformer
- self.targets = [:document, :text, :zone, :sentence, :constituent]
+ self.targets = [:document, :zone, :sentence, :constituent]
  end
  # Makes all the groups autoloadable and creates the delegators.
  extend Treat::Category
data/lib/treat/proxies.rb CHANGED
@@ -1,11 +1,13 @@
  module Treat
- # Proxies install Treat functions on Rubycore classes.
+ # Proxies install Treat functions on core Ruby classes.
  module Proxies
  # The module proxy provides functionanaty common
  # to the different types of proxies.
  module Proxy
+ # Build the entity corresponding to the proxied
+ # object and send the method call to the entity.
  def method_missing(sym, *args, &block)
- if Categories.have_method?(sym)
+ if Treat::Categories.have_method?(sym)
  to_entity.send(sym, *args)
  else
  super(sym, *args, &block)
@@ -16,8 +18,8 @@ module Treat
  end
  end
  # Install Treat functions on String objects.
- module StringProxy
- include Proxy
+ module String
+ include Treat::Proxies::Proxy
  # Save the string to the specified file.
  def save(file)
  File.open(file, 'w') { |f| f.write(self) }
@@ -28,16 +30,21 @@ module Treat
  end
  end
  # Install Treat functions on Numeric objects.
- module NumericProxy
- include Proxy
+ module Numeric
+ include Treat::Proxies::Proxy
  # Return the entity corresponding to the number.
  def to_entity(builder = nil)
  Treat::Entities::Entity.from_numeric(self)
  end
  end
  # Install Treat functions on Array objects.
- module ArrayProxy
- include Proxy
+ module Array
+ include Treat::Proxies::Proxy
+ # The behaviour of this proxy is special:
+ # if a Treat function is called on an array,
+ # the function will be called on each element
+ # of the array and a new array with the
+ # results will be returned.
  def method_missing(sym, *args, &block)
  if Category.has_method?(sym)
  array = []
@@ -59,8 +66,8 @@ module Treat
  end
  end
  # Include the proxies in the core classes.
- String.class_eval { include StringProxy }
- Numeric.class_eval { include NumericProxy }
- Array.class_eval { include ArrayProxy }
+ ::String.class_eval { include Treat::Proxies::String }
+ ::Numeric.class_eval { include Treat::Proxies::Numeric }
+ ::Array.class_eval { include Treat::Proxies::Array }
  end
  end
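
Note (illustration, not part of the diff): with the renamed proxy modules mixed into ::String, ::Numeric and ::Array, any call that Treat::Categories recognises is forwarded to the entity built from the receiver, and on arrays it is mapped over the elements. A hedged usage sketch, assuming the treat gem and a tagger are installed:

    require 'treat'

    # String proxy: forwarded to the entity built from the string
    # (the treat.rb docs note 'fox'.tag is equivalent to w = Word 'fox'; w.tag).
    'fox'.tag

    # Array proxy: the call is made on each element and the results
    # are collected into a new array, per the comment added in this hunk.
    ['fox', 'dog', 'quick'].tag
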
@@ -3,20 +3,27 @@ module Treat
  # Registers a token in the @token_registry
  # hash in the root node.
  def register_token(token)
- if is_root?
+ if is_root? || type == :document
  @token_registry ||= {value: {}, id: {}}
  @token_registry[:id][token.id] = token
- @token_registry[:value][token.value] ||= []
- @token_registry[:value][token.value] << token
+ @token_registry[:value][token.to_s] ||= []
+ @token_registry[:value][token.to_s] << token
+ if has_parent? && type == :document
+ @parent.register_token(token)
+ end
  else
  @parent.register_token(token)
  end
  end
  # Find the token registry, which is
  # always in the root node.
- def token_registry
+ def token_registry(type = nil)
+ if self.type == type
+ @token_registry ||= {value: {}, id: {}}
+ return @token_registry
+ end
  if has_parent?
- @parent.token_registry
+ @parent.token_registry(type)
  else
  @token_registry ||= {value: {}, id: {}}
  @token_registry
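
Note (illustration, not part of the diff): the reworked registry keys tokens by their string form and now also keeps a document-level registry before falling through to the root. A standalone sketch of the registry shape, using a hypothetical FakeToken in place of a real token entity (the gem keys on token.to_s; the sketch uses a value member for brevity):

    # Hypothetical stand-in for a token entity: an id plus a string value.
    FakeToken = Struct.new(:id, :value)

    registry = { value: {}, id: {} }              # same shape as @token_registry
    tok = FakeToken.new(42, 'fox')

    registry[:id][tok.id] = tok                   # lookup by unique id
    (registry[:value][tok.value] ||= []) << tok   # all tokens sharing that string

    p registry[:value]['fox'].map(&:id)           # => [42]
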
data/lib/treat/sugar.rb CHANGED
@@ -1,5 +1,10 @@
  module Treat
+ # This module provides syntactic sugar in the following manner:
+ # all entities found under Treat::Entities will be made
+ # available within the global namespace. For example,
+ # Treat::Entities::Word can now be referred to as simply 'Word'.
  module Sugar
+ # Installs syntactic sugar.
  def edulcorate
  return if @@edulcorated
  @@edulcorated = true
@@ -13,6 +18,7 @@ module Treat
  end
  end
  end
+ # Uninstalls syntactic sugar.
  def unedulcorate
  return unless @@edulcorated
  @@edulcorated = false
@@ -24,14 +30,13 @@ module Treat
  end
  end
  end
- # Whtypeher syntactic sugar is
+ # Boolean - whether syntactic sugar is
  # enabled or not.
  def edulcorated?; @@edulcorated; end
  # Syntactic sugar is disabled by default.
  @@edulcorated = false
-
  private
-
+ # Helper method, yields each entity type and class.
  def each_entity_class
  Treat::Entities.list.each do |entity_type|
  type = :"#{cc(entity_type)}"
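
Note (illustration, not part of the diff): per the new module comment, edulcorate aliases every class under Treat::Entities into the global namespace and unedulcorate removes the aliases again. A hedged usage sketch, assuming the treat gem is installed:

    require 'treat'

    Treat.edulcorate       # Word, Sentence, Paragraph, ... become available at top level
    w = Word 'fox'         # shorthand for the Treat::Entities::Word builder (see the treat.rb docs)
    Treat.unedulcorate     # remove the sugar again
    Treat.edulcorated?     # => false
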
data/lib/treat/tree.rb CHANGED
@@ -68,18 +68,20 @@ module Treat
  end
  nodes[0]
  end
+ # Retrieve a child node by name or index.
  def [](name_or_index)
  if name_or_index == nil
  raise Treat::Exception,
  "Non-nil name or index needs to be provided."
  end
  if name_or_index.kind_of?(Integer) &&
- name_or_index < 1000 # Fix
+ name_or_index < 1000
  @children[name_or_index]
  else
  @children_hash[name_or_index]
  end
  end
+ # Remove the supplied node or id of a node from the children.
  def remove!(ion)
  return nil unless ion
  if ion.is_a? Treat::Tree::Node
@@ -91,6 +93,7 @@ module Treat
  @children_hash.delete(ion)
  end
  end
+ # Remove all children.
  def remove_all!
  @children.each { |child| child.set_as_root! }
  @children.clear
@@ -103,14 +106,18 @@ module Treat
  id = @parent.children.index(self)
  @parent.children.at(id + 1) if id
  end
+ # Return the sibling N positions to the left of this one.
  def left(n = 1); sibling(-1*n); end
+ # Return the sibling N positions to the right of this one.
  def right(n = 1); sibling(1*n); end
+ # Return the sibling with position #pos versus
+ # this one. #pos can be ... -1, 0, 1, ...
  def sibling(pos)
  return nil if is_root?
  id = @parent.children.index(self)
  @parent.children.at(id + pos)
  end
- # There must be a cleaner way to do this.
+ # Return all brothers and sisters of this node.
  def siblings
  r = @parent.children.dup
  r.delete(self)
@@ -133,7 +140,7 @@ module Treat
  # Does the entity have a feature ?
  def has_feature?(feature)
  @features.has_key?(feature) ||
- feature == :value
+ [:id, :value, :children, :edges].include?(feature)
  end
  alias :has? :has_feature?
  # Link this node to the target node with
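
Note (illustration, not part of the diff): the newly documented sibling helpers are thin wrappers over positional lookup among the parent's children. A standalone sketch of that arithmetic on a plain array rather than the gem's Node class:

    # Siblings are just neighbours in the parent's children array.
    children = %w[The quick brown fox jumps]
    id       = children.index('fox')

    sibling  = ->(pos) { children[id + pos] }   # sibling(pos): offset from this node
    left     = sibling.call(-1)                 # left(1)  => "brown"
    right    = sibling.call(1)                  # right(1) => "jumps"
    siblings = children - ['fox']               # all brothers and sisters

    p [left, right, siblings]
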
data/lib/treat.rb CHANGED
@@ -1,51 +1,50 @@
- #
  # Main namespace for Treat modules.
  #
- # 1. Entities
+ # === Entities
  #
- # Entities are Tree structures that represent any textual
- # entity (from a collection of texts down to an individual
- # word) with a value, features, children and edges linking
- # it to other textual entities. Sugar provides syntactic sugar
- # for Entities and can be enabled by running Treat.edulcorate.
+ # Entities are Tree structures that represent any textual
+ # entity (from a collection of texts down to an individual
+ # word) with a value, features, children and edges linking
+ # it to other textual entities. Sugar provides syntactic sugar
+ # for Entities and can be enabled by running Treat.edulcorate.
  #
- # Here are some example of how to create entities:
+ # Here are some example of how to create entities:
  #
- # c = Collection 'folder_with_documents'
- # d = Document 'filename.txt' # (or PDF, html, xml, png, jpg, gif).
- # p = Paragraph 'A short story. The end.'
- # s = Sentence 'That is not a sentence.'
- # w = Word 'fox'
+ # c = Collection 'folder_with_documents'
+ # d = Document 'filename.txt' # (or PDF, html, xml, png, jpg, gif).
+ # p = Paragraph 'A short story. The end.'
+ # s = Sentence 'That is not a sentence.'
+ # w = Word 'fox'
  #
- # Here's a full list of entities (subtypes in parentheses):
- # Collection, Document, Zone (Section, Title, Paragraph or List),
- # Sentence, Constituent (Phrase or Clause), Token (Word, Number,
- # Symbol or Punctuation).
+ # Here's a full list of entities (subtypes in parentheses):
+ # Collection, Document, Zone (Section, Title, Paragraph or List),
+ # Sentence, Constituent (Phrase or Clause), Token (Word, Number,
+ # Symbol or Punctuation).
+ #
+ # === Proxies
  #
- # 2. Proxies
+ # Proxies allow the Treat functions to be called on the core
+ # Ruby classes String, Numeric and Array. They build the entity
+ # corresponding to the supplied raw text and send the requested
+ # function to it.
  #
- # Proxies allow the Treat functions to be called on the core
- # Ruby classes String, Numeric and Array. They build the entity
- # corresponding to the supplied raw text and send the requested
- # function to it.
- #
- # For example,
+ # For example,
  #
- # 'fox'.tag
+ # 'fox'.tag
  #
- # Is equivalent to:
+ # Is equivalent to:
  #
- # w = Word 'fox'
- # w.tag
+ # w = Word 'fox'
+ # w.tag
  #
- # 3. Functions
+ # === Functions
  #
- # A class is defined for each implemented algorithm performing a given
- # task. These classes are clustered into groups of algorithms performing
- # the same given task (Group), and the groups are clustered into Categories
- # of groups performing related tasks.
+ # A class is defined for each implemented algorithm performing a given
+ # task. These classes are clustered into groups of algorithms performing
+ # the same given task (Group), and the groups are clustered into Categories
+ # of groups performing related tasks.
  #
- # Here are the different Categories:
+ # Here are the different Categories:
  #
  # - Detectors - Category for language, encoding, and format
  # detectors.
@@ -60,22 +59,22 @@
  # - Processors - Namespace for algorithms that process collections and
  # documents into trees.
  #
- # 3. Linguistic resources
+ # === Linguistic resources
  #
- # The Languages module contains linguistic information about
- # languages (full ISO-639-1 and 2 language list, tag alignments
- # for three treebanks, word categories, etc.)
+ # The Languages module contains linguistic information about
+ # languages (full ISO-639-1 and 2 language list, tag alignments
+ # for three treebanks, word categories, etc.)
  #
- # 4. Mixins for entities.
+ # === Mixins for entities.
  #
- # Buildable, Delegatable, Visitable and Registrable are
- # or extended by Entity and provide it with the ability to be built,
- # to delegate function calls, to accept visitors and to maintain a
- # token registry, respectively.
+ # Buildable, Delegatable, Visitable and Registrable are
+ # or extended by Entity and provide it with the ability to be built,
+ # to delegate function calls, to accept visitors and to maintain a
+ # token registry, respectively.
  #
- # 5. Exception
+ # === Exception class.
  #
- # Exception defines a custom exception for the Treat module.
+ # Exception defines a custom exception class for the Treat module.
  #
  module Treat

@@ -85,20 +84,20 @@ module Treat
  end

  # The current version of Treat.
- VERSION = "0.1.2"
+ VERSION = "0.1.3"

- # $LOAD_PATH << '/ruby/treat/lib/' # Remove for release
+ # $LOAD_PATH << '/ruby/treat/lib/' # Remove for release

  # Create class variables for the Treat module.
  class << self
- # Default language to use when detect_language is false
+ # Symbol - default language to use when detect_language is false.
  attr_accessor :default_language
- # Default encoding to use.
+ # Symbol - default encoding to use.
  attr_accessor :default_encoding
  # Boolean - detect language or use default?
  attr_accessor :detect_language
- # Identifier - the ideal entity level to detect language at
- # (:entity, :sentence, :zone, :text, :document, klass.)
+ # Symbol - the ideal entity level to detect language at
+ # (e.g., :entity, :sentence, :zone, :section, :document)
  attr_accessor :language_detection_level
  # String - main folder for executable files.
  attr_accessor :bin
@@ -117,13 +116,13 @@ module Treat
  # Turn language detection off by default.
  self.detect_language = false
  # Detect the language once per text by default.
- self.language_detection_level = :text
+ self.language_detection_level = :section
  # Set the lib path to that of this file.
  self.lib = File.dirname(__FILE__)
  # Set the paths to the bin, test and tmp folders.
- self.bin = self.lib + '/../bin/'
- self.test = self.lib + '/../test/'
- self.tmp = self.lib + '/../tmp/'
+ self.bin = self.lib + '/../bin'
+ self.test = self.lib + '/../test'
+ self.tmp = self.lib + '/../tmp'

  # Require modified core classes.
  require 'treat/object'
@@ -137,6 +136,7 @@ module Treat
  require 'treat/proxies'
  require 'treat/sugar'

+ # Make sugar available when needed.
  extend Sugar

  end
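
Note (illustration, not part of the diff): the defaults touched by this release remain plain attr_accessors, so callers can read or override them after requiring the gem. A hedged sketch; the return values shown are the ones set in the hunks above:

    require 'treat'

    Treat::VERSION                   # => "0.1.3" after this bump
    Treat.detect_language            # => false (still the default)
    Treat.language_detection_level   # => :section, the new default
    Treat.bin                        # => "<lib>/../bin", now without the trailing slash
    Treat.detect_language = true     # the accessors above make these writable
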
data/test/tc_entity.rb CHANGED
@@ -2,7 +2,7 @@ module Treat
  module Tests
  class TestEntity < Test::Unit::TestCase
  def setup
- @text = Treat::Entities::Text.new
+ @text = Treat::Entities::Section.new

  @sentence = Treat::Entities::Sentence.new

@@ -14,23 +14,23 @@ module Treat
  @adj_phrase.set :tag, 'ADJP'

  @det = Treat::Entities::Word.new('The')
- @det.set :cat, :determiner
+ @det.set :category, :determiner
  @det.set :tag, 'DT'
  @det.set :tag_set, :penn
  @adj = Treat::Entities::Word.new('lazy')
- @adj.set :cat, :adjective
+ @adj.set :category, :adjective
  @adj.set :tag, 'JJ'
  @adj.set :tag_set, :penn
  @noun = Treat::Entities::Word.new('fox')
- @noun.set :cat, :noun
+ @noun.set :category, :noun
  @noun.set :tag, 'NN'
  @noun.set :tag_set, :penn
  @aux = Treat::Entities::Word.new('is')
- @aux.set :cat, :verb
+ @aux.set :category, :verb
  @aux.set :tag, 'VBZ'
  @aux.set :tag_set, :penn
  @verb = Treat::Entities::Word.new('running')
- @verb.set :cat, :verb
+ @verb.set :category, :verb
  @verb.set :tag, 'VBG'
  @verb.set :tag_set, :penn
  @dot = Treat::Entities::Punctuation.new('.')
@@ -62,7 +62,7 @@ module Treat
  end

  def test_type
- assert_equal :text, @text.type
+ assert_equal :section, @text.type
  end

  def test_printers
@@ -25,9 +25,11 @@ module Treat
  # assert_nothing_raised { @doc.named_entity(:abner) }
  end

- def test_key_sentences
- topics = @doc.topic_words(:lda)
- assert_nothing_raised { @doc.key_sentences(:topics_frequency, topics) }
+ def test_keywords
+ assert_nothing_raised do
+ topics = @doc.topic_words(:lda)
+ @doc.keywords(:topics_frequency, topic_words: topics)
+ end
  end

  def test_topics
@@ -38,7 +40,7 @@ module Treat
  @doc.chunk.segment(:tactful).tokenize

  assert_nothing_raised { @doc.statistics(:frequency_of, value: 'the') }
- assert_nothing_raised { @word.statistics(:frequency) }
+ assert_nothing_raised { @word.statistics(:frequency_in) }
  # assert_nothing_raised { @doc.statistics(:position_in) }
  # assert_nothing_raised { @doc.statistics(:transition_matrix) }
  # assert_nothing_raised { @doc.statistics(:transition_probability) }
@@ -37,10 +37,6 @@ module Treat
  assert_nothing_raised { @sentence.visualize(:standoff) }
  end

- def test_cleaners
- assert_nothing_raised { @html_doc.clean(:html) }
- end
-
  end
  end
  end