treat 0.1.4 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (160) hide show
  1. data/LICENSE +4 -4
  2. data/TODO +21 -54
  3. data/lib/economist/half_cocked_basel.txt +16 -0
  4. data/lib/economist/hose_and_dry.doc +0 -0
  5. data/lib/economist/hungarys_troubles.abw +70 -0
  6. data/lib/economist/republican_nomination.pdf +0 -0
  7. data/lib/economist/saving_the_euro.odt +0 -0
  8. data/lib/economist/to_infinity_and_beyond.txt +15 -0
  9. data/lib/economist/zero_sum.html +91 -0
  10. data/lib/treat.rb +58 -72
  11. data/lib/treat/buildable.rb +59 -15
  12. data/lib/treat/categories.rb +26 -14
  13. data/lib/treat/category.rb +2 -2
  14. data/lib/treat/delegatable.rb +65 -48
  15. data/lib/treat/doable.rb +44 -0
  16. data/lib/treat/entities.rb +34 -14
  17. data/lib/treat/entities/collection.rb +2 -0
  18. data/lib/treat/entities/document.rb +3 -2
  19. data/lib/treat/entities/entity.rb +105 -90
  20. data/lib/treat/entities/phrases.rb +17 -0
  21. data/lib/treat/entities/tokens.rb +28 -13
  22. data/lib/treat/entities/zones.rb +20 -0
  23. data/lib/treat/extractors.rb +49 -11
  24. data/lib/treat/extractors/coreferences/stanford.rb +68 -0
  25. data/lib/treat/extractors/date/chronic.rb +32 -0
  26. data/lib/treat/extractors/date/ruby.rb +25 -0
  27. data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
  28. data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
  29. data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
  30. data/lib/treat/extractors/language/what_language.rb +49 -0
  31. data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
  32. data/lib/treat/extractors/roles/naive.rb +73 -0
  33. data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
  34. data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
  35. data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
  36. data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
  37. data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
  38. data/lib/treat/extractors/time/nickel.rb +30 -12
  39. data/lib/treat/extractors/topic_words/lda.rb +9 -9
  40. data/lib/treat/extractors/topics/reuters.rb +14 -15
  41. data/lib/treat/extractors/topics/reuters/region.xml +1 -0
  42. data/lib/treat/features.rb +7 -0
  43. data/lib/treat/formatters/readers/abw.rb +6 -1
  44. data/lib/treat/formatters/readers/autoselect.rb +5 -6
  45. data/lib/treat/formatters/readers/doc.rb +3 -1
  46. data/lib/treat/formatters/readers/html.rb +1 -1
  47. data/lib/treat/formatters/readers/image.rb +43 -0
  48. data/lib/treat/formatters/readers/odt.rb +1 -2
  49. data/lib/treat/formatters/readers/pdf.rb +9 -1
  50. data/lib/treat/formatters/readers/xml.rb +40 -0
  51. data/lib/treat/formatters/serializers/xml.rb +50 -14
  52. data/lib/treat/formatters/serializers/yaml.rb +7 -2
  53. data/lib/treat/formatters/unserializers/xml.rb +33 -7
  54. data/lib/treat/formatters/visualizers/dot.rb +90 -20
  55. data/lib/treat/formatters/visualizers/short_value.rb +2 -2
  56. data/lib/treat/formatters/visualizers/standoff.rb +2 -2
  57. data/lib/treat/formatters/visualizers/tree.rb +1 -1
  58. data/lib/treat/formatters/visualizers/txt.rb +13 -4
  59. data/lib/treat/group.rb +16 -10
  60. data/lib/treat/helpers/linguistics_loader.rb +18 -0
  61. data/lib/treat/inflectors.rb +10 -0
  62. data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
  63. data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
  64. data/lib/treat/inflectors/declensions/english.rb +319 -0
  65. data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
  66. data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
  67. data/lib/treat/install.rb +59 -0
  68. data/lib/treat/kernel.rb +18 -8
  69. data/lib/treat/languages.rb +18 -11
  70. data/lib/treat/languages/arabic.rb +4 -2
  71. data/lib/treat/languages/chinese.rb +6 -2
  72. data/lib/treat/languages/dutch.rb +16 -0
  73. data/lib/treat/languages/english.rb +47 -19
  74. data/lib/treat/languages/french.rb +8 -5
  75. data/lib/treat/languages/german.rb +9 -6
  76. data/lib/treat/languages/greek.rb +16 -0
  77. data/lib/treat/languages/italian.rb +6 -3
  78. data/lib/treat/languages/polish.rb +16 -0
  79. data/lib/treat/languages/portuguese.rb +16 -0
  80. data/lib/treat/languages/russian.rb +16 -0
  81. data/lib/treat/languages/spanish.rb +16 -0
  82. data/lib/treat/languages/swedish.rb +16 -0
  83. data/lib/treat/languages/tags.rb +377 -0
  84. data/lib/treat/lexicalizers.rb +34 -23
  85. data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
  86. data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
  87. data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
  88. data/lib/treat/lexicalizers/tag/brill.rb +35 -40
  89. data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
  90. data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
  91. data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
  92. data/lib/treat/processors.rb +8 -8
  93. data/lib/treat/processors/chunkers/txt.rb +4 -4
  94. data/lib/treat/processors/parsers/enju.rb +114 -99
  95. data/lib/treat/processors/parsers/stanford.rb +109 -41
  96. data/lib/treat/processors/segmenters/punkt.rb +17 -18
  97. data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
  98. data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
  99. data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
  100. data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
  101. data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
  102. data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
  103. data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
  104. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
  105. data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
  106. data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
  107. data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
  108. data/lib/treat/processors/segmenters/stanford.rb +38 -37
  109. data/lib/treat/processors/segmenters/tactful.rb +5 -4
  110. data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
  111. data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
  112. data/lib/treat/processors/tokenizers/perl.rb +2 -2
  113. data/lib/treat/processors/tokenizers/punkt.rb +6 -2
  114. data/lib/treat/processors/tokenizers/stanford.rb +25 -24
  115. data/lib/treat/processors/tokenizers/tactful.rb +1 -2
  116. data/lib/treat/proxies.rb +2 -35
  117. data/lib/treat/registrable.rb +17 -22
  118. data/lib/treat/sugar.rb +11 -11
  119. data/lib/treat/tree.rb +27 -17
  120. data/lib/treat/viewable.rb +29 -0
  121. data/lib/treat/visitable.rb +1 -1
  122. data/test/tc_entity.rb +56 -49
  123. data/test/tc_extractors.rb +41 -18
  124. data/test/tc_formatters.rb +7 -8
  125. data/test/tc_inflectors.rb +19 -24
  126. data/test/tc_lexicalizers.rb +12 -19
  127. data/test/tc_processors.rb +26 -12
  128. data/test/tc_resources.rb +2 -7
  129. data/test/tc_treat.rb +20 -22
  130. data/test/tc_tree.rb +4 -4
  131. data/test/tests.rb +3 -5
  132. data/test/texts.rb +13 -14
  133. data/tmp/INFO +1 -0
  134. metadata +78 -158
  135. data/bin/INFO +0 -1
  136. data/examples/benchmark.rb +0 -81
  137. data/examples/keywords.rb +0 -148
  138. data/lib/treat/detectors.rb +0 -31
  139. data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
  140. data/lib/treat/detectors/format/file.rb +0 -36
  141. data/lib/treat/detectors/language/what_language.rb +0 -29
  142. data/lib/treat/entities/constituents.rb +0 -15
  143. data/lib/treat/entities/sentence.rb +0 -8
  144. data/lib/treat/extractors/named_entity/abner.rb +0 -20
  145. data/lib/treat/extractors/named_entity/stanford.rb +0 -174
  146. data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
  147. data/lib/treat/extractors/time/chronic.rb +0 -20
  148. data/lib/treat/extractors/time/native.rb +0 -18
  149. data/lib/treat/formatters/readers/gocr.rb +0 -26
  150. data/lib/treat/formatters/readers/ocropus.rb +0 -31
  151. data/lib/treat/formatters/visualizers/html.rb +0 -13
  152. data/lib/treat/formatters/visualizers/inspect.rb +0 -20
  153. data/lib/treat/inflectors/declensions/en.rb +0 -18
  154. data/lib/treat/languages/categories.rb +0 -5
  155. data/lib/treat/languages/english/categories.rb +0 -23
  156. data/lib/treat/languages/english/tags.rb +0 -352
  157. data/lib/treat/languages/xinhua.rb +0 -12
  158. data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
  159. data/lib/treat/string.rb +0 -5
  160. data/test/tc_detectors.rb +0 -26
@@ -6,6 +6,7 @@ module Treat
6
6
  # containing the texts of the collection.
7
7
  def initialize(folder = nil, id = nil)
8
8
  super('', id)
9
+ @type = :collection
9
10
  if folder
10
11
  set :folder, folder
11
12
  Dir.glob("#{folder}/*").each do |f|
@@ -14,6 +15,7 @@ module Treat
14
15
  end
15
16
  end
16
17
  end
18
+ def type; :collection; end
17
19
  end
18
20
  end
19
21
  end
@@ -2,9 +2,10 @@ module Treat
2
2
  module Entities
3
3
  # Represents a document.
4
4
  class Document < Entity
5
- def initialize(file, id = nil)
5
+ def initialize(file = nil, id = nil)
6
6
  super('', id)
7
- set :file, file
7
+ set :file, file if file
8
+ @type = :document
8
9
  end
9
10
  end
10
11
  end
@@ -4,27 +4,27 @@ require 'treat/delegatable'
4
4
  require 'treat/visitable'
5
5
  require 'treat/registrable'
6
6
  require 'treat/buildable'
7
+ require 'treat/doable'
8
+ require 'treat/viewable'
9
+ require 'treat/features'
7
10
 
8
11
  module Treat
9
12
  module Entities
10
13
  class Entity < Tree::Node
14
+ # A Symbol representing the lowercase version of the class name.
15
+ attr_accessor :type
11
16
  # Implements support for #register
12
17
  include Registrable
13
18
  # Implement support for #accept.
14
19
  include Visitable
15
- # Implement support for #self.add_delegators
20
+ # Implement support for #self.add_workers
16
21
  extend Delegatable
17
22
  # Implement support for #self.from_*
18
23
  extend Buildable
19
- # Initialize the document with its filename.
20
- # Optionally specify a reader to read the file.
21
- # If +read+ is set to false, the document will
22
- # not be read automatically; in that case, the
23
- # method #read must be called on the document
24
- # object to load it in.
25
- def self.build(file_or_value = '', id = nil)
26
- from_anything(file_or_value, id)
27
- end
24
+ # Implement support for #do.
25
+ include Doable
26
+ # Implement support for to_s, inspect, etc.
27
+ include Viewable
28
28
  # Initialize the entity with its value and
29
29
  # (optionally) a unique identifier. By default,
30
30
  # the object_id will be used as id. Also initialize
@@ -32,12 +32,10 @@ module Treat
32
32
  def initialize(value = '', id = nil)
33
33
  id ||= object_id
34
34
  super(value, id)
35
+ @type = :entity
35
36
  end
36
- # Return a lowercase identifier representing the
37
- # type of entity (e.g. :word, :token, etc.)
38
- def type; :"#{cl(self.class).downcase}"; end
39
37
  # Catch missing methods to support method-like
40
- # access to features (e.g. entity.cat instead of
38
+ # access to features (e.g. entity.category instead of
41
39
  # entity.features[:cat]) and to support magic
42
40
  # methods (see #parse_magic_method). If the
43
41
  # feature does not exist
@@ -49,8 +47,8 @@ module Treat
49
47
  begin
50
48
  super(sym, *args, &block)
51
49
  rescue NoMethodError
52
- # Check...
53
- if Categories.have_method?(sym)
50
+ return false if sym.to_s[-1] == '?'
51
+ if Categories.lookup(sym)
54
52
  msg = "Method #{sym} cannot be called on a #{type}."
55
53
  else
56
54
  msg = "Method #{sym} does not exist."
@@ -87,77 +85,81 @@ module Treat
87
85
  # - nouns_with_*(value)
88
86
  # - noun_with_*(value)
89
87
  #
90
- # Note that repetition of code in this method
91
- # (instead of method chaining) is intentional
92
- # and aims to reduce the number of method
93
- # dispatches done by Ruby to improve performance.
94
- def parse_magic_method(sym, *args, &block)
88
+ def parse_magic_method(sym, *args)
95
89
  @@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
96
- @@cats_regexp ||= "(#{Treat::Languages::English::Categories.join('|')})"
90
+ @@cats_regexp ||= "(#{Treat::Languages::WordCategories.join('|')})"
97
91
  method = sym.to_s =~ /entities/ ?
98
- sym.to_s.gsub('entities', 'entitys'):
92
+ sym.to_s.gsub('entities', 'entitys') :
99
93
  method = sym.to_s
100
- a = []
101
- if method =~ /^parent_#{@@entities_regexp}$/ # Optimize all
102
- self.class.send(:define_method, "parent_#{$1}") do
103
- ancestor_with_types(:"#{$1}")
104
- end
105
- ancestor_with_types(:"#{$1}")
106
- elsif method =~ /^each_#{@@entities_regexp}$/
107
- each_entity(:"#{$1}") { |entity| yield entity }
108
- elsif method =~ /^#{@@entities_regexp}s$/
109
- each_entity(:"#{$1}") { |e| a << e }
94
+ if method =~ /^#{@@entities_regexp}s$/
95
+ a = []
96
+ each_entity($1.intern) { |e| a << e }
110
97
  a
111
98
  elsif method =~ /^#{@@entities_regexp}$/
112
- each_entity(:"#{$1}") { |e| a << e }
99
+ a = []
100
+ each_entity($1.intern) { |e| a << e }
113
101
  first_but_warn(a, $1)
102
+ elsif method =~ /^parent_#{@@entities_regexp}$/
103
+ ancestor_with_types($1.intern)
104
+ elsif method =~ /^each_#{@@entities_regexp}$/
105
+ each_entity($1.intern) { |e| yield e }
114
106
  elsif method =~ /^#{@@entities_regexp}_count$/
115
107
  i = 0
116
- each_entity(:"#{$1}") { |e| i += 1 }
108
+ each_entity($1.intern) { |e| i += 1 }
117
109
  i
118
- elsif method =~ /^#{@@entities_regexp}s_with_([a-z]*)$/
119
- each_entity(:"#{$1}") do |e|
120
- a << e if e.has?(:"#{$2}") &&
121
- e.send(:"#{$2}") == args[0]
110
+ elsif method =~ /^#{@@entities_regexp}s_with_([a-z]+)$/
111
+ a = []
112
+ each_entity($1.intern) do |e|
113
+ a << e if e.has?($2.intern) &&
114
+ e.send($2.intern) == args[0]
122
115
  end
123
116
  a
124
117
  elsif method =~ /^#{@@entities_regexp}s_with_([a-z]*)$/
125
- each_entity(:"#{$1}") do |e|
126
- a << e if e.has?(:"#{$2}") &&
127
- e.send(:"#{$2}") == args[0]
118
+ a = []
119
+ each_entity($1.intern) do |e|
120
+ a << e if e.has?($2.intern) &&
121
+ e.send($2.intern) == args[0]
128
122
  end
129
123
  first_but_warn(a, $1)
130
124
  elsif method =~ /^each_with_([a-z]*)$/
131
125
  each_entity do |e|
132
- yield e if e.has?(:"#{$2}") &&
133
- e.send(:"#{$2}") == args[0]
126
+ yield e if e.has?($1.intern) &&
127
+ e.send($1.intern) == args[0]
134
128
  end
135
129
  elsif method =~ /^each_#{@@cats_regexp}$/
136
- each_entity(:word) { |e| yield e if e.cat == :"#{$1}" }
130
+ each_entity(:word) { |e| yield e if e.category == $1.intern }
137
131
  elsif method =~ /^#{@@cats_regexp}s$/
138
- each_entity(:word) { |e| a << e if e.cat == :"#{$1}" }
132
+ a = []
133
+ each_entity(:word) { |e| a << e if e.category == $1.intern }
139
134
  a
140
135
  elsif method =~ /^#{@@cats_regexp}$/
141
- each_entity(:word) { |e| a << e if e.cat == :"#{$1}" }
136
+ a = []
137
+ each_entity(:word) { |e| a << e if e.category == $1.intern }
142
138
  first_but_warn(a, $1)
143
139
  elsif method =~ /^#{@@cats_regexp}_count$/
144
140
  i = 0
145
- each_entity(:word) { |e| i += 1 if e.cat == :"#{$1}" }
141
+ each_entity(:word) { |e| i += 1 if e.category == $1.intern }
146
142
  i
147
143
  elsif method =~ /^#{@@cats_regexp}s_with_([a-z]*)$/
144
+ a = []
148
145
  each_entity(:word) do |e|
149
- a << e if e.cat == :"#{$1}" &&
150
- e.has?(:"#{$2}") && e.send(:"#{$2}") == args[0]
146
+ a << e if e.category == $1.intern &&
147
+ e.has?($2.intern) && e.send($2.intern) == args[0]
151
148
  end
152
149
  a
153
150
  elsif method =~ /^#{@@cats_regexp}_with_([a-z]*)$/
151
+ a = []
154
152
  each_entity(:word) do |e|
155
- a << e if e.cat == :"#{$1}" &&
156
- e.has?(:"#{$2}") && e.send(:"#{$2}") == args[0]
153
+ a << e if e.category== $1.intern &&
154
+ e.has?($2.intern) && e.send($2.intern) == args[0]
157
155
  end
158
156
  first_but_warn(a, $1)
157
+ elsif method =~ /^is_#{@@entities_regexp}\?$/
158
+ type.to_s == $1
159
+ elsif method =~ /^is_#{@@cats_regexp}\?$/
160
+ category.to_s == $1
159
161
  else
160
- :no_magic
162
+ return :no_magic
161
163
  end
162
164
  end
163
165
  # Add an entity to the current entity.
@@ -168,9 +170,9 @@ module Treat
168
170
  def <<(entities, clear_parent = true)
169
171
  entities = [entities] unless entities.is_a? Array
170
172
  entities.each do |entity|
171
- if entity.is_a?(Treat::Entities::Token) ||
172
- entity.is_a?(Treat::Entities::Constituent)
173
- register_token(entity) unless entity.value == ''
173
+ if entity.is_a?(Treat::Entities::Token) ||
174
+ entity.is_a?(Treat::Entities::Phrase)
175
+ register_token(entity) unless entity.value == ''
174
176
  end
175
177
  end
176
178
  super(entities)
@@ -182,39 +184,64 @@ module Treat
182
184
  # Note that this function is recursive, unlike
183
185
  # #each. It does not yield the top element being
184
186
  # recursed.
187
+ #
188
+ # This function NEEDS to be ported to C (see source).
185
189
  def each_entity(*types)
186
- yield self if match_types(self, types)
187
- if has_children?
190
+ =begin
191
+ # Replace with:
192
+ inline do |builder|
193
+
194
+ builder.c_raw <<-EOS, :arity => -1
195
+ VALUE each_entity_c(int argc, VALUE *types, VALUE self)
196
+ {
197
+
198
+ }
199
+ EOS
200
+ end
201
+ =end
202
+ types = [:entity] if types.size == 0
203
+ f = false
204
+ types.each { |t2| f = true if Treat::Entities.match_types[type][t2] }
205
+ yield self if f
206
+ unless @children.size == 0
188
207
  @children.each do |child|
189
208
  child.each_entity(*types) { |y| yield y }
190
209
  end
191
210
  end
192
211
  end
193
- # Returns the first ancestor of this
194
- # entity that has the given type.
212
+ # Returns the first ancestor of this entity that has the given type.
195
213
  def ancestor_with_types(*types)
196
214
  ancestor = @parent
197
- while not match_types(ancestor, types)
198
- return nil unless ancestor.has_parent?
199
- ancestor = ancestor.parent
215
+ match_types = lambda do |t1, t2s|
216
+ f = false
217
+ t2s.each do |t2|
218
+ if Treat::Entities.match_types[t1][t2]
219
+ f = true; break
220
+ end
221
+ end
222
+ f
223
+ end
224
+ if ancestor
225
+ while not match_types.call(ancestor.type, types)
226
+ return nil unless (ancestor && ancestor.has_parent?)
227
+ ancestor = ancestor.parent
228
+ end
229
+ match_types.call(ancestor.type, types) ? ancestor : nil
200
230
  end
201
- match_types(ancestor, types) ? ancestor : nil
202
231
  end
203
232
  alias :ancestor_with_type :ancestor_with_types
204
- # Return the entity's string value in plain text format.
205
- def to_string; @value; end
206
- # An alias for #to_string.
207
- def to_s; visualize(:txt); end
208
- alias :to_str :to_s
209
- # Return an informative string representation of the entity.
210
- def inspect; visualize(:inspect); end
211
- # Print out an ASCII representation of the tree.
212
- def print_tree; puts visualize(:tree); end
213
- # Return a shortened value of the entity's string value using [...].
214
- def short_value(ml = 6); visualize(:short_value, :max_length => ml); end
215
- # Convenience functions. Convenience decorators.
216
- def frequency_of(word); statistics(:frequency_of, value: word); end
217
- private
233
+ # Returns the (direct) ancestors of this entity that
234
+ # have the given type.
235
+ def ancestors_with_types(*types)
236
+ ancestor = self
237
+ ancestors = []
238
+ while (a = ancestor.ancestor_with_types(*types))
239
+ ancestors << a
240
+ ancestor = ancestor.parent
241
+ end
242
+ ancestors
243
+ end
244
+ alias :ancestors_with_type :ancestors_with_types
218
245
  # Return the first element in the array, warning if not
219
246
  # the only one in the array. Used for magic methods: e.g.,
220
247
  # the magic method "word" if called on a sentence
@@ -227,18 +254,6 @@ module Treat
227
254
  end
228
255
  array[0]
229
256
  end
230
- # Cache a list of the type => class relationships.
231
- @@type_classes = {}
232
- # Returns true if the node is of the same type or
233
- # is a subtype of one of the specified entity types,
234
- # which are supplied as identifiers rather than classes.
235
- def match_types(node, entity_types)
236
- entity_types.each do |type|
237
- @@type_classes[type] ||= Entities.const_get(cc(type))
238
- return true if node.is_a? @@type_classes[type]
239
- end
240
- false
241
- end
242
257
  end
243
258
  end
244
259
  end
@@ -0,0 +1,17 @@
1
+ module Treat
2
+ module Entities
3
+ # Represents any syntactic phrase of a sentence.
4
+ class Phrase < Entity
5
+ def initialize(value = '', id = nil)
6
+ super(value, id)
7
+ @type = :phrase
8
+ end
9
+ end
10
+ class Sentence < Phrase
11
+ def initialize(value = '', id = nil)
12
+ super(value, id)
13
+ @type = :sentence
14
+ end
15
+ end
16
+ end
17
+ end
@@ -4,25 +4,24 @@ module Treat
4
4
  class Token < Entity
5
5
  # All tokens are leaves.
6
6
  def is_leaf?; true; end
7
- # Convenience function for statistics.
8
- def frequency; statistics(:frequency_in); end
9
- def frequency_in(type); statistics(:frequency_in, type: type); end
10
- def position_in(type); statistics(:position_in_parent); end
11
- def tf_idf; statistics(:tf_idf); end
7
+ def initialize(value = '', id = nil)
8
+ super(value, id)
9
+ @type = :token
10
+ end
12
11
  end
13
12
  # Represents a word.
14
13
  class Word < Token
15
- # Convenience function for conjugations.
16
- def infinitive(conjugator = nil); conjugations(conjugator, :mode => :infinitive); end
17
- # Convenience function for conjugations.
18
- def present_participle(conjugator = nil); conjugations(conjugator, :tense => :present, :mode => :participle); end
19
- # Convenience function for declensions.
20
- def plural(declensor = nil); declensions(declensor, :count => :plural); end
21
- # Convenience function for declensions.
22
- def singular(declensor = nil); declensions(declensor, :count => :singular); end
14
+ def initialize(value = '', id = nil)
15
+ super(value, id)
16
+ @type = :word
17
+ end
23
18
  end
24
19
  # Represents a clitic ('s).
25
20
  class Clitic < Token
21
+ def initialize(value = '', id = nil)
22
+ super(value, id)
23
+ @type = :clitic
24
+ end
26
25
  end
27
26
  # Represents a number.
28
27
  class Number < Token
@@ -30,17 +29,33 @@ module Treat
30
29
  def to_i; to_s.to_i; end
31
30
  # Convert the number to a float.
32
31
  def to_f; to_s.to_f; end
32
+ def initialize(value = '', id = nil)
33
+ super(value, id)
34
+ @type = :number
35
+ end
33
36
  end
34
37
  # Represents a punctuation sign.
35
38
  class Punctuation < Token
39
+ def initialize(value = '', id = nil)
40
+ super(value, id)
41
+ @type = :punctuation
42
+ end
36
43
  end
37
44
  # Represents a character that is neither
38
45
  # alphabetical, numerical nor a punctuation
39
46
  # character (e.g. @#$%&*).
40
47
  class Symbol < Token
48
+ def initialize(value = '', id = nil)
49
+ super(value, id)
50
+ @type = :symbol
51
+ end
41
52
  end
42
53
  # Represents an entity of unknown type.
43
54
  class Unknown < Token
55
+ def initialize(value = '', id = nil)
56
+ super(value, id)
57
+ @type = :unknown
58
+ end
44
59
  end
45
60
  end
46
61
  end
@@ -3,19 +3,39 @@ module Treat
3
3
  # Represents a zone of text
4
4
  # (Title, Paragraph, List, Quote).
5
5
  class Zone < Entity
6
+ def initialize(value = '', id = nil)
7
+ super(value, id)
8
+ @type = :zone
9
+ end
6
10
  end
7
11
  # Represents a title, subtitle, logical header.
8
12
  class Title < Zone
13
+ def initialize(value = '', id = nil)
14
+ super(value, id)
15
+ @type = :title
16
+ end
9
17
  end
10
18
  # Represents a paragraph.
11
19
  class Paragraph < Zone
20
+ def initialize(value = '', id = nil)
21
+ super(value, id)
22
+ @type = :paragraph
23
+ end
12
24
  end
13
25
  # Represents a list.
14
26
  class List < Zone
27
+ def initialize(value = '', id = nil)
28
+ super(value, id)
29
+ @type = :list
30
+ end
15
31
  end
16
32
  # Represents a section, usually with a title
17
33
  # and at least one paragraph.
18
34
  class Section < Zone
35
+ def initialize(value = '', id = nil)
36
+ super(value, id)
37
+ @type = :section
38
+ end
19
39
  end
20
40
  end
21
41
  end