treat 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (147) hide show
  1. data/INSTALL +0 -0
  2. data/LICENSE +28 -0
  3. data/README +0 -0
  4. data/TODO +67 -0
  5. data/bin/INFO +1 -0
  6. data/examples/benchmark.rb +81 -0
  7. data/examples/keywords.rb +60 -0
  8. data/examples/texts/bugged_out.txt +26 -0
  9. data/examples/texts/half_cocked_basel.txt +16 -0
  10. data/examples/texts/hedge_funds.txt +24 -0
  11. data/examples/texts/hose_and_dry.txt +19 -0
  12. data/examples/texts/hungarys_troubles.txt +46 -0
  13. data/examples/texts/indias_slowdown.txt +15 -0
  14. data/examples/texts/merkozy_rides_again.txt +24 -0
  15. data/examples/texts/prada_is_not_walmart.txt +9 -0
  16. data/examples/texts/republican_nomination.txt +26 -0
  17. data/examples/texts/to_infinity_and_beyond.txt +15 -0
  18. data/lib/treat.rb +91 -0
  19. data/lib/treat/buildable.rb +115 -0
  20. data/lib/treat/categories.rb +29 -0
  21. data/lib/treat/category.rb +28 -0
  22. data/lib/treat/delegatable.rb +90 -0
  23. data/lib/treat/detectors.rb +28 -0
  24. data/lib/treat/detectors/encoding/native.rb +12 -0
  25. data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
  26. data/lib/treat/detectors/format/file.rb +36 -0
  27. data/lib/treat/detectors/language/language_detector.rb +19 -0
  28. data/lib/treat/detectors/language/what_language.rb +29 -0
  29. data/lib/treat/entities.rb +52 -0
  30. data/lib/treat/entities/collection.rb +19 -0
  31. data/lib/treat/entities/constituents.rb +15 -0
  32. data/lib/treat/entities/document.rb +11 -0
  33. data/lib/treat/entities/entity.rb +242 -0
  34. data/lib/treat/entities/sentence.rb +8 -0
  35. data/lib/treat/entities/text.rb +7 -0
  36. data/lib/treat/entities/tokens.rb +37 -0
  37. data/lib/treat/entities/zones.rb +17 -0
  38. data/lib/treat/exception.rb +5 -0
  39. data/lib/treat/extractors.rb +41 -0
  40. data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
  41. data/lib/treat/extractors/named_entity/abner.rb +20 -0
  42. data/lib/treat/extractors/named_entity/stanford.rb +174 -0
  43. data/lib/treat/extractors/statistics/frequency.rb +22 -0
  44. data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
  45. data/lib/treat/extractors/statistics/position_in.rb +13 -0
  46. data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
  47. data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
  48. data/lib/treat/extractors/time/chronic.rb +12 -0
  49. data/lib/treat/extractors/time/native.rb +12 -0
  50. data/lib/treat/extractors/time/nickel.rb +45 -0
  51. data/lib/treat/extractors/topic_words/lda.rb +71 -0
  52. data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
  53. data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
  54. data/lib/treat/extractors/topics/reuters.rb +91 -0
  55. data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
  56. data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
  57. data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
  58. data/lib/treat/feature.rb +53 -0
  59. data/lib/treat/formatters.rb +44 -0
  60. data/lib/treat/formatters/cleaners/html.rb +17 -0
  61. data/lib/treat/formatters/readers/autoselect.rb +35 -0
  62. data/lib/treat/formatters/readers/gocr.rb +24 -0
  63. data/lib/treat/formatters/readers/html.rb +13 -0
  64. data/lib/treat/formatters/readers/ocropus.rb +31 -0
  65. data/lib/treat/formatters/readers/pdf.rb +17 -0
  66. data/lib/treat/formatters/readers/txt.rb +15 -0
  67. data/lib/treat/formatters/serializers/xml.rb +48 -0
  68. data/lib/treat/formatters/serializers/yaml.rb +15 -0
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
  70. data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
  71. data/lib/treat/formatters/unserializers/xml.rb +79 -0
  72. data/lib/treat/formatters/unserializers/yaml.rb +15 -0
  73. data/lib/treat/formatters/visualizers/dot.rb +73 -0
  74. data/lib/treat/formatters/visualizers/html.rb +12 -0
  75. data/lib/treat/formatters/visualizers/inspect.rb +16 -0
  76. data/lib/treat/formatters/visualizers/short_value.rb +14 -0
  77. data/lib/treat/formatters/visualizers/standoff.rb +41 -0
  78. data/lib/treat/formatters/visualizers/tree.rb +28 -0
  79. data/lib/treat/formatters/visualizers/txt.rb +31 -0
  80. data/lib/treat/group.rb +96 -0
  81. data/lib/treat/inflectors.rb +50 -0
  82. data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
  83. data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
  84. data/lib/treat/inflectors/declensors/en.rb +18 -0
  85. data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
  86. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
  87. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
  88. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
  89. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
  90. data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
  91. data/lib/treat/inflectors/stemmers/porter.rb +158 -0
  92. data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
  93. data/lib/treat/inflectors/stemmers/uea.rb +30 -0
  94. data/lib/treat/lexicalizers.rb +49 -0
  95. data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
  96. data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
  97. data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
  98. data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
  99. data/lib/treat/lexicalizers/tag/brill.rb +101 -0
  100. data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
  101. data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
  102. data/lib/treat/processors.rb +45 -0
  103. data/lib/treat/processors/chunkers/txt.rb +27 -0
  104. data/lib/treat/processors/parsers/enju.rb +214 -0
  105. data/lib/treat/processors/parsers/stanford.rb +60 -0
  106. data/lib/treat/processors/segmenters/punkt.rb +48 -0
  107. data/lib/treat/processors/segmenters/stanford.rb +45 -0
  108. data/lib/treat/processors/segmenters/tactful.rb +34 -0
  109. data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
  110. data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
  111. data/lib/treat/processors/tokenizers/perl.rb +96 -0
  112. data/lib/treat/processors/tokenizers/punkt.rb +42 -0
  113. data/lib/treat/processors/tokenizers/stanford.rb +33 -0
  114. data/lib/treat/processors/tokenizers/tactful.rb +59 -0
  115. data/lib/treat/proxies.rb +66 -0
  116. data/lib/treat/registrable.rb +26 -0
  117. data/lib/treat/resources.rb +10 -0
  118. data/lib/treat/resources/categories.rb +18 -0
  119. data/lib/treat/resources/delegates.rb +96 -0
  120. data/lib/treat/resources/dependencies.rb +0 -0
  121. data/lib/treat/resources/edges.rb +8 -0
  122. data/lib/treat/resources/formats.rb +23 -0
  123. data/lib/treat/resources/languages.rb +86 -0
  124. data/lib/treat/resources/languages.txt +504 -0
  125. data/lib/treat/resources/tags.rb +393 -0
  126. data/lib/treat/sugar.rb +43 -0
  127. data/lib/treat/tree.rb +174 -0
  128. data/lib/treat/utilities.rb +127 -0
  129. data/lib/treat/visitable.rb +27 -0
  130. data/test/profile.rb +2 -0
  131. data/test/tc_detectors.rb +27 -0
  132. data/test/tc_entity.rb +105 -0
  133. data/test/tc_extractors.rb +48 -0
  134. data/test/tc_formatters.rb +46 -0
  135. data/test/tc_inflectors.rb +39 -0
  136. data/test/tc_lexicalizers.rb +39 -0
  137. data/test/tc_processors.rb +36 -0
  138. data/test/tc_resources.rb +27 -0
  139. data/test/tc_treat.rb +64 -0
  140. data/test/tc_tree.rb +60 -0
  141. data/test/tests.rb +19 -0
  142. data/test/texts.rb +20 -0
  143. data/test/texts/english/long.html +24 -0
  144. data/test/texts/english/long.txt +22 -0
  145. data/test/texts/english/medium.txt +5 -0
  146. data/test/texts/english/short.txt +3 -0
  147. metadata +412 -0
@@ -0,0 +1,19 @@
module Treat
  module Detectors
    module Language
      # Base class for the language detectors. It implements the
      # shortcuts that let a detector skip its own analysis.
      class LanguageDetector
        # Try to determine the language of an entity without running
        # any actual detection.
        #
        # - When Treat.detect_language is exactly false, return
        #   Treat.default_language.
        # - Otherwise, when the entity ranks below
        #   Treat.language_detection_level and has a parent, return the
        #   language of its closest ancestor of that level.
        # - In every other case return nil, which callers can use as a
        #   signal to run their own detection.
        #
        # The +options+ hash is accepted for signature compatibility
        # with subclasses and is not used here.
        def self.language(entity, options = {})
          return Treat.default_language if Treat.detect_language == false
          dlvl = Treat.language_detection_level
          if (Entities.rank(entity.type) < Entities.rank(dlvl)) &&
             entity.has_parent?
            # The entity may sit in a partial tree that has a parent
            # but no ancestor of the detection level; do not call
            # #language on nil in that case.
            ancestor = entity.ancestor_with_type(dlvl)
            return ancestor.language if ancestor
          end
          nil
        end
      end
    end
  end
end
@@ -0,0 +1,29 @@
module Treat
  module Detectors
    module Language
      # Load the 'whatlanguage' gem.
      silently { require 'whatlanguage' }
      # Adaptor around the 'whatlanguage' gem.
      class WhatLanguage < LanguageDetector
        # Single shared instance of the gem's detector class,
        # created on first use.
        @@wl = nil
        # Detect the language of an entity. The result inherited from
        # LanguageDetector is returned when there is one; otherwise the
        # text of the entity is handed to the gem, every key of its
        # result is mapped through Treat::Resources::Languages.find and
        # the best entry of the resulting Treat::Feature is returned.
        def self.language(entity, options = {})
          inherited = super(entity, options)
          return inherited if inherited
          @@wl ||= ::WhatLanguage.new(:all)
          scores = @@wl.process_text(entity.to_s)
          by_language = {}
          scores.each do |name, score|
            language = Treat::Resources::Languages.find(name)
            by_language[language] = score
          end
          Treat::Feature.new(by_language).best
        end
      end
    end
  end
end
@@ -0,0 +1,52 @@
module Treat
  # Namespace for the classes that represent textual entities:
  #
  # - Collection
  # - Document
  # - Text
  # - Zone (a Section, Title, Paragraph, or List)
  # - Sentence
  # - Constituent (a Phrase or Clause)
  # - Token (a Word, Number, Punctuation, or Symbol).
  module Entities
    # Entity has to be loaded first because every
    # other entity class inherits from it.
    require 'treat/entities/entity'
    require 'treat/entities/collection'
    require 'treat/entities/document'
    require 'treat/entities/text'
    require 'treat/entities/zones'
    require 'treat/entities/sentence'
    require 'treat/entities/constituents'
    require 'treat/entities/tokens'
    # For each constant defined so far, add a module-level method
    # of the same name that forwards to the constant's .build.
    constants.each do |entity|
      define_singleton_method(entity) do |value='', id=nil|
        const_get(entity).build(value, id)
      end
    end
    # Memoized list of entity type identifiers.
    @@list = []
    # Return the identifiers of all constants of this module,
    # each converted with ucc. The list is computed once.
    def self.list
      if @@list.empty?
        constants.each { |name| @@list << :"#{ucc(name)}" }
      end
      @@list
    end
    # Return the 'z-order' used to compare entity types
    # hierarchically: 6 for a Collection down to 0 for a Token,
    # subclasses sharing the rank of their base class. Returns
    # nil when the type belongs to none of these families.
    def self.rank(type)
      klass = Entities.const_get(cc(type))
      ordering = [
        [Collection, 6], [Document, 5], [Text, 4], [Zone, 3],
        [Sentence, 2], [Constituent, 1], [Token, 0]
      ]
      ordering.each do |base, order|
        return order if klass <= base
      end
      nil
    end
  end
end
@@ -0,0 +1,19 @@
module Treat
  module Entities
    # A collection of texts.
    class Collection < Entity
      # Create the collection with an empty value. When +folder+ is
      # given, it is recorded with set :folder and one Document is
      # appended for every entry directly inside the folder that is
      # not itself a directory.
      def initialize(folder = nil, id = nil)
        super('', id)
        return unless folder
        set :folder, folder
        Dir.glob("#{folder}/*").each do |path|
          self << Document.new(path) unless FileTest.directory?(path)
        end
      end
    end
  end
end
@@ -0,0 +1,15 @@
module Treat
  module Entities
    # Any syntactic constituent of a sentence; base
    # class of the constituent kinds declared below.
    class Constituent < Entity; end
    # A phrase, either inside a sentence or standing alone.
    class Phrase < Constituent; end
    # A clause inside a sentence.
    class Clause < Constituent; end
  end
end
@@ -0,0 +1,11 @@
module Treat
  module Entities
    # A document.
    class Document < Entity
      # Create the document with an empty value and record
      # the given file with set :file.
      def initialize(file, id = nil)
        super('', id)
        set(:file, file)
      end
    end
  end
end
@@ -0,0 +1,242 @@
1
+ require 'treat/tree'
2
+ require 'treat/feature'
3
+ require 'treat/delegatable'
4
+ require 'treat/visitable'
5
+ require 'treat/registrable'
6
+ require 'treat/buildable'
7
+
module Treat
  module Entities
    # Base class of all textual entities. An entity is a tree node
    # that also supports registration, visitors, delegators and
    # builders through the modules mixed in below.
    class Entity < Tree::Node
      # Implements support for #register
      include Registrable
      # Implement support for #accept.
      include Visitable
      # Implement support for #self.add_delegators
      extend Delegatable
      # Implement support for #self.from_*
      extend Buildable
      # Build an entity from a file name or a value by
      # delegating to from_anything (see Buildable).
      def self.build(file_or_value = '', id = nil)
        from_anything(file_or_value, id)
      end
      # Initialize the entity with its value and
      # (optionally) a unique identifier. By default,
      # the object_id is used as id.
      def initialize(value = '', id = nil)
        id ||= object_id
        super(value, id)
      end
      # Return a lowercase identifier representing the
      # type of entity (e.g. :word, :token, etc.)
      def type; :"#{cl(self.class).downcase}"; end
      # Catch missing methods to support method-like
      # access to features (e.g. entity.cat instead of
      # entity.features[:cat]) and to support magic
      # methods (see #parse_magic_method). If neither a
      # feature nor a magic method matches, raise a
      # Treat::Exception describing the problem.
      def method_missing(sym, *args, &block)
        # NOTE(review): Ruby never dispatches here with a nil
        # symbol and #build is only visibly defined as a class
        # method; confirm whether this branch is still needed.
        return self.build(*args) if sym == nil
        return @features[sym] if @features[sym]
        r = parse_magic_method(sym, *args, &block)
        return r unless r == :no_magic
        begin
          super(sym, *args, &block)
        rescue NoMethodError
          # Tell apart a known method called on the wrong kind
          # of entity from a method that does not exist at all.
          if Categories.have_method?(sym)
            msg = "Method #{sym} cannot be called on a #{type}."
          else
            msg = "Method #{sym} does not exist."
            msg += did_you_mean?(Category.methods, sym)
          end
          raise Treat::Exception, msg
        end
      end
      # Parse "magic methods", which allow the following
      # syntaxes to be used (where 'word' can be replaced
      # by any entity type, e.g. token, zone, etc.):
      #
      # - parent_word: return the first ancestor of type word.
      # - each_word : iterate over each entity of type word.
      # - words: return an array of words in the entity.
      # - word: return the first word in the entity.
      # - word_count: return the number of words in the entity.
      # - words_with_*(value) (where * is an arbitrary feature):
      #   return the words that have the given feature value.
      # - word_with_*(value) : return the first word with
      #   the feature specified by * equal to value.
      # - each_with_*(value) : iterate over each entity with
      #   the feature specified by * equal to value.
      #
      # Also provides magical methods for types of words:
      #
      # - each_noun:
      # - nouns:
      # - noun:
      # - noun_count:
      # - nouns_with_*(value)
      # - noun_with_*(value)
      #
      # Returns :no_magic when the name matches none of these.
      #
      # Note that repetition of code in this method
      # (instead of method chaining) is intentional
      # and aims to reduce the number of method
      # dispatches done by Ruby to improve performance.
      def parse_magic_method(sym, *args, &block)
        @@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
        @@cats_regexp ||= "(#{Treat::Resources::Categories::List.join('|')})"
        # Normalize the irregular plural so that "entities" can be
        # matched as "entity" + "s"; a no-op for every other name.
        method = sym.to_s.gsub('entities', 'entitys')
        a = []
        if method =~ /^parent_#{@@entities_regexp}$/
          # Capture the type now: inside the block below, $1 would
          # be looked up when the defined method runs, not here.
          type = :"#{$1}"
          self.class.send(:define_method, "parent_#{$1}") do
            ancestor_with_types(type)
          end
          ancestor_with_types(type)
        elsif method =~ /^each_#{@@entities_regexp}$/
          each_entity(:"#{$1}") { |entity| yield entity }
        elsif method =~ /^#{@@entities_regexp}s$/
          each_entity(:"#{$1}") { |e| a << e }
          a
        elsif method =~ /^#{@@entities_regexp}$/
          name = $1
          each_entity(:"#{name}") { |e| a << e }
          first_but_warn(a, name)
        elsif method =~ /^#{@@entities_regexp}_count$/
          i = 0
          each_entity(:"#{$1}") { |e| i += 1 }
          i
        elsif method =~ /^#{@@entities_regexp}s_with_([a-z]*)$/
          type, feature = :"#{$1}", :"#{$2}"
          each_entity(type) do |e|
            a << e if e.has?(feature) &&
            e.send(feature) == args[0]
          end
          a
        elsif method =~ /^#{@@entities_regexp}_with_([a-z]*)$/
          # Singular form: same search, but only the first match.
          name, feature = $1, :"#{$2}"
          each_entity(:"#{name}") do |e|
            a << e if e.has?(feature) &&
            e.send(feature) == args[0]
          end
          first_but_warn(a, name)
        elsif method =~ /^each_with_([a-z]*)$/
          # Only one capture group here, so the feature is $1.
          feature = :"#{$1}"
          each_entity do |e|
            yield e if e.has?(feature) &&
            e.send(feature) == args[0]
          end
        elsif method =~ /^each_#{@@cats_regexp}$/
          cat = :"#{$1}"
          each_entity(:word) { |e| yield e if e.cat == cat }
        elsif method =~ /^#{@@cats_regexp}s$/
          cat = :"#{$1}"
          each_entity(:word) { |e| a << e if e.cat == cat }
          a
        elsif method =~ /^#{@@cats_regexp}$/
          name = $1
          cat = :"#{name}"
          each_entity(:word) { |e| a << e if e.cat == cat }
          first_but_warn(a, name)
        elsif method =~ /^#{@@cats_regexp}_count$/
          cat = :"#{$1}"
          i = 0
          each_entity(:word) { |e| i += 1 if e.cat == cat }
          i
        elsif method =~ /^#{@@cats_regexp}s_with_([a-z]*)$/
          cat, feature = :"#{$1}", :"#{$2}"
          each_entity(:word) do |e|
            a << e if e.cat == cat &&
            e.has?(feature) && e.send(feature) == args[0]
          end
          a
        elsif method =~ /^#{@@cats_regexp}_with_([a-z]*)$/
          name, feature = $1, :"#{$2}"
          cat = :"#{name}"
          each_entity(:word) do |e|
            a << e if e.cat == cat &&
            e.has?(feature) && e.send(feature) == args[0]
          end
          first_but_warn(a, name)
        else
          :no_magic
        end
      end
      # Add an entity (or an array of entities) to the
      # current entity. Each added entity that is a leaf
      # is passed to register_token. When +clear_parent+
      # is true (the default) and this entity has a parent,
      # the parent's value is reset to an empty string.
      # Returns the first entity that was added.
      #
      # @see Treat::Registrable
      def <<(entities, clear_parent = true)
        entities = [entities] unless entities.is_a? Array
        entities.each do |entity|
          register_token(entity) if entity.is_leaf?
        end
        super(entities)
        @parent.value = '' if clear_parent && has_parent?
        entities[0]
      end
      # Yields each entity of any of the supplied
      # types in the tree rooted at this entity. Note
      # that this function is recursive, unlike #each,
      # and that the entity it is called on is yielded
      # too when it matches. When no type is supplied,
      # every entity in the tree is yielded.
      def each_entity(*types)
        yield self if types.empty? || match_types(self, types)
        if has_children?
          @children.each do |child|
            child.each_entity(*types) { |y| yield y }
          end
        end
      end
      # Returns the first ancestor of this entity
      # that has one of the given types, or nil if
      # there is none (including when the entity
      # has no parent at all).
      def ancestor_with_types(*types)
        ancestor = @parent
        while ancestor
          return ancestor if match_types(ancestor, types)
          ancestor = ancestor.has_parent? ? ancestor.parent : nil
        end
        nil
      end
      alias :ancestor_with_type :ancestor_with_types
      # Return the entity's raw string value.
      def to_string; @value; end
      # Return the result of the :txt visualizer for this entity.
      def to_s; visualize(:txt); end
      alias :to_str :to_s
      # Return an informative string representation of the entity.
      def inspect; visualize(:inspect); end
      # Print out an ASCII representation of the tree.
      def print_tree; puts visualize(:tree); end
      # Return a shortened value of the entity's string value using [...].
      def short_value(ml = 6); visualize(:short_value, :max_length => ml); end
      # Convenience wrapper around statistics(:frequency_of, ...).
      def frequency_of(word); statistics(:frequency_of, value: word); end

      private
      # Return the first element in the array, warning if not
      # the only one in the array. Used for magic methods: e.g.,
      # when the magic method "word" is called on a sentence
      # with many words, Treat will return the first word
      # but warn the user.
      def first_but_warn(array, type)
        if array.size > 1
          warn "Warning: requested one #{type}, but" +
          " there are many #{type}s in the given entity."
        end
        array[0]
      end
      # Cache a list of the type => class relationships.
      @@type_classes = {}
      # Returns true if the node is of the same type as, or
      # is a subtype of, one of the specified entity types,
      # which are supplied as identifiers rather than classes.
      def match_types(node, entity_types)
        entity_types.each do |type|
          @@type_classes[type] ||= Entities.const_get(cc(type))
          return true if node.is_a? @@type_classes[type]
        end
        false
      end
    end
  end
end
@@ -0,0 +1,8 @@
module Treat
  module Entities
    # A sentence.
    class Sentence < Entity
      # Call #link with the given linker +l+ and the options +o+,
      # with the :linkage option forced to :subject.
      def subject(l = nil, o = {})
        link(l, o.merge(:linkage => :subject))
      end
    end
  end
end
@@ -0,0 +1,7 @@
module Treat
  module Entities
    # A text; adds nothing to Entity beyond its own type.
    class Text < Entity; end
  end
end
@@ -0,0 +1,37 @@
module Treat
  module Entities
    # A terminal element in the text structure.
    class Token < Entity
      # A token is always a leaf.
      def is_leaf?
        true
      end
      # Compute statistics(:frequency) and store the
      # result under the :frequency key with #set.
      def frequency
        self.set(:frequency, statistics(:frequency))
      end
    end
    # A word.
    class Word < Token
      # Conjugate with the :mode option set to :infinitive.
      def infinitive(conjugator = nil)
        conjugate(conjugator, :mode => :infinitive)
      end
      # Conjugate with :tense => :present and :mode => :participle.
      def present_participle(conjugator = nil)
        conjugate(conjugator, :tense => :present, :mode => :participle)
      end
      # Declense with the :count option set to :plural.
      def plural(declensor = nil)
        declense(declensor, :count => :plural)
      end
      # Declense with the :count option set to :singular.
      def singular(declensor = nil)
        declense(declensor, :count => :singular)
      end
    end
    # A clitic token.
    class Clitic < Token; end
    # A number.
    class Number < Token
      # Integer obtained from the string form of the number.
      def to_i
        to_s.to_i
      end
      # Float obtained from the string form of the number.
      def to_f
        to_s.to_f
      end
    end
    # A punctuation sign.
    class Punctuation < Token; end
    # A character that is neither alphabetical,
    # numerical or a punctuation character
    # (e.g. @#$%&*).
    class Symbol < Token; end
    # An entity of unknown type.
    class Unknown < Token; end
  end
end