treat 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. data/INSTALL +0 -0
  2. data/LICENSE +28 -0
  3. data/README +0 -0
  4. data/TODO +67 -0
  5. data/bin/INFO +1 -0
  6. data/examples/benchmark.rb +81 -0
  7. data/examples/keywords.rb +60 -0
  8. data/examples/texts/bugged_out.txt +26 -0
  9. data/examples/texts/half_cocked_basel.txt +16 -0
  10. data/examples/texts/hedge_funds.txt +24 -0
  11. data/examples/texts/hose_and_dry.txt +19 -0
  12. data/examples/texts/hungarys_troubles.txt +46 -0
  13. data/examples/texts/indias_slowdown.txt +15 -0
  14. data/examples/texts/merkozy_rides_again.txt +24 -0
  15. data/examples/texts/prada_is_not_walmart.txt +9 -0
  16. data/examples/texts/republican_nomination.txt +26 -0
  17. data/examples/texts/to_infinity_and_beyond.txt +15 -0
  18. data/lib/treat.rb +91 -0
  19. data/lib/treat/buildable.rb +115 -0
  20. data/lib/treat/categories.rb +29 -0
  21. data/lib/treat/category.rb +28 -0
  22. data/lib/treat/delegatable.rb +90 -0
  23. data/lib/treat/detectors.rb +28 -0
  24. data/lib/treat/detectors/encoding/native.rb +12 -0
  25. data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
  26. data/lib/treat/detectors/format/file.rb +36 -0
  27. data/lib/treat/detectors/language/language_detector.rb +19 -0
  28. data/lib/treat/detectors/language/what_language.rb +29 -0
  29. data/lib/treat/entities.rb +52 -0
  30. data/lib/treat/entities/collection.rb +19 -0
  31. data/lib/treat/entities/constituents.rb +15 -0
  32. data/lib/treat/entities/document.rb +11 -0
  33. data/lib/treat/entities/entity.rb +242 -0
  34. data/lib/treat/entities/sentence.rb +8 -0
  35. data/lib/treat/entities/text.rb +7 -0
  36. data/lib/treat/entities/tokens.rb +37 -0
  37. data/lib/treat/entities/zones.rb +17 -0
  38. data/lib/treat/exception.rb +5 -0
  39. data/lib/treat/extractors.rb +41 -0
  40. data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
  41. data/lib/treat/extractors/named_entity/abner.rb +20 -0
  42. data/lib/treat/extractors/named_entity/stanford.rb +174 -0
  43. data/lib/treat/extractors/statistics/frequency.rb +22 -0
  44. data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
  45. data/lib/treat/extractors/statistics/position_in.rb +13 -0
  46. data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
  47. data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
  48. data/lib/treat/extractors/time/chronic.rb +12 -0
  49. data/lib/treat/extractors/time/native.rb +12 -0
  50. data/lib/treat/extractors/time/nickel.rb +45 -0
  51. data/lib/treat/extractors/topic_words/lda.rb +71 -0
  52. data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
  53. data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
  54. data/lib/treat/extractors/topics/reuters.rb +91 -0
  55. data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
  56. data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
  57. data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
  58. data/lib/treat/feature.rb +53 -0
  59. data/lib/treat/formatters.rb +44 -0
  60. data/lib/treat/formatters/cleaners/html.rb +17 -0
  61. data/lib/treat/formatters/readers/autoselect.rb +35 -0
  62. data/lib/treat/formatters/readers/gocr.rb +24 -0
  63. data/lib/treat/formatters/readers/html.rb +13 -0
  64. data/lib/treat/formatters/readers/ocropus.rb +31 -0
  65. data/lib/treat/formatters/readers/pdf.rb +17 -0
  66. data/lib/treat/formatters/readers/txt.rb +15 -0
  67. data/lib/treat/formatters/serializers/xml.rb +48 -0
  68. data/lib/treat/formatters/serializers/yaml.rb +15 -0
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
  70. data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
  71. data/lib/treat/formatters/unserializers/xml.rb +79 -0
  72. data/lib/treat/formatters/unserializers/yaml.rb +15 -0
  73. data/lib/treat/formatters/visualizers/dot.rb +73 -0
  74. data/lib/treat/formatters/visualizers/html.rb +12 -0
  75. data/lib/treat/formatters/visualizers/inspect.rb +16 -0
  76. data/lib/treat/formatters/visualizers/short_value.rb +14 -0
  77. data/lib/treat/formatters/visualizers/standoff.rb +41 -0
  78. data/lib/treat/formatters/visualizers/tree.rb +28 -0
  79. data/lib/treat/formatters/visualizers/txt.rb +31 -0
  80. data/lib/treat/group.rb +96 -0
  81. data/lib/treat/inflectors.rb +50 -0
  82. data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
  83. data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
  84. data/lib/treat/inflectors/declensors/en.rb +18 -0
  85. data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
  86. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
  87. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
  88. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
  89. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
  90. data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
  91. data/lib/treat/inflectors/stemmers/porter.rb +158 -0
  92. data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
  93. data/lib/treat/inflectors/stemmers/uea.rb +30 -0
  94. data/lib/treat/lexicalizers.rb +49 -0
  95. data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
  96. data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
  97. data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
  98. data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
  99. data/lib/treat/lexicalizers/tag/brill.rb +101 -0
  100. data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
  101. data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
  102. data/lib/treat/processors.rb +45 -0
  103. data/lib/treat/processors/chunkers/txt.rb +27 -0
  104. data/lib/treat/processors/parsers/enju.rb +214 -0
  105. data/lib/treat/processors/parsers/stanford.rb +60 -0
  106. data/lib/treat/processors/segmenters/punkt.rb +48 -0
  107. data/lib/treat/processors/segmenters/stanford.rb +45 -0
  108. data/lib/treat/processors/segmenters/tactful.rb +34 -0
  109. data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
  110. data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
  111. data/lib/treat/processors/tokenizers/perl.rb +96 -0
  112. data/lib/treat/processors/tokenizers/punkt.rb +42 -0
  113. data/lib/treat/processors/tokenizers/stanford.rb +33 -0
  114. data/lib/treat/processors/tokenizers/tactful.rb +59 -0
  115. data/lib/treat/proxies.rb +66 -0
  116. data/lib/treat/registrable.rb +26 -0
  117. data/lib/treat/resources.rb +10 -0
  118. data/lib/treat/resources/categories.rb +18 -0
  119. data/lib/treat/resources/delegates.rb +96 -0
  120. data/lib/treat/resources/dependencies.rb +0 -0
  121. data/lib/treat/resources/edges.rb +8 -0
  122. data/lib/treat/resources/formats.rb +23 -0
  123. data/lib/treat/resources/languages.rb +86 -0
  124. data/lib/treat/resources/languages.txt +504 -0
  125. data/lib/treat/resources/tags.rb +393 -0
  126. data/lib/treat/sugar.rb +43 -0
  127. data/lib/treat/tree.rb +174 -0
  128. data/lib/treat/utilities.rb +127 -0
  129. data/lib/treat/visitable.rb +27 -0
  130. data/test/profile.rb +2 -0
  131. data/test/tc_detectors.rb +27 -0
  132. data/test/tc_entity.rb +105 -0
  133. data/test/tc_extractors.rb +48 -0
  134. data/test/tc_formatters.rb +46 -0
  135. data/test/tc_inflectors.rb +39 -0
  136. data/test/tc_lexicalizers.rb +39 -0
  137. data/test/tc_processors.rb +36 -0
  138. data/test/tc_resources.rb +27 -0
  139. data/test/tc_treat.rb +64 -0
  140. data/test/tc_tree.rb +60 -0
  141. data/test/tests.rb +19 -0
  142. data/test/texts.rb +20 -0
  143. data/test/texts/english/long.html +24 -0
  144. data/test/texts/english/long.txt +22 -0
  145. data/test/texts/english/medium.txt +5 -0
  146. data/test/texts/english/short.txt +3 -0
  147. metadata +412 -0
@@ -0,0 +1,19 @@
1
module Treat
  module Detectors
    module Language
      # Base class for language detectors. It implements the
      # "pre-detection" step shared by concrete detectors: either
      # language detection is disabled globally, or the language is
      # inherited from an ancestor at the configured detection level.
      class LanguageDetector
        # Attempt to resolve the language of +entity+ without running
        # an actual detector.
        #
        # - entity:  the entity whose language is requested; it must
        #            respond to #type, #has_parent? and
        #            #ancestor_with_type.
        # - options: accepted for interface compatibility; unused here.
        #
        # Returns Treat.default_language when detection is switched
        # off, the language of the ancestor at the detection level when
        # the entity ranks below that level and has such an ancestor,
        # and nil otherwise (so that subclasses know they must perform
        # the detection themselves).
        def self.language(entity, options = {})
          # Only an explicit +false+ disables detection; a nil setting
          # still goes through the detection-level logic below.
          return Treat.default_language if Treat.detect_language == false

          dlvl = Treat.language_detection_level
          below_level = Entities.rank(entity.type) < Entities.rank(dlvl)
          return nil unless below_level && entity.has_parent?

          # The entity may have a parent without having any ancestor of
          # the detection level's type; in that case there is nothing
          # to inherit from, so report "no pre-detection" instead of
          # calling #language on nil.
          ancestor = entity.ancestor_with_type(dlvl)
          ancestor ? ancestor.language : nil
        end
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
module Treat
  module Detectors
    module Language
      # Load the 'whatlanguage' gem through the project's +silently+
      # helper (presumably to mute load-time warnings -- confirm
      # against its definition).
      silently { require 'whatlanguage' }

      # Adaptor for the 'whatlanguage' gem, which performs
      # probabilistic language detection.
      class WhatLanguage < LanguageDetector
        # A single shared instance of the gem's detector class,
        # created lazily on first use.
        @@wl = nil

        # Detect the language of +entity+.
        #
        # The pre-detection of the parent class is tried first; only
        # when it yields nothing is the text of the entity handed to
        # the gem. Every key reported by the gem is mapped through
        # Treat::Resources::Languages.find and the resulting scores
        # are wrapped in a Treat::Feature, whose #best entry is
        # returned (per the original author, an identifier
        # corresponding to the ISO-639-2 code of the language).
        def self.language(entity, options = {})
          inherited = super(entity, options)
          return inherited if inherited

          @@wl ||= ::WhatLanguage.new(:all)
          scores_by_language = {}
          @@wl.process_text(entity.to_s).each do |name, score|
            scores_by_language[Treat::Resources::Languages.find(name)] = score
          end
          Treat::Feature.new(scores_by_language).best
        end
      end
    end
  end
end
@@ -0,0 +1,52 @@
1
module Treat
  # Abstract and concrete structures extending the
  # Tree::Node class to represent textual entities:
  #
  # - Collection
  # - Document
  # - Text
  # - Zone (a Section, Title, Paragraph, or List)
  # - Sentence
  # - Constituent (a Phrase or Clause)
  # - Token (a Word, Number, Punctuation, or Symbol).
  module Entities
    # Entity has to be loaded first: every other entity
    # class inherits from it.
    require 'treat/entities/entity'
    require 'treat/entities/collection'
    require 'treat/entities/document'
    require 'treat/entities/text'
    require 'treat/entities/zones'
    require 'treat/entities/sentence'
    require 'treat/entities/constituents'
    require 'treat/entities/tokens'

    # For each constant of this module, define a module-level
    # method of the same name that forwards to that constant's
    # .build (e.g. Entities.Word('foo')).
    constants.each do |entity|
      define_singleton_method(entity) do |value='', id=nil|
        const_get(entity).build(value, id)
      end
    end

    # Memoized list of entity type identifiers.
    @@list = []

    # Return the identifiers of all constants of this module,
    # each converted with +ucc+ and turned into a symbol. The
    # list is computed on the first call and cached afterwards.
    def self.list
      if @@list.empty?
        constants.each { |name| @@list << :"#{ucc(name)}" }
      end
      @@list
    end

    # Return the 'z-order' used for hierarchical comparison of
    # entity types: 6 for Collection down to 0 for Token, where
    # subclasses share the rank of their base class. Returns nil
    # when the type belongs to none of these families.
    def self.rank(type)
      klass = Entities.const_get(cc(type))
      [
        [Collection,  6],
        [Document,    5],
        [Text,        4],
        [Zone,        3],
        [Sentence,    2],
        [Constituent, 1],
        [Token,       0]
      ].each do |base, level|
        return level if klass == base || klass < base
      end
      nil
    end
  end
end
@@ -0,0 +1,19 @@
1
module Treat
  module Entities
    # Represents a collection of texts.
    class Collection < Entity
      # Build the collection, optionally from a folder.
      #
      # When +folder+ is given it is stored as the :folder feature
      # and one Document is appended for every non-directory entry
      # found directly inside it. Without a folder the collection
      # starts out empty.
      def initialize(folder = nil, id = nil)
        super('', id)
        return unless folder

        set :folder, folder
        Dir.glob("#{folder}/*").each do |path|
          self << Document.new(path) unless FileTest.directory?(path)
        end
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
module Treat
  module Entities
    # Any syntactic constituent of a sentence; base class
    # of Phrase and Clause.
    class Constituent < Entity; end

    # A phrase, either inside a sentence or standing alone.
    class Phrase < Constituent; end

    # A clause inside a sentence.
    class Clause < Constituent; end
  end
end
@@ -0,0 +1,11 @@
1
module Treat
  module Entities
    # Represents a document.
    class Document < Entity
      # Create a document with an empty value and remember the
      # file it refers to in the :file feature. Nothing is read
      # from the file here.
      def initialize(file, id = nil)
        super('', id)
        set(:file, file)
      end
    end
  end
end
@@ -0,0 +1,242 @@
1
+ require 'treat/tree'
2
+ require 'treat/feature'
3
+ require 'treat/delegatable'
4
+ require 'treat/visitable'
5
+ require 'treat/registrable'
6
+ require 'treat/buildable'
7
+
8
module Treat
  module Entities
    # Base class of every textual entity. An entity is a tree node
    # (see Tree::Node) carrying a value and a set of features, and
    # supports "magic" accessors resolved through #method_missing.
    class Entity < Tree::Node
      # Implements support for #register
      include Registrable
      # Implement support for #accept.
      include Visitable
      # Implement support for #self.add_delegators
      extend Delegatable
      # Implement support for #self.from_*
      extend Buildable

      # Build an entity from a file name or a raw value by
      # delegating to +from_anything+ (provided by Buildable).
      def self.build(file_or_value = '', id = nil)
        from_anything(file_or_value, id)
      end

      # Initialize the entity with its value and (optionally) a
      # unique identifier. By default, the object_id is used as id.
      def initialize(value = '', id = nil)
        id ||= object_id
        super(value, id)
      end

      # Return a lowercase identifier representing the
      # type of entity (e.g. :word, :token, etc.)
      def type; :"#{cl(self.class).downcase}"; end

      # Catch missing methods to support method-like access to
      # features (e.g. entity.cat instead of entity.features[:cat])
      # and to support magic methods (see #parse_magic_method).
      #
      # Resolution order: a truthy feature named +sym+, then a magic
      # method, then the superclass implementation. If the latter
      # raises NoMethodError, a Treat::Exception with a more helpful
      # message is raised instead.
      def method_missing(sym, *args, &block)
        return self.build(*args) if sym == nil
        return @features[sym] if @features[sym]

        result = parse_magic_method(sym, *args, &block)
        return result unless result == :no_magic

        begin
          super(sym, *args, &block)
        rescue NoMethodError
          # Distinguish "exists, but not for this entity type" from
          # "does not exist at all" in the error message.
          if Categories.have_method?(sym)
            msg = "Method #{sym} cannot be called on a #{type}."
          else
            msg = "Method #{sym} does not exist."
            msg += did_you_mean?(Category.methods, sym)
          end
          raise Treat::Exception, msg
        end
      end

      # Parse "magic methods", which allow the following
      # syntaxes to be used (where 'word' can be replaced
      # by any entity type, e.g. token, zone, etc.):
      #
      # - parent_word: return the first ancestor of type word (the
      #   method is also defined on the class for later calls).
      # - each_word : iterate over each entity of type word.
      # - words: return an array of words in the entity.
      # - word: return the first word in the entity.
      # - word_count: return the number of words in the entity.
      # - words_with_*(value) (where * is an arbitrary feature):
      #   return the words whose feature * equals value.
      # - word_with_*(value) : return the first word whose
      #   feature * equals value.
      # - each_with_*(value) : iterate over every entity (of any
      #   type) whose feature * equals value.
      #
      # Also provides magical methods for types of words:
      #
      # - each_noun:
      # - nouns:
      # - noun:
      # - noun_count:
      # - nouns_with_*(value)
      # - noun_with_*(value)
      #
      # Returns :no_magic when +sym+ matches none of these forms.
      #
      # Note that repetition of code in this method
      # (instead of method chaining) is intentional
      # and aims to reduce the number of method
      # dispatches done by Ruby to improve performance.
      def parse_magic_method(sym, *args, &block)
        @@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
        @@cats_regexp ||= "(#{Treat::Resources::Categories::List.join('|')})"
        # 'entities' is rewritten to 'entitys' so that the regular
        # "<type>s" plural patterns below also cover :entity.
        method = sym.to_s.gsub('entities', 'entitys')
        a = []
        # Regexp captures are copied into locals right after each
        # match so that blocks never depend on the state of $~.
        if method =~ /^parent_#{@@entities_regexp}$/ # Optimize all
          kind = $1.to_sym
          self.class.send(:define_method, "parent_#{kind}") do
            ancestor_with_types(kind)
          end
          ancestor_with_types(kind)
        elsif method =~ /^each_#{@@entities_regexp}$/
          each_entity($1.to_sym) { |entity| yield entity }
        elsif method =~ /^#{@@entities_regexp}s$/
          each_entity($1.to_sym) { |e| a << e }
          a
        elsif method =~ /^#{@@entities_regexp}$/
          kind = $1
          each_entity(kind.to_sym) { |e| a << e }
          first_but_warn(a, kind)
        elsif method =~ /^#{@@entities_regexp}_count$/
          i = 0
          each_entity($1.to_sym) { |e| i += 1 }
          i
        elsif method =~ /^#{@@entities_regexp}s_with_([a-z]*)$/
          feature = $2.to_sym
          each_entity($1.to_sym) do |e|
            a << e if e.has?(feature) &&
            e.send(feature) == args[0]
          end
          a
        elsif method =~ /^#{@@entities_regexp}_with_([a-z]*)$/
          # Singular form: previously this branch repeated the plural
          # pattern and could therefore never be reached.
          kind, feature = $1, $2.to_sym
          each_entity(kind.to_sym) do |e|
            a << e if e.has?(feature) &&
            e.send(feature) == args[0]
          end
          first_but_warn(a, kind)
        elsif method =~ /^each_with_([a-z]*)$/
          # The pattern has a single group, so the feature is $1.
          # :entity matches every node; #each_entity called without
          # any type never yields.
          feature = $1.to_sym
          each_entity(:entity) do |e|
            yield e if e.has?(feature) &&
            e.send(feature) == args[0]
          end
        elsif method =~ /^each_#{@@cats_regexp}$/
          cat = $1.to_sym
          each_entity(:word) { |e| yield e if e.cat == cat }
        elsif method =~ /^#{@@cats_regexp}s$/
          cat = $1.to_sym
          each_entity(:word) { |e| a << e if e.cat == cat }
          a
        elsif method =~ /^#{@@cats_regexp}$/
          name = $1
          cat = name.to_sym
          each_entity(:word) { |e| a << e if e.cat == cat }
          first_but_warn(a, name)
        elsif method =~ /^#{@@cats_regexp}_count$/
          cat = $1.to_sym
          i = 0
          each_entity(:word) { |e| i += 1 if e.cat == cat }
          i
        elsif method =~ /^#{@@cats_regexp}s_with_([a-z]*)$/
          cat, feature = $1.to_sym, $2.to_sym
          each_entity(:word) do |e|
            a << e if e.cat == cat &&
            e.has?(feature) && e.send(feature) == args[0]
          end
          a
        elsif method =~ /^#{@@cats_regexp}_with_([a-z]*)$/
          name, feature = $1, $2.to_sym
          cat = name.to_sym
          each_entity(:word) do |e|
            a << e if e.cat == cat &&
            e.has?(feature) && e.send(feature) == args[0]
          end
          first_but_warn(a, name)
        else
          :no_magic
        end
      end

      # Add an entity (or an array of entities) to the current
      # entity. Every added entity that is a leaf is passed to
      # #register_token first. Returns the first added entity.
      #
      # NOTE(review): +clear_parent+ is accepted but never read, and
      # it is the value of this entity's *parent* that gets blanked
      # after insertion -- confirm that this is the intent.
      #
      # @see Treat::Registrable
      def <<(entities, clear_parent = true)
        entities = [entities] unless entities.is_a? Array
        entities.each do |entity|
          register_token(entity) if entity.is_leaf?
        end
        super(entities)
        @parent.value = '' if has_parent?
        entities[0]
      end

      # Yields each entity of any of the supplied types in the tree
      # rooted at this entity. The traversal is recursive and
      # pre-order; the receiver itself is yielded too when it
      # matches one of the types. When no type is supplied nothing
      # is ever yielded.
      def each_entity(*types)
        yield self if match_types(self, types)
        if has_children?
          @children.each do |child|
            child.each_entity(*types) { |y| yield y }
          end
        end
      end

      # Returns the closest ancestor of this entity that has one of
      # the given types, or nil when there is none (including when
      # the entity has no parent at all).
      def ancestor_with_types(*types)
        # A parentless node has no ancestors; without this guard the
        # loop below would call #has_parent? on nil.
        return nil if @parent.nil?
        ancestor = @parent
        until match_types(ancestor, types)
          return nil unless ancestor.has_parent?
          ancestor = ancestor.parent
        end
        ancestor
      end
      alias :ancestor_with_type :ancestor_with_types

      # Return the entity's raw string value.
      def to_string; @value; end
      # Return the entity rendered by the :txt visualizer.
      def to_s; visualize(:txt); end
      alias :to_str :to_s
      # Return an informative string representation of the entity.
      def inspect; visualize(:inspect); end
      # Print out an ASCII representation of the tree.
      def print_tree; puts visualize(:tree); end
      # Return a shortened value of the entity's string value using [...].
      def short_value(ml = 6); visualize(:short_value, :max_length => ml); end
      # Convenience decorator around statistics(:frequency_of).
      def frequency_of(word); statistics(:frequency_of, value: word); end

      private

      # Return the first element in the array, warning if it is not
      # the only one. Used for singular magic methods: e.g. calling
      # "word" on a sentence with many words returns the first word
      # but warns the user.
      def first_but_warn(array, type)
        if array.size > 1
          warn "Warning: requested one #{type}, but" +
          " there are many #{type}s in the given entity."
        end
        array[0]
      end

      # Cache a list of the type => class relationships.
      @@type_classes = {}

      # Returns true if the node is of the same type or is a
      # subtype of one of the specified entity types, which are
      # supplied as identifiers rather than classes. Returns false
      # for an empty list of types.
      def match_types(node, entity_types)
        entity_types.each do |type|
          @@type_classes[type] ||= Entities.const_get(cc(type))
          return true if node.is_a? @@type_classes[type]
        end
        false
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Treat
  module Entities
    # Represents a sentence.
    class Sentence < Entity
      # Shortcut for #link with the :linkage option forced to
      # :subject. +l+ is passed through as the first argument of
      # #link and +o+ holds any additional options.
      def subject(l = nil, o = {})
        link(l, o.merge(:linkage => :subject))
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
module Treat
  module Entities
    # Represents a text; adds no behavior of its own to Entity.
    class Text < Entity; end
  end
end
@@ -0,0 +1,37 @@
1
module Treat
  module Entities
    # Represents a terminal element in the text structure.
    class Token < Entity
      # Tokens are always leaves of the tree.
      def is_leaf?
        true
      end

      # Compute statistics(:frequency) and store the result
      # in the :frequency feature.
      def frequency
        self.set(:frequency, statistics(:frequency))
      end
    end

    # Represents a word.
    class Word < Token
      # Conjugate the word to the infinitive mode.
      def infinitive(conjugator = nil)
        conjugate(conjugator, :mode => :infinitive)
      end

      # Conjugate the word to the present participle.
      def present_participle(conjugator = nil)
        conjugate(conjugator, :tense => :present, :mode => :participle)
      end

      # Decline the word to its plural form.
      def plural(declensor = nil)
        declense(declensor, :count => :plural)
      end

      # Decline the word to its singular form.
      def singular(declensor = nil)
        declense(declensor, :count => :singular)
      end
    end

    # Token subclass for clitics; adds no behavior of its own.
    class Clitic < Token; end

    # Represents a number.
    class Number < Token
      # Integer value of the number's string form.
      def to_i
        to_s.to_i
      end

      # Float value of the number's string form.
      def to_f
        to_s.to_f
      end
    end

    # Represents a punctuation sign.
    class Punctuation < Token; end

    # Represents a character that is neither alphabetical,
    # numerical or a punctuation character (e.g. @#$%&*).
    class Symbol < Token; end

    # Represents an entity of unknown type.
    class Unknown < Token; end
  end
end