treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
@@ -1,46 +1,54 @@
1
- module Treat
2
- module Formatters
3
- module Visualizers
4
- # This class allows the visualization of
5
- # an entity in standoff format; for example:
6
- # (S (NP John) (VP has (VP come))).
7
- class Standoff
8
- # Default options for the visualizer.
9
- DefaultOptions = { :indent => 0 }
10
- # A lambda to recursively visualize the children
11
- # of an entity.
12
- Recurse = lambda do |entity, options|
13
- v = ''
14
- entity.each { |child| v += visualize(child, options) }
15
- v
16
- end
17
- # Visualize the entity using standoff notation.
18
- # This can only be called on sentences and smaller
19
- # entities, as it is not a suitable format to
20
- # represent larger entities.
21
- def self.visualize(entity, options = {})
22
- options = DefaultOptions.merge(options)
23
- value = ''; spaces = ''
24
- options[:indent].times { spaces << ' '}
25
- options[:indent] += 1
26
- if entity.is_a?(Treat::Entities::Token)
27
- value += "#{spaces}(#{entity.tag} #{entity.value})"
28
- elsif entity.is_a?(Treat::Entities::Phrase)
29
- tag = entity.has?(:tag) ? entity.tag : ''
30
- value += ("#{spaces}(#{tag}\n" +
31
- "#{Recurse.call(entity, options)})\n")
32
- elsif entity.is_a?(Treat::Entities::Sentence)
33
- value += ("#{spaces}(S\n" +
34
- "#{Recurse.call(entity, options)})\n")
35
- else
36
- raise 'Standoff format is unsuitable to represent' +
37
- ' entities larger than sentences.'
38
- end
39
- options[:indent] -= 1
40
- value.gsub!(")\n)", "))")
41
- value
42
- end
43
- end
1
+ # This class allows the visualization of
2
+ # an entity in standoff format; for example:
3
+ # (S (NP John) (VP has (VP come))).
4
+ class Treat::Formatters::Visualizers::Standoff
5
+
6
+ # Start out with an indent of 0.
7
+ DefaultOptions = { :indent => 0 }
8
+
9
+ # A lambda to recursively visualize the children
10
+ # of an entity.
11
+ Recurse = lambda do |entity, options|
12
+ v = ''
13
+ entity.each { |child| v += visualize(child, options) }
14
+ v
15
+ end
16
+
17
+ # Fix - brackets
18
+ # Visualize the entity using standoff notation.
19
+ # This can only be called on sentences and smaller
20
+ # entities, as it is not a suitable format to
21
+ # represent larger entities.
22
+ def self.visualize(entity, options = {})
23
+ options = DefaultOptions.merge(options)
24
+ value = ''; spaces = ''
25
+ options[:indent].times { spaces << ' '}
26
+ options[:indent] += 1
27
+ if entity.is_a?(Treat::Entities::Token)
28
+ val = ptb_escape(entity.value)
29
+ value += "#{spaces}(#{entity.tag} #{val})"
30
+ elsif entity.is_a?(Treat::Entities::Phrase)
31
+ tag = entity.has?(:tag) ? entity.tag : ''
32
+ value += ("#{spaces}(#{tag}\n" +
33
+ "#{Recurse.call(entity, options)})\n")
34
+ elsif entity.is_a?(Treat::Entities::Sentence)
35
+ value += ("#{spaces}(S\n" +
36
+ "#{Recurse.call(entity, options)})\n")
37
+ else
38
+ raise 'Standoff format is unsuitable to represent' +
39
+ ' entities larger than sentences.'
40
+ end
41
+ options[:indent] -= 1
42
+ value.gsub!(")\n)", "))")
43
+ value
44
+ end
45
+
46
+ def self.ptb_escape(val)
47
+ Treat::Linguistics::Tags::
48
+ PTBEscapeCharacters.each do |char, esc|
49
+ val.gsub!(char, val)
44
50
  end
51
+
52
+ val
45
53
  end
46
- end
54
+ end
@@ -1,32 +1,29 @@
1
- module Treat
2
- module Formatters
3
- module Visualizers
4
- # This class generates an ASCII representation
5
- # of a tree of entities.
6
- class Tree
7
- # Default options for the visualizer.
8
- DefaultOptions = { :indent => 0 }
9
- # Obtain a plain text tree representation
10
- # of the entity.
11
- def self.visualize(entity, options = {})
12
- options = DefaultOptions.merge(options)
13
- string = ''
14
- if entity.has_children?
15
- spacer = '--'
16
- spaces = ''
17
- options[:indent].times { spaces << ' '}
18
- string << "+ #{entity.inspect}\n#{spaces}|"
19
- options[:indent] += 1
20
- entity.children.each do |child|
21
- string = string + "\n" + spaces + '+' +
22
- spacer + self.visualize(child, options)
23
- end
24
- options[:indent] -= 1
25
- return string
26
- end
27
- '> ' + entity.inspect
28
- end
1
+ # This class generates an ASCII representation
2
+ # of a tree of entities.
3
+ class Treat::Formatters::Visualizers::Tree
4
+
5
+ # Start out with an indent at 0.
6
+ DefaultOptions = { :indent => 0 }
7
+
8
+ # Obtain a plain text tree representation
9
+ # of the entity.
10
+ def self.visualize(entity, options = {})
11
+ options = DefaultOptions.merge(options)
12
+ string = ''
13
+ if entity.has_children?
14
+ spacer = '--'
15
+ spaces = ''
16
+ options[:indent].times { spaces << ' '}
17
+ string << "+ #{entity.inspect}\n#{spaces}|"
18
+ options[:indent] += 1
19
+ entity.children.each do |child|
20
+ string = string + "\n" + spaces + '+' +
21
+ spacer + self.visualize(child, options)
29
22
  end
23
+ options[:indent] -= 1
24
+ return string
30
25
  end
26
+ '> ' + entity.inspect
31
27
  end
32
- end
28
+
29
+ end
@@ -0,0 +1,153 @@
1
+ module Treat::Groupable
2
+
3
+ # Lazily load the worker classes in the group.
4
+ def const_missing(const)
5
+ bits = self.ancestors[0].to_s.split('::')
6
+ bits.collect! { |bit| ucc(bit) }
7
+ file = bits.join('/') + "/#{ucc(const)}"
8
+ if not File.readable?(Treat.lib + "#{file}.rb")
9
+ raise Treat::Exception,
10
+ "File '#{file}.rb' corresponding to " +
11
+ "requested worker #{self}::#{const} " +
12
+ "does not exist."
13
+ else
14
+ require file
15
+ if not const_defined?(const)
16
+ raise Treat::Exception,
17
+ "File #{file} does not define " +
18
+ "#{self}::#{const}."
19
+ end
20
+ const_get(const)
21
+ end
22
+ end
23
+
24
+ # Cache the list of workers to improve performance.
25
+ @@list = {}
26
+ # Populates the list of the workers in the group once
27
+ # by crawling the filesystem.
28
+ def list
29
+ mod = ucc(cl(self))
30
+ if @@list[mod].nil?
31
+ @@list[mod] = []
32
+ dirs = Dir[Treat.lib + "treat/*/#{mod}/*.rb"]
33
+ dirs.each do |file|
34
+ @@list[mod] <<
35
+ file.split('/')[-1][0..-4].intern
36
+ end
37
+ end
38
+ @@list[mod]
39
+ end
40
+
41
+ # Boolean - does the group have the supplied class
42
+ # included in its targets?
43
+ def has_target?(target, strict = false)
44
+ is_target = false
45
+ self.targets.each do |entity_type|
46
+ t = cc(entity_type)
47
+ entity_type = Treat::Entities.const_get(t)
48
+ if target < entity_type ||
49
+ entity_type == target
50
+ is_target = true; break
51
+ end
52
+ end
53
+ is_target
54
+ end
55
+
56
+ # Create a new algorithm within the group. Once
57
+ # the algorithm is added, it will be automatically
58
+ # installed on all the targets of the group.
59
+ def add(class_name, &block)
60
+ c = cc(class_name).intern
61
+ klass = self.const_set(c, Class.new)
62
+ method = self.method
63
+ @@list[ucc(cl(self))] << class_name
64
+ klass.send(:define_singleton_method,
65
+ method) do |entity, options={}|
66
+ block.call(entity, options)
67
+ end
68
+ end
69
+
70
+ # Get constants in this module, excluding by
71
+ # default those defined by parent modules.
72
+ def const_get(const)
73
+ super(const, false)
74
+ end
75
+
76
+ # Modify the extended class.
77
+ def self.extended(group)
78
+
79
+ group.module_eval do
80
+
81
+ class << self
82
+
83
+ # The type of the group. There are three types:
84
+ #
85
+ # - Transformers transform the tree of an entity.
86
+ # - Annotators compute a value and store it in the entity.
87
+ # - Computers compute a value and do not store it.
88
+ attr_accessor :type
89
+ # The default worker in the group, for language-
90
+ # independent tasks.
91
+ attr_accessor :default
92
+ # The entity types which the group's workers work on.
93
+ attr_accessor :targets
94
+ # Presets to automatically generate functions.
95
+ attr_accessor :presets
96
+ # The preset option to use with preset functions.
97
+ attr_accessor :preset_option
98
+ end
99
+
100
+ # Return the method corresponding to the group.
101
+ # This method resolves the name of the method
102
+ # that a group should provide based on the name
103
+ # of the group. Basically, if the group ends in
104
+ # -ers, the verb corresponding to the group is
105
+ # returned (tokenizers -> tokenize, inflectors ->
106
+ # inflect). Otherwise, the name of the method
107
+ # is the same as that of the group (encoding ->
108
+ # encoding, tag -> tag).
109
+ @method = nil
110
+ def self.method
111
+ return @method if @method
112
+ m = ucc(cl(self)).dup
113
+ if m[-4..-1] == 'zers'
114
+ if type == :annotator
115
+ if m[-6] == 'l'
116
+ m[-5..-1] = ''
117
+ else
118
+ m[-5..-1] = 'y'
119
+ end
120
+ else
121
+ m = m[0..-3]
122
+ end
123
+ n = m
124
+ elsif m[-4..-1] == 'iers'
125
+ m[-4..-1] = 'y'
126
+ n = m
127
+ elsif m[-3..-1] == 'ers'
128
+ if ['k', 't', 'm', 'd',
129
+ 'g', 'n', 'x', 'h'].
130
+ include? m[-4]
131
+ n = m[0..-4]
132
+ if n[-1] == n[-2]
133
+ n = n[0..-2]
134
+ end
135
+ else
136
+ n = m[0..-3]
137
+ end
138
+ elsif m[-3..-1] == 'ors'
139
+ n = m[0..-4] + 'e'
140
+ else
141
+ n = m
142
+ end
143
+ @method = n.intern
144
+ end
145
+
146
+ # Populate the group's list.
147
+ group.list
148
+
149
+ end
150
+
151
+ end
152
+
153
+ end
@@ -0,0 +1,22 @@
1
+ module Treat::Helpers
2
+
3
+ class DecimalPointEscaper
4
+
5
+ EscapeChar = '^^'
6
+ EscapedEscapeChar = '\^\^'
7
+
8
+ def self.escape!(s)
9
+ s.gsub!(/([0-9]+)\.([0-9]+)/) do
10
+ $1 + EscapeChar + $2
11
+ end
12
+ end
13
+
14
+ def self.unescape!(s)
15
+ s.gsub!(/([0-9]+)#{EscapedEscapeChar}([0-9]+)/) do
16
+ $1 + '.' + $2
17
+ end
18
+ end
19
+
20
+ end
21
+
22
+ end
@@ -1,47 +1,52 @@
1
- module Treat
2
- # Algorithms to retrieve the inflections of a word.
3
- module Inflectors
4
- # Return the stem (*not root form*) of a word.
5
- module Stem
6
- extend Group
7
- self.type = :annotator
8
- self.targets = [:word]
9
- end
10
- # Retrieve the different declensions of a noun (singular, plural).
11
- module Declensions
12
- extend Group
13
- self.type = :annotator
14
- self.targets = [:word]
15
- self.presets = {
16
- :plural => {:count => :plural},
17
- :singular => {:count => :singular}
18
- }
19
- end
20
- # Retrieve the different conjugations of a word.
21
- module Conjugations
22
- extend Group
23
- self.type = :annotator
24
- self.targets = [:word]
25
- self.presets = {
26
- :infinitive => {:mode => :infinitive},
27
- :present_participle => {:tense => :present, :mode => :participle},
28
- :plural_verb => {:count => :plural},
29
- :singular_verb => {:count => :singular}
30
- }
31
- end
32
- # Retrieve the full text description of a cardinal number.
33
- module CardinalWords
34
- extend Group
35
- self.type = :annotator
36
- self.targets = [:number]
37
- end
38
- # Retrieve the full text description of an ordinal number.
39
- module OrdinalWords
40
- extend Group
41
- self.type = :annotator
42
- self.targets = [:number]
43
- end
44
- extend Treat::Category
1
+ # Category of worker groups that retrieve
2
+ # the inflections of a word.
3
+ module Treat::Inflectors
4
+
5
+ # Return the stem (*not root form*) of a word.
6
+ module Stemmers
7
+ extend Treat::Groupable
8
+ self.type = :annotator
9
+ self.targets = [:word]
10
+ end
11
+
12
+ # Retrieve the different declensions of a
13
+ # noun (singular, plural).
14
+ module Declensors
15
+ extend Treat::Groupable
16
+ self.type = :annotator
17
+ self.targets = [:word]
18
+ self.preset_option = :count
19
+ self.presets = [:plural, :singular]
20
+ end
21
+
22
+ # Retrieve the different conjugations of a word
23
+ # given a mode, tense, person, and/or number.
24
+ module Conjugators
25
+ extend Treat::Groupable
26
+ self.type = :annotator
27
+ self.targets = [:word]
28
+ self.preset_option = :form
29
+ self.presets = [:infinitive, :present_participle,
30
+ :plural_verb, :singular_verb]
45
31
  end
46
- end
47
32
 
33
+ # Retrieve the full text description of a
34
+ # cardinal number.
35
+ module Cardinalizers
36
+ extend Treat::Groupable
37
+ self.type = :annotator
38
+ self.targets = [:number]
39
+ end
40
+
41
+ # Retrieve the full text description of an
42
+ # ordinal number.
43
+ module Ordinalizers
44
+ extend Treat::Groupable
45
+ self.type = :annotator
46
+ self.targets = [:number]
47
+ end
48
+
49
+ # Make Inflectors categorizable.
50
+ extend Treat::Categorizable
51
+
52
+ end
@@ -0,0 +1,40 @@
1
+ # This class is a wrapper for the functions included
2
+ # in the 'linguistics' gem that allow describing a
3
+ # number in words in cardinal form.
4
+ #
5
+ # Project website: http://deveiate.org/projects/Linguistics/
6
+ module Treat::Inflectors::Cardinalizers::Linguistics
7
+
8
+ require 'treat/loaders/linguistics'
9
+
10
+ # Return the description of a cardinal number in words.
11
+ #
12
+ # Options:
13
+ #
14
+ # - :group => Controls how many numbers at a time are
15
+ # grouped together. Valid values are 0 (normal grouping),
16
+ # 1 (single-digit grouping, e.g., “one, two, three, four”),
17
+ # 2 (double-digit grouping, e.g., “twelve, thirty-four”), or
18
+ # 3 (triple-digit grouping, e.g., “one twenty-three, four”).
19
+ # - :comma => Set the character/s used to separate word groups.
20
+ # Defaults to ", ".
21
+ # - :and => Set the word and/or characters used where ' and '
22
+ # (the default) is normally used. Setting :and to ' ', for
23
+ # example, will cause 2556 to be returned as “two-thousand,
24
+ # five hundred fifty-six” instead of “two-thousand, five
25
+ # hundred and fifty-six”.
26
+ # - :zero => Set the word used to represent the numeral 0 in
27
+ # the result. 'zero' is the default.
28
+ # - :decimal => Set the translation of any decimal points in
29
+ # the number; the default is 'point'.
30
+ # - :asArray => If set to a true value, the number will be returned
31
+ # as an array of word groups instead of a String.
32
+ #
33
+ # More specific options when using :type => :ordinal:
34
+ def self.cardinal(entity, options = {})
35
+ Treat::Loaders::Linguistics.
36
+ load(entity.language).
37
+ numwords(entity.to_s, options)
38
+ end
39
+
40
+ end