treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
@@ -1,77 +0,0 @@
1
- module Treat
2
- module Processors
3
- module Tokenizers
4
- # A native rule-based tokenizer based on the one
5
- # developed by Robert MacIntyre in 1995 for the Penn
6
- # Treebank project. This tokenizer follows the
7
- # conventions used by the Penn Treebank.
8
- #
9
- # Original script:
10
- # http://www.cis.upenn.edu/~treebank/tokenizer.sed
11
- #
12
- # Copyright (c) 2004 UTIYAMA Masao <mutiyama@nict.go.jp>
13
- # All rights reserved. This program is free software;
14
- # you can redistribute it and/or modify it under the
15
- # same terms as Ruby itself.
16
- class Macintyre
17
- # Tokenize the entity using a native rule-based algorithm.
18
- def self.tokenize(entity, options = {})
19
- if entity.has_children?
20
- raise Treat::Exception,
21
- 'Cannot tokenize a Phrase that already has children.'
22
- end
23
- chunks = split(entity.to_s)
24
- chunks.each do |chunk|
25
- next if chunk =~ /([[:space:]]+)/
26
- entity << Treat::Entities::Token.from_string(chunk)
27
- end
28
- end
29
- # Helper method to split the string into tokens.
30
- def self.split(string)
31
- s = " " + string + " "
32
- s.gsub!(/\s+/," ")
33
- s.gsub!(/(\s+)''/,'\1"')
34
- s.gsub!(/(\s+)``/,'\1"')
35
- s.gsub!(/''(\s+)/,'"\1')
36
- s.gsub!(/``(\s+)/,'"\1')
37
- s.gsub!(/ (['`]+)([^0-9].+) /,' \1 \2 ')
38
- s.gsub!(/([ (\[{<])"/,'\1 `` ')
39
- s.gsub!(/\.\.\./,' ... ')
40
- s.gsub!(/[,;:@\#$%&]/,' \& ')
41
- s.gsub!(/([^.])([.])([\])}>"']*)[ ]*$/,'\1 \2\3 ')
42
- s.gsub!(/[?!]/,' \& ')
43
- s.gsub!(/[\]\[(){}<>]/,' \& ')
44
- s.gsub!(/--/,' -- ')
45
- s.sub!(/$/,' ')
46
- s.sub!(/^/,' ')
47
- s.gsub!(/"/,' \'\' ')
48
- s.gsub!(/([^'])' /,'\1 \' ')
49
- s.gsub!(/'([sSmMdD]) /,' \'\1 ')
50
- s.gsub!(/'ll /,' \'ll ')
51
- s.gsub!(/'re /,' \'re ')
52
- s.gsub!(/'ve /,' \'ve ')
53
- s.gsub!(/n't /,' n\'t ')
54
- s.gsub!(/'LL /,' \'LL ')
55
- s.gsub!(/'RE /,' \'RE ')
56
- s.gsub!(/'VE /,' \'VE ')
57
- s.gsub!(/N'T /,' N\'T ')
58
- s.gsub!(/ ([Cc])annot /,' \1an not ')
59
- s.gsub!(/ ([Dd])'ye /,' \1\' ye ')
60
- s.gsub!(/ ([Gg])imme /,' \1im me ')
61
- s.gsub!(/ ([Gg])onna /,' \1on na ')
62
- s.gsub!(/ ([Gg])otta /,' \1ot ta ')
63
- s.gsub!(/ ([Ll])emme /,' \1em me ')
64
- s.gsub!(/ ([Mm])ore'n /,' \1ore \'n ')
65
- s.gsub!(/ '([Tt])is /,' \'\1 is ')
66
- s.gsub!(/ '([Tt])was /,' \'\1 was ')
67
- s.gsub!(/ ([Ww])anna /,' \1an na ')
68
- while s.sub!(/(\s)([0-9]+) , ([0-9]+)(\s)/, '\1\2,\3\4'); end
69
- s.gsub!(/\//, ' / ')
70
- s.gsub!(/\s+/,' ')
71
- s.strip!
72
- s.split(/\s+/)
73
- end
74
- end
75
- end
76
- end
77
- end
@@ -1,30 +0,0 @@
1
- module Treat
2
- module Processors
3
- module Tokenizers
4
- # An adapter for the 'tokenizer' gem, which performs
5
- # rule-based tokenizing of texts in English, German
6
- # or French.
7
- class Multilingual
8
- # Hold one tokenizer per language.
9
- @@tokenizers = {}
10
- # Require the 'tokenizer' gem.
11
- silence_warnings { require 'tokenizer' }
12
- # Perform the tokenization of English, German or French text.
13
- # Options:
14
- # :language => (Symbol) Force a language for the tokenizer.
15
- def self.tokenize(entity, options = {})
16
- lang = options[:language] ? options[:language] : entity.language
17
- lang = Treat::Languages.code(lang, 1)
18
- if @@tokenizers[lang].nil?
19
- @@tokenizers[lang] = ::Tokenizer::Tokenizer.new(lang)
20
- end
21
- tokens = @@tokenizers[lang].tokenize(entity.to_s)
22
- tokens.each do |token|
23
- next if token =~ /([[:space:]]+)/
24
- entity << Treat::Entities::Token.from_string(token)
25
- end
26
- end
27
- end
28
- end
29
- end
30
- end
@@ -1,28 +0,0 @@
1
- module Treat
2
- module Registrable
3
- # Registers a token in the @token_registry hash.
4
- def register_token(token)
5
- @token_registry ||= {:value => {}, :id => {}}
6
- @token_registry[:id][token.id] = token
7
- v = token.to_s.downcase
8
- @token_registry[:value][v] ||= []
9
- @token_registry[:value][v] << token
10
- @parent.register_token(token) if has_parent?
11
- end
12
- # Find the token registry, by default the one
13
- # in the root node.
14
- def token_registry(type = nil)
15
- if (type == nil && is_root?) || type == self.type
16
- @token_registry ||= {:value => {}, :id => {}}
17
- return @token_registry
18
- else
19
- if has_parent?
20
- @parent.token_registry(type)
21
- else
22
- @token_registry ||= {:value => {}, :id => {}}
23
- @token_registry
24
- end
25
- end
26
- end
27
- end
28
- end
data/lib/treat/sugar.rb DELETED
@@ -1,50 +0,0 @@
1
- module Treat
2
- # This module provides syntactic sugar in the following manner:
3
- # all entities found under Treat::Entities will be made
4
- # available within the global namespace. For example,
5
- # Treat::Entities::Word can now be referred to as simply 'Word'.
6
- module Sugar
7
- # Installs syntactic sugar.
8
- def sweeten!
9
- return if @@sweetened
10
- @@sweetened = true
11
- each_entity_class do |type, klass|
12
- unless type == :Symbol
13
- Object.class_eval do
14
- define_method(type) do |value='',id=nil|
15
- klass.build(value, id)
16
- end
17
- end
18
- end
19
- end
20
- end
21
-
22
- # Uninstalls syntactic sugar.
23
- def unsweeten!
24
- return unless @@sweetened
25
- @@sweetened = false
26
- each_entity_class do |type, klass|
27
- unless type == :Symbol
28
- Object.class_eval do
29
- remove_method(type)
30
- end
31
- end
32
- end
33
- end
34
-
35
- # Boolean - whether syntactic sugar is
36
- # enabled or not.
37
- def sweetened?; @@sweetened; end
38
- # Syntactic sugar is disabled by default.
39
- @@sweetened = false
40
- private
41
- # Helper method, yields each entity type and class.
42
- def each_entity_class
43
- Treat::Entities.list.each do |entity_type|
44
- type = cc(entity_type).intern
45
- klass = Treat::Entities.const_get(type, klass)
46
- yield type, klass
47
- end
48
- end
49
- end
50
- end
@@ -1,29 +0,0 @@
1
- module Treat
2
- module Viewable
3
- # Return the entity's string value in plain text format.
4
- def to_string; @value; end
5
- # An alias for #to_string.
6
- def to_s; visualize(:txt); end
7
- alias :to_str :to_s
8
- # Return a shortened value of the entity's string value using [...].
9
- def short_value(ml = 6); visualize(:short_value, :max_length => ml); end
10
- # Return an informative string representation of the entity.
11
- def inspect
12
- s = "#{cl(self.class)} (#{@id.to_s})"
13
- if caller_method(2) == :inspect
14
- @id.to_s
15
- else
16
- dependencies = []
17
- @dependencies.each do |dependency|
18
- dependencies << "#{dependency.target}#{dependency.type}"
19
- end
20
- s += " | #{short_value.inspect}" +
21
- " | #{@features.inspect}" +
22
- " | { #{dependencies.join(', ')} }"
23
- end
24
- s
25
- end
26
- # Print out an ASCII representation of the tree.
27
- def print_tree; puts visualize(:tree); end
28
- end
29
- end
@@ -1,28 +0,0 @@
1
- module Treat
2
- # Make a tree visitable by implementing the method #accept.
3
- module Visitable
4
- # Accept a visitor implemented by klass, which is
5
- # found in the supplied group, and call method on it.
6
- def accept(group, klass, method, options)
7
- if group.has_target?(self.class)
8
- if group.type == :transformer
9
- if has_children?
10
- @children.each do |entity|
11
- if group.has_target?(entity.class) && entity.id != id
12
- entity.accept(group, klass, method, options)
13
- end
14
- end
15
- else
16
- klass.send(method, self, options)
17
- end
18
- return self
19
- else
20
- return klass.send(method, self, options)
21
- end
22
- else
23
- raise Treat::Exception,
24
- "This type of visitor cannot visit a #{self.class}."
25
- end
26
- end
27
- end
28
- end
data/test/profile.rb DELETED
@@ -1,2 +0,0 @@
1
- require 'unprof'
2
- require 'tests'
data/test/tc_entity.rb DELETED
@@ -1,117 +0,0 @@
1
- module Treat
2
- module Tests
3
- class TestEntity < Test::Unit::TestCase
4
- def setup
5
- @section = Treat::Entities::Section.new
6
- @sentence = Treat::Entities::Sentence.new
7
- @noun_cons = Treat::Entities::Phrase.new
8
- @noun_cons.set :tag, 'NP'
9
- @verb_cons = Treat::Entities::Phrase.new
10
- @verb_cons.set :tag, 'VP'
11
- @adj_cons = Treat::Entities::Phrase.new
12
- @adj_cons.set :tag, 'ADJP'
13
- @det = Treat::Entities::Word.new('The')
14
- @det.set :category, :determiner
15
- @det.set :tag, 'DT'
16
- @det.set :tag_set, :penn
17
- @adj = Treat::Entities::Word.new('lazy')
18
- @adj.set :category, :adjective
19
- @adj.set :tag, 'JJ'
20
- @adj.set :tag_set, :penn
21
- @noun = Treat::Entities::Word.new('fox')
22
- @noun.set :category, :noun
23
- @noun.set :tag, 'NN'
24
- @noun.set :tag_set, :penn
25
- @aux = Treat::Entities::Word.new('is')
26
- @aux.set :category, :verb
27
- @aux.set :tag, 'VBZ'
28
- @aux.set :tag_set, :penn
29
- @verb = Treat::Entities::Word.new('running')
30
- @verb.set :category, :verb
31
- @verb.set :tag, 'VBG'
32
- @verb.set :tag_set, :penn
33
- @dot = Treat::Entities::Punctuation.new('.')
34
- @section << @sentence << [@noun_cons, @verb_cons, @dot]
35
- @noun_cons << [@det, @adj_cons, @noun]
36
- @adj_cons << @adj
37
- @verb_cons << [@aux, @verb]
38
- end
39
-
40
- def test_viewable
41
- s = 'Happiness is not an ideal of reason, but of imagination.'.tokenize
42
- assert_nothing_raised do
43
- # Return the string value of the sentence.
44
- s.to_s
45
- # Return a debug description of the sentence.
46
- s.inspect
47
- # Return a shortened version of the Sentence with [...]
48
- s.short_value
49
- end
50
- end
51
-
52
- def test_registrable
53
- assert_equal @section.token_registry, @verb.token_registry
54
- assert_equal @noun, @section.token_registry[:id][@noun.id]
55
- assert_equal [@noun], @section.token_registry[:value][@noun.value]
56
- end
57
-
58
- def test_delegatable_visitable
59
- assert_raise(Treat::Exception) do
60
- @section.encoding(:nonexistent)
61
- end
62
- assert_nothing_raised do
63
- @section.language
64
- end
65
- end
66
-
67
- def test_type
68
- assert_equal :section, @section.type
69
- end
70
-
71
- def test_printers
72
- assert_nothing_raised do
73
- @section.to_s
74
- @section.to_string
75
- @section.short_value
76
- @section.inspect
77
- end
78
- end
79
-
80
- def test_magic_methods
81
-
82
- assert_equal true, @sentence.is_sentence?
83
- assert_equal true, @noun.is_noun?
84
-
85
- assert_equal @sentence, @section.sentence
86
- assert_equal [@sentence], @section.sentences
87
- assert_equal 1, @section.sentence_count
88
-
89
- assert_equal [@det], @section.words_with_value('The')
90
- assert_equal [@verb], @section.words_with_tag('VBG')
91
-
92
- assert_equal @noun, @section.noun
93
- assert_equal [@aux, @verb], @section.verbs
94
- assert_equal 6, @section.token_count
95
-
96
- @section.each_sentence do |s|
97
- assert_equal @sentence, s
98
- end
99
- @section.each_noun do |n|
100
- assert_equal @noun, n
101
- end
102
- @section.each_with_value('The') do |x|
103
- assert_equal @det, x
104
- end
105
-
106
- assert_equal @sentence, @noun.parent_sentence
107
- end
108
-
109
- def test_features
110
- @verb.set :test, :test
111
- assert_equal :test, @verb.test
112
- assert_raise(Treat::Exception) { @verb.nonexistent }
113
- end
114
-
115
- end
116
- end
117
- end
@@ -1,73 +0,0 @@
1
- # encoding: utf-8
2
- module Treat
3
- module Tests
4
- class TestExtractors < Test::Unit::TestCase
5
-
6
- def setup
7
- @time = Treat::Tests::English::Time
8
- @date = Treat::Tests::English::Date
9
- @doc = Treat::Tests::English::LongDoc
10
- @word = Treat::Tests::English::Word
11
- @col = Treat::Tests::English::Collection
12
- end
13
-
14
- def test_time
15
- assert_nothing_raised { @time.time(:nickel) }
16
- end
17
-
18
- def test_date
19
- assert_equal 2011, @date.date(:chronic).year
20
- assert_equal 2011, @date.date(:ruby).year
21
- end
22
-
23
- def test_topic_words
24
- assert_nothing_raised { @col.topic_words(:lda) }
25
- end
26
-
27
- def test_named_entity
28
- p = 'Angela Merkel and Nicolas Sarkozy were the first ones to board the p'
29
- assert_nothing_raised { @doc.named_entity(:stanford) }
30
- end
31
-
32
- def test_keywords
33
- assert_nothing_raised do
34
- topics = @col.topic_words(:lda)
35
- @doc.keywords(:topics_frequency, :topic_words => topics)
36
- end
37
- end
38
-
39
- def test_topics
40
- assert_nothing_raised { @doc.topics(:reuters) }
41
- end
42
-
43
- def test_statistics
44
- @doc.chunk.segment(:tactful).tokenize
45
- assert_equal 1, @word.frequency_in(:document)
46
- assert_nothing_raised { @word.tf_idf ; puts @word.tf_idf }
47
- # assert_nothing_raised { @doc.statistics(:position_in) }
48
- # assert_nothing_raised { @doc.statistics(:transition_matrix) }
49
- # assert_nothing_raised { @doc.statistics(:transition_probability) }
50
- end
51
-
52
- def test_language
53
- assert_equal Treat.default_language, @doc.language
54
- Treat.detect_language = true
55
- assert_equal :eng, @doc.language
56
-
57
- a = 'I want to know God\'s thoughts; the rest are details. - Albert Einstein'
58
- b = 'El mundo de hoy no tiene sentido, así que ¿por qué debería pintar cuadros que lo tuvieran? - Pablo Picasso'
59
- c = 'Un bon Allemand ne peut souffrir les Français, mais il boit volontiers les vins de France. - Goethe'
60
- d = 'Wir haben die Kunst, damit wir nicht an der Wahrheit zugrunde gehen. - Friedrich Nietzsche'
61
-
62
- assert_equal :eng, a.language
63
- assert_equal :spa, b.language
64
- assert_equal :fre, c.language
65
- assert_equal :ger, d.language
66
-
67
- # Reset defaults
68
- Treat.detect_language = false
69
- end
70
-
71
- end
72
- end
73
- end