treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
data/spec/document.rb ADDED
@@ -0,0 +1,93 @@
1
+ require_relative '../lib/treat'
2
+
3
+ describe Treat::Entities::Document do
4
+
5
+ describe "Extractable" do
6
+
7
+ describe "#topics" do
8
+
9
+ it "returns a list of general topics the document belongs to" do
10
+ #doc = Treat::Entities::Document.new(
11
+ #Treat.spec + 'samples/mathematicians/archimedes.abw').read(:abw)
12
+ #doc.do(:chunk, :segment, :tokenize)
13
+ #puts doc.topics.inspect
14
+ end
15
+
16
+ end
17
+
18
+ end
19
+
20
+ describe "Buildable" do
21
+
22
+ describe "#build" do
23
+
24
+ context "when supplied with a readable file name" do
25
+ it "opens the file and reads its " +
26
+ "content into a document" do
27
+ f = Treat.spec + 'samples/mathematicians/leibniz.txt'
28
+ d = Treat::Entities::Document.build(f)
29
+ d.should be_an_instance_of Treat::Entities::Document
30
+ d.to_s.index('Gottfried Leibniz').should_not eql nil
31
+ end
32
+ end
33
+
34
+ context "when supplied with a url" do
35
+ it "downloads the file the URL points to and opens " +
36
+ "a document with the contents of the file" do
37
+ url = 'http://www.rubyinside.com/nethttp-cheat-sheet-2940.html'
38
+ d = Treat::Entities::Document.build(url)
39
+ d.should be_an_instance_of Treat::Entities::Document
40
+ d.to_s.index('Rubyist').should_not eql nil
41
+ end
42
+ end
43
+
44
+ context "when called with anything else than a " +
45
+ "readable file name or url" do
46
+
47
+ it "raises an exception" do
48
+ lambda do
49
+ Treat::Entities::Document.build('nonexistent')
50
+ end.should raise_error
51
+ end
52
+
53
+ end
54
+
55
+ end
56
+
57
+ end
58
+
59
+ describe "Processable" do
60
+
61
+ describe "#chunk" do
62
+
63
+ context "when called on an HTML document" do
64
+ doc = Treat::Entities::Document.new(
65
+ Treat.spec + 'samples/mathematicians/euler.html').read(:html)
66
+ it "splits the HTML document into sections, " +
67
+ "titles, paragraphs and lists" do
68
+ doc.chunk
69
+ doc.title_count.should eql 1
70
+ doc.title.to_s.should eql "Leonhard Euler (1707-1783)"
71
+ doc.paragraph_count.should eql 5
72
+ end
73
+
74
+ end
75
+
76
+ context "when called on a text document" do
77
+
78
+ doc = Treat::Entities::Document.new(Treat.spec +
79
+ 'samples/mathematicians/leibniz.txt').read(:txt)
80
+ it "splits the document into titles and paragraphs" do
81
+ doc.chunk
82
+ doc.title_count.should eql 1
83
+ doc.title.to_s.should eql "Gottfried Leibniz (1646-1716)"
84
+ doc.paragraph_count.should eql 6
85
+ end
86
+
87
+ end
88
+
89
+ end
90
+
91
+ end
92
+
93
+ end
data/spec/entity.rb ADDED
@@ -0,0 +1,408 @@
1
+ require_relative '../lib/treat'
2
+
3
+ describe Treat::Entities::Entity do
4
+
5
+ before do
6
+
7
+ @paragraph = Treat::Entities::Paragraph.new
8
+ @sentence = Treat::Entities::Sentence.new
9
+ @noun_phrase = Treat::Entities::Phrase.new
10
+ @noun_phrase.set :tag, 'NP'
11
+ @verb_phrase = Treat::Entities::Phrase.new
12
+ @verb_phrase.set :tag, 'VP'
13
+ @adj_phrase = Treat::Entities::Phrase.new
14
+ @adj_phrase.set :tag, 'ADJP'
15
+ @det = Treat::Entities::Word.new('The')
16
+ @det.set :category, :determiner
17
+ @det.set :tag, 'DT'
18
+ @adj = Treat::Entities::Word.new('lazy')
19
+ @adj.set :category, :adjective
20
+ @adj.set :tag, 'JJ'
21
+ @noun = Treat::Entities::Word.new('fox')
22
+ @noun.set :category, :noun
23
+ @noun.set :tag, 'NN'
24
+ @aux = Treat::Entities::Word.new('is')
25
+ @aux.set :category, :verb
26
+ @aux.set :tag, 'VBZ'
27
+ @verb = Treat::Entities::Word.new('running')
28
+ @verb.set :category, :verb
29
+ @verb.set :tag, 'VBG'
30
+ @dot = Treat::Entities::Punctuation.new('.')
31
+ @dot.set :tag, '.'
32
+ @paragraph << @sentence << [@noun_phrase, @verb_phrase, @dot]
33
+ @noun_phrase << [@det, @adj_phrase, @noun]
34
+ @adj_phrase << @adj
35
+ @verb_phrase << [@aux, @verb]
36
+
37
+ end
38
+
39
+
40
+ describe "Checkable" do
41
+
42
+ describe "#check_has(feature, do_it = true) " do
43
+
44
+ it "checks if an entity has the feature; if not, " +
45
+ "calls the default worker to get the feature if do_it " +
46
+ "is set to true; if the entity doesn't have the feature " +
47
+ " and do_it is set to false, it raises an exception." do
48
+
49
+ # NOT PASSING! Dependence on caller method.
50
+
51
+ # lambda { '$'.to_entity.check_has(:tag, false) }.should raise_error Treat::Exception
52
+
53
+ end
54
+
55
+ end
56
+
57
+ end
58
+
59
+ describe "Countable" do
60
+
61
+ describe "#position" do
62
+
63
+ it "returns the position of the entity in its parent, sarting at 1" do
64
+ @noun_phrase.position.should eql 1
65
+ @det.position.should eql 1
66
+ end
67
+
68
+ end
69
+
70
+ =begin
71
+
72
+ describe "#frequency" do
73
+
74
+ it "returns the frequency of the entity's value in the root" do
75
+ @det.frequency.should eql 1
76
+ end
77
+
78
+ end
79
+
80
+
81
+ describe "#frequency_in(parent_type = nil)" do
82
+
83
+ it "returns the position of the entity's value "+
84
+ "in the supplied parent type, or root if nil" do
85
+ @noun_phrase.frequency_in(:sentence).should eql 1
86
+
87
+ end
88
+
89
+ end
90
+
91
+ =end
92
+
93
+ end
94
+
95
+ describe "Delegatable" do
96
+
97
+ describe "#self.call_worker" do
98
+
99
+ it "finds the worker class to " +
100
+ "perform a task and delegates the task to it " do
101
+
102
+ Treat::Entities::Entity.call_worker(
103
+ '$'.to_entity, :tag, :lingua,
104
+ Treat::Lexicalizers::Taggers, {}).should
105
+ eql @sentence.tag(:lingua)
106
+
107
+ end
108
+
109
+ end
110
+
111
+ end
112
+
113
+ describe "Exportable" do
114
+
115
+ context "when supplied with a classification to export" do
116
+ classification = Treat::Classification.new(:word, :tag, :is_keyword?)
117
+ it "returns a data set with the exported features" do
118
+ ds = @sentence.export(classification)
119
+ ds.classification.should eql classification
120
+ ds.labels.should eql [:tag]
121
+ ds.ids.should eql @sentence.words.map { |w| w.id }
122
+ ds.items.should eql [
123
+ ["DT", false], ["JJ", false],
124
+ ["NN", false], ["VBZ", false],
125
+ ["VBG", false]
126
+ ]
127
+ end
128
+ end
129
+
130
+ end
131
+
132
+ describe "Iterable" do
133
+
134
+ describe "#each { |child| ... }" do
135
+ it "yields each direct child of a node" do
136
+ a = []
137
+ @sentence.each do |child|
138
+ a << child
139
+ end
140
+ a.should eql [@noun_phrase, @verb_phrase, @dot]
141
+ end
142
+ end
143
+
144
+ describe "#each_entity(*entity_types) { |entity| ... }" do
145
+
146
+ context "when called with no arguments" do
147
+ it "recursively yields each element in " +
148
+ "the tree, including itself, top-down " +
149
+ "first then left to right" do
150
+
151
+ a = []
152
+ @sentence.each_entity do |e|
153
+ a << e
154
+ end
155
+
156
+ a.should eql [@sentence, @noun_phrase, @det,
157
+ @adj_phrase, @adj, @noun,
158
+ @verb_phrase, @aux, @verb, @dot]
159
+
160
+ end
161
+ end
162
+
163
+ context "when called with one or more entity " +
164
+ "types supplied as lowercase symbols" do
165
+ it "recursively yields all elements with the given type(s), "+
166
+ "including the receiver if it matches on of the types" do
167
+ a = []
168
+ @sentence.each_entity(:phrase, :punctuation) do |e|
169
+ a << e
170
+ end
171
+ a.should eql [@sentence, @noun_phrase,
172
+ @adj_phrase, @verb_phrase, @dot]
173
+ end
174
+ end
175
+
176
+ end
177
+ end
178
+
179
+ describe "Magical" do
180
+
181
+ describe "#<entity or word type> - e.g. " +
182
+ "#title, #paragraph, etc. and #adjective, #noun, etc." do
183
+
184
+ it "return the first entity with the corresponding " +
185
+ "type inside another entity, but raises an exception "+
186
+ "the type occurs more than once in the entity" do
187
+ @paragraph.sentence.should eql @sentence
188
+ end
189
+
190
+ end
191
+
192
+
193
+ describe "#<entity or word type>s - e.g. " +
194
+ "#sections, #words, etc. and #nouns, #adverbs, etc." do
195
+
196
+ it "return an array of the entities with the " +
197
+ "corresponding type in the subtree of an entity" do
198
+ @paragraph.phrases.should eql [@sentence,
199
+ @noun_phrase, @adj_phrase, @verb_phrase]
200
+ end
201
+
202
+ end
203
+
204
+ describe "#each_<entity type> - e.g. " +
205
+ "#each_sentence, #each_word, etc." do
206
+
207
+ it "yields each of the entities with the " +
208
+ "corresponding type in the subtree of an entity" do
209
+ a = []
210
+
211
+ @paragraph.each_phrase { |p| a << p }
212
+ a.should eql [@sentence, @noun_phrase,
213
+ @adj_phrase, @verb_phrase]
214
+
215
+ end
216
+
217
+ end
218
+
219
+ describe "#<entity or word type>_count - e.g. " +
220
+ "#sentence_count, #paragraph_count, etc. and " +
221
+ "#noun_count, #verb_count, etc." do
222
+
223
+ it "return the number of entities with the " +
224
+ "corresponding type inside another entity" do
225
+ @paragraph.sentence_count.should eql 1
226
+ @paragraph.phrase_count.should eql 4
227
+ end
228
+
229
+ end
230
+
231
+ describe "#<entity or word type>_with_<feature>(value) - " +
232
+ "e.g. #word_with_id(x) or #adverb_with_value('seemingly')" do
233
+
234
+ it "return the entity with the corresponding type " +
235
+ "that have [feature] set to the supplied value; raise" +
236
+ "a warning if there are many entities of that type" do
237
+ @paragraph.word_with_value('The').should eql @det
238
+ @paragraph.token_with_tag('.').should eql @dot
239
+ @sentence.phrase_with_tag('NP').should eql @noun_phrase
240
+ end
241
+
242
+ end
243
+
244
+ describe "#<entity or word type>s_with_<feature>(value) - " +
245
+ "e.g. #phrases_with_tag('NP'), #nouns_with_value('foo')" do
246
+
247
+ it "return an array of the entities with the " +
248
+ "corresponding type that have [feature] set to "+
249
+ "the supplied value" do
250
+ @paragraph.words_with_value('The').should eql [@det]
251
+ @paragraph.tokens_with_tag('.').should eql [@dot]
252
+ @sentence.phrases_with_tag('NP').should eql [@noun_phrase]
253
+ end
254
+
255
+ end
256
+
257
+ describe "#parent_<entity type> - e.g. " +
258
+ "#parent_document, #parent_collection, etc." do
259
+
260
+ it "return the first ancestor of the entity " +
261
+ "that has the supplied type, or nil if none" do
262
+ @sentence.parent_paragraph.should eql @paragraph
263
+ @adj.parent_sentence.should eql @sentence
264
+ end
265
+
266
+ end
267
+
268
+ describe "#frequency_in_<entity type> - e.g. " +
269
+ "#frequency_in_collection, #frequency_in_document, etc." do
270
+
271
+ it "return the frequency of this entity's value " +
272
+ "in the parent entity with the corresponding type" do
273
+ @adj.frequency_in_sentence.should eql 1
274
+ end
275
+
276
+ end
277
+
278
+ end
279
+
280
+ describe "Stringable" do
281
+
282
+ describe "#to_string" do
283
+ it "returns the true text value of the entity " +
284
+ "or an empty string if it has none" do
285
+ @paragraph.to_string.should eql ''
286
+ @noun.to_string.should eql 'fox'
287
+ end
288
+ end
289
+
290
+ describe "#to_s" do
291
+ it "returns the string value of the " +
292
+ "entity or its full subtree" do
293
+ @paragraph.to_s.should
294
+ eql 'The lazy fox is running.'
295
+ @noun.to_s.should eql 'fox'
296
+ end
297
+ end
298
+
299
+ describe "#inspect" do
300
+ it "returns an informative string " +
301
+ "concerning the entity" do
302
+ @paragraph.inspect.should
303
+ be_an_instance_of String
304
+ end
305
+ end
306
+
307
+ describe "#short_value" do
308
+ it "returns a shortened version of the " +
309
+ "entity's string value" do
310
+ @paragraph.short_value.should
311
+ eql 'The lazy fox is running.'
312
+ end
313
+ end
314
+
315
+ end
316
+
317
+ describe "Formatters" do
318
+
319
+ describe "#serialize" do
320
+
321
+ before :all do
322
+ @serializers = [:xml, :yaml] # Treat::Languages::All::Serializers
323
+ @txt = "The story of the fox. The quick brown fox jumped over the lazy dog."
324
+ end
325
+
326
+ context "when called with a file to save to" do
327
+
328
+ it "serializes a document to the supplied format" do
329
+
330
+ @serializers.each do |ser|
331
+ f = Treat.spec + 'test.' + ser.to_s
332
+ s = Treat::Entities::Paragraph.new(@txt)
333
+ s.do(:segment, :tokenize)
334
+ s.serialize(ser, :file => f)
335
+ d = Treat::Entities::Document.build(f)
336
+ d.to_s.should eql @txt
337
+ d.size.should eql s.size
338
+ d.token_count.should eql s.token_count
339
+ d.tokens[0].id.should eql s.tokens[0].id
340
+ File.delete(f)
341
+ end
342
+
343
+ end
344
+
345
+ end
346
+
347
+ end
348
+
349
+ describe "#unserialize" do
350
+
351
+ end
352
+
353
+ end
354
+
355
+ describe "Extractors" do
356
+
357
+ describe "#language" do
358
+ context "when language detection is disabled " +
359
+ "(Treat.detect_language is set to false)" do
360
+ it "returns the default language (Treat.default_language)" do
361
+ Treat.detect_language = false
362
+ Treat.default_language = :test
363
+ s = 'Les grands hommes ne sont pas toujours grands, dit un jour Napoleon.'
364
+ s.language.should eql :test
365
+ Treat.default_language = :eng
366
+ end
367
+ end
368
+
369
+ context "when language detection is enabled " +
370
+ "(Treat.detect_language is set to true)" do
371
+
372
+ it "guesses the language of the entity" do
373
+
374
+ Treat.detect_language = true
375
+ a = 'I want to know God\'s thoughts; the rest are details. - Albert Einstein'
376
+ b = 'El mundo de hoy no tiene sentido, asi que por que deberia pintar cuadros que lo tuvieran? - Pablo Picasso'
377
+ c = 'Un bon Allemand ne peut souffrir les Francais, mais il boit volontiers les vins de France. - Goethe'
378
+ d = 'Wir haben die Kunst, damit wir nicht an der Wahrheit zugrunde gehen. - Friedrich Nietzsche'
379
+ a.language.should eql :eng
380
+ b.language.should eql :spa
381
+ c.language.should eql :fre
382
+ d.language.should eql :ger
383
+
384
+ # Reset default
385
+ Treat.detect_language = false
386
+ end
387
+
388
+ end
389
+
390
+ end
391
+
392
+ end
393
+
394
+ end
395
+
396
+
397
+ =begin
398
+
399
+
400
+ def test_visualizers
401
+ assert_nothing_raised { @doc.visualize(:tree) }
402
+ # assert_nothing_raised { @doc.visualize(:html) }
403
+ assert_nothing_raised { @doc.visualize(:dot) }
404
+ assert_nothing_raised { @doc.visualize(:short_value) }
405
+ assert_nothing_raised { @sentence.visualize(:standoff) }
406
+ end
407
+
408
+ =end