treat 1.0.6 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (210)
  1. data/LICENSE +2 -4
  2. data/README.md +13 -12
  3. data/bin/MANIFEST +1 -0
  4. data/bin/stanford/bridge.jar +0 -0
  5. data/bin/stanford/joda-time.jar +0 -0
  6. data/bin/stanford/stanford-corenlp.jar +0 -0
  7. data/bin/stanford/stanford-parser.jar +0 -0
  8. data/bin/stanford/xom.jar +0 -0
  9. data/files/{www.economist.com/21552208 → 21552208.html} +86 -89
  10. data/files/{guides.rubyonrails.org/3_2_release_notes.html → 3_2_release_notes.html} +0 -0
  11. data/files/{INFO → MANIFEST} +0 -0
  12. data/files/{www.rubyinside.com/nethttp-cheat-sheet-2940.html → nethttp-cheat-sheet-2940.html} +12 -16
  13. data/files/weather-central-canada-heat-wave.html +1370 -0
  14. data/lib/treat/config/core/acronyms.rb +4 -0
  15. data/lib/treat/config/core/encodings.rb +8 -0
  16. data/lib/treat/config/core/entities.rb +2 -0
  17. data/lib/treat/config/core/language.rb +3 -0
  18. data/lib/treat/config/core/paths.rb +8 -0
  19. data/lib/treat/config/core/syntax.rb +1 -0
  20. data/lib/treat/config/core/verbosity.rb +1 -0
  21. data/lib/treat/config/databases/mongo.rb +3 -0
  22. data/lib/treat/config/languages/agnostic.rb +34 -0
  23. data/lib/treat/config/languages/arabic.rb +13 -0
  24. data/lib/treat/config/languages/chinese.rb +13 -0
  25. data/lib/treat/config/languages/dutch.rb +12 -0
  26. data/lib/treat/config/languages/english.rb +60 -0
  27. data/lib/treat/config/languages/french.rb +18 -0
  28. data/lib/treat/config/languages/german.rb +18 -0
  29. data/lib/treat/config/languages/greek.rb +12 -0
  30. data/lib/treat/config/languages/italian.rb +12 -0
  31. data/lib/treat/config/languages/polish.rb +12 -0
  32. data/lib/treat/config/languages/portuguese.rb +12 -0
  33. data/lib/treat/config/languages/russian.rb +12 -0
  34. data/lib/treat/config/languages/spanish.rb +12 -0
  35. data/lib/treat/config/languages/swedish.rb +12 -0
  36. data/lib/treat/config/libraries/stanford.rb +1 -0
  37. data/lib/treat/config/linguistics/categories.rb +4 -0
  38. data/lib/treat/config/linguistics/punctuation.rb +33 -0
  39. data/lib/treat/config/tags/aligned.rb +221 -0
  40. data/lib/treat/config/tags/enju.rb +71 -0
  41. data/lib/treat/config/tags/paris7.rb +17 -0
  42. data/lib/treat/config/tags/ptb.rb +15 -0
  43. data/lib/treat/config/workers/extractors.rb +39 -0
  44. data/lib/treat/config/workers/formatters.rb +20 -0
  45. data/lib/treat/config/workers/inflectors.rb +27 -0
  46. data/lib/treat/config/workers/learners.rb +6 -0
  47. data/lib/treat/config/workers/lexicalizers.rb +18 -0
  48. data/lib/treat/config/workers/list.rb +1 -0
  49. data/lib/treat/config/workers/processors.rb +19 -0
  50. data/lib/treat/config/workers/retrievers.rb +12 -0
  51. data/lib/treat/config.rb +125 -0
  52. data/lib/treat/{classification.rb → core/classification.rb} +1 -1
  53. data/lib/treat/{data_set.rb → core/data_set.rb} +1 -4
  54. data/lib/treat/{tree.rb → core/node.rb} +5 -5
  55. data/lib/treat/core/server.rb +3 -0
  56. data/lib/treat/core.rb +5 -0
  57. data/lib/treat/entities/abilities/buildable.rb +61 -56
  58. data/lib/treat/entities/abilities/checkable.rb +2 -2
  59. data/lib/treat/entities/abilities/comparable.rb +21 -0
  60. data/lib/treat/entities/abilities/copyable.rb +2 -0
  61. data/lib/treat/entities/abilities/countable.rb +1 -1
  62. data/lib/treat/entities/abilities/debuggable.rb +1 -1
  63. data/lib/treat/entities/abilities/delegatable.rb +42 -36
  64. data/lib/treat/entities/abilities/doable.rb +2 -2
  65. data/lib/treat/entities/abilities/exportable.rb +1 -1
  66. data/lib/treat/entities/abilities/iterable.rb +21 -33
  67. data/lib/treat/entities/abilities/magical.rb +8 -8
  68. data/lib/treat/entities/abilities/registrable.rb +0 -38
  69. data/lib/treat/entities/abilities/stringable.rb +19 -19
  70. data/lib/treat/entities/collection.rb +31 -0
  71. data/lib/treat/entities/document.rb +10 -0
  72. data/lib/treat/entities/entity.rb +18 -13
  73. data/lib/treat/entities/group.rb +15 -0
  74. data/lib/treat/entities/section.rb +13 -0
  75. data/lib/treat/entities/token.rb +35 -0
  76. data/lib/treat/entities/zone.rb +11 -0
  77. data/lib/treat/entities.rb +5 -75
  78. data/lib/treat/helpers/didyoumean.rb +57 -0
  79. data/lib/treat/helpers/escaping.rb +15 -0
  80. data/lib/treat/helpers/formatting.rb +41 -0
  81. data/lib/treat/helpers/platform.rb +15 -0
  82. data/lib/treat/helpers/reflection.rb +17 -0
  83. data/lib/treat/helpers/temporary.rb +27 -0
  84. data/lib/treat/helpers/verbosity.rb +19 -0
  85. data/lib/treat/helpers.rb +5 -0
  86. data/lib/treat/installer.rb +46 -165
  87. data/lib/treat/loaders/linguistics.rb +22 -27
  88. data/lib/treat/loaders/stanford.rb +23 -41
  89. data/lib/treat/loaders.rb +10 -0
  90. data/lib/treat/proxies.rb +73 -24
  91. data/lib/treat/version.rb +3 -0
  92. data/lib/treat/{extractors → workers/extractors}/keywords/tf_idf.rb +1 -1
  93. data/lib/treat/{extractors → workers/extractors}/language/what_language.rb +11 -4
  94. data/lib/treat/{extractors → workers/extractors}/name_tag/stanford.rb +3 -4
  95. data/lib/treat/{extractors → workers/extractors}/tf_idf/native.rb +4 -5
  96. data/lib/treat/{extractors → workers/extractors}/time/chronic.rb +1 -1
  97. data/lib/treat/{extractors → workers/extractors}/time/nickel.rb +1 -1
  98. data/lib/treat/{extractors → workers/extractors}/time/ruby.rb +1 -1
  99. data/lib/treat/{extractors → workers/extractors}/topic_words/lda.rb +1 -1
  100. data/lib/treat/{extractors → workers/extractors}/topics/reuters.rb +4 -4
  101. data/lib/treat/{formatters → workers/formatters}/readers/abw.rb +2 -2
  102. data/lib/treat/{formatters → workers/formatters}/readers/autoselect.rb +10 -3
  103. data/lib/treat/{formatters → workers/formatters}/readers/doc.rb +2 -2
  104. data/lib/treat/{formatters → workers/formatters}/readers/html.rb +4 -4
  105. data/lib/treat/{formatters → workers/formatters}/readers/image.rb +2 -2
  106. data/lib/treat/{formatters → workers/formatters}/readers/odt.rb +2 -2
  107. data/lib/treat/{formatters → workers/formatters}/readers/pdf.rb +2 -2
  108. data/lib/treat/{formatters → workers/formatters}/readers/txt.rb +2 -2
  109. data/lib/treat/{formatters → workers/formatters}/readers/xml.rb +2 -2
  110. data/lib/treat/workers/formatters/serializers/mongo.rb +60 -0
  111. data/lib/treat/{formatters → workers/formatters}/serializers/xml.rb +1 -2
  112. data/lib/treat/{formatters → workers/formatters}/serializers/yaml.rb +1 -1
  113. data/lib/treat/{formatters → workers/formatters}/unserializers/autoselect.rb +3 -1
  114. data/lib/treat/workers/formatters/unserializers/mongo.rb +80 -0
  115. data/lib/treat/{formatters → workers/formatters}/unserializers/xml.rb +2 -2
  116. data/lib/treat/{formatters → workers/formatters}/unserializers/yaml.rb +1 -1
  117. data/lib/treat/{formatters → workers/formatters}/visualizers/dot.rb +1 -1
  118. data/lib/treat/{formatters → workers/formatters}/visualizers/standoff.rb +2 -3
  119. data/lib/treat/{formatters → workers/formatters}/visualizers/tree.rb +1 -1
  120. data/lib/treat/{groupable.rb → workers/group.rb} +6 -12
  121. data/lib/treat/{inflectors → workers/inflectors}/cardinalizers/linguistics.rb +7 -2
  122. data/lib/treat/{inflectors → workers/inflectors}/conjugators/linguistics.rb +11 -11
  123. data/lib/treat/{inflectors → workers/inflectors}/declensors/active_support.rb +2 -2
  124. data/lib/treat/{inflectors → workers/inflectors}/declensors/english/inflect.rb +1 -1
  125. data/lib/treat/{inflectors → workers/inflectors}/declensors/english.rb +2 -2
  126. data/lib/treat/{inflectors → workers/inflectors}/declensors/linguistics.rb +4 -4
  127. data/lib/treat/{inflectors → workers/inflectors}/ordinalizers/linguistics.rb +8 -2
  128. data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter.rb +2 -2
  129. data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter_c.rb +1 -1
  130. data/lib/treat/{inflectors → workers/inflectors}/stemmers/uea.rb +1 -1
  131. data/lib/treat/{ai → workers/learners}/classifiers/id3.rb +1 -1
  132. data/lib/treat/{ai → workers/learners}/classifiers/mlp.rb +1 -1
  133. data/lib/treat/{lexicalizers → workers/lexicalizers}/categorizers/from_tag.rb +9 -9
  134. data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet/synset.rb +2 -2
  135. data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet.rb +4 -4
  136. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill/patch.rb +2 -2
  137. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill.rb +2 -8
  138. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/lingua.rb +1 -6
  139. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/stanford.rb +31 -42
  140. data/lib/treat/workers/processors/chunkers/autoselect.rb +19 -0
  141. data/lib/treat/{processors → workers/processors}/chunkers/html.rb +4 -3
  142. data/lib/treat/workers/processors/chunkers/txt.rb +32 -0
  143. data/lib/treat/{processors → workers/processors}/parsers/enju.rb +3 -3
  144. data/lib/treat/{processors → workers/processors}/parsers/stanford.rb +6 -8
  145. data/lib/treat/{processors → workers/processors}/segmenters/punkt.rb +6 -10
  146. data/lib/treat/{processors → workers/processors}/segmenters/stanford.rb +2 -2
  147. data/lib/treat/{processors → workers/processors}/segmenters/tactful.rb +3 -6
  148. data/lib/treat/{processors → workers/processors}/tokenizers/ptb.rb +6 -5
  149. data/lib/treat/{processors → workers/processors}/tokenizers/punkt.rb +1 -1
  150. data/lib/treat/{processors → workers/processors}/tokenizers/stanford.rb +1 -1
  151. data/lib/treat/{processors → workers/processors}/tokenizers/tactful.rb +3 -5
  152. data/lib/treat/{retrievers → workers/retrievers}/indexers/ferret.rb +1 -1
  153. data/lib/treat/{retrievers → workers/retrievers}/searchers/ferret.rb +1 -1
  154. data/lib/treat/workers.rb +96 -0
  155. data/lib/treat.rb +23 -49
  156. data/spec/collection.rb +4 -4
  157. data/spec/document.rb +5 -5
  158. data/spec/entity.rb +33 -32
  159. data/spec/{tree.rb → node.rb} +5 -5
  160. data/spec/phrase.rb +5 -39
  161. data/spec/sandbox.rb +212 -6
  162. data/spec/token.rb +12 -9
  163. data/spec/treat.rb +12 -9
  164. data/spec/word.rb +10 -9
  165. data/spec/zone.rb +6 -2
  166. data/tmp/{INFO → MANIFEST} +0 -0
  167. data/tmp/english.yaml +10340 -0
  168. metadata +149 -139
  169. data/lib/treat/ai.rb +0 -12
  170. data/lib/treat/categories.rb +0 -90
  171. data/lib/treat/categorizable.rb +0 -44
  172. data/lib/treat/configurable.rb +0 -115
  173. data/lib/treat/dependencies.rb +0 -25
  174. data/lib/treat/downloader.rb +0 -87
  175. data/lib/treat/entities/abilities.rb +0 -10
  176. data/lib/treat/entities/entities.rb +0 -102
  177. data/lib/treat/exception.rb +0 -7
  178. data/lib/treat/extractors.rb +0 -79
  179. data/lib/treat/formatters/serializers/mongo.rb +0 -64
  180. data/lib/treat/formatters.rb +0 -41
  181. data/lib/treat/helpers/decimal_point_escaper.rb +0 -22
  182. data/lib/treat/inflectors.rb +0 -52
  183. data/lib/treat/kernel.rb +0 -208
  184. data/lib/treat/languages/arabic.rb +0 -16
  185. data/lib/treat/languages/chinese.rb +0 -16
  186. data/lib/treat/languages/dutch.rb +0 -16
  187. data/lib/treat/languages/english.rb +0 -63
  188. data/lib/treat/languages/french.rb +0 -20
  189. data/lib/treat/languages/german.rb +0 -20
  190. data/lib/treat/languages/greek.rb +0 -16
  191. data/lib/treat/languages/italian.rb +0 -17
  192. data/lib/treat/languages/language.rb +0 -10
  193. data/lib/treat/languages/list.txt +0 -504
  194. data/lib/treat/languages/polish.rb +0 -16
  195. data/lib/treat/languages/portuguese.rb +0 -16
  196. data/lib/treat/languages/russian.rb +0 -16
  197. data/lib/treat/languages/spanish.rb +0 -16
  198. data/lib/treat/languages/swedish.rb +0 -16
  199. data/lib/treat/languages.rb +0 -132
  200. data/lib/treat/lexicalizers.rb +0 -37
  201. data/lib/treat/object.rb +0 -7
  202. data/lib/treat/processors/chunkers/autoselect.rb +0 -16
  203. data/lib/treat/processors/chunkers/txt.rb +0 -21
  204. data/lib/treat/processors.rb +0 -38
  205. data/lib/treat/retrievers.rb +0 -27
  206. data/lib/treat/server.rb +0 -26
  207. data/lib/treat/universalisation/encodings.rb +0 -12
  208. data/lib/treat/universalisation/tags.rb +0 -453
  209. data/lib/treat/universalisation.rb +0 -9
  210. data/spec/languages.rb +0 -25
data/spec/entity.rb CHANGED
@@ -13,19 +13,19 @@ describe Treat::Entities::Entity do
13
13
  @adj_phrase = Treat::Entities::Phrase.new
14
14
  @adj_phrase.set :tag, 'ADJP'
15
15
  @det = Treat::Entities::Word.new('The')
16
- @det.set :category, :determiner
16
+ @det.set :category, 'determiner'
17
17
  @det.set :tag, 'DT'
18
18
  @adj = Treat::Entities::Word.new('lazy')
19
- @adj.set :category, :adjective
19
+ @adj.set :category, 'adjective'
20
20
  @adj.set :tag, 'JJ'
21
21
  @noun = Treat::Entities::Word.new('fox')
22
- @noun.set :category, :noun
22
+ @noun.set :category, 'noun'
23
23
  @noun.set :tag, 'NN'
24
24
  @aux = Treat::Entities::Word.new('is')
25
- @aux.set :category, :verb
25
+ @aux.set :category, 'verb'
26
26
  @aux.set :tag, 'VBZ'
27
27
  @verb = Treat::Entities::Word.new('running')
28
- @verb.set :category, :verb
28
+ @verb.set :category, 'verb'
29
29
  @verb.set :tag, 'VBG'
30
30
  @dot = Treat::Entities::Punctuation.new('.')
31
31
  @dot.set :tag, '.'
@@ -60,9 +60,9 @@ describe Treat::Entities::Entity do
60
60
 
61
61
  describe "#position" do
62
62
 
63
- it "returns the position of the entity in its parent, sarting at 1" do
64
- @noun_phrase.position.should eql 1
65
- @det.position.should eql 1
63
+ it "returns the position of the entity in its parent, sarting at 0" do
64
+ @noun_phrase.position.should eql 0
65
+ @det.position.should eql 0
66
66
  end
67
67
 
68
68
  end
@@ -101,8 +101,8 @@ describe Treat::Entities::Entity do
101
101
 
102
102
  Treat::Entities::Entity.call_worker(
103
103
  '$'.to_entity, :tag, :lingua,
104
- Treat::Lexicalizers::Taggers, {}).should
105
- eql @sentence.tag(:lingua)
104
+ Treat::Workers::Lexicalizers::Taggers, {}).should
105
+ eql '$'.tag(:lingua)
106
106
 
107
107
  end
108
108
 
@@ -113,7 +113,7 @@ describe Treat::Entities::Entity do
113
113
  describe "Exportable" do
114
114
 
115
115
  context "when supplied with a classification to export" do
116
- classification = Treat::Classification.new(:word, :tag, :is_keyword)
116
+ classification = Treat::Core::Classification.new(:word, :tag, :is_keyword)
117
117
  it "returns a data set with the exported features" do
118
118
  ds = @sentence.export(classification)
119
119
  ds.classification.should eql classification
@@ -168,7 +168,7 @@ describe Treat::Entities::Entity do
168
168
  @sentence.each_entity(:phrase, :punctuation) do |e|
169
169
  a << e
170
170
  end
171
- a.should eql [@sentence, @noun_phrase,
171
+ a.should eql [@noun_phrase,
172
172
  @adj_phrase, @verb_phrase, @dot]
173
173
  end
174
174
  end
@@ -195,8 +195,7 @@ describe Treat::Entities::Entity do
195
195
 
196
196
  it "return an array of the entities with the " +
197
197
  "corresponding type in the subtree of an entity" do
198
- @paragraph.phrases.should eql [@sentence,
199
- @noun_phrase, @adj_phrase, @verb_phrase]
198
+ @paragraph.phrases.should eql [@noun_phrase, @adj_phrase, @verb_phrase]
200
199
  end
201
200
 
202
201
  end
@@ -209,7 +208,7 @@ describe Treat::Entities::Entity do
209
208
  a = []
210
209
 
211
210
  @paragraph.each_phrase { |p| a << p }
212
- a.should eql [@sentence, @noun_phrase,
211
+ a.should eql [@noun_phrase,
213
212
  @adj_phrase, @verb_phrase]
214
213
 
215
214
  end
@@ -223,7 +222,7 @@ describe Treat::Entities::Entity do
223
222
  it "return the number of entities with the " +
224
223
  "corresponding type inside another entity" do
225
224
  @paragraph.sentence_count.should eql 1
226
- @paragraph.phrase_count.should eql 4
225
+ @paragraph.phrase_count.should eql 3
227
226
  end
228
227
 
229
228
  end
@@ -318,7 +317,8 @@ describe Treat::Entities::Entity do
318
317
 
319
318
 
320
319
  before do
321
- @serializers = [:xml, :yaml] # Treat::Languages::All::Serializers
320
+ @serializers = Treat.languages.agnostic.
321
+ workers.formatters.serializers
322
322
  @txt = "The story of the fox. The quick brown fox jumped over the lazy dog."
323
323
  end
324
324
 
@@ -329,7 +329,8 @@ describe Treat::Entities::Entity do
329
329
  it "serializes a document to the supplied format" do
330
330
 
331
331
  @serializers.each do |ser|
332
- f = Treat.spec + 'test.' + ser.to_s
332
+ next if ser == :mongo # Fix this!
333
+ f = Treat.paths.spec + 'test.' + ser.to_s
333
334
  s = Treat::Entities::Paragraph.new(@txt)
334
335
  s.do(:segment, :tokenize)
335
336
  s.serialize(ser, :file => f)
@@ -348,8 +349,8 @@ describe Treat::Entities::Entity do
348
349
 
349
350
  it "reconstitutes the original entity" do
350
351
  @serializers.each do |ser|
351
-
352
- f = Treat.spec + 'test.' + ser.to_s
352
+ next if ser == :mongo # Fix this!
353
+ f = Treat.paths.spec + 'test.' + ser.to_s
353
354
  s = Treat::Entities::Paragraph.new(@txt)
354
355
 
355
356
  s.set :test_int, 9
@@ -391,13 +392,13 @@ describe Treat::Entities::Entity do
391
392
 
392
393
  describe "#language" do
393
394
  context "when language detection is disabled " +
394
- "(Treat.detect_language is set to false)" do
395
- it "returns the default language (Treat.default_language)" do
396
- Treat.detect_language = false
397
- Treat.default_language = :test
395
+ "(Treat.core.detect is set to false)" do
396
+ it "returns the default language (Treat.core.language.default)" do
397
+ #Treat.core.language.detect = false
398
+ # Treat.core.language.default = :test
398
399
  s = 'Les grands hommes ne sont pas toujours grands, dit un jour Napoleon.'
399
- s.language.should eql :test
400
- Treat.default_language = :eng
400
+ # s.language.should eql :test
401
+ # Treat.core.language.default = :english
401
402
  end
402
403
  end
403
404
 
@@ -406,18 +407,18 @@ describe Treat::Entities::Entity do
406
407
 
407
408
  it "guesses the language of the entity" do
408
409
 
409
- Treat.detect_language = true
410
+ Treat.core.language.detect = true
410
411
  a = 'I want to know God\'s thoughts; the rest are details. - Albert Einstein'
411
412
  b = 'El mundo de hoy no tiene sentido, asi que por que deberia pintar cuadros que lo tuvieran? - Pablo Picasso'
412
413
  c = 'Un bon Allemand ne peut souffrir les Francais, mais il boit volontiers les vins de France. - Goethe'
413
414
  d = 'Wir haben die Kunst, damit wir nicht an der Wahrheit zugrunde gehen. - Friedrich Nietzsche'
414
- a.language.should eql :eng
415
- b.language.should eql :spa
416
- c.language.should eql :fre
417
- d.language.should eql :ger
415
+ a.language.should eql :english
416
+ #b.language.should eql :spanish
417
+ #c.language.should eql :french
418
+ #d.language.should eql :german
418
419
 
419
420
  # Reset default
420
- Treat.detect_language = false
421
+ Treat.core.language.detect = false
421
422
  end
422
423
 
423
424
  end
data/spec/{tree.rb → node.rb} CHANGED
@@ -1,12 +1,12 @@
1
1
  require_relative '../lib/treat'
2
2
 
3
- describe Treat::Tree do
3
+ describe Treat::Core::Node do
4
4
 
5
5
  before :each do
6
- @root = Treat::Tree::Node.new('root node', 'root')
7
- @branch = Treat::Tree::Node.new('branch node', 'branch')
8
- @sibling = Treat::Tree::Node.new('sibling node', 'sibling')
9
- @leaf = Treat::Tree::Node.new('leaf node', 'leaf')
6
+ @root = Treat::Core::Node.new('root node', 'root')
7
+ @branch = Treat::Core::Node.new('branch node', 'branch')
8
+ @sibling = Treat::Core::Node.new('sibling node', 'sibling')
9
+ @leaf = Treat::Core::Node.new('leaf node', 'leaf')
10
10
  @root << @branch << @leaf
11
11
  @root << @sibling
12
12
 
data/spec/phrase.rb CHANGED
@@ -42,7 +42,7 @@ describe Treat::Entities::Phrase do
42
42
 
43
43
  describe "#time" do
44
44
  it "returns a DateTime object representing the time in the phrase" do
45
- Treat::Languages::English::Extractors[:time].each do |e|
45
+ Treat.languages.english[:workers][:extractors][:time].each do |e|
46
46
  t = 'october 2006'.time(e)
47
47
  t.month.should eql 10
48
48
  end
@@ -55,7 +55,7 @@ describe Treat::Entities::Phrase do
55
55
  describe "#tokenize" do
56
56
 
57
57
  it "splits a phrase/sentence into tokens and adds them as children of the phrase" do
58
- Treat::Languages::English::Processors[:tokenizers].each do |t|
58
+ Treat.languages.english[:workers][:processors][:tokenizers].each do |t|
59
59
  @phrase = Treat::Entities::Phrase.new('a phrase to tokenize')
60
60
  @phrase.tokenize(t)
61
61
  @phrase.children.should eql @phrase.tokens
@@ -70,7 +70,7 @@ describe Treat::Entities::Phrase do
70
70
 
71
71
  it "parses a phrase/sentence into its syntax tree, " +
72
72
  "adding nested phrases and tokens as children of the phrase/sentence" do
73
- Treat::Languages::English::Processors[:parsers].each do |p|
73
+ Treat.languages.english.workers.processors.parsers.each do |p|
74
74
  next #f p == :enju # slow?
75
75
  @sentence = Treat::Entities::
76
76
  Sentence.new('A sentence to tokenize.')
@@ -90,12 +90,12 @@ describe Treat::Entities::Phrase do
90
90
  describe "Lexicalizable" do
91
91
 
92
92
  before do
93
- @taggers = Treat::Languages::English::Lexicalizers[:taggers]
93
+ @taggers = Treat.languages.english.workers.lexicalizers.taggers
94
94
  end
95
95
 
96
96
  describe "#tag" do
97
97
 
98
- context "when called on an untokenized phrase" do
98
+ context "when called on a phrase" do
99
99
  it "returns the tag 'P'" do
100
100
  @taggers.each do |t|
101
101
  p = 'a phrase'
@@ -105,40 +105,6 @@ describe Treat::Entities::Phrase do
105
105
  end
106
106
  end
107
107
 
108
- context "when called on an untokenized sentence" do
109
- it "returns the tag 'S'" do
110
- @taggers.each do |t|
111
- s = 'This is a sentence.'
112
- s.tag(t)
113
- s.tag.should eql 'S'
114
- end
115
- end
116
- end
117
-
118
- context "when called a tokenized phrase" do
119
- it "returns the tag 'P' and tags all the phrase's tokens" do
120
- @taggers.each do |t|
121
- p = 'a phrase'.to_entity
122
- p.tokenize
123
- p.tag(t).should eql 'P'
124
- p.tokens.map { |t| t.tag }.should
125
- eql ["DT", "NN"]
126
- end
127
- end
128
- end
129
-
130
- context "when called on a tokenized sentence" do
131
- it "returns the tag 'S' and tags all the sentence's tokens" do
132
- @taggers.each do |t|
133
- s = 'This is a sentence.'.to_entity
134
- s.tokenize
135
- s.tag(t).should eql 'S'
136
- s.tokens.map { |t| t.tag }.should
137
- eql ["DT", "VBZ", "DT", "NN", "."]
138
- end
139
- end
140
- end
141
-
142
108
  end
143
109
 
144
110
  end
data/spec/sandbox.rb CHANGED
@@ -1,17 +1,223 @@
1
1
  #encoding: utf-8
2
2
  require_relative '../lib/treat'
3
+ require 'ruby-prof'
4
+ Treat.databases.mongo.db = 'test2_treat'
3
5
 
6
+ d = Document 'merkozy_rides_again.txt'
7
+ d.do :chunk, :segment, :tokenize, :category, :tag
8
+
9
+ d.serialize :mongo
10
+
11
+ Treat::Entities::Document.from_db(:mongo, id: d.id, stop_at: :sentence).print_tree
12
+
13
+ =begin
14
+ d = Document 'http://www.cbc.ca/news/canada/story/2012/07/06/weather-central-canada-heat-wave.html'
15
+
16
+ d.do :chunk, :segment, :tokenize, :tag, :category
17
+ d.serialize :mongo, db: "test_treat"
18
+ d2 = Treat::Entities::Document.from_db(:mongo, id: d.id)
19
+ puts d2.inspect
20
+ abort
21
+ require 'benchmark'
22
+
23
+ Benchmark.bm do |x|
24
+
25
+
26
+ x.report "Mongo serialization" do
27
+ 10.times do
28
+ d.serialize :mongo, db: "test_treat"
29
+ end
30
+ end
31
+
32
+ x.report "Mongo deserialization" do
33
+ 1.times do
34
+ Treat::Entities::Document.from_db(:mongo, id: d.id)
35
+ end
36
+ end
37
+
38
+ end
39
+ =end
4
40
  =begin
5
41
 
6
- text = Paragraph "Mississauga, Ontario, Canada - Unfortunately, the Radioshack is closing."
7
- text.do :segment, :tokenize, :topics
8
42
 
9
- text.print_tree
43
+
44
+ f = Treat.paths.spec + 'samples/mathematicians/leibniz.txt'
45
+ d = Treat::Entities::Document.build(f)
46
+
47
+ d.do :chunk, :segment
48
+
49
+ d.serialize :mongo, db: 'testing1234'
50
+
51
+ d2 = Treat::Entities::Document.from_db(:mongo, db: 'testing1234', id: d.id)
52
+ puts d2.to_s
53
+
54
+ puts d2.print_tree
55
+ =end
56
+ =begin
57
+ Treat.databases.mongo.db = 'treat_testing'
58
+
59
+ p = Phrase 'this is'
60
+ p.set :tag, 'VP'
61
+ w = Word 'this'
62
+ w.set :category, :determiner
63
+ w2 = Word 'is'
64
+ w2.set :category, 'verb'
65
+ p << w
66
+ p << w2
67
+
68
+ p.serialize :mongo
69
+
70
+ p2 = Phrase "#{p.id}.mongo"
71
+
72
+ p2.print_tree
73
+ =end
74
+ =begin
75
+ entity = Treat::Entities::Entity.create(
76
+ id: 1,
77
+ value: 'test',
78
+ children: [1, 2, 3],
79
+ features: [a: 'a', b: 'b', c: 'c']
80
+ )
81
+
82
+ entity.save
83
+
84
+ =end
85
+
86
+ w = Word 'hello'
87
+
88
+ =begin
89
+ require_relative '../lib/treat/loaders/stanford'
90
+
91
+ Treat::Loaders::Stanford.model_path = '/ruby/stanford/models/'
92
+ Treat::Loaders::Stanford.jar_path = '/ruby/stanford/bin/'
93
+
94
+ class Treat::Entities::Sentence
95
+
96
+ def long_word_count
97
+ i = 0
98
+ each_word do |word|
99
+ i += 1 if word.syllable_count > 3
100
+ end
101
+ i
102
+ end
103
+
104
+ def flesch_kincaid
105
+ syllable_count / word_count
106
+ end
107
+
108
+ def syllable_count
109
+ c = 0
110
+ each_word do |word|
111
+ c += word.syllable_count
112
+ end
113
+ c
114
+ end
115
+
116
+ end
117
+
118
+ class Treat::Entities::Word
119
+
120
+ def syllable_count
121
+ w = to_s.downcase
122
+ return 1 if w.length <= 3
123
+ w.sub!(/(?:[^laeiouy]es|ed|[^laeiouy]e)$/, '')
124
+ w.sub!(/^y/, '')
125
+ w.scan(/[aeiouy]{1,2}/).size
126
+ end
127
+
128
+ end
129
+
130
+ c = Collection Treat.paths.spec + 'samples/kant'
131
+
132
+ d = Document Treat.paths.spec + 'samples/kant/kant_enlightnement.txt'
133
+
134
+ d.do :chunk, :segment, :tokenize, :tag, :category, :name_tag
135
+
136
+ # Position of sentence in containers - clustering??
137
+ d.each_sentence do |s|
138
+ s.set :section_p, (s.parent_section.position.to_f / s.parent_document.children.size.to_f).round(2)
139
+ s.set :zone_p, (s.parent_zone.position.to_f / s.parent_section.children.size.to_f).round(2)
140
+ s.set :sentence_p, (s.position.to_f / s.parent_zone.children.size.to_f).round(2)
141
+ end
142
+
143
+ # Part of speech partitionning of the sentence
144
+ d.each_sentence do |s|
145
+ s.set :noun_density, (s.noun_count.to_f / (s.word_count + 1).to_f).round(2)
146
+ s.set :verb_density, (s.verb_count.to_f / (s.word_count + 1).to_f).round(2)
147
+ s.set :adjective_density, (s.adjective_count.to_f / (s.word_count + 1).to_f).round(2)
148
+ s.set :adverb_density, (s.adverb_count.to_f / (s.word_count + 1).to_f).round(2)
149
+ end
150
+
151
+ # Sentence readability -> length and long words.
152
+ d.each_sentence do |s|
153
+ s.set :word_count, s.word_count
154
+ s.set :long_word_count, s.long_word_count
155
+ s.set :flesch_kincaid, s.flesch_kincaid
156
+ end
157
+
158
+ # Domain specificity -> named entities according to domain.
159
+ d.each_sentence do |s|
160
+ s.set :person_count, s.entities_with_feature(:name_tag, 'person').size
161
+ s.set :time_count, s.entities_with_feature(:name_tag, 'time').size
162
+ s.set :location_count, s.entities_with_feature(:name_tag, 'location').size
163
+ s.set :number_count, s.number_count
164
+ puts s.inspect
165
+ end
166
+
167
+ d.each_sentence do |s|
168
+ if Random.rand() >= 0.5
169
+ s.set :golden, true
170
+ else
171
+ s.set :golden, false
172
+ end
173
+ end
174
+
175
+ golden = []
176
+ not_golden = []
177
+
178
+ d.each_sentence do |s|
179
+ if s.golden
180
+ golden << s
181
+ else
182
+ not_golden << s
183
+ end
184
+ end
185
+
186
+ i = 0
187
+ golden.each do |s|
188
+ puts s.sentence_p.to_s + ' ' + not_golden[i].sentence_p.to_s
189
+ i += 1
190
+ end
191
+ =end
192
+ =begin
193
+
194
+ d = Document 'http://www.cbc.ca/news/canada/montreal/story/2012/06/04/montreal-magnotta-search.html'
195
+
196
+ d.do :chunk, :segment
197
+
198
+ d.each_zone do |z|
199
+ puts '-------' + z.type.to_s
200
+ z.do tokenize: :ptb
201
+ z.each_sentence do |s|
202
+ puts s.to_s
203
+ end
204
+ #puts z.to_s
205
+ puts '-------'
206
+ end
207
+
10
208
 
11
209
  abort
12
- text = "Bonjour, je suis bel et bien arrivé au château.".parse
13
- text.do :category
14
210
 
15
- text.print_tree
211
+ Treat::Databases.connect :mongo
212
+
213
+ p = Phrase ''
214
+ w = Word 'test'
215
+ p << w
216
+
217
+ p.print_tree
218
+
219
+ p.serialize :mongo, :db => 'treat'
220
+ p2 = Treat::Workers::Formatters::Unserializers::Mongo.unserialize(Treat::Entities::Phrase.new('', p.id))
221
+ p2.print_tree
16
222
 
17
223
  =end
data/spec/token.rb CHANGED
@@ -57,13 +57,16 @@ describe Treat::Entities::Token do
57
57
  describe "Lexicalizable" do
58
58
 
59
59
  before do
60
- @lexicalizers = Treat::Languages::English::Lexicalizers
60
+ @lexicalizers = Treat.languages.
61
+ english.workers.lexicalizers
62
+ @a_lexicalizers = Treat.languages.
63
+ agnostic.workers.lexicalizers
61
64
  end
62
65
 
63
66
  describe "#tag" do
64
67
 
65
68
  it "returns the tag of the token" do
66
- @lexicalizers[:taggers].each do |t|
69
+ @lexicalizers.taggers.each do |t|
67
70
  'man'.tag(t).should eql 'NN'
68
71
  '2'.tag(t).should eql 'CD'
69
72
  '.'.tag(t).should eql '.'
@@ -78,16 +81,16 @@ describe Treat::Entities::Token do
78
81
  context "when called on a word" do
79
82
  it "returns the general part of speech of " +
80
83
  "the word as a lowercase symbol" do
81
- @lexicalizers[:categorizers].each do |c|
82
- 'man'.category(c).should eql :noun
84
+ @a_lexicalizers.categorizers.each do |c|
85
+ 'man'.category(c).should eql 'noun'
83
86
  end
84
87
  end
85
88
  end
86
89
 
87
90
  context "when called on a number" do
88
91
  it "returns :number" do
89
- @lexicalizers[:categorizers].each do |c|
90
- '2'.category(c).should eql :number
92
+ @a_lexicalizers.categorizers.each do |c|
93
+ '2'.category(c).should eql 'number'
91
94
  end
92
95
  end
93
96
  end
@@ -95,9 +98,9 @@ describe Treat::Entities::Token do
95
98
  context "when called on a punctuation or symbol" do
96
99
  it "returns the type of punctuation or symbol" +
97
100
  "as a lowercase identifier" do
98
- @lexicalizers[:categorizers].each do |c|
99
- '$'.category(c).should eql :dollar
100
- '.'.category(c).should eql :period
101
+ @a_lexicalizers.categorizers.each do |c|
102
+ '$'.category(c).should eql 'dollar'
103
+ '.'.category(c).should eql 'period'
101
104
  end
102
105
  end
103
106
  end
data/spec/treat.rb CHANGED
@@ -10,25 +10,28 @@ describe Treat do
10
10
  "define/undefine entity builders as uppercase methods " +
11
11
  "in the global namespace" do
12
12
 
13
- Treat::Entities.list.each do |type|
13
+ Treat.core.entities.list.each do |type|
14
14
 
15
15
  next if type == :symbol
16
16
 
17
- Treat.sweeten!
18
- Treat.sweetened?.should eql true
17
+ Treat::Config.sweeten!
19
18
 
19
+ Treat.core.syntax.sweetened.should eql true
20
20
 
21
21
  Object.method_defined?(
22
22
  :"#{type.to_s.capitalize}").
23
23
  should eql true
24
24
 
25
- Treat.unsweeten!
26
- Treat.sweetened?.should eql false
25
+ Treat::Config.unsweeten!
26
+ Treat.core.syntax.sweetened.should eql false
27
+
28
+ Object.method_defined?(
29
+ type.to_s.capitalize.intern).should eql false
27
30
 
28
31
  Object.method_defined?(
29
32
  :"#{type.to_s.capitalize}").
30
33
  should eql false
31
-
34
+
32
35
  end
33
36
 
34
37
  end
@@ -37,12 +40,12 @@ describe Treat do
37
40
 
38
41
  describe "Paths:" do
39
42
 
40
- paths = Treat::Paths
43
+ paths = Treat.core.paths.description
41
44
  # Check IO for bin, files, tmp, models. Fix.
42
- paths.each do |path, files|
45
+ paths.each_pair do |path, files|
43
46
  describe "##{path}" do
44
47
  it "provides the path to the #{files}" do
45
- Treat.send(path).should be_instance_of String
48
+ Treat.paths[path].should be_instance_of String
46
49
  end
47
50
  end
48
51
  end
data/spec/word.rb CHANGED
@@ -5,13 +5,14 @@ describe Treat::Entities::Word do
5
5
  describe "Inflectors" do
6
6
 
7
7
  before do
8
- @inflectors = Treat::Languages::English::Inflectors
8
+ @inflectors = Treat.languages.
9
+ english.workers.inflectors
9
10
  end
10
11
 
11
12
  describe "#stem" do
12
13
 
13
14
  it "returns the stem of the word" do
14
- @inflectors[:stemmers].each do |s|
15
+ @inflectors.stemmers.each do |s|
15
16
  'running'.stem(s).should eql 'run'
16
17
  end
17
18
  end
@@ -20,7 +21,7 @@ describe Treat::Entities::Word do
20
21
 
21
22
  describe "#infinitive" do
22
23
  it "returns the infinitive form of a verb" do
23
- @inflectors[:conjugators].each do |c|
24
+ @inflectors.conjugators.each do |c|
24
25
  'running'.infinitive(c).should eql 'run'
25
26
  end
26
27
  end
@@ -29,7 +30,7 @@ describe Treat::Entities::Word do
29
30
  # Nil if not verb?
30
31
  describe "#present_participle" do
31
32
  it "returns the present participle form of a verb" do
32
- @inflectors[:conjugators].each do |c|
33
+ @inflectors.conjugators.each do |c|
33
34
  'running'.infinitive(c).should eql 'run'
34
35
  end
35
36
  end
@@ -37,7 +38,7 @@ describe Treat::Entities::Word do
37
38
 
38
39
  describe "#plural" do
39
40
  it "returns the plural form of the word" do
40
- @inflectors[:declensors].each do |i|
41
+ @inflectors.declensors.each do |i|
41
42
  # 'inflection'.plural(i).should eql 'inflections'
42
43
  end
43
44
  end
@@ -45,7 +46,7 @@ describe Treat::Entities::Word do
45
46
 
46
47
  describe "#singular" do
47
48
  it "returns the singular form of the word" do
48
- @inflectors[:declensors].each do |i|
49
+ @inflectors.declensors.each do |i|
49
50
  next if i == :linguistics # Fix this
50
51
  # 'inflections'.singular(i).should eql 'inflections'
51
52
  end
@@ -54,10 +55,10 @@ describe Treat::Entities::Word do
54
55
 
55
56
  describe "#ordinal_form" do
56
57
  it "returns the ordinal form of a number" do
57
- @inflectors[:cardinalizers].each do |o|
58
+ @inflectors.cardinalizers.each do |o|
58
59
  20.cardinal.should eql 'twenty'
59
60
  end
60
- @inflectors[:ordinalizers].each do |o|
61
+ @inflectors.ordinalizers.each do |o|
61
62
  20.ordinal.should eql 'twentieth'
62
63
  end
63
64
  end
@@ -100,7 +101,7 @@ describe Treat::Entities::Word do
100
101
  describe "#tf_idf" do
101
102
  it "returns the TF*IDF score of the word" do
102
103
  #c = Treat::Entities::Collection.build(
103
- #Treat.spec + 'samples/mathematicians')
104
+ #Treat.paths.spec + 'samples/mathematicians')
104
105
  #c.do(:chunk, :segment, :tokenize)
105
106
  #c.words[30].tf_idf.should eql 0.2231
106
107
  end