treat 1.2.0 → 2.0.0rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (217) hide show
  1. data/LICENSE +2 -2
  2. data/README.md +12 -21
  3. data/lib/treat/autoload.rb +44 -0
  4. data/lib/treat/config/config.rb +38 -0
  5. data/lib/treat/config/configurable.rb +51 -0
  6. data/lib/treat/config/data/config.rb +50 -0
  7. data/lib/treat/config/data/core.rb +52 -0
  8. data/lib/treat/config/data/databases.rb +10 -0
  9. data/lib/treat/config/data/entities.rb +15 -0
  10. data/lib/treat/config/data/languages/agnostic.rb +31 -0
  11. data/lib/treat/config/{languages → data/languages}/arabic.rb +0 -0
  12. data/lib/treat/config/{languages → data/languages}/chinese.rb +0 -0
  13. data/lib/treat/config/{languages → data/languages}/dutch.rb +1 -1
  14. data/lib/treat/config/data/languages/english.rb +95 -0
  15. data/lib/treat/config/data/languages/french.rb +148 -0
  16. data/lib/treat/config/data/languages/german.rb +135 -0
  17. data/lib/treat/config/{languages → data/languages}/greek.rb +1 -1
  18. data/lib/treat/config/data/languages/italian.rb +162 -0
  19. data/lib/treat/config/data/languages/polish.rb +11 -0
  20. data/lib/treat/config/{languages → data/languages}/portuguese.rb +1 -1
  21. data/lib/treat/config/{languages → data/languages}/russian.rb +1 -1
  22. data/lib/treat/config/data/languages/spanish.rb +291 -0
  23. data/lib/treat/config/data/languages/swedish.rb +289 -0
  24. data/lib/treat/config/data/libraries.rb +12 -0
  25. data/lib/treat/config/data/linguistics.rb +44 -0
  26. data/lib/treat/config/data/tags.rb +328 -0
  27. data/lib/treat/config/{workers → data/workers}/extractors.rb +2 -10
  28. data/lib/treat/config/{workers → data/workers}/formatters.rb +0 -0
  29. data/lib/treat/config/{workers → data/workers}/inflectors.rb +0 -0
  30. data/lib/treat/config/{workers → data/workers}/learners.rb +0 -0
  31. data/lib/treat/config/{workers → data/workers}/lexicalizers.rb +4 -3
  32. data/lib/treat/config/{workers → data/workers}/processors.rb +3 -3
  33. data/lib/treat/config/{workers → data/workers}/retrievers.rb +0 -0
  34. data/lib/treat/config/importable.rb +31 -0
  35. data/lib/treat/config/paths.rb +23 -0
  36. data/lib/treat/config/tags.rb +37 -0
  37. data/lib/treat/core/dsl.rb +55 -0
  38. data/lib/treat/{installer.rb → core/installer.rb} +10 -12
  39. data/lib/treat/core/server.rb +40 -0
  40. data/lib/treat/entities/entities.rb +101 -0
  41. data/lib/treat/entities/{abilities/doable.rb → entity/applicable.rb} +5 -3
  42. data/lib/treat/entities/{abilities → entity}/buildable.rb +118 -63
  43. data/lib/treat/entities/{abilities → entity}/checkable.rb +2 -2
  44. data/lib/treat/entities/{abilities → entity}/comparable.rb +6 -6
  45. data/lib/treat/entities/{abilities → entity}/countable.rb +2 -1
  46. data/lib/treat/entities/entity/debuggable.rb +86 -0
  47. data/lib/treat/entities/{abilities → entity}/delegatable.rb +16 -26
  48. data/lib/treat/entities/{abilities → entity}/exportable.rb +2 -2
  49. data/lib/treat/entities/{abilities → entity}/iterable.rb +4 -16
  50. data/lib/treat/entities/{abilities → entity}/magical.rb +22 -17
  51. data/lib/treat/entities/entity/registrable.rb +36 -0
  52. data/lib/treat/entities/{abilities → entity}/stringable.rb +18 -15
  53. data/lib/treat/entities/entity.rb +86 -77
  54. data/lib/treat/exception.rb +3 -0
  55. data/lib/treat/helpers/hash.rb +29 -0
  56. data/lib/treat/helpers/help.rb +35 -0
  57. data/lib/treat/helpers/object.rb +55 -0
  58. data/lib/treat/helpers/string.rb +124 -0
  59. data/lib/treat/{core → learning}/data_set.rb +11 -11
  60. data/lib/treat/{core → learning}/export.rb +3 -3
  61. data/lib/treat/{core → learning}/problem.rb +26 -16
  62. data/lib/treat/{core → learning}/question.rb +5 -9
  63. data/lib/treat/loaders/linguistics.rb +8 -9
  64. data/lib/treat/loaders/stanford.rb +5 -11
  65. data/lib/treat/modules.rb +33 -0
  66. data/lib/treat/proxies/array.rb +27 -0
  67. data/lib/treat/proxies/language.rb +47 -0
  68. data/lib/treat/proxies/number.rb +18 -0
  69. data/lib/treat/proxies/proxy.rb +25 -0
  70. data/lib/treat/proxies/string.rb +18 -0
  71. data/lib/treat/version.rb +10 -1
  72. data/lib/treat/{workers.rb → workers/categorizable.rb} +18 -19
  73. data/lib/treat/workers/extractors/keywords/tf_idf.rb +11 -11
  74. data/lib/treat/workers/extractors/language/what_language.rb +8 -6
  75. data/lib/treat/workers/extractors/name_tag/stanford.rb +10 -4
  76. data/lib/treat/workers/extractors/similarity/levenshtein.rb +36 -0
  77. data/lib/treat/workers/extractors/similarity/tf_idf.rb +27 -0
  78. data/lib/treat/workers/extractors/tf_idf/native.rb +4 -4
  79. data/lib/treat/workers/extractors/time/chronic.rb +2 -4
  80. data/lib/treat/workers/extractors/time/nickel.rb +19 -20
  81. data/lib/treat/workers/extractors/time/ruby.rb +2 -1
  82. data/lib/treat/workers/extractors/topic_words/lda.rb +12 -12
  83. data/lib/treat/workers/extractors/topics/reuters.rb +9 -13
  84. data/lib/treat/workers/formatters/readers/autoselect.rb +1 -1
  85. data/lib/treat/workers/formatters/readers/image.rb +19 -9
  86. data/lib/treat/workers/formatters/readers/odt.rb +2 -1
  87. data/lib/treat/workers/formatters/readers/pdf.rb +20 -3
  88. data/lib/treat/workers/formatters/readers/xml.rb +0 -1
  89. data/lib/treat/workers/formatters/serializers/mongo.rb +10 -20
  90. data/lib/treat/workers/formatters/serializers/xml.rb +17 -26
  91. data/lib/treat/workers/formatters/serializers/yaml.rb +5 -4
  92. data/lib/treat/workers/formatters/unserializers/mongo.rb +4 -4
  93. data/lib/treat/workers/formatters/unserializers/xml.rb +3 -4
  94. data/lib/treat/workers/formatters/unserializers/yaml.rb +3 -4
  95. data/lib/treat/workers/formatters/visualizers/dot.rb +1 -0
  96. data/lib/treat/workers/formatters/visualizers/standoff.rb +2 -3
  97. data/lib/treat/workers/formatters/visualizers/tree.rb +2 -3
  98. data/lib/treat/workers/{group.rb → groupable.rb} +9 -9
  99. data/lib/treat/workers/inflectors/cardinalizers/linguistics.rb +1 -3
  100. data/lib/treat/workers/inflectors/conjugators/linguistics.rb +5 -7
  101. data/lib/treat/workers/inflectors/declensors/english.rb +13 -20
  102. data/lib/treat/workers/inflectors/declensors/linguistics.rb +29 -28
  103. data/lib/treat/workers/inflectors/ordinalizers/linguistics.rb +0 -2
  104. data/lib/treat/workers/inflectors/stemmers/porter.rb +8 -10
  105. data/lib/treat/workers/inflectors/stemmers/porter_c.rb +7 -7
  106. data/lib/treat/workers/inflectors/stemmers/uea.rb +3 -8
  107. data/lib/treat/workers/learners/classifiers/id3.rb +17 -14
  108. data/lib/treat/workers/learners/classifiers/linear.rb +15 -27
  109. data/lib/treat/workers/learners/classifiers/mlp.rb +32 -19
  110. data/lib/treat/workers/learners/classifiers/svm.rb +28 -21
  111. data/lib/treat/workers/lexicalizers/categorizers/from_tag.rb +19 -3
  112. data/lib/treat/workers/lexicalizers/sensers/wordnet.rb +15 -7
  113. data/lib/treat/workers/lexicalizers/taggers/brill/patch.rb +4 -1
  114. data/lib/treat/workers/lexicalizers/taggers/brill.rb +8 -19
  115. data/lib/treat/workers/lexicalizers/taggers/lingua.rb +4 -15
  116. data/lib/treat/workers/lexicalizers/taggers/stanford.rb +22 -13
  117. data/lib/treat/workers/processors/chunkers/autoselect.rb +2 -3
  118. data/lib/treat/workers/processors/chunkers/html.rb +1 -6
  119. data/lib/treat/workers/processors/parsers/enju.rb +2 -4
  120. data/lib/treat/workers/processors/parsers/stanford.rb +13 -7
  121. data/lib/treat/workers/processors/segmenters/punkt.rb +25 -11
  122. data/lib/treat/workers/processors/segmenters/scalpel.rb +20 -0
  123. data/lib/treat/workers/processors/segmenters/srx.rb +42 -0
  124. data/lib/treat/workers/processors/segmenters/stanford.rb +5 -5
  125. data/lib/treat/workers/processors/segmenters/tactful.rb +21 -11
  126. data/lib/treat/workers/processors/tokenizers/ptb.rb +40 -30
  127. data/lib/treat/workers/processors/tokenizers/punkt.rb +14 -19
  128. data/lib/treat/workers/processors/tokenizers/stanford.rb +38 -22
  129. data/lib/treat/workers/retrievers/indexers/ferret.rb +6 -3
  130. data/lib/treat/workers/retrievers/searchers/ferret.rb +2 -2
  131. data/lib/treat/workers/workers.rb +6 -0
  132. data/lib/treat.rb +18 -32
  133. data/models/MANIFEST +1 -0
  134. data/spec/core/data_set.rb +174 -0
  135. data/spec/core/export.rb +52 -0
  136. data/spec/core/problem.rb +144 -0
  137. data/spec/core/question.rb +52 -0
  138. data/spec/{collection.rb → entities/collection.rb} +20 -35
  139. data/spec/{document.rb → entities/document.rb} +3 -54
  140. data/spec/{entity.rb → entities/entity.rb} +10 -9
  141. data/spec/entities/phrase.rb +33 -0
  142. data/spec/{token.rb → entities/token.rb} +0 -57
  143. data/spec/entities/word.rb +3 -0
  144. data/spec/{zone.rb → entities/zone.rb} +0 -26
  145. data/spec/helper.rb +116 -32
  146. data/spec/sandbox.rb +258 -25
  147. data/spec/treat.rb +26 -34
  148. data/spec/workers/agnostic.rb +137 -0
  149. data/spec/workers/english.rb +194 -0
  150. data/spec/workers/examples/english/economist/hungarys_troubles.txt +46 -0
  151. data/spec/workers/examples/english/economist/saving_the_euro.odt +0 -0
  152. data/spec/{samples → workers/examples/english}/mathematicians/archimedes.abw +0 -0
  153. data/spec/{samples → workers/examples/english}/mathematicians/euler.html +0 -0
  154. data/spec/{samples → workers/examples/english}/mathematicians/gauss.pdf +0 -0
  155. data/spec/{samples → workers/examples/english}/mathematicians/leibniz.txt +0 -0
  156. data/spec/{samples → workers/examples/english}/mathematicians/newton.doc +0 -0
  157. data/spec/workers/examples/english/phrase.xml +5 -0
  158. data/spec/workers/examples/english/test.txt +1 -0
  159. data/spec/workers/language.rb +280 -0
  160. data/spec/workers.rb +28 -0
  161. metadata +122 -105
  162. data/lib/treat/config/core/acronyms.rb +0 -5
  163. data/lib/treat/config/core/encodings.rb +0 -8
  164. data/lib/treat/config/core/entities.rb +0 -2
  165. data/lib/treat/config/core/language.rb +0 -3
  166. data/lib/treat/config/core/paths.rb +0 -8
  167. data/lib/treat/config/core/syntax.rb +0 -1
  168. data/lib/treat/config/core/verbosity.rb +0 -1
  169. data/lib/treat/config/databases/default.rb +0 -1
  170. data/lib/treat/config/databases/mongo.rb +0 -1
  171. data/lib/treat/config/languages/agnostic.rb +0 -34
  172. data/lib/treat/config/languages/english.rb +0 -60
  173. data/lib/treat/config/languages/french.rb +0 -18
  174. data/lib/treat/config/languages/german.rb +0 -18
  175. data/lib/treat/config/languages/italian.rb +0 -12
  176. data/lib/treat/config/languages/polish.rb +0 -12
  177. data/lib/treat/config/languages/spanish.rb +0 -12
  178. data/lib/treat/config/languages/swedish.rb +0 -12
  179. data/lib/treat/config/libraries/punkt.rb +0 -1
  180. data/lib/treat/config/libraries/reuters.rb +0 -1
  181. data/lib/treat/config/libraries/stanford.rb +0 -1
  182. data/lib/treat/config/linguistics/categories.rb +0 -4
  183. data/lib/treat/config/linguistics/punctuation.rb +0 -33
  184. data/lib/treat/config/tags/aligned.rb +0 -221
  185. data/lib/treat/config/tags/enju.rb +0 -71
  186. data/lib/treat/config/tags/paris7.rb +0 -17
  187. data/lib/treat/config/tags/ptb.rb +0 -15
  188. data/lib/treat/config/workers/list.rb +0 -1
  189. data/lib/treat/config.rb +0 -135
  190. data/lib/treat/core.rb +0 -5
  191. data/lib/treat/entities/abilities/copyable.rb +0 -47
  192. data/lib/treat/entities/abilities/debuggable.rb +0 -83
  193. data/lib/treat/entities/abilities/registrable.rb +0 -46
  194. data/lib/treat/entities/collection.rb +0 -40
  195. data/lib/treat/entities/document.rb +0 -10
  196. data/lib/treat/entities/group.rb +0 -18
  197. data/lib/treat/entities/section.rb +0 -13
  198. data/lib/treat/entities/token.rb +0 -47
  199. data/lib/treat/entities/zone.rb +0 -12
  200. data/lib/treat/entities.rb +0 -6
  201. data/lib/treat/helpers/didyoumean.rb +0 -57
  202. data/lib/treat/helpers/escaping.rb +0 -15
  203. data/lib/treat/helpers/formatting.rb +0 -41
  204. data/lib/treat/helpers/objtohash.rb +0 -8
  205. data/lib/treat/helpers/platform.rb +0 -15
  206. data/lib/treat/helpers/reflection.rb +0 -17
  207. data/lib/treat/helpers/temporary.rb +0 -27
  208. data/lib/treat/helpers/verbosity.rb +0 -19
  209. data/lib/treat/helpers.rb +0 -5
  210. data/lib/treat/loaders.rb +0 -10
  211. data/lib/treat/proxies.rb +0 -106
  212. data/lib/treat/workers/formatters/unserializers/autoselect.rb +0 -17
  213. data/lib/treat/workers/inflectors/declensors/active_support.rb +0 -31
  214. data/lib/treat/workers/processors/tokenizers/tactful.rb +0 -68
  215. data/spec/core.rb +0 -441
  216. data/spec/phrase.rb +0 -112
  217. data/spec/word.rb +0 -111
data/spec/sandbox.rb CHANGED
@@ -1,36 +1,269 @@
1
- require_relative 'helper'
1
+ # encoding: utf-8
2
+ require_relative '../lib/treat'
3
+
4
+ require 'treat'
5
+ include Treat::Core::DSL
6
+
7
+ Treat.libraries.stanford.jar_path = '/ruby/treat/bin/'
8
+ Treat.libraries.stanford.model_path = '/ruby/treat/models/'
9
+
10
+ p = paragraph
11
+ s = sentence
12
+ w = word
13
+
14
+ p = phrase 'hello world'
15
+ e = email 'louis@gmail.com'
16
+
17
+ #d = document Treat.paths.spec + 'workers/examples/english/economist/hungarys_troubles.txt'
18
+ #d.apply :chunk, :segment, :tokenize, :tag, :category, :name_tag
19
+ #d.print_tree
20
+ #d = document Treat.paths.spec + 'workers/examples/english/economist/saving_the_euro.odt'
21
+ #d.print_tree
22
+ =begin
23
+ d = document 'test.htm'
24
+ d.apply :chunk
25
+ #d.serialize :yaml, file: 'test444.yaml'
26
+ d.set :test, 2
27
+ d.serialize :mongo, db: 'test'
28
+ d.set :test, 3
29
+ d.serialize :mongo, db: 'test'
30
+ d.apply :segment, :tokenize, :tag, :category
31
+ puts d.verb_count
32
+ #d2 = document id: d.id, db: 'test'
33
+ d2 = document 'features.test' => 3, db: 'test'
34
+ d2.apply :segment, :tokenize, :tag, :category
35
+ puts d2.verb_count
36
+ #d.print_tree
37
+ #s = document 'http://www.economist.com'
38
+
39
+ p = phrase 'hello', 'world', '!'
40
+ puts p.to_s
41
+ puts p.to_str
42
+ =end
43
+
44
+ =begin
45
+ ### Super basics.
46
+ puts p.value
47
+
48
+ p << 'bitch'
49
+ p << word('hello')
50
+ puts p.to_s
51
+ puts p.to_str
52
+ puts p.value
53
+ puts p.to_ary.inspect
54
+ =end
55
+
56
+ =begin
57
+
58
+ ### Configuration
59
+
60
+ # A boolean value indicating whether to silence the output of external libraries (e.g. Stanford tools, Enju, LDA, Ruby-FANN) when they are used.
61
+ puts Treat.core.verbosity.silence
62
+ # A boolean value indicating whether to explain the steps that Treat is performing.
63
+ puts Treat.core.verbosity.debug
64
+ # A boolean value indicating whether Treat should try to detect the language of newly input text.
65
+ puts Treat.core.language.detect
66
+ # The language to default to when detection is off.
67
+ puts Treat.core.language.default
68
+ # A symbol representing the finest level at which language detection should be performed if language detection is turned on.
69
+ puts Treat.core.language.detect_at
70
+
71
+ # A directory in which to create temporary files.
72
+ puts Treat.paths.tmp
73
+ # A directory in which to store downloaded files.
74
+ puts Treat.paths.files
75
+ # A directory containing trained models for various tasks.
76
+ puts Treat.paths.models
77
+ # A directory containing the spec files.
78
+ puts Treat.paths.spec
79
+ # A directory containing executables and JAR files.
80
+ puts Treat.paths.bin
81
+ puts Treat.paths.lib
82
+
83
+ # Set up Mongoid.
84
+ Treat.databases.mongo.db = 'your_database'
85
+ Treat.databases.mongo.host = 'localhost'
86
+ Treat.databases.mongo.port = '27017'
87
+
88
+ # Transparent string casting.
89
+ s = 'inflection'.stem
90
+ # is equivalent to
91
+ s = 'inflection'.to_entity.stem
92
+ # which comes down to
93
+ s = word('inflection').stem
94
+
95
+ # Transparent number casting.
96
+ n = 2.ordinal
97
+ # is equivalent to
98
+ s = 2.to_entity.ordinal
99
+ # which comes down to
100
+ s = number(2).ordinal
101
+ =end
2
102
  =begin
3
- problem = Problem(
4
- Question(:is_key_sentence, :sentence, :discrete, 0, [0, 1]),
5
- Feature(:word_count, 0),
6
- Tag(:number_count, 0)
7
- )
103
+ ### BASIC USAGE
8
104
 
9
- problem.id = 70316753228720
105
+ # Create a sentence
106
+ s = sentence 'Those who dream by day know of at least ' +
107
+ '19 things that escape those who dream only at night.'
10
108
 
11
- test = Paragraph("Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged.")
109
+ # Tokenize and tag it.
110
+ s.tokenize.tag
12
111
 
13
- test.do :segment, :tokenize, :tag, :category
112
+ # View the sentence structure.
113
+ s.print_tree
14
114
 
15
- test.sentences[0].set :is_key_sentence, 1
16
- test.sentences[1].set :is_key_sentence, 1
17
- test.sentences[2].set :is_key_sentence, 0
115
+ # Iterate over the tokens.
116
+ s.each_token do |tok|
117
+ puts tok.value
118
+ puts tok.type
119
+ end
18
120
 
19
- ds = test.export(problem)
20
121
 
21
- test.each_sentence do |s|
22
- puts s.classify :linear, training: ds
122
+
123
+ # Arrays instead of iterators.
124
+ (s.nouns + s.adjectives).each do |word|
125
+ puts word.synonyms
126
+ puts word.antonyms
127
+ end
128
+
129
+ # Functions on numbers.
130
+ s.each_number do |num|
131
+ puts num.ordinal
132
+ puts num.cardinal
23
133
  end
134
+
135
+ # See all the annotations.
136
+ s.each do |tok|
137
+ puts tok.inspect
138
+ end
139
+
140
+ # Lazy way of doing all of the above.
141
+ s = sentence 'Those who dream by day know of at least ' +
142
+ '19 things that escape those who dream only at night.'
143
+
144
+ s.apply :tokenize, :tag, :category,
145
+ :stem, :hyponyms, :hypernyms,
146
+ :antonyms, :ordinal, :cardinal
147
+
24
148
  =end
149
+
25
150
  =begin
26
- Treat.databases.mongo.db = 'testing_ds'
27
- ds1 = Treat::Core::DataSet.unserialize :marshal, file: 'test.dump'
28
- ds1.serialize :mongo
29
- puts ds1.problem.id
30
- ds = Treat::Core::DataSet.unserialize :mongo, {problem: ds1.problem.id}
31
- puts ds.inspect
151
+ ### A BIT MORE ADVANCED USAGE
152
+
153
+ section = section "Obama-Sarkozy Meeting\n" +
154
+ "Obama and Sarkozy met on January 1st to investigate " +
155
+ "the possibility of a new rescue plan. President " +
156
+ "Sarkozy is to meet Merkel next Tuesday in Berlin."
157
+
158
+ # Chunk: split the titles and paragraphs.
159
+ # Segment: perform sentence segmentation.
160
+ # Parse: parse the syntax of each sentence.
161
+ section.apply :chunk, :segment, :parse
162
+
163
+ # View the tree structure.
164
+ section.print_tree
32
165
 
33
- Treat.databases.mongo.db = 'testingyetagain'
34
- ds = DataSet(:dump, './all.dump')
35
- ds.to_mongo({db: 'test_ds', collection: 'data'})
36
- =end
166
+ # Get some basic info on the text.
167
+ puts section.title
168
+ puts section.sentence_count
169
+ puts section.word_count
170
+
171
+ section.apply :category
172
+ puts section.noun_count
173
+ puts section.frequency_of 'president'
174
+
175
+ section.each_phrase_with_tag('NP') do |phrase|
176
+ puts phrase.to_s
177
+ end
178
+
179
+ =end
180
+ =begin
181
+ ### URL documents, XML serialization.
182
+
183
+ urls = ['http://www.cbc.ca/news/world/story/2012/11/25/snc-lavalin-ben-aissa-charges.html',
184
+ 'http://www.cbc.ca/news/world/story/2012/11/25/egypt.html', 'http://www.cbc.ca/news/canada/prince-edward-island/story/2012/11/25/pei-murder-arrest-stlucia.html', 'http://www.cbc.ca/news/world/story/2012/11/25/bangladesh-garment-factory-fire.html']
185
+
186
+ c = collection
187
+ urls.each { |url| c << document(url) }
188
+
189
+ # View the collection.
190
+ c.print_tree
191
+
192
+ c.apply :chunk, :segment, :tokenize
193
+ c.serialize :xml, :file => 'test.xml'
194
+
195
+ # Reopen the collection.
196
+ c = collection 'test.xml'
197
+
198
+ # View it again.
199
+ c.print_tree
200
+ =end
201
+ =begin
202
+ include Treat::Core::DSL
203
+
204
+ # Show progress bars for download.
205
+ Treat.core.verbosity.silence = false
206
+ # Explain what Treat is doing.
207
+ Treat.core.verbosity.debug = true
208
+
209
+ # Define the question "is it junk?" on sentences.
210
+ qn = question(:is_junk, :sentence)
211
+
212
+ # Frame the problem as depending on punctuation
213
+ # count and word count for each sentence.
214
+ pb = problem(qn,
215
+ feature(:punctuation_count),
216
+ feature(:word_count) )
217
+
218
+ # Get some web documents to work on.
219
+ url1 = 'http://en.wikipedia.org/wiki/NOD_mouse'
220
+ url2 = 'http://en.wikipedia.org/wiki/Academic_studies_about_Wikipedia'
221
+ d1, d2 = document(url1), document(url2)
222
+
223
+ # Process both of our documents.
224
+ [d1,d2].apply(:chunk, :segment, :tokenize)
225
+
226
+ # Answer our problem to create a training set.
227
+ d1.sentences[0..17].each { |s| s.set :is_junk, 0 }
228
+ d1.sentences[17..-1].each { |s| s.set :is_junk, 1 }
229
+ d_set = d1.export(pb)
230
+
231
+ # Define our gold standard results for evaluation.
232
+ d2.sentences[0..81].each { |s| s.set :is_true_junk, 0 }
233
+ d2.sentences[81..-1].each { |s| s.set :is_true_junk, 1 }
234
+
235
+ tp, fp, tn, fn = 0.0, 0.0, 0.0, 0.0
236
+
237
+ d2.sentences.map do |s|
238
+ pred = s.classify(:id3, training: d_set)
239
+ if pred == 1
240
+ tp += 1 if s.is_true_junk == 1
241
+ fp += 1 if s.is_true_junk == 0
242
+ else
243
+ tn += 1 if s.is_true_junk == 0
244
+ fn += 1 if s.is_true_junk == 1
245
+ end
246
+ end
247
+
248
+ puts "Precision: #{tp/(tp + fp)}"
249
+ puts "Recall: #{tp/(tp + fn)}"
250
+ =end
251
+ =begin
252
+ d = document 'http://louismullie.com/susan-text-scan1.jpg'
253
+ d.apply :chunk, :segment, :tokenize
254
+ d.print_tree
255
+ =end
256
+ =begin
257
+ # Syntax example
258
+ phra = phrase 'Obama', 'Sarkozy', 'Meeting'
259
+
260
+ para = paragraph 'Obama and Sarkozy met on January 1st to ' +
261
+ 'investigate the possibility of a new rescue plan. Nicolas ' +
262
+ 'Sarkozy is to meet Merkel next Tuesday in Berlin.'
263
+
264
+ sect = section title(phra), para
265
+ =end
266
+ =begin
267
+ puts "beer".plural.inspect
268
+ =end
269
+ p = paragraph
data/spec/treat.rb CHANGED
@@ -2,44 +2,36 @@ require_relative 'helper'
2
2
 
3
3
  describe Treat do
4
4
 
5
- describe "Syntactic sugar:"
6
-
7
- describe "#sweeten!, #unsweeten!" do
8
-
9
- it "respectively turn on and off syntactic sugar and " +
10
- "define/undefine entity builders as uppercase methods " +
11
- "in the global namespace" do
12
-
13
- Treat.core.entities.list.each do |type|
14
-
15
- next if type == :symbol
16
-
17
- Treat::Config.sweeten!
18
-
19
- Treat.core.syntax.sweetened.should eql true
20
-
21
- Object.method_defined?(
22
- :"#{type.to_s.capitalize}").
23
- should eql true
24
-
25
- Treat::Config.unsweeten!
26
- Treat.core.syntax.sweetened.should eql false
27
-
28
- Object.method_defined?(
29
- type.to_s.capitalize.intern).should eql false
30
-
31
- Object.method_defined?(
32
- :"#{type.to_s.capitalize}").
33
- should eql false
5
+ describe "Syntactic sugar:" do
6
+
7
+ describe "#sweeten!, #unsweeten!" do
8
+ it "respectively turn on and off syntactic sugar and " +
9
+ "define/undefine entity builders as uppercase methods " +
10
+ "in the global namespace" do
11
+ Treat.core.entities.list.each do |type|
12
+ next if type == :symbol
13
+
14
+ Treat::Config.sweeten!
15
+ Treat.core.syntax.sweetened.should eql true
16
+ Object.method_defined?(
17
+ :"#{type.to_s.capitalize}").
18
+ should eql true
34
19
 
20
+ Treat::Config.unsweeten!
21
+ Treat.core.syntax.sweetened.should eql false
22
+ Object.method_defined?(
23
+ type.to_s.capitalize.intern).should eql false
24
+ Object.method_defined?(
25
+ :"#{type.to_s.capitalize}").
26
+ should eql false
27
+ end
35
28
  end
36
-
37
29
  end
38
-
30
+
39
31
  end
40
-
32
+
41
33
  describe "Paths:" do
42
-
34
+
43
35
  paths = Treat.core.paths.description
44
36
  # Check IO for bin, files, tmp, models. Fix.
45
37
  paths.each_pair do |path, files|
@@ -49,7 +41,7 @@ describe Treat do
49
41
  end
50
42
  end
51
43
  end
52
-
44
+
53
45
  end
54
46
 
55
47
  end
@@ -0,0 +1,137 @@
1
+ class Treat::Specs::Workers::Agnostic < Treat::Specs::Workers::Language
2
+
3
+ # TODO: :tf_idf, :keywords, :classifiers
4
+ # :read, :unserialize
5
+
6
+ Scenarios = {
7
+
8
+ # Also tests unserialize.
9
+ serialize: {
10
+ entity: {
11
+ examples: [
12
+ ["A test entity.", "A test entity."]
13
+ ],
14
+ generator: lambda { |selector| Treat::Entities::Entity.build(selector).to_s }
15
+ }
16
+ },
17
+ classify: {
18
+ entity: {
19
+ examples: [
20
+ ["Homer", 1, lambda { {training: Treat::Learning::DataSet.build('test.marshal')} }]
21
+ ],
22
+ preprocessor: lambda do |entity|
23
+ ds = Treat::Learning::DataSet.new(
24
+ Treat::Learning::Problem.new(
25
+ Treat::Learning::Question.new(:is_person, :word, :false, :discrete),
26
+ Treat::Learning::Feature.new(:first_capital, 0, "->(e) { (e.to_s[0] =~ /^[A-Z]$/) ? 1 : 0 }"),
27
+ Treat::Learning::Tag.new(:value, 0)
28
+ ))
29
+ w1, w2, w3, w4, w5 =
30
+ ["Alfred", "lucky", "Hobbit", "hello", "Alice"].
31
+ map { |w| Treat::Entities::Word.new(w) }
32
+ w1.set :is_person, 1
33
+ w2.set :is_person, 0
34
+ w3.set :is_person, 1
35
+ w4.set :is_person, 0
36
+ w5.set :is_person, 1
37
+ ds << w1; ds << w2; ds << w3
38
+ ds.serialize :marshal, file: 'test.marshal'
39
+ end
40
+ }
41
+ },
42
+ visualize: {
43
+ entity: {
44
+ examples: {
45
+ standoff: [
46
+ ["I walked to the store.", "(S\n (PRP I) (VBD walked) (TO to) (DT the) (NN store) (. .))\n"]
47
+ ],
48
+ tree: [
49
+ ["I walked to the store.", "+ Sentence (*) --- \"I walked to the store.\" --- {} --- [] \n|\n+--> Word (*) --- \"I\" --- {} --- [] \n+--> Word (*) --- \"walked\" --- {} --- [] \n+--> Word (*) --- \"to\" --- {} --- [] \n+--> Word (*) --- \"the\" --- {} --- [] \n+--> Word (*) --- \"store\" --- {} --- [] \n+--> Punctuation (*) --- \".\" --- {} --- [] "]
50
+ ],
51
+ dot: [
52
+ ["I walked to the store.", "graph {\n* [label=\"Sentence\\n\\\"I walked to the store.\\\"\",color=\"\"]\n* [label=\"Word\\n\\\"I\\\"\",color=\"\"]\n* -- *;\n* [label=\"Word\\n\\\"walked\\\"\",color=\"\"]\n* -- *;\n* [label=\"Word\\n\\\"to\\\"\",color=\"\"]\n* -- *;\n* [label=\"Word\\n\\\"the\\\"\",color=\"\"]\n* -- *;\n* [label=\"Word\\n\\\"store\\\"\",color=\"\"]\n* -- *;\n* [label=\"Punctuation\\n\\\".\\\"\",color=\"\"]\n* -- *;\n}"]
53
+ ]
54
+ },
55
+ preprocessor: lambda { |entity| entity.tokenize },
56
+ generator: lambda { |result| result.gsub(/[0-9]+/, '*') }
57
+ }
58
+ },
59
+
60
+ =begin
61
+ keywords: {
62
+ document: {
63
+ examples: [
64
+ ["./spec/workers/examples/english/economist/saving_the_euro.odt",
65
+ ["crisis", "government", "called", "financial", "funds", "treaty"]]
66
+ ],
67
+ preprocessor: lambda do |document|
68
+ coll = Treat::Entities::Collection.build('./spec/workers/examples/english/economist/')
69
+ coll << document
70
+ coll.apply(:chunk, :segment, :tokenize, :keywords)
71
+ document
72
+ end
73
+ },
74
+ section: {
75
+ examples: [
76
+ ["A test phrase", ["A", "test", "phrase"]]
77
+ ]
78
+ },
79
+ zone: {
80
+ examples: [
81
+ ["A test phrase", ["A", "test", "phrase"]]
82
+ ]
83
+ }
84
+ },
85
+ =end
86
+ =begin
87
+ unserialize: {
88
+ examples: [
89
+ ["A test entity.", "A test entity."]
90
+ ],
91
+ generator: lambda { |selector| Treat::Entities::Entity.build(selector).to_s }
92
+ },
93
+ =end
94
+ =begin
95
+ # Index
96
+ search: {
97
+ collection: {
98
+ examples: [
99
+ ["./spec/workers/examples/english/economist/",
100
+ "Hungary's troubles", {query: 'Hungary'}]
101
+ ],
102
+ generator: lambda { |docs| docs[0].titles[0] },
103
+ preprocessor: lambda { |coll| coll.apply(:index) }
104
+ },
105
+ },
106
+ =end
107
+ =begin
108
+ keywords: {
109
+ document: {
110
+ examples: [
111
+ ["./spec/languages/english/economist/saving_the_euro.odt",
112
+ ["A", "test", "phrase"]]
113
+ ],
114
+ preprocessor: lambda { |doc| doc.parent = Collection('./spec/languages/english/economist/') }
115
+ },
116
+ section: {
117
+ examples: [
118
+ ["A test phrase", ["A", "test", "phrase"]]
119
+ ]
120
+ },
121
+ zone: {
122
+ examples: [
123
+ ["A test phrase", ["A", "test", "phrase"]]
124
+ ]
125
+ }
126
+ },
127
+ =end
128
+ topic_words: {
129
+ collection: {
130
+ examples: [
131
+ ["./spec/workers/examples/english/economist", [["orban", "minister", "bajnai", "mr", "government", "president", "law", "brussels", "commission", "hu"], ["government", "minister", "fidesz", "mr", "hvg", "today", "hungarian", "bajnai", "national", "office"], ["mr", "today", "central", "minister", "crisis", "prime", "president", "bank", "european", "government"], ["sarkozy", "mr", "greece", "german", "summit", "france", "merkel", "opera", "growth", "euro"], ["central", "hand", "minister", "week", "bank", "forced", "hungarian", "parliament", "political", "hvg"], ["minister", "crisis", "central", "bank", "hand", "law", "forced", "bajnai", "parliament", "president"], ["mr", "bank", "european", "central", "government", "called", "today", "financial", "policies", "press"], ["mr", "crisis", "government", "central", "today", "funds", "president", "issues", "bank", "called"], ["mr", "crisis", "minister", "today", "european", "prime", "financial", "president", "issues", "treaty"], ["central", "minister", "mr", "bajnai", "orban", "bank", "parliament", "week", "fidesz", "washington"], ["mr", "central", "government", "crisis", "minister", "orban", "hand", "fidesz", "bajnai", "judicial"], ["mr", "sarkozy", "chancellor", "government", "european", "merkozy", "role", "mrs", "interest", "quickly"], ["mr", "orban", "government", "crisis", "hungarian", "independence", "prime", "today", "hand", "bajnai"], ["euro", "fiscal", "merkel", "mrs", "sarkozy", "mr", "european", "zone", "leaders", "chancellor"], ["mr", "bank", "crisis", "financial", "president", "funds", "government", "treaty", "central", "part"], ["mr", "central", "minister", "crisis", "prime", "european", "government", "bank", "treaty", "issues"], ["sarkozy", "fiscal", "merkel", "mrs", "growth", "zone", "german", "role", "paper", "quickly"], ["mr", "government", "orban", "bank", "bajnai", "hungarian", "prime", "-", "hu", "commission"], ["mr", "orban", "today", "bank", "minister", "national", "government", "-", "crisis", 
"forced"], ["role", "summit", "merkel", "euro", "zone", "german", "mr", "greece", "sarkozy", "step"]]]
132
+ ]
133
+ }
134
+ }
135
+ }
136
+
137
+ end