treat 1.2.0 → 2.0.0rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (217) hide show
  1. data/LICENSE +2 -2
  2. data/README.md +12 -21
  3. data/lib/treat/autoload.rb +44 -0
  4. data/lib/treat/config/config.rb +38 -0
  5. data/lib/treat/config/configurable.rb +51 -0
  6. data/lib/treat/config/data/config.rb +50 -0
  7. data/lib/treat/config/data/core.rb +52 -0
  8. data/lib/treat/config/data/databases.rb +10 -0
  9. data/lib/treat/config/data/entities.rb +15 -0
  10. data/lib/treat/config/data/languages/agnostic.rb +31 -0
  11. data/lib/treat/config/{languages → data/languages}/arabic.rb +0 -0
  12. data/lib/treat/config/{languages → data/languages}/chinese.rb +0 -0
  13. data/lib/treat/config/{languages → data/languages}/dutch.rb +1 -1
  14. data/lib/treat/config/data/languages/english.rb +95 -0
  15. data/lib/treat/config/data/languages/french.rb +148 -0
  16. data/lib/treat/config/data/languages/german.rb +135 -0
  17. data/lib/treat/config/{languages → data/languages}/greek.rb +1 -1
  18. data/lib/treat/config/data/languages/italian.rb +162 -0
  19. data/lib/treat/config/data/languages/polish.rb +11 -0
  20. data/lib/treat/config/{languages → data/languages}/portuguese.rb +1 -1
  21. data/lib/treat/config/{languages → data/languages}/russian.rb +1 -1
  22. data/lib/treat/config/data/languages/spanish.rb +291 -0
  23. data/lib/treat/config/data/languages/swedish.rb +289 -0
  24. data/lib/treat/config/data/libraries.rb +12 -0
  25. data/lib/treat/config/data/linguistics.rb +44 -0
  26. data/lib/treat/config/data/tags.rb +328 -0
  27. data/lib/treat/config/{workers → data/workers}/extractors.rb +2 -10
  28. data/lib/treat/config/{workers → data/workers}/formatters.rb +0 -0
  29. data/lib/treat/config/{workers → data/workers}/inflectors.rb +0 -0
  30. data/lib/treat/config/{workers → data/workers}/learners.rb +0 -0
  31. data/lib/treat/config/{workers → data/workers}/lexicalizers.rb +4 -3
  32. data/lib/treat/config/{workers → data/workers}/processors.rb +3 -3
  33. data/lib/treat/config/{workers → data/workers}/retrievers.rb +0 -0
  34. data/lib/treat/config/importable.rb +31 -0
  35. data/lib/treat/config/paths.rb +23 -0
  36. data/lib/treat/config/tags.rb +37 -0
  37. data/lib/treat/core/dsl.rb +55 -0
  38. data/lib/treat/{installer.rb → core/installer.rb} +10 -12
  39. data/lib/treat/core/server.rb +40 -0
  40. data/lib/treat/entities/entities.rb +101 -0
  41. data/lib/treat/entities/{abilities/doable.rb → entity/applicable.rb} +5 -3
  42. data/lib/treat/entities/{abilities → entity}/buildable.rb +118 -63
  43. data/lib/treat/entities/{abilities → entity}/checkable.rb +2 -2
  44. data/lib/treat/entities/{abilities → entity}/comparable.rb +6 -6
  45. data/lib/treat/entities/{abilities → entity}/countable.rb +2 -1
  46. data/lib/treat/entities/entity/debuggable.rb +86 -0
  47. data/lib/treat/entities/{abilities → entity}/delegatable.rb +16 -26
  48. data/lib/treat/entities/{abilities → entity}/exportable.rb +2 -2
  49. data/lib/treat/entities/{abilities → entity}/iterable.rb +4 -16
  50. data/lib/treat/entities/{abilities → entity}/magical.rb +22 -17
  51. data/lib/treat/entities/entity/registrable.rb +36 -0
  52. data/lib/treat/entities/{abilities → entity}/stringable.rb +18 -15
  53. data/lib/treat/entities/entity.rb +86 -77
  54. data/lib/treat/exception.rb +3 -0
  55. data/lib/treat/helpers/hash.rb +29 -0
  56. data/lib/treat/helpers/help.rb +35 -0
  57. data/lib/treat/helpers/object.rb +55 -0
  58. data/lib/treat/helpers/string.rb +124 -0
  59. data/lib/treat/{core → learning}/data_set.rb +11 -11
  60. data/lib/treat/{core → learning}/export.rb +3 -3
  61. data/lib/treat/{core → learning}/problem.rb +26 -16
  62. data/lib/treat/{core → learning}/question.rb +5 -9
  63. data/lib/treat/loaders/linguistics.rb +8 -9
  64. data/lib/treat/loaders/stanford.rb +5 -11
  65. data/lib/treat/modules.rb +33 -0
  66. data/lib/treat/proxies/array.rb +27 -0
  67. data/lib/treat/proxies/language.rb +47 -0
  68. data/lib/treat/proxies/number.rb +18 -0
  69. data/lib/treat/proxies/proxy.rb +25 -0
  70. data/lib/treat/proxies/string.rb +18 -0
  71. data/lib/treat/version.rb +10 -1
  72. data/lib/treat/{workers.rb → workers/categorizable.rb} +18 -19
  73. data/lib/treat/workers/extractors/keywords/tf_idf.rb +11 -11
  74. data/lib/treat/workers/extractors/language/what_language.rb +8 -6
  75. data/lib/treat/workers/extractors/name_tag/stanford.rb +10 -4
  76. data/lib/treat/workers/extractors/similarity/levenshtein.rb +36 -0
  77. data/lib/treat/workers/extractors/similarity/tf_idf.rb +27 -0
  78. data/lib/treat/workers/extractors/tf_idf/native.rb +4 -4
  79. data/lib/treat/workers/extractors/time/chronic.rb +2 -4
  80. data/lib/treat/workers/extractors/time/nickel.rb +19 -20
  81. data/lib/treat/workers/extractors/time/ruby.rb +2 -1
  82. data/lib/treat/workers/extractors/topic_words/lda.rb +12 -12
  83. data/lib/treat/workers/extractors/topics/reuters.rb +9 -13
  84. data/lib/treat/workers/formatters/readers/autoselect.rb +1 -1
  85. data/lib/treat/workers/formatters/readers/image.rb +19 -9
  86. data/lib/treat/workers/formatters/readers/odt.rb +2 -1
  87. data/lib/treat/workers/formatters/readers/pdf.rb +20 -3
  88. data/lib/treat/workers/formatters/readers/xml.rb +0 -1
  89. data/lib/treat/workers/formatters/serializers/mongo.rb +10 -20
  90. data/lib/treat/workers/formatters/serializers/xml.rb +17 -26
  91. data/lib/treat/workers/formatters/serializers/yaml.rb +5 -4
  92. data/lib/treat/workers/formatters/unserializers/mongo.rb +4 -4
  93. data/lib/treat/workers/formatters/unserializers/xml.rb +3 -4
  94. data/lib/treat/workers/formatters/unserializers/yaml.rb +3 -4
  95. data/lib/treat/workers/formatters/visualizers/dot.rb +1 -0
  96. data/lib/treat/workers/formatters/visualizers/standoff.rb +2 -3
  97. data/lib/treat/workers/formatters/visualizers/tree.rb +2 -3
  98. data/lib/treat/workers/{group.rb → groupable.rb} +9 -9
  99. data/lib/treat/workers/inflectors/cardinalizers/linguistics.rb +1 -3
  100. data/lib/treat/workers/inflectors/conjugators/linguistics.rb +5 -7
  101. data/lib/treat/workers/inflectors/declensors/english.rb +13 -20
  102. data/lib/treat/workers/inflectors/declensors/linguistics.rb +29 -28
  103. data/lib/treat/workers/inflectors/ordinalizers/linguistics.rb +0 -2
  104. data/lib/treat/workers/inflectors/stemmers/porter.rb +8 -10
  105. data/lib/treat/workers/inflectors/stemmers/porter_c.rb +7 -7
  106. data/lib/treat/workers/inflectors/stemmers/uea.rb +3 -8
  107. data/lib/treat/workers/learners/classifiers/id3.rb +17 -14
  108. data/lib/treat/workers/learners/classifiers/linear.rb +15 -27
  109. data/lib/treat/workers/learners/classifiers/mlp.rb +32 -19
  110. data/lib/treat/workers/learners/classifiers/svm.rb +28 -21
  111. data/lib/treat/workers/lexicalizers/categorizers/from_tag.rb +19 -3
  112. data/lib/treat/workers/lexicalizers/sensers/wordnet.rb +15 -7
  113. data/lib/treat/workers/lexicalizers/taggers/brill/patch.rb +4 -1
  114. data/lib/treat/workers/lexicalizers/taggers/brill.rb +8 -19
  115. data/lib/treat/workers/lexicalizers/taggers/lingua.rb +4 -15
  116. data/lib/treat/workers/lexicalizers/taggers/stanford.rb +22 -13
  117. data/lib/treat/workers/processors/chunkers/autoselect.rb +2 -3
  118. data/lib/treat/workers/processors/chunkers/html.rb +1 -6
  119. data/lib/treat/workers/processors/parsers/enju.rb +2 -4
  120. data/lib/treat/workers/processors/parsers/stanford.rb +13 -7
  121. data/lib/treat/workers/processors/segmenters/punkt.rb +25 -11
  122. data/lib/treat/workers/processors/segmenters/scalpel.rb +20 -0
  123. data/lib/treat/workers/processors/segmenters/srx.rb +42 -0
  124. data/lib/treat/workers/processors/segmenters/stanford.rb +5 -5
  125. data/lib/treat/workers/processors/segmenters/tactful.rb +21 -11
  126. data/lib/treat/workers/processors/tokenizers/ptb.rb +40 -30
  127. data/lib/treat/workers/processors/tokenizers/punkt.rb +14 -19
  128. data/lib/treat/workers/processors/tokenizers/stanford.rb +38 -22
  129. data/lib/treat/workers/retrievers/indexers/ferret.rb +6 -3
  130. data/lib/treat/workers/retrievers/searchers/ferret.rb +2 -2
  131. data/lib/treat/workers/workers.rb +6 -0
  132. data/lib/treat.rb +18 -32
  133. data/models/MANIFEST +1 -0
  134. data/spec/core/data_set.rb +174 -0
  135. data/spec/core/export.rb +52 -0
  136. data/spec/core/problem.rb +144 -0
  137. data/spec/core/question.rb +52 -0
  138. data/spec/{collection.rb → entities/collection.rb} +20 -35
  139. data/spec/{document.rb → entities/document.rb} +3 -54
  140. data/spec/{entity.rb → entities/entity.rb} +10 -9
  141. data/spec/entities/phrase.rb +33 -0
  142. data/spec/{token.rb → entities/token.rb} +0 -57
  143. data/spec/entities/word.rb +3 -0
  144. data/spec/{zone.rb → entities/zone.rb} +0 -26
  145. data/spec/helper.rb +116 -32
  146. data/spec/sandbox.rb +258 -25
  147. data/spec/treat.rb +26 -34
  148. data/spec/workers/agnostic.rb +137 -0
  149. data/spec/workers/english.rb +194 -0
  150. data/spec/workers/examples/english/economist/hungarys_troubles.txt +46 -0
  151. data/spec/workers/examples/english/economist/saving_the_euro.odt +0 -0
  152. data/spec/{samples → workers/examples/english}/mathematicians/archimedes.abw +0 -0
  153. data/spec/{samples → workers/examples/english}/mathematicians/euler.html +0 -0
  154. data/spec/{samples → workers/examples/english}/mathematicians/gauss.pdf +0 -0
  155. data/spec/{samples → workers/examples/english}/mathematicians/leibniz.txt +0 -0
  156. data/spec/{samples → workers/examples/english}/mathematicians/newton.doc +0 -0
  157. data/spec/workers/examples/english/phrase.xml +5 -0
  158. data/spec/workers/examples/english/test.txt +1 -0
  159. data/spec/workers/language.rb +280 -0
  160. data/spec/workers.rb +28 -0
  161. metadata +122 -105
  162. data/lib/treat/config/core/acronyms.rb +0 -5
  163. data/lib/treat/config/core/encodings.rb +0 -8
  164. data/lib/treat/config/core/entities.rb +0 -2
  165. data/lib/treat/config/core/language.rb +0 -3
  166. data/lib/treat/config/core/paths.rb +0 -8
  167. data/lib/treat/config/core/syntax.rb +0 -1
  168. data/lib/treat/config/core/verbosity.rb +0 -1
  169. data/lib/treat/config/databases/default.rb +0 -1
  170. data/lib/treat/config/databases/mongo.rb +0 -1
  171. data/lib/treat/config/languages/agnostic.rb +0 -34
  172. data/lib/treat/config/languages/english.rb +0 -60
  173. data/lib/treat/config/languages/french.rb +0 -18
  174. data/lib/treat/config/languages/german.rb +0 -18
  175. data/lib/treat/config/languages/italian.rb +0 -12
  176. data/lib/treat/config/languages/polish.rb +0 -12
  177. data/lib/treat/config/languages/spanish.rb +0 -12
  178. data/lib/treat/config/languages/swedish.rb +0 -12
  179. data/lib/treat/config/libraries/punkt.rb +0 -1
  180. data/lib/treat/config/libraries/reuters.rb +0 -1
  181. data/lib/treat/config/libraries/stanford.rb +0 -1
  182. data/lib/treat/config/linguistics/categories.rb +0 -4
  183. data/lib/treat/config/linguistics/punctuation.rb +0 -33
  184. data/lib/treat/config/tags/aligned.rb +0 -221
  185. data/lib/treat/config/tags/enju.rb +0 -71
  186. data/lib/treat/config/tags/paris7.rb +0 -17
  187. data/lib/treat/config/tags/ptb.rb +0 -15
  188. data/lib/treat/config/workers/list.rb +0 -1
  189. data/lib/treat/config.rb +0 -135
  190. data/lib/treat/core.rb +0 -5
  191. data/lib/treat/entities/abilities/copyable.rb +0 -47
  192. data/lib/treat/entities/abilities/debuggable.rb +0 -83
  193. data/lib/treat/entities/abilities/registrable.rb +0 -46
  194. data/lib/treat/entities/collection.rb +0 -40
  195. data/lib/treat/entities/document.rb +0 -10
  196. data/lib/treat/entities/group.rb +0 -18
  197. data/lib/treat/entities/section.rb +0 -13
  198. data/lib/treat/entities/token.rb +0 -47
  199. data/lib/treat/entities/zone.rb +0 -12
  200. data/lib/treat/entities.rb +0 -6
  201. data/lib/treat/helpers/didyoumean.rb +0 -57
  202. data/lib/treat/helpers/escaping.rb +0 -15
  203. data/lib/treat/helpers/formatting.rb +0 -41
  204. data/lib/treat/helpers/objtohash.rb +0 -8
  205. data/lib/treat/helpers/platform.rb +0 -15
  206. data/lib/treat/helpers/reflection.rb +0 -17
  207. data/lib/treat/helpers/temporary.rb +0 -27
  208. data/lib/treat/helpers/verbosity.rb +0 -19
  209. data/lib/treat/helpers.rb +0 -5
  210. data/lib/treat/loaders.rb +0 -10
  211. data/lib/treat/proxies.rb +0 -106
  212. data/lib/treat/workers/formatters/unserializers/autoselect.rb +0 -17
  213. data/lib/treat/workers/inflectors/declensors/active_support.rb +0 -31
  214. data/lib/treat/workers/processors/tokenizers/tactful.rb +0 -68
  215. data/spec/core.rb +0 -441
  216. data/spec/phrase.rb +0 -112
  217. data/spec/word.rb +0 -111
@@ -0,0 +1,289 @@
1
+ {
2
+ dependencies: [
3
+ 'punkt-segmenter',
4
+ 'tactful_tokenizer'
5
+ ],
6
+ workers: {
7
+ processors: {
8
+ segmenters: [:punkt],
9
+ tokenizers: []
10
+ }
11
+ },
12
+ stop_words:
13
+ [
14
+ "atminstone",
15
+ "an",
16
+ "anda",
17
+ "aven",
18
+ "aldrig",
19
+ "alla",
20
+ "alls",
21
+ "allt",
22
+ "alltid",
23
+ "allting",
24
+ "alltsa",
25
+ "andra",
26
+ "annan",
27
+ "annars",
28
+ "antingen",
29
+ "att",
30
+ "bakom",
31
+ "bland",
32
+ "blev",
33
+ "bli",
34
+ "bliva",
35
+ "blivit",
36
+ "bort",
37
+ "bortom",
38
+ "bredvid",
39
+ "dar",
40
+ "darav",
41
+ "darefter",
42
+ "darfor",
43
+ "dari",
44
+ "darigenom",
45
+ "darvid",
46
+ "dedar",
47
+ "definitivt",
48
+ "del",
49
+ "den",
50
+ "dendar",
51
+ "denhar",
52
+ "denna",
53
+ "deras",
54
+ "dessa",
55
+ "dessutom",
56
+ "desto",
57
+ "det",
58
+ "detta",
59
+ "dylik",
60
+ "efterat",
61
+ "efter",
62
+ "eftersom",
63
+ "eller",
64
+ "emellertid",
65
+ "enbart",
66
+ "endast",
67
+ "enligt",
68
+ "ens",
69
+ "ensam",
70
+ "envar",
71
+ "eran",
72
+ "etc",
73
+ "ett",
74
+ "exakt",
75
+ "fatt",
76
+ "fastan",
77
+ "fick",
78
+ "fler",
79
+ "flera",
80
+ "foljande",
81
+ "foljde",
82
+ "foljer",
83
+ "for",
84
+ "fore",
85
+ "forhoppningsvis",
86
+ "formodligen",
87
+ "forr",
88
+ "forra",
89
+ "forutom",
90
+ "forvisso",
91
+ "fran",
92
+ "framfor",
93
+ "fullstandigt",
94
+ "gang",
95
+ "gar",
96
+ "gatt",
97
+ "ganska",
98
+ "gav",
99
+ "genom",
100
+ "genomgaende",
101
+ "ger",
102
+ "gick",
103
+ "gjorde",
104
+ "gjort",
105
+ "gor",
106
+ "hade",
107
+ "har",
108
+ "harav",
109
+ "har",
110
+ "hej",
111
+ "hela",
112
+ "helst",
113
+ "helt",
114
+ "hitta",
115
+ "hon",
116
+ "honom",
117
+ "hur",
118
+ "huruvida",
119
+ "huvudsakligen",
120
+ "ibland",
121
+ "icke",
122
+ "ickedestomindre",
123
+ "igen",
124
+ "ihop",
125
+ "inat",
126
+ "ingen",
127
+ "ingenstans",
128
+ "inget",
129
+ "innan",
130
+ "innehalla",
131
+ "inre",
132
+ "inte",
133
+ "inuti",
134
+ "istaellet",
135
+ "kanske",
136
+ "klart",
137
+ "knappast",
138
+ "knappt",
139
+ "kom",
140
+ "komma",
141
+ "kommer",
142
+ "kraver",
143
+ "kunde",
144
+ "kunna",
145
+ "lata",
146
+ "later",
147
+ "lagga",
148
+ "langre",
149
+ "laet",
150
+ "lagd",
151
+ "leta",
152
+ "letar",
153
+ "manga",
154
+ "maste",
155
+ "med",
156
+ "medan",
157
+ "medans",
158
+ "mellan",
159
+ "mest",
160
+ "min",
161
+ "mindre",
162
+ "minst",
163
+ "mittemellan",
164
+ "motsvarande",
165
+ "mycket",
166
+ "nagon",
167
+ "nagongang",
168
+ "nagonsin",
169
+ "nagonstans",
170
+ "nagonting",
171
+ "nagorlunda",
172
+ "nagot",
173
+ "namligen",
174
+ "nar",
175
+ "nara",
176
+ "nasta",
177
+ "nastan",
178
+ "nedat",
179
+ "nedanfor",
180
+ "nerat",
181
+ "ner",
182
+ "nog",
183
+ "normalt",
184
+ "nummer",
185
+ "nuvarande",
186
+ "nytt",
187
+ "oavsett",
188
+ "och",
189
+ "ocksa",
190
+ "oppna",
191
+ "over",
192
+ "overallt",
193
+ "ofta",
194
+ "okej",
195
+ "olika",
196
+ "ovanfor",
197
+ "ratt",
198
+ "redan",
199
+ "relativt",
200
+ "respektive",
201
+ "rimlig",
202
+ "rimligen",
203
+ "rimligt",
204
+ "salunda",
205
+ "savida",
206
+ "saga",
207
+ "sager",
208
+ "sakert",
209
+ "sand",
210
+ "sarskilt",
211
+ "satt",
212
+ "sak",
213
+ "samma",
214
+ "samtliga",
215
+ "sedd",
216
+ "senare",
217
+ "senaste",
218
+ "ser",
219
+ "sig",
220
+ "sista",
221
+ "sjaelv",
222
+ "ska",
223
+ "skall",
224
+ "skickad",
225
+ "skriva",
226
+ "skulle",
227
+ "snabb",
228
+ "snarare",
229
+ "snart",
230
+ "som",
231
+ "somliga",
232
+ "speciellt",
233
+ "stalla",
234
+ "stallet",
235
+ "starta",
236
+ "strax",
237
+ "stundom",
238
+ "tackar",
239
+ "tanka",
240
+ "taga",
241
+ "tagen",
242
+ "tala",
243
+ "tanke",
244
+ "tidigare",
245
+ "tills",
246
+ "tog",
247
+ "totalt",
248
+ "trolig",
249
+ "troligen",
250
+ "tvaers",
251
+ "tvars",
252
+ "tycka",
253
+ "tyckte",
254
+ "tyvarr",
255
+ "understundom",
256
+ "upp",
257
+ "uppenbarligen",
258
+ "uppenbart",
259
+ "utan",
260
+ "utanfor",
261
+ "uteslutande",
262
+ "utom",
263
+ "var",
264
+ "varan",
265
+ "vad",
266
+ "val",
267
+ "varde",
268
+ "vanlig",
269
+ "vanligen",
270
+ "var",
271
+ "vare",
272
+ "varenda",
273
+ "varfor",
274
+ "varifran",
275
+ "varit",
276
+ "varje",
277
+ "varken",
278
+ "vars",
279
+ "vart",
280
+ "vem",
281
+ "verkligen",
282
+ "vidare",
283
+ "vilken",
284
+ "vill",
285
+ "visar",
286
+ "visst",
287
+ "visste"
288
+ ]
289
+ }
@@ -0,0 +1,12 @@
1
+ {
2
+ punkt: {
3
+ model_path: nil
4
+ },
5
+ reuters: {
6
+ model_path: nil
7
+ },
8
+ stanford: {
9
+ jar_path: nil,
10
+ model_path: nil
11
+ }
12
+ }
@@ -0,0 +1,44 @@
1
+ {
2
+ categories:
3
+ ['adjective', 'adverb', 'noun',
4
+ 'verb', 'interjection', 'clitic',
5
+ 'coverb', 'conjunction', 'determiner',
6
+ 'particle', 'preposition', 'pronoun',
7
+ 'number', 'symbol', 'punctuation',
8
+ 'complementizer'],
9
+
10
+ punctuation: {
11
+ punct_to_category: {
12
+ '.' => 'period',
13
+ ',' => 'comma',
14
+ ';' => 'semicolon',
15
+ ':' => 'colon',
16
+ '?' => 'interrogation',
17
+ '!' => 'exclamation',
18
+ '"' => 'double_quote',
19
+ "'" => 'single_quote',
20
+ '$' => 'dollar',
21
+ '%' => 'percent',
22
+ '#' => 'hash',
23
+ '*' => 'asterisk',
24
+ '&' => 'ampersand',
25
+ '+' => 'plus',
26
+ '-' => 'dash',
27
+ '/' => 'slash',
28
+ '\\' => 'backslash',
29
+ '^' => 'caret',
30
+ '_' => 'underscore',
31
+ '`' => 'tick',
32
+ '|' => 'pipe',
33
+ '~' => 'tilde',
34
+ '@' => 'at',
35
+ '[' => 'bracket',
36
+ ']' => 'bracket',
37
+ '{' => 'brace',
38
+ '}' => 'brace',
39
+ '(' => 'parenthesis',
40
+ ')' => 'parenthesis',
41
+ '<' => 'tag',
42
+ '>' => 'tag'
43
+ }}
44
+ }