treat 1.2.0 → 2.0.0rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (217) hide show
  1. data/LICENSE +2 -2
  2. data/README.md +12 -21
  3. data/lib/treat/autoload.rb +44 -0
  4. data/lib/treat/config/config.rb +38 -0
  5. data/lib/treat/config/configurable.rb +51 -0
  6. data/lib/treat/config/data/config.rb +50 -0
  7. data/lib/treat/config/data/core.rb +52 -0
  8. data/lib/treat/config/data/databases.rb +10 -0
  9. data/lib/treat/config/data/entities.rb +15 -0
  10. data/lib/treat/config/data/languages/agnostic.rb +31 -0
  11. data/lib/treat/config/{languages → data/languages}/arabic.rb +0 -0
  12. data/lib/treat/config/{languages → data/languages}/chinese.rb +0 -0
  13. data/lib/treat/config/{languages → data/languages}/dutch.rb +1 -1
  14. data/lib/treat/config/data/languages/english.rb +95 -0
  15. data/lib/treat/config/data/languages/french.rb +148 -0
  16. data/lib/treat/config/data/languages/german.rb +135 -0
  17. data/lib/treat/config/{languages → data/languages}/greek.rb +1 -1
  18. data/lib/treat/config/data/languages/italian.rb +162 -0
  19. data/lib/treat/config/data/languages/polish.rb +11 -0
  20. data/lib/treat/config/{languages → data/languages}/portuguese.rb +1 -1
  21. data/lib/treat/config/{languages → data/languages}/russian.rb +1 -1
  22. data/lib/treat/config/data/languages/spanish.rb +291 -0
  23. data/lib/treat/config/data/languages/swedish.rb +289 -0
  24. data/lib/treat/config/data/libraries.rb +12 -0
  25. data/lib/treat/config/data/linguistics.rb +44 -0
  26. data/lib/treat/config/data/tags.rb +328 -0
  27. data/lib/treat/config/{workers → data/workers}/extractors.rb +2 -10
  28. data/lib/treat/config/{workers → data/workers}/formatters.rb +0 -0
  29. data/lib/treat/config/{workers → data/workers}/inflectors.rb +0 -0
  30. data/lib/treat/config/{workers → data/workers}/learners.rb +0 -0
  31. data/lib/treat/config/{workers → data/workers}/lexicalizers.rb +4 -3
  32. data/lib/treat/config/{workers → data/workers}/processors.rb +3 -3
  33. data/lib/treat/config/{workers → data/workers}/retrievers.rb +0 -0
  34. data/lib/treat/config/importable.rb +31 -0
  35. data/lib/treat/config/paths.rb +23 -0
  36. data/lib/treat/config/tags.rb +37 -0
  37. data/lib/treat/core/dsl.rb +55 -0
  38. data/lib/treat/{installer.rb → core/installer.rb} +10 -12
  39. data/lib/treat/core/server.rb +40 -0
  40. data/lib/treat/entities/entities.rb +101 -0
  41. data/lib/treat/entities/{abilities/doable.rb → entity/applicable.rb} +5 -3
  42. data/lib/treat/entities/{abilities → entity}/buildable.rb +118 -63
  43. data/lib/treat/entities/{abilities → entity}/checkable.rb +2 -2
  44. data/lib/treat/entities/{abilities → entity}/comparable.rb +6 -6
  45. data/lib/treat/entities/{abilities → entity}/countable.rb +2 -1
  46. data/lib/treat/entities/entity/debuggable.rb +86 -0
  47. data/lib/treat/entities/{abilities → entity}/delegatable.rb +16 -26
  48. data/lib/treat/entities/{abilities → entity}/exportable.rb +2 -2
  49. data/lib/treat/entities/{abilities → entity}/iterable.rb +4 -16
  50. data/lib/treat/entities/{abilities → entity}/magical.rb +22 -17
  51. data/lib/treat/entities/entity/registrable.rb +36 -0
  52. data/lib/treat/entities/{abilities → entity}/stringable.rb +18 -15
  53. data/lib/treat/entities/entity.rb +86 -77
  54. data/lib/treat/exception.rb +3 -0
  55. data/lib/treat/helpers/hash.rb +29 -0
  56. data/lib/treat/helpers/help.rb +35 -0
  57. data/lib/treat/helpers/object.rb +55 -0
  58. data/lib/treat/helpers/string.rb +124 -0
  59. data/lib/treat/{core → learning}/data_set.rb +11 -11
  60. data/lib/treat/{core → learning}/export.rb +3 -3
  61. data/lib/treat/{core → learning}/problem.rb +26 -16
  62. data/lib/treat/{core → learning}/question.rb +5 -9
  63. data/lib/treat/loaders/linguistics.rb +8 -9
  64. data/lib/treat/loaders/stanford.rb +5 -11
  65. data/lib/treat/modules.rb +33 -0
  66. data/lib/treat/proxies/array.rb +27 -0
  67. data/lib/treat/proxies/language.rb +47 -0
  68. data/lib/treat/proxies/number.rb +18 -0
  69. data/lib/treat/proxies/proxy.rb +25 -0
  70. data/lib/treat/proxies/string.rb +18 -0
  71. data/lib/treat/version.rb +10 -1
  72. data/lib/treat/{workers.rb → workers/categorizable.rb} +18 -19
  73. data/lib/treat/workers/extractors/keywords/tf_idf.rb +11 -11
  74. data/lib/treat/workers/extractors/language/what_language.rb +8 -6
  75. data/lib/treat/workers/extractors/name_tag/stanford.rb +10 -4
  76. data/lib/treat/workers/extractors/similarity/levenshtein.rb +36 -0
  77. data/lib/treat/workers/extractors/similarity/tf_idf.rb +27 -0
  78. data/lib/treat/workers/extractors/tf_idf/native.rb +4 -4
  79. data/lib/treat/workers/extractors/time/chronic.rb +2 -4
  80. data/lib/treat/workers/extractors/time/nickel.rb +19 -20
  81. data/lib/treat/workers/extractors/time/ruby.rb +2 -1
  82. data/lib/treat/workers/extractors/topic_words/lda.rb +12 -12
  83. data/lib/treat/workers/extractors/topics/reuters.rb +9 -13
  84. data/lib/treat/workers/formatters/readers/autoselect.rb +1 -1
  85. data/lib/treat/workers/formatters/readers/image.rb +19 -9
  86. data/lib/treat/workers/formatters/readers/odt.rb +2 -1
  87. data/lib/treat/workers/formatters/readers/pdf.rb +20 -3
  88. data/lib/treat/workers/formatters/readers/xml.rb +0 -1
  89. data/lib/treat/workers/formatters/serializers/mongo.rb +10 -20
  90. data/lib/treat/workers/formatters/serializers/xml.rb +17 -26
  91. data/lib/treat/workers/formatters/serializers/yaml.rb +5 -4
  92. data/lib/treat/workers/formatters/unserializers/mongo.rb +4 -4
  93. data/lib/treat/workers/formatters/unserializers/xml.rb +3 -4
  94. data/lib/treat/workers/formatters/unserializers/yaml.rb +3 -4
  95. data/lib/treat/workers/formatters/visualizers/dot.rb +1 -0
  96. data/lib/treat/workers/formatters/visualizers/standoff.rb +2 -3
  97. data/lib/treat/workers/formatters/visualizers/tree.rb +2 -3
  98. data/lib/treat/workers/{group.rb → groupable.rb} +9 -9
  99. data/lib/treat/workers/inflectors/cardinalizers/linguistics.rb +1 -3
  100. data/lib/treat/workers/inflectors/conjugators/linguistics.rb +5 -7
  101. data/lib/treat/workers/inflectors/declensors/english.rb +13 -20
  102. data/lib/treat/workers/inflectors/declensors/linguistics.rb +29 -28
  103. data/lib/treat/workers/inflectors/ordinalizers/linguistics.rb +0 -2
  104. data/lib/treat/workers/inflectors/stemmers/porter.rb +8 -10
  105. data/lib/treat/workers/inflectors/stemmers/porter_c.rb +7 -7
  106. data/lib/treat/workers/inflectors/stemmers/uea.rb +3 -8
  107. data/lib/treat/workers/learners/classifiers/id3.rb +17 -14
  108. data/lib/treat/workers/learners/classifiers/linear.rb +15 -27
  109. data/lib/treat/workers/learners/classifiers/mlp.rb +32 -19
  110. data/lib/treat/workers/learners/classifiers/svm.rb +28 -21
  111. data/lib/treat/workers/lexicalizers/categorizers/from_tag.rb +19 -3
  112. data/lib/treat/workers/lexicalizers/sensers/wordnet.rb +15 -7
  113. data/lib/treat/workers/lexicalizers/taggers/brill/patch.rb +4 -1
  114. data/lib/treat/workers/lexicalizers/taggers/brill.rb +8 -19
  115. data/lib/treat/workers/lexicalizers/taggers/lingua.rb +4 -15
  116. data/lib/treat/workers/lexicalizers/taggers/stanford.rb +22 -13
  117. data/lib/treat/workers/processors/chunkers/autoselect.rb +2 -3
  118. data/lib/treat/workers/processors/chunkers/html.rb +1 -6
  119. data/lib/treat/workers/processors/parsers/enju.rb +2 -4
  120. data/lib/treat/workers/processors/parsers/stanford.rb +13 -7
  121. data/lib/treat/workers/processors/segmenters/punkt.rb +25 -11
  122. data/lib/treat/workers/processors/segmenters/scalpel.rb +20 -0
  123. data/lib/treat/workers/processors/segmenters/srx.rb +42 -0
  124. data/lib/treat/workers/processors/segmenters/stanford.rb +5 -5
  125. data/lib/treat/workers/processors/segmenters/tactful.rb +21 -11
  126. data/lib/treat/workers/processors/tokenizers/ptb.rb +40 -30
  127. data/lib/treat/workers/processors/tokenizers/punkt.rb +14 -19
  128. data/lib/treat/workers/processors/tokenizers/stanford.rb +38 -22
  129. data/lib/treat/workers/retrievers/indexers/ferret.rb +6 -3
  130. data/lib/treat/workers/retrievers/searchers/ferret.rb +2 -2
  131. data/lib/treat/workers/workers.rb +6 -0
  132. data/lib/treat.rb +18 -32
  133. data/models/MANIFEST +1 -0
  134. data/spec/core/data_set.rb +174 -0
  135. data/spec/core/export.rb +52 -0
  136. data/spec/core/problem.rb +144 -0
  137. data/spec/core/question.rb +52 -0
  138. data/spec/{collection.rb → entities/collection.rb} +20 -35
  139. data/spec/{document.rb → entities/document.rb} +3 -54
  140. data/spec/{entity.rb → entities/entity.rb} +10 -9
  141. data/spec/entities/phrase.rb +33 -0
  142. data/spec/{token.rb → entities/token.rb} +0 -57
  143. data/spec/entities/word.rb +3 -0
  144. data/spec/{zone.rb → entities/zone.rb} +0 -26
  145. data/spec/helper.rb +116 -32
  146. data/spec/sandbox.rb +258 -25
  147. data/spec/treat.rb +26 -34
  148. data/spec/workers/agnostic.rb +137 -0
  149. data/spec/workers/english.rb +194 -0
  150. data/spec/workers/examples/english/economist/hungarys_troubles.txt +46 -0
  151. data/spec/workers/examples/english/economist/saving_the_euro.odt +0 -0
  152. data/spec/{samples → workers/examples/english}/mathematicians/archimedes.abw +0 -0
  153. data/spec/{samples → workers/examples/english}/mathematicians/euler.html +0 -0
  154. data/spec/{samples → workers/examples/english}/mathematicians/gauss.pdf +0 -0
  155. data/spec/{samples → workers/examples/english}/mathematicians/leibniz.txt +0 -0
  156. data/spec/{samples → workers/examples/english}/mathematicians/newton.doc +0 -0
  157. data/spec/workers/examples/english/phrase.xml +5 -0
  158. data/spec/workers/examples/english/test.txt +1 -0
  159. data/spec/workers/language.rb +280 -0
  160. data/spec/workers.rb +28 -0
  161. metadata +122 -105
  162. data/lib/treat/config/core/acronyms.rb +0 -5
  163. data/lib/treat/config/core/encodings.rb +0 -8
  164. data/lib/treat/config/core/entities.rb +0 -2
  165. data/lib/treat/config/core/language.rb +0 -3
  166. data/lib/treat/config/core/paths.rb +0 -8
  167. data/lib/treat/config/core/syntax.rb +0 -1
  168. data/lib/treat/config/core/verbosity.rb +0 -1
  169. data/lib/treat/config/databases/default.rb +0 -1
  170. data/lib/treat/config/databases/mongo.rb +0 -1
  171. data/lib/treat/config/languages/agnostic.rb +0 -34
  172. data/lib/treat/config/languages/english.rb +0 -60
  173. data/lib/treat/config/languages/french.rb +0 -18
  174. data/lib/treat/config/languages/german.rb +0 -18
  175. data/lib/treat/config/languages/italian.rb +0 -12
  176. data/lib/treat/config/languages/polish.rb +0 -12
  177. data/lib/treat/config/languages/spanish.rb +0 -12
  178. data/lib/treat/config/languages/swedish.rb +0 -12
  179. data/lib/treat/config/libraries/punkt.rb +0 -1
  180. data/lib/treat/config/libraries/reuters.rb +0 -1
  181. data/lib/treat/config/libraries/stanford.rb +0 -1
  182. data/lib/treat/config/linguistics/categories.rb +0 -4
  183. data/lib/treat/config/linguistics/punctuation.rb +0 -33
  184. data/lib/treat/config/tags/aligned.rb +0 -221
  185. data/lib/treat/config/tags/enju.rb +0 -71
  186. data/lib/treat/config/tags/paris7.rb +0 -17
  187. data/lib/treat/config/tags/ptb.rb +0 -15
  188. data/lib/treat/config/workers/list.rb +0 -1
  189. data/lib/treat/config.rb +0 -135
  190. data/lib/treat/core.rb +0 -5
  191. data/lib/treat/entities/abilities/copyable.rb +0 -47
  192. data/lib/treat/entities/abilities/debuggable.rb +0 -83
  193. data/lib/treat/entities/abilities/registrable.rb +0 -46
  194. data/lib/treat/entities/collection.rb +0 -40
  195. data/lib/treat/entities/document.rb +0 -10
  196. data/lib/treat/entities/group.rb +0 -18
  197. data/lib/treat/entities/section.rb +0 -13
  198. data/lib/treat/entities/token.rb +0 -47
  199. data/lib/treat/entities/zone.rb +0 -12
  200. data/lib/treat/entities.rb +0 -6
  201. data/lib/treat/helpers/didyoumean.rb +0 -57
  202. data/lib/treat/helpers/escaping.rb +0 -15
  203. data/lib/treat/helpers/formatting.rb +0 -41
  204. data/lib/treat/helpers/objtohash.rb +0 -8
  205. data/lib/treat/helpers/platform.rb +0 -15
  206. data/lib/treat/helpers/reflection.rb +0 -17
  207. data/lib/treat/helpers/temporary.rb +0 -27
  208. data/lib/treat/helpers/verbosity.rb +0 -19
  209. data/lib/treat/helpers.rb +0 -5
  210. data/lib/treat/loaders.rb +0 -10
  211. data/lib/treat/proxies.rb +0 -106
  212. data/lib/treat/workers/formatters/unserializers/autoselect.rb +0 -17
  213. data/lib/treat/workers/inflectors/declensors/active_support.rb +0 -31
  214. data/lib/treat/workers/processors/tokenizers/tactful.rb +0 -68
  215. data/spec/core.rb +0 -441
  216. data/spec/phrase.rb +0 -112
  217. data/spec/word.rb +0 -111
@@ -0,0 +1,194 @@
1
# Spec and benchmark scenarios for the English-language workers.
#
# Scenarios maps a worker method (e.g. :tokenize) to the entity types it is
# exercised on (e.g. :group, :word), and each entity type to a hash with:
#
# * :examples     - an array of [value, expectation] pairs (an optional third
#                   element carries per-example options), or a hash of such
#                   arrays keyed by preset (e.g. :singular, :plural).
# * :preprocessor - optional lambda called with the built entity before the
#                   worker is run.
# * :generator    - optional lambda that turns the worker result (or the
#                   entity) into the value compared with the expectation.
#
# NOTE(review): :postprocessor is declared for :language below, but the
# visible part of Language.run_examples only reads :examples, :generator and
# :preprocessor -- confirm that it is actually consumed somewhere.
class Treat::Specs::Workers::English < Treat::Specs::Workers::Language

  # TODO: parse

  Scenarios = {
    tokenize: {
      group: {
        examples: [
          ["Julius Obsequens was a Roman writer who is believed to have lived in the middle of the fourth century AD.", ["Julius", "Obsequens", "was", "a", "Roman", "writer", "who", "is", "believed", "to", "have", "lived", "in", "the", "middle", "of", "the", "fourth", "century", "AD", "."]],
          ["The only work associated with his name is the Liber de prodigiis (Book of Prodigies), completely extracted from an epitome, or abridgment, written by Livy; De prodigiis was constructed as an account of the wonders and portents that occurred in Rome between 249 BC-12 BC.", ["The", "only", "work", "associated", "with", "his", "name", "is", "the", "Liber", "de", "prodigiis", "(", "Book", "of", "Prodigies", ")", ",", "completely", "extracted", "from", "an", "epitome", ",", "or", "abridgment", ",", "written", "by", "Livy", ";", "De", "prodigiis", "was", "constructed", "as", "an", "account", "of", "the", "wonders", "and", "portents", "that", "occurred", "in", "Rome", "between", "249", "BC-12", "BC", "."]],
          ["Of great importance was the edition by the Basle Humanist Conrad Lycosthenes (1552), trying to reconstruct lost parts and illustrating the text with wood-cuts.", ["Of", "great", "importance", "was", "the", "edition", "by", "the", "Basle", "Humanist", "Conrad", "Lycosthenes", "(", "1552", ")", ",", "trying", "to", "reconstruct", "lost", "parts", "and", "illustrating", "the", "text", "with", "wood-cuts", "."]],
          ["These have been interpreted as reports of unidentified flying objects (UFOs), but may just as well describe meteors, and, since Obsequens, probably, writes in the 4th century, that is, some 400 years after the events he describes, they hardly qualify as eye-witness accounts.", ["These", "have", "been", "interpreted", "as", "reports", "of", "unidentified", "flying", "objects", "(", "UFOs", ")", ",", "but", "may", "just", "as", "well", "describe", "meteors", ",", "and", ",", "since", "Obsequens", ",", "probably", ",", "writes", "in", "the", "4th", "century", ",", "that", "is", ",", "some", "400", "years", "after", "the", "events", "he", "describes", ",", "they", "hardly", "qualify", "as", "eye-witness", "accounts", "."]],
          ['"At Aenariae, while Livius Troso was promulgating the laws at the beginning of the Italian war, at sunrise, there came a terrific noise in the sky, and a globe of fire appeared burning in the north.', ["\"", "At", "Aenariae", ",", "while", "Livius", "Troso", "was", "promulgating", "the", "laws", "at", "the", "beginning", "of", "the", "Italian", "war", ",", "at", "sunrise", ",", "there", "came", "a", "terrific", "noise", "in", "the", "sky", ",", "and", "a", "globe", "of", "fire", "appeared", "burning", "in", "the", "north", "."]]
        ],
        generator: lambda { |entity| entity.tokens.map { |tok| tok.to_s } }
      }
    },
    parse: {
      group: {
        examples: [
          ["A sentence to tokenize.", ["A sentence to tokenize.", "A sentence", "to tokenize",
          "tokenize"]]
        ],
        generator: lambda { |group| group.phrases.map { |phrase| phrase.to_s } }
      }
    },
    segment: {
      zone: {
        examples: [
          ["Qala is first referred to in a fifteenth century portolan preserved at the Vatican library has taken its name from the qala or port of Mondoq ir-Rummien. It is the easternmost village of Gozo and has been inhabited since early times. The development of the present settlement began in the second half of the seventeenth century. It is a pleasant and rural place with many natural and historic attractions.", ["Qala is first referred to in a fifteenth century portolan preserved at the Vatican library has taken its name from the qala or port of Mondoq ir-Rummien.", "It is the easternmost village of Gozo and has been inhabited since early times.", "The development of the present settlement began in the second half of the seventeenth century.", "It is a pleasant and rural place with many natural and historic attractions."]],
          # The input is a single literal: two adjacent string literals would be
          # concatenated without a separator ("...radio.People..."), gluing the
          # second and third expected sentences together.
          ["Originally Radio Lehen il-Qala transmitted on frequency 106.5FM. But when consequently a national radio started transmissions on a frequency quite close, it caused a hindrance to our community radio. People were complaining that the voice of the local radio was no longer clear and they were experiencing difficulty in following the programmes. This was a further proof of the value of the radio. It was a confirmation that it was a good and modern means of bringing the Christian message to the whole community. An official request was therefore made to the Broadcasting Authority and Radio Lehen il-Qala was given a new frequency - 106.3FM.", ["Originally Radio Lehen il-Qala transmitted on frequency 106.5FM.", "But when consequently a national radio started transmissions on a frequency quite close, it caused a hindrance to our community radio.", "People were complaining that the voice of the local radio was no longer clear and they were experiencing difficulty in following the programmes.", "This was a further proof of the value of the radio.", "It was a confirmation that it was a good and modern means of bringing the Christian message to the whole community.", "An official request was therefore made to the Broadcasting Authority and Radio Lehen il-Qala was given a new frequency - 106.3FM."]]
        ],
        generator: lambda { |entity| entity.sentences.map { |sent| sent.to_s } }
      }
    },
    tag: {
      phrase: {
        examples: [
          ["I was running", "P"]
        ]
      },
      token: {
        examples: [
          ["running", "VBG"],
          ["man", "NN"],
          ["2", "CD"],
          [".", "."],
          ["$", "$"]
        ]
      }
    },
    category: {
      phrase: {
        examples: [
          ["I was running", "phrase"]
        ]
      },
      token: {
        examples: [
          ["running", "verb"]
        ]
      }
    },
    ordinal: {
      word: {
        examples: [
          ["20", "twentieth"]
        ]
      },
      number: {
        examples: [
          [20, "twentieth"]
        ]
      }
    },
    cardinal: {
      word: {
        examples: [
          ['20', "twenty"]
        ]
      },
      number: {
        examples: [
          [20, "twenty"]
        ]
      }
    },
    name_tag: {
      group: {
        examples: [
          ["Obama and Sarkozy will meet in Berlin.", ["person", nil, "person", nil, nil, nil, "location"]]
        ],
        preprocessor: lambda { |group| group.tokenize },
        generator: lambda { |group| group.words.map { |word| word.get(:name_tag) } }
      }
    },
    language: { ######
      entity: {
        examples: [
          ["Obama and Sarkozy will meet in Berlin.", "english"]
        ],
        preprocessor: lambda { |entity| Treat.core.language.detect = true; entity.do(:tokenize); entity },
        postprocessor: lambda { |entity| Treat.core.language.detect = false; entity; },
        # NOTE(review): this generator is identical to the :name_tag one and
        # yields an array of name tags, while the expectation above is the
        # string "english" -- looks like a copy-paste leftover; confirm.
        generator: lambda { |group| group.words.map { |word| word.get(:name_tag) } }
      }
    },
    stem: {
      word: {
        examples: [
          ["running", "run"]
        ]
      }
    },
    time: {
      group: {
        examples: [
          ['october 2006', 10]
        ],
        generator: lambda { |entity| entity.time.month }
      }
    },
    topics: {
      document: {
        examples: [
          ["./spec/workers/examples/english/test.txt",
            ['household goods and hardware',
             'united states of america',
             'corporate/industrial']]
        ],
        preprocessor: lambda { |doc| doc.do :chunk, :segment, :tokenize }
      },
      section: {
        # Must implement
      },
      zone: {
        examples: [
          ["Michigan, Ohio, Texas - Unfortunately, the RadioShack is closing. This is horrible news for U.S. politics.", ['household goods and hardware', 'united states of america', 'corporate/industrial']]
        ],
        preprocessor: lambda { |zone| zone.do :segment, :tokenize }
      }
    },
    topic_words: {
      collection: {
        examples: [
          # NOTE(review): the other example files referenced here live under
          # ./spec/workers/examples -- confirm that this ./perf path exists.
          ["./perf/examples/economist", [""]]
        ],
        preprocessor: lambda { |coll| coll.do :chunk, :segment, :tokenize }
      }
    },
    conjugate: {
      word: {
        examples: {
          present_participle: [
            ["run", "running"]
          ],
          infinitive: [
            ["running", "run"]
          ]
        }
      }
    },
    declense: {
      word: {
        examples: {
          singular: [
            ["men", "man"]
          ],
          plural: [
            ["man", "men"]
          ]
        }
      }
    },
    sense: {
      word: {
        examples: {
          synonyms: [
            ["throw", ["throw", "shed", "cast", "cast off", "shake off", "throw off", "throw away", "drop", "thrust", "give", "flip", "switch", "project", "contrive", "bewilder", "bemuse", "discombobulate", "hurl", "hold", "have", "make", "confuse", "fox", "befuddle", "fuddle", "bedevil", "confound"]]
          ],
          antonyms: [
            ["weak", ["strong"]]
          ],
          hypernyms: [
            ["table", ["array", "furniture", "piece of furniture", "article of furniture", "tableland", "plateau", "gathering", "assemblage", "fare"]]
          ],
          hyponyms: [
            ["furniture", ["baby bed", "baby's bed", "bedroom furniture", "bedstead", "bedframe", "bookcase", "buffet", "counter", "sideboard", "cabinet", "chest of drawers", "chest", "bureau", "dresser", "dining-room furniture", "etagere", "fitment", "hallstand", "lamp", "lawn furniture", "nest", "office furniture", "seat", "sectional", "Sheraton", "sleeper", "table", "wall unit", "wardrobe", "closet", "press", "washstand", "wash-hand stand"]]
          ]
        }
      }
    },
  }

end
@@ -0,0 +1,46 @@
1
+ Hungary's troubles
2
+ Not just a rap on the knuckles
3
+
4
+ THE pressure is piling up on the beleaguered Hungarian government. Today the European Commission threatened it with legal action over several new "cardinal" laws that would require a two-thirds majority in parliament to overturn.
5
+
6
+ The commission is still considering the laws, but today it highlighted concerns over three issues:
7
+
8
+ - The independence of the central bank. Late last year the Hungarian parliament passed a law which expands the monetary council and takes the power to nominate deputies away from the governor and hands it to the prime minister. A separate law opens the door to a merger between the bank and the financial regulator.
9
+
10
+ - The judiciary. More than 200 judges over the age of 62 have been forced into retirement and hundreds more face the sack. The new National Judicial Authority is headed by Tünde Handó, a friend of the family of Viktor Orban, the prime minister.
11
+
12
+ - The independence of the national data authority.
13
+
14
+ That wasn't all the commission had to say today. Hungary also received a ticking-off from Olli Rehn (pictured), the economic-affairs commissioner, for not doing enough to tackle its budget deficit. It may now lose access to EU funds.
15
+
16
+ Slammed in Brussels, the Hungarian government is also under pressure at home. Earlier this week Gordon Bajnai, who served as Socialist prime minister from 2009-10, fired off a broadside that sent shockwaves through the political and media establishments.
17
+
18
+ After a year and a half of government by the right-wing Fidesz party, wrote Mr Bajnai in a lengthy article on the website of the Patriotism and Progress Public Policy Foundation, democracy has been destroyed in Hungary. The country, he warned, is scarred by division and is drifting towards bankruptcy and away from Europe.
19
+
20
+ Mr Bajnai called for a radical change of government and a complete political re-orientation. “A new government must have a programme readily at hand that can be applied without delay: a programme that promotes the republic, reconciliation, and recovery.”
21
+
22
+ Fidesz is rattled by Mr Bajnai, who since leaving office has been teaching at Columbia University in New York. Understandably so. He headed a technocratic administration which stabilised the economy. Unlike his Socialist predecessor, Ferenc Gyurcsany, he was neither part of the old Communist elite nor connected to it by marriage, and so cannot be smeared as a "Komcsi". He is modern in outlook and well regarded internationally.
23
+
24
+ Moreover, say those who know him, Mr Bajnai has little patience for the narcissistic exceptionalism that shapes Fidesz’s worldview. Exhibit A: the plaintive cry of Janos Martonyi, the foreign minister, who lamented recently: “The world will never understand our pains and spiritual wounds.” Such self-pity is unlikely to endear the Hungarian government to Brussels or Washington DC (to where it has sent an envoy this week to negotiate with the IMF).
25
+
26
+ Fidesz won a two-thirds majority in 2010. But its support is evaporating, and analysts say there is a gap in the political market for a centrist pro-business party committed to democratic norms. Mr Bajnai, who has not ruled out a return to politics, would be an obvious candidate to lead it.
27
+
28
+ Meanwhile, as Hungarians watch the value of their assets vaporise, in large part thanks to the government’s increasingly erratic policies, Mr Orban smirks his way through press conferences. Here he is dodging questions from a reporter from HVG, an economics weekly, about his responsibility for the crisis and trying to shift the blame to his old enemy Andras Simor, president of the central bank. The interview ran as follows:
29
+
30
+ hvg.hu: Do you feel responsible for the falling/weakening forint?
31
+
32
+ Mr Orban: You mean the president of the central bank? He did not comment on it.
33
+
34
+ hvg.hu: No, you, Mr prime minister!
35
+
36
+ Mr Orban: The personal responsibility of the president of the central bank was not discussed over the meeting.
37
+
38
+ hvg.hu: You, your personal…!
39
+
40
+ Mr Orban: That neither.
41
+
42
+ Surrounded by yes-men and grinning flunkies, Mr Orban seems increasingly out of touch. His future will likely be decided not in the gilded corridors of the Hungarian parliament, but in Brussels and Washington DC.
43
+
44
+ What happens next? If his hand is forced Mr Orban can probably endure policy reversals on the independence of the central bank and the data ombudsman. Sorry, he would say to his loyal followers: national crisis, what can you do.
45
+
46
+ The dismantling of the judiciary would be another matter. If outsiders keep up the pressure and the judicial changes are judged to be in breach of the EU treaty, Mr Orban would be in a tricky spot. It’s hard to see how he could declare the 200-plus judges his government has forced into retirement ready for office after all, and still sit in his own.
@@ -0,0 +1,5 @@
1
+ <?xml version="1.0" encoding="us-ascii" ?>
2
+ <treat>
3
+ <sentence id='70233694858140'>
4
+ A test entity.</sentence>
5
+ </treat>
@@ -0,0 +1 @@
1
+ A Rough Day for Republicans\n Michigan, Ohio, Texas - Unfortunately, the RadioShack is closing. This is horrible news for U.S. politics.
@@ -0,0 +1,280 @@
1
+ module Treat::Specs::Workers
2
+
3
+ class Language
4
+
5
+ include Treat::Core::DSL
6
+
7
+ @@list = []
8
+
9
# Column headings of the benchmark results table; each row produced by
# run_scenarios_as_benchmarks supplies one value per heading, in this
# order. Frozen so the shared constant cannot be mutated by accident.
BenchmarkHeadings = [
  'Method', 'Worker', 'Description',
  'Reference', 'User time', 'System time',
  'Real time', 'Accuracy'
].freeze
14
+
15
# Subclassing hook: records every subclass in the shared language list
# and gives it an initializer taking the run mode.
#
# The generated initializer stores the subclass's Scenarios constant,
# the mode, and the downcased result of the class-level +mn+ helper
# (presumably the unqualified class name -- defined elsewhere).
def self.inherited(base)
  @@list << base
  base.class_eval do
    def initialize(mode)
      @scenarios = self.class.const_get(:Scenarios)
      @mode = mode
      @language = self.class.mn.downcase
    end
  end
end
27
+
28
# Returns the language spec classes recorded so far by the
# inherited hook (the shared array itself, not a copy).
def self.list
  @@list
end
30
+
31
# Default options for #run; caller-supplied options are merged over
# these. Frozen because it is only ever read (via #merge), so accidental
# mutation of the shared constant is an error.
DefaultOptions = { save_html: true }.freeze
33
+
34
# Runs every scenario for this language and, in 'benchmark' mode,
# reports the collected rows.
#
# options - Hash merged over DefaultOptions; :save_html controls whether
#           the HTML table helper is called after the text table.
#
# In any other mode the scenarios are still run but nothing is printed.
def run(options = {})
  options = DefaultOptions.merge(options)
  results = run_scenarios
  return unless @mode == 'benchmark'

  print "\n\nBenchmark for #{@language.capitalize}\n"
  Treat::Specs::Helper.text_table(BenchmarkHeadings, results)
  Treat::Specs::Helper.html_table(BenchmarkHeadings, results) if options[:save_html]
end
49
+
50
# Runs the scenarios of every worker configured for this language.
#
# Walks Treat.languages[@language].workers category by category (e.g.
# processors, extractors) and group by group, resolves the group class
# under Treat::Workers from the +cc+ form of both names (presumably a
# camel-casing helper -- defined elsewhere), and dispatches each worker
# to run_scenarios_as_<mode>s.
#
# Returns an array with one entry per worker that was run.
def run_scenarios
  categories = Treat.languages[@language].workers
  results = []
  runner = "run_scenarios_as_#{@mode}s"
  # FIXME: these workers are currently excluded from the run.
  skipped = [:mongo, :html, :lda]
  categories.members.each do |category_name|
    category = categories[category_name]
    category.members.each do |group_name|
      group_class = Treat::Workers.
        const_get(category_name.cc).
        const_get(group_name.cc)
      category[group_name].each do |worker|
        next if skipped.include?(worker)
        results << send(runner, worker, group_class)
      end
    end
  end
  results
end
77
+
78
# Benchmarks a single worker over all of its group's targets.
#
# Times the run with Benchmark.measure and returns one table row, in
# BenchmarkHeadings order: method, worker, description, reference
# ('-' when there is none), user/system/real time rounded to four
# decimals (as strings), and the accuracy reported by
# run_scenarios_for_all_workers.
def run_scenarios_as_benchmarks(worker, group)
  info = get_worker_info(worker, group)
  description = info[:description]
  reference = info[:reference]
  accuracy = 0
  timing = ::Benchmark.measure do
    accuracy = run_scenarios_for_all_workers(
      worker, group, 'benchmark')
  end
  [
    group.method.to_s,
    worker.to_s,
    description.strip,
    reference || '-',
    timing.utime.round(4).to_s,
    timing.stime.round(4).to_s,
    timing.real.round(4).to_s,
    accuracy
  ]
end
97
+
98
# Runs the worker's examples as specs on each of the group's target
# entities, by delegating to run_scenarios_for_all_workers in 'spec'
# mode. Returns whatever that call returns.
def run_scenarios_as_specs(worker, group)
  mode = 'spec'
  run_scenarios_for_all_workers(worker, group, mode)
end
103
+
104
# Runs a worker's scenarios (as specs or as benchmarks, depending on
# +mode+) on every target entity type of its group, and returns the
# worker's accuracy.
#
# worker - the worker being exercised.
# group  - the worker group; must respond to #targets.
# mode   - 'spec' or 'benchmark'; selects run_worker_<mode>s, which must
#          return a [successes, tries] pair.
#
# Returns the percentage of successful tries, rounded to two decimals.
# Returns 0.0 when nothing was tried (no runnable target, or only
# [0, 0] pairs) instead of the NaN that 0.0 / 0.0 would produce.
def run_scenarios_for_all_workers(worker, group, mode)
  runner = "run_worker_#{mode}s"
  successes = 0
  tries = 0
  group.targets.each do |target|
    next if target == :section # FIXME: section targets are not run yet.
    passed, attempted = send(runner, worker, group, target)
    successes += passed
    tries += attempted
  end
  return 0.0 if tries.zero?
  (successes.to_f / tries.to_f * 100).round(2)
end
119
+
120
# Runs, as benchmarks, every example available for the worker on the
# given target entity type.
#
# Returns [successes, tries]; [0, 1] when find_scenario reports no
# scenario for this method/target. Scenarios whose :examples is a Hash
# go through run_scenario_presets, all others through
# Language.run_examples.
def run_worker_benchmarks(worker, group, target)
  return [0, 1] unless find_scenario(group.method, target)
  scenario = @scenarios[group.method][target]
  successes, tries =
    if scenario[:examples].is_a?(Hash)
      run_scenario_presets(worker, group, target, scenario)
    else
      Treat::Specs::Workers::Language.
        run_examples(worker, group, target, scenario)
    end
  [successes, tries]
end
136
+
137
+
138
# Builds and registers an RSpec example group that runs every example
# available for the worker on the given target entity type.
#
# When the scenario's :examples is a Hash and the group has a preset
# option, one example is defined per preset; otherwise a single example
# covers the whole scenario. Each example expects 100% accuracy.
#
# Returns [successes, tries]; [0, 1] when no scenario is found. The
# counters are only updated inside the example bodies, so the pair
# reflects whatever had executed by the time this method returns.
def run_worker_specs(worker, group, target)
  scenario = find_scenario(group.method, target)
  return [0, 1] unless scenario
  does = Treat::Specs::Workers::Descriptions[group.method]
  i = 0
  n = 0
  rspec_task = RSpec::Core::ExampleGroup.describe(group) do
    context "when it is called on a #{target}" do
      if scenario[:examples].is_a?(Hash) && group.preset_option
        scenario[:examples].each do |preset, examples|
          context "and #{group.preset_option} is set to #{preset}" do
            it(does[preset]) do
              options = { group.preset_option => preset }
              preset_scenario = scenario.dup
              preset_scenario[:examples] = examples
              passed, attempted = *Treat::Specs::Workers::Language.
                run_examples(worker, group, target, preset_scenario, options)
              # Check for accuracy.
              (passed.to_f / attempted.to_f * 100).round(2).should eql 100.0
              i += passed
              n += attempted
            end
          end
        end
      else
        it(does) do
          i, n = Treat::Specs::Workers::Language.
            run_examples(worker, group, target, scenario)
          # Check for accuracy.
          (i.to_f / n.to_f * 100).round(2).should eql 100.0
        end
      end
    end
  end
  rspec_task.register
  [i, n]
end
175
+
176
def self.run_examples(worker, group, target, scenario, options = {})
  # Runs every example of +scenario+ against +worker+ and counts how
  # many produce the expected result.
  #
  # scenario - Hash with :examples plus optional :generator and
  #            :preprocessor callables.
  # options  - options shared by all examples; each example may add
  #            its own as a third element (a Hash, or a Proc
  #            returning one).
  #
  # Returns [successes, tries], or [1, 1] when nothing was counted.
  # Raises Treat::Exception when the examples are keyed by worker
  # and none are defined for +worker+.
  i = 0
  n = 0
  examples = scenario[:examples]
  generator = scenario[:generator]
  preprocessor = scenario[:preprocessor]
  target_class = Treat::Entities.const_get(target.cc)
  if examples.is_a?(Hash)
    unless examples[worker]
      raise Treat::Exception,
        "No example defined for worker #{worker}."
    end
    examples = examples[worker]
  end
  examples.each do |example|
    value, expectation, example_options = *example
    entity = target_class.build(value)
    begin
      preprocessor.call(entity) if preprocessor
      if example_options.is_a?(::Proc)
        example_options = example_options.call
      end
      # Merge into a fresh local: reassigning the shared +options+
      # would leak this example's options into all later examples.
      call_options = options.merge(example_options || {})
      result = entity.send(group.method, worker, call_options)
      if generator
        # Computers hand their return value to the generator; other
        # group types hand over the entity itself.
        operand = (group.type == :computer ? result : entity)
        result = generator.call(operand)
      end
    rescue Treat::Exception => e
      # Examples that raise are reported and left out of the counts.
      puts e.message
      next
    end
    puts result.inspect
    i += 1 if result == expectation
    n += 1
  end
  (i == 0 && n == 0) ? [1, 1] : [i, n]
end
221
+
222
+ # * Helpers * #
223
+
224
+ # Given a method and a target,
225
+ # find a scenario for the current
226
+ # language class instance.
227
def find_scenario(method, target)
  # Looks up @scenarios[method][target]. Prints a warning and
  # returns nil when either the method has no scenario at all or
  # the scenario has no examples for +target+.
  scenarios_for_method = @scenarios[method]
  unless scenarios_for_method
    puts "Warning: there is no scenario for " +
      "method ##{method} called on " +
      "#{target.to_s.plural} in the " +
      "#{@language.capitalize} language."
    return nil
  end
  # This guard has to test the target entry; it used to repeat the
  # method check above, so its warning could never be printed.
  scenario = scenarios_for_method[target]
  unless scenario
    puts "Warning: there is a scenario for " +
      "method ##{method} in the " +
      "#{@language.capitalize} language, " +
      "but there are no examples for target " +
      "entity type '#{target.to_s.plural}'."
    return nil
  end
  scenario
end
245
+
246
+ # Parse out the description and reference from
247
+ # the Ruby file defining the worker/adapter.
248
def get_worker_info(worker, group)
  # Returns {description:, reference:} taken from the text that
  # precedes the first occurrence of 'class' in the worker's file.
  #
  # The file path is built from the group's '::'-separated name,
  # each segment converted with String#ucc, plus "<worker>.rb".
  segments = group.to_s.split('::').map { |segment| segment.ucc }
  relative_path = segments.join('/') + "/#{worker}.rb"
  source = File.read(Treat.paths.lib + relative_path)
  header = source[0...source.index('class')]
  # Drop comment markers and the encoding line.
  text = header.gsub("\n# ", "\n").gsub('#', '')
  text = text.gsub('encoding: utf-8', '')
  # Each of these patterns is multiline, so it removes everything
  # from its label through the end of the text.
  [/Authors: (.*)/m, /License: (.*)/m, /Website: (.*)/m].each do |pattern|
    text = text.gsub(pattern, '')
  end
  description, reference = text.split('Original paper: ')
  {description: description || '',
   reference: reference || '-'}
end
263
+
264
+ # Runs a benchmark for each preset.
265
def run_scenario_presets(worker, group, target, scenario)
  # scenario[:examples] maps each preset to its own examples. The
  # examples are run once per preset, with the group's preset option
  # set to that preset, and the [successes, tries] pairs are summed.
  successes = 0
  tries = 0
  scenario[:examples].each do |preset, preset_examples|
    options = {group.preset_option => preset}
    # Work on a copy so the caller's scenario keeps its full Hash.
    preset_scenario = scenario.dup
    preset_scenario[:examples] = preset_examples
    passed, attempted = Treat::Specs::Workers::Language.
      run_examples(worker, group, target, preset_scenario, options)
    successes += passed
    tries += attempted
  end
  [successes, tries]
end
277
+
278
+ end
279
+
280
+ end
data/spec/workers.rb ADDED
@@ -0,0 +1,28 @@
1
module Treat::Specs::Workers
  # Spec descriptions, keyed by the worker group's method name.
  # A value is either a single description or a Hash of
  # descriptions keyed by preset.
  Descriptions = {
    # Inflection
    stem: "returns the stem of the word",
    conjugate: {
      infinitive: "returns the infinitive form of a verb",
      present_participle: "returns the present participle form of a verb"
    },
    declense: {
      plural: "returns the plural form of the word",
      singular: "returns the singular form of the word"
    },
    ordinal: "returns the ordinal form of a number",

    # Lexical information
    sense: {
      synonyms: "returns the synonyms of the word",
      antonyms: "returns the antonyms of the word",
      hypernyms: "returns the hypernyms of the word",
      hyponyms: "returns the hyponyms of the word"
    },
    tag: "returns the tag of the token",
    category: "returns the category of the number, punctuation or symbol",

    # Annotation of groups of words
    name_tag: "tags the named entity words in the group of words",
    time: "annotates all entities within the group with time information",

    # Structure building
    tokenize: "splits the group of words into tokens and adds them as children of the group",
    parse: "parses a group of words into its syntax tree, adding nested phrases and tokens as children of the group",
    topics: "returns a list of general topics the document belongs to",
    segment: "splits a zone into phrases/sentences and adds them as children of the zone"
  }
end