treat 1.2.0 → 2.0.0rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (217) hide show
  1. data/LICENSE +2 -2
  2. data/README.md +12 -21
  3. data/lib/treat/autoload.rb +44 -0
  4. data/lib/treat/config/config.rb +38 -0
  5. data/lib/treat/config/configurable.rb +51 -0
  6. data/lib/treat/config/data/config.rb +50 -0
  7. data/lib/treat/config/data/core.rb +52 -0
  8. data/lib/treat/config/data/databases.rb +10 -0
  9. data/lib/treat/config/data/entities.rb +15 -0
  10. data/lib/treat/config/data/languages/agnostic.rb +31 -0
  11. data/lib/treat/config/{languages → data/languages}/arabic.rb +0 -0
  12. data/lib/treat/config/{languages → data/languages}/chinese.rb +0 -0
  13. data/lib/treat/config/{languages → data/languages}/dutch.rb +1 -1
  14. data/lib/treat/config/data/languages/english.rb +95 -0
  15. data/lib/treat/config/data/languages/french.rb +148 -0
  16. data/lib/treat/config/data/languages/german.rb +135 -0
  17. data/lib/treat/config/{languages → data/languages}/greek.rb +1 -1
  18. data/lib/treat/config/data/languages/italian.rb +162 -0
  19. data/lib/treat/config/data/languages/polish.rb +11 -0
  20. data/lib/treat/config/{languages → data/languages}/portuguese.rb +1 -1
  21. data/lib/treat/config/{languages → data/languages}/russian.rb +1 -1
  22. data/lib/treat/config/data/languages/spanish.rb +291 -0
  23. data/lib/treat/config/data/languages/swedish.rb +289 -0
  24. data/lib/treat/config/data/libraries.rb +12 -0
  25. data/lib/treat/config/data/linguistics.rb +44 -0
  26. data/lib/treat/config/data/tags.rb +328 -0
  27. data/lib/treat/config/{workers → data/workers}/extractors.rb +2 -10
  28. data/lib/treat/config/{workers → data/workers}/formatters.rb +0 -0
  29. data/lib/treat/config/{workers → data/workers}/inflectors.rb +0 -0
  30. data/lib/treat/config/{workers → data/workers}/learners.rb +0 -0
  31. data/lib/treat/config/{workers → data/workers}/lexicalizers.rb +4 -3
  32. data/lib/treat/config/{workers → data/workers}/processors.rb +3 -3
  33. data/lib/treat/config/{workers → data/workers}/retrievers.rb +0 -0
  34. data/lib/treat/config/importable.rb +31 -0
  35. data/lib/treat/config/paths.rb +23 -0
  36. data/lib/treat/config/tags.rb +37 -0
  37. data/lib/treat/core/dsl.rb +55 -0
  38. data/lib/treat/{installer.rb → core/installer.rb} +10 -12
  39. data/lib/treat/core/server.rb +40 -0
  40. data/lib/treat/entities/entities.rb +101 -0
  41. data/lib/treat/entities/{abilities/doable.rb → entity/applicable.rb} +5 -3
  42. data/lib/treat/entities/{abilities → entity}/buildable.rb +118 -63
  43. data/lib/treat/entities/{abilities → entity}/checkable.rb +2 -2
  44. data/lib/treat/entities/{abilities → entity}/comparable.rb +6 -6
  45. data/lib/treat/entities/{abilities → entity}/countable.rb +2 -1
  46. data/lib/treat/entities/entity/debuggable.rb +86 -0
  47. data/lib/treat/entities/{abilities → entity}/delegatable.rb +16 -26
  48. data/lib/treat/entities/{abilities → entity}/exportable.rb +2 -2
  49. data/lib/treat/entities/{abilities → entity}/iterable.rb +4 -16
  50. data/lib/treat/entities/{abilities → entity}/magical.rb +22 -17
  51. data/lib/treat/entities/entity/registrable.rb +36 -0
  52. data/lib/treat/entities/{abilities → entity}/stringable.rb +18 -15
  53. data/lib/treat/entities/entity.rb +86 -77
  54. data/lib/treat/exception.rb +3 -0
  55. data/lib/treat/helpers/hash.rb +29 -0
  56. data/lib/treat/helpers/help.rb +35 -0
  57. data/lib/treat/helpers/object.rb +55 -0
  58. data/lib/treat/helpers/string.rb +124 -0
  59. data/lib/treat/{core → learning}/data_set.rb +11 -11
  60. data/lib/treat/{core → learning}/export.rb +3 -3
  61. data/lib/treat/{core → learning}/problem.rb +26 -16
  62. data/lib/treat/{core → learning}/question.rb +5 -9
  63. data/lib/treat/loaders/linguistics.rb +8 -9
  64. data/lib/treat/loaders/stanford.rb +5 -11
  65. data/lib/treat/modules.rb +33 -0
  66. data/lib/treat/proxies/array.rb +27 -0
  67. data/lib/treat/proxies/language.rb +47 -0
  68. data/lib/treat/proxies/number.rb +18 -0
  69. data/lib/treat/proxies/proxy.rb +25 -0
  70. data/lib/treat/proxies/string.rb +18 -0
  71. data/lib/treat/version.rb +10 -1
  72. data/lib/treat/{workers.rb → workers/categorizable.rb} +18 -19
  73. data/lib/treat/workers/extractors/keywords/tf_idf.rb +11 -11
  74. data/lib/treat/workers/extractors/language/what_language.rb +8 -6
  75. data/lib/treat/workers/extractors/name_tag/stanford.rb +10 -4
  76. data/lib/treat/workers/extractors/similarity/levenshtein.rb +36 -0
  77. data/lib/treat/workers/extractors/similarity/tf_idf.rb +27 -0
  78. data/lib/treat/workers/extractors/tf_idf/native.rb +4 -4
  79. data/lib/treat/workers/extractors/time/chronic.rb +2 -4
  80. data/lib/treat/workers/extractors/time/nickel.rb +19 -20
  81. data/lib/treat/workers/extractors/time/ruby.rb +2 -1
  82. data/lib/treat/workers/extractors/topic_words/lda.rb +12 -12
  83. data/lib/treat/workers/extractors/topics/reuters.rb +9 -13
  84. data/lib/treat/workers/formatters/readers/autoselect.rb +1 -1
  85. data/lib/treat/workers/formatters/readers/image.rb +19 -9
  86. data/lib/treat/workers/formatters/readers/odt.rb +2 -1
  87. data/lib/treat/workers/formatters/readers/pdf.rb +20 -3
  88. data/lib/treat/workers/formatters/readers/xml.rb +0 -1
  89. data/lib/treat/workers/formatters/serializers/mongo.rb +10 -20
  90. data/lib/treat/workers/formatters/serializers/xml.rb +17 -26
  91. data/lib/treat/workers/formatters/serializers/yaml.rb +5 -4
  92. data/lib/treat/workers/formatters/unserializers/mongo.rb +4 -4
  93. data/lib/treat/workers/formatters/unserializers/xml.rb +3 -4
  94. data/lib/treat/workers/formatters/unserializers/yaml.rb +3 -4
  95. data/lib/treat/workers/formatters/visualizers/dot.rb +1 -0
  96. data/lib/treat/workers/formatters/visualizers/standoff.rb +2 -3
  97. data/lib/treat/workers/formatters/visualizers/tree.rb +2 -3
  98. data/lib/treat/workers/{group.rb → groupable.rb} +9 -9
  99. data/lib/treat/workers/inflectors/cardinalizers/linguistics.rb +1 -3
  100. data/lib/treat/workers/inflectors/conjugators/linguistics.rb +5 -7
  101. data/lib/treat/workers/inflectors/declensors/english.rb +13 -20
  102. data/lib/treat/workers/inflectors/declensors/linguistics.rb +29 -28
  103. data/lib/treat/workers/inflectors/ordinalizers/linguistics.rb +0 -2
  104. data/lib/treat/workers/inflectors/stemmers/porter.rb +8 -10
  105. data/lib/treat/workers/inflectors/stemmers/porter_c.rb +7 -7
  106. data/lib/treat/workers/inflectors/stemmers/uea.rb +3 -8
  107. data/lib/treat/workers/learners/classifiers/id3.rb +17 -14
  108. data/lib/treat/workers/learners/classifiers/linear.rb +15 -27
  109. data/lib/treat/workers/learners/classifiers/mlp.rb +32 -19
  110. data/lib/treat/workers/learners/classifiers/svm.rb +28 -21
  111. data/lib/treat/workers/lexicalizers/categorizers/from_tag.rb +19 -3
  112. data/lib/treat/workers/lexicalizers/sensers/wordnet.rb +15 -7
  113. data/lib/treat/workers/lexicalizers/taggers/brill/patch.rb +4 -1
  114. data/lib/treat/workers/lexicalizers/taggers/brill.rb +8 -19
  115. data/lib/treat/workers/lexicalizers/taggers/lingua.rb +4 -15
  116. data/lib/treat/workers/lexicalizers/taggers/stanford.rb +22 -13
  117. data/lib/treat/workers/processors/chunkers/autoselect.rb +2 -3
  118. data/lib/treat/workers/processors/chunkers/html.rb +1 -6
  119. data/lib/treat/workers/processors/parsers/enju.rb +2 -4
  120. data/lib/treat/workers/processors/parsers/stanford.rb +13 -7
  121. data/lib/treat/workers/processors/segmenters/punkt.rb +25 -11
  122. data/lib/treat/workers/processors/segmenters/scalpel.rb +20 -0
  123. data/lib/treat/workers/processors/segmenters/srx.rb +42 -0
  124. data/lib/treat/workers/processors/segmenters/stanford.rb +5 -5
  125. data/lib/treat/workers/processors/segmenters/tactful.rb +21 -11
  126. data/lib/treat/workers/processors/tokenizers/ptb.rb +40 -30
  127. data/lib/treat/workers/processors/tokenizers/punkt.rb +14 -19
  128. data/lib/treat/workers/processors/tokenizers/stanford.rb +38 -22
  129. data/lib/treat/workers/retrievers/indexers/ferret.rb +6 -3
  130. data/lib/treat/workers/retrievers/searchers/ferret.rb +2 -2
  131. data/lib/treat/workers/workers.rb +6 -0
  132. data/lib/treat.rb +18 -32
  133. data/models/MANIFEST +1 -0
  134. data/spec/core/data_set.rb +174 -0
  135. data/spec/core/export.rb +52 -0
  136. data/spec/core/problem.rb +144 -0
  137. data/spec/core/question.rb +52 -0
  138. data/spec/{collection.rb → entities/collection.rb} +20 -35
  139. data/spec/{document.rb → entities/document.rb} +3 -54
  140. data/spec/{entity.rb → entities/entity.rb} +10 -9
  141. data/spec/entities/phrase.rb +33 -0
  142. data/spec/{token.rb → entities/token.rb} +0 -57
  143. data/spec/entities/word.rb +3 -0
  144. data/spec/{zone.rb → entities/zone.rb} +0 -26
  145. data/spec/helper.rb +116 -32
  146. data/spec/sandbox.rb +258 -25
  147. data/spec/treat.rb +26 -34
  148. data/spec/workers/agnostic.rb +137 -0
  149. data/spec/workers/english.rb +194 -0
  150. data/spec/workers/examples/english/economist/hungarys_troubles.txt +46 -0
  151. data/spec/workers/examples/english/economist/saving_the_euro.odt +0 -0
  152. data/spec/{samples → workers/examples/english}/mathematicians/archimedes.abw +0 -0
  153. data/spec/{samples → workers/examples/english}/mathematicians/euler.html +0 -0
  154. data/spec/{samples → workers/examples/english}/mathematicians/gauss.pdf +0 -0
  155. data/spec/{samples → workers/examples/english}/mathematicians/leibniz.txt +0 -0
  156. data/spec/{samples → workers/examples/english}/mathematicians/newton.doc +0 -0
  157. data/spec/workers/examples/english/phrase.xml +5 -0
  158. data/spec/workers/examples/english/test.txt +1 -0
  159. data/spec/workers/language.rb +280 -0
  160. data/spec/workers.rb +28 -0
  161. metadata +122 -105
  162. data/lib/treat/config/core/acronyms.rb +0 -5
  163. data/lib/treat/config/core/encodings.rb +0 -8
  164. data/lib/treat/config/core/entities.rb +0 -2
  165. data/lib/treat/config/core/language.rb +0 -3
  166. data/lib/treat/config/core/paths.rb +0 -8
  167. data/lib/treat/config/core/syntax.rb +0 -1
  168. data/lib/treat/config/core/verbosity.rb +0 -1
  169. data/lib/treat/config/databases/default.rb +0 -1
  170. data/lib/treat/config/databases/mongo.rb +0 -1
  171. data/lib/treat/config/languages/agnostic.rb +0 -34
  172. data/lib/treat/config/languages/english.rb +0 -60
  173. data/lib/treat/config/languages/french.rb +0 -18
  174. data/lib/treat/config/languages/german.rb +0 -18
  175. data/lib/treat/config/languages/italian.rb +0 -12
  176. data/lib/treat/config/languages/polish.rb +0 -12
  177. data/lib/treat/config/languages/spanish.rb +0 -12
  178. data/lib/treat/config/languages/swedish.rb +0 -12
  179. data/lib/treat/config/libraries/punkt.rb +0 -1
  180. data/lib/treat/config/libraries/reuters.rb +0 -1
  181. data/lib/treat/config/libraries/stanford.rb +0 -1
  182. data/lib/treat/config/linguistics/categories.rb +0 -4
  183. data/lib/treat/config/linguistics/punctuation.rb +0 -33
  184. data/lib/treat/config/tags/aligned.rb +0 -221
  185. data/lib/treat/config/tags/enju.rb +0 -71
  186. data/lib/treat/config/tags/paris7.rb +0 -17
  187. data/lib/treat/config/tags/ptb.rb +0 -15
  188. data/lib/treat/config/workers/list.rb +0 -1
  189. data/lib/treat/config.rb +0 -135
  190. data/lib/treat/core.rb +0 -5
  191. data/lib/treat/entities/abilities/copyable.rb +0 -47
  192. data/lib/treat/entities/abilities/debuggable.rb +0 -83
  193. data/lib/treat/entities/abilities/registrable.rb +0 -46
  194. data/lib/treat/entities/collection.rb +0 -40
  195. data/lib/treat/entities/document.rb +0 -10
  196. data/lib/treat/entities/group.rb +0 -18
  197. data/lib/treat/entities/section.rb +0 -13
  198. data/lib/treat/entities/token.rb +0 -47
  199. data/lib/treat/entities/zone.rb +0 -12
  200. data/lib/treat/entities.rb +0 -6
  201. data/lib/treat/helpers/didyoumean.rb +0 -57
  202. data/lib/treat/helpers/escaping.rb +0 -15
  203. data/lib/treat/helpers/formatting.rb +0 -41
  204. data/lib/treat/helpers/objtohash.rb +0 -8
  205. data/lib/treat/helpers/platform.rb +0 -15
  206. data/lib/treat/helpers/reflection.rb +0 -17
  207. data/lib/treat/helpers/temporary.rb +0 -27
  208. data/lib/treat/helpers/verbosity.rb +0 -19
  209. data/lib/treat/helpers.rb +0 -5
  210. data/lib/treat/loaders.rb +0 -10
  211. data/lib/treat/proxies.rb +0 -106
  212. data/lib/treat/workers/formatters/unserializers/autoselect.rb +0 -17
  213. data/lib/treat/workers/inflectors/declensors/active_support.rb +0 -31
  214. data/lib/treat/workers/processors/tokenizers/tactful.rb +0 -68
  215. data/spec/core.rb +0 -441
  216. data/spec/phrase.rb +0 -112
  217. data/spec/word.rb +0 -111
@@ -1,221 +0,0 @@
1
- {tag_sets: [
2
- :claws_c5, :brown, :penn, :stutgart, :chinese, :paris7
3
- ],
4
- phrase_tags: [
5
- 'Adjectival phrase', ['', '', 'ADJP', '', '', 'AP'],
6
- 'Adverbial phrase', ['', '', 'ADVP', '', '', 'AdP'],
7
- 'Conjunction phrase', ['', '', 'CONJP', '', '', 'Ssub'],
8
- 'Fragment', ['', '', 'FRAG', '', '', ''],
9
- 'Interjectional phrase', ['', '', 'INTJ', '', '', ''],
10
- 'List marker', ['', '', 'LST', '', '', ''],
11
- 'Not a phrase', ['', '', 'NAC', '', '', ''],
12
- 'Noun phrase', ['', '', 'NP', '', '', 'NP'],
13
- 'Verbal nucleus', ['', '', '', '', '', 'VN'],
14
- 'Head of noun phrase', ['', '', 'NX', '', '', ''],
15
- 'Prepositional phrase', ['', '', 'PP', '', '', 'PP'],
16
- 'Parenthetical', ['', '', 'PRN', '', '', ''],
17
- 'Particle', ['', '', 'PRT', '', '', ''],
18
- 'Participial phrase', ['', '', '', '', '', 'VPart'],
19
- 'Quantifier phrase', ['', '', 'QP', '', '', ''],
20
- 'Relative clause', ['', '', 'RRC', '', '', 'Srel'],
21
- 'Coordinated phrase', ['', '', 'UCP', '', '', 'COORD'],
22
- 'Infinitival phrase', ['', '', '', '', '', 'VPinf'],
23
- 'Verb phrase', ['', '', 'VP', '', '', ''],
24
- 'Wh adjective phrase', ['', '', 'WHADJP', '', '', ''],
25
- 'Wh adverb phrase', ['', '', 'WHAVP', '', '', ''],
26
- 'Wh noun phrase', ['', '', 'WHNP', '', '', ''],
27
- 'Wh prepositional phrase', ['', '', 'WHPP', '', '', ''],
28
- 'Unknown', ['', '', 'X', '', '', ''],
29
- 'Phrase', ['', '', 'P', '', '', 'Sint'],
30
- 'Sentence', ['', '', 'S', '', '', 'SENT'],
31
- 'Phrase', ['', '', 'SBAR', '', '', ''] # Fix
32
- ],
33
- word_tags: [
34
-
35
- # Aligned tags for the Claws C5, Brown and Penn tag sets.
36
- # Adapted from Manning, Christopher and Schütze, Hinrich,
37
- # 1999. Foundations of Statistical Natural Language
38
- # Processing. MIT Press, p. 141-142;
39
- # http://www.isocat.org/rest/dcs/376;
40
-
41
- 'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'A'],
42
- 'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'ADJ'],
43
- 'Ajective, adverbial or predicative', ['', '', '', 'ADJD', '', 'ADJ'],
44
- 'Adjective, attribute', ['', '', '', 'ADJA', 'VA', 'ADJ'],
45
- 'Adjective, ordinal number', ['ORD', 'OD', 'JJ', '', 'OD', 'ADJ'],
46
- 'Adjective, comparative', ['AJC', 'JJR', 'JJR', 'KOKOM', '', 'ADJ'],
47
- 'Adjective, superlative', ['AJS', 'JJT', 'JJS', '', 'JJ', 'ADJ'],
48
- 'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ', '', '', 'ADJ'],
49
- 'Adjective, cardinal number', ['CRD', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
50
- 'Adjective, cardinal number, one', ['PNI', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
51
-
52
- 'Adverb', ['AV0', 'RB', 'RB', 'ADV', 'AD', 'ADV'],
53
- 'Adverb, negative', ['XX0', '*', 'RB', 'PTKNEG', '', 'ADV'],
54
- 'Adverb, comparative', ['AV0', 'RBR', 'RBR', '', 'AD', 'ADV'],
55
- 'Adverb, superlative', ['AV0', 'RBT', 'RBS', '', 'AD', 'ADV'],
56
- 'Adverb, particle', ['AVP', 'RP', 'RP', '', '', 'ADV'],
57
- 'Adverb, question', ['AVQ', 'WRB', 'WRB', '', 'AD', 'ADV'],
58
- 'Adverb, degree & question', ['AVQ', 'WQL', 'WRB', '', 'ADV'],
59
- 'Adverb, degree', ['AV0', 'QL', 'RB', '', '', 'ADV'],
60
- 'Adverb, degree, postposed', ['AV0', 'QLP', 'RB', '', '', 'ADV'],
61
- 'Adverb, nominal', ['AV0', 'RN', 'RB', 'PROP', '', 'ADV'],
62
- 'Adverb, pronominal', ['', '', '', '', 'PROP', '', 'ADV'],
63
-
64
- 'Conjunction, coordination', ['CJC', 'CC', 'CC', 'KON', 'CC', 'COOD'],
65
- 'Conjunction, coordination, and', ['CJC', 'CC', 'CC', 'KON', 'CC', 'ET'],
66
- 'Conjunction, subordination', ['CJS', 'CS', 'IN', 'KOUS', 'CS', 'CONJ'],
67
- 'Conjunction, subordination with to and infinitive', ['', '', '', 'KOUI', '', ''],
68
- 'Conjunction, complementizer, that', ['CJT', 'CS', 'IN', '', '', 'C'],
69
-
70
- 'Determiner', ['DT0', 'DT', 'DT', '', 'DT', 'D'],
71
- 'Determiner, pronoun', ['DT0', 'DTI', 'DT', '', '', 'D'],
72
- 'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT', '', '', 'D'],
73
- 'Determiner, prequalifier', ['DT0', 'ABL', 'DT', '', '', 'D'],
74
- 'Determiner, prequantifier', ['DT0', 'ABN', 'PDT', '', 'DT', 'D'],
75
- 'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT', '', '', 'D'],
76
- 'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT', '', '', 'D'],
77
- 'Determiner, article', ['AT0', 'AT', 'DT', 'ART', '', 'D'],
78
- 'Determiner, postdeterminer', ['DT0', 'AP', 'DT', '', '', 'D'],
79
- 'Determiner, possessive', ['DPS', 'PP$', 'PRP$', '', '', 'D'],
80
- 'Determiner, possessive, second', ['DPS', 'PP$', 'PRPS', '', '', 'D'],
81
- 'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
82
- 'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
83
- 'Interjection', ['', '', '', '', '', 'I'],
84
- 'Localizer', ['', '', '', '', 'LC'],
85
-
86
- 'Measure word', ['', '', '', '', 'M'],
87
-
88
- 'Noun, common', ['NN0', 'NN', 'NN', 'N', 'NN', 'NN'],
89
- 'Noun, singular', ['NN1', 'NN', 'NN', 'NN', 'NN', 'N'],
90
- 'Noun, plural', ['NN2', 'NNS', 'NNS', 'NN', 'NN', 'N'],
91
- 'Noun, proper, singular', ['NP0', 'NP', 'NNP', 'NE', 'NR', 'N'],
92
- 'Noun, proper, plural', ['NP0', 'NPS', 'NNPS', 'NE', 'NR', 'N'],
93
- 'Noun, adverbial', ['NN0', 'NR', 'NN', 'NE', '', 'N'],
94
- 'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS', '', 'N'],
95
- 'Noun, temporal', ['', '', '', '', 'NT', 'N'],
96
- 'Noun, verbal', ['', '', '', '', 'NN', 'N'],
97
-
98
- 'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP', '', 'PN', 'CL'],
99
- 'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP', 'PPER'],
100
- 'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP', 'PPER'],
101
- 'Pronoun, personal, object', ['PNP', 'PPO', 'PRP', 'PPER'],
102
- 'Pronoun, reflexive', ['PNX', 'PPL', 'PRP', 'PRF'],
103
- 'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP', 'PRF'],
104
- 'Pronoun, question, subject', ['PNQ', 'WPS', 'WP', 'PWAV'],
105
- 'Pronoun, question, subject', ['PNQ', 'WPS', 'WPS', 'PWAV'], # Hack
106
- 'Pronoun, question, object', ['PNQ', 'WPO', 'WP', 'PWAV', 'PWAT'],
107
- 'Pronoun, existential there', ['EX0', 'EX', 'EX'],
108
- 'Pronoun, attributive demonstrative', ['', '', '', 'PDAT'],
109
- 'Prounoun, attributive indefinite without determiner', ['', '', '', 'PIAT'],
110
- 'Pronoun, attributive possessive', ['', '', '', 'PPOSAT', ''],
111
- 'Pronoun, substituting demonstrative', ['', '', '', 'PDS'],
112
- 'Pronoun, substituting possessive', ['', '', '', 'PPOSS', ''],
113
- 'Prounoun, substituting indefinite', ['', '', '', 'PIS'],
114
- 'Pronoun, attributive relative', ['', '', '', 'PRELAT', ''],
115
- 'Pronoun, substituting relative', ['', '', '', 'PRELS', ''],
116
- 'Pronoun, attributive interrogative', ['', '', '', 'PWAT'],
117
- 'Pronoun, adverbial interrogative', ['', '', '', 'PWAV'],
118
-
119
- 'Pronoun, substituting interrogative', ['', '', '', 'PWS'],
120
- 'Verb, main, finite', ['', '', '', 'VVFIN', '', 'V'],
121
- 'Verb, main, infinitive', ['', '', '', 'VVINF', '', 'V'],
122
- 'Verb, main, imperative', ['', '', '', 'VVIMP', '', 'V'],
123
- 'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP', '', '', 'V'],
124
- 'Verb, infinitive', ['VVI', 'VB', 'VB', 'V', '', 'V'],
125
- 'Verb, past tense', ['VVD', 'VBD', 'VBD', '', '', 'V'],
126
- 'Verb, present participle', ['VVG', 'VBG', 'VBG', 'VAPP', '', 'V'],
127
- 'Verb, past/passive participle', ['VVN', 'VBN', 'VBN', 'VVPP', '', 'V'],
128
- 'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ', '', '', 'V'],
129
- 'Verb, auxiliary', ['', '', '', 'VAFIN', '', 'V'],
130
- 'Verb, imperative', ['', '', '', 'VAIMP', '', 'V'],
131
- 'Verb, imperative infinitive', ['', '', '', 'VAINF', '', 'V'],
132
- 'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP', '', '', 'V'],
133
- 'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB', '', '', 'V'],
134
- 'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD', '', '', 'V'],
135
- 'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG', '', '', 'V'],
136
- 'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN', '', '', 'V'],
137
- 'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ', '', '', 'V'],
138
- 'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP', 'VA', '', 'V'],
139
- 'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB', 'VAINF', '', 'V'],
140
- 'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD', 'VA', '', 'V'],
141
- 'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG', 'VA', '', 'V'],
142
- 'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN', 'VAPP', '', 'V'],
143
- 'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ', 'VA', '', 'V'],
144
- 'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB', '', '', 'V'],
145
- 'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD', '', '', 'V'],
146
- 'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD', '', '', 'V'],
147
- 'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG', '', '', 'V'],
148
- 'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN', '', '', 'V'],
149
- 'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ', '', '', 'V'],
150
- 'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP', '', '', 'V'],
151
- 'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP', '', '', 'V'],
152
- 'Verb, modal', ['VM0', 'MD', 'MD', 'VMFIN', 'VV', 'V'],
153
- 'Verb, modal', ['VM0', 'MD', 'MD', 'VMINF', 'VV', 'V'],
154
- 'Verb, modal, finite', ['', '', '', '', 'VMFIN', 'V'],
155
- 'Verb, modal, infinite', ['', '', '', '', 'VMINF', 'V'],
156
- 'Verb, modal, past participle', ['', '', '', '', 'VMPP', 'V'],
157
-
158
- 'Particle', ['', '', '', '', '', 'PRT'],
159
- 'Particle, with adverb', ['', '', '', 'PTKA', '', 'PRT'],
160
- 'Particle, answer', ['', '', '', 'PTKANT', '', 'PRT'],
161
- 'Particle, negation', ['', '', '', 'PTKNEG', '', 'PRT'],
162
- 'Particle, separated verb', ['', '', '', 'PTKVZ', '', 'PRT'],
163
- 'Particle, to as infinitive marker', ['TO0', 'TO', 'TO', 'PTKZU', '', 'PRT'],
164
-
165
- 'Preposition, comparative', ['', '', '', 'KOKOM', '', 'P'],
166
- 'Preposition, to', ['PRP', 'IN', 'TO', '', '', 'P'],
167
- 'Preposition', ['PRP', 'IN', 'IN', 'APPR', 'P', 'P'],
168
- 'Preposition, with aritcle', ['', '', '', 'APPART', '', 'P'],
169
- 'Preposition, of', ['PRF', 'IN', 'IN', '', '', 'P'],
170
-
171
- 'Possessive', ['POS', '$', 'POS'],
172
-
173
- 'Postposition', ['', '', '', 'APPO'],
174
-
175
- 'Circumposition, right', ['', '', '', 'APZR', ''],
176
-
177
- 'Interjection, onomatopoeia or other isolate', ['ITJ', 'UH', 'UH', 'ITJ', 'IJ'],
178
-
179
- 'Onomatopoeia', ['', '', '', '', 'ON'],
180
-
181
- 'Punctuation', ['', '', '', '', 'PU', 'PN'],
182
- 'Punctuation, sentence ender', ['PUN', '.', '.', '', '', 'PN'],
183
-
184
- 'Punctuation, semicolon', ['PUN', '.', '.', '', '', 'PN'],
185
- 'Puncutation, colon or ellipsis', ['PUN', ':', ':'],
186
- 'Punctuationm, comma', ['PUN', ',', ',', '$,'],
187
- 'Punctuation, dash', ['PUN', '-', '-'],
188
- 'Punctuation, dollar sign', ['PUN', '', '$'],
189
- 'Punctuation, left bracket', ['PUL', '(', '(', '$('],
190
- 'Punctuation, right bracket', ['PUR', ')', ')'],
191
- 'Punctuation, quotation mark, left', ['PUQ', '', '``'],
192
- 'Punctuation, quotation mark, right', ['PUQ', '', '"'],
193
-
194
- 'Punctuation, left bracket', ['PUL', '(', 'PPL'],
195
- 'Punctuation, right bracket', ['PUR', ')', 'PPR'],
196
- 'Punctuation, left square bracket', ['PUL', '(', 'LSB'],
197
- 'Punctuation, right square bracket', ['PUR', ')', 'RSB'],
198
- 'Punctuation, left curly bracket', ['PUL', '(', 'LCB'],
199
- 'Punctuation, right curly bracket', ['PUR', ')', 'RCB'],
200
-
201
- 'Unknown, foreign words (not in lexicon)', ['UNZ', '(FW-)', 'FW', '', 'FW'],
202
-
203
- 'Symbol', ['', '', 'SYM', 'XY'],
204
- 'Symbol, alphabetical', ['ZZ0', '', ''],
205
- 'Symbol, list item', ['', '', 'LS'],
206
-
207
- # Not sure about these tags from the Chinese PTB.
208
- 'Aspect marker', ['', '', '', '', 'AS'], # ?
209
- 'Ba-construction', ['', '', '', '', 'BA'], # ?
210
- 'In relative', ['', '', '', '', 'DEC'], # ?
211
- 'Associative', ['', '', '', '', 'DER'], # ?
212
- 'In V-de or V-de-R construct', ['', '', '', '', 'DER'], # ?
213
- 'For words ? ', ['', '', '', '', 'ETC'], # ?
214
- 'In long bei-construct', ['', '', '', '', 'LB'], # ?
215
- 'In short bei-construct', ['', '', '', '', 'SB'], # ?
216
- 'Sentence-nal particle', ['', '', '', '', 'SB'], # ?
217
- 'Particle, other', ['', '', '', '', 'MSP'], # ?
218
- 'Before VP', ['', '', '', '', 'DEV'], # ?
219
- 'Verb, ? as main verb', ['', '', '', '', 'VE'], # ?
220
- 'Verb, ????', ['', '', '', '', 'VC'] # ?
221
- ]}
@@ -1,71 +0,0 @@
1
- {cat_to_category: {
2
- 'ADJ' => 'adjective',
3
- 'ADV' => 'adverb',
4
- 'CONJ' => 'conjunction',
5
- 'COOD' => 'conjunction',
6
- 'C' => 'complementizer',
7
- 'D' => 'determiner',
8
- 'N' => 'noun',
9
- 'P' => 'preposition',
10
- 'PN' => 'punctuation',
11
- 'SC' => 'conjunction',
12
- 'V' => 'verb',
13
- 'PRT' => 'particle'
14
- },
15
- cat_to_description: [
16
- ['ADJ', 'Adjective'],
17
- ['ADV', 'Adverb'],
18
- ['CONJ', 'Coordination conjunction'],
19
- ['C', 'Complementizer'],
20
- ['D', 'Determiner'],
21
- ['N', 'Noun'],
22
- ['P', 'Preposition'],
23
- ['SC', 'Subordination conjunction'],
24
- ['V', 'Verb'],
25
- ['COOD', 'Part of coordination'],
26
- ['PN', 'Punctuation'],
27
- ['PRT', 'Particle'],
28
- ['S', 'Sentence']
29
- ],
30
- xcat_to_description: [
31
- ['COOD', 'Coordinated phrase/clause'],
32
- ['IMP', 'Imperative sentence'],
33
- ['INV', 'Subject-verb inversion'],
34
- ['Q', 'Interrogative sentence with subject-verb inversion'],
35
- ['REL', 'A relativizer included'],
36
- ['FREL', 'A free relative included'],
37
- ['TRACE', 'A trace included'],
38
- ['WH', 'A wh-question word included']
39
- ],
40
- xcat_to_ptb: [
41
- ['ADJP', '', 'ADJP'],
42
- ['ADJP', 'REL', 'WHADJP'],
43
- ['ADJP', 'FREL', 'WHADJP'],
44
- ['ADJP', 'WH', 'WHADJP'],
45
- ['ADVP', '', 'ADVP'],
46
- ['ADVP', 'REL', 'WHADVP'],
47
- ['ADVP', 'FREL', 'WHADVP'],
48
- ['ADVP', 'WH', 'WHADVP'],
49
- ['CONJP', '', 'CONJP'],
50
- ['CP', '', 'SBAR'],
51
- ['DP', '', 'NP'],
52
- ['NP', '', 'NP'],
53
- ['NX', 'NX', 'NAC'],
54
- ['NP' 'REL' 'WHNP'],
55
- ['NP' 'FREL' 'WHNP'],
56
- ['NP' 'WH' 'WHNP'],
57
- ['PP', '', 'PP'],
58
- ['PP', 'REL', 'WHPP'],
59
- ['PP', 'WH', 'WHPP'],
60
- ['PRT', '', 'PRT'],
61
- ['S', '', 'S'],
62
- ['S', 'INV', 'SINV'],
63
- ['S', 'Q', 'SQ'],
64
- ['S', 'REL', 'SBAR'],
65
- ['S', 'FREL', 'SBAR'],
66
- ['S', 'WH', 'SBARQ'],
67
- ['SCP', '', 'SBAR'],
68
- ['VP', '', 'VP'],
69
- ['VP', '', 'VP'],
70
- ['', '', 'UK']
71
- ]}
@@ -1,17 +0,0 @@
1
- {tag_to_category: {
2
- 'C' => :complementizer,
3
- 'PN' => :punctuation,
4
- 'SC' => :conjunction
5
- }
6
- # Paris7 Treebank functional tags
7
- =begin
8
- SUJ (subject)
9
- OBJ (direct object)
10
- ATS (predicative complement of a subject)
11
- ATO (predicative complement of a direct object)
12
- MOD (modifier or adjunct)
13
- A-OBJ (indirect complement introduced by à)
14
- DE-OBJ (indirect complement introduced by de)
15
- P-OBJ (indirect complement introduced by another preposition)
16
- =end
17
- }
@@ -1,15 +0,0 @@
1
- {escape_characters: {
2
- '(' => '-LRB-',
3
- ')' => '-RRB-',
4
- '[' => '-LSB-',
5
- ']' => '-RSB-',
6
- '{' => '-LCB-',
7
- '}' => '-RCB-'
8
- },
9
- phrase_tag_to_description: [
10
- ['S', 'Paris7 declarative clause'],
11
- ['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
12
- ['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
13
- ['SINV', 'Inverted declarative sentence'],
14
- ['SQ', 'Inverted yes/no question']
15
- ]}
@@ -1 +0,0 @@
1
- [:extractors, :inflectors, :formatters, :learners, :lexicalizers, :processors, :retrievers]
data/lib/treat/config.rb DELETED
@@ -1,135 +0,0 @@
1
- module Treat::Config
2
-
3
- Paths = [ :tmp, :lib, :bin,
4
- :files, :data, :models, :spec ]
5
-
6
- class << self
7
- attr_accessor :config
8
- end
9
-
10
- Treat.module_eval do
11
- # Handle all missing methods as conf options.
12
- def self.method_missing(sym, *args, &block)
13
- super(sym, *args, &block) if sym == :to_ary
14
- Treat::Config.config[sym]
15
- end
16
- end
17
-
18
- def self.configure
19
- # Temporary configuration hash.
20
- config = { paths: {} }
21
- confdir = get_full_path(:lib) + 'treat/config'
22
- # Iterate over each directory in the config.
23
- Dir[confdir + '/*'].each do |dir|
24
- name = File.basename(dir, '.*').intern
25
- config[name] = {}
26
- # Iterate over each file in the directory.
27
- Dir[confdir + "/#{name}/*.rb"].each do |file|
28
- key = File.basename(file, '.*').intern
29
- config[name][key] = eval(File.read(file))
30
- end
31
- end
32
- # Get the path config.
33
- Paths.each do |path|
34
- config[:paths][path] = get_full_path(path)
35
- end
36
- # Get the tag alignments.
37
- configure_tags!(config[:tags][:aligned])
38
- # Convert hash to structs.
39
- self.config = self.hash_to_struct(config)
40
- end
41
-
42
- def self.get_full_path(dir)
43
- File.dirname(__FILE__) +
44
- '/../../' + dir.to_s + "/"
45
- end
46
-
47
- def self.configure_tags!(config)
48
- ts = config[:tag_sets]
49
- config[:word_tags_to_category] =
50
- align_tags(config[:word_tags], ts)
51
- config[:phrase_tags_to_category] =
52
- align_tags(config[:phrase_tags], ts)
53
- end
54
-
55
- # Align tag configuration.
56
- def self.align_tags(tags, tag_sets)
57
- wttc = {}
58
- tags.each_slice(2) do |desc, tags|
59
- category = desc.gsub(',', ' ,').
60
- split(' ')[0].downcase
61
- tag_sets.each_with_index do |tag_set, i|
62
- next unless tags[i]
63
- wttc[tags[i]] ||= {}
64
- wttc[tags[i]][tag_set] = category
65
- end
66
- end
67
- wttc
68
- end
69
-
70
- def self.hash_to_struct(hash)
71
- return hash if hash.keys.
72
- select { |k| !k.is_a?(Symbol) }.size > 0
73
- struct = Struct.new(
74
- *hash.keys).new(*hash.values)
75
- hash.each do |key, value|
76
- if value.is_a?(Hash)
77
- struct[key] =
78
- self.hash_to_struct(value)
79
- end
80
- end
81
- struct
82
- end
83
-
84
- # Turn on syntactic sugar.
85
- def self.sweeten!
86
-
87
- # Undo this in unsweeten! - # Fix
88
- Treat::Entities.module_eval do
89
- self.constants.each do |type|
90
- define_singleton_method(type) do |value='', id=nil|
91
- const_get(type).build(value, id)
92
- end
93
- end
94
- end
95
-
96
- return if Treat.core.syntax.sweetened
97
- Treat.core.syntax.sweetened = true
98
- Treat.core.entities.list.each do |type|
99
- next if type == :Symbol
100
- kname = cc(type).intern
101
- klass = Treat::Entities.const_get(kname)
102
- Object.class_eval do
103
- define_method(kname) do |val, opts={}|
104
- klass.build(val, opts)
105
- end
106
- end
107
- end
108
-
109
- Treat::Core.constants.each do |kname|
110
- Object.class_eval do
111
- klass = Treat::Core.const_get(kname)
112
- define_method(kname) do |*args|
113
- klass.new(*args)
114
- end
115
- end
116
- end
117
-
118
- end
119
-
120
- # Turn off syntactic sugar.
121
- def self.unsweeten!
122
- return unless Treat.core.syntax.sweetened
123
- Treat.core.syntax.sweetened = false
124
- Treat.core.entities.list.each do |type|
125
- name = cc(type).intern
126
- next if type == :Symbol
127
- Object.class_eval { remove_method(name) }
128
- end
129
-
130
- end
131
-
132
- # Run all configuration.
133
- self.configure
134
-
135
- end
data/lib/treat/core.rb DELETED
@@ -1,5 +0,0 @@
1
- # Contains the core classes used by Treat.
2
- module Treat::Core
3
- p = Treat.paths.lib + 'treat/core/*.rb'
4
- Dir.glob(p).each { |f| require f }
5
- end
@@ -1,47 +0,0 @@
1
- module Treat::Entities::Abilities::Copyable
2
-
3
- require 'fileutils'
4
-
5
- # What happens when it is a database-stored
6
- # collection or document ?
7
- def copy_into(collection)
8
- unless collection.is_a?(
9
- Treat::Entities::Collection)
10
- raise Treat::Exception,
11
- "Cannot copy an entity into " +
12
- "something else than a collection."
13
- end
14
- if type == :document
15
- copy_document_into(collection)
16
- elsif type == :collection
17
- copy_collection_into(collection)
18
- else
19
- raise Treat::Exception,
20
- "Can only copy a document " +
21
- "or collection into a collection."
22
- end
23
- end
24
-
25
- def copy_collection_into(collection)
26
- copy = dup
27
- f = File.dirname(folder)
28
- f = f.split(File::SEPARATOR)[-1]
29
- f = File.join(collection.folder, f)
30
- FileUtils.mkdir(f) unless
31
- FileTest.directory(f)
32
- FileUtils.cp_r(folder, f)
33
- copy.set :folder, f
34
- copy
35
- end
36
-
37
- def copy_document_into(collection)
38
- copy = dup
39
- return copy unless file
40
- f = File.basename(file)
41
- f = File.join(collection.folder, f)
42
- FileUtils.cp(file, f)
43
- copy.set :file, f
44
- copy
45
- end
46
-
47
- end
@@ -1,83 +0,0 @@
1
- # When Treat.debug is set to true, each call to
2
- # #call_worker will result in a debug message being
3
- # printed by the #print_debug function.
4
- module Treat::Entities::Abilities::Debuggable
5
-
6
- @@prev = nil
7
- @@i = 0
8
-
9
- # Explains what Treat is currently doing.
10
- def print_debug(entity, task, worker, group, options)
11
-
12
- targs = group.targets.map do |target|
13
- target.to_s
14
- end
15
-
16
- if targs.size == 1
17
- t = targs[0]
18
- else
19
- t = targs[0..-2].join(', ') +
20
- ' and/or ' + targs[-1]
21
- end
22
-
23
- genitive = targs.size > 1 ?
24
- 'their' : 'its'
25
-
26
- doing = ''
27
-
28
- human_task = task.to_s.gsub('_', ' ')
29
-
30
- if group.type == :transformer ||
31
- group.type == :computer
32
-
33
- tt = human_task
34
- tt = tt[0..-2] if tt[-1] == 'e'
35
- ed = tt[-1] == 'd' ? '' : 'ed'
36
- doing = "#{tt.capitalize}#{ed} #{t}"
37
-
38
- elsif group.type == :annotator
39
-
40
- if group.preset_option
41
- opt = options[group.preset_option]
42
- form = opt.to_s.gsub('_', ' ')
43
- human_task[-1] = ''
44
- human_task = form + ' ' + human_task
45
- end
46
-
47
- doing = "Annotated #{t} with " +
48
- "#{genitive} #{human_task}"
49
- end
50
-
51
- if group.to_s.index('Formatters')
52
- curr = doing +
53
- ' in format ' +
54
- worker.to_s
55
- else
56
- curr = doing +
57
- ' using ' +
58
- worker.to_s.gsub('_', ' ')
59
- end
60
-
61
- curr.gsub!('ss', 's') unless curr.index('class')
62
- curr += '.'
63
-
64
- if curr == @@prev
65
- @@i += 1
66
- else
67
- if @@i > 1
68
- Treat.core.entities.list.each do |e|
69
- @@prev.gsub!(e.to_s, e.to_s + 's')
70
- end
71
- @@prev.gsub!('its', 'their')
72
- @@prev = @@prev.split(' ').
73
- insert(1, @@i.to_s).join(' ')
74
- end
75
- @@i = 0
76
- puts @@prev # Last call doesn't get shown.
77
- end
78
-
79
- @@prev = curr
80
-
81
- end
82
-
83
- end
@@ -1,46 +0,0 @@
1
- # Registers occurences of textual values inside
2
- # all children entity. Useful to calculate frequency.
3
- module Treat::Entities::Abilities::Registrable
4
-
5
- # Registers a token in the @registry hash.
6
- def register(entity)
7
-
8
- unless @registry
9
- @count = 0
10
- @registry = {
11
- :value => {},
12
- :position => {},
13
- :type => {},
14
- :id => {}
15
- }
16
- end
17
-
18
- if entity.is_a?(Treat::Entities::Token) ||
19
- entity.is_a?(Treat::Entities::Phrase)
20
- val = entity.to_s.downcase
21
- @registry[:value][val] ||= 0
22
- @registry[:value][val] += 1
23
- end
24
-
25
- @registry[:id][entity.id] = true
26
- @registry[:type][entity.type] ||= 0
27
- @registry[:type][entity.type] += 1
28
- @registry[:position][entity.id] = @count
29
- @count += 1
30
-
31
- @parent.register(entity) if has_parent?
32
-
33
- end
34
-
35
- # Backtrack up the tree to find a token registry,
36
- # by default the one in the root node of any entity.
37
- def registry(type = nil)
38
- if has_parent? &&
39
- type != self.type
40
- @parent.registry(type)
41
- else
42
- @registry
43
- end
44
- end
45
-
46
- end