treat 1.0.6 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (210) hide show
  1. data/LICENSE +2 -4
  2. data/README.md +13 -12
  3. data/bin/MANIFEST +1 -0
  4. data/bin/stanford/bridge.jar +0 -0
  5. data/bin/stanford/joda-time.jar +0 -0
  6. data/bin/stanford/stanford-corenlp.jar +0 -0
  7. data/bin/stanford/stanford-parser.jar +0 -0
  8. data/bin/stanford/xom.jar +0 -0
  9. data/files/{www.economist.com/21552208 → 21552208.html} +86 -89
  10. data/files/{guides.rubyonrails.org/3_2_release_notes.html → 3_2_release_notes.html} +0 -0
  11. data/files/{INFO → MANIFEST} +0 -0
  12. data/files/{www.rubyinside.com/nethttp-cheat-sheet-2940.html → nethttp-cheat-sheet-2940.html} +12 -16
  13. data/files/weather-central-canada-heat-wave.html +1370 -0
  14. data/lib/treat/config/core/acronyms.rb +4 -0
  15. data/lib/treat/config/core/encodings.rb +8 -0
  16. data/lib/treat/config/core/entities.rb +2 -0
  17. data/lib/treat/config/core/language.rb +3 -0
  18. data/lib/treat/config/core/paths.rb +8 -0
  19. data/lib/treat/config/core/syntax.rb +1 -0
  20. data/lib/treat/config/core/verbosity.rb +1 -0
  21. data/lib/treat/config/databases/mongo.rb +3 -0
  22. data/lib/treat/config/languages/agnostic.rb +34 -0
  23. data/lib/treat/config/languages/arabic.rb +13 -0
  24. data/lib/treat/config/languages/chinese.rb +13 -0
  25. data/lib/treat/config/languages/dutch.rb +12 -0
  26. data/lib/treat/config/languages/english.rb +60 -0
  27. data/lib/treat/config/languages/french.rb +18 -0
  28. data/lib/treat/config/languages/german.rb +18 -0
  29. data/lib/treat/config/languages/greek.rb +12 -0
  30. data/lib/treat/config/languages/italian.rb +12 -0
  31. data/lib/treat/config/languages/polish.rb +12 -0
  32. data/lib/treat/config/languages/portuguese.rb +12 -0
  33. data/lib/treat/config/languages/russian.rb +12 -0
  34. data/lib/treat/config/languages/spanish.rb +12 -0
  35. data/lib/treat/config/languages/swedish.rb +12 -0
  36. data/lib/treat/config/libraries/stanford.rb +1 -0
  37. data/lib/treat/config/linguistics/categories.rb +4 -0
  38. data/lib/treat/config/linguistics/punctuation.rb +33 -0
  39. data/lib/treat/config/tags/aligned.rb +221 -0
  40. data/lib/treat/config/tags/enju.rb +71 -0
  41. data/lib/treat/config/tags/paris7.rb +17 -0
  42. data/lib/treat/config/tags/ptb.rb +15 -0
  43. data/lib/treat/config/workers/extractors.rb +39 -0
  44. data/lib/treat/config/workers/formatters.rb +20 -0
  45. data/lib/treat/config/workers/inflectors.rb +27 -0
  46. data/lib/treat/config/workers/learners.rb +6 -0
  47. data/lib/treat/config/workers/lexicalizers.rb +18 -0
  48. data/lib/treat/config/workers/list.rb +1 -0
  49. data/lib/treat/config/workers/processors.rb +19 -0
  50. data/lib/treat/config/workers/retrievers.rb +12 -0
  51. data/lib/treat/config.rb +125 -0
  52. data/lib/treat/{classification.rb → core/classification.rb} +1 -1
  53. data/lib/treat/{data_set.rb → core/data_set.rb} +1 -4
  54. data/lib/treat/{tree.rb → core/node.rb} +5 -5
  55. data/lib/treat/core/server.rb +3 -0
  56. data/lib/treat/core.rb +5 -0
  57. data/lib/treat/entities/abilities/buildable.rb +61 -56
  58. data/lib/treat/entities/abilities/checkable.rb +2 -2
  59. data/lib/treat/entities/abilities/comparable.rb +21 -0
  60. data/lib/treat/entities/abilities/copyable.rb +2 -0
  61. data/lib/treat/entities/abilities/countable.rb +1 -1
  62. data/lib/treat/entities/abilities/debuggable.rb +1 -1
  63. data/lib/treat/entities/abilities/delegatable.rb +42 -36
  64. data/lib/treat/entities/abilities/doable.rb +2 -2
  65. data/lib/treat/entities/abilities/exportable.rb +1 -1
  66. data/lib/treat/entities/abilities/iterable.rb +21 -33
  67. data/lib/treat/entities/abilities/magical.rb +8 -8
  68. data/lib/treat/entities/abilities/registrable.rb +0 -38
  69. data/lib/treat/entities/abilities/stringable.rb +19 -19
  70. data/lib/treat/entities/collection.rb +31 -0
  71. data/lib/treat/entities/document.rb +10 -0
  72. data/lib/treat/entities/entity.rb +18 -13
  73. data/lib/treat/entities/group.rb +15 -0
  74. data/lib/treat/entities/section.rb +13 -0
  75. data/lib/treat/entities/token.rb +35 -0
  76. data/lib/treat/entities/zone.rb +11 -0
  77. data/lib/treat/entities.rb +5 -75
  78. data/lib/treat/helpers/didyoumean.rb +57 -0
  79. data/lib/treat/helpers/escaping.rb +15 -0
  80. data/lib/treat/helpers/formatting.rb +41 -0
  81. data/lib/treat/helpers/platform.rb +15 -0
  82. data/lib/treat/helpers/reflection.rb +17 -0
  83. data/lib/treat/helpers/temporary.rb +27 -0
  84. data/lib/treat/helpers/verbosity.rb +19 -0
  85. data/lib/treat/helpers.rb +5 -0
  86. data/lib/treat/installer.rb +46 -165
  87. data/lib/treat/loaders/linguistics.rb +22 -27
  88. data/lib/treat/loaders/stanford.rb +23 -41
  89. data/lib/treat/loaders.rb +10 -0
  90. data/lib/treat/proxies.rb +73 -24
  91. data/lib/treat/version.rb +3 -0
  92. data/lib/treat/{extractors → workers/extractors}/keywords/tf_idf.rb +1 -1
  93. data/lib/treat/{extractors → workers/extractors}/language/what_language.rb +11 -4
  94. data/lib/treat/{extractors → workers/extractors}/name_tag/stanford.rb +3 -4
  95. data/lib/treat/{extractors → workers/extractors}/tf_idf/native.rb +4 -5
  96. data/lib/treat/{extractors → workers/extractors}/time/chronic.rb +1 -1
  97. data/lib/treat/{extractors → workers/extractors}/time/nickel.rb +1 -1
  98. data/lib/treat/{extractors → workers/extractors}/time/ruby.rb +1 -1
  99. data/lib/treat/{extractors → workers/extractors}/topic_words/lda.rb +1 -1
  100. data/lib/treat/{extractors → workers/extractors}/topics/reuters.rb +4 -4
  101. data/lib/treat/{formatters → workers/formatters}/readers/abw.rb +2 -2
  102. data/lib/treat/{formatters → workers/formatters}/readers/autoselect.rb +10 -3
  103. data/lib/treat/{formatters → workers/formatters}/readers/doc.rb +2 -2
  104. data/lib/treat/{formatters → workers/formatters}/readers/html.rb +4 -4
  105. data/lib/treat/{formatters → workers/formatters}/readers/image.rb +2 -2
  106. data/lib/treat/{formatters → workers/formatters}/readers/odt.rb +2 -2
  107. data/lib/treat/{formatters → workers/formatters}/readers/pdf.rb +2 -2
  108. data/lib/treat/{formatters → workers/formatters}/readers/txt.rb +2 -2
  109. data/lib/treat/{formatters → workers/formatters}/readers/xml.rb +2 -2
  110. data/lib/treat/workers/formatters/serializers/mongo.rb +60 -0
  111. data/lib/treat/{formatters → workers/formatters}/serializers/xml.rb +1 -2
  112. data/lib/treat/{formatters → workers/formatters}/serializers/yaml.rb +1 -1
  113. data/lib/treat/{formatters → workers/formatters}/unserializers/autoselect.rb +3 -1
  114. data/lib/treat/workers/formatters/unserializers/mongo.rb +80 -0
  115. data/lib/treat/{formatters → workers/formatters}/unserializers/xml.rb +2 -2
  116. data/lib/treat/{formatters → workers/formatters}/unserializers/yaml.rb +1 -1
  117. data/lib/treat/{formatters → workers/formatters}/visualizers/dot.rb +1 -1
  118. data/lib/treat/{formatters → workers/formatters}/visualizers/standoff.rb +2 -3
  119. data/lib/treat/{formatters → workers/formatters}/visualizers/tree.rb +1 -1
  120. data/lib/treat/{groupable.rb → workers/group.rb} +6 -12
  121. data/lib/treat/{inflectors → workers/inflectors}/cardinalizers/linguistics.rb +7 -2
  122. data/lib/treat/{inflectors → workers/inflectors}/conjugators/linguistics.rb +11 -11
  123. data/lib/treat/{inflectors → workers/inflectors}/declensors/active_support.rb +2 -2
  124. data/lib/treat/{inflectors → workers/inflectors}/declensors/english/inflect.rb +1 -1
  125. data/lib/treat/{inflectors → workers/inflectors}/declensors/english.rb +2 -2
  126. data/lib/treat/{inflectors → workers/inflectors}/declensors/linguistics.rb +4 -4
  127. data/lib/treat/{inflectors → workers/inflectors}/ordinalizers/linguistics.rb +8 -2
  128. data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter.rb +2 -2
  129. data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter_c.rb +1 -1
  130. data/lib/treat/{inflectors → workers/inflectors}/stemmers/uea.rb +1 -1
  131. data/lib/treat/{ai → workers/learners}/classifiers/id3.rb +1 -1
  132. data/lib/treat/{ai → workers/learners}/classifiers/mlp.rb +1 -1
  133. data/lib/treat/{lexicalizers → workers/lexicalizers}/categorizers/from_tag.rb +9 -9
  134. data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet/synset.rb +2 -2
  135. data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet.rb +4 -4
  136. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill/patch.rb +2 -2
  137. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill.rb +2 -8
  138. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/lingua.rb +1 -6
  139. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/stanford.rb +31 -42
  140. data/lib/treat/workers/processors/chunkers/autoselect.rb +19 -0
  141. data/lib/treat/{processors → workers/processors}/chunkers/html.rb +4 -3
  142. data/lib/treat/workers/processors/chunkers/txt.rb +32 -0
  143. data/lib/treat/{processors → workers/processors}/parsers/enju.rb +3 -3
  144. data/lib/treat/{processors → workers/processors}/parsers/stanford.rb +6 -8
  145. data/lib/treat/{processors → workers/processors}/segmenters/punkt.rb +6 -10
  146. data/lib/treat/{processors → workers/processors}/segmenters/stanford.rb +2 -2
  147. data/lib/treat/{processors → workers/processors}/segmenters/tactful.rb +3 -6
  148. data/lib/treat/{processors → workers/processors}/tokenizers/ptb.rb +6 -5
  149. data/lib/treat/{processors → workers/processors}/tokenizers/punkt.rb +1 -1
  150. data/lib/treat/{processors → workers/processors}/tokenizers/stanford.rb +1 -1
  151. data/lib/treat/{processors → workers/processors}/tokenizers/tactful.rb +3 -5
  152. data/lib/treat/{retrievers → workers/retrievers}/indexers/ferret.rb +1 -1
  153. data/lib/treat/{retrievers → workers/retrievers}/searchers/ferret.rb +1 -1
  154. data/lib/treat/workers.rb +96 -0
  155. data/lib/treat.rb +23 -49
  156. data/spec/collection.rb +4 -4
  157. data/spec/document.rb +5 -5
  158. data/spec/entity.rb +33 -32
  159. data/spec/{tree.rb → node.rb} +5 -5
  160. data/spec/phrase.rb +5 -39
  161. data/spec/sandbox.rb +212 -6
  162. data/spec/token.rb +12 -9
  163. data/spec/treat.rb +12 -9
  164. data/spec/word.rb +10 -9
  165. data/spec/zone.rb +6 -2
  166. data/tmp/{INFO → MANIFEST} +0 -0
  167. data/tmp/english.yaml +10340 -0
  168. metadata +149 -139
  169. data/lib/treat/ai.rb +0 -12
  170. data/lib/treat/categories.rb +0 -90
  171. data/lib/treat/categorizable.rb +0 -44
  172. data/lib/treat/configurable.rb +0 -115
  173. data/lib/treat/dependencies.rb +0 -25
  174. data/lib/treat/downloader.rb +0 -87
  175. data/lib/treat/entities/abilities.rb +0 -10
  176. data/lib/treat/entities/entities.rb +0 -102
  177. data/lib/treat/exception.rb +0 -7
  178. data/lib/treat/extractors.rb +0 -79
  179. data/lib/treat/formatters/serializers/mongo.rb +0 -64
  180. data/lib/treat/formatters.rb +0 -41
  181. data/lib/treat/helpers/decimal_point_escaper.rb +0 -22
  182. data/lib/treat/inflectors.rb +0 -52
  183. data/lib/treat/kernel.rb +0 -208
  184. data/lib/treat/languages/arabic.rb +0 -16
  185. data/lib/treat/languages/chinese.rb +0 -16
  186. data/lib/treat/languages/dutch.rb +0 -16
  187. data/lib/treat/languages/english.rb +0 -63
  188. data/lib/treat/languages/french.rb +0 -20
  189. data/lib/treat/languages/german.rb +0 -20
  190. data/lib/treat/languages/greek.rb +0 -16
  191. data/lib/treat/languages/italian.rb +0 -17
  192. data/lib/treat/languages/language.rb +0 -10
  193. data/lib/treat/languages/list.txt +0 -504
  194. data/lib/treat/languages/polish.rb +0 -16
  195. data/lib/treat/languages/portuguese.rb +0 -16
  196. data/lib/treat/languages/russian.rb +0 -16
  197. data/lib/treat/languages/spanish.rb +0 -16
  198. data/lib/treat/languages/swedish.rb +0 -16
  199. data/lib/treat/languages.rb +0 -132
  200. data/lib/treat/lexicalizers.rb +0 -37
  201. data/lib/treat/object.rb +0 -7
  202. data/lib/treat/processors/chunkers/autoselect.rb +0 -16
  203. data/lib/treat/processors/chunkers/txt.rb +0 -21
  204. data/lib/treat/processors.rb +0 -38
  205. data/lib/treat/retrievers.rb +0 -27
  206. data/lib/treat/server.rb +0 -26
  207. data/lib/treat/universalisation/encodings.rb +0 -12
  208. data/lib/treat/universalisation/tags.rb +0 -453
  209. data/lib/treat/universalisation.rb +0 -9
  210. data/spec/languages.rb +0 -25
@@ -1,453 +0,0 @@
1
- module Treat::Universalisation::Tags
2
-
3
- ClawsC5 = 0
4
- Brown = 1
5
- Penn = 2
6
- Stuttgart = 3
7
- PennChinese = 4
8
- Paris7 = 5
9
-
10
- StanfordTagSetForLanguage = {
11
- :french => :paris7,
12
- :english => :penn,
13
- :german => :stuttgart
14
- }
15
-
16
- PTBClauseTagDescription = [
17
- ['S', 'Paris7 declarative clause'],
18
- ['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
19
- ['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
20
- ['SINV', 'Inverted declarative sentence'],
21
- ['SQ', 'Inverted yes/no question']
22
- ]
23
-
24
- PTBEscapeCharacters = {
25
- '(' => '-LRB-',
26
- ')' => '-RRB-',
27
- '[' => '-LSB-',
28
- ']' => '-RSB-',
29
- '{' => '-LCB-',
30
- '}' => '-RCB-'
31
- }
32
-
33
- AlignedPhraseTags =
34
- [
35
- 'Adjectival phrase', ['', '', 'ADJP', '', '', 'AP'],
36
- 'Adverbial phrase', ['', '', 'ADVP', '', '', 'AdP'],
37
- 'Conjunction phrase', ['', '', 'CONJP', '', '', 'Ssub'],
38
- 'Fragment', ['', '', 'FRAG', '', '', ''],
39
- 'Interjectional phrase', ['', '', 'INTJ', '', '', ''],
40
- 'List marker', ['', '', 'LST', '', '', ''],
41
- 'Not a phrase', ['', '', 'NAC', '', '', ''],
42
- 'Noun phrase', ['', '', 'NP', '', '', 'NP'],
43
- 'Verbal nucleus', ['', '', '', '', '', 'VN'],
44
- 'Head of noun phrase', ['', '', 'NX', '', '', ''],
45
- 'Prepositional phrase', ['', '', 'PP', '', '', 'PP'],
46
- 'Parenthetical', ['', '', 'PRN', '', '', ''],
47
- 'Particle', ['', '', 'PRT', '', '', ''],
48
- 'Participial phrase', ['', '', '', '', '', 'VPart'],
49
- 'Quantifier phrase', ['', '', 'QP', '', '', ''],
50
- 'Relative clause', ['', '', 'RRC', '', '', 'Srel'],
51
- 'Coordinated phrase', ['', '', 'UCP', '', '', 'COORD'],
52
- 'Infinitival phrase', ['', '', '', '', '', 'VPinf'],
53
- 'Verb phrase', ['', '', 'VP', '', '', ''],
54
- 'Wh adjective phrase', ['', '', 'WHADJP', '', '', ''],
55
- 'Wh adverb phrase', ['', '', 'WHAVP', '', '', ''],
56
- 'Wh noun phrase', ['', '', 'WHNP', '', '', ''],
57
- 'Wh prepositional phrase', ['', '', 'WHPP', '', '', ''],
58
- 'Unknown', ['', '', 'X', '', '', ''],
59
- 'Phrase', ['', '', 'P', '', '', 'Sint'],
60
- 'Sentence', ['', '', 'S', '', '', 'SENT'],
61
- 'Phrase', ['', '', 'SBAR', '', '', ''] # Fix
62
- ]
63
-
64
- # A description of Enju categories.
65
- EnjuCatDescription = [
66
- ['ADJ', 'Adjective'],
67
- ['ADV', 'Adverb'],
68
- ['CONJ', 'Coordination conjunction'],
69
- ['C', 'Complementizer'],
70
- ['D', 'Determiner'],
71
- ['N', 'Noun'],
72
- ['P', 'Preposition'],
73
- ['SC', 'Subordination conjunction'],
74
- ['V', 'Verb'],
75
- ['COOD', 'Part of coordination'],
76
- ['PN', 'Punctuation'],
77
- ['PRT', 'Particle'],
78
- ['S', 'Sentence']
79
- ]
80
-
81
- # Maps Enju categories to Treat categories.
82
- EnjuCatToCategory = {
83
- 'ADJ' => :adjective,
84
- 'ADV' => :adverb,
85
- 'CONJ' => :conjunction,
86
- 'COOD' => :conjunction,
87
- 'C' => :complementizer,
88
- 'D' => :determiner,
89
- 'N' => :noun,
90
- 'P' => :preposition,
91
- 'PN' => :punctuation,
92
- 'SC' => :conjunction,
93
- 'V' => :verb,
94
- 'PRT' => :particle
95
- }
96
-
97
- # Description of the xcat in the Enju output specification.
98
- EnjuXCatDescription = [
99
- ['COOD', 'Coordinated phrase/clause'],
100
- ['IMP', 'Imperative sentence'],
101
- ['INV', 'Subject-verb inversion'],
102
- ['Q', 'Interrogative sentence with subject-verb inversion'],
103
- ['REL', 'A relativizer included'],
104
- ['FREL', 'A free relative included'],
105
- ['TRACE', 'A trace included'],
106
- ['WH', 'A wh-question word included']
107
- ]
108
-
109
- EnjuCatXcatToPTB = [
110
- ['ADJP', '', 'ADJP'],
111
- ['ADJP', 'REL', 'WHADJP'],
112
- ['ADJP', 'FREL', 'WHADJP'],
113
- ['ADJP', 'WH', 'WHADJP'],
114
- ['ADVP', '', 'ADVP'],
115
- ['ADVP', 'REL', 'WHADVP'],
116
- ['ADVP', 'FREL', 'WHADVP'],
117
- ['ADVP', 'WH', 'WHADVP'],
118
- ['CONJP', '', 'CONJP'],
119
- ['CP', '', 'SBAR'],
120
- ['DP', '', 'NP'],
121
- ['NP', '', 'NP'],
122
- ['NX', 'NX', 'NAC'],
123
- ['NP' 'REL' 'WHNP'],
124
- ['NP' 'FREL' 'WHNP'],
125
- ['NP' 'WH' 'WHNP'],
126
- ['PP', '', 'PP'],
127
- ['PP', 'REL', 'WHPP'],
128
- ['PP', 'WH', 'WHPP'],
129
- ['PRT', '', 'PRT'],
130
- ['S', '', 'S'],
131
- ['S', 'INV', 'SINV'],
132
- ['S', 'Q', 'SQ'],
133
- ['S', 'REL', 'SBAR'],
134
- ['S', 'FREL', 'SBAR'],
135
- ['S', 'WH', 'SBARQ'],
136
- ['SCP', '', 'SBAR'],
137
- ['VP', '', 'VP'],
138
- ['VP', '', 'VP'],
139
- ['', '', 'UK']
140
- ]
141
-
142
- # Aligned tags for the Claws C5, Brown and Penn tag sets.
143
- # Adapted from Manning, Christopher and Schütze, Hinrich,
144
- # 1999. Foundations of Statistical Natural Language
145
- # Processing. MIT Press, p. 141-142;
146
- # http://www.isocat.org/rest/dcs/376;
147
- #
148
- # JRS?
149
-
150
-
151
- Paris7WordTagToCategory = {
152
- 'C' => :complementizer,
153
- 'PN' => :punctuation,
154
- 'SC' => :conjunction
155
- }
156
-
157
- PunctuationToCategory = {
158
- '.' => :period,
159
- ',' => :comma,
160
- ';' => :semicolon,
161
- ':' => :colon,
162
- '!' => :exclamation,
163
- '?' => :interrogation,
164
- '"' => :double_quote,
165
- "'" => :single_quote,
166
- '$' => :dollar,
167
- '%' => :percent,
168
- '#' => :hash,
169
- '*' => :asterisk,
170
- '&' => :ampersand,
171
- '+' => :plus,
172
- '-' => :dash,
173
-
174
- '/' => :slash,
175
- '\\' => :backslash,
176
- '^' => :caret,
177
- '_' => :underscore,
178
- '`' => :tick,
179
- '|' => :pipe,
180
- '~' => :tilde,
181
- '@' => :at,
182
-
183
- '[' => :bracket,
184
- ']' => :bracket,
185
- '{' => :brace,
186
- '}' => :brace,
187
- '(' => :parenthesis,
188
- ')' => :parenthesis,
189
-
190
- '<' => :tag,
191
- '>' => :tag
192
- }
193
-
194
- AlignedWordTags = [
195
-
196
- 'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'A'],
197
- 'Adjective', ['AJ0', 'JJ', 'JJ', '', 'JJ', 'ADJ'],
198
- 'Ajective, adverbial or predicative', ['', '', '', 'ADJD', '', 'ADJ'],
199
- 'Adjective, attribute', ['', '', '', 'ADJA', 'VA', 'ADJ'],
200
- 'Adjective, ordinal number', ['ORD', 'OD', 'JJ', '', 'OD', 'ADJ'],
201
- 'Adjective, comparative', ['AJC', 'JJR', 'JJR', 'KOKOM', '', 'ADJ'],
202
- 'Adjective, superlative', ['AJS', 'JJT', 'JJS', '', 'JJ', 'ADJ'],
203
- 'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ', '', '', 'ADJ'],
204
- 'Adjective, cardinal number', ['CRD', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
205
- 'Adjective, cardinal number, one', ['PNI', 'CD', 'CD', 'CARD', 'CD', 'ADJ'],
206
-
207
- 'Adverb', ['AV0', 'RB', 'RB', 'ADV', 'AD', 'ADV'],
208
- 'Adverb, negative', ['XX0', '*', 'RB', 'PTKNEG', '', 'ADV'],
209
- 'Adverb, comparative', ['AV0', 'RBR', 'RBR', '', 'AD', 'ADV'],
210
- 'Adverb, superlative', ['AV0', 'RBT', 'RBS', '', 'AD', 'ADV'],
211
- 'Adverb, particle', ['AVP', 'RP', 'RP', '', '', 'ADV'],
212
- 'Adverb, question', ['AVQ', 'WRB', 'WRB', '', 'AD', 'ADV'],
213
- 'Adverb, degree & question', ['AVQ', 'WQL', 'WRB', '', 'ADV'],
214
- 'Adverb, degree', ['AV0', 'QL', 'RB', '', '', 'ADV'],
215
- 'Adverb, degree, postposed', ['AV0', 'QLP', 'RB', '', '', 'ADV'],
216
- 'Adverb, nominal', ['AV0', 'RN', 'RB', 'PROP', '', 'ADV'],
217
- 'Adverb, pronominal', ['', '', '', '', 'PROP', '', 'ADV'],
218
-
219
- 'Conjunction, coordination', ['CJC', 'CC', 'CC', 'KON', 'CC', 'COOD'],
220
- 'Conjunction, coordination, and', ['CJC', 'CC', 'CC', 'KON', 'CC', 'ET'],
221
- 'Conjunction, subordination', ['CJS', 'CS', 'IN', 'KOUS', 'CS', 'CONJ'],
222
- 'Conjunction, subordination with to and infinitive', ['', '', '', 'KOUI', '', ''],
223
- 'Conjunction, complementizer, that', ['CJT', 'CS', 'IN', '', '', 'C'],
224
-
225
- 'Determiner', ['DT0', 'DT', 'DT', '', 'DT', 'D'],
226
- 'Determiner, pronoun', ['DT0', 'DTI', 'DT', '', '', 'D'],
227
- 'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT', '', '', 'D'],
228
- 'Determiner, prequalifier', ['DT0', 'ABL', 'DT', '', '', 'D'],
229
- 'Determiner, prequantifier', ['DT0', 'ABN', 'PDT', '', 'DT', 'D'],
230
- 'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT', '', '', 'D'],
231
- 'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT', '', '', 'D'],
232
- 'Determiner, article', ['AT0', 'AT', 'DT', 'ART', '', 'D'],
233
- 'Determiner, postdeterminer', ['DT0', 'AP', 'DT', '', '', 'D'],
234
- 'Determiner, possessive', ['DPS', 'PP$', 'PRP$', '', '', 'D'],
235
- 'Determiner, possessive, second', ['DPS', 'PP$', 'PRPS', '', '', 'D'],
236
- 'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
237
- 'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
238
- 'Interjection', ['', '', '', '', '', 'I'],
239
- 'Localizer', ['', '', '', '', 'LC'],
240
-
241
- 'Measure word', ['', '', '', '', 'M'],
242
-
243
- 'Noun, common', ['NN0', 'NN', 'NN', 'N', 'NN', 'NN'],
244
- 'Noun, singular', ['NN1', 'NN', 'NN', 'NN', 'NN', 'N'],
245
- 'Noun, plural', ['NN2', 'NNS', 'NNS', 'NN', 'NN', 'N'],
246
- 'Noun, proper, singular', ['NP0', 'NP', 'NNP', 'NE', 'NR', 'N'],
247
- 'Noun, proper, plural', ['NP0', 'NPS', 'NNPS', 'NE', 'NR', 'N'],
248
- 'Noun, adverbial', ['NN0', 'NR', 'NN', 'NE', '', 'N'],
249
- 'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS', '', 'N'],
250
- 'Noun, temporal', ['', '', '', '', 'NT', 'N'],
251
- 'Noun, verbal', ['', '', '', '', 'NN', 'N'],
252
-
253
- 'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP', '', 'PN', 'CL'],
254
- 'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP', 'PPER'],
255
- 'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP', 'PPER'],
256
- 'Pronoun, personal, object', ['PNP', 'PPO', 'PRP', 'PPER'],
257
- 'Pronoun, reflexive', ['PNX', 'PPL', 'PRP', 'PRF'],
258
- 'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP', 'PRF'],
259
- 'Pronoun, question, subject', ['PNQ', 'WPS', 'WP', 'PWAV'],
260
- 'Pronoun, question, subject', ['PNQ', 'WPS', 'WPS', 'PWAV'], # Hack
261
- 'Pronoun, question, object', ['PNQ', 'WPO', 'WP', 'PWAV', 'PWAT'],
262
- 'Pronoun, existential there', ['EX0', 'EX', 'EX'],
263
- 'Pronoun, attributive demonstrative', ['', '', '', 'PDAT'],
264
- 'Prounoun, attributive indefinite without determiner', ['', '', '', 'PIAT'],
265
- 'Pronoun, attributive possessive', ['', '', '', 'PPOSAT', ''],
266
- 'Pronoun, substituting demonstrative', ['', '', '', 'PDS'],
267
- 'Pronoun, substituting possessive', ['', '', '', 'PPOSS', ''],
268
- 'Prounoun, substituting indefinite', ['', '', '', 'PIS'],
269
- 'Pronoun, attributive relative', ['', '', '', 'PRELAT', ''],
270
- 'Pronoun, substituting relative', ['', '', '', 'PRELS', ''],
271
- 'Pronoun, attributive interrogative', ['', '', '', 'PWAT'],
272
- 'Pronoun, adverbial interrogative', ['', '', '', 'PWAV'],
273
-
274
- 'Pronoun, substituting interrogative', ['', '', '', 'PWS'],
275
- 'Verb, main, finite', ['', '', '', 'VVFIN', '', 'V'],
276
- 'Verb, main, infinitive', ['', '', '', 'VVINF', '', 'V'],
277
- 'Verb, main, imperative', ['', '', '', 'VVIMP', '', 'V'],
278
- 'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP', '', '', 'V'],
279
- 'Verb, infinitive', ['VVI', 'VB', 'VB', 'V', '', 'V'],
280
- 'Verb, past tense', ['VVD', 'VBD', 'VBD', '', '', 'V'],
281
- 'Verb, present participle', ['VVG', 'VBG', 'VBG', 'VAPP', '', 'V'],
282
- 'Verb, past/passive participle', ['VVN', 'VBN', 'VBN', 'VVPP', '', 'V'],
283
- 'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ', '', '', 'V'],
284
- 'Verb, auxiliary', ['', '', '', 'VAFIN', '', 'V'],
285
- 'Verb, imperative', ['', '', '', 'VAIMP', '', 'V'],
286
- 'Verb, imperative infinitive', ['', '', '', 'VAINF', '', 'V'],
287
- 'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP', '', '', 'V'],
288
- 'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB', '', '', 'V'],
289
- 'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD', '', '', 'V'],
290
- 'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG', '', '', 'V'],
291
- 'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN', '', '', 'V'],
292
- 'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ', '', '', 'V'],
293
- 'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP', 'VA', '', 'V'],
294
- 'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB', 'VAINF', '', 'V'],
295
- 'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD', 'VA', '', 'V'],
296
- 'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG', 'VA', '', 'V'],
297
- 'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN', 'VAPP', '', 'V'],
298
- 'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ', 'VA', '', 'V'],
299
- 'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB', '', '', 'V'],
300
- 'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD', '', '', 'V'],
301
- 'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD', '', '', 'V'],
302
- 'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG', '', '', 'V'],
303
- 'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN', '', '', 'V'],
304
- 'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ', '', '', 'V'],
305
- 'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP', '', '', 'V'],
306
- 'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP', '', '', 'V'],
307
- 'Verb, modal', ['VM0', 'MD', 'MD', 'VMFIN', 'VV', 'V'],
308
- 'Verb, modal', ['VM0', 'MD', 'MD', 'VMINF', 'VV', 'V'],
309
- 'Verb, modal, finite', ['', '', '', '', 'VMFIN', 'V'],
310
- 'Verb, modal, infinite', ['', '', '', '', 'VMINF', 'V'],
311
- 'Verb, modal, past participle', ['', '', '', '', 'VMPP', 'V'],
312
-
313
- 'Particle', ['', '', '', '', '', 'PRT'],
314
- 'Particle, with adverb', ['', '', '', 'PTKA', '', 'PRT'],
315
- 'Particle, answer', ['', '', '', 'PTKANT', '', 'PRT'],
316
- 'Particle, negation', ['', '', '', 'PTKNEG', '', 'PRT'],
317
- 'Particle, separated verb', ['', '', '', 'PTKVZ', '', 'PRT'],
318
- 'Particle, to as infinitive marker', ['TO0', 'TO', 'TO', 'PTKZU', '', 'PRT'],
319
-
320
- 'Preposition, comparative', ['', '', '', 'KOKOM', '', 'P'],
321
- 'Preposition, to', ['PRP', 'IN', 'TO', '', '', 'P'],
322
- 'Preposition', ['PRP', 'IN', 'IN', 'APPR', 'P', 'P'],
323
- 'Preposition, with aritcle', ['', '', '', 'APPART', '', 'P'],
324
- 'Preposition, of', ['PRF', 'IN', 'IN', '', '', 'P'],
325
-
326
- 'Possessive', ['POS', '$', 'POS'],
327
-
328
- 'Postposition', ['', '', '', 'APPO'],
329
-
330
- 'Circumposition, right', ['', '', '', 'APZR', ''],
331
-
332
- 'Interjection, onomatopoeia or other isolate', ['ITJ', 'UH', 'UH', 'ITJ', 'IJ'],
333
-
334
- 'Onomatopoeia', ['', '', '', '', 'ON'],
335
-
336
- 'Punctuation', ['', '', '', '', 'PU', 'PN'],
337
- 'Punctuation, sentence ender', ['PUN', '.', '.', '', '', 'PN'],
338
-
339
- 'Punctuation, semicolon', ['PUN', '.', '.', '', '', 'PN'],
340
- 'Puncutation, colon or ellipsis', ['PUN', ':', ':'],
341
- 'Punctuationm, comma', ['PUN', ',', ',', '$,'],
342
- 'Punctuation, dash', ['PUN', '-', '-'],
343
- 'Punctuation, dollar sign', ['PUN', '', '$'],
344
- 'Punctuation, left bracket', ['PUL', '(', '(', '$('],
345
- 'Punctuation, right bracket', ['PUR', ')', ')'],
346
- 'Punctuation, quotation mark, left', ['PUQ', '', '``'],
347
- 'Punctuation, quotation mark, right', ['PUQ', '', '"'],
348
-
349
- 'Punctuation, left bracket', ['PUL', '(', 'PPL'],
350
- 'Punctuation, right bracket', ['PUR', ')', 'PPR'],
351
- 'Punctuation, left square bracket', ['PUL', '(', 'LSB'],
352
- 'Punctuation, right square bracket', ['PUR', ')', 'RSB'],
353
- 'Punctuation, left curly bracket', ['PUL', '(', 'LCB'],
354
- 'Punctuation, right curly bracket', ['PUR', ')', 'RCB'],
355
-
356
- 'Unknown, foreign words (not in lexicon)', ['UNZ', '(FW-)', 'FW', '', 'FW'],
357
-
358
- 'Symbol', ['', '', 'SYM', 'XY'],
359
- 'Symbol, alphabetical', ['ZZ0', '', ''],
360
- 'Symbol, list item', ['', '', 'LS'],
361
-
362
- # Not sure about these tags from the Chinese PTB.
363
- 'Aspect marker', ['', '', '', '', 'AS'], # ?
364
- 'Ba-construction', ['', '', '', '', 'BA'], # ?
365
- 'In relative', ['', '', '', '', 'DEC'], # ?
366
- 'Associative', ['', '', '', '', 'DER'], # ?
367
- 'In V-de or V-de-R construct', ['', '', '', '', 'DER'], # ?
368
- 'For words ? ', ['', '', '', '', 'ETC'], # ?
369
- 'In long bei-construct', ['', '', '', '', 'LB'], # ?
370
- 'In short bei-construct', ['', '', '', '', 'SB'], # ?
371
- 'Sentence-nal particle', ['', '', '', '', 'SB'], # ?
372
- 'Particle, other', ['', '', '', '', 'MSP'], # ?
373
- 'Before VP', ['', '', '', '', 'DEV'], # ?
374
- 'Verb, ? as main verb', ['', '', '', '', 'VE'], # ?
375
- 'Verb, ????', ['', '', '', '', 'VC'] # ?
376
- ]
377
-
378
- # Paris7 Treebank functional tags
379
- =begin
380
- SUJ (subject)
381
- OBJ (direct object)
382
- ATS (predicative complement of a subject)
383
- ATO (predicative complement of a direct object)
384
- MOD (modifier or adjunct)
385
- A-OBJ (indirect complement introduced by à)
386
- DE-OBJ (indirect complement introduced by de)
387
- P-OBJ (indirect complement introduced by another preposition)
388
- =end
389
-
390
- # !! Extremely ugly code follows.
391
-
392
- # Generate word tag -> category hash.
393
- wttc = {}
394
-
395
- Treat::Universalisation::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
396
-
397
- category = desc.gsub(',', ' ,').
398
- split(' ')[0].downcase.intern
399
-
400
- wttc[tags[ClawsC5]] ||= {}
401
- wttc[tags[Brown]] ||= {}
402
- wttc[tags[Penn]] ||= {}
403
- wttc[tags[Stuttgart]] ||= {}
404
- wttc[tags[PennChinese]] ||= {}
405
- wttc[tags[Paris7]] ||= {}
406
-
407
- wttc[tags[ClawsC5]][:claws_5] = category
408
- wttc[tags[Brown]][:brown] = category
409
- wttc[tags[Penn]][:penn] = category
410
- wttc[tags[Stuttgart]][:stuttgart] = category if tags[Stuttgart]
411
- wttc[tags[PennChinese]][:penn_chinese] = category if tags[PennChinese]
412
- wttc[tags[Paris7]][:paris7] = category if tags[Paris7]
413
-
414
- end
415
-
416
- # A hash converting word tags to word categories.
417
- WordTagToCategory = wttc
418
-
419
- # A hash converting phrase tag to categories.
420
- pttc = {}
421
-
422
- Treat::Universalisation::Tags::AlignedPhraseTags.each_slice(2) do |desc, tags|
423
-
424
- category = desc.gsub(',', ' ,').gsub(' ', '_').downcase.intern
425
-
426
- pttc[tags[Penn]] ||= {};
427
- pttc[tags[Paris7]] ||= {};
428
-
429
- pttc[tags[Penn]][:penn] = category
430
- pttc[tags[Paris7]][:paris7] = category
431
-
432
- # Not yet for other tag sts.
433
- #pttc[tags[0]][:claws_5] = category
434
- #pttc[tags[1]][:brown] = category
435
-
436
- end
437
-
438
- # A hash converting word tags to word categories.
439
- PhraseTagToCategory = pttc
440
-
441
- def self.describe(tag, tag_set)
442
- if PhraseTagToCategory[tag] &&
443
- PhraseTagToCategory[tag_set] &&
444
- WordTagToCategory[tag] &&
445
- WordTagToCategory[tag_set]
446
- end
447
- end
448
-
449
- def self.convert(tag, from, to)
450
-
451
- end
452
-
453
- end
@@ -1,9 +0,0 @@
1
- module Treat::Universalisation
2
-
3
- p = 'treat/universalisation/*.rb'
4
-
5
- Dir[Treat.lib + p].each do |f|
6
- require f
7
- end
8
-
9
- end
data/spec/languages.rb DELETED
@@ -1,25 +0,0 @@
1
- require_relative '../lib/treat'
2
-
3
- describe Treat::Languages do
4
-
5
- describe "#code(language, iso = 2)" do
6
-
7
- it "returns the language code given a full-length " +
8
- "lowercase identifier representing a language, in " +
9
- "the specified ISO-639 format (1 or 2)" do
10
- Treat::Languages.code(:english, 2).should eql :eng
11
- Treat::Languages.code(:english, 1).should eql :en
12
- end
13
-
14
- end
15
-
16
- describe "#describe(code)" do
17
-
18
- it "returns a lowercase identifier representing the " +
19
- "full name of a language, given its ISO-639-1/2 code." do
20
- Treat::Languages.describe(:eng).should eql :english
21
- end
22
-
23
- end
24
-
25
- end