treat 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. data/INSTALL +0 -0
  2. data/LICENSE +28 -0
  3. data/README +0 -0
  4. data/TODO +67 -0
  5. data/bin/INFO +1 -0
  6. data/examples/benchmark.rb +81 -0
  7. data/examples/keywords.rb +60 -0
  8. data/examples/texts/bugged_out.txt +26 -0
  9. data/examples/texts/half_cocked_basel.txt +16 -0
  10. data/examples/texts/hedge_funds.txt +24 -0
  11. data/examples/texts/hose_and_dry.txt +19 -0
  12. data/examples/texts/hungarys_troubles.txt +46 -0
  13. data/examples/texts/indias_slowdown.txt +15 -0
  14. data/examples/texts/merkozy_rides_again.txt +24 -0
  15. data/examples/texts/prada_is_not_walmart.txt +9 -0
  16. data/examples/texts/republican_nomination.txt +26 -0
  17. data/examples/texts/to_infinity_and_beyond.txt +15 -0
  18. data/lib/treat.rb +91 -0
  19. data/lib/treat/buildable.rb +115 -0
  20. data/lib/treat/categories.rb +29 -0
  21. data/lib/treat/category.rb +28 -0
  22. data/lib/treat/delegatable.rb +90 -0
  23. data/lib/treat/detectors.rb +28 -0
  24. data/lib/treat/detectors/encoding/native.rb +12 -0
  25. data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
  26. data/lib/treat/detectors/format/file.rb +36 -0
  27. data/lib/treat/detectors/language/language_detector.rb +19 -0
  28. data/lib/treat/detectors/language/what_language.rb +29 -0
  29. data/lib/treat/entities.rb +52 -0
  30. data/lib/treat/entities/collection.rb +19 -0
  31. data/lib/treat/entities/constituents.rb +15 -0
  32. data/lib/treat/entities/document.rb +11 -0
  33. data/lib/treat/entities/entity.rb +242 -0
  34. data/lib/treat/entities/sentence.rb +8 -0
  35. data/lib/treat/entities/text.rb +7 -0
  36. data/lib/treat/entities/tokens.rb +37 -0
  37. data/lib/treat/entities/zones.rb +17 -0
  38. data/lib/treat/exception.rb +5 -0
  39. data/lib/treat/extractors.rb +41 -0
  40. data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
  41. data/lib/treat/extractors/named_entity/abner.rb +20 -0
  42. data/lib/treat/extractors/named_entity/stanford.rb +174 -0
  43. data/lib/treat/extractors/statistics/frequency.rb +22 -0
  44. data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
  45. data/lib/treat/extractors/statistics/position_in.rb +13 -0
  46. data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
  47. data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
  48. data/lib/treat/extractors/time/chronic.rb +12 -0
  49. data/lib/treat/extractors/time/native.rb +12 -0
  50. data/lib/treat/extractors/time/nickel.rb +45 -0
  51. data/lib/treat/extractors/topic_words/lda.rb +71 -0
  52. data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
  53. data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
  54. data/lib/treat/extractors/topics/reuters.rb +91 -0
  55. data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
  56. data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
  57. data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
  58. data/lib/treat/feature.rb +53 -0
  59. data/lib/treat/formatters.rb +44 -0
  60. data/lib/treat/formatters/cleaners/html.rb +17 -0
  61. data/lib/treat/formatters/readers/autoselect.rb +35 -0
  62. data/lib/treat/formatters/readers/gocr.rb +24 -0
  63. data/lib/treat/formatters/readers/html.rb +13 -0
  64. data/lib/treat/formatters/readers/ocropus.rb +31 -0
  65. data/lib/treat/formatters/readers/pdf.rb +17 -0
  66. data/lib/treat/formatters/readers/txt.rb +15 -0
  67. data/lib/treat/formatters/serializers/xml.rb +48 -0
  68. data/lib/treat/formatters/serializers/yaml.rb +15 -0
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
  70. data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
  71. data/lib/treat/formatters/unserializers/xml.rb +79 -0
  72. data/lib/treat/formatters/unserializers/yaml.rb +15 -0
  73. data/lib/treat/formatters/visualizers/dot.rb +73 -0
  74. data/lib/treat/formatters/visualizers/html.rb +12 -0
  75. data/lib/treat/formatters/visualizers/inspect.rb +16 -0
  76. data/lib/treat/formatters/visualizers/short_value.rb +14 -0
  77. data/lib/treat/formatters/visualizers/standoff.rb +41 -0
  78. data/lib/treat/formatters/visualizers/tree.rb +28 -0
  79. data/lib/treat/formatters/visualizers/txt.rb +31 -0
  80. data/lib/treat/group.rb +96 -0
  81. data/lib/treat/inflectors.rb +50 -0
  82. data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
  83. data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
  84. data/lib/treat/inflectors/declensors/en.rb +18 -0
  85. data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
  86. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
  87. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
  88. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
  89. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
  90. data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
  91. data/lib/treat/inflectors/stemmers/porter.rb +158 -0
  92. data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
  93. data/lib/treat/inflectors/stemmers/uea.rb +30 -0
  94. data/lib/treat/lexicalizers.rb +49 -0
  95. data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
  96. data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
  97. data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
  98. data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
  99. data/lib/treat/lexicalizers/tag/brill.rb +101 -0
  100. data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
  101. data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
  102. data/lib/treat/processors.rb +45 -0
  103. data/lib/treat/processors/chunkers/txt.rb +27 -0
  104. data/lib/treat/processors/parsers/enju.rb +214 -0
  105. data/lib/treat/processors/parsers/stanford.rb +60 -0
  106. data/lib/treat/processors/segmenters/punkt.rb +48 -0
  107. data/lib/treat/processors/segmenters/stanford.rb +45 -0
  108. data/lib/treat/processors/segmenters/tactful.rb +34 -0
  109. data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
  110. data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
  111. data/lib/treat/processors/tokenizers/perl.rb +96 -0
  112. data/lib/treat/processors/tokenizers/punkt.rb +42 -0
  113. data/lib/treat/processors/tokenizers/stanford.rb +33 -0
  114. data/lib/treat/processors/tokenizers/tactful.rb +59 -0
  115. data/lib/treat/proxies.rb +66 -0
  116. data/lib/treat/registrable.rb +26 -0
  117. data/lib/treat/resources.rb +10 -0
  118. data/lib/treat/resources/categories.rb +18 -0
  119. data/lib/treat/resources/delegates.rb +96 -0
  120. data/lib/treat/resources/dependencies.rb +0 -0
  121. data/lib/treat/resources/edges.rb +8 -0
  122. data/lib/treat/resources/formats.rb +23 -0
  123. data/lib/treat/resources/languages.rb +86 -0
  124. data/lib/treat/resources/languages.txt +504 -0
  125. data/lib/treat/resources/tags.rb +393 -0
  126. data/lib/treat/sugar.rb +43 -0
  127. data/lib/treat/tree.rb +174 -0
  128. data/lib/treat/utilities.rb +127 -0
  129. data/lib/treat/visitable.rb +27 -0
  130. data/test/profile.rb +2 -0
  131. data/test/tc_detectors.rb +27 -0
  132. data/test/tc_entity.rb +105 -0
  133. data/test/tc_extractors.rb +48 -0
  134. data/test/tc_formatters.rb +46 -0
  135. data/test/tc_inflectors.rb +39 -0
  136. data/test/tc_lexicalizers.rb +39 -0
  137. data/test/tc_processors.rb +36 -0
  138. data/test/tc_resources.rb +27 -0
  139. data/test/tc_treat.rb +64 -0
  140. data/test/tc_tree.rb +60 -0
  141. data/test/tests.rb +19 -0
  142. data/test/texts.rb +20 -0
  143. data/test/texts/english/long.html +24 -0
  144. data/test/texts/english/long.txt +22 -0
  145. data/test/texts/english/medium.txt +5 -0
  146. data/test/texts/english/short.txt +3 -0
  147. metadata +412 -0
@@ -0,0 +1,393 @@
# encoding: UTF-8

module Treat
  module Resources
    # Static lookup tables describing the part-of-speech and phrase
    # tag sets known to the library (Claws C5, Brown, Penn Treebank
    # and Enju), plus tables aligning them with each other.
    class Tags

      # Column indices of each tag set inside the tag triples stored
      # in AlignedWordTags (Claws C5, Brown, Penn); Enju has no column
      # in that table.
      ClawsC5 = 0
      Brown = 1
      Penn = 2
      Enju = 3

      # Maps a Penn Treebank word-level tag to a coarse word category.
      PTBWordTagToCategory = {
        'CC' => :conjunction,   # Coordinating conjunction
        'CD' => :number,        # Cardinal number
        'DT' => :determiner,    # Determiner
        'DET' => :determiner,   # Determiner
        'EX' => :determiner,    # Existential there
        'FW' => :foreign,       # Foreign word
        'IN' => :preposition,   # Preposition or subordinating conjunction
        'JJ' => :adjective,     # Adjective
        'JJR' => :adjective,    # Adjective, comparative
        'JJS' => :adjective,    # Adjective, superlative
        'LS' => :list,          # List item marker
        'MD' => :modal,         # Modal
        'NN' => :noun,          # Noun, singular or mass
        'NNS' => :noun,         # Noun, plural
        'NNP' => :noun,         # Proper noun, singular
        'NNPS' => :noun,        # Proper noun, plural
        'PDT' => :determiner,   # Predeterminer
        'POS' => :determiner,   # Possessive ending
        'PRP' => :pronoun,      # Personal pronoun
        'PRP$' => :pronoun,     # Possessive pronoun
        'PRPS' => :determiner,  # Possessive determiner
        'RB' => :adverb,        # Adverb
        'RBR' => :adverb,       # Adverb, comparative
        'RBS' => :adverb,       # Adverb, superlative
        'RP' => :particle,      # Particle
        'SYM' => :symbol,       # Symbol
        'TO' => :to,            # to
        'UH' => :interjection,  # Interjection
        'VB' => :verb,          # Verb, base form
        'VBD' => :verb,         # Verb, past tense
        'VBG' => :verb,         # Verb, gerund or present participle
        'VBN' => :verb,         # Verb, past participle
        'VBP' => :verb,         # Verb, non-3rd person singular present
        'VBZ' => :verb,         # Verb, 3rd person singular present
        'WDT' => :determiner,   # Wh-determiner
        'WP' => :pronoun,       # Wh-pronoun
        'WP$' => :pronoun,      # Possessive wh-pronoun
        'WRB' => :adverb,       # Wh-adverb
        ')' => :punctuation,    # Right bracket
        '(' => :punctuation,    # Left bracket
        '.' => :punctuation,    # Period
        '\'\'' => :symbol,      # Quote
        ',' => :punctuation,
        ';' => :punctuation
      }

      # [tag, description] pairs for Penn Treebank clause-level tags.
      PTBClauseTagDescription = [
        ['S', 'Simple declarative clause'],
        ['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
        ['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
        ['SINV', 'Inverted declarative sentence'],
        ['SQ', 'Inverted yes/no question']
      ]

      # [tag, description] pairs for Penn Treebank phrase-level tags.
      PTBPhraseTagDescription = [
        ['ADJP', 'Adjective phrase'],
        ['ADVP', 'Adverb phrase'],
        ['CONJP', 'Conjunction phrase'],
        ['FRAG', 'Fragment'],
        ['INTJ', 'Interjection'],
        ['LST', 'List marker'],
        ['NAC', 'Not a constituent'],
        ['NP', 'Noun phrase'],
        ['NX', 'Head of an NP'],
        ['PP', 'Prepositional phrase'],
        ['PRN', 'Parenthetical'],
        ['PRT', 'Particle'],
        ['QP', 'Quantifier phrase'],
        ['RRC', 'Reduced relative clause'],
        ['UCP', 'Unlike coordinated phrase'],
        ['VP', 'Verb phrase'],
        ['WHADJP', 'Wh-adjective phrase'],
        # Was misspelled 'WHAVP'; 'WHADVP' is the spelling used by
        # EnjuCatXcatToPTB below.
        ['WHADVP', 'Wh-adverb phrase'],
        ['WHNP', 'Wh-noun phrase'],
        ['WHPP', 'Wh-prepositional phrase'],
        ['X', 'Unknown, uncertain, or unbracketable']
      ]

      # Placeholder: the array is intentionally still empty. The
      # reference listing is kept as line comments (rather than a
      # =begin/=end block, which only parses when it starts at
      # column 0) until it is turned into data.
      PTBWordTagDescription = [
        # CC - Coordinating conjunction
        # CD - Cardinal number
        # DT - Determiner
        # EX - Existential there
        # FW - Foreign word
        # IN - Preposition or subordinating conjunction
        # JJ - Adjective
        # JJR - Adjective, comparative
        # JJS - Adjective, superlative
        # LS - List item marker
        # MD - Modal
        # NN - Noun, singular or mass
        # NNS - Noun, plural
        # NNP - Proper noun, singular
        # NNPS - Proper noun, plural
        # PDT - Predeterminer
        # POS - Possessive ending
        # PRP - Personal pronoun
        # PRP$ - Possessive pronoun (prolog version PRP-S)
        # RB - Adverb
        # RBR - Adverb, comparative
        # RBS - Adverb, superlative
        # RP - Particle
        # SYM - Symbol
        # TO - to
        # UH - Interjection
        # VB - Verb, base form
        # VBD - Verb, past tense
        # VBG - Verb, gerund or present participle
        # VBN - Verb, past participle
        # VBP - Verb, non-3rd person singular present
        # VBZ - Verb, 3rd person singular present
        # WDT - Wh-determiner
        # WP - Wh-pronoun
        # WP$ - Possessive wh-pronoun (prolog version WP-S)
        # WRB - Wh-adverb
      ]

      # Placeholder: the array is intentionally still empty. The
      # Brown tag reference is kept as line comments for the same
      # reason as PTBWordTagDescription.
      BrownWordTagDescription = [
        # Tag Description Examples
        #
        # . sentence closer . ; ? !
        # ( left paren
        # ) right paren
        # * not, n't
        # -- dash
        # , comma
        # : colon
        # ABL pre-qualifier quite, rather
        # ABN pre-quantifier half, all
        # ABX pre-quantifier both
        # AP post-determiner many, several, next
        # AT article a, the, no
        # BE be
        # BED were
        # BEDZ was
        # BEG being
        # BEM am
        # BEN been
        # BER are, art
        # BEZ is
        # CC coordinating conjunction and, or
        # CD cardinal numeral one, two, 2, etc.
        # CS subordinating conjunction if, although
        # DO do
        # DOD did
        # DOZ does
        # DT singular determiner this, that
        # DTI singular or plural determiner/quantifier some, any
        # DTS plural determiner these, those
        # DTX determiner/double conjunction either
        # EX existential there
        # FW foreign word (hyphenated before regular tag)
        # HL word occurring in headline (hyphenated after regular tag)
        # HV have
        # HVD had (past tense)
        # HVG having
        # HVN had (past participle)
        # HVZ has
        # IN preposition
        # JJ adjective
        # JJR comparative adjective
        # JJS semantically superlative adjective chief, top
        # JJT morphologically superlative adjective biggest
        # MD modal auxiliary can, should, will
        # NC cited word (hyphenated after regular tag)
        # NN singular or mass noun
        # NN$ possessive singular noun
        # NNS plural noun
        # NNS$ possessive plural noun
        # NP proper noun or part of name phrase
        # NP$ possessive proper noun
        # NPS plural proper noun
        # NPS$ possessive plural proper noun
        # NR adverbial noun home, today, west
        # NRS plural adverbial noun
        # OD ordinal numeral first, 2nd
        # PN nominal pronoun everybody, nothing
        # PN$ possessive nominal pronoun
        # PP$ possessive personal pronoun my, our
        # PP$$ second (nominal) possessive pronoun mine, ours
        # PPL singular reflexive/intensive personal pronoun myself
        # PPLS plural reflexive/intensive personal pronoun ourselves
        # PPO objective personal pronoun me, him, it, them
        # PPS 3rd. singular nominative pronoun he, she, it, one
        # PPSS other nominative personal pronoun I, we, they, you
        # QL qualifier very, fairly
        # QLP post-qualifier enough, indeed
        # RB adverb
        # RBR comparative adverb
        # RBT superlative adverb
        # RN nominal adverb here then, indoors
        # RP adverb/particle about, off, up
        # TL word occurring in title (hyphenated after regular tag)
        # TO infinitive marker to
        # UH interjection, exclamation
        # VB verb, base form
        # VBD verb, past tense
        # VBG verb, present participle/gerund
        # VBN verb, past participle
        # VBZ verb, 3rd. singular present
        # WDT wh- determiner what, which
        # WP$ possessive wh- pronoun whose
        # WPO objective wh- pronoun whom, which, that
        # WPS nominative wh- pronoun who, which, that
        # WQL wh- qualifier how
        # WRB wh- adverb how, where, when
      ]

      # [cat, description] pairs for the Enju `cat` attribute.
      EnjuCatDescription = [
        ['ADJ', 'Adjective'],
        ['ADV', 'Adverb'],
        ['CONJ', 'Coordination conjunction'],
        ['C', 'Complementizer'],
        ['D', 'Determiner'],
        ['N', 'Noun'],
        ['P', 'Preposition'],
        ['SC', 'Subordination conjunction'],
        ['V', 'Verb'],
        ['COOD', 'Part of coordination'],
        ['PN', 'Punctuation'],
        ['PRT', 'Particle'],
        ['S', 'Sentence']
      ]

      # Description of the xcat in the Enju output specification.
      EnjuXCatDescription = [
        ['COOD', 'Coordinated phrase/clause'],
        ['IMP', 'Imperative sentence'],
        ['INV', 'Subject-verb inversion'],
        ['Q', 'Interrogative sentence with subject-verb inversion'],
        ['REL', 'A relativizer included'],
        ['FREL', 'A free relative included'],
        ['TRACE', 'A trace included'],
        ['WH', 'A wh-question word included']
      ]

      # [Enju cat, Enju xcat, Penn Treebank phrase tag] triples.
      EnjuCatXcatToPTB = [
        ['ADJP', '', 'ADJP'],
        ['ADJP', 'REL', 'WHADJP'],
        ['ADJP', 'FREL', 'WHADJP'],
        ['ADJP', 'WH', 'WHADJP'],
        ['ADVP', '', 'ADVP'],
        ['ADVP', 'REL', 'WHADVP'],
        ['ADVP', 'FREL', 'WHADVP'],
        ['ADVP', 'WH', 'WHADVP'],
        ['CONJP', '', 'CONJP'],
        ['CP', '', 'SBAR'],
        ['DP', '', 'NP'],
        ['NP', '', 'NP'],
        ['NX', 'NX', 'NAC'],
        # The three NP rows were missing their commas, so adjacent
        # string literals were concatenated into one-element arrays
        # such as ['NPRELWHNP'].
        ['NP', 'REL', 'WHNP'],
        ['NP', 'FREL', 'WHNP'],
        ['NP', 'WH', 'WHNP'],
        ['PP', '', 'PP'],
        ['PP', 'REL', 'WHPP'],
        ['PP', 'WH', 'WHPP'],
        ['PRT', '', 'PRT'],
        ['S', '', 'S'],
        ['S', 'INV', 'SINV'],
        ['S', 'Q', 'SQ'],
        ['S', 'REL', 'SBAR'],
        ['S', 'FREL', 'SBAR'],
        ['S', 'WH', 'SBARQ'],
        ['SCP', '', 'SBAR'],
        ['VP', '', 'VP'],
        # NOTE(review): duplicate of the row above, kept as-is;
        # possibly meant for another Enju category - confirm.
        ['VP', '', 'VP'],
        ['', '', 'UK']
      ]

      # Aligned tags for the Claws C5, Brown and Penn tag sets.
      # Adapted from Manning, Christopher and Schütze, Hinrich,
      # 1999. Foundations of Statistical Natural Language
      # Processing. MIT Press, p. 141-142.
      #
      # Flat list alternating a description string with its
      # [Claws C5, Brown, Penn] tag triple.
      AlignedWordTags = [
        'Adjective', ['AJ0', 'JJ', 'JJ'],
        'Adjective, ordinal number', ['ORD', 'OD', 'JJ'],
        'Adjective, comparative', ['AJC', 'JJR', 'JJR'],
        'Adjective, superlative', ['AJS', 'JJT', 'JJS'],
        'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ'],
        'Adjective, cardinal number', ['CRD', 'CD', 'CD'],
        'Adjective, cardinal number, one', ['PNI', 'CD', 'CD'],
        'Adverb', ['AV0', 'RB', 'RB'],
        'Adverb, negative', ['XX0', '*', 'RB'],
        'Adverb, comparative', ['AV0', 'RBR', 'RBR'],
        'Adverb, superlative', ['AV0', 'RBT', 'RBS'],
        'Adverb, particle', ['AVP', 'RP', 'RP'],
        'Adverb, question', ['AVQ', 'WRB', 'WRB'],
        'Adverb, degree & question', ['AVQ', 'WQL', 'WRB'],
        'Adverb, degree', ['AV0', 'QL', 'RB'],
        'Adverb, degree, postposed', ['AV0', 'QLP', 'RB'],
        'Adverb, nominal', ['AV0', 'RN', 'RB'],
        'Conjunction, coordination', ['CJC', 'CC', 'CC'],
        'Conjunction, subordination', ['CJS', 'CS', 'IN'],
        'Conjunction, complementizer, that', ['CJT', 'CS', 'IN'],
        'Determiner', ['DT0', 'DT', 'DT'],
        'Determiner, pronoun', ['DT0', 'DTI', 'DT'],
        'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT'],
        'Determiner, prequalifier', ['DT0', 'ABL', 'DT'],
        'Determiner, prequantifier', ['DT0', 'ABN', 'PDT'],
        'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT'],
        'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT'],
        'Determiner, article', ['AT0', 'AT', 'DT'],
        'Determiner, postdeterminer', ['DT0', 'AP', 'JJ'],
        'Determiner, possessive', ['DPS', 'PP$', 'PRP$'],
        'Determiner, possessive, second', ['DPS', 'PP$$', 'PRP'],
        'Determiner, question', ['DTQ', 'WDT', 'WDT'],
        'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$'],
        'Noun', ['NN0', 'NN', 'NN'],
        'Noun, singular', ['NN1', 'NN', 'NN'],
        'Noun, plural', ['NN2', 'NNS', 'NNS'],
        'Noun, proper, singular', ['NP0', 'NP', 'NNP'],
        'Noun, proper, plural', ['NP0', 'NPS', 'NNPS'],
        'Noun, adverbial', ['NN0', 'NR', 'NN'],
        'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS'],
        'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'NN'],
        'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP'],
        'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP'],
        'Pronoun, personal, object', ['PNP', 'PPO', 'PRP'],
        'Pronoun, reflexive', ['PNX', 'PPL', 'PRP'],
        'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP'],
        'Pronoun, question, subject', ['PNQ', 'WPS', 'WP'],
        'Pronoun, question, object', ['PNQ', 'WPO', 'WP'],
        'Pronoun, existential there', ['EX0', 'EX', 'EX'],
        'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP'],
        'Verb, infinitive', ['VVI', 'VB', 'VB'],
        'Verb, past tense', ['VVD', 'VBD', 'VBD'],
        'Verb, present participle', ['VVG', 'VBG', 'VBG'],
        'Verb, past/passive participle', ['VVN', 'VBN', 'VBN'],
        'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ'],
        'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP'],
        'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB'],
        'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD'],
        'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG'],
        'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN'],
        'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ'],
        'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP'],
        'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB'],
        'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD'],
        'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG'],
        'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN'],
        'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ'],
        'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB'],
        'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD'],
        'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD'],
        'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG'],
        'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN'],
        'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ'],
        'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP'],
        'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP'],
        'Verb, modal', ['VM0', 'MD', 'MD'],
        'Preposition, to as infinitive marker', ['TO0', 'TO', 'TO'],
        'Preposition, to', ['PRP', 'IN', 'TO'],
        'Preposition', ['PRP', 'IN', 'IN'],
        'Preposition, of', ['PRF', 'IN', 'IN'],
        'Possessive', ['POS', '$', 'POS'],
        'Interjection (or other isolate)', ['ITJ', 'UH', 'UH'],
        'Punctuation, sentence ender', ['PUN', '.', '.'],
        # NOTE(review): the Penn column here may need to be ':' -
        # verify against the cited table before changing.
        'Punctuation, semicolon', ['PUN', '.', '.'],
        'Punctuation, colon or ellipsis', ['PUN', ':', ':'],
        'Punctuation, comma', ['PUN', ',', ','],
        'Punctuation, dash', ['PUN', '-', '-'],
        'Punctuation, dollar sign', ['PUN', '', '$'],
        'Punctuation, left bracket', ['PUL', '(', '('],
        'Punctuation, right bracket', ['PUR', ')', ')'],
        'Punctuation, quotation mark, left', ['PUQ', '', '``'],
        'Punctuation, quotation mark, right', ['PUQ', '', '"'],
        'Unknown, foreign words (not in English lexicon)', ['UNZ', '(FW-)', 'FW'],
        'Symbol', ['', '', 'SYM'],
        'Symbol, alphabetical', ['ZZ0', '', ''],
        'Symbol, list item', ['', '', 'LS']
      ]

    end
  end
end
@@ -0,0 +1,43 @@
module Treat
  # Optional syntactic sugar: when enabled, every entity class listed
  # by Treat::Entities gets a builder method of the same name defined
  # on Object, so that e.g. `Word('hello')` calls
  # `Treat::Entities::Word.build('hello', nil)`.
  module Sugar

    # Syntactic sugar is disabled by default.
    @@edulcorated = false

    # Enable the sugar by defining one builder method per entity
    # class on Object. Does nothing if the sugar is already enabled.
    def edulcorate
      return if @@edulcorated
      @@edulcorated = true
      each_entity_class do |type, klass|
        # The Symbol entity never gets a builder method.
        # NOTE(review): presumably to keep a global `Symbol(...)`
        # method out of Object - confirm.
        next if type == :Symbol
        Object.class_eval do
          define_method(type) do |value = '', id = nil|
            klass.build(value, id)
          end
        end
      end
    end

    # Disable the sugar by removing the builder methods that
    # #edulcorate defined. Does nothing if the sugar is not enabled.
    def unedulcorate
      return unless @@edulcorated
      @@edulcorated = false
      each_entity_class do |type, _klass|
        next if type == :Symbol
        Object.class_eval do
          remove_method(type)
        end
      end
    end

    # Whether syntactic sugar is
    # enabled or not.
    def edulcorated?; @@edulcorated; end

    private

    # Yield the constant name (as a symbol) and the class object of
    # each entity type returned by Treat::Entities.list. The name is
    # derived through the `cc` helper, which is defined outside this
    # module.
    def each_entity_class
      Treat::Entities.list.each do |entity_type|
        type = :"#{cc(entity_type)}"
        # The original passed the not-yet-assigned local `klass`
        # (i.e. nil) as const_get's `inherit` flag; an explicit
        # `false` performs the same lookup, restricted to
        # Treat::Entities itself.
        klass = Treat::Entities.const_get(type, false)
        yield type, klass
      end
    end
  end
end
@@ -0,0 +1,174 @@
module Treat
  # This module provides an abstract tree structure with
  # nodes having an id, a value, children, features and edges.
  module Tree
    # This class models the nodes for an N-ary tree data structure
    # with unique identifiers, text value, children, features
    # (annotations) and edges.
    #
    # This class was tightly based on the 'rubytree' gem.
    # RubyTree is licensed under the BSD license and can
    # be found at http://rubytree.rubyforge.org/rdoc/.
    # I have made several modifications in order to better
    # suit this library and to avoid monkey patching.
    class Node
      # A string containing the node's value (or empty).
      attr_accessor :value
      # A unique identifier for the node.
      attr_reader :id
      # An array containing the children of this node.
      attr_reader :children
      # A hash containing the features of this node.
      attr_accessor :features
      # A hash containing the edges that link this
      # node to other nodes.
      attr_accessor :edges
      # The parent of the node.
      attr_accessor :parent

      # Initialize the node with its value and id.
      # Setup containers for the children, features
      # and edges of this node.
      def initialize(value, id = nil)
        @parent = nil
        @value, @id = value, id
        @children = []
        @children_hash = {}
        @features = {}
        @edges = {}
      end

      # Iterate over each child of this node, yielding it
      # to the supplied block.
      def each
        @children.each { |child| yield child }
      end

      # Boolean - does the node have edges?
      def has_edges?; !@edges.empty?; end
      # Boolean - does the node have children?
      def has_children?; !@children.empty?; end
      # Boolean - does the node have features?
      def has_features?; !@features.empty?; end
      # Boolean - does the node have a parent?
      def has_parent?; !@parent.nil?; end
      # Boolean - does the node not have a parent?
      def is_root?; @parent.nil?; end
      # Clear this node's parent reference and return the node.
      # The former parent's list of children is not modified here.
      def set_as_root!; @parent = nil; self; end
      # Boolean - is this node a leaf ?
      # This is overriden in leaf classes.
      def is_leaf?; !has_children?; end

      # Add the given node(s) as children of this node.
      # This may be used with several nodes,
      # for example: node << [child1, child2, child3]
      # Returns the first node added. Raises if any node is nil.
      def <<(nodes)
        nodes = [nodes] unless nodes.is_a? Array
        raise 'Trying to add a nil node.' if nodes.include? nil
        nodes.each do |node|
          node.parent = self
          @children << node
          @children_hash[node.id] = node
        end
        nodes[0]
      end

      # Retrieve a child by position or by id. Integers below
      # 1000 are treated as positions in the children array;
      # anything else (including larger integers) is looked up
      # as a child id. Raises Treat::Exception for nil.
      def [](name_or_index)
        if name_or_index.nil?
          raise Treat::Exception,
          "Non-nil name or index needs to be provided."
        end
        if name_or_index.kind_of?(Integer) &&
          name_or_index < 1000 # Fix
          @children[name_or_index]
        else
          @children_hash[name_or_index]
        end
      end

      # Remove a child, given either the node itself or its id.
      # The removed node is detached (its parent is cleared) and
      # returned; nil is returned for a nil argument or an id
      # that matches no child.
      def remove!(ion)
        return nil unless ion
        child = ion.is_a?(Treat::Tree::Node) ? ion : @children_hash[ion]
        return nil unless child
        @children.delete(child)
        @children_hash.delete(child.id)
        # Previously a child removed by id kept pointing at this
        # node as its parent.
        child.set_as_root!
      end

      # Detach and remove all children; returns this node.
      def remove_all!
        @children.each { |child| child.set_as_root! }
        @children.clear
        @children_hash.clear
        self
      end

      # Next sibling from the same parent, or nil.
      def next_sibling
        sibling(1)
      end

      # The sibling n positions to the left, or nil.
      def left(n = 1); sibling(-1*n); end
      # The sibling n positions to the right, or nil.
      def right(n = 1); sibling(1*n); end

      # The sibling at the given offset relative to this node
      # in the parent's children, or nil when there is no parent,
      # when this node is not among the parent's children, or
      # when the offset falls outside the list.
      def sibling(pos)
        return nil if is_root?
        index = @parent.children.index(self)
        return nil if index.nil?
        target = index + pos
        # A negative index would wrap around to the end of the
        # array (e.g. the left of the first child would be the
        # last child).
        return nil if target < 0
        @parent.children.at(target)
      end

      # All the other children of this node's parent
      # (an empty array for a root node).
      def siblings
        return [] if is_root?
        @parent.children.reject { |node| node.equal?(self) }
      end

      # Total number of nodes in the subtree, including this one.
      def size
        @children.inject(1) { |sum, node| sum + node.size }
      end

      # Set the feature to the supplied value.
      def set(feature, value)
        @features ||= {}
        @features[feature] = value
      end

      # Return the depth of this node in the tree.
      def depth
        return 0 if is_root?
        1 + parent.depth
      end

      # Does the entity have a feature ?
      # :value always counts as a feature.
      def has_feature?(feature)
        @features.has_key?(feature) ||
        feature == :value
      end
      alias :has? :has_feature?

      # Link this node to the target node with
      # the supplied edge type. A node target must be the root
      # of this node's tree or one of its descendants; otherwise
      # no edge is recorded and nil is returned.
      def associate(id_or_node, edge_type = nil)
        if id_or_node.is_a? Treat::Tree::Node
          top = root
          target = top.equal?(id_or_node) ? top : top.find(id_or_node)
          id = target.id if target.is_a?(Treat::Tree::Node)
        else
          id = id_or_node
        end
        @edges[id] = edge_type if id
      end

      # Find the node with the given id (or the id of the given
      # node) among the descendants of this node. Returns nil
      # when no descendant matches.
      def find(id_or_node)
        # Check against the base Node class so that nodes of a
        # different subclass than the receiver are still
        # recognized as nodes rather than treated as ids.
        if id_or_node.is_a? Treat::Tree::Node
          id = id_or_node.id
        else
          id = id_or_node
        end
        return @children_hash[id] if @children_hash[id]
        self.each do |child|
          match = child.find(id)
          return match if match.is_a? Treat::Tree::Node
        end
        # Without this the method returned the children array
        # (the value of #each) on a miss.
        nil
      end

      # Find the root of the tree within which
      # this node is contained.
      def root
        return self if !has_parent?
        ancestor = @parent
        while ancestor.has_parent?
          ancestor = ancestor.parent
        end
        ancestor
      end
    end
  end
end