treat 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (147) hide show
  1. data/INSTALL +0 -0
  2. data/LICENSE +28 -0
  3. data/README +0 -0
  4. data/TODO +67 -0
  5. data/bin/INFO +1 -0
  6. data/examples/benchmark.rb +81 -0
  7. data/examples/keywords.rb +60 -0
  8. data/examples/texts/bugged_out.txt +26 -0
  9. data/examples/texts/half_cocked_basel.txt +16 -0
  10. data/examples/texts/hedge_funds.txt +24 -0
  11. data/examples/texts/hose_and_dry.txt +19 -0
  12. data/examples/texts/hungarys_troubles.txt +46 -0
  13. data/examples/texts/indias_slowdown.txt +15 -0
  14. data/examples/texts/merkozy_rides_again.txt +24 -0
  15. data/examples/texts/prada_is_not_walmart.txt +9 -0
  16. data/examples/texts/republican_nomination.txt +26 -0
  17. data/examples/texts/to_infinity_and_beyond.txt +15 -0
  18. data/lib/treat.rb +91 -0
  19. data/lib/treat/buildable.rb +115 -0
  20. data/lib/treat/categories.rb +29 -0
  21. data/lib/treat/category.rb +28 -0
  22. data/lib/treat/delegatable.rb +90 -0
  23. data/lib/treat/detectors.rb +28 -0
  24. data/lib/treat/detectors/encoding/native.rb +12 -0
  25. data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
  26. data/lib/treat/detectors/format/file.rb +36 -0
  27. data/lib/treat/detectors/language/language_detector.rb +19 -0
  28. data/lib/treat/detectors/language/what_language.rb +29 -0
  29. data/lib/treat/entities.rb +52 -0
  30. data/lib/treat/entities/collection.rb +19 -0
  31. data/lib/treat/entities/constituents.rb +15 -0
  32. data/lib/treat/entities/document.rb +11 -0
  33. data/lib/treat/entities/entity.rb +242 -0
  34. data/lib/treat/entities/sentence.rb +8 -0
  35. data/lib/treat/entities/text.rb +7 -0
  36. data/lib/treat/entities/tokens.rb +37 -0
  37. data/lib/treat/entities/zones.rb +17 -0
  38. data/lib/treat/exception.rb +5 -0
  39. data/lib/treat/extractors.rb +41 -0
  40. data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
  41. data/lib/treat/extractors/named_entity/abner.rb +20 -0
  42. data/lib/treat/extractors/named_entity/stanford.rb +174 -0
  43. data/lib/treat/extractors/statistics/frequency.rb +22 -0
  44. data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
  45. data/lib/treat/extractors/statistics/position_in.rb +13 -0
  46. data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
  47. data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
  48. data/lib/treat/extractors/time/chronic.rb +12 -0
  49. data/lib/treat/extractors/time/native.rb +12 -0
  50. data/lib/treat/extractors/time/nickel.rb +45 -0
  51. data/lib/treat/extractors/topic_words/lda.rb +71 -0
  52. data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
  53. data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
  54. data/lib/treat/extractors/topics/reuters.rb +91 -0
  55. data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
  56. data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
  57. data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
  58. data/lib/treat/feature.rb +53 -0
  59. data/lib/treat/formatters.rb +44 -0
  60. data/lib/treat/formatters/cleaners/html.rb +17 -0
  61. data/lib/treat/formatters/readers/autoselect.rb +35 -0
  62. data/lib/treat/formatters/readers/gocr.rb +24 -0
  63. data/lib/treat/formatters/readers/html.rb +13 -0
  64. data/lib/treat/formatters/readers/ocropus.rb +31 -0
  65. data/lib/treat/formatters/readers/pdf.rb +17 -0
  66. data/lib/treat/formatters/readers/txt.rb +15 -0
  67. data/lib/treat/formatters/serializers/xml.rb +48 -0
  68. data/lib/treat/formatters/serializers/yaml.rb +15 -0
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
  70. data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
  71. data/lib/treat/formatters/unserializers/xml.rb +79 -0
  72. data/lib/treat/formatters/unserializers/yaml.rb +15 -0
  73. data/lib/treat/formatters/visualizers/dot.rb +73 -0
  74. data/lib/treat/formatters/visualizers/html.rb +12 -0
  75. data/lib/treat/formatters/visualizers/inspect.rb +16 -0
  76. data/lib/treat/formatters/visualizers/short_value.rb +14 -0
  77. data/lib/treat/formatters/visualizers/standoff.rb +41 -0
  78. data/lib/treat/formatters/visualizers/tree.rb +28 -0
  79. data/lib/treat/formatters/visualizers/txt.rb +31 -0
  80. data/lib/treat/group.rb +96 -0
  81. data/lib/treat/inflectors.rb +50 -0
  82. data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
  83. data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
  84. data/lib/treat/inflectors/declensors/en.rb +18 -0
  85. data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
  86. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
  87. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
  88. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
  89. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
  90. data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
  91. data/lib/treat/inflectors/stemmers/porter.rb +158 -0
  92. data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
  93. data/lib/treat/inflectors/stemmers/uea.rb +30 -0
  94. data/lib/treat/lexicalizers.rb +49 -0
  95. data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
  96. data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
  97. data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
  98. data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
  99. data/lib/treat/lexicalizers/tag/brill.rb +101 -0
  100. data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
  101. data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
  102. data/lib/treat/processors.rb +45 -0
  103. data/lib/treat/processors/chunkers/txt.rb +27 -0
  104. data/lib/treat/processors/parsers/enju.rb +214 -0
  105. data/lib/treat/processors/parsers/stanford.rb +60 -0
  106. data/lib/treat/processors/segmenters/punkt.rb +48 -0
  107. data/lib/treat/processors/segmenters/stanford.rb +45 -0
  108. data/lib/treat/processors/segmenters/tactful.rb +34 -0
  109. data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
  110. data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
  111. data/lib/treat/processors/tokenizers/perl.rb +96 -0
  112. data/lib/treat/processors/tokenizers/punkt.rb +42 -0
  113. data/lib/treat/processors/tokenizers/stanford.rb +33 -0
  114. data/lib/treat/processors/tokenizers/tactful.rb +59 -0
  115. data/lib/treat/proxies.rb +66 -0
  116. data/lib/treat/registrable.rb +26 -0
  117. data/lib/treat/resources.rb +10 -0
  118. data/lib/treat/resources/categories.rb +18 -0
  119. data/lib/treat/resources/delegates.rb +96 -0
  120. data/lib/treat/resources/dependencies.rb +0 -0
  121. data/lib/treat/resources/edges.rb +8 -0
  122. data/lib/treat/resources/formats.rb +23 -0
  123. data/lib/treat/resources/languages.rb +86 -0
  124. data/lib/treat/resources/languages.txt +504 -0
  125. data/lib/treat/resources/tags.rb +393 -0
  126. data/lib/treat/sugar.rb +43 -0
  127. data/lib/treat/tree.rb +174 -0
  128. data/lib/treat/utilities.rb +127 -0
  129. data/lib/treat/visitable.rb +27 -0
  130. data/test/profile.rb +2 -0
  131. data/test/tc_detectors.rb +27 -0
  132. data/test/tc_entity.rb +105 -0
  133. data/test/tc_extractors.rb +48 -0
  134. data/test/tc_formatters.rb +46 -0
  135. data/test/tc_inflectors.rb +39 -0
  136. data/test/tc_lexicalizers.rb +39 -0
  137. data/test/tc_processors.rb +36 -0
  138. data/test/tc_resources.rb +27 -0
  139. data/test/tc_treat.rb +64 -0
  140. data/test/tc_tree.rb +60 -0
  141. data/test/tests.rb +19 -0
  142. data/test/texts.rb +20 -0
  143. data/test/texts/english/long.html +24 -0
  144. data/test/texts/english/long.txt +22 -0
  145. data/test/texts/english/medium.txt +5 -0
  146. data/test/texts/english/short.txt +3 -0
  147. metadata +412 -0
@@ -0,0 +1,393 @@
1
+ # encoding: UTF-8
2
+
3
+ module Treat
4
+ module Resources
5
+ class Tags
6
+
7
+ ClawsC5 = 0
8
+ Brown = 1
9
+ Penn = 2
10
+ Enju = 3
11
+
12
# Lookup table from a word-level part-of-speech tag (String)
# to a word category (Symbol). Tags absent from the table
# yield nil on lookup.
PTBWordTagToCategory = {
  # Conjunctions, numbers, determiners and function words.
  'CC'   => :conjunction,   # Coordinating conjunction
  'CD'   => :number,        # Cardinal number
  'DT'   => :determiner,    # Determiner
  'DET'  => :determiner,    # Determiner
  'EX'   => :determiner,    # Existential there
  'FW'   => :foreign,       # Foreign word
  'IN'   => :preposition,   # Preposition or subordinating conjunction
  # Adjectives.
  'JJ'   => :adjective,     # Adjective
  'JJR'  => :adjective,     # Adjective, comparative
  'JJS'  => :adjective,     # Adjective, superlative
  'LS'   => :list,          # List item marker
  'MD'   => :modal,         # Modal
  # Nouns.
  'NN'   => :noun,          # Noun, singular or mass
  'NNS'  => :noun,          # Noun, plural
  'NNP'  => :noun,          # Proper noun, singular
  'NNPS' => :noun,          # Proper noun, plural
  # Predeterminers, possessives and pronouns.
  'PDT'  => :determiner,    # Predeterminer
  'POS'  => :determiner,    # Possessive ending
  'PRP'  => :pronoun,       # Personal pronoun
  'PRP$' => :pronoun,       # Possessive pronoun
  'PRPS' => :determiner,    # Possessive determiner
  # Adverbs and particles.
  'RB'   => :adverb,        # Adverb
  'RBR'  => :adverb,        # Adverb, comparative
  'RBS'  => :adverb,        # Adverb, superlative
  'RP'   => :particle,      # Particle
  'SYM'  => :symbol,        # Symbol
  'TO'   => :to,            # to
  'UH'   => :interjection,  # Interjection
  # Verbs.
  'VB'   => :verb,          # Verb, base form
  'VBD'  => :verb,          # Verb, past tense
  'VBG'  => :verb,          # Verb, gerund or present participle
  'VBN'  => :verb,          # Verb, past participle
  'VBP'  => :verb,          # Verb, non-3rd person singular present
  'VBZ'  => :verb,          # Verb, 3rd person singular present
  # Wh-words.
  'WDT'  => :determiner,    # Wh-determiner
  'WP'   => :pronoun,       # Wh-pronoun
  'WP$'  => :pronoun,       # Possessive wh-pronoun
  'WRB'  => :adverb,        # Wh-adverb
  # Punctuation and quotes.
  ')'    => :punctuation,   # Right bracket
  '('    => :punctuation,   # Left bracket
  '.'    => :punctuation,   # Period
  "''"   => :symbol,        # Quote
  ','    => :punctuation,   # Comma
  ';'    => :punctuation    # Semicolon
}
58
+
59
# Clause-level tags, stored as [tag, description] pairs.
PTBClauseTagDescription = [
  ['S',     'Simple declarative clause'],
  ['SBAR',  'Clause introduced by a (possibly empty) subordinating conjunction'],
  ['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
  ['SINV',  'Inverted declarative sentence'],
  ['SQ',    'Inverted yes/no question']
]
66
+
67
# Phrase-level tags, stored as [tag, description] pairs.
PTBPhraseTagDescription = [
  ['ADJP',   'Adjective phrase'],
  ['ADVP',   'Adverb phrase'],
  ['CONJP',  'Conjunction phrase'],
  ['FRAG',   'Fragment'],
  ['INTJ',   'Interjection'],
  ['LST',    'List marker'],
  ['NAC',    'Not a constituent'],
  ['NP',     'Noun phrase'],
  ['NX',     'Head of an NP'],
  ['PP',     'Prepositional phrase'],
  ['PRN',    'Parenthetical'],
  ['PRT',    'Particle'],
  ['QP',     'Quantifier phrase'],
  ['RRC',    'Reduced relative clause'],
  ['UCP',    'Unlike coordinated phrase'],
  ['VP',     'Verb phrase'],
  ['WHADJP', 'Wh-adjective phrase'],
  # The tag is spelled WHADVP, matching the value used in
  # EnjuCatXcatToPTB (it was previously misspelled 'WHAVP').
  ['WHADVP', 'Wh-adverb phrase'],
  ['WHNP',   'Wh-noun phrase'],
  ['WHPP',   'Wh-prepositional phrase'],
  ['X',      'Unknown, uncertain, or unbracketable']
]
90
+
91
+ PTBWordTagDescription = [
92
+ =begin
93
+ CC - Coordinating conjunction
94
+ CD - Cardinal number
95
+ DT - Determiner
96
+ EX - Existential there
97
+ FW - Foreign word
98
+ IN - Preposition or subordinating conjunction
99
+ JJ - Adjective
100
+ JJR - Adjective, comparative
101
+ JJS - Adjective, superlative
102
+ LS - List item marker
103
+ MD - Modal
104
+ NN - Noun, singular or mass
105
+ NNS - Noun, plural
106
+ NNP - Proper noun, singular
107
+ NNPS - Proper noun, plural
108
+ PDT - Predeterminer
109
+ POS - Possessive ending
110
+ PRP - Personal pronoun
111
+ PRP$ - Possessive pronoun (prolog version PRP-S)
112
+ RB - Adverb
113
+ RBR - Adverb, comparative
114
+ RBS - Adverb, superlative
115
+ RP - Particle
116
+ SYM - Symbol
117
+ TO - to
118
+ UH - Interjection
119
+ VB - Verb, base form
120
+ VBD - Verb, past tense
121
+ VBG - Verb, gerund or present participle
122
+ VBN - Verb, past participle
123
+ VBP - Verb, non-3rd person singular present
124
+ VBZ - Verb, 3rd person singular present
125
+ WDT - Wh-determiner
126
+ WP - Wh-pronoun
127
+ WP$ - Possessive wh-pronoun (prolog version WP-S)
128
+ WRB - Wh-adverb
129
+
130
+ =end
131
+ ]
132
+ BrownWordTagDescription = [
133
+ =begin
134
+
135
+ Tag Description Examples
136
+
137
+ . sentence closer . ; ? !
138
+ ( left paren
139
+ ) right paren
140
+ * not, n't
141
+ -- dash
142
+ , comma
143
+ : colon
144
+ ABL pre-qualifier quite, rather
145
+ ABN pre-quantifier half, all
146
+ ABX pre-quantifier both
147
+ AP post-determiner many, several, next
148
+ AT article a, the, no
149
+ BE be
150
+ BED were
151
+ BEDZ was
152
+ BEG being
153
+ BEM am
154
+ BEN been
155
+ BER are, art
156
+ BEZ is
157
+ CC coordinating conjunction and, or
158
+ CD cardinal numeral one, two, 2, etc.
159
+ CS subordinating conjunction if, although
160
+ DO do
161
+ DOD did
162
+ DOZ does
163
+ DT singular determiner this, that
164
+ DTI singular or plural determiner/quantifier some, any
165
+ DTS plural determiner these, those
166
+ DTX determiner/double conjunction either
167
+ EX existential there
168
+ FW foreign word (hyphenated before regular tag)
169
+ HL word occurring in headline (hyphenated after regular tag)
170
+ HV have
171
+ HVD had (past tense)
172
+ HVG having
173
+ HVN had (past participle)
174
+ HVZ has
175
+ IN preposition
176
+ JJ adjective
177
+ JJR comparative adjective
178
+ JJS semantically superlative adjective chief, top
179
+ JJT morphologically superlative adjective biggest
180
+ MD modal auxiliary can, should, will
181
+ NC cited word (hyphenated after regular tag)
182
+ NN singular or mass noun
183
+ NN$ possessive singular noun
184
+ NNS plural noun
185
+ NNS$ possessive plural noun
186
+ NP proper noun or part of name phrase
187
+ NP$ possessive proper noun
188
+ NPS plural proper noun
189
+ NPS$ possessive plural proper noun
190
+ NR adverbial noun home, today, west
191
+ NRS plural adverbial noun
192
+ OD ordinal numeral first, 2nd
193
+ PN nominal pronoun everybody, nothing
194
+ PN$ possessive nominal pronoun
195
+ PP$ possessive personal pronoun my, our
196
+ PP$$ second (nominal) possessive pronoun mine, ours
197
+ PPL singular reflexive/intensive personal pronoun myself
198
+ PPLS plural reflexive/intensive personal pronoun ourselves
199
+ PPO objective personal pronoun me, him, it, them
200
+ PPS 3rd. singular nominative pronoun he, she, it, one
201
+ PPSS other nominative personal pronoun I, we, they, you
202
+ QL qualifier very, fairly
203
+ QLP post-qualifier enough, indeed
204
+ RB adverb
205
+ RBR comparative adverb
206
+ RBT superlative adverb
207
+ RN nominal adverb here, then, indoors
208
+ RP adverb/particle about, off, up
209
+ TL word occurring in title (hyphenated after
210
+ regular tag)
211
+ TO infinitive marker to
212
+ UH interjection, exclamation
213
+ VB verb, base form
214
+ VBD verb, past tense
215
+ VBG verb, present participle/gerund
216
+ VBN verb, past participle
217
+ VBZ verb, 3rd. singular present
218
+ WDT wh- determiner what, which
219
+ WP$ possessive wh- pronoun whose
220
+ WPO objective wh- pronoun whom, which, that
221
+ WPS nominative wh- pronoun who, which, that
222
+ WQL wh- qualifier how
223
+ WRB wh- adverb how, where, when
224
+
225
+ =end
226
+ ]
227
# Enju syntactic categories, stored as [cat, description] pairs.
EnjuCatDescription = [
  ['ADJ',  'Adjective'],
  ['ADV',  'Adverb'],
  ['CONJ', 'Coordination conjunction'],
  ['C',    'Complementizer'],
  ['D',    'Determiner'],
  ['N',    'Noun'],
  ['P',    'Preposition'],
  ['SC',   'Subordination conjunction'],
  ['V',    'Verb'],
  ['COOD', 'Part of coordination'],
  ['PN',   'Punctuation'],
  ['PRT',  'Particle'],
  ['S',    'Sentence']
]
242
+
243
# The xcat values of the Enju output specification,
# stored as [xcat, description] pairs.
EnjuXCatDescription = [
  ['COOD',  'Coordinated phrase/clause'],
  ['IMP',   'Imperative sentence'],
  ['INV',   'Subject-verb inversion'],
  ['Q',     'Interrogative sentence with subject-verb inversion'],
  ['REL',   'A relativizer included'],
  ['FREL',  'A free relative included'],
  ['TRACE', 'A trace included'],
  ['WH',    'A wh-question word included']
]
254
+
255
# Conversion table from an Enju (cat, xcat) pair to a Penn
# Treebank phrase/clause tag. Each row has exactly three
# elements: [enju_cat, enju_xcat, ptb_tag]. An empty xcat
# string stands for "no xcat".
EnjuCatXcatToPTB = [
  ['ADJP',  '',     'ADJP'],
  ['ADJP',  'REL',  'WHADJP'],
  ['ADJP',  'FREL', 'WHADJP'],
  ['ADJP',  'WH',   'WHADJP'],
  ['ADVP',  '',     'ADVP'],
  ['ADVP',  'REL',  'WHADVP'],
  ['ADVP',  'FREL', 'WHADVP'],
  ['ADVP',  'WH',   'WHADVP'],
  ['CONJP', '',     'CONJP'],
  ['CP',    '',     'SBAR'],
  ['DP',    '',     'NP'],
  ['NP',    '',     'NP'],
  ['NX',    'NX',   'NAC'],
  # The three rows below previously lacked commas, so Ruby
  # concatenated the adjacent literals into one-element rows
  # (e.g. ['NPRELWHNP']) that could never match a lookup.
  ['NP',    'REL',  'WHNP'],
  ['NP',    'FREL', 'WHNP'],
  ['NP',    'WH',   'WHNP'],
  ['PP',    '',     'PP'],
  ['PP',    'REL',  'WHPP'],
  ['PP',    'WH',   'WHPP'],
  ['PRT',   '',     'PRT'],
  ['S',     '',     'S'],
  ['S',     'INV',  'SINV'],
  ['S',     'Q',    'SQ'],
  ['S',     'REL',  'SBAR'],
  ['S',     'FREL', 'SBAR'],
  ['S',     'WH',   'SBARQ'],
  ['SCP',   '',     'SBAR'],
  ['VP',    '',     'VP'],
  # NOTE(review): this row duplicates the previous one; kept
  # as-is because the intended category cannot be determined
  # from this file.
  ['VP',    '',     'VP'],
  ['',      '',     'UK']
]
287
+
288
# Aligned tags for the Claws C5, Brown and Penn tag sets.
# Adapted from Manning, Christopher and Schütze, Hinrich,
# 1999. Foundations of Statistical Natural Language
# Processing. MIT Press, p. 141-142.
#
# The array is flat: a description String is followed by a
# three-element Array holding the tags in the order
# [Claws C5, Brown, Penn]. An empty string marks a tag set
# that has no entry for that description.
AlignedWordTags = [
  'Adjective', ['AJ0', 'JJ', 'JJ'],
  'Adjective, ordinal number', ['ORD', 'OD', 'JJ'],
  'Adjective, comparative', ['AJC', 'JJR', 'JJR'],
  'Adjective, superlative', ['AJS', 'JJT', 'JJS'],
  'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ'],
  'Adjective, cardinal number', ['CRD', 'CD', 'CD'],
  'Adjective, cardinal number, one', ['PNI', 'CD', 'CD'],
  'Adverb', ['AV0', 'RB', 'RB'],
  'Adverb, negative', ['XX0', '*', 'RB'],
  'Adverb, comparative', ['AV0', 'RBR', 'RBR'],
  'Adverb, superlative', ['AV0', 'RBT', 'RBS'],
  'Adverb, particle', ['AVP', 'RP', 'RP'],
  'Adverb, question', ['AVQ', 'WRB', 'WRB'],
  'Adverb, degree & question', ['AVQ', 'WQL', 'WRB'],
  'Adverb, degree', ['AV0', 'QL', 'RB'],
  'Adverb, degree, postposed', ['AV0', 'QLP', 'RB'],
  'Adverb, nominal', ['AV0', 'RN', 'RB'],
  'Conjunction, coordination', ['CJC', 'CC', 'CC'],
  'Conjunction, subordination', ['CJS', 'CS', 'IN'],
  'Conjunction, complementizer, that', ['CJT', 'CS', 'IN'],
  'Determiner', ['DT0', 'DT', 'DT'],
  'Determiner, pronoun', ['DT0', 'DTI', 'DT'],
  'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT'],
  'Determiner, prequalifier', ['DT0', 'ABL', 'DT'],
  'Determiner, prequantifier', ['DT0', 'ABN', 'PDT'],
  'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT'],
  'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT'],
  'Determiner, article', ['AT0', 'AT', 'DT'],
  'Determiner, postdeterminer', ['DT0', 'AP', 'JJ'],
  'Determiner, possessive', ['DPS', 'PP$', 'PRP$'],
  'Determiner, possessive, second', ['DPS', 'PP$$', 'PRP'],
  'Determiner, question', ['DTQ', 'WDT', 'WDT'],
  'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$'],
  'Noun', ['NN0', 'NN', 'NN'],
  'Noun, singular', ['NN1', 'NN', 'NN'],
  'Noun, plural', ['NN2', 'NNS', 'NNS'],
  'Noun, proper, singular', ['NP0', 'NP', 'NNP'],
  'Noun, proper, plural', ['NP0', 'NPS', 'NNPS'],
  'Noun, adverbial', ['NN0', 'NR', 'NN'],
  'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS'],
  'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'NN'],
  'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP'],
  'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP'],
  'Pronoun, personal, object', ['PNP', 'PPO', 'PRP'],
  'Pronoun, reflexive', ['PNX', 'PPL', 'PRP'],
  'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP'],
  'Pronoun, question, subject', ['PNQ', 'WPS', 'WP'],
  'Pronoun, question, object', ['PNQ', 'WPO', 'WP'],
  'Pronoun, existential there', ['EX0', 'EX', 'EX'],
  'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP'],
  'Verb, infinitive', ['VVI', 'VB', 'VB'],
  'Verb, past tense', ['VVD', 'VBD', 'VBD'],
  'Verb, present participle', ['VVG', 'VBG', 'VBG'],
  'Verb, past/passive participle', ['VVN', 'VBN', 'VBN'],
  'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ'],
  'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP'],
  'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB'],
  'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD'],
  'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG'],
  'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN'],
  'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ'],
  'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP'],
  'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB'],
  'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD'],
  'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG'],
  'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN'],
  'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ'],
  'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB'],
  'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD'],
  'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD'],
  'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG'],
  'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN'],
  'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ'],
  'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP'],
  'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP'],
  'Verb, modal', ['VM0', 'MD', 'MD'],
  'Preposition, to as infinitive marker', ['TO0', 'TO', 'TO'],
  'Preposition, to', ['PRP', 'IN', 'TO'],
  'Preposition', ['PRP', 'IN', 'IN'],
  'Preposition, of', ['PRF', 'IN', 'IN'],
  'Possessive', ['POS', '$', 'POS'],
  'Interjection (or other isolate)', ['ITJ', 'UH', 'UH'],
  'Punctuation, sentence ender', ['PUN', '.', '.'],
  # NOTE(review): identical tags to the sentence-ender row
  # above; verify the Penn column against the cited table.
  'Punctuation, semicolon', ['PUN', '.', '.'],
  'Punctuation, colon or ellipsis', ['PUN', ':', ':'],
  'Punctuation, comma', ['PUN', ',', ','],
  'Punctuation, dash', ['PUN', '-', '-'],
  'Punctuation, dollar sign', ['PUN', '', '$'],
  'Punctuation, left bracket', ['PUL', '(', '('],
  'Punctuation, right bracket', ['PUR', ')', ')'],
  'Punctuation, quotation mark, left', ['PUQ', '', '``'],
  'Punctuation, quotation mark, right', ['PUQ', '', '"'],
  'Unknown, foreign words (not in English lexicon)', ['UNZ', '(FW-)', 'FW'],
  'Symbol', ['', '', 'SYM'],
  'Symbol, alphabetical', ['ZZ0', '', ''],
  'Symbol, list item', ['', '', 'LS']
]
390
+
391
+ end
392
+ end
393
+ end
@@ -0,0 +1,43 @@
1
module Treat
  # Optional syntactic sugar. When enabled, a public method named
  # after each entity class (except Symbol) is defined on Object;
  # calling it delegates to that class's +build+ method.
  #
  # The methods below call +cc+ and +Treat::Entities.list+, which
  # are not defined in this module and must be provided by whatever
  # object mixes it in.
  module Sugar
    # Syntactic sugar is disabled by default.
    @@edulcorated = false

    # Enable syntactic sugar. Does nothing if it is already enabled.
    def edulcorate
      return if @@edulcorated
      @@edulcorated = true
      each_entity_class do |type, klass|
        # No builder is defined for the Symbol entity type.
        unless type == :Symbol
          Object.class_eval do
            define_method(type) do |value='', id=nil|
              klass.build(value, id)
            end
          end
        end
      end
    end

    # Disable syntactic sugar by removing the builder methods
    # from Object. Does nothing if sugar is not enabled.
    def unedulcorate
      return unless @@edulcorated
      @@edulcorated = false
      each_entity_class do |type, klass|
        unless type == :Symbol
          Object.class_eval do
            remove_method(type)
          end
        end
      end
    end

    # Whether syntactic sugar is
    # enabled or not.
    def edulcorated?; @@edulcorated; end

    private

    # Yield the camel-cased type name (as a Symbol) and the
    # matching class for every entry of Treat::Entities.list.
    def each_entity_class
      Treat::Entities.list.each do |entity_type|
        type = :"#{cc(entity_type)}"
        # Look the constant up in Treat::Entities only (inherit =
        # false). The previous code passed the still-unassigned
        # local `klass` (nil) here, which had the same effect by
        # accident.
        klass = Treat::Entities.const_get(type, false)
        yield type, klass
      end
    end
  end
end
@@ -0,0 +1,174 @@
1
module Treat
  # This module provides an abstract tree structure with
  # nodes having an id, a value, children, features and edges.
  module Tree
    # This class models the nodes of an N-ary tree data structure
    # with unique identifiers, text value, children, features
    # (annotations) and edges.
    #
    # This class was tightly based on the 'rubytree' gem.
    # RubyTree is licensed under the BSD license and can
    # be found at http://rubytree.rubyforge.org/rdoc/.
    # I have made several modifications in order to better
    # suit this library and to avoid monkey patching.
    class Node
      # A string containing the node's value (or empty).
      attr_accessor :value
      # A unique identifier for the node.
      attr_reader :id
      # An array containing the children of this node.
      attr_reader :children
      # A hash containing the features of this node.
      attr_accessor :features
      # A hash containing the edges that link this
      # node to other nodes.
      attr_accessor :edges
      # The parent of the node.
      attr_accessor :parent

      # Initialize the node with its value and id.
      # Setup containers for the children, features
      # and edges of this node.
      def initialize(value, id = nil)
        @parent = nil
        @value, @id = value, id
        @children = []
        @children_hash = {}
        @features = {}
        @edges = {}
      end

      # Iterate over each direct child of the node.
      def each
        @children.each { |child| yield child }
      end

      # Boolean - does the node have edges?
      def has_edges?; !@edges.empty?; end
      # Boolean - does the node have children?
      def has_children?; !@children.empty?; end
      # Boolean - does the node have features?
      def has_features?; !@features.empty?; end
      # Boolean - does the node have a parent?
      def has_parent?; !@parent.nil?; end
      # Boolean - does the node not have a parent?
      def is_root?; @parent.nil?; end
      # Clear this node's parent reference and return the node.
      # The former parent's list of children is not modified.
      def set_as_root!; @parent = nil; self; end
      # Boolean - is this node a leaf ?
      # This is overriden in leaf classes.
      def is_leaf?; !has_children?; end

      # Add one or several nodes as children of this node.
      # This may be used with several nodes,
      # for example: node << [child1, child2, child3]
      # Returns the first node added. Raises if any
      # of the supplied nodes is nil.
      def <<(nodes)
        nodes = [nodes] unless nodes.is_a? Array
        raise 'Trying to add a nil node.' if nodes.include? nil
        nodes.each do |node|
          node.parent = self
          @children << node
          @children_hash[node.id] = node
        end
        nodes[0]
      end

      # Retrieve a child either by position or by id.
      # Integers below 1000 are treated as positions in the
      # children array; anything else is looked up as an id.
      # Raises Treat::Exception when given nil.
      def [](name_or_index)
        if name_or_index.nil?
          raise Treat::Exception,
          "Non-nil name or index needs to be provided."
        end
        # FIXME: the 1000 threshold is a heuristic to tell
        # positions apart from integer ids.
        if name_or_index.kind_of?(Integer) &&
          name_or_index < 1000
          @children[name_or_index]
        else
          @children_hash[name_or_index]
        end
      end

      # Remove a child, given either the node itself or its id,
      # and detach it from this node. Returns the removed node,
      # or nil if the argument is nil or no child has that id.
      def remove!(ion)
        return nil unless ion
        if ion.is_a? Node
          @children.delete(ion)
          @children_hash.delete(ion.id)
          ion.set_as_root!
        else
          removed = @children_hash.delete(ion)
          return nil unless removed
          @children.delete(removed)
          # Clear the parent reference, as is done when a
          # node object is passed in.
          removed.set_as_root!
        end
      end

      # Remove all children of this node and return the node.
      def remove_all!
        @children.each { |child| child.set_as_root! }
        @children.clear
        @children_hash.clear
        self
      end

      # Next sibling from the same parent, or nil if
      # this node is the root or the last child.
      def next_sibling
        return nil if is_root?
        index = @parent.children.index(self)
        @parent.children.at(index + 1) if index
      end

      # The sibling n positions to the left (nil if none).
      def left(n = 1); sibling(-1*n); end
      # The sibling n positions to the right (nil if none).
      def right(n = 1); sibling(1*n); end

      # The sibling located pos positions away from this node
      # (negative: to the left, positive: to the right), or
      # nil if this node is the root or no such sibling exists.
      def sibling(pos)
        return nil if is_root?
        index = @parent.children.index(self)
        return nil if index.nil?
        target = index + pos
        # A negative index would make Array#at count from the
        # end and wrap around to the last siblings.
        return nil if target < 0
        @parent.children.at(target)
      end

      # All the other children of this node's parent.
      # Returns an empty array for a root node.
      def siblings
        return [] if is_root?
        @parent.children.reject { |child| child == self }
      end

      # Total number of nodes in the subtree, including this one.
      def size
        @children.inject(1) { |sum, node| sum + node.size }
      end

      # Set the feature to the supplied value.
      def set(feature, value)
        @features ||= {}
        @features[feature] = value
      end

      # Return the depth of this node in the tree
      # (0 for the root).
      def depth
        return 0 if is_root?
        1 + parent.depth
      end

      # Does the entity have a feature ?
      # :value always counts as a feature.
      def has_feature?(feature)
        @features.has_key?(feature) ||
        feature == :value
      end
      alias :has? :has_feature?

      # Link this node to the target node with
      # the supplied edge type. The target may be given as an
      # id or as a node; a node is looked up from the root of
      # the tree and no edge is recorded if it is not found.
      def associate(id_or_node, edge_type = nil)
        if id_or_node.is_a? Node
          target = root.find(id_or_node)
          id = target.id if target
        else
          id = id_or_node
        end
        @edges[id] = edge_type if id
      end

      # Find the node with the given id (or the id of the given
      # node) among this node and its descendants. Returns nil
      # when there is no such node.
      def find(id_or_node)
        if id_or_node.is_a? Node
          id = id_or_node.id
        else
          id = id_or_node
        end
        return self if !id.nil? && id == @id
        direct = @children_hash[id]
        return direct if direct
        self.each do |child|
          found = child.find(id)
          return found if found.is_a? Node
        end
        nil
      end

      # Find the root of the tree within which
      # this node is contained.
      def root
        return self if !has_parent?
        ancestor = @parent
        while ancestor.has_parent?
          ancestor = ancestor.parent
        end
        ancestor
      end
    end
  end
end