treat 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +0 -0
- data/LICENSE +28 -0
- data/README +0 -0
- data/TODO +67 -0
- data/bin/INFO +1 -0
- data/examples/benchmark.rb +81 -0
- data/examples/keywords.rb +60 -0
- data/examples/texts/bugged_out.txt +26 -0
- data/examples/texts/half_cocked_basel.txt +16 -0
- data/examples/texts/hedge_funds.txt +24 -0
- data/examples/texts/hose_and_dry.txt +19 -0
- data/examples/texts/hungarys_troubles.txt +46 -0
- data/examples/texts/indias_slowdown.txt +15 -0
- data/examples/texts/merkozy_rides_again.txt +24 -0
- data/examples/texts/prada_is_not_walmart.txt +9 -0
- data/examples/texts/republican_nomination.txt +26 -0
- data/examples/texts/to_infinity_and_beyond.txt +15 -0
- data/lib/treat.rb +91 -0
- data/lib/treat/buildable.rb +115 -0
- data/lib/treat/categories.rb +29 -0
- data/lib/treat/category.rb +28 -0
- data/lib/treat/delegatable.rb +90 -0
- data/lib/treat/detectors.rb +28 -0
- data/lib/treat/detectors/encoding/native.rb +12 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
- data/lib/treat/detectors/format/file.rb +36 -0
- data/lib/treat/detectors/language/language_detector.rb +19 -0
- data/lib/treat/detectors/language/what_language.rb +29 -0
- data/lib/treat/entities.rb +52 -0
- data/lib/treat/entities/collection.rb +19 -0
- data/lib/treat/entities/constituents.rb +15 -0
- data/lib/treat/entities/document.rb +11 -0
- data/lib/treat/entities/entity.rb +242 -0
- data/lib/treat/entities/sentence.rb +8 -0
- data/lib/treat/entities/text.rb +7 -0
- data/lib/treat/entities/tokens.rb +37 -0
- data/lib/treat/entities/zones.rb +17 -0
- data/lib/treat/exception.rb +5 -0
- data/lib/treat/extractors.rb +41 -0
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
- data/lib/treat/extractors/named_entity/abner.rb +20 -0
- data/lib/treat/extractors/named_entity/stanford.rb +174 -0
- data/lib/treat/extractors/statistics/frequency.rb +22 -0
- data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
- data/lib/treat/extractors/statistics/position_in.rb +13 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
- data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
- data/lib/treat/extractors/time/chronic.rb +12 -0
- data/lib/treat/extractors/time/native.rb +12 -0
- data/lib/treat/extractors/time/nickel.rb +45 -0
- data/lib/treat/extractors/topic_words/lda.rb +71 -0
- data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
- data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
- data/lib/treat/extractors/topics/reuters.rb +91 -0
- data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
- data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
- data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
- data/lib/treat/feature.rb +53 -0
- data/lib/treat/formatters.rb +44 -0
- data/lib/treat/formatters/cleaners/html.rb +17 -0
- data/lib/treat/formatters/readers/autoselect.rb +35 -0
- data/lib/treat/formatters/readers/gocr.rb +24 -0
- data/lib/treat/formatters/readers/html.rb +13 -0
- data/lib/treat/formatters/readers/ocropus.rb +31 -0
- data/lib/treat/formatters/readers/pdf.rb +17 -0
- data/lib/treat/formatters/readers/txt.rb +15 -0
- data/lib/treat/formatters/serializers/xml.rb +48 -0
- data/lib/treat/formatters/serializers/yaml.rb +15 -0
- data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
- data/lib/treat/formatters/unserializers/xml.rb +79 -0
- data/lib/treat/formatters/unserializers/yaml.rb +15 -0
- data/lib/treat/formatters/visualizers/dot.rb +73 -0
- data/lib/treat/formatters/visualizers/html.rb +12 -0
- data/lib/treat/formatters/visualizers/inspect.rb +16 -0
- data/lib/treat/formatters/visualizers/short_value.rb +14 -0
- data/lib/treat/formatters/visualizers/standoff.rb +41 -0
- data/lib/treat/formatters/visualizers/tree.rb +28 -0
- data/lib/treat/formatters/visualizers/txt.rb +31 -0
- data/lib/treat/group.rb +96 -0
- data/lib/treat/inflectors.rb +50 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
- data/lib/treat/inflectors/declensors/en.rb +18 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
- data/lib/treat/inflectors/stemmers/porter.rb +158 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
- data/lib/treat/inflectors/stemmers/uea.rb +30 -0
- data/lib/treat/lexicalizers.rb +49 -0
- data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
- data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
- data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
- data/lib/treat/lexicalizers/tag/brill.rb +101 -0
- data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
- data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
- data/lib/treat/processors.rb +45 -0
- data/lib/treat/processors/chunkers/txt.rb +27 -0
- data/lib/treat/processors/parsers/enju.rb +214 -0
- data/lib/treat/processors/parsers/stanford.rb +60 -0
- data/lib/treat/processors/segmenters/punkt.rb +48 -0
- data/lib/treat/processors/segmenters/stanford.rb +45 -0
- data/lib/treat/processors/segmenters/tactful.rb +34 -0
- data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
- data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
- data/lib/treat/processors/tokenizers/perl.rb +96 -0
- data/lib/treat/processors/tokenizers/punkt.rb +42 -0
- data/lib/treat/processors/tokenizers/stanford.rb +33 -0
- data/lib/treat/processors/tokenizers/tactful.rb +59 -0
- data/lib/treat/proxies.rb +66 -0
- data/lib/treat/registrable.rb +26 -0
- data/lib/treat/resources.rb +10 -0
- data/lib/treat/resources/categories.rb +18 -0
- data/lib/treat/resources/delegates.rb +96 -0
- data/lib/treat/resources/dependencies.rb +0 -0
- data/lib/treat/resources/edges.rb +8 -0
- data/lib/treat/resources/formats.rb +23 -0
- data/lib/treat/resources/languages.rb +86 -0
- data/lib/treat/resources/languages.txt +504 -0
- data/lib/treat/resources/tags.rb +393 -0
- data/lib/treat/sugar.rb +43 -0
- data/lib/treat/tree.rb +174 -0
- data/lib/treat/utilities.rb +127 -0
- data/lib/treat/visitable.rb +27 -0
- data/test/profile.rb +2 -0
- data/test/tc_detectors.rb +27 -0
- data/test/tc_entity.rb +105 -0
- data/test/tc_extractors.rb +48 -0
- data/test/tc_formatters.rb +46 -0
- data/test/tc_inflectors.rb +39 -0
- data/test/tc_lexicalizers.rb +39 -0
- data/test/tc_processors.rb +36 -0
- data/test/tc_resources.rb +27 -0
- data/test/tc_treat.rb +64 -0
- data/test/tc_tree.rb +60 -0
- data/test/tests.rb +19 -0
- data/test/texts.rb +20 -0
- data/test/texts/english/long.html +24 -0
- data/test/texts/english/long.txt +22 -0
- data/test/texts/english/medium.txt +5 -0
- data/test/texts/english/short.txt +3 -0
- metadata +412 -0
@@ -0,0 +1,393 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Treat
|
4
|
+
module Resources
|
5
|
+
class Tags
|
6
|
+
|
7
|
+
ClawsC5 = 0
|
8
|
+
Brown = 1
|
9
|
+
Penn = 2
|
10
|
+
Enju = 3
|
11
|
+
|
12
|
+
PTBWordTagToCategory = {
|
13
|
+
'CC' => :conjunction, # Coordinating conjunction
|
14
|
+
'CD' => :number, # Cardinal number
|
15
|
+
'DT' => :determiner, # Determiner
|
16
|
+
'DET' => :determiner, # Determiner
|
17
|
+
'EX' => :determiner, # Existential there
|
18
|
+
'FW' => :foreign, # Foreign word
|
19
|
+
'IN' => :preposition, # Preposition or subordinating conjunction
|
20
|
+
'JJ' => :adjective, # Adjective
|
21
|
+
'JJR' => :adjective, # Adjective, comparative
|
22
|
+
'JJS' => :adjective, # Adjective, superlative
|
23
|
+
'LS' => :list, # List item marker
|
24
|
+
'MD' => :modal, # Modal
|
25
|
+
'NN' => :noun, # Noun, singular or mass
|
26
|
+
'NNS' => :noun, # Noun, plural
|
27
|
+
'NNP' => :noun, # Proper noun, singular
|
28
|
+
'NNPS' => :noun, # Proper noun, plural
|
29
|
+
'PDT' => :determiner, # Predeterminer
|
30
|
+
'POS' => :determiner, # Possessive ending
|
31
|
+
'PRP' => :pronoun, # Personal pronoun
|
32
|
+
'PRP$' => :pronoun, # Possessive pronoun,
|
33
|
+
'PRPS' => :determiner, # Possessive determiner
|
34
|
+
'RB' => :adverb, # Adverb
|
35
|
+
'RBR' => :adverb, # Adverb, comparative
|
36
|
+
'RBS' => :adverb, # Adverb, superlative
|
37
|
+
'RP' => :particle, # Particle
|
38
|
+
'SYM' => :symbol, # Symbol
|
39
|
+
'TO' => :to, # to
|
40
|
+
'UH' => :interjection, # Interjection
|
41
|
+
'VB' => :verb, # Verb, base form
|
42
|
+
'VBD' => :verb, # Verb, past tense
|
43
|
+
'VBG' => :verb, # Verb, gerund or present participle
|
44
|
+
'VBN' => :verb, # Verb, past participle
|
45
|
+
'VBP' => :verb, # Verb, non-3rd person singular present
|
46
|
+
'VBZ' => :verb, # Verb, 3rd person singular present
|
47
|
+
'WDT' => :determiner, # Wh-determiner
|
48
|
+
'WP' => :pronoun, # Wh-pronoun
|
49
|
+
'WP$' => :pronoun, # Possessive wh-pronoun
|
50
|
+
'WRB' => :adverb, # Wh-adverb
|
51
|
+
')' => :punctuation, # Right bracket
|
52
|
+
'(' => :punctuation, # Left bracket
|
53
|
+
'.' => :punctuation, # Period
|
54
|
+
'\'\'' => :symbol, # Quote
|
55
|
+
',' => :punctuation,
|
56
|
+
';' => :punctuation
|
57
|
+
}
|
58
|
+
|
59
|
+
PTBClauseTagDescription = [
|
60
|
+
['S', 'Simple declarative clause'],
|
61
|
+
['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
|
62
|
+
['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
|
63
|
+
['SINV', 'Inverted declarative sentence'],
|
64
|
+
['SQ', 'Inverted yes/no question']
|
65
|
+
]
|
66
|
+
|
67
|
+
# Penn Treebank phrase-level tags, as [tag, description] pairs.
PTBPhraseTagDescription = [
  ['ADJP', 'Adjective phrase'],
  ['ADVP', 'Adverb phrase'],
  ['CONJP', 'Conjunction phrase'],
  ['FRAG', 'Fragment'],
  ['INTJ', 'Interjection'],
  ['LST', 'List marker'],
  ['NAC', 'Not a constituent'],
  ['NP', 'Noun phrase'],
  ['NX', 'Head of an NP'],
  ['PP', 'Prepositional phrase'],
  ['PRN', 'Parenthetical'],
  ['PRT', 'Particle'],
  ['QP', 'Quantifier phrase'],
  ['RRC', 'Reduced relative clause'],
  ['UCP', 'Unlike coordinated phrase'],
  ['VP', 'Verb phrase'],
  ['WHADJP', 'Wh-adjective phrase'],
  # Was misspelled 'WHAVP'; the Penn tag (also used by the Enju-to-PTB
  # mapping in this file) is 'WHADVP'.
  ['WHADVP', 'Wh-adverb phrase'],
  ['WHNP', 'Wh-noun phrase'],
  ['WHPP', 'Wh-prepositional phrase'],
  ['X', 'Unknown, uncertain, or unbracketable']
]
|
90
|
+
|
91
|
+
PTBWordTagDescription = [
|
92
|
+
=begin
|
93
|
+
CC - Coordinating conjunction
|
94
|
+
CD - Cardinal number
|
95
|
+
DT - Determiner
|
96
|
+
EX - Existential there
|
97
|
+
FW - Foreign word
|
98
|
+
IN - Preposition or subordinating conjunction
|
99
|
+
JJ - Adjective
|
100
|
+
JJR - Adjective, comparative
|
101
|
+
JJS - Adjective, superlative
|
102
|
+
LS - List item marker
|
103
|
+
MD - Modal
|
104
|
+
NN - Noun, singular or mass
|
105
|
+
NNS - Noun, plural
|
106
|
+
NNP - Proper noun, singular
|
107
|
+
NNPS - Proper noun, plural
|
108
|
+
PDT - Predeterminer
|
109
|
+
POS - Possessive ending
|
110
|
+
PRP - Personal pronoun
|
111
|
+
PRP$ - Possessive pronoun (prolog version PRP-S)
|
112
|
+
RB - Adverb
|
113
|
+
RBR - Adverb, comparative
|
114
|
+
RBS - Adverb, superlative
|
115
|
+
RP - Particle
|
116
|
+
SYM - Symbol
|
117
|
+
TO - to
|
118
|
+
UH - Interjection
|
119
|
+
VB - Verb, base form
|
120
|
+
VBD - Verb, past tense
|
121
|
+
VBG - Verb, gerund or present participle
|
122
|
+
VBN - Verb, past participle
|
123
|
+
VBP - Verb, non-3rd person singular present
|
124
|
+
VBZ - Verb, 3rd person singular present
|
125
|
+
WDT - Wh-determiner
|
126
|
+
WP - Wh-pronoun
|
127
|
+
WP$ - Possessive wh-pronoun (prolog version WP-S)
|
128
|
+
WRB - Wh-adverb
|
129
|
+
|
130
|
+
=end
|
131
|
+
]
|
132
|
+
BrownWordTagDescription = [
|
133
|
+
=begin
|
134
|
+
|
135
|
+
Tag Description Examples
|
136
|
+
|
137
|
+
. sentence closer . ; ? !
|
138
|
+
( left paren
|
139
|
+
) right paren
|
140
|
+
* not, n't
|
141
|
+
-- dash
|
142
|
+
, comma
|
143
|
+
: colon
|
144
|
+
ABL pre-qualifier quite, rather
|
145
|
+
ABN pre-quantifier half, all
|
146
|
+
ABX pre-quantifier both
|
147
|
+
AP post-determiner many, several, next
|
148
|
+
AT article a, the, no
|
149
|
+
BE be
|
150
|
+
BED were
|
151
|
+
BEDZ was
|
152
|
+
BEG being
|
153
|
+
BEM am
|
154
|
+
BEN been
|
155
|
+
BER are, art
|
156
|
+
BEZ is
|
157
|
+
CC coordinating conjunction and, or
|
158
|
+
CD cardinal numeral one, two, 2, etc.
|
159
|
+
CS subordinating conjunction if, although
|
160
|
+
DO do
|
161
|
+
DOD did
|
162
|
+
DOZ does
|
163
|
+
DT singular determiner this, that
|
164
|
+
DTI singular or plural determiner/quantifier some, any
|
165
|
+
DTS plural determiner these, those
|
166
|
+
DTX determiner/double conjunction either
|
167
|
+
EX existential there
|
168
|
+
FW foreign word (hyphenated before regular tag)
|
169
|
+
HL word occurring in headline (hyphenated after regular tag)
|
170
|
+
HV have
|
171
|
+
HVD had (past tense)
|
172
|
+
HVG having
|
173
|
+
HVN had (past participle)
|
174
|
+
HVZ has
|
175
|
+
IN preposition
|
176
|
+
JJ adjective
|
177
|
+
JJR comparative adjective
|
178
|
+
JJS semantically superlative adjective chief, top
|
179
|
+
JJT morphologically superlative adjective biggest
|
180
|
+
MD modal auxiliary can, should, will
|
181
|
+
NC cited word (hyphenated after regular tag)
|
182
|
+
NN singular or mass noun
|
183
|
+
NN$ possessive singular noun
|
184
|
+
NNS plural noun
|
185
|
+
NNS$ possessive plural noun
|
186
|
+
NP proper noun or part of name phrase
|
187
|
+
NP$ possessive proper noun
|
188
|
+
NPS plural proper noun
|
189
|
+
NPS$ possessive plural proper noun
|
190
|
+
NR adverbial noun home, today, west
|
191
|
+
NRS plural adverbial noun
|
192
|
+
OD ordinal numeral first, 2nd
|
193
|
+
PN nominal pronoun everybody, nothing
|
194
|
+
PN$ possessive nominal pronoun
|
195
|
+
PP$ possessive personal pronoun my, our
|
196
|
+
PP$$ second (nominal) possessive pronoun mine, ours
|
197
|
+
PPL singular reflexive/intensive personal pronoun myself
|
198
|
+
PPLS plural reflexive/intensive personal pronoun ourselves
|
199
|
+
PPO objective personal pronoun me, him, it, them
|
200
|
+
PPS 3rd. singular nominative pronoun he, she, it, one
|
201
|
+
PPSS other nominative personal pronoun I, we, they, you
|
202
|
+
QL qualifier very, fairly
|
203
|
+
QLP post-qualifier enough, indeed
|
204
|
+
RB adverb
|
205
|
+
RBR comparative adverb
|
206
|
+
RBT superlative adverb
|
207
|
+
RN nominal adverb here then, indoors
|
208
|
+
RP adverb/particle about, off, up
|
209
|
+
TL word occurring in title (hyphenated after
|
210
|
+
regular tag)
|
211
|
+
TO infinitive marker to
|
212
|
+
UH interjection, exclamation
|
213
|
+
VB verb, base form
|
214
|
+
VBD verb, past tense
|
215
|
+
VBG verb, present participle/gerund
|
216
|
+
VBN verb, past participle
|
217
|
+
VBZ verb, 3rd. singular present
|
218
|
+
WDT wh- determiner what, which
|
219
|
+
WP$ possessive wh- pronoun whose
|
220
|
+
WPO objective wh- pronoun whom, which, that
|
221
|
+
WPS nominative wh- pronoun who, which, that
|
222
|
+
WQL wh- qualifier how
|
223
|
+
WRB wh- adverb how, where, when
|
224
|
+
|
225
|
+
=end
|
226
|
+
]
|
227
|
+
EnjuCatDescription = [
|
228
|
+
['ADJ', 'Adjective'],
|
229
|
+
['ADV', 'Adverb'],
|
230
|
+
['CONJ', 'Coordination conjunction'],
|
231
|
+
['C', 'Complementizer'],
|
232
|
+
['D', 'Determiner'],
|
233
|
+
['N', 'Noun'],
|
234
|
+
['P', 'Preposition'],
|
235
|
+
['SC', 'Subordination conjunction'],
|
236
|
+
['V', 'Verb'],
|
237
|
+
['COOD', 'Part of coordination'],
|
238
|
+
['PN', 'Punctuation'],
|
239
|
+
['PRT', 'Particle'],
|
240
|
+
['S', 'Sentence']
|
241
|
+
]
|
242
|
+
|
243
|
+
# Description of the xcat in the Enju output specification.
|
244
|
+
EnjuXCatDescription = [
|
245
|
+
['COOD', 'Coordinated phrase/clause'],
|
246
|
+
['IMP', 'Imperative sentence'],
|
247
|
+
['INV', 'Subject-verb inversion'],
|
248
|
+
['Q', 'Interrogative sentence with subject-verb inversion'],
|
249
|
+
['REL', 'A relativizer included'],
|
250
|
+
['FREL', 'A free relative included'],
|
251
|
+
['TRACE', 'A trace included'],
|
252
|
+
['WH', 'A wh-question word included']
|
253
|
+
]
|
254
|
+
|
255
|
+
# Maps an Enju (cat, xcat) pair to a Penn Treebank tag.
# Each row is [enju_cat, enju_xcat, ptb_tag]; an empty string
# means "no value" for that column.
EnjuCatXcatToPTB = [
  ['ADJP', '', 'ADJP'],
  ['ADJP', 'REL', 'WHADJP'],
  ['ADJP', 'FREL', 'WHADJP'],
  ['ADJP', 'WH', 'WHADJP'],
  ['ADVP', '', 'ADVP'],
  ['ADVP', 'REL', 'WHADVP'],
  ['ADVP', 'FREL', 'WHADVP'],
  ['ADVP', 'WH', 'WHADVP'],
  ['CONJP', '', 'CONJP'],
  ['CP', '', 'SBAR'],
  ['DP', '', 'NP'],
  ['NP', '', 'NP'],
  ['NX', 'NX', 'NAC'],
  # The next three rows were missing their commas, so Ruby joined the
  # adjacent string literals into a single element (e.g. 'NPRELWHNP').
  ['NP', 'REL', 'WHNP'],
  ['NP', 'FREL', 'WHNP'],
  ['NP', 'WH', 'WHNP'],
  ['PP', '', 'PP'],
  ['PP', 'REL', 'WHPP'],
  ['PP', 'WH', 'WHPP'],
  ['PRT', '', 'PRT'],
  ['S', '', 'S'],
  ['S', 'INV', 'SINV'],
  ['S', 'Q', 'SQ'],
  ['S', 'REL', 'SBAR'],
  ['S', 'FREL', 'SBAR'],
  ['S', 'WH', 'SBARQ'],
  ['SCP', '', 'SBAR'],
  ['VP', '', 'VP'],
  ['VP', '', 'VP'],
  ['', '', 'UK']
]
|
287
|
+
|
288
|
+
# Aligned tags for the Claws C5, Brown and Penn tag sets.
|
289
|
+
# Adapted from Manning, Christopher and Schütze, Hinrich,
|
290
|
+
# 1999. Foundations of Statistical Natural Language
|
291
|
+
# Processing. MIT Press, p. 141-142.
|
292
|
+
# Flat list alternating a description string with a three-element
# array of tags, indexed by the ClawsC5 (0), Brown (1) and Penn (2)
# constants. An empty string means the tag set has no equivalent.
# Two misspelled descriptions ('Puncutation', 'Punctuationm') are
# corrected here.
AlignedWordTags = [
  'Adjective', ['AJ0', 'JJ', 'JJ'],
  'Adjective, ordinal number', ['ORD', 'OD', 'JJ'],
  'Adjective, comparative', ['AJC', 'JJR', 'JJR'],
  'Adjective, superlative', ['AJS', 'JJT', 'JJS'],
  'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ'],
  'Adjective, cardinal number', ['CRD', 'CD', 'CD'],
  'Adjective, cardinal number, one', ['PNI', 'CD', 'CD'],
  'Adverb', ['AV0', 'RB', 'RB'],
  'Adverb, negative', ['XX0', '*', 'RB'],
  'Adverb, comparative', ['AV0', 'RBR', 'RBR'],
  'Adverb, superlative', ['AV0', 'RBT', 'RBS'],
  'Adverb, particle', ['AVP', 'RP', 'RP'],
  'Adverb, question', ['AVQ', 'WRB', 'WRB'],
  'Adverb, degree & question', ['AVQ', 'WQL', 'WRB'],
  'Adverb, degree', ['AV0', 'QL', 'RB'],
  'Adverb, degree, postposed', ['AV0', 'QLP', 'RB'],
  'Adverb, nominal', ['AV0', 'RN', 'RB'],
  'Conjunction, coordination', ['CJC', 'CC', 'CC'],
  'Conjunction, subordination', ['CJS', 'CS', 'IN'],
  'Conjunction, complementizer, that', ['CJT', 'CS', 'IN'],
  'Determiner', ['DT0', 'DT', 'DT'],
  'Determiner, pronoun', ['DT0', 'DTI', 'DT'],
  'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT'],
  'Determiner, prequalifier', ['DT0', 'ABL', 'DT'],
  'Determiner, prequantifier', ['DT0', 'ABN', 'PDT'],
  'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT'],
  'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT'],
  'Determiner, article', ['AT0', 'AT', 'DT'],
  'Determiner, postdeterminer', ['DT0', 'AP', 'JJ'],
  'Determiner, possessive', ['DPS', 'PP$', 'PRP$'],
  'Determiner, possessive, second', ['DPS', 'PP$$', 'PRP'],
  'Determiner, question', ['DTQ', 'WDT', 'WDT'],
  'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$'],
  'Noun', ['NN0', 'NN', 'NN'],
  'Noun, singular', ['NN1', 'NN', 'NN'],
  'Noun, plural', ['NN2', 'NNS', 'NNS'],
  'Noun, proper, singular', ['NP0', 'NP', 'NNP'],
  'Noun, proper, plural', ['NP0', 'NPS', 'NNPS'],
  'Noun, adverbial', ['NN0', 'NR', 'NN'],
  'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS'],
  'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'NN'],
  'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP'],
  'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP'],
  'Pronoun, personal, object', ['PNP', 'PPO', 'PRP'],
  'Pronoun, reflexive', ['PNX', 'PPL', 'PRP'],
  'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP'],
  'Pronoun, question, subject', ['PNQ', 'WPS', 'WP'],
  'Pronoun, question, object', ['PNQ', 'WPO', 'WP'],
  'Pronoun, existential there', ['EX0', 'EX', 'EX'],
  'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP'],
  'Verb, infinitive', ['VVI', 'VB', 'VB'],
  'Verb, past tense', ['VVD', 'VBD', 'VBD'],
  'Verb, present participle', ['VVG', 'VBG', 'VBG'],
  'Verb, past/passive participle', ['VVN', 'VBN', 'VBN'],
  'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ'],
  'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP'],
  'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB'],
  'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD'],
  'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG'],
  'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN'],
  'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ'],
  'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP'],
  'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB'],
  'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD'],
  'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG'],
  'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN'],
  'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ'],
  'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB'],
  'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD'],
  'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD'],
  'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG'],
  'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN'],
  'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ'],
  'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP'],
  'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP'],
  'Verb, modal', ['VM0', 'MD', 'MD'],
  'Preposition, to as infinitive marker', ['TO0', 'TO', 'TO'],
  'Preposition, to', ['PRP', 'IN', 'TO'],
  'Preposition', ['PRP', 'IN', 'IN'],
  'Preposition, of', ['PRF', 'IN', 'IN'],
  'Possessive', ['POS', '$', 'POS'],
  'Interjection (or other isolate)', ['ITJ', 'UH', 'UH'],
  'Punctuation, sentence ender', ['PUN', '.', '.'],
  'Punctuation, semicolon', ['PUN', '.', '.'],
  'Punctuation, colon or ellipsis', ['PUN', ':', ':'],
  'Punctuation, comma', ['PUN', ',', ','],
  'Punctuation, dash', ['PUN', '-', '-'],
  'Punctuation, dollar sign', ['PUN', '', '$'],
  'Punctuation, left bracket', ['PUL', '(', '('],
  'Punctuation, right bracket', ['PUR', ')', ')'],
  'Punctuation, quotation mark, left', ['PUQ', '', '``'],
  'Punctuation, quotation mark, right', ['PUQ', '', '"'],
  'Unknown, foreign words (not in English lexicon)', ['UNZ', '(FW-)', 'FW'],
  'Symbol', ['', '', 'SYM'],
  'Symbol, alphabetical', ['ZZ0', '', ''],
  'Symbol, list item', ['', '', 'LS']
]
|
390
|
+
|
391
|
+
end
|
392
|
+
end
|
393
|
+
end
|
data/lib/treat/sugar.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
module Treat
|
2
|
+
# Optional syntactic sugar: defines, on Object, one builder method per
# entity class listed by Treat::Entities.list, so that calling e.g.
# Word('foo') delegates to the entity class's +build+ method.
module Sugar
  # Syntactic sugar is disabled by default.
  @@edulcorated = false

  # Enable the sugar. For every entity class except Symbol, define a
  # method on Object, named after the class, that forwards
  # (value, id) to klass.build. Does nothing if already enabled.
  def edulcorate
    return if @@edulcorated
    @@edulcorated = true
    each_entity_class do |type, klass|
      next if type == :Symbol
      Object.class_eval do
        define_method(type) do |value = '', id = nil|
          klass.build(value, id)
        end
      end
    end
  end

  # Disable the sugar by removing the builder methods that
  # +edulcorate+ defined on Object. Does nothing if not enabled.
  def unedulcorate
    return unless @@edulcorated
    @@edulcorated = false
    each_entity_class do |type, _klass|
      next if type == :Symbol
      Object.class_eval do
        remove_method(type)
      end
    end
  end

  # Whether syntactic sugar is enabled or not.
  def edulcorated?; @@edulcorated; end

  private

  # Yield (type, klass) for every entry of Treat::Entities.list, where
  # type is the symbol produced by the +cc+ helper (defined outside
  # this module) and klass is the constant of that name looked up
  # directly on Treat::Entities.
  def each_entity_class
    Treat::Entities.list.each do |entity_type|
      type = :"#{cc(entity_type)}"
      # The original passed the still-unassigned local +klass+ (nil) as
      # the +inherit+ argument; +false+ states the same lookup explicitly
      # (no search through ancestors or Object).
      klass = Treat::Entities.const_get(type, false)
      yield type, klass
    end
  end
end
|
43
|
+
end
|
data/lib/treat/tree.rb
ADDED
@@ -0,0 +1,174 @@
|
|
1
|
+
module Treat
|
2
|
+
# This module provides an abstract tree structure with
|
3
|
+
# nodes having an id, a value, children, features and edges.
|
4
|
+
module Tree
|
5
|
+
# This class models the nodes for an N-ary tree data structure
|
6
|
+
# with unique identifiers, text value, children, features
|
7
|
+
# (annotations) and edges.
|
8
|
+
#
|
9
|
+
# This class was tightly based on the 'rubytree' gem.
|
10
|
+
# RubyTree is licensed under the BSD license and can
|
11
|
+
# be found at http://rubytree.rubyforge.org/rdoc/.
|
12
|
+
# I have made several modifications in order to better
|
13
|
+
# suit this library and to avoid monkey patching.
|
14
|
+
class Node
  # A string containing the node's value (or empty).
  attr_accessor :value
  # A unique identifier for the node.
  attr_reader :id
  # An array containing the children of this node.
  attr_reader :children
  # A hash containing the features of this node.
  attr_accessor :features
  # A hash containing the edges that link this
  # node to other nodes (target id => edge type).
  attr_accessor :edges
  # The parent of the node (nil for a root).
  attr_accessor :parent

  # Initialize the node with its value and id.
  # Setup containers for the children, features
  # and edges of this node.
  def initialize(value, id = nil)
    @parent = nil
    @value, @id = value, id
    @children = []
    @children_hash = {}
    @features = {}
    @edges = {}
  end

  # Iterate over each child of the node. Without a block,
  # return an Enumerator over the children.
  def each
    return enum_for(:each) unless block_given?
    @children.each { |child| yield child }
  end

  # Boolean - does the node have edges?
  def has_edges?; !@edges.empty?; end
  # Boolean - does the node have children?
  def has_children?; !@children.empty?; end
  # Boolean - does the node have features?
  def has_features?; !@features.empty?; end
  # Boolean - does the node have a parent?
  def has_parent?; !@parent.nil?; end
  # Boolean - does the node not have a parent?
  def is_root?; @parent.nil?; end
  # Clear this node's parent pointer and return self. Note that
  # this does not touch the former parent's list of children.
  def set_as_root!; @parent = nil; self; end
  # Boolean - is this node a leaf ?
  # This is overridden in leaf classes.
  def is_leaf?; !has_children?; end

  # Add the given node(s) as children of this node.
  # This may be used with several nodes,
  # for example: node << [child1, child2, child3]
  # Returns the first node added. Raises if any node is nil.
  def <<(nodes)
    nodes = [nodes] unless nodes.is_a? Array
    raise 'Trying to add a nil node.' if nodes.include? nil
    nodes.each do |node|
      node.parent = self
      @children << node
      @children_hash[node.id] = node
    end
    nodes[0]
  end

  # Retrieve a child either by position or by id.
  # An Integer below 1000 is treated as a position in the children
  # array; anything else is looked up as a child id. The 1000 cut-off
  # is a heuristic inherited from the original code (marked "Fix"
  # there), so small integer ids cannot be looked up through here.
  # Raises Treat::Exception when given nil.
  def [](name_or_index)
    if name_or_index.nil?
      raise Treat::Exception,
      "Non-nil name or index needs to be provided."
    end
    if name_or_index.kind_of?(Integer) &&
      name_or_index < 1000
      @children[name_or_index]
    else
      @children_hash[name_or_index]
    end
  end

  # Remove a child, given either the node itself or its id.
  # The removed node has its parent pointer cleared and is
  # returned; nil is returned when there is nothing to remove.
  def remove!(ion)
    return nil unless ion
    node = ion.is_a?(Node) ? ion : @children_hash[ion]
    return nil unless node
    @children.delete(node)
    @children_hash.delete(node.id)
    # The original cleared the parent only when given a node; removal
    # by id left the child pointing at its former parent.
    node.set_as_root!
  end

  # Remove all children, clearing their parent pointers.
  # Returns self.
  def remove_all!
    @children.each { |child| child.set_as_root! }
    @children.clear
    @children_hash.clear
    self
  end

  # Next sibling from the same parent, or nil if this node
  # is a root or the last child.
  def next_sibling
    return nil if is_root?
    index = @parent.children.index(self)
    @parent.children.at(index + 1) if index
  end

  # The n-th sibling to the left (nil if there is none).
  def left(n = 1); sibling(-1*n); end
  # The n-th sibling to the right (nil if there is none).
  def right(n = 1); sibling(1*n); end

  # Sibling at the given offset relative to this node among its
  # parent's children; nil for a root or when the offset falls
  # outside the list.
  def sibling(pos)
    return nil if is_root?
    index = @parent.children.index(self)
    return nil unless index
    target = index + pos
    # Array#at counts negative indices from the end; without this
    # guard the first child's left sibling would be the last child.
    return nil if target < 0
    @parent.children.at(target)
  end

  # All other children of this node's parent
  # (an empty array for a root).
  def siblings
    return [] if is_root?
    others = @parent.children.dup
    others.delete(self)
    others
  end

  # Total number of nodes in the subtree, including this one.
  def size
    @children.inject(1) { |sum, node| sum + node.size }
  end

  # Set the feature to the supplied value.
  def set(feature, value)
    @features ||= {}
    @features[feature] = value
  end

  # Return the depth of this node in the tree (0 for a root).
  def depth
    return 0 if is_root?
    1 + parent.depth
  end

  # Does the node have the feature? :value always counts.
  def has_feature?(feature)
    @features.has_key?(feature) ||
    feature == :value
  end

  alias :has? :has_feature?

  # Link this node to the target node with the supplied edge type.
  # The target may be given as an id (stored as is) or as a node,
  # in which case it must be the root of this node's tree or one of
  # the root's descendants; otherwise no edge is recorded.
  # Returns the edge type if an edge was stored, nil otherwise.
  def associate(id_or_node, edge_type = nil)
    if id_or_node.is_a? Node
      top = root
      # find only searches below the receiver, so test the root itself.
      target = top.id == id_or_node.id ? top : top.find(id_or_node)
      id = target && target.id
    else
      id = id_or_node
    end
    @edges[id] = edge_type if id
  end

  # Find the descendant of this node with the given id (or with the
  # id of the given node). Direct children are checked first, then
  # each child's subtree in order. Returns nil when nothing matches.
  def find(id_or_node)
    # Test against Node rather than self.class so that a node of a
    # different subclass is still recognised as a node.
    id = id_or_node.is_a?(Node) ? id_or_node.id : id_or_node
    direct = @children_hash[id]
    return direct if direct
    each do |child|
      found = child.find(id)
      return found if found
    end
    nil
  end

  # Find the root of the tree within which
  # this node is contained.
  def root
    return self if !has_parent?
    ancestor = @parent
    while ancestor.has_parent?
      ancestor = ancestor.parent
    end
    ancestor
  end
end
|
173
|
+
end
|
174
|
+
end
|