treat 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +0 -0
- data/LICENSE +28 -0
- data/README +0 -0
- data/TODO +67 -0
- data/bin/INFO +1 -0
- data/examples/benchmark.rb +81 -0
- data/examples/keywords.rb +60 -0
- data/examples/texts/bugged_out.txt +26 -0
- data/examples/texts/half_cocked_basel.txt +16 -0
- data/examples/texts/hedge_funds.txt +24 -0
- data/examples/texts/hose_and_dry.txt +19 -0
- data/examples/texts/hungarys_troubles.txt +46 -0
- data/examples/texts/indias_slowdown.txt +15 -0
- data/examples/texts/merkozy_rides_again.txt +24 -0
- data/examples/texts/prada_is_not_walmart.txt +9 -0
- data/examples/texts/republican_nomination.txt +26 -0
- data/examples/texts/to_infinity_and_beyond.txt +15 -0
- data/lib/treat.rb +91 -0
- data/lib/treat/buildable.rb +115 -0
- data/lib/treat/categories.rb +29 -0
- data/lib/treat/category.rb +28 -0
- data/lib/treat/delegatable.rb +90 -0
- data/lib/treat/detectors.rb +28 -0
- data/lib/treat/detectors/encoding/native.rb +12 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
- data/lib/treat/detectors/format/file.rb +36 -0
- data/lib/treat/detectors/language/language_detector.rb +19 -0
- data/lib/treat/detectors/language/what_language.rb +29 -0
- data/lib/treat/entities.rb +52 -0
- data/lib/treat/entities/collection.rb +19 -0
- data/lib/treat/entities/constituents.rb +15 -0
- data/lib/treat/entities/document.rb +11 -0
- data/lib/treat/entities/entity.rb +242 -0
- data/lib/treat/entities/sentence.rb +8 -0
- data/lib/treat/entities/text.rb +7 -0
- data/lib/treat/entities/tokens.rb +37 -0
- data/lib/treat/entities/zones.rb +17 -0
- data/lib/treat/exception.rb +5 -0
- data/lib/treat/extractors.rb +41 -0
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
- data/lib/treat/extractors/named_entity/abner.rb +20 -0
- data/lib/treat/extractors/named_entity/stanford.rb +174 -0
- data/lib/treat/extractors/statistics/frequency.rb +22 -0
- data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
- data/lib/treat/extractors/statistics/position_in.rb +13 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
- data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
- data/lib/treat/extractors/time/chronic.rb +12 -0
- data/lib/treat/extractors/time/native.rb +12 -0
- data/lib/treat/extractors/time/nickel.rb +45 -0
- data/lib/treat/extractors/topic_words/lda.rb +71 -0
- data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
- data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
- data/lib/treat/extractors/topics/reuters.rb +91 -0
- data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
- data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
- data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
- data/lib/treat/feature.rb +53 -0
- data/lib/treat/formatters.rb +44 -0
- data/lib/treat/formatters/cleaners/html.rb +17 -0
- data/lib/treat/formatters/readers/autoselect.rb +35 -0
- data/lib/treat/formatters/readers/gocr.rb +24 -0
- data/lib/treat/formatters/readers/html.rb +13 -0
- data/lib/treat/formatters/readers/ocropus.rb +31 -0
- data/lib/treat/formatters/readers/pdf.rb +17 -0
- data/lib/treat/formatters/readers/txt.rb +15 -0
- data/lib/treat/formatters/serializers/xml.rb +48 -0
- data/lib/treat/formatters/serializers/yaml.rb +15 -0
- data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
- data/lib/treat/formatters/unserializers/xml.rb +79 -0
- data/lib/treat/formatters/unserializers/yaml.rb +15 -0
- data/lib/treat/formatters/visualizers/dot.rb +73 -0
- data/lib/treat/formatters/visualizers/html.rb +12 -0
- data/lib/treat/formatters/visualizers/inspect.rb +16 -0
- data/lib/treat/formatters/visualizers/short_value.rb +14 -0
- data/lib/treat/formatters/visualizers/standoff.rb +41 -0
- data/lib/treat/formatters/visualizers/tree.rb +28 -0
- data/lib/treat/formatters/visualizers/txt.rb +31 -0
- data/lib/treat/group.rb +96 -0
- data/lib/treat/inflectors.rb +50 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
- data/lib/treat/inflectors/declensors/en.rb +18 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
- data/lib/treat/inflectors/stemmers/porter.rb +158 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
- data/lib/treat/inflectors/stemmers/uea.rb +30 -0
- data/lib/treat/lexicalizers.rb +49 -0
- data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
- data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
- data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
- data/lib/treat/lexicalizers/tag/brill.rb +101 -0
- data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
- data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
- data/lib/treat/processors.rb +45 -0
- data/lib/treat/processors/chunkers/txt.rb +27 -0
- data/lib/treat/processors/parsers/enju.rb +214 -0
- data/lib/treat/processors/parsers/stanford.rb +60 -0
- data/lib/treat/processors/segmenters/punkt.rb +48 -0
- data/lib/treat/processors/segmenters/stanford.rb +45 -0
- data/lib/treat/processors/segmenters/tactful.rb +34 -0
- data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
- data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
- data/lib/treat/processors/tokenizers/perl.rb +96 -0
- data/lib/treat/processors/tokenizers/punkt.rb +42 -0
- data/lib/treat/processors/tokenizers/stanford.rb +33 -0
- data/lib/treat/processors/tokenizers/tactful.rb +59 -0
- data/lib/treat/proxies.rb +66 -0
- data/lib/treat/registrable.rb +26 -0
- data/lib/treat/resources.rb +10 -0
- data/lib/treat/resources/categories.rb +18 -0
- data/lib/treat/resources/delegates.rb +96 -0
- data/lib/treat/resources/dependencies.rb +0 -0
- data/lib/treat/resources/edges.rb +8 -0
- data/lib/treat/resources/formats.rb +23 -0
- data/lib/treat/resources/languages.rb +86 -0
- data/lib/treat/resources/languages.txt +504 -0
- data/lib/treat/resources/tags.rb +393 -0
- data/lib/treat/sugar.rb +43 -0
- data/lib/treat/tree.rb +174 -0
- data/lib/treat/utilities.rb +127 -0
- data/lib/treat/visitable.rb +27 -0
- data/test/profile.rb +2 -0
- data/test/tc_detectors.rb +27 -0
- data/test/tc_entity.rb +105 -0
- data/test/tc_extractors.rb +48 -0
- data/test/tc_formatters.rb +46 -0
- data/test/tc_inflectors.rb +39 -0
- data/test/tc_lexicalizers.rb +39 -0
- data/test/tc_processors.rb +36 -0
- data/test/tc_resources.rb +27 -0
- data/test/tc_treat.rb +64 -0
- data/test/tc_tree.rb +60 -0
- data/test/tests.rb +19 -0
- data/test/texts.rb +20 -0
- data/test/texts/english/long.html +24 -0
- data/test/texts/english/long.txt +22 -0
- data/test/texts/english/medium.txt +5 -0
- data/test/texts/english/short.txt +3 -0
- metadata +412 -0
@@ -0,0 +1,393 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module Treat
  module Resources
    # Static lookup tables for the tag sets used in this library:
    # Penn Treebank (PTB) word, phrase and clause tags, Brown and
    # Claws C5 word tags, and the categories output by Enju.
    class Tags

      # Identifiers for the supported tag sets. The values 0..2
      # match the column order (Claws C5, Brown, Penn) of the tag
      # triples in AlignedWordTags.
      # NOTE(review): Enju = 3 has no column in that table; its use
      # is not visible in this file.
      ClawsC5 = 0
      Brown = 1
      Penn = 2
      Enju = 3

      # Maps a Penn Treebank word tag to a coarse word category.
      PTBWordTagToCategory = {
        'CC' => :conjunction,   # Coordinating conjunction
        'CD' => :number,        # Cardinal number
        'DT' => :determiner,    # Determiner
        'DET' => :determiner,   # Determiner
        'EX' => :determiner,    # Existential there
        'FW' => :foreign,       # Foreign word
        'IN' => :preposition,   # Preposition or subordinating conjunction
        'JJ' => :adjective,     # Adjective
        'JJR' => :adjective,    # Adjective, comparative
        'JJS' => :adjective,    # Adjective, superlative
        'LS' => :list,          # List item marker
        'MD' => :modal,         # Modal
        'NN' => :noun,          # Noun, singular or mass
        'NNS' => :noun,         # Noun, plural
        'NNP' => :noun,         # Proper noun, singular
        'NNPS' => :noun,        # Proper noun, plural
        'PDT' => :determiner,   # Predeterminer
        'POS' => :determiner,   # Possessive ending
        'PRP' => :pronoun,      # Personal pronoun
        'PRP$' => :pronoun,     # Possessive pronoun
        'PRPS' => :determiner,  # Possessive determiner
        'RB' => :adverb,        # Adverb
        'RBR' => :adverb,       # Adverb, comparative
        'RBS' => :adverb,       # Adverb, superlative
        'RP' => :particle,      # Particle
        'SYM' => :symbol,       # Symbol
        'TO' => :to,            # to
        'UH' => :interjection,  # Interjection
        'VB' => :verb,          # Verb, base form
        'VBD' => :verb,         # Verb, past tense
        'VBG' => :verb,         # Verb, gerund or present participle
        'VBN' => :verb,         # Verb, past participle
        'VBP' => :verb,         # Verb, non-3rd person singular present
        'VBZ' => :verb,         # Verb, 3rd person singular present
        'WDT' => :determiner,   # Wh-determiner
        'WP' => :pronoun,       # Wh-pronoun
        'WP$' => :pronoun,      # Possessive wh-pronoun
        'WRB' => :adverb,       # Wh-adverb
        ')' => :punctuation,    # Right bracket
        '(' => :punctuation,    # Left bracket
        '.' => :punctuation,    # Period
        '\'\'' => :symbol,      # Quote
        ',' => :punctuation,
        ';' => :punctuation
      }

      # [tag, description] pairs for the Penn Treebank clause tags.
      PTBClauseTagDescription = [
        ['S', 'Simple declarative clause'],
        ['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
        ['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
        ['SINV', 'Inverted declarative sentence'],
        ['SQ', 'Inverted yes/no question']
      ]

      # [tag, description] pairs for the Penn Treebank phrase tags.
      PTBPhraseTagDescription = [
        ['ADJP', 'Adjective phrase'],
        ['ADVP', 'Adverb phrase'],
        ['CONJP', 'Conjunction phrase'],
        ['FRAG', 'Fragment'],
        ['INTJ', 'Interjection'],
        ['LST', 'List marker'],
        ['NAC', 'Not a constituent'],
        ['NP', 'Noun phrase'],
        ['NX', 'Head of an NP'],
        ['PP', 'Prepositional phrase'],
        ['PRN', 'Parenthetical'],
        ['PRT', 'Particle'],
        ['QP', 'Quantifier phrase'],
        ['RRC', 'Reduced relative clause'],
        ['UCP', 'Unlike coordinated phrase'],
        ['VP', 'Verb phrase'],
        ['WHADJP', 'Wh-adjective phrase'],
        # Was spelled 'WHAVP'; EnjuCatXcatToPTB below uses 'WHADVP'.
        ['WHADVP', 'Wh-adverb phrase'],
        ['WHNP', 'Wh-noun phrase'],
        ['WHPP', 'Wh-prepositional phrase'],
        ['X', 'Unknown, uncertain, or unbracketable']
      ]

      # Penn Treebank word tag descriptions. The table has not been
      # converted to data yet, so the array is intentionally empty;
      # the reference list is kept as comments.
      PTBWordTagDescription = [
        # CC - Coordinating conjunction
        # CD - Cardinal number
        # DT - Determiner
        # EX - Existential there
        # FW - Foreign word
        # IN - Preposition or subordinating conjunction
        # JJ - Adjective
        # JJR - Adjective, comparative
        # JJS - Adjective, superlative
        # LS - List item marker
        # MD - Modal
        # NN - Noun, singular or mass
        # NNS - Noun, plural
        # NNP - Proper noun, singular
        # NNPS - Proper noun, plural
        # PDT - Predeterminer
        # POS - Possessive ending
        # PRP - Personal pronoun
        # PRP$ - Possessive pronoun (prolog version PRP-S)
        # RB - Adverb
        # RBR - Adverb, comparative
        # RBS - Adverb, superlative
        # RP - Particle
        # SYM - Symbol
        # TO - to
        # UH - Interjection
        # VB - Verb, base form
        # VBD - Verb, past tense
        # VBG - Verb, gerund or present participle
        # VBN - Verb, past participle
        # VBP - Verb, non-3rd person singular present
        # VBZ - Verb, 3rd person singular present
        # WDT - Wh-determiner
        # WP - Wh-pronoun
        # WP$ - Possessive wh-pronoun (prolog version WP-S)
        # WRB - Wh-adverb
      ]

      # Brown word tag descriptions. As above, the array is
      # intentionally empty and the reference table is kept as
      # comments (columns: tag, description, examples).
      BrownWordTagDescription = [
        # Tag Description Examples
        #
        # . sentence closer . ; ? !
        # ( left paren
        # ) right paren
        # * not, n't
        # -- dash
        # , comma
        # : colon
        # ABL pre-qualifier quite, rather
        # ABN pre-quantifier half, all
        # ABX pre-quantifier both
        # AP post-determiner many, several, next
        # AT article a, the, no
        # BE be
        # BED were
        # BEDZ was
        # BEG being
        # BEM am
        # BEN been
        # BER are, art
        # BEZ is
        # CC coordinating conjunction and, or
        # CD cardinal numeral one, two, 2, etc.
        # CS subordinating conjunction if, although
        # DO do
        # DOD did
        # DOZ does
        # DT singular determiner this, that
        # DTI singular or plural determiner/quantifier some, any
        # DTS plural determiner these, those
        # DTX determiner/double conjunction either
        # EX existential there
        # FW foreign word (hyphenated before regular tag)
        # HL word occurring in headline (hyphenated after regular tag)
        # HV have
        # HVD had (past tense)
        # HVG having
        # HVN had (past participle)
        # HVZ has
        # IN preposition
        # JJ adjective
        # JJR comparative adjective
        # JJS semantically superlative adjective chief, top
        # JJT morphologically superlative adjective biggest
        # MD modal auxiliary can, should, will
        # NC cited word (hyphenated after regular tag)
        # NN singular or mass noun
        # NN$ possessive singular noun
        # NNS plural noun
        # NNS$ possessive plural noun
        # NP proper noun or part of name phrase
        # NP$ possessive proper noun
        # NPS plural proper noun
        # NPS$ possessive plural proper noun
        # NR adverbial noun home, today, west
        # NRS plural adverbial noun
        # OD ordinal numeral first, 2nd
        # PN nominal pronoun everybody, nothing
        # PN$ possessive nominal pronoun
        # PP$ possessive personal pronoun my, our
        # PP$$ second (nominal) possessive pronoun mine, ours
        # PPL singular reflexive/intensive personal pronoun myself
        # PPLS plural reflexive/intensive personal pronoun ourselves
        # PPO objective personal pronoun me, him, it, them
        # PPS 3rd. singular nominative pronoun he, she, it, one
        # PPSS other nominative personal pronoun I, we, they, you
        # QL qualifier very, fairly
        # QLP post-qualifier enough, indeed
        # RB adverb
        # RBR comparative adverb
        # RBT superlative adverb
        # RN nominal adverb here then, indoors
        # RP adverb/particle about, off, up
        # TL word occurring in title (hyphenated after regular tag)
        # TO infinitive marker to
        # UH interjection, exclamation
        # VB verb, base form
        # VBD verb, past tense
        # VBG verb, present participle/gerund
        # VBN verb, past participle
        # VBZ verb, 3rd. singular present
        # WDT wh- determiner what, which
        # WP$ possessive wh- pronoun whose
        # WPO objective wh- pronoun whom, which, that
        # WPS nominative wh- pronoun who, which, that
        # WQL wh- qualifier how
        # WRB wh- adverb how, where, when
      ]

      # [cat, description] pairs for the Enju categories.
      EnjuCatDescription = [
        ['ADJ', 'Adjective'],
        ['ADV', 'Adverb'],
        ['CONJ', 'Coordination conjunction'],
        ['C', 'Complementizer'],
        ['D', 'Determiner'],
        ['N', 'Noun'],
        ['P', 'Preposition'],
        ['SC', 'Subordination conjunction'],
        ['V', 'Verb'],
        ['COOD', 'Part of coordination'],
        ['PN', 'Punctuation'],
        ['PRT', 'Particle'],
        ['S', 'Sentence']
      ]

      # Description of the xcat in the Enju output specification.
      EnjuXCatDescription = [
        ['COOD', 'Coordinated phrase/clause'],
        ['IMP', 'Imperative sentence'],
        ['INV', 'Subject-verb inversion'],
        ['Q', 'Interrogative sentence with subject-verb inversion'],
        ['REL', 'A relativizer included'],
        ['FREL', 'A free relative included'],
        ['TRACE', 'A trace included'],
        ['WH', 'A wh-question word included']
      ]

      # Rows of [Enju cat, Enju xcat, PTB tag]. Every row must
      # have exactly three elements.
      EnjuCatXcatToPTB = [
        ['ADJP', '', 'ADJP'],
        ['ADJP', 'REL', 'WHADJP'],
        ['ADJP', 'FREL', 'WHADJP'],
        ['ADJP', 'WH', 'WHADJP'],
        ['ADVP', '', 'ADVP'],
        ['ADVP', 'REL', 'WHADVP'],
        ['ADVP', 'FREL', 'WHADVP'],
        ['ADVP', 'WH', 'WHADVP'],
        ['CONJP', '', 'CONJP'],
        ['CP', '', 'SBAR'],
        ['DP', '', 'NP'],
        ['NP', '', 'NP'],
        ['NX', 'NX', 'NAC'],
        # The three NP rows below were missing their commas, which
        # made Ruby concatenate the adjacent string literals into a
        # single element (e.g. ['NPRELWHNP']).
        ['NP', 'REL', 'WHNP'],
        ['NP', 'FREL', 'WHNP'],
        ['NP', 'WH', 'WHNP'],
        ['PP', '', 'PP'],
        ['PP', 'REL', 'WHPP'],
        ['PP', 'WH', 'WHPP'],
        ['PRT', '', 'PRT'],
        ['S', '', 'S'],
        ['S', 'INV', 'SINV'],
        ['S', 'Q', 'SQ'],
        ['S', 'REL', 'SBAR'],
        ['S', 'FREL', 'SBAR'],
        ['S', 'WH', 'SBARQ'],
        ['SCP', '', 'SBAR'],
        ['VP', '', 'VP'],
        # NOTE(review): exact duplicate of the row above; possibly a
        # different Enju category was intended - confirm against
        # the Enju output specification.
        ['VP', '', 'VP'],
        ['', '', 'UK']
      ]

      # Aligned tags for the Claws C5, Brown and Penn tag sets.
      # Adapted from Manning, Christopher and Schütze, Hinrich,
      # 1999. Foundations of Statistical Natural Language
      # Processing. MIT Press, p. 141-142.
      #
      # The array is flat: a description string followed by its
      # [Claws C5, Brown, Penn] tag triple, repeated.
      AlignedWordTags = [
        'Adjective', ['AJ0', 'JJ', 'JJ'],
        'Adjective, ordinal number', ['ORD', 'OD', 'JJ'],
        'Adjective, comparative', ['AJC', 'JJR', 'JJR'],
        'Adjective, superlative', ['AJS', 'JJT', 'JJS'],
        'Adjective, superlative, semantically', ['AJ0', 'JJS', 'JJ'],
        'Adjective, cardinal number', ['CRD', 'CD', 'CD'],
        'Adjective, cardinal number, one', ['PNI', 'CD', 'CD'],
        'Adverb', ['AV0', 'RB', 'RB'],
        'Adverb, negative', ['XX0', '*', 'RB'],
        'Adverb, comparative', ['AV0', 'RBR', 'RBR'],
        'Adverb, superlative', ['AV0', 'RBT', 'RBS'],
        'Adverb, particle', ['AVP', 'RP', 'RP'],
        'Adverb, question', ['AVQ', 'WRB', 'WRB'],
        'Adverb, degree & question', ['AVQ', 'WQL', 'WRB'],
        'Adverb, degree', ['AV0', 'QL', 'RB'],
        'Adverb, degree, postposed', ['AV0', 'QLP', 'RB'],
        'Adverb, nominal', ['AV0', 'RN', 'RB'],
        'Conjunction, coordination', ['CJC', 'CC', 'CC'],
        'Conjunction, subordination', ['CJS', 'CS', 'IN'],
        'Conjunction, complementizer, that', ['CJT', 'CS', 'IN'],
        'Determiner', ['DT0', 'DT', 'DT'],
        'Determiner, pronoun', ['DT0', 'DTI', 'DT'],
        'Determiner, pronoun, plural', ['DT0', 'DTS', 'DT'],
        'Determiner, prequalifier', ['DT0', 'ABL', 'DT'],
        'Determiner, prequantifier', ['DT0', 'ABN', 'PDT'],
        'Determiner, pronoun or double conjunction', ['DT0', 'ABX', 'PDT'],
        'Determiner, pronoun or double conjunction', ['DT0', 'DTX', 'DT'],
        'Determiner, article', ['AT0', 'AT', 'DT'],
        'Determiner, postdeterminer', ['DT0', 'AP', 'JJ'],
        'Determiner, possessive', ['DPS', 'PP$', 'PRP$'],
        'Determiner, possessive, second', ['DPS', 'PP$$', 'PRP'],
        'Determiner, question', ['DTQ', 'WDT', 'WDT'],
        'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$'],
        'Noun', ['NN0', 'NN', 'NN'],
        'Noun, singular', ['NN1', 'NN', 'NN'],
        'Noun, plural', ['NN2', 'NNS', 'NNS'],
        'Noun, proper, singular', ['NP0', 'NP', 'NNP'],
        'Noun, proper, plural', ['NP0', 'NPS', 'NNPS'],
        'Noun, adverbial', ['NN0', 'NR', 'NN'],
        'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS'],
        'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'NN'],
        'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP'],
        'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP'],
        'Pronoun, personal, object', ['PNP', 'PPO', 'PRP'],
        'Pronoun, reflexive', ['PNX', 'PPL', 'PRP'],
        'Pronoun, reflexive, plural', ['PNX', 'PPLS', 'PRP'],
        'Pronoun, question, subject', ['PNQ', 'WPS', 'WP'],
        'Pronoun, question, object', ['PNQ', 'WPO', 'WP'],
        'Pronoun, existential there', ['EX0', 'EX', 'EX'],
        'Verb, base present form (not infinitive)', ['VVB', 'VB', 'VBP'],
        'Verb, infinitive', ['VVI', 'VB', 'VB'],
        'Verb, past tense', ['VVD', 'VBD', 'VBD'],
        'Verb, present participle', ['VVG', 'VBG', 'VBG'],
        'Verb, past/passive participle', ['VVN', 'VBN', 'VBN'],
        'Verb, present, 3SG, -s form', ['VVZ', 'VBZ', 'VBZ'],
        'Verb, auxiliary do, base', ['VDB', 'DO', 'VBP'],
        'Verb, auxiliary do, infinitive', ['VDB', 'DO', 'VB'],
        'Verb, auxiliary do, past', ['VDD', 'DOD', 'VBD'],
        'Verb, auxiliary do, present participle', ['VDG', 'VBG', 'VBG'],
        'Verb, auxiliary do, past participle', ['VDN', 'VBN', 'VBN'],
        'Verb, auxiliary do, present 3SG', ['VDZ', 'DOZ', 'VBZ'],
        'Verb, auxiliary have, base', ['VHB', 'HV', 'VBP'],
        'Verb, auxiliary have, infinitive', ['VHI', 'HV', 'VB'],
        'Verb, auxiliary have, past', ['VHD', 'HVD', 'VBD'],
        'Verb, auxiliary have, present participle', ['VHG', 'HVG', 'VBG'],
        'Verb, auxiliary have, past participle', ['VHN', 'HVN', 'VBN'],
        'Verb, auxiliary have, present 3SG', ['VHZ', 'HVZ', 'VBZ'],
        'Verb, auxiliary be, infinitive', ['VBI', 'BE', 'VB'],
        'Verb, auxiliary be, past', ['VBD', 'BED', 'VBD'],
        'Verb, auxiliary be, past, 3SG', ['VBD', 'BEDZ', 'VBD'],
        'Verb, auxiliary be, present participle', ['VBG', 'BEG', 'VBG'],
        'Verb, auxiliary be, past participle', ['VBN', 'BEN', 'VBN'],
        'Verb, auxiliary be, present, 3SG', ['VBZ', 'BEZ', 'VBZ'],
        'Verb, auxiliary be, present, 1SG', ['VBB', 'BEM', 'VBP'],
        'Verb, auxiliary be, present', ['VBB', 'BER', 'VBP'],
        'Verb, modal', ['VM0', 'MD', 'MD'],
        'Preposition, to as infinitive marker', ['TO0', 'TO', 'TO'],
        'Preposition, to', ['PRP', 'IN', 'TO'],
        'Preposition', ['PRP', 'IN', 'IN'],
        'Preposition, of', ['PRF', 'IN', 'IN'],
        'Possessive', ['POS', '$', 'POS'],
        'Interjection (or other isolate)', ['ITJ', 'UH', 'UH'],
        'Punctuation, sentence ender', ['PUN', '.', '.'],
        'Punctuation, semicolon', ['PUN', '.', '.'],
        'Punctuation, colon or ellipsis', ['PUN', ':', ':'],
        'Punctuation, comma', ['PUN', ',', ','],
        'Punctuation, dash', ['PUN', '-', '-'],
        'Punctuation, dollar sign', ['PUN', '', '$'],
        'Punctuation, left bracket', ['PUL', '(', '('],
        'Punctuation, right bracket', ['PUR', ')', ')'],
        'Punctuation, quotation mark, left', ['PUQ', '', '``'],
        'Punctuation, quotation mark, right', ['PUQ', '', '"'],
        'Unknown, foreign words (not in English lexicon)', ['UNZ', '(FW-)', 'FW'],
        'Symbol', ['', '', 'SYM'],
        'Symbol, alphabetical', ['ZZ0', '', ''],
        'Symbol, list item', ['', '', 'LS']
      ]

    end
  end
end
|
data/lib/treat/sugar.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
module Treat
  # Optional syntactic sugar: when enabled, a builder method named
  # after each entity class is defined on Object, so that e.g.
  # +Word('foo')+ calls +Treat::Entities::Word.build('foo', nil)+.
  module Sugar

    # Syntactic sugar is disabled by default.
    @@edulcorated = false

    # Enable the syntactic sugar. For every entity class except
    # :Symbol, defines a method on Object with the class's name
    # that forwards (value = '', id = nil) to the class's +build+.
    # Does nothing if the sugar is already enabled.
    # NOTE(review): the reason :Symbol is skipped is not visible
    # in this file.
    def edulcorate
      return if @@edulcorated
      @@edulcorated = true
      each_entity_class do |type, klass|
        next if type == :Symbol
        Object.class_eval do
          define_method(type) do |value = '', id = nil|
            klass.build(value, id)
          end
        end
      end
    end

    # Disable the syntactic sugar by removing from Object the
    # methods that #edulcorate defined. Does nothing if the
    # sugar is not currently enabled.
    def unedulcorate
      return unless @@edulcorated
      @@edulcorated = false
      each_entity_class do |type, _klass|
        next if type == :Symbol
        Object.class_eval { remove_method(type) }
      end
    end

    # Whether syntactic sugar is
    # enabled or not.
    def edulcorated?; @@edulcorated; end

    private

    # Yield the constant name (as a symbol) and the class object
    # of every entity type listed by Treat::Entities.list.
    def each_entity_class
      Treat::Entities.list.each do |entity_type|
        # cc is defined elsewhere; presumably it camel-cases the
        # entity type name - verify against its definition.
        type = :"#{cc(entity_type)}"
        # The original passed the still-unassigned local +klass+
        # (i.e. nil) as const_get's +inherit+ flag. Spell out the
        # equivalent +false+: resolve the constant in
        # Treat::Entities itself, not in its ancestors.
        klass = Treat::Entities.const_get(type, false)
        yield type, klass
      end
    end

  end
end
|
data/lib/treat/tree.rb
ADDED
@@ -0,0 +1,174 @@
|
|
1
|
+
module Treat
  # This module provides an abstract tree structure with
  # nodes having an id, a value, children, features and edges.
  module Tree
    # This class models the nodes for an N-ary tree data structure
    # with unique identifiers, text value, children, features
    # (annotations) and edges.
    #
    # This class was tightly based on the 'rubytree' gem.
    # RubyTree is licensed under the BSD license and can
    # be found at http://rubytree.rubyforge.org/rdoc/.
    # I have made several modifications in order to better
    # suit this library and to avoid monkey patching.
    class Node
      # A string containing the node's value (or empty).
      attr_accessor :value
      # A unique identifier for the node.
      attr_reader :id
      # An array containing the children of this node.
      attr_reader :children
      # A hash containing the features of this node.
      attr_accessor :features
      # A hash containing the edges that link this
      # node to other nodes (target id => edge type).
      attr_accessor :edges
      # The parent of the node (nil for a root).
      attr_accessor :parent

      # Initialize the node with its value and id.
      # Setup containers for the children, features
      # and edges of this node.
      def initialize(value, id = nil)
        @parent = nil
        @value, @id = value, id
        @children = []
        # Index of the direct children by their id.
        @children_hash = {}
        @features = {}
        @edges = {}
      end

      # Iterate over each direct child of the node.
      def each
        @children.each { |child| yield child }
      end

      # Boolean - does the node have edges?
      def has_edges?; !@edges.empty?; end
      # Boolean - does the node have children?
      def has_children?; !@children.empty?; end
      # Boolean - does the node have features?
      def has_features?; !@features.empty?; end
      # Boolean - does the node have a parent?
      def has_parent?; !@parent.nil?; end
      # Boolean - does the node not have a parent?
      def is_root?; @parent.nil?; end
      # Clear this node's parent reference and return the node.
      # Note: this does not remove the node from the former
      # parent's children; use the parent's #remove! for that.
      def set_as_root!; @parent = nil; self; end
      # Boolean - is this node a leaf ?
      # This is overriden in leaf classes.
      def is_leaf?; !has_children?; end

      # Add the given node(s) as children of this node.
      # This may be used with several nodes,
      # for example: node << [child1, child2, child3]
      # Returns the first node added. Raises a RuntimeError
      # if any of the supplied nodes is nil.
      def <<(nodes)
        nodes = [nodes] unless nodes.is_a? Array
        raise 'Trying to add a nil node.' if nodes.include? nil
        nodes.each do |node|
          node.parent = self
          @children << node
          @children_hash[node.id] = node
        end
        nodes[0]
      end

      # Retrieve a direct child by position or by id.
      # Integers below 1000 are treated as positions in the
      # children array; anything else is looked up as an id.
      # (The 1000 threshold is marked "Fix" in the original:
      # integer ids below 1000 cannot be looked up this way.)
      # Raises Treat::Exception when given nil.
      def [](name_or_index)
        if name_or_index.nil?
          raise Treat::Exception,
          "Non-nil name or index needs to be provided."
        end
        if name_or_index.kind_of?(Integer) &&
          name_or_index < 1000 # Fix
          @children[name_or_index]
        else
          @children_hash[name_or_index]
        end
      end

      # Remove a direct child, given either the node itself or
      # its id. The removed node has its parent cleared and is
      # returned; nil is returned for nil or an unknown id.
      def remove!(ion)
        return nil unless ion
        if ion.is_a? Treat::Tree::Node
          @children.delete(ion)
          @children_hash.delete(ion.id)
          ion.set_as_root!
        else
          child = @children_hash.delete(ion)
          return nil unless child
          @children.delete(child)
          # Previously the child removed by id kept pointing
          # at this node as its parent.
          child.set_as_root!
        end
      end

      # Remove all children from this node, clearing
      # their parent. Returns the node itself.
      def remove_all!
        @children.each { |child| child.set_as_root! }
        @children.clear
        @children_hash.clear
        self
      end

      # Next sibling from the same parent, or nil if
      # this node is a root or the last child.
      def next_sibling
        return nil if is_root?
        id = @parent.children.index(self)
        @parent.children.at(id + 1) if id
      end

      # The sibling n positions to the left (nil if none).
      def left(n = 1); sibling(-1*n); end
      # The sibling n positions to the right (nil if none).
      def right(n = 1); sibling(1*n); end

      # The sibling at the given offset relative to this node
      # in the parent's children, or nil if there is none.
      def sibling(pos)
        return nil if is_root?
        own = @parent.children.index(self)
        return nil unless own
        target = own + pos
        # Array#at counts negative indices from the end, which
        # made e.g. the first child's left sibling wrap around
        # to the last child. Out-of-range means "no sibling".
        return nil if target < 0
        @parent.children.at(target)
      end

      # All other children of this node's parent
      # (an empty array for a root node).
      def siblings
        return [] if is_root?
        @parent.children.reject { |child| child == self }
      end

      # Total number of nodes in the subtree, including this one.
      def size
        @children.inject(1) { |sum, node| sum + node.size }
      end

      # Set the feature to the supplied value.
      def set(feature, value)
        @features ||= {}
        @features[feature] = value
      end

      # Return the depth of this node in the tree
      # (0 for the root).
      def depth
        return 0 if is_root?
        1 + parent.depth
      end

      # Does the entity have a feature ?
      # :value always counts as a feature.
      def has_feature?(feature)
        @features.has_key?(feature) ||
        feature == :value
      end
      alias :has? :has_feature?

      # Link this node to the target node with
      # the supplied edge type. The target may be given as an
      # id, or as a node, in which case it is looked up from the
      # root of this node's tree; if it is not found there, no
      # edge is recorded and nil is returned.
      def associate(id_or_node, edge_type = nil)
        if id_or_node.is_a? Treat::Tree::Node
          target = root.find(id_or_node)
          id = target.id if target
        else
          id = id_or_node
        end
        @edges[id] = edge_type if id
      end

      # Find the node with the given id (or the id of the given
      # node) among the descendants of this node. Returns nil
      # when there is no such descendant. The receiver itself
      # is never returned.
      def find(id_or_node)
        # Accept any tree node, not only instances of the
        # receiver's own (sub)class.
        if id_or_node.is_a? Treat::Tree::Node
          id = id_or_node.id
        else
          id = id_or_node
        end
        return @children_hash[id] if @children_hash[id]
        self.each do |child|
          r = child.find(id)
          return r if r.is_a? Tree::Node
        end
        # Without this, the children array returned by #each
        # leaked out as the "not found" result.
        nil
      end

      # Find the root of the tree within which
      # this node is contained.
      def root
        return self if !has_parent?
        ancestor = @parent
        while ancestor.has_parent?
          ancestor = ancestor.parent
        end
        ancestor
      end
    end
  end
end
|