treat 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +1 -0
- data/README +3 -0
- data/TODO +14 -26
- data/bin/INFO +1 -1
- data/lib/treat/buildable.rb +10 -11
- data/lib/treat/categories.rb +8 -6
- data/lib/treat/category.rb +7 -2
- data/lib/treat/delegatable.rb +64 -56
- data/lib/treat/detectors/encoding/r_chardet19.rb +1 -1
- data/lib/treat/detectors/language/language_detector.rb +2 -1
- data/lib/treat/detectors/language/what_language.rb +2 -2
- data/lib/treat/detectors.rb +3 -0
- data/lib/treat/entities/entity.rb +1 -1
- data/lib/treat/entities.rb +9 -10
- data/lib/treat/exception.rb +3 -1
- data/lib/treat/extractors/named_entity/abner.rb +1 -1
- data/lib/treat/extractors/named_entity/stanford.rb +2 -2
- data/lib/treat/extractors/time/chronic.rb +2 -2
- data/lib/treat/extractors/time/nickel.rb +2 -2
- data/lib/treat/extractors/topic_words/lda.rb +2 -2
- data/lib/treat/extractors.rb +12 -9
- data/lib/treat/feature.rb +6 -1
- data/lib/treat/formatters/cleaners/html.rb +1 -1
- data/lib/treat/formatters.rb +8 -8
- data/lib/treat/group.rb +11 -10
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
- data/lib/treat/inflectors/{conjugators → conjugations}/linguistics.rb +6 -6
- data/lib/treat/inflectors/{declensors → declensions}/en.rb +2 -2
- data/lib/treat/inflectors/{declensors → declensions}/linguistics.rb +5 -5
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +4 -4
- data/lib/treat/inflectors/{stemmers → stem}/porter.rb +1 -1
- data/lib/treat/inflectors/{stemmers → stem}/porter_c.rb +3 -3
- data/lib/treat/inflectors/{stemmers → stem}/uea.rb +3 -3
- data/lib/treat/inflectors.rb +8 -21
- data/lib/treat/kernel.rb +120 -0
- data/lib/treat/languages/arabic.rb +14 -0
- data/lib/treat/languages/categories.rb +5 -0
- data/lib/treat/languages/chinese.rb +12 -0
- data/lib/treat/languages/english/categories.rb +23 -0
- data/lib/treat/{resources → languages/english}/tags.rb +127 -184
- data/lib/treat/languages/english.rb +33 -0
- data/lib/treat/languages/french.rb +17 -0
- data/lib/treat/languages/german.rb +17 -0
- data/lib/treat/languages/italian.rb +14 -0
- data/lib/treat/{resources/languages.txt → languages/list.txt} +0 -0
- data/lib/treat/languages/xinhua.rb +12 -0
- data/lib/treat/languages.rb +91 -0
- data/lib/treat/lexicalizers/category/from_tag.rb +20 -8
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +1 -1
- data/lib/treat/lexicalizers/tag/brill.rb +2 -1
- data/lib/treat/lexicalizers/tag/lingua.rb +2 -1
- data/lib/treat/lexicalizers/tag/stanford.rb +16 -15
- data/lib/treat/lexicalizers.rb +1 -1
- data/lib/treat/object.rb +6 -0
- data/lib/treat/processors/parsers/enju.rb +3 -2
- data/lib/treat/processors/parsers/stanford.rb +15 -12
- data/lib/treat/processors/segmenters/punkt.rb +1 -1
- data/lib/treat/processors/segmenters/stanford.rb +7 -5
- data/lib/treat/processors/segmenters/tactful.rb +1 -1
- data/lib/treat/processors/tokenizers/multilingual.rb +2 -2
- data/lib/treat/processors/tokenizers/stanford.rb +7 -5
- data/lib/treat/visitable.rb +2 -1
- data/lib/treat.rb +105 -54
- data/test/tc_entity.rb +5 -0
- data/test/tc_resources.rb +5 -5
- data/test/tc_treat.rb +1 -2
- data/test/tests.rb +2 -1
- metadata +63 -64
- data/lib/treat/formatters/serializers/yaml/helper.rb +0 -96
- data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +0 -213
- data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +0 -68
- data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +0 -6
- data/lib/treat/inflectors/lemmatizers/e_lemma.rb +0 -12
- data/lib/treat/resources/categories.rb +0 -18
- data/lib/treat/resources/delegates.rb +0 -96
- data/lib/treat/resources/dependencies.rb +0 -0
- data/lib/treat/resources/edges.rb +0 -8
- data/lib/treat/resources/formats.rb +0 -23
- data/lib/treat/resources/languages.rb +0 -86
- data/lib/treat/resources.rb +0 -10
- data/lib/treat/utilities.rb +0 -127
@@ -1,60 +1,10 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
1
|
module Treat
|
4
|
-
module Resources
|
5
|
-
class Tags
|
2
|
+
module Languages
|
3
|
+
class English
|
6
4
|
|
7
5
|
ClawsC5 = 0
|
8
6
|
Brown = 1
|
9
7
|
Penn = 2
|
10
|
-
Enju = 3
|
11
|
-
|
12
|
-
PTBWordTagToCategory = {
|
13
|
-
'CC' => :conjunction, # Coordinating conjunction
|
14
|
-
'CD' => :number, # Cardinal number
|
15
|
-
'DT' => :determiner, # Determiner
|
16
|
-
'DET' => :determiner, # Determiner
|
17
|
-
'EX' => :determiner, # Existential there
|
18
|
-
'FW' => :foreign, # Foreign word
|
19
|
-
'IN' => :preposition, # Preposition or subordinating conjunction
|
20
|
-
'JJ' => :adjective, # Adjective
|
21
|
-
'JJR' => :adjective, # Adjective, comparative
|
22
|
-
'JJS' => :adjective, # Adjective, superlative
|
23
|
-
'LS' => :list, # List item marker
|
24
|
-
'MD' => :modal, # Modal
|
25
|
-
'NN' => :noun, # Noun, singular or mass
|
26
|
-
'NNS' => :noun, # Noun, plural
|
27
|
-
'NNP' => :noun, # Proper noun, singular
|
28
|
-
'NNPS' => :noun, # Proper noun, plural
|
29
|
-
'PDT' => :determiner, # Predeterminer
|
30
|
-
'POS' => :determiner, # Possessive ending
|
31
|
-
'PRP' => :pronoun, # Personal pronoun
|
32
|
-
'PRP$' => :pronoun, # Possessive pronoun,
|
33
|
-
'PRPS' => :determiner, # Possessive determiner
|
34
|
-
'RB' => :adverb, # Adverb
|
35
|
-
'RBR' => :adverb, # Adverb, comparative
|
36
|
-
'RBS' => :adverb, # Adverb, superlative
|
37
|
-
'RP' => :particle, # Particle
|
38
|
-
'SYM' => :symbol, # Symbol
|
39
|
-
'TO' => :to, # to
|
40
|
-
'UH' => :interjection, # Interjection
|
41
|
-
'VB' => :verb, # Verb, base form
|
42
|
-
'VBD' => :verb, # Verb, past tense
|
43
|
-
'VBG' => :verb, # Verb, gerund or present participle
|
44
|
-
'VBN' => :verb, # Verb, past participle
|
45
|
-
'VBP' => :verb, # Verb, non-3rd person singular present
|
46
|
-
'VBZ' => :verb, # Verb, 3rd person singular present
|
47
|
-
'WDT' => :determiner, # Wh-determiner
|
48
|
-
'WP' => :pronoun, # Wh-pronoun
|
49
|
-
'WP$' => :pronoun, # Possessive wh-pronoun
|
50
|
-
'WRB' => :adverb, # Wh-adverb
|
51
|
-
')' => :punctuation, # Right bracket
|
52
|
-
'(' => :punctuation, # Left bracket
|
53
|
-
'.' => :punctuation, # Period
|
54
|
-
'\'\'' => :symbol, # Quote
|
55
|
-
',' => :punctuation,
|
56
|
-
';' => :punctuation
|
57
|
-
}
|
58
8
|
|
59
9
|
PTBClauseTagDescription = [
|
60
10
|
['S', 'Simple declarative clause'],
|
@@ -89,141 +39,135 @@ module Treat
|
|
89
39
|
]
|
90
40
|
|
91
41
|
PTBWordTagDescription = [
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
PRP
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
WP -
|
127
|
-
|
128
|
-
WRB - Wh-adverb
|
129
|
-
|
130
|
-
=end
|
42
|
+
['CC', 'Coordinating conjunction'],
|
43
|
+
['CD', 'Cardinal number'],
|
44
|
+
['DT', 'Determiner'],
|
45
|
+
['EX', 'Existential there'],
|
46
|
+
['FW', 'Foreign word'],
|
47
|
+
['IN', 'Preposition or subordinating conjunction'],
|
48
|
+
['JJ', 'Adjective'],
|
49
|
+
['JJR', 'Adjective, comparative'],
|
50
|
+
['JJS', 'Adjective, superlative'],
|
51
|
+
['LS', 'List item marker'],
|
52
|
+
['MD', 'Modal'],
|
53
|
+
['NN', 'Noun, singular or mass'],
|
54
|
+
['NNS', 'Noun, plural'],
|
55
|
+
['NNP', 'Proper noun, singular'],
|
56
|
+
['NNPS', 'Proper noun, plural'],
|
57
|
+
['PDT', 'Predeterminer'],
|
58
|
+
['POS', 'Possessive ending'],
|
59
|
+
['PRP', 'Personal pronoun'],
|
60
|
+
['PRP$', 'Possessive pronoun (prolog version PRP-S)'],
|
61
|
+
['RB', 'Adverb'],
|
62
|
+
['RBR', 'Adverb, comparative'],
|
63
|
+
['RBS', 'Adverb, superlative'],
|
64
|
+
['RP', 'Particle'],
|
65
|
+
['SYM', 'Symbol'],
|
66
|
+
['TO', 'to'],
|
67
|
+
['UH', 'Interjection'],
|
68
|
+
['VB', 'Verb, base form'],
|
69
|
+
['VBD', 'Verb, past tense'],
|
70
|
+
['VBG', 'Verb, gerund or present participle'],
|
71
|
+
['VBN', 'Verb, past participle'],
|
72
|
+
['VBP', 'Verb, non 3rd person singular present'],
|
73
|
+
['VBZ', 'Verb, 3rd person singular present'],
|
74
|
+
['WDT', 'Wh-determiner'],
|
75
|
+
['WP', 'Wh-pronoun'],
|
76
|
+
['WP$', 'Possessive wh-pronoun (prolog version WP-S)'],
|
77
|
+
['WRB', 'Wh-adverb']
|
131
78
|
]
|
79
|
+
|
132
80
|
BrownWordTagDescription = [
|
133
|
-
=begin
|
134
81
|
|
135
|
-
|
82
|
+
['.', 'sentence closer . ; ? !'],
|
83
|
+
['(', 'left parent'] ,
|
84
|
+
[')', 'right parent'],
|
85
|
+
['*', 'not'],
|
86
|
+
['--', 'dash'],
|
87
|
+
[',', 'comma'],
|
88
|
+
[':', 'colon'],
|
89
|
+
['ABL', 'pre-qualifier quite, rather'],
|
90
|
+
['ABN', 'pre-quantifier half, all'],
|
91
|
+
['ABX', 'pre-quantifier both'],
|
92
|
+
['AP', 'post-determiner many, several, next'],
|
93
|
+
['AT', 'article a, the, no'],
|
94
|
+
['BE', 'be '],
|
95
|
+
['BED', 'were '],
|
96
|
+
['BEDZ', 'was '],
|
97
|
+
['BEG', 'being '],
|
98
|
+
['BEM', 'am '],
|
99
|
+
['BEN', 'been '],
|
100
|
+
['BER', 'are, art '],
|
101
|
+
['BEZ', 'is '],
|
102
|
+
['CC', 'coordinating conjunction and, or'],
|
103
|
+
['CD', 'cardinal numeral one, two, 2, etc.'],
|
104
|
+
['CS', 'subordinating conjunction if, although'],
|
105
|
+
['DO', 'do '],
|
106
|
+
['DOD', 'did '],
|
107
|
+
['DOZ', 'does '],
|
108
|
+
['DT', 'singular determiner this, that'],
|
109
|
+
['DTI', 'singular or plural determiner/quantifier some, any'],
|
110
|
+
['DTS', 'plural determiner these, those'],
|
111
|
+
['DTX', 'determiner/double conjunction either'],
|
112
|
+
['EX', 'existentil there '],
|
113
|
+
['FW', 'foreign word (hyphenated before regular tag) '],
|
114
|
+
['HL', 'word occurring in headline (hyphenated after regular tag) '],
|
115
|
+
['HV', 'have '],
|
116
|
+
['HVD', 'had (past tense) '],
|
117
|
+
['HVG', 'having '],
|
118
|
+
['HVN', 'had (past participle) '],
|
119
|
+
['HVZ', 'has '],
|
120
|
+
['IN', 'preposition '],
|
121
|
+
['JJ', 'adjective '],
|
122
|
+
['JJR', 'comparative adjective '],
|
123
|
+
['JJS', 'semantically superlative adjective chief, top'],
|
124
|
+
['JJT', 'morphologically superlative adjective biggest'],
|
125
|
+
['MD', 'modal auxiliary can, should, will'],
|
126
|
+
['NC', 'cited word (hyphenated after regular tag) '],
|
127
|
+
['NN', 'singular or mass noun '],
|
128
|
+
['NN$', 'possessive singular noun '],
|
129
|
+
['NNS', 'plural noun '],
|
130
|
+
['NNS$', 'possessive plural noun '],
|
131
|
+
['NP', 'proper noun or part of name phrase '],
|
132
|
+
['NP$', 'possessive proper noun '],
|
133
|
+
['NPS', 'plural proper noun '],
|
134
|
+
['NPS$', 'possessive plural proper noun '],
|
135
|
+
['NR', 'adverbial noun home, today, west'],
|
136
|
+
['NRS', 'plural adverbial noun'],
|
137
|
+
['OD', 'ordinal numeral first, 2nd'],
|
138
|
+
['PN', 'nominal pronoun everybody, nothing'],
|
139
|
+
['PN$', 'possessive nominal pronoun '],
|
140
|
+
['PP$', 'possessive personal pronoun my, our'],
|
141
|
+
['PP$$', 'second (nominal) possessive pronoun mine, ours'],
|
142
|
+
['PPL', 'singular reflexive/intensive personal pronoun myself'],
|
143
|
+
['PPLS', 'plural reflexive/intensive personal pronoun ourselves'],
|
144
|
+
['PPO', 'objective personal pronoun me, him, it, them'],
|
145
|
+
['PPS', '3rd. singular nominative pronoun he, she, it, one'],
|
146
|
+
['PPSS', 'other nominative personal pronoun I, we, they, you'],
|
147
|
+
['QL', 'qualifier very, fairly'],
|
148
|
+
['QLP', 'post-qualifier enough, indeed'],
|
149
|
+
['RB', 'adverb '],
|
150
|
+
['RBR', 'comparative adverb '],
|
151
|
+
['RBT', 'superlative adverb '],
|
152
|
+
['RN', 'nominal adverb here then, indoors '],
|
153
|
+
['RP', 'adverb/particle about, off, up'],
|
154
|
+
['TL', 'word occurring in title (hyphenated after regular tag)'],
|
155
|
+
['TO', 'infinitive marker to '],
|
156
|
+
['UH', 'interjection, exclamation '],
|
157
|
+
['VB', 'verb, base form '],
|
158
|
+
['VBD', 'verb, past tense '],
|
159
|
+
['VBG', 'verb, present participle/gerund '],
|
160
|
+
['VBN', 'verb, past participle '],
|
161
|
+
['VBZ', 'verb, 3rd. singular present '],
|
162
|
+
['WDT', 'wh- determiner what, which'],
|
163
|
+
['WP$', 'possessive wh- pronoun whose'],
|
164
|
+
['WPO', 'objective wh- pronoun whom, which, that'],
|
165
|
+
['WPS', 'nominative wh- pronoun who, which, that'],
|
166
|
+
['WQL', 'wh- qualifier how'],
|
167
|
+
['WRB', 'wh- adverb how, where, when']
|
136
168
|
|
137
|
-
. sentence closer . ; ? !
|
138
|
-
( left paren
|
139
|
-
) right paren
|
140
|
-
* not, n't
|
141
|
-
-- dash
|
142
|
-
, comma
|
143
|
-
: colon
|
144
|
-
ABL pre-qualifier quite, rather
|
145
|
-
ABN pre-quantifier half, all
|
146
|
-
ABX pre-quantifier both
|
147
|
-
AP post-determiner many, several, next
|
148
|
-
AT article a, the, no
|
149
|
-
BE be
|
150
|
-
BED were
|
151
|
-
BEDZ was
|
152
|
-
BEG being
|
153
|
-
BEM am
|
154
|
-
BEN been
|
155
|
-
BER are, art
|
156
|
-
BEZ is
|
157
|
-
CC coordinating conjunction and, or
|
158
|
-
CD cardinal numeral one, two, 2, etc.
|
159
|
-
CS subordinating conjunction if, although
|
160
|
-
DO do
|
161
|
-
DOD did
|
162
|
-
DOZ does
|
163
|
-
DT singular determiner this, that
|
164
|
-
DTI singular or plural determiner/quantifier some, any
|
165
|
-
DTS plural determiner these, those
|
166
|
-
DTX determiner/double conjunction either
|
167
|
-
EX existentil there
|
168
|
-
FW foreign word (hyphenated before regular tag)
|
169
|
-
HL word occurring in headline (hyphenated after regular tag)
|
170
|
-
HV have
|
171
|
-
HVD had (past tense)
|
172
|
-
HVG having
|
173
|
-
HVN had (past participle)
|
174
|
-
HVZ has
|
175
|
-
IN preposition
|
176
|
-
JJ adjective
|
177
|
-
JJR comparative adjective
|
178
|
-
JJS semantically superlative adjective chief, top
|
179
|
-
JJT morphologically superlative adjective biggest
|
180
|
-
MD modal auxiliary can, should, will
|
181
|
-
NC cited word (hyphenated after regular tag)
|
182
|
-
NN singular or mass noun
|
183
|
-
NN$ possessive singular noun
|
184
|
-
NNS plural noun
|
185
|
-
NNS$ possessive plural noun
|
186
|
-
NP proper noun or part of name phrase
|
187
|
-
NP$ possessive proper noun
|
188
|
-
NPS plural proper noun
|
189
|
-
NPS$ possessive plural proper noun
|
190
|
-
NR adverbial noun home, today, west
|
191
|
-
NRS plural adverbial noun
|
192
|
-
OD ordinal numeral first, 2nd
|
193
|
-
PN nominal pronoun everybody, nothing
|
194
|
-
PN$ possessive nominal pronoun
|
195
|
-
PP$ possessive personal pronoun my, our
|
196
|
-
PP$$ second (nominal) possessive pronoun mine, ours
|
197
|
-
PPL singular reflexive/intensive personal pronoun myself
|
198
|
-
PPLS plural reflexive/intensive personal pronoun ourselves
|
199
|
-
PPO objective personal pronoun me, him, it, them
|
200
|
-
PPS 3rd. singular nominative pronoun he, she, it, one
|
201
|
-
PPSS other nominative personal pronoun I, we, they, you
|
202
|
-
QL qualifier very, fairly
|
203
|
-
QLP post-qualifier enough, indeed
|
204
|
-
RB adverb
|
205
|
-
RBR comparative adverb
|
206
|
-
RBT superlative adverb
|
207
|
-
RN nominal adverb here then, indoors
|
208
|
-
RP adverb/particle about, off, up
|
209
|
-
TL word occurring in title (hyphenated after
|
210
|
-
regular tag)
|
211
|
-
TO infinitive marker to
|
212
|
-
UH interjection, exclamation
|
213
|
-
VB verb, base form
|
214
|
-
VBD verb, past tense
|
215
|
-
VBG verb, present participle/gerund
|
216
|
-
VBN verb, past participle
|
217
|
-
VBZ verb, 3rd. singular present
|
218
|
-
WDT wh- determiner what, which
|
219
|
-
WP$ possessive wh- pronoun whose
|
220
|
-
WPO objective wh- pronoun whom, which, that
|
221
|
-
WPS nominative wh- pronoun who, which, that
|
222
|
-
WQL wh- qualifier how
|
223
|
-
WRB wh- adverb how, where, when
|
224
|
-
|
225
|
-
=end
|
226
169
|
]
|
170
|
+
# A description of Enju categories.
|
227
171
|
EnjuCatDescription = [
|
228
172
|
['ADJ', 'Adjective'],
|
229
173
|
['ADV', 'Adverb'],
|
@@ -330,7 +274,7 @@ WRB wh- adverb how, where, when
|
|
330
274
|
'Noun, proper, plural', ['NP0', 'NPS', 'NNPS'],
|
331
275
|
'Noun, adverbial', ['NN0', 'NR', 'NN'],
|
332
276
|
'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS'],
|
333
|
-
'Pronoun, nominal (indefinite)', ['PNI', 'PN', '
|
277
|
+
'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP'],
|
334
278
|
'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP'],
|
335
279
|
'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP'],
|
336
280
|
'Pronoun, personal, object', ['PNP', 'PPO', 'PRP'],
|
@@ -387,7 +331,6 @@ WRB wh- adverb how, where, when
|
|
387
331
|
'Symbol, alphabetical', ['ZZ0', '', ''],
|
388
332
|
'Symbol, list item', ['', '', 'LS']
|
389
333
|
]
|
390
|
-
|
391
334
|
end
|
392
335
|
end
|
393
336
|
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Treat
|
2
|
+
module Languages
|
3
|
+
class English
|
4
|
+
require 'treat/languages/english/tags'
|
5
|
+
require 'treat/languages/english/categories'
|
6
|
+
Extractors = {
|
7
|
+
time: [:chronic],
|
8
|
+
topics: [:reuters],
|
9
|
+
topic_words: [:lda],
|
10
|
+
key_sentences: [:topics_frequency]
|
11
|
+
}
|
12
|
+
Processors = {
|
13
|
+
chunkers: [:txt],
|
14
|
+
parsers: [:enju, :stanford],
|
15
|
+
segmenters: [:tactful, :punkt, :stanford],
|
16
|
+
tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
|
17
|
+
}
|
18
|
+
Lexicalizers = {
|
19
|
+
category: [:from_tag],
|
20
|
+
linkages: [:naive],
|
21
|
+
synsets: [:wordnet, :rita_wn],
|
22
|
+
tag: [:brill, :lingua, :stanford]
|
23
|
+
}
|
24
|
+
Inflectors = {
|
25
|
+
conjugations: [:linguistics],
|
26
|
+
declensions: [:linguistics, :english],
|
27
|
+
stem: [:porter_c, :porter, :uea],
|
28
|
+
ordinal_words: [:linguistics],
|
29
|
+
cardinal_words: [:linguistics]
|
30
|
+
}
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Treat
|
2
|
+
module Languages
|
3
|
+
class French
|
4
|
+
Extractors = {}
|
5
|
+
Inflectors = {}
|
6
|
+
Lexicalizers = {
|
7
|
+
tag: [:stanford]
|
8
|
+
}
|
9
|
+
Processors = {
|
10
|
+
chunkers: [:txt],
|
11
|
+
parsers: [:stanford],
|
12
|
+
segmenters: [:tactful, :punkt, :stanford],
|
13
|
+
tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
|
14
|
+
}
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Treat
|
2
|
+
module Languages
|
3
|
+
class German
|
4
|
+
Extractors = {}
|
5
|
+
Inflectors = {}
|
6
|
+
Lexicalizers = {
|
7
|
+
tag: [:stanford]
|
8
|
+
}
|
9
|
+
Processors = {
|
10
|
+
chunkers: [:txt],
|
11
|
+
parsers: [:stanford],
|
12
|
+
segmenters: [:tactful, :punkt, :stanford],
|
13
|
+
tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
|
14
|
+
}
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module Treat
|
2
|
+
module Languages
|
3
|
+
class Italian
|
4
|
+
Extractors = {}
|
5
|
+
Inflectors = {}
|
6
|
+
Lexicalizers = {}
|
7
|
+
Processors = {
|
8
|
+
chunkers: [:txt],
|
9
|
+
segmenters: [:tactful, :punkt, :stanford],
|
10
|
+
tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
|
11
|
+
}
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
File without changes
|
@@ -0,0 +1,91 @@
|
|
1
|
+
module Treat
|
2
|
+
# This module provides linguistic resources
|
3
|
+
# for the Treat library, including information
|
4
|
+
# about language codes, the functions available
|
5
|
+
# for each language, and the different tags used
|
6
|
+
# to markup that language.
|
7
|
+
module Languages
|
8
|
+
Dir["#{Treat.lib}/treat/languages/*.rb"].each { |file| require file }
|
9
|
+
ISO639_1 = 1
|
10
|
+
ISO639_2 = 2
|
11
|
+
# Describe a language code (ISO-639-1 or ISO-639-2)
|
12
|
+
# or its full text description in full French or English.
|
13
|
+
def self.describe(lang, desc_lang = :en)
|
14
|
+
raise "Must provide a non-nil language identifier to describe." if lang.nil?
|
15
|
+
lang = find(lang).to_s
|
16
|
+
if [:en, :eng, :english, :anglais].include?(desc_lang)
|
17
|
+
l = @@english_full.key(lang)
|
18
|
+
elsif [:fr, :fra, :french, :french].include?(desc_lang)
|
19
|
+
l = @@french_full.key(lang)
|
20
|
+
else
|
21
|
+
raise Treat::Exception,
|
22
|
+
"Unknown language to describe: #{desc_lang}."
|
23
|
+
end
|
24
|
+
not_found(lang) if l.nil?
|
25
|
+
l.intern
|
26
|
+
end
|
27
|
+
# Raise an error message when a language code
|
28
|
+
# or description is not found and suggest
|
29
|
+
# possible misspellings.
|
30
|
+
def self.not_found(lang)
|
31
|
+
msg = "Language '#{lang}' does not exist."
|
32
|
+
all = @@iso639_2.keys + @@iso639_1.keys +
|
33
|
+
@@english_full.keys + @@french_full.keys
|
34
|
+
msg += did_you_mean?(all, lang)
|
35
|
+
raise Treat::Exception, msg
|
36
|
+
end
|
37
|
+
# Return the class representing a language.
|
38
|
+
def self.get(lang)
|
39
|
+
const_get(Treat::Languages.describe(lang).to_s.capitalize)
|
40
|
+
end
|
41
|
+
# Find a language by ISO-639-1 or ISO-639-2 code
|
42
|
+
# or full name (in English or French) and return
|
43
|
+
# the ISO-639-1 or ISO-639-2 language code as a
|
44
|
+
# lowercase identifier.
|
45
|
+
def self.find(lang, rc = ISO639_2)
|
46
|
+
raise "Must provide a non-nil language identifier to describe." if lang.nil?
|
47
|
+
get_languages
|
48
|
+
lang = lang.to_s.downcase
|
49
|
+
if @@iso639_1.has_key?(lang)
|
50
|
+
return :"#{lang}" if rc == ISO639_1
|
51
|
+
return :"#{@@iso639_1[lang]}" if rc == ISO639_2
|
52
|
+
elsif @@iso639_2.has_key?(lang)
|
53
|
+
return :"#{lang}" if rc == ISO639_2
|
54
|
+
return :"#{@@iso639_2[lang]}" if rc == ISO639_1
|
55
|
+
elsif @@english_full.has_key?(lang)
|
56
|
+
return :"#{@@english_full[lang]}" if rc == ISO639_2
|
57
|
+
return :"#{@@iso639_2[@@english_full[lang]]}" if rc == ISO639_1
|
58
|
+
elsif @@french_full.has_key?(lang)
|
59
|
+
return :"#{@@french_full[lang]}" if rc == ISO639_2
|
60
|
+
return :"#{@@iso639_1[@@french_full[lang]]}" if rc == ISO639_2
|
61
|
+
else
|
62
|
+
not_found(lang)
|
63
|
+
end
|
64
|
+
end
|
65
|
+
@@loaded = false
|
66
|
+
# Get the languages from the dictionary.
|
67
|
+
def self.get_languages
|
68
|
+
return if @@loaded
|
69
|
+
@@iso639_1 = {}; @@iso639_2 = {};
|
70
|
+
@@english_full = {}; @@french_full = {}
|
71
|
+
languages = IO.readlines(Treat.lib + '/treat/languages/list.txt')
|
72
|
+
languages.each do |language|
|
73
|
+
iso639_2, iso639_1, english_desc, french_desc =
|
74
|
+
language.split(',')
|
75
|
+
@@iso639_1[iso639_1] = iso639_2
|
76
|
+
@@iso639_2[iso639_2] = iso639_1
|
77
|
+
unless english_desc.nil?
|
78
|
+
english_desc.strip.downcase.split('|').each do |l|
|
79
|
+
@@english_full[l.downcase.strip] = iso639_2
|
80
|
+
end
|
81
|
+
end
|
82
|
+
unless french_desc.nil?
|
83
|
+
french_desc.strip.downcase.split('|').each do |l|
|
84
|
+
@@french_full[l.downcase.strip] = iso639_2
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
@@loaded = true
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
@@ -4,24 +4,36 @@ module Treat
|
|
4
4
|
# A class that detects the category of a word from its tag,
|
5
5
|
# using the default tagger for the language of the entity.
|
6
6
|
class FromTag
|
7
|
+
DefaultOptions = { tagger: nil }
|
7
8
|
# Find the category of the current entity.
|
8
9
|
# Options:
|
9
10
|
# :tagger => (Symbol) force the use of a tagger.
|
10
11
|
# :tag_to_cat => (Hash) a list of categories for each possible tag.
|
11
12
|
def self.category(entity, options = {})
|
12
|
-
|
13
|
-
options = {
|
14
|
-
tagger: nil,
|
15
|
-
tag_to_cat: Treat::Resources::Tags::PTBWordTagToCategory
|
16
|
-
}
|
17
|
-
end
|
13
|
+
options = DefaultOptions.merge(options)
|
18
14
|
tag = options[:tagger].nil? ? entity.tag : entity.tag(options[:tagger])
|
19
|
-
|
15
|
+
lang = Treat::Languages.get(entity.language)
|
16
|
+
cat = lang::WordTagToCategory[tag]
|
20
17
|
if cat.nil?
|
21
18
|
warn "Category not found for tag #{tag}."
|
22
19
|
:unknown
|
23
20
|
else
|
24
|
-
cat
|
21
|
+
if cat.size == 1
|
22
|
+
return cat[0]
|
23
|
+
else
|
24
|
+
if entity.has?(:tag_set)
|
25
|
+
if cat[entity.tag_set]
|
26
|
+
return cat[entity.tag_set]
|
27
|
+
else
|
28
|
+
raise Treat::Exception,
|
29
|
+
"The specified tag set (#{entity.tag_set})" +
|
30
|
+
" does not contain the tag #{tag}."
|
31
|
+
end
|
32
|
+
else
|
33
|
+
raise Treat::Exception,
|
34
|
+
"No information can be found regarding which tag set to use."
|
35
|
+
end
|
36
|
+
end
|
25
37
|
end
|
26
38
|
end
|
27
39
|
end
|
@@ -25,7 +25,7 @@ module Treat
|
|
25
25
|
patch = false
|
26
26
|
# Require the 'rbtagger' gem.
|
27
27
|
begin
|
28
|
-
require 'rbtagger'
28
|
+
silence_warnings { require 'rbtagger' }
|
29
29
|
# This whole mess is required to deal with
|
30
30
|
# the fact that the 'rbtagger' gem defines
|
31
31
|
# a top-level module called 'Word', which
|
@@ -73,6 +73,7 @@ module Treat
|
|
73
73
|
# Create the tagger if necessary
|
74
74
|
@@tagger ||= ::Brill::Tagger.new(options[:lexicon],
|
75
75
|
options[:lexical_rules], options[:contextual_rules])
|
76
|
+
entity.set :tag_set, :penn
|
76
77
|
# Perform tagging.
|
77
78
|
if entity.type == :word
|
78
79
|
# Setup the context of the word
|
@@ -17,7 +17,7 @@ module Treat
|
|
17
17
|
# http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/
|
18
18
|
class Lingua
|
19
19
|
# Require the 'engtagger' gem.
|
20
|
-
require 'engtagger'
20
|
+
silence_warnings { require 'engtagger' }
|
21
21
|
# Hold the tagger.
|
22
22
|
@@tagger = nil
|
23
23
|
# Hold the user-set options
|
@@ -46,6 +46,7 @@ module Treat
|
|
46
46
|
@@tagger = nil # Reset the tagger
|
47
47
|
end
|
48
48
|
@@tagger ||= ::EngTagger.new(@@options)
|
49
|
+
entity.set :tag_set, :penn
|
49
50
|
left = entity.left
|
50
51
|
if left.nil? || left.type != :word
|
51
52
|
left_tag = 'pp'
|