treat 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +1 -0
- data/README +3 -0
- data/TODO +14 -26
- data/bin/INFO +1 -1
- data/lib/treat/buildable.rb +10 -11
- data/lib/treat/categories.rb +8 -6
- data/lib/treat/category.rb +7 -2
- data/lib/treat/delegatable.rb +64 -56
- data/lib/treat/detectors/encoding/r_chardet19.rb +1 -1
- data/lib/treat/detectors/language/language_detector.rb +2 -1
- data/lib/treat/detectors/language/what_language.rb +2 -2
- data/lib/treat/detectors.rb +3 -0
- data/lib/treat/entities/entity.rb +1 -1
- data/lib/treat/entities.rb +9 -10
- data/lib/treat/exception.rb +3 -1
- data/lib/treat/extractors/named_entity/abner.rb +1 -1
- data/lib/treat/extractors/named_entity/stanford.rb +2 -2
- data/lib/treat/extractors/time/chronic.rb +2 -2
- data/lib/treat/extractors/time/nickel.rb +2 -2
- data/lib/treat/extractors/topic_words/lda.rb +2 -2
- data/lib/treat/extractors.rb +12 -9
- data/lib/treat/feature.rb +6 -1
- data/lib/treat/formatters/cleaners/html.rb +1 -1
- data/lib/treat/formatters.rb +8 -8
- data/lib/treat/group.rb +11 -10
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
- data/lib/treat/inflectors/{conjugators → conjugations}/linguistics.rb +6 -6
- data/lib/treat/inflectors/{declensors → declensions}/en.rb +2 -2
- data/lib/treat/inflectors/{declensors → declensions}/linguistics.rb +5 -5
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +4 -4
- data/lib/treat/inflectors/{stemmers → stem}/porter.rb +1 -1
- data/lib/treat/inflectors/{stemmers → stem}/porter_c.rb +3 -3
- data/lib/treat/inflectors/{stemmers → stem}/uea.rb +3 -3
- data/lib/treat/inflectors.rb +8 -21
- data/lib/treat/kernel.rb +120 -0
- data/lib/treat/languages/arabic.rb +14 -0
- data/lib/treat/languages/categories.rb +5 -0
- data/lib/treat/languages/chinese.rb +12 -0
- data/lib/treat/languages/english/categories.rb +23 -0
- data/lib/treat/{resources → languages/english}/tags.rb +127 -184
- data/lib/treat/languages/english.rb +33 -0
- data/lib/treat/languages/french.rb +17 -0
- data/lib/treat/languages/german.rb +17 -0
- data/lib/treat/languages/italian.rb +14 -0
- data/lib/treat/{resources/languages.txt → languages/list.txt} +0 -0
- data/lib/treat/languages/xinhua.rb +12 -0
- data/lib/treat/languages.rb +91 -0
- data/lib/treat/lexicalizers/category/from_tag.rb +20 -8
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +1 -1
- data/lib/treat/lexicalizers/tag/brill.rb +2 -1
- data/lib/treat/lexicalizers/tag/lingua.rb +2 -1
- data/lib/treat/lexicalizers/tag/stanford.rb +16 -15
- data/lib/treat/lexicalizers.rb +1 -1
- data/lib/treat/object.rb +6 -0
- data/lib/treat/processors/parsers/enju.rb +3 -2
- data/lib/treat/processors/parsers/stanford.rb +15 -12
- data/lib/treat/processors/segmenters/punkt.rb +1 -1
- data/lib/treat/processors/segmenters/stanford.rb +7 -5
- data/lib/treat/processors/segmenters/tactful.rb +1 -1
- data/lib/treat/processors/tokenizers/multilingual.rb +2 -2
- data/lib/treat/processors/tokenizers/stanford.rb +7 -5
- data/lib/treat/visitable.rb +2 -1
- data/lib/treat.rb +105 -54
- data/test/tc_entity.rb +5 -0
- data/test/tc_resources.rb +5 -5
- data/test/tc_treat.rb +1 -2
- data/test/tests.rb +2 -1
- metadata +63 -64
- data/lib/treat/formatters/serializers/yaml/helper.rb +0 -96
- data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +0 -213
- data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +0 -68
- data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +0 -6
- data/lib/treat/inflectors/lemmatizers/e_lemma.rb +0 -12
- data/lib/treat/resources/categories.rb +0 -18
- data/lib/treat/resources/delegates.rb +0 -96
- data/lib/treat/resources/dependencies.rb +0 -0
- data/lib/treat/resources/edges.rb +0 -8
- data/lib/treat/resources/formats.rb +0 -23
- data/lib/treat/resources/languages.rb +0 -86
- data/lib/treat/resources.rb +0 -10
- data/lib/treat/utilities.rb +0 -127
@@ -1,60 +1,10 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
1
|
module Treat
|
4
|
-
module
|
5
|
-
class
|
2
|
+
module Languages
|
3
|
+
class English
|
6
4
|
|
7
5
|
ClawsC5 = 0
|
8
6
|
Brown = 1
|
9
7
|
Penn = 2
|
10
|
-
Enju = 3
|
11
|
-
|
12
|
-
PTBWordTagToCategory = {
|
13
|
-
'CC' => :conjunction, # Coordinating conjunction
|
14
|
-
'CD' => :number, # Cardinal number
|
15
|
-
'DT' => :determiner, # Determiner
|
16
|
-
'DET' => :determiner, # Determiner
|
17
|
-
'EX' => :determiner, # Existential there
|
18
|
-
'FW' => :foreign, # Foreign word
|
19
|
-
'IN' => :preposition, # Preposition or subordinating conjunction
|
20
|
-
'JJ' => :adjective, # Adjective
|
21
|
-
'JJR' => :adjective, # Adjective, comparative
|
22
|
-
'JJS' => :adjective, # Adjective, superlative
|
23
|
-
'LS' => :list, # List item marker
|
24
|
-
'MD' => :modal, # Modal
|
25
|
-
'NN' => :noun, # Noun, singular or mass
|
26
|
-
'NNS' => :noun, # Noun, plural
|
27
|
-
'NNP' => :noun, # Proper noun, singular
|
28
|
-
'NNPS' => :noun, # Proper noun, plural
|
29
|
-
'PDT' => :determiner, # Predeterminer
|
30
|
-
'POS' => :determiner, # Possessive ending
|
31
|
-
'PRP' => :pronoun, # Personal pronoun
|
32
|
-
'PRP$' => :pronoun, # Possessive pronoun,
|
33
|
-
'PRPS' => :determiner, # Possessive determiner
|
34
|
-
'RB' => :adverb, # Adverb
|
35
|
-
'RBR' => :adverb, # Adverb, comparative
|
36
|
-
'RBS' => :adverb, # Adverb, superlative
|
37
|
-
'RP' => :particle, # Particle
|
38
|
-
'SYM' => :symbol, # Symbol
|
39
|
-
'TO' => :to, # to
|
40
|
-
'UH' => :interjection, # Interjection
|
41
|
-
'VB' => :verb, # Verb, base form
|
42
|
-
'VBD' => :verb, # Verb, past tense
|
43
|
-
'VBG' => :verb, # Verb, gerund or present participle
|
44
|
-
'VBN' => :verb, # Verb, past participle
|
45
|
-
'VBP' => :verb, # Verb, non-3rd person singular present
|
46
|
-
'VBZ' => :verb, # Verb, 3rd person singular present
|
47
|
-
'WDT' => :determiner, # Wh-determiner
|
48
|
-
'WP' => :pronoun, # Wh-pronoun
|
49
|
-
'WP$' => :pronoun, # Possessive wh-pronoun
|
50
|
-
'WRB' => :adverb, # Wh-adverb
|
51
|
-
')' => :punctuation, # Right bracket
|
52
|
-
'(' => :punctuation, # Left bracket
|
53
|
-
'.' => :punctuation, # Period
|
54
|
-
'\'\'' => :symbol, # Quote
|
55
|
-
',' => :punctuation,
|
56
|
-
';' => :punctuation
|
57
|
-
}
|
58
8
|
|
59
9
|
PTBClauseTagDescription = [
|
60
10
|
['S', 'Simple declarative clause'],
|
@@ -89,141 +39,135 @@ module Treat
|
|
89
39
|
]
|
90
40
|
|
91
41
|
PTBWordTagDescription = [
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
PRP
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
WP -
|
127
|
-
|
128
|
-
WRB - Wh-adverb
|
129
|
-
|
130
|
-
=end
|
42
|
+
['CC', 'Coordinating conjunction'],
|
43
|
+
['CD', 'Cardinal number'],
|
44
|
+
['DT', 'Determiner'],
|
45
|
+
['EX', 'Existential there'],
|
46
|
+
['FW', 'Foreign word'],
|
47
|
+
['IN', 'Preposition or subordinating conjunction'],
|
48
|
+
['JJ', 'Adjective'],
|
49
|
+
['JJR', 'Adjective, comparative'],
|
50
|
+
['JJS', 'Adjective, superlative'],
|
51
|
+
['LS', 'List item marker'],
|
52
|
+
['MD', 'Modal'],
|
53
|
+
['NN', 'Noun, singular or mass'],
|
54
|
+
['NNS', 'Noun, plural'],
|
55
|
+
['NNP', 'Proper noun, singular'],
|
56
|
+
['NNPS', 'Proper noun, plural'],
|
57
|
+
['PDT', 'Predeterminer'],
|
58
|
+
['POS', 'Possessive ending'],
|
59
|
+
['PRP', 'Personal pronoun'],
|
60
|
+
['PRP$', 'Possessive pronoun (prolog version PRP-S)'],
|
61
|
+
['RB', 'Adverb'],
|
62
|
+
['RBR', 'Adverb, comparative'],
|
63
|
+
['RBS', 'Adverb, superlative'],
|
64
|
+
['RP', 'Particle'],
|
65
|
+
['SYM', 'Symbol'],
|
66
|
+
['TO', 'to'],
|
67
|
+
['UH', 'Interjection'],
|
68
|
+
['VB', 'Verb, base form'],
|
69
|
+
['VBD', 'Verb, past tense'],
|
70
|
+
['VBG', 'Verb, gerund or present participle'],
|
71
|
+
['VBN', 'Verb, past participle'],
|
72
|
+
['VBP', 'Verb, non 3rd person singular present'],
|
73
|
+
['VBZ', 'Verb, 3rd person singular present'],
|
74
|
+
['WDT', 'Wh-determiner'],
|
75
|
+
['WP', 'Wh-pronoun'],
|
76
|
+
['WP$', 'Possessive wh-pronoun (prolog version WP-S)'],
|
77
|
+
['WRB', 'Wh-adverb']
|
131
78
|
]
|
79
|
+
|
132
80
|
BrownWordTagDescription = [
|
133
|
-
=begin
|
134
81
|
|
135
|
-
|
82
|
+
['.', 'sentence closer . ; ? !'],
|
83
|
+
['(', 'left parent'] ,
|
84
|
+
[')', 'right parent'],
|
85
|
+
['*', 'not'],
|
86
|
+
['--', 'dash'],
|
87
|
+
[',', 'comma'],
|
88
|
+
[':', 'colon'],
|
89
|
+
['ABL', 'pre-qualifier quite, rather'],
|
90
|
+
['ABN', 'pre-quantifier half, all'],
|
91
|
+
['ABX', 'pre-quantifier both'],
|
92
|
+
['AP', 'post-determiner many, several, next'],
|
93
|
+
['AT', 'article a, the, no'],
|
94
|
+
['BE', 'be '],
|
95
|
+
['BED', 'were '],
|
96
|
+
['BEDZ', 'was '],
|
97
|
+
['BEG', 'being '],
|
98
|
+
['BEM', 'am '],
|
99
|
+
['BEN', 'been '],
|
100
|
+
['BER', 'are, art '],
|
101
|
+
['BEZ', 'is '],
|
102
|
+
['CC', 'coordinating conjunction and, or'],
|
103
|
+
['CD', 'cardinal numeral one, two, 2, etc.'],
|
104
|
+
['CS', 'subordinating conjunction if, although'],
|
105
|
+
['DO', 'do '],
|
106
|
+
['DOD', 'did '],
|
107
|
+
['DOZ', 'does '],
|
108
|
+
['DT', 'singular determiner this, that'],
|
109
|
+
['DTI', 'singular or plural determiner/quantifier some, any'],
|
110
|
+
['DTS', 'plural determiner these, those'],
|
111
|
+
['DTX', 'determiner/double conjunction either'],
|
112
|
+
['EX', 'existentil there '],
|
113
|
+
['FW', 'foreign word (hyphenated before regular tag) '],
|
114
|
+
['HL', 'word occurring in headline (hyphenated after regular tag) '],
|
115
|
+
['HV', 'have '],
|
116
|
+
['HVD', 'had (past tense) '],
|
117
|
+
['HVG', 'having '],
|
118
|
+
['HVN', 'had (past participle) '],
|
119
|
+
['HVZ', 'has '],
|
120
|
+
['IN', 'preposition '],
|
121
|
+
['JJ', 'adjective '],
|
122
|
+
['JJR', 'comparative adjective '],
|
123
|
+
['JJS', 'semantically superlative adjective chief, top'],
|
124
|
+
['JJT', 'morphologically superlative adjective biggest'],
|
125
|
+
['MD', 'modal auxiliary can, should, will'],
|
126
|
+
['NC', 'cited word (hyphenated after regular tag) '],
|
127
|
+
['NN', 'singular or mass noun '],
|
128
|
+
['NN$', 'possessive singular noun '],
|
129
|
+
['NNS', 'plural noun '],
|
130
|
+
['NNS$', 'possessive plural noun '],
|
131
|
+
['NP', 'proper noun or part of name phrase '],
|
132
|
+
['NP$', 'possessive proper noun '],
|
133
|
+
['NPS', 'plural proper noun '],
|
134
|
+
['NPS$', 'possessive plural proper noun '],
|
135
|
+
['NR', 'adverbial noun home, today, west'],
|
136
|
+
['NRS', 'plural adverbial noun'],
|
137
|
+
['OD', 'ordinal numeral first, 2nd'],
|
138
|
+
['PN', 'nominal pronoun everybody, nothing'],
|
139
|
+
['PN$', 'possessive nominal pronoun '],
|
140
|
+
['PP$', 'possessive personal pronoun my, our'],
|
141
|
+
['PP$$', 'second (nominal) possessive pronoun mine, ours'],
|
142
|
+
['PPL', 'singular reflexive/intensive personal pronoun myself'],
|
143
|
+
['PPLS', 'plural reflexive/intensive personal pronoun ourselves'],
|
144
|
+
['PPO', 'objective personal pronoun me, him, it, them'],
|
145
|
+
['PPS', '3rd. singular nominative pronoun he, she, it, one'],
|
146
|
+
['PPSS', 'other nominative personal pronoun I, we, they, you'],
|
147
|
+
['QL', 'qualifier very, fairly'],
|
148
|
+
['QLP', 'post-qualifier enough, indeed'],
|
149
|
+
['RB', 'adverb '],
|
150
|
+
['RBR', 'comparative adverb '],
|
151
|
+
['RBT', 'superlative adverb '],
|
152
|
+
['RN', 'nominal adverb here then, indoors '],
|
153
|
+
['RP', 'adverb/particle about, off, up'],
|
154
|
+
['TL', 'word occurring in title (hyphenated after regular tag)'],
|
155
|
+
['TO', 'infinitive marker to '],
|
156
|
+
['UH', 'interjection, exclamation '],
|
157
|
+
['VB', 'verb, base form '],
|
158
|
+
['VBD', 'verb, past tense '],
|
159
|
+
['VBG', 'verb, present participle/gerund '],
|
160
|
+
['VBN', 'verb, past participle '],
|
161
|
+
['VBZ', 'verb, 3rd. singular present '],
|
162
|
+
['WDT', 'wh- determiner what, which'],
|
163
|
+
['WP$', 'possessive wh- pronoun whose'],
|
164
|
+
['WPO', 'objective wh- pronoun whom, which, that'],
|
165
|
+
['WPS', 'nominative wh- pronoun who, which, that'],
|
166
|
+
['WQL', 'wh- qualifier how'],
|
167
|
+
['WRB', 'wh- adverb how, where, when']
|
136
168
|
|
137
|
-
. sentence closer . ; ? !
|
138
|
-
( left paren
|
139
|
-
) right paren
|
140
|
-
* not, n't
|
141
|
-
-- dash
|
142
|
-
, comma
|
143
|
-
: colon
|
144
|
-
ABL pre-qualifier quite, rather
|
145
|
-
ABN pre-quantifier half, all
|
146
|
-
ABX pre-quantifier both
|
147
|
-
AP post-determiner many, several, next
|
148
|
-
AT article a, the, no
|
149
|
-
BE be
|
150
|
-
BED were
|
151
|
-
BEDZ was
|
152
|
-
BEG being
|
153
|
-
BEM am
|
154
|
-
BEN been
|
155
|
-
BER are, art
|
156
|
-
BEZ is
|
157
|
-
CC coordinating conjunction and, or
|
158
|
-
CD cardinal numeral one, two, 2, etc.
|
159
|
-
CS subordinating conjunction if, although
|
160
|
-
DO do
|
161
|
-
DOD did
|
162
|
-
DOZ does
|
163
|
-
DT singular determiner this, that
|
164
|
-
DTI singular or plural determiner/quantifier some, any
|
165
|
-
DTS plural determiner these, those
|
166
|
-
DTX determiner/double conjunction either
|
167
|
-
EX existentil there
|
168
|
-
FW foreign word (hyphenated before regular tag)
|
169
|
-
HL word occurring in headline (hyphenated after regular tag)
|
170
|
-
HV have
|
171
|
-
HVD had (past tense)
|
172
|
-
HVG having
|
173
|
-
HVN had (past participle)
|
174
|
-
HVZ has
|
175
|
-
IN preposition
|
176
|
-
JJ adjective
|
177
|
-
JJR comparative adjective
|
178
|
-
JJS semantically superlative adjective chief, top
|
179
|
-
JJT morphologically superlative adjective biggest
|
180
|
-
MD modal auxiliary can, should, will
|
181
|
-
NC cited word (hyphenated after regular tag)
|
182
|
-
NN singular or mass noun
|
183
|
-
NN$ possessive singular noun
|
184
|
-
NNS plural noun
|
185
|
-
NNS$ possessive plural noun
|
186
|
-
NP proper noun or part of name phrase
|
187
|
-
NP$ possessive proper noun
|
188
|
-
NPS plural proper noun
|
189
|
-
NPS$ possessive plural proper noun
|
190
|
-
NR adverbial noun home, today, west
|
191
|
-
NRS plural adverbial noun
|
192
|
-
OD ordinal numeral first, 2nd
|
193
|
-
PN nominal pronoun everybody, nothing
|
194
|
-
PN$ possessive nominal pronoun
|
195
|
-
PP$ possessive personal pronoun my, our
|
196
|
-
PP$$ second (nominal) possessive pronoun mine, ours
|
197
|
-
PPL singular reflexive/intensive personal pronoun myself
|
198
|
-
PPLS plural reflexive/intensive personal pronoun ourselves
|
199
|
-
PPO objective personal pronoun me, him, it, them
|
200
|
-
PPS 3rd. singular nominative pronoun he, she, it, one
|
201
|
-
PPSS other nominative personal pronoun I, we, they, you
|
202
|
-
QL qualifier very, fairly
|
203
|
-
QLP post-qualifier enough, indeed
|
204
|
-
RB adverb
|
205
|
-
RBR comparative adverb
|
206
|
-
RBT superlative adverb
|
207
|
-
RN nominal adverb here then, indoors
|
208
|
-
RP adverb/particle about, off, up
|
209
|
-
TL word occurring in title (hyphenated after
|
210
|
-
regular tag)
|
211
|
-
TO infinitive marker to
|
212
|
-
UH interjection, exclamation
|
213
|
-
VB verb, base form
|
214
|
-
VBD verb, past tense
|
215
|
-
VBG verb, present participle/gerund
|
216
|
-
VBN verb, past participle
|
217
|
-
VBZ verb, 3rd. singular present
|
218
|
-
WDT wh- determiner what, which
|
219
|
-
WP$ possessive wh- pronoun whose
|
220
|
-
WPO objective wh- pronoun whom, which, that
|
221
|
-
WPS nominative wh- pronoun who, which, that
|
222
|
-
WQL wh- qualifier how
|
223
|
-
WRB wh- adverb how, where, when
|
224
|
-
|
225
|
-
=end
|
226
169
|
]
|
170
|
+
# A description of Enju categories.
|
227
171
|
EnjuCatDescription = [
|
228
172
|
['ADJ', 'Adjective'],
|
229
173
|
['ADV', 'Adverb'],
|
@@ -330,7 +274,7 @@ WRB wh- adverb how, where, when
|
|
330
274
|
'Noun, proper, plural', ['NP0', 'NPS', 'NNPS'],
|
331
275
|
'Noun, adverbial', ['NN0', 'NR', 'NN'],
|
332
276
|
'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS'],
|
333
|
-
'Pronoun, nominal (indefinite)', ['PNI', 'PN', '
|
277
|
+
'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP'],
|
334
278
|
'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP'],
|
335
279
|
'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP'],
|
336
280
|
'Pronoun, personal, object', ['PNP', 'PPO', 'PRP'],
|
@@ -387,7 +331,6 @@ WRB wh- adverb how, where, when
|
|
387
331
|
'Symbol, alphabetical', ['ZZ0', '', ''],
|
388
332
|
'Symbol, list item', ['', '', 'LS']
|
389
333
|
]
|
390
|
-
|
391
334
|
end
|
392
335
|
end
|
393
336
|
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Treat
|
2
|
+
module Languages
|
3
|
+
class English
|
4
|
+
require 'treat/languages/english/tags'
|
5
|
+
require 'treat/languages/english/categories'
|
6
|
+
Extractors = {
|
7
|
+
time: [:chronic],
|
8
|
+
topics: [:reuters],
|
9
|
+
topic_words: [:lda],
|
10
|
+
key_sentences: [:topics_frequency]
|
11
|
+
}
|
12
|
+
Processors = {
|
13
|
+
chunkers: [:txt],
|
14
|
+
parsers: [:enju, :stanford],
|
15
|
+
segmenters: [:tactful, :punkt, :stanford],
|
16
|
+
tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
|
17
|
+
}
|
18
|
+
Lexicalizers = {
|
19
|
+
category: [:from_tag],
|
20
|
+
linkages: [:naive],
|
21
|
+
synsets: [:wordnet, :rita_wn],
|
22
|
+
tag: [:brill, :lingua, :stanford]
|
23
|
+
}
|
24
|
+
Inflectors = {
|
25
|
+
conjugations: [:linguistics],
|
26
|
+
declensions: [:linguistics, :english],
|
27
|
+
stem: [:porter_c, :porter, :uea],
|
28
|
+
ordinal_words: [:linguistics],
|
29
|
+
cardinal_words: [:linguistics]
|
30
|
+
}
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Treat
|
2
|
+
module Languages
|
3
|
+
class French
|
4
|
+
Extractors = {}
|
5
|
+
Inflectors = {}
|
6
|
+
Lexicalizers = {
|
7
|
+
tag: [:stanford]
|
8
|
+
}
|
9
|
+
Processors = {
|
10
|
+
chunkers: [:txt],
|
11
|
+
parsers: [:stanford],
|
12
|
+
segmenters: [:tactful, :punkt, :stanford],
|
13
|
+
tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
|
14
|
+
}
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Treat
|
2
|
+
module Languages
|
3
|
+
class German
|
4
|
+
Extractors = {}
|
5
|
+
Inflectors = {}
|
6
|
+
Lexicalizers = {
|
7
|
+
tag: [:stanford]
|
8
|
+
}
|
9
|
+
Processors = {
|
10
|
+
chunkers: [:txt],
|
11
|
+
parsers: [:stanford],
|
12
|
+
segmenters: [:tactful, :punkt, :stanford],
|
13
|
+
tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
|
14
|
+
}
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module Treat
|
2
|
+
module Languages
|
3
|
+
class Italian
|
4
|
+
Extractors = {}
|
5
|
+
Inflectors = {}
|
6
|
+
Lexicalizers = {}
|
7
|
+
Processors = {
|
8
|
+
chunkers: [:txt],
|
9
|
+
segmenters: [:tactful, :punkt, :stanford],
|
10
|
+
tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
|
11
|
+
}
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
File without changes
|
@@ -0,0 +1,91 @@
|
|
1
|
+
module Treat
|
2
|
+
# This module provides linguistic resources
|
3
|
+
# for the Treat library, including information
|
4
|
+
# about language codes, the functions available
|
5
|
+
# for each language, and the different tags used
|
6
|
+
# to markup that language.
|
7
|
+
module Languages
|
8
|
+
Dir["#{Treat.lib}/treat/languages/*.rb"].each { |file| require file }
|
9
|
+
ISO639_1 = 1
|
10
|
+
ISO639_2 = 2
|
11
|
+
# Describe a language code (ISO-639-1 or ISO-639-2)
|
12
|
+
# or its full text description in full French or English.
|
13
|
+
def self.describe(lang, desc_lang = :en)
|
14
|
+
raise "Must provide a non-nil language identifier to describe." if lang.nil?
|
15
|
+
lang = find(lang).to_s
|
16
|
+
if [:en, :eng, :english, :anglais].include?(desc_lang)
|
17
|
+
l = @@english_full.key(lang)
|
18
|
+
elsif [:fr, :fra, :french, :french].include?(desc_lang)
|
19
|
+
l = @@french_full.key(lang)
|
20
|
+
else
|
21
|
+
raise Treat::Exception,
|
22
|
+
"Unknown language to describe: #{desc_lang}."
|
23
|
+
end
|
24
|
+
not_found(lang) if l.nil?
|
25
|
+
l.intern
|
26
|
+
end
|
27
|
+
# Raise an error message when a language code
|
28
|
+
# or description is not found and suggest
|
29
|
+
# possible misspellings.
|
30
|
+
def self.not_found(lang)
|
31
|
+
msg = "Language '#{lang}' does not exist."
|
32
|
+
all = @@iso639_2.keys + @@iso639_1.keys +
|
33
|
+
@@english_full.keys + @@french_full.keys
|
34
|
+
msg += did_you_mean?(all, lang)
|
35
|
+
raise Treat::Exception, msg
|
36
|
+
end
|
37
|
+
# Return the class representing a language.
|
38
|
+
def self.get(lang)
|
39
|
+
const_get(Treat::Languages.describe(lang).to_s.capitalize)
|
40
|
+
end
|
41
|
+
# Find a language by ISO-639-1 or ISO-639-2 code
|
42
|
+
# or full name (in English or French) and return
|
43
|
+
# the ISO-639-1 or ISO-639-2 language code as a
|
44
|
+
# lowercase identifier.
|
45
|
+
def self.find(lang, rc = ISO639_2)
|
46
|
+
raise "Must provide a non-nil language identifier to describe." if lang.nil?
|
47
|
+
get_languages
|
48
|
+
lang = lang.to_s.downcase
|
49
|
+
if @@iso639_1.has_key?(lang)
|
50
|
+
return :"#{lang}" if rc == ISO639_1
|
51
|
+
return :"#{@@iso639_1[lang]}" if rc == ISO639_2
|
52
|
+
elsif @@iso639_2.has_key?(lang)
|
53
|
+
return :"#{lang}" if rc == ISO639_2
|
54
|
+
return :"#{@@iso639_2[lang]}" if rc == ISO639_1
|
55
|
+
elsif @@english_full.has_key?(lang)
|
56
|
+
return :"#{@@english_full[lang]}" if rc == ISO639_2
|
57
|
+
return :"#{@@iso639_2[@@english_full[lang]]}" if rc == ISO639_1
|
58
|
+
elsif @@french_full.has_key?(lang)
|
59
|
+
return :"#{@@french_full[lang]}" if rc == ISO639_2
|
60
|
+
return :"#{@@iso639_1[@@french_full[lang]]}" if rc == ISO639_2
|
61
|
+
else
|
62
|
+
not_found(lang)
|
63
|
+
end
|
64
|
+
end
|
65
|
+
@@loaded = false
|
66
|
+
# Get the languages from the dictionary.
|
67
|
+
def self.get_languages
|
68
|
+
return if @@loaded
|
69
|
+
@@iso639_1 = {}; @@iso639_2 = {};
|
70
|
+
@@english_full = {}; @@french_full = {}
|
71
|
+
languages = IO.readlines(Treat.lib + '/treat/languages/list.txt')
|
72
|
+
languages.each do |language|
|
73
|
+
iso639_2, iso639_1, english_desc, french_desc =
|
74
|
+
language.split(',')
|
75
|
+
@@iso639_1[iso639_1] = iso639_2
|
76
|
+
@@iso639_2[iso639_2] = iso639_1
|
77
|
+
unless english_desc.nil?
|
78
|
+
english_desc.strip.downcase.split('|').each do |l|
|
79
|
+
@@english_full[l.downcase.strip] = iso639_2
|
80
|
+
end
|
81
|
+
end
|
82
|
+
unless french_desc.nil?
|
83
|
+
french_desc.strip.downcase.split('|').each do |l|
|
84
|
+
@@french_full[l.downcase.strip] = iso639_2
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
@@loaded = true
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
@@ -4,24 +4,36 @@ module Treat
|
|
4
4
|
# A class that detects the category of a word from its tag,
|
5
5
|
# using the default tagger for the language of the entity.
|
6
6
|
class FromTag
|
7
|
+
DefaultOptions = { tagger: nil }
|
7
8
|
# Find the category of the current entity.
|
8
9
|
# Options:
|
9
10
|
# :tagger => (Symbol) force the use of a tagger.
|
10
11
|
# :tag_to_cat => (Hash) a list of categories for each possible tag.
|
11
12
|
def self.category(entity, options = {})
|
12
|
-
|
13
|
-
options = {
|
14
|
-
tagger: nil,
|
15
|
-
tag_to_cat: Treat::Resources::Tags::PTBWordTagToCategory
|
16
|
-
}
|
17
|
-
end
|
13
|
+
options = DefaultOptions.merge(options)
|
18
14
|
tag = options[:tagger].nil? ? entity.tag : entity.tag(options[:tagger])
|
19
|
-
|
15
|
+
lang = Treat::Languages.get(entity.language)
|
16
|
+
cat = lang::WordTagToCategory[tag]
|
20
17
|
if cat.nil?
|
21
18
|
warn "Category not found for tag #{tag}."
|
22
19
|
:unknown
|
23
20
|
else
|
24
|
-
cat
|
21
|
+
if cat.size == 1
|
22
|
+
return cat[0]
|
23
|
+
else
|
24
|
+
if entity.has?(:tag_set)
|
25
|
+
if cat[entity.tag_set]
|
26
|
+
return cat[entity.tag_set]
|
27
|
+
else
|
28
|
+
raise Treat::Exception,
|
29
|
+
"The specified tag set (#{entity.tag_set})" +
|
30
|
+
" does not contain the tag #{tag}."
|
31
|
+
end
|
32
|
+
else
|
33
|
+
raise Treat::Exception,
|
34
|
+
"No information can be found regarding which tag set to use."
|
35
|
+
end
|
36
|
+
end
|
25
37
|
end
|
26
38
|
end
|
27
39
|
end
|
@@ -25,7 +25,7 @@ module Treat
|
|
25
25
|
patch = false
|
26
26
|
# Require the 'rbtagger' gem.
|
27
27
|
begin
|
28
|
-
|
28
|
+
silence_warnings { require 'rbtagger' }
|
29
29
|
# This whole mess is required to deal with
|
30
30
|
# the fact that the 'rbtagger' gem defines
|
31
31
|
# a top-level module called 'Word', which
|
@@ -73,6 +73,7 @@ module Treat
|
|
73
73
|
# Create the tagger if necessary
|
74
74
|
@@tagger ||= ::Brill::Tagger.new(options[:lexicon],
|
75
75
|
options[:lexical_rules], options[:contextual_rules])
|
76
|
+
entity.set :tag_set, :penn
|
76
77
|
# Perform tagging.
|
77
78
|
if entity.type == :word
|
78
79
|
# Setup the context of the word
|
@@ -17,7 +17,7 @@ module Treat
|
|
17
17
|
# http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/
|
18
18
|
class Lingua
|
19
19
|
# Require the 'engtagger' gem.
|
20
|
-
|
20
|
+
silence_warnings { require 'engtagger' }
|
21
21
|
# Hold the tagger.
|
22
22
|
@@tagger = nil
|
23
23
|
# Hold the user-set options
|
@@ -46,6 +46,7 @@ module Treat
|
|
46
46
|
@@tagger = nil # Reset the tagger
|
47
47
|
end
|
48
48
|
@@tagger ||= ::EngTagger.new(@@options)
|
49
|
+
entity.set :tag_set, :penn
|
49
50
|
left = entity.left
|
50
51
|
if left.nil? || left.type != :word
|
51
52
|
left_tag = 'pp'
|