treat 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (81) hide show
  1. data/INSTALL +1 -0
  2. data/README +3 -0
  3. data/TODO +14 -26
  4. data/bin/INFO +1 -1
  5. data/lib/treat/buildable.rb +10 -11
  6. data/lib/treat/categories.rb +8 -6
  7. data/lib/treat/category.rb +7 -2
  8. data/lib/treat/delegatable.rb +64 -56
  9. data/lib/treat/detectors/encoding/r_chardet19.rb +1 -1
  10. data/lib/treat/detectors/language/language_detector.rb +2 -1
  11. data/lib/treat/detectors/language/what_language.rb +2 -2
  12. data/lib/treat/detectors.rb +3 -0
  13. data/lib/treat/entities/entity.rb +1 -1
  14. data/lib/treat/entities.rb +9 -10
  15. data/lib/treat/exception.rb +3 -1
  16. data/lib/treat/extractors/named_entity/abner.rb +1 -1
  17. data/lib/treat/extractors/named_entity/stanford.rb +2 -2
  18. data/lib/treat/extractors/time/chronic.rb +2 -2
  19. data/lib/treat/extractors/time/nickel.rb +2 -2
  20. data/lib/treat/extractors/topic_words/lda.rb +2 -2
  21. data/lib/treat/extractors.rb +12 -9
  22. data/lib/treat/feature.rb +6 -1
  23. data/lib/treat/formatters/cleaners/html.rb +1 -1
  24. data/lib/treat/formatters.rb +8 -8
  25. data/lib/treat/group.rb +11 -10
  26. data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
  27. data/lib/treat/inflectors/{conjugators → conjugations}/linguistics.rb +6 -6
  28. data/lib/treat/inflectors/{declensors → declensions}/en.rb +2 -2
  29. data/lib/treat/inflectors/{declensors → declensions}/linguistics.rb +5 -5
  30. data/lib/treat/inflectors/ordinal_words/linguistics.rb +4 -4
  31. data/lib/treat/inflectors/{stemmers → stem}/porter.rb +1 -1
  32. data/lib/treat/inflectors/{stemmers → stem}/porter_c.rb +3 -3
  33. data/lib/treat/inflectors/{stemmers → stem}/uea.rb +3 -3
  34. data/lib/treat/inflectors.rb +8 -21
  35. data/lib/treat/kernel.rb +120 -0
  36. data/lib/treat/languages/arabic.rb +14 -0
  37. data/lib/treat/languages/categories.rb +5 -0
  38. data/lib/treat/languages/chinese.rb +12 -0
  39. data/lib/treat/languages/english/categories.rb +23 -0
  40. data/lib/treat/{resources → languages/english}/tags.rb +127 -184
  41. data/lib/treat/languages/english.rb +33 -0
  42. data/lib/treat/languages/french.rb +17 -0
  43. data/lib/treat/languages/german.rb +17 -0
  44. data/lib/treat/languages/italian.rb +14 -0
  45. data/lib/treat/{resources/languages.txt → languages/list.txt} +0 -0
  46. data/lib/treat/languages/xinhua.rb +12 -0
  47. data/lib/treat/languages.rb +91 -0
  48. data/lib/treat/lexicalizers/category/from_tag.rb +20 -8
  49. data/lib/treat/lexicalizers/synsets/rita_wn.rb +1 -1
  50. data/lib/treat/lexicalizers/tag/brill.rb +2 -1
  51. data/lib/treat/lexicalizers/tag/lingua.rb +2 -1
  52. data/lib/treat/lexicalizers/tag/stanford.rb +16 -15
  53. data/lib/treat/lexicalizers.rb +1 -1
  54. data/lib/treat/object.rb +6 -0
  55. data/lib/treat/processors/parsers/enju.rb +3 -2
  56. data/lib/treat/processors/parsers/stanford.rb +15 -12
  57. data/lib/treat/processors/segmenters/punkt.rb +1 -1
  58. data/lib/treat/processors/segmenters/stanford.rb +7 -5
  59. data/lib/treat/processors/segmenters/tactful.rb +1 -1
  60. data/lib/treat/processors/tokenizers/multilingual.rb +2 -2
  61. data/lib/treat/processors/tokenizers/stanford.rb +7 -5
  62. data/lib/treat/visitable.rb +2 -1
  63. data/lib/treat.rb +105 -54
  64. data/test/tc_entity.rb +5 -0
  65. data/test/tc_resources.rb +5 -5
  66. data/test/tc_treat.rb +1 -2
  67. data/test/tests.rb +2 -1
  68. metadata +63 -64
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +0 -96
  70. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +0 -213
  71. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +0 -68
  72. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +0 -6
  73. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +0 -12
  74. data/lib/treat/resources/categories.rb +0 -18
  75. data/lib/treat/resources/delegates.rb +0 -96
  76. data/lib/treat/resources/dependencies.rb +0 -0
  77. data/lib/treat/resources/edges.rb +0 -8
  78. data/lib/treat/resources/formats.rb +0 -23
  79. data/lib/treat/resources/languages.rb +0 -86
  80. data/lib/treat/resources.rb +0 -10
  81. data/lib/treat/utilities.rb +0 -127
@@ -1,60 +1,10 @@
1
- # encoding: UTF-8
2
-
3
1
  module Treat
4
- module Resources
5
- class Tags
2
+ module Languages
3
+ class English
6
4
 
7
5
  ClawsC5 = 0
8
6
  Brown = 1
9
7
  Penn = 2
10
- Enju = 3
11
-
12
- PTBWordTagToCategory = {
13
- 'CC' => :conjunction, # Coordinating conjunction
14
- 'CD' => :number, # Cardinal number
15
- 'DT' => :determiner, # Determiner
16
- 'DET' => :determiner, # Determiner
17
- 'EX' => :determiner, # Existential there
18
- 'FW' => :foreign, # Foreign word
19
- 'IN' => :preposition, # Preposition or subordinating conjunction
20
- 'JJ' => :adjective, # Adjective
21
- 'JJR' => :adjective, # Adjective, comparative
22
- 'JJS' => :adjective, # Adjective, superlative
23
- 'LS' => :list, # List item marker
24
- 'MD' => :modal, # Modal
25
- 'NN' => :noun, # Noun, singular or mass
26
- 'NNS' => :noun, # Noun, plural
27
- 'NNP' => :noun, # Proper noun, singular
28
- 'NNPS' => :noun, # Proper noun, plural
29
- 'PDT' => :determiner, # Predeterminer
30
- 'POS' => :determiner, # Possessive ending
31
- 'PRP' => :pronoun, # Personal pronoun
32
- 'PRP$' => :pronoun, # Possessive pronoun,
33
- 'PRPS' => :determiner, # Possessive determiner
34
- 'RB' => :adverb, # Adverb
35
- 'RBR' => :adverb, # Adverb, comparative
36
- 'RBS' => :adverb, # Adverb, superlative
37
- 'RP' => :particle, # Particle
38
- 'SYM' => :symbol, # Symbol
39
- 'TO' => :to, # to
40
- 'UH' => :interjection, # Interjection
41
- 'VB' => :verb, # Verb, base form
42
- 'VBD' => :verb, # Verb, past tense
43
- 'VBG' => :verb, # Verb, gerund or present participle
44
- 'VBN' => :verb, # Verb, past participle
45
- 'VBP' => :verb, # Verb, non-3rd person singular present
46
- 'VBZ' => :verb, # Verb, 3rd person singular present
47
- 'WDT' => :determiner, # Wh-determiner
48
- 'WP' => :pronoun, # Wh-pronoun
49
- 'WP$' => :pronoun, # Possessive wh-pronoun
50
- 'WRB' => :adverb, # Wh-adverb
51
- ')' => :punctuation, # Right bracket
52
- '(' => :punctuation, # Left bracket
53
- '.' => :punctuation, # Period
54
- '\'\'' => :symbol, # Quote
55
- ',' => :punctuation,
56
- ';' => :punctuation
57
- }
58
8
 
59
9
  PTBClauseTagDescription = [
60
10
  ['S', 'Simple declarative clause'],
@@ -89,141 +39,135 @@ module Treat
89
39
  ]
90
40
 
91
41
  PTBWordTagDescription = [
92
- =begin
93
- CC - Coordinating conjunction
94
- CD - Cardinal number
95
- DT - Determiner
96
- EX - Existential there
97
- FW - Foreign word
98
- IN - Preposition or subordinating conjunction
99
- JJ - Adjective
100
- JJR - Adjective, comparative
101
- JJS - Adjective, superlative
102
- LS - List item marker
103
- MD - Modal
104
- NN - Noun, singular or mass
105
- NNS - Noun, plural
106
- NNP - Proper noun, singular
107
- NNPS - Proper noun, plural
108
- PDT - Predeterminer
109
- POS - Possessive ending
110
- PRP - Personal pronoun
111
- PRP$ - Possessive pronoun (prolog version PRP-S)
112
- RB - Adverb
113
- RBR - Adverb, comparative
114
- RBS - Adverb, superlative
115
- RP - Particle
116
- SYM - Symbol
117
- TO - to
118
- UH - Interjection
119
- VB - Verb, base form
120
- VBD - Verb, past tense
121
- VBG - Verb, gerund or present participle
122
- VBN - Verb, past participle
123
- VBP - Verb, non-3rd person singular present
124
- VBZ - Verb, 3rd person singular present
125
- WDT - Wh-determiner
126
- WP - Wh-pronoun
127
- WP$ - Possessive wh-pronoun (prolog version WP-S)
128
- WRB - Wh-adverb
129
-
130
- =end
42
+ ['CC', 'Coordinating conjunction'],
43
+ ['CD', 'Cardinal number'],
44
+ ['DT', 'Determiner'],
45
+ ['EX', 'Existential there'],
46
+ ['FW', 'Foreign word'],
47
+ ['IN', 'Preposition or subordinating conjunction'],
48
+ ['JJ', 'Adjective'],
49
+ ['JJR', 'Adjective, comparative'],
50
+ ['JJS', 'Adjective, superlative'],
51
+ ['LS', 'List item marker'],
52
+ ['MD', 'Modal'],
53
+ ['NN', 'Noun, singular or mass'],
54
+ ['NNS', 'Noun, plural'],
55
+ ['NNP', 'Proper noun, singular'],
56
+ ['NNPS', 'Proper noun, plural'],
57
+ ['PDT', 'Predeterminer'],
58
+ ['POS', 'Possessive ending'],
59
+ ['PRP', 'Personal pronoun'],
60
+ ['PRP$', 'Possessive pronoun (prolog version PRP-S)'],
61
+ ['RB', 'Adverb'],
62
+ ['RBR', 'Adverb, comparative'],
63
+ ['RBS', 'Adverb, superlative'],
64
+ ['RP', 'Particle'],
65
+ ['SYM', 'Symbol'],
66
+ ['TO', 'to'],
67
+ ['UH', 'Interjection'],
68
+ ['VB', 'Verb, base form'],
69
+ ['VBD', 'Verb, past tense'],
70
+ ['VBG', 'Verb, gerund or present participle'],
71
+ ['VBN', 'Verb, past participle'],
72
+ ['VBP', 'Verb, non 3rd person singular present'],
73
+ ['VBZ', 'Verb, 3rd person singular present'],
74
+ ['WDT', 'Wh-determiner'],
75
+ ['WP', 'Wh-pronoun'],
76
+ ['WP$', 'Possessive wh-pronoun (prolog version WP-S)'],
77
+ ['WRB', 'Wh-adverb']
131
78
  ]
79
+
132
80
  BrownWordTagDescription = [
133
- =begin
134
81
 
135
- Tag Description Examples
82
+ ['.', 'sentence closer . ; ? !'],
83
+ ['(', 'left parent'] ,
84
+ [')', 'right parent'],
85
+ ['*', 'not'],
86
+ ['--', 'dash'],
87
+ [',', 'comma'],
88
+ [':', 'colon'],
89
+ ['ABL', 'pre-qualifier quite, rather'],
90
+ ['ABN', 'pre-quantifier half, all'],
91
+ ['ABX', 'pre-quantifier both'],
92
+ ['AP', 'post-determiner many, several, next'],
93
+ ['AT', 'article a, the, no'],
94
+ ['BE', 'be '],
95
+ ['BED', 'were '],
96
+ ['BEDZ', 'was '],
97
+ ['BEG', 'being '],
98
+ ['BEM', 'am '],
99
+ ['BEN', 'been '],
100
+ ['BER', 'are, art '],
101
+ ['BEZ', 'is '],
102
+ ['CC', 'coordinating conjunction and, or'],
103
+ ['CD', 'cardinal numeral one, two, 2, etc.'],
104
+ ['CS', 'subordinating conjunction if, although'],
105
+ ['DO', 'do '],
106
+ ['DOD', 'did '],
107
+ ['DOZ', 'does '],
108
+ ['DT', 'singular determiner this, that'],
109
+ ['DTI', 'singular or plural determiner/quantifier some, any'],
110
+ ['DTS', 'plural determiner these, those'],
111
+ ['DTX', 'determiner/double conjunction either'],
112
+ ['EX', 'existentil there '],
113
+ ['FW', 'foreign word (hyphenated before regular tag) '],
114
+ ['HL', 'word occurring in headline (hyphenated after regular tag) '],
115
+ ['HV', 'have '],
116
+ ['HVD', 'had (past tense) '],
117
+ ['HVG', 'having '],
118
+ ['HVN', 'had (past participle) '],
119
+ ['HVZ', 'has '],
120
+ ['IN', 'preposition '],
121
+ ['JJ', 'adjective '],
122
+ ['JJR', 'comparative adjective '],
123
+ ['JJS', 'semantically superlative adjective chief, top'],
124
+ ['JJT', 'morphologically superlative adjective biggest'],
125
+ ['MD', 'modal auxiliary can, should, will'],
126
+ ['NC', 'cited word (hyphenated after regular tag) '],
127
+ ['NN', 'singular or mass noun '],
128
+ ['NN$', 'possessive singular noun '],
129
+ ['NNS', 'plural noun '],
130
+ ['NNS$', 'possessive plural noun '],
131
+ ['NP', 'proper noun or part of name phrase '],
132
+ ['NP$', 'possessive proper noun '],
133
+ ['NPS', 'plural proper noun '],
134
+ ['NPS$', 'possessive plural proper noun '],
135
+ ['NR', 'adverbial noun home, today, west'],
136
+ ['NRS', 'plural adverbial noun'],
137
+ ['OD', 'ordinal numeral first, 2nd'],
138
+ ['PN', 'nominal pronoun everybody, nothing'],
139
+ ['PN$', 'possessive nominal pronoun '],
140
+ ['PP$', 'possessive personal pronoun my, our'],
141
+ ['PP$$', 'second (nominal) possessive pronoun mine, ours'],
142
+ ['PPL', 'singular reflexive/intensive personal pronoun myself'],
143
+ ['PPLS', 'plural reflexive/intensive personal pronoun ourselves'],
144
+ ['PPO', 'objective personal pronoun me, him, it, them'],
145
+ ['PPS', '3rd. singular nominative pronoun he, she, it, one'],
146
+ ['PPSS', 'other nominative personal pronoun I, we, they, you'],
147
+ ['QL', 'qualifier very, fairly'],
148
+ ['QLP', 'post-qualifier enough, indeed'],
149
+ ['RB', 'adverb '],
150
+ ['RBR', 'comparative adverb '],
151
+ ['RBT', 'superlative adverb '],
152
+ ['RN', 'nominal adverb here then, indoors '],
153
+ ['RP', 'adverb/particle about, off, up'],
154
+ ['TL', 'word occurring in title (hyphenated after regular tag)'],
155
+ ['TO', 'infinitive marker to '],
156
+ ['UH', 'interjection, exclamation '],
157
+ ['VB', 'verb, base form '],
158
+ ['VBD', 'verb, past tense '],
159
+ ['VBG', 'verb, present participle/gerund '],
160
+ ['VBN', 'verb, past participle '],
161
+ ['VBZ', 'verb, 3rd. singular present '],
162
+ ['WDT', 'wh- determiner what, which'],
163
+ ['WP$', 'possessive wh- pronoun whose'],
164
+ ['WPO', 'objective wh- pronoun whom, which, that'],
165
+ ['WPS', 'nominative wh- pronoun who, which, that'],
166
+ ['WQL', 'wh- qualifier how'],
167
+ ['WRB', 'wh- adverb how, where, when']
136
168
 
137
- . sentence closer . ; ? !
138
- ( left paren
139
- ) right paren
140
- * not, n't
141
- -- dash
142
- , comma
143
- : colon
144
- ABL pre-qualifier quite, rather
145
- ABN pre-quantifier half, all
146
- ABX pre-quantifier both
147
- AP post-determiner many, several, next
148
- AT article a, the, no
149
- BE be
150
- BED were
151
- BEDZ was
152
- BEG being
153
- BEM am
154
- BEN been
155
- BER are, art
156
- BEZ is
157
- CC coordinating conjunction and, or
158
- CD cardinal numeral one, two, 2, etc.
159
- CS subordinating conjunction if, although
160
- DO do
161
- DOD did
162
- DOZ does
163
- DT singular determiner this, that
164
- DTI singular or plural determiner/quantifier some, any
165
- DTS plural determiner these, those
166
- DTX determiner/double conjunction either
167
- EX existentil there
168
- FW foreign word (hyphenated before regular tag)
169
- HL word occurring in headline (hyphenated after regular tag)
170
- HV have
171
- HVD had (past tense)
172
- HVG having
173
- HVN had (past participle)
174
- HVZ has
175
- IN preposition
176
- JJ adjective
177
- JJR comparative adjective
178
- JJS semantically superlative adjective chief, top
179
- JJT morphologically superlative adjective biggest
180
- MD modal auxiliary can, should, will
181
- NC cited word (hyphenated after regular tag)
182
- NN singular or mass noun
183
- NN$ possessive singular noun
184
- NNS plural noun
185
- NNS$ possessive plural noun
186
- NP proper noun or part of name phrase
187
- NP$ possessive proper noun
188
- NPS plural proper noun
189
- NPS$ possessive plural proper noun
190
- NR adverbial noun home, today, west
191
- NRS plural adverbial noun
192
- OD ordinal numeral first, 2nd
193
- PN nominal pronoun everybody, nothing
194
- PN$ possessive nominal pronoun
195
- PP$ possessive personal pronoun my, our
196
- PP$$ second (nominal) possessive pronoun mine, ours
197
- PPL singular reflexive/intensive personal pronoun myself
198
- PPLS plural reflexive/intensive personal pronoun ourselves
199
- PPO objective personal pronoun me, him, it, them
200
- PPS 3rd. singular nominative pronoun he, she, it, one
201
- PPSS other nominative personal pronoun I, we, they, you
202
- QL qualifier very, fairly
203
- QLP post-qualifier enough, indeed
204
- RB adverb
205
- RBR comparative adverb
206
- RBT superlative adverb
207
- RN nominal adverb here then, indoors
208
- RP adverb/particle about, off, up
209
- TL word occurring in title (hyphenated after
210
- regular tag)
211
- TO infinitive marker to
212
- UH interjection, exclamation
213
- VB verb, base form
214
- VBD verb, past tense
215
- VBG verb, present participle/gerund
216
- VBN verb, past participle
217
- VBZ verb, 3rd. singular present
218
- WDT wh- determiner what, which
219
- WP$ possessive wh- pronoun whose
220
- WPO objective wh- pronoun whom, which, that
221
- WPS nominative wh- pronoun who, which, that
222
- WQL wh- qualifier how
223
- WRB wh- adverb how, where, when
224
-
225
- =end
226
169
  ]
170
+ # A description of Enju categories.
227
171
  EnjuCatDescription = [
228
172
  ['ADJ', 'Adjective'],
229
173
  ['ADV', 'Adverb'],
@@ -330,7 +274,7 @@ WRB wh- adverb how, where, when
330
274
  'Noun, proper, plural', ['NP0', 'NPS', 'NNPS'],
331
275
  'Noun, adverbial', ['NN0', 'NR', 'NN'],
332
276
  'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS'],
333
- 'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'NN'],
277
+ 'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP'],
334
278
  'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP'],
335
279
  'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP'],
336
280
  'Pronoun, personal, object', ['PNP', 'PPO', 'PRP'],
@@ -387,7 +331,6 @@ WRB wh- adverb how, where, when
387
331
  'Symbol, alphabetical', ['ZZ0', '', ''],
388
332
  'Symbol, list item', ['', '', 'LS']
389
333
  ]
390
-
391
334
  end
392
335
  end
393
336
  end
@@ -0,0 +1,33 @@
1
module Treat
  module Languages
    # English-language resources. Each constant below maps a task name to
    # the ordered list of symbols registered for that task in English.
    class English
      # Tag sets and tag-to-category tables for English.
      require 'treat/languages/english/tags'
      require 'treat/languages/english/categories'

      Extractors = {
        :time => [:chronic],
        :topics => [:reuters],
        :topic_words => [:lda],
        :key_sentences => [:topics_frequency]
      }

      Processors = {
        :chunkers => [:txt],
        :parsers => [:enju, :stanford],
        :segmenters => [:tactful, :punkt, :stanford],
        :tokenizers => [
          :multilingual, :macintyre, :perl,
          :punkt, :tactful, :stanford
        ]
      }

      Lexicalizers = {
        :category => [:from_tag],
        :linkages => [:naive],
        :synsets => [:wordnet, :rita_wn],
        :tag => [:brill, :lingua, :stanford]
      }

      # NOTE(review): confirm that :english resolves to an existing
      # declensions implementation file.
      Inflectors = {
        :conjugations => [:linguistics],
        :declensions => [:linguistics, :english],
        :stem => [:porter_c, :porter, :uea],
        :ordinal_words => [:linguistics],
        :cardinal_words => [:linguistics]
      }
    end
  end
end
@@ -0,0 +1,17 @@
1
module Treat
  module Languages
    # French-language resources. Each constant maps a task name to the
    # ordered list of symbols registered for that task in French; an
    # empty hash means nothing is registered for that group.
    class French
      Extractors = {}
      Inflectors = {}

      Lexicalizers = {
        :tag => [:stanford]
      }

      Processors = {
        :chunkers => [:txt],
        :parsers => [:stanford],
        :segmenters => [:tactful, :punkt, :stanford],
        :tokenizers => [
          :multilingual, :macintyre, :perl,
          :punkt, :tactful, :stanford
        ]
      }
    end
  end
end
@@ -0,0 +1,17 @@
1
module Treat
  module Languages
    # German-language resources. Each constant maps a task name to the
    # ordered list of symbols registered for that task in German; an
    # empty hash means nothing is registered for that group.
    class German
      Extractors = {}
      Inflectors = {}

      Lexicalizers = {
        :tag => [:stanford]
      }

      Processors = {
        :chunkers => [:txt],
        :parsers => [:stanford],
        :segmenters => [:tactful, :punkt, :stanford],
        :tokenizers => [
          :multilingual, :macintyre, :perl,
          :punkt, :tactful, :stanford
        ]
      }
    end
  end
end
@@ -0,0 +1,14 @@
1
module Treat
  module Languages
    # Italian-language resources. Only processors are registered; the
    # other groups are present but empty.
    class Italian
      Extractors = {}
      Inflectors = {}
      Lexicalizers = {}

      Processors = {
        :chunkers => [:txt],
        :segmenters => [:tactful, :punkt, :stanford],
        :tokenizers => [
          :multilingual, :macintyre, :perl,
          :punkt, :tactful, :stanford
        ]
      }
    end
  end
end
@@ -0,0 +1,12 @@
1
module Treat
  module Languages
    # Resources registered under the name Xinhua. Only a parser is
    # listed; the other groups are present but empty.
    class Xinhua
      Extractors = {}
      Inflectors = {}
      Lexicalizers = {}

      Processors = {
        :parsers => [:stanford]
      }
    end
  end
end
@@ -0,0 +1,91 @@
1
module Treat
  # This module provides linguistic resources for the Treat library,
  # including information about language codes, the functions available
  # for each language, and the different tags used to mark up that
  # language.
  module Languages
    # Load every language definition found next to this file.
    Dir["#{Treat.lib}/treat/languages/*.rb"].each { |file| require file }

    # Selectors for the kind of code that `find` should return.
    ISO639_1 = 1
    ISO639_2 = 2

    # Set to true once the language list has been parsed.
    @@loaded = false

    # Describe a language, given as an ISO-639-1 code, an ISO-639-2
    # code, or a full English or French name.
    #
    # lang      - the language identifier (must not be nil).
    # desc_lang - the language of the returned description:
    #             :en, :eng, :english, :anglais for English;
    #             :fr, :fra, :french, :francais for French.
    #
    # Returns the full language name as a Symbol.
    # Raises RuntimeError if lang is nil, and Treat::Exception if
    # desc_lang is not recognised or the language cannot be found.
    def self.describe(lang, desc_lang = :en)
      raise "Must provide a non-nil language identifier to describe." if lang.nil?
      lang = find(lang).to_s
      if [:en, :eng, :english, :anglais].include?(desc_lang)
        l = @@english_full.key(lang)
      elsif [:fr, :fra, :french, :francais].include?(desc_lang)
        # The list used to repeat :french; :francais mirrors :anglais.
        l = @@french_full.key(lang)
      else
        raise Treat::Exception,
              "Unknown language to describe: #{desc_lang}."
      end
      not_found(lang) if l.nil?
      l.intern
    end

    # Raise a Treat::Exception saying that a language code or name
    # does not exist. The message is extended with whatever
    # did_you_mean? returns for the known codes and names.
    def self.not_found(lang)
      msg = "Language '#{lang}' does not exist."
      all = @@iso639_2.keys + @@iso639_1.keys +
            @@english_full.keys + @@french_full.keys
      msg += did_you_mean?(all, lang)
      raise Treat::Exception, msg
    end

    # Return the class representing a language, looked up by the
    # capitalized English name of the language.
    def self.get(lang)
      const_get(Treat::Languages.describe(lang).to_s.capitalize)
    end

    # Find a language by ISO-639-1 code, ISO-639-2 code, or full name
    # (in English or French).
    #
    # lang - the language identifier (must not be nil); matching is
    #        case-insensitive.
    # rc   - ISO639_2 (default) or ISO639_1: the kind of code to return.
    #
    # Returns the requested code as a lowercase Symbol, or nil when rc
    # is neither ISO639_1 nor ISO639_2.
    # Raises RuntimeError if lang is nil, and Treat::Exception if the
    # language is unknown.
    def self.find(lang, rc = ISO639_2)
      raise "Must provide a non-nil language identifier to describe." if lang.nil?
      get_languages
      lang = lang.to_s.downcase
      if @@iso639_1.has_key?(lang)
        return :"#{lang}" if rc == ISO639_1
        return :"#{@@iso639_1[lang]}" if rc == ISO639_2
      elsif @@iso639_2.has_key?(lang)
        return :"#{lang}" if rc == ISO639_2
        return :"#{@@iso639_2[lang]}" if rc == ISO639_1
      elsif @@english_full.has_key?(lang)
        return :"#{@@english_full[lang]}" if rc == ISO639_2
        return :"#{@@iso639_2[@@english_full[lang]]}" if rc == ISO639_1
      elsif @@french_full.has_key?(lang)
        return :"#{@@french_full[lang]}" if rc == ISO639_2
        # Full names map to ISO-639-2 codes, so the ISO-639-1 code is
        # reached through the @@iso639_2 table, as in the English branch.
        return :"#{@@iso639_2[@@french_full[lang]]}" if rc == ISO639_1
      else
        not_found(lang)
      end
      nil
    end

    # Load the language tables from list.txt. This is done only once;
    # later calls return immediately.
    #
    # Each line is split on commas into: ISO-639-2 code, ISO-639-1
    # code, English names, French names. Several names for one
    # language are separated by '|'.
    def self.get_languages
      return if @@loaded
      @@iso639_1 = {}
      @@iso639_2 = {}
      @@english_full = {}
      @@french_full = {}
      IO.readlines(Treat.lib + '/treat/languages/list.txt').each do |language|
        iso639_2, iso639_1, english_desc, french_desc = language.split(',')
        # NOTE(review): a record with an empty ISO-639-1 field registers
        # the key "" here; confirm list.txt never contains one.
        @@iso639_1[iso639_1] = iso639_2
        @@iso639_2[iso639_2] = iso639_1
        [[english_desc, @@english_full],
         [french_desc, @@french_full]].each do |names, table|
          next if names.nil?
          names.strip.downcase.split('|').each do |name|
            table[name.strip] = iso639_2
          end
        end
      end
      @@loaded = true
    end
  end
end
@@ -4,24 +4,36 @@ module Treat
4
4
  # A class that detects the category of a word from its tag,
5
5
  # using the default tagger for the language of the entity.
6
6
  class FromTag
7
+ DefaultOptions = { tagger: nil }
7
8
  # Find the category of the current entity.
8
9
  # Options:
9
10
  # :tagger => (Symbol) force the use of a tagger.
10
11
  # :tag_to_cat => (Hash) a list of categories for each possible tag.
11
12
  def self.category(entity, options = {})
12
- if options.empty?
13
- options = {
14
- tagger: nil,
15
- tag_to_cat: Treat::Resources::Tags::PTBWordTagToCategory
16
- }
17
- end
13
+ options = DefaultOptions.merge(options)
18
14
  tag = options[:tagger].nil? ? entity.tag : entity.tag(options[:tagger])
19
- cat = options[:tag_to_cat][tag]
15
+ lang = Treat::Languages.get(entity.language)
16
+ cat = lang::WordTagToCategory[tag]
20
17
  if cat.nil?
21
18
  warn "Category not found for tag #{tag}."
22
19
  :unknown
23
20
  else
24
- cat
21
+ if cat.size == 1
22
+ return cat[0]
23
+ else
24
+ if entity.has?(:tag_set)
25
+ if cat[entity.tag_set]
26
+ return cat[entity.tag_set]
27
+ else
28
+ raise Treat::Exception,
29
+ "The specified tag set (#{entity.tag_set})" +
30
+ " does not contain the tag #{tag}."
31
+ end
32
+ else
33
+ raise Treat::Exception,
34
+ "No information can be found regarding which tag set to use."
35
+ end
36
+ end
25
37
  end
26
38
  end
27
39
  end
@@ -4,7 +4,7 @@ module Treat
4
4
  # Currently not implemented.
5
5
  class RitaWn
6
6
  # Require the Ruby-Java bridge.
7
- #silently do
7
+ #silence_warnings do
8
8
  require 'rjb'
9
9
  # Load the RitaWN jars.
10
10
  Rjb::load("#{Treat.bin}/jwnl/jwnl.jar", [])
@@ -25,7 +25,7 @@ module Treat
25
25
  patch = false
26
26
  # Require the 'rbtagger' gem.
27
27
  begin
28
- silently { require 'rbtagger' }
28
+ silence_warnings { require 'rbtagger' }
29
29
  # This whole mess is required to deal with
30
30
  # the fact that the 'rbtagger' gem defines
31
31
  # a top-level module called 'Word', which
@@ -73,6 +73,7 @@ module Treat
73
73
  # Create the tagger if necessary
74
74
  @@tagger ||= ::Brill::Tagger.new(options[:lexicon],
75
75
  options[:lexical_rules], options[:contextual_rules])
76
+ entity.set :tag_set, :penn
76
77
  # Perform tagging.
77
78
  if entity.type == :word
78
79
  # Setup the context of the word
@@ -17,7 +17,7 @@ module Treat
17
17
  # http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/
18
18
  class Lingua
19
19
  # Require the 'engtagger' gem.
20
- silently { require 'engtagger' }
20
+ silence_warnings { require 'engtagger' }
21
21
  # Hold the tagger.
22
22
  @@tagger = nil
23
23
  # Hold the user-set options
@@ -46,6 +46,7 @@ module Treat
46
46
  @@tagger = nil # Reset the tagger
47
47
  end
48
48
  @@tagger ||= ::EngTagger.new(@@options)
49
+ entity.set :tag_set, :penn
49
50
  left = entity.left
50
51
  if left.nil? || left.type != :word
51
52
  left_tag = 'pp'