treat 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. data/INSTALL +1 -0
  2. data/README +3 -0
  3. data/TODO +14 -26
  4. data/bin/INFO +1 -1
  5. data/lib/treat/buildable.rb +10 -11
  6. data/lib/treat/categories.rb +8 -6
  7. data/lib/treat/category.rb +7 -2
  8. data/lib/treat/delegatable.rb +64 -56
  9. data/lib/treat/detectors/encoding/r_chardet19.rb +1 -1
  10. data/lib/treat/detectors/language/language_detector.rb +2 -1
  11. data/lib/treat/detectors/language/what_language.rb +2 -2
  12. data/lib/treat/detectors.rb +3 -0
  13. data/lib/treat/entities/entity.rb +1 -1
  14. data/lib/treat/entities.rb +9 -10
  15. data/lib/treat/exception.rb +3 -1
  16. data/lib/treat/extractors/named_entity/abner.rb +1 -1
  17. data/lib/treat/extractors/named_entity/stanford.rb +2 -2
  18. data/lib/treat/extractors/time/chronic.rb +2 -2
  19. data/lib/treat/extractors/time/nickel.rb +2 -2
  20. data/lib/treat/extractors/topic_words/lda.rb +2 -2
  21. data/lib/treat/extractors.rb +12 -9
  22. data/lib/treat/feature.rb +6 -1
  23. data/lib/treat/formatters/cleaners/html.rb +1 -1
  24. data/lib/treat/formatters.rb +8 -8
  25. data/lib/treat/group.rb +11 -10
  26. data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
  27. data/lib/treat/inflectors/{conjugators → conjugations}/linguistics.rb +6 -6
  28. data/lib/treat/inflectors/{declensors → declensions}/en.rb +2 -2
  29. data/lib/treat/inflectors/{declensors → declensions}/linguistics.rb +5 -5
  30. data/lib/treat/inflectors/ordinal_words/linguistics.rb +4 -4
  31. data/lib/treat/inflectors/{stemmers → stem}/porter.rb +1 -1
  32. data/lib/treat/inflectors/{stemmers → stem}/porter_c.rb +3 -3
  33. data/lib/treat/inflectors/{stemmers → stem}/uea.rb +3 -3
  34. data/lib/treat/inflectors.rb +8 -21
  35. data/lib/treat/kernel.rb +120 -0
  36. data/lib/treat/languages/arabic.rb +14 -0
  37. data/lib/treat/languages/categories.rb +5 -0
  38. data/lib/treat/languages/chinese.rb +12 -0
  39. data/lib/treat/languages/english/categories.rb +23 -0
  40. data/lib/treat/{resources → languages/english}/tags.rb +127 -184
  41. data/lib/treat/languages/english.rb +33 -0
  42. data/lib/treat/languages/french.rb +17 -0
  43. data/lib/treat/languages/german.rb +17 -0
  44. data/lib/treat/languages/italian.rb +14 -0
  45. data/lib/treat/{resources/languages.txt → languages/list.txt} +0 -0
  46. data/lib/treat/languages/xinhua.rb +12 -0
  47. data/lib/treat/languages.rb +91 -0
  48. data/lib/treat/lexicalizers/category/from_tag.rb +20 -8
  49. data/lib/treat/lexicalizers/synsets/rita_wn.rb +1 -1
  50. data/lib/treat/lexicalizers/tag/brill.rb +2 -1
  51. data/lib/treat/lexicalizers/tag/lingua.rb +2 -1
  52. data/lib/treat/lexicalizers/tag/stanford.rb +16 -15
  53. data/lib/treat/lexicalizers.rb +1 -1
  54. data/lib/treat/object.rb +6 -0
  55. data/lib/treat/processors/parsers/enju.rb +3 -2
  56. data/lib/treat/processors/parsers/stanford.rb +15 -12
  57. data/lib/treat/processors/segmenters/punkt.rb +1 -1
  58. data/lib/treat/processors/segmenters/stanford.rb +7 -5
  59. data/lib/treat/processors/segmenters/tactful.rb +1 -1
  60. data/lib/treat/processors/tokenizers/multilingual.rb +2 -2
  61. data/lib/treat/processors/tokenizers/stanford.rb +7 -5
  62. data/lib/treat/visitable.rb +2 -1
  63. data/lib/treat.rb +105 -54
  64. data/test/tc_entity.rb +5 -0
  65. data/test/tc_resources.rb +5 -5
  66. data/test/tc_treat.rb +1 -2
  67. data/test/tests.rb +2 -1
  68. metadata +63 -64
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +0 -96
  70. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +0 -213
  71. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +0 -68
  72. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +0 -6
  73. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +0 -12
  74. data/lib/treat/resources/categories.rb +0 -18
  75. data/lib/treat/resources/delegates.rb +0 -96
  76. data/lib/treat/resources/dependencies.rb +0 -0
  77. data/lib/treat/resources/edges.rb +0 -8
  78. data/lib/treat/resources/formats.rb +0 -23
  79. data/lib/treat/resources/languages.rb +0 -86
  80. data/lib/treat/resources.rb +0 -10
  81. data/lib/treat/utilities.rb +0 -127
@@ -1,60 +1,10 @@
1
- # encoding: UTF-8
2
-
3
1
  module Treat
4
- module Resources
5
- class Tags
2
+ module Languages
3
+ class English
6
4
 
7
5
  ClawsC5 = 0
8
6
  Brown = 1
9
7
  Penn = 2
10
- Enju = 3
11
-
12
- PTBWordTagToCategory = {
13
- 'CC' => :conjunction, # Coordinating conjunction
14
- 'CD' => :number, # Cardinal number
15
- 'DT' => :determiner, # Determiner
16
- 'DET' => :determiner, # Determiner
17
- 'EX' => :determiner, # Existential there
18
- 'FW' => :foreign, # Foreign word
19
- 'IN' => :preposition, # Preposition or subordinating conjunction
20
- 'JJ' => :adjective, # Adjective
21
- 'JJR' => :adjective, # Adjective, comparative
22
- 'JJS' => :adjective, # Adjective, superlative
23
- 'LS' => :list, # List item marker
24
- 'MD' => :modal, # Modal
25
- 'NN' => :noun, # Noun, singular or mass
26
- 'NNS' => :noun, # Noun, plural
27
- 'NNP' => :noun, # Proper noun, singular
28
- 'NNPS' => :noun, # Proper noun, plural
29
- 'PDT' => :determiner, # Predeterminer
30
- 'POS' => :determiner, # Possessive ending
31
- 'PRP' => :pronoun, # Personal pronoun
32
- 'PRP$' => :pronoun, # Possessive pronoun,
33
- 'PRPS' => :determiner, # Possessive determiner
34
- 'RB' => :adverb, # Adverb
35
- 'RBR' => :adverb, # Adverb, comparative
36
- 'RBS' => :adverb, # Adverb, superlative
37
- 'RP' => :particle, # Particle
38
- 'SYM' => :symbol, # Symbol
39
- 'TO' => :to, # to
40
- 'UH' => :interjection, # Interjection
41
- 'VB' => :verb, # Verb, base form
42
- 'VBD' => :verb, # Verb, past tense
43
- 'VBG' => :verb, # Verb, gerund or present participle
44
- 'VBN' => :verb, # Verb, past participle
45
- 'VBP' => :verb, # Verb, non-3rd person singular present
46
- 'VBZ' => :verb, # Verb, 3rd person singular present
47
- 'WDT' => :determiner, # Wh-determiner
48
- 'WP' => :pronoun, # Wh-pronoun
49
- 'WP$' => :pronoun, # Possessive wh-pronoun
50
- 'WRB' => :adverb, # Wh-adverb
51
- ')' => :punctuation, # Right bracket
52
- '(' => :punctuation, # Left bracket
53
- '.' => :punctuation, # Period
54
- '\'\'' => :symbol, # Quote
55
- ',' => :punctuation,
56
- ';' => :punctuation
57
- }
58
8
 
59
9
  PTBClauseTagDescription = [
60
10
  ['S', 'Simple declarative clause'],
@@ -89,141 +39,135 @@ module Treat
89
39
  ]
90
40
 
91
41
  PTBWordTagDescription = [
92
- =begin
93
- CC - Coordinating conjunction
94
- CD - Cardinal number
95
- DT - Determiner
96
- EX - Existential there
97
- FW - Foreign word
98
- IN - Preposition or subordinating conjunction
99
- JJ - Adjective
100
- JJR - Adjective, comparative
101
- JJS - Adjective, superlative
102
- LS - List item marker
103
- MD - Modal
104
- NN - Noun, singular or mass
105
- NNS - Noun, plural
106
- NNP - Proper noun, singular
107
- NNPS - Proper noun, plural
108
- PDT - Predeterminer
109
- POS - Possessive ending
110
- PRP - Personal pronoun
111
- PRP$ - Possessive pronoun (prolog version PRP-S)
112
- RB - Adverb
113
- RBR - Adverb, comparative
114
- RBS - Adverb, superlative
115
- RP - Particle
116
- SYM - Symbol
117
- TO - to
118
- UH - Interjection
119
- VB - Verb, base form
120
- VBD - Verb, past tense
121
- VBG - Verb, gerund or present participle
122
- VBN - Verb, past participle
123
- VBP - Verb, non-3rd person singular present
124
- VBZ - Verb, 3rd person singular present
125
- WDT - Wh-determiner
126
- WP - Wh-pronoun
127
- WP$ - Possessive wh-pronoun (prolog version WP-S)
128
- WRB - Wh-adverb
129
-
130
- =end
42
+ ['CC', 'Coordinating conjunction'],
43
+ ['CD', 'Cardinal number'],
44
+ ['DT', 'Determiner'],
45
+ ['EX', 'Existential there'],
46
+ ['FW', 'Foreign word'],
47
+ ['IN', 'Preposition or subordinating conjunction'],
48
+ ['JJ', 'Adjective'],
49
+ ['JJR', 'Adjective, comparative'],
50
+ ['JJS', 'Adjective, superlative'],
51
+ ['LS', 'List item marker'],
52
+ ['MD', 'Modal'],
53
+ ['NN', 'Noun, singular or mass'],
54
+ ['NNS', 'Noun, plural'],
55
+ ['NNP', 'Proper noun, singular'],
56
+ ['NNPS', 'Proper noun, plural'],
57
+ ['PDT', 'Predeterminer'],
58
+ ['POS', 'Possessive ending'],
59
+ ['PRP', 'Personal pronoun'],
60
+ ['PRP$', 'Possessive pronoun (prolog version PRP-S)'],
61
+ ['RB', 'Adverb'],
62
+ ['RBR', 'Adverb, comparative'],
63
+ ['RBS', 'Adverb, superlative'],
64
+ ['RP', 'Particle'],
65
+ ['SYM', 'Symbol'],
66
+ ['TO', 'to'],
67
+ ['UH', 'Interjection'],
68
+ ['VB', 'Verb, base form'],
69
+ ['VBD', 'Verb, past tense'],
70
+ ['VBG', 'Verb, gerund or present participle'],
71
+ ['VBN', 'Verb, past participle'],
72
+ ['VBP', 'Verb, non 3rd person singular present'],
73
+ ['VBZ', 'Verb, 3rd person singular present'],
74
+ ['WDT', 'Wh-determiner'],
75
+ ['WP', 'Wh-pronoun'],
76
+ ['WP$', 'Possessive wh-pronoun (prolog version WP-S)'],
77
+ ['WRB', 'Wh-adverb']
131
78
  ]
79
+
132
80
  BrownWordTagDescription = [
133
- =begin
134
81
 
135
- Tag Description Examples
82
+ ['.', 'sentence closer . ; ? !'],
83
+ ['(', 'left parent'] ,
84
+ [')', 'right parent'],
85
+ ['*', 'not'],
86
+ ['--', 'dash'],
87
+ [',', 'comma'],
88
+ [':', 'colon'],
89
+ ['ABL', 'pre-qualifier quite, rather'],
90
+ ['ABN', 'pre-quantifier half, all'],
91
+ ['ABX', 'pre-quantifier both'],
92
+ ['AP', 'post-determiner many, several, next'],
93
+ ['AT', 'article a, the, no'],
94
+ ['BE', 'be '],
95
+ ['BED', 'were '],
96
+ ['BEDZ', 'was '],
97
+ ['BEG', 'being '],
98
+ ['BEM', 'am '],
99
+ ['BEN', 'been '],
100
+ ['BER', 'are, art '],
101
+ ['BEZ', 'is '],
102
+ ['CC', 'coordinating conjunction and, or'],
103
+ ['CD', 'cardinal numeral one, two, 2, etc.'],
104
+ ['CS', 'subordinating conjunction if, although'],
105
+ ['DO', 'do '],
106
+ ['DOD', 'did '],
107
+ ['DOZ', 'does '],
108
+ ['DT', 'singular determiner this, that'],
109
+ ['DTI', 'singular or plural determiner/quantifier some, any'],
110
+ ['DTS', 'plural determiner these, those'],
111
+ ['DTX', 'determiner/double conjunction either'],
112
+ ['EX', 'existentil there '],
113
+ ['FW', 'foreign word (hyphenated before regular tag) '],
114
+ ['HL', 'word occurring in headline (hyphenated after regular tag) '],
115
+ ['HV', 'have '],
116
+ ['HVD', 'had (past tense) '],
117
+ ['HVG', 'having '],
118
+ ['HVN', 'had (past participle) '],
119
+ ['HVZ', 'has '],
120
+ ['IN', 'preposition '],
121
+ ['JJ', 'adjective '],
122
+ ['JJR', 'comparative adjective '],
123
+ ['JJS', 'semantically superlative adjective chief, top'],
124
+ ['JJT', 'morphologically superlative adjective biggest'],
125
+ ['MD', 'modal auxiliary can, should, will'],
126
+ ['NC', 'cited word (hyphenated after regular tag) '],
127
+ ['NN', 'singular or mass noun '],
128
+ ['NN$', 'possessive singular noun '],
129
+ ['NNS', 'plural noun '],
130
+ ['NNS$', 'possessive plural noun '],
131
+ ['NP', 'proper noun or part of name phrase '],
132
+ ['NP$', 'possessive proper noun '],
133
+ ['NPS', 'plural proper noun '],
134
+ ['NPS$', 'possessive plural proper noun '],
135
+ ['NR', 'adverbial noun home, today, west'],
136
+ ['NRS', 'plural adverbial noun'],
137
+ ['OD', 'ordinal numeral first, 2nd'],
138
+ ['PN', 'nominal pronoun everybody, nothing'],
139
+ ['PN$', 'possessive nominal pronoun '],
140
+ ['PP$', 'possessive personal pronoun my, our'],
141
+ ['PP$$', 'second (nominal) possessive pronoun mine, ours'],
142
+ ['PPL', 'singular reflexive/intensive personal pronoun myself'],
143
+ ['PPLS', 'plural reflexive/intensive personal pronoun ourselves'],
144
+ ['PPO', 'objective personal pronoun me, him, it, them'],
145
+ ['PPS', '3rd. singular nominative pronoun he, she, it, one'],
146
+ ['PPSS', 'other nominative personal pronoun I, we, they, you'],
147
+ ['QL', 'qualifier very, fairly'],
148
+ ['QLP', 'post-qualifier enough, indeed'],
149
+ ['RB', 'adverb '],
150
+ ['RBR', 'comparative adverb '],
151
+ ['RBT', 'superlative adverb '],
152
+ ['RN', 'nominal adverb here then, indoors '],
153
+ ['RP', 'adverb/particle about, off, up'],
154
+ ['TL', 'word occurring in title (hyphenated after regular tag)'],
155
+ ['TO', 'infinitive marker to '],
156
+ ['UH', 'interjection, exclamation '],
157
+ ['VB', 'verb, base form '],
158
+ ['VBD', 'verb, past tense '],
159
+ ['VBG', 'verb, present participle/gerund '],
160
+ ['VBN', 'verb, past participle '],
161
+ ['VBZ', 'verb, 3rd. singular present '],
162
+ ['WDT', 'wh- determiner what, which'],
163
+ ['WP$', 'possessive wh- pronoun whose'],
164
+ ['WPO', 'objective wh- pronoun whom, which, that'],
165
+ ['WPS', 'nominative wh- pronoun who, which, that'],
166
+ ['WQL', 'wh- qualifier how'],
167
+ ['WRB', 'wh- adverb how, where, when']
136
168
 
137
- . sentence closer . ; ? !
138
- ( left paren
139
- ) right paren
140
- * not, n't
141
- -- dash
142
- , comma
143
- : colon
144
- ABL pre-qualifier quite, rather
145
- ABN pre-quantifier half, all
146
- ABX pre-quantifier both
147
- AP post-determiner many, several, next
148
- AT article a, the, no
149
- BE be
150
- BED were
151
- BEDZ was
152
- BEG being
153
- BEM am
154
- BEN been
155
- BER are, art
156
- BEZ is
157
- CC coordinating conjunction and, or
158
- CD cardinal numeral one, two, 2, etc.
159
- CS subordinating conjunction if, although
160
- DO do
161
- DOD did
162
- DOZ does
163
- DT singular determiner this, that
164
- DTI singular or plural determiner/quantifier some, any
165
- DTS plural determiner these, those
166
- DTX determiner/double conjunction either
167
- EX existentil there
168
- FW foreign word (hyphenated before regular tag)
169
- HL word occurring in headline (hyphenated after regular tag)
170
- HV have
171
- HVD had (past tense)
172
- HVG having
173
- HVN had (past participle)
174
- HVZ has
175
- IN preposition
176
- JJ adjective
177
- JJR comparative adjective
178
- JJS semantically superlative adjective chief, top
179
- JJT morphologically superlative adjective biggest
180
- MD modal auxiliary can, should, will
181
- NC cited word (hyphenated after regular tag)
182
- NN singular or mass noun
183
- NN$ possessive singular noun
184
- NNS plural noun
185
- NNS$ possessive plural noun
186
- NP proper noun or part of name phrase
187
- NP$ possessive proper noun
188
- NPS plural proper noun
189
- NPS$ possessive plural proper noun
190
- NR adverbial noun home, today, west
191
- NRS plural adverbial noun
192
- OD ordinal numeral first, 2nd
193
- PN nominal pronoun everybody, nothing
194
- PN$ possessive nominal pronoun
195
- PP$ possessive personal pronoun my, our
196
- PP$$ second (nominal) possessive pronoun mine, ours
197
- PPL singular reflexive/intensive personal pronoun myself
198
- PPLS plural reflexive/intensive personal pronoun ourselves
199
- PPO objective personal pronoun me, him, it, them
200
- PPS 3rd. singular nominative pronoun he, she, it, one
201
- PPSS other nominative personal pronoun I, we, they, you
202
- QL qualifier very, fairly
203
- QLP post-qualifier enough, indeed
204
- RB adverb
205
- RBR comparative adverb
206
- RBT superlative adverb
207
- RN nominal adverb here then, indoors
208
- RP adverb/particle about, off, up
209
- TL word occurring in title (hyphenated after
210
- regular tag)
211
- TO infinitive marker to
212
- UH interjection, exclamation
213
- VB verb, base form
214
- VBD verb, past tense
215
- VBG verb, present participle/gerund
216
- VBN verb, past participle
217
- VBZ verb, 3rd. singular present
218
- WDT wh- determiner what, which
219
- WP$ possessive wh- pronoun whose
220
- WPO objective wh- pronoun whom, which, that
221
- WPS nominative wh- pronoun who, which, that
222
- WQL wh- qualifier how
223
- WRB wh- adverb how, where, when
224
-
225
- =end
226
169
  ]
170
+ # A description of Enju categories.
227
171
  EnjuCatDescription = [
228
172
  ['ADJ', 'Adjective'],
229
173
  ['ADV', 'Adverb'],
@@ -330,7 +274,7 @@ WRB wh- adverb how, where, when
330
274
  'Noun, proper, plural', ['NP0', 'NPS', 'NNPS'],
331
275
  'Noun, adverbial', ['NN0', 'NR', 'NN'],
332
276
  'Noun, adverbial, plural', ['NN2', 'NRS', 'NNS'],
333
- 'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'NN'],
277
+ 'Pronoun, nominal (indefinite)', ['PNI', 'PN', 'PRP'],
334
278
  'Pronoun, personal, subject', ['PNP', 'PPSS', 'PRP'],
335
279
  'Pronoun, personal, subject, 3SG', ['PNP', 'PPS', 'PRP'],
336
280
  'Pronoun, personal, object', ['PNP', 'PPO', 'PRP'],
@@ -387,7 +331,6 @@ WRB wh- adverb how, where, when
387
331
  'Symbol, alphabetical', ['ZZ0', '', ''],
388
332
  'Symbol, list item', ['', '', 'LS']
389
333
  ]
390
-
391
334
  end
392
335
  end
393
336
  end
@@ -0,0 +1,33 @@
1
+ module Treat
2
+ module Languages
3
+ class English
4
+ require 'treat/languages/english/tags'
5
+ require 'treat/languages/english/categories'
6
+ Extractors = {
7
+ time: [:chronic],
8
+ topics: [:reuters],
9
+ topic_words: [:lda],
10
+ key_sentences: [:topics_frequency]
11
+ }
12
+ Processors = {
13
+ chunkers: [:txt],
14
+ parsers: [:enju, :stanford],
15
+ segmenters: [:tactful, :punkt, :stanford],
16
+ tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
17
+ }
18
+ Lexicalizers = {
19
+ category: [:from_tag],
20
+ linkages: [:naive],
21
+ synsets: [:wordnet, :rita_wn],
22
+ tag: [:brill, :lingua, :stanford]
23
+ }
24
+ Inflectors = {
25
+ conjugations: [:linguistics],
26
+ declensions: [:linguistics, :english],
27
+ stem: [:porter_c, :porter, :uea],
28
+ ordinal_words: [:linguistics],
29
+ cardinal_words: [:linguistics]
30
+ }
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,17 @@
1
+ module Treat
2
+ module Languages
3
+ class French
4
+ Extractors = {}
5
+ Inflectors = {}
6
+ Lexicalizers = {
7
+ tag: [:stanford]
8
+ }
9
+ Processors = {
10
+ chunkers: [:txt],
11
+ parsers: [:stanford],
12
+ segmenters: [:tactful, :punkt, :stanford],
13
+ tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
14
+ }
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,17 @@
1
+ module Treat
2
+ module Languages
3
+ class German
4
+ Extractors = {}
5
+ Inflectors = {}
6
+ Lexicalizers = {
7
+ tag: [:stanford]
8
+ }
9
+ Processors = {
10
+ chunkers: [:txt],
11
+ parsers: [:stanford],
12
+ segmenters: [:tactful, :punkt, :stanford],
13
+ tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
14
+ }
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,14 @@
1
+ module Treat
2
+ module Languages
3
+ class Italian
4
+ Extractors = {}
5
+ Inflectors = {}
6
+ Lexicalizers = {}
7
+ Processors = {
8
+ chunkers: [:txt],
9
+ segmenters: [:tactful, :punkt, :stanford],
10
+ tokenizers: [:multilingual, :macintyre, :perl, :punkt, :tactful, :stanford]
11
+ }
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,12 @@
1
+ module Treat
2
+ module Languages
3
+ class Xinhua
4
+ Extractors = {}
5
+ Inflectors = {}
6
+ Lexicalizers = {}
7
+ Processors = {
8
+ parsers: [:stanford]
9
+ }
10
+ end
11
+ end
12
+ end
@@ -0,0 +1,91 @@
1
+ module Treat
2
+ # This module provides linguistic resources
3
+ # for the Treat library, including information
4
+ # about language codes, the functions available
5
+ # for each language, and the different tags used
6
+ # to markup that language.
7
+ module Languages
8
+ Dir["#{Treat.lib}/treat/languages/*.rb"].each { |file| require file }
9
+ ISO639_1 = 1
10
+ ISO639_2 = 2
11
+ # Describe a language code (ISO-639-1 or ISO-639-2)
12
+ # or its full text description in full French or English.
13
+ def self.describe(lang, desc_lang = :en)
14
+ raise "Must provide a non-nil language identifier to describe." if lang.nil?
15
+ lang = find(lang).to_s
16
+ if [:en, :eng, :english, :anglais].include?(desc_lang)
17
+ l = @@english_full.key(lang)
18
+ elsif [:fr, :fra, :french, :french].include?(desc_lang)
19
+ l = @@french_full.key(lang)
20
+ else
21
+ raise Treat::Exception,
22
+ "Unknown language to describe: #{desc_lang}."
23
+ end
24
+ not_found(lang) if l.nil?
25
+ l.intern
26
+ end
27
+ # Raise an error message when a language code
28
+ # or description is not found and suggest
29
+ # possible misspellings.
30
+ def self.not_found(lang)
31
+ msg = "Language '#{lang}' does not exist."
32
+ all = @@iso639_2.keys + @@iso639_1.keys +
33
+ @@english_full.keys + @@french_full.keys
34
+ msg += did_you_mean?(all, lang)
35
+ raise Treat::Exception, msg
36
+ end
37
+ # Return the class representing a language.
38
+ def self.get(lang)
39
+ const_get(Treat::Languages.describe(lang).to_s.capitalize)
40
+ end
41
+ # Find a language by ISO-639-1 or ISO-639-2 code
42
+ # or full name (in English or French) and return
43
+ # the ISO-639-1 or ISO-639-2 language code as a
44
+ # lowercase identifier.
45
+ def self.find(lang, rc = ISO639_2)
46
+ raise "Must provide a non-nil language identifier to describe." if lang.nil?
47
+ get_languages
48
+ lang = lang.to_s.downcase
49
+ if @@iso639_1.has_key?(lang)
50
+ return :"#{lang}" if rc == ISO639_1
51
+ return :"#{@@iso639_1[lang]}" if rc == ISO639_2
52
+ elsif @@iso639_2.has_key?(lang)
53
+ return :"#{lang}" if rc == ISO639_2
54
+ return :"#{@@iso639_2[lang]}" if rc == ISO639_1
55
+ elsif @@english_full.has_key?(lang)
56
+ return :"#{@@english_full[lang]}" if rc == ISO639_2
57
+ return :"#{@@iso639_2[@@english_full[lang]]}" if rc == ISO639_1
58
+ elsif @@french_full.has_key?(lang)
59
+ return :"#{@@french_full[lang]}" if rc == ISO639_2
60
+ return :"#{@@iso639_1[@@french_full[lang]]}" if rc == ISO639_2
61
+ else
62
+ not_found(lang)
63
+ end
64
+ end
65
+ @@loaded = false
66
+ # Get the languages from the dictionary.
67
+ def self.get_languages
68
+ return if @@loaded
69
+ @@iso639_1 = {}; @@iso639_2 = {};
70
+ @@english_full = {}; @@french_full = {}
71
+ languages = IO.readlines(Treat.lib + '/treat/languages/list.txt')
72
+ languages.each do |language|
73
+ iso639_2, iso639_1, english_desc, french_desc =
74
+ language.split(',')
75
+ @@iso639_1[iso639_1] = iso639_2
76
+ @@iso639_2[iso639_2] = iso639_1
77
+ unless english_desc.nil?
78
+ english_desc.strip.downcase.split('|').each do |l|
79
+ @@english_full[l.downcase.strip] = iso639_2
80
+ end
81
+ end
82
+ unless french_desc.nil?
83
+ french_desc.strip.downcase.split('|').each do |l|
84
+ @@french_full[l.downcase.strip] = iso639_2
85
+ end
86
+ end
87
+ end
88
+ @@loaded = true
89
+ end
90
+ end
91
+ end
@@ -4,24 +4,36 @@ module Treat
4
4
  # A class that detects the category of a word from its tag,
5
5
  # using the default tagger for the language of the entity.
6
6
  class FromTag
7
+ DefaultOptions = { tagger: nil }
7
8
  # Find the category of the current entity.
8
9
  # Options:
9
10
  # :tagger => (Symbol) force the use of a tagger.
10
11
  # :tag_to_cat => (Hash) a list of categories for each possible tag.
11
12
  def self.category(entity, options = {})
12
- if options.empty?
13
- options = {
14
- tagger: nil,
15
- tag_to_cat: Treat::Resources::Tags::PTBWordTagToCategory
16
- }
17
- end
13
+ options = DefaultOptions.merge(options)
18
14
  tag = options[:tagger].nil? ? entity.tag : entity.tag(options[:tagger])
19
- cat = options[:tag_to_cat][tag]
15
+ lang = Treat::Languages.get(entity.language)
16
+ cat = lang::WordTagToCategory[tag]
20
17
  if cat.nil?
21
18
  warn "Category not found for tag #{tag}."
22
19
  :unknown
23
20
  else
24
- cat
21
+ if cat.size == 1
22
+ return cat[0]
23
+ else
24
+ if entity.has?(:tag_set)
25
+ if cat[entity.tag_set]
26
+ return cat[entity.tag_set]
27
+ else
28
+ raise Treat::Exception,
29
+ "The specified tag set (#{entity.tag_set})" +
30
+ " does not contain the tag #{tag}."
31
+ end
32
+ else
33
+ raise Treat::Exception,
34
+ "No information can be found regarding which tag set to use."
35
+ end
36
+ end
25
37
  end
26
38
  end
27
39
  end
@@ -4,7 +4,7 @@ module Treat
4
4
  # Currently not implemented.
5
5
  class RitaWn
6
6
  # Require the Ruby-Java bridge.
7
- #silently do
7
+ #silence_warnings do
8
8
  require 'rjb'
9
9
  # Load the RitaWN jars.
10
10
  Rjb::load("#{Treat.bin}/jwnl/jwnl.jar", [])
@@ -25,7 +25,7 @@ module Treat
25
25
  patch = false
26
26
  # Require the 'rbtagger' gem.
27
27
  begin
28
- silently { require 'rbtagger' }
28
+ silence_warnings { require 'rbtagger' }
29
29
  # This whole mess is required to deal with
30
30
  # the fact that the 'rbtagger' gem defines
31
31
  # a top-level module called 'Word', which
@@ -73,6 +73,7 @@ module Treat
73
73
  # Create the tagger if necessary
74
74
  @@tagger ||= ::Brill::Tagger.new(options[:lexicon],
75
75
  options[:lexical_rules], options[:contextual_rules])
76
+ entity.set :tag_set, :penn
76
77
  # Perform tagging.
77
78
  if entity.type == :word
78
79
  # Setup the context of the word
@@ -17,7 +17,7 @@ module Treat
17
17
  # http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/
18
18
  class Lingua
19
19
  # Require the 'engtagger' gem.
20
- silently { require 'engtagger' }
20
+ silence_warnings { require 'engtagger' }
21
21
  # Hold the tagger.
22
22
  @@tagger = nil
23
23
  # Hold the user-set options
@@ -46,6 +46,7 @@ module Treat
46
46
  @@tagger = nil # Reset the tagger
47
47
  end
48
48
  @@tagger ||= ::EngTagger.new(@@options)
49
+ entity.set :tag_set, :penn
49
50
  left = entity.left
50
51
  if left.nil? || left.type != :word
51
52
  left_tag = 'pp'