linguistics 1.0.9 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/.gemtest +0 -0
- data/ChangeLog +849 -342
- data/History.rdoc +11 -0
- data/LICENSE +9 -9
- data/Manifest.txt +44 -0
- data/README.rdoc +226 -0
- data/Rakefile +32 -349
- data/examples/endocs.rb +272 -0
- data/examples/generalize_sentence.rb +2 -1
- data/examples/klingon.rb +22 -0
- data/lib/linguistics.rb +130 -292
- data/lib/linguistics/en.rb +337 -1628
- data/lib/linguistics/en/articles.rb +138 -0
- data/lib/linguistics/en/conjugation.rb +2245 -0
- data/lib/linguistics/en/conjunctions.rb +202 -0
- data/lib/linguistics/en/{infinitive.rb → infinitives.rb} +41 -55
- data/lib/linguistics/en/linkparser.rb +41 -49
- data/lib/linguistics/en/numbers.rb +483 -0
- data/lib/linguistics/en/participles.rb +33 -0
- data/lib/linguistics/en/pluralization.rb +810 -0
- data/lib/linguistics/en/stemmer.rb +75 -0
- data/lib/linguistics/en/titlecase.rb +121 -0
- data/lib/linguistics/en/wordnet.rb +63 -97
- data/lib/linguistics/inflector.rb +89 -0
- data/lib/linguistics/iso639.rb +534 -448
- data/lib/linguistics/languagebehavior.rb +36 -0
- data/lib/linguistics/monkeypatches.rb +42 -0
- data/spec/lib/constants.rb +15 -0
- data/spec/lib/helpers.rb +38 -0
- data/spec/linguistics/en/articles_spec.rb +797 -0
- data/spec/linguistics/en/conjugation_spec.rb +2083 -0
- data/spec/linguistics/en/conjunctions_spec.rb +154 -0
- data/spec/linguistics/en/infinitives_spec.rb +518 -0
- data/spec/linguistics/en/linkparser_spec.rb +66 -0
- data/spec/linguistics/en/numbers_spec.rb +1295 -0
- data/spec/linguistics/en/participles_spec.rb +55 -0
- data/spec/linguistics/en/pluralization_spec.rb +4636 -0
- data/spec/linguistics/en/stemmer_spec.rb +72 -0
- data/spec/linguistics/en/titlecase_spec.rb +841 -0
- data/spec/linguistics/en/wordnet_spec.rb +85 -0
- data/spec/linguistics/en_spec.rb +45 -167
- data/spec/linguistics/inflector_spec.rb +40 -0
- data/spec/linguistics/iso639_spec.rb +49 -53
- data/spec/linguistics/monkeypatches_spec.rb +40 -0
- data/spec/linguistics_spec.rb +46 -76
- metadata +241 -113
- metadata.gz.sig +0 -0
- data/README +0 -166
- data/README.english +0 -245
- data/rake/191_compat.rb +0 -26
- data/rake/dependencies.rb +0 -76
- data/rake/documentation.rb +0 -123
- data/rake/helpers.rb +0 -502
- data/rake/hg.rb +0 -318
- data/rake/manual.rb +0 -787
- data/rake/packaging.rb +0 -129
- data/rake/publishing.rb +0 -341
- data/rake/style.rb +0 -62
- data/rake/svn.rb +0 -668
- data/rake/testing.rb +0 -152
- data/rake/verifytask.rb +0 -64
- data/tests/en/infinitive.tests.rb +0 -207
- data/tests/en/inflect.tests.rb +0 -1389
- data/tests/en/lafcadio.tests.rb +0 -77
- data/tests/en/linkparser.tests.rb +0 -42
- data/tests/en/lprintf.tests.rb +0 -77
- data/tests/en/titlecase.tests.rb +0 -73
- data/tests/en/wordnet.tests.rb +0 -95
@@ -0,0 +1,33 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
require 'linguistics/en' unless defined?( Linguistics::EN )
|
4
|
+
|
5
|
+
# Methods for deriving present participles for the English-language
|
6
|
+
# Linguistics module.
|
7
|
+
module Linguistics::EN::Participles

	# Register this module to the list of modules to include
	Linguistics::EN.register_extension( self )


	### Attempt to return the inflected string in its present participle
	### form (e.g., "die" -> "dying").
	###
	### The plural form of the verb (as returned by #plural_verb) is used as
	### the stem. The first rule below that matches adjusts the stem's ending,
	### and "ing" is then appended.
	def present_participle
		plural = self.to_s.en.plural_verb

		# Each #sub! returns nil when nothing was replaced, so the `or` chain
		# stops at the first rule that applies. Backreferences in a Ruby
		# replacement string are written "\\1"; a Perl-style '$1' would be
		# inserted literally.
		plural.sub!( /ie$/, 'y' ) or
			plural.sub!( /ue$/, 'u' ) or
			plural.sub!( /([auy])e$/, "\\1" ) or
			plural.sub!( /i$/, '' ) or
			plural.sub!( /([^e])e$/, "\\1" ) or
			/er$/.match( plural ) or
			plural.sub!( /([^aeiou][aeiouy]([bdgmnprst]))$/, "\\1\\2" )

		return "#{plural}ing"
	end
	alias_method :part_pres, :present_participle
	Linguistics::EN.register_lprintf_formatter :PART_PRES, :present_participle


end # module Linguistics::EN::Participles
|
33
|
+
|
@@ -0,0 +1,810 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'linguistics/en' unless defined?( Linguistics )
|
4
|
+
|
5
|
+
# Plural inflection methods for the English-language Linguistics module.
|
6
|
+
#
|
7
|
+
# It provides conversion of plural forms of all nouns, most verbs,
|
8
|
+
# and some adjectives. It also provides "classical" variants (for
|
9
|
+
# example: "brother" -> "brethren", "dogma" -> "dogmata", etc.) where
|
10
|
+
# appropriate.
|
11
|
+
module Linguistics::EN::Pluralization
|
12
|
+
|
13
|
+
# Register this module to the list of modules to include
|
14
|
+
Linguistics::EN.register_extension( self )
|
15
|
+
|
16
|
+
### Build one Regexp that matches any of the given +parts+. Arguments may be
### Strings, Regexps, or (nested) Arrays of either; nesting is flattened
### before the union is made.
def self::matchgroup( *parts )
	alternatives = parts.flatten
	return Regexp.union( alternatives )
end
|
20
|
+
private_class_method :matchgroup
|
21
|
+
|
22
|
+
#
|
23
|
+
# Plurals
|
24
|
+
#
|
25
|
+
|
26
|
+
# Irregular plurals of nouns that end in "s". A value written as
# "modern|classical" carries both variants.
PL_sb_irregular_s = {
	"ephemeris" => "ephemerides",
	"iris"      => "irises|irides",
	"clitoris"  => "clitorises|clitorides",
	"corpus"    => "corpuses|corpora",
	"opus"      => "opuses|opera",
	"genus"     => "genera",
	"mythos"    => "mythoi",
	"penis"     => "penises|penes",
	"testis"    => "testes",
}

# All irregular noun plurals keyed by singular: the entries below followed
# by the "...s" entries above.
PL_sb_irregular_h = {
	"child"       => "children",
	"brother"     => "brothers|brethren",
	"loaf"        => "loaves",
	"hoof"        => "hoofs|hooves",
	"beef"        => "beefs|beeves",
	"money"       => "monies",
	"mongoose"    => "mongooses",
	"ox"          => "oxen",
	"cow"         => "cows|kine",
	"soliloquy"   => "soliloquies",
	"graffito"    => "graffiti",
	"prima donna" => "prima donnas|prime donne",
	"octopus"     => "octopuses|octopodes",
	"genie"       => "genies|genii",
	"ganglion"    => "ganglions|ganglia",
	"trilby"      => "trilbys",
	"turf"        => "turfs|turves",
}.merge( PL_sb_irregular_s )

# Pattern matching any irregular singular
PL_sb_irregular = matchgroup( PL_sb_irregular_h.keys )
|
58
|
+
|
59
|
+
|
60
|
+
# Classical "..a" -> "..ata" (stems stored without the final "a")
PL_sb_C_a_ata = matchgroup(
	%w[
		anathema bema carcinoma charisma diploma
		dogma drama edema enema enigma lemma
		lymphoma magma melisma miasma oedema
		sarcoma schema soma stigma stoma trauma
		gumma pragma
	].map { |noun| noun.chomp("a") }
)

# Unconditional "..a" -> "..ae"
PL_sb_U_a_ae = matchgroup( %w[ alumna alga vertebra persona ] )

# Classical "..a" -> "..ae"
PL_sb_C_a_ae = matchgroup(
	/.*umbra/,
	%w[
		amoeba antenna formula hyperbola
		medusa nebula parabola abscissa
		hydra nova lacuna aurora
		flora fauna
	]
)

# Classical "..en" -> "..ina" (stems stored without the final "en")
PL_sb_C_en_ina = matchgroup(
	%w[ stamen foramen lumen ].map { |noun| noun.chomp("en") }
)

# Unconditional "..um" -> "..a" (stems stored without the final "um")
PL_sb_U_um_a = matchgroup(
	%w[
		bacterium agendum desideratum erratum
		stratum datum ovum extremum candelabrum
	].map { |noun| noun.chomp("um") }
)
|
92
|
+
|
93
|
+
# Classical "..um" -> "..a" (stems stored without the final "um").
#
# "enconium" and "millenium" are misspellings that earlier versions of this
# list carried; they are kept so existing callers see no change, and the
# correct spellings "encomium" and "millennium" are appended so the real
# words also receive their classical plural.
PL_sb_C_um_a = matchgroup %w[
	maximum minimum momentum optimum
	quantum cranium curriculum dictum
	phylum aquarium compendium emporium
	enconium gymnasium honorarium interregnum
	lustrum memorandum millenium rostrum
	spectrum speculum stadium trapezium
	ultimatum medium vacuum velum
	consortium
	encomium millennium
].collect {|word| word[0...-2]}
|
104
|
+
|
105
|
+
# Unconditional "..us" -> "i" (stems stored without the final "us")
PL_sb_U_us_i = matchgroup(
	%w[
		alumnus alveolus bacillus bronchus
		locus nucleus stimulus meniscus
	].map { |noun| noun.chomp("us") }
)

# Classical "..us" -> "..i" (stems stored without the final "us")
PL_sb_C_us_i = matchgroup(
	%w[
		focus radius genius
		incubus succubus nimbus
		fungus nucleolus stylus
		torus umbilicus uterus
		hippopotamus
	].map { |noun| noun.chomp("us") }
)

# Classical "..us" -> "..us" (assimilated 4th declension latin nouns)
PL_sb_C_us_us = matchgroup(
	%w[
		status apparatus prospectus sinus
		hiatus impetus plexus
	]
)

# Unconditional "..on" -> "a" (stems stored without the final "on")
PL_sb_U_on_a = matchgroup(
	%w[
		criterion perihelion aphelion
		phenomenon prolegomenon noumenon
		organon asyndeton hyperbaton
	].map { |noun| noun.chomp("on") }
)

# Classical "..on" -> "..a" (stems stored without the final "on")
PL_sb_C_on_a = matchgroup(
	%w[ oxymoron ].map { |noun| noun.chomp("on") }
)

# Classical "..o" -> "..i" (but normally -> "..os")
PL_sb_C_o_i_a = %w[
	solo soprano basso alto
	contralto tempo piano
]
PL_sb_C_o_i = matchgroup( PL_sb_C_o_i_a.map { |noun| noun.chomp("o") } )

# Always "..o" -> "..os"
PL_sb_U_o_os = matchgroup(
	%w[
		albino archipelago armadillo
		commando crescendo fiasco
		ditto dynamo embryo
		ghetto guano inferno
		jumbo lumbago magneto
		manifesto medico octavo
		photo pro quarto
		canto lingo generalissimo
		stylo rhino
	] | PL_sb_C_o_i_a
)


# Unconditional "..[ei]x" -> "..ices" (stems stored without "ex" / "ix")
PL_sb_U_ex_ices = matchgroup(
	%w[ codex murex silex ].map { |noun| noun.chomp("ex") }
)
PL_sb_U_ix_ices = matchgroup(
	%w[ radix helix ].map { |noun| noun.chomp("ix") }
)

# Classical "..[ei]x" -> "..ices" (stems stored without "ex" / "ix")
PL_sb_C_ex_ices = matchgroup(
	%w[
		vortex vertex cortex latex
		pontifex apex index simplex
	].map { |noun| noun.chomp("ex") }
)
PL_sb_C_ix_ices = matchgroup(
	%w[ appendix ].map { |noun| noun.chomp("ix") }
)


# Arabic: ".." -> "..i"
PL_sb_C_i = matchgroup( %w[ afrit afreet efreet ] )


# Hebrew: ".." -> "..im"
PL_sb_C_im = matchgroup( %w[ goy seraph cherub ] )

# Unconditional "..man" -> "..mans"
PL_sb_U_man_mans = matchgroup(
	%w[
		human
		Alabaman Bahaman Burman German
		Hiroshiman Liman Nakayaman Oklahoman
		Panaman Selman Sonaman Tacoman Yakiman
		Yokohaman Yuman
	]
)
|
196
|
+
|
197
|
+
|
198
|
+
# Nouns ending in "s" whose plural is identical to the singular. Kept as a
# flat Array (Strings and Regexps) so other tables can include it.
PL_sb_uninflected_s = [
	# Pairs or groups subsumed to a singular...
	%w[
		breeches britches clippers gallows hijinks
		headquarters pliers scissors testes herpes
		pincers shears proceedings trousers
	],

	# Unassimilated Latin 4th declension
	%w[ cantus coitus nexus ],

	# Recent imports...
	%w[ contretemps corps debris ],
	/.*ois/,

	# Diseases
	/.*measles/, "mumps",

	# Miscellaneous others...
	%w[
		diabetes jackanapes series species rabies
		chassis innings news mews
	],
].flatten


# Don't inflect in classical mode, otherwise normal inflection
PL_sb_uninflected_herd = matchgroup(
	%w[
		wildebeest swine eland bison buffalo
		elk moose rhinoceros
	]
)

# Nouns whose plural is always identical to the singular
PL_sb_uninflected = matchgroup(

	# Some fish and herd animals
	/.*fish/, %w[ tuna salmon mackerel trout bream ],
	/sea[- ]bass/, %w[ carp cod flounder whiting ],

	/.*deer/, /.*sheep/,

	# All nationals ending in -ese
	%w[
		Portuguese Amoyese Borghese Congoese Faroese
		Foochowese Genevese Genoese Gilbertese Hottentotese
		Kiplingese Kongoese Lucchese Maltese Nankingese
		Niasese Pekingese Piedmontese Pistoiese Sarawakese
		Shavese Vermontese Wenchowese Yengeese
	],
	/.*[nrlm]ese/,

	# Some words ending in ...s (often pairs taken as a whole)
	PL_sb_uninflected_s,

	# Diseases
	/.*pox/,

	# Other oddities
	%w[ graffiti djinn ]
)
|
251
|
+
|
252
|
+
|
253
|
+
# Singular words ending in ...s (all inflect with ...es)
PL_sb_singular_s = matchgroup(
	/.*ss/, /.*us/,
	%w[
		acropolis aegis alias arthritis asbestos atlas
		bathos bias bronchitis bursitis caddis cannabis
		canvas chaos cosmos dais digitalis encephalitis
		epidermis ethos eyas gas glottis hepatitis
		hubris ibis lens mantis marquis metropolis
		neuritis pathos pelvis polis rhinoceros
		sassafras tonsillitis trellis
	]
)

# Words ending in "s" that the special-verb rules must leave alone
PL_v_special_s = matchgroup(
	PL_sb_singular_s,
	PL_sb_uninflected_s,
	PL_sb_irregular_s.keys,
	/(.*[csx])is/,
	/(.*)ceps/,
	/[A-Z].*s/
)
|
273
|
+
|
274
|
+
# Regexp source (a String, not a Regexp) for nouns followed by a postfixed
# adjective. Each table entry maps the adjective to a pattern for the noun
# that may precede it; the noun is followed by a lookahead for "-" or
# whitespace plus the adjective. Capture group 1 is the noun and group 2 is
# everything after it.
PL_sb_postfix_adj = '(' + {

	'general' => '(?!major|lieutenant|brigadier|adjutant)\S+',
	'martial' => "court",

}.collect {|key,val|
	"(?:#{val})(?=(?:-|\\s+)#{key})"
}.join("|") + ")(.*)"


# Military ranks that may precede "general"
PL_sb_military = %r'major|lieutenant|brigadier|adjutant|quartermaster'
# "<something> general" where the text does not start with one of the ranks
# above. Group 1 is the leading part, group 2 the separator plus "general".
PL_sb_general = %r'((?!#{PL_sb_military.source}).*?)((-|\s+)general)'

# Prepositions recognised inside compound nouns and pronoun phrases
PL_prep = matchgroup %w[
	about above across after among around at athwart before behind
	below beneath beside besides between betwixt beyond but by
	during except for from in into near of off on onto out over
	since till to under until unto upon with
]

# "<noun> <prep> a <noun>" compounds: group 1 is the first noun, group 2 the
# separator-preposition-separator run, and group 3 the text after "a".
PL_sb_prep_dual_compound = %r'(.*?)((?:-|\s+)(?:#{PL_prep}|d[eu])(?:-|\s+))a(?:-|\s+)(.*)'
# "<noun> <prep> ..." compounds: group 1 is the leading noun and group 2 is
# everything from the separator before the preposition onwards.
PL_sb_prep_compound = %r'(.*?)((-|\s+)(#{PL_prep}|d[eu])((-|\s+)(.*))?)'
|
296
|
+
|
297
|
+
|
298
|
+
# Nominative, reflexive and possessive pronouns, singular => plural
PL_pron_nom_h = {
	# Nominative and reflexive
	"i"        => "we",
	"myself"   => "ourselves",
	"you"      => "you",
	"yourself" => "yourselves",
	"she"      => "they",
	"herself"  => "themselves",
	"he"       => "they",
	"himself"  => "themselves",
	"it"       => "they",
	"itself"   => "themselves",
	"they"     => "they",
	"themself" => "themselves",

	# Possessive
	"mine"     => "ours",
	"yours"    => "yours",
	"hers"     => "theirs",
	"his"      => "theirs",
	"its"      => "theirs",
	"theirs"   => "theirs",
}
# Unlike the other tables, this pattern is compiled case-insensitively
PL_pron_nom = /#{PL_pron_nom_h.keys.join('|')}/i

# Accusative and reflexive pronouns, singular => plural
PL_pron_acc_h = {
	"me"       => "us",
	"myself"   => "ourselves",
	"you"      => "you",
	"yourself" => "yourselves",
	"her"      => "them",
	"herself"  => "themselves",
	"him"      => "them",
	"himself"  => "themselves",
	"it"       => "them",
	"itself"   => "themselves",
	"them"     => "them",
	"themself" => "themselves",
}
PL_pron_acc = matchgroup( PL_pron_acc_h.keys )
|
327
|
+
|
328
|
+
# Irregular present/past forms of "to be" and "to have", singular => plural.
# Each form is listed once: repeating a key in a Hash literal only
# overwrites the earlier entry and makes Ruby warn about the duplicate.
PL_v_irregular_pres_h = {
	# 1st pers. sing.   2nd pers. sing.    3rd pers. singular
	"am"   => "are",    "are"  => "are",   "is"   => "are",
	"was"  => "were",   "were" => "were",
	"have" => "have",                      "has"  => "have",
}
|
335
|
+
# Pattern matching any singular form listed in PL_v_irregular_pres_h
PL_v_irregular_pres = matchgroup PL_v_irregular_pres_h.keys
|
336
|
+
|
337
|
+
# Present-tense verbs that could also be read as nouns, singular => plural.
# Each form is listed once: repeating a key in a Hash literal only
# overwrites the earlier entry and makes Ruby warn about the duplicate.
PL_v_ambiguous_pres_h = {
	# 1st/2nd pers. sing.   3rd pers. singular
	"act"   => "act",       "acts"    => "act",
	"blame" => "blame",     "blames"  => "blame",
	"can"   => "can",
	"must"  => "must",
	"fly"   => "fly",       "flies"   => "fly",
	"copy"  => "copy",      "copies"  => "copy",
	"drink" => "drink",     "drinks"  => "drink",
	"fight" => "fight",     "fights"  => "fight",
	"fire"  => "fire",      "fires"   => "fire",
	"like"  => "like",      "likes"   => "like",
	"look"  => "look",      "looks"   => "look",
	"make"  => "make",      "makes"   => "make",
	"reach" => "reach",     "reaches" => "reach",
	"run"   => "run",       "runs"    => "run",
	"sink"  => "sink",      "sinks"   => "sink",
	"sleep" => "sleep",     "sleeps"  => "sleep",
	"view"  => "view",      "views"   => "view",
}
|
358
|
+
# Pattern matching any singular form listed in PL_v_ambiguous_pres_h
PL_v_ambiguous_pres = matchgroup( PL_v_ambiguous_pres_h.keys )

# Non-present verb forms that are returned unchanged by the special-verb rules
PL_v_irregular_non_pres = matchgroup(
	%w[
		did had ate made put
		spent fought sank gave sought
		shall could ought should
	]
)

# Non-present verb forms that are returned unchanged by the general-verb rules
PL_v_ambiguous_non_pres = matchgroup(
	%w[ thought saw bent will might cut ]
)

# Counts that mean "none"
PL_count_zero = matchgroup( %w[ 0 no zero nil ] )

# Counts that mean "exactly one"
PL_count_one = matchgroup( %w[ 1 a an one each every this that ] )
|
377
|
+
|
378
|
+
# Articles and demonstratives, singular => plural
PL_adj_special_h = {
	"a"    => "some",
	"an"   => "some",
	"this" => "these",
	"that" => "those",
}
PL_adj_special = matchgroup( PL_adj_special_h.keys )

# Possessive adjectives, singular => plural
PL_adj_poss_h = {
	"my"    => "our",
	"your"  => "your",
	"its"   => "their",
	"her"   => "their",
	"his"   => "their",
	"their" => "their",
}
PL_adj_poss = matchgroup( PL_adj_poss_h.keys )
|
393
|
+
|
394
|
+
|
395
|
+
#################################################################
|
396
|
+
### P U B L I C F U N C T I O N S
|
397
|
+
#################################################################
|
398
|
+
|
399
|
+
### Return the plural of the receiver if +count+ indicates it should be
### plural. Receivers that respond to #to_int are first spelled out with
### #numwords; anything else is stringified. The text is tried as a special
### adjective, then as a special verb, and finally as a noun. Leading and
### trailing whitespace is preserved.
def plural( count=2 )
	phrase = self.respond_to?( :to_int ) ? self.numwords : self.to_s
	self.log.debug "Pluralizing %p" % [ phrase ]

	# Split off surrounding whitespace so only the middle part is inflected;
	# if there is nothing to inflect, hand the phrase back untouched.
	md = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s )
	return phrase unless md
	pre, text, post = md.captures

	inflected = pluralize_special_adjective( text, count )
	inflected ||= pluralize_special_verb( text, count )
	inflected ||= pluralize_noun( text, count )

	return pre + postprocess( text, inflected ) + post
end
|
426
|
+
Linguistics::EN.register_lprintf_formatter :PL, :plural
|
427
|
+
|
428
|
+
|
429
|
+
### Return the plural of the given noun +phrase+ if +count+ indicates it
### should be plural. Leading and trailing whitespace is preserved. A
### receiver with nothing to inflect (for example an empty string) is
### returned unchanged.
def plural_noun( count=2 )
	phrase = self.to_s

	# The pattern needs at least one character that is not a newline; when
	# there is none, #match returns nil and there is nothing to pluralize.
	md = /\A(\s*)(.+?)(\s*)\Z/.match( phrase )
	return phrase unless md

	pre, word, post = md.captures
	plural = postprocess( word, pluralize_noun(word, count) )

	return pre + plural + post
end
|
442
|
+
|
443
|
+
|
444
|
+
### Return the plural of the given verb +phrase+ if +count+ indicates it
### should be plural. Leading and trailing whitespace is preserved. A
### receiver with nothing to inflect (for example an empty string) is
### returned unchanged.
def plural_verb( count=2 )
	phrase = self.to_s

	# The pattern needs at least one character that is not a newline; when
	# there is none, #match returns nil and there is nothing to pluralize.
	md = /\A(\s*)(.+?)(\s*)\Z/.match( phrase )
	return phrase unless md

	pre, word, post = md.captures
	plural = postprocess( word,
		pluralize_special_verb(word, count) ||
		pluralize_general_verb(word, count) )

	return pre + plural + post
end
|
459
|
+
|
460
|
+
|
461
|
+
### Return the plural of the given adjectival +phrase+ if +count+ indicates
### it should be plural. Adjectives without a special plural are returned
### as they are. Leading and trailing whitespace is preserved. A receiver
### with nothing to inflect (for example an empty string) is returned
### unchanged.
def plural_adjective( count=2 )
	phrase = self.to_s

	# The pattern needs at least one character that is not a newline; when
	# there is none, #match returns nil and there is nothing to pluralize.
	md = /\A(\s*)(.+?)(\s*)\Z/.match( phrase )
	return phrase unless md

	pre, word, post = md.captures
	plural = postprocess( word, pluralize_special_adjective(word, count) || word )

	return pre + plural + post
end
|
474
|
+
alias_method :plural_adj, :plural_adjective
|
475
|
+
|
476
|
+
|
477
|
+
#################################################################
|
478
|
+
### P R I V A T E F U N C T I O N S
|
479
|
+
#################################################################
|
480
|
+
|
481
|
+
#######
|
482
|
+
private
|
483
|
+
#######
|
484
|
+
|
485
|
+
### Do normal/classical switching and match capitalization in +inflected+ by
### examining the +original+ input. Returns a new String; +inflected+ itself
### is never modified.
def postprocess( original, inflected )

	# Work on a copy. The inflection helpers often hand back a value taken
	# straight out of one of the lookup tables, and editing that String in
	# place would permanently change the table entry.
	result = inflected.dup

	# If there's a classical variant ("modern|classical"), use it instead of
	# the modern one if classical mode is on.
	result.sub!( /([^|]+)\|(.+)/ ) do
		Linguistics::EN.classical? ? $2 : $1
	end

	# Try to duplicate the case of the original string
	case original
	when "I"
		return result
	when /^[A-Z]+$/
		return result.upcase
	when /^[A-Z]/
		# Can't use #capitalize, as it will downcase the rest of the string,
		# too.
		result[0,1] = result[0,1].upcase
		return result
	else
		return result
	end
end
|
510
|
+
|
511
|
+
|
512
|
+
### Normalize a count to either 1 or 2 (singular or plural). A nil +count+
### yields +default+. A count that names "one" (see PL_count_one) yields 1,
### as does a count that names "zero" (see PL_count_zero) when classical
### mode is on. Anything else yields +default+. Matching ignores case.
def normalize_count( count, default=2 )
	return default if count.nil? # Default to plural

	text = count.to_s

	# Interpolate the tables' #source rather than the Regexp objects: an
	# interpolated Regexp carries its own (case-sensitive) flags, which would
	# switch the /i option off for exactly the part that needs it.
	return 1 if /^(#{PL_count_one.source})$/i =~ text
	return 1 if Linguistics::EN.classical? && /^(#{PL_count_zero.source})$/i =~ text

	return default
end
|
522
|
+
|
523
|
+
|
524
|
+
### Pluralize nouns. Returns +word+ unchanged when +count+ normalizes to 1;
### otherwise returns the plural chosen by the first rule below that matches.
### Entries that have both a modern and a classical plural are returned as
### "modern|classical".
###
### NOTE(review): the table patterns are interpolated as Regexp objects, which
### embed with their own flags, so the trailing /i on the rules below
### probably does not reach the interpolated part -- confirm before relying
### on case-insensitive table matches.
def pluralize_noun( word, count=2 )
	self.log.debug "Trying to pluralize %p as a noun" % [ word ]

	value = nil
	count = normalize_count( count )

	return word if count == 1

	# Handle user-defined nouns
	#if value = ud_match( word, PL_sb_user_defined )
	# return value
	#end

	# Handle empty word, singular count and uninflected plurals
	case word
	when ''
		self.log.debug " empty string"
		return word
	when /^(#{PL_sb_uninflected})$/i
		self.log.debug " uninflected plural"
		return word
	else
		if Linguistics::EN.classical? && /^(#{PL_sb_uninflected_herd})$/i =~ word
			self.log.debug " uninflected classical herd word"
			return word
		end
	end

	# Handle compounds ("Governor General", "mother-in-law", "aide-de-camp", etc.)
	case word
	when /^(?:#{PL_sb_postfix_adj})$/i
		# $1 is the noun, $2 the postfixed remainder; only the noun is inflected
		value = $2
		noun = $1
		self.log.debug " postfixed adjectival compound noun phrase (#{value} -> #{noun})"
		return pluralize_noun( noun, 2 ) + value

	when /^(?:#{PL_sb_prep_dual_compound})$/i
		# Both the leading noun ($1) and the trailing noun ($3) are inflected
		noun = $1
		value = [ $2, $3 ]
		self.log.debug " prepositional dual compound noun phrase (%s -> %s %s)" %
			[ noun, *value ]
		return pluralize_noun( noun, 2 ) + value[0] + pluralize_noun( value[1] )

	when /^(?:#{PL_sb_prep_compound})$/i
		# Only the leading noun ($1) is inflected
		noun = $1
		value = $2
		self.log.debug " prepositional singular compound noun phrase (%s -> %s)" %
			[ noun, value ]
		return pluralize_noun( noun, 2 ) + value

	# Handle pronouns
	when /^((?:#{PL_prep})\s+)(#{PL_pron_acc})$/i
		prep, pron = $1, $2
		self.log.debug " prepositional pronoun phrase (%p + %p)" % [ prep, pron ]
		return prep + PL_pron_acc_h[ pron.downcase ]

	when /^(#{PL_pron_nom})$/i
		pron = $1
		self.log.debug " nominative pronoun; using PL_pron_nom table"
		# Returns the table's own String object, not a copy
		return PL_pron_nom_h[ word.downcase ]

	when /^(#{PL_pron_acc})$/i
		self.log.debug " accusative pronoun; using PL_pron_acc table"
		# Returns the table's own String object, not a copy
		return PL_pron_acc_h[ word.downcase ]

	# Handle isolated irregular plurals
	when /(.*)\b(#{PL_sb_irregular})$/i
		prefix, word = $1, $2
		self.log.debug " isolated irregular; using PL_sb_irregular_h table"
		return prefix + PL_sb_irregular_h[ word.downcase ]

	# Unconditional ...man -> ...mans
	# NOTE(review): the pattern is not anchored at the start and only the
	# matched table word is kept, so any text before it is dropped from the
	# result -- confirm this is intended.
	when /(#{PL_sb_U_man_mans})$/i
		word = $1
		self.log.debug " unconditional man -> mans (%p)" % [ word ]
		return "#{word}s"

	# Handle families of irregular plurals
	when /(.*)man$/i then return "#{$1}men"
	when /(.*[ml])ouse$/i then return "#{$1}ice"
	when /(.*)goose$/i then return "#{$1}geese"
	when /(.*)tooth$/i then return "#{$1}teeth"
	when /(.*)foot$/i then return "#{$1}feet"

	# Handle unassimilated imports
	# NOTE(review): the table-driven rules below capture only the table stem,
	# so text in front of the stem is not carried into the result -- confirm.
	when /(.*)ceps$/i then return word
	when /(.*)zoon$/i then return "#{$1}zoa"
	when /(.*[csx])is$/i then return "#{$1}es"
	when /(#{PL_sb_U_ex_ices})ex$/i then return "#{$1}ices"
	when /(#{PL_sb_U_ix_ices})ix$/i then return "#{$1}ices"
	when /(#{PL_sb_U_um_a})um$/i then return "#{$1}a"
	when /(#{PL_sb_U_us_i})us$/i then return "#{$1}i"
	when /(#{PL_sb_U_on_a})on$/i then return "#{$1}a"
	when /(#{PL_sb_U_a_ae})$/i then return "#{$1}e"
	end


	# Handle incompletely assimilated imports in classical mode
	if Linguistics::EN.classical?
		self.log.debug " checking for classical incompletely assimilated imports"
		case word
		when /(.*)trix$/i then return "#{$1}trices"
		when /(.*)eau$/i then return "#{$1}eaux"
		when /(.*)ieu$/i then return "#{$1}ieux"
		when /(.{2,}[yia])nx$/i then return "#{$1}nges"
		when /(#{PL_sb_C_en_ina})en$/i then return "#{$1}ina"
		when /(#{PL_sb_C_ex_ices})ex$/i then return "#{$1}ices"
		when /(#{PL_sb_C_ix_ices})ix$/i then return "#{$1}ices"
		when /(#{PL_sb_C_um_a})um$/i then return "#{$1}a"
		when /(#{PL_sb_C_us_i})us$/i then return "#{$1}i"
		when /(#{PL_sb_C_us_us})$/i then return "#{$1}"
		when /(#{PL_sb_C_a_ae})$/i then return "#{$1}e"
		when /(#{PL_sb_C_a_ata})a$/i then return "#{$1}ata"
		when /(#{PL_sb_C_o_i})o$/i then return "#{$1}i"
		when /(#{PL_sb_C_on_a})on$/i then return "#{$1}a"
		when /#{PL_sb_C_im}$/i then return "#{word}im"
		when /#{PL_sb_C_i}$/i then return "#{word}i"
		end
	end


	# Handle singular nouns ending in ...s or other sibilants
	case word
	when /^(#{PL_sb_singular_s})$/i then return "#{$1}es"
	when /^([A-Z].*s)$/ then return "#{$1}es"
	when /(.*)([cs]h|[zx])$/i then return "#{$1}#{$2}es"
	# when /(.*)(us)$/i then return "#{$1}#{$2}es"

	# Handle ...f -> ...ves
	when /(.*[eao])lf$/i then return "#{$1}lves"
	when /(.*[^d])eaf$/i then return "#{$1}eaves"
	when /(.*[nlw])ife$/i then return "#{$1}ives"
	when /(.*)arf$/i then return "#{$1}arves"

	# Handle ...y
	when /(.*[aeiou])y$/i then return "#{$1}ys"
	when /([A-Z].*y)$/ then return "#{$1}s"
	when /(.*)y$/i then return "#{$1}ies"

	# Handle ...o
	when /#{PL_sb_U_o_os}$/i then return "#{word}s"
	when /[aeiou]o$/i then return "#{word}s"
	when /o$/i then return "#{word}es"

	# Otherwise just add ...s
	else
		self.log.debug " appears to be regular; adding +s"
		return "#{word}s"
	end
end # def pluralize_noun
|
675
|
+
|
676
|
+
|
677
|
+
|
678
|
+
### Pluralize special verbs. Returns the plural form of +word+ when one of
### the rules below recognises it, +word+ itself when +count+ is singular or
### the verb does not inflect, and nil when +word+ should be handled by some
### other rule set.
def pluralize_special_verb( word, count )
	self.log.debug "Trying to pluralize %p as a special verb..." % [ word ]
	count ||= 1
	count = normalize_count( count )

	if /^(#{PL_count_one})$/i =~ count.to_s
		self.log.debug " it's a single-count word, returning it unchanged."
		return word # :FIXME: should this return nil instead?
		# return nil
	end

	# Handle user-defined verbs
	#if value = ud_match( word, PL_v_user_defined )
	# return value
	#end

	case word

	# Handle irregular present tense (simple and compound)
	when /^(#{PL_v_irregular_pres})((\s.*)?)$/i
		key, rest = $1.downcase, $2
		self.log.debug " yep, it's an irregular present tense verb (%p)" % [ key ]
		return PL_v_irregular_pres_h[ key ] + rest

	# Handle irregular future, preterite and perfect tenses
	when /^(#{PL_v_irregular_non_pres})((\s.*)?)$/i
		# Take the verb from this branch's own match so the log shows it
		# instead of nil.
		key = $1.downcase
		self.log.debug " yep, it's an irregular non-present tense verb (%p)" % [ key ]
		return word

	# Handle special cases: words ending in "s" that must not be treated as
	# third-person verbs
	when /^(#{PL_v_special_s})$/
		self.log.debug " it's a not special-case verb; aborting."
		return nil

	# Handle standard 3rd person (chop the ...(e)s off single words)
	when /^(.*)([cs]h|[x]|zz|ss)es$/i
		base, suffix = $1, $2
		self.log.debug " it's a standard third-person verb (%p + %p)" % [ base, suffix ]
		return base + suffix
	when /^(..+)ies$/i
		verb = $1
		self.log.debug " it's a standard third-person verb (%p + ies -> +y)" % [ verb ]
		return "#{verb}y"
	when /^(.+)oes$/i
		verb = $1
		self.log.debug " it's a standard third-person verb (%p + oes -> +o)" % [ verb ]
		return "#{verb}o"
	when /^(.*[^s])s$/i
		verb = $1
		self.log.debug " it's a standard third-person verb (%p + (^s)s -> -s)" % [ verb ]
		return verb

	# Otherwise, a regular verb (handle elsewhere)
	else
		self.log.debug " nope. Either a regular verb or not a verb."
		return nil
	end
end
|
737
|
+
|
738
|
+
|
739
|
+
### Pluralize regular verbs. A singular +count+ returns +word+ as it is.
### Otherwise a verb listed in the ambiguous present-tense table is swapped
### for its plural (keeping any trailing words), and every other verb is
### returned unchanged.
def pluralize_general_verb( word, count )
	return word if /^(#{PL_count_one})$/i =~ normalize_count( count ).to_s

	case word

	# Ambiguous present tenses (simple and compound)
	when /^(#{PL_v_ambiguous_pres})((\s.*)?)$/i
		verb, rest = Regexp.last_match( 1 ), Regexp.last_match( 2 )
		PL_v_ambiguous_pres_h[ verb.downcase ] + rest

	# Ambiguous preterite and perfect tenses do not inflect
	when /^(#{PL_v_ambiguous_non_pres})((\s.*)?)$/i
		word

	# Otherwise, 1st or 2nd person is uninflected
	else
		word
	end
end
|
760
|
+
|
761
|
+
|
762
|
+
### Handle special adjectives. Returns the plural of +word+ when it is one of
### the listed articles/demonstratives or possessive adjectives, or when it
### is a possessive ending in an apostrophe (optionally followed by "s").
### Returns nil when +count+ is singular or +word+ is none of these.
def pluralize_special_adjective( word, count )
	self.log.debug "Trying to pluralize %p as a special adjective..." % [ word ]
	count = normalize_count( count || 1 )

	if /^(#{PL_count_one})$/i =~ count.to_s
		self.log.debug " it's a single-count word; aborting"
		return nil
	end

	# Handle user-defined adjectives
	#if value = ud_match( word, PL_adj_user_defined )
	# return value
	#end

	case word

	# Articles and demonstratives
	when /^(#{PL_adj_special})$/i
		adjective = Regexp.last_match( 1 ).downcase
		self.log.debug " yep, it's a special plural adjective (%p)" % [ adjective ]
		return PL_adj_special_h[ adjective ]

	# Possessive adjectives
	when /^(#{PL_adj_poss})$/i
		adjective = Regexp.last_match( 1 ).downcase
		self.log.debug " it's a special possessive adjective (%p)" % [ adjective ]
		return PL_adj_poss_h[ adjective ]

	# Possessive nouns: pluralize the owner, then re-attach the apostrophe
	when /^(.*)'s?$/
		pl = Regexp.last_match( 1 ).en.plural_noun( count )
		self.log.debug " it has an apostrophe (%p); using generic possessive rules" % [ pl ]
		ending = ( /s$/ =~ pl ) ? "'" : "'s"
		return "#{pl}#{ending}"

	# Otherwise, no idea
	else
		self.log.debug " nope."
		return nil
	end
end
|
807
|
+
|
808
|
+
|
809
|
+
end # module Linguistics::EN::Pluralization
|
810
|
+
|