Linguistics 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/Artistic +127 -0
- data/ChangeLog +444 -0
- data/MANIFEST +19 -0
- data/README +178 -0
- data/README.english +245 -0
- data/TODO +17 -0
- data/experiments/randobjlist.rb +34 -0
- data/install.rb +154 -0
- data/lib/linguistics/en/infinitive.rb +1149 -0
- data/lib/linguistics/en/linkparser.rb +142 -0
- data/lib/linguistics/en/wordnet.rb +253 -0
- data/lib/linguistics/en.rb +1694 -0
- data/lib/linguistics/iso639.rb +456 -0
- data/lib/linguistics.rb +368 -0
- data/redist/crosscase.rb +298 -0
- data/test.rb +110 -0
- data/tests/en/conjunction.tests.rb +114 -0
- data/tests/en/inflect.tests.rb +1378 -0
- data/tests/lingtestcase.rb +239 -0
- data/tests/use.tests.rb +99 -0
- data/utils.rb +689 -0
- metadata +58 -0
@@ -0,0 +1,1694 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
#
|
3
|
+
# = Linguistics::EN
|
4
|
+
#
|
5
|
+
# This module contains English-language linguistic functions for the Linguistics
|
6
|
+
# module. It can be either loaded directly, or by passing some variant of 'en'
|
7
|
+
# or 'eng' to the Linguistics::use method.
|
8
|
+
#
|
9
|
+
# The functions contained by the module provide:
|
10
|
+
#
|
11
|
+
# == Plural Inflections
|
12
|
+
#
|
13
|
+
# Plural forms of all nouns, most verbs, and some adjectives are provided. Where
|
14
|
+
# appropriate, "classical" variants (for example: "brother" -> "brethren",
|
15
|
+
# "dogma" -> "dogmata", etc.) are also provided.
|
16
|
+
#
|
17
|
+
# These can be accessed via the #plural, #plural_noun, #plural_verb, and
|
18
|
+
# #plural_adjective methods.
|
19
|
+
#
|
20
|
+
# == Indefinite Articles
|
21
|
+
#
|
22
|
+
# Pronunciation-based "a"/"an" selection is provided for all English words, and
|
23
|
+
# most initialisms.
|
24
|
+
#
|
25
|
+
# See: #a, #an, and #no.
|
26
|
+
#
|
27
|
+
# == Numbers to Words
|
28
|
+
#
|
29
|
+
# Conversion from Numeric values to words is supported using the American
|
30
|
+
# "thousands" system. E.g., 2561 => "two thousand, five hundred and sixty-one".
|
31
|
+
#
|
32
|
+
# See the #numwords method.
|
33
|
+
#
|
34
|
+
# == Ordinals
|
35
|
+
#
|
36
|
+
# It is also possible to inflect numerals (1,2,3) and number words ("one",
|
37
|
+
# "two", "three") to ordinals (1st, 2nd, 3rd) and ordinates ("first", "second",
|
38
|
+
# "third").
|
39
|
+
#
|
40
|
+
# == Conjunctions
|
41
|
+
#
|
42
|
+
# This module also supports the creation of English conjunctions from Arrays of
|
43
|
+
# Strings or objects which respond to the #to_s message. Eg.,
|
44
|
+
#
|
45
|
+
# %w{cow pig chicken cow dog cow duck duck moose}.en.conjunction
|
46
|
+
# ==> "three cows, two ducks, a pig, a chicken, a dog, and a moose"
|
47
|
+
#
|
48
|
+
# == Infinitives
|
49
|
+
#
|
50
|
+
# Returns the infinitive form of English verbs:
|
51
|
+
#
|
52
|
+
# "dodging".en.infinitive
|
53
|
+
# ==> "dodge"
|
54
|
+
#
|
55
|
+
#
|
56
|
+
# == Authors
|
57
|
+
#
|
58
|
+
# * Michael Granger <ged@FaerieMUD.org>
|
59
|
+
#
|
60
|
+
# == Copyright
|
61
|
+
#
|
62
|
+
# This module is copyright (c) 2003-2005 The FaerieMUD Consortium. All rights
|
63
|
+
# reserved.
|
64
|
+
#
|
65
|
+
# This module is free software. You may use, modify, and/or redistribute this
|
66
|
+
# software under the terms of the Perl Artistic License. (See
|
67
|
+
# http://language.perl.com/misc/Artistic.html)
|
68
|
+
#
|
69
|
+
# The inflection functions of this module were adapted from Damian Conway's
|
70
|
+
# Lingua::EN::Inflect Perl module:
|
71
|
+
#
|
72
|
+
# Copyright (c) 1997-2000, Damian Conway. All Rights Reserved.
|
73
|
+
# This module is free software. It may be used, redistributed
|
74
|
+
# and/or modified under the same terms as Perl itself.
|
75
|
+
#
|
76
|
+
# The conjunctions code was adapted from the Lingua::Conjunction Perl module
|
77
|
+
# written by Robert Rothenberg and Damian Conway, which has no copyright
|
78
|
+
# statement included.
|
79
|
+
#
|
80
|
+
# == Version
|
81
|
+
#
|
82
|
+
# $Id: en.rb,v 1.8 2003/09/14 10:47:12 deveiant Exp $
|
83
|
+
#
|
84
|
+
|
85
|
+
|
86
|
+
### This module contains English-language linguistics functions accessible from
|
87
|
+
### the Linguistics module, or as a standalone function library.
|
88
|
+
module Linguistics::EN
|
89
|
+
|
90
|
+
begin
|
91
|
+
require 'crosscase'
|
92
|
+
rescue LoadError
|
93
|
+
else
|
94
|
+
include CrossCase
|
95
|
+
end
|
96
|
+
|
97
|
+
# Load in the secondary modules and add them to Linguistics::EN.
|
98
|
+
require 'linguistics/en/infinitive'
|
99
|
+
require 'linguistics/en/wordnet'
|
100
|
+
require 'linguistics/en/linkparser'
|
101
|
+
|
102
|
+
# Subversion revision
|
103
|
+
SVNRev = %q$Rev$
|
104
|
+
|
105
|
+
# Subversion revision tag
|
106
|
+
SVNId = %q$Id: en.rb,v 1.8 2003/09/14 10:47:12 deveiant Exp $
|
107
|
+
|
108
|
+
# Add 'english' to the list of default languages
|
109
|
+
Linguistics::DefaultLanguages.push( :en )
|
110
|
+
|
111
|
+
|
112
|
+
#################################################################
|
113
|
+
### U T I L I T Y F U N C T I O N S
|
114
|
+
#################################################################
|
115
|
+
|
116
|
+
### Build the source text of a non-capturing alternation group from one or
### more pattern fragments. Arrays (including nested ones) are flattened
### before joining, so callers may pass word lists directly. Returns a String
### such as "(?:a|b|c)" that can be interpolated into a Regexp.
def self::matchgroup( *parts )
  alternatives = parts.flatten
  "(?:" + alternatives.join("|") + ")"
end
|
121
|
+
|
122
|
+
|
123
|
+
#################################################################
|
124
|
+
### C O N S T A N T S
|
125
|
+
#################################################################
|
126
|
+
|
127
|
+
# :stopdoc:
|
128
|
+
|
129
|
+
#
|
130
|
+
# Plurals
|
131
|
+
#
|
132
|
+
|
133
|
+
PL_sb_irregular_s = {
|
134
|
+
"ephemeris" => "ephemerides",
|
135
|
+
"iris" => "irises|irides",
|
136
|
+
"clitoris" => "clitorises|clitorides",
|
137
|
+
"corpus" => "corpuses|corpora",
|
138
|
+
"opus" => "opuses|opera",
|
139
|
+
"genus" => "genera",
|
140
|
+
"mythos" => "mythoi",
|
141
|
+
"penis" => "penises|penes",
|
142
|
+
"testis" => "testes",
|
143
|
+
}
|
144
|
+
|
145
|
+
PL_sb_irregular_h = {
|
146
|
+
"child" => "children",
|
147
|
+
"brother" => "brothers|brethren",
|
148
|
+
"loaf" => "loaves",
|
149
|
+
"hoof" => "hoofs|hooves",
|
150
|
+
"beef" => "beefs|beeves",
|
151
|
+
"money" => "monies",
|
152
|
+
"mongoose" => "mongooses",
|
153
|
+
"ox" => "oxen",
|
154
|
+
"cow" => "cows|kine",
|
155
|
+
"soliloquy" => "soliloquies",
|
156
|
+
"graffito" => "graffiti",
|
157
|
+
"prima donna" => "prima donnas|prime donne",
|
158
|
+
"octopus" => "octopuses|octopodes",
|
159
|
+
"genie" => "genies|genii",
|
160
|
+
"ganglion" => "ganglions|ganglia",
|
161
|
+
"trilby" => "trilbys",
|
162
|
+
"turf" => "turfs|turves",
|
163
|
+
}.update( PL_sb_irregular_s )
|
164
|
+
PL_sb_irregular = matchgroup PL_sb_irregular_h.keys
|
165
|
+
|
166
|
+
|
167
|
+
# Classical "..a" -> "..ata"
|
168
|
+
PL_sb_C_a_ata = matchgroup %w[
|
169
|
+
anathema bema carcinoma charisma diploma
|
170
|
+
dogma drama edema enema enigma lemma
|
171
|
+
lymphoma magma melisma miasma oedema
|
172
|
+
sarcoma schema soma stigma stoma trauma
|
173
|
+
gumma pragma
|
174
|
+
].collect {|word| word[0...-1]}
|
175
|
+
|
176
|
+
# Unconditional "..a" -> "..ae"
|
177
|
+
PL_sb_U_a_ae = matchgroup %w[
|
178
|
+
alumna alga vertebra persona
|
179
|
+
]
|
180
|
+
|
181
|
+
# Classical "..a" -> "..ae"
|
182
|
+
PL_sb_C_a_ae = matchgroup %w[
|
183
|
+
amoeba antenna formula hyperbola
|
184
|
+
medusa nebula parabola abscissa
|
185
|
+
hydra nova lacuna aurora .*umbra
|
186
|
+
flora fauna
|
187
|
+
]
|
188
|
+
|
189
|
+
# Classical "..en" -> "..ina"
|
190
|
+
PL_sb_C_en_ina = matchgroup %w[
|
191
|
+
stamen foramen lumen
|
192
|
+
].collect {|word| word[0...-2] }
|
193
|
+
|
194
|
+
# Unconditional "..um" -> "..a"
|
195
|
+
PL_sb_U_um_a = matchgroup %w[
|
196
|
+
bacterium agendum desideratum erratum
|
197
|
+
stratum datum ovum extremum
|
198
|
+
candelabrum
|
199
|
+
].collect {|word| word[0...-2] }
|
200
|
+
|
201
|
+
# Classical "..um" -> "..a"
#
# Each word is stored without its final "um" so the stem can be reused when
# the plural is built. The historical misspellings "enconium" and
# "millenium" are kept so input spelt that way still inflects as before; the
# correct spellings "encomium" and "millennium" are listed as well.
PL_sb_C_um_a = matchgroup %w[
  maximum minimum momentum optimum
  quantum cranium curriculum dictum
  phylum aquarium compendium emporium
  enconium encomium gymnasium honorarium interregnum
  lustrum memorandum millenium millennium rostrum
  spectrum speculum stadium trapezium
  ultimatum medium vacuum velum
  consortium
].collect {|word| word[0...-2]}
|
212
|
+
|
213
|
+
# Unconditional "..us" -> "i"
|
214
|
+
PL_sb_U_us_i = matchgroup %w[
|
215
|
+
alumnus alveolus bacillus bronchus
|
216
|
+
locus nucleus stimulus meniscus
|
217
|
+
].collect {|word| word[0...-2]}
|
218
|
+
|
219
|
+
# Classical "..us" -> "..i"
|
220
|
+
PL_sb_C_us_i = matchgroup %w[
|
221
|
+
focus radius genius
|
222
|
+
incubus succubus nimbus
|
223
|
+
fungus nucleolus stylus
|
224
|
+
torus umbilicus uterus
|
225
|
+
hippopotamus
|
226
|
+
].collect {|word| word[0...-2]}
|
227
|
+
|
228
|
+
# Classical "..us" -> "..us" (assimilated 4th declension latin nouns)
|
229
|
+
PL_sb_C_us_us = matchgroup %w[
|
230
|
+
status apparatus prospectus sinus
|
231
|
+
hiatus impetus plexus
|
232
|
+
]
|
233
|
+
|
234
|
+
# Unconditional "..on" -> "a"
|
235
|
+
PL_sb_U_on_a = matchgroup %w[
|
236
|
+
criterion perihelion aphelion
|
237
|
+
phenomenon prolegomenon noumenon
|
238
|
+
organon asyndeton hyperbaton
|
239
|
+
].collect {|word| word[0...-2]}
|
240
|
+
|
241
|
+
# Classical "..on" -> "..a"
|
242
|
+
PL_sb_C_on_a = matchgroup %w[
|
243
|
+
oxymoron
|
244
|
+
].collect {|word| word[0...-2]}
|
245
|
+
|
246
|
+
# Classical "..o" -> "..i" (but normally -> "..os")
|
247
|
+
PL_sb_C_o_i_a = %w[
|
248
|
+
solo soprano basso alto
|
249
|
+
contralto tempo piano
|
250
|
+
]
|
251
|
+
PL_sb_C_o_i = matchgroup PL_sb_C_o_i_a.collect{|word| word[0...-1]}
|
252
|
+
|
253
|
+
# Always "..o" -> "..os"
|
254
|
+
PL_sb_U_o_os = matchgroup( %w[
|
255
|
+
albino archipelago armadillo
|
256
|
+
commando crescendo fiasco
|
257
|
+
ditto dynamo embryo
|
258
|
+
ghetto guano inferno
|
259
|
+
jumbo lumbago magneto
|
260
|
+
manifesto medico octavo
|
261
|
+
photo pro quarto
|
262
|
+
canto lingo generalissimo
|
263
|
+
stylo rhino
|
264
|
+
] | PL_sb_C_o_i_a )
|
265
|
+
|
266
|
+
|
267
|
+
# Unconditional "..[ei]x" -> "..ices"
|
268
|
+
PL_sb_U_ex_ices = matchgroup %w[
|
269
|
+
codex murex silex
|
270
|
+
].collect {|word| word[0...-2]}
|
271
|
+
PL_sb_U_ix_ices = matchgroup %w[
|
272
|
+
radix helix
|
273
|
+
].collect {|word| word[0...-2]}
|
274
|
+
|
275
|
+
# Classical "..[ei]x" -> "..ices"
|
276
|
+
PL_sb_C_ex_ices = matchgroup %w[
|
277
|
+
vortex vertex cortex latex
|
278
|
+
pontifex apex index simplex
|
279
|
+
].collect {|word| word[0...-2]}
|
280
|
+
PL_sb_C_ix_ices = matchgroup %w[
|
281
|
+
appendix
|
282
|
+
].collect {|word| word[0...-2]}
|
283
|
+
|
284
|
+
|
285
|
+
# Arabic: ".." -> "..i"
|
286
|
+
PL_sb_C_i = matchgroup %w[
|
287
|
+
afrit afreet efreet
|
288
|
+
]
|
289
|
+
|
290
|
+
|
291
|
+
# Hebrew: ".." -> "..im"
|
292
|
+
PL_sb_C_im = matchgroup %w[
|
293
|
+
goy seraph cherub
|
294
|
+
]
|
295
|
+
|
296
|
+
# Unconditional "..man" -> "..mans"
|
297
|
+
PL_sb_U_man_mans = matchgroup %w[
|
298
|
+
human
|
299
|
+
Alabaman Bahaman Burman German
|
300
|
+
Hiroshiman Liman Nakayaman Oklahoman
|
301
|
+
Panaman Selman Sonaman Tacoman Yakiman
|
302
|
+
Yokohaman Yuman
|
303
|
+
]
|
304
|
+
|
305
|
+
|
306
|
+
PL_sb_uninflected_s = [
|
307
|
+
# Pairs or groups subsumed to a singular...
|
308
|
+
"breeches", "britches", "clippers", "gallows", "hijinks",
|
309
|
+
"headquarters", "pliers", "scissors", "testes", "herpes",
|
310
|
+
"pincers", "shears", "proceedings", "trousers",
|
311
|
+
|
312
|
+
# Unassimilated Latin 4th declension
|
313
|
+
"cantus", "coitus", "nexus",
|
314
|
+
|
315
|
+
# Recent imports...
|
316
|
+
"contretemps", "corps", "debris",
|
317
|
+
".*ois",
|
318
|
+
|
319
|
+
# Diseases
|
320
|
+
".*measles", "mumps",
|
321
|
+
|
322
|
+
# Miscellaneous others...
|
323
|
+
"diabetes", "jackanapes", "series", "species", "rabies",
|
324
|
+
"chassis", "innings", "news", "mews",
|
325
|
+
]
|
326
|
+
|
327
|
+
|
328
|
+
# Don't inflect in classical mode, otherwise normal inflection
|
329
|
+
PL_sb_uninflected_herd = matchgroup %w[
|
330
|
+
wildebeest swine eland bison buffalo
|
331
|
+
elk moose rhinoceros
|
332
|
+
]
|
333
|
+
|
334
|
+
PL_sb_uninflected = matchgroup [
|
335
|
+
|
336
|
+
# Some fish and herd animals
|
337
|
+
".*fish", "tuna", "salmon", "mackerel", "trout",
|
338
|
+
"bream", "sea[- ]bass", "carp", "cod", "flounder", "whiting",
|
339
|
+
|
340
|
+
".*deer", ".*sheep",
|
341
|
+
|
342
|
+
# All nationals ending in -ese
|
343
|
+
"Portuguese", "Amoyese", "Borghese", "Congoese", "Faroese",
|
344
|
+
"Foochowese", "Genevese", "Genoese", "Gilbertese", "Hottentotese",
|
345
|
+
"Kiplingese", "Kongoese", "Lucchese", "Maltese", "Nankingese",
|
346
|
+
"Niasese", "Pekingese", "Piedmontese", "Pistoiese", "Sarawakese",
|
347
|
+
"Shavese", "Vermontese", "Wenchowese", "Yengeese",
|
348
|
+
".*[nrlm]ese",
|
349
|
+
|
350
|
+
# Some words ending in ...s (often pairs taken as a whole)
|
351
|
+
PL_sb_uninflected_s,
|
352
|
+
|
353
|
+
# Diseases
|
354
|
+
".*pox",
|
355
|
+
|
356
|
+
# Other oddities
|
357
|
+
"graffiti", "djinn"
|
358
|
+
]
|
359
|
+
|
360
|
+
|
361
|
+
# Singular words ending in ...s (all inflect with ...es)
|
362
|
+
PL_sb_singular_s = matchgroup %w[
|
363
|
+
.*ss
|
364
|
+
acropolis aegis alias arthritis asbestos atlas
|
365
|
+
bathos bias bronchitis bursitis caddis cannabis
|
366
|
+
canvas chaos cosmos dais digitalis encephalitis
|
367
|
+
epidermis ethos eyas gas glottis hepatitis
|
368
|
+
hubris ibis lens mantis marquis metropolis
|
369
|
+
neuritis pathos pelvis polis rhinoceros
|
370
|
+
sassafras tonsillitis trellis .*us
|
371
|
+
]
|
372
|
+
|
373
|
+
PL_v_special_s = matchgroup [
|
374
|
+
PL_sb_singular_s,
|
375
|
+
PL_sb_uninflected_s,
|
376
|
+
PL_sb_irregular_s.keys,
|
377
|
+
'(.*[csx])is',
|
378
|
+
'(.*)ceps',
|
379
|
+
'[A-Z].*s',
|
380
|
+
]
|
381
|
+
|
382
|
+
PL_sb_postfix_adj = '(' + {
|
383
|
+
|
384
|
+
'general' => ['(?!major|lieutenant|brigadier|adjutant)\S+'],
|
385
|
+
'martial' => ["court"],
|
386
|
+
|
387
|
+
}.collect {|key,val|
|
388
|
+
matchgroup( matchgroup(val) + "(?=(?:-|\\s+)#{key})" )
|
389
|
+
}.join("|") + ")(.*)"
|
390
|
+
|
391
|
+
|
392
|
+
PL_sb_military = %r'major|lieutenant|brigadier|adjutant|quartermaster'
|
393
|
+
PL_sb_general = %r'((?!#{PL_sb_military.source}).*?)((-|\s+)general)'
|
394
|
+
|
395
|
+
PL_prep = matchgroup %w[
|
396
|
+
about above across after among around at athwart before behind
|
397
|
+
below beneath beside besides between betwixt beyond but by
|
398
|
+
during except for from in into near of off on onto out over
|
399
|
+
since till to under until unto upon with
|
400
|
+
]
|
401
|
+
|
402
|
+
PL_sb_prep_dual_compound = %r'(.*?)((?:-|\s+)(?:#{PL_prep}|d[eu])(?:-|\s+))a(?:-|\s+)(.*)'
|
403
|
+
PL_sb_prep_compound = %r'(.*?)((-|\s+)(#{PL_prep}|d[eu])((-|\s+)(.*))?)'
|
404
|
+
|
405
|
+
|
406
|
+
PL_pron_nom_h = {
|
407
|
+
# Nominative Reflexive
|
408
|
+
"i" => "we", "myself" => "ourselves",
|
409
|
+
"you" => "you", "yourself" => "yourselves",
|
410
|
+
"she" => "they", "herself" => "themselves",
|
411
|
+
"he" => "they", "himself" => "themselves",
|
412
|
+
"it" => "they", "itself" => "themselves",
|
413
|
+
"they" => "they", "themself" => "themselves",
|
414
|
+
|
415
|
+
# Possessive
|
416
|
+
"mine" => "ours",
|
417
|
+
"yours" => "yours",
|
418
|
+
"hers" => "theirs",
|
419
|
+
"his" => "theirs",
|
420
|
+
"its" => "theirs",
|
421
|
+
"theirs" => "theirs",
|
422
|
+
}
|
423
|
+
PL_pron_nom = matchgroup PL_pron_nom_h.keys
|
424
|
+
|
425
|
+
PL_pron_acc_h = {
|
426
|
+
# Accusative Reflexive
|
427
|
+
"me" => "us", "myself" => "ourselves",
|
428
|
+
"you" => "you", "yourself" => "yourselves",
|
429
|
+
"her" => "them", "herself" => "themselves",
|
430
|
+
"him" => "them", "himself" => "themselves",
|
431
|
+
"it" => "them", "itself" => "themselves",
|
432
|
+
"them" => "them", "themself" => "themselves",
|
433
|
+
}
|
434
|
+
PL_pron_acc = matchgroup PL_pron_acc_h.keys
|
435
|
+
|
436
|
+
# Irregular present/past forms of "to be" and "to have", mapping each
# singular form to its plural. Every key is listed exactly once: repeating a
# key in a Hash literal only overwrites the earlier entry and makes Ruby warn
# about a duplicated key. Key order is significant to code that builds an
# alternation from #keys, so it matches the original first-occurrence order.
PL_v_irregular_pres_h = {
  # 1st pers. sing.   2nd pers. sing.    3rd pers. singular
  "am"   => "are",    "are"  => "are",   "is"  => "are",
  "was"  => "were",   "were" => "were",
  "have" => "have",                      "has" => "have",
}
|
443
|
+
PL_v_irregular_pres = matchgroup PL_v_irregular_pres_h.keys
|
444
|
+
|
445
|
+
# Present-tense verbs that are also nouns, mapping each singular form to its
# plural. The first- and second-person forms are identical, so each is listed
# once (a repeated key in a Hash literal is overwritten and makes Ruby warn
# about a duplicated key). Key order matches the original first-occurrence
# order, since #keys is used to build an alternation.
PL_v_ambiguous_pres_h = {
  # 1st/2nd pers. sing.   3rd pers. singular
  "act"   => "act",       "acts"    => "act",
  "blame" => "blame",     "blames"  => "blame",
  "can"   => "can",
  "must"  => "must",
  "fly"   => "fly",       "flies"   => "fly",
  "copy"  => "copy",      "copies"  => "copy",
  "drink" => "drink",     "drinks"  => "drink",
  "fight" => "fight",     "fights"  => "fight",
  "fire"  => "fire",      "fires"   => "fire",
  "like"  => "like",      "likes"   => "like",
  "look"  => "look",      "looks"   => "look",
  "make"  => "make",      "makes"   => "make",
  "reach" => "reach",     "reaches" => "reach",
  "run"   => "run",       "runs"    => "run",
  "sink"  => "sink",      "sinks"   => "sink",
  "sleep" => "sleep",     "sleeps"  => "sleep",
  "view"  => "view",      "views"   => "view",
}
|
466
|
+
PL_v_ambiguous_pres = matchgroup PL_v_ambiguous_pres_h.keys
|
467
|
+
|
468
|
+
PL_v_irregular_non_pres = matchgroup %w[
|
469
|
+
did had ate made put
|
470
|
+
spent fought sank gave sought
|
471
|
+
shall could ought should
|
472
|
+
]
|
473
|
+
|
474
|
+
PL_v_ambiguous_non_pres = matchgroup %w[
|
475
|
+
thought saw bent will might cut
|
476
|
+
]
|
477
|
+
|
478
|
+
PL_count_zero = matchgroup %w[
|
479
|
+
0 no zero nil
|
480
|
+
]
|
481
|
+
|
482
|
+
PL_count_one = matchgroup %w[
|
483
|
+
1 a an one each every this that
|
484
|
+
]
|
485
|
+
|
486
|
+
PL_adj_special_h = {
|
487
|
+
"a" => "some", "an" => "some",
|
488
|
+
"this" => "these", "that" => "those",
|
489
|
+
}
|
490
|
+
PL_adj_special = matchgroup PL_adj_special_h.keys
|
491
|
+
|
492
|
+
PL_adj_poss_h = {
|
493
|
+
"my" => "our",
|
494
|
+
"your" => "your",
|
495
|
+
"its" => "their",
|
496
|
+
"her" => "their",
|
497
|
+
"his" => "their",
|
498
|
+
"their" => "their",
|
499
|
+
}
|
500
|
+
PL_adj_poss = matchgroup PL_adj_poss_h.keys
|
501
|
+
|
502
|
+
|
503
|
+
#
|
504
|
+
# Numerals, ordinals, and numbers-to-words
|
505
|
+
#
|
506
|
+
|
507
|
+
# Numerical inflections
|
508
|
+
Nth = {
|
509
|
+
0 => 'th',
|
510
|
+
1 => 'st',
|
511
|
+
2 => 'nd',
|
512
|
+
3 => 'rd',
|
513
|
+
4 => 'th',
|
514
|
+
5 => 'th',
|
515
|
+
6 => 'th',
|
516
|
+
7 => 'th',
|
517
|
+
8 => 'th',
|
518
|
+
9 => 'th',
|
519
|
+
11 => 'th',
|
520
|
+
12 => 'th',
|
521
|
+
13 => 'th',
|
522
|
+
}
|
523
|
+
|
524
|
+
# Ordinal word parts
|
525
|
+
Ordinals = {
|
526
|
+
'ty' => 'tieth',
|
527
|
+
'one' => 'first',
|
528
|
+
'two' => 'second',
|
529
|
+
'three' => 'third',
|
530
|
+
'five' => 'fifth',
|
531
|
+
'eight' => 'eighth',
|
532
|
+
'nine' => 'ninth',
|
533
|
+
'twelve' => 'twelfth',
|
534
|
+
}
|
535
|
+
OrdinalSuffixes = Ordinals.keys.join("|") + "|"
|
536
|
+
Ordinals[""] = 'th'
|
537
|
+
|
538
|
+
# Numeral names
|
539
|
+
Units = [''] + %w[one two three four five six seven eight nine]
|
540
|
+
Teens = %w[ten eleven twelve thirteen fourteen
|
541
|
+
fifteen sixteen seventeen eighteen nineteen]
|
542
|
+
Tens = ['',''] + %w[twenty thirty forty fifty sixty seventy eighty ninety]
|
543
|
+
Thousands = [' ', ' thousand'] + %w[
|
544
|
+
m b tr quadr quint sext sept oct non dec undec duodec tredec
|
545
|
+
quattuordec quindec sexdec septemdec octodec novemdec vigint
|
546
|
+
].collect {|prefix| ' ' + prefix + 'illion'}
|
547
|
+
|
548
|
+
# A collection of functions for transforming digits into word
|
549
|
+
# phrases. Indexed by the number of digits being transformed; e.g.,
|
550
|
+
# <tt>NumberToWordsFunctions[2]</tt> is the function for transforming
|
551
|
+
# double-digit numbers.
|
552
|
+
NumberToWordsFunctions = [
|
553
|
+
proc {|*args| raise "No digits (#{args.inspect})"},
|
554
|
+
|
555
|
+
# Single-digits
|
556
|
+
proc {|zero,x|
|
557
|
+
(x.nonzero? ? to_units(x) : "#{zero} ")
|
558
|
+
},
|
559
|
+
|
560
|
+
# Double-digits
|
561
|
+
proc {|zero,x,y|
|
562
|
+
if x.nonzero?
|
563
|
+
to_tens( x, y )
|
564
|
+
elsif y.nonzero?
|
565
|
+
"#{zero} " + NumberToWordsFunctions[1].call( zero, y )
|
566
|
+
else
|
567
|
+
([zero] * 2).join(" ")
|
568
|
+
end
|
569
|
+
},
|
570
|
+
|
571
|
+
# Triple-digits
|
572
|
+
proc {|zero,x,y,z|
|
573
|
+
NumberToWordsFunctions[1].call(zero,x) +
|
574
|
+
NumberToWordsFunctions[2].call(zero,y,z)
|
575
|
+
}
|
576
|
+
]
|
577
|
+
|
578
|
+
|
579
|
+
#
|
580
|
+
# Indefinite Articles
|
581
|
+
#
|
582
|
+
|
583
|
+
# This pattern matches strings of capitals starting with a "vowel-sound"
|
584
|
+
# consonant followed by another consonant, and which are not likely
|
585
|
+
# to be real words (oh, all right then, it's just magic!)
|
586
|
+
A_abbrev = %{
|
587
|
+
(?! FJO | [HLMNS]Y. | RY[EO] | SQU
|
588
|
+
| ( F[LR]? | [HL] | MN? | N | RH? | S[CHKLMNPTVW]? | X(YL)?) [AEIOU])
|
589
|
+
[FHLMNRSX][A-Z]
|
590
|
+
}
|
591
|
+
|
592
|
+
# This pattern codes the beginnings of all english words beginning with a
|
593
|
+
# 'y' followed by a consonant. Any other y-consonant prefix therefore
|
594
|
+
# implies an abbreviation.
|
595
|
+
A_y_cons = 'y(b[lor]|cl[ea]|fere|gg|p[ios]|rou|tt)'
|
596
|
+
|
597
|
+
# Exceptions to exceptions
|
598
|
+
A_explicit_an = matchgroup( "euler", "hour(?!i)", "heir", "honest", "hono" )
|
599
|
+
|
600
|
+
|
601
|
+
#
|
602
|
+
# Configuration defaults
|
603
|
+
#
|
604
|
+
|
605
|
+
# Default configuration arguments for the #numwords function
|
606
|
+
NumwordDefaults = {
|
607
|
+
:group => 0,
|
608
|
+
:comma => ', ',
|
609
|
+
:and => ' and ',
|
610
|
+
:zero => 'zero',
|
611
|
+
:decimal => 'point',
|
612
|
+
:asArray => false,
|
613
|
+
}
|
614
|
+
|
615
|
+
# Default ranges for #quantify
|
616
|
+
SeveralRange = 2..5
|
617
|
+
NumberRange = 6..19
|
618
|
+
NumerousRange = 20..45
|
619
|
+
ManyRange = 46..99
|
620
|
+
|
621
|
+
# Default configuration arguments for the #quantify function
|
622
|
+
QuantifyDefaults = {
|
623
|
+
:joinword => " of ",
|
624
|
+
}
|
625
|
+
|
626
|
+
# Default configuration arguments for the #conjunction (junction, what's
|
627
|
+
# your) function.
|
628
|
+
ConjunctionDefaults = {
|
629
|
+
:separator => ', ',
|
630
|
+
:altsep => '; ',
|
631
|
+
:penultimate => true,
|
632
|
+
:conjunctive => 'and',
|
633
|
+
:combine => true,
|
634
|
+
:casefold => true,
|
635
|
+
:generalize => false,
|
636
|
+
:quantsort => true,
|
637
|
+
}
|
638
|
+
|
639
|
+
|
640
|
+
#
|
641
|
+
# Title case
|
642
|
+
#
|
643
|
+
|
644
|
+
# "In titles, capitalize the first word, the last word, and all words in
|
645
|
+
# between except articles (a, an, and the), prepositions under five letters
|
646
|
+
# (in, of, to), and coordinating conjunctions (and, but). These rules apply
|
647
|
+
# to titles of long, short, and partial works as well as your own papers"
|
648
|
+
# (Anson, Schwegler, and Muth. The Longman Writer's Companion 240).
|
649
|
+
|
650
|
+
# Build the list of exceptions to title-capitalization.
#
# The articles are "a", "an" and "the". "and" is a coordinating conjunction
# and is listed with those below, so it remains in the combined list.
Articles = %w[a an the]
ShortPrepositions = %w[
  amid at but by down from in
  into like near of off on onto out over
  past save with till to unto up upon
]
CoordConjunctions = %w[and but as]
TitleCaseExceptions = Articles | ShortPrepositions | CoordConjunctions
|
657
|
+
|
658
|
+
|
659
|
+
# :startdoc:
|
660
|
+
|
661
|
+
#################################################################
|
662
|
+
### " B A C K E N D " F U N C T I O N S
|
663
|
+
#################################################################
|
664
|
+
|
665
|
+
|
666
|
+
###############
|
667
|
+
module_function
|
668
|
+
###############
|
669
|
+
|
670
|
+
### Debugging output: write +msgs+, joined with single spaces, to $stderr,
### but only when the interpreter's $DEBUG flag is set. Always returns nil.
def debugMsg( *msgs ) # :nodoc:
  return unless $DEBUG
  $stderr.puts( msgs.join(" ") )
end
|
674
|
+
|
675
|
+
|
676
|
+
### Normalize a count to either 1 (singular) or +default+ (plural, 2, unless
### the caller passes something else).
###
### +count+ may be anything that responds to #to_s. A nil +count+ yields
### +default+. The result is 1 when the count's string form is one of the
### PL_count_one words, or -- in classical mode only -- one of the
### PL_count_zero words; both comparisons ignore case. Anything else yields
### +default+.
def normalizeCount( count, default=2 )
  return default if count.nil? # Default to plural

  text = count.to_s
  singular = /^(#{PL_count_one})$/i =~ text

  # Classical usage treats a zero count as singular. Match it
  # case-insensitively, like the one-count words above, so "No" and "Zero"
  # behave the same as "no" and "zero".
  singular ||= Linguistics::classical? && /^(#{PL_count_zero})$/i =~ text

  return singular ? 1 : default
end
|
687
|
+
|
688
|
+
|
689
|
+
### Do normal/classical switching and match capitalization in <tt>inflected</tt> by
### examining the <tt>original</tt> input.
###
### +inflected+ may hold two alternatives as "modern|classical"; the one that
### matches Linguistics::classical? is kept. The result is then upcased when
### +original+ is all capitals, or has its first character upcased when
### +original+ starts with a capital. An +original+ of "I" is exempt.
###
### The result is always a new String and +inflected+ itself is never
### modified, so callers may pass frozen strings or values taken straight from
### shared lookup tables without corrupting them.
def postprocess( original, inflected )
  result = inflected.dup
  result.sub!( /([^|]+)\|(.+)/ ) {
    Linguistics::classical? ? $2 : $1
  }

  case original
  when "I"
    return result
  when /^[A-Z]+$/
    return result.upcase
  when /^[A-Z]/
    # Can't use #capitalize, as it will downcase the rest of the string,
    # too.
    result[0,1] = result[0,1].upcase
    return result
  else
    return result
  end
end
|
710
|
+
|
711
|
+
|
712
|
+
### Pluralize nouns
###
### Returns the plural of the noun +word+, or +word+ unchanged when +count+
### normalizes to 1. A nil +count+ falls back to Linguistics::num. The result
### may contain "modern|classical" alternatives taken from the irregular-noun
### table; no capitalization fix-up is done here.
###
### Rules are tried in order: uninflected words, compounds, pronouns,
### irregular nouns, unassimilated imports, classical-only forms (when
### Linguistics::classical? is true), sibilant/-f/-y/-o endings, and finally a
### plain "s".
###
### The stem-based rules capture the whole word up to the replaced ending, so
### a prefixed form keeps its prefix ("substratum" -> "substrata",
### "superhuman" -> "superhumans"). Pronoun results are copies, so callers may
### modify them without altering the lookup tables.
def pluralize_noun( word, count=nil )
  count ||= Linguistics::num
  count = normalizeCount( count )

  return word if count == 1

  # Handle user-defined nouns
  #if value = ud_match( word, PL_sb_user_defined )
  #  return value
  #end

  # Handle empty word and uninflected plurals
  case word
  when ''
    return word
  when /^(#{PL_sb_uninflected})$/i
    return word
  else
    if Linguistics::classical? &&
        /^(#{PL_sb_uninflected_herd})$/i =~ word
      return word
    end
  end

  # Handle compounds ("Governor General", "mother-in-law", "aide-de-camp", etc.)
  case word
  when /^(?:#{PL_sb_postfix_adj})$/i
    value = $2
    return pluralize_noun( $1, 2 ) + value

  when /^(?:#{PL_sb_prep_dual_compound})$/i
    value = [ $2, $3 ]
    return pluralize_noun( $1, 2 ) + value[0] + pluralize_noun( value[1] )

  when /^(?:#{PL_sb_prep_compound})$/i
    value = $2
    return pluralize_noun( $1, 2 ) + value

  # Handle pronouns
  when /^((?:#{PL_prep})\s+)(#{PL_pron_acc})$/i
    return $1 + PL_pron_acc_h[ $2.downcase ]

  when /^(#{PL_pron_nom})$/i
    return PL_pron_nom_h[ $1.downcase ].dup

  when /^(#{PL_pron_acc})$/i
    return PL_pron_acc_h[ $1.downcase ].dup

  # Handle isolated irregular plurals
  when /(.*)\b(#{PL_sb_irregular})$/i
    return $1 + PL_sb_irregular_h[ $2.downcase ]

  when /^(.*#{PL_sb_U_man_mans})$/i
    return "#{$1}s"

  # Handle families of irregular plurals
  when /(.*)man$/i          ; return "#{$1}men"
  when /(.*[ml])ouse$/i     ; return "#{$1}ice"
  when /(.*)goose$/i        ; return "#{$1}geese"
  when /(.*)tooth$/i        ; return "#{$1}teeth"
  when /(.*)foot$/i         ; return "#{$1}feet"

  # Handle unassimilated imports
  when /(.*)ceps$/i         ; return word
  when /(.*)zoon$/i         ; return "#{$1}zoa"
  when /(.*[csx])is$/i      ; return "#{$1}es"
  when /^(.*#{PL_sb_U_ex_ices})ex$/i ; return "#{$1}ices"
  when /^(.*#{PL_sb_U_ix_ices})ix$/i ; return "#{$1}ices"
  when /^(.*#{PL_sb_U_um_a})um$/i    ; return "#{$1}a"
  when /^(.*#{PL_sb_U_us_i})us$/i    ; return "#{$1}i"
  when /^(.*#{PL_sb_U_on_a})on$/i    ; return "#{$1}a"
  when /^(.*#{PL_sb_U_a_ae})$/i      ; return "#{$1}e"
  end

  # Handle incompletely assimilated imports
  if Linguistics::classical?
    case word
    when /(.*)trix$/i         ; return "#{$1}trices"
    when /(.*)eau$/i          ; return "#{$1}eaux"
    when /(.*)ieu$/i          ; return "#{$1}ieux"
    when /(.{2,}[yia])nx$/i   ; return "#{$1}nges"
    when /^(.*#{PL_sb_C_en_ina})en$/i  ; return "#{$1}ina"
    when /^(.*#{PL_sb_C_ex_ices})ex$/i ; return "#{$1}ices"
    when /^(.*#{PL_sb_C_ix_ices})ix$/i ; return "#{$1}ices"
    when /^(.*#{PL_sb_C_um_a})um$/i    ; return "#{$1}a"
    when /^(.*#{PL_sb_C_us_i})us$/i    ; return "#{$1}i"
    when /^(.*#{PL_sb_C_us_us})$/i     ; return "#{$1}"
    when /^(.*#{PL_sb_C_a_ae})$/i      ; return "#{$1}e"
    when /^(.*#{PL_sb_C_a_ata})a$/i    ; return "#{$1}ata"
    when /^(.*#{PL_sb_C_o_i})o$/i      ; return "#{$1}i"
    when /^(.*#{PL_sb_C_on_a})on$/i    ; return "#{$1}a"
    when /#{PL_sb_C_im}$/i    ; return "#{word}im"
    when /#{PL_sb_C_i}$/i     ; return "#{word}i"
    end
  end

  # Handle singular nouns ending in ...s or other sibilants
  case word
  when /^(#{PL_sb_singular_s})$/i ; return "#{$1}es"
  when /^([A-Z].*s)$/             ; return "#{$1}es"
  when /(.*)([cs]h|[zx])$/i       ; return "#{$1}#{$2}es"
  # when /(.*)(us)$/i             ; return "#{$1}#{$2}es"

  # Handle ...f -> ...ves
  when /(.*[eao])lf$/i      ; return "#{$1}lves"
  when /(.*[^d])eaf$/i      ; return "#{$1}eaves"
  when /(.*[nlw])ife$/i     ; return "#{$1}ives"
  when /(.*)arf$/i          ; return "#{$1}arves"

  # Handle ...y
  when /(.*[aeiou])y$/i     ; return "#{$1}ys"
  when /([A-Z].*y)$/        ; return "#{$1}s"
  when /(.*)y$/i            ; return "#{$1}ies"

  # Handle ...o
  when /#{PL_sb_U_o_os}$/i  ; return "#{word}s"
  when /[aeiou]o$/i         ; return "#{word}s"
  when /o$/i                ; return "#{word}es"

  # Otherwise just add ...s
  else
    return "#{word}s"
  end
end # def pluralize_noun
|
839
|
+
|
840
|
+
|
841
|
+
|
842
|
+
### Pluralize special verbs
###
### Handles verbs that need table lookups or ending changes. Returns the
### plural form of +word+, or nil when +word+ is not handled here (including
### when +count+ normalizes to singular), so that the caller can try other
### rules. +count+ is optional, as in #pluralize_noun, and falls back to
### Linguistics::num when nil.
def pluralize_special_verb( word, count=nil )
  count ||= Linguistics::num
  count = normalizeCount( count )

  return nil if /^(#{PL_count_one})$/i =~ count.to_s

  # Handle user-defined verbs
  #if value = ud_match( word, PL_v_user_defined )
  #  return value
  #end

  case word

  # Handle irregular present tense (simple and compound)
  when /^(#{PL_v_irregular_pres})((\s.*)?)$/i
    return PL_v_irregular_pres_h[ $1.downcase ] + $2

  # Handle irregular future, preterite and perfect tenses
  when /^(#{PL_v_irregular_non_pres})((\s.*)?)$/i
    return word

  # Handle special cases: words that merely look like 3rd-person verbs, and
  # any remaining multi-word phrase
  when /^(#{PL_v_special_s})$/, /\s/
    return nil

  # Handle standard 3rd person (chop the ...(e)s off single words)
  when /^(.*)([cs]h|[x]|zz|ss)es$/i
    return $1 + $2
  when /^(..+)ies$/i
    return "#{$1}y"
  when /^(.+)oes$/i
    return "#{$1}o"
  when /^(.*[^s])s$/i
    return $1

  # Otherwise, a regular verb (handle elsewhere)
  else
    return nil
  end
end
|
883
|
+
|
884
|
+
|
885
|
+
### Pluralize regular verbs
|
886
|
+
def pluralize_general_verb( word, count )
  # Inflect +word+ as a regular verb for the given +count+. A singular
  # +count+ returns +word+ untouched.
  count = normalizeCount( count || Linguistics::num )
  return word if count.to_s =~ /^(#{PL_count_one})$/i

  # Ambiguous present tenses (simple and compound) are looked up; the rest
  # of the phrase is carried over unchanged.
  ambiguous = /^(#{PL_v_ambiguous_pres})((\s.*)?)$/i.match( word )
  return PL_v_ambiguous_pres_h[ ambiguous[1].downcase ] + ambiguous[2] if ambiguous

  # Ambiguous preterite and perfect tenses, and 1st or 2nd person verbs, are
  # all uninflected.
  word
end
|
907
|
+
|
908
|
+
|
909
|
+
### Handle special adjectives
|
910
|
+
def pluralize_special_adjective( word, count )
  # Inflect +word+ as a special adjective (determiner, possessive pronoun
  # or genitive noun) for the given +count+. Returns +word+ itself for a
  # singular +count+ and +nil+ when the word is not a recognised adjective.
  count = normalizeCount( count || Linguistics::num )
  return word if count.to_s =~ /^(#{PL_count_one})$/i

  # User-defined adjectives are not supported yet:
  #if value = ud_match( word, PL_adj_user_defined )
  #  return value
  #end

  if (special = /^(#{PL_adj_special})$/i.match( word ))
    # Known cases
    PL_adj_special_h[ special[1].downcase ]

  elsif (possessive = /^(#{PL_adj_poss})$/i.match( word ))
    # Possessive pronouns
    PL_adj_poss_h[ possessive[1].downcase ]

  elsif (genitive = /^(.*)'s?$/.match( word ))
    # Genitive nouns: pluralize the owner, then re-attach the apostrophe
    # (with an "s" only if the plural doesn't already end in one).
    owners = plural_noun( genitive[1] )
    (/s$/ =~ owners) ? "#{owners}'" : "#{owners}'s"
  end
end
|
944
|
+
|
945
|
+
|
946
|
+
### Returns the given word with a prepended indefinite article, unless
|
947
|
+
### +count+ is non-nil and not singular.
|
948
|
+
def indef_article( word, count )
  # A non-singular count replaces the article altogether.
  count ||= Linguistics::num
  return "#{count} #{word}" if count && count.to_s !~ /^(#{PL_count_one})$/i

  # User-defined variants are not supported yet:
  # return value if value = ud_match( word, A_a_user_defined )

  # The tests are order-sensitive: each group only sees words that every
  # earlier group rejected.
  article =
    case word

    # Explicit "an" words, abbreviations, and single letters pronounced
    # with a leading vowel sound
    when /^(#{A_explicit_an})/i, /^(#{A_abbrev})/x, /^[aefhilmnorsx][.-]/i
      'an'

    # Other single letters, consonants, and vowel-forms that are pronounced
    # with a leading consonant sound ("eu...", "once", "uni...", "use...")
    when /^[a-z][.-]/i, /^[^aeiouy]/i, /^e[uw]/i, /^onc?e\b/i,
         /^uni([^nmd]|mo)/i, /^u[bcfhjkqrst][aeiou]/i
      'a'

    # Remaining vowels, and y... before certain consonants (which implies an
    # (unnaturalized) "i.." sound)
    when /^[aeiou]/i, /^(#{A_y_cons})/i
      'an'

    # Otherwise, guess "a"
    else
      'a'
    end

  "#{article} #{word}"
end
|
997
|
+
|
998
|
+
|
999
|
+
### Transform the specified number of units-place numerals into a
|
1000
|
+
### word-phrase at the given number of +thousands+ places.
|
1001
|
+
def to_units( units, thousands=0 )
  # Word for the units digit, followed by the name of the thousands place.
  digit_word = Units[ units ]
  digit_word + to_thousands( thousands )
end
|
1004
|
+
|
1005
|
+
|
1006
|
+
### Transform the specified number of tens- and units-place numerals into a
|
1007
|
+
### word-phrase at the given number of +thousands+ places.
|
1008
|
+
def to_tens( tens, units, thousands=0 )
  # 10..19 have their own words and bypass the tens/units composition.
  return Teens[ units ] + to_thousands( thousands ) if tens == 1

  # Hyphenate only when both places contribute a word ("twenty-three").
  hyphen = (tens.nonzero? && units.nonzero?) ? '-' : ''
  Tens[ tens ] + hyphen + to_units( units, thousands )
end
|
1016
|
+
|
1017
|
+
|
1018
|
+
### Transform the specified number of hundreds-, tens-, and units-place
|
1019
|
+
### numerals into a word phrase. If the number of thousands (+thousands+) is
|
1020
|
+
### greater than 0, it will be used to determine where the decimal point is
|
1021
|
+
### in relation to the hundreds-place number.
|
1022
|
+
def to_hundreds( hundreds, tens=0, units=0, thousands=0, joinword=" and " )
  # An empty joinword still has to separate "hundred" from what follows.
  joinword = ' ' if joinword.empty?

  if hundreds.nonzero?
    # "<n> hundred", then the joinword only when tens/units follow.
    glue = (tens.nonzero? || units.nonzero?) ? joinword : ''
    to_units( hundreds ) + " hundred" + glue +
      to_tens( tens, units ) + to_thousands( thousands )
  elsif tens.nonzero? || units.nonzero?
    to_tens( tens, units ) + to_thousands( thousands )
  else
    # All three digits are zero: this group contributes nothing.
    nil
  end
end
|
1035
|
+
|
1036
|
+
### Transform the specified number into one or more words like 'thousand',
|
1037
|
+
### 'million', etc. Uses the thousands (American) system.
|
1038
|
+
def to_thousands( thousands=0 )
  # Thousands.last names the largest known place; larger values are spelled
  # as a smaller place followed by one Thousands.last per full span.
  span = Thousands.length - 1
  words = []

  (0..thousands).step( span ) do |offset|
    words << (offset.zero? ? Thousands[ thousands % span ] : Thousands.last)
  end

  words.join( " " )
end
|
1050
|
+
|
1051
|
+
|
1052
|
+
### Return the specified number +num+ as an array of number phrases.
|
1053
|
+
def number_to_words( num, config )
  # +num+ is a String (or anything with #to_s) of digits; +config+ is the
  # merged numwords option Hash (:zero, :group and :and are read here).
  # Returns an Array of word groups, most significant first.
  return [config[:zero]] if num.to_i.zero?
  chunks = []

  # Break into word-groups if groups is set
  if config[:group].nonzero?

    # Build a Regexp with <config[:group]> number of digits. Any past
    # the first are optional.
    re = Regexp::new( "(\\d)" + ("(\\d)?" * (config[:group] - 1)) )

    # Scan the string, and call the word-chunk function that deals with
    # chunks of the found number of digits.
    num.to_s.scan( re ) {|digits|
      debugMsg " digits = #{digits.inspect}"

      # Unmatched optional groups come back as nil, so count only the
      # digits actually present. (Array#nitems did this, but no longer
      # exists as of Ruby 1.9.)
      fn = NumberToWordsFunctions[ digits.compact.length ]
      numerals = digits.flatten.compact.collect {|i| i.to_i}
      debugMsg " numerals = #{numerals.inspect}"
      chunks.push fn.call( config[:zero], *numerals ).strip
    }
  else
    # The digits are consumed destructively below, so work on a copy rather
    # than on the caller's String (String#to_s returns the receiver).
    phrase = num.to_s.dup
    phrase.sub!( /\A\s*0+/, '' )
    mill = 0

    # Match backward from the end of the digits in the string, turning
    # chunks of three, of two, and of one into words. +mill+ counts how many
    # three-digit groups have already been consumed.
    mill += 1 while
      phrase.sub!( /(\d)(\d)(\d)(?=\D*\Z)/ ) {
        words = to_hundreds( $1.to_i, $2.to_i, $3.to_i, mill,
                             config[:and] )
        chunks.unshift words.strip.squeeze(' ') unless words.nil?
        ''
      }

    phrase.sub!( /(\d)(\d)(?=\D*\Z)/ ) {
      chunks.unshift to_tens( $1.to_i, $2.to_i, mill ).strip.squeeze(' ')
      ''
    }
    phrase.sub!( /(\d)(?=\D*\Z)/ ) {
      chunks.unshift to_units( $1.to_i, mill ).strip.squeeze(' ')
      ''
    }
  end

  return chunks
end
|
1100
|
+
|
1101
|
+
|
1102
|
+
#################################################################
|
1103
|
+
### P U B L I C F U N C T I O N S
|
1104
|
+
#################################################################
|
1105
|
+
|
1106
|
+
### Return the name of the language this module is for.
|
1107
|
+
def language
  # Human-readable name of the language implemented by this module.
  return "English"
end
|
1110
|
+
|
1111
|
+
|
1112
|
+
### Return the plural of the given +phrase+ if +count+ indicates it should
|
1113
|
+
### be plural.
|
1114
|
+
def plural( phrase, count=nil )
  # Separate the word from any surrounding whitespace so the latter can be
  # restored afterwards; with nothing to inflect the phrase is handed back.
  parsed = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s )
  return phrase if parsed.nil?
  leading, core, trailing = parsed.captures

  # Try the inflectors from most to least specific; the first one that
  # returns a non-nil result wins.
  inflected = pluralize_special_adjective( core, count )
  inflected ||= pluralize_special_verb( core, count )
  inflected ||= pluralize_noun( core, count )

  leading + postprocess( core, inflected ) + trailing
end
|
1126
|
+
alias_method :PL, :plural
|
1127
|
+
|
1128
|
+
|
1129
|
+
### Return the plural of the given noun +phrase+ if +count+ indicates it
|
1130
|
+
### should be plural.
|
1131
|
+
def plural_noun( phrase, count=nil )
  # Separate the noun from any surrounding whitespace so the latter can be
  # restored afterwards; with nothing to inflect the phrase is handed back.
  parsed = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s )
  return phrase if parsed.nil?
  leading, noun, trailing = parsed.captures

  inflected = postprocess( noun, pluralize_noun(noun, count) )
  leading + inflected + trailing
end
|
1139
|
+
alias_method :PL_N, :plural_noun
|
1140
|
+
|
1141
|
+
|
1142
|
+
### Return the plural of the given verb +phrase+ if +count+ indicates it
|
1143
|
+
### should be plural.
|
1144
|
+
def plural_verb( phrase, count=nil )
  # Separate the verb from any surrounding whitespace so the latter can be
  # restored afterwards; with nothing to inflect the phrase is handed back.
  parsed = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s )
  return phrase if parsed.nil?
  leading, verb, trailing = parsed.captures

  # Special verbs take precedence; the general inflector is the fallback
  # whenever the special one answers nil.
  inflected = pluralize_special_verb( verb, count )
  inflected ||= pluralize_general_verb( verb, count )

  leading + postprocess( verb, inflected ) + trailing
end
|
1154
|
+
alias_method :PL_V, :plural_verb
|
1155
|
+
|
1156
|
+
|
1157
|
+
### Return the plural of the given adjectival +phrase+ if +count+ indicates
|
1158
|
+
### it should be plural.
|
1159
|
+
def plural_adjective( phrase, count=nil )
  # Separate the adjective from any surrounding whitespace so the latter can
  # be restored afterwards; with nothing to inflect the phrase is handed back.
  parsed = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s )
  return phrase if parsed.nil?
  leading, adjective, trailing = parsed.captures

  # Adjectives the special inflector doesn't know about stay as they are.
  inflected = pluralize_special_adjective( adjective, count ) || adjective

  leading + postprocess( adjective, inflected ) + trailing
end
|
1168
|
+
alias_method :plural_adj, :plural_adjective
|
1169
|
+
alias_method :PL_ADJ, :plural_adjective
|
1170
|
+
|
1171
|
+
|
1172
|
+
### Return the given phrase with the appropriate indefinite article ("a" or
|
1173
|
+
### "an") prepended.
|
1174
|
+
def a( phrase, count=nil )
  # Separate the word from any surrounding whitespace so the latter can be
  # restored afterwards; with nothing to prefix the phrase is handed back.
  parsed = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s )
  return phrase if parsed.nil?
  leading, core, trailing = parsed.captures

  leading + indef_article( core, count ) + trailing
end
|
1182
|
+
alias_method :an, :a
|
1183
|
+
alias_method :A, :a
|
1184
|
+
alias_method :AN, :a
|
1185
|
+
|
1186
|
+
|
1187
|
+
### Translate zero-quantified +phrase+ to "no +phrase.plural+"
|
1188
|
+
def no( phrase, count=nil )
  # Returns "no <plural phrase>" when +count+ is a zero quantity, and
  # "<count> <inflected phrase>" otherwise. +count+ falls back to
  # Linguistics::num, and then to 0.
  md = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s )
  pre, word, post = md.to_a[1,3]

  # With nothing to quantify, hand the phrase back untouched like the
  # sibling inflectors do; otherwise the concatenations below would raise
  # a TypeError on nil.
  return phrase if word.nil?

  count ||= Linguistics::num || 0

  if /^#{PL_count_zero}$/ =~ count.to_s
    "#{pre}no " + plural( word, 0 ) + post
  else
    "#{pre}#{count} " + plural( word, count ) + post
  end
end
|
1199
|
+
alias_method :NO, :no
|
1200
|
+
|
1201
|
+
|
1202
|
+
### Participles
|
1203
|
+
def present_participle( word )
  # Start from the plural (uninflected) form of the verb. Work on a copy so
  # the in-place substitutions below never touch a String owned by the
  # caller or by plural_verb.
  stem = plural_verb( word.to_s, 2 ).dup

  # Apply the first spelling rule that matches; `or` short-circuits as soon
  # as one substitution succeeds.
  stem.sub!( /ie$/, 'y' ) or                      # tie -> tying
    stem.sub!( /ue$/, 'u' ) or                    # argue -> arguing
    stem.sub!( /([auy])e$/, '\1' ) or             # dye -> dying
    stem.sub!( /i$/, '' ) or
    stem.sub!( /([^e])e$/, "\\1" ) or             # make -> making
    /er$/.match( stem ) or                        # ...er verbs are left as-is
    stem.sub!( /([^aeiou][aeiouy]([bdgmnprst]))$/, "\\1\\2" )  # run -> running

  return "#{stem}ing"
end
|
1216
|
+
alias_method :part_pres, :present_participle
|
1217
|
+
alias_method :PART_PRES, :present_participle
|
1218
|
+
|
1219
|
+
|
1220
|
+
|
1221
|
+
### Return the specified number as english words. One or more configuration
|
1222
|
+
### values may be passed to control the returned String:
|
1223
|
+
###
|
1224
|
+
### [<b>:group</b>]
|
1225
|
+
### Controls how many numbers at a time are grouped together. Valid values
|
1226
|
+
### are +0+ (normal grouping), +1+ (single-digit grouping, e.g., "one,
|
1227
|
+
### two, three, four"), +2+ (double-digit grouping, e.g., "twelve,
|
1228
|
+
### thirty-four"), or +3+ (triple-digit grouping, e.g., "one twenty-three,
|
1229
|
+
### four").
|
1230
|
+
### [<b>:comma</b>]
|
1231
|
+
### Set the character/s used to separate word groups. Defaults to +", "+.
|
1232
|
+
### [<b>:and</b>]
|
1233
|
+
### Set the word and/or characters used where ' and ' (the default) is
|
1234
|
+
### normally used. Setting <tt>:and</tt> to +' '+, for example, will cause
|
1235
|
+
### +2556+ to be returned as "two-thousand, five hundred fifty-six"
|
1236
|
+
### instead of "two-thousand, five hundred and fifty-six".
|
1237
|
+
### [<b>:zero</b>]
|
1238
|
+
### Set the word used to represent the numeral +0+ in the result. +'zero'+
|
1239
|
+
### is the default.
|
1240
|
+
### [<b>:decimal</b>]
|
1241
|
+
### Set the translation of any decimal points in the number; the default
|
1242
|
+
### is +'point'+.
|
1243
|
+
### [<b>:asArray</b>]
|
1244
|
+
### If set to a true value, the number will be returned as an array of
|
1245
|
+
### word groups instead of a String.
|
1246
|
+
def numwords( number, hashargs={} )
  # Work on a private copy: the ordinal-suffix stripping below is
  # destructive, and String#to_s returns the receiver itself.
  num = number.to_s.dup
  config = NumwordDefaults.dup.update( hashargs )
  raise "Bad chunking option: #{config[:group]}" unless
    config[:group].between?( 0, 3 )

  # Array of number parts: first is everything to the left of the first
  # decimal, followed by any groups of decimal-delimited numbers after that
  parts = []

  # Wordify any sign prefix
  sign = (/\A\s*\+/ =~ num) ? 'plus' : (/\A\s*\-/ =~ num) ? 'minus' : ''

  # Strip any ordinal suffixes
  ord = true if num.sub!( /(st|nd|rd|th)\Z/, '' )

  # Split the number into chunks delimited by '.'. Ungrouped numbers only
  # split once: everything after the first point is one run of decimals.
  chunks =
    if config[:decimal].empty?
      [ num ]
    elsif config[:group].nonzero?
      num.split( /\./ )
    else
      num.split( /\./, 2 )
    end

  # Wordify each chunk, pushing arrays into the parts array
  chunks.each_with_index do |chunk, section|
    chunk.gsub!( /\D+/, '' )

    # An empty whole-number part (e.g. ".5") becomes an empty array; empty
    # chunks after the decimal fall through and are wordified as zero.
    if chunk.empty? && section.zero?
      parts.push []
      next
    end

    # Split the number section into wordified parts unless this is the
    # second or succeeding part of a non-group number, in which case the
    # decimals are read out digit by digit.
    if config[:group].zero? && section.nonzero?
      parts.push number_to_words( chunk, config.dup.update(:group => 1) )
    else
      parts.push number_to_words( chunk, config )
    end
  end

  # An empty input produces no chunks at all; give the code below a
  # whole-number part to work with instead of failing on nil.
  parts.push( [] ) if parts.empty?

  debugMsg "Parts => #{parts.inspect}"

  # Turn the last word of the whole-number part back into an ordinal if
  # the original number came in that way.
  if ord && !parts[0].empty?
    parts[0][-1] = ordinal( parts[0].last )
  end

  # If the caller's expecting an Array return, just flatten and return the
  # parts array.
  if config[:asArray]
    parts[0].unshift( sign ) unless sign.empty?
    return parts.flatten
  end

  # Catenate each sub-parts array into a whole number part and one or more
  # post-decimal parts. If grouping is turned on, all sub-parts get joined
  # with commas, otherwise just the whole-number part is.
  if config[:group].zero?
    whole = parts[0]

    # (Array#nitems is gone as of Ruby 1.9; count the non-nil groups.)
    if whole.compact.length > 1

      # Join all but the last part together with commas
      wholenum = whole[0...-1].join( config[:comma] )

      # If the last part is just a single word, append it to the
      # wholenum part with the configured 'and'. This is to get things
      # like 'three thousand and three' instead of 'three thousand, three'.
      if /^\s*(\S+)\s*$/ =~ whole.last
        joinword = config[:and].empty? ? ' ' : config[:and]
        wholenum += joinword + whole.last
      else
        wholenum += config[:comma] + whole.last
      end
    else
      wholenum = whole[0]
    end
    decimals = parts[1..-1].collect {|part| part.join(" ")}

    debugMsg "Wholenum: #{wholenum.inspect}; decimals: #{decimals.inspect}"

    # Join with the configured decimal; if it's empty, just join with
    # spaces.
    joiner = config[:decimal].empty? ? " " : " #{config[:decimal]} "
    words = ([ wholenum ] + decimals).join( joiner ).strip

    # Keep the sign a separate word ("minus five", not "minusfive").
    return sign.empty? ? words : "#{sign} #{words}".strip
  else
    # NOTE(review): the sign is not included in grouped output; this
    # matches the previous behaviour -- confirm whether that is intended.
    return parts.compact.
      separate( config[:decimal] ).
      delete_if {|el| el.empty?}.
      join( config[:comma] ).
      strip
  end
end
|
1354
|
+
alias_method :NUMWORDS, :numwords
|
1355
|
+
|
1356
|
+
|
1357
|
+
### Transform the given +number+ into an ordinal word. The +number+ object
|
1358
|
+
### can be either an Integer or a String.
|
1359
|
+
def ordinal( number )
  case number
  when Integer
    # Pick the suffix from the magnitude: Ruby's % is never negative for a
    # positive divisor, so -1 % 100 is 99 and -1 would otherwise get the
    # suffix for 99. The last two digits are tried first so they can
    # override the units digit.
    magnitude = number.abs
    return number.to_s + (Nth[ magnitude % 100 ] || Nth[ magnitude % 10 ])

  else
    # Number words: swap the trailing cardinal suffix for its ordinal form.
    return number.to_s.sub( /(#{OrdinalSuffixes})\Z/ ) { Ordinals[$1] }
  end
end
|
1368
|
+
alias_method :ORD, :ordinal
|
1369
|
+
|
1370
|
+
|
1371
|
+
### Return a phrase describing the specified +number+ of objects in the
|
1372
|
+
### given +phrase+. The following options can be used to control the makeup
|
1373
|
+
### of the returned quantity String:
|
1374
|
+
###
|
1375
|
+
### [<b>:joinword</b>]
|
1376
|
+
### Sets the word (and any surrounding spaces) used as the word separating the
|
1377
|
+
### quantity from the noun in the resulting string. Defaults to <tt>' of
|
1378
|
+
### '</tt>.
|
1379
|
+
def quantify( phrase, number=0, args={} )
  # +number+ is coerced with #to_i; +args+ may override :joinword.
  num = number.to_i
  config = QuantifyDefaults.dup.update( args )

  case num
  when 0
    no( phrase )
  when 1
    a( phrase )
  when SeveralRange
    "several " + plural( phrase, num )
  when NumberRange
    "a number of " + plural( phrase, num )
  when NumerousRange
    "numerous " + plural( phrase, num )
  when ManyRange
    "many " + plural( phrase, num )
  else

    # Anything bigger than the ManyRange gets described like
    # "hundreds of thousands of..." or "millions of..."
    # depending, of course, on how many there are.
    #
    # The order of magnitude is taken from the digit count rather than
    # Math.log10: it stays exact for integers too large for a Float, and
    # it is defined for negative numbers.
    magnitude = num.abs.to_s.length - 1
    thousands, subthousands = magnitude.divmod( 3 )

    stword =
      case subthousands
      when 2 then "hundreds"
      when 1 then "tens"
      end

    thword = plural( to_thousands(thousands).strip )
    thword = nil if thword.empty?

    [ # Hundreds (of)...
      stword,

      # thousands (of)
      thword,

      # stars.
      plural( phrase, num )
    ].compact.join( config[:joinword] )
  end
end
|
1425
|
+
|
1426
|
+
|
1427
|
+
### Return the specified +obj+ (which must support the <tt>#collect</tt>
|
1428
|
+
### method) as a conjunction. Each item is converted to a String if it is
|
1429
|
+
### not already (using #to_s) unless a block is given, in which case it is
|
1430
|
+
### called once for each object in the array, and the stringified return
|
1431
|
+
### value from the block is used instead. Returning +nil+ causes that
|
1432
|
+
### particular element to be omitted from the resulting conjunction. The
|
1433
|
+
### following options can be used to control the makeup of the returned
|
1434
|
+
### conjunction String:
|
1435
|
+
###
|
1436
|
+
### [<b>:separator</b>]
|
1437
|
+
### Specify one or more characters to separate items in the resulting
|
1438
|
+
### list. Defaults to <tt>', '</tt>.
|
1439
|
+
### [<b>:altsep</b>]
|
1440
|
+
### An alternate separator to use if any of the resulting conjunction's
|
1441
|
+
### clauses contain the <tt>:separator</tt> character/s. Defaults to <tt>'; '</tt>.
|
1442
|
+
### [<b>:penultimate</b>]
|
1443
|
+
### Flag that indicates whether or not to join the last clause onto the
|
1444
|
+
### rest of the conjunction using a penultimate <tt>:separator</tt>. E.g.,
|
1445
|
+
### %w{duck cow dog}.en.conjunction
|
1446
|
+
### # => "a duck, a cow, and a dog"
|
1447
|
+
### %w{duck cow dog}.en.conjunction( :penultimate => false )
|
1448
|
+
### # => "a duck, a cow and a dog"
|
1449
|
+
### Default to <tt>true</tt>.
|
1450
|
+
### [<b>:conjunctive</b>]
|
1451
|
+
### Sets the word used as the conjunctive (separating word) of the
|
1452
|
+
### resulting string. Default to <tt>'and'</tt>.
|
1453
|
+
### [<b>:combine</b>]
|
1454
|
+
### If set to <tt>true</tt> (the default), items which are identical (after
|
1455
|
+
### surrounding spaces are stripped) will be combined in the resulting
|
1456
|
+
### conjunction. E.g.,
|
1457
|
+
### %w{goose cow goose dog}.en.conjunction
|
1458
|
+
### # => "two geese, a cow, and a dog"
|
1459
|
+
### %w{goose cow goose dog}.en.conjunction( :combine => false )
|
1460
|
+
### # => "a goose, a cow, a goose, and a dog"
|
1461
|
+
### [<b>:casefold</b>]
|
1462
|
+
### If set to <tt>true</tt> (the default), then items are compared
|
1463
|
+
### case-insensitively when combining them. This has no effect if
|
1464
|
+
### <tt>:combine</tt> is <tt>false</tt>.
|
1465
|
+
### [<b>:generalize</b>]
|
1466
|
+
### If set to <tt>true</tt>, then quantities of combined items are turned into
|
1467
|
+
### general descriptions instead of exact amounts.
|
1468
|
+
### ary = %w{goose pig dog horse goose reindeer goose dog horse}
|
1469
|
+
### ary.en.conjunction
|
1470
|
+
### # => "three geese, two dogs, two horses, a pig, and a reindeer"
|
1471
|
+
### ary.en.conjunction( :generalize => true )
|
1472
|
+
### # => "several geese, several dogs, several horses, a pig, and a reindeer"
|
1473
|
+
### See the #quantify method for specifics on how quantities are
|
1474
|
+
### generalized. Generalization defaults to <tt>false</tt>, and has no effect if
|
1475
|
+
### :combine is <tt>false</tt>.
|
1476
|
+
### [<b>:quantsort</b>]
|
1477
|
+
### If set to <tt>true</tt> (the default), items which are combined in the
|
1478
|
+
### resulting conjunction will be listed in order of amount, with greater
|
1479
|
+
### quantities sorted first. If <tt>:quantsort</tt> is <tt>false</tt>, combined items
|
1480
|
+
### will appear where the first instance of them occurred in the
|
1481
|
+
### list. This sort is also the fallback for identical quantities (ie.,
|
1482
|
+
### items of the same quantity will be listed in the order they appeared
|
1483
|
+
### in the source list).
|
1484
|
+
###
|
1485
|
+
def conjunction( obj, args={} )
  config = ConjunctionDefaults.dup.update( args )

  # Transform items in the obj to phrases. Block results are stringified,
  # and items for which the block returned nil are dropped.
  phrases =
    if block_given?
      obj.collect {|item| yield(item) }.compact.collect {|item| item.to_s }
    else
      obj.collect {|item| item.to_s }
    end

  # No need for a conjunction if there's only one thing
  return a( phrases[0] ) if phrases.length < 2

  # Set up a Proc to derive a collector key from a phrase depending on the
  # configuration
  keyfunc =
    if config[:casefold]
      proc {|key| key.downcase.strip }
    else
      proc {|key| key.strip }
    end

  # Count phrases that the keyfunc munges into the same key. When combining
  # (:combine => true) only the first occurrence of each is kept.
  collector = {}
  if config[:combine]
    firsts = []
    phrases.each do |phrase|
      key = keyfunc[ phrase ]
      if collector.key?( key )
        collector[ key ] += 1
      else
        collector[ key ] = 1
        firsts << phrase
      end
    end
    phrases = firsts
  else
    # If we're not combining, just make everything have a count of 1.
    phrases.uniq.each {|phrase| collector[ keyfunc[phrase] ] = 1 }
  end

  # If sort-by-quantity is turned on, sort the phrases first by how many
  # there are (most-first), and then by the order they were specified in.
  if config[:quantsort] && config[:combine]
    origorder = {}
    phrases.each_with_index {|phrase, i| origorder[ keyfunc[phrase] ] ||= i }
    phrases.sort! {|a, b|
      (collector[ keyfunc[b] ] <=> collector[ keyfunc[a] ]).nonzero? ||
        (origorder[ keyfunc[a] ] <=> origorder[ keyfunc[b] ])
    }
  end

  # Set up a filtering function that adds either an indefinite article, an
  # indefinite quantifier, or a definite quantifier to each phrase
  # depending on the configuration and the count of phrases in the
  # collector.
  filter =
    if config[:generalize]
      proc {|phrase, count| quantify( phrase, count ) }
    else
      proc {|phrase, count|
        if count > 1
          "%s %s" % [
            # :TODO: Make this threshold settable
            # numwords is called directly, like the other inflectors here,
            # so it works without the #en inflector on Integer.
            count < 10 ? numwords( count ) : count.to_s,
            plural( phrase, count )
          ]
        else
          a( phrase )
        end
      }
    end

  # Now use the configured filter to turn each phrase into its final form.
  phrases.collect! {|phrase| filter[ phrase, collector[keyfunc[phrase]] ] }

  # Remember how many clauses there are *before* any of them are merged:
  # the choice of separator below depends on it.
  clause_count = phrases.length

  # Prepend the conjunctive to the last element unless it's empty or
  # there's only one element
  unless config[:conjunctive].strip.empty? || clause_count < 2
    phrases[-1] = "#{config[:conjunctive]} #{phrases[-1]}"
  end

  # Catenate the last two elements if there's no penultimate separator
  # (and there are two elements to catenate).
  if !config[:penultimate] && clause_count >= 2
    final = phrases.pop
    phrases[-1] = "#{phrases[-1]} #{final}"
  end

  # Pick a separator based on how many clauses there were and whether or
  # not there's already an instance of it in the phrases. The separator is
  # compared literally, so characters special to Regexps are safe.
  sep =
    if clause_count <= 2
      ' '
    elsif phrases.any? {|phrase| phrase.include?( config[:separator] ) }
      config[:altsep]
    else
      config[:separator]
    end

  return phrases.join( sep )
end
|
1591
|
+
|
1592
|
+
|
1593
|
+
### Turns a camel-case +string+ ("camelCaseToEnglish") to plain English
|
1594
|
+
### ("camel case to english"). Each word is decapitalized.
|
1595
|
+
def camel_case_to_english( string )
  # Open up every lowercase-to-uppercase transition with a space, then
  # lowercase the whole phrase.
  spaced = string.to_s.gsub( /([a-z])([A-Z])/, '\1 \2' )
  spaced.downcase
end
|
1598
|
+
|
1599
|
+
|
1600
|
+
### Turns an English language +string+ into a CamelCase word.
|
1601
|
+
def english_to_camel_case( string )
  # Close up each run of whitespace that precedes a letter and capitalize
  # that letter. The match is case-insensitive so that words which are
  # already capitalized are joined too, instead of keeping their space.
  string.to_s.gsub( /\s+([a-z])/i ) { $1.upcase }
end
|
1604
|
+
|
1605
|
+
|
1606
|
+
### This method doesn't work quite right yet. It does okay for simple cases,
|
1607
|
+
### but it misses more complex ones, e.g. 'as' used as a coordinating
|
1608
|
+
### conjunction in "A Portrait of the Artist as a Young Man". Perhaps after
|
1609
|
+
### there's a working (non-leaking) LinkParser for Ruby, this can be fixed
|
1610
|
+
### up. Until then it'll just be undocumented.
|
1611
|
+
|
1612
|
+
### Returns the given +string+ as a title-cased phrase.
|
1613
|
+
def titlecase( string ) # :nodoc:

  # Split on word-boundaries; the tokens alternate between words and the
  # punctuation/whitespace between them.
  words = string.to_s.split( /\b/ )
  return '' if words.empty?

  # Always capitalize the first and last words
  words.first.capitalize!
  words.last.capitalize!

  # Now scan the rest of the tokens, skipping non-words and capitalization
  # exceptions.
  words.each_with_index do |word, i|

    # Non-words
    next unless /^\w+$/.match( word )

    # Skip exception-words
    next if TitleCaseExceptions.include?( word )

    # Skip second parts of contractions. Only look back when there really
    # are two preceding tokens: negative indexes would wrap around to the
    # end of the array.
    next if i >= 2 && words[i - 1] == "'" && /\w/.match( words[i - 2] )

    # Have to do it this way instead of capitalize! because that method
    # also downcases all other letters.
    word.gsub!( /^(\w)(.*)/ ) { $1.upcase + $2 }
  end

  return words.join
end
|
1642
|
+
|
1643
|
+
|
1644
|
+
### Returns the proper noun form of a string by capitalizing most of the
|
1645
|
+
### words.
|
1646
|
+
###
|
1647
|
+
### Examples:
|
1648
|
+
### English.proper_noun("bosnia and herzegovina") ->
|
1649
|
+
### "Bosnia and Herzegovina"
|
1650
|
+
### English.proper_noun("macedonia, the former yugoslav republic of") ->
|
1651
|
+
### "Macedonia, the Former Yugoslav Republic of"
|
1652
|
+
### English.proper_noun("virgin islands, u.s.") ->
|
1653
|
+
### "Virgin Islands, U.S."
|
1654
|
+
def proper_noun( string )
  # Coerce with #to_s like the other string helpers, so Symbols and other
  # non-String objects are accepted. Splitting on a captured pattern keeps
  # the separators (runs of spaces and periods) in the list, so the join at
  # the end reproduces them exactly.
  return string.to_s.split( /([ .]+)/ ).collect {|word|

    # Leave separators, anything that doesn't start with a lowercase
    # letter, and the small words "and", "the" and "of" as they are.
    next word unless /\A[a-z]/.match( word ) &&
      ! (%w{and the of}.include?( word ))
    word.capitalize
  }.join
end
|
1661
|
+
|
1662
|
+
end # module Linguistics::EN
|
1663
|
+
|
1664
|
+
|
1665
|
+
### Add the #separate and #separate! methods to Array.
|
1666
|
+
class Array # :nodoc:

  ### Non-destructive version of #separate!: returns a copy of the receiver
  ### with a separator placed between each pair of neighbouring members.
  ### The separator is +value+, unless a block is given, in which case the
  ### block is called with each pair and its result is used instead.
  def separate( value=:__no_arg__, &block )
    self.dup.separate!( value, &block )
  end

  ### Inserts a separator between each pair of neighbouring members of the
  ### receiver, in place, and returns the receiver. See #separate for how the
  ### separator is chosen. Raises an ArgumentError if neither a +value+ nor
  ### a block is given.
  def separate!( value=:__no_arg__ )
    unless value != :__no_arg__ || block_given?
      raise ArgumentError, "wrong number of arguments: (0 for 1)"
    end

    # Separators end up at the odd indexes. The last one is computed from
    # the length before anything has been inserted.
    1.step( (self.length * 2) - 2, 2 ) do |pos|
      filler = block_given? ? yield( self[pos - 1, 2] ) : value
      self.insert( pos, filler )
    end

    self
  end

end
|
1694
|
+
|