Linguistics 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Artistic +127 -0
- data/ChangeLog +444 -0
- data/MANIFEST +19 -0
- data/README +178 -0
- data/README.english +245 -0
- data/TODO +17 -0
- data/experiments/randobjlist.rb +34 -0
- data/install.rb +154 -0
- data/lib/linguistics/en/infinitive.rb +1149 -0
- data/lib/linguistics/en/linkparser.rb +142 -0
- data/lib/linguistics/en/wordnet.rb +253 -0
- data/lib/linguistics/en.rb +1694 -0
- data/lib/linguistics/iso639.rb +456 -0
- data/lib/linguistics.rb +368 -0
- data/redist/crosscase.rb +298 -0
- data/test.rb +110 -0
- data/tests/en/conjunction.tests.rb +114 -0
- data/tests/en/inflect.tests.rb +1378 -0
- data/tests/lingtestcase.rb +239 -0
- data/tests/use.tests.rb +99 -0
- data/utils.rb +689 -0
- metadata +58 -0
@@ -0,0 +1,1694 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
#
|
3
|
+
# = Linguistics::EN
|
4
|
+
#
|
5
|
+
# This module contains English-language linguistic functions for the Linguistics
|
6
|
+
# module. It can be either loaded directly, or by passing some variant of 'en'
|
7
|
+
# or 'eng' to the Linguistics::use method.
|
8
|
+
#
|
9
|
+
# The functions contained by the module provide:
|
10
|
+
#
|
11
|
+
# == Plural Inflections
|
12
|
+
#
|
13
|
+
# Plural forms of all nouns, most verbs, and some adjectives are provided. Where
|
14
|
+
# appropriate, "classical" variants (for example: "brother" -> "brethren",
|
15
|
+
# "dogma" -> "dogmata", etc.) are also provided.
|
16
|
+
#
|
17
|
+
# These can be accessed via the #plural, #plural_noun, #plural_verb, and
|
18
|
+
# #plural_adjective methods.
|
19
|
+
#
|
20
|
+
# == Indefinite Articles
|
21
|
+
#
|
22
|
+
# Pronunciation-based "a"/"an" selection is provided for all English words, and
|
23
|
+
# most initialisms.
|
24
|
+
#
|
25
|
+
# See: #a, #an, and #no.
|
26
|
+
#
|
27
|
+
# == Numbers to Words
|
28
|
+
#
|
29
|
+
# Conversion from Numeric values to words is supported using the American
|
30
|
+
# "thousands" system. E.g., 2561 => "two thousand, five hundred and sixty-one".
|
31
|
+
#
|
32
|
+
# See the #numwords method.
|
33
|
+
#
|
34
|
+
# == Ordinals
|
35
|
+
#
|
36
|
+
# It is also possible to inflect numerals (1,2,3) and number words ("one",
|
37
|
+
# "two", "three") to ordinals (1st, 2nd, 3rd) and ordinates ("first", "second",
|
38
|
+
# "third").
|
39
|
+
#
|
40
|
+
# == Conjunctions
|
41
|
+
#
|
42
|
+
# This module also supports the creation of English conjunctions from Arrays of
|
43
|
+
# Strings or objects which respond to the #to_s message. E.g.,
|
44
|
+
#
|
45
|
+
# %w{cow pig chicken cow dog cow duck duck moose}.en.conjunction
|
46
|
+
# ==> "three cows, two ducks, a pig, a chicken, a dog, and a moose"
|
47
|
+
#
|
48
|
+
# == Infinitives
|
49
|
+
#
|
50
|
+
# Returns the infinitive form of English verbs:
|
51
|
+
#
|
52
|
+
# "dodging".en.infinitive
|
53
|
+
# ==> "dodge"
|
54
|
+
#
|
55
|
+
#
|
56
|
+
# == Authors
|
57
|
+
#
|
58
|
+
# * Michael Granger <ged@FaerieMUD.org>
|
59
|
+
#
|
60
|
+
# == Copyright
|
61
|
+
#
|
62
|
+
# This module is copyright (c) 2003-2005 The FaerieMUD Consortium. All rights
|
63
|
+
# reserved.
|
64
|
+
#
|
65
|
+
# This module is free software. You may use, modify, and/or redistribute this
|
66
|
+
# software under the terms of the Perl Artistic License. (See
|
67
|
+
# http://language.perl.com/misc/Artistic.html)
|
68
|
+
#
|
69
|
+
# The inflection functions of this module were adapted from Damian Conway's
|
70
|
+
# Lingua::EN::Inflect Perl module:
|
71
|
+
#
|
72
|
+
# Copyright (c) 1997-2000, Damian Conway. All Rights Reserved.
|
73
|
+
# This module is free software. It may be used, redistributed
|
74
|
+
# and/or modified under the same terms as Perl itself.
|
75
|
+
#
|
76
|
+
# The conjunctions code was adapted from the Lingua::Conjunction Perl module
|
77
|
+
# written by Robert Rothenberg and Damian Conway, which has no copyright
|
78
|
+
# statement included.
|
79
|
+
#
|
80
|
+
# == Version
|
81
|
+
#
|
82
|
+
# $Id: en.rb,v 1.8 2003/09/14 10:47:12 deveiant Exp $
|
83
|
+
#
|
84
|
+
|
85
|
+
|
86
|
+
### This module contains English-language linguistics functions accessible from
|
87
|
+
### the Linguistics module, or as a standalone function library.
|
88
|
+
module Linguistics::EN
|
89
|
+
|
90
|
+
# CrossCase is optional: try to load it, and mix it into this module only
# when the require succeeds.
begin
  require 'crosscase'
rescue LoadError
  # Deliberately ignored -- the library is simply not mixed in when it is
  # not installed.
else
  include CrossCase
end
|
96
|
+
|
97
|
+
# Load in the secondary modules and add them to Linguistics::EN.
|
98
|
+
require 'linguistics/en/infinitive'
|
99
|
+
require 'linguistics/en/wordnet'
|
100
|
+
require 'linguistics/en/linkparser'
|
101
|
+
|
102
|
+
# Subversion revision
|
103
|
+
SVNRev = %q$Rev$
|
104
|
+
|
105
|
+
# Subversion revision tag
|
106
|
+
SVNId = %q$Id: en.rb,v 1.8 2003/09/14 10:47:12 deveiant Exp $
|
107
|
+
|
108
|
+
# Add 'english' to the list of default languages
|
109
|
+
Linguistics::DefaultLanguages.push( :en )
|
110
|
+
|
111
|
+
|
112
|
+
#################################################################
|
113
|
+
### U T I L I T Y F U N C T I O N S
|
114
|
+
#################################################################
|
115
|
+
|
116
|
+
### Wrap one or more parts in a non-capturing alternation. The parts may be
### Strings or (nested) Arrays of Strings; they are flattened and joined
### with "|". Returns the pattern source as a String (not a Regexp), e.g.
### <tt>matchgroup("a", ["b", "c"])</tt> => <tt>"(?:a|b|c)"</tt>.
def self::matchgroup( *parts )
  alternatives = parts.flatten
  "(?:" + alternatives.join( "|" ) + ")"
end
|
121
|
+
|
122
|
+
|
123
|
+
#################################################################
|
124
|
+
### C O N S T A N T S
|
125
|
+
#################################################################
|
126
|
+
|
127
|
+
# :stopdoc:
|
128
|
+
|
129
|
+
#
|
130
|
+
# Plurals
|
131
|
+
#
|
132
|
+
|
133
|
+
PL_sb_irregular_s = {
|
134
|
+
"ephemeris" => "ephemerides",
|
135
|
+
"iris" => "irises|irides",
|
136
|
+
"clitoris" => "clitorises|clitorides",
|
137
|
+
"corpus" => "corpuses|corpora",
|
138
|
+
"opus" => "opuses|opera",
|
139
|
+
"genus" => "genera",
|
140
|
+
"mythos" => "mythoi",
|
141
|
+
"penis" => "penises|penes",
|
142
|
+
"testis" => "testes",
|
143
|
+
}
|
144
|
+
|
145
|
+
PL_sb_irregular_h = {
|
146
|
+
"child" => "children",
|
147
|
+
"brother" => "brothers|brethren",
|
148
|
+
"loaf" => "loaves",
|
149
|
+
"hoof" => "hoofs|hooves",
|
150
|
+
"beef" => "beefs|beeves",
|
151
|
+
"money" => "monies",
|
152
|
+
"mongoose" => "mongooses",
|
153
|
+
"ox" => "oxen",
|
154
|
+
"cow" => "cows|kine",
|
155
|
+
"soliloquy" => "soliloquies",
|
156
|
+
"graffito" => "graffiti",
|
157
|
+
"prima donna" => "prima donnas|prime donne",
|
158
|
+
"octopus" => "octopuses|octopodes",
|
159
|
+
"genie" => "genies|genii",
|
160
|
+
"ganglion" => "ganglions|ganglia",
|
161
|
+
"trilby" => "trilbys",
|
162
|
+
"turf" => "turfs|turves",
|
163
|
+
}.update( PL_sb_irregular_s )
|
164
|
+
PL_sb_irregular = matchgroup PL_sb_irregular_h.keys
|
165
|
+
|
166
|
+
|
167
|
+
# Classical "..a" -> "..ata"
|
168
|
+
PL_sb_C_a_ata = matchgroup %w[
|
169
|
+
anathema bema carcinoma charisma diploma
|
170
|
+
dogma drama edema enema enigma lemma
|
171
|
+
lymphoma magma melisma miasma oedema
|
172
|
+
sarcoma schema soma stigma stoma trauma
|
173
|
+
gumma pragma
|
174
|
+
].collect {|word| word[0...-1]}
|
175
|
+
|
176
|
+
# Unconditional "..a" -> "..ae"
|
177
|
+
PL_sb_U_a_ae = matchgroup %w[
|
178
|
+
alumna alga vertebra persona
|
179
|
+
]
|
180
|
+
|
181
|
+
# Classical "..a" -> "..ae"
|
182
|
+
PL_sb_C_a_ae = matchgroup %w[
|
183
|
+
amoeba antenna formula hyperbola
|
184
|
+
medusa nebula parabola abscissa
|
185
|
+
hydra nova lacuna aurora .*umbra
|
186
|
+
flora fauna
|
187
|
+
]
|
188
|
+
|
189
|
+
# Classical "..en" -> "..ina"
|
190
|
+
PL_sb_C_en_ina = matchgroup %w[
|
191
|
+
stamen foramen lumen
|
192
|
+
].collect {|word| word[0...-2] }
|
193
|
+
|
194
|
+
# Unconditional "..um" -> "..a"
|
195
|
+
PL_sb_U_um_a = matchgroup %w[
|
196
|
+
bacterium agendum desideratum erratum
|
197
|
+
stratum datum ovum extremum
|
198
|
+
candelabrum
|
199
|
+
].collect {|word| word[0...-2] }
|
200
|
+
|
201
|
+
# Classical "..um" -> "..a"
#
# Each word is stored without its final "um"; the noun pluralizer matches the
# stems as /(stem)um$/. "encomium" and "millennium" are the correct
# spellings; the misspelled "enconium" and "millenium" entries are kept so
# that input which relied on them still inflects the same way.
PL_sb_C_um_a = matchgroup %w[
  maximum minimum momentum optimum
  quantum cranium curriculum dictum
  phylum aquarium compendium emporium
  enconium encomium gymnasium honorarium interregnum
  lustrum memorandum millenium millennium rostrum
  spectrum speculum stadium trapezium
  ultimatum medium vacuum velum
  consortium
].collect {|word| word[0...-2]}
|
212
|
+
|
213
|
+
# Unconditional "..us" -> "i"
|
214
|
+
PL_sb_U_us_i = matchgroup %w[
|
215
|
+
alumnus alveolus bacillus bronchus
|
216
|
+
locus nucleus stimulus meniscus
|
217
|
+
].collect {|word| word[0...-2]}
|
218
|
+
|
219
|
+
# Classical "..us" -> "..i"
|
220
|
+
PL_sb_C_us_i = matchgroup %w[
|
221
|
+
focus radius genius
|
222
|
+
incubus succubus nimbus
|
223
|
+
fungus nucleolus stylus
|
224
|
+
torus umbilicus uterus
|
225
|
+
hippopotamus
|
226
|
+
].collect {|word| word[0...-2]}
|
227
|
+
|
228
|
+
# Classical "..us" -> "..us" (assimilated 4th declension latin nouns)
|
229
|
+
PL_sb_C_us_us = matchgroup %w[
|
230
|
+
status apparatus prospectus sinus
|
231
|
+
hiatus impetus plexus
|
232
|
+
]
|
233
|
+
|
234
|
+
# Unconditional "..on" -> "a"
|
235
|
+
PL_sb_U_on_a = matchgroup %w[
|
236
|
+
criterion perihelion aphelion
|
237
|
+
phenomenon prolegomenon noumenon
|
238
|
+
organon asyndeton hyperbaton
|
239
|
+
].collect {|word| word[0...-2]}
|
240
|
+
|
241
|
+
# Classical "..on" -> "..a"
|
242
|
+
PL_sb_C_on_a = matchgroup %w[
|
243
|
+
oxymoron
|
244
|
+
].collect {|word| word[0...-2]}
|
245
|
+
|
246
|
+
# Classical "..o" -> "..i" (but normally -> "..os")
|
247
|
+
PL_sb_C_o_i_a = %w[
|
248
|
+
solo soprano basso alto
|
249
|
+
contralto tempo piano
|
250
|
+
]
|
251
|
+
PL_sb_C_o_i = matchgroup PL_sb_C_o_i_a.collect{|word| word[0...-1]}
|
252
|
+
|
253
|
+
# Always "..o" -> "..os"
|
254
|
+
PL_sb_U_o_os = matchgroup( %w[
|
255
|
+
albino archipelago armadillo
|
256
|
+
commando crescendo fiasco
|
257
|
+
ditto dynamo embryo
|
258
|
+
ghetto guano inferno
|
259
|
+
jumbo lumbago magneto
|
260
|
+
manifesto medico octavo
|
261
|
+
photo pro quarto
|
262
|
+
canto lingo generalissimo
|
263
|
+
stylo rhino
|
264
|
+
] | PL_sb_C_o_i_a )
|
265
|
+
|
266
|
+
|
267
|
+
# Unconditional "..[ei]x" -> "..ices"
|
268
|
+
PL_sb_U_ex_ices = matchgroup %w[
|
269
|
+
codex murex silex
|
270
|
+
].collect {|word| word[0...-2]}
|
271
|
+
PL_sb_U_ix_ices = matchgroup %w[
|
272
|
+
radix helix
|
273
|
+
].collect {|word| word[0...-2]}
|
274
|
+
|
275
|
+
# Classical "..[ei]x" -> "..ices"
|
276
|
+
PL_sb_C_ex_ices = matchgroup %w[
|
277
|
+
vortex vertex cortex latex
|
278
|
+
pontifex apex index simplex
|
279
|
+
].collect {|word| word[0...-2]}
|
280
|
+
PL_sb_C_ix_ices = matchgroup %w[
|
281
|
+
appendix
|
282
|
+
].collect {|word| word[0...-2]}
|
283
|
+
|
284
|
+
|
285
|
+
# Arabic: ".." -> "..i"
|
286
|
+
PL_sb_C_i = matchgroup %w[
|
287
|
+
afrit afreet efreet
|
288
|
+
]
|
289
|
+
|
290
|
+
|
291
|
+
# Hebrew: ".." -> "..im"
|
292
|
+
PL_sb_C_im = matchgroup %w[
|
293
|
+
goy seraph cherub
|
294
|
+
]
|
295
|
+
|
296
|
+
# Unconditional "..man" -> "..mans"
|
297
|
+
PL_sb_U_man_mans = matchgroup %w[
|
298
|
+
human
|
299
|
+
Alabaman Bahaman Burman German
|
300
|
+
Hiroshiman Liman Nakayaman Oklahoman
|
301
|
+
Panaman Selman Sonaman Tacoman Yakiman
|
302
|
+
Yokohaman Yuman
|
303
|
+
]
|
304
|
+
|
305
|
+
|
306
|
+
PL_sb_uninflected_s = [
|
307
|
+
# Pairs or groups subsumed to a singular...
|
308
|
+
"breeches", "britches", "clippers", "gallows", "hijinks",
|
309
|
+
"headquarters", "pliers", "scissors", "testes", "herpes",
|
310
|
+
"pincers", "shears", "proceedings", "trousers",
|
311
|
+
|
312
|
+
# Unassimilated Latin 4th declension
|
313
|
+
"cantus", "coitus", "nexus",
|
314
|
+
|
315
|
+
# Recent imports...
|
316
|
+
"contretemps", "corps", "debris",
|
317
|
+
".*ois",
|
318
|
+
|
319
|
+
# Diseases
|
320
|
+
".*measles", "mumps",
|
321
|
+
|
322
|
+
# Miscellaneous others...
|
323
|
+
"diabetes", "jackanapes", "series", "species", "rabies",
|
324
|
+
"chassis", "innings", "news", "mews",
|
325
|
+
]
|
326
|
+
|
327
|
+
|
328
|
+
# Don't inflect in classical mode, otherwise normal inflection
|
329
|
+
PL_sb_uninflected_herd = matchgroup %w[
|
330
|
+
wildebeest swine eland bison buffalo
|
331
|
+
elk moose rhinoceros
|
332
|
+
]
|
333
|
+
|
334
|
+
PL_sb_uninflected = matchgroup [
|
335
|
+
|
336
|
+
# Some fish and herd animals
|
337
|
+
".*fish", "tuna", "salmon", "mackerel", "trout",
|
338
|
+
"bream", "sea[- ]bass", "carp", "cod", "flounder", "whiting",
|
339
|
+
|
340
|
+
".*deer", ".*sheep",
|
341
|
+
|
342
|
+
# All nationals ending in -ese
|
343
|
+
"Portuguese", "Amoyese", "Borghese", "Congoese", "Faroese",
|
344
|
+
"Foochowese", "Genevese", "Genoese", "Gilbertese", "Hottentotese",
|
345
|
+
"Kiplingese", "Kongoese", "Lucchese", "Maltese", "Nankingese",
|
346
|
+
"Niasese", "Pekingese", "Piedmontese", "Pistoiese", "Sarawakese",
|
347
|
+
"Shavese", "Vermontese", "Wenchowese", "Yengeese",
|
348
|
+
".*[nrlm]ese",
|
349
|
+
|
350
|
+
# Some words ending in ...s (often pairs taken as a whole)
|
351
|
+
PL_sb_uninflected_s,
|
352
|
+
|
353
|
+
# Diseases
|
354
|
+
".*pox",
|
355
|
+
|
356
|
+
# Other oddities
|
357
|
+
"graffiti", "djinn"
|
358
|
+
]
|
359
|
+
|
360
|
+
|
361
|
+
# Singular words ending in ...s (all inflect with ...es)
|
362
|
+
PL_sb_singular_s = matchgroup %w[
|
363
|
+
.*ss
|
364
|
+
acropolis aegis alias arthritis asbestos atlas
|
365
|
+
bathos bias bronchitis bursitis caddis cannabis
|
366
|
+
canvas chaos cosmos dais digitalis encephalitis
|
367
|
+
epidermis ethos eyas gas glottis hepatitis
|
368
|
+
hubris ibis lens mantis marquis metropolis
|
369
|
+
neuritis pathos pelvis polis rhinoceros
|
370
|
+
sassafras tonsillitis trellis .*us
|
371
|
+
]
|
372
|
+
|
373
|
+
PL_v_special_s = matchgroup [
|
374
|
+
PL_sb_singular_s,
|
375
|
+
PL_sb_uninflected_s,
|
376
|
+
PL_sb_irregular_s.keys,
|
377
|
+
'(.*[csx])is',
|
378
|
+
'(.*)ceps',
|
379
|
+
'[A-Z].*s',
|
380
|
+
]
|
381
|
+
|
382
|
+
PL_sb_postfix_adj = '(' + {
|
383
|
+
|
384
|
+
'general' => ['(?!major|lieutenant|brigadier|adjutant)\S+'],
|
385
|
+
'martial' => ["court"],
|
386
|
+
|
387
|
+
}.collect {|key,val|
|
388
|
+
matchgroup( matchgroup(val) + "(?=(?:-|\\s+)#{key})" )
|
389
|
+
}.join("|") + ")(.*)"
|
390
|
+
|
391
|
+
|
392
|
+
PL_sb_military = %r'major|lieutenant|brigadier|adjutant|quartermaster'
|
393
|
+
PL_sb_general = %r'((?!#{PL_sb_military.source}).*?)((-|\s+)general)'
|
394
|
+
|
395
|
+
PL_prep = matchgroup %w[
|
396
|
+
about above across after among around at athwart before behind
|
397
|
+
below beneath beside besides between betwixt beyond but by
|
398
|
+
during except for from in into near of off on onto out over
|
399
|
+
since till to under until unto upon with
|
400
|
+
]
|
401
|
+
|
402
|
+
PL_sb_prep_dual_compound = %r'(.*?)((?:-|\s+)(?:#{PL_prep}|d[eu])(?:-|\s+))a(?:-|\s+)(.*)'
|
403
|
+
PL_sb_prep_compound = %r'(.*?)((-|\s+)(#{PL_prep}|d[eu])((-|\s+)(.*))?)'
|
404
|
+
|
405
|
+
|
406
|
+
PL_pron_nom_h = {
|
407
|
+
# Nominative Reflexive
|
408
|
+
"i" => "we", "myself" => "ourselves",
|
409
|
+
"you" => "you", "yourself" => "yourselves",
|
410
|
+
"she" => "they", "herself" => "themselves",
|
411
|
+
"he" => "they", "himself" => "themselves",
|
412
|
+
"it" => "they", "itself" => "themselves",
|
413
|
+
"they" => "they", "themself" => "themselves",
|
414
|
+
|
415
|
+
# Possessive
|
416
|
+
"mine" => "ours",
|
417
|
+
"yours" => "yours",
|
418
|
+
"hers" => "theirs",
|
419
|
+
"his" => "theirs",
|
420
|
+
"its" => "theirs",
|
421
|
+
"theirs" => "theirs",
|
422
|
+
}
|
423
|
+
PL_pron_nom = matchgroup PL_pron_nom_h.keys
|
424
|
+
|
425
|
+
PL_pron_acc_h = {
|
426
|
+
# Accusative Reflexive
|
427
|
+
"me" => "us", "myself" => "ourselves",
|
428
|
+
"you" => "you", "yourself" => "yourselves",
|
429
|
+
"her" => "them", "herself" => "themselves",
|
430
|
+
"him" => "them", "himself" => "themselves",
|
431
|
+
"it" => "them", "itself" => "themselves",
|
432
|
+
"them" => "them", "themself" => "themselves",
|
433
|
+
}
|
434
|
+
PL_pron_acc = matchgroup PL_pron_acc_h.keys
|
435
|
+
|
436
|
+
# Irregular verb forms mapped to their plural form. Each row lists the 1st,
# 2nd and 3rd person singular forms; a form shared by more than one person is
# written only once, because repeating a key inside a Hash literal makes Ruby
# warn that the key is duplicated and overwritten. Key order is unchanged.
PL_v_irregular_pres_h = {
  # 1st pers. sing.    2nd pers. sing.     3rd pers. singular
  "am"   => "are",     "are"  => "are",    "is"   => "are",
  "was"  => "were",    "were" => "were",
  "have" => "have",                        "has"  => "have",
}
|
443
|
+
PL_v_irregular_pres = matchgroup PL_v_irregular_pres_h.keys
|
444
|
+
|
445
|
+
# Present-tense verb forms mapped to their plural form. The 1st and 2nd
# person singular forms are identical for every verb here, so each is
# written once: repeating a key inside a Hash literal makes Ruby warn that
# the key is duplicated and overwritten. Key order is unchanged.
PL_v_ambiguous_pres_h = {
  # 1st/2nd pers. sing.     3rd pers. singular
  "act"   => "act",         "acts"    => "act",
  "blame" => "blame",       "blames"  => "blame",
  "can"   => "can",
  "must"  => "must",
  "fly"   => "fly",         "flies"   => "fly",
  "copy"  => "copy",        "copies"  => "copy",
  "drink" => "drink",       "drinks"  => "drink",
  "fight" => "fight",       "fights"  => "fight",
  "fire"  => "fire",        "fires"   => "fire",
  "like"  => "like",        "likes"   => "like",
  "look"  => "look",        "looks"   => "look",
  "make"  => "make",        "makes"   => "make",
  "reach" => "reach",       "reaches" => "reach",
  "run"   => "run",         "runs"    => "run",
  "sink"  => "sink",        "sinks"   => "sink",
  "sleep" => "sleep",       "sleeps"  => "sleep",
  "view"  => "view",        "views"   => "view",
}
|
466
|
+
PL_v_ambiguous_pres = matchgroup PL_v_ambiguous_pres_h.keys
|
467
|
+
|
468
|
+
PL_v_irregular_non_pres = matchgroup %w[
|
469
|
+
did had ate made put
|
470
|
+
spent fought sank gave sought
|
471
|
+
shall could ought should
|
472
|
+
]
|
473
|
+
|
474
|
+
PL_v_ambiguous_non_pres = matchgroup %w[
|
475
|
+
thought saw bent will might cut
|
476
|
+
]
|
477
|
+
|
478
|
+
PL_count_zero = matchgroup %w[
|
479
|
+
0 no zero nil
|
480
|
+
]
|
481
|
+
|
482
|
+
PL_count_one = matchgroup %w[
|
483
|
+
1 a an one each every this that
|
484
|
+
]
|
485
|
+
|
486
|
+
PL_adj_special_h = {
|
487
|
+
"a" => "some", "an" => "some",
|
488
|
+
"this" => "these", "that" => "those",
|
489
|
+
}
|
490
|
+
PL_adj_special = matchgroup PL_adj_special_h.keys
|
491
|
+
|
492
|
+
PL_adj_poss_h = {
|
493
|
+
"my" => "our",
|
494
|
+
"your" => "your",
|
495
|
+
"its" => "their",
|
496
|
+
"her" => "their",
|
497
|
+
"his" => "their",
|
498
|
+
"their" => "their",
|
499
|
+
}
|
500
|
+
PL_adj_poss = matchgroup PL_adj_poss_h.keys
|
501
|
+
|
502
|
+
|
503
|
+
#
|
504
|
+
# Numerals, ordinals, and numbers-to-words
|
505
|
+
#
|
506
|
+
|
507
|
+
# Numerical inflections
|
508
|
+
Nth = {
|
509
|
+
0 => 'th',
|
510
|
+
1 => 'st',
|
511
|
+
2 => 'nd',
|
512
|
+
3 => 'rd',
|
513
|
+
4 => 'th',
|
514
|
+
5 => 'th',
|
515
|
+
6 => 'th',
|
516
|
+
7 => 'th',
|
517
|
+
8 => 'th',
|
518
|
+
9 => 'th',
|
519
|
+
11 => 'th',
|
520
|
+
12 => 'th',
|
521
|
+
13 => 'th',
|
522
|
+
}
|
523
|
+
|
524
|
+
# Ordinal word parts
|
525
|
+
Ordinals = {
|
526
|
+
'ty' => 'tieth',
|
527
|
+
'one' => 'first',
|
528
|
+
'two' => 'second',
|
529
|
+
'three' => 'third',
|
530
|
+
'five' => 'fifth',
|
531
|
+
'eight' => 'eighth',
|
532
|
+
'nine' => 'ninth',
|
533
|
+
'twelve' => 'twelfth',
|
534
|
+
}
|
535
|
+
OrdinalSuffixes = Ordinals.keys.join("|") + "|"
|
536
|
+
Ordinals[""] = 'th'
|
537
|
+
|
538
|
+
# Numeral names
|
539
|
+
Units = [''] + %w[one two three four five six seven eight nine]
|
540
|
+
Teens = %w[ten eleven twelve thirteen fourteen
|
541
|
+
fifteen sixteen seventeen eighteen nineteen]
|
542
|
+
Tens = ['',''] + %w[twenty thirty forty fifty sixty seventy eighty ninety]
|
543
|
+
Thousands = [' ', ' thousand'] + %w[
|
544
|
+
m b tr quadr quint sext sept oct non dec undec duodec tredec
|
545
|
+
quattuordec quindec sexdec septemdec octodec novemdec vigint
|
546
|
+
].collect {|prefix| ' ' + prefix + 'illion'}
|
547
|
+
|
548
|
+
# A collection of functions for transforming digits into word
# phrases. Indexed by the number of digits being transformed; e.g.,
# <tt>NumberToWordsFunctions[2]</tt> is the function for transforming
# double-digit numbers. Every function receives the word to use for a zero
# digit as its first argument, followed by one argument per digit. The
# digits must respond to #nonzero?.
#
# NOTE(review): to_units and to_tens are not defined in this part of the
# file -- presumably they spell out a units digit and a tens/units pair;
# confirm against their definitions.
NumberToWordsFunctions = [
  # No digits at all: always an error.
  proc {|*args| raise "No digits (#{args.inspect})"},

  # Single-digits: a zero digit becomes the zero word plus a trailing space.
  proc {|zero,x|
    (x.nonzero? ? to_units(x) : "#{zero} ")
  },

  # Double-digits
  proc {|zero,x,y|
    if x.nonzero?
      to_tens( x, y )
    elsif y.nonzero?
      # Leading zero: say the zero word, then the single digit.
      "#{zero} " + NumberToWordsFunctions[1].call( zero, y )
    else
      # Both digits are zero: the zero word twice, separated by a space.
      ([zero] * 2).join(" ")
    end
  },

  # Triple-digits: the single-digit phrase for the first digit followed by
  # the double-digit phrase for the remaining two.
  proc {|zero,x,y,z|
    NumberToWordsFunctions[1].call(zero,x) +
    NumberToWordsFunctions[2].call(zero,y,z)
  }
]
|
577
|
+
|
578
|
+
|
579
|
+
#
|
580
|
+
# Indefinite Articles
|
581
|
+
#
|
582
|
+
|
583
|
+
# This pattern matches strings of capitals starting with a "vowel-sound"
|
584
|
+
# consonant followed by another consonant, and which are not likely
|
585
|
+
# to be real words (oh, all right then, it's just magic!)
|
586
|
+
A_abbrev = %{
|
587
|
+
(?! FJO | [HLMNS]Y. | RY[EO] | SQU
|
588
|
+
| ( F[LR]? | [HL] | MN? | N | RH? | S[CHKLMNPTVW]? | X(YL)?) [AEIOU])
|
589
|
+
[FHLMNRSX][A-Z]
|
590
|
+
}
|
591
|
+
|
592
|
+
# This pattern codes the beginnings of all English words beginning with a
|
593
|
+
# 'y' followed by a consonant. Any other y-consonant prefix therefore
|
594
|
+
# implies an abbreviation.
|
595
|
+
A_y_cons = 'y(b[lor]|cl[ea]|fere|gg|p[ios]|rou|tt)'
|
596
|
+
|
597
|
+
# Exceptions to exceptions
|
598
|
+
A_explicit_an = matchgroup( "euler", "hour(?!i)", "heir", "honest", "hono" )
|
599
|
+
|
600
|
+
|
601
|
+
#
|
602
|
+
# Configuration defaults
|
603
|
+
#
|
604
|
+
|
605
|
+
# Default configuration arguments for the #numwords function
|
606
|
+
NumwordDefaults = {
|
607
|
+
:group => 0,
|
608
|
+
:comma => ', ',
|
609
|
+
:and => ' and ',
|
610
|
+
:zero => 'zero',
|
611
|
+
:decimal => 'point',
|
612
|
+
:asArray => false,
|
613
|
+
}
|
614
|
+
|
615
|
+
# Default ranges for #quantify
|
616
|
+
SeveralRange = 2..5
|
617
|
+
NumberRange = 6..19
|
618
|
+
NumerousRange = 20..45
|
619
|
+
ManyRange = 46..99
|
620
|
+
|
621
|
+
# Default configuration arguments for the #quantify function
|
622
|
+
QuantifyDefaults = {
|
623
|
+
:joinword => " of ",
|
624
|
+
}
|
625
|
+
|
626
|
+
# Default configuration arguments for the #conjunction (junction, what's
|
627
|
+
# your) function.
|
628
|
+
ConjunctionDefaults = {
|
629
|
+
:separator => ', ',
|
630
|
+
:altsep => '; ',
|
631
|
+
:penultimate => true,
|
632
|
+
:conjunctive => 'and',
|
633
|
+
:combine => true,
|
634
|
+
:casefold => true,
|
635
|
+
:generalize => false,
|
636
|
+
:quantsort => true,
|
637
|
+
}
|
638
|
+
|
639
|
+
|
640
|
+
#
|
641
|
+
# Title case
|
642
|
+
#
|
643
|
+
|
644
|
+
# "In titles, capitalize the first word, the last word, and all words in
|
645
|
+
# between except articles (a, an, and the), prepositions under five letters
|
646
|
+
# (in, of, to), and coordinating conjunctions (and, but). These rules apply
|
647
|
+
# to titles of long, short, and partial works as well as your own papers"
|
648
|
+
# (Anson, Schwegler, and Muth. The Longman Writer's Companion 240).
|
649
|
+
|
650
|
+
# Build the list of exceptions to title-capitalization: the articles
# ("a", "an", "the"), the short prepositions, and the coordinating
# conjunctions. TitleCaseExceptions is the union of the three lists, so a
# word that appears in more than one of them is only included once.
Articles = %w[a an the]
ShortPrepositions = %w[
  amid at but by down from in
  into like near of off on onto out over
  past save with till to unto up upon
]
CoordConjunctions = %w[and but as]
TitleCaseExceptions = Articles | ShortPrepositions | CoordConjunctions
|
657
|
+
|
658
|
+
|
659
|
+
# :startdoc:
|
660
|
+
|
661
|
+
#################################################################
|
662
|
+
### " B A C K E N D " F U N C T I O N S
|
663
|
+
#################################################################
|
664
|
+
|
665
|
+
|
666
|
+
###############
|
667
|
+
module_function
|
668
|
+
###############
|
669
|
+
|
670
|
+
### Debugging output: write the given message parts to $stderr, joined with
### single spaces, but only when $DEBUG is set. Always returns nil.
def debugMsg( *msgs ) # :nodoc:
  return unless $DEBUG
  $stderr.puts( msgs.join(" ") )
end
|
674
|
+
|
675
|
+
|
676
|
+
### Normalize a count to either 1 or 2 (singular or plural).
###
### Returns +default+ when +count+ is nil. Returns 1 when +count+, converted
### to a String, is one of the singular count words (PL_count_one) or -- in
### classical mode only -- one of the zero count words (PL_count_zero).
### Every other count yields +default+. Both word lists are matched without
### regard to case, so "Zero" is treated the same as "zero".
def normalizeCount( count, default=2 )
  return default if count.nil? # Default to plural

  text = count.to_s
  singular = /^(#{PL_count_one})$/i =~ text

  # In classical mode a zero count is treated like a singular one.
  singular ||= Linguistics::classical? && /^(#{PL_count_zero})$/i =~ text

  singular ? 1 : default
end
|
687
|
+
|
688
|
+
|
689
|
+
### Do normal/classical switching and match capitalization in <tt>inflected</tt> by
### examining the <tt>original</tt> input.
###
### <tt>inflected</tt> may hold two alternatives separated by "|" (e.g.
### "cows|kine"): the part after the bar is used in classical mode, the part
### before it otherwise. The result is upcased entirely when +original+
### consists of capitals only, is given an initial capital when +original+
### starts with one, and is otherwise left as it is. The pronoun "I" is
### exempt from the capitalization matching.
###
### Returns a new String; +inflected+ itself is never modified.
def postprocess( original, inflected )
  # Work on a copy: +inflected+ may be a String shared with one of the
  # constant lookup tables (the pronoun tables are returned as-is by the
  # inflection helpers), and editing it in place would silently change the
  # table -- or raise if the String is frozen.
  result = inflected.dup

  result.sub!( /([^|]+)\|(.+)/ ) {
    Linguistics::classical? ? $2 : $1
  }

  case original
  when "I"
    result
  when /^[A-Z]+$/
    result.upcase
  when /^[A-Z]/
    # Can't use #capitalize, as it will downcase the rest of the string,
    # too.
    result[0,1] = result[0,1].upcase
    result
  else
    result
  end
end
|
710
|
+
|
711
|
+
|
712
|
+
### Pluralize nouns.
###
### Returns +word+ unchanged when the normalized +count+ is singular (a nil
### +count+ falls back to Linguistics::num), when +word+ is empty, or when
### it is a noun that does not inflect. Otherwise returns the plural. Where
### the irregular-noun table lists both a modern and a classical plural, the
### result has the form "modern|classical" and any surrounding text (a
### prefix, or the rest of a compound) is repeated on both sides of the bar,
### e.g. "brothers-in-law|brethren-in-law".
###
### The returned String is always safe to modify: entries of the lookup
### tables are never handed out directly.
def pluralize_noun( word, count=nil )
  count ||= Linguistics::num
  count = normalizeCount( count )

  return word if count == 1

  # Concatenate pieces of a result, some of which may carry
  # "modern|classical" alternatives. The modern halves are joined with each
  # other and so are the classical halves, so that text around an
  # alternative is not lost when one side is picked later. Without any
  # alternatives this is a plain concatenation.
  assemble = lambda {|*pieces|
    pairs = pieces.collect {|piece|
      piece.include?( "|" ) ? piece.split( "|", 2 ) : [ piece, piece ]
    }
    modern, classical = pairs.transpose.collect {|halves| halves.join }
    modern == classical ? modern : "#{modern}|#{classical}"
  }

  # Handle user-defined nouns
  #if value = ud_match( word, PL_sb_user_defined )
  #	return value
  #end

  # Handle empty word and uninflected plurals
  case word
  when ''
    return word
  when /^(#{PL_sb_uninflected})$/i
    return word
  else
    if Linguistics::classical? &&
       /^(#{PL_sb_uninflected_herd})$/i =~ word
      return word
    end
  end

  # Handle compounds ("Governor General", "mother-in-law", "aide-de-camp", etc.)
  case word
  when /^(?:#{PL_sb_postfix_adj})$/i
    noun, rest = $1, $2
    return assemble.call( pluralize_noun(noun, 2), rest )

  when /^(?:#{PL_sb_prep_dual_compound})$/i
    head, link, tail = $1, $2, $3
    return assemble.call( pluralize_noun(head, 2), link, pluralize_noun(tail) )

  when /^(?:#{PL_sb_prep_compound})$/i
    head, rest = $1, $2
    return assemble.call( pluralize_noun(head, 2), rest )

  # Handle pronouns
  when /^((?:#{PL_prep})\s+)(#{PL_pron_acc})$/i
    return $1 + PL_pron_acc_h[ $2.downcase ]

  when /^(#{PL_pron_nom})$/i
    # Copy, so callers cannot alter the table entry.
    return PL_pron_nom_h[ word.downcase ].dup

  when /^(#{PL_pron_acc})$/i
    return PL_pron_acc_h[ $1.downcase ].dup

  # Handle isolated irregular plurals
  when /(.*)\b(#{PL_sb_irregular})$/i
    return assemble.call( $1, PL_sb_irregular_h[ $2.downcase ] )

  # The word-list patterns below are not anchored at the start of the word,
  # so whatever precedes the matched stem ($~.pre_match) has to be put back
  # in front of the result ("superhuman" -> "superhumans", not "humans").
  when /(#{PL_sb_U_man_mans})$/i
    return "#{$~.pre_match}#{$1}s"

  # Handle families of irregular plurals
  when /(.*)man$/i        ; return "#{$1}men"
  when /(.*[ml])ouse$/i   ; return "#{$1}ice"
  when /(.*)goose$/i      ; return "#{$1}geese"
  when /(.*)tooth$/i      ; return "#{$1}teeth"
  when /(.*)foot$/i       ; return "#{$1}feet"

  # Handle unassimilated imports
  when /(.*)ceps$/i       ; return word
  when /(.*)zoon$/i       ; return "#{$1}zoa"
  when /(.*[csx])is$/i    ; return "#{$1}es"
  when /(#{PL_sb_U_ex_ices})ex$/i ; return "#{$~.pre_match}#{$1}ices"
  when /(#{PL_sb_U_ix_ices})ix$/i ; return "#{$~.pre_match}#{$1}ices"
  when /(#{PL_sb_U_um_a})um$/i    ; return "#{$~.pre_match}#{$1}a"
  when /(#{PL_sb_U_us_i})us$/i    ; return "#{$~.pre_match}#{$1}i"
  when /(#{PL_sb_U_on_a})on$/i    ; return "#{$~.pre_match}#{$1}a"
  when /(#{PL_sb_U_a_ae})$/i      ; return "#{$~.pre_match}#{$1}e"
  end

  # Handle incompletely assimilated imports
  if Linguistics::classical?
    case word
    when /(.*)trix$/i         ; return "#{$1}trices"
    when /(.*)eau$/i          ; return "#{$1}eaux"
    when /(.*)ieu$/i          ; return "#{$1}ieux"
    when /(.{2,}[yia])nx$/i   ; return "#{$1}nges"
    when /(#{PL_sb_C_en_ina})en$/i  ; return "#{$~.pre_match}#{$1}ina"
    when /(#{PL_sb_C_ex_ices})ex$/i ; return "#{$~.pre_match}#{$1}ices"
    when /(#{PL_sb_C_ix_ices})ix$/i ; return "#{$~.pre_match}#{$1}ices"
    when /(#{PL_sb_C_um_a})um$/i    ; return "#{$~.pre_match}#{$1}a"
    when /(#{PL_sb_C_us_i})us$/i    ; return "#{$~.pre_match}#{$1}i"
    when /(#{PL_sb_C_us_us})$/i     ; return "#{$~.pre_match}#{$1}"
    when /(#{PL_sb_C_a_ae})$/i      ; return "#{$~.pre_match}#{$1}e"
    when /(#{PL_sb_C_a_ata})a$/i    ; return "#{$~.pre_match}#{$1}ata"
    when /(#{PL_sb_C_o_i})o$/i      ; return "#{$~.pre_match}#{$1}i"
    when /(#{PL_sb_C_on_a})on$/i    ; return "#{$~.pre_match}#{$1}a"
    when /#{PL_sb_C_im}$/i    ; return "#{word}im"
    when /#{PL_sb_C_i}$/i     ; return "#{word}i"
    end
  end

  # Handle singular nouns ending in ...s or other sibilants
  case word
  when /^(#{PL_sb_singular_s})$/i ; return "#{$1}es"
  when /^([A-Z].*s)$/             ; return "#{$1}es"
  when /(.*)([cs]h|[zx])$/i       ; return "#{$1}#{$2}es"
  # when /(.*)(us)$/i             ; return "#{$1}#{$2}es"

  # Handle ...f -> ...ves
  when /(.*[eao])lf$/i    ; return "#{$1}lves"
  when /(.*[^d])eaf$/i    ; return "#{$1}eaves"
  when /(.*[nlw])ife$/i   ; return "#{$1}ives"
  when /(.*)arf$/i        ; return "#{$1}arves"

  # Handle ...y
  when /(.*[aeiou])y$/i   ; return "#{$1}ys"
  when /([A-Z].*y)$/      ; return "#{$1}s"
  when /(.*)y$/i          ; return "#{$1}ies"

  # Handle ...o
  when /#{PL_sb_U_o_os}$/i ; return "#{word}s"
  when /[aeiou]o$/i        ; return "#{word}s"
  when /o$/i               ; return "#{word}es"

  # Otherwise just add ...s
  else
    return "#{word}s"
  end
end # def pluralize_noun
|
839
|
+
|
840
|
+
|
841
|
+
|
842
|
+
### Pluralize special verbs.
###
### Returns nil when the normalized +count+ is singular (a nil +count+ falls
### back to Linguistics::num), when +word+ is one of the special "...s"
### words or contains whitespace, or when +word+ does not look like a third
### person singular form. Otherwise returns the plural form of the verb.
def pluralize_special_verb( word, count )
  number = normalizeCount( count || Linguistics::num )

  return nil if /^(#{PL_count_one})$/i =~ number.to_s

  # Handle user-defined verbs
  #if value = ud_match( word, PL_v_user_defined )
  #	return value
  #end

  case word

  # Irregular present tense (simple and compound): swap the verb, keep
  # whatever follows it.
  when /^(#{PL_v_irregular_pres})((\s.*)?)$/i
    verb, rest = $1, $2
    PL_v_irregular_pres_h[ verb.downcase ] + rest

  # Irregular future, preterite and perfect tenses do not change.
  when /^(#{PL_v_irregular_non_pres})((\s.*)?)$/i
    word

  # Special cases are not verbs this function handles.
  when /^(#{PL_v_special_s})$/, /\s/
    nil

  # Standard 3rd person (chop the ...(e)s off single words)
  when /^(.*)([cs]h|[x]|zz|ss)es$/i
    "#{$1}#{$2}"
  when /^(..+)ies$/i
    $1 + "y"
  when /^(.+)oes$/i
    $1 + "o"
  when /^(.*[^s])s$/i
    $1

  # Anything else is a regular verb (handled elsewhere): the case yields nil.
  end
end
|
883
|
+
|
884
|
+
|
885
|
+
### Pluralize regular verbs
|
886
|
+
def pluralize_general_verb( word, count )
  # Inflect a regular verb. The word comes back untouched when the
  # (normalized) count is singular.
  count = normalizeCount( count || Linguistics::num )
  return word if count.to_s =~ /^(#{PL_count_one})$/i

  # Ambiguous present tenses (simple and compound) are looked up in the
  # table, and whatever followed the verb is appended again.
  if /^(#{PL_v_ambiguous_pres})((\s.*)?)$/i =~ word
    return PL_v_ambiguous_pres_h[ $1.downcase ] + $2
  end

  # Ambiguous preterite/perfect tenses and 1st/2nd person forms are
  # uninflected.
  word
end
|
907
|
+
|
908
|
+
|
909
|
+
### Handle special adjectives
|
910
|
+
def pluralize_special_adjective( word, count )
  # Inflect the adjectives this module knows about. Returns the word
  # unchanged for a singular count, and nil for an unrecognised adjective.
  # User-defined adjective overrides are currently disabled.
  count = normalizeCount( count || Linguistics::num )
  return word if count.to_s =~ /^(#{PL_count_one})$/i

  case word

  # Known special adjectives
  when /^(#{PL_adj_special})$/i
    PL_adj_special_h[ Regexp.last_match(1).downcase ]

  # Possessive adjectives
  when /^(#{PL_adj_poss})$/i
    PL_adj_poss_h[ Regexp.last_match(1).downcase ]

  # Possessive nouns: pluralize the owner, then pick the apostrophe form
  # that suits its ending.
  when /^(.*)'s?$/
    owners = plural_noun( Regexp.last_match(1) )
    /s$/ =~ owners ? "#{owners}'" : "#{owners}'s"

  # Otherwise no idea: fall through as nil.
  end
end
|
944
|
+
|
945
|
+
|
946
|
+
### Returns the given word with a prepended indefinite article, unless
|
947
|
+
### +count+ is non-nil and not singular.
|
948
|
+
def indef_article( word, count )
  # Prefix +word+ with "a" or "an". If a count is available (passed in, or
  # taken from Linguistics::num) and it is not one of the singular forms,
  # the word is prefixed with that count instead.
  # User-defined variants are currently disabled.
  count ||= Linguistics::num
  if count && count.to_s !~ /^(#{PL_count_one})$/i
    return "#{count} #{word}"
  end

  # Ordered rules: the first pattern matching +word+ decides the article.
  rules = [
    [ /^(#{A_explicit_an})/i,   'an' ],  # special cases
    [ /^(#{A_abbrev})/x,        'an' ],  # abbreviations
    [ /^[aefhilmnorsx][.-]/i,   'an' ],
    [ /^[a-z][.-]/i,            'a'  ],
    [ /^[^aeiouy]/i,            'a'  ],  # consonants
    [ /^e[uw]/i,                'a'  ],  # special vowel-forms
    [ /^onc?e\b/i,              'a'  ],
    [ /^uni([^nmd]|mo)/i,       'a'  ],
    [ /^u[bcfhjkqrst][aeiou]/i, 'a'  ],
    [ /^[aeiou]/i,              'an' ],  # vowels
    [ /^(#{A_y_cons})/i,        'an' ],  # y... before certain consonants
  ]
  matched = rules.find {|pattern, _article| pattern === word }

  # With no rule matching, guess "a".
  article = matched ? matched.last : 'a'
  "#{article} #{word}"
end
|
997
|
+
|
998
|
+
|
999
|
+
### Transform the specified number of units-place numerals into a
|
1000
|
+
### word-phrase at the given number of +thousands+ places.
|
1001
|
+
def to_units( units, thousands=0 )
  # Word for the units-place digit, followed by the magnitude phrase for
  # the given number of +thousands+ places.
  digit_word = Units[ units ]
  magnitude = to_thousands( thousands )
  digit_word + magnitude
end
|
1004
|
+
|
1005
|
+
|
1006
|
+
### Transform the specified number of tens- and units-place numerals into a
|
1007
|
+
### word-phrase at the given number of +thousands+ places.
|
1008
|
+
def to_tens( tens, units, thousands=0 )
  # Words for a two-digit group, followed by the magnitude phrase for the
  # given number of +thousands+ places.
  if tens == 1
    # The teens come from their own table.
    Teens[ units ] + to_thousands( thousands )
  else
    tens_word = Tens[ tens ]
    # Hyphenate only when both digits contribute a word ("twenty-one").
    hyphen = ( tens.nonzero? && units.nonzero? ) ? '-' : ''
    tens_word + hyphen + to_units( units, thousands )
  end
end
|
1016
|
+
|
1017
|
+
|
1018
|
+
### Transform the specified number of hundreds-, tens-, and units-place
|
1019
|
+
### numerals into a word phrase. If the number of thousands (+thousands+) is
|
1020
|
+
### greater than 0, it will be used to determine where the decimal point is
|
1021
|
+
### in relation to the hundreds-place number.
|
1022
|
+
def to_hundreds( hundreds, tens=0, units=0, thousands=0, joinword=" and " )
  # Words for a three-digit group followed by its magnitude phrase.
  # Returns nil when all three digits are zero.
  joinword = ' ' if joinword.empty?

  if hundreds.nonzero?
    # The join word only appears when something follows the hundreds.
    glue = ( tens.nonzero? || units.nonzero? ) ? joinword : ''
    to_units( hundreds ) + " hundred" + glue +
      to_tens( tens, units ) +
      to_thousands( thousands )
  elsif tens.nonzero? || units.nonzero?
    to_tens( tens, units ) + to_thousands( thousands )
  end
end
|
1035
|
+
|
1036
|
+
### Transform the specified number into one or more words like 'thousand',
|
1037
|
+
### 'million', etc. Uses the thousands (American) system.
|
1038
|
+
def to_thousands( thousands=0 )
  # Build the magnitude phrase ('thousand', 'million', ...) for the given
  # number of thousands places. Magnitudes beyond the end of the Thousands
  # table are expressed by appending the table's last entry once per
  # complete pass through the table.
  span = Thousands.length - 1
  words = []

  (0..thousands).step( span ) do |offset|
    words << ( offset.zero? ? Thousands[ thousands % span ] : Thousands.last )
  end

  words.join( " " )
end
|
1050
|
+
|
1051
|
+
|
1052
|
+
### Return the specified number +num+ as an array of number phrases.
|
1053
|
+
def number_to_words( num, config )
  # Return +num+ as an Array of number phrases.
  #
  # With a non-zero config[:group] the digits are cut into groups of that
  # size and each group is handed to the matching entry of
  # NumberToWordsFunctions. Otherwise the digits are consumed from the right
  # in chunks of three, two and one.
  #
  # The caller's String is never modified.
  return [config[:zero]] if num.to_i.zero?
  chunks = []

  if config[:group].nonzero?

    # One required digit, followed by (group - 1) optional ones.
    re = Regexp::new( "(\\d)" + ("(\\d)?" * (config[:group] - 1)) )

    num.to_s.scan( re ) {|digits|
      debugMsg " digits = #{digits.inspect}"
      numerals = digits.flatten.compact.collect {|i| i.to_i}

      # The number of digits actually matched selects the function.
      # (Array#nitems, used here previously, is gone in Ruby 1.9+.)
      fn = NumberToWordsFunctions[ numerals.length ]
      debugMsg " numerals = #{numerals.inspect}"
      chunks.push fn.call( config[:zero], *numerals ).strip
    }
  else
    # Work on a copy: the sub! calls below would otherwise eat the digits
    # out of the String passed in.
    phrase = num.to_s.dup
    phrase.sub!( /\A\s*0+/, '' )
    mill = 0

    # Match backward from the end of the digits, turning chunks of three
    # into words and bumping the magnitude after each one.
    loop do
      consumed = phrase.sub!( /(\d)(\d)(\d)(?=\D*\Z)/ ) {
        words = to_hundreds( $1.to_i, $2.to_i, $3.to_i, mill,
                             config[:and] )
        chunks.unshift words.strip.squeeze(' ') unless words.nil?
        ''
      }
      break unless consumed
      mill += 1
    end

    # At most one two-digit or one-digit chunk is left at the front.
    phrase.sub!( /(\d)(\d)(?=\D*\Z)/ ) {
      chunks.unshift to_tens( $1.to_i, $2.to_i, mill ).strip.squeeze(' ')
      ''
    }
    phrase.sub!( /(\d)(?=\D*\Z)/ ) {
      chunks.unshift to_units( $1.to_i, mill ).strip.squeeze(' ')
      ''
    }
  end

  return chunks
end
|
1100
|
+
|
1101
|
+
|
1102
|
+
#################################################################
|
1103
|
+
### P U B L I C F U N C T I O N S
|
1104
|
+
#################################################################
|
1105
|
+
|
1106
|
+
### Return the name of the language this module is for.
|
1107
|
+
def language
  # Human-readable name of the language this module handles.
  return "English"
end
|
1110
|
+
|
1111
|
+
|
1112
|
+
### Return the plural of the given +phrase+ if +count+ indicates it should
|
1113
|
+
### be plural.
|
1114
|
+
def plural( phrase, count=nil )
  # Split off surrounding whitespace so it can be restored afterwards.
  # A phrase with no inflectable word is handed back untouched.
  lead, word, trail = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s ).to_a[1,3]
  return phrase if word.nil? || word.empty?

  # Try the special adjective and verb forms first, then treat the word
  # as a noun.
  inflected = pluralize_special_adjective( word, count ) ||
    pluralize_special_verb( word, count ) ||
    pluralize_noun( word, count )

  lead + postprocess( word, inflected ) + trail
end
|
1126
|
+
alias_method :PL, :plural
|
1127
|
+
|
1128
|
+
|
1129
|
+
### Return the plural of the given noun +phrase+ if +count+ indicates it
|
1130
|
+
### should be plural.
|
1131
|
+
def plural_noun( phrase, count=nil )
  # Split off surrounding whitespace so it can be restored afterwards.
  # A phrase with no inflectable word is handed back untouched.
  lead, word, trail = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s ).to_a[1,3]
  return phrase if word.nil? || word.empty?

  inflected = pluralize_noun( word, count )
  lead + postprocess( word, inflected ) + trail
end
|
1139
|
+
alias_method :PL_N, :plural_noun
|
1140
|
+
|
1141
|
+
|
1142
|
+
### Return the plural of the given verb +phrase+ if +count+ indicates it
|
1143
|
+
### should be plural.
|
1144
|
+
def plural_verb( phrase, count=nil )
  # Split off surrounding whitespace so it can be restored afterwards.
  # A phrase with no inflectable word is handed back untouched.
  lead, word, trail = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s ).to_a[1,3]
  return phrase if word.nil? || word.empty?

  # Special verbs first; everything else is treated as a regular verb.
  inflected = pluralize_special_verb( word, count ) ||
    pluralize_general_verb( word, count )

  lead + postprocess( word, inflected ) + trail
end
|
1154
|
+
alias_method :PL_V, :plural_verb
|
1155
|
+
|
1156
|
+
|
1157
|
+
### Return the plural of the given adjectival +phrase+ if +count+ indicates
|
1158
|
+
### it should be plural.
|
1159
|
+
def plural_adjective( phrase, count=nil )
  # Split off surrounding whitespace so it can be restored afterwards.
  # A phrase with no inflectable word is handed back untouched.
  lead, word, trail = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s ).to_a[1,3]
  return phrase if word.nil? || word.empty?

  # Adjectives without a special plural form keep their spelling.
  inflected = pluralize_special_adjective( word, count ) || word
  lead + postprocess( word, inflected ) + trail
end
|
1168
|
+
alias_method :plural_adj, :plural_adjective
|
1169
|
+
alias_method :PL_ADJ, :plural_adjective
|
1170
|
+
|
1171
|
+
|
1172
|
+
### Return the given phrase with the appropriate indefinite article ("a" or
|
1173
|
+
### "an") prepended.
|
1174
|
+
def a( phrase, count=nil )
  # Split off surrounding whitespace so it can be restored afterwards.
  # A phrase with no word to prefix is handed back untouched.
  lead, word, trail = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s ).to_a[1,3]
  return phrase if word.nil? || word.empty?

  lead + indef_article( word, count ) + trail
end
|
1182
|
+
alias_method :an, :a
|
1183
|
+
alias_method :A, :a
|
1184
|
+
alias_method :AN, :a
|
1185
|
+
|
1186
|
+
|
1187
|
+
### Translate zero-quantified +phrase+ to "no +phrase.plural+"
|
1188
|
+
def no( phrase, count=nil )
  # Quantify +phrase+ with +count+, translating a zero count into
  # "no <plural phrase>". The count falls back to Linguistics::num and
  # then to 0. Whitespace surrounding the phrase is preserved.
  md = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s )
  pre, word, post = md.to_a[1,3]

  # Same guard as the other phrase wrappers: without it an empty phrase
  # leaves +post+ nil and the concatenation below raises a TypeError.
  return phrase if word.nil? or word.empty?

  count ||= Linguistics::num || 0

  if /^#{PL_count_zero}$/ =~ count.to_s
    return "#{pre}no " + plural( word, 0 ) + post
  else
    return "#{pre}#{count} " + plural( word, count ) + post
  end
end
|
1199
|
+
alias_method :NO, :no
|
1200
|
+
|
1201
|
+
|
1202
|
+
### Participles
|
1203
|
+
def present_participle( word )
  # Return the "-ing" form of +word+: take its plural verb form, adjust the
  # stem ending with the first rule that applies, then append "ing".
  #
  # Work on a copy so the in-place edits below can never touch (or choke
  # on a frozen) String owned by the caller.
  stem = plural_verb( word.to_s, 2 ).dup

  # Each sub! returns nil when its pattern does not match, so the chain
  # stops at the first rule that changes the stem. Backreferences in a
  # Ruby replacement String are written \1, not $1 as in Perl.
  stem.sub!( /ie$/, 'y' ) or
    stem.sub!( /ue$/, 'u' ) or
    stem.sub!( /([auy])e$/, '\1' ) or
    stem.sub!( /i$/, '' ) or
    stem.sub!( /([^e])e$/, '\1' ) or
    /er$/.match( stem ) or
    stem.sub!( /([^aeiou][aeiouy]([bdgmnprst]))$/, '\1\2' )

  return "#{stem}ing"
end
|
1216
|
+
alias_method :part_pres, :present_participle
|
1217
|
+
alias_method :PART_PRES, :present_participle
|
1218
|
+
|
1219
|
+
|
1220
|
+
|
1221
|
+
### Return the specified number as english words. One or more configuration
|
1222
|
+
### values may be passed to control the returned String:
|
1223
|
+
###
|
1224
|
+
### [<b>:group</b>]
|
1225
|
+
### Controls how many numbers at a time are grouped together. Valid values
|
1226
|
+
### are +0+ (normal grouping), +1+ (single-digit grouping, e.g., "one,
|
1227
|
+
### two, three, four"), +2+ (double-digit grouping, e.g., "twelve,
|
1228
|
+
### thirty-four"), or +3+ (triple-digit grouping, e.g., "one twenty-three,
|
1229
|
+
### four").
|
1230
|
+
### [<b>:comma</b>]
|
1231
|
+
### Set the character/s used to separate word groups. Defaults to +", "+.
|
1232
|
+
### [<b>:and</b>]
|
1233
|
+
### Set the word and/or characters used where ' and ' (the default) is
|
1234
|
+
### normally used. Setting <tt>:and</tt> to +' '+, for example, will cause
|
1235
|
+
### +2556+ to be returned as "two-thousand, five hundred fifty-six"
|
1236
|
+
### instead of "two-thousand, five hundred and fifty-six".
|
1237
|
+
### [<b>:zero</b>]
|
1238
|
+
### Set the word used to represent the numeral +0+ in the result. +'zero'+
|
1239
|
+
### is the default.
|
1240
|
+
### [<b>:decimal</b>]
|
1241
|
+
### Set the translation of any decimal points in the number; the default
|
1242
|
+
### is +'point'+.
|
1243
|
+
### [<b>:asArray</b>]
|
1244
|
+
### If set to a true value, the number will be returned as an array of
|
1245
|
+
### word groups instead of a String.
|
1246
|
+
def numwords( number, hashargs={} )
  # Return +number+ as English words. +hashargs+ is merged over
  # NumwordDefaults; the keys read here are :group, :comma, :decimal and
  # :asArray, and the merged config is passed on to number_to_words.
  #
  # Returns a String, or a flat Array of word groups when :asArray is set.
  # Raises a RuntimeError when :group is outside 0..3.

  # Copy the input: the ordinal suffix is stripped in place below, which
  # would otherwise modify (or fail on a frozen) String owned by the caller.
  num = number.to_s.dup
  config = NumwordDefaults.dup.update( hashargs )
  raise "Bad chunking option: #{config[:group]}" unless
    config[:group].between?( 0, 3 )

  # Array of number parts: first is everything to the left of the first
  # decimal, followed by any groups of decimal-delimited numbers after that
  parts = []

  # Wordify any sign prefix
  sign = (/\A\s*\+/ =~ num) ? 'plus' : (/\A\s*\-/ =~ num) ? 'minus' : ''

  # Strip any ordinal suffixes
  ord = true if num.sub!( /(st|nd|rd|th)\Z/, '' )

  # Split the number into chunks delimited by '.'
  chunks =
    if config[:decimal].empty?
      [ num ]
    elsif config[:group].nonzero?
      num.split( /\./ )
    else
      num.split( /\./, 2 )
    end

  # An empty number splits into nothing at all; treat it as an empty
  # whole-number part so parts[0] always exists below.
  chunks = [ '' ] if chunks.empty?

  # Wordify each chunk, pushing arrays into the parts array
  chunks.each_with_index {|chunk,section|
    digits = chunk.gsub( /\D+/, '' )

    # An empty whole-number part becomes an empty array; empty later parts
    # fall through and are wordified (as zero) by number_to_words.
    if digits.empty? && section.zero?
      parts.push []
      next
    end

    # The second and succeeding parts of a non-grouped number are read out
    # digit by digit.
    if config[:group].zero? && section.nonzero?
      parts.push number_to_words( digits, config.dup.update(:group => 1) )
    else
      parts.push number_to_words( digits, config )
    end
  }

  debugMsg "Parts => #{parts.inspect}"

  # Turn the last word of the whole-number part back into an ordinal if
  # the original number came in that way.
  if ord && !parts[0].empty?
    parts[0][-1] = ordinal( parts[0].last )
  end

  # If the caller's expecting an Array return, just flatten and return the
  # parts array.
  if config[:asArray]
    parts[0].unshift( sign ) unless sign.empty?
    return parts.flatten
  end

  # Grouped numbers: every sub-part is joined with commas.
  unless config[:group].zero?
    return parts.compact.
      separate( config[:decimal] ).
      delete_if {|el| el.empty?}.
      join( config[:comma] ).
      strip
  end

  # Ungrouped numbers: only the whole-number part is comma-joined.
  # (compact.length replaces Array#nitems, which is gone in Ruby 1.9+.)
  if parts[0].compact.length > 1

    # Join all but the last part together with commas
    wholenum = parts[0][0...-1].join( config[:comma] )

    # If the last part is just a single word, append it to the
    # wholenum part with an 'and'. This is to get things like 'three
    # thousand and three' instead of 'three thousand, three'.
    if /^\s*(\S+)\s*$/ =~ parts[0].last
      wholenum += " and #{parts[0].last}"
    else
      wholenum += config[:comma] + parts[0].last
    end
  else
    wholenum = parts[0][0]
  end
  decimals = parts[1..-1].collect {|part| part.join(" ")}

  debugMsg "Wholenum: #{wholenum.inspect}; decimals: #{decimals.inspect}"

  # Join with the configured decimal; if it's empty, just join with
  # spaces.
  joiner = config[:decimal].empty? ? " " : " #{config[:decimal]} "
  words = ([ wholenum ] + decimals).join( joiner ).strip

  # Keep the sign word separate from the number ("minus five").
  return sign.empty? ? words : "#{sign} #{words}"
end
|
1354
|
+
alias_method :NUMWORDS, :numwords
|
1355
|
+
|
1356
|
+
|
1357
|
+
### Transform the given +number+ into an ordinal word. The +number+ object
|
1358
|
+
### can be either an Integer or a String.
|
1359
|
+
def ordinal( number )
  # Integers get a numeric suffix from the Nth table ("1" -> "1st");
  # anything else is stringified and has a trailing word matched by
  # OrdinalSuffixes replaced through the Ordinals table.
  case number
  when Integer
    # Look the suffix up by magnitude: Ruby's % never returns a negative
    # result for a positive divisor, so -1 % 100 is 99 and would select
    # "th" instead of "st".
    magnitude = number.abs
    return number.to_s + (Nth[ magnitude % 100 ] || Nth[ magnitude % 10 ])

  else
    return number.to_s.sub( /(#{OrdinalSuffixes})\Z/ ) { Ordinals[$1] }
  end
end
|
1368
|
+
alias_method :ORD, :ordinal
|
1369
|
+
|
1370
|
+
|
1371
|
+
### Return a phrase describing the specified +number+ of objects in the
|
1372
|
+
### given +phrase+. The following options can be used to control the makeup
|
1373
|
+
### of the returned quantity String:
|
1374
|
+
###
|
1375
|
+
### [<b>:joinword</b>]
|
1376
|
+
### Sets the word (and any surrounding spaces) used as the word separating the
|
1377
|
+
### quantity from the noun in the resulting string. Defaults to <tt>' of
|
1378
|
+
### '</tt>.
|
1379
|
+
def quantify( phrase, number=0, args={} )
  # Describe +number+ of +phrase+ in general terms ("several", "many",
  # "hundreds of thousands of ..."). +args+ is merged over
  # QuantifyDefaults; only :joinword is read here.
  num = number.to_i
  config = QuantifyDefaults.dup.update( args )

  return no( phrase ) if num == 0
  return a( phrase ) if num == 1

  # Small quantities: the first range containing the number picks the word.
  vague = [
    [ SeveralRange,  "several " ],
    [ NumberRange,   "a number of " ],
    [ NumerousRange, "numerous " ],
    [ ManyRange,     "many " ],
  ].find {|range, _prefix| range === num }
  return vague.last + plural( phrase, num ) if vague

  # Anything bigger is described by its order of magnitude, e.g.
  # "hundreds of thousands of..." or "millions of...".
  thousands, subthousands = Math::log10( num ).to_i.divmod( 3 )
  stword = { 2 => "hundreds", 1 => "tens" }[ subthousands ]
  thword = plural( to_thousands(thousands).strip )
  thword = nil if thword.empty?

  # Hundreds (of) thousands (of) stars: absent pieces are dropped.
  [ stword, thword, plural(phrase, number) ].compact.join( config[:joinword] )
end
|
1425
|
+
|
1426
|
+
|
1427
|
+
### Return the specified +obj+ (which must support the <tt>#collect</tt>
|
1428
|
+
### method) as a conjunction. Each item is converted to a String if it is
|
1429
|
+
### not already (using #to_s) unless a block is given, in which case it is
|
1430
|
+
### called once for each object in the array, and the stringified return
|
1431
|
+
### value from the block is used instead. Returning +nil+ causes that
|
1432
|
+
### particular element to be omitted from the resulting conjunction. The
|
1433
|
+
### following options can be used to control the makeup of the returned
|
1434
|
+
### conjunction String:
|
1435
|
+
###
|
1436
|
+
### [<b>:separator</b>]
|
1437
|
+
### Specify one or more characters to separate items in the resulting
|
1438
|
+
### list. Defaults to <tt>', '</tt>.
|
1439
|
+
### [<b>:altsep</b>]
|
1440
|
+
### An alternate separator to use if any of the resulting conjunction's
|
1441
|
+
### clauses contain the <tt>:separator</tt> character/s. Defaults to <tt>'; '</tt>.
|
1442
|
+
### [<b>:penultimate</b>]
|
1443
|
+
### Flag that indicates whether or not to join the last clause onto the
|
1444
|
+
### rest of the conjunction using a penultimate <tt>:separator</tt>. E.g.,
|
1445
|
+
### %w{duck cow dog}.en.conjunction
|
1446
|
+
### # => "a duck, a cow, and a dog"
|
1447
|
+
### %w{duck cow dog}.en.conjunction( :penultimate => false )
|
1448
|
+
### "a duck, a cow and a dog"
|
1449
|
+
### Default to <tt>true</tt>.
|
1450
|
+
### [<b>:conjunctive</b>]
|
1451
|
+
### Sets the word used as the conjunctive (separating word) of the
|
1452
|
+
### resulting string. Default to <tt>'and'</tt>.
|
1453
|
+
### [<b>:combine</b>]
|
1454
|
+
### If set to <tt>true</tt> (the default), items which are identical (after
|
1455
|
+
### surrounding spaces are stripped) will be combined in the resulting
|
1456
|
+
### conjunction. E.g.,
|
1457
|
+
### %w{goose cow goose dog}.en.conjunction
|
1458
|
+
### # => "two geese, a cow, and a dog"
|
1459
|
+
### %w{goose cow goose dog}.en.conjunction( :combine => false )
|
1460
|
+
### # => "a goose, a cow, a goose, and a dog"
|
1461
|
+
### [<b>:casefold</b>]
|
1462
|
+
### If set to <tt>true</tt> (the default), then items are compared
|
1463
|
+
### case-insensitively when combining them. This has no effect if
|
1464
|
+
### <tt>:combine</tt> is <tt>false</tt>.
|
1465
|
+
### [<b>:generalize</b>]
|
1466
|
+
### If set to <tt>true</tt>, then quantities of combined items are turned into
|
1467
|
+
### general descriptions instead of exact amounts.
|
1468
|
+
### ary = %w{goose pig dog horse goose reindeer goose dog horse}
|
1469
|
+
### ary.en.conjunction
|
1470
|
+
### # => "three geese, two dogs, two horses, a pig, and a reindeer"
|
1471
|
+
### ary.en.conjunction( :generalize => true )
|
1472
|
+
### # => "several geese, several dogs, several horses, a pig, and a reindeer"
|
1473
|
+
### See the #quantify method for specifics on how quantities are
|
1474
|
+
### generalized. Generalization defaults to <tt>false</tt>, and has no effect if
|
1475
|
+
### :combine is <tt>false</tt>.
|
1476
|
+
### [<b>:quantsort</b>]
|
1477
|
+
### If set to <tt>true</tt> (the default), items which are combined in the
|
1478
|
+
### resulting conjunction will be listed in order of amount, with greater
|
1479
|
+
### quantities sorted first. If <tt>:quantsort</tt> is <tt>false</tt>, combined items
|
1480
|
+
### will appear where the first instance of them occurred in the
|
1481
|
+
### list. This sort is also the fallback for identical quantities (ie.,
|
1482
|
+
### items of the same quantity will be listed in the order they appeared
|
1483
|
+
### in the source list).
|
1484
|
+
###
|
1485
|
+
def conjunction( obj, args={} )
  # Join the items of +obj+ into an English list ("a duck, a cow, and a
  # dog"). +args+ is merged over ConjunctionDefaults; the keys read here
  # are :casefold, :combine, :quantsort, :generalize, :conjunctive,
  # :penultimate, :separator and :altsep.
  #
  # If a block is given it is called once per item; nil results are dropped
  # and the remaining results are stringified.
  config = ConjunctionDefaults.dup.update( args )

  # Transform items in the obj to phrases
  phrases =
    if block_given?
      obj.collect {|item| yield(item) }.compact.collect {|res| res.to_s }
    else
      obj.collect {|item| item.to_s }
    end

  # No need for a conjunction if there's only one thing
  return a(phrases[0]) if phrases.length < 2

  # Set up a Proc to derive a collector key from a phrase depending on the
  # configuration
  keyfunc =
    if config[:casefold]
      proc {|key| key.downcase.strip}
    else
      proc {|key| key.strip}
    end

  # Count phrases that map to the same key. When combining, only the
  # first occurrence of each key stays in the phrase list.
  collector = {}
  if config[:combine]
    firsts = []
    phrases.each do |phrase|
      key = keyfunc[ phrase ]
      if collector.key?( key )
        collector[ key ] += 1
      else
        collector[ key ] = 1
        firsts << phrase
      end
    end
    phrases = firsts
  else
    # If we're not combining, just make everything have a count of 1.
    phrases.uniq.each {|key| collector[ keyfunc[key] ] = 1}
  end

  # If sort-by-quantity is turned on, sort the phrases first by how many
  # there are (most-first), and then by the order they were specified in.
  if config[:quantsort] && config[:combine]
    origorder = {}
    phrases.each_with_index {|phrase,i| origorder[ keyfunc[phrase] ] ||= i }
    phrases.sort! {|left,right|
      (collector[ keyfunc[right] ] <=> collector[ keyfunc[left] ]).nonzero? ||
      (origorder[ keyfunc[left] ] <=> origorder[ keyfunc[right] ])
    }
  end

  # Set up a filtering function that adds either an indefinite article, an
  # indefinite quantifier, or a definite quantifier to each phrase
  # depending on the configuration and the count of phrases in the
  # collector.
  filter =
    if config[:generalize]
      proc {|phrase, count| quantify(phrase, count) }
    else
      proc {|phrase, count|
        if count > 1
          "%s %s" % [
            # :TODO: Make this threshold settable
            # Call numwords directly rather than through Integer#en, so
            # this does not depend on which classes were extended.
            count < 10 ? numwords( count ) : count.to_s,
            plural(phrase, count)
          ]
        else
          a( phrase )
        end
      }
    end

  # Now use the configured filter to turn each phrase into its final form.
  phrases.collect! {|phrase| filter[phrase, collector[ keyfunc[phrase] ]] }

  # Prepend the conjunctive to the last element unless it's empty or
  # there's only one element. A new String is built because the filter can
  # hand back one of the caller's own Strings unchanged.
  unless config[:conjunctive].strip.empty? || phrases.length < 2
    phrases[-1] = config[:conjunctive] + " " + phrases[-1]
  end

  # Remember how many phrases there were before the last two are merged, so
  # that the merge does not change which separator is chosen.
  phrase_count = phrases.length

  # Catenate the last two elements if there's no penultimate separator.
  # Combining can leave a single phrase, in which case there is nothing to
  # catenate.
  if !config[:penultimate] && phrases.length > 1
    final = phrases.pop
    phrases[-1] = phrases[-1] + " " + final
  end

  # Pick a separator based on how many phrases there are and whether or not
  # there's already an instance of it in the phrases. The separator is
  # compared literally, not as a regular expression.
  sep =
    if phrase_count <= 2
      ' '
    elsif phrases.any? {|str| str.include?( config[:separator] ) }
      config[:altsep]
    else
      config[:separator]
    end

  return phrases.join( sep )
end
|
1591
|
+
|
1592
|
+
|
1593
|
+
### Turns a camel-case +string+ ("camelCaseToEnglish") to plain English
|
1594
|
+
### ("camel case to english"). Each word is decapitalized.
|
1595
|
+
def camel_case_to_english( string )
  # Put a space at every lowercase-to-uppercase transition, then lowercase
  # the whole result.
  spaced = string.to_s.gsub( /([a-z])([A-Z])/, '\1 \2' )
  spaced.downcase
end
|
1598
|
+
|
1599
|
+
|
1600
|
+
### Turns an English language +string+ into a CamelCase word.
|
1601
|
+
def english_to_camel_case( string )
  # Drop each run of whitespace that precedes a lowercase letter and
  # uppercase that letter.
  string.to_s.gsub( /\s+([a-z])/ ) do
    Regexp.last_match( 1 ).upcase
  end
end
|
1604
|
+
|
1605
|
+
|
1606
|
+
### This method doesn't work quite right yet. It does okay for simple cases,
|
1607
|
+
### but it misses more complex ones, e.g. 'as' used as a coordinating
|
1608
|
+
### conjunction in "A Portrait of the Artist as a Young Man". Perhaps after
|
1609
|
+
### there's a working (non-leaking) LinkParser for Ruby, this can be fixed
|
1610
|
+
### up. Until then it'll just be undocumented.
|
1611
|
+
|
1612
|
+
### Returns the given +string+ as a title-cased phrase.
|
1613
|
+
def titlecase( string ) # :nodoc:
  # Return +string+ as a title-cased phrase: the first and last words are
  # always capitalized, and every other word is capitalized unless it is
  # listed in TitleCaseExceptions or is the second half of a contraction.

  # Split on word-boundaries
  words = string.to_s.split( /\b/ )

  # Nothing to capitalize in an empty string.
  return '' if words.empty?

  # Always capitalize the first and last words
  words.first.capitalize!
  words.last.capitalize!

  # Now scan the rest of the tokens, skipping non-words and capitalization
  # exceptions.
  words.each_with_index do |word, i|

    # Non-words
    next unless /^\w+$/.match( word )

    # Skip exception-words
    next if TitleCaseExceptions.include?( word )

    # Skip second parts of contractions. Only look backwards when there
    # really are two earlier tokens: negative indexes would wrap around
    # to the end of the array.
    next if i >= 2 && words[i - 1] == "'" && /\w/.match( words[i - 2] )

    # Have to do it this way instead of capitalize! because that method
    # also downcases all other letters.
    word.gsub!( /^(\w)(.*)/ ) { $1.upcase + $2 }
  end

  return words.join
end
|
1642
|
+
|
1643
|
+
|
1644
|
+
### Returns the proper noun form of a string by capitalizing most of the
|
1645
|
+
### words.
|
1646
|
+
###
|
1647
|
+
### Examples:
|
1648
|
+
### English.proper_noun("bosnia and herzegovina") ->
|
1649
|
+
### "Bosnia and Herzegovina"
|
1650
|
+
### English.proper_noun("macedonia, the former yugoslav republic of") ->
|
1651
|
+
### "Macedonia, the Former Yugoslav Republic of"
|
1652
|
+
### English.proper_noun("virgin islands, u.s.") ->
|
1653
|
+
### "Virgin Islands, U.S."
|
1654
|
+
def proper_noun( string )
  # Split on runs of spaces and periods, keeping the delimiters so the
  # string can be reassembled exactly.
  small_words = %w{and the of}
  tokens = string.split( /([ .]+)/ )

  # Capitalize every token that starts with a lowercase letter, except
  # the small connecting words.
  renamed = tokens.collect do |token|
    if /^[a-z]/.match( token ) && !small_words.include?( token )
      token.capitalize
    else
      token
    end
  end

  renamed.join
end
|
1661
|
+
|
1662
|
+
end # module Linguistics::EN
|
1663
|
+
|
1664
|
+
|
1665
|
+
### Add the #separate and #separate! methods to Array.
|
1666
|
+
class Array # :nodoc:

  ### Returns a copy of the receiver with a separator inserted between each
  ### pair of neighbouring members. The separator is +value+, or, when a
  ### block is given, the block's result for that pair of members.
  def separate( value=:__no_arg__, &block )
    dup.separate!( value, &block )
  end

  ### In-place version of #separate. Raises an ArgumentError when neither a
  ### +value+ nor a block is supplied.
  def separate!( value=:__no_arg__ )
    if value == :__no_arg__ && !block_given?
      raise ArgumentError, "wrong number of arguments: (0 for 1)"
    end

    # Separators end up at the odd indexes; the upper bound is fixed from
    # the length before anything is inserted.
    last_slot = (length * 2) - 2
    1.step( last_slot, 2 ) do |slot|
      filler = block_given? ? yield( self[slot - 1, 2] ) : value
      insert( slot, filler )
    end

    self
  end

end
|
1694
|
+
|