Linguistics 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1694 @@
1
+ #!/usr/bin/ruby
2
+ #
3
+ # = Linguistics::EN
4
+ #
5
+ # This module contains English-language linguistic functions for the Linguistics
6
+ # module. It can be either loaded directly, or by passing some variant of 'en'
7
+ # or 'eng' to the Linguistics::use method.
8
+ #
9
+ # The functions contained by the module provide:
10
+ #
11
+ # == Plural Inflections
12
+ #
13
+ # Plural forms of all nouns, most verbs, and some adjectives are provided. Where
14
+ # appropriate, "classical" variants (for example: "brother" -> "brethren",
15
+ # "dogma" -> "dogmata", etc.) are also provided.
16
+ #
17
+ # These can be accessed via the #plural, #plural_noun, #plural_verb, and
18
+ # #plural_adjective methods.
19
+ #
20
+ # == Indefinite Articles
21
+ #
22
+ # Pronunciation-based "a"/"an" selection is provided for all English words, and
23
+ # most initialisms.
24
+ #
25
+ # See: #a, #an, and #no.
26
+ #
27
+ # == Numbers to Words
28
+ #
29
+ # Conversion from Numeric values to words is supported using the American
30
+ # "thousands" system. E.g., 2561 => "two thousand, five hundred and sixty-one".
31
+ #
32
+ # See the #numwords method.
33
+ #
34
+ # == Ordinals
35
+ #
36
+ # It is also possible to inflect numerals (1,2,3) and number words ("one",
37
+ # "two", "three") to ordinals (1st, 2nd, 3rd) and ordinates ("first", "second",
38
+ # "third").
39
+ #
40
+ # == Conjunctions
41
+ #
42
+ # This module also supports the creation of English conjunctions from Arrays of
43
+ # Strings or objects which respond to the #to_s message. Eg.,
44
+ #
45
+ # %w{cow pig chicken cow dog cow duck duck moose}.en.conjunction
46
+ # ==> "three cows, two ducks, a pig, a chicken, a dog, and a moose"
47
+ #
48
+ # == Infinitives
49
+ #
50
+ # Returns the infinitive form of English verbs:
51
+ #
52
+ # "dodging".en.infinitive
53
+ # ==> "dodge"
54
+ #
55
+ #
56
+ # == Authors
57
+ #
58
+ # * Michael Granger <ged@FaerieMUD.org>
59
+ #
60
+ # == Copyright
61
+ #
62
+ # This module is copyright (c) 2003-2005 The FaerieMUD Consortium. All rights
63
+ # reserved.
64
+ #
65
+ # This module is free software. You may use, modify, and/or redistribute this
66
+ # software under the terms of the Perl Artistic License. (See
67
+ # http://language.perl.com/misc/Artistic.html)
68
+ #
69
+ # The inflection functions of this module were adapted from Damian Conway's
70
+ # Lingua::EN::Inflect Perl module:
71
+ #
72
+ # Copyright (c) 1997-2000, Damian Conway. All Rights Reserved.
73
+ # This module is free software. It may be used, redistributed
74
+ # and/or modified under the same terms as Perl itself.
75
+ #
76
+ # The conjunctions code was adapted from the Lingua::Conjunction Perl module
77
+ # written by Robert Rothenberg and Damian Conway, which has no copyright
78
+ # statement included.
79
+ #
80
+ # == Version
81
+ #
82
+ # $Id: en.rb,v 1.8 2003/09/14 10:47:12 deveiant Exp $
83
+ #
84
+
85
+
86
+ ### This module contains English-language linguistics functions accessible from
87
+ ### the Linguistics module, or as a standalone function library.
88
+ module Linguistics::EN
89
+
90
+ begin
91
+ require 'crosscase'
92
+ rescue LoadError
93
+ else
94
+ include CrossCase
95
+ end
96
+
97
+ # Load in the secondary modules and add them to Linguistics::EN.
98
+ require 'linguistics/en/infinitive'
99
+ require 'linguistics/en/wordnet'
100
+ require 'linguistics/en/linkparser'
101
+
102
+ # Subversion revision
103
+ SVNRev = %q$Rev$
104
+
105
+ # Subversion revision tag
106
+ SVNId = %q$Id: en.rb,v 1.8 2003/09/14 10:47:12 deveiant Exp $
107
+
108
+ # Add 'english' to the list of default languages
109
+ Linguistics::DefaultLanguages.push( :en )
110
+
111
+
112
+ #################################################################
113
+ ### U T I L I T Y F U N C T I O N S
114
+ #################################################################
115
+
116
+ ### Wrap one or more parts in a non-capturing alternation group (returned as a pattern String)
117
def self::matchgroup( *parts )
  # Nested Arrays are flattened first, so callers may freely mix single
  # patterns and lists of patterns in one call.
  alternatives = parts.flatten

  # The result is a pattern *String* (not a Regexp) meant to be interpolated
  # into larger regular expressions.
  "(?:" + alternatives.join("|") + ")"
end
121
+
122
+
123
+ #################################################################
124
+ ### C O N S T A N T S
125
+ #################################################################
126
+
127
+ # :stopdoc:
128
+
129
+ #
130
+ # Plurals
131
+ #
132
+
133
+ PL_sb_irregular_s = {
134
+ "ephemeris" => "ephemerides",
135
+ "iris" => "irises|irides",
136
+ "clitoris" => "clitorises|clitorides",
137
+ "corpus" => "corpuses|corpora",
138
+ "opus" => "opuses|opera",
139
+ "genus" => "genera",
140
+ "mythos" => "mythoi",
141
+ "penis" => "penises|penes",
142
+ "testis" => "testes",
143
+ }
144
+
145
+ PL_sb_irregular_h = {
146
+ "child" => "children",
147
+ "brother" => "brothers|brethren",
148
+ "loaf" => "loaves",
149
+ "hoof" => "hoofs|hooves",
150
+ "beef" => "beefs|beeves",
151
+ "money" => "monies",
152
+ "mongoose" => "mongooses",
153
+ "ox" => "oxen",
154
+ "cow" => "cows|kine",
155
+ "soliloquy" => "soliloquies",
156
+ "graffito" => "graffiti",
157
+ "prima donna" => "prima donnas|prime donne",
158
+ "octopus" => "octopuses|octopodes",
159
+ "genie" => "genies|genii",
160
+ "ganglion" => "ganglions|ganglia",
161
+ "trilby" => "trilbys",
162
+ "turf" => "turfs|turves",
163
+ }.update( PL_sb_irregular_s )
164
+ PL_sb_irregular = matchgroup PL_sb_irregular_h.keys
165
+
166
+
167
+ # Classical "..a" -> "..ata"
168
+ PL_sb_C_a_ata = matchgroup %w[
169
+ anathema bema carcinoma charisma diploma
170
+ dogma drama edema enema enigma lemma
171
+ lymphoma magma melisma miasma oedema
172
+ sarcoma schema soma stigma stoma trauma
173
+ gumma pragma
174
+ ].collect {|word| word[0...-1]}
175
+
176
+ # Unconditional "..a" -> "..ae"
177
+ PL_sb_U_a_ae = matchgroup %w[
178
+ alumna alga vertebra persona
179
+ ]
180
+
181
+ # Classical "..a" -> "..ae"
182
+ PL_sb_C_a_ae = matchgroup %w[
183
+ amoeba antenna formula hyperbola
184
+ medusa nebula parabola abscissa
185
+ hydra nova lacuna aurora .*umbra
186
+ flora fauna
187
+ ]
188
+
189
+ # Classical "..en" -> "..ina"
190
+ PL_sb_C_en_ina = matchgroup %w[
191
+ stamen foramen lumen
192
+ ].collect {|word| word[0...-2] }
193
+
194
+ # Unconditional "..um" -> "..a"
195
+ PL_sb_U_um_a = matchgroup %w[
196
+ bacterium agendum desideratum erratum
197
+ stratum datum ovum extremum
198
+ candelabrum
199
+ ].collect {|word| word[0...-2] }
200
+
201
+ # Classical "..um" -> "..a"
202
# Stems (word minus the trailing "um") of nouns whose classical plural swaps
# "..um" for "..a". The correctly-spelled "encomium" and "millennium" are
# listed alongside the historical misspellings "enconium" and "millenium",
# which are kept so that input which matched before still matches.
PL_sb_C_um_a = matchgroup %w[
  maximum minimum momentum optimum
  quantum cranium curriculum dictum
  phylum aquarium compendium emporium
  encomium enconium gymnasium honorarium interregnum
  lustrum memorandum millennium millenium rostrum
  spectrum speculum stadium trapezium
  ultimatum medium vacuum velum
  consortium
].collect {|word| word[0...-2]}
212
+
213
+ # Unconditional "..us" -> "i"
214
+ PL_sb_U_us_i = matchgroup %w[
215
+ alumnus alveolus bacillus bronchus
216
+ locus nucleus stimulus meniscus
217
+ ].collect {|word| word[0...-2]}
218
+
219
+ # Classical "..us" -> "..i"
220
+ PL_sb_C_us_i = matchgroup %w[
221
+ focus radius genius
222
+ incubus succubus nimbus
223
+ fungus nucleolus stylus
224
+ torus umbilicus uterus
225
+ hippopotamus
226
+ ].collect {|word| word[0...-2]}
227
+
228
+ # Classical "..us" -> "..us" (assimilated 4th declension latin nouns)
229
+ PL_sb_C_us_us = matchgroup %w[
230
+ status apparatus prospectus sinus
231
+ hiatus impetus plexus
232
+ ]
233
+
234
+ # Unconditional "..on" -> "a"
235
+ PL_sb_U_on_a = matchgroup %w[
236
+ criterion perihelion aphelion
237
+ phenomenon prolegomenon noumenon
238
+ organon asyndeton hyperbaton
239
+ ].collect {|word| word[0...-2]}
240
+
241
+ # Classical "..on" -> "..a"
242
+ PL_sb_C_on_a = matchgroup %w[
243
+ oxymoron
244
+ ].collect {|word| word[0...-2]}
245
+
246
+ # Classical "..o" -> "..i" (but normally -> "..os")
247
+ PL_sb_C_o_i_a = %w[
248
+ solo soprano basso alto
249
+ contralto tempo piano
250
+ ]
251
+ PL_sb_C_o_i = matchgroup PL_sb_C_o_i_a.collect{|word| word[0...-1]}
252
+
253
+ # Always "..o" -> "..os"
254
+ PL_sb_U_o_os = matchgroup( %w[
255
+ albino archipelago armadillo
256
+ commando crescendo fiasco
257
+ ditto dynamo embryo
258
+ ghetto guano inferno
259
+ jumbo lumbago magneto
260
+ manifesto medico octavo
261
+ photo pro quarto
262
+ canto lingo generalissimo
263
+ stylo rhino
264
+ ] | PL_sb_C_o_i_a )
265
+
266
+
267
+ # Unconditional "..[ei]x" -> "..ices"
268
+ PL_sb_U_ex_ices = matchgroup %w[
269
+ codex murex silex
270
+ ].collect {|word| word[0...-2]}
271
+ PL_sb_U_ix_ices = matchgroup %w[
272
+ radix helix
273
+ ].collect {|word| word[0...-2]}
274
+
275
+ # Classical "..[ei]x" -> "..ices"
276
+ PL_sb_C_ex_ices = matchgroup %w[
277
+ vortex vertex cortex latex
278
+ pontifex apex index simplex
279
+ ].collect {|word| word[0...-2]}
280
+ PL_sb_C_ix_ices = matchgroup %w[
281
+ appendix
282
+ ].collect {|word| word[0...-2]}
283
+
284
+
285
+ # Arabic: ".." -> "..i"
286
+ PL_sb_C_i = matchgroup %w[
287
+ afrit afreet efreet
288
+ ]
289
+
290
+
291
+ # Hebrew: ".." -> "..im"
292
+ PL_sb_C_im = matchgroup %w[
293
+ goy seraph cherub
294
+ ]
295
+
296
+ # Unconditional "..man" -> "..mans"
297
+ PL_sb_U_man_mans = matchgroup %w[
298
+ human
299
+ Alabaman Bahaman Burman German
300
+ Hiroshiman Liman Nakayaman Oklahoman
301
+ Panaman Selman Sonaman Tacoman Yakiman
302
+ Yokohaman Yuman
303
+ ]
304
+
305
+
306
+ PL_sb_uninflected_s = [
307
+ # Pairs or groups subsumed to a singular...
308
+ "breeches", "britches", "clippers", "gallows", "hijinks",
309
+ "headquarters", "pliers", "scissors", "testes", "herpes",
310
+ "pincers", "shears", "proceedings", "trousers",
311
+
312
+ # Unassimilated Latin 4th declension
313
+ "cantus", "coitus", "nexus",
314
+
315
+ # Recent imports...
316
+ "contretemps", "corps", "debris",
317
+ ".*ois",
318
+
319
+ # Diseases
320
+ ".*measles", "mumps",
321
+
322
+ # Miscellaneous others...
323
+ "diabetes", "jackanapes", "series", "species", "rabies",
324
+ "chassis", "innings", "news", "mews",
325
+ ]
326
+
327
+
328
+ # Don't inflect in classical mode, otherwise normal inflection
329
+ PL_sb_uninflected_herd = matchgroup %w[
330
+ wildebeest swine eland bison buffalo
331
+ elk moose rhinoceros
332
+ ]
333
+
334
+ PL_sb_uninflected = matchgroup [
335
+
336
+ # Some fish and herd animals
337
+ ".*fish", "tuna", "salmon", "mackerel", "trout",
338
+ "bream", "sea[- ]bass", "carp", "cod", "flounder", "whiting",
339
+
340
+ ".*deer", ".*sheep",
341
+
342
+ # All nationals ending in -ese
343
+ "Portuguese", "Amoyese", "Borghese", "Congoese", "Faroese",
344
+ "Foochowese", "Genevese", "Genoese", "Gilbertese", "Hottentotese",
345
+ "Kiplingese", "Kongoese", "Lucchese", "Maltese", "Nankingese",
346
+ "Niasese", "Pekingese", "Piedmontese", "Pistoiese", "Sarawakese",
347
+ "Shavese", "Vermontese", "Wenchowese", "Yengeese",
348
+ ".*[nrlm]ese",
349
+
350
+ # Some words ending in ...s (often pairs taken as a whole)
351
+ PL_sb_uninflected_s,
352
+
353
+ # Diseases
354
+ ".*pox",
355
+
356
+ # Other oddities
357
+ "graffiti", "djinn"
358
+ ]
359
+
360
+
361
+ # Singular words ending in ...s (all inflect with ...es)
362
+ PL_sb_singular_s = matchgroup %w[
363
+ .*ss
364
+ acropolis aegis alias arthritis asbestos atlas
365
+ bathos bias bronchitis bursitis caddis cannabis
366
+ canvas chaos cosmos dais digitalis encephalitis
367
+ epidermis ethos eyas gas glottis hepatitis
368
+ hubris ibis lens mantis marquis metropolis
369
+ neuritis pathos pelvis polis rhinoceros
370
+ sassafras tonsillitis trellis .*us
371
+ ]
372
+
373
+ PL_v_special_s = matchgroup [
374
+ PL_sb_singular_s,
375
+ PL_sb_uninflected_s,
376
+ PL_sb_irregular_s.keys,
377
+ '(.*[csx])is',
378
+ '(.*)ceps',
379
+ '[A-Z].*s',
380
+ ]
381
+
382
+ PL_sb_postfix_adj = '(' + {
383
+
384
+ 'general' => ['(?!major|lieutenant|brigadier|adjutant)\S+'],
385
+ 'martial' => ["court"],
386
+
387
+ }.collect {|key,val|
388
+ matchgroup( matchgroup(val) + "(?=(?:-|\\s+)#{key})" )
389
+ }.join("|") + ")(.*)"
390
+
391
+
392
+ PL_sb_military = %r'major|lieutenant|brigadier|adjutant|quartermaster'
393
+ PL_sb_general = %r'((?!#{PL_sb_military.source}).*?)((-|\s+)general)'
394
+
395
+ PL_prep = matchgroup %w[
396
+ about above across after among around at athwart before behind
397
+ below beneath beside besides between betwixt beyond but by
398
+ during except for from in into near of off on onto out over
399
+ since till to under until unto upon with
400
+ ]
401
+
402
+ PL_sb_prep_dual_compound = %r'(.*?)((?:-|\s+)(?:#{PL_prep}|d[eu])(?:-|\s+))a(?:-|\s+)(.*)'
403
+ PL_sb_prep_compound = %r'(.*?)((-|\s+)(#{PL_prep}|d[eu])((-|\s+)(.*))?)'
404
+
405
+
406
+ PL_pron_nom_h = {
407
+ # Nominative Reflexive
408
+ "i" => "we", "myself" => "ourselves",
409
+ "you" => "you", "yourself" => "yourselves",
410
+ "she" => "they", "herself" => "themselves",
411
+ "he" => "they", "himself" => "themselves",
412
+ "it" => "they", "itself" => "themselves",
413
+ "they" => "they", "themself" => "themselves",
414
+
415
+ # Possessive
416
+ "mine" => "ours",
417
+ "yours" => "yours",
418
+ "hers" => "theirs",
419
+ "his" => "theirs",
420
+ "its" => "theirs",
421
+ "theirs" => "theirs",
422
+ }
423
+ PL_pron_nom = matchgroup PL_pron_nom_h.keys
424
+
425
+ PL_pron_acc_h = {
426
+ # Accusative Reflexive
427
+ "me" => "us", "myself" => "ourselves",
428
+ "you" => "you", "yourself" => "yourselves",
429
+ "her" => "them", "herself" => "themselves",
430
+ "him" => "them", "himself" => "themselves",
431
+ "it" => "them", "itself" => "themselves",
432
+ "them" => "them", "themself" => "themselves",
433
+ }
434
+ PL_pron_acc = matchgroup PL_pron_acc_h.keys
435
+
436
+ PL_v_irregular_pres_h = {
437
+ # 1St pers. sing. 2nd pers. sing. 3rd pers. singular
438
+ # 3rd pers. (indet.)
439
+ "am" => "are", "are" => "are", "is" => "are",
440
+ "was" => "were", "were" => "were", "was" => "were",
441
+ "have" => "have", "have" => "have", "has" => "have",
442
+ }
443
+ PL_v_irregular_pres = matchgroup PL_v_irregular_pres_h.keys
444
+
445
+ PL_v_ambiguous_pres_h = {
446
+ # 1st pers. sing. 2nd pers. sing. 3rd pers. singular
447
+ # 3rd pers. (indet.)
448
+ "act" => "act", "act" => "act", "acts" => "act",
449
+ "blame" => "blame", "blame" => "blame", "blames" => "blame",
450
+ "can" => "can", "can" => "can", "can" => "can",
451
+ "must" => "must", "must" => "must", "must" => "must",
452
+ "fly" => "fly", "fly" => "fly", "flies" => "fly",
453
+ "copy" => "copy", "copy" => "copy", "copies" => "copy",
454
+ "drink" => "drink", "drink" => "drink", "drinks" => "drink",
455
+ "fight" => "fight", "fight" => "fight", "fights" => "fight",
456
+ "fire" => "fire", "fire" => "fire", "fires" => "fire",
457
+ "like" => "like", "like" => "like", "likes" => "like",
458
+ "look" => "look", "look" => "look", "looks" => "look",
459
+ "make" => "make", "make" => "make", "makes" => "make",
460
+ "reach" => "reach", "reach" => "reach", "reaches" => "reach",
461
+ "run" => "run", "run" => "run", "runs" => "run",
462
+ "sink" => "sink", "sink" => "sink", "sinks" => "sink",
463
+ "sleep" => "sleep", "sleep" => "sleep", "sleeps" => "sleep",
464
+ "view" => "view", "view" => "view", "views" => "view",
465
+ }
466
+ PL_v_ambiguous_pres = matchgroup PL_v_ambiguous_pres_h.keys
467
+
468
+ PL_v_irregular_non_pres = matchgroup %w[
469
+ did had ate made put
470
+ spent fought sank gave sought
471
+ shall could ought should
472
+ ]
473
+
474
+ PL_v_ambiguous_non_pres = matchgroup %w[
475
+ thought saw bent will might cut
476
+ ]
477
+
478
+ PL_count_zero = matchgroup %w[
479
+ 0 no zero nil
480
+ ]
481
+
482
+ PL_count_one = matchgroup %w[
483
+ 1 a an one each every this that
484
+ ]
485
+
486
+ PL_adj_special_h = {
487
+ "a" => "some", "an" => "some",
488
+ "this" => "these", "that" => "those",
489
+ }
490
+ PL_adj_special = matchgroup PL_adj_special_h.keys
491
+
492
+ PL_adj_poss_h = {
493
+ "my" => "our",
494
+ "your" => "your",
495
+ "its" => "their",
496
+ "her" => "their",
497
+ "his" => "their",
498
+ "their" => "their",
499
+ }
500
+ PL_adj_poss = matchgroup PL_adj_poss_h.keys
501
+
502
+
503
+ #
504
+ # Numerals, ordinals, and numbers-to-words
505
+ #
506
+
507
+ # Numerical inflections
508
+ Nth = {
509
+ 0 => 'th',
510
+ 1 => 'st',
511
+ 2 => 'nd',
512
+ 3 => 'rd',
513
+ 4 => 'th',
514
+ 5 => 'th',
515
+ 6 => 'th',
516
+ 7 => 'th',
517
+ 8 => 'th',
518
+ 9 => 'th',
519
+ 11 => 'th',
520
+ 12 => 'th',
521
+ 13 => 'th',
522
+ }
523
+
524
+ # Ordinal word parts
525
+ Ordinals = {
526
+ 'ty' => 'tieth',
527
+ 'one' => 'first',
528
+ 'two' => 'second',
529
+ 'three' => 'third',
530
+ 'five' => 'fifth',
531
+ 'eight' => 'eighth',
532
+ 'nine' => 'ninth',
533
+ 'twelve' => 'twelfth',
534
+ }
535
+ OrdinalSuffixes = Ordinals.keys.join("|") + "|"
536
+ Ordinals[""] = 'th'
537
+
538
+ # Numeral names
539
+ Units = [''] + %w[one two three four five six seven eight nine]
540
+ Teens = %w[ten eleven twelve thirteen fourteen
541
+ fifteen sixteen seventeen eighteen nineteen]
542
+ Tens = ['',''] + %w[twenty thirty forty fifty sixty seventy eighty ninety]
543
+ Thousands = [' ', ' thousand'] + %w[
544
+ m b tr quadr quint sext sept oct non dec undec duodec tredec
545
+ quattuordec quindec sexdec septemdec octodec novemdec vigint
546
+ ].collect {|prefix| ' ' + prefix + 'illion'}
547
+
548
+ # A collection of functions for transforming digits into word
549
+ # phrases. Indexed by the number of digits being transformed; e.g.,
550
+ # <tt>NumberToWordsFunctions[2]</tt> is the function for transforming
551
+ # double-digit numbers.
552
+ NumberToWordsFunctions = [
553
+ proc {|*args| raise "No digits (#{args.inspect})"},
554
+
555
+ # Single-digits
556
+ proc {|zero,x|
557
+ (x.nonzero? ? to_units(x) : "#{zero} ")
558
+ },
559
+
560
+ # Double-digits
561
+ proc {|zero,x,y|
562
+ if x.nonzero?
563
+ to_tens( x, y )
564
+ elsif y.nonzero?
565
+ "#{zero} " + NumberToWordsFunctions[1].call( zero, y )
566
+ else
567
+ ([zero] * 2).join(" ")
568
+ end
569
+ },
570
+
571
+ # Triple-digits
572
+ proc {|zero,x,y,z|
573
+ NumberToWordsFunctions[1].call(zero,x) +
574
+ NumberToWordsFunctions[2].call(zero,y,z)
575
+ }
576
+ ]
577
+
578
+
579
+ #
580
+ # Indefinite Articles
581
+ #
582
+
583
+ # This pattern matches strings of capitals starting with a "vowel-sound"
584
+ # consonant followed by another consonant, and which are not likely
585
+ # to be real words (oh, all right then, it's just magic!)
586
+ A_abbrev = %{
587
+ (?! FJO | [HLMNS]Y. | RY[EO] | SQU
588
+ | ( F[LR]? | [HL] | MN? | N | RH? | S[CHKLMNPTVW]? | X(YL)?) [AEIOU])
589
+ [FHLMNRSX][A-Z]
590
+ }
591
+
592
+ # This pattern codes the beginnings of all English words beginning with a
593
+ # 'y' followed by a consonant. Any other y-consonant prefix therefore
594
+ # implies an abbreviation.
595
+ A_y_cons = 'y(b[lor]|cl[ea]|fere|gg|p[ios]|rou|tt)'
596
+
597
+ # Exceptions to exceptions
598
+ A_explicit_an = matchgroup( "euler", "hour(?!i)", "heir", "honest", "hono" )
599
+
600
+
601
+ #
602
+ # Configuration defaults
603
+ #
604
+
605
+ # Default configuration arguments for the #numwords function
606
+ NumwordDefaults = {
607
+ :group => 0,
608
+ :comma => ', ',
609
+ :and => ' and ',
610
+ :zero => 'zero',
611
+ :decimal => 'point',
612
+ :asArray => false,
613
+ }
614
+
615
+ # Default ranges for #quantify
616
+ SeveralRange = 2..5
617
+ NumberRange = 6..19
618
+ NumerousRange = 20..45
619
+ ManyRange = 46..99
620
+
621
+ # Default configuration arguments for the #quantify function
622
+ QuantifyDefaults = {
623
+ :joinword => " of ",
624
+ }
625
+
626
+ # Default configuration arguments for the #conjunction (junction, what's
627
+ # your) function.
628
+ ConjunctionDefaults = {
629
+ :separator => ', ',
630
+ :altsep => '; ',
631
+ :penultimate => true,
632
+ :conjunctive => 'and',
633
+ :combine => true,
634
+ :casefold => true,
635
+ :generalize => false,
636
+ :quantsort => true,
637
+ }
638
+
639
+
640
+ #
641
+ # Title case
642
+ #
643
+
644
+ # "In titles, capitalize the first word, the last word, and all words in
645
+ # between except articles (a, an, and the), prepositions under five letters
646
+ # (in, of, to), and coordinating conjunctions (and, but). These rules apply
647
+ # to titles of long, short, and partial works as well as your own papers"
648
+ # (Anson, Schwegler, and Muth. The Longman Writer's Companion 240).
649
+
650
+ # Build the list of exceptions to title-capitalization
651
# The English articles. ("and" is a conjunction, not an article; it is still
# covered by CoordConjunctions below, while "an" used to be missing entirely.)
Articles = %w[a an the]

# Prepositions of fewer than five letters, which stay lowercase in titles.
ShortPrepositions = ["amid", "at", "but", "by", "down", "from", "in",
  "into", "like", "near", "of", "off", "on", "onto", "out", "over",
  "past", "save", "with", "till", "to", "unto", "up", "upon"]

# Coordinating conjunctions that stay lowercase in titles.
CoordConjunctions = %w[and but as]

# Union (duplicates removed) of every word exempt from title-capitalization.
TitleCaseExceptions = Articles | ShortPrepositions | CoordConjunctions
657
+
658
+
659
+ # :startdoc:
660
+
661
+ #################################################################
662
+ ### " B A C K E N D " F U N C T I O N S
663
+ #################################################################
664
+
665
+
666
+ ###############
667
+ module_function
668
+ ###############
669
+
670
+ ### Debugging output
671
def debugMsg( *msgs ) # :nodoc:
  # Stay silent unless the interpreter's $DEBUG flag is set.
  return nil unless $DEBUG

  # All message parts go out on one line, separated by single spaces.
  $stderr.puts( msgs.join(" ") )
end
674
+
675
+
676
+ ### Normalize a count to either 1 or 2 (singular or plural)
677
def normalizeCount( count, default=2 )
  return default if count.nil? # Default to plural

  numeral = count.to_s

  # Anything that reads as "one" (1, a, an, one, each, ...) is singular.
  return 1 if /^(#{PL_count_one})$/i =~ numeral

  # In classical mode a zero count ("no", "zero", "0", "nil") also takes the
  # singular form. The match is case-insensitive, exactly like the singular
  # test above, so that "No"/"Zero" behave the same as "no"/"zero".
  return 1 if Linguistics::classical? && /^(#{PL_count_zero})$/i =~ numeral

  # Everything else falls back to the caller-supplied default.
  return default
end
687
+
688
+
689
+ ### Do normal/classical switching and match capitalization in <tt>inflected</tt> by
690
+ ### examining the <tt>original</tt> input.
691
def postprocess( original, inflected )
  # Work on a private copy. The pluralize_* helpers may hand back Strings
  # that live inside the constant lookup tables (e.g. the pronoun hashes);
  # editing those in place would corrupt the tables for every later call,
  # and would raise outright on frozen Strings.
  result = inflected.dup

  # A "normal|classical" pair collapses to whichever variant is active.
  result.sub!( /([^|]+)\|(.+)/ ) {
    Linguistics::classical? ? $2 : $1
  }

  case original
  when "I"
    # The pronoun "I" is always capitalised; it says nothing about how the
    # inflected form should be cased.
    return result
  when /^[A-Z]+$/
    # ALL-CAPS input => ALL-CAPS output
    return result.upcase
  when /^[A-Z]/
    # Can't use #capitalize, as it will downcase the rest of the string,
    # too.
    result[0,1] = result[0,1].upcase
    return result
  else
    return result
  end
end
710
+
711
+
712
+ ### Pluralize nouns
713
def pluralize_noun( word, count=nil )
  # Returns the plural of +word+, or +word+ unchanged when +count+ normalizes
  # to singular. Irregular entries may come back as a "normal|classical"
  # pair straight from the lookup tables (e.g. "cows|kine").
  # NOTE(review): the pair is presumably resolved by #postprocess in the
  # caller -- the caller is outside this chunk, confirm.
  value = nil
  # No explicit count: fall back to Linguistics::num (defined outside this
  # chunk), then reduce whatever we have to 1 (singular) or 2 (plural).
  count ||= Linguistics::num
  count = normalizeCount( count )

  return word if count == 1

  # Handle user-defined nouns
  #if value = ud_match( word, PL_sb_user_defined )
  #  return value
  #end

  # Handle empty word, singular count and uninflected plurals
  case word
  when ''
    return word
  when /^(#{PL_sb_uninflected})$/i
    return word
  else
    # Herd animals ("bison", "elk", ...) stay uninflected only in classical mode.
    if Linguistics::classical? &&
      /^(#{PL_sb_uninflected_herd})$/i =~ word
      return word
    end
  end

  # Handle compounds ("Governor General", "mother-in-law", "aide-de-camp", etc.)
  # The order of the +when+ clauses below is significant: the first pattern
  # that matches wins.
  case word
  when /^(?:#{PL_sb_postfix_adj})$/i
    # Pluralize the head noun and re-attach the postfix adjective unchanged.
    value = $2
    return pluralize_noun( $1, 2 ) + value

  when /^(?:#{PL_sb_prep_dual_compound})$/i
    # The captures are saved before recursing, since the recursive call
    # performs further regexp matches.
    value = [ $2, $3 ]
    return pluralize_noun( $1, 2 ) + value[0] + pluralize_noun( value[1] )

  when /^(?:#{PL_sb_prep_compound})$/i
    # Only the part before the preposition is pluralized.
    value = $2
    return pluralize_noun( $1, 2 ) + value

  # Handle pronouns
  when /^((?:#{PL_prep})\s+)(#{PL_pron_acc})$/i
    # Preposition + accusative pronoun, e.g. "to him" => "to them"
    return $1 + PL_pron_acc_h[ $2.downcase ]

  when /^(#{PL_pron_nom})$/i
    return PL_pron_nom_h[ word.downcase ]

  when /^(#{PL_pron_acc})$/i
    return PL_pron_acc_h[ $1.downcase ]

  # Handle isolated irregular plurals
  when /(.*)\b(#{PL_sb_irregular})$/i
    return $1 + PL_sb_irregular_h[ $2.downcase ]

  when /(#{PL_sb_U_man_mans})$/i
    # "human", "German", ... take a plain "s" rather than "..men"
    return "#{$1}s"

  # Handle families of irregular plurals
  when /(.*)man$/i ; return "#{$1}men"
  when /(.*[ml])ouse$/i ; return "#{$1}ice"
  when /(.*)goose$/i ; return "#{$1}geese"
  when /(.*)tooth$/i ; return "#{$1}teeth"
  when /(.*)foot$/i ; return "#{$1}feet"

  # Handle unassimilated imports
  when /(.*)ceps$/i ; return word
  when /(.*)zoon$/i ; return "#{$1}zoa"
  when /(.*[csx])is$/i ; return "#{$1}es"
  when /(#{PL_sb_U_ex_ices})ex$/i; return "#{$1}ices"
  when /(#{PL_sb_U_ix_ices})ix$/i; return "#{$1}ices"
  when /(#{PL_sb_U_um_a})um$/i ; return "#{$1}a"
  when /(#{PL_sb_U_us_i})us$/i ; return "#{$1}i"
  when /(#{PL_sb_U_on_a})on$/i ; return "#{$1}a"
  when /(#{PL_sb_U_a_ae})$/i ; return "#{$1}e"
  end

  # Handle incompletely assimilated imports
  # (these classical forms are only produced when classical mode is on)
  if Linguistics::classical?
    case word
    when /(.*)trix$/i ; return "#{$1}trices"
    when /(.*)eau$/i ; return "#{$1}eaux"
    when /(.*)ieu$/i ; return "#{$1}ieux"
    when /(.{2,}[yia])nx$/i ; return "#{$1}nges"
    when /(#{PL_sb_C_en_ina})en$/i; return "#{$1}ina"
    when /(#{PL_sb_C_ex_ices})ex$/i; return "#{$1}ices"
    when /(#{PL_sb_C_ix_ices})ix$/i; return "#{$1}ices"
    when /(#{PL_sb_C_um_a})um$/i ; return "#{$1}a"
    when /(#{PL_sb_C_us_i})us$/i ; return "#{$1}i"
    when /(#{PL_sb_C_us_us})$/i ; return "#{$1}"
    when /(#{PL_sb_C_a_ae})$/i ; return "#{$1}e"
    when /(#{PL_sb_C_a_ata})a$/i ; return "#{$1}ata"
    when /(#{PL_sb_C_o_i})o$/i ; return "#{$1}i"
    when /(#{PL_sb_C_on_a})on$/i ; return "#{$1}a"
    when /#{PL_sb_C_im}$/i ; return "#{word}im"
    when /#{PL_sb_C_i}$/i ; return "#{word}i"
    end
  end


  # Handle singular nouns ending in ...s or other sibilants
  case word
  when /^(#{PL_sb_singular_s})$/i; return "#{$1}es"
  # Case-sensitive on purpose: a capitalised word ending in "s" gets "es".
  when /^([A-Z].*s)$/; return "#{$1}es"
  when /(.*)([cs]h|[zx])$/i ; return "#{$1}#{$2}es"
  # when /(.*)(us)$/i ; return "#{$1}#{$2}es"

  # Handle ...f -> ...ves
  when /(.*[eao])lf$/i ; return "#{$1}lves";
  when /(.*[^d])eaf$/i ; return "#{$1}eaves"
  when /(.*[nlw])ife$/i ; return "#{$1}ives"
  when /(.*)arf$/i ; return "#{$1}arves"

  # Handle ...y
  when /(.*[aeiou])y$/i ; return "#{$1}ys"
  # Case-sensitive on purpose: a capitalised word ending in "y" just gets "s".
  when /([A-Z].*y)$/ ; return "#{$1}s"
  when /(.*)y$/i ; return "#{$1}ies"

  # Handle ...o
  when /#{PL_sb_U_o_os}$/i ; return "#{word}s"
  when /[aeiou]o$/i ; return "#{word}s"
  when /o$/i ; return "#{word}es"

  # Otherwise just add ...s
  else
    return "#{word}s"
  end
end # def pluralize_noun
839
+
840
+
841
+
842
+ ### Pluralize special verbs
843
def pluralize_special_verb( word, count )
  # Reduce the count to 1 (singular) or 2 (plural); a singular count means
  # there is nothing for this method to do.
  number = normalizeCount( count || Linguistics::num )
  return nil if number.to_s =~ /^(#{PL_count_one})$/i

  # User-defined verb overrides (ud_match / PL_v_user_defined) are not wired
  # in here.

  # The value of the matching branch is the method's return value; +nil+
  # tells the caller to try another strategy.
  case word

  # Irregular present tense, simple or compound ("is", "has been", ...)
  when /^(#{PL_v_irregular_pres})((\s.*)?)$/i
    verb = Regexp.last_match( 1 )
    rest = Regexp.last_match( 2 )
    PL_v_irregular_pres_h[ verb.downcase ] + rest

  # Irregular future, preterite and perfect tenses do not inflect
  when /^(#{PL_v_irregular_non_pres})((\s.*)?)$/i
    word

  # Words that merely look like 3rd-person verbs, and any other phrase
  when /^(#{PL_v_special_s})$/, /\s/
    nil

  # Standard 3rd person: chop the ...(e)s off single words
  when /^(.*)([cs]h|[x]|zz|ss)es$/i
    Regexp.last_match( 1 ) + Regexp.last_match( 2 )
  when /^(..+)ies$/i
    Regexp.last_match( 1 ) + "y"
  when /^(.+)oes$/i
    Regexp.last_match( 1 ) + "o"
  when /^(.*[^s])s$/i
    Regexp.last_match( 1 )

  # Anything else is a regular verb, handled elsewhere
  else
    nil
  end
end
883
+
884
+
885
+ ### Pluralize regular verbs
886
def pluralize_general_verb( word, count )
  # A singular count leaves the verb exactly as given.
  number = normalizeCount( count || Linguistics::num )
  return word if number.to_s =~ /^(#{PL_count_one})$/i

  case word

  # Ambiguous present tenses, simple or compound: look up the plural form
  # of the leading verb and re-attach whatever followed it.
  when /^(#{PL_v_ambiguous_pres})((\s.*)?)$/i
    verb = Regexp.last_match( 1 )
    rest = Regexp.last_match( 2 )
    PL_v_ambiguous_pres_h[ verb.downcase ] + rest

  # Ambiguous preterite and perfect tenses do not inflect
  when /^(#{PL_v_ambiguous_non_pres})((\s.*)?)$/i
    word

  # Anything else (1st or 2nd person) is uninflected as well
  else
    word
  end
end
907
+
908
+
909
+ ### Handle special adjectives
910
def pluralize_special_adjective( word, count )
  # A singular count leaves the adjective exactly as given.
  number = normalizeCount( count || Linguistics::num )
  return word if number.to_s =~ /^(#{PL_count_one})$/i

  # User-defined adjective overrides (ud_match / PL_adj_user_defined) are
  # not wired in here.

  case word

  # Known special cases ("a" => "some", "this" => "these", ...)
  when /^(#{PL_adj_special})$/i
    PL_adj_special_h[ Regexp.last_match( 1 ).downcase ]

  # Possessive adjectives ("my" => "our", ...)
  when /^(#{PL_adj_poss})$/i
    PL_adj_poss_h[ Regexp.last_match( 1 ).downcase ]

  # Possessive nouns: pluralize the owner, then re-attach the apostrophe --
  # bare when the plural already ends in "s", "'s" otherwise.
  when /^(.*)'s?$/
    owners = plural_noun( Regexp.last_match( 1 ) )
    suffix = ( /s$/ =~ owners ) ? "'" : "'s"
    "#{owners}#{suffix}"

  # Otherwise, no idea
  else
    nil
  end
end
944
+
945
+
946
+ ### Returns the given word with a prepended indefinite article, unless
947
+ ### +count+ is non-nil and not singular.
948
def indef_article( word, count )
  # A count that does not read as "one" replaces the article altogether.
  count ||= Linguistics::num
  return "#{count} #{word}" if
    count && /^(#{PL_count_one})$/i !~ count.to_s

  # User-defined variants (ud_match / A_a_user_defined) are not wired in here.

  # Pick the article first, attach it once at the end. The tests are
  # order-sensitive: the first one that matches decides.
  article =
    case word

    # Special cases ("hour", "heir", "honest", ...)
    when /^(#{A_explicit_an})/i then "an"

    # Abbreviations and initialisms
    when /^(#{A_abbrev})/x then "an"
    when /^[aefhilmnorsx][.-]/i then "an"
    when /^[a-z][.-]/i then "a"

    # Consonants
    when /^[^aeiouy]/i then "a"

    # Special vowel-forms that are pronounced with a leading consonant sound
    when /^e[uw]/i, /^onc?e\b/i, /^uni([^nmd]|mo)/i, /^u[bcfhjkqrst][aeiou]/i
      "a"

    # Vowels
    when /^[aeiou]/i then "an"

    # y... (before certain consonants implies (unnaturalized) "i.." sound)
    when /^(#{A_y_cons})/i then "an"

    # Otherwise, guess "a"
    else "a"
    end

  "#{article} #{word}"
end
997
+
998
+
999
+ ### Transform the specified number of units-place numerals into a
1000
+ ### word-phrase at the given number of +thousands+ places.
1001
def to_units( units, thousands=0 )
  # Name of the digit itself; Units[0] is the empty string.
  digit_name = Units[ units ]

  # Magnitude word for this group (see #to_thousands).
  magnitude = to_thousands( thousands )

  digit_name + magnitude
end
1004
+
1005
+
1006
+ ### Transform the specified number of tens- and units-place numerals into a
1007
+ ### word-phrase at the given number of +thousands+ places.
1008
def to_tens( tens, units, thousands=0 )
  if tens == 1
    # 10..19 have names of their own
    Teens[ units ] + to_thousands( thousands )
  else
    # A hyphen is only wanted between a real tens word and a real units
    # word ("forty-two", but "forty" and "two").
    hyphen = ( tens.nonzero? && units.nonzero? ) ? '-' : ''
    Tens[ tens ] + hyphen + to_units( units, thousands )
  end
end
1016
+
1017
+
1018
+ ### Transform the specified number of hundreds-, tens-, and units-place
1019
+ ### numerals into a word phrase. If the number of thousands (+thousands+) is
1020
+ ### greater than 0, it will be used to determine where the decimal point is
1021
+ ### in relation to the hundreds-place number.
1022
def to_hundreds( hundreds, tens=0, units=0, thousands=0, joinword=" and " )
  # An empty join word still needs something between "hundred" and the rest.
  joinword = ' ' if joinword.empty?

  if hundreds.nonzero?
    # Only join when something follows the hundreds.
    glue = ( tens.nonzero? || units.nonzero? ) ? joinword : ''
    to_units( hundreds ) + " hundred" + glue +
      to_tens( tens, units ) +
      to_thousands( thousands )
  elsif tens.nonzero? || units.nonzero?
    to_tens( tens, units ) + to_thousands( thousands )
  else
    # An all-zero group contributes nothing.
    nil
  end
end
1035
+
1036
+ ### Transform the specified number into one or more words like 'thousand',
1037
+ ### 'million', etc. Uses the thousands (American) system.
1038
def to_thousands( thousands=0 )
  # The final entry of Thousands is reserved for the repeated "overflow"
  # word, so the table cycles over its first (length - 1) entries.
  cycle = Thousands.length - 1

  words = (0..thousands).step( cycle ).map do |offset|
    if offset.zero?
      # First pass: the word for the position within the current cycle.
      Thousands[ thousands % cycle ]
    else
      # Each further full cycle appends the largest available word.
      Thousands.last
    end
  end

  return words.join(" ")
end
1050
+
1051
+
1052
+ ### Return the specified number +num+ as an array of number phrases.
1053
def number_to_words( num, config )
  # +num+ is the digits to convert (String or Integer). +config+ is the
  # numwords option Hash; this method reads :zero, :group and :and from it.
  # Returns an Array of word-group Strings, most significant group first.
  return [config[:zero]] if num.to_i.zero?
  chunks = []

  # Break into word-groups if groups is set
  if config[:group].nonzero?

    # Build a Regexp with <config[:group]> number of digits. Any past
    # the first are optional.
    re = Regexp::new( "(\\d)" + ("(\\d)?" * (config[:group] - 1)) )

    # Scan the string, and call the word-chunk function that deals with
    # chunks of the found number of digits.
    num.to_s.scan( re ) {|digits|
      debugMsg " digits = #{digits.inspect}"
      numerals = digits.flatten.compact.collect {|i| i.to_i}

      # The optional groups that did not match come back as nil, so the
      # number of digits found is the count of non-nil captures. (This used
      # to be Array#nitems, which no longer exists as of Ruby 1.9.)
      fn = NumberToWordsFunctions[ numerals.length ]
      debugMsg " numerals = #{numerals.inspect}"
      chunks.push fn.call( config[:zero], *numerals ).strip
    }
  else
    # Work on a copy: the substitutions below consume the string, and
    # num.to_s is the caller's own object when +num+ is already a String.
    phrase = num.to_s.dup
    phrase.sub!( /\A\s*0+/, '' )
    mill = 0

    # Match backward from the end of the digits in the string, turning
    # chunks of three, of two, and of one into words. +mill+ counts how many
    # three-digit groups have been consumed so far, i.e. the thousands place
    # of the group currently being converted.
    mill += 1 while
      phrase.sub!( /(\d)(\d)(\d)(?=\D*\Z)/ ) {
        words = to_hundreds( $1.to_i, $2.to_i, $3.to_i, mill,
          config[:and] )
        chunks.unshift words.strip.squeeze(' ') unless words.nil?
        ''
      }

    phrase.sub!( /(\d)(\d)(?=\D*\Z)/ ) {
      chunks.unshift to_tens( $1.to_i, $2.to_i, mill ).strip.squeeze(' ')
      ''
    }
    phrase.sub!( /(\d)(?=\D*\Z)/ ) {
      chunks.unshift to_units( $1.to_i, mill ).strip.squeeze(' ')
      ''
    }
  end

  return chunks
end
1100
+
1101
+
1102
+ #################################################################
1103
+ ### P U B L I C F U N C T I O N S
1104
+ #################################################################
1105
+
1106
+ ### Return the name of the language this module is for.
1107
def language
  # Human-readable name of the language these inflections apply to.
  return "English"
end
1110
+
1111
+
1112
+ ### Return the plural of the given +phrase+ if +count+ indicates it should
1113
+ ### be plural.
1114
def plural( phrase, count=nil )
  # Separate the word from its surrounding whitespace so the padding can be
  # restored around the inflected form. No match means there is no word.
  parsed = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s )
  return phrase unless parsed
  lead, word, trail = parsed.captures

  # Try the special-case inflectors first; the noun inflector is the
  # catch-all.
  inflected = pluralize_special_adjective( word, count ) ||
    pluralize_special_verb( word, count ) ||
    pluralize_noun( word, count )

  return lead + postprocess( word, inflected ) + trail
end
1126
+ alias_method :PL, :plural
1127
+
1128
+
1129
+ ### Return the plural of the given noun +phrase+ if +count+ indicates it
1130
+ ### should be plural.
1131
def plural_noun( phrase, count=nil )
  # Peel off leading/trailing whitespace; with no match there is no word to
  # inflect and the phrase is handed back untouched.
  parsed = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s )
  return phrase unless parsed
  lead, word, trail = parsed.captures

  inflected = pluralize_noun( word, count )
  return lead + postprocess( word, inflected ) + trail
end
1139
+ alias_method :PL_N, :plural_noun
1140
+
1141
+
1142
+ ### Return the plural of the given verb +phrase+ if +count+ indicates it
1143
+ ### should be plural.
1144
def plural_verb( phrase, count=nil )
  # Peel off leading/trailing whitespace; with no match there is no word to
  # inflect and the phrase is handed back untouched.
  parsed = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s )
  return phrase unless parsed
  lead, word, trail = parsed.captures

  # Irregular verbs take precedence over the general verb rules.
  inflected = pluralize_special_verb( word, count ) ||
    pluralize_general_verb( word, count )

  return lead + postprocess( word, inflected ) + trail
end
1154
+ alias_method :PL_V, :plural_verb
1155
+
1156
+
1157
+ ### Return the plural of the given adjectival +phrase+ if +count+ indicates
1158
+ ### it should be plural.
1159
def plural_adjective( phrase, count=nil )
  # Peel off leading/trailing whitespace; with no match there is no word to
  # inflect and the phrase is handed back untouched.
  parsed = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s )
  return phrase unless parsed
  lead, word, trail = parsed.captures

  # Adjectives without a special plural form are used as they are.
  inflected = pluralize_special_adjective( word, count ) || word

  return lead + postprocess( word, inflected ) + trail
end
1168
+ alias_method :plural_adj, :plural_adjective
1169
+ alias_method :PL_ADJ, :plural_adjective
1170
+
1171
+
1172
+ ### Return the given phrase with the appropriate indefinite article ("a" or
1173
+ ### "an") prepended.
1174
def a( phrase, count=nil )
  # Peel off leading/trailing whitespace so the article lands directly in
  # front of the word; with no match the phrase is handed back untouched.
  parsed = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s )
  return phrase unless parsed
  lead, word, trail = parsed.captures

  return lead + indef_article( word, count ) + trail
end
1182
+ alias_method :an, :a
1183
+ alias_method :A, :a
1184
+ alias_method :AN, :a
1185
+
1186
+
1187
+ ### Translate zero-quantified +phrase+ to "no +phrase.plural+"
1188
def no( phrase, count=nil )
  # +phrase+ is the noun phrase to quantify. +count+ is the quantity; when
  # omitted it falls back to Linguistics::num and then to 0.
  # Returns "no <plural>" for a zero count, "<count> <plural>" otherwise.
  md = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s )

  # With no word in the phrase there is nothing to quantify. Hand the phrase
  # back unchanged, as the other inflection methods do, instead of failing
  # on the nil pieces further down.
  return phrase if md.nil?
  pre, word, post = md.captures
  count ||= Linguistics::num || 0

  # PL_count_zero matches the spellings of a count that mean "none".
  if /^#{PL_count_zero}$/ =~ count.to_s
    return "#{pre}no " + plural( word, 0 ) + post
  else
    return "#{pre}#{count} " + plural( word, count ) + post
  end
end
1199
+ alias_method :NO, :no
1200
+
1201
+
1202
+ ### Participles
1203
def present_participle( word )
  # +word+ is a verb (anything responding to #to_s). Returns its "-ing" form.
  #
  # Start from the plural (base) form of the verb. It is duplicated because
  # plural_verb hands back its own argument when there is no word to inflect,
  # and the substitutions below edit the string in place.
  stem = plural_verb( word.to_s, 2 ).dup

  # Apply the first spelling rule that matches; sub! returns nil when nothing
  # changed, which lets the chain fall through to the next rule.
  stem.sub!( /ie$/, 'y' ) or                 # tie   -> ty(ing)
    stem.sub!( /ue$/, 'u' ) or               # argue -> argu(ing)
    stem.sub!( /([auy])e$/, '\1' ) or        # drop an 'e' after a, u or y
    stem.sub!( /i$/, '' ) or                 # drop a final 'i'
    stem.sub!( /([^e])e$/, '\1' ) or         # make  -> mak(ing)
    /er$/.match( stem ) or                   # enter -> enter(ing), no doubling
    stem.sub!( /([^aeiou][aeiouy]([bdgmnprst]))$/, '\1\2' )  # run -> runn(ing)

  return "#{stem}ing"
end
1216
+ alias_method :part_pres, :present_participle
1217
+ alias_method :PART_PRES, :present_participle
1218
+
1219
+
1220
+
1221
+ ### Return the specified number as english words. One or more configuration
1222
+ ### values may be passed to control the returned String:
1223
+ ###
1224
+ ### [<b>:group</b>]
1225
+ ### Controls how many numbers at a time are grouped together. Valid values
1226
+ ### are +0+ (normal grouping), +1+ (single-digit grouping, e.g., "one,
1227
+ ### two, three, four"), +2+ (double-digit grouping, e.g., "twelve,
1228
+ ### thirty-four"), or +3+ (triple-digit grouping, e.g., "one twenty-three,
1229
+ ### four").
1230
+ ### [<b>:comma</b>]
1231
+ ### Set the character/s used to separate word groups. Defaults to +", "+.
1232
+ ### [<b>:and</b>]
1233
+ ### Set the word and/or characters used where ' and ' (the default) is
1234
+ ### normally used. Setting <tt>:and</tt> to +' '+, for example, will cause
1235
+ ### +2556+ to be returned as "two-thousand, five hundred fifty-six"
1236
+ ### instead of "two-thousand, five hundred and fifty-six".
1237
+ ### [<b>:zero</b>]
1238
+ ### Set the word used to represent the numeral +0+ in the result. +'zero'+
1239
+ ### is the default.
1240
+ ### [<b>:decimal</b>]
1241
+ ### Set the translation of any decimal points in the number; the default
1242
+ ### is +'point'+.
1243
+ ### [<b>:asArray</b>]
1244
+ ### If set to a true value, the number will be returned as an array of
1245
+ ### word groups instead of a String.
1246
def numwords( number, hashargs={} )
  # +number+ is the value to spell out (Numeric or String, optionally with a
  # sign and/or an ordinal suffix). +hashargs+ overrides NumwordDefaults.
  # Returns a String, or an Array of word groups when :asArray is set.
  # Raises RuntimeError if :group is outside 0..3.
  #
  # Work on a private copy: the suffix stripping and digit filtering below
  # edit the string in place, and number.to_s is the caller's own object
  # when +number+ is already a String.
  num = number.to_s.dup
  config = NumwordDefaults.dup.update( hashargs )
  raise "Bad chunking option: #{config[:group]}" unless
    config[:group].between?( 0, 3 )

  # Array of number parts: first is everything to the left of the first
  # decimal, followed by any groups of decimal-delimited numbers after that
  parts = []

  # Wordify any sign prefix
  sign = (/\A\s*\+/ =~ num) ? 'plus' : (/\A\s*\-/ =~ num) ? 'minus' : ''

  # Strip any ordinal suffixes
  ord = true if num.sub!( /(st|nd|rd|th)\Z/, '' )

  # Split the number into chunks delimited by '.'
  chunks =
    if config[:decimal].empty?
      [ num ]
    elsif config[:group].nonzero?
      num.split( /\./ )
    else
      num.split( /\./, 2 )
    end

  # String#split yields no pieces at all for "" and ".". Treat that as an
  # empty whole-number part so parts[0] always exists below.
  chunks = [ num ] if chunks.empty?

  # Wordify each chunk, pushing arrays into the parts array
  chunks.each_with_index {|chunk,section|
    chunk.gsub!( /\D+/, '' )

    # An empty whole-number part contributes an empty array. Empty chunks
    # after it fall through and are wordified like any other chunk.
    if chunk.empty? && section.zero?
      parts.push []
      next
    end

    # Split the number section into wordified parts unless this is the
    # second or succeeding part of a non-group number, in which case the
    # digits after the decimal are read out one at a time.
    if config[:group].zero? && section.nonzero?
      parts.push number_to_words( chunk, config.dup.update(:group => 1) )
    else
      parts.push number_to_words( chunk, config )
    end
  }

  debugMsg "Parts => #{parts.inspect}"

  # Turn the last word of the whole-number part back into an ordinal if
  # the original number came in that way.
  if ord && !parts[0].empty?
    parts[0][-1] = ordinal( parts[0].last )
  end

  # If the caller's expecting an Array return, just flatten and return the
  # parts array.
  if config[:asArray]
    parts[0].unshift( sign ) unless sign.empty?
    return parts.flatten
  end

  # Catenate each sub-parts array into a whole number part and one or more
  # post-decimal parts. If grouping is turned on, all sub-parts get joined
  # with commas, otherwise just the whole-number part is.
  if config[:group].zero?
    # Count of non-nil word groups (formerly Array#nitems, which no longer
    # exists as of Ruby 1.9).
    if parts[0].compact.length > 1

      # Join all but the last part together with commas
      wholenum = parts[0][0...-1].join( config[:comma] )

      # If the last part is just a single word, append it to the
      # wholenum part with an 'and'. This is to get things like 'three
      # thousand and three' instead of 'three thousand, three'.
      if /^\s*(\S+)\s*$/ =~ parts[0].last
        wholenum += " and #{parts[0].last}"
      else
        wholenum += config[:comma] + parts[0].last
      end
    else
      wholenum = parts[0][0]
    end
    decimals = parts[1..-1].collect {|part| part.join(" ")}

    debugMsg "Wholenum: #{wholenum.inspect}; decimals: #{decimals.inspect}"

    # Join with the configured decimal; if it's empty, just join with
    # spaces.
    joiner = config[:decimal].empty? ? " " : " #{config[:decimal]} "
    spoken = ([ wholenum ] + decimals).join( joiner ).strip

    # Keep the sign word apart from the number ("minus five", not
    # "minusfive"), and emit no stray space when either side is empty.
    return [ sign, spoken ].reject {|piece| piece.empty? }.join( " " )
  else
    # NOTE(review): the sign word is not included in grouped output; that is
    # left as it was -- confirm whether it is intended.
    return parts.compact.
      separate( config[:decimal] ).
      delete_if {|el| el.empty?}.
      join( config[:comma] ).
      strip
  end
end
1354
+ alias_method :NUMWORDS, :numwords
1355
+
1356
+
1357
+ ### Transform the given +number+ into an ordinal word. The +number+ object
1358
+ ### can be either an Integer or a String.
1359
def ordinal( number )
  # +number+ is an Integer (result like "3rd") or a number word/String
  # (result like "third"). Returns the ordinal form as a String.
  case number
  when Integer
    # Pick the suffix from the magnitude: Ruby's % never returns a negative
    # result for a positive divisor, so -1 % 10 is 9 and a negative number
    # would otherwise get the suffix of a different digit.
    magnitude = number.abs

    # The two-digit lookup comes first so that entries for specific
    # two-digit endings in Nth win over the last-digit rule.
    return number.to_s + (Nth[ magnitude % 100 ] || Nth[ magnitude % 10 ])

  else
    # Swap a recognised ending of the number word for its ordinal spelling.
    return number.to_s.sub( /(#{OrdinalSuffixes})\Z/ ) { Ordinals[$1] }
  end
end
1368
+ alias_method :ORD, :ordinal
1369
+
1370
+
1371
+ ### Return a phrase describing the specified +number+ of objects in the
1372
+ ### given +phrase+. The following options can be used to control the makeup
1373
+ ### of the returned quantity String:
1374
+ ###
1375
+ ### [<b>:joinword</b>]
1376
+ ### Sets the word (and any surrounding spaces) used as the word separating the
1377
+ ### quantity from the noun in the resulting string. Defaults to <tt>' of
1378
+ ### '</tt>.
1379
def quantify( phrase, number=0, args={} )
  num = number.to_i
  config = QuantifyDefaults.dup.update( args )

  # Small quantities map onto fixed descriptions.
  case num
  when 0             then return no( phrase )
  when 1             then return a( phrase )
  when SeveralRange  then return "several " + plural( phrase, num )
  when NumberRange   then return "a number of " + plural( phrase, num )
  when NumerousRange then return "numerous " + plural( phrase, num )
  when ManyRange     then return "many " + plural( phrase, num )
  end

  # Anything outside those ranges gets described by its order of magnitude,
  # like "hundreds of thousands of..." or "millions of...".
  magnitude = Math::log10( num ).to_i
  thousands, subthousands = magnitude.divmod( 3 )

  # Position within the current group of three digits.
  stword = { 2 => "hundreds", 1 => "tens" }[ subthousands ]

  # Pluralized name of the thousands group, if it has one.
  thword = plural( to_thousands(thousands).strip )
  thword = nil if thword.empty?

  # e.g. [ "hundreds", "thousands", "stars" ] => "hundreds of thousands of stars"
  pieces = [ stword, thword, plural(phrase, number) ]
  return pieces.compact.join( config[:joinword] )
end
1425
+
1426
+
1427
+ ### Return the specified +obj+ (which must support the <tt>#collect</tt>
1428
+ ### method) as a conjunction. Each item is converted to a String if it is
1429
+ ### not already (using #to_s) unless a block is given, in which case it is
1430
+ ### called once for each object in the array, and the stringified return
1431
+ ### value from the block is used instead. Returning +nil+ causes that
1432
+ ### particular element to be omitted from the resulting conjunction. The
1433
+ ### following options can be used to control the makeup of the returned
1434
+ ### conjunction String:
1435
+ ###
1436
+ ### [<b>:separator</b>]
1437
+ ### Specify one or more characters to separate items in the resulting
1438
+ ### list. Defaults to <tt>', '</tt>.
1439
+ ### [<b>:altsep</b>]
1440
+ ### An alternate separator to use if any of the resulting conjunction's
1441
+ ### clauses contain the <tt>:separator</tt> character/s. Defaults to <tt>'; '</tt>.
1442
+ ### [<b>:penultimate</b>]
1443
+ ### Flag that indicates whether or not to join the last clause onto the
1444
+ ### rest of the conjunction using a penultimate <tt>:separator</tt>. E.g.,
1445
+ ### %w{duck cow dog}.en.conjunction
1446
+ ### # => "a duck, a cow, and a dog"
1447
+ ### %w{duck cow dog}.en.conjunction( :penultimate => false )
1448
+ ### # => "a duck, a cow and a dog"
1449
+ ### Default to <tt>true</tt>.
1450
+ ### [<b>:conjunctive</b>]
1451
+ ### Sets the word used as the conjunctive (separating word) of the
1452
+ ### resulting string. Default to <tt>'and'</tt>.
1453
+ ### [<b>:combine</b>]
1454
+ ### If set to <tt>true</tt> (the default), items which are identical (after
1455
+ ### surrounding spaces are stripped) will be combined in the resulting
1456
+ ### conjunction. E.g.,
1457
+ ### %w{goose cow goose dog}.en.conjunction
1458
+ ### # => "two geese, a cow, and a dog"
1459
+ ### %w{goose cow goose dog}.en.conjunction( :combine => false )
1460
+ ### # => "a goose, a cow, a goose, and a dog"
1461
+ ### [<b>:casefold</b>]
1462
+ ### If set to <tt>true</tt> (the default), then items are compared
1463
+ ### case-insensitively when combining them. This has no effect if
1464
+ ### <tt>:combine</tt> is <tt>false</tt>.
1465
+ ### [<b>:generalize</b>]
1466
+ ### If set to <tt>true</tt>, then quantities of combined items are turned into
1467
+ ### general descriptions instead of exact amounts.
1468
+ ### ary = %w{goose pig dog horse goose reindeer goose dog horse}
1469
+ ### ary.en.conjunction
1470
+ ### # => "three geese, two dogs, two horses, a pig, and a reindeer"
1471
+ ### ary.en.conjunction( :generalize => true )
1472
+ ### # => "several geese, several dogs, several horses, a pig, and a reindeer"
1473
+ ### See the #quantify method for specifics on how quantities are
1474
+ ### generalized. Generalization defaults to <tt>false</tt>, and has no effect if
1475
+ ### :combine is <tt>false</tt>.
1476
+ ### [<b>:quantsort</b>]
1477
+ ### If set to <tt>true</tt> (the default), items which are combined in the
1478
+ ### resulting conjunction will be listed in order of amount, with greater
1479
+ ### quantities sorted first. If <tt>:quantsort</tt> is <tt>false</tt>, combined items
1480
+ ### will appear where the first instance of them occurred in the
1481
+ ### list. This sort is also the fallback for identical quantities (ie.,
1482
+ ### items of the same quantity will be listed in the order they appeared
1483
+ ### in the source list).
1484
+ ###
1485
def conjunction( obj, args={} )
  # +obj+ is any object responding to #collect; +args+ overrides
  # ConjunctionDefaults. An optional block maps each item to its phrase
  # (nil drops the item). Returns the conjunction as a String.
  config = ConjunctionDefaults.dup.update( args )

  # Transform items in the obj to phrases. Block results are stringified
  # too, so non-String return values can be keyed and inflected below.
  phrases =
    if block_given?
      obj.collect {|item| yield(item) }.compact.collect {|res| res.to_s }
    else
      obj.collect {|item| item.to_s }
    end

  # No need for a conjunction if there's only one thing
  return a(phrases[0]) if phrases.length < 2

  # Set up a Proc to derive a collector key from a phrase depending on the
  # configuration
  keyfunc =
    if config[:casefold]
      proc {|key| key.downcase.strip}
    else
      proc {|key| key.strip}
    end

  # Count phrases that produce the same key and keep only the first
  # occurrence of each if we're combining (:combine => true).
  collector = {}
  if config[:combine]
    firsts = []
    phrases.each do |phrase|
      key = keyfunc[ phrase ]
      if collector.key?( key )
        collector[ key ] += 1
      else
        collector[ key ] = 1
        firsts << phrase
      end
    end
    phrases = firsts
  else
    # If we're not combining, just make everything have a count of 1.
    phrases.uniq.each {|key| collector[ keyfunc[key] ] = 1}
  end

  # If sort-by-quantity is turned on, sort the phrases first by how many
  # there are (most-first), and then by the order they were specified in.
  if config[:quantsort] && config[:combine]
    origorder = {}
    phrases.each_with_index {|phrase,i| origorder[ keyfunc[phrase] ] ||= i }
    phrases.sort! {|left,right|
      (collector[ keyfunc[right] ] <=> collector[ keyfunc[left] ]).nonzero? ||
        (origorder[ keyfunc[left] ] <=> origorder[ keyfunc[right] ])
    }
  end

  # Set up a filtering function that adds either an indefinite article, an
  # indefinite quantifier, or a definite quantifier to each phrase
  # depending on the configuration and the count of phrases in the
  # collector.
  filter =
    if config[:generalize]
      proc {|phrase, count| quantify(phrase, count) }
    else
      proc {|phrase, count|
        if count > 1
          "%s %s" % [
            # :TODO: Make this threshold settable
            count < 10 ? count.en.numwords : count.to_s,
            plural(phrase, count)
          ]
        else
          a( phrase )
        end
      }
    end

  # Now use the configured filter to turn each phrase into its final form.
  phrases.collect! {|phrase| filter[phrase, collector[ keyfunc[phrase] ]] }

  # Prepend the conjunctive to the last element unless it's empty or
  # there's only one element. A new String is built rather than editing the
  # filter's result in place.
  unless config[:conjunctive].strip.empty? or phrases.length < 2
    phrases[-1] = config[:conjunctive] + " " + phrases[-1]
  end

  # Remember how many clauses there are before the last two are merged: a
  # list of three or more still needs real separators between the earlier
  # clauses even when the final pair is joined with just a space.
  clause_count = phrases.length

  # Catenate the last two elements if there's no penultimate separator.
  # Combining can leave a single phrase, in which case there is nothing to
  # merge.
  if !config[:penultimate] && phrases.length >= 2
    final = phrases.pop
    phrases[-1] = phrases[-1] + " " + final
  end

  # Pick a separator based on how many clauses there are and whether or not
  # there's already an instance of it in the phrases. The separator is
  # compared as literal text, not as a regular expression.
  sep =
    if clause_count <= 2
      ' '
    elsif phrases.any? {|phrase| phrase.include?( config[:separator] ) }
      config[:altsep]
    else
      config[:separator]
    end

  return phrases.join( sep )
end
1591
+
1592
+
1593
+ ### Turns a camel-case +string+ ("camelCaseToEnglish") to plain English
1594
+ ### ("camel case to english"). Each word is decapitalized.
1595
def camel_case_to_english( string )
  # Open up every lowercase-to-uppercase transition with a space, then fold
  # the whole phrase to lowercase.
  spaced = string.to_s.gsub( /([a-z])([A-Z])/, '\1 \2' )
  return spaced.downcase
end
1598
+
1599
+
1600
+ ### Turns an English language +string+ into a CamelCase word.
1601
def english_to_camel_case( string )
  # Each match is a run of whitespace followed by one lowercase letter;
  # replace the whole match with that letter, uppercased.
  string.to_s.gsub( /\s+([a-z])/ ) {|gap_and_letter| gap_and_letter[-1, 1].upcase }
end
1604
+
1605
+
1606
+ ### This method doesn't work quite right yet. It does okay for simple cases,
1607
+ ### but it misses more complex ones, e.g. 'as' used as a coordinating
1608
+ ### conjunction in "A Portrait of the Artist as a Young Man". Perhaps after
1609
+ ### there's a working (non-leaking) LinkParser for Ruby, this can be fixed
1610
+ ### up. Until then it'll just be undocumented.
1611
+
1612
+ ### Returns the given +string+ as a title-cased phrase.
1613
+ def titlecase( string ) # :nodoc:
1614
+
1615
+ # Split on word-boundaries
1616
+ words = string.split( /\b/ )
1617
+
1618
+ # Always capitalize the first and last words
1619
+ words.first.capitalize!
1620
+ words.last.capitalize!
1621
+
1622
+ # Now scan the rest of the tokens, skipping non-words and capitalization
1623
+ # exceptions.
1624
+ words.each_with_index do |word, i|
1625
+
1626
+ # Non-words
1627
+ next unless /^\w+$/.match( word )
1628
+
1629
+ # Skip exception-words
1630
+ next if TitleCaseExceptions.include?( word )
1631
+
1632
+ # Skip second parts of contractions
1633
+ next if words[i - 1] == "'" && /\w/.match( words[i - 2] )
1634
+
1635
+ # Have to do it this way instead of capitalize! because that method
1636
+ # also downcases all other letters.
1637
+ word.gsub!( /^(\w)(.*)/ ) { $1.upcase + $2 }
1638
+ end
1639
+
1640
+ return words.join
1641
+ end
1642
+
1643
+
1644
+ ### Returns the proper noun form of a string by capitalizing most of the
1645
+ ### words.
1646
+ ###
1647
+ ### Examples:
1648
+ ### English.proper_noun("bosnia and herzegovina") ->
1649
+ ### "Bosnia and Herzegovina"
1650
+ ### English.proper_noun("macedonia, the former yugoslav republic of") ->
1651
+ ### "Macedonia, the Former Yugoslav Republic of"
1652
+ ### English.proper_noun("virgin islands, u.s.") ->
1653
+ ### "Virgin Islands, U.S."
1654
def proper_noun( string )
  # Small connecting words that stay lowercase.
  minor_words = %w{and the of}

  # Splitting with a capture group keeps the runs of spaces and periods in
  # the list, so joining the pieces reproduces the original punctuation.
  pieces = string.split(/([ .]+)/).collect do |word|
    if /^[a-z]/.match( word ) && !minor_words.include?( word )
      word.capitalize
    else
      word
    end
  end

  return pieces.join
end
1661
+
1662
+ end # module Linguistics::EN
1663
+
1664
+
1665
+ ### Add the #separate and #separate! methods to Array.
1666
class Array # :nodoc:

  ### Non-destructive form of #separate!: returns a copy of the receiver with
  ### a separator placed between each pair of neighbouring elements. The
  ### separator is +value+, or -- when a block is given -- the block's result
  ### for the two-element Array of neighbours.
  def separate( value=:__no_arg__, &block )
    copy = self.dup
    copy.separate!( value, &block )
    copy
  end

  ### Insert a separator between each pair of neighbouring elements of the
  ### receiver, in place, and return the receiver. Raises ArgumentError if
  ### neither a +value+ nor a block is supplied.
  def separate!( value=:__no_arg__ )
    if value == :__no_arg__ && !block_given?
      raise ArgumentError, "wrong number of arguments: (0 for 1)"
    end

    # There is one gap per adjacent pair. Each insertion shifts the later
    # elements right by one, so gap n ends up at index 2n + 1.
    ( self.length - 1 ).times do |gap|
      slot = ( gap * 2 ) + 1
      filler = block_given? ? yield( self[slot - 1, 2] ) : value
      self.insert( slot, filler )
    end

    self
  end

end
1694
+