Linguistics 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,1694 @@
1
+ #!/usr/bin/ruby
2
+ #
3
+ # = Linguistics::EN
4
+ #
5
+ # This module contains English-language linguistic functions for the Linguistics
6
+ # module. It can be either loaded directly, or by passing some variant of 'en'
7
+ # or 'eng' to the Linguistics::use method.
8
+ #
9
+ # The functions contained by the module provide:
10
+ #
11
+ # == Plural Inflections
12
+ #
13
+ # Plural forms of all nouns, most verbs, and some adjectives are provided. Where
14
+ # appropriate, "classical" variants (for example: "brother" -> "brethren",
15
+ # "dogma" -> "dogmata", etc.) are also provided.
16
+ #
17
+ # These can be accessed via the #plural, #plural_noun, #plural_verb, and
18
+ # #plural_adjective methods.
19
+ #
20
+ # == Indefinite Articles
21
+ #
22
+ # Pronunciation-based "a"/"an" selection is provided for all English words, and
23
+ # most initialisms.
24
+ #
25
+ # See: #a, #an, and #no.
26
+ #
27
+ # == Numbers to Words
28
+ #
29
+ # Conversion from Numeric values to words is supported using the American
30
+ # "thousands" system. E.g., 2561 => "two thousand, five hundred and sixty-one".
31
+ #
32
+ # See the #numwords method.
33
+ #
34
+ # == Ordinals
35
+ #
36
+ # It is also possible to inflect numerals (1,2,3) and number words ("one",
37
+ # "two", "three") to ordinals (1st, 2nd, 3rd) and ordinates ("first", "second",
38
+ # "third").
39
+ #
40
+ # == Conjunctions
41
+ #
42
+ # This module also supports the creation of English conjunctions from Arrays of
43
+ # Strings or objects which respond to the #to_s message. Eg.,
44
+ #
45
+ # %w{cow pig chicken cow dog cow duck duck moose}.en.conjunction
46
+ # ==> "three cows, two ducks, a pig, a chicken, a dog, and a moose"
47
+ #
48
+ # == Infinitives
49
+ #
50
+ # Returns the infinitive form of English verbs:
51
+ #
52
+ # "dodging".en.infinitive
53
+ # ==> "dodge"
54
+ #
55
+ #
56
+ # == Authors
57
+ #
58
+ # * Michael Granger <ged@FaerieMUD.org>
59
+ #
60
+ # == Copyright
61
+ #
62
+ # This module is copyright (c) 2003-2005 The FaerieMUD Consortium. All rights
63
+ # reserved.
64
+ #
65
+ # This module is free software. You may use, modify, and/or redistribute this
66
+ # software under the terms of the Perl Artistic License. (See
67
+ # http://language.perl.com/misc/Artistic.html)
68
+ #
69
+ # The inflection functions of this module were adapted from Damian Conway's
70
+ # Lingua::EN::Inflect Perl module:
71
+ #
72
+ # Copyright (c) 1997-2000, Damian Conway. All Rights Reserved.
73
+ # This module is free software. It may be used, redistributed
74
+ # and/or modified under the same terms as Perl itself.
75
+ #
76
+ # The conjunctions code was adapted from the Lingua::Conjunction Perl module
77
+ # written by Robert Rothenberg and Damian Conway, which has no copyright
78
+ # statement included.
79
+ #
80
+ # == Version
81
+ #
82
+ # $Id: en.rb,v 1.8 2003/09/14 10:47:12 deveiant Exp $
83
+ #
84
+
85
+
86
+ ### This module contains English-language linguistics functions accessible from
87
+ ### the Linguistics module, or as a standalone function library.
88
+ module Linguistics::EN
89
+
90
+ begin
91
+ require 'crosscase'
92
+ rescue LoadError
93
+ else
94
+ include CrossCase
95
+ end
96
+
97
+ # Load in the secondary modules and add them to Linguistics::EN.
98
+ require 'linguistics/en/infinitive'
99
+ require 'linguistics/en/wordnet'
100
+ require 'linguistics/en/linkparser'
101
+
102
+ # Subversion revision
103
+ SVNRev = %q$Rev$
104
+
105
+ # Subversion revision tag
106
+ SVNId = %q$Id: en.rb,v 1.8 2003/09/14 10:47:12 deveiant Exp $
107
+
108
+ # Add 'english' to the list of default languages
109
+ Linguistics::DefaultLanguages.push( :en )
110
+
111
+
112
+ #################################################################
113
+ ### U T I L I T Y F U N C T I O N S
114
+ #################################################################
115
+
116
### Build the source text of a non-capturing alternation group.
###
### +parts+ may be any mix of strings and (nested) arrays of strings; they are
### flattened and joined with "|". The result is a String such as
### "(?:a|b|c)", meant to be interpolated into a Regexp.
def self::matchgroup( *parts )
  "(?:#{parts.flatten.join('|')})"
end
121
+
122
+
123
+ #################################################################
124
+ ### C O N S T A N T S
125
+ #################################################################
126
+
127
+ # :stopdoc:
128
+
129
+ #
130
+ # Plurals
131
+ #
132
+
133
+ PL_sb_irregular_s = {
134
+ "ephemeris" => "ephemerides",
135
+ "iris" => "irises|irides",
136
+ "clitoris" => "clitorises|clitorides",
137
+ "corpus" => "corpuses|corpora",
138
+ "opus" => "opuses|opera",
139
+ "genus" => "genera",
140
+ "mythos" => "mythoi",
141
+ "penis" => "penises|penes",
142
+ "testis" => "testes",
143
+ }
144
+
145
+ PL_sb_irregular_h = {
146
+ "child" => "children",
147
+ "brother" => "brothers|brethren",
148
+ "loaf" => "loaves",
149
+ "hoof" => "hoofs|hooves",
150
+ "beef" => "beefs|beeves",
151
+ "money" => "monies",
152
+ "mongoose" => "mongooses",
153
+ "ox" => "oxen",
154
+ "cow" => "cows|kine",
155
+ "soliloquy" => "soliloquies",
156
+ "graffito" => "graffiti",
157
+ "prima donna" => "prima donnas|prime donne",
158
+ "octopus" => "octopuses|octopodes",
159
+ "genie" => "genies|genii",
160
+ "ganglion" => "ganglions|ganglia",
161
+ "trilby" => "trilbys",
162
+ "turf" => "turfs|turves",
163
+ }.update( PL_sb_irregular_s )
164
+ PL_sb_irregular = matchgroup PL_sb_irregular_h.keys
165
+
166
+
167
+ # Classical "..a" -> "..ata"
168
+ PL_sb_C_a_ata = matchgroup %w[
169
+ anathema bema carcinoma charisma diploma
170
+ dogma drama edema enema enigma lemma
171
+ lymphoma magma melisma miasma oedema
172
+ sarcoma schema soma stigma stoma trauma
173
+ gumma pragma
174
+ ].collect {|word| word[0...-1]}
175
+
176
+ # Unconditional "..a" -> "..ae"
177
+ PL_sb_U_a_ae = matchgroup %w[
178
+ alumna alga vertebra persona
179
+ ]
180
+
181
+ # Classical "..a" -> "..ae"
182
+ PL_sb_C_a_ae = matchgroup %w[
183
+ amoeba antenna formula hyperbola
184
+ medusa nebula parabola abscissa
185
+ hydra nova lacuna aurora .*umbra
186
+ flora fauna
187
+ ]
188
+
189
+ # Classical "..en" -> "..ina"
190
+ PL_sb_C_en_ina = matchgroup %w[
191
+ stamen foramen lumen
192
+ ].collect {|word| word[0...-2] }
193
+
194
+ # Unconditional "..um" -> "..a"
195
+ PL_sb_U_um_a = matchgroup %w[
196
+ bacterium agendum desideratum erratum
197
+ stratum datum ovum extremum
198
+ candelabrum
199
+ ].collect {|word| word[0...-2] }
200
+
201
# Classical "..um" -> "..a"
#
# Each entry is stored with its trailing "um" removed; the pluralizer matches
# the stem followed by "um" and appends "a".
#
# "encomium" and "millennium" were previously misspelled ("enconium",
# "millenium"), so the correctly spelled nouns never matched.
PL_sb_C_um_a = matchgroup %w[
  maximum minimum momentum optimum
  quantum cranium curriculum dictum
  phylum aquarium compendium emporium
  encomium gymnasium honorarium interregnum
  lustrum memorandum millennium rostrum
  spectrum speculum stadium trapezium
  ultimatum medium vacuum velum
  consortium
].collect {|word| word[0...-2]}
212
+
213
+ # Unconditional "..us" -> "i"
214
+ PL_sb_U_us_i = matchgroup %w[
215
+ alumnus alveolus bacillus bronchus
216
+ locus nucleus stimulus meniscus
217
+ ].collect {|word| word[0...-2]}
218
+
219
+ # Classical "..us" -> "..i"
220
+ PL_sb_C_us_i = matchgroup %w[
221
+ focus radius genius
222
+ incubus succubus nimbus
223
+ fungus nucleolus stylus
224
+ torus umbilicus uterus
225
+ hippopotamus
226
+ ].collect {|word| word[0...-2]}
227
+
228
+ # Classical "..us" -> "..us" (assimilated 4th declension latin nouns)
229
+ PL_sb_C_us_us = matchgroup %w[
230
+ status apparatus prospectus sinus
231
+ hiatus impetus plexus
232
+ ]
233
+
234
+ # Unconditional "..on" -> "a"
235
+ PL_sb_U_on_a = matchgroup %w[
236
+ criterion perihelion aphelion
237
+ phenomenon prolegomenon noumenon
238
+ organon asyndeton hyperbaton
239
+ ].collect {|word| word[0...-2]}
240
+
241
+ # Classical "..on" -> "..a"
242
+ PL_sb_C_on_a = matchgroup %w[
243
+ oxymoron
244
+ ].collect {|word| word[0...-2]}
245
+
246
+ # Classical "..o" -> "..i" (but normally -> "..os")
247
+ PL_sb_C_o_i_a = %w[
248
+ solo soprano basso alto
249
+ contralto tempo piano
250
+ ]
251
+ PL_sb_C_o_i = matchgroup PL_sb_C_o_i_a.collect{|word| word[0...-1]}
252
+
253
+ # Always "..o" -> "..os"
254
+ PL_sb_U_o_os = matchgroup( %w[
255
+ albino archipelago armadillo
256
+ commando crescendo fiasco
257
+ ditto dynamo embryo
258
+ ghetto guano inferno
259
+ jumbo lumbago magneto
260
+ manifesto medico octavo
261
+ photo pro quarto
262
+ canto lingo generalissimo
263
+ stylo rhino
264
+ ] | PL_sb_C_o_i_a )
265
+
266
+
267
+ # Unconditional "..[ei]x" -> "..ices"
268
+ PL_sb_U_ex_ices = matchgroup %w[
269
+ codex murex silex
270
+ ].collect {|word| word[0...-2]}
271
+ PL_sb_U_ix_ices = matchgroup %w[
272
+ radix helix
273
+ ].collect {|word| word[0...-2]}
274
+
275
+ # Classical "..[ei]x" -> "..ices"
276
+ PL_sb_C_ex_ices = matchgroup %w[
277
+ vortex vertex cortex latex
278
+ pontifex apex index simplex
279
+ ].collect {|word| word[0...-2]}
280
+ PL_sb_C_ix_ices = matchgroup %w[
281
+ appendix
282
+ ].collect {|word| word[0...-2]}
283
+
284
+
285
+ # Arabic: ".." -> "..i"
286
+ PL_sb_C_i = matchgroup %w[
287
+ afrit afreet efreet
288
+ ]
289
+
290
+
291
+ # Hebrew: ".." -> "..im"
292
+ PL_sb_C_im = matchgroup %w[
293
+ goy seraph cherub
294
+ ]
295
+
296
+ # Unconditional "..man" -> "..mans"
297
+ PL_sb_U_man_mans = matchgroup %w[
298
+ human
299
+ Alabaman Bahaman Burman German
300
+ Hiroshiman Liman Nakayaman Oklahoman
301
+ Panaman Selman Sonaman Tacoman Yakiman
302
+ Yokohaman Yuman
303
+ ]
304
+
305
+
306
+ PL_sb_uninflected_s = [
307
+ # Pairs or groups subsumed to a singular...
308
+ "breeches", "britches", "clippers", "gallows", "hijinks",
309
+ "headquarters", "pliers", "scissors", "testes", "herpes",
310
+ "pincers", "shears", "proceedings", "trousers",
311
+
312
+ # Unassimilated Latin 4th declension
313
+ "cantus", "coitus", "nexus",
314
+
315
+ # Recent imports...
316
+ "contretemps", "corps", "debris",
317
+ ".*ois",
318
+
319
+ # Diseases
320
+ ".*measles", "mumps",
321
+
322
+ # Miscellaneous others...
323
+ "diabetes", "jackanapes", "series", "species", "rabies",
324
+ "chassis", "innings", "news", "mews",
325
+ ]
326
+
327
+
328
+ # Don't inflect in classical mode, otherwise normal inflection
329
+ PL_sb_uninflected_herd = matchgroup %w[
330
+ wildebeest swine eland bison buffalo
331
+ elk moose rhinoceros
332
+ ]
333
+
334
+ PL_sb_uninflected = matchgroup [
335
+
336
+ # Some fish and herd animals
337
+ ".*fish", "tuna", "salmon", "mackerel", "trout",
338
+ "bream", "sea[- ]bass", "carp", "cod", "flounder", "whiting",
339
+
340
+ ".*deer", ".*sheep",
341
+
342
+ # All nationals ending in -ese
343
+ "Portuguese", "Amoyese", "Borghese", "Congoese", "Faroese",
344
+ "Foochowese", "Genevese", "Genoese", "Gilbertese", "Hottentotese",
345
+ "Kiplingese", "Kongoese", "Lucchese", "Maltese", "Nankingese",
346
+ "Niasese", "Pekingese", "Piedmontese", "Pistoiese", "Sarawakese",
347
+ "Shavese", "Vermontese", "Wenchowese", "Yengeese",
348
+ ".*[nrlm]ese",
349
+
350
+ # Some words ending in ...s (often pairs taken as a whole)
351
+ PL_sb_uninflected_s,
352
+
353
+ # Diseases
354
+ ".*pox",
355
+
356
+ # Other oddities
357
+ "graffiti", "djinn"
358
+ ]
359
+
360
+
361
+ # Singular words ending in ...s (all inflect with ...es)
362
+ PL_sb_singular_s = matchgroup %w[
363
+ .*ss
364
+ acropolis aegis alias arthritis asbestos atlas
365
+ bathos bias bronchitis bursitis caddis cannabis
366
+ canvas chaos cosmos dais digitalis encephalitis
367
+ epidermis ethos eyas gas glottis hepatitis
368
+ hubris ibis lens mantis marquis metropolis
369
+ neuritis pathos pelvis polis rhinoceros
370
+ sassafras tonsillitis trellis .*us
371
+ ]
372
+
373
+ PL_v_special_s = matchgroup [
374
+ PL_sb_singular_s,
375
+ PL_sb_uninflected_s,
376
+ PL_sb_irregular_s.keys,
377
+ '(.*[csx])is',
378
+ '(.*)ceps',
379
+ '[A-Z].*s',
380
+ ]
381
+
382
+ PL_sb_postfix_adj = '(' + {
383
+
384
+ 'general' => ['(?!major|lieutenant|brigadier|adjutant)\S+'],
385
+ 'martial' => ["court"],
386
+
387
+ }.collect {|key,val|
388
+ matchgroup( matchgroup(val) + "(?=(?:-|\\s+)#{key})" )
389
+ }.join("|") + ")(.*)"
390
+
391
+
392
+ PL_sb_military = %r'major|lieutenant|brigadier|adjutant|quartermaster'
393
+ PL_sb_general = %r'((?!#{PL_sb_military.source}).*?)((-|\s+)general)'
394
+
395
+ PL_prep = matchgroup %w[
396
+ about above across after among around at athwart before behind
397
+ below beneath beside besides between betwixt beyond but by
398
+ during except for from in into near of off on onto out over
399
+ since till to under until unto upon with
400
+ ]
401
+
402
+ PL_sb_prep_dual_compound = %r'(.*?)((?:-|\s+)(?:#{PL_prep}|d[eu])(?:-|\s+))a(?:-|\s+)(.*)'
403
+ PL_sb_prep_compound = %r'(.*?)((-|\s+)(#{PL_prep}|d[eu])((-|\s+)(.*))?)'
404
+
405
+
406
+ PL_pron_nom_h = {
407
+ # Nominative Reflexive
408
+ "i" => "we", "myself" => "ourselves",
409
+ "you" => "you", "yourself" => "yourselves",
410
+ "she" => "they", "herself" => "themselves",
411
+ "he" => "they", "himself" => "themselves",
412
+ "it" => "they", "itself" => "themselves",
413
+ "they" => "they", "themself" => "themselves",
414
+
415
+ # Possessive
416
+ "mine" => "ours",
417
+ "yours" => "yours",
418
+ "hers" => "theirs",
419
+ "his" => "theirs",
420
+ "its" => "theirs",
421
+ "theirs" => "theirs",
422
+ }
423
+ PL_pron_nom = matchgroup PL_pron_nom_h.keys
424
+
425
+ PL_pron_acc_h = {
426
+ # Accusative Reflexive
427
+ "me" => "us", "myself" => "ourselves",
428
+ "you" => "you", "yourself" => "yourselves",
429
+ "her" => "them", "herself" => "themselves",
430
+ "him" => "them", "himself" => "themselves",
431
+ "it" => "them", "itself" => "themselves",
432
+ "them" => "them", "themself" => "themselves",
433
+ }
434
+ PL_pron_acc = matchgroup PL_pron_acc_h.keys
435
+
436
# Irregular present/past-tense verb forms mapped to their plural form.
#
# The table used to be laid out as a person-by-person grid, which repeated
# the "was" and "have" keys inside one Hash literal (Ruby warns about
# duplicated keys and keeps only the last one). Each key now appears once,
# in the same first-seen order as before.
PL_v_irregular_pres_h = {
  # "to be", present
  "am"   => "are",
  "are"  => "are",
  "is"   => "are",

  # "to be", past
  "was"  => "were",
  "were" => "were",

  # "to have"
  "have" => "have",
  "has"  => "have",
}
443
+ PL_v_irregular_pres = matchgroup PL_v_irregular_pres_h.keys
444
+
445
# Present-tense verb forms that could also be read as nouns, mapped to their
# plural (uninflected) verb form.
#
# The original grid listed the 1st- and 2nd-person forms separately, which
# repeated every base-form key inside one Hash literal (a duplicated-key
# warning in Ruby). Each key is now listed once; key order and the resulting
# mapping are unchanged.
PL_v_ambiguous_pres_h = {
  # base form           3rd pers. singular
  "act"   => "act",     "acts"    => "act",
  "blame" => "blame",   "blames"  => "blame",
  "can"   => "can",
  "must"  => "must",
  "fly"   => "fly",     "flies"   => "fly",
  "copy"  => "copy",    "copies"  => "copy",
  "drink" => "drink",   "drinks"  => "drink",
  "fight" => "fight",   "fights"  => "fight",
  "fire"  => "fire",    "fires"   => "fire",
  "like"  => "like",    "likes"   => "like",
  "look"  => "look",    "looks"   => "look",
  "make"  => "make",    "makes"   => "make",
  "reach" => "reach",   "reaches" => "reach",
  "run"   => "run",     "runs"    => "run",
  "sink"  => "sink",    "sinks"   => "sink",
  "sleep" => "sleep",   "sleeps"  => "sleep",
  "view"  => "view",    "views"   => "view",
}
466
+ PL_v_ambiguous_pres = matchgroup PL_v_ambiguous_pres_h.keys
467
+
468
+ PL_v_irregular_non_pres = matchgroup %w[
469
+ did had ate made put
470
+ spent fought sank gave sought
471
+ shall could ought should
472
+ ]
473
+
474
+ PL_v_ambiguous_non_pres = matchgroup %w[
475
+ thought saw bent will might cut
476
+ ]
477
+
478
+ PL_count_zero = matchgroup %w[
479
+ 0 no zero nil
480
+ ]
481
+
482
+ PL_count_one = matchgroup %w[
483
+ 1 a an one each every this that
484
+ ]
485
+
486
+ PL_adj_special_h = {
487
+ "a" => "some", "an" => "some",
488
+ "this" => "these", "that" => "those",
489
+ }
490
+ PL_adj_special = matchgroup PL_adj_special_h.keys
491
+
492
+ PL_adj_poss_h = {
493
+ "my" => "our",
494
+ "your" => "your",
495
+ "its" => "their",
496
+ "her" => "their",
497
+ "his" => "their",
498
+ "their" => "their",
499
+ }
500
+ PL_adj_poss = matchgroup PL_adj_poss_h.keys
501
+
502
+
503
+ #
504
+ # Numerals, ordinals, and numbers-to-words
505
+ #
506
+
507
+ # Numerical inflections
508
+ Nth = {
509
+ 0 => 'th',
510
+ 1 => 'st',
511
+ 2 => 'nd',
512
+ 3 => 'rd',
513
+ 4 => 'th',
514
+ 5 => 'th',
515
+ 6 => 'th',
516
+ 7 => 'th',
517
+ 8 => 'th',
518
+ 9 => 'th',
519
+ 11 => 'th',
520
+ 12 => 'th',
521
+ 13 => 'th',
522
+ }
523
+
524
+ # Ordinal word parts
525
+ Ordinals = {
526
+ 'ty' => 'tieth',
527
+ 'one' => 'first',
528
+ 'two' => 'second',
529
+ 'three' => 'third',
530
+ 'five' => 'fifth',
531
+ 'eight' => 'eighth',
532
+ 'nine' => 'ninth',
533
+ 'twelve' => 'twelfth',
534
+ }
535
+ OrdinalSuffixes = Ordinals.keys.join("|") + "|"
536
+ Ordinals[""] = 'th'
537
+
538
+ # Numeral names
539
+ Units = [''] + %w[one two three four five six seven eight nine]
540
+ Teens = %w[ten eleven twelve thirteen fourteen
541
+ fifteen sixteen seventeen eighteen nineteen]
542
+ Tens = ['',''] + %w[twenty thirty forty fifty sixty seventy eighty ninety]
543
+ Thousands = [' ', ' thousand'] + %w[
544
+ m b tr quadr quint sext sept oct non dec undec duodec tredec
545
+ quattuordec quindec sexdec septemdec octodec novemdec vigint
546
+ ].collect {|prefix| ' ' + prefix + 'illion'}
547
+
548
# Procs that turn a fixed-size group of digits into a word phrase. The
# Array index is the number of digits handled, e.g.
# <tt>NumberToWordsFunctions[2]</tt> handles two-digit groups. Every proc
# (apart from the error case at index 0) takes the word to use for zero
# followed by one Integer per digit.
NumberToWordsFunctions = [
  # No digits at all: always an error
  proc { |*args|
    raise "No digits (#{args.inspect})"
  },

  # One digit: its unit name, or the zero-word plus a trailing space
  proc { |zero, x|
    if x.nonzero?
      to_units( x )
    else
      "#{zero} "
    end
  },

  # Two digits: a tens phrase when the first digit is non-zero, otherwise
  # the zero-word followed by the single-digit reading of the second
  proc { |zero, x, y|
    if x.nonzero?
      to_tens( x, y )
    elsif y.nonzero?
      single = NumberToWordsFunctions[1].call( zero, y )
      "#{zero} " + single
    else
      Array.new( 2, zero ).join(" ")
    end
  },

  # Three digits: the first digit read alone, then the remaining pair
  proc { |zero, x, y, z|
    leading  = NumberToWordsFunctions[1].call( zero, x )
    trailing = NumberToWordsFunctions[2].call( zero, y, z )
    leading + trailing
  }
]
577
+
578
+
579
+ #
580
+ # Indefinite Articles
581
+ #
582
+
583
+ # This pattern matches strings of capitals starting with a "vowel-sound"
584
+ # consonant followed by another consonant, and which are not likely
585
+ # to be real words (oh, all right then, it's just magic!)
586
+ A_abbrev = %{
587
+ (?! FJO | [HLMNS]Y. | RY[EO] | SQU
588
+ | ( F[LR]? | [HL] | MN? | N | RH? | S[CHKLMNPTVW]? | X(YL)?) [AEIOU])
589
+ [FHLMNRSX][A-Z]
590
+ }
591
+
592
+ # This pattern codes the beginnings of all English words beginning with a
593
+ # 'y' followed by a consonant. Any other y-consonant prefix therefore
594
+ # implies an abbreviation.
595
+ A_y_cons = 'y(b[lor]|cl[ea]|fere|gg|p[ios]|rou|tt)'
596
+
597
+ # Exceptions to exceptions
598
+ A_explicit_an = matchgroup( "euler", "hour(?!i)", "heir", "honest", "hono" )
599
+
600
+
601
+ #
602
+ # Configuration defaults
603
+ #
604
+
605
+ # Default configuration arguments for the #numwords function
606
+ NumwordDefaults = {
607
+ :group => 0,
608
+ :comma => ', ',
609
+ :and => ' and ',
610
+ :zero => 'zero',
611
+ :decimal => 'point',
612
+ :asArray => false,
613
+ }
614
+
615
+ # Default ranges for #quantify
616
+ SeveralRange = 2..5
617
+ NumberRange = 6..19
618
+ NumerousRange = 20..45
619
+ ManyRange = 46..99
620
+
621
+ # Default configuration arguments for the #quantify function
622
+ QuantifyDefaults = {
623
+ :joinword => " of ",
624
+ }
625
+
626
+ # Default configuration arguments for the #conjunction (junction, what's
627
+ # your) function.
628
+ ConjunctionDefaults = {
629
+ :separator => ', ',
630
+ :altsep => '; ',
631
+ :penultimate => true,
632
+ :conjunctive => 'and',
633
+ :combine => true,
634
+ :casefold => true,
635
+ :generalize => false,
636
+ :quantsort => true,
637
+ }
638
+
639
+
640
+ #
641
+ # Title case
642
+ #
643
+
644
+ # "In titles, capitalize the first word, the last word, and all words in
645
+ # between except articles (a, an, and the), prepositions under five letters
646
+ # (in, of, to), and coordinating conjunctions (and, but). These rules apply
647
+ # to titles of long, short, and partial works as well as your own papers"
648
+ # (Anson, Schwegler, and Muth. The Longman Writer's Companion 240).
649
+
650
# Build the list of exceptions to title-capitalization: articles, short
# prepositions, and coordinating conjunctions.
#
# Articles previously read "a and the" -- a mis-transcription of the quoted
# rule "(a, an, and the)" -- which left "an" out of the exception list.
# "and" is still covered through CoordConjunctions.
Articles = %w[a an the]
# Each preposition is listed once ("with" used to appear twice).
ShortPrepositions = %w[
  amid at but by down from in
  into like near of off on onto out over
  past save with till to unto up upon
]
CoordConjunctions = %w[and but as]
# Array#| is a set union, so words shared between the lists appear once.
TitleCaseExceptions = Articles | ShortPrepositions | CoordConjunctions
657
+
658
+
659
+ # :startdoc:
660
+
661
+ #################################################################
662
+ ### " B A C K E N D " F U N C T I O N S
663
+ #################################################################
664
+
665
+
666
+ ###############
667
+ module_function
668
+ ###############
669
+
670
### Debugging output: when $DEBUG is set, write +msgs+ to $stderr as a single
### space-separated line. Does nothing (and returns nil) otherwise.
def debugMsg( *msgs ) # :nodoc:
  return unless $DEBUG
  $stderr.puts msgs.join(" ")
end
674
+
675
+
676
### Normalize a count to either 1 (singular) or +default+ (plural unless
### overridden).
###
### * A nil +count+ yields +default+.
### * A +count+ whose string form is one of the PL_count_one words yields 1.
### * When Linguistics::classical? is true, a +count+ that is one of the
###   PL_count_zero words also yields 1.
### * Anything else yields +default+.
###
### Both word lists are matched case-insensitively. The zero list used to be
### matched case-sensitively, so "No" or "Zero" was not recognized even
### though "One" was.
def normalizeCount( count, default=2 )
  return default if count.nil? # Default to plural

  counted = count.to_s
  singular = /^(#{PL_count_one})$/i =~ counted
  # Only consult the classical setting when the count is not already singular
  singular ||= Linguistics::classical? &&
    /^(#{PL_count_zero})$/i =~ counted

  singular ? 1 : default
end
687
+
688
+
689
### Do normal/classical switching and match capitalization in +inflected+ by
### examining the +original+ input.
###
### If +inflected+ has the form "modern|classical", the classical half is kept
### when Linguistics::classical? is true and the modern half otherwise. The
### result is then re-cased to follow +original+:
###
### * "I" -- returned as-is
### * all capitals -- upcased
### * leading capital -- first character upcased, the rest left alone
### * anything else -- returned as-is
###
### Returns a new String; +inflected+ itself is never modified. The previous
### implementation edited its argument in place. pluralize_noun returns
### values straight out of the PL_pron_nom_h / PL_pron_acc_h tables, so if
### such a value were passed here, capitalizing it would permanently
### overwrite the table entry. In-place editing also fails on frozen strings.
def postprocess( original, inflected )
  # String#sub always hands back a fresh String, so the caller's object is
  # left untouched even when there is no "|" to resolve.
  result = inflected.sub( /([^|]+)\|(.+)/ ) {
    Linguistics::classical? ? $2 : $1
  }

  case original
  when "I"
    result
  when /^[A-Z]+$/
    result.upcase
  when /^[A-Z]/
    # Can't use #capitalize, as it will downcase the rest of the string,
    # too.
    result[0,1] = result[0,1].upcase
    result
  else
    result
  end
end
710
+
711
+
712
### Pluralize nouns.
###
### Returns +word+ unchanged when +count+ normalizes to singular (see
### #normalizeCount); a nil +count+ falls back to Linguistics::num. Otherwise
### the plural is chosen by the first matching rule, in this order:
### uninflected nouns, compounds ("Governor General", "mother-in-law"),
### pronouns, irregular nouns, irregular families (-man, -mouse, ...),
### unassimilated imports, classical-only forms (only when
### Linguistics::classical? is true), sibilant endings, -f, -y, -o, and
### finally a plain "s".
###
### Irregular nouns that have both a modern and a classical plural are
### returned in the raw "modern|classical" form found in PL_sb_irregular_h.
###
### The suffix-table rules keep whatever precedes the listed noun, so
### "superhuman" becomes "superhumans" and "substratum" becomes "substrata".
### Earlier the capture covered only the table entry, which dropped the
### prefix and produced "humans" and "strata".
def pluralize_noun( word, count=nil )
  count ||= Linguistics::num
  count = normalizeCount( count )

  return word if count == 1

  # Handle user-defined nouns
  #if value = ud_match( word, PL_sb_user_defined )
  #  return value
  #end

  # Handle empty word and uninflected plurals
  case word
  when ''
    return word
  when /^(#{PL_sb_uninflected})$/i
    return word
  else
    if Linguistics::classical? &&
       /^(#{PL_sb_uninflected_herd})$/i =~ word
      return word
    end
  end

  # Handle compounds ("Governor General", "mother-in-law", "aide-de-camp", etc.)
  case word
  when /^(?:#{PL_sb_postfix_adj})$/i
    # Save the tail first: the recursive call runs its own matches
    value = $2
    return pluralize_noun( $1, 2 ) + value

  when /^(?:#{PL_sb_prep_dual_compound})$/i
    value = [ $2, $3 ]
    return pluralize_noun( $1, 2 ) + value[0] + pluralize_noun( value[1] )

  when /^(?:#{PL_sb_prep_compound})$/i
    value = $2
    return pluralize_noun( $1, 2 ) + value

  # Handle pronouns
  when /^((?:#{PL_prep})\s+)(#{PL_pron_acc})$/i
    return $1 + PL_pron_acc_h[ $2.downcase ]

  when /^(#{PL_pron_nom})$/i
    return PL_pron_nom_h[ word.downcase ]

  when /^(#{PL_pron_acc})$/i
    return PL_pron_acc_h[ $1.downcase ]

  # Handle isolated irregular plurals
  when /(.*)\b(#{PL_sb_irregular})$/i
    return $1 + PL_sb_irregular_h[ $2.downcase ]

  # The leading ".*" in this and the following table-driven patterns makes
  # the capture include everything before the listed noun.
  when /(.*#{PL_sb_U_man_mans})$/i
    return "#{$1}s"

  # Handle families of irregular plurals
  when /(.*)man$/i                  ; return "#{$1}men"
  when /(.*[ml])ouse$/i             ; return "#{$1}ice"
  when /(.*)goose$/i                ; return "#{$1}geese"
  when /(.*)tooth$/i                ; return "#{$1}teeth"
  when /(.*)foot$/i                 ; return "#{$1}feet"

  # Handle unassimilated imports
  when /(.*)ceps$/i                 ; return word
  when /(.*)zoon$/i                 ; return "#{$1}zoa"
  when /(.*[csx])is$/i              ; return "#{$1}es"
  when /(.*#{PL_sb_U_ex_ices})ex$/i ; return "#{$1}ices"
  when /(.*#{PL_sb_U_ix_ices})ix$/i ; return "#{$1}ices"
  when /(.*#{PL_sb_U_um_a})um$/i    ; return "#{$1}a"
  when /(.*#{PL_sb_U_us_i})us$/i    ; return "#{$1}i"
  when /(.*#{PL_sb_U_on_a})on$/i    ; return "#{$1}a"
  when /(.*#{PL_sb_U_a_ae})$/i      ; return "#{$1}e"
  end

  # Handle incompletely assimilated imports
  if Linguistics::classical?
    case word
    when /(.*)trix$/i                   ; return "#{$1}trices"
    when /(.*)eau$/i                    ; return "#{$1}eaux"
    when /(.*)ieu$/i                    ; return "#{$1}ieux"
    when /(.{2,}[yia])nx$/i             ; return "#{$1}nges"
    when /(.*#{PL_sb_C_en_ina})en$/i    ; return "#{$1}ina"
    when /(.*#{PL_sb_C_ex_ices})ex$/i   ; return "#{$1}ices"
    when /(.*#{PL_sb_C_ix_ices})ix$/i   ; return "#{$1}ices"
    when /(.*#{PL_sb_C_um_a})um$/i      ; return "#{$1}a"
    when /(.*#{PL_sb_C_us_i})us$/i      ; return "#{$1}i"
    when /(.*#{PL_sb_C_us_us})$/i       ; return "#{$1}"
    when /(.*#{PL_sb_C_a_ae})$/i        ; return "#{$1}e"
    when /(.*#{PL_sb_C_a_ata})a$/i      ; return "#{$1}ata"
    when /(.*#{PL_sb_C_o_i})o$/i        ; return "#{$1}i"
    when /(.*#{PL_sb_C_on_a})on$/i      ; return "#{$1}a"
    when /#{PL_sb_C_im}$/i              ; return "#{word}im"
    when /#{PL_sb_C_i}$/i               ; return "#{word}i"
    end
  end

  # Handle singular nouns ending in ...s or other sibilants
  case word
  when /^(#{PL_sb_singular_s})$/i   ; return "#{$1}es"
  when /^([A-Z].*s)$/               ; return "#{$1}es"
  when /(.*)([cs]h|[zx])$/i         ; return "#{$1}#{$2}es"
  # when /(.*)(us)$/i               ; return "#{$1}#{$2}es"

  # Handle ...f -> ...ves
  when /(.*[eao])lf$/i              ; return "#{$1}lves"
  when /(.*[^d])eaf$/i              ; return "#{$1}eaves"
  when /(.*[nlw])ife$/i             ; return "#{$1}ives"
  when /(.*)arf$/i                  ; return "#{$1}arves"

  # Handle ...y
  when /(.*[aeiou])y$/i             ; return "#{$1}ys"
  when /([A-Z].*y)$/                ; return "#{$1}s"
  when /(.*)y$/i                    ; return "#{$1}ies"

  # Handle ...o
  when /#{PL_sb_U_o_os}$/i          ; return "#{word}s"
  when /[aeiou]o$/i                 ; return "#{word}s"
  when /o$/i                        ; return "#{word}es"

  # Otherwise just add ...s
  else
    return "#{word}s"
  end
end # def pluralize_noun
839
+
840
+
841
+
842
### Pluralize special verbs.
###
### Returns the plural form of +word+ when it is one of the verbs handled
### here (irregular forms, or a third-person singular ending in "s"), and nil
### when the count is singular or the verb has to be handled elsewhere.
### A nil +count+ falls back to Linguistics::num.
def pluralize_special_verb( word, count )
  number = normalizeCount( count || Linguistics::num )

  return nil if /^(#{PL_count_one})$/i =~ number.to_s

  # Handle user-defined verbs
  #if value = ud_match( word, PL_v_user_defined )
  #  return value
  #end

  case word

  # Irregular present tense (simple and compound): swap the verb, keep the
  # rest of the phrase
  when /^(#{PL_v_irregular_pres})((\s.*)?)$/i
    PL_v_irregular_pres_h[ Regexp.last_match(1).downcase ] + Regexp.last_match(2)

  # Irregular future, preterite and perfect tenses do not inflect
  when /^(#{PL_v_irregular_non_pres})((\s.*)?)$/i
    word

  # Special cases, and any remaining multi-word phrase
  when /^(#{PL_v_special_s})$/, /\s/
    nil

  # Standard 3rd person (chop the ...(e)s off single words)
  when /^(.*)([cs]h|[x]|zz|ss)es$/i
    Regexp.last_match(1) + Regexp.last_match(2)
  when /^(..+)ies$/i
    "#{Regexp.last_match(1)}y"
  when /^(.+)oes$/i
    "#{Regexp.last_match(1)}o"
  when /^(.*[^s])s$/i
    Regexp.last_match(1)

  # Otherwise, a regular verb (handle elsewhere)
  else
    nil
  end
end
883
+
884
+
885
### Pluralize regular verbs.
###
### Returns +word+ unchanged for a singular count. Otherwise an ambiguous
### present-tense form listed in PL_v_ambiguous_pres_h is replaced by its
### plural form (any trailing phrase is kept); every other verb is returned
### as given. A nil +count+ falls back to Linguistics::num.
def pluralize_general_verb( word, count )
  number = normalizeCount( count || Linguistics::num )

  return word if /^(#{PL_count_one})$/i =~ number.to_s

  case word

  # Ambiguous present tenses (simple and compound)
  when /^(#{PL_v_ambiguous_pres})((\s.*)?)$/i
    PL_v_ambiguous_pres_h[ Regexp.last_match(1).downcase ] + Regexp.last_match(2)

  # Ambiguous preterite and perfect tenses are already plural
  when /^(#{PL_v_ambiguous_non_pres})((\s.*)?)$/i
    word

  # Otherwise, 1st or 2nd person is uninflected
  else
    word
  end
end
907
+
908
+
909
### Handle special adjectives.
###
### Returns +word+ unchanged for a singular count. Otherwise returns the
### plural of a demonstrative or article ("a" -> "some", "this" -> "these"),
### of a possessive adjective ("my" -> "our"), or of a possessive noun
### ("dog's" -> "dogs'"). Returns nil for any other word. A nil +count+ falls
### back to Linguistics::num.
def pluralize_special_adjective( word, count )
  number = normalizeCount( count || Linguistics::num )

  return word if /^(#{PL_count_one})$/i =~ number.to_s

  # Handle user-defined verbs
  #if value = ud_match( word, PL_adj_user_defined )
  #  return value
  #end

  case word

  # Known cases
  when /^(#{PL_adj_special})$/i
    PL_adj_special_h[ Regexp.last_match(1).downcase ]

  # Possessive adjectives
  when /^(#{PL_adj_poss})$/i
    PL_adj_poss_h[ Regexp.last_match(1).downcase ]

  # Possessive nouns: pluralize the noun, then re-attach the apostrophe,
  # adding an "s" only when the plural doesn't already end in one
  when /^(.*)'s?$/
    plural = plural_noun( Regexp.last_match(1) )
    ending = (/s$/ =~ plural) ? "'" : "'s"
    "#{plural}#{ending}"

  # Otherwise, no idea
  else
    nil
  end
end
944
+
945
+
946
### Returns the given +word+ with a prepended indefinite article ("a" or
### "an"), unless +count+ is non-nil and not singular, in which case the
### result is "<count> <word>". A nil +count+ falls back to Linguistics::num.
###
### The article is picked by the first matching rule below; the order of the
### patterns matters.
def indef_article( word, count )
  count ||= Linguistics::num
  if count && /^(#{PL_count_one})$/i !~ count.to_s
    return "#{count} #{word}"
  end

  # Handle user-defined variants
  # return value if value = ud_match( word, A_a_user_defined )

  article =
    case word

    # Explicit exceptions, capitalized abbreviations that are spelled out
    # with a leading vowel sound, and single letters with a vowel-sound name
    when /^(#{A_explicit_an})/i,
         /^(#{A_abbrev})/x,
         /^[aefhilmnorsx][.-]/i
      "an"

    # Any other single letter, leading consonants, and vowel spellings that
    # are pronounced with a leading consonant sound
    when /^[a-z][.-]/i,
         /^[^aeiouy]/i,
         /^e[uw]/i,
         /^onc?e\b/i,
         /^uni([^nmd]|mo)/i,
         /^u[bcfhjkqrst][aeiou]/i
      "a"

    # Remaining vowels, and y... before certain consonants (implies an
    # (unnaturalized) "i.." sound)
    when /^[aeiou]/i,
         /^(#{A_y_cons})/i
      "an"

    # Otherwise, guess "a"
    else
      "a"
    end

  "#{article} #{word}"
end
997
+
998
+
999
### Transform the units-place numeral +units+ into its word, followed by the
### name of the magnitude for the given number of +thousands+ places (as
### produced by #to_thousands).
def to_units( units, thousands=0 )
  name = Units[ units ]
  magnitude = to_thousands( thousands )
  name + magnitude
end
1004
+
1005
+
1006
### Transform the tens- and units-place numerals +tens+ and +units+ into a
### word-phrase, followed by the magnitude name for the given number of
### +thousands+ places. 10..19 use the Teens names; everything else is the
### tens word and the units word, hyphenated when both are non-zero.
def to_tens( tens, units, thousands=0 )
  return Teens[ units ] + to_thousands( thousands ) if tens == 1

  hyphen = (tens.nonzero? && units.nonzero?) ? '-' : ''
  Tens[ tens ] + hyphen + to_units( units, thousands )
end
1016
+
1017
+
1018
### Transform the hundreds-, tens-, and units-place numerals into a word
### phrase, followed by the magnitude name for the given number of
### +thousands+ places. +joinword+ goes between the hundreds and the rest
### when both are present; an empty +joinword+ is replaced by a single space.
### Returns nil when all three numerals are zero.
def to_hundreds( hundreds, tens=0, units=0, thousands=0, joinword=" and " )
  joinword = ' ' if joinword.empty?

  if hundreds.nonzero?
    glue = (tens.nonzero? || units.nonzero?) ? joinword : ''
    to_units( hundreds ) + " hundred" + glue +
      to_tens( tens, units ) +
      to_thousands( thousands )
  elsif tens.nonzero? || units.nonzero?
    to_tens( tens, units ) + to_thousands( thousands )
  end
end
1035
+
1036
+ ### Transform the specified number into one or more words like 'thousand',
1037
+ ### 'million', etc. Uses the thousands (American) system.
1038
def to_thousands( thousands=0 )
  # The last entry of Thousands is repeated once for every full trip
  # through the table; the first word is picked by the remainder.
  cycle = Thousands.length - 1
  words = (0..thousands).step( cycle ).map do |offset|
    offset.zero? ? Thousands[ thousands % cycle ] : Thousands.last
  end

  words.join( " " )
end
1050
+
1051
+
1052
+ ### Return the specified number +num+ as an array of number phrases.
1053
def number_to_words( num, config )
  return [config[:zero]] if num.to_i.zero?
  chunks = []

  # Break into word-groups if groups is set
  if config[:group].nonzero?

    # Build a Regexp with <config[:group]> number of digits. Any past
    # the first are optional.
    re = Regexp::new( "(\\d)" + ("(\\d)?" * (config[:group] - 1)) )

    # Scan the string, and call the word-chunk function that deals with
    # chunks of the found number of digits.
    num.to_s.scan( re ) {|digits|
      debugMsg " digits = #{digits.inspect}"

      # Unmatched optional groups come back as nil; drop them before
      # counting. (Array#nitems, which used to do this count, no longer
      # exists in Ruby 1.9 and later.)
      numerals = digits.compact.collect {|i| i.to_i}
      fn = NumberToWordsFunctions[ numerals.length ]
      debugMsg " numerals = #{numerals.inspect}"
      chunks.push fn.call( config[:zero], *numerals ).strip
    }
  else
    # Work on a copy: the digits are consumed destructively below, and
    # String#to_s returns the receiver itself.
    phrase = num.to_s.dup
    phrase.sub!( /\A\s*0+/, '' )
    mill = 0

    # Match backward from the end of the digits in the string, turning
    # chunks of three, of two, and of one into words.
    mill += 1 while
      phrase.sub!( /(\d)(\d)(\d)(?=\D*\Z)/ ) {
        words = to_hundreds( $1.to_i, $2.to_i, $3.to_i, mill,
                             config[:and] )
        chunks.unshift words.strip.squeeze(' ') unless words.nil?
        ''
      }

    phrase.sub!( /(\d)(\d)(?=\D*\Z)/ ) {
      chunks.unshift to_tens( $1.to_i, $2.to_i, mill ).strip.squeeze(' ')
      ''
    }
    phrase.sub!( /(\d)(?=\D*\Z)/ ) {
      chunks.unshift to_units( $1.to_i, mill ).strip.squeeze(' ')
      ''
    }
  end

  return chunks
end
1100
+
1101
+
1102
+ #################################################################
1103
+ ### P U B L I C F U N C T I O N S
1104
+ #################################################################
1105
+
1106
+ ### Return the name of the language this module is for.
1107
def language
  # Human-readable name of the language this module implements.
  return "English"
end
1110
+
1111
+
1112
+ ### Return the plural of the given +phrase+ if +count+ indicates it should
1113
+ ### be plural.
1114
def plural( phrase, count=nil )
  # Peel off surrounding whitespace so it can be restored afterwards.
  lead, core, trail = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s ).to_a[1,3]
  return phrase if core.nil? || core.empty?

  # First inflector that recognises the word wins: special adjectives,
  # then special verbs, then the noun rules.
  inflected = pluralize_special_adjective( core, count ) ||
    pluralize_special_verb( core, count ) ||
    pluralize_noun( core, count )

  lead + postprocess( core, inflected ) + trail
end
1126
+ alias_method :PL, :plural
1127
+
1128
+
1129
+ ### Return the plural of the given noun +phrase+ if +count+ indicates it
1130
+ ### should be plural.
1131
def plural_noun( phrase, count=nil )
  # Peel off surrounding whitespace so it can be restored afterwards.
  lead, core, trail = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s ).to_a[1,3]
  return phrase if core.nil? || core.empty?

  inflected = pluralize_noun( core, count )
  lead + postprocess( core, inflected ) + trail
end
1139
+ alias_method :PL_N, :plural_noun
1140
+
1141
+
1142
+ ### Return the plural of the given verb +phrase+ if +count+ indicates it
1143
+ ### should be plural.
1144
def plural_verb( phrase, count=nil )
  # Peel off surrounding whitespace so it can be restored afterwards.
  lead, core, trail = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s ).to_a[1,3]
  return phrase if core.nil? || core.empty?

  # Special-case verbs take precedence over the general verb rules.
  inflected = pluralize_special_verb( core, count ) ||
    pluralize_general_verb( core, count )

  lead + postprocess( core, inflected ) + trail
end
1154
+ alias_method :PL_V, :plural_verb
1155
+
1156
+
1157
+ ### Return the plural of the given adjectival +phrase+ if +count+ indicates
1158
+ ### it should be plural.
1159
def plural_adjective( phrase, count=nil )
  # Peel off surrounding whitespace so it can be restored afterwards.
  lead, core, trail = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s ).to_a[1,3]
  return phrase if core.nil? || core.empty?

  # Adjectives without a special plural form are passed through as-is.
  inflected = pluralize_special_adjective( core, count ) || core
  lead + postprocess( core, inflected ) + trail
end
1168
+ alias_method :plural_adj, :plural_adjective
1169
+ alias_method :PL_ADJ, :plural_adjective
1170
+
1171
+
1172
+ ### Return the given phrase with the appropriate indefinite article ("a" or
1173
+ ### "an") prepended.
1174
def a( phrase, count=nil )
  # Peel off surrounding whitespace so it can be restored afterwards.
  lead, core, trail = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s ).to_a[1,3]
  return phrase if core.nil? || core.empty?

  lead + indef_article( core, count ) + trail
end
1182
+ alias_method :an, :a
1183
+ alias_method :A, :a
1184
+ alias_method :AN, :a
1185
+
1186
+
1187
+ ### Translate zero-quantified +phrase+ to "no +phrase.plural+"
1188
def no( phrase, count=nil )
  md = /\A(\s*)(.+?)(\s*)\Z/.match( phrase.to_s )
  pre, word, post = md.to_a[1,3]

  # Nothing to quantify: hand the phrase back untouched, as the other
  # inflection entry points do, instead of failing on nil parts below.
  return phrase if word.nil? || word.empty?

  count ||= Linguistics::num || 0

  # A count that reads as "zero" becomes "no <plural>"; anything else
  # is rendered as "<count> <inflected word>".
  if /^#{PL_count_zero}$/ =~ count.to_s
    return "#{pre}no " + plural( word, 0 ) + post
  else
    return "#{pre}#{count} " + plural( word, count ) + post
  end
end
1199
+ alias_method :NO, :no
1200
+
1201
+
1202
+ ### Participles
1203
def present_participle( word )
  # Start from the plural (base) form of the verb. Work on a copy so the
  # in-place substitutions below can never touch the caller's String.
  stem = plural_verb( word.to_s, 2 ).dup

  # Apply the first stem adjustment that matches, then stop:
  stem.sub!( /ie$/, 'y' ) ||                # lie   -> ly(ing)
    stem.sub!( /ue$/, 'u' ) ||              # argue -> argu(ing)
    stem.sub!( /([auy])e$/, '\\1' ) ||      # drop the 'e' after a, u or y
    /ski$/.match( stem ) ||                 # ski   -> ski(ing): keep the 'i'
    stem.sub!( /i$/, '' ) ||
    stem.sub!( /([^e])e$/, "\\1" ) ||       # make  -> mak(ing)
    /er$/.match( stem ) ||                  # ...er verbs are left alone
    stem.sub!( /([^aeiou][aeiouy]([bdgmnprst]))$/, "\\1\\2" ) # run -> runn(ing)

  return "#{stem}ing"
end
1216
+ alias_method :part_pres, :present_participle
1217
+ alias_method :PART_PRES, :present_participle
1218
+
1219
+
1220
+
1221
+ ### Return the specified number as english words. One or more configuration
1222
+ ### values may be passed to control the returned String:
1223
+ ###
1224
+ ### [<b>:group</b>]
1225
+ ### Controls how many numbers at a time are grouped together. Valid values
1226
+ ### are +0+ (normal grouping), +1+ (single-digit grouping, e.g., "one,
1227
+ ### two, three, four"), +2+ (double-digit grouping, e.g., "twelve,
1228
+ ### thirty-four"), or +3+ (triple-digit grouping, e.g., "one twenty-three,
1229
+ ### four").
1230
+ ### [<b>:comma</b>]
1231
+ ### Set the character/s used to separate word groups. Defaults to +", "+.
1232
+ ### [<b>:and</b>]
1233
+ ### Set the word and/or characters used where ' and ' (the default) is
1234
+ ### normally used. Setting <tt>:and</tt> to +' '+, for example, will cause
1235
+ ### +2556+ to be returned as "two-thousand, five hundred fifty-six"
1236
+ ### instead of "two-thousand, five hundred and fifty-six".
1237
+ ### [<b>:zero</b>]
1238
+ ### Set the word used to represent the numeral +0+ in the result. +'zero'+
1239
+ ### is the default.
1240
+ ### [<b>:decimal</b>]
1241
+ ### Set the translation of any decimal points in the number; the default
1242
+ ### is +'point'+.
1243
+ ### [<b>:asArray</b>]
1244
+ ### If set to a true value, the number will be returned as an array of
1245
+ ### word groups instead of a String.
1246
def numwords( number, hashargs={} )
  # Copy the stringified number: it is edited in place below, and
  # String#to_s returns the caller's own object.
  num = number.to_s.dup
  config = NumwordDefaults.dup.update( hashargs )
  raise "Bad chunking option: #{config[:group]}" unless
    config[:group].between?( 0, 3 )

  # Array of number parts: first is everything to the left of the first
  # decimal, followed by any groups of decimal-delimited numbers after that
  parts = []

  # Wordify any sign prefix
  sign = (/\A\s*\+/ =~ num) ? 'plus' : (/\A\s*\-/ =~ num) ? 'minus' : ''

  # Strip any ordinal suffixes
  ord = true if num.sub!( /(st|nd|rd|th)\Z/, '' )

  # Split the number into chunks delimited by '.'
  chunks =
    if config[:decimal].empty?
      [ num ]
    elsif config[:group].nonzero?
      num.split( /\./ )
    else
      num.split( /\./, 2 )
    end

  # Wordify each chunk, pushing arrays into the parts array
  chunks.each_with_index {|chunk,section|
    chunk.gsub!( /\D+/, '' )

    # An empty whole-number part contributes an empty array. Empty
    # later sections fall through to number_to_words.
    if chunk.empty? && section.zero?
      parts.push []
      next
    end

    # Post-decimal sections of a non-grouped number are processed
    # with :group forced to 1.
    subconfig =
      if config[:group].zero? && section.nonzero?
        config.dup.update( :group => 1 )
      else
        config
      end
    parts.push number_to_words( chunk, subconfig )
  }

  # Input with no chunks at all (e.g. an empty string) still needs a
  # whole-number slot for the code below to index into.
  parts.push( [] ) if parts.empty?

  debugMsg "Parts => #{parts.inspect}"

  # Turn the last word of the whole-number part back into an ordinal if
  # the original number came in that way.
  if ord && !parts[0].empty?
    parts[0][-1] = ordinal( parts[0].last )
  end

  # If the caller's expecting an Array return, just flatten and return the
  # parts array.
  if config[:asArray]
    parts[0].unshift( sign ) unless sign.empty?
    return parts.flatten
  end

  # Catenate each sub-parts array into a whole number part and one or more
  # post-decimal parts. If grouping is turned on, all sub-parts get joined
  # with commas, otherwise just the whole-number part is.
  if config[:group].zero?
    whole = parts[0]

    # Count only non-nil entries (Array#nitems is gone as of Ruby 1.9).
    if whole.compact.length > 1

      # Join all but the last part together with commas
      wholenum = whole[0...-1].join( config[:comma] )

      # If the last part is just a single word, append it to the
      # wholenum part with an 'and'. This is to get things like 'three
      # thousand and three' instead of 'three thousand, three'.
      if /^\s*(\S+)\s*$/ =~ whole.last
        wholenum += " and #{whole.last}"
      else
        wholenum += config[:comma] + whole.last
      end
    else
      wholenum = whole[0]
    end
    decimals = parts[1..-1].collect {|part| part.join(" ")}

    debugMsg "Wholenum: #{wholenum.inspect}; decimals: #{decimals.inspect}"

    # Join with the configured decimal; if it's empty, just join with
    # spaces.
    joiner = config[:decimal].empty? ? " " : " #{config[:decimal]} "
    body = ([ wholenum ] + decimals).join( joiner ).strip

    # Keep the sign word separated from the number ("minus five", not
    # "minusfive").
    return [ sign, body ].reject {|str| str.empty? }.join( " " )
  else
    return parts.compact.
      separate( config[:decimal] ).
      delete_if {|el| el.empty?}.
      join( config[:comma] ).
      strip
  end
end
1354
+ alias_method :NUMWORDS, :numwords
1355
+
1356
+
1357
+ ### Transform the given +number+ into an ordinal word. The +number+ object
1358
+ ### can be either an Integer or a String.
1359
def ordinal( number )
  if number.is_a?( Integer )
    # Numerals get a suffix: try the last two digits first, then fall
    # back to the last digit alone.
    suffix = Nth[ number % 100 ] || Nth[ number % 10 ]
    number.to_s + suffix
  else
    # Number words have their trailing word swapped for its ordinal form.
    number.to_s.sub( /(#{OrdinalSuffixes})\Z/ ) { Ordinals[ Regexp.last_match(1) ] }
  end
end
1368
+ alias_method :ORD, :ordinal
1369
+
1370
+
1371
+ ### Return a phrase describing the specified +number+ of objects in the
1372
+ ### given +phrase+. The following options can be used to control the makeup
1373
+ ### of the returned quantity String:
1374
+ ###
1375
+ ### [<b>:joinword</b>]
1376
+ ### Sets the word (and any surrounding spaces) used as the word separating the
1377
+ ### quantity from the noun in the resulting string. Defaults to <tt>' of
1378
+ ### '</tt>.
1379
def quantify( phrase, number=0, args={} )
  num = number.to_i
  config = QuantifyDefaults.dup.update( args )

  # Zero and one have dedicated forms.
  return no( phrase ) if num == 0
  return a( phrase ) if num == 1

  # Vague quantifiers, checked in order; the first range containing the
  # number supplies the prefix.
  vague_quantifiers = [
    [ SeveralRange,  "several " ],
    [ NumberRange,   "a number of " ],
    [ NumerousRange, "numerous " ],
    [ ManyRange,     "many " ],
  ]
  vague_quantifiers.each do |range, prefix|
    return prefix + plural( phrase, num ) if range === num
  end

  # Anything outside those ranges gets described like
  # "hundreds of thousands of..." or "millions of...", depending on the
  # order of magnitude.
  thousands, subthousands = Math::log10( num ).to_i.divmod( 3 )
  stword = { 2 => "hundreds", 1 => "tens" }[ subthousands ]
  thword = plural( to_thousands(thousands).strip )
  thword = nil if thword.empty?

  # [hundreds (of)] [thousands (of)] <plural phrase>
  [ stword, thword, plural(phrase, number) ].compact.join( config[:joinword] )
end
1425
+
1426
+
1427
+ ### Return the specified +obj+ (which must support the <tt>#collect</tt>
1428
+ ### method) as a conjunction. Each item is converted to a String if it is
1429
+ ### not already (using #to_s) unless a block is given, in which case it is
1430
+ ### called once for each object in the array, and the stringified return
1431
+ ### value from the block is used instead. Returning +nil+ causes that
1432
+ ### particular element to be omitted from the resulting conjunction. The
1433
+ ### following options can be used to control the makeup of the returned
1434
+ ### conjunction String:
1435
+ ###
1436
+ ### [<b>:separator</b>]
1437
+ ### Specify one or more characters to separate items in the resulting
1438
+ ### list. Defaults to <tt>', '</tt>.
1439
+ ### [<b>:altsep</b>]
1440
+ ### An alternate separator to use if any of the resulting conjunction's
1441
+ ### clauses contain the <tt>:separator</tt> character/s. Defaults to <tt>'; '</tt>.
1442
+ ### [<b>:penultimate</b>]
1443
+ ### Flag that indicates whether or not to join the last clause onto the
1444
+ ### rest of the conjunction using a penultimate <tt>:separator</tt>. E.g.,
1445
+ ### %w{duck cow dog}.en.conjunction
1446
+ ### # => "a duck, a cow, and a dog"
1447
+ ### %w{duck cow dog}.en.conjunction( :penultimate => false )
1448
+ ### # => "a duck, a cow and a dog"
1449
+ ### Default to <tt>true</tt>.
1450
+ ### [<b>:conjunctive</b>]
1451
+ ### Sets the word used as the conjunctive (separating word) of the
1452
+ ### resulting string. Default to <tt>'and'</tt>.
1453
+ ### [<b>:combine</b>]
1454
+ ### If set to <tt>true</tt> (the default), items which are identical (after
1455
+ ### surrounding spaces are stripped) will be combined in the resulting
1456
+ ### conjunction. E.g.,
1457
+ ### %w{goose cow goose dog}.en.conjunction
1458
+ ### # => "two geese, a cow, and a dog"
1459
+ ### %w{goose cow goose dog}.en.conjunction( :combine => false )
1460
+ ### # => "a goose, a cow, a goose, and a dog"
1461
+ ### [<b>:casefold</b>]
1462
+ ### If set to <tt>true</tt> (the default), then items are compared
1463
+ ### case-insensitively when combining them. This has no effect if
1464
+ ### <tt>:combine</tt> is <tt>false</tt>.
1465
+ ### [<b>:generalize</b>]
1466
+ ### If set to <tt>true</tt>, then quantities of combined items are turned into
1467
+ ### general descriptions instead of exact amounts.
1468
+ ### ary = %w{goose pig dog horse goose reindeer goose dog horse}
1469
+ ### ary.en.conjunction
1470
+ ### # => "three geese, two dogs, two horses, a pig, and a reindeer"
1471
+ ### ary.en.conjunction( :generalize => true )
1472
+ ### # => "several geese, several dogs, several horses, a pig, and a reindeer"
1473
+ ### See the #quantify method for specifics on how quantities are
1474
+ ### generalized. Generalization defaults to <tt>false</tt>, and has no effect if
1475
+ ### :combine is <tt>false</tt>.
1476
+ ### [<b>:quantsort</b>]
1477
+ ### If set to <tt>true</tt> (the default), items which are combined in the
1478
+ ### resulting conjunction will be listed in order of amount, with greater
1479
+ ### quantities sorted first. If <tt>:quantsort</tt> is <tt>false</tt>, combined items
1480
+ ### will appear where the first instance of them occurred in the
1481
+ ### list. This sort is also the fallback for identical quantities (ie.,
1482
+ ### items of the same quantity will be listed in the order they appeared
1483
+ ### in the source list).
1484
+ ###
1485
def conjunction( obj, args={} )
  config = ConjunctionDefaults.dup.update( args )

  # Transform items in the obj to phrases. Block results of nil are
  # omitted; everything else is stringified.
  phrases =
    if block_given?
      obj.collect {|item| yield(item) }.compact.collect {|res| res.to_s }
    else
      obj.collect {|item| item.to_s }
    end

  # No need for a conjunction if there's only one thing
  return a( phrases[0] ) if phrases.length < 2

  # Set up a Proc to derive a collector key from a phrase depending on the
  # configuration
  keyfunc =
    if config[:casefold]
      proc {|key| key.downcase.strip }
    else
      proc {|key| key.strip }
    end

  # Count how many times each key occurs. When combining, only the first
  # phrase for each key is kept; otherwise every phrase stays and counts
  # as one.
  collector = {}
  if config[:combine]
    firsts = []
    phrases.each do |phrase|
      key = keyfunc[ phrase ]
      if collector.key?( key )
        collector[ key ] += 1
      else
        collector[ key ] = 1
        firsts << phrase
      end
    end
    phrases = firsts
  else
    phrases.each {|phrase| collector[ keyfunc[phrase] ] = 1 }
  end

  # If sort-by-quantity is turned on, sort the phrases first by how many
  # there are (most-first), and then by the order they were specified in.
  if config[:quantsort] && config[:combine]
    origorder = {}
    phrases.each_with_index {|phrase,i| origorder[ keyfunc[phrase] ] ||= i }
    phrases.sort! {|x,y|
      (collector[ keyfunc[y] ] <=> collector[ keyfunc[x] ]).nonzero? ||
        (origorder[ keyfunc[x] ] <=> origorder[ keyfunc[y] ])
    }
  end

  # Set up a filtering function that adds either an indefinite article, an
  # indefinite quantifier, or a definite quantifier to each phrase
  # depending on the configuration and the count of phrases in the
  # collector.
  filter =
    if config[:generalize]
      proc {|phrase, count| quantify(phrase, count) }
    else
      proc {|phrase, count|
        if count > 1
          "%s %s" % [
            # :TODO: Make this threshold settable
            count < 10 ? numwords( count ) : count.to_s,
            plural(phrase, count)
          ]
        else
          a( phrase )
        end
      }
    end

  # Now use the configured filter to turn each phrase into its final form.
  phrases.collect! {|phrase| filter[phrase, collector[ keyfunc[phrase] ]] }

  # Remember how many clauses there are *before* any merging below, so
  # that three clauses joined without a penultimate separator still get
  # a separator between the first two.
  phrase_count = phrases.length

  if phrase_count >= 2
    # Prepend the conjunctive to the last element unless it's empty
    unless config[:conjunctive].strip.empty?
      phrases[-1] = "#{config[:conjunctive]} #{phrases[-1]}"
    end

    # Catenate the last two elements if there's no penultimate separator
    unless config[:penultimate]
      tail = phrases.pop
      phrases[-1] = "#{phrases[-1]} #{tail}"
    end
  end

  # Pick a separator based on how many phrases there were and whether or
  # not there's already an instance of it (taken literally) in the
  # phrases.
  sep =
    if phrase_count <= 2
      ' '
    elsif phrases.any? {|str| str.include?( config[:separator] ) }
      config[:altsep]
    else
      config[:separator]
    end

  return phrases.join( sep )
end
1591
+
1592
+
1593
+ ### Turns a camel-case +string+ ("camelCaseToEnglish") to plain English
1594
+ ### ("camel case to english"). Each word is decapitalized.
1595
def camel_case_to_english( string )
  # Put a space at every lower-to-upper case transition, then fold the
  # whole thing to lower case.
  spaced = string.to_s.gsub( /([a-z])([A-Z])/, '\1 \2' )
  spaced.downcase
end
1598
+
1599
+
1600
+ ### Turns an English language +string+ into a CamelCase word.
1601
def english_to_camel_case( string )
  # Each run of whitespace followed by a lower-case letter collapses to
  # that letter, upper-cased.
  string.to_s.gsub( /\s+([a-z])/ ) do
    Regexp.last_match( 1 ).upcase
  end
end
1604
+
1605
+
1606
+ ### This method doesn't work quite right yet. It does okay for simple cases,
1607
+ ### but it misses more complex ones, e.g. 'as' used as a coordinating
1608
+ ### conjunction in "A Portrait of the Artist as a Young Man". Perhaps after
1609
+ ### there's a working (non-leaking) LinkParser for Ruby, this can be fixed
1610
+ ### up. Until then it'll just be undocumented.
1611
+
1612
+ ### Returns the given +string+ as a title-cased phrase.
1613
def titlecase( string ) # :nodoc:

  # Split on word-boundaries
  words = string.to_s.split( /\b/ )

  # Nothing to capitalize in an empty string
  return '' if words.empty?

  # Always capitalize the first and last words
  words.first.capitalize!
  words.last.capitalize!

  # Now scan the rest of the tokens, skipping non-words and capitalization
  # exceptions.
  words.each_with_index do |word, i|

    # Non-words
    next unless /^\w+$/.match( word )

    # Skip exception-words
    next if TitleCaseExceptions.include?( word )

    # Skip second parts of contractions. A contraction needs a word and
    # an apostrophe in front of it, so only look back when both tokens
    # exist; negative indexes would wrap around to the end of the list.
    next if i >= 2 && words[i - 1] == "'" && /\w/.match( words[i - 2] )

    # Have to do it this way instead of capitalize! because that method
    # also downcases all other letters.
    word.gsub!( /^(\w)(.*)/ ) { $1.upcase + $2 }
  end

  return words.join
end
1642
+
1643
+
1644
+ ### Returns the proper noun form of a string by capitalizing most of the
1645
+ ### words.
1646
+ ###
1647
+ ### Examples:
1648
+ ### English.proper_noun("bosnia and herzegovina") ->
1649
+ ### "Bosnia and Herzegovina"
1650
+ ### English.proper_noun("macedonia, the former yugoslav republic of") ->
1651
+ ### "Macedonia, the Former Yugoslav Republic of"
1652
+ ### English.proper_noun("virgin islands, u.s.") ->
1653
+ ### "Virgin Islands, U.S."
1654
def proper_noun( string )
  # Words that stay lower-case wherever they appear.
  minor_words = %w{and the of}

  # Splitting with a capture group keeps the space/period separators as
  # their own tokens, so the final join restores the original spacing.
  tokens = string.split( /([ .]+)/ ).collect do |token|
    if /^[a-z]/.match( token ) && !minor_words.include?( token )
      token.capitalize
    else
      token
    end
  end

  tokens.join
end
1661
+
1662
+ end # module Linguistics::EN
1663
+
1664
+
1665
+ ### Add the #separate and #separate! methods to Array.
1666
class Array # :nodoc:

  ### Non-destructive variant of #separate!: returns a copy of the receiver
  ### with a separator placed between each pair of neighbouring elements.
  ### The separator is +value+, or, when a block is given, the block's
  ### return value for each neighbouring pair.
  def separate( value=:__no_arg__, &block )
    dup.separate!( value, &block )
  end

  ### Insert a separator between each pair of neighbouring elements of the
  ### receiver, in place, and return the receiver. Raises ArgumentError if
  ### neither a +value+ nor a block is supplied.
  def separate!( value=:__no_arg__ )
    if value == :__no_arg__ && !block_given?
      raise ArgumentError, "wrong number of arguments: (0 for 1)"
    end

    # Separators land on the odd indexes of the finished array; the upper
    # bound is fixed from the length before anything is inserted.
    last_slot = (length * 2) - 2
    1.step( last_slot, 2 ) do |slot|
      filler = block_given? ? yield( self[slot - 1, 2] ) : value
      insert( slot, filler )
    end

    self
  end

end
1694
+