linguistics 1.0.9 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. data.tar.gz.sig +0 -0
  2. data/.gemtest +0 -0
  3. data/ChangeLog +849 -342
  4. data/History.rdoc +11 -0
  5. data/LICENSE +9 -9
  6. data/Manifest.txt +44 -0
  7. data/README.rdoc +226 -0
  8. data/Rakefile +32 -349
  9. data/examples/endocs.rb +272 -0
  10. data/examples/generalize_sentence.rb +2 -1
  11. data/examples/klingon.rb +22 -0
  12. data/lib/linguistics.rb +130 -292
  13. data/lib/linguistics/en.rb +337 -1628
  14. data/lib/linguistics/en/articles.rb +138 -0
  15. data/lib/linguistics/en/conjugation.rb +2245 -0
  16. data/lib/linguistics/en/conjunctions.rb +202 -0
  17. data/lib/linguistics/en/{infinitive.rb → infinitives.rb} +41 -55
  18. data/lib/linguistics/en/linkparser.rb +41 -49
  19. data/lib/linguistics/en/numbers.rb +483 -0
  20. data/lib/linguistics/en/participles.rb +33 -0
  21. data/lib/linguistics/en/pluralization.rb +810 -0
  22. data/lib/linguistics/en/stemmer.rb +75 -0
  23. data/lib/linguistics/en/titlecase.rb +121 -0
  24. data/lib/linguistics/en/wordnet.rb +63 -97
  25. data/lib/linguistics/inflector.rb +89 -0
  26. data/lib/linguistics/iso639.rb +534 -448
  27. data/lib/linguistics/languagebehavior.rb +36 -0
  28. data/lib/linguistics/monkeypatches.rb +42 -0
  29. data/spec/lib/constants.rb +15 -0
  30. data/spec/lib/helpers.rb +38 -0
  31. data/spec/linguistics/en/articles_spec.rb +797 -0
  32. data/spec/linguistics/en/conjugation_spec.rb +2083 -0
  33. data/spec/linguistics/en/conjunctions_spec.rb +154 -0
  34. data/spec/linguistics/en/infinitives_spec.rb +518 -0
  35. data/spec/linguistics/en/linkparser_spec.rb +66 -0
  36. data/spec/linguistics/en/numbers_spec.rb +1295 -0
  37. data/spec/linguistics/en/participles_spec.rb +55 -0
  38. data/spec/linguistics/en/pluralization_spec.rb +4636 -0
  39. data/spec/linguistics/en/stemmer_spec.rb +72 -0
  40. data/spec/linguistics/en/titlecase_spec.rb +841 -0
  41. data/spec/linguistics/en/wordnet_spec.rb +85 -0
  42. data/spec/linguistics/en_spec.rb +45 -167
  43. data/spec/linguistics/inflector_spec.rb +40 -0
  44. data/spec/linguistics/iso639_spec.rb +49 -53
  45. data/spec/linguistics/monkeypatches_spec.rb +40 -0
  46. data/spec/linguistics_spec.rb +46 -76
  47. metadata +241 -113
  48. metadata.gz.sig +0 -0
  49. data/README +0 -166
  50. data/README.english +0 -245
  51. data/rake/191_compat.rb +0 -26
  52. data/rake/dependencies.rb +0 -76
  53. data/rake/documentation.rb +0 -123
  54. data/rake/helpers.rb +0 -502
  55. data/rake/hg.rb +0 -318
  56. data/rake/manual.rb +0 -787
  57. data/rake/packaging.rb +0 -129
  58. data/rake/publishing.rb +0 -341
  59. data/rake/style.rb +0 -62
  60. data/rake/svn.rb +0 -668
  61. data/rake/testing.rb +0 -152
  62. data/rake/verifytask.rb +0 -64
  63. data/tests/en/infinitive.tests.rb +0 -207
  64. data/tests/en/inflect.tests.rb +0 -1389
  65. data/tests/en/lafcadio.tests.rb +0 -77
  66. data/tests/en/linkparser.tests.rb +0 -42
  67. data/tests/en/lprintf.tests.rb +0 -77
  68. data/tests/en/titlecase.tests.rb +0 -73
  69. data/tests/en/wordnet.tests.rb +0 -95
@@ -0,0 +1,483 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'linguistics/en' unless defined?( Linguistics::EN )
4
+
5
+ # Numeric methods for the English-language Linguistics module.
6
+ module Linguistics::EN::Numbers
7
+
8
+ # Register this module to the list of modules to include
9
+ Linguistics::EN.register_extension( self )
10
+
11
+ #
12
+ # Numerals, ordinals, and numbers-to-words
13
+ #
14
+
15
+ # Default configuration arguments for the #numwords function
16
# Default configuration arguments for the #numwords function. Options passed
# by the caller are merged over this Hash, so each key is spelled exactly the
# way #numwords looks it up.
NUMWORD_DEFAULTS = {
  :group    => 0,
  :comma    => ', ',
  :and      => ' and ',
  :zero     => 'zero',
  :decimal  => 'point',
  # #numwords reads config[:as_array]; this default used to be keyed
  # :asArray, a name nothing looks up.
  :as_array => false,
}
24
+
25
+ # Default configuration arguments for the #quantify function
26
# Options #quantify falls back on when the caller does not override them
QUANTIFY_DEFAULTS = { :joinword => " of " }

# Inclusive count ranges that #quantify describes with a general word
SEVERAL_RANGE  = Range.new( 2, 5 )
NUMBER_RANGE   = Range.new( 6, 19 )
NUMEROUS_RANGE = Range.new( 20, 45 )
MANY_RANGE     = Range.new( 46, 99 )
35
+
36
+ # Numerical inflections
37
# Ordinal suffixes keyed by a number's last digit (0-9), plus the three
# teens (11-13) that take 'th' instead of following their final digit.
NTH = Hash[
  ( (0..9).to_a + (11..13).to_a ).map {|key| [ key, 'th' ] }
].merge( 1 => 'st', 2 => 'nd', 3 => 'rd' )
52
+
53
+ # Ordinal word parts
54
# Cardinal word endings and the ordinal endings that replace them
ORDINALS = Hash[ [
  %w[ty     tieth],
  %w[one    first],
  %w[two    second],
  %w[three  third],
  %w[five   fifth],
  %w[eight  eighth],
  %w[nine   ninth],
  %w[twelve twelfth],
] ]

# Regexp alternation source built from the endings above. It is computed
# before the empty key is stored, and ends with an empty alternative so a
# word with no listed ending still matches at its very end.
ORDINAL_SUFFIXES = ( ORDINALS.keys + [''] ).join( "|" )

# A word matched by the empty alternative simply gains 'th'
ORDINALS.store( "", 'th' )
66
+
67
+ # Numeral names
68
# Names of the digits 1-9, indexed by digit; index 0 is the empty String
UNITS = %w[one two three four five six seven eight nine].unshift( '' )

# Names of 10-19, indexed by the units digit
TEENS = %w[
  ten eleven twelve thirteen fourteen
  fifteen sixteen seventeen eighteen nineteen
]

# Names of the multiples of ten, indexed by the tens digit; 0 and 1 are empty
TENS = [ '', '' ].concat( %w[twenty thirty forty fifty sixty seventy eighty ninety] )

# Magnitude names with a leading space, indexed by the count of three-digit
# groups: index 0 is a bare space, index 1 ' thousand', then the '-illions'.
THOUSANDS = [ ' ', ' thousand' ].concat(
  %w[
    m b tr quadr quint sext sept oct non dec undec duodec tredec
    quattuordec quindec sexdec septemdec octodec novemdec vigint
  ].map {|stem| ' ' + stem + 'illion' }
)
76
+
77
+
78
+ # A collection of functions for transforming digits into word
79
+ # phrases. Indexed by the number of digits being transformed; e.g.,
80
+ # <tt>NUMBER_TO_WORDS_FUNCTIONS[2]</tt> is the function for transforming
81
+ # double-digit numbers.
82
# Converters used for custom digit grouping, indexed by how many digits are
# in the group. Each one takes the word to use for zero followed by the
# group's digits as Integers, and returns the group's words as a String.
NUMBER_TO_WORDS_FUNCTIONS = [
  # Zero digits: there is nothing to convert, so this always raises
  proc {|*args|
    raise "No digits (#{args.inspect})"
  },

  # One digit: the zero word (with a trailing space) or the unit's name
  proc {|zero, x|
    x.zero? ? "#{zero} " : to_units( x )
  },

  # Two digits: a tens phrase, a zero followed by a unit, or two zeroes
  proc {|zero, x, y|
    if !x.zero?
      to_tens( x, y )
    elsif !y.zero?
      "#{zero} " + NUMBER_TO_WORDS_FUNCTIONS[1].call( zero, y )
    else
      "#{zero} #{zero}"
    end
  },

  # Three digits: the first digit alone, then the remaining pair
  proc {|zero, x, y, z|
    leading  = NUMBER_TO_WORDS_FUNCTIONS[1].call( zero, x )
    trailing = NUMBER_TO_WORDS_FUNCTIONS[2].call( zero, y, z )
    leading + trailing
  },
]
107
+
108
+
109
+ ### Return the specified number as english words. One or more configuration
110
+ ### values may be passed to control the returned String:
111
+ ###
112
+ ### [<b>:group</b>]
113
+ ### Controls how many numbers at a time are grouped together. Valid values
114
+ ### are <code>0</code> (normal grouping), <code>1</code> (single-digit
115
+ ### grouping, e.g., "one, two, three, four"), <code>2</code>
116
+ ### (double-digit grouping, e.g., "twelve, thirty-four"), or <code>3</code>
117
+ ### (triple-digit grouping, e.g., "one twenty-three, four").
118
+ ### [<b>:comma</b>]
119
+ ### Set the character/s used to separate word groups. Defaults to
120
+ ### <code>", "</code>.
121
+ ### [<b>:and</b>]
122
+ ### Set the word and/or characters used where <code>' and ' </code>(the
123
+ ### default) is normally used. Setting <code>:and</code> to
124
+ ### <code>' '</code>, for example, will cause <code>2556</code> to be
125
+ ### returned as "two-thousand, five hundred fifty-six" instead of
126
+ ### "two-thousand, five hundred and fifty-six".
127
+ ### [<b>:zero</b>]
128
+ ### Set the word used to represent the numeral <code>0</code> in the
129
+ ### result. <code>'zero'</code> is the default.
130
+ ### [<b>:decimal</b>]
131
+ ### Set the translation of any decimal points in the number; the default
132
+ ### is <code>'point'</code>.
133
+ ### [<b>:as_array</b>]
134
+ ### If set to a true value, the number will be returned as an array of
135
+ ### word groups instead of a String.
136
### Return the receiver (via #to_s) as English number words.
###
### +hashargs+ is merged over NUMWORD_DEFAULTS; the keys read here are
### :group, :comma, :and, :zero, :decimal and :as_array. Returns a String, or
### a flat Array of word groups when :as_array is true. Raises a RuntimeError
### if :group is not between 0 and 3.
###
### A leading '+' or '-' becomes 'plus' or 'minus'. The ungrouped String
### result puts it in front of the number, separated by a space; the Array
### result gets it as its first element. The grouped (:group 1-3) String
### result does not include the sign.
def numwords( hashargs={} )
  # Work on a private copy: the suffix strip below is destructive, and #to_s
  # may hand back the caller's own (possibly frozen) String.
  num = self.to_s.dup
  self.log.debug "Turning %p into number words..." % [ num ]
  config = NUMWORD_DEFAULTS.merge( hashargs )
  raise "Bad chunking option: #{config[:group]}" unless
    config[:group].between?( 0, 3 )

  # Wordify any sign prefix
  sign = case num
         when /\A\s*\+/ then 'plus'
         when /\A\s*-/  then 'minus'
         else ''
         end

  # Strip any ordinal suffix, remembering that there was one
  ord = !num.sub!( /(st|nd|rd|th)\Z/, '' ).nil?

  # Split the number into chunks delimited by '.'. An ungrouped number is
  # split at the first '.' only; with no decimal word configured it is not
  # split at all.
  chunks = if config[:decimal].empty?
      [ num ]
    elsif config[:group].nonzero?
      num.split( /\./ )
    else
      num.split( /\./, 2 )
    end

  # Splitting an empty String yields no chunks at all; treat that as an empty
  # whole-number part so the code below always has a parts[0] to work with.
  chunks = [ '' ] if chunks.empty?

  # Array of number parts: the first is everything to the left of the first
  # decimal point, followed by one Array per decimal-delimited section.
  parts = []
  chunks.each_with_index do |chunk, section|
    digits = chunk.gsub( /\D+/, '' )
    self.log.debug "  working on chunk %p (section %d)" % [ digits, section ]

    # An empty whole-number part contributes no words. Empty sections after
    # it fall through to number_to_words.
    if digits.empty? && section.zero?
      self.log.debug "  skipping the empty whole-number part"
      parts.push []
      next
    end

    # The decimal sections of an ungrouped number are converted with
    # single-digit grouping; everything else uses the caller's grouping.
    section_config = if config[:group].zero? && section.nonzero?
        config.merge( :group => 1 )
      else
        config
      end
    parts.push number_to_words( digits, section_config )
    self.log.debug "  added %p" % [ parts.last ]
  end

  self.log.debug "Parts => %p" % [ parts ]

  # Turn the last word of the whole-number part back into an ordinal if the
  # original number came in that way. #ordinal takes no arguments, so the
  # word-ending substitution is applied to the String directly.
  if ord && !parts[0].empty?
    self.log.debug "  turning the last whole-number part back into an ordinal, since it " +
      "came in that way"
    parts[0][-1] = parts[0].last.sub( /(#{ORDINAL_SUFFIXES})\Z/ ) { ORDINALS[$1] }
  end

  # If the caller's expecting an Array return, just flatten and return the
  # parts array.
  if config[:as_array]
    self.log.debug "  returning the number parts as an Array"
    parts[0].unshift( sign ) unless sign.empty?
    return parts.flatten
  end

  # With custom grouping every group is joined with the comma, and the
  # decimal word is inserted between sections by Array#separate.
  if config[:group].nonzero?
    self.log.debug "  grouping with decimal %p and comma %p" %
      config.values_at( :decimal, :comma )
    return parts.compact.
      separate( config[:decimal] ).
      delete_if {|el| el.empty?}.
      join( config[:comma] ).
      strip
  end

  self.log.debug "  no custom grouping"
  whole = parts[0]
  if whole.length > 1
    # Join all but the last group with commas. A single-word final group is
    # attached with the 'and' word instead, to get 'three thousand and three'
    # rather than 'three thousand, three'.
    separator = ( /^\s*(\S+)\s*$/ =~ whole.last ) ? config[:and] : config[:comma]
    self.log.debug "  joining the last group with %p" % [ separator ]
    wholenum = whole[0...-1].join( config[:comma] ) + separator + whole.last
  else
    wholenum = whole[0]
  end

  decimals = parts[1..-1].collect {|part| part.join(" ")}
  self.log.debug "  wholenum: %p; decimals: %p" % [ wholenum, decimals ]

  # Join with the configured decimal; if it's empty, just join with spaces.
  joiner = config[:decimal].empty? ? " " : " #{config[:decimal]} "
  words = ( [ wholenum ] + decimals ).join( joiner ).strip

  # Keep the sign word apart from the number ('minus five', not 'minusfive')
  return [ sign, words ].reject {|piece| piece.empty? }.join( ' ' )
end
265
+ Linguistics::EN.register_lprintf_formatter :NUMWORDS, :numwords
266
+
267
+
268
+ ### Transform the given +number+ into an ordinal word. The +number+ object
269
+ ### can be either an Integer or a String.
270
### Return the receiver as an ordinal. A receiver that responds to #to_int
### becomes digits plus a suffix (e.g., "21st"); anything else is converted
### with #to_s and has its final number word turned into its ordinal form
### (e.g., "twenty-one" becomes "twenty-first").
def ordinal
  if self.respond_to?( :to_int )
    number = self.to_int

    # Choose the suffix from the magnitude. Ruby's % takes the sign of the
    # divisor, so -1 % 100 is 99 and a negative number would otherwise always
    # end up with 'th'.
    magnitude = number.abs
    suffix = NTH[ magnitude % 100 ] || NTH[ magnitude % 10 ]
    return "%d%s" % [ number, suffix ]

  else
    number = self.to_s
    self.log.debug "Making an ordinal out of a non-Integer (%p)" % [ number ]

    # ORDINAL_SUFFIXES ends with an empty alternative, so a word with no
    # listed ending matches at its end and receives ORDINALS[""].
    return number.sub( /(#{ORDINAL_SUFFIXES})\Z/ ) { ORDINALS[$1] }
  end
end
281
+ Linguistics::EN.register_lprintf_formatter :ORD, :ordinal
282
+
283
+
284
+ ### Transform the given +number+ into an ordinate word.
285
### Return the receiver as an ordinal in words: the result of #numwords is
### passed through its own English inflector's #ordinal.
def ordinate
  words = self.numwords
  words.en.ordinal
end
288
+
289
+
290
+ ### Return a phrase describing the specified +number+ of objects in the
291
+ ### inflected object in general terms. The following options can be used to
292
+ ### control the makeup of the returned quantity String:
293
+ ###
294
+ ### [<b>:joinword</b>]
295
+ ### Sets the word (and any surrounding spaces) used as the word separating the
296
+ ### quantity from the noun in the resulting string. Defaults to <tt>' of
297
+ ### '</tt>.
298
### Return a phrase describing +number+ of the receiver (via #to_s) in
### general terms. +number+ is converted with #to_i, and +args+ is merged
### over QUANTIFY_DEFAULTS; the only key read here is :joinword.
###
### 0 and 1 use the phrase's #no and #a inflections, counts inside the
### SEVERAL/NUMBER/NUMEROUS/MANY ranges get a fixed quantifier, and any other
### count is described by its order of magnitude, e.g., "hundreds of
### thousands of ...".
def quantify( number=0, args={} )
  phrase = self.to_s
  self.log.debug "Quantifying %d instances of %p" % [ number, phrase ]

  num = number.to_i
  config = QUANTIFY_DEFAULTS.merge( args )

  case num
  when 0
    phrase.en.no
  when 1
    phrase.en.a
  when SEVERAL_RANGE
    "several " + phrase.en.plural( num )
  when NUMBER_RANGE
    "a number of " + phrase.en.plural( num )
  when NUMEROUS_RANGE
    "numerous " + phrase.en.plural( num )
  when MANY_RANGE
    "many " + phrase.en.plural( num )
  else

    # Anything outside the ranges above gets described like "hundreds of
    # thousands of..." or "millions of...", depending on how many there are.
    #
    # The power of ten is taken from the number of digits, which is exact.
    # Going through Float (Math.log10) rounds up for values just below a
    # power of ten, e.g., 999_999_999_999_999, and fails for negative counts.
    magnitude = num.abs.to_s.length - 1
    thousands, subthousands = magnitude.divmod( 3 )
    self.log.debug "thousands = %p, subthousands = %p" % [ thousands, subthousands ]

    stword =
      case subthousands
      when 2 then "hundreds"
      when 1 then "tens"
      end

    thword = thousands.zero? ? nil : to_thousands( thousands ).strip.en.plural

    # "hundreds" (of) "thousands" (of) "stars"; absent pieces are dropped
    [ stword, thword, phrase.en.plural(number) ].compact.join( config[:joinword] )
  end
end
351
+ Linguistics::EN.register_lprintf_formatter :QUANT, :quantify
352
+
353
+
354
+ ###############
355
+ module_function
356
+ ###############
357
+
358
+ ### Transform the specified number of units-place numerals into a
359
+ ### word-phrase at the given number of +thousands+ places.
360
### Return the name of the single digit +units+ followed by the magnitude
### word for +thousands+ groups of three digits.
def to_units( units, thousands=0 )
  numeral = UNITS[ units ]
  numeral + to_thousands( thousands )
end
363
+
364
+
365
+ ### Transform the specified number of tens- and units-place numerals into a
366
+ ### word-phrase at the given number of +thousands+ places.
367
### Return the words for the two-digit number made of the +tens+ and +units+
### digits, followed by the magnitude word for +thousands+ groups of three
### digits. Raises an ArgumentError if either digit is missing.
def to_tens( tens, units, thousands=0 )
  raise ArgumentError, "tens: no implicit conversion from nil" if !tens
  raise ArgumentError, "units: no implicit conversion from nil" if !units

  # 10-19 have names of their own
  return TEENS[ units ] + to_thousands( thousands ) if tens == 1

  decade = TENS[ tens ]
  hyphen = ( tens.nonzero? && units.nonzero? ) ? '-' : ''
  decade + hyphen + to_units( units, thousands )
end
378
+
379
+
380
+ ### Transform the specified number of hundreds-, tens-, and units-place
381
+ ### numerals into a word phrase. If the number of thousands (+thousands+) is
382
+ ### greater than 0, it will be used to determine where the decimal point is
383
+ ### in relation to the hundreds-place number.
384
### Return the words for the three-digit group made of the +hundreds+,
### +tens+ and +units+ digits, followed by the magnitude word for
### +thousands+ groups of three digits. +joinword+ goes between the hundreds
### and whatever follows them; an empty +joinword+ is treated as a single
### space. Returns +nil+ when all three digits are zero.
def to_hundreds( hundreds, tens=0, units=0, thousands=0, joinword=" and " )
  linker = joinword.empty? ? ' ' : joinword

  if hundreds.zero?
    return nil unless tens.nonzero? || units.nonzero?
    return to_tens( tens, units ) + to_thousands( thousands )
  end

  # The join word only appears when something follows the hundreds
  link = ( tens.nonzero? || units.nonzero? ) ? linker : ''
  return to_units( hundreds ) + " hundred" + link +
    to_tens( tens, units ) +
    to_thousands( thousands )
end
397
+
398
+ ### Transform the specified number into one or more words like 'thousand',
399
+ ### 'million', etc. Uses the thousands (American) system.
400
### Return the magnitude word(s) for +thousands+ groups of three digits,
### e.g., ' thousand' or ' million'. Counts beyond the end of the THOUSANDS
### table wrap around it, appending the table's last entry once for each
### full pass.
def to_thousands( thousands=0 )
  cycle = THOUSANDS.length - 1

  names = (0..thousands).step( cycle ).map do |offset|
    offset.zero? ? THOUSANDS[ thousands % cycle ] : THOUSANDS.last
  end

  return names.join( " " )
end
412
+
413
+
414
+ ### Return the specified number +number+ as an array of number phrases.
415
### Return +number+ as an Array of number phrases. A +number+ whose #to_i is
### zero yields just the configured zero word; otherwise config[:group]
### selects between standard grouping (0) and custom digit groups.
def number_to_words( number, config )
  return [ config[:zero] ] if number.to_i.zero?

  groupsize = config[:group]
  return number_to_standard_word_groups( number, config[:and] ) if groupsize.zero?

  return number_to_custom_word_groups( number, groupsize, config[:zero] )
end
424
+
425
+
426
+ ### Split the given +number+ up into groups of +groupsize+ and return
427
+ ### them as an Array of words. Use +zeroword+ for any occurrences of '0'.
428
### Cut the digits of +number+ into consecutive groups of up to +groupsize+
### digits and return one word String per group. +zeroword+ is handed to the
### conversion functions as the word for '0'.
def number_to_custom_word_groups( number, groupsize, zeroword="zero" )
  self.log.debug "Making custom word groups of %d digits out of %p" % [ groupsize, number ]

  # One mandatory digit followed by (groupsize - 1) optional ones, each in
  # its own capture group.
  pattern = Regexp.new( "(\\d)" + ("(\\d)?" * (groupsize - 1)) )
  self.log.debug "  regex for matching groups of %d digits is %p" % [ groupsize, pattern ]

  # Walk the matches, handing each group's digits to the conversion function
  # for that many digits.
  groups = []
  number.to_s.scan( pattern ) do |digits|
    self.log.debug "  digits = %p" % [ digits ]
    numerals = digits.flatten.compact.collect {|i| i.to_i}
    self.log.debug "  numerals = %p" % [ numerals ]

    fn = NUMBER_TO_WORDS_FUNCTIONS[ numerals.length ]
    self.log.debug "  number to word function is #%d: %p" % [ numerals.length, fn ]
    groups << fn.call( zeroword, *numerals ).strip
  end

  return groups
end
448
+
449
+
450
+ ### Split the given +number+ up into groups of three and return
451
+ ### the Array of words describing each group in the standard style.
452
### Split the digits of +number+ into groups of three, working backward from
### the end, and return an Array with the words for each non-zero group in
### reading order. +andword+ is passed to #to_hundreds as its join word; the
### default matches #to_hundreds' own default.
def number_to_standard_word_groups( number, andword=" and " )
  # String#sub (unlike #sub!) always returns a new String, so the destructive
  # matching below consumes a private copy rather than the caller's String.
  phrase = number.to_s.sub( /\A\s*0+/, '' )
  chunks = []
  mill = 0
  self.log.debug "Making standard word groups out of %p" % [ phrase ]

  # Match backward from the end of the digits in the string, turning chunks
  # of three into words. Every chunk consumed moves on to the next magnitude,
  # whether or not it produced any words (an all-zero chunk does not).
  loop do
    consumed = phrase.sub!( /(\d)(\d)(\d)(?=\D*\Z)/ ) do
      words = to_hundreds( $1.to_i, $2.to_i, $3.to_i, mill, andword )
      chunks.unshift( words.strip.squeeze(' ') ) unless words.nil?
      ''
    end
    break unless consumed
    mill += 1
  end

  # At most two leading digits are left over: either a pair...
  phrase.sub!( /(\d)(\d)(?=\D*\Z)/ ) do
    chunks.unshift to_tens( $1.to_i, $2.to_i, mill ).strip.squeeze(' ')
    ''
  end

  # ...or a single digit.
  phrase.sub!( /(\d)(?=\D*\Z)/ ) do
    chunks.unshift to_units( $1.to_i, mill ).strip.squeeze(' ')
    ''
  end

  return chunks
end
480
+
481
+
482
+ end # module Linguistics::EN::Numbers
483
+