linguistics 1.0.9 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (69) hide show
  1. data.tar.gz.sig +0 -0
  2. data/.gemtest +0 -0
  3. data/ChangeLog +849 -342
  4. data/History.rdoc +11 -0
  5. data/LICENSE +9 -9
  6. data/Manifest.txt +44 -0
  7. data/README.rdoc +226 -0
  8. data/Rakefile +32 -349
  9. data/examples/endocs.rb +272 -0
  10. data/examples/generalize_sentence.rb +2 -1
  11. data/examples/klingon.rb +22 -0
  12. data/lib/linguistics.rb +130 -292
  13. data/lib/linguistics/en.rb +337 -1628
  14. data/lib/linguistics/en/articles.rb +138 -0
  15. data/lib/linguistics/en/conjugation.rb +2245 -0
  16. data/lib/linguistics/en/conjunctions.rb +202 -0
  17. data/lib/linguistics/en/{infinitive.rb → infinitives.rb} +41 -55
  18. data/lib/linguistics/en/linkparser.rb +41 -49
  19. data/lib/linguistics/en/numbers.rb +483 -0
  20. data/lib/linguistics/en/participles.rb +33 -0
  21. data/lib/linguistics/en/pluralization.rb +810 -0
  22. data/lib/linguistics/en/stemmer.rb +75 -0
  23. data/lib/linguistics/en/titlecase.rb +121 -0
  24. data/lib/linguistics/en/wordnet.rb +63 -97
  25. data/lib/linguistics/inflector.rb +89 -0
  26. data/lib/linguistics/iso639.rb +534 -448
  27. data/lib/linguistics/languagebehavior.rb +36 -0
  28. data/lib/linguistics/monkeypatches.rb +42 -0
  29. data/spec/lib/constants.rb +15 -0
  30. data/spec/lib/helpers.rb +38 -0
  31. data/spec/linguistics/en/articles_spec.rb +797 -0
  32. data/spec/linguistics/en/conjugation_spec.rb +2083 -0
  33. data/spec/linguistics/en/conjunctions_spec.rb +154 -0
  34. data/spec/linguistics/en/infinitives_spec.rb +518 -0
  35. data/spec/linguistics/en/linkparser_spec.rb +66 -0
  36. data/spec/linguistics/en/numbers_spec.rb +1295 -0
  37. data/spec/linguistics/en/participles_spec.rb +55 -0
  38. data/spec/linguistics/en/pluralization_spec.rb +4636 -0
  39. data/spec/linguistics/en/stemmer_spec.rb +72 -0
  40. data/spec/linguistics/en/titlecase_spec.rb +841 -0
  41. data/spec/linguistics/en/wordnet_spec.rb +85 -0
  42. data/spec/linguistics/en_spec.rb +45 -167
  43. data/spec/linguistics/inflector_spec.rb +40 -0
  44. data/spec/linguistics/iso639_spec.rb +49 -53
  45. data/spec/linguistics/monkeypatches_spec.rb +40 -0
  46. data/spec/linguistics_spec.rb +46 -76
  47. metadata +241 -113
  48. metadata.gz.sig +0 -0
  49. data/README +0 -166
  50. data/README.english +0 -245
  51. data/rake/191_compat.rb +0 -26
  52. data/rake/dependencies.rb +0 -76
  53. data/rake/documentation.rb +0 -123
  54. data/rake/helpers.rb +0 -502
  55. data/rake/hg.rb +0 -318
  56. data/rake/manual.rb +0 -787
  57. data/rake/packaging.rb +0 -129
  58. data/rake/publishing.rb +0 -341
  59. data/rake/style.rb +0 -62
  60. data/rake/svn.rb +0 -668
  61. data/rake/testing.rb +0 -152
  62. data/rake/verifytask.rb +0 -64
  63. data/tests/en/infinitive.tests.rb +0 -207
  64. data/tests/en/inflect.tests.rb +0 -1389
  65. data/tests/en/lafcadio.tests.rb +0 -77
  66. data/tests/en/linkparser.tests.rb +0 -42
  67. data/tests/en/lprintf.tests.rb +0 -77
  68. data/tests/en/titlecase.tests.rb +0 -73
  69. data/tests/en/wordnet.tests.rb +0 -95
@@ -0,0 +1,483 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'linguistics/en' unless defined?( Linguistics::EN )
4
+
5
+ # Numeric methods for the English-language Linguistics module.
6
+ module Linguistics::EN::Numbers
7
+
8
+ # Register this module to the list of modules to include
9
+ Linguistics::EN.register_extension( self )
10
+
11
+ #
12
+ # Numerals, ordinals, and numbers-to-words
13
+ #
14
+
15
# Default configuration arguments for the #numwords function. Callers'
# options are merged over these, so every option #numwords reads should
# have an entry here. The array-return option is spelled :as_array, which
# is the key #numwords actually checks and documents.
NUMWORD_DEFAULTS = {
  :group    => 0,
  :comma    => ', ',
  :and      => ' and ',
  :zero     => 'zero',
  :decimal  => 'point',
  :as_array => false,
}
24
+
25
# Options #quantify falls back on when the caller doesn't supply them.
QUANTIFY_DEFAULTS = { :joinword => " of " }

# Count ranges #quantify maps onto its vague quantity words:
# "several", "a number of", "numerous" and "many" respectively.
SEVERAL_RANGE  = ( 2..5 )
NUMBER_RANGE   = ( 6..19 )
NUMEROUS_RANGE = ( 20..45 )
MANY_RANGE     = ( 46..99 )
35
+
36
# Suffixes for numeric ordinals ("1st", "22nd", ...), keyed by the number's
# trailing digits. #ordinal looks up the last two digits first and falls
# back to the last digit, which is why 11-13 have their own entries.
NTH = {
  0 => 'th', 1 => 'st', 2 => 'nd', 3 => 'rd', 4 => 'th',
  5 => 'th', 6 => 'th', 7 => 'th', 8 => 'th', 9 => 'th',

  # The "teens" exceptions: 11th, 12th, 13th
  11 => 'th', 12 => 'th', 13 => 'th',
}
52
+
53
# Word endings that change irregularly when a number word is turned into
# an ordinal ("twenty" => "twentieth", "one" => "first", ...).
ORDINALS = {
  'ty'     => 'tieth',
  'one'    => 'first',
  'two'    => 'second',
  'three'  => 'third',
  'five'   => 'fifth',
  'eight'  => 'eighth',
  'nine'   => 'ninth',
  'twelve' => 'twelfth',
}

# Regexp alternation of the irregular endings above. It is built before the
# catch-all entry is added, and the trailing "|" supplies an empty
# alternative so that any other word still matches (and gets plain 'th').
ORDINAL_SUFFIXES = "#{ORDINALS.keys.join( '|' )}|"
ORDINALS.store( "", 'th' )
66
+
67
# Number-word tables, each indexed by digit value.

# 0-9; zero is the empty string so it contributes nothing to a phrase.
UNITS = %w[one two three four five six seven eight nine].unshift( '' )

# 10-19, indexed by the units digit.
TEENS = %w[
  ten eleven twelve thirteen fourteen
  fifteen sixteen seventeen eighteen nineteen
]

# Multiples of ten, indexed by the tens digit; 0 and 1 are placeholders.
TENS = [ '', '', *%w[twenty thirty forty fifty sixty seventy eighty ninety] ]

# Scale words indexed by the number of three-digit groups to the right.
# Every entry carries a leading space so it can be appended directly.
THOUSANDS = [ ' ', ' thousand' ] + %w[
  m b tr quadr quint sext sept oct non dec undec duodec tredec
  quattuordec quindec sexdec septemdec octodec novemdec vigint
].map {|prefix| " #{prefix}illion" }
76
+
77
+
78
# Functions that turn a fixed-size group of digits into words, used for
# custom digit grouping. The Array index is the number of digits in the
# group, e.g. <tt>NUMBER_TO_WORDS_FUNCTIONS[2]</tt> handles two-digit
# groups. Each function takes the word to use for zero followed by one
# Integer per digit.
NUMBER_TO_WORDS_FUNCTIONS = [
  # Zero digits is always an error
  proc {|*args| raise "No digits (#{args.inspect})"},

  # One digit: the unit word, or the zero word
  proc {|zero, digit|
    if digit.nonzero?
      to_units( digit )
    else
      "#{zero} "
    end
  },

  # Two digits: a regular tens phrase, a zero-prefixed unit ("zero five"),
  # or the zero word twice
  proc {|zero, tens_digit, units_digit|
    next to_tens( tens_digit, units_digit ) if tens_digit.nonzero?
    next "#{zero} " + NUMBER_TO_WORDS_FUNCTIONS[1].call( zero, units_digit ) if
      units_digit.nonzero?

    ([zero] * 2).join(" ")
  },

  # Three digits: the first digit on its own, followed by the remaining pair
  proc {|zero, first_digit, tens_digit, units_digit|
    single, double = NUMBER_TO_WORDS_FUNCTIONS.values_at( 1, 2 )
    single.call( zero, first_digit ) + double.call( zero, tens_digit, units_digit )
  }
]
107
+
108
+
109
### Return the receiver's string form (<tt>self.to_s</tt>) as English words.
### A leading '+' or '-' is rendered as 'plus' or 'minus', and a trailing
### ordinal suffix ('st', 'nd', 'rd', 'th') makes the whole-number part an
### ordinal (e.g., "12th" => "twelfth"). Raises a RuntimeError if
### <code>:group</code> is not between 0 and 3. One or more configuration
### values may be passed to control the returned String:
###
### [<b>:group</b>]
###   Controls how many numbers at a time are grouped together. Valid values
###   are <code>0</code> (normal grouping), <code>1</code> (single-digit
###   grouping, e.g., "one, two, three, four"), <code>2</code>
###   (double-digit grouping, e.g., "twelve, thirty-four"), or <code>3</code>
###   (triple-digit grouping, e.g., "one twenty-three, four").
### [<b>:comma</b>]
###   Set the character/s used to separate word groups. Defaults to
###   <code>", "</code>.
### [<b>:and</b>]
###   Set the word and/or characters used where <code>' and ' </code>(the
###   default) is normally used. Setting <code>:and</code> to
###   <code>' '</code>, for example, will cause <code>2556</code> to be
###   returned as "two thousand, five hundred fifty-six" instead of
###   "two thousand, five hundred and fifty-six".
### [<b>:zero</b>]
###   Set the word used to represent the numeral <code>0</code> in the
###   result. <code>'zero'</code> is the default.
### [<b>:decimal</b>]
###   Set the translation of any decimal points in the number; the default
###   is <code>'point'</code>.
### [<b>:as_array</b>]
###   If set to a true value, the number will be returned as an array of
###   word groups instead of a String.
def numwords( hashargs={} )
  # Work on a copy: the string is edited in place below, and #to_s may hand
  # back the caller's own (possibly frozen) String.
  num = self.to_s.dup
  self.log.debug "Turning %p into number words..." % [ num ]
  config = NUMWORD_DEFAULTS.merge( hashargs )
  raise "Bad chunking option: #{config[:group]}" unless
    config[:group].between?( 0, 3 )

  # Array of number parts: first is everything to the left of the first
  # decimal, followed by any groups of decimal-delimited numbers after that
  parts = []

  # Wordify any sign prefix
  sign = (/\A\s*\+/ =~ num) ? 'plus' : (/\A\s*\-/ =~ num) ? 'minus' : ''

  # Strip any ordinal suffixes
  ord = true if num.sub!( /(st|nd|rd|th)\Z/, '' )

  # Split the number into chunks delimited by '.'. A grouped number is split
  # at every '.'; a standard one only at the first.
  chunks =
    if config[:decimal].empty?
      [ num ]
    elsif config[:group].nonzero?
      num.split( /\./ )
    else
      num.split( /\./, 2 )
    end

  # String#split can return an empty Array (e.g., for ""); make sure there
  # is always a whole-number chunk so parts[0] exists below.
  chunks = [ num ] if chunks.empty?

  # Wordify each chunk, pushing arrays into the parts array
  chunks.each_with_index do |chunk,section|
    chunk.gsub!( /\D+/, '' )
    self.log.debug " working on chunk %p (section %d)" % [ chunk, section ]

    # An empty whole-number part (e.g., ".5") contributes no words, so just
    # push an empty array for it. Empty chunks after the decimal fall
    # through to #number_to_words, which renders them as the zero word.
    if chunk.empty?
      self.log.debug " chunk is empty..."
      if section.zero?
        self.log.debug " skipping the empty whole-number part"
        parts.push []
        next
      end
    end

    # The post-decimal parts of a non-grouped number are read out one digit
    # at a time; everything else uses the configured grouping.
    chunk_config =
      if config[:group].zero? && section.nonzero?
        config.merge( :group => 1 )
      else
        config
      end
    parts.push number_to_words( chunk, chunk_config )
    self.log.debug " added %p" % [ parts.last ]
  end

  self.log.debug "Parts => %p" % [ parts ]

  # Turn the last word of the whole-number part back into an ordinal if
  # the original number came in that way. #ordinal works on the receiver and
  # takes no arguments, so apply its word-suffix substitution directly.
  if ord && !parts[0].empty?
    self.log.debug " turning the last whole-number part back into an ordinal, since it " +
      "came in that way"
    parts[0][-1] = parts[0].last.sub( /(#{ORDINAL_SUFFIXES})\Z/ ) { ORDINALS[$1] }
  end

  # If the caller's expecting an Array return, just flatten and return the
  # parts array.
  if config[:as_array]
    self.log.debug " returning the number parts as an Array"
    parts[0].unshift( sign ) unless sign.empty?
    return parts.flatten
  end

  # Catenate each sub-parts array into a whole number part and one or more
  # post-decimal parts. If grouping is turned on, all sub-parts get joined
  # with commas, otherwise just the whole-number part is.
  if config[:group].zero?
    self.log.debug " no custom grouping"
    if parts[0].length > 1
      self.log.debug " whole and decimal part; working on the whole number first"

      # Join all but the last part together with commas
      wholenum = parts[0][0...-1].join( config[:comma] )

      # If the last part is just a single word, append it to the
      # wholenum part with an 'and'. This is to get things like 'three
      # thousand and three' instead of 'three thousand, three'.
      if /^\s*(\S+)\s*$/ =~ parts[0].last
        self.log.debug "last word is a single word; using the 'and' separator: %p" %
          [ config[:and] ]
        wholenum += config[:and] + parts[0].last
      else
        self.log.debug "last word has multiple words; using the comma separator: %p" %
          [ config[:comma] ]
        wholenum += config[:comma] + parts[0].last
      end
    else
      self.log.debug " non-decimal."
      wholenum = parts[0][0]
    end

    decimals = parts[1..-1].collect {|part| part.join(" ")}
    self.log.debug " wholenum: %p; decimals: %p" % [ wholenum, decimals ]

    # Join with the configured decimal; if it's empty, just join with
    # spaces.
    if config[:decimal].empty?
      self.log.debug " joining with the spaces since no decimal is configured"
      words = ([ wholenum ] + decimals).join( " " ).strip
    else
      self.log.debug " joining with the configured decimal: %p" % [ config[:decimal] ]
      words = ([ wholenum ] + decimals).join( " #{config[:decimal]} " ).strip
    end

    # Keep the sign word separated from the number ("minus five", not
    # "minusfive"), without leaving stray whitespace if either is empty.
    return [ sign, words ].reject {|str| str.empty? }.join( ' ' )

  else
    # NOTE(review): the sign word is not included in grouped output.
    # Array#separate is not defined in this file; presumably it comes from
    # the library's monkeypatches -- confirm.
    self.log.debug " grouping with decimal %p and comma %p" %
      config.values_at( :decimal, :comma )
    return parts.compact.
      separate( config[:decimal] ).
      delete_if {|el| el.empty?}.
      join( config[:comma] ).
      strip
  end
end
265
+ Linguistics::EN.register_lprintf_formatter :NUMWORDS, :numwords
266
+
267
+
268
### Transform the receiver into an ordinal. If the receiver responds to
### #to_int, the result is its digits with a suffix appended (e.g., "1st",
### "22nd", "113th"); a negative number gets the same suffix as its absolute
### value ("-1st"). Otherwise the receiver's String form is treated as a
### number word and its ending is replaced ("twenty-one" => "twenty-first").
def ordinal
  if self.respond_to?( :to_int )
    number = self.to_int

    # Pick the suffix from the magnitude: Ruby's % follows the sign of the
    # divisor, so -1 % 100 is 99 and would otherwise yield "-1th".
    magnitude = number.abs
    return "%d%s" % [ number, (NTH[ magnitude % 100 ] || NTH[ magnitude % 10 ]) ]

  else
    number = self.to_s
    self.log.debug "Making an ordinal out of a non-Integer (%p)" % [ number ]

    # ORDINAL_SUFFIXES ends with an empty alternative, so this always
    # matches: irregular endings are swapped, anything else gets 'th'.
    return number.sub( /(#{ORDINAL_SUFFIXES})\Z/ ) { ORDINALS[$1] }
  end
end
281
+ Linguistics::EN.register_lprintf_formatter :ORD, :ordinal
282
+
283
+
284
### Return the receiver as an ordinal number word: the result of #numwords,
### run back through the English inflector's #ordinal.
def ordinate
  number_words = self.numwords
  number_words.en.ordinal
end
288
+
289
+
290
### Return a phrase describing the specified +number+ of objects named by
### the receiver in general terms: "no ...", "a ...", "several ...",
### "a number of ...", "numerous ...", "many ...", and for anything above
### MANY_RANGE a magnitude such as "hundreds of thousands of ...". The
### +number+ is converted with #to_i; a negative count raises a
### Math::DomainError. The following options can be used to control the
### makeup of the returned quantity String:
###
### [<b>:joinword</b>]
###   Sets the word (and any surrounding spaces) used as the word separating the
###   quantity from the noun in the resulting string. Defaults to <tt>' of
###   '</tt>.
def quantify( number=0, args={} )
  phrase = self.to_s
  self.log.debug "Quantifying %d instances of %p" % [ number, phrase ]

  num = number.to_i
  config = QUANTIFY_DEFAULTS.merge( args )

  case num
  when 0
    phrase.en.no
  when 1
    phrase.en.a
  when SEVERAL_RANGE
    "several " + phrase.en.plural( num )
  when NUMBER_RANGE
    "a number of " + phrase.en.plural( num )
  when NUMEROUS_RANGE
    "numerous " + phrase.en.plural( num )
  when MANY_RANGE
    "many " + phrase.en.plural( num )
  else
    # Only counts above MANY_RANGE and negative counts get here. A negative
    # count has no order of magnitude; raise the same error class the
    # logarithm-based implementation did.
    raise Math::DomainError, "can't quantify a negative count (%d)" % [ num ] if num < 0

    # Anything bigger than the MANY_RANGE gets described like
    # "hundreds of thousands of..." or "millions of..."
    # depending, of course, on how many there are. The order of magnitude
    # is taken from the digit count rather than Math.log10, which rounds
    # large Integers through a Float (10**18 - 1 comes out as 18, not 17).
    magnitude = num.to_s.length - 1
    thousands, subthousands = magnitude.divmod( 3 )
    self.log.debug "thousands = %p, subthousands = %p" % [ thousands, subthousands ]

    # "hundreds"/"tens" within the current three-digit group, if any
    stword =
      case subthousands
      when 2
        "hundreds"
      when 1
        "tens"
      else
        nil
      end

    # Pluralized scale word ("thousands", "millions", ...), if any
    thword = nil
    unless thousands.zero?
      thword = to_thousands( thousands ).strip.en.plural
    end

    [ # Hundreds (of)...
      stword,

      # thousands (of)
      thword,

      # stars.
      phrase.en.plural( num )
    ].compact.join( config[:joinword] )
  end
end
351
+ Linguistics::EN.register_lprintf_formatter :QUANT, :quantify
352
+
353
+
354
+ ###############
355
+ module_function
356
+ ###############
357
+
358
### Return the word for the single digit +units+ (empty for zero), followed
### by the scale word for the given number of +thousands+ places.
def to_units( units, thousands=0 )
  scale = to_thousands( thousands )
  UNITS[ units ] + scale
end
363
+
364
+
365
### Return the words for a two-digit number made up of the +tens+ and
### +units+ digits, followed by the scale word for the given number of
### +thousands+ places. Raises an ArgumentError if either digit is nil.
def to_tens( tens, units, thousands=0 )
  raise ArgumentError, "tens: no implicit conversion from nil" unless tens
  raise ArgumentError, "units: no implicit conversion from nil" unless units

  # 10-19 have names of their own
  return TEENS[ units ] + to_thousands( thousands ) if tens == 1

  # Hyphenate only when there's a word on both sides ("twenty-three")
  hyphen = ( tens.nonzero? && units.nonzero? ) ? '-' : ''
  return TENS[ tens ] + hyphen + to_units( units, thousands )
end
378
+
379
+
380
### Return the words for a three-digit number made up of the +hundreds+,
### +tens+, and +units+ digits, followed by the scale word for the given
### number of +thousands+ places. The +joinword+ goes between the hundreds
### and the rest ("one hundred and one"); an empty +joinword+ is treated as
### a single space. Returns +nil+ if all three digits are zero.
def to_hundreds( hundreds, tens=0, units=0, thousands=0, joinword=" and " )
  joinword = ' ' if joinword.empty?

  has_remainder = tens.nonzero? || units.nonzero?
  return nil unless hundreds.nonzero? || has_remainder

  pieces = []
  if hundreds.nonzero?
    pieces << to_units( hundreds ) << " hundred"
    pieces << joinword if has_remainder
  end
  pieces << to_tens( tens, units ) << to_thousands( thousands )

  return pieces.join
end
397
+
398
### Return the scale word(s) ('thousand', 'million', etc.) for a number with
### the given count of +thousands+ places, using the thousands (American)
### system. Counts past the end of the THOUSANDS table wrap around, adding
### one copy of the largest scale word for each full pass.
def to_thousands( thousands=0 )
  cycle = THOUSANDS.length - 1

  words = (0..thousands).step( cycle ).map do |offset|
    offset.zero? ? THOUSANDS[ thousands % cycle ] : THOUSANDS.last
  end

  return words.join(" ")
end
412
+
413
+
414
### Return the given +number+ as an Array of number phrases. Anything whose
### #to_i is zero comes back as just the configured <tt>:zero</tt> word;
### otherwise the <tt>:group</tt> setting in +config+ picks between custom
### digit groups and the standard three-digit style.
def number_to_words( number, config )
  return [ config[:zero] ] if number.to_i.zero?

  groupsize = config[:group]
  return number_to_standard_word_groups( number, config[:and] ) unless groupsize.nonzero?

  return number_to_custom_word_groups( number, groupsize, config[:zero] )
end
424
+
425
+
426
### Split the digits of the given +number+ into groups of up to +groupsize+
### and return the words for each group as an Array. Use +zeroword+ for any
### occurrences of '0'.
def number_to_custom_word_groups( number, groupsize, zeroword="zero" )
  self.log.debug "Making custom word groups of %d digits out of %p" % [ groupsize, number ]

  # One required digit followed by (groupsize - 1) optional ones, each in
  # its own capture group, so a short final group still matches.
  pattern = Regexp.new( "(\\d)" + ("(\\d)?" * (groupsize - 1)) )
  self.log.debug " regex for matching groups of %d digits is %p" % [ groupsize, pattern ]

  # Convert each matched group with the function registered for however
  # many digits it actually contains.
  digit_groups = number.to_s.scan( pattern )
  words = digit_groups.map do |captures|
    self.log.debug " digits = %p" % [ captures ]
    numerals = captures.flatten.compact.map {|digit| digit.to_i }
    self.log.debug " numerals = %p" % [ numerals ]

    converter = NUMBER_TO_WORDS_FUNCTIONS[ numerals.length ]
    self.log.debug " number to word function is #%d: %p" % [ numerals.length, converter ]
    converter.call( zeroword, *numerals ).strip
  end

  return words
end
448
+
449
+
450
### Split the digits of the given +number+ up into groups of three, working
### from the right, and return the Array of words describing each group in
### the standard style ("one million", "two hundred and three thousand",
### ...), most significant first. Leading zeroes are ignored and all-zero
### groups are left out. The +andword+ is passed to #to_hundreds as the
### joining word, and should include any surrounding spaces. The +number+
### itself is left untouched.
def number_to_standard_word_groups( number, andword=" and " )
  # Work on a copy: the digits are consumed in place below, and for a String
  # argument #to_s returns the caller's own object.
  phrase = number.to_s.dup
  phrase.sub!( /\A\s*0+/, '' )
  chunks = []
  mill = 0
  self.log.debug "Making standard word groups out of %p" % [ phrase ]

  # Match backward from the end of the digits in the string, turning
  # chunks of three, of two, and of one into words. Each complete group of
  # three bumps the scale (thousand, million, ...) for the next one, even if
  # the group was all zeroes and produced no words.
  loop do
    consumed = phrase.sub!( /(\d)(\d)(\d)(?=\D*\Z)/ ) do
      words = to_hundreds( $1.to_i, $2.to_i, $3.to_i, mill, andword )
      chunks.unshift( words.strip.squeeze(' ') ) unless words.nil?
      ''
    end
    break unless consumed

    mill += 1
  end

  # At most two digits can be left over at the front of the number
  phrase.sub!( /(\d)(\d)(?=\D*\Z)/ ) do
    chunks.unshift to_tens( $1.to_i, $2.to_i, mill ).strip.squeeze(' ')
    ''
  end

  phrase.sub!( /(\d)(?=\D*\Z)/ ) do
    chunks.unshift to_units( $1.to_i, mill ).strip.squeeze(' ')
    ''
  end

  return chunks
end
480
+
481
+
482
+ end # module Linguistics::EN::Numbers
483
+