raingrams 0.0.9 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. data/History.txt +9 -0
  2. data/Manifest.txt +10 -10
  3. data/README.txt +9 -7
  4. data/Rakefile +3 -6
  5. data/TODO.txt +6 -0
  6. data/lib/raingrams/bigram_model.rb +3 -7
  7. data/lib/raingrams/extensions/object.rb +4 -1
  8. data/lib/raingrams/extensions/string.rb +3 -0
  9. data/lib/raingrams/extensions.rb +0 -5
  10. data/lib/raingrams/hexagram_model.rb +3 -7
  11. data/lib/raingrams/model.rb +622 -61
  12. data/lib/raingrams/ngram.rb +50 -9
  13. data/lib/raingrams/ngram_set.rb +43 -0
  14. data/lib/raingrams/open_vocabulary/model.rb +12 -0
  15. data/lib/raingrams/open_vocabulary/open_model.rb +8 -4
  16. data/lib/raingrams/open_vocabulary.rb +0 -1
  17. data/lib/raingrams/pentagram_model.rb +3 -7
  18. data/lib/raingrams/probability_table.rb +153 -0
  19. data/lib/raingrams/quadgram_model.rb +3 -7
  20. data/lib/raingrams/raingrams.rb +10 -20
  21. data/lib/raingrams/tokens/start_sentence.rb +2 -2
  22. data/lib/raingrams/tokens/stop_sentence.rb +2 -2
  23. data/lib/raingrams/tokens/token.rb +49 -5
  24. data/lib/raingrams/tokens/unknown.rb +2 -2
  25. data/lib/raingrams/tokens.rb +1 -0
  26. data/lib/raingrams/trigram_model.rb +3 -7
  27. data/lib/raingrams/version.rb +1 -1
  28. data/lib/raingrams.rb +1 -1
  29. data/spec/ngram_set_spec.rb +54 -0
  30. data/spec/ngram_spec.rb +29 -0
  31. data/spec/probability_table_spec.rb +94 -0
  32. data/spec/raingrams_spec.rb +9 -0
  33. data/spec/spec_helper.rb +5 -0
  34. data/tasks/spec.rb +7 -0
  35. metadata +65 -55
  36. data/lib/raingrams/extensions/class.rb +0 -7
  37. data/lib/raingrams/extensions/false_class.rb +0 -7
  38. data/lib/raingrams/extensions/nil_class.rb +0 -7
  39. data/lib/raingrams/extensions/symbol.rb +0 -7
  40. data/lib/raingrams/extensions/true_class.rb +0 -7
  41. data/lib/raingrams/multigram_model.rb +0 -165
  42. data/lib/raingrams/open_vocabulary/multigram_model.rb +0 -12
  43. data/lib/raingrams/open_vocabulary/unigram_model.rb +0 -12
  44. data/lib/raingrams/unigram_model.rb +0 -70
  45. data/test/test_raingrams.rb +0 -0
@@ -1,7 +1,9 @@
  require 'raingrams/ngram'
- require 'raingrams/tokens/start_sentence'
- require 'raingrams/tokens/stop_sentence'
- require 'raingrams/exceptions/prefix_frequency_missing'
+ require 'raingrams/ngram_set'
+ require 'raingrams/probability_table'
+ require 'raingrams/tokens'
+
+ require 'set'
 
  module Raingrams
  class Model
@@ -9,11 +11,17 @@ module Raingrams
  # Size of ngrams to use
  attr_reader :ngram_size
 
+ # The sentence starting ngram
+ attr_reader :starting_ngram
+
+ # The sentence stopping ngram
+ attr_reader :stoping_ngram
+
  # Ignore case of parsed text
  attr_reader :ignore_case
 
  # Ignore the punctuation of parsed text
- attr_reader :ignore_punc
+ attr_reader :ignore_punctuation
 
  # Ignore URLs
  attr_reader :ignore_urls
@@ -24,138 +32,691 @@ module Raingrams
  # Ignore References
  attr_reader :ignore_references
 
- # Convert Acronyms to names within parsed text
- attr_reader :convert_acronyms
-
- # Convert Abbreviations to names within parsed text
- attr_reader :convert_abbrev
+ # Probabilities of all (n-1) grams
+ attr_reader :prefixes
+
+ #
+ # Creates a new NgramModel with the specified _options_.
+ #
+ # _options_ must contain the following keys:
+ # <tt>:ngram_size</tt>:: The size of each gram.
+ #
+ # _options_ may contain the following keys:
+ # <tt>:ignore_case</tt>:: Defaults to +false+.
+ # <tt>:ignore_punctuation</tt>:: Defaults to +true+.
+ # <tt>:ignore_urls</tt>:: Defaults to +false+.
+ # <tt>:ignore_phone_numbers</tt>:: Defaults to +false+.
+ #
+ def initialize(options={},&block)
+ @ngram_size = options[:ngram_size]
+ @starting_ngram = Ngram.new(Tokens.start * @ngram_size)
+ @stoping_ngram = Ngram.new(Tokens.stop * @ngram_size)
+
+ @ignore_case = false
+ @ignore_punctuation = true
+ @ignore_urls = true
+ @ignore_phone_numbers = false
+
+ if options.has_key?(:ignore_case)
+ @ignore_case = options[:ignore_case]
+ end
 
- # Frequencies of observed ngrams
- attr_reader :frequency
+ if options.has_key?(:ignore_punctuation)
+ @ignore_punctuation = options[:ignore_punctuation]
+ end
 
- # Normalized table of observed ngrams
- attr_reader :probability
+ if options.has_key?(:ignore_urls)
+ @ignore_urls = options[:ignore_urls]
+ end
 
- def initialize(opts={},&block)
- @ngram_size = opts[:ngram_size]
- @ignore_case = opts[:ignore_case] || false
- @ignore_punc = opts[:ignore_punc] || true
- @ignore_urls = opts[:ignore_urls] || false
- @ignore_phone_numbers = opts[:ignore_phone_numbers] || false
- @convert_acronyms = opts[:convert_acronyms] || false
- @convert_abbrev = opts[:convert_abbrev] || false
+ if options.has_key?(:ignore_phone_numbers)
+ @ignore_phone_numbers = options[:ignore_phone_numbers]
+ end
 
- @frequency = Hash.new { |hash,key| 0 }
- @probability = Hash.new { |hash,key| 0.0 }
+ @prefixes = {}
 
  block.call(self) if block
  end
 
+ #
+ # Creates a new NgramModel object with the given _options_. If a
+ # _block_ is given, it will be passed the newly created model.
+ #
+ def self.build(options={},&block)
+ self.new(options) do |model|
+ model.build(&block)
+ end
+ end
+
+ #
+ # Parses the specified _sentence_ and returns an Array of tokens.
+ #
  def parse_sentence(sentence)
+ # eat tailing punctuation
  sentence = sentence.to_s.gsub(/[\.\?!]$/,'')
 
  if @ignore_urls
- sentence.gsub!(/\s*\w+:\/\/\w*\s*/,' ')
+ # remove URLs
+ sentence.gsub!(/\s*\w+:\/\/[\w\/,\._\-%\?&=]*\s*/,' ')
  end
 
  if @ignore_phone_numbers
+ # remove phone numbers
  sentence.gsub!(/\s*(\d-)?(\d{3}-)?\d{3}-\d{4}\s*/,' ')
  end
 
  if @ignore_references
- sentence.gsub!(/\s*[\d+]\s*/,' ')
+ # remove RFC style references
+ sentence.gsub!(/\s*\[\d+\]\s*/,' ')
  end
 
  if @ignore_case
+ # downcase the sentence
  sentence.downcase!
  end
 
- if @ignore_punc
- return sentence.scan(/\w+[\.'\-\_]?\w*/)
+ if @ignore_punctuation
+ # split and ignore punctuation characters
+ return sentence.scan(/\w+[_\.:']?\w+/)
  else
- return sentence.scan(/(\w+|[-_,\.;'"])/)
+ # split and accept punctuation characters
+ return sentence.scan(/[\w\-_,\.;'"\\\/]+/)
  end
  end
 
- def parse_text(text,&block)
+ #
+ # Parses the specified _text_ and returns an Array of sentences.
+ #
+ def parse_text(text)
  text.to_s.scan(/[^\s\.\?!][^\.\?!]*/)
  end
 
- def train_with_ngram(ngram)
- @frequency[ngram] += 1
- return self
- end
+ #
+ # Returns the ngrams that compose the model.
+ #
+ def ngrams
+ ngram_set = NgramSet.new
 
- def train_with_ngrams(ngrams=[])
- ngrams.each { |ngram| train_with_ngram(ngram) }
- return self
- end
+ @prefixes.each do |prefix,table|
+ table.each_gram do |gram|
+ ngram_set << (prefix + gram)
+ end
+ end
 
- def ngrams
- @frequency.keys
+ return ngram_set
  end
 
+ #
+ # Returns +true+ if the model contains the specified _ngram_, returns
+ # +false+ otherwise.
+ #
  def has_ngram?(ngram)
- ngrams.include?(ngram)
+ @prefixes[ngram.prefix].has_gram?(ngram.last)
  end
 
+ #
+ # Iterates over the ngrams that compose the model, passing each one
+ # to the given _block_.
+ #
  def each_ngram(&block)
- ngrams.each(&block)
+ @prefixes.each do |prefix,table|
+ table.each_gram do |gram|
+ block.call(prefix + gram) if block
+ end
+ end
+
+ return self
  end
 
+ #
+ # Selects the ngrams that match the given _block_.
+ #
  def ngrams_with(&block)
- ngrams.select(&block)
+ selected_ngrams = NgramSet.new
+
+ each_ngram do |ngram|
+ selected_ngrams << ngram if block.call(ngram)
+ end
+
+ return ngrams
  end
 
- def vocabulary
- ngrams.flatten.uniq
+ #
+ # Returns the ngrams prefixed by the specified _prefix_.
+ #
+ def ngrams_prefixed_by(prefix)
+ ngram_set = NgramSet.new
+
+ return ngram_set unless @prefixes.has_key?(prefix)
+
+ ngram_set += @prefixes[prefix].grams.map do |gram|
+ prefix + gram
+ end
+
+ return ngram_set
  end
 
- def within_vocabulary?(gram)
- each_ngrams do |ngram|
- return true if ngram.include?(gram)
+ #
+ # Returns the ngrams postfixed by the specified _postfix_.
+ #
+ def ngrams_postfixed_by(postfix)
+ ngram_set = NgramSet.new
+
+ @prefixes.each do |prefix,table|
+ if prefix[1..-1] == postfix[0..-2]
+ if table.has_gram?(postfix.last)
+ ngram_set << (prefix + postfix.last)
+ end
+ end
  end
 
- return false
+ return ngram_set
  end
 
- def ngrams_starting_with(obj)
- ngrams_with { |ngram| ngram.starts_with?(obj.to_gram) }
+ #
+ # Returns the ngrams starting with the specified _gram_.
+ #
+ def ngrams_starting_with(gram)
+ ngram_set = NgramSet.new
+
+ @prefixes.each do |prefix,table|
+ if prefix.first == gram
+ table.each_gram do |gram|
+ ngram_set << (prefix + gram)
+ end
+ end
+ end
+
+ return ngram_set
  end
 
+ #
+ # Returns the ngrams which end with the specified _gram_.
+ #
  def ngrams_ending_with(gram)
- ngrams_with { |ngram| ngram.ends_with?(gram) }
+ ngram_set = NgramSet.new
+
+ @prefixes.each do |prefix,table|
+ if table.has_gram?(gram)
+ ngram_set << (prefix + gram)
+ end
+ end
+
+ return ngram_set
  end
 
- def probabilities_for(ngrams)
- ngrams.map { |ngram| @probability[ngram] }
+ #
+ # Returns the ngrams including the specified _grams_.
+ #
+ def ngrams_including(*grams)
+ ngram_set = NgramSet.new
+
+ @prefixes.each do |prefix,table|
+ if prefix.includes?(grams)
+ table.each_gram do |gram|
+ ngram_set << (prefix + gram)
+ end
+ else
+ table.each_gram do |gram|
+ if grams.include?(gram)
+ ngram_set << (prefix + gram)
+ end
+ end
+ end
+ end
+
+ return ngram_set
+ end
+
+ #
+ # Returns the ngrams extracted from the specified _words_.
+ #
+ def ngrams_from_words(words)
+ return (0...(words.length-@ngram_size+1)).map do |index|
+ Ngram.new(words[index,@ngram_size])
+ end
+ end
+
+ #
+ # Returns the ngrams extracted from the specified _fragment_ of text.
+ #
+ def ngrams_from_fragment(fragment)
+ ngrams_from_words(parse_sentence(fragment))
+ end
+
+ #
+ # Returns the ngrams extracted from the specified _sentence_.
+ #
+ def ngrams_from_sentence(sentence)
+ ngrams_from_words(wrap_sentence(parse_sentence(sentence)))
+ end
+
+ #
+ # Returns the ngrams extracted from the specified _text_.
+ #
+ def ngrams_from_text(text)
+ parse_text(text).inject([]) do |ngrams,sentence|
+ ngrams + ngrams_from_sentence(sentence)
+ end
+ end
+
+ #
+ # Returns all ngrams which preceed the specified _gram_.
+ #
+ def ngrams_preceeding(gram)
+ ngram_set = NgramSet.new
+
+ ngrams_ending_with(gram).each do |ends_with|
+ ngrams_postfixed_by(ends_with.prefix).each do |ngram|
+ ngram_set << ngram
+ end
+ end
+
+ return ngram_set
+ end
+
+ #
+ # Returns all ngrams which occur directly after the specified _gram_.
+ #
+ def ngrams_following(gram)
+ ngram_set = NgramSet.new
+
+ ngrams_starting_with(gram).each do |starts_with|
+ ngrams_prefixed_by(starts_with.postfix).each do |ngram|
+ ngram_set << ngram
+ end
+ end
+
+ return ngram_set
+ end
+
+ #
+ # Returns all grams within the model.
+ #
+ def grams
+ @prefixes.keys.flatten.uniq
+ end
+
+ #
+ # Returns all grams which preceed the specified _gram_.
+ #
+ def grams_preceeding(gram)
+ gram_set = Set.new
+
+ ngrams_ending_with(gram).each do |ngram|
+ gram_set << ngram[-2]
+ end
+
+ return gram_set
+ end
+
+ #
+ # Returns all grams which occur directly after the specified _gram_.
+ #
+ def grams_following(gram)
+ gram_set = Set.new
+
+ ngram_starting_with(gram).each do |ngram|
+ gram_set << ngram[1]
+ end
+
+ return gram_set
+ end
+
+ #
+ # Returns the ngrams which occur within the specified _words_ and
+ # within the model.
+ #
+ def common_ngrams_from_words(words)
+ ngrams_from_words(words).select { |ngram| has_ngram?(ngram) }
+ end
+
+ #
+ # Returns the ngrams which occur within the specified _fragment_ and
+ # within the model.
+ #
+ def common_ngrams_from_fragment(fragment)
+ ngrams_from_fragment(words).select { |ngram| has_ngram?(ngram) }
+ end
+
+ #
+ # Returns the ngrams which occur within the specified _sentence_ and
+ # within the model.
+ #
+ def common_ngrams_from_sentence(sentence)
+ ngrams_from_sentence(sentence).select { |ngram| has_ngram?(ngram) }
+ end
+
+ #
+ # Returns the ngrams which occur within the specified _text_ and
+ # within the model.
+ #
+ def common_ngrams_from_text(text)
+ ngrams_from_text(text).select { |ngram| has_ngram?(ngram) }
  end
 
+ #
+ # Sets the frequency of the specified _ngram_ to the specified _value_.
+ #
+ def set_ngram_frequency(ngram,value)
+ probability_table(ngram).set_count(ngram.last,value)
+ end
+
+ #
+ # Train the model with the specified _ngram_.
+ #
+ def train_with_ngram(ngram)
+ probability_table(ngram).count(ngram.last)
+ end
+
+ #
+ # Train the model with the specified _ngrams_.
+ #
+ def train_with_ngrams(ngrams)
+ ngrams.each { |ngram| train_with_ngram(ngram) }
+ end
+
+ #
+ # Train the model with the specified _sentence_.
+ #
+ def train_with_sentence(sentence)
+ train_with_ngrams(ngrams_from_sentence(sentence))
+ end
+
+ #
+ # Train the model with the specified _text_.
+ #
+ def train_with_text(text)
+ train_with_ngrams(ngrams_from_text(text))
+ end
+
+ #
+ # Returns the probability of the specified _ngram_ occurring within
+ # arbitrary text.
+ #
  def probability_of_ngram(ngram)
- @probability[ngram]
+ prefix = ngram.prefix
+
+ if @prefixes.has_key?(prefix)
+ return @prefixes[prefix].probability_of(ngram.last)
+ else
+ return 0.0
+ end
  end
 
+ #
+ # Returns the probability of the specified _ngrams_ occurring within
+ # arbitrary text.
+ #
+ def probabilities_for(ngrams)
+ table = {}
+
+ ngrams.each do |ngram|
+ table[ngram] = probability_of_ngram(ngram)
+ end
+
+ return table
+ end
+
+ #
+ # Returns the joint probability of the specified _ngrams_ occurring
+ # within arbitrary text.
+ #
  def probability_of_ngrams(ngrams)
- probabilities_for(ngrams).inject { |joint,prob| joint * prob }
+ probabilities_for(ngrams).values.inject do |joint,prob|
+ joint * prob
+ end
  end
 
+ #
+ # Returns the probably of the specified _gram_ occurring within
+ # arbitrary text.
+ #
  def probability_of_gram(gram)
  probability_of_ngrams(ngrams_starting_with(gram))
  end
 
- def clear
- @frequency.clear
+ #
+ # Returns the probability of the specified _fragment_ occuring within
+ # arbitrary text.
+ #
+ def fragment_probability(fragment)
+ probability_of_ngrams(ngrams_from_fragment(fragment))
+ end
+
+ #
+ # Returns the probability of the specified _sentence_ occuring within
+ # arbitrary text.
+ #
+ def sentence_probability(sentence)
+ probability_of_ngrams(ngrams_from_sentence(sentence))
+ end
+
+ #
+ # Returns the probability of the specified _text_ occuring within
+ # arbitrary text.
+ #
+ def text_probability(text)
+ probability_of_ngrams(ngrams_from_text(text))
+ end
+
+ #
+ # Returns the joint probability of the common ngrams between the
+ # specified _fragment_ and the model.
+ #
+ def fragment_commonality(fragment)
+ probability_of_ngrams(common_ngrams_from_fragment(fragment))
+ end
 
- clear_probabilities
+ #
+ # Returns the joint probability of the common ngrams between the
+ # specified _sentence_ and the model.
+ #
+ def sentence_commonality(sentence)
+ probability_of_ngrams(common_ngrams_from_sentence(sentence))
+ end
+
+ #
+ # Returns the joint probability of the common ngrams between the
+ # specified _sentence_ and the model.
+ #
+ def text_commonality(text)
+ probability_of_ngrams(common_ngrams_from_text(text))
+ end
+
+ #
+ # Returns the conditional probability of the commonality of the
+ # specified _fragment_ against the _other_model_, given the commonality
+ # of the _fragment_ against the model.
+ #
+ def fragment_similarity(fragment,other_model)
+ other_model.fragment_commonality(fragment) / fragment_commonality(fragment)
+ end
+
+ #
+ # Returns the conditional probability of the commonality of the
+ # specified _sentence_ against the _other_model_, given the commonality
+ # of the _sentence_ against the model.
+ #
+ def sentence_similarity(sentence,other_model)
+ other_model.sentence_commonality(sentence) / sentence_commonality(sentence)
+ end
+
+ #
+ # Returns the conditional probability of the commonality of the
+ # specified _text_ against the _other_model_, given the commonality
+ # of the _text_ against the model.
+ #
+ def text_similarity(text,other_model)
+ other_model.text_commonality(text) / text_commonality(text)
+ end
+
+ #
+ # Returns a random gram from the model.
+ #
+ def random_gram
+ prefix = @prefixes.keys[rand(@prefixes.length)]
+
+ return prefix[rand(prefix.length)]
+ end
+
+ #
+ # Returns a random ngram from the model.
+ #
+ def random_ngram
+ prefix_index = rand(@prefixes.length)
+
+ prefix = @prefixes.keys[prefix_index]
+ table = @prefixes.values[prefix_index]
+
+ gram_index = rand(table.grams.length)
+
+ return (prefix + table.grams[gram_index])
+ end
+
+ #
+ # Returns a randomly generated sentence of grams using the given
+ # _options_.
+ #
+ def random_gram_sentence(options={})
+ grams = []
+ last_ngram = @starting_ngram
+
+ # prime the grams
+ grams += @starting_ngram
+
+ loop do
+ next_ngrams = ngrams_prefixed_by(last_ngram.postfix).to_a
+ last_ngram = next_ngrams[rand(next_ngrams.length)]
+
+ if last_ngram.nil?
+ return []
+ else
+ grams << last_ngram.last
+ break if last_ngram == @stoping_ngram
+ end
+ end
+
+ return grams
+ end
+
+ #
+ # Returns a randomly generated sentence of text using the given
+ # _options_.
+ #
+ def random_sentence(options={})
+ grams = random_gram_sentence(options)
+ sentence = grams.delete_if { |gram|
+ gram == Tokens.start || gram == Tokens.stop
+ }.join(' ')
+
+ sentence << '.' if @ignore_punctuation
+ return sentence
+ end
+
+ #
+ # Returns a randomly generated paragraph of text using the given
+ # _options_.
+ #
+ # _options_ may contain the following keys:
+ # <tt>:min_sentences</tt>:: Minimum number of sentences in the
+ # paragraph. Defaults to 3.
+ # <tt>:max_sentences</tt>:: Maximum number of sentences in the
+ # paragraph. Defaults to 6.
+ #
+ def random_paragraph(options={})
+ min_sentences = (options[:min_sentences] || 3)
+ max_sentences = (options[:max_sentences] || 6)
+ sentences = []
+
+ (rand(max_sentences - min_sentences) + min_sentences).times do
+ sentences << random_sentence(options)
+ end
+
+ return sentences.join(' ')
+ end
+
+ #
+ # Returns randomly generated text using the given _options_.
+ #
+ # _options_ may contain the following keys:
+ # <tt>:min_sentences</tt>:: Minimum number of sentences in the
+ # paragraph. Defaults to 3.
+ # <tt>:max_sentences</tt>:: Maximum number of sentences in the
+ # paragraph. Defaults to 6.
+ # <tt>:min_paragraphs</tt>:: Minimum number of paragraphs in the text.
+ # Defaults to 3.
+ # <tt>:max_paragraphs</tt>:: Maximum number of paragraphs in the text.
+ # Defaults to 5.
+ #
+ def random_text(options={})
+ min_paragraphs = (options[:min_paragraphs] || 3)
+ max_paragraphs = (options[:max_paragraphs] || 6)
+ paragraphs = []
+
+ (rand(max_paragraphs - min_paragraphs) + min_paragraphs).times do
+ paragraphs << random_paragraph(options)
+ end
+
+ return paragraphs.join("\n\n")
+ end
+
+ #
+ # Refreshes the probability tables of the model.
+ #
+ def refresh(&block)
+ block.call(self) if block
+
+ @prefixes.each_value { |table| table.build }
  return self
  end
 
- protected
+ #
+ # Clears and rebuilds the model.
+ #
+ def build(&block)
+ refresh do
+ clear
+
+ block.call(self) if block
+ end
+ end
 
- def clear_probabilities
- @probability.clear
+ #
+ # Clears the model of any training data.
+ #
+ def clear
+ @prefixes.clear
  return self
  end
 
+ protected
+
+ #
+ # Defines the default ngram _size_ for the model.
+ #
+ def self.ngram_size(size)
+ class_eval %{
+ def initialize(options={},&block)
+ super(options.merge(:ngram_size => #{size.to_i}),&block)
+ end
+ }
+ end
+
+ #
+ # Wraps the specified _setence_ with StartSentence and StopSentence
+ # tokens.
+ #
+ def wrap_sentence(sentence)
+ @starting_ngram + sentence.to_a + @stoping_ngram
+ end
+
+ #
+ # Returns the probability table for the specified _ngram_.
+ #
+ def probability_table(ngram)
+ @prefixes[ngram.prefix] ||= ProbabilityTable.new
+ end
+
  end
  end
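
For orientation, the reworked Model API above can be exercised roughly as follows. This is an illustrative sketch based only on the methods visible in this diff (Model.build, train_with_text, sentence_probability, text_commonality, random_sentence, random_paragraph); it assumes Raingrams::BigramModel applies the ngram_size class macro shown above to fix :ngram_size at 2, and it is not taken from the gem's own documentation.

require 'raingrams'

# Assumed: BigramModel subclasses Model and sets :ngram_size => 2
# via the ngram_size macro added in this diff.
model = Raingrams::BigramModel.build(:ignore_case => true) do |m|
  # train_with_text splits the text into sentences, wraps each in
  # start/stop tokens and counts the resulting ngrams per prefix.
  m.train_with_text "The dog barked. The dog ran. The cat slept."
end

# Joint probability of the ngrams extracted from a sentence.
puts model.sentence_probability("the dog barked")

# Joint probability of only the ngrams the text shares with the model.
puts model.text_commonality("the dog slept")

# Generation walks from the starting ngram through matching prefixes.
puts model.random_sentence
puts model.random_paragraph(:min_sentences => 2, :max_sentences => 4)

Under the hood, 0.1.0 stores counts in per-prefix ProbabilityTable objects (@prefixes) instead of the flat @frequency and @probability hashes used in 0.0.9, and the tables are rebuilt by refresh once the build block finishes.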