raingrams 0.0.9 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +9 -0
- data/Manifest.txt +10 -10
- data/README.txt +9 -7
- data/Rakefile +3 -6
- data/TODO.txt +6 -0
- data/lib/raingrams/bigram_model.rb +3 -7
- data/lib/raingrams/extensions/object.rb +4 -1
- data/lib/raingrams/extensions/string.rb +3 -0
- data/lib/raingrams/extensions.rb +0 -5
- data/lib/raingrams/hexagram_model.rb +3 -7
- data/lib/raingrams/model.rb +622 -61
- data/lib/raingrams/ngram.rb +50 -9
- data/lib/raingrams/ngram_set.rb +43 -0
- data/lib/raingrams/open_vocabulary/model.rb +12 -0
- data/lib/raingrams/open_vocabulary/open_model.rb +8 -4
- data/lib/raingrams/open_vocabulary.rb +0 -1
- data/lib/raingrams/pentagram_model.rb +3 -7
- data/lib/raingrams/probability_table.rb +153 -0
- data/lib/raingrams/quadgram_model.rb +3 -7
- data/lib/raingrams/raingrams.rb +10 -20
- data/lib/raingrams/tokens/start_sentence.rb +2 -2
- data/lib/raingrams/tokens/stop_sentence.rb +2 -2
- data/lib/raingrams/tokens/token.rb +49 -5
- data/lib/raingrams/tokens/unknown.rb +2 -2
- data/lib/raingrams/tokens.rb +1 -0
- data/lib/raingrams/trigram_model.rb +3 -7
- data/lib/raingrams/version.rb +1 -1
- data/lib/raingrams.rb +1 -1
- data/spec/ngram_set_spec.rb +54 -0
- data/spec/ngram_spec.rb +29 -0
- data/spec/probability_table_spec.rb +94 -0
- data/spec/raingrams_spec.rb +9 -0
- data/spec/spec_helper.rb +5 -0
- data/tasks/spec.rb +7 -0
- metadata +65 -55
- data/lib/raingrams/extensions/class.rb +0 -7
- data/lib/raingrams/extensions/false_class.rb +0 -7
- data/lib/raingrams/extensions/nil_class.rb +0 -7
- data/lib/raingrams/extensions/symbol.rb +0 -7
- data/lib/raingrams/extensions/true_class.rb +0 -7
- data/lib/raingrams/multigram_model.rb +0 -165
- data/lib/raingrams/open_vocabulary/multigram_model.rb +0 -12
- data/lib/raingrams/open_vocabulary/unigram_model.rb +0 -12
- data/lib/raingrams/unigram_model.rb +0 -70
- data/test/test_raingrams.rb +0 -0
data/lib/raingrams/model.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
require 'raingrams/ngram'
|
2
|
-
require 'raingrams/
|
3
|
-
require 'raingrams/
|
4
|
-
require 'raingrams/
|
2
|
+
require 'raingrams/ngram_set'
|
3
|
+
require 'raingrams/probability_table'
|
4
|
+
require 'raingrams/tokens'
|
5
|
+
|
6
|
+
require 'set'
|
5
7
|
|
6
8
|
module Raingrams
|
7
9
|
class Model
|
@@ -9,11 +11,17 @@ module Raingrams
|
|
9
11
|
# Size of ngrams to use
|
10
12
|
attr_reader :ngram_size
|
11
13
|
|
14
|
+
# The sentence starting ngram
|
15
|
+
attr_reader :starting_ngram
|
16
|
+
|
17
|
+
# The sentence stopping ngram
|
18
|
+
attr_reader :stoping_ngram
|
19
|
+
|
12
20
|
# Ignore case of parsed text
|
13
21
|
attr_reader :ignore_case
|
14
22
|
|
15
23
|
# Ignore the punctuation of parsed text
|
16
|
-
attr_reader :
|
24
|
+
attr_reader :ignore_punctuation
|
17
25
|
|
18
26
|
# Ignore URLs
|
19
27
|
attr_reader :ignore_urls
|
@@ -24,138 +32,691 @@ module Raingrams
|
|
24
32
|
# Ignore References
|
25
33
|
attr_reader :ignore_references
|
26
34
|
|
27
|
-
#
|
28
|
-
attr_reader :
|
29
|
-
|
30
|
-
#
|
31
|
-
|
35
|
+
# Probabilities of all (n-1) grams
|
36
|
+
attr_reader :prefixes
|
37
|
+
|
38
|
+
#
|
39
|
+
# Creates a new Model with the specified _options_.
|
40
|
+
#
|
41
|
+
# _options_ must contain the following keys:
|
42
|
+
# <tt>:ngram_size</tt>:: The size of each gram.
|
43
|
+
#
|
44
|
+
# _options_ may contain the following keys:
|
45
|
+
# <tt>:ignore_case</tt>:: Defaults to +false+.
|
46
|
+
# <tt>:ignore_punctuation</tt>:: Defaults to +true+.
|
47
|
+
# <tt>:ignore_urls</tt>:: Defaults to +true+.
|
48
|
+
# <tt>:ignore_phone_numbers</tt>:: Defaults to +false+.
|
49
|
+
#
|
50
|
+
def initialize(options={},&block)
  # Size of every ngram handled by this model; must be supplied by the
  # caller (or by a subclass via the ngram_size class macro).
  @ngram_size = options[:ngram_size]

  # Sentence boundary ngrams, built from @ngram_size start / stop tokens.
  # NOTE(review): relies on Tokens.start / Tokens.stop supporting `*` with
  # an Integer -- confirm against raingrams/tokens.
  @starting_ngram = Ngram.new(Tokens.start * @ngram_size)
  @stoping_ngram = Ngram.new(Tokens.stop * @ngram_size)

  # Parsing switches. fetch only falls back to the default when the key is
  # absent, so an explicit +false+ (or +nil+) from the caller is honoured.
  @ignore_case = options.fetch(:ignore_case,false)
  @ignore_punctuation = options.fetch(:ignore_punctuation,true)
  @ignore_urls = options.fetch(:ignore_urls,true)
  @ignore_phone_numbers = options.fetch(:ignore_phone_numbers,false)

  # parse_sentence reads @ignore_references and the class exposes a reader
  # for it, but it was never assigned; accept it as an option so that the
  # feature can actually be switched on. Defaults to +false+.
  @ignore_references = options.fetch(:ignore_references,false)

  # Maps each (n-1)-gram prefix to its probability table.
  @prefixes = {}

  block.call(self) if block
end
|
53
80
|
|
81
|
+
#
|
82
|
+
# Creates a new Model object with the given _options_. If a
|
83
|
+
# _block_ is given, it will be passed the newly created model.
|
84
|
+
#
|
85
|
+
def self.build(options={},&block)
|
86
|
+
self.new(options) do |model|
|
87
|
+
model.build(&block)
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
#
|
92
|
+
# Parses the specified _sentence_ and returns an Array of tokens.
|
93
|
+
#
|
54
94
|
def parse_sentence(sentence)
  # Work on a fresh String (gsub, not gsub!) so the caller's object is never
  # mutated, and drop one trailing sentence terminator.
  sentence = sentence.to_s.gsub(/[\.\?!]$/,'')

  if @ignore_urls
    # remove URLs
    sentence.gsub!(/\s*\w+:\/\/[\w\/,\._\-%\?&=]*\s*/,' ')
  end

  if @ignore_phone_numbers
    # remove phone numbers
    sentence.gsub!(/\s*(\d-)?(\d{3}-)?\d{3}-\d{4}\s*/,' ')
  end

  if @ignore_references
    # remove RFC style references, e.g. "[12]"
    sentence.gsub!(/\s*\[\d+\]\s*/,' ')
  end

  if @ignore_case
    # downcase the sentence
    sentence.downcase!
  end

  if @ignore_punctuation
    # Split into words, keeping a single inner separator (don't, foo.bar).
    # The second alternative keeps one-letter words such as "I" and "a",
    # which the previous pattern (\w+[_\.:']?\w+) required two characters
    # for and therefore silently dropped.
    return sentence.scan(/\w+[_\.:']\w+|\w+/)
  else
    # split on whitespace-ish boundaries and keep punctuation characters
    return sentence.scan(/[\w\-_,\.;'"\\\/]+/)
  end
end
|
79
126
|
|
80
|
-
|
127
|
+
#
|
128
|
+
# Parses the specified _text_ and returns an Array of sentences.
|
129
|
+
#
|
130
|
+
def parse_text(text)
|
81
131
|
text.to_s.scan(/[^\s\.\?!][^\.\?!]*/)
|
82
132
|
end
|
83
133
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
134
|
+
#
|
135
|
+
# Returns the ngrams that compose the model.
|
136
|
+
#
|
137
|
+
def ngrams
|
138
|
+
ngram_set = NgramSet.new
|
88
139
|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
140
|
+
@prefixes.each do |prefix,table|
|
141
|
+
table.each_gram do |gram|
|
142
|
+
ngram_set << (prefix + gram)
|
143
|
+
end
|
144
|
+
end
|
93
145
|
|
94
|
-
|
95
|
-
@frequency.keys
|
146
|
+
return ngram_set
|
96
147
|
end
|
97
148
|
|
149
|
+
#
|
150
|
+
# Returns +true+ if the model contains the specified _ngram_, returns
|
151
|
+
# +false+ otherwise.
|
152
|
+
#
|
98
153
|
def has_ngram?(ngram)
  # A prefix that was never trained has no probability table; the ngram is
  # then simply absent (previously this raised NoMethodError on nil).
  table = @prefixes[ngram.prefix]
  return false if table.nil?

  table.has_gram?(ngram.last)
end
|
101
156
|
|
157
|
+
#
|
158
|
+
# Iterates over the ngrams that compose the model, passing each one
|
159
|
+
# to the given _block_.
|
160
|
+
#
|
102
161
|
def each_ngram(&block)
|
103
|
-
|
162
|
+
@prefixes.each do |prefix,table|
|
163
|
+
table.each_gram do |gram|
|
164
|
+
block.call(prefix + gram) if block
|
165
|
+
end
|
166
|
+
end
|
167
|
+
|
168
|
+
return self
|
104
169
|
end
|
105
170
|
|
171
|
+
#
|
172
|
+
# Selects the ngrams that match the given _block_.
|
173
|
+
#
|
106
174
|
def ngrams_with(&block)
  selected_ngrams = NgramSet.new

  # keep only the ngrams for which the block returns a true value
  each_ngram do |ngram|
    selected_ngrams << ngram if block.call(ngram)
  end

  # Return the filtered set. The previous `return ngrams` invoked the
  # #ngrams method and handed back every ngram in the model.
  return selected_ngrams
end
|
109
183
|
|
110
|
-
|
111
|
-
|
184
|
+
#
|
185
|
+
# Returns the ngrams prefixed by the specified _prefix_.
|
186
|
+
#
|
187
|
+
def ngrams_prefixed_by(prefix)
|
188
|
+
ngram_set = NgramSet.new
|
189
|
+
|
190
|
+
return ngram_set unless @prefixes.has_key?(prefix)
|
191
|
+
|
192
|
+
ngram_set += @prefixes[prefix].grams.map do |gram|
|
193
|
+
prefix + gram
|
194
|
+
end
|
195
|
+
|
196
|
+
return ngram_set
|
112
197
|
end
|
113
198
|
|
114
|
-
|
115
|
-
|
116
|
-
|
199
|
+
#
|
200
|
+
# Returns the ngrams postfixed by the specified _postfix_.
|
201
|
+
#
|
202
|
+
def ngrams_postfixed_by(postfix)
|
203
|
+
ngram_set = NgramSet.new
|
204
|
+
|
205
|
+
@prefixes.each do |prefix,table|
|
206
|
+
if prefix[1..-1] == postfix[0..-2]
|
207
|
+
if table.has_gram?(postfix.last)
|
208
|
+
ngram_set << (prefix + postfix.last)
|
209
|
+
end
|
210
|
+
end
|
117
211
|
end
|
118
212
|
|
119
|
-
return
|
213
|
+
return ngram_set
|
120
214
|
end
|
121
215
|
|
122
|
-
|
123
|
-
|
216
|
+
#
|
217
|
+
# Returns the ngrams starting with the specified _gram_.
|
218
|
+
#
|
219
|
+
def ngrams_starting_with(gram)
  ngram_set = NgramSet.new

  @prefixes.each do |prefix,table|
    # only prefixes whose first gram is the requested one can start a match
    next unless prefix.first == gram

    # The block parameter has its own name: reusing `gram` here shadows the
    # method argument, and under Ruby 1.8 block-parameter semantics it
    # overwrites it, corrupting the comparison for later prefixes.
    table.each_gram do |next_gram|
      ngram_set << (prefix + next_gram)
    end
  end

  return ngram_set
end
|
125
232
|
|
233
|
+
#
|
234
|
+
# Returns the ngrams which end with the specified _gram_.
|
235
|
+
#
|
126
236
|
def ngrams_ending_with(gram)
|
127
|
-
|
237
|
+
ngram_set = NgramSet.new
|
238
|
+
|
239
|
+
@prefixes.each do |prefix,table|
|
240
|
+
if table.has_gram?(gram)
|
241
|
+
ngram_set << (prefix + gram)
|
242
|
+
end
|
243
|
+
end
|
244
|
+
|
245
|
+
return ngram_set
|
128
246
|
end
|
129
247
|
|
130
|
-
|
131
|
-
|
248
|
+
#
|
249
|
+
# Returns the ngrams including the specified _grams_.
|
250
|
+
#
|
251
|
+
def ngrams_including(*grams)
|
252
|
+
ngram_set = NgramSet.new
|
253
|
+
|
254
|
+
@prefixes.each do |prefix,table|
|
255
|
+
if prefix.includes?(grams)
|
256
|
+
table.each_gram do |gram|
|
257
|
+
ngram_set << (prefix + gram)
|
258
|
+
end
|
259
|
+
else
|
260
|
+
table.each_gram do |gram|
|
261
|
+
if grams.include?(gram)
|
262
|
+
ngram_set << (prefix + gram)
|
263
|
+
end
|
264
|
+
end
|
265
|
+
end
|
266
|
+
end
|
267
|
+
|
268
|
+
return ngram_set
|
269
|
+
end
|
270
|
+
|
271
|
+
#
|
272
|
+
# Returns the ngrams extracted from the specified _words_.
|
273
|
+
#
|
274
|
+
def ngrams_from_words(words)
|
275
|
+
return (0...(words.length-@ngram_size+1)).map do |index|
|
276
|
+
Ngram.new(words[index,@ngram_size])
|
277
|
+
end
|
278
|
+
end
|
279
|
+
|
280
|
+
#
|
281
|
+
# Returns the ngrams extracted from the specified _fragment_ of text.
|
282
|
+
#
|
283
|
+
def ngrams_from_fragment(fragment)
|
284
|
+
ngrams_from_words(parse_sentence(fragment))
|
285
|
+
end
|
286
|
+
|
287
|
+
#
|
288
|
+
# Returns the ngrams extracted from the specified _sentence_.
|
289
|
+
#
|
290
|
+
def ngrams_from_sentence(sentence)
|
291
|
+
ngrams_from_words(wrap_sentence(parse_sentence(sentence)))
|
292
|
+
end
|
293
|
+
|
294
|
+
#
|
295
|
+
# Returns the ngrams extracted from the specified _text_.
|
296
|
+
#
|
297
|
+
def ngrams_from_text(text)
|
298
|
+
parse_text(text).inject([]) do |ngrams,sentence|
|
299
|
+
ngrams + ngrams_from_sentence(sentence)
|
300
|
+
end
|
301
|
+
end
|
302
|
+
|
303
|
+
#
|
304
|
+
# Returns all ngrams which precede the specified _gram_.
|
305
|
+
#
|
306
|
+
def ngrams_preceeding(gram)
|
307
|
+
ngram_set = NgramSet.new
|
308
|
+
|
309
|
+
ngrams_ending_with(gram).each do |ends_with|
|
310
|
+
ngrams_postfixed_by(ends_with.prefix).each do |ngram|
|
311
|
+
ngram_set << ngram
|
312
|
+
end
|
313
|
+
end
|
314
|
+
|
315
|
+
return ngram_set
|
316
|
+
end
|
317
|
+
|
318
|
+
#
|
319
|
+
# Returns all ngrams which occur directly after the specified _gram_.
|
320
|
+
#
|
321
|
+
def ngrams_following(gram)
|
322
|
+
ngram_set = NgramSet.new
|
323
|
+
|
324
|
+
ngrams_starting_with(gram).each do |starts_with|
|
325
|
+
ngrams_prefixed_by(starts_with.postfix).each do |ngram|
|
326
|
+
ngram_set << ngram
|
327
|
+
end
|
328
|
+
end
|
329
|
+
|
330
|
+
return ngram_set
|
331
|
+
end
|
332
|
+
|
333
|
+
#
|
334
|
+
# Returns all grams within the model.
|
335
|
+
#
|
336
|
+
def grams
|
337
|
+
@prefixes.keys.flatten.uniq
|
338
|
+
end
|
339
|
+
|
340
|
+
#
|
341
|
+
# Returns all grams which precede the specified _gram_.
|
342
|
+
#
|
343
|
+
def grams_preceeding(gram)
|
344
|
+
gram_set = Set.new
|
345
|
+
|
346
|
+
ngrams_ending_with(gram).each do |ngram|
|
347
|
+
gram_set << ngram[-2]
|
348
|
+
end
|
349
|
+
|
350
|
+
return gram_set
|
351
|
+
end
|
352
|
+
|
353
|
+
#
|
354
|
+
# Returns all grams which occur directly after the specified _gram_.
|
355
|
+
#
|
356
|
+
def grams_following(gram)
  gram_set = Set.new

  # The second element of every ngram that starts with _gram_ is a gram
  # that directly follows it. (The original called the non-existent
  # `ngram_starting_with`, which raised NoMethodError on every call.)
  ngrams_starting_with(gram).each do |ngram|
    gram_set << ngram[1]
  end

  return gram_set
end
|
365
|
+
|
366
|
+
#
|
367
|
+
# Returns the ngrams which occur within the specified _words_ and
|
368
|
+
# within the model.
|
369
|
+
#
|
370
|
+
def common_ngrams_from_words(words)
|
371
|
+
ngrams_from_words(words).select { |ngram| has_ngram?(ngram) }
|
372
|
+
end
|
373
|
+
|
374
|
+
#
|
375
|
+
# Returns the ngrams which occur within the specified _fragment_ and
|
376
|
+
# within the model.
|
377
|
+
#
|
378
|
+
def common_ngrams_from_fragment(fragment)
  # Parse the fragment itself; the original passed the undefined local
  # `words`, which raised NameError on every call.
  ngrams_from_fragment(fragment).select { |ngram| has_ngram?(ngram) }
end
|
381
|
+
|
382
|
+
#
|
383
|
+
# Returns the ngrams which occur within the specified _sentence_ and
|
384
|
+
# within the model.
|
385
|
+
#
|
386
|
+
def common_ngrams_from_sentence(sentence)
|
387
|
+
ngrams_from_sentence(sentence).select { |ngram| has_ngram?(ngram) }
|
388
|
+
end
|
389
|
+
|
390
|
+
#
|
391
|
+
# Returns the ngrams which occur within the specified _text_ and
|
392
|
+
# within the model.
|
393
|
+
#
|
394
|
+
def common_ngrams_from_text(text)
|
395
|
+
ngrams_from_text(text).select { |ngram| has_ngram?(ngram) }
|
132
396
|
end
|
133
397
|
|
398
|
+
#
|
399
|
+
# Sets the frequency of the specified _ngram_ to the specified _value_.
|
400
|
+
#
|
401
|
+
def set_ngram_frequency(ngram,value)
|
402
|
+
probability_table(ngram).set_count(ngram.last,value)
|
403
|
+
end
|
404
|
+
|
405
|
+
#
|
406
|
+
# Train the model with the specified _ngram_.
|
407
|
+
#
|
408
|
+
def train_with_ngram(ngram)
|
409
|
+
probability_table(ngram).count(ngram.last)
|
410
|
+
end
|
411
|
+
|
412
|
+
#
|
413
|
+
# Train the model with the specified _ngrams_.
|
414
|
+
#
|
415
|
+
def train_with_ngrams(ngrams)
|
416
|
+
ngrams.each { |ngram| train_with_ngram(ngram) }
|
417
|
+
end
|
418
|
+
|
419
|
+
#
|
420
|
+
# Train the model with the specified _sentence_.
|
421
|
+
#
|
422
|
+
def train_with_sentence(sentence)
|
423
|
+
train_with_ngrams(ngrams_from_sentence(sentence))
|
424
|
+
end
|
425
|
+
|
426
|
+
#
|
427
|
+
# Train the model with the specified _text_.
|
428
|
+
#
|
429
|
+
def train_with_text(text)
|
430
|
+
train_with_ngrams(ngrams_from_text(text))
|
431
|
+
end
|
432
|
+
|
433
|
+
#
|
434
|
+
# Returns the probability of the specified _ngram_ occurring within
|
435
|
+
# arbitrary text.
|
436
|
+
#
|
134
437
|
def probability_of_ngram(ngram)
|
135
|
-
|
438
|
+
prefix = ngram.prefix
|
439
|
+
|
440
|
+
if @prefixes.has_key?(prefix)
|
441
|
+
return @prefixes[prefix].probability_of(ngram.last)
|
442
|
+
else
|
443
|
+
return 0.0
|
444
|
+
end
|
136
445
|
end
|
137
446
|
|
447
|
+
#
|
448
|
+
# Returns the probability of the specified _ngrams_ occurring within
|
449
|
+
# arbitrary text.
|
450
|
+
#
|
451
|
+
def probabilities_for(ngrams)
|
452
|
+
table = {}
|
453
|
+
|
454
|
+
ngrams.each do |ngram|
|
455
|
+
table[ngram] = probability_of_ngram(ngram)
|
456
|
+
end
|
457
|
+
|
458
|
+
return table
|
459
|
+
end
|
460
|
+
|
461
|
+
#
|
462
|
+
# Returns the joint probability of the specified _ngrams_ occurring
|
463
|
+
# within arbitrary text.
|
464
|
+
#
|
138
465
|
def probability_of_ngrams(ngrams)
|
139
|
-
probabilities_for(ngrams).inject
|
466
|
+
probabilities_for(ngrams).values.inject do |joint,prob|
|
467
|
+
joint * prob
|
468
|
+
end
|
140
469
|
end
|
141
470
|
|
471
|
+
#
|
472
|
+
# Returns the probability of the specified _gram_ occurring within
|
473
|
+
# arbitrary text.
|
474
|
+
#
|
142
475
|
def probability_of_gram(gram)
|
143
476
|
probability_of_ngrams(ngrams_starting_with(gram))
|
144
477
|
end
|
145
478
|
|
146
|
-
|
147
|
-
|
479
|
+
#
|
480
|
+
# Returns the probability of the specified _fragment_ occurring within
|
481
|
+
# arbitrary text.
|
482
|
+
#
|
483
|
+
def fragment_probability(fragment)
|
484
|
+
probability_of_ngrams(ngrams_from_fragment(fragment))
|
485
|
+
end
|
486
|
+
|
487
|
+
#
|
488
|
+
# Returns the probability of the specified _sentence_ occurring within
|
489
|
+
# arbitrary text.
|
490
|
+
#
|
491
|
+
def sentence_probability(sentence)
|
492
|
+
probability_of_ngrams(ngrams_from_sentence(sentence))
|
493
|
+
end
|
494
|
+
|
495
|
+
#
|
496
|
+
# Returns the probability of the specified _text_ occurring within
|
497
|
+
# arbitrary text.
|
498
|
+
#
|
499
|
+
def text_probability(text)
|
500
|
+
probability_of_ngrams(ngrams_from_text(text))
|
501
|
+
end
|
502
|
+
|
503
|
+
#
|
504
|
+
# Returns the joint probability of the common ngrams between the
|
505
|
+
# specified _fragment_ and the model.
|
506
|
+
#
|
507
|
+
def fragment_commonality(fragment)
|
508
|
+
probability_of_ngrams(common_ngrams_from_fragment(fragment))
|
509
|
+
end
|
148
510
|
|
149
|
-
|
511
|
+
#
|
512
|
+
# Returns the joint probability of the common ngrams between the
|
513
|
+
# specified _sentence_ and the model.
|
514
|
+
#
|
515
|
+
def sentence_commonality(sentence)
|
516
|
+
probability_of_ngrams(common_ngrams_from_sentence(sentence))
|
517
|
+
end
|
518
|
+
|
519
|
+
#
|
520
|
+
# Returns the joint probability of the common ngrams between the
|
521
|
+
# specified _text_ and the model.
|
522
|
+
#
|
523
|
+
def text_commonality(text)
|
524
|
+
probability_of_ngrams(common_ngrams_from_text(text))
|
525
|
+
end
|
526
|
+
|
527
|
+
#
|
528
|
+
# Returns the conditional probability of the commonality of the
|
529
|
+
# specified _fragment_ against the _other_model_, given the commonality
|
530
|
+
# of the _fragment_ against the model.
|
531
|
+
#
|
532
|
+
def fragment_similarity(fragment,other_model)
|
533
|
+
other_model.fragment_commonality(fragment) / fragment_commonality(fragment)
|
534
|
+
end
|
535
|
+
|
536
|
+
#
|
537
|
+
# Returns the conditional probability of the commonality of the
|
538
|
+
# specified _sentence_ against the _other_model_, given the commonality
|
539
|
+
# of the _sentence_ against the model.
|
540
|
+
#
|
541
|
+
def sentence_similarity(sentence,other_model)
|
542
|
+
other_model.sentence_commonality(sentence) / sentence_commonality(sentence)
|
543
|
+
end
|
544
|
+
|
545
|
+
#
|
546
|
+
# Returns the conditional probability of the commonality of the
|
547
|
+
# specified _text_ against the _other_model_, given the commonality
|
548
|
+
# of the _text_ against the model.
|
549
|
+
#
|
550
|
+
def text_similarity(text,other_model)
|
551
|
+
other_model.text_commonality(text) / text_commonality(text)
|
552
|
+
end
|
553
|
+
|
554
|
+
#
|
555
|
+
# Returns a random gram from the model.
|
556
|
+
#
|
557
|
+
def random_gram
  known_prefixes = @prefixes.keys

  # An untrained model has nothing to pick from. Previously rand(0)
  # produced a Float, the lookup returned nil and indexing nil raised
  # NoMethodError.
  return nil if known_prefixes.empty?

  # choose a random prefix, then a random gram within that prefix
  prefix = known_prefixes[rand(known_prefixes.length)]

  return prefix[rand(prefix.length)]
end
|
562
|
+
|
563
|
+
#
|
564
|
+
# Returns a random ngram from the model.
|
565
|
+
#
|
566
|
+
def random_ngram
|
567
|
+
prefix_index = rand(@prefixes.length)
|
568
|
+
|
569
|
+
prefix = @prefixes.keys[prefix_index]
|
570
|
+
table = @prefixes.values[prefix_index]
|
571
|
+
|
572
|
+
gram_index = rand(table.grams.length)
|
573
|
+
|
574
|
+
return (prefix + table.grams[gram_index])
|
575
|
+
end
|
576
|
+
|
577
|
+
#
|
578
|
+
# Returns a randomly generated sentence of grams using the given
|
579
|
+
# _options_.
|
580
|
+
#
|
581
|
+
def random_gram_sentence(options={})
|
582
|
+
grams = []
|
583
|
+
last_ngram = @starting_ngram
|
584
|
+
|
585
|
+
# prime the grams
|
586
|
+
grams += @starting_ngram
|
587
|
+
|
588
|
+
loop do
|
589
|
+
next_ngrams = ngrams_prefixed_by(last_ngram.postfix).to_a
|
590
|
+
last_ngram = next_ngrams[rand(next_ngrams.length)]
|
591
|
+
|
592
|
+
if last_ngram.nil?
|
593
|
+
return []
|
594
|
+
else
|
595
|
+
grams << last_ngram.last
|
596
|
+
break if last_ngram == @stoping_ngram
|
597
|
+
end
|
598
|
+
end
|
599
|
+
|
600
|
+
return grams
|
601
|
+
end
|
602
|
+
|
603
|
+
#
|
604
|
+
# Returns a randomly generated sentence of text using the given
|
605
|
+
# _options_.
|
606
|
+
#
|
607
|
+
def random_sentence(options={})
|
608
|
+
grams = random_gram_sentence(options)
|
609
|
+
sentence = grams.delete_if { |gram|
|
610
|
+
gram == Tokens.start || gram == Tokens.stop
|
611
|
+
}.join(' ')
|
612
|
+
|
613
|
+
sentence << '.' if @ignore_punctuation
|
614
|
+
return sentence
|
615
|
+
end
|
616
|
+
|
617
|
+
#
|
618
|
+
# Returns a randomly generated paragraph of text using the given
|
619
|
+
# _options_.
|
620
|
+
#
|
621
|
+
# _options_ may contain the following keys:
|
622
|
+
# <tt>:min_sentences</tt>:: Minimum number of sentences in the
|
623
|
+
# paragraph. Defaults to 3.
|
624
|
+
# <tt>:max_sentences</tt>:: Maximum number of sentences in the
|
625
|
+
# paragraph. Defaults to 6.
|
626
|
+
#
|
627
|
+
def random_paragraph(options={})
  min_sentences = (options[:min_sentences] || 3)
  max_sentences = (options[:max_sentences] || 6)

  # Number of extra sentences that may be added on top of the minimum;
  # a maximum below the minimum is treated as "exactly the minimum".
  spread = [max_sentences - min_sentences, 0].max

  # rand(n) yields 0...n, so +1 makes the documented maximum reachable.
  # It also avoids rand(0), which returns a Float and made Float#times
  # fail whenever min == max.
  sentence_count = min_sentences + rand(spread + 1)

  sentences = []
  sentence_count.times { sentences << random_sentence(options) }

  return sentences.join(' ')
end
|
638
|
+
|
639
|
+
#
|
640
|
+
# Returns randomly generated text using the given _options_.
|
641
|
+
#
|
642
|
+
# _options_ may contain the following keys:
|
643
|
+
# <tt>:min_sentences</tt>:: Minimum number of sentences in the
|
644
|
+
# paragraph. Defaults to 3.
|
645
|
+
# <tt>:max_sentences</tt>:: Maximum number of sentences in the
|
646
|
+
# paragraph. Defaults to 6.
|
647
|
+
# <tt>:min_paragraphs</tt>:: Minimum number of paragraphs in the text.
|
648
|
+
# Defaults to 3.
|
649
|
+
# <tt>:max_paragraphs</tt>:: Maximum number of paragraphs in the text.
|
650
|
+
# Defaults to 5.
|
651
|
+
#
|
652
|
+
def random_text(options={})
  min_paragraphs = (options[:min_paragraphs] || 3)
  # The documented default maximum is 5. The old default of 6 combined
  # with an exclusive rand produced 3..5 paragraphs, so the default output
  # range is unchanged.
  max_paragraphs = (options[:max_paragraphs] || 5)

  # a maximum below the minimum is treated as "exactly the minimum"
  spread = [max_paragraphs - min_paragraphs, 0].max

  # rand(n) yields 0...n, so +1 makes the maximum inclusive and avoids
  # rand(0), which returns a Float and broke #times when min == max.
  paragraph_count = min_paragraphs + rand(spread + 1)

  paragraphs = []
  paragraph_count.times { paragraphs << random_paragraph(options) }

  return paragraphs.join("\n\n")
end
|
663
|
+
|
664
|
+
#
|
665
|
+
# Refreshes the probability tables of the model.
|
666
|
+
#
|
667
|
+
def refresh(&block)
|
668
|
+
block.call(self) if block
|
669
|
+
|
670
|
+
@prefixes.each_value { |table| table.build }
|
150
671
|
return self
|
151
672
|
end
|
152
673
|
|
153
|
-
|
674
|
+
#
|
675
|
+
# Clears and rebuilds the model.
|
676
|
+
#
|
677
|
+
def build(&block)
|
678
|
+
refresh do
|
679
|
+
clear
|
680
|
+
|
681
|
+
block.call(self) if block
|
682
|
+
end
|
683
|
+
end
|
154
684
|
|
155
|
-
|
156
|
-
|
685
|
+
#
|
686
|
+
# Clears the model of any training data.
|
687
|
+
#
|
688
|
+
def clear
|
689
|
+
@prefixes.clear
|
157
690
|
return self
|
158
691
|
end
|
159
692
|
|
693
|
+
protected
|
694
|
+
|
695
|
+
#
|
696
|
+
# Defines the default ngram _size_ for the model.
|
697
|
+
#
|
698
|
+
def self.ngram_size(size)
|
699
|
+
class_eval %{
|
700
|
+
def initialize(options={},&block)
|
701
|
+
super(options.merge(:ngram_size => #{size.to_i}),&block)
|
702
|
+
end
|
703
|
+
}
|
704
|
+
end
|
705
|
+
|
706
|
+
#
|
707
|
+
# Wraps the specified _sentence_ with StartSentence and StopSentence
|
708
|
+
# tokens.
|
709
|
+
#
|
710
|
+
def wrap_sentence(sentence)
|
711
|
+
@starting_ngram + sentence.to_a + @stoping_ngram
|
712
|
+
end
|
713
|
+
|
714
|
+
#
|
715
|
+
# Returns the probability table for the specified _ngram_.
|
716
|
+
#
|
717
|
+
def probability_table(ngram)
|
718
|
+
@prefixes[ngram.prefix] ||= ProbabilityTable.new
|
719
|
+
end
|
720
|
+
|
160
721
|
end
|
161
722
|
end
|