raingrams 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,31 @@
1
+ == 0.1.1 / 2008-10-12
2
+
3
+ * Improved the parsing abilities of Model#parse_sentence and
4
+ Model#parse_text.
5
+ * Fixed a bug in Model#has_ngram?.
6
+ * Fixed a bug in Model#ngrams_starting_with.
7
+ * Removed Model#probability_of_gram, for now at least.
8
+ * Renamed Ngram#includes? to Ngram#includes_all?.
9
+ * Renamed Model#ngrams_including to Model#ngrams_including_all.
10
+ * Renamed Model#frequencies_of_ngrams to Model#frequency_of_ngrams.
11
+ * Added the following methods:
12
+ * Ngram#includes_any?.
13
+ * Model.open.
14
+ * Model.train_with_paragraph.
15
+ * Model.train_with_text.
16
+ * Model.train_with_file.
17
+ * Model.train_with_url.
18
+ * Model#has_gram?.
19
+ * Model#ngrams_including_all.
20
+ * Model#ngrams_from_paragraph.
21
+ * Model#train_with_paragraph.
22
+ * Model#train_with_file.
23
+ * Model#train_with_url.
24
+ * Model#frequency_of_ngram.
25
+ * Model#frequencies_for.
26
+ * Model#frequencies_of_ngrams.
27
+ * Model#save.
28
+
1
29
  == 0.1.0 / 2008-10-06
2
30
 
3
31
  * Various bug fixes.
data/Manifest.txt CHANGED
@@ -35,8 +35,17 @@ lib/raingrams/open_vocabulary/pentagram_model.rb
35
35
  lib/raingrams/open_vocabulary/hexagram_model.rb
36
36
  lib/raingrams/open_vocabulary.rb
37
37
  tasks/spec.rb
38
+ spec/training/snowcrash.txt
39
+ spec/helpers/training.rb
40
+ spec/helpers.rb
38
41
  spec/spec_helper.rb
39
42
  spec/ngram_spec.rb
40
43
  spec/ngram_set_spec.rb
41
44
  spec/probability_table_spec.rb
42
45
  spec/raingrams_spec.rb
46
+ spec/model_spec.rb
47
+ spec/model_examples.rb
48
+ spec/bigram_model_spec.rb
49
+ spec/trigram_model_spec.rb
50
+ spec/quadgram_model_spec.rb
51
+ spec/pentagram_model_spec.rb
data/README.txt CHANGED
@@ -6,22 +6,66 @@
6
6
  == DESCRIPTION:
7
7
 
8
8
  Raingrams is a flexible and general-purpose ngrams library written in Ruby.
9
- Raingrams supports any non-zero ngram size, text/non-text grams, multiple
9
+ Raingrams supports ngram sizes greater than 1, text/non-text grams, multiple
10
10
  parsing styles and open/closed vocabulary models.
11
11
 
12
12
  == FEATURES:
13
13
 
14
- * Supports all ngram sizes above 1.
14
+ * Supports ngram sizes greater than 1.
15
15
  * Supports text and non-text grams.
16
16
  * Supports Open and Closed vocabulary models.
17
17
  * Supports calculating the similarity and commonality of sample text against
18
18
  specified models.
19
19
  * Supports generating random text from models.
20
20
 
21
+ == REQUIREMENTS:
22
+
23
+ * Hpricot
24
+
21
25
  == INSTALL:
22
26
 
23
27
  $ sudo gem install raingrams
24
28
 
29
+ == EXAMPLES:
30
+
31
+ * Train a model with ycombinator comments:
32
+
33
+ require 'raingrams'
34
+ require 'hpricot'
35
+ require 'open-uri'
36
+
37
+ include Raingrams
38
+
39
+ model = BigramModel.build do |model|
40
+ doc = Hpricot(open('http://news.ycombinator.org/newcomments'))
41
+ doc.search('span.comment') do |span|
42
+ model.train_with_text(span.inner_text)
43
+ end
44
+ end
45
+
46
+ * Update a trained model:
47
+
48
+ model.train_with_text %{Interesting videos. Anders talks about functional
49
+ support on .net, concurrency, immutability. Guy Steele talks about
50
+ Fortress on JVM. Too bad they are afraid of macros (access to AST),
51
+ though Steele does say Fortress has some support.}
52
+
53
+ model.refresh
54
+
55
+ * Generate a random sentence:
56
+
57
+ model.random_sentence
58
+ # => "OTOOH if you use slicehost even offer to bash Apple makes it will
59
+ exit and its 38 month ago based configuration of little networks created."
60
+
61
+ * Dump a model to a file, to be marshaled later:
62
+
63
+ model.save('path/for/model')
64
+
65
+ * Load a model from a file:
66
+
67
+ Model.open('path/for/model')
68
+
25
69
  == LICENSE:
26
70
 
27
71
  The MIT License
data/Rakefile CHANGED
@@ -9,6 +9,7 @@ Hoe.new('raingrams', Raingrams::VERSION) do |p|
9
9
  p.rubyforge_name = 'raingrams'
10
10
  p.developer('Postmodern Modulus III', 'postmodern.mod3@gmail.com')
11
11
  p.remote_rdoc_dir = 'docs'
12
+ p.extra_deps = ['hpricot']
12
13
  end
13
14
 
14
15
  # vim: syntax=Ruby
data/TODO.txt CHANGED
@@ -1,6 +1,5 @@
1
1
  == TODO:
2
2
 
3
- * Add spes for the Model class.
4
3
  * Add options to Model#random_sentence for weighting certain grams.
5
4
  * Add a command-line utility to utilize the Raingrams API.
6
5
 
@@ -4,6 +4,8 @@ require 'raingrams/probability_table'
4
4
  require 'raingrams/tokens'
5
5
 
6
6
  require 'set'
7
+ require 'hpricot'
8
+ require 'open-uri'
7
9
 
8
10
  module Raingrams
9
11
  class Model
@@ -56,6 +58,7 @@ module Raingrams
56
58
  @ignore_punctuation = true
57
59
  @ignore_urls = true
58
60
  @ignore_phone_numbers = false
61
+ @ignore_references = false
59
62
 
60
63
  if options.has_key?(:ignore_case)
61
64
  @ignore_case = options[:ignore_case]
@@ -73,14 +76,19 @@ module Raingrams
73
76
  @ignore_phone_numbers = options[:ignore_phone_numbers]
74
77
  end
75
78
 
79
+ if options.has_key?(:ignore_references)
80
+ @ignore_references = options[:ignore_references]
81
+ end
82
+
76
83
  @prefixes = {}
77
84
 
78
85
  block.call(self) if block
79
86
  end
80
87
 
81
88
  #
82
- # Creates a new NgramModel object with the given _options_. If a
83
- # _block_ is given, it will be passed the newly created model.
89
+ # Creates a new model object with the given _options_. If a
90
+ # _block_ is given, it will be passed the newly created model. After
91
+ # the block has been called the model will be built.
84
92
  #
85
93
  def self.build(options={},&block)
86
94
  self.new(options) do |model|
@@ -88,16 +96,74 @@ module Raingrams
88
96
  end
89
97
  end
90
98
 
99
+ #
100
+ # Creates a new model object with the given _options_ and trains it
101
+ # with the specified _paragraph_.
102
+ #
103
+ def self.train_with_paragraph(paragraph,options={})
104
+ self.build(options) do |model|
105
+ model.train_with_paragraph(paragraph)
106
+ end
107
+ end
108
+
109
+ #
110
+ # Creates a new model object with the given _options_ and trains it
111
+ # with the specified _text_.
112
+ #
113
+ def self.train_with_text(text,options={})
114
+ self.build(options) do |model|
115
+ model.train_with_text(text)
116
+ end
117
+ end
118
+
119
+ #
120
+ # Creates a new model object with the given _options_ and trains it
121
+ # with the contents of the specified _path_.
122
+ #
123
+ def self.train_with_file(path,options={})
124
+ self.build(options) do |model|
125
+ model.train_with_file(path)
126
+ end
127
+ end
128
+
129
+ #
130
+ # Creates a new model object with the given _options_ and trains it
131
+ # with the inner text of the paragraph tags at the specified _url_.
132
+ #
133
+ def self.train_with_url(url,options={})
134
+ self.build(options) do |model|
135
+ model.train_with_url(url)
136
+ end
137
+ end
138
+
139
+ #
140
+ # Marshals a model from the contents of the file at the specified
141
+ # _path_.
142
+ #
143
+ def self.open(path)
144
+ model = nil
145
+
146
+ File.open(path) do |file|
147
+ model = Marshal.load(file)
148
+ end
149
+
150
+ return model
151
+ end
152
+
91
153
  #
92
154
  # Parses the specified _sentence_ and returns an Array of tokens.
93
155
  #
94
156
  def parse_sentence(sentence)
95
- # eat tailing punctuation
96
- sentence = sentence.to_s.gsub(/[\.\?!]$/,'')
157
+ sentence = sentence.to_s
158
+
159
+ if @ignore_punctuation
160
+ # eat trailing punctuation
161
+ sentence.gsub!(/[\.\?!]*$/,'')
162
+ end
97
163
 
98
164
  if @ignore_urls
99
165
  # remove URLs
100
- sentence.gsub!(/\s*\w+:\/\/[\w\/,\._\-%\?&=]*\s*/,' ')
166
+ sentence.gsub!(/\s*\w+:\/\/[\w\/\+_\-,:%\d\.\-\?&=]*\s*/,' ')
101
167
  end
102
168
 
103
169
  if @ignore_phone_numbers
@@ -107,7 +173,7 @@ module Raingrams
107
173
 
108
174
  if @ignore_references
109
175
  # remove RFC style references
110
- sentence.gsub!(/\s*\[\d+\]\s*/,' ')
176
+ sentence.gsub!(/\s*[\(\{\[]\d+[\)\}\]]\s*/,' ')
111
177
  end
112
178
 
113
179
  if @ignore_case
@@ -117,10 +183,10 @@ module Raingrams
117
183
 
118
184
  if @ignore_punctuation
119
185
  # split and ignore punctuation characters
120
- return sentence.scan(/\w+[_\.:']?\w+/)
186
+ return sentence.scan(/\w+[\-_\.:']\w+|\w+/)
121
187
  else
122
188
  # split and accept punctuation characters
123
- return sentence.scan(/[\w\-_,\.;'"\\\/]+/)
189
+ return sentence.scan(/[\w\-_,:;\.\?\!'"\\\/]+/)
124
190
  end
125
191
  end
126
192
 
@@ -128,7 +194,7 @@ module Raingrams
128
194
  # Parses the specified _text_ and returns an Array of sentences.
129
195
  #
130
196
  def parse_text(text)
131
- text.to_s.scan(/[^\s\.\?!][^\.\?!]*/)
197
+ text.to_s.scan(/[^\s\.\?!][^\.\?!]*[\.\?\!]/)
132
198
  end
133
199
 
134
200
  #
@@ -138,8 +204,8 @@ module Raingrams
138
204
  ngram_set = NgramSet.new
139
205
 
140
206
  @prefixes.each do |prefix,table|
141
- table.each_gram do |gram|
142
- ngram_set << (prefix + gram)
207
+ table.each_gram do |postfix_gram|
208
+ ngram_set << (prefix + postfix_gram)
143
209
  end
144
210
  end
145
211
 
@@ -151,7 +217,11 @@ module Raingrams
151
217
  # +false+ otherwise.
152
218
  #
153
219
  def has_ngram?(ngram)
154
- @prefixes[ngram.prefix].has_gram?(ngram.last)
220
+ if @prefixes.has_key?(ngram.prefix)
221
+ return @prefixes[ngram.prefix].has_gram?(ngram.last)
222
+ else
223
+ return false
224
+ end
155
225
  end
156
226
 
157
227
  #
@@ -160,8 +230,8 @@ module Raingrams
160
230
  #
161
231
  def each_ngram(&block)
162
232
  @prefixes.each do |prefix,table|
163
- table.each_gram do |gram|
164
- block.call(prefix + gram) if block
233
+ table.each_gram do |postfix_gram|
234
+ block.call(prefix + postfix_gram) if block
165
235
  end
166
236
  end
167
237
 
@@ -178,7 +248,7 @@ module Raingrams
178
248
  selected_ngrams << ngram if block.call(ngram)
179
249
  end
180
250
 
181
- return ngrams
251
+ return selected_ngrams
182
252
  end
183
253
 
184
254
  #
@@ -221,8 +291,8 @@ module Raingrams
221
291
 
222
292
  @prefixes.each do |prefix,table|
223
293
  if prefix.first == gram
224
- table.each_gram do |gram|
225
- ngram_set << (prefix + gram)
294
+ table.each_gram do |postfix_gram|
295
+ ngram_set << (prefix + postfix_gram)
226
296
  end
227
297
  end
228
298
  end
@@ -246,20 +316,20 @@ module Raingrams
246
316
  end
247
317
 
248
318
  #
249
- # Returns the ngrams including the specified _grams_.
319
+ # Returns the ngrams including any of the specified _grams_.
250
320
  #
251
- def ngrams_including(*grams)
321
+ def ngrams_including_any(*grams)
252
322
  ngram_set = NgramSet.new
253
323
 
254
324
  @prefixes.each do |prefix,table|
255
- if prefix.includes?(grams)
256
- table.each_gram do |gram|
257
- ngram_set << (prefix + gram)
325
+ if prefix.includes_any?(*grams)
326
+ table.each_gram do |postfix_gram|
327
+ ngram_set << (prefix + postfix_gram)
258
328
  end
259
329
  else
260
- table.each_gram do |gram|
261
- if grams.include?(gram)
262
- ngram_set << (prefix + gram)
330
+ table.each_gram do |postfix_gram|
331
+ if grams.include?(postfix_gram)
332
+ ngram_set << (prefix + postfix_gram)
263
333
  end
264
334
  end
265
335
  end
@@ -268,6 +338,19 @@ module Raingrams
268
338
  return ngram_set
269
339
  end
270
340
 
341
+ #
342
+ # Returns the ngrams including all of the specified _grams_.
343
+ #
344
+ def ngrams_including_all(*grams)
345
+ ngram_set = NgramSet.new
346
+
347
+ each_ngram do |ngram|
348
+ ngram_set << ngram if ngram.includes_all?(*grams)
349
+ end
350
+
351
+ return ngram_set
352
+ end
353
+
271
354
  #
272
355
  # Returns the ngrams extracted from the specified _words_.
273
356
  #
@@ -300,6 +383,8 @@ module Raingrams
300
383
  end
301
384
  end
302
385
 
386
+ alias ngrams_from_paragraph ngrams_from_text
387
+
303
388
  #
304
389
  # Returns all ngrams which precede the specified _gram_.
305
390
  #
@@ -334,7 +419,19 @@ module Raingrams
334
419
  # Returns all grams within the model.
335
420
  #
336
421
  def grams
337
- @prefixes.keys.flatten.uniq
422
+ @prefixes.keys.inject(Set.new) do |all_grams,gram|
423
+ all_grams + gram
424
+ end
425
+ end
426
+
427
+ #
428
+ # Returns +true+ if the model contains the specified _gram_, returns
429
+ # +false+ otherwise.
430
+ #
431
+ def has_gram?(gram)
432
+ @prefixes.keys.any? do |prefix|
433
+ prefix.include?(gram)
434
+ end
338
435
  end
339
436
 
340
437
  #
@@ -376,7 +473,7 @@ module Raingrams
376
473
  # within the model.
377
474
  #
378
475
  def common_ngrams_from_fragment(fragment)
379
- ngrams_from_fragment(words).select { |ngram| has_ngram?(ngram) }
476
+ ngrams_from_fragment(fragment).select { |ngram| has_ngram?(ngram) }
380
477
  end
381
478
 
382
479
  #
@@ -423,6 +520,13 @@ module Raingrams
423
520
  train_with_ngrams(ngrams_from_sentence(sentence))
424
521
  end
425
522
 
523
+ #
524
+ # Train the model with the specified _paragraph_.
525
+ #
526
+ def train_with_paragraph(paragraph)
527
+ train_with_ngrams(ngrams_from_paragraph(paragraphs))
528
+ end
529
+
426
530
  #
427
531
  # Train the model with the specified _text_.
428
532
  #
@@ -430,6 +534,39 @@ module Raingrams
430
534
  train_with_ngrams(ngrams_from_text(text))
431
535
  end
432
536
 
537
+ #
538
+ # Train the model with the contents of the specified _path_.
539
+ #
540
+ def train_with_file(path)
541
+ train_with_text(File.read(path))
542
+ end
543
+
544
+ #
545
+ # Train the model with the inner text of the paragraph tags at the
546
+ # specified _url_.
547
+ #
548
+ def train_with_url(url)
549
+ doc = Hpricot(open(url))
550
+
551
+ return doc.search('p').map do |p|
552
+ train_with_paragraph(p.inner_text)
553
+ end
554
+ end
555
+
556
+ #
557
+ # Returns the observed frequency of the specified _ngram_ within
558
+ # the training text.
559
+ #
560
+ def frequency_of_ngram(ngram)
561
+ prefix = ngram.prefix
562
+
563
+ if @prefixes.has_key?(prefix)
564
+ return @prefixes[prefix].frequency_of(ngram.last)
565
+ else
566
+ return 0
567
+ end
568
+ end
569
+
433
570
  #
434
571
  # Returns the probability of the specified _ngram_ occurring within
435
572
  # arbitrary text.
@@ -444,6 +581,20 @@ module Raingrams
444
581
  end
445
582
  end
446
583
 
584
+ #
585
+ # Returns the observed frequency of the specified _ngrams_ occurring
586
+ # within the training text.
587
+ #
588
+ def frequencies_for(ngrams)
589
+ table = {}
590
+
591
+ ngrams.each do |ngram|
592
+ table[ngram] = frequency_of_ngram(ngram)
593
+ end
594
+
595
+ return table
596
+ end
597
+
447
598
  #
448
599
  # Returns the probability of the specified _ngrams_ occurring within
449
600
  # arbitrary text.
@@ -458,6 +609,16 @@ module Raingrams
458
609
  return table
459
610
  end
460
611
 
612
+ #
613
+ # Returns the total observed frequency of the specified _ngrams_
614
+ # occurring within the training text.
615
+ #
616
+ def frequency_of_ngrams(ngrams)
617
+ frequencies_for(ngrams).values.inject do |total,freq|
618
+ total + freq
619
+ end
620
+ end
621
+
461
622
  #
462
623
  # Returns the joint probability of the specified _ngrams_ occurring
463
624
  # within arbitrary text.
@@ -468,14 +629,6 @@ module Raingrams
468
629
  end
469
630
  end
470
631
 
471
- #
472
- # Returns the probably of the specified _gram_ occurring within
473
- # arbitrary text.
474
- #
475
- def probability_of_gram(gram)
476
- probability_of_ngrams(ngrams_starting_with(gram))
477
- end
478
-
479
632
  #
480
633
  # Returns the probability of the specified _fragment_ occurring within
481
634
  # arbitrary text.
@@ -582,9 +735,6 @@ module Raingrams
582
735
  grams = []
583
736
  last_ngram = @starting_ngram
584
737
 
585
- # prime the grams
586
- grams += @starting_ngram
587
-
588
738
  loop do
589
739
  next_ngrams = ngrams_prefixed_by(last_ngram.postfix).to_a
590
740
  last_ngram = next_ngrams[rand(next_ngrams.length)]
@@ -592,8 +742,11 @@ module Raingrams
592
742
  if last_ngram.nil?
593
743
  return []
594
744
  else
595
- grams << last_ngram.last
596
- break if last_ngram == @stoping_ngram
745
+ last_gram = last_ngram.last
746
+
747
+ break if last_gram == Tokens.stop
748
+
749
+ grams << last_gram
597
750
  end
598
751
  end
599
752
 
@@ -690,6 +843,17 @@ module Raingrams
690
843
  return self
691
844
  end
692
845
 
846
+ #
847
+ # Saves the model to the file at the specified _path_.
848
+ #
849
+ def save(path)
850
+ File.open(path,'w') do |file|
851
+ Marshal.dump(self,file)
852
+ end
853
+
854
+ return self
855
+ end
856
+
693
857
  protected
694
858
 
695
859
  #
@@ -70,8 +70,12 @@ module Raingrams
70
70
  super(obj.to_gram)
71
71
  end
72
72
 
73
- def includes?(*grams)
74
- (self & grams) == grams
73
+ def includes_any?(*grams)
74
+ grams.any? { |gram| include?(gram) }
75
+ end
76
+
77
+ def includes_all?(*grams)
78
+ grams.all? { |gram| include?(gram) }
75
79
  end
76
80
 
77
81
  def flatten
@@ -35,8 +35,12 @@ module Raingrams
35
35
  select { |ngram| ngram.include?(gram) }
36
36
  end
37
37
 
38
- def includes(*grams)
39
- select { |ngram| ngram.includes?(*grams) }
38
+ def including_any(*grams)
39
+ select { |ngram| ngram.includes_any?(*grams) }
40
+ end
41
+
42
+ def including_all(*grams)
43
+ select { |ngram| ngram.includes_all?(*grams) }
40
44
  end
41
45
 
42
46
  end
@@ -1,3 +1,3 @@
1
1
  module Raingrams
2
- VERSION = '0.1.0'
2
+ VERSION = '0.1.1'
3
3
  end