raingrams 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,31 @@
1
+ == 0.1.1 / 2008-10-12
2
+
3
+ * Improved the parsing abilities of Model#parse_sentence and
4
+ Model#parse_text.
5
+ * Fixed a bug in Model#has_ngram?.
6
+ * Fixed a bug in Model#ngrams_starting_with.
7
+ * Removed Model#probability_of_gram, for now at least.
8
+ * Renamed Ngram#includes? to Ngram#includes_all?.
9
+ * Renamed Model#ngrams_including to Model#ngrams_including_any.
10
+ * Renamed Model#frequencies_of_ngrams to Model#frequency_of_ngrams.
11
+ * Added the following methods:
12
+ * Ngram#includes_any?.
13
+ * Model.open.
14
+ * Model.train_with_paragraph.
15
+ * Model.train_with_text.
16
+ * Model.train_with_file.
17
+ * Model.train_with_url.
18
+ * Model#has_gram?.
19
+ * Model#ngrams_including_all.
20
+ * Model#ngrams_from_paragraph.
21
+ * Model#train_with_paragraph.
22
+ * Model#train_with_file.
23
+ * Model#train_with_url.
24
+ * Model#frequency_of_ngram.
25
+ * Model#frequencies_for.
26
+ * Model#frequencies_of_ngrams.
27
+ * Model#save.
28
+
1
29
  == 0.1.0 / 2008-10-06
2
30
 
3
31
  * Various bug fixes.
data/Manifest.txt CHANGED
@@ -35,8 +35,17 @@ lib/raingrams/open_vocabulary/pentagram_model.rb
35
35
  lib/raingrams/open_vocabulary/hexagram_model.rb
36
36
  lib/raingrams/open_vocabulary.rb
37
37
  tasks/spec.rb
38
+ spec/training/snowcrash.txt
39
+ spec/helpers/training.rb
40
+ spec/helpers.rb
38
41
  spec/spec_helper.rb
39
42
  spec/ngram_spec.rb
40
43
  spec/ngram_set_spec.rb
41
44
  spec/probability_table_spec.rb
42
45
  spec/raingrams_spec.rb
46
+ spec/model_spec.rb
47
+ spec/model_examples.rb
48
+ spec/bigram_model_spec.rb
49
+ spec/trigram_model_spec.rb
50
+ spec/quadgram_model_spec.rb
51
+ spec/pentagram_model_spec.rb
data/README.txt CHANGED
@@ -6,22 +6,66 @@
6
6
  == DESCRIPTION:
7
7
 
8
8
  Raingrams is a flexible and general-purpose ngrams library written in Ruby.
9
- Raingrams supports any non-zero ngram size, text/non-text grams, multiple
9
+ Raingrams supports ngram sizes greater than 1, text/non-text grams, multiple
10
10
  parsing styles and open/closed vocabulary models.
11
11
 
12
12
  == FEATURES:
13
13
 
14
- * Supports all ngram sizes above 1.
14
+ * Supports ngram sizes greater than 1.
15
15
  * Supports text and non-text grams.
16
16
  * Supports Open and Closed vocabulary models.
17
17
  * Supports calculating the similarity and commonality of sample text against
18
18
  specified models.
19
19
  * Supports generating random text from models.
20
20
 
21
+ == REQUIREMENTS:
22
+
23
+ * Hpricot
24
+
21
25
  == INSTALL:
22
26
 
23
27
  $ sudo gem install raingrams
24
28
 
29
+ == EXAMPLES:
30
+
31
+ * Train a model with ycombinator comments:
32
+
33
+ require 'raingrams'
34
+ require 'hpricot'
35
+ require 'open-uri'
36
+
37
+ include Raingrams
38
+
39
+ model = BigramModel.build do |model|
40
+ doc = Hpricot(open('http://news.ycombinator.org/newcomments'))
41
+ doc.search('span.comment') do |span|
42
+ model.train_with_text(span.inner_text)
43
+ end
44
+ end
45
+
46
+ * Update a trained model:
47
+
48
+ model.train_with_text %{Interesting videos. Anders talks about functional
49
+ support on .net, concurrency, immutability. Guy Steele talks about
50
+ Fortress on JVM. Too bad they are afraid of macros (access to AST),
51
+ though Steele does say Fortress has some support.}
52
+
53
+ model.refresh
54
+
55
+ * Generate a random sentence:
56
+
57
+ model.random_sentence
58
+ # => "OTOOH if you use slicehost even offer to bash Apple makes it will
59
+ exit and its 38 month ago based configuration of little networks created."
60
+
61
+ * Dump a model to a file, to be loaded later:
62
+
63
+ model.save('path/for/model')
64
+
65
+ * Load a model from a file:
66
+
67
+ Model.open('path/for/model')
68
+
25
69
  == LICENSE:
26
70
 
27
71
  The MIT License
data/Rakefile CHANGED
@@ -9,6 +9,7 @@ Hoe.new('raingrams', Raingrams::VERSION) do |p|
9
9
  p.rubyforge_name = 'raingrams'
10
10
  p.developer('Postmodern Modulus III', 'postmodern.mod3@gmail.com')
11
11
  p.remote_rdoc_dir = 'docs'
12
+ p.extra_deps = ['hpricot']
12
13
  end
13
14
 
14
15
  # vim: syntax=Ruby
data/TODO.txt CHANGED
@@ -1,6 +1,5 @@
1
1
  == TODO:
2
2
 
3
- * Add spes for the Model class.
4
3
  * Add options to Model#random_sentence for weighting certain grams.
5
4
  * Add a command-line utility to utilize the Raingrams API.
6
5
 
@@ -4,6 +4,8 @@ require 'raingrams/probability_table'
4
4
  require 'raingrams/tokens'
5
5
 
6
6
  require 'set'
7
+ require 'hpricot'
8
+ require 'open-uri'
7
9
 
8
10
  module Raingrams
9
11
  class Model
@@ -56,6 +58,7 @@ module Raingrams
56
58
  @ignore_punctuation = true
57
59
  @ignore_urls = true
58
60
  @ignore_phone_numbers = false
61
+ @ignore_references = false
59
62
 
60
63
  if options.has_key?(:ignore_case)
61
64
  @ignore_case = options[:ignore_case]
@@ -73,14 +76,19 @@ module Raingrams
73
76
  @ignore_phone_numbers = options[:ignore_phone_numbers]
74
77
  end
75
78
 
79
+ if options.has_key?(:ignore_references)
80
+ @ignore_references = options[:ignore_references]
81
+ end
82
+
76
83
  @prefixes = {}
77
84
 
78
85
  block.call(self) if block
79
86
  end
80
87
 
81
88
  #
82
- # Creates a new NgramModel object with the given _options_. If a
83
- # _block_ is given, it will be passed the newly created model.
89
+ # Creates a new model object with the given _options_. If a
90
+ # _block_ is given, it will be passed the newly created model. After
91
+ # the block has been called the model will be built.
84
92
  #
85
93
  def self.build(options={},&block)
86
94
  self.new(options) do |model|
@@ -88,16 +96,74 @@ module Raingrams
88
96
  end
89
97
  end
90
98
 
99
+ #
100
+ # Creates a new model object with the given _options_ and trains it
101
+ # with the specified _paragraph_.
102
+ #
103
+ def self.train_with_paragraph(paragraph,options={})
104
+ self.build(options) do |model|
105
+ model.train_with_paragraph(paragraph)
106
+ end
107
+ end
108
+
109
+ #
110
+ # Creates a new model object with the given _options_ and trains it
111
+ # with the specified _text_.
112
+ #
113
+ def self.train_with_text(text,options={})
114
+ self.build(options) do |model|
115
+ model.train_with_text(text)
116
+ end
117
+ end
118
+
119
+ #
120
+ # Creates a new model object with the given _options_ and trains it
121
+ # with the contents of the specified _path_.
122
+ #
123
+ def self.train_with_file(path,options={})
124
+ self.build(options) do |model|
125
+ model.train_with_file(path)
126
+ end
127
+ end
128
+
129
+ #
130
+ # Creates a new model object with the given _options_ and trains it
131
+ # with the inner text of the paragraph tags at the specified _url_.
132
+ #
133
+ def self.train_with_url(url,options={})
134
+ self.build(options) do |model|
135
+ model.train_with_url(url)
136
+ end
137
+ end
138
+
139
+ #
140
+ # Loads a marshaled model from the contents of the file at the specified
141
+ # _path_.
142
+ #
143
+ def self.open(path)
144
+ model = nil
145
+
146
+ File.open(path) do |file|
147
+ model = Marshal.load(file)
148
+ end
149
+
150
+ return model
151
+ end
152
+
91
153
  #
92
154
  # Parses the specified _sentence_ and returns an Array of tokens.
93
155
  #
94
156
  def parse_sentence(sentence)
95
- # eat tailing punctuation
96
- sentence = sentence.to_s.gsub(/[\.\?!]$/,'')
157
+ sentence = sentence.to_s
158
+
159
+ if @ignore_punctuation
160
+ # eat trailing punctuation
161
+ sentence.gsub!(/[\.\?!]*$/,'')
162
+ end
97
163
 
98
164
  if @ignore_urls
99
165
  # remove URLs
100
- sentence.gsub!(/\s*\w+:\/\/[\w\/,\._\-%\?&=]*\s*/,' ')
166
+ sentence.gsub!(/\s*\w+:\/\/[\w\/\+_\-,:%\d\.\-\?&=]*\s*/,' ')
101
167
  end
102
168
 
103
169
  if @ignore_phone_numbers
@@ -107,7 +173,7 @@ module Raingrams
107
173
 
108
174
  if @ignore_references
109
175
  # remove RFC style references
110
- sentence.gsub!(/\s*\[\d+\]\s*/,' ')
176
+ sentence.gsub!(/\s*[\(\{\[]\d+[\)\}\]]\s*/,' ')
111
177
  end
112
178
 
113
179
  if @ignore_case
@@ -117,10 +183,10 @@ module Raingrams
117
183
 
118
184
  if @ignore_punctuation
119
185
  # split and ignore punctuation characters
120
- return sentence.scan(/\w+[_\.:']?\w+/)
186
+ return sentence.scan(/\w+[\-_\.:']\w+|\w+/)
121
187
  else
122
188
  # split and accept punctuation characters
123
- return sentence.scan(/[\w\-_,\.;'"\\\/]+/)
189
+ return sentence.scan(/[\w\-_,:;\.\?\!'"\\\/]+/)
124
190
  end
125
191
  end
126
192
 
@@ -128,7 +194,7 @@ module Raingrams
128
194
  # Parses the specified _text_ and returns an Array of sentences.
129
195
  #
130
196
  def parse_text(text)
131
- text.to_s.scan(/[^\s\.\?!][^\.\?!]*/)
197
+ text.to_s.scan(/[^\s\.\?!][^\.\?!]*[\.\?\!]/)
132
198
  end
133
199
 
134
200
  #
@@ -138,8 +204,8 @@ module Raingrams
138
204
  ngram_set = NgramSet.new
139
205
 
140
206
  @prefixes.each do |prefix,table|
141
- table.each_gram do |gram|
142
- ngram_set << (prefix + gram)
207
+ table.each_gram do |postfix_gram|
208
+ ngram_set << (prefix + postfix_gram)
143
209
  end
144
210
  end
145
211
 
@@ -151,7 +217,11 @@ module Raingrams
151
217
  # +false+ otherwise.
152
218
  #
153
219
  def has_ngram?(ngram)
154
- @prefixes[ngram.prefix].has_gram?(ngram.last)
220
+ if @prefixes.has_key?(ngram.prefix)
221
+ return @prefixes[ngram.prefix].has_gram?(ngram.last)
222
+ else
223
+ return false
224
+ end
155
225
  end
156
226
 
157
227
  #
@@ -160,8 +230,8 @@ module Raingrams
160
230
  #
161
231
  def each_ngram(&block)
162
232
  @prefixes.each do |prefix,table|
163
- table.each_gram do |gram|
164
- block.call(prefix + gram) if block
233
+ table.each_gram do |postfix_gram|
234
+ block.call(prefix + postfix_gram) if block
165
235
  end
166
236
  end
167
237
 
@@ -178,7 +248,7 @@ module Raingrams
178
248
  selected_ngrams << ngram if block.call(ngram)
179
249
  end
180
250
 
181
- return ngrams
251
+ return selected_ngrams
182
252
  end
183
253
 
184
254
  #
@@ -221,8 +291,8 @@ module Raingrams
221
291
 
222
292
  @prefixes.each do |prefix,table|
223
293
  if prefix.first == gram
224
- table.each_gram do |gram|
225
- ngram_set << (prefix + gram)
294
+ table.each_gram do |postfix_gram|
295
+ ngram_set << (prefix + postfix_gram)
226
296
  end
227
297
  end
228
298
  end
@@ -246,20 +316,20 @@ module Raingrams
246
316
  end
247
317
 
248
318
  #
249
- # Returns the ngrams including the specified _grams_.
319
+ # Returns the ngrams including any of the specified _grams_.
250
320
  #
251
- def ngrams_including(*grams)
321
+ def ngrams_including_any(*grams)
252
322
  ngram_set = NgramSet.new
253
323
 
254
324
  @prefixes.each do |prefix,table|
255
- if prefix.includes?(grams)
256
- table.each_gram do |gram|
257
- ngram_set << (prefix + gram)
325
+ if prefix.includes_any?(*grams)
326
+ table.each_gram do |postfix_gram|
327
+ ngram_set << (prefix + postfix_gram)
258
328
  end
259
329
  else
260
- table.each_gram do |gram|
261
- if grams.include?(gram)
262
- ngram_set << (prefix + gram)
330
+ table.each_gram do |postfix_gram|
331
+ if grams.include?(postfix_gram)
332
+ ngram_set << (prefix + postfix_gram)
263
333
  end
264
334
  end
265
335
  end
@@ -268,6 +338,19 @@ module Raingrams
268
338
  return ngram_set
269
339
  end
270
340
 
341
+ #
342
+ # Returns the ngrams including all of the specified _grams_.
343
+ #
344
+ def ngrams_including_all(*grams)
345
+ ngram_set = NgramSet.new
346
+
347
+ each_ngram do |ngram|
348
+ ngram_set << ngram if ngram.includes_all?(*grams)
349
+ end
350
+
351
+ return ngram_set
352
+ end
353
+
271
354
  #
272
355
  # Returns the ngrams extracted from the specified _words_.
273
356
  #
@@ -300,6 +383,8 @@ module Raingrams
300
383
  end
301
384
  end
302
385
 
386
+ alias ngrams_from_paragraph ngrams_from_text
387
+
303
388
  #
304
389
  # Returns all ngrams which precede the specified _gram_.
305
390
  #
@@ -334,7 +419,19 @@ module Raingrams
334
419
  # Returns all grams within the model.
335
420
  #
336
421
  def grams
337
- @prefixes.keys.flatten.uniq
422
+ @prefixes.keys.inject(Set.new) do |all_grams,gram|
423
+ all_grams + gram
424
+ end
425
+ end
426
+
427
+ #
428
+ # Returns +true+ if the model contains the specified _gram_, returns
429
+ # +false+ otherwise.
430
+ #
431
+ def has_gram?(gram)
432
+ @prefixes.keys.any? do |prefix|
433
+ prefix.include?(gram)
434
+ end
338
435
  end
339
436
 
340
437
  #
@@ -376,7 +473,7 @@ module Raingrams
376
473
  # within the model.
377
474
  #
378
475
  def common_ngrams_from_fragment(fragment)
379
- ngrams_from_fragment(words).select { |ngram| has_ngram?(ngram) }
476
+ ngrams_from_fragment(fragment).select { |ngram| has_ngram?(ngram) }
380
477
  end
381
478
 
382
479
  #
@@ -423,6 +520,13 @@ module Raingrams
423
520
  train_with_ngrams(ngrams_from_sentence(sentence))
424
521
  end
425
522
 
523
+ #
524
+ # Train the model with the specified _paragraph_.
525
+ #
526
+ def train_with_paragraph(paragraph)
527
+ train_with_ngrams(ngrams_from_paragraph(paragraphs))
528
+ end
529
+
426
530
  #
427
531
  # Train the model with the specified _text_.
428
532
  #
@@ -430,6 +534,39 @@ module Raingrams
430
534
  train_with_ngrams(ngrams_from_text(text))
431
535
  end
432
536
 
537
+ #
538
+ # Train the model with the contents of the specified _path_.
539
+ #
540
+ def train_with_file(path)
541
+ train_with_text(File.read(path))
542
+ end
543
+
544
+ #
545
+ # Train the model with the inner text of the paragraph tags at the
546
+ # specified _url_.
547
+ #
548
+ def train_with_url(url)
549
+ doc = Hpricot(open(url))
550
+
551
+ return doc.search('p').map do |p|
552
+ train_with_paragraph(p.inner_text)
553
+ end
554
+ end
555
+
556
+ #
557
+ # Returns the observed frequency of the specified _ngram_ within
558
+ # the training text.
559
+ #
560
+ def frequency_of_ngram(ngram)
561
+ prefix = ngram.prefix
562
+
563
+ if @prefixes.has_key?(prefix)
564
+ return @prefixes[prefix].frequency_of(ngram.last)
565
+ else
566
+ return 0
567
+ end
568
+ end
569
+
433
570
  #
434
571
  # Returns the probability of the specified _ngram_ occurring within
435
572
  # arbitrary text.
@@ -444,6 +581,20 @@ module Raingrams
444
581
  end
445
582
  end
446
583
 
584
+ #
585
+ # Returns the observed frequency of the specified _ngrams_ occurring
586
+ # within the training text.
587
+ #
588
+ def frequencies_for(ngrams)
589
+ table = {}
590
+
591
+ ngrams.each do |ngram|
592
+ table[ngram] = frequency_of_ngram(ngram)
593
+ end
594
+
595
+ return table
596
+ end
597
+
447
598
  #
448
599
  # Returns the probability of the specified _ngrams_ occurring within
449
600
  # arbitrary text.
@@ -458,6 +609,16 @@ module Raingrams
458
609
  return table
459
610
  end
460
611
 
612
+ #
613
+ # Returns the total observed frequency of the specified _ngrams_
614
+ # occurring within the training text.
615
+ #
616
+ def frequency_of_ngrams(ngrams)
617
+ frequencies_for(ngrams).values.inject do |total,freq|
618
+ total + freq
619
+ end
620
+ end
621
+
461
622
  #
462
623
  # Returns the joint probability of the specified _ngrams_ occurring
463
624
  # within arbitrary text.
@@ -468,14 +629,6 @@ module Raingrams
468
629
  end
469
630
  end
470
631
 
471
- #
472
- # Returns the probably of the specified _gram_ occurring within
473
- # arbitrary text.
474
- #
475
- def probability_of_gram(gram)
476
- probability_of_ngrams(ngrams_starting_with(gram))
477
- end
478
-
479
632
  #
480
633
  # Returns the probability of the specified _fragment_ occurring within
481
634
  # arbitrary text.
@@ -582,9 +735,6 @@ module Raingrams
582
735
  grams = []
583
736
  last_ngram = @starting_ngram
584
737
 
585
- # prime the grams
586
- grams += @starting_ngram
587
-
588
738
  loop do
589
739
  next_ngrams = ngrams_prefixed_by(last_ngram.postfix).to_a
590
740
  last_ngram = next_ngrams[rand(next_ngrams.length)]
@@ -592,8 +742,11 @@ module Raingrams
592
742
  if last_ngram.nil?
593
743
  return []
594
744
  else
595
- grams << last_ngram.last
596
- break if last_ngram == @stoping_ngram
745
+ last_gram = last_ngram.last
746
+
747
+ break if last_gram == Tokens.stop
748
+
749
+ grams << last_gram
597
750
  end
598
751
  end
599
752
 
@@ -690,6 +843,17 @@ module Raingrams
690
843
  return self
691
844
  end
692
845
 
846
+ #
847
+ # Saves the model to the file at the specified _path_.
848
+ #
849
+ def save(path)
850
+ File.open(path,'w') do |file|
851
+ Marshal.dump(self,file)
852
+ end
853
+
854
+ return self
855
+ end
856
+
693
857
  protected
694
858
 
695
859
  #
@@ -70,8 +70,12 @@ module Raingrams
70
70
  super(obj.to_gram)
71
71
  end
72
72
 
73
- def includes?(*grams)
74
- (self & grams) == grams
73
+ def includes_any?(*grams)
74
+ grams.any? { |gram| include?(gram) }
75
+ end
76
+
77
+ def includes_all?(*grams)
78
+ grams.all? { |gram| include?(gram) }
75
79
  end
76
80
 
77
81
  def flatten
@@ -35,8 +35,12 @@ module Raingrams
35
35
  select { |ngram| ngram.include?(gram) }
36
36
  end
37
37
 
38
- def includes(*grams)
39
- select { |ngram| ngram.includes?(*grams) }
38
+ def including_any(*grams)
39
+ select { |ngram| ngram.includes_any?(*grams) }
40
+ end
41
+
42
+ def including_all(*grams)
43
+ select { |ngram| ngram.includes_all?(*grams) }
40
44
  end
41
45
 
42
46
  end
@@ -1,3 +1,3 @@
1
1
  module Raingrams
2
- VERSION = '0.1.0'
2
+ VERSION = '0.1.1'
3
3
  end