RubyGems - raingrams - Versions diffs - 0.1.0 → 0.1.1 - Mend

raingrams 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

data/History.txt +28 -0
data/Manifest.txt +9 -0
data/README.txt +46 -2
data/Rakefile +1 -0
data/TODO.txt +0 -1
data/lib/raingrams/model.rb +204 -40
data/lib/raingrams/ngram.rb +6 -2
data/lib/raingrams/ngram_set.rb +6 -2
data/lib/raingrams/version.rb +1 -1
data/spec/bigram_model_spec.rb +111 -0
data/spec/helpers/training.rb +8 -0
data/spec/helpers.rb +1 -0
data/spec/model_examples.rb +83 -0
data/spec/model_spec.rb +118 -0
data/spec/ngram_set_spec.rb +11 -2
data/spec/ngram_spec.rb +1 -1
data/spec/pentagram_model_spec.rb +101 -0
data/spec/quadgram_model_spec.rb +106 -0
data/spec/spec_helper.rb +2 -0
data/spec/training/snowcrash.txt +88 -0
data/spec/trigram_model_spec.rb +109 -0
metadata +24 -4

data/History.txt CHANGED Viewed

@@ -1,3 +1,31 @@
+== 0.1.1 / 2008-10-12
+* Improved the parsing abilities of Model#parse_sentence and
+  Model#parse_text.
+* Fixed a bug in Model#has_ngram?.
+* Fixed a bug in Model#ngrams_starting_with.
+* Removed Model#probability_of_gram, for now at least.
+* Renamed Ngram#includes? to Ngram#includes_all?.
+* Renamed Model#ngrams_including to Model#ngrams_including_any.
+* Renamed Model#frequencies_of_ngrams to Model#frequency_of_ngrams.
+* Added the following methods:
+  * Ngram#includes_any?.
+  * Model.open.
+  * Model.train_with_paragraph.
+  * Model.train_with_text.
+  * Model.train_with_file.
+  * Model.train_with_url.
+  * Model#has_gram?.
+  * Model#ngrams_including_all.
+  * Model#ngrams_from_paragraph.
+  * Model#train_with_paragraph.
+  * Model#train_with_file.
+  * Model#train_with_url.
+  * Model#frequency_of_ngram.
+  * Model#frequencies_for.
+  * Model#frequencies_of_ngrams.
+  * Model#save.
 == 0.1.0 / 2008-10-06
 * Various bug fixes.

data/Manifest.txt CHANGED Viewed

@@ -35,8 +35,17 @@ lib/raingrams/open_vocabulary/pentagram_model.rb
 lib/raingrams/open_vocabulary/hexagram_model.rb
 lib/raingrams/open_vocabulary.rb
 tasks/spec.rb
+spec/training/snowcrash.txt
+spec/helpers/training.rb
+spec/helpers.rb
 spec/spec_helper.rb
 spec/ngram_spec.rb
 spec/ngram_set_spec.rb
 spec/probability_table_spec.rb
 spec/raingrams_spec.rb
+spec/model_spec.rb
+spec/model_examples.rb
+spec/bigram_model_spec.rb
+spec/trigram_model_spec.rb
+spec/quadgram_model_spec.rb
+spec/pentagram_model_spec.rb

data/README.txt CHANGED Viewed

@@ -6,22 +6,66 @@
 == DESCRIPTION:
 Raingrams is a flexible and general-purpose ngrams library written in Ruby.
-Raingrams supports any non-zero ngram size, text/non-text grams, multiple
+Raingrams supports ngram sizes greater than 1, text/non-text grams, multiple
 parsing styles and open/closed vocabulary models.
 == FEATURES:
-* Supports all ngram sizes above 1.
+* Supports ngram sizes greater than 1.
 * Supports text and non-text grams.
 * Supports Open and Closed vocabulary models.
 * Supports calculating the similarity and commonality of sample text against
   specified models.
 * Supports generating random text from models.
+== REQUIREMENTS:
+* Hpricot
 == INSTALL:
   $ sudo gem install raingrams
+== EXAMPLES:
+* Train a model with ycombinator comments:
+  require 'raingrams'
+  require 'hpricot'
+  require 'open-uri'
+  include Raingrams
+  model = BigramModel.build do |model|
+    doc = Hpricot(open('http://news.ycombinator.org/newcomments'))
+    doc.search('span.comment') do |span|
+      model.train_with_text(span.inner_text)
+    end
+  end
+* Update a trained model:
+  model.train_with_text %{Interesting videos. Anders talks about functional
+    support on .net, concurrency, immutability. Guy Steele talks about
+    Fortress on JVM. Too bad they are afraid of macros (access to AST),
+    though Steele does say Fortress has some support.}
+  model.refresh
+* Generate a random sentence:
+  model.random_sentence
+  # => "OTOOH if you use slicehost even offer to bash Apple makes it will
+  exit and its 38 month ago based configuration of little networks created."
+* Dump a model to a file, to be marshaled later:
+  model.save('path/for/model')
+* Load a model from a file:
+  Model.open('path/for/model')
 == LICENSE:
 The MIT License

data/Rakefile CHANGED Viewed

@@ -9,6 +9,7 @@ Hoe.new('raingrams', Raingrams::VERSION) do |p|
   p.rubyforge_name = 'raingrams'
   p.developer('Postmodern Modulus III', 'postmodern.mod3@gmail.com')
   p.remote_rdoc_dir = 'docs'
+  p.extra_deps = ['hpricot']
 end
 # vim: syntax=Ruby

data/TODO.txt CHANGED Viewed

@@ -1,6 +1,5 @@
 == TODO:
-* Add spes for the Model class.
 * Add options to Model#random_sentence for weighting certain grams.
 * Add a command-line utility to utilize the Raingrams API.

data/lib/raingrams/model.rb CHANGED Viewed

@@ -4,6 +4,8 @@ require 'raingrams/probability_table'
 require 'raingrams/tokens'
 require 'set'
+require 'hpricot'
+require 'open-uri'
 module Raingrams
   class Model
@@ -56,6 +58,7 @@ module Raingrams
       @ignore_punctuation = true
       @ignore_urls = true
       @ignore_phone_numbers = false
+      @ignore_references = false
       if options.has_key?(:ignore_case)
         @ignore_case = options[:ignore_case]
@@ -73,14 +76,19 @@ module Raingrams
         @ignore_phone_numbers = options[:ignore_phone_numbers]
       end
+      if options.has_key?(:ignore_references)
+        @ignore_references = options[:ignore_references]
+      end
       @prefixes = {}
       block.call(self) if block
     end
     #
-    # Creates a new NgramModel object with the given _options_. If a
-    # _block_ is given, it will be passed the newly created model.
+    # Creates a new model object with the given _options_. If a
+    # _block_ is given, it will be passed the newly created model. After
+    # the block has been called the model will be built.
     #
     def self.build(options={},&block)
       self.new(options) do |model|
@@ -88,16 +96,74 @@ module Raingrams
       end
     end
+    #
+    # Creates a new model object with the given _options_ and trains it
+    # with the specified _paragraph_.
+    #
+    def self.train_with_paragraph(paragraph,options={})
+      self.build(options) do |model|
+        model.train_with_paragraph(paragraph)
+      end
+    end
+    #
+    # Creates a new model object with the given _options_ and trains it
+    # with the specified _text_.
+    #
+    def self.train_with_text(text,options={})
+      self.build(options) do |model|
+        model.train_with_text(text)
+      end
+    end
+    #
+    # Creates a new model object with the given _options_ and trains it
+    # with the contents of the specified _path_.
+    #
+    def self.train_with_file(path,options={})
+      self.build(options) do |model|
+        model.train_with_file(path)
+      end
+    end
+    #
+    # Creates a new model object with the given _options_ and trains it
+    # with the inner text of the paragraph tags at the specified _url_.
+    #
+    def self.train_with_url(url,options={})
+      self.build(options) do |model|
+        model.train_with_url(url)
+      end
+    end
+    #
+    # Loads a marshaled model from the contents of the file at the
+    # specified _path_.
+    #
+    def self.open(path)
+      model = nil
+      File.open(path) do |file|
+        model = Marshal.load(file)
+      end
+      return model
+    end
     #
     # Parses the specified _sentence_ and returns an Array of tokens.
     #
     def parse_sentence(sentence)
-      # eat tailing punctuation
-      sentence = sentence.to_s.gsub(/[\.\?!]$/,'')
+      sentence = sentence.to_s
+      if @ignore_punctuation
+        # eat trailing punctuation
+        sentence.gsub!(/[\.\?!]*$/,'')
+      end
       if @ignore_urls
         # remove URLs
-        sentence.gsub!(/\s*\w+:\/\/[\w\/,\._\-%\?&=]*\s*/,' ')
+        sentence.gsub!(/\s*\w+:\/\/[\w\/\+_\-,:%\d\.\-\?&=]*\s*/,' ')
       end
       if @ignore_phone_numbers
@@ -107,7 +173,7 @@ module Raingrams
       if @ignore_references
         # remove RFC style references
-        sentence.gsub!(/\s*\[\d+\]\s*/,' ')
+        sentence.gsub!(/\s*[\(\{\[]\d+[\)\}\]]\s*/,' ')
       end
       if @ignore_case
@@ -117,10 +183,10 @@ module Raingrams
       if @ignore_punctuation
         # split and ignore punctuation characters
-        return sentence.scan(/\w+[_\.:']?\w+/)
+        return sentence.scan(/\w+[\-_\.:']\w+|\w+/)
       else
         # split and accept punctuation characters
-        return sentence.scan(/[\w\-_,\.;'"\\\/]+/)
+        return sentence.scan(/[\w\-_,:;\.\?\!'"\\\/]+/)
       end
     end
@@ -128,7 +194,7 @@ module Raingrams
     # Parses the specified _text_ and returns an Array of sentences.
     #
     def parse_text(text)
-      text.to_s.scan(/[^\s\.\?!][^\.\?!]*/)
+      text.to_s.scan(/[^\s\.\?!][^\.\?!]*[\.\?\!]/)
     end
     #
@@ -138,8 +204,8 @@ module Raingrams
       ngram_set = NgramSet.new
       @prefixes.each do |prefix,table|
-        table.each_gram do |gram|
-          ngram_set << (prefix + gram)
+        table.each_gram do |postfix_gram|
+          ngram_set << (prefix + postfix_gram)
         end
       end
@@ -151,7 +217,11 @@ module Raingrams
     # +false+ otherwise.
     #
     def has_ngram?(ngram)
-      @prefixes[ngram.prefix].has_gram?(ngram.last)
+      if @prefixes.has_key?(ngram.prefix)
+        return @prefixes[ngram.prefix].has_gram?(ngram.last)
+      else
+        return false
+      end
     end
     #
@@ -160,8 +230,8 @@ module Raingrams
     #
     def each_ngram(&block)
       @prefixes.each do |prefix,table|
-        table.each_gram do |gram|
-          block.call(prefix + gram) if block
+        table.each_gram do |postfix_gram|
+          block.call(prefix + postfix_gram) if block
         end
       end
@@ -178,7 +248,7 @@ module Raingrams
         selected_ngrams << ngram if block.call(ngram)
       end
-      return ngrams
+      return selected_ngrams
     end
     #
@@ -221,8 +291,8 @@ module Raingrams
       @prefixes.each do |prefix,table|
         if prefix.first == gram
-          table.each_gram do |gram|
-            ngram_set << (prefix + gram)
+          table.each_gram do |postfix_gram|
+            ngram_set << (prefix + postfix_gram)
           end
         end
       end
@@ -246,20 +316,20 @@ module Raingrams
     end
     #
-    # Returns the ngrams including the specified _grams_.
+    # Returns the ngrams including any of the specified _grams_.
     #
-    def ngrams_including(*grams)
+    def ngrams_including_any(*grams)
       ngram_set = NgramSet.new
       @prefixes.each do |prefix,table|
-        if prefix.includes?(grams)
-          table.each_gram do |gram|
-            ngram_set << (prefix + gram)
+        if prefix.includes_any?(*grams)
+          table.each_gram do |postfix_gram|
+            ngram_set << (prefix + postfix_gram)
           end
         else
-          table.each_gram do |gram|
-            if grams.include?(gram)
-              ngram_set << (prefix + gram)
+          table.each_gram do |postfix_gram|
+            if grams.include?(postfix_gram)
+              ngram_set << (prefix + postfix_gram)
             end
           end
         end
@@ -268,6 +338,19 @@ module Raingrams
       return ngram_set
     end
+    #
+    # Returns the ngrams including all of the specified _grams_.
+    #
+    def ngrams_including_all(*grams)
+      ngram_set = NgramSet.new
+      each_ngram do |ngram|
+        ngram_set << ngram if ngram.includes_all?(*grams)
+      end
+      return ngram_set
+    end
     #
     # Returns the ngrams extracted from the specified _words_.
     #
@@ -300,6 +383,8 @@ module Raingrams
       end
     end
+    alias ngrams_from_paragraph ngrams_from_text
     #
     # Returns all ngrams which precede the specified _gram_.
     #
@@ -334,7 +419,19 @@ module Raingrams
     # Returns all grams within the model.
     #
     def grams
-      @prefixes.keys.flatten.uniq
+      @prefixes.keys.inject(Set.new) do |all_grams,gram|
+        all_grams + gram
+      end
+    end
+    #
+    # Returns +true+ if the model contains the specified _gram_, returns
+    # +false+ otherwise.
+    #
+    def has_gram?(gram)
+      @prefixes.keys.any? do |prefix|
+        prefix.include?(gram)
+      end
     end
     #
@@ -376,7 +473,7 @@ module Raingrams
     # within the model.
     #
     def common_ngrams_from_fragment(fragment)
-      ngrams_from_fragment(words).select { |ngram| has_ngram?(ngram) }
+      ngrams_from_fragment(fragment).select { |ngram| has_ngram?(ngram) }
     end
     #
@@ -423,6 +520,13 @@ module Raingrams
       train_with_ngrams(ngrams_from_sentence(sentence))
     end
+    #
+    # Train the model with the specified _paragraph_.
+    #
+    def train_with_paragraph(paragraph)
+      train_with_ngrams(ngrams_from_paragraph(paragraphs))
+    end
     #
     # Train the model with the specified _text_.
     #
@@ -430,6 +534,39 @@ module Raingrams
       train_with_ngrams(ngrams_from_text(text))
     end
+    #
+    # Train the model with the contents of the specified _path_.
+    #
+    def train_with_file(path)
+      train_with_text(File.read(path))
+    end
+    #
+    # Train the model with the inner text of the paragraph tags at the
+    # specified _url_.
+    #
+    def train_with_url(url)
+      doc = Hpricot(open(url))
+      return doc.search('p').map do |p|
+        train_with_paragraph(p.inner_text)
+      end
+    end
+    #
+    # Returns the observed frequency of the specified _ngram_ within
+    # the training text.
+    #
+    def frequency_of_ngram(ngram)
+      prefix = ngram.prefix
+      if @prefixes.has_key?(prefix)
+        return @prefixes[prefix].frequency_of(ngram.last)
+      else
+        return 0
+      end
+    end
     #
     # Returns the probability of the specified _ngram_ occurring within
     # arbitrary text.
@@ -444,6 +581,20 @@ module Raingrams
       end
     end
+    #
+    # Returns the observed frequency of the specified _ngrams_ occurring
+    # within the training text.
+    #
+    def frequencies_for(ngrams)
+      table = {}
+      ngrams.each do |ngram|
+        table[ngram] = frequency_of_ngram(ngram)
+      end
+      return table
+    end
     #
     # Returns the probability of the specified _ngrams_ occurring within
     # arbitrary text.
@@ -458,6 +609,16 @@ module Raingrams
       return table
     end
+    #
+    # Returns the total observed frequency of the specified _ngrams_
+    # occurring within the training text.
+    #
+    def frequency_of_ngrams(ngrams)
+      frequencies_for(ngrams).values.inject do |total,freq|
+        total + freq
+      end
+    end
     #
     # Returns the joint probability of the specified _ngrams_ occurring
     # within arbitrary text.
@@ -468,14 +629,6 @@ module Raingrams
       end
     end
-    #
-    # Returns the probably of the specified _gram_ occurring within
-    # arbitrary text.
-    #
-    def probability_of_gram(gram)
-      probability_of_ngrams(ngrams_starting_with(gram))
-    end
     #
     # Returns the probability of the specified _fragment_ occurring within
     # arbitrary text.
@@ -582,9 +735,6 @@ module Raingrams
       grams = []
       last_ngram = @starting_ngram
-      # prime the grams
-      grams += @starting_ngram
       loop do
         next_ngrams = ngrams_prefixed_by(last_ngram.postfix).to_a
         last_ngram = next_ngrams[rand(next_ngrams.length)]
@@ -592,8 +742,11 @@ module Raingrams
         if last_ngram.nil?
           return []
         else
-          grams << last_ngram.last
-          break if last_ngram == @stoping_ngram
+          last_gram = last_ngram.last
+          break if last_gram == Tokens.stop
+          grams << last_gram
         end
       end
@@ -690,6 +843,17 @@ module Raingrams
       return self
     end
+    #
+    # Saves the model to the file at the specified _path_.
+    #
+    def save(path)
+      File.open(path,'w') do |file|
+        Marshal.dump(self,file)
+      end
+      return self
+    end
     protected
     #

data/lib/raingrams/ngram.rb CHANGED Viewed

@@ -70,8 +70,12 @@ module Raingrams
       super(obj.to_gram)
     end
-    def includes?(*grams)
-      (self & grams) == grams
+    def includes_any?(*grams)
+      grams.any? { |gram| include?(gram) }
+    end
+    def includes_all?(*grams)
+      grams.all? { |gram| include?(gram) }
     end
     def flatten

data/lib/raingrams/ngram_set.rb CHANGED Viewed

@@ -35,8 +35,12 @@ module Raingrams
       select { |ngram| ngram.include?(gram) }
     end
-    def includes(*grams)
-      select { |ngram| ngram.includes?(*grams) }
+    def including_any(*grams)
+      select { |ngram| ngram.includes_any?(*grams) }
+    end
+    def including_all(*grams)
+      select { |ngram| ngram.includes_all?(*grams) }
     end
   end

data/lib/raingrams/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Raingrams
-  VERSION = '0.1.0'
+  VERSION = '0.1.1'
 end