raingrams 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +22 -3
- data/LICENSE.txt +1 -1
- data/Manifest.txt +13 -6
- data/README.txt +27 -25
- data/Rakefile +2 -2
- data/lib/raingrams/helpers.rb +5 -0
- data/lib/raingrams/helpers/commonality.rb +67 -0
- data/lib/raingrams/helpers/frequency.rb +43 -0
- data/lib/raingrams/helpers/probability.rb +67 -0
- data/lib/raingrams/helpers/random.rb +122 -0
- data/lib/raingrams/helpers/similarity.rb +38 -0
- data/lib/raingrams/model.rb +30 -304
- data/lib/raingrams/probability_table.rb +9 -0
- data/lib/raingrams/tokens/tokens.rb +35 -0
- data/lib/raingrams/version.rb +1 -1
- data/tasks/spec.rb +2 -0
- metadata +20 -14
    
        data/History.txt
    CHANGED
    
    | @@ -1,4 +1,23 @@ | |
| 1 | 
            -
             | 
| 1 | 
            +
            === 0.1.2 / 2009-04-23
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            * Require nokogiri >= 1.2.0.
         | 
| 4 | 
            +
            * No longer require hpricot.
         | 
| 5 | 
            +
            * Added missing 'lib/raingrams/tokens/tokens.rb' file to the Manifest.
         | 
| 6 | 
            +
            * Added Raingrams::Helpers:
         | 
| 7 | 
            +
              * Moved text commonality calculating methods into
         | 
| 8 | 
            +
                Raingrams::Helpers::Commonality.
         | 
| 9 | 
            +
              * Moved text frequency calculating methods into
         | 
| 10 | 
            +
                Raingrams::Helpers::Frequency.
         | 
| 11 | 
            +
              * Moved text probability calculating methods into
         | 
| 12 | 
            +
                Raingrams::Helpers::Probability.
         | 
| 13 | 
            +
              * Moved random text generating methods into
         | 
| 14 | 
            +
                Raingrams::Helpers::Random.
         | 
| 15 | 
            +
              * Moved text similarity calculating methods into
         | 
| 16 | 
            +
                Raingrams::Helpers::Similarity.
         | 
| 17 | 
            +
            * Added Model#to_hash.
         | 
| 18 | 
            +
            * Capitalize randomly generated sentences if case is ignored.
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            === 0.1.1 / 2008-10-12
         | 
| 2 21 |  | 
| 3 22 | 
             
            * Improved the parsing abilities of Model#parse_sentence and
         | 
| 4 23 | 
             
              Model#parse_text.
         | 
| @@ -26,7 +45,7 @@ | |
| 26 45 | 
             
              * Model#frequencies_of_ngrams.
         | 
| 27 46 | 
             
              * Model#save.
         | 
| 28 47 |  | 
| 29 | 
            -
             | 
| 48 | 
            +
            === 0.1.0 / 2008-10-06
         | 
| 30 49 |  | 
| 31 50 | 
             
            * Various bug fixes.
         | 
| 32 51 | 
             
            * Added NgramSet and ProbabilityTable classes.
         | 
| @@ -35,7 +54,7 @@ | |
| 35 54 | 
             
            * Added random_gram_sentence, random_sentence, random_paragraph and
         | 
| 36 55 | 
             
              random_text methods to the Model class.
         | 
| 37 56 |  | 
| 38 | 
            -
             | 
| 57 | 
            +
            === 0.0.9 / 2008-01-09
         | 
| 39 58 |  | 
| 40 59 | 
             
            * Initial release.
         | 
| 41 60 | 
             
            * Supports all non-zero ngram sizes.
         | 
    
        data/LICENSE.txt
    CHANGED
    
    
    
        data/Manifest.txt
    CHANGED
    
    | @@ -5,27 +5,33 @@ README.txt | |
| 5 5 | 
             
            TODO.txt
         | 
| 6 6 | 
             
            Rakefile
         | 
| 7 7 | 
             
            lib/raingrams.rb
         | 
| 8 | 
            -
            lib/raingrams/version.rb
         | 
| 9 | 
            -
            lib/raingrams/raingrams.rb
         | 
| 10 | 
            -
            lib/raingrams/exceptions/prefix_frequency_missing.rb
         | 
| 11 8 | 
             
            lib/raingrams/exceptions.rb
         | 
| 9 | 
            +
            lib/raingrams/exceptions/prefix_frequency_missing.rb
         | 
| 10 | 
            +
            lib/raingrams/extensions.rb
         | 
| 12 11 | 
             
            lib/raingrams/extensions/object.rb
         | 
| 13 12 | 
             
            lib/raingrams/extensions/string.rb
         | 
| 14 | 
            -
            lib/raingrams/ | 
| 13 | 
            +
            lib/raingrams/tokens.rb
         | 
| 15 14 | 
             
            lib/raingrams/tokens/token.rb
         | 
| 16 15 | 
             
            lib/raingrams/tokens/start_sentence.rb
         | 
| 17 16 | 
             
            lib/raingrams/tokens/stop_sentence.rb
         | 
| 18 17 | 
             
            lib/raingrams/tokens/unknown.rb
         | 
| 19 | 
            -
            lib/raingrams/tokens.rb
         | 
| 18 | 
            +
            lib/raingrams/tokens/tokens.rb
         | 
| 20 19 | 
             
            lib/raingrams/ngram.rb
         | 
| 21 20 | 
             
            lib/raingrams/ngram_set.rb
         | 
| 22 21 | 
             
            lib/raingrams/probability_table.rb
         | 
| 22 | 
            +
            lib/raingrams/helpers.rb
         | 
| 23 | 
            +
            lib/raingrams/helpers/frequency.rb
         | 
| 24 | 
            +
            lib/raingrams/helpers/probability.rb
         | 
| 25 | 
            +
            lib/raingrams/helpers/similarity.rb
         | 
| 26 | 
            +
            lib/raingrams/helpers/commonality.rb
         | 
| 27 | 
            +
            lib/raingrams/helpers/random.rb
         | 
| 23 28 | 
             
            lib/raingrams/model.rb
         | 
| 24 29 | 
             
            lib/raingrams/bigram_model.rb
         | 
| 25 30 | 
             
            lib/raingrams/trigram_model.rb
         | 
| 26 31 | 
             
            lib/raingrams/quadgram_model.rb
         | 
| 27 32 | 
             
            lib/raingrams/pentagram_model.rb
         | 
| 28 33 | 
             
            lib/raingrams/hexagram_model.rb
         | 
| 34 | 
            +
            lib/raingrams/open_vocabulary.rb
         | 
| 29 35 | 
             
            lib/raingrams/open_vocabulary/open_model.rb
         | 
| 30 36 | 
             
            lib/raingrams/open_vocabulary/model.rb
         | 
| 31 37 | 
             
            lib/raingrams/open_vocabulary/bigram_model.rb
         | 
| @@ -33,7 +39,8 @@ lib/raingrams/open_vocabulary/trigram_model.rb | |
| 33 39 | 
             
            lib/raingrams/open_vocabulary/quadgram_model.rb
         | 
| 34 40 | 
             
            lib/raingrams/open_vocabulary/pentagram_model.rb
         | 
| 35 41 | 
             
            lib/raingrams/open_vocabulary/hexagram_model.rb
         | 
| 36 | 
            -
            lib/raingrams/ | 
| 42 | 
            +
            lib/raingrams/version.rb
         | 
| 43 | 
            +
            lib/raingrams/raingrams.rb
         | 
| 37 44 | 
             
            tasks/spec.rb
         | 
| 38 45 | 
             
            spec/training/snowcrash.txt
         | 
| 39 46 | 
             
            spec/helpers/training.rb
         | 
    
        data/README.txt
    CHANGED
    
    | @@ -1,7 +1,8 @@ | |
| 1 1 | 
             
            = Raingrams
         | 
| 2 2 |  | 
| 3 3 | 
             
            * http://raingrams.rubyforge.org/
         | 
| 4 | 
            -
            *  | 
| 4 | 
            +
            * http://github.com/postmodern/raingrams/
         | 
| 5 | 
            +
            * Postmodern (postmodern.mod3 at gmail.com)
         | 
| 5 6 |  | 
| 6 7 | 
             
            == DESCRIPTION:
         | 
| 7 8 |  | 
| @@ -20,7 +21,7 @@ parsing styles and open/closed vocabulary models. | |
| 20 21 |  | 
| 21 22 | 
             
            == REQUIREMENTS:
         | 
| 22 23 |  | 
| 23 | 
            -
            *  | 
| 24 | 
            +
            * {nokogiri}[http://nokogiri.rubyforge.org/] >= 1.2.0
         | 
| 24 25 |  | 
| 25 26 | 
             
            == INSTALL:
         | 
| 26 27 |  | 
| @@ -30,47 +31,48 @@ parsing styles and open/closed vocabulary models. | |
| 30 31 |  | 
| 31 32 | 
             
            * Train a model with ycombinator comments:
         | 
| 32 33 |  | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
| 36 | 
            -
             | 
| 37 | 
            -
             | 
| 38 | 
            -
             | 
| 39 | 
            -
             | 
| 40 | 
            -
             | 
| 41 | 
            -
             | 
| 42 | 
            -
             | 
| 34 | 
            +
                require 'raingrams'
         | 
| 35 | 
            +
                require 'nokogiri'
         | 
| 36 | 
            +
                require 'open-uri'
         | 
| 37 | 
            +
                
         | 
| 38 | 
            +
                include Raingrams
         | 
| 39 | 
            +
                
         | 
| 40 | 
            +
                model = BigramModel.build do |model|
         | 
| 41 | 
            +
                  doc = Nokogiri::HTML(open('http://news.ycombinator.org/newcomments'))
         | 
| 42 | 
            +
                  doc.search('span.comment') do |span|
         | 
| 43 | 
            +
                    model.train_with_text(span.inner_text)
         | 
| 44 | 
            +
                  end
         | 
| 43 45 | 
             
                end
         | 
| 44 | 
            -
              end
         | 
| 45 46 |  | 
| 46 47 | 
             
            * Update a trained model:
         | 
| 47 48 |  | 
| 48 | 
            -
             | 
| 49 | 
            -
             | 
| 50 | 
            -
             | 
| 51 | 
            -
             | 
| 52 | 
            -
             | 
| 53 | 
            -
             | 
| 49 | 
            +
                model.train_with_text %{Interesting videos. Anders talks about
         | 
| 50 | 
            +
                  functional support on .net, concurrency, immutability. Guy Steele
         | 
| 51 | 
            +
                  talks about Fortress on JVM. Too bad they are afraid of macros
         | 
| 52 | 
            +
                  (access to AST), though Steele does say Fortress has some support.}
         | 
| 53 | 
            +
                
         | 
| 54 | 
            +
                model.refresh
         | 
| 54 55 |  | 
| 55 56 | 
             
            * Generate a random sentence:
         | 
| 56 57 |  | 
| 57 | 
            -
             | 
| 58 | 
            -
             | 
| 59 | 
            -
             | 
| 58 | 
            +
                model.random_sentence
         | 
| 59 | 
            +
                # => "OTOOH if you use slicehost even offer to bash Apple makes it will
         | 
| 60 | 
            +
                exit and its 38 month ago based configuration of little networks
         | 
| 61 | 
            +
                created."
         | 
| 60 62 |  | 
| 61 63 | 
             
            * Dump a model to a file, to be marshaled later:
         | 
| 62 64 |  | 
| 63 | 
            -
             | 
| 65 | 
            +
                model.save('path/for/model')
         | 
| 64 66 |  | 
| 65 67 | 
             
            * Load a model from a file:
         | 
| 66 68 |  | 
| 67 | 
            -
             | 
| 69 | 
            +
                Model.open('path/for/model')
         | 
| 68 70 |  | 
| 69 71 | 
             
            == LICENSE:
         | 
| 70 72 |  | 
| 71 73 | 
             
            The MIT License
         | 
| 72 74 |  | 
| 73 | 
            -
            Copyright (c) 2007- | 
| 75 | 
            +
            Copyright (c) 2007-2009 Hal Brodigan
         | 
| 74 76 |  | 
| 75 77 | 
             
            Permission is hereby granted, free of charge, to any person obtaining
         | 
| 76 78 | 
             
            a copy of this software and associated documentation files (the
         | 
    
        data/Rakefile
    CHANGED
    
    | @@ -7,9 +7,9 @@ require './lib/raingrams/version.rb' | |
| 7 7 |  | 
| 8 8 | 
             
            Hoe.new('raingrams', Raingrams::VERSION) do |p|
         | 
| 9 9 | 
             
              p.rubyforge_name = 'raingrams'
         | 
| 10 | 
            -
              p.developer('Postmodern | 
| 10 | 
            +
              p.developer('Postmodern', 'postmodern.mod3@gmail.com')
         | 
| 11 11 | 
             
              p.remote_rdoc_dir = 'docs'
         | 
| 12 | 
            -
              p.extra_deps = [' | 
| 12 | 
            +
              p.extra_deps = [['nokogiri', '>=1.2.0']]
         | 
| 13 13 | 
             
            end
         | 
| 14 14 |  | 
| 15 15 | 
             
            # vim: syntax=Ruby
         | 
| @@ -0,0 +1,67 @@ | |
| 1 | 
            +
            require 'raingrams/helpers/probability'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Raingrams
         | 
| 4 | 
            +
              module Helpers
         | 
| 5 | 
            +
                module Commonality
         | 
| 6 | 
            +
                  def self.included(base)
         | 
| 7 | 
            +
                    base.module_eval { include Raingrams::Helpers::Probability }
         | 
| 8 | 
            +
                  end
         | 
| 9 | 
            +
             | 
| 10 | 
            +
                  #
         | 
| 11 | 
            +
                  # Returns the ngrams which occur within the specified _words_ and
         | 
| 12 | 
            +
                  # within the model.
         | 
| 13 | 
            +
                  #
         | 
| 14 | 
            +
                  def common_ngrams_from_words(words)
         | 
| 15 | 
            +
                    ngrams_from_words(words).select { |ngram| has_ngram?(ngram) }
         | 
| 16 | 
            +
                  end
         | 
| 17 | 
            +
             | 
| 18 | 
            +
                  #
         | 
| 19 | 
            +
                  # Returns the ngrams which occur within the specified _fragment_ and
         | 
| 20 | 
            +
                  # within the model.
         | 
| 21 | 
            +
                  #
         | 
| 22 | 
            +
                  def common_ngrams_from_fragment(fragment)
         | 
| 23 | 
            +
                    ngrams_from_fragment(fragment).select { |ngram| has_ngram?(ngram) }
         | 
| 24 | 
            +
                  end
         | 
| 25 | 
            +
             | 
| 26 | 
            +
                  #
         | 
| 27 | 
            +
                  # Returns the ngrams which occur within the specified _sentence_ and
         | 
| 28 | 
            +
                  # within the model.
         | 
| 29 | 
            +
                  #
         | 
| 30 | 
            +
                  def common_ngrams_from_sentence(sentence)
         | 
| 31 | 
            +
                    ngrams_from_sentence(sentence).select { |ngram| has_ngram?(ngram) }
         | 
| 32 | 
            +
                  end
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                  #
         | 
| 35 | 
            +
                  # Returns the ngrams which occur within the specified _text_ and
         | 
| 36 | 
            +
                  # within the model.
         | 
| 37 | 
            +
                  #
         | 
| 38 | 
            +
                  def common_ngrams_from_text(text)
         | 
| 39 | 
            +
                    ngrams_from_text(text).select { |ngram| has_ngram?(ngram) }
         | 
| 40 | 
            +
                  end
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                  #
         | 
| 43 | 
            +
                  # Returns the joint probability of the common ngrams between the
         | 
| 44 | 
            +
                  # specified _fragment_ and the model.
         | 
| 45 | 
            +
                  #
         | 
| 46 | 
            +
                  def fragment_commonality(fragment)
         | 
| 47 | 
            +
                    probability_of_ngrams(common_ngrams_from_fragment(fragment))
         | 
| 48 | 
            +
                  end
         | 
| 49 | 
            +
             | 
| 50 | 
            +
                  #
         | 
| 51 | 
            +
                  # Returns the joint probability of the common ngrams between the
         | 
| 52 | 
            +
                  # specified _sentence_ and the model.
         | 
| 53 | 
            +
                  #
         | 
| 54 | 
            +
                  def sentence_commonality(sentence)
         | 
| 55 | 
            +
                    probability_of_ngrams(common_ngrams_from_sentence(sentence))
         | 
| 56 | 
            +
                  end
         | 
| 57 | 
            +
             | 
| 58 | 
            +
                  #
         | 
| 59 | 
            +
                  # Returns the joint probability of the common ngrams between the
         | 
| 60 | 
            +
                  # specified _text_ and the model.
         | 
| 61 | 
            +
                  #
         | 
| 62 | 
            +
                  def text_commonality(text)
         | 
| 63 | 
            +
                    probability_of_ngrams(common_ngrams_from_text(text))
         | 
| 64 | 
            +
                  end
         | 
| 65 | 
            +
                end
         | 
| 66 | 
            +
              end
         | 
| 67 | 
            +
            end
         | 
| @@ -0,0 +1,43 @@ | |
| 1 | 
            +
            module Raingrams
         | 
| 2 | 
            +
              module Helpers
         | 
| 3 | 
            +
                module Frequency
         | 
| 4 | 
            +
                  #
         | 
| 5 | 
            +
                  # Returns the observed frequency of the specified _ngram_ within
         | 
| 6 | 
            +
                  # the training text.
         | 
| 7 | 
            +
                  #
         | 
| 8 | 
            +
                  def frequency_of_ngram(ngram)
         | 
| 9 | 
            +
                    prefix = ngram.prefix
         | 
| 10 | 
            +
             | 
| 11 | 
            +
                    if @prefixes.has_key?(prefix)
         | 
| 12 | 
            +
                      return @prefixes[prefix].frequency_of(ngram.last)
         | 
| 13 | 
            +
                    else
         | 
| 14 | 
            +
                      return 0
         | 
| 15 | 
            +
                    end
         | 
| 16 | 
            +
                  end
         | 
| 17 | 
            +
             | 
| 18 | 
            +
                  #
         | 
| 19 | 
            +
                  # Returns the observed frequency of the specified _ngrams_ occurring
         | 
| 20 | 
            +
                  # within the training text.
         | 
| 21 | 
            +
                  #
         | 
| 22 | 
            +
                  def frequencies_for(ngrams)
         | 
| 23 | 
            +
                    table = {}
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                    ngrams.each do |ngram|
         | 
| 26 | 
            +
                      table[ngram] = frequency_of_ngram(ngram)
         | 
| 27 | 
            +
                    end
         | 
| 28 | 
            +
             | 
| 29 | 
            +
                    return table
         | 
| 30 | 
            +
                  end
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                  #
         | 
| 33 | 
            +
                  # Returns the total observed frequency of the specified _ngrams_
         | 
| 34 | 
            +
                  # occurring within the training text.
         | 
| 35 | 
            +
                  #
         | 
| 36 | 
            +
                  def frequency_of_ngrams(ngrams)
         | 
| 37 | 
            +
                    frequencies_for(ngrams).values.inject do |total,freq|
         | 
| 38 | 
            +
                      total + freq
         | 
| 39 | 
            +
                    end
         | 
| 40 | 
            +
                  end
         | 
| 41 | 
            +
                end
         | 
| 42 | 
            +
              end
         | 
| 43 | 
            +
            end
         | 
| @@ -0,0 +1,67 @@ | |
| 1 | 
            +
            module Raingrams
         | 
| 2 | 
            +
              module Helpers
         | 
| 3 | 
            +
                module Probability
         | 
| 4 | 
            +
                  #
         | 
| 5 | 
            +
                  # Returns the probability of the specified _ngram_ occurring within
         | 
| 6 | 
            +
                  # arbitrary text.
         | 
| 7 | 
            +
                  #
         | 
| 8 | 
            +
                  def probability_of_ngram(ngram)
         | 
| 9 | 
            +
                    prefix = ngram.prefix
         | 
| 10 | 
            +
             | 
| 11 | 
            +
                    if @prefixes.has_key?(prefix)
         | 
| 12 | 
            +
                      return @prefixes[prefix].probability_of(ngram.last)
         | 
| 13 | 
            +
                    else
         | 
| 14 | 
            +
                      return 0.0
         | 
| 15 | 
            +
                    end
         | 
| 16 | 
            +
                  end
         | 
| 17 | 
            +
             | 
| 18 | 
            +
                  #
         | 
| 19 | 
            +
                  # Returns the probability of the specified _ngrams_ occurring within
         | 
| 20 | 
            +
                  # arbitrary text.
         | 
| 21 | 
            +
                  #
         | 
| 22 | 
            +
                  def probabilities_for(ngrams)
         | 
| 23 | 
            +
                    table = {}
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                    ngrams.each do |ngram|
         | 
| 26 | 
            +
                      table[ngram] = probability_of_ngram(ngram)
         | 
| 27 | 
            +
                    end
         | 
| 28 | 
            +
             | 
| 29 | 
            +
                    return table
         | 
| 30 | 
            +
                  end
         | 
| 31 | 
            +
             | 
| 32 | 
            +
                  #
         | 
| 33 | 
            +
                  # Returns the joint probability of the specified _ngrams_ occurring
         | 
| 34 | 
            +
                  # within arbitrary text.
         | 
| 35 | 
            +
                  #
         | 
| 36 | 
            +
                  def probability_of_ngrams(ngrams)
         | 
| 37 | 
            +
                    probabilities_for(ngrams).values.inject do |joint,prob|
         | 
| 38 | 
            +
                      joint * prob
         | 
| 39 | 
            +
                    end
         | 
| 40 | 
            +
                  end
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                  #
         | 
| 43 | 
            +
                  # Returns the probability of the specified _fragment_ occurring within
         | 
| 44 | 
            +
                  # arbitrary text.
         | 
| 45 | 
            +
                  #
         | 
| 46 | 
            +
                  def fragment_probability(fragment)
         | 
| 47 | 
            +
                    probability_of_ngrams(ngrams_from_fragment(fragment))
         | 
| 48 | 
            +
                  end
         | 
| 49 | 
            +
             | 
| 50 | 
            +
                  #
         | 
| 51 | 
            +
                  # Returns the probability of the specified _sentence_ occurring within
         | 
| 52 | 
            +
                  # arbitrary text.
         | 
| 53 | 
            +
                  #
         | 
| 54 | 
            +
                  def sentence_probability(sentence)
         | 
| 55 | 
            +
                    probability_of_ngrams(ngrams_from_sentence(sentence))
         | 
| 56 | 
            +
                  end
         | 
| 57 | 
            +
             | 
| 58 | 
            +
                  #
         | 
| 59 | 
            +
                  # Returns the probability of the specified _text_ occurring within
         | 
| 60 | 
            +
                  # arbitrary text.
         | 
| 61 | 
            +
                  #
         | 
| 62 | 
            +
                  def text_probability(text)
         | 
| 63 | 
            +
                    probability_of_ngrams(ngrams_from_text(text))
         | 
| 64 | 
            +
                  end
         | 
| 65 | 
            +
                end
         | 
| 66 | 
            +
              end
         | 
| 67 | 
            +
            end
         | 
| @@ -0,0 +1,122 @@ | |
| 1 | 
            +
            module Raingrams
         | 
| 2 | 
            +
              module Helpers
         | 
| 3 | 
            +
                module Random
         | 
| 4 | 
            +
                  #
         | 
| 5 | 
            +
                  # Returns a random gram from the model.
         | 
| 6 | 
            +
                  #
         | 
| 7 | 
            +
                  def random_gram
         | 
| 8 | 
            +
                    prefix = @prefixes.keys[rand(@prefixes.length)]
         | 
| 9 | 
            +
             | 
| 10 | 
            +
                    return prefix[rand(prefix.length)]
         | 
| 11 | 
            +
                  end
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                  #
         | 
| 14 | 
            +
                  # Returns a random ngram from the model.
         | 
| 15 | 
            +
                  #
         | 
| 16 | 
            +
                  def random_ngram
         | 
| 17 | 
            +
                    prefix_index = rand(@prefixes.length)
         | 
| 18 | 
            +
             | 
| 19 | 
            +
                    prefix = @prefixes.keys[prefix_index]
         | 
| 20 | 
            +
                    table = @prefixes.values[prefix_index]
         | 
| 21 | 
            +
             | 
| 22 | 
            +
                    gram_index = rand(table.grams.length)
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                    return (prefix + table.grams[gram_index])
         | 
| 25 | 
            +
                  end
         | 
| 26 | 
            +
             | 
| 27 | 
            +
                  #
         | 
| 28 | 
            +
                  # Returns a randomly generated sentence of grams using the given
         | 
| 29 | 
            +
                  # _options_.
         | 
| 30 | 
            +
                  #
         | 
| 31 | 
            +
                  def random_gram_sentence(options={})
         | 
| 32 | 
            +
                    grams = []
         | 
| 33 | 
            +
                    last_ngram = @starting_ngram
         | 
| 34 | 
            +
             | 
| 35 | 
            +
                    loop do
         | 
| 36 | 
            +
                      next_ngrams = ngrams_prefixed_by(last_ngram.postfix).to_a
         | 
| 37 | 
            +
                      last_ngram = next_ngrams[rand(next_ngrams.length)]
         | 
| 38 | 
            +
             | 
| 39 | 
            +
                      if last_ngram.nil?
         | 
| 40 | 
            +
                        return []
         | 
| 41 | 
            +
                      else
         | 
| 42 | 
            +
                        last_gram = last_ngram.last
         | 
| 43 | 
            +
             | 
| 44 | 
            +
                        break if last_gram == Tokens.stop
         | 
| 45 | 
            +
             | 
| 46 | 
            +
                        grams << last_gram
         | 
| 47 | 
            +
                      end
         | 
| 48 | 
            +
                    end
         | 
| 49 | 
            +
             | 
| 50 | 
            +
                    return grams
         | 
| 51 | 
            +
                  end
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                  #
         | 
| 54 | 
            +
                  # Returns a randomly generated sentence of text using the given
         | 
| 55 | 
            +
                  # _options_.
         | 
| 56 | 
            +
                  #
         | 
| 57 | 
            +
                  def random_sentence(options={})
         | 
| 58 | 
            +
                    grams = random_gram_sentence(options)
         | 
| 59 | 
            +
                    sentence = grams.delete_if { |gram|
         | 
| 60 | 
            +
                      gram == Tokens.start || gram == Tokens.stop
         | 
| 61 | 
            +
                    }.join(' ')
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                    if @ignore_case
         | 
| 64 | 
            +
                      sentence.capitalize!
         | 
| 65 | 
            +
                    end
         | 
| 66 | 
            +
             | 
| 67 | 
            +
                    if @ignore_punctuation
         | 
| 68 | 
            +
                      sentence << '.'
         | 
| 69 | 
            +
                    end
         | 
| 70 | 
            +
             | 
| 71 | 
            +
                    return sentence
         | 
| 72 | 
            +
                  end
         | 
| 73 | 
            +
             | 
| 74 | 
            +
                  #
         | 
| 75 | 
            +
                  # Returns a randomly generated paragraph of text using the given
         | 
| 76 | 
            +
                  # _options_.
         | 
| 77 | 
            +
                  #
         | 
| 78 | 
            +
                  # _options_ may contain the following keys:
         | 
| 79 | 
            +
                  # <tt>:min_sentences</tt>:: Minimum number of sentences in the
         | 
| 80 | 
            +
                  #                           paragraph. Defaults to 3.
         | 
| 81 | 
            +
                  # <tt>:max_sentences</tt>:: Maximum number of sentences in the
         | 
| 82 | 
            +
                  #                           paragraph. Defaults to 6.
         | 
| 83 | 
            +
                  #
         | 
| 84 | 
            +
                  def random_paragraph(options={})
         | 
| 85 | 
            +
                    min_sentences = (options[:min_sentences] || 3)
         | 
| 86 | 
            +
                    max_sentences = (options[:max_sentences] || 6)
         | 
| 87 | 
            +
                    sentences = []
         | 
| 88 | 
            +
             | 
| 89 | 
            +
                    (rand(max_sentences - min_sentences) + min_sentences).times do
         | 
| 90 | 
            +
                      sentences << random_sentence(options)
         | 
| 91 | 
            +
                    end
         | 
| 92 | 
            +
             | 
| 93 | 
            +
                    return sentences.join(' ')
         | 
| 94 | 
            +
                  end
         | 
| 95 | 
            +
             | 
| 96 | 
            +
                  #
         | 
| 97 | 
            +
                  # Returns randomly generated text using the given _options_.
         | 
| 98 | 
            +
                  #
         | 
| 99 | 
            +
                  # _options_ may contain the following keys:
         | 
| 100 | 
            +
                  # <tt>:min_sentences</tt>:: Minimum number of sentences in the
         | 
| 101 | 
            +
                  #                           paragraph. Defaults to 3.
         | 
| 102 | 
            +
                  # <tt>:max_sentences</tt>:: Maximum number of sentences in the
         | 
| 103 | 
            +
                  #                           paragraph. Defaults to 6.
         | 
| 104 | 
            +
                  # <tt>:min_paragraphs</tt>:: Minimum number of paragraphs in the text.
         | 
| 105 | 
            +
                  #                            Defaults to 3.
         | 
| 106 | 
            +
                  # <tt>:max_paragraphs</tt>:: Maximum number of paragraphs in the text.
         | 
| 107 | 
            +
                  #                            Defaults to 6.
         | 
| 108 | 
            +
                  #
         | 
| 109 | 
            +
                  def random_text(options={})
         | 
| 110 | 
            +
                    min_paragraphs = (options[:min_paragraphs] || 3)
         | 
| 111 | 
            +
                    max_paragraphs = (options[:max_paragraphs] || 6)
         | 
| 112 | 
            +
                    paragraphs = []
         | 
| 113 | 
            +
             | 
| 114 | 
            +
                    (rand(max_paragraphs - min_paragraphs) + min_paragraphs).times do
         | 
| 115 | 
            +
                      paragraphs << random_paragraph(options)
         | 
| 116 | 
            +
                    end
         | 
| 117 | 
            +
             | 
| 118 | 
            +
                    return paragraphs.join("\n\n")
         | 
| 119 | 
            +
                  end
         | 
| 120 | 
            +
                end
         | 
| 121 | 
            +
              end
         | 
| 122 | 
            +
            end
         | 
| @@ -0,0 +1,38 @@ | |
| 1 | 
            +
            require 'raingrams/helpers/commonality'
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Raingrams
         | 
| 4 | 
            +
              module Helpers
         | 
| 5 | 
            +
                module Similarity
         | 
| 6 | 
            +
                  def self.included(base)
         | 
| 7 | 
            +
                    base.module_eval { include Raingrams::Helpers::Commonality }
         | 
| 8 | 
            +
                  end
         | 
| 9 | 
            +
             | 
| 10 | 
            +
                  #
         | 
| 11 | 
            +
                  # Returns the conditional probability of the commonality of the
         | 
| 12 | 
            +
                  # specified _fragment_ against the _other_model_, given the
         | 
| 13 | 
            +
                  # commonality of the _fragment_ against the model.
         | 
| 14 | 
            +
                  #
         | 
| 15 | 
            +
                  def fragment_similarity(fragment,other_model)
         | 
| 16 | 
            +
                    other_model.fragment_commonality(fragment) / fragment_commonality(fragment)
         | 
| 17 | 
            +
                  end
         | 
| 18 | 
            +
             | 
| 19 | 
            +
                  #
         | 
| 20 | 
            +
                  # Returns the conditional probability of the commonality of the
         | 
| 21 | 
            +
                  # specified _sentence_ against the _other_model_, given the
         | 
| 22 | 
            +
                  # commonality of the _sentence_ against the model.
         | 
| 23 | 
            +
                  #
         | 
| 24 | 
            +
                  def sentence_similarity(sentence,other_model)
         | 
| 25 | 
            +
                    other_model.sentence_commonality(sentence) / sentence_commonality(sentence)
         | 
| 26 | 
            +
                  end
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                  #
         | 
| 29 | 
            +
                  # Returns the conditional probability of the commonality of the
         | 
| 30 | 
            +
                  # specified _text_ against the _other_model_, given the commonality
         | 
| 31 | 
            +
                  # of the _text_ against the model.
         | 
| 32 | 
            +
                  #
         | 
| 33 | 
            +
                  def text_similarity(text,other_model)
         | 
| 34 | 
            +
                    other_model.text_commonality(text) / text_commonality(text)
         | 
| 35 | 
            +
                  end
         | 
| 36 | 
            +
                end
         | 
| 37 | 
            +
              end
         | 
| 38 | 
            +
            end
         | 
    
        data/lib/raingrams/model.rb
    CHANGED
    
    | @@ -1,15 +1,22 @@ | |
| 1 1 | 
             
            require 'raingrams/ngram'
         | 
| 2 2 | 
             
            require 'raingrams/ngram_set'
         | 
| 3 | 
            -
            require 'raingrams/probability_table'
         | 
| 4 3 | 
             
            require 'raingrams/tokens'
         | 
| 4 | 
            +
            require 'raingrams/probability_table'
         | 
| 5 | 
            +
            require 'raingrams/helpers'
         | 
| 5 6 |  | 
| 6 7 | 
             
            require 'set'
         | 
| 7 | 
            -
            require ' | 
| 8 | 
            +
            require 'nokogiri'
         | 
| 8 9 | 
             
            require 'open-uri'
         | 
| 9 10 |  | 
| 10 11 | 
             
            module Raingrams
         | 
| 11 12 | 
             
              class Model
         | 
| 12 13 |  | 
| 14 | 
            +
                include Helpers::Frequency
         | 
| 15 | 
            +
                include Helpers::Probability
         | 
| 16 | 
            +
                include Helpers::Similarity
         | 
| 17 | 
            +
                include Helpers::Commonality
         | 
| 18 | 
            +
                include Helpers::Random
         | 
| 19 | 
            +
             | 
| 13 20 | 
             
                # Size of ngrams to use
         | 
| 14 21 | 
             
                attr_reader :ngram_size
         | 
| 15 22 |  | 
| @@ -161,8 +168,12 @@ module Raingrams | |
| 161 168 | 
             
                    sentence.gsub!(/[\.\?!]*$/,'')
         | 
| 162 169 | 
             
                  end
         | 
| 163 170 |  | 
| 171 | 
            +
                  if @ignore_case
         | 
| 172 | 
            +
                    # downcase the sentence
         | 
| 173 | 
            +
                    sentence.downcase!
         | 
| 174 | 
            +
                  end
         | 
| 175 | 
            +
             | 
| 164 176 | 
             
                  if @ignore_urls
         | 
| 165 | 
            -
                    # remove URLs
         | 
| 166 177 | 
             
                    sentence.gsub!(/\s*\w+:\/\/[\w\/\+_\-,:%\d\.\-\?&=]*\s*/,' ')
         | 
| 167 178 | 
             
                  end
         | 
| 168 179 |  | 
| @@ -176,11 +187,6 @@ module Raingrams | |
| 176 187 | 
             
                    sentence.gsub!(/\s*[\(\{\[]\d+[\)\}\]]\s*/,' ')
         | 
| 177 188 | 
             
                  end
         | 
| 178 189 |  | 
| 179 | 
            -
                  if @ignore_case
         | 
| 180 | 
            -
                    # downcase the sentence
         | 
| 181 | 
            -
                    sentence.downcase!
         | 
| 182 | 
            -
                  end
         | 
| 183 | 
            -
             | 
| 184 190 | 
             
                  if @ignore_punctuation
         | 
| 185 191 | 
             
                    # split and ignore punctuation characters
         | 
| 186 192 | 
             
                    return sentence.scan(/\w+[\-_\.:']\w+|\w+/)
         | 
| @@ -194,7 +200,13 @@ module Raingrams | |
| 194 200 | 
             
                # Parses the specified _text_ and returns an Array of sentences.
         | 
| 195 201 | 
             
                #
         | 
| 196 202 | 
             
                def parse_text(text)
         | 
| 197 | 
            -
                  text.to_s | 
| 203 | 
            +
                  text = text.to_s
         | 
| 204 | 
            +
             | 
| 205 | 
            +
                  if @ignore_urls
         | 
| 206 | 
            +
                    text.gsub!(/\s*\w+:\/\/[\w\/\+_\-,:%\d\.\-\?&=]*\s*/,' ')
         | 
| 207 | 
            +
                  end
         | 
| 208 | 
            +
             | 
| 209 | 
            +
                  return text.scan(/[^\s\.\?!][^\.\?!]*[\.\?\!]/)
         | 
| 198 210 | 
             
                end
         | 
| 199 211 |  | 
| 200 212 | 
             
                #
         | 
| @@ -460,38 +472,6 @@ module Raingrams | |
| 460 472 | 
             
                  return gram_set
         | 
| 461 473 | 
             
                end
         | 
| 462 474 |  | 
| 463 | 
            -
                #
         | 
| 464 | 
            -
                # Returns the ngrams which occur within the specified _words_ and
         | 
| 465 | 
            -
                # within the model.
         | 
| 466 | 
            -
                #
         | 
| 467 | 
            -
                def common_ngrams_from_words(words)
         | 
| 468 | 
            -
                  ngrams_from_words(words).select { |ngram| has_ngram?(ngram) }
         | 
| 469 | 
            -
                end
         | 
| 470 | 
            -
             | 
| 471 | 
            -
                #
         | 
| 472 | 
            -
                # Returns the ngrams which occur within the specified _fragment_ and
         | 
| 473 | 
            -
                # within the model.
         | 
| 474 | 
            -
                #
         | 
| 475 | 
            -
                def common_ngrams_from_fragment(fragment)
         | 
| 476 | 
            -
                  ngrams_from_fragment(fragment).select { |ngram| has_ngram?(ngram) }
         | 
| 477 | 
            -
                end
         | 
| 478 | 
            -
             | 
| 479 | 
            -
                #
         | 
| 480 | 
            -
                # Returns the ngrams which occur within the specified _sentence_ and
         | 
| 481 | 
            -
                # within the model.
         | 
| 482 | 
            -
                #
         | 
| 483 | 
            -
                def common_ngrams_from_sentence(sentence)
         | 
| 484 | 
            -
                  ngrams_from_sentence(sentence).select { |ngram| has_ngram?(ngram) }
         | 
| 485 | 
            -
                end
         | 
| 486 | 
            -
             | 
| 487 | 
            -
                #
         | 
| 488 | 
            -
                # Returns the ngrams which occur within the specified _text_ and
         | 
| 489 | 
            -
                # within the model.
         | 
| 490 | 
            -
                #
         | 
| 491 | 
            -
                def common_ngrams_from_text(text)
         | 
| 492 | 
            -
                  ngrams_from_text(text).select { |ngram| has_ngram?(ngram) }
         | 
| 493 | 
            -
                end
         | 
| 494 | 
            -
             | 
| 495 475 | 
             
                #
         | 
| 496 476 | 
             
                # Sets the frequency of the specified _ngram_ to the specified _value_.
         | 
| 497 477 | 
             
                #
         | 
| @@ -524,7 +504,7 @@ module Raingrams | |
| 524 504 | 
             
                # Train the model with the specified _paragraph_.
         | 
| 525 505 | 
             
                #
         | 
| 526 506 | 
             
                def train_with_paragraph(paragraph)
         | 
| 527 | 
            -
                  train_with_ngrams(ngrams_from_paragraph( | 
| 507 | 
            +
                  train_with_ngrams(ngrams_from_paragraph(paragraph))
         | 
| 528 508 | 
             
                end
         | 
| 529 509 |  | 
| 530 510 | 
             
                #
         | 
| @@ -546,274 +526,13 @@ module Raingrams | |
| 546 526 | 
             
                # specified _url_.
         | 
| 547 527 | 
             
                #
         | 
| 548 528 | 
             
                def train_with_url(url)
         | 
| 549 | 
            -
                  doc =  | 
| 529 | 
            +
                  doc = Nokogiri::HTML(open(url))
         | 
| 550 530 |  | 
| 551 531 | 
             
                  return doc.search('p').map do |p|
         | 
| 552 532 | 
             
                    train_with_paragraph(p.inner_text)
         | 
| 553 533 | 
             
                  end
         | 
| 554 534 | 
             
                end
         | 
| 555 535 |  | 
| 556 | 
            -
                #
         | 
| 557 | 
            -
                # Returns the observed frequency of the specified _ngram_ within
         | 
| 558 | 
            -
                # the training text.
         | 
| 559 | 
            -
                #
         | 
| 560 | 
            -
                def frequency_of_ngram(ngram)
         | 
| 561 | 
            -
                  prefix = ngram.prefix
         | 
| 562 | 
            -
             | 
| 563 | 
            -
                  if @prefixes.has_key?(prefix)
         | 
| 564 | 
            -
                    return @prefixes[prefix].frequency_of(ngram.last)
         | 
| 565 | 
            -
                  else
         | 
| 566 | 
            -
                    return 0
         | 
| 567 | 
            -
                  end
         | 
| 568 | 
            -
                end
         | 
| 569 | 
            -
             | 
| 570 | 
            -
                #
         | 
| 571 | 
            -
                # Returns the probability of the specified _ngram_ occurring within
         | 
| 572 | 
            -
                # arbitrary text.
         | 
| 573 | 
            -
                #
         | 
| 574 | 
            -
                def probability_of_ngram(ngram)
         | 
| 575 | 
            -
                  prefix = ngram.prefix
         | 
| 576 | 
            -
             | 
| 577 | 
            -
                  if @prefixes.has_key?(prefix)
         | 
| 578 | 
            -
                    return @prefixes[prefix].probability_of(ngram.last)
         | 
| 579 | 
            -
                  else
         | 
| 580 | 
            -
                    return 0.0
         | 
| 581 | 
            -
                  end
         | 
| 582 | 
            -
                end
         | 
| 583 | 
            -
             | 
| 584 | 
            -
                #
         | 
| 585 | 
            -
                # Returns the observed frequency of the specified _ngrams_ occurring
         | 
| 586 | 
            -
                # within the training text.
         | 
| 587 | 
            -
                #
         | 
| 588 | 
            -
                def frequencies_for(ngrams)
         | 
| 589 | 
            -
                  table = {}
         | 
| 590 | 
            -
             | 
| 591 | 
            -
                  ngrams.each do |ngram|
         | 
| 592 | 
            -
                    table[ngram] = frequency_of_ngram(ngram)
         | 
| 593 | 
            -
                  end
         | 
| 594 | 
            -
             | 
| 595 | 
            -
                  return table
         | 
| 596 | 
            -
                end
         | 
| 597 | 
            -
             | 
| 598 | 
            -
                #
         | 
| 599 | 
            -
                # Returns the probability of the specified _ngrams_ occurring within
         | 
| 600 | 
            -
                # arbitrary text.
         | 
| 601 | 
            -
                #
         | 
| 602 | 
            -
                def probabilities_for(ngrams)
         | 
| 603 | 
            -
                  table = {}
         | 
| 604 | 
            -
             | 
| 605 | 
            -
                  ngrams.each do |ngram|
         | 
| 606 | 
            -
                    table[ngram] = probability_of_ngram(ngram)
         | 
| 607 | 
            -
                  end
         | 
| 608 | 
            -
             | 
| 609 | 
            -
                  return table
         | 
| 610 | 
            -
                end
         | 
| 611 | 
            -
             | 
| 612 | 
            -
                #
         | 
| 613 | 
            -
                # Returns the total observed frequency of the specified _ngrams_
         | 
| 614 | 
            -
                # occurring within the training text.
         | 
| 615 | 
            -
                #
         | 
| 616 | 
            -
                def frequency_of_ngrams(ngrams)
         | 
| 617 | 
            -
                  frequencies_for(ngrams).values.inject do |total,freq|
         | 
| 618 | 
            -
                    total + freq
         | 
| 619 | 
            -
                  end
         | 
| 620 | 
            -
                end
         | 
| 621 | 
            -
             | 
| 622 | 
            -
                #
         | 
| 623 | 
            -
                # Returns the joint probability of the specified _ngrams_ occurring
         | 
| 624 | 
            -
                # within arbitrary text.
         | 
| 625 | 
            -
                #
         | 
| 626 | 
            -
                def probability_of_ngrams(ngrams)
         | 
| 627 | 
            -
                  probabilities_for(ngrams).values.inject do |joint,prob|
         | 
| 628 | 
            -
                    joint * prob
         | 
| 629 | 
            -
                  end
         | 
| 630 | 
            -
                end
         | 
| 631 | 
            -
             | 
| 632 | 
            -
                #
         | 
| 633 | 
            -
                # Returns the probability of the specified _fragment_ occurring within
         | 
| 634 | 
            -
                # arbitrary text.
         | 
| 635 | 
            -
                #
         | 
| 636 | 
            -
                def fragment_probability(fragment)
         | 
| 637 | 
            -
                  probability_of_ngrams(ngrams_from_fragment(fragment))
         | 
| 638 | 
            -
                end
         | 
| 639 | 
            -
             | 
| 640 | 
            -
                #
         | 
| 641 | 
            -
                # Returns the probability of the specified _sentence_ occurring within
         | 
| 642 | 
            -
                # arbitrary text.
         | 
| 643 | 
            -
                #
         | 
| 644 | 
            -
                def sentence_probability(sentence)
         | 
| 645 | 
            -
                  probability_of_ngrams(ngrams_from_sentence(sentence))
         | 
| 646 | 
            -
                end
         | 
| 647 | 
            -
             | 
| 648 | 
            -
                #
         | 
| 649 | 
            -
                # Returns the probability of the specified _text_ occurring within
         | 
| 650 | 
            -
                # arbitrary text.
         | 
| 651 | 
            -
                #
         | 
| 652 | 
            -
                def text_probability(text)
         | 
| 653 | 
            -
                  probability_of_ngrams(ngrams_from_text(text))
         | 
| 654 | 
            -
                end
         | 
| 655 | 
            -
             | 
| 656 | 
            -
                #
         | 
| 657 | 
            -
                # Returns the joint probability of the common ngrams between the
         | 
| 658 | 
            -
                # specified _fragment_ and the model.
         | 
| 659 | 
            -
                #
         | 
| 660 | 
            -
                def fragment_commonality(fragment)
         | 
| 661 | 
            -
                  probability_of_ngrams(common_ngrams_from_fragment(fragment))
         | 
| 662 | 
            -
                end
         | 
| 663 | 
            -
             | 
| 664 | 
            -
                #
         | 
| 665 | 
            -
                # Returns the joint probability of the common ngrams between the
         | 
| 666 | 
            -
                # specified _sentence_ and the model.
         | 
| 667 | 
            -
                #
         | 
| 668 | 
            -
                def sentence_commonality(sentence)
         | 
| 669 | 
            -
                  probability_of_ngrams(common_ngrams_from_sentence(sentence))
         | 
| 670 | 
            -
                end
         | 
| 671 | 
            -
             | 
| 672 | 
            -
                #
         | 
| 673 | 
            -
                # Returns the joint probability of the common ngrams between the
         | 
| 674 | 
            -
                # specified _text_ and the model.
         | 
| 675 | 
            -
                #
         | 
| 676 | 
            -
                def text_commonality(text)
         | 
| 677 | 
            -
                  probability_of_ngrams(common_ngrams_from_text(text))
         | 
| 678 | 
            -
                end
         | 
| 679 | 
            -
             | 
| 680 | 
            -
                #
         | 
| 681 | 
            -
                # Returns the conditional probability of the commonality of the
         | 
| 682 | 
            -
                # specified _fragment_ against the _other_model_, given the commonality
         | 
| 683 | 
            -
                # of the _fragment_ against the model.
         | 
| 684 | 
            -
                #
         | 
| 685 | 
            -
                def fragment_similarity(fragment,other_model)
         | 
| 686 | 
            -
                  other_model.fragment_commonality(fragment) / fragment_commonality(fragment)
         | 
| 687 | 
            -
                end
         | 
| 688 | 
            -
             | 
| 689 | 
            -
                #
         | 
| 690 | 
            -
                # Returns the conditional probability of the commonality of the
         | 
| 691 | 
            -
                # specified _sentence_ against the _other_model_, given the commonality
         | 
| 692 | 
            -
                # of the _sentence_ against the model.
         | 
| 693 | 
            -
                #
         | 
| 694 | 
            -
                def sentence_similarity(sentence,other_model)
         | 
| 695 | 
            -
                  other_model.sentence_commonality(sentence) / sentence_commonality(sentence)
         | 
| 696 | 
            -
                end
         | 
| 697 | 
            -
             | 
| 698 | 
            -
                #
         | 
| 699 | 
            -
                # Returns the conditional probability of the commonality of the
         | 
| 700 | 
            -
                # specified _text_ against the _other_model_, given the commonality
         | 
| 701 | 
            -
                # of the _text_ against the model.
         | 
| 702 | 
            -
                #
         | 
| 703 | 
            -
                def text_similarity(text,other_model)
         | 
| 704 | 
            -
                  other_model.text_commonality(text) / text_commonality(text)
         | 
| 705 | 
            -
                end
         | 
| 706 | 
            -
             | 
| 707 | 
            -
                #
         | 
| 708 | 
            -
                # Returns a random gram from the model.
         | 
| 709 | 
            -
                #
         | 
| 710 | 
            -
                def random_gram
         | 
| 711 | 
            -
                  prefix = @prefixes.keys[rand(@prefixes.length)]
         | 
| 712 | 
            -
             | 
| 713 | 
            -
                  return prefix[rand(prefix.length)]
         | 
| 714 | 
            -
                end
         | 
| 715 | 
            -
             | 
| 716 | 
            -
                #
         | 
| 717 | 
            -
                # Returns a random ngram from the model.
         | 
| 718 | 
            -
                #
         | 
| 719 | 
            -
                def random_ngram
         | 
| 720 | 
            -
                  prefix_index = rand(@prefixes.length)
         | 
| 721 | 
            -
             | 
| 722 | 
            -
                  prefix = @prefixes.keys[prefix_index]
         | 
| 723 | 
            -
                  table = @prefixes.values[prefix_index]
         | 
| 724 | 
            -
             | 
| 725 | 
            -
                  gram_index = rand(table.grams.length)
         | 
| 726 | 
            -
             | 
| 727 | 
            -
                  return (prefix + table.grams[gram_index])
         | 
| 728 | 
            -
                end
         | 
| 729 | 
            -
             | 
| 730 | 
            -
                #
         | 
| 731 | 
            -
                # Returns a randomly generated sentence of grams using the given
         | 
| 732 | 
            -
                # _options_.
         | 
| 733 | 
            -
                #
         | 
| 734 | 
            -
                def random_gram_sentence(options={})
         | 
| 735 | 
            -
                  grams = []
         | 
| 736 | 
            -
                  last_ngram = @starting_ngram
         | 
| 737 | 
            -
                  
         | 
| 738 | 
            -
                  loop do
         | 
| 739 | 
            -
                    next_ngrams = ngrams_prefixed_by(last_ngram.postfix).to_a
         | 
| 740 | 
            -
                    last_ngram = next_ngrams[rand(next_ngrams.length)]
         | 
| 741 | 
            -
             | 
| 742 | 
            -
                    if last_ngram.nil?
         | 
| 743 | 
            -
                      return []
         | 
| 744 | 
            -
                    else
         | 
| 745 | 
            -
                      last_gram = last_ngram.last
         | 
| 746 | 
            -
             | 
| 747 | 
            -
                      break if last_gram == Tokens.stop
         | 
| 748 | 
            -
             | 
| 749 | 
            -
                      grams << last_gram
         | 
| 750 | 
            -
                    end
         | 
| 751 | 
            -
                  end
         | 
| 752 | 
            -
             | 
| 753 | 
            -
                  return grams
         | 
| 754 | 
            -
                end
         | 
| 755 | 
            -
             | 
| 756 | 
            -
                #
         | 
| 757 | 
            -
                # Returns a randomly generated sentence of text using the given
         | 
| 758 | 
            -
                # _options_.
         | 
| 759 | 
            -
                #
         | 
| 760 | 
            -
                def random_sentence(options={})
         | 
| 761 | 
            -
                  grams = random_gram_sentence(options)
         | 
| 762 | 
            -
                  sentence = grams.delete_if { |gram|
         | 
| 763 | 
            -
                    gram == Tokens.start || gram == Tokens.stop
         | 
| 764 | 
            -
                  }.join(' ')
         | 
| 765 | 
            -
             | 
| 766 | 
            -
                  sentence << '.' if @ignore_punctuation
         | 
| 767 | 
            -
                  return sentence
         | 
| 768 | 
            -
                end
         | 
| 769 | 
            -
             | 
| 770 | 
            -
                #
         | 
| 771 | 
            -
                # Returns a randomly generated paragraph of text using the given
         | 
| 772 | 
            -
                # _options_.
         | 
| 773 | 
            -
                #
         | 
| 774 | 
            -
                # _options_ may contain the following keys:
         | 
| 775 | 
            -
                # <tt>:min_sentences</tt>:: Minimum number of sentences in the
         | 
| 776 | 
            -
                #                           paragraph. Defaults to 3.
         | 
| 777 | 
            -
                # <tt>:max_sentences</tt>:: Maximum number of sentences in the
         | 
| 778 | 
            -
                #                           paragraph. Defaults to 6.
         | 
| 779 | 
            -
                #
         | 
| 780 | 
            -
                def random_paragraph(options={})
         | 
| 781 | 
            -
                  min_sentences = (options[:min_sentences] || 3)
         | 
| 782 | 
            -
                  max_sentences = (options[:max_sentences] || 6)
         | 
| 783 | 
            -
                  sentences = []
         | 
| 784 | 
            -
             | 
| 785 | 
            -
                  (rand(max_sentences - min_sentences) + min_sentences).times do
         | 
| 786 | 
            -
                    sentences << random_sentence(options)
         | 
| 787 | 
            -
                  end
         | 
| 788 | 
            -
             | 
| 789 | 
            -
                  return sentences.join(' ')
         | 
| 790 | 
            -
                end
         | 
| 791 | 
            -
             | 
| 792 | 
            -
                #
         | 
| 793 | 
            -
                # Returns randomly generated text using the given _options_.
         | 
| 794 | 
            -
                #
         | 
| 795 | 
            -
                # _options_ may contain the following keys:
         | 
| 796 | 
            -
                # <tt>:min_sentences</tt>:: Minimum number of sentences in the
         | 
| 797 | 
            -
                #                           paragraph. Defaults to 3.
         | 
| 798 | 
            -
                # <tt>:max_sentences</tt>:: Maximum number of sentences in the
         | 
| 799 | 
            -
                #                           paragraph. Defaults to 6.
         | 
| 800 | 
            -
                # <tt>:min_paragraphs</tt>:: Minimum number of paragraphs in the text.
         | 
| 801 | 
            -
                #                            Defaults to 3.
         | 
| 802 | 
            -
                # <tt>:max_paragraphs</tt>:: Maximum number of paragraphs in the text.
         | 
| 803 | 
            -
                #                            Defaults to 5.
         | 
| 804 | 
            -
                #
         | 
| 805 | 
            -
                def random_text(options={})
         | 
| 806 | 
            -
                  min_paragraphs = (options[:min_paragraphs] || 3)
         | 
| 807 | 
            -
                  max_paragraphs = (options[:max_paragraphs] || 6)
         | 
| 808 | 
            -
                  paragraphs = []
         | 
| 809 | 
            -
             | 
| 810 | 
            -
                  (rand(max_paragraphs - min_paragraphs) + min_paragraphs).times do
         | 
| 811 | 
            -
                    paragraphs << random_paragraph(options)
         | 
| 812 | 
            -
                  end
         | 
| 813 | 
            -
             | 
| 814 | 
            -
                  return paragraphs.join("\n\n")
         | 
| 815 | 
            -
                end
         | 
| 816 | 
            -
             | 
| 817 536 | 
             
                #
         | 
| 818 537 | 
             
                # Refreshes the probability tables of the model.
         | 
| 819 538 | 
             
                #
         | 
| @@ -854,6 +573,13 @@ module Raingrams | |
| 854 573 | 
             
                  return self
         | 
| 855 574 | 
             
                end
         | 
| 856 575 |  | 
| 576 | 
            +
                #
         | 
| 577 | 
            +
                # Returns a Hash representation of the model.
         | 
| 578 | 
            +
                #
         | 
| 579 | 
            +
                def to_hash
         | 
| 580 | 
            +
                  @prefixes
         | 
| 581 | 
            +
                end
         | 
| 582 | 
            +
             | 
| 857 583 | 
             
                protected
         | 
| 858 584 |  | 
| 859 585 | 
             
                #
         | 
| @@ -141,6 +141,15 @@ module Raingrams | |
| 141 141 | 
             
                  return self
         | 
| 142 142 | 
             
                end
         | 
| 143 143 |  | 
| 144 | 
            +
                #
         | 
| 145 | 
            +
                # Returns a Hash representation of the probability table.
         | 
| 146 | 
            +
                #
         | 
| 147 | 
            +
                def to_hash
         | 
| 148 | 
            +
                  build
         | 
| 149 | 
            +
             | 
| 150 | 
            +
                  return @probabilities
         | 
| 151 | 
            +
                end
         | 
| 152 | 
            +
             | 
| 144 153 | 
             
                def inspect
         | 
| 145 154 | 
             
                  if @dirty
         | 
| 146 155 | 
             
                    "#<ProbabilityTable @total=#{@total} @frequencies=#{@frequencies.inspect}>"
         | 
| @@ -0,0 +1,35 @@ | |
| 1 | 
            +
            require 'raingrams/tokens/start_sentence'
         | 
| 2 | 
            +
            require 'raingrams/tokens/stop_sentence'
         | 
| 3 | 
            +
            require 'raingrams/tokens/unknown'
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            module Raingrams
         | 
| 6 | 
            +
              module Tokens
         | 
| 7 | 
            +
                #
         | 
| 8 | 
            +
                # Returns all defined tokens.
         | 
| 9 | 
            +
                #
         | 
| 10 | 
            +
                def Tokens.all
         | 
| 11 | 
            +
                  @@raingram_tokens ||= {}
         | 
| 12 | 
            +
                end
         | 
| 13 | 
            +
             | 
| 14 | 
            +
                #
         | 
| 15 | 
            +
                # Returns the start sentence token.
         | 
| 16 | 
            +
                #
         | 
| 17 | 
            +
                def Tokens.start
         | 
| 18 | 
            +
                  Tokens.all[:start] ||= StartSentence.new
         | 
| 19 | 
            +
                end
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                #
         | 
| 22 | 
            +
                # Returns the stop sentence token.
         | 
| 23 | 
            +
                #
         | 
| 24 | 
            +
                def Tokens.stop
         | 
| 25 | 
            +
                  Tokens.all[:stop] ||= StopSentence.new
         | 
| 26 | 
            +
                end
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                #
         | 
| 29 | 
            +
                # Returns the unknown word token.
         | 
| 30 | 
            +
                #
         | 
| 31 | 
            +
                def Tokens.unknown
         | 
| 32 | 
            +
                  Tokens.all[:unknown] ||= Unknown.new
         | 
| 33 | 
            +
                end
         | 
| 34 | 
            +
              end
         | 
| 35 | 
            +
            end
         | 
    
        data/lib/raingrams/version.rb
    CHANGED
    
    
    
        data/tasks/spec.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | @@ -1,26 +1,26 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification 
         | 
| 2 2 | 
             
            name: raingrams
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 4 | 
            -
              version: 0.1.1
         | 
| 4 | 
            +
              version: 0.1.2
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors: 
         | 
| 7 | 
            -
            - Postmodern | 
| 7 | 
            +
            - Postmodern
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 11 |  | 
| 12 | 
            -
            date:  | 
| 12 | 
            +
            date: 2009-04-23 00:00:00 -07:00
         | 
| 13 13 | 
             
            default_executable: 
         | 
| 14 14 | 
             
            dependencies: 
         | 
| 15 15 | 
             
            - !ruby/object:Gem::Dependency 
         | 
| 16 | 
            -
              name: hpricot
         | 
| 16 | 
            +
              name: nokogiri
         | 
| 17 17 | 
             
              type: :runtime
         | 
| 18 18 | 
             
              version_requirement: 
         | 
| 19 19 | 
             
              version_requirements: !ruby/object:Gem::Requirement 
         | 
| 20 20 | 
             
                requirements: 
         | 
| 21 21 | 
             
                - - ">="
         | 
| 22 22 | 
             
                  - !ruby/object:Gem::Version 
         | 
| 23 | 
            -
                    version:  | 
| 23 | 
            +
                    version: 1.2.0
         | 
| 24 24 | 
             
                version: 
         | 
| 25 25 | 
             
            - !ruby/object:Gem::Dependency 
         | 
| 26 26 | 
             
              name: hoe
         | 
| @@ -30,7 +30,7 @@ dependencies: | |
| 30 30 | 
             
                requirements: 
         | 
| 31 31 | 
             
                - - ">="
         | 
| 32 32 | 
             
                  - !ruby/object:Gem::Version 
         | 
| 33 | 
            -
                    version: 1. | 
| 33 | 
            +
                    version: 1.12.2
         | 
| 34 34 | 
             
                version: 
         | 
| 35 35 | 
             
            description: Raingrams is a flexible and general-purpose ngrams library written in Ruby. Raingrams supports ngram sizes greater than 1, text/non-text grams, multiple parsing styles and open/closed vocabulary models.
         | 
| 36 36 | 
             
            email: 
         | 
| @@ -45,7 +45,6 @@ extra_rdoc_files: | |
| 45 45 | 
             
            - Manifest.txt
         | 
| 46 46 | 
             
            - README.txt
         | 
| 47 47 | 
             
            - TODO.txt
         | 
| 48 | 
            -
            - spec/training/snowcrash.txt
         | 
| 49 48 | 
             
            files: 
         | 
| 50 49 | 
             
            - History.txt
         | 
| 51 50 | 
             
            - LICENSE.txt
         | 
| @@ -54,27 +53,33 @@ files: | |
| 54 53 | 
             
            - TODO.txt
         | 
| 55 54 | 
             
            - Rakefile
         | 
| 56 55 | 
             
            - lib/raingrams.rb
         | 
| 57 | 
            -
            - lib/raingrams/version.rb
         | 
| 58 | 
            -
            - lib/raingrams/raingrams.rb
         | 
| 59 | 
            -
            - lib/raingrams/exceptions/prefix_frequency_missing.rb
         | 
| 60 56 | 
             
            - lib/raingrams/exceptions.rb
         | 
| 57 | 
            +
            - lib/raingrams/exceptions/prefix_frequency_missing.rb
         | 
| 58 | 
            +
            - lib/raingrams/extensions.rb
         | 
| 61 59 | 
             
            - lib/raingrams/extensions/object.rb
         | 
| 62 60 | 
             
            - lib/raingrams/extensions/string.rb
         | 
| 63 | 
            -
            - lib/raingrams/extensions.rb
         | 
| 61 | 
            +
            - lib/raingrams/tokens.rb
         | 
| 64 62 | 
             
            - lib/raingrams/tokens/token.rb
         | 
| 65 63 | 
             
            - lib/raingrams/tokens/start_sentence.rb
         | 
| 66 64 | 
             
            - lib/raingrams/tokens/stop_sentence.rb
         | 
| 67 65 | 
             
            - lib/raingrams/tokens/unknown.rb
         | 
| 68 | 
            -
            - lib/raingrams/tokens.rb
         | 
| 66 | 
            +
            - lib/raingrams/tokens/tokens.rb
         | 
| 69 67 | 
             
            - lib/raingrams/ngram.rb
         | 
| 70 68 | 
             
            - lib/raingrams/ngram_set.rb
         | 
| 71 69 | 
             
            - lib/raingrams/probability_table.rb
         | 
| 70 | 
            +
            - lib/raingrams/helpers.rb
         | 
| 71 | 
            +
            - lib/raingrams/helpers/frequency.rb
         | 
| 72 | 
            +
            - lib/raingrams/helpers/probability.rb
         | 
| 73 | 
            +
            - lib/raingrams/helpers/similarity.rb
         | 
| 74 | 
            +
            - lib/raingrams/helpers/commonality.rb
         | 
| 75 | 
            +
            - lib/raingrams/helpers/random.rb
         | 
| 72 76 | 
             
            - lib/raingrams/model.rb
         | 
| 73 77 | 
             
            - lib/raingrams/bigram_model.rb
         | 
| 74 78 | 
             
            - lib/raingrams/trigram_model.rb
         | 
| 75 79 | 
             
            - lib/raingrams/quadgram_model.rb
         | 
| 76 80 | 
             
            - lib/raingrams/pentagram_model.rb
         | 
| 77 81 | 
             
            - lib/raingrams/hexagram_model.rb
         | 
| 82 | 
            +
            - lib/raingrams/open_vocabulary.rb
         | 
| 78 83 | 
             
            - lib/raingrams/open_vocabulary/open_model.rb
         | 
| 79 84 | 
             
            - lib/raingrams/open_vocabulary/model.rb
         | 
| 80 85 | 
             
            - lib/raingrams/open_vocabulary/bigram_model.rb
         | 
| @@ -82,7 +87,8 @@ files: | |
| 82 87 | 
             
            - lib/raingrams/open_vocabulary/quadgram_model.rb
         | 
| 83 88 | 
             
            - lib/raingrams/open_vocabulary/pentagram_model.rb
         | 
| 84 89 | 
             
            - lib/raingrams/open_vocabulary/hexagram_model.rb
         | 
| 85 | 
            -
            - lib/raingrams/open_vocabulary.rb
         | 
| 90 | 
            +
            - lib/raingrams/version.rb
         | 
| 91 | 
            +
            - lib/raingrams/raingrams.rb
         | 
| 86 92 | 
             
            - tasks/spec.rb
         | 
| 87 93 | 
             
            - spec/training/snowcrash.txt
         | 
| 88 94 | 
             
            - spec/helpers/training.rb
         | 
| @@ -121,7 +127,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 121 127 | 
             
            requirements: []
         | 
| 122 128 |  | 
| 123 129 | 
             
            rubyforge_project: raingrams
         | 
| 124 | 
            -
            rubygems_version: 1.3. | 
| 130 | 
            +
            rubygems_version: 1.3.1
         | 
| 125 131 | 
             
            signing_key: 
         | 
| 126 132 | 
             
            specification_version: 2
         | 
| 127 133 | 
             
            summary: Raingrams is a flexible and general-purpose ngrams library written in Ruby
         |