raingrams 0.0.9 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +9 -0
- data/Manifest.txt +10 -10
- data/README.txt +9 -7
- data/Rakefile +3 -6
- data/TODO.txt +6 -0
- data/lib/raingrams/bigram_model.rb +3 -7
- data/lib/raingrams/extensions/object.rb +4 -1
- data/lib/raingrams/extensions/string.rb +3 -0
- data/lib/raingrams/extensions.rb +0 -5
- data/lib/raingrams/hexagram_model.rb +3 -7
- data/lib/raingrams/model.rb +622 -61
- data/lib/raingrams/ngram.rb +50 -9
- data/lib/raingrams/ngram_set.rb +43 -0
- data/lib/raingrams/open_vocabulary/model.rb +12 -0
- data/lib/raingrams/open_vocabulary/open_model.rb +8 -4
- data/lib/raingrams/open_vocabulary.rb +0 -1
- data/lib/raingrams/pentagram_model.rb +3 -7
- data/lib/raingrams/probability_table.rb +153 -0
- data/lib/raingrams/quadgram_model.rb +3 -7
- data/lib/raingrams/raingrams.rb +10 -20
- data/lib/raingrams/tokens/start_sentence.rb +2 -2
- data/lib/raingrams/tokens/stop_sentence.rb +2 -2
- data/lib/raingrams/tokens/token.rb +49 -5
- data/lib/raingrams/tokens/unknown.rb +2 -2
- data/lib/raingrams/tokens.rb +1 -0
- data/lib/raingrams/trigram_model.rb +3 -7
- data/lib/raingrams/version.rb +1 -1
- data/lib/raingrams.rb +1 -1
- data/spec/ngram_set_spec.rb +54 -0
- data/spec/ngram_spec.rb +29 -0
- data/spec/probability_table_spec.rb +94 -0
- data/spec/raingrams_spec.rb +9 -0
- data/spec/spec_helper.rb +5 -0
- data/tasks/spec.rb +7 -0
- metadata +65 -55
- data/lib/raingrams/extensions/class.rb +0 -7
- data/lib/raingrams/extensions/false_class.rb +0 -7
- data/lib/raingrams/extensions/nil_class.rb +0 -7
- data/lib/raingrams/extensions/symbol.rb +0 -7
- data/lib/raingrams/extensions/true_class.rb +0 -7
- data/lib/raingrams/multigram_model.rb +0 -165
- data/lib/raingrams/open_vocabulary/multigram_model.rb +0 -12
- data/lib/raingrams/open_vocabulary/unigram_model.rb +0 -12
- data/lib/raingrams/unigram_model.rb +0 -70
- data/test/test_raingrams.rb +0 -0
    
        data/lib/raingrams/model.rb
    CHANGED
    
    | @@ -1,7 +1,9 @@ | |
| 1 1 | 
             
            require 'raingrams/ngram'
         | 
| 2 | 
            -
            require 'raingrams/ | 
| 3 | 
            -
            require 'raingrams/ | 
| 4 | 
            -
            require 'raingrams/ | 
| 2 | 
            +
            require 'raingrams/ngram_set'
         | 
| 3 | 
            +
            require 'raingrams/probability_table'
         | 
| 4 | 
            +
            require 'raingrams/tokens'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            require 'set'
         | 
| 5 7 |  | 
| 6 8 | 
             
            module Raingrams
         | 
| 7 9 | 
             
              class Model
         | 
| @@ -9,11 +11,17 @@ module Raingrams | |
| 9 11 | 
             
                # Size of ngrams to use
         | 
| 10 12 | 
             
                attr_reader :ngram_size
         | 
| 11 13 |  | 
| 14 | 
            +
                # The sentence starting ngram
         | 
| 15 | 
            +
                attr_reader :starting_ngram
         | 
| 16 | 
            +
             | 
| 17 | 
            +
                # The sentence stopping ngram
         | 
| 18 | 
            +
                attr_reader :stoping_ngram
         | 
| 19 | 
            +
             | 
| 12 20 | 
             
                # Ignore case of parsed text
         | 
| 13 21 | 
             
                attr_reader :ignore_case
         | 
| 14 22 |  | 
| 15 23 | 
             
                # Ignore the punctuation of parsed text
         | 
| 16 | 
            -
                attr_reader : | 
| 24 | 
            +
                attr_reader :ignore_punctuation
         | 
| 17 25 |  | 
| 18 26 | 
             
                # Ignore URLs
         | 
| 19 27 | 
             
                attr_reader :ignore_urls
         | 
| @@ -24,138 +32,691 @@ module Raingrams | |
| 24 32 | 
             
                # Ignore References
         | 
| 25 33 | 
             
                attr_reader :ignore_references
         | 
| 26 34 |  | 
| 27 | 
            -
                #  | 
| 28 | 
            -
                attr_reader : | 
| 29 | 
            -
             | 
| 30 | 
            -
                # | 
| 31 | 
            -
                 | 
| 35 | 
            +
                # Probabilities of all (n-1) grams
         | 
| 36 | 
            +
                attr_reader :prefixes
         | 
| 37 | 
            +
             | 
| 38 | 
            +
                #
         | 
| 39 | 
            +
                # Creates a new NgramModel with the specified _options_.
         | 
| 40 | 
            +
                #
         | 
| 41 | 
            +
                # _options_ must contain the following keys:
         | 
| 42 | 
            +
                # <tt>:ngram_size</tt>:: The size of each gram.
         | 
| 43 | 
            +
                #
         | 
| 44 | 
            +
                # _options_ may contain the following keys:
         | 
| 45 | 
            +
                # <tt>:ignore_case</tt>:: Defaults to +false+.
         | 
| 46 | 
            +
                # <tt>:ignore_punctuation</tt>:: Defaults to +true+.
         | 
| 47 | 
            +
                # <tt>:ignore_urls</tt>:: Defaults to +true+.
         | 
| 48 | 
            +
                # <tt>:ignore_phone_numbers</tt>:: Defaults to +false+.
         | 
| 49 | 
            +
                #
         | 
| 50 | 
            +
                def initialize(options={},&block)
         | 
| 51 | 
            +
                  @ngram_size = options[:ngram_size]
         | 
| 52 | 
            +
                  @starting_ngram = Ngram.new(Tokens.start * @ngram_size)
         | 
| 53 | 
            +
                  @stoping_ngram = Ngram.new(Tokens.stop * @ngram_size)
         | 
| 54 | 
            +
             | 
| 55 | 
            +
                  @ignore_case = false
         | 
| 56 | 
            +
                  @ignore_punctuation = true
         | 
| 57 | 
            +
                  @ignore_urls = true
         | 
| 58 | 
            +
                  @ignore_phone_numbers = false
         | 
| 59 | 
            +
             | 
| 60 | 
            +
                  if options.has_key?(:ignore_case)
         | 
| 61 | 
            +
                    @ignore_case = options[:ignore_case]
         | 
| 62 | 
            +
                  end
         | 
| 32 63 |  | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 64 | 
            +
                  if options.has_key?(:ignore_punctuation)
         | 
| 65 | 
            +
                    @ignore_punctuation = options[:ignore_punctuation]
         | 
| 66 | 
            +
                  end
         | 
| 35 67 |  | 
| 36 | 
            -
             | 
| 37 | 
            -
             | 
| 68 | 
            +
                  if options.has_key?(:ignore_urls)
         | 
| 69 | 
            +
                    @ignore_urls = options[:ignore_urls]
         | 
| 70 | 
            +
                  end
         | 
| 38 71 |  | 
| 39 | 
            -
             | 
| 40 | 
            -
             | 
| 41 | 
            -
                   | 
| 42 | 
            -
                  @ignore_punc = opts[:ignore_punc] || true
         | 
| 43 | 
            -
                  @ignore_urls = opts[:ignore_urls] || false
         | 
| 44 | 
            -
                  @ignore_phone_numbers = opts[:ignore_phone_numbers] || false
         | 
| 45 | 
            -
                  @convert_acronyms = opts[:convert_acronyms] || false
         | 
| 46 | 
            -
                  @convert_abbrev = opts[:convert_abbrev] || false
         | 
| 72 | 
            +
                  if options.has_key?(:ignore_phone_numbers)
         | 
| 73 | 
            +
                    @ignore_phone_numbers = options[:ignore_phone_numbers]
         | 
| 74 | 
            +
                  end
         | 
| 47 75 |  | 
| 48 | 
            -
                  @ | 
| 49 | 
            -
                  @probability = Hash.new { |hash,key| 0.0 }
         | 
| 76 | 
            +
                  @prefixes = {}
         | 
| 50 77 |  | 
| 51 78 | 
             
                  block.call(self) if block
         | 
| 52 79 | 
             
                end
         | 
| 53 80 |  | 
| 81 | 
            +
                #
         | 
| 82 | 
            +
                # Creates a new NgramModel object with the given _options_. If a
         | 
| 83 | 
            +
                # _block_ is given, it will be passed the newly created model.
         | 
| 84 | 
            +
                #
         | 
| 85 | 
            +
                def self.build(options={},&block)
         | 
| 86 | 
            +
                  self.new(options) do |model|
         | 
| 87 | 
            +
                    model.build(&block)
         | 
| 88 | 
            +
                  end
         | 
| 89 | 
            +
                end
         | 
| 90 | 
            +
             | 
| 91 | 
            +
                #
                # Parses the specified _sentence_ and returns an Array of tokens.
                #
                # One trailing <tt>.</tt>, <tt>?</tt> or <tt>!</tt> is removed first.
                # URLs, phone numbers and <tt>[n]</tt> style references are replaced
                # by a single space, and the text is downcased, each only when the
                # matching <tt>@ignore_*</tt> flag is set. The remaining text is then
                # split into tokens, with or without punctuation depending on
                # <tt>@ignore_punctuation</tt>.
                #
                def parse_sentence(sentence)
                  # eat trailing punctuation; gsub returns a copy, so the caller's
                  # String is never modified by the gsub!/downcase! calls below
                  sentence = sentence.to_s.gsub(/[\.\?!]$/,'')

                  if @ignore_urls
                    # remove URLs
                    sentence.gsub!(/\s*\w+:\/\/[\w\/,\._\-%\?&=]*\s*/,' ')
                  end

                  if @ignore_phone_numbers
                    # remove phone numbers
                    sentence.gsub!(/\s*(\d-)?(\d{3}-)?\d{3}-\d{4}\s*/,' ')
                  end

                  if @ignore_references
                    # remove RFC style references
                    sentence.gsub!(/\s*\[\d+\]\s*/,' ')
                  end

                  if @ignore_case
                    # downcase the sentence
                    sentence.downcase!
                  end

                  if @ignore_punctuation
                    # split and ignore punctuation characters; the joining character
                    # and the word characters after it are optional as a pair, so
                    # that one-letter words such as "a" or "I" are kept
                    return sentence.scan(/\w+(?:[_\.:']\w+)?/)
                  else
                    # split and accept punctuation characters
                    return sentence.scan(/[\w\-_,\.;'"\\\/]+/)
                  end
                end
         | 
| 79 126 |  | 
| 80 | 
            -
                 | 
| 127 | 
            +
                #
         | 
| 128 | 
            +
                # Parses the specified _text_ and returns an Array of sentences.
         | 
| 129 | 
            +
                #
         | 
| 130 | 
            +
                def parse_text(text)
         | 
| 81 131 | 
             
                  text.to_s.scan(/[^\s\.\?!][^\.\?!]*/)
         | 
| 82 132 | 
             
                end
         | 
| 83 133 |  | 
| 84 | 
            -
                 | 
| 85 | 
            -
             | 
| 86 | 
            -
             | 
| 87 | 
            -
                 | 
| 134 | 
            +
                #
         | 
| 135 | 
            +
                # Returns the ngrams that compose the model.
         | 
| 136 | 
            +
                #
         | 
| 137 | 
            +
                def ngrams
         | 
| 138 | 
            +
                  ngram_set = NgramSet.new
         | 
| 88 139 |  | 
| 89 | 
            -
             | 
| 90 | 
            -
             | 
| 91 | 
            -
             | 
| 92 | 
            -
             | 
| 140 | 
            +
                  @prefixes.each do |prefix,table|
         | 
| 141 | 
            +
                    table.each_gram do |gram|
         | 
| 142 | 
            +
                      ngram_set << (prefix + gram)
         | 
| 143 | 
            +
                    end
         | 
| 144 | 
            +
                  end
         | 
| 93 145 |  | 
| 94 | 
            -
             | 
| 95 | 
            -
                  @frequency.keys
         | 
| 146 | 
            +
                  return ngram_set
         | 
| 96 147 | 
             
                end
         | 
| 97 148 |  | 
| 149 | 
            +
                #
                # Returns +true+ if the model contains the specified _ngram_, returns
                # +false+ otherwise.
                #
                # The _ngram_ is looked up by its +prefix+ in <tt>@prefixes</tt>, and
                # the table stored there is asked whether it has the last gram.
                #
                def has_ngram?(ngram)
                  table = @prefixes[ngram.prefix]

                  # a prefix that was never stored has no table; without this guard
                  # has_gram? would be called on nil
                  return false unless table

                  return table.has_gram?(ngram.last)
                end
         | 
| 101 156 |  | 
| 157 | 
            +
                #
         | 
| 158 | 
            +
                # Iterates over the ngrams that compose the model, passing each one
         | 
| 159 | 
            +
                # to the given _block_.
         | 
| 160 | 
            +
                #
         | 
| 102 161 | 
             
                def each_ngram(&block)
         | 
| 103 | 
            -
                   | 
| 162 | 
            +
                  @prefixes.each do |prefix,table|
         | 
| 163 | 
            +
                    table.each_gram do |gram|
         | 
| 164 | 
            +
                      block.call(prefix + gram) if block
         | 
| 165 | 
            +
                    end
         | 
| 166 | 
            +
                  end
         | 
| 167 | 
            +
             | 
| 168 | 
            +
                  return self
         | 
| 104 169 | 
             
                end
         | 
| 105 170 |  | 
| 171 | 
            +
                #
                # Selects the ngrams that match the given _block_.
                #
                # Every ngram yielded by each_ngram is passed to the _block_; those
                # for which it returns a true value are collected into a new
                # NgramSet, which is returned.
                #
                def ngrams_with(&block)
                  selected_ngrams = NgramSet.new

                  each_ngram do |ngram|
                    selected_ngrams << ngram if block.call(ngram)
                  end

                  # return the filtered set, not the complete set from #ngrams
                  return selected_ngrams
                end
         | 
| 109 183 |  | 
| 110 | 
            -
                 | 
| 111 | 
            -
             | 
| 184 | 
            +
                #
         | 
| 185 | 
            +
                # Returns the ngrams prefixed by the specified _prefix_.
         | 
| 186 | 
            +
                #
         | 
| 187 | 
            +
                def ngrams_prefixed_by(prefix)
         | 
| 188 | 
            +
                  ngram_set = NgramSet.new
         | 
| 189 | 
            +
             | 
| 190 | 
            +
                  return ngram_set unless @prefixes.has_key?(prefix)
         | 
| 191 | 
            +
             | 
| 192 | 
            +
                  ngram_set += @prefixes[prefix].grams.map do |gram|
         | 
| 193 | 
            +
                    prefix + gram
         | 
| 194 | 
            +
                  end
         | 
| 195 | 
            +
             | 
| 196 | 
            +
                  return ngram_set
         | 
| 112 197 | 
             
                end
         | 
| 113 198 |  | 
| 114 | 
            -
                 | 
| 115 | 
            -
             | 
| 116 | 
            -
             | 
| 199 | 
            +
                #
         | 
| 200 | 
            +
                # Returns the ngrams postfixed by the specified _postfix_.
         | 
| 201 | 
            +
                #
         | 
| 202 | 
            +
                def ngrams_postfixed_by(postfix)
         | 
| 203 | 
            +
                  ngram_set = NgramSet.new
         | 
| 204 | 
            +
             | 
| 205 | 
            +
                  @prefixes.each do |prefix,table|
         | 
| 206 | 
            +
                    if prefix[1..-1] == postfix[0..-2]
         | 
| 207 | 
            +
                      if table.has_gram?(postfix.last)
         | 
| 208 | 
            +
                        ngram_set << (prefix + postfix.last)
         | 
| 209 | 
            +
                      end
         | 
| 210 | 
            +
                    end
         | 
| 117 211 | 
             
                  end
         | 
| 118 212 |  | 
| 119 | 
            -
                  return  | 
| 213 | 
            +
                  return ngram_set
         | 
| 120 214 | 
             
                end
         | 
| 121 215 |  | 
| 122 | 
            -
                 | 
| 123 | 
            -
             | 
| 216 | 
            +
                #
                # Returns the ngrams starting with the specified _gram_.
                #
                # Every prefix in <tt>@prefixes</tt> whose first element equals
                # _gram_ contributes one ngram per gram in its table.
                #
                def ngrams_starting_with(gram)
                  ngram_set = NgramSet.new

                  @prefixes.each do |prefix,table|
                    if prefix.first == gram
                      # the block parameter has its own name so it can never
                      # shadow, or on Ruby 1.8 overwrite, the _gram_ argument
                      table.each_gram do |next_gram|
                        ngram_set << (prefix + next_gram)
                      end
                    end
                  end

                  return ngram_set
                end
         | 
| 125 232 |  | 
| 233 | 
            +
                #
         | 
| 234 | 
            +
                # Returns the ngrams which end with the specified _gram_.
         | 
| 235 | 
            +
                #
         | 
| 126 236 | 
             
                def ngrams_ending_with(gram)
         | 
| 127 | 
            -
                   | 
| 237 | 
            +
                  ngram_set = NgramSet.new
         | 
| 238 | 
            +
             | 
| 239 | 
            +
                  @prefixes.each do |prefix,table|
         | 
| 240 | 
            +
                    if table.has_gram?(gram)
         | 
| 241 | 
            +
                      ngram_set << (prefix + gram)
         | 
| 242 | 
            +
                    end
         | 
| 243 | 
            +
                  end
         | 
| 244 | 
            +
             | 
| 245 | 
            +
                  return ngram_set
         | 
| 128 246 | 
             
                end
         | 
| 129 247 |  | 
| 130 | 
            -
                 | 
| 131 | 
            -
             | 
| 248 | 
            +
                #
         | 
| 249 | 
            +
                # Returns the ngrams including the specified _grams_.
         | 
| 250 | 
            +
                #
         | 
| 251 | 
            +
                def ngrams_including(*grams)
         | 
| 252 | 
            +
                  ngram_set = NgramSet.new
         | 
| 253 | 
            +
             | 
| 254 | 
            +
                  @prefixes.each do |prefix,table|
         | 
| 255 | 
            +
                    if prefix.includes?(grams)
         | 
| 256 | 
            +
                      table.each_gram do |gram|
         | 
| 257 | 
            +
                        ngram_set << (prefix + gram)
         | 
| 258 | 
            +
                      end
         | 
| 259 | 
            +
                    else
         | 
| 260 | 
            +
                      table.each_gram do |gram|
         | 
| 261 | 
            +
                        if grams.include?(gram)
         | 
| 262 | 
            +
                          ngram_set << (prefix + gram)
         | 
| 263 | 
            +
                        end
         | 
| 264 | 
            +
                      end
         | 
| 265 | 
            +
                    end
         | 
| 266 | 
            +
                  end
         | 
| 267 | 
            +
             | 
| 268 | 
            +
                  return ngram_set
         | 
| 269 | 
            +
                end
         | 
| 270 | 
            +
             | 
| 271 | 
            +
                #
         | 
| 272 | 
            +
                # Returns the ngrams extracted from the specified _words_.
         | 
| 273 | 
            +
                #
         | 
| 274 | 
            +
                def ngrams_from_words(words)
         | 
| 275 | 
            +
                  return (0...(words.length-@ngram_size+1)).map do |index|
         | 
| 276 | 
            +
                    Ngram.new(words[index,@ngram_size])
         | 
| 277 | 
            +
                  end
         | 
| 278 | 
            +
                end
         | 
| 279 | 
            +
             | 
| 280 | 
            +
                #
         | 
| 281 | 
            +
                # Returns the ngrams extracted from the specified _fragment_ of text.
         | 
| 282 | 
            +
                #
         | 
| 283 | 
            +
                def ngrams_from_fragment(fragment)
         | 
| 284 | 
            +
                  ngrams_from_words(parse_sentence(fragment))
         | 
| 285 | 
            +
                end
         | 
| 286 | 
            +
             | 
| 287 | 
            +
                #
         | 
| 288 | 
            +
                # Returns the ngrams extracted from the specified _sentence_.
         | 
| 289 | 
            +
                #
         | 
| 290 | 
            +
                def ngrams_from_sentence(sentence)
         | 
| 291 | 
            +
                  ngrams_from_words(wrap_sentence(parse_sentence(sentence)))
         | 
| 292 | 
            +
                end
         | 
| 293 | 
            +
             | 
| 294 | 
            +
                #
         | 
| 295 | 
            +
                # Returns the ngrams extracted from the specified _text_.
         | 
| 296 | 
            +
                #
         | 
| 297 | 
            +
                def ngrams_from_text(text)
         | 
| 298 | 
            +
                  parse_text(text).inject([]) do |ngrams,sentence|
         | 
| 299 | 
            +
                    ngrams + ngrams_from_sentence(sentence)
         | 
| 300 | 
            +
                  end
         | 
| 301 | 
            +
                end
         | 
| 302 | 
            +
             | 
| 303 | 
            +
                #
         | 
| 304 | 
            +
                # Returns all ngrams which precede the specified _gram_.
         | 
| 305 | 
            +
                #
         | 
| 306 | 
            +
                def ngrams_preceeding(gram)
         | 
| 307 | 
            +
                  ngram_set = NgramSet.new
         | 
| 308 | 
            +
             | 
| 309 | 
            +
                  ngrams_ending_with(gram).each do |ends_with|
         | 
| 310 | 
            +
                    ngrams_postfixed_by(ends_with.prefix).each do |ngram|
         | 
| 311 | 
            +
                      ngram_set << ngram
         | 
| 312 | 
            +
                    end
         | 
| 313 | 
            +
                  end
         | 
| 314 | 
            +
             | 
| 315 | 
            +
                  return ngram_set
         | 
| 316 | 
            +
                end
         | 
| 317 | 
            +
             | 
| 318 | 
            +
                #
         | 
| 319 | 
            +
                # Returns all ngrams which occur directly after the specified _gram_.
         | 
| 320 | 
            +
                #
         | 
| 321 | 
            +
                def ngrams_following(gram)
         | 
| 322 | 
            +
                  ngram_set = NgramSet.new
         | 
| 323 | 
            +
             | 
| 324 | 
            +
                  ngrams_starting_with(gram).each do |starts_with|
         | 
| 325 | 
            +
                    ngrams_prefixed_by(starts_with.postfix).each do |ngram|
         | 
| 326 | 
            +
                      ngram_set << ngram
         | 
| 327 | 
            +
                    end
         | 
| 328 | 
            +
                  end
         | 
| 329 | 
            +
             | 
| 330 | 
            +
                  return ngram_set
         | 
| 331 | 
            +
                end
         | 
| 332 | 
            +
             | 
| 333 | 
            +
                #
         | 
| 334 | 
            +
                # Returns all grams within the model.
         | 
| 335 | 
            +
                #
         | 
| 336 | 
            +
                def grams
         | 
| 337 | 
            +
                  @prefixes.keys.flatten.uniq
         | 
| 338 | 
            +
                end
         | 
| 339 | 
            +
             | 
| 340 | 
            +
                #
         | 
| 341 | 
            +
                # Returns all grams which precede the specified _gram_.
         | 
| 342 | 
            +
                #
         | 
| 343 | 
            +
                def grams_preceeding(gram)
         | 
| 344 | 
            +
                  gram_set = Set.new
         | 
| 345 | 
            +
             | 
| 346 | 
            +
                  ngrams_ending_with(gram).each do |ngram|
         | 
| 347 | 
            +
                    gram_set << ngram[-2]
         | 
| 348 | 
            +
                  end
         | 
| 349 | 
            +
             | 
| 350 | 
            +
                  return gram_set
         | 
| 351 | 
            +
                end
         | 
| 352 | 
            +
             | 
| 353 | 
            +
                #
                # Returns all grams which occur directly after the specified _gram_.
                #
                # The result is a Set holding the second element of every ngram
                # returned by ngrams_starting_with for _gram_.
                #
                def grams_following(gram)
                  gram_set = Set.new

                  # ngrams_starting_with is the method this class defines; the
                  # singular spelling used before does not exist
                  ngrams_starting_with(gram).each do |ngram|
                    gram_set << ngram[1]
                  end

                  return gram_set
                end
         | 
| 365 | 
            +
             | 
| 366 | 
            +
                #
         | 
| 367 | 
            +
                # Returns the ngrams which occur within the specified _words_ and
         | 
| 368 | 
            +
                # within the model.
         | 
| 369 | 
            +
                #
         | 
| 370 | 
            +
                def common_ngrams_from_words(words)
         | 
| 371 | 
            +
                  ngrams_from_words(words).select { |ngram| has_ngram?(ngram) }
         | 
| 372 | 
            +
                end
         | 
| 373 | 
            +
             | 
| 374 | 
            +
                #
                # Returns the ngrams which occur within the specified _fragment_ and
                # within the model.
                #
                # The ngrams come from ngrams_from_fragment and are kept only when
                # has_ngram? is true for them.
                #
                def common_ngrams_from_fragment(fragment)
                  # pass the _fragment_ argument itself; +words+ is not defined here
                  ngrams_from_fragment(fragment).select { |ngram| has_ngram?(ngram) }
                end
         | 
| 381 | 
            +
             | 
| 382 | 
            +
                #
         | 
| 383 | 
            +
                # Returns the ngrams which occur within the specified _sentence_ and
         | 
| 384 | 
            +
                # within the model.
         | 
| 385 | 
            +
                #
         | 
| 386 | 
            +
                def common_ngrams_from_sentence(sentence)
         | 
| 387 | 
            +
                  ngrams_from_sentence(sentence).select { |ngram| has_ngram?(ngram) }
         | 
| 388 | 
            +
                end
         | 
| 389 | 
            +
             | 
| 390 | 
            +
                #
         | 
| 391 | 
            +
                # Returns the ngrams which occur within the specified _text_ and
         | 
| 392 | 
            +
                # within the model.
         | 
| 393 | 
            +
                #
         | 
| 394 | 
            +
                def common_ngrams_from_text(text)
         | 
| 395 | 
            +
                  ngrams_from_text(text).select { |ngram| has_ngram?(ngram) }
         | 
| 132 396 | 
             
                end
         | 
| 133 397 |  | 
| 398 | 
            +
                #
         | 
| 399 | 
            +
                # Sets the frequency of the specified _ngram_ to the specified _value_.
         | 
| 400 | 
            +
                #
         | 
| 401 | 
            +
                def set_ngram_frequency(ngram,value)
         | 
| 402 | 
            +
                  probability_table(ngram).set_count(ngram.last,value)
         | 
| 403 | 
            +
                end
         | 
| 404 | 
            +
             | 
| 405 | 
            +
                #
         | 
| 406 | 
            +
                # Train the model with the specified _ngram_.
         | 
| 407 | 
            +
                #
         | 
| 408 | 
            +
                def train_with_ngram(ngram)
         | 
| 409 | 
            +
                  probability_table(ngram).count(ngram.last)
         | 
| 410 | 
            +
                end
         | 
| 411 | 
            +
             | 
| 412 | 
            +
                #
         | 
| 413 | 
            +
                # Train the model with the specified _ngrams_.
         | 
| 414 | 
            +
                #
         | 
| 415 | 
            +
                def train_with_ngrams(ngrams)
         | 
| 416 | 
            +
                  ngrams.each { |ngram| train_with_ngram(ngram) }
         | 
| 417 | 
            +
                end
         | 
| 418 | 
            +
             | 
| 419 | 
            +
                #
         | 
| 420 | 
            +
                # Train the model with the specified _sentence_.
         | 
| 421 | 
            +
                #
         | 
| 422 | 
            +
                def train_with_sentence(sentence)
         | 
| 423 | 
            +
                  train_with_ngrams(ngrams_from_sentence(sentence))
         | 
| 424 | 
            +
                end
         | 
| 425 | 
            +
             | 
| 426 | 
            +
                #
         | 
| 427 | 
            +
                # Train the model with the specified _text_.
         | 
| 428 | 
            +
                #
         | 
| 429 | 
            +
                def train_with_text(text)
         | 
| 430 | 
            +
                  train_with_ngrams(ngrams_from_text(text))
         | 
| 431 | 
            +
                end
         | 
| 432 | 
            +
             | 
| 433 | 
            +
                #
         | 
| 434 | 
            +
                # Returns the probability of the specified _ngram_ occurring within
         | 
| 435 | 
            +
                # arbitrary text.
         | 
| 436 | 
            +
                #
         | 
| 134 437 | 
             
                def probability_of_ngram(ngram)
         | 
| 135 | 
            -
                   | 
| 438 | 
            +
                  prefix = ngram.prefix
         | 
| 439 | 
            +
             | 
| 440 | 
            +
                  if @prefixes.has_key?(prefix)
         | 
| 441 | 
            +
                    return @prefixes[prefix].probability_of(ngram.last)
         | 
| 442 | 
            +
                  else
         | 
| 443 | 
            +
                    return 0.0
         | 
| 444 | 
            +
                  end
         | 
| 136 445 | 
             
                end
         | 
| 137 446 |  | 
| 447 | 
            +
                #
         | 
| 448 | 
            +
                # Returns a Hash mapping each of the specified _ngrams_ to its probability of occurring within
         | 
| 449 | 
            +
                # arbitrary text.
         | 
| 450 | 
            +
                #
         | 
| 451 | 
            +
                def probabilities_for(ngrams)
         | 
| 452 | 
            +
                  table = {}
         | 
| 453 | 
            +
             | 
| 454 | 
            +
                  ngrams.each do |ngram|
         | 
| 455 | 
            +
                    table[ngram] = probability_of_ngram(ngram)
         | 
| 456 | 
            +
                  end
         | 
| 457 | 
            +
             | 
| 458 | 
            +
                  return table
         | 
| 459 | 
            +
                end
         | 
| 460 | 
            +
             | 
| 461 | 
            +
                #
         | 
| 462 | 
            +
                # Returns the joint probability of the specified _ngrams_ occurring
         | 
| 463 | 
            +
                # within arbitrary text.
         | 
| 464 | 
            +
                #
         | 
| 138 465 | 
             
                def probability_of_ngrams(ngrams)
         | 
| 139 | 
            -
                  probabilities_for(ngrams).inject  | 
| 466 | 
            +
                  probabilities_for(ngrams).values.inject do |joint,prob|
         | 
| 467 | 
            +
                    joint * prob
         | 
| 468 | 
            +
                  end
         | 
| 140 469 | 
             
                end
         | 
| 141 470 |  | 
| 471 | 
            +
                #
         | 
| 472 | 
            +
                # Returns the probability of the specified _gram_ occurring within
         | 
| 473 | 
            +
                # arbitrary text.
         | 
| 474 | 
            +
                #
         | 
| 142 475 | 
             
                def probability_of_gram(gram)
         | 
| 143 476 | 
             
                  probability_of_ngrams(ngrams_starting_with(gram))
         | 
| 144 477 | 
             
                end
         | 
| 145 478 |  | 
| 146 | 
            -
                 | 
| 147 | 
            -
             | 
| 479 | 
            +
                #
         | 
| 480 | 
            +
                # Returns the probability of the specified _fragment_ occurring within
         | 
| 481 | 
            +
                # arbitrary text.
         | 
| 482 | 
            +
                #
         | 
| 483 | 
            +
                def fragment_probability(fragment)
         | 
| 484 | 
            +
                  probability_of_ngrams(ngrams_from_fragment(fragment))
         | 
| 485 | 
            +
                end
         | 
| 486 | 
            +
             | 
| 487 | 
            +
                #
         | 
| 488 | 
            +
                # Returns the probability of the specified _sentence_ occurring within
         | 
| 489 | 
            +
                # arbitrary text.
         | 
| 490 | 
            +
                #
         | 
| 491 | 
            +
                def sentence_probability(sentence)
         | 
| 492 | 
            +
                  probability_of_ngrams(ngrams_from_sentence(sentence))
         | 
| 493 | 
            +
                end
         | 
| 494 | 
            +
             | 
| 495 | 
            +
                #
         | 
| 496 | 
            +
                # Returns the probability of the specified _text_ occurring within
         | 
| 497 | 
            +
                # arbitrary text.
         | 
| 498 | 
            +
                #
         | 
| 499 | 
            +
                def text_probability(text)
         | 
| 500 | 
            +
                  probability_of_ngrams(ngrams_from_text(text))
         | 
| 501 | 
            +
                end
         | 
| 502 | 
            +
             | 
| 503 | 
            +
                #
         | 
| 504 | 
            +
                # Returns the joint probability of the common ngrams between the
         | 
| 505 | 
            +
                # specified _fragment_ and the model.
         | 
| 506 | 
            +
                #
         | 
| 507 | 
            +
                def fragment_commonality(fragment)
         | 
| 508 | 
            +
                  probability_of_ngrams(common_ngrams_from_fragment(fragment))
         | 
| 509 | 
            +
                end
         | 
| 148 510 |  | 
| 149 | 
            -
             | 
| 511 | 
            +
                #
         | 
| 512 | 
            +
                # Returns the joint probability of the common ngrams between the
         | 
| 513 | 
            +
                # specified _sentence_ and the model.
         | 
| 514 | 
            +
                #
         | 
| 515 | 
            +
                def sentence_commonality(sentence)
         | 
| 516 | 
            +
                  probability_of_ngrams(common_ngrams_from_sentence(sentence))
         | 
| 517 | 
            +
                end
         | 
| 518 | 
            +
             | 
| 519 | 
            +
                #
         | 
| 520 | 
            +
                # Returns the joint probability of the common ngrams between the
         | 
| 521 | 
            +
                # specified _text_ and the model.
         | 
| 522 | 
            +
                #
         | 
| 523 | 
            +
                def text_commonality(text)
         | 
| 524 | 
            +
                  probability_of_ngrams(common_ngrams_from_text(text))
         | 
| 525 | 
            +
                end
         | 
| 526 | 
            +
             | 
| 527 | 
            +
                #
         | 
| 528 | 
            +
                # Returns the conditional probability of the commonality of the
         | 
| 529 | 
            +
                # specified _fragment_ against the _other_model_, given the commonality
         | 
| 530 | 
            +
                # of the _fragment_ against the model.
         | 
| 531 | 
            +
                #
         | 
| 532 | 
            +
                def fragment_similarity(fragment,other_model)
         | 
| 533 | 
            +
                  other_model.fragment_commonality(fragment) / fragment_commonality(fragment)
         | 
| 534 | 
            +
                end
         | 
| 535 | 
            +
             | 
| 536 | 
            +
                #
         | 
| 537 | 
            +
                # Returns the conditional probability of the commonality of the
         | 
| 538 | 
            +
                # specified _sentence_ against the _other_model_, given the commonality
         | 
| 539 | 
            +
                # of the _sentence_ against the model.
         | 
| 540 | 
            +
                #
         | 
| 541 | 
            +
                def sentence_similarity(sentence,other_model)
         | 
| 542 | 
            +
                  other_model.sentence_commonality(sentence) / sentence_commonality(sentence)
         | 
| 543 | 
            +
                end
         | 
| 544 | 
            +
             | 
| 545 | 
            +
                #
         | 
| 546 | 
            +
                # Returns the conditional probability of the commonality of the
         | 
| 547 | 
            +
                # specified _text_ against the _other_model_, given the commonality
         | 
| 548 | 
            +
                # of the _text_ against the model.
         | 
| 549 | 
            +
                #
         | 
| 550 | 
            +
                def text_similarity(text,other_model)
         | 
| 551 | 
            +
                  other_model.text_commonality(text) / text_commonality(text)
         | 
| 552 | 
            +
                end
         | 
| 553 | 
            +
             | 
| 554 | 
            +
                #
         | 
| 555 | 
            +
                # Returns a random gram from the model.
         | 
| 556 | 
            +
                #
         | 
| 557 | 
            +
                def random_gram
         | 
| 558 | 
            +
                  prefix = @prefixes.keys[rand(@prefixes.length)]
         | 
| 559 | 
            +
             | 
| 560 | 
            +
                  return prefix[rand(prefix.length)]
         | 
| 561 | 
            +
                end
         | 
| 562 | 
            +
             | 
| 563 | 
            +
                #
         | 
| 564 | 
            +
                # Returns a random ngram from the model.
         | 
| 565 | 
            +
                #
         | 
| 566 | 
            +
                def random_ngram
         | 
| 567 | 
            +
                  prefix_index = rand(@prefixes.length)
         | 
| 568 | 
            +
             | 
| 569 | 
            +
                  prefix = @prefixes.keys[prefix_index]
         | 
| 570 | 
            +
                  table = @prefixes.values[prefix_index]
         | 
| 571 | 
            +
             | 
| 572 | 
            +
                  gram_index = rand(table.grams.length)
         | 
| 573 | 
            +
             | 
| 574 | 
            +
                  return (prefix + table.grams[gram_index])
         | 
| 575 | 
            +
                end
         | 
| 576 | 
            +
             | 
| 577 | 
            +
                #
         | 
| 578 | 
            +
                # Returns a randomly generated sentence of grams using the given
         | 
| 579 | 
            +
                # _options_.
         | 
| 580 | 
            +
                #
         | 
| 581 | 
            +
                def random_gram_sentence(options={})
         | 
| 582 | 
            +
                  grams = []
         | 
| 583 | 
            +
                  last_ngram = @starting_ngram
         | 
| 584 | 
            +
                  
         | 
| 585 | 
            +
                  # prime the grams
         | 
| 586 | 
            +
                  grams += @starting_ngram
         | 
| 587 | 
            +
             | 
| 588 | 
            +
                  loop do
         | 
| 589 | 
            +
                    next_ngrams = ngrams_prefixed_by(last_ngram.postfix).to_a
         | 
| 590 | 
            +
                    last_ngram = next_ngrams[rand(next_ngrams.length)]
         | 
| 591 | 
            +
             | 
| 592 | 
            +
                    if last_ngram.nil?
         | 
| 593 | 
            +
                      return []
         | 
| 594 | 
            +
                    else
         | 
| 595 | 
            +
                      grams << last_ngram.last
         | 
| 596 | 
            +
                      break if last_ngram == @stoping_ngram
         | 
| 597 | 
            +
                    end
         | 
| 598 | 
            +
                  end
         | 
| 599 | 
            +
             | 
| 600 | 
            +
                  return grams
         | 
| 601 | 
            +
                end
         | 
| 602 | 
            +
             | 
| 603 | 
            +
                #
         | 
| 604 | 
            +
                # Returns a randomly generated sentence of text using the given
         | 
| 605 | 
            +
                # _options_.
         | 
| 606 | 
            +
                #
         | 
| 607 | 
            +
                def random_sentence(options={})
         | 
| 608 | 
            +
                  grams = random_gram_sentence(options)
         | 
| 609 | 
            +
                  sentence = grams.delete_if { |gram|
         | 
| 610 | 
            +
                    gram == Tokens.start || gram == Tokens.stop
         | 
| 611 | 
            +
                  }.join(' ')
         | 
| 612 | 
            +
             | 
| 613 | 
            +
                  sentence << '.' if @ignore_punctuation
         | 
| 614 | 
            +
                  return sentence
         | 
| 615 | 
            +
                end
         | 
| 616 | 
            +
             | 
| 617 | 
            +
                #
         | 
| 618 | 
            +
                # Returns a randomly generated paragraph of text using the given
         | 
| 619 | 
            +
                # _options_.
         | 
| 620 | 
            +
                #
         | 
| 621 | 
            +
                # _options_ may contain the following keys:
         | 
| 622 | 
            +
                # <tt>:min_sentences</tt>:: Minimum number of sentences in the
         | 
| 623 | 
            +
                #                           paragraph. Defaults to 3.
         | 
| 624 | 
            +
                # <tt>:max_sentences</tt>:: Maximum number of sentences in the
         | 
| 625 | 
            +
                #                           paragraph. Defaults to 6.
         | 
| 626 | 
            +
                #
         | 
| 627 | 
            +
                def random_paragraph(options={})
                  min_sentences = (options[:min_sentences] || 3)
                  max_sentences = (options[:max_sentences] || 6)

                  # Width of the inclusive range min_sentences..max_sentences,
                  # clamped at zero so that a maximum below the minimum simply
                  # yields min_sentences sentences.
                  spread = [max_sentences - min_sentences, 0].max

                  # rand is always given a positive Integer here, so it returns an
                  # Integer in 0..spread. rand(0) would instead return a Float,
                  # which does not respond to #times, and rand(spread) alone could
                  # never reach the documented maximum.
                  count = min_sentences + rand(spread + 1)

                  sentences = []
                  count.times { sentences << random_sentence(options) }

                  return sentences.join(' ')
                end
         | 
| 638 | 
            +
             | 
| 639 | 
            +
                #
         | 
| 640 | 
            +
                # Returns randomly generated text using the given _options_.
         | 
| 641 | 
            +
                #
         | 
| 642 | 
            +
                # _options_ may contain the following keys:
         | 
| 643 | 
            +
                # <tt>:min_sentences</tt>:: Minimum number of sentences in the
         | 
| 644 | 
            +
                #                           paragraph. Defaults to 3.
         | 
| 645 | 
            +
                # <tt>:max_sentences</tt>:: Maximum number of sentences in the
         | 
| 646 | 
            +
                #                           paragraph. Defaults to 6.
         | 
| 647 | 
            +
                # <tt>:min_paragraphs</tt>:: Minimum number of paragraphs in the text.
         | 
| 648 | 
            +
                #                            Defaults to 3.
         | 
| 649 | 
            +
                # <tt>:max_paragraphs</tt>:: Maximum number of paragraphs in the text.
         | 
| 650 | 
            +
                #                            Defaults to 5.
         | 
| 651 | 
            +
                #
         | 
| 652 | 
            +
                def random_text(options={})
                  min_paragraphs = (options[:min_paragraphs] || 3)
                  # Default of 5 matches the documented default; together with the
                  # inclusive upper bound below, the default paragraph count stays
                  # within 3..5.
                  max_paragraphs = (options[:max_paragraphs] || 5)

                  # Width of the inclusive range min_paragraphs..max_paragraphs,
                  # clamped at zero so that a maximum below the minimum simply
                  # yields min_paragraphs paragraphs.
                  spread = [max_paragraphs - min_paragraphs, 0].max

                  # rand is always given a positive Integer here, so it returns an
                  # Integer in 0..spread. rand(0) would instead return a Float,
                  # which does not respond to #times.
                  count = min_paragraphs + rand(spread + 1)

                  paragraphs = []
                  # The same options are forwarded so the sentence limits apply to
                  # every generated paragraph.
                  count.times { paragraphs << random_paragraph(options) }

                  return paragraphs.join("\n\n")
                end
         | 
| 663 | 
            +
             | 
| 664 | 
            +
                #
         | 
| 665 | 
            +
                # Refreshes the probability tables of the model.
         | 
| 666 | 
            +
                #
         | 
| 667 | 
            +
                def refresh(&block)
         | 
| 668 | 
            +
                  block.call(self) if block
         | 
| 669 | 
            +
             | 
| 670 | 
            +
                  @prefixes.each_value { |table| table.build }
         | 
| 150 671 | 
             
                  return self
         | 
| 151 672 | 
             
                end
         | 
| 152 673 |  | 
| 153 | 
            -
                 | 
| 674 | 
            +
                #
         | 
| 675 | 
            +
                # Clears and rebuilds the model.
         | 
| 676 | 
            +
                #
         | 
| 677 | 
            +
                def build(&block)
         | 
| 678 | 
            +
                  refresh do
         | 
| 679 | 
            +
                    clear
         | 
| 680 | 
            +
             | 
| 681 | 
            +
                    block.call(self) if block
         | 
| 682 | 
            +
                  end
         | 
| 683 | 
            +
                end
         | 
| 154 684 |  | 
| 155 | 
            -
                 | 
| 156 | 
            -
             | 
| 685 | 
            +
                #
         | 
| 686 | 
            +
                # Clears the model of any training data.
         | 
| 687 | 
            +
                #
         | 
| 688 | 
            +
                def clear
         | 
| 689 | 
            +
                  @prefixes.clear
         | 
| 157 690 | 
             
                  return self
         | 
| 158 691 | 
             
                end
         | 
| 159 692 |  | 
| 693 | 
            +
                protected
         | 
| 694 | 
            +
             | 
| 695 | 
            +
                #
         | 
| 696 | 
            +
                # Defines the default ngram _size_ for the model.
         | 
| 697 | 
            +
                #
         | 
| 698 | 
            +
                def self.ngram_size(size)
         | 
| 699 | 
            +
                  class_eval %{
         | 
| 700 | 
            +
                    def initialize(options={},&block)
         | 
| 701 | 
            +
                      super(options.merge(:ngram_size => #{size.to_i}),&block)
         | 
| 702 | 
            +
                    end
         | 
| 703 | 
            +
                  }
         | 
| 704 | 
            +
                end
         | 
| 705 | 
            +
             | 
| 706 | 
            +
                #
         | 
| 707 | 
            +
                # Wraps the specified _sentence_ with StartSentence and StopSentence
         | 
| 708 | 
            +
                # tokens.
         | 
| 709 | 
            +
                #
         | 
| 710 | 
            +
                def wrap_sentence(sentence)
         | 
| 711 | 
            +
                  @starting_ngram + sentence.to_a + @stoping_ngram
         | 
| 712 | 
            +
                end
         | 
| 713 | 
            +
             | 
| 714 | 
            +
                #
         | 
| 715 | 
            +
                # Returns the probability table for the specified _ngram_.
         | 
| 716 | 
            +
                #
         | 
| 717 | 
            +
                def probability_table(ngram)
         | 
| 718 | 
            +
                  @prefixes[ngram.prefix] ||= ProbabilityTable.new
         | 
| 719 | 
            +
                end
         | 
| 720 | 
            +
             | 
| 160 721 | 
             
              end
         | 
| 161 722 | 
             
            end
         |