RubyGems - markovian - Versions diffs - 0.2.9 → 0.3.0 - Mend

markovian 0.2.9 → 0.3.0

Files changed (17) hide show

checksums.yaml +4 -4
data/Gemfile +7 -3
data/README.md +12 -3
data/changelog.md +9 -0
data/lib/markovian.rb +2 -2
data/lib/markovian/chain.rb +81 -0
data/lib/markovian/{corpus → chain}/compiler.rb +11 -22
data/lib/markovian/{corpus → chain}/dictionary.rb +5 -13
data/lib/markovian/chain/dictionary_entry.rb +80 -0
data/lib/markovian/importers/twitter/csv_importer.rb +2 -2
data/lib/markovian/text_builder.rb +29 -20
data/lib/markovian/text_builder/end_of_sentence_filter.rb +31 -0
data/lib/markovian/version.rb +1 -1
metadata +7 -7
data/lib/markovian/corpus.rb +0 -31
data/lib/markovian/corpus/chain.rb +0 -53
data/lib/markovian/corpus/dictionary_entry.rb +0 -54

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 3c30d2a0dcf0c8488bab6aa456ecd5a761fa04ae
-  data.tar.gz: c4900a0c75636c29e33697d9ab3bcff85bb13828
+  metadata.gz: 290b5c05432cd805aa1aafdae2d93b68cf1e9a8a
+  data.tar.gz: c51deea8332351976638c6767603ad137c85fb4b
 SHA512:
-  metadata.gz: 01e05e4c15feb7a2938603a6ef7fda354871d1474e493701475d5f1600498633c525982b9ca4b210b22303b6d18effe0c1d29412ee5fb201218cf4a0bd8f2a83
-  data.tar.gz: 99ff58a5fa7b60a0bfdd69cf450ba650e4958e6047ed18dd07ea8c6e211974c696d87fb671a46103257935b7d36d60fe7d6b25f9ac21bf7c8f387e5e8c4d6066
+  metadata.gz: eca6c116a0e9686b90ebd3e9335cd55f3a48261a3824dd5d2d71c58e6ba97b8749c738b042da0f2b72c02df58936a924ae9953a1970b64d02d70b58f3f953ae9
+  data.tar.gz: e2279a199969da3cf587952a57a6eb1fb3d7f22e967cba3f4dc700ba5022e5562e4e70eb5a34f3f199a65ee458ed232952a9f7e17b254054e0b3bd7327d89839

data/Gemfile CHANGED Viewed

@@ -1,10 +1,14 @@
-source 'https://rubygems.org'
+source "https://rubygems.org"
-# Specify your gem's dependencies in markov-ahkoppel2.gemspec
+# Specify your gem's dependencies in markov-ahkoppel2.gemspec
 gemspec
 group :development, :test do
-  gem 'byebug', platform: :mri
+  gem "byebug", platform: :mri
+  # If you're developing both gems, use the local version of Tokeneyes
+  if File.exist?("../tokeneyes")
+    gem "tokeneyes", path: "../tokeneyes"
+  end
 end
 group :test do

data/README.md CHANGED Viewed

@@ -19,16 +19,25 @@ Fuller documentation will come shortly. For now, let's see how we can use Markov
  => path_to_twitter_archive
 > importer = Markovian::Importers::Twitter::CsvImporter.new(path)
  => #<Markovian::Importers::Twitter::CsvImporter:0x007fd0ca3282a8 @path=path_to_twitter_archive>
-# now assemble the corpus of tweets -- this may take a few seconds to compile
-> corpus = importer.corpus
+# now assemble the chain based on the tweets -- this may take a few seconds to compile
+> chain = importer.chain
  => #<Markovian::Chain:0x007fd0ca03df70 ...>
 # Now, we can build some text!
-> Markovian::TextBuilder.new(corpus).construct("markov")
+> Markovian::TextBuilder.new(chain).construct("markov")
 => "markov chains a lot better than a month, i've been here half an hour of night when you can get behind belgium for the offline train journey"
 ```
 Exactly!
+## Features
+So far, Markovian lets you generate random text from a given set of inputs. In
+addition, your money gets you:
+* A built-in importer to turn Twitter csv archives into Markov chain-derived text
+* A built-in filter to remove final words that statistically (in the corpus) rarely end sentences.
+  Avoid unsightly sentences ending in "and so of" and so on!
 ## Development
 After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.

data/changelog.md CHANGED Viewed

@@ -1,5 +1,14 @@
 # CHANGELOG
+## 0.3.0
+* TextBuilder now filters out final words that statistically rarely end sentences (first filter!)
+* TextBuilder#construct now includes seed text by default (instead of via opt-in)
+* Add Chain#word_entry to allow access to word data
+* Properly collect metadata about words (previously collected next_word's data)
+* Refactor Dictionary to provide access to entries, removing a lot of method duplication
+* Remove Corpus class (no longer necessary), make Chain the base
 ## 0.2.9
 Internal refactors only, no new functionality.

data/lib/markovian.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 require 'markovian/text_builder'
-require 'markovian/corpus'
-require 'markovian/corpus/compiler'
+require 'markovian/chain'
+require 'markovian/chain/compiler'
 # importers
 require 'markovian/importers/twitter/csv_importer'

data/lib/markovian/chain.rb ADDED Viewed

@@ -0,0 +1,81 @@
+require 'markovian/chain/dictionary'
+# The Chain represents Markov info as it's being assembled or expanded from a text. To compensate
+# for small sample sizes, we track multiple chains (derived from both two-word phrases and single
+# word). Phrases are preferred, but if we can't find a match, we'll try with a single word.
+module Markovian
+  class Chain
+    def initialize
+      @one_key_dictionary = Dictionary.new
+      @two_key_dictionary = Dictionary.new
+    end
+    # Allow access to a word's metadata by providing its dictionary entry. For now, we only do
+    # individual words, not two-word phrases.
+    def word_entry(word)
+      @one_key_dictionary[word]
+    end
+    def lengthen(word, next_word:, previous_word: nil)
+      # When we encounter a word, we track its metadata and what words surround it
+      write_to_dictionary(@one_key_dictionary, word, word, next_word)
+      write_to_dictionary(@two_key_dictionary, two_word_key(previous_word, word), word, next_word)
+      word
+    end
+    def next_word(word, previous_word: nil)
+      if dictionary_entry = entry(word, previous_word)
+        dictionary_entry.next_word
+      end
+    end
+    def random_word
+      one_key_dictionary.random_word
+    end
+    def ==(other)
+      self.one_key_dictionary == other.one_key_dictionary &&
+        self.two_key_dictionary == other.two_key_dictionary
+    end
+    protected
+    # for equality checking
+    attr_reader :one_key_dictionary, :two_key_dictionary
+    def entry(word, previous_word = nil)
+      if previous_word
+        entry_for_two_words(previous_word, word) || entry_for_one_word(word)
+      else
+        entry_for_one_word(word)
+      end
+    end
+    def entry_for_two_words(previous_word, word)
+      entry_if_present(@two_key_dictionary[two_word_key(previous_word, word)])
+    end
+    def entry_for_one_word(word)
+      # Not strictly necessary, since if there's an empty entry here we'll just get nils, but better to
+      # do it right.
+      entry_if_present(@one_key_dictionary[word])
+    end
+    def entry_if_present(entry)
+      # Ignore empty entries that haven't actually been seen in the corpus
+      # TODO refactor to not even create them
+      entry if entry.occurrences > 0
+    end
+    # We represent the two words as a space-delimited phrase for simplicity and speed of access via
+    # hash keys.
+    def two_word_key(word1, word2)
+      "#{word1} #{word2}"
+    end
+    def write_to_dictionary(dictionary, key, word_instance, next_word)
+      dictionary[key].record_observance(word_instance)
+      dictionary[key].push(next_word)
+    end
+  end
+end

data/lib/markovian/{corpus → chain}/compiler.rb RENAMED Viewed

@@ -17,40 +17,29 @@ require 'markovian/utils/text_splitter'
 # * Handling sentences or newlines is later -- I'm not sure the right way to do it.
 # * Capitalization is deferred for later.
 module Markovian
-  class Corpus
+  class Chain
     class Compiler
       # Pass in a text, and optionally an existing Markov chain to add data to. In many cases, you
       # may be building a chain using a set of smaller texts instead of one large text (dialog,
       # for instance, or Twitter archives), and so may call this class repeatedly for elements of
-      # the parent corpus.
-      attr_reader :corpus
-      def initialize(starter_corpus = Corpus.new)
-        @corpus = starter_corpus
+      # the corpus.
+      attr_reader :chain
+      def initialize(starter_chain = Chain.new)
+        @chain = starter_chain
       end
-      def build_corpus(texts)
-        texts.each {|t| incorporate_text_into_corpus(t)}
-        corpus
+      def build_chain(texts)
+        texts.each {|t| incorporate_text_into_chain(t)}
+        chain
       end
-      def incorporate_text_into_corpus(text)
-        add_text_to_chain(split_into_components(text), forward_chain)
-        # to assemble backward text, we just create a corpus with all the texts reversed
-        # that allows us to see what words precede any given word
-        add_text_to_chain(split_into_components(text).reverse, backward_chain)
-        corpus
+      def incorporate_text_into_chain(text)
+        add_text_to_chain(split_into_components(text), chain)
+        chain
       end
       protected
-      def forward_chain
-        corpus.forward
-      end
-      def backward_chain
-        corpus.backward
-      end
       def add_text_to_chain(text_elements, chain)
         previous_word = nil
         text_elements.each_with_index do |word, index|

data/lib/markovian/{corpus → chain}/dictionary.rb RENAMED Viewed

@@ -1,21 +1,13 @@
-require 'markovian/corpus/dictionary_entry'
+require 'markovian/chain/dictionary_entry'
 #
 # This class represents a dictionary of words or phrases and the various words that can follow
 # them. The key is an opaque value, which could represent either a single word or a phrase as desired.
 module Markovian
-  class Corpus
+  class Chain
     class Dictionary
-      def push(key, word, direction: :forwards)
-        # Incoming we get a Tokeneyes::Word object
-        dictionary[key.to_s].push(word, direction: direction)
-      end
-      def next_word(key)
-        dictionary[key].next_word
-      end
-      def previous_word(key)
-        dictionary[key].previous_word
+      def [](key)
+        # Key could be a string or a Tokeneyes::Word object
+        dictionary[key.to_s]
       end
       def random_word

data/lib/markovian/chain/dictionary_entry.rb ADDED Viewed

@@ -0,0 +1,80 @@
+module Markovian
+  class Chain
+    class DictionaryEntry
+      # Below this, we don't have enough occurrences to draw conclusions about how a word is used.
+      SIGNIFICANT_OCCURRENCE_THRESHOLD = 50
+      attr_reader :word, :counts
+      def initialize(word)
+        @word = word.to_s
+        @next_words = []
+        @previous_words = []
+        @counts = Hash.new(0)
+      end
+      def record_observance(word_instance, direction: :forwards)
+        # The word has been observed, so let's increase the appropriate counts.
+        # We don't want to double-count words if we read the text both forward and backward, so
+        # only count in the forward direction. (If we encounter a scenario where someone only wants
+        # to read in the backward direction, we can deal with that then.)
+        validate_direction(direction)
+        if direction == :forwards
+          @counts[:total] += 1
+          @counts[:ends_sentence] += 1 if word_instance.ends_sentence?
+        end
+      end
+      def push(next_word, direction: :forwards)
+        # Also add the following word
+        array_for_direction(direction) << next_word.to_s
+      end
+      def next_word
+        next_words.sample
+      end
+      def previous_word
+        previous_words.sample
+      end
+      def ==(other)
+        self.word == other.word &&
+          self.next_words == other.next_words &&
+          self.previous_words == other.previous_words
+      end
+      def occurrences
+        counts[:total]
+      end
+      def likelihood_to_end_sentence
+        # if we don't have enough occurrences, we can't estimate a likelihood (returns nil)
+        if occurrences >= SIGNIFICANT_OCCURRENCE_THRESHOLD
+          counts[:ends_sentence].to_f / occurrences
+        end
+      end
+      def to_s
+        word
+      end
+      protected
+      # for equality checking and other usage
+      attr_reader :next_words, :previous_words
+      VALID_DIRECTIONS = [:backwards, :forwards]
+      def array_for_direction(direction)
+        validate_direction(direction)
+        direction == :backwards ? previous_words : next_words
+      end
+      def validate_direction(direction)
+        unless VALID_DIRECTIONS.include?(direction)
+          raise ArgumentError.new("Invalid direction #{direction.inspect}, valid directions are #{VALID_DIRECTIONS.inspect}")
+        end
+      end
+    end
+  end
+end

data/lib/markovian/importers/twitter/csv_importer.rb CHANGED Viewed

@@ -17,8 +17,8 @@ module Markovian
           tweet_enumerator.reject {|t| t.empty?}
         end
-        def corpus
-          Corpus::Compiler.new.build_corpus(texts_for_markov_analysis)
+        def chain
+          Chain::Compiler.new.build_chain(texts_for_markov_analysis)
         end
         protected

data/lib/markovian/text_builder.rb CHANGED Viewed

@@ -1,27 +1,34 @@
 require 'markovian/utils/text_splitter'
+require 'markovian/text_builder/end_of_sentence_filter'
-# This class, given a Markov corpus, will attempt to construct a new text based on a given seed using
+# This class, given a Markov chain, will attempt to construct a new text based on a given seed using
 # the Markov associations.
 module Markovian
   class TextBuilder
-    attr_reader :seed_text, :corpus
-    def initialize(corpus)
-      @corpus = corpus
+    attr_reader :seed_text, :chain
+    def initialize(chain)
+      @chain = chain
     end
-    def construct(seed_text, length: 140, start_result_with_seed: false)
+    def construct(seed_text, length: 140, exclude_seed_text: false)
       # TODO: if we don't hit a result for the first pair, move backward through the original text
       # until we get something
-      seed_pair = identify_starter_text(seed_text)
-      result_with_next_word(
-        previous_pair: seed_pair,
-        result: start_result_with_seed ? seed_text : nil,
+      seed_components = split_seed_text(seed_text)
+      output = result_with_next_word(
+        previous_pair: identify_starter_text(seed_components),
+        result: exclude_seed_text ? [] : seed_components,
         length: length
       )
+      format_output(apply_filters(output))
     end
-    def identify_starter_text(raw_text)
-      seed_components = split_seed_text(raw_text)
+    protected
+    def apply_filters(output)
+      EndOfSentenceFilter.new.filtered_sentence(sentence_with_word_data(output))
+    end
+    def identify_starter_text(seed_components)
       if seed_components.length >= 2
         seed_components[-2..-1]
       else
@@ -30,15 +37,13 @@ module Markovian
       end
     end
-    protected
     def result_with_next_word(previous_pair:, result:, length:)
       previous_word, current_word = previous_pair
-      if next_word = corpus.next_word(current_word, previous_word: previous_word)
+      if next_word = chain.next_word(current_word, previous_word: previous_word)
         # we collect words in an array and only check the length of the formatted string, so
         # spacing and stripping are handled in one place (format_output)
-        interim_result = format_result_array([result, next_word])
-        if interim_result.length > length
+        interim_result = result + [next_word]
+        if format_output(interim_result).length > length
           result
         else
           result_with_next_word(
@@ -52,14 +57,18 @@ module Markovian
       end
     end
-    # Turn an array of words into an ongoing string
-    def format_result_array(array_of_words)
-      array_of_words.compact.map(&:strip).join(" ")
+    # Turn an array of Word objects into an ongoing string
+    def format_output(array_of_words)
+      array_of_words.compact.map(&:to_s).map(&:strip).join(" ")
+    end
+    def sentence_with_word_data(sentence)
+      @sentence_with_word_data ||= sentence.map {|word| chain.word_entry(word)}
     end
     def split_seed_text(seed_text)
       # We get back Tokeneyes::Word objects; they are only turned into strings when formatting output
-      Utils::TextSplitter.new(seed_text).components.map(&:to_s)
+      Utils::TextSplitter.new(seed_text).components
     end
   end
 end

data/lib/markovian/text_builder/end_of_sentence_filter.rb ADDED Viewed

@@ -0,0 +1,31 @@
+module Markovian
+  class TextBuilder
+    # This class will take a sentence and apply appropriate filters. It will roll back a sentence up
+    # to a certain number of words if those words have a low likelihood of ending the sentence.
+    # Future changes will increase the qualities filtered for.
+    class EndOfSentenceFilter
+      MAX_WORDS_FILTERED = 3
+      def filtered_sentence(sentence)
+        filter_unlikely_ending_words(sentence)
+      end
+      protected
+      def filter_unlikely_ending_words(current_sentence, words_filtered = 0)
+        return current_sentence if words_filtered >= MAX_WORDS_FILTERED
+        last_word = current_sentence.last
+        likelihood = last_word.likelihood_to_end_sentence
+        if likelihood && rand < likelihood
+          # if we pop a word, consider removing the next one
+          filter_unlikely_ending_words(current_sentence[0..-2], words_filtered + 1)
+        else
+          # otherwise keep the word: either it hasn't been seen enough to judge, or the random draw kept it
+          current_sentence
+        end
+      end
+    end
+  end
+end

data/lib/markovian/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Markovian
-  VERSION = "0.2.9"
+  VERSION = "0.3.0"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: markovian
 version: !ruby/object:Gem::Version
-  version: 0.2.9
+  version: 0.3.0
 platform: ruby
 authors:
 - Alex Koppel
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2015-09-28 00:00:00.000000000 Z
+date: 2015-10-09 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: tokeneyes
@@ -75,14 +75,14 @@ files:
 - db/seeds.rb
 - lib/.DS_Store
 - lib/markovian.rb
-- lib/markovian/corpus.rb
-- lib/markovian/corpus/chain.rb
-- lib/markovian/corpus/compiler.rb
-- lib/markovian/corpus/dictionary.rb
-- lib/markovian/corpus/dictionary_entry.rb
+- lib/markovian/chain.rb
+- lib/markovian/chain/compiler.rb
+- lib/markovian/chain/dictionary.rb
+- lib/markovian/chain/dictionary_entry.rb
 - lib/markovian/importers/twitter/csv_importer.rb
 - lib/markovian/importers/twitter/tweet.rb
 - lib/markovian/text_builder.rb
+- lib/markovian/text_builder/end_of_sentence_filter.rb
 - lib/markovian/utils/text_splitter.rb
 - lib/markovian/version.rb
 - markovian.gemspec

data/lib/markovian/corpus.rb DELETED Viewed

@@ -1,31 +0,0 @@
-require 'markovian/corpus/chain'
-# This class represents a pair of chains, one going forward and one going backward. With this, we
-# can construct phrases in which the original seed word appears at any point in the text (going
-# backward to create the earlier text, forward to create the rest).
-module Markovian
-  class Corpus
-    attr_reader :forward, :backward
-    def initialize
-      @forward, @backward = Chain.new, Chain.new
-    end
-    def next_word(word, previous_word: nil)
-      forward.next_word(word, previous_word: previous_word)
-    end
-    def previous_word(word, following_word: nil)
-      # backward goes in the opposite direction to forward
-      backward.next_word(word, previous_word: following_word)
-    end
-    def random_word
-      forward.random_word
-    end
-    def ==(other)
-      self.forward == other.forward &&
-        self.backward == other.backward
-    end
-  end
-end

data/lib/markovian/corpus/chain.rb DELETED Viewed

@@ -1,53 +0,0 @@
-require 'markovian/corpus/dictionary'
-# The Chain represents Markov info as it's being assembled or expanded from a text. To compensate
-# for small sample sizes, we track multiple chains (derived from both two-word phrases and single
-# word). Phrases are prefered, but if we can't find a match, we'll try with a single word.
-module Markovian
-  class Corpus
-    class Chain
-      def initialize
-        @one_key_dictionary = Dictionary.new
-        @two_key_dictionary = Dictionary.new
-      end
-      def lengthen(word, next_word:, previous_word: nil)
-        @one_key_dictionary.push(word, next_word)
-        @two_key_dictionary.push(two_word_key(previous_word, word), next_word)
-        word
-      end
-      def next_word(word, previous_word: nil)
-        result_for_two_words(previous_word, word) || result_for_one_word(word)
-      end
-      def random_word
-        one_key_dictionary.random_word
-      end
-      def ==(other)
-        self.one_key_dictionary == other.one_key_dictionary &&
-          self.two_key_dictionary == other.two_key_dictionary
-      end
-      protected
-      # for equality checking
-      attr_reader :one_key_dictionary, :two_key_dictionary
-      def result_for_two_words(previous_word, word)
-        @two_key_dictionary.next_word(two_word_key(previous_word, word)) if previous_word
-      end
-      def result_for_one_word(word)
-        @one_key_dictionary.next_word(word)
-      end
-      # We represent the two words as a space-delimited phrase for simplicity and speed of access via
-      # hash keys.
-      def two_word_key(word1, word2)
-        "#{word1} #{word2}"
-      end
-    end
-  end
-end

data/lib/markovian/corpus/dictionary_entry.rb DELETED Viewed

@@ -1,54 +0,0 @@
-module Markovian
-  class Corpus
-    class DictionaryEntry
-      attr_reader :word, :count
-      def initialize(word)
-        @word = word
-        @next_words = []
-        @previous_words = []
-        @count = 0
-      end
-      def push(word, direction: :forwards)
-        # The incoming word will be a Tokeneyes::Word object
-        array_for_direction(direction) << word.to_s
-        # we don't want to double-count words if we read the text both forward and backward, so
-        # only count in the forward direction. (If we encounter a scenario where someone only wants
-        # to read in the backward direction, we can deal with that then.)
-        @count += 1 if direction == :forwards
-      end
-      def next_word
-        next_words.sample
-      end
-      def previous_word
-        previous_words.sample
-      end
-      def ==(other)
-        self.word == other.word &&
-          self.next_words == other.next_words &&
-          self.previous_words == other.previous_words
-      end
-      protected
-      # for equality checking
-      attr_reader :next_words, :previous_words
-      VALID_DIRECTIONS = [:backwards, :forwards]
-      def array_for_direction(direction)
-        validate_direction(direction)
-        direction == :backwards ? previous_words : next_words
-      end
-      def validate_direction(direction)
-        unless VALID_DIRECTIONS.include?(direction)
-          raise ArgumentError.new("Invalid direction #{direction.inspect}, valid directions are #{VALID_DIRECTIONS.inspect}")
-        end
-      end
-    end
-  end
-end