markovian 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 86536007f535d2ba077ed8ff3aeabf61d9ee6107
4
- data.tar.gz: 61bd78dab63cafa0b0643ddb39793dec563beae5
3
+ metadata.gz: 1f03bf82f92106eab96196be22cb2e09eb3d022b
4
+ data.tar.gz: fe37c3945b518f64d617be2758aef29a5dbfc52a
5
5
  SHA512:
6
- metadata.gz: e2695ad84f944bbb850e62693f3a96b4678f7d01ef54a557f58804056b6dea5f0431ca9cd21c38365676c61ae505cd931696b125b666672b9c30e42e335ec095
7
- data.tar.gz: 4ba6686bba2a42acb3afc5effec994e18b168fab8c286db00afe1b401dc73cf09d9eb3f039e7aa66fa5202796795297ff1fd98d21963fd83f902ce25543b6d62
6
+ metadata.gz: 219a602a1d41dc3aaff820ad616d0690b0a42810f40e83e08a219537f6ba765855b2b8008e4af3fe0537570040d86b0f499222a3a9e5815a44a827dfadfe418e
7
+ data.tar.gz: d00c3cf9bc069f775edf0b496b5ccf79800129a50dbe3d0069b4401294393561cedbb9f3606a855ffa6081e65cf0cd3272bf3861f777d4cd8883dc9353a9a9e7
data/.gitignore CHANGED
@@ -16,3 +16,4 @@
16
16
  /tmp
17
17
  .DS_Store
18
18
  Gemfile.lock
19
+ pkg
data/.ruby-gemset CHANGED
@@ -1 +1 @@
1
- markov-ahkoppel
1
+ markovian
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.2
1
+ 2.2.3
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
1
+ language: ruby
2
+ cache: bundler
3
+ rvm:
4
+ - 2.2
5
+ - jruby-9000
data/Gemfile CHANGED
@@ -4,7 +4,7 @@ source 'https://rubygems.org'
4
4
  gemspec
5
5
 
6
6
  group :development, :test do
7
- gem 'byebug'
7
+ gem 'byebug', platform: :mri
8
8
  end
9
9
 
10
10
  group :test do
data/README.md CHANGED
@@ -19,20 +19,11 @@ Fuller documentation will come shortly. For now, let's see how we can use Markov
19
19
  => path_to_twitter_archive
20
20
  > importer = Markovian::Importers::Twitter::CsvImporter.new(path)
21
21
  => #<Markovian::Importers::Twitter::CsvImporter:0x007fd0ca3282a8 @path=path_to_twitter_archive>
22
- > tweets = importer.texts_for_markov_analysis; puts tweets.count
23
- 14394
24
- => nil
25
-
26
- # Create a Chainset (the structure holding all the word relations)...
27
- > chainset = Markovian::ChainSet.new
28
- => #<Markovian::ChainSet:0x007fd0ca03df70 ...>
29
- # And add all the tweets to the Markov dictionary
30
- > tweets.each {|t| Markovian::Chain::TextCompiler.new(t, chainset).incorporate_into_chain}; puts "done."
31
- done.
32
- => nil
33
-
22
+ # now assemble the corpus of tweets -- this may take a few seconds to compile
23
+ > corpus = importer.corpus
24
+ => #<Markovian::Corpus:0x007fd0ca03df70 ...>
34
25
  # Now, we can build some text!
35
- > Markovian::TextBuilder.new("markov", chainset).construct
26
+ > Markovian::TextBuilder.new(corpus).construct("markov")
36
27
  => "markov chains a lot better than a month, i've been here half an hour of night when you can get behind belgium for the offline train journey"
37
28
  ```
38
29
 
data/changelog.md CHANGED
@@ -1,8 +1,18 @@
1
1
  # CHANGELOG
2
2
 
3
+ ## 0.2.0
4
+
5
+ * Rename Chainset/Chain to Corpus (better name is better)
6
+ * Add Corpus#random_word to provide a starting place for texts
7
+ * Refactor: rename Corpus::TextCompiler to Corpus::Compiler
8
+ * Add equality operators for Corpus/Chain/Dictionary
9
+ * Add Twitter::CsvImporter#corpus convenience method
10
+ * TextBuilder has a better interface now
11
+ * Dictionary#inspect produces sane output, not the entire dictionary contents
12
+
3
13
  ## 0.1.0 and below
4
14
 
5
- * Ability to build bidirectional chainsets (pair of chains) from arrays of texts
15
+ * Ability to build bidirectional corpora (each a pair of chains) from arrays of texts
6
16
  * Ability to import Twitter archives and produce an array of tweets
7
- * Ability to generate Markovian texts from a chainset
17
+ * Ability to generate Markovian texts from a corpus
8
18
  * Gem framework
@@ -0,0 +1,52 @@
1
+ require 'markovian/corpus/dictionary'
2
+
3
+ # The Chain represents Markov info as it's being assembled or expanded from a text. To compensate
4
+ # for small sample sizes, we track multiple chains (derived from both two-word phrases and single
5
+ # word). Phrases are preferred, but if we can't find a match, we'll try with a single word.
6
+ module Markovian
7
+ class Corpus
8
+ class Chain
9
+ def initialize
10
+ @one_key_dictionary = Dictionary.new
11
+ @two_key_dictionary = Dictionary.new
12
+ end
13
+
14
+ attr_reader :one_key_dictionary, :two_key_dictionary
15
+ def lengthen(word, next_word:, previous_word: nil)
16
+ @one_key_dictionary.push(word, next_word)
17
+ @two_key_dictionary.push(two_word_key(previous_word, word), next_word)
18
+ word
19
+ end
20
+
21
+ def next_word(word, previous_word: nil)
22
+ result_for_two_words(previous_word, word) || result_for_one_word(word)
23
+ end
24
+
25
+ def random_word
26
+ one_key_dictionary.random_word
27
+ end
28
+
29
+ def ==(other)
30
+ self.one_key_dictionary == other.one_key_dictionary &&
31
+ self.two_key_dictionary == other.two_key_dictionary
32
+ end
33
+
34
+ protected
35
+
36
+ def result_for_two_words(previous_word, word)
37
+ @two_key_dictionary.next_word(two_word_key(previous_word, word)) if previous_word
38
+ end
39
+
40
+ def result_for_one_word(word)
41
+ @one_key_dictionary.next_word(word)
42
+ end
43
+
44
+ # We represent the two words as a space-delimited phrase for simplicity and speed of access via
45
+ # hash keys.
46
+ def two_word_key(word1, word2)
47
+ "#{word1} #{word2}"
48
+ end
49
+ end
50
+ end
51
+ end
52
+
@@ -1,6 +1,6 @@
1
1
  require 'markovian/utils/text_splitter'
2
2
 
3
- # Given a text to analyze, this class returns a hash of Markov results: two-word phrases (two by
3
+ # Given a piece of text, this class returns a hash of Markov results: two-word phrases (two by
4
4
  # default) pointing to an array of historical next words.
5
5
  #
6
6
  # So, for instance, the phrase "Cats are cute, cats are annoying" would map to:
@@ -17,34 +17,38 @@ require 'markovian/utils/text_splitter'
17
17
  # * Handling sentences or newlines is later -- I'm not sure the right way to do it.
18
18
  # * Capitalization is deferred for later.
19
19
  module Markovian
20
- class Chain
21
- class TextCompiler
20
+ class Corpus
21
+ class Compiler
22
22
  # Pass in a text, and optionally an existing Markov chain to add data to. In many cases, you
23
23
  # may be building a chain using a set of smaller texts instead of one large text (dialog,
24
24
  # for instance, or Twitter archives), and so may call this class repeatedly for elements of
25
25
  # the parent corpus.
26
- attr_reader :text, :chainset
27
- def initialize(text, starter_chainset = ChainSet.new)
28
- @text = text
29
- @chainset = starter_chainset
26
+ attr_reader :corpus
27
+ def initialize(starter_corpus = Corpus.new)
28
+ @corpus = starter_corpus
30
29
  end
31
30
 
32
- def incorporate_into_chain
33
- add_text_to_chain(interesting_split_text, forward_chain)
34
- # to assemble backward text, we just create a chainset with all the texts reversed
31
+ def build_corpus(texts)
32
+ texts.each {|t| incorporate_text_into_corpus(t)}
33
+ corpus
34
+ end
35
+
36
+ def incorporate_text_into_corpus(text)
37
+ add_text_to_chain(split_into_components(text), forward_chain)
38
+ # to assemble backward text, we just create a corpus with all the texts reversed
35
39
  # that allows us to see what words precede any given word
36
- add_text_to_chain(interesting_split_text.reverse, backward_chain)
37
- chainset
40
+ add_text_to_chain(split_into_components(text).reverse, backward_chain)
41
+ corpus
38
42
  end
39
43
 
40
44
  protected
41
45
 
42
46
  def forward_chain
43
- chainset.forward
47
+ corpus.forward
44
48
  end
45
49
 
46
50
  def backward_chain
47
- chainset.backward
51
+ corpus.backward
48
52
  end
49
53
 
50
54
  def add_text_to_chain(text_elements, chain)
@@ -58,8 +62,8 @@ module Markovian
58
62
  end
59
63
  end
60
64
 
61
- def interesting_split_text
62
- @interesting_split_text ||= Utils::TextSplitter.new(text).components
65
+ def split_into_components(text)
66
+ Utils::TextSplitter.new(text).components
63
67
  end
64
68
  end
65
69
  end
@@ -4,7 +4,7 @@
4
4
  #
5
5
  # The key is an opaque value, which could represent either a single word or a phrase as desired.
6
6
  module Markovian
7
- class Chain
7
+ class Corpus
8
8
  class Dictionary
9
9
  def push(key, word)
10
10
  dictionary[key] += [word]
@@ -14,6 +14,21 @@ module Markovian
14
14
  dictionary[key].sample
15
15
  end
16
16
 
17
+ def random_word
18
+ dictionary.keys.sample
19
+ end
20
+
21
+ def ==(other)
22
+ self.dictionary == other.dictionary
23
+ end
24
+
25
+ # We override this method to avoid spitting out every single element in the dictionary if
26
+ # this (or any object containing it) gets inspected.
27
+ # See http://stackoverflow.com/questions/5771339/emulate-default-objectinspect-output.
28
+ def inspect
29
+ "#<#{self.class}:0x#{__id__.to_s(16)} @dictionary: #{dictionary.length} entries>"
30
+ end
31
+
17
32
  protected
18
33
 
19
34
  def dictionary
@@ -1,10 +1,10 @@
1
- require 'markovian/chain'
1
+ require 'markovian/corpus/chain'
2
2
 
3
3
  # This class represents a pair of chains, one going forward and one going backward. With this, we
4
4
  # can construct phrases in which the original seed word appears at any point in the text (going
5
5
  # backward to create the earlier text, forward to create the rest).
6
6
  module Markovian
7
- class ChainSet
7
+ class Corpus
8
8
  attr_reader :forward, :backward
9
9
  def initialize
10
10
  @forward, @backward = Chain.new, Chain.new
@@ -18,5 +18,14 @@ module Markovian
18
18
  # backward goes in the opposite direction to forward
19
19
  backward.next_word(word, previous_word: following_word)
20
20
  end
21
+
22
+ def random_word
23
+ forward.random_word
24
+ end
25
+
26
+ def ==(other)
27
+ self.forward == other.forward &&
28
+ self.backward == other.backward
29
+ end
21
30
  end
22
31
  end
@@ -17,6 +17,10 @@ module Markovian
17
17
  tweet_enumerator.reject {|t| t.empty?}
18
18
  end
19
19
 
20
+ def corpus
21
+ Corpus::Compiler.new.build_corpus(texts_for_markov_analysis)
22
+ end
23
+
20
24
  protected
21
25
 
22
26
  def csv_enumerator
@@ -1,31 +1,32 @@
1
1
  require 'markovian/utils/text_splitter'
2
2
 
3
- # This class, given a seed word and a Markov chain_set, will attempt to construct a new text using
3
+ # This class, given a Markov corpus, will attempt to construct a new text based on a given seed using
4
4
  # the Markov associations.
5
5
  module Markovian
6
6
  class TextBuilder
7
- attr_reader :seed_text, :chain_set
8
- def initialize(seed_text, chain_set)
9
- @seed_text = seed_text
10
- @chain_set = chain_set
7
+ attr_reader :seed_text, :corpus
8
+ def initialize(corpus)
9
+ @corpus = corpus
11
10
  end
12
11
 
13
- def construct(length: 140, seed: default_seed, start_result_with_seed_word: false)
12
+ def construct(seed_text, length: 140, start_result_with_seed: false)
14
13
  # TODO: if we don't hit a result for the first pair, move backward through the original text
15
14
  # until we get something
15
+ seed_pair = identify_starter_text(seed_text)
16
16
  result_with_next_word(
17
- previous_pair: seed,
18
- result: start_result_with_seed_word ? format_result_array(seed) : nil,
17
+ previous_pair: seed_pair,
18
+ result: start_result_with_seed ? seed_text : nil,
19
19
  length: length
20
20
  )
21
21
  end
22
22
 
23
- def default_seed
24
- if split_seed_text.length >= 2
25
- split_seed_text[-2..-1]
23
+ def identify_starter_text(raw_text)
24
+ seed_components = split_seed_text(raw_text)
25
+ if seed_components.length >= 2
26
+ seed_components[-2..-1]
26
27
  else
27
28
  # if we only have a one-word seed text, the previous word is nil
28
- [nil, split_seed_text.first]
29
+ [nil, seed_components.first]
29
30
  end
30
31
  end
31
32
 
@@ -33,7 +34,7 @@ module Markovian
33
34
 
34
35
  def result_with_next_word(previous_pair:, result:, length:)
35
36
  previous_word, current_word = previous_pair
36
- if next_word = chain_set.next_word(current_word, previous_word: previous_word)
37
+ if next_word = corpus.next_word(current_word, previous_word: previous_word)
37
38
  # we use join rather than + to avoid leading spaces, and strip to ignore leading nils or
38
39
  # empty strings
39
40
  interim_result = format_result_array([result, next_word])
@@ -56,8 +57,8 @@ module Markovian
56
57
  array_of_words.compact.map(&:strip).join(" ")
57
58
  end
58
59
 
59
- def split_seed_text
60
- @split_seed_text ||= Utils::TextSplitter.new(seed_text).components
60
+ def split_seed_text(seed_text)
61
+ Utils::TextSplitter.new(seed_text).components
61
62
  end
62
63
  end
63
64
  end
@@ -1,3 +1,3 @@
1
1
  module Markovian
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
data/lib/markovian.rb CHANGED
@@ -1,7 +1,6 @@
1
- require 'oj'
2
- require 'markovian/chain_set'
3
1
  require 'markovian/text_builder'
4
- require 'markovian/chain/text_compiler'
2
+ require 'markovian/corpus'
3
+ require 'markovian/corpus/compiler'
5
4
  # importers
6
5
  require 'markovian/importers/twitter/csv_importer'
7
6
 
data/markovian.gemspec CHANGED
@@ -19,7 +19,6 @@ Gem::Specification.new do |spec|
19
19
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
20
  spec.require_paths = ["lib"]
21
21
 
22
- spec.add_runtime_dependency "oj"
23
22
  spec.add_development_dependency "bundler", "~> 1.7"
24
23
  spec.add_development_dependency "rake", "~> 10.0"
25
24
  end
metadata CHANGED
@@ -1,29 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: markovian
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alex Koppel
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-09-09 00:00:00.000000000 Z
11
+ date: 2015-09-14 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: oj
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ">="
18
- - !ruby/object:Gem::Version
19
- version: '0'
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - ">="
25
- - !ruby/object:Gem::Version
26
- version: '0'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: bundler
29
15
  requirement: !ruby/object:Gem::Requirement
@@ -63,6 +49,7 @@ files:
63
49
  - ".rspec"
64
50
  - ".ruby-gemset"
65
51
  - ".ruby-version"
52
+ - ".travis.yml"
66
53
  - CODE_OF_CONDUCT.md
67
54
  - Gemfile
68
55
  - LICENSE.txt
@@ -74,10 +61,10 @@ files:
74
61
  - db/seeds.rb
75
62
  - lib/.DS_Store
76
63
  - lib/markovian.rb
77
- - lib/markovian/chain.rb
78
- - lib/markovian/chain/dictionary.rb
79
- - lib/markovian/chain/text_compiler.rb
80
- - lib/markovian/chain_set.rb
64
+ - lib/markovian/corpus.rb
65
+ - lib/markovian/corpus/chain.rb
66
+ - lib/markovian/corpus/compiler.rb
67
+ - lib/markovian/corpus/dictionary.rb
81
68
  - lib/markovian/importers/twitter/csv_importer.rb
82
69
  - lib/markovian/importers/twitter/tweet.rb
83
70
  - lib/markovian/text_builder.rb
@@ -104,7 +91,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
104
91
  version: '0'
105
92
  requirements: []
106
93
  rubyforge_project:
107
- rubygems_version: 2.4.6
94
+ rubygems_version: 2.4.5.1
108
95
  signing_key:
109
96
  specification_version: 4
110
97
  summary: A simple, hopefully easy-to-use Markov chain generator.
@@ -1,41 +0,0 @@
1
- require 'markovian/chain/dictionary'
2
-
3
- # The Chain represents Markov info as it's being assembled or expanded from a text. To compensate
4
- # for small sample sizes, we track multiple chains (derived from both two-word phrases and single
5
- # word). Phrases are prefered, but if we can't find a match, we'll try with a single word.
6
- module Markovian
7
- class Chain
8
- def initialize
9
- @one_key_dictionary = Dictionary.new
10
- @two_key_dictionary = Dictionary.new
11
- end
12
-
13
- attr_reader :one_key_dictionary, :two_key_dictionary
14
- def lengthen(word, next_word:, previous_word: nil)
15
- @one_key_dictionary.push(word, next_word)
16
- @two_key_dictionary.push(two_word_key(previous_word, word), next_word)
17
- word
18
- end
19
-
20
- def next_word(word, previous_word: nil)
21
- result_for_two_words(previous_word, word) || result_for_one_word(word)
22
- end
23
-
24
- protected
25
-
26
- def result_for_two_words(previous_word, word)
27
- @two_key_dictionary.next_word(two_word_key(previous_word, word)) if previous_word
28
- end
29
-
30
- def result_for_one_word(word)
31
- @one_key_dictionary.next_word(word)
32
- end
33
-
34
- # We represent the two words as a space-delimited phrase for simplicity and speed of access via
35
- # hash keys.
36
- def two_word_key(word1, word2)
37
- "#{word1} #{word2}"
38
- end
39
- end
40
- end
41
-