markovian 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 86536007f535d2ba077ed8ff3aeabf61d9ee6107
4
- data.tar.gz: 61bd78dab63cafa0b0643ddb39793dec563beae5
3
+ metadata.gz: 1f03bf82f92106eab96196be22cb2e09eb3d022b
4
+ data.tar.gz: fe37c3945b518f64d617be2758aef29a5dbfc52a
5
5
  SHA512:
6
- metadata.gz: e2695ad84f944bbb850e62693f3a96b4678f7d01ef54a557f58804056b6dea5f0431ca9cd21c38365676c61ae505cd931696b125b666672b9c30e42e335ec095
7
- data.tar.gz: 4ba6686bba2a42acb3afc5effec994e18b168fab8c286db00afe1b401dc73cf09d9eb3f039e7aa66fa5202796795297ff1fd98d21963fd83f902ce25543b6d62
6
+ metadata.gz: 219a602a1d41dc3aaff820ad616d0690b0a42810f40e83e08a219537f6ba765855b2b8008e4af3fe0537570040d86b0f499222a3a9e5815a44a827dfadfe418e
7
+ data.tar.gz: d00c3cf9bc069f775edf0b496b5ccf79800129a50dbe3d0069b4401294393561cedbb9f3606a855ffa6081e65cf0cd3272bf3861f777d4cd8883dc9353a9a9e7
data/.gitignore CHANGED
@@ -16,3 +16,4 @@
16
16
  /tmp
17
17
  .DS_Store
18
18
  Gemfile.lock
19
+ pkg
data/.ruby-gemset CHANGED
@@ -1 +1 @@
1
- markov-ahkoppel
1
+ markovian
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.2
1
+ 2.2.3
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
1
+ language: ruby
2
+ cache: bundler
3
+ rvm:
4
+ - 2.2
5
+ - jruby-9000
data/Gemfile CHANGED
@@ -4,7 +4,7 @@ source 'https://rubygems.org'
4
4
  gemspec
5
5
 
6
6
  group :development, :test do
7
- gem 'byebug'
7
+ gem 'byebug', platform: :mri
8
8
  end
9
9
 
10
10
  group :test do
data/README.md CHANGED
@@ -19,20 +19,11 @@ Fuller documentation will come shortly. For now, let's see how we can use Markov
19
19
  => path_to_twitter_archive
20
20
  > importer = Markovian::Importers::Twitter::CsvImporter.new(path)
21
21
  => #<Markovian::Importers::Twitter::CsvImporter:0x007fd0ca3282a8 @path=path_to_twitter_archive>
22
- > tweets = importer.texts_for_markov_analysis; puts tweets.count
23
- 14394
24
- => nil
25
-
26
- # Create a Chainset (the structure holding all the word relations)...
27
- > chainset = Markovian::ChainSet.new
28
- => #<Markovian::ChainSet:0x007fd0ca03df70 ...>
29
- # And add all the tweets to the Markov dictionary
30
- > tweets.each {|t| Markovian::Chain::TextCompiler.new(t, chainset).incorporate_into_chain}; puts "done."
31
- done.
32
- => nil
33
-
22
+ # now assemble the corpus of tweets -- this may take a few seconds to compile
23
+ > corpus = importer.corpus
24
+ => #<Markovian::Corpus:0x007fd0ca03df70 ...>
34
25
  # Now, we can build some text!
35
- > Markovian::TextBuilder.new("markov", chainset).construct
26
+ > Markovian::TextBuilder.new(corpus).construct("markov")
36
27
  => "markov chains a lot better than a month, i've been here half an hour of night when you can get behind belgium for the offline train journey"
37
28
  ```
38
29
 
data/changelog.md CHANGED
@@ -1,8 +1,18 @@
1
1
  # CHANGELOG
2
2
 
3
+ ## 0.2.0
4
+
5
+ * Rename Chainset/Chain to Corpus (better name is better)
6
+ * Add Corpus#random_word to provide a starting place for texts
7
+ * Refactor: rename Corpus::TextCompiler to Corpus::Compiler
8
+ * Add equality operators for Corpus/Chain/Dictionary
9
+ * Add Twitter::CsvImporter.corpus convenience method
10
+ * TextBuilder has a better interface now
11
+ * Dictionary#inspect produces sane output, not the entire dictionary contents
12
+
3
13
  ## 0.1.0 and below
4
14
 
5
- * Ability to build bidirectional chainsets (pair of chains) from arrays of texts
15
+ * Ability to build bidirectional corpora (pair of chains) from arrays of texts
6
16
  * Ability to import Twitter archives and produce an array of tweets
7
- * Ability to generate Markovian texts from a chainset
17
+ * Ability to generate Markovian texts from a corpus
8
18
  * Gem framework
@@ -0,0 +1,52 @@
1
+ require 'markovian/corpus/dictionary'
2
+
3
+ # The Chain represents Markov info as it's being assembled or expanded from a text. To compensate
4
+ # for small sample sizes, we track multiple chains (derived from both two-word phrases and single
5
+ # word). Phrases are preferred, but if we can't find a match, we'll try with a single word.
6
+ module Markovian
7
+ class Corpus
8
+ class Chain
9
+ def initialize
10
+ @one_key_dictionary = Dictionary.new
11
+ @two_key_dictionary = Dictionary.new
12
+ end
13
+
14
+ attr_reader :one_key_dictionary, :two_key_dictionary
15
+ def lengthen(word, next_word:, previous_word: nil)
16
+ @one_key_dictionary.push(word, next_word)
17
+ @two_key_dictionary.push(two_word_key(previous_word, word), next_word)
18
+ word
19
+ end
20
+
21
+ def next_word(word, previous_word: nil)
22
+ result_for_two_words(previous_word, word) || result_for_one_word(word)
23
+ end
24
+
25
+ def random_word
26
+ one_key_dictionary.random_word
27
+ end
28
+
29
+ def ==(other)
30
+ self.one_key_dictionary == other.one_key_dictionary &&
31
+ self.two_key_dictionary == other.two_key_dictionary
32
+ end
33
+
34
+ protected
35
+
36
+ def result_for_two_words(previous_word, word)
37
+ @two_key_dictionary.next_word(two_word_key(previous_word, word)) if previous_word
38
+ end
39
+
40
+ def result_for_one_word(word)
41
+ @one_key_dictionary.next_word(word)
42
+ end
43
+
44
+ # We represent the two words as a space-delimited phrase for simplicity and speed of access via
45
+ # hash keys.
46
+ def two_word_key(word1, word2)
47
+ "#{word1} #{word2}"
48
+ end
49
+ end
50
+ end
51
+ end
52
+
@@ -1,6 +1,6 @@
1
1
  require 'markovian/utils/text_splitter'
2
2
 
3
- # Given a text to analyze, this class returns a hash of Markov results: two-word phrases (two by
3
+ # Given a piece of text, this class returns a hash of Markov results: two-word phrases (two by
4
4
  # default) pointing to an array of historical next words.
5
5
  #
6
6
  # So, for instance, the phrase "Cats are cute, cats are annoying" would map to:
@@ -17,34 +17,38 @@ require 'markovian/utils/text_splitter'
17
17
  # * Handling sentences or newlines is later -- I'm not sure the right way to do it.
18
18
  # * Capitalization is deferred for later.
19
19
  module Markovian
20
- class Chain
21
- class TextCompiler
20
+ class Corpus
21
+ class Compiler
22
22
  # Pass in a text, and optionally an existing Markov chain to add data to. In many cases, you
23
23
  # may be building a chain using a set of smaller texts instead of one large text (dialog,
24
24
  # for instance, or Twitter archives), and so may call this class repeatedly for elements of
25
25
  # the parent corpus.
26
- attr_reader :text, :chainset
27
- def initialize(text, starter_chainset = ChainSet.new)
28
- @text = text
29
- @chainset = starter_chainset
26
+ attr_reader :corpus
27
+ def initialize(starter_corpus = Corpus.new)
28
+ @corpus = starter_corpus
30
29
  end
31
30
 
32
- def incorporate_into_chain
33
- add_text_to_chain(interesting_split_text, forward_chain)
34
- # to assemble backward text, we just create a chainset with all the texts reversed
31
+ def build_corpus(texts)
32
+ texts.each {|t| incorporate_text_into_corpus(t)}
33
+ corpus
34
+ end
35
+
36
+ def incorporate_text_into_corpus(text)
37
+ add_text_to_chain(split_into_components(text), forward_chain)
38
+ # to assemble backward text, we just create a corpus with all the texts reversed
35
39
  # that allows us to see what words precede any given word
36
- add_text_to_chain(interesting_split_text.reverse, backward_chain)
37
- chainset
40
+ add_text_to_chain(split_into_components(text).reverse, backward_chain)
41
+ corpus
38
42
  end
39
43
 
40
44
  protected
41
45
 
42
46
  def forward_chain
43
- chainset.forward
47
+ corpus.forward
44
48
  end
45
49
 
46
50
  def backward_chain
47
- chainset.backward
51
+ corpus.backward
48
52
  end
49
53
 
50
54
  def add_text_to_chain(text_elements, chain)
@@ -58,8 +62,8 @@ module Markovian
58
62
  end
59
63
  end
60
64
 
61
- def interesting_split_text
62
- @interesting_split_text ||= Utils::TextSplitter.new(text).components
65
+ def split_into_components(text)
66
+ Utils::TextSplitter.new(text).components
63
67
  end
64
68
  end
65
69
  end
@@ -4,7 +4,7 @@
4
4
  #
5
5
  # The key is an opaque value, which could represent either a single word or a phrase as desired.
6
6
  module Markovian
7
- class Chain
7
+ class Corpus
8
8
  class Dictionary
9
9
  def push(key, word)
10
10
  dictionary[key] += [word]
@@ -14,6 +14,21 @@ module Markovian
14
14
  dictionary[key].sample
15
15
  end
16
16
 
17
+ def random_word
18
+ dictionary.keys.sample
19
+ end
20
+
21
+ def ==(other)
22
+ self.dictionary == other.dictionary
23
+ end
24
+
25
+ # We override this method to avoid spitting out every single element in the dictionary if
26
+ # this (or any object containing it) gets inspected.
27
+ # See http://stackoverflow.com/questions/5771339/emulate-default-objectinspect-output.
28
+ def inspect
29
+ "#<#{self.class}:0x#{__id__.to_s(16)} @dictionary: #{dictionary.length} entries>"
30
+ end
31
+
17
32
  protected
18
33
 
19
34
  def dictionary
@@ -1,10 +1,10 @@
1
- require 'markovian/chain'
1
+ require 'markovian/corpus/chain'
2
2
 
3
3
  # This class represents a pair of chains, one going forward and one going backward. With this, we
4
4
  # can construct phrases in which the original seed word appears at any point in the text (going
5
5
  # backward to create the earlier text, forward to create the rest).
6
6
  module Markovian
7
- class ChainSet
7
+ class Corpus
8
8
  attr_reader :forward, :backward
9
9
  def initialize
10
10
  @forward, @backward = Chain.new, Chain.new
@@ -18,5 +18,14 @@ module Markovian
18
18
  # backward goes in the opposite direction to forward
19
19
  backward.next_word(word, previous_word: following_word)
20
20
  end
21
+
22
+ def random_word
23
+ forward.random_word
24
+ end
25
+
26
+ def ==(other)
27
+ self.forward == other.forward &&
28
+ self.backward == other.backward
29
+ end
21
30
  end
22
31
  end
@@ -17,6 +17,10 @@ module Markovian
17
17
  tweet_enumerator.reject {|t| t.empty?}
18
18
  end
19
19
 
20
+ def corpus
21
+ Corpus::Compiler.new.build_corpus(texts_for_markov_analysis)
22
+ end
23
+
20
24
  protected
21
25
 
22
26
  def csv_enumerator
@@ -1,31 +1,32 @@
1
1
  require 'markovian/utils/text_splitter'
2
2
 
3
- # This class, given a seed word and a Markov chain_set, will attempt to construct a new text using
3
+ # This class, given a Markov corpus, will attempt to construct a new text based on a given seed using
4
4
  # the Markov associations.
5
5
  module Markovian
6
6
  class TextBuilder
7
- attr_reader :seed_text, :chain_set
8
- def initialize(seed_text, chain_set)
9
- @seed_text = seed_text
10
- @chain_set = chain_set
7
+ attr_reader :seed_text, :corpus
8
+ def initialize(corpus)
9
+ @corpus = corpus
11
10
  end
12
11
 
13
- def construct(length: 140, seed: default_seed, start_result_with_seed_word: false)
12
+ def construct(seed_text, length: 140, start_result_with_seed: false)
14
13
  # TODO: if we don't hit a result for the first pair, move backward through the original text
15
14
  # until we get something
15
+ seed_pair = identify_starter_text(seed_text)
16
16
  result_with_next_word(
17
- previous_pair: seed,
18
- result: start_result_with_seed_word ? format_result_array(seed) : nil,
17
+ previous_pair: seed_pair,
18
+ result: start_result_with_seed ? seed_text : nil,
19
19
  length: length
20
20
  )
21
21
  end
22
22
 
23
- def default_seed
24
- if split_seed_text.length >= 2
25
- split_seed_text[-2..-1]
23
+ def identify_starter_text(raw_text)
24
+ seed_components = split_seed_text(raw_text)
25
+ if seed_components.length >= 2
26
+ seed_components[-2..-1]
26
27
  else
27
28
  # if we only have a one-word seed text, the previous word is nil
28
- [nil, split_seed_text.first]
29
+ [nil, seed_components.first]
29
30
  end
30
31
  end
31
32
 
@@ -33,7 +34,7 @@ module Markovian
33
34
 
34
35
  def result_with_next_word(previous_pair:, result:, length:)
35
36
  previous_word, current_word = previous_pair
36
- if next_word = chain_set.next_word(current_word, previous_word: previous_word)
37
+ if next_word = corpus.next_word(current_word, previous_word: previous_word)
37
38
  # we use join rather than + to avoid leading spaces, and strip to ignore leading nils or
38
39
  # empty strings
39
40
  interim_result = format_result_array([result, next_word])
@@ -56,8 +57,8 @@ module Markovian
56
57
  array_of_words.compact.map(&:strip).join(" ")
57
58
  end
58
59
 
59
- def split_seed_text
60
- @split_seed_text ||= Utils::TextSplitter.new(seed_text).components
60
+ def split_seed_text(seed_text)
61
+ Utils::TextSplitter.new(seed_text).components
61
62
  end
62
63
  end
63
64
  end
@@ -1,3 +1,3 @@
1
1
  module Markovian
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
data/lib/markovian.rb CHANGED
@@ -1,7 +1,6 @@
1
- require 'oj'
2
- require 'markovian/chain_set'
3
1
  require 'markovian/text_builder'
4
- require 'markovian/chain/text_compiler'
2
+ require 'markovian/corpus'
3
+ require 'markovian/corpus/compiler'
5
4
  # importers
6
5
  require 'markovian/importers/twitter/csv_importer'
7
6
 
data/markovian.gemspec CHANGED
@@ -19,7 +19,6 @@ Gem::Specification.new do |spec|
19
19
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
20
  spec.require_paths = ["lib"]
21
21
 
22
- spec.add_runtime_dependency "oj"
23
22
  spec.add_development_dependency "bundler", "~> 1.7"
24
23
  spec.add_development_dependency "rake", "~> 10.0"
25
24
  end
metadata CHANGED
@@ -1,29 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: markovian
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alex Koppel
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2015-09-09 00:00:00.000000000 Z
11
+ date: 2015-09-14 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: oj
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ">="
18
- - !ruby/object:Gem::Version
19
- version: '0'
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - ">="
25
- - !ruby/object:Gem::Version
26
- version: '0'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: bundler
29
15
  requirement: !ruby/object:Gem::Requirement
@@ -63,6 +49,7 @@ files:
63
49
  - ".rspec"
64
50
  - ".ruby-gemset"
65
51
  - ".ruby-version"
52
+ - ".travis.yml"
66
53
  - CODE_OF_CONDUCT.md
67
54
  - Gemfile
68
55
  - LICENSE.txt
@@ -74,10 +61,10 @@ files:
74
61
  - db/seeds.rb
75
62
  - lib/.DS_Store
76
63
  - lib/markovian.rb
77
- - lib/markovian/chain.rb
78
- - lib/markovian/chain/dictionary.rb
79
- - lib/markovian/chain/text_compiler.rb
80
- - lib/markovian/chain_set.rb
64
+ - lib/markovian/corpus.rb
65
+ - lib/markovian/corpus/chain.rb
66
+ - lib/markovian/corpus/compiler.rb
67
+ - lib/markovian/corpus/dictionary.rb
81
68
  - lib/markovian/importers/twitter/csv_importer.rb
82
69
  - lib/markovian/importers/twitter/tweet.rb
83
70
  - lib/markovian/text_builder.rb
@@ -104,7 +91,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
104
91
  version: '0'
105
92
  requirements: []
106
93
  rubyforge_project:
107
- rubygems_version: 2.4.6
94
+ rubygems_version: 2.4.5.1
108
95
  signing_key:
109
96
  specification_version: 4
110
97
  summary: A simple, hopefully easy-to-use Markov chain generator.
@@ -1,41 +0,0 @@
1
- require 'markovian/chain/dictionary'
2
-
3
- # The Chain represents Markov info as it's being assembled or expanded from a text. To compensate
4
- # for small sample sizes, we track multiple chains (derived from both two-word phrases and single
5
- # word). Phrases are preferred, but if we can't find a match, we'll try with a single word.
6
- module Markovian
7
- class Chain
8
- def initialize
9
- @one_key_dictionary = Dictionary.new
10
- @two_key_dictionary = Dictionary.new
11
- end
12
-
13
- attr_reader :one_key_dictionary, :two_key_dictionary
14
- def lengthen(word, next_word:, previous_word: nil)
15
- @one_key_dictionary.push(word, next_word)
16
- @two_key_dictionary.push(two_word_key(previous_word, word), next_word)
17
- word
18
- end
19
-
20
- def next_word(word, previous_word: nil)
21
- result_for_two_words(previous_word, word) || result_for_one_word(word)
22
- end
23
-
24
- protected
25
-
26
- def result_for_two_words(previous_word, word)
27
- @two_key_dictionary.next_word(two_word_key(previous_word, word)) if previous_word
28
- end
29
-
30
- def result_for_one_word(word)
31
- @one_key_dictionary.next_word(word)
32
- end
33
-
34
- # We represent the two words as a space-delimited phrase for simplicity and speed of access via
35
- # hash keys.
36
- def two_word_key(word1, word2)
37
- "#{word1} #{word2}"
38
- end
39
- end
40
- end
41
-