markovian 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.ruby-gemset +1 -1
- data/.ruby-version +1 -1
- data/.travis.yml +5 -0
- data/Gemfile +1 -1
- data/README.md +4 -13
- data/changelog.md +12 -2
- data/lib/markovian/corpus/chain.rb +52 -0
- data/lib/markovian/{chain/text_compiler.rb → corpus/compiler.rb} +20 -16
- data/lib/markovian/{chain → corpus}/dictionary.rb +16 -1
- data/lib/markovian/{chain_set.rb → corpus.rb} +11 -2
- data/lib/markovian/importers/twitter/csv_importer.rb +4 -0
- data/lib/markovian/text_builder.rb +16 -15
- data/lib/markovian/version.rb +1 -1
- data/lib/markovian.rb +2 -3
- data/markovian.gemspec +0 -1
- metadata +8 -21
- data/lib/markovian/chain.rb +0 -41
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1f03bf82f92106eab96196be22cb2e09eb3d022b
|
4
|
+
data.tar.gz: fe37c3945b518f64d617be2758aef29a5dbfc52a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 219a602a1d41dc3aaff820ad616d0690b0a42810f40e83e08a219537f6ba765855b2b8008e4af3fe0537570040d86b0f499222a3a9e5815a44a827dfadfe418e
|
7
|
+
data.tar.gz: d00c3cf9bc069f775edf0b496b5ccf79800129a50dbe3d0069b4401294393561cedbb9f3606a855ffa6081e65cf0cd3272bf3861f777d4cd8883dc9353a9a9e7
|
data/.gitignore
CHANGED
data/.ruby-gemset
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
markovian
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.2
|
1
|
+
2.2.3
|
data/.travis.yml
ADDED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -19,20 +19,11 @@ Fuller documentation will come shortly. For now, let's see how we can use Markov
|
|
19
19
|
=> path_to_twitter_archive
|
20
20
|
> importer = Markovian::Importers::Twitter::CsvImporter.new(path)
|
21
21
|
=> #<Markovian::Importers::Twitter::CsvImporter:0x007fd0ca3282a8 @path=path_to_twitter_archive>
|
22
|
-
|
23
|
-
|
24
|
-
=>
|
25
|
-
|
26
|
-
# Create a Chainset (the structure holding all the word relations)...
|
27
|
-
> chainset = Markovian::ChainSet.new
|
28
|
-
=> #<Markovian::ChainSet:0x007fd0ca03df70 ...>
|
29
|
-
# And add all the tweets to the Markov dictionary
|
30
|
-
> tweets.each {|t| Markovian::Chain::TextCompiler.new(t, chainset).incorporate_into_chain}; puts "done."
|
31
|
-
done.
|
32
|
-
=> nil
|
33
|
-
|
22
|
+
# now assemble the corpus of tweets -- this may take a few seconds to compile
|
23
|
+
> corpus = importer.corpus
|
24
|
+
=> #<Markovian::Corpus:0x007fd0ca03df70 ...>
|
34
25
|
# Now, we can build some text!
|
35
|
-
> Markovian::TextBuilder.new("markov"
|
26
|
+
> Markovian::TextBuilder.new(corpus).construct("markov")
|
36
27
|
=> "markov chains a lot better than a month, i've been here half an hour of night when you can get behind belgium for the offline train journey"
|
37
28
|
```
|
38
29
|
|
data/changelog.md
CHANGED
@@ -1,8 +1,18 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
## 0.2.0
|
4
|
+
|
5
|
+
* Rename Chainset/Chain to Corpus (better name is better)
|
6
|
+
* Add Corpus#random_word to provide a starting place for texts
|
7
|
+
* Rename Corpus::TextCompiler to Corpus::Compiler
|
8
|
+
* Add equality operators for Corpus/Chain/Dictionary
|
9
|
+
* Add Twitter::CsvImporter.corpus convenience method
|
10
|
+
* TextBuilder has a better interface now
|
11
|
+
* Dictionary#inspect produces sane output, not the entire dictionary contents
|
12
|
+
|
3
13
|
## 0.1.0 and below
|
4
14
|
|
5
|
-
* Ability to build bidirectional
|
15
|
+
* Ability to build bidirectional corpora (pairs of chains) from arrays of texts
|
6
16
|
* Ability to import Twitter archives and produce an array of tweets
|
7
|
-
* Ability to generate Markovian texts from a
|
17
|
+
* Ability to generate Markovian texts from a corpus
|
8
18
|
* Gem framework
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'markovian/corpus/dictionary'
|
2
|
+
|
3
|
+
# The Chain represents Markov info as it's being assembled or expanded from a text. To compensate
|
4
|
+
# for small sample sizes, we track multiple chains (derived from both two-word phrases and single
|
5
|
+
# word). Phrases are preferred, but if we can't find a match, we'll try with a single word.
|
6
|
+
module Markovian
|
7
|
+
class Corpus
|
8
|
+
class Chain
|
9
|
+
def initialize
|
10
|
+
@one_key_dictionary = Dictionary.new
|
11
|
+
@two_key_dictionary = Dictionary.new
|
12
|
+
end
|
13
|
+
|
14
|
+
attr_reader :one_key_dictionary, :two_key_dictionary
|
15
|
+
def lengthen(word, next_word:, previous_word: nil)
|
16
|
+
@one_key_dictionary.push(word, next_word)
|
17
|
+
@two_key_dictionary.push(two_word_key(previous_word, word), next_word)
|
18
|
+
word
|
19
|
+
end
|
20
|
+
|
21
|
+
def next_word(word, previous_word: nil)
|
22
|
+
result_for_two_words(previous_word, word) || result_for_one_word(word)
|
23
|
+
end
|
24
|
+
|
25
|
+
def random_word
|
26
|
+
one_key_dictionary.random_word
|
27
|
+
end
|
28
|
+
|
29
|
+
def ==(other)
|
30
|
+
self.one_key_dictionary == other.one_key_dictionary &&
|
31
|
+
self.two_key_dictionary == other.two_key_dictionary
|
32
|
+
end
|
33
|
+
|
34
|
+
protected
|
35
|
+
|
36
|
+
def result_for_two_words(previous_word, word)
|
37
|
+
@two_key_dictionary.next_word(two_word_key(previous_word, word)) if previous_word
|
38
|
+
end
|
39
|
+
|
40
|
+
def result_for_one_word(word)
|
41
|
+
@one_key_dictionary.next_word(word)
|
42
|
+
end
|
43
|
+
|
44
|
+
# We represent the two words as a space-delimited phrase for simplicity and speed of access via
|
45
|
+
# hash keys.
|
46
|
+
def two_word_key(word1, word2)
|
47
|
+
"#{word1} #{word2}"
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'markovian/utils/text_splitter'
|
2
2
|
|
3
|
-
# Given a
|
3
|
+
# Given a piece of text, this class returns a hash of Markov results: two-word phrases (two by
|
4
4
|
# default) pointing to an array of historical next words.
|
5
5
|
#
|
6
6
|
# So, for instance, the phrase "Cats are cute, cats are annoying" would map to:
|
@@ -17,34 +17,38 @@ require 'markovian/utils/text_splitter'
|
|
17
17
|
# * Handling sentences or newlines is later -- I'm not sure the right way to do it.
|
18
18
|
# * Capitalization is deferred for later.
|
19
19
|
module Markovian
|
20
|
-
class
|
21
|
-
class
|
20
|
+
class Corpus
|
21
|
+
class Compiler
|
22
22
|
# Pass in a text, and optionally an existing Markov chain to add data to. In many cases, you
|
23
23
|
# may be building a chain using a set of smaller texts instead of one large text (dialog,
|
24
24
|
# for instance, or Twitter archives), and so may call this class repeatedly for elements of
|
25
25
|
# the parent corpus.
|
26
|
-
attr_reader :
|
27
|
-
def initialize(
|
28
|
-
@
|
29
|
-
@chainset = starter_chainset
|
26
|
+
attr_reader :corpus
|
27
|
+
def initialize(starter_corpus = Corpus.new)
|
28
|
+
@corpus = starter_corpus
|
30
29
|
end
|
31
30
|
|
32
|
-
def
|
33
|
-
|
34
|
-
|
31
|
+
def build_corpus(texts)
|
32
|
+
texts.each {|t| incorporate_text_into_corpus(t)}
|
33
|
+
corpus
|
34
|
+
end
|
35
|
+
|
36
|
+
def incorporate_text_into_corpus(text)
|
37
|
+
add_text_to_chain(split_into_components(text), forward_chain)
|
38
|
+
# to assemble backward text, we just create a corpus with all the texts reversed
|
35
39
|
# that allows us to see what words precede any given word
|
36
|
-
add_text_to_chain(
|
37
|
-
|
40
|
+
add_text_to_chain(split_into_components(text).reverse, backward_chain)
|
41
|
+
corpus
|
38
42
|
end
|
39
43
|
|
40
44
|
protected
|
41
45
|
|
42
46
|
def forward_chain
|
43
|
-
|
47
|
+
corpus.forward
|
44
48
|
end
|
45
49
|
|
46
50
|
def backward_chain
|
47
|
-
|
51
|
+
corpus.backward
|
48
52
|
end
|
49
53
|
|
50
54
|
def add_text_to_chain(text_elements, chain)
|
@@ -58,8 +62,8 @@ module Markovian
|
|
58
62
|
end
|
59
63
|
end
|
60
64
|
|
61
|
-
def
|
62
|
-
|
65
|
+
def split_into_components(text)
|
66
|
+
Utils::TextSplitter.new(text).components
|
63
67
|
end
|
64
68
|
end
|
65
69
|
end
|
@@ -4,7 +4,7 @@
|
|
4
4
|
#
|
5
5
|
# The key is an opaque value, which could represent either a single word or a phrase as desired.
|
6
6
|
module Markovian
|
7
|
-
class
|
7
|
+
class Corpus
|
8
8
|
class Dictionary
|
9
9
|
def push(key, word)
|
10
10
|
dictionary[key] += [word]
|
@@ -14,6 +14,21 @@ module Markovian
|
|
14
14
|
dictionary[key].sample
|
15
15
|
end
|
16
16
|
|
17
|
+
def random_word
|
18
|
+
dictionary.keys.sample
|
19
|
+
end
|
20
|
+
|
21
|
+
def ==(other)
|
22
|
+
self.dictionary == other.dictionary
|
23
|
+
end
|
24
|
+
|
25
|
+
# We override this method to avoid spitting out every single element in the dictionary if
|
26
|
+
# this (or any object containing it) gets inspected.
|
27
|
+
# See http://stackoverflow.com/questions/5771339/emulate-default-objectinspect-output.
|
28
|
+
def inspect
|
29
|
+
"#<#{self.class}:0x#{__id__.to_s(16)} @dictionary: #{dictionary.length} entries>"
|
30
|
+
end
|
31
|
+
|
17
32
|
protected
|
18
33
|
|
19
34
|
def dictionary
|
@@ -1,10 +1,10 @@
|
|
1
|
-
require 'markovian/chain'
|
1
|
+
require 'markovian/corpus/chain'
|
2
2
|
|
3
3
|
# This class represents a pair of chains, one going forward and one going backward. With this, we
|
4
4
|
# can construct phrases in which the original seed word appears at any point in the text (going
|
5
5
|
# backward to create the earlier text, forward to create the rest).
|
6
6
|
module Markovian
|
7
|
-
class
|
7
|
+
class Corpus
|
8
8
|
attr_reader :forward, :backward
|
9
9
|
def initialize
|
10
10
|
@forward, @backward = Chain.new, Chain.new
|
@@ -18,5 +18,14 @@ module Markovian
|
|
18
18
|
# backward goes in the opposite direction to forward
|
19
19
|
backward.next_word(word, previous_word: following_word)
|
20
20
|
end
|
21
|
+
|
22
|
+
def random_word
|
23
|
+
forward.random_word
|
24
|
+
end
|
25
|
+
|
26
|
+
def ==(other)
|
27
|
+
self.forward == other.forward &&
|
28
|
+
self.backward == other.backward
|
29
|
+
end
|
21
30
|
end
|
22
31
|
end
|
@@ -1,31 +1,32 @@
|
|
1
1
|
require 'markovian/utils/text_splitter'
|
2
2
|
|
3
|
-
# This class, given a
|
3
|
+
# This class, given a Markov corpus, will attempt to construct a new text based on a given seed using
|
4
4
|
# the Markov associations.
|
5
5
|
module Markovian
|
6
6
|
class TextBuilder
|
7
|
-
attr_reader :seed_text, :
|
8
|
-
def initialize(
|
9
|
-
@
|
10
|
-
@chain_set = chain_set
|
7
|
+
attr_reader :seed_text, :corpus
|
8
|
+
def initialize(corpus)
|
9
|
+
@corpus = corpus
|
11
10
|
end
|
12
11
|
|
13
|
-
def construct(length: 140,
|
12
|
+
def construct(seed_text, length: 140, start_result_with_seed: false)
|
14
13
|
# TODO: if we don't hit a result for the first pair, move backward through the original text
|
15
14
|
# until we get something
|
15
|
+
seed_pair = identify_starter_text(seed_text)
|
16
16
|
result_with_next_word(
|
17
|
-
previous_pair:
|
18
|
-
result:
|
17
|
+
previous_pair: seed_pair,
|
18
|
+
result: start_result_with_seed ? seed_text : nil,
|
19
19
|
length: length
|
20
20
|
)
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
24
|
-
|
25
|
-
|
23
|
+
def identify_starter_text(raw_text)
|
24
|
+
seed_components = split_seed_text(raw_text)
|
25
|
+
if seed_components.length >= 2
|
26
|
+
seed_components[-2..-1]
|
26
27
|
else
|
27
28
|
# if we only have a one-word seed text, the previous word is nil
|
28
|
-
[nil,
|
29
|
+
[nil, seed_components.first]
|
29
30
|
end
|
30
31
|
end
|
31
32
|
|
@@ -33,7 +34,7 @@ module Markovian
|
|
33
34
|
|
34
35
|
def result_with_next_word(previous_pair:, result:, length:)
|
35
36
|
previous_word, current_word = previous_pair
|
36
|
-
if next_word =
|
37
|
+
if next_word = corpus.next_word(current_word, previous_word: previous_word)
|
37
38
|
# we use join rather than + to avoid leading spaces, and strip to ignore leading nils or
|
38
39
|
# empty strings
|
39
40
|
interim_result = format_result_array([result, next_word])
|
@@ -56,8 +57,8 @@ module Markovian
|
|
56
57
|
array_of_words.compact.map(&:strip).join(" ")
|
57
58
|
end
|
58
59
|
|
59
|
-
def split_seed_text
|
60
|
-
|
60
|
+
def split_seed_text(seed_text)
|
61
|
+
Utils::TextSplitter.new(seed_text).components
|
61
62
|
end
|
62
63
|
end
|
63
64
|
end
|
data/lib/markovian/version.rb
CHANGED
data/lib/markovian.rb
CHANGED
data/markovian.gemspec
CHANGED
@@ -19,7 +19,6 @@ Gem::Specification.new do |spec|
|
|
19
19
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
20
|
spec.require_paths = ["lib"]
|
21
21
|
|
22
|
-
spec.add_runtime_dependency "oj"
|
23
22
|
spec.add_development_dependency "bundler", "~> 1.7"
|
24
23
|
spec.add_development_dependency "rake", "~> 10.0"
|
25
24
|
end
|
metadata
CHANGED
@@ -1,29 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: markovian
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Alex Koppel
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-09-
|
11
|
+
date: 2015-09-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: oj
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ">="
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ">="
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: bundler
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -63,6 +49,7 @@ files:
|
|
63
49
|
- ".rspec"
|
64
50
|
- ".ruby-gemset"
|
65
51
|
- ".ruby-version"
|
52
|
+
- ".travis.yml"
|
66
53
|
- CODE_OF_CONDUCT.md
|
67
54
|
- Gemfile
|
68
55
|
- LICENSE.txt
|
@@ -74,10 +61,10 @@ files:
|
|
74
61
|
- db/seeds.rb
|
75
62
|
- lib/.DS_Store
|
76
63
|
- lib/markovian.rb
|
77
|
-
- lib/markovian/
|
78
|
-
- lib/markovian/chain
|
79
|
-
- lib/markovian/
|
80
|
-
- lib/markovian/
|
64
|
+
- lib/markovian/corpus.rb
|
65
|
+
- lib/markovian/corpus/chain.rb
|
66
|
+
- lib/markovian/corpus/compiler.rb
|
67
|
+
- lib/markovian/corpus/dictionary.rb
|
81
68
|
- lib/markovian/importers/twitter/csv_importer.rb
|
82
69
|
- lib/markovian/importers/twitter/tweet.rb
|
83
70
|
- lib/markovian/text_builder.rb
|
@@ -104,7 +91,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
104
91
|
version: '0'
|
105
92
|
requirements: []
|
106
93
|
rubyforge_project:
|
107
|
-
rubygems_version: 2.4.
|
94
|
+
rubygems_version: 2.4.5.1
|
108
95
|
signing_key:
|
109
96
|
specification_version: 4
|
110
97
|
summary: A simple, hopefully easy-to-use Markov chain generator.
|
data/lib/markovian/chain.rb
DELETED
@@ -1,41 +0,0 @@
|
|
1
|
-
require 'markovian/chain/dictionary'
|
2
|
-
|
3
|
-
# The Chain represents Markov info as it's being assembled or expanded from a text. To compensate
|
4
|
-
# for small sample sizes, we track multiple chains (derived from both two-word phrases and single
|
5
|
-
# word). Phrases are preferred, but if we can't find a match, we'll try with a single word.
|
6
|
-
module Markovian
|
7
|
-
class Chain
|
8
|
-
def initialize
|
9
|
-
@one_key_dictionary = Dictionary.new
|
10
|
-
@two_key_dictionary = Dictionary.new
|
11
|
-
end
|
12
|
-
|
13
|
-
attr_reader :one_key_dictionary, :two_key_dictionary
|
14
|
-
def lengthen(word, next_word:, previous_word: nil)
|
15
|
-
@one_key_dictionary.push(word, next_word)
|
16
|
-
@two_key_dictionary.push(two_word_key(previous_word, word), next_word)
|
17
|
-
word
|
18
|
-
end
|
19
|
-
|
20
|
-
def next_word(word, previous_word: nil)
|
21
|
-
result_for_two_words(previous_word, word) || result_for_one_word(word)
|
22
|
-
end
|
23
|
-
|
24
|
-
protected
|
25
|
-
|
26
|
-
def result_for_two_words(previous_word, word)
|
27
|
-
@two_key_dictionary.next_word(two_word_key(previous_word, word)) if previous_word
|
28
|
-
end
|
29
|
-
|
30
|
-
def result_for_one_word(word)
|
31
|
-
@one_key_dictionary.next_word(word)
|
32
|
-
end
|
33
|
-
|
34
|
-
# We represent the two words as a space-delimited phrase for simplicity and speed of access via
|
35
|
-
# hash keys.
|
36
|
-
def two_word_key(word1, word2)
|
37
|
-
"#{word1} #{word2}"
|
38
|
-
end
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|