markovian 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.ruby-gemset +1 -1
- data/.ruby-version +1 -1
- data/.travis.yml +5 -0
- data/Gemfile +1 -1
- data/README.md +4 -13
- data/changelog.md +12 -2
- data/lib/markovian/corpus/chain.rb +52 -0
- data/lib/markovian/{chain/text_compiler.rb → corpus/compiler.rb} +20 -16
- data/lib/markovian/{chain → corpus}/dictionary.rb +16 -1
- data/lib/markovian/{chain_set.rb → corpus.rb} +11 -2
- data/lib/markovian/importers/twitter/csv_importer.rb +4 -0
- data/lib/markovian/text_builder.rb +16 -15
- data/lib/markovian/version.rb +1 -1
- data/lib/markovian.rb +2 -3
- data/markovian.gemspec +0 -1
- metadata +8 -21
- data/lib/markovian/chain.rb +0 -41
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1f03bf82f92106eab96196be22cb2e09eb3d022b
|
4
|
+
data.tar.gz: fe37c3945b518f64d617be2758aef29a5dbfc52a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 219a602a1d41dc3aaff820ad616d0690b0a42810f40e83e08a219537f6ba765855b2b8008e4af3fe0537570040d86b0f499222a3a9e5815a44a827dfadfe418e
|
7
|
+
data.tar.gz: d00c3cf9bc069f775edf0b496b5ccf79800129a50dbe3d0069b4401294393561cedbb9f3606a855ffa6081e65cf0cd3272bf3861f777d4cd8883dc9353a9a9e7
|
data/.gitignore
CHANGED
data/.ruby-gemset
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
markovian
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.2
|
1
|
+
2.2.3
|
data/.travis.yml
ADDED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -19,20 +19,11 @@ Fuller documentation will come shortly. For now, let's see how we can use Markov
|
|
19
19
|
=> path_to_twitter_archive
|
20
20
|
> importer = Markovian::Importers::Twitter::CsvImporter.new(path)
|
21
21
|
=> #<Markovian::Importers::Twitter::CsvImporter:0x007fd0ca3282a8 @path=path_to_twitter_archive>
|
22
|
-
|
23
|
-
|
24
|
-
=>
|
25
|
-
|
26
|
-
# Create a Chainset (the structure holding all the word relations)...
|
27
|
-
> chainset = Markovian::ChainSet.new
|
28
|
-
=> #<Markovian::ChainSet:0x007fd0ca03df70 ...>
|
29
|
-
# And add all the tweets to the Markov dictionary
|
30
|
-
> tweets.each {|t| Markovian::Chain::TextCompiler.new(t, chainset).incorporate_into_chain}; puts "done."
|
31
|
-
done.
|
32
|
-
=> nil
|
33
|
-
|
22
|
+
# now assemble the corpus of tweets -- this may take a few seconds to compile
|
23
|
+
> corpus = importer.corpus
|
24
|
+
=> #<Markovian::Corpus:0x007fd0ca03df70 ...>
|
34
25
|
# Now, we can build some text!
|
35
|
-
> Markovian::TextBuilder.new("markov"
|
26
|
+
> Markovian::TextBuilder.new(corpus).construct("markov")
|
36
27
|
=> "markov chains a lot better than a month, i've been here half an hour of night when you can get behind belgium for the offline train journey"
|
37
28
|
```
|
38
29
|
|
data/changelog.md
CHANGED
@@ -1,8 +1,18 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
## 0.2.0
|
4
|
+
|
5
|
+
* Rename Chainset/Chain to Corpus (better name is better)
|
6
|
+
* Add Corpus#random_word to provide a starting place for texts
|
7
|
+
* Rename Corpus::TextCompiler to Corpus::Compiler
|
8
|
+
* Add equality operators for Corpus/Chain/Dictionary
|
9
|
+
* Add Twitter::CsvImporter.corpus convenience method
|
10
|
+
* TextBuilder has a better interface now
|
11
|
+
* Dictionary#inspect produces sane output, not the entire dictionary contents
|
12
|
+
|
3
13
|
## 0.1.0 and below
|
4
14
|
|
5
|
-
* Ability to build bidirectional
|
15
|
+
* Ability to build bidirectional corpora (pairs of chains) from arrays of texts
|
6
16
|
* Ability to import Twitter archives and produce an array of tweets
|
7
|
-
* Ability to generate Markovian texts from a
|
17
|
+
* Ability to generate Markovian texts from a corpus
|
8
18
|
* Gem framework
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'markovian/corpus/dictionary'
|
2
|
+
|
3
|
+
# The Chain represents Markov info as it's being assembled or expanded from a text. To compensate
|
4
|
+
# for small sample sizes, we track multiple chains (derived from both two-word phrases and single
|
5
|
+
# word). Phrases are preferred, but if we can't find a match, we'll try with a single word.
|
6
|
+
module Markovian
|
7
|
+
class Corpus
|
8
|
+
class Chain
|
9
|
+
def initialize
|
10
|
+
@one_key_dictionary = Dictionary.new
|
11
|
+
@two_key_dictionary = Dictionary.new
|
12
|
+
end
|
13
|
+
|
14
|
+
attr_reader :one_key_dictionary, :two_key_dictionary
|
15
|
+
def lengthen(word, next_word:, previous_word: nil)
|
16
|
+
@one_key_dictionary.push(word, next_word)
|
17
|
+
@two_key_dictionary.push(two_word_key(previous_word, word), next_word)
|
18
|
+
word
|
19
|
+
end
|
20
|
+
|
21
|
+
def next_word(word, previous_word: nil)
|
22
|
+
result_for_two_words(previous_word, word) || result_for_one_word(word)
|
23
|
+
end
|
24
|
+
|
25
|
+
def random_word
|
26
|
+
one_key_dictionary.random_word
|
27
|
+
end
|
28
|
+
|
29
|
+
def ==(other)
|
30
|
+
self.one_key_dictionary == other.one_key_dictionary &&
|
31
|
+
self.two_key_dictionary == other.two_key_dictionary
|
32
|
+
end
|
33
|
+
|
34
|
+
protected
|
35
|
+
|
36
|
+
def result_for_two_words(previous_word, word)
|
37
|
+
@two_key_dictionary.next_word(two_word_key(previous_word, word)) if previous_word
|
38
|
+
end
|
39
|
+
|
40
|
+
def result_for_one_word(word)
|
41
|
+
@one_key_dictionary.next_word(word)
|
42
|
+
end
|
43
|
+
|
44
|
+
# We represent the two words as a space-delimited phrase for simplicity and speed of access via
|
45
|
+
# hash keys.
|
46
|
+
def two_word_key(word1, word2)
|
47
|
+
"#{word1} #{word2}"
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'markovian/utils/text_splitter'
|
2
2
|
|
3
|
-
# Given a
|
3
|
+
# Given a piece of text, this class returns a hash of Markov results: two-word phrases (two by
|
4
4
|
# default) pointing to an array of historical next words.
|
5
5
|
#
|
6
6
|
# So, for instance, the phrase "Cats are cute, cats are annoying" would map to:
|
@@ -17,34 +17,38 @@ require 'markovian/utils/text_splitter'
|
|
17
17
|
# * Handling sentences or newlines is later -- I'm not sure the right way to do it.
|
18
18
|
# * Capitalization is deferred for later.
|
19
19
|
module Markovian
|
20
|
-
class
|
21
|
-
class
|
20
|
+
class Corpus
|
21
|
+
class Compiler
|
22
22
|
# Pass in a text, and optionally an existing Markov chain to add data to. In many cases, you
|
23
23
|
# may be building a chain using a set of smaller texts instead of one large text (dialog,
|
24
24
|
# for instance, or Twitter archives), and so may call this class repeatedly for elements of
|
25
25
|
# the parent corpus.
|
26
|
-
attr_reader :
|
27
|
-
def initialize(
|
28
|
-
@
|
29
|
-
@chainset = starter_chainset
|
26
|
+
attr_reader :corpus
|
27
|
+
def initialize(starter_corpus = Corpus.new)
|
28
|
+
@corpus = starter_corpus
|
30
29
|
end
|
31
30
|
|
32
|
-
def
|
33
|
-
|
34
|
-
|
31
|
+
def build_corpus(texts)
|
32
|
+
texts.each {|t| incorporate_text_into_corpus(t)}
|
33
|
+
corpus
|
34
|
+
end
|
35
|
+
|
36
|
+
def incorporate_text_into_corpus(text)
|
37
|
+
add_text_to_chain(split_into_components(text), forward_chain)
|
38
|
+
# to assemble backward text, we just create a corpus with all the texts reversed
|
35
39
|
# that allows us to see what words precede any given word
|
36
|
-
add_text_to_chain(
|
37
|
-
|
40
|
+
add_text_to_chain(split_into_components(text).reverse, backward_chain)
|
41
|
+
corpus
|
38
42
|
end
|
39
43
|
|
40
44
|
protected
|
41
45
|
|
42
46
|
def forward_chain
|
43
|
-
|
47
|
+
corpus.forward
|
44
48
|
end
|
45
49
|
|
46
50
|
def backward_chain
|
47
|
-
|
51
|
+
corpus.backward
|
48
52
|
end
|
49
53
|
|
50
54
|
def add_text_to_chain(text_elements, chain)
|
@@ -58,8 +62,8 @@ module Markovian
|
|
58
62
|
end
|
59
63
|
end
|
60
64
|
|
61
|
-
def
|
62
|
-
|
65
|
+
def split_into_components(text)
|
66
|
+
Utils::TextSplitter.new(text).components
|
63
67
|
end
|
64
68
|
end
|
65
69
|
end
|
@@ -4,7 +4,7 @@
|
|
4
4
|
#
|
5
5
|
# The key is an opaque value, which could represent either a single word or a phrase as desired.
|
6
6
|
module Markovian
|
7
|
-
class
|
7
|
+
class Corpus
|
8
8
|
class Dictionary
|
9
9
|
def push(key, word)
|
10
10
|
dictionary[key] += [word]
|
@@ -14,6 +14,21 @@ module Markovian
|
|
14
14
|
dictionary[key].sample
|
15
15
|
end
|
16
16
|
|
17
|
+
def random_word
|
18
|
+
dictionary.keys.sample
|
19
|
+
end
|
20
|
+
|
21
|
+
def ==(other)
|
22
|
+
self.dictionary == other.dictionary
|
23
|
+
end
|
24
|
+
|
25
|
+
# We override this method to avoid spitting out every single element in the dictionary if
|
26
|
+
# this (or any object containing it) gets inspected.
|
27
|
+
# See http://stackoverflow.com/questions/5771339/emulate-default-objectinspect-output.
|
28
|
+
def inspect
|
29
|
+
"#<#{self.class}:0x#{__id__.to_s(16)} @dictionary: #{dictionary.length} entries>"
|
30
|
+
end
|
31
|
+
|
17
32
|
protected
|
18
33
|
|
19
34
|
def dictionary
|
@@ -1,10 +1,10 @@
|
|
1
|
-
require 'markovian/chain'
|
1
|
+
require 'markovian/corpus/chain'
|
2
2
|
|
3
3
|
# This class represents a pair of chains, one going forward and one going backward. With this, we
|
4
4
|
# can construct phrases in which the original seed word appears at any point in the text (going
|
5
5
|
# backward to create the earlier text, forward to create the rest).
|
6
6
|
module Markovian
|
7
|
-
class
|
7
|
+
class Corpus
|
8
8
|
attr_reader :forward, :backward
|
9
9
|
def initialize
|
10
10
|
@forward, @backward = Chain.new, Chain.new
|
@@ -18,5 +18,14 @@ module Markovian
|
|
18
18
|
# backward goes in the opposite direction to forward
|
19
19
|
backward.next_word(word, previous_word: following_word)
|
20
20
|
end
|
21
|
+
|
22
|
+
def random_word
|
23
|
+
forward.random_word
|
24
|
+
end
|
25
|
+
|
26
|
+
def ==(other)
|
27
|
+
self.forward == other.forward &&
|
28
|
+
self.backward == other.backward
|
29
|
+
end
|
21
30
|
end
|
22
31
|
end
|
@@ -1,31 +1,32 @@
|
|
1
1
|
require 'markovian/utils/text_splitter'
|
2
2
|
|
3
|
-
# This class, given a
|
3
|
+
# This class, given a Markov corpus, will attempt to construct a new text based on a given seed using
|
4
4
|
# the Markov associations.
|
5
5
|
module Markovian
|
6
6
|
class TextBuilder
|
7
|
-
attr_reader :seed_text, :
|
8
|
-
def initialize(
|
9
|
-
@
|
10
|
-
@chain_set = chain_set
|
7
|
+
attr_reader :seed_text, :corpus
|
8
|
+
def initialize(corpus)
|
9
|
+
@corpus = corpus
|
11
10
|
end
|
12
11
|
|
13
|
-
def construct(length: 140,
|
12
|
+
def construct(seed_text, length: 140, start_result_with_seed: false)
|
14
13
|
# TODO: if we don't hit a result for the first pair, move backward through the original text
|
15
14
|
# until we get something
|
15
|
+
seed_pair = identify_starter_text(seed_text)
|
16
16
|
result_with_next_word(
|
17
|
-
previous_pair:
|
18
|
-
result:
|
17
|
+
previous_pair: seed_pair,
|
18
|
+
result: start_result_with_seed ? seed_text : nil,
|
19
19
|
length: length
|
20
20
|
)
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
24
|
-
|
25
|
-
|
23
|
+
def identify_starter_text(raw_text)
|
24
|
+
seed_components = split_seed_text(raw_text)
|
25
|
+
if seed_components.length >= 2
|
26
|
+
seed_components[-2..-1]
|
26
27
|
else
|
27
28
|
# if we only have a one-word seed text, the previous word is nil
|
28
|
-
[nil,
|
29
|
+
[nil, seed_components.first]
|
29
30
|
end
|
30
31
|
end
|
31
32
|
|
@@ -33,7 +34,7 @@ module Markovian
|
|
33
34
|
|
34
35
|
def result_with_next_word(previous_pair:, result:, length:)
|
35
36
|
previous_word, current_word = previous_pair
|
36
|
-
if next_word =
|
37
|
+
if next_word = corpus.next_word(current_word, previous_word: previous_word)
|
37
38
|
# we use join rather than + to avoid leading spaces, and strip to ignore leading nils or
|
38
39
|
# empty strings
|
39
40
|
interim_result = format_result_array([result, next_word])
|
@@ -56,8 +57,8 @@ module Markovian
|
|
56
57
|
array_of_words.compact.map(&:strip).join(" ")
|
57
58
|
end
|
58
59
|
|
59
|
-
def split_seed_text
|
60
|
-
|
60
|
+
def split_seed_text(seed_text)
|
61
|
+
Utils::TextSplitter.new(seed_text).components
|
61
62
|
end
|
62
63
|
end
|
63
64
|
end
|
data/lib/markovian/version.rb
CHANGED
data/lib/markovian.rb
CHANGED
data/markovian.gemspec
CHANGED
@@ -19,7 +19,6 @@ Gem::Specification.new do |spec|
|
|
19
19
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
20
|
spec.require_paths = ["lib"]
|
21
21
|
|
22
|
-
spec.add_runtime_dependency "oj"
|
23
22
|
spec.add_development_dependency "bundler", "~> 1.7"
|
24
23
|
spec.add_development_dependency "rake", "~> 10.0"
|
25
24
|
end
|
metadata
CHANGED
@@ -1,29 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: markovian
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Alex Koppel
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-09-
|
11
|
+
date: 2015-09-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: oj
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ">="
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ">="
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: bundler
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -63,6 +49,7 @@ files:
|
|
63
49
|
- ".rspec"
|
64
50
|
- ".ruby-gemset"
|
65
51
|
- ".ruby-version"
|
52
|
+
- ".travis.yml"
|
66
53
|
- CODE_OF_CONDUCT.md
|
67
54
|
- Gemfile
|
68
55
|
- LICENSE.txt
|
@@ -74,10 +61,10 @@ files:
|
|
74
61
|
- db/seeds.rb
|
75
62
|
- lib/.DS_Store
|
76
63
|
- lib/markovian.rb
|
77
|
-
- lib/markovian/
|
78
|
-
- lib/markovian/chain
|
79
|
-
- lib/markovian/
|
80
|
-
- lib/markovian/
|
64
|
+
- lib/markovian/corpus.rb
|
65
|
+
- lib/markovian/corpus/chain.rb
|
66
|
+
- lib/markovian/corpus/compiler.rb
|
67
|
+
- lib/markovian/corpus/dictionary.rb
|
81
68
|
- lib/markovian/importers/twitter/csv_importer.rb
|
82
69
|
- lib/markovian/importers/twitter/tweet.rb
|
83
70
|
- lib/markovian/text_builder.rb
|
@@ -104,7 +91,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
104
91
|
version: '0'
|
105
92
|
requirements: []
|
106
93
|
rubyforge_project:
|
107
|
-
rubygems_version: 2.4.
|
94
|
+
rubygems_version: 2.4.5.1
|
108
95
|
signing_key:
|
109
96
|
specification_version: 4
|
110
97
|
summary: A simple, hopefully easy-to-use Markov chain generator.
|
data/lib/markovian/chain.rb
DELETED
@@ -1,41 +0,0 @@
|
|
1
|
-
require 'markovian/chain/dictionary'
|
2
|
-
|
3
|
-
# The Chain represents Markov info as it's being assembled or expanded from a text. To compensate
|
4
|
-
# for small sample sizes, we track multiple chains (derived from both two-word phrases and single
|
5
|
-
# word). Phrases are prefered, but if we can't find a match, we'll try with a single word.
|
6
|
-
module Markovian
|
7
|
-
class Chain
|
8
|
-
def initialize
|
9
|
-
@one_key_dictionary = Dictionary.new
|
10
|
-
@two_key_dictionary = Dictionary.new
|
11
|
-
end
|
12
|
-
|
13
|
-
attr_reader :one_key_dictionary, :two_key_dictionary
|
14
|
-
def lengthen(word, next_word:, previous_word: nil)
|
15
|
-
@one_key_dictionary.push(word, next_word)
|
16
|
-
@two_key_dictionary.push(two_word_key(previous_word, word), next_word)
|
17
|
-
word
|
18
|
-
end
|
19
|
-
|
20
|
-
def next_word(word, previous_word: nil)
|
21
|
-
result_for_two_words(previous_word, word) || result_for_one_word(word)
|
22
|
-
end
|
23
|
-
|
24
|
-
protected
|
25
|
-
|
26
|
-
def result_for_two_words(previous_word, word)
|
27
|
-
@two_key_dictionary.next_word(two_word_key(previous_word, word)) if previous_word
|
28
|
-
end
|
29
|
-
|
30
|
-
def result_for_one_word(word)
|
31
|
-
@one_key_dictionary.next_word(word)
|
32
|
-
end
|
33
|
-
|
34
|
-
# We represent the two words as a space-delimited phrase for simplicity and speed of access via
|
35
|
-
# hash keys.
|
36
|
-
def two_word_key(word1, word2)
|
37
|
-
"#{word1} #{word2}"
|
38
|
-
end
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|