markovian 0.2.9 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +7 -3
- data/README.md +12 -3
- data/changelog.md +9 -0
- data/lib/markovian.rb +2 -2
- data/lib/markovian/chain.rb +81 -0
- data/lib/markovian/{corpus → chain}/compiler.rb +11 -22
- data/lib/markovian/{corpus → chain}/dictionary.rb +5 -13
- data/lib/markovian/chain/dictionary_entry.rb +80 -0
- data/lib/markovian/importers/twitter/csv_importer.rb +2 -2
- data/lib/markovian/text_builder.rb +29 -20
- data/lib/markovian/text_builder/end_of_sentence_filter.rb +31 -0
- data/lib/markovian/version.rb +1 -1
- metadata +7 -7
- data/lib/markovian/corpus.rb +0 -31
- data/lib/markovian/corpus/chain.rb +0 -53
- data/lib/markovian/corpus/dictionary_entry.rb +0 -54
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 290b5c05432cd805aa1aafdae2d93b68cf1e9a8a
|
4
|
+
data.tar.gz: c51deea8332351976638c6767603ad137c85fb4b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: eca6c116a0e9686b90ebd3e9335cd55f3a48261a3824dd5d2d71c58e6ba97b8749c738b042da0f2b72c02df58936a924ae9953a1970b64d02d70b58f3f953ae9
|
7
|
+
data.tar.gz: e2279a199969da3cf587952a57a6eb1fb3d7f22e967cba3f4dc700ba5022e5562e4e70eb5a34f3f199a65ee458ed232952a9f7e17b254054e0b3bd7327d89839
|
data/Gemfile
CHANGED
@@ -1,10 +1,14 @@
|
|
1
|
-
source
|
1
|
+
source "https://rubygems.org"
|
2
2
|
|
3
|
-
# Specify your gem
|
3
|
+
# Specify your gem's dependencies in markov-ahkoppel2.gemspec
|
4
4
|
gemspec
|
5
5
|
|
6
6
|
group :development, :test do
|
7
|
-
gem
|
7
|
+
gem "byebug", platform: :mri
|
8
|
+
# If you're developing both gems, use the local version of Tokeneyes
|
9
|
+
if File.exist?("../tokeneyes")
|
10
|
+
gem "tokeneyes", path: "../tokeneyes"
|
11
|
+
end
|
8
12
|
end
|
9
13
|
|
10
14
|
group :test do
|
data/README.md
CHANGED
@@ -19,16 +19,25 @@ Fuller documentation will come shortly. For now, let's see how we can use Markov
|
|
19
19
|
=> path_to_twitter_archive
|
20
20
|
> importer = Markovian::Importers::Twitter::CsvImporter.new(path)
|
21
21
|
=> #<Markovian::Importers::Twitter::CsvImporter:0x007fd0ca3282a8 @path=path_to_twitter_archive>
|
22
|
-
# now assemble the
|
23
|
-
>
|
22
|
+
# now assemble the chain based on the tweets -- this may take a few seconds to compile
|
23
|
+
> chain = importer.chain
|
24
24
|
=> #<Markovian::Corpus:0x007fd0ca03df70 ...>
|
25
25
|
# Now, we can build some text!
|
26
|
-
> Markovian::TextBuilder.new(
|
26
|
+
> Markovian::TextBuilder.new(chain).construct("markov")
|
27
27
|
=> "markov chains a lot better than a month, i've been here half an hour of night when you can get behind belgium for the offline train journey"
|
28
28
|
```
|
29
29
|
|
30
30
|
Exactly!
|
31
31
|
|
32
|
+
## Features
|
33
|
+
|
34
|
+
So far, Markovian gives you the ability to, given a set of inputs, generate random text. In
|
35
|
+
addition, your money gets you:
|
36
|
+
|
37
|
+
* A built-in importer to turn Twitter csv archives into Markov chain-derived text
|
38
|
+
* A built-in filter to remove final words that statistically (in the corpus) rarely end sentences.
|
39
|
+
Avoid unsightly sentences ending in "and so of" and so on!
|
40
|
+
|
32
41
|
## Development
|
33
42
|
|
34
43
|
After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/changelog.md
CHANGED
@@ -1,5 +1,14 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
## 0.3.0
|
4
|
+
|
5
|
+
* TextBuilder now filters out final words that statistically rarely end sentences (first filter!)
|
6
|
+
* TextBuilder#construct now includes seed text by default (instead of via opt-in)
|
7
|
+
* Add Chain#word_entry to allow access to word data
|
8
|
+
* Properly collect metadata about words (previously collected next_word's data)
|
9
|
+
* Refactor Dictionary to provide access to entries, removing a lot of method duplication
|
10
|
+
* Remove Corpus class (no longer necessary), make Chain the base
|
11
|
+
|
3
12
|
## 0.2.9
|
4
13
|
|
5
14
|
Internal refactors only, no new functionality.
|
data/lib/markovian.rb
CHANGED
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'markovian/chain/dictionary'
|
2
|
+
|
3
|
+
# The Chain represents Markov info as it's being assembled or expanded from a text. To compensate
|
4
|
+
# for small sample sizes, we track multiple chains (derived from both two-word phrases and single
|
5
|
+
# word). Phrases are preferred, but if we can't find a match, we'll try with a single word.
|
6
|
+
module Markovian
|
7
|
+
class Chain
|
8
|
+
def initialize
|
9
|
+
@one_key_dictionary = Dictionary.new
|
10
|
+
@two_key_dictionary = Dictionary.new
|
11
|
+
end
|
12
|
+
|
13
|
+
# Allow access to a word's metadata by providing its dictionary entry. For now, we only do
|
14
|
+
# individual words, not two-word phrases.
|
15
|
+
def word_entry(word)
|
16
|
+
@one_key_dictionary[word]
|
17
|
+
end
|
18
|
+
|
19
|
+
def lengthen(word, next_word:, previous_word: nil)
|
20
|
+
# When we encounter a word, we track its metadata and what words surround it
|
21
|
+
write_to_dictionary(@one_key_dictionary, word, word, next_word)
|
22
|
+
write_to_dictionary(@two_key_dictionary, two_word_key(previous_word, word), word, next_word)
|
23
|
+
word
|
24
|
+
end
|
25
|
+
|
26
|
+
def next_word(word, previous_word: nil)
|
27
|
+
if dictionary_entry = entry(word, previous_word)
|
28
|
+
dictionary_entry.next_word
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def random_word
|
33
|
+
one_key_dictionary.random_word
|
34
|
+
end
|
35
|
+
|
36
|
+
def ==(other)
|
37
|
+
self.one_key_dictionary == other.one_key_dictionary &&
|
38
|
+
self.two_key_dictionary == other.two_key_dictionary
|
39
|
+
end
|
40
|
+
|
41
|
+
protected
|
42
|
+
|
43
|
+
# for equality checking
|
44
|
+
attr_reader :one_key_dictionary, :two_key_dictionary
|
45
|
+
|
46
|
+
def entry(word, previous_word = nil)
|
47
|
+
if previous_word
|
48
|
+
entry_for_two_words(previous_word, word) || entry_for_one_word(word)
|
49
|
+
else
|
50
|
+
entry_for_one_word(word)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
def entry_for_two_words(previous_word, word)
|
55
|
+
entry_if_present(@two_key_dictionary[two_word_key(previous_word, word)])
|
56
|
+
end
|
57
|
+
|
58
|
+
def entry_for_one_word(word)
|
59
|
+
# Not strictly necessary, since if there's an empty entry here we'll just get nils, but better to
|
60
|
+
# do it right.
|
61
|
+
entry_if_present(@one_key_dictionary[word])
|
62
|
+
end
|
63
|
+
|
64
|
+
def entry_if_present(entry)
|
65
|
+
# Ignore empty entries that haven't actually been seen in the corpus
|
66
|
+
# TODO refactor to not even create them
|
67
|
+
entry if entry.occurrences > 0
|
68
|
+
end
|
69
|
+
|
70
|
+
# We represent the two words as a space-delimited phrase for simplicity and speed of access via
|
71
|
+
# hash keys.
|
72
|
+
def two_word_key(word1, word2)
|
73
|
+
"#{word1} #{word2}"
|
74
|
+
end
|
75
|
+
|
76
|
+
def write_to_dictionary(dictionary, key, word_instance, next_word)
|
77
|
+
dictionary[key].record_observance(word_instance)
|
78
|
+
dictionary[key].push(next_word)
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
@@ -17,40 +17,29 @@ require 'markovian/utils/text_splitter'
|
|
17
17
|
# * Handling sentences or newlines is later -- I'm not sure the right way to do it.
|
18
18
|
# * Capitalization is deferred for later.
|
19
19
|
module Markovian
|
20
|
-
class
|
20
|
+
class Chain
|
21
21
|
class Compiler
|
22
22
|
# Pass in a text, and optionally an existing Markov chain to add data to. In many cases, you
|
23
23
|
# may be building a chain using a set of smaller texts instead of one large text (dialog,
|
24
24
|
# for instance, or Twitter archives), and so may call this class repeatedly for elements of
|
25
|
-
# the
|
26
|
-
attr_reader :
|
27
|
-
def initialize(
|
28
|
-
@
|
25
|
+
# the corpus.
|
26
|
+
attr_reader :chain
|
27
|
+
def initialize(starter_chain = Chain.new)
|
28
|
+
@chain = starter_chain
|
29
29
|
end
|
30
30
|
|
31
|
-
def
|
32
|
-
texts.each {|t|
|
33
|
-
|
31
|
+
def build_chain(texts)
|
32
|
+
texts.each {|t| incorporate_text_into_chain(t)}
|
33
|
+
chain
|
34
34
|
end
|
35
35
|
|
36
|
-
def
|
37
|
-
add_text_to_chain(split_into_components(text),
|
38
|
-
|
39
|
-
# that allows us to see what words precede any given word
|
40
|
-
add_text_to_chain(split_into_components(text).reverse, backward_chain)
|
41
|
-
corpus
|
36
|
+
def incorporate_text_into_chain(text)
|
37
|
+
add_text_to_chain(split_into_components(text), chain)
|
38
|
+
chain
|
42
39
|
end
|
43
40
|
|
44
41
|
protected
|
45
42
|
|
46
|
-
def forward_chain
|
47
|
-
corpus.forward
|
48
|
-
end
|
49
|
-
|
50
|
-
def backward_chain
|
51
|
-
corpus.backward
|
52
|
-
end
|
53
|
-
|
54
43
|
def add_text_to_chain(text_elements, chain)
|
55
44
|
previous_word = nil
|
56
45
|
text_elements.each_with_index do |word, index|
|
@@ -1,21 +1,13 @@
|
|
1
|
-
require 'markovian/
|
1
|
+
require 'markovian/chain/dictionary_entry'
|
2
2
|
#
|
3
3
|
# This class represents a dictionary of words or phrases and the various words that can follow
|
4
4
|
# them. The key is an opaque value, which could represent either a single word or a phrase as desired.
|
5
5
|
module Markovian
|
6
|
-
class
|
6
|
+
class Chain
|
7
7
|
class Dictionary
|
8
|
-
def
|
9
|
-
#
|
10
|
-
dictionary[key.to_s]
|
11
|
-
end
|
12
|
-
|
13
|
-
def next_word(key)
|
14
|
-
dictionary[key].next_word
|
15
|
-
end
|
16
|
-
|
17
|
-
def previous_word(key)
|
18
|
-
dictionary[key].previous_word
|
8
|
+
def [](key)
|
9
|
+
# Key could be a string or a Tokeneyes::Word object
|
10
|
+
dictionary[key.to_s]
|
19
11
|
end
|
20
12
|
|
21
13
|
def random_word
|
@@ -0,0 +1,80 @@
|
|
1
|
+
module Markovian
|
2
|
+
class Chain
|
3
|
+
class DictionaryEntry
|
4
|
+
# Below this, we don't have enough occurrences to draw conclusions about how a word is used.
|
5
|
+
SIGNIFICANT_OCCURRENCE_THRESHOLD = 50
|
6
|
+
|
7
|
+
attr_reader :word, :counts
|
8
|
+
def initialize(word)
|
9
|
+
@word = word.to_s
|
10
|
+
@next_words = []
|
11
|
+
@previous_words = []
|
12
|
+
@counts = Hash.new(0)
|
13
|
+
end
|
14
|
+
|
15
|
+
def record_observance(word_instance, direction: :forwards)
|
16
|
+
# The word has been observed, so let's increase the appropriate counts.
|
17
|
+
# We don't want to double-count words if we read the text both forward and backward, so
|
18
|
+
# only count in the forward direction. (If we encounter a scenario where someone only wants
|
19
|
+
# to read in the backward direction, we can deal with that then.)
|
20
|
+
validate_direction(direction)
|
21
|
+
if direction == :forwards
|
22
|
+
@counts[:total] += 1
|
23
|
+
@counts[:ends_sentence] += 1 if word_instance.ends_sentence?
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def push(next_word, direction: :forwards)
|
28
|
+
# Also add the following word
|
29
|
+
array_for_direction(direction) << next_word.to_s
|
30
|
+
end
|
31
|
+
|
32
|
+
def next_word
|
33
|
+
next_words.sample
|
34
|
+
end
|
35
|
+
|
36
|
+
def previous_word
|
37
|
+
previous_words.sample
|
38
|
+
end
|
39
|
+
|
40
|
+
def ==(other)
|
41
|
+
self.word == other.word &&
|
42
|
+
self.next_words == other.next_words &&
|
43
|
+
self.previous_words == other.previous_words
|
44
|
+
end
|
45
|
+
|
46
|
+
def occurrences
|
47
|
+
counts[:total]
|
48
|
+
end
|
49
|
+
|
50
|
+
def likelihood_to_end_sentence
|
51
|
+
# without enough observations we can't estimate a likelihood, so return nil
|
52
|
+
if occurrences >= SIGNIFICANT_OCCURRENCE_THRESHOLD
|
53
|
+
counts[:ends_sentence].to_f / occurrences
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
def to_s
|
58
|
+
word
|
59
|
+
end
|
60
|
+
|
61
|
+
protected
|
62
|
+
|
63
|
+
# for equality checking and other usage
|
64
|
+
attr_reader :next_words, :previous_words
|
65
|
+
|
66
|
+
VALID_DIRECTIONS = [:backwards, :forwards]
|
67
|
+
|
68
|
+
def array_for_direction(direction)
|
69
|
+
validate_direction(direction)
|
70
|
+
direction == :backwards ? previous_words : next_words
|
71
|
+
end
|
72
|
+
|
73
|
+
def validate_direction(direction)
|
74
|
+
unless VALID_DIRECTIONS.include?(direction)
|
75
|
+
raise ArgumentError.new("Invalid direction #{direction.inspect}, valid directions are #{VALID_DIRECTIONS.inspect}")
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
@@ -1,27 +1,34 @@
|
|
1
1
|
require 'markovian/utils/text_splitter'
|
2
|
+
require 'markovian/text_builder/end_of_sentence_filter'
|
2
3
|
|
3
|
-
# This class, given a Markov
|
4
|
+
# This class, given a Markov chain, will attempt to construct a new text based on a given seed using
|
4
5
|
# the Markov associations.
|
5
6
|
module Markovian
|
6
7
|
class TextBuilder
|
7
|
-
attr_reader :seed_text, :
|
8
|
-
def initialize(
|
9
|
-
@
|
8
|
+
attr_reader :seed_text, :chain
|
9
|
+
def initialize(chain)
|
10
|
+
@chain = chain
|
10
11
|
end
|
11
12
|
|
12
|
-
def construct(seed_text, length: 140,
|
13
|
+
def construct(seed_text, length: 140, exclude_seed_text: false)
|
13
14
|
# TODO: if we don't hit a result for the first pair, move backward through the original text
|
14
15
|
# until we get something
|
15
|
-
|
16
|
-
result_with_next_word(
|
17
|
-
previous_pair:
|
18
|
-
result:
|
16
|
+
seed_components = split_seed_text(seed_text)
|
17
|
+
output = result_with_next_word(
|
18
|
+
previous_pair: identify_starter_text(seed_components),
|
19
|
+
result: exclude_seed_text ? [] : seed_components,
|
19
20
|
length: length
|
20
21
|
)
|
22
|
+
format_output(apply_filters(output))
|
21
23
|
end
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
+
protected
|
26
|
+
|
27
|
+
def apply_filters(output)
|
28
|
+
EndOfSentenceFilter.new.filtered_sentence(sentence_with_word_data(output))
|
29
|
+
end
|
30
|
+
|
31
|
+
def identify_starter_text(seed_components)
|
25
32
|
if seed_components.length >= 2
|
26
33
|
seed_components[-2..-1]
|
27
34
|
else
|
@@ -30,15 +37,13 @@ module Markovian
|
|
30
37
|
end
|
31
38
|
end
|
32
39
|
|
33
|
-
protected
|
34
|
-
|
35
40
|
def result_with_next_word(previous_pair:, result:, length:)
|
36
41
|
previous_word, current_word = previous_pair
|
37
|
-
if next_word =
|
42
|
+
if next_word = chain.next_word(current_word, previous_word: previous_word)
|
38
43
|
# we use join rather than + to avoid leading spaces, and strip to ignore leading nils or
|
39
44
|
# empty strings
|
40
|
-
interim_result =
|
41
|
-
if interim_result.length > length
|
45
|
+
interim_result = result + [next_word]
|
46
|
+
if format_output(interim_result).length > length
|
42
47
|
result
|
43
48
|
else
|
44
49
|
result_with_next_word(
|
@@ -52,14 +57,18 @@ module Markovian
|
|
52
57
|
end
|
53
58
|
end
|
54
59
|
|
55
|
-
# Turn an array of
|
56
|
-
def
|
57
|
-
array_of_words.compact.map(&:strip).join(" ")
|
60
|
+
# Turn an array of Word objects into an ongoing string
|
61
|
+
def format_output(array_of_words)
|
62
|
+
array_of_words.compact.map(&:to_s).map(&:strip).join(" ")
|
63
|
+
end
|
64
|
+
|
65
|
+
def sentence_with_word_data(sentence)
|
66
|
+
@sentence_with_word_data ||= sentence.map {|word| chain.word_entry(word)}
|
58
67
|
end
|
59
68
|
|
60
69
|
def split_seed_text(seed_text)
|
61
70
|
# We get back Tokeneyes::Word objects, but for now only care about the strings within
|
62
|
-
Utils::TextSplitter.new(seed_text).components
|
71
|
+
Utils::TextSplitter.new(seed_text).components
|
63
72
|
end
|
64
73
|
end
|
65
74
|
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Markovian
|
2
|
+
class TextBuilder
|
3
|
+
# This class will take a sentence and apply appropriate filters. It will roll back a sentence up
|
4
|
+
# to a certain number of words if those words have a low likelihood of ending the sentence.
|
5
|
+
# Future changes will increase the qualities filtered for.
|
6
|
+
class EndOfSentenceFilter
|
7
|
+
MAX_WORDS_FILTERED = 3
|
8
|
+
|
9
|
+
def filtered_sentence(sentence)
|
10
|
+
filter_unlikely_ending_words(sentence)
|
11
|
+
end
|
12
|
+
|
13
|
+
protected
|
14
|
+
|
15
|
+
def filter_unlikely_ending_words(current_sentence, words_filtered = 0)
|
16
|
+
return current_sentence if words_filtered >= MAX_WORDS_FILTERED
|
17
|
+
|
18
|
+
last_word = current_sentence.last
|
19
|
+
likelihood = last_word.likelihood_to_end_sentence
|
20
|
+
if likelihood && rand < likelihood
|
21
|
+
# if we pop a word, consider removing the next one
|
22
|
+
filter_unlikely_ending_words(current_sentence[0..-2], words_filtered + 1)
|
23
|
+
else
|
24
|
+
# if this word hasn't been seen enough, allow it to end a sentence
|
25
|
+
current_sentence
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
data/lib/markovian/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: markovian
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Alex Koppel
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-09
|
11
|
+
date: 2015-10-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: tokeneyes
|
@@ -75,14 +75,14 @@ files:
|
|
75
75
|
- db/seeds.rb
|
76
76
|
- lib/.DS_Store
|
77
77
|
- lib/markovian.rb
|
78
|
-
- lib/markovian/
|
79
|
-
- lib/markovian/
|
80
|
-
- lib/markovian/
|
81
|
-
- lib/markovian/
|
82
|
-
- lib/markovian/corpus/dictionary_entry.rb
|
78
|
+
- lib/markovian/chain.rb
|
79
|
+
- lib/markovian/chain/compiler.rb
|
80
|
+
- lib/markovian/chain/dictionary.rb
|
81
|
+
- lib/markovian/chain/dictionary_entry.rb
|
83
82
|
- lib/markovian/importers/twitter/csv_importer.rb
|
84
83
|
- lib/markovian/importers/twitter/tweet.rb
|
85
84
|
- lib/markovian/text_builder.rb
|
85
|
+
- lib/markovian/text_builder/end_of_sentence_filter.rb
|
86
86
|
- lib/markovian/utils/text_splitter.rb
|
87
87
|
- lib/markovian/version.rb
|
88
88
|
- markovian.gemspec
|
data/lib/markovian/corpus.rb
DELETED
@@ -1,31 +0,0 @@
|
|
1
|
-
require 'markovian/corpus/chain'
|
2
|
-
|
3
|
-
# This class represents a pair of chains, one going forward and one going backward. With this, we
|
4
|
-
# can construct phrases in which the original seed word appears at any point in the text (going
|
5
|
-
# backward to create the earlier text, forward to create the rest).
|
6
|
-
module Markovian
|
7
|
-
class Corpus
|
8
|
-
attr_reader :forward, :backward
|
9
|
-
def initialize
|
10
|
-
@forward, @backward = Chain.new, Chain.new
|
11
|
-
end
|
12
|
-
|
13
|
-
def next_word(word, previous_word: nil)
|
14
|
-
forward.next_word(word, previous_word: previous_word)
|
15
|
-
end
|
16
|
-
|
17
|
-
def previous_word(word, following_word: nil)
|
18
|
-
# backward goes in the opposite direction to forward
|
19
|
-
backward.next_word(word, previous_word: following_word)
|
20
|
-
end
|
21
|
-
|
22
|
-
def random_word
|
23
|
-
forward.random_word
|
24
|
-
end
|
25
|
-
|
26
|
-
def ==(other)
|
27
|
-
self.forward == other.forward &&
|
28
|
-
self.backward == other.backward
|
29
|
-
end
|
30
|
-
end
|
31
|
-
end
|
@@ -1,53 +0,0 @@
|
|
1
|
-
require 'markovian/corpus/dictionary'
|
2
|
-
|
3
|
-
# The Chain represents Markov info as it's being assembled or expanded from a text. To compensate
|
4
|
-
# for small sample sizes, we track multiple chains (derived from both two-word phrases and single
|
5
|
-
# word). Phrases are preferred, but if we can't find a match, we'll try with a single word.
|
6
|
-
module Markovian
|
7
|
-
class Corpus
|
8
|
-
class Chain
|
9
|
-
def initialize
|
10
|
-
@one_key_dictionary = Dictionary.new
|
11
|
-
@two_key_dictionary = Dictionary.new
|
12
|
-
end
|
13
|
-
|
14
|
-
def lengthen(word, next_word:, previous_word: nil)
|
15
|
-
@one_key_dictionary.push(word, next_word)
|
16
|
-
@two_key_dictionary.push(two_word_key(previous_word, word), next_word)
|
17
|
-
word
|
18
|
-
end
|
19
|
-
|
20
|
-
def next_word(word, previous_word: nil)
|
21
|
-
result_for_two_words(previous_word, word) || result_for_one_word(word)
|
22
|
-
end
|
23
|
-
|
24
|
-
def random_word
|
25
|
-
one_key_dictionary.random_word
|
26
|
-
end
|
27
|
-
|
28
|
-
def ==(other)
|
29
|
-
self.one_key_dictionary == other.one_key_dictionary &&
|
30
|
-
self.two_key_dictionary == other.two_key_dictionary
|
31
|
-
end
|
32
|
-
|
33
|
-
protected
|
34
|
-
|
35
|
-
# for equality checking
|
36
|
-
attr_reader :one_key_dictionary, :two_key_dictionary
|
37
|
-
|
38
|
-
def result_for_two_words(previous_word, word)
|
39
|
-
@two_key_dictionary.next_word(two_word_key(previous_word, word)) if previous_word
|
40
|
-
end
|
41
|
-
|
42
|
-
def result_for_one_word(word)
|
43
|
-
@one_key_dictionary.next_word(word)
|
44
|
-
end
|
45
|
-
|
46
|
-
# We represent the two words as a space-delimited phrase for simplicity and speed of access via
|
47
|
-
# hash keys.
|
48
|
-
def two_word_key(word1, word2)
|
49
|
-
"#{word1} #{word2}"
|
50
|
-
end
|
51
|
-
end
|
52
|
-
end
|
53
|
-
end
|
@@ -1,54 +0,0 @@
|
|
1
|
-
module Markovian
|
2
|
-
class Corpus
|
3
|
-
class DictionaryEntry
|
4
|
-
attr_reader :word, :count
|
5
|
-
def initialize(word)
|
6
|
-
@word = word
|
7
|
-
@next_words = []
|
8
|
-
@previous_words = []
|
9
|
-
@count = 0
|
10
|
-
end
|
11
|
-
|
12
|
-
def push(word, direction: :forwards)
|
13
|
-
# The incoming word will be a Tokeneyes::Word object
|
14
|
-
array_for_direction(direction) << word.to_s
|
15
|
-
# we don't want to double-count words if we read the text both forward and backward, so
|
16
|
-
# only count in the forward direction. (If we encounter a scenario where someone only wants
|
17
|
-
# to read in the backward direction, we can deal with that then.)
|
18
|
-
@count += 1 if direction == :forwards
|
19
|
-
end
|
20
|
-
|
21
|
-
def next_word
|
22
|
-
next_words.sample
|
23
|
-
end
|
24
|
-
|
25
|
-
def previous_word
|
26
|
-
previous_words.sample
|
27
|
-
end
|
28
|
-
|
29
|
-
def ==(other)
|
30
|
-
self.word == other.word &&
|
31
|
-
self.next_words == other.next_words &&
|
32
|
-
self.previous_words == other.previous_words
|
33
|
-
end
|
34
|
-
|
35
|
-
protected
|
36
|
-
|
37
|
-
# for equality checking
|
38
|
-
attr_reader :next_words, :previous_words
|
39
|
-
|
40
|
-
VALID_DIRECTIONS = [:backwards, :forwards]
|
41
|
-
|
42
|
-
def array_for_direction(direction)
|
43
|
-
validate_direction(direction)
|
44
|
-
direction == :backwards ? previous_words : next_words
|
45
|
-
end
|
46
|
-
|
47
|
-
def validate_direction(direction)
|
48
|
-
unless VALID_DIRECTIONS.include?(direction)
|
49
|
-
raise ArgumentError.new("Invalid direction #{direction.inspect}, valid directions are #{VALID_DIRECTIONS.inspect}")
|
50
|
-
end
|
51
|
-
end
|
52
|
-
end
|
53
|
-
end
|
54
|
-
end
|