markovian 0.2.9 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +7 -3
- data/README.md +12 -3
- data/changelog.md +9 -0
- data/lib/markovian.rb +2 -2
- data/lib/markovian/chain.rb +81 -0
- data/lib/markovian/{corpus → chain}/compiler.rb +11 -22
- data/lib/markovian/{corpus → chain}/dictionary.rb +5 -13
- data/lib/markovian/chain/dictionary_entry.rb +80 -0
- data/lib/markovian/importers/twitter/csv_importer.rb +2 -2
- data/lib/markovian/text_builder.rb +29 -20
- data/lib/markovian/text_builder/end_of_sentence_filter.rb +31 -0
- data/lib/markovian/version.rb +1 -1
- metadata +7 -7
- data/lib/markovian/corpus.rb +0 -31
- data/lib/markovian/corpus/chain.rb +0 -53
- data/lib/markovian/corpus/dictionary_entry.rb +0 -54
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 290b5c05432cd805aa1aafdae2d93b68cf1e9a8a
|
4
|
+
data.tar.gz: c51deea8332351976638c6767603ad137c85fb4b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: eca6c116a0e9686b90ebd3e9335cd55f3a48261a3824dd5d2d71c58e6ba97b8749c738b042da0f2b72c02df58936a924ae9953a1970b64d02d70b58f3f953ae9
|
7
|
+
data.tar.gz: e2279a199969da3cf587952a57a6eb1fb3d7f22e967cba3f4dc700ba5022e5562e4e70eb5a34f3f199a65ee458ed232952a9f7e17b254054e0b3bd7327d89839
|
data/Gemfile
CHANGED
@@ -1,10 +1,14 @@
|
|
1
|
-
source
|
1
|
+
source "https://rubygems.org"
|
2
2
|
|
3
|
-
# Specify your gem
|
3
|
+
# Specify your gem's dependencies in markov-ahkoppel2.gemspec
|
4
4
|
gemspec
|
5
5
|
|
6
6
|
group :development, :test do
|
7
|
-
gem
|
7
|
+
gem "byebug", platform: :mri
|
8
|
+
# If you're developing both gems, use the local version of Tokeneyes
|
9
|
+
if File.exist?("../tokeneyes")
|
10
|
+
gem "tokeneyes", path: "../tokeneyes"
|
11
|
+
end
|
8
12
|
end
|
9
13
|
|
10
14
|
group :test do
|
data/README.md
CHANGED
@@ -19,16 +19,25 @@ Fuller documentation will come shortly. For now, let's see how we can use Markov
|
|
19
19
|
=> path_to_twitter_archive
|
20
20
|
> importer = Markovian::Importers::Twitter::CsvImporter.new(path)
|
21
21
|
=> #<Markovian::Importers::Twitter::CsvImporter:0x007fd0ca3282a8 @path=path_to_twitter_archive>
|
22
|
-
# now assemble the
|
23
|
-
>
|
22
|
+
# now assemble the chain based on the tweets -- this may take a few seconds to compile
|
23
|
+
> chain = importer.chain
|
24
24
|
=> #<Markovian::Corpus:0x007fd0ca03df70 ...>
|
25
25
|
# Now, we can build some text!
|
26
|
-
> Markovian::TextBuilder.new(
|
26
|
+
> Markovian::TextBuilder.new(chain).construct("markov")
|
27
27
|
=> "markov chains a lot better than a month, i've been here half an hour of night when you can get behind belgium for the offline train journey"
|
28
28
|
```
|
29
29
|
|
30
30
|
Exactly!
|
31
31
|
|
32
|
+
## Features
|
33
|
+
|
34
|
+
So far, Markovian gives you the ability to, given a set of inputs, generate random text. In
|
35
|
+
addition, your money gets you:
|
36
|
+
|
37
|
+
* A built-in importer to turn Twitter csv archives into Markov chain-derived text
|
38
|
+
* A built-in filter to remove final words that statistically (in the corpus) rarely end sentences.
|
39
|
+
Avoid unsightly sentences ending in "and so of" and so on!
|
40
|
+
|
32
41
|
## Development
|
33
42
|
|
34
43
|
After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/changelog.md
CHANGED
@@ -1,5 +1,14 @@
|
|
1
1
|
# CHANGELOG
|
2
2
|
|
3
|
+
## 0.3.0
|
4
|
+
|
5
|
+
* TextBuilder now filters out final words that statistically rarely end sentences (first filter!)
|
6
|
+
* TextBuilder#construct now includes seed text by default (instead of via opt-in)
|
7
|
+
* Add Chain#word_entry to allow access to word data
|
8
|
+
* Properly collect metadata about words (previously collected next_word's data)
|
9
|
+
* Refactor Dictionary to provide access to entries, removing a lot of method duplication
|
10
|
+
* Remove Corpus class (no longer necessary), make Chain the base
|
11
|
+
|
3
12
|
## 0.2.9
|
4
13
|
|
5
14
|
Internal refactors only, no new functionality.
|
data/lib/markovian.rb
CHANGED
@@ -0,0 +1,81 @@
|
|
1
|
+
require 'markovian/chain/dictionary'
|
2
|
+
|
3
|
+
# The Chain represents Markov info as it's being assembled or expanded from a text. To compensate
|
4
|
+
# for small sample sizes, we track multiple chains (derived from both two-word phrases and single
|
5
|
+
# word). Phrases are preferred, but if we can't find a match, we'll try with a single word.
|
6
|
+
module Markovian
|
7
|
+
class Chain
|
8
|
+
def initialize
|
9
|
+
@one_key_dictionary = Dictionary.new
|
10
|
+
@two_key_dictionary = Dictionary.new
|
11
|
+
end
|
12
|
+
|
13
|
+
# Allow access to a word's metadata by providing its dictionary entry. For now, we only do
|
14
|
+
# individual words, not two-word phrases.
|
15
|
+
def word_entry(word)
|
16
|
+
@one_key_dictionary[word]
|
17
|
+
end
|
18
|
+
|
19
|
+
def lengthen(word, next_word:, previous_word: nil)
|
20
|
+
# When we encounter a word, we track its metadata and what words surround it
|
21
|
+
write_to_dictionary(@one_key_dictionary, word, word, next_word)
|
22
|
+
write_to_dictionary(@two_key_dictionary, two_word_key(previous_word, word), word, next_word)
|
23
|
+
word
|
24
|
+
end
|
25
|
+
|
26
|
+
def next_word(word, previous_word: nil)
|
27
|
+
if dictionary_entry = entry(word, previous_word)
|
28
|
+
dictionary_entry.next_word
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def random_word
|
33
|
+
one_key_dictionary.random_word
|
34
|
+
end
|
35
|
+
|
36
|
+
def ==(other)
|
37
|
+
self.one_key_dictionary == other.one_key_dictionary &&
|
38
|
+
self.two_key_dictionary == other.two_key_dictionary
|
39
|
+
end
|
40
|
+
|
41
|
+
protected
|
42
|
+
|
43
|
+
# for equality checking
|
44
|
+
attr_reader :one_key_dictionary, :two_key_dictionary
|
45
|
+
|
46
|
+
def entry(word, previous_word = nil)
|
47
|
+
if previous_word
|
48
|
+
entry_for_two_words(previous_word, word) || entry_for_one_word(word)
|
49
|
+
else
|
50
|
+
entry_for_one_word(word)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
def entry_for_two_words(previous_word, word)
|
55
|
+
entry_if_present(@two_key_dictionary[two_word_key(previous_word, word)])
|
56
|
+
end
|
57
|
+
|
58
|
+
def entry_for_one_word(word)
|
59
|
+
# Not strictly necessary, since if there's an empty entry here we'll just get nils, but better to
|
60
|
+
# do it right.
|
61
|
+
entry_if_present(@one_key_dictionary[word])
|
62
|
+
end
|
63
|
+
|
64
|
+
def entry_if_present(entry)
|
65
|
+
# Ignore empty entries that haven't actually been seen in the corpus
|
66
|
+
# TODO refactor to not even create them
|
67
|
+
entry if entry.occurrences > 0
|
68
|
+
end
|
69
|
+
|
70
|
+
# We represent the two words as a space-delimited phrase for simplicity and speed of access via
|
71
|
+
# hash keys.
|
72
|
+
def two_word_key(word1, word2)
|
73
|
+
"#{word1} #{word2}"
|
74
|
+
end
|
75
|
+
|
76
|
+
def write_to_dictionary(dictionary, key, word_instance, next_word)
|
77
|
+
dictionary[key].record_observance(word_instance)
|
78
|
+
dictionary[key].push(next_word)
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
@@ -17,40 +17,29 @@ require 'markovian/utils/text_splitter'
|
|
17
17
|
# * Handling sentences or newlines is later -- I'm not sure the right way to do it.
|
18
18
|
# * Capitalization is deferred for later.
|
19
19
|
module Markovian
|
20
|
-
class
|
20
|
+
class Chain
|
21
21
|
class Compiler
|
22
22
|
# Pass in a text, and optionally an existing Markov chain to add data to. In many cases, you
|
23
23
|
# may be building a chain using a set of smaller texts instead of one large texts (dialog,
|
24
24
|
# for instance, or Twitter archives), and so may call this class repeatedly for elements of
|
25
|
-
# the
|
26
|
-
attr_reader :
|
27
|
-
def initialize(
|
28
|
-
@
|
25
|
+
# the corpus.
|
26
|
+
attr_reader :chain
|
27
|
+
def initialize(starter_chain = Chain.new)
|
28
|
+
@chain = starter_chain
|
29
29
|
end
|
30
30
|
|
31
|
-
def
|
32
|
-
texts.each {|t|
|
33
|
-
|
31
|
+
def build_chain(texts)
|
32
|
+
texts.each {|t| incorporate_text_into_chain(t)}
|
33
|
+
chain
|
34
34
|
end
|
35
35
|
|
36
|
-
def
|
37
|
-
add_text_to_chain(split_into_components(text),
|
38
|
-
|
39
|
-
# that allows us to see what words precede any given word
|
40
|
-
add_text_to_chain(split_into_components(text).reverse, backward_chain)
|
41
|
-
corpus
|
36
|
+
def incorporate_text_into_chain(text)
|
37
|
+
add_text_to_chain(split_into_components(text), chain)
|
38
|
+
chain
|
42
39
|
end
|
43
40
|
|
44
41
|
protected
|
45
42
|
|
46
|
-
def forward_chain
|
47
|
-
corpus.forward
|
48
|
-
end
|
49
|
-
|
50
|
-
def backward_chain
|
51
|
-
corpus.backward
|
52
|
-
end
|
53
|
-
|
54
43
|
def add_text_to_chain(text_elements, chain)
|
55
44
|
previous_word = nil
|
56
45
|
text_elements.each_with_index do |word, index|
|
@@ -1,21 +1,13 @@
|
|
1
|
-
require 'markovian/
|
1
|
+
require 'markovian/chain/dictionary_entry'
|
2
2
|
#
|
3
3
|
# This class represents a dictionary of words or phrases and the various words that can follow
|
4
4
|
# them. The key is an opaque value, which could represent either a single word or a phrase as desired.
|
5
5
|
module Markovian
|
6
|
-
class
|
6
|
+
class Chain
|
7
7
|
class Dictionary
|
8
|
-
def
|
9
|
-
#
|
10
|
-
dictionary[key.to_s]
|
11
|
-
end
|
12
|
-
|
13
|
-
def next_word(key)
|
14
|
-
dictionary[key].next_word
|
15
|
-
end
|
16
|
-
|
17
|
-
def previous_word(key)
|
18
|
-
dictionary[key].previous_word
|
8
|
+
def [](key)
|
9
|
+
# Key could be a string or a Tokeneyes::Word object
|
10
|
+
dictionary[key.to_s]
|
19
11
|
end
|
20
12
|
|
21
13
|
def random_word
|
@@ -0,0 +1,80 @@
|
|
1
|
+
module Markovian
|
2
|
+
class Chain
|
3
|
+
class DictionaryEntry
|
4
|
+
# Below this, we don't have enough occurrences to draw conclusions about how a word is used.
|
5
|
+
SIGNIFICANT_OCCURRENCE_THRESHOLD = 50
|
6
|
+
|
7
|
+
attr_reader :word, :counts
|
8
|
+
def initialize(word)
|
9
|
+
@word = word.to_s
|
10
|
+
@next_words = []
|
11
|
+
@previous_words = []
|
12
|
+
@counts = Hash.new(0)
|
13
|
+
end
|
14
|
+
|
15
|
+
def record_observance(word_instance, direction: :forwards)
|
16
|
+
# The word has been observed, so let's increase the appropriate counts.
|
17
|
+
# We don't want to double-count words if we read the text both forward and backward, so
|
18
|
+
# only count in the forward direction. (If we encounter a scenario where someone only wants
|
19
|
+
# to read in the backward direction, we can deal with that then.)
|
20
|
+
validate_direction(direction)
|
21
|
+
if direction == :forwards
|
22
|
+
@counts[:total] += 1
|
23
|
+
@counts[:ends_sentence] += 1 if word_instance.ends_sentence?
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def push(next_word, direction: :forwards)
|
28
|
+
# Also add the following word
|
29
|
+
array_for_direction(direction) << next_word.to_s
|
30
|
+
end
|
31
|
+
|
32
|
+
def next_word
|
33
|
+
next_words.sample
|
34
|
+
end
|
35
|
+
|
36
|
+
def previous_word
|
37
|
+
previous_words.sample
|
38
|
+
end
|
39
|
+
|
40
|
+
def ==(other)
|
41
|
+
self.word == other.word &&
|
42
|
+
self.next_words == other.next_words &&
|
43
|
+
self.previous_words == other.previous_words
|
44
|
+
end
|
45
|
+
|
46
|
+
def occurrences
|
47
|
+
counts[:total]
|
48
|
+
end
|
49
|
+
|
50
|
+
def likelihood_to_end_sentence
|
51
|
+
# if we don't have enough data, we don't have enough data
|
52
|
+
if occurrences >= SIGNIFICANT_OCCURRENCE_THRESHOLD
|
53
|
+
counts[:ends_sentence].to_f / occurrences
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
def to_s
|
58
|
+
word
|
59
|
+
end
|
60
|
+
|
61
|
+
protected
|
62
|
+
|
63
|
+
# for equality checking and other usage
|
64
|
+
attr_reader :next_words, :previous_words
|
65
|
+
|
66
|
+
VALID_DIRECTIONS = [:backwards, :forwards]
|
67
|
+
|
68
|
+
def array_for_direction(direction)
|
69
|
+
validate_direction(direction)
|
70
|
+
direction == :backwards ? previous_words : next_words
|
71
|
+
end
|
72
|
+
|
73
|
+
def validate_direction(direction)
|
74
|
+
unless VALID_DIRECTIONS.include?(direction)
|
75
|
+
raise ArgumentError.new("Invalid direction #{direction.inspect}, valid directions are #{VALID_DIRECTIONS.inspect}")
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
@@ -1,27 +1,34 @@
|
|
1
1
|
require 'markovian/utils/text_splitter'
|
2
|
+
require 'markovian/text_builder/end_of_sentence_filter'
|
2
3
|
|
3
|
-
# This class, given a Markov
|
4
|
+
# This class, given a Markov chain, will attempt to construct a new text based on a given seed using
|
4
5
|
# the Markov associations.
|
5
6
|
module Markovian
|
6
7
|
class TextBuilder
|
7
|
-
attr_reader :seed_text, :
|
8
|
-
def initialize(
|
9
|
-
@
|
8
|
+
attr_reader :seed_text, :chain
|
9
|
+
def initialize(chain)
|
10
|
+
@chain = chain
|
10
11
|
end
|
11
12
|
|
12
|
-
def construct(seed_text, length: 140,
|
13
|
+
def construct(seed_text, length: 140, exclude_seed_text: false)
|
13
14
|
# TODO: if we don't hit a result for the first pair, move backward through the original text
|
14
15
|
# until we get something
|
15
|
-
|
16
|
-
result_with_next_word(
|
17
|
-
previous_pair:
|
18
|
-
result:
|
16
|
+
seed_components = split_seed_text(seed_text)
|
17
|
+
output = result_with_next_word(
|
18
|
+
previous_pair: identify_starter_text(seed_components),
|
19
|
+
result: exclude_seed_text ? [] : seed_components,
|
19
20
|
length: length
|
20
21
|
)
|
22
|
+
format_output(apply_filters(output))
|
21
23
|
end
|
22
24
|
|
23
|
-
|
24
|
-
|
25
|
+
protected
|
26
|
+
|
27
|
+
def apply_filters(output)
|
28
|
+
EndOfSentenceFilter.new.filtered_sentence(sentence_with_word_data(output))
|
29
|
+
end
|
30
|
+
|
31
|
+
def identify_starter_text(seed_components)
|
25
32
|
if seed_components.length >= 2
|
26
33
|
seed_components[-2..-1]
|
27
34
|
else
|
@@ -30,15 +37,13 @@ module Markovian
|
|
30
37
|
end
|
31
38
|
end
|
32
39
|
|
33
|
-
protected
|
34
|
-
|
35
40
|
def result_with_next_word(previous_pair:, result:, length:)
|
36
41
|
previous_word, current_word = previous_pair
|
37
|
-
if next_word =
|
42
|
+
if next_word = chain.next_word(current_word, previous_word: previous_word)
|
38
43
|
# we use join rather than + to avoid leading spaces, and strip to ignore leading nils or
|
39
44
|
# empty strings
|
40
|
-
interim_result =
|
41
|
-
if interim_result.length > length
|
45
|
+
interim_result = result + [next_word]
|
46
|
+
if format_output(interim_result).length > length
|
42
47
|
result
|
43
48
|
else
|
44
49
|
result_with_next_word(
|
@@ -52,14 +57,18 @@ module Markovian
|
|
52
57
|
end
|
53
58
|
end
|
54
59
|
|
55
|
-
# Turn an array of
|
56
|
-
def
|
57
|
-
array_of_words.compact.map(&:strip).join(" ")
|
60
|
+
# Turn an array of Word objects into an ongoing string
|
61
|
+
def format_output(array_of_words)
|
62
|
+
array_of_words.compact.map(&:to_s).map(&:strip).join(" ")
|
63
|
+
end
|
64
|
+
|
65
|
+
def sentence_with_word_data(sentence)
|
66
|
+
@sentence_with_word_data ||= sentence.map {|word| chain.word_entry(word)}
|
58
67
|
end
|
59
68
|
|
60
69
|
def split_seed_text(seed_text)
|
61
70
|
# We get back Tokeneyes::Word objects, but for now only care about the strings within
|
62
|
-
Utils::TextSplitter.new(seed_text).components
|
71
|
+
Utils::TextSplitter.new(seed_text).components
|
63
72
|
end
|
64
73
|
end
|
65
74
|
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Markovian
|
2
|
+
class TextBuilder
|
3
|
+
# This class will take sentence and apply appropriate filters. It will roll back a sentence up
|
4
|
+
# to a certain number of words if those words have a low likelihood of ending the sentence.
|
5
|
+
# Future changes will increase the qualities filtered for.
|
6
|
+
class EndOfSentenceFilter
|
7
|
+
MAX_WORDS_FILTERED = 3
|
8
|
+
|
9
|
+
def filtered_sentence(sentence)
|
10
|
+
filter_unlikely_ending_words(sentence)
|
11
|
+
end
|
12
|
+
|
13
|
+
protected
|
14
|
+
|
15
|
+
def filter_unlikely_ending_words(current_sentence, words_filtered = 0)
|
16
|
+
return current_sentence if words_filtered >= MAX_WORDS_FILTERED
|
17
|
+
|
18
|
+
last_word = current_sentence.last
|
19
|
+
likelihood = last_word.likelihood_to_end_sentence
|
20
|
+
if likelihood && rand < likelihood
|
21
|
+
# if we pop a word, consider removing the next one
|
22
|
+
filter_unlikely_ending_words(current_sentence[0..-2], words_filtered + 1)
|
23
|
+
else
|
24
|
+
# if this word hasn't been seen enough, allow it to end a sentence
|
25
|
+
current_sentence
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
data/lib/markovian/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: markovian
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Alex Koppel
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-09
|
11
|
+
date: 2015-10-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: tokeneyes
|
@@ -75,14 +75,14 @@ files:
|
|
75
75
|
- db/seeds.rb
|
76
76
|
- lib/.DS_Store
|
77
77
|
- lib/markovian.rb
|
78
|
-
- lib/markovian/
|
79
|
-
- lib/markovian/
|
80
|
-
- lib/markovian/
|
81
|
-
- lib/markovian/
|
82
|
-
- lib/markovian/corpus/dictionary_entry.rb
|
78
|
+
- lib/markovian/chain.rb
|
79
|
+
- lib/markovian/chain/compiler.rb
|
80
|
+
- lib/markovian/chain/dictionary.rb
|
81
|
+
- lib/markovian/chain/dictionary_entry.rb
|
83
82
|
- lib/markovian/importers/twitter/csv_importer.rb
|
84
83
|
- lib/markovian/importers/twitter/tweet.rb
|
85
84
|
- lib/markovian/text_builder.rb
|
85
|
+
- lib/markovian/text_builder/end_of_sentence_filter.rb
|
86
86
|
- lib/markovian/utils/text_splitter.rb
|
87
87
|
- lib/markovian/version.rb
|
88
88
|
- markovian.gemspec
|
data/lib/markovian/corpus.rb
DELETED
@@ -1,31 +0,0 @@
|
|
1
|
-
require 'markovian/corpus/chain'
|
2
|
-
|
3
|
-
# This class represents a pair of chains, one going forward and one going backward. With this, we
|
4
|
-
# can construct phrases in which the original seed word appears at any point in the text (going
|
5
|
-
# backward to create the earlier text, forward to create the rest).
|
6
|
-
module Markovian
|
7
|
-
class Corpus
|
8
|
-
attr_reader :forward, :backward
|
9
|
-
def initialize
|
10
|
-
@forward, @backward = Chain.new, Chain.new
|
11
|
-
end
|
12
|
-
|
13
|
-
def next_word(word, previous_word: nil)
|
14
|
-
forward.next_word(word, previous_word: previous_word)
|
15
|
-
end
|
16
|
-
|
17
|
-
def previous_word(word, following_word: nil)
|
18
|
-
# backward goes in the opposite direction to forward
|
19
|
-
backward.next_word(word, previous_word: following_word)
|
20
|
-
end
|
21
|
-
|
22
|
-
def random_word
|
23
|
-
forward.random_word
|
24
|
-
end
|
25
|
-
|
26
|
-
def ==(other)
|
27
|
-
self.forward == other.forward &&
|
28
|
-
self.backward == other.backward
|
29
|
-
end
|
30
|
-
end
|
31
|
-
end
|
@@ -1,53 +0,0 @@
|
|
1
|
-
require 'markovian/corpus/dictionary'
|
2
|
-
|
3
|
-
# The Chain represents Markov info as it's being assembled or expanded from a text. To compensate
|
4
|
-
# for small sample sizes, we track multiple chains (derived from both two-word phrases and single
|
5
|
-
# word). Phrases are preferred, but if we can't find a match, we'll try with a single word.
|
6
|
-
module Markovian
|
7
|
-
class Corpus
|
8
|
-
class Chain
|
9
|
-
def initialize
|
10
|
-
@one_key_dictionary = Dictionary.new
|
11
|
-
@two_key_dictionary = Dictionary.new
|
12
|
-
end
|
13
|
-
|
14
|
-
def lengthen(word, next_word:, previous_word: nil)
|
15
|
-
@one_key_dictionary.push(word, next_word)
|
16
|
-
@two_key_dictionary.push(two_word_key(previous_word, word), next_word)
|
17
|
-
word
|
18
|
-
end
|
19
|
-
|
20
|
-
def next_word(word, previous_word: nil)
|
21
|
-
result_for_two_words(previous_word, word) || result_for_one_word(word)
|
22
|
-
end
|
23
|
-
|
24
|
-
def random_word
|
25
|
-
one_key_dictionary.random_word
|
26
|
-
end
|
27
|
-
|
28
|
-
def ==(other)
|
29
|
-
self.one_key_dictionary == other.one_key_dictionary &&
|
30
|
-
self.two_key_dictionary == other.two_key_dictionary
|
31
|
-
end
|
32
|
-
|
33
|
-
protected
|
34
|
-
|
35
|
-
# for equality checking
|
36
|
-
attr_reader :one_key_dictionary, :two_key_dictionary
|
37
|
-
|
38
|
-
def result_for_two_words(previous_word, word)
|
39
|
-
@two_key_dictionary.next_word(two_word_key(previous_word, word)) if previous_word
|
40
|
-
end
|
41
|
-
|
42
|
-
def result_for_one_word(word)
|
43
|
-
@one_key_dictionary.next_word(word)
|
44
|
-
end
|
45
|
-
|
46
|
-
# We represent the two words as a space-delimited phrase for simplicity and speed of access via
|
47
|
-
# hash keys.
|
48
|
-
def two_word_key(word1, word2)
|
49
|
-
"#{word1} #{word2}"
|
50
|
-
end
|
51
|
-
end
|
52
|
-
end
|
53
|
-
end
|
@@ -1,54 +0,0 @@
|
|
1
|
-
module Markovian
|
2
|
-
class Corpus
|
3
|
-
class DictionaryEntry
|
4
|
-
attr_reader :word, :count
|
5
|
-
def initialize(word)
|
6
|
-
@word = word
|
7
|
-
@next_words = []
|
8
|
-
@previous_words = []
|
9
|
-
@count = 0
|
10
|
-
end
|
11
|
-
|
12
|
-
def push(word, direction: :forwards)
|
13
|
-
# The incoming word will be a Tokeneyes::Word object
|
14
|
-
array_for_direction(direction) << word.to_s
|
15
|
-
# we don't want to double-count words if we read the text both forward and backward, so
|
16
|
-
# only count in the forward direction. (If we encounter a scenario where someone only wants
|
17
|
-
# to read in the backward direction, we can deal with that then.)
|
18
|
-
@count += 1 if direction == :forwards
|
19
|
-
end
|
20
|
-
|
21
|
-
def next_word
|
22
|
-
next_words.sample
|
23
|
-
end
|
24
|
-
|
25
|
-
def previous_word
|
26
|
-
previous_words.sample
|
27
|
-
end
|
28
|
-
|
29
|
-
def ==(other)
|
30
|
-
self.word == other.word &&
|
31
|
-
self.next_words == other.next_words &&
|
32
|
-
self.previous_words == other.previous_words
|
33
|
-
end
|
34
|
-
|
35
|
-
protected
|
36
|
-
|
37
|
-
# for equality checking
|
38
|
-
attr_reader :next_words, :previous_words
|
39
|
-
|
40
|
-
VALID_DIRECTIONS = [:backwards, :forwards]
|
41
|
-
|
42
|
-
def array_for_direction(direction)
|
43
|
-
validate_direction(direction)
|
44
|
-
direction == :backwards ? previous_words : next_words
|
45
|
-
end
|
46
|
-
|
47
|
-
def validate_direction(direction)
|
48
|
-
unless VALID_DIRECTIONS.include?(direction)
|
49
|
-
raise ArgumentError.new("Invalid direction #{direction.inspect}, valid directions are #{VALID_DIRECTIONS.inspect}")
|
50
|
-
end
|
51
|
-
end
|
52
|
-
end
|
53
|
-
end
|
54
|
-
end
|