markovfun 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/Gemfile +1 -0
- data/README.md +35 -8
- data/lib/markovfun/trigram.rb +104 -0
- data/lib/markovfun/util.rb +27 -0
- data/lib/markovfun/version.rb +1 -1
- data/lib/markovfun.rb +2 -123
- data/markovfun.gemspec +1 -0
- metadata +3 -1
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -2,11 +2,7 @@
 
 This gem generates sentences from textfiles using trigrams.
 It is based on Alex Rudnick's Python Markov chain generator,
-<<<<<<< HEAD
 the code for which is [here](https://github.com/alexrudnick/hackerschool-demos/tree/master/ngrams).
-=======
-the code for which is (here)[https://github.com/alexrudnick/hackerschool-demos/tree/master/ngrams].
->>>>>>> f8c8a08... Updated README
 
 ## Installation
 
@@ -24,11 +20,42 @@ Or install it yourself as:
 
 ## Usage
 
+### File Processing
+
+Get sentences from a text file:
+
+`sentences = Markovfun::Util.get_sentences("bible.txt")`
+
+### Trigrams
+
+Create hash storing counts of words that follow two previous words.
+
+`counts = Markovfun::Trigram.get_counts(sentences)`
+
+Convert hash of counts to hash of probabilities.
+
+`probs = Markovfun::Trigram.counts_to_probs(counts)`
+
+Generate a sentence with a specified min length (in this case, 4) from the probability hash.
+
+`sentence = Markovfun::Trigram.sentence_from_probs_hash(probs, 4)`
+
+Score the sentence by "surprisal value" given a probability has.
+
+`Markovfun::Trigram.score_sentence(sentence, probs)`
+
+### Sample Program
+
 Here's how you can generate a sentence from a text file.
 
 ```
-sentences = Markovfun.get_sentences("bible.txt")
-counts = Markovfun.get_counts(sentences)
-probs = Markovfun.counts_to_probs(counts)
-Markovfun.sentence_from_probs_hash(probs)
+sentences = Markovfun::Util.get_sentences("bible.txt")
+counts = Markovfun::Trigram.get_counts(sentences)
+probs = Markovfun::Trigram.counts_to_probs(counts)
+Markovfun::Trigram.sentence_from_probs_hash(probs, 4)
 ```
+
+### Sample Sentence!
+
+From "The Beautiful and the Damned":
+"I liked him tremendously--ah, she had enjoyed a rather romantic figure, a scholar, a recluse, a tower of erudition."
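A note on the "surprisal value" mentioned in the new Usage section: score_sentence (in the new trigram.rb below) sums -log2 of each trigram probability across the sentence, so a lower score means a more probable sentence, and sentence_from_probs_hash keeps resampling until the score is at most 30 and the sentence reaches the minimum length. A minimal illustration of one step, with a made-up probability rather than one taken from the gem:

```
# Hypothetical probability that a word follows its two predecessors.
prob = 0.25
# Surprisal of that single trigram, in bits; score_sentence sums these.
surprisal = -Math.log(prob, 2)  # => 2.0
```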
data/lib/markovfun/trigram.rb
ADDED
@@ -0,0 +1,104 @@
+require "markovfun/version"
+require "markovfun/util"
+
+module Markovfun
+  module Trigram
+    include Markovfun::Util
+
+    # Generates a sentence, given a file.
+    def self.sentence_from_file(filename, min_length)
+      sentences = get_sentences(filename)
+      counts = get_counts(sentences)
+      probs = counts_to_probs(counts)
+      sentence_from_probs_hash(probs, min_length)
+    end
+
+    # Returns a counts hash, given a list of sentences.
+    # The keys to the hash are all observed combinations of [prev2, prev1],
+    # where prev2 and prev1 are the two previous words.
+    # The values are hashes, in which the keys are words (cur) that have followed
+    # prev2 and prev1, and the values are the number of occurrences.
+    def self.get_counts(sentences)
+      counts_hash = {}
+      sentences.each do |sent|
+        # nil denotes the beginnings and ends of sentences
+        sent = [nil, nil] + sent + [nil]
+        sent.zip(sent[1..-1], sent[2..-1]).each do |prev2, prev1, cur|
+          counts_hash[[prev2, prev1]] ||= {}
+          if !(counts_hash[[prev2, prev1]][cur])
+            counts_hash[[prev2, prev1]][cur] = 1
+          else
+            counts_hash[[prev2, prev1]][cur] += 1
+          end
+        end
+      end
+      counts_hash
+    end
+
+    # Generates a probability hash, given a counts hash.
+    # Similar to counts_hash, except containing the probability that a word
+    # follows two preceding words (as opposed to number of occurrences).
+    def self.counts_to_probs(counts_hash)
+      probs_hash = {}
+      counts_hash.each do |prev, cur_freq|
+        probs_hash[prev] ||= {}
+        cur_freq.each do |cur, freq|
+          prob = freq.to_f / cur_freq.values.reduce(:+)
+          probs_hash[prev][cur] = prob
+        end
+      end
+      probs_hash
+    end
+
+    # Generates a sample word, given a probability hash.
+    def self.sample_word(probs_hash)
+      score = rand
+      probs_hash.each do |word, prob|
+        return word if score < prob
+        score -= prob
+      end
+    end
+
+    # Generates a sample sentence, given a probability hash.
+    def self.sample_sentence(probs_hash)
+      prev2 = nil
+      prev1 = nil
+      out = []
+
+      while true
+        cur = sample_word(probs_hash[[prev2, prev1]])
+        if cur.nil?
+          return out
+        else
+          out << cur
+          prev2 = prev1
+          prev1 = cur
+        end
+      end
+    end
+
+    # Generates a sentence from a probability hash.
+    def self.sentence_from_probs_hash(probs, min_length)
+      sent = []
+      while score_sentence(sent, probs) > 30 || sent.length < min_length
+        sent = sample_sentence(probs)
+      end
+      sent = sent[0..-2].join(" ") + "."
+      sent
+    end
+
+    # Scores a sentence, depending on the likelihood that it occurs
+    # within a corpus.
+    def self.score_sentence(sent, probs)
+      total_surprise = 0
+
+      sent = sent[0..-2].split(" ").push(".")
+      sent = [nil, nil] + sent + [nil]
+
+      sent.zip(sent[1..-1], sent[2..-1]).each do |prev2, prev1, cur|
+        total_surprise += -Math.log(probs[[prev2, prev1]][cur], 2)
+      end
+      total_surprise
+    end
+  end
+end
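For readers skimming the diff, a small sketch of the hash shapes get_counts and counts_to_probs build, using a hypothetical two-sentence corpus (assumes markovfun 0.0.3 is installed; the expected values are written out by hand from the code above):

```
require "markovfun"

# Two tokenized sentences, in the shape Util.get_sentences produces.
sentences = [["the", "cat", "sat", "."], ["the", "cat", "ran", "."]]

counts = Markovfun::Trigram.get_counts(sentences)
counts[["the", "cat"]]   # => {"sat"=>1, "ran"=>1}
counts[[nil, "the"]]     # => {"cat"=>2}   (nil marks sentence boundaries)

probs = Markovfun::Trigram.counts_to_probs(counts)
probs[["the", "cat"]]    # => {"sat"=>0.5, "ran"=>0.5}
```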
data/lib/markovfun/util.rb
ADDED
@@ -0,0 +1,27 @@
+require "markovfun/version"
+
+module Markovfun
+  module Util
+
+    # Gets lines from a file.
+    def self.get_lines(filename)
+      file = File.open(filename, "r")
+      data = file.read
+      file.close
+      lines = data.split("\n")
+      lines.map! { |l| l.strip.split(" ") }
+    end
+
+    # Gets sentences from a file.
+    def self.get_sentences(filename)
+      file = File.open(filename, "r")
+      data = file.read
+      file.close
+      data.gsub!(/\n/, " ")
+      data.gsub!(/"/,"")
+      sentences = data.split(".")
+      sentences.map! { |s| s.strip.split(" ").push(".") }
+      sentences.select! { |s| s[0].capitalize == s[0] }
+    end
+  end
+end
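And a quick sketch of what Util.get_sentences returns, using hypothetical file contents (assumes markovfun 0.0.3; the temp-file setup is only for illustration):

```
require "markovfun"
require "tempfile"

# Hypothetical three-sentence corpus; the lowercase sentence gets dropped.
corpus = Tempfile.new("corpus")
corpus.write(%(The cat sat.\nthe dog barked.\nThe "dog" ran.))
corpus.close

# Newlines become spaces, quotes are stripped, sentences that don't start
# with a capitalized word are filtered out, and each remaining sentence
# comes back as an array of words ending in "."
Markovfun::Util.get_sentences(corpus.path)
# => [["The", "cat", "sat", "."], ["The", "dog", "ran", "."]]
```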
data/lib/markovfun/version.rb
CHANGED
data/lib/markovfun.rb
CHANGED
@@ -1,123 +1,2 @@
-require
-require '
-
-module Markovfun
-
-  # Generates a sentence, given a file.
-  def self.sentence_from_file(filename)
-    sentences = get_sentences(filename)
-    counts = buildcounts(sentences)
-    probs = counts_to_probs(counts)
-    sentence_from_probs_hash(probs)
-  end
-
-  # Gets lines from a file.
-  def self.get_lines(filename)
-    file = File.open(filename, "r")
-    data = file.read
-    file.close
-    lines = data.split("\n")
-    lines.map! { |l| l.strip.split(" ") }
-  end
-
-  # Gets sentences from a file.
-  def self.get_sentences(filename)
-    file = File.open(filename, "r")
-    data = file.read
-    file.close
-    data.gsub!(/\n/, "")
-    data.gsub!(/"/,"")
-    sentences = data.split(".")
-    sentences.map! { |s| s.strip.split(" ").push(".") }
-    sentences.select! { |s| s[0].capitalize == s[0] }
-  end
-
-  # Returns a counts hash, given a list of sentences.
-  # The keys to the hash are all observed combinations of [prev2, prev1],
-  # where prev2 and prev1 are the two previous words.
-  # The values are hashes, in which the keys are words (cur) that have followed
-  # prev2 and prev1, and the values are the number of occurrences.
-  def self.get_counts(sentences)
-    counts_hash = {}
-    sentences.each do |sent|
-      # nil denotes the beginnings and ends of sentences
-      sent = [nil, nil] + sent + [nil]
-      sent.zip(sent[1..-1], sent[2..-1]).each do |prev2, prev1, cur|
-        counts_hash[[prev2, prev1]] ||= {}
-        if !(counts_hash[[prev2, prev1]][cur])
-          counts_hash[[prev2, prev1]][cur] = 1
-        else
-          counts_hash[[prev2, prev1]][cur] += 1
-        end
-      end
-    end
-    counts_hash
-  end
-
-  # Generates a probability hash, given a counts hash.
-  # Similar to counts_hash, except containing the probability that a word
-  # follows two preceding words (as opposed to number of occurrences).
-  def self.counts_to_probs(counts_hash)
-    probs_hash = {}
-    counts_hash.each do |prev, cur_freq|
-      probs_hash[prev] ||= {}
-      cur_freq.each do |cur, freq|
-        prob = freq.to_f / cur_freq.values.reduce(:+)
-        probs_hash[prev][cur] = prob
-      end
-    end
-    probs_hash
-  end
-
-  # Generates a sample word, given a probability hash.
-  def self.sample_word(probs_hash)
-    score = rand
-    probs_hash.each do |word, prob|
-      return word if score < prob
-      score -= prob
-    end
-  end
-
-  # Generates a sample sentence, given a probability hash.
-  def self.sample_sentence(probs_hash)
-    prev2 = nil
-    prev1 = nil
-    out = []
-
-    while true
-      cur = sample_word(probs_hash[[prev2, prev1]])
-      if cur.nil?
-        return out
-      else
-        out << cur
-        prev2 = prev1
-        prev1 = cur
-      end
-    end
-  end
-
-  # Scores a sentence, depending on the likelihood that it occurs
-  # within a corpus.
-  def self.score_sentence(sent, probs)
-    total_surprise = 0
-    sent = [nil, nil] + sent + [nil]
-
-    sent.zip(sent[1..-1], sent[2..-1]).each do |prev2, prev1, cur|
-      total_surprise += -Math.log(probs[[prev2, prev1]][cur], 2)
-    end
-    total_surprise
-  end
-
-  # Generates a sentence from a probability hash.
-  def self.sentence_from_probs_hash(probs)
-    sent = []
-    while score_sentence(sent, probs) > 30 || sent.length < 4
-      sent = sample_sentence(probs)
-    end
-    puts "score: #{score_sentence(sent, probs)}"
-    sent = sent[0..-2].join(" ") + "."
-    puts sent
-    sent
-  end
-
-end
+require 'markovfun/trigram'
+require 'markovfun/util'
data/markovfun.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: markovfun
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 prerelease:
 platform: ruby
 authors:
@@ -56,6 +56,8 @@ files:
 - README.md
 - Rakefile
 - lib/markovfun.rb
+- lib/markovfun/trigram.rb
+- lib/markovfun/util.rb
 - lib/markovfun/version.rb
 - markovfun.gemspec
 homepage: https://github.com/mariapacana/markovfun