markovfun 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/Gemfile +1 -0
- data/README.md +35 -8
- data/lib/markovfun/trigram.rb +104 -0
- data/lib/markovfun/util.rb +27 -0
- data/lib/markovfun/version.rb +1 -1
- data/lib/markovfun.rb +2 -123
- data/markovfun.gemspec +1 -0
- metadata +3 -1
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -2,11 +2,7 @@
|
|
2
2
|
|
3
3
|
This gem generates sentences from textfiles using trigrams.
|
4
4
|
It is based on Alex Rudnick's Python Markov chain generator,
|
5
|
-
<<<<<<< HEAD
|
6
5
|
the code for which is [here](https://github.com/alexrudnick/hackerschool-demos/tree/master/ngrams).
|
7
|
-
=======
|
8
|
-
the code for which is (here)[https://github.com/alexrudnick/hackerschool-demos/tree/master/ngrams].
|
9
|
-
>>>>>>> f8c8a08... Updated README
|
10
6
|
|
11
7
|
## Installation
|
12
8
|
|
@@ -24,11 +20,42 @@ Or install it yourself as:
|
|
24
20
|
|
25
21
|
## Usage
|
26
22
|
|
23
|
+
### File Processing
|
24
|
+
|
25
|
+
Get sentences from a text file:
|
26
|
+
|
27
|
+
`sentences = Markovfun::Util.get_sentences("bible.txt")`
|
28
|
+
|
29
|
+
### Trigrams
|
30
|
+
|
31
|
+
Create a hash that stores, for each pair of preceding words, the counts of the words that follow them.
|
32
|
+
|
33
|
+
`counts = Markovfun::Trigram.get_counts(sentences)`
|
34
|
+
|
35
|
+
Convert hash of counts to hash of probabilities.
|
36
|
+
|
37
|
+
`probs = Markovfun::Trigram.counts_to_probs(counts)`
|
38
|
+
|
39
|
+
Generate a sentence with a specified min length (in this case, 4) from the probability hash.
|
40
|
+
|
41
|
+
`sentence = Markovfun::Trigram.sentence_from_probs_hash(probs, 4)`
|
42
|
+
|
43
|
+
Score the sentence by "surprisal value" given a probability hash.
|
44
|
+
|
45
|
+
`Markovfun::Trigram.score_sentence(sentence, probs)`
|
46
|
+
|
47
|
+
### Sample Program
|
48
|
+
|
27
49
|
Here's how you can generate a sentence from a text file.
|
28
50
|
|
29
51
|
```
|
30
|
-
sentences = Markovfun.get_sentences("bible.txt")
|
31
|
-
counts = Markovfun.get_counts(sentences)
|
32
|
-
probs = Markovfun.counts_to_probs(counts)
|
33
|
-
Markovfun.sentence_from_probs_hash(probs)
|
52
|
+
sentences = Markovfun::Util.get_sentences("bible.txt")
|
53
|
+
counts = Markovfun::Trigram.get_counts(sentences)
|
54
|
+
probs = Markovfun::Trigram.counts_to_probs(counts)
|
55
|
+
Markovfun::Trigram.sentence_from_probs_hash(probs, 4)
|
34
56
|
```
|
57
|
+
|
58
|
+
### Sample Sentence!
|
59
|
+
|
60
|
+
From "The Beautiful and Damned":
|
61
|
+
"I liked him tremendously--ah, she had enjoyed a rather romantic figure, a scholar, a recluse, a tower of erudition."
|
@@ -0,0 +1,104 @@
|
|
1
|
+
require "markovfun/version"
|
2
|
+
require "markovfun/util"
|
3
|
+
|
4
|
+
module Markovfun
  # Trigram (second-order Markov chain) sentence model.
  #
  # A sentence is an Array of word tokens whose last token is ".".
  # +nil+ is used as a padding token: two leading nils stand for the start
  # of a sentence and one trailing nil stands for its end.
  #
  # Markovfun::Util only defines module-level (+def self.+) helpers, so
  # including it here would mix in nothing; its helpers are called
  # explicitly instead.
  module Trigram
    # Generates a sentence, given a file.
    #
    # filename   - path of the text file to learn from.
    # min_length - minimum number of tokens (the final "." counts as one)
    #              the generated sentence must have. Defaults to 4.
    #
    # Returns the generated sentence as a String.
    def self.sentence_from_file(filename, min_length = 4)
      sentences = Markovfun::Util.get_sentences(filename)
      probs = counts_to_probs(get_counts(sentences))
      sentence_from_probs_hash(probs, min_length)
    end

    # Returns a counts hash, given a list of sentences.
    # The keys to the hash are all observed combinations of [prev2, prev1],
    # where prev2 and prev1 are the two previous words.
    # The values are hashes, in which the keys are words (cur) that have
    # followed prev2 and prev1, and the values are the number of occurrences.
    def self.get_counts(sentences)
      counts_hash = {}
      sentences.each do |sent|
        # nil denotes the beginnings and ends of sentences
        padded = [nil, nil] + sent + [nil]
        # each_cons yields only complete trigrams. Array#zip would pad the
        # two trailing tuples with nil and record transitions that never
        # occurred in the text.
        padded.each_cons(3) do |prev2, prev1, cur|
          followers = (counts_hash[[prev2, prev1]] ||= {})
          followers[cur] = followers.fetch(cur, 0) + 1
        end
      end
      counts_hash
    end

    # Generates a probability hash, given a counts hash.
    # Similar to counts_hash, except containing the probability that a word
    # follows two preceding words (as opposed to number of occurrences).
    def self.counts_to_probs(counts_hash)
      probs_hash = {}
      counts_hash.each do |prev, cur_freq|
        # The row total is the same for every entry, so compute it once.
        total = cur_freq.values.reduce(0, :+).to_f
        row = {}
        cur_freq.each { |cur, freq| row[cur] = freq / total }
        probs_hash[prev] = row
      end
      probs_hash
    end

    # Generates a sample word, given a probability hash.
    #
    # probs_hash - Hash mapping each candidate word to its probability
    #              (one row of the hash built by counts_to_probs).
    #
    # Returns the sampled word. nil is returned when the end-of-sentence
    # marker is sampled, or when probs_hash is nil or empty.
    def self.sample_word(probs_hash)
      return nil if probs_hash.nil? || probs_hash.empty?

      score = rand
      probs_hash.each do |word, prob|
        return word if score < prob
        score -= prob
      end
      # Rounding can leave a sliver of probability unassigned; give it to
      # the last candidate instead of leaking the value of Hash#each.
      probs_hash.keys.last
    end

    # Generates a sample sentence, given a probability hash.
    #
    # Returns an Array of tokens; sampling stops when sample_word
    # returns nil.
    def self.sample_sentence(probs_hash)
      prev2 = nil
      prev1 = nil
      out = []

      loop do
        cur = sample_word(probs_hash[[prev2, prev1]])
        return out if cur.nil?

        out << cur
        prev2 = prev1
        prev1 = cur
      end
    end

    # Generates a sentence from a probability hash.
    #
    # probs        - probability hash built by counts_to_probs.
    # min_length   - minimum number of tokens (the final "." counts as one).
    # max_surprise - highest acceptable score_sentence value, in bits.
    #
    # Keeps sampling until a sentence satisfies both limits, so it does not
    # return if no such sentence can be produced from probs.
    #
    # Returns the sentence as a String.
    # Raises ArgumentError if probs has no sentence-start entry.
    def self.sentence_from_probs_hash(probs, min_length, max_surprise = 30)
      start = probs && probs[[nil, nil]]
      if start.nil? || start.empty?
        raise ArgumentError, "probs has no sentence-start entry ([nil, nil])"
      end

      loop do
        tokens = sample_sentence(probs)
        next if tokens.length < min_length
        next if score_tokens(tokens, probs) > max_surprise

        # Drop the last token and re-attach "." without a leading space.
        return tokens[0..-2].join(" ") + "."
      end
    end

    # Scores a sentence, depending on the likelihood that it occurs
    # within a corpus.
    #
    # sent  - either a String as returned by sentence_from_probs_hash or an
    #         Array of tokens as returned by sample_sentence.
    # probs - probability hash built by counts_to_probs.
    #
    # Returns the total surprisal in bits as a Float; Float::INFINITY when
    # the sentence contains a trigram that is absent from probs.
    def self.score_sentence(sent, probs)
      tokens =
        if sent.is_a?(String)
          # Drop the last character, then restore "." as its own token.
          sent[0..-2].split(" ").push(".")
        else
          sent.to_a
        end
      score_tokens(tokens, probs)
    end

    # Sums -log2(p) over every trigram of a token array.
    def self.score_tokens(tokens, probs)
      padded = [nil, nil] + tokens + [nil]
      total_surprise = 0.0

      padded.each_cons(3) do |prev2, prev1, cur|
        prob = (probs[[prev2, prev1]] || {})[cur]
        return Float::INFINITY if prob.nil? || prob <= 0

        total_surprise -= Math.log(prob, 2)
      end
      total_surprise
    end
    private_class_method :score_tokens
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require "markovfun/version"
|
2
|
+
|
3
|
+
module Markovfun
  # Helpers that read a text file and tokenize it.
  module Util
    # Gets lines from a file.
    #
    # filename - path of the text file to read.
    #
    # Returns an Array with one entry per line, each entry being the Array
    # of whitespace-separated words on that line (empty for a blank line).
    def self.get_lines(filename)
      # File.read closes the file even if reading raises.
      File.read(filename).split("\n").map { |line| line.strip.split(" ") }
    end

    # Gets sentences from a file.
    #
    # Newlines are turned into spaces, double quotes are removed, and the
    # text is split on ".". Each sentence becomes an Array of words with a
    # final "." token. Segments without any words are skipped, and only
    # sentences whose first word equals its own String#capitalize form
    # are kept.
    #
    # filename - path of the text file to read.
    #
    # Returns an Array of sentences (always an Array, possibly empty).
    def self.get_sentences(filename)
      text = File.read(filename).tr("\n", " ").delete('"')

      word_lists = text.split(".").map { |segment| segment.strip.split(" ") }
      # A blank segment (e.g. whitespace after the last period) would
      # otherwise turn into a sentence consisting only of ".".
      sentences = word_lists.reject(&:empty?).map { |words| words.push(".") }

      # Non-destructive select: select! returns nil when nothing is removed.
      sentences.select { |s| s[0].capitalize == s[0] }
    end
  end
end
|
data/lib/markovfun/version.rb
CHANGED
data/lib/markovfun.rb
CHANGED
@@ -1,123 +1,2 @@
|
|
1
|
-
require
|
2
|
-
require '
|
3
|
-
|
4
|
-
module Markovfun
|
5
|
-
|
6
|
-
# Generates a sentence, given a file.
|
7
|
-
def self.sentence_from_file(filename)
|
8
|
-
sentences = get_sentences(filename)
|
9
|
-
counts = buildcounts(sentences)
|
10
|
-
probs = counts_to_probs(counts)
|
11
|
-
sentence_from_probs_hash(probs)
|
12
|
-
end
|
13
|
-
|
14
|
-
# Gets lines from a file.
|
15
|
-
def self.get_lines(filename)
|
16
|
-
file = File.open(filename, "r")
|
17
|
-
data = file.read
|
18
|
-
file.close
|
19
|
-
lines = data.split("\n")
|
20
|
-
lines.map! { |l| l.strip.split(" ") }
|
21
|
-
end
|
22
|
-
|
23
|
-
# Gets sentences from a file.
|
24
|
-
def self.get_sentences(filename)
|
25
|
-
file = File.open(filename, "r")
|
26
|
-
data = file.read
|
27
|
-
file.close
|
28
|
-
data.gsub!(/\n/, "")
|
29
|
-
data.gsub!(/"/,"")
|
30
|
-
sentences = data.split(".")
|
31
|
-
sentences.map! { |s| s.strip.split(" ").push(".") }
|
32
|
-
sentences.select! { |s| s[0].capitalize == s[0] }
|
33
|
-
end
|
34
|
-
|
35
|
-
# Returns a counts hash, given a list of sentences.
|
36
|
-
# The keys to the hash are all observed combinations of [prev2, prev1],
|
37
|
-
# where prev2 and prev1 are the two previous words.
|
38
|
-
# The values are hashes, in which the keys are words (cur) that have followed
|
39
|
-
# prev2 and prev1, and the values are the number of occurrences.
|
40
|
-
def self.get_counts(sentences)
|
41
|
-
counts_hash = {}
|
42
|
-
sentences.each do |sent|
|
43
|
-
# nil denotes the beginnings and ends of sentences
|
44
|
-
sent = [nil, nil] + sent + [nil]
|
45
|
-
sent.zip(sent[1..-1], sent[2..-1]).each do |prev2, prev1, cur|
|
46
|
-
counts_hash[[prev2, prev1]] ||= {}
|
47
|
-
if !(counts_hash[[prev2, prev1]][cur])
|
48
|
-
counts_hash[[prev2, prev1]][cur] = 1
|
49
|
-
else
|
50
|
-
counts_hash[[prev2, prev1]][cur] += 1
|
51
|
-
end
|
52
|
-
end
|
53
|
-
end
|
54
|
-
counts_hash
|
55
|
-
end
|
56
|
-
|
57
|
-
# Generates a probability hash, given a counts hash.
|
58
|
-
# Similar to counts_hash, except containing the probability that a word
|
59
|
-
# follows two preceding words (as opposed to number of occurrences).
|
60
|
-
def self.counts_to_probs(counts_hash)
|
61
|
-
probs_hash = {}
|
62
|
-
counts_hash.each do |prev, cur_freq|
|
63
|
-
probs_hash[prev] ||= {}
|
64
|
-
cur_freq.each do |cur, freq|
|
65
|
-
prob = freq.to_f / cur_freq.values.reduce(:+)
|
66
|
-
probs_hash[prev][cur] = prob
|
67
|
-
end
|
68
|
-
end
|
69
|
-
probs_hash
|
70
|
-
end
|
71
|
-
|
72
|
-
# Generates a sample word, given a probability hash.
|
73
|
-
def self.sample_word(probs_hash)
|
74
|
-
score = rand
|
75
|
-
probs_hash.each do |word, prob|
|
76
|
-
return word if score < prob
|
77
|
-
score -= prob
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
|
-
# Generates a sample sentence, given a probability hash.
|
82
|
-
def self.sample_sentence(probs_hash)
|
83
|
-
prev2 = nil
|
84
|
-
prev1 = nil
|
85
|
-
out = []
|
86
|
-
|
87
|
-
while true
|
88
|
-
cur = sample_word(probs_hash[[prev2, prev1]])
|
89
|
-
if cur.nil?
|
90
|
-
return out
|
91
|
-
else
|
92
|
-
out << cur
|
93
|
-
prev2 = prev1
|
94
|
-
prev1 = cur
|
95
|
-
end
|
96
|
-
end
|
97
|
-
end
|
98
|
-
|
99
|
-
# Scores a sentence, depending on the likelihood that it occurs
|
100
|
-
# within a corpus.
|
101
|
-
def self.score_sentence(sent, probs)
|
102
|
-
total_surprise = 0
|
103
|
-
sent = [nil, nil] + sent + [nil]
|
104
|
-
|
105
|
-
sent.zip(sent[1..-1], sent[2..-1]).each do |prev2, prev1, cur|
|
106
|
-
total_surprise += -Math.log(probs[[prev2, prev1]][cur], 2)
|
107
|
-
end
|
108
|
-
total_surprise
|
109
|
-
end
|
110
|
-
|
111
|
-
# Generates a sentence from a probability hash.
|
112
|
-
def self.sentence_from_probs_hash(probs)
|
113
|
-
sent = []
|
114
|
-
while score_sentence(sent, probs) > 30 || sent.length < 4
|
115
|
-
sent = sample_sentence(probs)
|
116
|
-
end
|
117
|
-
puts "score: #{score_sentence(sent, probs)}"
|
118
|
-
sent = sent[0..-2].join(" ") + "."
|
119
|
-
puts sent
|
120
|
-
sent
|
121
|
-
end
|
122
|
-
|
123
|
-
end
|
1
|
+
require 'markovfun/trigram'
|
2
|
+
require 'markovfun/util'
|
data/markovfun.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: markovfun
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -56,6 +56,8 @@ files:
|
|
56
56
|
- README.md
|
57
57
|
- Rakefile
|
58
58
|
- lib/markovfun.rb
|
59
|
+
- lib/markovfun/trigram.rb
|
60
|
+
- lib/markovfun/util.rb
|
59
61
|
- lib/markovfun/version.rb
|
60
62
|
- markovfun.gemspec
|
61
63
|
homepage: https://github.com/mariapacana/markovfun
|