markovfun 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -8,6 +8,7 @@ InstalledFiles
8
8
  _yardoc
9
9
  coverage
10
10
  doc/
11
+ texts/
11
12
  lib/bundler/man
12
13
  pkg
13
14
  rdoc
data/Gemfile CHANGED
@@ -2,3 +2,4 @@ source 'https://rubygems.org'
2
2
 
3
3
  # Specify your gem's dependencies in markovfun.gemspec
4
4
  gemspec
5
+ pry
data/README.md CHANGED
@@ -2,11 +2,7 @@
2
2
 
3
3
  This gem generates sentences from textfiles using trigrams.
4
4
  It is based on Alex Rudnick's Python Markov chain generator,
5
- <<<<<<< HEAD
6
5
  the code for which is [here](https://github.com/alexrudnick/hackerschool-demos/tree/master/ngrams).
7
- =======
8
- the code for which is (here)[https://github.com/alexrudnick/hackerschool-demos/tree/master/ngrams].
9
- >>>>>>> f8c8a08... Updated README
10
6
 
11
7
  ## Installation
12
8
 
@@ -24,11 +20,42 @@ Or install it yourself as:
24
20
 
25
21
  ## Usage
26
22
 
23
+ ### File Processing
24
+
25
+ Get sentences from a text file:
26
+
27
+ `sentences = Markovfun::Util.get_sentences("bible.txt")`
28
+
29
+ ### Trigrams
30
+
31
+ Create a hash storing counts of the words that follow each pair of preceding words.
32
+
33
+ `counts = Markovfun::Trigram.get_counts(sentences)`
34
+
35
+ Convert hash of counts to hash of probabilities.
36
+
37
+ `probs = Markovfun::Trigram.counts_to_probs(counts)`
38
+
39
+ Generate a sentence with a specified min length (in this case, 4) from the probability hash.
40
+
41
+ `sentence = Markovfun::Trigram.sentence_from_probs_hash(probs, 4)`
42
+
43
+ Score the sentence by "surprisal value" given a probability hash.
44
+
45
+ `Markovfun::Trigram.score_sentence(sentence, probs)`
46
+
47
+ ### Sample Program
48
+
27
49
  Here's how you can generate a sentence from a text file.
28
50
 
29
51
  ```
30
- sentences = Markovfun.get_sentences("bible.txt")
31
- counts = Markovfun.get_counts(sentences)
32
- probs = Markovfun.counts_to_probs(counts)
33
- Markovfun.sentence_from_probs_hash(probs)
52
+ sentences = Markovfun::Util.get_sentences("bible.txt")
53
+ counts = Markovfun::Trigram.get_counts(sentences)
54
+ probs = Markovfun::Trigram.counts_to_probs(counts)
55
+ Markovfun::Trigram.sentence_from_probs_hash(probs, 4)
34
56
  ```
57
+
58
+ ### Sample Sentence!
59
+
60
+ From "The Beautiful and Damned":
61
+ "I liked him tremendously--ah, she had enjoyed a rather romantic figure, a scholar, a recluse, a tower of erudition."
@@ -0,0 +1,104 @@
1
+ require "markovfun/version"
2
+ require "markovfun/util"
3
+
4
+ module Markovfun
5
+ module Trigram
6
+ include Markovfun::Util
7
+
8
+ # Generates a sentence, given a file.
9
+ def self.sentence_from_file(filename, min_length)
10
+ sentences = get_sentences(filename)
11
+ counts = get_counts(sentences)
12
+ probs = counts_to_probs(counts)
13
+ sentence_from_probs_hash(probs, min_length)
14
+ end
15
+
16
+ # Returns a counts hash, given a list of sentences.
17
+ # The keys to the hash are all observed combinations of [prev2, prev1],
18
+ # where prev2 and prev1 are the two previous words.
19
+ # The values are hashes, in which the keys are words (cur) that have followed
20
+ # prev2 and prev1, and the values are the number of occurrences.
21
+ def self.get_counts(sentences)
22
+ counts_hash = {}
23
+ sentences.each do |sent|
24
+ # nil denotes the beginnings and ends of sentences
25
+ sent = [nil, nil] + sent + [nil]
26
+ sent.zip(sent[1..-1], sent[2..-1]).each do |prev2, prev1, cur|
27
+ counts_hash[[prev2, prev1]] ||= {}
28
+ if !(counts_hash[[prev2, prev1]][cur])
29
+ counts_hash[[prev2, prev1]][cur] = 1
30
+ else
31
+ counts_hash[[prev2, prev1]][cur] += 1
32
+ end
33
+ end
34
+ end
35
+ counts_hash
36
+ end
37
+
38
+ # Generates a probability hash, given a counts hash.
39
+ # Similar to counts_hash, except containing the probability that a word
40
+ # follows two preceding words (as opposed to number of occurrences).
41
+ def self.counts_to_probs(counts_hash)
42
+ probs_hash = {}
43
+ counts_hash.each do |prev, cur_freq|
44
+ probs_hash[prev] ||= {}
45
+ cur_freq.each do |cur, freq|
46
+ prob = freq.to_f / cur_freq.values.reduce(:+)
47
+ probs_hash[prev][cur] = prob
48
+ end
49
+ end
50
+ probs_hash
51
+ end
52
+
53
+ # Generates a sample word, given a probability hash.
54
+ def self.sample_word(probs_hash)
55
+ score = rand
56
+ probs_hash.each do |word, prob|
57
+ return word if score < prob
58
+ score -= prob
59
+ end
60
+ end
61
+
62
+ # Generates a sample sentence, given a probability hash.
63
+ def self.sample_sentence(probs_hash)
64
+ prev2 = nil
65
+ prev1 = nil
66
+ out = []
67
+
68
+ while true
69
+ cur = sample_word(probs_hash[[prev2, prev1]])
70
+ if cur.nil?
71
+ return out
72
+ else
73
+ out << cur
74
+ prev2 = prev1
75
+ prev1 = cur
76
+ end
77
+ end
78
+ end
79
+
80
+ # Generates a sentence from a probability hash.
81
+ def self.sentence_from_probs_hash(probs, min_length)
82
+ sent = []
83
+ while score_sentence(sent, probs) > 30 || sent.length < min_length
84
+ sent = sample_sentence(probs)
85
+ end
86
+ sent = sent[0..-2].join(" ") + "."
87
+ sent
88
+ end
89
+
90
+ # Scores a sentence, depending on the likelihood that it occurs
91
+ # within a corpus.
92
+ def self.score_sentence(sent, probs)
93
+ total_surprise = 0
94
+
95
+ sent = sent[0..-2].split(" ").push(".")
96
+ sent = [nil, nil] + sent + [nil]
97
+
98
+ sent.zip(sent[1..-1], sent[2..-1]).each do |prev2, prev1, cur|
99
+ total_surprise += -Math.log(probs[[prev2, prev1]][cur], 2)
100
+ end
101
+ total_surprise
102
+ end
103
+ end
104
+ end
@@ -0,0 +1,27 @@
1
+ require "markovfun/version"
2
+
3
+ module Markovfun
4
+ module Util
5
+
6
+ # Gets lines from a file.
7
+ def self.get_lines(filename)
8
+ file = File.open(filename, "r")
9
+ data = file.read
10
+ file.close
11
+ lines = data.split("\n")
12
+ lines.map! { |l| l.strip.split(" ") }
13
+ end
14
+
15
+ # Gets sentences from a file.
16
+ def self.get_sentences(filename)
17
+ file = File.open(filename, "r")
18
+ data = file.read
19
+ file.close
20
+ data.gsub!(/\n/, " ")
21
+ data.gsub!(/"/,"")
22
+ sentences = data.split(".")
23
+ sentences.map! { |s| s.strip.split(" ").push(".") }
24
+ sentences.select! { |s| s[0].capitalize == s[0] }
25
+ end
26
+ end
27
+ end
@@ -1,3 +1,3 @@
1
1
  module Markovfun
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
data/lib/markovfun.rb CHANGED
@@ -1,123 +1,2 @@
1
- require "markovfun/version"
2
- require 'pry'
3
-
4
- module Markovfun
5
-
6
- # Generates a sentence, given a file.
7
- def self.sentence_from_file(filename)
8
- sentences = get_sentences(filename)
9
- counts = buildcounts(sentences)
10
- probs = counts_to_probs(counts)
11
- sentence_from_probs_hash(probs)
12
- end
13
-
14
- # Gets lines from a file.
15
- def self.get_lines(filename)
16
- file = File.open(filename, "r")
17
- data = file.read
18
- file.close
19
- lines = data.split("\n")
20
- lines.map! { |l| l.strip.split(" ") }
21
- end
22
-
23
- # Gets sentences from a file.
24
- def self.get_sentences(filename)
25
- file = File.open(filename, "r")
26
- data = file.read
27
- file.close
28
- data.gsub!(/\n/, "")
29
- data.gsub!(/"/,"")
30
- sentences = data.split(".")
31
- sentences.map! { |s| s.strip.split(" ").push(".") }
32
- sentences.select! { |s| s[0].capitalize == s[0] }
33
- end
34
-
35
- # Returns a counts hash, given a list of sentences.
36
- # The keys to the hash are all observed combinations of [prev2, prev1],
37
- # where prev2 and prev1 are the two previous words.
38
- # The values are hashes, in which the keys are words (cur) that have followed
39
- # prev2 and prev1, and the values are the number of occurrences.
40
- def self.get_counts(sentences)
41
- counts_hash = {}
42
- sentences.each do |sent|
43
- # nil denotes the beginnings and ends of sentences
44
- sent = [nil, nil] + sent + [nil]
45
- sent.zip(sent[1..-1], sent[2..-1]).each do |prev2, prev1, cur|
46
- counts_hash[[prev2, prev1]] ||= {}
47
- if !(counts_hash[[prev2, prev1]][cur])
48
- counts_hash[[prev2, prev1]][cur] = 1
49
- else
50
- counts_hash[[prev2, prev1]][cur] += 1
51
- end
52
- end
53
- end
54
- counts_hash
55
- end
56
-
57
- # Generates a probability hash, given a counts hash.
58
- # Similar to counts_hash, except containing the probability that a word
59
- # follows two preceding words (as opposed to number of occurrences).
60
- def self.counts_to_probs(counts_hash)
61
- probs_hash = {}
62
- counts_hash.each do |prev, cur_freq|
63
- probs_hash[prev] ||= {}
64
- cur_freq.each do |cur, freq|
65
- prob = freq.to_f / cur_freq.values.reduce(:+)
66
- probs_hash[prev][cur] = prob
67
- end
68
- end
69
- probs_hash
70
- end
71
-
72
- # Generates a sample word, given a probability hash.
73
- def self.sample_word(probs_hash)
74
- score = rand
75
- probs_hash.each do |word, prob|
76
- return word if score < prob
77
- score -= prob
78
- end
79
- end
80
-
81
- # Generates a sample sentence, given a probability hash.
82
- def self.sample_sentence(probs_hash)
83
- prev2 = nil
84
- prev1 = nil
85
- out = []
86
-
87
- while true
88
- cur = sample_word(probs_hash[[prev2, prev1]])
89
- if cur.nil?
90
- return out
91
- else
92
- out << cur
93
- prev2 = prev1
94
- prev1 = cur
95
- end
96
- end
97
- end
98
-
99
- # Scores a sentence, depending on the likelihood that it occurs
100
- # within a corpus.
101
- def self.score_sentence(sent, probs)
102
- total_surprise = 0
103
- sent = [nil, nil] + sent + [nil]
104
-
105
- sent.zip(sent[1..-1], sent[2..-1]).each do |prev2, prev1, cur|
106
- total_surprise += -Math.log(probs[[prev2, prev1]][cur], 2)
107
- end
108
- total_surprise
109
- end
110
-
111
- # Generates a sentence from a probability hash.
112
- def self.sentence_from_probs_hash(probs)
113
- sent = []
114
- while score_sentence(sent, probs) > 30 || sent.length < 4
115
- sent = sample_sentence(probs)
116
- end
117
- puts "score: #{score_sentence(sent, probs)}"
118
- sent = sent[0..-2].join(" ") + "."
119
- puts sent
120
- sent
121
- end
122
-
123
- end
1
+ require 'markovfun/trigram'
2
+ require 'markovfun/util'
data/markovfun.gemspec CHANGED
@@ -2,6 +2,7 @@
2
2
  lib = File.expand_path('../lib', __FILE__)
3
3
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
4
  require 'markovfun/version'
5
+ require 'pry'
5
6
 
6
7
  Gem::Specification.new do |spec|
7
8
  spec.name = "markovfun"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: markovfun
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -56,6 +56,8 @@ files:
56
56
  - README.md
57
57
  - Rakefile
58
58
  - lib/markovfun.rb
59
+ - lib/markovfun/trigram.rb
60
+ - lib/markovfun/util.rb
59
61
  - lib/markovfun/version.rb
60
62
  - markovfun.gemspec
61
63
  homepage: https://github.com/mariapacana/markovfun