markovfun 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -8,6 +8,7 @@ InstalledFiles
8
8
  _yardoc
9
9
  coverage
10
10
  doc/
11
+ texts/
11
12
  lib/bundler/man
12
13
  pkg
13
14
  rdoc
data/Gemfile CHANGED
@@ -2,3 +2,4 @@ source 'https://rubygems.org'
2
2
 
3
3
  # Specify your gem's dependencies in markovfun.gemspec
4
4
  gemspec
5
+ pry
data/README.md CHANGED
@@ -2,11 +2,7 @@
2
2
 
3
3
  This gem generates sentences from textfiles using trigrams.
4
4
  It is based on Alex Rudnick's Python Markov chain generator,
5
- <<<<<<< HEAD
6
5
  the code for which is [here](https://github.com/alexrudnick/hackerschool-demos/tree/master/ngrams).
7
- =======
8
- the code for which is (here)[https://github.com/alexrudnick/hackerschool-demos/tree/master/ngrams].
9
- >>>>>>> f8c8a08... Updated README
10
6
 
11
7
  ## Installation
12
8
 
@@ -24,11 +20,42 @@ Or install it yourself as:
24
20
 
25
21
  ## Usage
26
22
 
23
+ ### File Processing
24
+
25
+ Get sentences from a text file:
26
+
27
+ `sentences = Markovfun::Util.get_sentences("bible.txt")`
28
+
29
+ ### Trigrams
30
+
31
+ Create a hash storing counts of the words that follow each pair of preceding words.
32
+
33
+ `counts = Markovfun::Trigram.get_counts(sentences)`
34
+
35
+ Convert hash of counts to hash of probabilities.
36
+
37
+ `probs = Markovfun::Trigram.counts_to_probs(counts)`
38
+
39
+ Generate a sentence with a specified min length (in this case, 4) from the probability hash.
40
+
41
+ `sentence = Markovfun::Trigram.sentence_from_probs_hash(probs, 4)`
42
+
43
+ Score the sentence by "surprisal value" given a probability hash.
44
+
45
+ `Markovfun::Trigram.score_sentence(sentence, probs)`
46
+
47
+ ### Sample Program
48
+
27
49
  Here's how you can generate a sentence from a text file.
28
50
 
29
51
  ```
30
- sentences = Markovfun.get_sentences("bible.txt")
31
- counts = Markovfun.get_counts(sentences)
32
- probs = Markovfun.counts_to_probs(counts)
33
- Markovfun.sentence_from_probs_hash(probs)
52
+ sentences = Markovfun::Util.get_sentences("bible.txt")
53
+ counts = Markovfun::Trigram.get_counts(sentences)
54
+ probs = Markovfun::Trigram.counts_to_probs(counts)
55
+ Markovfun::Trigram.sentence_from_probs_hash(probs, 4)
34
56
  ```
57
+
58
+ ### Sample Sentence!
59
+
60
+ From "The Beautiful and the Damned":
61
+ "I liked him tremendously--ah, she had enjoyed a rather romantic figure, a scholar, a recluse, a tower of erudition."
@@ -0,0 +1,104 @@
1
+ require "markovfun/version"
2
+ require "markovfun/util"
3
+
4
+ module Markovfun
5
+ module Trigram
6
+ include Markovfun::Util
7
+
8
+ # Generates a sentence, given a file.
9
+ def self.sentence_from_file(filename, min_length)
10
+ sentences = get_sentences(filename)
11
+ counts = get_counts(sentences)
12
+ probs = counts_to_probs(counts)
13
+ sentence_from_probs_hash(probs, min_length)
14
+ end
15
+
16
+ # Returns a counts hash, given a list of sentences.
17
+ # The keys to the hash are all observed combinations of [prev2, prev1],
18
+ # where prev2 and prev1 are the two previous words.
19
+ # The values are hashes, in which the keys are words (cur) that have followed
20
+ # prev2 and prev1, and the values are the number of occurrences.
21
+ def self.get_counts(sentences)
22
+ counts_hash = {}
23
+ sentences.each do |sent|
24
+ # nil denotes the beginnings and ends of sentences
25
+ sent = [nil, nil] + sent + [nil]
26
+ sent.zip(sent[1..-1], sent[2..-1]).each do |prev2, prev1, cur|
27
+ counts_hash[[prev2, prev1]] ||= {}
28
+ if !(counts_hash[[prev2, prev1]][cur])
29
+ counts_hash[[prev2, prev1]][cur] = 1
30
+ else
31
+ counts_hash[[prev2, prev1]][cur] += 1
32
+ end
33
+ end
34
+ end
35
+ counts_hash
36
+ end
37
+
38
+ # Generates a probability hash, given a counts hash.
39
+ # Similar to counts_hash, except containing the probability that a word
40
+ # follows two preceding words (as opposed to number of occurrences).
41
+ def self.counts_to_probs(counts_hash)
42
+ probs_hash = {}
43
+ counts_hash.each do |prev, cur_freq|
44
+ probs_hash[prev] ||= {}
45
+ cur_freq.each do |cur, freq|
46
+ prob = freq.to_f / cur_freq.values.reduce(:+)
47
+ probs_hash[prev][cur] = prob
48
+ end
49
+ end
50
+ probs_hash
51
+ end
52
+
53
+ # Generates a sample word, given a probability hash.
54
+ def self.sample_word(probs_hash)
55
+ score = rand
56
+ probs_hash.each do |word, prob|
57
+ return word if score < prob
58
+ score -= prob
59
+ end
60
+ end
61
+
62
+ # Generates a sample sentence, given a probability hash.
63
+ def self.sample_sentence(probs_hash)
64
+ prev2 = nil
65
+ prev1 = nil
66
+ out = []
67
+
68
+ while true
69
+ cur = sample_word(probs_hash[[prev2, prev1]])
70
+ if cur.nil?
71
+ return out
72
+ else
73
+ out << cur
74
+ prev2 = prev1
75
+ prev1 = cur
76
+ end
77
+ end
78
+ end
79
+
80
+ # Generates a sentence from a probability hash.
81
+ def self.sentence_from_probs_hash(probs, min_length)
82
+ sent = []
83
+ while score_sentence(sent, probs) > 30 || sent.length < min_length
84
+ sent = sample_sentence(probs)
85
+ end
86
+ sent = sent[0..-2].join(" ") + "."
87
+ sent
88
+ end
89
+
90
+ # Scores a sentence, depending on the likelihood that it occurs
91
+ # within a corpus.
92
+ def self.score_sentence(sent, probs)
93
+ total_surprise = 0
94
+
95
+ sent = sent[0..-2].split(" ").push(".")
96
+ sent = [nil, nil] + sent + [nil]
97
+
98
+ sent.zip(sent[1..-1], sent[2..-1]).each do |prev2, prev1, cur|
99
+ total_surprise += -Math.log(probs[[prev2, prev1]][cur], 2)
100
+ end
101
+ total_surprise
102
+ end
103
+ end
104
+ end
@@ -0,0 +1,27 @@
1
+ require "markovfun/version"
2
+
3
+ module Markovfun
4
+ module Util
5
+
6
+ # Gets lines from a file.
7
+ def self.get_lines(filename)
8
+ file = File.open(filename, "r")
9
+ data = file.read
10
+ file.close
11
+ lines = data.split("\n")
12
+ lines.map! { |l| l.strip.split(" ") }
13
+ end
14
+
15
+ # Gets sentences from a file.
16
+ def self.get_sentences(filename)
17
+ file = File.open(filename, "r")
18
+ data = file.read
19
+ file.close
20
+ data.gsub!(/\n/, " ")
21
+ data.gsub!(/"/,"")
22
+ sentences = data.split(".")
23
+ sentences.map! { |s| s.strip.split(" ").push(".") }
24
+ sentences.select! { |s| s[0].capitalize == s[0] }
25
+ end
26
+ end
27
+ end
@@ -1,3 +1,3 @@
1
1
  module Markovfun
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
data/lib/markovfun.rb CHANGED
@@ -1,123 +1,2 @@
1
- require "markovfun/version"
2
- require 'pry'
3
-
4
- module Markovfun
5
-
6
- # Generates a sentence, given a file.
7
- def self.sentence_from_file(filename)
8
- sentences = get_sentences(filename)
9
- counts = buildcounts(sentences)
10
- probs = counts_to_probs(counts)
11
- sentence_from_probs_hash(probs)
12
- end
13
-
14
- # Gets lines from a file.
15
- def self.get_lines(filename)
16
- file = File.open(filename, "r")
17
- data = file.read
18
- file.close
19
- lines = data.split("\n")
20
- lines.map! { |l| l.strip.split(" ") }
21
- end
22
-
23
- # Gets sentences from a file.
24
- def self.get_sentences(filename)
25
- file = File.open(filename, "r")
26
- data = file.read
27
- file.close
28
- data.gsub!(/\n/, "")
29
- data.gsub!(/"/,"")
30
- sentences = data.split(".")
31
- sentences.map! { |s| s.strip.split(" ").push(".") }
32
- sentences.select! { |s| s[0].capitalize == s[0] }
33
- end
34
-
35
- # Returns a counts hash, given a list of sentences.
36
- # The keys to the hash are all observed combinations of [prev2, prev1],
37
- # where prev2 and prev1 are the two previous words.
38
- # The values are hashes, in which the keys are words (cur) that have followed
39
- # prev2 and prev1, and the values are the number of occurrences.
40
- def self.get_counts(sentences)
41
- counts_hash = {}
42
- sentences.each do |sent|
43
- # nil denotes the beginnings and ends of sentences
44
- sent = [nil, nil] + sent + [nil]
45
- sent.zip(sent[1..-1], sent[2..-1]).each do |prev2, prev1, cur|
46
- counts_hash[[prev2, prev1]] ||= {}
47
- if !(counts_hash[[prev2, prev1]][cur])
48
- counts_hash[[prev2, prev1]][cur] = 1
49
- else
50
- counts_hash[[prev2, prev1]][cur] += 1
51
- end
52
- end
53
- end
54
- counts_hash
55
- end
56
-
57
- # Generates a probability hash, given a counts hash.
58
- # Similar to counts_hash, except containing the probability that a word
59
- # follows two preceding words (as opposed to number of occurrences).
60
- def self.counts_to_probs(counts_hash)
61
- probs_hash = {}
62
- counts_hash.each do |prev, cur_freq|
63
- probs_hash[prev] ||= {}
64
- cur_freq.each do |cur, freq|
65
- prob = freq.to_f / cur_freq.values.reduce(:+)
66
- probs_hash[prev][cur] = prob
67
- end
68
- end
69
- probs_hash
70
- end
71
-
72
- # Generates a sample word, given a probability hash.
73
- def self.sample_word(probs_hash)
74
- score = rand
75
- probs_hash.each do |word, prob|
76
- return word if score < prob
77
- score -= prob
78
- end
79
- end
80
-
81
- # Generates a sample sentence, given a probability hash.
82
- def self.sample_sentence(probs_hash)
83
- prev2 = nil
84
- prev1 = nil
85
- out = []
86
-
87
- while true
88
- cur = sample_word(probs_hash[[prev2, prev1]])
89
- if cur.nil?
90
- return out
91
- else
92
- out << cur
93
- prev2 = prev1
94
- prev1 = cur
95
- end
96
- end
97
- end
98
-
99
- # Scores a sentence, depending on the likelihood that it occurs
100
- # within a corpus.
101
- def self.score_sentence(sent, probs)
102
- total_surprise = 0
103
- sent = [nil, nil] + sent + [nil]
104
-
105
- sent.zip(sent[1..-1], sent[2..-1]).each do |prev2, prev1, cur|
106
- total_surprise += -Math.log(probs[[prev2, prev1]][cur], 2)
107
- end
108
- total_surprise
109
- end
110
-
111
- # Generates a sentence from a probability hash.
112
- def self.sentence_from_probs_hash(probs)
113
- sent = []
114
- while score_sentence(sent, probs) > 30 || sent.length < 4
115
- sent = sample_sentence(probs)
116
- end
117
- puts "score: #{score_sentence(sent, probs)}"
118
- sent = sent[0..-2].join(" ") + "."
119
- puts sent
120
- sent
121
- end
122
-
123
- end
1
+ require 'markovfun/trigram'
2
+ require 'markovfun/util'
data/markovfun.gemspec CHANGED
@@ -2,6 +2,7 @@
2
2
  lib = File.expand_path('../lib', __FILE__)
3
3
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
4
  require 'markovfun/version'
5
+ require 'pry'
5
6
 
6
7
  Gem::Specification.new do |spec|
7
8
  spec.name = "markovfun"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: markovfun
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -56,6 +56,8 @@ files:
56
56
  - README.md
57
57
  - Rakefile
58
58
  - lib/markovfun.rb
59
+ - lib/markovfun/trigram.rb
60
+ - lib/markovfun/util.rb
59
61
  - lib/markovfun/version.rb
60
62
  - markovfun.gemspec
61
63
  homepage: https://github.com/mariapacana/markovfun