literate_randomizer 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +19 -17
- data/lib/literate_randomizer.rb +31 -6
- data/lib/literate_randomizer/markov.rb +58 -162
- data/lib/literate_randomizer/randomizer.rb +155 -0
- data/lib/literate_randomizer/source_parser.rb +55 -0
- data/lib/literate_randomizer/util.rb +26 -0
- data/lib/literate_randomizer/version.rb +2 -1
- data/spec/literate_randomizer_spec.rb +47 -31
- metadata +6 -2
data/README.md
CHANGED
@@ -16,24 +16,22 @@ Or install it yourself as:
|
|
16
16
|
|
17
17
|
$ gem install literate_randomizer
|
18
18
|
|
19
|
-
## Usage
|
19
|
+
## Basic Usage (global instance)
|
20
20
|
|
21
|
-
|
21
|
+
The simplest way to use LiterateRandomizer is the global Randomizer instance: LiterateRandomizer.global. Any method you invoke on LiterateRandomizer gets forwarded to this instance. Examples:
|
22
22
|
|
23
23
|
require 'literate_randomizer'
|
24
24
|
|
25
|
-
|
25
|
+
LiterateRandomizer.word
|
26
|
+
# => "frivolous"
|
26
27
|
|
27
|
-
|
28
|
-
# => "
|
28
|
+
LiterateRandomizer.sentence
|
29
|
+
# => "Muscular arms round opening of sorts while Lord John Roxton."
|
29
30
|
|
30
|
-
|
31
|
-
# => "
|
31
|
+
LiterateRandomizer.paragraph
|
32
|
+
# => "Fulmination against the wandering that the woes of this. Particular package of the back to matchwood. File with hideous jaws of Southampton. Adventure and he. Skewered on to pledge."
|
32
33
|
|
33
|
-
|
34
|
-
# => "Fulmination against the wandering that the woes of this. Particular package of the back to matchwood. File with hideous jaws of Southampton. Adventure and he. Skewered on to pledge."
|
35
|
-
|
36
|
-
puts lr.paragraphs
|
34
|
+
puts LiterateRandomizer.paragraphs
|
37
35
|
|
38
36
|
The last line outputs:
|
39
37
|
|
@@ -51,7 +49,7 @@ When creating a randomizer, there are a few options. The source_material should
|
|
51
49
|
:source_material => string OR
|
52
50
|
:source_material_file => filename
|
53
51
|
:randomizer => Random.new(seed=0)
|
54
|
-
:punctuation_distribution => DEFAULT_PUNCTUATION_DISTRIBUTION -
|
52
|
+
:punctuation_distribution => DEFAULT_PUNCTUATION_DISTRIBUTION - punctuation is randomly selected from this array
|
55
53
|
|
56
54
|
**paragraph** options:
|
57
55
|
|
@@ -59,7 +57,7 @@ When creating a randomizer, there are a few options. The source_material should
|
|
59
57
|
:first_word => nil - the start word
|
60
58
|
:words => range or int - number of words in sentence
|
61
59
|
:sentences => range or int - number of sentences in paragraph
|
62
|
-
:punctuation => nil -
|
60
|
+
:punctuation => nil - punctuation to end the sentence with (nil == randomly selected from punctuation_distribution)
|
63
61
|
|
64
62
|
**paragraphs** options:
|
65
63
|
|
@@ -67,14 +65,14 @@ When creating a randomizer, there are a few options. The source_material should
|
|
67
65
|
:first_word => nil - the first word of the paragraph
|
68
66
|
:words => range or int - number of words in sentence
|
69
67
|
:sentences => range or int - number of sentences in paragraph
|
70
|
-
:punctuation => nil -
|
68
|
+
:punctuation => nil - punctuation to end the paragraph with (nil == randomly selected from punctuation_distribution)
|
71
69
|
:paragraphs => range or int - number of paragraphs in paragraph
|
72
70
|
:join => "\n\n" - join the paragraphs. if :join => false, returns an array of the paragraphs
|
73
71
|
|
74
72
|
Advanced example:
|
75
73
|
|
76
|
-
|
77
|
-
# => "A dense mob of our. Gods on that Challenger. Invariably to safety though. Weaponless but it my! Some bandy-legged lurching creature!!!"
|
74
|
+
LiterateRandomizer.paragraph :sentences => 5, :words => 3..8, :first_word => "A", :punctuation => "!!!"
|
75
|
+
# => "A dense mob of our. Gods on that Challenger. Invariably to safety though. Weaponless but it my! Some bandy-legged lurching creature!!!"
|
78
76
|
|
79
77
|
If you just want to use a single, global instance, you can initialize and access it this way:
|
80
78
|
|
@@ -84,12 +82,16 @@ If you just want to use a single, global instance, you can initialize and access
|
|
84
82
|
|
85
83
|
# after the first call, options are ignored and the existing randomizer is returned
|
86
84
|
LiterateRandomizer.global.sentence
|
87
|
-
# => "Muscular arms round opening of sorts while Lord John Roxton."
|
85
|
+
# => "Muscular arms round opening of sorts while Lord John Roxton."
|
88
86
|
|
89
87
|
# or even simpler, all methods on LiterateRandomizer are forwarded to LiterateRandomizer.global:
|
90
88
|
LiterateRandomizer.paragraph(:sentences => 3, :words => 3)
|
91
89
|
# => "Drama which would. Wrong fashion which. Throw them there."
|
92
90
|
|
91
|
+
## Inspiration
|
92
|
+
|
93
|
+
Thanks to Tim Riley for getting me started on the right track with this <a href="http://openmonkey.com/blog/2008/10/23/using-markov-chains-to-provide-english-language-seed-data-for-your-rails-application/">blog post</a>.
|
94
|
+
|
93
95
|
## Contributing
|
94
96
|
|
95
97
|
1. Fork it
|
data/lib/literate_randomizer.rb
CHANGED
@@ -1,22 +1,47 @@
|
|
1
|
-
%w{
|
2
|
-
|
3
|
-
|
1
|
+
%w{
|
2
|
+
version
|
3
|
+
util
|
4
|
+
source_parser
|
5
|
+
markov
|
6
|
+
randomizer
|
7
|
+
}.each {|file|require File.join(File.dirname(__FILE__),"literate_randomizer", file)}
|
4
8
|
|
5
9
|
module LiterateRandomizer
|
6
10
|
|
7
11
|
class << self
|
12
|
+
|
13
|
+
# Create a new Randomizer instance
|
14
|
+
#
|
15
|
+
# See LiterateRandomizer::Randomizer#initialize for options.
|
8
16
|
def create(options={})
|
9
|
-
|
17
|
+
Randomizer.new options
|
10
18
|
end
|
11
19
|
|
12
|
-
|
13
|
-
|
20
|
+
# Access or initialize the global randomizer instance.
|
21
|
+
#
|
22
|
+
# The first time this is called, the global instance is created and initialized. Subsequent calls with no parameters just return
|
23
|
+
# the global instance. If LiterateRandomizer.global is called again with options, the options are ignored and the existing instance is returned.
|
24
|
+
#
|
25
|
+
# See LiterateRandomizer::Randomizer#initialize for options.
|
26
|
+
def global(options=nil)
|
27
|
+
return @global_instance if @global_instance && !options
|
28
|
+
@global_instance ||= Randomizer.new(options||{})
|
14
29
|
end
|
15
30
|
|
31
|
+
# Forwards method invocations to the global Randomizer instance. Unless you need more than one instance of Randomizer,
|
32
|
+
# this is the easiest way to use LiterateRandomizer.
|
33
|
+
#
|
34
|
+
# Examples:
|
35
|
+
#
|
36
|
+
# * LiterateRandomizer.word
|
37
|
+
# * LiterateRandomizer.sentence
|
38
|
+
# * LiterateRandomizer.paragraph
|
39
|
+
# * LiterateRandomizer.paragraphs
|
16
40
|
def method_missing(method, *arguments, &block)
|
17
41
|
global.send(method, *arguments, &block)
|
18
42
|
end
|
19
43
|
|
44
|
+
# correctly mirrors method_missing
|
20
45
|
def respond_to?(method)
|
21
46
|
super || global.respond_to?(method)
|
22
47
|
end
|
@@ -4,211 +4,107 @@
|
|
4
4
|
# by Shane Brinkman-Davis
|
5
5
|
|
6
6
|
module LiterateRandomizer
|
7
|
-
class MarkovChain
|
8
|
-
DEFAULT_PUNCTUATION_DISTRIBUTION = %w{. . . . . . . . . . . . . . . . ? !}
|
9
|
-
PREPOSITION_REGEX = /^(had|the|to|or|and|a|in|that|it|if|of|is|was|for|on|as|an|your|our|my|per|until)$/
|
10
|
-
attr_accessor :randomizer, :init_options, :punctuation_distribution
|
11
|
-
attr_reader :markov_words, :words, :first_words
|
12
|
-
|
13
|
-
def default_source_material
|
14
|
-
File.expand_path File.join(File.dirname(__FILE__),"..","..","data","the_lost_world_by_arthur_conan_doyle.txt")
|
15
|
-
end
|
16
7
|
|
17
|
-
|
18
|
-
|
19
|
-
# :source_material_file => filename
|
20
|
-
def source_material(options=init_options)
|
21
|
-
options[:source_material] || File.read(options[:source_material_file] || default_source_material)
|
22
|
-
end
|
8
|
+
# The Markov-Chain bi-gram model. Primary purpose is, given a word, return the next word that is "likely" based on the source material.
|
9
|
+
class MarkovModel
|
23
10
|
|
11
|
+
# The source of all random values. Must implement: #rand(limit)
|
12
|
+
#
|
13
|
+
# Default: Random.new()
|
14
|
+
attr_accessor :randomizer
|
24
15
|
|
25
|
-
|
26
|
-
|
27
|
-
markov_words[word][next_word] += 1
|
28
|
-
end
|
16
|
+
# A hash (string => true) of all unique words found in the source-material.
|
17
|
+
attr_reader :words
|
29
18
|
|
30
|
-
#
|
31
|
-
|
32
|
-
word &&= word[/[A-Za-z][A-Za-z'-]*/]
|
33
|
-
word &&= word[/[A-Za-z'-]*[A-Za-z]/]
|
34
|
-
(word && word.strip) || ""
|
35
|
-
end
|
19
|
+
# A hash (string => true) of all words that appear at the beginning of sentences in the source-material.
|
20
|
+
attr_reader :first_words
|
36
21
|
|
37
|
-
|
38
|
-
|
39
|
-
|
22
|
+
# Data structure encoding all Markov-Chains (bi-grams) found in the source-material.
|
23
|
+
#
|
24
|
+
# markov_chains is a hash of hashes. The top level keys are the "first words" in the chain.
|
25
|
+
# For each first-word, there are one or more words that followed that word in the text. Second-words are the second-level hash key.
|
26
|
+
# The second-level hash values are the count of the number of times that second word followed the first.
|
27
|
+
#
|
28
|
+
# Summary: {first_words => {second_words => found-in-source-material-in-sequence-count}}
|
29
|
+
attr_reader :markov_chains
|
40
30
|
|
41
|
-
|
42
|
-
|
43
|
-
|
31
|
+
# an instance of SourceParser attached to the source_material
|
32
|
+
attr_accessor :source_parser
|
33
|
+
|
34
|
+
private
|
35
|
+
|
36
|
+
# cached copy of the options passed in on initialization
|
37
|
+
attr_accessor :init_options
|
44
38
|
|
45
|
-
|
46
|
-
|
39
|
+
# add a word/next_word pair to @markov_chains
|
40
|
+
def chain_add(word, next_word)
|
41
|
+
markov_chains[word] ||= Hash.new(0)
|
42
|
+
markov_chains[word][next_word] += 1
|
47
43
|
end
|
48
44
|
|
49
45
|
# remove all dead-end words
|
50
46
|
def prune_markov_words
|
51
|
-
@
|
52
|
-
@markov_key.delete(key) if @
|
47
|
+
@markov_chains.keys.each do |key|
|
48
|
+
@markov_key.delete(key) if @markov_chains[key].length == 0
|
53
49
|
end
|
54
50
|
end
|
55
51
|
|
56
|
-
|
57
|
-
|
52
|
+
# populate the @markov_chains hash
|
53
|
+
def populate_markov_chains
|
54
|
+
@markov_chains = {}
|
58
55
|
@words = {}
|
59
56
|
@first_words = {}
|
60
|
-
|
61
|
-
word_list
|
57
|
+
source_parser.each_sentence do |word_list|
|
58
|
+
next unless word_list.length >= 2
|
62
59
|
@first_words[word_list[0]] = true
|
63
60
|
word_list.each_with_index do |word, index|
|
64
61
|
@words[word] = true
|
65
62
|
next_word = word_list[index+1]
|
66
63
|
chain_add word, next_word if next_word
|
67
64
|
end
|
68
|
-
end
|
69
|
-
prune_markov_words
|
65
|
+
end
|
66
|
+
prune_markov_words
|
70
67
|
end
|
71
68
|
|
69
|
+
# populate the weight-sums for each chain
|
70
|
+
# (an optimization)
|
72
71
|
def populate_markov_sum
|
73
72
|
@markov_weighted_sum = {}
|
74
|
-
@
|
73
|
+
@markov_chains.each do |word,followers|
|
75
74
|
@markov_weighted_sum[word] = followers.inject(0) {|sum,kv| sum + kv[1]}
|
76
75
|
end
|
77
76
|
end
|
78
77
|
|
78
|
+
# Populate internal data-structures in preparation for #next_word
|
79
79
|
def populate
|
80
|
-
|
80
|
+
populate_markov_chains
|
81
81
|
populate_markov_sum
|
82
82
|
end
|
83
83
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
rand(r.max-r.min)+r.min
|
92
|
-
end
|
93
|
-
|
94
|
-
# options:
|
95
|
-
# :source_material => string OR
|
96
|
-
# :source_material_file => filename
|
97
|
-
# :randomizer - responds to .rand(limit) - this primarilly exists for testing
|
98
|
-
# :punctuation_distribution => DEFAULT_PUNCTUATION_DISTRIBUTION - punctiation is randomly selected from this array
|
84
|
+
public
|
85
|
+
# Initialize a new instance.
|
86
|
+
#
|
87
|
+
# Options:
|
88
|
+
#
|
89
|
+
# * :randomizer => Random.new # must respond to #rand(limit)
|
90
|
+
# * :source_parser => SourceParser.new options
|
99
91
|
def initialize(options={})
|
100
|
-
@
|
101
|
-
@
|
102
|
-
@punctuation_distribution = options[:punctuation_distribution] || DEFAULT_PUNCTUATION_DISTRIBUTION
|
92
|
+
@randomizer = randomizer || Random.new
|
93
|
+
@source_parser = options[:source_parser] || SourceParser.new(options)
|
103
94
|
|
104
95
|
populate
|
105
96
|
end
|
106
97
|
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
def next_word(word)
|
112
|
-
return if !markov_words[word]
|
98
|
+
# Given a word, return a weighted-randomly selected next-one.
|
99
|
+
def next_word(word,randomizer=@randomizer)
|
100
|
+
return if !markov_chains[word]
|
113
101
|
sum = @markov_weighted_sum[word]
|
114
|
-
random = rand(sum)+1
|
102
|
+
random = randomizer.rand(sum)+1
|
115
103
|
partial_sum = 0
|
116
|
-
(
|
104
|
+
(markov_chains[word].find do |w, count|
|
117
105
|
partial_sum += count
|
118
106
|
w!=word && partial_sum >= random
|
119
107
|
end||[]).first
|
120
108
|
end
|
121
|
-
|
122
|
-
def rand(limit=nil)
|
123
|
-
@randomizer.rand(limit)
|
124
|
-
end
|
125
|
-
|
126
|
-
# return a random word
|
127
|
-
def word
|
128
|
-
@cached_word_keys ||= words.keys
|
129
|
-
@cached_word_keys[rand(@cached_word_keys.length)]
|
130
|
-
end
|
131
|
-
|
132
|
-
# return a random first word of a sentence
|
133
|
-
def first_word
|
134
|
-
@cached_first_word_keys ||= first_words.keys
|
135
|
-
@cached_first_word_keys[rand(@cached_first_word_keys.length)]
|
136
|
-
end
|
137
|
-
|
138
|
-
# return a random first word of a sentence
|
139
|
-
def markov_word
|
140
|
-
@cached_markov_word_keys ||= markov_words.keys
|
141
|
-
@cached_markov_word_keys[rand(@cached_markov_word_keys.length)]
|
142
|
-
end
|
143
|
-
|
144
|
-
def punctuation
|
145
|
-
@punctuation_distribution[rand(@punctuation_distribution.length)]
|
146
|
-
end
|
147
|
-
|
148
|
-
def extend_trailing_preposition(max_words,words)
|
149
|
-
while words.length < max_words && words[-1] && words[-1][PREPOSITION_REGEX]
|
150
|
-
words << next_word(words[-1])
|
151
|
-
end
|
152
|
-
words
|
153
|
-
end
|
154
|
-
|
155
|
-
# return a random sentence
|
156
|
-
# options:
|
157
|
-
# * :first_word => nil - the start word
|
158
|
-
# * :words => range or int - number of words in sentence
|
159
|
-
# * :punctuation => nil - punction to end the sentence with (nil == randomly selected from punctuation_distribution)
|
160
|
-
def sentence(options={})
|
161
|
-
word = options[:first_word] || self.markov_word
|
162
|
-
num_words_option = options[:words] || (3..15)
|
163
|
-
count = rand_count num_words_option
|
164
|
-
punctuation = options[:punctuation] || self.punctuation
|
165
|
-
|
166
|
-
words = count.times.collect do
|
167
|
-
word.tap {word = next_word(word)}
|
168
|
-
end.compact
|
169
|
-
|
170
|
-
words = extend_trailing_preposition(max(num_words_option), words)
|
171
|
-
|
172
|
-
capitalize words.compact.join(" ") + punctuation
|
173
|
-
end
|
174
|
-
|
175
|
-
# return a random paragraph
|
176
|
-
# options:
|
177
|
-
# * :first_word => nil - the first word of the paragraph
|
178
|
-
# * :words => range or int - number of words in sentence
|
179
|
-
# * :sentences => range or int - number of sentences in paragraph
|
180
|
-
# * :punctuation => nil - punction to end the paragraph with (nil == randomly selected from punctuation_distribution)
|
181
|
-
def paragraph(options={})
|
182
|
-
count = rand_count options[:sentences] || (5..15)
|
183
|
-
|
184
|
-
count.times.collect do |i|
|
185
|
-
op = options.clone
|
186
|
-
op.delete :punctuation unless i==count-1
|
187
|
-
op.delete :first_word unless i==0
|
188
|
-
sentence op
|
189
|
-
end.join(" ")
|
190
|
-
end
|
191
|
-
|
192
|
-
# return random paragraphs
|
193
|
-
# options:
|
194
|
-
# * :first_word => nil - the first word of the paragraph
|
195
|
-
# * :words => range or int - number of words in sentence
|
196
|
-
# * :sentences => range or int - number of sentences in paragraph
|
197
|
-
# * :paragraphs => range or int - number of paragraphs in paragraph
|
198
|
-
# * :join => "\n\n" - join the paragraphs. if :join => false, returns an array of the paragraphs
|
199
|
-
# * :punctuation => nil - punction to end the paragraph with (nil == randomly selected from punctuation_distribution)
|
200
|
-
def paragraphs(options={})
|
201
|
-
count = rand_count options[:paragraphs] || (3..5)
|
202
|
-
join_str = options[:join]
|
203
|
-
|
204
|
-
res = count.times.collect do |i|
|
205
|
-
op = options.clone
|
206
|
-
op.delete :punctuation unless i==count-1
|
207
|
-
op.delete :first_word unless i==0
|
208
|
-
paragraph op
|
209
|
-
end
|
210
|
-
|
211
|
-
join_str!=false ? res.join(join_str || "\n\n") : res
|
212
|
-
end
|
213
109
|
end
|
214
|
-
end
|
110
|
+
end
|
@@ -0,0 +1,155 @@
|
|
1
|
+
module LiterateRandomizer
|
2
|
+
|
3
|
+
# The main class. Each instance has its own random number generator and can work against its own training source-material.
|
4
|
+
class Randomizer
|
5
|
+
# The default punctuation distribution. Punctuation is pulled randomly from this array. It can contain any string.
|
6
|
+
DEFAULT_PUNCTUATION_DISTRIBUTION = %w{. . . . . . . . . . . . . . . . ? !}
|
7
|
+
|
8
|
+
# LiterateRandomizer prefers to not end sentences with words that match the following regexp:
|
9
|
+
PREPOSITION_REGEX = /^(had|the|to|or|and|a|in|that|it|if|of|is|was|for|on|as|an|your|our|my|per|until)$/
|
10
|
+
|
11
|
+
|
12
|
+
# The source of all random values. Must implement: #rand(limit)
|
13
|
+
#
|
14
|
+
# Default: Random.new()
|
15
|
+
attr_accessor :randomizer
|
16
|
+
|
17
|
+
# To end sentences, one of the strings in this array is selected at random (uniform-distribution)
|
18
|
+
#
|
19
|
+
# Default: DEFAULT_PUNCTUATION_DISTRIBUTION
|
20
|
+
attr_accessor :punctuation_distribution
|
21
|
+
|
22
|
+
# an instance of SourceParser attached to the source_material
|
23
|
+
attr_reader :source_parser
|
24
|
+
|
25
|
+
# The random-generator model
|
26
|
+
attr_reader :model
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
# Check to see if the sentence ends in a PREPOSITION_REGEX word.
|
31
|
+
# If so, add more words, up to max-words, until it no longer does.
|
32
|
+
def extend_trailing_preposition(max_words,words)
|
33
|
+
while words.length < max_words && words[-1] && words[-1][PREPOSITION_REGEX]
|
34
|
+
words << model.next_word(words[-1],randomizer)
|
35
|
+
end
|
36
|
+
words
|
37
|
+
end
|
38
|
+
|
39
|
+
public
|
40
|
+
# Initialize a new instance. Each Markov randomizer instance can run against its own source_material.
|
41
|
+
#
|
42
|
+
# Options:
|
43
|
+
#
|
44
|
+
# * :source_material => string OR
|
45
|
+
# * :source_material_file => filename
|
46
|
+
# * :punctuation_distribution => DEFAULT_PUNCTUATION_DISTRIBUTION
|
47
|
+
# punctuation is randomly selected from this array
|
48
|
+
#
|
49
|
+
# Advanced options: (primarily for testing)
|
50
|
+
#
|
51
|
+
# * :randomizer => Random.new # must respond to #rand(limit)
|
52
|
+
# * :source_parser => SourceParser.new options
|
53
|
+
# * :model => MarkovModel.new :source_parser => source_parser
|
54
|
+
def initialize(options={})
|
55
|
+
@init_options = options
|
56
|
+
@randomizer = randomizer || Random.new
|
57
|
+
@punctuation_distribution = options[:punctuation_distribution] || DEFAULT_PUNCTUATION_DISTRIBUTION
|
58
|
+
@source_parser = options[:source_parser] || SourceParser.new(options)
|
59
|
+
@model = options[:model] || MarkovModel.new(:source_parser => source_parser)
|
60
|
+
end
|
61
|
+
|
62
|
+
# Returns a quick summary of the instance.
|
63
|
+
def inspect
|
64
|
+
"#<#{self.class}: #{model.words.length} words, #{model.markov_chains.length} word-chains, #{model.first_words.length} first_words>"
|
65
|
+
end
|
66
|
+
|
67
|
+
# return a random word
|
68
|
+
def word
|
69
|
+
@cached_word_keys ||= model.words.keys
|
70
|
+
@cached_word_keys[rand(@cached_word_keys.length)]
|
71
|
+
end
|
72
|
+
|
73
|
+
# return a random first word of a sentence
|
74
|
+
def first_word
|
75
|
+
@cached_first_word_keys ||= model.first_words.keys
|
76
|
+
@cached_first_word_keys[rand(@cached_first_word_keys.length)]
|
77
|
+
end
|
78
|
+
|
79
|
+
# return a random number generated by randomizer
|
80
|
+
def rand(limit=nil)
|
81
|
+
@randomizer.rand(limit)
|
82
|
+
end
|
83
|
+
|
84
|
+
# return a random end-sentence string from punctuation_distribution
|
85
|
+
def punctuation
|
86
|
+
@punctuation_distribution[rand(@punctuation_distribution.length)]
|
87
|
+
end
|
88
|
+
|
89
|
+
# return a random sentence
|
90
|
+
#
|
91
|
+
# Options:
|
92
|
+
#
|
93
|
+
# * :first_word => nil - the start word
|
94
|
+
# * :words => range or int - number of words in sentence
|
95
|
+
# * :punctuation => nil - punctuation to end the sentence with (nil == randomly selected from punctuation_distribution)
|
96
|
+
def sentence(options={})
|
97
|
+
word = options[:first_word] || self.first_word
|
98
|
+
num_words_option = options[:words] || (3..15)
|
99
|
+
count = Util.rand_count(num_words_option,randomizer)
|
100
|
+
punctuation = options[:punctuation] || self.punctuation
|
101
|
+
|
102
|
+
words = count.times.collect do
|
103
|
+
word.tap {word = model.next_word(word,randomizer)}
|
104
|
+
end.compact
|
105
|
+
|
106
|
+
words = extend_trailing_preposition(Util.max(num_words_option), words)
|
107
|
+
|
108
|
+
Util.capitalize words.compact.join(" ") + punctuation
|
109
|
+
end
|
110
|
+
|
111
|
+
# return a random paragraph
|
112
|
+
#
|
113
|
+
# Options:
|
114
|
+
#
|
115
|
+
# * :first_word => nil - the first word of the paragraph
|
116
|
+
# * :words => range or int - number of words in sentence
|
117
|
+
# * :sentences => range or int - number of sentences in paragraph
|
118
|
+
# * :punctuation => nil - punctuation to end the paragraph with (nil == randomly selected from punctuation_distribution)
|
119
|
+
def paragraph(options={})
|
120
|
+
count = Util.rand_count(options[:sentences] || (5..15),randomizer)
|
121
|
+
|
122
|
+
count.times.collect do |i|
|
123
|
+
op = options.clone
|
124
|
+
op.delete :punctuation unless i==count-1
|
125
|
+
op.delete :first_word unless i==0
|
126
|
+
sentence op
|
127
|
+
end.join(" ")
|
128
|
+
end
|
129
|
+
|
130
|
+
# return random paragraphs
|
131
|
+
#
|
132
|
+
# Options:
|
133
|
+
#
|
134
|
+
# * :first_word => nil - the first word of the paragraph
|
135
|
+
# * :words => range or int - number of words in sentence
|
136
|
+
# * :sentences => range or int - number of sentences in paragraph
|
137
|
+
# * :paragraphs => range or int - number of paragraphs to generate
|
138
|
+
# * :join => "\n\n" - join the paragraphs. if :join => false, returns an array of the paragraphs
|
139
|
+
# * :punctuation => nil - punctuation to end the paragraph with (nil == randomly selected from punctuation_distribution)
|
140
|
+
def paragraphs(options={})
|
141
|
+
count = Util.rand_count(options[:paragraphs] || (3..5),randomizer)
|
142
|
+
join_str = options[:join]
|
143
|
+
|
144
|
+
res = count.times.collect do |i|
|
145
|
+
op = options.clone
|
146
|
+
op.delete :punctuation unless i==count-1
|
147
|
+
op.delete :first_word unless i==0
|
148
|
+
paragraph op
|
149
|
+
end
|
150
|
+
|
151
|
+
join_str!=false ? res.join(join_str || "\n\n") : res
|
152
|
+
end
|
153
|
+
|
154
|
+
end
|
155
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module LiterateRandomizer
|
2
|
+
|
3
|
+
# Parse the source material and provide "each_sentence" - an easy way to walk the source material.
|
4
|
+
class SourceParser
|
5
|
+
private
|
6
|
+
attr_reader :init_options
|
7
|
+
|
8
|
+
public
|
9
|
+
# Options:
|
10
|
+
#
|
11
|
+
# * :source_material => string OR
|
12
|
+
# * :source_material_file => filename
|
13
|
+
def initialize(options)
|
14
|
+
@init_options = options
|
15
|
+
end
|
16
|
+
|
17
|
+
# path to the default source material file included with the gem
|
18
|
+
def default_source_material
|
19
|
+
File.expand_path File.join(File.dirname(__FILE__),"..","..","data","the_lost_world_by_arthur_conan_doyle.txt")
|
20
|
+
end
|
21
|
+
|
22
|
+
# Options:
|
23
|
+
#
|
24
|
+
# :source_material => string
|
25
|
+
# :source_material_file => filename
|
26
|
+
def source_material(options=init_options)
|
27
|
+
options[:source_material] || File.read(options[:source_material_file] || default_source_material)
|
28
|
+
end
|
29
|
+
|
30
|
+
# Read the source material and split it into sentences
|
31
|
+
# NOTE: this re-reads the source material each time. Usually this only needs to happen once and it would waste memory to keep it around.
|
32
|
+
def source_sentences
|
33
|
+
source_material.split(/([.?!"]($|\s)|\n\s*\n)+/)
|
34
|
+
end
|
35
|
+
|
36
|
+
# remove any non-alpha characters from word
|
37
|
+
def scrub_word(word)
|
38
|
+
word &&= word[/[A-Za-z][A-Za-z'-]*/]
|
39
|
+
word &&= word[/[A-Za-z'-]*[A-Za-z]/]
|
40
|
+
(word && word.strip) || ""
|
41
|
+
end
|
42
|
+
|
43
|
+
# clean up all words in a string, returning an array of clean words
|
44
|
+
def scrub_sentence(sentence)
|
45
|
+
sentence.split(/([\s]|--)+/).collect {|a| scrub_word(a)}.select {|a| a.length>0}
|
46
|
+
end
|
47
|
+
|
48
|
+
# Yields to a block each sentence as an array of words
|
49
|
+
def each_sentence
|
50
|
+
source_sentences.each do |sentence|
|
51
|
+
yield scrub_sentence sentence
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module LiterateRandomizer
|
2
|
+
|
3
|
+
# A few utility methods
|
4
|
+
class Util
|
5
|
+
class << self
|
6
|
+
|
7
|
+
# r can be an Integer or a Range. If an Integer, return r; otherwise, return the maximum value in the range.
|
8
|
+
def max(r)
|
9
|
+
return r if r.kind_of? Integer
|
10
|
+
r.max
|
11
|
+
end
|
12
|
+
|
13
|
+
# r can be an Integer or a Range. If an Integer, return r; otherwise, return a random number within the range.
|
14
|
+
def rand_count(r,randomizer=Random.new)
|
15
|
+
return r if r.kind_of? Integer
|
16
|
+
randomizer.rand(r.max-r.min)+r.min
|
17
|
+
end
|
18
|
+
|
19
|
+
# return word with the first letter capitalized
|
20
|
+
def capitalize(word)
|
21
|
+
word.chars.first.upcase+word[1..-1]
|
22
|
+
end
|
23
|
+
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
@@ -2,6 +2,13 @@ require File.join(File.dirname(__FILE__),"..","lib","literate_randomizer")
|
|
2
2
|
|
3
3
|
describe LiterateRandomizer do
|
4
4
|
|
5
|
+
WORD="[a-zA-Z]+([-'][a-zA-Z]+)*"
|
6
|
+
CWORD="[A-Z][a-zA-Z]*([-'][a-zA-Z]+)*"
|
7
|
+
PUNCTUATION="[!.?]"
|
8
|
+
SENTENCE_TAIL = "( #{WORD})*#{PUNCTUATION}"
|
9
|
+
SENTENCE="#{CWORD}#{SENTENCE_TAIL}"
|
10
|
+
SENTENCES="#{SENTENCE}( #{SENTENCE})+"
|
11
|
+
|
5
12
|
def new_lr(options={})
|
6
13
|
$lr ||= LiterateRandomizer.create options
|
7
14
|
$lr.randomizer = Random.new(1)
|
@@ -18,74 +25,83 @@ describe LiterateRandomizer do
|
|
18
25
|
end
|
19
26
|
|
20
27
|
it "words.length should be the number of words in the file" do
|
21
|
-
new_lr.words.length.should ==
|
28
|
+
new_lr.model.words.length.should == 9117
|
22
29
|
end
|
23
30
|
|
24
31
|
it "first_words.length should be the number words starting sentences in the file" do
|
25
|
-
new_lr.first_words.length.should ==
|
26
|
-
end
|
27
|
-
|
28
|
-
it "source_sentences.length should be the number of sentences in the file" do
|
29
|
-
new_lr.source_sentences.length.should == 10699
|
30
|
-
new_lr.source_sentences.length.should > new_lr.first_word.length
|
32
|
+
new_lr.model.first_words.length.should == 585
|
31
33
|
end
|
32
34
|
|
33
35
|
it "word should return a random word" do
|
34
|
-
new_lr.word.should
|
36
|
+
new_lr.word.should match /[a-z]+/
|
35
37
|
end
|
36
38
|
|
37
39
|
it "sentence should return a random sentence" do
|
38
|
-
new_lr.sentence.should
|
40
|
+
new_lr.sentence.should match /^#{SENTENCE}$/
|
41
|
+
end
|
42
|
+
|
43
|
+
it "if we keep resetting the randomizer we should keep getting the same sentence" do
|
44
|
+
s = new_lr.sentence
|
45
|
+
10.times do
|
46
|
+
new_lr.sentence.should == s
|
47
|
+
end
|
39
48
|
end
|
40
49
|
|
41
50
|
it "sentence length should work" do
|
42
|
-
new_lr.sentence(:words => 1).should ==
|
43
|
-
new_lr.sentence(:words =>
|
44
|
-
new_lr.sentence(:words =>
|
45
|
-
new_lr.sentence(:words =>
|
46
|
-
new_lr.sentence(:words =>
|
47
|
-
|
51
|
+
new_lr.sentence(:words => 1).split(' ').length.should == 1
|
52
|
+
new_lr.sentence(:words => 2).split(' ').length.should == 2
|
53
|
+
new_lr.sentence(:words => 3).split(' ').length.should == 3
|
54
|
+
new_lr.sentence(:words => 9).split(' ').length.should == 9
|
55
|
+
a = new_lr.sentence(:words => 2..7).split(' ')
|
56
|
+
a.length.should >= 2
|
57
|
+
a.length.should <= 7
|
48
58
|
end
|
49
59
|
|
50
60
|
it "successive calls should vary" do
|
51
61
|
lr = new_lr
|
52
|
-
lr.sentence.
|
53
|
-
|
54
|
-
|
62
|
+
a,b,c = lr.sentence,lr.sentence,lr.sentence
|
63
|
+
a.should_not == b
|
64
|
+
b.should_not == c
|
65
|
+
c.should_not == a
|
55
66
|
end
|
56
67
|
|
57
68
|
it "paragraph should work" do
|
58
|
-
new_lr.paragraph.should
|
69
|
+
new_lr.paragraph.should match /([A-Z][a-zA-Z ]+[.!?])+/
|
59
70
|
end
|
60
71
|
|
61
|
-
it "
|
62
|
-
new_lr.paragraph(:sentences => 5, :words=>3).should
|
63
|
-
new_lr.paragraph(:sentences => 2..4, :words=>3).should
|
72
|
+
it "paragraph parameters should work" do
|
73
|
+
new_lr.paragraph(:sentences => 5, :words=>3).should match /^(#{CWORD} #{WORD} #{WORD}[.!?] ?){5,5}$/
|
74
|
+
new_lr.paragraph(:sentences => 2..4, :words=>3).should match /(#{CWORD} #{WORD} #{WORD}[.!?] ?){2,4}/
|
64
75
|
end
|
65
76
|
|
66
77
|
it "first_word should work" do
|
67
|
-
new_lr.paragraph(:first_word => "A",:sentences => 5, :words=>3).should
|
78
|
+
new_lr.paragraph(:first_word => "A",:sentences => 5, :words=>3).should match /^A#{SENTENCE_TAIL} #{SENTENCES}$/
|
68
79
|
end
|
69
80
|
|
70
81
|
it "punctuation should work" do
|
71
|
-
new_lr.paragraph(:punctuation => "!!!",:sentences => 5, :words=>3).should
|
82
|
+
new_lr.paragraph(:punctuation => "!!!",:sentences => 5, :words=>3).should match /^(#{CWORD} #{WORD} #{WORD}[.!?] ?){4,4}#{CWORD} #{WORD} #{WORD}!!!$/
|
72
83
|
end
|
73
84
|
|
74
85
|
it "global_randomizer_should work" do
|
75
|
-
LiterateRandomizer.global.class.should == LiterateRandomizer::
|
86
|
+
LiterateRandomizer.global.class.should == LiterateRandomizer::Randomizer
|
76
87
|
end
|
77
88
|
|
78
89
|
it "global_randomizer_should forwarding should work" do
|
79
90
|
LiterateRandomizer.respond_to?(:paragraph).should == true
|
80
91
|
LiterateRandomizer.respond_to?(:fonsfoaihdsfa).should == false
|
81
|
-
LiterateRandomizer.word.should
|
82
|
-
LiterateRandomizer.sentence.should
|
83
|
-
LiterateRandomizer.paragraph.should
|
92
|
+
LiterateRandomizer.word.should match /^#{WORD}$/
|
93
|
+
LiterateRandomizer.sentence.should match /^#{SENTENCE}$/
|
94
|
+
LiterateRandomizer.paragraph.should match /^#{SENTENCES}$/
|
95
|
+
end
|
96
|
+
|
97
|
+
it "join param should work" do
|
98
|
+
LiterateRandomizer.paragraphs(:paragraphs => 2, :words =>2, :sentences => 2, :join=>"--").should match /^#{SENTENCES}--#{SENTENCES}$/
|
84
99
|
end
|
85
100
|
|
86
101
|
it "global_randomizer_should forwarding should work" do
|
87
|
-
LiterateRandomizer.paragraphs(:words =>2, :sentences => 2).should
|
88
|
-
LiterateRandomizer.paragraphs(:words =>2, :sentences => 2, :join=>
|
89
|
-
|
102
|
+
LiterateRandomizer.paragraphs(:paragraphs => 2, :words =>2, :sentences => 2).should match /^#{SENTENCE} #{SENTENCE}\n\n#{SENTENCE} #{SENTENCE}$/
|
103
|
+
a = LiterateRandomizer.paragraphs(:paragraphs => 2, :words =>2, :sentences => 2, :join=>false)
|
104
|
+
a.length.should == 2
|
105
|
+
a.each {|b|b.should match /^#{SENTENCES}$/}
|
90
106
|
end
|
91
107
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: literate_randomizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-10-
|
12
|
+
date: 2012-10-28 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -43,6 +43,9 @@ files:
|
|
43
43
|
- data/the_lost_world_by_arthur_conan_doyle.txt
|
44
44
|
- lib/literate_randomizer.rb
|
45
45
|
- lib/literate_randomizer/markov.rb
|
46
|
+
- lib/literate_randomizer/randomizer.rb
|
47
|
+
- lib/literate_randomizer/source_parser.rb
|
48
|
+
- lib/literate_randomizer/util.rb
|
46
49
|
- lib/literate_randomizer/version.rb
|
47
50
|
- literate_randomizer.gemspec
|
48
51
|
- spec/literate_randomizer_spec.rb
|
@@ -73,3 +76,4 @@ summary: A random sentence and paragraph generator gem. Using Markov chains, thi
|
|
73
76
|
generates near-english prose.
|
74
77
|
test_files:
|
75
78
|
- spec/literate_randomizer_spec.rb
|
79
|
+
has_rdoc:
|