markov_words 1.0.1 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +54 -10
- data/bin/benchmark +84 -0
- data/lib/markov_words/generator.rb +15 -51
- data/lib/markov_words/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2605351d6864f6d2cb9ffc5f498b647b306dc600
|
4
|
+
data.tar.gz: 85d646b15bb737aca9394f69cde8ea03922bcc10
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1222be879d0fec47f344f71893b43cfba653bb05e5fdca56ba4409dad723efd4158458b7f4065d80fb4a31bcaa09878568264564ba36b5c3620fe7afe2092802
|
7
|
+
data.tar.gz: 13bd043e2168d8f2dbe1cf3f33c38268b0bed4a6f0fcfde0f5ae360a770c0b376913ef4973b350a3ad143864a5766ddfddd22de23ddfedb6b328d1c1e8864a08
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -72,28 +72,72 @@ You can also clear out the contents of the data file (because `MarkovWords` will
|
|
72
72
|
generator = MarkovWords::Generator.new(data_file: '/tmp/markov.data', flush_data: true)
|
73
73
|
```
|
74
74
|
|
75
|
+
### Custom Metadata
|
75
76
|
|
76
|
-
|
77
|
+
A `Generator` object gives you access to its `.data_store`, which is an instance of a `FileStore` object. This gives you the ability to store custom metadata into the same database that holds the n-gram information.
|
77
78
|
|
78
|
-
|
79
|
+
One example of how you might use this would be to cache words for later use (since initial word generation can be slow, even after the database has been generated the first time):
|
79
80
|
|
80
81
|
```ruby
|
81
|
-
|
82
|
-
|
82
|
+
generator = MarkovWords::Generator.new
|
83
|
+
my_cache = 100.times.map { generator.word }
|
84
|
+
generator.data_store.store_data :cache, my_cache
|
83
85
|
|
84
|
-
#
|
85
|
-
|
86
|
+
# then later, perhaps on another page load in a web server...
|
87
|
+
my_cache = generator.data_store.retrieve_data :cache
|
86
88
|
```
|
87
89
|
|
88
|
-
|
90
|
+
### Benchmarking
|
89
91
|
|
90
|
-
|
91
|
-
|
92
|
-
|
92
|
+
We've included a `bin/benchmark` script, which will measure initial load times, and then the time it takes to generate 100 words at various dictionary n-gram sizes.
|
93
|
+
|
94
|
+
Here is an example run:
|
95
|
+
```
|
96
|
+
bin/benchmark 1 6 '/usr/share/dict/words'
|
97
|
+
Minimum n-gram size set to 1
|
98
|
+
Maximum n-gram size set to 6
|
99
|
+
Corpus file set to /usr/share/dict/words
|
100
|
+
|
101
|
+
Test initial database creation time versus gram size? (y/n) y
|
102
|
+
------------------------------------------------------------
|
103
|
+
user system total real
|
104
|
+
size: 1 4.080000 0.010000 4.090000 ( 4.108898)
|
105
|
+
size: 2 8.320000 0.090000 8.410000 ( 8.554122)
|
106
|
+
size: 3 12.710000 0.080000 12.790000 ( 12.869257)
|
107
|
+
size: 4 18.750000 0.160000 18.910000 ( 19.102232)
|
108
|
+
size: 5 25.440000 0.250000 25.690000 ( 25.953532)
|
109
|
+
size: 6 31.060000 0.340000 31.400000 ( 31.680680)
|
110
|
+
------------------------------------------------------------
|
111
|
+
|
112
|
+
Test existing database on disk, initial memory load? (y/n) y
|
113
|
+
------------------------------------------------------------
|
114
|
+
user system total real
|
115
|
+
size: 1 0.000000 0.000000 0.000000 ( 0.000587)
|
116
|
+
size: 2 0.000000 0.000000 0.000000 ( 0.005109)
|
117
|
+
size: 3 0.080000 0.010000 0.090000 ( 0.077303)
|
118
|
+
size: 4 0.330000 0.070000 0.400000 ( 0.395079)
|
119
|
+
size: 5 1.030000 0.130000 1.160000 ( 1.157014)
|
120
|
+
size: 6 2.920000 0.120000 3.040000 ( 3.045219)
|
121
|
+
------------------------------------------------------------
|
122
|
+
|
123
|
+
Test word generation averages for 100 words per gram size? (y/n) y
|
124
|
+
------------------------------------------------------------
|
125
|
+
user system total real
|
126
|
+
size: 1 0.010000 0.000000 0.010000 ( 0.003971)
|
127
|
+
size: 2 0.010000 0.000000 0.010000 ( 0.009460)
|
128
|
+
size: 3 0.120000 0.000000 0.120000 ( 0.127297)
|
129
|
+
size: 4 0.350000 0.010000 0.360000 ( 0.354564)
|
130
|
+
size: 5 2.250000 0.020000 2.270000 ( 2.302405)
|
131
|
+
size: 6 4.000000 0.120000 4.120000 ( 4.186757)
|
132
|
+
------------------------------------------------------------
|
93
133
|
```
|
94
134
|
|
95
135
|
## Change Log
|
96
136
|
|
137
|
+
- `2.0.0`
|
138
|
+
- Breaking changes:
|
139
|
+
- Removed all caching functions from `Generator`. They were cluttering up the code, without being a necessary function of a `Generator`.
|
140
|
+
- Added an `attr_reader` for `Generator.data_store`, so that users can implement custom metadata for `Generator` objects, and store it in the same `FileStore` object that holds the database.
|
97
141
|
- `1.0.0` introduced a couple of breaking changes:
|
98
142
|
- `Words` class renamed to `Generator`.
|
99
143
|
- `Generator`:
|
data/bin/benchmark
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen-string-literal: true
|
3
|
+
|
4
|
+
require 'benchmark'
|
5
|
+
require 'bundler/setup'
|
6
|
+
require 'markov_words'
|
7
|
+
|
8
|
+
# Utility class to generate benchmarks for MarkovWords
|
9
|
+
class GeneratorBenchmark
|
10
|
+
LABEL_WIDTH = 7
|
11
|
+
def run
|
12
|
+
test_if_desired 'initial database creation time versus gram size' do
|
13
|
+
Benchmark.bm(LABEL_WIDTH) do |x|
|
14
|
+
@min_gram_size.upto(@max_gram_size) do |size|
|
15
|
+
generator =
|
16
|
+
MarkovWords::Generator.new(flush_data: true,
|
17
|
+
gram_size: size,
|
18
|
+
corpus_file: @corpus_file)
|
19
|
+
x.report("size: #{size}") { generator.word }
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
test_if_desired 'existing database on disk, initial memory load' do
|
25
|
+
Benchmark.bm(LABEL_WIDTH) do |x|
|
26
|
+
@min_gram_size.upto(@max_gram_size) do |size|
|
27
|
+
generator =
|
28
|
+
MarkovWords::Generator.new(flush_data: true,
|
29
|
+
gram_size: size,
|
30
|
+
corpus_file: @corpus_file)
|
31
|
+
_word = generator.word # this will run initial setup
|
32
|
+
generator_load_data_from_file =
|
33
|
+
MarkovWords::Generator.new(gram_size: size,
|
34
|
+
corpus_file: @corpus_file)
|
35
|
+
x.report("size: #{size}") { generator_load_data_from_file.word }
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
test_if_desired 'word generation averages for 100 words per gram size' do
|
41
|
+
Benchmark.bm(LABEL_WIDTH) do |x|
|
42
|
+
@min_gram_size.upto(@max_gram_size) do |size|
|
43
|
+
generator =
|
44
|
+
MarkovWords::Generator.new(flush_data: true,
|
45
|
+
gram_size: size,
|
46
|
+
perform_caching: false,
|
47
|
+
corpus_file: @corpus_file)
|
48
|
+
_word = generator.word # this will run initial setup
|
49
|
+
x.report("size: #{size}") { 1.upto(100) { generator.word } }
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
def initialize(opts)
|
56
|
+
@min_gram_size = opts.fetch :min_gram_size, 1
|
57
|
+
@max_gram_size = opts.fetch :max_gram_size, 6
|
58
|
+
@corpus_file = opts.fetch :corpus_file, '/usr/share/dict/words'
|
59
|
+
puts "Minimum n-gram size set to #{@min_gram_size}"
|
60
|
+
puts "Maximum n-gram size set to #{@max_gram_size}"
|
61
|
+
puts "Corpus file set to #{@corpus_file}"
|
62
|
+
end
|
63
|
+
|
64
|
+
def print_separator
|
65
|
+
printf "%s\n", Array.new(60).map { '-' }.join
|
66
|
+
end
|
67
|
+
|
68
|
+
def test_if_desired(description, &block)
|
69
|
+
printf "\n%s", "Test #{description}? (y/n) "
|
70
|
+
if /y/.match?($stdin.readline)
|
71
|
+
print_separator
|
72
|
+
yield(block)
|
73
|
+
print_separator
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
if ARGV.empty?
|
79
|
+
puts "USAGE: bin/benchmark min_gram_size max_gram_size corpus_file\n"
|
80
|
+
end
|
81
|
+
bm = GeneratorBenchmark.new(min_gram_size: ARGV[0].to_i,
|
82
|
+
max_gram_size: ARGV[1].to_i,
|
83
|
+
corpus_file: ARGV[2])
|
84
|
+
bm.run
|
data/lib/markov_words/generator.rb
CHANGED
@@ -1,26 +1,27 @@
|
|
1
1
|
# frozen-string-literal: true
|
2
2
|
|
3
3
|
module MarkovWords
|
4
|
-
# This class takes care of word generation,
|
4
|
+
# This class takes care of word generation, and will store the database into
|
5
|
+
# a `FileStore` object.
|
5
6
|
class Generator
|
6
|
-
#
|
7
|
-
#
|
8
|
-
|
9
|
-
|
10
|
-
end
|
7
|
+
# It's useful to be able to access the data store object directly, for
|
8
|
+
# example if you were to want to implement storage of related metadata
|
9
|
+
# into the same storage system that holds the database.
|
10
|
+
attr_reader :data_store
|
11
11
|
|
12
12
|
# The current database of n-gram mappings
|
13
13
|
# @return [Hash] n-gram database
|
14
14
|
def grams
|
15
|
-
|
16
|
-
|
17
|
-
|
15
|
+
if @grams.nil?
|
16
|
+
@grams = @data_store.retrieve_data(:grams) ||
|
17
|
+
markov_corpus(@corpus_file, @gram_size)
|
18
|
+
else
|
19
|
+
@grams
|
20
|
+
end
|
18
21
|
end
|
19
22
|
|
20
23
|
# Create a new "Generator" object
|
21
24
|
# @param opts [Hash]
|
22
|
-
# @option opts [Integer] :cache_size How many words to pre-calculate +
|
23
|
-
# store in the cache for quick retrieval
|
24
25
|
# @option opts [String] :corpus_file ('/usr/share/dict/words') Your
|
25
26
|
# dictionary of words.
|
26
27
|
# @option opts [String] :data_file Location where calculations are
|
@@ -34,7 +35,6 @@ module MarkovWords
|
|
34
35
|
# NOTE: If your corpus size is very small (<1000 words or so), it's hard
|
35
36
|
# to guarantee a min_length because so many n-grams will have no
|
36
37
|
# association, which terminates word generation.
|
37
|
-
# @option opts [Boolean] :perform_caching (true) Perform caching?
|
38
38
|
# @return [Generator] A `MarkovWords::Generator` object.
|
39
39
|
def initialize(opts = {})
|
40
40
|
@grams = nil
|
@@ -42,33 +42,13 @@ module MarkovWords
|
|
42
42
|
@max_length = opts.fetch :max_length, 16
|
43
43
|
@min_length = opts.fetch :min_length, 3
|
44
44
|
|
45
|
-
initialize_cache(opts)
|
46
45
|
initialize_data(opts)
|
47
46
|
end
|
48
47
|
|
49
|
-
#
|
50
|
-
# `@cache_size`. If `perform_caching` is set to `false`, returns an empty
|
51
|
-
# array.
|
52
|
-
# @return [Array<String>] All words in the cache.
|
53
|
-
def refresh_cache
|
54
|
-
if @perform_caching
|
55
|
-
words_array = @data_store.retrieve_data(:cache) || []
|
56
|
-
words_array << generate_word while words_array.length < @cache_size
|
57
|
-
@data_store.store_data(:cache, words_array)
|
58
|
-
words_array
|
59
|
-
else
|
60
|
-
[]
|
61
|
-
end
|
62
|
-
end
|
63
|
-
|
64
|
-
# Generate a new word, or return one from the cache if available.
|
48
|
+
# Generate a new word
|
65
49
|
# @return [String] The word.
|
66
50
|
def word
|
67
|
-
|
68
|
-
load_word_from_cache
|
69
|
-
else
|
70
|
-
generate_word
|
71
|
-
end
|
51
|
+
generate_word
|
72
52
|
end
|
73
53
|
|
74
54
|
private
|
@@ -81,11 +61,6 @@ module MarkovWords
|
|
81
61
|
end
|
82
62
|
end
|
83
63
|
|
84
|
-
def initialize_cache(opts)
|
85
|
-
@cache_size = opts.fetch :cache_size, 100
|
86
|
-
@perform_caching = opts.fetch :perform_caching, true
|
87
|
-
end
|
88
|
-
|
89
64
|
def initialize_data(opts)
|
90
65
|
@corpus_file = opts.fetch :corpus_file, '/usr/share/dict/words'
|
91
66
|
@data_file = opts.fetch :data_file, 'tmp/markov_words.data'
|
@@ -138,18 +113,6 @@ module MarkovWords
|
|
138
113
|
/[\r\n]/.match? word
|
139
114
|
end
|
140
115
|
|
141
|
-
def load_word_from_cache
|
142
|
-
words_array = @data_store.retrieve_data(:cache)
|
143
|
-
if words_array.nil? || words_array.empty?
|
144
|
-
words_array = Array.new(@cache_size) { generate_word }
|
145
|
-
end
|
146
|
-
|
147
|
-
word = words_array.pop
|
148
|
-
@data_store.store_data(:cache, words_array)
|
149
|
-
|
150
|
-
word
|
151
|
-
end
|
152
|
-
|
153
116
|
# Generate a MarkovWords corpus from a datafile, with a given size of
|
154
117
|
# n-gram. Returns a hash of "grams", which are a map of a letter to the
|
155
118
|
# frequency of the letters that follow it, eg: {"c" => {"a" => 1, "b" =>
|
@@ -165,6 +128,7 @@ module MarkovWords
|
|
165
128
|
end
|
166
129
|
end
|
167
130
|
|
131
|
+
@data_store.store_data(:grams, grams)
|
168
132
|
grams
|
169
133
|
end
|
170
134
|
|
data/lib/markov_words/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: markov_words
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Donald Merand
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-12-
|
11
|
+
date: 2017-12-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -114,6 +114,7 @@ files:
|
|
114
114
|
- LICENSE.txt
|
115
115
|
- README.md
|
116
116
|
- Rakefile
|
117
|
+
- bin/benchmark
|
117
118
|
- bin/console
|
118
119
|
- bin/setup
|
119
120
|
- lib/markov_words.rb
|