markov_words 2.0.1 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/bin/benchmark +68 -64
- data/lib/markov_words/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: f4c5006ece0a2f2eb6e4f30ac865ef48ba18ed80
|
|
4
|
+
data.tar.gz: dea0a20dc2d7f05f38f55821c7e88ad254aa5470
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1c6f73c9a170deae7f904635eb326b71effa6ed6f0a45da190f8b5e9009606956efd686e3de16f69ce0e5e9c4a4def2774d5b52186d2fbc0df48ca83de343584
|
|
7
|
+
data.tar.gz: 35edfedcbaa4501ab892def54cc213e641195c6bd4c7c7e52d3044bb8f2c7772a0439289958461f92c310c5760f8f3ba78ba7472a7daf3b4d95c5359df59b8f5
|
data/Gemfile.lock
CHANGED
data/bin/benchmark
CHANGED
|
@@ -5,80 +5,84 @@ require 'benchmark'
|
|
|
5
5
|
require 'bundler/setup'
|
|
6
6
|
require 'markov_words'
|
|
7
7
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
test_if_desired 'initial database creation time versus gram size' do
|
|
13
|
-
Benchmark.bm(LABEL_WIDTH) do |x|
|
|
14
|
-
@min_gram_size.upto(@max_gram_size) do |size|
|
|
15
|
-
generator =
|
|
16
|
-
MarkovWords::Generator.new(flush_data: true,
|
|
17
|
-
gram_size: size,
|
|
18
|
-
corpus_file: @corpus_file)
|
|
19
|
-
x.report("size: #{size}") { generator.word }
|
|
20
|
-
end
|
|
21
|
-
end
|
|
22
|
-
end
|
|
8
|
+
if ARGV.empty?
|
|
9
|
+
puts 'USAGE: bin/benchmark min_gram_size max_gram_size corpus_file'
|
|
10
|
+
puts 'EXAMPLE: bin/benchmark 2 6 /usr/share/dict/words'
|
|
11
|
+
end
|
|
23
12
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
MarkovWords::Generator.new(flush_data: true,
|
|
29
|
-
gram_size: size,
|
|
30
|
-
corpus_file: @corpus_file)
|
|
31
|
-
_word = generator.word # this will run initial setup
|
|
32
|
-
generator_load_data_from_file =
|
|
33
|
-
MarkovWords::Generator.new(gram_size: size,
|
|
34
|
-
corpus_file: @corpus_file)
|
|
35
|
-
x.report("size: #{size}") { generator_load_data_from_file.word }
|
|
36
|
-
end
|
|
37
|
-
end
|
|
38
|
-
end
|
|
13
|
+
@min_gram_size = ARGV[0].to_i || 1
|
|
14
|
+
@max_gram_size = ARGV[1].to_i || 6
|
|
15
|
+
@corpus_file = ARGV[2] || '/usr/share/dict/words'
|
|
16
|
+
@label_width = 7
|
|
39
17
|
|
|
40
|
-
|
|
41
|
-
|
|
18
|
+
puts "Minimum n-gram size set to #{@min_gram_size}"
|
|
19
|
+
puts "Maximum n-gram size set to #{@max_gram_size}"
|
|
20
|
+
puts "Corpus file set to #{@corpus_file}"
|
|
21
|
+
|
|
22
|
+
def print_separator
|
|
23
|
+
printf "%s\n", Array.new(70).map { '-' }.join
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def test_if_desired(description, **opts)
|
|
27
|
+
printf "\n%s", "Test #{description}? (y/n) "
|
|
28
|
+
if /y/.match?($stdin.readline)
|
|
29
|
+
print_separator
|
|
30
|
+
if opts.fetch :benchmark, true
|
|
31
|
+
Benchmark.bm(@label_width) do |report|
|
|
42
32
|
@min_gram_size.upto(@max_gram_size) do |size|
|
|
43
|
-
|
|
44
|
-
MarkovWords::Generator.new(flush_data: true,
|
|
45
|
-
gram_size: size,
|
|
46
|
-
perform_caching: false,
|
|
47
|
-
corpus_file: @corpus_file)
|
|
48
|
-
_word = generator.word # this will run initial setup
|
|
49
|
-
x.report("size: #{size}") { 1.upto(100) { generator.word } }
|
|
33
|
+
yield(report, size) if block_given?
|
|
50
34
|
end
|
|
51
35
|
end
|
|
36
|
+
else
|
|
37
|
+
@min_gram_size.upto(@max_gram_size) do |size|
|
|
38
|
+
yield(size) if block_given?
|
|
39
|
+
end
|
|
52
40
|
end
|
|
41
|
+
print_separator
|
|
53
42
|
end
|
|
43
|
+
end
|
|
54
44
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
puts "Maximum n-gram size set to #{@max_gram_size}"
|
|
61
|
-
puts "Corpus file set to #{@corpus_file}"
|
|
62
|
-
end
|
|
45
|
+
def new_generator(**opts)
|
|
46
|
+
opts = {
|
|
47
|
+
flush_data: true,
|
|
48
|
+
corpus_file: @corpus_file
|
|
49
|
+
}.merge(opts)
|
|
63
50
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
end
|
|
51
|
+
generator = MarkovWords::Generator.new(opts)
|
|
52
|
+
_word = generator.word if opts.fetch(:pre_seed, false)
|
|
67
53
|
|
|
68
|
-
|
|
69
|
-
printf "\n%s", "Test #{description}? (y/n) "
|
|
70
|
-
if /y/.match?($stdin.readline)
|
|
71
|
-
print_separator
|
|
72
|
-
yield(block)
|
|
73
|
-
print_separator
|
|
74
|
-
end
|
|
75
|
-
end
|
|
54
|
+
generator
|
|
76
55
|
end
|
|
77
56
|
|
|
78
|
-
|
|
79
|
-
|
|
57
|
+
report_name = 'initial database creation time versus gram size'
|
|
58
|
+
test_if_desired report_name do |report, size|
|
|
59
|
+
generator = new_generator(gram_size: size)
|
|
60
|
+
report.report("size: #{size}") { generator.word }
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
report_name = 'existing database on disk, initial memory load'
|
|
64
|
+
test_if_desired report_name do |report, size|
|
|
65
|
+
new_generator(pre_seed: true, gram_size: size)
|
|
66
|
+
generator_existing_db = new_generator(flush_data: false, gram_size: size)
|
|
67
|
+
report.report("size: #{size}") { generator_existing_db.word }
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
report_name = 'word generation averages for 100 words per gram size'
|
|
71
|
+
test_if_desired report_name do |report, size|
|
|
72
|
+
generator = new_generator(pre_seed: true, gram_size: size)
|
|
73
|
+
report.report("size: #{size}") { 1.upto(100) { generator.word } }
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
test_if_desired 'word uniqueness per gram size', benchmark: false do |size|
|
|
77
|
+
generator = new_generator(gram_size: size)
|
|
78
|
+
num_words = 6000
|
|
79
|
+
seen_words = {}
|
|
80
|
+
|
|
81
|
+
num_words.times do
|
|
82
|
+
word = generator.word
|
|
83
|
+
seen_words[word].nil? ? seen_words[word] = 0 : seen_words[word] += 1
|
|
84
|
+
end
|
|
85
|
+
num_dupes = seen_words.find_all{ |_key, val| val > 1 }.length
|
|
86
|
+
|
|
87
|
+
puts "n-gram size: #{size}, #{num_dupes}/#{num_words} words were duplicates."
|
|
80
88
|
end
|
|
81
|
-
bm = GeneratorBenchmark.new(min_gram_size: ARGV[0].to_i,
|
|
82
|
-
max_gram_size: ARGV[1].to_i,
|
|
83
|
-
corpus_file: ARGV[2])
|
|
84
|
-
bm.run
|
data/lib/markov_words/version.rb
CHANGED