markov_words 2.0.1 → 2.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/bin/benchmark +68 -64
- data/lib/markov_words/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f4c5006ece0a2f2eb6e4f30ac865ef48ba18ed80
|
4
|
+
data.tar.gz: dea0a20dc2d7f05f38f55821c7e88ad254aa5470
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1c6f73c9a170deae7f904635eb326b71effa6ed6f0a45da190f8b5e9009606956efd686e3de16f69ce0e5e9c4a4def2774d5b52186d2fbc0df48ca83de343584
|
7
|
+
data.tar.gz: 35edfedcbaa4501ab892def54cc213e641195c6bd4c7c7e52d3044bb8f2c7772a0439289958461f92c310c5760f8f3ba78ba7472a7daf3b4d95c5359df59b8f5
|
data/Gemfile.lock
CHANGED
data/bin/benchmark
CHANGED
@@ -5,80 +5,84 @@ require 'benchmark'
|
|
5
5
|
require 'bundler/setup'
|
6
6
|
require 'markov_words'
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
test_if_desired 'initial database creation time versus gram size' do
|
13
|
-
Benchmark.bm(LABEL_WIDTH) do |x|
|
14
|
-
@min_gram_size.upto(@max_gram_size) do |size|
|
15
|
-
generator =
|
16
|
-
MarkovWords::Generator.new(flush_data: true,
|
17
|
-
gram_size: size,
|
18
|
-
corpus_file: @corpus_file)
|
19
|
-
x.report("size: #{size}") { generator.word }
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|
8
|
+
if ARGV.empty?
|
9
|
+
puts 'USAGE: bin/benchmark min_gram_size max_gram_size corpus_file'
|
10
|
+
puts 'EXAMPLE: bin/benchmark 2 6 /usr/share/dict/words'
|
11
|
+
end
|
23
12
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
MarkovWords::Generator.new(flush_data: true,
|
29
|
-
gram_size: size,
|
30
|
-
corpus_file: @corpus_file)
|
31
|
-
_word = generator.word # this will run initial setup
|
32
|
-
generator_load_data_from_file =
|
33
|
-
MarkovWords::Generator.new(gram_size: size,
|
34
|
-
corpus_file: @corpus_file)
|
35
|
-
x.report("size: #{size}") { generator_load_data_from_file.word }
|
36
|
-
end
|
37
|
-
end
|
38
|
-
end
|
13
|
+
@min_gram_size = ARGV[0].to_i || 1
|
14
|
+
@max_gram_size = ARGV[1].to_i || 6
|
15
|
+
@corpus_file = ARGV[2] || '/usr/share/dict/words'
|
16
|
+
@label_width = 7
|
39
17
|
|
40
|
-
|
41
|
-
|
18
|
+
puts "Minimum n-gram size set to #{@min_gram_size}"
|
19
|
+
puts "Maximum n-gram size set to #{@max_gram_size}"
|
20
|
+
puts "Corpus file set to #{@corpus_file}"
|
21
|
+
|
22
|
+
def print_separator
|
23
|
+
printf "%s\n", Array.new(70).map { '-' }.join
|
24
|
+
end
|
25
|
+
|
26
|
+
def test_if_desired(description, **opts)
|
27
|
+
printf "\n%s", "Test #{description}? (y/n) "
|
28
|
+
if /y/.match?($stdin.readline)
|
29
|
+
print_separator
|
30
|
+
if opts.fetch :benchmark, true
|
31
|
+
Benchmark.bm(@label_width) do |report|
|
42
32
|
@min_gram_size.upto(@max_gram_size) do |size|
|
43
|
-
|
44
|
-
MarkovWords::Generator.new(flush_data: true,
|
45
|
-
gram_size: size,
|
46
|
-
perform_caching: false,
|
47
|
-
corpus_file: @corpus_file)
|
48
|
-
_word = generator.word # this will run initial setup
|
49
|
-
x.report("size: #{size}") { 1.upto(100) { generator.word } }
|
33
|
+
yield(report, size) if block_given?
|
50
34
|
end
|
51
35
|
end
|
36
|
+
else
|
37
|
+
@min_gram_size.upto(@max_gram_size) do |size|
|
38
|
+
yield(size) if block_given?
|
39
|
+
end
|
52
40
|
end
|
41
|
+
print_separator
|
53
42
|
end
|
43
|
+
end
|
54
44
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
puts "Maximum n-gram size set to #{@max_gram_size}"
|
61
|
-
puts "Corpus file set to #{@corpus_file}"
|
62
|
-
end
|
45
|
+
def new_generator(**opts)
|
46
|
+
opts = {
|
47
|
+
flush_data: true,
|
48
|
+
corpus_file: @corpus_file
|
49
|
+
}.merge(opts)
|
63
50
|
|
64
|
-
|
65
|
-
|
66
|
-
end
|
51
|
+
generator = MarkovWords::Generator.new(opts)
|
52
|
+
_word = generator.word if opts.fetch(:pre_seed, false)
|
67
53
|
|
68
|
-
|
69
|
-
printf "\n%s", "Test #{description}? (y/n) "
|
70
|
-
if /y/.match?($stdin.readline)
|
71
|
-
print_separator
|
72
|
-
yield(block)
|
73
|
-
print_separator
|
74
|
-
end
|
75
|
-
end
|
54
|
+
generator
|
76
55
|
end
|
77
56
|
|
78
|
-
|
79
|
-
|
57
|
+
report_name = 'initial database creation time versus gram size'
|
58
|
+
test_if_desired report_name do |report, size|
|
59
|
+
generator = new_generator(gram_size: size)
|
60
|
+
report.report("size: #{size}") { generator.word }
|
61
|
+
end
|
62
|
+
|
63
|
+
report_name = 'existing database on disk, initial memory load'
|
64
|
+
test_if_desired report_name do |report, size|
|
65
|
+
new_generator(pre_seed: true, gram_size: size)
|
66
|
+
generator_existing_db = new_generator(flush_data: false, gram_size: size)
|
67
|
+
report.report("size: #{size}") { generator_existing_db.word }
|
68
|
+
end
|
69
|
+
|
70
|
+
report_name = 'word generation averages for 100 words per gram size'
|
71
|
+
test_if_desired report_name do |report, size|
|
72
|
+
generator = new_generator(pre_seed: true, gram_size: size)
|
73
|
+
report.report("size: #{size}") { 1.upto(100) { generator.word } }
|
74
|
+
end
|
75
|
+
|
76
|
+
test_if_desired 'word uniqueness per gram size', benchmark: false do |size|
|
77
|
+
generator = new_generator(gram_size: size)
|
78
|
+
num_words = 6000
|
79
|
+
seen_words = {}
|
80
|
+
|
81
|
+
num_words.times do
|
82
|
+
word = generator.word
|
83
|
+
seen_words[word].nil? ? seen_words[word] = 0 : seen_words[word] += 1
|
84
|
+
end
|
85
|
+
num_dupes = seen_words.find_all{ |_key, val| val > 1 }.length
|
86
|
+
|
87
|
+
puts "n-gram size: #{size}, #{num_dupes}/#{num_words} words were duplicates."
|
80
88
|
end
|
81
|
-
bm = GeneratorBenchmark.new(min_gram_size: ARGV[0].to_i,
|
82
|
-
max_gram_size: ARGV[1].to_i,
|
83
|
-
corpus_file: ARGV[2])
|
84
|
-
bm.run
|
data/lib/markov_words/version.rb
CHANGED