dimsum 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/Gemfile +5 -0
- data/Rakefile +10 -0
- data/bin/dimsum +12 -25
- data/dimsum.gemspec +1 -1
- data/lib/dimsum.rb +34 -1
- data/lib/dimsum/version.rb +1 -1
- data/test/dimsum_test.rb +41 -0
- metadata +7 -5
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/Rakefile
CHANGED
data/bin/dimsum
CHANGED
@@ -1,6 +1,7 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
1
|
require 'optparse'
|
2
|
+
require 'dimsum'
|
3
|
+
|
4
|
+
include Dimsum
|
4
5
|
|
5
6
|
options = {}
|
6
7
|
|
@@ -11,29 +12,15 @@ OptionParser.new do |opts|
|
|
11
12
|
end
|
12
13
|
end.parse!
|
13
14
|
|
14
|
-
|
15
|
-
lines = options[:lines].to_i
|
16
|
-
raise ArgumentError, "The filename is required" unless filename
|
17
|
-
|
18
|
-
file_size = `wc -l #{filename}`.strip.to_i
|
15
|
+
options[:lines] ||= "5"
|
19
16
|
|
20
|
-
|
21
|
-
if last_2 == "\n\n"
|
22
|
-
file_size -= 1
|
23
|
-
end
|
24
|
-
|
25
|
-
random = Random.new
|
17
|
+
lines = options[:lines].to_i
|
26
18
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
STDOUT.print line
|
34
|
-
STDOUT.flush
|
35
|
-
lines -= 1
|
36
|
-
end
|
37
|
-
file_size -= 1
|
38
|
-
end
|
19
|
+
reservoir_sample = if ARGV.length > 0
|
20
|
+
filename = ARGV[0]
|
21
|
+
reservoir_file(filename, lines)
|
22
|
+
else
|
23
|
+
reservoir_stdin(lines)
|
24
|
+
end
|
39
25
|
|
26
|
+
reservoir_sample.each{ |l| STDOUT.print l }
|
data/dimsum.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |gem|
|
|
10
10
|
|
11
11
|
gem.files = `git ls-files`.split($\)
|
12
12
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
13
|
-
gem.test_files = gem.files.grep(%r{^(test
|
13
|
+
gem.test_files = gem.files.grep(%r{^(test)/})
|
14
14
|
gem.name = "dimsum"
|
15
15
|
gem.require_paths = ["lib"]
|
16
16
|
gem.bindir = 'bin'
|
data/lib/dimsum.rb
CHANGED
@@ -1,5 +1,38 @@
|
|
1
1
|
require "dimsum/version"
|
2
2
|
|
3
3
|
# Reservoir sampling helpers: pick a uniform random sample of lines from a
# file or from standard input in a single pass over the input.
module Dimsum
  # Samples up to +sample_size+ lines from the file at +filename+.
  #
  # @param filename [String] path of the file to read
  # @param sample_size [Integer] maximum number of lines to return
  # @return [Array<String>] the sampled lines, line terminators included;
  #   when the file has at most +sample_size+ lines, every line in file order
  def reservoir_file(filename, sample_size)
    # Block form guarantees the handle is closed, even if sampling raises.
    File.open(filename, "r") do |file|
      reservoir_lines(file.each_line, sample_size)
    end
  end

  # Samples up to +sample_size+ lines from standard input.
  #
  # @param sample_size [Integer] maximum number of lines to return
  # @return [Array<String>] the sampled lines, line terminators included
  def reservoir_stdin(sample_size)
    # Stream line by line instead of slurping the whole input with
    # +readlines+, so only the reservoir itself is held in memory.
    reservoir_lines($stdin.each_line, sample_size)
  end

  # Decides whether the line at the given position enters the reservoir and,
  # if so, which slot it occupies.
  #
  # @param input_line_number [Integer] zero-based position of the line
  # @param sample_size [Integer] capacity of the reservoir
  # @param random_number_generator [#rand] object whose +rand+ accepts a Range
  #   and returns an Integer inside it (e.g. a +Random+ instance)
  # @return [Array(Boolean, Integer)] +[keep, index]+; +index+ is the
  #   reservoir slot to overwrite and is only meaningful when +keep+ is true
  def keep_line_in_index(input_line_number, sample_size, random_number_generator)
    # Line numbers are zero-based, so only lines 0...sample_size fill the
    # reservoir directly; a `<=` here would admit one line too many.
    if input_line_number < sample_size
      [true, input_line_number]
    else
      # Pick a slot among all lines seen so far (inclusive); the line survives
      # only when that slot falls inside the reservoir.
      r = random_number_generator.rand(0 .. input_line_number)
      [r < sample_size, r]
    end
  end

  private

  # Runs the reservoir sampling loop shared by the file and stdin entry points.
  #
  # @param lines [Enumerator] yields the input lines in order
  # @param sample_size [Integer] capacity of the reservoir
  # @return [Array<String>] the reservoir after all lines were consumed
  def reservoir_lines(lines, sample_size)
    random = Random.new

    lines.each_with_index.each_with_object([]) do |(line, line_number), reservoir|
      keep, index = keep_line_in_index(line_number, sample_size, random)
      reservoir[index] = line if keep
    end
  end
end
|
data/lib/dimsum/version.rb
CHANGED
data/test/dimsum_test.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'lib/dimsum'
|
2
|
+
require 'minitest/autorun'
|
3
|
+
|
4
|
+
# Unit tests for Dimsum#keep_line_in_index. The random number generator is
# replaced by a mock so each decision is deterministic.
class TestDimsum < MiniTest::Unit::TestCase
  include Dimsum

  # Lines numbered below the sample size are kept without asking the random
  # number generator: the mock has no expectations set up here.
  def test_keep_line_in_index__should_keep_any_line_number_less_than_sample_size
    random = MiniTest::Mock.new
    sample_size = 5

    (0..4).each do |k|
      keep, _ = keep_line_in_index(k, sample_size, random)
      assert keep
    end
  end

  # Past the fill phase, a line is kept when the random index drawn for it is
  # below the sample size, and that index is returned as the slot to replace.
  def test_keep_line_in_index__should_keep_line_when_random_index_is_less_than_sample_size
    expected_index = 2
    random = MiniTest::Mock.new
    current_line = 10
    random.expect(:rand, expected_index, [(0 .. current_line)])
    sample_size = 5

    keep, actual_index = keep_line_in_index(current_line, sample_size, random)

    assert keep
    assert_equal expected_index, actual_index
    # Confirms the generator was asked for an index in 0..current_line.
    random.verify
  end

  # Past the fill phase, a line is dropped when the random index drawn for it
  # is greater than or equal to the sample size.
  def test_keep_line_in_index__should_not_keep_line_when_random_index_is_at_least_sample_size
    random_value = 8
    random = MiniTest::Mock.new
    current_line = 10
    random.expect(:rand, random_value, [(0 .. current_line)])
    sample_size = 5

    keep, _ = keep_line_in_index(current_line, sample_size, random)

    refute keep
    # Confirms the generator was asked for an index in 0..current_line.
    random.verify
  end
end
|
41
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dimsum
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2012-
|
14
|
+
date: 2012-07-09 00:00:00.000000000 Z
|
15
15
|
dependencies: []
|
16
16
|
description: dimsum is a very simple ruby script that performs reservoir sampling
|
17
17
|
on the input file.
|
@@ -33,6 +33,7 @@ files:
|
|
33
33
|
- dimsum.gemspec
|
34
34
|
- lib/dimsum.rb
|
35
35
|
- lib/dimsum/version.rb
|
36
|
+
- test/dimsum_test.rb
|
36
37
|
homepage: ''
|
37
38
|
licenses: []
|
38
39
|
post_install_message:
|
@@ -47,7 +48,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
47
48
|
version: '0'
|
48
49
|
segments:
|
49
50
|
- 0
|
50
|
-
hash:
|
51
|
+
hash: 1690918052010414443
|
51
52
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
53
|
none: false
|
53
54
|
requirements:
|
@@ -56,11 +57,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
56
57
|
version: '0'
|
57
58
|
segments:
|
58
59
|
- 0
|
59
|
-
hash:
|
60
|
+
hash: 1690918052010414443
|
60
61
|
requirements: []
|
61
62
|
rubyforge_project:
|
62
63
|
rubygems_version: 1.8.11
|
63
64
|
signing_key:
|
64
65
|
specification_version: 3
|
65
66
|
summary: reservoir sampling
|
66
|
-
test_files:
|
67
|
+
test_files:
|
68
|
+
- test/dimsum_test.rb
|