dimsum 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/Gemfile +5 -0
- data/Rakefile +10 -0
- data/bin/dimsum +12 -25
- data/dimsum.gemspec +1 -1
- data/lib/dimsum.rb +34 -1
- data/lib/dimsum/version.rb +1 -1
- data/test/dimsum_test.rb +41 -0
- metadata +7 -5
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/Rakefile
CHANGED
data/bin/dimsum
CHANGED
@@ -1,6 +1,7 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
1
|
require 'optparse'
|
2
|
+
require 'dimsum'
|
3
|
+
|
4
|
+
include Dimsum
|
4
5
|
|
5
6
|
options = {}
|
6
7
|
|
@@ -11,29 +12,15 @@ OptionParser.new do |opts|
|
|
11
12
|
end
|
12
13
|
end.parse!
|
13
14
|
|
14
|
-
|
15
|
-
lines = options[:lines].to_i
|
16
|
-
raise ArgumentError, "The filename is required" unless filename
|
17
|
-
|
18
|
-
file_size = `wc -l #{filename}`.strip.to_i
|
15
|
+
options[:lines] ||= "5"
|
19
16
|
|
20
|
-
|
21
|
-
if last_2 == "\n\n"
|
22
|
-
file_size -= 1
|
23
|
-
end
|
24
|
-
|
25
|
-
random = Random.new
|
17
|
+
lines = options[:lines].to_i
|
26
18
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
STDOUT.print line
|
34
|
-
STDOUT.flush
|
35
|
-
lines -= 1
|
36
|
-
end
|
37
|
-
file_size -= 1
|
38
|
-
end
|
19
|
+
reservoir_sample = if ARGV.length > 0
|
20
|
+
filename = ARGV[0]
|
21
|
+
reservoir_file(filename, lines)
|
22
|
+
else
|
23
|
+
reservoir_stdin(lines)
|
24
|
+
end
|
39
25
|
|
26
|
+
reservoir_sample.each{ |l| STDOUT.print l }
|
data/dimsum.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |gem|
|
|
10
10
|
|
11
11
|
gem.files = `git ls-files`.split($\)
|
12
12
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
13
|
-
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
13
|
+
gem.test_files = gem.files.grep(%r{^(test)/})
|
14
14
|
gem.name = "dimsum"
|
15
15
|
gem.require_paths = ["lib"]
|
16
16
|
gem.bindir = 'bin'
|
data/lib/dimsum.rb
CHANGED
@@ -1,5 +1,38 @@
|
|
1
1
|
require "dimsum/version"
|
2
2
|
|
3
3
|
module Dimsum
  # Draws a random sample of at most +sample_size+ lines from a file using
  # reservoir sampling, reading the file one line at a time.
  #
  # @param filename [String] path of the file to sample
  # @param sample_size [Integer] maximum number of lines to return
  # @return [Array<String>] the sampled lines, line terminators included;
  #   holds every line when the file has fewer than +sample_size+ lines
  def reservoir_file(filename, sample_size)
    # The block form closes the file handle even when sampling raises.
    File.open(filename, "r") do |file|
      reservoir_lines(file.each_line, sample_size)
    end
  end

  # Draws a random sample of at most +sample_size+ lines from standard input
  # using reservoir sampling. Input is consumed as a stream, so it is never
  # held in memory as a whole.
  #
  # @param sample_size [Integer] maximum number of lines to return
  # @return [Array<String>] the sampled lines, line terminators included
  def reservoir_stdin(sample_size)
    reservoir_lines($stdin.each_line, sample_size)
  end

  # Decides whether the line at +input_line_number+ (zero-based) enters the
  # reservoir, and at which slot.
  #
  # The first +sample_size+ lines fill slots 0...sample_size directly. Every
  # later line draws a random slot in 0..input_line_number and is kept only
  # when that slot falls inside the reservoir.
  #
  # @param input_line_number [Integer] zero-based position of the line
  # @param sample_size [Integer] capacity of the reservoir
  # @param random_number_generator [#rand] object whose +rand+ accepts a Range
  # @return [Array(Boolean, Integer)] +[keep, index]+; +index+ is only
  #   meaningful as a reservoir slot when +keep+ is true
  def keep_line_in_index(input_line_number, sample_size, random_number_generator)
    # Strictly less-than: line numbers are zero-based, so the fill phase must
    # stop at sample_size - 1 or the reservoir grows to sample_size + 1 slots.
    if input_line_number < sample_size
      [true, input_line_number]
    else
      r = random_number_generator.rand(0..input_line_number)
      [r < sample_size, r]
    end
  end

  private

  # Runs the reservoir over any enumerator of lines; shared by the file and
  # standard-input entry points.
  def reservoir_lines(lines, sample_size)
    random = Random.new
    lines.each_with_index.each_with_object([]) do |(line, line_number), sample|
      keep, index = keep_line_in_index(line_number, sample_size, random)
      sample[index] = line if keep
    end
  end
end
|
data/lib/dimsum/version.rb
CHANGED
data/test/dimsum_test.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'lib/dimsum'
|
2
|
+
require 'minitest/autorun'
|
3
|
+
|
4
|
+
# Unit tests for Dimsum#keep_line_in_index, the decision step of the
# reservoir sampler. The random number generator is replaced by a mock so
# each outcome is deterministic.
class TestDimsum < MiniTest::Unit::TestCase
  include Dimsum

  # Lines numbered below the sample size are kept at their own position
  # without asking the generator for a number.
  def test_keep_line_in_index__should_keep_any_line_number_less_than_sample_size
    random = MiniTest::Mock.new
    sample_size = 5

    (0..4).each do |k|
      keep, index = keep_line_in_index(k, sample_size, random)
      assert keep
      assert_equal k, index
    end
  end

  # A later line is kept, at the drawn index, when that index is smaller
  # than the sample size.
  def test_keep_line_in_index__should_keep_line_when_random_index_is_less_than_sample_size
    expected_index = 2
    random = MiniTest::Mock.new
    current_line = 10
    random.expect(:rand, expected_index, [(0..current_line)])
    sample_size = 5

    keep, actual_index = keep_line_in_index(current_line, sample_size, random)

    assert keep
    assert_equal expected_index, actual_index
    # Confirms the generator was asked for a number in the expected range.
    random.verify
  end

  # A later line is dropped when the drawn index is at or beyond the sample
  # size.
  def test_keep_line_in_index__should_not_keep_line_when_random_index_is_more_or_equal_than_sample_size
    random_value = 8
    random = MiniTest::Mock.new
    current_line = 10
    random.expect(:rand, random_value, [(0..current_line)])
    sample_size = 5

    keep, _ = keep_line_in_index(current_line, sample_size, random)

    refute keep
    random.verify
  end
end
|
41
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dimsum
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2012-
|
14
|
+
date: 2012-07-09 00:00:00.000000000 Z
|
15
15
|
dependencies: []
|
16
16
|
description: dimsum is a very simple ruby script that performs reservoir sampling
|
17
17
|
on the input file.
|
@@ -33,6 +33,7 @@ files:
|
|
33
33
|
- dimsum.gemspec
|
34
34
|
- lib/dimsum.rb
|
35
35
|
- lib/dimsum/version.rb
|
36
|
+
- test/dimsum_test.rb
|
36
37
|
homepage: ''
|
37
38
|
licenses: []
|
38
39
|
post_install_message:
|
@@ -47,7 +48,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
47
48
|
version: '0'
|
48
49
|
segments:
|
49
50
|
- 0
|
50
|
-
hash:
|
51
|
+
hash: 1690918052010414443
|
51
52
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
53
|
none: false
|
53
54
|
requirements:
|
@@ -56,11 +57,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
56
57
|
version: '0'
|
57
58
|
segments:
|
58
59
|
- 0
|
59
|
-
hash:
|
60
|
+
hash: 1690918052010414443
|
60
61
|
requirements: []
|
61
62
|
rubyforge_project:
|
62
63
|
rubygems_version: 1.8.11
|
63
64
|
signing_key:
|
64
65
|
specification_version: 3
|
65
66
|
summary: reservoir sampling
|
66
|
-
test_files:
|
67
|
+
test_files:
|
68
|
+
- test/dimsum_test.rb
|