dimsum 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -15,3 +15,4 @@ spec/reports
15
15
  test/tmp
16
16
  test/version_tmp
17
17
  tmp
18
+ .rbenv-version
data/Gemfile CHANGED
@@ -2,3 +2,8 @@ source 'https://rubygems.org'
2
2
 
3
3
  # Specify your gem's dependencies in dimsum.gemspec
4
4
  gemspec
5
+
6
+ group :test do
7
+ gem 'ruby-debug19'
8
+ gem 'rake'
9
+ end
data/Rakefile CHANGED
@@ -1,2 +1,12 @@
1
1
  #!/usr/bin/env rake
2
2
  require "bundler/gem_tasks"
3
+ require 'rake/testtask'
4
+
5
+ Rake::TestTask.new do |t|
6
+ t.ruby_opts = ['-I.:lib:test']
7
+ t.libs << 'test'
8
+ t.test_files = ['./test/*_test.rb']
9
+ t.verbose = true
10
+ end
11
+
12
+ task :default => :test
data/bin/dimsum CHANGED
@@ -1,6 +1,7 @@
1
- #!/usr/bin/env ruby
2
-
3
1
  require 'optparse'
2
+ require 'dimsum'
3
+
4
+ include Dimsum
4
5
 
5
6
  options = {}
6
7
 
@@ -11,29 +12,15 @@ OptionParser.new do |opts|
11
12
  end
12
13
  end.parse!
13
14
 
14
- filename = ARGV[0]
15
- lines = options[:lines].to_i
16
- raise ArgumentError, "The filename is required" unless filename
17
-
18
- file_size = `wc -l #{filename}`.strip.to_i
15
+ options[:lines] ||= "5"
19
16
 
20
- last_2 = `tail -c2 #{filename}`
21
- if last_2 == "\n\n"
22
- file_size -= 1
23
- end
24
-
25
- random = Random.new
17
+ lines = options[:lines].to_i
26
18
 
27
- File.open(filename, "r").lines do |line|
28
- if file_size == 0
29
- next
30
- end
31
- r = random.rand(0 .. file_size - 1)
32
- if r < lines
33
- STDOUT.print line
34
- STDOUT.flush
35
- lines -= 1
36
- end
37
- file_size -= 1
38
- end
19
+ reservoir_sample = if ARGV.length > 0
20
+ filename = ARGV[0]
21
+ reservoir_file(filename, lines)
22
+ else
23
+ reservoir_stdin(lines)
24
+ end
39
25
 
26
+ reservoir_sample.each{ |l| STDOUT.print l }
data/dimsum.gemspec CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |gem|
10
10
 
11
11
  gem.files = `git ls-files`.split($\)
12
12
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
13
- gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
13
+ gem.test_files = gem.files.grep(%r{^(test)/})
14
14
  gem.name = "dimsum"
15
15
  gem.require_paths = ["lib"]
16
16
  gem.bindir = 'bin'
data/lib/dimsum.rb CHANGED
@@ -1,5 +1,38 @@
1
1
  require "dimsum/version"
2
2
 
3
3
  module Dimsum
4
- # Your code goes here...
4
+ def reservoir_file(filename, sample_size)
5
+ random = Random.new
6
+ out = []
7
+
8
+ File.open(filename, "r").each_line.each_with_index do |line, line_number|
9
+ keep, index = keep_line_in_index(line_number, sample_size, random)
10
+ out[index] = line if keep
11
+ end
12
+ out
13
+ end
14
+
15
+ def reservoir_stdin(sample_size)
16
+ random = Random.new
17
+ input_line_number = 0
18
+ out = []
19
+
20
+ STDIN.readlines.each do |line|
21
+ keep, index = keep_line_in_index(input_line_number, sample_size, random)
22
+ out[index] = line if keep
23
+ input_line_number += 1
24
+ end
25
+
26
+ out
27
+ end
28
+
29
+ def keep_line_in_index(input_line_number, sample_size, random_number_generator)
30
+ if input_line_number <= sample_size
31
+ [true, input_line_number]
32
+ else
33
+ r = random_number_generator.rand(0 .. input_line_number)
34
+ keep = r < sample_size
35
+ [keep, r]
36
+ end
37
+ end
5
38
  end
@@ -1,3 +1,3 @@
1
1
  module Dimsum
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -0,0 +1,41 @@
1
+ require 'lib/dimsum'
2
+ require 'minitest/autorun'
3
+
4
# Unit tests for Dimsum#keep_line_in_index, using a mock in place of the
# random number generator so each outcome is deterministic.
class TestDimsum < MiniTest::Unit::TestCase
  include Dimsum

  # Lines numbered below the sample size are kept at their own index and
  # never touch the random number generator (the mock has no expectations).
  def test_keep_line_in_index__should_keep_any_line_number_less_than_sample_size
    random = MiniTest::Mock.new
    sample_size = 5

    (0..4).each do |k|
      keep, index = keep_line_in_index(k, sample_size, random)
      assert keep
      assert_equal k, index
    end
  end

  # Past the sample size, a random draw below the sample size keeps the line
  # and the draw is returned as the slot index.
  def test_keep_line_in_index__should_keep_line_when_random_is_less_than_sample_size
    expected_index = 2
    random = MiniTest::Mock.new
    current_line = 10
    random.expect(:rand, expected_index, [(0 .. current_line)])
    sample_size = 5

    keep, actual_index = keep_line_in_index(current_line, sample_size, random)

    assert keep
    assert_equal expected_index, actual_index
    # Fail if the generator was never asked for a number.
    random.verify
  end

  # Past the sample size, a random draw at or above the sample size drops
  # the line.
  def test_keep_line_in_index__should_not_keep_line_when_random_is_more_or_equal_than_sample_size
    random_value = 8
    random = MiniTest::Mock.new
    current_line = 10
    random.expect(:rand, random_value, [(0 .. current_line)])
    sample_size = 5

    keep, _ = keep_line_in_index(current_line, sample_size, random)

    refute keep
    # Fail if the generator was never asked for a number.
    random.verify
  end
end
41
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dimsum
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2012-06-30 00:00:00.000000000 Z
14
+ date: 2012-07-09 00:00:00.000000000 Z
15
15
  dependencies: []
16
16
  description: dimsum is a very simple ruby script that performs reservoir sampling
17
17
  on the input file.
@@ -33,6 +33,7 @@ files:
33
33
  - dimsum.gemspec
34
34
  - lib/dimsum.rb
35
35
  - lib/dimsum/version.rb
36
+ - test/dimsum_test.rb
36
37
  homepage: ''
37
38
  licenses: []
38
39
  post_install_message:
@@ -47,7 +48,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
47
48
  version: '0'
48
49
  segments:
49
50
  - 0
50
- hash: -1291815841305852171
51
+ hash: 1690918052010414443
51
52
  required_rubygems_version: !ruby/object:Gem::Requirement
52
53
  none: false
53
54
  requirements:
@@ -56,11 +57,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
56
57
  version: '0'
57
58
  segments:
58
59
  - 0
59
- hash: -1291815841305852171
60
+ hash: 1690918052010414443
60
61
  requirements: []
61
62
  rubyforge_project:
62
63
  rubygems_version: 1.8.11
63
64
  signing_key:
64
65
  specification_version: 3
65
66
  summary: reservoir sampling
66
- test_files: []
67
+ test_files:
68
+ - test/dimsum_test.rb