dimsum 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -15,3 +15,4 @@ spec/reports
15
15
  test/tmp
16
16
  test/version_tmp
17
17
  tmp
18
+ .rbenv-version
data/Gemfile CHANGED
@@ -2,3 +2,8 @@ source 'https://rubygems.org'
2
2
 
3
3
  # Specify your gem's dependencies in dimsum.gemspec
4
4
  gemspec
5
+
6
+ group :test do
7
+ gem 'ruby-debug19'
8
+ gem 'rake'
9
+ end
data/Rakefile CHANGED
@@ -1,2 +1,12 @@
1
1
  #!/usr/bin/env rake
2
2
  require "bundler/gem_tasks"
3
+ require 'rake/testtask'
4
+
5
+ Rake::TestTask.new do |t|
6
+ t.ruby_opts = ['-I.:lib:test']
7
+ t.libs << 'test'
8
+ t.test_files = ['./test/*_test.rb']
9
+ t.verbose = true
10
+ end
11
+
12
+ task :default => :test
data/bin/dimsum CHANGED
@@ -1,6 +1,7 @@
1
- #!/usr/bin/env ruby
2
-
3
1
  require 'optparse'
2
+ require 'dimsum'
3
+
4
+ include Dimsum
4
5
 
5
6
  options = {}
6
7
 
@@ -11,29 +12,15 @@ OptionParser.new do |opts|
11
12
  end
12
13
  end.parse!
13
14
 
14
- filename = ARGV[0]
15
- lines = options[:lines].to_i
16
- raise ArgumentError, "The filename is required" unless filename
17
-
18
- file_size = `wc -l #{filename}`.strip.to_i
15
+ options[:lines] ||= "5"
19
16
 
20
- last_2 = `tail -c2 #{filename}`
21
- if last_2 == "\n\n"
22
- file_size -= 1
23
- end
24
-
25
- random = Random.new
17
+ lines = options[:lines].to_i
26
18
 
27
- File.open(filename, "r").lines do |line|
28
- if file_size == 0
29
- next
30
- end
31
- r = random.rand(0 .. file_size - 1)
32
- if r < lines
33
- STDOUT.print line
34
- STDOUT.flush
35
- lines -= 1
36
- end
37
- file_size -= 1
38
- end
19
+ reservoir_sample = if ARGV.length > 0
20
+ filename = ARGV[0]
21
+ reservoir_file(filename, lines)
22
+ else
23
+ reservoir_stdin(lines)
24
+ end
39
25
 
26
+ reservoir_sample.each{ |l| STDOUT.print l }
data/dimsum.gemspec CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |gem|
10
10
 
11
11
  gem.files = `git ls-files`.split($\)
12
12
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
13
- gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
13
+ gem.test_files = gem.files.grep(%r{^(test)/})
14
14
  gem.name = "dimsum"
15
15
  gem.require_paths = ["lib"]
16
16
  gem.bindir = 'bin'
data/lib/dimsum.rb CHANGED
@@ -1,5 +1,38 @@
1
1
  require "dimsum/version"
2
2
 
3
3
  module Dimsum
4
- # Your code goes here...
4
+ def reservoir_file(filename, sample_size)
5
+ random = Random.new
6
+ out = []
7
+
8
+ File.open(filename, "r").each_line.each_with_index do |line, line_number|
9
+ keep, index = keep_line_in_index(line_number, sample_size, random)
10
+ out[index] = line if keep
11
+ end
12
+ out
13
+ end
14
+
15
+ def reservoir_stdin(sample_size)
16
+ random = Random.new
17
+ input_line_number = 0
18
+ out = []
19
+
20
+ STDIN.readlines.each do |line|
21
+ keep, index = keep_line_in_index(input_line_number, sample_size, random)
22
+ out[index] = line if keep
23
+ input_line_number += 1
24
+ end
25
+
26
+ out
27
+ end
28
+
29
+ def keep_line_in_index(input_line_number, sample_size, random_number_generator)
30
+ if input_line_number <= sample_size
31
+ [true, input_line_number]
32
+ else
33
+ r = random_number_generator.rand(0 .. input_line_number)
34
+ keep = r < sample_size
35
+ [keep, r]
36
+ end
37
+ end
5
38
  end
@@ -1,3 +1,3 @@
1
1
  module Dimsum
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -0,0 +1,41 @@
1
+ require 'lib/dimsum'
2
+ require 'minitest/autorun'
3
+
4
+ class TestDimsum < MiniTest::Unit::TestCase
5
+ include Dimsum
6
+
7
+ def test_keep_line_in_index__should_keep_any_line_number_less_than_sample_size
8
+ random = MiniTest::Mock.new
9
+ sample_size = 5
10
+
11
+ (0..4).each do |k|
12
+ keep, _ = keep_line_in_index(k, sample_size, random)
13
+ assert keep
14
+ end
15
+ end
16
+
17
+ def test_keep_line_inde_index__should__keep_line_when_index_is_less_than_random
18
+ expected_index = 2
19
+ random = MiniTest::Mock.new
20
+ current_line = 10
21
+ random.expect(:rand, expected_index, [(0 .. current_line)])
22
+ sample_size = 5
23
+
24
+ keep, actual_index = keep_line_in_index(10, sample_size, random)
25
+
26
+ assert keep
27
+ assert_equal expected_index, actual_index
28
+ end
29
+
30
+ def test_keep_line_inde_index__should_not_keep_line_when_index_is_more_or_equal_than_random
31
+ random_value = 8
32
+ random = MiniTest::Mock.new
33
+ current_line = 10
34
+ random.expect(:rand, random_value, [(0 .. current_line)])
35
+ sample_size = 5
36
+ keep, _ = keep_line_in_index(10, sample_size, random)
37
+
38
+ refute keep
39
+ end
40
+ end
41
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dimsum
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2012-06-30 00:00:00.000000000 Z
14
+ date: 2012-07-09 00:00:00.000000000 Z
15
15
  dependencies: []
16
16
  description: dimsum is a very simple ruby script that performs reservoir sampling
17
17
  on the input file.
@@ -33,6 +33,7 @@ files:
33
33
  - dimsum.gemspec
34
34
  - lib/dimsum.rb
35
35
  - lib/dimsum/version.rb
36
+ - test/dimsum_test.rb
36
37
  homepage: ''
37
38
  licenses: []
38
39
  post_install_message:
@@ -47,7 +48,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
47
48
  version: '0'
48
49
  segments:
49
50
  - 0
50
- hash: -1291815841305852171
51
+ hash: 1690918052010414443
51
52
  required_rubygems_version: !ruby/object:Gem::Requirement
52
53
  none: false
53
54
  requirements:
@@ -56,11 +57,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
56
57
  version: '0'
57
58
  segments:
58
59
  - 0
59
- hash: -1291815841305852171
60
+ hash: 1690918052010414443
60
61
  requirements: []
61
62
  rubyforge_project:
62
63
  rubygems_version: 1.8.11
63
64
  signing_key:
64
65
  specification_version: 3
65
66
  summary: reservoir sampling
66
- test_files: []
67
+ test_files:
68
+ - test/dimsum_test.rb