samplelines 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in samplelines.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Bill Dueber
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,68 @@
1
+ # samplelines
2
+
3
+ A simple command-line tool to take sample lines from a file or set of files.
4
+
5
+ ~~~
6
+
7
+ samplelines [options] [filename(s) and/or STDIN and/or STDERR]
8
+ -v, --version print version info
9
+ -h, --help print usage information
10
+ -p, --percent set % chance of a line being output [trumps use of -1]
11
+ -1, --one-in compute odds of outputing a line as 1 out of every N
12
+ -m, --max return at most this many lines
13
+ -t, --time stop running after N seconds
14
+
15
+ ~~~
16
+
17
+ Either `-p` or `-1` is required. Filenames (or the special strings STDIN
18
+ and STDERR) will be processed in the order you specify them.
19
+
20
+ As a convenience, `samplelines` will automatically decompress gzipped files
20
+ whose names end in `.gz`.
22
+
23
+ Note that there is no guarantee of output. If you give a low percentage and a small file, there's a chance nothing will be randomly chosen.
24
+
25
+ ## Examples
26
+
27
+ ~~~
28
+ # Pick about 10% of the lines from bigfile.txt
29
+ samplelines -p 10 bigfile.txt
30
+
31
+ # Ditto, but get a maximum of 10 lines
32
+ samplelines -p 10 -m 10 bigfile.txt
33
+
34
+ # This time, get about one out of every 500 lines
35
+ samplelines -1 500 -m 10 bigfile.txt
36
+
37
+ # Get about 2% of the lines from a set of gzipped files
38
+ samplelines -p 2 *.txt.gz
39
+
40
+ # Ditto, but don't get more than 10_000 lines or
41
+ # run for more than five seconds
42
+ samplelines -p 2 -t 5 -m 10000 *.txt.gz
43
+ ~~~
44
+
45
+
46
+ ## Installation
47
+
48
+ Add this line to your application's Gemfile:
49
+
50
+ gem 'samplelines'
51
+
52
+ And then execute:
53
+
54
+ $ bundle
55
+
56
+ Or install it yourself as:
57
+
58
+ $ gem install samplelines
59
+
60
+
61
+
62
+ ## Contributing
63
+
64
+ 1. Fork it
65
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
66
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
67
+ 4. Push to the branch (`git push origin my-new-feature`)
68
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/bin/samplelines ADDED
@@ -0,0 +1,16 @@
1
#!/usr/bin/env ruby

# Command-line entry point: builds a Samplelines runner from ARGV and
# executes it. Any StandardError raised while running is reported on
# standard error and turned into exit status 1.

require 'samplelines'

sampler = Samplelines.new(ARGV)

begin
  sampler.execute
rescue StandardError => err
  $stderr.puts err.to_s
  exit 1
end
14
+
15
+
16
+
@@ -0,0 +1,132 @@
1
+ require 'slop'
2
+ require 'zlib'
3
+ require 'samplelines/version'
4
+
5
# Command-line driver that prints a random sample of the lines found in
# one or more input files (optionally gzipped) or standard streams.
class Samplelines

  # Presents several input streams as a single enumerable sequence of lines.
  #
  # Each name is opened when the object is constructed. The special names
  # 'STDIN' and 'STDERR' map to $stdin and $stderr; names ending in '.gz'
  # are read through Zlib::GzipReader.
  class MultiFile
    include Enumerable

    attr_accessor :files, :filenames

    # @param filenames [Array<String>] names to read, in order; an empty
    #   list means "read STDIN"
    # @raise [SystemCallError] if a named file cannot be opened
    def initialize(filenames = [])
      filenames = ['STDIN'] if filenames.empty?
      # Record the list actually used, so #filenames and #files stay in step.
      self.filenames = filenames
      self.files = []

      filenames.each { |fn| files.push(open_stream(fn)) }
    rescue StandardError
      # Do not leak the handles opened before the failing name.
      close
      raise
    end

    # Yields every line of every stream, in the order the names were given.
    # Returns an Enumerator when called without a block.
    def each
      return enum_for(:each) unless block_given?

      files.each do |f|
        f.each_line { |l| yield l }
      end
    end

    # Closes every stream this object opened itself. $stdin and $stderr are
    # left alone because they belong to the process, not to this object.
    def close
      files.each do |f|
        next if f.equal?($stdin) || f.equal?($stderr)
        f.close unless f.closed?
      end
    end

    private

    # Maps one name to a readable stream.
    def open_stream(fn)
      case fn
      when 'STDIN'  then $stdin
      when 'STDERR' then $stderr
      else
        fn.end_with?('.gz') ? open_gzip(fn) : File.open(fn, 'r:utf-8')
      end
    end

    # Gzip data is binary, so the underlying file is opened in binary mode.
    def open_gzip(fn)
      raw = File.open(fn, 'rb')
      begin
        Zlib::GzipReader.new(raw)
      rescue StandardError
        raw.close
        raise
      end
    end
  end

  attr_accessor :options, :slop, :orig_argv, :remaining_argv, :input

  # @param argv [Array<String>] command-line arguments; recognised options
  #   are removed from this array in place, leaving the filenames
  def initialize(argv = ARGV)
    self.orig_argv = argv.dup
    self.remaining_argv = argv

    self.slop = self.create_slop!
    self.options = parse_options(self.remaining_argv)
  end

  # Runs the tool: prints version or help to standard error when asked (or
  # when no arguments were given at all), otherwise writes the sampled lines
  # to standard output.
  #
  # @raise [RuntimeError] if neither -p nor -1 was supplied
  # @raise [ArgumentError] if -p or -1 has an unusable value
  def execute
    if options[:version]
      $stderr.puts "Samplelines version #{Samplelines::VERSION}"
      return
    end

    if options[:help] || orig_argv.empty?
      $stderr.puts slop.help
      return
    end

    self.input = Samplelines::MultiFile.new(self.remaining_argv)
    begin
      emit_sample(input, create_picker(options))
    ensure
      input.close
    end
    nil
  end

  # Builds the zero-argument lambda that decides, per line, whether the
  # line is emitted.
  #
  # @param opts [Hash] parsed options; :percent takes precedence over :'one-in'
  # @return [Proc] lambda returning true when the current line is selected
  # @raise [RuntimeError] if neither option is present
  # @raise [ArgumentError] if the chosen option is not a usable number
  def create_picker(opts)
    if opts[:percent]
      # Float parsing keeps fractional percentages such as 0.5 meaningful;
      # integer truncation would turn them into "never".
      percent = parse_percent(opts[:percent])
      -> { rand * 100 < percent }
    elsif opts[:'one-in']
      # rand(0) returns a float below 1, which would select every line, so
      # the divisor is required to be a positive integer.
      out_of = parse_one_in(opts[:'one-in'])
      -> { rand(out_of) < 1 }
    else
      raise "Samplelines must take either -p or -1"
    end
  end

  # Builds the strict Slop option parser describing the command line.
  def create_slop!
    return Slop.new(:strict => true) do
      banner "samplelines [options] [filename(s) and/or STDIN and/or STDERR]"

      on 'v', 'version', 'print version info'
      on 'h', 'help', 'print usage information'
      on 'p', 'percent', 'set % chance of a line being output [trumps use of -1]', :argument => true
      on '1', 'one-in', 'compute odds of outputing a line as 1 out of every N', :argument => true
      on 'm', 'max', 'return at most this many lines', :argument => true
      on 't', 'time', 'stop running after N seconds', :argument => true
    end
  end

  # Parses argv in place. On a Slop parsing error, prints the error and the
  # usage text to standard error and exits the process with status 1.
  #
  # @return [Hash] the parsed options
  def parse_options(argv)
    begin
      self.slop.parse!(argv)
    rescue Slop::Error => e
      $stderr.puts "Error: #{e.message}"
      $stderr.puts "Exiting..."
      $stderr.puts
      $stderr.puts slop.help
      exit 1
    end

    return self.slop.to_hash
  end

  private

  # Prints the selected lines, stopping once the --max count or the --time
  # limit is reached.
  def emit_sample(lines, picker)
    max = options[:max] ? options[:max].to_i : nil
    # A limit of zero (or less) means nothing may be printed at all.
    return if max && max <= 0

    timer_finished = build_timer
    total = 0
    lines.each do |l|
      if picker.call
        total += 1
        print l
      end
      # Checked right after printing so no further input is read once the
      # limit has been reached.
      return if max && total >= max
      return if timer_finished.call
    end
  end

  # Returns a lambda that reports whether the --time limit has elapsed.
  def build_timer
    return -> { false } unless options[:time]

    seconds = options[:time].to_i
    start_time = Time.new
    -> { Time.new - start_time > seconds }
  end

  # Converts the -p value to a Float, rejecting non-numeric input.
  def parse_percent(value)
    Float(value)
  rescue ArgumentError, TypeError
    raise ArgumentError, "-p requires a number, got #{value.inspect}"
  end

  # Converts the -1 value to a positive Integer.
  def parse_one_in(value)
    number = begin
      Integer(value.to_s, 10)
    rescue ArgumentError
      nil
    end
    unless number && number >= 1
      raise ArgumentError, "-1 requires a positive integer, got #{value.inspect}"
    end
    number
  end
end
132
+
@@ -0,0 +1,3 @@
1
class Samplelines
  # Version string of the samplelines gem. Frozen so the shared constant
  # cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,23 @@
1
# coding: utf-8
# Gem specification for samplelines.

# Make lib/ loadable so the version constant can be read while building.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'samplelines/version'

Gem::Specification.new do |spec|
  spec.name          = "samplelines"
  spec.version       = Samplelines::VERSION
  spec.authors       = ["Bill Dueber"]
  spec.email         = ["bill@dueber.com"]
  spec.description   = %q{Simple command line utility to pick a random sample of lines out of the given file(s)}
  spec.summary       = %q{Simple command line utility to pick a random sample of lines out of the given file(s)}
  spec.homepage      = "http://github.com/billdueber/samplelines"
  spec.license       = "MIT"

  # Package every file tracked by git; executables are whatever lives in bin/.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # lib/samplelines.rb requires 'slop' at load time, so it must be installed
  # together with the gem. The 3.x constraint is an assumption based on the
  # Slop.new / on / parse! / to_hash calls in that file -- confirm against
  # the version the gem was developed with.
  spec.add_runtime_dependency "slop", "~> 3.0"

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
end
metadata ADDED
@@ -0,0 +1,91 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: samplelines
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Bill Dueber
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-12-19 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bundler
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '1.3'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '1.3'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rake
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ description: Simple command line utility to pick a random sample of lines out of the
47
+ given file(s)
48
+ email:
49
+ - bill@dueber.com
50
+ executables:
51
+ - samplelines
52
+ extensions: []
53
+ extra_rdoc_files: []
54
+ files:
55
+ - .gitignore
56
+ - Gemfile
57
+ - LICENSE.txt
58
+ - README.md
59
+ - Rakefile
60
+ - bin/samplelines
61
+ - lib/samplelines.rb
62
+ - lib/samplelines/version.rb
63
+ - samplelines.gemspec
64
+ homepage: http://github.com/billdueber/samplelines
65
+ licenses:
66
+ - MIT
67
+ post_install_message:
68
+ rdoc_options: []
69
+ require_paths:
70
+ - lib
71
+ required_ruby_version: !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ! '>='
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ required_rubygems_version: !ruby/object:Gem::Requirement
78
+ none: false
79
+ requirements:
80
+ - - ! '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ requirements: []
84
+ rubyforge_project:
85
+ rubygems_version: 1.8.23
86
+ signing_key:
87
+ specification_version: 3
88
+ summary: Simple command line utility to pick a random sample of lines out of the given
89
+ file(s)
90
+ test_files: []
91
+ has_rdoc: