samplelines 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +68 -0
- data/Rakefile +1 -0
- data/bin/samplelines +16 -0
- data/lib/samplelines.rb +132 -0
- data/lib/samplelines/version.rb +3 -0
- data/samplelines.gemspec +23 -0
- metadata +91 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Bill Dueber
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
# samplelines
|
2
|
+
|
3
|
+
A simple command-line tool to take a random sample of lines from a file or set of files.
|
4
|
+
|
5
|
+
~~~
|
6
|
+
|
7
|
+
samplelines [options] [filename(s) and/or STDIN and/or STDERR]
|
8
|
+
-v, --version print version info
|
9
|
+
-h, --help print usage information
|
10
|
+
-p, --percent set % chance of a line being output [trumps use of -1]
|
11
|
+
-1, --one-in compute odds of outputing a line as 1 out of every N
|
12
|
+
-m, --max return at most this many lines
|
13
|
+
-t, --time stop running after N seconds
|
14
|
+
|
15
|
+
~~~
|
16
|
+
|
17
|
+
Either `-p` or `-1` is required. Filenames (or the special strings STDIN
|
18
|
+
and STDERR) will be processed in the order you specify them.
|
19
|
+
|
20
|
+
As a convenience, `samplelines` will automatically deal with gzipped files
|
21
|
+
that end in `.gz`.
|
22
|
+
|
23
|
+
Note that there is no guarantee of output. If you give a low percentage and a small file, there's a chance nothing will be randomly chosen.
|
24
|
+
|
25
|
+
## Examples
|
26
|
+
|
27
|
+
~~~
|
28
|
+
# Pick about 10% of the lines from bigfile.txt
|
29
|
+
samplelines -p 10 bigfile.txt
|
30
|
+
|
31
|
+
# Ditto, but get a maximum of 100 lines
|
32
|
+
samplelines -p 10 -m 100 bigfile.txt
|
33
|
+
|
34
|
+
# This time, get about one out of every 500 lines
|
35
|
+
samplelines -1 500 -m 10 bigfile.txt
|
36
|
+
|
37
|
+
# Get about 2% of the lines from a set of gzipped files
|
38
|
+
samplelines -p 2 *.txt.gz
|
39
|
+
|
40
|
+
# Ditto, but don't get more than 10_000 lines or
|
41
|
+
# run for more than five seconds
|
42
|
+
samplelines -p 2 -t 5 -m 10000 *.txt.gz
|
43
|
+
~~~
|
44
|
+
|
45
|
+
|
46
|
+
## Installation
|
47
|
+
|
48
|
+
Add this line to your application's Gemfile:
|
49
|
+
|
50
|
+
gem 'samplelines'
|
51
|
+
|
52
|
+
And then execute:
|
53
|
+
|
54
|
+
$ bundle
|
55
|
+
|
56
|
+
Or install it yourself as:
|
57
|
+
|
58
|
+
$ gem install samplelines
|
59
|
+
|
60
|
+
|
61
|
+
|
62
|
+
## Contributing
|
63
|
+
|
64
|
+
1. Fork it
|
65
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
66
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
67
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
68
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/bin/samplelines
ADDED
data/lib/samplelines.rb
ADDED
@@ -0,0 +1,132 @@
|
|
1
|
+
require 'slop'
|
2
|
+
require 'zlib'
|
3
|
+
require 'samplelines/version'
|
4
|
+
|
5
|
+
# Command-line line sampler: reads lines from one or more inputs and prints
# each line with a configurable probability, optionally stopping after a
# maximum number of printed lines or after a time limit.
class Samplelines

  # Presents several input sources (named files, gzipped files, or the
  # special names 'STDIN' / 'STDERR') as one enumerable stream of lines.
  class MultiFile
    include Enumerable

    attr_accessor :files, :filenames

    # Opens every source up front, in the order given.
    #
    # filenames - Array of path strings; the literal strings 'STDIN' and
    #             'STDERR' select $stdin and $stderr. An empty array means
    #             "read from $stdin".
    #
    # Re-raises whatever File.open / Zlib::GzipReader.new raises, after
    # closing any files that were already opened.
    def initialize(filenames = [])
      self.filenames = filenames
      self.files = []
      sources = filenames.empty? ? ['STDIN'] : filenames
      sources.each { |fn| files.push(open_source(fn)) }
    rescue StandardError
      close
      raise
    end

    # Yields every line of every source, in order. Returns an Enumerator
    # when called without a block.
    def each
      return enum_for(:each) unless block_given?

      files.each do |f|
        f.each_line { |l| yield l }
      end
    end

    # Closes every stream this object opened itself. $stdin and $stderr are
    # left alone because the process still owns them. Safe to call twice.
    def close
      files.each do |f|
        next if f.equal?($stdin) || f.equal?($stderr)
        f.close unless f.closed?
      end
    end

    private

    # Maps one command-line name to a readable stream.
    def open_source(fn)
      case fn
      when 'STDIN'
        $stdin
      when 'STDERR'
        $stderr
      else
        if fn =~ /\.gz\Z/
          # Gzip data is binary: open it raw so no text-mode or encoding
          # layer sits between the file and the decompressor.
          raw = File.open(fn, 'rb')
          begin
            Zlib::GzipReader.new(raw)
          rescue StandardError
            raw.close
            raise
          end
        else
          File.open(fn, 'r:utf-8')
        end
      end
    end
  end

  attr_accessor :options, :slop, :orig_argv, :remaining_argv, :input

  # argv - the argument list to parse. It is parsed with Slop#parse!, which
  #        is handed this same array; a copy of the untouched list is kept
  #        in orig_argv.
  def initialize(argv = ARGV)
    self.orig_argv = argv.dup
    self.remaining_argv = argv

    self.slop = create_slop!
    self.options = parse_options(remaining_argv)
  end

  # Runs the sampler: prints version or help to $stderr when asked (or when
  # no arguments were given at all), otherwise prints the sampled lines to
  # standard output.
  #
  # Raises RuntimeError when neither :percent nor :'one-in' is set.
  def execute
    if options[:version]
      $stderr.puts "Samplelines version #{Samplelines::VERSION}"
      return
    end

    if options[:help] || orig_argv.empty?
      $stderr.puts slop.help
      return
    end

    picker = create_picker(options)
    max = options[:max] ? options[:max].to_i : nil
    # A limit of zero (or less) can never be satisfied by printing a line,
    # so there is nothing to do.
    return if max && max <= 0

    timer_finished = create_timer(options[:time])

    self.input = Samplelines::MultiFile.new(remaining_argv)
    total = 0
    begin
      input.each do |l|
        if picker.call
          total += 1
          print l
        end
        return if max && total >= max
        return if timer_finished.call
      end
    ensure
      # Runs on normal completion, on the early returns above, and on errors.
      input.close
    end
  end

  # Builds a zero-argument lambda that returns true with the configured
  # probability each time it is called.
  #
  # opts - Hash with :percent (chance out of 100; fractions such as "0.5"
  #        are honored) and/or :'one-in' (N, for a 1-in-N chance). :percent
  #        wins when both are present.
  #
  # Raises RuntimeError when neither key is set.
  def create_picker(opts)
    chance = if opts[:percent]
               opts[:percent].to_f / 100.0
             elsif opts[:'one-in']
               # N == 0 divides to Infinity, i.e. every line is picked.
               1.0 / opts[:'one-in'].to_i.abs
             else
               raise "Samplelines must take either -p or -1"
             end

    -> { rand < chance }
  end

  # Builds the Slop option parser describing the command-line interface.
  def create_slop!
    Slop.new(:strict => true) do
      banner "samplelines [options] [filename(s) and/or STDIN and/or STDERR]"

      on 'v', 'version', 'print version info'
      on 'h', 'help', 'print usage information'
      on 'p', 'percent', 'set % chance of a line being output [trumps use of -1]', :argument => true
      on '1', 'one-in', 'compute odds of outputing a line as 1 out of every N', :argument => true
      on 'm', 'max', 'return at most this many lines', :argument => true
      on 't', 'time', 'stop running after N seconds', :argument => true
    end
  end

  # Parses argv with the Slop parser and returns the options as a Hash.
  # On a Slop::Error, prints the error and usage to $stderr and exits with
  # status 1.
  def parse_options(argv)
    begin
      slop.parse!(argv)
    rescue Slop::Error => e
      $stderr.puts "Error: #{e.message}"
      $stderr.puts "Exiting..."
      $stderr.puts
      $stderr.puts slop.help
      exit 1
    end

    slop.to_hash
  end

  private

  # Builds a zero-argument lambda that reports whether the time limit has
  # elapsed. With no limit it always returns false. Uses the monotonic clock
  # so wall-clock adjustments cannot shorten or extend the run.
  def create_timer(limit)
    return -> { false } unless limit

    seconds = limit.to_f
    started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    -> { Process.clock_gettime(Process::CLOCK_MONOTONIC) - started > seconds }
  end
end
|
132
|
+
|
data/samplelines.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for samplelines. Puts ./lib on the load path so the
# version constant can be read without the gem being installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'samplelines/version'

Gem::Specification.new do |spec|
  spec.name          = "samplelines"
  spec.version       = Samplelines::VERSION
  spec.authors       = ["Bill Dueber"]
  spec.email         = ["bill@dueber.com"]
  spec.description   = %q{Simple command line utility to pick a random sample of lines out of the given file(s)}
  spec.summary       = %q{Simple command line utility to pick a random sample of lines out of the given file(s)}
  spec.homepage      = "http://github.com/billdueber/samplelines"
  spec.license       = "MIT"

  # Package exactly what git tracks; anything under bin/ becomes an executable.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # lib/samplelines.rb does `require 'slop'`, so slop must be installed
  # together with the gem, not only in development.
  # NOTE(review): the Slop.new(:strict => true) { on ... } / parse! / to_hash
  # calls look like the Slop 3.x API -- confirm the version constraint.
  spec.add_dependency "slop", "~> 3.0"

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
end
|
metadata
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: samplelines
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Bill Dueber
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-12-19 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: bundler
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1.3'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.3'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rake
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
description: Simple command line utility to pick a random sample of lines out of the
|
47
|
+
given file(s)
|
48
|
+
email:
|
49
|
+
- bill@dueber.com
|
50
|
+
executables:
|
51
|
+
- samplelines
|
52
|
+
extensions: []
|
53
|
+
extra_rdoc_files: []
|
54
|
+
files:
|
55
|
+
- .gitignore
|
56
|
+
- Gemfile
|
57
|
+
- LICENSE.txt
|
58
|
+
- README.md
|
59
|
+
- Rakefile
|
60
|
+
- bin/samplelines
|
61
|
+
- lib/samplelines.rb
|
62
|
+
- lib/samplelines/version.rb
|
63
|
+
- samplelines.gemspec
|
64
|
+
homepage: http://github.com/billdueber/samplelines
|
65
|
+
licenses:
|
66
|
+
- MIT
|
67
|
+
post_install_message:
|
68
|
+
rdoc_options: []
|
69
|
+
require_paths:
|
70
|
+
- lib
|
71
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
73
|
+
requirements:
|
74
|
+
- - ! '>='
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0'
|
77
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
78
|
+
none: false
|
79
|
+
requirements:
|
80
|
+
- - ! '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
requirements: []
|
84
|
+
rubyforge_project:
|
85
|
+
rubygems_version: 1.8.23
|
86
|
+
signing_key:
|
87
|
+
specification_version: 3
|
88
|
+
summary: Simple command line utility to pick a random sample of lines out of the given
|
89
|
+
file(s)
|
90
|
+
test_files: []
|
91
|
+
has_rdoc:
|