masticate 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/Guardfile +10 -0
- data/bin/masticate +13 -16
- data/lib/masticate/csvify.rb +25 -6
- data/lib/masticate/mender.rb +37 -23
- data/lib/masticate/sniffer.rb +15 -15
- data/lib/masticate/version.rb +1 -1
- data/lib/masticate.rb +12 -10
- data/masticate.gemspec +3 -1
- data/spec/data/broken_psv.txt +4 -4
- data/spec/data/junk_trailer.txt +8 -0
- data/spec/lib/csvify_spec.rb +17 -0
- data/spec/lib/mend_spec.rb +20 -0
- data/spec/{spec → lib}/sniffer_spec.rb +4 -6
- metadata +37 -12
- data/spec/spec/csvify_spec.rb +0 -14
- data/spec/spec/mend_spec.rb +0 -14
data/Guardfile
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
# A sample Guardfile
|
2
|
+
# More info at https://github.com/guard/guard#readme
|
3
|
+
|
4
|
+
guard 'rspec', :version => 2 do
|
5
|
+
watch(%r{^spec/.+_spec\.rb$})
|
6
|
+
watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
|
7
|
+
watch('spec/spec_helper.rb') { "spec" }
|
8
|
+
|
9
|
+
watch(%r{^spec/support/(.+)\.rb$}) { "spec" }
|
10
|
+
end
|
data/bin/masticate
CHANGED
@@ -1,31 +1,25 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
3
|
+
require_relative "../lib/masticate"
|
4
4
|
|
5
5
|
command, filename = ARGV
|
6
6
|
|
7
7
|
case ARGV.shift
|
8
8
|
when 'sniff'
|
9
|
-
|
10
|
-
results = Masticate.sniff(file)
|
9
|
+
results = Masticate.sniff(filename)
|
11
10
|
col_sep = results[:col_sep]
|
12
11
|
col_sep = "TAB" if col_sep == "\t"
|
13
|
-
file.close
|
14
12
|
$stderr.puts <<-EOT
|
15
13
|
Processing complete.
|
16
14
|
Input delimiter: #{col_sep}
|
17
|
-
Field counts: #{results[:field_counts].
|
15
|
+
Field counts: #{results[:field_counts].inspect}
|
18
16
|
EOT
|
19
17
|
|
20
18
|
when 'mend'
|
21
|
-
|
22
|
-
metadata = Masticate.sniff(file)
|
19
|
+
metadata = Masticate.sniff(filename)
|
23
20
|
col_sep = metadata[:col_sep]
|
24
21
|
col_sep = "TAB" if col_sep == "\t"
|
25
|
-
|
26
|
-
file = File.open(filename)
|
27
|
-
results = Masticate.mend(file, metadata.merge(:output => $stdout))
|
28
|
-
file.close
|
22
|
+
results = Masticate.mend(filename, metadata)
|
29
23
|
$stderr.puts <<-EOT
|
30
24
|
Processing complete.
|
31
25
|
Input delimiter: #{col_sep}
|
@@ -34,11 +28,14 @@ Processing complete.
|
|
34
28
|
EOT
|
35
29
|
|
36
30
|
when 'csvify'
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
31
|
+
metadata = Masticate.sniff(filename)
|
32
|
+
results = Masticate.csvify(filename, metadata)
|
33
|
+
$stderr.puts <<-EOT
|
34
|
+
Processing complete.
|
35
|
+
Input delimiter: #{metadata[:col_sep]}
|
36
|
+
Lines in input: #{results[:input_count]}
|
37
|
+
Lines in output: #{results[:output_count]}
|
38
|
+
EOT
|
42
39
|
|
43
40
|
else
|
44
41
|
raise "unknown command #{command}"
|
data/lib/masticate/csvify.rb
CHANGED
@@ -2,19 +2,38 @@
|
|
2
2
|
require "csv"
|
3
3
|
|
4
4
|
class Masticate::Csvify
|
5
|
-
attr_reader :
|
5
|
+
attr_reader :input
|
6
6
|
|
7
|
-
def initialize(
|
8
|
-
@
|
7
|
+
def initialize(filename)
|
8
|
+
@input = File.open(filename)
|
9
9
|
end
|
10
10
|
|
11
11
|
def csvify(opts)
|
12
|
+
@output = opts[:output] ? File.open(opts[:output], "w") : $stdout
|
12
13
|
csv_options = {}
|
13
14
|
csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
|
14
15
|
csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]
|
15
|
-
|
16
|
-
|
17
|
-
|
16
|
+
|
17
|
+
input_count = @output_count = 0
|
18
|
+
CSV.foreach(input, csv_options) do |row|
|
19
|
+
input_count += 1
|
20
|
+
emit(row.to_csv)
|
21
|
+
end
|
22
|
+
@output.close if opts[:output]
|
23
|
+
@input.close
|
24
|
+
{
|
25
|
+
:input_count => input_count,
|
26
|
+
:output_count => @output_count
|
27
|
+
}
|
28
|
+
end
|
29
|
+
|
30
|
+
def emit(line)
|
31
|
+
@output_count += 1
|
32
|
+
begin
|
33
|
+
@output.puts line
|
34
|
+
rescue Errno::EPIPE
|
35
|
+
# output was closed, e.g. ran piped into `head`
|
36
|
+
# silently ignore this condition, it's not fatal and doesn't need a warning
|
18
37
|
end
|
19
38
|
end
|
20
39
|
end
|
data/lib/masticate/mender.rb
CHANGED
@@ -4,46 +4,60 @@
|
|
4
4
|
# (due to a newline embedded in a field). Glue those two lines into a single line in the output.
|
5
5
|
|
6
6
|
class Masticate::Mender
|
7
|
-
attr_reader :
|
7
|
+
attr_reader :input
|
8
8
|
|
9
|
-
def initialize(
|
10
|
-
@
|
9
|
+
def initialize(filename)
|
10
|
+
@input = open(filename)
|
11
11
|
end
|
12
12
|
|
13
13
|
def mend(opts)
|
14
|
-
output = opts[:output]
|
14
|
+
@output = opts[:output] ? File.open(opts[:output], "w") : $stdout
|
15
15
|
col_sep = opts[:col_sep]
|
16
16
|
|
17
|
-
|
18
|
-
@input_count = output_count = 0
|
17
|
+
expected_delim_count = nil
|
18
|
+
@input_count = @output_count = 0
|
19
19
|
while (line = get) do
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
20
|
+
unless line =~ /^\s*$/
|
21
|
+
if !expected_delim_count
|
22
|
+
# trust the first row
|
23
|
+
expected_delim_count = line.count(col_sep)
|
24
|
+
else
|
25
|
+
running_count = line.count(col_sep)
|
26
|
+
while !input.eof? && running_count < expected_delim_count do
|
27
|
+
nextbit = get
|
28
|
+
if nextbit
|
29
|
+
line = line + ' ' + nextbit
|
30
|
+
running_count = line.count(col_sep)
|
31
|
+
end
|
32
32
|
end
|
33
33
|
end
|
34
|
+
if line.count(col_sep) > 2
|
35
|
+
emit(line)
|
36
|
+
end
|
34
37
|
end
|
35
|
-
output_count += 1
|
36
|
-
output << line
|
37
38
|
end
|
38
39
|
|
40
|
+
@input.close
|
41
|
+
@output.close if opts[:output]
|
39
42
|
{
|
40
43
|
:input_records => @input_count,
|
41
|
-
:output_records => output_count
|
44
|
+
:output_records => @output_count
|
42
45
|
}
|
43
46
|
end
|
44
47
|
|
45
48
|
def get
|
46
|
-
|
47
|
-
|
49
|
+
line = input.gets
|
50
|
+
@input_count += 1
|
51
|
+
line && line.chomp
|
52
|
+
end
|
53
|
+
|
54
|
+
def emit(line)
|
55
|
+
@output_count += 1
|
56
|
+
begin
|
57
|
+
@output.puts line
|
58
|
+
rescue Errno::EPIPE
|
59
|
+
# output was closed, e.g. ran piped into `head`
|
60
|
+
# silently ignore this condition, it's not fatal and doesn't need a warning
|
61
|
+
end
|
48
62
|
end
|
49
63
|
end
|
data/lib/masticate/sniffer.rb
CHANGED
@@ -1,32 +1,29 @@
|
|
1
1
|
class Masticate::Sniffer
|
2
|
-
attr_reader :
|
3
|
-
attr_reader :col_sep
|
2
|
+
attr_reader :col_sep, :stats
|
4
3
|
|
5
4
|
CandidateDelimiters = [',', '|', "\t"]
|
6
5
|
|
7
|
-
def initialize(
|
8
|
-
@
|
9
|
-
end
|
10
|
-
|
11
|
-
def self.sniff(file)
|
12
|
-
sniffer = new(file)
|
13
|
-
sniffer.sniff
|
6
|
+
def initialize(filename)
|
7
|
+
@filename = filename
|
14
8
|
end
|
15
9
|
|
16
10
|
def sniff
|
17
11
|
@col_sep = find_col_sep
|
12
|
+
@stats = stats
|
18
13
|
{
|
19
|
-
:col_sep => col_sep,
|
20
|
-
:field_counts => stats
|
14
|
+
:col_sep => @col_sep,
|
15
|
+
:field_counts => @stats,
|
16
|
+
:line1 => @line1
|
21
17
|
}
|
22
18
|
end
|
23
19
|
|
24
20
|
def find_col_sep
|
25
|
-
|
21
|
+
input = open(@filename)
|
22
|
+
@line1 = input.lines.first
|
26
23
|
delimcounts = CandidateDelimiters.each_with_object({}) do |delim,h|
|
27
|
-
h[delim] = consider_delim(line1, delim)
|
24
|
+
h[delim] = consider_delim(@line1, delim)
|
28
25
|
end
|
29
|
-
|
26
|
+
input.close
|
30
27
|
delimcounts.sort_by{|h,v| -v}.first.first
|
31
28
|
end
|
32
29
|
|
@@ -35,6 +32,9 @@ class Masticate::Sniffer
|
|
35
32
|
end
|
36
33
|
|
37
34
|
def stats
|
38
|
-
|
35
|
+
input = open(@filename)
|
36
|
+
counts = input.lines.each_with_object(Hash.new(0)) {|line, counts| counts[line.split(col_sep).count] += 1}
|
37
|
+
input.close
|
38
|
+
counts
|
39
39
|
end
|
40
40
|
end
|
data/lib/masticate/version.rb
CHANGED
data/lib/masticate.rb
CHANGED
@@ -1,18 +1,20 @@
|
|
1
|
-
require "
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
require "open-uri"
|
2
|
+
|
3
|
+
require_relative "masticate/version"
|
4
|
+
require_relative "masticate/sniffer"
|
5
|
+
require_relative "masticate/mender"
|
6
|
+
require_relative "masticate/csvify"
|
5
7
|
|
6
8
|
module Masticate
|
7
|
-
def self.sniff(
|
8
|
-
Sniffer.new(
|
9
|
+
def self.sniff(filename)
|
10
|
+
Sniffer.new(filename).sniff
|
9
11
|
end
|
10
12
|
|
11
|
-
def self.mend(
|
12
|
-
Mender.new(
|
13
|
+
def self.mend(filename, opts)
|
14
|
+
Mender.new(filename).mend(opts)
|
13
15
|
end
|
14
16
|
|
15
|
-
def self.csvify(
|
16
|
-
Csvify.new(
|
17
|
+
def self.csvify(filename, opts)
|
18
|
+
Csvify.new(filename).csvify(opts)
|
17
19
|
end
|
18
20
|
end
|
data/masticate.gemspec
CHANGED
@@ -16,5 +16,7 @@ Gem::Specification.new do |gem|
|
|
16
16
|
gem.require_paths = ["lib"]
|
17
17
|
gem.version = Masticate::VERSION
|
18
18
|
|
19
|
-
gem.add_development_dependency "rspec"
|
19
|
+
gem.add_development_dependency "rspec", "~> 2.9.0"
|
20
|
+
gem.add_development_dependency "guard-rspec", "~> 0.7.0"
|
21
|
+
gem.add_development_dependency "ruby_gntp", "~> 0.3.4"
|
20
22
|
end
|
data/spec/data/broken_psv.txt
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
COL1|COL 2|Col 3 |col-4| col5 |col6
|
2
|
-
|
3
|
-
|
4
|
-
|
2
|
+
data1| data |data |d a t a|data|data
|
3
|
+
data2| data |data |d a t a|data|data
|
4
|
+
data3| data |this long row
|
5
5
|
is split across lines|d a t a|data|data
|
6
|
-
|
6
|
+
data4| data |data |d a t a|data|data
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# spec for file-sniffing functions
|
2
|
+
|
3
|
+
require "spec_helper"
|
4
|
+
require "tempfile"
|
5
|
+
|
6
|
+
describe "csvification" do
|
7
|
+
it "should convert pipes to standard commas" do
|
8
|
+
filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
|
9
|
+
tmp = Tempfile.new('csvify')
|
10
|
+
results = Masticate.csvify(filename, :output => tmp, :col_sep => '|')
|
11
|
+
output = File.read(tmp)
|
12
|
+
tmp.unlink
|
13
|
+
output.lines.count.should == 5
|
14
|
+
results[:input_count].should == 5
|
15
|
+
results[:output_count].should == 5
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# spec for file-sniffing functions
|
2
|
+
|
3
|
+
require "spec_helper"
|
4
|
+
|
5
|
+
describe "mending" do
|
6
|
+
it "should merge lines when delimiter counts don't match'" do
|
7
|
+
filename = File.dirname(__FILE__) + "/../data/broken_psv.txt"
|
8
|
+
results = Masticate.mend(filename, :col_sep => '|', :output => "/dev/null")
|
9
|
+
results[:input_records].should == 7
|
10
|
+
results[:output_records].should == 5
|
11
|
+
end
|
12
|
+
|
13
|
+
it "should strip trailer records" do
|
14
|
+
filename = File.dirname(__FILE__) + "/../data/junk_trailer.txt"
|
15
|
+
metadata = Masticate.sniff(filename)
|
16
|
+
results = Masticate.mend(filename, metadata.merge(:output => "/dev/null"))
|
17
|
+
results[:input_records].should == 9
|
18
|
+
results[:output_records].should == 5
|
19
|
+
end
|
20
|
+
end
|
@@ -5,17 +5,15 @@ require "spec_helper"
|
|
5
5
|
describe "delimiter sniffing" do
|
6
6
|
it "should find tab delimiter" do
|
7
7
|
filename = File.dirname(__FILE__) + "/../data/tabbed_data.txt"
|
8
|
-
|
9
|
-
results = Masticate.sniff(file)
|
8
|
+
results = Masticate.sniff(filename)
|
10
9
|
results[:col_sep].should == "\t"
|
11
|
-
results[:field_counts].should ==
|
10
|
+
results[:field_counts].should == {6 => 5}
|
12
11
|
end
|
13
12
|
|
14
13
|
it "should find pipe delimiter" do
|
15
14
|
filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
|
16
|
-
|
17
|
-
results = Masticate.sniff(file)
|
15
|
+
results = Masticate.sniff(filename)
|
18
16
|
results[:col_sep].should == '|'
|
19
|
-
results[:field_counts].should ==
|
17
|
+
results[:field_counts].should == {6 => 5}
|
20
18
|
end
|
21
19
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: masticate
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,19 +9,41 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &2153254280 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
|
-
- -
|
19
|
+
- - ~>
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version:
|
21
|
+
version: 2.9.0
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2153254280
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: guard-rspec
|
27
|
+
requirement: &2153246900 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ~>
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 0.7.0
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *2153246900
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: ruby_gntp
|
38
|
+
requirement: &2153246180 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 0.3.4
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *2153246180
|
25
47
|
description: Data file crunching
|
26
48
|
email:
|
27
49
|
- jmay@pobox.com
|
@@ -32,6 +54,7 @@ extra_rdoc_files: []
|
|
32
54
|
files:
|
33
55
|
- .gitignore
|
34
56
|
- Gemfile
|
57
|
+
- Guardfile
|
35
58
|
- LICENSE
|
36
59
|
- README.md
|
37
60
|
- Rakefile
|
@@ -43,11 +66,12 @@ files:
|
|
43
66
|
- lib/masticate/version.rb
|
44
67
|
- masticate.gemspec
|
45
68
|
- spec/data/broken_psv.txt
|
69
|
+
- spec/data/junk_trailer.txt
|
46
70
|
- spec/data/pipe_data.txt
|
47
71
|
- spec/data/tabbed_data.txt
|
48
|
-
- spec/
|
49
|
-
- spec/
|
50
|
-
- spec/
|
72
|
+
- spec/lib/csvify_spec.rb
|
73
|
+
- spec/lib/mend_spec.rb
|
74
|
+
- spec/lib/sniffer_spec.rb
|
51
75
|
- spec/spec_helper.rb
|
52
76
|
homepage: ''
|
53
77
|
licenses: []
|
@@ -75,10 +99,11 @@ specification_version: 3
|
|
75
99
|
summary: Utility functions for parsing incoming text data files.
|
76
100
|
test_files:
|
77
101
|
- spec/data/broken_psv.txt
|
102
|
+
- spec/data/junk_trailer.txt
|
78
103
|
- spec/data/pipe_data.txt
|
79
104
|
- spec/data/tabbed_data.txt
|
80
|
-
- spec/
|
81
|
-
- spec/
|
82
|
-
- spec/
|
105
|
+
- spec/lib/csvify_spec.rb
|
106
|
+
- spec/lib/mend_spec.rb
|
107
|
+
- spec/lib/sniffer_spec.rb
|
83
108
|
- spec/spec_helper.rb
|
84
109
|
has_rdoc:
|
data/spec/spec/csvify_spec.rb
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
# spec for file-sniffing functions
|
2
|
-
|
3
|
-
require "spec_helper"
|
4
|
-
|
5
|
-
describe "csvification" do
|
6
|
-
it "should convert pipes to standard commas" do
|
7
|
-
filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
|
8
|
-
file = File.open(filename)
|
9
|
-
strio = StringIO.new
|
10
|
-
Masticate.csvify(file, :output => strio, :col_sep => '|')
|
11
|
-
strio.close
|
12
|
-
strio.string.lines.count.should == 5
|
13
|
-
end
|
14
|
-
end
|
data/spec/spec/mend_spec.rb
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
# spec for file-sniffing functions
|
2
|
-
|
3
|
-
require "spec_helper"
|
4
|
-
|
5
|
-
describe "mending" do
|
6
|
-
it "should merge lines when delimiter counts don't match'" do
|
7
|
-
filename = File.dirname(__FILE__) + "/../data/broken_psv.txt"
|
8
|
-
file = File.open(filename)
|
9
|
-
devnull = File.open('/dev/null', 'w')
|
10
|
-
results = Masticate.mend(file, :output => devnull, :col_sep => '|')
|
11
|
-
results[:input_records].should == 6
|
12
|
-
results[:output_records].should == 5
|
13
|
-
end
|
14
|
-
end
|