masticate 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Guardfile +10 -0
- data/bin/masticate +13 -16
- data/lib/masticate/csvify.rb +25 -6
- data/lib/masticate/mender.rb +37 -23
- data/lib/masticate/sniffer.rb +15 -15
- data/lib/masticate/version.rb +1 -1
- data/lib/masticate.rb +12 -10
- data/masticate.gemspec +3 -1
- data/spec/data/broken_psv.txt +4 -4
- data/spec/data/junk_trailer.txt +8 -0
- data/spec/lib/csvify_spec.rb +17 -0
- data/spec/lib/mend_spec.rb +20 -0
- data/spec/{spec → lib}/sniffer_spec.rb +4 -6
- metadata +37 -12
- data/spec/spec/csvify_spec.rb +0 -14
- data/spec/spec/mend_spec.rb +0 -14
data/Guardfile
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
# A sample Guardfile
|
2
|
+
# More info at https://github.com/guard/guard#readme
|
3
|
+
|
4
|
+
guard 'rspec', :version => 2 do
|
5
|
+
watch(%r{^spec/.+_spec\.rb$})
|
6
|
+
watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
|
7
|
+
watch('spec/spec_helper.rb') { "spec" }
|
8
|
+
|
9
|
+
watch(%r{^spec/support/(.+)\.rb$}) { "spec" }
|
10
|
+
end
|
data/bin/masticate
CHANGED
@@ -1,31 +1,25 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
|
3
|
+
require_relative "../lib/masticate"
|
4
4
|
|
5
5
|
command, filename = ARGV
|
6
6
|
|
7
7
|
case ARGV.shift
|
8
8
|
when 'sniff'
|
9
|
-
|
10
|
-
results = Masticate.sniff(file)
|
9
|
+
results = Masticate.sniff(filename)
|
11
10
|
col_sep = results[:col_sep]
|
12
11
|
col_sep = "TAB" if col_sep == "\t"
|
13
|
-
file.close
|
14
12
|
$stderr.puts <<-EOT
|
15
13
|
Processing complete.
|
16
14
|
Input delimiter: #{col_sep}
|
17
|
-
Field counts: #{results[:field_counts].
|
15
|
+
Field counts: #{results[:field_counts].inspect}
|
18
16
|
EOT
|
19
17
|
|
20
18
|
when 'mend'
|
21
|
-
|
22
|
-
metadata = Masticate.sniff(file)
|
19
|
+
metadata = Masticate.sniff(filename)
|
23
20
|
col_sep = metadata[:col_sep]
|
24
21
|
col_sep = "TAB" if col_sep == "\t"
|
25
|
-
|
26
|
-
file = File.open(filename)
|
27
|
-
results = Masticate.mend(file, metadata.merge(:output => $stdout))
|
28
|
-
file.close
|
22
|
+
results = Masticate.mend(filename, metadata)
|
29
23
|
$stderr.puts <<-EOT
|
30
24
|
Processing complete.
|
31
25
|
Input delimiter: #{col_sep}
|
@@ -34,11 +28,14 @@ Processing complete.
|
|
34
28
|
EOT
|
35
29
|
|
36
30
|
when 'csvify'
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
31
|
+
metadata = Masticate.sniff(filename)
|
32
|
+
results = Masticate.csvify(filename, metadata)
|
33
|
+
$stderr.puts <<-EOT
|
34
|
+
Processing complete.
|
35
|
+
Input delimiter: #{metadata[:col_sep]}
|
36
|
+
Lines in input: #{results[:input_count]}
|
37
|
+
Lines in output: #{results[:output_count]}
|
38
|
+
EOT
|
42
39
|
|
43
40
|
else
|
44
41
|
raise "unknown command #{command}"
|
data/lib/masticate/csvify.rb
CHANGED
@@ -2,19 +2,38 @@
|
|
2
2
|
require "csv"
|
3
3
|
|
4
4
|
class Masticate::Csvify
|
5
|
-
attr_reader :
|
5
|
+
attr_reader :input
|
6
6
|
|
7
|
-
def initialize(
|
8
|
-
@
|
7
|
+
def initialize(filename)
|
8
|
+
@input = File.open(filename)
|
9
9
|
end
|
10
10
|
|
11
11
|
def csvify(opts)
|
12
|
+
@output = opts[:output] ? File.open(opts[:output], "w") : $stdout
|
12
13
|
csv_options = {}
|
13
14
|
csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
|
14
15
|
csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]
|
15
|
-
|
16
|
-
|
17
|
-
|
16
|
+
|
17
|
+
input_count = @output_count = 0
|
18
|
+
CSV.foreach(input, csv_options) do |row|
|
19
|
+
input_count += 1
|
20
|
+
emit(row.to_csv)
|
21
|
+
end
|
22
|
+
@output.close if opts[:output]
|
23
|
+
@input.close
|
24
|
+
{
|
25
|
+
:input_count => input_count,
|
26
|
+
:output_count => @output_count
|
27
|
+
}
|
28
|
+
end
|
29
|
+
|
30
|
+
def emit(line)
|
31
|
+
@output_count += 1
|
32
|
+
begin
|
33
|
+
@output.puts line
|
34
|
+
rescue Errno::EPIPE
|
35
|
+
# output was closed, e.g. ran piped into `head`
|
36
|
+
# silently ignore this condition, it's not fatal and doesn't need a warning
|
18
37
|
end
|
19
38
|
end
|
20
39
|
end
|
data/lib/masticate/mender.rb
CHANGED
@@ -4,46 +4,60 @@
|
|
4
4
|
# (due to a newline embedded in a field). Glue those two lines into a single line in the output.
|
5
5
|
|
6
6
|
class Masticate::Mender
|
7
|
-
attr_reader :
|
7
|
+
attr_reader :input
|
8
8
|
|
9
|
-
def initialize(
|
10
|
-
@
|
9
|
+
def initialize(filename)
|
10
|
+
@input = open(filename)
|
11
11
|
end
|
12
12
|
|
13
13
|
def mend(opts)
|
14
|
-
output = opts[:output]
|
14
|
+
@output = opts[:output] ? File.open(opts[:output], "w") : $stdout
|
15
15
|
col_sep = opts[:col_sep]
|
16
16
|
|
17
|
-
|
18
|
-
@input_count = output_count = 0
|
17
|
+
expected_delim_count = nil
|
18
|
+
@input_count = @output_count = 0
|
19
19
|
while (line = get) do
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
20
|
+
unless line =~ /^\s*$/
|
21
|
+
if !expected_delim_count
|
22
|
+
# trust the first row
|
23
|
+
expected_delim_count = line.count(col_sep)
|
24
|
+
else
|
25
|
+
running_count = line.count(col_sep)
|
26
|
+
while !input.eof? && running_count < expected_delim_count do
|
27
|
+
nextbit = get
|
28
|
+
if nextbit
|
29
|
+
line = line + ' ' + nextbit
|
30
|
+
running_count = line.count(col_sep)
|
31
|
+
end
|
32
32
|
end
|
33
33
|
end
|
34
|
+
if line.count(col_sep) > 2
|
35
|
+
emit(line)
|
36
|
+
end
|
34
37
|
end
|
35
|
-
output_count += 1
|
36
|
-
output << line
|
37
38
|
end
|
38
39
|
|
40
|
+
@input.close
|
41
|
+
@output.close if opts[:output]
|
39
42
|
{
|
40
43
|
:input_records => @input_count,
|
41
|
-
:output_records => output_count
|
44
|
+
:output_records => @output_count
|
42
45
|
}
|
43
46
|
end
|
44
47
|
|
45
48
|
def get
|
46
|
-
|
47
|
-
|
49
|
+
line = input.gets
|
50
|
+
@input_count += 1
|
51
|
+
line && line.chomp
|
52
|
+
end
|
53
|
+
|
54
|
+
def emit(line)
|
55
|
+
@output_count += 1
|
56
|
+
begin
|
57
|
+
@output.puts line
|
58
|
+
rescue Errno::EPIPE
|
59
|
+
# output was closed, e.g. ran piped into `head`
|
60
|
+
# silently ignore this condition, it's not fatal and doesn't need a warning
|
61
|
+
end
|
48
62
|
end
|
49
63
|
end
|
data/lib/masticate/sniffer.rb
CHANGED
@@ -1,32 +1,29 @@
|
|
1
1
|
class Masticate::Sniffer
|
2
|
-
attr_reader :
|
3
|
-
attr_reader :col_sep
|
2
|
+
attr_reader :col_sep, :stats
|
4
3
|
|
5
4
|
CandidateDelimiters = [',', '|', "\t"]
|
6
5
|
|
7
|
-
def initialize(
|
8
|
-
@
|
9
|
-
end
|
10
|
-
|
11
|
-
def self.sniff(file)
|
12
|
-
sniffer = new(file)
|
13
|
-
sniffer.sniff
|
6
|
+
def initialize(filename)
|
7
|
+
@filename = filename
|
14
8
|
end
|
15
9
|
|
16
10
|
def sniff
|
17
11
|
@col_sep = find_col_sep
|
12
|
+
@stats = stats
|
18
13
|
{
|
19
|
-
:col_sep => col_sep,
|
20
|
-
:field_counts => stats
|
14
|
+
:col_sep => @col_sep,
|
15
|
+
:field_counts => @stats,
|
16
|
+
:line1 => @line1
|
21
17
|
}
|
22
18
|
end
|
23
19
|
|
24
20
|
def find_col_sep
|
25
|
-
|
21
|
+
input = open(@filename)
|
22
|
+
@line1 = input.lines.first
|
26
23
|
delimcounts = CandidateDelimiters.each_with_object({}) do |delim,h|
|
27
|
-
h[delim] = consider_delim(line1, delim)
|
24
|
+
h[delim] = consider_delim(@line1, delim)
|
28
25
|
end
|
29
|
-
|
26
|
+
input.close
|
30
27
|
delimcounts.sort_by{|h,v| -v}.first.first
|
31
28
|
end
|
32
29
|
|
@@ -35,6 +32,9 @@ class Masticate::Sniffer
|
|
35
32
|
end
|
36
33
|
|
37
34
|
def stats
|
38
|
-
|
35
|
+
input = open(@filename)
|
36
|
+
counts = input.lines.each_with_object(Hash.new(0)) {|line, counts| counts[line.split(col_sep).count] += 1}
|
37
|
+
input.close
|
38
|
+
counts
|
39
39
|
end
|
40
40
|
end
|
data/lib/masticate/version.rb
CHANGED
data/lib/masticate.rb
CHANGED
@@ -1,18 +1,20 @@
|
|
1
|
-
require "
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
require "open-uri"
|
2
|
+
|
3
|
+
require_relative "masticate/version"
|
4
|
+
require_relative "masticate/sniffer"
|
5
|
+
require_relative "masticate/mender"
|
6
|
+
require_relative "masticate/csvify"
|
5
7
|
|
6
8
|
module Masticate
|
7
|
-
def self.sniff(
|
8
|
-
Sniffer.new(
|
9
|
+
def self.sniff(filename)
|
10
|
+
Sniffer.new(filename).sniff
|
9
11
|
end
|
10
12
|
|
11
|
-
def self.mend(
|
12
|
-
Mender.new(
|
13
|
+
def self.mend(filename, opts)
|
14
|
+
Mender.new(filename).mend(opts)
|
13
15
|
end
|
14
16
|
|
15
|
-
def self.csvify(
|
16
|
-
Csvify.new(
|
17
|
+
def self.csvify(filename, opts)
|
18
|
+
Csvify.new(filename).csvify(opts)
|
17
19
|
end
|
18
20
|
end
|
data/masticate.gemspec
CHANGED
@@ -16,5 +16,7 @@ Gem::Specification.new do |gem|
|
|
16
16
|
gem.require_paths = ["lib"]
|
17
17
|
gem.version = Masticate::VERSION
|
18
18
|
|
19
|
-
gem.add_development_dependency "rspec"
|
19
|
+
gem.add_development_dependency "rspec", "~> 2.9.0"
|
20
|
+
gem.add_development_dependency "guard-rspec", "~> 0.7.0"
|
21
|
+
gem.add_development_dependency "ruby_gntp", "~> 0.3.4"
|
20
22
|
end
|
data/spec/data/broken_psv.txt
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
COL1|COL 2|Col 3 |col-4| col5 |col6
|
2
|
-
|
3
|
-
|
4
|
-
|
2
|
+
data1| data |data |d a t a|data|data
|
3
|
+
data2| data |data |d a t a|data|data
|
4
|
+
data3| data |this long row
|
5
5
|
is split across lines|d a t a|data|data
|
6
|
-
|
6
|
+
data4| data |data |d a t a|data|data
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# spec for file-sniffing functions
|
2
|
+
|
3
|
+
require "spec_helper"
|
4
|
+
require "tempfile"
|
5
|
+
|
6
|
+
describe "csvification" do
|
7
|
+
it "should convert pipes to standard commas" do
|
8
|
+
filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
|
9
|
+
tmp = Tempfile.new('csvify')
|
10
|
+
results = Masticate.csvify(filename, :output => tmp, :col_sep => '|')
|
11
|
+
output = File.read(tmp)
|
12
|
+
tmp.unlink
|
13
|
+
output.lines.count.should == 5
|
14
|
+
results[:input_count].should == 5
|
15
|
+
results[:output_count].should == 5
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# spec for file-sniffing functions
|
2
|
+
|
3
|
+
require "spec_helper"
|
4
|
+
|
5
|
+
describe "mending" do
|
6
|
+
it "should merge lines when delimiter counts don't match'" do
|
7
|
+
filename = File.dirname(__FILE__) + "/../data/broken_psv.txt"
|
8
|
+
results = Masticate.mend(filename, :col_sep => '|', :output => "/dev/null")
|
9
|
+
results[:input_records].should == 7
|
10
|
+
results[:output_records].should == 5
|
11
|
+
end
|
12
|
+
|
13
|
+
it "should strip trailer records" do
|
14
|
+
filename = File.dirname(__FILE__) + "/../data/junk_trailer.txt"
|
15
|
+
metadata = Masticate.sniff(filename)
|
16
|
+
results = Masticate.mend(filename, metadata.merge(:output => "/dev/null"))
|
17
|
+
results[:input_records].should == 9
|
18
|
+
results[:output_records].should == 5
|
19
|
+
end
|
20
|
+
end
|
@@ -5,17 +5,15 @@ require "spec_helper"
|
|
5
5
|
describe "delimiter sniffing" do
|
6
6
|
it "should find tab delimiter" do
|
7
7
|
filename = File.dirname(__FILE__) + "/../data/tabbed_data.txt"
|
8
|
-
|
9
|
-
results = Masticate.sniff(file)
|
8
|
+
results = Masticate.sniff(filename)
|
10
9
|
results[:col_sep].should == "\t"
|
11
|
-
results[:field_counts].should ==
|
10
|
+
results[:field_counts].should == {6 => 5}
|
12
11
|
end
|
13
12
|
|
14
13
|
it "should find pipe delimiter" do
|
15
14
|
filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
|
16
|
-
|
17
|
-
results = Masticate.sniff(file)
|
15
|
+
results = Masticate.sniff(filename)
|
18
16
|
results[:col_sep].should == '|'
|
19
|
-
results[:field_counts].should ==
|
17
|
+
results[:field_counts].should == {6 => 5}
|
20
18
|
end
|
21
19
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: masticate
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,19 +9,41 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-04-
|
12
|
+
date: 2012-04-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &2153254280 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
|
-
- -
|
19
|
+
- - ~>
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version:
|
21
|
+
version: 2.9.0
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2153254280
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: guard-rspec
|
27
|
+
requirement: &2153246900 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ~>
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 0.7.0
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *2153246900
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: ruby_gntp
|
38
|
+
requirement: &2153246180 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 0.3.4
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *2153246180
|
25
47
|
description: Data file crunching
|
26
48
|
email:
|
27
49
|
- jmay@pobox.com
|
@@ -32,6 +54,7 @@ extra_rdoc_files: []
|
|
32
54
|
files:
|
33
55
|
- .gitignore
|
34
56
|
- Gemfile
|
57
|
+
- Guardfile
|
35
58
|
- LICENSE
|
36
59
|
- README.md
|
37
60
|
- Rakefile
|
@@ -43,11 +66,12 @@ files:
|
|
43
66
|
- lib/masticate/version.rb
|
44
67
|
- masticate.gemspec
|
45
68
|
- spec/data/broken_psv.txt
|
69
|
+
- spec/data/junk_trailer.txt
|
46
70
|
- spec/data/pipe_data.txt
|
47
71
|
- spec/data/tabbed_data.txt
|
48
|
-
- spec/
|
49
|
-
- spec/
|
50
|
-
- spec/
|
72
|
+
- spec/lib/csvify_spec.rb
|
73
|
+
- spec/lib/mend_spec.rb
|
74
|
+
- spec/lib/sniffer_spec.rb
|
51
75
|
- spec/spec_helper.rb
|
52
76
|
homepage: ''
|
53
77
|
licenses: []
|
@@ -75,10 +99,11 @@ specification_version: 3
|
|
75
99
|
summary: Utility functions for parsing incoming text data files.
|
76
100
|
test_files:
|
77
101
|
- spec/data/broken_psv.txt
|
102
|
+
- spec/data/junk_trailer.txt
|
78
103
|
- spec/data/pipe_data.txt
|
79
104
|
- spec/data/tabbed_data.txt
|
80
|
-
- spec/
|
81
|
-
- spec/
|
82
|
-
- spec/
|
105
|
+
- spec/lib/csvify_spec.rb
|
106
|
+
- spec/lib/mend_spec.rb
|
107
|
+
- spec/lib/sniffer_spec.rb
|
83
108
|
- spec/spec_helper.rb
|
84
109
|
has_rdoc:
|
data/spec/spec/csvify_spec.rb
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
# spec for file-sniffing functions
|
2
|
-
|
3
|
-
require "spec_helper"
|
4
|
-
|
5
|
-
describe "csvification" do
|
6
|
-
it "should convert pipes to standard commas" do
|
7
|
-
filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
|
8
|
-
file = File.open(filename)
|
9
|
-
strio = StringIO.new
|
10
|
-
Masticate.csvify(file, :output => strio, :col_sep => '|')
|
11
|
-
strio.close
|
12
|
-
strio.string.lines.count.should == 5
|
13
|
-
end
|
14
|
-
end
|
data/spec/spec/mend_spec.rb
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
# spec for file-sniffing functions
|
2
|
-
|
3
|
-
require "spec_helper"
|
4
|
-
|
5
|
-
describe "mending" do
|
6
|
-
it "should merge lines when delimiter counts don't match'" do
|
7
|
-
filename = File.dirname(__FILE__) + "/../data/broken_psv.txt"
|
8
|
-
file = File.open(filename)
|
9
|
-
devnull = File.open('/dev/null', 'w')
|
10
|
-
results = Masticate.mend(file, :output => devnull, :col_sep => '|')
|
11
|
-
results[:input_records].should == 6
|
12
|
-
results[:output_records].should == 5
|
13
|
-
end
|
14
|
-
end
|