masticate 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/Guardfile ADDED
@@ -0,0 +1,10 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
+ guard 'rspec', :version => 2 do
5
+ watch(%r{^spec/.+_spec\.rb$})
6
+ watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
7
+ watch('spec/spec_helper.rb') { "spec" }
8
+
9
+ watch(%r{^spec/support/(.+)\.rb$}) { "spec" }
10
+ end
data/bin/masticate CHANGED
@@ -1,31 +1,25 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require "masticate"
3
+ require_relative "../lib/masticate"
4
4
 
5
5
  command, filename = ARGV
6
6
 
7
7
  case ARGV.shift
8
8
  when 'sniff'
9
- file = File.open(filename)
10
- results = Masticate.sniff(file)
9
+ results = Masticate.sniff(filename)
11
10
  col_sep = results[:col_sep]
12
11
  col_sep = "TAB" if col_sep == "\t"
13
- file.close
14
12
  $stderr.puts <<-EOT
15
13
  Processing complete.
16
14
  Input delimiter: #{col_sep}
17
- Field counts: #{results[:field_counts].join(',')}
15
+ Field counts: #{results[:field_counts].inspect}
18
16
  EOT
19
17
 
20
18
  when 'mend'
21
- file = File.open(filename)
22
- metadata = Masticate.sniff(file)
19
+ metadata = Masticate.sniff(filename)
23
20
  col_sep = metadata[:col_sep]
24
21
  col_sep = "TAB" if col_sep == "\t"
25
- file.close
26
- file = File.open(filename)
27
- results = Masticate.mend(file, metadata.merge(:output => $stdout))
28
- file.close
22
+ results = Masticate.mend(filename, metadata)
29
23
  $stderr.puts <<-EOT
30
24
  Processing complete.
31
25
  Input delimiter: #{col_sep}
@@ -34,11 +28,14 @@ Processing complete.
34
28
  EOT
35
29
 
36
30
  when 'csvify'
37
- file = File.open(filename)
38
- metadata = Masticate.sniff(file)
39
- file.close
40
- file = File.open(filename)
41
- Masticate.csvify(file, metadata.merge(:output => $stdout))
31
+ metadata = Masticate.sniff(filename)
32
+ results = Masticate.csvify(filename, metadata)
33
+ $stderr.puts <<-EOT
34
+ Processing complete.
35
+ Input delimiter: #{metadata[:col_sep]}
36
+ Lines in input: #{results[:input_count]}
37
+ Lines in output: #{results[:output_count]}
38
+ EOT
42
39
 
43
40
  else
44
41
  raise "unknown command #{command}"
@@ -2,19 +2,38 @@
2
2
  require "csv"
3
3
 
4
4
  class Masticate::Csvify
5
- attr_reader :file
5
+ attr_reader :input
6
6
 
7
- def initialize(file)
8
- @file = file
7
+ def initialize(filename)
8
+ @input = File.open(filename)
9
9
  end
10
10
 
11
11
  def csvify(opts)
12
+ @output = opts[:output] ? File.open(opts[:output], "w") : $stdout
12
13
  csv_options = {}
13
14
  csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
14
15
  csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]
15
-
16
- CSV.foreach(file, csv_options) do |row|
17
- opts[:output] << row.to_csv
16
+
17
+ input_count = @output_count = 0
18
+ CSV.foreach(input, csv_options) do |row|
19
+ input_count += 1
20
+ emit(row.to_csv)
21
+ end
22
+ @output.close if opts[:output]
23
+ @input.close
24
+ {
25
+ :input_count => input_count,
26
+ :output_count => @output_count
27
+ }
28
+ end
29
+
30
+ def emit(line)
31
+ @output_count += 1
32
+ begin
33
+ @output.puts line
34
+ rescue Errno::EPIPE
35
+ # output was closed, e.g. ran piped into `head`
36
+ # silently ignore this condition, it's not fatal and doesn't need a warning
18
37
  end
19
38
  end
20
39
  end
@@ -4,46 +4,60 @@
4
4
  # (due to a newline embedded in a field). Glue those two lines into a single line in the output.
5
5
 
6
6
  class Masticate::Mender
7
- attr_reader :file
7
+ attr_reader :input
8
8
 
9
- def initialize(file)
10
- @file = file
9
+ def initialize(filename)
10
+ @input = open(filename)
11
11
  end
12
12
 
13
13
  def mend(opts)
14
- output = opts[:output]
14
+ @output = opts[:output] ? File.open(opts[:output], "w") : $stdout
15
15
  col_sep = opts[:col_sep]
16
16
 
17
- expected_count = nil
18
- @input_count = output_count = 0
17
+ expected_delim_count = nil
18
+ @input_count = @output_count = 0
19
19
  while (line = get) do
20
- if !expected_count
21
- # trust the first row
22
- expected_count = line.count(col_sep)
23
- else
24
- running_count = line.count(col_sep)
25
- until line.nil? || running_count >= expected_count
26
- nextbit = get
27
- if nextbit
28
- line = line.chomp + ' ' + nextbit
29
- running_count = line.count(col_sep)
30
- else
31
- line = nil
20
+ unless line =~ /^\s*$/
21
+ if !expected_delim_count
22
+ # trust the first row
23
+ expected_delim_count = line.count(col_sep)
24
+ else
25
+ running_count = line.count(col_sep)
26
+ while !input.eof? && running_count < expected_delim_count do
27
+ nextbit = get
28
+ if nextbit
29
+ line = line + ' ' + nextbit
30
+ running_count = line.count(col_sep)
31
+ end
32
32
  end
33
33
  end
34
+ if line.count(col_sep) > 2
35
+ emit(line)
36
+ end
34
37
  end
35
- output_count += 1
36
- output << line
37
38
  end
38
39
 
40
+ @input.close
41
+ @output.close if opts[:output]
39
42
  {
40
43
  :input_records => @input_count,
41
- :output_records => output_count
44
+ :output_records => @output_count
42
45
  }
43
46
  end
44
47
 
45
48
  def get
46
- (line = file.gets) && @input_count += 1
47
- line
49
+ line = input.gets
50
+ @input_count += 1
51
+ line && line.chomp
52
+ end
53
+
54
+ def emit(line)
55
+ @output_count += 1
56
+ begin
57
+ @output.puts line
58
+ rescue Errno::EPIPE
59
+ # output was closed, e.g. ran piped into `head`
60
+ # silently ignore this condition, it's not fatal and doesn't need a warning
61
+ end
48
62
  end
49
63
  end
@@ -1,32 +1,29 @@
1
1
  class Masticate::Sniffer
2
- attr_reader :file
3
- attr_reader :col_sep
2
+ attr_reader :col_sep, :stats
4
3
 
5
4
  CandidateDelimiters = [',', '|', "\t"]
6
5
 
7
- def initialize(file)
8
- @file = file
9
- end
10
-
11
- def self.sniff(file)
12
- sniffer = new(file)
13
- sniffer.sniff
6
+ def initialize(filename)
7
+ @filename = filename
14
8
  end
15
9
 
16
10
  def sniff
17
11
  @col_sep = find_col_sep
12
+ @stats = stats
18
13
  {
19
- :col_sep => col_sep,
20
- :field_counts => stats
14
+ :col_sep => @col_sep,
15
+ :field_counts => @stats,
16
+ :line1 => @line1
21
17
  }
22
18
  end
23
19
 
24
20
  def find_col_sep
25
- line1 = file.lines.first
21
+ input = open(@filename)
22
+ @line1 = input.lines.first
26
23
  delimcounts = CandidateDelimiters.each_with_object({}) do |delim,h|
27
- h[delim] = consider_delim(line1, delim)
24
+ h[delim] = consider_delim(@line1, delim)
28
25
  end
29
- file.seek(0) # reset file pointer
26
+ input.close
30
27
  delimcounts.sort_by{|h,v| -v}.first.first
31
28
  end
32
29
 
@@ -35,6 +32,9 @@ class Masticate::Sniffer
35
32
  end
36
33
 
37
34
  def stats
38
- file.lines.map {|line| line.split(col_sep).count}.uniq
35
+ input = open(@filename)
36
+ counts = input.lines.each_with_object(Hash.new(0)) {|line, counts| counts[line.split(col_sep).count] += 1}
37
+ input.close
38
+ counts
39
39
  end
40
40
  end
@@ -1,3 +1,3 @@
1
1
  module Masticate
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
data/lib/masticate.rb CHANGED
@@ -1,18 +1,20 @@
1
- require "masticate/version"
2
- require "masticate/sniffer"
3
- require "masticate/mender"
4
- require "masticate/csvify"
1
+ require "open-uri"
2
+
3
+ require_relative "masticate/version"
4
+ require_relative "masticate/sniffer"
5
+ require_relative "masticate/mender"
6
+ require_relative "masticate/csvify"
5
7
 
6
8
  module Masticate
7
- def self.sniff(file)
8
- Sniffer.new(file).sniff
9
+ def self.sniff(filename)
10
+ Sniffer.new(filename).sniff
9
11
  end
10
12
 
11
- def self.mend(file, opts)
12
- Mender.new(file).mend(opts)
13
+ def self.mend(filename, opts)
14
+ Mender.new(filename).mend(opts)
13
15
  end
14
16
 
15
- def self.csvify(file, opts)
16
- Csvify.new(file).csvify(opts)
17
+ def self.csvify(filename, opts)
18
+ Csvify.new(filename).csvify(opts)
17
19
  end
18
20
  end
data/masticate.gemspec CHANGED
@@ -16,5 +16,7 @@ Gem::Specification.new do |gem|
16
16
  gem.require_paths = ["lib"]
17
17
  gem.version = Masticate::VERSION
18
18
 
19
- gem.add_development_dependency "rspec"
19
+ gem.add_development_dependency "rspec", "~> 2.9.0"
20
+ gem.add_development_dependency "guard-rspec", "~> 0.7.0"
21
+ gem.add_development_dependency "ruby_gntp", "~> 0.3.4"
20
22
  end
@@ -1,6 +1,6 @@
1
1
  COL1|COL 2|Col 3 |col-4| col5 |col6
2
- data| data |data |d a t a|data|data
3
- data| data |data |d a t a|data|data
4
- data| data |this long row
2
+ data1| data |data |d a t a|data|data
3
+ data2| data |data |d a t a|data|data
4
+ data3| data |this long row
5
5
  is split across lines|d a t a|data|data
6
- data| data |data |d a t a|data|data
6
+ data4| data |data |d a t a|data|data
@@ -0,0 +1,8 @@
1
+ COL1|COL 2|Col 3 |col-4| col5 |col6
2
+ one| data |data |d a t a|data|data
3
+ two| data |data |d a t a|data|data
4
+ three| data |data |d a t a|data,data|data
5
+ four| data |data "more data" |d a t a|data|data
6
+ trailer|123
7
+
8
+
@@ -0,0 +1,17 @@
1
+ # spec for file-sniffing functions
2
+
3
+ require "spec_helper"
4
+ require "tempfile"
5
+
6
+ describe "csvification" do
7
+ it "should convert pipes to standard commas" do
8
+ filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
9
+ tmp = Tempfile.new('csvify')
10
+ results = Masticate.csvify(filename, :output => tmp, :col_sep => '|')
11
+ output = File.read(tmp)
12
+ tmp.unlink
13
+ output.lines.count.should == 5
14
+ results[:input_count].should == 5
15
+ results[:output_count].should == 5
16
+ end
17
+ end
@@ -0,0 +1,20 @@
1
+ # spec for file-sniffing functions
2
+
3
+ require "spec_helper"
4
+
5
+ describe "mending" do
6
+ it "should merge lines when delimiter counts don't match'" do
7
+ filename = File.dirname(__FILE__) + "/../data/broken_psv.txt"
8
+ results = Masticate.mend(filename, :col_sep => '|', :output => "/dev/null")
9
+ results[:input_records].should == 7
10
+ results[:output_records].should == 5
11
+ end
12
+
13
+ it "should strip trailer records" do
14
+ filename = File.dirname(__FILE__) + "/../data/junk_trailer.txt"
15
+ metadata = Masticate.sniff(filename)
16
+ results = Masticate.mend(filename, metadata.merge(:output => "/dev/null"))
17
+ results[:input_records].should == 9
18
+ results[:output_records].should == 5
19
+ end
20
+ end
@@ -5,17 +5,15 @@ require "spec_helper"
5
5
  describe "delimiter sniffing" do
6
6
  it "should find tab delimiter" do
7
7
  filename = File.dirname(__FILE__) + "/../data/tabbed_data.txt"
8
- file = File.open(filename)
9
- results = Masticate.sniff(file)
8
+ results = Masticate.sniff(filename)
10
9
  results[:col_sep].should == "\t"
11
- results[:field_counts].should == [6]
10
+ results[:field_counts].should == {6 => 5}
12
11
  end
13
12
 
14
13
  it "should find pipe delimiter" do
15
14
  filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
16
- file = File.open(filename)
17
- results = Masticate.sniff(file)
15
+ results = Masticate.sniff(filename)
18
16
  results[:col_sep].should == '|'
19
- results[:field_counts].should == [6]
17
+ results[:field_counts].should == {6 => 5}
20
18
  end
21
19
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: masticate
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,19 +9,41 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-03 00:00:00.000000000 Z
12
+ date: 2012-04-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &2151740780 !ruby/object:Gem::Requirement
16
+ requirement: &2153254280 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
- - - ! '>='
19
+ - - ~>
20
20
  - !ruby/object:Gem::Version
21
- version: '0'
21
+ version: 2.9.0
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2151740780
24
+ version_requirements: *2153254280
25
+ - !ruby/object:Gem::Dependency
26
+ name: guard-rspec
27
+ requirement: &2153246900 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ~>
31
+ - !ruby/object:Gem::Version
32
+ version: 0.7.0
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *2153246900
36
+ - !ruby/object:Gem::Dependency
37
+ name: ruby_gntp
38
+ requirement: &2153246180 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ~>
42
+ - !ruby/object:Gem::Version
43
+ version: 0.3.4
44
+ type: :development
45
+ prerelease: false
46
+ version_requirements: *2153246180
25
47
  description: Data file crunching
26
48
  email:
27
49
  - jmay@pobox.com
@@ -32,6 +54,7 @@ extra_rdoc_files: []
32
54
  files:
33
55
  - .gitignore
34
56
  - Gemfile
57
+ - Guardfile
35
58
  - LICENSE
36
59
  - README.md
37
60
  - Rakefile
@@ -43,11 +66,12 @@ files:
43
66
  - lib/masticate/version.rb
44
67
  - masticate.gemspec
45
68
  - spec/data/broken_psv.txt
69
+ - spec/data/junk_trailer.txt
46
70
  - spec/data/pipe_data.txt
47
71
  - spec/data/tabbed_data.txt
48
- - spec/spec/csvify_spec.rb
49
- - spec/spec/mend_spec.rb
50
- - spec/spec/sniffer_spec.rb
72
+ - spec/lib/csvify_spec.rb
73
+ - spec/lib/mend_spec.rb
74
+ - spec/lib/sniffer_spec.rb
51
75
  - spec/spec_helper.rb
52
76
  homepage: ''
53
77
  licenses: []
@@ -75,10 +99,11 @@ specification_version: 3
75
99
  summary: Utility functions for parsing incoming text data files.
76
100
  test_files:
77
101
  - spec/data/broken_psv.txt
102
+ - spec/data/junk_trailer.txt
78
103
  - spec/data/pipe_data.txt
79
104
  - spec/data/tabbed_data.txt
80
- - spec/spec/csvify_spec.rb
81
- - spec/spec/mend_spec.rb
82
- - spec/spec/sniffer_spec.rb
105
+ - spec/lib/csvify_spec.rb
106
+ - spec/lib/mend_spec.rb
107
+ - spec/lib/sniffer_spec.rb
83
108
  - spec/spec_helper.rb
84
109
  has_rdoc:
@@ -1,14 +0,0 @@
1
- # spec for file-sniffing functions
2
-
3
- require "spec_helper"
4
-
5
- describe "csvification" do
6
- it "should convert pipes to standard commas" do
7
- filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
8
- file = File.open(filename)
9
- strio = StringIO.new
10
- Masticate.csvify(file, :output => strio, :col_sep => '|')
11
- strio.close
12
- strio.string.lines.count.should == 5
13
- end
14
- end
@@ -1,14 +0,0 @@
1
- # spec for file-sniffing functions
2
-
3
- require "spec_helper"
4
-
5
- describe "mending" do
6
- it "should merge lines when delimiter counts don't match'" do
7
- filename = File.dirname(__FILE__) + "/../data/broken_psv.txt"
8
- file = File.open(filename)
9
- devnull = File.open('/dev/null', 'w')
10
- results = Masticate.mend(file, :output => devnull, :col_sep => '|')
11
- results[:input_records].should == 6
12
- results[:output_records].should == 5
13
- end
14
- end