masticate 0.0.3 → 0.0.4

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
data/Guardfile ADDED
@@ -0,0 +1,10 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
+ guard 'rspec', :version => 2 do
5
+ watch(%r{^spec/.+_spec\.rb$})
6
+ watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
7
+ watch('spec/spec_helper.rb') { "spec" }
8
+
9
+ watch(%r{^spec/support/(.+)\.rb$}) { "spec" }
10
+ end
data/bin/masticate CHANGED
@@ -1,31 +1,25 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require "masticate"
3
+ require_relative "../lib/masticate"
4
4
 
5
5
  command, filename = ARGV
6
6
 
7
7
  case ARGV.shift
8
8
  when 'sniff'
9
- file = File.open(filename)
10
- results = Masticate.sniff(file)
9
+ results = Masticate.sniff(filename)
11
10
  col_sep = results[:col_sep]
12
11
  col_sep = "TAB" if col_sep == "\t"
13
- file.close
14
12
  $stderr.puts <<-EOT
15
13
  Processing complete.
16
14
  Input delimiter: #{col_sep}
17
- Field counts: #{results[:field_counts].join(',')}
15
+ Field counts: #{results[:field_counts].inspect}
18
16
  EOT
19
17
 
20
18
  when 'mend'
21
- file = File.open(filename)
22
- metadata = Masticate.sniff(file)
19
+ metadata = Masticate.sniff(filename)
23
20
  col_sep = metadata[:col_sep]
24
21
  col_sep = "TAB" if col_sep == "\t"
25
- file.close
26
- file = File.open(filename)
27
- results = Masticate.mend(file, metadata.merge(:output => $stdout))
28
- file.close
22
+ results = Masticate.mend(filename, metadata)
29
23
  $stderr.puts <<-EOT
30
24
  Processing complete.
31
25
  Input delimiter: #{col_sep}
@@ -34,11 +28,14 @@ Processing complete.
34
28
  EOT
35
29
 
36
30
  when 'csvify'
37
- file = File.open(filename)
38
- metadata = Masticate.sniff(file)
39
- file.close
40
- file = File.open(filename)
41
- Masticate.csvify(file, metadata.merge(:output => $stdout))
31
+ metadata = Masticate.sniff(filename)
32
+ results = Masticate.csvify(filename, metadata)
33
+ $stderr.puts <<-EOT
34
+ Processing complete.
35
+ Input delimiter: #{metadata[:col_sep]}
36
+ Lines in input: #{results[:input_count]}
37
+ Lines in output: #{results[:output_count]}
38
+ EOT
42
39
 
43
40
  else
44
41
  raise "unknown command #{command}"
@@ -2,19 +2,38 @@
2
2
  require "csv"
3
3
 
4
4
  class Masticate::Csvify
5
- attr_reader :file
5
+ attr_reader :input
6
6
 
7
- def initialize(file)
8
- @file = file
7
+ def initialize(filename)
8
+ @input = File.open(filename)
9
9
  end
10
10
 
11
11
  def csvify(opts)
12
+ @output = opts[:output] ? File.open(opts[:output], "w") : $stdout
12
13
  csv_options = {}
13
14
  csv_options[:col_sep] = opts[:col_sep] if opts[:col_sep]
14
15
  csv_options[:quote_char] = opts[:quote_char] || opts[:col_sep] if opts[:quote_char] || opts[:col_sep]
15
-
16
- CSV.foreach(file, csv_options) do |row|
17
- opts[:output] << row.to_csv
16
+
17
+ input_count = @output_count = 0
18
+ CSV.foreach(input, csv_options) do |row|
19
+ input_count += 1
20
+ emit(row.to_csv)
21
+ end
22
+ @output.close if opts[:output]
23
+ @input.close
24
+ {
25
+ :input_count => input_count,
26
+ :output_count => @output_count
27
+ }
28
+ end
29
+
30
+ def emit(line)
31
+ @output_count += 1
32
+ begin
33
+ @output.puts line
34
+ rescue Errno::EPIPE
35
+ # output was closed, e.g. ran piped into `head`
36
+ # silently ignore this condition, it's not fatal and doesn't need a warning
18
37
  end
19
38
  end
20
39
  end
@@ -4,46 +4,60 @@
4
4
  # (due to a newline embedded in a field). Glue those two lines into a single line in the output.
5
5
 
6
6
  class Masticate::Mender
7
- attr_reader :file
7
+ attr_reader :input
8
8
 
9
- def initialize(file)
10
- @file = file
9
+ def initialize(filename)
10
+ @input = open(filename)
11
11
  end
12
12
 
13
13
  def mend(opts)
14
- output = opts[:output]
14
+ @output = opts[:output] ? File.open(opts[:output], "w") : $stdout
15
15
  col_sep = opts[:col_sep]
16
16
 
17
- expected_count = nil
18
- @input_count = output_count = 0
17
+ expected_delim_count = nil
18
+ @input_count = @output_count = 0
19
19
  while (line = get) do
20
- if !expected_count
21
- # trust the first row
22
- expected_count = line.count(col_sep)
23
- else
24
- running_count = line.count(col_sep)
25
- until line.nil? || running_count >= expected_count
26
- nextbit = get
27
- if nextbit
28
- line = line.chomp + ' ' + nextbit
29
- running_count = line.count(col_sep)
30
- else
31
- line = nil
20
+ unless line =~ /^\s*$/
21
+ if !expected_delim_count
22
+ # trust the first row
23
+ expected_delim_count = line.count(col_sep)
24
+ else
25
+ running_count = line.count(col_sep)
26
+ while !input.eof? && running_count < expected_delim_count do
27
+ nextbit = get
28
+ if nextbit
29
+ line = line + ' ' + nextbit
30
+ running_count = line.count(col_sep)
31
+ end
32
32
  end
33
33
  end
34
+ if line.count(col_sep) > 2
35
+ emit(line)
36
+ end
34
37
  end
35
- output_count += 1
36
- output << line
37
38
  end
38
39
 
40
+ @input.close
41
+ @output.close if opts[:output]
39
42
  {
40
43
  :input_records => @input_count,
41
- :output_records => output_count
44
+ :output_records => @output_count
42
45
  }
43
46
  end
44
47
 
45
48
  def get
46
- (line = file.gets) && @input_count += 1
47
- line
49
+ line = input.gets
50
+ @input_count += 1
51
+ line && line.chomp
52
+ end
53
+
54
+ def emit(line)
55
+ @output_count += 1
56
+ begin
57
+ @output.puts line
58
+ rescue Errno::EPIPE
59
+ # output was closed, e.g. ran piped into `head`
60
+ # silently ignore this condition, it's not fatal and doesn't need a warning
61
+ end
48
62
  end
49
63
  end
@@ -1,32 +1,29 @@
1
1
  class Masticate::Sniffer
2
- attr_reader :file
3
- attr_reader :col_sep
2
+ attr_reader :col_sep, :stats
4
3
 
5
4
  CandidateDelimiters = [',', '|', "\t"]
6
5
 
7
- def initialize(file)
8
- @file = file
9
- end
10
-
11
- def self.sniff(file)
12
- sniffer = new(file)
13
- sniffer.sniff
6
+ def initialize(filename)
7
+ @filename = filename
14
8
  end
15
9
 
16
10
  def sniff
17
11
  @col_sep = find_col_sep
12
+ @stats = stats
18
13
  {
19
- :col_sep => col_sep,
20
- :field_counts => stats
14
+ :col_sep => @col_sep,
15
+ :field_counts => @stats,
16
+ :line1 => @line1
21
17
  }
22
18
  end
23
19
 
24
20
  def find_col_sep
25
- line1 = file.lines.first
21
+ input = open(@filename)
22
+ @line1 = input.lines.first
26
23
  delimcounts = CandidateDelimiters.each_with_object({}) do |delim,h|
27
- h[delim] = consider_delim(line1, delim)
24
+ h[delim] = consider_delim(@line1, delim)
28
25
  end
29
- file.seek(0) # reset file pointer
26
+ input.close
30
27
  delimcounts.sort_by{|h,v| -v}.first.first
31
28
  end
32
29
 
@@ -35,6 +32,9 @@ class Masticate::Sniffer
35
32
  end
36
33
 
37
34
  def stats
38
- file.lines.map {|line| line.split(col_sep).count}.uniq
35
+ input = open(@filename)
36
+ counts = input.lines.each_with_object(Hash.new(0)) {|line, counts| counts[line.split(col_sep).count] += 1}
37
+ input.close
38
+ counts
39
39
  end
40
40
  end
@@ -1,3 +1,3 @@
1
1
  module Masticate
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
data/lib/masticate.rb CHANGED
@@ -1,18 +1,20 @@
1
- require "masticate/version"
2
- require "masticate/sniffer"
3
- require "masticate/mender"
4
- require "masticate/csvify"
1
+ require "open-uri"
2
+
3
+ require_relative "masticate/version"
4
+ require_relative "masticate/sniffer"
5
+ require_relative "masticate/mender"
6
+ require_relative "masticate/csvify"
5
7
 
6
8
  module Masticate
7
- def self.sniff(file)
8
- Sniffer.new(file).sniff
9
+ def self.sniff(filename)
10
+ Sniffer.new(filename).sniff
9
11
  end
10
12
 
11
- def self.mend(file, opts)
12
- Mender.new(file).mend(opts)
13
+ def self.mend(filename, opts)
14
+ Mender.new(filename).mend(opts)
13
15
  end
14
16
 
15
- def self.csvify(file, opts)
16
- Csvify.new(file).csvify(opts)
17
+ def self.csvify(filename, opts)
18
+ Csvify.new(filename).csvify(opts)
17
19
  end
18
20
  end
data/masticate.gemspec CHANGED
@@ -16,5 +16,7 @@ Gem::Specification.new do |gem|
16
16
  gem.require_paths = ["lib"]
17
17
  gem.version = Masticate::VERSION
18
18
 
19
- gem.add_development_dependency "rspec"
19
+ gem.add_development_dependency "rspec", "~> 2.9.0"
20
+ gem.add_development_dependency "guard-rspec", "~> 0.7.0"
21
+ gem.add_development_dependency "ruby_gntp", "~> 0.3.4"
20
22
  end
@@ -1,6 +1,6 @@
1
1
  COL1|COL 2|Col 3 |col-4| col5 |col6
2
- data| data |data |d a t a|data|data
3
- data| data |data |d a t a|data|data
4
- data| data |this long row
2
+ data1| data |data |d a t a|data|data
3
+ data2| data |data |d a t a|data|data
4
+ data3| data |this long row
5
5
  is split across lines|d a t a|data|data
6
- data| data |data |d a t a|data|data
6
+ data4| data |data |d a t a|data|data
@@ -0,0 +1,8 @@
1
+ COL1|COL 2|Col 3 |col-4| col5 |col6
2
+ one| data |data |d a t a|data|data
3
+ two| data |data |d a t a|data|data
4
+ three| data |data |d a t a|data,data|data
5
+ four| data |data "more data" |d a t a|data|data
6
+ trailer|123
7
+
8
+
@@ -0,0 +1,17 @@
1
+ # spec for file-sniffing functions
2
+
3
+ require "spec_helper"
4
+ require "tempfile"
5
+
6
+ describe "csvification" do
7
+ it "should convert pipes to standard commas" do
8
+ filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
9
+ tmp = Tempfile.new('csvify')
10
+ results = Masticate.csvify(filename, :output => tmp, :col_sep => '|')
11
+ output = File.read(tmp)
12
+ tmp.unlink
13
+ output.lines.count.should == 5
14
+ results[:input_count].should == 5
15
+ results[:output_count].should == 5
16
+ end
17
+ end
@@ -0,0 +1,20 @@
1
+ # spec for file-sniffing functions
2
+
3
+ require "spec_helper"
4
+
5
+ describe "mending" do
6
+ it "should merge lines when delimiter counts don't match'" do
7
+ filename = File.dirname(__FILE__) + "/../data/broken_psv.txt"
8
+ results = Masticate.mend(filename, :col_sep => '|', :output => "/dev/null")
9
+ results[:input_records].should == 7
10
+ results[:output_records].should == 5
11
+ end
12
+
13
+ it "should strip trailer records" do
14
+ filename = File.dirname(__FILE__) + "/../data/junk_trailer.txt"
15
+ metadata = Masticate.sniff(filename)
16
+ results = Masticate.mend(filename, metadata.merge(:output => "/dev/null"))
17
+ results[:input_records].should == 9
18
+ results[:output_records].should == 5
19
+ end
20
+ end
@@ -5,17 +5,15 @@ require "spec_helper"
5
5
  describe "delimiter sniffing" do
6
6
  it "should find tab delimiter" do
7
7
  filename = File.dirname(__FILE__) + "/../data/tabbed_data.txt"
8
- file = File.open(filename)
9
- results = Masticate.sniff(file)
8
+ results = Masticate.sniff(filename)
10
9
  results[:col_sep].should == "\t"
11
- results[:field_counts].should == [6]
10
+ results[:field_counts].should == {6 => 5}
12
11
  end
13
12
 
14
13
  it "should find pipe delimiter" do
15
14
  filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
16
- file = File.open(filename)
17
- results = Masticate.sniff(file)
15
+ results = Masticate.sniff(filename)
18
16
  results[:col_sep].should == '|'
19
- results[:field_counts].should == [6]
17
+ results[:field_counts].should == {6 => 5}
20
18
  end
21
19
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: masticate
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,19 +9,41 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-03 00:00:00.000000000 Z
12
+ date: 2012-04-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &2151740780 !ruby/object:Gem::Requirement
16
+ requirement: &2153254280 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
- - - ! '>='
19
+ - - ~>
20
20
  - !ruby/object:Gem::Version
21
- version: '0'
21
+ version: 2.9.0
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2151740780
24
+ version_requirements: *2153254280
25
+ - !ruby/object:Gem::Dependency
26
+ name: guard-rspec
27
+ requirement: &2153246900 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ~>
31
+ - !ruby/object:Gem::Version
32
+ version: 0.7.0
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *2153246900
36
+ - !ruby/object:Gem::Dependency
37
+ name: ruby_gntp
38
+ requirement: &2153246180 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ~>
42
+ - !ruby/object:Gem::Version
43
+ version: 0.3.4
44
+ type: :development
45
+ prerelease: false
46
+ version_requirements: *2153246180
25
47
  description: Data file crunching
26
48
  email:
27
49
  - jmay@pobox.com
@@ -32,6 +54,7 @@ extra_rdoc_files: []
32
54
  files:
33
55
  - .gitignore
34
56
  - Gemfile
57
+ - Guardfile
35
58
  - LICENSE
36
59
  - README.md
37
60
  - Rakefile
@@ -43,11 +66,12 @@ files:
43
66
  - lib/masticate/version.rb
44
67
  - masticate.gemspec
45
68
  - spec/data/broken_psv.txt
69
+ - spec/data/junk_trailer.txt
46
70
  - spec/data/pipe_data.txt
47
71
  - spec/data/tabbed_data.txt
48
- - spec/spec/csvify_spec.rb
49
- - spec/spec/mend_spec.rb
50
- - spec/spec/sniffer_spec.rb
72
+ - spec/lib/csvify_spec.rb
73
+ - spec/lib/mend_spec.rb
74
+ - spec/lib/sniffer_spec.rb
51
75
  - spec/spec_helper.rb
52
76
  homepage: ''
53
77
  licenses: []
@@ -75,10 +99,11 @@ specification_version: 3
75
99
  summary: Utility functions for parsing incoming text data files.
76
100
  test_files:
77
101
  - spec/data/broken_psv.txt
102
+ - spec/data/junk_trailer.txt
78
103
  - spec/data/pipe_data.txt
79
104
  - spec/data/tabbed_data.txt
80
- - spec/spec/csvify_spec.rb
81
- - spec/spec/mend_spec.rb
82
- - spec/spec/sniffer_spec.rb
105
+ - spec/lib/csvify_spec.rb
106
+ - spec/lib/mend_spec.rb
107
+ - spec/lib/sniffer_spec.rb
83
108
  - spec/spec_helper.rb
84
109
  has_rdoc:
@@ -1,14 +0,0 @@
1
- # spec for file-sniffing functions
2
-
3
- require "spec_helper"
4
-
5
- describe "csvification" do
6
- it "should convert pipes to standard commas" do
7
- filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
8
- file = File.open(filename)
9
- strio = StringIO.new
10
- Masticate.csvify(file, :output => strio, :col_sep => '|')
11
- strio.close
12
- strio.string.lines.count.should == 5
13
- end
14
- end
@@ -1,14 +0,0 @@
1
- # spec for file-sniffing functions
2
-
3
- require "spec_helper"
4
-
5
- describe "mending" do
6
- it "should merge lines when delimiter counts don't match'" do
7
- filename = File.dirname(__FILE__) + "/../data/broken_psv.txt"
8
- file = File.open(filename)
9
- devnull = File.open('/dev/null', 'w')
10
- results = Masticate.mend(file, :output => devnull, :col_sep => '|')
11
- results[:input_records].should == 6
12
- results[:output_records].should == 5
13
- end
14
- end