masticate 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -16,3 +16,4 @@ test/tmp
16
16
  test/version_tmp
17
17
  tmp
18
18
  .rspec
19
+ .irb_history
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Masticate
2
2
 
3
- TODO: Write a gem description
3
+ Data file crunching
4
4
 
5
5
  ## Installation
6
6
 
@@ -18,7 +18,15 @@ Or install it yourself as:
18
18
 
19
19
  ## Usage
20
20
 
21
- TODO: Write usage instructions here
21
+ masticate sniff filename
22
+ masticate mend filename
23
+
24
+ or
25
+
26
+ > require 'masticate'
27
+ > f = File.open(filename)
28
+ > Masticate.sniff(f)
29
+ > Masticate.mend(f, :output => $stdout, :col_sep => "\t")
22
30
 
23
31
  ## Contributing
24
32
 
data/bin/masticate ADDED
@@ -0,0 +1,45 @@
1
#!/usr/bin/env ruby
# Command-line front end for the Masticate library.
#
# Usage:
#   masticate sniff  FILENAME   # report the detected delimiter and field counts
#   masticate mend   FILENAME   # write repaired rows to stdout
#   masticate csvify FILENAME   # write the input as standard CSV to stdout
#
# Summaries go to stderr so that stdout carries only data.

require "masticate"

command, filename = ARGV

# Fail early with a readable message instead of a TypeError from File.open(nil).
abort "usage: masticate (sniff|mend|csvify) filename" unless command && filename

# A literal tab is invisible in the summary, so it is shown as the word TAB.
def printable_delimiter(col_sep)
  col_sep == "\t" ? "TAB" : col_sep
end

# Runs the sniffer over +path+ and returns its metadata hash. The block form
# of File.open closes the handle even when sniffing raises.
def sniff_file(path)
  File.open(path) { |file| Masticate.sniff(file) }
end

case command
when 'sniff'
  results = sniff_file(filename)
  $stderr.puts <<-EOT
Processing complete.
Input delimiter: #{printable_delimiter(results[:col_sep])}
Field counts: #{results[:field_counts].join(',')}
  EOT

when 'mend'
  metadata = sniff_file(filename)
  # Mending uses a fresh handle, as the original script did after sniffing.
  results = File.open(filename) do |file|
    Masticate.mend(file, metadata.merge(:output => $stdout))
  end
  $stderr.puts <<-EOT
Processing complete.
Input delimiter: #{printable_delimiter(metadata[:col_sep])}
Lines in input: #{results[:input_records]}
Lines in output: #{results[:output_records]}
  EOT

when 'csvify'
  metadata = sniff_file(filename)
  # Previously this second handle was never closed.
  File.open(filename) do |file|
    Masticate.csvify(file, metadata.merge(:output => $stdout))
  end

else
  raise "unknown command #{command}"
end
@@ -0,0 +1,16 @@
1
+ # convert input to clean standard CSV
2
+ require "csv"
3
+
4
# Converts delimited input into standard comma-separated CSV.
#
# Written in nested module form so the file also loads when Masticate has not
# been defined yet.
module Masticate
  class Csvify
    attr_reader :file

    # file - an open, readable IO-like object (File, StringIO, ...) holding
    #        the delimited input.
    def initialize(file)
      @file = file
    end

    # Re-emits every input row as a standard CSV line.
    #
    # opts - Hash of options:
    #        :output  - destination responding to <<; receives one CSV line
    #                   per parsed row.
    #        :col_sep - delimiter used by the input; defaults to "," when
    #                   absent.
    #
    # Rows are read from the given IO itself, starting at its current
    # position, so any IO-like source works and not only one backed by a path.
    def csvify(opts)
      output = opts[:output]
      col_sep = opts[:col_sep] || ","

      CSV.new(file, :col_sep => col_sep).each do |row|
        output << row.to_csv
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # repair delimited input files
2
+ #
3
+ # A row that contains fewer delimiters than expected has been split across two lines
4
+ # (due to a newline embedded in a field). Glue those two lines into a single line in the output.
5
+
6
# Repairs delimited input in which a row was split across physical lines by a
# newline embedded in a field.
#
# Written in nested module form so the file also loads when Masticate has not
# been defined yet.
module Masticate
  class Mender
    attr_reader :file

    # file - an open, readable IO-like object responding to #gets.
    def initialize(file)
      @file = file
      @input_count = 0 # lets #get be called safely before #mend
    end

    # Copies the input to the output, gluing split rows back together.
    #
    # The first line is trusted: its delimiter count becomes the expected
    # count for every later row. A later line with fewer delimiters is joined
    # (with a single space) to the following lines until the count is reached.
    # If the input ends before the count is reached, the partial row is
    # written as-is rather than dropped.
    #
    # opts - Hash of options:
    #        :output  - destination responding to <<; receives each row.
    #        :col_sep - delimiter String. It is passed to String#count, which
    #                   treats its argument as a set of characters, so this
    #                   is only exact for single-character delimiters.
    #
    # Returns a Hash with :input_records (physical lines read) and
    # :output_records (rows written).
    def mend(opts)
      output = opts[:output]
      col_sep = opts[:col_sep]

      expected_count = nil
      @input_count = 0
      output_count = 0

      while (line = get)
        if expected_count.nil?
          # trust the first row
          expected_count = line.count(col_sep)
        else
          # Keep appending physical lines until the row is complete or the
          # input runs out; `line` is never replaced by nil.
          while line.count(col_sep) < expected_count && (fragment = get)
            line = "#{line.chomp} #{fragment}"
          end
        end

        output << line
        output_count += 1
      end

      {
        :input_records => @input_count,
        :output_records => output_count
      }
    end

    # Reads the next physical line, counting it towards :input_records.
    # Returns nil at end of input.
    def get
      line = file.gets
      @input_count += 1 if line
      line
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Masticate
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
data/lib/masticate.rb CHANGED
@@ -1,8 +1,18 @@
1
1
  require "masticate/version"
2
2
  require "masticate/sniffer"
3
+ require "masticate/mender"
4
+ require "masticate/csvify"
3
5
 
4
6
  module Masticate
5
7
  def self.sniff(file)
6
8
  Sniffer.new(file).sniff
7
9
  end
10
+
11
+ def self.mend(file, opts)
12
+ Mender.new(file).mend(opts)
13
+ end
14
+
15
+ def self.csvify(file, opts)
16
+ Csvify.new(file).csvify(opts)
17
+ end
8
18
  end
@@ -0,0 +1,6 @@
1
+ COL1|COL 2|Col 3 |col-4| col5 |col6
2
+ data| data |data |d a t a|data|data
3
+ data| data |data |d a t a|data|data
4
+ data| data |this long row
5
+ is split across lines|d a t a|data|data
6
+ data| data |data |d a t a|data|data
@@ -1,5 +1,5 @@
1
1
  COL1|COL 2|Col 3 |col-4| col5 |col6
2
2
  data| data |data |d a t a|data|data
3
3
  data| data |data |d a t a|data|data
4
- data| data |data |d a t a|data|data
4
+ data| data |data |d a t a|data,data|data
5
5
  data| data |data |d a t a|data|data
@@ -0,0 +1,14 @@
1
+ # spec for CSV conversion functions
2
+
3
+ require "spec_helper"
4
+
5
describe "csvification" do
  # Feeds the pipe-delimited fixture through Masticate.csvify and checks the
  # number of CSV lines written; the fixture is expected to yield five.
  it "should convert pipes to standard commas" do
    filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
    strio = StringIO.new

    # Block form closes the fixture even if csvify raises.
    File.open(filename) do |file|
      Masticate.csvify(file, :output => strio, :col_sep => '|')
    end

    strio.string.lines.count.should == 5
  end
end
@@ -0,0 +1,14 @@
1
+ # spec for file-mending functions
2
+
3
+ require "spec_helper"
4
+
5
describe "mending" do
  # Runs Masticate.mend over the broken pipe-separated fixture and checks the
  # record counts it reports: six physical lines in, five rows out.
  it "should merge lines when delimiter counts don't match'" do
    filename = File.dirname(__FILE__) + "/../data/broken_psv.txt"

    # Block forms close both handles even if mend raises; File::NULL is the
    # platform's null device, so the spec does not depend on /dev/null.
    results = File.open(filename) do |file|
      File.open(File::NULL, 'w') do |devnull|
        Masticate.mend(file, :output => devnull, :col_sep => '|')
      end
    end

    results[:input_records].should == 6
    results[:output_records].should == 5
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: masticate
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-04-03 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &2153533260 !ruby/object:Gem::Requirement
16
+ requirement: &2151845500 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,11 +21,12 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2153533260
24
+ version_requirements: *2151845500
25
25
  description: Data file crunching
26
26
  email:
27
27
  - jmay@pobox.com
28
- executables: []
28
+ executables:
29
+ - masticate
29
30
  extensions: []
30
31
  extra_rdoc_files: []
31
32
  files:
@@ -34,12 +35,18 @@ files:
34
35
  - LICENSE
35
36
  - README.md
36
37
  - Rakefile
38
+ - bin/masticate
37
39
  - lib/masticate.rb
40
+ - lib/masticate/csvify.rb
41
+ - lib/masticate/mender.rb
38
42
  - lib/masticate/sniffer.rb
39
43
  - lib/masticate/version.rb
40
44
  - masticate.gemspec
45
+ - spec/data/broken_psv.txt
41
46
  - spec/data/pipe_data.txt
42
47
  - spec/data/tabbed_data.txt
48
+ - spec/spec/csvify_spec.rb
49
+ - spec/spec/mend_spec.rb
43
50
  - spec/spec/sniffer_spec.rb
44
51
  - spec/spec_helper.rb
45
52
  homepage: ''
@@ -67,8 +74,11 @@ signing_key:
67
74
  specification_version: 3
68
75
  summary: Utility functions for parsing incoming text data files.
69
76
  test_files:
77
+ - spec/data/broken_psv.txt
70
78
  - spec/data/pipe_data.txt
71
79
  - spec/data/tabbed_data.txt
80
+ - spec/spec/csvify_spec.rb
81
+ - spec/spec/mend_spec.rb
72
82
  - spec/spec/sniffer_spec.rb
73
83
  - spec/spec_helper.rb
74
84
  has_rdoc: