masticate 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -16,3 +16,4 @@ test/tmp
16
16
  test/version_tmp
17
17
  tmp
18
18
  .rspec
19
+ .irb_history
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Masticate
2
2
 
3
- TODO: Write a gem description
3
+ Data file crunching
4
4
 
5
5
  ## Installation
6
6
 
@@ -18,7 +18,15 @@ Or install it yourself as:
18
18
 
19
19
  ## Usage
20
20
 
21
- TODO: Write usage instructions here
21
+ masticate sniff filename
22
+ masticate mend filename
+ masticate csvify filename
23
+
24
+ or
25
+
26
+ > require 'masticate'
27
+ > f = File.open(filename)
28
+ > Masticate.sniff(f)
29
+ > Masticate.mend(f, :output => $stdout, :col_sep => "\t")
22
30
 
23
31
  ## Contributing
24
32
 
data/bin/masticate ADDED
@@ -0,0 +1,45 @@
1
#!/usr/bin/env ruby

# Command-line front end for Masticate.
#
# Usage: masticate <sniff|mend|csvify> <filename>
#
# Converted data is written to $stdout; the processing summary is written to
# $stderr so it never mixes with the data stream.

require "masticate"

command, filename = ARGV

# Sniff the file at +path+ and return the metadata hash from Masticate.sniff.
# The block form of File.open closes the handle even if sniffing raises.
def sniff_metadata(path)
  File.open(path) { |file| Masticate.sniff(file) }
end

# Printable name for a delimiter: a literal tab would be invisible in the summary.
def delimiter_label(col_sep)
  col_sep == "\t" ? "TAB" : col_sep
end

case command
when 'sniff'
  results = sniff_metadata(filename)
  $stderr.puts <<-EOT
Processing complete.
Input delimiter: #{delimiter_label(results[:col_sep])}
Field counts: #{results[:field_counts].join(',')}
  EOT

when 'mend'
  metadata = sniff_metadata(filename)
  # The file is opened a second time, as before, so mending gets a fresh handle
  # (presumably sniffing consumes the first one -- confirm against Sniffer).
  results = File.open(filename) do |file|
    Masticate.mend(file, metadata.merge(:output => $stdout))
  end
  $stderr.puts <<-EOT
Processing complete.
Input delimiter: #{delimiter_label(metadata[:col_sep])}
Lines in input: #{results[:input_records]}
Lines in output: #{results[:output_records]}
  EOT

when 'csvify'
  metadata = sniff_metadata(filename)
  # Previously this handle was opened and never closed.
  File.open(filename) do |file|
    Masticate.csvify(file, metadata.merge(:output => $stdout))
  end

else
  raise "unknown command #{command}"
end
@@ -0,0 +1,16 @@
1
# convert input to clean standard CSV
require "csv"

module Masticate
  # Re-emits delimited input as standard comma-separated CSV.
  class Csvify
    attr_reader :file

    # file - an open, readable IO containing the delimited data.
    def initialize(file)
      @file = file
    end

    # Parse the input using opts[:col_sep] as the field separator and append
    # each row, re-encoded as a standard CSV line, to opts[:output].
    #
    # opts[:col_sep] - field separator of the input
    # opts[:output]  - any object responding to << (an IO, StringIO, String, ...)
    #
    # The return value is not meaningful.
    def csvify(opts)
      # CSV.foreach is a path-based API; CSV.new reads directly from the IO we
      # were handed, so non-file streams such as StringIO work as well.
      CSV.new(file, :col_sep => opts[:col_sep]).each do |row|
        opts[:output] << row.to_csv
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
# repair delimited input files
#
# A row that contains fewer delimiters than expected has been split across two lines
# (due to a newline embedded in a field). Glue those two lines into a single line in the output.

module Masticate
  class Mender
    attr_reader :file

    # file - an open, readable IO containing the delimited data.
    def initialize(file)
      @file = file
      # Initialised here so #get is safe to call before #mend.
      @input_count = 0
    end

    # Copy the input to opts[:output], joining lines that were split mid-record.
    #
    # opts[:output]  - any object responding to <<
    # opts[:col_sep] - field separator of the input
    #
    # The first line is trusted to contain the correct number of separators.
    # Any later line with fewer separators is joined (with a single space in
    # place of the line break) to the following line(s) until the count is reached.
    #
    # Returns a Hash with :input_records (lines read) and :output_records
    # (lines written).
    def mend(opts)
      output = opts[:output]
      col_sep = opts[:col_sep]

      expected_count = nil
      @input_count = 0
      output_count = 0

      while (line = get)
        if expected_count
          # Keep gluing until the record is complete. If the input ends first,
          # emit what has been accumulated instead of discarding it.
          while delimiter_count(line, col_sep) < expected_count && (fragment = get)
            line = line.chomp + ' ' + fragment
          end
        else
          # trust the first row
          expected_count = delimiter_count(line, col_sep)
        end

        output << line
        output_count += 1
      end

      {
        :input_records => @input_count,
        :output_records => output_count
      }
    end

    # Read the next line from the input, counting it as an input record.
    # Returns nil at end of input.
    def get
      line = file.gets
      @input_count += 1 if line
      line
    end

    private

    # Number of occurrences of the separator in +line+. String#count would
    # treat col_sep as a set of characters, miscounting multi-character
    # separators; String#scan matches the separator as a whole.
    def delimiter_count(line, col_sep)
      line.scan(col_sep).length
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Masticate
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
data/lib/masticate.rb CHANGED
@@ -1,8 +1,18 @@
1
1
  require "masticate/version"
2
2
  require "masticate/sniffer"
3
+ require "masticate/mender"
4
+ require "masticate/csvify"
3
5
 
4
6
  module Masticate
5
7
  def self.sniff(file)
6
8
  Sniffer.new(file).sniff
7
9
  end
10
+
11
+ def self.mend(file, opts)
12
+ Mender.new(file).mend(opts)
13
+ end
14
+
15
+ def self.csvify(file, opts)
16
+ Csvify.new(file).csvify(opts)
17
+ end
8
18
  end
@@ -0,0 +1,6 @@
1
+ COL1|COL 2|Col 3 |col-4| col5 |col6
2
+ data| data |data |d a t a|data|data
3
+ data| data |data |d a t a|data|data
4
+ data| data |this long row
5
+ is split across lines|d a t a|data|data
6
+ data| data |data |d a t a|data|data
@@ -1,5 +1,5 @@
1
1
  COL1|COL 2|Col 3 |col-4| col5 |col6
2
2
  data| data |data |d a t a|data|data
3
3
  data| data |data |d a t a|data|data
4
- data| data |data |d a t a|data|data
4
+ data| data |data |d a t a|data,data|data
5
5
  data| data |data |d a t a|data|data
@@ -0,0 +1,14 @@
1
# spec for CSV conversion

require "spec_helper"

describe "csvification" do
  it "should convert pipes to standard commas" do
    filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
    strio = StringIO.new
    # Block form closes the fixture file even when the example fails.
    File.open(filename) do |file|
      Masticate.csvify(file, :output => strio, :col_sep => '|')
    end
    strio.string.lines.count.should == 5
  end
end
@@ -0,0 +1,14 @@
1
# spec for line-mending functions

require "spec_helper"

describe "mending" do
  it "should merge lines when delimiter counts don't match" do
    filename = File.dirname(__FILE__) + "/../data/broken_psv.txt"
    # An in-memory sink avoids depending on /dev/null and leaves no handle open.
    output = StringIO.new
    results = File.open(filename) do |file|
      Masticate.mend(file, :output => output, :col_sep => '|')
    end
    results[:input_records].should == 6
    results[:output_records].should == 5
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: masticate
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-04-03 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &2153533260 !ruby/object:Gem::Requirement
16
+ requirement: &2151845500 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,11 +21,12 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2153533260
24
+ version_requirements: *2151845500
25
25
  description: Data file crunching
26
26
  email:
27
27
  - jmay@pobox.com
28
- executables: []
28
+ executables:
29
+ - masticate
29
30
  extensions: []
30
31
  extra_rdoc_files: []
31
32
  files:
@@ -34,12 +35,18 @@ files:
34
35
  - LICENSE
35
36
  - README.md
36
37
  - Rakefile
38
+ - bin/masticate
37
39
  - lib/masticate.rb
40
+ - lib/masticate/csvify.rb
41
+ - lib/masticate/mender.rb
38
42
  - lib/masticate/sniffer.rb
39
43
  - lib/masticate/version.rb
40
44
  - masticate.gemspec
45
+ - spec/data/broken_psv.txt
41
46
  - spec/data/pipe_data.txt
42
47
  - spec/data/tabbed_data.txt
48
+ - spec/spec/csvify_spec.rb
49
+ - spec/spec/mend_spec.rb
43
50
  - spec/spec/sniffer_spec.rb
44
51
  - spec/spec_helper.rb
45
52
  homepage: ''
@@ -67,8 +74,11 @@ signing_key:
67
74
  specification_version: 3
68
75
  summary: Utility functions for parsing incoming text data files.
69
76
  test_files:
77
+ - spec/data/broken_psv.txt
70
78
  - spec/data/pipe_data.txt
71
79
  - spec/data/tabbed_data.txt
80
+ - spec/spec/csvify_spec.rb
81
+ - spec/spec/mend_spec.rb
72
82
  - spec/spec/sniffer_spec.rb
73
83
  - spec/spec_helper.rb
74
84
  has_rdoc: