masticate 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/README.md +10 -2
- data/bin/masticate +45 -0
- data/lib/masticate/csvify.rb +16 -0
- data/lib/masticate/mender.rb +49 -0
- data/lib/masticate/version.rb +1 -1
- data/lib/masticate.rb +10 -0
- data/spec/data/broken_psv.txt +6 -0
- data/spec/data/pipe_data.txt +1 -1
- data/spec/spec/csvify_spec.rb +14 -0
- data/spec/spec/mend_spec.rb +14 -0
- metadata +14 -4
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# Masticate
|
2
2
|
|
3
|
-
|
3
|
+
Data file crunching
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
@@ -18,7 +18,15 @@ Or install it yourself as:
|
|
18
18
|
|
19
19
|
## Usage
|
20
20
|
|
21
|
-
|
21
|
+
masticate sniff filename
|
22
|
+
masticate mend filename
|
23
|
+
|
24
|
+
or
|
25
|
+
|
26
|
+
> require 'masticate'
|
27
|
+
> f = File.open(filename)
|
28
|
+
> Masticate.sniff(f)
|
29
|
+
> Masticate.mend(f, :output => $stdout, :col_sep => "\t")
|
22
30
|
|
23
31
|
## Contributing
|
24
32
|
|
data/bin/masticate
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Command-line front end for the masticate library.
#
# Usage:
#   masticate sniff  FILENAME   # report the detected delimiter and field counts
#   masticate mend   FILENAME   # re-join records split across lines
#   masticate csvify FILENAME   # rewrite the input as standard CSV
#
# Converted data is written to $stdout; the processing summary is written to
# $stderr so the two streams can be redirected independently.

require "masticate"

# Destructure without mutating ARGV; both values are needed below.
command, filename = ARGV

# A literal tab is invisible in the summary, so it is spelled out as "TAB".
def printable_delimiter(col_sep)
  col_sep == "\t" ? "TAB" : col_sep
end

# Runs Masticate.sniff on +filename+ and returns its result. The block form of
# File.open closes the handle even when sniffing raises.
def sniff_file(filename)
  File.open(filename) { |file| Masticate.sniff(file) }
end

raise "unknown command #{command}" unless %w[sniff mend csvify].include?(command)
# Fail with a readable message instead of a TypeError from File.open(nil).
raise "missing filename for command #{command}" unless filename

case command
when 'sniff'
  results = sniff_file(filename)
  $stderr.puts <<-EOT
Processing complete.
Input delimiter: #{printable_delimiter(results[:col_sep])}
Field counts: #{results[:field_counts].join(',')}
  EOT

when 'mend'
  metadata = sniff_file(filename)
  # Sniffing consumed its own handle, so mending reads from a fresh one.
  results = File.open(filename) do |file|
    Masticate.mend(file, metadata.merge(:output => $stdout))
  end
  $stderr.puts <<-EOT
Processing complete.
Input delimiter: #{printable_delimiter(metadata[:col_sep])}
Lines in input: #{results[:input_records]}
Lines in output: #{results[:output_records]}
  EOT

when 'csvify'
  metadata = sniff_file(filename)
  # Block form so the handle is closed once conversion finishes or fails.
  File.open(filename) do |file|
    Masticate.csvify(file, metadata.merge(:output => $stdout))
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# convert input to clean standard CSV
|
2
|
+
require "csv"
|
3
|
+
|
4
|
+
module Masticate
  # Converts delimited input into clean, standard (comma-separated) CSV.
  #
  # Declared inside `module Masticate` rather than as `class Masticate::Csvify`
  # so that this file can be loaded even before the namespace exists.
  class Csvify
    attr_reader :file

    # file - the input to convert: either an open IO-like object (File,
    #        StringIO, ...) or a value CSV.foreach accepts as a path.
    def initialize(file)
      @file = file
    end

    # Parses the input and appends each row, re-encoded as a CSV line, to the
    # output.
    #
    # opts - Hash of options:
    #        :col_sep - delimiter used by the input (falls back to "," when
    #                   absent or nil)
    #        :output  - destination; must respond to `<<` (IO, StringIO, ...)
    def csvify(opts)
      output = opts[:output]
      col_sep = opts[:col_sep] || ','
      each_row(col_sep) { |row| output << row.to_csv }
    end

    private

    # Yields every parsed row. An IO-like input is read directly from its
    # current position, so in-memory and piped input work too; anything else
    # is handed to CSV.foreach as a path.
    def each_row(col_sep, &block)
      if file.respond_to?(:gets)
        CSV.new(file, :col_sep => col_sep).each(&block)
      else
        CSV.foreach(file, :col_sep => col_sep, &block)
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# repair delimited input files
|
2
|
+
#
|
3
|
+
# A row that contains fewer delimiters than expected has been split across two lines
|
4
|
+
# (due to a newline embedded in a field). Glue those two lines into a single line in the output.
|
5
|
+
|
6
|
+
module Masticate
  # Repairs delimited input in which a record has been broken across several
  # physical lines (for example by a newline embedded in a field).
  #
  # The first line is trusted to hold the correct number of delimiters. Any
  # later line with fewer delimiters is glued to the following line(s), joined
  # by a single space, until the expected count is reached.
  #
  # Declared inside `module Masticate` rather than as `class Masticate::Mender`
  # so that this file can be loaded even before the namespace exists.
  class Mender
    attr_reader :file

    # file - an open IO-like object that responds to `gets`.
    def initialize(file)
      @file = file
      @input_count = 0
    end

    # Reads every line from the input and writes the mended records to the
    # output.
    #
    # opts - Hash of options:
    #        :output  - destination; must respond to `<<`
    #        :col_sep - delimiter separating the fields
    #
    # Returns a Hash with :input_records (physical lines read) and
    # :output_records (records written).
    def mend(opts)
      output = opts[:output]
      col_sep = opts[:col_sep]

      expected_count = nil
      @input_count = 0
      output_count = 0

      while (line = get)
        if expected_count
          line = complete_record(line, col_sep, expected_count)
        else
          # trust the first row
          expected_count = delimiter_count(line, col_sep)
        end
        output << line
        output_count += 1
      end

      {
        :input_records => @input_count,
        :output_records => output_count
      }
    end

    # Reads the next physical line, counting it as an input record. Returns
    # nil at end of input.
    def get
      line = file.gets
      @input_count += 1 if line
      line
    end

    private

    # Appends following lines to +line+ until it holds at least
    # +expected_count+ delimiters. If the input ends first, the fragment
    # gathered so far is returned rather than discarded, so no data is lost.
    def complete_record(line, col_sep, expected_count)
      while delimiter_count(line, col_sep) < expected_count
        nextbit = get
        break unless nextbit
        line = line.chomp + ' ' + nextbit
      end
      line
    end

    # Counts literal occurrences of +col_sep+. String#count is avoided because
    # it treats its argument as a character set, which miscounts separators
    # longer than one character.
    def delimiter_count(line, col_sep)
      line.scan(col_sep).size
    end
  end
end
|
data/lib/masticate/version.rb
CHANGED
data/lib/masticate.rb
CHANGED
@@ -1,8 +1,18 @@
|
|
1
1
|
require "masticate/version"
|
2
2
|
require "masticate/sniffer"
|
3
|
+
require "masticate/mender"
|
4
|
+
require "masticate/csvify"
|
3
5
|
|
4
6
|
# Convenience entry points: each one builds the matching worker object around
# +file+ and runs it.
module Masticate
  class << self
    # Delegates to Sniffer#sniff for +file+ and returns its result.
    def sniff(file)
      sniffer = Sniffer.new(file)
      sniffer.sniff
    end

    # Delegates to Mender#mend for +file+, passing +opts+ through unchanged.
    def mend(file, opts)
      mender = Mender.new(file)
      mender.mend(opts)
    end

    # Delegates to Csvify#csvify for +file+, passing +opts+ through unchanged.
    def csvify(file, opts)
      converter = Csvify.new(file)
      converter.csvify(opts)
    end
  end
end
|
data/spec/data/pipe_data.txt
CHANGED
@@ -0,0 +1,14 @@
|
|
1
|
+
# spec for CSV-conversion (csvify) functions
|
2
|
+
|
3
|
+
require "spec_helper"
|
4
|
+
|
5
|
+
describe "csvification" do
  it "should convert pipes to standard commas" do
    filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
    strio = StringIO.new
    # Block form closes the fixture handle even when the conversion raises.
    File.open(filename) do |file|
      Masticate.csvify(file, :output => strio, :col_sep => '|')
    end
    strio.string.lines.count.should == 5
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# spec for file-mending functions
|
2
|
+
|
3
|
+
require "spec_helper"
|
4
|
+
|
5
|
+
describe "mending" do
  it "should merge lines when delimiter counts don't match" do
    filename = File.dirname(__FILE__) + "/../data/broken_psv.txt"
    # Only the record counts matter here, so the mended output is discarded.
    # File::NULL names the platform's null device, and the block forms close
    # both handles even when mending raises.
    results = File.open(filename) do |file|
      File.open(File::NULL, 'w') do |devnull|
        Masticate.mend(file, :output => devnull, :col_sep => '|')
      end
    end
    results[:input_records].should == 6
    results[:output_records].should == 5
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: masticate
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-04-03 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &2151845500 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,11 +21,12 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2151845500
|
25
25
|
description: Data file crunching
|
26
26
|
email:
|
27
27
|
- jmay@pobox.com
|
28
|
-
executables: []
|
28
|
+
executables:
|
29
|
+
- masticate
|
29
30
|
extensions: []
|
30
31
|
extra_rdoc_files: []
|
31
32
|
files:
|
@@ -34,12 +35,18 @@ files:
|
|
34
35
|
- LICENSE
|
35
36
|
- README.md
|
36
37
|
- Rakefile
|
38
|
+
- bin/masticate
|
37
39
|
- lib/masticate.rb
|
40
|
+
- lib/masticate/csvify.rb
|
41
|
+
- lib/masticate/mender.rb
|
38
42
|
- lib/masticate/sniffer.rb
|
39
43
|
- lib/masticate/version.rb
|
40
44
|
- masticate.gemspec
|
45
|
+
- spec/data/broken_psv.txt
|
41
46
|
- spec/data/pipe_data.txt
|
42
47
|
- spec/data/tabbed_data.txt
|
48
|
+
- spec/spec/csvify_spec.rb
|
49
|
+
- spec/spec/mend_spec.rb
|
43
50
|
- spec/spec/sniffer_spec.rb
|
44
51
|
- spec/spec_helper.rb
|
45
52
|
homepage: ''
|
@@ -67,8 +74,11 @@ signing_key:
|
|
67
74
|
specification_version: 3
|
68
75
|
summary: Utility functions for parsing incoming text data files.
|
69
76
|
test_files:
|
77
|
+
- spec/data/broken_psv.txt
|
70
78
|
- spec/data/pipe_data.txt
|
71
79
|
- spec/data/tabbed_data.txt
|
80
|
+
- spec/spec/csvify_spec.rb
|
81
|
+
- spec/spec/mend_spec.rb
|
72
82
|
- spec/spec/sniffer_spec.rb
|
73
83
|
- spec/spec_helper.rb
|
74
84
|
has_rdoc:
|