masticate 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/README.md +10 -2
- data/bin/masticate +45 -0
- data/lib/masticate/csvify.rb +16 -0
- data/lib/masticate/mender.rb +49 -0
- data/lib/masticate/version.rb +1 -1
- data/lib/masticate.rb +10 -0
- data/spec/data/broken_psv.txt +6 -0
- data/spec/data/pipe_data.txt +1 -1
- data/spec/spec/csvify_spec.rb +14 -0
- data/spec/spec/mend_spec.rb +14 -0
- metadata +14 -4
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# Masticate
|
2
2
|
|
3
|
-
|
3
|
+
Data file crunching
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
@@ -18,7 +18,15 @@ Or install it yourself as:
|
|
18
18
|
|
19
19
|
## Usage
|
20
20
|
|
21
|
-
|
21
|
+
masticate sniff filename
|
22
|
+
masticate mend filename
|
23
|
+
|
24
|
+
or
|
25
|
+
|
26
|
+
> require 'masticate'
|
27
|
+
> f = File.open(filename)
|
28
|
+
> Masticate.sniff(f)
|
29
|
+
> Masticate.mend(f, :output => $stdout, :col_sep => "\t")
|
22
30
|
|
23
31
|
## Contributing
|
24
32
|
|
data/bin/masticate
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line entry point for the masticate gem.
#
# Usage: masticate (sniff|mend|csvify) filename
#
#   sniff  - report the detected input delimiter and the field counts
#   mend   - write the repaired file to stdout, then report line counts
#   csvify - write the file to stdout as standard CSV
#
# Summaries go to stderr so that stdout carries only data.

require "masticate"

USAGE = "usage: masticate (sniff|mend|csvify) filename"
COMMANDS = %w[sniff mend csvify]

# A literal tab is invisible in the summary, so it is shown by name.
def display_sep(col_sep)
  col_sep == "\t" ? "TAB" : col_sep
end

# Sniffs the file through its own handle; the block form of File.open closes
# the handle even when sniffing raises.
def sniff_file(filename)
  File.open(filename) { |file| Masticate.sniff(file) }
end

command = ARGV.shift
filename = ARGV.first

# Fail with a readable message (exit status 1) rather than a backtrace.
abort "unknown command #{command}\n#{USAGE}" unless COMMANDS.include?(command)
abort "missing filename\n#{USAGE}" unless filename

case command
when 'sniff'
  results = sniff_file(filename)
  $stderr.puts <<-EOT
Processing complete.
Input delimiter: #{display_sep(results[:col_sep])}
Field counts: #{results[:field_counts].join(',')}
  EOT

when 'mend'
  metadata = sniff_file(filename)
  # The data pass uses a fresh handle, separate from the one used to sniff.
  results = File.open(filename) do |file|
    Masticate.mend(file, metadata.merge(:output => $stdout))
  end
  $stderr.puts <<-EOT
Processing complete.
Input delimiter: #{display_sep(metadata[:col_sep])}
Lines in input: #{results[:input_records]}
Lines in output: #{results[:output_records]}
  EOT

when 'csvify'
  metadata = sniff_file(filename)
  # Block form closes the handle; the original left this one open.
  File.open(filename) do |file|
    Masticate.csvify(file, metadata.merge(:output => $stdout))
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# convert input to clean standard CSV
|
2
|
+
require "csv"
|
3
|
+
|
4
|
+
module Masticate
  # Re-emits delimited input as standard comma-separated CSV.
  class Csvify
    attr_reader :file

    # file - an open IO-like object (anything responding to #gets, such as a
    #        File or StringIO), or a path to a file on disk.
    def initialize(file)
      @file = file
    end

    # Parses the input using opts[:col_sep] as the field delimiter and appends
    # each row, formatted as a CSV line, to opts[:output] via <<.
    #
    # An IO-like input is parsed directly from the handle, so in-memory and
    # non-file streams work. Anything else is treated as a path and opened by
    # CSV.foreach.
    def csvify(opts)
      output = opts[:output]
      col_sep = opts[:col_sep]
      if file.respond_to?(:gets)
        CSV.new(file, :col_sep => col_sep).each { |row| output << row.to_csv }
      else
        CSV.foreach(file, :col_sep => col_sep) { |row| output << row.to_csv }
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# repair delimited input files
|
2
|
+
#
|
3
|
+
# A row that contains fewer delimiters than expected has been split across two lines
|
4
|
+
# (due to a newline embedded in a field). Glue those two lines into a single line in the output.
|
5
|
+
|
6
|
+
module Masticate
  # Repairs delimited input in which a record was split across lines by a
  # newline embedded in a field.
  class Mender
    attr_reader :file

    # file - an open IO-like object that responds to #gets.
    def initialize(file)
      @file = file
    end

    # Copies the input to opts[:output] (via <<), gluing short lines together.
    #
    # The delimiter count of the first line is taken as the expected count.
    # Any later line with fewer delimiters is joined with the lines after it
    # (line ending replaced by a single space) until the count is reached.
    # If the input ends first, the fragment gathered so far is written as the
    # last record rather than discarded.
    #
    # opts[:col_sep] - the delimiter. NOTE: String#count treats its argument
    #                  as a character set, so a multi-character delimiter is
    #                  counted per character, not as a unit.
    #
    # Returns a Hash with :input_records (lines read) and :output_records
    # (lines written).
    def mend(opts)
      output = opts[:output]
      col_sep = opts[:col_sep]

      expected_count = nil
      @input_count = output_count = 0
      while (line = get)
        if expected_count
          while line.count(col_sep) < expected_count
            nextbit = get
            break unless nextbit # end of input: keep what has been gathered
            line = line.chomp + ' ' + nextbit
          end
        else
          # trust the first row
          expected_count = line.count(col_sep)
        end
        output_count += 1
        output << line
      end

      {
        :input_records => @input_count,
        :output_records => output_count
      }
    end

    # Reads the next line from the input, counting it toward :input_records.
    # Returns nil at end of input.
    def get
      line = file.gets
      @input_count += 1 if line
      line
    end
  end
end
|
data/lib/masticate/version.rb
CHANGED
data/lib/masticate.rb
CHANGED
@@ -1,8 +1,18 @@
|
|
1
1
|
require "masticate/version"
|
2
2
|
require "masticate/sniffer"
|
3
|
+
require "masticate/mender"
|
4
|
+
require "masticate/csvify"
|
3
5
|
|
4
6
|
# Module-level convenience API: each method builds the matching worker object
# around the given file and runs it.
module Masticate
  class << self
    # Runs Sniffer#sniff on the file and returns its result.
    def sniff(file)
      sniffer = Sniffer.new(file)
      sniffer.sniff
    end

    # Runs Mender#mend on the file with the given options and returns its
    # result.
    def mend(file, opts)
      mender = Mender.new(file)
      mender.mend(opts)
    end

    # Runs Csvify#csvify on the file with the given options and returns its
    # result.
    def csvify(file, opts)
      converter = Csvify.new(file)
      converter.csvify(opts)
    end
  end
end
|
data/spec/data/pipe_data.txt
CHANGED
@@ -0,0 +1,14 @@
|
|
1
|
+
# spec for CSV conversion (csvify) functions
|
2
|
+
|
3
|
+
require "spec_helper"
|
4
|
+
|
5
|
+
describe "csvification" do
  it "should convert pipes to standard commas" do
    filename = File.dirname(__FILE__) + "/../data/pipe_data.txt"
    strio = StringIO.new
    # Block form closes the fixture handle even if the conversion raises.
    File.open(filename) do |file|
      Masticate.csvify(file, :output => strio, :col_sep => '|')
    end
    strio.string.lines.count.should == 5
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# spec for file-mending functions
|
2
|
+
|
3
|
+
require "spec_helper"
|
4
|
+
|
5
|
+
describe "mending" do
  it "should merge lines when delimiter counts don't match" do
    filename = File.dirname(__FILE__) + "/../data/broken_psv.txt"
    # Only the record counts are checked, so the mended output is discarded.
    # File::NULL is the platform's null device; block forms close both handles.
    results = File.open(File::NULL, 'w') do |devnull|
      File.open(filename) do |file|
        Masticate.mend(file, :output => devnull, :col_sep => '|')
      end
    end
    results[:input_records].should == 6
    results[:output_records].should == 5
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: masticate
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-04-03 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &2151845500 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,11 +21,12 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2151845500
|
25
25
|
description: Data file crunching
|
26
26
|
email:
|
27
27
|
- jmay@pobox.com
|
28
|
-
executables:
|
28
|
+
executables:
|
29
|
+
- masticate
|
29
30
|
extensions: []
|
30
31
|
extra_rdoc_files: []
|
31
32
|
files:
|
@@ -34,12 +35,18 @@ files:
|
|
34
35
|
- LICENSE
|
35
36
|
- README.md
|
36
37
|
- Rakefile
|
38
|
+
- bin/masticate
|
37
39
|
- lib/masticate.rb
|
40
|
+
- lib/masticate/csvify.rb
|
41
|
+
- lib/masticate/mender.rb
|
38
42
|
- lib/masticate/sniffer.rb
|
39
43
|
- lib/masticate/version.rb
|
40
44
|
- masticate.gemspec
|
45
|
+
- spec/data/broken_psv.txt
|
41
46
|
- spec/data/pipe_data.txt
|
42
47
|
- spec/data/tabbed_data.txt
|
48
|
+
- spec/spec/csvify_spec.rb
|
49
|
+
- spec/spec/mend_spec.rb
|
43
50
|
- spec/spec/sniffer_spec.rb
|
44
51
|
- spec/spec_helper.rb
|
45
52
|
homepage: ''
|
@@ -67,8 +74,11 @@ signing_key:
|
|
67
74
|
specification_version: 3
|
68
75
|
summary: Utility functions for parsing incoming text data files.
|
69
76
|
test_files:
|
77
|
+
- spec/data/broken_psv.txt
|
70
78
|
- spec/data/pipe_data.txt
|
71
79
|
- spec/data/tabbed_data.txt
|
80
|
+
- spec/spec/csvify_spec.rb
|
81
|
+
- spec/spec/mend_spec.rb
|
72
82
|
- spec/spec/sniffer_spec.rb
|
73
83
|
- spec/spec_helper.rb
|
74
84
|
has_rdoc:
|