masticate 0.3.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/masticate/base.rb +5 -5
- data/lib/masticate/cook.rb +7 -2
- data/lib/masticate/mender.rb +12 -84
- data/lib/masticate/version.rb +1 -1
- data/spec/data/cooking_mend_result.csv +3 -3
- data/spec/data/newlines_in_input.psv +8 -0
- data/spec/data/newlines_merged.csv +4 -0
- data/spec/lib/mender_spec.rb +12 -0
- metadata +13 -9
data/lib/masticate/base.rb
CHANGED
@@ -70,11 +70,11 @@ class Masticate::Base
|
|
70
70
|
if row
|
71
71
|
row = row.map {|s| s && s.strip}
|
72
72
|
end
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
73
|
+
if self.class == Masticate::Mender
|
74
|
+
output = crunch(row, line, csv_options)
|
75
|
+
else
|
76
|
+
output = crunch(row)
|
77
|
+
end
|
78
78
|
emit(output) if output
|
79
79
|
end
|
80
80
|
end
|
data/lib/masticate/cook.rb
CHANGED
@@ -33,7 +33,13 @@ class Masticate::Cook < Masticate::Base
|
|
33
33
|
row = CSV.parse_line(line, csv_options)
|
34
34
|
|
35
35
|
steps.each do |step|
|
36
|
-
|
36
|
+
if row
|
37
|
+
if step.class == Masticate::Mender
|
38
|
+
row = step.crunch(row, line, csv_options)
|
39
|
+
else
|
40
|
+
row = step.crunch(row)
|
41
|
+
end
|
42
|
+
end
|
37
43
|
end
|
38
44
|
|
39
45
|
emit(row) if row
|
@@ -48,7 +54,6 @@ class Masticate::Cook < Masticate::Base
|
|
48
54
|
end
|
49
55
|
end
|
50
56
|
more_rows.each {|row| emit(row)}
|
51
|
-
# step.crunch(nil) {|row| emit(row)}
|
52
57
|
|
53
58
|
@output.close if opts[:output]
|
54
59
|
|
data/lib/masticate/mender.rb
CHANGED
@@ -12,20 +12,14 @@ class Masticate::Mender < Masticate::Base
|
|
12
12
|
@dejunk = opts[:dejunk]
|
13
13
|
|
14
14
|
@expected_field_count = nil
|
15
|
-
@holding = []
|
15
|
+
@holding = ''
|
16
16
|
end
|
17
17
|
|
18
|
-
# attr_reader :col_sep
|
19
|
-
|
20
|
-
# def initialize(filename)
|
21
|
-
# @filename = filename
|
22
|
-
# end
|
23
|
-
|
24
18
|
def mend(opts)
|
25
19
|
execute(opts)
|
26
20
|
end
|
27
21
|
|
28
|
-
def crunch(row)
|
22
|
+
def crunch(row, line = '', csv_options = {})
|
29
23
|
if @inlined
|
30
24
|
if row
|
31
25
|
ncells = row.count/2-1
|
@@ -56,13 +50,19 @@ class Masticate::Mender < Masticate::Base
|
|
56
50
|
@expected_field_count = @headers.count
|
57
51
|
row = @headers
|
58
52
|
elsif row
|
59
|
-
@holding
|
60
|
-
|
53
|
+
@holding << ' ' unless @holding.empty?
|
54
|
+
@holding << line
|
55
|
+
|
56
|
+
row = CSV.parse_line(@holding, csv_options) #.map {|s| s && s.strip}
|
57
|
+
if row
|
58
|
+
row = row.map {|s| s && s.strip}
|
59
|
+
end
|
60
|
+
|
61
|
+
if row.count < @expected_field_count
|
61
62
|
# incomplete row; do not emit anything
|
62
63
|
row = nil
|
63
64
|
else
|
64
|
-
|
65
|
-
@holding = []
|
65
|
+
@holding = ''
|
66
66
|
end
|
67
67
|
|
68
68
|
if @dejunk && row && row.select {|s| s && !s.strip.empty?}.count <= 2
|
@@ -74,78 +74,6 @@ class Masticate::Mender < Masticate::Base
|
|
74
74
|
end
|
75
75
|
end
|
76
76
|
|
77
|
-
def old_mend(opts)
|
78
|
-
@output = opts[:output] ? File.open(opts[:output], "w") : $stdout
|
79
|
-
@col_sep = opts[:col_sep] || ','
|
80
|
-
@quote_char = opts[:quote_char] || "\0"
|
81
|
-
|
82
|
-
expected_field_count = nil
|
83
|
-
headers = nil
|
84
|
-
@output_count = 0
|
85
|
-
fieldcounts = Hash.new(0)
|
86
|
-
with_input do |input|
|
87
|
-
while (line = get) do
|
88
|
-
unless line =~ /^\s*$/
|
89
|
-
if opts[:inlined]
|
90
|
-
row = explode(line)
|
91
|
-
ncells = row.count/2-1
|
92
|
-
if !expected_field_count
|
93
|
-
headers = row[0..ncells]
|
94
|
-
expected_field_count = headers.count
|
95
|
-
fieldcounts[headers.count] += 1
|
96
|
-
emit(headers.to_csv(:col_sep => @col_sep))
|
97
|
-
else
|
98
|
-
if row[0..ncells] != headers
|
99
|
-
raise "Header mismatch on line #{@input_count}\n Expected: #{headers.join(',')}\n Found: #{row[0..ncells].join(',')}"
|
100
|
-
end
|
101
|
-
end
|
102
|
-
row = row[ncells+1, expected_field_count]
|
103
|
-
fieldcounts[row.count] += 1
|
104
|
-
emit(row.to_csv(:col_sep => @col_sep))
|
105
|
-
elsif !expected_field_count
|
106
|
-
# trust the first row
|
107
|
-
headers = explode(line).map(&:strip)
|
108
|
-
case opts[:snip]
|
109
|
-
when Fixnum
|
110
|
-
headers.shift(opts[:snip])
|
111
|
-
when String
|
112
|
-
raise "TODO: snip named header. Multiple?"
|
113
|
-
when nil
|
114
|
-
# do nothing
|
115
|
-
else
|
116
|
-
raise "Do not understand snip instruction [#{opts[:snip].inspect}]"
|
117
|
-
end
|
118
|
-
expected_field_count = headers.count
|
119
|
-
fieldcounts[headers.count] += 1
|
120
|
-
emit(headers.to_csv(:col_sep => @col_sep))
|
121
|
-
else
|
122
|
-
running_count = fieldcount(line)
|
123
|
-
while !input.eof? && running_count < expected_field_count do
|
124
|
-
nextbit = get
|
125
|
-
if nextbit
|
126
|
-
line = line + ' ' + nextbit
|
127
|
-
running_count = fieldcount(line)
|
128
|
-
end
|
129
|
-
end
|
130
|
-
|
131
|
-
unless opts[:dejunk] && junky?(line)
|
132
|
-
fieldcounts[fieldcount(line)] += 1
|
133
|
-
emit(line)
|
134
|
-
end
|
135
|
-
end
|
136
|
-
end
|
137
|
-
end
|
138
|
-
end
|
139
|
-
|
140
|
-
@output.close if opts[:output]
|
141
|
-
{
|
142
|
-
:input_count => @input_count,
|
143
|
-
:output_count => @output_count,
|
144
|
-
:field_counts => fieldcounts,
|
145
|
-
:headers => headers
|
146
|
-
}
|
147
|
-
end
|
148
|
-
|
149
77
|
def fieldcount(line)
|
150
78
|
explode(line).count
|
151
79
|
end
|
data/lib/masticate/version.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
1
|
Col 3
|
2
|
-
data
|
3
|
-
this long row
|
4
|
-
data
|
2
|
+
data
|
3
|
+
this long row is split across lines
|
4
|
+
data
|
@@ -0,0 +1,8 @@
|
|
1
|
+
LastName|FirstName|MiddleName|TitleName|AuditID|AuditByID|ChangeDTTM|AuditCode|AuditDescription|ObjectType|ReasonCodeID|ObjectID|AuthorizingName|DocumentID|DocumentAudTypeID|DocumentAudOrgID|DocumentAudComments|DocumentAudFileCount|DocumentAudIsInactiveFlag|PatientID|InternalExamID|PatientLast|PatientMiddle|PatientFirst
|
2
|
+
Charles|Dingle|W|RIS Administrator|12345|1234|4/20/2012 2:54:26PM|MERGE|Merge Folders|DOCUMENT|0|999||888|38|2|Dropped 8 lb weight on foot. c/o pain, swelling over 3rd, 4th metatarsals. Possible non-displaced fractures of proximal 3rd and 4th metatarsals seen on one view.|0|N|888|777|WHIPPLE|D|ANNE
|
3
|
+
Gomez|Bungle|||6|8|4/24/2012 10:31:53AM|MERGE|Merge Folders|DOCUMENT|0|101||505|38|2|broke little toe
|
4
|
+
|
5
|
+
fracture prox phalanx, angulated (I reduced it but did not order a post reduction film.)
|
6
|
+
|
7
|
+
sumner|0|N|499|172|SMITH|E|STEFANIE
|
8
|
+
Gomez|Bangle|||59|8|3/28/2012 3:26:39PM|MERGE|Merge Folders|DOCUMENT|0|11||22|38|2|neg.|0|N|380|173|OCHS|JONES|CARL
|
@@ -0,0 +1,4 @@
|
|
1
|
+
LastName,FirstName,MiddleName,TitleName,AuditID,AuditByID,ChangeDTTM,AuditCode,AuditDescription,ObjectType,ReasonCodeID,ObjectID,AuthorizingName,DocumentID,DocumentAudTypeID,DocumentAudOrgID,DocumentAudComments,DocumentAudFileCount,DocumentAudIsInactiveFlag,PatientID,InternalExamID,PatientLast,PatientMiddle,PatientFirst
|
2
|
+
Charles,Dingle,W,RIS Administrator,12345,1234,4/20/2012 2:54:26PM,MERGE,Merge Folders,DOCUMENT,0,999,,888,38,2,"Dropped 8 lb weight on foot. c/o pain, swelling over 3rd, 4th metatarsals. Possible non-displaced fractures of proximal 3rd and 4th metatarsals seen on one view.",0,N,888,777,WHIPPLE,D,ANNE
|
3
|
+
Gomez,Bungle,,,6,8,4/24/2012 10:31:53AM,MERGE,Merge Folders,DOCUMENT,0,101,,505,38,2,"broke little toe fracture prox phalanx, angulated (I reduced it but did not order a post reduction film.) sumner",0,N,499,172,SMITH,E,STEFANIE
|
4
|
+
Gomez,Bangle,,,59,8,3/28/2012 3:26:39PM,MERGE,Merge Folders,DOCUMENT,0,11,,22,38,2,neg.,0,N,380,173,OCHS,JONES,CARL
|
data/spec/lib/mender_spec.rb
CHANGED
@@ -39,4 +39,16 @@ describe "mending" do
|
|
39
39
|
# results[:field_counts].should == {11 => 11}
|
40
40
|
output.should == correct_output
|
41
41
|
end
|
42
|
+
|
43
|
+
it "should consolidate fields with embedded newlines" do
|
44
|
+
filename = File.dirname(__FILE__) + "/../data/newlines_in_input.psv"
|
45
|
+
tmp = Tempfile.new('mending')
|
46
|
+
results = Masticate.mend(filename, :col_sep => '|', :output => tmp)
|
47
|
+
output = File.read(tmp)
|
48
|
+
correct_output = File.read(File.dirname(__FILE__) + "/../data/newlines_merged.csv")
|
49
|
+
|
50
|
+
results[:input_count].should == 6
|
51
|
+
results[:output_count].should == 4
|
52
|
+
output.should == correct_output
|
53
|
+
end
|
42
54
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: masticate
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.3.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-05-30 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
16
|
-
requirement: &
|
16
|
+
requirement: &2157784500 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 0.9.2
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2157784500
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rspec
|
27
|
-
requirement: &
|
27
|
+
requirement: &2157784000 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 2.9.0
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2157784000
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: guard-rspec
|
38
|
-
requirement: &
|
38
|
+
requirement: &2157783540 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 0.7.0
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2157783540
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: ruby_gntp
|
49
|
-
requirement: &
|
49
|
+
requirement: &2157783080 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,7 +54,7 @@ dependencies:
|
|
54
54
|
version: 0.3.4
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2157783080
|
58
58
|
description: Data file crunching
|
59
59
|
email:
|
60
60
|
- jmay@pobox.com
|
@@ -107,6 +107,8 @@ files:
|
|
107
107
|
- spec/data/junk_trailer.txt
|
108
108
|
- spec/data/namedcols.csv
|
109
109
|
- spec/data/namedcols.csv.output
|
110
|
+
- spec/data/newlines_in_input.psv
|
111
|
+
- spec/data/newlines_merged.csv
|
110
112
|
- spec/data/pipe_data.txt
|
111
113
|
- spec/data/quoted_csv_data.txt
|
112
114
|
- spec/data/recipe.txt
|
@@ -172,6 +174,8 @@ test_files:
|
|
172
174
|
- spec/data/junk_trailer.txt
|
173
175
|
- spec/data/namedcols.csv
|
174
176
|
- spec/data/namedcols.csv.output
|
177
|
+
- spec/data/newlines_in_input.psv
|
178
|
+
- spec/data/newlines_merged.csv
|
175
179
|
- spec/data/pipe_data.txt
|
176
180
|
- spec/data/quoted_csv_data.txt
|
177
181
|
- spec/data/recipe.txt
|