masticate 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/masticate/base.rb +5 -5
- data/lib/masticate/cook.rb +7 -2
- data/lib/masticate/mender.rb +12 -84
- data/lib/masticate/version.rb +1 -1
- data/spec/data/cooking_mend_result.csv +3 -3
- data/spec/data/newlines_in_input.psv +8 -0
- data/spec/data/newlines_merged.csv +4 -0
- data/spec/lib/mender_spec.rb +12 -0
- metadata +13 -9
data/lib/masticate/base.rb
CHANGED
@@ -70,11 +70,11 @@ class Masticate::Base
|
|
70
70
|
if row
|
71
71
|
row = row.map {|s| s && s.strip}
|
72
72
|
end
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
73
|
+
if self.class == Masticate::Mender
|
74
|
+
output = crunch(row, line, csv_options)
|
75
|
+
else
|
76
|
+
output = crunch(row)
|
77
|
+
end
|
78
78
|
emit(output) if output
|
79
79
|
end
|
80
80
|
end
|
data/lib/masticate/cook.rb
CHANGED
@@ -33,7 +33,13 @@ class Masticate::Cook < Masticate::Base
|
|
33
33
|
row = CSV.parse_line(line, csv_options)
|
34
34
|
|
35
35
|
steps.each do |step|
|
36
|
-
|
36
|
+
if row
|
37
|
+
if step.class == Masticate::Mender
|
38
|
+
row = step.crunch(row, line, csv_options)
|
39
|
+
else
|
40
|
+
row = step.crunch(row)
|
41
|
+
end
|
42
|
+
end
|
37
43
|
end
|
38
44
|
|
39
45
|
emit(row) if row
|
@@ -48,7 +54,6 @@ class Masticate::Cook < Masticate::Base
|
|
48
54
|
end
|
49
55
|
end
|
50
56
|
more_rows.each {|row| emit(row)}
|
51
|
-
# step.crunch(nil) {|row| emit(row)}
|
52
57
|
|
53
58
|
@output.close if opts[:output]
|
54
59
|
|
data/lib/masticate/mender.rb
CHANGED
@@ -12,20 +12,14 @@ class Masticate::Mender < Masticate::Base
|
|
12
12
|
@dejunk = opts[:dejunk]
|
13
13
|
|
14
14
|
@expected_field_count = nil
|
15
|
-
@holding = []
|
15
|
+
@holding = ''
|
16
16
|
end
|
17
17
|
|
18
|
-
# attr_reader :col_sep
|
19
|
-
|
20
|
-
# def initialize(filename)
|
21
|
-
# @filename = filename
|
22
|
-
# end
|
23
|
-
|
24
18
|
def mend(opts)
|
25
19
|
execute(opts)
|
26
20
|
end
|
27
21
|
|
28
|
-
def crunch(row)
|
22
|
+
def crunch(row, line = '', csv_options = {})
|
29
23
|
if @inlined
|
30
24
|
if row
|
31
25
|
ncells = row.count/2-1
|
@@ -56,13 +50,19 @@ class Masticate::Mender < Masticate::Base
|
|
56
50
|
@expected_field_count = @headers.count
|
57
51
|
row = @headers
|
58
52
|
elsif row
|
59
|
-
@holding
|
60
|
-
|
53
|
+
@holding << ' ' unless @holding.empty?
|
54
|
+
@holding << line
|
55
|
+
|
56
|
+
row = CSV.parse_line(@holding, csv_options) #.map {|s| s && s.strip}
|
57
|
+
if row
|
58
|
+
row = row.map {|s| s && s.strip}
|
59
|
+
end
|
60
|
+
|
61
|
+
if row.count < @expected_field_count
|
61
62
|
# incomplete row; do not emit anything
|
62
63
|
row = nil
|
63
64
|
else
|
64
|
-
|
65
|
-
@holding = []
|
65
|
+
@holding = ''
|
66
66
|
end
|
67
67
|
|
68
68
|
if @dejunk && row && row.select {|s| s && !s.strip.empty?}.count <= 2
|
@@ -74,78 +74,6 @@ class Masticate::Mender < Masticate::Base
|
|
74
74
|
end
|
75
75
|
end
|
76
76
|
|
77
|
-
def old_mend(opts)
|
78
|
-
@output = opts[:output] ? File.open(opts[:output], "w") : $stdout
|
79
|
-
@col_sep = opts[:col_sep] || ','
|
80
|
-
@quote_char = opts[:quote_char] || "\0"
|
81
|
-
|
82
|
-
expected_field_count = nil
|
83
|
-
headers = nil
|
84
|
-
@output_count = 0
|
85
|
-
fieldcounts = Hash.new(0)
|
86
|
-
with_input do |input|
|
87
|
-
while (line = get) do
|
88
|
-
unless line =~ /^\s*$/
|
89
|
-
if opts[:inlined]
|
90
|
-
row = explode(line)
|
91
|
-
ncells = row.count/2-1
|
92
|
-
if !expected_field_count
|
93
|
-
headers = row[0..ncells]
|
94
|
-
expected_field_count = headers.count
|
95
|
-
fieldcounts[headers.count] += 1
|
96
|
-
emit(headers.to_csv(:col_sep => @col_sep))
|
97
|
-
else
|
98
|
-
if row[0..ncells] != headers
|
99
|
-
raise "Header mismatch on line #{@input_count}\n Expected: #{headers.join(',')}\n Found: #{row[0..ncells].join(',')}"
|
100
|
-
end
|
101
|
-
end
|
102
|
-
row = row[ncells+1, expected_field_count]
|
103
|
-
fieldcounts[row.count] += 1
|
104
|
-
emit(row.to_csv(:col_sep => @col_sep))
|
105
|
-
elsif !expected_field_count
|
106
|
-
# trust the first row
|
107
|
-
headers = explode(line).map(&:strip)
|
108
|
-
case opts[:snip]
|
109
|
-
when Fixnum
|
110
|
-
headers.shift(opts[:snip])
|
111
|
-
when String
|
112
|
-
raise "TODO: snip named header. Multiple?"
|
113
|
-
when nil
|
114
|
-
# do nothing
|
115
|
-
else
|
116
|
-
raise "Do not understand snip instruction [#{opts[:snip].inspect}]"
|
117
|
-
end
|
118
|
-
expected_field_count = headers.count
|
119
|
-
fieldcounts[headers.count] += 1
|
120
|
-
emit(headers.to_csv(:col_sep => @col_sep))
|
121
|
-
else
|
122
|
-
running_count = fieldcount(line)
|
123
|
-
while !input.eof? && running_count < expected_field_count do
|
124
|
-
nextbit = get
|
125
|
-
if nextbit
|
126
|
-
line = line + ' ' + nextbit
|
127
|
-
running_count = fieldcount(line)
|
128
|
-
end
|
129
|
-
end
|
130
|
-
|
131
|
-
unless opts[:dejunk] && junky?(line)
|
132
|
-
fieldcounts[fieldcount(line)] += 1
|
133
|
-
emit(line)
|
134
|
-
end
|
135
|
-
end
|
136
|
-
end
|
137
|
-
end
|
138
|
-
end
|
139
|
-
|
140
|
-
@output.close if opts[:output]
|
141
|
-
{
|
142
|
-
:input_count => @input_count,
|
143
|
-
:output_count => @output_count,
|
144
|
-
:field_counts => fieldcounts,
|
145
|
-
:headers => headers
|
146
|
-
}
|
147
|
-
end
|
148
|
-
|
149
77
|
def fieldcount(line)
|
150
78
|
explode(line).count
|
151
79
|
end
|
data/lib/masticate/version.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
1
|
Col 3
|
2
|
-
data
|
3
|
-
this long row
|
4
|
-
data
|
2
|
+
data
|
3
|
+
this long row is split across lines
|
4
|
+
data
|
data/spec/data/newlines_in_input.psv
@@ -0,0 +1,8 @@
|
|
1
|
+
LastName|FirstName|MiddleName|TitleName|AuditID|AuditByID|ChangeDTTM|AuditCode|AuditDescription|ObjectType|ReasonCodeID|ObjectID|AuthorizingName|DocumentID|DocumentAudTypeID|DocumentAudOrgID|DocumentAudComments|DocumentAudFileCount|DocumentAudIsInactiveFlag|PatientID|InternalExamID|PatientLast|PatientMiddle|PatientFirst
|
2
|
+
Charles|Dingle|W|RIS Administrator|12345|1234|4/20/2012 2:54:26PM|MERGE|Merge Folders|DOCUMENT|0|999||888|38|2|Dropped 8 lb weight on foot. c/o pain, swelling over 3rd, 4th metatarsals. Possible non-displaced fractures of proximal 3rd and 4th metatarsals seen on one view.|0|N|888|777|WHIPPLE|D|ANNE
|
3
|
+
Gomez|Bungle|||6|8|4/24/2012 10:31:53AM|MERGE|Merge Folders|DOCUMENT|0|101||505|38|2|broke little toe
|
4
|
+
|
5
|
+
fracture prox phalanx, angulated (I reduced it but did not order a post reduction film.)
|
6
|
+
|
7
|
+
sumner|0|N|499|172|SMITH|E|STEFANIE
|
8
|
+
Gomez|Bangle|||59|8|3/28/2012 3:26:39PM|MERGE|Merge Folders|DOCUMENT|0|11||22|38|2|neg.|0|N|380|173|OCHS|JONES|CARL
|
data/spec/data/newlines_merged.csv
@@ -0,0 +1,4 @@
|
|
1
|
+
LastName,FirstName,MiddleName,TitleName,AuditID,AuditByID,ChangeDTTM,AuditCode,AuditDescription,ObjectType,ReasonCodeID,ObjectID,AuthorizingName,DocumentID,DocumentAudTypeID,DocumentAudOrgID,DocumentAudComments,DocumentAudFileCount,DocumentAudIsInactiveFlag,PatientID,InternalExamID,PatientLast,PatientMiddle,PatientFirst
|
2
|
+
Charles,Dingle,W,RIS Administrator,12345,1234,4/20/2012 2:54:26PM,MERGE,Merge Folders,DOCUMENT,0,999,,888,38,2,"Dropped 8 lb weight on foot. c/o pain, swelling over 3rd, 4th metatarsals. Possible non-displaced fractures of proximal 3rd and 4th metatarsals seen on one view.",0,N,888,777,WHIPPLE,D,ANNE
|
3
|
+
Gomez,Bungle,,,6,8,4/24/2012 10:31:53AM,MERGE,Merge Folders,DOCUMENT,0,101,,505,38,2,"broke little toe fracture prox phalanx, angulated (I reduced it but did not order a post reduction film.) sumner",0,N,499,172,SMITH,E,STEFANIE
|
4
|
+
Gomez,Bangle,,,59,8,3/28/2012 3:26:39PM,MERGE,Merge Folders,DOCUMENT,0,11,,22,38,2,neg.,0,N,380,173,OCHS,JONES,CARL
|
data/spec/lib/mender_spec.rb
CHANGED
@@ -39,4 +39,16 @@ describe "mending" do
|
|
39
39
|
# results[:field_counts].should == {11 => 11}
|
40
40
|
output.should == correct_output
|
41
41
|
end
|
42
|
+
|
43
|
+
it "should consolidate fields with embedded newlines" do
|
44
|
+
filename = File.dirname(__FILE__) + "/../data/newlines_in_input.psv"
|
45
|
+
tmp = Tempfile.new('mending')
|
46
|
+
results = Masticate.mend(filename, :col_sep => '|', :output => tmp)
|
47
|
+
output = File.read(tmp)
|
48
|
+
correct_output = File.read(File.dirname(__FILE__) + "/../data/newlines_merged.csv")
|
49
|
+
|
50
|
+
results[:input_count].should == 6
|
51
|
+
results[:output_count].should == 4
|
52
|
+
output.should == correct_output
|
53
|
+
end
|
42
54
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: masticate
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.3.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-05-30 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
16
|
-
requirement: &
|
16
|
+
requirement: &2157784500 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: 0.9.2
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2157784500
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rspec
|
27
|
-
requirement: &
|
27
|
+
requirement: &2157784000 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 2.9.0
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2157784000
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: guard-rspec
|
38
|
-
requirement: &
|
38
|
+
requirement: &2157783540 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: 0.7.0
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2157783540
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: ruby_gntp
|
49
|
-
requirement: &
|
49
|
+
requirement: &2157783080 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ~>
|
@@ -54,7 +54,7 @@ dependencies:
|
|
54
54
|
version: 0.3.4
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2157783080
|
58
58
|
description: Data file crunching
|
59
59
|
email:
|
60
60
|
- jmay@pobox.com
|
@@ -107,6 +107,8 @@ files:
|
|
107
107
|
- spec/data/junk_trailer.txt
|
108
108
|
- spec/data/namedcols.csv
|
109
109
|
- spec/data/namedcols.csv.output
|
110
|
+
- spec/data/newlines_in_input.psv
|
111
|
+
- spec/data/newlines_merged.csv
|
110
112
|
- spec/data/pipe_data.txt
|
111
113
|
- spec/data/quoted_csv_data.txt
|
112
114
|
- spec/data/recipe.txt
|
@@ -172,6 +174,8 @@ test_files:
|
|
172
174
|
- spec/data/junk_trailer.txt
|
173
175
|
- spec/data/namedcols.csv
|
174
176
|
- spec/data/namedcols.csv.output
|
177
|
+
- spec/data/newlines_in_input.psv
|
178
|
+
- spec/data/newlines_merged.csv
|
175
179
|
- spec/data/pipe_data.txt
|
176
180
|
- spec/data/quoted_csv_data.txt
|
177
181
|
- spec/data/recipe.txt
|