masticate 0.3.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -70,11 +70,11 @@ class Masticate::Base
70
70
  if row
71
71
  row = row.map {|s| s && s.strip}
72
72
  end
73
- # row2 = row.map {|s| s && s.strip}
74
- # if row2.nil?
75
- # puts "**** ROW IS [#{row.inspect}]"
76
- # end
77
- output = crunch(row)
73
+ if self.class == Masticate::Mender
74
+ output = crunch(row, line, csv_options)
75
+ else
76
+ output = crunch(row)
77
+ end
78
78
  emit(output) if output
79
79
  end
80
80
  end
@@ -33,7 +33,13 @@ class Masticate::Cook < Masticate::Base
33
33
  row = CSV.parse_line(line, csv_options)
34
34
 
35
35
  steps.each do |step|
36
- row = step.crunch(row) if row
36
+ if row
37
+ if step.class == Masticate::Mender
38
+ row = step.crunch(row, line, csv_options)
39
+ else
40
+ row = step.crunch(row)
41
+ end
42
+ end
37
43
  end
38
44
 
39
45
  emit(row) if row
@@ -48,7 +54,6 @@ class Masticate::Cook < Masticate::Base
48
54
  end
49
55
  end
50
56
  more_rows.each {|row| emit(row)}
51
- # step.crunch(nil) {|row| emit(row)}
52
57
 
53
58
  @output.close if opts[:output]
54
59
 
@@ -12,20 +12,14 @@ class Masticate::Mender < Masticate::Base
12
12
  @dejunk = opts[:dejunk]
13
13
 
14
14
  @expected_field_count = nil
15
- @holding = []
15
+ @holding = ''
16
16
  end
17
17
 
18
- # attr_reader :col_sep
19
-
20
- # def initialize(filename)
21
- # @filename = filename
22
- # end
23
-
24
18
  def mend(opts)
25
19
  execute(opts)
26
20
  end
27
21
 
28
- def crunch(row)
22
+ def crunch(row, line = '', csv_options = {})
29
23
  if @inlined
30
24
  if row
31
25
  ncells = row.count/2-1
@@ -56,13 +50,19 @@ class Masticate::Mender < Masticate::Base
56
50
  @expected_field_count = @headers.count
57
51
  row = @headers
58
52
  elsif row
59
- @holding += row
60
- if @holding.count < @expected_field_count
53
+ @holding << ' ' unless @holding.empty?
54
+ @holding << line
55
+
56
+ row = CSV.parse_line(@holding, csv_options) #.map {|s| s && s.strip}
57
+ if row
58
+ row = row.map {|s| s && s.strip}
59
+ end
60
+
61
+ if row.count < @expected_field_count
61
62
  # incomplete row; do not emit anything
62
63
  row = nil
63
64
  else
64
- row = @holding
65
- @holding = []
65
+ @holding = ''
66
66
  end
67
67
 
68
68
  if @dejunk && row && row.select {|s| s && !s.strip.empty?}.count <= 2
@@ -74,78 +74,6 @@ class Masticate::Mender < Masticate::Base
74
74
  end
75
75
  end
76
76
 
77
- def old_mend(opts)
78
- @output = opts[:output] ? File.open(opts[:output], "w") : $stdout
79
- @col_sep = opts[:col_sep] || ','
80
- @quote_char = opts[:quote_char] || "\0"
81
-
82
- expected_field_count = nil
83
- headers = nil
84
- @output_count = 0
85
- fieldcounts = Hash.new(0)
86
- with_input do |input|
87
- while (line = get) do
88
- unless line =~ /^\s*$/
89
- if opts[:inlined]
90
- row = explode(line)
91
- ncells = row.count/2-1
92
- if !expected_field_count
93
- headers = row[0..ncells]
94
- expected_field_count = headers.count
95
- fieldcounts[headers.count] += 1
96
- emit(headers.to_csv(:col_sep => @col_sep))
97
- else
98
- if row[0..ncells] != headers
99
- raise "Header mismatch on line #{@input_count}\n Expected: #{headers.join(',')}\n Found: #{row[0..ncells].join(',')}"
100
- end
101
- end
102
- row = row[ncells+1, expected_field_count]
103
- fieldcounts[row.count] += 1
104
- emit(row.to_csv(:col_sep => @col_sep))
105
- elsif !expected_field_count
106
- # trust the first row
107
- headers = explode(line).map(&:strip)
108
- case opts[:snip]
109
- when Fixnum
110
- headers.shift(opts[:snip])
111
- when String
112
- raise "TODO: snip named header. Multiple?"
113
- when nil
114
- # do nothing
115
- else
116
- raise "Do not understand snip instruction [#{opts[:snip].inspect}]"
117
- end
118
- expected_field_count = headers.count
119
- fieldcounts[headers.count] += 1
120
- emit(headers.to_csv(:col_sep => @col_sep))
121
- else
122
- running_count = fieldcount(line)
123
- while !input.eof? && running_count < expected_field_count do
124
- nextbit = get
125
- if nextbit
126
- line = line + ' ' + nextbit
127
- running_count = fieldcount(line)
128
- end
129
- end
130
-
131
- unless opts[:dejunk] && junky?(line)
132
- fieldcounts[fieldcount(line)] += 1
133
- emit(line)
134
- end
135
- end
136
- end
137
- end
138
- end
139
-
140
- @output.close if opts[:output]
141
- {
142
- :input_count => @input_count,
143
- :output_count => @output_count,
144
- :field_counts => fieldcounts,
145
- :headers => headers
146
- }
147
- end
148
-
149
77
  def fieldcount(line)
150
78
  explode(line).count
151
79
  end
@@ -1,3 +1,3 @@
1
1
  module Masticate
2
- VERSION = "0.3.1"
2
+ VERSION = "0.3.2"
3
3
  end
@@ -1,4 +1,4 @@
1
1
  Col 3
2
- data
3
- this long row
4
- data
2
+ data
3
+ this long row is split across lines
4
+ data
@@ -0,0 +1,8 @@
1
+ LastName|FirstName|MiddleName|TitleName|AuditID|AuditByID|ChangeDTTM|AuditCode|AuditDescription|ObjectType|ReasonCodeID|ObjectID|AuthorizingName|DocumentID|DocumentAudTypeID|DocumentAudOrgID|DocumentAudComments|DocumentAudFileCount|DocumentAudIsInactiveFlag|PatientID|InternalExamID|PatientLast|PatientMiddle|PatientFirst
2
+ Charles|Dingle|W|RIS Administrator|12345|1234|4/20/2012 2:54:26PM|MERGE|Merge Folders|DOCUMENT|0|999||888|38|2|Dropped 8 lb weight on foot. c/o pain, swelling over 3rd, 4th metatarsals. Possible non-displaced fractures of proximal 3rd and 4th metatarsals seen on one view.|0|N|888|777|WHIPPLE|D|ANNE
3
+ Gomez|Bungle|||6|8|4/24/2012 10:31:53AM|MERGE|Merge Folders|DOCUMENT|0|101||505|38|2|broke little toe
4
+
5
+ fracture prox phalanx, angulated (I reduced it but did not order a post reduction film.)
6
+
7
+ sumner|0|N|499|172|SMITH|E|STEFANIE
8
+ Gomez|Bangle|||59|8|3/28/2012 3:26:39PM|MERGE|Merge Folders|DOCUMENT|0|11||22|38|2|neg.|0|N|380|173|OCHS|JONES|CARL
@@ -0,0 +1,4 @@
1
+ LastName,FirstName,MiddleName,TitleName,AuditID,AuditByID,ChangeDTTM,AuditCode,AuditDescription,ObjectType,ReasonCodeID,ObjectID,AuthorizingName,DocumentID,DocumentAudTypeID,DocumentAudOrgID,DocumentAudComments,DocumentAudFileCount,DocumentAudIsInactiveFlag,PatientID,InternalExamID,PatientLast,PatientMiddle,PatientFirst
2
+ Charles,Dingle,W,RIS Administrator,12345,1234,4/20/2012 2:54:26PM,MERGE,Merge Folders,DOCUMENT,0,999,,888,38,2,"Dropped 8 lb weight on foot. c/o pain, swelling over 3rd, 4th metatarsals. Possible non-displaced fractures of proximal 3rd and 4th metatarsals seen on one view.",0,N,888,777,WHIPPLE,D,ANNE
3
+ Gomez,Bungle,,,6,8,4/24/2012 10:31:53AM,MERGE,Merge Folders,DOCUMENT,0,101,,505,38,2,"broke little toe fracture prox phalanx, angulated (I reduced it but did not order a post reduction film.) sumner",0,N,499,172,SMITH,E,STEFANIE
4
+ Gomez,Bangle,,,59,8,3/28/2012 3:26:39PM,MERGE,Merge Folders,DOCUMENT,0,11,,22,38,2,neg.,0,N,380,173,OCHS,JONES,CARL
@@ -39,4 +39,16 @@ describe "mending" do
39
39
  # results[:field_counts].should == {11 => 11}
40
40
  output.should == correct_output
41
41
  end
42
+
43
+ it "should consolidate fields with embedded newlines" do
44
+ filename = File.dirname(__FILE__) + "/../data/newlines_in_input.psv"
45
+ tmp = Tempfile.new('mending')
46
+ results = Masticate.mend(filename, :col_sep => '|', :output => tmp)
47
+ output = File.read(tmp)
48
+ correct_output = File.read(File.dirname(__FILE__) + "/../data/newlines_merged.csv")
49
+
50
+ results[:input_count].should == 6
51
+ results[:output_count].should == 4
52
+ output.should == correct_output
53
+ end
42
54
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: masticate
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-05-30 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
16
- requirement: &2153051960 !ruby/object:Gem::Requirement
16
+ requirement: &2157784500 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 0.9.2
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2153051960
24
+ version_requirements: *2157784500
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rspec
27
- requirement: &2153051460 !ruby/object:Gem::Requirement
27
+ requirement: &2157784000 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 2.9.0
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2153051460
35
+ version_requirements: *2157784000
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: guard-rspec
38
- requirement: &2153051000 !ruby/object:Gem::Requirement
38
+ requirement: &2157783540 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 0.7.0
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2153051000
46
+ version_requirements: *2157783540
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: ruby_gntp
49
- requirement: &2153050540 !ruby/object:Gem::Requirement
49
+ requirement: &2157783080 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,7 +54,7 @@ dependencies:
54
54
  version: 0.3.4
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *2153050540
57
+ version_requirements: *2157783080
58
58
  description: Data file crunching
59
59
  email:
60
60
  - jmay@pobox.com
@@ -107,6 +107,8 @@ files:
107
107
  - spec/data/junk_trailer.txt
108
108
  - spec/data/namedcols.csv
109
109
  - spec/data/namedcols.csv.output
110
+ - spec/data/newlines_in_input.psv
111
+ - spec/data/newlines_merged.csv
110
112
  - spec/data/pipe_data.txt
111
113
  - spec/data/quoted_csv_data.txt
112
114
  - spec/data/recipe.txt
@@ -172,6 +174,8 @@ test_files:
172
174
  - spec/data/junk_trailer.txt
173
175
  - spec/data/namedcols.csv
174
176
  - spec/data/namedcols.csv.output
177
+ - spec/data/newlines_in_input.psv
178
+ - spec/data/newlines_merged.csv
175
179
  - spec/data/pipe_data.txt
176
180
  - spec/data/quoted_csv_data.txt
177
181
  - spec/data/recipe.txt