masticate 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -70,11 +70,11 @@ class Masticate::Base
70
70
  if row
71
71
  row = row.map {|s| s && s.strip}
72
72
  end
73
- # row2 = row.map {|s| s && s.strip}
74
- # if row2.nil?
75
- # puts "**** ROW IS [#{row.inspect}]"
76
- # end
77
- output = crunch(row)
73
+ if self.class == Masticate::Mender
74
+ output = crunch(row, line, csv_options)
75
+ else
76
+ output = crunch(row)
77
+ end
78
78
  emit(output) if output
79
79
  end
80
80
  end
@@ -33,7 +33,13 @@ class Masticate::Cook < Masticate::Base
33
33
  row = CSV.parse_line(line, csv_options)
34
34
 
35
35
  steps.each do |step|
36
- row = step.crunch(row) if row
36
+ if row
37
+ if step.class == Masticate::Mender
38
+ row = step.crunch(row, line, csv_options)
39
+ else
40
+ row = step.crunch(row)
41
+ end
42
+ end
37
43
  end
38
44
 
39
45
  emit(row) if row
@@ -48,7 +54,6 @@ class Masticate::Cook < Masticate::Base
48
54
  end
49
55
  end
50
56
  more_rows.each {|row| emit(row)}
51
- # step.crunch(nil) {|row| emit(row)}
52
57
 
53
58
  @output.close if opts[:output]
54
59
 
@@ -12,20 +12,14 @@ class Masticate::Mender < Masticate::Base
12
12
  @dejunk = opts[:dejunk]
13
13
 
14
14
  @expected_field_count = nil
15
- @holding = []
15
+ @holding = ''
16
16
  end
17
17
 
18
- # attr_reader :col_sep
19
-
20
- # def initialize(filename)
21
- # @filename = filename
22
- # end
23
-
24
18
  def mend(opts)
25
19
  execute(opts)
26
20
  end
27
21
 
28
- def crunch(row)
22
+ def crunch(row, line = '', csv_options = {})
29
23
  if @inlined
30
24
  if row
31
25
  ncells = row.count/2-1
@@ -56,13 +50,19 @@ class Masticate::Mender < Masticate::Base
56
50
  @expected_field_count = @headers.count
57
51
  row = @headers
58
52
  elsif row
59
- @holding += row
60
- if @holding.count < @expected_field_count
53
+ @holding << ' ' unless @holding.empty?
54
+ @holding << line
55
+
56
+ row = CSV.parse_line(@holding, csv_options) #.map {|s| s && s.strip}
57
+ if row
58
+ row = row.map {|s| s && s.strip}
59
+ end
60
+
61
+ if row.count < @expected_field_count
61
62
  # incomplete row; do not emit anything
62
63
  row = nil
63
64
  else
64
- row = @holding
65
- @holding = []
65
+ @holding = ''
66
66
  end
67
67
 
68
68
  if @dejunk && row && row.select {|s| s && !s.strip.empty?}.count <= 2
@@ -74,78 +74,6 @@ class Masticate::Mender < Masticate::Base
74
74
  end
75
75
  end
76
76
 
77
- def old_mend(opts)
78
- @output = opts[:output] ? File.open(opts[:output], "w") : $stdout
79
- @col_sep = opts[:col_sep] || ','
80
- @quote_char = opts[:quote_char] || "\0"
81
-
82
- expected_field_count = nil
83
- headers = nil
84
- @output_count = 0
85
- fieldcounts = Hash.new(0)
86
- with_input do |input|
87
- while (line = get) do
88
- unless line =~ /^\s*$/
89
- if opts[:inlined]
90
- row = explode(line)
91
- ncells = row.count/2-1
92
- if !expected_field_count
93
- headers = row[0..ncells]
94
- expected_field_count = headers.count
95
- fieldcounts[headers.count] += 1
96
- emit(headers.to_csv(:col_sep => @col_sep))
97
- else
98
- if row[0..ncells] != headers
99
- raise "Header mismatch on line #{@input_count}\n Expected: #{headers.join(',')}\n Found: #{row[0..ncells].join(',')}"
100
- end
101
- end
102
- row = row[ncells+1, expected_field_count]
103
- fieldcounts[row.count] += 1
104
- emit(row.to_csv(:col_sep => @col_sep))
105
- elsif !expected_field_count
106
- # trust the first row
107
- headers = explode(line).map(&:strip)
108
- case opts[:snip]
109
- when Fixnum
110
- headers.shift(opts[:snip])
111
- when String
112
- raise "TODO: snip named header. Multiple?"
113
- when nil
114
- # do nothing
115
- else
116
- raise "Do not understand snip instruction [#{opts[:snip].inspect}]"
117
- end
118
- expected_field_count = headers.count
119
- fieldcounts[headers.count] += 1
120
- emit(headers.to_csv(:col_sep => @col_sep))
121
- else
122
- running_count = fieldcount(line)
123
- while !input.eof? && running_count < expected_field_count do
124
- nextbit = get
125
- if nextbit
126
- line = line + ' ' + nextbit
127
- running_count = fieldcount(line)
128
- end
129
- end
130
-
131
- unless opts[:dejunk] && junky?(line)
132
- fieldcounts[fieldcount(line)] += 1
133
- emit(line)
134
- end
135
- end
136
- end
137
- end
138
- end
139
-
140
- @output.close if opts[:output]
141
- {
142
- :input_count => @input_count,
143
- :output_count => @output_count,
144
- :field_counts => fieldcounts,
145
- :headers => headers
146
- }
147
- end
148
-
149
77
  def fieldcount(line)
150
78
  explode(line).count
151
79
  end
@@ -1,3 +1,3 @@
1
1
  module Masticate
2
- VERSION = "0.3.1"
2
+ VERSION = "0.3.2"
3
3
  end
@@ -1,4 +1,4 @@
1
1
  Col 3
2
- data
3
- this long row
4
- data
2
+ data
3
+ this long row is split across lines
4
+ data
@@ -0,0 +1,8 @@
1
+ LastName|FirstName|MiddleName|TitleName|AuditID|AuditByID|ChangeDTTM|AuditCode|AuditDescription|ObjectType|ReasonCodeID|ObjectID|AuthorizingName|DocumentID|DocumentAudTypeID|DocumentAudOrgID|DocumentAudComments|DocumentAudFileCount|DocumentAudIsInactiveFlag|PatientID|InternalExamID|PatientLast|PatientMiddle|PatientFirst
2
+ Charles|Dingle|W|RIS Administrator|12345|1234|4/20/2012 2:54:26PM|MERGE|Merge Folders|DOCUMENT|0|999||888|38|2|Dropped 8 lb weight on foot. c/o pain, swelling over 3rd, 4th metatarsals. Possible non-displaced fractures of proximal 3rd and 4th metatarsals seen on one view.|0|N|888|777|WHIPPLE|D|ANNE
3
+ Gomez|Bungle|||6|8|4/24/2012 10:31:53AM|MERGE|Merge Folders|DOCUMENT|0|101||505|38|2|broke little toe
4
+
5
+ fracture prox phalanx, angulated (I reduced it but did not order a post reduction film.)
6
+
7
+ sumner|0|N|499|172|SMITH|E|STEFANIE
8
+ Gomez|Bangle|||59|8|3/28/2012 3:26:39PM|MERGE|Merge Folders|DOCUMENT|0|11||22|38|2|neg.|0|N|380|173|OCHS|JONES|CARL
@@ -0,0 +1,4 @@
1
+ LastName,FirstName,MiddleName,TitleName,AuditID,AuditByID,ChangeDTTM,AuditCode,AuditDescription,ObjectType,ReasonCodeID,ObjectID,AuthorizingName,DocumentID,DocumentAudTypeID,DocumentAudOrgID,DocumentAudComments,DocumentAudFileCount,DocumentAudIsInactiveFlag,PatientID,InternalExamID,PatientLast,PatientMiddle,PatientFirst
2
+ Charles,Dingle,W,RIS Administrator,12345,1234,4/20/2012 2:54:26PM,MERGE,Merge Folders,DOCUMENT,0,999,,888,38,2,"Dropped 8 lb weight on foot. c/o pain, swelling over 3rd, 4th metatarsals. Possible non-displaced fractures of proximal 3rd and 4th metatarsals seen on one view.",0,N,888,777,WHIPPLE,D,ANNE
3
+ Gomez,Bungle,,,6,8,4/24/2012 10:31:53AM,MERGE,Merge Folders,DOCUMENT,0,101,,505,38,2,"broke little toe fracture prox phalanx, angulated (I reduced it but did not order a post reduction film.) sumner",0,N,499,172,SMITH,E,STEFANIE
4
+ Gomez,Bangle,,,59,8,3/28/2012 3:26:39PM,MERGE,Merge Folders,DOCUMENT,0,11,,22,38,2,neg.,0,N,380,173,OCHS,JONES,CARL
@@ -39,4 +39,16 @@ describe "mending" do
39
39
  # results[:field_counts].should == {11 => 11}
40
40
  output.should == correct_output
41
41
  end
42
+
43
+ it "should consolidate fields with embedded newlines" do
44
+ filename = File.dirname(__FILE__) + "/../data/newlines_in_input.psv"
45
+ tmp = Tempfile.new('mending')
46
+ results = Masticate.mend(filename, :col_sep => '|', :output => tmp)
47
+ output = File.read(tmp)
48
+ correct_output = File.read(File.dirname(__FILE__) + "/../data/newlines_merged.csv")
49
+
50
+ results[:input_count].should == 6
51
+ results[:output_count].should == 4
52
+ output.should == correct_output
53
+ end
42
54
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: masticate
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-05-30 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
16
- requirement: &2153051960 !ruby/object:Gem::Requirement
16
+ requirement: &2157784500 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: 0.9.2
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2153051960
24
+ version_requirements: *2157784500
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rspec
27
- requirement: &2153051460 !ruby/object:Gem::Requirement
27
+ requirement: &2157784000 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: 2.9.0
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2153051460
35
+ version_requirements: *2157784000
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: guard-rspec
38
- requirement: &2153051000 !ruby/object:Gem::Requirement
38
+ requirement: &2157783540 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: 0.7.0
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2153051000
46
+ version_requirements: *2157783540
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: ruby_gntp
49
- requirement: &2153050540 !ruby/object:Gem::Requirement
49
+ requirement: &2157783080 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ~>
@@ -54,7 +54,7 @@ dependencies:
54
54
  version: 0.3.4
55
55
  type: :development
56
56
  prerelease: false
57
- version_requirements: *2153050540
57
+ version_requirements: *2157783080
58
58
  description: Data file crunching
59
59
  email:
60
60
  - jmay@pobox.com
@@ -107,6 +107,8 @@ files:
107
107
  - spec/data/junk_trailer.txt
108
108
  - spec/data/namedcols.csv
109
109
  - spec/data/namedcols.csv.output
110
+ - spec/data/newlines_in_input.psv
111
+ - spec/data/newlines_merged.csv
110
112
  - spec/data/pipe_data.txt
111
113
  - spec/data/quoted_csv_data.txt
112
114
  - spec/data/recipe.txt
@@ -172,6 +174,8 @@ test_files:
172
174
  - spec/data/junk_trailer.txt
173
175
  - spec/data/namedcols.csv
174
176
  - spec/data/namedcols.csv.output
177
+ - spec/data/newlines_in_input.psv
178
+ - spec/data/newlines_merged.csv
175
179
  - spec/data/pipe_data.txt
176
180
  - spec/data/quoted_csv_data.txt
177
181
  - spec/data/recipe.txt