csv-diff 0.3.5 → 0.5.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: c994476b56ebd4cc355f6ee7b668d57396a064e3
-  data.tar.gz: 895f8255b56537b98a7a9ddf54fbfac8dfabcbfb
+  metadata.gz: d1b3b8deee34344d334e740285cb1f3c99074694
+  data.tar.gz: d95158d13861cb66fd460ee430714ec3c83cd0b1
 SHA512:
-  metadata.gz: 9340c4debf4ccc9cac6f08c08e2f3310acb191eb31ba6ede84512f7f1e3111dc6f123b74e46ad5baf141a0169ee4712db20f782cad3a393193724d3c29dd459f
-  data.tar.gz: 95c0008ca9b8069bd017ac904e8795853ec181eb6c9906252d9dbb32224a9f49e8eb48084ba9740aa9f8d048290319d4dcc6dd8e3e25bb660004ec50d4d038f9
+  metadata.gz: 50c74d6a4093012b0ba44fef70c2d749348d6777cfb9f2cfda66c6e075423191a4c6c22019a388b9d8bd14e22ac60d539f4e3b4aa85fd87fd774a64da15858c7
+  data.tar.gz: 8fa030a54e7a97db9913b3c36a1942de1e07a6549f9ae7aa58b5b3f44d522fe11f72d44e18b6b7612d2b2dc9f106ece1fea183557c507bcf18316891ab63f230
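
For anyone who wants to check a downloaded copy against the new values, a minimal sketch (it assumes csv-diff-0.5.0.gem has been fetched and unpacked with tar, leaving metadata.gz and data.tar.gz in the current directory):

    require 'digest'

    # Recompute the SHA512 digests and compare them by eye against the entries above.
    %w[metadata.gz data.tar.gz].each do |file|
      puts "#{file}: #{Digest::SHA512.file(file).hexdigest}"
    end
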
data/LICENSE CHANGED
File without changes
data/README.md CHANGED
File without changes
data/lib/csv_diff.rb CHANGED
@@ -1,3 +1,4 @@
+require 'csv-diff/source'
 require 'csv-diff/csv_source'
 require 'csv-diff/algorithm'
 require 'csv-diff/csv_diff'
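
A quick check of the new load order (hypothetical, assuming this entry file is what require 'csv_diff' loads): the Source base class has to be defined before csv_source.rb reopens CSVSource as its subclass.

    require 'csv_diff'

    puts CSVDiff::CSVSource < CSVDiff::Source   # => true
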
data/lib/csv-diff/algorithm.rb CHANGED
@@ -36,8 +36,6 @@ class CSVDiff
     # For backwards compatibility and access to fields with differences
     def [](key)
         case key
-        when String
-            @fields[key]
         when :action
             a = diff_type.to_s
             a[0] = a[0].upcase
@@ -46,6 +44,8 @@ class CSVDiff
             @row
         when :sibling_position
             @sibling_position
+        else
+            @fields[key]
         end
     end
 
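
What the reshuffled case achieves (a hedged illustration, not taken from the gem's docs): symbol keys keep their special accessors, while any other key, not just a String, now falls through to the underlying fields hash, so changed fields can be looked up directly on each diff record. File and field names below are invented.

    require 'csv_diff'

    diff = CSVDiff.new('employees_2019.csv', 'employees_2020.csv', key_field: 'EmployeeID')

    # Works whether the updates collection yields records or key/record pairs.
    diff.updates.each do |*_, rec|
      puts "#{rec[:action]} at row #{rec[:row]}: #{rec['Department'].inspect}"
    end
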
@@ -78,6 +78,9 @@ class CSVDiff
         unless left.case_sensitive? == right.case_sensitive?
             raise ArgumentError, "Left and right must have same settings for case-sensitivity"
         end
+        unless left.parent_fields.length == right.parent_fields.length
+            raise ArgumentError, "Left and right must have same settings for parent/child fields"
+        end
 
         # Ensure key fields are not also in the diff_fields
         diff_fields = diff_fields - key_fields
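
A hedged illustration of what the new guard catches: sources whose keys imply different parent/child splits now fail fast instead of producing a confusing diff. Names below are invented.

    # Left keys rows by Region then ID; right keys rows by ID alone.
    left  = CSVDiff::CSVSource.new('orgs_old.csv', parent_field: 'Region', child_field: 'ID')
    right = CSVDiff::CSVSource.new('orgs_new.csv', key_field: 'ID')

    CSVDiff.new(left, right)
    # => ArgumentError: Left and right must have same settings for parent/child fields
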
data/lib/csv-diff/csv_diff.rb CHANGED
@@ -81,9 +81,11 @@ class CSVDiff
     # @option options [Boolean] :ignore_deletes If true, records that appear
     #   in the left/from file but not in the right/to file are not reported.
     def initialize(left, right, options = {})
-        @left = left.is_a?(CSVSource) ? left : CSVSource.new(left, options)
+        @left = left.is_a?(Source) ? left : CSVSource.new(left, options)
+        @left.index_source if @left.lines.nil?
         raise "No field names found in left (from) source" unless @left.field_names && @left.field_names.size > 0
-        @right = right.is_a?(CSVSource) ? right : CSVSource.new(right, options)
+        @right = right.is_a?(Source) ? right : CSVSource.new(right, options)
+        @right.index_source if @right.lines.nil?
         raise "No field names found in right (to) source" unless @right.field_names && @right.field_names.size > 0
         @warnings = []
         @diff_fields = get_diff_fields(@left.field_names, @right.field_names, options)
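
A hedged sketch of the relaxed type check: any pre-built Source subclass is now accepted as-is (a CSVSource arrives already indexed, so the lines.nil? guard does nothing for it), while a plain path or Array of Arrays is still wrapped in a CSVSource. File and field names are invented.

    left  = CSVDiff::CSVSource.new('employees_2019.csv', key_field: 'EmployeeID')
    right = 'employees_2020.csv'

    diff = CSVDiff.new(left, right, key_field: 'EmployeeID')
    puts "#{diff.adds.size} added, #{diff.deletes.size} removed, #{diff.updates.size} changed"
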
@@ -141,7 +143,7 @@ class CSVDiff
         ignore_fields = options.fetch(:ignore_fields, [])
         ignore_fields = [ignore_fields] unless ignore_fields.is_a?(Array)
         ignore_fields.map! do |f|
-            (f.is_a?(Fixnum) ? right_fields[f] : f).upcase
+            (f.is_a?(Numeric) ? right_fields[f] : f).upcase
         end
         diff_fields = []
         if options[:diff_common_fields_only]
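
For context (my note, not the changelog's): Fixnum was deprecated when Ruby 2.4 folded it into Integer, so the index test now checks Numeric. Field names and 0-based column indexes both keep working in :ignore_fields, as in this hypothetical call:

    diff = CSVDiff.new('old.csv', 'new.csv', key_field: 'ID',
                       ignore_fields: ['LastUpdated', 4])
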
data/lib/csv-diff/csv_source.rb CHANGED
@@ -2,57 +2,7 @@ class CSVDiff
 
     # Represents a CSV input (i.e. the left/from or right/to input) to the diff
     # process.
-    class CSVSource
-
-        # @return [String] the path to the source file
-        attr_accessor :path
-
-        # @return [Array<String>] The names of the fields in the source file
-        attr_reader :field_names
-        # @return [Array<String>] The names of the field(s) that uniquely
-        #   identify each row.
-        attr_reader :key_fields
-        # @return [Array<String>] The names of the field(s) that identify a
-        #   common parent of child records.
-        attr_reader :parent_fields
-        # @return [Array<String>] The names of the field(s) that distinguish a
-        #   child of a parent record.
-        attr_reader :child_fields
-
-        # @return [Array<Fixnum>] The indexes of the key fields in the source
-        #   file.
-        attr_reader :key_field_indexes
-        # @return [Array<Fixnum>] The indexes of the parent fields in the source
-        #   file.
-        attr_reader :parent_field_indexes
-        # @return [Array<Fixnum>] The indexes of the child fields in the source
-        #   file.
-        attr_reader :child_field_indexes
-
-        # @return [Boolean] True if the source has been indexed with case-
-        #   sensitive keys, or false if it has been indexed using upper-case key
-        #   values.
-        attr_reader :case_sensitive
-        alias_method :case_sensitive?, :case_sensitive
-        # @return [Boolean] True if leading/trailing whitespace should be stripped
-        #   from fields
-        attr_reader :trim_whitespace
-        # @return [Hash<String,Hash>] A hash containing each line of the source,
-        #   keyed on the values of the +key_fields+.
-        attr_reader :lines
-        # @return [Hash<String,Array<String>>] A hash containing each parent key,
-        #   and an Array of the child keys it is a parent of.
-        attr_reader :index
-        # @return [Array<String>] An array of any warnings encountered while
-        #   processing the source.
-        attr_reader :warnings
-        # @return [Fixnum] A count of the lines processed from this source.
-        #   Excludes any header and duplicate records identified during indexing.
-        attr_reader :line_count
-        # @return [Fixnum] A count of the lines from this source that were skipped,
-        #   due either to duplicate keys or filter conditions.
-        attr_reader :skip_count
-
+    class CSVSource < Source
 
         # Creates a new diff source.
         #
@@ -99,153 +49,23 @@ class CSVDiff
         #   regular expression(s). Source rows with a field value that satisfies
         #   the regular expressions will be excluded from the diff process.
         def initialize(source, options = {})
+            super(options)
             if source.is_a?(String)
                 require 'csv'
                 mode_string = options[:encoding] ? "r:#{options[:encoding]}" : 'r'
                 csv_options = options.fetch(:csv_options, {})
                 @path = source
-                source = CSV.open(@path, mode_string, csv_options).readlines
-            elsif !source.is_a?(Enumerable) || (source.is_a?(Enumerable) && source.size > 0 &&
-                !source.first.is_a?(Enumerable))
-                raise ArgumentError, "source must be a path to a file or an Enumerable<Enumerable>"
-            end
-            if (options.keys & [:parent_field, :parent_fields, :child_field, :child_fields]).empty? &&
-                (kf = options.fetch(:key_field, options[:key_fields]))
-                @key_fields = [kf].flatten
-                @parent_fields = @key_fields[0...-1]
-                @child_fields = @key_fields[-1..-1]
-            else
-                @parent_fields = [options.fetch(:parent_field, options[:parent_fields]) || []].flatten
-                @child_fields = [options.fetch(:child_field, options[:child_fields]) || [0]].flatten
-                @key_fields = @parent_fields + @child_fields
-            end
-            @field_names = options[:field_names]
-            @warnings = []
-            index_source(source, options)
-        end
-
-
-        # Returns the row in the CSV source corresponding to the supplied key.
-        #
-        # @param key [String] The unique key to use to lookup the row.
-        # @return [Hash] The fields for the line corresponding to +key+, or nil
-        #   if the key is not recognised.
-        def [](key)
-            @lines[key]
-        end
-
-
-        private
-
-        # Given an array of lines, where each line is an array of fields, indexes
-        # the array contents so that it can be looked up by key.
-        def index_source(lines, options)
-            @lines = {}
-            @index = Hash.new{ |h, k| h[k] = [] }
-            if @field_names
-                index_fields(options)
-            end
-            @case_sensitive = options.fetch(:case_sensitive, true)
-            @trim_whitespace = options.fetch(:trim_whitespace, false)
-            @line_count = 0
-            @skip_count = 0
-            line_num = 0
-            lines.each do |row|
-                line_num += 1
-                next if line_num == 1 && @field_names && options[:ignore_header]
-                unless @field_names
-                    @field_names = row.each_with_index.map{ |f, i| f || i.to_s }
-                    index_fields(options)
-                    next
-                end
-                field_vals = row
-                line = {}
-                filter = false
-                @field_names.each_with_index do |field, i|
-                    line[field] = field_vals[i]
-                    line[field].strip! if @trim_whitespace && line[field]
-                    if @include_filter && f = @include_filter[i]
-                        filter = !check_filter(f, line[field])
-                    end
-                    if @exclude_filter && f = @exclude_filter[i]
-                        filter = check_filter(f, line[field])
-                    end
-                    break if filter
-                end
-                if filter
-                    @skip_count += 1
-                    next
+                # When you call CSV.open, it's best to pass in a block so that after it's yielded,
+                # the underlying file handle is closed. Otherwise, you risk leaking the handle.
+                @data = CSV.open(@path, mode_string, csv_options) do |csv|
+                    csv.readlines
                 end
-                key_values = @key_field_indexes.map{ |kf| @case_sensitive ?
-                    field_vals[kf].to_s : field_vals[kf].to_s.upcase }
-                key = key_values.join('~')
-                parent_key = key_values[0...(@parent_fields.length)].join('~')
-                if @lines[key]
-                    @warnings << "Duplicate key '#{key}' encountered and ignored at line #{line_num}"
-                    @skip_count += 1
-                else
-                    @index[parent_key] << key
-                    @lines[key] = line
-                    @line_count += 1
-                end
-            end
-        end
-
-
-        def index_fields(options)
-            @key_field_indexes = find_field_indexes(@key_fields, @field_names)
-            @parent_field_indexes = find_field_indexes(@parent_fields, @field_names)
-            @child_field_indexes = find_field_indexes(@child_fields, @field_names)
-            @key_fields = @key_field_indexes.map{ |i| @field_names[i] }
-            @parent_fields = @parent_field_indexes.map{ |i| @field_names[i] }
-            @child_fields = @child_field_indexes.map{ |i| @field_names[i] }
-
-            @include_filter = convert_filter(options, :include, @field_names)
-            @exclude_filter = convert_filter(options, :exclude, @field_names)
-        end
-
-
-        # Converts an array of field names to an array of indexes of the fields
-        # matching those names.
-        def find_field_indexes(key_fields, field_names)
-            key_fields.map do |field|
-                if field.is_a?(Integer)
-                    field
-                else
-                    field_names.index{ |field_name| field.to_s.downcase == field_name.downcase } or
-                        raise ArgumentError, "Could not locate field '#{field}' in source field names: #{
-                            field_names.join(', ')}"
-                end
-            end
-        end
-
-
-        def convert_filter(options, key, field_names)
-            return unless hsh = options[key]
-            if !hsh.is_a?(Hash)
-                raise ArgumentError, ":#{key} option must be a Hash of field name(s)/index(es) to RegExp(s)"
-            end
-            keys = hsh.keys
-            idxs = find_field_indexes(keys, @field_names)
-            Hash[keys.each_with_index.map{ |k, i| [idxs[i], hsh[k]] }]
-        end
-
-
-        def check_filter(filter, field_val)
-            case filter
-            when String
-                if @case_sensitive
-                    filter == field_val
-                else
-                    filter.downcase == field_val.to_s.downcase
-                end
-            when Regexp
-                filter.match(field_val)
-            when Proc
-                filter.call(field_val)
+            elsif source.is_a?(Enumerable) && source.size == 0 || (source.size > 0 && source.first.is_a?(Enumerable))
+                @data = source
             else
-                raise ArgumentError, "Unsupported filter expression: #{filter.inspect}"
+                raise ArgumentError, "source must be a path to a file or an Enumerable<Enumerable>"
             end
+            index_source
         end
 
     end
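
A hedged usage sketch for the reworked constructor (file name and options invented): the CSV is now read inside a CSV.open block so the file handle is closed promptly, and the parsed rows are held in @data for the inherited index_source to index. Options such as :encoding and :csv_options pass through as before.

    source = CSVDiff::CSVSource.new('export.tsv',
                                    key_field: 'Code',
                                    csv_options: { col_sep: "\t" })

    puts source.field_names.inspect   # header row becomes the field names
    puts source.line_count            # data rows indexed (header excluded)
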
data/lib/csv-diff/source.rb ADDED
@@ -0,0 +1,275 @@
+class CSVDiff
+
+    # Represents an input (i.e. the left/from or right/to input) to the diff
+    # process.
+    class Source
+
+        # @return [String] the path to the source file
+        attr_accessor :path
+        # @return [Array<Array>] The data for this source
+        attr_reader :data
+
+        # @return [Array<String>] The names of the fields in the source file
+        attr_reader :field_names
+        # @return [Array<String>] The names of the field(s) that uniquely
+        #   identify each row.
+        attr_reader :key_fields
+        # @return [Array<String>] The names of the field(s) that identify a
+        #   common parent of child records.
+        attr_reader :parent_fields
+        # @return [Array<String>] The names of the field(s) that distinguish a
+        #   child of a parent record.
+        attr_reader :child_fields
+
+        # @return [Array<Fixnum>] The indexes of the key fields in the source
+        #   file.
+        attr_reader :key_field_indexes
+        # @return [Array<Fixnum>] The indexes of the parent fields in the source
+        #   file.
+        attr_reader :parent_field_indexes
+        # @return [Array<Fixnum>] The indexes of the child fields in the source
+        #   file.
+        attr_reader :child_field_indexes
+
+        # @return [Boolean] True if the source has been indexed with case-
+        #   sensitive keys, or false if it has been indexed using upper-case key
+        #   values.
+        attr_reader :case_sensitive
+        alias_method :case_sensitive?, :case_sensitive
+        # @return [Boolean] True if leading/trailing whitespace should be stripped
+        #   from fields
+        attr_reader :trim_whitespace
+        # @return [Hash<String,Hash>] A hash containing each line of the source,
+        #   keyed on the values of the +key_fields+.
+        attr_reader :lines
+        # @return [Hash<String,Array<String>>] A hash containing each parent key,
+        #   and an Array of the child keys it is a parent of.
+        attr_reader :index
+        # @return [Array<String>] An array of any warnings encountered while
+        #   processing the source.
+        attr_reader :warnings
+        # @return [Fixnum] A count of the lines processed from this source.
+        #   Excludes any header and duplicate records identified during indexing.
+        attr_reader :line_count
+        # @return [Fixnum] A count of the lines from this source that were skipped
+        #   due to filter conditions.
+        attr_reader :skip_count
+        # @return [Fixnum] A count of the lines from this source that had the same
+        #   key value as another line.
+        attr_reader :dup_count
+
+
+        # Creates a new diff source.
+        #
+        # A diff source must contain at least one field that will be used as the
+        # key to identify the same record in a different version of this file.
+        # If not specified via one of the options, the first field is assumed to
+        # be the unique key.
+        #
+        # If multiple fields combine to form a unique key, the parent is assumed
+        # to be identified by all but the last field of the unique key. If finer
+        # control is required, use a combination of the :parent_fields and
+        # :child_fields options.
+        #
+        # All key options can be specified either by field name, or by field
+        # index (0 based).
+        #
+        # @param options [Hash] An options hash.
+        # @option options [Array<String>] :field_names The names of each of the
+        #   fields in +source+.
+        # @option options [Boolean] :ignore_header If true, and :field_names has
+        #   been specified, then the first row of the file is ignored.
+        # @option options [String] :key_field The name of the field that uniquely
+        #   identifies each row.
+        # @option options [Array<String>] :key_fields The names of the fields
+        #   that uniquely identify each row.
+        # @option options [String] :parent_field The name of the field(s) that
+        #   identify a parent within which sibling order should be checked.
+        # @option options [String] :child_field The name of the field(s) that
+        #   uniquely identify a child of a parent.
+        # @option options [Boolean] :case_sensitive If true (the default), keys
+        #   are indexed as-is; if false, the index is built in upper-case for
+        #   case-insensitive comparisons.
+        # @option options [Hash] :include A hash of field name(s) or index(es) to
+        #   regular expression(s). Only source rows whose field values satisfy the
+        #   regular expressions will be indexed and included in the diff process.
+        # @option options [Hash] :exclude A hash of field name(s) or index(es) to
+        #   regular expression(s). Source rows with a field value that satisfies
+        #   the regular expressions will be excluded from the diff process.
+        def initialize(options = {})
+            if (options.keys & [:parent_field, :parent_fields, :child_field, :child_fields]).empty? &&
+                (kf = options.fetch(:key_field, options[:key_fields]))
+                @key_fields = [kf].flatten
+                @parent_fields = @key_fields[0...-1]
+                @child_fields = @key_fields[-1..-1]
+            else
+                @parent_fields = [options.fetch(:parent_field, options[:parent_fields]) || []].flatten
+                @child_fields = [options.fetch(:child_field, options[:child_fields]) || [0]].flatten
+                @key_fields = @parent_fields + @child_fields
+            end
+            @field_names = options[:field_names]
+            @case_sensitive = options.fetch(:case_sensitive, true)
+            @trim_whitespace = options.fetch(:trim_whitespace, false)
+            @ignore_header = options[:ignore_header]
+            @include = options[:include]
+            @exclude = options[:exclude]
+            @path = options.fetch(:path, 'NA') unless @path
+            @warnings = []
+        end
+
+
+        def path?
+            @path != 'NA'
+        end
+
+
+        # Returns the row in the CSV source corresponding to the supplied key.
+        #
+        # @param key [String] The unique key to use to lookup the row.
+        # @return [Hash] The fields for the line corresponding to +key+, or nil
+        #   if the key is not recognised.
+        def [](key)
+            @lines[key]
+        end
+
+
+        # Given an array of lines, where each line is an array of fields, indexes
+        # the array contents so that it can be looked up by key.
+        def index_source
+            @lines = {}
+            @index = Hash.new{ |h, k| h[k] = [] }
+            if @field_names
+                index_fields
+                include_filter = convert_filter(@include, @field_names)
+                exclude_filter = convert_filter(@exclude, @field_names)
+            end
+            @line_count = 0
+            @skip_count = 0
+            @dup_count = 0
+            line_num = 0
+            @data.each do |row|
+                line_num += 1
+                next if line_num == 1 && @field_names && @ignore_header
+                unless @field_names
+                    if row.class.name == 'CSV::Row'
+                        @field_names = row.headers.each_with_index.map{ |f, i| f || i.to_s }
+                    else
+                        @field_names = row.each_with_index.map{ |f, i| f || i.to_s }
+                    end
+                    index_fields
+                    include_filter = convert_filter(@include, @field_names)
+                    exclude_filter = convert_filter(@exclude, @field_names)
+                    next
+                end
+                field_vals = row
+                line = {}
+                filter = false
+                @field_names.each_with_index do |field, i|
+                    val = field_vals[i]
+                    val = val.to_s.strip if val && @trim_whitespace
+                    line[field] = val
+                    if include_filter && f = include_filter[i]
+                        filter = !check_filter(f, line[field])
+                    end
+                    if exclude_filter && f = exclude_filter[i]
+                        filter = check_filter(f, line[field])
+                    end
+                    break if filter
+                end
+                if filter
+                    @skip_count += 1
+                    next
+                end
+                key_values = @key_field_indexes.map{ |kf| @case_sensitive ?
+                    field_vals[kf].to_s :
+                    field_vals[kf].to_s.upcase }
+                key = key_values.join('~')
+                parent_key = key_values[0...(@parent_fields.length)].join('~')
+                if @lines[key]
+                    @warnings << "Duplicate key '#{key}' encountered at line #{line_num}"
+                    @dup_count += 1
+                    key += "[#{@dup_count}]"
+                end
+                @index[parent_key] << key
+                @lines[key] = line
+                @line_count += 1
+            end
+        end
+
+
+        # Save the data in this Source as a CSV at +file_path+.
+        #
+        # @param file_path [String] The target path to save the data to.
+        # @param options [Hash] A set of options to pass to CSV.open to control
+        #   how the CSV is generated.
+        def save_csv(file_path, options = {})
+            require 'csv'
+            default_opts = {
+                headers: @field_names, write_headers: true
+            }
+            CSV.open(file_path, 'wb', default_opts.merge(options)) do |csv|
+                @data.each{ |rec| csv << rec }
+            end
+        end
+
+
+        private
+
+
+        def index_fields
+            @key_field_indexes = find_field_indexes(@key_fields, @field_names)
+            @parent_field_indexes = find_field_indexes(@parent_fields, @field_names)
+            @child_field_indexes = find_field_indexes(@child_fields, @field_names)
+            @key_fields = @key_field_indexes.map{ |i| @field_names[i] }
+            @parent_fields = @parent_field_indexes.map{ |i| @field_names[i] }
+            @child_fields = @child_field_indexes.map{ |i| @field_names[i] }
+        end
+
+
+        # Converts an array of field names to an array of indexes of the fields
+        # matching those names.
+        def find_field_indexes(key_fields, field_names)
+            key_fields.map do |field|
+                if field.is_a?(Integer)
+                    field
+                else
+                    field_names.index{ |field_name| field.to_s.downcase == field_name.to_s.downcase } or
+                        raise ArgumentError, "Could not locate field '#{field}' in source field names: #{
+                            field_names.join(', ')}"
+                end
+            end
+        end
+
+
+        def convert_filter(hsh, field_names)
+            return unless hsh
+            if !hsh.is_a?(Hash)
+                raise ArgumentError, ":include/:exclude option must be a Hash of field name(s)/index(es) to RegExp(s)"
+            end
+            keys = hsh.keys
+            idxs = find_field_indexes(keys, @field_names)
+            Hash[keys.each_with_index.map{ |k, i| [idxs[i], hsh[k]] }]
+        end
+
+
+        def check_filter(filter, field_val)
+            case filter
+            when String
+                if @case_sensitive
+                    filter == field_val
+                else
+                    filter.downcase == field_val.to_s.downcase
+                end
+            when Regexp
+                filter.match(field_val)
+            when Proc
+                filter.call(field_val)
+            else
+                raise ArgumentError, "Unsupported filter expression: #{filter.inspect}"
+            end
+        end
+
+    end
+
+end
+
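
A hedged sketch of the new base-class behaviour (names invented): include/exclude filters are applied while indexing, duplicate keys are now retained with a [n] suffix rather than dropped, and any source can be written back out with save_csv (which writes the raw @data, not the filtered index).

    src = CSVDiff::CSVSource.new('accounts.csv',
                                 key_field: 'Account',
                                 include: { 'Status' => /active/i },
                                 exclude: { 'Account' => /^TEST/ })

    puts "#{src.line_count} rows indexed, #{src.skip_count} filtered, #{src.dup_count} duplicate keys"
    puts src.warnings.inspect unless src.warnings.empty?

    src.save_csv('accounts_raw_copy.csv')
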
data/lib/csv-diff/xml_source.rb ADDED
@@ -0,0 +1,142 @@
+require 'nokogiri'
+require 'cgi'
+
+
+class CSVDiff
+
+    # Convert XML content to CSV format using XPath selectors to identify the
+    # rows and field values in an XML document
+    class XMLSource < Source
+
+        attr_accessor :context
+
+        # Create a new XMLSource, identified by +path+. Normally this is a path
+        # to the XML document, but any value is fine, as it is really just a label
+        # to identify this data set.
+        #
+        # @param path [String] A label for this data set (often a path to the
+        #   XML document used as the source).
+        # @param options [Hash] An options hash.
+        # @option options [Array<String>] :field_names The names of each of the
+        #   fields in +source+.
+        # @option options [Boolean] :ignore_header If true, and :field_names has
+        #   been specified, then the first row of the file is ignored.
+        # @option options [String] :key_field The name of the field that uniquely
+        #   identifies each row.
+        # @option options [Array<String>] :key_fields The names of the fields
+        #   that uniquely identify each row.
+        # @option options [String] :parent_field The name of the field(s) that
+        #   identify a parent within which sibling order should be checked.
+        # @option options [String] :child_field The name of the field(s) that
+        #   uniquely identify a child of a parent.
+        # @option options [Boolean] :case_sensitive If true (the default), keys
+        #   are indexed as-is; if false, the index is built in upper-case for
+        #   case-insensitive comparisons.
+        # @option options [Hash] :include A hash of field name(s) or index(es) to
+        #   regular expression(s). Only source rows whose field values satisfy the
+        #   regular expressions will be indexed and included in the diff process.
+        # @option options [Hash] :exclude A hash of field name(s) or index(es) to
+        #   regular expression(s). Source rows with a field value that satisfies
+        #   the regular expressions will be excluded from the diff process.
+        # @option options [String] :context A context value from which fields
+        #   can be populated using a Regexp.
+        def initialize(path, options = {})
+            super(options)
+            @path = path
+            @context = options[:context]
+            @data = []
+        end
+
+
+        # Process a +source+, converting the XML into a table of data, using
+        # +rec_xpath+ to identify the nodes that correspond to each record that
+        # should appear in the output, and +field_maps+ to populate each field
+        # in each row.
+        #
+        # @param source [String|Array] may be a String containing XML content,
+        #   an Array of paths to files containing XML content, or a path to
+        #   a single file.
+        # @param rec_xpath [String] An XPath expression that selects all the
+        #   items in the XML document that are to be converted into new rows.
+        #   The returned items are not directly used to populate the fields,
+        #   but provide a context for the field XPath expressions that populate
+        #   each field's content.
+        # @param field_maps [Hash<String, String>] A map of field names to
+        #   expressions that are evaluated in the context of each row node
+        #   selected by +rec_xpath+. The field expressions are typically XPath
+        #   expressions evaluated in the context of the nodes returned by the
+        #   +rec_xpath+. Alternatively, a String that is not an XPath expression
+        #   is used as a literal value for a field, while a Regexp can also
+        #   be used to pull a value from any context specified in the +options+
+        #   hash. The Regexp should include a single grouping, as the value used
+        #   will be the result in $1 after the match is performed.
+        # @param context [String] An optional context for the XML to be processed.
+        #   The value passed here can be referenced in field map expressions
+        #   using a Regexp, with the value of the first grouping in the regex
+        #   being the value returned for the field.
+        def process(source, rec_xpath, field_maps, context = nil)
+            @field_names = field_maps.keys unless @field_names
+            case source
+            when Nokogiri::XML::Document
+                add_data(source, rec_xpath, field_maps, context || @context)
+            when /<\?xml/
+                doc = Nokogiri::XML(source)
+                add_data(doc, rec_xpath, field_maps, context || @context)
+            when Array
+                source.each{ |f| process_file(f, rec_xpath, field_maps) }
+            when String
+                process_file(source, rec_xpath, field_maps)
+            else
+                raise ArgumentError, "Unhandled source type #{source.class.name}"
+            end
+            @data
+        end
+
+
+        private
+
+
+        # Load the XML document at +file_path+ and process it into rows of data.
+        def process_file(file_path, rec_xpath, field_maps)
+            begin
+                File.open(file_path) do |f|
+                    doc = Nokogiri::XML(f)
+                    add_data(doc, rec_xpath, field_maps, @context || file_path)
+                end
+            rescue
+                STDERR.puts "An error occurred while attempting to open #{file_path}"
+                raise
+            end
+        end
+
+
+        # Locate records in +doc+ using +rec_xpath+ to identify the nodes that
+        # correspond to a new record in the data, and +field_maps+ to populate
+        # the fields in each row.
+        def add_data(doc, rec_xpath, field_maps, context)
+            doc.xpath(rec_xpath).each do |rec_node|
+                rec = []
+                field_maps.each do |field_name, expr|
+                    case expr
+                    when Regexp # Match context against Regexp and extract first grouping
+                        if context
+                            context =~ expr
+                            rec << $1
+                        else
+                            rec << nil
+                        end
+                    when %r{[/(.@]} # XPath expression
+                        res = rec_node.xpath(expr)
+                        rec << CGI.unescape_html(res.to_s)
+                    else # Use expr as the value for this field
+                        rec << expr
+                    end
+                end
+                @data << rec
+            end
+        end
+
+    end
+
+end
+
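
A hedged end-to-end sketch of the new XML support (element names, XPaths and file names are invented, and it assumes xml_source.rb is required explicitly, since the entry file above does not load it and it depends on nokogiri): two versions of a document are flattened with the same record XPath and field maps, then diffed like any other table-like source.

    require 'csv_diff'
    require 'csv-diff/xml_source'

    # '@id', 'Name/text()' and 'Price/text()' are XPath expressions evaluated
    # against each //Product node selected by the record XPath.
    field_maps = { 'ID' => '@id', 'Name' => 'Name/text()', 'Price' => 'Price/text()' }

    old_src = CSVDiff::XMLSource.new('products_v1.xml', key_field: 'ID')
    old_src.process('products_v1.xml', '//Product', field_maps)

    new_src = CSVDiff::XMLSource.new('products_v2.xml', key_field: 'ID')
    new_src.process('products_v2.xml', '//Product', field_maps)

    diff = CSVDiff.new(old_src, new_src)
    puts "#{diff.adds.size} adds, #{diff.deletes.size} deletes, #{diff.updates.size} updates"
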
metadata CHANGED
@@ -1,17 +1,17 @@
 --- !ruby/object:Gem::Specification
 name: csv-diff
 version: !ruby/object:Gem::Version
-  version: 0.3.5
+  version: 0.5.0
 platform: ruby
 authors:
 - Adam Gardiner
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-03-05 00:00:00.000000000 Z
+date: 2020-07-15 00:00:00.000000000 Z
 dependencies: []
 description: |2
-  This library performs diffs of CSV files (or table-like sources).
+  This library performs diffs of CSV data, or any table-like source.
 
   Unlike a standard diff that compares line by line, and is sensitive to the
   ordering of records, CSV-Diff identifies common lines by key field(s), and
@@ -29,7 +29,9 @@ description: |2
   sibling order.
 
   This gem implements the core diff algorithm, and handles the loading and
-  diffing of CSV files (or Arrays of Arrays). It returns a CSVDiff object
+  diffing of CSV files (or Arrays of Arrays). It also supports converting
+  data in XML format into tabular form, so that it can then be processed
+  like any other CSV or table-like source. It returns a CSVDiff object
   containing the details of differences in object form. This is useful for
   projects that need diff capability, but want to handle the reporting or
   actioning of differences themselves.
@@ -48,6 +50,8 @@ files:
 - lib/csv-diff/algorithm.rb
 - lib/csv-diff/csv_diff.rb
 - lib/csv-diff/csv_source.rb
+- lib/csv-diff/source.rb
+- lib/csv-diff/xml_source.rb
 - lib/csv_diff.rb
 homepage: https://github.com/agardiner/csv-diff
 licenses:
@@ -69,8 +73,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.5.2
+rubygems_version: 2.5.2.3
 signing_key:
 specification_version: 4
-summary: CSV Diff is a library for generating diffs from data in CSV format
+summary: CSV Diff is a library for generating diffs from data in CSV or XML format
 test_files: []