csv-diff 0.3.5 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE +0 -0
- data/README.md +0 -0
- data/lib/csv-diff.rb +1 -0
- data/lib/csv-diff/algorithm.rb +5 -2
- data/lib/csv-diff/csv_diff.rb +5 -3
- data/lib/csv-diff/csv_source.rb +10 -190
- data/lib/csv-diff/source.rb +275 -0
- data/lib/csv-diff/xml_source.rb +142 -0
- data/lib/csv_diff.rb +0 -0
- metadata +10 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d1b3b8deee34344d334e740285cb1f3c99074694
|
4
|
+
data.tar.gz: d95158d13861cb66fd460ee430714ec3c83cd0b1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 50c74d6a4093012b0ba44fef70c2d749348d6777cfb9f2cfda66c6e075423191a4c6c22019a388b9d8bd14e22ac60d539f4e3b4aa85fd87fd774a64da15858c7
|
7
|
+
data.tar.gz: 8fa030a54e7a97db9913b3c36a1942de1e07a6549f9ae7aa58b5b3f44d522fe11f72d44e18b6b7612d2b2dc9f106ece1fea183557c507bcf18316891ab63f230
|
data/LICENSE
CHANGED
File without changes
|
data/README.md
CHANGED
File without changes
|
data/lib/csv-diff.rb
CHANGED
data/lib/csv-diff/algorithm.rb
CHANGED
@@ -36,8 +36,6 @@ class CSVDiff
|
|
36
36
|
# For backwards compatibility and access to fields with differences
|
37
37
|
def [](key)
|
38
38
|
case key
|
39
|
-
when String
|
40
|
-
@fields[key]
|
41
39
|
when :action
|
42
40
|
a = diff_type.to_s
|
43
41
|
a[0] = a[0].upcase
|
@@ -46,6 +44,8 @@ class CSVDiff
|
|
46
44
|
@row
|
47
45
|
when :sibling_position
|
48
46
|
@sibling_position
|
47
|
+
else
|
48
|
+
@fields[key]
|
49
49
|
end
|
50
50
|
end
|
51
51
|
|
@@ -78,6 +78,9 @@ class CSVDiff
|
|
78
78
|
unless left.case_sensitive? == right.case_sensitive?
|
79
79
|
raise ArgumentError, "Left and right must have same settings for case-sensitivity"
|
80
80
|
end
|
81
|
+
unless left.parent_fields.length == right.parent_fields.length
|
82
|
+
raise ArgumentError, "Left and right must have same settings for parent/child fields"
|
83
|
+
end
|
81
84
|
|
82
85
|
# Ensure key fields are not also in the diff_fields
|
83
86
|
diff_fields = diff_fields - key_fields
|
data/lib/csv-diff/csv_diff.rb
CHANGED
@@ -81,9 +81,11 @@ class CSVDiff
|
|
81
81
|
# @option options [Boolean] :ignore_deletes If true, records that appear
|
82
82
|
# in the left/from file but not in the right/to file are not reported.
|
83
83
|
def initialize(left, right, options = {})
|
84
|
-
@left = left.is_a?(
|
84
|
+
@left = left.is_a?(Source) ? left : CSVSource.new(left, options)
|
85
|
+
@left.index_source if @left.lines.nil?
|
85
86
|
raise "No field names found in left (from) source" unless @left.field_names && @left.field_names.size > 0
|
86
|
-
@right = right.is_a?(
|
87
|
+
@right = right.is_a?(Source) ? right : CSVSource.new(right, options)
|
88
|
+
@right.index_source if @right.lines.nil?
|
87
89
|
raise "No field names found in right (to) source" unless @right.field_names && @right.field_names.size > 0
|
88
90
|
@warnings = []
|
89
91
|
@diff_fields = get_diff_fields(@left.field_names, @right.field_names, options)
|
@@ -141,7 +143,7 @@ class CSVDiff
|
|
141
143
|
ignore_fields = options.fetch(:ignore_fields, [])
|
142
144
|
ignore_fields = [ignore_fields] unless ignore_fields.is_a?(Array)
|
143
145
|
ignore_fields.map! do |f|
|
144
|
-
(f.is_a?(
|
146
|
+
(f.is_a?(Numeric) ? right_fields[f] : f).upcase
|
145
147
|
end
|
146
148
|
diff_fields = []
|
147
149
|
if options[:diff_common_fields_only]
|
data/lib/csv-diff/csv_source.rb
CHANGED
@@ -2,57 +2,7 @@ class CSVDiff
|
|
2
2
|
|
3
3
|
# Represents a CSV input (i.e. the left/from or right/to input) to the diff
|
4
4
|
# process.
|
5
|
-
class CSVSource
|
6
|
-
|
7
|
-
# @return [String] the path to the source file
|
8
|
-
attr_accessor :path
|
9
|
-
|
10
|
-
# @return [Array<String>] The names of the fields in the source file
|
11
|
-
attr_reader :field_names
|
12
|
-
# @return [Array<String>] The names of the field(s) that uniquely
|
13
|
-
# identify each row.
|
14
|
-
attr_reader :key_fields
|
15
|
-
# @return [Array<String>] The names of the field(s) that identify a
|
16
|
-
# common parent of child records.
|
17
|
-
attr_reader :parent_fields
|
18
|
-
# @return [Array<String>] The names of the field(s) that distinguish a
|
19
|
-
# child of a parent record.
|
20
|
-
attr_reader :child_fields
|
21
|
-
|
22
|
-
# @return [Array<Fixnum>] The indexes of the key fields in the source
|
23
|
-
# file.
|
24
|
-
attr_reader :key_field_indexes
|
25
|
-
# @return [Array<Fixnum>] The indexes of the parent fields in the source
|
26
|
-
# file.
|
27
|
-
attr_reader :parent_field_indexes
|
28
|
-
# @return [Array<Fixnum>] The indexes of the child fields in the source
|
29
|
-
# file.
|
30
|
-
attr_reader :child_field_indexes
|
31
|
-
|
32
|
-
# @return [Boolean] True if the source has been indexed with case-
|
33
|
-
# sensitive keys, or false if it has been indexed using upper-case key
|
34
|
-
# values.
|
35
|
-
attr_reader :case_sensitive
|
36
|
-
alias_method :case_sensitive?, :case_sensitive
|
37
|
-
# @return [Boolean] True if leading/trailing whitespace should be stripped
|
38
|
-
# from fields
|
39
|
-
attr_reader :trim_whitespace
|
40
|
-
# @return [Hash<String,Hash>] A hash containing each line of the source,
|
41
|
-
# keyed on the values of the +key_fields+.
|
42
|
-
attr_reader :lines
|
43
|
-
# @return [Hash<String,Array<String>>] A hash containing each parent key,
|
44
|
-
# and an Array of the child keys it is a parent of.
|
45
|
-
attr_reader :index
|
46
|
-
# @return [Array<String>] An array of any warnings encountered while
|
47
|
-
# processing the source.
|
48
|
-
attr_reader :warnings
|
49
|
-
# @return [Fixnum] A count of the lines processed from this source.
|
50
|
-
# Excludes any header and duplicate records identified during indexing.
|
51
|
-
attr_reader :line_count
|
52
|
-
# @return [Fixnum] A count of the lines from this source that were skipped,
|
53
|
-
# due either to duplicate keys or filter conditions.
|
54
|
-
attr_reader :skip_count
|
55
|
-
|
5
|
+
class CSVSource < Source
|
56
6
|
|
57
7
|
# Creates a new diff source.
|
58
8
|
#
|
@@ -99,153 +49,23 @@ class CSVDiff
|
|
99
49
|
# regular expression(s). Source rows with a field value that satisfies
|
100
50
|
# the regular expressions will be excluded from the diff process.
|
101
51
|
def initialize(source, options = {})
|
52
|
+
super(options)
|
102
53
|
if source.is_a?(String)
|
103
54
|
require 'csv'
|
104
55
|
mode_string = options[:encoding] ? "r:#{options[:encoding]}" : 'r'
|
105
56
|
csv_options = options.fetch(:csv_options, {})
|
106
57
|
@path = source
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
end
|
112
|
-
if (options.keys & [:parent_field, :parent_fields, :child_field, :child_fields]).empty? &&
|
113
|
-
(kf = options.fetch(:key_field, options[:key_fields]))
|
114
|
-
@key_fields = [kf].flatten
|
115
|
-
@parent_fields = @key_fields[0...-1]
|
116
|
-
@child_fields = @key_fields[-1..-1]
|
117
|
-
else
|
118
|
-
@parent_fields = [options.fetch(:parent_field, options[:parent_fields]) || []].flatten
|
119
|
-
@child_fields = [options.fetch(:child_field, options[:child_fields]) || [0]].flatten
|
120
|
-
@key_fields = @parent_fields + @child_fields
|
121
|
-
end
|
122
|
-
@field_names = options[:field_names]
|
123
|
-
@warnings = []
|
124
|
-
index_source(source, options)
|
125
|
-
end
|
126
|
-
|
127
|
-
|
128
|
-
# Returns the row in the CSV source corresponding to the supplied key.
|
129
|
-
#
|
130
|
-
# @param key [String] The unique key to use to lookup the row.
|
131
|
-
# @return [Hash] The fields for the line corresponding to +key+, or nil
|
132
|
-
# if the key is not recognised.
|
133
|
-
def [](key)
|
134
|
-
@lines[key]
|
135
|
-
end
|
136
|
-
|
137
|
-
|
138
|
-
private
|
139
|
-
|
140
|
-
# Given an array of lines, where each line is an array of fields, indexes
|
141
|
-
# the array contents so that it can be looked up by key.
|
142
|
-
def index_source(lines, options)
|
143
|
-
@lines = {}
|
144
|
-
@index = Hash.new{ |h, k| h[k] = [] }
|
145
|
-
if @field_names
|
146
|
-
index_fields(options)
|
147
|
-
end
|
148
|
-
@case_sensitive = options.fetch(:case_sensitive, true)
|
149
|
-
@trim_whitespace = options.fetch(:trim_whitespace, false)
|
150
|
-
@line_count = 0
|
151
|
-
@skip_count = 0
|
152
|
-
line_num = 0
|
153
|
-
lines.each do |row|
|
154
|
-
line_num += 1
|
155
|
-
next if line_num == 1 && @field_names && options[:ignore_header]
|
156
|
-
unless @field_names
|
157
|
-
@field_names = row.each_with_index.map{ |f, i| f || i.to_s }
|
158
|
-
index_fields(options)
|
159
|
-
next
|
160
|
-
end
|
161
|
-
field_vals = row
|
162
|
-
line = {}
|
163
|
-
filter = false
|
164
|
-
@field_names.each_with_index do |field, i|
|
165
|
-
line[field] = field_vals[i]
|
166
|
-
line[field].strip! if @trim_whitespace && line[field]
|
167
|
-
if @include_filter && f = @include_filter[i]
|
168
|
-
filter = !check_filter(f, line[field])
|
169
|
-
end
|
170
|
-
if @exclude_filter && f = @exclude_filter[i]
|
171
|
-
filter = check_filter(f, line[field])
|
172
|
-
end
|
173
|
-
break if filter
|
174
|
-
end
|
175
|
-
if filter
|
176
|
-
@skip_count += 1
|
177
|
-
next
|
58
|
+
# When you call CSV.open, it's best to pass in a block so that after it's yielded,
|
59
|
+
# the underlying file handle is closed. Otherwise, you risk leaking the handle.
|
60
|
+
@data = CSV.open(@path, mode_string, csv_options) do |csv|
|
61
|
+
csv.readlines
|
178
62
|
end
|
179
|
-
|
180
|
-
|
181
|
-
key = key_values.join('~')
|
182
|
-
parent_key = key_values[0...(@parent_fields.length)].join('~')
|
183
|
-
if @lines[key]
|
184
|
-
@warnings << "Duplicate key '#{key}' encountered and ignored at line #{line_num}"
|
185
|
-
@skip_count += 1
|
186
|
-
else
|
187
|
-
@index[parent_key] << key
|
188
|
-
@lines[key] = line
|
189
|
-
@line_count += 1
|
190
|
-
end
|
191
|
-
end
|
192
|
-
end
|
193
|
-
|
194
|
-
|
195
|
-
def index_fields(options)
|
196
|
-
@key_field_indexes = find_field_indexes(@key_fields, @field_names)
|
197
|
-
@parent_field_indexes = find_field_indexes(@parent_fields, @field_names)
|
198
|
-
@child_field_indexes = find_field_indexes(@child_fields, @field_names)
|
199
|
-
@key_fields = @key_field_indexes.map{ |i| @field_names[i] }
|
200
|
-
@parent_fields = @parent_field_indexes.map{ |i| @field_names[i] }
|
201
|
-
@child_fields = @child_field_indexes.map{ |i| @field_names[i] }
|
202
|
-
|
203
|
-
@include_filter = convert_filter(options, :include, @field_names)
|
204
|
-
@exclude_filter = convert_filter(options, :exclude, @field_names)
|
205
|
-
end
|
206
|
-
|
207
|
-
|
208
|
-
# Converts an array of field names to an array of indexes of the fields
|
209
|
-
# matching those names.
|
210
|
-
def find_field_indexes(key_fields, field_names)
|
211
|
-
key_fields.map do |field|
|
212
|
-
if field.is_a?(Integer)
|
213
|
-
field
|
214
|
-
else
|
215
|
-
field_names.index{ |field_name| field.to_s.downcase == field_name.downcase } or
|
216
|
-
raise ArgumentError, "Could not locate field '#{field}' in source field names: #{
|
217
|
-
field_names.join(', ')}"
|
218
|
-
end
|
219
|
-
end
|
220
|
-
end
|
221
|
-
|
222
|
-
|
223
|
-
def convert_filter(options, key, field_names)
|
224
|
-
return unless hsh = options[key]
|
225
|
-
if !hsh.is_a?(Hash)
|
226
|
-
raise ArgumentError, ":#{key} option must be a Hash of field name(s)/index(es) to RegExp(s)"
|
227
|
-
end
|
228
|
-
keys = hsh.keys
|
229
|
-
idxs = find_field_indexes(keys, @field_names)
|
230
|
-
Hash[keys.each_with_index.map{ |k, i| [idxs[i], hsh[k]] }]
|
231
|
-
end
|
232
|
-
|
233
|
-
|
234
|
-
def check_filter(filter, field_val)
|
235
|
-
case filter
|
236
|
-
when String
|
237
|
-
if @case_sensitive
|
238
|
-
filter == field_val
|
239
|
-
else
|
240
|
-
filter.downcase == field_val.to_s.downcase
|
241
|
-
end
|
242
|
-
when Regexp
|
243
|
-
filter.match(field_val)
|
244
|
-
when Proc
|
245
|
-
filter.call(field_val)
|
63
|
+
elsif source.is_a?(Enumerable) && source.size == 0 || (source.size > 0 && source.first.is_a?(Enumerable))
|
64
|
+
@data = source
|
246
65
|
else
|
247
|
-
raise ArgumentError, "
|
66
|
+
raise ArgumentError, "source must be a path to a file or an Enumerable<Enumerable>"
|
248
67
|
end
|
68
|
+
index_source
|
249
69
|
end
|
250
70
|
|
251
71
|
end
|
@@ -0,0 +1,275 @@
|
|
1
|
+
class CSVDiff
|
2
|
+
|
3
|
+
# Represents an input (i.e. the left/from or right/to input) to the diff
|
4
|
+
# process.
|
5
|
+
class Source
|
6
|
+
|
7
|
+
# @return [String] the path to the source file
|
8
|
+
attr_accessor :path
|
9
|
+
# @return [Array<Array>] The data for this source
|
10
|
+
attr_reader :data
|
11
|
+
|
12
|
+
# @return [Array<String>] The names of the fields in the source file
|
13
|
+
attr_reader :field_names
|
14
|
+
# @return [Array<String>] The names of the field(s) that uniquely
|
15
|
+
# identify each row.
|
16
|
+
attr_reader :key_fields
|
17
|
+
# @return [Array<String>] The names of the field(s) that identify a
|
18
|
+
# common parent of child records.
|
19
|
+
attr_reader :parent_fields
|
20
|
+
# @return [Array<String>] The names of the field(s) that distinguish a
|
21
|
+
# child of a parent record.
|
22
|
+
attr_reader :child_fields
|
23
|
+
|
24
|
+
# @return [Array<Fixnum>] The indexes of the key fields in the source
|
25
|
+
# file.
|
26
|
+
attr_reader :key_field_indexes
|
27
|
+
# @return [Array<Fixnum>] The indexes of the parent fields in the source
|
28
|
+
# file.
|
29
|
+
attr_reader :parent_field_indexes
|
30
|
+
# @return [Array<Fixnum>] The indexes of the child fields in the source
|
31
|
+
# file.
|
32
|
+
attr_reader :child_field_indexes
|
33
|
+
|
34
|
+
# @return [Boolean] True if the source has been indexed with case-
|
35
|
+
# sensitive keys, or false if it has been indexed using upper-case key
|
36
|
+
# values.
|
37
|
+
attr_reader :case_sensitive
|
38
|
+
alias_method :case_sensitive?, :case_sensitive
|
39
|
+
# @return [Boolean] True if leading/trailing whitespace should be stripped
|
40
|
+
# from fields
|
41
|
+
attr_reader :trim_whitespace
|
42
|
+
# @return [Hash<String,Hash>] A hash containing each line of the source,
|
43
|
+
# keyed on the values of the +key_fields+.
|
44
|
+
attr_reader :lines
|
45
|
+
# @return [Hash<String,Array<String>>] A hash containing each parent key,
|
46
|
+
# and an Array of the child keys it is a parent of.
|
47
|
+
attr_reader :index
|
48
|
+
# @return [Array<String>] An array of any warnings encountered while
|
49
|
+
# processing the source.
|
50
|
+
attr_reader :warnings
|
51
|
+
# @return [Fixnum] A count of the lines processed from this source.
|
52
|
+
# Excludes any header and duplicate records identified during indexing.
|
53
|
+
attr_reader :line_count
|
54
|
+
# @return [Fixnum] A count of the lines from this source that were skipped
|
55
|
+
# due to filter conditions.
|
56
|
+
attr_reader :skip_count
|
57
|
+
# @return [Fixnum] A count of the lines from this source that had the same
|
58
|
+
# key value as another line.
|
59
|
+
attr_reader :dup_count
|
60
|
+
|
61
|
+
|
62
|
+
# Creates a new diff source.
|
63
|
+
#
|
64
|
+
# A diff source must contain at least one field that will be used as the
|
65
|
+
# key to identify the same record in a different version of this file.
|
66
|
+
# If not specified via one of the options, the first field is assumed to
|
67
|
+
# be the unique key.
|
68
|
+
#
|
69
|
+
# If multiple fields combine to form a unique key, the parent is assumed
|
70
|
+
# to be identified by all but the last field of the unique key. If finer
|
71
|
+
# control is required, use a combination of the :parent_fields and
|
72
|
+
# :child_fields options.
|
73
|
+
#
|
74
|
+
# All key options can be specified either by field name, or by field
|
75
|
+
# index (0 based).
|
76
|
+
#
|
77
|
+
# @param options [Hash] An options hash.
|
78
|
+
# @option options [Array<String>] :field_names The names of each of the
|
79
|
+
# fields in +source+.
|
80
|
+
# @option options [Boolean] :ignore_header If true, and :field_names has
|
81
|
+
# been specified, then the first row of the file is ignored.
|
82
|
+
# @option options [String] :key_field The name of the field that uniquely
|
83
|
+
# identifies each row.
|
84
|
+
# @option options [Array<String>] :key_fields The names of the fields
|
85
|
+
# that uniquely identify each row.
|
86
|
+
# @option options [String] :parent_field The name of the field(s) that
|
87
|
+
# identify a parent within which sibling order should be checked.
|
88
|
+
# @option options [String] :child_field The name of the field(s) that
|
89
|
+
# uniquely identify a child of a parent.
|
90
|
+
# @option options [Boolean] :case_sensitive If true (the default), keys
|
91
|
+
# are indexed as-is; if false, the index is built in upper-case for
|
92
|
+
# case-insensitive comparisons.
|
93
|
+
# @option options [Hash] :include A hash of field name(s) or index(es) to
|
94
|
+
# regular expression(s). Only source rows whose field values satisfy the
|
95
|
+
# regular expressions will be indexed and included in the diff process.
|
96
|
+
# @option options [Hash] :exclude A hash of field name(s) or index(es) to
|
97
|
+
# regular expression(s). Source rows with a field value that satisfies
|
98
|
+
# the regular expressions will be excluded from the diff process.
|
99
|
+
def initialize(options = {})
|
100
|
+
if (options.keys & [:parent_field, :parent_fields, :child_field, :child_fields]).empty? &&
|
101
|
+
(kf = options.fetch(:key_field, options[:key_fields]))
|
102
|
+
@key_fields = [kf].flatten
|
103
|
+
@parent_fields = @key_fields[0...-1]
|
104
|
+
@child_fields = @key_fields[-1..-1]
|
105
|
+
else
|
106
|
+
@parent_fields = [options.fetch(:parent_field, options[:parent_fields]) || []].flatten
|
107
|
+
@child_fields = [options.fetch(:child_field, options[:child_fields]) || [0]].flatten
|
108
|
+
@key_fields = @parent_fields + @child_fields
|
109
|
+
end
|
110
|
+
@field_names = options[:field_names]
|
111
|
+
@case_sensitive = options.fetch(:case_sensitive, true)
|
112
|
+
@trim_whitespace = options.fetch(:trim_whitespace, false)
|
113
|
+
@ignore_header = options[:ignore_header]
|
114
|
+
@include = options[:include]
|
115
|
+
@exclued = options[:exclude]
|
116
|
+
@path = options.fetch(:path, 'NA') unless @path
|
117
|
+
@warnings = []
|
118
|
+
end
|
119
|
+
|
120
|
+
|
121
|
+
def path?
|
122
|
+
@path != 'NA'
|
123
|
+
end
|
124
|
+
|
125
|
+
|
126
|
+
# Returns the row in the CSV source corresponding to the supplied key.
|
127
|
+
#
|
128
|
+
# @param key [String] The unique key to use to lookup the row.
|
129
|
+
# @return [Hash] The fields for the line corresponding to +key+, or nil
|
130
|
+
# if the key is not recognised.
|
131
|
+
def [](key)
|
132
|
+
@lines[key]
|
133
|
+
end
|
134
|
+
|
135
|
+
|
136
|
+
# Given an array of lines, where each line is an array of fields, indexes
|
137
|
+
# the array contents so that it can be looked up by key.
|
138
|
+
def index_source
|
139
|
+
@lines = {}
|
140
|
+
@index = Hash.new{ |h, k| h[k] = [] }
|
141
|
+
if @field_names
|
142
|
+
index_fields
|
143
|
+
include_filter = convert_filter(@include, @field_names)
|
144
|
+
exclude_filter = convert_filter(@exclude, @field_names)
|
145
|
+
end
|
146
|
+
@line_count = 0
|
147
|
+
@skip_count = 0
|
148
|
+
@dup_count = 0
|
149
|
+
line_num = 0
|
150
|
+
@data.each do |row|
|
151
|
+
line_num += 1
|
152
|
+
next if line_num == 1 && @field_names && @ignore_header
|
153
|
+
unless @field_names
|
154
|
+
if row.class.name == 'CSV::Row'
|
155
|
+
@field_names = row.headers.each_with_index.map{ |f, i| f || i.to_s }
|
156
|
+
else
|
157
|
+
@field_names = row.each_with_index.map{ |f, i| f || i.to_s }
|
158
|
+
end
|
159
|
+
index_fields
|
160
|
+
include_filter = convert_filter(@include, @field_names)
|
161
|
+
exclude_filter = convert_filter(@exclude, @field_names)
|
162
|
+
next
|
163
|
+
end
|
164
|
+
field_vals = row
|
165
|
+
line = {}
|
166
|
+
filter = false
|
167
|
+
@field_names.each_with_index do |field, i|
|
168
|
+
val = field_vals[i]
|
169
|
+
val = val.to_s.strip if val && @trim_whitespace
|
170
|
+
line[field] = val
|
171
|
+
if include_filter && f = include_filter[i]
|
172
|
+
filter = !check_filter(f, line[field])
|
173
|
+
end
|
174
|
+
if exclude_filter && f = exclude_filter[i]
|
175
|
+
filter = check_filter(f, line[field])
|
176
|
+
end
|
177
|
+
break if filter
|
178
|
+
end
|
179
|
+
if filter
|
180
|
+
@skip_count += 1
|
181
|
+
next
|
182
|
+
end
|
183
|
+
key_values = @key_field_indexes.map{ |kf| @case_sensitive ?
|
184
|
+
field_vals[kf].to_s :
|
185
|
+
field_vals[kf].to_s.upcase }
|
186
|
+
key = key_values.join('~')
|
187
|
+
parent_key = key_values[0...(@parent_fields.length)].join('~')
|
188
|
+
if @lines[key]
|
189
|
+
@warnings << "Duplicate key '#{key}' encountered at line #{line_num}"
|
190
|
+
@dup_count += 1
|
191
|
+
key += "[#{@dup_count}]"
|
192
|
+
end
|
193
|
+
@index[parent_key] << key
|
194
|
+
@lines[key] = line
|
195
|
+
@line_count += 1
|
196
|
+
end
|
197
|
+
end
|
198
|
+
|
199
|
+
|
200
|
+
# Save the data in this Source as a CSV at +file_path+.
|
201
|
+
#
|
202
|
+
# @param file_path [String] The target path to save the data to.
|
203
|
+
# @param options [Hash] A set of options to pass to CSV.open to control
|
204
|
+
# how the CSV is generated.
|
205
|
+
def save_csv(file_path, options = {})
|
206
|
+
require 'csv'
|
207
|
+
default_opts = {
|
208
|
+
headers: @field_name, write_headers: true
|
209
|
+
}
|
210
|
+
CSV.open(file_path, 'wb', default_opts.merge(options)) do |csv|
|
211
|
+
@data.each{ |rec| csv << rec }
|
212
|
+
end
|
213
|
+
end
|
214
|
+
|
215
|
+
|
216
|
+
private
|
217
|
+
|
218
|
+
|
219
|
+
def index_fields
|
220
|
+
@key_field_indexes = find_field_indexes(@key_fields, @field_names)
|
221
|
+
@parent_field_indexes = find_field_indexes(@parent_fields, @field_names)
|
222
|
+
@child_field_indexes = find_field_indexes(@child_fields, @field_names)
|
223
|
+
@key_fields = @key_field_indexes.map{ |i| @field_names[i] }
|
224
|
+
@parent_fields = @parent_field_indexes.map{ |i| @field_names[i] }
|
225
|
+
@child_fields = @child_field_indexes.map{ |i| @field_names[i] }
|
226
|
+
end
|
227
|
+
|
228
|
+
|
229
|
+
# Converts an array of field names to an array of indexes of the fields
|
230
|
+
# matching those names.
|
231
|
+
def find_field_indexes(key_fields, field_names)
|
232
|
+
key_fields.map do |field|
|
233
|
+
if field.is_a?(Integer)
|
234
|
+
field
|
235
|
+
else
|
236
|
+
field_names.index{ |field_name| field.to_s.downcase == field_name.to_s.downcase } or
|
237
|
+
raise ArgumentError, "Could not locate field '#{field}' in source field names: #{
|
238
|
+
field_names.join(', ')}"
|
239
|
+
end
|
240
|
+
end
|
241
|
+
end
|
242
|
+
|
243
|
+
|
244
|
+
def convert_filter(hsh, field_names)
|
245
|
+
return unless hsh
|
246
|
+
if !hsh.is_a?(Hash)
|
247
|
+
raise ArgumentError, ":include/:exclude option must be a Hash of field name(s)/index(es) to RegExp(s)"
|
248
|
+
end
|
249
|
+
keys = hsh.keys
|
250
|
+
idxs = find_field_indexes(keys, @field_names)
|
251
|
+
Hash[keys.each_with_index.map{ |k, i| [idxs[i], hsh[k]] }]
|
252
|
+
end
|
253
|
+
|
254
|
+
|
255
|
+
def check_filter(filter, field_val)
|
256
|
+
case filter
|
257
|
+
when String
|
258
|
+
if @case_sensitive
|
259
|
+
filter == field_val
|
260
|
+
else
|
261
|
+
filter.downcase == field_val.to_s.downcase
|
262
|
+
end
|
263
|
+
when Regexp
|
264
|
+
filter.match(field_val)
|
265
|
+
when Proc
|
266
|
+
filter.call(field_val)
|
267
|
+
else
|
268
|
+
raise ArgumentError, "Unsupported filter expression: #{filter.inspect}"
|
269
|
+
end
|
270
|
+
end
|
271
|
+
|
272
|
+
end
|
273
|
+
|
274
|
+
end
|
275
|
+
|
@@ -0,0 +1,142 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'cgi'
|
3
|
+
|
4
|
+
|
5
|
+
class CSVDiff
|
6
|
+
|
7
|
+
# Convert XML content to CSV format using XPath selectors to identify the
|
8
|
+
# rows and field values in an XML document
|
9
|
+
class XMLSource < Source
|
10
|
+
|
11
|
+
attr_accessor :context
|
12
|
+
|
13
|
+
# Create a new XMLSource, identified by +path+. Normally this is a path
|
14
|
+
# to the XML document, but any value is fine, as it is really just a label
|
15
|
+
# to identify this data set.
|
16
|
+
#
|
17
|
+
# @param path [String] A label for this data set (often a path to the
|
18
|
+
# XML document used as the source).
|
19
|
+
# @param options [Hash] An options hash.
|
20
|
+
# @option options [Array<String>] :field_names The names of each of the
|
21
|
+
# fields in +source+.
|
22
|
+
# @option options [Boolean] :ignore_header If true, and :field_names has
|
23
|
+
# been specified, then the first row of the file is ignored.
|
24
|
+
# @option options [String] :key_field The name of the field that uniquely
|
25
|
+
# identifies each row.
|
26
|
+
# @option options [Array<String>] :key_fields The names of the fields
|
27
|
+
# that uniquely identify each row.
|
28
|
+
# @option options [String] :parent_field The name of the field(s) that
|
29
|
+
# identify a parent within which sibling order should be checked.
|
30
|
+
# @option options [String] :child_field The name of the field(s) that
|
31
|
+
# uniquely identify a child of a parent.
|
32
|
+
# @option options [Boolean] :case_sensitive If true (the default), keys
|
33
|
+
# are indexed as-is; if false, the index is built in upper-case for
|
34
|
+
# case-insensitive comparisons.
|
35
|
+
# @option options [Hash] :include A hash of field name(s) or index(es) to
|
36
|
+
# regular expression(s). Only source rows whose field values satisfy the
|
37
|
+
# regular expressions will be indexed and included in the diff process.
|
38
|
+
# @option options [Hash] :exclude A hash of field name(s) or index(es) to
|
39
|
+
# regular expression(s). Source rows with a field value that satisfies
|
40
|
+
# the regular expressions will be excluded from the diff process.
|
41
|
+
# @option options [String] :context A context value from which fields
|
42
|
+
# can be populated using a Regexp.
|
43
|
+
def initialize(path, options = {})
|
44
|
+
super(options)
|
45
|
+
@path = path
|
46
|
+
@context = options[:context]
|
47
|
+
@data = []
|
48
|
+
end
|
49
|
+
|
50
|
+
|
51
|
+
# Process a +source+, converting the XML into a table of data, using
|
52
|
+
# +rec_xpath+ to identify the nodes that correspond to each record that
|
53
|
+
# should appear in the output, and +field_maps+ to populate each field
|
54
|
+
# in each row.
|
55
|
+
#
|
56
|
+
# @param source [String|Array] may be a String containing XML content,
|
57
|
+
# an Array of paths to files containing XML content, or a path to
|
58
|
+
# a single file.
|
59
|
+
# @param rec_xpath [String] An XPath expression that selects all the
|
60
|
+
# items in the XML document that are to be converted into new rows.
|
61
|
+
# The returned items are not directly used to populate the fields,
|
62
|
+
# but provide a context for the field XPath expressions that populate
|
63
|
+
# each field's content.
|
64
|
+
# @param field_maps [Hash<String, String>] A map of field names to
|
65
|
+
# expressions that are evaluated in the context of each row node
|
66
|
+
# selected by +rec_xpath+. The field expressions are typically XPath
|
67
|
+
# expressions evaluated in the context of the nodes returned by the
|
68
|
+
# +rec_xpath+. Alternatively, a String that is not an XPath expression
|
69
|
+
# is used as a literal value for a field, while a Regexp can also
|
70
|
+
# be used to pull a value from any context specified in the +options+
|
71
|
+
# hash. The Regexp should include a single grouping, as the value used
|
72
|
+
# will be the result in $1 after the match is performed.
|
73
|
+
# @param context [String] An optional context for the XML to be processed.
|
74
|
+
# The value passed here can be referenced in field map expressions
|
75
|
+
# using a Regexp, with the value of the first grouping in the regex
|
76
|
+
# being the value returned for the field.
|
77
|
+
def process(source, rec_xpath, field_maps, context = nil)
|
78
|
+
@field_names = field_maps.keys unless @field_names
|
79
|
+
case source
|
80
|
+
when Nokogiri::XML::Document
|
81
|
+
add_data(source, rec_xpath, field_maps, context || @context)
|
82
|
+
when /<\?xml/
|
83
|
+
doc = Nokogiri::XML(source)
|
84
|
+
add_data(doc, rec_xpath, field_maps, context || @context)
|
85
|
+
when Array
|
86
|
+
source.each{ |f| process_file(f, rec_xpath, field_maps) }
|
87
|
+
when String
|
88
|
+
process_file(source, rec_xpath, field_maps)
|
89
|
+
else
|
90
|
+
raise ArgumentError, "Unhandled source type #{source.class.name}"
|
91
|
+
end
|
92
|
+
@data
|
93
|
+
end
|
94
|
+
|
95
|
+
|
96
|
+
private
|
97
|
+
|
98
|
+
|
99
|
+
# Load the XML document at +file_path+ and process it into rows of data.
|
100
|
+
def process_file(file_path, rec_xpath, field_maps)
|
101
|
+
begin
|
102
|
+
File.open(file_path) do |f|
|
103
|
+
doc = Nokogiri::XML(f)
|
104
|
+
add_data(doc, rec_xpath, field_maps, @context || file_path)
|
105
|
+
end
|
106
|
+
rescue
|
107
|
+
STDERR.puts "An error occurred while attempting to open #{file_path}"
|
108
|
+
raise
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
|
113
|
+
# Locate records in +doc+ using +rec_xpath+ to identify the nodes that
|
114
|
+
# correspond to a new record in the data, and +field_maps+ to populate
|
115
|
+
# the fields in each row.
|
116
|
+
def add_data(doc, rec_xpath, field_maps, context)
|
117
|
+
doc.xpath(rec_xpath).each do |rec_node|
|
118
|
+
rec = []
|
119
|
+
field_maps.each do |field_name, expr|
|
120
|
+
case expr
|
121
|
+
when Regexp # Match context against Regexp and extract first grouping
|
122
|
+
if context
|
123
|
+
context =~ expr
|
124
|
+
rec << $1
|
125
|
+
else
|
126
|
+
rec << nil
|
127
|
+
end
|
128
|
+
when %r{[/(.@]} # XPath expression
|
129
|
+
res = rec_node.xpath(expr)
|
130
|
+
rec << CGI.unescape_html(res.to_s)
|
131
|
+
else # Use expr as the value for this field
|
132
|
+
rec << expr
|
133
|
+
end
|
134
|
+
end
|
135
|
+
@data << rec
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
end
|
140
|
+
|
141
|
+
end
|
142
|
+
|
data/lib/csv_diff.rb
CHANGED
File without changes
|
metadata
CHANGED
@@ -1,17 +1,17 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csv-diff
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adam Gardiner
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-07-15 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: |2
|
14
|
-
This library performs diffs of CSV
|
14
|
+
This library performs diffs of CSV data, or any table-like source.
|
15
15
|
|
16
16
|
Unlike a standard diff that compares line by line, and is sensitive to the
|
17
17
|
ordering of records, CSV-Diff identifies common lines by key field(s), and
|
@@ -29,7 +29,9 @@ description: |2
|
|
29
29
|
sibling order.
|
30
30
|
|
31
31
|
This gem implements the core diff algorithm, and handles the loading and
|
32
|
-
diffing of CSV files (or Arrays of Arrays). It
|
32
|
+
diffing of CSV files (or Arrays of Arrays). It also supports converting
|
33
|
+
data in XML format into tabular form, so that it can then be processed
|
34
|
+
like any other CSV or table-like source. It returns a CSVDiff object
|
33
35
|
containing the details of differences in object form. This is useful for
|
34
36
|
projects that need diff capability, but want to handle the reporting or
|
35
37
|
actioning of differences themselves.
|
@@ -48,6 +50,8 @@ files:
|
|
48
50
|
- lib/csv-diff/algorithm.rb
|
49
51
|
- lib/csv-diff/csv_diff.rb
|
50
52
|
- lib/csv-diff/csv_source.rb
|
53
|
+
- lib/csv-diff/source.rb
|
54
|
+
- lib/csv-diff/xml_source.rb
|
51
55
|
- lib/csv_diff.rb
|
52
56
|
homepage: https://github.com/agardiner/csv-diff
|
53
57
|
licenses:
|
@@ -69,8 +73,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
69
73
|
version: '0'
|
70
74
|
requirements: []
|
71
75
|
rubyforge_project:
|
72
|
-
rubygems_version: 2.5.2
|
76
|
+
rubygems_version: 2.5.2.3
|
73
77
|
signing_key:
|
74
78
|
specification_version: 4
|
75
|
-
summary: CSV Diff is a library for generating diffs from data in CSV format
|
79
|
+
summary: CSV Diff is a library for generating diffs from data in CSV or XML format
|
76
80
|
test_files: []
|