csv-diff 0.3.5 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/LICENSE +0 -0
- data/README.md +0 -0
- data/lib/csv-diff.rb +1 -0
- data/lib/csv-diff/algorithm.rb +5 -2
- data/lib/csv-diff/csv_diff.rb +5 -3
- data/lib/csv-diff/csv_source.rb +10 -190
- data/lib/csv-diff/source.rb +275 -0
- data/lib/csv-diff/xml_source.rb +142 -0
- data/lib/csv_diff.rb +0 -0
- metadata +10 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d1b3b8deee34344d334e740285cb1f3c99074694
|
4
|
+
data.tar.gz: d95158d13861cb66fd460ee430714ec3c83cd0b1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 50c74d6a4093012b0ba44fef70c2d749348d6777cfb9f2cfda66c6e075423191a4c6c22019a388b9d8bd14e22ac60d539f4e3b4aa85fd87fd774a64da15858c7
|
7
|
+
data.tar.gz: 8fa030a54e7a97db9913b3c36a1942de1e07a6549f9ae7aa58b5b3f44d522fe11f72d44e18b6b7612d2b2dc9f106ece1fea183557c507bcf18316891ab63f230
|
data/LICENSE
CHANGED
File without changes
|
data/README.md
CHANGED
File without changes
|
data/lib/csv-diff.rb
CHANGED
data/lib/csv-diff/algorithm.rb
CHANGED
@@ -36,8 +36,6 @@ class CSVDiff
|
|
36
36
|
# For backwards compatibility and access to fields with differences
|
37
37
|
def [](key)
|
38
38
|
case key
|
39
|
-
when String
|
40
|
-
@fields[key]
|
41
39
|
when :action
|
42
40
|
a = diff_type.to_s
|
43
41
|
a[0] = a[0].upcase
|
@@ -46,6 +44,8 @@ class CSVDiff
|
|
46
44
|
@row
|
47
45
|
when :sibling_position
|
48
46
|
@sibling_position
|
47
|
+
else
|
48
|
+
@fields[key]
|
49
49
|
end
|
50
50
|
end
|
51
51
|
|
@@ -78,6 +78,9 @@ class CSVDiff
|
|
78
78
|
unless left.case_sensitive? == right.case_sensitive?
|
79
79
|
raise ArgumentError, "Left and right must have same settings for case-sensitivity"
|
80
80
|
end
|
81
|
+
unless left.parent_fields.length == right.parent_fields.length
|
82
|
+
raise ArgumentError, "Left and right must have same settings for parent/child fields"
|
83
|
+
end
|
81
84
|
|
82
85
|
# Ensure key fields are not also in the diff_fields
|
83
86
|
diff_fields = diff_fields - key_fields
|
data/lib/csv-diff/csv_diff.rb
CHANGED
@@ -81,9 +81,11 @@ class CSVDiff
|
|
81
81
|
# @option options [Boolean] :ignore_deletes If true, records that appear
|
82
82
|
# in the left/from file but not in the right/to file are not reported.
|
83
83
|
def initialize(left, right, options = {})
|
84
|
-
@left = left.is_a?(
|
84
|
+
@left = left.is_a?(Source) ? left : CSVSource.new(left, options)
|
85
|
+
@left.index_source if @left.lines.nil?
|
85
86
|
raise "No field names found in left (from) source" unless @left.field_names && @left.field_names.size > 0
|
86
|
-
@right = right.is_a?(
|
87
|
+
@right = right.is_a?(Source) ? right : CSVSource.new(right, options)
|
88
|
+
@right.index_source if @right.lines.nil?
|
87
89
|
raise "No field names found in right (to) source" unless @right.field_names && @right.field_names.size > 0
|
88
90
|
@warnings = []
|
89
91
|
@diff_fields = get_diff_fields(@left.field_names, @right.field_names, options)
|
@@ -141,7 +143,7 @@ class CSVDiff
|
|
141
143
|
ignore_fields = options.fetch(:ignore_fields, [])
|
142
144
|
ignore_fields = [ignore_fields] unless ignore_fields.is_a?(Array)
|
143
145
|
ignore_fields.map! do |f|
|
144
|
-
(f.is_a?(
|
146
|
+
(f.is_a?(Numeric) ? right_fields[f] : f).upcase
|
145
147
|
end
|
146
148
|
diff_fields = []
|
147
149
|
if options[:diff_common_fields_only]
|
data/lib/csv-diff/csv_source.rb
CHANGED
@@ -2,57 +2,7 @@ class CSVDiff
|
|
2
2
|
|
3
3
|
# Represents a CSV input (i.e. the left/from or right/to input) to the diff
|
4
4
|
# process.
|
5
|
-
class CSVSource
|
6
|
-
|
7
|
-
# @return [String] the path to the source file
|
8
|
-
attr_accessor :path
|
9
|
-
|
10
|
-
# @return [Array<String>] The names of the fields in the source file
|
11
|
-
attr_reader :field_names
|
12
|
-
# @return [Array<String>] The names of the field(s) that uniquely
|
13
|
-
# identify each row.
|
14
|
-
attr_reader :key_fields
|
15
|
-
# @return [Array<String>] The names of the field(s) that identify a
|
16
|
-
# common parent of child records.
|
17
|
-
attr_reader :parent_fields
|
18
|
-
# @return [Array<String>] The names of the field(s) that distinguish a
|
19
|
-
# child of a parent record.
|
20
|
-
attr_reader :child_fields
|
21
|
-
|
22
|
-
# @return [Array<Fixnum>] The indexes of the key fields in the source
|
23
|
-
# file.
|
24
|
-
attr_reader :key_field_indexes
|
25
|
-
# @return [Array<Fixnum>] The indexes of the parent fields in the source
|
26
|
-
# file.
|
27
|
-
attr_reader :parent_field_indexes
|
28
|
-
# @return [Array<Fixnum>] The indexes of the child fields in the source
|
29
|
-
# file.
|
30
|
-
attr_reader :child_field_indexes
|
31
|
-
|
32
|
-
# @return [Boolean] True if the source has been indexed with case-
|
33
|
-
# sensitive keys, or false if it has been indexed using upper-case key
|
34
|
-
# values.
|
35
|
-
attr_reader :case_sensitive
|
36
|
-
alias_method :case_sensitive?, :case_sensitive
|
37
|
-
# @return [Boolean] True if leading/trailing whitespace should be stripped
|
38
|
-
# from fields
|
39
|
-
attr_reader :trim_whitespace
|
40
|
-
# @return [Hash<String,Hash>] A hash containing each line of the source,
|
41
|
-
# keyed on the values of the +key_fields+.
|
42
|
-
attr_reader :lines
|
43
|
-
# @return [Hash<String,Array<String>>] A hash containing each parent key,
|
44
|
-
# and an Array of the child keys it is a parent of.
|
45
|
-
attr_reader :index
|
46
|
-
# @return [Array<String>] An array of any warnings encountered while
|
47
|
-
# processing the source.
|
48
|
-
attr_reader :warnings
|
49
|
-
# @return [Fixnum] A count of the lines processed from this source.
|
50
|
-
# Excludes any header and duplicate records identified during indexing.
|
51
|
-
attr_reader :line_count
|
52
|
-
# @return [Fixnum] A count of the lines from this source that were skipped,
|
53
|
-
# due either to duplicate keys or filter conditions.
|
54
|
-
attr_reader :skip_count
|
55
|
-
|
5
|
+
class CSVSource < Source
|
56
6
|
|
57
7
|
# Creates a new diff source.
|
58
8
|
#
|
@@ -99,153 +49,23 @@ class CSVDiff
|
|
99
49
|
# regular expression(s). Source rows with a field value that satisfies
|
100
50
|
# the regular expressions will be excluded from the diff process.
|
101
51
|
def initialize(source, options = {})
|
52
|
+
super(options)
|
102
53
|
if source.is_a?(String)
|
103
54
|
require 'csv'
|
104
55
|
mode_string = options[:encoding] ? "r:#{options[:encoding]}" : 'r'
|
105
56
|
csv_options = options.fetch(:csv_options, {})
|
106
57
|
@path = source
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
end
|
112
|
-
if (options.keys & [:parent_field, :parent_fields, :child_field, :child_fields]).empty? &&
|
113
|
-
(kf = options.fetch(:key_field, options[:key_fields]))
|
114
|
-
@key_fields = [kf].flatten
|
115
|
-
@parent_fields = @key_fields[0...-1]
|
116
|
-
@child_fields = @key_fields[-1..-1]
|
117
|
-
else
|
118
|
-
@parent_fields = [options.fetch(:parent_field, options[:parent_fields]) || []].flatten
|
119
|
-
@child_fields = [options.fetch(:child_field, options[:child_fields]) || [0]].flatten
|
120
|
-
@key_fields = @parent_fields + @child_fields
|
121
|
-
end
|
122
|
-
@field_names = options[:field_names]
|
123
|
-
@warnings = []
|
124
|
-
index_source(source, options)
|
125
|
-
end
|
126
|
-
|
127
|
-
|
128
|
-
# Returns the row in the CSV source corresponding to the supplied key.
|
129
|
-
#
|
130
|
-
# @param key [String] The unique key to use to lookup the row.
|
131
|
-
# @return [Hash] The fields for the line corresponding to +key+, or nil
|
132
|
-
# if the key is not recognised.
|
133
|
-
def [](key)
|
134
|
-
@lines[key]
|
135
|
-
end
|
136
|
-
|
137
|
-
|
138
|
-
private
|
139
|
-
|
140
|
-
# Given an array of lines, where each line is an array of fields, indexes
|
141
|
-
# the array contents so that it can be looked up by key.
|
142
|
-
def index_source(lines, options)
|
143
|
-
@lines = {}
|
144
|
-
@index = Hash.new{ |h, k| h[k] = [] }
|
145
|
-
if @field_names
|
146
|
-
index_fields(options)
|
147
|
-
end
|
148
|
-
@case_sensitive = options.fetch(:case_sensitive, true)
|
149
|
-
@trim_whitespace = options.fetch(:trim_whitespace, false)
|
150
|
-
@line_count = 0
|
151
|
-
@skip_count = 0
|
152
|
-
line_num = 0
|
153
|
-
lines.each do |row|
|
154
|
-
line_num += 1
|
155
|
-
next if line_num == 1 && @field_names && options[:ignore_header]
|
156
|
-
unless @field_names
|
157
|
-
@field_names = row.each_with_index.map{ |f, i| f || i.to_s }
|
158
|
-
index_fields(options)
|
159
|
-
next
|
160
|
-
end
|
161
|
-
field_vals = row
|
162
|
-
line = {}
|
163
|
-
filter = false
|
164
|
-
@field_names.each_with_index do |field, i|
|
165
|
-
line[field] = field_vals[i]
|
166
|
-
line[field].strip! if @trim_whitespace && line[field]
|
167
|
-
if @include_filter && f = @include_filter[i]
|
168
|
-
filter = !check_filter(f, line[field])
|
169
|
-
end
|
170
|
-
if @exclude_filter && f = @exclude_filter[i]
|
171
|
-
filter = check_filter(f, line[field])
|
172
|
-
end
|
173
|
-
break if filter
|
174
|
-
end
|
175
|
-
if filter
|
176
|
-
@skip_count += 1
|
177
|
-
next
|
58
|
+
# When you call CSV.open, it's best to pass in a block so that after it's yielded,
|
59
|
+
# the underlying file handle is closed. Otherwise, you risk leaking the handle.
|
60
|
+
@data = CSV.open(@path, mode_string, csv_options) do |csv|
|
61
|
+
csv.readlines
|
178
62
|
end
|
179
|
-
|
180
|
-
|
181
|
-
key = key_values.join('~')
|
182
|
-
parent_key = key_values[0...(@parent_fields.length)].join('~')
|
183
|
-
if @lines[key]
|
184
|
-
@warnings << "Duplicate key '#{key}' encountered and ignored at line #{line_num}"
|
185
|
-
@skip_count += 1
|
186
|
-
else
|
187
|
-
@index[parent_key] << key
|
188
|
-
@lines[key] = line
|
189
|
-
@line_count += 1
|
190
|
-
end
|
191
|
-
end
|
192
|
-
end
|
193
|
-
|
194
|
-
|
195
|
-
def index_fields(options)
|
196
|
-
@key_field_indexes = find_field_indexes(@key_fields, @field_names)
|
197
|
-
@parent_field_indexes = find_field_indexes(@parent_fields, @field_names)
|
198
|
-
@child_field_indexes = find_field_indexes(@child_fields, @field_names)
|
199
|
-
@key_fields = @key_field_indexes.map{ |i| @field_names[i] }
|
200
|
-
@parent_fields = @parent_field_indexes.map{ |i| @field_names[i] }
|
201
|
-
@child_fields = @child_field_indexes.map{ |i| @field_names[i] }
|
202
|
-
|
203
|
-
@include_filter = convert_filter(options, :include, @field_names)
|
204
|
-
@exclude_filter = convert_filter(options, :exclude, @field_names)
|
205
|
-
end
|
206
|
-
|
207
|
-
|
208
|
-
# Converts an array of field names to an array of indexes of the fields
|
209
|
-
# matching those names.
|
210
|
-
def find_field_indexes(key_fields, field_names)
|
211
|
-
key_fields.map do |field|
|
212
|
-
if field.is_a?(Integer)
|
213
|
-
field
|
214
|
-
else
|
215
|
-
field_names.index{ |field_name| field.to_s.downcase == field_name.downcase } or
|
216
|
-
raise ArgumentError, "Could not locate field '#{field}' in source field names: #{
|
217
|
-
field_names.join(', ')}"
|
218
|
-
end
|
219
|
-
end
|
220
|
-
end
|
221
|
-
|
222
|
-
|
223
|
-
def convert_filter(options, key, field_names)
|
224
|
-
return unless hsh = options[key]
|
225
|
-
if !hsh.is_a?(Hash)
|
226
|
-
raise ArgumentError, ":#{key} option must be a Hash of field name(s)/index(es) to RegExp(s)"
|
227
|
-
end
|
228
|
-
keys = hsh.keys
|
229
|
-
idxs = find_field_indexes(keys, @field_names)
|
230
|
-
Hash[keys.each_with_index.map{ |k, i| [idxs[i], hsh[k]] }]
|
231
|
-
end
|
232
|
-
|
233
|
-
|
234
|
-
def check_filter(filter, field_val)
|
235
|
-
case filter
|
236
|
-
when String
|
237
|
-
if @case_sensitive
|
238
|
-
filter == field_val
|
239
|
-
else
|
240
|
-
filter.downcase == field_val.to_s.downcase
|
241
|
-
end
|
242
|
-
when Regexp
|
243
|
-
filter.match(field_val)
|
244
|
-
when Proc
|
245
|
-
filter.call(field_val)
|
63
|
+
elsif source.is_a?(Enumerable) && source.size == 0 || (source.size > 0 && source.first.is_a?(Enumerable))
|
64
|
+
@data = source
|
246
65
|
else
|
247
|
-
raise ArgumentError, "
|
66
|
+
raise ArgumentError, "source must be a path to a file or an Enumerable<Enumerable>"
|
248
67
|
end
|
68
|
+
index_source
|
249
69
|
end
|
250
70
|
|
251
71
|
end
|
@@ -0,0 +1,275 @@
|
|
1
|
+
class CSVDiff

  # Represents an input (i.e. the left/from or right/to input) to the diff
  # process.
  #
  # This base class holds the key/parent/child field configuration and the
  # indexing logic. #index_source iterates over +@data+, so +@data+ must have
  # been populated with the rows of the source before it is called.
  class Source

    # @return [String] the path to the source file
    attr_accessor :path
    # @return [Array<Array>] The data for this source
    attr_reader :data

    # @return [Array<String>] The names of the fields in the source file
    attr_reader :field_names
    # @return [Array<String>] The names of the field(s) that uniquely
    #   identify each row.
    attr_reader :key_fields
    # @return [Array<String>] The names of the field(s) that identify a
    #   common parent of child records.
    attr_reader :parent_fields
    # @return [Array<String>] The names of the field(s) that distinguish a
    #   child of a parent record.
    attr_reader :child_fields

    # @return [Array<Integer>] The indexes of the key fields in the source
    #   file.
    attr_reader :key_field_indexes
    # @return [Array<Integer>] The indexes of the parent fields in the source
    #   file.
    attr_reader :parent_field_indexes
    # @return [Array<Integer>] The indexes of the child fields in the source
    #   file.
    attr_reader :child_field_indexes

    # @return [Boolean] True if the source has been indexed with case-
    #   sensitive keys, or false if it has been indexed using upper-case key
    #   values.
    attr_reader :case_sensitive
    alias_method :case_sensitive?, :case_sensitive
    # @return [Boolean] True if leading/trailing whitespace should be stripped
    #   from fields
    attr_reader :trim_whitespace
    # @return [Hash<String,Hash>] A hash containing each line of the source,
    #   keyed on the values of the +key_fields+.
    attr_reader :lines
    # @return [Hash<String,Array<String>>] A hash containing each parent key,
    #   and an Array of the child keys it is a parent of.
    attr_reader :index
    # @return [Array<String>] An array of any warnings encountered while
    #   processing the source.
    attr_reader :warnings
    # @return [Integer] A count of the lines indexed from this source.
    #   Excludes any header row and any rows skipped by filter conditions.
    attr_reader :line_count
    # @return [Integer] A count of the lines from this source that were skipped
    #   due to filter conditions.
    attr_reader :skip_count
    # @return [Integer] A count of the lines from this source that had the same
    #   key value as another line.
    attr_reader :dup_count


    # Creates a new diff source.
    #
    # A diff source must contain at least one field that will be used as the
    # key to identify the same record in a different version of this file.
    # If not specified via one of the options, the first field is assumed to
    # be the unique key.
    #
    # If multiple fields combine to form a unique key, the parent is assumed
    # to be identified by all but the last field of the unique key. If finer
    # control is required, use a combination of the :parent_fields and
    # :child_fields options.
    #
    # All key options can be specified either by field name, or by field
    # index (0 based).
    #
    # @param options [Hash] An options hash.
    # @option options [Array<String>] :field_names The names of each of the
    #   fields in +source+.
    # @option options [Boolean] :ignore_header If true, and :field_names has
    #   been specified, then the first row of the file is ignored.
    # @option options [String] :key_field The name of the field that uniquely
    #   identifies each row.
    # @option options [Array<String>] :key_fields The names of the fields
    #   that uniquely identifies each row.
    # @option options [String] :parent_field The name of the field(s) that
    #   identify a parent within which sibling order should be checked.
    # @option options [String] :child_field The name of the field(s) that
    #   uniquely identify a child of a parent.
    # @option options [Boolean] :case_sensitive If true (the default), keys
    #   are indexed as-is; if false, the index is built in upper-case for
    #   case-insensitive comparisons.
    # @option options [Boolean] :trim_whitespace If true, leading/trailing
    #   whitespace is stripped from field values (default false).
    # @option options [Hash] :include A hash of field name(s) or index(es) to
    #   regular expression(s). Only source rows whose field values satisfy the
    #   regular expressions will be indexed and included in the diff process.
    # @option options [Hash] :exclude A hash of field name(s) or index(es) to
    #   regular expression(s). Source rows with a field value that satisfies
    #   the regular expressions will be excluded from the diff process.
    # @option options [String] :path A label for this source; defaults to
    #   'NA' when no path has been set.
    def initialize(options = {})
      if (options.keys & [:parent_field, :parent_fields, :child_field, :child_fields]).empty? &&
         (kf = options.fetch(:key_field, options[:key_fields]))
        # Only key field(s) given: all but the last identify the parent
        @key_fields = [kf].flatten
        @parent_fields = @key_fields[0...-1]
        @child_fields = @key_fields[-1..-1]
      else
        @parent_fields = [options.fetch(:parent_field, options[:parent_fields]) || []].flatten
        @child_fields = [options.fetch(:child_field, options[:child_fields]) || [0]].flatten
        @key_fields = @parent_fields + @child_fields
      end
      @field_names = options[:field_names]
      @case_sensitive = options.fetch(:case_sensitive, true)
      @trim_whitespace = options.fetch(:trim_whitespace, false)
      @ignore_header = options[:ignore_header]
      @include = options[:include]
      @exclude = options[:exclude]
      # Set by #index_source when the field names were read from the first
      # row of @data, i.e. the data itself starts with a header row.
      @header_in_data = false
      @path ||= options.fetch(:path, 'NA')
      @warnings = []
    end


    # @return [Boolean] True if a path has been set for this source, i.e. it
    #   is something other than the 'NA' placeholder.
    def path?
      @path != 'NA'
    end


    # Returns the row in the source corresponding to the supplied key.
    #
    # @param key [String] The unique key to use to lookup the row.
    # @return [Hash] The fields for the line corresponding to +key+, or nil
    #   if the key is not recognised.
    def [](key)
      @lines[key]
    end


    # Indexes the rows in +@data+ (each row being an array of field values)
    # so that they can be looked up by key.
    #
    # Populates +lines+ (key => Hash of field name => value) and +index+
    # (parent key => Array of keys), and resets the line, skip and duplicate
    # counters. If no field names have been set, they are taken from the
    # first row. Rows with a key that has already been seen are retained
    # under a key with a "[n]" suffix, and a warning is recorded.
    def index_source
      @lines = {}
      @index = Hash.new { |h, k| h[k] = [] }
      if @field_names
        index_fields
        include_filter = convert_filter(@include, @field_names)
        exclude_filter = convert_filter(@exclude, @field_names)
      end
      @line_count = 0
      @skip_count = 0
      @dup_count = 0
      line_num = 0
      @data.each do |row|
        line_num += 1
        next if line_num == 1 && @field_names && @ignore_header
        unless @field_names
          if row.class.name == 'CSV::Row'
            # NOTE(review): the headers are taken from the CSV::Row and the
            # row itself is then skipped - confirm that sources yielding
            # CSV::Row objects always start with a header row.
            @field_names = row.headers.each_with_index.map { |f, i| f || i.to_s }
          else
            @field_names = row.each_with_index.map { |f, i| f || i.to_s }
            @header_in_data = true
          end
          index_fields
          include_filter = convert_filter(@include, @field_names)
          exclude_filter = convert_filter(@exclude, @field_names)
          next
        end
        field_vals = row
        line = {}
        filter = false
        @field_names.each_with_index do |field, i|
          val = field_vals[i]
          val = val.to_s.strip if val && @trim_whitespace
          line[field] = val
          if include_filter && (f = include_filter[i])
            filter = !check_filter(f, line[field])
          end
          if exclude_filter && (f = exclude_filter[i])
            filter = check_filter(f, line[field])
          end
          break if filter
        end
        if filter
          @skip_count += 1
          next
        end
        key_values = @key_field_indexes.map do |kf|
          kv = field_vals[kf].to_s
          # Keys must be trimmed the same way as the stored field values,
          # otherwise 'A ' and 'A' are treated as different records.
          kv = kv.strip if @trim_whitespace
          @case_sensitive ? kv : kv.upcase
        end
        key = key_values.join('~')
        parent_key = key_values[0...(@parent_fields.length)].join('~')
        if @lines[key]
          @warnings << "Duplicate key '#{key}' encountered at line #{line_num}"
          @dup_count += 1
          # Keep the duplicate, but under a distinct key
          key += "[#{@dup_count}]"
        end
        @index[parent_key] << key
        @lines[key] = line
        @line_count += 1
      end
    end


    # Save the data in this Source as a CSV at +file_path+.
    #
    # A header row of the field names is written first, unless the data
    # already starts with a header row (the field names were read from the
    # first row, or :ignore_header was set).
    #
    # @param file_path [String] The target path to save the data to.
    # @param options [Hash] A set of options to pass to CSV.open to control
    #   how the CSV is generated. These override the default :headers and
    #   :write_headers settings.
    def save_csv(file_path, options = {})
      require 'csv'
      header_in_data = @ignore_header || @header_in_data
      default_opts = {
        headers: @field_names, write_headers: !header_in_data
      }
      # Passed as keyword arguments, as required by CSV.open on Ruby 3
      CSV.open(file_path, 'wb', **default_opts.merge(options)) do |csv|
        @data.each { |rec| csv << rec }
      end
    end


    private


    # Resolves the key, parent and child field settings (names or indexes)
    # to indexes into +@field_names+, and then normalises the settings to the
    # corresponding field names.
    def index_fields
      @key_field_indexes = find_field_indexes(@key_fields, @field_names)
      @parent_field_indexes = find_field_indexes(@parent_fields, @field_names)
      @child_field_indexes = find_field_indexes(@child_fields, @field_names)
      @key_fields = @key_field_indexes.map { |i| @field_names[i] }
      @parent_fields = @parent_field_indexes.map { |i| @field_names[i] }
      @child_fields = @child_field_indexes.map { |i| @field_names[i] }
    end


    # Converts an array of field names to an array of indexes of the fields
    # matching those names. Integer entries are taken to be indexes already;
    # names are matched case-insensitively.
    #
    # @raise [ArgumentError] if a field name cannot be found in +field_names+.
    def find_field_indexes(key_fields, field_names)
      key_fields.map do |field|
        if field.is_a?(Integer)
          field
        else
          field_names.index { |field_name| field.to_s.downcase == field_name.to_s.downcase } ||
            raise(ArgumentError, "Could not locate field '#{field}' in source field names: #{
              field_names.join(', ')}")
        end
      end
    end


    # Converts an :include or :exclude filter Hash keyed by field name or
    # index into a Hash keyed by field index.
    #
    # @return [Hash, nil] The converted filter, or nil if +hsh+ is nil.
    # @raise [ArgumentError] if +hsh+ is not a Hash.
    def convert_filter(hsh, field_names)
      return unless hsh
      if !hsh.is_a?(Hash)
        raise ArgumentError, ":include/:exclude option must be a Hash of field name(s)/index(es) to RegExp(s)"
      end
      keys = hsh.keys
      idxs = find_field_indexes(keys, field_names)
      Hash[keys.each_with_index.map { |k, i| [idxs[i], hsh[k]] }]
    end


    # Tests +field_val+ against +filter+, which may be a String (compared
    # for equality, honouring the case-sensitivity setting), a Regexp, or a
    # Proc that is called with the value.
    #
    # @raise [ArgumentError] if +filter+ is of any other type.
    def check_filter(filter, field_val)
      case filter
      when String
        if @case_sensitive
          filter == field_val
        else
          filter.downcase == field_val.to_s.downcase
        end
      when Regexp
        filter.match(field_val)
      when Proc
        filter.call(field_val)
      else
        raise ArgumentError, "Unsupported filter expression: #{filter.inspect}"
      end
    end

  end

end
|
275
|
+
|
@@ -0,0 +1,142 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'cgi'
|
3
|
+
|
4
|
+
|
5
|
+
class CSVDiff

  # Convert XML content to CSV format using XPath selectors to identify the
  # rows and field values in an XML document
  class XMLSource < Source

    # @return [String] The default context value against which Regexp field
    #   expressions are matched.
    attr_accessor :context

    # Create a new XMLSource, identified by +path+. Normally this is a path
    # to the XML document, but any value is fine, as it is really just a label
    # to identify this data set.
    #
    # @param path [String] A label for this data set (often a path to the
    #   XML document used as the source).
    # @param options [Hash] An options hash.
    # @option options [Array<String>] :field_names The names of each of the
    #   fields in +source+.
    # @option options [Boolean] :ignore_header If true, and :field_names has
    #   been specified, then the first row of the file is ignored.
    # @option options [String] :key_field The name of the field that uniquely
    #   identifies each row.
    # @option options [Array<String>] :key_fields The names of the fields
    #   that uniquely identifies each row.
    # @option options [String] :parent_field The name of the field(s) that
    #   identify a parent within which sibling order should be checked.
    # @option options [String] :child_field The name of the field(s) that
    #   uniquely identify a child of a parent.
    # @option options [Boolean] :case_sensitive If true (the default), keys
    #   are indexed as-is; if false, the index is built in upper-case for
    #   case-insensitive comparisons.
    # @option options [Hash] :include A hash of field name(s) or index(es) to
    #   regular expression(s). Only source rows whose field values satisfy the
    #   regular expressions will be indexed and included in the diff process.
    # @option options [Hash] :exclude A hash of field name(s) or index(es) to
    #   regular expression(s). Source rows with a field value that satisfies
    #   the regular expressions will be excluded from the diff process.
    # @option options [String] :context A context value from which fields
    #   can be populated using a Regexp.
    def initialize(path, options = {})
      super(options)
      @path = path
      @context = options[:context]
      @data = []
    end


    # Process a +source+, converting the XML into a table of data, using
    # +rec_xpath+ to identify the nodes that correspond to each record that
    # should appear in the output, and +field_maps+ to populate each field
    # in each row. Rows are appended to the data already held, so this
    # method may be called several times to combine sources.
    #
    # @param source [String|Array] may be a String containing XML content,
    #   an Array of paths to files containing XML content, or a path to
    #   a single file. A Nokogiri::XML::Document is also accepted.
    # @param rec_xpath [String] An XPath expression that selects all the
    #   items in the XML document that are to be converted into new rows.
    #   The returned items are not directly used to populate the fields,
    #   but provide a context for the field XPath expressions that populate
    #   each field's content.
    # @param field_maps [Hash<String, String>] A map of field names to
    #   expressions that are evaluated in the context of each row node
    #   selected by +rec_xpath+. The field expressions are typically XPath
    #   expressions evaluated in the context of the nodes returned by the
    #   +rec_xpath+. Alternatively, a String that is not an XPath expression
    #   is used as a literal value for a field, while a Regexp can also
    #   be used to pull a value from any context specified in the +options+
    #   hash. The Regexp should include a single grouping, as the value used
    #   will be the result in $1 after the match is performed.
    # @param context [String] An optional context for the XML to be processed.
    #   The value passed here can be referenced in field map expressions
    #   using a Regexp, with the value of the first grouping in the regex
    #   being the value returned for the field. When omitted, the :context
    #   option given to the constructor is used; for file sources the file
    #   path is the final fallback.
    # @return [Array<Array>] The accumulated rows of data.
    # @raise [ArgumentError] if +source+ is not one of the supported types.
    def process(source, rec_xpath, field_maps, context = nil)
      @field_names = field_maps.keys unless @field_names
      case source
      when Nokogiri::XML::Document
        add_data(source, rec_xpath, field_maps, context || @context)
      when /<\?xml/
        # A String holding XML content rather than a path
        doc = Nokogiri::XML(source)
        add_data(doc, rec_xpath, field_maps, context || @context)
      when Array
        source.each { |f| process_file(f, rec_xpath, field_maps, context) }
      when String
        process_file(source, rec_xpath, field_maps, context)
      else
        raise ArgumentError, "Unhandled source type #{source.class.name}"
      end
      @data
    end


    private


    # Load the XML document at +file_path+ and process it into rows of data.
    # Any error is reported on STDERR and then re-raised.
    def process_file(file_path, rec_xpath, field_maps, context = nil)
      File.open(file_path) do |f|
        doc = Nokogiri::XML(f)
        add_data(doc, rec_xpath, field_maps, context || @context || file_path)
      end
    rescue
      STDERR.puts "An error occurred while attempting to open #{file_path}"
      raise
    end


    # Locate records in +doc+ using +rec_xpath+ to identify the nodes that
    # correspond to a new record in the data, and +field_maps+ to populate
    # the fields in each row.
    def add_data(doc, rec_xpath, field_maps, context)
      doc.xpath(rec_xpath).each do |rec_node|
        rec = field_maps.map do |_field_name, expr|
          case expr
          when Regexp # Match context against Regexp and extract first grouping
            if context
              context =~ expr
              Regexp.last_match(1)
            end
          when %r{[/(.@]} # XPath expression
            res = rec_node.xpath(expr)
            CGI.unescape_html(res.to_s)
          else # Use expr as the value for this field
            expr
          end
        end
        @data << rec
      end
    end

  end

end
|
142
|
+
|
data/lib/csv_diff.rb
CHANGED
File without changes
|
metadata
CHANGED
@@ -1,17 +1,17 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csv-diff
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adam Gardiner
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-07-15 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: |2
|
14
|
-
This library performs diffs of CSV
|
14
|
+
This library performs diffs of CSV data, or any table-like source.
|
15
15
|
|
16
16
|
Unlike a standard diff that compares line by line, and is sensitive to the
|
17
17
|
ordering of records, CSV-Diff identifies common lines by key field(s), and
|
@@ -29,7 +29,9 @@ description: |2
|
|
29
29
|
sibling order.
|
30
30
|
|
31
31
|
This gem implements the core diff algorithm, and handles the loading and
|
32
|
-
diffing of CSV files (or Arrays of Arrays). It
|
32
|
+
diffing of CSV files (or Arrays of Arrays). It also supports converting
|
33
|
+
data in XML format into tabular form, so that it can then be processed
|
34
|
+
like any other CSV or table-like source. It returns a CSVDiff object
|
33
35
|
containing the details of differences in object form. This is useful for
|
34
36
|
projects that need diff capability, but want to handle the reporting or
|
35
37
|
actioning of differences themselves.
|
@@ -48,6 +50,8 @@ files:
|
|
48
50
|
- lib/csv-diff/algorithm.rb
|
49
51
|
- lib/csv-diff/csv_diff.rb
|
50
52
|
- lib/csv-diff/csv_source.rb
|
53
|
+
- lib/csv-diff/source.rb
|
54
|
+
- lib/csv-diff/xml_source.rb
|
51
55
|
- lib/csv_diff.rb
|
52
56
|
homepage: https://github.com/agardiner/csv-diff
|
53
57
|
licenses:
|
@@ -69,8 +73,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
69
73
|
version: '0'
|
70
74
|
requirements: []
|
71
75
|
rubyforge_project:
|
72
|
-
rubygems_version: 2.5.2
|
76
|
+
rubygems_version: 2.5.2.3
|
73
77
|
signing_key:
|
74
78
|
specification_version: 4
|
75
|
-
summary: CSV Diff is a library for generating diffs from data in CSV format
|
79
|
+
summary: CSV Diff is a library for generating diffs from data in CSV or XML format
|
76
80
|
test_files: []
|