csv-diff 0.2 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -13
- data/LICENSE +1 -1
- data/README.md +18 -1
- data/lib/csv-diff.rb +1 -0
- data/lib/csv-diff/algorithm.rb +145 -41
- data/lib/csv-diff/csv_diff.rb +24 -13
- data/lib/csv-diff/csv_source.rb +24 -101
- data/lib/csv-diff/source.rb +275 -0
- data/lib/csv-diff/xml_source.rb +142 -0
- metadata +39 -19
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
data.tar.gz: !binary |-
|
6
|
-
NzAyNTY1OTk3MTA3Y2ZhNjk2YWRmNTJkYTljNGZhZDY2YjQ1OTg2ZQ==
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: d1b3b8deee34344d334e740285cb1f3c99074694
|
4
|
+
data.tar.gz: d95158d13861cb66fd460ee430714ec3c83cd0b1
|
7
5
|
SHA512:
|
8
|
-
metadata.gz:
|
9
|
-
|
10
|
-
YmVkNDVhMjk1M2EyNjFkZGIxOGE0Y2MwOWQwMWRhNzhjZDk2N2RhZmEyZGRm
|
11
|
-
MjliNmM4Y2ZmNzY4ZTJkY2EzZWY4Mjg3NmU3ZjQxM2RkYTBjODE=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
YTljOGZhNDY0YzdjZGYzMTA2NTM3MzIwNDg4MTcwYWEyM2IyZTc1YWYxMjFm
|
14
|
-
YjMwYWU1NWMzNGVkZGRkYWYyZjUwMTQ2MWZlMjdkNjQwMjIwYWUwNmNlYjM3
|
15
|
-
NDlmZDk5MGNlMTk4ZDhlMzFiOGUyZTIwY2EyZTY3MjUwYjc2NWY=
|
6
|
+
metadata.gz: 50c74d6a4093012b0ba44fef70c2d749348d6777cfb9f2cfda66c6e075423191a4c6c22019a388b9d8bd14e22ac60d539f4e3b4aa85fd87fd774a64da15858c7
|
7
|
+
data.tar.gz: 8fa030a54e7a97db9913b3c36a1942de1e07a6549f9ae7aa58b5b3f44d522fe11f72d44e18b6b7612d2b2dc9f106ece1fea183557c507bcf18316891ab63f230
|
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -125,7 +125,7 @@ column in the data. In this case, a diff can be created simply via:
|
|
125
125
|
diff = CSVDiff.new(file1, file2)
|
126
126
|
```
|
127
127
|
|
128
|
-
###
|
128
|
+
### Specifying Unique Row Identifiers
|
129
129
|
|
130
130
|
Often however, rows are not uniquely identifiable via the first column in the file.
|
131
131
|
In a parent-child hierarchy, for example, combinations of parent and child may be
|
@@ -211,6 +211,23 @@ diff = CSVDiff.new(file1, file2, parent_field: 'Date', child_fields: ['HomeTeam'
|
|
211
211
|
ignore_fields: ['CreatedAt', 'UpdatedAt'])
|
212
212
|
```
|
213
213
|
|
214
|
+
### Filtering Rows
|
215
|
+
|
216
|
+
If you need to filter source data before running the diff process, you can use the :include
|
217
|
+
and :exclude options to do so. Both options take a Hash as their value; the hash should have
|
218
|
+
keys that are the field names or indexes (0-based) on which to filter, and whose values are
|
219
|
+
regular expressions or lambdas to be applied to values of the corresponding field. Rows will
|
220
|
+
only be diffed if they satisfy :include conditions, and do not satisfy :exclude conditions.
|
221
|
+
```ruby
|
222
|
+
# Generate a diff of Arsenal home games not refereed by Clattenburg
|
223
|
+
diff = CSVDiff.new(file1, file2, parent_field: 'Date', child_fields: ['HomeTeam', 'AwayTeam'],
|
224
|
+
include: {HomeTeam: 'Arsenal'}, exclude: {Referee: /Clattenburg/})
|
225
|
+
|
226
|
+
# Generate a diff of games played over the Xmas/New Year period
|
227
|
+
diff = CSVDiff.new(file1, file2, parent_field: 'Date', child_fields: ['HomeTeam', 'AwayTeam'],
|
228
|
+
include: {Date: lambda{ |d| holiday_period.include?(Date.strptime(d, '%y/%m/%d')) } })
|
229
|
+
```
|
230
|
+
|
214
231
|
### Ignoring Certain Changes
|
215
232
|
|
216
233
|
CSVDiff identifies Adds, Updates, Moves and Deletes; any of these changes can be selectively
|
data/lib/csv-diff.rb
CHANGED
data/lib/csv-diff/algorithm.rb
CHANGED
@@ -3,6 +3,55 @@ class CSVDiff
|
|
3
3
|
# Implements the CSV diff algorithm.
|
4
4
|
module Algorithm
|
5
5
|
|
6
|
+
# Holds the details of a single difference
|
7
|
+
class Diff
|
8
|
+
|
9
|
+
attr_accessor :diff_type
|
10
|
+
attr_reader :fields
|
11
|
+
attr_reader :row
|
12
|
+
attr_reader :sibling_position
|
13
|
+
|
14
|
+
def initialize(diff_type, fields, row_idx, pos_idx)
|
15
|
+
@diff_type = diff_type
|
16
|
+
@fields = fields
|
17
|
+
@row = row_idx + 1
|
18
|
+
self.sibling_position = pos_idx
|
19
|
+
end
|
20
|
+
|
21
|
+
|
22
|
+
def sibling_position=(pos_idx)
|
23
|
+
if pos_idx.is_a?(Array)
|
24
|
+
pos_idx.compact!
|
25
|
+
if pos_idx.first != pos_idx.last
|
26
|
+
@sibling_position = pos_idx.map{ |pos| pos + 1 }
|
27
|
+
else
|
28
|
+
@sibling_position = pos_idx.first + 1
|
29
|
+
end
|
30
|
+
else
|
31
|
+
@sibling_position = pos_idx + 1
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
|
36
|
+
# For backwards compatibility and access to fields with differences
|
37
|
+
def [](key)
|
38
|
+
case key
|
39
|
+
when :action
|
40
|
+
a = diff_type.to_s
|
41
|
+
a[0] = a[0].upcase
|
42
|
+
a
|
43
|
+
when :row
|
44
|
+
@row
|
45
|
+
when :sibling_position
|
46
|
+
@sibling_position
|
47
|
+
else
|
48
|
+
@fields[key]
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
53
|
+
|
54
|
+
|
6
55
|
# Diffs two CSVSource structures.
|
7
56
|
#
|
8
57
|
# @param left [CSVSource] A CSVSource object containing the contents of
|
@@ -13,24 +62,70 @@ class CSVDiff
|
|
13
62
|
# that uniquely identify each row.
|
14
63
|
# @param diff_fields [Array] An array containing the names of the fields
|
15
64
|
# to be diff-ed.
|
65
|
+
# @param options [Hash] An options hash.
|
66
|
+
# @option options [Boolean] :ignore_adds If set to true, we ignore any
|
67
|
+
# new items that appear only in +right+.
|
68
|
+
# @option options [Boolean] :ignore_moves If set to true, we ignore any
|
69
|
+
# changes in sibling order.
|
70
|
+
# @option options [Boolean] :ignore_updates If set to true, we ignore any
|
71
|
+
# changes to items that exist in both +left+ and +right+.
|
72
|
+
# @option options [Boolean] :ignore_deletes If set to true, we ignore any
|
73
|
+
# items that appear only in +left+.
|
74
|
+
# @option options [Hash<Object,Proc>] :equality_procs A Hash mapping fields
|
75
|
+
# to a 2-arg Proc that should be used to compare values in that field for
|
76
|
+
# equality.
|
16
77
|
def diff_sources(left, right, key_fields, diff_fields, options = {})
|
78
|
+
unless left.case_sensitive? == right.case_sensitive?
|
79
|
+
raise ArgumentError, "Left and right must have same settings for case-sensitivity"
|
80
|
+
end
|
81
|
+
unless left.parent_fields.length == right.parent_fields.length
|
82
|
+
raise ArgumentError, "Left and right must have same settings for parent/child fields"
|
83
|
+
end
|
84
|
+
|
85
|
+
# Ensure key fields are not also in the diff_fields
|
86
|
+
diff_fields = diff_fields - key_fields
|
87
|
+
|
17
88
|
left_index = left.index
|
18
89
|
left_values = left.lines
|
19
90
|
left_keys = left_values.keys
|
20
91
|
right_index = right.index
|
21
92
|
right_values = right.lines
|
22
93
|
right_keys = right_values.keys
|
23
|
-
|
94
|
+
parent_field_count = left.parent_fields.length
|
24
95
|
|
25
96
|
include_adds = !options[:ignore_adds]
|
26
97
|
include_moves = !options[:ignore_moves]
|
27
98
|
include_updates = !options[:ignore_updates]
|
28
99
|
include_deletes = !options[:ignore_deletes]
|
29
100
|
|
30
|
-
|
101
|
+
@case_sensitive = left.case_sensitive?
|
102
|
+
@equality_procs = options.fetch(:equality_procs, {})
|
103
|
+
|
104
|
+
diffs = {}
|
105
|
+
potential_moves = Hash.new{ |h, k| h[k] = [] }
|
106
|
+
|
107
|
+
# First identify deletions
|
108
|
+
if include_deletes
|
109
|
+
(left_keys - right_keys).each do |key|
|
110
|
+
# Delete
|
111
|
+
key_vals = key.split('~', -1)
|
112
|
+
parent = key_vals[0...parent_field_count].join('~')
|
113
|
+
child = key_vals[parent_field_count..-1].join('~')
|
114
|
+
left_parent = left_index[parent]
|
115
|
+
left_value = left_values[key]
|
116
|
+
row_idx = left_keys.index(key)
|
117
|
+
sib_idx = left_parent.index(key)
|
118
|
+
raise "Can't locate key #{key} in parent #{parent}" unless sib_idx
|
119
|
+
diffs[key] = Diff.new(:delete, left_value, row_idx, sib_idx)
|
120
|
+
potential_moves[child] << key
|
121
|
+
#puts "Delete: #{key}"
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
# Now identify adds/updates
|
31
126
|
right_keys.each_with_index do |key, right_row_id|
|
32
|
-
key_vals = key.split('~')
|
33
|
-
parent = key_vals[0...
|
127
|
+
key_vals = key.split('~', -1)
|
128
|
+
parent = key_vals[0...parent_field_count].join('~')
|
34
129
|
left_parent = left_index[parent]
|
35
130
|
right_parent = right_index[parent]
|
36
131
|
left_value = left_values[key]
|
@@ -38,13 +133,12 @@ class CSVDiff
|
|
38
133
|
left_idx = left_parent && left_parent.index(key)
|
39
134
|
right_idx = right_parent && right_parent.index(key)
|
40
135
|
|
41
|
-
id = {}
|
42
|
-
id[:row] = right_row_id + 1
|
43
|
-
id[:sibling_position] = right_idx + 1
|
44
|
-
key_fields.each do |field_name|
|
45
|
-
id[field_name] = right_value[field_name]
|
46
|
-
end
|
47
136
|
if left_idx && right_idx
|
137
|
+
if include_updates && (changes = diff_row(left_value, right_value, diff_fields))
|
138
|
+
id = id_fields(key_fields, right_value)
|
139
|
+
diffs[key] = Diff.new(:update, id.merge!(changes), right_row_id, right_idx)
|
140
|
+
#puts "Change: #{key}"
|
141
|
+
end
|
48
142
|
if include_moves
|
49
143
|
left_common = left_parent & right_parent
|
50
144
|
right_common = right_parent & left_parent
|
@@ -52,42 +146,34 @@ class CSVDiff
|
|
52
146
|
right_pos = right_common.index(key)
|
53
147
|
if left_pos != right_pos
|
54
148
|
# Move
|
55
|
-
diffs[key]
|
56
|
-
|
149
|
+
if d = diffs[key]
|
150
|
+
d.sibling_position = [left_idx, right_idx]
|
151
|
+
else
|
152
|
+
id = id_fields(key_fields, right_value)
|
153
|
+
diffs[key] = Diff.new(:move, id, right_row_id, [left_idx, right_idx])
|
154
|
+
end
|
57
155
|
#puts "Move #{left_idx} -> #{right_idx}: #{key}"
|
58
156
|
end
|
59
157
|
end
|
60
|
-
|
61
|
-
diffs[key].merge!(id.merge(changes.merge(:action => 'Update')))
|
62
|
-
#puts "Change: #{key}"
|
63
|
-
end
|
64
|
-
elsif include_adds && right_idx
|
158
|
+
elsif right_idx
|
65
159
|
# Add
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
left_idx = left_parent.index(key)
|
80
|
-
next unless left_idx
|
81
|
-
id = {}
|
82
|
-
id[:row] = left_keys.index(key) + 1
|
83
|
-
id[:sibling_position] = left_idx + 1
|
84
|
-
key_fields.each do |field_name|
|
85
|
-
id[field_name] = left_value[field_name]
|
160
|
+
child = key_vals[parent_field_count..-1].join('~')
|
161
|
+
if potential_moves.has_key?(child) && old_key = potential_moves[child].pop
|
162
|
+
diffs.delete(old_key)
|
163
|
+
if include_updates
|
164
|
+
left_value = left_values[old_key]
|
165
|
+
id = id_fields(right.child_fields, right_value)
|
166
|
+
changes = diff_row(left_value, right_value, left.parent_fields + diff_fields)
|
167
|
+
diffs[key] = Diff.new(:update, id.merge!(changes), right_row_id, right_idx)
|
168
|
+
#puts "Update Parent: #{key}"
|
169
|
+
end
|
170
|
+
elsif include_adds
|
171
|
+
diffs[key] = Diff.new(:add, right_value, right_row_id, right_idx)
|
172
|
+
#puts "Add: #{key}"
|
86
173
|
end
|
87
|
-
diffs[key].merge!(id.merge(left_values[key].merge(:action => 'Delete')))
|
88
|
-
#puts "Delete: #{key}"
|
89
174
|
end
|
90
175
|
end
|
176
|
+
|
91
177
|
diffs
|
92
178
|
end
|
93
179
|
|
@@ -99,24 +185,42 @@ class CSVDiff
|
|
99
185
|
# file.
|
100
186
|
# @param right_row [Hash] The version of the CSV row from the right/to
|
101
187
|
# file.
|
188
|
+
# @param fields [Array<String>] An array of field names to compare.
|
102
189
|
# @return [Hash<String, Array>] A Hash whose keys are the fields that
|
103
190
|
# contain differences, and whose values are a two-element array of
|
104
191
|
# [left/from, right/to] values.
|
105
192
|
def diff_row(left_row, right_row, fields)
|
106
193
|
diffs = {}
|
107
194
|
fields.each do |attr|
|
195
|
+
eq_proc = @equality_procs[attr]
|
108
196
|
right_val = right_row[attr]
|
109
197
|
right_val = nil if right_val == ""
|
110
198
|
left_val = left_row[attr]
|
111
199
|
left_val = nil if left_val == ""
|
112
|
-
if
|
200
|
+
if eq_proc
|
201
|
+
diffs[attr] = [left_val, right_val] unless eq_proc.call(left_val, right_val)
|
202
|
+
elsif @case_sensitive
|
203
|
+
diffs[attr] = [left_val, right_val] unless left_val == right_val
|
204
|
+
elsif (left_val.to_s.upcase != right_val.to_s.upcase)
|
113
205
|
diffs[attr] = [left_val, right_val]
|
114
|
-
#puts "#{attr}: #{left_val} -> #{right_val}"
|
115
206
|
end
|
116
207
|
end
|
117
208
|
diffs if diffs.size > 0
|
118
209
|
end
|
119
210
|
|
211
|
+
|
212
|
+
private
|
213
|
+
|
214
|
+
|
215
|
+
# Return a hash containing just the key field values
|
216
|
+
def id_fields(key_fields, fields)
|
217
|
+
id = {}
|
218
|
+
key_fields.each do |field_name|
|
219
|
+
id[field_name] = fields[field_name]
|
220
|
+
end
|
221
|
+
id
|
222
|
+
end
|
223
|
+
|
120
224
|
end
|
121
225
|
|
122
226
|
end
|
data/lib/csv-diff/csv_diff.rb
CHANGED
@@ -28,13 +28,15 @@ class CSVDiff
|
|
28
28
|
# @return [Array<String>] An array of field names that are compared in the
|
29
29
|
# diff process.
|
30
30
|
attr_reader :diff_fields
|
31
|
-
# @return [Array<
|
32
|
-
#
|
31
|
+
# @return [Array<String>] An array of field names of the key fields that
|
32
|
+
# uniquely identify each row.
|
33
33
|
attr_reader :key_fields
|
34
34
|
# @return [Array<String>] An array of field names for the parent field(s).
|
35
35
|
attr_reader :parent_fields
|
36
36
|
# @return [Array<String>] An array of field names for the child field(s).
|
37
37
|
attr_reader :child_fields
|
38
|
+
# @return [Hash] The options hash used for the diff.
|
39
|
+
attr_reader :options
|
38
40
|
|
39
41
|
|
40
42
|
# Generates a diff between two hierarchical tree structures, provided
|
@@ -79,13 +81,15 @@ class CSVDiff
|
|
79
81
|
# @option options [Boolean] :ignore_deletes If true, records that appear
|
80
82
|
# in the left/from file but not in the right/to file are not reported.
|
81
83
|
def initialize(left, right, options = {})
|
82
|
-
@left = left.is_a?(
|
84
|
+
@left = left.is_a?(Source) ? left : CSVSource.new(left, options)
|
85
|
+
@left.index_source if @left.lines.nil?
|
83
86
|
raise "No field names found in left (from) source" unless @left.field_names && @left.field_names.size > 0
|
84
|
-
@right = right.is_a?(
|
87
|
+
@right = right.is_a?(Source) ? right : CSVSource.new(right, options)
|
88
|
+
@right.index_source if @right.lines.nil?
|
85
89
|
raise "No field names found in right (to) source" unless @right.field_names && @right.field_names.size > 0
|
86
90
|
@warnings = []
|
87
|
-
@diff_fields = get_diff_fields(@left.field_names, @right.field_names, options
|
88
|
-
@key_fields = @left.key_fields
|
91
|
+
@diff_fields = get_diff_fields(@left.field_names, @right.field_names, options)
|
92
|
+
@key_fields = @left.key_fields
|
89
93
|
diff(options)
|
90
94
|
end
|
91
95
|
|
@@ -93,6 +97,7 @@ class CSVDiff
|
|
93
97
|
# Performs a diff with the specified +options+.
|
94
98
|
def diff(options = {})
|
95
99
|
@summary = nil
|
100
|
+
@options = options
|
96
101
|
@diffs = diff_sources(@left, @right, @key_fields, @diff_fields, options)
|
97
102
|
end
|
98
103
|
|
@@ -134,15 +139,21 @@ class CSVDiff
|
|
134
139
|
|
135
140
|
# Given two sets of field names, determines the common set of fields present
|
136
141
|
# in both, on which members can be diffed.
|
137
|
-
def get_diff_fields(left_fields, right_fields,
|
142
|
+
def get_diff_fields(left_fields, right_fields, options)
|
143
|
+
ignore_fields = options.fetch(:ignore_fields, [])
|
144
|
+
ignore_fields = [ignore_fields] unless ignore_fields.is_a?(Array)
|
145
|
+
ignore_fields.map! do |f|
|
146
|
+
(f.is_a?(Numeric) ? right_fields[f] : f).upcase
|
147
|
+
end
|
138
148
|
diff_fields = []
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
@warnings << "Field '#{fld}' is missing from the left (from) file, and won't be diffed"
|
149
|
+
if options[:diff_common_fields_only]
|
150
|
+
right_fields.each_with_index do |fld, i|
|
151
|
+
if left_fields.include?(fld)
|
152
|
+
diff_fields << fld unless ignore_fields.include?(fld.upcase)
|
153
|
+
end
|
145
154
|
end
|
155
|
+
else
|
156
|
+
diff_fields = (right_fields + left_fields).uniq.reject{ |fld| ignore_fields.include?(fld.upcase) }
|
146
157
|
end
|
147
158
|
diff_fields
|
148
159
|
end
|
data/lib/csv-diff/csv_source.rb
CHANGED
@@ -2,31 +2,7 @@ class CSVDiff
|
|
2
2
|
|
3
3
|
# Represents a CSV input (i.e. the left/from or right/to input) to the diff
|
4
4
|
# process.
|
5
|
-
class CSVSource
|
6
|
-
|
7
|
-
# @return [String] the path to the source file
|
8
|
-
attr_accessor :path
|
9
|
-
# @return [Array<String>] The names of the fields in the source file
|
10
|
-
attr_reader :field_names
|
11
|
-
# @return [Array<String>] The names of the field(s) that uniquely
|
12
|
-
# identify each row.
|
13
|
-
attr_reader :key_fields
|
14
|
-
# @return [Array<String>] The names of the field(s) that identify a
|
15
|
-
# common parent of child records.
|
16
|
-
attr_reader :parent_fields
|
17
|
-
# @return [Array<String>] The names of the field(s) that distinguish a
|
18
|
-
# child of a parent record.
|
19
|
-
attr_reader :child_fields
|
20
|
-
# @return [Hash<String,Hash>] A hash containing each line of the source,
|
21
|
-
# keyed on the values of the +key_fields+.
|
22
|
-
attr_reader :lines
|
23
|
-
# @return [Hash<String,Array<String>>] A hash containing each parent key,
|
24
|
-
# and an Array of the child keys it is a parent of.
|
25
|
-
attr_reader :index
|
26
|
-
# @return [Array<String>] An array of any warnings encountered while
|
27
|
-
# processing the source.
|
28
|
-
attr_reader :warnings
|
29
|
-
|
5
|
+
class CSVSource < Source
|
30
6
|
|
31
7
|
# Creates a new diff source.
|
32
8
|
#
|
@@ -59,90 +35,37 @@ class CSVDiff
|
|
59
35
|
# identifies each row.
|
60
36
|
# @option options [Array<String>] :key_fields The names of the fields
|
61
37
|
# that uniquely identify each row.
|
62
|
-
# @option options [String] :parent_field The name of the field that
|
63
|
-
#
|
64
|
-
# @option options [String] :child_field The name of the field that
|
65
|
-
# uniquely
|
38
|
+
# @option options [String] :parent_field The name of the field(s) that
|
39
|
+
# identify a parent within which sibling order should be checked.
|
40
|
+
# @option options [String] :child_field The name of the field(s) that
|
41
|
+
# uniquely identify a child of a parent.
|
42
|
+
# @option options [Boolean] :case_sensitive If true (the default), keys
|
43
|
+
# are indexed as-is; if false, the index is built in upper-case for
|
44
|
+
# case-insensitive comparisons.
|
45
|
+
# @option options [Hash] :include A hash of field name(s) or index(es) to
|
46
|
+
# regular expression(s). Only source rows whose field values satisfy the
|
47
|
+
# regular expressions will be indexed and included in the diff process.
|
48
|
+
# @option options [Hash] :exclude A hash of field name(s) or index(es) to
|
49
|
+
# regular expression(s). Source rows with a field value that satisfies
|
50
|
+
# the regular expressions will be excluded from the diff process.
|
66
51
|
def initialize(source, options = {})
|
52
|
+
super(options)
|
67
53
|
if source.is_a?(String)
|
68
54
|
require 'csv'
|
69
55
|
mode_string = options[:encoding] ? "r:#{options[:encoding]}" : 'r'
|
70
56
|
csv_options = options.fetch(:csv_options, {})
|
71
57
|
@path = source
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
@parent_fields = @key_fields[0...-1]
|
77
|
-
@child_fields = @key_fields[-1..-1]
|
78
|
-
else
|
79
|
-
@parent_fields = [options.fetch(:parent_field, options[:parent_fields]) || []].flatten
|
80
|
-
@child_fields = [options.fetch(:child_field, options[:child_fields]) || [0]].flatten
|
81
|
-
@key_fields = @parent_fields + @child_fields
|
82
|
-
end
|
83
|
-
@field_names = options[:field_names]
|
84
|
-
@warnings = []
|
85
|
-
index_source(source, options)
|
86
|
-
end
|
87
|
-
|
88
|
-
|
89
|
-
# Returns the row in the CSV source corresponding to the supplied key.
|
90
|
-
#
|
91
|
-
# @param key [String] The unique key to use to lookup the row.
|
92
|
-
# @return [Hash] The fields for the line corresponding to +key+, or nil
|
93
|
-
# if the key is not recognised.
|
94
|
-
def [](key)
|
95
|
-
@lines[key]
|
96
|
-
end
|
97
|
-
|
98
|
-
|
99
|
-
private
|
100
|
-
|
101
|
-
# Given an array of lines, where each line is an array of fields, indexes
|
102
|
-
# the array contents so that it can be looked up by key.
|
103
|
-
def index_source(lines, options)
|
104
|
-
@lines = {}
|
105
|
-
@index = Hash.new{ |h, k| h[k] = [] }
|
106
|
-
@key_fields = find_field_indexes(@key_fields, @field_names) if @field_names
|
107
|
-
line_num = 0
|
108
|
-
lines.each do |row|
|
109
|
-
line_num += 1
|
110
|
-
next if line_num == 1 && @field_names && options[:ignore_header]
|
111
|
-
unless @field_names
|
112
|
-
@field_names = row
|
113
|
-
@key_fields = find_field_indexes(@key_fields, @field_names)
|
114
|
-
next
|
115
|
-
end
|
116
|
-
field_vals = row
|
117
|
-
line = {}
|
118
|
-
@field_names.each_with_index do |field, i|
|
119
|
-
line[field] = field_vals[i]
|
120
|
-
end
|
121
|
-
key_values = @key_fields.map{ |kf| field_vals[kf].to_s.upcase }
|
122
|
-
key = key_values.join('~')
|
123
|
-
parent_key = key_values[0...(@parent_fields.length)].join('~')
|
124
|
-
if @lines[key]
|
125
|
-
@warnings << "Duplicate key '#{key}' encountered and ignored at line #{line_num}"
|
126
|
-
else
|
127
|
-
@index[parent_key] << key
|
128
|
-
@lines[key] = line
|
129
|
-
end
|
130
|
-
end
|
131
|
-
end
|
132
|
-
|
133
|
-
|
134
|
-
# Converts an array of field names to an array of indexes of the fields
|
135
|
-
# matching those names.
|
136
|
-
def find_field_indexes(key_fields, field_names)
|
137
|
-
key_fields.map do |field|
|
138
|
-
if field.is_a?(Fixnum)
|
139
|
-
field
|
140
|
-
else
|
141
|
-
field_names.index{ |field_name| field.to_s.downcase == field_name.downcase } or
|
142
|
-
raise ArgumentError, "Could not locate field '#{field}' in source field names: #{
|
143
|
-
field_names.join(', ')}"
|
58
|
+
# When you call CSV.open, it's best to pass in a block so that after it's yielded,
|
59
|
+
# the underlying file handle is closed. Otherwise, you risk leaking the handle.
|
60
|
+
@data = CSV.open(@path, mode_string, csv_options) do |csv|
|
61
|
+
csv.readlines
|
144
62
|
end
|
63
|
+
elsif source.is_a?(Enumerable) && source.size == 0 || (source.size > 0 && source.first.is_a?(Enumerable))
|
64
|
+
@data = source
|
65
|
+
else
|
66
|
+
raise ArgumentError, "source must be a path to a file or an Enumerable<Enumerable>"
|
145
67
|
end
|
68
|
+
index_source
|
146
69
|
end
|
147
70
|
|
148
71
|
end
|
@@ -0,0 +1,275 @@
|
|
1
|
+
class CSVDiff
|
2
|
+
|
3
|
+
# Represents an input (i.e. the left/from or right/to input) to the diff
|
4
|
+
# process.
|
5
|
+
class Source
|
6
|
+
|
7
|
+
# @return [String] the path to the source file
|
8
|
+
attr_accessor :path
|
9
|
+
# @return [Array<Array>] The data for this source
|
10
|
+
attr_reader :data
|
11
|
+
|
12
|
+
# @return [Array<String>] The names of the fields in the source file
|
13
|
+
attr_reader :field_names
|
14
|
+
# @return [Array<String>] The names of the field(s) that uniquely
|
15
|
+
# identify each row.
|
16
|
+
attr_reader :key_fields
|
17
|
+
# @return [Array<String>] The names of the field(s) that identify a
|
18
|
+
# common parent of child records.
|
19
|
+
attr_reader :parent_fields
|
20
|
+
# @return [Array<String>] The names of the field(s) that distinguish a
|
21
|
+
# child of a parent record.
|
22
|
+
attr_reader :child_fields
|
23
|
+
|
24
|
+
# @return [Array<Fixnum>] The indexes of the key fields in the source
|
25
|
+
# file.
|
26
|
+
attr_reader :key_field_indexes
|
27
|
+
# @return [Array<Fixnum>] The indexes of the parent fields in the source
|
28
|
+
# file.
|
29
|
+
attr_reader :parent_field_indexes
|
30
|
+
# @return [Array<Fixnum>] The indexes of the child fields in the source
|
31
|
+
# file.
|
32
|
+
attr_reader :child_field_indexes
|
33
|
+
|
34
|
+
# @return [Boolean] True if the source has been indexed with case-
|
35
|
+
# sensitive keys, or false if it has been indexed using upper-case key
|
36
|
+
# values.
|
37
|
+
attr_reader :case_sensitive
|
38
|
+
alias_method :case_sensitive?, :case_sensitive
|
39
|
+
# @return [Boolean] True if leading/trailing whitespace should be stripped
|
40
|
+
# from fields
|
41
|
+
attr_reader :trim_whitespace
|
42
|
+
# @return [Hash<String,Hash>] A hash containing each line of the source,
|
43
|
+
# keyed on the values of the +key_fields+.
|
44
|
+
attr_reader :lines
|
45
|
+
# @return [Hash<String,Array<String>>] A hash containing each parent key,
|
46
|
+
# and an Array of the child keys it is a parent of.
|
47
|
+
attr_reader :index
|
48
|
+
# @return [Array<String>] An array of any warnings encountered while
|
49
|
+
# processing the source.
|
50
|
+
attr_reader :warnings
|
51
|
+
# @return [Fixnum] A count of the lines processed from this source.
|
52
|
+
# Excludes any header and any rows skipped due to filter conditions.
|
53
|
+
attr_reader :line_count
|
54
|
+
# @return [Fixnum] A count of the lines from this source that were skipped
|
55
|
+
# due to filter conditions.
|
56
|
+
attr_reader :skip_count
|
57
|
+
# @return [Fixnum] A count of the lines from this source that had the same
|
58
|
+
# key value as another line.
|
59
|
+
attr_reader :dup_count
|
60
|
+
|
61
|
+
|
62
|
+
# Creates a new diff source.
|
63
|
+
#
|
64
|
+
# A diff source must contain at least one field that will be used as the
|
65
|
+
# key to identify the same record in a different version of this file.
|
66
|
+
# If not specified via one of the options, the first field is assumed to
|
67
|
+
# be the unique key.
|
68
|
+
#
|
69
|
+
# If multiple fields combine to form a unique key, the parent is assumed
|
70
|
+
# to be identified by all but the last field of the unique key. If finer
|
71
|
+
# control is required, use a combination of the :parent_fields and
|
72
|
+
# :child_fields options.
|
73
|
+
#
|
74
|
+
# All key options can be specified either by field name, or by field
|
75
|
+
# index (0 based).
|
76
|
+
#
|
77
|
+
# @param options [Hash] An options hash.
|
78
|
+
# @option options [Array<String>] :field_names The names of each of the
|
79
|
+
# fields in +source+.
|
80
|
+
# @option options [Boolean] :ignore_header If true, and :field_names has
|
81
|
+
# been specified, then the first row of the file is ignored.
|
82
|
+
# @option options [String] :key_field The name of the field that uniquely
|
83
|
+
# identifies each row.
|
84
|
+
# @option options [Array<String>] :key_fields The names of the fields
|
85
|
+
# that uniquely identify each row.
|
86
|
+
# @option options [String] :parent_field The name of the field(s) that
|
87
|
+
# identify a parent within which sibling order should be checked.
|
88
|
+
# @option options [String] :child_field The name of the field(s) that
|
89
|
+
# uniquely identify a child of a parent.
|
90
|
+
# @option options [Boolean] :case_sensitive If true (the default), keys
|
91
|
+
# are indexed as-is; if false, the index is built in upper-case for
|
92
|
+
# case-insensitive comparisons.
|
93
|
+
# @option options [Hash] :include A hash of field name(s) or index(es) to
|
94
|
+
# regular expression(s). Only source rows whose field values satisfy the
|
95
|
+
# regular expressions will be indexed and included in the diff process.
|
96
|
+
# @option options [Hash] :exclude A hash of field name(s) or index(es) to
|
97
|
+
# regular expression(s). Source rows with a field value that satisfies
|
98
|
+
# the regular expressions will be excluded from the diff process.
|
99
|
+
def initialize(options = {})
|
100
|
+
if (options.keys & [:parent_field, :parent_fields, :child_field, :child_fields]).empty? &&
|
101
|
+
(kf = options.fetch(:key_field, options[:key_fields]))
|
102
|
+
@key_fields = [kf].flatten
|
103
|
+
@parent_fields = @key_fields[0...-1]
|
104
|
+
@child_fields = @key_fields[-1..-1]
|
105
|
+
else
|
106
|
+
@parent_fields = [options.fetch(:parent_field, options[:parent_fields]) || []].flatten
|
107
|
+
@child_fields = [options.fetch(:child_field, options[:child_fields]) || [0]].flatten
|
108
|
+
@key_fields = @parent_fields + @child_fields
|
109
|
+
end
|
110
|
+
@field_names = options[:field_names]
|
111
|
+
@case_sensitive = options.fetch(:case_sensitive, true)
|
112
|
+
@trim_whitespace = options.fetch(:trim_whitespace, false)
|
113
|
+
@ignore_header = options[:ignore_header]
|
114
|
+
@include = options[:include]
|
115
|
+
@exclued = options[:exclude]
|
116
|
+
@path = options.fetch(:path, 'NA') unless @path
|
117
|
+
@warnings = []
|
118
|
+
end
|
119
|
+
|
120
|
+
|
121
|
+
def path?
|
122
|
+
@path != 'NA'
|
123
|
+
end
|
124
|
+
|
125
|
+
|
126
|
+
# Returns the row in the CSV source corresponding to the supplied key.
|
127
|
+
#
|
128
|
+
# @param key [String] The unique key to use to lookup the row.
|
129
|
+
# @return [Hash] The fields for the line corresponding to +key+, or nil
|
130
|
+
# if the key is not recognised.
|
131
|
+
def [](key)
|
132
|
+
@lines[key]
|
133
|
+
end
|
134
|
+
|
135
|
+
|
136
|
+
# Given an array of lines, where each line is an array of fields, indexes
|
137
|
+
# the array contents so that it can be looked up by key.
|
138
|
+
def index_source
|
139
|
+
@lines = {}
|
140
|
+
@index = Hash.new{ |h, k| h[k] = [] }
|
141
|
+
if @field_names
|
142
|
+
index_fields
|
143
|
+
include_filter = convert_filter(@include, @field_names)
|
144
|
+
exclude_filter = convert_filter(@exclude, @field_names)
|
145
|
+
end
|
146
|
+
@line_count = 0
|
147
|
+
@skip_count = 0
|
148
|
+
@dup_count = 0
|
149
|
+
line_num = 0
|
150
|
+
@data.each do |row|
|
151
|
+
line_num += 1
|
152
|
+
next if line_num == 1 && @field_names && @ignore_header
|
153
|
+
unless @field_names
|
154
|
+
if row.class.name == 'CSV::Row'
|
155
|
+
@field_names = row.headers.each_with_index.map{ |f, i| f || i.to_s }
|
156
|
+
else
|
157
|
+
@field_names = row.each_with_index.map{ |f, i| f || i.to_s }
|
158
|
+
end
|
159
|
+
index_fields
|
160
|
+
include_filter = convert_filter(@include, @field_names)
|
161
|
+
exclude_filter = convert_filter(@exclude, @field_names)
|
162
|
+
next
|
163
|
+
end
|
164
|
+
field_vals = row
|
165
|
+
line = {}
|
166
|
+
filter = false
|
167
|
+
@field_names.each_with_index do |field, i|
|
168
|
+
val = field_vals[i]
|
169
|
+
val = val.to_s.strip if val && @trim_whitespace
|
170
|
+
line[field] = val
|
171
|
+
if include_filter && f = include_filter[i]
|
172
|
+
filter = !check_filter(f, line[field])
|
173
|
+
end
|
174
|
+
if exclude_filter && f = exclude_filter[i]
|
175
|
+
filter = check_filter(f, line[field])
|
176
|
+
end
|
177
|
+
break if filter
|
178
|
+
end
|
179
|
+
if filter
|
180
|
+
@skip_count += 1
|
181
|
+
next
|
182
|
+
end
|
183
|
+
key_values = @key_field_indexes.map{ |kf| @case_sensitive ?
|
184
|
+
field_vals[kf].to_s :
|
185
|
+
field_vals[kf].to_s.upcase }
|
186
|
+
key = key_values.join('~')
|
187
|
+
parent_key = key_values[0...(@parent_fields.length)].join('~')
|
188
|
+
if @lines[key]
|
189
|
+
@warnings << "Duplicate key '#{key}' encountered at line #{line_num}"
|
190
|
+
@dup_count += 1
|
191
|
+
key += "[#{@dup_count}]"
|
192
|
+
end
|
193
|
+
@index[parent_key] << key
|
194
|
+
@lines[key] = line
|
195
|
+
@line_count += 1
|
196
|
+
end
|
197
|
+
end
|
198
|
+
|
199
|
+
|
200
|
+
# Save the data in this Source as a CSV at +file_path+.
|
201
|
+
#
|
202
|
+
# @param file_path [String] The target path to save the data to.
|
203
|
+
# @param options [Hash] A set of options to pass to CSV.open to control
|
204
|
+
# how the CSV is generated.
|
205
|
+
def save_csv(file_path, options = {})
|
206
|
+
require 'csv'
|
207
|
+
default_opts = {
|
208
|
+
headers: @field_name, write_headers: true
|
209
|
+
}
|
210
|
+
CSV.open(file_path, 'wb', default_opts.merge(options)) do |csv|
|
211
|
+
@data.each{ |rec| csv << rec }
|
212
|
+
end
|
213
|
+
end
|
214
|
+
|
215
|
+
|
216
|
+
private
|
217
|
+
|
218
|
+
|
219
|
+
# Resolves the key, parent and child field specifications against
# @field_names. The resolved positions are stored in @key_field_indexes,
# @parent_field_indexes and @child_field_indexes, after which @key_fields,
# @parent_fields and @child_fields are replaced with the entries of
# @field_names found at those positions.
def index_fields
  names = @field_names

  # Resolve all three specifications first; find_field_indexes raises if a
  # named field cannot be located.
  @key_field_indexes    = find_field_indexes(@key_fields, names)
  @parent_field_indexes = find_field_indexes(@parent_fields, names)
  @child_field_indexes  = find_field_indexes(@child_fields, names)

  # Map each resolved position back to the field name stored there.
  name_at = ->(pos) { names[pos] }
  @key_fields    = @key_field_indexes.map(&name_at)
  @parent_fields = @parent_field_indexes.map(&name_at)
  @child_fields  = @child_field_indexes.map(&name_at)
end
|
227
|
+
|
228
|
+
|
229
|
+
# Converts an array of field names to an array of indexes of the fields
|
230
|
+
# matching those names.
|
231
|
+
def find_field_indexes(key_fields, field_names)
  key_fields.map do |field|
    # Integer entries are taken to be positions already and pass through
    # untouched.
    next field if field.is_a?(Integer)

    # Names are compared case-insensitively; the first match wins.
    wanted = field.to_s.downcase
    position = field_names.index { |candidate| candidate.to_s.downcase == wanted }
    unless position
      raise ArgumentError, "Could not locate field '#{field}' in source field names: #{
        field_names.join(', ')}"
    end
    position
  end
end
|
242
|
+
|
243
|
+
|
244
|
+
# Converts an :include/:exclude filter specification keyed by field name
# or index into a Hash keyed by field index.
#
# @param hsh [Hash, nil] Map of field name/index to filter expression.
# @param field_names [Array] The field names used to resolve name keys.
# @return [Hash, nil] Map of field index to filter expression, or nil when
#   no filter was supplied.
# @raise [ArgumentError] If +hsh+ is not a Hash, or (via find_field_indexes)
#   if a named field cannot be located.
def convert_filter(hsh, field_names)
  return unless hsh
  unless hsh.is_a?(Hash)
    raise ArgumentError, ":include/:exclude option must be a Hash of field name(s)/index(es) to RegExp(s)"
  end

  # Resolve against the field names passed in; the original ignored this
  # parameter and always read @field_names.
  idxs = find_field_indexes(hsh.keys, field_names)
  # keys and values are returned in the same order, so the zip pairs each
  # resolved index with its filter; a later duplicate index wins.
  idxs.zip(hsh.values).to_h
end
|
253
|
+
|
254
|
+
|
255
|
+
# Tests a single field value against a filter expression.
#
# @param filter [String, Regexp, Proc] The filter expression to apply.
# @param field_val [Object] The field value to test.
# @return [Object] A truthy value when the filter is satisfied, otherwise
#   a falsy value.
# @raise [ArgumentError] If +filter+ is not a String, Regexp or Proc.
def check_filter(filter, field_val)
  case filter
  when String
    # NOTE(review): the case-sensitive comparison does not coerce the value
    # to a String, whereas the case-insensitive one does.
    if @case_sensitive
      filter == field_val
    else
      filter.downcase == field_val.to_s.downcase
    end
  when Regexp
    # Coerce non-String values (e.g. Integers) so they are matched on their
    # text rather than raising TypeError. A nil value never matches, which
    # is what Regexp#match(nil) already returned.
    field_val.nil? ? nil : filter.match(field_val.to_s)
  when Proc
    filter.call(field_val)
  else
    raise ArgumentError, "Unsupported filter expression: #{filter.inspect}"
  end
end
|
271
|
+
|
272
|
+
end
|
273
|
+
|
274
|
+
end
|
275
|
+
|
@@ -0,0 +1,142 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'cgi'
|
3
|
+
|
4
|
+
|
5
|
+
class CSVDiff

  # Convert XML content to CSV format using XPath selectors to identify the
  # rows and field values in an XML document.
  class XMLSource < Source

    attr_accessor :context

    # Create a new XMLSource, identified by +path+. Normally this is a path
    # to the XML document, but any value is fine, as it is really just a
    # label to identify this data set. The +options+ hash is also passed on
    # to the Source initializer.
    #
    # @param path [String] A label for this data set (often a path to the
    #   XML document used as the source).
    # @param options [Hash] An options hash.
    # @option options [Array<String>] :field_names The names of each of the
    #   fields in +source+.
    # @option options [Boolean] :ignore_header If true, and :field_names has
    #   been specified, then the first row of the file is ignored.
    # @option options [String] :key_field The name of the field that uniquely
    #   identifies each row.
    # @option options [Array<String>] :key_fields The names of the fields
    #   that uniquely identify each row.
    # @option options [String] :parent_field The name of the field(s) that
    #   identify a parent within which sibling order should be checked.
    # @option options [String] :child_field The name of the field(s) that
    #   uniquely identify a child of a parent.
    # @option options [Boolean] :case_sensitive If true (the default), keys
    #   are indexed as-is; if false, the index is built in upper-case for
    #   case-insensitive comparisons.
    # @option options [Hash] :include A hash of field name(s) or index(es) to
    #   regular expression(s). Only source rows whose field values satisfy the
    #   regular expressions will be indexed and included in the diff process.
    # @option options [Hash] :exclude A hash of field name(s) or index(es) to
    #   regular expression(s). Source rows with a field value that satisfies
    #   the regular expressions will be excluded from the diff process.
    # @option options [String] :context A context value from which fields
    #   can be populated using a Regexp.
    def initialize(path, options = {})
      super(options)
      @path = path
      @context = options[:context]
      @data = []
    end


    # Process a +source+, converting the XML into a table of data, using
    # +rec_xpath+ to identify the nodes that correspond to each record that
    # should appear in the output, and +field_maps+ to populate each field
    # in each row. Rows are appended to the data already held by this
    # source, so +process+ may be called more than once.
    #
    # @param source [Nokogiri::XML::Document|String|Array] may be a parsed
    #   document, a String containing XML content (recognised by the
    #   presence of an <?xml declaration), an Array of paths to files
    #   containing XML content, or a path to a single file.
    # @param rec_xpath [String] An XPath expression that selects all the
    #   items in the XML document that are to be converted into new rows.
    #   The returned items are not directly used to populate the fields,
    #   but provide a context for the field XPath expressions that populate
    #   each field's content.
    # @param field_maps [Hash<String, String>] A map of field names to
    #   expressions that are evaluated in the context of each row node
    #   selected by +rec_xpath+. The field expressions are typically XPath
    #   expressions evaluated in the context of the nodes returned by the
    #   +rec_xpath+. Alternatively, a String that is not an XPath expression
    #   is used as a literal value for a field, while a Regexp can also
    #   be used to pull a value from any context specified in the +options+
    #   hash. The Regexp should include a single grouping, as the value used
    #   will be the first group of the match.
    # @param context [String] An optional context for the XML to be processed.
    #   The value passed here can be referenced in field map expressions
    #   using a Regexp, with the value of the first grouping in the regex
    #   being the value returned for the field. When omitted, the :context
    #   option given to the constructor is used; for file sources the file
    #   path is the final fallback.
    # @return [Array<Array>] The rows accumulated so far.
    # @raise [ArgumentError] If +source+ is of an unsupported type.
    def process(source, rec_xpath, field_maps, context = nil)
      @field_names ||= field_maps.keys
      case source
      when Nokogiri::XML::Document
        add_data(source, rec_xpath, field_maps, context || @context)
      when /<\?xml/
        doc = Nokogiri::XML(source)
        add_data(doc, rec_xpath, field_maps, context || @context)
      when Array
        # Forward the caller's context; it was previously dropped for files.
        source.each { |f| process_file(f, rec_xpath, field_maps, context) }
      when String
        process_file(source, rec_xpath, field_maps, context)
      else
        raise ArgumentError, "Unhandled source type #{source.class.name}"
      end
      @data
    end


    private


    # Load the XML document at +file_path+ and process it into rows of data.
    # The context used for Regexp field maps is +context+ if given, else the
    # source-level context, else the file path itself.
    def process_file(file_path, rec_xpath, field_maps, context = nil)
      File.open(file_path) do |f|
        doc = Nokogiri::XML(f)
        add_data(doc, rec_xpath, field_maps, context || @context || file_path)
      end
    rescue
      # Report which file was being handled, then let the error propagate.
      STDERR.puts "An error occurred while attempting to open #{file_path}"
      raise
    end


    # Locate records in +doc+ using +rec_xpath+ to identify the nodes that
    # correspond to a new record in the data, and +field_maps+ to populate
    # the fields in each row.
    def add_data(doc, rec_xpath, field_maps, context)
      doc.xpath(rec_xpath).each do |rec_node|
        rec = field_maps.map do |_field_name, expr|
          case expr
          when Regexp
            # Match context against the Regexp and take the first grouping;
            # nil when there is no context, no match, or no group. to_s lets
            # a non-String context (e.g. a Pathname) be matched on its text.
            context ? context.to_s[expr, 1] : nil
          when %r{[/(.@]}
            # Any String containing '/', '(', '.' or '@' is evaluated as an
            # XPath expression relative to the record node.
            res = rec_node.xpath(expr)
            CGI.unescape_html(res.to_s)
          else
            # Use expr as the literal value for this field
            expr
          end
        end
        @data << rec
      end
    end

  end

end
|
142
|
+
|
metadata
CHANGED
@@ -1,27 +1,44 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csv-diff
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adam Gardiner
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-07-15 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
|
-
description:
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
13
|
+
description: |2
|
14
|
+
This library performs diffs of CSV data, or any table-like source.
|
15
|
+
|
16
|
+
Unlike a standard diff that compares line by line, and is sensitive to the
|
17
|
+
ordering of records, CSV-Diff identifies common lines by key field(s), and
|
18
|
+
then compares the contents of the fields in each line.
|
19
|
+
|
20
|
+
Data may be supplied in the form of CSV files, or as an array of arrays. The
|
21
|
+
diff process provides a fine level of control over what to diff, and can
|
22
|
+
optionally ignore certain types of changes (e.g. changes in position).
|
23
|
+
|
24
|
+
CSV-Diff is particularly well suited to data in parent-child format. Parent-
|
25
|
+
child data does not lend itself well to standard text diffs, as small changes
|
26
|
+
in the organisation of the tree at an upper level can lead to big movements
|
27
|
+
in the position of descendant records. By instead matching records by key,
|
28
|
+
CSV-Diff avoids this issue, while still being able to detect changes in
|
29
|
+
sibling order.
|
30
|
+
|
31
|
+
This gem implements the core diff algorithm, and handles the loading and
|
32
|
+
diffing of CSV files (or Arrays of Arrays). It also supports converting
|
33
|
+
data in XML format into tabular form, so that it can then be processed
|
34
|
+
like any other CSV or table-like source. It returns a CSVDiff object
|
35
|
+
containing the details of differences in object form. This is useful for
|
36
|
+
projects that need diff capability, but want to handle the reporting or
|
37
|
+
actioning of differences themselves.
|
38
|
+
|
39
|
+
For a pre-built diff reporting capability, see the csv-diff-report gem,
|
40
|
+
which provides a command-line tool for generating diff reports in HTML,
|
41
|
+
Excel, or text formats.
|
25
42
|
email: adam.b.gardiner@gmail.com
|
26
43
|
executables: []
|
27
44
|
extensions: []
|
@@ -33,9 +50,12 @@ files:
|
|
33
50
|
- lib/csv-diff/algorithm.rb
|
34
51
|
- lib/csv-diff/csv_diff.rb
|
35
52
|
- lib/csv-diff/csv_source.rb
|
53
|
+
- lib/csv-diff/source.rb
|
54
|
+
- lib/csv-diff/xml_source.rb
|
36
55
|
- lib/csv_diff.rb
|
37
56
|
homepage: https://github.com/agardiner/csv-diff
|
38
|
-
licenses:
|
57
|
+
licenses:
|
58
|
+
- MIT
|
39
59
|
metadata: {}
|
40
60
|
post_install_message: For command-line tools and diff reports, 'gem install csv-diff-report'
|
41
61
|
rdoc_options: []
|
@@ -43,18 +63,18 @@ require_paths:
|
|
43
63
|
- lib
|
44
64
|
required_ruby_version: !ruby/object:Gem::Requirement
|
45
65
|
requirements:
|
46
|
-
- -
|
66
|
+
- - ">="
|
47
67
|
- !ruby/object:Gem::Version
|
48
68
|
version: '0'
|
49
69
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
50
70
|
requirements:
|
51
|
-
- -
|
71
|
+
- - ">="
|
52
72
|
- !ruby/object:Gem::Version
|
53
73
|
version: '0'
|
54
74
|
requirements: []
|
55
75
|
rubyforge_project:
|
56
|
-
rubygems_version: 2.
|
76
|
+
rubygems_version: 2.5.2.3
|
57
77
|
signing_key:
|
58
78
|
specification_version: 4
|
59
|
-
summary: CSV Diff is a library for generating diffs from data in CSV format
|
79
|
+
summary: CSV Diff is a library for generating diffs from data in CSV or XML format
|
60
80
|
test_files: []
|