csv-diff 0.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -13
- data/LICENSE +1 -1
- data/README.md +18 -1
- data/lib/csv-diff.rb +1 -0
- data/lib/csv-diff/algorithm.rb +145 -41
- data/lib/csv-diff/csv_diff.rb +24 -13
- data/lib/csv-diff/csv_source.rb +24 -101
- data/lib/csv-diff/source.rb +275 -0
- data/lib/csv-diff/xml_source.rb +142 -0
- metadata +39 -19
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
data.tar.gz: !binary |-
|
6
|
-
NzAyNTY1OTk3MTA3Y2ZhNjk2YWRmNTJkYTljNGZhZDY2YjQ1OTg2ZQ==
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: d1b3b8deee34344d334e740285cb1f3c99074694
|
4
|
+
data.tar.gz: d95158d13861cb66fd460ee430714ec3c83cd0b1
|
7
5
|
SHA512:
|
8
|
-
metadata.gz:
|
9
|
-
|
10
|
-
YmVkNDVhMjk1M2EyNjFkZGIxOGE0Y2MwOWQwMWRhNzhjZDk2N2RhZmEyZGRm
|
11
|
-
MjliNmM4Y2ZmNzY4ZTJkY2EzZWY4Mjg3NmU3ZjQxM2RkYTBjODE=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
YTljOGZhNDY0YzdjZGYzMTA2NTM3MzIwNDg4MTcwYWEyM2IyZTc1YWYxMjFm
|
14
|
-
YjMwYWU1NWMzNGVkZGRkYWYyZjUwMTQ2MWZlMjdkNjQwMjIwYWUwNmNlYjM3
|
15
|
-
NDlmZDk5MGNlMTk4ZDhlMzFiOGUyZTIwY2EyZTY3MjUwYjc2NWY=
|
6
|
+
metadata.gz: 50c74d6a4093012b0ba44fef70c2d749348d6777cfb9f2cfda66c6e075423191a4c6c22019a388b9d8bd14e22ac60d539f4e3b4aa85fd87fd774a64da15858c7
|
7
|
+
data.tar.gz: 8fa030a54e7a97db9913b3c36a1942de1e07a6549f9ae7aa58b5b3f44d522fe11f72d44e18b6b7612d2b2dc9f106ece1fea183557c507bcf18316891ab63f230
|
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -125,7 +125,7 @@ column in the data. In this case, a diff can be created simply via:
|
|
125
125
|
diff = CSVDiff.new(file1, file2)
|
126
126
|
```
|
127
127
|
|
128
|
-
###
|
128
|
+
### Specifying Unique Row Identifiers
|
129
129
|
|
130
130
|
Often however, rows are not uniquely identifiable via the first column in the file.
|
131
131
|
In a parent-child hierarchy, for example, combinations of parent and child may be
|
@@ -211,6 +211,23 @@ diff = CSVDiff.new(file1, file2, parent_field: 'Date', child_fields: ['HomeTeam'
|
|
211
211
|
ignore_fields: ['CreatedAt', 'UpdatedAt'])
|
212
212
|
```
|
213
213
|
|
214
|
+
### Filtering Rows
|
215
|
+
|
216
|
+
If you need to filter source data before running the diff process, you can use the :include
|
217
|
+
and :exclude options to do so. Both options take a Hash as their value; the hash should have
|
218
|
+
keys that are the field names or indexes (0-based) on which to filter, and whose values are
|
219
|
+
regular expressions or lambdas to be applied to values of the corresponding field. Rows will
|
220
|
+
only be diffed if they satisfy :include conditions, and do not satisfy :exclude conditions.
|
221
|
+
```ruby
|
222
|
+
# Generate a diff of Arsenal home games not refereed by Clattenburg
|
223
|
+
diff = CSVDiff.new(file1, file2, parent_field: 'Date', child_fields: ['HomeTeam', 'AwayTeam'],
|
224
|
+
include: {HomeTeam: 'Arsenal'}, exclude: {Referee: /Clattenburg/})
|
225
|
+
|
226
|
+
# Generate a diff of games played over the Xmas/New Year period
|
227
|
+
diff = CSVDiff.new(file1, file2, parent_field: 'Date', child_fields: ['HomeTeam', 'AwayTeam'],
|
228
|
+
include: {Date: lambda{ |d| holiday_period.include?(Date.strptime(d, '%y/%m/%d')) } })
|
229
|
+
```
|
230
|
+
|
214
231
|
### Ignoring Certain Changes
|
215
232
|
|
216
233
|
CSVDiff identifies Adds, Updates, Moves and Deletes; any of these changes can be selectively
|
data/lib/csv-diff.rb
CHANGED
data/lib/csv-diff/algorithm.rb
CHANGED
@@ -3,6 +3,55 @@ class CSVDiff
|
|
3
3
|
# Implements the CSV diff algorithm.
|
4
4
|
module Algorithm
|
5
5
|
|
6
|
+
# Holds the details of a single difference
|
7
|
+
class Diff
|
8
|
+
|
9
|
+
attr_accessor :diff_type
|
10
|
+
attr_reader :fields
|
11
|
+
attr_reader :row
|
12
|
+
attr_reader :sibling_position
|
13
|
+
|
14
|
+
def initialize(diff_type, fields, row_idx, pos_idx)
|
15
|
+
@diff_type = diff_type
|
16
|
+
@fields = fields
|
17
|
+
@row = row_idx + 1
|
18
|
+
self.sibling_position = pos_idx
|
19
|
+
end
|
20
|
+
|
21
|
+
|
22
|
+
def sibling_position=(pos_idx)
|
23
|
+
if pos_idx.is_a?(Array)
|
24
|
+
pos_idx.compact!
|
25
|
+
if pos_idx.first != pos_idx.last
|
26
|
+
@sibling_position = pos_idx.map{ |pos| pos + 1 }
|
27
|
+
else
|
28
|
+
@sibling_position = pos_idx.first + 1
|
29
|
+
end
|
30
|
+
else
|
31
|
+
@sibling_position = pos_idx + 1
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
|
36
|
+
# For backwards compatibility and access to fields with differences
|
37
|
+
def [](key)
|
38
|
+
case key
|
39
|
+
when :action
|
40
|
+
a = diff_type.to_s
|
41
|
+
a[0] = a[0].upcase
|
42
|
+
a
|
43
|
+
when :row
|
44
|
+
@row
|
45
|
+
when :sibling_position
|
46
|
+
@sibling_position
|
47
|
+
else
|
48
|
+
@fields[key]
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
53
|
+
|
54
|
+
|
6
55
|
# Diffs two CSVSource structures.
|
7
56
|
#
|
8
57
|
# @param left [CSVSource] A CSVSource object containing the contents of
|
@@ -13,24 +62,70 @@ class CSVDiff
|
|
13
62
|
# that uniquely identify each row.
|
14
63
|
# @param diff_fields [Array] An array containing the names of the fields
|
15
64
|
# to be diff-ed.
|
65
|
+
# @param options [Hash] An options hash.
|
66
|
+
# @option options [Boolean] :ignore_adds If set to true, we ignore any
|
67
|
+
# new items that appear only in +right+.
|
68
|
+
# @option options [Boolean] :ignore_moves If set to true, we ignore any
|
69
|
+
# changes in sibling order.
|
70
|
+
# @option options [Boolean] :ignore_updates If set to true, we ignore any
|
71
|
+
# items that exist in both +left+ and +right+.
|
72
|
+
# @option options [Boolean] :ignore_deletes If set to true, we ignore any
|
73
|
+
# items that appear only in +left+.
|
74
|
+
# @option options [Hash<Object,Proc>] :equality_procs A Hash mapping fields
|
75
|
+
# to a 2-arg Proc that should be used to compare values in that field for
|
76
|
+
# equality.
|
16
77
|
def diff_sources(left, right, key_fields, diff_fields, options = {})
|
78
|
+
unless left.case_sensitive? == right.case_sensitive?
|
79
|
+
raise ArgumentError, "Left and right must have same settings for case-sensitivity"
|
80
|
+
end
|
81
|
+
unless left.parent_fields.length == right.parent_fields.length
|
82
|
+
raise ArgumentError, "Left and right must have same settings for parent/child fields"
|
83
|
+
end
|
84
|
+
|
85
|
+
# Ensure key fields are not also in the diff_fields
|
86
|
+
diff_fields = diff_fields - key_fields
|
87
|
+
|
17
88
|
left_index = left.index
|
18
89
|
left_values = left.lines
|
19
90
|
left_keys = left_values.keys
|
20
91
|
right_index = right.index
|
21
92
|
right_values = right.lines
|
22
93
|
right_keys = right_values.keys
|
23
|
-
|
94
|
+
parent_field_count = left.parent_fields.length
|
24
95
|
|
25
96
|
include_adds = !options[:ignore_adds]
|
26
97
|
include_moves = !options[:ignore_moves]
|
27
98
|
include_updates = !options[:ignore_updates]
|
28
99
|
include_deletes = !options[:ignore_deletes]
|
29
100
|
|
30
|
-
|
101
|
+
@case_sensitive = left.case_sensitive?
|
102
|
+
@equality_procs = options.fetch(:equality_procs, {})
|
103
|
+
|
104
|
+
diffs = {}
|
105
|
+
potential_moves = Hash.new{ |h, k| h[k] = [] }
|
106
|
+
|
107
|
+
# First identify deletions
|
108
|
+
if include_deletes
|
109
|
+
(left_keys - right_keys).each do |key|
|
110
|
+
# Delete
|
111
|
+
key_vals = key.split('~', -1)
|
112
|
+
parent = key_vals[0...parent_field_count].join('~')
|
113
|
+
child = key_vals[parent_field_count..-1].join('~')
|
114
|
+
left_parent = left_index[parent]
|
115
|
+
left_value = left_values[key]
|
116
|
+
row_idx = left_keys.index(key)
|
117
|
+
sib_idx = left_parent.index(key)
|
118
|
+
raise "Can't locate key #{key} in parent #{parent}" unless sib_idx
|
119
|
+
diffs[key] = Diff.new(:delete, left_value, row_idx, sib_idx)
|
120
|
+
potential_moves[child] << key
|
121
|
+
#puts "Delete: #{key}"
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
# Now identify adds/updates
|
31
126
|
right_keys.each_with_index do |key, right_row_id|
|
32
|
-
key_vals = key.split('~')
|
33
|
-
parent = key_vals[0...
|
127
|
+
key_vals = key.split('~', -1)
|
128
|
+
parent = key_vals[0...parent_field_count].join('~')
|
34
129
|
left_parent = left_index[parent]
|
35
130
|
right_parent = right_index[parent]
|
36
131
|
left_value = left_values[key]
|
@@ -38,13 +133,12 @@ class CSVDiff
|
|
38
133
|
left_idx = left_parent && left_parent.index(key)
|
39
134
|
right_idx = right_parent && right_parent.index(key)
|
40
135
|
|
41
|
-
id = {}
|
42
|
-
id[:row] = right_row_id + 1
|
43
|
-
id[:sibling_position] = right_idx + 1
|
44
|
-
key_fields.each do |field_name|
|
45
|
-
id[field_name] = right_value[field_name]
|
46
|
-
end
|
47
136
|
if left_idx && right_idx
|
137
|
+
if include_updates && (changes = diff_row(left_value, right_value, diff_fields))
|
138
|
+
id = id_fields(key_fields, right_value)
|
139
|
+
diffs[key] = Diff.new(:update, id.merge!(changes), right_row_id, right_idx)
|
140
|
+
#puts "Change: #{key}"
|
141
|
+
end
|
48
142
|
if include_moves
|
49
143
|
left_common = left_parent & right_parent
|
50
144
|
right_common = right_parent & left_parent
|
@@ -52,42 +146,34 @@ class CSVDiff
|
|
52
146
|
right_pos = right_common.index(key)
|
53
147
|
if left_pos != right_pos
|
54
148
|
# Move
|
55
|
-
diffs[key]
|
56
|
-
|
149
|
+
if d = diffs[key]
|
150
|
+
d.sibling_position = [left_idx, right_idx]
|
151
|
+
else
|
152
|
+
id = id_fields(key_fields, right_value)
|
153
|
+
diffs[key] = Diff.new(:move, id, right_row_id, [left_idx, right_idx])
|
154
|
+
end
|
57
155
|
#puts "Move #{left_idx} -> #{right_idx}: #{key}"
|
58
156
|
end
|
59
157
|
end
|
60
|
-
|
61
|
-
diffs[key].merge!(id.merge(changes.merge(:action => 'Update')))
|
62
|
-
#puts "Change: #{key}"
|
63
|
-
end
|
64
|
-
elsif include_adds && right_idx
|
158
|
+
elsif right_idx
|
65
159
|
# Add
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
left_idx = left_parent.index(key)
|
80
|
-
next unless left_idx
|
81
|
-
id = {}
|
82
|
-
id[:row] = left_keys.index(key) + 1
|
83
|
-
id[:sibling_position] = left_idx + 1
|
84
|
-
key_fields.each do |field_name|
|
85
|
-
id[field_name] = left_value[field_name]
|
160
|
+
child = key_vals[parent_field_count..-1].join('~')
|
161
|
+
if potential_moves.has_key?(child) && old_key = potential_moves[child].pop
|
162
|
+
diffs.delete(old_key)
|
163
|
+
if include_updates
|
164
|
+
left_value = left_values[old_key]
|
165
|
+
id = id_fields(right.child_fields, right_value)
|
166
|
+
changes = diff_row(left_value, right_value, left.parent_fields + diff_fields)
|
167
|
+
diffs[key] = Diff.new(:update, id.merge!(changes), right_row_id, right_idx)
|
168
|
+
#puts "Update Parent: #{key}"
|
169
|
+
end
|
170
|
+
elsif include_adds
|
171
|
+
diffs[key] = Diff.new(:add, right_value, right_row_id, right_idx)
|
172
|
+
#puts "Add: #{key}"
|
86
173
|
end
|
87
|
-
diffs[key].merge!(id.merge(left_values[key].merge(:action => 'Delete')))
|
88
|
-
#puts "Delete: #{key}"
|
89
174
|
end
|
90
175
|
end
|
176
|
+
|
91
177
|
diffs
|
92
178
|
end
|
93
179
|
|
@@ -99,24 +185,42 @@ class CSVDiff
|
|
99
185
|
# file.
|
100
186
|
# @param right_row [Hash] The version of the CSV row from the right/to
|
101
187
|
# file.
|
188
|
+
# @param fields [Array<String>] An array of field names to compare.
|
102
189
|
# @return [Hash<String, Array>] A Hash whose keys are the fields that
|
103
190
|
# contain differences, and whose values are a two-element array of
|
104
191
|
# [left/from, right/to] values.
|
105
192
|
def diff_row(left_row, right_row, fields)
|
106
193
|
diffs = {}
|
107
194
|
fields.each do |attr|
|
195
|
+
eq_proc = @equality_procs[attr]
|
108
196
|
right_val = right_row[attr]
|
109
197
|
right_val = nil if right_val == ""
|
110
198
|
left_val = left_row[attr]
|
111
199
|
left_val = nil if left_val == ""
|
112
|
-
if
|
200
|
+
if eq_proc
|
201
|
+
diffs[attr] = [left_val, right_val] unless eq_proc.call(left_val, right_val)
|
202
|
+
elsif @case_sensitive
|
203
|
+
diffs[attr] = [left_val, right_val] unless left_val == right_val
|
204
|
+
elsif (left_val.to_s.upcase != right_val.to_s.upcase)
|
113
205
|
diffs[attr] = [left_val, right_val]
|
114
|
-
#puts "#{attr}: #{left_val} -> #{right_val}"
|
115
206
|
end
|
116
207
|
end
|
117
208
|
diffs if diffs.size > 0
|
118
209
|
end
|
119
210
|
|
211
|
+
|
212
|
+
private
|
213
|
+
|
214
|
+
|
215
|
+
# Return a hash containing just the key field values
|
216
|
+
def id_fields(key_fields, fields)
|
217
|
+
id = {}
|
218
|
+
key_fields.each do |field_name|
|
219
|
+
id[field_name] = fields[field_name]
|
220
|
+
end
|
221
|
+
id
|
222
|
+
end
|
223
|
+
|
120
224
|
end
|
121
225
|
|
122
226
|
end
|
data/lib/csv-diff/csv_diff.rb
CHANGED
@@ -28,13 +28,15 @@ class CSVDiff
|
|
28
28
|
# @return [Array<String>] An array of field names that are compared in the
|
29
29
|
# diff process.
|
30
30
|
attr_reader :diff_fields
|
31
|
-
# @return [Array<
|
32
|
-
#
|
31
|
+
# @return [Array<String>] An array of field names of the key fields that
|
32
|
+
# uniquely identify each row.
|
33
33
|
attr_reader :key_fields
|
34
34
|
# @return [Array<String>] An array of field names for the parent field(s).
|
35
35
|
attr_reader :parent_fields
|
36
36
|
# @return [Array<String>] An array of field names for the child field(s).
|
37
37
|
attr_reader :child_fields
|
38
|
+
# @return [Hash] The options hash used for the diff.
|
39
|
+
attr_reader :options
|
38
40
|
|
39
41
|
|
40
42
|
# Generates a diff between two hierarchical tree structures, provided
|
@@ -79,13 +81,15 @@ class CSVDiff
|
|
79
81
|
# @option options [Boolean] :ignore_deletes If true, records that appear
|
80
82
|
# in the left/from file but not in the right/to file are not reported.
|
81
83
|
def initialize(left, right, options = {})
|
82
|
-
@left = left.is_a?(
|
84
|
+
@left = left.is_a?(Source) ? left : CSVSource.new(left, options)
|
85
|
+
@left.index_source if @left.lines.nil?
|
83
86
|
raise "No field names found in left (from) source" unless @left.field_names && @left.field_names.size > 0
|
84
|
-
@right = right.is_a?(
|
87
|
+
@right = right.is_a?(Source) ? right : CSVSource.new(right, options)
|
88
|
+
@right.index_source if @right.lines.nil?
|
85
89
|
raise "No field names found in right (to) source" unless @right.field_names && @right.field_names.size > 0
|
86
90
|
@warnings = []
|
87
|
-
@diff_fields = get_diff_fields(@left.field_names, @right.field_names, options
|
88
|
-
@key_fields = @left.key_fields
|
91
|
+
@diff_fields = get_diff_fields(@left.field_names, @right.field_names, options)
|
92
|
+
@key_fields = @left.key_fields
|
89
93
|
diff(options)
|
90
94
|
end
|
91
95
|
|
@@ -93,6 +97,7 @@ class CSVDiff
|
|
93
97
|
# Performs a diff with the specified +options+.
|
94
98
|
def diff(options = {})
|
95
99
|
@summary = nil
|
100
|
+
@options = options
|
96
101
|
@diffs = diff_sources(@left, @right, @key_fields, @diff_fields, options)
|
97
102
|
end
|
98
103
|
|
@@ -134,15 +139,21 @@ class CSVDiff
|
|
134
139
|
|
135
140
|
# Given two sets of field names, determines the common set of fields present
|
136
141
|
# in both, on which members can be diffed.
|
137
|
-
def get_diff_fields(left_fields, right_fields,
|
142
|
+
def get_diff_fields(left_fields, right_fields, options)
|
143
|
+
ignore_fields = options.fetch(:ignore_fields, [])
|
144
|
+
ignore_fields = [ignore_fields] unless ignore_fields.is_a?(Array)
|
145
|
+
ignore_fields.map! do |f|
|
146
|
+
(f.is_a?(Numeric) ? right_fields[f] : f).upcase
|
147
|
+
end
|
138
148
|
diff_fields = []
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
@warnings << "Field '#{fld}' is missing from the left (from) file, and won't be diffed"
|
149
|
+
if options[:diff_common_fields_only]
|
150
|
+
right_fields.each_with_index do |fld, i|
|
151
|
+
if left_fields.include?(fld)
|
152
|
+
diff_fields << fld unless ignore_fields.include?(fld.upcase)
|
153
|
+
end
|
145
154
|
end
|
155
|
+
else
|
156
|
+
diff_fields = (right_fields + left_fields).uniq.reject{ |fld| ignore_fields.include?(fld.upcase) }
|
146
157
|
end
|
147
158
|
diff_fields
|
148
159
|
end
|
data/lib/csv-diff/csv_source.rb
CHANGED
@@ -2,31 +2,7 @@ class CSVDiff
|
|
2
2
|
|
3
3
|
# Represents a CSV input (i.e. the left/from or right/to input) to the diff
|
4
4
|
# process.
|
5
|
-
class CSVSource
|
6
|
-
|
7
|
-
# @return [String] the path to the source file
|
8
|
-
attr_accessor :path
|
9
|
-
# @return [Array<String>] The names of the fields in the source file
|
10
|
-
attr_reader :field_names
|
11
|
-
# @return [Array<String>] The names of the field(s) that uniquely
|
12
|
-
# identify each row.
|
13
|
-
attr_reader :key_fields
|
14
|
-
# @return [Array<String>] The names of the field(s) that identify a
|
15
|
-
# common parent of child records.
|
16
|
-
attr_reader :parent_fields
|
17
|
-
# @return [Array<String>] The names of the field(s) that distinguish a
|
18
|
-
# child of a parent record.
|
19
|
-
attr_reader :child_fields
|
20
|
-
# @return [Hash<String,Hash>] A hash containing each line of the source,
|
21
|
-
# keyed on the values of the +key_fields+.
|
22
|
-
attr_reader :lines
|
23
|
-
# @return [Hash<String,Array<String>>] A hash containing each parent key,
|
24
|
-
# and an Array of the child keys it is a parent of.
|
25
|
-
attr_reader :index
|
26
|
-
# @return [Array<String>] An array of any warnings encountered while
|
27
|
-
# processing the source.
|
28
|
-
attr_reader :warnings
|
29
|
-
|
5
|
+
class CSVSource < Source
|
30
6
|
|
31
7
|
# Creates a new diff source.
|
32
8
|
#
|
@@ -59,90 +35,37 @@ class CSVDiff
|
|
59
35
|
# identifies each row.
|
60
36
|
# @option options [Array<String>] :key_fields The names of the fields
|
61
37
|
# that uniquely identifies each row.
|
62
|
-
# @option options [String] :parent_field The name of the field that
|
63
|
-
#
|
64
|
-
# @option options [String] :child_field The name of the field that
|
65
|
-
# uniquely
|
38
|
+
# @option options [String] :parent_field The name of the field(s) that
|
39
|
+
# identify a parent within which sibling order should be checked.
|
40
|
+
# @option options [String] :child_field The name of the field(s) that
|
41
|
+
# uniquely identify a child of a parent.
|
42
|
+
# @option options [Boolean] :case_sensitive If true (the default), keys
|
43
|
+
# are indexed as-is; if false, the index is built in upper-case for
|
44
|
+
# case-insensitive comparisons.
|
45
|
+
# @option options [Hash] :include A hash of field name(s) or index(es) to
|
46
|
+
# regular expression(s). Only source rows whose field values satisfy the
|
47
|
+
# regular expressions will be indexed and included in the diff process.
|
48
|
+
# @option options [Hash] :exclude A hash of field name(s) or index(es) to
|
49
|
+
# regular expression(s). Source rows with a field value that satisfies
|
50
|
+
# the regular expressions will be excluded from the diff process.
|
66
51
|
def initialize(source, options = {})
|
52
|
+
super(options)
|
67
53
|
if source.is_a?(String)
|
68
54
|
require 'csv'
|
69
55
|
mode_string = options[:encoding] ? "r:#{options[:encoding]}" : 'r'
|
70
56
|
csv_options = options.fetch(:csv_options, {})
|
71
57
|
@path = source
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
@parent_fields = @key_fields[0...-1]
|
77
|
-
@child_fields = @key_fields[-1..-1]
|
78
|
-
else
|
79
|
-
@parent_fields = [options.fetch(:parent_field, options[:parent_fields]) || []].flatten
|
80
|
-
@child_fields = [options.fetch(:child_field, options[:child_fields]) || [0]].flatten
|
81
|
-
@key_fields = @parent_fields + @child_fields
|
82
|
-
end
|
83
|
-
@field_names = options[:field_names]
|
84
|
-
@warnings = []
|
85
|
-
index_source(source, options)
|
86
|
-
end
|
87
|
-
|
88
|
-
|
89
|
-
# Returns the row in the CSV source corresponding to the supplied key.
|
90
|
-
#
|
91
|
-
# @param key [String] The unique key to use to lookup the row.
|
92
|
-
# @return [Hash] The fields for the line corresponding to +key+, or nil
|
93
|
-
# if the key is not recognised.
|
94
|
-
def [](key)
|
95
|
-
@lines[key]
|
96
|
-
end
|
97
|
-
|
98
|
-
|
99
|
-
private
|
100
|
-
|
101
|
-
# Given an array of lines, where each line is an array of fields, indexes
|
102
|
-
# the array contents so that it can be looked up by key.
|
103
|
-
def index_source(lines, options)
|
104
|
-
@lines = {}
|
105
|
-
@index = Hash.new{ |h, k| h[k] = [] }
|
106
|
-
@key_fields = find_field_indexes(@key_fields, @field_names) if @field_names
|
107
|
-
line_num = 0
|
108
|
-
lines.each do |row|
|
109
|
-
line_num += 1
|
110
|
-
next if line_num == 1 && @field_names && options[:ignore_header]
|
111
|
-
unless @field_names
|
112
|
-
@field_names = row
|
113
|
-
@key_fields = find_field_indexes(@key_fields, @field_names)
|
114
|
-
next
|
115
|
-
end
|
116
|
-
field_vals = row
|
117
|
-
line = {}
|
118
|
-
@field_names.each_with_index do |field, i|
|
119
|
-
line[field] = field_vals[i]
|
120
|
-
end
|
121
|
-
key_values = @key_fields.map{ |kf| field_vals[kf].to_s.upcase }
|
122
|
-
key = key_values.join('~')
|
123
|
-
parent_key = key_values[0...(@parent_fields.length)].join('~')
|
124
|
-
if @lines[key]
|
125
|
-
@warnings << "Duplicate key '#{key}' encountered and ignored at line #{line_num}"
|
126
|
-
else
|
127
|
-
@index[parent_key] << key
|
128
|
-
@lines[key] = line
|
129
|
-
end
|
130
|
-
end
|
131
|
-
end
|
132
|
-
|
133
|
-
|
134
|
-
# Converts an array of field names to an array of indexes of the fields
|
135
|
-
# matching those names.
|
136
|
-
def find_field_indexes(key_fields, field_names)
|
137
|
-
key_fields.map do |field|
|
138
|
-
if field.is_a?(Fixnum)
|
139
|
-
field
|
140
|
-
else
|
141
|
-
field_names.index{ |field_name| field.to_s.downcase == field_name.downcase } or
|
142
|
-
raise ArgumentError, "Could not locate field '#{field}' in source field names: #{
|
143
|
-
field_names.join(', ')}"
|
58
|
+
# When you call CSV.open, it's best to pass in a block so that after it's yielded,
|
59
|
+
# the underlying file handle is closed. Otherwise, you risk leaking the handle.
|
60
|
+
@data = CSV.open(@path, mode_string, csv_options) do |csv|
|
61
|
+
csv.readlines
|
144
62
|
end
|
63
|
+
elsif source.is_a?(Enumerable) && source.size == 0 || (source.size > 0 && source.first.is_a?(Enumerable))
|
64
|
+
@data = source
|
65
|
+
else
|
66
|
+
raise ArgumentError, "source must be a path to a file or an Enumerable<Enumerable>"
|
145
67
|
end
|
68
|
+
index_source
|
146
69
|
end
|
147
70
|
|
148
71
|
end
|
@@ -0,0 +1,275 @@
|
|
1
|
+
class CSVDiff
|
2
|
+
|
3
|
+
# Represents an input (i.e. the left/from or right/to input) to the diff
|
4
|
+
# process.
|
5
|
+
class Source
|
6
|
+
|
7
|
+
# @return [String] the path to the source file
|
8
|
+
attr_accessor :path
|
9
|
+
# @return [Array<Array>] The data for this source
|
10
|
+
attr_reader :data
|
11
|
+
|
12
|
+
# @return [Array<String>] The names of the fields in the source file
|
13
|
+
attr_reader :field_names
|
14
|
+
# @return [Array<String>] The names of the field(s) that uniquely
|
15
|
+
# identify each row.
|
16
|
+
attr_reader :key_fields
|
17
|
+
# @return [Array<String>] The names of the field(s) that identify a
|
18
|
+
# common parent of child records.
|
19
|
+
attr_reader :parent_fields
|
20
|
+
# @return [Array<String>] The names of the field(s) that distinguish a
|
21
|
+
# child of a parent record.
|
22
|
+
attr_reader :child_fields
|
23
|
+
|
24
|
+
# @return [Array<Fixnum>] The indexes of the key fields in the source
|
25
|
+
# file.
|
26
|
+
attr_reader :key_field_indexes
|
27
|
+
# @return [Array<Fixnum>] The indexes of the parent fields in the source
|
28
|
+
# file.
|
29
|
+
attr_reader :parent_field_indexes
|
30
|
+
# @return [Array<Fixnum>] The indexes of the child fields in the source
|
31
|
+
# file.
|
32
|
+
attr_reader :child_field_indexes
|
33
|
+
|
34
|
+
# @return [Boolean] True if the source has been indexed with case-
|
35
|
+
# sensitive keys, or false if it has been indexed using upper-case key
|
36
|
+
# values.
|
37
|
+
attr_reader :case_sensitive
|
38
|
+
alias_method :case_sensitive?, :case_sensitive
|
39
|
+
# @return [Boolean] True if leading/trailing whitespace should be stripped
|
40
|
+
# from fields
|
41
|
+
attr_reader :trim_whitespace
|
42
|
+
# @return [Hash<String,Hash>] A hash containing each line of the source,
|
43
|
+
# keyed on the values of the +key_fields+.
|
44
|
+
attr_reader :lines
|
45
|
+
# @return [Hash<String,Array<String>>] A hash containing each parent key,
|
46
|
+
# and an Array of the child keys it is a parent of.
|
47
|
+
attr_reader :index
|
48
|
+
# @return [Array<String>] An array of any warnings encountered while
|
49
|
+
# processing the source.
|
50
|
+
attr_reader :warnings
|
51
|
+
# @return [Fixnum] A count of the lines processed from this source.
|
52
|
+
# Excludes any header and duplicate records identified during indexing.
|
53
|
+
attr_reader :line_count
|
54
|
+
# @return [Fixnum] A count of the lines from this source that were skipped
|
55
|
+
# due to filter conditions.
|
56
|
+
attr_reader :skip_count
|
57
|
+
# @return [Fixnum] A count of the lines from this source that had the same
|
58
|
+
# key value as another line.
|
59
|
+
attr_reader :dup_count
|
60
|
+
|
61
|
+
|
62
|
+
# Creates a new diff source.
|
63
|
+
#
|
64
|
+
# A diff source must contain at least one field that will be used as the
|
65
|
+
# key to identify the same record in a different version of this file.
|
66
|
+
# If not specified via one of the options, the first field is assumed to
|
67
|
+
# be the unique key.
|
68
|
+
#
|
69
|
+
# If multiple fields combine to form a unique key, the parent is assumed
|
70
|
+
# to be identified by all but the last field of the unique key. If finer
|
71
|
+
# control is required, use a combination of the :parent_fields and
|
72
|
+
# :child_fields options.
|
73
|
+
#
|
74
|
+
# All key options can be specified either by field name, or by field
|
75
|
+
# index (0 based).
|
76
|
+
#
|
77
|
+
# @param options [Hash] An options hash.
|
78
|
+
# @option options [Array<String>] :field_names The names of each of the
|
79
|
+
# fields in +source+.
|
80
|
+
# @option options [Boolean] :ignore_header If true, and :field_names has
|
81
|
+
# been specified, then the first row of the file is ignored.
|
82
|
+
# @option options [String] :key_field The name of the field that uniquely
|
83
|
+
# identifies each row.
|
84
|
+
# @option options [Array<String>] :key_fields The names of the fields
|
85
|
+
# that uniquely identifies each row.
|
86
|
+
# @option options [String] :parent_field The name of the field(s) that
|
87
|
+
# identify a parent within which sibling order should be checked.
|
88
|
+
# @option options [String] :child_field The name of the field(s) that
|
89
|
+
# uniquely identify a child of a parent.
|
90
|
+
# @option options [Boolean] :case_sensitive If true (the default), keys
|
91
|
+
# are indexed as-is; if false, the index is built in upper-case for
|
92
|
+
# case-insensitive comparisons.
|
93
|
+
# @option options [Hash] :include A hash of field name(s) or index(es) to
|
94
|
+
# regular expression(s). Only source rows whose field values satisfy the
|
95
|
+
# regular expressions will be indexed and included in the diff process.
|
96
|
+
# @option options [Hash] :exclude A hash of field name(s) or index(es) to
|
97
|
+
# regular expression(s). Source rows with a field value that satisfies
|
98
|
+
# the regular expressions will be excluded from the diff process.
|
99
|
+
def initialize(options = {})
|
100
|
+
if (options.keys & [:parent_field, :parent_fields, :child_field, :child_fields]).empty? &&
|
101
|
+
(kf = options.fetch(:key_field, options[:key_fields]))
|
102
|
+
@key_fields = [kf].flatten
|
103
|
+
@parent_fields = @key_fields[0...-1]
|
104
|
+
@child_fields = @key_fields[-1..-1]
|
105
|
+
else
|
106
|
+
@parent_fields = [options.fetch(:parent_field, options[:parent_fields]) || []].flatten
|
107
|
+
@child_fields = [options.fetch(:child_field, options[:child_fields]) || [0]].flatten
|
108
|
+
@key_fields = @parent_fields + @child_fields
|
109
|
+
end
|
110
|
+
@field_names = options[:field_names]
|
111
|
+
@case_sensitive = options.fetch(:case_sensitive, true)
|
112
|
+
@trim_whitespace = options.fetch(:trim_whitespace, false)
|
113
|
+
@ignore_header = options[:ignore_header]
|
114
|
+
@include = options[:include]
|
115
|
+
@exclued = options[:exclude]
|
116
|
+
@path = options.fetch(:path, 'NA') unless @path
|
117
|
+
@warnings = []
|
118
|
+
end
|
119
|
+
|
120
|
+
|
121
|
+
def path?
|
122
|
+
@path != 'NA'
|
123
|
+
end
|
124
|
+
|
125
|
+
|
126
|
+
# Returns the row in the CSV source corresponding to the supplied key.
|
127
|
+
#
|
128
|
+
# @param key [String] The unique key to use to lookup the row.
|
129
|
+
# @return [Hash] The fields for the line corresponding to +key+, or nil
|
130
|
+
# if the key is not recognised.
|
131
|
+
def [](key)
|
132
|
+
@lines[key]
|
133
|
+
end
|
134
|
+
|
135
|
+
|
136
|
+
# Given an array of lines, where each line is an array of fields, indexes
|
137
|
+
# the array contents so that it can be looked up by key.
|
138
|
+
def index_source
|
139
|
+
@lines = {}
|
140
|
+
@index = Hash.new{ |h, k| h[k] = [] }
|
141
|
+
if @field_names
|
142
|
+
index_fields
|
143
|
+
include_filter = convert_filter(@include, @field_names)
|
144
|
+
exclude_filter = convert_filter(@exclude, @field_names)
|
145
|
+
end
|
146
|
+
@line_count = 0
|
147
|
+
@skip_count = 0
|
148
|
+
@dup_count = 0
|
149
|
+
line_num = 0
|
150
|
+
@data.each do |row|
|
151
|
+
line_num += 1
|
152
|
+
next if line_num == 1 && @field_names && @ignore_header
|
153
|
+
unless @field_names
|
154
|
+
if row.class.name == 'CSV::Row'
|
155
|
+
@field_names = row.headers.each_with_index.map{ |f, i| f || i.to_s }
|
156
|
+
else
|
157
|
+
@field_names = row.each_with_index.map{ |f, i| f || i.to_s }
|
158
|
+
end
|
159
|
+
index_fields
|
160
|
+
include_filter = convert_filter(@include, @field_names)
|
161
|
+
exclude_filter = convert_filter(@exclude, @field_names)
|
162
|
+
next
|
163
|
+
end
|
164
|
+
field_vals = row
|
165
|
+
line = {}
|
166
|
+
filter = false
|
167
|
+
@field_names.each_with_index do |field, i|
|
168
|
+
val = field_vals[i]
|
169
|
+
val = val.to_s.strip if val && @trim_whitespace
|
170
|
+
line[field] = val
|
171
|
+
if include_filter && f = include_filter[i]
|
172
|
+
filter = !check_filter(f, line[field])
|
173
|
+
end
|
174
|
+
if exclude_filter && f = exclude_filter[i]
|
175
|
+
filter = check_filter(f, line[field])
|
176
|
+
end
|
177
|
+
break if filter
|
178
|
+
end
|
179
|
+
if filter
|
180
|
+
@skip_count += 1
|
181
|
+
next
|
182
|
+
end
|
183
|
+
key_values = @key_field_indexes.map{ |kf| @case_sensitive ?
|
184
|
+
field_vals[kf].to_s :
|
185
|
+
field_vals[kf].to_s.upcase }
|
186
|
+
key = key_values.join('~')
|
187
|
+
parent_key = key_values[0...(@parent_fields.length)].join('~')
|
188
|
+
if @lines[key]
|
189
|
+
@warnings << "Duplicate key '#{key}' encountered at line #{line_num}"
|
190
|
+
@dup_count += 1
|
191
|
+
key += "[#{@dup_count}]"
|
192
|
+
end
|
193
|
+
@index[parent_key] << key
|
194
|
+
@lines[key] = line
|
195
|
+
@line_count += 1
|
196
|
+
end
|
197
|
+
end
|
198
|
+
|
199
|
+
|
200
|
+
# Save the data in this Source as a CSV at +file_path+.
|
201
|
+
#
|
202
|
+
# @param file_path [String] The target path to save the data to.
|
203
|
+
# @param options [Hash] A set of options to pass to CSV.open to control
|
204
|
+
# how the CSV is generated.
|
205
|
+
def save_csv(file_path, options = {})
|
206
|
+
require 'csv'
|
207
|
+
default_opts = {
|
208
|
+
headers: @field_name, write_headers: true
|
209
|
+
}
|
210
|
+
CSV.open(file_path, 'wb', default_opts.merge(options)) do |csv|
|
211
|
+
@data.each{ |rec| csv << rec }
|
212
|
+
end
|
213
|
+
end
|
214
|
+
|
215
|
+
|
216
|
+
private
|
217
|
+
|
218
|
+
|
219
|
+
# Resolves the key, parent and child field specifications against the
# current @field_names. Each specification is first converted to a list of
# field indexes (via find_field_indexes), and is then replaced by the
# corresponding entries of @field_names.
def index_fields
  names = @field_names

  @key_field_indexes = find_field_indexes(@key_fields, names)
  @parent_field_indexes = find_field_indexes(@parent_fields, names)
  @child_field_indexes = find_field_indexes(@child_fields, names)

  @key_fields = names.values_at(*@key_field_indexes)
  @parent_fields = names.values_at(*@parent_field_indexes)
  @child_fields = names.values_at(*@child_field_indexes)
end
|
227
|
+
|
228
|
+
|
229
|
+
# Translates a list of field references into a list of field indexes.
#
# Integer entries are taken to be indexes already and are passed through
# unchanged; any other entry is compared, case-insensitively and by its
# String form, against each of +field_names+.
#
# @param key_fields [Array] Field names and/or Integer indexes to resolve.
# @param field_names [Array] The field names to search.
# @return [Array<Integer>] The index of each requested field, in the order
#   the fields were requested.
# @raise [ArgumentError] If a named field does not appear in +field_names+.
def find_field_indexes(key_fields, field_names)
  key_fields.map do |field|
    next field if field.is_a?(Integer)

    wanted = field.to_s.downcase
    position = field_names.index { |field_name| wanted == field_name.to_s.downcase }
    unless position
      raise ArgumentError, "Could not locate field '#{field}' in source field names: #{field_names.join(', ')}"
    end
    position
  end
end
|
242
|
+
|
243
|
+
|
244
|
+
# Converts an :include/:exclude filter Hash, keyed by field name or index,
# into an equivalent Hash keyed by field index.
#
# @param hsh [Hash, nil] Maps field names or indexes to filter expressions.
# @param field_names [Array] The field names against which named keys are
#   resolved.
# @return [Hash, nil] The filter expressions keyed by field index, or nil
#   when no filter was supplied.
# @raise [ArgumentError] If +hsh+ is supplied but is not a Hash.
def convert_filter(hsh, field_names)
  return unless hsh
  unless hsh.is_a?(Hash)
    raise ArgumentError, ":include/:exclude option must be a Hash of field name(s)/index(es) to RegExp(s)"
  end

  # Resolve keys against the names passed in, not the @field_names ivar, so
  # that the method honours its own parameter.
  idxs = find_field_indexes(hsh.keys, field_names)
  Hash[idxs.zip(hsh.values)]
end
|
253
|
+
|
254
|
+
|
255
|
+
# Tests a single field value against a filter expression.
#
# @param filter [String, Regexp, Proc] The filter expression. A String is
#   compared for equality (ignoring case unless @case_sensitive is set), a
#   Regexp is matched against the value, and a Proc is called with the
#   value.
# @param field_val [Object] The field value to be tested.
# @return [Object] A truthy value if +field_val+ satisfies +filter+, and a
#   falsy value otherwise.
# @raise [ArgumentError] If +filter+ is not a String, Regexp or Proc.
def check_filter(filter, field_val)
  case filter
  when String
    if @case_sensitive
      filter == field_val
    else
      filter.downcase == field_val.to_s.downcase
    end
  when Regexp
    # A nil value never matches. Any other value is matched on its String
    # form, so that non-String values (e.g. Integers) do not raise TypeError.
    filter.match(field_val.to_s) unless field_val.nil?
  when Proc
    filter.call(field_val)
  else
    raise ArgumentError, "Unsupported filter expression: #{filter.inspect}"
  end
end
|
271
|
+
|
272
|
+
end
|
273
|
+
|
274
|
+
end
|
275
|
+
|
@@ -0,0 +1,142 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'cgi'
|
3
|
+
|
4
|
+
|
5
|
+
class CSVDiff

  # Convert XML content to CSV format using XPath selectors to identify the
  # rows and field values in an XML document
  class XMLSource < Source

    attr_accessor :context

    # Create a new XMLSource, identified by +path+. Normally this is a path
    # to the XML document, but any value is fine, as it is really just a
    # label to identify this data set.
    #
    # @param path [String] A label for this data set (often a path to the
    #   XML document used as the source).
    # @param options [Hash] An options hash.
    # @option options [Array<String>] :field_names The names of each of the
    #   fields in +source+.
    # @option options [Boolean] :ignore_header If true, and :field_names has
    #   been specified, then the first row of the file is ignored.
    # @option options [String] :key_field The name of the field that uniquely
    #   identifies each row.
    # @option options [Array<String>] :key_fields The names of the fields
    #   that uniquely identify each row.
    # @option options [String] :parent_field The name of the field(s) that
    #   identify a parent within which sibling order should be checked.
    # @option options [String] :child_field The name of the field(s) that
    #   uniquely identify a child of a parent.
    # @option options [Boolean] :case_sensitive If true (the default), keys
    #   are indexed as-is; if false, the index is built in upper-case for
    #   case-insensitive comparisons.
    # @option options [Hash] :include A hash of field name(s) or index(es) to
    #   regular expression(s). Only source rows whose field values satisfy the
    #   regular expressions will be indexed and included in the diff process.
    # @option options [Hash] :exclude A hash of field name(s) or index(es) to
    #   regular expression(s). Source rows with a field value that satisfies
    #   the regular expressions will be excluded from the diff process.
    # @option options [String] :context A context value from which fields
    #   can be populated using a Regexp.
    def initialize(path, options = {})
      super(options)
      @path = path
      @context = options[:context]
      @data = []
    end


    # Process a +source+, converting the XML into a table of data, using
    # +rec_xpath+ to identify the nodes that correspond to each record that
    # should appear in the output, and +field_maps+ to populate each field
    # in each row.
    #
    # @param source [Nokogiri::XML::Document|String|Array] may be a parsed
    #   XML document, a String containing XML content (recognised by the
    #   presence of an <?xml declaration), an Array of paths to files
    #   containing XML content, or a path to a single file.
    # @param rec_xpath [String] An XPath expression that selects all the
    #   items in the XML document that are to be converted into new rows.
    #   The returned items are not directly used to populate the fields,
    #   but provide a context for the field XPath expressions that populate
    #   each field's content.
    # @param field_maps [Hash<String, String>] A map of field names to
    #   expressions that are evaluated in the context of each row node
    #   selected by +rec_xpath+. The field expressions are typically XPath
    #   expressions evaluated in the context of the nodes returned by the
    #   +rec_xpath+. Alternatively, a String that is not an XPath expression
    #   is used as a literal value for a field, while a Regexp can also
    #   be used to pull a value from any context specified in the +options+
    #   hash. The Regexp should include a single grouping, as the value used
    #   will be the first capture group after the match is performed.
    # @param context [String] An optional context for the XML to be processed.
    #   The value passed here can be referenced in field map expressions
    #   using a Regexp, with the value of the first grouping in the regex
    #   being the value returned for the field. When omitted, the :context
    #   option given at construction is used; for file sources, the file
    #   path is the final fallback.
    # @return [Array<Array>] All rows accumulated by this source so far.
    # @raise [ArgumentError] If +source+ is not one of the supported types.
    def process(source, rec_xpath, field_maps, context = nil)
      @field_names = field_maps.keys unless @field_names
      case source
      when Nokogiri::XML::Document
        add_data(source, rec_xpath, field_maps, context || @context)
      when /<\?xml/
        doc = Nokogiri::XML(source)
        add_data(doc, rec_xpath, field_maps, context || @context)
      when Array
        # Pass the explicit context through, so it is honoured for files too
        source.each { |f| process_file(f, rec_xpath, field_maps, context) }
      when String
        process_file(source, rec_xpath, field_maps, context)
      else
        raise ArgumentError, "Unhandled source type #{source.class.name}"
      end
      @data
    end


    private


    # Load the XML document at +file_path+ and process it into rows of data.
    #
    # The context used for Regexp field expressions is the first of
    # +context+, the @context of this source, or +file_path+ that is set.
    # Any error is reported on STDERR and then re-raised.
    def process_file(file_path, rec_xpath, field_maps, context = nil)
      File.open(file_path) do |f|
        doc = Nokogiri::XML(f)
        add_data(doc, rec_xpath, field_maps, context || @context || file_path)
      end
    rescue
      STDERR.puts "An error occurred while attempting to open #{file_path}"
      raise
    end


    # Locate records in +doc+ using +rec_xpath+ to identify the nodes that
    # correspond to a new record in the data, and +field_maps+ to populate
    # the fields in each row. Each row is appended to @data.
    def add_data(doc, rec_xpath, field_maps, context)
      doc.xpath(rec_xpath).each do |rec_node|
        rec = field_maps.map do |_field_name, expr|
          case expr
          when Regexp # Match context against Regexp and extract first grouping
            if context
              context =~ expr
              Regexp.last_match(1)
            end
          when %r{[/(.@]} # XPath expression
            res = rec_node.xpath(expr)
            CGI.unescape_html(res.to_s)
          else # Use expr as the value for this field
            expr
          end
        end
        @data << rec
      end
    end

  end

end
|
142
|
+
|
metadata
CHANGED
@@ -1,27 +1,44 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csv-diff
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adam Gardiner
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-07-15 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
|
-
description:
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
13
|
+
description: |2
|
14
|
+
This library performs diffs of CSV data, or any table-like source.
|
15
|
+
|
16
|
+
Unlike a standard diff that compares line by line, and is sensitive to the
|
17
|
+
ordering of records, CSV-Diff identifies common lines by key field(s), and
|
18
|
+
then compares the contents of the fields in each line.
|
19
|
+
|
20
|
+
Data may be supplied in the form of CSV files, or as an array of arrays. The
|
21
|
+
diff process provides a fine level of control over what to diff, and can
|
22
|
+
optionally ignore certain types of changes (e.g. changes in position).
|
23
|
+
|
24
|
+
CSV-Diff is particularly well suited to data in parent-child format. Parent-
|
25
|
+
child data does not lend itself well to standard text diffs, as small changes
|
26
|
+
in the organisation of the tree at an upper level can lead to big movements
|
27
|
+
in the position of descendant records. By instead matching records by key,
|
28
|
+
CSV-Diff avoids this issue, while still being able to detect changes in
|
29
|
+
sibling order.
|
30
|
+
|
31
|
+
This gem implements the core diff algorithm, and handles the loading and
|
32
|
+
diffing of CSV files (or Arrays of Arrays). It also supports converting
|
33
|
+
data in XML format into tabular form, so that it can then be processed
|
34
|
+
like any other CSV or table-like source. It returns a CSVDiff object
|
35
|
+
containing the details of differences in object form. This is useful for
|
36
|
+
projects that need diff capability, but want to handle the reporting or
|
37
|
+
actioning of differences themselves.
|
38
|
+
|
39
|
+
For a pre-built diff reporting capability, see the csv-diff-report gem,
|
40
|
+
which provides a command-line tool for generating diff reports in HTML,
|
41
|
+
Excel, or text formats.
|
25
42
|
email: adam.b.gardiner@gmail.com
|
26
43
|
executables: []
|
27
44
|
extensions: []
|
@@ -33,9 +50,12 @@ files:
|
|
33
50
|
- lib/csv-diff/algorithm.rb
|
34
51
|
- lib/csv-diff/csv_diff.rb
|
35
52
|
- lib/csv-diff/csv_source.rb
|
53
|
+
- lib/csv-diff/source.rb
|
54
|
+
- lib/csv-diff/xml_source.rb
|
36
55
|
- lib/csv_diff.rb
|
37
56
|
homepage: https://github.com/agardiner/csv-diff
|
38
|
-
licenses:
|
57
|
+
licenses:
|
58
|
+
- MIT
|
39
59
|
metadata: {}
|
40
60
|
post_install_message: For command-line tools and diff reports, 'gem install csv-diff-report'
|
41
61
|
rdoc_options: []
|
@@ -43,18 +63,18 @@ require_paths:
|
|
43
63
|
- lib
|
44
64
|
required_ruby_version: !ruby/object:Gem::Requirement
|
45
65
|
requirements:
|
46
|
-
- -
|
66
|
+
- - ">="
|
47
67
|
- !ruby/object:Gem::Version
|
48
68
|
version: '0'
|
49
69
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
50
70
|
requirements:
|
51
|
-
- -
|
71
|
+
- - ">="
|
52
72
|
- !ruby/object:Gem::Version
|
53
73
|
version: '0'
|
54
74
|
requirements: []
|
55
75
|
rubyforge_project:
|
56
|
-
rubygems_version: 2.
|
76
|
+
rubygems_version: 2.5.2.3
|
57
77
|
signing_key:
|
58
78
|
specification_version: 4
|
59
|
-
summary: CSV Diff is a library for generating diffs from data in CSV format
|
79
|
+
summary: CSV Diff is a library for generating diffs from data in CSV or XML format
|
60
80
|
test_files: []
|