csv-diff 0.2 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- ZGY3NTBiZGFiNTQxMmZiZmMwOTE2MDIzYmEwNWQ3NTY0MzZmNjRkMg==
5
- data.tar.gz: !binary |-
6
- NzAyNTY1OTk3MTA3Y2ZhNjk2YWRmNTJkYTljNGZhZDY2YjQ1OTg2ZQ==
2
+ SHA1:
3
+ metadata.gz: d1b3b8deee34344d334e740285cb1f3c99074694
4
+ data.tar.gz: d95158d13861cb66fd460ee430714ec3c83cd0b1
7
5
  SHA512:
8
- metadata.gz: !binary |-
9
- ZDRmODg2NDlhYTY4NDM1MWYxNjgyZGQ0MGRiZDNkYmI3MDkwYjFkZWVhOWYw
10
- YmVkNDVhMjk1M2EyNjFkZGIxOGE0Y2MwOWQwMWRhNzhjZDk2N2RhZmEyZGRm
11
- MjliNmM4Y2ZmNzY4ZTJkY2EzZWY4Mjg3NmU3ZjQxM2RkYTBjODE=
12
- data.tar.gz: !binary |-
13
- YTljOGZhNDY0YzdjZGYzMTA2NTM3MzIwNDg4MTcwYWEyM2IyZTc1YWYxMjFm
14
- YjMwYWU1NWMzNGVkZGRkYWYyZjUwMTQ2MWZlMjdkNjQwMjIwYWUwNmNlYjM3
15
- NDlmZDk5MGNlMTk4ZDhlMzFiOGUyZTIwY2EyZTY3MjUwYjc2NWY=
6
+ metadata.gz: 50c74d6a4093012b0ba44fef70c2d749348d6777cfb9f2cfda66c6e075423191a4c6c22019a388b9d8bd14e22ac60d539f4e3b4aa85fd87fd774a64da15858c7
7
+ data.tar.gz: 8fa030a54e7a97db9913b3c36a1942de1e07a6549f9ae7aa58b5b3f44d522fe11f72d44e18b6b7612d2b2dc9f106ece1fea183557c507bcf18316891ab63f230
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2013, Adam Gardiner
1
+ Copyright (c) 2013-2016, Adam Gardiner
2
2
  All rights reserved.
3
3
 
4
4
  Redistribution and use in source and binary forms, with or without
data/README.md CHANGED
@@ -125,7 +125,7 @@ column in the data. In this case, a diff can be created simply via:
125
125
  diff = CSVDiff.new(file1, file2)
126
126
  ```
127
127
 
128
- ### Specifynig Unique Row Identifiers
128
+ ### Specifying Unique Row Identifiers
129
129
 
130
130
  Often however, rows are not uniquely identifiable via the first column in the file.
131
131
  In a parent-child hierarchy, for example, combinations of parent and child may be
@@ -211,6 +211,23 @@ diff = CSVDiff.new(file1, file2, parent_field: 'Date', child_fields: ['HomeTeam'
211
211
  ignore_fields: ['CreatedAt', 'UpdatedAt'])
212
212
  ```
213
213
 
214
+ ### Filtering Rows
215
+
216
+ If you need to filter source data before running the diff process, you can use the :include
217
+ and :exclude options to do so. Both options take a Hash as their value; the hash should have
218
+ keys that are the field names or indexes (0-based) on which to filter, and whose values are
219
+ regular expressions or lambdas to be applied to values of the corresponding field. Rows will
220
+ only be diffed if they satisfy :include conditions, and do not satisfy :exclude conditions.
221
+ ```ruby
222
+ # Generate a diff of Arsenal home games not refereed by Clattenburg
223
+ diff = CSVDiff.new(file1, file2, parent_field: 'Date', child_fields: ['HomeTeam', 'AwayTeam'],
224
+ include: {HomeTeam: 'Arsenal'}, exclude: {Referee: /Clattenburg/})
225
+
226
+ # Generate a diff of games played over the Xmas/New Year period
227
+ diff = CSVDiff.new(file1, file2, parent_field: 'Date', child_fields: ['HomeTeam', 'AwayTeam'],
228
+ include: {Date: lambda{ |d| holiday_period.include?(Date.strptime(d, '%y/%m/%d')) } })
229
+ ```
230
+
214
231
  ### Ignoring Certain Changes
215
232
 
216
233
  CSVDiff identifies Adds, Updates, Moves and Deletes; any of these changes can be selectively
@@ -1,3 +1,4 @@
1
+ require 'csv-diff/source'
1
2
  require 'csv-diff/csv_source'
2
3
  require 'csv-diff/algorithm'
3
4
  require 'csv-diff/csv_diff'
@@ -3,6 +3,55 @@ class CSVDiff
3
3
  # Implements the CSV diff algorithm.
4
4
  module Algorithm
5
5
 
6
+ # Holds the details of a single difference
7
+ class Diff
8
+
9
+ attr_accessor :diff_type
10
+ attr_reader :fields
11
+ attr_reader :row
12
+ attr_reader :sibling_position
13
+
14
+ def initialize(diff_type, fields, row_idx, pos_idx)
15
+ @diff_type = diff_type
16
+ @fields = fields
17
+ @row = row_idx + 1
18
+ self.sibling_position = pos_idx
19
+ end
20
+
21
+
22
+ def sibling_position=(pos_idx)
23
+ if pos_idx.is_a?(Array)
24
+ pos_idx.compact!
25
+ if pos_idx.first != pos_idx.last
26
+ @sibling_position = pos_idx.map{ |pos| pos + 1 }
27
+ else
28
+ @sibling_position = pos_idx.first + 1
29
+ end
30
+ else
31
+ @sibling_position = pos_idx + 1
32
+ end
33
+ end
34
+
35
+
36
+ # For backwards compatibility and access to fields with differences
37
+ def [](key)
38
+ case key
39
+ when :action
40
+ a = diff_type.to_s
41
+ a[0] = a[0].upcase
42
+ a
43
+ when :row
44
+ @row
45
+ when :sibling_position
46
+ @sibling_position
47
+ else
48
+ @fields[key]
49
+ end
50
+ end
51
+
52
+ end
53
+
54
+
6
55
  # Diffs two CSVSource structures.
7
56
  #
8
57
  # @param left [CSVSource] A CSVSource object containing the contents of
@@ -13,24 +62,70 @@ class CSVDiff
13
62
  # that uniquely identify each row.
14
63
  # @param diff_fields [Array] An array containing the names of the fields
15
64
  # to be diff-ed.
65
+ # @param options [Hash] An options hash.
66
+ # @option options [Boolean] :ignore_adds If set to true, we ignore any
67
+ # new items that appear only in +right+.
68
+ # @option options [Boolean] :ignore_moves If set to true, we ignore any
69
+ # changes in sibling order.
70
+ # @option options [Boolean] :ignore_updates If set to true, we ignore any
71
+ # items that exist in both +left+ and +right+.
72
+ # @option options [Boolean] :ignore_deletes If set to true, we ignore any
73
+ # items that appear only in +left+.
74
+ # @option options [Hash<Object,Proc>] :equality_procs A Hash mapping fields
75
+ # to a 2-arg Proc that should be used to compare values in that field for
76
+ # equality.
16
77
  def diff_sources(left, right, key_fields, diff_fields, options = {})
78
+ unless left.case_sensitive? == right.case_sensitive?
79
+ raise ArgumentError, "Left and right must have same settings for case-sensitivity"
80
+ end
81
+ unless left.parent_fields.length == right.parent_fields.length
82
+ raise ArgumentError, "Left and right must have same settings for parent/child fields"
83
+ end
84
+
85
+ # Ensure key fields are not also in the diff_fields
86
+ diff_fields = diff_fields - key_fields
87
+
17
88
  left_index = left.index
18
89
  left_values = left.lines
19
90
  left_keys = left_values.keys
20
91
  right_index = right.index
21
92
  right_values = right.lines
22
93
  right_keys = right_values.keys
23
- parent_fields = left.parent_fields.length
94
+ parent_field_count = left.parent_fields.length
24
95
 
25
96
  include_adds = !options[:ignore_adds]
26
97
  include_moves = !options[:ignore_moves]
27
98
  include_updates = !options[:ignore_updates]
28
99
  include_deletes = !options[:ignore_deletes]
29
100
 
30
- diffs = Hash.new{ |h, k| h[k] = {} }
101
+ @case_sensitive = left.case_sensitive?
102
+ @equality_procs = options.fetch(:equality_procs, {})
103
+
104
+ diffs = {}
105
+ potential_moves = Hash.new{ |h, k| h[k] = [] }
106
+
107
+ # First identify deletions
108
+ if include_deletes
109
+ (left_keys - right_keys).each do |key|
110
+ # Delete
111
+ key_vals = key.split('~', -1)
112
+ parent = key_vals[0...parent_field_count].join('~')
113
+ child = key_vals[parent_field_count..-1].join('~')
114
+ left_parent = left_index[parent]
115
+ left_value = left_values[key]
116
+ row_idx = left_keys.index(key)
117
+ sib_idx = left_parent.index(key)
118
+ raise "Can't locate key #{key} in parent #{parent}" unless sib_idx
119
+ diffs[key] = Diff.new(:delete, left_value, row_idx, sib_idx)
120
+ potential_moves[child] << key
121
+ #puts "Delete: #{key}"
122
+ end
123
+ end
124
+
125
+ # Now identify adds/updates
31
126
  right_keys.each_with_index do |key, right_row_id|
32
- key_vals = key.split('~')
33
- parent = key_vals[0...parent_fields].join('~')
127
+ key_vals = key.split('~', -1)
128
+ parent = key_vals[0...parent_field_count].join('~')
34
129
  left_parent = left_index[parent]
35
130
  right_parent = right_index[parent]
36
131
  left_value = left_values[key]
@@ -38,13 +133,12 @@ class CSVDiff
38
133
  left_idx = left_parent && left_parent.index(key)
39
134
  right_idx = right_parent && right_parent.index(key)
40
135
 
41
- id = {}
42
- id[:row] = right_row_id + 1
43
- id[:sibling_position] = right_idx + 1
44
- key_fields.each do |field_name|
45
- id[field_name] = right_value[field_name]
46
- end
47
136
  if left_idx && right_idx
137
+ if include_updates && (changes = diff_row(left_value, right_value, diff_fields))
138
+ id = id_fields(key_fields, right_value)
139
+ diffs[key] = Diff.new(:update, id.merge!(changes), right_row_id, right_idx)
140
+ #puts "Change: #{key}"
141
+ end
48
142
  if include_moves
49
143
  left_common = left_parent & right_parent
50
144
  right_common = right_parent & left_parent
@@ -52,42 +146,34 @@ class CSVDiff
52
146
  right_pos = right_common.index(key)
53
147
  if left_pos != right_pos
54
148
  # Move
55
- diffs[key].merge!(id.merge!(:action => 'Move',
56
- :sibling_position => [left_idx + 1, right_idx + 1]))
149
+ if d = diffs[key]
150
+ d.sibling_position = [left_idx, right_idx]
151
+ else
152
+ id = id_fields(key_fields, right_value)
153
+ diffs[key] = Diff.new(:move, id, right_row_id, [left_idx, right_idx])
154
+ end
57
155
  #puts "Move #{left_idx} -> #{right_idx}: #{key}"
58
156
  end
59
157
  end
60
- if include_updates && (changes = diff_row(left_value, right_value, diff_fields))
61
- diffs[key].merge!(id.merge(changes.merge(:action => 'Update')))
62
- #puts "Change: #{key}"
63
- end
64
- elsif include_adds && right_idx
158
+ elsif right_idx
65
159
  # Add
66
- diffs[key].merge!(id.merge(right_values[key].merge(:action => 'Add')))
67
- #puts "Add: #{key}"
68
- end
69
- end
70
-
71
- # Now identify deletions
72
- if include_deletes
73
- (left_keys - right_keys).each do |key|
74
- # Delete
75
- key_vals = key.split('~')
76
- parent = key_vals[0...parent_fields].join('~')
77
- left_parent = left_index[parent]
78
- left_value = left_values[key]
79
- left_idx = left_parent.index(key)
80
- next unless left_idx
81
- id = {}
82
- id[:row] = left_keys.index(key) + 1
83
- id[:sibling_position] = left_idx + 1
84
- key_fields.each do |field_name|
85
- id[field_name] = left_value[field_name]
160
+ child = key_vals[parent_field_count..-1].join('~')
161
+ if potential_moves.has_key?(child) && old_key = potential_moves[child].pop
162
+ diffs.delete(old_key)
163
+ if include_updates
164
+ left_value = left_values[old_key]
165
+ id = id_fields(right.child_fields, right_value)
166
+ changes = diff_row(left_value, right_value, left.parent_fields + diff_fields)
167
+ diffs[key] = Diff.new(:update, id.merge!(changes), right_row_id, right_idx)
168
+ #puts "Update Parent: #{key}"
169
+ end
170
+ elsif include_adds
171
+ diffs[key] = Diff.new(:add, right_value, right_row_id, right_idx)
172
+ #puts "Add: #{key}"
86
173
  end
87
- diffs[key].merge!(id.merge(left_values[key].merge(:action => 'Delete')))
88
- #puts "Delete: #{key}"
89
174
  end
90
175
  end
176
+
91
177
  diffs
92
178
  end
93
179
 
@@ -99,24 +185,42 @@ class CSVDiff
99
185
  # file.
100
186
  # @param right_row [Hash] The version of the CSV row from the right/to
101
187
  # file.
188
+ # @param fields [Array<String>] An array of field names to compare.
102
189
  # @return [Hash<String, Array>] A Hash whose keys are the fields that
103
190
  # contain differences, and whose values are a two-element array of
104
191
  # [left/from, right/to] values.
105
192
  def diff_row(left_row, right_row, fields)
106
193
  diffs = {}
107
194
  fields.each do |attr|
195
+ eq_proc = @equality_procs[attr]
108
196
  right_val = right_row[attr]
109
197
  right_val = nil if right_val == ""
110
198
  left_val = left_row[attr]
111
199
  left_val = nil if left_val == ""
112
- if left_val != right_val
200
+ if eq_proc
201
+ diffs[attr] = [left_val, right_val] unless eq_proc.call(left_val, right_val)
202
+ elsif @case_sensitive
203
+ diffs[attr] = [left_val, right_val] unless left_val == right_val
204
+ elsif (left_val.to_s.upcase != right_val.to_s.upcase)
113
205
  diffs[attr] = [left_val, right_val]
114
- #puts "#{attr}: #{left_val} -> #{right_val}"
115
206
  end
116
207
  end
117
208
  diffs if diffs.size > 0
118
209
  end
119
210
 
211
+
212
+ private
213
+
214
+
215
+ # Return a hash containing just the key field values
216
+ def id_fields(key_fields, fields)
217
+ id = {}
218
+ key_fields.each do |field_name|
219
+ id[field_name] = fields[field_name]
220
+ end
221
+ id
222
+ end
223
+
120
224
  end
121
225
 
122
226
  end
@@ -28,13 +28,15 @@ class CSVDiff
28
28
  # @return [Array<String>] An array of field names that are compared in the
29
29
  # diff process.
30
30
  attr_reader :diff_fields
31
- # @return [Array<Fixnum>] An array of field indexes identifying the key
32
- # fields that uniquely identify each row.
31
+ # @return [Array<String>] An array of field names of the key fields that
32
+ # uniquely identify each row.
33
33
  attr_reader :key_fields
34
34
  # @return [Array<String>] An array of field names for the parent field(s).
35
35
  attr_reader :parent_fields
36
36
  # @return [Array<String>] An array of field names for the child field(s).
37
37
  attr_reader :child_fields
38
+ # @return [Hash] The options hash used for the diff.
39
+ attr_reader :options
38
40
 
39
41
 
40
42
  # Generates a diff between two hierarchical tree structures, provided
@@ -79,13 +81,15 @@ class CSVDiff
79
81
  # @option options [Boolean] :ignore_deletes If true, records that appear
80
82
  # in the left/from file but not in the right/to file are not reported.
81
83
  def initialize(left, right, options = {})
82
- @left = left.is_a?(CSVSource) ? left : CSVSource.new(left, options)
84
+ @left = left.is_a?(Source) ? left : CSVSource.new(left, options)
85
+ @left.index_source if @left.lines.nil?
83
86
  raise "No field names found in left (from) source" unless @left.field_names && @left.field_names.size > 0
84
- @right = right.is_a?(CSVSource) ? right : CSVSource.new(right, options)
87
+ @right = right.is_a?(Source) ? right : CSVSource.new(right, options)
88
+ @right.index_source if @right.lines.nil?
85
89
  raise "No field names found in right (to) source" unless @right.field_names && @right.field_names.size > 0
86
90
  @warnings = []
87
- @diff_fields = get_diff_fields(@left.field_names, @right.field_names, options[:ignore_fields])
88
- @key_fields = @left.key_fields.map{ |kf| @diff_fields[kf] }
91
+ @diff_fields = get_diff_fields(@left.field_names, @right.field_names, options)
92
+ @key_fields = @left.key_fields
89
93
  diff(options)
90
94
  end
91
95
 
@@ -93,6 +97,7 @@ class CSVDiff
93
97
  # Performs a diff with the specified +options+.
94
98
  def diff(options = {})
95
99
  @summary = nil
100
+ @options = options
96
101
  @diffs = diff_sources(@left, @right, @key_fields, @diff_fields, options)
97
102
  end
98
103
 
@@ -134,15 +139,21 @@ class CSVDiff
134
139
 
135
140
  # Given two sets of field names, determines the common set of fields present
136
141
  # in both, on which members can be diffed.
137
- def get_diff_fields(left_fields, right_fields, ignore_fields)
142
+ def get_diff_fields(left_fields, right_fields, options)
143
+ ignore_fields = options.fetch(:ignore_fields, [])
144
+ ignore_fields = [ignore_fields] unless ignore_fields.is_a?(Array)
145
+ ignore_fields.map! do |f|
146
+ (f.is_a?(Numeric) ? right_fields[f] : f).upcase
147
+ end
138
148
  diff_fields = []
139
- right_fields.each_with_index do |fld, i|
140
- if left_fields.include?(fld)
141
- diff_fields << fld unless ignore_fields && (ignore_fields.include?(fld) ||
142
- ignore_fields.include?(i))
143
- else
144
- @warnings << "Field '#{fld}' is missing from the left (from) file, and won't be diffed"
149
+ if options[:diff_common_fields_only]
150
+ right_fields.each_with_index do |fld, i|
151
+ if left_fields.include?(fld)
152
+ diff_fields << fld unless ignore_fields.include?(fld.upcase)
153
+ end
145
154
  end
155
+ else
156
+ diff_fields = (right_fields + left_fields).uniq.reject{ |fld| ignore_fields.include?(fld.upcase) }
146
157
  end
147
158
  diff_fields
148
159
  end
@@ -2,31 +2,7 @@ class CSVDiff
2
2
 
3
3
  # Represents a CSV input (i.e. the left/from or right/to input) to the diff
4
4
  # process.
5
- class CSVSource
6
-
7
- # @return [String] the path to the source file
8
- attr_accessor :path
9
- # @return [Array<String>] The names of the fields in the source file
10
- attr_reader :field_names
11
- # @return [Array<String>] The names of the field(s) that uniquely
12
- # identify each row.
13
- attr_reader :key_fields
14
- # @return [Array<String>] The names of the field(s) that identify a
15
- # common parent of child records.
16
- attr_reader :parent_fields
17
- # @return [Array<String>] The names of the field(s) that distinguish a
18
- # child of a parent record.
19
- attr_reader :child_fields
20
- # @return [Hash<String,Hash>] A hash containing each line of the source,
21
- # keyed on the values of the +key_fields+.
22
- attr_reader :lines
23
- # @return [Hash<String,Array<String>>] A hash containing each parent key,
24
- # and an Array of the child keys it is a parent of.
25
- attr_reader :index
26
- # @return [Array<String>] An array of any warnings encountered while
27
- # processing the source.
28
- attr_reader :warnings
29
-
5
+ class CSVSource < Source
30
6
 
31
7
  # Creates a new diff source.
32
8
  #
@@ -59,90 +35,37 @@ class CSVDiff
59
35
  # identifies each row.
60
36
  # @option options [Array<String>] :key_fields The names of the fields
61
37
  # that uniquely identifies each row.
62
- # @option options [String] :parent_field The name of the field that
63
- # identifies a parent within which sibling order should be checked.
64
- # @option options [String] :child_field The name of the field that
65
- # uniquely identifies a child of a parent.
38
+ # @option options [String] :parent_field The name of the field(s) that
39
+ # identify a parent within which sibling order should be checked.
40
+ # @option options [String] :child_field The name of the field(s) that
41
+ # uniquely identify a child of a parent.
42
+ # @option options [Boolean] :case_sensitive If true (the default), keys
43
+ # are indexed as-is; if false, the index is built in upper-case for
44
+ # case-insensitive comparisons.
45
+ # @option options [Hash] :include A hash of field name(s) or index(es) to
46
+ # regular expression(s). Only source rows whose field values satisfy the
47
+ # regular expressions will be indexed and included in the diff process.
48
+ # @option options [Hash] :exclude A hash of field name(s) or index(es) to
49
+ # regular expression(s). Source rows with a field value that satisfies
50
+ # the regular expressions will be excluded from the diff process.
66
51
  def initialize(source, options = {})
52
+ super(options)
67
53
  if source.is_a?(String)
68
54
  require 'csv'
69
55
  mode_string = options[:encoding] ? "r:#{options[:encoding]}" : 'r'
70
56
  csv_options = options.fetch(:csv_options, {})
71
57
  @path = source
72
- source = CSV.open(@path, mode_string, csv_options).readlines
73
- end
74
- if kf = options.fetch(:key_field, options[:key_fields])
75
- @key_fields = [kf].flatten
76
- @parent_fields = @key_fields[0...-1]
77
- @child_fields = @key_fields[-1..-1]
78
- else
79
- @parent_fields = [options.fetch(:parent_field, options[:parent_fields]) || []].flatten
80
- @child_fields = [options.fetch(:child_field, options[:child_fields]) || [0]].flatten
81
- @key_fields = @parent_fields + @child_fields
82
- end
83
- @field_names = options[:field_names]
84
- @warnings = []
85
- index_source(source, options)
86
- end
87
-
88
-
89
- # Returns the row in the CSV source corresponding to the supplied key.
90
- #
91
- # @param key [String] The unique key to use to lookup the row.
92
- # @return [Hash] The fields for the line corresponding to +key+, or nil
93
- # if the key is not recognised.
94
- def [](key)
95
- @lines[key]
96
- end
97
-
98
-
99
- private
100
-
101
- # Given an array of lines, where each line is an array of fields, indexes
102
- # the array contents so that it can be looked up by key.
103
- def index_source(lines, options)
104
- @lines = {}
105
- @index = Hash.new{ |h, k| h[k] = [] }
106
- @key_fields = find_field_indexes(@key_fields, @field_names) if @field_names
107
- line_num = 0
108
- lines.each do |row|
109
- line_num += 1
110
- next if line_num == 1 && @field_names && options[:ignore_header]
111
- unless @field_names
112
- @field_names = row
113
- @key_fields = find_field_indexes(@key_fields, @field_names)
114
- next
115
- end
116
- field_vals = row
117
- line = {}
118
- @field_names.each_with_index do |field, i|
119
- line[field] = field_vals[i]
120
- end
121
- key_values = @key_fields.map{ |kf| field_vals[kf].to_s.upcase }
122
- key = key_values.join('~')
123
- parent_key = key_values[0...(@parent_fields.length)].join('~')
124
- if @lines[key]
125
- @warnings << "Duplicate key '#{key}' encountered and ignored at line #{line_num}"
126
- else
127
- @index[parent_key] << key
128
- @lines[key] = line
129
- end
130
- end
131
- end
132
-
133
-
134
- # Converts an array of field names to an array of indexes of the fields
135
- # matching those names.
136
- def find_field_indexes(key_fields, field_names)
137
- key_fields.map do |field|
138
- if field.is_a?(Fixnum)
139
- field
140
- else
141
- field_names.index{ |field_name| field.to_s.downcase == field_name.downcase } or
142
- raise ArgumentError, "Could not locate field '#{field}' in source field names: #{
143
- field_names.join(', ')}"
58
+ # When you call CSV.open, it's best to pass in a block so that after it's yielded,
59
+ # the underlying file handle is closed. Otherwise, you risk leaking the handle.
60
+ @data = CSV.open(@path, mode_string, csv_options) do |csv|
61
+ csv.readlines
144
62
  end
63
+ elsif source.is_a?(Enumerable) && source.size == 0 || (source.size > 0 && source.first.is_a?(Enumerable))
64
+ @data = source
65
+ else
66
+ raise ArgumentError, "source must be a path to a file or an Enumerable<Enumerable>"
145
67
  end
68
+ index_source
146
69
  end
147
70
 
148
71
  end
@@ -0,0 +1,275 @@
1
+ class CSVDiff
2
+
3
+ # Represents an input (i.e. the left/from or right/to input) to the diff
4
+ # process.
5
+ class Source
6
+
7
+ # @return [String] the path to the source file
8
+ attr_accessor :path
9
+ # @return [Array<Array>] The data for this source
10
+ attr_reader :data
11
+
12
+ # @return [Array<String>] The names of the fields in the source file
13
+ attr_reader :field_names
14
+ # @return [Array<String>] The names of the field(s) that uniquely
15
+ # identify each row.
16
+ attr_reader :key_fields
17
+ # @return [Array<String>] The names of the field(s) that identify a
18
+ # common parent of child records.
19
+ attr_reader :parent_fields
20
+ # @return [Array<String>] The names of the field(s) that distinguish a
21
+ # child of a parent record.
22
+ attr_reader :child_fields
23
+
24
+ # @return [Array<Fixnum>] The indexes of the key fields in the source
25
+ # file.
26
+ attr_reader :key_field_indexes
27
+ # @return [Array<Fixnum>] The indexes of the parent fields in the source
28
+ # file.
29
+ attr_reader :parent_field_indexes
30
+ # @return [Array<Fixnum>] The indexes of the child fields in the source
31
+ # file.
32
+ attr_reader :child_field_indexes
33
+
34
+ # @return [Boolean] True if the source has been indexed with case-
35
+ # sensitive keys, or false if it has been indexed using upper-case key
36
+ # values.
37
+ attr_reader :case_sensitive
38
+ alias_method :case_sensitive?, :case_sensitive
39
+ # @return [Boolean] True if leading/trailing whitespace should be stripped
40
+ # from fields
41
+ attr_reader :trim_whitespace
42
+ # @return [Hash<String,Hash>] A hash containing each line of the source,
43
+ # keyed on the values of the +key_fields+.
44
+ attr_reader :lines
45
+ # @return [Hash<String,Array<String>>] A hash containing each parent key,
46
+ # and an Array of the child keys it is a parent of.
47
+ attr_reader :index
48
+ # @return [Array<String>] An array of any warnings encountered while
49
+ # processing the source.
50
+ attr_reader :warnings
51
+ # @return [Fixnum] A count of the lines processed from this source.
52
+ # Excludes any header and duplicate records identified during indexing.
53
+ attr_reader :line_count
54
+ # @return [Fixnum] A count of the lines from this source that were skipped
55
+ # due to filter conditions.
56
+ attr_reader :skip_count
57
+ # @return [Fixnum] A count of the lines from this source that had the same
58
+ # key value as another line.
59
+ attr_reader :dup_count
60
+
61
+
62
+ # Creates a new diff source.
63
+ #
64
+ # A diff source must contain at least one field that will be used as the
65
+ # key to identify the same record in a different version of this file.
66
+ # If not specified via one of the options, the first field is assumed to
67
+ # be the unique key.
68
+ #
69
+ # If multiple fields combine to form a unique key, the parent is assumed
70
+ # to be identified by all but the last field of the unique key. If finer
71
+ # control is required, use a combination of the :parent_fields and
72
+ # :child_fields options.
73
+ #
74
+ # All key options can be specified either by field name, or by field
75
+ # index (0 based).
76
+ #
77
+ # @param options [Hash] An options hash.
78
+ # @option options [Array<String>] :field_names The names of each of the
79
+ # fields in +source+.
80
+ # @option options [Boolean] :ignore_header If true, and :field_names has
81
+ # been specified, then the first row of the file is ignored.
82
+ # @option options [String] :key_field The name of the field that uniquely
83
+ # identifies each row.
84
+ # @option options [Array<String>] :key_fields The names of the fields
85
+ # that uniquely identify each row.
86
+ # @option options [String] :parent_field The name of the field(s) that
87
+ # identify a parent within which sibling order should be checked.
88
+ # @option options [String] :child_field The name of the field(s) that
89
+ # uniquely identify a child of a parent.
90
+ # @option options [Boolean] :case_sensitive If true (the default), keys
91
+ # are indexed as-is; if false, the index is built in upper-case for
92
+ # case-insensitive comparisons.
93
+ # @option options [Hash] :include A hash of field name(s) or index(es) to
94
+ # regular expression(s). Only source rows whose field values satisfy the
95
+ # regular expressions will be indexed and included in the diff process.
96
+ # @option options [Hash] :exclude A hash of field name(s) or index(es) to
97
+ # regular expression(s). Source rows with a field value that satisfies
98
+ # the regular expressions will be excluded from the diff process.
99
+ def initialize(options = {})
100
+ if (options.keys & [:parent_field, :parent_fields, :child_field, :child_fields]).empty? &&
101
+ (kf = options.fetch(:key_field, options[:key_fields]))
102
+ @key_fields = [kf].flatten
103
+ @parent_fields = @key_fields[0...-1]
104
+ @child_fields = @key_fields[-1..-1]
105
+ else
106
+ @parent_fields = [options.fetch(:parent_field, options[:parent_fields]) || []].flatten
107
+ @child_fields = [options.fetch(:child_field, options[:child_fields]) || [0]].flatten
108
+ @key_fields = @parent_fields + @child_fields
109
+ end
110
+ @field_names = options[:field_names]
111
+ @case_sensitive = options.fetch(:case_sensitive, true)
112
+ @trim_whitespace = options.fetch(:trim_whitespace, false)
113
+ @ignore_header = options[:ignore_header]
114
+ @include = options[:include]
115
+ @exclued = options[:exclude]
116
+ @path = options.fetch(:path, 'NA') unless @path
117
+ @warnings = []
118
+ end
119
+
120
+
121
+ def path?
122
+ @path != 'NA'
123
+ end
124
+
125
+
126
+ # Returns the row in the CSV source corresponding to the supplied key.
127
+ #
128
+ # @param key [String] The unique key to use to lookup the row.
129
+ # @return [Hash] The fields for the line corresponding to +key+, or nil
130
+ # if the key is not recognised.
131
+ def [](key)
132
+ @lines[key]
133
+ end
134
+
135
+
136
+ # Given an array of lines, where each line is an array of fields, indexes
137
+ # the array contents so that it can be looked up by key.
138
+ def index_source
139
+ @lines = {}
140
+ @index = Hash.new{ |h, k| h[k] = [] }
141
+ if @field_names
142
+ index_fields
143
+ include_filter = convert_filter(@include, @field_names)
144
+ exclude_filter = convert_filter(@exclude, @field_names)
145
+ end
146
+ @line_count = 0
147
+ @skip_count = 0
148
+ @dup_count = 0
149
+ line_num = 0
150
+ @data.each do |row|
151
+ line_num += 1
152
+ next if line_num == 1 && @field_names && @ignore_header
153
+ unless @field_names
154
+ if row.class.name == 'CSV::Row'
155
+ @field_names = row.headers.each_with_index.map{ |f, i| f || i.to_s }
156
+ else
157
+ @field_names = row.each_with_index.map{ |f, i| f || i.to_s }
158
+ end
159
+ index_fields
160
+ include_filter = convert_filter(@include, @field_names)
161
+ exclude_filter = convert_filter(@exclude, @field_names)
162
+ next
163
+ end
164
+ field_vals = row
165
+ line = {}
166
+ filter = false
167
+ @field_names.each_with_index do |field, i|
168
+ val = field_vals[i]
169
+ val = val.to_s.strip if val && @trim_whitespace
170
+ line[field] = val
171
+ if include_filter && f = include_filter[i]
172
+ filter = !check_filter(f, line[field])
173
+ end
174
+ if exclude_filter && f = exclude_filter[i]
175
+ filter = check_filter(f, line[field])
176
+ end
177
+ break if filter
178
+ end
179
+ if filter
180
+ @skip_count += 1
181
+ next
182
+ end
183
+ key_values = @key_field_indexes.map{ |kf| @case_sensitive ?
184
+ field_vals[kf].to_s :
185
+ field_vals[kf].to_s.upcase }
186
+ key = key_values.join('~')
187
+ parent_key = key_values[0...(@parent_fields.length)].join('~')
188
+ if @lines[key]
189
+ @warnings << "Duplicate key '#{key}' encountered at line #{line_num}"
190
+ @dup_count += 1
191
+ key += "[#{@dup_count}]"
192
+ end
193
+ @index[parent_key] << key
194
+ @lines[key] = line
195
+ @line_count += 1
196
+ end
197
+ end
198
+
199
+
200
+ # Save the data in this Source as a CSV at +file_path+.
201
+ #
202
+ # @param file_path [String] The target path to save the data to.
203
+ # @param options [Hash] A set of options to pass to CSV.open to control
204
+ # how the CSV is generated.
205
+ def save_csv(file_path, options = {})
206
+ require 'csv'
207
+ default_opts = {
208
+ headers: @field_name, write_headers: true
209
+ }
210
+ CSV.open(file_path, 'wb', default_opts.merge(options)) do |csv|
211
+ @data.each{ |rec| csv << rec }
212
+ end
213
+ end
214
+
215
+
216
+ private
217
+
218
+
219
+ def index_fields
220
+ @key_field_indexes = find_field_indexes(@key_fields, @field_names)
221
+ @parent_field_indexes = find_field_indexes(@parent_fields, @field_names)
222
+ @child_field_indexes = find_field_indexes(@child_fields, @field_names)
223
+ @key_fields = @key_field_indexes.map{ |i| @field_names[i] }
224
+ @parent_fields = @parent_field_indexes.map{ |i| @field_names[i] }
225
+ @child_fields = @child_field_indexes.map{ |i| @field_names[i] }
226
+ end
227
+
228
+
229
+ # Converts an array of field names to an array of indexes of the fields
230
+ # matching those names.
231
+ def find_field_indexes(key_fields, field_names)
232
+ key_fields.map do |field|
233
+ if field.is_a?(Integer)
234
+ field
235
+ else
236
+ field_names.index{ |field_name| field.to_s.downcase == field_name.to_s.downcase } or
237
+ raise ArgumentError, "Could not locate field '#{field}' in source field names: #{
238
+ field_names.join(', ')}"
239
+ end
240
+ end
241
+ end
242
+
243
+
244
+ def convert_filter(hsh, field_names)
245
+ return unless hsh
246
+ if !hsh.is_a?(Hash)
247
+ raise ArgumentError, ":include/:exclude option must be a Hash of field name(s)/index(es) to RegExp(s)"
248
+ end
249
+ keys = hsh.keys
250
+ idxs = find_field_indexes(keys, @field_names)
251
+ Hash[keys.each_with_index.map{ |k, i| [idxs[i], hsh[k]] }]
252
+ end
253
+
254
+
255
+ def check_filter(filter, field_val)
256
+ case filter
257
+ when String
258
+ if @case_sensitive
259
+ filter == field_val
260
+ else
261
+ filter.downcase == field_val.to_s.downcase
262
+ end
263
+ when Regexp
264
+ filter.match(field_val)
265
+ when Proc
266
+ filter.call(field_val)
267
+ else
268
+ raise ArgumentError, "Unsupported filter expression: #{filter.inspect}"
269
+ end
270
+ end
271
+
272
+ end
273
+
274
+ end
275
+
@@ -0,0 +1,142 @@
1
+ require 'nokogiri'
2
+ require 'cgi'
3
+
4
+
5
+ class CSVDiff
6
+
7
+ # Convert XML content to CSV format using XPath selectors to identify the
8
+ # rows and field values in an XML document
9
+ class XMLSource < Source
10
+
11
+ attr_accessor :context
12
+
13
+ # Create a new XMLSource, identified by +path+. Normally this is a path
14
+ # to the XML document, but any value is fine, as it is really just a label
15
+ # to identify this data set.
16
+ #
17
+ # @param path [String] A label for this data set (often a path to the
18
+ # XML document used as the source).
19
+ # @param options [Hash] An options hash.
20
+ # @option options [Array<String>] :field_names The names of each of the
21
+ # fields in +source+.
22
+ # @option options [Boolean] :ignore_header If true, and :field_names has
23
+ # been specified, then the first row of the file is ignored.
24
+ # @option options [String] :key_field The name of the field that uniquely
25
+ # identifies each row.
26
+ # @option options [Array<String>] :key_fields The names of the fields
27
+ # that uniquely identify each row.
28
+ # @option options [String] :parent_field The name of the field(s) that
29
+ # identify a parent within which sibling order should be checked.
30
+ # @option options [String] :child_field The name of the field(s) that
31
+ # uniquely identify a child of a parent.
32
+ # @option options [Boolean] :case_sensitive If true (the default), keys
33
+ # are indexed as-is; if false, the index is built in upper-case for
34
+ # case-insensitive comparisons.
35
+ # @option options [Hash] :include A hash of field name(s) or index(es) to
36
+ # regular expression(s). Only source rows whose field values satisfy the
37
+ # regular expressions will be indexed and included in the diff process.
38
+ # @option options [Hash] :exclude A hash of field name(s) or index(es) to
39
+ # regular expression(s). Source rows with a field value that satisfies
40
+ # the regular expressions will be excluded from the diff process.
41
+ # @option options [String] :context A context value from which fields
42
+ # can be populated using a Regexp.
43
+ def initialize(path, options = {})
44
+ super(options)
45
+ @path = path
46
+ @context = options[:context]
47
+ @data = []
48
+ end
49
+
50
+
51
+ # Process a +source+, converting the XML into a table of data, using
52
+ # +rec_xpath+ to identify the nodes that correspond to each record that
53
+ # should appear in the output, and +field_maps+ to populate each field
54
+ # in each row.
55
+ #
56
+ # @param source [String|Array] may be a String containing XML content,
57
+ # an Array of paths to files containing XML content, or a path to
58
+ # a single file.
59
+ # @param rec_xpath [String] An XPath expression that selects all the
60
+ # items in the XML document that are to be converted into new rows.
61
+ # The returned items are not directly used to populate the fields,
62
+ # but provide a context for the field XPath expressions that populate
63
+ # each field's content.
64
+ # @param field_maps [Hash<String, String>] A map of field names to
65
+ # expressions that are evaluated in the context of each row node
66
+ # selected by +rec_xpath+. The field expressions are typically XPath
67
+ # expressions evaluated in the context of the nodes returned by the
68
+ # +rec_xpath+. Alternatively, a String that is not an XPath expression
69
+ # is used as a literal value for a field, while a Regexp can also
70
+ # be used to pull a value from any context specified in the +options+
71
+ # hash. The Regexp should include a single grouping, as the value used
72
+ # will be the result in $1 after the match is performed.
73
+ # @param context [String] An optional context for the XML to be processed.
74
+ # The value passed here can be referenced in field map expressions
75
+ # using a Regexp, with the value of the first grouping in the regex
76
+ # being the value returned for the field.
77
+ def process(source, rec_xpath, field_maps, context = nil)
78
+ @field_names = field_maps.keys unless @field_names
79
+ case source
80
+ when Nokogiri::XML::Document
81
+ add_data(source, rec_xpath, field_maps, context || @context)
82
+ when /<\?xml/
83
+ doc = Nokogiri::XML(source)
84
+ add_data(doc, rec_xpath, field_maps, context || @context)
85
+ when Array
86
+ source.each{ |f| process_file(f, rec_xpath, field_maps) }
87
+ when String
88
+ process_file(source, rec_xpath, field_maps)
89
+ else
90
+ raise ArgumentError, "Unhandled source type #{source.class.name}"
91
+ end
92
+ @data
93
+ end
94
+
95
+
96
+ private
97
+
98
+
99
+ # Load the XML document at +file_path+ and process it into rows of data.
100
+ def process_file(file_path, rec_xpath, field_maps)
101
+ begin
102
+ File.open(file_path) do |f|
103
+ doc = Nokogiri::XML(f)
104
+ add_data(doc, rec_xpath, field_maps, @context || file_path)
105
+ end
106
+ rescue
107
+ STDERR.puts "An error occurred while attempting to open #{file_path}"
108
+ raise
109
+ end
110
+ end
111
+
112
+
113
+ # Locate records in +doc+ using +rec_xpath+ to identify the nodes that
114
+ # correspond to a new record in the data, and +field_maps+ to populate
115
+ # the fields in each row.
116
+ def add_data(doc, rec_xpath, field_maps, context)
117
+ doc.xpath(rec_xpath).each do |rec_node|
118
+ rec = []
119
+ field_maps.each do |field_name, expr|
120
+ case expr
121
+ when Regexp # Match context against Regexp and extract first grouping
122
+ if context
123
+ context =~ expr
124
+ rec << $1
125
+ else
126
+ rec << nil
127
+ end
128
+ when %r{[/(.@]} # XPath expression
129
+ res = rec_node.xpath(expr)
130
+ rec << CGI.unescape_html(res.to_s)
131
+ else # Use expr as the value for this field
132
+ rec << expr
133
+ end
134
+ end
135
+ @data << rec
136
+ end
137
+ end
138
+
139
+ end
140
+
141
+ end
142
+
metadata CHANGED
@@ -1,27 +1,44 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csv-diff
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.2'
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Gardiner
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-08-11 00:00:00.000000000 Z
11
+ date: 2020-07-15 00:00:00.000000000 Z
12
12
  dependencies: []
13
- description: ! " This library performs diffs of CSV files.\n\n Unlike
14
- a standard diff that compares line by line, and is sensitive to the\n ordering
15
- of records, CSV-Diff identifies common lines by key field(s), and\n then
16
- compares the contents of the fields in each line.\n\n Data may be supplied
17
- in the form of CSV files, or as an array of arrays. The\n diff process provides
18
- a fine level of control over what to diff, and can\n optionally ignore certain
19
- types of changes (e.g. changes in position).\n\n CSV-Diff is particularly
20
- well suited to data in parent-child format. Parent-\n child data does not
21
- lend itself well to standard text diffs, as small changes\n in the organisation
22
- of the tree at an upper level can lead to big movements\n in the position
23
- of descendant records. By instead matching records by key,\n CSV-Diff avoids
24
- this issue, while still being able to detect changes in\n sibling order.\n"
13
+ description: |2
14
+ This library performs diffs of CSV data, or any table-like source.
15
+
16
+ Unlike a standard diff that compares line by line, and is sensitive to the
17
+ ordering of records, CSV-Diff identifies common lines by key field(s), and
18
+ then compares the contents of the fields in each line.
19
+
20
+ Data may be supplied in the form of CSV files, or as an array of arrays. The
21
+ diff process provides a fine level of control over what to diff, and can
22
+ optionally ignore certain types of changes (e.g. changes in position).
23
+
24
+ CSV-Diff is particularly well suited to data in parent-child format. Parent-
25
+ child data does not lend itself well to standard text diffs, as small changes
26
+ in the organisation of the tree at an upper level can lead to big movements
27
+ in the position of descendant records. By instead matching records by key,
28
+ CSV-Diff avoids this issue, while still being able to detect changes in
29
+ sibling order.
30
+
31
+ This gem implements the core diff algorithm, and handles the loading and
32
+ diffing of CSV files (or Arrays of Arrays). It also supports converting
33
+ data in XML format into tabular form, so that it can then be processed
34
+ like any other CSV or table-like source. It returns a CSVDiff object
35
+ containing the details of differences in object form. This is useful for
36
+ projects that need diff capability, but want to handle the reporting or
37
+ actioning of differences themselves.
38
+
39
+ For a pre-built diff reporting capability, see the csv-diff-report gem,
40
+ which provides a command-line tool for generating diff reports in HTML,
41
+ Excel, or text formats.
25
42
  email: adam.b.gardiner@gmail.com
26
43
  executables: []
27
44
  extensions: []
@@ -33,9 +50,12 @@ files:
33
50
  - lib/csv-diff/algorithm.rb
34
51
  - lib/csv-diff/csv_diff.rb
35
52
  - lib/csv-diff/csv_source.rb
53
+ - lib/csv-diff/source.rb
54
+ - lib/csv-diff/xml_source.rb
36
55
  - lib/csv_diff.rb
37
56
  homepage: https://github.com/agardiner/csv-diff
38
- licenses: []
57
+ licenses:
58
+ - MIT
39
59
  metadata: {}
40
60
  post_install_message: For command-line tools and diff reports, 'gem install csv-diff-report'
41
61
  rdoc_options: []
@@ -43,18 +63,18 @@ require_paths:
43
63
  - lib
44
64
  required_ruby_version: !ruby/object:Gem::Requirement
45
65
  requirements:
46
- - - ! '>='
66
+ - - ">="
47
67
  - !ruby/object:Gem::Version
48
68
  version: '0'
49
69
  required_rubygems_version: !ruby/object:Gem::Requirement
50
70
  requirements:
51
- - - ! '>='
71
+ - - ">="
52
72
  - !ruby/object:Gem::Version
53
73
  version: '0'
54
74
  requirements: []
55
75
  rubyforge_project:
56
- rubygems_version: 2.4.1
76
+ rubygems_version: 2.5.2.3
57
77
  signing_key:
58
78
  specification_version: 4
59
- summary: CSV Diff is a library for generating diffs from data in CSV format
79
+ summary: CSV Diff is a library for generating diffs from data in CSV or XML format
60
80
  test_files: []