RubyGems - csv-diff - Versions diffs - 0.3.5 → 0.5.0 - Mend

csv-diff 0.3.5 → 0.5.0

Files changed (11) hide show

checksums.yaml +4 -4
data/LICENSE +0 -0
data/README.md +0 -0
data/lib/csv-diff.rb +1 -0
data/lib/csv-diff/algorithm.rb +5 -2
data/lib/csv-diff/csv_diff.rb +5 -3
data/lib/csv-diff/csv_source.rb +10 -190
data/lib/csv-diff/source.rb +275 -0
data/lib/csv-diff/xml_source.rb +142 -0
data/lib/csv_diff.rb +0 -0
metadata +10 -6

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: c994476b56ebd4cc355f6ee7b668d57396a064e3
-  data.tar.gz: 895f8255b56537b98a7a9ddf54fbfac8dfabcbfb
+  metadata.gz: d1b3b8deee34344d334e740285cb1f3c99074694
+  data.tar.gz: d95158d13861cb66fd460ee430714ec3c83cd0b1
 SHA512:
-  metadata.gz: 9340c4debf4ccc9cac6f08c08e2f3310acb191eb31ba6ede84512f7f1e3111dc6f123b74e46ad5baf141a0169ee4712db20f782cad3a393193724d3c29dd459f
-  data.tar.gz: 95c0008ca9b8069bd017ac904e8795853ec181eb6c9906252d9dbb32224a9f49e8eb48084ba9740aa9f8d048290319d4dcc6dd8e3e25bb660004ec50d4d038f9
+  metadata.gz: 50c74d6a4093012b0ba44fef70c2d749348d6777cfb9f2cfda66c6e075423191a4c6c22019a388b9d8bd14e22ac60d539f4e3b4aa85fd87fd774a64da15858c7
+  data.tar.gz: 8fa030a54e7a97db9913b3c36a1942de1e07a6549f9ae7aa58b5b3f44d522fe11f72d44e18b6b7612d2b2dc9f106ece1fea183557c507bcf18316891ab63f230

data/LICENSE CHANGED

File without changes

data/README.md CHANGED

File without changes

data/lib/csv-diff.rb CHANGED

@@ -1,3 +1,4 @@
+require 'csv-diff/source'
 require 'csv-diff/csv_source'
 require 'csv-diff/algorithm'
 require 'csv-diff/csv_diff'

data/lib/csv-diff/algorithm.rb CHANGED

@@ -36,8 +36,6 @@ class CSVDiff
             # For backwards compatibility and access to fields with differences
             def [](key)
                 case key
-                when String
-                    @fields[key]
                 when :action
                     a = diff_type.to_s
                     a[0] = a[0].upcase
@@ -46,6 +44,8 @@ class CSVDiff
                     @row
                 when :sibling_position
                     @sibling_position
+                else
+                    @fields[key]
                 end
             end
@@ -78,6 +78,9 @@ class CSVDiff
             unless left.case_sensitive? == right.case_sensitive?
                 raise ArgumentError, "Left and right must have same settings for case-sensitivity"
             end
+            unless left.parent_fields.length == right.parent_fields.length
+                raise ArgumentError, "Left and right must have same settings for parent/child fields"
+            end
             # Ensure key fields are not also in the diff_fields
             diff_fields = diff_fields - key_fields

data/lib/csv-diff/csv_diff.rb CHANGED

@@ -81,9 +81,11 @@ class CSVDiff
     # @option options [Boolean] :ignore_deletes If true, records that appear
     #   in the left/from file but not in the right/to file are not reported.
     def initialize(left, right, options = {})
-        @left = left.is_a?(CSVSource) ? left : CSVSource.new(left, options)
+        @left = left.is_a?(Source) ? left : CSVSource.new(left, options)
+        @left.index_source if @left.lines.nil?
         raise "No field names found in left (from) source" unless @left.field_names && @left.field_names.size > 0
-        @right = right.is_a?(CSVSource) ? right : CSVSource.new(right, options)
+        @right = right.is_a?(Source) ? right : CSVSource.new(right, options)
+        @right.index_source if @right.lines.nil?
         raise "No field names found in right (to) source" unless @right.field_names && @right.field_names.size > 0
         @warnings = []
         @diff_fields = get_diff_fields(@left.field_names, @right.field_names, options)
@@ -141,7 +143,7 @@ class CSVDiff
         ignore_fields = options.fetch(:ignore_fields, [])
         ignore_fields = [ignore_fields] unless ignore_fields.is_a?(Array)
         ignore_fields.map! do |f|
-            (f.is_a?(Fixnum) ? right_fields[f] : f).upcase
+            (f.is_a?(Numeric) ? right_fields[f] : f).upcase
         end
         diff_fields = []
         if options[:diff_common_fields_only]

data/lib/csv-diff/csv_source.rb CHANGED

@@ -2,57 +2,7 @@ class CSVDiff
     # Represents a CSV input (i.e. the left/from or right/to input) to the diff
     # process.
-    class CSVSource
-        # @return [String] the path to the source file
-        attr_accessor :path
-        # @return [Array<String>] The names of the fields in the source file
-        attr_reader :field_names
-        # @return [Array<String>] The names of the field(s) that uniquely
-        #   identify each row.
-        attr_reader :key_fields
-        # @return [Array<String>] The names of the field(s) that identify a
-        #   common parent of child records.
-        attr_reader :parent_fields
-        # @return [Array<String>] The names of the field(s) that distinguish a
-        #   child of a parent record.
-        attr_reader :child_fields
-        # @return [Array<Fixnum>] The indexes of the key fields in the source
-        #   file.
-        attr_reader :key_field_indexes
-        # @return [Array<Fixnum>] The indexes of the parent fields in the source
-        #   file.
-        attr_reader :parent_field_indexes
-        # @return [Array<Fixnum>] The indexes of the child fields in the source
-        #   file.
-        attr_reader :child_field_indexes
-        # @return [Boolean] True if the source has been indexed with case-
-        #   sensitive keys, or false if it has been indexed using upper-case key
-        #   values.
-        attr_reader :case_sensitive
-        alias_method :case_sensitive?, :case_sensitive
-        # @return [Boolean] True if leading/trailing whitespace should be stripped
-        #   from fields
-        attr_reader :trim_whitespace
-        # @return [Hash<String,Hash>] A hash containing each line of the source,
-        #   keyed on the values of the +key_fields+.
-        attr_reader :lines
-        # @return [Hash<String,Array<String>>] A hash containing each parent key,
-        #   and an Array of the child keys it is a parent of.
-        attr_reader :index
-        # @return [Array<String>] An array of any warnings encountered while
-        #   processing the source.
-        attr_reader :warnings
-        # @return [Fixnum] A count of the lines processed from this source.
-        #   Excludes any header and duplicate records identified during indexing.
-        attr_reader :line_count
-        # @return [Fixnum] A count of the lines from this source that were skipped,
-        #   due either to duplicate keys or filter conditions.
-        attr_reader :skip_count
+    class CSVSource < Source
         # Creates a new diff source.
         #
@@ -99,153 +49,23 @@ class CSVDiff
         #   regular expression(s). Source rows with a field value that satisfies
         #   the regular expressions will be excluded from the diff process.
         def initialize(source, options = {})
+            super(options)
             if source.is_a?(String)
                 require 'csv'
                 mode_string = options[:encoding] ? "r:#{options[:encoding]}" : 'r'
                 csv_options = options.fetch(:csv_options, {})
                 @path = source
-                source = CSV.open(@path, mode_string, csv_options).readlines
-            elsif !source.is_a?(Enumerable) || (source.is_a?(Enumerable) && source.size > 0 &&
-                                                !source.first.is_a?(Enumerable))
-                raise ArgumentError, "source must be a path to a file or an Enumerable<Enumerable>"
-            end
-            if (options.keys & [:parent_field, :parent_fields, :child_field, :child_fields]).empty? &&
-               (kf = options.fetch(:key_field, options[:key_fields]))
-                @key_fields = [kf].flatten
-                @parent_fields = @key_fields[0...-1]
-                @child_fields = @key_fields[-1..-1]
-            else
-                @parent_fields = [options.fetch(:parent_field, options[:parent_fields]) || []].flatten
-                @child_fields = [options.fetch(:child_field, options[:child_fields]) || [0]].flatten
-                @key_fields = @parent_fields + @child_fields
-            end
-            @field_names = options[:field_names]
-            @warnings = []
-            index_source(source, options)
-        end
-        # Returns the row in the CSV source corresponding to the supplied key.
-        #
-        # @param key [String] The unique key to use to lookup the row.
-        # @return [Hash] The fields for the line corresponding to +key+, or nil
-        #   if the key is not recognised.
-        def [](key)
-            @lines[key]
-        end
-        private
-        # Given an array of lines, where each line is an array of fields, indexes
-        # the array contents so that it can be looked up by key.
-        def index_source(lines, options)
-            @lines = {}
-            @index = Hash.new{ |h, k| h[k] = [] }
-            if @field_names
-                index_fields(options)
-            end
-            @case_sensitive = options.fetch(:case_sensitive, true)
-            @trim_whitespace = options.fetch(:trim_whitespace, false)
-            @line_count = 0
-            @skip_count = 0
-            line_num = 0
-            lines.each do |row|
-                line_num += 1
-                next if line_num == 1 && @field_names && options[:ignore_header]
-                unless @field_names
-                    @field_names = row.each_with_index.map{ |f, i| f || i.to_s }
-                    index_fields(options)
-                    next
-                end
-                field_vals = row
-                line = {}
-                filter = false
-                @field_names.each_with_index do |field, i|
-                    line[field] = field_vals[i]
-                    line[field].strip! if @trim_whitespace && line[field]
-                    if @include_filter && f = @include_filter[i]
-                        filter = !check_filter(f, line[field])
-                    end
-                    if @exclude_filter && f = @exclude_filter[i]
-                        filter = check_filter(f, line[field])
-                    end
-                    break if filter
-                end
-                if filter
-                    @skip_count += 1
-                    next
+                # When you call CSV.open, it's best to pass in a block so that after it's yielded,
+                # the underlying file handle is closed. Otherwise, you risk leaking the handle.
+                @data = CSV.open(@path, mode_string, csv_options) do |csv|
+                     csv.readlines
                 end
-                key_values = @key_field_indexes.map{ |kf| @case_sensitive ?
-                                                     field_vals[kf].to_s : field_vals[kf].to_s.upcase }
-                key = key_values.join('~')
-                parent_key = key_values[0...(@parent_fields.length)].join('~')
-                if @lines[key]
-                    @warnings << "Duplicate key '#{key}' encountered and ignored at line #{line_num}"
-                    @skip_count += 1
-                else
-                    @index[parent_key] << key
-                    @lines[key] = line
-                    @line_count += 1
-                end
-            end
-        end
-        def index_fields(options)
-            @key_field_indexes = find_field_indexes(@key_fields, @field_names)
-            @parent_field_indexes = find_field_indexes(@parent_fields, @field_names)
-            @child_field_indexes = find_field_indexes(@child_fields, @field_names)
-            @key_fields = @key_field_indexes.map{ |i| @field_names[i] }
-            @parent_fields = @parent_field_indexes.map{ |i| @field_names[i] }
-            @child_fields = @child_field_indexes.map{ |i| @field_names[i] }
-            @include_filter = convert_filter(options, :include, @field_names)
-            @exclude_filter = convert_filter(options, :exclude, @field_names)
-        end
-        # Converts an array of field names to an array of indexes of the fields
-        # matching those names.
-        def find_field_indexes(key_fields, field_names)
-            key_fields.map do |field|
-                if field.is_a?(Integer)
-                    field
-                else
-                    field_names.index{ |field_name| field.to_s.downcase == field_name.downcase } or
-                        raise ArgumentError, "Could not locate field '#{field}' in source field names: #{
-                            field_names.join(', ')}"
-                end
-            end
-        end
-        def convert_filter(options, key, field_names)
-            return unless hsh = options[key]
-            if !hsh.is_a?(Hash)
-                raise ArgumentError, ":#{key} option must be a Hash of field name(s)/index(es) to RegExp(s)"
-            end
-            keys = hsh.keys
-            idxs = find_field_indexes(keys, @field_names)
-            Hash[keys.each_with_index.map{ |k, i| [idxs[i], hsh[k]] }]
-        end
-        def check_filter(filter, field_val)
-            case filter
-            when String
-                if @case_sensitive
-                    filter == field_val
-                else
-                    filter.downcase == field_val.to_s.downcase
-                end
-            when Regexp
-                filter.match(field_val)
-            when Proc
-                filter.call(field_val)
+            elsif source.is_a?(Enumerable) && source.size == 0 || (source.size > 0 && source.first.is_a?(Enumerable))
+                @data = source
             else
-                raise ArgumentError, "Unsupported filter expression: #{filter.inspect}"
+                raise ArgumentError, "source must be a path to a file or an Enumerable<Enumerable>"
             end
+            index_source
         end
     end

data/lib/csv-diff/source.rb ADDED

@@ -0,0 +1,275 @@
+class CSVDiff
+    # Represents an input (i.e. the left/from or right/to input) to the diff
+    # process.
+    class Source
+        # @return [String] the path to the source file
+        attr_accessor :path
+        # @return [Array<Array>] The data for this source
+        attr_reader :data
+        # @return [Array<String>] The names of the fields in the source file
+        attr_reader :field_names
+        # @return [Array<String>] The names of the field(s) that uniquely
+        #   identify each row.
+        attr_reader :key_fields
+        # @return [Array<String>] The names of the field(s) that identify a
+        #   common parent of child records.
+        attr_reader :parent_fields
+        # @return [Array<String>] The names of the field(s) that distinguish a
+        #   child of a parent record.
+        attr_reader :child_fields
+        # @return [Array<Fixnum>] The indexes of the key fields in the source
+        #   file.
+        attr_reader :key_field_indexes
+        # @return [Array<Fixnum>] The indexes of the parent fields in the source
+        #   file.
+        attr_reader :parent_field_indexes
+        # @return [Array<Fixnum>] The indexes of the child fields in the source
+        #   file.
+        attr_reader :child_field_indexes
+        # @return [Boolean] True if the source has been indexed with case-
+        #   sensitive keys, or false if it has been indexed using upper-case key
+        #   values.
+        attr_reader :case_sensitive
+        alias_method :case_sensitive?, :case_sensitive
+        # @return [Boolean] True if leading/trailing whitespace should be stripped
+        #   from fields
+        attr_reader :trim_whitespace
+        # @return [Hash<String,Hash>] A hash containing each line of the source,
+        #   keyed on the values of the +key_fields+.
+        attr_reader :lines
+        # @return [Hash<String,Array<String>>] A hash containing each parent key,
+        #   and an Array of the child keys it is a parent of.
+        attr_reader :index
+        # @return [Array<String>] An array of any warnings encountered while
+        #   processing the source.
+        attr_reader :warnings
+        # @return [Fixnum] A count of the lines processed from this source.
+        #   Excludes any header and duplicate records identified during indexing.
+        attr_reader :line_count
+        # @return [Fixnum] A count of the lines from this source that were skipped
+        #   due to filter conditions.
+        attr_reader :skip_count
+        # @return [Fixnum] A count of the lines from this source that had the same
+        #   key value as another line.
+        attr_reader :dup_count
+        # Creates a new diff source.
+        #
+        # A diff source must contain at least one field that will be used as the
+        # key to identify the same record in a different version of this file.
+        # If not specified via one of the options, the first field is assumed to
+        # be the unique key.
+        #
+        # If multiple fields combine to form a unique key, the parent is assumed
+        # to be identified by all but the last field of the unique key. If finer
+        # control is required, use a combination of the :parent_fields and
+        # :child_fields options.
+        #
+        # All key options can be specified either by field name, or by field
+        # index (0 based).
+        #
+        # @param options [Hash] An options hash.
+        # @option options [Array<String>] :field_names The names of each of the
+        #   fields in +source+.
+        # @option options [Boolean] :ignore_header If true, and :field_names has
+        #   been specified, then the first row of the file is ignored.
+        # @option options [String] :key_field The name of the field that uniquely
+        #   identifies each row.
+        # @option options [Array<String>] :key_fields The names of the fields
+        #   that uniquely identifies each row.
+        # @option options [String] :parent_field The name of the field(s) that
+        #   identify a parent within which sibling order should be checked.
+        # @option options [String] :child_field The name of the field(s) that
+        #   uniquely identify a child of a parent.
+        # @option options [Boolean] :case_sensitive If true (the default), keys
+        #   are indexed as-is; if false, the index is built in upper-case for
+        #   case-insensitive comparisons.
+        # @option options [Hash] :include A hash of field name(s) or index(es) to
+        #   regular expression(s). Only source rows whose field values satisfy the
+        #   regular expressions will be indexed and included in the diff process.
+        # @option options [Hash] :exclude A hash of field name(s) or index(es) to
+        #   regular expression(s). Source rows with a field value that satisfies
+        #   the regular expressions will be excluded from the diff process.
+        def initialize(options = {})
+            if (options.keys & [:parent_field, :parent_fields, :child_field, :child_fields]).empty? &&
+               (kf = options.fetch(:key_field, options[:key_fields]))
+                @key_fields = [kf].flatten
+                @parent_fields = @key_fields[0...-1]
+                @child_fields = @key_fields[-1..-1]
+            else
+                @parent_fields = [options.fetch(:parent_field, options[:parent_fields]) || []].flatten
+                @child_fields = [options.fetch(:child_field, options[:child_fields]) || [0]].flatten
+                @key_fields = @parent_fields + @child_fields
+            end
+            @field_names = options[:field_names]
+            @case_sensitive = options.fetch(:case_sensitive, true)
+            @trim_whitespace = options.fetch(:trim_whitespace, false)
+            @ignore_header = options[:ignore_header]
+            @include = options[:include]
+            @exclued = options[:exclude]
+            @path = options.fetch(:path, 'NA') unless @path
+            @warnings = []
+        end
+        def path?
+            @path != 'NA'
+        end
+        # Returns the row in the CSV source corresponding to the supplied key.
+        #
+        # @param key [String] The unique key to use to lookup the row.
+        # @return [Hash] The fields for the line corresponding to +key+, or nil
+        #   if the key is not recognised.
+        def [](key)
+            @lines[key]
+        end
+        # Given an array of lines, where each line is an array of fields, indexes
+        # the array contents so that it can be looked up by key.
+        def index_source
+            @lines = {}
+            @index = Hash.new{ |h, k| h[k] = [] }
+            if @field_names
+                index_fields
+                include_filter = convert_filter(@include, @field_names)
+                exclude_filter = convert_filter(@exclude, @field_names)
+            end
+            @line_count = 0
+            @skip_count = 0
+            @dup_count = 0
+            line_num = 0
+            @data.each do |row|
+                line_num += 1
+                next if line_num == 1 && @field_names && @ignore_header
+                unless @field_names
+                    if row.class.name == 'CSV::Row'
+                        @field_names = row.headers.each_with_index.map{ |f, i| f || i.to_s }
+                    else
+                        @field_names = row.each_with_index.map{ |f, i| f || i.to_s }
+                    end
+                    index_fields
+                    include_filter = convert_filter(@include, @field_names)
+                    exclude_filter = convert_filter(@exclude, @field_names)
+                    next
+                end
+                field_vals = row
+                line = {}
+                filter = false
+                @field_names.each_with_index do |field, i|
+                    val = field_vals[i]
+                    val = val.to_s.strip if val && @trim_whitespace
+                    line[field] = val
+                    if include_filter && f = include_filter[i]
+                        filter = !check_filter(f, line[field])
+                    end
+                    if exclude_filter && f = exclude_filter[i]
+                        filter = check_filter(f, line[field])
+                    end
+                    break if filter
+                end
+                if filter
+                    @skip_count += 1
+                    next
+                end
+                key_values = @key_field_indexes.map{ |kf| @case_sensitive ?
+                                                          field_vals[kf].to_s :
+                                                          field_vals[kf].to_s.upcase }
+                key = key_values.join('~')
+                parent_key = key_values[0...(@parent_fields.length)].join('~')
+                if @lines[key]
+                    @warnings << "Duplicate key '#{key}' encountered at line #{line_num}"
+                    @dup_count += 1
+                    key += "[#{@dup_count}]"
+                end
+                @index[parent_key] << key
+                @lines[key] = line
+                @line_count += 1
+            end
+        end
+        # Save the data in this Source as a CSV at +file_path+.
+        #
+        # @param file_path [String] The target path to save the data to.
+        # @param options [Hash] A set of options to pass to CSV.open to control
+        #   how the CSV is generated.
+        def save_csv(file_path, options = {})
+            require 'csv'
+            default_opts = {
+                headers: @field_name, write_headers: true
+            }
+            CSV.open(file_path, 'wb', default_opts.merge(options)) do |csv|
+                @data.each{ |rec| csv << rec }
+            end
+        end
+        private
+        def index_fields
+            @key_field_indexes = find_field_indexes(@key_fields, @field_names)
+            @parent_field_indexes = find_field_indexes(@parent_fields, @field_names)
+            @child_field_indexes = find_field_indexes(@child_fields, @field_names)
+            @key_fields = @key_field_indexes.map{ |i| @field_names[i] }
+            @parent_fields = @parent_field_indexes.map{ |i| @field_names[i] }
+            @child_fields = @child_field_indexes.map{ |i| @field_names[i] }
+        end
+        # Converts an array of field names to an array of indexes of the fields
+        # matching those names.
+        def find_field_indexes(key_fields, field_names)
+            key_fields.map do |field|
+                if field.is_a?(Integer)
+                    field
+                else
+                    field_names.index{ |field_name| field.to_s.downcase == field_name.to_s.downcase } or
+                        raise ArgumentError, "Could not locate field '#{field}' in source field names: #{
+                            field_names.join(', ')}"
+                end
+            end
+        end
+        def convert_filter(hsh, field_names)
+            return unless hsh
+            if !hsh.is_a?(Hash)
+                raise ArgumentError, ":include/:exclude option must be a Hash of field name(s)/index(es) to RegExp(s)"
+            end
+            keys = hsh.keys
+            idxs = find_field_indexes(keys, @field_names)
+            Hash[keys.each_with_index.map{ |k, i| [idxs[i], hsh[k]] }]
+        end
+        def check_filter(filter, field_val)
+            case filter
+            when String
+                if @case_sensitive
+                    filter == field_val
+                else
+                    filter.downcase == field_val.to_s.downcase
+                end
+            when Regexp
+                filter.match(field_val)
+            when Proc
+                filter.call(field_val)
+            else
+                raise ArgumentError, "Unsupported filter expression: #{filter.inspect}"
+            end
+        end
+    end
+end

data/lib/csv-diff/xml_source.rb ADDED

@@ -0,0 +1,142 @@
+require 'nokogiri'
+require 'cgi'
+class CSVDiff
+    # Convert XML content to CSV format using XPath selectors to identify the
+    # rows and field values in an XML document
+    class XMLSource < Source
+        attr_accessor :context
+        # Create a new XMLSource, identified by +path+. Normally this is a path
+        # to the XML document, but any value is fine, as it is really just a label
+        # to identify this data set.
+        #
+        # @param path [String] A label for this data set (often a path to the
+        #   XML document used as the source).
+        # @param options [Hash] An options hash.
+        # @option options [Array<String>] :field_names The names of each of the
+        #   fields in +source+.
+        # @option options [Boolean] :ignore_header If true, and :field_names has
+        #   been specified, then the first row of the file is ignored.
+        # @option options [String] :key_field The name of the field that uniquely
+        #   identifies each row.
+        # @option options [Array<String>] :key_fields The names of the fields
+        #   that uniquely identifies each row.
+        # @option options [String] :parent_field The name of the field(s) that
+        #   identify a parent within which sibling order should be checked.
+        # @option options [String] :child_field The name of the field(s) that
+        #   uniquely identify a child of a parent.
+        # @option options [Boolean] :case_sensitive If true (the default), keys
+        #   are indexed as-is; if false, the index is built in upper-case for
+        #   case-insensitive comparisons.
+        # @option options [Hash] :include A hash of field name(s) or index(es) to
+        #   regular expression(s). Only source rows whose field values satisfy the
+        #   regular expressions will be indexed and included in the diff process.
+        # @option options [Hash] :exclude A hash of field name(s) or index(es) to
+        #   regular expression(s). Source rows with a field value that satisfies
+        #   the regular expressions will be excluded from the diff process.
+        # @option options [String] :context A context value from which fields
+        #   can be populated using a Regexp.
+        def initialize(path, options = {})
+            super(options)
+            @path = path
+            @context = options[:context]
+            @data = []
+        end
+        # Process a +source+, converting the XML into a table of data, using
+        # +rec_xpath+ to identify the nodes that correspond to each record that
+        # should appear in the output, and +field_maps+ to populate each field
+        # in each row.
+        #
+        # @param source [String|Array] may be a String containing XML content,
+        #   an Array of paths to files containing XML content, or a path to
+        #   a single file.
+        # @param rec_xpath [String] An XPath expression that selects all the
+        #   items in the XML document that are to be converted into new rows.
+        #   The returned items are not directly used to populate the fields,
+        #   but provide a context for the field XPath expressions that populate
+        #   each field's content.
+        # @param field_maps [Hash<String, String>] A map of field names to
+        #   expressions that are evaluated in the context of each row node
+        #   selected by +rec_xpath+. The field expressions are typically XPath
+        #   expressions evaluated in the context of the nodes returned by the
+        #   +rec_xpath+. Alternatively, a String that is not an XPath expression
+        #   is used as a literal value for a field, while a Regexp can also
+        #   be used to pull a value from any context specified in the +options+
+        #   hash. The Regexp should include a single grouping, as the value used
+        #   will be the result in $1 after the match is performed.
+        # @param context [String] An optional context for the XML to be processed.
+        #   The value passed here can be referenced in field map expressions
+        #   using a Regexp, with the value of the first grouping in the regex
+        #   being the value returned for the field.
+        def process(source, rec_xpath, field_maps, context = nil)
+            @field_names = field_maps.keys unless @field_names
+            case source
+            when Nokogiri::XML::Document
+                add_data(source, rec_xpath, field_maps, context || @context)
+            when /<\?xml/
+                doc = Nokogiri::XML(source)
+                add_data(doc, rec_xpath, field_maps, context || @context)
+            when Array
+                source.each{ |f| process_file(f, rec_xpath, field_maps) }
+            when String
+                process_file(source, rec_xpath, field_maps)
+            else
+                raise ArgumentError, "Unhandled source type #{source.class.name}"
+            end
+            @data
+        end
+        private
+        # Load the XML document at +file_path+ and process it into rows of data.
+        def process_file(file_path, rec_xpath, field_maps)
+            begin
+                File.open(file_path) do |f|
+                    doc = Nokogiri::XML(f)
+                    add_data(doc, rec_xpath, field_maps, @context || file_path)
+                end
+            rescue
+                STDERR.puts "An error occurred while attempting to open #{file_path}"
+                raise
+            end
+        end
+        # Locate records in +doc+ using +rec_xpath+ to identify the nodes that
+        # correspond to a new record in the data, and +field_maps+ to populate
+        # the fields in each row.
+        def add_data(doc, rec_xpath, field_maps, context)
+            doc.xpath(rec_xpath).each do |rec_node|
+                rec = []
+                field_maps.each do |field_name, expr|
+                    case expr
+                    when Regexp         # Match context against Regexp and extract first grouping
+                        if context
+                            context =~ expr
+                            rec << $1
+                        else
+                            rec << nil
+                        end
+                    when %r{[/(.@]}     # XPath expression
+                        res = rec_node.xpath(expr)
+                        rec << CGI.unescape_html(res.to_s)
+                    else                # Use expr as the value for this field
+                        rec << expr
+                    end
+                end
+                @data << rec
+            end
+        end
+    end
+end

data/lib/csv_diff.rb CHANGED

File without changes

metadata CHANGED

@@ -1,17 +1,17 @@
 --- !ruby/object:Gem::Specification
 name: csv-diff
 version: !ruby/object:Gem::Version
-  version: 0.3.5
+  version: 0.5.0
 platform: ruby
 authors:
 - Adam Gardiner
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-03-05 00:00:00.000000000 Z
+date: 2020-07-15 00:00:00.000000000 Z
 dependencies: []
 description: |2
-          This library performs diffs of CSV files (or table-like sources).
+          This library performs diffs of CSV data, or any table-like source.
           Unlike a standard diff that compares line by line, and is sensitive to the
           ordering of records, CSV-Diff identifies common lines by key field(s), and
@@ -29,7 +29,9 @@ description: |2
           sibling order.
           This gem implements the core diff algorithm, and handles the loading and
-          diffing of CSV files (or Arrays of Arrays). It returns a CSVDiff object
+          diffing of CSV files (or Arrays of Arrays). It also supports converting
+          data in XML format into tabular form, so that it can then be processed
+          like any other CSV or table-like source.  It returns a CSVDiff object
           containing the details of differences in object form. This is useful for
           projects that need diff capability, but want to handle the reporting or
           actioning of differences themselves.
@@ -48,6 +50,8 @@ files:
 - lib/csv-diff/algorithm.rb
 - lib/csv-diff/csv_diff.rb
 - lib/csv-diff/csv_source.rb
+- lib/csv-diff/source.rb
+- lib/csv-diff/xml_source.rb
 - lib/csv_diff.rb
 homepage: https://github.com/agardiner/csv-diff
 licenses:
@@ -69,8 +73,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.5.2
+rubygems_version: 2.5.2.3
 signing_key:
 specification_version: 4
-summary: CSV Diff is a library for generating diffs from data in CSV format
+summary: CSV Diff is a library for generating diffs from data in CSV or XML format
 test_files: []