RubyGems - ndr_import - Versions diffs - 10.2.0 → 11.0.0 - Mend

ndr_import 10.2.0 → 11.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +12 -0
data/README.md +1 -1
data/lib/ndr_import/file/all.rb +1 -0
data/lib/ndr_import/file/vcf.rb +25 -0
data/lib/ndr_import/file/xml.rb +22 -7
data/lib/ndr_import/helpers/file/xml_streaming.rb +34 -9
data/lib/ndr_import/non_tabular/table.rb +1 -1
data/lib/ndr_import/table.rb +4 -2
data/lib/ndr_import/universal_importer_helper.rb +8 -8
data/lib/ndr_import/vcf/table.rb +21 -0
data/lib/ndr_import/version.rb +1 -1
data/lib/ndr_import/xml/column_mapping.rb +87 -0
data/lib/ndr_import/xml/masked_mappings.rb +74 -0
data/lib/ndr_import/xml/table.rb +125 -11
data/lib/ndr_import/xml/unmapped_xpath_error.rb +15 -0
data/lib/ndr_import.rb +2 -0
metadata +28 -9

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: aaca9cf96e7433b7889c004f769997288ea1cec77322754e965b97a545ffb0ab
-  data.tar.gz: 4a1e456f8766f2ea3422b0b712b2ca4c420873248e66fc77b1db8559470a1247
+  metadata.gz: 52ae4b12ab514a7bda584c93b3124fdeab03cea5292424c0e380c92f6e3a3c1a
+  data.tar.gz: 4b9dff76aa434bb87e542bcf694d4f99a91b2cd69e2eeaf6730a32b78fe61cc9
 SHA512:
-  metadata.gz: '081ce6d8bce5dde04dca97a68057897a093bc90cca23f7336415330d92dcc599b96b606e9626358731c6638f0c3c1e41b7e763b2c03488cb76eeaa5c5c7a2cd8'
-  data.tar.gz: 802ba8017a16cc843196004854c3a82d161e76bdd4c2f61a5f93e34018dae9090212dd67c4e503a46c5ee78d6ed2da2d7e6f17192e62e4abefbd2382389d1cce
+  metadata.gz: 27e3c4578ab466ae9977727de5b972e8e5bf7e2f9fa62ab53cd60437e4d7232e5da3ce3afbcf6b94dd943b219b051ccc6177b73189e736ecf8d10f582f6f0bc9
+  data.tar.gz: b06ef69bba53f56314574ff9749011289180624d0f73f7660537610ba29e3ec01114810bc781ccb8c67440c30f1406a05c56e508299a026a7467bb7968d4cb16

data/CHANGELOG.md CHANGED Viewed

@@ -1,6 +1,18 @@
 ## [Unreleased]
+=======
 *no unreleased changes*
+## 11.0.0 / 2023-10-27
+### Changed
+* XML enhancements. Breaking change, the enhancements are not backward compatible
+### Fixed
+* Replace unsupported seven_zip_ruby gem with seven-zip fork
+## 10.3.0 / 2023-09-07
+### Added
+* VCF file support
+* Support Ruby 3.2. Drop support for Ruby 2.7, Rails 6.0
 ## 10.2.0 / 2023-05-16
 * avro file support
 * allow storage of `significant_mapped_fields` in `Table`

data/README.md CHANGED Viewed

@@ -1,7 +1,7 @@
 # NdrImport [![Build Status](https://github.com/NHSDigital/ndr_import/workflows/Test/badge.svg)](https://github.com/NHSDigital/ndr_import/actions?query=workflow%3Atest) [![Gem Version](https://badge.fury.io/rb/ndr_import.svg)](https://rubygems.org/gems/ndr_import) [![Documentation](https://img.shields.io/badge/ndr_import-docs-blue.svg)](https://www.rubydoc.info/gems/ndr_import)
 This is the NHS Digital (NHSD) National Disease Registers (NDR) Import ETL ruby gem, providing:
-1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), JSON Lines, .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip, Zip and avro files.
+1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), JSON Lines, .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip, Zip, avro and VCF files.
 2. table mappers for *transforming* tabular and non-tabular data into key value pairs grouped by a common "klass".
 ## Installation

data/lib/ndr_import/file/all.rb CHANGED Viewed

@@ -8,6 +8,7 @@ require_relative 'pdf'
 require_relative 'seven_zip'
 require_relative 'text'
 require_relative 'unregistered_filetype'
+require_relative 'vcf'
 require_relative 'word'
 require_relative 'xml'
 require_relative 'zip'

data/lib/ndr_import/file/vcf.rb ADDED Viewed

@@ -0,0 +1,25 @@
+require 'bio-vcf/vcfline'
+require 'ndr_support/safe_file'
+require_relative 'registry'
+module NdrImport
+  # This is one of a collection of file handlers that deal with individual formats of data.
+  # They can be instantiated directly or via the factory method Registry.tables
+  module File
+    # This class is a vcf file handler that returns a single table.
+    class Vcf < Base
+      private
+      def rows(&block)
+        return enum_for(:rows) unless block
+        ::File.read(@filename).each_line do |line|
+          next if line =~ /^##/
+          yield BioVcf::VcfLine.parse(line)
+        end
+      end
+    end
+    Registry.register(Vcf, 'vcf')
+  end
+end

data/lib/ndr_import/file/xml.rb CHANGED Viewed

@@ -12,22 +12,37 @@ module NdrImport
       include NdrImport::Helpers::File::Xml
       include NdrImport::Helpers::File::XmlStreaming
+      def initialize(*)
+        super
+        @pattern_match_xpath = @options['pattern_match_record_xpath']
+      end
       private
       # Iterate through the file, yielding each 'xml_record_xpath' element in turn.
       def rows(&block)
         return enum_for(:rows) unless block
-        xpath = @options['xml_record_xpath']
         if @options['slurp']
-          doc = read_xml_file(@filename)
-          doc.xpath(xpath).each(&block)
+          record_elements(read_xml_file(@filename)).each(&block)
+        else
+          each_node(@filename, xml_record_xpath, @pattern_match_xpath, &block)
+        end
+      end
+      def xml_record_xpath
+        @pattern_match_xpath ? @options['xml_record_xpath'] : "*/#{@options['xml_record_xpath']}"
+      end
+      def record_elements(doc)
+        if @pattern_match_xpath
+          doc.root.children.find_all do |element|
+            element.name =~ Regexp.new(@options['xml_record_xpath'])
+          end
         else
-          each_node(@filename, xpath, &block)
+          doc.root.xpath(@options['xml_record_xpath'])
         end
-      rescue StandardError => e
-        raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
       end
     end
     # Not all xml files may want to be registered, so 'xml' is not registered by design.

data/lib/ndr_import/helpers/file/xml_streaming.rb CHANGED Viewed

@@ -33,15 +33,20 @@ module NdrImport
           # wrapper to hold a representation of each element we descend into:
           StackItem = Struct.new(:name, :attrs, :empty)
-          def initialize(xpath)
+          def initialize(xpath, pattern_match_xpath)
             @xpath = xpath
+            @pattern_match_xpath = pattern_match_xpath
             @stack = []
             @match_depth = nil
           end
           # Has this cursor already passed inside a similar node?
+          # attribute comparison allows for e.g.:
+          # <SameName>
+          #   <SameName code="N"/>
+          # </SameName>
           def in?(node)
-            @stack.detect { |item| item.name == node.name }
+            @stack.detect { |item| item.name == node.name && item.attrs == node.attributes }
           end
           def enter(node)
@@ -85,9 +90,27 @@ module NdrImport
           def current_stack_match?
             parent_stack = @stack[0..-2]
-            return false unless dom_stubs[@stack].at_xpath(@xpath)
+            stack_match = if @pattern_match_xpath
+                            dom_stubs[@stack].root.children.find_all do |node|
+                              node.name =~ Regexp.new(@xpath)
+                            end.first
+                          else
+                            dom_stubs[@stack].at_xpath(@xpath)
+                          end
-            parent_stack.empty? || !dom_stubs[parent_stack].at_xpath(@xpath)
+            return false unless stack_match
+            parent_stack.empty? || xpath_not_in_parent_document?(dom_stubs[parent_stack])
+          end
+          def xpath_not_in_parent_document?(parent_document)
+            if @pattern_match_xpath
+              parent_document.root.children.find_all do |node|
+                node.name =~ Regexp.new(@xpath)
+              end.first.nil?
+            else
+              !parent_document.at_xpath(@xpath)
+            end
           end
           # A cached collection of DOM fragments, to represent the structure
@@ -116,13 +139,15 @@ module NdrImport
         #
         # In the case of dodgy encoding, may fall back to slurping the
         # file, but will still use stream parsing for XML.
-        def each_node(safe_path, xpath, &block)
-          return enum_for(:each_node, safe_path, xpath) unless block
+        #
+        # Optionally pattern match the xpath
+        def each_node(safe_path, xpath, pattern_match_xpath = nil, &block)
+          return enum_for(:each_node, safe_path, xpath, pattern_match_xpath) unless block
           require 'nokogiri'
           with_encoding_check(safe_path) do |stream, encoding|
-            stream_xml_nodes(stream, xpath, encoding, &block)
+            stream_xml_nodes(stream, xpath, pattern_match_xpath, encoding, &block)
           end
         end
@@ -153,9 +178,9 @@ module NdrImport
           system("iconv -f UTF-8 #{Shellwords.escape(path)} > /dev/null 2>&1")
         end
-        def stream_xml_nodes(io, node_xpath, encoding = nil)
+        def stream_xml_nodes(io, node_xpath, pattern_match_xpath, encoding = nil)
           # Track nesting as the cursor moves through the document:
-          cursor = Cursor.new(node_xpath)
+          cursor = Cursor.new(node_xpath, pattern_match_xpath)
           # If markup isn't well-formed, try to work around it:
           options = Nokogiri::XML::ParseOptions::RECOVER

data/lib/ndr_import/non_tabular/table.rb CHANGED Viewed

@@ -17,7 +17,7 @@ module NdrImport
       include UTF8Encoding
       TABULAR_ONLY_OPTIONS = %w[delimiter last_data_column liberal_parsing tablename_pattern
-                                header_lines footer_lines xml_record_xpath slurp].freeze
+                                header_lines footer_lines slurp].freeze
       NON_TABULAR_OPTIONS = %w[capture_end_line capture_start_line start_line_pattern
                                end_line_pattern remove_lines start_in_a_record

data/lib/ndr_import/table.rb CHANGED Viewed

@@ -6,13 +6,14 @@ module NdrImport
   # required to transform a table of data into "records". Particular attention
   # has been made to use enumerables throughout to help with the transformation
   # of large quantities of data.
+  # rubocop:disable Metrics/ClassLength
   class Table
     include NdrImport::Mapper
     def self.all_valid_options
       %w[canonical_name delimiter liberal_parsing filename_pattern file_password last_data_column
-         tablename_pattern header_lines footer_lines format klass columns xml_record_xpath slurp
-         row_identifier significant_mapped_fields]
+         tablename_pattern header_lines footer_lines format klass columns slurp row_identifier
+         significant_mapped_fields]
     end
     def all_valid_options
@@ -250,4 +251,5 @@ module NdrImport
       index - 1
     end
   end # class Table
+  # rubocop:enable Metrics/ClassLength
 end

data/lib/ndr_import/universal_importer_helper.rb CHANGED Viewed

@@ -52,14 +52,14 @@ module NdrImport
         # now at the individual file level, can we find the table mapping?
         table_mapping = get_table_mapping(filename, nil)
-        options = {
-          'unzip_path'       => unzip_path,
-          'col_sep'          => table_mapping.try(:delimiter),
-          'file_password'    => table_mapping.try(:file_password),
-          'liberal_parsing'  => table_mapping.try(:liberal_parsing),
-          'xml_record_xpath' => table_mapping.try(:xml_record_xpath),
-          'slurp'            => table_mapping.try(:slurp)
-        }
+        options = { 'unzip_path'                 => unzip_path,
+                    'col_sep'                    => table_mapping.try(:delimiter),
+                    'file_password'              => table_mapping.try(:file_password),
+                    'liberal_parsing'            => table_mapping.try(:liberal_parsing),
+                    'xml_record_xpath'           => table_mapping.try(:xml_record_xpath),
+                    'slurp'                      => table_mapping.try(:slurp),
+                    'yield_xml_record'           => table_mapping.try(:yield_xml_record),
+                    'pattern_match_record_xpath' => table_mapping.try(:pattern_match_record_xpath) }
         tables = NdrImport::File::Registry.tables(filename, table_mapping.try(:format), options)
         yield_tables_and_their_content(filename, tables, &block)

data/lib/ndr_import/vcf/table.rb ADDED Viewed

@@ -0,0 +1,21 @@
+require 'ndr_import/table'
+module NdrImport
+  module Vcf
+    # Syntactic sugar to ensure `header_lines` and `footer_lines` are 1 and 0 respectively.
+    # All other Table logic is inherited from `NdrImport::Table`
+    class Table < ::NdrImport::Table
+      def self.all_valid_options
+        super - %w[delimiter header_lines footer_lines]
+      end
+      def header_lines
+        1
+      end
+      def footer_lines
+        0
+      end
+    end
+  end
+end

data/lib/ndr_import/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 # This stores the current version of the NdrImport gem
 module NdrImport
-  VERSION = '10.2.0'
+  VERSION = '11.0.0'
 end

data/lib/ndr_import/xml/column_mapping.rb ADDED Viewed

@@ -0,0 +1,87 @@
+module NdrImport
+  module Xml
+    # This class generates new XML column mappings where repeating columns/sections have been
+    # identified in the xml.
+    # This avoids the need for mappings to verbosely define repeating columns/sections
+    class ColumnMapping
+      attr_accessor :existing_column, :unmapped_node_parts, :klass_increment, :xml_line, :klass,
+                    :repeating_item, :increment_field_name, :build_new_record, :klass_section_xpath
+      def initialize(existing_column, unmapped_node_parts, klass_increment, xml_line, klass)
+        @existing_column      = existing_column
+        @unmapped_node_parts  = unmapped_node_parts
+        @klass_increment      = klass_increment
+        @xml_line             = xml_line
+        @klass                = klass
+        @repeating_item       = existing_column.dig('xml_cell', 'multiple')
+        @increment_field_name = existing_column.dig('xml_cell', 'increment_field_name')
+        @build_new_record     = existing_column.dig('xml_cell', 'build_new_record')
+        @klass_section_xpath  = existing_column.dig('xml_cell', 'klass_section')
+      end
+      def call
+        new_column                              = existing_column.deep_dup
+        new_column['column']                    = unmapped_node_parts[:column_name]
+        new_column['xml_cell']['relative_path'] = unmapped_node_parts[:column_relative_path]
+        # create unique rawtext names for repeating sections within a record
+        apply_new_rawtext_and_mapped_names_to(new_column) if repeating_item
+        return new_column unless incremented_klass_needed?
+        new_column['klass'] = incremented_klass
+        new_column
+      end
+      private
+      # If a table level klass is defined, there is nothing to increment at the column level.
+      # Similarly, not all repeating sections/items require a separate record.
+      # No need to create new records for a single occurrence of a repeating section
+      def incremented_klass_needed?
+        return false if klass.present?
+        # Column mapping needs to explicitly flag when additionals should not be made
+        return false if build_new_record == false
+        return false if xml_line.xpath(klass_section_xpath).one? && repeating_item
+        true
+      end
+      def incremented_klass
+        if existing_column['klass'].is_a?(Array)
+          existing_column['klass'].map do |column_klass|
+            column_klass + "##{klass_increment}"
+          end
+        else
+          existing_column['klass'] + "##{klass_increment}"
+        end
+      end
+      # Append "_1", "_2" etc to repeating rawtext and optionally mapped field names within a
+      # single record, so data is not overwritten
+      def apply_new_rawtext_and_mapped_names_to(new_column)
+        existing_rawtext        = existing_column['rawtext_name'] || existing_column['column']
+        column_name_increment   = new_column['column'].scan(/\[(\d+)\]/)
+        relative_path_increment = new_column.dig('xml_cell', 'relative_path').scan(/\[(\d+)\]/)
+        # Find all the increments (e.g. [1], [2]) from the new column and use their sum
+        # as the rawtext and column name increment
+        increment = (column_name_increment + relative_path_increment).flatten.map(&:to_i).sum
+        new_column['rawtext_name'] = existing_rawtext + "_#{increment}" unless increment.zero?
+        return unless !increment.zero? && increment_field_name
+        new_column['mappings'] = incremented_mappings_for(new_column, increment)
+      end
+      # Increment the mapped `field` names
+      def incremented_mappings_for(new_column, increment)
+        new_column['mappings'].map do |mapping|
+          mapping['field'] = "#{mapping['field']}_#{increment}"
+          mapping
+        end
+      end
+    end
+  end
+end

data/lib/ndr_import/xml/masked_mappings.rb ADDED Viewed

@@ -0,0 +1,74 @@
+module NdrImport
+  module Xml
+    # This class applies a do_not_capture mask to those mappings that do not relate to each klass.
+    # Overriding the NdrImport::Table method to avoid memoizing. This is by design: column mappings
+    # can change if new mappings are added on the fly where repeating sections are present
+    class MaskedMappings
+      attr_accessor :klass, :augmented_columns
+      def initialize(klass, augmented_columns)
+        @klass             = klass
+        @augmented_columns = augmented_columns
+      end
+      def call
+        return { klass => augmented_columns } if klass.present?
+        masked_mappings = column_level_klass_masked_mappings
+        augmented_masked_mappings = masked_mappings
+        # Remove any masked klasses where additional columns mappings
+        # have been added for repeated sections
+        # e.g. SomeTestKlass column mappings are not needed if SomeTestKlass#1
+        # have been added
+        masked_mappings.each do |masked_key, columns|
+          # There may be occasions where the e.g. SomeTestKlass should be kept,
+          # This can be flagged in one of the klass's column mappings
+          next if columns.any? { |column| column.dig('xml_cell', 'keep_klass') }
+          if masked_mappings.keys.any? { |key| key =~ /\A#{masked_key}#\d+\z/ }
+            augmented_masked_mappings.delete(masked_key)
+          end
+        end
+        augmented_masked_mappings
+      end
+      private
+      # This method duplicates the mappings and applies a do_not_capture mask to those that do not
+      # relate to this klass, returning the masked mappings
+      def mask_mappings_by_klass(klass)
+        augmented_columns.deep_dup.map do |mapping|
+          Array(mapping['klass']).flatten.include?(klass) ? mapping : { 'do_not_capture' => true }
+        end
+      end
+      def column_level_klass_masked_mappings
+        ensure_mappings_define_klass
+        # Loop through each klass
+        masked_mappings = {}
+        augmented_columns.pluck('klass').flatten.compact.uniq.each do |klass|
+          # Do not capture fields that relate to other klasses
+          masked_mappings[klass] = mask_mappings_by_klass(klass)
+        end
+        masked_mappings
+      end
+      # This method ensures that every column mapping defines a klass (unless it is a column that
+      # we do not capture). It is only used where a table level klass is not defined.
+      def ensure_mappings_define_klass
+        klassless_mappings = augmented_columns.
+                             select { |mapping| mapping.nil? || mapping['klass'].nil? }.
+                             reject { |mapping| mapping['do_not_capture'] }.
+                             map { |mapping| mapping['column'] || mapping['standard_mapping'] }
+        return if klassless_mappings.empty?
+        # All column mappings for the single item file require a klass definition.
+        raise "Missing klass for column(s): #{klassless_mappings.to_sentence}"
+      end
+    end
+  end
+end

data/lib/ndr_import/xml/table.rb CHANGED Viewed

@@ -7,10 +7,17 @@ module NdrImport
     # attention has been made to use enumerables throughout to help with the
     # transformation of large quantities of data.
     class Table < ::NdrImport::Table
+      require 'ndr_import/xml/column_mapping'
+      require 'ndr_import/xml/masked_mappings'
+      XML_OPTIONS = %w[pattern_match_record_xpath xml_record_xpath yield_xml_record].freeze
       def self.all_valid_options
-        super - %w[delimiter header_lines footer_lines]
+        super - %w[delimiter header_lines footer_lines] + XML_OPTIONS
       end
+      attr_reader(*XML_OPTIONS)
       def header_lines
         0
       end
@@ -24,26 +31,130 @@ module NdrImport
       # and fields for each mapped klass.
       def transform_line(line, index)
         return enum_for(:transform_line, line, index) unless block_given?
         raise 'Not an Nokogiri::XML::Element!' unless line.is_a? Nokogiri::XML::Element
-        validate_column_mappings(line)
+        augmented_masked_mappings = augment_and_validate_column_mappings_for(line)
-        xml_line = column_xpaths.map { |column_xpath| line.xpath(column_xpath).inner_text }
+        xml_line = xml_line_from(line)
-        masked_mappings.each do |klass, klass_mappings|
+        records_from_xml_line = []
+        augmented_masked_mappings.each do |klass, klass_mappings|
           fields = mapped_line(xml_line, klass_mappings)
           next if fields[:skip].to_s == 'true'.freeze
-          yield(klass, fields, index)
+          if yield_xml_record
+            records_from_xml_line << [klass, fields, index]
+          else
+            yield(klass, fields, index)
+          end
         end
+        yield(records_from_xml_line.compact) if yield_xml_record
       end
       private
+      def augment_and_validate_column_mappings_for(line)
+        augment_column_mappings_for(line)
+        validate_column_mappings(line)
+        NdrImport::Xml::MaskedMappings.new(@klass, @augmented_columns.deep_dup).call
+      end
+      # Add missing column mappings (and column_xpaths) where
+      # repeating sections / data items appear
+      def augment_column_mappings_for(line)
+        # Start with a fresh set of @augmented_columns for each line, adding new mappings as
+        # required for each `line`
+        @augmented_columns       = @columns.deep_dup
+        @augmented_column_xpaths = column_xpaths.deep_dup
+        unmapped_xpaths(line).each do |unmapped_xpath|
+          existing_column = find_existing_column_for(unmapped_xpath.dup)
+          next unless existing_column
+          unmapped_xpath_hash   = labelled_xpath_components_from(unmapped_xpath)
+          klass_increment_match = unmapped_xpath.match(/\[(\d+)\]/)
+          raise "could not identify klass for #{unmapped_xpath}" unless klass_increment_match
+          new_column = NdrImport::Xml::ColumnMapping.new(existing_column, unmapped_xpath_hash,
+                                                         klass_increment_match[1], line,
+                                                         @klass).call
+          @augmented_columns << new_column
+          @augmented_column_xpaths << build_xpath_from(new_column)
+        end
+      end
+      def xml_line_from(line)
+        @augmented_column_xpaths.map do |column_xpath|
+          # Augmenting the column mappings should account for repeating sections/items
+          # TODO: Is this needed now that we removed "duplicated" klass mappings?
+          line.xpath(column_xpath).count > 1 ? '' : line.xpath(column_xpath).inner_text
+        end
+      end
+      def find_existing_column_for(unmapped_xpath)
+        # Remove any e.g. [2] which will be present on repeating sections
+        unmapped_xpath.gsub!(/\[\d+\]/, '')
+        unmapped_xpath_hash = labelled_xpath_components_from(unmapped_xpath)
+        columns.detect do |column|
+          column['column'] == unmapped_xpath_hash[:column_name] &&
+            column.dig('xml_cell', 'relative_path') == unmapped_xpath_hash[:column_relative_path] &&
+            column.dig('xml_cell', 'attribute') == unmapped_xpath_hash[:column_attribute]
+        end
+      end
+      # Returns a Hash containing labelled components for the given `unmapped_xpath`
+      # For example, an `unmapped_xpath` of "Record/Demographics/Sex/@code" would result in:
+      # { column_attribute: '@code',
+      #   column_name: 'Sex',
+      #   column_relative_path: 'Record/Demographics' }
+      def labelled_xpath_components_from(unmapped_xpath)
+        xpath_components = unmapped_xpath.split('/')
+        column_attribute = new_column_attribute_from(xpath_components)
+        # I dislike the `EnforcedShorthandSyntax`, code is less readable
+        # rubocop:disable Style::HashSyntax
+        { column_attribute: column_attribute,
+          column_name: new_column_name_from(xpath_components, column_attribute),
+          column_relative_path: new_relative_path_from(xpath_components, column_attribute) }
+        # rubocop:enable Style::HashSyntax
+      end
+      def new_column_attribute_from(xpath_components)
+        xpath_components.last.starts_with?('@') ? xpath_components.last[1...] : nil
+      end
+      def new_column_name_from(xpath_components, column_attribute)
+        return xpath_components[-2] if column_attribute.present?
+        xpath_components.last
+      end
+      # xpaths can be e.g. Record/Demographics/Sex/@code or Record/Demographics/Surname
+      # `xpath_components` is an array of the xpath's components, for example:
+      # Record/Demographics/Sex/@code => ['Record', 'Demographics', 'Sex', '@code']
+      #
+      # For the relative path, we want to return Record/Demographics.
+      # The upper_limit removes the "field name" (Sex or Surname here) and optionally the
+      # attribute (@code here) if present, from `xpath_components`.
+      # The resulting array is joined back together to form the relative path.
+      def new_relative_path_from(xpath_components, column_attribute)
+        upper_limit = column_attribute.present? ? -3 : -2
+        xpath_components.count > 1 ? xpath_components[0..upper_limit].join('/') : nil
+      end
       # Ensure every leaf is accounted for in the column mappings
       def validate_column_mappings(line)
-        missing_nodes = mappable_xpaths_from(line) - column_xpaths
-        raise "Unmapped data! #{missing_nodes}" unless missing_nodes.empty?
+        missing_xpaths = unmapped_xpaths(line)
+        return if missing_xpaths.none?
+        raise(NdrImport::Xml::UnmappedXpathError, missing_xpaths.to_sentence)
+      end
+      # Not memoized by design: we want to re-calculate unmapped xpaths after
+      # `@augmented_column_xpaths` have been augmented for each `line`
+      def unmapped_xpaths(line)
+        mappable_xpaths_from(line) - (@augmented_column_xpaths || column_xpaths)
       end
       def column_name_from(column)
@@ -58,9 +169,12 @@ module NdrImport
         xpaths = []
         line.xpath('.//*[not(child::*)]').each do |node|
-          xpath = node.path.sub(line.path + '/', '')
-          xpaths << xpath
-          node.attributes.each_key { |key| xpaths << "#{xpath}/@#{key}" }
+          xpath = node.path.sub("#{line.path}/", '')
+          if node.attributes.any?
+            node.attributes.each_key { |key| xpaths << "#{xpath}/@#{key}" }
+          else
+            xpaths << xpath
+          end
         end
         xpaths
       end

data/lib/ndr_import/xml/unmapped_xpath_error.rb ADDED Viewed

@@ -0,0 +1,15 @@
+module NdrImport
+  module Xml
+    # Raised if an unmapped xpath is identified
+    class UnmappedXpathError < StandardError
+      attr_reader :missing_xpaths
+      def initialize(missing_xpaths)
+        @missing_xpaths = missing_xpaths
+        message         = "Unmapped xpath(s): #{missing_xpaths}"
+        super(message)
+      end
+    end
+  end
+end

data/lib/ndr_import.rb CHANGED Viewed

@@ -10,8 +10,10 @@ require 'ndr_import/fixed_width/table'
 require 'ndr_import/xml/table'
 require 'ndr_import/pdf_form/table'
 require 'ndr_import/avro/table'
+require 'ndr_import/vcf/table'
 require 'ndr_import/unmapped_data_error'
 require 'ndr_import/acroform_reader'
+require 'ndr_import/xml/unmapped_xpath_error'
 module NdrImport
   def self.root

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ndr_import
 version: !ruby/object:Gem::Version
-  version: 10.2.0
+  version: 11.0.0
 platform: ruby
 authors:
 - NCRS Development Team
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-05-18 00:00:00.000000000 Z
+date: 2023-10-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activemodel
@@ -30,7 +30,7 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: '6.0'
+        version: '6.1'
     - - "<"
       - !ruby/object:Gem::Version
         version: '7.1'
@@ -40,7 +40,7 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: '6.0'
+        version: '6.1'
     - - "<"
       - !ruby/object:Gem::Version
         version: '7.1'
@@ -106,6 +106,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: 1.11.0
+- !ruby/object:Gem::Dependency
+  name: bio-vcf
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.5
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.5
 - !ruby/object:Gem::Dependency
   name: docx
   requirement: !ruby/object:Gem::Requirement
@@ -197,19 +211,19 @@ dependencies:
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
-  name: seven_zip_ruby
+  name: seven-zip
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.3'
+        version: '1.4'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.3'
+        version: '1.4'
 - !ruby/object:Gem::Dependency
   name: spreadsheet
   requirement: !ruby/object:Gem::Requirement
@@ -399,6 +413,7 @@ files:
 - lib/ndr_import/file/seven_zip.rb
 - lib/ndr_import/file/text.rb
 - lib/ndr_import/file/unregistered_filetype.rb
+- lib/ndr_import/file/vcf.rb
 - lib/ndr_import/file/word.rb
 - lib/ndr_import/file/xml.rb
 - lib/ndr_import/file/zip.rb
@@ -424,9 +439,13 @@ files:
 - lib/ndr_import/table.rb
 - lib/ndr_import/universal_importer_helper.rb
 - lib/ndr_import/unmapped_data_error.rb
+- lib/ndr_import/vcf/table.rb
 - lib/ndr_import/version.rb
+- lib/ndr_import/xml/column_mapping.rb
 - lib/ndr_import/xml/control_char_escaper.rb
+- lib/ndr_import/xml/masked_mappings.rb
 - lib/ndr_import/xml/table.rb
+- lib/ndr_import/xml/unmapped_xpath_error.rb
 homepage: https://github.com/NHSDigital/ndr_import
 licenses:
 - MIT
@@ -439,14 +458,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '2.7'
+      version: '3.0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.2.33
+rubygems_version: 3.4.10
 signing_key:
 specification_version: 4
 summary: NDR Import