ndr_import 10.2.0 → 11.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: aaca9cf96e7433b7889c004f769997288ea1cec77322754e965b97a545ffb0ab
4
- data.tar.gz: 4a1e456f8766f2ea3422b0b712b2ca4c420873248e66fc77b1db8559470a1247
3
+ metadata.gz: 52ae4b12ab514a7bda584c93b3124fdeab03cea5292424c0e380c92f6e3a3c1a
4
+ data.tar.gz: 4b9dff76aa434bb87e542bcf694d4f99a91b2cd69e2eeaf6730a32b78fe61cc9
5
5
  SHA512:
6
- metadata.gz: '081ce6d8bce5dde04dca97a68057897a093bc90cca23f7336415330d92dcc599b96b606e9626358731c6638f0c3c1e41b7e763b2c03488cb76eeaa5c5c7a2cd8'
7
- data.tar.gz: 802ba8017a16cc843196004854c3a82d161e76bdd4c2f61a5f93e34018dae9090212dd67c4e503a46c5ee78d6ed2da2d7e6f17192e62e4abefbd2382389d1cce
6
+ metadata.gz: 27e3c4578ab466ae9977727de5b972e8e5bf7e2f9fa62ab53cd60437e4d7232e5da3ce3afbcf6b94dd943b219b051ccc6177b73189e736ecf8d10f582f6f0bc9
7
+ data.tar.gz: b06ef69bba53f56314574ff9749011289180624d0f73f7660537610ba29e3ec01114810bc781ccb8c67440c30f1406a05c56e508299a026a7467bb7968d4cb16
data/CHANGELOG.md CHANGED
@@ -1,6 +1,18 @@
1
1
  ## [Unreleased]
2
+
2
3
  *no unreleased changes*
3
4
 
5
+ ## 11.0.0 / 2023-10-27
6
+ ### Changed
7
+ * XML enhancements. Breaking change: the enhancements are not backward compatible
8
+ ### Fixed
9
+ * Replace unsupported seven_zip_ruby gem with seven-zip fork
10
+
11
+ ## 10.3.0 / 2023-09-07
12
+ ### Added
13
+ * VCF file support
14
+ * Support Ruby 3.2. Drop support for Ruby 2.7, Rails 6.0
15
+
4
16
  ## 10.2.0 / 2023-05-16
5
17
  * avro file support
6
18
  * allow storage of `significant_mapped_fields` in `Table`
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # NdrImport [![Build Status](https://github.com/NHSDigital/ndr_import/workflows/Test/badge.svg)](https://github.com/NHSDigital/ndr_import/actions?query=workflow%3Atest) [![Gem Version](https://badge.fury.io/rb/ndr_import.svg)](https://rubygems.org/gems/ndr_import) [![Documentation](https://img.shields.io/badge/ndr_import-docs-blue.svg)](https://www.rubydoc.info/gems/ndr_import)
2
2
  This is the NHS Digital (NHSD) National Disease Registers (NDR) Import ETL ruby gem, providing:
3
3
 
4
- 1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), JSON Lines, .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip, Zip and avro files.
4
+ 1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), JSON Lines, .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip, Zip, avro and VCF files.
5
5
  2. table mappers for *transforming* tabular and non-tabular data into key value pairs grouped by a common "klass".
6
6
 
7
7
  ## Installation
@@ -8,6 +8,7 @@ require_relative 'pdf'
8
8
  require_relative 'seven_zip'
9
9
  require_relative 'text'
10
10
  require_relative 'unregistered_filetype'
11
+ require_relative 'vcf'
11
12
  require_relative 'word'
12
13
  require_relative 'xml'
13
14
  require_relative 'zip'
@@ -0,0 +1,25 @@
1
+ require 'bio-vcf/vcfline'
2
+ require 'ndr_support/safe_file'
3
+ require_relative 'registry'
4
+
5
+ module NdrImport
6
+ # This is one of a collection of file handlers that deal with individual formats of data.
7
+ # They can be instantiated directly or via the factory method Registry.tables
8
+ module File
9
+ # This class is a vcf file handler that returns a single table.
10
+ class Vcf < Base
11
+ private
12
+
13
+ def rows(&block)
14
+ return enum_for(:rows) unless block
15
+
16
+ ::File.read(@filename).each_line do |line|
17
+ next if line =~ /^##/
18
+
19
+ yield BioVcf::VcfLine.parse(line)
20
+ end
21
+ end
22
+ end
23
+ Registry.register(Vcf, 'vcf')
24
+ end
25
+ end
@@ -12,22 +12,37 @@ module NdrImport
12
12
  include NdrImport::Helpers::File::Xml
13
13
  include NdrImport::Helpers::File::XmlStreaming
14
14
 
15
+ def initialize(*)
16
+ super
17
+
18
+ @pattern_match_xpath = @options['pattern_match_record_xpath']
19
+ end
20
+
15
21
  private
16
22
 
17
23
  # Iterate through the file, yielding each 'xml_record_xpath' element in turn.
18
24
  def rows(&block)
19
25
  return enum_for(:rows) unless block
20
26
 
21
- xpath = @options['xml_record_xpath']
22
-
23
27
  if @options['slurp']
24
- doc = read_xml_file(@filename)
25
- doc.xpath(xpath).each(&block)
28
+ record_elements(read_xml_file(@filename)).each(&block)
29
+ else
30
+ each_node(@filename, xml_record_xpath, @pattern_match_xpath, &block)
31
+ end
32
+ end
33
+
34
+ def xml_record_xpath
35
+ @pattern_match_xpath ? @options['xml_record_xpath'] : "*/#{@options['xml_record_xpath']}"
36
+ end
37
+
38
+ def record_elements(doc)
39
+ if @pattern_match_xpath
40
+ doc.root.children.find_all do |element|
41
+ element.name =~ Regexp.new(@options['xml_record_xpath'])
42
+ end
26
43
  else
27
- each_node(@filename, xpath, &block)
44
+ doc.root.xpath(@options['xml_record_xpath'])
28
45
  end
29
- rescue StandardError => e
30
- raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
31
46
  end
32
47
  end
33
48
  # Not all xml files may want to be registered, so 'xml' is not registered by design.
@@ -33,15 +33,20 @@ module NdrImport
33
33
  # wrapper to hold a representation of each element we descend into:
34
34
  StackItem = Struct.new(:name, :attrs, :empty)
35
35
 
36
- def initialize(xpath)
36
+ def initialize(xpath, pattern_match_xpath)
37
37
  @xpath = xpath
38
+ @pattern_match_xpath = pattern_match_xpath
38
39
  @stack = []
39
40
  @match_depth = nil
40
41
  end
41
42
 
42
43
  # Has this cursor already passed inside a similar node?
44
+ # attribute comparison allows for e.g.:
45
+ # <SameName>
46
+ # <SameName code="N"/>
47
+ # </SameName>
43
48
  def in?(node)
44
- @stack.detect { |item| item.name == node.name }
49
+ @stack.detect { |item| item.name == node.name && item.attrs == node.attributes }
45
50
  end
46
51
 
47
52
  def enter(node)
@@ -85,9 +90,27 @@ module NdrImport
85
90
  def current_stack_match?
86
91
  parent_stack = @stack[0..-2]
87
92
 
88
- return false unless dom_stubs[@stack].at_xpath(@xpath)
93
+ stack_match = if @pattern_match_xpath
94
+ dom_stubs[@stack].root.children.find_all do |node|
95
+ node.name =~ Regexp.new(@xpath)
96
+ end.first
97
+ else
98
+ dom_stubs[@stack].at_xpath(@xpath)
99
+ end
89
100
 
90
- parent_stack.empty? || !dom_stubs[parent_stack].at_xpath(@xpath)
101
+ return false unless stack_match
102
+
103
+ parent_stack.empty? || xpath_not_in_parent_document?(dom_stubs[parent_stack])
104
+ end
105
+
106
+ def xpath_not_in_parent_document?(parent_document)
107
+ if @pattern_match_xpath
108
+ parent_document.root.children.find_all do |node|
109
+ node.name =~ Regexp.new(@xpath)
110
+ end.first.nil?
111
+ else
112
+ !parent_document.at_xpath(@xpath)
113
+ end
91
114
  end
92
115
 
93
116
  # A cached collection of DOM fragments, to represent the structure
@@ -116,13 +139,15 @@ module NdrImport
116
139
  #
117
140
  # In the case of dodgy encoding, may fall back to slurping the
118
141
  # file, but will still use stream parsing for XML.
119
- def each_node(safe_path, xpath, &block)
120
- return enum_for(:each_node, safe_path, xpath) unless block
142
+ #
143
+ # Optionally pattern match the xpath
144
+ def each_node(safe_path, xpath, pattern_match_xpath = nil, &block)
145
+ return enum_for(:each_node, safe_path, xpath, pattern_match_xpath) unless block
121
146
 
122
147
  require 'nokogiri'
123
148
 
124
149
  with_encoding_check(safe_path) do |stream, encoding|
125
- stream_xml_nodes(stream, xpath, encoding, &block)
150
+ stream_xml_nodes(stream, xpath, pattern_match_xpath, encoding, &block)
126
151
  end
127
152
  end
128
153
 
@@ -153,9 +178,9 @@ module NdrImport
153
178
  system("iconv -f UTF-8 #{Shellwords.escape(path)} > /dev/null 2>&1")
154
179
  end
155
180
 
156
- def stream_xml_nodes(io, node_xpath, encoding = nil)
181
+ def stream_xml_nodes(io, node_xpath, pattern_match_xpath, encoding = nil)
157
182
  # Track nesting as the cursor moves through the document:
158
- cursor = Cursor.new(node_xpath)
183
+ cursor = Cursor.new(node_xpath, pattern_match_xpath)
159
184
 
160
185
  # If markup isn't well-formed, try to work around it:
161
186
  options = Nokogiri::XML::ParseOptions::RECOVER
@@ -17,7 +17,7 @@ module NdrImport
17
17
  include UTF8Encoding
18
18
 
19
19
  TABULAR_ONLY_OPTIONS = %w[delimiter last_data_column liberal_parsing tablename_pattern
20
- header_lines footer_lines xml_record_xpath slurp].freeze
20
+ header_lines footer_lines slurp].freeze
21
21
 
22
22
  NON_TABULAR_OPTIONS = %w[capture_end_line capture_start_line start_line_pattern
23
23
  end_line_pattern remove_lines start_in_a_record
@@ -6,13 +6,14 @@ module NdrImport
6
6
  # required to transform a table of data into "records". Particular attention
7
7
  # has been made to use enumerables throughout to help with the transformation
8
8
  # of large quantities of data.
9
+ # rubocop:disable Metrics/ClassLength
9
10
  class Table
10
11
  include NdrImport::Mapper
11
12
 
12
13
  def self.all_valid_options
13
14
  %w[canonical_name delimiter liberal_parsing filename_pattern file_password last_data_column
14
- tablename_pattern header_lines footer_lines format klass columns xml_record_xpath slurp
15
- row_identifier significant_mapped_fields]
15
+ tablename_pattern header_lines footer_lines format klass columns slurp row_identifier
16
+ significant_mapped_fields]
16
17
  end
17
18
 
18
19
  def all_valid_options
@@ -250,4 +251,5 @@ module NdrImport
250
251
  index - 1
251
252
  end
252
253
  end # class Table
254
+ # rubocop:enable Metrics/ClassLength
253
255
  end
@@ -52,14 +52,14 @@ module NdrImport
52
52
  # now at the individual file level, can we find the table mapping?
53
53
  table_mapping = get_table_mapping(filename, nil)
54
54
 
55
- options = {
56
- 'unzip_path' => unzip_path,
57
- 'col_sep' => table_mapping.try(:delimiter),
58
- 'file_password' => table_mapping.try(:file_password),
59
- 'liberal_parsing' => table_mapping.try(:liberal_parsing),
60
- 'xml_record_xpath' => table_mapping.try(:xml_record_xpath),
61
- 'slurp' => table_mapping.try(:slurp)
62
- }
55
+ options = { 'unzip_path' => unzip_path,
56
+ 'col_sep' => table_mapping.try(:delimiter),
57
+ 'file_password' => table_mapping.try(:file_password),
58
+ 'liberal_parsing' => table_mapping.try(:liberal_parsing),
59
+ 'xml_record_xpath' => table_mapping.try(:xml_record_xpath),
60
+ 'slurp' => table_mapping.try(:slurp),
61
+ 'yield_xml_record' => table_mapping.try(:yield_xml_record),
62
+ 'pattern_match_record_xpath' => table_mapping.try(:pattern_match_record_xpath) }
63
63
 
64
64
  tables = NdrImport::File::Registry.tables(filename, table_mapping.try(:format), options)
65
65
  yield_tables_and_their_content(filename, tables, &block)
@@ -0,0 +1,21 @@
1
+ require 'ndr_import/table'
2
+
3
+ module NdrImport
4
+ module Vcf
5
+ # Syntactic sugar to ensure `header_lines` and `footer_lines` are 1 and 0 respectively.
6
+ # All other Table logic is inherited from `NdrImport::Table`
7
+ class Table < ::NdrImport::Table
8
+ def self.all_valid_options
9
+ super - %w[delimiter header_lines footer_lines]
10
+ end
11
+
12
+ def header_lines
13
+ 1
14
+ end
15
+
16
+ def footer_lines
17
+ 0
18
+ end
19
+ end
20
+ end
21
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
  # This stores the current version of the NdrImport gem
3
3
  module NdrImport
4
- VERSION = '10.2.0'
4
+ VERSION = '11.0.0'
5
5
  end
@@ -0,0 +1,87 @@
1
+ module NdrImport
2
+ module Xml
3
+ # This class generates new XML column mappings where repeating columns/sections have been
4
+ # identified in the xml.
5
+ # This avoids the need for mappings to verbosely define repeating columns/sections
6
+ class ColumnMapping
7
+ attr_accessor :existing_column, :unmapped_node_parts, :klass_increment, :xml_line, :klass,
8
+ :repeating_item, :increment_field_name, :build_new_record, :klass_section_xpath
9
+
10
+ def initialize(existing_column, unmapped_node_parts, klass_increment, xml_line, klass)
11
+ @existing_column = existing_column
12
+ @unmapped_node_parts = unmapped_node_parts
13
+ @klass_increment = klass_increment
14
+ @xml_line = xml_line
15
+ @klass = klass
16
+ @repeating_item = existing_column.dig('xml_cell', 'multiple')
17
+ @increment_field_name = existing_column.dig('xml_cell', 'increment_field_name')
18
+ @build_new_record = existing_column.dig('xml_cell', 'build_new_record')
19
+ @klass_section_xpath = existing_column.dig('xml_cell', 'klass_section')
20
+ end
21
+
22
+ def call
23
+ new_column = existing_column.deep_dup
24
+ new_column['column'] = unmapped_node_parts[:column_name]
25
+ new_column['xml_cell']['relative_path'] = unmapped_node_parts[:column_relative_path]
26
+
27
+ # create unique rawtext names for repeating sections within a record
28
+ apply_new_rawtext_and_mapped_names_to(new_column) if repeating_item
29
+
30
+ return new_column unless incremented_klass_needed?
31
+
32
+ new_column['klass'] = incremented_klass
33
+ new_column
34
+ end
35
+
36
+ private
37
+
38
+ # If a table level klass is defined, there is nothing to increment at the column level.
39
+ # Similarly, not all repeating sections/items require a separate record.
40
+ # No need to create new records for a single occurrence of a repeating section
41
+ def incremented_klass_needed?
42
+ return false if klass.present?
43
+ # Column mapping needs to explicitly flag when additional records should not be built
44
+ return false if build_new_record == false
45
+ return false if xml_line.xpath(klass_section_xpath).one? && repeating_item
46
+
47
+ true
48
+ end
49
+
50
+ def incremented_klass
51
+ if existing_column['klass'].is_a?(Array)
52
+ existing_column['klass'].map do |column_klass|
53
+ column_klass + "##{klass_increment}"
54
+ end
55
+ else
56
+ existing_column['klass'] + "##{klass_increment}"
57
+ end
58
+ end
59
+
60
+ # Append "_1", "_2" etc to repeating rawtext and optionally mapped field names within a
61
+ # single record, so data is not overwritten
62
+ def apply_new_rawtext_and_mapped_names_to(new_column)
63
+ existing_rawtext = existing_column['rawtext_name'] || existing_column['column']
64
+ column_name_increment = new_column['column'].scan(/\[(\d+)\]/)
65
+ relative_path_increment = new_column.dig('xml_cell', 'relative_path').scan(/\[(\d+)\]/)
66
+
67
+ # Find all the increments (e.g. [1], [2]) from the new column and use their sum
68
+ # as the rawtext and column name increment
69
+ increment = (column_name_increment + relative_path_increment).flatten.map(&:to_i).sum
70
+ new_column['rawtext_name'] = existing_rawtext + "_#{increment}" unless increment.zero?
71
+
72
+ return unless !increment.zero? && increment_field_name
73
+
74
+ new_column['mappings'] = incremented_mappings_for(new_column, increment)
75
+ end
76
+
77
+ # Increment the mapped `field` names
78
+ def incremented_mappings_for(new_column, increment)
79
+ new_column['mappings'].map do |mapping|
80
+ mapping['field'] = "#{mapping['field']}_#{increment}"
81
+
82
+ mapping
83
+ end
84
+ end
85
+ end
86
+ end
87
+ end
@@ -0,0 +1,74 @@
1
+ module NdrImport
2
+ module Xml
3
+ # This class applies a do_not_capture mask to those mappings that do not relate to each klass.
4
+ # Overriding the NdrImport::Table method to avoid memoizing. This is by design: column mappings
5
+ # can change if new mappings are added on the fly where repeating sections are present
6
+ class MaskedMappings
7
+ attr_accessor :klass, :augmented_columns
8
+
9
+ def initialize(klass, augmented_columns)
10
+ @klass = klass
11
+ @augmented_columns = augmented_columns
12
+ end
13
+
14
+ def call
15
+ return { klass => augmented_columns } if klass.present?
16
+
17
+ masked_mappings = column_level_klass_masked_mappings
18
+
19
+ augmented_masked_mappings = masked_mappings
20
+ # Remove any masked klasses where additional columns mappings
21
+ # have been added for repeated sections
22
+ # e.g. SomeTestKlass column mappings are not needed if SomeTestKlass#1
23
+ # have been added
24
+ masked_mappings.each do |masked_key, columns|
25
+ # There may be occasions where e.g. SomeTestKlass should be kept.
26
+ # This can be flagged in one of the klass's column mappings
27
+ next if columns.any? { |column| column.dig('xml_cell', 'keep_klass') }
28
+
29
+ if masked_mappings.keys.any? { |key| key =~ /\A#{masked_key}#\d+\z/ }
30
+ augmented_masked_mappings.delete(masked_key)
31
+ end
32
+ end
33
+
34
+ augmented_masked_mappings
35
+ end
36
+
37
+ private
38
+
39
+ # This method duplicates the mappings and applies a do_not_capture mask to those that do not
40
+ # relate to this klass, returning the masked mappings
41
+ def mask_mappings_by_klass(klass)
42
+ augmented_columns.deep_dup.map do |mapping|
43
+ Array(mapping['klass']).flatten.include?(klass) ? mapping : { 'do_not_capture' => true }
44
+ end
45
+ end
46
+
47
+ def column_level_klass_masked_mappings
48
+ ensure_mappings_define_klass
49
+
50
+ # Loop through each klass
51
+ masked_mappings = {}
52
+ augmented_columns.pluck('klass').flatten.compact.uniq.each do |klass|
53
+ # Do not capture fields that relate to other klasses
54
+ masked_mappings[klass] = mask_mappings_by_klass(klass)
55
+ end
56
+ masked_mappings
57
+ end
58
+
59
+ # This method ensures that every column mapping defines a klass (unless it is a column that
60
+ # we do not capture). It is only used where a table level klass is not defined.
61
+ def ensure_mappings_define_klass
62
+ klassless_mappings = augmented_columns.
63
+ select { |mapping| mapping.nil? || mapping['klass'].nil? }.
64
+ reject { |mapping| mapping['do_not_capture'] }.
65
+ map { |mapping| mapping['column'] || mapping['standard_mapping'] }
66
+
67
+ return if klassless_mappings.empty?
68
+
69
+ # All column mappings for the single item file require a klass definition.
70
+ raise "Missing klass for column(s): #{klassless_mappings.to_sentence}"
71
+ end
72
+ end
73
+ end
74
+ end
@@ -7,10 +7,17 @@ module NdrImport
7
7
  # attention has been made to use enumerables throughout to help with the
8
8
  # transformation of large quantities of data.
9
9
  class Table < ::NdrImport::Table
10
+ require 'ndr_import/xml/column_mapping'
11
+ require 'ndr_import/xml/masked_mappings'
12
+
13
+ XML_OPTIONS = %w[pattern_match_record_xpath xml_record_xpath yield_xml_record].freeze
14
+
10
15
  def self.all_valid_options
11
- super - %w[delimiter header_lines footer_lines]
16
+ super - %w[delimiter header_lines footer_lines] + XML_OPTIONS
12
17
  end
13
18
 
19
+ attr_reader(*XML_OPTIONS)
20
+
14
21
  def header_lines
15
22
  0
16
23
  end
@@ -24,26 +31,130 @@ module NdrImport
24
31
  # and fields for each mapped klass.
25
32
  def transform_line(line, index)
26
33
  return enum_for(:transform_line, line, index) unless block_given?
27
-
28
34
  raise 'Not an Nokogiri::XML::Element!' unless line.is_a? Nokogiri::XML::Element
29
35
 
30
- validate_column_mappings(line)
36
+ augmented_masked_mappings = augment_and_validate_column_mappings_for(line)
31
37
 
32
- xml_line = column_xpaths.map { |column_xpath| line.xpath(column_xpath).inner_text }
38
+ xml_line = xml_line_from(line)
33
39
 
34
- masked_mappings.each do |klass, klass_mappings|
40
+ records_from_xml_line = []
41
+ augmented_masked_mappings.each do |klass, klass_mappings|
35
42
  fields = mapped_line(xml_line, klass_mappings)
43
+
36
44
  next if fields[:skip].to_s == 'true'.freeze
37
- yield(klass, fields, index)
45
+
46
+ if yield_xml_record
47
+ records_from_xml_line << [klass, fields, index]
48
+ else
49
+ yield(klass, fields, index)
50
+ end
38
51
  end
52
+ yield(records_from_xml_line.compact) if yield_xml_record
39
53
  end
40
54
 
41
55
  private
42
56
 
57
+ def augment_and_validate_column_mappings_for(line)
58
+ augment_column_mappings_for(line)
59
+ validate_column_mappings(line)
60
+
61
+ NdrImport::Xml::MaskedMappings.new(@klass, @augmented_columns.deep_dup).call
62
+ end
63
+
64
+ # Add missing column mappings (and column_xpaths) where
65
+ # repeating sections / data items appear
66
+ def augment_column_mappings_for(line)
67
+ # Start with a fresh set of @augmented_columns for each line, adding new mappings as
68
+ # required for each `line`
69
+ @augmented_columns = @columns.deep_dup
70
+ @augmented_column_xpaths = column_xpaths.deep_dup
71
+
72
+ unmapped_xpaths(line).each do |unmapped_xpath|
73
+ existing_column = find_existing_column_for(unmapped_xpath.dup)
74
+ next unless existing_column
75
+
76
+ unmapped_xpath_hash = labelled_xpath_components_from(unmapped_xpath)
77
+ klass_increment_match = unmapped_xpath.match(/\[(\d+)\]/)
78
+ raise "could not identify klass for #{unmapped_xpath}" unless klass_increment_match
79
+
80
+ new_column = NdrImport::Xml::ColumnMapping.new(existing_column, unmapped_xpath_hash,
81
+ klass_increment_match[1], line,
82
+ @klass).call
83
+ @augmented_columns << new_column
84
+ @augmented_column_xpaths << build_xpath_from(new_column)
85
+ end
86
+ end
87
+
88
+ def xml_line_from(line)
89
+ @augmented_column_xpaths.map do |column_xpath|
90
+ # Augmenting the column mappings should account for repeating sections/items
91
+ # TODO: Is this needed now that we removed "duplicated" klass mappings?
92
+ line.xpath(column_xpath).count > 1 ? '' : line.xpath(column_xpath).inner_text
93
+ end
94
+ end
95
+
96
+ def find_existing_column_for(unmapped_xpath)
97
+ # Remove any e.g. [2] which will be present on repeating sections
98
+ unmapped_xpath.gsub!(/\[\d+\]/, '')
99
+ unmapped_xpath_hash = labelled_xpath_components_from(unmapped_xpath)
100
+ columns.detect do |column|
101
+ column['column'] == unmapped_xpath_hash[:column_name] &&
102
+ column.dig('xml_cell', 'relative_path') == unmapped_xpath_hash[:column_relative_path] &&
103
+ column.dig('xml_cell', 'attribute') == unmapped_xpath_hash[:column_attribute]
104
+ end
105
+ end
106
+
107
+ # Returns a Hash containing labelled components for the given `unmapped_xpath`
108
+ # For example, an `unmapped_xpath` of "Record/Demographics/Sex/@code" would result in:
109
+ # { column_attribute: '@code',
110
+ # column_name: 'Sex',
111
+ # column_relative_path: 'Record/Demographics' }
112
+ def labelled_xpath_components_from(unmapped_xpath)
113
+ xpath_components = unmapped_xpath.split('/')
114
+ column_attribute = new_column_attribute_from(xpath_components)
115
+ # I dislike the `EnforcedShorthandSyntax`, code is less readable
116
+ # rubocop:disable Style::HashSyntax
117
+ { column_attribute: column_attribute,
118
+ column_name: new_column_name_from(xpath_components, column_attribute),
119
+ column_relative_path: new_relative_path_from(xpath_components, column_attribute) }
120
+ # rubocop:enable Style::HashSyntax
121
+ end
122
+
123
+ def new_column_attribute_from(xpath_components)
124
+ xpath_components.last.starts_with?('@') ? xpath_components.last[1...] : nil
125
+ end
126
+
127
+ def new_column_name_from(xpath_components, column_attribute)
128
+ return xpath_components[-2] if column_attribute.present?
129
+
130
+ xpath_components.last
131
+ end
132
+
133
+ # xpaths can be e.g. Record/Demographics/Sex/@code or Record/Demographics/Surname
134
+ # `xpath_components` is an array of the xpath's components, for example:
135
+ # Record/Demographics/Sex/@code => ['Record', 'Demographics', 'Sex', '@code']
136
+ #
137
+ # For the relative path, we want to return Record/Demographics.
138
+ # The upper_limit removes the "field name" (Sex or Surname here) and optionally the
139
+ # attribute (@code here) if present, from `xpath_components`.
140
+ # The resulting array is joined back together to form the relative path.
141
+ def new_relative_path_from(xpath_components, column_attribute)
142
+ upper_limit = column_attribute.present? ? -3 : -2
143
+ xpath_components.count > 1 ? xpath_components[0..upper_limit].join('/') : nil
144
+ end
145
+
43
146
  # Ensure every leaf is accounted for in the column mappings
44
147
  def validate_column_mappings(line)
45
- missing_nodes = mappable_xpaths_from(line) - column_xpaths
46
- raise "Unmapped data! #{missing_nodes}" unless missing_nodes.empty?
148
+ missing_xpaths = unmapped_xpaths(line)
149
+ return if missing_xpaths.none?
150
+
151
+ raise(NdrImport::Xml::UnmappedXpathError, missing_xpaths.to_sentence)
152
+ end
153
+
154
+ # Not memoized by design: we want to re-calculate unmapped xpaths after
155
+ # `@augmented_column_xpaths` have been augmented for each `line`
156
+ def unmapped_xpaths(line)
157
+ mappable_xpaths_from(line) - (@augmented_column_xpaths || column_xpaths)
47
158
  end
48
159
 
49
160
  def column_name_from(column)
@@ -58,9 +169,12 @@ module NdrImport
58
169
  xpaths = []
59
170
 
60
171
  line.xpath('.//*[not(child::*)]').each do |node|
61
- xpath = node.path.sub(line.path + '/', '')
62
- xpaths << xpath
63
- node.attributes.each_key { |key| xpaths << "#{xpath}/@#{key}" }
172
+ xpath = node.path.sub("#{line.path}/", '')
173
+ if node.attributes.any?
174
+ node.attributes.each_key { |key| xpaths << "#{xpath}/@#{key}" }
175
+ else
176
+ xpaths << xpath
177
+ end
64
178
  end
65
179
  xpaths
66
180
  end
@@ -0,0 +1,15 @@
1
+ module NdrImport
2
+ module Xml
3
+ # Raised if an unmapped xpath is identified
4
+ class UnmappedXpathError < StandardError
5
+ attr_reader :missing_xpaths
6
+
7
+ def initialize(missing_xpaths)
8
+ @missing_xpaths = missing_xpaths
9
+ message = "Unmapped xpath(s): #{missing_xpaths}"
10
+
11
+ super(message)
12
+ end
13
+ end
14
+ end
15
+ end
data/lib/ndr_import.rb CHANGED
@@ -10,8 +10,10 @@ require 'ndr_import/fixed_width/table'
10
10
  require 'ndr_import/xml/table'
11
11
  require 'ndr_import/pdf_form/table'
12
12
  require 'ndr_import/avro/table'
13
+ require 'ndr_import/vcf/table'
13
14
  require 'ndr_import/unmapped_data_error'
14
15
  require 'ndr_import/acroform_reader'
16
+ require 'ndr_import/xml/unmapped_xpath_error'
15
17
 
16
18
  module NdrImport
17
19
  def self.root
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ndr_import
3
3
  version: !ruby/object:Gem::Version
4
- version: 10.2.0
4
+ version: 11.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - NCRS Development Team
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-05-18 00:00:00.000000000 Z
11
+ date: 2023-10-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activemodel
@@ -30,7 +30,7 @@ dependencies:
30
30
  requirements:
31
31
  - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: '6.0'
33
+ version: '6.1'
34
34
  - - "<"
35
35
  - !ruby/object:Gem::Version
36
36
  version: '7.1'
@@ -40,7 +40,7 @@ dependencies:
40
40
  requirements:
41
41
  - - ">="
42
42
  - !ruby/object:Gem::Version
43
- version: '6.0'
43
+ version: '6.1'
44
44
  - - "<"
45
45
  - !ruby/object:Gem::Version
46
46
  version: '7.1'
@@ -106,6 +106,20 @@ dependencies:
106
106
  - - "~>"
107
107
  - !ruby/object:Gem::Version
108
108
  version: 1.11.0
109
+ - !ruby/object:Gem::Dependency
110
+ name: bio-vcf
111
+ requirement: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - "~>"
114
+ - !ruby/object:Gem::Version
115
+ version: 0.9.5
116
+ type: :runtime
117
+ prerelease: false
118
+ version_requirements: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - "~>"
121
+ - !ruby/object:Gem::Version
122
+ version: 0.9.5
109
123
  - !ruby/object:Gem::Dependency
110
124
  name: docx
111
125
  requirement: !ruby/object:Gem::Requirement
@@ -197,19 +211,19 @@ dependencies:
197
211
  - !ruby/object:Gem::Version
198
212
  version: '0'
199
213
  - !ruby/object:Gem::Dependency
200
- name: seven_zip_ruby
214
+ name: seven-zip
201
215
  requirement: !ruby/object:Gem::Requirement
202
216
  requirements:
203
217
  - - "~>"
204
218
  - !ruby/object:Gem::Version
205
- version: '1.3'
219
+ version: '1.4'
206
220
  type: :runtime
207
221
  prerelease: false
208
222
  version_requirements: !ruby/object:Gem::Requirement
209
223
  requirements:
210
224
  - - "~>"
211
225
  - !ruby/object:Gem::Version
212
- version: '1.3'
226
+ version: '1.4'
213
227
  - !ruby/object:Gem::Dependency
214
228
  name: spreadsheet
215
229
  requirement: !ruby/object:Gem::Requirement
@@ -399,6 +413,7 @@ files:
399
413
  - lib/ndr_import/file/seven_zip.rb
400
414
  - lib/ndr_import/file/text.rb
401
415
  - lib/ndr_import/file/unregistered_filetype.rb
416
+ - lib/ndr_import/file/vcf.rb
402
417
  - lib/ndr_import/file/word.rb
403
418
  - lib/ndr_import/file/xml.rb
404
419
  - lib/ndr_import/file/zip.rb
@@ -424,9 +439,13 @@ files:
424
439
  - lib/ndr_import/table.rb
425
440
  - lib/ndr_import/universal_importer_helper.rb
426
441
  - lib/ndr_import/unmapped_data_error.rb
442
+ - lib/ndr_import/vcf/table.rb
427
443
  - lib/ndr_import/version.rb
444
+ - lib/ndr_import/xml/column_mapping.rb
428
445
  - lib/ndr_import/xml/control_char_escaper.rb
446
+ - lib/ndr_import/xml/masked_mappings.rb
429
447
  - lib/ndr_import/xml/table.rb
448
+ - lib/ndr_import/xml/unmapped_xpath_error.rb
430
449
  homepage: https://github.com/NHSDigital/ndr_import
431
450
  licenses:
432
451
  - MIT
@@ -439,14 +458,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
439
458
  requirements:
440
459
  - - ">="
441
460
  - !ruby/object:Gem::Version
442
- version: '2.7'
461
+ version: '3.0'
443
462
  required_rubygems_version: !ruby/object:Gem::Requirement
444
463
  requirements:
445
464
  - - ">="
446
465
  - !ruby/object:Gem::Version
447
466
  version: '0'
448
467
  requirements: []
449
- rubygems_version: 3.2.33
468
+ rubygems_version: 3.4.10
450
469
  signing_key:
451
470
  specification_version: 4
452
471
  summary: NDR Import