ndr_import 10.3.0 → 11.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/lib/ndr_import/file/xml.rb +22 -7
- data/lib/ndr_import/helpers/file/xml_streaming.rb +34 -9
- data/lib/ndr_import/non_tabular/table.rb +1 -1
- data/lib/ndr_import/table.rb +4 -2
- data/lib/ndr_import/universal_importer_helper.rb +8 -8
- data/lib/ndr_import/vcf/table.rb +1 -1
- data/lib/ndr_import/version.rb +1 -1
- data/lib/ndr_import/xml/column_mapping.rb +87 -0
- data/lib/ndr_import/xml/masked_mappings.rb +74 -0
- data/lib/ndr_import/xml/table.rb +125 -11
- data/lib/ndr_import/xml/unmapped_xpath_error.rb +15 -0
- data/lib/ndr_import.rb +1 -0
- metadata +8 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 52ae4b12ab514a7bda584c93b3124fdeab03cea5292424c0e380c92f6e3a3c1a
|
4
|
+
data.tar.gz: 4b9dff76aa434bb87e542bcf694d4f99a91b2cd69e2eeaf6730a32b78fe61cc9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 27e3c4578ab466ae9977727de5b972e8e5bf7e2f9fa62ab53cd60437e4d7232e5da3ce3afbcf6b94dd943b219b051ccc6177b73189e736ecf8d10f582f6f0bc9
|
7
|
+
data.tar.gz: b06ef69bba53f56314574ff9749011289180624d0f73f7660537610ba29e3ec01114810bc781ccb8c67440c30f1406a05c56e508299a026a7467bb7968d4cb16
|
data/CHANGELOG.md
CHANGED
@@ -2,6 +2,12 @@
|
|
2
2
|
=======
|
3
3
|
*no unreleased changes*
|
4
4
|
|
5
|
+
## 11.0.0 / 2023-10-27
|
6
|
+
### Changed
|
7
|
+
* XML enhancements. Breaking change, the enhancements are not backward compatible
|
8
|
+
### Fixed
|
9
|
+
* Replace unsupported seven_zip_ruby gem with seven-zip fork
|
10
|
+
|
5
11
|
## 10.3.0 / 2023-09-07
|
6
12
|
### Added
|
7
13
|
* VCF file support
|
data/lib/ndr_import/file/xml.rb
CHANGED
@@ -12,22 +12,37 @@ module NdrImport
|
|
12
12
|
include NdrImport::Helpers::File::Xml
|
13
13
|
include NdrImport::Helpers::File::XmlStreaming
|
14
14
|
|
15
|
+
def initialize(*)
|
16
|
+
super
|
17
|
+
|
18
|
+
@pattern_match_xpath = @options['pattern_match_record_xpath']
|
19
|
+
end
|
20
|
+
|
15
21
|
private
|
16
22
|
|
17
23
|
# Iterate through the file, yielding each 'xml_record_xpath' element in turn.
|
18
24
|
def rows(&block)
|
19
25
|
return enum_for(:rows) unless block
|
20
26
|
|
21
|
-
xpath = @options['xml_record_xpath']
|
22
|
-
|
23
27
|
if @options['slurp']
|
24
|
-
|
25
|
-
|
28
|
+
record_elements(read_xml_file(@filename)).each(&block)
|
29
|
+
else
|
30
|
+
each_node(@filename, xml_record_xpath, @pattern_match_xpath, &block)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
def xml_record_xpath
|
35
|
+
@pattern_match_xpath ? @options['xml_record_xpath'] : "*/#{@options['xml_record_xpath']}"
|
36
|
+
end
|
37
|
+
|
38
|
+
def record_elements(doc)
|
39
|
+
if @pattern_match_xpath
|
40
|
+
doc.root.children.find_all do |element|
|
41
|
+
element.name =~ Regexp.new(@options['xml_record_xpath'])
|
42
|
+
end
|
26
43
|
else
|
27
|
-
|
44
|
+
doc.root.xpath(@options['xml_record_xpath'])
|
28
45
|
end
|
29
|
-
rescue StandardError => e
|
30
|
-
raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
|
31
46
|
end
|
32
47
|
end
|
33
48
|
# Not all xml files may want to be registered, so 'xml' is not registered by design.
|
@@ -33,15 +33,20 @@ module NdrImport
|
|
33
33
|
# wrapper to hold a representation of each element we descend into:
|
34
34
|
StackItem = Struct.new(:name, :attrs, :empty)
|
35
35
|
|
36
|
-
def initialize(xpath)
|
36
|
+
def initialize(xpath, pattern_match_xpath)
|
37
37
|
@xpath = xpath
|
38
|
+
@pattern_match_xpath = pattern_match_xpath
|
38
39
|
@stack = []
|
39
40
|
@match_depth = nil
|
40
41
|
end
|
41
42
|
|
42
43
|
# Has this cursor already passed inside a similar node?
|
44
|
+
# attribute comparison allows for e.g.:
|
45
|
+
# <SameName>
|
46
|
+
# <SameName code="N"/>
|
47
|
+
# </SameName>
|
43
48
|
def in?(node)
|
44
|
-
@stack.detect { |item| item.name == node.name }
|
49
|
+
@stack.detect { |item| item.name == node.name && item.attrs == node.attributes }
|
45
50
|
end
|
46
51
|
|
47
52
|
def enter(node)
|
@@ -85,9 +90,27 @@ module NdrImport
|
|
85
90
|
def current_stack_match?
|
86
91
|
parent_stack = @stack[0..-2]
|
87
92
|
|
88
|
-
|
93
|
+
stack_match = if @pattern_match_xpath
|
94
|
+
dom_stubs[@stack].root.children.find_all do |node|
|
95
|
+
node.name =~ Regexp.new(@xpath)
|
96
|
+
end.first
|
97
|
+
else
|
98
|
+
dom_stubs[@stack].at_xpath(@xpath)
|
99
|
+
end
|
89
100
|
|
90
|
-
|
101
|
+
return false unless stack_match
|
102
|
+
|
103
|
+
parent_stack.empty? || xpath_not_in_parent_document?(dom_stubs[parent_stack])
|
104
|
+
end
|
105
|
+
|
106
|
+
def xpath_not_in_parent_document?(parent_document)
|
107
|
+
if @pattern_match_xpath
|
108
|
+
parent_document.root.children.find_all do |node|
|
109
|
+
node.name =~ Regexp.new(@xpath)
|
110
|
+
end.first.nil?
|
111
|
+
else
|
112
|
+
!parent_document.at_xpath(@xpath)
|
113
|
+
end
|
91
114
|
end
|
92
115
|
|
93
116
|
# A cached collection of DOM fragments, to represent the structure
|
@@ -116,13 +139,15 @@ module NdrImport
|
|
116
139
|
#
|
117
140
|
# In the case of dodgy encoding, may fall back to slurping the
|
118
141
|
# file, but will still use stream parsing for XML.
|
119
|
-
|
120
|
-
|
142
|
+
#
|
143
|
+
# Optionally pattern match the xpath
|
144
|
+
def each_node(safe_path, xpath, pattern_match_xpath = nil, &block)
|
145
|
+
return enum_for(:each_node, safe_path, xpath, pattern_match_xpath) unless block
|
121
146
|
|
122
147
|
require 'nokogiri'
|
123
148
|
|
124
149
|
with_encoding_check(safe_path) do |stream, encoding|
|
125
|
-
stream_xml_nodes(stream, xpath, encoding, &block)
|
150
|
+
stream_xml_nodes(stream, xpath, pattern_match_xpath, encoding, &block)
|
126
151
|
end
|
127
152
|
end
|
128
153
|
|
@@ -153,9 +178,9 @@ module NdrImport
|
|
153
178
|
system("iconv -f UTF-8 #{Shellwords.escape(path)} > /dev/null 2>&1")
|
154
179
|
end
|
155
180
|
|
156
|
-
def stream_xml_nodes(io, node_xpath, encoding = nil)
|
181
|
+
def stream_xml_nodes(io, node_xpath, pattern_match_xpath, encoding = nil)
|
157
182
|
# Track nesting as the cursor moves through the document:
|
158
|
-
cursor = Cursor.new(node_xpath)
|
183
|
+
cursor = Cursor.new(node_xpath, pattern_match_xpath)
|
159
184
|
|
160
185
|
# If markup isn't well-formed, try to work around it:
|
161
186
|
options = Nokogiri::XML::ParseOptions::RECOVER
|
@@ -17,7 +17,7 @@ module NdrImport
|
|
17
17
|
include UTF8Encoding
|
18
18
|
|
19
19
|
TABULAR_ONLY_OPTIONS = %w[delimiter last_data_column liberal_parsing tablename_pattern
|
20
|
-
header_lines footer_lines
|
20
|
+
header_lines footer_lines slurp].freeze
|
21
21
|
|
22
22
|
NON_TABULAR_OPTIONS = %w[capture_end_line capture_start_line start_line_pattern
|
23
23
|
end_line_pattern remove_lines start_in_a_record
|
data/lib/ndr_import/table.rb
CHANGED
@@ -6,13 +6,14 @@ module NdrImport
|
|
6
6
|
# required to transform a table of data into "records". Particular attention
|
7
7
|
# has been made to use enumerables throughout to help with the transformation
|
8
8
|
# of large quantities of data.
|
9
|
+
# rubocop:disable Metrics/ClassLength
|
9
10
|
class Table
|
10
11
|
include NdrImport::Mapper
|
11
12
|
|
12
13
|
def self.all_valid_options
|
13
14
|
%w[canonical_name delimiter liberal_parsing filename_pattern file_password last_data_column
|
14
|
-
tablename_pattern header_lines footer_lines format klass columns
|
15
|
-
|
15
|
+
tablename_pattern header_lines footer_lines format klass columns slurp row_identifier
|
16
|
+
significant_mapped_fields]
|
16
17
|
end
|
17
18
|
|
18
19
|
def all_valid_options
|
@@ -250,4 +251,5 @@ module NdrImport
|
|
250
251
|
index - 1
|
251
252
|
end
|
252
253
|
end # class Table
|
254
|
+
# rubocop:enable Metrics/ClassLength
|
253
255
|
end
|
@@ -52,14 +52,14 @@ module NdrImport
|
|
52
52
|
# now at the individual file level, can we find the table mapping?
|
53
53
|
table_mapping = get_table_mapping(filename, nil)
|
54
54
|
|
55
|
-
options = {
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
55
|
+
options = { 'unzip_path' => unzip_path,
|
56
|
+
'col_sep' => table_mapping.try(:delimiter),
|
57
|
+
'file_password' => table_mapping.try(:file_password),
|
58
|
+
'liberal_parsing' => table_mapping.try(:liberal_parsing),
|
59
|
+
'xml_record_xpath' => table_mapping.try(:xml_record_xpath),
|
60
|
+
'slurp' => table_mapping.try(:slurp),
|
61
|
+
'yield_xml_record' => table_mapping.try(:yield_xml_record),
|
62
|
+
'pattern_match_record_xpath' => table_mapping.try(:pattern_match_record_xpath) }
|
63
63
|
|
64
64
|
tables = NdrImport::File::Registry.tables(filename, table_mapping.try(:format), options)
|
65
65
|
yield_tables_and_their_content(filename, tables, &block)
|
data/lib/ndr_import/vcf/table.rb
CHANGED
@@ -6,7 +6,7 @@ module NdrImport
|
|
6
6
|
# All other Table logic is inherited from `NdrImport::Table`
|
7
7
|
class Table < ::NdrImport::Table
|
8
8
|
def self.all_valid_options
|
9
|
-
super - %w[delimiter header_lines footer_lines
|
9
|
+
super - %w[delimiter header_lines footer_lines]
|
10
10
|
end
|
11
11
|
|
12
12
|
def header_lines
|
data/lib/ndr_import/version.rb
CHANGED
@@ -0,0 +1,87 @@
|
|
1
|
+
module NdrImport
  module Xml
    # This class generates new XML column mappings where repeating columns/sections have been
    # identified in the xml.
    # This avoids the need for mappings to verbosely define repeating columns/sections
    class ColumnMapping
      attr_accessor :existing_column, :unmapped_node_parts, :klass_increment, :xml_line, :klass,
                    :repeating_item, :increment_field_name, :build_new_record, :klass_section_xpath

      # existing_column::     the already mapped column (a Hash) that the new mapping is copied from
      # unmapped_node_parts:: Hash providing the :column_name and :column_relative_path of the
      #                       unmapped node
      # klass_increment::     value appended (as "#<klass_increment>") to column level klasses
      # xml_line::            object responding to `xpath`; used to count `klass_section` matches
      # klass::               the table level klass, if one is defined
      def initialize(existing_column, unmapped_node_parts, klass_increment, xml_line, klass)
        @existing_column      = existing_column
        @unmapped_node_parts  = unmapped_node_parts
        @klass_increment      = klass_increment
        @xml_line             = xml_line
        @klass                = klass
        @repeating_item       = existing_column.dig('xml_cell', 'multiple')
        @increment_field_name = existing_column.dig('xml_cell', 'increment_field_name')
        @build_new_record     = existing_column.dig('xml_cell', 'build_new_record')
        @klass_section_xpath  = existing_column.dig('xml_cell', 'klass_section')
      end

      # Returns a deep copy of `existing_column`, re-pointed at the unmapped node and, where
      # required, given incremented rawtext/field names and klass(es).
      # `existing_column` itself is left untouched.
      def call
        new_column = existing_column.deep_dup
        new_column['column'] = unmapped_node_parts[:column_name]
        new_column['xml_cell']['relative_path'] = unmapped_node_parts[:column_relative_path]

        # create unique rawtext names for repeating sections within a record
        apply_new_rawtext_and_mapped_names_to(new_column) if repeating_item

        new_column['klass'] = incremented_klass if incremented_klass_needed?
        new_column
      end

      private

      # If a table level klass is defined, there is nothing to increment at the column level.
      # Similarly, not all repeating sections/items require a separate record.
      # No need to create new records for a single occurrence of a repeating section
      def incremented_klass_needed?
        return false if klass.present?
        # Column mapping needs to explicitly flag when additionals should not be made
        return false if build_new_record == false
        return false if xml_line.xpath(klass_section_xpath).one? && repeating_item

        true
      end

      # The column level klass (or each klass, when an Array) suffixed with "#<klass_increment>"
      def incremented_klass
        if existing_column['klass'].is_a?(Array)
          existing_column['klass'].map do |column_klass|
            column_klass + "##{klass_increment}"
          end
        else
          existing_column['klass'] + "##{klass_increment}"
        end
      end

      # Append "_1", "_2" etc to repeating rawtext and optionally mapped field names within a
      # single record, so data is not overwritten
      def apply_new_rawtext_and_mapped_names_to(new_column)
        existing_rawtext = existing_column['rawtext_name'] || existing_column['column']
        column_name_increment = new_column['column'].scan(/\[(\d+)\]/)
        # The relative path is nil for a node sitting directly beneath the record element,
        # so coerce it to a String before scanning rather than raising NoMethodError.
        relative_path_increment = new_column.dig('xml_cell', 'relative_path').to_s.scan(/\[(\d+)\]/)

        # Find all the increments (e.g. [1], [2]) from the new column and use their sum
        # as the rawtext and column name increment
        increment = (column_name_increment + relative_path_increment).flatten.sum(&:to_i)
        return if increment.zero?

        new_column['rawtext_name'] = existing_rawtext + "_#{increment}"
        return unless increment_field_name

        new_column['mappings'] = incremented_mappings_for(new_column, increment)
      end

      # Increment the mapped `field` names
      def incremented_mappings_for(new_column, increment)
        new_column['mappings'].map do |mapping|
          mapping['field'] = "#{mapping['field']}_#{increment}"

          mapping
        end
      end
    end
  end
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
module NdrImport
  module Xml
    # This class applies a do_not_capture mask to those mappings that do not relate to each klass.
    # Overriding the NdrImport::Table method to avoid memoizing. This is by design: column mappings
    # can change if new mappings are added on the fly where repeating sections are present
    class MaskedMappings
      attr_accessor :klass, :augmented_columns

      # klass::             the table level klass, if one is defined
      # augmented_columns:: the column mappings (an Array of Hashes) to be masked
      def initialize(klass, augmented_columns)
        @klass = klass
        @augmented_columns = augmented_columns
      end

      # Returns a Hash of klass => column mappings.
      # With a table level klass, that is simply the klass and all of the columns. Otherwise
      # there is one entry per column level klass, each with the other klasses' columns masked.
      # Raises a RuntimeError if a captured column does not define a klass.
      def call
        return { klass => augmented_columns } if klass.present?

        masked_mappings = column_level_klass_masked_mappings

        # Remove any masked klasses where additional columns mappings
        # have been added for repeated sections
        # e.g. SomeTestKlass column mappings are not needed if SomeTestKlass#1
        # have been added
        # `reject` builds a new Hash, so nothing is deleted from the Hash being iterated.
        masked_mappings.reject do |masked_key, columns|
          # There may be occasions where the e.g. SomeTestKlass should be kept,
          # This can be flagged in one of the klass's column mappings
          next false if columns.any? { |column| column.dig('xml_cell', 'keep_klass') }

          # Escape the klass name so it is matched literally, not as a pattern
          incremented_key = /\A#{Regexp.escape(masked_key.to_s)}#\d+\z/
          masked_mappings.each_key.any? { |key| key.to_s.match?(incremented_key) }
        end
      end

      private

      # This method duplicates the mappings and applies a do_not_capture mask to those that do not
      # relate to this klass, returning the masked mappings
      def mask_mappings_by_klass(klass)
        augmented_columns.deep_dup.map do |mapping|
          Array(mapping['klass']).flatten.include?(klass) ? mapping : { 'do_not_capture' => true }
        end
      end

      # Builds a Hash of each column level klass => its masked column mappings
      def column_level_klass_masked_mappings
        ensure_mappings_define_klass

        # Loop through each klass
        column_klasses = augmented_columns.pluck('klass').flatten.compact.uniq
        column_klasses.each_with_object({}) do |column_klass, masked_mappings|
          # Do not capture fields that relate to other klasses
          masked_mappings[column_klass] = mask_mappings_by_klass(column_klass)
        end
      end

      # This method ensures that every column mapping defines a klass (unless it is a column that
      # we do not capture). It is only used where a table level klass is not defined.
      def ensure_mappings_define_klass
        # A nil mapping is treated as klassless; it is checked for first so that it is
        # reported here rather than causing a NoMethodError.
        klassless_mappings = augmented_columns.select do |mapping|
          mapping.nil? || (mapping['klass'].nil? && !mapping['do_not_capture'])
        end
        return if klassless_mappings.empty?

        column_names = klassless_mappings.map do |mapping|
          mapping.nil? ? mapping.inspect : (mapping['column'] || mapping['standard_mapping'])
        end

        # All column mappings for the single item file require a klass definition.
        raise "Missing klass for column(s): #{column_names.to_sentence}"
      end
    end
  end
end
|
data/lib/ndr_import/xml/table.rb
CHANGED
@@ -7,10 +7,17 @@ module NdrImport
|
|
7
7
|
# attention has been made to use enumerables throughout to help with the
|
8
8
|
# transformation of large quantities of data.
|
9
9
|
class Table < ::NdrImport::Table
|
10
|
+
require 'ndr_import/xml/column_mapping'
|
11
|
+
require 'ndr_import/xml/masked_mappings'
|
12
|
+
|
13
|
+
XML_OPTIONS = %w[pattern_match_record_xpath xml_record_xpath yield_xml_record].freeze
|
14
|
+
|
10
15
|
def self.all_valid_options
|
11
|
-
super - %w[delimiter header_lines footer_lines]
|
16
|
+
super - %w[delimiter header_lines footer_lines] + XML_OPTIONS
|
12
17
|
end
|
13
18
|
|
19
|
+
attr_reader(*XML_OPTIONS)
|
20
|
+
|
14
21
|
def header_lines
|
15
22
|
0
|
16
23
|
end
|
@@ -24,26 +31,130 @@ module NdrImport
|
|
24
31
|
# and fields for each mapped klass.
|
25
32
|
def transform_line(line, index)
|
26
33
|
return enum_for(:transform_line, line, index) unless block_given?
|
27
|
-
|
28
34
|
raise 'Not an Nokogiri::XML::Element!' unless line.is_a? Nokogiri::XML::Element
|
29
35
|
|
30
|
-
|
36
|
+
augmented_masked_mappings = augment_and_validate_column_mappings_for(line)
|
31
37
|
|
32
|
-
xml_line =
|
38
|
+
xml_line = xml_line_from(line)
|
33
39
|
|
34
|
-
|
40
|
+
records_from_xml_line = []
|
41
|
+
augmented_masked_mappings.each do |klass, klass_mappings|
|
35
42
|
fields = mapped_line(xml_line, klass_mappings)
|
43
|
+
|
36
44
|
next if fields[:skip].to_s == 'true'.freeze
|
37
|
-
|
45
|
+
|
46
|
+
if yield_xml_record
|
47
|
+
records_from_xml_line << [klass, fields, index]
|
48
|
+
else
|
49
|
+
yield(klass, fields, index)
|
50
|
+
end
|
38
51
|
end
|
52
|
+
yield(records_from_xml_line.compact) if yield_xml_record
|
39
53
|
end
|
40
54
|
|
41
55
|
private
|
42
56
|
|
57
|
+
def augment_and_validate_column_mappings_for(line)
|
58
|
+
augment_column_mappings_for(line)
|
59
|
+
validate_column_mappings(line)
|
60
|
+
|
61
|
+
NdrImport::Xml::MaskedMappings.new(@klass, @augmented_columns.deep_dup).call
|
62
|
+
end
|
63
|
+
|
64
|
+
# Add missing column mappings (and column_xpaths) where
|
65
|
+
# repeating sections / data items appear
|
66
|
+
def augment_column_mappings_for(line)
|
67
|
+
# Start with a fresh set of @augmented_columns for each line, adding new mappings as
|
68
|
+
# required for each `line`
|
69
|
+
@augmented_columns = @columns.deep_dup
|
70
|
+
@augmented_column_xpaths = column_xpaths.deep_dup
|
71
|
+
|
72
|
+
unmapped_xpaths(line).each do |unmapped_xpath|
|
73
|
+
existing_column = find_existing_column_for(unmapped_xpath.dup)
|
74
|
+
next unless existing_column
|
75
|
+
|
76
|
+
unmapped_xpath_hash = labelled_xpath_components_from(unmapped_xpath)
|
77
|
+
klass_increment_match = unmapped_xpath.match(/\[(\d+)\]/)
|
78
|
+
raise "could not identify klass for #{unmapped_xpath}" unless klass_increment_match
|
79
|
+
|
80
|
+
new_column = NdrImport::Xml::ColumnMapping.new(existing_column, unmapped_xpath_hash,
|
81
|
+
klass_increment_match[1], line,
|
82
|
+
@klass).call
|
83
|
+
@augmented_columns << new_column
|
84
|
+
@augmented_column_xpaths << build_xpath_from(new_column)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
def xml_line_from(line)
|
89
|
+
@augmented_column_xpaths.map do |column_xpath|
|
90
|
+
# Augmenting the column mappings should account for repeating sections/items
|
91
|
+
# TODO: Is this needed now that we removed "duplicated" klass mappings?
|
92
|
+
line.xpath(column_xpath).count > 1 ? '' : line.xpath(column_xpath).inner_text
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
def find_existing_column_for(unmapped_xpath)
|
97
|
+
# Remove any e.g. [2] which will be present on repeating sections
|
98
|
+
unmapped_xpath.gsub!(/\[\d+\]/, '')
|
99
|
+
unmapped_xpath_hash = labelled_xpath_components_from(unmapped_xpath)
|
100
|
+
columns.detect do |column|
|
101
|
+
column['column'] == unmapped_xpath_hash[:column_name] &&
|
102
|
+
column.dig('xml_cell', 'relative_path') == unmapped_xpath_hash[:column_relative_path] &&
|
103
|
+
column.dig('xml_cell', 'attribute') == unmapped_xpath_hash[:column_attribute]
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
# Returns a Hash containing labelled components for the given `unmapped_xpath`
|
108
|
+
# For example, an `unmapped_xpath` of "Record/Demographics/Sex/@code" would result in:
|
109
|
+
# { column_attribute: '@code',
|
110
|
+
# column_name: 'Sex',
|
111
|
+
# column_relative_path: 'Record/Demographics' }
|
112
|
+
def labelled_xpath_components_from(unmapped_xpath)
|
113
|
+
xpath_components = unmapped_xpath.split('/')
|
114
|
+
column_attribute = new_column_attribute_from(xpath_components)
|
115
|
+
# I dislike the `EnforcedShorthandSyntax`, code is less readable
|
116
|
+
# rubocop:disable Style::HashSyntax
|
117
|
+
{ column_attribute: column_attribute,
|
118
|
+
column_name: new_column_name_from(xpath_components, column_attribute),
|
119
|
+
column_relative_path: new_relative_path_from(xpath_components, column_attribute) }
|
120
|
+
# rubocop:enable Style::HashSyntax
|
121
|
+
end
|
122
|
+
|
123
|
+
def new_column_attribute_from(xpath_components)
|
124
|
+
xpath_components.last.starts_with?('@') ? xpath_components.last[1...] : nil
|
125
|
+
end
|
126
|
+
|
127
|
+
def new_column_name_from(xpath_components, column_attribute)
|
128
|
+
return xpath_components[-2] if column_attribute.present?
|
129
|
+
|
130
|
+
xpath_components.last
|
131
|
+
end
|
132
|
+
|
133
|
+
# xpaths can be e.g. Record/Demographics/Sex/@code or Record/Demographics/Surname
|
134
|
+
# `xpath_components` is an array of the xpath's components, for example:
|
135
|
+
# Record/Demographics/Sex/@code => ['Record', 'Demographics', 'Sex', '@code']
|
136
|
+
#
|
137
|
+
# For the relative path, we want to return Record/Demographics.
|
138
|
+
# The upper_limit removes the "field name" (Sex or Surname here) and optionally the
|
139
|
+
# attribute (@code here) if present, from `xpath_components`.
|
140
|
+
# The resulting array is joined back together to form the relative path.
|
141
|
+
def new_relative_path_from(xpath_components, column_attribute)
|
142
|
+
upper_limit = column_attribute.present? ? -3 : -2
|
143
|
+
xpath_components.count > 1 ? xpath_components[0..upper_limit].join('/') : nil
|
144
|
+
end
|
145
|
+
|
43
146
|
# Ensure every leaf is accounted for in the column mappings
|
44
147
|
def validate_column_mappings(line)
|
45
|
-
|
46
|
-
|
148
|
+
missing_xpaths = unmapped_xpaths(line)
|
149
|
+
return if missing_xpaths.none?
|
150
|
+
|
151
|
+
raise(NdrImport::Xml::UnmappedXpathError, missing_xpaths.to_sentence)
|
152
|
+
end
|
153
|
+
|
154
|
+
# Not memoized by design; we want to re-calculate unmapped xpaths after
|
155
|
+
# `@augmented_column_xpaths` have been augmented for each `line`
|
156
|
+
def unmapped_xpaths(line)
|
157
|
+
mappable_xpaths_from(line) - (@augmented_column_xpaths || column_xpaths)
|
47
158
|
end
|
48
159
|
|
49
160
|
def column_name_from(column)
|
@@ -58,9 +169,12 @@ module NdrImport
|
|
58
169
|
xpaths = []
|
59
170
|
|
60
171
|
line.xpath('.//*[not(child::*)]').each do |node|
|
61
|
-
xpath = node.path.sub(line.path
|
62
|
-
|
63
|
-
|
172
|
+
xpath = node.path.sub("#{line.path}/", '')
|
173
|
+
if node.attributes.any?
|
174
|
+
node.attributes.each_key { |key| xpaths << "#{xpath}/@#{key}" }
|
175
|
+
else
|
176
|
+
xpaths << xpath
|
177
|
+
end
|
64
178
|
end
|
65
179
|
xpaths
|
66
180
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module NdrImport
  module Xml
    # Raised if an unmapped xpath is identified
    class UnmappedXpathError < StandardError
      # The value the error was raised with, exactly as given
      attr_reader :missing_xpaths

      # missing_xpaths:: the unmapped xpath(s). A String is used in the message as it is;
      #                  a collection has its elements joined with ", " (rather than being
      #                  rendered in `inspect` form).
      def initialize(missing_xpaths)
        @missing_xpaths = missing_xpaths

        super("Unmapped xpath(s): #{Array(missing_xpaths).join(', ')}")
      end
    end
  end
end
|
data/lib/ndr_import.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ndr_import
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 11.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- NCRS Development Team
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-10-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activemodel
|
@@ -211,19 +211,19 @@ dependencies:
|
|
211
211
|
- !ruby/object:Gem::Version
|
212
212
|
version: '0'
|
213
213
|
- !ruby/object:Gem::Dependency
|
214
|
-
name:
|
214
|
+
name: seven-zip
|
215
215
|
requirement: !ruby/object:Gem::Requirement
|
216
216
|
requirements:
|
217
217
|
- - "~>"
|
218
218
|
- !ruby/object:Gem::Version
|
219
|
-
version: '1.
|
219
|
+
version: '1.4'
|
220
220
|
type: :runtime
|
221
221
|
prerelease: false
|
222
222
|
version_requirements: !ruby/object:Gem::Requirement
|
223
223
|
requirements:
|
224
224
|
- - "~>"
|
225
225
|
- !ruby/object:Gem::Version
|
226
|
-
version: '1.
|
226
|
+
version: '1.4'
|
227
227
|
- !ruby/object:Gem::Dependency
|
228
228
|
name: spreadsheet
|
229
229
|
requirement: !ruby/object:Gem::Requirement
|
@@ -441,8 +441,11 @@ files:
|
|
441
441
|
- lib/ndr_import/unmapped_data_error.rb
|
442
442
|
- lib/ndr_import/vcf/table.rb
|
443
443
|
- lib/ndr_import/version.rb
|
444
|
+
- lib/ndr_import/xml/column_mapping.rb
|
444
445
|
- lib/ndr_import/xml/control_char_escaper.rb
|
446
|
+
- lib/ndr_import/xml/masked_mappings.rb
|
445
447
|
- lib/ndr_import/xml/table.rb
|
448
|
+
- lib/ndr_import/xml/unmapped_xpath_error.rb
|
446
449
|
homepage: https://github.com/NHSDigital/ndr_import
|
447
450
|
licenses:
|
448
451
|
- MIT
|