ndr_import 10.2.0 → 11.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +1 -1
- data/lib/ndr_import/file/all.rb +1 -0
- data/lib/ndr_import/file/vcf.rb +25 -0
- data/lib/ndr_import/file/xml.rb +22 -7
- data/lib/ndr_import/helpers/file/xml_streaming.rb +34 -9
- data/lib/ndr_import/non_tabular/table.rb +1 -1
- data/lib/ndr_import/table.rb +4 -2
- data/lib/ndr_import/universal_importer_helper.rb +8 -8
- data/lib/ndr_import/vcf/table.rb +21 -0
- data/lib/ndr_import/version.rb +1 -1
- data/lib/ndr_import/xml/column_mapping.rb +87 -0
- data/lib/ndr_import/xml/masked_mappings.rb +74 -0
- data/lib/ndr_import/xml/table.rb +125 -11
- data/lib/ndr_import/xml/unmapped_xpath_error.rb +15 -0
- data/lib/ndr_import.rb +2 -0
- metadata +28 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 52ae4b12ab514a7bda584c93b3124fdeab03cea5292424c0e380c92f6e3a3c1a
|
4
|
+
data.tar.gz: 4b9dff76aa434bb87e542bcf694d4f99a91b2cd69e2eeaf6730a32b78fe61cc9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 27e3c4578ab466ae9977727de5b972e8e5bf7e2f9fa62ab53cd60437e4d7232e5da3ce3afbcf6b94dd943b219b051ccc6177b73189e736ecf8d10f582f6f0bc9
|
7
|
+
data.tar.gz: b06ef69bba53f56314574ff9749011289180624d0f73f7660537610ba29e3ec01114810bc781ccb8c67440c30f1406a05c56e508299a026a7467bb7968d4cb16
|
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,18 @@
|
|
1
1
|
## [Unreleased]
|
2
|
+
=======
|
2
3
|
*no unreleased changes*
|
3
4
|
|
5
|
+
## 11.0.0 / 2023-10-27
|
6
|
+
### Changed
|
7
|
+
* XML enhancements. Breaking change: the enhancements are not backward compatible
|
8
|
+
### Fixed
|
9
|
+
* Replace unsupported seven_zip_ruby gem with seven-zip fork
|
10
|
+
|
11
|
+
## 10.3.0 / 2023-09-07
|
12
|
+
### Added
|
13
|
+
* VCF file support
|
14
|
+
* Support Ruby 3.2. Drop support for Ruby 2.7, Rails 6.0
|
15
|
+
|
4
16
|
## 10.2.0 / 2023-05-16
|
5
17
|
* avro file support
|
6
18
|
* allow storage of `significant_mapped_fields` in `Table`
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# NdrImport [](https://github.com/NHSDigital/ndr_import/actions?query=workflow%3Atest) [](https://rubygems.org/gems/ndr_import) [](https://www.rubydoc.info/gems/ndr_import)
|
2
2
|
This is the NHS Digital (NHSD) National Disease Registers (NDR) Import ETL ruby gem, providing:
|
3
3
|
|
4
|
-
1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), JSON Lines, .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip, Zip and
|
4
|
+
1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), JSON Lines, .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip, Zip, avro and VCF files.
|
5
5
|
2. table mappers for *transforming* tabular and non-tabular data into key value pairs grouped by a common "klass".
|
6
6
|
|
7
7
|
## Installation
|
data/lib/ndr_import/file/all.rb
CHANGED
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'bio-vcf/vcfline'
|
2
|
+
require 'ndr_support/safe_file'
|
3
|
+
require_relative 'registry'
|
4
|
+
|
5
|
+
module NdrImport
|
6
|
+
# This is one of a collection of file handlers that deal with individual formats of data.
|
7
|
+
# They can be instantiated directly or via the factory method Registry.tables
|
8
|
+
module File
|
9
|
+
# This class is a vcf file handler that returns a single table.
|
10
|
+
class Vcf < Base
|
11
|
+
private
|
12
|
+
|
13
|
+
def rows(&block)
|
14
|
+
return enum_for(:rows) unless block
|
15
|
+
|
16
|
+
::File.read(@filename).each_line do |line|
|
17
|
+
next if line =~ /^##/
|
18
|
+
|
19
|
+
yield BioVcf::VcfLine.parse(line)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
Registry.register(Vcf, 'vcf')
|
24
|
+
end
|
25
|
+
end
|
data/lib/ndr_import/file/xml.rb
CHANGED
@@ -12,22 +12,37 @@ module NdrImport
|
|
12
12
|
include NdrImport::Helpers::File::Xml
|
13
13
|
include NdrImport::Helpers::File::XmlStreaming
|
14
14
|
|
15
|
+
def initialize(*)
|
16
|
+
super
|
17
|
+
|
18
|
+
@pattern_match_xpath = @options['pattern_match_record_xpath']
|
19
|
+
end
|
20
|
+
|
15
21
|
private
|
16
22
|
|
17
23
|
# Iterate through the file, yielding each 'xml_record_xpath' element in turn.
|
18
24
|
def rows(&block)
|
19
25
|
return enum_for(:rows) unless block
|
20
26
|
|
21
|
-
xpath = @options['xml_record_xpath']
|
22
|
-
|
23
27
|
if @options['slurp']
|
24
|
-
|
25
|
-
|
28
|
+
record_elements(read_xml_file(@filename)).each(&block)
|
29
|
+
else
|
30
|
+
each_node(@filename, xml_record_xpath, @pattern_match_xpath, &block)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
def xml_record_xpath
|
35
|
+
@pattern_match_xpath ? @options['xml_record_xpath'] : "*/#{@options['xml_record_xpath']}"
|
36
|
+
end
|
37
|
+
|
38
|
+
def record_elements(doc)
|
39
|
+
if @pattern_match_xpath
|
40
|
+
doc.root.children.find_all do |element|
|
41
|
+
element.name =~ Regexp.new(@options['xml_record_xpath'])
|
42
|
+
end
|
26
43
|
else
|
27
|
-
|
44
|
+
doc.root.xpath(@options['xml_record_xpath'])
|
28
45
|
end
|
29
|
-
rescue StandardError => e
|
30
|
-
raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
|
31
46
|
end
|
32
47
|
end
|
33
48
|
# Not all xml files may want to be registered, so 'xml' is not registered by design.
|
@@ -33,15 +33,20 @@ module NdrImport
|
|
33
33
|
# wrapper to hold a representation of each element we descend into:
|
34
34
|
StackItem = Struct.new(:name, :attrs, :empty)
|
35
35
|
|
36
|
-
def initialize(xpath)
|
36
|
+
def initialize(xpath, pattern_match_xpath)
|
37
37
|
@xpath = xpath
|
38
|
+
@pattern_match_xpath = pattern_match_xpath
|
38
39
|
@stack = []
|
39
40
|
@match_depth = nil
|
40
41
|
end
|
41
42
|
|
42
43
|
# Has this cursor already passed inside a similar node?
|
44
|
+
# attribute comparison allows for e.g.:
|
45
|
+
# <SameName>
|
46
|
+
# <SameName code="N"/>
|
47
|
+
# </SameName>
|
43
48
|
def in?(node)
|
44
|
-
@stack.detect { |item| item.name == node.name }
|
49
|
+
@stack.detect { |item| item.name == node.name && item.attrs == node.attributes }
|
45
50
|
end
|
46
51
|
|
47
52
|
def enter(node)
|
@@ -85,9 +90,27 @@ module NdrImport
|
|
85
90
|
def current_stack_match?
|
86
91
|
parent_stack = @stack[0..-2]
|
87
92
|
|
88
|
-
|
93
|
+
stack_match = if @pattern_match_xpath
|
94
|
+
dom_stubs[@stack].root.children.find_all do |node|
|
95
|
+
node.name =~ Regexp.new(@xpath)
|
96
|
+
end.first
|
97
|
+
else
|
98
|
+
dom_stubs[@stack].at_xpath(@xpath)
|
99
|
+
end
|
89
100
|
|
90
|
-
|
101
|
+
return false unless stack_match
|
102
|
+
|
103
|
+
parent_stack.empty? || xpath_not_in_parent_document?(dom_stubs[parent_stack])
|
104
|
+
end
|
105
|
+
|
106
|
+
def xpath_not_in_parent_document?(parent_document)
|
107
|
+
if @pattern_match_xpath
|
108
|
+
parent_document.root.children.find_all do |node|
|
109
|
+
node.name =~ Regexp.new(@xpath)
|
110
|
+
end.first.nil?
|
111
|
+
else
|
112
|
+
!parent_document.at_xpath(@xpath)
|
113
|
+
end
|
91
114
|
end
|
92
115
|
|
93
116
|
# A cached collection of DOM fragments, to represent the structure
|
@@ -116,13 +139,15 @@ module NdrImport
|
|
116
139
|
#
|
117
140
|
# In the case of dodgy encoding, may fall back to slurping the
|
118
141
|
# file, but will still use stream parsing for XML.
|
119
|
-
|
120
|
-
|
142
|
+
#
|
143
|
+
# Optionally pattern match the xpath
|
144
|
+
def each_node(safe_path, xpath, pattern_match_xpath = nil, &block)
|
145
|
+
return enum_for(:each_node, safe_path, xpath, pattern_match_xpath) unless block
|
121
146
|
|
122
147
|
require 'nokogiri'
|
123
148
|
|
124
149
|
with_encoding_check(safe_path) do |stream, encoding|
|
125
|
-
stream_xml_nodes(stream, xpath, encoding, &block)
|
150
|
+
stream_xml_nodes(stream, xpath, pattern_match_xpath, encoding, &block)
|
126
151
|
end
|
127
152
|
end
|
128
153
|
|
@@ -153,9 +178,9 @@ module NdrImport
|
|
153
178
|
system("iconv -f UTF-8 #{Shellwords.escape(path)} > /dev/null 2>&1")
|
154
179
|
end
|
155
180
|
|
156
|
-
def stream_xml_nodes(io, node_xpath, encoding = nil)
|
181
|
+
def stream_xml_nodes(io, node_xpath, pattern_match_xpath, encoding = nil)
|
157
182
|
# Track nesting as the cursor moves through the document:
|
158
|
-
cursor = Cursor.new(node_xpath)
|
183
|
+
cursor = Cursor.new(node_xpath, pattern_match_xpath)
|
159
184
|
|
160
185
|
# If markup isn't well-formed, try to work around it:
|
161
186
|
options = Nokogiri::XML::ParseOptions::RECOVER
|
@@ -17,7 +17,7 @@ module NdrImport
|
|
17
17
|
include UTF8Encoding
|
18
18
|
|
19
19
|
TABULAR_ONLY_OPTIONS = %w[delimiter last_data_column liberal_parsing tablename_pattern
|
20
|
-
header_lines footer_lines
|
20
|
+
header_lines footer_lines slurp].freeze
|
21
21
|
|
22
22
|
NON_TABULAR_OPTIONS = %w[capture_end_line capture_start_line start_line_pattern
|
23
23
|
end_line_pattern remove_lines start_in_a_record
|
data/lib/ndr_import/table.rb
CHANGED
@@ -6,13 +6,14 @@ module NdrImport
|
|
6
6
|
# required to transform a table of data into "records". Particular attention
|
7
7
|
# has been made to use enumerables throughout to help with the transformation
|
8
8
|
# of large quantities of data.
|
9
|
+
# rubocop:disable Metrics/ClassLength
|
9
10
|
class Table
|
10
11
|
include NdrImport::Mapper
|
11
12
|
|
12
13
|
def self.all_valid_options
|
13
14
|
%w[canonical_name delimiter liberal_parsing filename_pattern file_password last_data_column
|
14
|
-
tablename_pattern header_lines footer_lines format klass columns
|
15
|
-
|
15
|
+
tablename_pattern header_lines footer_lines format klass columns slurp row_identifier
|
16
|
+
significant_mapped_fields]
|
16
17
|
end
|
17
18
|
|
18
19
|
def all_valid_options
|
@@ -250,4 +251,5 @@ module NdrImport
|
|
250
251
|
index - 1
|
251
252
|
end
|
252
253
|
end # class Table
|
254
|
+
# rubocop:enable Metrics/ClassLength
|
253
255
|
end
|
@@ -52,14 +52,14 @@ module NdrImport
|
|
52
52
|
# now at the individual file level, can we find the table mapping?
|
53
53
|
table_mapping = get_table_mapping(filename, nil)
|
54
54
|
|
55
|
-
options = {
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
55
|
+
options = { 'unzip_path' => unzip_path,
|
56
|
+
'col_sep' => table_mapping.try(:delimiter),
|
57
|
+
'file_password' => table_mapping.try(:file_password),
|
58
|
+
'liberal_parsing' => table_mapping.try(:liberal_parsing),
|
59
|
+
'xml_record_xpath' => table_mapping.try(:xml_record_xpath),
|
60
|
+
'slurp' => table_mapping.try(:slurp),
|
61
|
+
'yield_xml_record' => table_mapping.try(:yield_xml_record),
|
62
|
+
'pattern_match_record_xpath' => table_mapping.try(:pattern_match_record_xpath) }
|
63
63
|
|
64
64
|
tables = NdrImport::File::Registry.tables(filename, table_mapping.try(:format), options)
|
65
65
|
yield_tables_and_their_content(filename, tables, &block)
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'ndr_import/table'
|
2
|
+
|
3
|
+
module NdrImport
|
4
|
+
module Vcf
|
5
|
+
# Syntactic sugar to ensure `header_lines` and `footer_lines` are 1 and 0 respectively.
|
6
|
+
# All other Table logic is inherited from `NdrImport::Table`
|
7
|
+
class Table < ::NdrImport::Table
|
8
|
+
def self.all_valid_options
|
9
|
+
super - %w[delimiter header_lines footer_lines]
|
10
|
+
end
|
11
|
+
|
12
|
+
def header_lines
|
13
|
+
1
|
14
|
+
end
|
15
|
+
|
16
|
+
def footer_lines
|
17
|
+
0
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
data/lib/ndr_import/version.rb
CHANGED
@@ -0,0 +1,87 @@
|
|
1
|
+
module NdrImport
|
2
|
+
module Xml
|
3
|
+
# This class generates new XML column mappings where repeating columns/sections have been
|
4
|
+
# identified in the xml.
|
5
|
+
# This avoids the need for mappings to verbosely define repeating columns/sections
|
6
|
+
class ColumnMapping
|
7
|
+
attr_accessor :existing_column, :unmapped_node_parts, :klass_increment, :xml_line, :klass,
|
8
|
+
:repeating_item, :increment_field_name, :build_new_record, :klass_section_xpath
|
9
|
+
|
10
|
+
def initialize(existing_column, unmapped_node_parts, klass_increment, xml_line, klass)
|
11
|
+
@existing_column = existing_column
|
12
|
+
@unmapped_node_parts = unmapped_node_parts
|
13
|
+
@klass_increment = klass_increment
|
14
|
+
@xml_line = xml_line
|
15
|
+
@klass = klass
|
16
|
+
@repeating_item = existing_column.dig('xml_cell', 'multiple')
|
17
|
+
@increment_field_name = existing_column.dig('xml_cell', 'increment_field_name')
|
18
|
+
@build_new_record = existing_column.dig('xml_cell', 'build_new_record')
|
19
|
+
@klass_section_xpath = existing_column.dig('xml_cell', 'klass_section')
|
20
|
+
end
|
21
|
+
|
22
|
+
def call
|
23
|
+
new_column = existing_column.deep_dup
|
24
|
+
new_column['column'] = unmapped_node_parts[:column_name]
|
25
|
+
new_column['xml_cell']['relative_path'] = unmapped_node_parts[:column_relative_path]
|
26
|
+
|
27
|
+
# create unique rawtext names for repeating sections within a record
|
28
|
+
apply_new_rawtext_and_mapped_names_to(new_column) if repeating_item
|
29
|
+
|
30
|
+
return new_column unless incremented_klass_needed?
|
31
|
+
|
32
|
+
new_column['klass'] = incremented_klass
|
33
|
+
new_column
|
34
|
+
end
|
35
|
+
|
36
|
+
private
|
37
|
+
|
38
|
+
# If a table level klass is defined, there is nothing to increment at the column level.
|
39
|
+
# Similarly, not all repeating sections/items require a separate record.
|
40
|
+
# No need to create new records for a single occurrence of a repeating section
|
41
|
+
def incremented_klass_needed?
|
42
|
+
return false if klass.present?
|
43
|
+
# Column mapping needs to explicitly flag when additionals should not be made
|
44
|
+
return false if build_new_record == false
|
45
|
+
return false if xml_line.xpath(klass_section_xpath).one? && repeating_item
|
46
|
+
|
47
|
+
true
|
48
|
+
end
|
49
|
+
|
50
|
+
def incremented_klass
|
51
|
+
if existing_column['klass'].is_a?(Array)
|
52
|
+
existing_column['klass'].map do |column_klass|
|
53
|
+
column_klass + "##{klass_increment}"
|
54
|
+
end
|
55
|
+
else
|
56
|
+
existing_column['klass'] + "##{klass_increment}"
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
# Append "_1", "_2" etc to repeating rawtext and optionally mapped field names within a
|
61
|
+
# single record, so data is not overwritten
|
62
|
+
def apply_new_rawtext_and_mapped_names_to(new_column)
|
63
|
+
existing_rawtext = existing_column['rawtext_name'] || existing_column['column']
|
64
|
+
column_name_increment = new_column['column'].scan(/\[(\d+)\]/)
|
65
|
+
relative_path_increment = new_column.dig('xml_cell', 'relative_path').scan(/\[(\d+)\]/)
|
66
|
+
|
67
|
+
# Find all the increments (e.g. [1], [2]) from the new column and use their sum
|
68
|
+
# as the rawtext and column name increment
|
69
|
+
increment = (column_name_increment + relative_path_increment).flatten.map(&:to_i).sum
|
70
|
+
new_column['rawtext_name'] = existing_rawtext + "_#{increment}" unless increment.zero?
|
71
|
+
|
72
|
+
return unless !increment.zero? && increment_field_name
|
73
|
+
|
74
|
+
new_column['mappings'] = incremented_mappings_for(new_column, increment)
|
75
|
+
end
|
76
|
+
|
77
|
+
# Increment the mapped `field` names
|
78
|
+
def incremented_mappings_for(new_column, increment)
|
79
|
+
new_column['mappings'].map do |mapping|
|
80
|
+
mapping['field'] = "#{mapping['field']}_#{increment}"
|
81
|
+
|
82
|
+
mapping
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
module NdrImport
|
2
|
+
module Xml
|
3
|
+
# This class applies a do_not_capture mask to those mappings that do not relate to each klass.
|
4
|
+
# Overriding the NdrImport::Table method to avoid memoizing. This is by design: column mappings
|
5
|
+
# can change if new mappings are added on the fly where repeating sections are present
|
6
|
+
class MaskedMappings
|
7
|
+
attr_accessor :klass, :augmented_columns
|
8
|
+
|
9
|
+
def initialize(klass, augmented_columns)
|
10
|
+
@klass = klass
|
11
|
+
@augmented_columns = augmented_columns
|
12
|
+
end
|
13
|
+
|
14
|
+
def call
|
15
|
+
return { klass => augmented_columns } if klass.present?
|
16
|
+
|
17
|
+
masked_mappings = column_level_klass_masked_mappings
|
18
|
+
|
19
|
+
augmented_masked_mappings = masked_mappings
|
20
|
+
# Remove any masked klasses where additional column mappings
|
21
|
+
# have been added for repeated sections
|
22
|
+
# e.g. SomeTestKlass column mappings are not needed if SomeTestKlass#1
|
23
|
+
# have been added
|
24
|
+
masked_mappings.each do |masked_key, columns|
|
25
|
+
# There may be occasions where e.g. SomeTestKlass should be kept.
|
26
|
+
# This can be flagged in one of the klass's column mappings
|
27
|
+
next if columns.any? { |column| column.dig('xml_cell', 'keep_klass') }
|
28
|
+
|
29
|
+
if masked_mappings.keys.any? { |key| key =~ /\A#{masked_key}#\d+\z/ }
|
30
|
+
augmented_masked_mappings.delete(masked_key)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
augmented_masked_mappings
|
35
|
+
end
|
36
|
+
|
37
|
+
private
|
38
|
+
|
39
|
+
# This method duplicates the mappings and applies a do_not_capture mask to those that do not
|
40
|
+
# relate to this klass, returning the masked mappings
|
41
|
+
def mask_mappings_by_klass(klass)
|
42
|
+
augmented_columns.deep_dup.map do |mapping|
|
43
|
+
Array(mapping['klass']).flatten.include?(klass) ? mapping : { 'do_not_capture' => true }
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def column_level_klass_masked_mappings
|
48
|
+
ensure_mappings_define_klass
|
49
|
+
|
50
|
+
# Loop through each klass
|
51
|
+
masked_mappings = {}
|
52
|
+
augmented_columns.pluck('klass').flatten.compact.uniq.each do |klass|
|
53
|
+
# Do not capture fields that relate to other klasses
|
54
|
+
masked_mappings[klass] = mask_mappings_by_klass(klass)
|
55
|
+
end
|
56
|
+
masked_mappings
|
57
|
+
end
|
58
|
+
|
59
|
+
# This method ensures that every column mapping defines a klass (unless it is a column that
|
60
|
+
# we do not capture). It is only used where a table level klass is not defined.
|
61
|
+
def ensure_mappings_define_klass
|
62
|
+
klassless_mappings = augmented_columns.
|
63
|
+
select { |mapping| mapping.nil? || mapping['klass'].nil? }.
|
64
|
+
reject { |mapping| mapping['do_not_capture'] }.
|
65
|
+
map { |mapping| mapping['column'] || mapping['standard_mapping'] }
|
66
|
+
|
67
|
+
return if klassless_mappings.empty?
|
68
|
+
|
69
|
+
# All column mappings for the single item file require a klass definition.
|
70
|
+
raise "Missing klass for column(s): #{klassless_mappings.to_sentence}"
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
data/lib/ndr_import/xml/table.rb
CHANGED
@@ -7,10 +7,17 @@ module NdrImport
|
|
7
7
|
# attention has been made to use enumerables throughout to help with the
|
8
8
|
# transformation of large quantities of data.
|
9
9
|
class Table < ::NdrImport::Table
|
10
|
+
require 'ndr_import/xml/column_mapping'
|
11
|
+
require 'ndr_import/xml/masked_mappings'
|
12
|
+
|
13
|
+
XML_OPTIONS = %w[pattern_match_record_xpath xml_record_xpath yield_xml_record].freeze
|
14
|
+
|
10
15
|
def self.all_valid_options
|
11
|
-
super - %w[delimiter header_lines footer_lines]
|
16
|
+
super - %w[delimiter header_lines footer_lines] + XML_OPTIONS
|
12
17
|
end
|
13
18
|
|
19
|
+
attr_reader(*XML_OPTIONS)
|
20
|
+
|
14
21
|
def header_lines
|
15
22
|
0
|
16
23
|
end
|
@@ -24,26 +31,130 @@ module NdrImport
|
|
24
31
|
# and fields for each mapped klass.
|
25
32
|
def transform_line(line, index)
|
26
33
|
return enum_for(:transform_line, line, index) unless block_given?
|
27
|
-
|
28
34
|
raise 'Not an Nokogiri::XML::Element!' unless line.is_a? Nokogiri::XML::Element
|
29
35
|
|
30
|
-
|
36
|
+
augmented_masked_mappings = augment_and_validate_column_mappings_for(line)
|
31
37
|
|
32
|
-
xml_line =
|
38
|
+
xml_line = xml_line_from(line)
|
33
39
|
|
34
|
-
|
40
|
+
records_from_xml_line = []
|
41
|
+
augmented_masked_mappings.each do |klass, klass_mappings|
|
35
42
|
fields = mapped_line(xml_line, klass_mappings)
|
43
|
+
|
36
44
|
next if fields[:skip].to_s == 'true'.freeze
|
37
|
-
|
45
|
+
|
46
|
+
if yield_xml_record
|
47
|
+
records_from_xml_line << [klass, fields, index]
|
48
|
+
else
|
49
|
+
yield(klass, fields, index)
|
50
|
+
end
|
38
51
|
end
|
52
|
+
yield(records_from_xml_line.compact) if yield_xml_record
|
39
53
|
end
|
40
54
|
|
41
55
|
private
|
42
56
|
|
57
|
+
def augment_and_validate_column_mappings_for(line)
|
58
|
+
augment_column_mappings_for(line)
|
59
|
+
validate_column_mappings(line)
|
60
|
+
|
61
|
+
NdrImport::Xml::MaskedMappings.new(@klass, @augmented_columns.deep_dup).call
|
62
|
+
end
|
63
|
+
|
64
|
+
# Add missing column mappings (and column_xpaths) where
|
65
|
+
# repeating sections / data items appear
|
66
|
+
def augment_column_mappings_for(line)
|
67
|
+
# Start with a fresh set of @augmented_columns for each line, adding new mappings as
|
68
|
+
# required for each `line`
|
69
|
+
@augmented_columns = @columns.deep_dup
|
70
|
+
@augmented_column_xpaths = column_xpaths.deep_dup
|
71
|
+
|
72
|
+
unmapped_xpaths(line).each do |unmapped_xpath|
|
73
|
+
existing_column = find_existing_column_for(unmapped_xpath.dup)
|
74
|
+
next unless existing_column
|
75
|
+
|
76
|
+
unmapped_xpath_hash = labelled_xpath_components_from(unmapped_xpath)
|
77
|
+
klass_increment_match = unmapped_xpath.match(/\[(\d+)\]/)
|
78
|
+
raise "could not identify klass for #{unmapped_xpath}" unless klass_increment_match
|
79
|
+
|
80
|
+
new_column = NdrImport::Xml::ColumnMapping.new(existing_column, unmapped_xpath_hash,
|
81
|
+
klass_increment_match[1], line,
|
82
|
+
@klass).call
|
83
|
+
@augmented_columns << new_column
|
84
|
+
@augmented_column_xpaths << build_xpath_from(new_column)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
def xml_line_from(line)
|
89
|
+
@augmented_column_xpaths.map do |column_xpath|
|
90
|
+
# Augmenting the column mappings should account for repeating sections/items
|
91
|
+
# TODO: Is this needed now that we removed "duplicated" klass mappings?
|
92
|
+
line.xpath(column_xpath).count > 1 ? '' : line.xpath(column_xpath).inner_text
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
def find_existing_column_for(unmapped_xpath)
|
97
|
+
# Remove any e.g. [2] which will be present on repeating sections
|
98
|
+
unmapped_xpath.gsub!(/\[\d+\]/, '')
|
99
|
+
unmapped_xpath_hash = labelled_xpath_components_from(unmapped_xpath)
|
100
|
+
columns.detect do |column|
|
101
|
+
column['column'] == unmapped_xpath_hash[:column_name] &&
|
102
|
+
column.dig('xml_cell', 'relative_path') == unmapped_xpath_hash[:column_relative_path] &&
|
103
|
+
column.dig('xml_cell', 'attribute') == unmapped_xpath_hash[:column_attribute]
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
# Returns a Hash containing labelled components for the given `unmapped_xpath`
|
108
|
+
# For example, an `unmapped_xpath` of "Record/Demographics/Sex/@code" would result in:
|
109
|
+
# { column_attribute: '@code',
|
110
|
+
# column_name: 'Sex',
|
111
|
+
# column_relative_path: 'Record/Demographics' }
|
112
|
+
def labelled_xpath_components_from(unmapped_xpath)
|
113
|
+
xpath_components = unmapped_xpath.split('/')
|
114
|
+
column_attribute = new_column_attribute_from(xpath_components)
|
115
|
+
# I dislike the `EnforcedShorthandSyntax`, code is less readable
|
116
|
+
# rubocop:disable Style::HashSyntax
|
117
|
+
{ column_attribute: column_attribute,
|
118
|
+
column_name: new_column_name_from(xpath_components, column_attribute),
|
119
|
+
column_relative_path: new_relative_path_from(xpath_components, column_attribute) }
|
120
|
+
# rubocop:enable Style::HashSyntax
|
121
|
+
end
|
122
|
+
|
123
|
+
def new_column_attribute_from(xpath_components)
|
124
|
+
xpath_components.last.starts_with?('@') ? xpath_components.last[1...] : nil
|
125
|
+
end
|
126
|
+
|
127
|
+
def new_column_name_from(xpath_components, column_attribute)
|
128
|
+
return xpath_components[-2] if column_attribute.present?
|
129
|
+
|
130
|
+
xpath_components.last
|
131
|
+
end
|
132
|
+
|
133
|
+
# xpaths can be e.g. Record/Demographics/Sex/@code or Record/Demographics/Surname
|
134
|
+
# `xpath_components` is an array of the xpath's components, for example:
|
135
|
+
# Record/Demographics/Sex/@code => ['Record', 'Demographics', 'Sex', '@code']
|
136
|
+
#
|
137
|
+
# For the relative path, we want to return Record/Demographics.
|
138
|
+
# The upper_limit removes the "field name" (Sex or Surname here) and optionally the
|
139
|
+
# attribute (@code here) if present, from `xpath_components`.
|
140
|
+
# The resulting array is joined back together to form the relative path.
|
141
|
+
def new_relative_path_from(xpath_components, column_attribute)
|
142
|
+
upper_limit = column_attribute.present? ? -3 : -2
|
143
|
+
xpath_components.count > 1 ? xpath_components[0..upper_limit].join('/') : nil
|
144
|
+
end
|
145
|
+
|
43
146
|
# Ensure every leaf is accounted for in the column mappings
|
44
147
|
def validate_column_mappings(line)
|
45
|
-
|
46
|
-
|
148
|
+
missing_xpaths = unmapped_xpaths(line)
|
149
|
+
return if missing_xpaths.none?
|
150
|
+
|
151
|
+
raise(NdrImport::Xml::UnmappedXpathError, missing_xpaths.to_sentence)
|
152
|
+
end
|
153
|
+
|
154
|
+
# Not memoized by design: we want to re-calculate unmapped xpaths after
|
155
|
+
# `@augmented_column_xpaths` have been augmented for each `line`
|
156
|
+
def unmapped_xpaths(line)
|
157
|
+
mappable_xpaths_from(line) - (@augmented_column_xpaths || column_xpaths)
|
47
158
|
end
|
48
159
|
|
49
160
|
def column_name_from(column)
|
@@ -58,9 +169,12 @@ module NdrImport
|
|
58
169
|
xpaths = []
|
59
170
|
|
60
171
|
line.xpath('.//*[not(child::*)]').each do |node|
|
61
|
-
xpath = node.path.sub(line.path
|
62
|
-
|
63
|
-
|
172
|
+
xpath = node.path.sub("#{line.path}/", '')
|
173
|
+
if node.attributes.any?
|
174
|
+
node.attributes.each_key { |key| xpaths << "#{xpath}/@#{key}" }
|
175
|
+
else
|
176
|
+
xpaths << xpath
|
177
|
+
end
|
64
178
|
end
|
65
179
|
xpaths
|
66
180
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module NdrImport
|
2
|
+
module Xml
|
3
|
+
# Raised if an unmapped xpath is identified
|
4
|
+
class UnmappedXpathError < StandardError
|
5
|
+
attr_reader :missing_xpaths
|
6
|
+
|
7
|
+
def initialize(missing_xpaths)
|
8
|
+
@missing_xpaths = missing_xpaths
|
9
|
+
message = "Unmapped xpath(s): #{missing_xpaths}"
|
10
|
+
|
11
|
+
super(message)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
data/lib/ndr_import.rb
CHANGED
@@ -10,8 +10,10 @@ require 'ndr_import/fixed_width/table'
|
|
10
10
|
require 'ndr_import/xml/table'
|
11
11
|
require 'ndr_import/pdf_form/table'
|
12
12
|
require 'ndr_import/avro/table'
|
13
|
+
require 'ndr_import/vcf/table'
|
13
14
|
require 'ndr_import/unmapped_data_error'
|
14
15
|
require 'ndr_import/acroform_reader'
|
16
|
+
require 'ndr_import/xml/unmapped_xpath_error'
|
15
17
|
|
16
18
|
module NdrImport
|
17
19
|
def self.root
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ndr_import
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 11.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- NCRS Development Team
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-10-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activemodel
|
@@ -30,7 +30,7 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '6.
|
33
|
+
version: '6.1'
|
34
34
|
- - "<"
|
35
35
|
- !ruby/object:Gem::Version
|
36
36
|
version: '7.1'
|
@@ -40,7 +40,7 @@ dependencies:
|
|
40
40
|
requirements:
|
41
41
|
- - ">="
|
42
42
|
- !ruby/object:Gem::Version
|
43
|
-
version: '6.
|
43
|
+
version: '6.1'
|
44
44
|
- - "<"
|
45
45
|
- !ruby/object:Gem::Version
|
46
46
|
version: '7.1'
|
@@ -106,6 +106,20 @@ dependencies:
|
|
106
106
|
- - "~>"
|
107
107
|
- !ruby/object:Gem::Version
|
108
108
|
version: 1.11.0
|
109
|
+
- !ruby/object:Gem::Dependency
|
110
|
+
name: bio-vcf
|
111
|
+
requirement: !ruby/object:Gem::Requirement
|
112
|
+
requirements:
|
113
|
+
- - "~>"
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
version: 0.9.5
|
116
|
+
type: :runtime
|
117
|
+
prerelease: false
|
118
|
+
version_requirements: !ruby/object:Gem::Requirement
|
119
|
+
requirements:
|
120
|
+
- - "~>"
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: 0.9.5
|
109
123
|
- !ruby/object:Gem::Dependency
|
110
124
|
name: docx
|
111
125
|
requirement: !ruby/object:Gem::Requirement
|
@@ -197,19 +211,19 @@ dependencies:
|
|
197
211
|
- !ruby/object:Gem::Version
|
198
212
|
version: '0'
|
199
213
|
- !ruby/object:Gem::Dependency
|
200
|
-
name:
|
214
|
+
name: seven-zip
|
201
215
|
requirement: !ruby/object:Gem::Requirement
|
202
216
|
requirements:
|
203
217
|
- - "~>"
|
204
218
|
- !ruby/object:Gem::Version
|
205
|
-
version: '1.
|
219
|
+
version: '1.4'
|
206
220
|
type: :runtime
|
207
221
|
prerelease: false
|
208
222
|
version_requirements: !ruby/object:Gem::Requirement
|
209
223
|
requirements:
|
210
224
|
- - "~>"
|
211
225
|
- !ruby/object:Gem::Version
|
212
|
-
version: '1.
|
226
|
+
version: '1.4'
|
213
227
|
- !ruby/object:Gem::Dependency
|
214
228
|
name: spreadsheet
|
215
229
|
requirement: !ruby/object:Gem::Requirement
|
@@ -399,6 +413,7 @@ files:
|
|
399
413
|
- lib/ndr_import/file/seven_zip.rb
|
400
414
|
- lib/ndr_import/file/text.rb
|
401
415
|
- lib/ndr_import/file/unregistered_filetype.rb
|
416
|
+
- lib/ndr_import/file/vcf.rb
|
402
417
|
- lib/ndr_import/file/word.rb
|
403
418
|
- lib/ndr_import/file/xml.rb
|
404
419
|
- lib/ndr_import/file/zip.rb
|
@@ -424,9 +439,13 @@ files:
|
|
424
439
|
- lib/ndr_import/table.rb
|
425
440
|
- lib/ndr_import/universal_importer_helper.rb
|
426
441
|
- lib/ndr_import/unmapped_data_error.rb
|
442
|
+
- lib/ndr_import/vcf/table.rb
|
427
443
|
- lib/ndr_import/version.rb
|
444
|
+
- lib/ndr_import/xml/column_mapping.rb
|
428
445
|
- lib/ndr_import/xml/control_char_escaper.rb
|
446
|
+
- lib/ndr_import/xml/masked_mappings.rb
|
429
447
|
- lib/ndr_import/xml/table.rb
|
448
|
+
- lib/ndr_import/xml/unmapped_xpath_error.rb
|
430
449
|
homepage: https://github.com/NHSDigital/ndr_import
|
431
450
|
licenses:
|
432
451
|
- MIT
|
@@ -439,14 +458,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
439
458
|
requirements:
|
440
459
|
- - ">="
|
441
460
|
- !ruby/object:Gem::Version
|
442
|
-
version: '
|
461
|
+
version: '3.0'
|
443
462
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
444
463
|
requirements:
|
445
464
|
- - ">="
|
446
465
|
- !ruby/object:Gem::Version
|
447
466
|
version: '0'
|
448
467
|
requirements: []
|
449
|
-
rubygems_version: 3.
|
468
|
+
rubygems_version: 3.4.10
|
450
469
|
signing_key:
|
451
470
|
specification_version: 4
|
452
471
|
summary: NDR Import
|