ndr_import 10.1.3 → 10.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/README.md +1 -1
- data/lib/ndr_import/avro/table.rb +49 -0
- data/lib/ndr_import/file/all.rb +2 -0
- data/lib/ndr_import/file/avro.rb +34 -0
- data/lib/ndr_import/file/vcf.rb +25 -0
- data/lib/ndr_import/table.rb +1 -1
- data/lib/ndr_import/vcf/table.rb +21 -0
- data/lib/ndr_import/version.rb +1 -1
- data/lib/ndr_import.rb +2 -0
- metadata +38 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 877346774d65eccb73f913081d75a48345a92b9e4c11e6f87702c9e0d59ebc3d
|
|
4
|
+
data.tar.gz: e59f23137e7568ce5e6eb1e4734b9046aed8f8fa93f392a9f2d82091013c3c04
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5bd15b2baf53b654a9f2be306d45dd3a509c298427790c139431f9f77df84b14f6506d625620b6781e9d44315fb8ada40307865a059c2ae3e78945293065dc9f
|
|
7
|
+
data.tar.gz: 06070cd9f9d5311835a523f8f926c938e0c65561c863d8bc972a1250a278ccf06991013773c31f877173873b600bbee40e6ce9faa6d3da167dd93a4fc11b4965
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,15 @@
|
|
|
1
1
|
## [Unreleased]
|
|
2
|
-
|
|
2
|
+
=======
|
|
3
|
+
*no unreleased changes*
|
|
4
|
+
|
|
5
|
+
## 10.3.0 / 2023-09-07
|
|
6
|
+
### Added
|
|
7
|
+
* VCF file support
|
|
8
|
+
* Support Ruby 3.2. Drop support for Ruby 2.7, Rails 6.0
|
|
9
|
+
|
|
10
|
+
## 10.2.0 / 2023-05-16
|
|
11
|
+
* avro file support
|
|
12
|
+
* allow storage of `significant_mapped_fields` in `Table`
|
|
3
13
|
|
|
4
14
|
## 10.1.3 / 2022-12-08
|
|
5
15
|
### Added
|
data/README.md
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# NdrImport [](https://github.com/NHSDigital/ndr_import/actions?query=workflow%3Atest) [](https://rubygems.org/gems/ndr_import) [](https://www.rubydoc.info/gems/ndr_import)
|
|
2
2
|
This is the NHS Digital (NHSD) National Disease Registers (NDR) Import ETL ruby gem, providing:
|
|
3
3
|
|
|
4
|
-
1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), JSON Lines, .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip and
|
|
4
|
+
1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), JSON Lines, .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip, Zip, avro and VCF files.
|
|
5
5
|
2. table mappers for *transforming* tabular and non-tabular data into key value pairs grouped by a common "klass".
|
|
6
6
|
|
|
7
7
|
## Installation
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
require 'ndr_import/table'
|
|
2
|
+
|
|
3
|
+
module NdrImport
|
|
4
|
+
module Avro
|
|
5
|
+
# Syntactic sugar to ensure `header_lines` and `footer_lines` are 1 and 0 respectively.
|
|
6
|
+
# All other Table logic is inherited from `NdrImport::Table`
|
|
7
|
+
class Table < ::NdrImport::Table
|
|
8
|
+
# Scaffold an `NdrImport::Avro::Table` instance from avro schema file
|
|
9
|
+
def self.from_schema(safe_path)
|
|
10
|
+
raise SecurityError, "#{safe_path} is not a SafePath" unless safe_path.is_a? SafePath
|
|
11
|
+
|
|
12
|
+
table_columns = columns_from(::Avro::Schema.parse(::File.open(safe_path)))
|
|
13
|
+
file_name = SafeFile.basename(safe_path).sub(/\.avsc\z/, '.avro')
|
|
14
|
+
|
|
15
|
+
new(filename_pattern: "/#{file_name}\\z/",
|
|
16
|
+
klass: 'ExampleKlass',
|
|
17
|
+
columns: table_columns)
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def self.all_valid_options
|
|
21
|
+
super - %w[delimiter header_lines footer_lines]
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
def header_lines
|
|
25
|
+
1
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
def footer_lines
|
|
29
|
+
0
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
def self.columns_from(schema)
|
|
33
|
+
schema.fields.map do |field|
|
|
34
|
+
column = { column: field.name }
|
|
35
|
+
column[:mappings] = { field: field.name, daysafter: '1970-01-01' } if date_field?(field)
|
|
36
|
+
|
|
37
|
+
column
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def self.date_field?(field)
|
|
42
|
+
field.type.schemas.any? { |schema| schema.logical_type == 'date' }
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
private_class_method :columns_from
|
|
46
|
+
private_class_method :date_field?
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
end
|
data/lib/ndr_import/file/all.rb
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
require_relative 'base'
|
|
2
2
|
require_relative 'acro_form'
|
|
3
|
+
require_relative 'avro'
|
|
3
4
|
require_relative 'delimited'
|
|
4
5
|
require_relative 'docx'
|
|
5
6
|
require_relative 'excel'
|
|
@@ -7,6 +8,7 @@ require_relative 'pdf'
|
|
|
7
8
|
require_relative 'seven_zip'
|
|
8
9
|
require_relative 'text'
|
|
9
10
|
require_relative 'unregistered_filetype'
|
|
11
|
+
require_relative 'vcf'
|
|
10
12
|
require_relative 'word'
|
|
11
13
|
require_relative 'xml'
|
|
12
14
|
require_relative 'zip'
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
require 'avro'
|
|
2
|
+
require 'ndr_support/safe_file'
|
|
3
|
+
require_relative 'registry'
|
|
4
|
+
|
|
5
|
+
module NdrImport
|
|
6
|
+
# This is one of a collection of file handlers that deal with individual formats of data.
|
|
7
|
+
# They can be instantiated directly or via the factory method Registry.tables
|
|
8
|
+
module File
|
|
9
|
+
# This class is an avro file handler that returns a single table.
|
|
10
|
+
class Avro < Base
|
|
11
|
+
private
|
|
12
|
+
|
|
13
|
+
def rows(&block)
|
|
14
|
+
return enum_for(:rows) unless block
|
|
15
|
+
|
|
16
|
+
# Create an instance of DatumReader
|
|
17
|
+
reader = ::Avro::IO::DatumReader.new
|
|
18
|
+
# Open @filename in read mode
|
|
19
|
+
file = ::File.open(@filename, 'rb')
|
|
20
|
+
# Equivalent to DataFileReader instance creation in Java
|
|
21
|
+
dr = ::Avro::DataFile::Reader.new(file, reader)
|
|
22
|
+
|
|
23
|
+
dr.each_with_index do |avro_row, i|
|
|
24
|
+
# Ensure the first row is always the "header"
|
|
25
|
+
yield(avro_row.keys) if i.zero?
|
|
26
|
+
yield(avro_row.values.map(&:to_s))
|
|
27
|
+
end
|
|
28
|
+
rescue StandardError => e
|
|
29
|
+
raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
Registry.register(Avro, 'avro')
|
|
33
|
+
end
|
|
34
|
+
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
require 'bio-vcf/vcfline'
|
|
2
|
+
require 'ndr_support/safe_file'
|
|
3
|
+
require_relative 'registry'
|
|
4
|
+
|
|
5
|
+
module NdrImport
|
|
6
|
+
# This is one of a collection of file handlers that deal with individual formats of data.
|
|
7
|
+
# They can be instantiated directly or via the factory method Registry.tables
|
|
8
|
+
module File
|
|
9
|
+
# This class is a vcf file handler that returns a single table.
|
|
10
|
+
class Vcf < Base
|
|
11
|
+
private
|
|
12
|
+
|
|
13
|
+
def rows(&block)
|
|
14
|
+
return enum_for(:rows) unless block
|
|
15
|
+
|
|
16
|
+
::File.read(@filename).each_line do |line|
|
|
17
|
+
next if line =~ /^##/
|
|
18
|
+
|
|
19
|
+
yield BioVcf::VcfLine.parse(line)
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
Registry.register(Vcf, 'vcf')
|
|
24
|
+
end
|
|
25
|
+
end
|
data/lib/ndr_import/table.rb
CHANGED
|
@@ -12,7 +12,7 @@ module NdrImport
|
|
|
12
12
|
def self.all_valid_options
|
|
13
13
|
%w[canonical_name delimiter liberal_parsing filename_pattern file_password last_data_column
|
|
14
14
|
tablename_pattern header_lines footer_lines format klass columns xml_record_xpath slurp
|
|
15
|
-
row_identifier]
|
|
15
|
+
row_identifier significant_mapped_fields]
|
|
16
16
|
end
|
|
17
17
|
|
|
18
18
|
def all_valid_options
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'ndr_import/table'
|
|
2
|
+
|
|
3
|
+
module NdrImport
|
|
4
|
+
module Vcf
|
|
5
|
+
# Syntactic sugar to ensure `header_lines` and `footer_lines` are 1 and 0 respectively.
|
|
6
|
+
# All other Table logic is inherited from `NdrImport::Table`
|
|
7
|
+
class Table < ::NdrImport::Table
|
|
8
|
+
def self.all_valid_options
|
|
9
|
+
super - %w[delimiter header_lines footer_lines xml_record_xpath]
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def header_lines
|
|
13
|
+
1
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def footer_lines
|
|
17
|
+
0
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
data/lib/ndr_import/version.rb
CHANGED
data/lib/ndr_import.rb
CHANGED
|
@@ -9,6 +9,8 @@ require 'ndr_import/non_tabular/table'
|
|
|
9
9
|
require 'ndr_import/fixed_width/table'
|
|
10
10
|
require 'ndr_import/xml/table'
|
|
11
11
|
require 'ndr_import/pdf_form/table'
|
|
12
|
+
require 'ndr_import/avro/table'
|
|
13
|
+
require 'ndr_import/vcf/table'
|
|
12
14
|
require 'ndr_import/unmapped_data_error'
|
|
13
15
|
require 'ndr_import/acroform_reader'
|
|
14
16
|
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ndr_import
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 10.1.3
|
|
4
|
+
version: 10.3.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- NCRS Development Team
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2023-09-07 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: activemodel
|
|
@@ -30,7 +30,7 @@ dependencies:
|
|
|
30
30
|
requirements:
|
|
31
31
|
- - ">="
|
|
32
32
|
- !ruby/object:Gem::Version
|
|
33
|
-
version: '6.0'
|
|
33
|
+
version: '6.1'
|
|
34
34
|
- - "<"
|
|
35
35
|
- !ruby/object:Gem::Version
|
|
36
36
|
version: '7.1'
|
|
@@ -40,7 +40,7 @@ dependencies:
|
|
|
40
40
|
requirements:
|
|
41
41
|
- - ">="
|
|
42
42
|
- !ruby/object:Gem::Version
|
|
43
|
-
version: '6.0'
|
|
43
|
+
version: '6.1'
|
|
44
44
|
- - "<"
|
|
45
45
|
- !ruby/object:Gem::Version
|
|
46
46
|
version: '7.1'
|
|
@@ -92,6 +92,34 @@ dependencies:
|
|
|
92
92
|
- - "~>"
|
|
93
93
|
- !ruby/object:Gem::Version
|
|
94
94
|
version: '2.0'
|
|
95
|
+
- !ruby/object:Gem::Dependency
|
|
96
|
+
name: avro
|
|
97
|
+
requirement: !ruby/object:Gem::Requirement
|
|
98
|
+
requirements:
|
|
99
|
+
- - "~>"
|
|
100
|
+
- !ruby/object:Gem::Version
|
|
101
|
+
version: 1.11.0
|
|
102
|
+
type: :runtime
|
|
103
|
+
prerelease: false
|
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
105
|
+
requirements:
|
|
106
|
+
- - "~>"
|
|
107
|
+
- !ruby/object:Gem::Version
|
|
108
|
+
version: 1.11.0
|
|
109
|
+
- !ruby/object:Gem::Dependency
|
|
110
|
+
name: bio-vcf
|
|
111
|
+
requirement: !ruby/object:Gem::Requirement
|
|
112
|
+
requirements:
|
|
113
|
+
- - "~>"
|
|
114
|
+
- !ruby/object:Gem::Version
|
|
115
|
+
version: 0.9.5
|
|
116
|
+
type: :runtime
|
|
117
|
+
prerelease: false
|
|
118
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
119
|
+
requirements:
|
|
120
|
+
- - "~>"
|
|
121
|
+
- !ruby/object:Gem::Version
|
|
122
|
+
version: 0.9.5
|
|
95
123
|
- !ruby/object:Gem::Dependency
|
|
96
124
|
name: docx
|
|
97
125
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -369,9 +397,11 @@ files:
|
|
|
369
397
|
- exe/word_to_text
|
|
370
398
|
- lib/ndr_import.rb
|
|
371
399
|
- lib/ndr_import/acroform_reader.rb
|
|
400
|
+
- lib/ndr_import/avro/table.rb
|
|
372
401
|
- lib/ndr_import/csv_library.rb
|
|
373
402
|
- lib/ndr_import/file/acro_form.rb
|
|
374
403
|
- lib/ndr_import/file/all.rb
|
|
404
|
+
- lib/ndr_import/file/avro.rb
|
|
375
405
|
- lib/ndr_import/file/base.rb
|
|
376
406
|
- lib/ndr_import/file/delimited.rb
|
|
377
407
|
- lib/ndr_import/file/docx.rb
|
|
@@ -383,6 +413,7 @@ files:
|
|
|
383
413
|
- lib/ndr_import/file/seven_zip.rb
|
|
384
414
|
- lib/ndr_import/file/text.rb
|
|
385
415
|
- lib/ndr_import/file/unregistered_filetype.rb
|
|
416
|
+
- lib/ndr_import/file/vcf.rb
|
|
386
417
|
- lib/ndr_import/file/word.rb
|
|
387
418
|
- lib/ndr_import/file/xml.rb
|
|
388
419
|
- lib/ndr_import/file/zip.rb
|
|
@@ -408,6 +439,7 @@ files:
|
|
|
408
439
|
- lib/ndr_import/table.rb
|
|
409
440
|
- lib/ndr_import/universal_importer_helper.rb
|
|
410
441
|
- lib/ndr_import/unmapped_data_error.rb
|
|
442
|
+
- lib/ndr_import/vcf/table.rb
|
|
411
443
|
- lib/ndr_import/version.rb
|
|
412
444
|
- lib/ndr_import/xml/control_char_escaper.rb
|
|
413
445
|
- lib/ndr_import/xml/table.rb
|
|
@@ -423,14 +455,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
423
455
|
requirements:
|
|
424
456
|
- - ">="
|
|
425
457
|
- !ruby/object:Gem::Version
|
|
426
|
-
version: '
|
|
458
|
+
version: '3.0'
|
|
427
459
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
428
460
|
requirements:
|
|
429
461
|
- - ">="
|
|
430
462
|
- !ruby/object:Gem::Version
|
|
431
463
|
version: '0'
|
|
432
464
|
requirements: []
|
|
433
|
-
rubygems_version: 3.
|
|
465
|
+
rubygems_version: 3.4.10
|
|
434
466
|
signing_key:
|
|
435
467
|
specification_version: 4
|
|
436
468
|
summary: NDR Import
|