ndr_import 10.1.3 → 10.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/README.md +1 -1
- data/lib/ndr_import/avro/table.rb +49 -0
- data/lib/ndr_import/file/all.rb +2 -0
- data/lib/ndr_import/file/avro.rb +34 -0
- data/lib/ndr_import/file/vcf.rb +25 -0
- data/lib/ndr_import/table.rb +1 -1
- data/lib/ndr_import/vcf/table.rb +21 -0
- data/lib/ndr_import/version.rb +1 -1
- data/lib/ndr_import.rb +2 -0
- metadata +38 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 877346774d65eccb73f913081d75a48345a92b9e4c11e6f87702c9e0d59ebc3d
|
4
|
+
data.tar.gz: e59f23137e7568ce5e6eb1e4734b9046aed8f8fa93f392a9f2d82091013c3c04
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5bd15b2baf53b654a9f2be306d45dd3a509c298427790c139431f9f77df84b14f6506d625620b6781e9d44315fb8ada40307865a059c2ae3e78945293065dc9f
|
7
|
+
data.tar.gz: 06070cd9f9d5311835a523f8f926c938e0c65561c863d8bc972a1250a278ccf06991013773c31f877173873b600bbee40e6ce9faa6d3da167dd93a4fc11b4965
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,15 @@
|
|
1
1
|
## [Unreleased]
|
2
|
-
|
2
|
+
=======
|
3
|
+
*no unreleased changes*
|
4
|
+
|
5
|
+
## 10.3.0 / 2023-09-07
|
6
|
+
### Added
|
7
|
+
* VCF file support
|
8
|
+
* Support Ruby 3.2. Drop support for Ruby 2.7, Rails 6.0
|
9
|
+
|
10
|
+
## 10.2.0 / 2023-05-16
|
11
|
+
* avro file support
|
12
|
+
* allow storage of `significant_mapped_fields` in `Table`
|
3
13
|
|
4
14
|
## 10.1.3 / 2022-12-08
|
5
15
|
### Added
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# NdrImport [![Build Status](https://github.com/NHSDigital/ndr_import/workflows/Test/badge.svg)](https://github.com/NHSDigital/ndr_import/actions?query=workflow%3Atest) [![Gem Version](https://badge.fury.io/rb/ndr_import.svg)](https://rubygems.org/gems/ndr_import) [![Documentation](https://img.shields.io/badge/ndr_import-docs-blue.svg)](https://www.rubydoc.info/gems/ndr_import)
|
2
2
|
This is the NHS Digital (NHSD) National Disease Registers (NDR) Import ETL ruby gem, providing:
|
3
3
|
|
4
|
-
1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), JSON Lines, .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip and Zip files.
|
4
|
+
1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), JSON Lines, .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip, Zip, avro and VCF files.
|
5
5
|
2. table mappers for *transforming* tabular and non-tabular data into key value pairs grouped by a common "klass".
|
6
6
|
|
7
7
|
## Installation
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'ndr_import/table'
|
2
|
+
|
3
|
+
module NdrImport
  module Avro
    # Syntactic sugar to ensure `header_lines` and `footer_lines` are 1 and 0 respectively.
    # All other Table logic is inherited from `NdrImport::Table`
    class Table < ::NdrImport::Table
      # Scaffold an `NdrImport::Avro::Table` instance from an avro schema (.avsc) file.
      #
      # The resulting table has one column per schema field, a placeholder klass of
      # 'ExampleKlass', and a filename pattern derived from the schema's basename with
      # the `.avsc` extension swapped for `.avro`.
      #
      # @param safe_path [SafePath] location of the avro schema file
      # @return [NdrImport::Avro::Table] the scaffolded table
      # @raise [SecurityError] if `safe_path` is not a SafePath
      def self.from_schema(safe_path)
        raise SecurityError, "#{safe_path} is not a SafePath" unless safe_path.is_a? SafePath

        # Block form of File.open: the handle is closed once the schema has been parsed,
        # even if parsing raises.
        schema = ::File.open(safe_path) { |schema_file| ::Avro::Schema.parse(schema_file) }
        file_name = SafeFile.basename(safe_path).sub(/\.avsc\z/, '.avro')

        new(filename_pattern: "/#{file_name}\\z/",
            klass: 'ExampleKlass',
            columns: columns_from(schema))
      end

      # `delimiter`, `header_lines` and `footer_lines` are not configurable for avro tables.
      def self.all_valid_options
        super - %w[delimiter header_lines footer_lines]
      end

      # Always 1 for avro tables.
      def header_lines
        1
      end

      # Always 0 for avro tables.
      def footer_lines
        0
      end

      # Build one column definition per schema field. Fields carrying the `date` logical
      # type additionally get a `daysafter` mapping relative to 1970-01-01.
      def self.columns_from(schema)
        schema.fields.map do |field|
          column = { column: field.name }
          column[:mappings] = { field: field.name, daysafter: '1970-01-01' } if date_field?(field)

          column
        end
      end

      # True when the field's type, or any member of it when the type is a union,
      # has the `date` logical type.
      def self.date_field?(field)
        type = field.type
        # Only types exposing `schemas` (unions) are unpacked; any other type is checked
        # directly rather than raising NoMethodError on `schemas`.
        candidates = type.respond_to?(:schemas) ? type.schemas : [type]
        candidates.any? { |schema| schema.logical_type == 'date' }
      end

      private_class_method :columns_from
      private_class_method :date_field?
    end
  end
end
|
data/lib/ndr_import/file/all.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require_relative 'base'
|
2
2
|
require_relative 'acro_form'
|
3
|
+
require_relative 'avro'
|
3
4
|
require_relative 'delimited'
|
4
5
|
require_relative 'docx'
|
5
6
|
require_relative 'excel'
|
@@ -7,6 +8,7 @@ require_relative 'pdf'
|
|
7
8
|
require_relative 'seven_zip'
|
8
9
|
require_relative 'text'
|
9
10
|
require_relative 'unregistered_filetype'
|
11
|
+
require_relative 'vcf'
|
10
12
|
require_relative 'word'
|
11
13
|
require_relative 'xml'
|
12
14
|
require_relative 'zip'
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'avro'
|
2
|
+
require 'ndr_support/safe_file'
|
3
|
+
require_relative 'registry'
|
4
|
+
|
5
|
+
module NdrImport
  # This is one of a collection of file handlers that deal with individual formats of data.
  # They can be instantiated directly or via the factory method Registry.tables
  module File
    # This class is an avro file handler that returns a single table.
    class Avro < Base
      private

      # Iterate over the records of the avro data file at @filename.
      #
      # Yields the keys of the first record once (as the "header" row), then yields
      # every record's values converted to strings. Returns an enumerator when no
      # block is given.
      #
      # Any StandardError raised while reading is re-raised as a RuntimeError whose
      # message is prefixed with the file's basename.
      def rows(&block)
        return enum_for(:rows) unless block

        # Block form of File.open so the handle is closed when iteration finishes,
        # raises, or is abandoned early by the caller.
        ::File.open(@filename, 'rb') do |file|
          data_reader = ::Avro::DataFile::Reader.new(file, ::Avro::IO::DatumReader.new)

          data_reader.each_with_index do |avro_row, i|
            # Ensure the first row is always the "header"
            yield(avro_row.keys) if i.zero?
            yield(avro_row.values.map(&:to_s))
          end
        end
      rescue StandardError => e
        raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
      end
    end
    Registry.register(Avro, 'avro')
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'bio-vcf/vcfline'
|
2
|
+
require 'ndr_support/safe_file'
|
3
|
+
require_relative 'registry'
|
4
|
+
|
5
|
+
module NdrImport
  # This is one of a collection of file handlers that deal with individual formats of data.
  # They can be instantiated directly or via the factory method Registry.tables
  module File
    # This class is a vcf file handler that returns a single table.
    class Vcf < Base
      private

      # Iterate over the lines of the VCF file at @filename.
      #
      # Lines beginning with `##` are skipped; every other line (including a line
      # beginning with a single `#`) is passed through `BioVcf::VcfLine.parse` and the
      # result is yielded. Returns an enumerator when no block is given.
      def rows(&block)
        return enum_for(:rows) unless block

        # Stream the file line by line instead of reading it all into memory first.
        ::File.foreach(@filename) do |line|
          next if line.start_with?('##')

          yield BioVcf::VcfLine.parse(line)
        end
      end
    end
    Registry.register(Vcf, 'vcf')
  end
end
|
data/lib/ndr_import/table.rb
CHANGED
@@ -12,7 +12,7 @@ module NdrImport
|
|
12
12
|
def self.all_valid_options
|
13
13
|
%w[canonical_name delimiter liberal_parsing filename_pattern file_password last_data_column
|
14
14
|
tablename_pattern header_lines footer_lines format klass columns xml_record_xpath slurp
|
15
|
-
row_identifier]
|
15
|
+
row_identifier significant_mapped_fields]
|
16
16
|
end
|
17
17
|
|
18
18
|
def all_valid_options
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'ndr_import/table'
|
2
|
+
|
3
|
+
module NdrImport
  module Vcf
    # Table variant for VCF data: `header_lines` is fixed at 1 and `footer_lines` at 0.
    # Everything else is inherited from `NdrImport::Table`.
    class Table < ::NdrImport::Table
      class << self
        # The parent's valid options, minus those that cannot be set on a VCF table.
        def all_valid_options
          unsupported = %w[delimiter header_lines footer_lines xml_record_xpath]
          super.reject { |option| unsupported.include?(option) }
        end
      end

      # Always 1 for VCF tables.
      def header_lines
        1
      end

      # Always 0 for VCF tables.
      def footer_lines
        0
      end
    end
  end
end
|
data/lib/ndr_import/version.rb
CHANGED
data/lib/ndr_import.rb
CHANGED
@@ -9,6 +9,8 @@ require 'ndr_import/non_tabular/table'
|
|
9
9
|
require 'ndr_import/fixed_width/table'
|
10
10
|
require 'ndr_import/xml/table'
|
11
11
|
require 'ndr_import/pdf_form/table'
|
12
|
+
require 'ndr_import/avro/table'
|
13
|
+
require 'ndr_import/vcf/table'
|
12
14
|
require 'ndr_import/unmapped_data_error'
|
13
15
|
require 'ndr_import/acroform_reader'
|
14
16
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ndr_import
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 10.1.3
|
4
|
+
version: 10.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- NCRS Development Team
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-09-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activemodel
|
@@ -30,7 +30,7 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '6.0'
|
33
|
+
version: '6.1'
|
34
34
|
- - "<"
|
35
35
|
- !ruby/object:Gem::Version
|
36
36
|
version: '7.1'
|
@@ -40,7 +40,7 @@ dependencies:
|
|
40
40
|
requirements:
|
41
41
|
- - ">="
|
42
42
|
- !ruby/object:Gem::Version
|
43
|
-
version: '6.0'
|
43
|
+
version: '6.1'
|
44
44
|
- - "<"
|
45
45
|
- !ruby/object:Gem::Version
|
46
46
|
version: '7.1'
|
@@ -92,6 +92,34 @@ dependencies:
|
|
92
92
|
- - "~>"
|
93
93
|
- !ruby/object:Gem::Version
|
94
94
|
version: '2.0'
|
95
|
+
- !ruby/object:Gem::Dependency
|
96
|
+
name: avro
|
97
|
+
requirement: !ruby/object:Gem::Requirement
|
98
|
+
requirements:
|
99
|
+
- - "~>"
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: 1.11.0
|
102
|
+
type: :runtime
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
requirements:
|
106
|
+
- - "~>"
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
version: 1.11.0
|
109
|
+
- !ruby/object:Gem::Dependency
|
110
|
+
name: bio-vcf
|
111
|
+
requirement: !ruby/object:Gem::Requirement
|
112
|
+
requirements:
|
113
|
+
- - "~>"
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
version: 0.9.5
|
116
|
+
type: :runtime
|
117
|
+
prerelease: false
|
118
|
+
version_requirements: !ruby/object:Gem::Requirement
|
119
|
+
requirements:
|
120
|
+
- - "~>"
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: 0.9.5
|
95
123
|
- !ruby/object:Gem::Dependency
|
96
124
|
name: docx
|
97
125
|
requirement: !ruby/object:Gem::Requirement
|
@@ -369,9 +397,11 @@ files:
|
|
369
397
|
- exe/word_to_text
|
370
398
|
- lib/ndr_import.rb
|
371
399
|
- lib/ndr_import/acroform_reader.rb
|
400
|
+
- lib/ndr_import/avro/table.rb
|
372
401
|
- lib/ndr_import/csv_library.rb
|
373
402
|
- lib/ndr_import/file/acro_form.rb
|
374
403
|
- lib/ndr_import/file/all.rb
|
404
|
+
- lib/ndr_import/file/avro.rb
|
375
405
|
- lib/ndr_import/file/base.rb
|
376
406
|
- lib/ndr_import/file/delimited.rb
|
377
407
|
- lib/ndr_import/file/docx.rb
|
@@ -383,6 +413,7 @@ files:
|
|
383
413
|
- lib/ndr_import/file/seven_zip.rb
|
384
414
|
- lib/ndr_import/file/text.rb
|
385
415
|
- lib/ndr_import/file/unregistered_filetype.rb
|
416
|
+
- lib/ndr_import/file/vcf.rb
|
386
417
|
- lib/ndr_import/file/word.rb
|
387
418
|
- lib/ndr_import/file/xml.rb
|
388
419
|
- lib/ndr_import/file/zip.rb
|
@@ -408,6 +439,7 @@ files:
|
|
408
439
|
- lib/ndr_import/table.rb
|
409
440
|
- lib/ndr_import/universal_importer_helper.rb
|
410
441
|
- lib/ndr_import/unmapped_data_error.rb
|
442
|
+
- lib/ndr_import/vcf/table.rb
|
411
443
|
- lib/ndr_import/version.rb
|
412
444
|
- lib/ndr_import/xml/control_char_escaper.rb
|
413
445
|
- lib/ndr_import/xml/table.rb
|
@@ -423,14 +455,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
423
455
|
requirements:
|
424
456
|
- - ">="
|
425
457
|
- !ruby/object:Gem::Version
|
426
|
-
version: '2.7'
|
458
|
+
version: '3.0'
|
427
459
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
428
460
|
requirements:
|
429
461
|
- - ">="
|
430
462
|
- !ruby/object:Gem::Version
|
431
463
|
version: '0'
|
432
464
|
requirements: []
|
433
|
-
rubygems_version: 3.
|
465
|
+
rubygems_version: 3.4.10
|
434
466
|
signing_key:
|
435
467
|
specification_version: 4
|
436
468
|
summary: NDR Import
|