ndr_import 10.1.3 → 10.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -1
- data/README.md +1 -1
- data/lib/ndr_import/avro/table.rb +49 -0
- data/lib/ndr_import/file/all.rb +1 -0
- data/lib/ndr_import/file/avro.rb +34 -0
- data/lib/ndr_import/table.rb +1 -1
- data/lib/ndr_import/version.rb +1 -1
- data/lib/ndr_import.rb +1 -0
- metadata +19 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: aaca9cf96e7433b7889c004f769997288ea1cec77322754e965b97a545ffb0ab
|
4
|
+
data.tar.gz: 4a1e456f8766f2ea3422b0b712b2ca4c420873248e66fc77b1db8559470a1247
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '081ce6d8bce5dde04dca97a68057897a093bc90cca23f7336415330d92dcc599b96b606e9626358731c6638f0c3c1e41b7e763b2c03488cb76eeaa5c5c7a2cd8'
|
7
|
+
data.tar.gz: 802ba8017a16cc843196004854c3a82d161e76bdd4c2f61a5f93e34018dae9090212dd67c4e503a46c5ee78d6ed2da2d7e6f17192e62e4abefbd2382389d1cce
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# NdrImport [Build Status](https://github.com/NHSDigital/ndr_import/actions?query=workflow%3Atest) [Gem Version](https://rubygems.org/gems/ndr_import) [Documentation](https://www.rubydoc.info/gems/ndr_import)
|
2
2
|
This is the NHS Digital (NHSD) National Disease Registers (NDR) Import ETL ruby gem, providing:
|
3
3
|
|
4
|
-
1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), JSON Lines, .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip and
|
4
|
+
1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), JSON Lines, .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip, Zip and avro files.
|
5
5
|
2. table mappers for *transforming* tabular and non-tabular data into key value pairs grouped by a common "klass".
|
6
6
|
|
7
7
|
## Installation
|
@@ -0,0 +1,49 @@
|
|
1
|
+
require 'ndr_import/table'
|
2
|
+
|
3
|
+
module NdrImport
|
4
|
+
module Avro
|
5
|
+
# Syntactic sugar to ensure `header_lines` and `footer_lines` are 1 and 0 respectively.
|
6
|
+
# All other Table logic is inherited from `NdrImport::Table`
|
7
|
+
class Table < ::NdrImport::Table
|
8
|
+
# Scaffold an `NdrImport::Avro::Table` instance from avro schema file
|
9
|
+
def self.from_schema(safe_path)
|
10
|
+
raise SecurityError, "#{safe_path} is not a SafePath" unless safe_path.is_a? SafePath
|
11
|
+
|
12
|
+
table_columns = columns_from(::Avro::Schema.parse(::File.open(safe_path)))
|
13
|
+
file_name = SafeFile.basename(safe_path).sub(/\.avsc\z/, '.avro')
|
14
|
+
|
15
|
+
new(filename_pattern: "/#{file_name}\\z/",
|
16
|
+
klass: 'ExampleKlass',
|
17
|
+
columns: table_columns)
|
18
|
+
end
|
19
|
+
|
20
|
+
def self.all_valid_options
|
21
|
+
super - %w[delimiter header_lines footer_lines]
|
22
|
+
end
|
23
|
+
|
24
|
+
def header_lines
|
25
|
+
1
|
26
|
+
end
|
27
|
+
|
28
|
+
def footer_lines
|
29
|
+
0
|
30
|
+
end
|
31
|
+
|
32
|
+
def self.columns_from(schema)
|
33
|
+
schema.fields.map do |field|
|
34
|
+
column = { column: field.name }
|
35
|
+
column[:mappings] = { field: field.name, daysafter: '1970-01-01' } if date_field?(field)
|
36
|
+
|
37
|
+
column
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
def self.date_field?(field)
|
42
|
+
field.type.schemas.any? { |schema| schema.logical_type == 'date' }
|
43
|
+
end
|
44
|
+
|
45
|
+
private_class_method :columns_from
|
46
|
+
private_class_method :date_field?
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
data/lib/ndr_import/file/all.rb
CHANGED
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'avro'
|
2
|
+
require 'ndr_support/safe_file'
|
3
|
+
require_relative 'registry'
|
4
|
+
|
5
|
+
module NdrImport
|
6
|
+
# This is one of a collection of file handlers that deal with individual formats of data.
|
7
|
+
# They can be instantiated directly or via the factory method Registry.tables
|
8
|
+
module File
|
9
|
+
# This class is an avro file handler that returns a single table.
|
10
|
+
class Avro < Base
|
11
|
+
private
|
12
|
+
|
13
|
+
def rows(&block)
|
14
|
+
return enum_for(:rows) unless block
|
15
|
+
|
16
|
+
# Create an instance of DatumReader
|
17
|
+
reader = ::Avro::IO::DatumReader.new
|
18
|
+
# Open @filename in read mode
|
19
|
+
file = ::File.open(@filename, 'rb')
|
20
|
+
# Equivalent to DataFileReader instance creation in Java
|
21
|
+
dr = ::Avro::DataFile::Reader.new(file, reader)
|
22
|
+
|
23
|
+
dr.each_with_index do |avro_row, i|
|
24
|
+
# Ensure the first row is always the "header"
|
25
|
+
yield(avro_row.keys) if i.zero?
|
26
|
+
yield(avro_row.values.map(&:to_s))
|
27
|
+
end
|
28
|
+
rescue StandardError => e
|
29
|
+
raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
|
30
|
+
end
|
31
|
+
end
|
32
|
+
Registry.register(Avro, 'avro')
|
33
|
+
end
|
34
|
+
end
|
data/lib/ndr_import/table.rb
CHANGED
@@ -12,7 +12,7 @@ module NdrImport
|
|
12
12
|
def self.all_valid_options
|
13
13
|
%w[canonical_name delimiter liberal_parsing filename_pattern file_password last_data_column
|
14
14
|
tablename_pattern header_lines footer_lines format klass columns xml_record_xpath slurp
|
15
|
-
row_identifier]
|
15
|
+
row_identifier significant_mapped_fields]
|
16
16
|
end
|
17
17
|
|
18
18
|
def all_valid_options
|
data/lib/ndr_import/version.rb
CHANGED
data/lib/ndr_import.rb
CHANGED
@@ -9,6 +9,7 @@ require 'ndr_import/non_tabular/table'
|
|
9
9
|
require 'ndr_import/fixed_width/table'
|
10
10
|
require 'ndr_import/xml/table'
|
11
11
|
require 'ndr_import/pdf_form/table'
|
12
|
+
require 'ndr_import/avro/table'
|
12
13
|
require 'ndr_import/unmapped_data_error'
|
13
14
|
require 'ndr_import/acroform_reader'
|
14
15
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ndr_import
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 10.1.3
|
4
|
+
version: 10.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- NCRS Development Team
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-05-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activemodel
|
@@ -92,6 +92,20 @@ dependencies:
|
|
92
92
|
- - "~>"
|
93
93
|
- !ruby/object:Gem::Version
|
94
94
|
version: '2.0'
|
95
|
+
- !ruby/object:Gem::Dependency
|
96
|
+
name: avro
|
97
|
+
requirement: !ruby/object:Gem::Requirement
|
98
|
+
requirements:
|
99
|
+
- - "~>"
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: 1.11.0
|
102
|
+
type: :runtime
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
requirements:
|
106
|
+
- - "~>"
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
version: 1.11.0
|
95
109
|
- !ruby/object:Gem::Dependency
|
96
110
|
name: docx
|
97
111
|
requirement: !ruby/object:Gem::Requirement
|
@@ -369,9 +383,11 @@ files:
|
|
369
383
|
- exe/word_to_text
|
370
384
|
- lib/ndr_import.rb
|
371
385
|
- lib/ndr_import/acroform_reader.rb
|
386
|
+
- lib/ndr_import/avro/table.rb
|
372
387
|
- lib/ndr_import/csv_library.rb
|
373
388
|
- lib/ndr_import/file/acro_form.rb
|
374
389
|
- lib/ndr_import/file/all.rb
|
390
|
+
- lib/ndr_import/file/avro.rb
|
375
391
|
- lib/ndr_import/file/base.rb
|
376
392
|
- lib/ndr_import/file/delimited.rb
|
377
393
|
- lib/ndr_import/file/docx.rb
|
@@ -430,7 +446,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
430
446
|
- !ruby/object:Gem::Version
|
431
447
|
version: '0'
|
432
448
|
requirements: []
|
433
|
-
rubygems_version: 3.
|
449
|
+
rubygems_version: 3.2.33
|
434
450
|
signing_key:
|
435
451
|
specification_version: 4
|
436
452
|
summary: NDR Import
|