ndr_import 10.1.3 → 10.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: adf4468c5093398277008f08e3b4ff01619b0b39152dd7a604cccc863e7229f2
4
- data.tar.gz: aa3225ff8384c7bc42f33251c2943060382b9c27dae6243fbe70c18796597fbf
3
+ metadata.gz: aaca9cf96e7433b7889c004f769997288ea1cec77322754e965b97a545ffb0ab
4
+ data.tar.gz: 4a1e456f8766f2ea3422b0b712b2ca4c420873248e66fc77b1db8559470a1247
5
5
  SHA512:
6
- metadata.gz: d437ffcdf6733d4d62859f3450f47a7067bc7155c93404c14b883b5b2cb45dd661d555146c6317d87be640ad7033b7d214c5be6842cd32009e8462a398105249
7
- data.tar.gz: f7d10bf3ba7c1e16c75cb05585ff70d88dd56b00305b389752b5d5dec329d88f313454494307025b53cb02e1164c10a4328286b8c55220cdf506f5cf7bf4e094
6
+ metadata.gz: '081ce6d8bce5dde04dca97a68057897a093bc90cca23f7336415330d92dcc599b96b606e9626358731c6638f0c3c1e41b7e763b2c03488cb76eeaa5c5c7a2cd8'
7
+ data.tar.gz: 802ba8017a16cc843196004854c3a82d161e76bdd4c2f61a5f93e34018dae9090212dd67c4e503a46c5ee78d6ed2da2d7e6f17192e62e4abefbd2382389d1cce
data/CHANGELOG.md CHANGED
@@ -1,5 +1,9 @@
1
1
  ## [Unreleased]
2
- * No unreleased changes
2
+ *no unreleased changes*
3
+
4
+ ## 10.2.0 / 2023-05-16
5
+ * Avro file support
6
+ * allow storage of `significant_mapped_fields` in `Table`
3
7
 
4
8
  ## 10.1.3 / 2022-12-08
5
9
  ### Added
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # NdrImport [![Build Status](https://github.com/NHSDigital/ndr_import/workflows/Test/badge.svg)](https://github.com/NHSDigital/ndr_import/actions?query=workflow%3Atest) [![Gem Version](https://badge.fury.io/rb/ndr_import.svg)](https://rubygems.org/gems/ndr_import) [![Documentation](https://img.shields.io/badge/ndr_import-docs-blue.svg)](https://www.rubydoc.info/gems/ndr_import)
2
2
  This is the NHS Digital (NHSD) National Disease Registers (NDR) Import ETL ruby gem, providing:
3
3
 
4
- 1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), JSON Lines, .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip and Zip files.
4
+ 1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), JSON Lines, .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip, Zip and Avro files.
5
5
  2. table mappers for *transforming* tabular and non-tabular data into key value pairs grouped by a common "klass".
6
6
 
7
7
  ## Installation
@@ -0,0 +1,49 @@
1
+ require 'ndr_import/table'
2
+
3
module NdrImport
  module Avro
    # Syntactic sugar to ensure `header_lines` and `footer_lines` are 1 and 0 respectively.
    # All other Table logic is inherited from `NdrImport::Table`
    class Table < ::NdrImport::Table
      # Scaffold an `NdrImport::Avro::Table` instance from an Avro schema (.avsc) file.
      #
      # `safe_path` must be a SafePath; anything else raises SecurityError.
      # The returned table has one column per field of the schema, and a
      # filename pattern derived from the schema's basename with `.avsc`
      # swapped for `.avro`.
      def self.from_schema(safe_path)
        raise SecurityError, "#{safe_path} is not a SafePath" unless safe_path.is_a? SafePath

        # Read the schema JSON in one go so that no file handle is left open
        # after parsing.
        table_columns = columns_from(::Avro::Schema.parse(::File.read(safe_path)))
        file_name     = SafeFile.basename(safe_path).sub(/\.avsc\z/, '.avro')

        # NOTE: file_name is interpolated into the pattern unescaped, so regexp
        # metacharacters in it (e.g. the `.` of `.avro`) keep their regexp meaning.
        # 'ExampleKlass' is presumably a placeholder for the user to replace.
        new(filename_pattern: "/#{file_name}\\z/",
            klass: 'ExampleKlass',
            columns: table_columns)
      end

      # The options accepted by `NdrImport::Table`, minus those that are not
      # configurable for this table type (`header_lines` and `footer_lines`
      # are fixed below).
      def self.all_valid_options
        super - %w[delimiter header_lines footer_lines]
      end

      # Always exactly one header line.
      def header_lines
        1
      end

      # Never any footer lines.
      def footer_lines
        0
      end

      # Builds one column definition hash per schema field. Fields carrying the
      # `date` logical type also get a `daysafter: '1970-01-01'` mapping (the Avro
      # `date` logical type counts days from the Unix epoch).
      def self.columns_from(schema)
        schema.fields.map do |field|
          column = { column: field.name }
          column[:mappings] = { field: field.name, daysafter: '1970-01-01' } if date_field?(field)

          column
        end
      end

      # True if the field's type is, or is a union containing, a schema whose
      # logical type is `date`.
      def self.date_field?(field)
        type = field.type
        # Union types expose their member schemas via `schemas`; any other type
        # is inspected directly instead of raising NoMethodError.
        candidates = type.respond_to?(:schemas) ? type.schemas : [type]

        candidates.any? do |schema|
          schema.respond_to?(:logical_type) && schema.logical_type == 'date'
        end
      end

      private_class_method :columns_from
      private_class_method :date_field?
    end
  end
end
@@ -1,5 +1,6 @@
1
1
  require_relative 'base'
2
2
  require_relative 'acro_form'
3
+ require_relative 'avro'
3
4
  require_relative 'delimited'
4
5
  require_relative 'docx'
5
6
  require_relative 'excel'
@@ -0,0 +1,34 @@
1
+ require 'avro'
2
+ require 'ndr_support/safe_file'
3
+ require_relative 'registry'
4
+
5
module NdrImport
  # This is one of a collection of file handlers that deal with individual formats of data.
  # They can be instantiated directly or via the factory method Registry.tables
  module File
    # This class is an Avro file handler that returns a single table.
    class Avro < Base
      private

      # Yields the rows of the Avro data file at @filename, one array per row.
      # The field names of the first record are yielded first, as the "header"
      # row; each record is then yielded as an array of its values converted
      # with `to_s` (so nil values become empty strings).
      #
      # Returns an enumerator when called without a block. Any StandardError
      # raised while reading is re-raised as a RuntimeError whose message is
      # prefixed with the file's basename.
      def rows(&block)
        return enum_for(:rows) unless block

        # Block form guarantees the file handle is closed once iteration
        # finishes, or if it is interrupted by an error.
        ::File.open(@filename, 'rb') do |file|
          data_file = ::Avro::DataFile::Reader.new(file, ::Avro::IO::DatumReader.new)

          data_file.each_with_index do |avro_row, i|
            # Ensure the first row is always the "header"
            yield(avro_row.keys) if i.zero?
            yield(avro_row.values.map(&:to_s))
          end
        end
      rescue StandardError => e
        raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
      end
    end
    Registry.register(Avro, 'avro')
  end
end
@@ -12,7 +12,7 @@ module NdrImport
12
12
  def self.all_valid_options
13
13
  %w[canonical_name delimiter liberal_parsing filename_pattern file_password last_data_column
14
14
  tablename_pattern header_lines footer_lines format klass columns xml_record_xpath slurp
15
- row_identifier]
15
+ row_identifier significant_mapped_fields]
16
16
  end
17
17
 
18
18
  def all_valid_options
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
  # This stores the current version of the NdrImport gem
3
3
  module NdrImport
4
- VERSION = '10.1.3'
4
+ VERSION = '10.2.0'
5
5
  end
data/lib/ndr_import.rb CHANGED
@@ -9,6 +9,7 @@ require 'ndr_import/non_tabular/table'
9
9
  require 'ndr_import/fixed_width/table'
10
10
  require 'ndr_import/xml/table'
11
11
  require 'ndr_import/pdf_form/table'
12
+ require 'ndr_import/avro/table'
12
13
  require 'ndr_import/unmapped_data_error'
13
14
  require 'ndr_import/acroform_reader'
14
15
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ndr_import
3
3
  version: !ruby/object:Gem::Version
4
- version: 10.1.3
4
+ version: 10.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - NCRS Development Team
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-12-08 00:00:00.000000000 Z
11
+ date: 2023-05-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activemodel
@@ -92,6 +92,20 @@ dependencies:
92
92
  - - "~>"
93
93
  - !ruby/object:Gem::Version
94
94
  version: '2.0'
95
+ - !ruby/object:Gem::Dependency
96
+ name: avro
97
+ requirement: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - "~>"
100
+ - !ruby/object:Gem::Version
101
+ version: 1.11.0
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ requirements:
106
+ - - "~>"
107
+ - !ruby/object:Gem::Version
108
+ version: 1.11.0
95
109
  - !ruby/object:Gem::Dependency
96
110
  name: docx
97
111
  requirement: !ruby/object:Gem::Requirement
@@ -369,9 +383,11 @@ files:
369
383
  - exe/word_to_text
370
384
  - lib/ndr_import.rb
371
385
  - lib/ndr_import/acroform_reader.rb
386
+ - lib/ndr_import/avro/table.rb
372
387
  - lib/ndr_import/csv_library.rb
373
388
  - lib/ndr_import/file/acro_form.rb
374
389
  - lib/ndr_import/file/all.rb
390
+ - lib/ndr_import/file/avro.rb
375
391
  - lib/ndr_import/file/base.rb
376
392
  - lib/ndr_import/file/delimited.rb
377
393
  - lib/ndr_import/file/docx.rb
@@ -430,7 +446,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
430
446
  - !ruby/object:Gem::Version
431
447
  version: '0'
432
448
  requirements: []
433
- rubygems_version: 3.3.26
449
+ rubygems_version: 3.2.33
434
450
  signing_key:
435
451
  specification_version: 4
436
452
  summary: NDR Import