ndr_import 10.1.3 → 10.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: adf4468c5093398277008f08e3b4ff01619b0b39152dd7a604cccc863e7229f2
4
- data.tar.gz: aa3225ff8384c7bc42f33251c2943060382b9c27dae6243fbe70c18796597fbf
3
+ metadata.gz: 877346774d65eccb73f913081d75a48345a92b9e4c11e6f87702c9e0d59ebc3d
4
+ data.tar.gz: e59f23137e7568ce5e6eb1e4734b9046aed8f8fa93f392a9f2d82091013c3c04
5
5
  SHA512:
6
- metadata.gz: d437ffcdf6733d4d62859f3450f47a7067bc7155c93404c14b883b5b2cb45dd661d555146c6317d87be640ad7033b7d214c5be6842cd32009e8462a398105249
7
- data.tar.gz: f7d10bf3ba7c1e16c75cb05585ff70d88dd56b00305b389752b5d5dec329d88f313454494307025b53cb02e1164c10a4328286b8c55220cdf506f5cf7bf4e094
6
+ metadata.gz: 5bd15b2baf53b654a9f2be306d45dd3a509c298427790c139431f9f77df84b14f6506d625620b6781e9d44315fb8ada40307865a059c2ae3e78945293065dc9f
7
+ data.tar.gz: 06070cd9f9d5311835a523f8f926c938e0c65561c863d8bc972a1250a278ccf06991013773c31f877173873b600bbee40e6ce9faa6d3da167dd93a4fc11b4965
data/CHANGELOG.md CHANGED
@@ -1,5 +1,15 @@
1
1
  ## [Unreleased]
2
- * No unreleased changes
2
+
3
+ *no unreleased changes*
4
+
5
+ ## 10.3.0 / 2023-09-07
6
+ ### Added
7
+ * VCF file support
8
+ * Support Ruby 3.2. Drop support for Ruby 2.7, Rails 6.0
9
+
10
+ ## 10.2.0 / 2023-05-16
11
+ * avro file support
12
+ * allow storage of `significant_mapped_fields` in `Table`
3
13
 
4
14
  ## 10.1.3 / 2022-12-08
5
15
  ### Added
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # NdrImport [![Build Status](https://github.com/NHSDigital/ndr_import/workflows/Test/badge.svg)](https://github.com/NHSDigital/ndr_import/actions?query=workflow%3Atest) [![Gem Version](https://badge.fury.io/rb/ndr_import.svg)](https://rubygems.org/gems/ndr_import) [![Documentation](https://img.shields.io/badge/ndr_import-docs-blue.svg)](https://www.rubydoc.info/gems/ndr_import)
2
2
  This is the NHS Digital (NHSD) National Disease Registers (NDR) Import ETL ruby gem, providing:
3
3
 
4
- 1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), JSON Lines, .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip and Zip files.
4
+ 1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), JSON Lines, .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip, Zip, avro and VCF files.
5
5
  2. table mappers for *transforming* tabular and non-tabular data into key value pairs grouped by a common "klass".
6
6
 
7
7
  ## Installation
@@ -0,0 +1,49 @@
1
+ require 'ndr_import/table'
2
+
3
+ module NdrImport
4
+ module Avro
5
+ # Syntactic sugar to ensure `header_lines` and `footer_lines` are 1 and 0 respectively.
6
+ # All other Table logic is inherited from `NdrImport::Table`
7
+ class Table < ::NdrImport::Table
8
+ # Scaffold an `NdrImport::Avro::Table` instance from avro schema file
9
+ def self.from_schema(safe_path)
10
+ raise SecurityError, "#{safe_path} is not a SafePath" unless safe_path.is_a? SafePath
11
+
12
+ table_columns = columns_from(::Avro::Schema.parse(::File.open(safe_path)))
13
+ file_name = SafeFile.basename(safe_path).sub(/\.avsc\z/, '.avro')
14
+
15
+ new(filename_pattern: "/#{file_name}\\z/",
16
+ klass: 'ExampleKlass',
17
+ columns: table_columns)
18
+ end
19
+
20
+ def self.all_valid_options
21
+ super - %w[delimiter header_lines footer_lines]
22
+ end
23
+
24
+ def header_lines
25
+ 1
26
+ end
27
+
28
+ def footer_lines
29
+ 0
30
+ end
31
+
32
+ def self.columns_from(schema)
33
+ schema.fields.map do |field|
34
+ column = { column: field.name }
35
+ column[:mappings] = { field: field.name, daysafter: '1970-01-01' } if date_field?(field)
36
+
37
+ column
38
+ end
39
+ end
40
+
41
+ def self.date_field?(field)
42
+ field.type.schemas.any? { |schema| schema.logical_type == 'date' }
43
+ end
44
+
45
+ private_class_method :columns_from
46
+ private_class_method :date_field?
47
+ end
48
+ end
49
+ end
@@ -1,5 +1,6 @@
1
1
  require_relative 'base'
2
2
  require_relative 'acro_form'
3
+ require_relative 'avro'
3
4
  require_relative 'delimited'
4
5
  require_relative 'docx'
5
6
  require_relative 'excel'
@@ -7,6 +8,7 @@ require_relative 'pdf'
7
8
  require_relative 'seven_zip'
8
9
  require_relative 'text'
9
10
  require_relative 'unregistered_filetype'
11
+ require_relative 'vcf'
10
12
  require_relative 'word'
11
13
  require_relative 'xml'
12
14
  require_relative 'zip'
@@ -0,0 +1,34 @@
1
+ require 'avro'
2
+ require 'ndr_support/safe_file'
3
+ require_relative 'registry'
4
+
5
+ module NdrImport
6
+ # This is one of a collection of file handlers that deal with individual formats of data.
7
+ # They can be instantiated directly or via the factory method Registry.tables
8
+ module File
9
+ # This class is an avro file handler that returns a single table.
10
+ class Avro < Base
11
+ private
12
+
13
+ def rows(&block)
14
+ return enum_for(:rows) unless block
15
+
16
+ # Create an instance of DatumReader
17
+ reader = ::Avro::IO::DatumReader.new
18
+ # Open @filename in read mode
19
+ file = ::File.open(@filename, 'rb')
20
+ # Equivalent to DataFileReader instance creation in Java
21
+ dr = ::Avro::DataFile::Reader.new(file, reader)
22
+
23
+ dr.each_with_index do |avro_row, i|
24
+ # Ensure the first row is always the "header"
25
+ yield(avro_row.keys) if i.zero?
26
+ yield(avro_row.values.map(&:to_s))
27
+ end
28
+ rescue StandardError => e
29
+ raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
30
+ end
31
+ end
32
+ Registry.register(Avro, 'avro')
33
+ end
34
+ end
@@ -0,0 +1,25 @@
1
+ require 'bio-vcf/vcfline'
2
+ require 'ndr_support/safe_file'
3
+ require_relative 'registry'
4
+
5
+ module NdrImport
6
+ # This is one of a collection of file handlers that deal with individual formats of data.
7
+ # They can be instantiated directly or via the factory method Registry.tables
8
+ module File
9
+ # This class is a vcf file handler that returns a single table.
10
+ class Vcf < Base
11
+ private
12
+
13
+ def rows(&block)
14
+ return enum_for(:rows) unless block
15
+
16
+ ::File.read(@filename).each_line do |line|
17
+ next if line =~ /^##/
18
+
19
+ yield BioVcf::VcfLine.parse(line)
20
+ end
21
+ end
22
+ end
23
+ Registry.register(Vcf, 'vcf')
24
+ end
25
+ end
@@ -12,7 +12,7 @@ module NdrImport
12
12
  def self.all_valid_options
13
13
  %w[canonical_name delimiter liberal_parsing filename_pattern file_password last_data_column
14
14
  tablename_pattern header_lines footer_lines format klass columns xml_record_xpath slurp
15
- row_identifier]
15
+ row_identifier significant_mapped_fields]
16
16
  end
17
17
 
18
18
  def all_valid_options
@@ -0,0 +1,21 @@
1
+ require 'ndr_import/table'
2
+
3
+ module NdrImport
4
+ module Vcf
5
+ # Syntactic sugar to ensure `header_lines` and `footer_lines` are 1 and 0 respectively.
6
+ # All other Table logic is inherited from `NdrImport::Table`
7
+ class Table < ::NdrImport::Table
8
+ def self.all_valid_options
9
+ super - %w[delimiter header_lines footer_lines xml_record_xpath]
10
+ end
11
+
12
+ def header_lines
13
+ 1
14
+ end
15
+
16
+ def footer_lines
17
+ 0
18
+ end
19
+ end
20
+ end
21
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
  # This stores the current version of the NdrImport gem
3
3
  module NdrImport
4
- VERSION = '10.1.3'
4
+ VERSION = '10.3.0'
5
5
  end
data/lib/ndr_import.rb CHANGED
@@ -9,6 +9,8 @@ require 'ndr_import/non_tabular/table'
9
9
  require 'ndr_import/fixed_width/table'
10
10
  require 'ndr_import/xml/table'
11
11
  require 'ndr_import/pdf_form/table'
12
+ require 'ndr_import/avro/table'
13
+ require 'ndr_import/vcf/table'
12
14
  require 'ndr_import/unmapped_data_error'
13
15
  require 'ndr_import/acroform_reader'
14
16
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ndr_import
3
3
  version: !ruby/object:Gem::Version
4
- version: 10.1.3
4
+ version: 10.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - NCRS Development Team
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-12-08 00:00:00.000000000 Z
11
+ date: 2023-09-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activemodel
@@ -30,7 +30,7 @@ dependencies:
30
30
  requirements:
31
31
  - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: '6.0'
33
+ version: '6.1'
34
34
  - - "<"
35
35
  - !ruby/object:Gem::Version
36
36
  version: '7.1'
@@ -40,7 +40,7 @@ dependencies:
40
40
  requirements:
41
41
  - - ">="
42
42
  - !ruby/object:Gem::Version
43
- version: '6.0'
43
+ version: '6.1'
44
44
  - - "<"
45
45
  - !ruby/object:Gem::Version
46
46
  version: '7.1'
@@ -92,6 +92,34 @@ dependencies:
92
92
  - - "~>"
93
93
  - !ruby/object:Gem::Version
94
94
  version: '2.0'
95
+ - !ruby/object:Gem::Dependency
96
+ name: avro
97
+ requirement: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - "~>"
100
+ - !ruby/object:Gem::Version
101
+ version: 1.11.0
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ requirements:
106
+ - - "~>"
107
+ - !ruby/object:Gem::Version
108
+ version: 1.11.0
109
+ - !ruby/object:Gem::Dependency
110
+ name: bio-vcf
111
+ requirement: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - "~>"
114
+ - !ruby/object:Gem::Version
115
+ version: 0.9.5
116
+ type: :runtime
117
+ prerelease: false
118
+ version_requirements: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - "~>"
121
+ - !ruby/object:Gem::Version
122
+ version: 0.9.5
95
123
  - !ruby/object:Gem::Dependency
96
124
  name: docx
97
125
  requirement: !ruby/object:Gem::Requirement
@@ -369,9 +397,11 @@ files:
369
397
  - exe/word_to_text
370
398
  - lib/ndr_import.rb
371
399
  - lib/ndr_import/acroform_reader.rb
400
+ - lib/ndr_import/avro/table.rb
372
401
  - lib/ndr_import/csv_library.rb
373
402
  - lib/ndr_import/file/acro_form.rb
374
403
  - lib/ndr_import/file/all.rb
404
+ - lib/ndr_import/file/avro.rb
375
405
  - lib/ndr_import/file/base.rb
376
406
  - lib/ndr_import/file/delimited.rb
377
407
  - lib/ndr_import/file/docx.rb
@@ -383,6 +413,7 @@ files:
383
413
  - lib/ndr_import/file/seven_zip.rb
384
414
  - lib/ndr_import/file/text.rb
385
415
  - lib/ndr_import/file/unregistered_filetype.rb
416
+ - lib/ndr_import/file/vcf.rb
386
417
  - lib/ndr_import/file/word.rb
387
418
  - lib/ndr_import/file/xml.rb
388
419
  - lib/ndr_import/file/zip.rb
@@ -408,6 +439,7 @@ files:
408
439
  - lib/ndr_import/table.rb
409
440
  - lib/ndr_import/universal_importer_helper.rb
410
441
  - lib/ndr_import/unmapped_data_error.rb
442
+ - lib/ndr_import/vcf/table.rb
411
443
  - lib/ndr_import/version.rb
412
444
  - lib/ndr_import/xml/control_char_escaper.rb
413
445
  - lib/ndr_import/xml/table.rb
@@ -423,14 +455,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
423
455
  requirements:
424
456
  - - ">="
425
457
  - !ruby/object:Gem::Version
426
- version: '2.7'
458
+ version: '3.0'
427
459
  required_rubygems_version: !ruby/object:Gem::Requirement
428
460
  requirements:
429
461
  - - ">="
430
462
  - !ruby/object:Gem::Version
431
463
  version: '0'
432
464
  requirements: []
433
- rubygems_version: 3.3.26
465
+ rubygems_version: 3.4.10
434
466
  signing_key:
435
467
  specification_version: 4
436
468
  summary: NDR Import