ndr_import 10.1.3 → 10.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: adf4468c5093398277008f08e3b4ff01619b0b39152dd7a604cccc863e7229f2
4
- data.tar.gz: aa3225ff8384c7bc42f33251c2943060382b9c27dae6243fbe70c18796597fbf
3
+ metadata.gz: 877346774d65eccb73f913081d75a48345a92b9e4c11e6f87702c9e0d59ebc3d
4
+ data.tar.gz: e59f23137e7568ce5e6eb1e4734b9046aed8f8fa93f392a9f2d82091013c3c04
5
5
  SHA512:
6
- metadata.gz: d437ffcdf6733d4d62859f3450f47a7067bc7155c93404c14b883b5b2cb45dd661d555146c6317d87be640ad7033b7d214c5be6842cd32009e8462a398105249
7
- data.tar.gz: f7d10bf3ba7c1e16c75cb05585ff70d88dd56b00305b389752b5d5dec329d88f313454494307025b53cb02e1164c10a4328286b8c55220cdf506f5cf7bf4e094
6
+ metadata.gz: 5bd15b2baf53b654a9f2be306d45dd3a509c298427790c139431f9f77df84b14f6506d625620b6781e9d44315fb8ada40307865a059c2ae3e78945293065dc9f
7
+ data.tar.gz: 06070cd9f9d5311835a523f8f926c938e0c65561c863d8bc972a1250a278ccf06991013773c31f877173873b600bbee40e6ce9faa6d3da167dd93a4fc11b4965
data/CHANGELOG.md CHANGED
@@ -1,5 +1,15 @@
1
1
  ## [Unreleased]
2
- * No unreleased changes
2
+
3
+ *no unreleased changes*
4
+
5
+ ## 10.3.0 / 2023-09-07
6
+ ### Added
7
+ * VCF file support
8
+ * Support Ruby 3.2. Drop support for Ruby 2.7, Rails 6.0
9
+
10
+ ## 10.2.0 / 2023-05-16
11
+ * Avro file support
12
+ * allow storage of `significant_mapped_fields` in `Table`
3
13
 
4
14
  ## 10.1.3 / 2022-12-08
5
15
  ### Added
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # NdrImport [![Build Status](https://github.com/NHSDigital/ndr_import/workflows/Test/badge.svg)](https://github.com/NHSDigital/ndr_import/actions?query=workflow%3Atest) [![Gem Version](https://badge.fury.io/rb/ndr_import.svg)](https://rubygems.org/gems/ndr_import) [![Documentation](https://img.shields.io/badge/ndr_import-docs-blue.svg)](https://www.rubydoc.info/gems/ndr_import)
2
2
  This is the NHS Digital (NHSD) National Disease Registers (NDR) Import ETL ruby gem, providing:
3
3
 
4
- 1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), JSON Lines, .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip and Zip files.
4
+ 1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), JSON Lines, .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip, Zip, Avro and VCF files.
5
5
  2. table mappers for *transforming* tabular and non-tabular data into key value pairs grouped by a common "klass".
6
6
 
7
7
  ## Installation
@@ -0,0 +1,49 @@
1
+ require 'ndr_import/table'
2
+
3
+ module NdrImport
4
+ module Avro
5
+ # Syntactic sugar to ensure `header_lines` and `footer_lines` are 1 and 0 respectively.
6
+ # All other Table logic is inherited from `NdrImport::Table`
7
+ class Table < ::NdrImport::Table
8
+ # Scaffold an `NdrImport::Avro::Table` instance from an Avro schema file
9
+ def self.from_schema(safe_path)
10
+ raise SecurityError, "#{safe_path} is not a SafePath" unless safe_path.is_a? SafePath
11
+
12
+ table_columns = columns_from(::Avro::Schema.parse(::File.open(safe_path)))
13
+ file_name = SafeFile.basename(safe_path).sub(/\.avsc\z/, '.avro')
14
+
15
+ new(filename_pattern: "/#{file_name}\\z/",
16
+ klass: 'ExampleKlass',
17
+ columns: table_columns)
18
+ end
19
+
20
+ def self.all_valid_options
21
+ super - %w[delimiter header_lines footer_lines]
22
+ end
23
+
24
+ def header_lines
25
+ 1
26
+ end
27
+
28
+ def footer_lines
29
+ 0
30
+ end
31
+
32
+ def self.columns_from(schema)
33
+ schema.fields.map do |field|
34
+ column = { column: field.name }
35
+ column[:mappings] = { field: field.name, daysafter: '1970-01-01' } if date_field?(field)
36
+
37
+ column
38
+ end
39
+ end
40
+
41
+ def self.date_field?(field)
42
+ field.type.schemas.any? { |schema| schema.logical_type == 'date' }
43
+ end
44
+
45
+ private_class_method :columns_from
46
+ private_class_method :date_field?
47
+ end
48
+ end
49
+ end
@@ -1,5 +1,6 @@
1
1
  require_relative 'base'
2
2
  require_relative 'acro_form'
3
+ require_relative 'avro'
3
4
  require_relative 'delimited'
4
5
  require_relative 'docx'
5
6
  require_relative 'excel'
@@ -7,6 +8,7 @@ require_relative 'pdf'
7
8
  require_relative 'seven_zip'
8
9
  require_relative 'text'
9
10
  require_relative 'unregistered_filetype'
11
+ require_relative 'vcf'
10
12
  require_relative 'word'
11
13
  require_relative 'xml'
12
14
  require_relative 'zip'
@@ -0,0 +1,34 @@
1
+ require 'avro'
2
+ require 'ndr_support/safe_file'
3
+ require_relative 'registry'
4
+
5
+ module NdrImport
6
+ # This is one of a collection of file handlers that deal with individual formats of data.
7
+ # They can be instantiated directly or via the factory method Registry.tables
8
+ module File
9
+ # This class is an Avro file handler that returns a single table.
10
+ class Avro < Base
11
+ private
12
+
13
+ def rows(&block)
14
+ return enum_for(:rows) unless block
15
+
16
+ # Create an instance of DatumReader
17
+ reader = ::Avro::IO::DatumReader.new
18
+ # Open @filename in read mode
19
+ file = ::File.open(@filename, 'rb')
20
+ # Equivalent to DataFileReader instance creation in Java
21
+ dr = ::Avro::DataFile::Reader.new(file, reader)
22
+
23
+ dr.each_with_index do |avro_row, i|
24
+ # Ensure the first row is always the "header"
25
+ yield(avro_row.keys) if i.zero?
26
+ yield(avro_row.values.map(&:to_s))
27
+ end
28
+ rescue StandardError => e
29
+ raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
30
+ end
31
+ end
32
+ Registry.register(Avro, 'avro')
33
+ end
34
+ end
@@ -0,0 +1,25 @@
1
+ require 'bio-vcf/vcfline'
2
+ require 'ndr_support/safe_file'
3
+ require_relative 'registry'
4
+
5
+ module NdrImport
6
+ # This is one of a collection of file handlers that deal with individual formats of data.
7
+ # They can be instantiated directly or via the factory method Registry.tables
8
+ module File
9
+ # This class is a VCF file handler that returns a single table.
10
+ class Vcf < Base
11
+ private
12
+
13
+ def rows(&block)
14
+ return enum_for(:rows) unless block
15
+
16
+ ::File.read(@filename).each_line do |line|
17
+ next if line =~ /^##/
18
+
19
+ yield BioVcf::VcfLine.parse(line)
20
+ end
21
+ end
22
+ end
23
+ Registry.register(Vcf, 'vcf')
24
+ end
25
+ end
@@ -12,7 +12,7 @@ module NdrImport
12
12
  def self.all_valid_options
13
13
  %w[canonical_name delimiter liberal_parsing filename_pattern file_password last_data_column
14
14
  tablename_pattern header_lines footer_lines format klass columns xml_record_xpath slurp
15
- row_identifier]
15
+ row_identifier significant_mapped_fields]
16
16
  end
17
17
 
18
18
  def all_valid_options
@@ -0,0 +1,21 @@
1
+ require 'ndr_import/table'
2
+
3
+ module NdrImport
4
+ module Vcf
5
+ # Syntactic sugar to ensure `header_lines` and `footer_lines` are 1 and 0 respectively.
6
+ # All other Table logic is inherited from `NdrImport::Table`
7
+ class Table < ::NdrImport::Table
8
+ def self.all_valid_options
9
+ super - %w[delimiter header_lines footer_lines xml_record_xpath]
10
+ end
11
+
12
+ def header_lines
13
+ 1
14
+ end
15
+
16
+ def footer_lines
17
+ 0
18
+ end
19
+ end
20
+ end
21
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
  # This stores the current version of the NdrImport gem
3
3
  module NdrImport
4
- VERSION = '10.1.3'
4
+ VERSION = '10.3.0'
5
5
  end
data/lib/ndr_import.rb CHANGED
@@ -9,6 +9,8 @@ require 'ndr_import/non_tabular/table'
9
9
  require 'ndr_import/fixed_width/table'
10
10
  require 'ndr_import/xml/table'
11
11
  require 'ndr_import/pdf_form/table'
12
+ require 'ndr_import/avro/table'
13
+ require 'ndr_import/vcf/table'
12
14
  require 'ndr_import/unmapped_data_error'
13
15
  require 'ndr_import/acroform_reader'
14
16
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ndr_import
3
3
  version: !ruby/object:Gem::Version
4
- version: 10.1.3
4
+ version: 10.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - NCRS Development Team
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-12-08 00:00:00.000000000 Z
11
+ date: 2023-09-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activemodel
@@ -30,7 +30,7 @@ dependencies:
30
30
  requirements:
31
31
  - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: '6.0'
33
+ version: '6.1'
34
34
  - - "<"
35
35
  - !ruby/object:Gem::Version
36
36
  version: '7.1'
@@ -40,7 +40,7 @@ dependencies:
40
40
  requirements:
41
41
  - - ">="
42
42
  - !ruby/object:Gem::Version
43
- version: '6.0'
43
+ version: '6.1'
44
44
  - - "<"
45
45
  - !ruby/object:Gem::Version
46
46
  version: '7.1'
@@ -92,6 +92,34 @@ dependencies:
92
92
  - - "~>"
93
93
  - !ruby/object:Gem::Version
94
94
  version: '2.0'
95
+ - !ruby/object:Gem::Dependency
96
+ name: avro
97
+ requirement: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - "~>"
100
+ - !ruby/object:Gem::Version
101
+ version: 1.11.0
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ requirements:
106
+ - - "~>"
107
+ - !ruby/object:Gem::Version
108
+ version: 1.11.0
109
+ - !ruby/object:Gem::Dependency
110
+ name: bio-vcf
111
+ requirement: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - "~>"
114
+ - !ruby/object:Gem::Version
115
+ version: 0.9.5
116
+ type: :runtime
117
+ prerelease: false
118
+ version_requirements: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - "~>"
121
+ - !ruby/object:Gem::Version
122
+ version: 0.9.5
95
123
  - !ruby/object:Gem::Dependency
96
124
  name: docx
97
125
  requirement: !ruby/object:Gem::Requirement
@@ -369,9 +397,11 @@ files:
369
397
  - exe/word_to_text
370
398
  - lib/ndr_import.rb
371
399
  - lib/ndr_import/acroform_reader.rb
400
+ - lib/ndr_import/avro/table.rb
372
401
  - lib/ndr_import/csv_library.rb
373
402
  - lib/ndr_import/file/acro_form.rb
374
403
  - lib/ndr_import/file/all.rb
404
+ - lib/ndr_import/file/avro.rb
375
405
  - lib/ndr_import/file/base.rb
376
406
  - lib/ndr_import/file/delimited.rb
377
407
  - lib/ndr_import/file/docx.rb
@@ -383,6 +413,7 @@ files:
383
413
  - lib/ndr_import/file/seven_zip.rb
384
414
  - lib/ndr_import/file/text.rb
385
415
  - lib/ndr_import/file/unregistered_filetype.rb
416
+ - lib/ndr_import/file/vcf.rb
386
417
  - lib/ndr_import/file/word.rb
387
418
  - lib/ndr_import/file/xml.rb
388
419
  - lib/ndr_import/file/zip.rb
@@ -408,6 +439,7 @@ files:
408
439
  - lib/ndr_import/table.rb
409
440
  - lib/ndr_import/universal_importer_helper.rb
410
441
  - lib/ndr_import/unmapped_data_error.rb
442
+ - lib/ndr_import/vcf/table.rb
411
443
  - lib/ndr_import/version.rb
412
444
  - lib/ndr_import/xml/control_char_escaper.rb
413
445
  - lib/ndr_import/xml/table.rb
@@ -423,14 +455,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
423
455
  requirements:
424
456
  - - ">="
425
457
  - !ruby/object:Gem::Version
426
- version: '2.7'
458
+ version: '3.0'
427
459
  required_rubygems_version: !ruby/object:Gem::Requirement
428
460
  requirements:
429
461
  - - ">="
430
462
  - !ruby/object:Gem::Version
431
463
  version: '0'
432
464
  requirements: []
433
- rubygems_version: 3.3.26
465
+ rubygems_version: 3.4.10
434
466
  signing_key:
435
467
  specification_version: 4
436
468
  summary: NDR Import