ndr_import 8.3.0 → 8.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 82e0bb47cb3a5c62b7cd5701c7196760cf9977357490a0e9ce708c2d814a51fc
4
- data.tar.gz: 111f781ab628de952d6b305e4fd7f0f8eb412b6c7b7b539dca2786e46904b05d
3
+ metadata.gz: 1ff3d3bfb186b79ee65c5f10a9c67508bbd91668424197b7dad3f256fbc7c78b
4
+ data.tar.gz: 73a0245645a2996d35d0bc3eefcec7c6cdfa37808b9ffb4c82dd49cb70ecc2ab
5
5
  SHA512:
6
- metadata.gz: f6d269d1d9e73ce154f7e8886a9109c16b74ba83d5b5c8f296f2c241609f1f5c9309afd08bee8538b34c36e534d48bacc5e1af6ff1fe89f89d9d30846c8841bc
7
- data.tar.gz: e332a7c3af8857095e5f1dc0de7fd7693f7df4b639c3f1a34858b4f5c7af68b73dbff751142e485a0f836e7addf656043dd944816ad03ecf9b56797d8cd5a740
6
+ metadata.gz: 04d43b45585b8577b98b2b7c77295071a73db86dc1bdd0b213e6479ff8ed5027d7c6dc3248a04dc83d6398f17683e7f0421e5205f46e57d328b5efd57c427fc5
7
+ data.tar.gz: 3ac7c8948019c4d45ab0ae2b29e21fca1f21657e845252e659f437b8a81521f052a03aa320089ab1afa33dfdeee2c27d5f97766724c0453fcb15f50f77b52476
data/CHANGELOG.md CHANGED
@@ -1,6 +1,9 @@
1
1
  ## [Unreleased]
2
2
  *no unreleased changes*
3
3
 
4
+ ## 8.4.0 / 2019-03-15
5
+ * Added ability to extract and transform PDF form data (#24)
6
+
4
7
  ## 8.3.0 / 2019-03-04
5
8
  ### Added
6
9
  * Allow `klass` in the column level mapping to be embedded array.
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  This is the Public Health England (PHE) National Disease Registers (NDR) Import ETL ruby gem, providing:
4
4
 
5
- 1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), .xls(x) spreadsheets, .doc(x) word documents, PDF, XML, 7-Zip and Zip files.
5
+ 1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip and Zip files.
6
6
  2. table mappers for *transforming* tabular and non-tabular data into key value pairs grouped by a common "klass".
7
7
 
8
8
  ## Installation
data/code_safety.yml CHANGED
@@ -19,7 +19,7 @@ file safety:
19
19
  CHANGELOG.md:
20
20
  comments:
21
21
  reviewed_by: josh.pencheon
22
- safe_revision: 6271165699e2d886f94d45277baa955023e3f094
22
+ safe_revision: 9dd7962195b259862a3294aba3de061b391435eb
23
23
  CODE_OF_CONDUCT.md:
24
24
  comments:
25
25
  reviewed_by: timgentry
@@ -39,7 +39,7 @@ file safety:
39
39
  README.md:
40
40
  comments:
41
41
  reviewed_by: josh.pencheon
42
- safe_revision: f2feb0c430947839183d7223e60e7c29b2c0f846
42
+ safe_revision: 5cd2cd0b3a1e254d30d4acc28c6731825a1f84f5
43
43
  Rakefile:
44
44
  comments:
45
45
  reviewed_by: josh.pencheon
@@ -59,15 +59,23 @@ file safety:
59
59
  lib/ndr_import.rb:
60
60
  comments:
61
61
  reviewed_by: josh.pencheon
62
- safe_revision: eca44583e9989159b45e90021dd1c65228447180
62
+ safe_revision: 24d6449fd0612552f132dfbf4cada2ae28d0469e
63
+ lib/ndr_import/acroform_reader.rb:
64
+ comments:
65
+ reviewed_by: josh.pencheon
66
+ safe_revision: 24d6449fd0612552f132dfbf4cada2ae28d0469e
63
67
  lib/ndr_import/csv_library.rb:
64
68
  comments:
65
69
  reviewed_by: josh.pencheon
66
70
  safe_revision: be12e57519d3737e8d3901d7b01485c6995708dd
71
+ lib/ndr_import/file/acro_form.rb:
72
+ comments:
73
+ reviewed_by: josh.pencheon
74
+ safe_revision: 8a6ea666616c5b5d7c93cdf5aa019e8fc69d19e1
67
75
  lib/ndr_import/file/all.rb:
68
76
  comments:
69
77
  reviewed_by: josh.pencheon
70
- safe_revision: f2feb0c430947839183d7223e60e7c29b2c0f846
78
+ safe_revision: 5cd2cd0b3a1e254d30d4acc28c6731825a1f84f5
71
79
  lib/ndr_import/file/base.rb:
72
80
  comments:
73
81
  reviewed_by: timgentry
@@ -180,6 +188,10 @@ file safety:
180
188
  comments:
181
189
  reviewed_by: josh.pencheon
182
190
  safe_revision: ac30f66578ab380649be800a4426d917ddbcb329
191
+ lib/ndr_import/pdf_form/table.rb:
192
+ comments:
193
+ reviewed_by: josh.pencheon
194
+ safe_revision: 5fd247eeb13a3f1356ab2d76ac9fabf9e19d5d36
183
195
  lib/ndr_import/standard_mappings.rb:
184
196
  comments:
185
197
  reviewed_by: josh.pencheon
@@ -192,10 +204,14 @@ file safety:
192
204
  comments:
193
205
  reviewed_by: josh.pencheon
194
206
  safe_revision: 3e3a852b58e8b169535e29029e535a10f6b9cd42
207
+ lib/ndr_import/unmapped_data_error.rb:
208
+ comments:
209
+ reviewed_by: josh.pencheon
210
+ safe_revision: 5cd2cd0b3a1e254d30d4acc28c6731825a1f84f5
195
211
  lib/ndr_import/version.rb:
196
212
  comments: another check?
197
213
  reviewed_by: josh.pencheon
198
- safe_revision: 6271165699e2d886f94d45277baa955023e3f094
214
+ safe_revision: 9dd7962195b259862a3294aba3de061b391435eb
199
215
  lib/ndr_import/xml/table.rb:
200
216
  comments:
201
217
  reviewed_by: josh.pencheon
@@ -204,6 +220,10 @@ file safety:
204
220
  comments:
205
221
  reviewed_by: josh.pencheon
206
222
  safe_revision: 607c0668f1fffd70d181bc1a31c4f56eed5f6189
223
+ test/file/acro_form_test.rb:
224
+ comments:
225
+ reviewed_by: josh.pencheon
226
+ safe_revision: 5cd2cd0b3a1e254d30d4acc28c6731825a1f84f5
207
227
  test/file/base_test.rb:
208
228
  comments:
209
229
  reviewed_by: timgentry
@@ -227,7 +247,7 @@ file safety:
227
247
  test/file/registry_test.rb:
228
248
  comments:
229
249
  reviewed_by: josh.pencheon
230
- safe_revision: f2feb0c430947839183d7223e60e7c29b2c0f846
250
+ safe_revision: 5cd2cd0b3a1e254d30d4acc28c6731825a1f84f5
231
251
  test/file/seven_zip_test.rb:
232
252
  comments:
233
253
  reviewed_by: josh.pencheon
@@ -292,10 +312,18 @@ file safety:
292
312
  comments:
293
313
  reviewed_by: timgentry
294
314
  safe_revision: cf382902508a21a95b80ac4582fbbd117164e80e
315
+ test/pdf_form/table_test.rb:
316
+ comments:
317
+ reviewed_by: josh.pencheon
318
+ safe_revision: 5cd2cd0b3a1e254d30d4acc28c6731825a1f84f5
295
319
  test/readme_test.rb:
296
320
  comments:
297
321
  reviewed_by: timgentry
298
322
  safe_revision: cf382902508a21a95b80ac4582fbbd117164e80e
323
+ test/resources/acro_form.pdf:
324
+ comments:
325
+ reviewed_by: josh.pencheon
326
+ safe_revision: 5cd2cd0b3a1e254d30d4acc28c6731825a1f84f5
299
327
  test/resources/blank_tab_test.xlsx:
300
328
  comments: reviewed contents
301
329
  reviewed_by: joshpencheon
@@ -0,0 +1,33 @@
1
+ require 'pdf-reader'
2
+
3
+ module NdrImport
4
+ # PDF AcroForm reader using the pdf-reader gem
5
+ class AcroFormReader < ::PDF::Reader
6
+ def fields_hash
7
+ fields = {}
8
+ fields_from(acroform[:Fields]).each do |field|
9
+ field_name = field[:T]
10
+ unless field[:Subtype] == :Widget || field.key?(:Kids)
11
+ raise "Widgets or Radio boxes expected, found a #{field[:Subtype].inspect}"
12
+ end
13
+ raise "Non-unique column name #{field_name}" if fields.key?(field_name)
14
+ fields[field_name] = field[:V]
15
+ end
16
+ fields
17
+ end
18
+
19
+ private
20
+
21
+ def acroform
22
+ @objects.deref(root[:AcroForm])
23
+ end
24
+
25
+ def fields_from(refs)
26
+ Array(refs).flat_map do |ref|
27
+ value = @objects[ref]
28
+ # PDF has its own Hash class
29
+ value.is_a?(::Hash) ? value : fields_from(value)
30
+ end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,28 @@
1
+ require 'pdf-reader'
2
+ require 'ndr_support/safe_file'
3
+ require_relative 'registry'
4
+
5
+ module NdrImport
6
+ # This is one of a collection of file handlers that deal with individual formats of data.
7
+ # They can be instantiated directly or via the factory method Registry.tables
8
+ module File
9
+ # This class is an AcroForm PDF file handler that returns a single table
10
+ class AcroForm < Base
11
+ private
12
+
13
+ def rows(&block)
14
+ return enum_for(:rows) unless block
15
+
16
+ [reader.fields_hash].each(&block)
17
+ rescue NoMethodError
18
+ raise "Failed to read #{SafeFile.basename(@filename)} as an AcroForm PDF"
19
+ end
20
+
21
+ def reader
22
+ @reader ||= AcroFormReader.new(SafeFile.safepath_to_string(@filename))
23
+ end
24
+ end
25
+
26
+ Registry.register(AcroForm, 'acroform')
27
+ end
28
+ end
@@ -1,4 +1,5 @@
1
1
  require_relative 'base'
2
+ require_relative 'acro_form'
2
3
  require_relative 'delimited'
3
4
  require_relative 'docx'
4
5
  require_relative 'excel'
@@ -0,0 +1,71 @@
1
+ require 'ndr_import/table'
2
+
3
+ module NdrImport
4
+ module PdfForm
5
+ # This class maintains the state of a PDF form table mapping and encapsulates
6
+ # the logic required to transform a table of data into "records". Particular
7
+ # attention has been paid to using enumerables throughout to help with the
8
+ # transformation of large quantities of data.
9
+ class Table < ::NdrImport::Table
10
+ def self.all_valid_options
11
+ super - %w[delimiter footer_lines format header_lines]
12
+ end
13
+
14
+ def footer_lines
15
+ 0
16
+ end
17
+
18
+ def format
19
+ 'acroform'
20
+ end
21
+
22
+ def header_lines
23
+ 0
24
+ end
25
+
26
+ # This method transforms an incoming line (Hash) of data.
27
+ # Each of the klass masked mappings are applied to the hash values, which are reordered by
28
+ # the mapping definition, yielding the klass and fields for each mapped klass.
29
+ def transform_line(line, index)
30
+ return enum_for(:transform_line, line, index) unless block_given?
31
+
32
+ raise 'NdrImport::PdfForm::Table expects a Hash!' unless line.is_a? Hash
33
+
34
+ validate_column_mappings(line)
35
+
36
+ masked_mappings.each do |klass, klass_mappings|
37
+ ordered_line = order_values_by_mappings(line, klass_mappings)
38
+ fields = mapped_line(ordered_line, klass_mappings)
39
+ next if fields[:skip].to_s == 'true'.freeze
40
+ yield(klass, fields, index)
41
+ end
42
+ end
43
+
44
+ private
45
+
46
+ # Ensure every key has a column mapping
47
+ def validate_column_mappings(line)
48
+ unmapped = []
49
+ line.each_key do |key|
50
+ next if column_names.include? key
51
+ unmapped << key
52
+ end
53
+ raise NdrImport::UnmappedDataError, unmapped if unmapped.any?
54
+ end
55
+
56
+ def column_name_from(column)
57
+ column[Strings::COLUMN] || column[Strings::STANDARD_MAPPING]
58
+ end
59
+
60
+ def column_names
61
+ @column_names ||= columns.map { |column| column_name_from(column) }
62
+ end
63
+
64
+ # Return an Array of the `hash` values in the order the columns are defined in the mapping,
65
+ # allowing mapped_line to work as normal
66
+ def order_values_by_mappings(hash, column_mappings)
67
+ column_mappings.map { |column_mapping| hash[column_name_from(column_mapping)].to_s }
68
+ end
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,14 @@
1
+ require 'active_support/core_ext/array/conversions'
2
+
3
+ module NdrImport
4
+ # Raised if incoming data has not been mapped.
5
+ class UnmappedDataError < StandardError
6
+ attr_reader :keys
7
+
8
+ def initialize(keys)
9
+ @keys = keys
10
+ message = "Unmapped data: #{keys.to_sentence}"
11
+ super(message)
12
+ end
13
+ end
14
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
  # This stores the current version of the NdrImport gem
3
3
  module NdrImport
4
- VERSION = '8.3.0'.freeze
4
+ VERSION = '8.4.0'.freeze
5
5
  end
data/lib/ndr_import.rb CHANGED
@@ -8,6 +8,9 @@ require 'ndr_import/table'
8
8
  require 'ndr_import/non_tabular/table'
9
9
  require 'ndr_import/fixed_width/table'
10
10
  require 'ndr_import/xml/table'
11
+ require 'ndr_import/pdf_form/table'
12
+ require 'ndr_import/unmapped_data_error'
13
+ require 'ndr_import/acroform_reader'
11
14
 
12
15
  module NdrImport
13
16
  def self.root
@@ -0,0 +1,39 @@
1
+ require 'test_helper'
2
+ require 'ndr_import/file/acro_form'
3
+
4
+ module NdrImport
5
+ module File
6
+ # Acro Form file handler tests
7
+ class AcroFormTest < ActiveSupport::TestCase
8
+ def setup
9
+ @permanent_test_files = SafePath.new('permanent_test_files')
10
+ end
11
+
12
+ test 'should read pdf correctly' do
13
+ file_path = @permanent_test_files.join('acro_form.pdf')
14
+ handler = NdrImport::File::AcroForm.new(file_path, nil)
15
+
16
+ expected_row = { 'Group3' => nil,
17
+ 'Textbox1_required' => nil,
18
+ 'Textbox2_required' => nil,
19
+ 'List Box_required' => '3',
20
+ 'Dropdown_required' => '3',
21
+ 'Textbox1_optional' => nil,
22
+ 'Textbox2_optional' => nil,
23
+ 'List Box_optional' => '3',
24
+ 'Dropdown_optional' => '3',
25
+ 'Date_required' => nil,
26
+ 'Date__optional' => nil,
27
+ 'Textbox3_numerical_required' => nil,
28
+ 'Textbox3_numerical_optional' => nil }
29
+
30
+ handler.tables.each do |tablename, sheet|
31
+ assert_nil tablename
32
+ assert_instance_of Enumerator, sheet
33
+ assert_equal 1, sheet.to_a.size
34
+ assert_equal expected_row, sheet.to_a.first
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
@@ -12,7 +12,8 @@ module NdrImport
12
12
 
13
13
  test 'Registry.handlers' do
14
14
  assert_instance_of Hash, NdrImport::File::Registry.handlers
15
- assert_equal %w[7z csv delimited doc docx nontabular pdf text txt xls xlsx xml_table zip],
15
+ assert_equal %w[7z acroform csv delimited doc docx nontabular
16
+ pdf text txt xls xlsx xml_table zip],
16
17
  NdrImport::File::Registry.handlers.keys.sort
17
18
  end
18
19
 
@@ -0,0 +1,119 @@
1
+ require 'test_helper'
2
+
3
+ # This tests the NdrImport::PdfForm::Table mapping class
4
+ module PdfForm
5
+ class TableTest < ActiveSupport::TestCase
6
+ def setup
7
+ @form_data = [{
8
+ 'address2' => 'Address 2',
9
+ 'should_be_blank' => '',
10
+ 'date_2' => '2018-12-01',
11
+ 'date_1' => '2018-01-01',
12
+ 'address1' => 'Address'
13
+ }]
14
+ end
15
+
16
+ def test_should_transform_pdf_form_hash
17
+ table = NdrImport::PdfForm::Table.new(klass: 'SomeTestKlass',
18
+ columns: pdf_form_column_mapping)
19
+
20
+ assert_equal 'acroform', table.format
21
+
22
+ transformed_data = table.transform(@form_data)
23
+ assert_equal 1, transformed_data.count
24
+
25
+ expected_data = ['SomeTestKlass', { rawtext: {
26
+ 'address1' => 'Address',
27
+ 'address2' => 'Address 2',
28
+ 'missing_from_data' => '',
29
+ 'date_1' => '2018-01-01',
30
+ 'date_2' => '2018-12-01',
31
+ 'should_be_blank' => ''
32
+ } }, 0]
33
+
34
+ klass, fields, index = *transformed_data.first
35
+ assert_equal expected_data[0], klass
36
+ assert_equal expected_data[1], fields
37
+ assert_equal expected_data[-1], index
38
+ end
39
+
40
+ def test_should_transform_mutli_klass_pdf_form_hash
41
+ table = NdrImport::PdfForm::Table.new(columns: multi_klass_pdf_form_column_mapping)
42
+
43
+ expected_data = [
44
+ ['TestKlassOne',
45
+ { rawtext:
46
+ { 'address1' => 'Address',
47
+ 'address2' => 'Address 2',
48
+ 'missing_from_data' => '' } },
49
+ 0],
50
+ ['TestKlassTwo',
51
+ { rawtext:
52
+ { 'address1' => 'Address',
53
+ 'date_1' => '2018-01-01',
54
+ 'date_2' => '2018-12-01',
55
+ 'should_be_blank' => '' } },
56
+ 0]
57
+ ]
58
+
59
+ transformed_data = table.transform(@form_data).to_a
60
+ assert_equal 2, transformed_data.count
61
+
62
+ expected_data.each_with_index do |expected, index|
63
+ transformed = transformed_data[index]
64
+ assert_equal expected, transformed
65
+ end
66
+ end
67
+
68
+ def test_should_fail_with_unmappped_form_data
69
+ table = NdrImport::PdfForm::Table.new(klass: 'SomeTestKlass',
70
+ columns: partial_pdf_form_column_mapping)
71
+
72
+ exception = assert_raises(NdrImport::UnmappedDataError) { table.transform(@form_data).to_a }
73
+ assert exception.message == 'Unmapped data: address2 and date_1'
74
+ end
75
+
76
+ def test_should_not_be_valid_with_bespoke_format
77
+ exception = assert_raises(ArgumentError) { NdrImport::PdfForm::Table.new(format: 'a_format') }
78
+ exception.message == 'Unrecognised options: ["format"]'
79
+ end
80
+
81
+ private
82
+
83
+ def pdf_form_column_mapping
84
+ [
85
+ { 'column' => 'address1' },
86
+ { 'column' => 'address2' },
87
+ { 'column' => 'missing_from_data' },
88
+ { 'column' => 'date_1' },
89
+ { 'column' => 'date_2' },
90
+ { 'column' => 'should_be_blank' }
91
+ ]
92
+ end
93
+
94
+ def multi_klass_pdf_form_column_mapping
95
+ [
96
+ { 'column' => 'address1',
97
+ 'klass' => %w[TestKlassOne TestKlassTwo] },
98
+ { 'column' => 'address2',
99
+ 'klass' => 'TestKlassOne' },
100
+ { 'column' => 'missing_from_data',
101
+ 'klass' => 'TestKlassOne' },
102
+ { 'column' => 'date_1',
103
+ 'klass' => 'TestKlassTwo' },
104
+ { 'column' => 'date_2',
105
+ 'klass' => 'TestKlassTwo' },
106
+ { 'column' => 'should_be_blank',
107
+ 'klass' => 'TestKlassTwo' }
108
+ ]
109
+ end
110
+
111
+ def partial_pdf_form_column_mapping
112
+ [
113
+ { 'column' => 'address1' },
114
+ { 'column' => 'date_2' },
115
+ { 'column' => 'should_be_blank' }
116
+ ]
117
+ end
118
+ end
119
+ end
Binary file
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ndr_import
3
3
  version: !ruby/object:Gem::Version
4
- version: 8.3.0
4
+ version: 8.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - NCRS Development Team
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-03-04 00:00:00.000000000 Z
11
+ date: 2019-03-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activemodel
@@ -363,7 +363,9 @@ files:
363
363
  - gemfiles/Gemfile.rails51
364
364
  - gemfiles/Gemfile.rails52
365
365
  - lib/ndr_import.rb
366
+ - lib/ndr_import/acroform_reader.rb
366
367
  - lib/ndr_import/csv_library.rb
368
+ - lib/ndr_import/file/acro_form.rb
367
369
  - lib/ndr_import/file/all.rb
368
370
  - lib/ndr_import/file/base.rb
369
371
  - lib/ndr_import/file/delimited.rb
@@ -393,12 +395,15 @@ files:
393
395
  - lib/ndr_import/non_tabular/record.rb
394
396
  - lib/ndr_import/non_tabular/table.rb
395
397
  - lib/ndr_import/non_tabular_file_helper.rb
398
+ - lib/ndr_import/pdf_form/table.rb
396
399
  - lib/ndr_import/standard_mappings.rb
397
400
  - lib/ndr_import/table.rb
398
401
  - lib/ndr_import/universal_importer_helper.rb
402
+ - lib/ndr_import/unmapped_data_error.rb
399
403
  - lib/ndr_import/version.rb
400
404
  - lib/ndr_import/xml/table.rb
401
405
  - ndr_import.gemspec
406
+ - test/file/acro_form_test.rb
402
407
  - test/file/base_test.rb
403
408
  - test/file/delimited_test.rb
404
409
  - test/file/docx_test.rb
@@ -421,7 +426,9 @@ files:
421
426
  - test/non_tabular/mapping_test.rb
422
427
  - test/non_tabular/table_test.rb
423
428
  - test/non_tabular_file_helper_test.rb
429
+ - test/pdf_form/table_test.rb
424
430
  - test/readme_test.rb
431
+ - test/resources/acro_form.pdf
425
432
  - test/resources/blank_tab_test.xlsx
426
433
  - test/resources/bomd.csv
427
434
  - test/resources/broken.csv
@@ -495,6 +502,7 @@ signing_key:
495
502
  specification_version: 4
496
503
  summary: NDR Import
497
504
  test_files:
505
+ - test/file/acro_form_test.rb
498
506
  - test/file/base_test.rb
499
507
  - test/file/delimited_test.rb
500
508
  - test/file/docx_test.rb
@@ -517,7 +525,9 @@ test_files:
517
525
  - test/non_tabular/mapping_test.rb
518
526
  - test/non_tabular/table_test.rb
519
527
  - test/non_tabular_file_helper_test.rb
528
+ - test/pdf_form/table_test.rb
520
529
  - test/readme_test.rb
530
+ - test/resources/acro_form.pdf
521
531
  - test/resources/blank_tab_test.xlsx
522
532
  - test/resources/bomd.csv
523
533
  - test/resources/broken.csv