ndr_import 8.5.0 → 8.5.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (94) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +7 -0
  3. data/Gemfile +0 -3
  4. data/README.md +6 -0
  5. data/bin/console +14 -0
  6. data/bin/setup +8 -0
  7. data/code_safety.yml +27 -11
  8. data/exe/pdf_acro_form_to_yaml +23 -0
  9. data/exe/pdf_to_text +28 -0
  10. data/exe/word_to_text +26 -0
  11. data/gemfiles/Gemfile.rails52 +0 -3
  12. data/gemfiles/Gemfile.rails60 +5 -0
  13. data/lib/ndr_import/version.rb +1 -1
  14. data/ndr_import.gemspec +9 -7
  15. metadata +23 -164
  16. data/gemfiles/Gemfile.rails50 +0 -8
  17. data/gemfiles/Gemfile.rails51 +0 -9
  18. data/test/file/acro_form_test.rb +0 -39
  19. data/test/file/base_test.rb +0 -54
  20. data/test/file/delimited_test.rb +0 -233
  21. data/test/file/docx_test.rb +0 -53
  22. data/test/file/excel_test.rb +0 -124
  23. data/test/file/pdf_test.rb +0 -36
  24. data/test/file/registry_test.rb +0 -62
  25. data/test/file/seven_zip_test.rb +0 -59
  26. data/test/file/text_test.rb +0 -92
  27. data/test/file/word_test.rb +0 -35
  28. data/test/file/xml_test.rb +0 -21
  29. data/test/file/zip_test.rb +0 -47
  30. data/test/fixed_width/table_test.rb +0 -35
  31. data/test/helpers/file/delimited_test.rb +0 -105
  32. data/test/helpers/file/excel_test.rb +0 -82
  33. data/test/helpers/file/pdf_test.rb +0 -27
  34. data/test/helpers/file/word_test.rb +0 -26
  35. data/test/helpers/file/xml_test.rb +0 -131
  36. data/test/helpers/file/zip_test.rb +0 -75
  37. data/test/mapper_test.rb +0 -676
  38. data/test/non_tabular/mapping_test.rb +0 -36
  39. data/test/non_tabular/table_test.rb +0 -590
  40. data/test/non_tabular_file_helper_test.rb +0 -501
  41. data/test/pdf_form/table_test.rb +0 -119
  42. data/test/readme_test.rb +0 -53
  43. data/test/resources/acro_form.pdf +0 -0
  44. data/test/resources/blank_tab_test.xlsx +0 -0
  45. data/test/resources/bomd.csv +0 -3
  46. data/test/resources/broken.csv +0 -3
  47. data/test/resources/filesystem_paths.yml +0 -26
  48. data/test/resources/flat_file.pdf +0 -0
  49. data/test/resources/flat_file.txt +0 -27
  50. data/test/resources/flat_file.yml +0 -20
  51. data/test/resources/hello_utf16be.txt +0 -0
  52. data/test/resources/hello_utf16le.txt +0 -0
  53. data/test/resources/hello_utf8.txt +0 -2
  54. data/test/resources/hello_windows.txt +0 -2
  55. data/test/resources/hello_world.doc +0 -0
  56. data/test/resources/hello_world.docx +0 -0
  57. data/test/resources/hello_world.pdf +0 -0
  58. data/test/resources/hello_world.txt +0 -2
  59. data/test/resources/high_ascii_delimited.txt +0 -2
  60. data/test/resources/high_ascii_delimited_example_two.txt +0 -3
  61. data/test/resources/malformed.csv +0 -3
  62. data/test/resources/malformed.xml +0 -6
  63. data/test/resources/malformed_pipe.csv +0 -3
  64. data/test/resources/normal.7z +0 -0
  65. data/test/resources/normal.csv +0 -3
  66. data/test/resources/normal.csv.zip +0 -0
  67. data/test/resources/normal_pipe.csv +0 -3
  68. data/test/resources/normal_thorn.csv +0 -3
  69. data/test/resources/not_a_pdf.pdf +0 -0
  70. data/test/resources/not_a_word_file.doc +0 -0
  71. data/test/resources/not_a_word_file.docx +0 -0
  72. data/test/resources/not_sign_delimited.txt +0 -3
  73. data/test/resources/password_protected_hello_world.docx +0 -0
  74. data/test/resources/password_protected_sample_xlsx.xlsx +0 -0
  75. data/test/resources/sample.xml +0 -34
  76. data/test/resources/sample_xls.xls +0 -0
  77. data/test/resources/sample_xlsx.xlsx +0 -0
  78. data/test/resources/sheet_streaming.xls +0 -0
  79. data/test/resources/sheet_streaming.xlsx +0 -0
  80. data/test/resources/standard_mappings.yml +0 -39
  81. data/test/resources/txt_file_xls_extension.xls +0 -1
  82. data/test/resources/txt_file_xlsx_extension.xlsx +0 -1
  83. data/test/resources/utf-16be_xml.xml +0 -0
  84. data/test/resources/utf-16be_xml_with_declaration.xml +0 -0
  85. data/test/resources/utf-16le_xml.xml +0 -0
  86. data/test/resources/utf-8_xml.xml +0 -9
  87. data/test/resources/windows-1252_xml.xml +0 -9
  88. data/test/resources/windows.csv +0 -5
  89. data/test/resources/xlsx_file_xls_extension.xls +0 -0
  90. data/test/standard_mappings_test.rb +0 -22
  91. data/test/table_test.rb +0 -545
  92. data/test/test_helper.rb +0 -35
  93. data/test/universal_importer_helper_test.rb +0 -86
  94. data/test/xml/table_test.rb +0 -90
@@ -1,119 +0,0 @@
1
- require 'test_helper'
2
-
3
- # This tests the NdrImport::PdfForm::Table mapping class
4
- module PdfForm
5
- class TableTest < ActiveSupport::TestCase
6
- def setup
7
- @form_data = [{
8
- 'address2' => 'Address 2',
9
- 'should_be_blank' => '',
10
- 'date_2' => '2018-12-01',
11
- 'date_1' => '2018-01-01',
12
- 'address1' => 'Address'
13
- }]
14
- end
15
-
16
- def test_should_transform_pdf_form_hash
17
- table = NdrImport::PdfForm::Table.new(klass: 'SomeTestKlass',
18
- columns: pdf_form_column_mapping)
19
-
20
- assert_equal 'acroform', table.format
21
-
22
- transformed_data = table.transform(@form_data)
23
- assert_equal 1, transformed_data.count
24
-
25
- expected_data = ['SomeTestKlass', { rawtext: {
26
- 'address1' => 'Address',
27
- 'address2' => 'Address 2',
28
- 'missing_from_data' => '',
29
- 'date_1' => '2018-01-01',
30
- 'date_2' => '2018-12-01',
31
- 'should_be_blank' => ''
32
- } }, 0]
33
-
34
- klass, fields, index = *transformed_data.first
35
- assert_equal expected_data[0], klass
36
- assert_equal expected_data[1], fields
37
- assert_equal expected_data[-1], index
38
- end
39
-
40
- def test_should_transform_mutli_klass_pdf_form_hash
41
- table = NdrImport::PdfForm::Table.new(columns: multi_klass_pdf_form_column_mapping)
42
-
43
- expected_data = [
44
- ['TestKlassOne',
45
- { rawtext:
46
- { 'address1' => 'Address',
47
- 'address2' => 'Address 2',
48
- 'missing_from_data' => '' } },
49
- 0],
50
- ['TestKlassTwo',
51
- { rawtext:
52
- { 'address1' => 'Address',
53
- 'date_1' => '2018-01-01',
54
- 'date_2' => '2018-12-01',
55
- 'should_be_blank' => '' } },
56
- 0]
57
- ]
58
-
59
- transformed_data = table.transform(@form_data).to_a
60
- assert_equal 2, transformed_data.count
61
-
62
- expected_data.each_with_index do |expected, index|
63
- transformed = transformed_data[index]
64
- assert_equal expected, transformed
65
- end
66
- end
67
-
68
- def test_should_fail_with_unmappped_form_data
69
- table = NdrImport::PdfForm::Table.new(klass: 'SomeTestKlass',
70
- columns: partial_pdf_form_column_mapping)
71
-
72
- exception = assert_raises(NdrImport::UnmappedDataError) { table.transform(@form_data).to_a }
73
- assert exception.message == 'Unmapped data: address2 and date_1'
74
- end
75
-
76
- def test_should_not_be_valid_with_bespoke_format
77
- exception = assert_raises(ArgumentError) { NdrImport::PdfForm::Table.new(format: 'a_format') }
78
- exception.message == 'Unrecognised options: ["format"]'
79
- end
80
-
81
- private
82
-
83
- def pdf_form_column_mapping
84
- [
85
- { 'column' => 'address1' },
86
- { 'column' => 'address2' },
87
- { 'column' => 'missing_from_data' },
88
- { 'column' => 'date_1' },
89
- { 'column' => 'date_2' },
90
- { 'column' => 'should_be_blank' }
91
- ]
92
- end
93
-
94
- def multi_klass_pdf_form_column_mapping
95
- [
96
- { 'column' => 'address1',
97
- 'klass' => %w[TestKlassOne TestKlassTwo] },
98
- { 'column' => 'address2',
99
- 'klass' => 'TestKlassOne' },
100
- { 'column' => 'missing_from_data',
101
- 'klass' => 'TestKlassOne' },
102
- { 'column' => 'date_1',
103
- 'klass' => 'TestKlassTwo' },
104
- { 'column' => 'date_2',
105
- 'klass' => 'TestKlassTwo' },
106
- { 'column' => 'should_be_blank',
107
- 'klass' => 'TestKlassTwo' }
108
- ]
109
- end
110
-
111
- def partial_pdf_form_column_mapping
112
- [
113
- { 'column' => 'address1' },
114
- { 'column' => 'date_2' },
115
- { 'column' => 'should_be_blank' }
116
- ]
117
- end
118
- end
119
- end
@@ -1,53 +0,0 @@
1
- # encoding: UTF-8
2
- require 'test_helper'
3
-
4
- # This tests the README page example
5
- class ReadmeTest < ActiveSupport::TestCase
6
- test 'readme example' do
7
- require 'ndr_import/non_tabular/table'
8
- require 'ndr_import/file/registry'
9
-
10
- unzip_path = SafePath.new('test_space_rw')
11
- source_file = SafePath.new('permanent_test_files').join('flat_file.pdf')
12
- options = { 'unzip_path' => unzip_path }
13
-
14
- table = NdrImport::NonTabular::Table.new(
15
- 'start_in_a_record' => false,
16
- 'end_in_a_record' => false,
17
- 'klass' => 'SomeTestKlass',
18
- 'start_line_pattern' => /\A------\z/,
19
- 'remove_lines' => { 'footer' => [/\A== Page \d+ of \d+ ==\z/i] },
20
- 'columns' => [
21
- {
22
- 'column' => 'one',
23
- 'non_tabular_cell' => { 'lines' => Range.new(0, -1, true), 'capture' => /^(.*)$/i }
24
- }
25
- ]
26
- )
27
-
28
- # Use the Registry to enumerate over the files and their tables
29
- files = NdrImport::File::Registry.files(source_file, options)
30
- files.each do |filename|
31
- tables = NdrImport::File::Registry.tables(filename, nil, options)
32
- tables.each do |_tablename, table_content|
33
- # Use the NonTabular::Table to tabulate the "table" contents
34
- table.transform(table_content).each do |_klass, _fields, _index|
35
- # Your code goes here
36
- end
37
-
38
- # Now we test the example
39
- results = []
40
- table.transform(table_content).each do |_klass, fields, _index|
41
- results << fields[:rawtext]['one']
42
- end
43
- assert table.is_a?(NdrImport::NonTabular::Table)
44
- assert_equal 4, results.count
45
- assert results.first.start_with?('1')
46
- assert results.last.start_with?('4')
47
- assert results.any? { |result| result =~ /This is captured/ }
48
- refute results.any? { |result| result =~ /This is never captured/ }
49
- refute results.any? { |result| result =~ /== Page/ }
50
- end
51
- end
52
- end
53
- end
Binary file
@@ -1,3 +0,0 @@
1
- A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z
2
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
3
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
@@ -1,3 +0,0 @@
1
- I,really,think
2
- "this","is "not" "
3
- going,to,work
@@ -1,26 +0,0 @@
1
- <% require 'tmpdir' %>
2
- # This allows us different filesystem paths for different platforms
3
- ---
4
- test_files: &test_files
5
- root: <%= Dir.mktmpdir %>
6
-
7
- ? !ruby/regexp /.*/
8
- :
9
- test_space_r:
10
- <<: *test_files
11
- prms:
12
- - r
13
- test_space_w:
14
- <<: *test_files
15
- prms:
16
- - w
17
- test_space_rw:
18
- <<: *test_files
19
- prms:
20
- - r
21
- - w
22
- permanent_test_files:
23
- root: <%= NdrImport.root + '/test/resources' %>
24
- prms:
25
- - r
26
- - w
Binary file
@@ -1,27 +0,0 @@
1
- 0
2
- This is never captured
3
- ------
4
- 1
5
- == Page 1 of 5 ==
6
- Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut
7
- labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris
8
- nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit
9
- esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt
10
- == Page 2 of 5 ==
11
- in culpa qui officia deserunt mollit anim id est laborum.
12
- ------
13
- 2
14
- This is captured
15
- Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
16
- == Page 3 of 5 ==
17
- ------
18
- 3
19
- Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
20
- ------
21
- == Page 4 of 5 ==
22
- 4
23
- Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
24
- ------
25
- -1
26
- == Page 5 of 5 ==
27
- This is never captured
@@ -1,20 +0,0 @@
1
- --- !ruby/object:NdrImport::NonTabular::Table
2
- # canonical_name: somename
3
- # filename_pattern:
4
- # format: pipe
5
- klass: SomeTestKlass
6
- start_line_pattern: !ruby/regexp /\A------\z/
7
- # end_line_pattern:
8
- start_in_a_record: false
9
- end_in_a_record: false
10
- remove_lines:
11
- footer:
12
- - !ruby/regexp /\A== Page \d+ of \d+ ==\z/i
13
- columns:
14
- - column: one
15
- non_tabular_cell:
16
- lines: !ruby/range
17
- begin: 0
18
- end: -1
19
- excl: true
20
- capture: !ruby/regexp /^(.*)$/i
@@ -1,2 +0,0 @@
1
- Hello world
2
- This is a thorny þ issue!
@@ -1,2 +0,0 @@
1
- Hello windows world
2
- This is a thorny � issue!
@@ -1,2 +0,0 @@
1
- Hello world,
2
- this is a text document
@@ -1,2 +0,0 @@
1
- 345465�1234567890�Dr Bob��BLOGGS�JOE�M�05 NOV 1990�1 noddy street, anytown�CB22 3AD�IP�234534654�42�25 Jun 2015�14253�WWJ�SMITH, John�SGH�AA�LAB���Made discvery|�Dr Josh|
2
- 345465�1234567890�Dr Bob��BLOGGS�JOE�M�05 NOV 1990�1 noddy street, anytown�CB22 3AD�IP�234534654�42�25 Jun 2015�14253�WWJ�SMITH, John�SGH�AA�LAB���Made discvery|�Dr Josh|
@@ -1,3 +0,0 @@
1
- A�B�C�D�E�F�G�H�I�J�K�L�M�N�O�P�Q�R�S�T�U�V�W�X�Y�Z
2
- 1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1
3
- 2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2
@@ -1,3 +0,0 @@
1
- A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z
2
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
3
- 2,2,2,2,2,2,2,2"malformed",2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
@@ -1,6 +0,0 @@
1
- <?xml version="1.0"?>
2
- <root>
3
- <note><![CDATA[
4
- This is  a note!
5
- ]]></note>
6
- </root>
@@ -1,3 +0,0 @@
1
- A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z
2
- 1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1
3
- 2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2"malformed"|2|2|2|2|2|2|2|2|2
Binary file
@@ -1,3 +0,0 @@
1
- A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z
2
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
3
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
@@ -1,3 +0,0 @@
1
- A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z
2
- 1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1|1
3
- 2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2|2
@@ -1,3 +0,0 @@
1
- A�B�C�D�E�F�G�H�I�J�K�L�M�N�O�P�Q�R�S�T�U�V�W�X�Y�Z
2
- 1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1�1
3
- 2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2�2
File without changes
File without changes
File without changes
@@ -1,3 +0,0 @@
1
- one¬two¬three
2
- some¬data¬here
3
- more¬data¬here
@@ -1,34 +0,0 @@
1
- <root>
2
- <record>
3
- <no_relative_path value="A value"/>
4
- <no_path_or_att>Another value</no_path_or_att>
5
- <demographics>
6
- <demographics_1>AAA</demographics_1>
7
- <address>
8
- <address_line1>Address</address_line1>
9
- <address_line1>Address 2</address_line1>
10
- </address>
11
- <demographics_2 code="03">Inner text</demographics_2>
12
- </demographics>
13
- <pathology>
14
- <pathology_date_1>2018-01-01</pathology_date_1>
15
- <pathology_date_2 />
16
- </pathology>
17
- </record>
18
- <record>
19
- <demographics>
20
- <address>
21
- <address_line1>Address</address_line1>
22
- <address_line1>Address 2</address_line1>
23
- </address>
24
- <demographics_2 code="03">Inner text</demographics_2>
25
- <demographics_1>AAA</demographics_1>
26
- </demographics>
27
- <no_path_or_att><![CDATA[Another value]]></no_path_or_att>
28
- <pathology>
29
- <pathology_date_1>2018-01-01</pathology_date_1>
30
- <pathology_date_2 />
31
- </pathology>
32
- <no_relative_path value="A value"/>
33
- </record>
34
- </root>
@@ -1,39 +0,0 @@
1
- ---
2
- surname:
3
- column: surname
4
- rawtext_name: surname
5
- mappings:
6
- - field: surname
7
- clean: :name
8
- previoussurname:
9
- column: previoussurname
10
- rawtext_name: previoussurname
11
- mappings:
12
- - field: previoussurname
13
- clean: :name
14
- forenames:
15
- column: forenames
16
- rawtext_name: forenames
17
- mappings:
18
- - field: forenames
19
- clean: :name
20
- sex:
21
- column: sex
22
- rawtext_name: sex
23
- mappings:
24
- - field: sex
25
- clean: :sex
26
- nhsnumber:
27
- column: nhsnumber
28
- rawtext_name: nhsnumber
29
- mappings:
30
- - field: nhsnumber
31
- clean: :nhsnumber
32
- postcode:
33
- column: postcode
34
- rawtext_name: postcode
35
- mappings:
36
- - field: postcode
37
- clean: :postcode
38
- test:
39
- column: standard_mapping_column_name
@@ -1 +0,0 @@
1
- Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
@@ -1 +0,0 @@
1
- Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.