ndr_import 8.5.0 → 8.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/Gemfile +0 -3
- data/README.md +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/code_safety.yml +27 -11
- data/exe/pdf_acro_form_to_yaml +23 -0
- data/exe/pdf_to_text +28 -0
- data/exe/word_to_text +26 -0
- data/gemfiles/Gemfile.rails52 +0 -3
- data/gemfiles/Gemfile.rails60 +5 -0
- data/lib/ndr_import/version.rb +1 -1
- data/ndr_import.gemspec +9 -7
- metadata +23 -164
- data/gemfiles/Gemfile.rails50 +0 -8
- data/gemfiles/Gemfile.rails51 +0 -9
- data/test/file/acro_form_test.rb +0 -39
- data/test/file/base_test.rb +0 -54
- data/test/file/delimited_test.rb +0 -233
- data/test/file/docx_test.rb +0 -53
- data/test/file/excel_test.rb +0 -124
- data/test/file/pdf_test.rb +0 -36
- data/test/file/registry_test.rb +0 -62
- data/test/file/seven_zip_test.rb +0 -59
- data/test/file/text_test.rb +0 -92
- data/test/file/word_test.rb +0 -35
- data/test/file/xml_test.rb +0 -21
- data/test/file/zip_test.rb +0 -47
- data/test/fixed_width/table_test.rb +0 -35
- data/test/helpers/file/delimited_test.rb +0 -105
- data/test/helpers/file/excel_test.rb +0 -82
- data/test/helpers/file/pdf_test.rb +0 -27
- data/test/helpers/file/word_test.rb +0 -26
- data/test/helpers/file/xml_test.rb +0 -131
- data/test/helpers/file/zip_test.rb +0 -75
- data/test/mapper_test.rb +0 -676
- data/test/non_tabular/mapping_test.rb +0 -36
- data/test/non_tabular/table_test.rb +0 -590
- data/test/non_tabular_file_helper_test.rb +0 -501
- data/test/pdf_form/table_test.rb +0 -119
- data/test/readme_test.rb +0 -53
- data/test/resources/acro_form.pdf +0 -0
- data/test/resources/blank_tab_test.xlsx +0 -0
- data/test/resources/bomd.csv +0 -3
- data/test/resources/broken.csv +0 -3
- data/test/resources/filesystem_paths.yml +0 -26
- data/test/resources/flat_file.pdf +0 -0
- data/test/resources/flat_file.txt +0 -27
- data/test/resources/flat_file.yml +0 -20
- data/test/resources/hello_utf16be.txt +0 -0
- data/test/resources/hello_utf16le.txt +0 -0
- data/test/resources/hello_utf8.txt +0 -2
- data/test/resources/hello_windows.txt +0 -2
- data/test/resources/hello_world.doc +0 -0
- data/test/resources/hello_world.docx +0 -0
- data/test/resources/hello_world.pdf +0 -0
- data/test/resources/hello_world.txt +0 -2
- data/test/resources/high_ascii_delimited.txt +0 -2
- data/test/resources/high_ascii_delimited_example_two.txt +0 -3
- data/test/resources/malformed.csv +0 -3
- data/test/resources/malformed.xml +0 -6
- data/test/resources/malformed_pipe.csv +0 -3
- data/test/resources/normal.7z +0 -0
- data/test/resources/normal.csv +0 -3
- data/test/resources/normal.csv.zip +0 -0
- data/test/resources/normal_pipe.csv +0 -3
- data/test/resources/normal_thorn.csv +0 -3
- data/test/resources/not_a_pdf.pdf +0 -0
- data/test/resources/not_a_word_file.doc +0 -0
- data/test/resources/not_a_word_file.docx +0 -0
- data/test/resources/not_sign_delimited.txt +0 -3
- data/test/resources/password_protected_hello_world.docx +0 -0
- data/test/resources/password_protected_sample_xlsx.xlsx +0 -0
- data/test/resources/sample.xml +0 -34
- data/test/resources/sample_xls.xls +0 -0
- data/test/resources/sample_xlsx.xlsx +0 -0
- data/test/resources/sheet_streaming.xls +0 -0
- data/test/resources/sheet_streaming.xlsx +0 -0
- data/test/resources/standard_mappings.yml +0 -39
- data/test/resources/txt_file_xls_extension.xls +0 -1
- data/test/resources/txt_file_xlsx_extension.xlsx +0 -1
- data/test/resources/utf-16be_xml.xml +0 -0
- data/test/resources/utf-16be_xml_with_declaration.xml +0 -0
- data/test/resources/utf-16le_xml.xml +0 -0
- data/test/resources/utf-8_xml.xml +0 -9
- data/test/resources/windows-1252_xml.xml +0 -9
- data/test/resources/windows.csv +0 -5
- data/test/resources/xlsx_file_xls_extension.xls +0 -0
- data/test/standard_mappings_test.rb +0 -22
- data/test/table_test.rb +0 -545
- data/test/test_helper.rb +0 -35
- data/test/universal_importer_helper_test.rb +0 -86
- data/test/xml/table_test.rb +0 -90
data/test/pdf_form/table_test.rb
DELETED
@@ -1,119 +0,0 @@
|
|
1
|
-
require 'test_helper'
|
2
|
-
|
3
|
-
# This tests the NdrImport::PdfForm::Table mapping class
|
4
|
-
module PdfForm
|
5
|
-
class TableTest < ActiveSupport::TestCase
|
6
|
-
def setup
|
7
|
-
@form_data = [{
|
8
|
-
'address2' => 'Address 2',
|
9
|
-
'should_be_blank' => '',
|
10
|
-
'date_2' => '2018-12-01',
|
11
|
-
'date_1' => '2018-01-01',
|
12
|
-
'address1' => 'Address'
|
13
|
-
}]
|
14
|
-
end
|
15
|
-
|
16
|
-
def test_should_transform_pdf_form_hash
|
17
|
-
table = NdrImport::PdfForm::Table.new(klass: 'SomeTestKlass',
|
18
|
-
columns: pdf_form_column_mapping)
|
19
|
-
|
20
|
-
assert_equal 'acroform', table.format
|
21
|
-
|
22
|
-
transformed_data = table.transform(@form_data)
|
23
|
-
assert_equal 1, transformed_data.count
|
24
|
-
|
25
|
-
expected_data = ['SomeTestKlass', { rawtext: {
|
26
|
-
'address1' => 'Address',
|
27
|
-
'address2' => 'Address 2',
|
28
|
-
'missing_from_data' => '',
|
29
|
-
'date_1' => '2018-01-01',
|
30
|
-
'date_2' => '2018-12-01',
|
31
|
-
'should_be_blank' => ''
|
32
|
-
} }, 0]
|
33
|
-
|
34
|
-
klass, fields, index = *transformed_data.first
|
35
|
-
assert_equal expected_data[0], klass
|
36
|
-
assert_equal expected_data[1], fields
|
37
|
-
assert_equal expected_data[-1], index
|
38
|
-
end
|
39
|
-
|
40
|
-
def test_should_transform_mutli_klass_pdf_form_hash
|
41
|
-
table = NdrImport::PdfForm::Table.new(columns: multi_klass_pdf_form_column_mapping)
|
42
|
-
|
43
|
-
expected_data = [
|
44
|
-
['TestKlassOne',
|
45
|
-
{ rawtext:
|
46
|
-
{ 'address1' => 'Address',
|
47
|
-
'address2' => 'Address 2',
|
48
|
-
'missing_from_data' => '' } },
|
49
|
-
0],
|
50
|
-
['TestKlassTwo',
|
51
|
-
{ rawtext:
|
52
|
-
{ 'address1' => 'Address',
|
53
|
-
'date_1' => '2018-01-01',
|
54
|
-
'date_2' => '2018-12-01',
|
55
|
-
'should_be_blank' => '' } },
|
56
|
-
0]
|
57
|
-
]
|
58
|
-
|
59
|
-
transformed_data = table.transform(@form_data).to_a
|
60
|
-
assert_equal 2, transformed_data.count
|
61
|
-
|
62
|
-
expected_data.each_with_index do |expected, index|
|
63
|
-
transformed = transformed_data[index]
|
64
|
-
assert_equal expected, transformed
|
65
|
-
end
|
66
|
-
end
|
67
|
-
|
68
|
-
def test_should_fail_with_unmappped_form_data
|
69
|
-
table = NdrImport::PdfForm::Table.new(klass: 'SomeTestKlass',
|
70
|
-
columns: partial_pdf_form_column_mapping)
|
71
|
-
|
72
|
-
exception = assert_raises(NdrImport::UnmappedDataError) { table.transform(@form_data).to_a }
|
73
|
-
assert exception.message == 'Unmapped data: address2 and date_1'
|
74
|
-
end
|
75
|
-
|
76
|
-
def test_should_not_be_valid_with_bespoke_format
|
77
|
-
exception = assert_raises(ArgumentError) { NdrImport::PdfForm::Table.new(format: 'a_format') }
|
78
|
-
exception.message == 'Unrecognised options: ["format"]'
|
79
|
-
end
|
80
|
-
|
81
|
-
private
|
82
|
-
|
83
|
-
def pdf_form_column_mapping
|
84
|
-
[
|
85
|
-
{ 'column' => 'address1' },
|
86
|
-
{ 'column' => 'address2' },
|
87
|
-
{ 'column' => 'missing_from_data' },
|
88
|
-
{ 'column' => 'date_1' },
|
89
|
-
{ 'column' => 'date_2' },
|
90
|
-
{ 'column' => 'should_be_blank' }
|
91
|
-
]
|
92
|
-
end
|
93
|
-
|
94
|
-
def multi_klass_pdf_form_column_mapping
|
95
|
-
[
|
96
|
-
{ 'column' => 'address1',
|
97
|
-
'klass' => %w[TestKlassOne TestKlassTwo] },
|
98
|
-
{ 'column' => 'address2',
|
99
|
-
'klass' => 'TestKlassOne' },
|
100
|
-
{ 'column' => 'missing_from_data',
|
101
|
-
'klass' => 'TestKlassOne' },
|
102
|
-
{ 'column' => 'date_1',
|
103
|
-
'klass' => 'TestKlassTwo' },
|
104
|
-
{ 'column' => 'date_2',
|
105
|
-
'klass' => 'TestKlassTwo' },
|
106
|
-
{ 'column' => 'should_be_blank',
|
107
|
-
'klass' => 'TestKlassTwo' }
|
108
|
-
]
|
109
|
-
end
|
110
|
-
|
111
|
-
def partial_pdf_form_column_mapping
|
112
|
-
[
|
113
|
-
{ 'column' => 'address1' },
|
114
|
-
{ 'column' => 'date_2' },
|
115
|
-
{ 'column' => 'should_be_blank' }
|
116
|
-
]
|
117
|
-
end
|
118
|
-
end
|
119
|
-
end
|
data/test/readme_test.rb
DELETED
@@ -1,53 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
require 'test_helper'
|
3
|
-
|
4
|
-
# This tests the README page example
|
5
|
-
class ReadmeTest < ActiveSupport::TestCase
|
6
|
-
test 'readme example' do
|
7
|
-
require 'ndr_import/non_tabular/table'
|
8
|
-
require 'ndr_import/file/registry'
|
9
|
-
|
10
|
-
unzip_path = SafePath.new('test_space_rw')
|
11
|
-
source_file = SafePath.new('permanent_test_files').join('flat_file.pdf')
|
12
|
-
options = { 'unzip_path' => unzip_path }
|
13
|
-
|
14
|
-
table = NdrImport::NonTabular::Table.new(
|
15
|
-
'start_in_a_record' => false,
|
16
|
-
'end_in_a_record' => false,
|
17
|
-
'klass' => 'SomeTestKlass',
|
18
|
-
'start_line_pattern' => /\A------\z/,
|
19
|
-
'remove_lines' => { 'footer' => [/\A== Page \d+ of \d+ ==\z/i] },
|
20
|
-
'columns' => [
|
21
|
-
{
|
22
|
-
'column' => 'one',
|
23
|
-
'non_tabular_cell' => { 'lines' => Range.new(0, -1, true), 'capture' => /^(.*)$/i }
|
24
|
-
}
|
25
|
-
]
|
26
|
-
)
|
27
|
-
|
28
|
-
# Use the Registry to enumerate over the files and their tables
|
29
|
-
files = NdrImport::File::Registry.files(source_file, options)
|
30
|
-
files.each do |filename|
|
31
|
-
tables = NdrImport::File::Registry.tables(filename, nil, options)
|
32
|
-
tables.each do |_tablename, table_content|
|
33
|
-
# Use the NonTabular::Table to tabulate the "table" contents
|
34
|
-
table.transform(table_content).each do |_klass, _fields, _index|
|
35
|
-
# Your code goes here
|
36
|
-
end
|
37
|
-
|
38
|
-
# Now we test the example
|
39
|
-
results = []
|
40
|
-
table.transform(table_content).each do |_klass, fields, _index|
|
41
|
-
results << fields[:rawtext]['one']
|
42
|
-
end
|
43
|
-
assert table.is_a?(NdrImport::NonTabular::Table)
|
44
|
-
assert_equal 4, results.count
|
45
|
-
assert results.first.start_with?('1')
|
46
|
-
assert results.last.start_with?('4')
|
47
|
-
assert results.any? { |result| result =~ /This is captured/ }
|
48
|
-
refute results.any? { |result| result =~ /This is never captured/ }
|
49
|
-
refute results.any? { |result| result =~ /== Page/ }
|
50
|
-
end
|
51
|
-
end
|
52
|
-
end
|
53
|
-
end
|
Binary file
|
Binary file
|
data/test/resources/bomd.csv
DELETED
data/test/resources/broken.csv
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
<% require 'tmpdir' %>
|
2
|
-
# This allows us different filesystem paths for different platforms
|
3
|
-
---
|
4
|
-
test_files: &test_files
|
5
|
-
root: <%= Dir.mktmpdir %>
|
6
|
-
|
7
|
-
? !ruby/regexp /.*/
|
8
|
-
:
|
9
|
-
test_space_r:
|
10
|
-
<<: *test_files
|
11
|
-
prms:
|
12
|
-
- r
|
13
|
-
test_space_w:
|
14
|
-
<<: *test_files
|
15
|
-
prms:
|
16
|
-
- w
|
17
|
-
test_space_rw:
|
18
|
-
<<: *test_files
|
19
|
-
prms:
|
20
|
-
- r
|
21
|
-
- w
|
22
|
-
permanent_test_files:
|
23
|
-
root: <%= NdrImport.root + '/test/resources' %>
|
24
|
-
prms:
|
25
|
-
- r
|
26
|
-
- w
|
Binary file
|
@@ -1,27 +0,0 @@
|
|
1
|
-
0
|
2
|
-
This is never captured
|
3
|
-
------
|
4
|
-
1
|
5
|
-
== Page 1 of 5 ==
|
6
|
-
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut
|
7
|
-
labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris
|
8
|
-
nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit
|
9
|
-
esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt
|
10
|
-
== Page 2 of 5 ==
|
11
|
-
in culpa qui officia deserunt mollit anim id est laborum.
|
12
|
-
------
|
13
|
-
2
|
14
|
-
This is captured
|
15
|
-
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
|
16
|
-
== Page 3 of 5 ==
|
17
|
-
------
|
18
|
-
3
|
19
|
-
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
|
20
|
-
------
|
21
|
-
== Page 4 of 5 ==
|
22
|
-
4
|
23
|
-
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
24
|
-
------
|
25
|
-
-1
|
26
|
-
== Page 5 of 5 ==
|
27
|
-
This is never captured
|
@@ -1,20 +0,0 @@
|
|
1
|
-
--- !ruby/object:NdrImport::NonTabular::Table
|
2
|
-
# canonical_name: somename
|
3
|
-
# filename_pattern:
|
4
|
-
# format: pipe
|
5
|
-
klass: SomeTestKlass
|
6
|
-
start_line_pattern: !ruby/regexp /\A------\z/
|
7
|
-
# end_line_pattern:
|
8
|
-
start_in_a_record: false
|
9
|
-
end_in_a_record: false
|
10
|
-
remove_lines:
|
11
|
-
footer:
|
12
|
-
- !ruby/regexp /\A== Page \d+ of \d+ ==\z/i
|
13
|
-
columns:
|
14
|
-
- column: one
|
15
|
-
non_tabular_cell:
|
16
|
-
lines: !ruby/range
|
17
|
-
begin: 0
|
18
|
-
end: -1
|
19
|
-
excl: true
|
20
|
-
capture: !ruby/regexp /^(.*)$/i
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -1,2 +0,0 @@
|
|
1
|
-
345465�1234567890�Dr Bob��BLOGGS�JOE�M�05 NOV 1990�1 noddy street, anytown�CB22 3AD�IP�234534654�42�25 Jun 2015�14253�WWJ�SMITH, John�SGH�AA�LAB���Made discvery|�Dr Josh|
|
2
|
-
345465�1234567890�Dr Bob��BLOGGS�JOE�M�05 NOV 1990�1 noddy street, anytown�CB22 3AD�IP�234534654�42�25 Jun 2015�14253�WWJ�SMITH, John�SGH�AA�LAB���Made discvery|�Dr Josh|
|
data/test/resources/normal.7z
DELETED
Binary file
|
data/test/resources/normal.csv
DELETED
Binary file
|
File without changes
|
File without changes
|
File without changes
|
Binary file
|
Binary file
|
data/test/resources/sample.xml
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
<root>
|
2
|
-
<record>
|
3
|
-
<no_relative_path value="A value"/>
|
4
|
-
<no_path_or_att>Another value</no_path_or_att>
|
5
|
-
<demographics>
|
6
|
-
<demographics_1>AAA</demographics_1>
|
7
|
-
<address>
|
8
|
-
<address_line1>Address</address_line1>
|
9
|
-
<address_line1>Address 2</address_line1>
|
10
|
-
</address>
|
11
|
-
<demographics_2 code="03">Inner text</demographics_2>
|
12
|
-
</demographics>
|
13
|
-
<pathology>
|
14
|
-
<pathology_date_1>2018-01-01</pathology_date_1>
|
15
|
-
<pathology_date_2 />
|
16
|
-
</pathology>
|
17
|
-
</record>
|
18
|
-
<record>
|
19
|
-
<demographics>
|
20
|
-
<address>
|
21
|
-
<address_line1>Address</address_line1>
|
22
|
-
<address_line1>Address 2</address_line1>
|
23
|
-
</address>
|
24
|
-
<demographics_2 code="03">Inner text</demographics_2>
|
25
|
-
<demographics_1>AAA</demographics_1>
|
26
|
-
</demographics>
|
27
|
-
<no_path_or_att><![CDATA[Another value]]></no_path_or_att>
|
28
|
-
<pathology>
|
29
|
-
<pathology_date_1>2018-01-01</pathology_date_1>
|
30
|
-
<pathology_date_2 />
|
31
|
-
</pathology>
|
32
|
-
<no_relative_path value="A value"/>
|
33
|
-
</record>
|
34
|
-
</root>
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -1,39 +0,0 @@
|
|
1
|
-
---
|
2
|
-
surname:
|
3
|
-
column: surname
|
4
|
-
rawtext_name: surname
|
5
|
-
mappings:
|
6
|
-
- field: surname
|
7
|
-
clean: :name
|
8
|
-
previoussurname:
|
9
|
-
column: previoussurname
|
10
|
-
rawtext_name: previoussurname
|
11
|
-
mappings:
|
12
|
-
- field: previoussurname
|
13
|
-
clean: :name
|
14
|
-
forenames:
|
15
|
-
column: forenames
|
16
|
-
rawtext_name: forenames
|
17
|
-
mappings:
|
18
|
-
- field: forenames
|
19
|
-
clean: :name
|
20
|
-
sex:
|
21
|
-
column: sex
|
22
|
-
rawtext_name: sex
|
23
|
-
mappings:
|
24
|
-
- field: sex
|
25
|
-
clean: :sex
|
26
|
-
nhsnumber:
|
27
|
-
column: nhsnumber
|
28
|
-
rawtext_name: nhsnumber
|
29
|
-
mappings:
|
30
|
-
- field: nhsnumber
|
31
|
-
clean: :nhsnumber
|
32
|
-
postcode:
|
33
|
-
column: postcode
|
34
|
-
rawtext_name: postcode
|
35
|
-
mappings:
|
36
|
-
- field: postcode
|
37
|
-
clean: :postcode
|
38
|
-
test:
|
39
|
-
column: standard_mapping_column_name
|
@@ -1 +0,0 @@
|
|
1
|
-
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
@@ -1 +0,0 @@
|
|
1
|
-
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
Binary file
|
Binary file
|
Binary file
|