ndr_import 8.5.0 → 8.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +7 -0
  3. data/Gemfile +0 -3
  4. data/README.md +6 -0
  5. data/bin/console +14 -0
  6. data/bin/setup +8 -0
  7. data/code_safety.yml +27 -11
  8. data/exe/pdf_acro_form_to_yaml +23 -0
  9. data/exe/pdf_to_text +28 -0
  10. data/exe/word_to_text +26 -0
  11. data/gemfiles/Gemfile.rails52 +0 -3
  12. data/gemfiles/Gemfile.rails60 +5 -0
  13. data/lib/ndr_import/version.rb +1 -1
  14. data/ndr_import.gemspec +9 -7
  15. metadata +23 -164
  16. data/gemfiles/Gemfile.rails50 +0 -8
  17. data/gemfiles/Gemfile.rails51 +0 -9
  18. data/test/file/acro_form_test.rb +0 -39
  19. data/test/file/base_test.rb +0 -54
  20. data/test/file/delimited_test.rb +0 -233
  21. data/test/file/docx_test.rb +0 -53
  22. data/test/file/excel_test.rb +0 -124
  23. data/test/file/pdf_test.rb +0 -36
  24. data/test/file/registry_test.rb +0 -62
  25. data/test/file/seven_zip_test.rb +0 -59
  26. data/test/file/text_test.rb +0 -92
  27. data/test/file/word_test.rb +0 -35
  28. data/test/file/xml_test.rb +0 -21
  29. data/test/file/zip_test.rb +0 -47
  30. data/test/fixed_width/table_test.rb +0 -35
  31. data/test/helpers/file/delimited_test.rb +0 -105
  32. data/test/helpers/file/excel_test.rb +0 -82
  33. data/test/helpers/file/pdf_test.rb +0 -27
  34. data/test/helpers/file/word_test.rb +0 -26
  35. data/test/helpers/file/xml_test.rb +0 -131
  36. data/test/helpers/file/zip_test.rb +0 -75
  37. data/test/mapper_test.rb +0 -676
  38. data/test/non_tabular/mapping_test.rb +0 -36
  39. data/test/non_tabular/table_test.rb +0 -590
  40. data/test/non_tabular_file_helper_test.rb +0 -501
  41. data/test/pdf_form/table_test.rb +0 -119
  42. data/test/readme_test.rb +0 -53
  43. data/test/resources/acro_form.pdf +0 -0
  44. data/test/resources/blank_tab_test.xlsx +0 -0
  45. data/test/resources/bomd.csv +0 -3
  46. data/test/resources/broken.csv +0 -3
  47. data/test/resources/filesystem_paths.yml +0 -26
  48. data/test/resources/flat_file.pdf +0 -0
  49. data/test/resources/flat_file.txt +0 -27
  50. data/test/resources/flat_file.yml +0 -20
  51. data/test/resources/hello_utf16be.txt +0 -0
  52. data/test/resources/hello_utf16le.txt +0 -0
  53. data/test/resources/hello_utf8.txt +0 -2
  54. data/test/resources/hello_windows.txt +0 -2
  55. data/test/resources/hello_world.doc +0 -0
  56. data/test/resources/hello_world.docx +0 -0
  57. data/test/resources/hello_world.pdf +0 -0
  58. data/test/resources/hello_world.txt +0 -2
  59. data/test/resources/high_ascii_delimited.txt +0 -2
  60. data/test/resources/high_ascii_delimited_example_two.txt +0 -3
  61. data/test/resources/malformed.csv +0 -3
  62. data/test/resources/malformed.xml +0 -6
  63. data/test/resources/malformed_pipe.csv +0 -3
  64. data/test/resources/normal.7z +0 -0
  65. data/test/resources/normal.csv +0 -3
  66. data/test/resources/normal.csv.zip +0 -0
  67. data/test/resources/normal_pipe.csv +0 -3
  68. data/test/resources/normal_thorn.csv +0 -3
  69. data/test/resources/not_a_pdf.pdf +0 -0
  70. data/test/resources/not_a_word_file.doc +0 -0
  71. data/test/resources/not_a_word_file.docx +0 -0
  72. data/test/resources/not_sign_delimited.txt +0 -3
  73. data/test/resources/password_protected_hello_world.docx +0 -0
  74. data/test/resources/password_protected_sample_xlsx.xlsx +0 -0
  75. data/test/resources/sample.xml +0 -34
  76. data/test/resources/sample_xls.xls +0 -0
  77. data/test/resources/sample_xlsx.xlsx +0 -0
  78. data/test/resources/sheet_streaming.xls +0 -0
  79. data/test/resources/sheet_streaming.xlsx +0 -0
  80. data/test/resources/standard_mappings.yml +0 -39
  81. data/test/resources/txt_file_xls_extension.xls +0 -1
  82. data/test/resources/txt_file_xlsx_extension.xlsx +0 -1
  83. data/test/resources/utf-16be_xml.xml +0 -0
  84. data/test/resources/utf-16be_xml_with_declaration.xml +0 -0
  85. data/test/resources/utf-16le_xml.xml +0 -0
  86. data/test/resources/utf-8_xml.xml +0 -9
  87. data/test/resources/windows-1252_xml.xml +0 -9
  88. data/test/resources/windows.csv +0 -5
  89. data/test/resources/xlsx_file_xls_extension.xls +0 -0
  90. data/test/standard_mappings_test.rb +0 -22
  91. data/test/table_test.rb +0 -545
  92. data/test/test_helper.rb +0 -35
  93. data/test/universal_importer_helper_test.rb +0 -86
  94. data/test/xml/table_test.rb +0 -90
@@ -1,501 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- require 'test_helper'
4
-
5
- # Test non tabular mapper class that expose private method(s) for testing
6
- class NonTabularTestMapper
7
- # include NdrImport::Mapper
8
- include NdrImport::NonTabularFileHelper
9
-
10
- attr_accessor :mappings
11
-
12
- public :read_non_tabular_string
13
- end
14
-
15
- # This tests the NonTabularFileHelper class
16
- class NonTabularFileHelperTest < ActiveSupport::TestCase
17
- simple_divider_example = <<-STR
18
- 111
19
- Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt.
20
- ------
21
- 222
22
- Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo.
23
- ------
24
- 333
25
- Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla.
26
- ------
27
- 444
28
- Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim.
29
- STR
30
-
31
- test 'should raise error with no non_tabular_row' do
32
- mapper = NonTabularTestMapper.new
33
- mapper.mappings = YAML.load <<-YML
34
- columns:
35
- - column: one
36
- YML
37
- assert_raise NdrImport::MappingError do
38
- mapper.read_non_tabular_string(simple_divider_example)
39
- end
40
- end
41
-
42
- test 'should raise error with no non_tabular_row start_line_pattern' do
43
- mapper = NonTabularTestMapper.new
44
- mapper.mappings = YAML.load <<-YML
45
- non_tabular_row:
46
- columns:
47
- - column: one
48
- YML
49
- assert_raise NdrImport::MappingError do
50
- mapper.read_non_tabular_string(simple_divider_example)
51
- end
52
-
53
- mapper = NonTabularTestMapper.new
54
- mapper.mappings = YAML.load <<-YML
55
- non_tabular_row:
56
- start_line_pattern:
57
- columns:
58
- - column: one
59
- YML
60
- assert_raise NdrImport::MappingError do
61
- mapper.read_non_tabular_string(simple_divider_example)
62
- end
63
- end
64
-
65
- test 'should raise error with no column non_tabular_cell' do
66
- mapper = NonTabularTestMapper.new
67
- mapper.mappings = YAML.load <<-YML
68
- non_tabular_row:
69
- start_line_pattern: !ruby/regexp /^-{6}$/
70
- columns:
71
- - column: one
72
- YML
73
- assert_raise NdrImport::MappingError do
74
- mapper.read_non_tabular_string(simple_divider_example)
75
- end
76
- end
77
-
78
- test 'should raise error with no column non_tabular_cell lines' do
79
- mapper = NonTabularTestMapper.new
80
- mapper.mappings = YAML.load <<-YML
81
- non_tabular_row:
82
- start_line_pattern: !ruby/regexp /^-{6}$/
83
- columns:
84
- - column: one
85
- non_tabular_cell:
86
- YML
87
- assert_raise NdrImport::MappingError do
88
- mapper.read_non_tabular_string(simple_divider_example)
89
- end
90
-
91
- mapper = NonTabularTestMapper.new
92
- mapper.mappings = YAML.load <<-YML
93
- non_tabular_row:
94
- start_line_pattern: !ruby/regexp /^-{6}$/
95
- columns:
96
- - column: one
97
- non_tabular_cell:
98
- lines:
99
- YML
100
- assert_raise NdrImport::MappingError do
101
- mapper.read_non_tabular_string(simple_divider_example)
102
- end
103
- end
104
-
105
- test 'should raise error with no column non_tabular_cell capture' do
106
- mapper = NonTabularTestMapper.new
107
- mapper.mappings = YAML.load <<-YML
108
- non_tabular_row:
109
- start_line_pattern: !ruby/regexp /^-{6}$/
110
- columns:
111
- - column: one
112
- non_tabular_cell:
113
- lines: !ruby/range
114
- begin: 0
115
- end: -1
116
- excl: false
117
- YML
118
- assert_raise NdrImport::MappingError do
119
- mapper.read_non_tabular_string(simple_divider_example)
120
- end
121
-
122
- mapper = NonTabularTestMapper.new
123
- mapper.mappings = YAML.load <<-YML
124
- non_tabular_row:
125
- start_line_pattern: !ruby/regexp /^-{6}$/
126
- columns:
127
- - column: one
128
- non_tabular_cell:
129
- lines: !ruby/range
130
- begin: 0
131
- end: -1
132
- excl: false
133
- capture:
134
- YML
135
- assert_raise NdrImport::MappingError do
136
- mapper.read_non_tabular_string(simple_divider_example)
137
- end
138
- end
139
-
140
- test 'should only return two results with no start_in_a_record or end_in_a_record' do
141
- mapper = NonTabularTestMapper.new
142
- mapper.mappings = YAML.load <<-YML
143
- non_tabular_row:
144
- start_line_pattern: !ruby/regexp /^-{6}$/
145
- columns:
146
- - column: one
147
- non_tabular_cell:
148
- lines: !ruby/range
149
- begin: 0
150
- end: -1
151
- excl: false
152
- capture: !ruby/regexp /^(.*)$/i
153
- YML
154
- results = mapper.read_non_tabular_string(simple_divider_example)
155
- assert_equal 2, results.count
156
- assert results.first[0].start_with?('222')
157
- assert results.last[0].start_with?('333')
158
- end
159
-
160
- test 'should return three results with start_in_a_record' do
161
- mapper = NonTabularTestMapper.new
162
- mapper.mappings = YAML.load <<-YML
163
- non_tabular_row:
164
- start_line_pattern: !ruby/regexp /^-{6}$/
165
- start_in_a_record: true
166
- columns:
167
- - column: one
168
- non_tabular_cell:
169
- lines: !ruby/range
170
- begin: 0
171
- end: -1
172
- excl: false
173
- capture: !ruby/regexp /^(.*)$/i
174
- YML
175
- results = mapper.read_non_tabular_string(simple_divider_example)
176
- assert_equal 3, results.count
177
- assert results.first[0].start_with?('111')
178
- assert results.last[0].start_with?('333')
179
- end
180
-
181
- test 'should return three results with end_in_a_record' do
182
- mapper = NonTabularTestMapper.new
183
- mapper.mappings = YAML.load <<-YML
184
- non_tabular_row:
185
- start_line_pattern: !ruby/regexp /^-{6}$/
186
- end_in_a_record: true
187
- columns:
188
- - column: one
189
- non_tabular_cell:
190
- lines: !ruby/range
191
- begin: 0
192
- end: -1
193
- excl: false
194
- capture: !ruby/regexp /^(.*)$/i
195
- YML
196
- results = mapper.read_non_tabular_string(simple_divider_example)
197
- assert_equal 3, results.count
198
- assert results.first[0].start_with?('222')
199
- assert results.last[0].start_with?('444')
200
- end
201
-
202
- test 'should return four results with start_in_a_record and end_in_a_record' do
203
- mapper = NonTabularTestMapper.new
204
- mapper.mappings = YAML.load <<-YML
205
- non_tabular_row:
206
- start_line_pattern: !ruby/regexp /^-{6}$/
207
- start_in_a_record: true
208
- end_in_a_record: true
209
- columns:
210
- - column: one
211
- non_tabular_cell:
212
- lines: !ruby/range
213
- begin: 0
214
- end: -1
215
- excl: false
216
- capture: !ruby/regexp /^(.*)$/i
217
- YML
218
- results = mapper.read_non_tabular_string(simple_divider_example)
219
- assert_equal 4, results.count
220
- assert results.first[0].start_with?('111')
221
- assert results.last[0].start_with?('444')
222
- end
223
-
224
- no_divider_example = <<-STR
225
- 111
226
- Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt.
227
- STR
228
-
229
- test 'should return one results with start_in_a_record and end_in_a_record' do
230
- mapper = NonTabularTestMapper.new
231
- mapper.mappings = YAML.load <<-YML
232
- non_tabular_row:
233
- start_line_pattern: !ruby/regexp /^-{6}$/
234
- start_in_a_record: true
235
- end_in_a_record: true
236
- columns:
237
- - column: one
238
- non_tabular_cell:
239
- lines: !ruby/range
240
- begin: 0
241
- end: -1
242
- excl: false
243
- capture: !ruby/regexp /^(.*)$/i
244
- YML
245
- results = mapper.read_non_tabular_string(no_divider_example)
246
- assert_equal 1, results.count
247
- assert results.first[0].start_with?('111')
248
- end
249
-
250
- simple_start_and_end_divider_example = <<-STR
251
- ----- START -----
252
- 111
253
- Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt.
254
- ------ END ------
255
- This is never captured
256
- ----- START -----
257
- 222
258
- Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo.
259
- ------ END ------
260
- This is never captured
261
- ----- START -----
262
- 333
263
- Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla.
264
- ------ END ------
265
- This is never captured
266
- ----- START -----
267
- 444
268
- This is captured
269
- Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim.
270
- ------ END ------
271
- STR
272
-
273
- test 'should return four results with start and end dividers' do
274
- mapper = NonTabularTestMapper.new
275
- mapper.mappings = YAML.load <<-YML
276
- non_tabular_row:
277
- start_line_pattern: !ruby/regexp /^----- START -----$/
278
- end_line_pattern: !ruby/regexp /^------ END ------$/
279
- columns:
280
- - column: one
281
- non_tabular_cell:
282
- lines: !ruby/range
283
- begin: 0
284
- end: -1
285
- excl: false
286
- capture: !ruby/regexp /^(.*)$/i
287
- YML
288
- results = mapper.read_non_tabular_string(simple_start_and_end_divider_example)
289
- assert_equal 4, results.count
290
- assert results.first[0].start_with?('111')
291
- assert results.last[0].start_with?('444')
292
-
293
- assert results.flatten.any? { |result| result =~ /This is captured/ }
294
- refute results.flatten.any? { |result| result =~ /This is never captured/ }
295
- end
296
-
297
- test 'documentation example' do
298
- mapper = NonTabularTestMapper.new
299
- mapper.mappings = YAML.load <<-YML
300
- non_tabular_row:
301
- start_line_pattern: !ruby/regexp /^D\\|/
302
- capture_start_line: true
303
- end_in_a_record: true
304
- columns:
305
- - standard_mapping: nhsnumber
306
- non_tabular_cell:
307
- lines: 0
308
- capture:
309
- - !ruby/regexp /^D\\|([^|]*).*/
310
- - column: fulltextreport
311
- non_tabular_cell:
312
- lines: !ruby/range
313
- begin: 1
314
- end: -1
315
- excl: false
316
- capture: !ruby/regexp /^(?:R|\\d+)\\|(.*)$/i
317
- join: "\\n"
318
- YML
319
- documentation_example = [
320
- 'D|1111111111|...',
321
- 'R|This is a',
322
- '1|multiline report'
323
- ].join("\n")
324
-
325
- results = mapper.read_non_tabular_string(documentation_example)
326
- assert_equal 1, results.count
327
- result = results.first
328
- assert_equal '1111111111', result[0]
329
- assert_equal "This is a\nmultiline report", result[1]
330
- end
331
-
332
- test 'should capture' do
333
- mapper = NonTabularTestMapper.new
334
- mapper.mappings = YAML.load <<-YML
335
- non_tabular_row:
336
- start_line_pattern: !ruby/regexp /^-{6}$/
337
- columns:
338
- - standard_mapping: nhsnumber
339
- non_tabular_cell:
340
- lines: 0
341
- capture: !ruby/regexp /^(\\d*)$/i
342
- - column: address
343
- non_tabular_cell:
344
- lines: !ruby/range
345
- begin: 1
346
- end: 5
347
- excl: false
348
- capture: !ruby/regexp /^.{50}(.*)$/i
349
- join: ", "
350
- - standard_mapping: postcode
351
- non_tabular_cell:
352
- lines: 6
353
- capture: !ruby/regexp /^.{50}(.*)$/i
354
- - column: capture_inclusive
355
- non_tabular_cell:
356
- lines: !ruby/object:RegexpRange
357
- begin: !ruby/regexp /^CAPTURE INCLUSIVE$/
358
- end: !ruby/regexp /^Capture me.$/i
359
- excl: false
360
- capture: !ruby/regexp /^(.*)$/i
361
- join: "\\n"
362
- - column: capture_exclusive
363
- non_tabular_cell:
364
- lines: !ruby/object:RegexpRange
365
- begin: !ruby/regexp /^CAPTURE EXCLUSIVE$/
366
- end: !ruby/regexp /^Do NOT capture me.$/i
367
- excl: true
368
- capture: !ruby/regexp /^(.*)$/i
369
- join: "\\n"
370
- - column: capture_to_end
371
- non_tabular_cell:
372
- lines: !ruby/object:RegexpRange
373
- begin: !ruby/regexp /^CAPTURE TO END$/
374
- end: -1
375
- excl: false
376
- capture: !ruby/regexp /^(.*)$/i
377
- join: "\\n"
378
- YML
379
- capture_example = <<-STR
380
- This is never captured
381
- ------
382
- 1111111111
383
- <----------------- 50 characters ---------------->Unit C, Magog Court
384
- Shelford Bottom
385
- Hinton Way
386
- Cambridge
387
-
388
- CB22 3AD
389
-
390
- CAPTURE INCLUSIVE
391
- Lorem ipsum dolor sit amet,
392
- consectetur adipisicing elit,
393
- Capture me.
394
-
395
- CAPTURE EXCLUSIVE
396
- Ut enim ad minim veniam, quis nostrud exercitation.
397
- Do NOT capture me.
398
-
399
- CAPTURE TO END
400
- Lorem ipsum dolor sit amet, consectetur adipisicing elit.
401
- Ut enim ad minim veniam, quis nostrud exercitation ullamco.
402
- Duis aute irure dolor in reprehenderit in voluptate velit.
403
- Excepteur sint occaecat cupidatat non proident, sunt in culpa.
404
- ------
405
- This is never captured
406
- STR
407
- results = mapper.read_non_tabular_string(capture_example)
408
- assert_equal 1, results.count
409
- result = results.first
410
- assert_equal '1111111111', result[0]
411
- assert_equal 'Unit C, Magog Court, Shelford Bottom, Hinton Way, Cambridge', result[1]
412
- assert_equal 'CB22 3AD', result[2]
413
- assert_equal "CAPTURE INCLUSIVE\nLorem ipsum dolor sit amet,\n" \
414
- "consectetur adipisicing elit,\nCapture me.",
415
- result[3]
416
- assert_equal "CAPTURE EXCLUSIVE\nUt enim ad minim veniam, quis nostrud exercitation.",
417
- result[4]
418
- assert_equal "CAPTURE TO END\n" \
419
- "Lorem ipsum dolor sit amet, consectetur adipisicing elit.\n" \
420
- "Ut enim ad minim veniam, quis nostrud exercitation ullamco.\n" \
421
- "Duis aute irure dolor in reprehenderit in voluptate velit.\n" \
422
- 'Excepteur sint occaecat cupidatat non proident, sunt in culpa.',
423
- result[5]
424
-
425
- assert_equal 25, mapper.non_tabular_lines.last.absolute_line_number
426
- end
427
-
428
- test 'handles non utf8 characters' do
429
- mixed_encoding_example = <<-STR
430
- 111
431
- Lorem ipsum dolor sit amet.
432
- ------
433
- 111
434
- Lorem ipsum dolor\xBE sit amet.
435
- ------
436
- 111
437
- Lorem ipsum dolor sit amet.
438
- ------
439
- STR
440
-
441
- mapper = NonTabularTestMapper.new
442
- mapper.mappings = YAML.load <<-YML
443
- non_tabular_row:
444
- start_line_pattern: !ruby/regexp /^111$/
445
- end_in_a_record: true
446
- columns:
447
- - column: one
448
- non_tabular_cell:
449
- lines: !ruby/range
450
- begin: 0
451
- end: -1
452
- excl: true
453
- capture: !ruby/regexp /^(.*)$/i
454
- YML
455
-
456
- results = mapper.read_non_tabular_string(mixed_encoding_example)
457
-
458
- assert_equal 3, results.count, 'records were lost'
459
-
460
- assert_equal [27, 28, 27], results.map { |row| row.first.chars.to_a.length }
461
- assert_equal [27, 29, 27], results.map { |row| row.first.bytes.to_a.length }
462
-
463
- results.each do |row|
464
- assert row.first.valid_encoding?
465
- assert_equal Encoding.find('UTF-8'), row.first.encoding
466
- end
467
- end
468
-
469
- test 'should not allow junk bytes' do
470
- junk = <<-STR
471
- 111
472
- Lorem ipsum dolor sit amet.
473
- ------
474
- 111
475
- Lorem ipsum dolor\x8D sit amet.
476
- ------
477
- 111
478
- Lorem ipsum dolor sit amet.
479
- ------
480
- STR
481
-
482
- mapper = NonTabularTestMapper.new
483
- mapper.mappings = YAML.load <<-YML
484
- non_tabular_row:
485
- start_line_pattern: !ruby/regexp /^111$/
486
- end_in_a_record: true
487
- columns:
488
- - column: one
489
- non_tabular_cell:
490
- lines: !ruby/range
491
- begin: 0
492
- end: -1
493
- excl: true
494
- capture: !ruby/regexp /^(.*)$/i
495
- YML
496
-
497
- assert_raises(UTF8Encoding::UTF8CoercionError) do
498
- mapper.read_non_tabular_string(junk)
499
- end
500
- end
501
- end