ndr_import 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. checksums.yaml +15 -0
  2. data/.gitignore +14 -0
  3. data/.rubocop.yml +27 -0
  4. data/.ruby-version +1 -0
  5. data/.travis.yml +22 -0
  6. data/CODE_OF_CONDUCT.md +13 -0
  7. data/Gemfile +4 -0
  8. data/Guardfile +16 -0
  9. data/LICENSE.txt +21 -0
  10. data/README.md +69 -0
  11. data/Rakefile +13 -0
  12. data/code_safety.yml +374 -0
  13. data/gemfiles/Gemfile.rails32 +5 -0
  14. data/gemfiles/Gemfile.rails32.lock +142 -0
  15. data/gemfiles/Gemfile.rails41 +5 -0
  16. data/gemfiles/Gemfile.rails41.lock +145 -0
  17. data/gemfiles/Gemfile.rails42 +5 -0
  18. data/gemfiles/Gemfile.rails42.lock +145 -0
  19. data/lib/ndr_import.rb +13 -0
  20. data/lib/ndr_import/csv_library.rb +40 -0
  21. data/lib/ndr_import/file/all.rb +8 -0
  22. data/lib/ndr_import/file/base.rb +76 -0
  23. data/lib/ndr_import/file/delimited.rb +86 -0
  24. data/lib/ndr_import/file/excel.rb +131 -0
  25. data/lib/ndr_import/file/pdf.rb +38 -0
  26. data/lib/ndr_import/file/registry.rb +50 -0
  27. data/lib/ndr_import/file/text.rb +52 -0
  28. data/lib/ndr_import/file/word.rb +30 -0
  29. data/lib/ndr_import/file/zip.rb +67 -0
  30. data/lib/ndr_import/helpers/file/delimited.rb +105 -0
  31. data/lib/ndr_import/helpers/file/excel.rb +181 -0
  32. data/lib/ndr_import/helpers/file/pdf.rb +29 -0
  33. data/lib/ndr_import/helpers/file/word.rb +27 -0
  34. data/lib/ndr_import/helpers/file/xml.rb +45 -0
  35. data/lib/ndr_import/helpers/file/zip.rb +44 -0
  36. data/lib/ndr_import/mapper.rb +220 -0
  37. data/lib/ndr_import/mapping_error.rb +5 -0
  38. data/lib/ndr_import/non_tabular/column_mapping.rb +73 -0
  39. data/lib/ndr_import/non_tabular/line.rb +46 -0
  40. data/lib/ndr_import/non_tabular/mapping.rb +35 -0
  41. data/lib/ndr_import/non_tabular/record.rb +99 -0
  42. data/lib/ndr_import/non_tabular/table.rb +193 -0
  43. data/lib/ndr_import/non_tabular_file_helper.rb +160 -0
  44. data/lib/ndr_import/standard_mappings.rb +23 -0
  45. data/lib/ndr_import/table.rb +179 -0
  46. data/lib/ndr_import/version.rb +4 -0
  47. data/ndr_import.gemspec +44 -0
  48. data/test/file/base_test.rb +54 -0
  49. data/test/file/delimited_test.rb +143 -0
  50. data/test/file/excel_test.rb +85 -0
  51. data/test/file/pdf_test.rb +35 -0
  52. data/test/file/registry_test.rb +60 -0
  53. data/test/file/text_test.rb +92 -0
  54. data/test/file/word_test.rb +35 -0
  55. data/test/file/zip_test.rb +47 -0
  56. data/test/helpers/file/delimited_test.rb +113 -0
  57. data/test/helpers/file/excel_test.rb +97 -0
  58. data/test/helpers/file/pdf_test.rb +26 -0
  59. data/test/helpers/file/word_test.rb +26 -0
  60. data/test/helpers/file/xml_test.rb +131 -0
  61. data/test/helpers/file/zip_test.rb +75 -0
  62. data/test/mapper_test.rb +551 -0
  63. data/test/non_tabular/mapping_test.rb +36 -0
  64. data/test/non_tabular/table_test.rb +510 -0
  65. data/test/non_tabular_file_helper_test.rb +501 -0
  66. data/test/readme_test.rb +53 -0
  67. data/test/resources/bomd.csv +3 -0
  68. data/test/resources/broken.csv +3 -0
  69. data/test/resources/filesystem_paths.yml +26 -0
  70. data/test/resources/flat_file.pdf +0 -0
  71. data/test/resources/flat_file.txt +27 -0
  72. data/test/resources/flat_file.yml +20 -0
  73. data/test/resources/hello_utf16be.txt +0 -0
  74. data/test/resources/hello_utf16le.txt +0 -0
  75. data/test/resources/hello_utf8.txt +2 -0
  76. data/test/resources/hello_windows.txt +2 -0
  77. data/test/resources/hello_world.doc +0 -0
  78. data/test/resources/hello_world.pdf +0 -0
  79. data/test/resources/hello_world.txt +2 -0
  80. data/test/resources/high_ascii_delimited.txt +2 -0
  81. data/test/resources/malformed.xml +6 -0
  82. data/test/resources/normal.csv +3 -0
  83. data/test/resources/normal.csv.zip +0 -0
  84. data/test/resources/normal_pipe.csv +3 -0
  85. data/test/resources/normal_thorn.csv +3 -0
  86. data/test/resources/not_a_pdf.pdf +0 -0
  87. data/test/resources/not_a_word_file.doc +0 -0
  88. data/test/resources/sample_xls.xls +0 -0
  89. data/test/resources/sample_xlsx.xlsx +0 -0
  90. data/test/resources/standard_mappings.yml +39 -0
  91. data/test/resources/txt_file_xls_extension.xls +1 -0
  92. data/test/resources/txt_file_xlsx_extension.xlsx +1 -0
  93. data/test/resources/utf-16be_xml.xml +0 -0
  94. data/test/resources/utf-16be_xml_with_declaration.xml +0 -0
  95. data/test/resources/utf-16le_xml.xml +0 -0
  96. data/test/resources/utf-8_xml.xml +9 -0
  97. data/test/resources/windows-1252_xml.xml +9 -0
  98. data/test/resources/windows.csv +5 -0
  99. data/test/resources/xlsx_file_xls_extension.xls +0 -0
  100. data/test/standard_mappings_test.rb +22 -0
  101. data/test/table_test.rb +288 -0
  102. data/test/test_helper.rb +13 -0
  103. metadata +443 -0
@@ -0,0 +1,36 @@
1
+ require 'test_helper'
2
+
3
+ # This tests the NdrImport::NonTabular::Mapping mapping class
4
+ class MappingTestTest < ActiveSupport::TestCase # Covers construction of NdrImport::NonTabular::Mapping.
5
+ def test_should_raise_error_with_no_non_tabular_row # A mapping without a non_tabular_row key must raise MappingError.
6
+ assert_raise NdrImport::MappingError do
7
+ NdrImport::NonTabular::Mapping.new(
8
+ 'columns' => [{ 'column' => 'one' }]
9
+ )
10
+ end
11
+ end
12
+
13
+ def test_should_raise_error_with_no_non_tabular_row_start_line_pattern # A nil row, or a row with a nil pattern, must raise MappingError.
14
+ assert_raise NdrImport::MappingError do
15
+ NdrImport::NonTabular::Mapping.new(
16
+ 'non_tabular_row' => nil,
17
+ 'columns' => [{ 'column' => 'one' }]
18
+ )
19
+ end
20
+
21
+ assert_raise NdrImport::MappingError do
22
+ NdrImport::NonTabular::Mapping.new(
23
+ 'non_tabular_row' => { 'start_line_pattern' => nil },
24
+ 'columns' => [{ 'column' => 'one' }]
25
+ )
26
+ end
27
+ end
28
+
29
+ def test_should_initialize_with_non_tabular_row # A valid start_line_pattern is accepted and exposed through the reader.
30
+ mapping = NdrImport::NonTabular::Mapping.new(
31
+ 'non_tabular_row' => { 'start_line_pattern' => /\A-*\z/ },
32
+ 'columns' => [{ 'column' => 'one' }]
33
+ )
34
+ assert_equal(/\A-*\z/, mapping.start_line_pattern)
35
+ end
36
+ end
@@ -0,0 +1,510 @@
1
+ require 'test_helper'
2
+
3
+ # This tests the NdrImport::NonTabular::Table mapping class
4
+ class TableTest < ActiveSupport::TestCase
5
+ def setup # Builds three line enumerators as fixtures: divider separated records, a single record with no divider, and records wrapped in START and END markers.
6
+ @simple_divider_example = <<-STR.split(/\n/).map
7
+ 111
8
+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt.
9
+ ------
10
+ 222
11
+ Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo.
12
+ ------
13
+ 333
14
+ Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla.
15
+ ------
16
+ 444
17
+ Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim.
18
+ STR
19
+
20
+ @no_divider_example = <<-STR.split(/\n/).map
21
+ 111
22
+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt.
23
+ STR
24
+
25
+ @simple_start_and_end_divider_example = <<-STR.split(/\n/).map
26
+ ----- START -----
27
+ 111
28
+ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt.
29
+ ------ END ------
30
+ This is never captured
31
+ ----- START -----
32
+ 222
33
+ Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo.
34
+ ------ END ------
35
+ This is never captured
36
+ ----- START -----
37
+ 333
38
+ Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla.
39
+ ------ END ------
40
+ This is never captured
41
+ ----- START -----
42
+ 444
43
+ This is captured
44
+ Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim.
45
+ ------ END ------
46
+ STR
47
+ end
48
+
49
+ def test_all_valid_options # Table.all_valid_options must list exactly these option names.
50
+ valid_options = %w(
51
+ canonical_name capture_start_line columns end_in_a_record end_line_pattern filename_pattern
52
+ format klass remove_lines start_in_a_record start_line_pattern
53
+ )
54
+ assert_equal valid_options.sort, # Both sides are sorted so the comparison ignores ordering.
55
+ NdrImport::NonTabular::Table.all_valid_options.sort
56
+ end
57
+
58
+ def test_should_raise_error_with_no_start_line_pattern # A missing or nil start_line_pattern must raise MappingError.
59
+ assert_raise NdrImport::MappingError do
60
+ NdrImport::NonTabular::Table.new(
61
+ 'columns' => [{ 'column' => 'one' }]
62
+ )
63
+ end
64
+
65
+ assert_raise NdrImport::MappingError do
66
+ NdrImport::NonTabular::Table.new(
67
+ 'start_line_pattern' => nil,
68
+ 'columns' => [{ 'column' => 'one' }]
69
+ )
70
+ end
71
+ end
72
+
73
+ def test_should_initialize_with_non_tabular_row # A valid start_line_pattern is accepted and exposed through the reader.
74
+ table = NdrImport::NonTabular::Table.new(
75
+ 'start_line_pattern' => /\A-*\z/,
76
+ 'columns' => [{ 'column' => 'one' }]
77
+ )
78
+ assert_equal(/\A-*\z/, table.start_line_pattern)
79
+ end
80
+
81
+ def test_should_test_flat_file_txt # Loads a Table mapping from flat_file.yml and runs it over flat_file.txt.
82
+ table = YAML.load_file(SafePath.new('permanent_test_files').join('flat_file.yml'))
83
+ assert table.is_a?(NdrImport::NonTabular::Table)
84
+ filename = SafePath.new('permanent_test_files').join('flat_file.txt')
85
+ enum = table.transform(File.new(filename).each) # NOTE(review): this File handle is never explicitly closed.
86
+ # puts enum.to_a.inspect
87
+
88
+ results = []
89
+ enum.each do |_klass, fields, _index|
90
+ results << fields[:rawtext]['one']
91
+ end
92
+
93
+ assert_equal 4, results.count
94
+ assert results.first.start_with?('1')
95
+ assert results.last.start_with?('4')
96
+
97
+ assert results.any? { |result| result =~ /This is captured/ }
98
+ refute results.any? { |result| result =~ /This is never captured/ }
99
+ refute results.any? { |result| result =~ /== Page/ } # No captured text may contain the page marker text.
100
+ end
101
+
102
+ def test_should_raise_error_with_no_column_non_tabular_cell # A column with no non_tabular_cell key must raise MappingError on transform.
103
+ table = YAML.load <<-YML.strip_heredoc
104
+ --- !ruby/object:NdrImport::NonTabular::Table
105
+ start_line_pattern: !ruby/regexp /^-{6}$/
106
+ klass: SomeTestKlass
107
+ columns:
108
+ - column: one
109
+ YML
110
+ assert_raise NdrImport::MappingError do
111
+ table.transform(@simple_divider_example).to_a # to_a forces the enumerator to be evaluated inside the assertion block.
112
+ end
113
+ end
114
+
115
+ def test_should_raise_error_with_no_column_non_tabular_cell_lines # An empty non_tabular_cell, or one with empty lines, must raise MappingError.
116
+ table = YAML.load <<-YML.strip_heredoc
117
+ --- !ruby/object:NdrImport::NonTabular::Table
118
+ start_line_pattern: !ruby/regexp /^-{6}$/
119
+ klass: SomeTestKlass
120
+ columns:
121
+ - column: one
122
+ non_tabular_cell:
123
+ YML
124
+ assert_raise NdrImport::MappingError do
125
+ table.transform(@simple_divider_example).to_a
126
+ end
127
+
128
+ table = YAML.load <<-YML.strip_heredoc
129
+ --- !ruby/object:NdrImport::NonTabular::Table
130
+ start_line_pattern: !ruby/regexp /^-{6}$/
131
+ klass: SomeTestKlass
132
+ columns:
133
+ - column: one
134
+ non_tabular_cell:
135
+ lines:
136
+ YML
137
+ assert_raise NdrImport::MappingError do
138
+ table.transform(@simple_divider_example).to_a
139
+ end
140
+ end
141
+
142
+ def test_should_raise_error_with_no_column_non_tabular_cell_capture # A cell with lines but a missing or empty capture must raise MappingError.
143
+ table = YAML.load <<-YML.strip_heredoc
144
+ --- !ruby/object:NdrImport::NonTabular::Table
145
+ start_line_pattern: !ruby/regexp /^-{6}$/
146
+ klass: SomeTestKlass
147
+ columns:
148
+ - column: one
149
+ non_tabular_cell:
150
+ lines: !ruby/range
151
+ begin: 0
152
+ end: -1
153
+ excl: false
154
+ YML
155
+ assert_raise NdrImport::MappingError do
156
+ table.transform(@simple_divider_example).to_a
157
+ end
158
+
159
+ table = YAML.load <<-YML.strip_heredoc
160
+ --- !ruby/object:NdrImport::NonTabular::Table
161
+ start_line_pattern: !ruby/regexp /^-{6}$/
162
+ klass: SomeTestKlass
163
+ columns:
164
+ - column: one
165
+ non_tabular_cell:
166
+ lines: !ruby/range
167
+ begin: 0
168
+ end: -1
169
+ excl: false
170
+ capture:
171
+ YML
172
+ assert_raise NdrImport::MappingError do
173
+ table.transform(@simple_divider_example).to_a
174
+ end
175
+ end
176
+
177
+ def test_should_only_return_two_results_with_no_start_in_a_record_or_end_in_a_record # With neither flag set, only the 222 and 333 records are expected.
178
+ table = YAML.load <<-YML.strip_heredoc
179
+ --- !ruby/object:NdrImport::NonTabular::Table
180
+ start_line_pattern: !ruby/regexp /^-{6}$/
181
+ klass: SomeTestKlass
182
+ columns:
183
+ - column: one
184
+ non_tabular_cell:
185
+ lines: !ruby/range
186
+ begin: 0
187
+ end: -1
188
+ excl: false
189
+ capture: !ruby/regexp /^(.*)$/i
190
+ YML
191
+ enum = table.transform(@simple_divider_example)
192
+ assert_instance_of Enumerator, enum
193
+ results = enum.map { |_klass, fields, _index| fields[:rawtext]['one'] } # Keep only the raw text captured for column one.
194
+
195
+ assert_equal 2, results.count
196
+ assert results.first.start_with?('222')
197
+ assert results.last.start_with?('333')
198
+ end
199
+
200
+ def test_should_return_three_results_with_start_in_a_record # With start_in_a_record set, the leading 111 record is also expected.
201
+ table = YAML.load <<-YML.strip_heredoc
202
+ --- !ruby/object:NdrImport::NonTabular::Table
203
+ start_line_pattern: !ruby/regexp /^-{6}$/
204
+ start_in_a_record: true
205
+ klass: SomeTestKlass
206
+ columns:
207
+ - column: one
208
+ non_tabular_cell:
209
+ lines: !ruby/range
210
+ begin: 0
211
+ end: -1
212
+ excl: false
213
+ capture: !ruby/regexp /^(.*)$/i
214
+ YML
215
+ enum = table.transform(@simple_divider_example)
216
+ assert_instance_of Enumerator, enum
217
+ results = enum.map { |_klass, fields, _index| fields[:rawtext]['one'] }
218
+
219
+ assert_equal 3, results.count
220
+ assert results.first.start_with?('111')
221
+ assert results.last.start_with?('333')
222
+ end
223
+
224
+ def test_should_return_three_results_with_end_in_a_record # With end_in_a_record set, the trailing 444 record is also expected.
225
+ table = YAML.load <<-YML.strip_heredoc
226
+ --- !ruby/object:NdrImport::NonTabular::Table
227
+ start_line_pattern: !ruby/regexp /^-{6}$/
228
+ end_in_a_record: true
229
+ klass: SomeTestKlass
230
+ columns:
231
+ - column: one
232
+ non_tabular_cell:
233
+ lines: !ruby/range
234
+ begin: 0
235
+ end: -1
236
+ excl: false
237
+ capture: !ruby/regexp /^(.*)$/i
238
+ YML
239
+ enum = table.transform(@simple_divider_example)
240
+ assert_instance_of Enumerator, enum
241
+ results = enum.map { |_klass, fields, _index| fields[:rawtext]['one'] }
242
+
243
+ assert_equal 3, results.count
244
+ assert results.first.start_with?('222')
245
+ assert results.last.start_with?('444')
246
+ end
247
+
248
+ def test_should_return_four_results_with_start_in_a_record_and_end_in_a_record # With both flags set, all four records from 111 to 444 are expected.
249
+ table = YAML.load <<-YML.strip_heredoc
250
+ --- !ruby/object:NdrImport::NonTabular::Table
251
+ start_line_pattern: !ruby/regexp /^-{6}$/
252
+ start_in_a_record: true
253
+ end_in_a_record: true
254
+ klass: SomeTestKlass
255
+ columns:
256
+ - column: one
257
+ non_tabular_cell:
258
+ lines: !ruby/range
259
+ begin: 0
260
+ end: -1
261
+ excl: false
262
+ capture: !ruby/regexp /^(.*)$/i
263
+ YML
264
+ enum = table.transform(@simple_divider_example)
265
+ assert_instance_of Enumerator, enum
266
+ results = enum.map { |_klass, fields, _index| fields[:rawtext]['one'] }
267
+
268
+ assert_equal 4, results.count
269
+ assert results.first.start_with?('111')
270
+ assert results.last.start_with?('444')
271
+ end
272
+
273
+ def test_should_return_one_results_with_start_in_a_record_and_end_in_a_record # Input with no divider at all is expected to yield one record when both flags are set.
274
+ table = YAML.load <<-YML.strip_heredoc
275
+ --- !ruby/object:NdrImport::NonTabular::Table
276
+ start_line_pattern: !ruby/regexp /^-{6}$/
277
+ start_in_a_record: true
278
+ end_in_a_record: true
279
+ klass: SomeTestKlass
280
+ columns:
281
+ - column: one
282
+ non_tabular_cell:
283
+ lines: !ruby/range
284
+ begin: 0
285
+ end: -1
286
+ excl: false
287
+ capture: !ruby/regexp /^(.*)$/i
288
+ YML
289
+ enum = table.transform(@no_divider_example)
290
+ assert_instance_of Enumerator, enum
291
+ results = enum.map { |_klass, fields, _index| fields[:rawtext]['one'] }
292
+
293
+ assert_equal 1, results.count
294
+ assert results.first.start_with?('111')
295
+ end
296
+
297
+ def test_should_return_four_results_with_start_and_end_dividers # Separate start and end patterns: text between an END and the next START must not be captured.
298
+ table = YAML.load <<-YML.strip_heredoc
299
+ --- !ruby/object:NdrImport::NonTabular::Table
300
+ start_line_pattern: !ruby/regexp /^----- START -----$/
301
+ end_line_pattern: !ruby/regexp /^------ END ------$/
302
+ klass: SomeTestKlass
303
+ columns:
304
+ - column: one
305
+ non_tabular_cell:
306
+ lines: !ruby/range
307
+ begin: 0
308
+ end: -1
309
+ excl: false
310
+ capture: !ruby/regexp /^(.*)$/i
311
+ YML
312
+ enum = table.transform(@simple_start_and_end_divider_example)
313
+ assert_instance_of Enumerator, enum
314
+
315
+ results = enum.map { |_klass, fields, _index| fields[:rawtext]['one'] }
316
+
317
+ assert_equal 4, results.count
318
+ assert results.first.start_with?('111')
319
+ assert results.last.start_with?('444')
320
+
321
+ assert results.any? { |result| result =~ /This is captured/ }
322
+ refute results.any? { |result| result =~ /This is never captured/ }
323
+ end
324
+
325
+ def test_should_capture # End to end check of cell capture using a single line, an integer range and RegexpRange line selectors.
326
+ table = YAML.load <<-YML.strip_heredoc
327
+ --- !ruby/object:NdrImport::NonTabular::Table
328
+ start_line_pattern: !ruby/regexp /^-{6}$/
329
+ klass: SomeTestKlass
330
+ columns:
331
+ - standard_mapping: nhsnumber
332
+ non_tabular_cell:
333
+ lines: 0
334
+ capture: !ruby/regexp /^(\\d*)$/i
335
+ - column: address
336
+ non_tabular_cell:
337
+ lines: !ruby/range
338
+ begin: 1
339
+ end: 5
340
+ excl: false
341
+ capture: !ruby/regexp /^.{50}(.*)$/i
342
+ join: ", "
343
+ - standard_mapping: postcode
344
+ non_tabular_cell:
345
+ lines: 6
346
+ capture: !ruby/regexp /^.{50}(.*)$/i
347
+ - column: capture_inclusive
348
+ non_tabular_cell:
349
+ lines: !ruby/object:RegexpRange
350
+ begin: !ruby/regexp /^CAPTURE INCLUSIVE$/
351
+ end: !ruby/regexp /^Capture me.$/i
352
+ excl: false
353
+ capture: !ruby/regexp /^(.*)$/i
354
+ join: "\\n"
355
+ - column: capture_exclusive
356
+ non_tabular_cell:
357
+ lines: !ruby/object:RegexpRange
358
+ begin: !ruby/regexp /^CAPTURE EXCLUSIVE$/
359
+ end: !ruby/regexp /^Do NOT capture me.$/i
360
+ excl: true
361
+ capture: !ruby/regexp /^(.*)$/i
362
+ join: "\\n"
363
+ - column: capture_to_end
364
+ non_tabular_cell:
365
+ lines: !ruby/object:RegexpRange
366
+ begin: !ruby/regexp /^CAPTURE TO END$/
367
+ end: -1
368
+ excl: false
369
+ capture: !ruby/regexp /^(.*)$/i
370
+ join: "\\n"
371
+ YML
372
+ capture_example = <<-STR
373
+ This is never captured
374
+ ------
375
+ 1111111111
376
+ <----------------- 50 characters ---------------->Unit C, Magog Court
377
+ Shelford Bottom
378
+ Hinton Way
379
+ Cambridge
380
+
381
+ CB22 3AD
382
+
383
+ CAPTURE INCLUSIVE
384
+ Lorem ipsum dolor sit amet,
385
+ consectetur adipisicing elit,
386
+ Capture me.
387
+
388
+ CAPTURE EXCLUSIVE
389
+ Ut enim ad minim veniam, quis nostrud exercitation.
390
+ Do NOT capture me.
391
+
392
+ CAPTURE TO END
393
+ Lorem ipsum dolor sit amet, consectetur adipisicing elit.
394
+ Ut enim ad minim veniam, quis nostrud exercitation ullamco.
395
+ Duis aute irure dolor in reprehenderit in voluptate velit.
396
+ Excepteur sint occaecat cupidatat non proident, sunt in culpa.
397
+ ------
398
+ This is never captured
399
+ STR
400
+ enum = table.transform(capture_example.split(/\n/).map)
401
+ assert_instance_of Enumerator, enum
402
+
403
+ output = []
404
+ enum.each do |klass, fields, index| # Collect every yielded triple so the whole result can be compared at once.
405
+ output << [klass, fields, index]
406
+ end
407
+
408
+ expected_output = [
409
+ [
410
+ 'SomeTestKlass', {
411
+ 'nhsnumber' => '1111111111',
412
+ 'postcode' => 'CB223AD',
413
+ :rawtext => {
414
+ 'nhsnumber' => '1111111111',
415
+ 'address' => 'Unit C, Magog Court, Shelford Bottom, Hinton Way, Cambridge',
416
+ 'postcode' => 'CB22 3AD',
417
+ 'capture_inclusive' => "CAPTURE INCLUSIVE\nLorem ipsum dolor sit amet,\n" \
418
+ "consectetur adipisicing elit,\nCapture me.",
419
+ 'capture_exclusive' => "CAPTURE EXCLUSIVE\n" \
420
+ 'Ut enim ad minim veniam, quis nostrud exercitation.',
421
+ 'capture_to_end' => "CAPTURE TO END\n" \
422
+ "Lorem ipsum dolor sit amet, consectetur adipisicing elit.\n" \
423
+ "Ut enim ad minim veniam, quis nostrud exercitation ullamco.\n" \
424
+ "Duis aute irure dolor in reprehenderit in voluptate velit.\n" \
425
+ 'Excepteur sint occaecat cupidatat non proident, sunt in culpa.'
426
+ }
427
+ },
428
+ 0
429
+ ]
430
+ ]
431
+ assert_equal expected_output.sort, output.sort
432
+ assert_equal 25, table.non_tabular_lines.last.absolute_line_number # The last line the table recorded is expected to carry absolute line number 25.
433
+ end
434
+
435
+ def test_handles_non_utf8_characters
436
+ mixed_encoding_example = <<-STR.each_line
437
+ 111
438
+ Lorem ipsum dolor sit amet.
439
+ ------
440
+ 111
441
+ Lorem ipsum dolor\xBE sit amet.
442
+ ------
443
+ 111
444
+ Lorem ipsum dolor sit amet.
445
+ ------
446
+ STR
447
+
448
+ table = YAML.load <<-YML.strip_heredoc
449
+ --- !ruby/object:NdrImport::NonTabular::Table
450
+ start_line_pattern: !ruby/regexp /^111$/
451
+ end_in_a_record: true
452
+ klass: SomeTestKlass
453
+ columns:
454
+ - column: one
455
+ non_tabular_cell:
456
+ lines: !ruby/range
457
+ begin: 0
458
+ end: -1
459
+ excl: true
460
+ capture: !ruby/regexp /^(.*)$/i
461
+ YML
462
+
463
+ enum = table.transform(mixed_encoding_example)
464
+ assert_instance_of Enumerator, enum
465
+ results = enum.map { |_klass, fields, _index| fields[:rawtext]['one'] }
466
+
467
+ assert_equal 3, results.count, 'records were lost'
468
+
469
+ assert_equal [27, 28, 27], results.map { |row| row.chars.to_a.length }
470
+ assert_equal [27, 29, 27], results.map { |row| row.bytes.to_a.length }
471
+
472
+ results.each do |row|
473
+ assert row.first.valid_encoding?
474
+ assert_equal Encoding.find('UTF-8'), row.first.encoding
475
+ end
476
+ end
477
+
478
+ def test_should_not_allow_junk_bytes # Input containing the byte 0x8D is expected to raise UTF8CoercionError during transform.
479
+ junk = <<-STR.each_line
480
+ 111
481
+ Lorem ipsum dolor sit amet.
482
+ ------
483
+ 111
484
+ Lorem ipsum dolor\x8D sit amet.
485
+ ------
486
+ 111
487
+ Lorem ipsum dolor sit amet.
488
+ ------
489
+ STR
490
+
491
+ table = YAML.load <<-YML.strip_heredoc
492
+ --- !ruby/object:NdrImport::NonTabular::Table
493
+ start_line_pattern: !ruby/regexp /^111$/
494
+ end_in_a_record: true
495
+ klass: SomeTestKlass
496
+ columns:
497
+ - column: one
498
+ non_tabular_cell:
499
+ lines: !ruby/range
500
+ begin: 0
501
+ end: -1
502
+ excl: true
503
+ capture: !ruby/regexp /^(.*)$/i
504
+ YML
505
+
506
+ assert_raises(UTF8Encoding::UTF8CoercionError) do
507
+ table.transform(junk).to_a # to_a forces the enumerator to be evaluated inside the assertion block.
508
+ end
509
+ end
510
+ end