ndr_import 9.0.2 → 10.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7dc0801c147101346c5a2e0af96ef9a658bc068088bb54f4f65970bb012e6cf5
4
- data.tar.gz: 22823d5d0415a95eebac8151475d6eba7fdbd1744003ee0d88b1dedf86b837b2
3
+ metadata.gz: 2463ac35899a6db81e345b75b0ea10186530f559460dfca1211ba7694f52b760
4
+ data.tar.gz: 2f2cf39e959beeb3cfe6bcad033eb2c0695486dce3b640fe3de8b0c2b9b88a2f
5
5
  SHA512:
6
- metadata.gz: 3c29ef1ab701b94301aca343d8bf8497fda2633d120b3945dc140cb30d1b40b30bfcfb2334fe2bdd6f02d6128f1743ac42601f8129687502503e69c14d77df98
7
- data.tar.gz: 49e739dab53bf1276655e01a91dd9253221e041c0f3241b85f362a2d429597fbb4589d14b89b1d83fed7995b043288e72f3bf1bd27d2678e6f7d2c4b1a5c3d8f
6
+ metadata.gz: c6eca601043ff01ebe910375a36131bdf55ebd3f18664e3db3c07180f007b0f06dc50cc3016634a12764befc1e28d621a09a7b382392f67608aa4d21c83c7f2d
7
+ data.tar.gz: dc568494bfc4b39b7ee47c7738511274cb85ad9532e380f3c27870682a1a21d2acda4d7d8d855597abdd0643b8afe0ed6e161b74f201fa53640b5c9641f895c5
@@ -0,0 +1,2 @@
1
+ # Admins should have oversight of the version:
2
+ lib/ndr_import/version.rb @publichealthengland/ndr-admins
@@ -0,0 +1,23 @@
1
+ name: Lint
2
+
3
+ on: [pull_request]
4
+
5
+ jobs:
6
+ rubocop:
7
+ name: RuboCop
8
+
9
+ runs-on: ubuntu-latest
10
+
11
+ steps:
12
+ - uses: actions/checkout@v2
13
+ with:
14
+ fetch-depth: 0 # fetch everything
15
+ - name: Set up Ruby
16
+ uses: ruby/setup-ruby@v1
17
+ with:
18
+ ruby-version: 3.0
19
+ - name: Install dependencies
20
+ run: bundle install
21
+ - name: Run RuboCop against BASE..HEAD changes
22
+ run: bundle exec rake rubocop:diff origin/${GITHUB_BASE_REF#*/}
23
+
@@ -0,0 +1,72 @@
1
+ name: Test
2
+
3
+ on: [push]
4
+
5
+ jobs:
6
+ test:
7
+ strategy:
8
+ fail-fast: false
9
+ matrix:
10
+ ruby-version:
11
+ - 2.6
12
+ - 2.7
13
+ - 3.0
14
+ gemfile:
15
+ - gemfiles/Gemfile.rails52
16
+ - gemfiles/Gemfile.rails60
17
+
18
+ name: Ruby ${{ matrix.ruby-version }} / Bundle ${{ matrix.gemfile }}
19
+
20
+ runs-on: ubuntu-latest
21
+
22
+ env:
23
+ BUNDLE_GEMFILE: ${{ matrix.gemfile }}
24
+
25
+ steps:
26
+ - uses: actions/checkout@v2
27
+ - name: Set up Ruby
28
+ uses: ruby/setup-ruby@v1
29
+ with:
30
+ ruby-version: ${{ matrix.ruby-version }}
31
+ - name: Install dependencies
32
+ run: bundle install
33
+ - name: Run tests
34
+ run: bundle exec rake
35
+
36
+ # A utility job upon which Branch Protection can depend,
37
+ # thus remaining agnostic of the matrix.
38
+ test_matrix:
39
+ if: ${{ always() }}
40
+ runs-on: ubuntu-latest
41
+ name: Matrix
42
+ needs: test
43
+ steps:
44
+ - name: Check build matrix status
45
+ if: ${{ needs.test.result != 'success' }}
46
+ run: exit 1
47
+
48
+ notify:
49
+ # Run only on master, but regardless of whether tests pass:
50
+ if: ${{ always() && github.ref == 'refs/heads/master' }}
51
+
52
+ needs: test_matrix
53
+
54
+ runs-on: ubuntu-latest
55
+
56
+ steps:
57
+ - uses: 8398a7/action-slack@v3
58
+ with:
59
+ status: custom
60
+ fields: workflow,commit,author
61
+ custom_payload: |
62
+ {
63
+ channel: 'C7FQWGDHP',
64
+ username: 'CI – ' + '${{ github.repository }}'.split('/')[1],
65
+ icon_emoji: ':hammer_and_wrench:',
66
+ attachments: [{
67
+ color: '${{ needs.test_matrix.result }}' === 'success' ? 'good' : '${{ needs.test_matrix.result }}' === 'failure' ? 'danger' : 'warning',
68
+ text: `${process.env.AS_WORKFLOW} against \`${{ github.ref }}\` (${process.env.AS_COMMIT}) for ${{ github.actor }} resulted in *${{ needs.test_matrix.result }}*.`
69
+ }]
70
+ }
71
+ env:
72
+ SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
data/CHANGELOG.md CHANGED
@@ -1,6 +1,37 @@
1
1
  ## [Unreleased]
2
2
  *no unreleased changes*
3
3
 
4
+ ## 10.1.1 / 2021-03-15
5
+ ### Fixed
6
+ * XML: ensure invalid control character *references* are also escaped (#64)
7
+
8
+ ## 10.1.0 / 2021-03-08
9
+ ### Added
10
+ * Allow optional `last_data_column` in NdrImport::Table mappings (#61)
11
+
12
+ ## 10.0.0 / 2021-02-22
13
+ ### Changed
14
+ * By default, escape any control characters found in XML (#60)
15
+
16
+ ## 9.1.0 / 2021-02-01
17
+ ### Added
18
+ * `CSVLibrary` is now deprecated.
19
+ * Handle xlsm files
20
+
21
+ ### Fixed
22
+ * Fix `CSVLibrary.foreach` on Ruby 3.0
23
+ * Updated jekyll bundle
24
+
25
+ ## 9.0.3 / 2021-01-04
26
+ ### Fixed
27
+ * Address issue importing multiple files against the same table (#54)
28
+
29
+ ### Changed
30
+ * Ensure keyword args are properly splatted for Ruby 2.7
31
+
32
+ ### Added
33
+ * Ruby 2.7 to travis matrix
34
+
4
35
  ## 9.0.2 / 2020-08-14
5
36
  ### Changed
6
37
  * Configure Nokogiri with HUGE for large xml files
data/README.md CHANGED
@@ -1,5 +1,4 @@
1
- # NdrImport [![Build Status](https://travis-ci.org/PublicHealthEngland/ndr_import.svg?branch=master)](https://travis-ci.org/PublicHealthEngland/ndr_import) [![Gem Version](https://badge.fury.io/rb/ndr_import.svg)](https://badge.fury.io/rb/ndr_import) [![Documentation](https://img.shields.io/badge/ndr_import-docs-blue.svg)](https://www.rubydoc.info/gems/ndr_import)
2
-
1
+ # NdrImport [![Build Status](https://github.com/publichealthengland/ndr_import/workflows/Test/badge.svg)](https://github.com/publichealthengland/ndr_import/actions?query=workflow%3Atest) [![Gem Version](https://badge.fury.io/rb/ndr_import.svg)](https://rubygems.org/gems/ndr_import) [![Documentation](https://img.shields.io/badge/ndr_import-docs-blue.svg)](https://www.rubydoc.info/gems/ndr_import)
3
2
  This is the Public Health England (PHE) National Disease Registers (NDR) Import ETL ruby gem, providing:
4
3
 
5
4
  1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip and Zip files.
data/code_safety.yml CHANGED
@@ -1,5 +1,17 @@
1
1
  ---
2
2
  file safety:
3
+ ".github/CODEOWNERS":
4
+ comments:
5
+ reviewed_by: ollietulloch
6
+ safe_revision: b64ff21375dcde2b8fefe622ee9861f0fea21487
7
+ ".github/workflows/lint.yml":
8
+ comments:
9
+ reviewed_by: ollietulloch
10
+ safe_revision: b64ff21375dcde2b8fefe622ee9861f0fea21487
11
+ ".github/workflows/test.yml":
12
+ comments:
13
+ reviewed_by: ollietulloch
14
+ safe_revision: b64ff21375dcde2b8fefe622ee9861f0fea21487
3
15
  ".gitignore":
4
16
  comments: whole file re-reviewed
5
17
  reviewed_by: josh.pencheon
@@ -12,14 +24,10 @@ file safety:
12
24
  comments:
13
25
  reviewed_by: josh.pencheon
14
26
  safe_revision: b09e268ff9c8349b914aa1b7ba888e1d39f97e4a
15
- ".travis.yml":
16
- comments:
17
- reviewed_by: josh.pencheon
18
- safe_revision: d3d9a987befeecb122a448d8d06e66d74da13fb5
19
27
  CHANGELOG.md:
20
28
  comments:
21
- reviewed_by: ollietulloch
22
- safe_revision: aa006cd76123db2101d145a07d201dc6a709ed6e
29
+ reviewed_by: josh.pencheon
30
+ safe_revision: 47fa3633ec2e48f1ee9fb12aad03e817e73c54bf
23
31
  CODE_OF_CONDUCT.md:
24
32
  comments:
25
33
  reviewed_by: timgentry
@@ -38,8 +46,8 @@ file safety:
38
46
  safe_revision: 5d185a0aeba6a9cd2ff5e59efadcaeec9be45d8b
39
47
  README.md:
40
48
  comments:
41
- reviewed_by: josh.pencheon
42
- safe_revision: 1bc459db8970dde36e9b240b6dd08cca629664e3
49
+ reviewed_by: ollietulloch
50
+ safe_revision: b64ff21375dcde2b8fefe622ee9861f0fea21487
43
51
  Rakefile:
44
52
  comments:
45
53
  reviewed_by: josh.pencheon
@@ -59,7 +67,7 @@ file safety:
59
67
  docs/Gemfile.lock:
60
68
  comments:
61
69
  reviewed_by: ollietulloch
62
- safe_revision: 6f274715bb341c3070190f04f67af9500b510580
70
+ safe_revision: ea0149c7739676463a252ffd9fbe4af238762b2b
63
71
  docs/_config.yml:
64
72
  comments:
65
73
  reviewed_by: josh.pencheon
@@ -146,8 +154,8 @@ file safety:
146
154
  safe_revision: 24d6449fd0612552f132dfbf4cada2ae28d0469e
147
155
  lib/ndr_import/csv_library.rb:
148
156
  comments:
149
- reviewed_by: josh.pencheon
150
- safe_revision: be12e57519d3737e8d3901d7b01485c6995708dd
157
+ reviewed_by: ollietulloch
158
+ safe_revision: 6b8668967dbd42d7893a0fa5f0aa1ec1c11227e1
151
159
  lib/ndr_import/file/acro_form.rb:
152
160
  comments:
153
161
  reviewed_by: josh.pencheon
@@ -170,8 +178,8 @@ file safety:
170
178
  safe_revision: 897f8b648d633368cf2001d17ab89c06a12d445b
171
179
  lib/ndr_import/file/excel.rb:
172
180
  comments:
173
- reviewed_by: josh.pencheon
174
- safe_revision: c3183e522bce50008df576ceb47fe4761ab8f966
181
+ reviewed_by: ollietulloch
182
+ safe_revision: 37482c79448bea80033f6f69d97584df330c9861
175
183
  lib/ndr_import/file/office_file_helper.rb:
176
184
  comments:
177
185
  reviewed_by: josh.pencheon
@@ -214,8 +222,8 @@ file safety:
214
222
  safe_revision: dfc958d44b6c58355445fa395db08a62213ee709
215
223
  lib/ndr_import/helpers/file/delimited.rb:
216
224
  comments:
217
- reviewed_by: josh.pencheon
218
- safe_revision: 607c0668f1fffd70d181bc1a31c4f56eed5f6189
225
+ reviewed_by: ollietulloch
226
+ safe_revision: 4a5cc1d362c632fc1f9242c69982fbce33557e17
219
227
  lib/ndr_import/helpers/file/excel.rb:
220
228
  comments:
221
229
  reviewed_by: joshpencheon
@@ -230,8 +238,8 @@ file safety:
230
238
  safe_revision: 45da71ebd3acbc0fe53755bcd75483ba17cb6924
231
239
  lib/ndr_import/helpers/file/xml.rb:
232
240
  comments:
233
- reviewed_by: ollietulloch
234
- safe_revision: 4d337bd233f7e60cf9d363c92400f21269a28da7
241
+ reviewed_by: josh.pencheon
242
+ safe_revision: 9a6cc769abce5f9bfa5b4f8bd5cda52dfe18b12b
235
243
  lib/ndr_import/helpers/file/xml_streaming.rb:
236
244
  comments: uses SafePath and Shellwords when accessing filesystem, or making system
237
245
  calls
@@ -272,7 +280,7 @@ file safety:
272
280
  lib/ndr_import/non_tabular/table.rb:
273
281
  comments:
274
282
  reviewed_by: josh.pencheon
275
- safe_revision: 71979e0a602ca5a0ce415c194f10add9959f0116
283
+ safe_revision: f9df064adcfd38f09d83ad8c5496c84188faed98
276
284
  lib/ndr_import/non_tabular_file_helper.rb:
277
285
  comments:
278
286
  reviewed_by: josh.pencheon
@@ -288,19 +296,23 @@ file safety:
288
296
  lib/ndr_import/table.rb:
289
297
  comments: uses File.basename
290
298
  reviewed_by: josh.pencheon
291
- safe_revision: a69d4a57ddcf13cdc13c27bd2eb91a395fa7ea36
299
+ safe_revision: 3cf7473181f7f835b3dfe7822f6833d751805eaf
292
300
  lib/ndr_import/universal_importer_helper.rb:
293
301
  comments:
294
302
  reviewed_by: josh.pencheon
295
- safe_revision: 55e502bb4445cb8b985e530e8eb26d92b574ded9
303
+ safe_revision: 85869d99ae93252b7f3ef2d0a4db817c88d35c9e
296
304
  lib/ndr_import/unmapped_data_error.rb:
297
305
  comments:
298
306
  reviewed_by: josh.pencheon
299
307
  safe_revision: 5cd2cd0b3a1e254d30d4acc28c6731825a1f84f5
300
308
  lib/ndr_import/version.rb:
301
309
  comments: another check?
302
- reviewed_by: ollietulloch
303
- safe_revision: aa006cd76123db2101d145a07d201dc6a709ed6e
310
+ reviewed_by: josh.pencheon
311
+ safe_revision: 47fa3633ec2e48f1ee9fb12aad03e817e73c54bf
312
+ lib/ndr_import/xml/control_char_escaper.rb:
313
+ comments:
314
+ reviewed_by: josh.pencheon
315
+ safe_revision: 9a6cc769abce5f9bfa5b4f8bd5cda52dfe18b12b
304
316
  lib/ndr_import/xml/table.rb:
305
317
  comments:
306
318
  reviewed_by: josh.pencheon
@@ -309,6 +321,10 @@ file safety:
309
321
  comments:
310
322
  reviewed_by: josh.pencheon
311
323
  safe_revision: 95e6ee9997d06471fe6f2f169c3c701471086371
324
+ test/csv_library_test.rb:
325
+ comments:
326
+ reviewed_by: ollietulloch
327
+ safe_revision: 6b8668967dbd42d7893a0fa5f0aa1ec1c11227e1
312
328
  test/file/acro_form_test.rb:
313
329
  comments:
314
330
  reviewed_by: josh.pencheon
@@ -327,16 +343,16 @@ file safety:
327
343
  safe_revision: a69d4a57ddcf13cdc13c27bd2eb91a395fa7ea36
328
344
  test/file/excel_test.rb:
329
345
  comments:
330
- reviewed_by: josh.pencheon
331
- safe_revision: a69d4a57ddcf13cdc13c27bd2eb91a395fa7ea36
346
+ reviewed_by: ollietulloch
347
+ safe_revision: 85a080deaa93e4220ad1bf566f29cbdac9b31c0f
332
348
  test/file/pdf_test.rb:
333
349
  comments:
334
350
  reviewed_by: josh.pencheon
335
351
  safe_revision: cb24ed3ea8116730d07f74546cd6fed0738b171d
336
352
  test/file/registry_test.rb:
337
353
  comments:
338
- reviewed_by: josh.pencheon
339
- safe_revision: 5cd2cd0b3a1e254d30d4acc28c6731825a1f84f5
354
+ reviewed_by: ollietulloch
355
+ safe_revision: 85a080deaa93e4220ad1bf566f29cbdac9b31c0f
340
356
  test/file/seven_zip_test.rb:
341
357
  comments:
342
358
  reviewed_by: josh.pencheon
@@ -383,8 +399,8 @@ file safety:
383
399
  safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
384
400
  test/helpers/file/xml_test.rb:
385
401
  comments:
386
- reviewed_by: timgentry
387
- safe_revision: 137170d443ea6bcc0afb18f62202c285ae6501eb
402
+ reviewed_by: josh.pencheon
403
+ safe_revision: 9a6cc769abce5f9bfa5b4f8bd5cda52dfe18b12b
388
404
  test/helpers/file/zip_test.rb:
389
405
  comments:
390
406
  reviewed_by: josh.pencheon
@@ -495,8 +511,8 @@ file safety:
495
511
  safe_revision: 71979e0a602ca5a0ce415c194f10add9959f0116
496
512
  test/resources/malformed.xml:
497
513
  comments:
498
- reviewed_by: timgentry
499
- safe_revision: 137170d443ea6bcc0afb18f62202c285ae6501eb
514
+ reviewed_by: joshpencheon
515
+ safe_revision: 3947f13e0cbd17f449eba292ad343eeb82116fe9
500
516
  test/resources/malformed_pipe.csv:
501
517
  comments:
502
518
  reviewed_by: josh.pencheon
@@ -553,6 +569,10 @@ file safety:
553
569
  comments:
554
570
  reviewed_by: timgentry
555
571
  safe_revision: 8c30f89f0562ab120769c166d4e93ff839c055f7
572
+ test/resources/sample_xlsm.xlsm:
573
+ comments:
574
+ reviewed_by: ollietulloch
575
+ safe_revision: 85a080deaa93e4220ad1bf566f29cbdac9b31c0f
556
576
  test/resources/sample_xlsx.xlsx:
557
577
  comments:
558
578
  reviewed_by: timgentry
@@ -569,6 +589,10 @@ file safety:
569
589
  comments:
570
590
  reviewed_by: timgentry
571
591
  safe_revision: 31fb1935f4578729d8786eea41cf0ce0a19be1cd
592
+ test/resources/two_files_single_table_mapping.zip:
593
+ comments:
594
+ reviewed_by: ollietulloch
595
+ safe_revision: 830de0f8cb139c5f61525652b424423935cfc7ac
572
596
  test/resources/txt_file_xls_extension.xls:
573
597
  comments:
574
598
  reviewed_by: timgentry
@@ -601,6 +625,22 @@ file safety:
601
625
  comments:
602
626
  reviewed_by: timgentry
603
627
  safe_revision: f755c6960182f7dd460c18866cccfdf09178e860
628
+ test/resources/with-control-char-references-in-cdata.xml:
629
+ comments:
630
+ reviewed_by: josh.pencheon
631
+ safe_revision: 9a6cc769abce5f9bfa5b4f8bd5cda52dfe18b12b
632
+ test/resources/with-control-char-references.xml:
633
+ comments:
634
+ reviewed_by: josh.pencheon
635
+ safe_revision: 9a6cc769abce5f9bfa5b4f8bd5cda52dfe18b12b
636
+ test/resources/with-control-chars.xml:
637
+ comments:
638
+ reviewed_by: joshpencheon
639
+ safe_revision: 3947f13e0cbd17f449eba292ad343eeb82116fe9
640
+ test/resources/with-non-control-char-references.xml:
641
+ comments:
642
+ reviewed_by: josh.pencheon
643
+ safe_revision: 9a6cc769abce5f9bfa5b4f8bd5cda52dfe18b12b
604
644
  test/resources/xlsx_file_xls_extension.xls:
605
645
  comments:
606
646
  reviewed_by: timgentry
@@ -612,7 +652,7 @@ file safety:
612
652
  test/table_test.rb:
613
653
  comments:
614
654
  reviewed_by: josh.pencheon
615
- safe_revision: a69d4a57ddcf13cdc13c27bd2eb91a395fa7ea36
655
+ safe_revision: 3cf7473181f7f835b3dfe7822f6833d751805eaf
616
656
  test/test_helper.rb:
617
657
  comments:
618
658
  reviewed_by: josh.pencheon
@@ -620,7 +660,11 @@ file safety:
620
660
  test/universal_importer_helper_test.rb:
621
661
  comments:
622
662
  reviewed_by: josh.pencheon
623
- safe_revision: c3183e522bce50008df576ceb47fe4761ab8f966
663
+ safe_revision: 85869d99ae93252b7f3ef2d0a4db817c88d35c9e
664
+ test/xml/control_char_escaper_test.rb:
665
+ comments:
666
+ reviewed_by: josh.pencheon
667
+ safe_revision: 9a6cc769abce5f9bfa5b4f8bd5cda52dfe18b12b
624
668
  test/xml/table_test.rb:
625
669
  comments:
626
670
  reviewed_by: josh.pencheon
@@ -1,38 +1,60 @@
1
1
  # This file allows us to choose the CSV library we want to use.
2
2
 
3
3
  require 'csv'
4
+ require 'active_support/deprecation'
5
+
4
6
  # Using relevant core CSV library.
5
- CSVLibrary = CSV
7
+ class CSVLibrary < CSV; end
6
8
 
7
9
  class << CSVLibrary
8
10
  # Is the library we're using FasterCSV?
9
11
  def fastercsv?
12
+ deprecate('if you desparately want fastercsv, please use it explicitly')
10
13
  not self.const_defined?(:Reader)
11
14
  end
12
15
 
13
16
  # Ensure that we can pass "mode" straight through the underlying IO object
17
+ #
18
+ # Note: this could likely be refactored now, as upstream support for something
19
+ # very similar was added:
20
+ #
21
+ # https://github.com/ruby/csv/commit/b4edaf2cf1aa36f5c6264c07514b66739b87ceee
22
+ #
14
23
  def foreach(path, **options, &block)
15
- return to_enum(__method__, path, options) unless block
16
- open(path, options.delete(:mode) || 'r', options) do |csv|
24
+ deprecate('CSV#foreach exists, with an optional `mode` argument')
25
+ return to_enum(__method__, path, **options) unless block
26
+ open(path, options.delete(:mode) || 'r', **options) do |csv|
17
27
  csv.each(&block)
18
28
  end
19
29
  end
20
30
 
21
31
  def write_csv_to_string(data)
32
+ deprecate('write_csv_to_string -> generate')
22
33
  self.generate do |csv|
23
34
  data.each { |line| csv << line }
24
35
  end
25
36
  end
26
37
 
27
38
  def write_csv_to_file(data, filepath, mode = 'w')
39
+ deprecate('write_csv_to_file -> open')
28
40
  self.open(filepath, mode) do |csv|
29
41
  data.each { |line| csv << line }
30
42
  end
31
43
  end
32
44
 
33
45
  def read_csv_from_file(filepath)
46
+ deprecate('read_csv_from_file -> read')
34
47
  self.read(filepath)
35
48
  end
49
+
50
+ private
51
+
52
+ def deprecate(additional_message = nil)
53
+ ActiveSupport::Deprecation.warn(<<~MESSAGE)
54
+ CSVLibrary is deprecated, and will be removed in a future version of ndr_import.
55
+ Please use standard functionality provided by Ruby's CSV library (#{additional_message}).
56
+ MESSAGE
57
+ end
36
58
  end
37
59
 
38
60
  # Forward port CSV::Cell, as it is sometimes
@@ -90,14 +90,14 @@ module NdrImport
90
90
  case SafeFile.extname(path).downcase
91
91
  when '.xls'
92
92
  Roo::Excel.new(SafeFile.safepath_to_string(path))
93
- when '.xlsx'
93
+ when '.xlsm', '.xlsx'
94
94
  if @options['file_password']
95
95
  Roo::Excelx.new(StringIO.new(decrypted_file_string(path, @options['file_password'])))
96
96
  else
97
97
  Roo::Excelx.new(SafeFile.safepath_to_string(path))
98
98
  end
99
99
  else
100
- fail "Received file path with unexpected extension #{SafeFile.extname(path)}"
100
+ raise "Received file path with unexpected extension #{SafeFile.extname(path)}"
101
101
  end
102
102
  rescue Ole::Storage::FormatError => e
103
103
  # TODO: Do we need to remove the new_file after using it?
@@ -105,16 +105,14 @@ module NdrImport
105
105
  # try to load the .xls file as an .xlsx file, useful for sources like USOM
106
106
  # roo checks file extensions in file_type_check (GenericSpreadsheet),
107
107
  # so we create a duplicate file in xlsx extension
108
- if /(.*)\.xls$/.match(path)
109
- new_file_name = SafeFile.basename(path).gsub(/(.*)\.xls$/, '\1_amend.xlsx')
110
- new_file_path = SafeFile.dirname(path).join(new_file_name)
111
- copy_file(path, new_file_path)
108
+ raise e.message unless /(.*)\.xls$/.match(path)
112
109
 
113
- load_workbook(new_file_path)
114
- else
115
- raise e.message
116
- end
117
- rescue => e
110
+ new_file_name = SafeFile.basename(path).gsub(/(.*)\.xls$/, '\1_amend.xlsx')
111
+ new_file_path = SafeFile.dirname(path).join(new_file_name)
112
+ copy_file(path, new_file_path)
113
+
114
+ load_workbook(new_file_path)
115
+ rescue RuntimeError, ::Zip::Error => e
118
116
  raise ["Unable to read the file '#{path}'", e.message].join('; ')
119
117
  end
120
118
 
@@ -133,6 +131,6 @@ module NdrImport
133
131
  end
134
132
  end
135
133
 
136
- Registry.register(Excel, 'xls', 'xlsx')
134
+ Registry.register(Excel, 'xls', 'xlsm', 'xlsx')
137
135
  end
138
136
  end
@@ -32,11 +32,11 @@ module NdrImport
32
32
  return enum_for(:delimited_rows, path, col_sep, liberal) unless block_given?
33
33
 
34
34
  safe_path = SafeFile.safepath_to_string(path)
35
- encodings = determine_encodings!(safe_path, col_sep, liberal)
35
+ options = determine_encodings!(safe_path, col_sep, liberal)
36
36
 
37
- # By now, we know `encodings` should let us read the whole
37
+ # By now, we know `options` should let us read the whole
38
38
  # file succesfully; if there are problems, we should crash.
39
- CSVLibrary.foreach(safe_path, encodings) do |line|
39
+ CSV.foreach(safe_path, options.delete(:mode), **options) do |line|
40
40
  yield line.map(&:to_s)
41
41
  end
42
42
  end
@@ -46,7 +46,7 @@ module NdrImport
46
46
  # Derive the source encoding by trying all supported encodings.
47
47
  # Returns first set of working options, or raises if none could be found.
48
48
  def determine_encodings!(safe_path, col_sep, liberal)
49
- # delimiter encoding => # FasterCSV encoding string
49
+ # delimiter encoding => # CSV encoding string
50
50
  supported_encodings = {
51
51
  'UTF-8' => 'r:bom|utf-8',
52
52
  'Windows-1252' => 'r:windows-1252:utf-8'
@@ -67,14 +67,13 @@ module NdrImport
67
67
  begin
68
68
  options = {
69
69
  col_sep: (col_sep || ',').force_encoding(delimiter_encoding),
70
- liberal_parsing: liberal,
71
- mode: access_mode
70
+ liberal_parsing: liberal
72
71
  }
73
72
 
74
73
  row_num = 0
75
74
  # Iterate through the file; if we reach the end, this encoding worked:
76
- CSVLibrary.foreach(safe_path, options) { |_line| row_num += 1 }
77
- return options
75
+ CSV.foreach(safe_path, access_mode, **options) { |_line| row_num += 1 }
76
+ return options.merge(mode: access_mode)
78
77
  rescue ArgumentError => e
79
78
  next if e.message =~ /invalid byte sequence/ # This encoding didn't work
80
79
  raise(e)
@@ -1,3 +1,4 @@
1
+ require 'ndr_import/xml/control_char_escaper'
1
2
  require 'ndr_support/safe_file'
2
3
  require 'ndr_support/utf8_encoding'
3
4
 
@@ -10,16 +11,21 @@ module NdrImport
10
11
 
11
12
  private
12
13
 
13
- def read_xml_file(path)
14
- file_data = SafeFile.new(path).read
14
+ # By default, escapes any control characters found in the XML
15
+ # - their use is forbidden in XML 1.0, and highly discouraged
16
+ # in XML 1.1; any found are most likely to be erroneous.
17
+ def read_xml_file(path, preserve_control_chars: false)
18
+ file_data = ensure_utf8!(SafeFile.read(path))
15
19
 
16
20
  require 'nokogiri'
17
21
 
18
- doc = Nokogiri::XML((ensure_utf8! file_data)) do |config|
19
- config.huge
22
+ doc = nil
23
+
24
+ escaping_control_chars_if_necessary(preserve_control_chars, file_data) do
25
+ doc = Nokogiri::XML(file_data, &:huge)
26
+ doc.encoding = 'UTF-8'
27
+ emulate_strict_mode_fatal_check!(doc)
20
28
  end
21
- doc.encoding = 'UTF-8'
22
- emulate_strict_mode_fatal_check!(doc)
23
29
 
24
30
  doc
25
31
  end
@@ -40,11 +46,27 @@ module NdrImport
40
46
  end
41
47
 
42
48
  return unless fatal_errors.any?
49
+
43
50
  raise Nokogiri::XML::SyntaxError, <<~MSG
44
51
  The file had #{fatal_errors.length} fatal error(s)!"
45
52
  #{fatal_errors.join("\n")}
46
53
  MSG
47
54
  end
55
+
56
+ def escaping_control_chars_if_necessary(preserve_control_chars, file_data)
57
+ return yield if preserve_control_chars
58
+
59
+ tried_escaping = false
60
+ begin
61
+ yield
62
+ rescue Nokogiri::XML::SyntaxError => e
63
+ raise e if tried_escaping
64
+
65
+ NdrImport::Xml::ControlCharEscaper.new(file_data).escape!
66
+ tried_escaping = true
67
+ retry
68
+ end
69
+ end
48
70
  end
49
71
  end
50
72
  end
@@ -16,7 +16,7 @@ module NdrImport
16
16
 
17
17
  include UTF8Encoding
18
18
 
19
- TABULAR_ONLY_OPTIONS = %w[delimiter liberal_parsing tablename_pattern
19
+ TABULAR_ONLY_OPTIONS = %w[delimiter last_data_column liberal_parsing tablename_pattern
20
20
  header_lines footer_lines xml_record_xpath].freeze
21
21
 
22
22
  NON_TABULAR_OPTIONS = %w[capture_end_line capture_start_line start_line_pattern
@@ -10,8 +10,9 @@ module NdrImport
10
10
  include NdrImport::Mapper
11
11
 
12
12
  def self.all_valid_options
13
- %w[canonical_name delimiter liberal_parsing filename_pattern file_password tablename_pattern
14
- header_lines footer_lines format klass columns xml_record_xpath row_identifier]
13
+ %w[canonical_name delimiter liberal_parsing filename_pattern file_password last_data_column
14
+ tablename_pattern header_lines footer_lines format klass columns xml_record_xpath
15
+ row_identifier]
15
16
  end
16
17
 
17
18
  def all_valid_options
@@ -50,8 +51,9 @@ module NdrImport
50
51
  @header_best_guess = nil
51
52
  @notifier.try(:started)
52
53
 
54
+ last_col = last_column_to_transform
53
55
  skip_footer_lines(lines, footer_lines).each do |line|
54
- process_line(line, &block)
56
+ line.is_a?(Array) ? process_line(line[0..last_col], &block) : process_line(line, &block)
55
57
  end
56
58
 
57
59
  @notifier.try(:finished)
@@ -226,5 +228,26 @@ module NdrImport
226
228
  def column_names(column_mappings)
227
229
  column_mappings.map { |c| (c['column'] || c['standard_mapping']).downcase }
228
230
  end
231
+
232
+ # If specified in the mapping, stop transforming data at a given index (column)
233
+ def last_column_to_transform
234
+ return -1 if last_data_column.nil?
235
+ return last_data_column - 1 if last_data_column.is_a?(Integer)
236
+
237
+ error = "Unknown 'last_data_column' format: #{last_data_column} " \
238
+ "(#{last_data_column.class})"
239
+ raise error unless last_data_column.is_a?(String) && last_data_column =~ /\A[A-Z]+\z/i
240
+
241
+ # If it's an excel column label (eg 'K', 'AF', 'DDE'), convert it to an index
242
+ index_from_column_label
243
+ end
244
+
245
+ def index_from_column_label
246
+ alphabet_index_hash = ('A'..'Z').map.with_index.to_h
247
+ index = last_data_column.upcase.chars.inject(0) do |char_index, char|
248
+ (char_index * 26) + (alphabet_index_hash[char] + 1)
249
+ end
250
+ index - 1
251
+ end
229
252
  end # class Table
230
253
  end
@@ -7,11 +7,30 @@ module NdrImport
7
7
  # complexity of enumerating over files and tables (which should be universally useful).
8
8
  # It is assumed that the host module/class defines `unzip_path`.
9
9
  module UniversalImporterHelper
10
+ # Helper class to allow multiple source enumerators to contribute to one overall table.
11
+ class TableEnumProxy
12
+ include Enumerable
13
+
14
+ def initialize
15
+ @table_enums = []
16
+ end
17
+
18
+ def add_table_enum(table_enum)
19
+ @table_enums << table_enum
20
+ end
21
+
22
+ def each(&block)
23
+ return enum_for(:each) unless block
24
+
25
+ @table_enums.each { |table_enum| table_enum.each(&block) }
26
+ end
27
+ end
28
+
10
29
  def table_enumerators(filename)
11
- table_enumerators = {}
30
+ table_enumerators = Hash.new { |hash, key| hash[key] = TableEnumProxy.new }
12
31
 
13
32
  extract(filename).each do |table, rows|
14
- table_enumerators[table.canonical_name] = table.transform(rows)
33
+ table_enumerators[table.canonical_name].add_table_enum table.transform(rows)
15
34
  end
16
35
 
17
36
  table_enumerators
@@ -29,9 +48,7 @@ module NdrImport
29
48
  def extract(source_file, &block)
30
49
  return enum_for(:extract, source_file) unless block
31
50
 
32
- files = NdrImport::File::Registry.files(source_file,
33
- 'unzip_path' => unzip_path)
34
- files.each do |filename|
51
+ NdrImport::File::Registry.files(source_file, 'unzip_path' => unzip_path).each do |filename|
35
52
  # now at the individual file level, can we find the table mapping?
36
53
  table_mapping = get_table_mapping(filename, nil)
37
54
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
  # This stores the current version of the NdrImport gem
3
3
  module NdrImport
4
- VERSION = '9.0.2'.freeze
4
+ VERSION = '10.1.1'
5
5
  end
@@ -0,0 +1,51 @@
1
+ require 'ndr_support/utf8_encoding'
2
+
3
+ module NdrImport
4
+ module Xml
5
+ # A class to remove control characters, and XML entities representing them
6
+ class ControlCharEscaper
7
+ include UTF8Encoding
8
+
9
+ # Matches XML character reference entities
10
+ CHARACTER_REFERENCES = /&#(?:(?<decimal>\d+)|x(?<hex>\h+));/.freeze
11
+
12
+ attr_reader :data
13
+
14
+ def initialize(data)
15
+ @data = data
16
+ end
17
+
18
+ def escape!
19
+ unescape_control_char_references!(data)
20
+ escape_control_chars!(data)
21
+ end
22
+
23
+ private
24
+
25
+ def unescape_control_char_references!(data)
26
+ data.gsub!(CHARACTER_REFERENCES) do |reference|
27
+ char = try_to_extract_char_from(Regexp.last_match)
28
+
29
+ if char&.match?(CONTROL_CHARACTERS)
30
+ escape_control_chars!(char)
31
+ else
32
+ reference
33
+ end
34
+ end
35
+ end
36
+
37
+ def try_to_extract_char_from(match)
38
+ if match.nil?
39
+ nil
40
+ elsif match[:decimal]
41
+ match[:decimal].to_i(10).chr
42
+ elsif match[:hex]
43
+ match[:hex].to_i(16).chr
44
+ end
45
+ rescue RangeError
46
+ # Return everything if the match was against junk:
47
+ match.to_s
48
+ end
49
+ end
50
+ end
51
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ndr_import
3
3
  version: !ruby/object:Gem::Version
4
- version: 9.0.2
4
+ version: 10.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - NCRS Development Team
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-08-14 00:00:00.000000000 Z
11
+ date: 2021-03-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activemodel
@@ -365,6 +365,9 @@ executables:
365
365
  extensions: []
366
366
  extra_rdoc_files: []
367
367
  files:
368
+ - ".github/CODEOWNERS"
369
+ - ".github/workflows/lint.yml"
370
+ - ".github/workflows/test.yml"
368
371
  - ".gitignore"
369
372
  - ".hound.yml"
370
373
  - ".rubocop.yml"
@@ -424,6 +427,7 @@ files:
424
427
  - lib/ndr_import/universal_importer_helper.rb
425
428
  - lib/ndr_import/unmapped_data_error.rb
426
429
  - lib/ndr_import/version.rb
430
+ - lib/ndr_import/xml/control_char_escaper.rb
427
431
  - lib/ndr_import/xml/table.rb
428
432
  - ndr_import.gemspec
429
433
  homepage: https://github.com/PublicHealthEngland/ndr_import