ndr_import 9.0.2 → 10.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/CODEOWNERS +2 -0
- data/.github/workflows/lint.yml +23 -0
- data/.github/workflows/test.yml +72 -0
- data/CHANGELOG.md +31 -0
- data/README.md +1 -2
- data/code_safety.yml +76 -32
- data/lib/ndr_import/csv_library.rb +25 -3
- data/lib/ndr_import/file/excel.rb +10 -12
- data/lib/ndr_import/helpers/file/delimited.rb +7 -8
- data/lib/ndr_import/helpers/file/xml.rb +28 -6
- data/lib/ndr_import/non_tabular/table.rb +1 -1
- data/lib/ndr_import/table.rb +26 -3
- data/lib/ndr_import/universal_importer_helper.rb +22 -5
- data/lib/ndr_import/version.rb +1 -1
- data/lib/ndr_import/xml/control_char_escaper.rb +51 -0
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2463ac35899a6db81e345b75b0ea10186530f559460dfca1211ba7694f52b760
|
4
|
+
data.tar.gz: 2f2cf39e959beeb3cfe6bcad033eb2c0695486dce3b640fe3de8b0c2b9b88a2f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c6eca601043ff01ebe910375a36131bdf55ebd3f18664e3db3c07180f007b0f06dc50cc3016634a12764befc1e28d621a09a7b382392f67608aa4d21c83c7f2d
|
7
|
+
data.tar.gz: dc568494bfc4b39b7ee47c7738511274cb85ad9532e380f3c27870682a1a21d2acda4d7d8d855597abdd0643b8afe0ed6e161b74f201fa53640b5c9641f895c5
|
data/.github/CODEOWNERS
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
name: Lint
|
2
|
+
|
3
|
+
on: [pull_request]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
rubocop:
|
7
|
+
name: RuboCop
|
8
|
+
|
9
|
+
runs-on: ubuntu-latest
|
10
|
+
|
11
|
+
steps:
|
12
|
+
- uses: actions/checkout@v2
|
13
|
+
with:
|
14
|
+
fetch-depth: 0 # fetch everything
|
15
|
+
- name: Set up Ruby
|
16
|
+
uses: ruby/setup-ruby@v1
|
17
|
+
with:
|
18
|
+
ruby-version: 3.0
|
19
|
+
- name: Install dependencies
|
20
|
+
run: bundle install
|
21
|
+
- name: Run RuboCop against BASE..HEAD changes
|
22
|
+
run: bundle exec rake rubocop:diff origin/${GITHUB_BASE_REF#*/}
|
23
|
+
|
@@ -0,0 +1,72 @@
|
|
1
|
+
name: Test
|
2
|
+
|
3
|
+
on: [push]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
test:
|
7
|
+
strategy:
|
8
|
+
fail-fast: false
|
9
|
+
matrix:
|
10
|
+
ruby-version:
|
11
|
+
- 2.6
|
12
|
+
- 2.7
|
13
|
+
- 3.0
|
14
|
+
gemfile:
|
15
|
+
- gemfiles/Gemfile.rails52
|
16
|
+
- gemfiles/Gemfile.rails60
|
17
|
+
|
18
|
+
name: Ruby ${{ matrix.ruby-version }} / Bundle ${{ matrix.gemfile }}
|
19
|
+
|
20
|
+
runs-on: ubuntu-latest
|
21
|
+
|
22
|
+
env:
|
23
|
+
BUNDLE_GEMFILE: ${{ matrix.gemfile }}
|
24
|
+
|
25
|
+
steps:
|
26
|
+
- uses: actions/checkout@v2
|
27
|
+
- name: Set up Ruby
|
28
|
+
uses: ruby/setup-ruby@v1
|
29
|
+
with:
|
30
|
+
ruby-version: ${{ matrix.ruby-version }}
|
31
|
+
- name: Install dependencies
|
32
|
+
run: bundle install
|
33
|
+
- name: Run tests
|
34
|
+
run: bundle exec rake
|
35
|
+
|
36
|
+
# A utility job upon which Branch Protection can depend,
|
37
|
+
# thus remaining agnostic of the matrix.
|
38
|
+
test_matrix:
|
39
|
+
if: ${{ always() }}
|
40
|
+
runs-on: ubuntu-latest
|
41
|
+
name: Matrix
|
42
|
+
needs: test
|
43
|
+
steps:
|
44
|
+
- name: Check build matrix status
|
45
|
+
if: ${{ needs.test.result != 'success' }}
|
46
|
+
run: exit 1
|
47
|
+
|
48
|
+
notify:
|
49
|
+
# Run only on master, but regardless of whether tests pass:
|
50
|
+
if: ${{ always() && github.ref == 'refs/heads/master' }}
|
51
|
+
|
52
|
+
needs: test_matrix
|
53
|
+
|
54
|
+
runs-on: ubuntu-latest
|
55
|
+
|
56
|
+
steps:
|
57
|
+
- uses: 8398a7/action-slack@v3
|
58
|
+
with:
|
59
|
+
status: custom
|
60
|
+
fields: workflow,commit,author
|
61
|
+
custom_payload: |
|
62
|
+
{
|
63
|
+
channel: 'C7FQWGDHP',
|
64
|
+
username: 'CI – ' + '${{ github.repository }}'.split('/')[1],
|
65
|
+
icon_emoji: ':hammer_and_wrench:',
|
66
|
+
attachments: [{
|
67
|
+
color: '${{ needs.test_matrix.result }}' === 'success' ? 'good' : '${{ needs.test_matrix.result }}' === 'failure' ? 'danger' : 'warning',
|
68
|
+
text: `${process.env.AS_WORKFLOW} against \`${{ github.ref }}\` (${process.env.AS_COMMIT}) for ${{ github.actor }} resulted in *${{ needs.test_matrix.result }}*.`
|
69
|
+
}]
|
70
|
+
}
|
71
|
+
env:
|
72
|
+
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
|
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,37 @@
|
|
1
1
|
## [Unreleased]
|
2
2
|
*no unreleased changes*
|
3
3
|
|
4
|
+
## 10.1.1 / 2021-03-15
|
5
|
+
### Fixed
|
6
|
+
* XML: ensure invalid control character *references* are also escaped (#64)
|
7
|
+
|
8
|
+
## 10.1.0 / 2021-03-08
|
9
|
+
### Added
|
10
|
+
* Allow optional `last_data_column` in NdrImport::Table mappings (#61)
|
11
|
+
|
12
|
+
## 10.0.0 / 2021-02-22
|
13
|
+
### Changed
|
14
|
+
* By default, escape any control characters found in XML (#60)
|
15
|
+
|
16
|
+
## 9.1.0 / 2021-02-01
|
17
|
+
### Added
|
18
|
+
* `CSVLibrary` is now deprecated.
|
19
|
+
* Handle xlsm files
|
20
|
+
|
21
|
+
### Fixed
|
22
|
+
* Fix `CSVLibrary.foreach` on Ruby 3.0
|
23
|
+
* Updated jekyll bundle
|
24
|
+
|
25
|
+
## 9.0.3 / 2021-01-04
|
26
|
+
### Fixed
|
27
|
+
* Address issue importing multiple files against the same table (#54)
|
28
|
+
|
29
|
+
### Changed
|
30
|
+
* ensure keyword args are properly splatted for ruby 2.7
|
31
|
+
|
32
|
+
### Added
|
33
|
+
* Ruby 2.7 to travis matrix
|
34
|
+
|
4
35
|
## 9.0.2 / 2020-08-14
|
5
36
|
### Changed
|
6
37
|
* Configure Nokogiri with HUGE for large xml files
|
data/README.md
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
# NdrImport [![Build Status](https://
|
2
|
-
|
1
|
+
# NdrImport [![Build Status](https://github.com/publichealthengland/ndr_import/workflows/Test/badge.svg)](https://github.com/publichealthengland/ndr_import/actions?query=workflow%3Atest) [![Gem Version](https://badge.fury.io/rb/ndr_import.svg)](https://rubygems.org/gems/ndr_import) [![Documentation](https://img.shields.io/badge/ndr_import-docs-blue.svg)](https://www.rubydoc.info/gems/ndr_import)
|
3
2
|
This is the Public Health England (PHE) National Disease Registers (NDR) Import ETL ruby gem, providing:
|
4
3
|
|
5
4
|
1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip and Zip files.
|
data/code_safety.yml
CHANGED
@@ -1,5 +1,17 @@
|
|
1
1
|
---
|
2
2
|
file safety:
|
3
|
+
".github/CODEOWNERS":
|
4
|
+
comments:
|
5
|
+
reviewed_by: ollietulloch
|
6
|
+
safe_revision: b64ff21375dcde2b8fefe622ee9861f0fea21487
|
7
|
+
".github/workflows/lint.yml":
|
8
|
+
comments:
|
9
|
+
reviewed_by: ollietulloch
|
10
|
+
safe_revision: b64ff21375dcde2b8fefe622ee9861f0fea21487
|
11
|
+
".github/workflows/test.yml":
|
12
|
+
comments:
|
13
|
+
reviewed_by: ollietulloch
|
14
|
+
safe_revision: b64ff21375dcde2b8fefe622ee9861f0fea21487
|
3
15
|
".gitignore":
|
4
16
|
comments: whole file re-reviewed
|
5
17
|
reviewed_by: josh.pencheon
|
@@ -12,14 +24,10 @@ file safety:
|
|
12
24
|
comments:
|
13
25
|
reviewed_by: josh.pencheon
|
14
26
|
safe_revision: b09e268ff9c8349b914aa1b7ba888e1d39f97e4a
|
15
|
-
".travis.yml":
|
16
|
-
comments:
|
17
|
-
reviewed_by: josh.pencheon
|
18
|
-
safe_revision: d3d9a987befeecb122a448d8d06e66d74da13fb5
|
19
27
|
CHANGELOG.md:
|
20
28
|
comments:
|
21
|
-
reviewed_by:
|
22
|
-
safe_revision:
|
29
|
+
reviewed_by: josh.pencheon
|
30
|
+
safe_revision: 47fa3633ec2e48f1ee9fb12aad03e817e73c54bf
|
23
31
|
CODE_OF_CONDUCT.md:
|
24
32
|
comments:
|
25
33
|
reviewed_by: timgentry
|
@@ -38,8 +46,8 @@ file safety:
|
|
38
46
|
safe_revision: 5d185a0aeba6a9cd2ff5e59efadcaeec9be45d8b
|
39
47
|
README.md:
|
40
48
|
comments:
|
41
|
-
reviewed_by:
|
42
|
-
safe_revision:
|
49
|
+
reviewed_by: ollietulloch
|
50
|
+
safe_revision: b64ff21375dcde2b8fefe622ee9861f0fea21487
|
43
51
|
Rakefile:
|
44
52
|
comments:
|
45
53
|
reviewed_by: josh.pencheon
|
@@ -59,7 +67,7 @@ file safety:
|
|
59
67
|
docs/Gemfile.lock:
|
60
68
|
comments:
|
61
69
|
reviewed_by: ollietulloch
|
62
|
-
safe_revision:
|
70
|
+
safe_revision: ea0149c7739676463a252ffd9fbe4af238762b2b
|
63
71
|
docs/_config.yml:
|
64
72
|
comments:
|
65
73
|
reviewed_by: josh.pencheon
|
@@ -146,8 +154,8 @@ file safety:
|
|
146
154
|
safe_revision: 24d6449fd0612552f132dfbf4cada2ae28d0469e
|
147
155
|
lib/ndr_import/csv_library.rb:
|
148
156
|
comments:
|
149
|
-
reviewed_by:
|
150
|
-
safe_revision:
|
157
|
+
reviewed_by: ollietulloch
|
158
|
+
safe_revision: 6b8668967dbd42d7893a0fa5f0aa1ec1c11227e1
|
151
159
|
lib/ndr_import/file/acro_form.rb:
|
152
160
|
comments:
|
153
161
|
reviewed_by: josh.pencheon
|
@@ -170,8 +178,8 @@ file safety:
|
|
170
178
|
safe_revision: 897f8b648d633368cf2001d17ab89c06a12d445b
|
171
179
|
lib/ndr_import/file/excel.rb:
|
172
180
|
comments:
|
173
|
-
reviewed_by:
|
174
|
-
safe_revision:
|
181
|
+
reviewed_by: ollietulloch
|
182
|
+
safe_revision: 37482c79448bea80033f6f69d97584df330c9861
|
175
183
|
lib/ndr_import/file/office_file_helper.rb:
|
176
184
|
comments:
|
177
185
|
reviewed_by: josh.pencheon
|
@@ -214,8 +222,8 @@ file safety:
|
|
214
222
|
safe_revision: dfc958d44b6c58355445fa395db08a62213ee709
|
215
223
|
lib/ndr_import/helpers/file/delimited.rb:
|
216
224
|
comments:
|
217
|
-
reviewed_by:
|
218
|
-
safe_revision:
|
225
|
+
reviewed_by: ollietulloch
|
226
|
+
safe_revision: 4a5cc1d362c632fc1f9242c69982fbce33557e17
|
219
227
|
lib/ndr_import/helpers/file/excel.rb:
|
220
228
|
comments:
|
221
229
|
reviewed_by: joshpencheon
|
@@ -230,8 +238,8 @@ file safety:
|
|
230
238
|
safe_revision: 45da71ebd3acbc0fe53755bcd75483ba17cb6924
|
231
239
|
lib/ndr_import/helpers/file/xml.rb:
|
232
240
|
comments:
|
233
|
-
reviewed_by:
|
234
|
-
safe_revision:
|
241
|
+
reviewed_by: josh.pencheon
|
242
|
+
safe_revision: 9a6cc769abce5f9bfa5b4f8bd5cda52dfe18b12b
|
235
243
|
lib/ndr_import/helpers/file/xml_streaming.rb:
|
236
244
|
comments: uses SafePath and Shellwords when accessing filesystem, or making system
|
237
245
|
calls
|
@@ -272,7 +280,7 @@ file safety:
|
|
272
280
|
lib/ndr_import/non_tabular/table.rb:
|
273
281
|
comments:
|
274
282
|
reviewed_by: josh.pencheon
|
275
|
-
safe_revision:
|
283
|
+
safe_revision: f9df064adcfd38f09d83ad8c5496c84188faed98
|
276
284
|
lib/ndr_import/non_tabular_file_helper.rb:
|
277
285
|
comments:
|
278
286
|
reviewed_by: josh.pencheon
|
@@ -288,19 +296,23 @@ file safety:
|
|
288
296
|
lib/ndr_import/table.rb:
|
289
297
|
comments: uses File.basename
|
290
298
|
reviewed_by: josh.pencheon
|
291
|
-
safe_revision:
|
299
|
+
safe_revision: 3cf7473181f7f835b3dfe7822f6833d751805eaf
|
292
300
|
lib/ndr_import/universal_importer_helper.rb:
|
293
301
|
comments:
|
294
302
|
reviewed_by: josh.pencheon
|
295
|
-
safe_revision:
|
303
|
+
safe_revision: 85869d99ae93252b7f3ef2d0a4db817c88d35c9e
|
296
304
|
lib/ndr_import/unmapped_data_error.rb:
|
297
305
|
comments:
|
298
306
|
reviewed_by: josh.pencheon
|
299
307
|
safe_revision: 5cd2cd0b3a1e254d30d4acc28c6731825a1f84f5
|
300
308
|
lib/ndr_import/version.rb:
|
301
309
|
comments: another check?
|
302
|
-
reviewed_by:
|
303
|
-
safe_revision:
|
310
|
+
reviewed_by: josh.pencheon
|
311
|
+
safe_revision: 47fa3633ec2e48f1ee9fb12aad03e817e73c54bf
|
312
|
+
lib/ndr_import/xml/control_char_escaper.rb:
|
313
|
+
comments:
|
314
|
+
reviewed_by: josh.pencheon
|
315
|
+
safe_revision: 9a6cc769abce5f9bfa5b4f8bd5cda52dfe18b12b
|
304
316
|
lib/ndr_import/xml/table.rb:
|
305
317
|
comments:
|
306
318
|
reviewed_by: josh.pencheon
|
@@ -309,6 +321,10 @@ file safety:
|
|
309
321
|
comments:
|
310
322
|
reviewed_by: josh.pencheon
|
311
323
|
safe_revision: 95e6ee9997d06471fe6f2f169c3c701471086371
|
324
|
+
test/csv_library_test.rb:
|
325
|
+
comments:
|
326
|
+
reviewed_by: ollietulloch
|
327
|
+
safe_revision: 6b8668967dbd42d7893a0fa5f0aa1ec1c11227e1
|
312
328
|
test/file/acro_form_test.rb:
|
313
329
|
comments:
|
314
330
|
reviewed_by: josh.pencheon
|
@@ -327,16 +343,16 @@ file safety:
|
|
327
343
|
safe_revision: a69d4a57ddcf13cdc13c27bd2eb91a395fa7ea36
|
328
344
|
test/file/excel_test.rb:
|
329
345
|
comments:
|
330
|
-
reviewed_by:
|
331
|
-
safe_revision:
|
346
|
+
reviewed_by: ollietulloch
|
347
|
+
safe_revision: 85a080deaa93e4220ad1bf566f29cbdac9b31c0f
|
332
348
|
test/file/pdf_test.rb:
|
333
349
|
comments:
|
334
350
|
reviewed_by: josh.pencheon
|
335
351
|
safe_revision: cb24ed3ea8116730d07f74546cd6fed0738b171d
|
336
352
|
test/file/registry_test.rb:
|
337
353
|
comments:
|
338
|
-
reviewed_by:
|
339
|
-
safe_revision:
|
354
|
+
reviewed_by: ollietulloch
|
355
|
+
safe_revision: 85a080deaa93e4220ad1bf566f29cbdac9b31c0f
|
340
356
|
test/file/seven_zip_test.rb:
|
341
357
|
comments:
|
342
358
|
reviewed_by: josh.pencheon
|
@@ -383,8 +399,8 @@ file safety:
|
|
383
399
|
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
384
400
|
test/helpers/file/xml_test.rb:
|
385
401
|
comments:
|
386
|
-
reviewed_by:
|
387
|
-
safe_revision:
|
402
|
+
reviewed_by: josh.pencheon
|
403
|
+
safe_revision: 9a6cc769abce5f9bfa5b4f8bd5cda52dfe18b12b
|
388
404
|
test/helpers/file/zip_test.rb:
|
389
405
|
comments:
|
390
406
|
reviewed_by: josh.pencheon
|
@@ -495,8 +511,8 @@ file safety:
|
|
495
511
|
safe_revision: 71979e0a602ca5a0ce415c194f10add9959f0116
|
496
512
|
test/resources/malformed.xml:
|
497
513
|
comments:
|
498
|
-
reviewed_by:
|
499
|
-
safe_revision:
|
514
|
+
reviewed_by: joshpencheon
|
515
|
+
safe_revision: 3947f13e0cbd17f449eba292ad343eeb82116fe9
|
500
516
|
test/resources/malformed_pipe.csv:
|
501
517
|
comments:
|
502
518
|
reviewed_by: josh.pencheon
|
@@ -553,6 +569,10 @@ file safety:
|
|
553
569
|
comments:
|
554
570
|
reviewed_by: timgentry
|
555
571
|
safe_revision: 8c30f89f0562ab120769c166d4e93ff839c055f7
|
572
|
+
test/resources/sample_xlsm.xlsm:
|
573
|
+
comments:
|
574
|
+
reviewed_by: ollietulloch
|
575
|
+
safe_revision: 85a080deaa93e4220ad1bf566f29cbdac9b31c0f
|
556
576
|
test/resources/sample_xlsx.xlsx:
|
557
577
|
comments:
|
558
578
|
reviewed_by: timgentry
|
@@ -569,6 +589,10 @@ file safety:
|
|
569
589
|
comments:
|
570
590
|
reviewed_by: timgentry
|
571
591
|
safe_revision: 31fb1935f4578729d8786eea41cf0ce0a19be1cd
|
592
|
+
test/resources/two_files_single_table_mapping.zip:
|
593
|
+
comments:
|
594
|
+
reviewed_by: ollietulloch
|
595
|
+
safe_revision: 830de0f8cb139c5f61525652b424423935cfc7ac
|
572
596
|
test/resources/txt_file_xls_extension.xls:
|
573
597
|
comments:
|
574
598
|
reviewed_by: timgentry
|
@@ -601,6 +625,22 @@ file safety:
|
|
601
625
|
comments:
|
602
626
|
reviewed_by: timgentry
|
603
627
|
safe_revision: f755c6960182f7dd460c18866cccfdf09178e860
|
628
|
+
test/resources/with-control-char-references-in-cdata.xml:
|
629
|
+
comments:
|
630
|
+
reviewed_by: josh.pencheon
|
631
|
+
safe_revision: 9a6cc769abce5f9bfa5b4f8bd5cda52dfe18b12b
|
632
|
+
test/resources/with-control-char-references.xml:
|
633
|
+
comments:
|
634
|
+
reviewed_by: josh.pencheon
|
635
|
+
safe_revision: 9a6cc769abce5f9bfa5b4f8bd5cda52dfe18b12b
|
636
|
+
test/resources/with-control-chars.xml:
|
637
|
+
comments:
|
638
|
+
reviewed_by: joshpencheon
|
639
|
+
safe_revision: 3947f13e0cbd17f449eba292ad343eeb82116fe9
|
640
|
+
test/resources/with-non-control-char-references.xml:
|
641
|
+
comments:
|
642
|
+
reviewed_by: josh.pencheon
|
643
|
+
safe_revision: 9a6cc769abce5f9bfa5b4f8bd5cda52dfe18b12b
|
604
644
|
test/resources/xlsx_file_xls_extension.xls:
|
605
645
|
comments:
|
606
646
|
reviewed_by: timgentry
|
@@ -612,7 +652,7 @@ file safety:
|
|
612
652
|
test/table_test.rb:
|
613
653
|
comments:
|
614
654
|
reviewed_by: josh.pencheon
|
615
|
-
safe_revision:
|
655
|
+
safe_revision: 3cf7473181f7f835b3dfe7822f6833d751805eaf
|
616
656
|
test/test_helper.rb:
|
617
657
|
comments:
|
618
658
|
reviewed_by: josh.pencheon
|
@@ -620,7 +660,11 @@ file safety:
|
|
620
660
|
test/universal_importer_helper_test.rb:
|
621
661
|
comments:
|
622
662
|
reviewed_by: josh.pencheon
|
623
|
-
safe_revision:
|
663
|
+
safe_revision: 85869d99ae93252b7f3ef2d0a4db817c88d35c9e
|
664
|
+
test/xml/control_char_escaper_test.rb:
|
665
|
+
comments:
|
666
|
+
reviewed_by: josh.pencheon
|
667
|
+
safe_revision: 9a6cc769abce5f9bfa5b4f8bd5cda52dfe18b12b
|
624
668
|
test/xml/table_test.rb:
|
625
669
|
comments:
|
626
670
|
reviewed_by: josh.pencheon
|
@@ -1,38 +1,60 @@
|
|
1
1
|
# This file allows us to choose the CSV library we want to use.
|
2
2
|
|
3
3
|
require 'csv'
|
4
|
+
require 'active_support/deprecation'
|
5
|
+
|
4
6
|
# Using relevant core CSV library.
|
5
|
-
CSVLibrary
|
7
|
+
class CSVLibrary < CSV; end
|
6
8
|
|
7
9
|
class << CSVLibrary
|
8
10
|
# Is the library we're using FasterCSV?
|
9
11
|
def fastercsv?
|
12
|
+
deprecate('if you desparately want fastercsv, please use it explicitly')
|
10
13
|
not self.const_defined?(:Reader)
|
11
14
|
end
|
12
15
|
|
13
16
|
# Ensure that we can pass "mode" straight through the underlying IO object
|
17
|
+
#
|
18
|
+
# Note: this could likely be refactored now, as upstream support for something
|
19
|
+
# very similar was added:
|
20
|
+
#
|
21
|
+
# https://github.com/ruby/csv/commit/b4edaf2cf1aa36f5c6264c07514b66739b87ceee
|
22
|
+
#
|
14
23
|
def foreach(path, **options, &block)
|
15
|
-
|
16
|
-
|
24
|
+
deprecate('CSV#foreach exists, with an optional `mode` argument')
|
25
|
+
return to_enum(__method__, path, **options) unless block
|
26
|
+
open(path, options.delete(:mode) || 'r', **options) do |csv|
|
17
27
|
csv.each(&block)
|
18
28
|
end
|
19
29
|
end
|
20
30
|
|
21
31
|
def write_csv_to_string(data)
|
32
|
+
deprecate('write_csv_to_string -> generate')
|
22
33
|
self.generate do |csv|
|
23
34
|
data.each { |line| csv << line }
|
24
35
|
end
|
25
36
|
end
|
26
37
|
|
27
38
|
def write_csv_to_file(data, filepath, mode = 'w')
|
39
|
+
deprecate('write_csv_to_file -> open')
|
28
40
|
self.open(filepath, mode) do |csv|
|
29
41
|
data.each { |line| csv << line }
|
30
42
|
end
|
31
43
|
end
|
32
44
|
|
33
45
|
def read_csv_from_file(filepath)
|
46
|
+
deprecate('read_csv_from_file -> read')
|
34
47
|
self.read(filepath)
|
35
48
|
end
|
49
|
+
|
50
|
+
private
|
51
|
+
|
52
|
+
def deprecate(additional_message = nil)
|
53
|
+
ActiveSupport::Deprecation.warn(<<~MESSAGE)
|
54
|
+
CSVLibrary is deprecated, and will be removed in a future version of ndr_import.
|
55
|
+
Please use standard functionality provided by Ruby's CSV library (#{additional_message}).
|
56
|
+
MESSAGE
|
57
|
+
end
|
36
58
|
end
|
37
59
|
|
38
60
|
# Forward port CSV::Cell, as it is sometimes
|
@@ -90,14 +90,14 @@ module NdrImport
|
|
90
90
|
case SafeFile.extname(path).downcase
|
91
91
|
when '.xls'
|
92
92
|
Roo::Excel.new(SafeFile.safepath_to_string(path))
|
93
|
-
when '.xlsx'
|
93
|
+
when '.xlsm', '.xlsx'
|
94
94
|
if @options['file_password']
|
95
95
|
Roo::Excelx.new(StringIO.new(decrypted_file_string(path, @options['file_password'])))
|
96
96
|
else
|
97
97
|
Roo::Excelx.new(SafeFile.safepath_to_string(path))
|
98
98
|
end
|
99
99
|
else
|
100
|
-
|
100
|
+
raise "Received file path with unexpected extension #{SafeFile.extname(path)}"
|
101
101
|
end
|
102
102
|
rescue Ole::Storage::FormatError => e
|
103
103
|
# TODO: Do we need to remove the new_file after using it?
|
@@ -105,16 +105,14 @@ module NdrImport
|
|
105
105
|
# try to load the .xls file as an .xlsx file, useful for sources like USOM
|
106
106
|
# roo checks file extensions in file_type_check (GenericSpreadsheet),
|
107
107
|
# so we create a duplicate file in xlsx extension
|
108
|
-
|
109
|
-
new_file_name = SafeFile.basename(path).gsub(/(.*)\.xls$/, '\1_amend.xlsx')
|
110
|
-
new_file_path = SafeFile.dirname(path).join(new_file_name)
|
111
|
-
copy_file(path, new_file_path)
|
108
|
+
raise e.message unless /(.*)\.xls$/.match(path)
|
112
109
|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
110
|
+
new_file_name = SafeFile.basename(path).gsub(/(.*)\.xls$/, '\1_amend.xlsx')
|
111
|
+
new_file_path = SafeFile.dirname(path).join(new_file_name)
|
112
|
+
copy_file(path, new_file_path)
|
113
|
+
|
114
|
+
load_workbook(new_file_path)
|
115
|
+
rescue RuntimeError, ::Zip::Error => e
|
118
116
|
raise ["Unable to read the file '#{path}'", e.message].join('; ')
|
119
117
|
end
|
120
118
|
|
@@ -133,6 +131,6 @@ module NdrImport
|
|
133
131
|
end
|
134
132
|
end
|
135
133
|
|
136
|
-
Registry.register(Excel, 'xls', 'xlsx')
|
134
|
+
Registry.register(Excel, 'xls', 'xlsm', 'xlsx')
|
137
135
|
end
|
138
136
|
end
|
@@ -32,11 +32,11 @@ module NdrImport
|
|
32
32
|
return enum_for(:delimited_rows, path, col_sep, liberal) unless block_given?
|
33
33
|
|
34
34
|
safe_path = SafeFile.safepath_to_string(path)
|
35
|
-
|
35
|
+
options = determine_encodings!(safe_path, col_sep, liberal)
|
36
36
|
|
37
|
-
# By now, we know `
|
37
|
+
# By now, we know `options` should let us read the whole
|
38
38
|
# file successfully; if there are problems, we should crash.
|
39
|
-
|
39
|
+
CSV.foreach(safe_path, options.delete(:mode), **options) do |line|
|
40
40
|
yield line.map(&:to_s)
|
41
41
|
end
|
42
42
|
end
|
@@ -46,7 +46,7 @@ module NdrImport
|
|
46
46
|
# Derive the source encoding by trying all supported encodings.
|
47
47
|
# Returns first set of working options, or raises if none could be found.
|
48
48
|
def determine_encodings!(safe_path, col_sep, liberal)
|
49
|
-
# delimiter encoding => #
|
49
|
+
# delimiter encoding => # CSV encoding string
|
50
50
|
supported_encodings = {
|
51
51
|
'UTF-8' => 'r:bom|utf-8',
|
52
52
|
'Windows-1252' => 'r:windows-1252:utf-8'
|
@@ -67,14 +67,13 @@ module NdrImport
|
|
67
67
|
begin
|
68
68
|
options = {
|
69
69
|
col_sep: (col_sep || ',').force_encoding(delimiter_encoding),
|
70
|
-
liberal_parsing: liberal
|
71
|
-
mode: access_mode
|
70
|
+
liberal_parsing: liberal
|
72
71
|
}
|
73
72
|
|
74
73
|
row_num = 0
|
75
74
|
# Iterate through the file; if we reach the end, this encoding worked:
|
76
|
-
|
77
|
-
return options
|
75
|
+
CSV.foreach(safe_path, access_mode, **options) { |_line| row_num += 1 }
|
76
|
+
return options.merge(mode: access_mode)
|
78
77
|
rescue ArgumentError => e
|
79
78
|
next if e.message =~ /invalid byte sequence/ # This encoding didn't work
|
80
79
|
raise(e)
|
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'ndr_import/xml/control_char_escaper'
|
1
2
|
require 'ndr_support/safe_file'
|
2
3
|
require 'ndr_support/utf8_encoding'
|
3
4
|
|
@@ -10,16 +11,21 @@ module NdrImport
|
|
10
11
|
|
11
12
|
private
|
12
13
|
|
13
|
-
|
14
|
-
|
14
|
+
# By default, escapes any control characters found in the XML
|
15
|
+
# - their use is forbidden in XML 1.0, and highly discouraged
|
16
|
+
# in XML 1.1; any found are most likely to be erroneous.
|
17
|
+
def read_xml_file(path, preserve_control_chars: false)
|
18
|
+
file_data = ensure_utf8!(SafeFile.read(path))
|
15
19
|
|
16
20
|
require 'nokogiri'
|
17
21
|
|
18
|
-
doc =
|
19
|
-
|
22
|
+
doc = nil
|
23
|
+
|
24
|
+
escaping_control_chars_if_necessary(preserve_control_chars, file_data) do
|
25
|
+
doc = Nokogiri::XML(file_data, &:huge)
|
26
|
+
doc.encoding = 'UTF-8'
|
27
|
+
emulate_strict_mode_fatal_check!(doc)
|
20
28
|
end
|
21
|
-
doc.encoding = 'UTF-8'
|
22
|
-
emulate_strict_mode_fatal_check!(doc)
|
23
29
|
|
24
30
|
doc
|
25
31
|
end
|
@@ -40,11 +46,27 @@ module NdrImport
|
|
40
46
|
end
|
41
47
|
|
42
48
|
return unless fatal_errors.any?
|
49
|
+
|
43
50
|
raise Nokogiri::XML::SyntaxError, <<~MSG
|
44
51
|
The file had #{fatal_errors.length} fatal error(s)!"
|
45
52
|
#{fatal_errors.join("\n")}
|
46
53
|
MSG
|
47
54
|
end
|
55
|
+
|
56
|
+
def escaping_control_chars_if_necessary(preserve_control_chars, file_data)
|
57
|
+
return yield if preserve_control_chars
|
58
|
+
|
59
|
+
tried_escaping = false
|
60
|
+
begin
|
61
|
+
yield
|
62
|
+
rescue Nokogiri::XML::SyntaxError => e
|
63
|
+
raise e if tried_escaping
|
64
|
+
|
65
|
+
NdrImport::Xml::ControlCharEscaper.new(file_data).escape!
|
66
|
+
tried_escaping = true
|
67
|
+
retry
|
68
|
+
end
|
69
|
+
end
|
48
70
|
end
|
49
71
|
end
|
50
72
|
end
|
@@ -16,7 +16,7 @@ module NdrImport
|
|
16
16
|
|
17
17
|
include UTF8Encoding
|
18
18
|
|
19
|
-
TABULAR_ONLY_OPTIONS = %w[delimiter liberal_parsing tablename_pattern
|
19
|
+
TABULAR_ONLY_OPTIONS = %w[delimiter last_data_column liberal_parsing tablename_pattern
|
20
20
|
header_lines footer_lines xml_record_xpath].freeze
|
21
21
|
|
22
22
|
NON_TABULAR_OPTIONS = %w[capture_end_line capture_start_line start_line_pattern
|
data/lib/ndr_import/table.rb
CHANGED
@@ -10,8 +10,9 @@ module NdrImport
|
|
10
10
|
include NdrImport::Mapper
|
11
11
|
|
12
12
|
def self.all_valid_options
|
13
|
-
%w[canonical_name delimiter liberal_parsing filename_pattern file_password
|
14
|
-
header_lines footer_lines format klass columns xml_record_xpath
|
13
|
+
%w[canonical_name delimiter liberal_parsing filename_pattern file_password last_data_column
|
14
|
+
tablename_pattern header_lines footer_lines format klass columns xml_record_xpath
|
15
|
+
row_identifier]
|
15
16
|
end
|
16
17
|
|
17
18
|
def all_valid_options
|
@@ -50,8 +51,9 @@ module NdrImport
|
|
50
51
|
@header_best_guess = nil
|
51
52
|
@notifier.try(:started)
|
52
53
|
|
54
|
+
last_col = last_column_to_transform
|
53
55
|
skip_footer_lines(lines, footer_lines).each do |line|
|
54
|
-
process_line(line, &block)
|
56
|
+
line.is_a?(Array) ? process_line(line[0..last_col], &block) : process_line(line, &block)
|
55
57
|
end
|
56
58
|
|
57
59
|
@notifier.try(:finished)
|
@@ -226,5 +228,26 @@ module NdrImport
|
|
226
228
|
def column_names(column_mappings)
|
227
229
|
column_mappings.map { |c| (c['column'] || c['standard_mapping']).downcase }
|
228
230
|
end
|
231
|
+
|
232
|
+
# If specified in the mapping, stop transforming data at a given index (column)
|
233
|
+
def last_column_to_transform
|
234
|
+
return -1 if last_data_column.nil?
|
235
|
+
return last_data_column - 1 if last_data_column.is_a?(Integer)
|
236
|
+
|
237
|
+
error = "Unknown 'last_data_column' format: #{last_data_column} " \
|
238
|
+
"(#{last_data_column.class})"
|
239
|
+
raise error unless last_data_column.is_a?(String) && last_data_column =~ /\A[A-Z]+\z/i
|
240
|
+
|
241
|
+
# If it's an excel column label (eg 'K', 'AF', 'DDE'), convert it to an index
|
242
|
+
index_from_column_label
|
243
|
+
end
|
244
|
+
|
245
|
+
def index_from_column_label
|
246
|
+
alphabet_index_hash = ('A'..'Z').map.with_index.to_h
|
247
|
+
index = last_data_column.upcase.chars.inject(0) do |char_index, char|
|
248
|
+
(char_index * 26) + (alphabet_index_hash[char] + 1)
|
249
|
+
end
|
250
|
+
index - 1
|
251
|
+
end
|
229
252
|
end # class Table
|
230
253
|
end
|
@@ -7,11 +7,30 @@ module NdrImport
|
|
7
7
|
# complexity of enumerating over files and tables (which should be universally useful).
|
8
8
|
# It is assumed that the host module/class defines `unzip_path`.
|
9
9
|
module UniversalImporterHelper
|
10
|
+
# Helper class to allow multiple source enumerators to contribute to one overall table.
|
11
|
+
class TableEnumProxy
|
12
|
+
include Enumerable
|
13
|
+
|
14
|
+
def initialize
|
15
|
+
@table_enums = []
|
16
|
+
end
|
17
|
+
|
18
|
+
def add_table_enum(table_enum)
|
19
|
+
@table_enums << table_enum
|
20
|
+
end
|
21
|
+
|
22
|
+
def each(&block)
|
23
|
+
return enum_for(:each) unless block
|
24
|
+
|
25
|
+
@table_enums.each { |table_enum| table_enum.each(&block) }
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
10
29
|
def table_enumerators(filename)
|
11
|
-
table_enumerators = {}
|
30
|
+
table_enumerators = Hash.new { |hash, key| hash[key] = TableEnumProxy.new }
|
12
31
|
|
13
32
|
extract(filename).each do |table, rows|
|
14
|
-
table_enumerators[table.canonical_name]
|
33
|
+
table_enumerators[table.canonical_name].add_table_enum table.transform(rows)
|
15
34
|
end
|
16
35
|
|
17
36
|
table_enumerators
|
@@ -29,9 +48,7 @@ module NdrImport
|
|
29
48
|
def extract(source_file, &block)
|
30
49
|
return enum_for(:extract, source_file) unless block
|
31
50
|
|
32
|
-
|
33
|
-
'unzip_path' => unzip_path)
|
34
|
-
files.each do |filename|
|
51
|
+
NdrImport::File::Registry.files(source_file, 'unzip_path' => unzip_path).each do |filename|
|
35
52
|
# now at the individual file level, can we find the table mapping?
|
36
53
|
table_mapping = get_table_mapping(filename, nil)
|
37
54
|
|
data/lib/ndr_import/version.rb
CHANGED
@@ -0,0 +1,51 @@
|
|
1
|
+
require 'ndr_support/utf8_encoding'
|
2
|
+
|
3
|
+
module NdrImport
|
4
|
+
module Xml
|
5
|
+
# A class to remove control characters, and XML entities representing them
|
6
|
+
class ControlCharEscaper
|
7
|
+
include UTF8Encoding
|
8
|
+
|
9
|
+
# Matches XML character reference entities
|
10
|
+
CHARACTER_REFERENCES = /&#(?:(?<decimal>\d+)|x(?<hex>\h+));/.freeze
|
11
|
+
|
12
|
+
attr_reader :data
|
13
|
+
|
14
|
+
def initialize(data)
|
15
|
+
@data = data
|
16
|
+
end
|
17
|
+
|
18
|
+
def escape!
|
19
|
+
unescape_control_char_references!(data)
|
20
|
+
escape_control_chars!(data)
|
21
|
+
end
|
22
|
+
|
23
|
+
private
|
24
|
+
|
25
|
+
def unescape_control_char_references!(data)
|
26
|
+
data.gsub!(CHARACTER_REFERENCES) do |reference|
|
27
|
+
char = try_to_extract_char_from(Regexp.last_match)
|
28
|
+
|
29
|
+
if char&.match?(CONTROL_CHARACTERS)
|
30
|
+
escape_control_chars!(char)
|
31
|
+
else
|
32
|
+
reference
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
def try_to_extract_char_from(match)
|
38
|
+
if match.nil?
|
39
|
+
nil
|
40
|
+
elsif match[:decimal]
|
41
|
+
match[:decimal].to_i(10).chr
|
42
|
+
elsif match[:hex]
|
43
|
+
match[:hex].to_i(16).chr
|
44
|
+
end
|
45
|
+
rescue RangeError
|
46
|
+
# Return everything if the match was against junk:
|
47
|
+
match.to_s
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ndr_import
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 10.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- NCRS Development Team
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-03-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activemodel
|
@@ -365,6 +365,9 @@ executables:
|
|
365
365
|
extensions: []
|
366
366
|
extra_rdoc_files: []
|
367
367
|
files:
|
368
|
+
- ".github/CODEOWNERS"
|
369
|
+
- ".github/workflows/lint.yml"
|
370
|
+
- ".github/workflows/test.yml"
|
368
371
|
- ".gitignore"
|
369
372
|
- ".hound.yml"
|
370
373
|
- ".rubocop.yml"
|
@@ -424,6 +427,7 @@ files:
|
|
424
427
|
- lib/ndr_import/universal_importer_helper.rb
|
425
428
|
- lib/ndr_import/unmapped_data_error.rb
|
426
429
|
- lib/ndr_import/version.rb
|
430
|
+
- lib/ndr_import/xml/control_char_escaper.rb
|
427
431
|
- lib/ndr_import/xml/table.rb
|
428
432
|
- ndr_import.gemspec
|
429
433
|
homepage: https://github.com/PublicHealthEngland/ndr_import
|