ndr_import 9.0.0 → 10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/CODEOWNERS +2 -0
- data/.github/workflows/lint.yml +23 -0
- data/.github/workflows/test.yml +72 -0
- data/.gitignore +1 -1
- data/CHANGELOG.md +31 -0
- data/README.md +1 -2
- data/code_safety.yml +121 -33
- data/lib/ndr_import/csv_library.rb +25 -3
- data/lib/ndr_import/file/excel.rb +10 -12
- data/lib/ndr_import/helpers/file/delimited.rb +7 -8
- data/lib/ndr_import/helpers/file/xml.rb +19 -6
- data/lib/ndr_import/helpers/file/xml_streaming.rb +2 -0
- data/lib/ndr_import/universal_importer_helper.rb +23 -2
- data/lib/ndr_import/version.rb +1 -1
- data/ndr_import.gemspec +4 -4
- metadata +17 -14
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b98a9642fed703edb02ce4bc18d5c15869f1dd10d0e072866a84a0b6b9529141
|
|
4
|
+
data.tar.gz: 8c4aa215b0e87ca31676a96c703789bfb93d22bf3fa32b44ee7169a4ccfa4607
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: da1b4ae10264ac9a5ff7d09832c979f81608c6e428052bbe1dc403f5d5cc0d9c1f44348b59d02be340cc48a277cf2bfe84f5fd80560f7ebc3b8379b529f65a4f
|
|
7
|
+
data.tar.gz: b969c50b4aec9687571c53f1b49b6798ff65ef2c1d4edeb36ab535e8af59a2387fe8d4a5941ef9c1257686e59c990b30de69a1423779dd8bf77ccfc94bd04786
|
data/.github/CODEOWNERS
ADDED
|
data/.github/workflows/lint.yml
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
name: Lint
|
|
2
|
+
|
|
3
|
+
on: [pull_request]
|
|
4
|
+
|
|
5
|
+
jobs:
|
|
6
|
+
rubocop:
|
|
7
|
+
name: RuboCop
|
|
8
|
+
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v2
|
|
13
|
+
with:
|
|
14
|
+
fetch-depth: 0 # fetch everything
|
|
15
|
+
- name: Set up Ruby
|
|
16
|
+
uses: ruby/setup-ruby@v1
|
|
17
|
+
with:
|
|
18
|
+
ruby-version: 3.0
|
|
19
|
+
- name: Install dependencies
|
|
20
|
+
run: bundle install
|
|
21
|
+
- name: Run RuboCop against BASE..HEAD changes
|
|
22
|
+
run: bundle exec rake rubocop:diff origin/${GITHUB_BASE_REF#*/}
|
|
23
|
+
|
|
data/.github/workflows/test.yml
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
name: Test
|
|
2
|
+
|
|
3
|
+
on: [push]
|
|
4
|
+
|
|
5
|
+
jobs:
|
|
6
|
+
test:
|
|
7
|
+
strategy:
|
|
8
|
+
fail-fast: false
|
|
9
|
+
matrix:
|
|
10
|
+
ruby-version:
|
|
11
|
+
- 2.6
|
|
12
|
+
- 2.7
|
|
13
|
+
- 3.0
|
|
14
|
+
gemfile:
|
|
15
|
+
- gemfiles/Gemfile.rails52
|
|
16
|
+
- gemfiles/Gemfile.rails60
|
|
17
|
+
|
|
18
|
+
name: Ruby ${{ matrix.ruby-version }} / Bundle ${{ matrix.gemfile }}
|
|
19
|
+
|
|
20
|
+
runs-on: ubuntu-latest
|
|
21
|
+
|
|
22
|
+
env:
|
|
23
|
+
BUNDLE_GEMFILE: ${{ matrix.gemfile }}
|
|
24
|
+
|
|
25
|
+
steps:
|
|
26
|
+
- uses: actions/checkout@v2
|
|
27
|
+
- name: Set up Ruby
|
|
28
|
+
uses: ruby/setup-ruby@v1
|
|
29
|
+
with:
|
|
30
|
+
ruby-version: ${{ matrix.ruby-version }}
|
|
31
|
+
- name: Install dependencies
|
|
32
|
+
run: bundle install
|
|
33
|
+
- name: Run tests
|
|
34
|
+
run: bundle exec rake
|
|
35
|
+
|
|
36
|
+
# A utility job upon which Branch Protection can depend,
|
|
37
|
+
# thus remaining agnostic of the matrix.
|
|
38
|
+
test_matrix:
|
|
39
|
+
if: ${{ always() }}
|
|
40
|
+
runs-on: ubuntu-latest
|
|
41
|
+
name: Matrix
|
|
42
|
+
needs: test
|
|
43
|
+
steps:
|
|
44
|
+
- name: Check build matrix status
|
|
45
|
+
if: ${{ needs.test.result != 'success' }}
|
|
46
|
+
run: exit 1
|
|
47
|
+
|
|
48
|
+
notify:
|
|
49
|
+
# Run only on master, but regardless of whether tests pass:
|
|
50
|
+
if: ${{ always() && github.ref == 'refs/heads/master' }}
|
|
51
|
+
|
|
52
|
+
needs: test_matrix
|
|
53
|
+
|
|
54
|
+
runs-on: ubuntu-latest
|
|
55
|
+
|
|
56
|
+
steps:
|
|
57
|
+
- uses: 8398a7/action-slack@v3
|
|
58
|
+
with:
|
|
59
|
+
status: custom
|
|
60
|
+
fields: workflow,commit,author
|
|
61
|
+
custom_payload: |
|
|
62
|
+
{
|
|
63
|
+
channel: 'C7FQWGDHP',
|
|
64
|
+
username: 'CI – ' + '${{ github.repository }}'.split('/')[1],
|
|
65
|
+
icon_emoji: ':hammer_and_wrench:',
|
|
66
|
+
attachments: [{
|
|
67
|
+
color: '${{ needs.test_matrix.result }}' === 'success' ? 'good' : '${{ needs.test_matrix.result }}' === 'failure' ? 'danger' : 'warning',
|
|
68
|
+
text: `${process.env.AS_WORKFLOW} against \`${{ github.ref }}\` (${process.env.AS_COMMIT}) for ${{ github.actor }} resulted in *${{ needs.test_matrix.result }}*.`
|
|
69
|
+
}]
|
|
70
|
+
}
|
|
71
|
+
env:
|
|
72
|
+
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
|
data/.gitignore
CHANGED
data/CHANGELOG.md
CHANGED
|
@@ -1,6 +1,37 @@
|
|
|
1
1
|
## [Unreleased]
|
|
2
2
|
*no unreleased changes*
|
|
3
3
|
|
|
4
|
+
## 10.0 / 2021-02-22
|
|
5
|
+
### Changed
|
|
6
|
+
* By default, escape any control characters found in XML (#60)
|
|
7
|
+
|
|
8
|
+
## 9.1.0 / 2021-02-01
|
|
9
|
+
### Added
|
|
10
|
+
* `CSVLibrary` is now deprecated.
|
|
11
|
+
* Handle xlsm files
|
|
12
|
+
|
|
13
|
+
### Fixed
|
|
14
|
+
* Fix `CSVLibrary.foreach` on Ruby 3.0
|
|
15
|
+
* Updated jekyll bundle
|
|
16
|
+
|
|
17
|
+
## 9.0.3 / 2021-01-04
|
|
18
|
+
### Fixed
|
|
19
|
+
* Address issue importing multiple files against the same table (#54)
|
|
20
|
+
|
|
21
|
+
### Changed
|
|
22
|
+
* ensure keyword args are properly splatted for ruby 2.7
|
|
23
|
+
|
|
24
|
+
### Added
|
|
25
|
+
* Ruby 2.7 to travis matrix
|
|
26
|
+
|
|
27
|
+
## 9.0.2 / 2020-08-14
|
|
28
|
+
### Changed
|
|
29
|
+
* Configure Nokogiri with HUGE for large xml files
|
|
30
|
+
|
|
31
|
+
## 9.0.1 / 2020-03-26
|
|
32
|
+
### Fixed
|
|
33
|
+
* bumps to `nokogiri` / `spreadsheet` / `rubyzip` dependencies
|
|
34
|
+
|
|
4
35
|
## 9.0.0 / 2019-07-31
|
|
5
36
|
### Changed
|
|
6
37
|
* `File::Xml` will now stream XML files by default. Use `slurp: true` for the old behaviour. (#43)
|
data/README.md
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
# NdrImport [](https://github.com/publichealthengland/ndr_import/actions?query=workflow%3Atest) [](https://rubygems.org/gems/ndr_import) [](https://www.rubydoc.info/gems/ndr_import)
|
|
3
2
|
This is the Public Health England (PHE) National Disease Registers (NDR) Import ETL ruby gem, providing:
|
|
4
3
|
|
|
5
4
|
1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), .xls(x) spreadsheets, .doc(x) word documents, PDF, PDF AcroForms, XML, 7-Zip and Zip files.
|
data/code_safety.yml
CHANGED
|
@@ -1,9 +1,21 @@
|
|
|
1
1
|
---
|
|
2
2
|
file safety:
|
|
3
|
+
".github/CODEOWNERS":
|
|
4
|
+
comments:
|
|
5
|
+
reviewed_by: ollietulloch
|
|
6
|
+
safe_revision: b64ff21375dcde2b8fefe622ee9861f0fea21487
|
|
7
|
+
".github/workflows/lint.yml":
|
|
8
|
+
comments:
|
|
9
|
+
reviewed_by: ollietulloch
|
|
10
|
+
safe_revision: b64ff21375dcde2b8fefe622ee9861f0fea21487
|
|
11
|
+
".github/workflows/test.yml":
|
|
12
|
+
comments:
|
|
13
|
+
reviewed_by: ollietulloch
|
|
14
|
+
safe_revision: b64ff21375dcde2b8fefe622ee9861f0fea21487
|
|
3
15
|
".gitignore":
|
|
4
16
|
comments: whole file re-reviewed
|
|
5
17
|
reviewed_by: josh.pencheon
|
|
6
|
-
safe_revision:
|
|
18
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
|
7
19
|
".hound.yml":
|
|
8
20
|
comments:
|
|
9
21
|
reviewed_by: timgentry
|
|
@@ -12,14 +24,10 @@ file safety:
|
|
|
12
24
|
comments:
|
|
13
25
|
reviewed_by: josh.pencheon
|
|
14
26
|
safe_revision: b09e268ff9c8349b914aa1b7ba888e1d39f97e4a
|
|
15
|
-
".travis.yml":
|
|
16
|
-
comments:
|
|
17
|
-
reviewed_by: josh.pencheon
|
|
18
|
-
safe_revision: d3d9a987befeecb122a448d8d06e66d74da13fb5
|
|
19
27
|
CHANGELOG.md:
|
|
20
28
|
comments:
|
|
21
|
-
reviewed_by:
|
|
22
|
-
safe_revision:
|
|
29
|
+
reviewed_by: joshpencheon
|
|
30
|
+
safe_revision: 8ba7aae5e4839bed03ddc6837dd657ef7720e8ce
|
|
23
31
|
CODE_OF_CONDUCT.md:
|
|
24
32
|
comments:
|
|
25
33
|
reviewed_by: timgentry
|
|
@@ -38,8 +46,8 @@ file safety:
|
|
|
38
46
|
safe_revision: 5d185a0aeba6a9cd2ff5e59efadcaeec9be45d8b
|
|
39
47
|
README.md:
|
|
40
48
|
comments:
|
|
41
|
-
reviewed_by:
|
|
42
|
-
safe_revision:
|
|
49
|
+
reviewed_by: ollietulloch
|
|
50
|
+
safe_revision: b64ff21375dcde2b8fefe622ee9861f0fea21487
|
|
43
51
|
Rakefile:
|
|
44
52
|
comments:
|
|
45
53
|
reviewed_by: josh.pencheon
|
|
@@ -52,6 +60,70 @@ file safety:
|
|
|
52
60
|
comments:
|
|
53
61
|
reviewed_by: josh.pencheon
|
|
54
62
|
safe_revision: e1d967c10059e8c635452838c3f3dd2b969d9ae4
|
|
63
|
+
docs/Gemfile:
|
|
64
|
+
comments:
|
|
65
|
+
reviewed_by: josh.pencheon
|
|
66
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
|
67
|
+
docs/Gemfile.lock:
|
|
68
|
+
comments:
|
|
69
|
+
reviewed_by: ollietulloch
|
|
70
|
+
safe_revision: ea0149c7739676463a252ffd9fbe4af238762b2b
|
|
71
|
+
docs/_config.yml:
|
|
72
|
+
comments:
|
|
73
|
+
reviewed_by: josh.pencheon
|
|
74
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
|
75
|
+
docs/_includes/footer.html:
|
|
76
|
+
comments:
|
|
77
|
+
reviewed_by: josh.pencheon
|
|
78
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
|
79
|
+
docs/_includes/header.html:
|
|
80
|
+
comments:
|
|
81
|
+
reviewed_by: josh.pencheon
|
|
82
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
|
83
|
+
docs/capturing-data.md:
|
|
84
|
+
comments:
|
|
85
|
+
reviewed_by: josh.pencheon
|
|
86
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
|
87
|
+
docs/date-formats.md:
|
|
88
|
+
comments:
|
|
89
|
+
reviewed_by: josh.pencheon
|
|
90
|
+
safe_revision: fa21d6d967bf132800b456b585795beec80b08a3
|
|
91
|
+
docs/getting-started.md:
|
|
92
|
+
comments:
|
|
93
|
+
reviewed_by: josh.pencheon
|
|
94
|
+
safe_revision: fa21d6d967bf132800b456b585795beec80b08a3
|
|
95
|
+
docs/identifying-and-splitting-records.md:
|
|
96
|
+
comments:
|
|
97
|
+
reviewed_by: josh.pencheon
|
|
98
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
|
99
|
+
docs/inbuilt-cleaning-methods.md:
|
|
100
|
+
comments:
|
|
101
|
+
reviewed_by: josh.pencheon
|
|
102
|
+
safe_revision: 694b57ce14e0709fc4d31a1357f8416e98f5de91
|
|
103
|
+
docs/index.md:
|
|
104
|
+
comments:
|
|
105
|
+
reviewed_by: josh.pencheon
|
|
106
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
|
107
|
+
docs/local-code-transformation-in-yaml-mappings.md:
|
|
108
|
+
comments:
|
|
109
|
+
reviewed_by: josh.pencheon
|
|
110
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
|
111
|
+
docs/non-tabular-mappings.md:
|
|
112
|
+
comments:
|
|
113
|
+
reviewed_by: josh.pencheon
|
|
114
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
|
115
|
+
docs/priority-field-mapping.md:
|
|
116
|
+
comments:
|
|
117
|
+
reviewed_by: josh.pencheon
|
|
118
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
|
119
|
+
docs/standard-yaml-mappings.md:
|
|
120
|
+
comments:
|
|
121
|
+
reviewed_by: josh.pencheon
|
|
122
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
|
123
|
+
docs/yaml-mapping-user-guide.md:
|
|
124
|
+
comments:
|
|
125
|
+
reviewed_by: josh.pencheon
|
|
126
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
|
55
127
|
exe/pdf_acro_form_to_yaml:
|
|
56
128
|
comments:
|
|
57
129
|
reviewed_by: josh.pencheon
|
|
@@ -82,8 +154,8 @@ file safety:
|
|
|
82
154
|
safe_revision: 24d6449fd0612552f132dfbf4cada2ae28d0469e
|
|
83
155
|
lib/ndr_import/csv_library.rb:
|
|
84
156
|
comments:
|
|
85
|
-
reviewed_by:
|
|
86
|
-
safe_revision:
|
|
157
|
+
reviewed_by: ollietulloch
|
|
158
|
+
safe_revision: 6b8668967dbd42d7893a0fa5f0aa1ec1c11227e1
|
|
87
159
|
lib/ndr_import/file/acro_form.rb:
|
|
88
160
|
comments:
|
|
89
161
|
reviewed_by: josh.pencheon
|
|
@@ -106,8 +178,8 @@ file safety:
|
|
|
106
178
|
safe_revision: 897f8b648d633368cf2001d17ab89c06a12d445b
|
|
107
179
|
lib/ndr_import/file/excel.rb:
|
|
108
180
|
comments:
|
|
109
|
-
reviewed_by:
|
|
110
|
-
safe_revision:
|
|
181
|
+
reviewed_by: ollietulloch
|
|
182
|
+
safe_revision: 37482c79448bea80033f6f69d97584df330c9861
|
|
111
183
|
lib/ndr_import/file/office_file_helper.rb:
|
|
112
184
|
comments:
|
|
113
185
|
reviewed_by: josh.pencheon
|
|
@@ -150,8 +222,8 @@ file safety:
|
|
|
150
222
|
safe_revision: dfc958d44b6c58355445fa395db08a62213ee709
|
|
151
223
|
lib/ndr_import/helpers/file/delimited.rb:
|
|
152
224
|
comments:
|
|
153
|
-
reviewed_by:
|
|
154
|
-
safe_revision:
|
|
225
|
+
reviewed_by: ollietulloch
|
|
226
|
+
safe_revision: 4a5cc1d362c632fc1f9242c69982fbce33557e17
|
|
155
227
|
lib/ndr_import/helpers/file/excel.rb:
|
|
156
228
|
comments:
|
|
157
229
|
reviewed_by: joshpencheon
|
|
@@ -166,13 +238,13 @@ file safety:
|
|
|
166
238
|
safe_revision: 45da71ebd3acbc0fe53755bcd75483ba17cb6924
|
|
167
239
|
lib/ndr_import/helpers/file/xml.rb:
|
|
168
240
|
comments:
|
|
169
|
-
reviewed_by:
|
|
170
|
-
safe_revision:
|
|
241
|
+
reviewed_by: joshpencheon
|
|
242
|
+
safe_revision: 3947f13e0cbd17f449eba292ad343eeb82116fe9
|
|
171
243
|
lib/ndr_import/helpers/file/xml_streaming.rb:
|
|
172
244
|
comments: uses SafePath and Shellwords when accessing filesystem, or making system
|
|
173
245
|
calls
|
|
174
246
|
reviewed_by: josh.pencheon
|
|
175
|
-
safe_revision:
|
|
247
|
+
safe_revision: 55e502bb4445cb8b985e530e8eb26d92b574ded9
|
|
176
248
|
lib/ndr_import/helpers/file/zip.rb:
|
|
177
249
|
comments:
|
|
178
250
|
reviewed_by: timgentry
|
|
@@ -227,16 +299,16 @@ file safety:
|
|
|
227
299
|
safe_revision: a69d4a57ddcf13cdc13c27bd2eb91a395fa7ea36
|
|
228
300
|
lib/ndr_import/universal_importer_helper.rb:
|
|
229
301
|
comments:
|
|
230
|
-
reviewed_by:
|
|
231
|
-
safe_revision:
|
|
302
|
+
reviewed_by: ollietulloch
|
|
303
|
+
safe_revision: ee2e74e4ceda4ff48cbda6872a6bdf0874212c21
|
|
232
304
|
lib/ndr_import/unmapped_data_error.rb:
|
|
233
305
|
comments:
|
|
234
306
|
reviewed_by: josh.pencheon
|
|
235
307
|
safe_revision: 5cd2cd0b3a1e254d30d4acc28c6731825a1f84f5
|
|
236
308
|
lib/ndr_import/version.rb:
|
|
237
309
|
comments: another check?
|
|
238
|
-
reviewed_by:
|
|
239
|
-
safe_revision:
|
|
310
|
+
reviewed_by: joshpencheon
|
|
311
|
+
safe_revision: 8ba7aae5e4839bed03ddc6837dd657ef7720e8ce
|
|
240
312
|
lib/ndr_import/xml/table.rb:
|
|
241
313
|
comments:
|
|
242
314
|
reviewed_by: josh.pencheon
|
|
@@ -244,7 +316,11 @@ file safety:
|
|
|
244
316
|
ndr_import.gemspec:
|
|
245
317
|
comments:
|
|
246
318
|
reviewed_by: josh.pencheon
|
|
247
|
-
safe_revision:
|
|
319
|
+
safe_revision: 95e6ee9997d06471fe6f2f169c3c701471086371
|
|
320
|
+
test/csv_library_test.rb:
|
|
321
|
+
comments:
|
|
322
|
+
reviewed_by: ollietulloch
|
|
323
|
+
safe_revision: 6b8668967dbd42d7893a0fa5f0aa1ec1c11227e1
|
|
248
324
|
test/file/acro_form_test.rb:
|
|
249
325
|
comments:
|
|
250
326
|
reviewed_by: josh.pencheon
|
|
@@ -263,16 +339,16 @@ file safety:
|
|
|
263
339
|
safe_revision: a69d4a57ddcf13cdc13c27bd2eb91a395fa7ea36
|
|
264
340
|
test/file/excel_test.rb:
|
|
265
341
|
comments:
|
|
266
|
-
reviewed_by:
|
|
267
|
-
safe_revision:
|
|
342
|
+
reviewed_by: ollietulloch
|
|
343
|
+
safe_revision: 85a080deaa93e4220ad1bf566f29cbdac9b31c0f
|
|
268
344
|
test/file/pdf_test.rb:
|
|
269
345
|
comments:
|
|
270
346
|
reviewed_by: josh.pencheon
|
|
271
347
|
safe_revision: cb24ed3ea8116730d07f74546cd6fed0738b171d
|
|
272
348
|
test/file/registry_test.rb:
|
|
273
349
|
comments:
|
|
274
|
-
reviewed_by:
|
|
275
|
-
safe_revision:
|
|
350
|
+
reviewed_by: ollietulloch
|
|
351
|
+
safe_revision: 85a080deaa93e4220ad1bf566f29cbdac9b31c0f
|
|
276
352
|
test/file/seven_zip_test.rb:
|
|
277
353
|
comments:
|
|
278
354
|
reviewed_by: josh.pencheon
|
|
@@ -319,8 +395,8 @@ file safety:
|
|
|
319
395
|
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
|
320
396
|
test/helpers/file/xml_test.rb:
|
|
321
397
|
comments:
|
|
322
|
-
reviewed_by:
|
|
323
|
-
safe_revision:
|
|
398
|
+
reviewed_by: joshpencheon
|
|
399
|
+
safe_revision: 3947f13e0cbd17f449eba292ad343eeb82116fe9
|
|
324
400
|
test/helpers/file/zip_test.rb:
|
|
325
401
|
comments:
|
|
326
402
|
reviewed_by: josh.pencheon
|
|
@@ -431,8 +507,8 @@ file safety:
|
|
|
431
507
|
safe_revision: 71979e0a602ca5a0ce415c194f10add9959f0116
|
|
432
508
|
test/resources/malformed.xml:
|
|
433
509
|
comments:
|
|
434
|
-
reviewed_by:
|
|
435
|
-
safe_revision:
|
|
510
|
+
reviewed_by: joshpencheon
|
|
511
|
+
safe_revision: 3947f13e0cbd17f449eba292ad343eeb82116fe9
|
|
436
512
|
test/resources/malformed_pipe.csv:
|
|
437
513
|
comments:
|
|
438
514
|
reviewed_by: josh.pencheon
|
|
@@ -489,6 +565,10 @@ file safety:
|
|
|
489
565
|
comments:
|
|
490
566
|
reviewed_by: timgentry
|
|
491
567
|
safe_revision: 8c30f89f0562ab120769c166d4e93ff839c055f7
|
|
568
|
+
test/resources/sample_xlsm.xlsm:
|
|
569
|
+
comments:
|
|
570
|
+
reviewed_by: ollietulloch
|
|
571
|
+
safe_revision: 85a080deaa93e4220ad1bf566f29cbdac9b31c0f
|
|
492
572
|
test/resources/sample_xlsx.xlsx:
|
|
493
573
|
comments:
|
|
494
574
|
reviewed_by: timgentry
|
|
@@ -505,6 +585,10 @@ file safety:
|
|
|
505
585
|
comments:
|
|
506
586
|
reviewed_by: timgentry
|
|
507
587
|
safe_revision: 31fb1935f4578729d8786eea41cf0ce0a19be1cd
|
|
588
|
+
test/resources/two_files_single_table_mapping.zip:
|
|
589
|
+
comments:
|
|
590
|
+
reviewed_by: ollietulloch
|
|
591
|
+
safe_revision: 830de0f8cb139c5f61525652b424423935cfc7ac
|
|
508
592
|
test/resources/txt_file_xls_extension.xls:
|
|
509
593
|
comments:
|
|
510
594
|
reviewed_by: timgentry
|
|
@@ -537,6 +621,10 @@ file safety:
|
|
|
537
621
|
comments:
|
|
538
622
|
reviewed_by: timgentry
|
|
539
623
|
safe_revision: f755c6960182f7dd460c18866cccfdf09178e860
|
|
624
|
+
test/resources/with-control-chars.xml:
|
|
625
|
+
comments:
|
|
626
|
+
reviewed_by: joshpencheon
|
|
627
|
+
safe_revision: 3947f13e0cbd17f449eba292ad343eeb82116fe9
|
|
540
628
|
test/resources/xlsx_file_xls_extension.xls:
|
|
541
629
|
comments:
|
|
542
630
|
reviewed_by: timgentry
|
|
@@ -555,8 +643,8 @@ file safety:
|
|
|
555
643
|
safe_revision: 93ccee82fc2165d1ca2d9b03d146ae03e769ea96
|
|
556
644
|
test/universal_importer_helper_test.rb:
|
|
557
645
|
comments:
|
|
558
|
-
reviewed_by:
|
|
559
|
-
safe_revision:
|
|
646
|
+
reviewed_by: ollietulloch
|
|
647
|
+
safe_revision: 830de0f8cb139c5f61525652b424423935cfc7ac
|
|
560
648
|
test/xml/table_test.rb:
|
|
561
649
|
comments:
|
|
562
650
|
reviewed_by: josh.pencheon
|
|
data/lib/ndr_import/csv_library.rb
CHANGED
|
@@ -1,38 +1,60 @@
|
|
|
1
1
|
# This file allows us to choose the CSV library we want to use.
|
|
2
2
|
|
|
3
3
|
require 'csv'
|
|
4
|
+
require 'active_support/deprecation'
|
|
5
|
+
|
|
4
6
|
# Using relevant core CSV library.
|
|
5
|
-
CSVLibrary
|
|
7
|
+
class CSVLibrary < CSV; end
|
|
6
8
|
|
|
7
9
|
class << CSVLibrary
|
|
8
10
|
# Is the library we're using FasterCSV?
|
|
9
11
|
def fastercsv?
|
|
12
|
+
deprecate('if you desparately want fastercsv, please use it explicitly')
|
|
10
13
|
not self.const_defined?(:Reader)
|
|
11
14
|
end
|
|
12
15
|
|
|
13
16
|
# Ensure that we can pass "mode" straight through the underlying IO object
|
|
17
|
+
#
|
|
18
|
+
# Note: this could likely be refactored now, as upstream support for something
|
|
19
|
+
# very similar was added:
|
|
20
|
+
#
|
|
21
|
+
# https://github.com/ruby/csv/commit/b4edaf2cf1aa36f5c6264c07514b66739b87ceee
|
|
22
|
+
#
|
|
14
23
|
def foreach(path, **options, &block)
|
|
15
|
-
|
|
16
|
-
|
|
24
|
+
deprecate('CSV#foreach exists, with an optional `mode` argument')
|
|
25
|
+
return to_enum(__method__, path, **options) unless block
|
|
26
|
+
open(path, options.delete(:mode) || 'r', **options) do |csv|
|
|
17
27
|
csv.each(&block)
|
|
18
28
|
end
|
|
19
29
|
end
|
|
20
30
|
|
|
21
31
|
def write_csv_to_string(data)
|
|
32
|
+
deprecate('write_csv_to_string -> generate')
|
|
22
33
|
self.generate do |csv|
|
|
23
34
|
data.each { |line| csv << line }
|
|
24
35
|
end
|
|
25
36
|
end
|
|
26
37
|
|
|
27
38
|
def write_csv_to_file(data, filepath, mode = 'w')
|
|
39
|
+
deprecate('write_csv_to_file -> open')
|
|
28
40
|
self.open(filepath, mode) do |csv|
|
|
29
41
|
data.each { |line| csv << line }
|
|
30
42
|
end
|
|
31
43
|
end
|
|
32
44
|
|
|
33
45
|
def read_csv_from_file(filepath)
|
|
46
|
+
deprecate('read_csv_from_file -> read')
|
|
34
47
|
self.read(filepath)
|
|
35
48
|
end
|
|
49
|
+
|
|
50
|
+
private
|
|
51
|
+
|
|
52
|
+
def deprecate(additional_message = nil)
|
|
53
|
+
ActiveSupport::Deprecation.warn(<<~MESSAGE)
|
|
54
|
+
CSVLibrary is deprecated, and will be removed in a future version of ndr_import.
|
|
55
|
+
Please use standard functionality provided by Ruby's CSV library (#{additional_message}).
|
|
56
|
+
MESSAGE
|
|
57
|
+
end
|
|
36
58
|
end
|
|
37
59
|
|
|
38
60
|
# Forward port CSV::Cell, as it is sometimes
|
|
data/lib/ndr_import/file/excel.rb
CHANGED
|
@@ -90,14 +90,14 @@ module NdrImport
|
|
|
90
90
|
case SafeFile.extname(path).downcase
|
|
91
91
|
when '.xls'
|
|
92
92
|
Roo::Excel.new(SafeFile.safepath_to_string(path))
|
|
93
|
-
when '.xlsx'
|
|
93
|
+
when '.xlsm', '.xlsx'
|
|
94
94
|
if @options['file_password']
|
|
95
95
|
Roo::Excelx.new(StringIO.new(decrypted_file_string(path, @options['file_password'])))
|
|
96
96
|
else
|
|
97
97
|
Roo::Excelx.new(SafeFile.safepath_to_string(path))
|
|
98
98
|
end
|
|
99
99
|
else
|
|
100
|
-
|
|
100
|
+
raise "Received file path with unexpected extension #{SafeFile.extname(path)}"
|
|
101
101
|
end
|
|
102
102
|
rescue Ole::Storage::FormatError => e
|
|
103
103
|
# TODO: Do we need to remove the new_file after using it?
|
|
@@ -105,16 +105,14 @@ module NdrImport
|
|
|
105
105
|
# try to load the .xls file as an .xlsx file, useful for sources like USOM
|
|
106
106
|
# roo checks file extensions in file_type_check (GenericSpreadsheet),
|
|
107
107
|
# so we create a duplicate file in xlsx extension
|
|
108
|
-
|
|
109
|
-
new_file_name = SafeFile.basename(path).gsub(/(.*)\.xls$/, '\1_amend.xlsx')
|
|
110
|
-
new_file_path = SafeFile.dirname(path).join(new_file_name)
|
|
111
|
-
copy_file(path, new_file_path)
|
|
108
|
+
raise e.message unless /(.*)\.xls$/.match(path)
|
|
112
109
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
110
|
+
new_file_name = SafeFile.basename(path).gsub(/(.*)\.xls$/, '\1_amend.xlsx')
|
|
111
|
+
new_file_path = SafeFile.dirname(path).join(new_file_name)
|
|
112
|
+
copy_file(path, new_file_path)
|
|
113
|
+
|
|
114
|
+
load_workbook(new_file_path)
|
|
115
|
+
rescue RuntimeError, ::Zip::Error => e
|
|
118
116
|
raise ["Unable to read the file '#{path}'", e.message].join('; ')
|
|
119
117
|
end
|
|
120
118
|
|
|
@@ -133,6 +131,6 @@ module NdrImport
|
|
|
133
131
|
end
|
|
134
132
|
end
|
|
135
133
|
|
|
136
|
-
Registry.register(Excel, 'xls', 'xlsx')
|
|
134
|
+
Registry.register(Excel, 'xls', 'xlsm', 'xlsx')
|
|
137
135
|
end
|
|
138
136
|
end
|
|
data/lib/ndr_import/helpers/file/delimited.rb
CHANGED
|
@@ -32,11 +32,11 @@ module NdrImport
|
|
|
32
32
|
return enum_for(:delimited_rows, path, col_sep, liberal) unless block_given?
|
|
33
33
|
|
|
34
34
|
safe_path = SafeFile.safepath_to_string(path)
|
|
35
|
-
|
|
35
|
+
options = determine_encodings!(safe_path, col_sep, liberal)
|
|
36
36
|
|
|
37
|
-
# By now, we know `
|
|
37
|
+
# By now, we know `options` should let us read the whole
|
|
38
38
|
# file successfully; if there are problems, we should crash.
|
|
39
|
-
|
|
39
|
+
CSV.foreach(safe_path, options.delete(:mode), **options) do |line|
|
|
40
40
|
yield line.map(&:to_s)
|
|
41
41
|
end
|
|
42
42
|
end
|
|
@@ -46,7 +46,7 @@ module NdrImport
|
|
|
46
46
|
# Derive the source encoding by trying all supported encodings.
|
|
47
47
|
# Returns first set of working options, or raises if none could be found.
|
|
48
48
|
def determine_encodings!(safe_path, col_sep, liberal)
|
|
49
|
-
# delimiter encoding => #
|
|
49
|
+
# delimiter encoding => # CSV encoding string
|
|
50
50
|
supported_encodings = {
|
|
51
51
|
'UTF-8' => 'r:bom|utf-8',
|
|
52
52
|
'Windows-1252' => 'r:windows-1252:utf-8'
|
|
@@ -67,14 +67,13 @@ module NdrImport
|
|
|
67
67
|
begin
|
|
68
68
|
options = {
|
|
69
69
|
col_sep: (col_sep || ',').force_encoding(delimiter_encoding),
|
|
70
|
-
liberal_parsing: liberal
|
|
71
|
-
mode: access_mode
|
|
70
|
+
liberal_parsing: liberal
|
|
72
71
|
}
|
|
73
72
|
|
|
74
73
|
row_num = 0
|
|
75
74
|
# Iterate through the file; if we reach the end, this encoding worked:
|
|
76
|
-
|
|
77
|
-
return options
|
|
75
|
+
CSV.foreach(safe_path, access_mode, **options) { |_line| row_num += 1 }
|
|
76
|
+
return options.merge(mode: access_mode)
|
|
78
77
|
rescue ArgumentError => e
|
|
79
78
|
next if e.message =~ /invalid byte sequence/ # This encoding didn't work
|
|
80
79
|
raise(e)
|
|
data/lib/ndr_import/helpers/file/xml.rb
CHANGED
|
@@ -10,15 +10,20 @@ module NdrImport
|
|
|
10
10
|
|
|
11
11
|
private
|
|
12
12
|
|
|
13
|
-
|
|
14
|
-
|
|
13
|
+
# By default, escapes any control characters found in the XML
|
|
14
|
+
# - their use is forbidden in XML 1.0, and highly discouraged
|
|
15
|
+
# in XML 1.1; any found are most likely to be erroneous.
|
|
16
|
+
def read_xml_file(path, preserve_control_chars: false)
|
|
17
|
+
file_data = ensure_utf8!(SafeFile.read(path))
|
|
18
|
+
escape_xml_control_chars!(file_data) unless preserve_control_chars
|
|
15
19
|
|
|
16
20
|
require 'nokogiri'
|
|
17
21
|
|
|
18
|
-
Nokogiri::XML(
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
+
doc = Nokogiri::XML(file_data, &:huge)
|
|
23
|
+
doc.encoding = 'UTF-8'
|
|
24
|
+
emulate_strict_mode_fatal_check!(doc)
|
|
25
|
+
|
|
26
|
+
doc
|
|
22
27
|
end
|
|
23
28
|
|
|
24
29
|
# Nokogiri can give a `STRICT` parse option to libxml, but our friendly
|
|
@@ -37,11 +42,19 @@ module NdrImport
|
|
|
37
42
|
end
|
|
38
43
|
|
|
39
44
|
return unless fatal_errors.any?
|
|
45
|
+
|
|
40
46
|
raise Nokogiri::XML::SyntaxError, <<~MSG
|
|
41
47
|
The file had #{fatal_errors.length} fatal error(s)!"
|
|
42
48
|
#{fatal_errors.join("\n")}
|
|
43
49
|
MSG
|
|
44
50
|
end
|
|
51
|
+
|
|
52
|
+
# In place, escape out any control chars that would cause
|
|
53
|
+
# libxml to crash. Very few are allowable in XML 1.0, and
|
|
54
|
+
# remain heavily discouraged in XML 1.1.
|
|
55
|
+
def escape_xml_control_chars!(data)
|
|
56
|
+
escape_control_chars!(data)
|
|
57
|
+
end
|
|
45
58
|
end
|
|
46
59
|
end
|
|
47
60
|
end
|
|
data/lib/ndr_import/universal_importer_helper.rb
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
require 'shellwords'
|
|
2
|
+
|
|
1
3
|
require 'ndr_import/file/registry'
|
|
2
4
|
|
|
3
5
|
module NdrImport
|
|
@@ -5,11 +7,30 @@ module NdrImport
|
|
|
5
7
|
# complexity of enumerating over files and tables (which should be universally useful).
|
|
6
8
|
# It is assumed that the host module/class defines `unzip_path`.
|
|
7
9
|
module UniversalImporterHelper
|
|
10
|
+
# Helper class to allow multiple source enumerators to contribute to one overall table.
|
|
11
|
+
class TableEnumProxy
|
|
12
|
+
include Enumerable
|
|
13
|
+
|
|
14
|
+
def initialize
|
|
15
|
+
@table_enums = []
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def add_table_enum(table_enum)
|
|
19
|
+
@table_enums << table_enum
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
def each(&block)
|
|
23
|
+
return enum_for(:each) unless block
|
|
24
|
+
|
|
25
|
+
@table_enums.each { |table_enum| table_enum.each(&block) }
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
|
|
8
29
|
def table_enumerators(filename)
|
|
9
|
-
table_enumerators = {}
|
|
30
|
+
table_enumerators = Hash.new { |hash, key| hash[key] = TableEnumProxy.new }
|
|
10
31
|
|
|
11
32
|
extract(filename).each do |table, rows|
|
|
12
|
-
table_enumerators[table.canonical_name]
|
|
33
|
+
table_enumerators[table.canonical_name].add_table_enum table.transform(rows)
|
|
13
34
|
end
|
|
14
35
|
|
|
15
36
|
table_enumerators
|
data/lib/ndr_import/version.rb
CHANGED
data/ndr_import.gemspec
CHANGED
|
@@ -15,7 +15,7 @@ Gem::Specification.new do |spec|
|
|
|
15
15
|
# Specify which files should be added to the gem when it is released.
|
|
16
16
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
|
17
17
|
spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
|
|
18
|
-
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
|
18
|
+
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(docs|test|spec|features)/}) }
|
|
19
19
|
end
|
|
20
20
|
spec.files -= %w[.travis.yml] # Not needed in the gem
|
|
21
21
|
spec.bindir = 'exe'
|
|
@@ -26,7 +26,7 @@ Gem::Specification.new do |spec|
|
|
|
26
26
|
spec.add_dependency 'activesupport', '>= 5.0', '< 7'
|
|
27
27
|
spec.add_dependency 'ndr_support', '>= 5.3.2', '< 6'
|
|
28
28
|
|
|
29
|
-
spec.add_dependency 'rubyzip', '~>
|
|
29
|
+
spec.add_dependency 'rubyzip', '~> 2.0'
|
|
30
30
|
spec.add_dependency 'roo', '~> 2.0'
|
|
31
31
|
|
|
32
32
|
spec.add_dependency 'docx', '~> 0.3'
|
|
@@ -36,12 +36,12 @@ Gem::Specification.new do |spec|
|
|
|
36
36
|
spec.add_dependency 'pdf-reader', '~> 2.1'
|
|
37
37
|
spec.add_dependency 'roo-xls'
|
|
38
38
|
spec.add_dependency 'seven_zip_ruby', '~> 1.2'
|
|
39
|
-
spec.add_dependency 'spreadsheet', '1.
|
|
39
|
+
spec.add_dependency 'spreadsheet', '1.2.6'
|
|
40
40
|
|
|
41
41
|
spec.required_ruby_version = '>= 2.5'
|
|
42
42
|
|
|
43
43
|
spec.add_development_dependency 'bundler'
|
|
44
|
-
spec.add_development_dependency 'rake', '~>
|
|
44
|
+
spec.add_development_dependency 'rake', '~> 12.3', '>= 12.3.3'
|
|
45
45
|
spec.add_development_dependency 'minitest'
|
|
46
46
|
spec.add_development_dependency 'mocha'
|
|
47
47
|
spec.add_development_dependency 'ndr_dev_support', '>= 3.1.3'
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ndr_import
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version:
|
|
4
|
+
version: '10.0'
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- NCRS Development Team
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2021-02-22 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: activemodel
|
|
@@ -70,20 +70,14 @@ dependencies:
|
|
|
70
70
|
requirements:
|
|
71
71
|
- - "~>"
|
|
72
72
|
- !ruby/object:Gem::Version
|
|
73
|
-
version: '
|
|
74
|
-
- - ">="
|
|
75
|
-
- !ruby/object:Gem::Version
|
|
76
|
-
version: 1.2.2
|
|
73
|
+
version: '2.0'
|
|
77
74
|
type: :runtime
|
|
78
75
|
prerelease: false
|
|
79
76
|
version_requirements: !ruby/object:Gem::Requirement
|
|
80
77
|
requirements:
|
|
81
78
|
- - "~>"
|
|
82
79
|
- !ruby/object:Gem::Version
|
|
83
|
-
version: '
|
|
84
|
-
- - ">="
|
|
85
|
-
- !ruby/object:Gem::Version
|
|
86
|
-
version: 1.2.2
|
|
80
|
+
version: '2.0'
|
|
87
81
|
- !ruby/object:Gem::Dependency
|
|
88
82
|
name: roo
|
|
89
83
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -208,14 +202,14 @@ dependencies:
|
|
|
208
202
|
requirements:
|
|
209
203
|
- - '='
|
|
210
204
|
- !ruby/object:Gem::Version
|
|
211
|
-
version: 1.
|
|
205
|
+
version: 1.2.6
|
|
212
206
|
type: :runtime
|
|
213
207
|
prerelease: false
|
|
214
208
|
version_requirements: !ruby/object:Gem::Requirement
|
|
215
209
|
requirements:
|
|
216
210
|
- - '='
|
|
217
211
|
- !ruby/object:Gem::Version
|
|
218
|
-
version: 1.
|
|
212
|
+
version: 1.2.6
|
|
219
213
|
- !ruby/object:Gem::Dependency
|
|
220
214
|
name: bundler
|
|
221
215
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -236,14 +230,20 @@ dependencies:
|
|
|
236
230
|
requirements:
|
|
237
231
|
- - "~>"
|
|
238
232
|
- !ruby/object:Gem::Version
|
|
239
|
-
version: '
|
|
233
|
+
version: '12.3'
|
|
234
|
+
- - ">="
|
|
235
|
+
- !ruby/object:Gem::Version
|
|
236
|
+
version: 12.3.3
|
|
240
237
|
type: :development
|
|
241
238
|
prerelease: false
|
|
242
239
|
version_requirements: !ruby/object:Gem::Requirement
|
|
243
240
|
requirements:
|
|
244
241
|
- - "~>"
|
|
245
242
|
- !ruby/object:Gem::Version
|
|
246
|
-
version: '
|
|
243
|
+
version: '12.3'
|
|
244
|
+
- - ">="
|
|
245
|
+
- !ruby/object:Gem::Version
|
|
246
|
+
version: 12.3.3
|
|
247
247
|
- !ruby/object:Gem::Dependency
|
|
248
248
|
name: minitest
|
|
249
249
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -365,6 +365,9 @@ executables:
|
|
|
365
365
|
extensions: []
|
|
366
366
|
extra_rdoc_files: []
|
|
367
367
|
files:
|
|
368
|
+
- ".github/CODEOWNERS"
|
|
369
|
+
- ".github/workflows/lint.yml"
|
|
370
|
+
- ".github/workflows/test.yml"
|
|
368
371
|
- ".gitignore"
|
|
369
372
|
- ".hound.yml"
|
|
370
373
|
- ".rubocop.yml"
|