ndr_import 8.5.1 → 9.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/CHANGELOG.md +24 -0
- data/README.md +1 -1
- data/code_safety.yml +99 -22
- data/lib/ndr_import/file/base.rb +2 -1
- data/lib/ndr_import/file/docx.rb +4 -3
- data/lib/ndr_import/file/excel.rb +2 -2
- data/lib/ndr_import/file/xml.rb +9 -2
- data/lib/ndr_import/helpers/file/xml.rb +6 -3
- data/lib/ndr_import/helpers/file/xml_streaming.rb +183 -0
- data/lib/ndr_import/mapper.rb +1 -1
- data/lib/ndr_import/non_tabular/column_mapping.rb +2 -1
- data/lib/ndr_import/non_tabular/record.rb +4 -1
- data/lib/ndr_import/universal_importer_helper.rb +4 -1
- data/lib/ndr_import/version.rb +1 -1
- data/ndr_import.gemspec +5 -5
- metadata +15 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7dc0801c147101346c5a2e0af96ef9a658bc068088bb54f4f65970bb012e6cf5
|
4
|
+
data.tar.gz: 22823d5d0415a95eebac8151475d6eba7fdbd1744003ee0d88b1dedf86b837b2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3c29ef1ab701b94301aca343d8bf8497fda2633d120b3945dc140cb30d1b40b30bfcfb2334fe2bdd6f02d6128f1743ac42601f8129687502503e69c14d77df98
|
7
|
+
data.tar.gz: 49e739dab53bf1276655e01a91dd9253221e041c0f3241b85f362a2d429597fbb4589d14b89b1d83fed7995b043288e72f3bf1bd27d2678e6f7d2c4b1a5c3d8f
|
data/.gitignore
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,30 @@
|
|
1
1
|
## [Unreleased]
|
2
2
|
*no unreleased changes*
|
3
3
|
|
4
|
+
## 9.0.2 / 2020-08-14
|
5
|
+
### Changed
|
6
|
+
* Configure Nokogiri with HUGE for large xml files
|
7
|
+
|
8
|
+
## 9.0.1 / 2020-03-26
|
9
|
+
### Fixed
|
10
|
+
* bumps to `nokogiri` / `spreadsheet` / `rubyzip` dependencies
|
11
|
+
|
12
|
+
## 9.0.0 / 2019-07-31
|
13
|
+
### Changed
|
14
|
+
* `File::Xml` will now stream XML files by default. Use `slurp: true` for the old behaviour. (#43)
|
15
|
+
|
16
|
+
### Added
|
17
|
+
* Add `XmlStreaming` helper, for more performant handling of large XML documents with Nokogiri. (#43)
|
18
|
+
|
19
|
+
## 8.6.0 / 2019-06-07
|
20
|
+
### Added
|
21
|
+
* Allow conditional preservation of blank lines when joining lines in non-tabular data (#41)
|
22
|
+
|
23
|
+
## 8.5.2 / 2019-05-17
|
24
|
+
### Fixed
|
25
|
+
* Fixed issue with `file_password` option key as a String or Symbol
|
26
|
+
* Tempfiles now take their encoding from the incoming string/stream
|
27
|
+
|
4
28
|
## 8.5.1 / 2019-05-15
|
5
29
|
### Added
|
6
30
|
* Add data loader tools (#39)
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# NdrImport [![Build Status](https://travis-ci.org/PublicHealthEngland/ndr_import.svg?branch=master)](https://travis-ci.org/PublicHealthEngland/ndr_import) [![Gem Version](https://badge.fury.io/rb/ndr_import.svg)](https://badge.fury.io/rb/ndr_import)
|
1
|
+
# NdrImport [![Build Status](https://travis-ci.org/PublicHealthEngland/ndr_import.svg?branch=master)](https://travis-ci.org/PublicHealthEngland/ndr_import) [![Gem Version](https://badge.fury.io/rb/ndr_import.svg)](https://badge.fury.io/rb/ndr_import) [![Documentation](https://img.shields.io/badge/ndr_import-docs-blue.svg)](https://www.rubydoc.info/gems/ndr_import)
|
2
2
|
|
3
3
|
This is the Public Health England (PHE) National Disease Registers (NDR) Import ETL ruby gem, providing:
|
4
4
|
|
data/code_safety.yml
CHANGED
@@ -3,7 +3,7 @@ file safety:
|
|
3
3
|
".gitignore":
|
4
4
|
comments: whole file re-reviewed
|
5
5
|
reviewed_by: josh.pencheon
|
6
|
-
safe_revision:
|
6
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
7
7
|
".hound.yml":
|
8
8
|
comments:
|
9
9
|
reviewed_by: timgentry
|
@@ -18,8 +18,8 @@ file safety:
|
|
18
18
|
safe_revision: d3d9a987befeecb122a448d8d06e66d74da13fb5
|
19
19
|
CHANGELOG.md:
|
20
20
|
comments:
|
21
|
-
reviewed_by:
|
22
|
-
safe_revision:
|
21
|
+
reviewed_by: ollietulloch
|
22
|
+
safe_revision: aa006cd76123db2101d145a07d201dc6a709ed6e
|
23
23
|
CODE_OF_CONDUCT.md:
|
24
24
|
comments:
|
25
25
|
reviewed_by: timgentry
|
@@ -39,7 +39,7 @@ file safety:
|
|
39
39
|
README.md:
|
40
40
|
comments:
|
41
41
|
reviewed_by: josh.pencheon
|
42
|
-
safe_revision:
|
42
|
+
safe_revision: 1bc459db8970dde36e9b240b6dd08cca629664e3
|
43
43
|
Rakefile:
|
44
44
|
comments:
|
45
45
|
reviewed_by: josh.pencheon
|
@@ -52,6 +52,70 @@ file safety:
|
|
52
52
|
comments:
|
53
53
|
reviewed_by: josh.pencheon
|
54
54
|
safe_revision: e1d967c10059e8c635452838c3f3dd2b969d9ae4
|
55
|
+
docs/Gemfile:
|
56
|
+
comments:
|
57
|
+
reviewed_by: josh.pencheon
|
58
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
59
|
+
docs/Gemfile.lock:
|
60
|
+
comments:
|
61
|
+
reviewed_by: ollietulloch
|
62
|
+
safe_revision: 6f274715bb341c3070190f04f67af9500b510580
|
63
|
+
docs/_config.yml:
|
64
|
+
comments:
|
65
|
+
reviewed_by: josh.pencheon
|
66
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
67
|
+
docs/_includes/footer.html:
|
68
|
+
comments:
|
69
|
+
reviewed_by: josh.pencheon
|
70
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
71
|
+
docs/_includes/header.html:
|
72
|
+
comments:
|
73
|
+
reviewed_by: josh.pencheon
|
74
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
75
|
+
docs/capturing-data.md:
|
76
|
+
comments:
|
77
|
+
reviewed_by: josh.pencheon
|
78
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
79
|
+
docs/date-formats.md:
|
80
|
+
comments:
|
81
|
+
reviewed_by: josh.pencheon
|
82
|
+
safe_revision: fa21d6d967bf132800b456b585795beec80b08a3
|
83
|
+
docs/getting-started.md:
|
84
|
+
comments:
|
85
|
+
reviewed_by: josh.pencheon
|
86
|
+
safe_revision: fa21d6d967bf132800b456b585795beec80b08a3
|
87
|
+
docs/identifying-and-splitting-records.md:
|
88
|
+
comments:
|
89
|
+
reviewed_by: josh.pencheon
|
90
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
91
|
+
docs/inbuilt-cleaning-methods.md:
|
92
|
+
comments:
|
93
|
+
reviewed_by: josh.pencheon
|
94
|
+
safe_revision: 694b57ce14e0709fc4d31a1357f8416e98f5de91
|
95
|
+
docs/index.md:
|
96
|
+
comments:
|
97
|
+
reviewed_by: josh.pencheon
|
98
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
99
|
+
docs/local-code-transformation-in-yaml-mappings.md:
|
100
|
+
comments:
|
101
|
+
reviewed_by: josh.pencheon
|
102
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
103
|
+
docs/non-tabular-mappings.md:
|
104
|
+
comments:
|
105
|
+
reviewed_by: josh.pencheon
|
106
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
107
|
+
docs/priority-field-mapping.md:
|
108
|
+
comments:
|
109
|
+
reviewed_by: josh.pencheon
|
110
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
111
|
+
docs/standard-yaml-mappings.md:
|
112
|
+
comments:
|
113
|
+
reviewed_by: josh.pencheon
|
114
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
115
|
+
docs/yaml-mapping-user-guide.md:
|
116
|
+
comments:
|
117
|
+
reviewed_by: josh.pencheon
|
118
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
55
119
|
exe/pdf_acro_form_to_yaml:
|
56
120
|
comments:
|
57
121
|
reviewed_by: josh.pencheon
|
@@ -94,8 +158,8 @@ file safety:
|
|
94
158
|
safe_revision: 5cd2cd0b3a1e254d30d4acc28c6731825a1f84f5
|
95
159
|
lib/ndr_import/file/base.rb:
|
96
160
|
comments:
|
97
|
-
reviewed_by:
|
98
|
-
safe_revision:
|
161
|
+
reviewed_by: josh.pencheon
|
162
|
+
safe_revision: c3183e522bce50008df576ceb47fe4761ab8f966
|
99
163
|
lib/ndr_import/file/delimited.rb:
|
100
164
|
comments:
|
101
165
|
reviewed_by: josh.pencheon
|
@@ -103,11 +167,11 @@ file safety:
|
|
103
167
|
lib/ndr_import/file/docx.rb:
|
104
168
|
comments:
|
105
169
|
reviewed_by: josh.pencheon
|
106
|
-
safe_revision:
|
170
|
+
safe_revision: 897f8b648d633368cf2001d17ab89c06a12d445b
|
107
171
|
lib/ndr_import/file/excel.rb:
|
108
172
|
comments:
|
109
173
|
reviewed_by: josh.pencheon
|
110
|
-
safe_revision:
|
174
|
+
safe_revision: c3183e522bce50008df576ceb47fe4761ab8f966
|
111
175
|
lib/ndr_import/file/office_file_helper.rb:
|
112
176
|
comments:
|
113
177
|
reviewed_by: josh.pencheon
|
@@ -139,7 +203,7 @@ file safety:
|
|
139
203
|
lib/ndr_import/file/xml.rb:
|
140
204
|
comments:
|
141
205
|
reviewed_by: josh.pencheon
|
142
|
-
safe_revision:
|
206
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
143
207
|
lib/ndr_import/file/zip.rb:
|
144
208
|
comments:
|
145
209
|
reviewed_by: timgentry
|
@@ -166,8 +230,13 @@ file safety:
|
|
166
230
|
safe_revision: 45da71ebd3acbc0fe53755bcd75483ba17cb6924
|
167
231
|
lib/ndr_import/helpers/file/xml.rb:
|
168
232
|
comments:
|
233
|
+
reviewed_by: ollietulloch
|
234
|
+
safe_revision: 4d337bd233f7e60cf9d363c92400f21269a28da7
|
235
|
+
lib/ndr_import/helpers/file/xml_streaming.rb:
|
236
|
+
comments: uses SafePath and Shellwords when accessing filesystem, or making system
|
237
|
+
calls
|
169
238
|
reviewed_by: josh.pencheon
|
170
|
-
safe_revision:
|
239
|
+
safe_revision: 55e502bb4445cb8b985e530e8eb26d92b574ded9
|
171
240
|
lib/ndr_import/helpers/file/zip.rb:
|
172
241
|
comments:
|
173
242
|
reviewed_by: timgentry
|
@@ -175,7 +244,7 @@ file safety:
|
|
175
244
|
lib/ndr_import/mapper.rb:
|
176
245
|
comments: Writes to a Tempfile, but cleans up. Ruby will respect TMP_DIR
|
177
246
|
reviewed_by: josh.pencheon
|
178
|
-
safe_revision:
|
247
|
+
safe_revision: 897f8b648d633368cf2001d17ab89c06a12d445b
|
179
248
|
lib/ndr_import/mapping_error.rb:
|
180
249
|
comments:
|
181
250
|
reviewed_by: timgentry
|
@@ -187,7 +256,7 @@ file safety:
|
|
187
256
|
lib/ndr_import/non_tabular/column_mapping.rb:
|
188
257
|
comments:
|
189
258
|
reviewed_by: josh.pencheon
|
190
|
-
safe_revision:
|
259
|
+
safe_revision: bb44ade56a2151706eede2c31142440ccf49e6f6
|
191
260
|
lib/ndr_import/non_tabular/line.rb:
|
192
261
|
comments:
|
193
262
|
reviewed_by: timgentry
|
@@ -198,8 +267,8 @@ file safety:
|
|
198
267
|
safe_revision: cf382902508a21a95b80ac4582fbbd117164e80e
|
199
268
|
lib/ndr_import/non_tabular/record.rb:
|
200
269
|
comments:
|
201
|
-
reviewed_by:
|
202
|
-
safe_revision:
|
270
|
+
reviewed_by: josh.pencheon
|
271
|
+
safe_revision: bb44ade56a2151706eede2c31142440ccf49e6f6
|
203
272
|
lib/ndr_import/non_tabular/table.rb:
|
204
273
|
comments:
|
205
274
|
reviewed_by: josh.pencheon
|
@@ -223,15 +292,15 @@ file safety:
|
|
223
292
|
lib/ndr_import/universal_importer_helper.rb:
|
224
293
|
comments:
|
225
294
|
reviewed_by: josh.pencheon
|
226
|
-
safe_revision:
|
295
|
+
safe_revision: 55e502bb4445cb8b985e530e8eb26d92b574ded9
|
227
296
|
lib/ndr_import/unmapped_data_error.rb:
|
228
297
|
comments:
|
229
298
|
reviewed_by: josh.pencheon
|
230
299
|
safe_revision: 5cd2cd0b3a1e254d30d4acc28c6731825a1f84f5
|
231
300
|
lib/ndr_import/version.rb:
|
232
301
|
comments: another check?
|
233
|
-
reviewed_by:
|
234
|
-
safe_revision:
|
302
|
+
reviewed_by: ollietulloch
|
303
|
+
safe_revision: aa006cd76123db2101d145a07d201dc6a709ed6e
|
235
304
|
lib/ndr_import/xml/table.rb:
|
236
305
|
comments:
|
237
306
|
reviewed_by: josh.pencheon
|
@@ -239,7 +308,7 @@ file safety:
|
|
239
308
|
ndr_import.gemspec:
|
240
309
|
comments:
|
241
310
|
reviewed_by: josh.pencheon
|
242
|
-
safe_revision:
|
311
|
+
safe_revision: 95e6ee9997d06471fe6f2f169c3c701471086371
|
243
312
|
test/file/acro_form_test.rb:
|
244
313
|
comments:
|
245
314
|
reviewed_by: josh.pencheon
|
@@ -283,7 +352,7 @@ file safety:
|
|
283
352
|
test/file/xml_test.rb:
|
284
353
|
comments:
|
285
354
|
reviewed_by: josh.pencheon
|
286
|
-
safe_revision:
|
355
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
287
356
|
test/file/zip_test.rb:
|
288
357
|
comments:
|
289
358
|
reviewed_by: timgentry
|
@@ -308,6 +377,10 @@ file safety:
|
|
308
377
|
comments:
|
309
378
|
reviewed_by: timgentry
|
310
379
|
safe_revision: 9abdd6ced1d0c90ce8dd88abee4eb6472c7ff0d6
|
380
|
+
test/helpers/file/xml_streaming_test.rb:
|
381
|
+
comments:
|
382
|
+
reviewed_by: josh.pencheon
|
383
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
311
384
|
test/helpers/file/xml_test.rb:
|
312
385
|
comments:
|
313
386
|
reviewed_by: timgentry
|
@@ -330,8 +403,8 @@ file safety:
|
|
330
403
|
safe_revision: a69d4a57ddcf13cdc13c27bd2eb91a395fa7ea36
|
331
404
|
test/non_tabular_file_helper_test.rb:
|
332
405
|
comments:
|
333
|
-
reviewed_by:
|
334
|
-
safe_revision:
|
406
|
+
reviewed_by: josh.pencheon
|
407
|
+
safe_revision: bb44ade56a2151706eede2c31142440ccf49e6f6
|
335
408
|
test/pdf_form/table_test.rb:
|
336
409
|
comments:
|
337
410
|
reviewed_by: josh.pencheon
|
@@ -356,6 +429,10 @@ file safety:
|
|
356
429
|
comments:
|
357
430
|
reviewed_by: timgentry
|
358
431
|
safe_revision: dab4b8a3e4b29d85eccd971e79936982d888cffd
|
432
|
+
test/resources/claims_utf16be_but_isnt.xml:
|
433
|
+
comments:
|
434
|
+
reviewed_by: josh.pencheon
|
435
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
359
436
|
test/resources/filesystem_paths.yml:
|
360
437
|
comments:
|
361
438
|
reviewed_by: timgentry
|
@@ -543,7 +620,7 @@ file safety:
|
|
543
620
|
test/universal_importer_helper_test.rb:
|
544
621
|
comments:
|
545
622
|
reviewed_by: josh.pencheon
|
546
|
-
safe_revision:
|
623
|
+
safe_revision: c3183e522bce50008df576ceb47fe4761ab8f966
|
547
624
|
test/xml/table_test.rb:
|
548
625
|
comments:
|
549
626
|
reviewed_by: josh.pencheon
|
data/lib/ndr_import/file/base.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'active_support/core_ext/hash'
|
1
2
|
require 'ndr_support/safe_file'
|
2
3
|
require 'ndr_import/csv_library'
|
3
4
|
require_relative 'registry'
|
@@ -11,7 +12,7 @@ module NdrImport
|
|
11
12
|
def initialize(filename, format, options = {})
|
12
13
|
@filename = filename
|
13
14
|
@format = format
|
14
|
-
@options = options
|
15
|
+
@options = options.stringify_keys
|
15
16
|
|
16
17
|
validate_filename_is_safe_and_readable!
|
17
18
|
end
|
data/lib/ndr_import/file/docx.rb
CHANGED
@@ -17,7 +17,7 @@ module NdrImport
|
|
17
17
|
def rows(&block)
|
18
18
|
return enum_for(:rows) unless block
|
19
19
|
|
20
|
-
send(@options
|
20
|
+
send(@options['file_password'] ? :decrypted_path : :unencrypted_path) do |path|
|
21
21
|
doc = ::Docx::Document.open(path)
|
22
22
|
|
23
23
|
doc.paragraphs.each do |p|
|
@@ -32,8 +32,9 @@ module NdrImport
|
|
32
32
|
|
33
33
|
# This method returns the path to the temporary, decrypted file
|
34
34
|
def decrypted_path
|
35
|
-
|
36
|
-
|
35
|
+
file_string = decrypted_file_string(@filename, @options['file_password'])
|
36
|
+
Tempfile.create(['decrypted', '.docx'], encoding: file_string.encoding) do |file|
|
37
|
+
file.write(file_string)
|
37
38
|
file.close
|
38
39
|
|
39
40
|
yield file.path
|
@@ -91,8 +91,8 @@ module NdrImport
|
|
91
91
|
when '.xls'
|
92
92
|
Roo::Excel.new(SafeFile.safepath_to_string(path))
|
93
93
|
when '.xlsx'
|
94
|
-
if @options
|
95
|
-
Roo::Excelx.new(StringIO.new(decrypted_file_string(path, @options[
|
94
|
+
if @options['file_password']
|
95
|
+
Roo::Excelx.new(StringIO.new(decrypted_file_string(path, @options['file_password'])))
|
96
96
|
else
|
97
97
|
Roo::Excelx.new(SafeFile.safepath_to_string(path))
|
98
98
|
end
|
data/lib/ndr_import/file/xml.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'ndr_support/safe_file'
|
2
2
|
require 'ndr_import/helpers/file/xml'
|
3
|
+
require 'ndr_import/helpers/file/xml_streaming'
|
3
4
|
require_relative 'registry'
|
4
5
|
|
5
6
|
module NdrImport
|
@@ -9,6 +10,7 @@ module NdrImport
|
|
9
10
|
# This class is a xml file handler that returns a single table.
|
10
11
|
class Xml < Base
|
11
12
|
include NdrImport::Helpers::File::Xml
|
13
|
+
include NdrImport::Helpers::File::XmlStreaming
|
12
14
|
|
13
15
|
private
|
14
16
|
|
@@ -16,9 +18,14 @@ module NdrImport
|
|
16
18
|
def rows(&block)
|
17
19
|
return enum_for(:rows) unless block
|
18
20
|
|
19
|
-
|
21
|
+
xpath = @options['xml_record_xpath']
|
20
22
|
|
21
|
-
|
23
|
+
if @options['slurp']
|
24
|
+
doc = read_xml_file(@filename)
|
25
|
+
doc.xpath(xpath).each(&block)
|
26
|
+
else
|
27
|
+
each_node(@filename, xpath, &block)
|
28
|
+
end
|
22
29
|
rescue StandardError => e
|
23
30
|
raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
|
24
31
|
end
|
@@ -15,10 +15,13 @@ module NdrImport
|
|
15
15
|
|
16
16
|
require 'nokogiri'
|
17
17
|
|
18
|
-
Nokogiri::XML(ensure_utf8! file_data)
|
19
|
-
|
20
|
-
emulate_strict_mode_fatal_check!(doc)
|
18
|
+
doc = Nokogiri::XML((ensure_utf8! file_data)) do |config|
|
19
|
+
config.huge
|
21
20
|
end
|
21
|
+
doc.encoding = 'UTF-8'
|
22
|
+
emulate_strict_mode_fatal_check!(doc)
|
23
|
+
|
24
|
+
doc
|
22
25
|
end
|
23
26
|
|
24
27
|
# Nokogiri can give a `STRICT` parse option to libxml, but our friendly
|
@@ -0,0 +1,183 @@
|
|
1
|
+
require 'shellwords'
|
2
|
+
|
3
|
+
require 'ndr_support/safe_file'
|
4
|
+
require 'ndr_support/utf8_encoding'
|
5
|
+
|
6
|
+
module NdrImport
|
7
|
+
module Helpers
|
8
|
+
module File
|
9
|
+
# This mixin adds XML streaming functionality, to support more performant handling
|
10
|
+
# of large files by Nokogiri. Uses the `XML::Reader` API, and maintains a temporary
|
11
|
+
# DOM as the XML is streamed to allow XPath querying from the root node.
|
12
|
+
#
|
13
|
+
# If the system has `iconv` available, will attempt to verify the encoding of the
|
14
|
+
# file being read externally, so it can be streamed in to Ruby. Otherwise, will load
|
15
|
+
# the raw data in to check the encoding, but still stream it through Nokogiri's parser.
|
16
|
+
module XmlStreaming
|
17
|
+
# Base error for all streaming-specific issues.
|
18
|
+
class Error < StandardError; end
|
19
|
+
|
20
|
+
# Raised if nested tags are encountered which the streaming approach cannot handle.
|
21
|
+
class NestingError < Error
|
22
|
+
def initialize(node)
|
23
|
+
super <<~STR
|
24
|
+
Element '#{node.name}' was found nested inside another of the same type.
|
25
|
+
This is not accessible, and a known limitation of XmlStreaming.
|
26
|
+
STR
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
# Object to track state as the XML is iterated over, and detect
|
31
|
+
# when an element of interest is entered.
|
32
|
+
class Cursor
|
33
|
+
# wrapper to hold a representation of each element we descend into:
|
34
|
+
StackItem = Struct.new(:name, :attrs, :empty)
|
35
|
+
|
36
|
+
def initialize(xpath)
|
37
|
+
@xpath = xpath
|
38
|
+
@stack = []
|
39
|
+
@match_depth = nil
|
40
|
+
end
|
41
|
+
|
42
|
+
# Has this cursor already passed inside a similar node?
|
43
|
+
def in?(node)
|
44
|
+
@stack.detect { |item| item.name == node.name }
|
45
|
+
end
|
46
|
+
|
47
|
+
def enter(node)
|
48
|
+
@stack.push StackItem.new(node.name, node.attributes, node.empty_element?)
|
49
|
+
end
|
50
|
+
|
51
|
+
def leave(_node)
|
52
|
+
@stack.pop
|
53
|
+
@match_depth = nil if @match_depth && @stack.length < @match_depth
|
54
|
+
end
|
55
|
+
|
56
|
+
# Does the element that the cursor is currently on match what
|
57
|
+
# is being looked for?
|
58
|
+
def matches?
|
59
|
+
# Can't match again if we're inside a match already:
|
60
|
+
return false if @matched_depth
|
61
|
+
|
62
|
+
match = current_stack_match?
|
63
|
+
|
64
|
+
# "empty element" matches are yielded immediately, without
|
65
|
+
# tagging the stack as having matched, because there won't
|
66
|
+
# be an equivalent closing tag to end the match with later.
|
67
|
+
if in_empty_element?
|
68
|
+
@stack.pop
|
69
|
+
elsif match
|
70
|
+
@match_depth = @stack.length
|
71
|
+
end
|
72
|
+
|
73
|
+
match
|
74
|
+
end
|
75
|
+
|
76
|
+
private
|
77
|
+
|
78
|
+
def in_empty_element?
|
79
|
+
@stack.last.empty
|
80
|
+
end
|
81
|
+
|
82
|
+
# Does the current state of the stack mean we've met the xpath
|
83
|
+
# criteria? Must be an exact match, not just matching a parent
|
84
|
+
# element in the DOM.
|
85
|
+
def current_stack_match?
|
86
|
+
parent_stack = @stack[0..-2]
|
87
|
+
|
88
|
+
return false unless dom_stubs[@stack].at_xpath(@xpath)
|
89
|
+
|
90
|
+
parent_stack.empty? || !dom_stubs[parent_stack].at_xpath(@xpath)
|
91
|
+
end
|
92
|
+
|
93
|
+
# A cached collection of DOM fragments, to represent the structure
|
94
|
+
# necessary to use xpath to descend into the main document's DOM.
|
95
|
+
def dom_stubs
|
96
|
+
@dom_stubs ||= Hash.new do |hash, items|
|
97
|
+
hash[items.dup] = Nokogiri::XML::Builder.new do |dom|
|
98
|
+
add_items_to_dom(dom, items.dup)
|
99
|
+
end.doc
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
# Helper to recursively build XML fragment.
|
104
|
+
def add_items_to_dom(dom, items)
|
105
|
+
item = items.shift
|
106
|
+
dom.send(item.name, item.attrs) do
|
107
|
+
add_items_to_dom(dom, items) if items.any?
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
include UTF8Encoding
|
113
|
+
|
114
|
+
# Streams the contents of the given `safe_path`, and yields
|
115
|
+
# each element matching `xpath` as they're found.
|
116
|
+
#
|
117
|
+
# In the case of dodgy encoding, may fall back to slurping the
|
118
|
+
# file, but will still use stream parsing for XML.
|
119
|
+
def each_node(safe_path, xpath, &block)
|
120
|
+
return enum_for(:each_node, safe_path, xpath) unless block
|
121
|
+
|
122
|
+
require 'nokogiri'
|
123
|
+
|
124
|
+
with_encoding_check(safe_path) do |stream, encoding|
|
125
|
+
stream_xml_nodes(stream, xpath, encoding, &block)
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
private
|
130
|
+
|
131
|
+
# We need to ensure the raw data is UTF8 before we start streaming
|
132
|
+
# it with nokogiri. If we can do an external check, great. Otherwise,
|
133
|
+
# we need to slurp and convert the raw data before presenting it.
|
134
|
+
def with_encoding_check(safe_path)
|
135
|
+
forced_encoding = nil
|
136
|
+
|
137
|
+
stream = ::File.open(SafeFile.safepath_to_string(safe_path))
|
138
|
+
|
139
|
+
unless external_utf8_check?(safe_path)
|
140
|
+
stream = StringIO.new ensure_utf8!(stream.read)
|
141
|
+
forced_encoding = 'UTF8'
|
142
|
+
end
|
143
|
+
|
144
|
+
yield stream, forced_encoding
|
145
|
+
end
|
146
|
+
|
147
|
+
# Use iconv, if available, to check raw data encoding:
|
148
|
+
def external_utf8_check?(safe_path)
|
149
|
+
iconv = system('command -v iconv > /dev/null 2>&1')
|
150
|
+
return false unless iconv
|
151
|
+
|
152
|
+
path = SafeFile.safepath_to_string(safe_path)
|
153
|
+
system("iconv -f UTF-8 #{Shellwords.escape(path)} > /dev/null 2>&1")
|
154
|
+
end
|
155
|
+
|
156
|
+
def stream_xml_nodes(io, node_xpath, encoding = nil)
|
157
|
+
# Track nesting as the cursor moves through the document:
|
158
|
+
cursor = Cursor.new(node_xpath)
|
159
|
+
|
160
|
+
# If markup isn't well-formed, try to work around it:
|
161
|
+
options = Nokogiri::XML::ParseOptions::RECOVER
|
162
|
+
reader = Nokogiri::XML::Reader(io, nil, encoding, options)
|
163
|
+
|
164
|
+
reader.each do |node|
|
165
|
+
case node.node_type
|
166
|
+
when Nokogiri::XML::Reader::TYPE_ELEMENT # "opening tag"
|
167
|
+
raise NestingError, node if cursor.in?(node)
|
168
|
+
|
169
|
+
cursor.enter(node)
|
170
|
+
next unless cursor.matches?
|
171
|
+
|
172
|
+
# The xpath matched - construct a DOM fragment to yield back:
|
173
|
+
element = Nokogiri::XML(node.outer_xml).at("./#{node.name}")
|
174
|
+
yield element
|
175
|
+
when Nokogiri::XML::Reader::TYPE_END_ELEMENT # "closing tag"
|
176
|
+
cursor.leave(node)
|
177
|
+
end
|
178
|
+
end
|
179
|
+
end
|
180
|
+
end
|
181
|
+
end
|
182
|
+
end
|
183
|
+
end
|
data/lib/ndr_import/mapper.rb
CHANGED
@@ -277,7 +277,7 @@ module NdrImport::Mapper
|
|
277
277
|
end
|
278
278
|
|
279
279
|
def read_docx(stream)
|
280
|
-
Tempfile.create(encoding:
|
280
|
+
Tempfile.create(encoding: stream.external_encoding) do |tempfile|
|
281
281
|
tempfile.write(stream.read)
|
282
282
|
|
283
283
|
docx = ::Docx::Document.open(tempfile.path)
|
@@ -6,7 +6,7 @@ module NdrImport
|
|
6
6
|
# the logic associated with finding matching lines of source data and subsequently
|
7
7
|
# capturing arrays of values within them.
|
8
8
|
class ColumnMapping
|
9
|
-
attr_accessor :name, :cell_mapping, :lines, :capture, :join
|
9
|
+
attr_accessor :name, :cell_mapping, :lines, :capture, :join, :preserve_blank_lines
|
10
10
|
|
11
11
|
def initialize(column_mapping)
|
12
12
|
@name = column_mapping['rawtext_name'] ||
|
@@ -18,6 +18,7 @@ module NdrImport
|
|
18
18
|
|
19
19
|
@lines = @cell_mapping['lines']
|
20
20
|
@join = @cell_mapping['join']
|
21
|
+
@preserve_blank_lines = @cell_mapping['preserve_blank_lines']
|
21
22
|
end
|
22
23
|
|
23
24
|
# This method returns the range of matching source data lines. If the range is a
|
@@ -74,7 +74,10 @@ module NdrImport
|
|
74
74
|
begin
|
75
75
|
matches = get_matches(column_mapping)
|
76
76
|
# Join the non-blank lines together and add to the array of cells
|
77
|
-
|
77
|
+
lines = matches.select do |value|
|
78
|
+
column_mapping.preserve_blank_lines ? value : value.present?
|
79
|
+
end
|
80
|
+
cells << lines.join(column_mapping.join || '')
|
78
81
|
rescue RegexpRange::PatternMatchError
|
79
82
|
cells << nil
|
80
83
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'shellwords'
|
2
|
+
|
1
3
|
require 'ndr_import/file/registry'
|
2
4
|
|
3
5
|
module NdrImport
|
@@ -38,7 +40,8 @@ module NdrImport
|
|
38
40
|
'col_sep' => table_mapping.try(:delimiter),
|
39
41
|
'file_password' => table_mapping.try(:file_password),
|
40
42
|
'liberal_parsing' => table_mapping.try(:liberal_parsing),
|
41
|
-
'xml_record_xpath' => table_mapping.try(:xml_record_xpath)
|
43
|
+
'xml_record_xpath' => table_mapping.try(:xml_record_xpath),
|
44
|
+
'slurp' => table_mapping.try(:slurp)
|
42
45
|
}
|
43
46
|
|
44
47
|
tables = NdrImport::File::Registry.tables(filename, table_mapping.try(:format), options)
|
data/lib/ndr_import/version.rb
CHANGED
data/ndr_import.gemspec
CHANGED
@@ -15,7 +15,7 @@ Gem::Specification.new do |spec|
|
|
15
15
|
# Specify which files should be added to the gem when it is released.
|
16
16
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
17
17
|
spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
|
18
|
-
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
18
|
+
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(docs|test|spec|features)/}) }
|
19
19
|
end
|
20
20
|
spec.files -= %w[.travis.yml] # Not needed in the gem
|
21
21
|
spec.bindir = 'exe'
|
@@ -26,7 +26,7 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_dependency 'activesupport', '>= 5.0', '< 7'
|
27
27
|
spec.add_dependency 'ndr_support', '>= 5.3.2', '< 6'
|
28
28
|
|
29
|
-
spec.add_dependency 'rubyzip', '~>
|
29
|
+
spec.add_dependency 'rubyzip', '~> 2.0'
|
30
30
|
spec.add_dependency 'roo', '~> 2.0'
|
31
31
|
|
32
32
|
spec.add_dependency 'docx', '~> 0.3'
|
@@ -36,15 +36,15 @@ Gem::Specification.new do |spec|
|
|
36
36
|
spec.add_dependency 'pdf-reader', '~> 2.1'
|
37
37
|
spec.add_dependency 'roo-xls'
|
38
38
|
spec.add_dependency 'seven_zip_ruby', '~> 1.2'
|
39
|
-
spec.add_dependency 'spreadsheet', '1.
|
39
|
+
spec.add_dependency 'spreadsheet', '1.2.6'
|
40
40
|
|
41
41
|
spec.required_ruby_version = '>= 2.5'
|
42
42
|
|
43
43
|
spec.add_development_dependency 'bundler'
|
44
|
-
spec.add_development_dependency 'rake', '~>
|
44
|
+
spec.add_development_dependency 'rake', '~> 12.3', '>= 12.3.3'
|
45
45
|
spec.add_development_dependency 'minitest'
|
46
46
|
spec.add_development_dependency 'mocha'
|
47
|
-
spec.add_development_dependency 'ndr_dev_support', '
|
47
|
+
spec.add_development_dependency 'ndr_dev_support', '>= 3.1.3'
|
48
48
|
spec.add_development_dependency 'guard'
|
49
49
|
spec.add_development_dependency 'guard-rubocop'
|
50
50
|
spec.add_development_dependency 'guard-test'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ndr_import
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 9.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- NCRS Development Team
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-08-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activemodel
|
@@ -70,20 +70,14 @@ dependencies:
|
|
70
70
|
requirements:
|
71
71
|
- - "~>"
|
72
72
|
- !ruby/object:Gem::Version
|
73
|
-
version: '
|
74
|
-
- - ">="
|
75
|
-
- !ruby/object:Gem::Version
|
76
|
-
version: 1.2.2
|
73
|
+
version: '2.0'
|
77
74
|
type: :runtime
|
78
75
|
prerelease: false
|
79
76
|
version_requirements: !ruby/object:Gem::Requirement
|
80
77
|
requirements:
|
81
78
|
- - "~>"
|
82
79
|
- !ruby/object:Gem::Version
|
83
|
-
version: '
|
84
|
-
- - ">="
|
85
|
-
- !ruby/object:Gem::Version
|
86
|
-
version: 1.2.2
|
80
|
+
version: '2.0'
|
87
81
|
- !ruby/object:Gem::Dependency
|
88
82
|
name: roo
|
89
83
|
requirement: !ruby/object:Gem::Requirement
|
@@ -208,14 +202,14 @@ dependencies:
|
|
208
202
|
requirements:
|
209
203
|
- - '='
|
210
204
|
- !ruby/object:Gem::Version
|
211
|
-
version: 1.
|
205
|
+
version: 1.2.6
|
212
206
|
type: :runtime
|
213
207
|
prerelease: false
|
214
208
|
version_requirements: !ruby/object:Gem::Requirement
|
215
209
|
requirements:
|
216
210
|
- - '='
|
217
211
|
- !ruby/object:Gem::Version
|
218
|
-
version: 1.
|
212
|
+
version: 1.2.6
|
219
213
|
- !ruby/object:Gem::Dependency
|
220
214
|
name: bundler
|
221
215
|
requirement: !ruby/object:Gem::Requirement
|
@@ -236,14 +230,20 @@ dependencies:
|
|
236
230
|
requirements:
|
237
231
|
- - "~>"
|
238
232
|
- !ruby/object:Gem::Version
|
239
|
-
version: '
|
233
|
+
version: '12.3'
|
234
|
+
- - ">="
|
235
|
+
- !ruby/object:Gem::Version
|
236
|
+
version: 12.3.3
|
240
237
|
type: :development
|
241
238
|
prerelease: false
|
242
239
|
version_requirements: !ruby/object:Gem::Requirement
|
243
240
|
requirements:
|
244
241
|
- - "~>"
|
245
242
|
- !ruby/object:Gem::Version
|
246
|
-
version: '
|
243
|
+
version: '12.3'
|
244
|
+
- - ">="
|
245
|
+
- !ruby/object:Gem::Version
|
246
|
+
version: 12.3.3
|
247
247
|
- !ruby/object:Gem::Dependency
|
248
248
|
name: minitest
|
249
249
|
requirement: !ruby/object:Gem::Requirement
|
@@ -276,9 +276,6 @@ dependencies:
|
|
276
276
|
name: ndr_dev_support
|
277
277
|
requirement: !ruby/object:Gem::Requirement
|
278
278
|
requirements:
|
279
|
-
- - "~>"
|
280
|
-
- !ruby/object:Gem::Version
|
281
|
-
version: '3.1'
|
282
279
|
- - ">="
|
283
280
|
- !ruby/object:Gem::Version
|
284
281
|
version: 3.1.3
|
@@ -286,9 +283,6 @@ dependencies:
|
|
286
283
|
prerelease: false
|
287
284
|
version_requirements: !ruby/object:Gem::Requirement
|
288
285
|
requirements:
|
289
|
-
- - "~>"
|
290
|
-
- !ruby/object:Gem::Version
|
291
|
-
version: '3.1'
|
292
286
|
- - ">="
|
293
287
|
- !ruby/object:Gem::Version
|
294
288
|
version: 3.1.3
|
@@ -413,6 +407,7 @@ files:
|
|
413
407
|
- lib/ndr_import/helpers/file/pdf.rb
|
414
408
|
- lib/ndr_import/helpers/file/word.rb
|
415
409
|
- lib/ndr_import/helpers/file/xml.rb
|
410
|
+
- lib/ndr_import/helpers/file/xml_streaming.rb
|
416
411
|
- lib/ndr_import/helpers/file/zip.rb
|
417
412
|
- lib/ndr_import/mapper.rb
|
418
413
|
- lib/ndr_import/mapping_error.rb
|