ndr_import 8.5.2 → 9.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/CHANGELOG.md +29 -0
- data/README.md +1 -1
- data/code_safety.yml +106 -25
- data/lib/ndr_import/csv_library.rb +2 -2
- data/lib/ndr_import/file/xml.rb +9 -2
- data/lib/ndr_import/helpers/file/delimited.rb +2 -2
- data/lib/ndr_import/helpers/file/xml.rb +6 -3
- data/lib/ndr_import/helpers/file/xml_streaming.rb +183 -0
- data/lib/ndr_import/non_tabular/column_mapping.rb +2 -1
- data/lib/ndr_import/non_tabular/record.rb +4 -1
- data/lib/ndr_import/universal_importer_helper.rb +25 -2
- data/lib/ndr_import/version.rb +1 -1
- data/ndr_import.gemspec +5 -5
- metadata +15 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2237d88e9dee0141fe297fd2eb0d2e76f5434e9bb791fb667505193f30263014
|
4
|
+
data.tar.gz: 52746c16790c3da92e64ac14ed7d77786a41c5457ddc0b8190f52d4414e4c6d7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8b8793a45dea035b2abf04a44c20d2ec4fc90bcfef9c5dec4415fab26abdba2ff218b7d2996a165f3447f86046dfd558bfb363e21221f147fec3db9c7c83668c
|
7
|
+
data.tar.gz: eedb044185f1bc7e843e91e634fdc8b88cae9c026344d92e3cdc1046979873d843d476c7a0f7ebfb3c631901b47c9109b6826696124c56b07082ba4a6842af9c
|
data/.gitignore
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,35 @@
|
|
1
1
|
## [Unreleased]
|
2
2
|
*no unreleased changes*
|
3
3
|
|
4
|
+
## 9.0.3 / 2021-01-04
|
5
|
+
### Fixed
|
6
|
+
* Address issue importing multiple files against the same table (#54)
|
7
|
+
|
8
|
+
### Changed
|
9
|
+
* ensure keyword args are properly splatted for ruby 2.7
|
10
|
+
|
11
|
+
### Added
|
12
|
+
* Ruby 2.7 to travis matrix
|
13
|
+
|
14
|
+
## 9.0.2 / 2020-08-14
|
15
|
+
### Changed
|
16
|
+
* Configure Nokogiri with HUGE for large xml files
|
17
|
+
|
18
|
+
## 9.0.1 / 2020-03-26
|
19
|
+
### Fixed
|
20
|
+
* bumps to `nokogiri` / `spreadsheet` / `rubyzip` dependencies
|
21
|
+
|
22
|
+
## 9.0.0 / 2019-07-31
|
23
|
+
### Changed
|
24
|
+
* `File::Xml` will now stream XML files by default. Use `slurp: true` for the old behaviour. (#43)
|
25
|
+
|
26
|
+
### Added
|
27
|
+
* Add `XmlStreaming` helper, for more performant handling of large XML documents with Nokogiri. (#43)
|
28
|
+
|
29
|
+
## 8.6.0 / 2019-06-07
|
30
|
+
### Added
|
31
|
+
* Allow conditional preservation of blank lines when joining lines in non-tabular data (#41)
|
32
|
+
|
4
33
|
## 8.5.2 / 2019-05-17
|
5
34
|
### Fixed
|
6
35
|
* Fixed issue with `file_password` option key as a String or Symbol
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# NdrImport [![Build Status](https://travis-ci.org/PublicHealthEngland/ndr_import.svg?branch=master)](https://travis-ci.org/PublicHealthEngland/ndr_import) [![Gem Version](https://badge.fury.io/rb/ndr_import.svg)](https://badge.fury.io/rb/ndr_import)
|
1
|
+
# NdrImport [![Build Status](https://travis-ci.org/PublicHealthEngland/ndr_import.svg?branch=master)](https://travis-ci.org/PublicHealthEngland/ndr_import) [![Gem Version](https://badge.fury.io/rb/ndr_import.svg)](https://badge.fury.io/rb/ndr_import) [![Documentation](https://img.shields.io/badge/ndr_import-docs-blue.svg)](https://www.rubydoc.info/gems/ndr_import)
|
2
2
|
|
3
3
|
This is the Public Health England (PHE) National Disease Registers (NDR) Import ETL ruby gem, providing:
|
4
4
|
|
data/code_safety.yml
CHANGED
@@ -3,7 +3,7 @@ file safety:
|
|
3
3
|
".gitignore":
|
4
4
|
comments: whole file re-reviewed
|
5
5
|
reviewed_by: josh.pencheon
|
6
|
-
safe_revision:
|
6
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
7
7
|
".hound.yml":
|
8
8
|
comments:
|
9
9
|
reviewed_by: timgentry
|
@@ -14,12 +14,12 @@ file safety:
|
|
14
14
|
safe_revision: b09e268ff9c8349b914aa1b7ba888e1d39f97e4a
|
15
15
|
".travis.yml":
|
16
16
|
comments:
|
17
|
-
reviewed_by:
|
18
|
-
safe_revision:
|
17
|
+
reviewed_by: ollietulloch
|
18
|
+
safe_revision: ed8513d290796b9cb9554c4ebe6e87ac21fbbeae
|
19
19
|
CHANGELOG.md:
|
20
20
|
comments:
|
21
|
-
reviewed_by:
|
22
|
-
safe_revision:
|
21
|
+
reviewed_by: ollietulloch
|
22
|
+
safe_revision: 92d2d9d56f17c143709e25560d7620aaf147008f
|
23
23
|
CODE_OF_CONDUCT.md:
|
24
24
|
comments:
|
25
25
|
reviewed_by: timgentry
|
@@ -39,7 +39,7 @@ file safety:
|
|
39
39
|
README.md:
|
40
40
|
comments:
|
41
41
|
reviewed_by: josh.pencheon
|
42
|
-
safe_revision:
|
42
|
+
safe_revision: 1bc459db8970dde36e9b240b6dd08cca629664e3
|
43
43
|
Rakefile:
|
44
44
|
comments:
|
45
45
|
reviewed_by: josh.pencheon
|
@@ -52,6 +52,70 @@ file safety:
|
|
52
52
|
comments:
|
53
53
|
reviewed_by: josh.pencheon
|
54
54
|
safe_revision: e1d967c10059e8c635452838c3f3dd2b969d9ae4
|
55
|
+
docs/Gemfile:
|
56
|
+
comments:
|
57
|
+
reviewed_by: josh.pencheon
|
58
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
59
|
+
docs/Gemfile.lock:
|
60
|
+
comments:
|
61
|
+
reviewed_by: ollietulloch
|
62
|
+
safe_revision: 6f274715bb341c3070190f04f67af9500b510580
|
63
|
+
docs/_config.yml:
|
64
|
+
comments:
|
65
|
+
reviewed_by: josh.pencheon
|
66
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
67
|
+
docs/_includes/footer.html:
|
68
|
+
comments:
|
69
|
+
reviewed_by: josh.pencheon
|
70
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
71
|
+
docs/_includes/header.html:
|
72
|
+
comments:
|
73
|
+
reviewed_by: josh.pencheon
|
74
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
75
|
+
docs/capturing-data.md:
|
76
|
+
comments:
|
77
|
+
reviewed_by: josh.pencheon
|
78
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
79
|
+
docs/date-formats.md:
|
80
|
+
comments:
|
81
|
+
reviewed_by: josh.pencheon
|
82
|
+
safe_revision: fa21d6d967bf132800b456b585795beec80b08a3
|
83
|
+
docs/getting-started.md:
|
84
|
+
comments:
|
85
|
+
reviewed_by: josh.pencheon
|
86
|
+
safe_revision: fa21d6d967bf132800b456b585795beec80b08a3
|
87
|
+
docs/identifying-and-splitting-records.md:
|
88
|
+
comments:
|
89
|
+
reviewed_by: josh.pencheon
|
90
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
91
|
+
docs/inbuilt-cleaning-methods.md:
|
92
|
+
comments:
|
93
|
+
reviewed_by: josh.pencheon
|
94
|
+
safe_revision: 694b57ce14e0709fc4d31a1357f8416e98f5de91
|
95
|
+
docs/index.md:
|
96
|
+
comments:
|
97
|
+
reviewed_by: josh.pencheon
|
98
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
99
|
+
docs/local-code-transformation-in-yaml-mappings.md:
|
100
|
+
comments:
|
101
|
+
reviewed_by: josh.pencheon
|
102
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
103
|
+
docs/non-tabular-mappings.md:
|
104
|
+
comments:
|
105
|
+
reviewed_by: josh.pencheon
|
106
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
107
|
+
docs/priority-field-mapping.md:
|
108
|
+
comments:
|
109
|
+
reviewed_by: josh.pencheon
|
110
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
111
|
+
docs/standard-yaml-mappings.md:
|
112
|
+
comments:
|
113
|
+
reviewed_by: josh.pencheon
|
114
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
115
|
+
docs/yaml-mapping-user-guide.md:
|
116
|
+
comments:
|
117
|
+
reviewed_by: josh.pencheon
|
118
|
+
safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
|
55
119
|
exe/pdf_acro_form_to_yaml:
|
56
120
|
comments:
|
57
121
|
reviewed_by: josh.pencheon
|
@@ -82,8 +146,8 @@ file safety:
|
|
82
146
|
safe_revision: 24d6449fd0612552f132dfbf4cada2ae28d0469e
|
83
147
|
lib/ndr_import/csv_library.rb:
|
84
148
|
comments:
|
85
|
-
reviewed_by:
|
86
|
-
safe_revision:
|
149
|
+
reviewed_by: ollietulloch
|
150
|
+
safe_revision: acc72173b81702d55fe3aff78054425609a4d339
|
87
151
|
lib/ndr_import/file/acro_form.rb:
|
88
152
|
comments:
|
89
153
|
reviewed_by: josh.pencheon
|
@@ -139,7 +203,7 @@ file safety:
|
|
139
203
|
lib/ndr_import/file/xml.rb:
|
140
204
|
comments:
|
141
205
|
reviewed_by: josh.pencheon
|
142
|
-
safe_revision:
|
206
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
143
207
|
lib/ndr_import/file/zip.rb:
|
144
208
|
comments:
|
145
209
|
reviewed_by: timgentry
|
@@ -150,8 +214,8 @@ file safety:
|
|
150
214
|
safe_revision: dfc958d44b6c58355445fa395db08a62213ee709
|
151
215
|
lib/ndr_import/helpers/file/delimited.rb:
|
152
216
|
comments:
|
153
|
-
reviewed_by:
|
154
|
-
safe_revision:
|
217
|
+
reviewed_by: ollietulloch
|
218
|
+
safe_revision: acc72173b81702d55fe3aff78054425609a4d339
|
155
219
|
lib/ndr_import/helpers/file/excel.rb:
|
156
220
|
comments:
|
157
221
|
reviewed_by: joshpencheon
|
@@ -166,8 +230,13 @@ file safety:
|
|
166
230
|
safe_revision: 45da71ebd3acbc0fe53755bcd75483ba17cb6924
|
167
231
|
lib/ndr_import/helpers/file/xml.rb:
|
168
232
|
comments:
|
233
|
+
reviewed_by: ollietulloch
|
234
|
+
safe_revision: 4d337bd233f7e60cf9d363c92400f21269a28da7
|
235
|
+
lib/ndr_import/helpers/file/xml_streaming.rb:
|
236
|
+
comments: uses SafePath and Shellwords when accessing filesystem, or making system
|
237
|
+
calls
|
169
238
|
reviewed_by: josh.pencheon
|
170
|
-
safe_revision:
|
239
|
+
safe_revision: 55e502bb4445cb8b985e530e8eb26d92b574ded9
|
171
240
|
lib/ndr_import/helpers/file/zip.rb:
|
172
241
|
comments:
|
173
242
|
reviewed_by: timgentry
|
@@ -187,7 +256,7 @@ file safety:
|
|
187
256
|
lib/ndr_import/non_tabular/column_mapping.rb:
|
188
257
|
comments:
|
189
258
|
reviewed_by: josh.pencheon
|
190
|
-
safe_revision:
|
259
|
+
safe_revision: bb44ade56a2151706eede2c31142440ccf49e6f6
|
191
260
|
lib/ndr_import/non_tabular/line.rb:
|
192
261
|
comments:
|
193
262
|
reviewed_by: timgentry
|
@@ -198,8 +267,8 @@ file safety:
|
|
198
267
|
safe_revision: cf382902508a21a95b80ac4582fbbd117164e80e
|
199
268
|
lib/ndr_import/non_tabular/record.rb:
|
200
269
|
comments:
|
201
|
-
reviewed_by:
|
202
|
-
safe_revision:
|
270
|
+
reviewed_by: josh.pencheon
|
271
|
+
safe_revision: bb44ade56a2151706eede2c31142440ccf49e6f6
|
203
272
|
lib/ndr_import/non_tabular/table.rb:
|
204
273
|
comments:
|
205
274
|
reviewed_by: josh.pencheon
|
@@ -222,16 +291,16 @@ file safety:
|
|
222
291
|
safe_revision: a69d4a57ddcf13cdc13c27bd2eb91a395fa7ea36
|
223
292
|
lib/ndr_import/universal_importer_helper.rb:
|
224
293
|
comments:
|
225
|
-
reviewed_by:
|
226
|
-
safe_revision:
|
294
|
+
reviewed_by: ollietulloch
|
295
|
+
safe_revision: 830de0f8cb139c5f61525652b424423935cfc7ac
|
227
296
|
lib/ndr_import/unmapped_data_error.rb:
|
228
297
|
comments:
|
229
298
|
reviewed_by: josh.pencheon
|
230
299
|
safe_revision: 5cd2cd0b3a1e254d30d4acc28c6731825a1f84f5
|
231
300
|
lib/ndr_import/version.rb:
|
232
301
|
comments: another check?
|
233
|
-
reviewed_by:
|
234
|
-
safe_revision:
|
302
|
+
reviewed_by: ollietulloch
|
303
|
+
safe_revision: 92d2d9d56f17c143709e25560d7620aaf147008f
|
235
304
|
lib/ndr_import/xml/table.rb:
|
236
305
|
comments:
|
237
306
|
reviewed_by: josh.pencheon
|
@@ -239,7 +308,7 @@ file safety:
|
|
239
308
|
ndr_import.gemspec:
|
240
309
|
comments:
|
241
310
|
reviewed_by: josh.pencheon
|
242
|
-
safe_revision:
|
311
|
+
safe_revision: 95e6ee9997d06471fe6f2f169c3c701471086371
|
243
312
|
test/file/acro_form_test.rb:
|
244
313
|
comments:
|
245
314
|
reviewed_by: josh.pencheon
|
@@ -283,7 +352,7 @@ file safety:
|
|
283
352
|
test/file/xml_test.rb:
|
284
353
|
comments:
|
285
354
|
reviewed_by: josh.pencheon
|
286
|
-
safe_revision:
|
355
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
287
356
|
test/file/zip_test.rb:
|
288
357
|
comments:
|
289
358
|
reviewed_by: timgentry
|
@@ -308,6 +377,10 @@ file safety:
|
|
308
377
|
comments:
|
309
378
|
reviewed_by: timgentry
|
310
379
|
safe_revision: 9abdd6ced1d0c90ce8dd88abee4eb6472c7ff0d6
|
380
|
+
test/helpers/file/xml_streaming_test.rb:
|
381
|
+
comments:
|
382
|
+
reviewed_by: josh.pencheon
|
383
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
311
384
|
test/helpers/file/xml_test.rb:
|
312
385
|
comments:
|
313
386
|
reviewed_by: timgentry
|
@@ -330,8 +403,8 @@ file safety:
|
|
330
403
|
safe_revision: a69d4a57ddcf13cdc13c27bd2eb91a395fa7ea36
|
331
404
|
test/non_tabular_file_helper_test.rb:
|
332
405
|
comments:
|
333
|
-
reviewed_by:
|
334
|
-
safe_revision:
|
406
|
+
reviewed_by: josh.pencheon
|
407
|
+
safe_revision: bb44ade56a2151706eede2c31142440ccf49e6f6
|
335
408
|
test/pdf_form/table_test.rb:
|
336
409
|
comments:
|
337
410
|
reviewed_by: josh.pencheon
|
@@ -356,6 +429,10 @@ file safety:
|
|
356
429
|
comments:
|
357
430
|
reviewed_by: timgentry
|
358
431
|
safe_revision: dab4b8a3e4b29d85eccd971e79936982d888cffd
|
432
|
+
test/resources/claims_utf16be_but_isnt.xml:
|
433
|
+
comments:
|
434
|
+
reviewed_by: josh.pencheon
|
435
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
359
436
|
test/resources/filesystem_paths.yml:
|
360
437
|
comments:
|
361
438
|
reviewed_by: timgentry
|
@@ -492,6 +569,10 @@ file safety:
|
|
492
569
|
comments:
|
493
570
|
reviewed_by: timgentry
|
494
571
|
safe_revision: 31fb1935f4578729d8786eea41cf0ce0a19be1cd
|
572
|
+
test/resources/two_files_single_table_mapping.zip:
|
573
|
+
comments:
|
574
|
+
reviewed_by: ollietulloch
|
575
|
+
safe_revision: 830de0f8cb139c5f61525652b424423935cfc7ac
|
495
576
|
test/resources/txt_file_xls_extension.xls:
|
496
577
|
comments:
|
497
578
|
reviewed_by: timgentry
|
@@ -542,8 +623,8 @@ file safety:
|
|
542
623
|
safe_revision: 93ccee82fc2165d1ca2d9b03d146ae03e769ea96
|
543
624
|
test/universal_importer_helper_test.rb:
|
544
625
|
comments:
|
545
|
-
reviewed_by:
|
546
|
-
safe_revision:
|
626
|
+
reviewed_by: ollietulloch
|
627
|
+
safe_revision: 830de0f8cb139c5f61525652b424423935cfc7ac
|
547
628
|
test/xml/table_test.rb:
|
548
629
|
comments:
|
549
630
|
reviewed_by: josh.pencheon
|
@@ -12,8 +12,8 @@ class << CSVLibrary
|
|
12
12
|
|
13
13
|
# Ensure that we can pass "mode" straight through the underlying IO object
|
14
14
|
def foreach(path, **options, &block)
|
15
|
-
return to_enum(__method__, path, options) unless block
|
16
|
-
open(path, options.delete(:mode) || 'r', options) do |csv|
|
15
|
+
return to_enum(__method__, path, **options) unless block
|
16
|
+
open(path, options.delete(:mode) || 'r', **options) do |csv|
|
17
17
|
csv.each(&block)
|
18
18
|
end
|
19
19
|
end
|
data/lib/ndr_import/file/xml.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'ndr_support/safe_file'
|
2
2
|
require 'ndr_import/helpers/file/xml'
|
3
|
+
require 'ndr_import/helpers/file/xml_streaming'
|
3
4
|
require_relative 'registry'
|
4
5
|
|
5
6
|
module NdrImport
|
@@ -9,6 +10,7 @@ module NdrImport
|
|
9
10
|
# This class is a xml file handler that returns a single table.
|
10
11
|
class Xml < Base
|
11
12
|
include NdrImport::Helpers::File::Xml
|
13
|
+
include NdrImport::Helpers::File::XmlStreaming
|
12
14
|
|
13
15
|
private
|
14
16
|
|
@@ -16,9 +18,14 @@ module NdrImport
|
|
16
18
|
def rows(&block)
|
17
19
|
return enum_for(:rows) unless block
|
18
20
|
|
19
|
-
|
21
|
+
xpath = @options['xml_record_xpath']
|
20
22
|
|
21
|
-
|
23
|
+
if @options['slurp']
|
24
|
+
doc = read_xml_file(@filename)
|
25
|
+
doc.xpath(xpath).each(&block)
|
26
|
+
else
|
27
|
+
each_node(@filename, xpath, &block)
|
28
|
+
end
|
22
29
|
rescue StandardError => e
|
23
30
|
raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
|
24
31
|
end
|
@@ -36,7 +36,7 @@ module NdrImport
|
|
36
36
|
|
37
37
|
# By now, we know `encodings` should let us read the whole
|
38
38
|
# file successfully; if there are problems, we should crash.
|
39
|
-
CSVLibrary.foreach(safe_path, encodings) do |line|
|
39
|
+
CSVLibrary.foreach(safe_path, **encodings) do |line|
|
40
40
|
yield line.map(&:to_s)
|
41
41
|
end
|
42
42
|
end
|
@@ -73,7 +73,7 @@ module NdrImport
|
|
73
73
|
|
74
74
|
row_num = 0
|
75
75
|
# Iterate through the file; if we reach the end, this encoding worked:
|
76
|
-
CSVLibrary.foreach(safe_path, options) { |_line| row_num += 1 }
|
76
|
+
CSVLibrary.foreach(safe_path, **options) { |_line| row_num += 1 }
|
77
77
|
return options
|
78
78
|
rescue ArgumentError => e
|
79
79
|
next if e.message =~ /invalid byte sequence/ # This encoding didn't work
|
@@ -15,10 +15,13 @@ module NdrImport
|
|
15
15
|
|
16
16
|
require 'nokogiri'
|
17
17
|
|
18
|
-
Nokogiri::XML(ensure_utf8! file_data)
|
19
|
-
|
20
|
-
emulate_strict_mode_fatal_check!(doc)
|
18
|
+
doc = Nokogiri::XML((ensure_utf8! file_data)) do |config|
|
19
|
+
config.huge
|
21
20
|
end
|
21
|
+
doc.encoding = 'UTF-8'
|
22
|
+
emulate_strict_mode_fatal_check!(doc)
|
23
|
+
|
24
|
+
doc
|
22
25
|
end
|
23
26
|
|
24
27
|
# Nokogiri can give a `STRICT` parse option to libxml, but our friendly
|
@@ -0,0 +1,183 @@
|
|
1
|
+
require 'shellwords'
|
2
|
+
|
3
|
+
require 'ndr_support/safe_file'
|
4
|
+
require 'ndr_support/utf8_encoding'
|
5
|
+
|
6
|
+
module NdrImport
|
7
|
+
module Helpers
|
8
|
+
module File
|
9
|
+
# This mixin adds XML streaming functionality, to support more performant handling
|
10
|
+
# of large files by Nokogiri. Uses the `XML::Reader` API, and maintains a temporary
|
11
|
+
# DOM as the XML is streamed to allow XPath querying from the root node.
|
12
|
+
#
|
13
|
+
# If the system has `iconv` available, will attempt to verify the encoding of the
|
14
|
+
# file being read externally, so it can be streamed in to Ruby. Otherwise, will load
|
15
|
+
# the raw data in to check the encoding, but still stream it through Nokogiri's parser.
|
16
|
+
module XmlStreaming
|
17
|
+
# Base error for all streaming-specific issues.
|
18
|
+
class Error < StandardError; end
|
19
|
+
|
20
|
+
# Raised if nested tags are encountered, which the streaming approach cannot handle.
|
21
|
+
class NestingError < Error
|
22
|
+
def initialize(node)
|
23
|
+
super <<~STR
|
24
|
+
Element '#{node.name}' was found nested inside another of the same type.
|
25
|
+
This is not accessible, and a known limitation of XmlStreaming.
|
26
|
+
STR
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
# Object to track state as the XML is iterated over, and detect
|
31
|
+
# when an element of interest is entered.
|
32
|
+
class Cursor
|
33
|
+
# Wrapper to hold a representation of each element we descend into:
|
34
|
+
StackItem = Struct.new(:name, :attrs, :empty)
|
35
|
+
|
36
|
+
def initialize(xpath)
|
37
|
+
@xpath = xpath
|
38
|
+
@stack = []
|
39
|
+
@match_depth = nil
|
40
|
+
end
|
41
|
+
|
42
|
+
# Has this cursor already passed inside a similar node?
|
43
|
+
def in?(node)
|
44
|
+
@stack.detect { |item| item.name == node.name }
|
45
|
+
end
|
46
|
+
|
47
|
+
def enter(node)
|
48
|
+
@stack.push StackItem.new(node.name, node.attributes, node.empty_element?)
|
49
|
+
end
|
50
|
+
|
51
|
+
def leave(_node)
|
52
|
+
@stack.pop
|
53
|
+
@match_depth = nil if @match_depth && @stack.length < @match_depth
|
54
|
+
end
|
55
|
+
|
56
|
+
# Does the element that the cursor is currently on match what
|
57
|
+
# is being looked for?
|
58
|
+
def matches?
  # Reports whether the element just pushed by `enter` is a new match for
  # the xpath. Returns a truthy value when it is, a falsy value otherwise.
  #
  # Can't match again if we're inside a match already. The depth of the
  # current match is tracked in `@match_depth` (set below, cleared in
  # `leave`), so that is the variable the guard has to consult.
  inside_match = !@match_depth.nil?

  # Skip the xpath evaluation entirely when nested inside a match:
  match = inside_match ? false : current_stack_match?

  # "empty element" matches are yielded immediately, without
  # tagging the stack as having matched, because there won't
  # be an equivalent closing tag to end the match with later.
  # The pop must happen even when we are inside a match, otherwise
  # the empty element pushed by `enter` would never leave the stack.
  if in_empty_element?
    @stack.pop
  elsif match
    @match_depth = @stack.length
  end

  match
end
|
75
|
+
|
76
|
+
private
|
77
|
+
|
78
|
+
def in_empty_element?
|
79
|
+
@stack.last.empty
|
80
|
+
end
|
81
|
+
|
82
|
+
# Does the current state of the stack mean we've met the xpath
|
83
|
+
# criteria? Must be an exact match, not just matching a parent
|
84
|
+
# element in the DOM.
|
85
|
+
def current_stack_match?
|
86
|
+
parent_stack = @stack[0..-2]
|
87
|
+
|
88
|
+
return false unless dom_stubs[@stack].at_xpath(@xpath)
|
89
|
+
|
90
|
+
parent_stack.empty? || !dom_stubs[parent_stack].at_xpath(@xpath)
|
91
|
+
end
|
92
|
+
|
93
|
+
# A cached collection of DOM fragments, to represent the structure
|
94
|
+
# necessary to use xpath to descend into the main document's DOM.
|
95
|
+
def dom_stubs
|
96
|
+
@dom_stubs ||= Hash.new do |hash, items|
|
97
|
+
hash[items.dup] = Nokogiri::XML::Builder.new do |dom|
|
98
|
+
add_items_to_dom(dom, items.dup)
|
99
|
+
end.doc
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
# Helper to recursively build XML fragment.
|
104
|
+
def add_items_to_dom(dom, items)
|
105
|
+
item = items.shift
|
106
|
+
dom.send(item.name, item.attrs) do
|
107
|
+
add_items_to_dom(dom, items) if items.any?
|
108
|
+
end
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
include UTF8Encoding
|
113
|
+
|
114
|
+
# Streams the contents of the given `safe_path`, and yields
|
115
|
+
# each element matching `xpath` as they're found.
|
116
|
+
#
|
117
|
+
# In the case of dodgy encoding, may fall back to slurping the
|
118
|
+
# file, but will still use stream parsing for XML.
|
119
|
+
def each_node(safe_path, xpath, &block)
|
120
|
+
return enum_for(:each_node, safe_path, xpath) unless block
|
121
|
+
|
122
|
+
require 'nokogiri'
|
123
|
+
|
124
|
+
with_encoding_check(safe_path) do |stream, encoding|
|
125
|
+
stream_xml_nodes(stream, xpath, encoding, &block)
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
private
|
130
|
+
|
131
|
+
# We need to ensure the raw data is UTF8 before we start streaming
|
132
|
+
# it with nokogiri. If we can do an external check, great. Otherwise,
|
133
|
+
# we need to slurp and convert the raw data before presenting it.
|
134
|
+
def with_encoding_check(safe_path)
  # Yields an IO-like stream for `safe_path`, plus the encoding to force on
  # the parser (nil when the external check accepted the file as UTF-8).
  # Returns whatever the block returns.
  forced_encoding = nil

  file = ::File.open(SafeFile.safepath_to_string(safe_path))
  stream = file

  unless external_utf8_check?(safe_path)
    # No external confirmation that the file is UTF-8: slurp the raw data,
    # coerce it, and present the converted copy as the stream instead.
    stream = StringIO.new ensure_utf8!(file.read)
    forced_encoding = 'UTF8'
  end

  yield stream, forced_encoding
ensure
  # The block has finished with the stream by the time `yield` returns, so
  # release the file descriptor whether or not an error was raised.
  file&.close
end
|
146
|
+
|
147
|
+
# Use iconv, if available, to check raw data encoding:
|
148
|
+
def external_utf8_check?(safe_path)
|
149
|
+
iconv = system('command -v iconv > /dev/null 2>&1')
|
150
|
+
return false unless iconv
|
151
|
+
|
152
|
+
path = SafeFile.safepath_to_string(safe_path)
|
153
|
+
system("iconv -f UTF-8 #{Shellwords.escape(path)} > /dev/null 2>&1")
|
154
|
+
end
|
155
|
+
|
156
|
+
def stream_xml_nodes(io, node_xpath, encoding = nil)
|
157
|
+
# Track nesting as the cursor moves through the document:
|
158
|
+
cursor = Cursor.new(node_xpath)
|
159
|
+
|
160
|
+
# If markup isn't well-formed, try to work around it:
|
161
|
+
options = Nokogiri::XML::ParseOptions::RECOVER
|
162
|
+
reader = Nokogiri::XML::Reader(io, nil, encoding, options)
|
163
|
+
|
164
|
+
reader.each do |node|
|
165
|
+
case node.node_type
|
166
|
+
when Nokogiri::XML::Reader::TYPE_ELEMENT # "opening tag"
|
167
|
+
raise NestingError, node if cursor.in?(node)
|
168
|
+
|
169
|
+
cursor.enter(node)
|
170
|
+
next unless cursor.matches?
|
171
|
+
|
172
|
+
# The xpath matched - construct a DOM fragment to yield back:
|
173
|
+
element = Nokogiri::XML(node.outer_xml).at("./#{node.name}")
|
174
|
+
yield element
|
175
|
+
when Nokogiri::XML::Reader::TYPE_END_ELEMENT # "closing tag"
|
176
|
+
cursor.leave(node)
|
177
|
+
end
|
178
|
+
end
|
179
|
+
end
|
180
|
+
end
|
181
|
+
end
|
182
|
+
end
|
183
|
+
end
|
@@ -6,7 +6,7 @@ module NdrImport
|
|
6
6
|
# the logic associated with finding matching lines of source data and subsequently
|
7
7
|
# capturing arrays of values within them.
|
8
8
|
class ColumnMapping
|
9
|
-
attr_accessor :name, :cell_mapping, :lines, :capture, :join
|
9
|
+
attr_accessor :name, :cell_mapping, :lines, :capture, :join, :preserve_blank_lines
|
10
10
|
|
11
11
|
def initialize(column_mapping)
|
12
12
|
@name = column_mapping['rawtext_name'] ||
|
@@ -18,6 +18,7 @@ module NdrImport
|
|
18
18
|
|
19
19
|
@lines = @cell_mapping['lines']
|
20
20
|
@join = @cell_mapping['join']
|
21
|
+
@preserve_blank_lines = @cell_mapping['preserve_blank_lines']
|
21
22
|
end
|
22
23
|
|
23
24
|
# This method returns the range of matching source data lines. If the range is a
|
@@ -74,7 +74,10 @@ module NdrImport
|
|
74
74
|
begin
|
75
75
|
matches = get_matches(column_mapping)
|
76
76
|
# Join the non-blank lines together and add to the array of cells
|
77
|
-
|
77
|
+
lines = matches.select do |value|
|
78
|
+
column_mapping.preserve_blank_lines ? value : value.present?
|
79
|
+
end
|
80
|
+
cells << lines.join(column_mapping.join || '')
|
78
81
|
rescue RegexpRange::PatternMatchError
|
79
82
|
cells << nil
|
80
83
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'shellwords'
|
2
|
+
|
1
3
|
require 'ndr_import/file/registry'
|
2
4
|
|
3
5
|
module NdrImport
|
@@ -5,11 +7,31 @@ module NdrImport
|
|
5
7
|
# complexity of enumerating over files and tables (which should be universally useful).
|
6
8
|
# It is assumed that the host module/class defines `unzip_path`.
|
7
9
|
module UniversalImporterHelper
|
10
|
+
# Helper class to allow multiple source enumerators to contribute to one overall table.
|
11
|
+
class TableEnumProxy
|
12
|
+
include Enumerable
|
13
|
+
|
14
|
+
def initialize
|
15
|
+
@table_enums = []
|
16
|
+
end
|
17
|
+
|
18
|
+
def add_table_enum(table_enum)
|
19
|
+
@table_enums << table_enum
|
20
|
+
end
|
21
|
+
|
22
|
+
def each(&block)
|
23
|
+
return enum_for(:each) unless block
|
24
|
+
|
25
|
+
@table_enums.each { |table_enum| table_enum.each(&block) }
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
8
29
|
def table_enumerators(filename)
|
9
30
|
table_enumerators = {}
|
31
|
+
table_enumerators = Hash.new { |hash, key| hash[key] = TableEnumProxy.new }
|
10
32
|
|
11
33
|
extract(filename).each do |table, rows|
|
12
|
-
table_enumerators[table.canonical_name]
|
34
|
+
table_enumerators[table.canonical_name].add_table_enum table.transform(rows)
|
13
35
|
end
|
14
36
|
|
15
37
|
table_enumerators
|
@@ -38,7 +60,8 @@ module NdrImport
|
|
38
60
|
'col_sep' => table_mapping.try(:delimiter),
|
39
61
|
'file_password' => table_mapping.try(:file_password),
|
40
62
|
'liberal_parsing' => table_mapping.try(:liberal_parsing),
|
41
|
-
'xml_record_xpath' => table_mapping.try(:xml_record_xpath)
|
63
|
+
'xml_record_xpath' => table_mapping.try(:xml_record_xpath),
|
64
|
+
'slurp' => table_mapping.try(:slurp)
|
42
65
|
}
|
43
66
|
|
44
67
|
tables = NdrImport::File::Registry.tables(filename, table_mapping.try(:format), options)
|
data/lib/ndr_import/version.rb
CHANGED
data/ndr_import.gemspec
CHANGED
@@ -15,7 +15,7 @@ Gem::Specification.new do |spec|
|
|
15
15
|
# Specify which files should be added to the gem when it is released.
|
16
16
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
17
17
|
spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
|
18
|
-
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
18
|
+
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(docs|test|spec|features)/}) }
|
19
19
|
end
|
20
20
|
spec.files -= %w[.travis.yml] # Not needed in the gem
|
21
21
|
spec.bindir = 'exe'
|
@@ -26,7 +26,7 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_dependency 'activesupport', '>= 5.0', '< 7'
|
27
27
|
spec.add_dependency 'ndr_support', '>= 5.3.2', '< 6'
|
28
28
|
|
29
|
-
spec.add_dependency 'rubyzip', '~>
|
29
|
+
spec.add_dependency 'rubyzip', '~> 2.0'
|
30
30
|
spec.add_dependency 'roo', '~> 2.0'
|
31
31
|
|
32
32
|
spec.add_dependency 'docx', '~> 0.3'
|
@@ -36,15 +36,15 @@ Gem::Specification.new do |spec|
|
|
36
36
|
spec.add_dependency 'pdf-reader', '~> 2.1'
|
37
37
|
spec.add_dependency 'roo-xls'
|
38
38
|
spec.add_dependency 'seven_zip_ruby', '~> 1.2'
|
39
|
-
spec.add_dependency 'spreadsheet', '1.
|
39
|
+
spec.add_dependency 'spreadsheet', '1.2.6'
|
40
40
|
|
41
41
|
spec.required_ruby_version = '>= 2.5'
|
42
42
|
|
43
43
|
spec.add_development_dependency 'bundler'
|
44
|
-
spec.add_development_dependency 'rake', '~>
|
44
|
+
spec.add_development_dependency 'rake', '~> 12.3', '>= 12.3.3'
|
45
45
|
spec.add_development_dependency 'minitest'
|
46
46
|
spec.add_development_dependency 'mocha'
|
47
|
-
spec.add_development_dependency 'ndr_dev_support', '
|
47
|
+
spec.add_development_dependency 'ndr_dev_support', '>= 3.1.3'
|
48
48
|
spec.add_development_dependency 'guard'
|
49
49
|
spec.add_development_dependency 'guard-rubocop'
|
50
50
|
spec.add_development_dependency 'guard-test'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ndr_import
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 9.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- NCRS Development Team
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-01-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activemodel
|
@@ -70,20 +70,14 @@ dependencies:
|
|
70
70
|
requirements:
|
71
71
|
- - "~>"
|
72
72
|
- !ruby/object:Gem::Version
|
73
|
-
version: '
|
74
|
-
- - ">="
|
75
|
-
- !ruby/object:Gem::Version
|
76
|
-
version: 1.2.2
|
73
|
+
version: '2.0'
|
77
74
|
type: :runtime
|
78
75
|
prerelease: false
|
79
76
|
version_requirements: !ruby/object:Gem::Requirement
|
80
77
|
requirements:
|
81
78
|
- - "~>"
|
82
79
|
- !ruby/object:Gem::Version
|
83
|
-
version: '
|
84
|
-
- - ">="
|
85
|
-
- !ruby/object:Gem::Version
|
86
|
-
version: 1.2.2
|
80
|
+
version: '2.0'
|
87
81
|
- !ruby/object:Gem::Dependency
|
88
82
|
name: roo
|
89
83
|
requirement: !ruby/object:Gem::Requirement
|
@@ -208,14 +202,14 @@ dependencies:
|
|
208
202
|
requirements:
|
209
203
|
- - '='
|
210
204
|
- !ruby/object:Gem::Version
|
211
|
-
version: 1.
|
205
|
+
version: 1.2.6
|
212
206
|
type: :runtime
|
213
207
|
prerelease: false
|
214
208
|
version_requirements: !ruby/object:Gem::Requirement
|
215
209
|
requirements:
|
216
210
|
- - '='
|
217
211
|
- !ruby/object:Gem::Version
|
218
|
-
version: 1.
|
212
|
+
version: 1.2.6
|
219
213
|
- !ruby/object:Gem::Dependency
|
220
214
|
name: bundler
|
221
215
|
requirement: !ruby/object:Gem::Requirement
|
@@ -236,14 +230,20 @@ dependencies:
|
|
236
230
|
requirements:
|
237
231
|
- - "~>"
|
238
232
|
- !ruby/object:Gem::Version
|
239
|
-
version: '
|
233
|
+
version: '12.3'
|
234
|
+
- - ">="
|
235
|
+
- !ruby/object:Gem::Version
|
236
|
+
version: 12.3.3
|
240
237
|
type: :development
|
241
238
|
prerelease: false
|
242
239
|
version_requirements: !ruby/object:Gem::Requirement
|
243
240
|
requirements:
|
244
241
|
- - "~>"
|
245
242
|
- !ruby/object:Gem::Version
|
246
|
-
version: '
|
243
|
+
version: '12.3'
|
244
|
+
- - ">="
|
245
|
+
- !ruby/object:Gem::Version
|
246
|
+
version: 12.3.3
|
247
247
|
- !ruby/object:Gem::Dependency
|
248
248
|
name: minitest
|
249
249
|
requirement: !ruby/object:Gem::Requirement
|
@@ -276,9 +276,6 @@ dependencies:
|
|
276
276
|
name: ndr_dev_support
|
277
277
|
requirement: !ruby/object:Gem::Requirement
|
278
278
|
requirements:
|
279
|
-
- - "~>"
|
280
|
-
- !ruby/object:Gem::Version
|
281
|
-
version: '3.1'
|
282
279
|
- - ">="
|
283
280
|
- !ruby/object:Gem::Version
|
284
281
|
version: 3.1.3
|
@@ -286,9 +283,6 @@ dependencies:
|
|
286
283
|
prerelease: false
|
287
284
|
version_requirements: !ruby/object:Gem::Requirement
|
288
285
|
requirements:
|
289
|
-
- - "~>"
|
290
|
-
- !ruby/object:Gem::Version
|
291
|
-
version: '3.1'
|
292
286
|
- - ">="
|
293
287
|
- !ruby/object:Gem::Version
|
294
288
|
version: 3.1.3
|
@@ -413,6 +407,7 @@ files:
|
|
413
407
|
- lib/ndr_import/helpers/file/pdf.rb
|
414
408
|
- lib/ndr_import/helpers/file/word.rb
|
415
409
|
- lib/ndr_import/helpers/file/xml.rb
|
410
|
+
- lib/ndr_import/helpers/file/xml_streaming.rb
|
416
411
|
- lib/ndr_import/helpers/file/zip.rb
|
417
412
|
- lib/ndr_import/mapper.rb
|
418
413
|
- lib/ndr_import/mapping_error.rb
|