ndr_import 8.5.2 → 9.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 66af1dbe18a160875b514f4b7c7bf259b6b43f0565d45beeccb87fd647a5983e
4
- data.tar.gz: 0c2a1a0d00ab0371ef9b9d845487cd8e0b67137816b02cefbce7d554c4df6a32
3
+ metadata.gz: 2237d88e9dee0141fe297fd2eb0d2e76f5434e9bb791fb667505193f30263014
4
+ data.tar.gz: 52746c16790c3da92e64ac14ed7d77786a41c5457ddc0b8190f52d4414e4c6d7
5
5
  SHA512:
6
- metadata.gz: 93e749531fc2834866ecf83a16231b233b2800eab53a8ea1d663fa9525fc014a9f9a77b6fd93b43100fe34bbd2e435e3c77e6b4b5ce538927646ceb39c6551c1
7
- data.tar.gz: d04b1fb034f37f6129213a1615bd7db360690ed1da9e9028990dc0bd70aeff0ac9211a2028912d89700c67907e03453d6b9cf2868b67194a450b9a14f48ca4e1
6
+ metadata.gz: 8b8793a45dea035b2abf04a44c20d2ec4fc90bcfef9c5dec4415fab26abdba2ff218b7d2996a165f3447f86046dfd558bfb363e21221f147fec3db9c7c83668c
7
+ data.tar.gz: eedb044185f1bc7e843e91e634fdc8b88cae9c026344d92e3cdc1046979873d843d476c7a0f7ebfb3c631901b47c9109b6826696124c56b07082ba4a6842af9c
data/.gitignore CHANGED
@@ -5,7 +5,7 @@
5
5
  /gemfiles/Gemfile.*.lock
6
6
  /_yardoc/
7
7
  /coverage/
8
- /doc/
8
+ /docs/_site/
9
9
  /pkg/
10
10
  /spec/reports/
11
11
  /tmp/
@@ -1,6 +1,35 @@
1
1
  ## [Unreleased]
2
2
  *no unreleased changes*
3
3
 
4
+ ## 9.0.3 / 2021-01-04
5
+ ### Fixed
6
+ * Address issue importing multiple files against the same table (#54)
7
+
8
+ ### Changed
9
+ * ensure keyword args are properly splatted for ruby 2.7
10
+
11
+ ### Added
12
+ * Ruby 2.7 to travis matrix
13
+
14
+ ## 9.0.2 / 2020-08-14
15
+ ### Changed
16
+ * Configure Nokogiri with HUGE for large xml files
17
+
18
+ ## 9.0.1 / 2020-03-26
19
+ ### Fixed
20
+ * bumps to `nokogiri` / `spreadsheet` / `rubyzip` dependencies
21
+
22
+ ## 9.0.0 / 2019-07-31
23
+ ### Changed
24
+ * `File::Xml` will now stream XML files by default. Use `slurp: true` for the old behaviour. (#43)
25
+
26
+ ### Added
27
+ * Add `XmlStreaming` helper, for more performant handling of large XML documents with Nokogiri. (#43)
28
+
29
+ ## 8.6.0 / 2019-06-07
30
+ ### Added
31
+ * Allow conditional preservation of blank lines when joining lines in non-tabular data (#41)
32
+
4
33
  ## 8.5.2 / 2019-05-17
5
34
  ### Fixed
6
35
  * Fixed issue with `file_password` option key as a String or Symbol
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # NdrImport [![Build Status](https://travis-ci.org/PublicHealthEngland/ndr_import.svg?branch=master)](https://travis-ci.org/PublicHealthEngland/ndr_import) [![Gem Version](https://badge.fury.io/rb/ndr_import.svg)](https://badge.fury.io/rb/ndr_import)
1
+ # NdrImport [![Build Status](https://travis-ci.org/PublicHealthEngland/ndr_import.svg?branch=master)](https://travis-ci.org/PublicHealthEngland/ndr_import) [![Gem Version](https://badge.fury.io/rb/ndr_import.svg)](https://badge.fury.io/rb/ndr_import) [![Documentation](https://img.shields.io/badge/ndr_import-docs-blue.svg)](https://www.rubydoc.info/gems/ndr_import)
2
2
 
3
3
  This is the Public Health England (PHE) National Disease Registers (NDR) Import ETL ruby gem, providing:
4
4
 
@@ -3,7 +3,7 @@ file safety:
3
3
  ".gitignore":
4
4
  comments: whole file re-reviewed
5
5
  reviewed_by: josh.pencheon
6
- safe_revision: 3ef51291c413fd5772d61a8394359146a02ae628
6
+ safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
7
7
  ".hound.yml":
8
8
  comments:
9
9
  reviewed_by: timgentry
@@ -14,12 +14,12 @@ file safety:
14
14
  safe_revision: b09e268ff9c8349b914aa1b7ba888e1d39f97e4a
15
15
  ".travis.yml":
16
16
  comments:
17
- reviewed_by: josh.pencheon
18
- safe_revision: d3d9a987befeecb122a448d8d06e66d74da13fb5
17
+ reviewed_by: ollietulloch
18
+ safe_revision: ed8513d290796b9cb9554c4ebe6e87ac21fbbeae
19
19
  CHANGELOG.md:
20
20
  comments:
21
- reviewed_by: josh.pencheon
22
- safe_revision: bd60b56117e65f106d7cb02e0c5772b9e5ec470e
21
+ reviewed_by: ollietulloch
22
+ safe_revision: 92d2d9d56f17c143709e25560d7620aaf147008f
23
23
  CODE_OF_CONDUCT.md:
24
24
  comments:
25
25
  reviewed_by: timgentry
@@ -39,7 +39,7 @@ file safety:
39
39
  README.md:
40
40
  comments:
41
41
  reviewed_by: josh.pencheon
42
- safe_revision: e1d967c10059e8c635452838c3f3dd2b969d9ae4
42
+ safe_revision: 1bc459db8970dde36e9b240b6dd08cca629664e3
43
43
  Rakefile:
44
44
  comments:
45
45
  reviewed_by: josh.pencheon
@@ -52,6 +52,70 @@ file safety:
52
52
  comments:
53
53
  reviewed_by: josh.pencheon
54
54
  safe_revision: e1d967c10059e8c635452838c3f3dd2b969d9ae4
55
+ docs/Gemfile:
56
+ comments:
57
+ reviewed_by: josh.pencheon
58
+ safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
59
+ docs/Gemfile.lock:
60
+ comments:
61
+ reviewed_by: ollietulloch
62
+ safe_revision: 6f274715bb341c3070190f04f67af9500b510580
63
+ docs/_config.yml:
64
+ comments:
65
+ reviewed_by: josh.pencheon
66
+ safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
67
+ docs/_includes/footer.html:
68
+ comments:
69
+ reviewed_by: josh.pencheon
70
+ safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
71
+ docs/_includes/header.html:
72
+ comments:
73
+ reviewed_by: josh.pencheon
74
+ safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
75
+ docs/capturing-data.md:
76
+ comments:
77
+ reviewed_by: josh.pencheon
78
+ safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
79
+ docs/date-formats.md:
80
+ comments:
81
+ reviewed_by: josh.pencheon
82
+ safe_revision: fa21d6d967bf132800b456b585795beec80b08a3
83
+ docs/getting-started.md:
84
+ comments:
85
+ reviewed_by: josh.pencheon
86
+ safe_revision: fa21d6d967bf132800b456b585795beec80b08a3
87
+ docs/identifying-and-splitting-records.md:
88
+ comments:
89
+ reviewed_by: josh.pencheon
90
+ safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
91
+ docs/inbuilt-cleaning-methods.md:
92
+ comments:
93
+ reviewed_by: josh.pencheon
94
+ safe_revision: 694b57ce14e0709fc4d31a1357f8416e98f5de91
95
+ docs/index.md:
96
+ comments:
97
+ reviewed_by: josh.pencheon
98
+ safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
99
+ docs/local-code-transformation-in-yaml-mappings.md:
100
+ comments:
101
+ reviewed_by: josh.pencheon
102
+ safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
103
+ docs/non-tabular-mappings.md:
104
+ comments:
105
+ reviewed_by: josh.pencheon
106
+ safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
107
+ docs/priority-field-mapping.md:
108
+ comments:
109
+ reviewed_by: josh.pencheon
110
+ safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
111
+ docs/standard-yaml-mappings.md:
112
+ comments:
113
+ reviewed_by: josh.pencheon
114
+ safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
115
+ docs/yaml-mapping-user-guide.md:
116
+ comments:
117
+ reviewed_by: josh.pencheon
118
+ safe_revision: 02aaf91b116c510a7c16f2b6f2389736b2742f49
55
119
  exe/pdf_acro_form_to_yaml:
56
120
  comments:
57
121
  reviewed_by: josh.pencheon
@@ -82,8 +146,8 @@ file safety:
82
146
  safe_revision: 24d6449fd0612552f132dfbf4cada2ae28d0469e
83
147
  lib/ndr_import/csv_library.rb:
84
148
  comments:
85
- reviewed_by: josh.pencheon
86
- safe_revision: be12e57519d3737e8d3901d7b01485c6995708dd
149
+ reviewed_by: ollietulloch
150
+ safe_revision: acc72173b81702d55fe3aff78054425609a4d339
87
151
  lib/ndr_import/file/acro_form.rb:
88
152
  comments:
89
153
  reviewed_by: josh.pencheon
@@ -139,7 +203,7 @@ file safety:
139
203
  lib/ndr_import/file/xml.rb:
140
204
  comments:
141
205
  reviewed_by: josh.pencheon
142
- safe_revision: 4ab72f84201c2d5f0147b7dfd041f488f6ff0422
206
+ safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
143
207
  lib/ndr_import/file/zip.rb:
144
208
  comments:
145
209
  reviewed_by: timgentry
@@ -150,8 +214,8 @@ file safety:
150
214
  safe_revision: dfc958d44b6c58355445fa395db08a62213ee709
151
215
  lib/ndr_import/helpers/file/delimited.rb:
152
216
  comments:
153
- reviewed_by: josh.pencheon
154
- safe_revision: 607c0668f1fffd70d181bc1a31c4f56eed5f6189
217
+ reviewed_by: ollietulloch
218
+ safe_revision: acc72173b81702d55fe3aff78054425609a4d339
155
219
  lib/ndr_import/helpers/file/excel.rb:
156
220
  comments:
157
221
  reviewed_by: joshpencheon
@@ -166,8 +230,13 @@ file safety:
166
230
  safe_revision: 45da71ebd3acbc0fe53755bcd75483ba17cb6924
167
231
  lib/ndr_import/helpers/file/xml.rb:
168
232
  comments:
233
+ reviewed_by: ollietulloch
234
+ safe_revision: 4d337bd233f7e60cf9d363c92400f21269a28da7
235
+ lib/ndr_import/helpers/file/xml_streaming.rb:
236
+ comments: uses SafePath and Shellwords when accessing filesystem, or making system
237
+ calls
169
238
  reviewed_by: josh.pencheon
170
- safe_revision: d2245268ec6a0e4f60c521d171a820f299632c4f
239
+ safe_revision: 55e502bb4445cb8b985e530e8eb26d92b574ded9
171
240
  lib/ndr_import/helpers/file/zip.rb:
172
241
  comments:
173
242
  reviewed_by: timgentry
@@ -187,7 +256,7 @@ file safety:
187
256
  lib/ndr_import/non_tabular/column_mapping.rb:
188
257
  comments:
189
258
  reviewed_by: josh.pencheon
190
- safe_revision: f216687d0bba7f2940f74a3353a32be3c900c194
259
+ safe_revision: bb44ade56a2151706eede2c31142440ccf49e6f6
191
260
  lib/ndr_import/non_tabular/line.rb:
192
261
  comments:
193
262
  reviewed_by: timgentry
@@ -198,8 +267,8 @@ file safety:
198
267
  safe_revision: cf382902508a21a95b80ac4582fbbd117164e80e
199
268
  lib/ndr_import/non_tabular/record.rb:
200
269
  comments:
201
- reviewed_by: timgentry
202
- safe_revision: 63b1e8c275bd63377309d9b124176f39f75576a9
270
+ reviewed_by: josh.pencheon
271
+ safe_revision: bb44ade56a2151706eede2c31142440ccf49e6f6
203
272
  lib/ndr_import/non_tabular/table.rb:
204
273
  comments:
205
274
  reviewed_by: josh.pencheon
@@ -222,16 +291,16 @@ file safety:
222
291
  safe_revision: a69d4a57ddcf13cdc13c27bd2eb91a395fa7ea36
223
292
  lib/ndr_import/universal_importer_helper.rb:
224
293
  comments:
225
- reviewed_by: josh.pencheon
226
- safe_revision: a69d4a57ddcf13cdc13c27bd2eb91a395fa7ea36
294
+ reviewed_by: ollietulloch
295
+ safe_revision: 830de0f8cb139c5f61525652b424423935cfc7ac
227
296
  lib/ndr_import/unmapped_data_error.rb:
228
297
  comments:
229
298
  reviewed_by: josh.pencheon
230
299
  safe_revision: 5cd2cd0b3a1e254d30d4acc28c6731825a1f84f5
231
300
  lib/ndr_import/version.rb:
232
301
  comments: another check?
233
- reviewed_by: josh.pencheon
234
- safe_revision: bd60b56117e65f106d7cb02e0c5772b9e5ec470e
302
+ reviewed_by: ollietulloch
303
+ safe_revision: 92d2d9d56f17c143709e25560d7620aaf147008f
235
304
  lib/ndr_import/xml/table.rb:
236
305
  comments:
237
306
  reviewed_by: josh.pencheon
@@ -239,7 +308,7 @@ file safety:
239
308
  ndr_import.gemspec:
240
309
  comments:
241
310
  reviewed_by: josh.pencheon
242
- safe_revision: d3d9a987befeecb122a448d8d06e66d74da13fb5
311
+ safe_revision: 95e6ee9997d06471fe6f2f169c3c701471086371
243
312
  test/file/acro_form_test.rb:
244
313
  comments:
245
314
  reviewed_by: josh.pencheon
@@ -283,7 +352,7 @@ file safety:
283
352
  test/file/xml_test.rb:
284
353
  comments:
285
354
  reviewed_by: josh.pencheon
286
- safe_revision: 4ab72f84201c2d5f0147b7dfd041f488f6ff0422
355
+ safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
287
356
  test/file/zip_test.rb:
288
357
  comments:
289
358
  reviewed_by: timgentry
@@ -308,6 +377,10 @@ file safety:
308
377
  comments:
309
378
  reviewed_by: timgentry
310
379
  safe_revision: 9abdd6ced1d0c90ce8dd88abee4eb6472c7ff0d6
380
+ test/helpers/file/xml_streaming_test.rb:
381
+ comments:
382
+ reviewed_by: josh.pencheon
383
+ safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
311
384
  test/helpers/file/xml_test.rb:
312
385
  comments:
313
386
  reviewed_by: timgentry
@@ -330,8 +403,8 @@ file safety:
330
403
  safe_revision: a69d4a57ddcf13cdc13c27bd2eb91a395fa7ea36
331
404
  test/non_tabular_file_helper_test.rb:
332
405
  comments:
333
- reviewed_by: timgentry
334
- safe_revision: cf382902508a21a95b80ac4582fbbd117164e80e
406
+ reviewed_by: josh.pencheon
407
+ safe_revision: bb44ade56a2151706eede2c31142440ccf49e6f6
335
408
  test/pdf_form/table_test.rb:
336
409
  comments:
337
410
  reviewed_by: josh.pencheon
@@ -356,6 +429,10 @@ file safety:
356
429
  comments:
357
430
  reviewed_by: timgentry
358
431
  safe_revision: dab4b8a3e4b29d85eccd971e79936982d888cffd
432
+ test/resources/claims_utf16be_but_isnt.xml:
433
+ comments:
434
+ reviewed_by: josh.pencheon
435
+ safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
359
436
  test/resources/filesystem_paths.yml:
360
437
  comments:
361
438
  reviewed_by: timgentry
@@ -492,6 +569,10 @@ file safety:
492
569
  comments:
493
570
  reviewed_by: timgentry
494
571
  safe_revision: 31fb1935f4578729d8786eea41cf0ce0a19be1cd
572
+ test/resources/two_files_single_table_mapping.zip:
573
+ comments:
574
+ reviewed_by: ollietulloch
575
+ safe_revision: 830de0f8cb139c5f61525652b424423935cfc7ac
495
576
  test/resources/txt_file_xls_extension.xls:
496
577
  comments:
497
578
  reviewed_by: timgentry
@@ -542,8 +623,8 @@ file safety:
542
623
  safe_revision: 93ccee82fc2165d1ca2d9b03d146ae03e769ea96
543
624
  test/universal_importer_helper_test.rb:
544
625
  comments:
545
- reviewed_by: josh.pencheon
546
- safe_revision: c3183e522bce50008df576ceb47fe4761ab8f966
626
+ reviewed_by: ollietulloch
627
+ safe_revision: 830de0f8cb139c5f61525652b424423935cfc7ac
547
628
  test/xml/table_test.rb:
548
629
  comments:
549
630
  reviewed_by: josh.pencheon
@@ -12,8 +12,8 @@ class << CSVLibrary
12
12
 
13
13
  # Ensure that we can pass "mode" straight through the underlying IO object
14
14
  def foreach(path, **options, &block)
15
- return to_enum(__method__, path, options) unless block
16
- open(path, options.delete(:mode) || 'r', options) do |csv|
15
+ return to_enum(__method__, path, **options) unless block
16
+ open(path, options.delete(:mode) || 'r', **options) do |csv|
17
17
  csv.each(&block)
18
18
  end
19
19
  end
@@ -1,5 +1,6 @@
1
1
  require 'ndr_support/safe_file'
2
2
  require 'ndr_import/helpers/file/xml'
3
+ require 'ndr_import/helpers/file/xml_streaming'
3
4
  require_relative 'registry'
4
5
 
5
6
  module NdrImport
@@ -9,6 +10,7 @@ module NdrImport
9
10
  # This class is a xml file handler that returns a single table.
10
11
  class Xml < Base
11
12
  include NdrImport::Helpers::File::Xml
13
+ include NdrImport::Helpers::File::XmlStreaming
12
14
 
13
15
  private
14
16
 
@@ -16,9 +18,14 @@ module NdrImport
16
18
  def rows(&block)
17
19
  return enum_for(:rows) unless block
18
20
 
19
- doc = read_xml_file(@filename)
21
+ xpath = @options['xml_record_xpath']
20
22
 
21
- doc.xpath(@options['xml_record_xpath']).each(&block)
23
+ if @options['slurp']
24
+ doc = read_xml_file(@filename)
25
+ doc.xpath(xpath).each(&block)
26
+ else
27
+ each_node(@filename, xpath, &block)
28
+ end
22
29
  rescue StandardError => e
23
30
  raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
24
31
  end
@@ -36,7 +36,7 @@ module NdrImport
36
36
 
37
37
  # By now, we know `encodings` should let us read the whole
38
38
  # file successfully; if there are problems, we should crash.
39
- CSVLibrary.foreach(safe_path, encodings) do |line|
39
+ CSVLibrary.foreach(safe_path, **encodings) do |line|
40
40
  yield line.map(&:to_s)
41
41
  end
42
42
  end
@@ -73,7 +73,7 @@ module NdrImport
73
73
 
74
74
  row_num = 0
75
75
  # Iterate through the file; if we reach the end, this encoding worked:
76
- CSVLibrary.foreach(safe_path, options) { |_line| row_num += 1 }
76
+ CSVLibrary.foreach(safe_path, **options) { |_line| row_num += 1 }
77
77
  return options
78
78
  rescue ArgumentError => e
79
79
  next if e.message =~ /invalid byte sequence/ # This encoding didn't work
@@ -15,10 +15,13 @@ module NdrImport
15
15
 
16
16
  require 'nokogiri'
17
17
 
18
- Nokogiri::XML(ensure_utf8! file_data).tap do |doc|
19
- doc.encoding = 'UTF-8'
20
- emulate_strict_mode_fatal_check!(doc)
18
+ doc = Nokogiri::XML((ensure_utf8! file_data)) do |config|
19
+ config.huge
21
20
  end
21
+ doc.encoding = 'UTF-8'
22
+ emulate_strict_mode_fatal_check!(doc)
23
+
24
+ doc
22
25
  end
23
26
 
24
27
  # Nokogiri can give a `STRICT` parse option to libxml, but our friendly
@@ -0,0 +1,183 @@
1
+ require 'shellwords'
2
+
3
+ require 'ndr_support/safe_file'
4
+ require 'ndr_support/utf8_encoding'
5
+
6
+ module NdrImport
7
+ module Helpers
8
+ module File
9
+ # This mixin adds XML streaming functionality, to support more performant handling
10
+ # of large files by Nokogiri. Uses the `XML::Reader` API, and maintains a temporary
11
+ # DOM as the XML is streamed to allow XPath querying from the root node.
12
+ #
13
+ # If the system has `iconv` available, will attempt to verify the encoding of the
14
+ # file being read externally, so it can be streamed in to Ruby. Otherwise, will load
15
+ # the raw data in to check the encoding, but still stream it through Nokogiri's parser.
16
+ module XmlStreaming
17
+ # Base error for all streaming-specific issues.
18
+ class Error < StandardError; end
19
+
20
+ # Raised if nested tags are encountered, which the streaming approach cannot handle.
21
+ class NestingError < Error
22
+ def initialize(node)
23
+ super <<~STR
24
+ Element '#{node.name}' was found nested inside another of the same type.
25
+ This is not accessible, and a known limitation of XmlStreaming.
26
+ STR
27
+ end
28
+ end
29
+
30
+ # Object to track state as the XML is iterated over, and detect
31
+ # when an element of interest is entered.
32
+ class Cursor
33
+ # wrapper to hold a representation of each element we descend into:
34
+ StackItem = Struct.new(:name, :attrs, :empty)
35
+
36
+ def initialize(xpath)
37
+ @xpath = xpath
38
+ @stack = []
39
+ @match_depth = nil
40
+ end
41
+
42
+ # Has this cursor already passed inside a similar node?
43
+ def in?(node)
44
+ @stack.detect { |item| item.name == node.name }
45
+ end
46
+
47
+ def enter(node)
48
+ @stack.push StackItem.new(node.name, node.attributes, node.empty_element?)
49
+ end
50
+
51
+ def leave(_node)
52
+ @stack.pop
53
+ @match_depth = nil if @match_depth && @stack.length < @match_depth
54
+ end
55
+
56
+ # Does the element that the cursor is currently on match what
57
+ # is being looked for?
58
+ def matches?
59
+ # Can't match again if we're inside a match already:
60
+ return false if @matched_depth
61
+
62
+ match = current_stack_match?
63
+
64
+ # "empty element" matches are yielded immediately, without
65
+ # tagging the stack as having matched, because there won't
66
+ # be an equivalent closing tag to end the match with later.
67
+ if in_empty_element?
68
+ @stack.pop
69
+ elsif match
70
+ @match_depth = @stack.length
71
+ end
72
+
73
+ match
74
+ end
75
+
76
+ private
77
+
78
+ def in_empty_element?
79
+ @stack.last.empty
80
+ end
81
+
82
+ # Does the current state of the stack mean we've met the xpath
83
+ # criteria? Must be an exact match, not just matching a parent
84
+ # element in the DOM.
85
+ def current_stack_match?
86
+ parent_stack = @stack[0..-2]
87
+
88
+ return false unless dom_stubs[@stack].at_xpath(@xpath)
89
+
90
+ parent_stack.empty? || !dom_stubs[parent_stack].at_xpath(@xpath)
91
+ end
92
+
93
+ # A cached collection of DOM fragments, to represent the structure
94
+ # necessary to use xpath to descend into the main document's DOM.
95
+ def dom_stubs
96
+ @dom_stubs ||= Hash.new do |hash, items|
97
+ hash[items.dup] = Nokogiri::XML::Builder.new do |dom|
98
+ add_items_to_dom(dom, items.dup)
99
+ end.doc
100
+ end
101
+ end
102
+
103
+ # Helper to recursively build XML fragment.
104
+ def add_items_to_dom(dom, items)
105
+ item = items.shift
106
+ dom.send(item.name, item.attrs) do
107
+ add_items_to_dom(dom, items) if items.any?
108
+ end
109
+ end
110
+ end
111
+
112
+ include UTF8Encoding
113
+
114
+ # Streams the contents of the given `safe_path`, and yields
115
+ # each element matching `xpath` as they're found.
116
+ #
117
+ # In the case of dodgy encoding, may fall back to slurping the
118
+ # file, but will still use stream parsing for XML.
119
+ def each_node(safe_path, xpath, &block)
120
+ return enum_for(:each_node, safe_path, xpath) unless block
121
+
122
+ require 'nokogiri'
123
+
124
+ with_encoding_check(safe_path) do |stream, encoding|
125
+ stream_xml_nodes(stream, xpath, encoding, &block)
126
+ end
127
+ end
128
+
129
+ private
130
+
131
+ # We need to ensure the raw data is UTF8 before we start streaming
132
+ # it with nokogiri. If we can do an external check, great. Otherwise,
133
+ # we need to slurp and convert the raw data before presenting it.
134
+ def with_encoding_check(safe_path)
135
+ forced_encoding = nil
136
+
137
+ stream = ::File.open(SafeFile.safepath_to_string(safe_path))
138
+
139
+ unless external_utf8_check?(safe_path)
140
+ stream = StringIO.new ensure_utf8!(stream.read)
141
+ forced_encoding = 'UTF8'
142
+ end
143
+
144
+ yield stream, forced_encoding
145
+ end
146
+
147
+ # Use iconv, if available, to check raw data encoding:
148
+ def external_utf8_check?(safe_path)
149
+ iconv = system('command -v iconv > /dev/null 2>&1')
150
+ return false unless iconv
151
+
152
+ path = SafeFile.safepath_to_string(safe_path)
153
+ system("iconv -f UTF-8 #{Shellwords.escape(path)} > /dev/null 2>&1")
154
+ end
155
+
156
+ def stream_xml_nodes(io, node_xpath, encoding = nil)
157
+ # Track nesting as the cursor moves through the document:
158
+ cursor = Cursor.new(node_xpath)
159
+
160
+ # If markup isn't well-formed, try to work around it:
161
+ options = Nokogiri::XML::ParseOptions::RECOVER
162
+ reader = Nokogiri::XML::Reader(io, nil, encoding, options)
163
+
164
+ reader.each do |node|
165
+ case node.node_type
166
+ when Nokogiri::XML::Reader::TYPE_ELEMENT # "opening tag"
167
+ raise NestingError, node if cursor.in?(node)
168
+
169
+ cursor.enter(node)
170
+ next unless cursor.matches?
171
+
172
+ # The xpath matched - construct a DOM fragment to yield back:
173
+ element = Nokogiri::XML(node.outer_xml).at("./#{node.name}")
174
+ yield element
175
+ when Nokogiri::XML::Reader::TYPE_END_ELEMENT # "closing tag"
176
+ cursor.leave(node)
177
+ end
178
+ end
179
+ end
180
+ end
181
+ end
182
+ end
183
+ end
@@ -6,7 +6,7 @@ module NdrImport
6
6
  # the logic associated with finding matching lines of source data and subsequently
7
7
  # capturing arrays of values within them.
8
8
  class ColumnMapping
9
- attr_accessor :name, :cell_mapping, :lines, :capture, :join
9
+ attr_accessor :name, :cell_mapping, :lines, :capture, :join, :preserve_blank_lines
10
10
 
11
11
  def initialize(column_mapping)
12
12
  @name = column_mapping['rawtext_name'] ||
@@ -18,6 +18,7 @@ module NdrImport
18
18
 
19
19
  @lines = @cell_mapping['lines']
20
20
  @join = @cell_mapping['join']
21
+ @preserve_blank_lines = @cell_mapping['preserve_blank_lines']
21
22
  end
22
23
 
23
24
  # This method returns the range of matching source data lines. If the range is a
@@ -74,7 +74,10 @@ module NdrImport
74
74
  begin
75
75
  matches = get_matches(column_mapping)
76
76
  # Join the lines together (blank ones are dropped unless `preserve_blank_lines` is set) and add to the array of cells
77
- cells << matches.select { |value| !value.blank? }.join(column_mapping.join || '')
77
+ lines = matches.select do |value|
78
+ column_mapping.preserve_blank_lines ? value : value.present?
79
+ end
80
+ cells << lines.join(column_mapping.join || '')
78
81
  rescue RegexpRange::PatternMatchError
79
82
  cells << nil
80
83
  end
@@ -1,3 +1,5 @@
1
+ require 'shellwords'
2
+
1
3
  require 'ndr_import/file/registry'
2
4
 
3
5
  module NdrImport
@@ -5,11 +7,31 @@ module NdrImport
5
7
  # complexity of enumerating over files and tables (which should be universally useful).
6
8
  # It is assumed that the host module/class defines `unzip_path`.
7
9
  module UniversalImporterHelper
10
+ # Helper class to allow multiple source enumerators to contribute to one overall table.
11
+ class TableEnumProxy
12
+ include Enumerable
13
+
14
+ def initialize
15
+ @table_enums = []
16
+ end
17
+
18
+ def add_table_enum(table_enum)
19
+ @table_enums << table_enum
20
+ end
21
+
22
+ def each(&block)
23
+ return enum_for(:each) unless block
24
+
25
+ @table_enums.each { |table_enum| table_enum.each(&block) }
26
+ end
27
+ end
28
+
8
29
  def table_enumerators(filename)
9
30
  table_enumerators = {}
31
+ table_enumerators = Hash.new { |hash, key| hash[key] = TableEnumProxy.new }
10
32
 
11
33
  extract(filename).each do |table, rows|
12
- table_enumerators[table.canonical_name] = table.transform(rows)
34
+ table_enumerators[table.canonical_name].add_table_enum table.transform(rows)
13
35
  end
14
36
 
15
37
  table_enumerators
@@ -38,7 +60,8 @@ module NdrImport
38
60
  'col_sep' => table_mapping.try(:delimiter),
39
61
  'file_password' => table_mapping.try(:file_password),
40
62
  'liberal_parsing' => table_mapping.try(:liberal_parsing),
41
- 'xml_record_xpath' => table_mapping.try(:xml_record_xpath)
63
+ 'xml_record_xpath' => table_mapping.try(:xml_record_xpath),
64
+ 'slurp' => table_mapping.try(:slurp)
42
65
  }
43
66
 
44
67
  tables = NdrImport::File::Registry.tables(filename, table_mapping.try(:format), options)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
  # This stores the current version of the NdrImport gem
3
3
  module NdrImport
4
- VERSION = '8.5.2'.freeze
4
+ VERSION = '9.0.3'.freeze
5
5
  end
@@ -15,7 +15,7 @@ Gem::Specification.new do |spec|
15
15
  # Specify which files should be added to the gem when it is released.
16
16
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
17
17
  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
18
- `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(docs|test|spec|features)/}) }
19
19
  end
20
20
  spec.files -= %w[.travis.yml] # Not needed in the gem
21
21
  spec.bindir = 'exe'
@@ -26,7 +26,7 @@ Gem::Specification.new do |spec|
26
26
  spec.add_dependency 'activesupport', '>= 5.0', '< 7'
27
27
  spec.add_dependency 'ndr_support', '>= 5.3.2', '< 6'
28
28
 
29
- spec.add_dependency 'rubyzip', '~> 1.2', '>= 1.2.2'
29
+ spec.add_dependency 'rubyzip', '~> 2.0'
30
30
  spec.add_dependency 'roo', '~> 2.0'
31
31
 
32
32
  spec.add_dependency 'docx', '~> 0.3'
@@ -36,15 +36,15 @@ Gem::Specification.new do |spec|
36
36
  spec.add_dependency 'pdf-reader', '~> 2.1'
37
37
  spec.add_dependency 'roo-xls'
38
38
  spec.add_dependency 'seven_zip_ruby', '~> 1.2'
39
- spec.add_dependency 'spreadsheet', '1.0.3'
39
+ spec.add_dependency 'spreadsheet', '1.2.6'
40
40
 
41
41
  spec.required_ruby_version = '>= 2.5'
42
42
 
43
43
  spec.add_development_dependency 'bundler'
44
- spec.add_development_dependency 'rake', '~> 10.0'
44
+ spec.add_development_dependency 'rake', '~> 12.3', '>= 12.3.3'
45
45
  spec.add_development_dependency 'minitest'
46
46
  spec.add_development_dependency 'mocha'
47
- spec.add_development_dependency 'ndr_dev_support', '~> 3.1', '>= 3.1.3'
47
+ spec.add_development_dependency 'ndr_dev_support', '>= 3.1.3'
48
48
  spec.add_development_dependency 'guard'
49
49
  spec.add_development_dependency 'guard-rubocop'
50
50
  spec.add_development_dependency 'guard-test'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ndr_import
3
3
  version: !ruby/object:Gem::Version
4
- version: 8.5.2
4
+ version: 9.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - NCRS Development Team
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-05-17 00:00:00.000000000 Z
11
+ date: 2021-01-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activemodel
@@ -70,20 +70,14 @@ dependencies:
70
70
  requirements:
71
71
  - - "~>"
72
72
  - !ruby/object:Gem::Version
73
- version: '1.2'
74
- - - ">="
75
- - !ruby/object:Gem::Version
76
- version: 1.2.2
73
+ version: '2.0'
77
74
  type: :runtime
78
75
  prerelease: false
79
76
  version_requirements: !ruby/object:Gem::Requirement
80
77
  requirements:
81
78
  - - "~>"
82
79
  - !ruby/object:Gem::Version
83
- version: '1.2'
84
- - - ">="
85
- - !ruby/object:Gem::Version
86
- version: 1.2.2
80
+ version: '2.0'
87
81
  - !ruby/object:Gem::Dependency
88
82
  name: roo
89
83
  requirement: !ruby/object:Gem::Requirement
@@ -208,14 +202,14 @@ dependencies:
208
202
  requirements:
209
203
  - - '='
210
204
  - !ruby/object:Gem::Version
211
- version: 1.0.3
205
+ version: 1.2.6
212
206
  type: :runtime
213
207
  prerelease: false
214
208
  version_requirements: !ruby/object:Gem::Requirement
215
209
  requirements:
216
210
  - - '='
217
211
  - !ruby/object:Gem::Version
218
- version: 1.0.3
212
+ version: 1.2.6
219
213
  - !ruby/object:Gem::Dependency
220
214
  name: bundler
221
215
  requirement: !ruby/object:Gem::Requirement
@@ -236,14 +230,20 @@ dependencies:
236
230
  requirements:
237
231
  - - "~>"
238
232
  - !ruby/object:Gem::Version
239
- version: '10.0'
233
+ version: '12.3'
234
+ - - ">="
235
+ - !ruby/object:Gem::Version
236
+ version: 12.3.3
240
237
  type: :development
241
238
  prerelease: false
242
239
  version_requirements: !ruby/object:Gem::Requirement
243
240
  requirements:
244
241
  - - "~>"
245
242
  - !ruby/object:Gem::Version
246
- version: '10.0'
243
+ version: '12.3'
244
+ - - ">="
245
+ - !ruby/object:Gem::Version
246
+ version: 12.3.3
247
247
  - !ruby/object:Gem::Dependency
248
248
  name: minitest
249
249
  requirement: !ruby/object:Gem::Requirement
@@ -276,9 +276,6 @@ dependencies:
276
276
  name: ndr_dev_support
277
277
  requirement: !ruby/object:Gem::Requirement
278
278
  requirements:
279
- - - "~>"
280
- - !ruby/object:Gem::Version
281
- version: '3.1'
282
279
  - - ">="
283
280
  - !ruby/object:Gem::Version
284
281
  version: 3.1.3
@@ -286,9 +283,6 @@ dependencies:
286
283
  prerelease: false
287
284
  version_requirements: !ruby/object:Gem::Requirement
288
285
  requirements:
289
- - - "~>"
290
- - !ruby/object:Gem::Version
291
- version: '3.1'
292
286
  - - ">="
293
287
  - !ruby/object:Gem::Version
294
288
  version: 3.1.3
@@ -413,6 +407,7 @@ files:
413
407
  - lib/ndr_import/helpers/file/pdf.rb
414
408
  - lib/ndr_import/helpers/file/word.rb
415
409
  - lib/ndr_import/helpers/file/xml.rb
410
+ - lib/ndr_import/helpers/file/xml_streaming.rb
416
411
  - lib/ndr_import/helpers/file/zip.rb
417
412
  - lib/ndr_import/mapper.rb
418
413
  - lib/ndr_import/mapping_error.rb