ndr_import 8.6.0 → 9.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ffddef09a58ca08e5d491449ef48cd829dfd045bf6f38b5289cff240859940e0
4
- data.tar.gz: 30540a5af117b4ae57e002794f26df18544a99b07c7fdfb95a2ad552e9efa7b6
3
+ metadata.gz: 500566629ada1bfbab0def117060fec471bf2fab2c320a112b05a97a8474ee95
4
+ data.tar.gz: 572dc475c3704f2f749e9de11e031829e593d1e2a6fa24a6755342f2f9bdbaae
5
5
  SHA512:
6
- metadata.gz: 2ff3840cb513c4d0253c9c40db709563bf7bd8d62777ed49358e9a67a5304c03b7f18a966bb894099a4f7f9a64933bc11151048e29ce50989a791732cceee74c
7
- data.tar.gz: de437a4e4e62c77e93050df6c0d50c486d2faca290830acb1d719ddadc0b2165bd5405a11a38db4bd0ca438432b082fddafb42edaf41b09a98efcdfd79c63f55
6
+ metadata.gz: 8c8491043c7e58d0ca4953c58ff12914c1dd8deb587c67a9cbd488e8420b26f9e8ea55f7b23cbea68bf02956c3215602932dee5ffd23506d4017ac997378b835
7
+ data.tar.gz: 03c24de311a201a871f752773f0d43c38e2c9e49d31585759c0b009cc76adbf6668350d0bc7c6c2746f9b84998bd384055b3fa47806fc242a3fa49fdb3ed4851
@@ -1,6 +1,13 @@
1
1
  ## [Unreleased]
2
2
  *no unreleased changes*
3
3
 
4
+ ## 9.0.0 / 2019-07-31
5
+ ### Changed
6
+ * `File::Xml` will now stream XML files by default. Use `slurp: true` for the old behaviour. (#43)
7
+
8
+ ### Added
9
+ * Add `XmlStreaming` helper, for more performant handling of large XML documents with Nokogiri. (#43)
10
+
4
11
  ## 8.6.0 / 2019-06-07
5
12
  ### Added
6
13
  * Allow conditional preservation of blank lines when joining lines in non-tabular data (#41)
@@ -19,7 +19,7 @@ file safety:
19
19
  CHANGELOG.md:
20
20
  comments:
21
21
  reviewed_by: josh.pencheon
22
- safe_revision: 53c17f14ee19a9ea83d044508093147daa32ba3d
22
+ safe_revision: 0a4e05d45ee65e25edc36de84d3c450cc15dc3ed
23
23
  CODE_OF_CONDUCT.md:
24
24
  comments:
25
25
  reviewed_by: timgentry
@@ -139,7 +139,7 @@ file safety:
139
139
  lib/ndr_import/file/xml.rb:
140
140
  comments:
141
141
  reviewed_by: josh.pencheon
142
- safe_revision: 4ab72f84201c2d5f0147b7dfd041f488f6ff0422
142
+ safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
143
143
  lib/ndr_import/file/zip.rb:
144
144
  comments:
145
145
  reviewed_by: timgentry
@@ -168,6 +168,11 @@ file safety:
168
168
  comments:
169
169
  reviewed_by: josh.pencheon
170
170
  safe_revision: d2245268ec6a0e4f60c521d171a820f299632c4f
171
+ lib/ndr_import/helpers/file/xml_streaming.rb:
172
+ comments: uses SafePath and Shellwords when accessing filesystem, or making system
173
+ calls
174
+ reviewed_by: josh.pencheon
175
+ safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
171
176
  lib/ndr_import/helpers/file/zip.rb:
172
177
  comments:
173
178
  reviewed_by: timgentry
@@ -223,7 +228,7 @@ file safety:
223
228
  lib/ndr_import/universal_importer_helper.rb:
224
229
  comments:
225
230
  reviewed_by: josh.pencheon
226
- safe_revision: a69d4a57ddcf13cdc13c27bd2eb91a395fa7ea36
231
+ safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
227
232
  lib/ndr_import/unmapped_data_error.rb:
228
233
  comments:
229
234
  reviewed_by: josh.pencheon
@@ -231,7 +236,7 @@ file safety:
231
236
  lib/ndr_import/version.rb:
232
237
  comments: another check?
233
238
  reviewed_by: josh.pencheon
234
- safe_revision: 53c17f14ee19a9ea83d044508093147daa32ba3d
239
+ safe_revision: 0a4e05d45ee65e25edc36de84d3c450cc15dc3ed
235
240
  lib/ndr_import/xml/table.rb:
236
241
  comments:
237
242
  reviewed_by: josh.pencheon
@@ -239,7 +244,7 @@ file safety:
239
244
  ndr_import.gemspec:
240
245
  comments:
241
246
  reviewed_by: josh.pencheon
242
- safe_revision: d3d9a987befeecb122a448d8d06e66d74da13fb5
247
+ safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
243
248
  test/file/acro_form_test.rb:
244
249
  comments:
245
250
  reviewed_by: josh.pencheon
@@ -283,7 +288,7 @@ file safety:
283
288
  test/file/xml_test.rb:
284
289
  comments:
285
290
  reviewed_by: josh.pencheon
286
- safe_revision: 4ab72f84201c2d5f0147b7dfd041f488f6ff0422
291
+ safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
287
292
  test/file/zip_test.rb:
288
293
  comments:
289
294
  reviewed_by: timgentry
@@ -308,6 +313,10 @@ file safety:
308
313
  comments:
309
314
  reviewed_by: timgentry
310
315
  safe_revision: 9abdd6ced1d0c90ce8dd88abee4eb6472c7ff0d6
316
+ test/helpers/file/xml_streaming_test.rb:
317
+ comments:
318
+ reviewed_by: josh.pencheon
319
+ safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
311
320
  test/helpers/file/xml_test.rb:
312
321
  comments:
313
322
  reviewed_by: timgentry
@@ -356,6 +365,10 @@ file safety:
356
365
  comments:
357
366
  reviewed_by: timgentry
358
367
  safe_revision: dab4b8a3e4b29d85eccd971e79936982d888cffd
368
+ test/resources/claims_utf16be_but_isnt.xml:
369
+ comments:
370
+ reviewed_by: josh.pencheon
371
+ safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
359
372
  test/resources/filesystem_paths.yml:
360
373
  comments:
361
374
  reviewed_by: timgentry
@@ -1,5 +1,6 @@
1
1
  require 'ndr_support/safe_file'
2
2
  require 'ndr_import/helpers/file/xml'
3
+ require 'ndr_import/helpers/file/xml_streaming'
3
4
  require_relative 'registry'
4
5
 
5
6
  module NdrImport
@@ -9,6 +10,7 @@ module NdrImport
9
10
  # This class is a xml file handler that returns a single table.
10
11
  class Xml < Base
11
12
  include NdrImport::Helpers::File::Xml
13
+ include NdrImport::Helpers::File::XmlStreaming
12
14
 
13
15
  private
14
16
 
@@ -16,9 +18,14 @@ module NdrImport
16
18
  def rows(&block)
17
19
  return enum_for(:rows) unless block
18
20
 
19
- doc = read_xml_file(@filename)
21
+ xpath = @options['xml_record_xpath']
20
22
 
21
- doc.xpath(@options['xml_record_xpath']).each(&block)
23
+ if @options['slurp']
24
+ doc = read_xml_file(@filename)
25
+ doc.xpath(xpath).each(&block)
26
+ else
27
+ each_node(@filename, xpath, &block)
28
+ end
22
29
  rescue StandardError => e
23
30
  raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
24
31
  end
@@ -0,0 +1,181 @@
1
+ require 'ndr_support/safe_file'
2
+ require 'ndr_support/utf8_encoding'
3
+
4
+ module NdrImport
5
+ module Helpers
6
+ module File
7
+ # This mixin adds XML streaming functionality, to support more performant handling
8
+ # of large files by Nokogiri. Uses the `XML::Reader` API, and maintains a temporary
9
+ # DOM as the XML is streamed to allow XPath querying from the root node.
10
+ #
11
+ # If the system has `iconv` available, will attempt to verify the encoding of the
12
+ # file being read externally, so it can be streamed in to Ruby. Otherwise, will load
13
+ # the raw data in to check the encoding, but still stream it through Nokogiri's parser.
14
+ module XmlStreaming
15
+ # Base error for all streaming-specific issues.
16
+ class Error < StandardError; end
17
+
18
+ # Raised if nested tags are encountered which the streaming approach cannot handle.
19
+ class NestingError < Error
20
+ def initialize(node)
21
+ super <<~STR
22
+ Element '#{node.name}' was found nested inside another of the same type.
23
+ This is not accessible, and a known limitation of XmlStreaming.
24
+ STR
25
+ end
26
+ end
27
+
28
+ # Object to track state as the XML is iterated over, and detect
29
+ # when an element of interest is entered.
30
+ class Cursor
31
+ # wrapper to hold a representation of each element we descend into:
32
+ StackItem = Struct.new(:name, :attrs, :empty)
33
+
34
+ def initialize(xpath)
35
+ @xpath = xpath
36
+ @stack = []
37
+ @match_depth = nil
38
+ end
39
+
40
+ # Has this cursor already passed inside a similar node?
41
+ def in?(node)
42
+ @stack.detect { |item| item.name == node.name }
43
+ end
44
+
45
+ def enter(node)
46
+ @stack.push StackItem.new(node.name, node.attributes, node.empty_element?)
47
+ end
48
+
49
+ def leave(_node)
50
+ @stack.pop
51
+ @match_depth = nil if @match_depth && @stack.length < @match_depth
52
+ end
53
+
54
+ # Does the element that the cursor is currently on match what
55
+ # is being looked for?
56
+ def matches?
57
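+ # Can't match again if we're inside a match already. NOTE(review): the guard below reads `@matched_depth`, but the rest of this class only assigns `@match_depth` - confirm which name is intended: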
58
+ return false if @matched_depth
59
+
60
+ match = current_stack_match?
61
+
62
+ # "empty element" matches are yielded immediately, without
63
+ # tagging the stack as having matched, because there won't
64
+ # be an equivalent closing tag to end the match with later.
65
+ if in_empty_element?
66
+ @stack.pop
67
+ elsif match
68
+ @match_depth = @stack.length
69
+ end
70
+
71
+ match
72
+ end
73
+
74
+ private
75
+
76
+ def in_empty_element?
77
+ @stack.last.empty
78
+ end
79
+
80
+ # Does the current state of the stack mean we've met the xpath
81
+ # criteria? Must be an exact match, not just matching a parent
82
+ # element in the DOM.
83
+ def current_stack_match?
84
+ parent_stack = @stack[0..-2]
85
+
86
+ return false unless dom_stubs[@stack].at_xpath(@xpath)
87
+
88
+ parent_stack.empty? || !dom_stubs[parent_stack].at_xpath(@xpath)
89
+ end
90
+
91
+ # A cached collection of DOM fragments, to represent the structure
92
+ # necessary to use xpath to descend into the main document's DOM.
93
+ def dom_stubs
94
+ @dom_stubs ||= Hash.new do |hash, items|
95
+ hash[items.dup] = Nokogiri::XML::Builder.new do |dom|
96
+ add_items_to_dom(dom, items.dup)
97
+ end.doc
98
+ end
99
+ end
100
+
101
+ # Helper to recursively build XML fragment.
102
+ def add_items_to_dom(dom, items)
103
+ item = items.shift
104
+ dom.send(item.name, item.attrs) do
105
+ add_items_to_dom(dom, items) if items.any?
106
+ end
107
+ end
108
+ end
109
+
110
+ include UTF8Encoding
111
+
112
+ # Streams the contents of the given `safe_path`, and yields
113
+ # each element matching `xpath` as they're found.
114
+ #
115
+ # In the case of dodgy encoding, may fall back to slurping the
116
+ # file, but will still use stream parsing for XML.
117
+ def each_node(safe_path, xpath, &block)
118
+ return enum_for(:each_node, safe_path, xpath) unless block
119
+
120
+ require 'nokogiri'
121
+
122
+ with_encoding_check(safe_path) do |stream, encoding|
123
+ stream_xml_nodes(stream, xpath, encoding, &block)
124
+ end
125
+ end
126
+
127
+ private
128
+
129
+ # We need to ensure the raw data is UTF8 before we start streaming
130
+ # it with nokogiri. If we can do an external check, great. Otherwise,
131
+ # we need to slurp and convert the raw data before presenting it.
132
+ def with_encoding_check(safe_path)
133
+ forced_encoding = nil
134
+
135
+ stream = ::File.open(SafeFile.safepath_to_string(safe_path))
136
+
137
+ unless external_utf8_check?(safe_path)
138
+ stream = StringIO.new ensure_utf8!(stream.read)
139
+ forced_encoding = 'UTF8'
140
+ end
141
+
142
+ yield stream, forced_encoding
143
+ end
144
+
145
+ # Use iconv, if available, to check raw data encoding:
146
+ def external_utf8_check?(safe_path)
147
+ iconv = system('command -v iconv > /dev/null 2>&1')
148
+ return false unless iconv
149
+
150
+ path = SafeFile.safepath_to_string(safe_path)
151
+ system("iconv -f UTF-8 #{Shellwords.escape(path)} > /dev/null 2>&1")
152
+ end
153
+
154
+ def stream_xml_nodes(io, node_xpath, encoding = nil)
155
+ # Track nesting as the cursor moves through the document:
156
+ cursor = Cursor.new(node_xpath)
157
+
158
+ # If markup isn't well-formed, try to work around it:
159
+ options = Nokogiri::XML::ParseOptions::RECOVER
160
+ reader = Nokogiri::XML::Reader(io, nil, encoding, options)
161
+
162
+ reader.each do |node|
163
+ case node.node_type
164
+ when Nokogiri::XML::Reader::TYPE_ELEMENT # "opening tag"
165
+ raise NestingError, node if cursor.in?(node)
166
+
167
+ cursor.enter(node)
168
+ next unless cursor.matches?
169
+
170
+ # The xpath matched - construct a DOM fragment to yield back:
171
+ element = Nokogiri::XML(node.outer_xml).at("./#{node.name}")
172
+ yield element
173
+ when Nokogiri::XML::Reader::TYPE_END_ELEMENT # "closing tag"
174
+ cursor.leave(node)
175
+ end
176
+ end
177
+ end
178
+ end
179
+ end
180
+ end
181
+ end
@@ -38,7 +38,8 @@ module NdrImport
38
38
  'col_sep' => table_mapping.try(:delimiter),
39
39
  'file_password' => table_mapping.try(:file_password),
40
40
  'liberal_parsing' => table_mapping.try(:liberal_parsing),
41
- 'xml_record_xpath' => table_mapping.try(:xml_record_xpath)
41
+ 'xml_record_xpath' => table_mapping.try(:xml_record_xpath),
42
+ 'slurp' => table_mapping.try(:slurp)
42
43
  }
43
44
 
44
45
  tables = NdrImport::File::Registry.tables(filename, table_mapping.try(:format), options)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
  # This stores the current version of the NdrImport gem
3
3
  module NdrImport
4
- VERSION = '8.6.0'.freeze
4
+ VERSION = '9.0.0'.freeze
5
5
  end
@@ -44,7 +44,7 @@ Gem::Specification.new do |spec|
44
44
  spec.add_development_dependency 'rake', '~> 10.0'
45
45
  spec.add_development_dependency 'minitest'
46
46
  spec.add_development_dependency 'mocha'
47
- spec.add_development_dependency 'ndr_dev_support', '~> 3.1', '>= 3.1.3'
47
+ spec.add_development_dependency 'ndr_dev_support', '>= 3.1.3'
48
48
  spec.add_development_dependency 'guard'
49
49
  spec.add_development_dependency 'guard-rubocop'
50
50
  spec.add_development_dependency 'guard-test'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ndr_import
3
3
  version: !ruby/object:Gem::Version
4
- version: 8.6.0
4
+ version: 9.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - NCRS Development Team
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-06-07 00:00:00.000000000 Z
11
+ date: 2019-07-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activemodel
@@ -276,9 +276,6 @@ dependencies:
276
276
  name: ndr_dev_support
277
277
  requirement: !ruby/object:Gem::Requirement
278
278
  requirements:
279
- - - "~>"
280
- - !ruby/object:Gem::Version
281
- version: '3.1'
282
279
  - - ">="
283
280
  - !ruby/object:Gem::Version
284
281
  version: 3.1.3
@@ -286,9 +283,6 @@ dependencies:
286
283
  prerelease: false
287
284
  version_requirements: !ruby/object:Gem::Requirement
288
285
  requirements:
289
- - - "~>"
290
- - !ruby/object:Gem::Version
291
- version: '3.1'
292
286
  - - ">="
293
287
  - !ruby/object:Gem::Version
294
288
  version: 3.1.3
@@ -413,6 +407,7 @@ files:
413
407
  - lib/ndr_import/helpers/file/pdf.rb
414
408
  - lib/ndr_import/helpers/file/word.rb
415
409
  - lib/ndr_import/helpers/file/xml.rb
410
+ - lib/ndr_import/helpers/file/xml_streaming.rb
416
411
  - lib/ndr_import/helpers/file/zip.rb
417
412
  - lib/ndr_import/mapper.rb
418
413
  - lib/ndr_import/mapping_error.rb