ndr_import 8.6.0 → 9.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ffddef09a58ca08e5d491449ef48cd829dfd045bf6f38b5289cff240859940e0
4
- data.tar.gz: 30540a5af117b4ae57e002794f26df18544a99b07c7fdfb95a2ad552e9efa7b6
3
+ metadata.gz: 500566629ada1bfbab0def117060fec471bf2fab2c320a112b05a97a8474ee95
4
+ data.tar.gz: 572dc475c3704f2f749e9de11e031829e593d1e2a6fa24a6755342f2f9bdbaae
5
5
  SHA512:
6
- metadata.gz: 2ff3840cb513c4d0253c9c40db709563bf7bd8d62777ed49358e9a67a5304c03b7f18a966bb894099a4f7f9a64933bc11151048e29ce50989a791732cceee74c
7
- data.tar.gz: de437a4e4e62c77e93050df6c0d50c486d2faca290830acb1d719ddadc0b2165bd5405a11a38db4bd0ca438432b082fddafb42edaf41b09a98efcdfd79c63f55
6
+ metadata.gz: 8c8491043c7e58d0ca4953c58ff12914c1dd8deb587c67a9cbd488e8420b26f9e8ea55f7b23cbea68bf02956c3215602932dee5ffd23506d4017ac997378b835
7
+ data.tar.gz: 03c24de311a201a871f752773f0d43c38e2c9e49d31585759c0b009cc76adbf6668350d0bc7c6c2746f9b84998bd384055b3fa47806fc242a3fa49fdb3ed4851
@@ -1,6 +1,13 @@
1
1
  ## [Unreleased]
2
2
  *no unreleased changes*
3
3
 
4
+ ## 9.0.0 / 2019-07-31
5
+ ### Changed
6
+ * `File::Xml` will now stream XML files by default. Use `slurp: true` for the old behaviour. (#43)
7
+
8
+ ### Added
9
+ * Add `XmlStreaming` helper, for more performant handling of large XML documents with Nokogiri. (#43)
10
+
4
11
  ## 8.6.0 / 2019-06-07
5
12
  ### Added
6
13
  * Allow conditional preservation of blank lines when joining lines in non-tabular data (#41)
@@ -19,7 +19,7 @@ file safety:
19
19
  CHANGELOG.md:
20
20
  comments:
21
21
  reviewed_by: josh.pencheon
22
- safe_revision: 53c17f14ee19a9ea83d044508093147daa32ba3d
22
+ safe_revision: 0a4e05d45ee65e25edc36de84d3c450cc15dc3ed
23
23
  CODE_OF_CONDUCT.md:
24
24
  comments:
25
25
  reviewed_by: timgentry
@@ -139,7 +139,7 @@ file safety:
139
139
  lib/ndr_import/file/xml.rb:
140
140
  comments:
141
141
  reviewed_by: josh.pencheon
142
- safe_revision: 4ab72f84201c2d5f0147b7dfd041f488f6ff0422
142
+ safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
143
143
  lib/ndr_import/file/zip.rb:
144
144
  comments:
145
145
  reviewed_by: timgentry
@@ -168,6 +168,11 @@ file safety:
168
168
  comments:
169
169
  reviewed_by: josh.pencheon
170
170
  safe_revision: d2245268ec6a0e4f60c521d171a820f299632c4f
171
+ lib/ndr_import/helpers/file/xml_streaming.rb:
172
+ comments: uses SafePath and Shellwords when accessing filesystem, or making system
173
+ calls
174
+ reviewed_by: josh.pencheon
175
+ safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
171
176
  lib/ndr_import/helpers/file/zip.rb:
172
177
  comments:
173
178
  reviewed_by: timgentry
@@ -223,7 +228,7 @@ file safety:
223
228
  lib/ndr_import/universal_importer_helper.rb:
224
229
  comments:
225
230
  reviewed_by: josh.pencheon
226
- safe_revision: a69d4a57ddcf13cdc13c27bd2eb91a395fa7ea36
231
+ safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
227
232
  lib/ndr_import/unmapped_data_error.rb:
228
233
  comments:
229
234
  reviewed_by: josh.pencheon
@@ -231,7 +236,7 @@ file safety:
231
236
  lib/ndr_import/version.rb:
232
237
  comments: another check?
233
238
  reviewed_by: josh.pencheon
234
- safe_revision: 53c17f14ee19a9ea83d044508093147daa32ba3d
239
+ safe_revision: 0a4e05d45ee65e25edc36de84d3c450cc15dc3ed
235
240
  lib/ndr_import/xml/table.rb:
236
241
  comments:
237
242
  reviewed_by: josh.pencheon
@@ -239,7 +244,7 @@ file safety:
239
244
  ndr_import.gemspec:
240
245
  comments:
241
246
  reviewed_by: josh.pencheon
242
- safe_revision: d3d9a987befeecb122a448d8d06e66d74da13fb5
247
+ safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
243
248
  test/file/acro_form_test.rb:
244
249
  comments:
245
250
  reviewed_by: josh.pencheon
@@ -283,7 +288,7 @@ file safety:
283
288
  test/file/xml_test.rb:
284
289
  comments:
285
290
  reviewed_by: josh.pencheon
286
- safe_revision: 4ab72f84201c2d5f0147b7dfd041f488f6ff0422
291
+ safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
287
292
  test/file/zip_test.rb:
288
293
  comments:
289
294
  reviewed_by: timgentry
@@ -308,6 +313,10 @@ file safety:
308
313
  comments:
309
314
  reviewed_by: timgentry
310
315
  safe_revision: 9abdd6ced1d0c90ce8dd88abee4eb6472c7ff0d6
316
+ test/helpers/file/xml_streaming_test.rb:
317
+ comments:
318
+ reviewed_by: josh.pencheon
319
+ safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
311
320
  test/helpers/file/xml_test.rb:
312
321
  comments:
313
322
  reviewed_by: timgentry
@@ -356,6 +365,10 @@ file safety:
356
365
  comments:
357
366
  reviewed_by: timgentry
358
367
  safe_revision: dab4b8a3e4b29d85eccd971e79936982d888cffd
368
+ test/resources/claims_utf16be_but_isnt.xml:
369
+ comments:
370
+ reviewed_by: josh.pencheon
371
+ safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
359
372
  test/resources/filesystem_paths.yml:
360
373
  comments:
361
374
  reviewed_by: timgentry
@@ -1,5 +1,6 @@
1
1
  require 'ndr_support/safe_file'
2
2
  require 'ndr_import/helpers/file/xml'
3
+ require 'ndr_import/helpers/file/xml_streaming'
3
4
  require_relative 'registry'
4
5
 
5
6
  module NdrImport
@@ -9,6 +10,7 @@ module NdrImport
9
10
  # This class is an XML file handler that returns a single table.
10
11
  class Xml < Base
11
12
  include NdrImport::Helpers::File::Xml
13
+ include NdrImport::Helpers::File::XmlStreaming
12
14
 
13
15
  private
14
16
 
@@ -16,9 +18,14 @@ module NdrImport
16
18
  def rows(&block)
17
19
  return enum_for(:rows) unless block
18
20
 
19
- doc = read_xml_file(@filename)
21
+ xpath = @options['xml_record_xpath']
20
22
 
21
- doc.xpath(@options['xml_record_xpath']).each(&block)
23
+ if @options['slurp']
24
+ doc = read_xml_file(@filename)
25
+ doc.xpath(xpath).each(&block)
26
+ else
27
+ each_node(@filename, xpath, &block)
28
+ end
22
29
  rescue StandardError => e
23
30
  raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
24
31
  end
@@ -0,0 +1,181 @@
1
+ require 'ndr_support/safe_file'
2
+ require 'ndr_support/utf8_encoding'
3
+
4
+ module NdrImport
5
+ module Helpers
6
+ module File
7
+ # This mixin adds XML streaming functionality, to support more performant handling
8
+ # of large files by Nokogiri. Uses the `XML::Reader` API, and maintains a temporary
9
+ # DOM as the XML is streamed to allow XPath querying from the root node.
10
+ #
11
+ # If the system has `iconv` available, will attempt to verify the encoding of the
12
+ # file being read externally, so it can be streamed in to Ruby. Otherwise, will load
13
+ # the raw data in to check the encoding, but still stream it through Nokogiri's parser.
14
+ module XmlStreaming
15
+ # Base error for all streaming-specific issues.
16
+ class Error < StandardError; end
17
+
18
+ # Raised if nested tags are encountered which the streaming approach cannot handle.
19
+ class NestingError < Error
20
+ def initialize(node)
21
+ super <<~STR
22
+ Element '#{node.name}' was found nested inside another of the same type.
23
+ This is not accessible, and a known limitation of XmlStreaming.
24
+ STR
25
+ end
26
+ end
27
+
28
+ # Object to track state as the XML is iterated over, and detect
29
+ # when an element of interest is entered.
30
+ class Cursor
31
+ # wrapper to hold a representation of each element we descend into:
32
+ StackItem = Struct.new(:name, :attrs, :empty)
33
+
34
+ def initialize(xpath)
35
+ @xpath = xpath
36
+ @stack = []
37
+ @match_depth = nil
38
+ end
39
+
40
+ # Has this cursor already passed inside a similar node?
41
+ def in?(node)
42
+ @stack.detect { |item| item.name == node.name }
43
+ end
44
+
45
+ def enter(node)
46
+ @stack.push StackItem.new(node.name, node.attributes, node.empty_element?)
47
+ end
48
+
49
+ def leave(_node)
50
+ @stack.pop
51
+ @match_depth = nil if @match_depth && @stack.length < @match_depth
52
+ end
53
+
54
+ # Does the element that the cursor is currently on match what
55
+ # is being looked for?
56
+ def matches?
57
+ # Can't match again if we're inside a match already:
58
+ return false if @matched_depth
59
+
60
+ match = current_stack_match?
61
+
62
+ # "empty element" matches are yielded immediately, without
63
+ # tagging the stack as having matched, because there won't
64
+ # be an equivalent closing tag to end the match with later.
65
+ if in_empty_element?
66
+ @stack.pop
67
+ elsif match
68
+ @match_depth = @stack.length
69
+ end
70
+
71
+ match
72
+ end
73
+
74
+ private
75
+
76
+ def in_empty_element?
77
+ @stack.last.empty
78
+ end
79
+
80
+ # Does the current state of the stack mean we've met the xpath
81
+ # criteria? Must be an exact match, not just matching a parent
82
+ # element in the DOM.
83
+ def current_stack_match?
84
+ parent_stack = @stack[0..-2]
85
+
86
+ return false unless dom_stubs[@stack].at_xpath(@xpath)
87
+
88
+ parent_stack.empty? || !dom_stubs[parent_stack].at_xpath(@xpath)
89
+ end
90
+
91
+ # A cached collection of DOM fragments, to represent the structure
92
+ # necessary to use xpath to descend into the main document's DOM.
93
+ def dom_stubs
94
+ @dom_stubs ||= Hash.new do |hash, items|
95
+ hash[items.dup] = Nokogiri::XML::Builder.new do |dom|
96
+ add_items_to_dom(dom, items.dup)
97
+ end.doc
98
+ end
99
+ end
100
+
101
+ # Helper to recursively build XML fragment.
102
+ def add_items_to_dom(dom, items)
103
+ item = items.shift
104
+ dom.send(item.name, item.attrs) do
105
+ add_items_to_dom(dom, items) if items.any?
106
+ end
107
+ end
108
+ end
109
+
110
+ include UTF8Encoding
111
+
112
+ # Streams the contents of the given `safe_path`, and yields
113
+ # each element matching `xpath` as they're found.
114
+ #
115
+ # In the case of dodgy encoding, may fall back to slurping the
116
+ # file, but will still use stream parsing for XML.
117
+ def each_node(safe_path, xpath, &block)
118
+ return enum_for(:each_node, safe_path, xpath) unless block
119
+
120
+ require 'nokogiri'
121
+
122
+ with_encoding_check(safe_path) do |stream, encoding|
123
+ stream_xml_nodes(stream, xpath, encoding, &block)
124
+ end
125
+ end
126
+
127
+ private
128
+
129
+ # We need to ensure the raw data is UTF8 before we start streaming
130
+ # it with nokogiri. If we can do an external check, great. Otherwise,
131
+ # we need to slurp and convert the raw data before presenting it.
132
+ def with_encoding_check(safe_path)
133
+ forced_encoding = nil
134
+
135
+ stream = ::File.open(SafeFile.safepath_to_string(safe_path))
136
+
137
+ unless external_utf8_check?(safe_path)
138
+ stream = StringIO.new ensure_utf8!(stream.read)
139
+ forced_encoding = 'UTF8'
140
+ end
141
+
142
+ yield stream, forced_encoding
143
+ end
144
+
145
+ # Use iconv, if available, to check raw data encoding:
146
+ def external_utf8_check?(safe_path)
147
+ iconv = system('command -v iconv > /dev/null 2>&1')
148
+ return false unless iconv
149
+
150
+ path = SafeFile.safepath_to_string(safe_path)
151
+ system("iconv -f UTF-8 #{Shellwords.escape(path)} > /dev/null 2>&1")
152
+ end
153
+
154
+ def stream_xml_nodes(io, node_xpath, encoding = nil)
155
+ # Track nesting as the cursor moves through the document:
156
+ cursor = Cursor.new(node_xpath)
157
+
158
+ # If markup isn't well-formed, try to work around it:
159
+ options = Nokogiri::XML::ParseOptions::RECOVER
160
+ reader = Nokogiri::XML::Reader(io, nil, encoding, options)
161
+
162
+ reader.each do |node|
163
+ case node.node_type
164
+ when Nokogiri::XML::Reader::TYPE_ELEMENT # "opening tag"
165
+ raise NestingError, node if cursor.in?(node)
166
+
167
+ cursor.enter(node)
168
+ next unless cursor.matches?
169
+
170
+ # The xpath matched - construct a DOM fragment to yield back:
171
+ element = Nokogiri::XML(node.outer_xml).at("./#{node.name}")
172
+ yield element
173
+ when Nokogiri::XML::Reader::TYPE_END_ELEMENT # "closing tag"
174
+ cursor.leave(node)
175
+ end
176
+ end
177
+ end
178
+ end
179
+ end
180
+ end
181
+ end
@@ -38,7 +38,8 @@ module NdrImport
38
38
  'col_sep' => table_mapping.try(:delimiter),
39
39
  'file_password' => table_mapping.try(:file_password),
40
40
  'liberal_parsing' => table_mapping.try(:liberal_parsing),
41
- 'xml_record_xpath' => table_mapping.try(:xml_record_xpath)
41
+ 'xml_record_xpath' => table_mapping.try(:xml_record_xpath),
42
+ 'slurp' => table_mapping.try(:slurp)
42
43
  }
43
44
 
44
45
  tables = NdrImport::File::Registry.tables(filename, table_mapping.try(:format), options)
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
  # This stores the current version of the NdrImport gem
3
3
  module NdrImport
4
- VERSION = '8.6.0'.freeze
4
+ VERSION = '9.0.0'.freeze
5
5
  end
@@ -44,7 +44,7 @@ Gem::Specification.new do |spec|
44
44
  spec.add_development_dependency 'rake', '~> 10.0'
45
45
  spec.add_development_dependency 'minitest'
46
46
  spec.add_development_dependency 'mocha'
47
- spec.add_development_dependency 'ndr_dev_support', '~> 3.1', '>= 3.1.3'
47
+ spec.add_development_dependency 'ndr_dev_support', '>= 3.1.3'
48
48
  spec.add_development_dependency 'guard'
49
49
  spec.add_development_dependency 'guard-rubocop'
50
50
  spec.add_development_dependency 'guard-test'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ndr_import
3
3
  version: !ruby/object:Gem::Version
4
- version: 8.6.0
4
+ version: 9.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - NCRS Development Team
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-06-07 00:00:00.000000000 Z
11
+ date: 2019-07-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activemodel
@@ -276,9 +276,6 @@ dependencies:
276
276
  name: ndr_dev_support
277
277
  requirement: !ruby/object:Gem::Requirement
278
278
  requirements:
279
- - - "~>"
280
- - !ruby/object:Gem::Version
281
- version: '3.1'
282
279
  - - ">="
283
280
  - !ruby/object:Gem::Version
284
281
  version: 3.1.3
@@ -286,9 +283,6 @@ dependencies:
286
283
  prerelease: false
287
284
  version_requirements: !ruby/object:Gem::Requirement
288
285
  requirements:
289
- - - "~>"
290
- - !ruby/object:Gem::Version
291
- version: '3.1'
292
286
  - - ">="
293
287
  - !ruby/object:Gem::Version
294
288
  version: 3.1.3
@@ -413,6 +407,7 @@ files:
413
407
  - lib/ndr_import/helpers/file/pdf.rb
414
408
  - lib/ndr_import/helpers/file/word.rb
415
409
  - lib/ndr_import/helpers/file/xml.rb
410
+ - lib/ndr_import/helpers/file/xml_streaming.rb
416
411
  - lib/ndr_import/helpers/file/zip.rb
417
412
  - lib/ndr_import/mapper.rb
418
413
  - lib/ndr_import/mapping_error.rb