ndr_import 8.6.0 → 9.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/code_safety.yml +19 -6
- data/lib/ndr_import/file/xml.rb +9 -2
- data/lib/ndr_import/helpers/file/xml_streaming.rb +181 -0
- data/lib/ndr_import/universal_importer_helper.rb +2 -1
- data/lib/ndr_import/version.rb +1 -1
- data/ndr_import.gemspec +1 -1
- metadata +3 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 500566629ada1bfbab0def117060fec471bf2fab2c320a112b05a97a8474ee95
|
4
|
+
data.tar.gz: 572dc475c3704f2f749e9de11e031829e593d1e2a6fa24a6755342f2f9bdbaae
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8c8491043c7e58d0ca4953c58ff12914c1dd8deb587c67a9cbd488e8420b26f9e8ea55f7b23cbea68bf02956c3215602932dee5ffd23506d4017ac997378b835
|
7
|
+
data.tar.gz: 03c24de311a201a871f752773f0d43c38e2c9e49d31585759c0b009cc76adbf6668350d0bc7c6c2746f9b84998bd384055b3fa47806fc242a3fa49fdb3ed4851
|
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,13 @@
|
|
1
1
|
## [Unreleased]
|
2
2
|
*no unreleased changes*
|
3
3
|
|
4
|
+
## 9.0.0 / 2019-07-31
|
5
|
+
### Changed
|
6
|
+
* `File::Xml` will now stream XML files by default. Use `slurp: true` for the old behaviour. (#43)
|
7
|
+
|
8
|
+
### Added
|
9
|
+
* Add `XmlStreaming` helper, for more performant handling of large XML documents with Nokogiri. (#43)
|
10
|
+
|
4
11
|
## 8.6.0 / 2019-06-07
|
5
12
|
### Added
|
6
13
|
* Allow conditional preservation of blank lines when joining lines in non-tabular data (#41)
|
data/code_safety.yml
CHANGED
@@ -19,7 +19,7 @@ file safety:
|
|
19
19
|
CHANGELOG.md:
|
20
20
|
comments:
|
21
21
|
reviewed_by: josh.pencheon
|
22
|
-
safe_revision:
|
22
|
+
safe_revision: 0a4e05d45ee65e25edc36de84d3c450cc15dc3ed
|
23
23
|
CODE_OF_CONDUCT.md:
|
24
24
|
comments:
|
25
25
|
reviewed_by: timgentry
|
@@ -139,7 +139,7 @@ file safety:
|
|
139
139
|
lib/ndr_import/file/xml.rb:
|
140
140
|
comments:
|
141
141
|
reviewed_by: josh.pencheon
|
142
|
-
safe_revision:
|
142
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
143
143
|
lib/ndr_import/file/zip.rb:
|
144
144
|
comments:
|
145
145
|
reviewed_by: timgentry
|
@@ -168,6 +168,11 @@ file safety:
|
|
168
168
|
comments:
|
169
169
|
reviewed_by: josh.pencheon
|
170
170
|
safe_revision: d2245268ec6a0e4f60c521d171a820f299632c4f
|
171
|
+
lib/ndr_import/helpers/file/xml_streaming.rb:
|
172
|
+
comments: uses SafePath and Shellwords when accessing filesystem, or making system
|
173
|
+
calls
|
174
|
+
reviewed_by: josh.pencheon
|
175
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
171
176
|
lib/ndr_import/helpers/file/zip.rb:
|
172
177
|
comments:
|
173
178
|
reviewed_by: timgentry
|
@@ -223,7 +228,7 @@ file safety:
|
|
223
228
|
lib/ndr_import/universal_importer_helper.rb:
|
224
229
|
comments:
|
225
230
|
reviewed_by: josh.pencheon
|
226
|
-
safe_revision:
|
231
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
227
232
|
lib/ndr_import/unmapped_data_error.rb:
|
228
233
|
comments:
|
229
234
|
reviewed_by: josh.pencheon
|
@@ -231,7 +236,7 @@ file safety:
|
|
231
236
|
lib/ndr_import/version.rb:
|
232
237
|
comments: another check?
|
233
238
|
reviewed_by: josh.pencheon
|
234
|
-
safe_revision:
|
239
|
+
safe_revision: 0a4e05d45ee65e25edc36de84d3c450cc15dc3ed
|
235
240
|
lib/ndr_import/xml/table.rb:
|
236
241
|
comments:
|
237
242
|
reviewed_by: josh.pencheon
|
@@ -239,7 +244,7 @@ file safety:
|
|
239
244
|
ndr_import.gemspec:
|
240
245
|
comments:
|
241
246
|
reviewed_by: josh.pencheon
|
242
|
-
safe_revision:
|
247
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
243
248
|
test/file/acro_form_test.rb:
|
244
249
|
comments:
|
245
250
|
reviewed_by: josh.pencheon
|
@@ -283,7 +288,7 @@ file safety:
|
|
283
288
|
test/file/xml_test.rb:
|
284
289
|
comments:
|
285
290
|
reviewed_by: josh.pencheon
|
286
|
-
safe_revision:
|
291
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
287
292
|
test/file/zip_test.rb:
|
288
293
|
comments:
|
289
294
|
reviewed_by: timgentry
|
@@ -308,6 +313,10 @@ file safety:
|
|
308
313
|
comments:
|
309
314
|
reviewed_by: timgentry
|
310
315
|
safe_revision: 9abdd6ced1d0c90ce8dd88abee4eb6472c7ff0d6
|
316
|
+
test/helpers/file/xml_streaming_test.rb:
|
317
|
+
comments:
|
318
|
+
reviewed_by: josh.pencheon
|
319
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
311
320
|
test/helpers/file/xml_test.rb:
|
312
321
|
comments:
|
313
322
|
reviewed_by: timgentry
|
@@ -356,6 +365,10 @@ file safety:
|
|
356
365
|
comments:
|
357
366
|
reviewed_by: timgentry
|
358
367
|
safe_revision: dab4b8a3e4b29d85eccd971e79936982d888cffd
|
368
|
+
test/resources/claims_utf16be_but_isnt.xml:
|
369
|
+
comments:
|
370
|
+
reviewed_by: josh.pencheon
|
371
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
359
372
|
test/resources/filesystem_paths.yml:
|
360
373
|
comments:
|
361
374
|
reviewed_by: timgentry
|
data/lib/ndr_import/file/xml.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'ndr_support/safe_file'
|
2
2
|
require 'ndr_import/helpers/file/xml'
|
3
|
+
require 'ndr_import/helpers/file/xml_streaming'
|
3
4
|
require_relative 'registry'
|
4
5
|
|
5
6
|
module NdrImport
|
@@ -9,6 +10,7 @@ module NdrImport
|
|
9
10
|
# This class is an XML file handler that returns a single table.
|
10
11
|
class Xml < Base
|
11
12
|
include NdrImport::Helpers::File::Xml
|
13
|
+
include NdrImport::Helpers::File::XmlStreaming
|
12
14
|
|
13
15
|
private
|
14
16
|
|
@@ -16,9 +18,14 @@ module NdrImport
|
|
16
18
|
def rows(&block)
|
17
19
|
return enum_for(:rows) unless block
|
18
20
|
|
19
|
-
|
21
|
+
xpath = @options['xml_record_xpath']
|
20
22
|
|
21
|
-
|
23
|
+
if @options['slurp']
|
24
|
+
doc = read_xml_file(@filename)
|
25
|
+
doc.xpath(xpath).each(&block)
|
26
|
+
else
|
27
|
+
each_node(@filename, xpath, &block)
|
28
|
+
end
|
22
29
|
rescue StandardError => e
|
23
30
|
raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
|
24
31
|
end
|
@@ -0,0 +1,181 @@
|
|
1
|
+
require 'ndr_support/safe_file'
|
2
|
+
require 'ndr_support/utf8_encoding'
|
3
|
+
|
4
|
+
module NdrImport
|
5
|
+
module Helpers
|
6
|
+
module File
|
7
|
+
# This mixin adds XML streaming functionality, to support more performant handling
|
8
|
+
# of large files by Nokogiri. Uses the `XML::Reader` API, and maintains a temporary
|
9
|
+
# DOM as the XML is streamed to allow XPath querying from the root node.
|
10
|
+
#
|
11
|
+
# If the system has `iconv` available, will attempt to verify the encoding of the
|
12
|
+
# file being read externally, so it can be streamed in to Ruby. Otherwise, will load
|
13
|
+
# the raw data in to check the encoding, but still stream it through Nokogiri's parser.
|
14
|
+
module XmlStreaming
|
15
|
+
# Base error for all streaming-specific issues.
|
16
|
+
class Error < StandardError; end
|
17
|
+
|
18
|
+
# Raised if nested tags are encountered which the streaming approach cannot handle.
|
19
|
+
class NestingError < Error
|
20
|
+
def initialize(node)
|
21
|
+
super <<~STR
|
22
|
+
Element '#{node.name}' was found nested inside another of the same type.
|
23
|
+
This is not accessible, and a known limitation of XmlStreaming.
|
24
|
+
STR
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
# Object to track state as the XML is iterated over, and detect
|
29
|
+
# when an element of interest is entered.
|
30
|
+
class Cursor
|
31
|
+
# wrapper to hold a representation of each element we descend into:
|
32
|
+
StackItem = Struct.new(:name, :attrs, :empty)
|
33
|
+
|
34
|
+
def initialize(xpath)
|
35
|
+
@xpath = xpath
|
36
|
+
@stack = []
|
37
|
+
@match_depth = nil
|
38
|
+
end
|
39
|
+
|
40
|
+
# Has this cursor already passed inside a similar node?
|
41
|
+
def in?(node)
|
42
|
+
@stack.detect { |item| item.name == node.name }
|
43
|
+
end
|
44
|
+
|
45
|
+
def enter(node)
|
46
|
+
@stack.push StackItem.new(node.name, node.attributes, node.empty_element?)
|
47
|
+
end
|
48
|
+
|
49
|
+
def leave(_node)
|
50
|
+
@stack.pop
|
51
|
+
@match_depth = nil if @match_depth && @stack.length < @match_depth
|
52
|
+
end
|
53
|
+
|
54
|
+
# Does the element that the cursor is currently on match what
|
55
|
+
# is being looked for?
|
56
|
+
def matches?
|
57
|
+
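# Can't match again if we're inside a match already (NOTE(review): this guard reads @matched_depth, but only @match_depth is ever assigned in this class, so it never fires - confirm intent before renaming, as empty elements rely on the pop below):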
|
58
|
+
return false if @matched_depth
|
59
|
+
|
60
|
+
match = current_stack_match?
|
61
|
+
|
62
|
+
# "empty element" matches are yielded immediately, without
|
63
|
+
# tagging the stack as having matched, because there won't
|
64
|
+
# be an equivalent closing tag to end the match with later.
|
65
|
+
if in_empty_element?
|
66
|
+
@stack.pop
|
67
|
+
elsif match
|
68
|
+
@match_depth = @stack.length
|
69
|
+
end
|
70
|
+
|
71
|
+
match
|
72
|
+
end
|
73
|
+
|
74
|
+
private
|
75
|
+
|
76
|
+
def in_empty_element?
|
77
|
+
@stack.last.empty
|
78
|
+
end
|
79
|
+
|
80
|
+
# Does the current state of the stack mean we've met the xpath
|
81
|
+
# criteria? Must be an exact match, not just matching a parent
|
82
|
+
# element in the DOM.
|
83
|
+
def current_stack_match?
|
84
|
+
parent_stack = @stack[0..-2]
|
85
|
+
|
86
|
+
return false unless dom_stubs[@stack].at_xpath(@xpath)
|
87
|
+
|
88
|
+
parent_stack.empty? || !dom_stubs[parent_stack].at_xpath(@xpath)
|
89
|
+
end
|
90
|
+
|
91
|
+
# A cached collection of DOM fragments, to represent the structure
|
92
|
+
# necessary to use xpath to descend into the main document's DOM.
|
93
|
+
def dom_stubs
|
94
|
+
@dom_stubs ||= Hash.new do |hash, items|
|
95
|
+
hash[items.dup] = Nokogiri::XML::Builder.new do |dom|
|
96
|
+
add_items_to_dom(dom, items.dup)
|
97
|
+
end.doc
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
# Helper to recursively build XML fragment.
|
102
|
+
def add_items_to_dom(dom, items)
|
103
|
+
item = items.shift
|
104
|
+
dom.send(item.name, item.attrs) do
|
105
|
+
add_items_to_dom(dom, items) if items.any?
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
include UTF8Encoding
|
111
|
+
|
112
|
+
# Streams the contents of the given `safe_path`, and yields
|
113
|
+
# each element matching `xpath` as they're found.
|
114
|
+
#
|
115
|
+
# In the case of dodgy encoding, may fall back to slurping the
|
116
|
+
# file, but will still use stream parsing for XML.
|
117
|
+
def each_node(safe_path, xpath, &block)
|
118
|
+
return enum_for(:each_node, safe_path, xpath) unless block
|
119
|
+
|
120
|
+
require 'nokogiri'
|
121
|
+
|
122
|
+
with_encoding_check(safe_path) do |stream, encoding|
|
123
|
+
stream_xml_nodes(stream, xpath, encoding, &block)
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
private
|
128
|
+
|
129
|
+
# We need to ensure the raw data is UTF8 before we start streaming
|
130
|
+
# it with nokogiri. If we can do an external check, great. Otherwise,
|
131
|
+
# we need to slurp and convert the raw data before presenting it.
|
132
|
+
def with_encoding_check(safe_path)
|
133
|
+
forced_encoding = nil
|
134
|
+
|
135
|
+
stream = ::File.open(SafeFile.safepath_to_string(safe_path))
|
136
|
+
|
137
|
+
unless external_utf8_check?(safe_path)
|
138
|
+
stream = StringIO.new ensure_utf8!(stream.read)
|
139
|
+
forced_encoding = 'UTF8'
|
140
|
+
end
|
141
|
+
|
142
|
+
yield stream, forced_encoding
|
143
|
+
end
|
144
|
+
|
145
|
+
# Use iconv, if available, to check raw data encoding:
|
146
|
+
def external_utf8_check?(safe_path)
|
147
|
+
iconv = system('command -v iconv > /dev/null 2>&1')
|
148
|
+
return false unless iconv
|
149
|
+
|
150
|
+
path = SafeFile.safepath_to_string(safe_path)
|
151
|
+
system("iconv -f UTF-8 #{Shellwords.escape(path)} > /dev/null 2>&1")
|
152
|
+
end
|
153
|
+
|
154
|
+
def stream_xml_nodes(io, node_xpath, encoding = nil)
|
155
|
+
# Track nesting as the cursor moves through the document:
|
156
|
+
cursor = Cursor.new(node_xpath)
|
157
|
+
|
158
|
+
# If markup isn't well-formed, try to work around it:
|
159
|
+
options = Nokogiri::XML::ParseOptions::RECOVER
|
160
|
+
reader = Nokogiri::XML::Reader(io, nil, encoding, options)
|
161
|
+
|
162
|
+
reader.each do |node|
|
163
|
+
case node.node_type
|
164
|
+
when Nokogiri::XML::Reader::TYPE_ELEMENT # "opening tag"
|
165
|
+
raise NestingError, node if cursor.in?(node)
|
166
|
+
|
167
|
+
cursor.enter(node)
|
168
|
+
next unless cursor.matches?
|
169
|
+
|
170
|
+
# The xpath matched - construct a DOM fragment to yield back:
|
171
|
+
element = Nokogiri::XML(node.outer_xml).at("./#{node.name}")
|
172
|
+
yield element
|
173
|
+
when Nokogiri::XML::Reader::TYPE_END_ELEMENT # "closing tag"
|
174
|
+
cursor.leave(node)
|
175
|
+
end
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
179
|
+
end
|
180
|
+
end
|
181
|
+
end
|
@@ -38,7 +38,8 @@ module NdrImport
|
|
38
38
|
'col_sep' => table_mapping.try(:delimiter),
|
39
39
|
'file_password' => table_mapping.try(:file_password),
|
40
40
|
'liberal_parsing' => table_mapping.try(:liberal_parsing),
|
41
|
-
'xml_record_xpath' => table_mapping.try(:xml_record_xpath)
|
41
|
+
'xml_record_xpath' => table_mapping.try(:xml_record_xpath),
|
42
|
+
'slurp' => table_mapping.try(:slurp)
|
42
43
|
}
|
43
44
|
|
44
45
|
tables = NdrImport::File::Registry.tables(filename, table_mapping.try(:format), options)
|
data/lib/ndr_import/version.rb
CHANGED
data/ndr_import.gemspec
CHANGED
@@ -44,7 +44,7 @@ Gem::Specification.new do |spec|
|
|
44
44
|
spec.add_development_dependency 'rake', '~> 10.0'
|
45
45
|
spec.add_development_dependency 'minitest'
|
46
46
|
spec.add_development_dependency 'mocha'
|
47
|
-
spec.add_development_dependency 'ndr_dev_support', '
|
47
|
+
spec.add_development_dependency 'ndr_dev_support', '>= 3.1.3'
|
48
48
|
spec.add_development_dependency 'guard'
|
49
49
|
spec.add_development_dependency 'guard-rubocop'
|
50
50
|
spec.add_development_dependency 'guard-test'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ndr_import
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 9.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- NCRS Development Team
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-07-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activemodel
|
@@ -276,9 +276,6 @@ dependencies:
|
|
276
276
|
name: ndr_dev_support
|
277
277
|
requirement: !ruby/object:Gem::Requirement
|
278
278
|
requirements:
|
279
|
-
- - "~>"
|
280
|
-
- !ruby/object:Gem::Version
|
281
|
-
version: '3.1'
|
282
279
|
- - ">="
|
283
280
|
- !ruby/object:Gem::Version
|
284
281
|
version: 3.1.3
|
@@ -286,9 +283,6 @@ dependencies:
|
|
286
283
|
prerelease: false
|
287
284
|
version_requirements: !ruby/object:Gem::Requirement
|
288
285
|
requirements:
|
289
|
-
- - "~>"
|
290
|
-
- !ruby/object:Gem::Version
|
291
|
-
version: '3.1'
|
292
286
|
- - ">="
|
293
287
|
- !ruby/object:Gem::Version
|
294
288
|
version: 3.1.3
|
@@ -413,6 +407,7 @@ files:
|
|
413
407
|
- lib/ndr_import/helpers/file/pdf.rb
|
414
408
|
- lib/ndr_import/helpers/file/word.rb
|
415
409
|
- lib/ndr_import/helpers/file/xml.rb
|
410
|
+
- lib/ndr_import/helpers/file/xml_streaming.rb
|
416
411
|
- lib/ndr_import/helpers/file/zip.rb
|
417
412
|
- lib/ndr_import/mapper.rb
|
418
413
|
- lib/ndr_import/mapping_error.rb
|