ndr_import 8.6.0 → 9.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/code_safety.yml +19 -6
- data/lib/ndr_import/file/xml.rb +9 -2
- data/lib/ndr_import/helpers/file/xml_streaming.rb +181 -0
- data/lib/ndr_import/universal_importer_helper.rb +2 -1
- data/lib/ndr_import/version.rb +1 -1
- data/ndr_import.gemspec +1 -1
- metadata +3 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 500566629ada1bfbab0def117060fec471bf2fab2c320a112b05a97a8474ee95
|
4
|
+
data.tar.gz: 572dc475c3704f2f749e9de11e031829e593d1e2a6fa24a6755342f2f9bdbaae
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8c8491043c7e58d0ca4953c58ff12914c1dd8deb587c67a9cbd488e8420b26f9e8ea55f7b23cbea68bf02956c3215602932dee5ffd23506d4017ac997378b835
|
7
|
+
data.tar.gz: 03c24de311a201a871f752773f0d43c38e2c9e49d31585759c0b009cc76adbf6668350d0bc7c6c2746f9b84998bd384055b3fa47806fc242a3fa49fdb3ed4851
|
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,13 @@
|
|
1
1
|
## [Unreleased]
|
2
2
|
*no unreleased changes*
|
3
3
|
|
4
|
+
## 9.0.0 / 2019-07-31
|
5
|
+
### Changed
|
6
|
+
* `File::Xml` will now stream XML files by default. Use `slurp: true` for the old behaviour. (#43)
|
7
|
+
|
8
|
+
### Added
|
9
|
+
* Add `XmlStreaming` helper, for more performant handling of large XML documents with Nokogiri. (#43)
|
10
|
+
|
4
11
|
## 8.6.0 / 2019-06-07
|
5
12
|
### Added
|
6
13
|
* Allow conditional preservation of blank lines when joining lines in non-tabular data (#41)
|
data/code_safety.yml
CHANGED
@@ -19,7 +19,7 @@ file safety:
|
|
19
19
|
CHANGELOG.md:
|
20
20
|
comments:
|
21
21
|
reviewed_by: josh.pencheon
|
22
|
-
safe_revision:
|
22
|
+
safe_revision: 0a4e05d45ee65e25edc36de84d3c450cc15dc3ed
|
23
23
|
CODE_OF_CONDUCT.md:
|
24
24
|
comments:
|
25
25
|
reviewed_by: timgentry
|
@@ -139,7 +139,7 @@ file safety:
|
|
139
139
|
lib/ndr_import/file/xml.rb:
|
140
140
|
comments:
|
141
141
|
reviewed_by: josh.pencheon
|
142
|
-
safe_revision:
|
142
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
143
143
|
lib/ndr_import/file/zip.rb:
|
144
144
|
comments:
|
145
145
|
reviewed_by: timgentry
|
@@ -168,6 +168,11 @@ file safety:
|
|
168
168
|
comments:
|
169
169
|
reviewed_by: josh.pencheon
|
170
170
|
safe_revision: d2245268ec6a0e4f60c521d171a820f299632c4f
|
171
|
+
lib/ndr_import/helpers/file/xml_streaming.rb:
|
172
|
+
comments: uses SafePath and Shellwords when accessing filesystem, or making system
|
173
|
+
calls
|
174
|
+
reviewed_by: josh.pencheon
|
175
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
171
176
|
lib/ndr_import/helpers/file/zip.rb:
|
172
177
|
comments:
|
173
178
|
reviewed_by: timgentry
|
@@ -223,7 +228,7 @@ file safety:
|
|
223
228
|
lib/ndr_import/universal_importer_helper.rb:
|
224
229
|
comments:
|
225
230
|
reviewed_by: josh.pencheon
|
226
|
-
safe_revision:
|
231
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
227
232
|
lib/ndr_import/unmapped_data_error.rb:
|
228
233
|
comments:
|
229
234
|
reviewed_by: josh.pencheon
|
@@ -231,7 +236,7 @@ file safety:
|
|
231
236
|
lib/ndr_import/version.rb:
|
232
237
|
comments: another check?
|
233
238
|
reviewed_by: josh.pencheon
|
234
|
-
safe_revision:
|
239
|
+
safe_revision: 0a4e05d45ee65e25edc36de84d3c450cc15dc3ed
|
235
240
|
lib/ndr_import/xml/table.rb:
|
236
241
|
comments:
|
237
242
|
reviewed_by: josh.pencheon
|
@@ -239,7 +244,7 @@ file safety:
|
|
239
244
|
ndr_import.gemspec:
|
240
245
|
comments:
|
241
246
|
reviewed_by: josh.pencheon
|
242
|
-
safe_revision:
|
247
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
243
248
|
test/file/acro_form_test.rb:
|
244
249
|
comments:
|
245
250
|
reviewed_by: josh.pencheon
|
@@ -283,7 +288,7 @@ file safety:
|
|
283
288
|
test/file/xml_test.rb:
|
284
289
|
comments:
|
285
290
|
reviewed_by: josh.pencheon
|
286
|
-
safe_revision:
|
291
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
287
292
|
test/file/zip_test.rb:
|
288
293
|
comments:
|
289
294
|
reviewed_by: timgentry
|
@@ -308,6 +313,10 @@ file safety:
|
|
308
313
|
comments:
|
309
314
|
reviewed_by: timgentry
|
310
315
|
safe_revision: 9abdd6ced1d0c90ce8dd88abee4eb6472c7ff0d6
|
316
|
+
test/helpers/file/xml_streaming_test.rb:
|
317
|
+
comments:
|
318
|
+
reviewed_by: josh.pencheon
|
319
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
311
320
|
test/helpers/file/xml_test.rb:
|
312
321
|
comments:
|
313
322
|
reviewed_by: timgentry
|
@@ -356,6 +365,10 @@ file safety:
|
|
356
365
|
comments:
|
357
366
|
reviewed_by: timgentry
|
358
367
|
safe_revision: dab4b8a3e4b29d85eccd971e79936982d888cffd
|
368
|
+
test/resources/claims_utf16be_but_isnt.xml:
|
369
|
+
comments:
|
370
|
+
reviewed_by: josh.pencheon
|
371
|
+
safe_revision: ae75fb49baf028ac8ce08e4bedcd3625ff3ff0cd
|
359
372
|
test/resources/filesystem_paths.yml:
|
360
373
|
comments:
|
361
374
|
reviewed_by: timgentry
|
data/lib/ndr_import/file/xml.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'ndr_support/safe_file'
|
2
2
|
require 'ndr_import/helpers/file/xml'
|
3
|
+
require 'ndr_import/helpers/file/xml_streaming'
|
3
4
|
require_relative 'registry'
|
4
5
|
|
5
6
|
module NdrImport
|
@@ -9,6 +10,7 @@ module NdrImport
|
|
9
10
|
# This class is a xml file handler that returns a single table.
|
10
11
|
class Xml < Base
|
11
12
|
include NdrImport::Helpers::File::Xml
|
13
|
+
include NdrImport::Helpers::File::XmlStreaming
|
12
14
|
|
13
15
|
private
|
14
16
|
|
@@ -16,9 +18,14 @@ module NdrImport
|
|
16
18
|
def rows(&block)
|
17
19
|
return enum_for(:rows) unless block
|
18
20
|
|
19
|
-
|
21
|
+
xpath = @options['xml_record_xpath']
|
20
22
|
|
21
|
-
|
23
|
+
if @options['slurp']
|
24
|
+
doc = read_xml_file(@filename)
|
25
|
+
doc.xpath(xpath).each(&block)
|
26
|
+
else
|
27
|
+
each_node(@filename, xpath, &block)
|
28
|
+
end
|
22
29
|
rescue StandardError => e
|
23
30
|
raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
|
24
31
|
end
|
@@ -0,0 +1,181 @@
|
|
1
|
+
require 'ndr_support/safe_file'
|
2
|
+
require 'ndr_support/utf8_encoding'
|
3
|
+
|
4
|
+
module NdrImport
|
5
|
+
module Helpers
|
6
|
+
module File
|
7
|
+
# This mixin adds XML streaming functionality, to support more performant handling
|
8
|
+
# of large files by Nokogiri. Uses the `XML::Reader` API, and maintains a temporary
|
9
|
+
# DOM as the XML is streamed to allow XPath querying from the root node.
|
10
|
+
#
|
11
|
+
# If the system has `iconv` available, will attempt to verify the encoding of the
|
12
|
+
# file being read externally, so it can be streamed in to Ruby. Otherwise, will load
|
13
|
+
# the raw data in to check the encoding, but still stream it through Nokogiri's parser.
|
14
|
+
module XmlStreaming
|
15
|
+
# Base error for all streaming-specific issues.
|
16
|
+
class Error < StandardError; end
|
17
|
+
|
18
|
+
# Raised when an element is encountered nested inside another element of the
# same name, which the streaming approach cannot handle.
class NestingError < Error
  # node - the offending element; only its `name` is used, to build the message.
  def initialize(node)
    message = <<~STR
      Element '#{node.name}' was found nested inside another of the same type.
      This is not accessible, and a known limitation of XmlStreaming.
    STR

    super(message)
  end
end
|
27
|
+
|
28
|
+
# Object to track state as the XML is iterated over, and detect
# when an element of interest is entered.
#
# Intended call sequence: `enter` for each opening tag, followed
# immediately by `matches?`, and `leave` for each closing tag.
class Cursor
  # Wrapper to hold a representation of each element we descend into:
  StackItem = Struct.new(:name, :attrs, :empty)

  # xpath - the expression identifying the elements being looked for.
  def initialize(xpath)
    @xpath = xpath
    @stack = []
    @match_depth = nil
  end

  # Has this cursor already passed inside a node with the same name?
  # Returns true or false.
  def in?(node)
    @stack.any? { |item| item.name == node.name }
  end

  # Records that `node` has been descended into.
  def enter(node)
    @stack.push StackItem.new(node.name, node.attributes, node.empty_element?)
  end

  # Records that the innermost element has been closed, and ends the
  # current match once the cursor has moved back out of the matched element.
  def leave(_node)
    @stack.pop
    @match_depth = nil if @match_depth && @stack.length < @match_depth
  end

  # Does the element that the cursor is currently on match what
  # is being looked for?
  def matches?
    # Can't match again if we're inside a match already. Empty elements
    # must still be discarded here, because no closing tag will arrive
    # later to trigger `leave` for them.
    if @match_depth
      @stack.pop if in_empty_element?
      return false
    end

    match = current_stack_match?

    # "empty element" matches are yielded immediately, without
    # tagging the stack as having matched, because there won't
    # be an equivalent closing tag to end the match with later.
    if in_empty_element?
      @stack.pop
    elsif match
      @match_depth = @stack.length
    end

    match
  end

  private

  # Was the innermost element on the stack an empty element?
  def in_empty_element?
    @stack.last.empty
  end

  # Does the current state of the stack mean we've met the xpath
  # criteria? Must be an exact match, not just matching a parent
  # element in the DOM.
  def current_stack_match?
    parent_stack = @stack[0..-2]

    return false unless dom_stubs[@stack].at_xpath(@xpath)

    # Only a match if the xpath was not already satisfied one level up:
    parent_stack.empty? || !dom_stubs[parent_stack].at_xpath(@xpath)
  end

  # A cached collection of DOM fragments, to represent the structure
  # necessary to use xpath to descend into the main document's DOM.
  # Keyed on a copy of the stack, as the stack itself is mutated later.
  def dom_stubs
    @dom_stubs ||= Hash.new do |hash, items|
      hash[items.dup] = Nokogiri::XML::Builder.new do |dom|
        add_items_to_dom(dom, items.dup)
      end.doc
    end
  end

  # Helper to recursively build XML fragment. Consumes `items`,
  # nesting each one inside the element built from the previous one.
  def add_items_to_dom(dom, items)
    item = items.shift
    dom.send(item.name, item.attrs) do
      add_items_to_dom(dom, items) if items.any?
    end
  end
end
|
109
|
+
|
110
|
+
include UTF8Encoding
|
111
|
+
|
112
|
+
# Streams the file at `safe_path`, yielding every element that matches
# `xpath` as soon as it has been found. Without a block, an Enumerator
# over the same elements is returned instead.
#
# If the encoding looks dodgy the raw file may end up being slurped,
# but the XML itself is still parsed as a stream.
def each_node(safe_path, xpath, &block)
  if block.nil?
    return enum_for(:each_node, safe_path, xpath)
  end

  # Loaded lazily, on first use:
  require 'nokogiri'

  with_encoding_check(safe_path) { |io, forced_encoding| stream_xml_nodes(io, xpath, forced_encoding, &block) }
end
|
126
|
+
|
127
|
+
private
|
128
|
+
|
129
|
+
# We need to ensure the raw data is UTF8 before we start streaming
# it with nokogiri. If we can do an external check, great. Otherwise,
# we need to slurp and convert the raw data before presenting it.
#
# Yields an IO-like stream and the encoding to force on the parser:
# the open file and nil when the external check passes, otherwise a
# StringIO of the converted contents and 'UTF8'.
# Returns whatever the block returns.
def with_encoding_check(safe_path)
  path = SafeFile.safepath_to_string(safe_path)

  if external_utf8_check?(safe_path)
    # Block form, so the handle is closed even if the caller raises
    # or stops consuming part-way through the file:
    ::File.open(path) { |stream| yield stream, nil }
  else
    # File.read closes its handle once the contents have been slurped:
    yield StringIO.new(ensure_utf8!(::File.read(path))), 'UTF8'
  end
end
|
144
|
+
|
145
|
+
# Use iconv, if available, to check raw data encoding.
#
# Returns false when `iconv` cannot be found. Otherwise returns the result
# of running `iconv -f UTF-8` over the file, which is truthy only if iconv
# exits successfully.
#
# NOTE(review): no `-t` is given, so the target encoding is presumably the
# locale default - confirm that is intended.
def external_utf8_check?(safe_path)
  # `command -v` is a shell builtin, so this probe has to go through the shell:
  return false unless system('command -v iconv > /dev/null 2>&1')

  path = SafeFile.safepath_to_string(safe_path)

  # Argument-list form: the path is handed to iconv verbatim, with no shell
  # (and so no escaping) involved. Output is discarded; only the exit
  # status matters.
  system('iconv', '-f', 'UTF-8', path, out: ::File::NULL, err: ::File::NULL)
end
|
153
|
+
|
154
|
+
# Reads `io` with Nokogiri's Reader API and yields a standalone DOM element
# for each node matching `node_xpath`. `encoding`, when given, is passed
# through to the reader.
#
# Raises NestingError if an element is opened inside another element of the
# same name.
def stream_xml_nodes(io, node_xpath, encoding = nil)
  # Keeps track of how deeply nested the reader currently is:
  tracker = Cursor.new(node_xpath)

  # RECOVER asks the parser to try to work around markup that isn't well-formed:
  parse_options = Nokogiri::XML::ParseOptions::RECOVER
  xml_reader = Nokogiri::XML::Reader(io, nil, encoding, parse_options)

  xml_reader.each do |node|
    case node.node_type
    when Nokogiri::XML::Reader::TYPE_ELEMENT # "opening tag"
      raise NestingError, node if tracker.in?(node)

      tracker.enter(node)

      if tracker.matches?
        # Re-parse just this node's markup, so a self-contained element is yielded:
        fragment = Nokogiri::XML(node.outer_xml)
        yield fragment.at("./#{node.name}")
      end
    when Nokogiri::XML::Reader::TYPE_END_ELEMENT # "closing tag"
      tracker.leave(node)
    end
  end
end
|
178
|
+
end
|
179
|
+
end
|
180
|
+
end
|
181
|
+
end
|
@@ -38,7 +38,8 @@ module NdrImport
|
|
38
38
|
'col_sep' => table_mapping.try(:delimiter),
|
39
39
|
'file_password' => table_mapping.try(:file_password),
|
40
40
|
'liberal_parsing' => table_mapping.try(:liberal_parsing),
|
41
|
-
'xml_record_xpath' => table_mapping.try(:xml_record_xpath)
|
41
|
+
'xml_record_xpath' => table_mapping.try(:xml_record_xpath),
|
42
|
+
'slurp' => table_mapping.try(:slurp)
|
42
43
|
}
|
43
44
|
|
44
45
|
tables = NdrImport::File::Registry.tables(filename, table_mapping.try(:format), options)
|
data/lib/ndr_import/version.rb
CHANGED
data/ndr_import.gemspec
CHANGED
@@ -44,7 +44,7 @@ Gem::Specification.new do |spec|
|
|
44
44
|
spec.add_development_dependency 'rake', '~> 10.0'
|
45
45
|
spec.add_development_dependency 'minitest'
|
46
46
|
spec.add_development_dependency 'mocha'
|
47
|
-
spec.add_development_dependency 'ndr_dev_support', '
|
47
|
+
spec.add_development_dependency 'ndr_dev_support', '>= 3.1.3'
|
48
48
|
spec.add_development_dependency 'guard'
|
49
49
|
spec.add_development_dependency 'guard-rubocop'
|
50
50
|
spec.add_development_dependency 'guard-test'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ndr_import
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 9.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- NCRS Development Team
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-07-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activemodel
|
@@ -276,9 +276,6 @@ dependencies:
|
|
276
276
|
name: ndr_dev_support
|
277
277
|
requirement: !ruby/object:Gem::Requirement
|
278
278
|
requirements:
|
279
|
-
- - "~>"
|
280
|
-
- !ruby/object:Gem::Version
|
281
|
-
version: '3.1'
|
282
279
|
- - ">="
|
283
280
|
- !ruby/object:Gem::Version
|
284
281
|
version: 3.1.3
|
@@ -286,9 +283,6 @@ dependencies:
|
|
286
283
|
prerelease: false
|
287
284
|
version_requirements: !ruby/object:Gem::Requirement
|
288
285
|
requirements:
|
289
|
-
- - "~>"
|
290
|
-
- !ruby/object:Gem::Version
|
291
|
-
version: '3.1'
|
292
286
|
- - ">="
|
293
287
|
- !ruby/object:Gem::Version
|
294
288
|
version: 3.1.3
|
@@ -413,6 +407,7 @@ files:
|
|
413
407
|
- lib/ndr_import/helpers/file/pdf.rb
|
414
408
|
- lib/ndr_import/helpers/file/word.rb
|
415
409
|
- lib/ndr_import/helpers/file/xml.rb
|
410
|
+
- lib/ndr_import/helpers/file/xml_streaming.rb
|
416
411
|
- lib/ndr_import/helpers/file/zip.rb
|
417
412
|
- lib/ndr_import/mapper.rb
|
418
413
|
- lib/ndr_import/mapping_error.rb
|