setlr 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- setlr/__init__.py +89 -0
- setlr/core.py +1019 -0
- setlr/iterparse_filter.py +669 -0
- setlr/trig_store.py +158 -0
- setlr-1.0.2.dist-info/METADATA +209 -0
- setlr-1.0.2.dist-info/RECORD +10 -0
- setlr-1.0.2.dist-info/WHEEL +5 -0
- setlr-1.0.2.dist-info/entry_points.txt +2 -0
- setlr-1.0.2.dist-info/licenses/LICENSE +201 -0
- setlr-1.0.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,669 @@
|
|
|
1
|
+
"""An experimental XPath-based streaming filter for ElementTree's iterparse
|
|
2
|
+
|
|
3
|
+
For details see:
|
|
4
|
+
http://dalkescientific.com/writings/diary/archive/2006/11/06/iterparse_filter.html
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import print_function
|
|
7
|
+
# I have got to rearrange my site to use shorter URLs.
|
|
8
|
+
|
|
9
|
+
from future import standard_library
|
|
10
|
+
standard_library.install_aliases()
|
|
11
|
+
from builtins import zip
|
|
12
|
+
from builtins import object
|
|
13
|
+
__version__ = "0.9-experimental"
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
|
|
17
|
+
dtd_validation = False
|
|
18
|
+
try:
|
|
19
|
+
from lxml import etree
|
|
20
|
+
dtd_validation = True
|
|
21
|
+
except ImportError:
|
|
22
|
+
try:
|
|
23
|
+
# Python 2.5
|
|
24
|
+
import xml.etree.cElementTree as etree
|
|
25
|
+
except ImportError:
|
|
26
|
+
try:
|
|
27
|
+
# Python 2.5
|
|
28
|
+
import xml.etree.ElementTree as etree
|
|
29
|
+
except ImportError:
|
|
30
|
+
try:
|
|
31
|
+
# normal cElementTree install
|
|
32
|
+
import cElementTree as etree
|
|
33
|
+
except ImportError:
|
|
34
|
+
# normal ElementTree install
|
|
35
|
+
import elementtree.ElementTree as etree
|
|
36
|
+
|
|
37
|
+
# define "letter" as "any character except /:[]()@={}* or in \s"
|
|
38
|
+
# (XXX make it match the XML spec)
|
|
39
|
+
# A URI is:
|
|
40
|
+
# letter+
|
|
41
|
+
# letter+ ':' letter+ --- a namespace prefixed term, like xml:space
|
|
42
|
+
# '{' [^}]* '}' letter+ --- a Clark namespace term, like {http://a}b
|
|
43
|
+
# Can also use a '*' in place of a URI or in the tag part of a namespaced field
|
|
44
|
+
#
|
|
45
|
+
# URIs are separated only by '/' and '//'.
|
|
46
|
+
# These may not occur together, eg, '///' is not allowed.
|
|
47
|
+
|
|
48
|
+
# Basing this tokenization method in part on elementtree.ElementPath
|
|
49
|
+
# Tokenizer for the mini-XPath syntax, based in part on
# elementtree.ElementPath.  The alternatives match, in order: path
# separators, prefix:tag pairs, {uri}tag pairs (Clark notation), bare
# tags, and finally any single leftover character (used to report
# errors).  Each findall() hit is a 7-tuple of the capture groups.
_xpath_token_pattern = re.compile(r"""
    (// | / )                               # separators

    | (?:                                   # namespaced term
        ([^\/\:\[\]\(\)\@\=\{\}\*\s]+) :    # namespace
        ([^\/\:\[\]\(\)\@\=\{\}\*\s]+|\*)   # tag
      )

    | (?:
        \{([^}]*)\}                         # namespace in Clark notation
        ([^\/\:\[\]\(\)\@\=\{\}\*\s]+|\*)   # tag
      )

    | ([^\/\:\[\]\(\)\@\=\{\}\*\s]+|\*)     # tag with no namespace

    | (.)                                   # everything else; used to identify errors
    """, re.X)

# Callers use the bound findall directly.
xpath_tokenizer = _xpath_token_pattern.findall
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def tokenize(s):
    """Yield (op, args, pos) triples for the mini-XPath string *s*.

    op is one of "/", "//", "namespace", "clark" or "default"; args is
    None for separators, a (namespace, tag) pair for "namespace"/"clark"
    terms, or the bare tag string for "default".  pos is the character
    offset of the token in *s*, used in error messages.

    Raises SyntaxError on any character the tokenizer cannot interpret.
    """
    pos = 0
    for token in xpath_tokenizer(s):
        op = token[0]
        if op in ("/", "//"):
            yield (op, None, pos)
            width = len(op)
        elif token[1]:
            yield ("namespace", (token[1], token[2]), pos)
            width = len(token[1]) + 1 + len(token[2])   # "ns:tag"
        elif token[3]:
            yield ("clark", (token[3], token[4]), pos)
            width = len(token[3]) + 2 + len(token[4])   # "{uri}tag"
        elif token[5]:
            yield ("default", token[5], pos)
            width = len(token[5])
        elif token[6]:
            raise SyntaxError("Unknown symbol %r at position %d" %
                              (token[6], pos))
        else:
            raise AssertionError("Unknown token: %r" % (token,))
        # BUG FIX: the original never advanced pos, so every reported
        # position was 0; track the consumed width so positions are real.
        pos += width
|
|
86
|
+
|
|
87
|
+
def _make_original_tag(op, args):
|
|
88
|
+
if op == "namespace":
|
|
89
|
+
return "%s:%s" % (args[0], args[1])
|
|
90
|
+
if op == "clark":
|
|
91
|
+
return "{%s}:%s" % (args[0], args[1])
|
|
92
|
+
if op == "default":
|
|
93
|
+
return args
|
|
94
|
+
raise AssertionError("Strange: %r %r" % (op, args))
|
|
95
|
+
|
|
96
|
+
def _verify_ordering(tokens):
|
|
97
|
+
if not tokens:
|
|
98
|
+
raise SyntaxError(
|
|
99
|
+
"empty xpath not supported (don't know how to handle that case)")
|
|
100
|
+
pos = 0
|
|
101
|
+
prev = None
|
|
102
|
+
SEP = 1
|
|
103
|
+
URI = 2
|
|
104
|
+
# Check that the path alternates between separator and uri
|
|
105
|
+
for op, args, pos in tokens:
|
|
106
|
+
if op in ("/", "//"):
|
|
107
|
+
if prev == SEP:
|
|
108
|
+
raise SyntaxError(
|
|
109
|
+
"separator %r may not follow separator at position %d" %
|
|
110
|
+
(op, pos))
|
|
111
|
+
prev = SEP
|
|
112
|
+
elif op in ("namespace", "clark", "default"):
|
|
113
|
+
if prev == URI:
|
|
114
|
+
errmsg = _make_original_tag(op, args)
|
|
115
|
+
raise SyntaxError(
|
|
116
|
+
"%r may not follow a separator at position %d" %
|
|
117
|
+
(errormsg, pos))
|
|
118
|
+
prev = URI
|
|
119
|
+
else:
|
|
120
|
+
raise AssertionError("Unknown op: %r, %r, %r" % (op, args, pos))
|
|
121
|
+
|
|
122
|
+
if tokens[-1][0] == "//":
|
|
123
|
+
raise AssertionError("xpath may not end with '//'")
|
|
124
|
+
|
|
125
|
+
# There are further optimizations. For example, if this
|
|
126
|
+
# returned a match function instead of the regex then it
|
|
127
|
+
# could special case terms like /blah//* to mean "startswith('/blah/')"
|
|
128
|
+
# The small performance advantages for most cases doesn't
|
|
129
|
+
# currently warrant the extra work.
|
|
130
|
+
def to_regexp(s, namespaces=None, default_namespace=None):
    """Translate the mini-XPath *s* into a regexp source string.

    The regexp is matched (via .search) against an element stack
    rendered as "/tag1/tag2/.../", where each tag is in Clark notation
    when it has a namespace.  *namespaces* maps prefixes to namespace
    URIs; *default_namespace* applies to unprefixed tags (None means
    "no namespace").

    Raises SyntaxError for malformed paths or unknown prefixes.
    """
    # BUG FIX: the original used the mutable default 'namespaces={}';
    # it was only read, but the idiom is a latent trap.
    if namespaces is None:
        namespaces = {}
    tokens = list(tokenize(s))
    _verify_ordering(tokens)

    ### Process the tokens
    re_terms = []
    if tokens[0][0] == "/":
        # A leading '/' anchors the match at the document root.
        re_terms.append("^")
        tokens.pop(0)

    for op, args, pos in tokens:
        if op == "/":
            pass
        elif op == "//":
            # '//' skips any number of intermediate elements.
            re_terms.append("(/[^/]+)*")
        elif op in ("namespace", "clark", "default"):
            # Break each apart to get the correct namespace and tag
            if op == "namespace":
                namespace, tag = args
                try:
                    full_namespace = namespaces[namespace]
                except KeyError:
                    raise SyntaxError("Unknown namespace %r at position %d" %
                                      (namespace, pos))
            elif op == "clark":
                full_namespace, tag = args
            elif op == "default":
                full_namespace = default_namespace
                tag = args

            # Figure out which pattern to use for the combination
            # of (namespace, namespace==None) x (tag, tag=='*')
            if full_namespace is None:
                # No namespace specified
                if tag == "*":
                    # Select everything between the /s
                    re_terms.append("/[^/]+")
                else:
                    # Select exactly the tag, no namespace
                    re_terms.append("/%s" % (re.escape(tag),))
            else:
                # namespace specified
                if tag == "*":
                    # Select only fields in the given namespace
                    re_terms.append("/" +
                                    re.escape("{%s}" % (full_namespace,)) +
                                    "[^/]+")
                else:
                    # Must match namespace and tag, exactly
                    re_terms.append("/" +
                                    re.escape("{%s}%s" % (full_namespace, tag)))
        else:
            raise AssertionError("Unknown op %r" % (op,))

    # Must be a complete match
    re_terms.append("/$")

    return "".join(re_terms)
|
|
188
|
+
|
|
189
|
+
class IterParseFilter(object):
    """Registry of XPath-ish event filters over ElementTree's iterparse.

    Handlers are registered per path; create_fa() compiles them into a
    FilterAutomata, and parse()/iterparse()/handler_parse() build a
    fresh automata per call.
    """

    def __init__(self, namespaces=None, default_namespace=None, validate_dtd=False):
        if namespaces is None:
            namespaces = {}
        self.namespaces = namespaces                # prefix -> namespace URI
        self.default_namespace = default_namespace  # for unprefixed tags
        self.validate_dtd = validate_dtd            # only honored under lxml

        self._start_document_handlers = []
        self._end_document_handlers = []

        # Each entry is (path, compiled-search-function, handler).
        self._start_filters = []
        self._end_filters = []
        self._default_start_filters = []
        self._default_end_filters = []
        # Each entry is (path, compiled-search-function).
        self._iter_start_filters = []
        self._iter_end_filters = []

        self._start_ns_handlers = []
        self._end_ns_handlers = []
        self._iter_start_ns = False
        self._iter_end_ns = False

    def on_start_document(self, handler):
        """Call handler("start-document", None, state) before parsing."""
        self._start_document_handlers.append(handler)

    def on_end_document(self, handler):
        """Call handler("end-document", None, state) after parsing."""
        self._end_document_handlers.append(handler)

    def _add_handler(self, filters, path, handler):
        # Compile the path once, up front; raises SyntaxError on bad paths.
        path_re = to_regexp(path,
                            namespaces=self.namespaces,
                            default_namespace=self.default_namespace)
        filters.append((path, re.compile(path_re).search, handler))

    def on_start(self, path, handler):
        """Call handler on each "start" event whose stack matches *path*."""
        self._add_handler(self._start_filters, path, handler)

    def on_end(self, path, handler):
        """Call handler on each "end" event whose stack matches *path*."""
        self._add_handler(self._end_filters, path, handler)

    def on_start_default(self, path, handler):
        """Like on_start(), but only used when no on_start() handler matched."""
        self._add_handler(self._default_start_filters, path, handler)

    def on_end_default(self, path, handler):
        """Like on_end(), but only used when no on_end() handler matched."""
        self._add_handler(self._default_end_filters, path, handler)

    def _add_yielder(self, yielders, path):
        path_re = to_regexp(path,
                            namespaces=self.namespaces,
                            default_namespace=self.default_namespace)

        yielders.append((path, re.compile(path_re).search))

    def iter_start(self, path):
        """Make parse() yield (event, elem) when *path* starts."""
        self._add_yielder(self._iter_start_filters, path)

    def iter_end(self, path):
        """Make parse() yield (event, elem) when *path* ends."""
        self._add_yielder(self._iter_end_filters, path)

    def on_start_ns(self, handler):
        self._start_ns_handlers.append(handler)

    def on_end_ns(self, handler):
        self._end_ns_handlers.append(handler)

    def iter_start_ns(self):
        self._iter_start_ns = True

    def iter_end_ns(self):
        self._iter_end_ns = True

    def _get_filter_info(self, category):
        # NOTE(review): 'self.filters' is never defined in this class and
        # the stored filter entries are 3-tuples, not 4-tuples, so this
        # method raises AttributeError if ever called.  It appears to be
        # dead code from an earlier design; kept only for compatibility.
        for (_, _, pat, handler) in self.filters[category]:
            yield (pat, handler)

    def create_fa(self):
        """Snapshot the registered handlers into a new FilterAutomata."""
        # Make copies of everything to emphasize that they must
        # not be changed during processing.
        return FilterAutomata(
            # BUG FIX: start_document_handlers was passed uncopied,
            # contradicting the comment above; later mutation of this
            # filter would have leaked into a live automata.
            start_document_handlers = self._start_document_handlers[:],
            end_document_handlers = self._end_document_handlers[::-1],  # reverse!
            start_filters = self._start_filters[:],
            end_filters = self._end_filters[::-1],  # reversing here!
            default_start_filters = self._default_start_filters[:],
            default_end_filters = self._default_end_filters[::-1],  # reversing!
            iter_start_filters = self._iter_start_filters[:],
            iter_end_filters = self._iter_end_filters[:],

            start_ns_handlers = self._start_ns_handlers[:],
            end_ns_handlers = self._end_ns_handlers[::-1],  # reversing here!
            iter_start_ns = self._iter_start_ns,
            iter_end_ns = self._iter_end_ns)

    # These forward to the underlying automata; make a new one each time.
    def parse(self, file, state=None):
        return self.create_fa().parse(file, state, self.validate_dtd)

    # Experimental
    def iterparse(self, file):
        return self.create_fa().iterparse(file, self.validate_dtd)

    # I need a better name
    def handler_parse(self, file, state=None):
        return self.create_fa().handler_parse(file, state)
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
class FilterAutomata(object):
    """Executable form of an IterParseFilter's handler tables.

    Drives (c)ElementTree/lxml iterparse over a file, dispatching
    events to the registered handlers and yielding (event, elem) pairs
    for paths marked with iter_start/iter_end.
    """

    def __init__(self,
                 start_document_handlers,
                 end_document_handlers,

                 start_filters,
                 end_filters,
                 default_start_filters,
                 default_end_filters,
                 iter_start_filters,
                 iter_end_filters,

                 start_ns_handlers,
                 end_ns_handlers,
                 iter_start_ns,
                 iter_end_ns):
        self.start_document_handlers = start_document_handlers
        self.end_document_handlers = end_document_handlers

        self.start_filters = start_filters
        self.end_filters = end_filters
        self.default_start_filters = default_start_filters
        self.default_end_filters = default_end_filters
        self.iter_start_filters = iter_start_filters
        self.iter_end_filters = iter_end_filters

        self.start_ns_handlers = start_ns_handlers
        self.end_ns_handlers = end_ns_handlers
        self.iter_start_ns = iter_start_ns
        self.iter_end_ns = iter_end_ns

        # Can cache results over multiple invocations
        # NOTE: not thread-safe.  Though given the GIL
        # this shouldn't be a problem.
        self.dfa = {}

    def _new_node(self, stack_as_path):
        """Build the dispatch tuple for one element-stack path.

        Returns (children-dict, start-handlers, end-handlers,
        iter-start-flag, iter-end-flag) for *stack_as_path*, a string of
        the form "/tag1/tag2/.../".
        """
        start_handlers = []
        for (path, matcher, handler) in self.start_filters:
            if matcher(stack_as_path):
                start_handlers.append(handler)

        if not start_handlers:
            # Any defaults?
            for (path, matcher, handler) in self.default_start_filters:
                if matcher(stack_as_path):
                    start_handlers.append(handler)

        end_handlers = []
        for (path, matcher, handler) in self.end_filters:
            if matcher(stack_as_path):
                end_handlers.append(handler)
        if not end_handlers:
            # Any defaults?
            for (path, matcher, handler) in self.default_end_filters:
                if matcher(stack_as_path):
                    end_handlers.append(handler)

        # Have all the handlers, now check for yields
        iter_start = False
        for (path, matcher) in self.iter_start_filters:
            if matcher(stack_as_path):
                iter_start = True
                break

        iter_end = False
        for (path, matcher) in self.iter_end_filters:
            if matcher(stack_as_path):
                iter_end = True
                break

        new_node = ({}, start_handlers, end_handlers, iter_start, iter_end)
        return new_node

    def _needed_actions(self, iter=False, handler=False):
        """Return the iterparse event tuple needed by the registered
        handlers (handler=True) and/or the yield requests (iter=True)."""
        # BUG FIX: the original guard referenced an undefined name 'cb';
        # it only ever "worked" because callers passed handler=True,
        # which short-circuited the 'and' before 'cb' was evaluated.
        if (not handler) and (not iter):
            raise AssertionError("must specify one")
        actions = ("start", "end")
        if ((handler and self.start_ns_handlers) or
                (iter and self.iter_start_ns)):
            actions = actions + ("start-ns",)

        if ((handler and self.end_ns_handlers) or
                (iter and self.iter_end_ns)):
            actions = actions + ("end-ns",)
        return actions

    # I plan to implement 'handler_parse' as a near copy of 'parse'
    # but without any yield statements.
    def handler_parse(self, file, state=None):
        """Run parse() for its handler side effects, discarding yields."""
        for x in self.parse(file, state):
            pass

    # I plan to implement 'iterparse' as a near copy of 'parse'
    # but without any references to callbacks
    def iterparse(self, file, validate_dtd=False):
        return self.parse(file, None, validate_dtd)

    def parse(self, file, state=None, validate_dtd=False):
        """Parse *file*, dispatching handlers and yielding (event, elem)
        pairs for registered iter_* paths.

        *state* is passed unchanged as the third argument of every
        handler.  *validate_dtd* is honored only when lxml is the
        backing parser.
        """
        if not dtd_validation:
            # Only lxml understands dtd_validation; silently disable it
            # for the (c)ElementTree fallbacks.
            validate_dtd = False
        node_stack = []
        node_stack_append = node_stack.append
        tag_stack = []
        tag_stack_append = tag_stack.append
        # children, start handlers, end handlers, iter start, iter end
        node = (self.dfa, [], [], False, False)

        # synthesize start-document events
        for handler in self.start_document_handlers:
            handler("start-document", None, state)

        # figure out if I also need start-ns and/or end-ns events
        needed_actions = self._needed_actions(True, True)
        kwargs = {}
        if validate_dtd:
            kwargs = dict(dtd_validation=True)
        for (event, ele) in etree.iterparse(file, needed_actions, **kwargs):
            if event == "start":
                tag = ele.tag
                # Descend into node; track where I am
                tag_stack_append(tag)
                node_stack_append(node)
                stack_as_path = "/" + ("/".join(tag_stack)) + "/"
                node = self._new_node(stack_as_path)

                # call the start handlers then yield the element
                for start_handler in node[1]:
                    start_handler(event, ele, state)
                if node[3]:
                    yield (event, ele)

            elif event == "end":
                # call the end handlers then yield the element
                for end_handler in node[2]:
                    end_handler(event, ele, state)
                del tag_stack[-1]
                if node[4]:
                    yield (event, ele)
                # It's safe to call clear() here because no descendants will be
                # accessed
                ele.clear()
                # BUG FIX: getparent() exists only under lxml; the
                # unguarded attribute access raised AttributeError under
                # the (c)ElementTree fallbacks this module supports.
                getparent = getattr(ele, "getparent", None)
                if getparent is not None and getparent() is not None:
                    getparent().remove(ele)
                node = node_stack.pop()

            elif event == "start-ns":
                # ele is the (prefix, uri) pair, not an Element.
                for handler in self.start_ns_handlers:
                    handler(event, ele, state)
                if self.iter_start_ns:
                    # BUG FIX: removed a leftover debug print here.
                    yield (event, ele)

            elif event == "end-ns":
                # ele is None for end-ns events.
                for handler in self.end_ns_handlers:
                    handler(event, ele, state)
                # BUG FIX: this branch tested iter_start_ns instead of
                # iter_end_ns, printed debug output, and then called
                # ele.clear() / ele.getparent() on the None payload,
                # which could never succeed.
                if self.iter_end_ns:
                    yield (event, ele)

        for handler in self.end_document_handlers:
            handler("end-document", None, state)
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
#### An incomplete test suite ####
|
|
468
|
+
|
|
469
|
+
def test_path(path, args):
    """Return True if xpath *path* (no namespaces) matches the tag
    stack *args* (a list of tag names, outermost first)."""
    matcher = re.compile(to_regexp(path)).search
    as_path = "/%s/" % "/".join(args)
    return bool(matcher(as_path))
|
|
476
|
+
|
|
477
|
+
def test_ns_path(path, args):
    """Like test_path(), but with the test 'xml' and 'das2' prefixes
    defined and the empty default namespace."""
    prefixes = {
        "xml": "http://www.w3.org/XML/1998/namespace",
        "das2": "http://biodas.org/documents/das2",
    }
    # the empty namespace is not the same as no namespace!
    pattern = to_regexp(path, namespaces=prefixes, default_namespace="")

    as_path = "/%s/" % "/".join(args)
    return re.compile(pattern).search(as_path) is not None
|
|
490
|
+
|
|
491
|
+
def test_syntax():
    """Table-driven checks of the path-to-regexp translation."""
    def check(checker, cases):
        # Each case is (xpath, tag stack, expected truthiness).
        for xpath, tag_list, expect in cases:
            got = checker(xpath, tag_list)
            if got != expect:
                raise AssertionError("xpath %r against %r got %r, expected %r" %
                                     (xpath, tag_list, got, bool(expect)))

    check(test_path, (
        ("A", ["A"], 1),
        ("A", ["AA"], 0),
        ("A", ["B", "A"], 1),
        ("/A", ["B", "A"], 0),
        ("/B", ["B", "A"], 0),
        ("//A", ["B", "A"], 1),
        ("A//B", ["A", "B"], 1),
        ("A//B", ["C", "A", "B"], 1),
        ("/A//B", ["C", "A", "B"], 0),
        ("/B/*", ["B", "A"], 1),
        # Test back-tracking; both greedy and non-greedy cases
        ("A//B//C//D", ["A", "B", "C", "B", "D"], 1),
        ("A//B/D", ["A", "B", "C", "B", "D"], 1),

        # Clark namespace tests
        ("{http://x.com}A", ["{http://x.com}A"], 1),
        ("{http://x.org}A", ["{http://x.com}A"], 0),
        ("{http://x.org}A", ["{http://x.com}B", "{http://x.org}A"], 1),
        ("*", ["{http://x.com}A"], 1),
        ("{http://x.com}*", ["{http://x.com}A"], 1),
        ("{http://x.com}*", ["{http://x.org}A"], 0),
    ))

    check(test_ns_path, (
        # various namespace checks
        ("xml:A", ["{http://www.w3.org/XML/1998/namespace}A"], 1),
        ("xml:A", ["{http://www.w3.org/XML/1998/namespace2}A"], 0),
        ("xml:A", ["{http://www.w3.org/XML/1998/namespace}AA"], 0),
        ("xml:A", ["{http://www.w3.org/XML/1998/namespace}B",
                   "{http://www.w3.org/XML/1998/namespace}A"], 1),
        ("xml:B", ["{http://www.w3.org/XML/1998/namespace}B",
                   "{http://www.w3.org/XML/1998/namespace}A"], 0),

        ("A", ["{}A"], 1),
        ("A", ["A"], 0),

        ("*", ["A"], 0),
        ("*", ["{}A"], 1),
        ("das2:*", ["{http://biodas.org/documents/das2}AAA"], 1),
        ("das2:*", ["{}AAA"], 0),
        ("xml:*/das2:*", ["{http://www.w3.org/XML/1998/namespace}ABC",
                          "{http://biodas.org/documents/das2}ABC"], 1),
        ("das2:*/xml:*", ["{http://www.w3.org/XML/1998/namespace}ABC",
                          "{http://biodas.org/documents/das2}ABC"], 0),
    ))
|
|
549
|
+
|
|
550
|
+
def test_filtering():
    """Check handler dispatch order and path-based start filters."""
    import io
    f = io.StringIO("""\
<A><AA>
<B xmlns="http://z/"><C/><spam:D xmlns:spam="http://spam/">eggs</spam:D></B>
<B x='6'>foo<B y='7'>bar</B>baz</B>
</AA></A>""")
    special = object()

    class Capture(object):
        # Records every (event, elem) pair and verifies the state object
        # is threaded through unchanged.
        def __init__(self):
            self.history = []

        def __call__(self, event, ele, state):
            if state is not special:
                raise AssertionError("Did not get expected state")
            self.history.append((event, ele))

    filter = IterParseFilter()
    capture_all = Capture()
    filter.on_start_document(capture_all)
    filter.on_start("*", capture_all)
    filter.on_end("*", capture_all)
    filter.on_end_document(capture_all)
    filter.on_start_ns(capture_all)
    filter.on_end_ns(capture_all)

    # Nothing was registered with iter_start/iter_end, so nothing may
    # be yielded.
    for x in filter.parse(f, state=special):
        raise AssertionError("should not yield %r" % (x,))

    expect_history = (
        ("start-document", None),
        ("start", "A"),
        ("start", "AA"),
        ("start-ns", ("", "http://z/")),
        ("start", "{http://z/}B"),
        ("start", "{http://z/}C"),
        ("end", "{http://z/}C"),
        ("start-ns", ("spam", "http://spam/")),
        ("start", "{http://spam/}D"),
        ("end", "{http://spam/}D"),
        ("end-ns", None),
        ("end", "{http://z/}B"),
        ("end-ns", None),
        ("start", "B"),
        ("start", "B"),
        ("end", "B"),
        ("end", "B"),
        ("end", "AA"),
        ("end", "A"),
        ("end-document", None),
    )

    for (got, expect) in zip(capture_all.history, expect_history):
        event, ele = got
        tag = getattr(ele, "tag", ele)
        if (event, tag) != expect:
            raise AssertionError("Expected %r Got %r" % (expect, (event, tag)))
    if len(capture_all.history) != len(expect_history):
        raise AssertionError("Length mismatch")

    f.seek(0)
    filter = IterParseFilter()

    def must_match_B(event, ele, state):
        if ele.tag != "B":
            raise AssertionError("%r is not B" % (ele.tag,))

    def must_match_B_y7(event, ele, state):
        if ele.tag != "B":
            raise AssertionError("%r is not B" % (ele.tag,))
        if ele.attrib["y"] != "7":
            raise AssertionError("%r is not the correct B" % (ele.tag,))

    filter.on_start("B", must_match_B)
    filter.on_start("B/B", must_match_B_y7)

    # BUG FIX: the original ended with a bare 'f.seek' attribute access
    # (a no-op), so the second filter was configured but never run.
    f.seek(0)
    filter.handler_parse(f)
|
|
624
|
+
|
|
625
|
+
|
|
626
|
+
def test_parse():
    """Smoke-test against a local iTunes library file, if present.

    Skips silently (with a message) when the hard-coded file is absent.
    """
    import os
    filename = "/Users/dalke/Music/iTunes/iTunes Music Library.xml"
    if not os.path.exists(filename):
        print("Cannot find %r: skipping test" % (filename,))
        return

    # Work through callbacks
    ef = IterParseFilter()

    def print_info(event, ele, state):
        # plist dicts alternate <key>/<value> children; pair them up.
        d = {}
        children = iter(ele)
        for child in children:
            key = child.text
            # BUG FIX: 'children.next()' is Python 2 only; the module
            # otherwise targets Python 3 via its future/builtins imports.
            value = next(children).text
            d[key] = value
        print("%r is by %r" % (d["Name"], d.get("Artist", "<unknown>")))
        ele.clear()

    ef.on_end("/plist/dict/dict/dict", print_info)
    ef.handler_parse(open(filename))

    # Work through iterators
    ef = IterParseFilter()
    ef.iter_end("/plist/dict/dict/dict")
    for (event, ele) in ef.iterparse(open(filename)):
        d = {}
        children = iter(ele)
        for child in children:
            key = child.text
            value = next(children).text
            d[key] = value
        print("%r is a %r song" % (d["Name"], d.get("Genre", "<unknown>")))
        ele.clear()
|
|
660
|
+
|
|
661
|
+
|
|
662
|
+
def test():
    """Run the whole (incomplete) test suite, in order."""
    for case in (test_syntax, test_filtering, test_parse):
        case()
|
|
666
|
+
|
|
667
|
+
if __name__ == "__main__":
    # Run the self-tests when executed as a script.
    test()
    print("All tests passed.")
|