lxml 6.0.0__cp310-cp310-manylinux_2_31_armv7l.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lxml/ElementInclude.py +244 -0
- lxml/__init__.py +22 -0
- lxml/_elementpath.cpython-310-arm-linux-gnueabihf.so +0 -0
- lxml/_elementpath.py +343 -0
- lxml/apihelpers.pxi +1801 -0
- lxml/builder.cpython-310-arm-linux-gnueabihf.so +0 -0
- lxml/builder.py +243 -0
- lxml/classlookup.pxi +580 -0
- lxml/cleanup.pxi +215 -0
- lxml/cssselect.py +101 -0
- lxml/debug.pxi +36 -0
- lxml/docloader.pxi +178 -0
- lxml/doctestcompare.py +488 -0
- lxml/dtd.pxi +479 -0
- lxml/etree.cpython-310-arm-linux-gnueabihf.so +0 -0
- lxml/etree.h +244 -0
- lxml/etree.pyx +3853 -0
- lxml/etree_api.h +204 -0
- lxml/extensions.pxi +830 -0
- lxml/html/ElementSoup.py +10 -0
- lxml/html/__init__.py +1927 -0
- lxml/html/_diffcommand.py +86 -0
- lxml/html/_difflib.cpython-310-arm-linux-gnueabihf.so +0 -0
- lxml/html/_difflib.py +2106 -0
- lxml/html/_html5builder.py +100 -0
- lxml/html/_setmixin.py +56 -0
- lxml/html/builder.py +173 -0
- lxml/html/clean.py +21 -0
- lxml/html/defs.py +135 -0
- lxml/html/diff.cpython-310-arm-linux-gnueabihf.so +0 -0
- lxml/html/diff.py +972 -0
- lxml/html/formfill.py +299 -0
- lxml/html/html5parser.py +260 -0
- lxml/html/soupparser.py +314 -0
- lxml/html/usedoctest.py +13 -0
- lxml/includes/__init__.pxd +0 -0
- lxml/includes/__init__.py +0 -0
- lxml/includes/c14n.pxd +25 -0
- lxml/includes/config.pxd +3 -0
- lxml/includes/dtdvalid.pxd +18 -0
- lxml/includes/etree_defs.h +379 -0
- lxml/includes/etreepublic.pxd +237 -0
- lxml/includes/extlibs/__init__.py +0 -0
- lxml/includes/extlibs/libcharset.h +45 -0
- lxml/includes/extlibs/localcharset.h +137 -0
- lxml/includes/extlibs/zconf.h +543 -0
- lxml/includes/extlibs/zlib.h +1938 -0
- lxml/includes/htmlparser.pxd +56 -0
- lxml/includes/libexslt/__init__.py +0 -0
- lxml/includes/libexslt/exslt.h +108 -0
- lxml/includes/libexslt/exsltconfig.h +70 -0
- lxml/includes/libexslt/exsltexports.h +63 -0
- lxml/includes/libxml/HTMLparser.h +339 -0
- lxml/includes/libxml/HTMLtree.h +148 -0
- lxml/includes/libxml/SAX.h +18 -0
- lxml/includes/libxml/SAX2.h +170 -0
- lxml/includes/libxml/__init__.py +0 -0
- lxml/includes/libxml/c14n.h +115 -0
- lxml/includes/libxml/catalog.h +183 -0
- lxml/includes/libxml/chvalid.h +230 -0
- lxml/includes/libxml/debugXML.h +79 -0
- lxml/includes/libxml/dict.h +82 -0
- lxml/includes/libxml/encoding.h +307 -0
- lxml/includes/libxml/entities.h +147 -0
- lxml/includes/libxml/globals.h +25 -0
- lxml/includes/libxml/hash.h +251 -0
- lxml/includes/libxml/list.h +137 -0
- lxml/includes/libxml/nanoftp.h +16 -0
- lxml/includes/libxml/nanohttp.h +98 -0
- lxml/includes/libxml/parser.h +1633 -0
- lxml/includes/libxml/parserInternals.h +591 -0
- lxml/includes/libxml/relaxng.h +224 -0
- lxml/includes/libxml/schemasInternals.h +959 -0
- lxml/includes/libxml/schematron.h +143 -0
- lxml/includes/libxml/threads.h +81 -0
- lxml/includes/libxml/tree.h +1326 -0
- lxml/includes/libxml/uri.h +106 -0
- lxml/includes/libxml/valid.h +485 -0
- lxml/includes/libxml/xinclude.h +141 -0
- lxml/includes/libxml/xlink.h +193 -0
- lxml/includes/libxml/xmlIO.h +419 -0
- lxml/includes/libxml/xmlautomata.h +163 -0
- lxml/includes/libxml/xmlerror.h +962 -0
- lxml/includes/libxml/xmlexports.h +96 -0
- lxml/includes/libxml/xmlmemory.h +188 -0
- lxml/includes/libxml/xmlmodule.h +61 -0
- lxml/includes/libxml/xmlreader.h +444 -0
- lxml/includes/libxml/xmlregexp.h +116 -0
- lxml/includes/libxml/xmlsave.h +111 -0
- lxml/includes/libxml/xmlschemas.h +254 -0
- lxml/includes/libxml/xmlschemastypes.h +152 -0
- lxml/includes/libxml/xmlstring.h +140 -0
- lxml/includes/libxml/xmlunicode.h +15 -0
- lxml/includes/libxml/xmlversion.h +332 -0
- lxml/includes/libxml/xmlwriter.h +489 -0
- lxml/includes/libxml/xpath.h +569 -0
- lxml/includes/libxml/xpathInternals.h +639 -0
- lxml/includes/libxml/xpointer.h +48 -0
- lxml/includes/libxslt/__init__.py +0 -0
- lxml/includes/libxslt/attributes.h +39 -0
- lxml/includes/libxslt/documents.h +93 -0
- lxml/includes/libxslt/extensions.h +262 -0
- lxml/includes/libxslt/extra.h +72 -0
- lxml/includes/libxslt/functions.h +78 -0
- lxml/includes/libxslt/imports.h +75 -0
- lxml/includes/libxslt/keys.h +53 -0
- lxml/includes/libxslt/namespaces.h +68 -0
- lxml/includes/libxslt/numbersInternals.h +73 -0
- lxml/includes/libxslt/pattern.h +84 -0
- lxml/includes/libxslt/preproc.h +43 -0
- lxml/includes/libxslt/security.h +104 -0
- lxml/includes/libxslt/templates.h +77 -0
- lxml/includes/libxslt/transform.h +207 -0
- lxml/includes/libxslt/variables.h +118 -0
- lxml/includes/libxslt/xslt.h +110 -0
- lxml/includes/libxslt/xsltInternals.h +1995 -0
- lxml/includes/libxslt/xsltconfig.h +146 -0
- lxml/includes/libxslt/xsltexports.h +64 -0
- lxml/includes/libxslt/xsltlocale.h +44 -0
- lxml/includes/libxslt/xsltutils.h +343 -0
- lxml/includes/lxml-version.h +3 -0
- lxml/includes/relaxng.pxd +64 -0
- lxml/includes/schematron.pxd +34 -0
- lxml/includes/tree.pxd +492 -0
- lxml/includes/uri.pxd +5 -0
- lxml/includes/xinclude.pxd +22 -0
- lxml/includes/xmlerror.pxd +852 -0
- lxml/includes/xmlparser.pxd +303 -0
- lxml/includes/xmlschema.pxd +35 -0
- lxml/includes/xpath.pxd +136 -0
- lxml/includes/xslt.pxd +190 -0
- lxml/isoschematron/__init__.py +348 -0
- lxml/isoschematron/resources/rng/iso-schematron.rng +709 -0
- lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl +75 -0
- lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl +77 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl +313 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl +1160 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl +55 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl +1796 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl +588 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt +84 -0
- lxml/iterparse.pxi +438 -0
- lxml/lxml.etree.h +244 -0
- lxml/lxml.etree_api.h +204 -0
- lxml/nsclasses.pxi +281 -0
- lxml/objectify.cpython-310-arm-linux-gnueabihf.so +0 -0
- lxml/objectify.pyx +2149 -0
- lxml/objectpath.pxi +332 -0
- lxml/parser.pxi +2059 -0
- lxml/parsertarget.pxi +180 -0
- lxml/proxy.pxi +619 -0
- lxml/public-api.pxi +178 -0
- lxml/pyclasslookup.py +3 -0
- lxml/readonlytree.pxi +565 -0
- lxml/relaxng.pxi +165 -0
- lxml/sax.cpython-310-arm-linux-gnueabihf.so +0 -0
- lxml/sax.py +286 -0
- lxml/saxparser.pxi +875 -0
- lxml/schematron.pxi +173 -0
- lxml/serializer.pxi +1849 -0
- lxml/usedoctest.py +13 -0
- lxml/xinclude.pxi +67 -0
- lxml/xmlerror.pxi +1654 -0
- lxml/xmlid.pxi +179 -0
- lxml/xmlschema.pxi +215 -0
- lxml/xpath.pxi +487 -0
- lxml/xslt.pxi +957 -0
- lxml/xsltext.pxi +242 -0
- lxml-6.0.0.dist-info/METADATA +163 -0
- lxml-6.0.0.dist-info/RECORD +174 -0
- lxml-6.0.0.dist-info/WHEEL +5 -0
- lxml-6.0.0.dist-info/licenses/LICENSE.txt +31 -0
- lxml-6.0.0.dist-info/licenses/LICENSES.txt +29 -0
- lxml-6.0.0.dist-info/top_level.txt +1 -0
lxml/parser.pxi
ADDED
@@ -0,0 +1,2059 @@
|
|
1
|
+
# Parsers for XML and HTML
|
2
|
+
|
3
|
+
from lxml.includes cimport xmlparser
|
4
|
+
from lxml.includes cimport htmlparser
|
5
|
+
|
6
|
+
cdef object _GenericAlias
try:
    from types import GenericAlias as _GenericAlias
except ImportError:
    # Python 3.8 has no "types.GenericAlias".  The stand-in below is only
    # needed as the return value of "__class_getitem__".
    def _GenericAlias(cls, item):
        # Subscription items are not necessarily classes (they may be strings,
        # tuples or other instances), so fall back to repr() when the item
        # has no "__name__" instead of failing with an AttributeError.
        item_name = getattr(item, '__name__', None)
        if item_name is None:
            item_name = repr(item)
        return f"{cls.__name__}[{item_name}]"
|
13
|
+
|
14
|
+
|
15
|
+
class ParseError(LxmlSyntaxError):
    """Syntax error while parsing an XML document.

    For compatibility with ElementTree 1.3 and later.

    The ``column`` given to the constructor is stored as ``column - 1`` in
    ``offset``.  The ``position`` property reports ``(lineno, offset + 1)``
    and applies the same shift when it is assigned.
    """
    def __init__(self, message, code, line, column, filename=None):
        # "_ParseError" is the module-level alias that is bound to this class
        # right below the class body.
        super(_ParseError, self).__init__(message)
        shifted_column = column - 1
        self.lineno = line
        self.offset = shifted_column
        self.code = code
        self.filename = filename

    @property
    def position(self):
        return self.lineno, self.offset + 1

    @position.setter
    def position(self, new_pos):
        line, column = new_pos
        self.lineno = line
        self.offset = column - 1

cdef object _ParseError = ParseError
|
36
|
+
|
37
|
+
|
38
|
+
class XMLSyntaxError(ParseError):
    """Syntax error while parsing an XML document.

    Plain subclass of ParseError that defines no members of its own.
    """
|
41
|
+
|
42
|
+
cdef class ParserError(LxmlError):
    """Internal lxml parser error.

    Plain subclass of LxmlError that defines no members of its own.
    """
|
45
|
+
|
46
|
+
|
47
|
+
@cython.final
@cython.internal
cdef class _ParserDictionaryContext:
    # Global parser context to share the string dictionary.
    #
    # This class is a delegate singleton!
    #
    # One _ParserDictionaryContext object is created per thread to keep that
    # thread's state, but those per-thread objects must never be used
    # directly.  Always go through the static __GLOBAL_PARSER_CONTEXT that is
    # defined right below the class.
    #

    cdef tree.xmlDict* _c_dict
    cdef _BaseParser _default_parser
    cdef list _implied_parser_contexts

    def __cinit__(self):
        self._implied_parser_contexts = []

    def __dealloc__(self):
        if self._c_dict is not NULL:
            xmlparser.xmlDictFree(self._c_dict)

    cdef int initMainParserContext(self) except -1:
        """Store this (global) context in the thread dictionary of the main
        thread.  To be called once and only in the main thread."""
        c_thread_state = python.PyThreadState_GetDict()
        if c_thread_state is NULL:
            return 0
        (<dict>c_thread_state)["_ParserDictionaryContext"] = self
        return 0

    cdef _ParserDictionaryContext _findThreadParserContext(self):
        "Look up (or create and register) the context object of the current thread"
        cdef _ParserDictionaryContext thread_context
        c_thread_state = python.PyThreadState_GetDict()
        if c_thread_state is NULL:
            # no thread dictionary available => use this object itself
            return self
        state = <dict>c_thread_state
        c_known = python.PyDict_GetItem(state, "_ParserDictionaryContext")
        if c_known is not NULL:
            return <object>c_known
        thread_context = <_ParserDictionaryContext>_ParserDictionaryContext.__new__(_ParserDictionaryContext)
        state["_ParserDictionaryContext"] = thread_context
        return thread_context

    cdef int setDefaultParser(self, _BaseParser parser) except -1:
        "Set the default parser for the current thread"
        self._findThreadParserContext()._default_parser = parser
        return 0

    cdef _BaseParser getDefaultParser(self):
        "Return (or create) the default parser of the current thread"
        cdef _ParserDictionaryContext context = self._findThreadParserContext()
        if context._default_parser is not None:
            return context._default_parser
        if self._default_parser is None:
            self._default_parser = __DEFAULT_XML_PARSER._copy()
        if context is not self:
            # other contexts get their own copy of this object's default parser
            context._default_parser = self._default_parser._copy()
        return context._default_parser

    cdef tree.xmlDict* _getThreadDict(self, tree.xmlDict* default):
        "Return the thread-local dict or create a new one if necessary."
        cdef _ParserDictionaryContext context = self._findThreadParserContext()
        if context._c_dict is not NULL:
            return context._c_dict
        # thread dict not yet set up => use default or create a new one
        if default is not NULL:
            context._c_dict = default
            xmlparser.xmlDictReference(default)
            return default
        if self._c_dict is NULL:
            self._c_dict = xmlparser.xmlDictCreate()
        if context is not self:
            context._c_dict = xmlparser.xmlDictCreateSub(self._c_dict)
        return context._c_dict

    cdef int initThreadDictRef(self, tree.xmlDict** c_dict_ref) except -1:
        "Make the referenced dict pointer refer to the thread dict."
        c_current = c_dict_ref[0]
        c_thread_dict = self._getThreadDict(c_current)
        if c_current is not c_thread_dict:
            # release the dict that was there before and take a reference
            # to the thread dict that replaces it
            if c_current is not NULL:
                xmlparser.xmlDictFree(c_current)
            c_dict_ref[0] = c_thread_dict
            xmlparser.xmlDictReference(c_thread_dict)
        return 0

    cdef int initParserDict(self, xmlparser.xmlParserCtxt* pctxt) except -1:
        "Assure we always use the same string dictionary."
        self.initThreadDictRef(&pctxt.dict)
        pctxt.dictNames = 1
        return 0

    cdef int initXPathParserDict(self, xpath.xmlXPathContext* pctxt) except -1:
        "Assure we always use the same string dictionary."
        self.initThreadDictRef(&pctxt.dict)
        return 0

    cdef int initDocDict(self, xmlDoc* result) except -1:
        "Store dict of last object parsed if no shared dict yet"
        # XXX We also free the result dict here if there already was one.
        # This case should only occur for new documents with empty dicts,
        # otherwise we'd free data that's in use => segfault
        self.initThreadDictRef(&result.dict)
        return 0

    cdef _ParserContext findImpliedContext(self):
        """Return any current implied xml parser context for the current
        thread.  This is used when the resolver functions are called
        with an xmlParserCtxt that was generated from within libxml2
        (i.e. without a _ParserContext) - which happens when parsing
        schema and xinclude external references."""
        cdef _ParserContext implied_context
        cdef _ParserDictionaryContext context = self._findThreadParserContext()

        # the most recently pushed context (if any) is the current one
        if not context._implied_parser_contexts:
            return None
        implied_context = context._implied_parser_contexts[-1]
        return implied_context

    cdef int pushImpliedContextFromParser(self, _BaseParser parser) except -1:
        "Push a new implied context object taken from the parser."
        cdef _ParserContext parser_context = None
        if parser is not None:
            parser_context = parser._getParserContext()
        self.pushImpliedContext(parser_context)
        return 0

    cdef int pushImpliedContext(self, _ParserContext parser_context) except -1:
        "Push a new implied context object."
        self._findThreadParserContext()._implied_parser_contexts.append(parser_context)
        return 0

    cdef int popImpliedContext(self) except -1:
        "Pop the current implied context object."
        self._findThreadParserContext()._implied_parser_contexts.pop()
        return 0

cdef _ParserDictionaryContext __GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext()
__GLOBAL_PARSER_CONTEXT.initMainParserContext()
|
187
|
+
|
188
|
+
############################################################
|
189
|
+
## support for Python unicode I/O
|
190
|
+
############################################################
|
191
|
+
|
192
|
+
# name of Python Py_UNICODE encoding as known to libxml2
|
193
|
+
cdef const_char* _PY_UNICODE_ENCODING = NULL
|
194
|
+
|
195
|
+
cdef int _setupPythonUnicode() except -1:
    """Sets _PY_UNICODE_ENCODING to the internal encoding name of Python unicode
    strings if libxml2 supports reading native Python unicode.  This depends
    on iconv and the local Python installation, so we simply check if we find
    a matching encoding handler.

    Returns 0 in all cases; _PY_UNICODE_ENCODING is left untouched when no
    encoding name or no matching encoding handler could be found.
    """
    global _PY_UNICODE_ENCODING
    cdef tree.xmlCharEncodingHandler* enchandler
    cdef const_char* enc
    cdef Py_UNICODE *uchars = [c'<', c't', c'e', c's', c't', c'/', c'>']
    cdef const_xmlChar* buffer = <const_xmlChar*>uchars
    # apparently, libxml2 can't detect UTF-16 on some systems,
    # so look at the byte layout of the sample characters first
    if (buffer[0] == c'<' and buffer[1] == c'\0' and
            buffer[2] == c't' and buffer[3] == c'\0'):
        enc = "UTF-16LE"
    elif (buffer[0] == c'\0' and buffer[1] == c'<' and
            buffer[2] == c'\0' and buffer[3] == c't'):
        enc = "UTF-16BE"
    else:
        # let libxml2 give it a try
        enc = _findEncodingName(buffer, sizeof(Py_UNICODE) * 7)
    if enc is NULL:
        # not my fault, it's YOUR broken system :)
        return 0
    enchandler = tree.xmlFindCharEncodingHandler(enc)
    if enchandler is not NULL:
        # the handler was only looked up to prove that it exists
        tree.xmlCharEncCloseFunc(enchandler)
        _PY_UNICODE_ENCODING = enc
    return 0
|
225
|
+
|
226
|
+
cdef const_char* _findEncodingName(const_xmlChar* buffer, int size):
    "Work around bug in libxml2: find iconv name of encoding on our own."
    cdef tree.xmlCharEncoding enc = tree.xmlDetectCharEncoding(buffer, size)

    if enc == tree.XML_CHAR_ENCODING_NONE:
        return NULL

    if enc == tree.XML_CHAR_ENCODING_UTF16LE:
        # A buffer starting with FF FE 00 00 is reported as UTF-32LE instead.
        if size >= 4 and (buffer[0] == <const_xmlChar> b'\xFF' and
                          buffer[1] == <const_xmlChar> b'\xFE' and
                          buffer[2] == 0 and buffer[3] == 0):
            return "UTF-32LE"  # according to BOM
        return "UTF-16LE"

    if enc == tree.XML_CHAR_ENCODING_UTF16BE:
        return "UTF-16BE"
    if enc == tree.XML_CHAR_ENCODING_UCS4LE:
        return "UCS-4LE"
    if enc == tree.XML_CHAR_ENCODING_UCS4BE:
        return "UCS-4BE"

    # everything else: ask libxml2 for the name
    # returns a constant char*, no need to free it
    return tree.xmlGetCharEncodingName(enc)
|
248
|
+
|
249
|
+
# Python 3.12 removed support for "Py_UNICODE".
|
250
|
+
if python.PY_VERSION_HEX < 0x030C0000:
|
251
|
+
_setupPythonUnicode()
|
252
|
+
|
253
|
+
|
254
|
+
cdef unicode _find_PyUCS4EncodingName():
    """
    Find a suitable encoding for Py_UCS4 PyUnicode strings in libxml2.

    Returns the encoding name as a string, or None if no name was found.
    """
    ustring = "<xml>\U0001F92A</xml>"
    cdef const xmlChar* buffer = <const xmlChar*> python.PyUnicode_DATA(ustring)
    cdef Py_ssize_t py_buffer_len = python.PyUnicode_GET_LENGTH(ustring)
    cdef tree.xmlCharEncoding enc = tree.xmlDetectCharEncoding(buffer, py_buffer_len)

    encoding_name = ''
    enchandler = tree.xmlGetCharEncodingHandler(enc)
    if enchandler is NULL:
        # no handler available => fall back to the plain encoding name
        c_name = tree.xmlGetCharEncodingName(enc)
        if c_name:
            encoding_name = c_name.decode('UTF-8')
    else:
        try:
            if enchandler.name:
                encoding_name = enchandler.name.decode('UTF-8')
        finally:
            tree.xmlCharEncCloseFunc(enchandler)

    # append the byte order of this build if the name does not carry one
    if encoding_name and not encoding_name.endswith(('LE', 'BE')):
        encoding_name += 'BE' if python.PY_BIG_ENDIAN else 'LE'
    return encoding_name or None
|
280
|
+
|
281
|
+
_pyucs4_encoding_name = _find_PyUCS4EncodingName()
|
282
|
+
|
283
|
+
|
284
|
+
############################################################
|
285
|
+
## support for file-like objects
|
286
|
+
############################################################
|
287
|
+
|
288
|
+
@cython.final
@cython.internal
cdef class _FileReaderContext:
    # Feeds the data read from a Python file-like object to libxml2 through
    # the read callback "_readFilelikeParser()" defined below the class.
    cdef object _filelike
    cdef object _encoding
    cdef object _url
    cdef object _bytes
    cdef _ExceptionContext _exc_context
    cdef Py_ssize_t _bytes_read
    cdef char* _c_url
    cdef bint _close_file_after_read

    def __cinit__(self, filelike, exc_context not None, url, encoding=None, bint close_file=False):
        self._filelike = filelike
        self._exc_context = exc_context
        self._encoding = encoding
        self._close_file_after_read = close_file
        if url is not None:
            url = _encodeFilename(url)
            self._c_url = _cstr(url)
        self._url = url
        # nothing buffered yet
        self._bytes = b''
        self._bytes_read = 0

    cdef _close_file(self):
        """Call the close() method of the file-like object if closing was
        requested on creation.  The reference to the file-like object is
        dropped before close() is called."""
        if not self._close_file_after_read or self._filelike is None:
            return
        try:
            close = self._filelike.close
        except AttributeError:
            close = None
        finally:
            self._filelike = None
        if close is not None:
            close()

    cdef xmlparser.xmlParserInputBuffer* _createParserInputBuffer(self) noexcept:
        "Allocate a libxml2 input buffer that reads from this object."
        cdef xmlparser.xmlParserInputBuffer* c_input_buffer = xmlparser.xmlAllocParserInputBuffer(0)
        if c_input_buffer:
            c_input_buffer.readcallback = _readFilelikeParser
            c_input_buffer.context = <python.PyObject*> self
        return c_input_buffer

    cdef xmlparser.xmlParserInput* _createParserInput(self, xmlparser.xmlParserCtxt* ctxt) noexcept:
        "Create a parser input for 'ctxt' on top of a new input buffer."
        cdef xmlparser.xmlParserInputBuffer* c_input_buffer = self._createParserInputBuffer()
        if not c_input_buffer:
            return NULL
        return xmlparser.xmlNewIOInputStream(ctxt, c_input_buffer, 0)

    cdef tree.xmlDtd* _readDtd(self) noexcept:
        "Parse a DTD from a new input buffer; NULL if no buffer was allocated."
        cdef tree.xmlDtd* c_dtd
        cdef xmlparser.xmlParserInputBuffer* c_input_buffer = self._createParserInputBuffer()
        if not c_input_buffer:
            return NULL
        with nogil:
            c_dtd = xmlparser.xmlIOParseDTD(NULL, c_input_buffer, 0)
        return c_dtd

    cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options) noexcept:
        """Parse a document with the HTML or XML reader, depending on
        'ctxt.html', and try to close the file afterwards."""
        cdef xmlDoc* result
        cdef void* c_callback_context = <python.PyObject*> self
        cdef char* c_encoding = NULL
        if self._encoding is not None:
            c_encoding = _cstr(self._encoding)

        orig_options = ctxt.options
        with nogil:
            if ctxt.html:
                result = htmlparser.htmlCtxtReadIO(
                    ctxt, _readFilelikeParser, NULL, c_callback_context,
                    self._c_url, c_encoding, options)
                if result is not NULL:
                    if _fixHtmlDictNames(ctxt.dict, result) < 0:
                        tree.xmlFreeDoc(result)
                        result = NULL
            else:
                result = xmlparser.xmlCtxtReadIO(
                    ctxt, _readFilelikeParser, NULL, c_callback_context,
                    self._c_url, c_encoding, options)
        ctxt.options = orig_options  # work around libxml2 problem

        try:
            self._close_file()
        except:
            # keep the exception in the exception context, do not raise here
            self._exc_context._store_raised()
        finally:
            return result  # swallow any exceptions

    cdef int copyToBuffer(self, char* c_buffer, int c_requested) noexcept:
        """Copy up to 'c_requested' bytes into 'c_buffer'.

        Returns the number of bytes copied, 0 once the end of the input was
        seen before, or -1 if an exception occurred.  Exceptions are handed
        to the exception context and never propagate out of this method.
        """
        cdef int c_byte_count = 0
        cdef char* c_start
        cdef Py_ssize_t remaining
        if self._bytes_read < 0:
            # a previous call already hit the end of the input
            return 0
        try:
            remaining = python.PyBytes_GET_SIZE(self._bytes) - self._bytes_read
            while c_requested > remaining:
                # hand out what is left of the current chunk ...
                c_start = _cstr(self._bytes) + self._bytes_read
                cstring_h.memcpy(c_buffer, c_start, remaining)
                c_byte_count += remaining
                c_buffer += remaining
                c_requested -= remaining

                # ... and read the next chunk from the file
                self._bytes = self._filelike.read(c_requested)
                if isinstance(self._bytes, unicode):
                    if self._encoding is None:
                        self._bytes = (<unicode>self._bytes).encode('utf8')
                    else:
                        self._bytes = python.PyUnicode_AsEncodedString(
                            self._bytes, _cstr(self._encoding), NULL)
                elif not isinstance(self._bytes, bytes):
                    self._close_file()
                    raise TypeError(
                        "reading from file-like objects must return byte strings or unicode strings")

                remaining = python.PyBytes_GET_SIZE(self._bytes)
                if remaining == 0:
                    # empty read => remember the end of input and stop
                    self._bytes_read = -1
                    self._close_file()
                    return c_byte_count
                self._bytes_read = 0

            if c_requested > 0:
                c_start = _cstr(self._bytes) + self._bytes_read
                cstring_h.memcpy(c_buffer, c_start, c_requested)
                c_byte_count += c_requested
                self._bytes_read += c_requested
        except:
            c_byte_count = -1
            self._exc_context._store_raised()
            try:
                self._close_file()
            except:
                self._exc_context._store_raised()
        finally:
            return c_byte_count  # swallow any exceptions
|
423
|
+
|
424
|
+
cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size) noexcept with gil:
    # Read callback handed to libxml2: 'ctxt' is the _FileReaderContext that
    # was registered as callback context.
    cdef _FileReaderContext reader = <_FileReaderContext>ctxt
    return reader.copyToBuffer(c_buffer, c_size)
|
426
|
+
|
427
|
+
|
428
|
+
############################################################
|
429
|
+
## support for custom document loaders
|
430
|
+
############################################################
|
431
|
+
|
432
|
+
cdef xmlparser.xmlParserInput* _local_resolver(const_char* c_url, const_char* c_pubid,
                                               xmlparser.xmlParserCtxt* c_context) noexcept with gil:
    # Entity loader that asks the resolvers of the responsible context for the
    # document and falls back to __DEFAULT_ENTITY_LOADER (if one is set) when
    # there is no context or no input could be created from the result.
    cdef _ResolverContext context
    cdef xmlparser.xmlParserInput* c_input = NULL
    cdef _InputDocument doc_ref
    cdef _FileReaderContext file_context
    # if there is no _ParserContext associated with the xmlParserCtxt
    # passed, check to see if the thread state object has an implied
    # context.
    if c_context._private is NULL:
        context = __GLOBAL_PARSER_CONTEXT.findImpliedContext()
    else:
        context = <_ResolverContext>c_context._private

    if context is None:
        if __DEFAULT_ENTITY_LOADER is NULL:
            return NULL
        with nogil:
            # free the GIL as we might do serious I/O here (e.g. HTTP)
            c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
        return c_input

    try:
        # parsing a related document (DTD etc.) => UTF-8 encoded URL?
        url = None if c_url is NULL else _decodeFilename(<const_xmlChar*>c_url)
        # the public ID is always UTF-8
        pubid = None if c_pubid is NULL else funicode(<const_xmlChar*>c_pubid)

        doc_ref = context._resolvers.resolve(url, pubid, context)
    except:
        context._store_raised()
        return NULL

    if doc_ref is not None:
        data = None
        c_input = NULL
        if doc_ref._type == PARSER_DATA_STRING:
            data = doc_ref._data_bytes
            filename = doc_ref._filename
            if not filename:
                filename = None
            elif not isinstance(filename, bytes):
                # most likely a text URL
                filename = filename.encode('utf8')
                if not isinstance(filename, bytes):
                    filename = None

            c_input = xmlparser.xmlNewInputStream(c_context)
            if c_input is not NULL:
                if filename is not None:
                    c_input.filename = <char *>tree.xmlStrdup(_xcstr(filename))
                # let the input stream point directly into the bytes object
                c_input.base = _xcstr(data)
                c_input.length = python.PyBytes_GET_SIZE(data)
                c_input.cur = c_input.base
                c_input.end = c_input.base + c_input.length
        elif doc_ref._type == PARSER_DATA_FILENAME:
            c_filename = _cstr(doc_ref._filename)
            with nogil:
                # free the GIL as we might do serious I/O here
                c_input = xmlparser.xmlNewInputFromFile(
                    c_context, c_filename)
        elif doc_ref._type == PARSER_DATA_FILE:
            file_context = _FileReaderContext(doc_ref._file, context, url,
                                              None, doc_ref._close_file)
            c_input = file_context._createParserInput(c_context)
            data = file_context

        if data is not None:
            # store the object that backs the input in the context
            context._storage.add(data)
        if c_input is not NULL:
            return c_input

    if __DEFAULT_ENTITY_LOADER is NULL:
        return NULL

    with nogil:
        # free the GIL as we might do serious I/O here (e.g. HTTP)
        c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
    return c_input
|
518
|
+
|
519
|
+
cdef xmlparser.xmlExternalEntityLoader __DEFAULT_ENTITY_LOADER
|
520
|
+
__DEFAULT_ENTITY_LOADER = xmlparser.xmlGetExternalEntityLoader()
|
521
|
+
|
522
|
+
|
523
|
+
cdef xmlparser.xmlExternalEntityLoader _register_document_loader() noexcept nogil:
    # Install "_local_resolver" as external entity loader and return the
    # loader that was set before.
    cdef xmlparser.xmlExternalEntityLoader previous_loader
    previous_loader = xmlparser.xmlGetExternalEntityLoader()
    xmlparser.xmlSetExternalEntityLoader(<xmlparser.xmlExternalEntityLoader>_local_resolver)
    return previous_loader
|
527
|
+
|
528
|
+
cdef void _reset_document_loader(xmlparser.xmlExternalEntityLoader old) noexcept nogil:
    # Set 'old' as the external entity loader again.
    xmlparser.xmlSetExternalEntityLoader(old)
|
530
|
+
|
531
|
+
|
532
|
+
############################################################
|
533
|
+
## Parsers
|
534
|
+
############################################################
|
535
|
+
|
536
|
+
@cython.no_gc_clear  # May have to call "self._validator.disconnect()" on dealloc.
@cython.internal
cdef class _ParserContext(_ResolverContext):
    """
    lxml-level state that accompanies one libxml2 parser context.

    The object owns the libxml2 parser context (it is freed in
    '__dealloc__'), the error log of the current parser run, an optional
    schema validator and, if threading support is enabled, a lock that is
    held between 'prepare()' and 'cleanup()'.
    """
    cdef _ErrorLog _error_log
    cdef _ParserSchemaValidationContext _validator
    cdef xmlparser.xmlParserCtxt* _c_ctxt
    cdef xmlparser.xmlExternalEntityLoader _orig_loader
    cdef python.PyThread_type_lock _lock
    cdef _Document _doc
    cdef bint _collect_ids

    def __cinit__(self):
        self._collect_ids = True
        if config.ENABLE_THREADING:
            self._lock = python.PyThread_allocate_lock()
        self._error_log = _ErrorLog()

    def __dealloc__(self):
        if config.ENABLE_THREADING and self._lock is not NULL:
            python.PyThread_free_lock(self._lock)
            self._lock = NULL
        if self._c_ctxt is NULL:
            return
        if <void*>self._validator is not NULL and self._validator is not None:
            # If the parser was not closed correctly (e.g. interrupted iterparse()),
            # and the schema validator wasn't freed and cleaned up yet, the libxml2 SAX
            # validator plug might still be in place, which will make xmlFreeParserCtxt()
            # crash when trying to xmlFree() a static SAX handler.
            # Thus, make sure we disconnect the handler interceptor here at the latest.
            self._validator.disconnect()
        xmlparser.xmlFreeParserCtxt(self._c_ctxt)

    cdef _ParserContext _copy(self):
        """
        Create a new context of the same class with copies of the resolvers
        and (if there is one) the validator.  The copy has no libxml2 parser
        context attached.
        """
        cdef _ParserContext context
        context = self.__class__()
        context._collect_ids = self._collect_ids
        # A context without schema validation has no validator to copy.
        # The rest of this class treats '_validator' as optional as well.
        if self._validator is not None:
            context._validator = self._validator.copy()
        _initParserContext(context, self._resolvers._copy(), NULL)
        return context

    cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt) noexcept:
        """
        Connects the libxml2-level context to the lxml-level parser context.
        """
        self._c_ctxt = c_ctxt
        # back-reference that the C-level callbacks use to find this object
        c_ctxt._private = <void*>self

    cdef void _resetParserContext(self) noexcept:
        """
        Reset the libxml2 parser context (if any) so that it can be reused.
        """
        if self._c_ctxt is NULL:
            return
        if self._c_ctxt.html:
            htmlparser.htmlCtxtReset(self._c_ctxt)
            self._c_ctxt.disableSAX = 0  # work around bug in libxml2
        else:
            xmlparser.xmlClearParserCtxt(self._c_ctxt)
            # work around bug in libxml2 [2.9.10 .. 2.9.14]:
            # https://gitlab.gnome.org/GNOME/libxml2/-/issues/378
            self._c_ctxt.nsNr = 0

    cdef int prepare(self, bint set_document_loader=True) except -1:
        """
        Set the context up for a parser run.

        Acquires the context lock (if threading is enabled), clears the error
        log, hooks up error reporting, optionally installs the lxml document
        loader and connects the validator.  Raises ParserError if the lock
        could not be acquired.
        """
        cdef int lock_acquired
        if config.ENABLE_THREADING and self._lock is not NULL:
            with nogil:
                lock_acquired = python.PyThread_acquire_lock(
                    self._lock, python.WAIT_LOCK)
            if lock_acquired == 0:
                raise ParserError("parser locking failed")
        self._error_log.clear()
        self._doc = None
        # Connect the lxml error log with libxml2's error handling.  In the case of parsing
        # HTML, ctxt->sax is not set to null, so this always works.  The libxml2 function
        # that does this is htmlInitParserCtxt in HTMLparser.c.  For HTML (and possibly XML
        # too), libxml2's SAX's serror is set to be the place where errors are sent when
        # schannel is set to ctxt->sax->serror in xmlCtxtErrMemory in libxml2's
        # parserInternals.c.
        # Need a cast here because older libxml2 releases do not use 'const' in the functype.
        self._c_ctxt.sax.serror = <xmlerror.xmlStructuredErrorFunc> _receiveParserError
        if set_document_loader:
            self._orig_loader = _register_document_loader()
        else:
            self._orig_loader = NULL
        if self._validator is not None:
            self._validator.connect(self._c_ctxt, self._error_log)
        return 0

    cdef int cleanup(self) except -1:
        """
        Undo what 'prepare()' set up and release the context lock.
        """
        if self._orig_loader is not NULL:
            _reset_document_loader(self._orig_loader)
        try:
            if self._validator is not None:
                self._validator.disconnect()
            self._resetParserContext()
            self.clear()
            self._doc = None
            self._c_ctxt.sax.serror = NULL
        finally:
            # release the lock even if the cleanup steps above failed
            if config.ENABLE_THREADING and self._lock is not NULL:
                python.PyThread_release_lock(self._lock)
        return 0

    cdef object _handleParseResult(self, _BaseParser parser,
                                   xmlDoc* result, filename):
        """
        Check the parse result and return it as a document object.

        Returns 'self._doc' if that object already wraps the resulting
        C document, otherwise a new document built by '_documentFactory()'.
        """
        cdef xmlDoc* c_doc = self._handleParseResultDoc(parser, result, filename)
        if self._doc is None or self._doc._c_doc is not c_doc:
            return _documentFactory(c_doc, parser)
        return self._doc

    cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser,
                                       xmlDoc* result, filename) except NULL:
        """
        Check the parse result at the C level, see the module level function
        '_handleParseResult()'.  The result document is only freed on errors
        if this context does not hold a document object ('self._doc').
        """
        cdef bint recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
        return _handleParseResult(self, self._c_ctxt, result,
                                  filename, recover,
                                  free_doc=self._doc is None)
|
645
|
+
|
646
|
+
cdef _initParserContext(_ParserContext context,
                        _ResolverRegistry resolvers,
                        xmlparser.xmlParserCtxt* c_ctxt):
    """
    Initialise the resolver part of 'context' and, if a libxml2 parser
    context is given, attach that to 'context' as well.
    """
    _initResolverContext(context, resolvers)
    if c_ctxt is NULL:
        # no libxml2 context to connect (yet)
        return
    context._initParserContext(c_ctxt)
|
652
|
+
|
653
|
+
cdef void _forwardParserError(xmlparser.xmlParserCtxt* _parser_context, const xmlerror.xmlError* error) noexcept with gil:
    """
    Hand an error that libxml2 reported over to the error log of the
    lxml-level parser context that is stored in the '_private' field.
    """
    cdef _ParserContext context = <_ParserContext>_parser_context._private
    context._error_log._receive(error)
|
658
|
+
|
659
|
+
cdef void _receiveParserError(void* c_context, const xmlerror.xmlError* error) noexcept nogil:
    """
    Structured error callback that is registered with the libxml2 parser.

    Does nothing unless '__DEBUG' is set.  Errors go to the parser context's
    error log if 'c_context' refers to a parser context that is connected to
    an lxml-level context, and to '_forwardError(NULL, ...)' otherwise.
    """
    cdef xmlparser.xmlParserCtxt* c_ctxt = <xmlparser.xmlParserCtxt*>c_context
    if not __DEBUG:
        return
    if c_ctxt is not NULL and c_ctxt._private is not NULL:
        _forwardParserError(c_ctxt, error)
    else:
        _forwardError(NULL, error)
|
665
|
+
|
666
|
+
cdef _decodeLastErrorMessage(xmlparser.xmlParserCtxt* ctxt):
    """
    Return the message of the last error of 'ctxt' as a text string.

    The caller must make sure that the message pointer is not NULL.
    """
    try:
        return ctxt.lastError.message.decode('utf-8')
    except UnicodeDecodeError:
        # the filename may be in there => play it safe
        return ctxt.lastError.message.decode('iso8859-1')


cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename,
                          _ErrorLog error_log) except -1:
    """
    Raise the exception that describes a failed parser run.  Always raises.

    - IOError if a filename is known and the last libxml2 error comes from
      the I/O layer.
    - Otherwise an XMLSyntaxError, built from 'error_log' if that is
      non-empty, else from the last error stored in 'ctxt', else a generic
      internal error.
    """
    if filename is not None and \
            ctxt.lastError.domain == xmlerror.XML_FROM_IO:
        if isinstance(filename, bytes):
            filename = _decodeFilenameWithLength(
                <bytes>filename, len(<bytes>filename))
        if ctxt.lastError.message is not NULL:
            message = _decodeLastErrorMessage(ctxt)
            message = f"Error reading file '{filename}': {message.strip()}"
        else:
            message = f"Error reading '{filename}'"
        raise IOError(message)
    elif error_log:
        raise error_log._buildParseException(
            XMLSyntaxError, "Document is not well formed")
    elif ctxt.lastError.message is not NULL:
        # Decode the C string first: formatting the raw bytes into the
        # message below would otherwise render them as "b'...'".
        message = _decodeLastErrorMessage(ctxt).strip()
        code = ctxt.lastError.code
        line = ctxt.lastError.line
        column = ctxt.lastError.int2
        if ctxt.lastError.line > 0:
            message = f"line {line}: {message}"
        raise XMLSyntaxError(message, code, line, column, filename)
    else:
        raise XMLSyntaxError(None, xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
                             filename)
|
697
|
+
|
698
|
+
cdef xmlDoc* _handleParseResult(_ParserContext context,
                                xmlparser.xmlParserCtxt* c_ctxt,
                                xmlDoc* result, filename,
                                bint recover, bint free_doc) except NULL:
    """
    Decide whether the outcome of a parser run can be handed to the user.

    'result' is NULL if the parser was not able to parse the document.
    Returns the (non-NULL) result document on success.  Otherwise raises an
    exception stored in 'context' or the error built by '_raiseParseError()'.
    A rejected result document is only freed if 'free_doc' is true.

    'context' may be None; in that case no validator and no error log are
    consulted.
    """
    cdef bint well_formed
    cdef _ParserSchemaValidationContext validator = None
    # Read the validator once, guarded in the same way as the other uses of
    # 'context' in this function.
    if context is not None:
        validator = context._validator

    if result is not NULL:
        __GLOBAL_PARSER_CONTEXT.initDocDict(result)

    if c_ctxt.myDoc is not NULL:
        if c_ctxt.myDoc is not result:
            # a document that is not the result is left in the parser => discard it
            __GLOBAL_PARSER_CONTEXT.initDocDict(c_ctxt.myDoc)
            tree.xmlFreeDoc(c_ctxt.myDoc)
        c_ctxt.myDoc = NULL

    if result is not NULL:
        # "wellFormed" in libxml2 is 0 if the parser found fatal errors.  It still returns a
        # parse result document if 'recover=True'.  Here, we determine if we can present
        # the document to the user or consider it incorrect or broken enough to raise an error.
        if validator is not None and not validator.isvalid():
            well_formed = 0  # actually not 'valid', but anyway ...
        elif (context is not None and
                not c_ctxt.wellFormed and not c_ctxt.html and
                c_ctxt.charset == tree.XML_CHAR_ENCODING_8859_1 and
                [1 for error in context._error_log
                 if error.type == ErrorTypes.ERR_INVALID_CHAR]):
            # An encoding error occurred and libxml2 switched from UTF-8
            # input to (undecoded) Latin-1, at some arbitrary point in the
            # document.  Better raise an error than allowing for a broken
            # tree with mixed encodings.  This is fixed in libxml2 2.12.
            well_formed = 0
        elif recover or (c_ctxt.wellFormed and
                         c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
            well_formed = 1
        elif not c_ctxt.replaceEntities and not c_ctxt.validate \
                and context is not None:
            # in this mode, we ignore errors about undefined entities
            for error in context._error_log.filter_from_errors():
                if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
                        error.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
                    well_formed = 0
                    break
            else:
                well_formed = 1
        else:
            well_formed = 0

        if not well_formed:
            if free_doc:
                tree.xmlFreeDoc(result)
            result = NULL

    if context is not None and context._has_raised():
        # an exception that was stored during parsing takes precedence
        if result is not NULL:
            if free_doc:
                tree.xmlFreeDoc(result)
            result = NULL
        context._raise_if_stored()

    if result is NULL:
        if context is not None:
            _raiseParseError(c_ctxt, filename, context._error_log)
        else:
            _raiseParseError(c_ctxt, filename, None)
    else:
        if result.URL is NULL and filename is not None:
            result.URL = tree.xmlStrdup(_xcstr(filename))
        if result.encoding is NULL:
            result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")

    if validator is not None and validator._add_default_attributes:
        # we currently need to do this here as libxml2 does not
        # support inserting default attributes during parse-time
        # validation
        validator.inject_default_attributes(result)

    return result
|
777
|
+
|
778
|
+
cdef int _fixHtmlDictNames(tree.xmlDict* c_dict, xmlDoc* c_doc) noexcept nogil:
    """
    Move the names of all elements (and their attributes) in 'c_doc' to the
    dict, iterating in document order.

    Returns 0 on success (also if 'c_doc' is NULL) and -1 if fixing the names
    of a node failed.
    """
    cdef xmlNode* c_node
    if not c_doc:
        return 0
    c_node = c_doc.children
    # NOTE: the statements between the two macro calls form the loop body
    tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
            return -1
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
    return 0
|
789
|
+
|
790
|
+
cdef int _fixHtmlDictSubtreeNames(tree.xmlDict* c_dict, xmlDoc* c_doc,
                                  xmlNode* c_start_node) noexcept nogil:
    """
    Move names to the dict, iterating in document order, starting at
    c_start_node.  This is used in incremental parsing after each chunk.

    Without a start node, the whole document is processed.  Returns 0 on
    success (also if 'c_doc' is NULL) and -1 on failure.
    """
    cdef xmlNode* c_node
    if c_doc is NULL:
        return 0
    if c_start_node is NULL:
        return _fixHtmlDictNames(c_dict, c_doc)
    c_node = c_start_node
    # NOTE: the statements between the two macro calls form the loop body
    tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
            return -1
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
    return 0
|
808
|
+
|
809
|
+
cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
                                      xmlNode* c_node) noexcept nogil:
    """
    Replace the name of 'c_node' and the names of its attributes by the
    corresponding entries of 'c_dict'.

    A name that is not already the dict entry is freed and replaced.
    Returns -1 as soon as a dict lookup fails, 0 otherwise.
    """
    cdef xmlNode* c_prop
    # the element name itself
    c_dict_name = tree.xmlDictLookup(c_dict, c_node.name, -1)
    if c_dict_name is NULL:
        return -1
    if c_dict_name is not c_node.name:
        tree.xmlFree(<char*>c_node.name)
        c_node.name = c_dict_name
    # the names of all attributes
    c_prop = <xmlNode*>c_node.properties
    while c_prop is not NULL:
        c_dict_name = tree.xmlDictLookup(c_dict, c_prop.name, -1)
        if c_dict_name is NULL:
            return -1
        if c_dict_name is not c_prop.name:
            tree.xmlFree(<char*>c_prop.name)
            c_prop.name = c_dict_name
        c_prop = c_prop.next
    return 0
|
828
|
+
|
829
|
+
|
830
|
+
@cython.internal
cdef class _BaseParser:
    """
    Common base class of the XML and HTML parsers.

    Stores the parser configuration, lazily creates the (normal and push)
    parser contexts and implements the C-level parse functions for strings,
    files and file-like objects.
    """
    cdef ElementClassLookup _class_lookup
    cdef _ResolverRegistry _resolvers
    cdef _ParserContext _parser_context
    cdef _ParserContext _push_parser_context
    cdef int _parse_options
    cdef bint _for_html
    cdef bint _remove_comments
    cdef bint _remove_pis
    cdef bint _strip_cdata
    cdef bint _collect_ids
    cdef bint _resolve_external_entities
    cdef XMLSchema _schema
    cdef bytes _filename
    cdef readonly object target
    cdef object _default_encoding
    cdef tuple _events_to_collect  # (event_types, tag)

    def __init__(self, int parse_options, bint for_html, XMLSchema schema,
                 remove_comments, remove_pis, strip_cdata, collect_ids,
                 target, encoding, bint resolve_external_entities=True):
        """
        Store the parser configuration.

        Raises TypeError unless 'self' is an XMLParser or HTMLParser, and
        LookupError if libxml2 does not know the requested encoding.
        """
        cdef tree.xmlCharEncodingHandler* enchandler
        if not isinstance(self, (XMLParser, HTMLParser)):
            raise TypeError("This class cannot be instantiated")

        self._parse_options = parse_options
        self.target = target
        self._for_html = for_html
        self._remove_comments = remove_comments
        self._remove_pis = remove_pis
        self._strip_cdata = strip_cdata
        self._collect_ids = collect_ids
        self._resolve_external_entities = resolve_external_entities
        self._schema = schema

        self._resolvers = _ResolverRegistry()

        if encoding is None:
            self._default_encoding = None
        else:
            encoding_utf = _utf8(encoding)
            # only look the handler up to validate the encoding name
            enchandler = tree.xmlFindCharEncodingHandler(_cstr(encoding_utf))
            if enchandler is NULL:
                # Show the name as text.  Formatting the UTF-8 encoded bytes
                # directly would put a "b'...'" repr into the message.
                encoding_name = (<bytes>encoding_utf).decode('utf8')
                raise LookupError(f"unknown encoding: '{encoding_name}'")
            tree.xmlCharEncCloseFunc(enchandler)
            self._default_encoding = encoding_utf

    cdef _setBaseURL(self, base_url):
        """Remember the (encoded) base URL as the filename of this parser."""
        self._filename = _encodeFilename(base_url)

    cdef _collectEvents(self, event_types, tag):
        """
        Store which parse events (restricted to 'tag') the push parser should
        collect.  'None' for the event types means no events.
        """
        if event_types is not None:
            event_types = tuple(set(event_types))
            _buildParseEventFilter(event_types)  # purely for validation
        else:
            event_types = ()
        self._events_to_collect = (event_types, tag)

    cdef _ParserContext _getParserContext(self):
        """
        Return the parser context of this parser, creating and configuring
        it on first use.
        """
        cdef xmlparser.xmlParserCtxt* pctxt
        if self._parser_context is not None:
            return self._parser_context

        self._parser_context = self._createContext(self.target, None)
        self._parser_context._collect_ids = self._collect_ids
        if self._schema is not None:
            self._parser_context._validator = \
                self._schema._newSaxValidator(
                    self._parse_options & xmlparser.XML_PARSE_DTDATTR)
        pctxt = self._newParserCtxt()
        _initParserContext(self._parser_context, self._resolvers, pctxt)
        self._configureSaxContext(pctxt)
        return self._parser_context

    cdef _ParserContext _getPushParserContext(self):
        """
        Return the push parser context of this parser, creating and
        configuring it on first use.  Unlike the normal parser context, it
        is set up with the events to collect.
        """
        cdef xmlparser.xmlParserCtxt* pctxt
        if self._push_parser_context is not None:
            return self._push_parser_context

        self._push_parser_context = self._createContext(
            self.target, self._events_to_collect)
        self._push_parser_context._collect_ids = self._collect_ids
        if self._schema is not None:
            self._push_parser_context._validator = \
                self._schema._newSaxValidator(
                    self._parse_options & xmlparser.XML_PARSE_DTDATTR)
        pctxt = self._newPushParserCtxt()
        _initParserContext(
            self._push_parser_context, self._resolvers, pctxt)
        self._configureSaxContext(pctxt)
        return self._push_parser_context

    cdef _ParserContext _createContext(self, target, events_to_collect):
        """
        This method creates and configures the lxml-level parser.

        A target parser context is used if there is a parser target, a SAX
        parser context if (only) events are collected, and a plain parser
        context otherwise.
        """
        cdef _SaxParserContext sax_context
        if target is None and not events_to_collect:
            # nothing special to configure
            return _ParserContext()

        if target is not None:
            sax_context = _TargetParserContext(self)
            (<_TargetParserContext>sax_context)._setTarget(target)
        else:
            sax_context = _SaxParserContext(self)
        if events_to_collect:
            events, tag = events_to_collect
            sax_context._setEventFilter(events, tag)
        return sax_context

    @cython.final
    cdef int _configureSaxContext(self, xmlparser.xmlParserCtxt* pctxt) except -1:
        """
        Switch off the SAX callbacks for everything that this parser is
        configured to drop, and intercept the entity lookup if external
        entities must not be resolved.
        """
        if self._remove_comments:
            pctxt.sax.comment = NULL
        if self._remove_pis:
            pctxt.sax.processingInstruction = NULL
        if self._strip_cdata:
            # hard switch-off for CDATA nodes => makes them plain text
            pctxt.sax.cdataBlock = NULL
        if not self._resolve_external_entities:
            pctxt.sax.getEntity = _getInternalEntityOnly
        return 0

    cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1:
        """
        Upgrade an initialised non-SAX2 handler of 'c_ctxt' to SAX2 and
        register the lxml error callback with it.  Raises MemoryError if the
        handler copy cannot be allocated.
        """
        cdef xmlparser.xmlSAXHandler* sax = c_ctxt.sax
        if sax is NULL or not sax.initialized or \
                sax.initialized == xmlparser.XML_SAX2_MAGIC:
            return 0

        # need to extend SAX1 context to SAX2 to get proper error reports
        if <xmlparser.xmlSAXHandlerV1*>sax is &htmlparser.htmlDefaultSAXHandler:
            # never modify the shared default handler => work on a copy
            sax = <xmlparser.xmlSAXHandler*> tree.xmlMalloc(sizeof(xmlparser.xmlSAXHandler))
            if sax is NULL:
                raise MemoryError()
            cstring_h.memcpy(sax, &htmlparser.htmlDefaultSAXHandler,
                             sizeof(htmlparser.htmlDefaultSAXHandler))
            c_ctxt.sax = sax
        sax.initialized = xmlparser.XML_SAX2_MAGIC
        # Need a cast here because older libxml2 releases do not use 'const' in the functype.
        sax.serror = <xmlerror.xmlStructuredErrorFunc> _receiveParserError
        sax.startElementNs = NULL
        sax.endElementNs = NULL
        sax._private = NULL
        return 0

    cdef xmlparser.xmlParserCtxt* _newParserCtxt(self) except NULL:
        """
        Create and initialise a libxml2-level parser context.
        Raises MemoryError if libxml2 cannot create one.
        """
        cdef xmlparser.xmlParserCtxt* c_ctxt
        if not self._for_html:
            c_ctxt = xmlparser.xmlNewParserCtxt()
        else:
            c_ctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
            if c_ctxt is not NULL:
                self._registerHtmlErrorHandler(c_ctxt)
        if c_ctxt is NULL:
            raise MemoryError
        c_ctxt.sax.startDocument = _initSaxDocument
        return c_ctxt

    cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self) except NULL:
        """
        Create and initialise a libxml2-level push parser context that uses
        the parse options and the filename (if any) of this parser.
        Raises MemoryError if libxml2 cannot create one.
        """
        cdef xmlparser.xmlParserCtxt* c_ctxt
        cdef char* c_filename = NULL
        if self._filename is not None:
            c_filename = _cstr(self._filename)

        if self._for_html:
            c_ctxt = htmlparser.htmlCreatePushParserCtxt(
                NULL, NULL, NULL, 0, c_filename, tree.XML_CHAR_ENCODING_NONE)
            if c_ctxt is not NULL:
                self._registerHtmlErrorHandler(c_ctxt)
                htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options)
        else:
            c_ctxt = xmlparser.xmlCreatePushParserCtxt(
                NULL, NULL, NULL, 0, c_filename)
            if c_ctxt is not NULL:
                xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options)
        if c_ctxt is NULL:
            raise MemoryError()
        c_ctxt.sax.startDocument = _initSaxDocument
        return c_ctxt

    @property
    def error_log(self):
        """The error log of the last parser run.
        """
        cdef _ParserContext context = self._getParserContext()
        # hand out a copy of the context's log
        return context._error_log.copy()

    @property
    def resolvers(self):
        """The custom resolver registry of this parser."""
        return self._resolvers

    @property
    def version(self):
        """The version of the underlying XML parser."""
        return "libxml2 %d.%d.%d" % LIBXML_VERSION

    def set_element_class_lookup(self, ElementClassLookup lookup = None):
        """set_element_class_lookup(self, lookup = None)

        Set a lookup scheme for element classes generated from this parser.

        Reset it by passing None or nothing.
        """
        self._class_lookup = lookup

    cdef _BaseParser _copy(self):
        "Create a new parser with the same configuration."
        cdef _BaseParser parser
        parser = self.__class__()
        parser._parse_options = self._parse_options
        parser._for_html = self._for_html
        parser._remove_comments = self._remove_comments
        parser._remove_pis = self._remove_pis
        parser._strip_cdata = self._strip_cdata
        # These two are used when a parser context gets created and
        # configured, so they must carry over for the copy to behave like
        # the original.
        parser._collect_ids = self._collect_ids
        parser._resolve_external_entities = self._resolve_external_entities
        parser._filename = self._filename
        parser._resolvers = self._resolvers
        parser.target = self.target
        parser._class_lookup = self._class_lookup
        parser._default_encoding = self._default_encoding
        parser._schema = self._schema
        parser._events_to_collect = self._events_to_collect
        return parser

    def copy(self):
        """copy(self)

        Create a new parser with the same configuration.
        """
        return self._copy()

    def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
        """makeelement(self, _tag, attrib=None, nsmap=None, **_extra)

        Creates a new element associated with this parser.
        """
        return _makeElement(_tag, NULL, None, self, None, None,
                            attrib, nsmap, _extra)

    # internal parser methods

    cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL:
        """Parse unicode document, share dictionary if possible.

        The internal buffer of the string is passed to libxml2 as is.  The
        encoding that libxml2 is told to use is chosen to match the internal
        representation of the string.
        """
        cdef _ParserContext context
        cdef xmlDoc* result
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef Py_ssize_t py_buffer_len
        cdef int buffer_len, c_kind
        cdef const_char* c_text
        cdef const_char* c_encoding = _PY_UNICODE_ENCODING
        if python.PyUnicode_IS_READY(utext):
            # PEP-393 string
            c_text = <const_char*>python.PyUnicode_DATA(utext)
            py_buffer_len = python.PyUnicode_GET_LENGTH(utext)
            c_kind = python.PyUnicode_KIND(utext)
            if c_kind == 1:
                # one byte per character: pure ASCII or Latin-1
                if python.PyUnicode_MAX_CHAR_VALUE(utext) <= 127:
                    c_encoding = 'UTF-8'
                else:
                    c_encoding = 'ISO-8859-1'
            elif c_kind == 2:
                py_buffer_len *= 2
                if python.PY_BIG_ENDIAN:
                    c_encoding = 'UTF-16BE'  # actually UCS-2
                else:
                    c_encoding = 'UTF-16LE'  # actually UCS-2
            elif c_kind == 4:
                py_buffer_len *= 4
                if python.PY_BIG_ENDIAN:
                    c_encoding = 'UTF-32BE'  # actually UCS-4
                else:
                    c_encoding = 'UTF-32LE'  # actually UCS-4
            else:
                assert False, f"Illegal Unicode kind {c_kind}"
        else:
            # old Py_UNICODE string
            py_buffer_len = python.PyUnicode_GET_DATA_SIZE(utext)
            c_text = python.PyUnicode_AS_DATA(utext)
        # libxml2 takes the buffer length as a C int
        assert 0 <= py_buffer_len <= limits.INT_MAX
        buffer_len = py_buffer_len

        context = self._getParserContext()
        context.prepare()
        try:
            pctxt = context._c_ctxt
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
            orig_options = pctxt.options
            with nogil:
                if self._for_html:
                    result = htmlparser.htmlCtxtReadMemory(
                        pctxt, c_text, buffer_len, c_filename, c_encoding,
                        self._parse_options)
                    if result is not NULL:
                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
                            tree.xmlFreeDoc(result)
                            result = NULL
                else:
                    result = xmlparser.xmlCtxtReadMemory(
                        pctxt, c_text, buffer_len, c_filename, c_encoding,
                        self._parse_options)
            pctxt.options = orig_options  # work around libxml2 problem

            return context._handleParseResultDoc(self, result, None)
        finally:
            context.cleanup()

    cdef xmlDoc* _parseDoc(self, const char* c_text, int c_len, char* c_filename) except NULL:
        """Parse document, share dictionary if possible.

        Without a default encoding, UTF-32 input is detected here (by BOM or
        by letting libxml2 look at the first bytes) and the encoding is then
        passed to the parser explicitly.
        """
        cdef _ParserContext context
        cdef xmlDoc* result
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef char* c_encoding
        cdef tree.xmlCharEncoding enc
        context = self._getParserContext()
        context.prepare()
        try:
            pctxt = context._c_ctxt
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

            if self._default_encoding is not None:
                c_encoding = _cstr(self._default_encoding)
            else:
                c_encoding = NULL
                # libxml2 (at least 2.9.3) does not recognise UTF-32 BOMs
                # NOTE: limit to problematic cases because it changes character offsets
                if c_len >= 4 and (c_text[0] == b'\xFF' and c_text[1] == b'\xFE' and
                                   c_text[2] == 0 and c_text[3] == 0):
                    c_encoding = "UTF-32LE"
                    c_text += 4
                    c_len -= 4
                elif c_len >= 4 and (c_text[0] == 0 and c_text[1] == 0 and
                                     c_text[2] == b'\xFE' and c_text[3] == b'\xFF'):
                    c_encoding = "UTF-32BE"
                    c_text += 4
                    c_len -= 4
                else:
                    # no BOM => try to determine encoding
                    enc = tree.xmlDetectCharEncoding(<const_xmlChar*>c_text, c_len)
                    if enc == tree.XML_CHAR_ENCODING_UCS4LE:
                        c_encoding = 'UTF-32LE'
                    elif enc == tree.XML_CHAR_ENCODING_UCS4BE:
                        c_encoding = 'UTF-32BE'

            orig_options = pctxt.options
            with nogil:
                if self._for_html:
                    result = htmlparser.htmlCtxtReadMemory(
                        pctxt, c_text, c_len, c_filename,
                        c_encoding, self._parse_options)
                    if result is not NULL:
                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
                            tree.xmlFreeDoc(result)
                            result = NULL
                else:
                    result = xmlparser.xmlCtxtReadMemory(
                        pctxt, c_text, c_len, c_filename,
                        c_encoding, self._parse_options)
            pctxt.options = orig_options  # work around libxml2 problem

            return context._handleParseResultDoc(self, result, None)
        finally:
            context.cleanup()

    cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL:
        """
        Let libxml2 read and parse the file 'c_filename', using the default
        encoding of this parser if there is one.
        """
        cdef _ParserContext context
        cdef xmlDoc* result = NULL
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef char* c_encoding = NULL

        context = self._getParserContext()
        context.prepare()
        try:
            pctxt = context._c_ctxt
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

            if self._default_encoding is not None:
                c_encoding = _cstr(self._default_encoding)

            orig_options = pctxt.options
            with nogil:
                if self._for_html:
                    result = htmlparser.htmlCtxtReadFile(
                        pctxt, c_filename, c_encoding, self._parse_options)
                    if result is not NULL:
                        if _fixHtmlDictNames(pctxt.dict, result) < 0:
                            tree.xmlFreeDoc(result)
                            result = NULL
                else:
                    result = xmlparser.xmlCtxtReadFile(
                        pctxt, c_filename, c_encoding, self._parse_options)
            pctxt.options = orig_options  # work around libxml2 problem

            return context._handleParseResultDoc(self, result, c_filename)
        finally:
            context.cleanup()

    cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename,
                                       encoding) except NULL:
        """
        Parse the data read from the file-like object.  An empty 'filename'
        is treated as no filename; 'encoding' falls back to the default
        encoding of this parser.
        """
        cdef _ParserContext context
        cdef _FileReaderContext file_context
        cdef xmlDoc* result
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef char* c_filename
        if not filename:
            filename = None

        context = self._getParserContext()
        context.prepare()
        try:
            pctxt = context._c_ctxt
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
            file_context = _FileReaderContext(
                filelike, context, filename,
                encoding or self._default_encoding)
            result = file_context._readDoc(pctxt, self._parse_options)

            return context._handleParseResultDoc(
                self, result, filename)
        finally:
            context.cleanup()
|
1250
|
+
|
1251
|
+
|
1252
|
+
cdef tree.xmlEntity* _getInternalEntityOnly(void* ctxt, const_xmlChar* name) noexcept nogil:
    """
    Callback function to intercept the entity resolution when external entity loading is disabled.

    Returns the entity that libxml2 found unless it is an external one.  For
    external entities, an error is reported to the structured error handler
    (if there is one), the document is marked as not well-formed and NULL is
    returned.  NULL is also returned if the entity is unknown.
    """
    cdef tree.xmlEntity* entity = xmlparser.xmlSAX2GetEntity(ctxt, name)
    if not entity:
        return NULL
    if entity.etype not in (
            tree.xmlEntityType.XML_EXTERNAL_GENERAL_PARSED_ENTITY,
            tree.xmlEntityType.XML_EXTERNAL_GENERAL_UNPARSED_ENTITY,
            tree.xmlEntityType.XML_EXTERNAL_PARAMETER_ENTITY):
        return entity

    # Reject all external entities and fail the parsing instead. There is currently
    # no way in libxml2 to just prevent the entity resolution in this case.
    cdef xmlerror.xmlError c_error
    cdef xmlerror.xmlStructuredErrorFunc err_func
    cdef xmlparser.xmlParserInput* parser_input
    cdef void* err_context

    c_ctxt = <xmlparser.xmlParserCtxt *> ctxt
    err_func = xmlerror.xmlStructuredError
    if err_func:
        parser_input = c_ctxt.input
        # Copied from xmlVErrParser() in libxml2: get current input from stack.
        if parser_input and parser_input.filename is NULL and c_ctxt.inputNr > 1:
            parser_input = c_ctxt.inputTab[c_ctxt.inputNr - 2]

        # 'parser_input' may be NULL here, so every field that is read from it
        # needs a fallback value.
        c_error = xmlerror.xmlError(
            domain=xmlerror.xmlErrorDomain.XML_FROM_PARSER,
            code=xmlerror.xmlParserErrors.XML_ERR_EXT_ENTITY_STANDALONE,
            level=xmlerror.xmlErrorLevel.XML_ERR_FATAL,
            message=b"External entity resolution is disabled for security reasons "
                    b"when resolving '&%s;'. Use 'XMLParser(resolve_entities=True)' "
                    b"if you consider it safe to enable it.",
            file=parser_input.filename if parser_input else NULL,
            node=entity,
            str1=<char*> name,
            str2=NULL,
            str3=NULL,
            line=parser_input.line if parser_input else 0,
            int1=0,
            int2=parser_input.col if parser_input else 0,
        )
        err_context = xmlerror.xmlStructuredErrorContext
        err_func(err_context, &c_error)

    c_ctxt.wellFormed = 0
    # The entity was looked up and does not need to be freed.
    return NULL
|
1302
|
+
|
1303
|
+
|
1304
|
+
cdef void _initSaxDocument(void* ctxt) noexcept with gil:
    """
    SAX 'startDocument' callback that lxml installs on its parser contexts.

    Runs the libxml2 default handler and then makes the new document share
    the dict of the parser and sets up (or disables) the XML ID hash table,
    depending on the '_collect_ids' setting of the lxml-level context.
    """
    cdef xmlparser.xmlParserCtxt* c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    cdef xmlDoc* c_doc
    cdef _ParserContext context

    xmlparser.xmlSAX2StartDocument(ctxt)
    c_doc = c_ctxt.myDoc

    # set up document dict
    if c_doc and c_ctxt.dict and not c_doc.dict:
        # I have no idea why libxml2 disables this - we need it
        c_ctxt.dictNames = 1
        c_doc.dict = c_ctxt.dict
        xmlparser.xmlDictReference(c_ctxt.dict)

    # set up XML ID hash table
    if not c_ctxt._private:
        # no lxml-level context connected => nothing to configure
        return
    context = <_ParserContext>c_ctxt._private
    if not context._collect_ids:
        c_ctxt.loadsubset |= xmlparser.XML_SKIP_IDS
        if c_doc and c_doc.ids and not tree.xmlHashSize(c_doc.ids):
            # already initialised but empty => clear
            tree.xmlHashFree(c_doc.ids, NULL)
            c_doc.ids = NULL
    elif c_doc and not c_doc.ids:
        # keep the global parser dict from filling up with XML IDs
        # memory errors are not fatal here
        c_dict = xmlparser.xmlDictCreate()
        if c_dict:
            c_doc.ids = tree.xmlHashCreateDict(0, c_dict)
            xmlparser.xmlDictFree(c_dict)
        else:
            c_doc.ids = tree.xmlHashCreate(0)
|
1335
|
+
|
1336
|
+
|
1337
|
+
############################################################
|
1338
|
+
## ET feed parser
|
1339
|
+
############################################################
|
1340
|
+
|
1341
|
+
cdef class _FeedParser(_BaseParser):
    # True while a feed run is in progress, i.e. between the first feed()
    # call and the close() call (or a parse failure) that ends it.
    cdef bint _feed_parser_running

    @property
    def feed_error_log(self):
        """The error log of the last (or current) run of the feed parser.

        Note that this is local to the feed parser and thus is
        different from what the ``error_log`` property returns.
        """
        return self._getPushParserContext()._error_log.copy()

    cpdef feed(self, data):
        """feed(self, data)

        Feeds data to the parser.  The argument should be an 8-bit string
        buffer containing encoded data, although Unicode is supported as long
        as both string types are not mixed.

        This is the main entry point to the consumer interface of a
        parser.  The parser will parse as much of the XML stream as it
        can on each call.  To finish parsing or to reset the parser,
        call the ``close()`` method.  Both methods may raise
        ParseError if errors occur in the input data.  If an error is
        raised, there is no longer a need to call ``close()``.

        The feed parser interface is independent of the normal parser
        usage.  You can use the same parser as a feed parser and in
        the ``parse()`` function concurrently.

        Raises TypeError if ``data`` is neither ``bytes`` nor ``str``, and
        MemoryError if resetting the push parser context fails.
        """
        cdef _ParserContext context
        cdef bytes bstring
        cdef xmlparser.xmlParserCtxt* pctxt
        cdef Py_ssize_t py_buffer_len, ustart
        cdef const_char* char_data
        cdef const_char* c_encoding
        cdef int buffer_len
        cdef int error
        cdef bint recover = self._parse_options & xmlparser.XML_PARSE_RECOVER

        if isinstance(data, bytes):
            # byte input: use the parser's default encoding, if one is set
            if self._default_encoding is None:
                c_encoding = NULL
            else:
                c_encoding = self._default_encoding
            char_data = _cstr(data)
            py_buffer_len = python.PyBytes_GET_SIZE(data)
            ustart = 0
        elif isinstance(data, unicode):
            # Unicode input: char_data stays NULL, which marks "Unicode mode"
            # below; the text is encoded to UTF-8 chunk by chunk.
            c_encoding = b"UTF-8"
            char_data = NULL
            py_buffer_len = len(<unicode> data)
            ustart = 0
        else:
            raise TypeError, "Parsing requires string data"

        context = self._getPushParserContext()
        pctxt = context._c_ctxt
        error = 0
        if not self._feed_parser_running:
            # first chunk of a new feed run: (re-)initialise the push parser
            context.prepare(set_document_loader=False)
            self._feed_parser_running = 1
            c_filename = (_cstr(self._filename)
                          if self._filename is not None else NULL)

            # We have to give *mlCtxtResetPush() enough input to figure
            # out the character encoding (at least four bytes),
            # however if we give it all we got, we'll have nothing for
            # *mlParseChunk() and things go wrong.
            buffer_len = 0
            if char_data is not NULL:
                buffer_len = 4 if py_buffer_len > 4 else <int>py_buffer_len
            orig_loader = _register_document_loader()
            if self._for_html:
                error = _htmlCtxtResetPush(
                    pctxt, char_data, buffer_len, c_filename, c_encoding,
                    self._parse_options)
            else:
                xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options)
                error = xmlparser.xmlCtxtResetPush(
                    pctxt, char_data, buffer_len, c_filename, c_encoding)
            _reset_document_loader(orig_loader)
            # skip the bytes already handed to the reset call
            # (buffer_len is 0 in Unicode mode, so nothing moves there)
            py_buffer_len -= buffer_len
            char_data += buffer_len
            if error:
                raise MemoryError()
            __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)

        #print pctxt.charset, 'NONE' if c_encoding is NULL else c_encoding

        fixup_error = 0
        # keep feeding chunks until the input is used up; after a parse error,
        # continue only if the parser runs in recover mode
        while py_buffer_len > 0 and (error == 0 or recover):
            if char_data is NULL:
                # Unicode parsing by converting chunks to UTF-8
                buffer_len = 2**19  # len(bytes) <= 4 * (2**19) == 2 MiB
                bstring = (<unicode> data)[ustart : ustart+buffer_len].encode('UTF-8')
                ustart += buffer_len
                py_buffer_len -= buffer_len  # may end up < 0
                error, fixup_error = _parse_data_chunk(pctxt, <const char*> bstring, <int> len(bstring))
            else:
                # Direct byte string parsing.
                # the chunk length is passed as a C int, so cap it at INT_MAX
                buffer_len = <int>py_buffer_len if py_buffer_len <= limits.INT_MAX else limits.INT_MAX
                error, fixup_error = _parse_data_chunk(pctxt, char_data, buffer_len)
                py_buffer_len -= buffer_len
                char_data += buffer_len

            if fixup_error:
                # a failed name fix-up is reported as a MemoryError
                context.store_exception(MemoryError())

            if context._has_raised():
                # propagate Python exceptions immediately
                recover = 0
                error = 1
                break

            if error and not pctxt.replaceEntities and not pctxt.validate:
                # in this mode, we ignore errors about undefined entities
                for entry in context._error_log.filter_from_errors():
                    if entry.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
                           entry.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
                        break
                else:
                    # only undeclared-entity entries were logged: not an error
                    error = 0

        if not pctxt.wellFormed and xmlparser.xmlCtxtIsStopped(pctxt) and context._has_raised():
            # propagate Python exceptions immediately
            recover = 0
            error = 1

        if fixup_error or not recover and (error or not pctxt.wellFormed):
            # the feed run ends here; the parse result is handed to the
            # context for processing and the context is always cleaned up
            self._feed_parser_running = 0
            try:
                context._handleParseResult(self, pctxt.myDoc, None)
            finally:
                context.cleanup()

    cpdef close(self):
        """close(self)

        Terminates feeding data to this parser.  This tells the parser to
        process any remaining data in the feed buffer, and then returns the
        root Element of the tree that was parsed.

        This method must be called after passing the last chunk of data into
        the ``feed()`` method.  It should only be called when using the feed
        parser interface, all other usage is undefined.

        Raises XMLSyntaxError ("no element found") if no feed run is active.
        """
        if not self._feed_parser_running:
            raise XMLSyntaxError("no element found",
                                 xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
                                 self._filename)

        context = self._getPushParserContext()
        pctxt = context._c_ctxt

        self._feed_parser_running = 0
        # an empty chunk with the final argument set to 1 terminates the parse
        if self._for_html:
            htmlparser.htmlParseChunk(pctxt, NULL, 0, 1)
        else:
            xmlparser.xmlParseChunk(pctxt, NULL, 0, 1)

        if (pctxt.recovery and not xmlparser.xmlCtxtIsStopped(pctxt) and
                isinstance(context, _SaxParserContext)):
            # apply any left-over 'end' events
            (<_SaxParserContext>context).flushEvents()

        try:
            result = context._handleParseResult(self, pctxt.myDoc, None)
        finally:
            context.cleanup()

        # a _Document result is unwrapped to its root element,
        # any other result is returned unchanged
        if isinstance(result, _Document):
            return (<_Document>result).getroot()
        else:
            return result
|
1516
|
+
|
1517
|
+
|
1518
|
+
cdef (int, int) _parse_data_chunk(xmlparser.xmlParserCtxt* c_ctxt,
                                  const char* char_data, int buffer_len):
    # Feed one chunk of encoded data to the push parser without holding the GIL.
    #
    # Returns (error, fixup_error): the return code of the libxml2 chunk
    # parser and, for HTML contexts, the result of the name fix-up
    # (always 0 for XML contexts).
    fixup_error = 0
    with nogil:
        if c_ctxt.html:
            c_node = c_ctxt.node  # last node where the parser stopped
            orig_loader = _register_document_loader()
            error = htmlparser.htmlParseChunk(c_ctxt, char_data, buffer_len, 0)
            _reset_document_loader(orig_loader)
            # and now for the fun part: move node names to the dict
            if c_ctxt.myDoc:
                fixup_error = _fixHtmlDictSubtreeNames(
                    c_ctxt.dict, c_ctxt.myDoc, c_node)
                if c_ctxt.myDoc.dict and c_ctxt.myDoc.dict is not c_ctxt.dict:
                    # replace the document's own dict with the parser dict,
                    # taking a reference for the document
                    xmlparser.xmlDictFree(c_ctxt.myDoc.dict)
                    c_ctxt.myDoc.dict = c_ctxt.dict
                    xmlparser.xmlDictReference(c_ctxt.dict)
        else:
            orig_loader = _register_document_loader()
            error = xmlparser.xmlParseChunk(c_ctxt, char_data, buffer_len, 0)
            _reset_document_loader(orig_loader)
    return (error, fixup_error)
|
1540
|
+
|
1541
|
+
|
1542
|
+
cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt,
                             const_char* c_data, int buffer_len,
                             const_char* c_filename, const_char* c_encoding,
                             int parse_options) except -1:
    # Reset a parser context for HTML push parsing.
    #
    # Runs the generic xmlCtxtResetPush() and then switches the context into
    # HTML mode with the given parse options.  Returns 0 on success and
    # passes on the non-zero result of xmlCtxtResetPush() on failure.

    # libxml2 lacks an HTML push parser setup function
    error = xmlparser.xmlCtxtResetPush(
        c_ctxt, c_data, buffer_len, c_filename, c_encoding)
    if error:
        return error

    # fix libxml2 setup for HTML
    if tree.LIBXML_VERSION < 21400:
        c_ctxt.progressive = 1  # TODO: remove
    c_ctxt.html = 1
    htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options)

    return 0
|
1560
|
+
|
1561
|
+
|
1562
|
+
############################################################
|
1563
|
+
## XML parser
|
1564
|
+
############################################################
|
1565
|
+
|
1566
|
+
# Baseline libxml2 option flags that XMLParser.__init__() starts from
# before applying its keyword arguments.
cdef int _XML_DEFAULT_PARSE_OPTIONS
_XML_DEFAULT_PARSE_OPTIONS = (
    xmlparser.XML_PARSE_NOENT   |
    xmlparser.XML_PARSE_NOCDATA |
    xmlparser.XML_PARSE_NONET   |
    xmlparser.XML_PARSE_COMPACT |
    xmlparser.XML_PARSE_BIG_LINES
    )
|
1574
|
+
|
1575
|
+
cdef class XMLParser(_FeedParser):
    """XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, \
                 load_dtd=False, no_network=True, decompress=False, ns_clean=False, \
                 recover=False, schema: XMLSchema =None, huge_tree=False, \
                 remove_blank_text=False, resolve_entities='internal', \
                 remove_comments=False, remove_pis=False, strip_cdata=True, \
                 collect_ids=True, target=None, compact=True)

    The XML parser.

    Parsers can be supplied as additional argument to various parse
    functions of the lxml API.  A default parser is always available
    and can be replaced by a call to the global function
    'set_default_parser'.  New parsers can be created at any time
    without a major run-time overhead.

    The keyword arguments in the constructor are mainly based on the
    libxml2 parser configuration.  A DTD will also be loaded if DTD
    validation or attribute default values are requested (unless you
    additionally provide an XMLSchema from which the default
    attributes can be read).

    Available boolean keyword arguments:

    - attribute_defaults - inject default attributes from DTD or XMLSchema
    - dtd_validation     - validate against a DTD referenced by the document
    - load_dtd           - use DTD for parsing
    - no_network         - prevent network access for related files (default: True)
    - decompress         - automatically decompress gzip input
                           (default: False, changed in lxml 6.0, disabling only affects libxml2 2.15+)
    - ns_clean           - clean up redundant namespace declarations
    - recover            - try hard to parse through broken XML
    - remove_blank_text  - discard blank text nodes that appear ignorable
    - remove_comments    - discard comments
    - remove_pis         - discard processing instructions
    - strip_cdata        - replace CDATA sections by normal text content (default: True)
    - compact            - save memory for short text content (default: True)
    - collect_ids        - use a hash table of XML IDs for fast access
                           (default: True, always True with DTD validation)
    - huge_tree          - disable security restrictions and support very deep trees
                           and very long text content

    Other keyword arguments:

    - resolve_entities - replace entities by their text value: False for keeping the
          entity references, True for resolving them, and 'internal' for resolving
          internal definitions only (no external file/URL access).
          The default used to be True and was changed to 'internal' in lxml 5.0.
    - encoding - override the document encoding (note: libiconv encoding name)
    - target   - a parser target object that will receive the parse events
    - schema   - an XMLSchema to validate against

    Note that you should avoid sharing parsers between threads.  While this is
    not harmful, it is more efficient to use separate parsers.  This does not
    apply to the default parser.
    """
    def __init__(self, *, encoding=None, attribute_defaults=False,
                 dtd_validation=False, load_dtd=False, no_network=True, decompress=False,
                 ns_clean=False, recover=False, XMLSchema schema=None,
                 huge_tree=False, remove_blank_text=False, resolve_entities='internal',
                 remove_comments=False, remove_pis=False, strip_cdata=True,
                 collect_ids=True, target=None, compact=True):
        # NOTE(review): 'decompress' is accepted but not referenced anywhere
        # in this constructor - confirm where (or whether) it is applied.
        cdef int parse_options
        cdef bint resolve_external = True
        parse_options = _XML_DEFAULT_PARSE_OPTIONS

        # options that are off by default: switch them on when requested
        if load_dtd:
            parse_options |= xmlparser.XML_PARSE_DTDLOAD
        if dtd_validation:
            # validation implies loading the DTD
            parse_options |= (xmlparser.XML_PARSE_DTDVALID |
                              xmlparser.XML_PARSE_DTDLOAD)
        if attribute_defaults:
            parse_options |= xmlparser.XML_PARSE_DTDATTR
            if schema is None:
                # without a schema, the DTD is loaded as well
                parse_options |= xmlparser.XML_PARSE_DTDLOAD
        if ns_clean:
            parse_options |= xmlparser.XML_PARSE_NSCLEAN
        if recover:
            parse_options |= xmlparser.XML_PARSE_RECOVER
        if remove_blank_text:
            parse_options |= xmlparser.XML_PARSE_NOBLANKS
        if huge_tree:
            parse_options |= xmlparser.XML_PARSE_HUGE

        # options that are part of the defaults: clear them explicitly
        # (mask instead of XOR, so the result does not depend on the bit
        # currently being set)
        if not no_network:
            parse_options &= ~xmlparser.XML_PARSE_NONET
        if not compact:
            parse_options &= ~xmlparser.XML_PARSE_COMPACT
        if not resolve_entities:
            parse_options &= ~xmlparser.XML_PARSE_NOENT
        elif resolve_entities == 'internal':
            # entities are still resolved, but external resolution is disabled
            resolve_external = False
        if not strip_cdata:
            parse_options &= ~xmlparser.XML_PARSE_NOCDATA

        _BaseParser.__init__(self, parse_options, False, schema,
                             remove_comments, remove_pis, strip_cdata,
                             collect_ids, target, encoding, resolve_external)

    # Allow subscripting XMLParser in type annotations (PEP 560)
    def __class_getitem__(cls, item):
        return _GenericAlias(cls, item)
|
1675
|
+
|
1676
|
+
|
1677
|
+
cdef class XMLPullParser(XMLParser):
    """XMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)

    XML parser that collects parse events in an iterator.

    It produces the same events as iterparse().  Unlike iterparse(),
    it does not read from a file(-like) object by itself: data chunks
    are pushed into it incrementally through the .feed() method, which
    makes the parser non-blocking.

    Element end events are collected by default.  Pass any subset of
    ``'start'``, ``'end'``, ``'start-ns'``, ``'end-ns'``, ``'comment'``,
    ``'pi'`` as the ``events`` argument to change that.

    Pass ``base_url`` to support loading external dependencies relative
    to the input source.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        XMLParser.__init__(self, **kwargs)
        self._setBaseURL(base_url)
        # fall back to collecting only 'end' events
        self._collectEvents(('end',) if events is None else events, tag)

    def read_events(self):
        # The push parser context must be a SAX parser context,
        # the checked cast enforces that.
        cdef _SaxParserContext sax_context = \
            <_SaxParserContext?>self._getPushParserContext()
        return sax_context.events_iterator
|
1704
|
+
|
1705
|
+
|
1706
|
+
cdef class ETCompatXMLParser(XMLParser):
    """ETCompatXMLParser(self, encoding=None, attribute_defaults=False, \
                 dtd_validation=False, load_dtd=False, no_network=True, decompress=False, \
                 ns_clean=False, recover=False, schema=None, \
                 huge_tree=False, remove_blank_text=False, resolve_entities=True, \
                 remove_comments=True, remove_pis=True, strip_cdata=True, \
                 target=None, compact=True)

    An XML parser with an ElementTree compatible default setup.

    All options are documented on the XMLParser class.

    In contrast to XMLParser, ``remove_comments`` and ``remove_pis`` are
    enabled by default, so comments and processing instructions are ignored.
    """
    def __init__(self, *, encoding=None, attribute_defaults=False,
                 dtd_validation=False, load_dtd=False, no_network=True, decompress=False,
                 ns_clean=False, recover=False, schema=None,
                 huge_tree=False, remove_blank_text=False, resolve_entities=True,
                 remove_comments=True, remove_pis=True, strip_cdata=True,
                 target=None, compact=True):
        # Forward every option unchanged, in signature order.
        XMLParser.__init__(
            self,
            encoding=encoding,
            attribute_defaults=attribute_defaults,
            dtd_validation=dtd_validation,
            load_dtd=load_dtd,
            no_network=no_network,
            decompress=decompress,
            ns_clean=ns_clean,
            recover=recover,
            schema=schema,
            huge_tree=huge_tree,
            remove_blank_text=remove_blank_text,
            resolve_entities=resolve_entities,
            remove_comments=remove_comments,
            remove_pis=remove_pis,
            strip_cdata=strip_cdata,
            target=target,
            compact=compact,
        )
|
1746
|
+
|
1747
|
+
# ET 1.2 compatible name
XMLTreeBuilder = ETCompatXMLParser


# The pre-installed XML parser instance; it is registered as the default
# parser of the global parser context right away.
cdef XMLParser __DEFAULT_XML_PARSER
__DEFAULT_XML_PARSER = XMLParser()

__GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER)
|
1755
|
+
|
1756
|
+
def set_default_parser(_BaseParser parser=None):
    """set_default_parser(parser=None)

    Set a default parser for the current thread.  It is used globally by
    the parse functions of the lxml API whenever they are called without an
    explicit parser.  Calling this function without a parser, or with None,
    resets the default parser to the original configuration.

    Note that the pre-installed default parser is not thread-safe.  Avoid the
    default parser in multi-threaded environments.  You can create a separate
    parser for each thread explicitly or use a parser pool.
    """
    __GLOBAL_PARSER_CONTEXT.setDefaultParser(
        __DEFAULT_XML_PARSER if parser is None else parser)
|
1771
|
+
|
1772
|
+
def get_default_parser():
    """get_default_parser()

    Return the default parser as reported by the global parser context.
    """
    default_parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return default_parser
|
1775
|
+
|
1776
|
+
############################################################
|
1777
|
+
## HTML parser
|
1778
|
+
############################################################
|
1779
|
+
|
1780
|
+
# Baseline libxml2 option flags that HTMLParser.__init__() starts from
# before applying its keyword arguments.
cdef int _HTML_DEFAULT_PARSE_OPTIONS
_HTML_DEFAULT_PARSE_OPTIONS = (
    htmlparser.HTML_PARSE_RECOVER |
    htmlparser.HTML_PARSE_NONET   |
    htmlparser.HTML_PARSE_COMPACT
    )

# Sentinel default for the 'strip_cdata' argument of HTMLParser(); it lets
# the constructor detect that the caller passed the argument explicitly.
cdef object _UNUSED = object()
|
1788
|
+
|
1789
|
+
cdef class HTMLParser(_FeedParser):
    """HTMLParser(self, encoding=None, remove_blank_text=False, \
                   remove_comments=False, remove_pis=False, \
                   no_network=True, decompress=False, target=None, schema: XMLSchema =None, \
                   recover=True, compact=True, default_doctype=True, \
                   collect_ids=True, huge_tree=False)

    The HTML parser.

    This parser allows reading HTML into a normal XML tree.  By
    default, it can read broken (non well-formed) HTML, depending on
    the capabilities of libxml2.  Use the 'recover' option to switch
    this off.

    Available boolean keyword arguments:

    - recover            - try hard to parse through broken HTML (default: True)
    - no_network         - prevent network access for related files (default: True)
    - decompress         - automatically decompress gzip input
                           (default: False, changed in lxml 6.0, disabling only affects libxml2 2.15+)
    - remove_blank_text  - discard empty text nodes that are ignorable (i.e. not actual text content)
    - remove_comments    - discard comments
    - remove_pis         - discard processing instructions
    - compact            - save memory for short text content (default: True)
    - default_doctype    - add a default doctype even if it is not found in the HTML (default: True)
    - collect_ids        - use a hash table of XML IDs for fast access (default: True)
    - huge_tree          - disable security restrictions and support very deep trees
                           and very long text content

    Other keyword arguments:

    - encoding - override the document encoding (note: libiconv encoding name)
    - target   - a parser target object that will receive the parse events
    - schema   - an XMLSchema to validate against

    The 'strip_cdata' argument is deprecated: passing it triggers a
    DeprecationWarning.

    Note that you should avoid sharing parsers between threads for performance
    reasons.
    """
    def __init__(self, *, encoding=None, remove_blank_text=False,
                 remove_comments=False, remove_pis=False, strip_cdata=_UNUSED,
                 no_network=True, decompress=False, target=None, XMLSchema schema=None,
                 recover=True, compact=True, default_doctype=True,
                 collect_ids=True, huge_tree=False):
        # NOTE(review): 'decompress' is accepted but not referenced anywhere
        # in this constructor - confirm where (or whether) it is applied.
        cdef int parse_options
        parse_options = _HTML_DEFAULT_PARSE_OPTIONS

        # options that are off by default: switch them on when requested
        if remove_blank_text:
            parse_options |= htmlparser.HTML_PARSE_NOBLANKS
        if not default_doctype:
            # NODEFDTD is not part of the defaults, so it must be *set* here;
            # '|' does that regardless of the bit's current state
            parse_options |= htmlparser.HTML_PARSE_NODEFDTD
        if huge_tree:
            parse_options |= xmlparser.XML_PARSE_HUGE

        # options that are part of the defaults: clear them explicitly
        if not recover:
            parse_options &= ~htmlparser.HTML_PARSE_RECOVER
        if not no_network:
            parse_options &= ~htmlparser.HTML_PARSE_NONET
        if not compact:
            parse_options &= ~htmlparser.HTML_PARSE_COMPACT

        if strip_cdata is not _UNUSED:
            # the argument was passed explicitly: warn about its deprecation
            import warnings
            warnings.warn(
                "The 'strip_cdata' option of HTMLParser() has never done anything and will eventually be removed.",
                DeprecationWarning)
        _BaseParser.__init__(self, parse_options, True, schema,
                             remove_comments, remove_pis, strip_cdata,
                             collect_ids, target, encoding)

    # Allow subscripting HTMLParser in type annotations (PEP 560)
    def __class_getitem__(cls, item):
        return _GenericAlias(cls, item)
|
1858
|
+
|
1859
|
+
|
1860
|
+
# Module-level HTMLParser instance created with default options.
cdef HTMLParser __DEFAULT_HTML_PARSER
__DEFAULT_HTML_PARSER = HTMLParser()
|
1862
|
+
|
1863
|
+
|
1864
|
+
cdef class HTMLPullParser(HTMLParser):
    """HTMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)

    HTML parser that collects parse events in an iterator.

    It produces the same events as iterparse().  Unlike iterparse(),
    it does not read from a file(-like) object by itself: data chunks
    are pushed into it incrementally through the .feed() method, which
    makes the parser non-blocking.

    Element end events are collected by default.  Pass any subset of
    ``'start'``, ``'end'``, ``'start-ns'``, ``'end-ns'``, ``'comment'``,
    ``'pi'`` as the ``events`` argument to change that.

    Pass ``base_url`` to support loading external dependencies relative
    to the input source.
    """
    def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
        HTMLParser.__init__(self, **kwargs)
        self._setBaseURL(base_url)
        # fall back to collecting only 'end' events
        self._collectEvents(('end',) if events is None else events, tag)

    def read_events(self):
        # The push parser context must be a SAX parser context,
        # the checked cast enforces that.
        cdef _SaxParserContext sax_context = \
            <_SaxParserContext?>self._getPushParserContext()
        return sax_context.events_iterator
|
1891
|
+
|
1892
|
+
|
1893
|
+
############################################################
|
1894
|
+
## helper functions for document creation
|
1895
|
+
############################################################
|
1896
|
+
|
1897
|
+
cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL:
    # Parse in-memory input, dispatching on its type: bytes, unicode,
    # or anything else (handled as a character buffer).
    # Uses the default parser of the global context when 'parser' is None.
    cdef char* c_filename = NULL
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    if filename:
        # 'filename_utf' owns the bytes that c_filename points into
        filename_utf = _encodeFilenameUTF8(filename)
        c_filename = _cstr(filename_utf)

    if isinstance(text, bytes):
        return _parseDoc_bytes(<bytes> text, filename, c_filename, parser)
    if isinstance(text, unicode):
        return _parseDoc_unicode(<unicode> text, filename, c_filename, parser)
    return _parseDoc_charbuffer(text, filename, c_filename, parser)
|
1912
|
+
|
1913
|
+
|
1914
|
+
cdef xmlDoc* _parseDoc_unicode(unicode text, filename, char* c_filename, _BaseParser parser) except NULL:
    # Parse a Unicode string.  Strings whose data size exceeds INT_MAX are
    # routed through the file-like parsing path via a StringIO wrapper.
    cdef Py_ssize_t data_size
    if not python.PyUnicode_IS_READY(text):
        # old Py_UNICODE string
        data_size = python.PyUnicode_GET_DATA_SIZE(text)
    else:
        # PEP-393 Unicode string
        data_size = python.PyUnicode_GET_LENGTH(text) * python.PyUnicode_KIND(text)

    if data_size <= limits.INT_MAX:
        return parser._parseUnicodeDoc(text, c_filename)
    return parser._parseDocFromFilelike(
        StringIO(text), filename, None)
|
1926
|
+
|
1927
|
+
|
1928
|
+
cdef xmlDoc* _parseDoc_bytes(bytes text, filename, char* c_filename, _BaseParser parser) except NULL:
    # Parse a byte string.  Input longer than INT_MAX is routed through
    # the file-like parsing path via a BytesIO wrapper.
    cdef Py_ssize_t byte_count = len(text)
    if byte_count <= limits.INT_MAX:
        return parser._parseDoc(text, byte_count, c_filename)
    return parser._parseDocFromFilelike(BytesIO(text), filename, None)
|
1933
|
+
|
1934
|
+
|
1935
|
+
cdef xmlDoc* _parseDoc_charbuffer(text, filename, char* c_filename, _BaseParser parser) except NULL:
    # Parse from an object viewed as a contiguous 'unsigned char' buffer.
    # Input longer than INT_MAX is routed through the file-like parsing path.
    cdef const unsigned char[::1] byte_view = memoryview(text).cast('B')  # cast to 'unsigned char' buffer
    cdef Py_ssize_t byte_count = len(byte_view)
    if byte_count <= limits.INT_MAX:
        return parser._parseDoc(<const char*>&byte_view[0], byte_count, c_filename)
    return parser._parseDocFromFilelike(BytesIO(text), filename, None)
|
1941
|
+
|
1942
|
+
|
1943
|
+
cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL:
    # Parse the file named by 'filename8', using the default parser of the
    # global context when no parser is given.
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return parser._parseDocFromFile(_cstr(filename8))
|
1947
|
+
|
1948
|
+
|
1949
|
+
cdef xmlDoc* _parseDocFromFilelike(source, filename,
                                   _BaseParser parser) except NULL:
    # Parse from a file-like object, using the default parser of the
    # global context when no parser is given.
    if parser is None:
        parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
    return parser._parseDocFromFilelike(source, filename, None)
|
1954
|
+
|
1955
|
+
|
1956
|
+
cdef xmlDoc* _newXMLDoc() except NULL:
    # Create an empty XML document, give it a UTF-8 encoding if it has none,
    # and let the global parser context initialise its dict.
    # Raises MemoryError if the document cannot be allocated.
    cdef xmlDoc* c_new_doc = tree.xmlNewDoc(NULL)
    if c_new_doc is NULL:
        raise MemoryError()
    if c_new_doc.encoding is NULL:
        c_new_doc.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_new_doc)
    return c_new_doc
|
1965
|
+
|
1966
|
+
cdef xmlDoc* _newHTMLDoc() except NULL:
    # Create an empty HTML document and let the global parser context
    # initialise its dict.  Raises MemoryError if allocation fails.
    cdef xmlDoc* c_new_doc = tree.htmlNewDoc(NULL, NULL)
    if c_new_doc is NULL:
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_new_doc)
    return c_new_doc
|
1973
|
+
|
1974
|
+
cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive) except NULL:
    # Copy a document.  The recursive copy runs without the GIL.
    # Raises MemoryError if libxml2 returns no copy.
    cdef xmlDoc* c_copy
    if not recursive:
        c_copy = tree.xmlCopyDoc(c_doc, 0)
    else:
        with nogil:
            c_copy = tree.xmlCopyDoc(c_doc, recursive)
    if c_copy is NULL:
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(c_copy)
    return c_copy
|
1985
|
+
|
1986
|
+
cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root) except NULL:
    """Recursively copy the document and make c_new_root the new root node.

    The document itself is copied non-recursively; only the subtree below
    c_new_root is copied into it, followed by the node's tail.

    Raises MemoryError if either the document or the node copy fails.
    """
    cdef xmlDoc* result
    cdef xmlNode* c_node
    result = tree.xmlCopyDoc(c_doc, 0)  # non recursive
    if result is NULL:
        # same check as in _copyDoc(): never touch a failed copy
        raise MemoryError()
    __GLOBAL_PARSER_CONTEXT.initDocDict(result)
    with nogil:
        c_node = tree.xmlDocCopyNode(c_new_root, result, 1)  # recursive
    if c_node is NULL:
        # do not leak the freshly copied document on failure
        tree.xmlFreeDoc(result)
        raise MemoryError()
    tree.xmlDocSetRootElement(result, c_node)
    _copyTail(c_new_root.next, c_node)
    return result
|
1999
|
+
|
2000
|
+
cdef xmlNode* _copyNodeToDoc(xmlNode* c_node, xmlDoc* c_doc) except NULL:
    "Recursively copy the element (and its tail) for use in c_doc, which is not modified."
    cdef xmlNode* c_copy = tree.xmlDocCopyNode(c_node, c_doc, 1)  # recursive
    if c_copy is NULL:
        raise MemoryError()
    _copyTail(c_node.next, c_copy)
    return c_copy
|
2008
|
+
|
2009
|
+
|
2010
|
+
############################################################
|
2011
|
+
## API level helper functions for _Document creation
|
2012
|
+
############################################################
|
2013
|
+
|
2014
|
+
cdef _Document _parseDocument(source, _BaseParser parser, base_url):
    # Parse 'source' into a _Document.  Supported sources: a string path/URL,
    # an object with getvalue()/tell() positioned at 0, or any object with
    # a read() method.  Raises TypeError for anything else.
    cdef _Document doc
    source = _getFSPathOrObject(source)

    if _isString(source):
        # parse the file directly from the filesystem
        doc = _parseDocumentFromURL(_encodeFilename(source), parser)
        if base_url is None:
            return doc
        # fix base URL as requested
        base_url = _encodeFilenameUTF8(base_url)
        if doc._c_doc.URL is not NULL:
            tree.xmlFree(<char*>doc._c_doc.URL)
        doc._c_doc.URL = tree.xmlStrdup(_xcstr(base_url))
        return doc

    url = base_url if base_url is not None else _getFilenameForFile(source)

    # StringIO - reading from start?
    if (hasattr(source, 'getvalue') and hasattr(source, 'tell')
            and source.tell() == 0):
        return _parseMemoryDocument(source.getvalue(), url, parser)

    # Support for file-like objects (urlgrabber.urlopen, ...)
    if hasattr(source, 'read'):
        return _parseFilelikeDocument(source, url, parser)

    raise TypeError, f"cannot parse from '{python._fqtypename(source).decode('UTF-8')}'"
|
2043
|
+
|
2044
|
+
cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
    # Parse the file at 'url' and wrap the resulting C document.
    return _documentFactory(_parseDocFromFile(url, parser), parser)
|
2047
|
+
|
2048
|
+
cdef _Document _parseMemoryDocument(text, url, _BaseParser parser):
    # Parse in-memory data and wrap the resulting C document.
    # Unicode input that carries an encoding declaration is rejected
    # with a ValueError.
    if isinstance(text, unicode) and _hasEncodingDeclaration(text):
        raise ValueError(
            "Unicode strings with encoding declaration are not supported. "
            "Please use bytes input or XML fragments without declaration.")
    return _documentFactory(_parseDoc(text, url, parser), parser)
|
2056
|
+
|
2057
|
+
cdef _Document _parseFilelikeDocument(source, url, _BaseParser parser):
    # Parse from a file-like object and wrap the resulting C document.
    return _documentFactory(_parseDocFromFilelike(source, url, parser), parser)
|