lxml 5.3.2__cp39-cp39-macosx_10_9_universal2.whl → 6.0.0__cp39-cp39-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lxml/__init__.py +1 -1
- lxml/_elementpath.cpython-39-darwin.so +0 -0
- lxml/_elementpath.py +3 -1
- lxml/apihelpers.pxi +25 -17
- lxml/builder.cpython-39-darwin.so +0 -0
- lxml/builder.py +11 -0
- lxml/debug.pxi +0 -54
- lxml/etree.cpython-39-darwin.so +0 -0
- lxml/etree.h +24 -28
- lxml/etree.pyx +154 -33
- lxml/etree_api.h +59 -50
- lxml/extensions.pxi +3 -6
- lxml/html/__init__.py +7 -3
- lxml/html/_difflib.cpython-39-darwin.so +0 -0
- lxml/html/_difflib.py +2106 -0
- lxml/html/builder.py +40 -0
- lxml/html/defs.py +3 -3
- lxml/html/diff.cpython-39-darwin.so +0 -0
- lxml/html/diff.py +406 -312
- lxml/includes/etree_defs.h +6 -6
- lxml/includes/libexslt/exsltconfig.h +3 -3
- lxml/includes/libxml/HTMLparser.h +41 -45
- lxml/includes/libxml/HTMLtree.h +1 -0
- lxml/includes/libxml/SAX.h +2 -186
- lxml/includes/libxml/SAX2.h +2 -3
- lxml/includes/libxml/c14n.h +1 -12
- lxml/includes/libxml/catalog.h +1 -0
- lxml/includes/libxml/debugXML.h +0 -138
- lxml/includes/libxml/encoding.h +131 -59
- lxml/includes/libxml/entities.h +12 -20
- lxml/includes/libxml/globals.h +0 -16
- lxml/includes/libxml/hash.h +19 -0
- lxml/includes/libxml/list.h +2 -2
- lxml/includes/libxml/nanoftp.h +3 -173
- lxml/includes/libxml/nanohttp.h +17 -0
- lxml/includes/libxml/parser.h +505 -256
- lxml/includes/libxml/parserInternals.h +26 -98
- lxml/includes/libxml/relaxng.h +7 -2
- lxml/includes/libxml/threads.h +0 -6
- lxml/includes/libxml/tree.h +61 -97
- lxml/includes/libxml/uri.h +11 -0
- lxml/includes/libxml/valid.h +49 -14
- lxml/includes/libxml/xinclude.h +12 -0
- lxml/includes/libxml/xlink.h +4 -0
- lxml/includes/libxml/xmlIO.h +33 -35
- lxml/includes/libxml/xmlautomata.h +19 -2
- lxml/includes/libxml/xmlerror.h +32 -18
- lxml/includes/libxml/xmlexports.h +61 -15
- lxml/includes/libxml/xmlmemory.h +27 -64
- lxml/includes/libxml/xmlmodule.h +4 -0
- lxml/includes/libxml/xmlreader.h +13 -3
- lxml/includes/libxml/xmlregexp.h +7 -106
- lxml/includes/libxml/xmlsave.h +15 -1
- lxml/includes/libxml/xmlschemas.h +10 -5
- lxml/includes/libxml/xmlunicode.h +3 -190
- lxml/includes/libxml/xmlversion.h +15 -194
- lxml/includes/libxml/xmlwriter.h +1 -0
- lxml/includes/libxml/xpath.h +9 -15
- lxml/includes/libxml/xpathInternals.h +9 -3
- lxml/includes/libxml/xpointer.h +1 -91
- lxml/includes/libxslt/xsltconfig.h +6 -6
- lxml/includes/lxml-version.h +1 -1
- lxml/includes/tree.pxd +10 -12
- lxml/includes/xmlparser.pxd +46 -8
- lxml/lxml.etree.h +24 -28
- lxml/lxml.etree_api.h +59 -50
- lxml/objectify.cpython-39-darwin.so +0 -0
- lxml/objectify.pyx +11 -7
- lxml/parser.pxi +106 -47
- lxml/sax.cpython-39-darwin.so +0 -0
- lxml/sax.py +11 -0
- lxml/saxparser.pxi +14 -14
- lxml/schematron.pxi +8 -3
- lxml/serializer.pxi +71 -3
- lxml/xslt.pxi +10 -3
- lxml-6.0.0.dist-info/METADATA +163 -0
- {lxml-5.3.2.dist-info → lxml-6.0.0.dist-info}/RECORD +81 -79
- {lxml-5.3.2.dist-info → lxml-6.0.0.dist-info}/WHEEL +2 -1
- {lxml-5.3.2.dist-info → lxml-6.0.0.dist-info}/licenses/LICENSE.txt +3 -1
- lxml-5.3.2.dist-info/METADATA +0 -100
- {lxml-5.3.2.dist-info → lxml-6.0.0.dist-info}/licenses/LICENSES.txt +0 -0
- {lxml-5.3.2.dist-info → lxml-6.0.0.dist-info}/top_level.txt +0 -0
lxml/objectify.pyx
CHANGED
@@ -18,6 +18,7 @@ from lxml.includes cimport tree
|
|
18
18
|
cimport lxml.includes.etreepublic as cetree
|
19
19
|
cimport libc.string as cstring_h # not to be confused with stdlib 'string'
|
20
20
|
from libc.string cimport const_char
|
21
|
+
from libc cimport limits
|
21
22
|
|
22
23
|
__all__ = ['BoolElement', 'DataElement', 'E', 'Element', 'ElementMaker',
|
23
24
|
'FloatElement', 'IntElement', 'NoneElement',
|
@@ -420,8 +421,11 @@ cdef object _lookupChild(_Element parent, tag):
|
|
420
421
|
cdef tree.xmlNode* c_node
|
421
422
|
c_node = parent._c_node
|
422
423
|
ns, tag = cetree.getNsTagWithEmptyNs(tag)
|
424
|
+
c_tag_len = len(<bytes> tag)
|
425
|
+
if c_tag_len > limits.INT_MAX:
|
426
|
+
return None
|
423
427
|
c_tag = tree.xmlDictExists(
|
424
|
-
c_node.doc.dict, _xcstr(tag),
|
428
|
+
c_node.doc.dict, _xcstr(tag), <int> c_tag_len)
|
425
429
|
if c_tag is NULL:
|
426
430
|
return None # not in the hash map => not in the tree
|
427
431
|
if ns is None:
|
@@ -1283,7 +1287,7 @@ cdef object _guessElementClass(tree.xmlNode* c_node):
|
|
1283
1287
|
return None
|
1284
1288
|
if value == '':
|
1285
1289
|
return StringElement
|
1286
|
-
|
1290
|
+
|
1287
1291
|
for type_check, pytype in _TYPE_CHECKS:
|
1288
1292
|
try:
|
1289
1293
|
type_check(value)
|
@@ -1689,8 +1693,8 @@ def annotate(element_or_tree, *, ignore_old=True, ignore_xsi=False,
|
|
1689
1693
|
|
1690
1694
|
If the 'ignore_xsi' keyword argument is False (the default), existing
|
1691
1695
|
'xsi:type' attributes will be used for the type annotation, if they fit the
|
1692
|
-
element text values.
|
1693
|
-
|
1696
|
+
element text values.
|
1697
|
+
|
1694
1698
|
Note that the mapping from Python types to XSI types is usually ambiguous.
|
1695
1699
|
Currently, only the first XSI type name in the corresponding PyType
|
1696
1700
|
definition will be used for annotation. Thus, you should consider naming
|
@@ -1705,7 +1709,7 @@ def annotate(element_or_tree, *, ignore_old=True, ignore_xsi=False,
|
|
1705
1709
|
elements. Pass 'string', for example, to make string values the default.
|
1706
1710
|
|
1707
1711
|
The keyword arguments 'annotate_xsi' (default: 0) and 'annotate_pytype'
|
1708
|
-
(default: 1) control which kind(s) of annotation to use.
|
1712
|
+
(default: 1) control which kind(s) of annotation to use.
|
1709
1713
|
"""
|
1710
1714
|
cdef _Element element
|
1711
1715
|
element = cetree.rootNodeOrRaise(element_or_tree)
|
@@ -1878,7 +1882,7 @@ def deannotate(element_or_tree, *, bint pytype=True, bint xsi=True,
|
|
1878
1882
|
and/or 'xsi:type' attributes and/or 'xsi:nil' attributes.
|
1879
1883
|
|
1880
1884
|
If the 'pytype' keyword argument is True (the default), 'py:pytype'
|
1881
|
-
attributes will be removed. If the 'xsi' keyword argument is True (the
|
1885
|
+
attributes will be removed. If the 'xsi' keyword argument is True (the
|
1882
1886
|
default), 'xsi:type' attributes will be removed.
|
1883
1887
|
If the 'xsi_nil' keyword argument is True (default: False), 'xsi:nil'
|
1884
1888
|
attributes will be removed.
|
@@ -2124,7 +2128,7 @@ def DataElement(_value, attrib=None, nsmap=None, *, _pytype=None, _xsi=None,
|
|
2124
2128
|
stringify = unicode if py_type is None else py_type.stringify
|
2125
2129
|
strval = stringify(_value)
|
2126
2130
|
|
2127
|
-
if _pytype is not None:
|
2131
|
+
if _pytype is not None:
|
2128
2132
|
if _pytype == "NoneType" or _pytype == "none":
|
2129
2133
|
strval = None
|
2130
2134
|
_attributes[XML_SCHEMA_INSTANCE_NIL_ATTR] = "true"
|
lxml/parser.pxi
CHANGED
@@ -3,6 +3,14 @@
|
|
3
3
|
from lxml.includes cimport xmlparser
|
4
4
|
from lxml.includes cimport htmlparser
|
5
5
|
|
6
|
+
cdef object _GenericAlias
|
7
|
+
try:
|
8
|
+
from types import GenericAlias as _GenericAlias
|
9
|
+
except ImportError:
|
10
|
+
# Python 3.8 - we only need this as return value from "__class_getitem__"
|
11
|
+
def _GenericAlias(cls, item):
|
12
|
+
return f"{cls.__name__}[{item.__name__}]"
|
13
|
+
|
6
14
|
|
7
15
|
class ParseError(LxmlSyntaxError):
|
8
16
|
"""Syntax error while parsing an XML document.
|
@@ -53,7 +61,6 @@ cdef class _ParserDictionaryContext:
|
|
53
61
|
cdef list _implied_parser_contexts
|
54
62
|
|
55
63
|
def __cinit__(self):
|
56
|
-
self._c_dict = NULL
|
57
64
|
self._implied_parser_contexts = []
|
58
65
|
|
59
66
|
def __dealloc__(self):
|
@@ -295,9 +302,7 @@ cdef class _FileReaderContext:
|
|
295
302
|
self._filelike = filelike
|
296
303
|
self._close_file_after_read = close_file
|
297
304
|
self._encoding = encoding
|
298
|
-
if url is None:
|
299
|
-
self._c_url = NULL
|
300
|
-
else:
|
305
|
+
if url is not None:
|
301
306
|
url = _encodeFilename(url)
|
302
307
|
self._c_url = _cstr(url)
|
303
308
|
self._url = url
|
@@ -419,8 +424,6 @@ cdef class _FileReaderContext:
|
|
419
424
|
cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size) noexcept with gil:
|
420
425
|
return (<_FileReaderContext>ctxt).copyToBuffer(c_buffer, c_size)
|
421
426
|
|
422
|
-
cdef int _readFileParser(void* ctxt, char* c_buffer, int c_size) noexcept nogil:
|
423
|
-
return stdio.fread(c_buffer, 1, c_size, <stdio.FILE*>ctxt)
|
424
427
|
|
425
428
|
############################################################
|
426
429
|
## support for custom document loaders
|
@@ -542,11 +545,8 @@ cdef class _ParserContext(_ResolverContext):
|
|
542
545
|
cdef bint _collect_ids
|
543
546
|
|
544
547
|
def __cinit__(self):
|
545
|
-
self._c_ctxt = NULL
|
546
548
|
self._collect_ids = True
|
547
|
-
if
|
548
|
-
self._lock = NULL
|
549
|
-
else:
|
549
|
+
if config.ENABLE_THREADING:
|
550
550
|
self._lock = python.PyThread_allocate_lock()
|
551
551
|
self._error_log = _ErrorLog()
|
552
552
|
|
@@ -573,6 +573,9 @@ cdef class _ParserContext(_ResolverContext):
|
|
573
573
|
return context
|
574
574
|
|
575
575
|
cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt) noexcept:
|
576
|
+
"""
|
577
|
+
Connects the libxml2-level context to the lxml-level parser context.
|
578
|
+
"""
|
576
579
|
self._c_ctxt = c_ctxt
|
577
580
|
c_ctxt._private = <void*>self
|
578
581
|
|
@@ -597,6 +600,12 @@ cdef class _ParserContext(_ResolverContext):
|
|
597
600
|
raise ParserError, "parser locking failed"
|
598
601
|
self._error_log.clear()
|
599
602
|
self._doc = None
|
603
|
+
# Connect the lxml error log with libxml2's error handling. In the case of parsing
|
604
|
+
# HTML, ctxt->sax is not set to null, so this always works. The libxml2 function
|
605
|
+
# that does this is htmlInitParserCtxt in HTMLparser.c. For HTML (and possibly XML
|
606
|
+
# too), libxml2's SAX's serror is set to be the place where errors are sent when
|
607
|
+
# schannel is set to ctxt->sax->serror in xmlCtxtErrMemory in libxml2's
|
608
|
+
# parserInternals.c.
|
600
609
|
# Need a cast here because older libxml2 releases do not use 'const' in the functype.
|
601
610
|
self._c_ctxt.sax.serror = <xmlerror.xmlStructuredErrorFunc> _receiveParserError
|
602
611
|
self._orig_loader = _register_document_loader() if set_document_loader else NULL
|
@@ -642,6 +651,9 @@ cdef _initParserContext(_ParserContext context,
|
|
642
651
|
context._initParserContext(c_ctxt)
|
643
652
|
|
644
653
|
cdef void _forwardParserError(xmlparser.xmlParserCtxt* _parser_context, const xmlerror.xmlError* error) noexcept with gil:
|
654
|
+
"""
|
655
|
+
Add an error created by libxml2 to the lxml-level error_log.
|
656
|
+
"""
|
645
657
|
(<_ParserContext>_parser_context._private)._error_log._receive(error)
|
646
658
|
|
647
659
|
cdef void _receiveParserError(void* c_context, const xmlerror.xmlError* error) noexcept nogil:
|
@@ -687,6 +699,8 @@ cdef xmlDoc* _handleParseResult(_ParserContext context,
|
|
687
699
|
xmlparser.xmlParserCtxt* c_ctxt,
|
688
700
|
xmlDoc* result, filename,
|
689
701
|
bint recover, bint free_doc) except NULL:
|
702
|
+
# The C-level argument xmlDoc* result is passed in as NULL if the parser was not able
|
703
|
+
# to parse the document.
|
690
704
|
cdef bint well_formed
|
691
705
|
if result is not NULL:
|
692
706
|
__GLOBAL_PARSER_CONTEXT.initDocDict(result)
|
@@ -698,6 +712,9 @@ cdef xmlDoc* _handleParseResult(_ParserContext context,
|
|
698
712
|
c_ctxt.myDoc = NULL
|
699
713
|
|
700
714
|
if result is not NULL:
|
715
|
+
# "wellFormed" in libxml2 is 0 if the parser found fatal errors. It still returns a
|
716
|
+
# parse result document if 'recover=True'. Here, we determine if we can present
|
717
|
+
# the document to the user or consider it incorrect or broken enough to raise an error.
|
701
718
|
if (context._validator is not None and
|
702
719
|
not context._validator.isvalid()):
|
703
720
|
well_formed = 0 # actually not 'valid', but anyway ...
|
@@ -901,6 +918,9 @@ cdef class _BaseParser:
|
|
901
918
|
return self._push_parser_context
|
902
919
|
|
903
920
|
cdef _ParserContext _createContext(self, target, events_to_collect):
|
921
|
+
"""
|
922
|
+
This method creates and configures the lxml-level parser.
|
923
|
+
"""
|
904
924
|
cdef _SaxParserContext sax_context
|
905
925
|
if target is not None:
|
906
926
|
sax_context = _TargetParserContext(self)
|
@@ -947,6 +967,9 @@ cdef class _BaseParser:
|
|
947
967
|
return 0
|
948
968
|
|
949
969
|
cdef xmlparser.xmlParserCtxt* _newParserCtxt(self) except NULL:
|
970
|
+
"""
|
971
|
+
Create and initialise a libxml2-level parser context.
|
972
|
+
"""
|
950
973
|
cdef xmlparser.xmlParserCtxt* c_ctxt
|
951
974
|
if self._for_html:
|
952
975
|
c_ctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
|
@@ -1106,8 +1129,7 @@ cdef class _BaseParser:
|
|
1106
1129
|
finally:
|
1107
1130
|
context.cleanup()
|
1108
1131
|
|
1109
|
-
cdef xmlDoc* _parseDoc(self, char* c_text, int c_len,
|
1110
|
-
char* c_filename) except NULL:
|
1132
|
+
cdef xmlDoc* _parseDoc(self, const char* c_text, int c_len, char* c_filename) except NULL:
|
1111
1133
|
"""Parse document, share dictionary if possible.
|
1112
1134
|
"""
|
1113
1135
|
cdef _ParserContext context
|
@@ -1440,7 +1462,7 @@ cdef class _FeedParser(_BaseParser):
|
|
1440
1462
|
else:
|
1441
1463
|
error = 0
|
1442
1464
|
|
1443
|
-
if not pctxt.wellFormed and pctxt
|
1465
|
+
if not pctxt.wellFormed and xmlparser.xmlCtxtIsStopped(pctxt) and context._has_raised():
|
1444
1466
|
# propagate Python exceptions immediately
|
1445
1467
|
recover = 0
|
1446
1468
|
error = 1
|
@@ -1477,7 +1499,7 @@ cdef class _FeedParser(_BaseParser):
|
|
1477
1499
|
else:
|
1478
1500
|
xmlparser.xmlParseChunk(pctxt, NULL, 0, 1)
|
1479
1501
|
|
1480
|
-
if (pctxt.recovery and not pctxt
|
1502
|
+
if (pctxt.recovery and not xmlparser.xmlCtxtIsStopped(pctxt) and
|
1481
1503
|
isinstance(context, _SaxParserContext)):
|
1482
1504
|
# apply any left-over 'end' events
|
1483
1505
|
(<_SaxParserContext>context).flushEvents()
|
@@ -1529,7 +1551,8 @@ cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt,
|
|
1529
1551
|
return error
|
1530
1552
|
|
1531
1553
|
# fix libxml2 setup for HTML
|
1532
|
-
|
1554
|
+
if tree.LIBXML_VERSION < 21400:
|
1555
|
+
c_ctxt.progressive = 1 # TODO: remove
|
1533
1556
|
c_ctxt.html = 1
|
1534
1557
|
htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options)
|
1535
1558
|
|
@@ -1547,10 +1570,15 @@ _XML_DEFAULT_PARSE_OPTIONS = (
|
|
1547
1570
|
xmlparser.XML_PARSE_NONET |
|
1548
1571
|
xmlparser.XML_PARSE_COMPACT |
|
1549
1572
|
xmlparser.XML_PARSE_BIG_LINES
|
1550
|
-
|
1573
|
+
)
|
1551
1574
|
|
1552
1575
|
cdef class XMLParser(_FeedParser):
|
1553
|
-
"""XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False,
|
1576
|
+
"""XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, \
|
1577
|
+
load_dtd=False, no_network=True, decompress=False, ns_clean=False, \
|
1578
|
+
recover=False, schema: XMLSchema =None, huge_tree=False, \
|
1579
|
+
remove_blank_text=False, resolve_entities=True, \
|
1580
|
+
remove_comments=False, remove_pis=False, strip_cdata=True, \
|
1581
|
+
collect_ids=True, target=None, compact=True)
|
1554
1582
|
|
1555
1583
|
The XML parser.
|
1556
1584
|
|
@@ -1572,6 +1600,8 @@ cdef class XMLParser(_FeedParser):
|
|
1572
1600
|
- dtd_validation - validate against a DTD referenced by the document
|
1573
1601
|
- load_dtd - use DTD for parsing
|
1574
1602
|
- no_network - prevent network access for related files (default: True)
|
1603
|
+
- decompress - automatically decompress gzip input
|
1604
|
+
(default: False, changed in lxml 6.0, disabling only affects libxml2 2.15+)
|
1575
1605
|
- ns_clean - clean up redundant namespace declarations
|
1576
1606
|
- recover - try hard to parse through broken XML
|
1577
1607
|
- remove_blank_text - discard blank text nodes that appear ignorable
|
@@ -1579,9 +1609,10 @@ cdef class XMLParser(_FeedParser):
|
|
1579
1609
|
- remove_pis - discard processing instructions
|
1580
1610
|
- strip_cdata - replace CDATA sections by normal text content (default: True)
|
1581
1611
|
- compact - save memory for short text content (default: True)
|
1582
|
-
- collect_ids - use a hash table of XML IDs for fast access
|
1612
|
+
- collect_ids - use a hash table of XML IDs for fast access
|
1613
|
+
(default: True, always True with DTD validation)
|
1583
1614
|
- huge_tree - disable security restrictions and support very deep trees
|
1584
|
-
and very long text content
|
1615
|
+
and very long text content
|
1585
1616
|
|
1586
1617
|
Other keyword arguments:
|
1587
1618
|
|
@@ -1598,7 +1629,7 @@ cdef class XMLParser(_FeedParser):
|
|
1598
1629
|
apply to the default parser.
|
1599
1630
|
"""
|
1600
1631
|
def __init__(self, *, encoding=None, attribute_defaults=False,
|
1601
|
-
dtd_validation=False, load_dtd=False, no_network=True,
|
1632
|
+
dtd_validation=False, load_dtd=False, no_network=True, decompress=False,
|
1602
1633
|
ns_clean=False, recover=False, XMLSchema schema=None,
|
1603
1634
|
huge_tree=False, remove_blank_text=False, resolve_entities='internal',
|
1604
1635
|
remove_comments=False, remove_pis=False, strip_cdata=True,
|
@@ -1638,6 +1669,10 @@ cdef class XMLParser(_FeedParser):
|
|
1638
1669
|
remove_comments, remove_pis, strip_cdata,
|
1639
1670
|
collect_ids, target, encoding, resolve_external)
|
1640
1671
|
|
1672
|
+
# Allow subscripting XMLParser in type annotations (PEP 560)
|
1673
|
+
def __class_getitem__(cls, item):
|
1674
|
+
return _GenericAlias(cls, item)
|
1675
|
+
|
1641
1676
|
|
1642
1677
|
cdef class XMLPullParser(XMLParser):
|
1643
1678
|
"""XMLPullParser(self, events=None, *, tag=None, **kwargs)
|
@@ -1670,7 +1705,7 @@ cdef class XMLPullParser(XMLParser):
|
|
1670
1705
|
|
1671
1706
|
cdef class ETCompatXMLParser(XMLParser):
|
1672
1707
|
"""ETCompatXMLParser(self, encoding=None, attribute_defaults=False, \
|
1673
|
-
dtd_validation=False, load_dtd=False, no_network=True, \
|
1708
|
+
dtd_validation=False, load_dtd=False, no_network=True, decompress=False, \
|
1674
1709
|
ns_clean=False, recover=False, schema=None, \
|
1675
1710
|
huge_tree=False, remove_blank_text=False, resolve_entities=True, \
|
1676
1711
|
remove_comments=True, remove_pis=True, strip_cdata=True, \
|
@@ -1684,7 +1719,7 @@ cdef class ETCompatXMLParser(XMLParser):
|
|
1684
1719
|
and thus ignores comments and processing instructions.
|
1685
1720
|
"""
|
1686
1721
|
def __init__(self, *, encoding=None, attribute_defaults=False,
|
1687
|
-
dtd_validation=False, load_dtd=False, no_network=True,
|
1722
|
+
dtd_validation=False, load_dtd=False, no_network=True, decompress=False,
|
1688
1723
|
ns_clean=False, recover=False, schema=None,
|
1689
1724
|
huge_tree=False, remove_blank_text=False, resolve_entities=True,
|
1690
1725
|
remove_comments=True, remove_pis=True, strip_cdata=True,
|
@@ -1694,6 +1729,7 @@ cdef class ETCompatXMLParser(XMLParser):
|
|
1694
1729
|
dtd_validation=dtd_validation,
|
1695
1730
|
load_dtd=load_dtd,
|
1696
1731
|
no_network=no_network,
|
1732
|
+
decompress=decompress,
|
1697
1733
|
ns_clean=ns_clean,
|
1698
1734
|
recover=recover,
|
1699
1735
|
remove_blank_text=remove_blank_text,
|
@@ -1705,7 +1741,8 @@ cdef class ETCompatXMLParser(XMLParser):
|
|
1705
1741
|
strip_cdata=strip_cdata,
|
1706
1742
|
target=target,
|
1707
1743
|
encoding=encoding,
|
1708
|
-
schema=schema
|
1744
|
+
schema=schema,
|
1745
|
+
)
|
1709
1746
|
|
1710
1747
|
# ET 1.2 compatible name
|
1711
1748
|
XMLTreeBuilder = ETCompatXMLParser
|
@@ -1752,7 +1789,7 @@ cdef object _UNUSED = object()
|
|
1752
1789
|
cdef class HTMLParser(_FeedParser):
|
1753
1790
|
"""HTMLParser(self, encoding=None, remove_blank_text=False, \
|
1754
1791
|
remove_comments=False, remove_pis=False, \
|
1755
|
-
no_network=True, target=None, schema: XMLSchema =None, \
|
1792
|
+
no_network=True, decompress=False, target=None, schema: XMLSchema =None, \
|
1756
1793
|
recover=True, compact=True, collect_ids=True, huge_tree=False)
|
1757
1794
|
|
1758
1795
|
The HTML parser.
|
@@ -1766,6 +1803,8 @@ cdef class HTMLParser(_FeedParser):
|
|
1766
1803
|
|
1767
1804
|
- recover - try hard to parse through broken HTML (default: True)
|
1768
1805
|
- no_network - prevent network access for related files (default: True)
|
1806
|
+
- decompress - automatically decompress gzip input
|
1807
|
+
(default: False, changed in lxml 6.0, disabling only affects libxml2 2.15+)
|
1769
1808
|
- remove_blank_text - discard empty text nodes that are ignorable (i.e. not actual text content)
|
1770
1809
|
- remove_comments - discard comments
|
1771
1810
|
- remove_pis - discard processing instructions
|
@@ -1773,7 +1812,7 @@ cdef class HTMLParser(_FeedParser):
|
|
1773
1812
|
- default_doctype - add a default doctype even if it is not found in the HTML (default: True)
|
1774
1813
|
- collect_ids - use a hash table of XML IDs for fast access (default: True)
|
1775
1814
|
- huge_tree - disable security restrictions and support very deep trees
|
1776
|
-
and very long text content
|
1815
|
+
and very long text content
|
1777
1816
|
|
1778
1817
|
Other keyword arguments:
|
1779
1818
|
|
@@ -1786,7 +1825,7 @@ cdef class HTMLParser(_FeedParser):
|
|
1786
1825
|
"""
|
1787
1826
|
def __init__(self, *, encoding=None, remove_blank_text=False,
|
1788
1827
|
remove_comments=False, remove_pis=False, strip_cdata=_UNUSED,
|
1789
|
-
no_network=True, target=None, XMLSchema schema=None,
|
1828
|
+
no_network=True, decompress=False, target=None, XMLSchema schema=None,
|
1790
1829
|
recover=True, compact=True, default_doctype=True,
|
1791
1830
|
collect_ids=True, huge_tree=False):
|
1792
1831
|
cdef int parse_options
|
@@ -1813,6 +1852,10 @@ cdef class HTMLParser(_FeedParser):
|
|
1813
1852
|
remove_comments, remove_pis, strip_cdata,
|
1814
1853
|
collect_ids, target, encoding)
|
1815
1854
|
|
1855
|
+
# Allow subscripting HTMLParser in type annotations (PEP 560)
|
1856
|
+
def __class_getitem__(cls, item):
|
1857
|
+
return _GenericAlias(cls, item)
|
1858
|
+
|
1816
1859
|
|
1817
1860
|
cdef HTMLParser __DEFAULT_HTML_PARSER
|
1818
1861
|
__DEFAULT_HTML_PARSER = HTMLParser()
|
@@ -1853,8 +1896,6 @@ cdef class HTMLPullParser(HTMLParser):
|
|
1853
1896
|
|
1854
1897
|
cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL:
|
1855
1898
|
cdef char* c_filename
|
1856
|
-
cdef char* c_text
|
1857
|
-
cdef Py_ssize_t c_len
|
1858
1899
|
if parser is None:
|
1859
1900
|
parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
|
1860
1901
|
if not filename:
|
@@ -1862,36 +1903,56 @@ cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL:
|
|
1862
1903
|
else:
|
1863
1904
|
filename_utf = _encodeFilenameUTF8(filename)
|
1864
1905
|
c_filename = _cstr(filename_utf)
|
1865
|
-
if isinstance(text,
|
1866
|
-
|
1867
|
-
|
1868
|
-
|
1869
|
-
|
1870
|
-
|
1871
|
-
|
1872
|
-
|
1873
|
-
|
1874
|
-
|
1875
|
-
|
1906
|
+
if isinstance(text, bytes):
|
1907
|
+
return _parseDoc_bytes(<bytes> text, filename, c_filename, parser)
|
1908
|
+
elif isinstance(text, unicode):
|
1909
|
+
return _parseDoc_unicode(<unicode> text, filename, c_filename, parser)
|
1910
|
+
else:
|
1911
|
+
return _parseDoc_charbuffer(text, filename, c_filename, parser)
|
1912
|
+
|
1913
|
+
|
1914
|
+
cdef xmlDoc* _parseDoc_unicode(unicode text, filename, char* c_filename, _BaseParser parser) except NULL:
|
1915
|
+
cdef Py_ssize_t c_len
|
1916
|
+
if python.PyUnicode_IS_READY(text):
|
1917
|
+
# PEP-393 Unicode string
|
1918
|
+
c_len = python.PyUnicode_GET_LENGTH(text) * python.PyUnicode_KIND(text)
|
1876
1919
|
else:
|
1877
|
-
|
1878
|
-
|
1879
|
-
|
1880
|
-
|
1881
|
-
|
1882
|
-
|
1920
|
+
# old Py_UNICODE string
|
1921
|
+
c_len = python.PyUnicode_GET_DATA_SIZE(text)
|
1922
|
+
if c_len > limits.INT_MAX:
|
1923
|
+
return parser._parseDocFromFilelike(
|
1924
|
+
StringIO(text), filename, None)
|
1925
|
+
return parser._parseUnicodeDoc(text, c_filename)
|
1926
|
+
|
1927
|
+
|
1928
|
+
cdef xmlDoc* _parseDoc_bytes(bytes text, filename, char* c_filename, _BaseParser parser) except NULL:
|
1929
|
+
cdef Py_ssize_t c_len = len(text)
|
1930
|
+
if c_len > limits.INT_MAX:
|
1931
|
+
return parser._parseDocFromFilelike(BytesIO(text), filename, None)
|
1932
|
+
return parser._parseDoc(text, c_len, c_filename)
|
1933
|
+
|
1934
|
+
|
1935
|
+
cdef xmlDoc* _parseDoc_charbuffer(text, filename, char* c_filename, _BaseParser parser) except NULL:
|
1936
|
+
cdef const unsigned char[::1] data = memoryview(text).cast('B') # cast to 'unsigned char' buffer
|
1937
|
+
cdef Py_ssize_t c_len = len(data)
|
1938
|
+
if c_len > limits.INT_MAX:
|
1939
|
+
return parser._parseDocFromFilelike(BytesIO(text), filename, None)
|
1940
|
+
return parser._parseDoc(<const char*>&data[0], c_len, c_filename)
|
1941
|
+
|
1883
1942
|
|
1884
1943
|
cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL:
|
1885
1944
|
if parser is None:
|
1886
1945
|
parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
|
1887
1946
|
return (<_BaseParser>parser)._parseDocFromFile(_cstr(filename8))
|
1888
1947
|
|
1948
|
+
|
1889
1949
|
cdef xmlDoc* _parseDocFromFilelike(source, filename,
|
1890
1950
|
_BaseParser parser) except NULL:
|
1891
1951
|
if parser is None:
|
1892
1952
|
parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
|
1893
1953
|
return (<_BaseParser>parser)._parseDocFromFilelike(source, filename, None)
|
1894
1954
|
|
1955
|
+
|
1895
1956
|
cdef xmlDoc* _newXMLDoc() except NULL:
|
1896
1957
|
cdef xmlDoc* result
|
1897
1958
|
result = tree.xmlNewDoc(NULL)
|
@@ -1990,8 +2051,6 @@ cdef _Document _parseMemoryDocument(text, url, _BaseParser parser):
|
|
1990
2051
|
raise ValueError(
|
1991
2052
|
"Unicode strings with encoding declaration are not supported. "
|
1992
2053
|
"Please use bytes input or XML fragments without declaration.")
|
1993
|
-
elif not isinstance(text, bytes):
|
1994
|
-
raise ValueError, "can only parse strings"
|
1995
2054
|
c_doc = _parseDoc(text, url, parser)
|
1996
2055
|
return _documentFactory(c_doc, parser)
|
1997
2056
|
|
lxml/sax.cpython-39-darwin.so
CHANGED
Binary file
|
lxml/sax.py
CHANGED
@@ -18,6 +18,13 @@ from lxml import etree
|
|
18
18
|
from lxml.etree import ElementTree, SubElement
|
19
19
|
from lxml.etree import Comment, ProcessingInstruction
|
20
20
|
|
21
|
+
try:
|
22
|
+
from types import GenericAlias as _GenericAlias
|
23
|
+
except ImportError:
|
24
|
+
# Python 3.8 - we only need this as return value from "__class_getitem__"
|
25
|
+
def _GenericAlias(cls, item):
|
26
|
+
return f"{cls.__name__}[{item.__name__}]"
|
27
|
+
|
21
28
|
|
22
29
|
class SaxError(etree.LxmlError):
|
23
30
|
"""General SAX error.
|
@@ -152,6 +159,10 @@ class ElementTreeContentHandler(ContentHandler):
|
|
152
159
|
|
153
160
|
ignorableWhitespace = characters
|
154
161
|
|
162
|
+
# Allow subscripting sax.ElementTreeContentHandler in type annotations (PEP 560)
|
163
|
+
def __class_getitem__(cls, item):
|
164
|
+
return _GenericAlias(cls, item)
|
165
|
+
|
155
166
|
|
156
167
|
class ElementTreeProducer:
|
157
168
|
"""Produces SAX events for an element and children.
|
lxml/saxparser.pxi
CHANGED
@@ -217,7 +217,7 @@ cdef class _SaxParserContext(_ParserContext):
|
|
217
217
|
finally:
|
218
218
|
self._parser = None # clear circular reference ASAP
|
219
219
|
if self._matcher is not None:
|
220
|
-
self._matcher.cacheTags(self._doc, True)
|
220
|
+
self._matcher.cacheTags(self._doc, force_into_dict=True)
|
221
221
|
return 0
|
222
222
|
|
223
223
|
cdef int pushEvent(self, event, xmlNode* c_node) except -1:
|
@@ -297,7 +297,7 @@ cdef void _handleSaxStart(
|
|
297
297
|
cdef int i
|
298
298
|
cdef size_t c_len
|
299
299
|
c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
|
300
|
-
if c_ctxt._private is NULL or c_ctxt
|
300
|
+
if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
|
301
301
|
return
|
302
302
|
context = <_SaxParserContext>c_ctxt._private
|
303
303
|
cdef int event_filter = context._event_filter
|
@@ -345,7 +345,7 @@ cdef void _handleSaxTargetStart(
|
|
345
345
|
cdef int i
|
346
346
|
cdef size_t c_len
|
347
347
|
c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
|
348
|
-
if c_ctxt._private is NULL or c_ctxt
|
348
|
+
if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
|
349
349
|
return
|
350
350
|
context = <_SaxParserContext>c_ctxt._private
|
351
351
|
|
@@ -411,7 +411,7 @@ cdef void _handleSaxTargetStart(
|
|
411
411
|
cdef void _handleSaxStartNoNs(void* ctxt, const_xmlChar* c_name,
|
412
412
|
const_xmlChar** c_attributes) noexcept with gil:
|
413
413
|
c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
|
414
|
-
if c_ctxt._private is NULL or c_ctxt
|
414
|
+
if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
|
415
415
|
return
|
416
416
|
context = <_SaxParserContext>c_ctxt._private
|
417
417
|
try:
|
@@ -436,7 +436,7 @@ cdef void _handleSaxStartNoNs(void* ctxt, const_xmlChar* c_name,
|
|
436
436
|
cdef void _handleSaxTargetStartNoNs(void* ctxt, const_xmlChar* c_name,
|
437
437
|
const_xmlChar** c_attributes) noexcept with gil:
|
438
438
|
c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
|
439
|
-
if c_ctxt._private is NULL or c_ctxt
|
439
|
+
if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
|
440
440
|
return
|
441
441
|
context = <_SaxParserContext>c_ctxt._private
|
442
442
|
try:
|
@@ -493,7 +493,7 @@ cdef void _handleSaxEnd(void* ctxt, const_xmlChar* c_localname,
|
|
493
493
|
const_xmlChar* c_prefix,
|
494
494
|
const_xmlChar* c_namespace) noexcept with gil:
|
495
495
|
c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
|
496
|
-
if c_ctxt._private is NULL or c_ctxt
|
496
|
+
if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
|
497
497
|
return
|
498
498
|
context = <_SaxParserContext>c_ctxt._private
|
499
499
|
try:
|
@@ -516,7 +516,7 @@ cdef void _handleSaxEnd(void* ctxt, const_xmlChar* c_localname,
|
|
516
516
|
|
517
517
|
cdef void _handleSaxEndNoNs(void* ctxt, const_xmlChar* c_name) noexcept with gil:
|
518
518
|
c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
|
519
|
-
if c_ctxt._private is NULL or c_ctxt
|
519
|
+
if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
|
520
520
|
return
|
521
521
|
context = <_SaxParserContext>c_ctxt._private
|
522
522
|
try:
|
@@ -569,7 +569,7 @@ cdef int _pushSaxEndEvent(_SaxParserContext context,
|
|
569
569
|
cdef void _handleSaxData(void* ctxt, const_xmlChar* c_data, int data_len) noexcept with gil:
|
570
570
|
# can only be called if parsing with a target
|
571
571
|
c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
|
572
|
-
if c_ctxt._private is NULL or c_ctxt
|
572
|
+
if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
|
573
573
|
return
|
574
574
|
context = <_SaxParserContext>c_ctxt._private
|
575
575
|
try:
|
@@ -586,7 +586,7 @@ cdef void _handleSaxTargetDoctype(void* ctxt, const_xmlChar* c_name,
|
|
586
586
|
const_xmlChar* c_system) noexcept with gil:
|
587
587
|
# can only be called if parsing with a target
|
588
588
|
c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
|
589
|
-
if c_ctxt._private is NULL or c_ctxt
|
589
|
+
if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
|
590
590
|
return
|
591
591
|
context = <_SaxParserContext>c_ctxt._private
|
592
592
|
try:
|
@@ -602,7 +602,7 @@ cdef void _handleSaxTargetDoctype(void* ctxt, const_xmlChar* c_name,
|
|
602
602
|
|
603
603
|
cdef void _handleSaxStartDocument(void* ctxt) noexcept with gil:
|
604
604
|
c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
|
605
|
-
if c_ctxt._private is NULL or c_ctxt
|
605
|
+
if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
|
606
606
|
return
|
607
607
|
context = <_SaxParserContext>c_ctxt._private
|
608
608
|
context._origSaxStartDocument(ctxt)
|
@@ -619,7 +619,7 @@ cdef void _handleSaxTargetPI(void* ctxt, const_xmlChar* c_target,
|
|
619
619
|
const_xmlChar* c_data) noexcept with gil:
|
620
620
|
# can only be called if parsing with a target
|
621
621
|
c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
|
622
|
-
if c_ctxt._private is NULL or c_ctxt
|
622
|
+
if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
|
623
623
|
return
|
624
624
|
context = <_SaxParserContext>c_ctxt._private
|
625
625
|
try:
|
@@ -638,7 +638,7 @@ cdef void _handleSaxPIEvent(void* ctxt, const_xmlChar* target,
|
|
638
638
|
const_xmlChar* data) noexcept with gil:
|
639
639
|
# can only be called when collecting pi events
|
640
640
|
c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
|
641
|
-
if c_ctxt._private is NULL or c_ctxt
|
641
|
+
if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
|
642
642
|
return
|
643
643
|
context = <_SaxParserContext>c_ctxt._private
|
644
644
|
context._origSaxPI(ctxt, target, data)
|
@@ -656,7 +656,7 @@ cdef void _handleSaxPIEvent(void* ctxt, const_xmlChar* target,
|
|
656
656
|
cdef void _handleSaxTargetComment(void* ctxt, const_xmlChar* c_data) noexcept with gil:
|
657
657
|
# can only be called if parsing with a target
|
658
658
|
c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
|
659
|
-
if c_ctxt._private is NULL or c_ctxt
|
659
|
+
if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
|
660
660
|
return
|
661
661
|
context = <_SaxParserContext>c_ctxt._private
|
662
662
|
try:
|
@@ -672,7 +672,7 @@ cdef void _handleSaxTargetComment(void* ctxt, const_xmlChar* c_data) noexcept wi
|
|
672
672
|
cdef void _handleSaxComment(void* ctxt, const_xmlChar* text) noexcept with gil:
|
673
673
|
# can only be called when collecting comment events
|
674
674
|
c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
|
675
|
-
if c_ctxt._private is NULL or c_ctxt
|
675
|
+
if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
|
676
676
|
return
|
677
677
|
context = <_SaxParserContext>c_ctxt._private
|
678
678
|
context._origSaxComment(ctxt, text)
|
lxml/schematron.pxi
CHANGED
@@ -69,9 +69,6 @@ cdef class Schematron(_Validator):
|
|
69
69
|
"""
|
70
70
|
cdef schematron.xmlSchematron* _c_schema
|
71
71
|
cdef xmlDoc* _c_schema_doc
|
72
|
-
def __cinit__(self):
|
73
|
-
self._c_schema = NULL
|
74
|
-
self._c_schema_doc = NULL
|
75
72
|
|
76
73
|
def __init__(self, etree=None, *, file=None):
|
77
74
|
cdef _Document doc
|
@@ -83,6 +80,14 @@ cdef class Schematron(_Validator):
|
|
83
80
|
if not config.ENABLE_SCHEMATRON:
|
84
81
|
raise SchematronError, \
|
85
82
|
"lxml.etree was compiled without Schematron support."
|
83
|
+
|
84
|
+
import warnings
|
85
|
+
warnings.warn(
|
86
|
+
"The (non-ISO) Schematron feature is deprecated and will be removed from libxml2 and lxml. "
|
87
|
+
"Use 'lxml.isoschematron' instead.",
|
88
|
+
DeprecationWarning,
|
89
|
+
)
|
90
|
+
|
86
91
|
if etree is not None:
|
87
92
|
doc = _documentOrRaise(etree)
|
88
93
|
root_node = _rootNodeOrRaise(etree)
|