lxml 5.4.0__cp311-cp311-macosx_10_9_universal2.whl → 6.0.0__cp311-cp311-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. lxml/__init__.py +1 -1
  2. lxml/_elementpath.cpython-311-darwin.so +0 -0
  3. lxml/_elementpath.py +3 -1
  4. lxml/apihelpers.pxi +25 -17
  5. lxml/builder.cpython-311-darwin.so +0 -0
  6. lxml/builder.py +11 -0
  7. lxml/debug.pxi +0 -54
  8. lxml/etree.cpython-311-darwin.so +0 -0
  9. lxml/etree.h +24 -28
  10. lxml/etree.pyx +154 -33
  11. lxml/etree_api.h +59 -50
  12. lxml/extensions.pxi +3 -6
  13. lxml/html/__init__.py +7 -3
  14. lxml/html/_difflib.cpython-311-darwin.so +0 -0
  15. lxml/html/_difflib.py +2106 -0
  16. lxml/html/builder.py +40 -0
  17. lxml/html/defs.py +3 -3
  18. lxml/html/diff.cpython-311-darwin.so +0 -0
  19. lxml/html/diff.py +406 -312
  20. lxml/includes/etree_defs.h +6 -6
  21. lxml/includes/libxml/HTMLparser.h +33 -30
  22. lxml/includes/libxml/HTMLtree.h +1 -0
  23. lxml/includes/libxml/SAX.h +2 -186
  24. lxml/includes/libxml/SAX2.h +2 -3
  25. lxml/includes/libxml/catalog.h +1 -0
  26. lxml/includes/libxml/debugXML.h +0 -138
  27. lxml/includes/libxml/encoding.h +124 -61
  28. lxml/includes/libxml/entities.h +0 -19
  29. lxml/includes/libxml/globals.h +0 -16
  30. lxml/includes/libxml/nanoftp.h +3 -173
  31. lxml/includes/libxml/parser.h +474 -231
  32. lxml/includes/libxml/parserInternals.h +21 -101
  33. lxml/includes/libxml/relaxng.h +7 -2
  34. lxml/includes/libxml/threads.h +0 -6
  35. lxml/includes/libxml/tree.h +29 -85
  36. lxml/includes/libxml/valid.h +20 -12
  37. lxml/includes/libxml/xinclude.h +5 -0
  38. lxml/includes/libxml/xlink.h +4 -0
  39. lxml/includes/libxml/xmlIO.h +15 -34
  40. lxml/includes/libxml/xmlautomata.h +19 -2
  41. lxml/includes/libxml/xmlerror.h +18 -18
  42. lxml/includes/libxml/xmlexports.h +6 -56
  43. lxml/includes/libxml/xmlmemory.h +19 -19
  44. lxml/includes/libxml/xmlmodule.h +4 -0
  45. lxml/includes/libxml/xmlreader.h +11 -3
  46. lxml/includes/libxml/xmlregexp.h +7 -106
  47. lxml/includes/libxml/xmlsave.h +11 -2
  48. lxml/includes/libxml/xmlschemas.h +10 -5
  49. lxml/includes/libxml/xmlunicode.h +3 -354
  50. lxml/includes/libxml/xmlversion.h +19 -34
  51. lxml/includes/libxml/xpath.h +5 -15
  52. lxml/includes/libxml/xpathInternals.h +9 -3
  53. lxml/includes/libxml/xpointer.h +1 -91
  54. lxml/includes/lxml-version.h +1 -1
  55. lxml/includes/tree.pxd +10 -12
  56. lxml/includes/xmlparser.pxd +46 -8
  57. lxml/lxml.etree.h +24 -28
  58. lxml/lxml.etree_api.h +59 -50
  59. lxml/objectify.cpython-311-darwin.so +0 -0
  60. lxml/objectify.pyx +11 -7
  61. lxml/parser.pxi +106 -47
  62. lxml/sax.cpython-311-darwin.so +0 -0
  63. lxml/sax.py +11 -0
  64. lxml/saxparser.pxi +14 -14
  65. lxml/schematron.pxi +8 -3
  66. lxml/serializer.pxi +71 -3
  67. lxml/xslt.pxi +10 -3
  68. lxml-6.0.0.dist-info/METADATA +163 -0
  69. {lxml-5.4.0.dist-info → lxml-6.0.0.dist-info}/RECORD +73 -71
  70. {lxml-5.4.0.dist-info → lxml-6.0.0.dist-info}/WHEEL +2 -1
  71. {lxml-5.4.0.dist-info → lxml-6.0.0.dist-info}/licenses/LICENSE.txt +3 -1
  72. lxml-5.4.0.dist-info/METADATA +0 -96
  73. {lxml-5.4.0.dist-info → lxml-6.0.0.dist-info}/licenses/LICENSES.txt +0 -0
  74. {lxml-5.4.0.dist-info → lxml-6.0.0.dist-info}/top_level.txt +0 -0
lxml/lxml.etree_api.h CHANGED
@@ -1,4 +1,4 @@
1
- /* Generated by Cython 3.0.12 */
1
+ /* Generated by Cython 3.1.2 */
2
2
 
3
3
  #ifndef __PYX_HAVE_API__lxml__etree
4
4
  #define __PYX_HAVE_API__lxml__etree
@@ -98,19 +98,26 @@ static void (*__pyx_api_f_4lxml_5etree_initTagMatch)(struct LxmlElementTagMatche
98
98
  #define initTagMatch __pyx_api_f_4lxml_5etree_initTagMatch
99
99
  static xmlNs *(*__pyx_api_f_4lxml_5etree_findOrBuildNodeNsPrefix)(struct LxmlDocument *, xmlNode *, const xmlChar *, const xmlChar *) = 0;
100
100
  #define findOrBuildNodeNsPrefix __pyx_api_f_4lxml_5etree_findOrBuildNodeNsPrefix
101
- #ifndef __PYX_HAVE_RT_ImportFunction_3_0_12
102
- #define __PYX_HAVE_RT_ImportFunction_3_0_12
103
- static int __Pyx_ImportFunction_3_0_12(PyObject *module, const char *funcname, void (**f)(void), const char *sig) {
101
+ static int __Pyx_ImportFunction_3_1_2(PyObject *module, const char *funcname, void (**f)(void), const char *sig);
102
+
103
+ #ifndef __PYX_HAVE_RT_ImportFunction_3_1_2
104
+ #define __PYX_HAVE_RT_ImportFunction_3_1_2
105
+ static int __Pyx_ImportFunction_3_1_2(PyObject *module, const char *funcname, void (**f)(void), const char *sig) {
104
106
  PyObject *d = 0;
105
107
  PyObject *cobj = 0;
106
108
  union {
107
109
  void (*fp)(void);
108
110
  void *p;
109
111
  } tmp;
110
- d = PyObject_GetAttrString(module, (char *)"__pyx_capi__");
112
+ d = PyObject_GetAttrString(module, "__pyx_capi__");
111
113
  if (!d)
112
114
  goto bad;
115
+ #if (defined(Py_LIMITED_API) && Py_LIMITED_API >= 0x030d0000) || (!defined(Py_LIMITED_API) && PY_VERSION_HEX >= 0x030d0000)
116
+ PyDict_GetItemStringRef(d, funcname, &cobj);
117
+ #else
113
118
  cobj = PyDict_GetItemString(d, funcname);
119
+ Py_XINCREF(cobj);
120
+ #endif
114
121
  if (!cobj) {
115
122
  PyErr_Format(PyExc_ImportError,
116
123
  "%.200s does not export expected C function %.200s",
@@ -128,9 +135,11 @@ static int __Pyx_ImportFunction_3_0_12(PyObject *module, const char *funcname, v
128
135
  if (!(*f))
129
136
  goto bad;
130
137
  Py_DECREF(d);
138
+ Py_DECREF(cobj);
131
139
  return 0;
132
140
  bad:
133
141
  Py_XDECREF(d);
142
+ Py_XDECREF(cobj);
134
143
  return -1;
135
144
  }
136
145
  #endif
@@ -140,51 +149,51 @@ static int import_lxml__etree(void) {
140
149
  PyObject *module = 0;
141
150
  module = PyImport_ImportModule("lxml.etree");
142
151
  if (!module) goto bad;
143
- if (__Pyx_ImportFunction_3_0_12(module, "deepcopyNodeToDocument", (void (**)(void))&__pyx_api_f_4lxml_5etree_deepcopyNodeToDocument, "struct LxmlElement *(struct LxmlDocument *, xmlNode *)") < 0) goto bad;
144
- if (__Pyx_ImportFunction_3_0_12(module, "elementTreeFactory", (void (**)(void))&__pyx_api_f_4lxml_5etree_elementTreeFactory, "struct LxmlElementTree *(struct LxmlElement *)") < 0) goto bad;
145
- if (__Pyx_ImportFunction_3_0_12(module, "newElementTree", (void (**)(void))&__pyx_api_f_4lxml_5etree_newElementTree, "struct LxmlElementTree *(struct LxmlElement *, PyObject *)") < 0) goto bad;
146
- if (__Pyx_ImportFunction_3_0_12(module, "adoptExternalDocument", (void (**)(void))&__pyx_api_f_4lxml_5etree_adoptExternalDocument, "struct LxmlElementTree *(xmlDoc *, PyObject *, int)") < 0) goto bad;
147
- if (__Pyx_ImportFunction_3_0_12(module, "elementFactory", (void (**)(void))&__pyx_api_f_4lxml_5etree_elementFactory, "struct LxmlElement *(struct LxmlDocument *, xmlNode *)") < 0) goto bad;
148
- if (__Pyx_ImportFunction_3_0_12(module, "makeElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_makeElement, "struct LxmlElement *(PyObject *, struct LxmlDocument *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *)") < 0) goto bad;
149
- if (__Pyx_ImportFunction_3_0_12(module, "makeSubElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_makeSubElement, "struct LxmlElement *(struct LxmlElement *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *)") < 0) goto bad;
150
- if (__Pyx_ImportFunction_3_0_12(module, "setElementClassLookupFunction", (void (**)(void))&__pyx_api_f_4lxml_5etree_setElementClassLookupFunction, "void (_element_class_lookup_function, PyObject *)") < 0) goto bad;
151
- if (__Pyx_ImportFunction_3_0_12(module, "lookupDefaultElementClass", (void (**)(void))&__pyx_api_f_4lxml_5etree_lookupDefaultElementClass, "PyObject *(PyObject *, PyObject *, xmlNode *)") < 0) goto bad;
152
- if (__Pyx_ImportFunction_3_0_12(module, "lookupNamespaceElementClass", (void (**)(void))&__pyx_api_f_4lxml_5etree_lookupNamespaceElementClass, "PyObject *(PyObject *, PyObject *, xmlNode *)") < 0) goto bad;
153
- if (__Pyx_ImportFunction_3_0_12(module, "callLookupFallback", (void (**)(void))&__pyx_api_f_4lxml_5etree_callLookupFallback, "PyObject *(struct LxmlFallbackElementClassLookup *, struct LxmlDocument *, xmlNode *)") < 0) goto bad;
154
- if (__Pyx_ImportFunction_3_0_12(module, "tagMatches", (void (**)(void))&__pyx_api_f_4lxml_5etree_tagMatches, "int (xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
155
- if (__Pyx_ImportFunction_3_0_12(module, "documentOrRaise", (void (**)(void))&__pyx_api_f_4lxml_5etree_documentOrRaise, "struct LxmlDocument *(PyObject *)") < 0) goto bad;
156
- if (__Pyx_ImportFunction_3_0_12(module, "rootNodeOrRaise", (void (**)(void))&__pyx_api_f_4lxml_5etree_rootNodeOrRaise, "struct LxmlElement *(PyObject *)") < 0) goto bad;
157
- if (__Pyx_ImportFunction_3_0_12(module, "hasText", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasText, "int (xmlNode *)") < 0) goto bad;
158
- if (__Pyx_ImportFunction_3_0_12(module, "hasTail", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasTail, "int (xmlNode *)") < 0) goto bad;
159
- if (__Pyx_ImportFunction_3_0_12(module, "textOf", (void (**)(void))&__pyx_api_f_4lxml_5etree_textOf, "PyObject *(xmlNode *)") < 0) goto bad;
160
- if (__Pyx_ImportFunction_3_0_12(module, "tailOf", (void (**)(void))&__pyx_api_f_4lxml_5etree_tailOf, "PyObject *(xmlNode *)") < 0) goto bad;
161
- if (__Pyx_ImportFunction_3_0_12(module, "setNodeText", (void (**)(void))&__pyx_api_f_4lxml_5etree_setNodeText, "int (xmlNode *, PyObject *)") < 0) goto bad;
162
- if (__Pyx_ImportFunction_3_0_12(module, "setTailText", (void (**)(void))&__pyx_api_f_4lxml_5etree_setTailText, "int (xmlNode *, PyObject *)") < 0) goto bad;
163
- if (__Pyx_ImportFunction_3_0_12(module, "attributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_attributeValue, "PyObject *(xmlNode *, xmlAttr *)") < 0) goto bad;
164
- if (__Pyx_ImportFunction_3_0_12(module, "attributeValueFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_attributeValueFromNsName, "PyObject *(xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
165
- if (__Pyx_ImportFunction_3_0_12(module, "getAttributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_getAttributeValue, "PyObject *(struct LxmlElement *, PyObject *, PyObject *)") < 0) goto bad;
166
- if (__Pyx_ImportFunction_3_0_12(module, "iterattributes", (void (**)(void))&__pyx_api_f_4lxml_5etree_iterattributes, "PyObject *(struct LxmlElement *, int)") < 0) goto bad;
167
- if (__Pyx_ImportFunction_3_0_12(module, "collectAttributes", (void (**)(void))&__pyx_api_f_4lxml_5etree_collectAttributes, "PyObject *(xmlNode *, int)") < 0) goto bad;
168
- if (__Pyx_ImportFunction_3_0_12(module, "setAttributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_setAttributeValue, "int (struct LxmlElement *, PyObject *, PyObject *)") < 0) goto bad;
169
- if (__Pyx_ImportFunction_3_0_12(module, "delAttribute", (void (**)(void))&__pyx_api_f_4lxml_5etree_delAttribute, "int (struct LxmlElement *, PyObject *)") < 0) goto bad;
170
- if (__Pyx_ImportFunction_3_0_12(module, "delAttributeFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_delAttributeFromNsName, "int (xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
171
- if (__Pyx_ImportFunction_3_0_12(module, "hasChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasChild, "int (xmlNode *)") < 0) goto bad;
172
- if (__Pyx_ImportFunction_3_0_12(module, "findChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChild, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
173
- if (__Pyx_ImportFunction_3_0_12(module, "findChildForwards", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChildForwards, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
174
- if (__Pyx_ImportFunction_3_0_12(module, "findChildBackwards", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChildBackwards, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
175
- if (__Pyx_ImportFunction_3_0_12(module, "nextElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_nextElement, "xmlNode *(xmlNode *)") < 0) goto bad;
176
- if (__Pyx_ImportFunction_3_0_12(module, "previousElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_previousElement, "xmlNode *(xmlNode *)") < 0) goto bad;
177
- if (__Pyx_ImportFunction_3_0_12(module, "appendChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_appendChild, "void (struct LxmlElement *, struct LxmlElement *)") < 0) goto bad;
178
- if (__Pyx_ImportFunction_3_0_12(module, "appendChildToElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_appendChildToElement, "int (struct LxmlElement *, struct LxmlElement *)") < 0) goto bad;
179
- if (__Pyx_ImportFunction_3_0_12(module, "pyunicode", (void (**)(void))&__pyx_api_f_4lxml_5etree_pyunicode, "PyObject *(const xmlChar *)") < 0) goto bad;
180
- if (__Pyx_ImportFunction_3_0_12(module, "utf8", (void (**)(void))&__pyx_api_f_4lxml_5etree_utf8, "PyObject *(PyObject *)") < 0) goto bad;
181
- if (__Pyx_ImportFunction_3_0_12(module, "getNsTag", (void (**)(void))&__pyx_api_f_4lxml_5etree_getNsTag, "PyObject *(PyObject *)") < 0) goto bad;
182
- if (__Pyx_ImportFunction_3_0_12(module, "getNsTagWithEmptyNs", (void (**)(void))&__pyx_api_f_4lxml_5etree_getNsTagWithEmptyNs, "PyObject *(PyObject *)") < 0) goto bad;
183
- if (__Pyx_ImportFunction_3_0_12(module, "namespacedName", (void (**)(void))&__pyx_api_f_4lxml_5etree_namespacedName, "PyObject *(xmlNode *)") < 0) goto bad;
184
- if (__Pyx_ImportFunction_3_0_12(module, "namespacedNameFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_namespacedNameFromNsName, "PyObject *(const xmlChar *, const xmlChar *)") < 0) goto bad;
185
- if (__Pyx_ImportFunction_3_0_12(module, "iteratorStoreNext", (void (**)(void))&__pyx_api_f_4lxml_5etree_iteratorStoreNext, "void (struct LxmlElementIterator *, struct LxmlElement *)") < 0) goto bad;
186
- if (__Pyx_ImportFunction_3_0_12(module, "initTagMatch", (void (**)(void))&__pyx_api_f_4lxml_5etree_initTagMatch, "void (struct LxmlElementTagMatcher *, PyObject *)") < 0) goto bad;
187
- if (__Pyx_ImportFunction_3_0_12(module, "findOrBuildNodeNsPrefix", (void (**)(void))&__pyx_api_f_4lxml_5etree_findOrBuildNodeNsPrefix, "xmlNs *(struct LxmlDocument *, xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
152
+ if (__Pyx_ImportFunction_3_1_2(module, "deepcopyNodeToDocument", (void (**)(void))&__pyx_api_f_4lxml_5etree_deepcopyNodeToDocument, "struct LxmlElement *(struct LxmlDocument *, xmlNode *)") < 0) goto bad;
153
+ if (__Pyx_ImportFunction_3_1_2(module, "elementTreeFactory", (void (**)(void))&__pyx_api_f_4lxml_5etree_elementTreeFactory, "struct LxmlElementTree *(struct LxmlElement *)") < 0) goto bad;
154
+ if (__Pyx_ImportFunction_3_1_2(module, "newElementTree", (void (**)(void))&__pyx_api_f_4lxml_5etree_newElementTree, "struct LxmlElementTree *(struct LxmlElement *, PyObject *)") < 0) goto bad;
155
+ if (__Pyx_ImportFunction_3_1_2(module, "adoptExternalDocument", (void (**)(void))&__pyx_api_f_4lxml_5etree_adoptExternalDocument, "struct LxmlElementTree *(xmlDoc *, PyObject *, int)") < 0) goto bad;
156
+ if (__Pyx_ImportFunction_3_1_2(module, "elementFactory", (void (**)(void))&__pyx_api_f_4lxml_5etree_elementFactory, "struct LxmlElement *(struct LxmlDocument *, xmlNode *)") < 0) goto bad;
157
+ if (__Pyx_ImportFunction_3_1_2(module, "makeElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_makeElement, "struct LxmlElement *(PyObject *, struct LxmlDocument *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *)") < 0) goto bad;
158
+ if (__Pyx_ImportFunction_3_1_2(module, "makeSubElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_makeSubElement, "struct LxmlElement *(struct LxmlElement *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *)") < 0) goto bad;
159
+ if (__Pyx_ImportFunction_3_1_2(module, "setElementClassLookupFunction", (void (**)(void))&__pyx_api_f_4lxml_5etree_setElementClassLookupFunction, "void (_element_class_lookup_function, PyObject *)") < 0) goto bad;
160
+ if (__Pyx_ImportFunction_3_1_2(module, "lookupDefaultElementClass", (void (**)(void))&__pyx_api_f_4lxml_5etree_lookupDefaultElementClass, "PyObject *(PyObject *, PyObject *, xmlNode *)") < 0) goto bad;
161
+ if (__Pyx_ImportFunction_3_1_2(module, "lookupNamespaceElementClass", (void (**)(void))&__pyx_api_f_4lxml_5etree_lookupNamespaceElementClass, "PyObject *(PyObject *, PyObject *, xmlNode *)") < 0) goto bad;
162
+ if (__Pyx_ImportFunction_3_1_2(module, "callLookupFallback", (void (**)(void))&__pyx_api_f_4lxml_5etree_callLookupFallback, "PyObject *(struct LxmlFallbackElementClassLookup *, struct LxmlDocument *, xmlNode *)") < 0) goto bad;
163
+ if (__Pyx_ImportFunction_3_1_2(module, "tagMatches", (void (**)(void))&__pyx_api_f_4lxml_5etree_tagMatches, "int (xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
164
+ if (__Pyx_ImportFunction_3_1_2(module, "documentOrRaise", (void (**)(void))&__pyx_api_f_4lxml_5etree_documentOrRaise, "struct LxmlDocument *(PyObject *)") < 0) goto bad;
165
+ if (__Pyx_ImportFunction_3_1_2(module, "rootNodeOrRaise", (void (**)(void))&__pyx_api_f_4lxml_5etree_rootNodeOrRaise, "struct LxmlElement *(PyObject *)") < 0) goto bad;
166
+ if (__Pyx_ImportFunction_3_1_2(module, "hasText", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasText, "int (xmlNode *)") < 0) goto bad;
167
+ if (__Pyx_ImportFunction_3_1_2(module, "hasTail", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasTail, "int (xmlNode *)") < 0) goto bad;
168
+ if (__Pyx_ImportFunction_3_1_2(module, "textOf", (void (**)(void))&__pyx_api_f_4lxml_5etree_textOf, "PyObject *(xmlNode *)") < 0) goto bad;
169
+ if (__Pyx_ImportFunction_3_1_2(module, "tailOf", (void (**)(void))&__pyx_api_f_4lxml_5etree_tailOf, "PyObject *(xmlNode *)") < 0) goto bad;
170
+ if (__Pyx_ImportFunction_3_1_2(module, "setNodeText", (void (**)(void))&__pyx_api_f_4lxml_5etree_setNodeText, "int (xmlNode *, PyObject *)") < 0) goto bad;
171
+ if (__Pyx_ImportFunction_3_1_2(module, "setTailText", (void (**)(void))&__pyx_api_f_4lxml_5etree_setTailText, "int (xmlNode *, PyObject *)") < 0) goto bad;
172
+ if (__Pyx_ImportFunction_3_1_2(module, "attributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_attributeValue, "PyObject *(xmlNode *, xmlAttr *)") < 0) goto bad;
173
+ if (__Pyx_ImportFunction_3_1_2(module, "attributeValueFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_attributeValueFromNsName, "PyObject *(xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
174
+ if (__Pyx_ImportFunction_3_1_2(module, "getAttributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_getAttributeValue, "PyObject *(struct LxmlElement *, PyObject *, PyObject *)") < 0) goto bad;
175
+ if (__Pyx_ImportFunction_3_1_2(module, "iterattributes", (void (**)(void))&__pyx_api_f_4lxml_5etree_iterattributes, "PyObject *(struct LxmlElement *, int)") < 0) goto bad;
176
+ if (__Pyx_ImportFunction_3_1_2(module, "collectAttributes", (void (**)(void))&__pyx_api_f_4lxml_5etree_collectAttributes, "PyObject *(xmlNode *, int)") < 0) goto bad;
177
+ if (__Pyx_ImportFunction_3_1_2(module, "setAttributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_setAttributeValue, "int (struct LxmlElement *, PyObject *, PyObject *)") < 0) goto bad;
178
+ if (__Pyx_ImportFunction_3_1_2(module, "delAttribute", (void (**)(void))&__pyx_api_f_4lxml_5etree_delAttribute, "int (struct LxmlElement *, PyObject *)") < 0) goto bad;
179
+ if (__Pyx_ImportFunction_3_1_2(module, "delAttributeFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_delAttributeFromNsName, "int (xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
180
+ if (__Pyx_ImportFunction_3_1_2(module, "hasChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasChild, "int (xmlNode *)") < 0) goto bad;
181
+ if (__Pyx_ImportFunction_3_1_2(module, "findChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChild, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
182
+ if (__Pyx_ImportFunction_3_1_2(module, "findChildForwards", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChildForwards, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
183
+ if (__Pyx_ImportFunction_3_1_2(module, "findChildBackwards", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChildBackwards, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
184
+ if (__Pyx_ImportFunction_3_1_2(module, "nextElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_nextElement, "xmlNode *(xmlNode *)") < 0) goto bad;
185
+ if (__Pyx_ImportFunction_3_1_2(module, "previousElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_previousElement, "xmlNode *(xmlNode *)") < 0) goto bad;
186
+ if (__Pyx_ImportFunction_3_1_2(module, "appendChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_appendChild, "void (struct LxmlElement *, struct LxmlElement *)") < 0) goto bad;
187
+ if (__Pyx_ImportFunction_3_1_2(module, "appendChildToElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_appendChildToElement, "int (struct LxmlElement *, struct LxmlElement *)") < 0) goto bad;
188
+ if (__Pyx_ImportFunction_3_1_2(module, "pyunicode", (void (**)(void))&__pyx_api_f_4lxml_5etree_pyunicode, "PyObject *(const xmlChar *)") < 0) goto bad;
189
+ if (__Pyx_ImportFunction_3_1_2(module, "utf8", (void (**)(void))&__pyx_api_f_4lxml_5etree_utf8, "PyObject *(PyObject *)") < 0) goto bad;
190
+ if (__Pyx_ImportFunction_3_1_2(module, "getNsTag", (void (**)(void))&__pyx_api_f_4lxml_5etree_getNsTag, "PyObject *(PyObject *)") < 0) goto bad;
191
+ if (__Pyx_ImportFunction_3_1_2(module, "getNsTagWithEmptyNs", (void (**)(void))&__pyx_api_f_4lxml_5etree_getNsTagWithEmptyNs, "PyObject *(PyObject *)") < 0) goto bad;
192
+ if (__Pyx_ImportFunction_3_1_2(module, "namespacedName", (void (**)(void))&__pyx_api_f_4lxml_5etree_namespacedName, "PyObject *(xmlNode *)") < 0) goto bad;
193
+ if (__Pyx_ImportFunction_3_1_2(module, "namespacedNameFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_namespacedNameFromNsName, "PyObject *(const xmlChar *, const xmlChar *)") < 0) goto bad;
194
+ if (__Pyx_ImportFunction_3_1_2(module, "iteratorStoreNext", (void (**)(void))&__pyx_api_f_4lxml_5etree_iteratorStoreNext, "void (struct LxmlElementIterator *, struct LxmlElement *)") < 0) goto bad;
195
+ if (__Pyx_ImportFunction_3_1_2(module, "initTagMatch", (void (**)(void))&__pyx_api_f_4lxml_5etree_initTagMatch, "void (struct LxmlElementTagMatcher *, PyObject *)") < 0) goto bad;
196
+ if (__Pyx_ImportFunction_3_1_2(module, "findOrBuildNodeNsPrefix", (void (**)(void))&__pyx_api_f_4lxml_5etree_findOrBuildNodeNsPrefix, "xmlNs *(struct LxmlDocument *, xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
188
197
  Py_DECREF(module); module = 0;
189
198
  return 0;
190
199
  bad:
Binary file
lxml/objectify.pyx CHANGED
@@ -18,6 +18,7 @@ from lxml.includes cimport tree
18
18
  cimport lxml.includes.etreepublic as cetree
19
19
  cimport libc.string as cstring_h # not to be confused with stdlib 'string'
20
20
  from libc.string cimport const_char
21
+ from libc cimport limits
21
22
 
22
23
  __all__ = ['BoolElement', 'DataElement', 'E', 'Element', 'ElementMaker',
23
24
  'FloatElement', 'IntElement', 'NoneElement',
@@ -420,8 +421,11 @@ cdef object _lookupChild(_Element parent, tag):
420
421
  cdef tree.xmlNode* c_node
421
422
  c_node = parent._c_node
422
423
  ns, tag = cetree.getNsTagWithEmptyNs(tag)
424
+ c_tag_len = len(<bytes> tag)
425
+ if c_tag_len > limits.INT_MAX:
426
+ return None
423
427
  c_tag = tree.xmlDictExists(
424
- c_node.doc.dict, _xcstr(tag), python.PyBytes_GET_SIZE(tag))
428
+ c_node.doc.dict, _xcstr(tag), <int> c_tag_len)
425
429
  if c_tag is NULL:
426
430
  return None # not in the hash map => not in the tree
427
431
  if ns is None:
@@ -1283,7 +1287,7 @@ cdef object _guessElementClass(tree.xmlNode* c_node):
1283
1287
  return None
1284
1288
  if value == '':
1285
1289
  return StringElement
1286
-
1290
+
1287
1291
  for type_check, pytype in _TYPE_CHECKS:
1288
1292
  try:
1289
1293
  type_check(value)
@@ -1689,8 +1693,8 @@ def annotate(element_or_tree, *, ignore_old=True, ignore_xsi=False,
1689
1693
 
1690
1694
  If the 'ignore_xsi' keyword argument is False (the default), existing
1691
1695
  'xsi:type' attributes will be used for the type annotation, if they fit the
1692
- element text values.
1693
-
1696
+ element text values.
1697
+
1694
1698
  Note that the mapping from Python types to XSI types is usually ambiguous.
1695
1699
  Currently, only the first XSI type name in the corresponding PyType
1696
1700
  definition will be used for annotation. Thus, you should consider naming
@@ -1705,7 +1709,7 @@ def annotate(element_or_tree, *, ignore_old=True, ignore_xsi=False,
1705
1709
  elements. Pass 'string', for example, to make string values the default.
1706
1710
 
1707
1711
  The keyword arguments 'annotate_xsi' (default: 0) and 'annotate_pytype'
1708
- (default: 1) control which kind(s) of annotation to use.
1712
+ (default: 1) control which kind(s) of annotation to use.
1709
1713
  """
1710
1714
  cdef _Element element
1711
1715
  element = cetree.rootNodeOrRaise(element_or_tree)
@@ -1878,7 +1882,7 @@ def deannotate(element_or_tree, *, bint pytype=True, bint xsi=True,
1878
1882
  and/or 'xsi:type' attributes and/or 'xsi:nil' attributes.
1879
1883
 
1880
1884
  If the 'pytype' keyword argument is True (the default), 'py:pytype'
1881
- attributes will be removed. If the 'xsi' keyword argument is True (the
1885
+ attributes will be removed. If the 'xsi' keyword argument is True (the
1882
1886
  default), 'xsi:type' attributes will be removed.
1883
1887
  If the 'xsi_nil' keyword argument is True (default: False), 'xsi:nil'
1884
1888
  attributes will be removed.
@@ -2124,7 +2128,7 @@ def DataElement(_value, attrib=None, nsmap=None, *, _pytype=None, _xsi=None,
2124
2128
  stringify = unicode if py_type is None else py_type.stringify
2125
2129
  strval = stringify(_value)
2126
2130
 
2127
- if _pytype is not None:
2131
+ if _pytype is not None:
2128
2132
  if _pytype == "NoneType" or _pytype == "none":
2129
2133
  strval = None
2130
2134
  _attributes[XML_SCHEMA_INSTANCE_NIL_ATTR] = "true"
lxml/parser.pxi CHANGED
@@ -3,6 +3,14 @@
3
3
  from lxml.includes cimport xmlparser
4
4
  from lxml.includes cimport htmlparser
5
5
 
6
+ cdef object _GenericAlias
7
+ try:
8
+ from types import GenericAlias as _GenericAlias
9
+ except ImportError:
10
+ # Python 3.8 - we only need this as return value from "__class_getitem__"
11
+ def _GenericAlias(cls, item):
12
+ return f"{cls.__name__}[{item.__name__}]"
13
+
6
14
 
7
15
  class ParseError(LxmlSyntaxError):
8
16
  """Syntax error while parsing an XML document.
@@ -53,7 +61,6 @@ cdef class _ParserDictionaryContext:
53
61
  cdef list _implied_parser_contexts
54
62
 
55
63
  def __cinit__(self):
56
- self._c_dict = NULL
57
64
  self._implied_parser_contexts = []
58
65
 
59
66
  def __dealloc__(self):
@@ -295,9 +302,7 @@ cdef class _FileReaderContext:
295
302
  self._filelike = filelike
296
303
  self._close_file_after_read = close_file
297
304
  self._encoding = encoding
298
- if url is None:
299
- self._c_url = NULL
300
- else:
305
+ if url is not None:
301
306
  url = _encodeFilename(url)
302
307
  self._c_url = _cstr(url)
303
308
  self._url = url
@@ -419,8 +424,6 @@ cdef class _FileReaderContext:
419
424
  cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size) noexcept with gil:
420
425
  return (<_FileReaderContext>ctxt).copyToBuffer(c_buffer, c_size)
421
426
 
422
- cdef int _readFileParser(void* ctxt, char* c_buffer, int c_size) noexcept nogil:
423
- return stdio.fread(c_buffer, 1, c_size, <stdio.FILE*>ctxt)
424
427
 
425
428
  ############################################################
426
429
  ## support for custom document loaders
@@ -542,11 +545,8 @@ cdef class _ParserContext(_ResolverContext):
542
545
  cdef bint _collect_ids
543
546
 
544
547
  def __cinit__(self):
545
- self._c_ctxt = NULL
546
548
  self._collect_ids = True
547
- if not config.ENABLE_THREADING:
548
- self._lock = NULL
549
- else:
549
+ if config.ENABLE_THREADING:
550
550
  self._lock = python.PyThread_allocate_lock()
551
551
  self._error_log = _ErrorLog()
552
552
 
@@ -573,6 +573,9 @@ cdef class _ParserContext(_ResolverContext):
573
573
  return context
574
574
 
575
575
  cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt) noexcept:
576
+ """
577
+ Connects the libxml2-level context to the lxml-level parser context.
578
+ """
576
579
  self._c_ctxt = c_ctxt
577
580
  c_ctxt._private = <void*>self
578
581
 
@@ -597,6 +600,12 @@ cdef class _ParserContext(_ResolverContext):
597
600
  raise ParserError, "parser locking failed"
598
601
  self._error_log.clear()
599
602
  self._doc = None
603
+ # Connect the lxml error log with libxml2's error handling. In the case of parsing
604
+ # HTML, ctxt->sax is not set to null, so this always works. The libxml2 function
605
+ # that does this is htmlInitParserCtxt in HTMLparser.c. For HTML (and possibly XML
606
+ # too), libxml2's SAX's serror is set to be the place where errors are sent when
607
+ # schannel is set to ctxt->sax->serror in xmlCtxtErrMemory in libxml2's
608
+ # parserInternals.c.
600
609
  # Need a cast here because older libxml2 releases do not use 'const' in the functype.
601
610
  self._c_ctxt.sax.serror = <xmlerror.xmlStructuredErrorFunc> _receiveParserError
602
611
  self._orig_loader = _register_document_loader() if set_document_loader else NULL
@@ -642,6 +651,9 @@ cdef _initParserContext(_ParserContext context,
642
651
  context._initParserContext(c_ctxt)
643
652
 
644
653
  cdef void _forwardParserError(xmlparser.xmlParserCtxt* _parser_context, const xmlerror.xmlError* error) noexcept with gil:
654
+ """
655
+ Add an error created by libxml2 to the lxml-level error_log.
656
+ """
645
657
  (<_ParserContext>_parser_context._private)._error_log._receive(error)
646
658
 
647
659
  cdef void _receiveParserError(void* c_context, const xmlerror.xmlError* error) noexcept nogil:
@@ -687,6 +699,8 @@ cdef xmlDoc* _handleParseResult(_ParserContext context,
687
699
  xmlparser.xmlParserCtxt* c_ctxt,
688
700
  xmlDoc* result, filename,
689
701
  bint recover, bint free_doc) except NULL:
702
+ # The C-level argument xmlDoc* result is passed in as NULL if the parser was not able
703
+ # to parse the document.
690
704
  cdef bint well_formed
691
705
  if result is not NULL:
692
706
  __GLOBAL_PARSER_CONTEXT.initDocDict(result)
@@ -698,6 +712,9 @@ cdef xmlDoc* _handleParseResult(_ParserContext context,
698
712
  c_ctxt.myDoc = NULL
699
713
 
700
714
  if result is not NULL:
715
+ # "wellFormed" in libxml2 is 0 if the parser found fatal errors. It still returns a
716
+ # parse result document if 'recover=True'. Here, we determine if we can present
717
+ # the document to the user or consider it incorrect or broken enough to raise an error.
701
718
  if (context._validator is not None and
702
719
  not context._validator.isvalid()):
703
720
  well_formed = 0 # actually not 'valid', but anyway ...
@@ -901,6 +918,9 @@ cdef class _BaseParser:
901
918
  return self._push_parser_context
902
919
 
903
920
  cdef _ParserContext _createContext(self, target, events_to_collect):
921
+ """
922
+ This method creates and configures the lxml-level parser.
923
+ """
904
924
  cdef _SaxParserContext sax_context
905
925
  if target is not None:
906
926
  sax_context = _TargetParserContext(self)
@@ -947,6 +967,9 @@ cdef class _BaseParser:
947
967
  return 0
948
968
 
949
969
  cdef xmlparser.xmlParserCtxt* _newParserCtxt(self) except NULL:
970
+ """
971
+ Create and initialise a libxml2-level parser context.
972
+ """
950
973
  cdef xmlparser.xmlParserCtxt* c_ctxt
951
974
  if self._for_html:
952
975
  c_ctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
@@ -1106,8 +1129,7 @@ cdef class _BaseParser:
1106
1129
  finally:
1107
1130
  context.cleanup()
1108
1131
 
1109
- cdef xmlDoc* _parseDoc(self, char* c_text, int c_len,
1110
- char* c_filename) except NULL:
1132
+ cdef xmlDoc* _parseDoc(self, const char* c_text, int c_len, char* c_filename) except NULL:
1111
1133
  """Parse document, share dictionary if possible.
1112
1134
  """
1113
1135
  cdef _ParserContext context
@@ -1440,7 +1462,7 @@ cdef class _FeedParser(_BaseParser):
1440
1462
  else:
1441
1463
  error = 0
1442
1464
 
1443
- if not pctxt.wellFormed and pctxt.disableSAX and context._has_raised():
1465
+ if not pctxt.wellFormed and xmlparser.xmlCtxtIsStopped(pctxt) and context._has_raised():
1444
1466
  # propagate Python exceptions immediately
1445
1467
  recover = 0
1446
1468
  error = 1
@@ -1477,7 +1499,7 @@ cdef class _FeedParser(_BaseParser):
1477
1499
  else:
1478
1500
  xmlparser.xmlParseChunk(pctxt, NULL, 0, 1)
1479
1501
 
1480
- if (pctxt.recovery and not pctxt.disableSAX and
1502
+ if (pctxt.recovery and not xmlparser.xmlCtxtIsStopped(pctxt) and
1481
1503
  isinstance(context, _SaxParserContext)):
1482
1504
  # apply any left-over 'end' events
1483
1505
  (<_SaxParserContext>context).flushEvents()
@@ -1529,7 +1551,8 @@ cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt,
1529
1551
  return error
1530
1552
 
1531
1553
  # fix libxml2 setup for HTML
1532
- c_ctxt.progressive = 1
1554
+ if tree.LIBXML_VERSION < 21400:
1555
+ c_ctxt.progressive = 1 # TODO: remove
1533
1556
  c_ctxt.html = 1
1534
1557
  htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options)
1535
1558
 
@@ -1547,10 +1570,15 @@ _XML_DEFAULT_PARSE_OPTIONS = (
1547
1570
  xmlparser.XML_PARSE_NONET |
1548
1571
  xmlparser.XML_PARSE_COMPACT |
1549
1572
  xmlparser.XML_PARSE_BIG_LINES
1550
- )
1573
+ )
1551
1574
 
1552
1575
  cdef class XMLParser(_FeedParser):
1553
- """XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, schema: XMLSchema =None, huge_tree=False, remove_blank_text=False, resolve_entities=True, remove_comments=False, remove_pis=False, strip_cdata=True, collect_ids=True, target=None, compact=True)
1576
+ """XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, \
1577
+ load_dtd=False, no_network=True, decompress=False, ns_clean=False, \
1578
+ recover=False, schema: XMLSchema =None, huge_tree=False, \
1579
+ remove_blank_text=False, resolve_entities=True, \
1580
+ remove_comments=False, remove_pis=False, strip_cdata=True, \
1581
+ collect_ids=True, target=None, compact=True)
1554
1582
 
1555
1583
  The XML parser.
1556
1584
 
@@ -1572,6 +1600,8 @@ cdef class XMLParser(_FeedParser):
1572
1600
  - dtd_validation - validate against a DTD referenced by the document
1573
1601
  - load_dtd - use DTD for parsing
1574
1602
  - no_network - prevent network access for related files (default: True)
1603
+ - decompress - automatically decompress gzip input
1604
+ (default: False, changed in lxml 6.0, disabling only affects libxml2 2.15+)
1575
1605
  - ns_clean - clean up redundant namespace declarations
1576
1606
  - recover - try hard to parse through broken XML
1577
1607
  - remove_blank_text - discard blank text nodes that appear ignorable
@@ -1579,9 +1609,10 @@ cdef class XMLParser(_FeedParser):
1579
1609
  - remove_pis - discard processing instructions
1580
1610
  - strip_cdata - replace CDATA sections by normal text content (default: True)
1581
1611
  - compact - save memory for short text content (default: True)
1582
- - collect_ids - use a hash table of XML IDs for fast access (default: True, always True with DTD validation)
1612
+ - collect_ids - use a hash table of XML IDs for fast access
1613
+ (default: True, always True with DTD validation)
1583
1614
  - huge_tree - disable security restrictions and support very deep trees
1584
- and very long text content (only affects libxml2 2.7+)
1615
+ and very long text content
1585
1616
 
1586
1617
  Other keyword arguments:
1587
1618
 
@@ -1598,7 +1629,7 @@ cdef class XMLParser(_FeedParser):
1598
1629
  apply to the default parser.
1599
1630
  """
1600
1631
  def __init__(self, *, encoding=None, attribute_defaults=False,
1601
- dtd_validation=False, load_dtd=False, no_network=True,
1632
+ dtd_validation=False, load_dtd=False, no_network=True, decompress=False,
1602
1633
  ns_clean=False, recover=False, XMLSchema schema=None,
1603
1634
  huge_tree=False, remove_blank_text=False, resolve_entities='internal',
1604
1635
  remove_comments=False, remove_pis=False, strip_cdata=True,
@@ -1638,6 +1669,10 @@ cdef class XMLParser(_FeedParser):
1638
1669
  remove_comments, remove_pis, strip_cdata,
1639
1670
  collect_ids, target, encoding, resolve_external)
1640
1671
 
1672
+ # Allow subscripting XMLParser in type annotations (PEP 560)
1673
+ def __class_getitem__(cls, item):
1674
+ return _GenericAlias(cls, item)
1675
+
1641
1676
 
1642
1677
  cdef class XMLPullParser(XMLParser):
1643
1678
  """XMLPullParser(self, events=None, *, tag=None, **kwargs)
@@ -1670,7 +1705,7 @@ cdef class XMLPullParser(XMLParser):
1670
1705
 
1671
1706
  cdef class ETCompatXMLParser(XMLParser):
1672
1707
  """ETCompatXMLParser(self, encoding=None, attribute_defaults=False, \
1673
- dtd_validation=False, load_dtd=False, no_network=True, \
1708
+ dtd_validation=False, load_dtd=False, no_network=True, decompress=False, \
1674
1709
  ns_clean=False, recover=False, schema=None, \
1675
1710
  huge_tree=False, remove_blank_text=False, resolve_entities=True, \
1676
1711
  remove_comments=True, remove_pis=True, strip_cdata=True, \
@@ -1684,7 +1719,7 @@ cdef class ETCompatXMLParser(XMLParser):
1684
1719
  and thus ignores comments and processing instructions.
1685
1720
  """
1686
1721
  def __init__(self, *, encoding=None, attribute_defaults=False,
1687
- dtd_validation=False, load_dtd=False, no_network=True,
1722
+ dtd_validation=False, load_dtd=False, no_network=True, decompress=False,
1688
1723
  ns_clean=False, recover=False, schema=None,
1689
1724
  huge_tree=False, remove_blank_text=False, resolve_entities=True,
1690
1725
  remove_comments=True, remove_pis=True, strip_cdata=True,
@@ -1694,6 +1729,7 @@ cdef class ETCompatXMLParser(XMLParser):
1694
1729
  dtd_validation=dtd_validation,
1695
1730
  load_dtd=load_dtd,
1696
1731
  no_network=no_network,
1732
+ decompress=decompress,
1697
1733
  ns_clean=ns_clean,
1698
1734
  recover=recover,
1699
1735
  remove_blank_text=remove_blank_text,
@@ -1705,7 +1741,8 @@ cdef class ETCompatXMLParser(XMLParser):
1705
1741
  strip_cdata=strip_cdata,
1706
1742
  target=target,
1707
1743
  encoding=encoding,
1708
- schema=schema)
1744
+ schema=schema,
1745
+ )
1709
1746
 
1710
1747
  # ET 1.2 compatible name
1711
1748
  XMLTreeBuilder = ETCompatXMLParser
@@ -1752,7 +1789,7 @@ cdef object _UNUSED = object()
1752
1789
  cdef class HTMLParser(_FeedParser):
1753
1790
  """HTMLParser(self, encoding=None, remove_blank_text=False, \
1754
1791
  remove_comments=False, remove_pis=False, \
1755
- no_network=True, target=None, schema: XMLSchema =None, \
1792
+ no_network=True, decompress=False, target=None, schema: XMLSchema =None, \
1756
1793
  recover=True, compact=True, collect_ids=True, huge_tree=False)
1757
1794
 
1758
1795
  The HTML parser.
@@ -1766,6 +1803,8 @@ cdef class HTMLParser(_FeedParser):
1766
1803
 
1767
1804
  - recover - try hard to parse through broken HTML (default: True)
1768
1805
  - no_network - prevent network access for related files (default: True)
1806
+ - decompress - automatically decompress gzip input
1807
+ (default: False, changed in lxml 6.0, disabling only affects libxml2 2.15+)
1769
1808
  - remove_blank_text - discard empty text nodes that are ignorable (i.e. not actual text content)
1770
1809
  - remove_comments - discard comments
1771
1810
  - remove_pis - discard processing instructions
@@ -1773,7 +1812,7 @@ cdef class HTMLParser(_FeedParser):
1773
1812
  - default_doctype - add a default doctype even if it is not found in the HTML (default: True)
1774
1813
  - collect_ids - use a hash table of XML IDs for fast access (default: True)
1775
1814
  - huge_tree - disable security restrictions and support very deep trees
1776
- and very long text content (only affects libxml2 2.7+)
1815
+ and very long text content
1777
1816
 
1778
1817
  Other keyword arguments:
1779
1818
 
@@ -1786,7 +1825,7 @@ cdef class HTMLParser(_FeedParser):
1786
1825
  """
1787
1826
  def __init__(self, *, encoding=None, remove_blank_text=False,
1788
1827
  remove_comments=False, remove_pis=False, strip_cdata=_UNUSED,
1789
- no_network=True, target=None, XMLSchema schema=None,
1828
+ no_network=True, decompress=False, target=None, XMLSchema schema=None,
1790
1829
  recover=True, compact=True, default_doctype=True,
1791
1830
  collect_ids=True, huge_tree=False):
1792
1831
  cdef int parse_options
@@ -1813,6 +1852,10 @@ cdef class HTMLParser(_FeedParser):
1813
1852
  remove_comments, remove_pis, strip_cdata,
1814
1853
  collect_ids, target, encoding)
1815
1854
 
1855
+ # Allow subscripting HTMLParser in type annotations (PEP 560)
1856
+ def __class_getitem__(cls, item):
1857
+ return _GenericAlias(cls, item)
1858
+
1816
1859
 
1817
1860
  cdef HTMLParser __DEFAULT_HTML_PARSER
1818
1861
  __DEFAULT_HTML_PARSER = HTMLParser()
@@ -1853,8 +1896,6 @@ cdef class HTMLPullParser(HTMLParser):
1853
1896
 
1854
1897
  cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL:
1855
1898
  cdef char* c_filename
1856
- cdef char* c_text
1857
- cdef Py_ssize_t c_len
1858
1899
  if parser is None:
1859
1900
  parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
1860
1901
  if not filename:
@@ -1862,36 +1903,56 @@ cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL:
1862
1903
  else:
1863
1904
  filename_utf = _encodeFilenameUTF8(filename)
1864
1905
  c_filename = _cstr(filename_utf)
1865
- if isinstance(text, unicode):
1866
- if python.PyUnicode_IS_READY(text):
1867
- # PEP-393 Unicode string
1868
- c_len = python.PyUnicode_GET_LENGTH(text) * python.PyUnicode_KIND(text)
1869
- else:
1870
- # old Py_UNICODE string
1871
- c_len = python.PyUnicode_GET_DATA_SIZE(text)
1872
- if c_len > limits.INT_MAX:
1873
- return (<_BaseParser>parser)._parseDocFromFilelike(
1874
- StringIO(text), filename, None)
1875
- return (<_BaseParser>parser)._parseUnicodeDoc(text, c_filename)
1906
+ if isinstance(text, bytes):
1907
+ return _parseDoc_bytes(<bytes> text, filename, c_filename, parser)
1908
+ elif isinstance(text, unicode):
1909
+ return _parseDoc_unicode(<unicode> text, filename, c_filename, parser)
1910
+ else:
1911
+ return _parseDoc_charbuffer(text, filename, c_filename, parser)
1912
+
1913
+
1914
+ cdef xmlDoc* _parseDoc_unicode(unicode text, filename, char* c_filename, _BaseParser parser) except NULL:
1915
+ cdef Py_ssize_t c_len
1916
+ if python.PyUnicode_IS_READY(text):
1917
+ # PEP-393 Unicode string
1918
+ c_len = python.PyUnicode_GET_LENGTH(text) * python.PyUnicode_KIND(text)
1876
1919
  else:
1877
- c_len = python.PyBytes_GET_SIZE(text)
1878
- if c_len > limits.INT_MAX:
1879
- return (<_BaseParser>parser)._parseDocFromFilelike(
1880
- BytesIO(text), filename, None)
1881
- c_text = _cstr(text)
1882
- return (<_BaseParser>parser)._parseDoc(c_text, c_len, c_filename)
1920
+ # old Py_UNICODE string
1921
+ c_len = python.PyUnicode_GET_DATA_SIZE(text)
1922
+ if c_len > limits.INT_MAX:
1923
+ return parser._parseDocFromFilelike(
1924
+ StringIO(text), filename, None)
1925
+ return parser._parseUnicodeDoc(text, c_filename)
1926
+
1927
+
1928
+ cdef xmlDoc* _parseDoc_bytes(bytes text, filename, char* c_filename, _BaseParser parser) except NULL:
1929
+ cdef Py_ssize_t c_len = len(text)
1930
+ if c_len > limits.INT_MAX:
1931
+ return parser._parseDocFromFilelike(BytesIO(text), filename, None)
1932
+ return parser._parseDoc(text, c_len, c_filename)
1933
+
1934
+
1935
+ cdef xmlDoc* _parseDoc_charbuffer(text, filename, char* c_filename, _BaseParser parser) except NULL:
1936
+ cdef const unsigned char[::1] data = memoryview(text).cast('B') # cast to 'unsigned char' buffer
1937
+ cdef Py_ssize_t c_len = len(data)
1938
+ if c_len > limits.INT_MAX:
1939
+ return parser._parseDocFromFilelike(BytesIO(text), filename, None)
1940
+ return parser._parseDoc(<const char*>&data[0], c_len, c_filename)
1941
+
1883
1942
 
1884
1943
  cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL:
1885
1944
  if parser is None:
1886
1945
  parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
1887
1946
  return (<_BaseParser>parser)._parseDocFromFile(_cstr(filename8))
1888
1947
 
1948
+
1889
1949
  cdef xmlDoc* _parseDocFromFilelike(source, filename,
1890
1950
  _BaseParser parser) except NULL:
1891
1951
  if parser is None:
1892
1952
  parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
1893
1953
  return (<_BaseParser>parser)._parseDocFromFilelike(source, filename, None)
1894
1954
 
1955
+
1895
1956
  cdef xmlDoc* _newXMLDoc() except NULL:
1896
1957
  cdef xmlDoc* result
1897
1958
  result = tree.xmlNewDoc(NULL)
@@ -1990,8 +2051,6 @@ cdef _Document _parseMemoryDocument(text, url, _BaseParser parser):
1990
2051
  raise ValueError(
1991
2052
  "Unicode strings with encoding declaration are not supported. "
1992
2053
  "Please use bytes input or XML fragments without declaration.")
1993
- elif not isinstance(text, bytes):
1994
- raise ValueError, "can only parse strings"
1995
2054
  c_doc = _parseDoc(text, url, parser)
1996
2055
  return _documentFactory(c_doc, parser)
1997
2056
 
Binary file
lxml/sax.py CHANGED
@@ -18,6 +18,13 @@ from lxml import etree
18
18
  from lxml.etree import ElementTree, SubElement
19
19
  from lxml.etree import Comment, ProcessingInstruction
20
20
 
21
+ try:
22
+ from types import GenericAlias as _GenericAlias
23
+ except ImportError:
24
+ # Python 3.8 - we only need this as return value from "__class_getitem__"
25
+ def _GenericAlias(cls, item):
26
+ return f"{cls.__name__}[{item.__name__}]"
27
+
21
28
 
22
29
  class SaxError(etree.LxmlError):
23
30
  """General SAX error.
@@ -152,6 +159,10 @@ class ElementTreeContentHandler(ContentHandler):
152
159
 
153
160
  ignorableWhitespace = characters
154
161
 
162
+ # Allow subscripting sax.ElementTreeContentHandler in type annotations (PEP 560)
163
+ def __class_getitem__(cls, item):
164
+ return _GenericAlias(cls, item)
165
+
155
166
 
156
167
  class ElementTreeProducer:
157
168
  """Produces SAX events for an element and children.