lxml 5.1.1__pp38-pypy38_pp73-win_amd64.whl → 5.2.1__pp38-pypy38_pp73-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lxml/__init__.py CHANGED
@@ -1,6 +1,6 @@
1
1
  # this is a package
2
2
 
3
- __version__ = "5.1.1"
3
+ __version__ = "5.2.1"
4
4
 
5
5
 
6
6
  def get_include():
lxml/etree.h CHANGED
@@ -1,4 +1,4 @@
1
- /* Generated by Cython 3.0.9 */
1
+ /* Generated by Cython 3.0.10 */
2
2
 
3
3
  #ifndef __PYX_HAVE__lxml__etree
4
4
  #define __PYX_HAVE__lxml__etree
@@ -66,7 +66,7 @@ struct LxmlElementTree {
66
66
  struct LxmlElement *_context_node;
67
67
  };
68
68
 
69
- /* "lxml/etree.pyx":2645
69
+ /* "lxml/etree.pyx":2646
70
70
  *
71
71
  *
72
72
  * cdef public class _ElementTagMatcher [ object LxmlElementTagMatcher, # <<<<<<<<<<<<<<
@@ -82,7 +82,7 @@ struct LxmlElementTagMatcher {
82
82
  char *_name;
83
83
  };
84
84
 
85
- /* "lxml/etree.pyx":2676
85
+ /* "lxml/etree.pyx":2677
86
86
  * self._name = NULL
87
87
  *
88
88
  * cdef public class _ElementIterator(_ElementTagMatcher) [ # <<<<<<<<<<<<<<
Binary file
lxml/etree.pyx CHANGED
@@ -2306,6 +2306,7 @@ cdef public class _ElementTree [ type LxmlElementTreeType,
2306
2306
  root = self.getroot()
2307
2307
  if _isString(path):
2308
2308
  if path[:1] == "/":
2309
+ path = "." + path
2309
2310
  from warnings import warn
2310
2311
  warn(
2311
2312
  "This search incorrectly ignores the root element, and will be "
lxml/etree_api.h CHANGED
@@ -1,4 +1,4 @@
1
- /* Generated by Cython 3.0.9 */
1
+ /* Generated by Cython 3.0.10 */
2
2
 
3
3
  #ifndef __PYX_HAVE_API__lxml__etree
4
4
  #define __PYX_HAVE_API__lxml__etree
@@ -98,9 +98,9 @@ static void (*__pyx_api_f_4lxml_5etree_initTagMatch)(struct LxmlElementTagMatche
98
98
  #define initTagMatch __pyx_api_f_4lxml_5etree_initTagMatch
99
99
  static xmlNs *(*__pyx_api_f_4lxml_5etree_findOrBuildNodeNsPrefix)(struct LxmlDocument *, xmlNode *, const xmlChar *, const xmlChar *) = 0;
100
100
  #define findOrBuildNodeNsPrefix __pyx_api_f_4lxml_5etree_findOrBuildNodeNsPrefix
101
- #ifndef __PYX_HAVE_RT_ImportFunction_3_0_9
102
- #define __PYX_HAVE_RT_ImportFunction_3_0_9
103
- static int __Pyx_ImportFunction_3_0_9(PyObject *module, const char *funcname, void (**f)(void), const char *sig) {
101
+ #ifndef __PYX_HAVE_RT_ImportFunction_3_0_10
102
+ #define __PYX_HAVE_RT_ImportFunction_3_0_10
103
+ static int __Pyx_ImportFunction_3_0_10(PyObject *module, const char *funcname, void (**f)(void), const char *sig) {
104
104
  PyObject *d = 0;
105
105
  PyObject *cobj = 0;
106
106
  union {
@@ -140,51 +140,51 @@ static int import_lxml__etree(void) {
140
140
  PyObject *module = 0;
141
141
  module = PyImport_ImportModule("lxml.etree");
142
142
  if (!module) goto bad;
143
- if (__Pyx_ImportFunction_3_0_9(module, "deepcopyNodeToDocument", (void (**)(void))&__pyx_api_f_4lxml_5etree_deepcopyNodeToDocument, "struct LxmlElement *(struct LxmlDocument *, xmlNode *)") < 0) goto bad;
144
- if (__Pyx_ImportFunction_3_0_9(module, "elementTreeFactory", (void (**)(void))&__pyx_api_f_4lxml_5etree_elementTreeFactory, "struct LxmlElementTree *(struct LxmlElement *)") < 0) goto bad;
145
- if (__Pyx_ImportFunction_3_0_9(module, "newElementTree", (void (**)(void))&__pyx_api_f_4lxml_5etree_newElementTree, "struct LxmlElementTree *(struct LxmlElement *, PyObject *)") < 0) goto bad;
146
- if (__Pyx_ImportFunction_3_0_9(module, "adoptExternalDocument", (void (**)(void))&__pyx_api_f_4lxml_5etree_adoptExternalDocument, "struct LxmlElementTree *(xmlDoc *, PyObject *, int)") < 0) goto bad;
147
- if (__Pyx_ImportFunction_3_0_9(module, "elementFactory", (void (**)(void))&__pyx_api_f_4lxml_5etree_elementFactory, "struct LxmlElement *(struct LxmlDocument *, xmlNode *)") < 0) goto bad;
148
- if (__Pyx_ImportFunction_3_0_9(module, "makeElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_makeElement, "struct LxmlElement *(PyObject *, struct LxmlDocument *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *)") < 0) goto bad;
149
- if (__Pyx_ImportFunction_3_0_9(module, "makeSubElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_makeSubElement, "struct LxmlElement *(struct LxmlElement *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *)") < 0) goto bad;
150
- if (__Pyx_ImportFunction_3_0_9(module, "setElementClassLookupFunction", (void (**)(void))&__pyx_api_f_4lxml_5etree_setElementClassLookupFunction, "void (_element_class_lookup_function, PyObject *)") < 0) goto bad;
151
- if (__Pyx_ImportFunction_3_0_9(module, "lookupDefaultElementClass", (void (**)(void))&__pyx_api_f_4lxml_5etree_lookupDefaultElementClass, "PyObject *(PyObject *, PyObject *, xmlNode *)") < 0) goto bad;
152
- if (__Pyx_ImportFunction_3_0_9(module, "lookupNamespaceElementClass", (void (**)(void))&__pyx_api_f_4lxml_5etree_lookupNamespaceElementClass, "PyObject *(PyObject *, PyObject *, xmlNode *)") < 0) goto bad;
153
- if (__Pyx_ImportFunction_3_0_9(module, "callLookupFallback", (void (**)(void))&__pyx_api_f_4lxml_5etree_callLookupFallback, "PyObject *(struct LxmlFallbackElementClassLookup *, struct LxmlDocument *, xmlNode *)") < 0) goto bad;
154
- if (__Pyx_ImportFunction_3_0_9(module, "tagMatches", (void (**)(void))&__pyx_api_f_4lxml_5etree_tagMatches, "int (xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
155
- if (__Pyx_ImportFunction_3_0_9(module, "documentOrRaise", (void (**)(void))&__pyx_api_f_4lxml_5etree_documentOrRaise, "struct LxmlDocument *(PyObject *)") < 0) goto bad;
156
- if (__Pyx_ImportFunction_3_0_9(module, "rootNodeOrRaise", (void (**)(void))&__pyx_api_f_4lxml_5etree_rootNodeOrRaise, "struct LxmlElement *(PyObject *)") < 0) goto bad;
157
- if (__Pyx_ImportFunction_3_0_9(module, "hasText", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasText, "int (xmlNode *)") < 0) goto bad;
158
- if (__Pyx_ImportFunction_3_0_9(module, "hasTail", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasTail, "int (xmlNode *)") < 0) goto bad;
159
- if (__Pyx_ImportFunction_3_0_9(module, "textOf", (void (**)(void))&__pyx_api_f_4lxml_5etree_textOf, "PyObject *(xmlNode *)") < 0) goto bad;
160
- if (__Pyx_ImportFunction_3_0_9(module, "tailOf", (void (**)(void))&__pyx_api_f_4lxml_5etree_tailOf, "PyObject *(xmlNode *)") < 0) goto bad;
161
- if (__Pyx_ImportFunction_3_0_9(module, "setNodeText", (void (**)(void))&__pyx_api_f_4lxml_5etree_setNodeText, "int (xmlNode *, PyObject *)") < 0) goto bad;
162
- if (__Pyx_ImportFunction_3_0_9(module, "setTailText", (void (**)(void))&__pyx_api_f_4lxml_5etree_setTailText, "int (xmlNode *, PyObject *)") < 0) goto bad;
163
- if (__Pyx_ImportFunction_3_0_9(module, "attributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_attributeValue, "PyObject *(xmlNode *, xmlAttr *)") < 0) goto bad;
164
- if (__Pyx_ImportFunction_3_0_9(module, "attributeValueFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_attributeValueFromNsName, "PyObject *(xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
165
- if (__Pyx_ImportFunction_3_0_9(module, "getAttributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_getAttributeValue, "PyObject *(struct LxmlElement *, PyObject *, PyObject *)") < 0) goto bad;
166
- if (__Pyx_ImportFunction_3_0_9(module, "iterattributes", (void (**)(void))&__pyx_api_f_4lxml_5etree_iterattributes, "PyObject *(struct LxmlElement *, int)") < 0) goto bad;
167
- if (__Pyx_ImportFunction_3_0_9(module, "collectAttributes", (void (**)(void))&__pyx_api_f_4lxml_5etree_collectAttributes, "PyObject *(xmlNode *, int)") < 0) goto bad;
168
- if (__Pyx_ImportFunction_3_0_9(module, "setAttributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_setAttributeValue, "int (struct LxmlElement *, PyObject *, PyObject *)") < 0) goto bad;
169
- if (__Pyx_ImportFunction_3_0_9(module, "delAttribute", (void (**)(void))&__pyx_api_f_4lxml_5etree_delAttribute, "int (struct LxmlElement *, PyObject *)") < 0) goto bad;
170
- if (__Pyx_ImportFunction_3_0_9(module, "delAttributeFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_delAttributeFromNsName, "int (xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
171
- if (__Pyx_ImportFunction_3_0_9(module, "hasChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasChild, "int (xmlNode *)") < 0) goto bad;
172
- if (__Pyx_ImportFunction_3_0_9(module, "findChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChild, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
173
- if (__Pyx_ImportFunction_3_0_9(module, "findChildForwards", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChildForwards, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
174
- if (__Pyx_ImportFunction_3_0_9(module, "findChildBackwards", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChildBackwards, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
175
- if (__Pyx_ImportFunction_3_0_9(module, "nextElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_nextElement, "xmlNode *(xmlNode *)") < 0) goto bad;
176
- if (__Pyx_ImportFunction_3_0_9(module, "previousElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_previousElement, "xmlNode *(xmlNode *)") < 0) goto bad;
177
- if (__Pyx_ImportFunction_3_0_9(module, "appendChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_appendChild, "void (struct LxmlElement *, struct LxmlElement *)") < 0) goto bad;
178
- if (__Pyx_ImportFunction_3_0_9(module, "appendChildToElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_appendChildToElement, "int (struct LxmlElement *, struct LxmlElement *)") < 0) goto bad;
179
- if (__Pyx_ImportFunction_3_0_9(module, "pyunicode", (void (**)(void))&__pyx_api_f_4lxml_5etree_pyunicode, "PyObject *(const xmlChar *)") < 0) goto bad;
180
- if (__Pyx_ImportFunction_3_0_9(module, "utf8", (void (**)(void))&__pyx_api_f_4lxml_5etree_utf8, "PyObject *(PyObject *)") < 0) goto bad;
181
- if (__Pyx_ImportFunction_3_0_9(module, "getNsTag", (void (**)(void))&__pyx_api_f_4lxml_5etree_getNsTag, "PyObject *(PyObject *)") < 0) goto bad;
182
- if (__Pyx_ImportFunction_3_0_9(module, "getNsTagWithEmptyNs", (void (**)(void))&__pyx_api_f_4lxml_5etree_getNsTagWithEmptyNs, "PyObject *(PyObject *)") < 0) goto bad;
183
- if (__Pyx_ImportFunction_3_0_9(module, "namespacedName", (void (**)(void))&__pyx_api_f_4lxml_5etree_namespacedName, "PyObject *(xmlNode *)") < 0) goto bad;
184
- if (__Pyx_ImportFunction_3_0_9(module, "namespacedNameFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_namespacedNameFromNsName, "PyObject *(const xmlChar *, const xmlChar *)") < 0) goto bad;
185
- if (__Pyx_ImportFunction_3_0_9(module, "iteratorStoreNext", (void (**)(void))&__pyx_api_f_4lxml_5etree_iteratorStoreNext, "void (struct LxmlElementIterator *, struct LxmlElement *)") < 0) goto bad;
186
- if (__Pyx_ImportFunction_3_0_9(module, "initTagMatch", (void (**)(void))&__pyx_api_f_4lxml_5etree_initTagMatch, "void (struct LxmlElementTagMatcher *, PyObject *)") < 0) goto bad;
187
- if (__Pyx_ImportFunction_3_0_9(module, "findOrBuildNodeNsPrefix", (void (**)(void))&__pyx_api_f_4lxml_5etree_findOrBuildNodeNsPrefix, "xmlNs *(struct LxmlDocument *, xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
143
+ if (__Pyx_ImportFunction_3_0_10(module, "deepcopyNodeToDocument", (void (**)(void))&__pyx_api_f_4lxml_5etree_deepcopyNodeToDocument, "struct LxmlElement *(struct LxmlDocument *, xmlNode *)") < 0) goto bad;
144
+ if (__Pyx_ImportFunction_3_0_10(module, "elementTreeFactory", (void (**)(void))&__pyx_api_f_4lxml_5etree_elementTreeFactory, "struct LxmlElementTree *(struct LxmlElement *)") < 0) goto bad;
145
+ if (__Pyx_ImportFunction_3_0_10(module, "newElementTree", (void (**)(void))&__pyx_api_f_4lxml_5etree_newElementTree, "struct LxmlElementTree *(struct LxmlElement *, PyObject *)") < 0) goto bad;
146
+ if (__Pyx_ImportFunction_3_0_10(module, "adoptExternalDocument", (void (**)(void))&__pyx_api_f_4lxml_5etree_adoptExternalDocument, "struct LxmlElementTree *(xmlDoc *, PyObject *, int)") < 0) goto bad;
147
+ if (__Pyx_ImportFunction_3_0_10(module, "elementFactory", (void (**)(void))&__pyx_api_f_4lxml_5etree_elementFactory, "struct LxmlElement *(struct LxmlDocument *, xmlNode *)") < 0) goto bad;
148
+ if (__Pyx_ImportFunction_3_0_10(module, "makeElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_makeElement, "struct LxmlElement *(PyObject *, struct LxmlDocument *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *)") < 0) goto bad;
149
+ if (__Pyx_ImportFunction_3_0_10(module, "makeSubElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_makeSubElement, "struct LxmlElement *(struct LxmlElement *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *)") < 0) goto bad;
150
+ if (__Pyx_ImportFunction_3_0_10(module, "setElementClassLookupFunction", (void (**)(void))&__pyx_api_f_4lxml_5etree_setElementClassLookupFunction, "void (_element_class_lookup_function, PyObject *)") < 0) goto bad;
151
+ if (__Pyx_ImportFunction_3_0_10(module, "lookupDefaultElementClass", (void (**)(void))&__pyx_api_f_4lxml_5etree_lookupDefaultElementClass, "PyObject *(PyObject *, PyObject *, xmlNode *)") < 0) goto bad;
152
+ if (__Pyx_ImportFunction_3_0_10(module, "lookupNamespaceElementClass", (void (**)(void))&__pyx_api_f_4lxml_5etree_lookupNamespaceElementClass, "PyObject *(PyObject *, PyObject *, xmlNode *)") < 0) goto bad;
153
+ if (__Pyx_ImportFunction_3_0_10(module, "callLookupFallback", (void (**)(void))&__pyx_api_f_4lxml_5etree_callLookupFallback, "PyObject *(struct LxmlFallbackElementClassLookup *, struct LxmlDocument *, xmlNode *)") < 0) goto bad;
154
+ if (__Pyx_ImportFunction_3_0_10(module, "tagMatches", (void (**)(void))&__pyx_api_f_4lxml_5etree_tagMatches, "int (xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
155
+ if (__Pyx_ImportFunction_3_0_10(module, "documentOrRaise", (void (**)(void))&__pyx_api_f_4lxml_5etree_documentOrRaise, "struct LxmlDocument *(PyObject *)") < 0) goto bad;
156
+ if (__Pyx_ImportFunction_3_0_10(module, "rootNodeOrRaise", (void (**)(void))&__pyx_api_f_4lxml_5etree_rootNodeOrRaise, "struct LxmlElement *(PyObject *)") < 0) goto bad;
157
+ if (__Pyx_ImportFunction_3_0_10(module, "hasText", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasText, "int (xmlNode *)") < 0) goto bad;
158
+ if (__Pyx_ImportFunction_3_0_10(module, "hasTail", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasTail, "int (xmlNode *)") < 0) goto bad;
159
+ if (__Pyx_ImportFunction_3_0_10(module, "textOf", (void (**)(void))&__pyx_api_f_4lxml_5etree_textOf, "PyObject *(xmlNode *)") < 0) goto bad;
160
+ if (__Pyx_ImportFunction_3_0_10(module, "tailOf", (void (**)(void))&__pyx_api_f_4lxml_5etree_tailOf, "PyObject *(xmlNode *)") < 0) goto bad;
161
+ if (__Pyx_ImportFunction_3_0_10(module, "setNodeText", (void (**)(void))&__pyx_api_f_4lxml_5etree_setNodeText, "int (xmlNode *, PyObject *)") < 0) goto bad;
162
+ if (__Pyx_ImportFunction_3_0_10(module, "setTailText", (void (**)(void))&__pyx_api_f_4lxml_5etree_setTailText, "int (xmlNode *, PyObject *)") < 0) goto bad;
163
+ if (__Pyx_ImportFunction_3_0_10(module, "attributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_attributeValue, "PyObject *(xmlNode *, xmlAttr *)") < 0) goto bad;
164
+ if (__Pyx_ImportFunction_3_0_10(module, "attributeValueFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_attributeValueFromNsName, "PyObject *(xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
165
+ if (__Pyx_ImportFunction_3_0_10(module, "getAttributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_getAttributeValue, "PyObject *(struct LxmlElement *, PyObject *, PyObject *)") < 0) goto bad;
166
+ if (__Pyx_ImportFunction_3_0_10(module, "iterattributes", (void (**)(void))&__pyx_api_f_4lxml_5etree_iterattributes, "PyObject *(struct LxmlElement *, int)") < 0) goto bad;
167
+ if (__Pyx_ImportFunction_3_0_10(module, "collectAttributes", (void (**)(void))&__pyx_api_f_4lxml_5etree_collectAttributes, "PyObject *(xmlNode *, int)") < 0) goto bad;
168
+ if (__Pyx_ImportFunction_3_0_10(module, "setAttributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_setAttributeValue, "int (struct LxmlElement *, PyObject *, PyObject *)") < 0) goto bad;
169
+ if (__Pyx_ImportFunction_3_0_10(module, "delAttribute", (void (**)(void))&__pyx_api_f_4lxml_5etree_delAttribute, "int (struct LxmlElement *, PyObject *)") < 0) goto bad;
170
+ if (__Pyx_ImportFunction_3_0_10(module, "delAttributeFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_delAttributeFromNsName, "int (xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
171
+ if (__Pyx_ImportFunction_3_0_10(module, "hasChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasChild, "int (xmlNode *)") < 0) goto bad;
172
+ if (__Pyx_ImportFunction_3_0_10(module, "findChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChild, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
173
+ if (__Pyx_ImportFunction_3_0_10(module, "findChildForwards", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChildForwards, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
174
+ if (__Pyx_ImportFunction_3_0_10(module, "findChildBackwards", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChildBackwards, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
175
+ if (__Pyx_ImportFunction_3_0_10(module, "nextElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_nextElement, "xmlNode *(xmlNode *)") < 0) goto bad;
176
+ if (__Pyx_ImportFunction_3_0_10(module, "previousElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_previousElement, "xmlNode *(xmlNode *)") < 0) goto bad;
177
+ if (__Pyx_ImportFunction_3_0_10(module, "appendChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_appendChild, "void (struct LxmlElement *, struct LxmlElement *)") < 0) goto bad;
178
+ if (__Pyx_ImportFunction_3_0_10(module, "appendChildToElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_appendChildToElement, "int (struct LxmlElement *, struct LxmlElement *)") < 0) goto bad;
179
+ if (__Pyx_ImportFunction_3_0_10(module, "pyunicode", (void (**)(void))&__pyx_api_f_4lxml_5etree_pyunicode, "PyObject *(const xmlChar *)") < 0) goto bad;
180
+ if (__Pyx_ImportFunction_3_0_10(module, "utf8", (void (**)(void))&__pyx_api_f_4lxml_5etree_utf8, "PyObject *(PyObject *)") < 0) goto bad;
181
+ if (__Pyx_ImportFunction_3_0_10(module, "getNsTag", (void (**)(void))&__pyx_api_f_4lxml_5etree_getNsTag, "PyObject *(PyObject *)") < 0) goto bad;
182
+ if (__Pyx_ImportFunction_3_0_10(module, "getNsTagWithEmptyNs", (void (**)(void))&__pyx_api_f_4lxml_5etree_getNsTagWithEmptyNs, "PyObject *(PyObject *)") < 0) goto bad;
183
+ if (__Pyx_ImportFunction_3_0_10(module, "namespacedName", (void (**)(void))&__pyx_api_f_4lxml_5etree_namespacedName, "PyObject *(xmlNode *)") < 0) goto bad;
184
+ if (__Pyx_ImportFunction_3_0_10(module, "namespacedNameFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_namespacedNameFromNsName, "PyObject *(const xmlChar *, const xmlChar *)") < 0) goto bad;
185
+ if (__Pyx_ImportFunction_3_0_10(module, "iteratorStoreNext", (void (**)(void))&__pyx_api_f_4lxml_5etree_iteratorStoreNext, "void (struct LxmlElementIterator *, struct LxmlElement *)") < 0) goto bad;
186
+ if (__Pyx_ImportFunction_3_0_10(module, "initTagMatch", (void (**)(void))&__pyx_api_f_4lxml_5etree_initTagMatch, "void (struct LxmlElementTagMatcher *, PyObject *)") < 0) goto bad;
187
+ if (__Pyx_ImportFunction_3_0_10(module, "findOrBuildNodeNsPrefix", (void (**)(void))&__pyx_api_f_4lxml_5etree_findOrBuildNodeNsPrefix, "xmlNs *(struct LxmlDocument *, xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
188
188
  Py_DECREF(module); module = 0;
189
189
  return 0;
190
190
  bad:
lxml/html/clean.py CHANGED
@@ -1,772 +1,21 @@
1
1
  # cython: language_level=3str
2
2
 
3
- """A cleanup tool for HTML.
4
-
5
- Removes unwanted tags and content. See the `Cleaner` class for
6
- details.
7
- """
8
-
9
- import copy
10
- import re
11
- from urllib.parse import urlsplit, unquote_plus
12
-
13
- from lxml import etree
14
- from lxml.html import defs
15
- from lxml.html import fromstring, XHTML_NAMESPACE
16
- from lxml.html import xhtml_to_html, _transform_result
17
-
18
-
19
- __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
20
- 'word_break', 'word_break_html']
21
-
22
- # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
23
- # Particularly the CSS cleaning; most of the tag cleaning is integrated now
24
- # I have multiple kinds of schemes searched; but should schemes be
25
- # whitelisted instead?
26
- # max height?
27
- # remove images? Also in CSS? background attribute?
28
- # Some way to whitelist object, iframe, etc (e.g., if you want to
29
- # allow *just* embedded YouTube movies)
30
- # Log what was deleted and why?
31
- # style="behavior: ..." might be bad in IE?
32
- # Should we have something for just <meta http-equiv>? That's the worst of the
33
- # metas.
34
- # UTF-7 detections? Example:
35
- # <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
36
- # you don't always have to have the charset set, if the page has no charset
37
- # and there's UTF7-like code in it.
38
- # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php
39
-
40
-
41
- # This is an IE-specific construct you can have in a stylesheet to
42
- # run some Javascript:
43
- _replace_css_javascript = re.compile(
44
- r'expression\s*\(.*?\)', re.S|re.I).sub
45
-
46
- # Do I have to worry about @\nimport?
47
- _replace_css_import = re.compile(
48
- r'@\s*import', re.I).sub
49
-
50
- _looks_like_tag_content = re.compile(
51
- r'</?[a-zA-Z]+|\son[a-zA-Z]+\s*=',
52
- (re.ASCII)).search
53
-
54
- # All kinds of schemes besides just javascript: that can cause
55
- # execution:
56
- _find_image_dataurls = re.compile(
57
- r'data:image/(.+);base64,', re.I).findall
58
- _possibly_malicious_schemes = re.compile(
59
- r'(javascript|jscript|livescript|vbscript|data|about|mocha):',
60
- re.I).findall
61
- # SVG images can contain script content
62
- _is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).search
63
-
64
- def _has_javascript_scheme(s):
65
- safe_image_urls = 0
66
- for image_type in _find_image_dataurls(s):
67
- if _is_unsafe_image_type(image_type):
68
- return True
69
- safe_image_urls += 1
70
- return len(_possibly_malicious_schemes(s)) > safe_image_urls
71
-
72
- _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
73
-
74
- # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
75
- _conditional_comment_re = re.compile(
76
- r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
77
-
78
- _find_styled_elements = etree.XPath(
79
- "descendant-or-self::*[@style]")
80
-
81
- _find_external_links = etree.XPath(
82
- ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
83
- "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
84
- namespaces={'x':XHTML_NAMESPACE})
85
-
86
-
87
- class Cleaner:
88
- """
89
- Instances cleans the document of each of the possible offending
90
- elements. The cleaning is controlled by attributes; you can
91
- override attributes in a subclass, or set them in the constructor.
92
-
93
- ``scripts``:
94
- Removes any ``<script>`` tags.
95
-
96
- ``javascript``:
97
- Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
98
- as they could contain Javascript.
99
-
100
- ``comments``:
101
- Removes any comments.
102
-
103
- ``style``:
104
- Removes any style tags.
105
-
106
- ``inline_style``
107
- Removes any style attributes. Defaults to the value of the ``style`` option.
108
-
109
- ``links``:
110
- Removes any ``<link>`` tags
111
-
112
- ``meta``:
113
- Removes any ``<meta>`` tags
114
-
115
- ``page_structure``:
116
- Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
117
-
118
- ``processing_instructions``:
119
- Removes any processing instructions.
120
-
121
- ``embedded``:
122
- Removes any embedded objects (flash, iframes)
123
-
124
- ``frames``:
125
- Removes any frame-related tags
126
-
127
- ``forms``:
128
- Removes any form tags
129
-
130
- ``annoying_tags``:
131
- Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``
132
-
133
- ``remove_tags``:
134
- A list of tags to remove. Only the tags will be removed,
135
- their content will get pulled up into the parent tag.
136
-
137
- ``kill_tags``:
138
- A list of tags to kill. Killing also removes the tag's content,
139
- i.e. the whole subtree, not just the tag itself.
140
-
141
- ``allow_tags``:
142
- A list of tags to include (default include all).
143
-
144
- ``remove_unknown_tags``:
145
- Remove any tags that aren't standard parts of HTML.
146
-
147
- ``safe_attrs_only``:
148
- If true, only include 'safe' attributes (specifically the list
149
- from the feedparser HTML sanitisation web site).
150
-
151
- ``safe_attrs``:
152
- A set of attribute names to override the default list of attributes
153
- considered 'safe' (when safe_attrs_only=True).
154
-
155
- ``add_nofollow``:
156
- If true, then any <a> tags will have ``rel="nofollow"`` added to them.
157
-
158
- ``host_whitelist``:
159
- A list or set of hosts that you can use for embedded content
160
- (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
161
- You can also implement/override the method
162
- ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
163
- implement more complex rules for what can be embedded.
164
- Anything that passes this test will be shown, regardless of
165
- the value of (for instance) ``embedded``.
166
-
167
- Note that this parameter might not work as intended if you do not
168
- make the links absolute before doing the cleaning.
169
-
170
- Note that you may also need to set ``whitelist_tags``.
171
-
172
- ``whitelist_tags``:
173
- A set of tags that can be included with ``host_whitelist``.
174
- The default is ``iframe`` and ``embed``; you may wish to
175
- include other tags like ``script``, or you may want to
176
- implement ``allow_embedded_url`` for more control. Set to None to
177
- include all tags.
178
-
179
- This modifies the document *in place*.
180
- """
181
-
182
- scripts = True
183
- javascript = True
184
- comments = True
185
- style = False
186
- inline_style = None
187
- links = True
188
- meta = True
189
- page_structure = True
190
- processing_instructions = True
191
- embedded = True
192
- frames = True
193
- forms = True
194
- annoying_tags = True
195
- remove_tags = ()
196
- allow_tags = ()
197
- kill_tags = ()
198
- remove_unknown_tags = True
199
- safe_attrs_only = True
200
- safe_attrs = defs.safe_attrs
201
- add_nofollow = False
202
- host_whitelist = ()
203
- whitelist_tags = {'iframe', 'embed'}
204
-
205
- def __init__(self, **kw):
206
- not_an_attribute = object()
207
- for name, value in kw.items():
208
- default = getattr(self, name, not_an_attribute)
209
- if default is None or default is True or default is False:
210
- pass
211
- elif isinstance(default, (frozenset, set, tuple, list)):
212
- # Catch common error of passing ('host') instead of a tuple.
213
- if isinstance(value, str):
214
- raise TypeError(
215
- f"Expected a collection, got str: {name}={value!r}")
216
- else:
217
- raise TypeError(
218
- f"Unknown parameter: {name}={value!r}")
219
- setattr(self, name, value)
220
-
221
- if self.inline_style is None and 'inline_style' not in kw:
222
- self.inline_style = self.style
223
-
224
- if kw.get("allow_tags"):
225
- if kw.get("remove_unknown_tags"):
226
- raise ValueError("It does not make sense to pass in both "
227
- "allow_tags and remove_unknown_tags")
228
- self.remove_unknown_tags = False
229
-
230
- self.host_whitelist = frozenset(self.host_whitelist) if self.host_whitelist else ()
231
-
232
- # Used to lookup the primary URL for a given tag that is up for
233
- # removal:
234
- _tag_link_attrs = dict(
235
- script='src',
236
- link='href',
237
- # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
238
- # From what I can tell, both attributes can contain a link:
239
- applet=['code', 'object'],
240
- iframe='src',
241
- embed='src',
242
- layer='src',
243
- # FIXME: there doesn't really seem like a general way to figure out what
244
- # links an <object> tag uses; links often go in <param> tags with values
245
- # that we don't really know. You'd have to have knowledge about specific
246
- # kinds of plugins (probably keyed off classid), and match against those.
247
- ##object=?,
248
- # FIXME: not looking at the action currently, because it is more complex
249
- # than than -- if you keep the form, you should keep the form controls.
250
- ##form='action',
251
- a='href',
252
- )
253
-
254
- def __call__(self, doc):
255
- """
256
- Cleans the document.
257
- """
258
- try:
259
- getroot = doc.getroot
260
- except AttributeError:
261
- pass # Element instance
262
- else:
263
- doc = getroot() # ElementTree instance, instead of an element
264
- # convert XHTML to HTML
265
- xhtml_to_html(doc)
266
- # Normalize a case that IE treats <image> like <img>, and that
267
- # can confuse either this step or later steps.
268
- for el in doc.iter('image'):
269
- el.tag = 'img'
270
- if not self.comments:
271
- # Of course, if we were going to kill comments anyway, we don't
272
- # need to worry about this
273
- self.kill_conditional_comments(doc)
274
-
275
- kill_tags = set(self.kill_tags or ())
276
- remove_tags = set(self.remove_tags or ())
277
- allow_tags = set(self.allow_tags or ())
278
-
279
- if self.scripts:
280
- kill_tags.add('script')
281
- if self.safe_attrs_only:
282
- safe_attrs = set(self.safe_attrs)
283
- for el in doc.iter(etree.Element):
284
- attrib = el.attrib
285
- for aname in attrib.keys():
286
- if aname not in safe_attrs:
287
- del attrib[aname]
288
- if self.javascript:
289
- if not (self.safe_attrs_only and
290
- self.safe_attrs == defs.safe_attrs):
291
- # safe_attrs handles events attributes itself
292
- for el in doc.iter(etree.Element):
293
- attrib = el.attrib
294
- for aname in attrib.keys():
295
- if aname.startswith('on'):
296
- del attrib[aname]
297
- doc.rewrite_links(self._remove_javascript_link,
298
- resolve_base_href=False)
299
- # If we're deleting style then we don't have to remove JS links
300
- # from styles, otherwise...
301
- if not self.inline_style:
302
- for el in _find_styled_elements(doc):
303
- old = el.get('style')
304
- new = _replace_css_javascript('', old)
305
- new = _replace_css_import('', new)
306
- if self._has_sneaky_javascript(new):
307
- # Something tricky is going on...
308
- del el.attrib['style']
309
- elif new != old:
310
- el.set('style', new)
311
- if not self.style:
312
- for el in list(doc.iter('style')):
313
- if el.get('type', '').lower().strip() == 'text/javascript':
314
- el.drop_tree()
315
- continue
316
- old = el.text or ''
317
- new = _replace_css_javascript('', old)
318
- # The imported CSS can do anything; we just can't allow:
319
- new = _replace_css_import('', new)
320
- if self._has_sneaky_javascript(new):
321
- # Something tricky is going on...
322
- el.text = '/* deleted */'
323
- elif new != old:
324
- el.text = new
325
- if self.comments:
326
- kill_tags.add(etree.Comment)
327
- if self.processing_instructions:
328
- kill_tags.add(etree.ProcessingInstruction)
329
- if self.style:
330
- kill_tags.add('style')
331
- if self.inline_style:
332
- etree.strip_attributes(doc, 'style')
333
- if self.links:
334
- kill_tags.add('link')
335
- elif self.style or self.javascript:
336
- # We must get rid of included stylesheets if Javascript is not
337
- # allowed, as you can put Javascript in them
338
- for el in list(doc.iter('link')):
339
- if 'stylesheet' in el.get('rel', '').lower():
340
- # Note this kills alternate stylesheets as well
341
- if not self.allow_element(el):
342
- el.drop_tree()
343
- if self.meta:
344
- kill_tags.add('meta')
345
- if self.page_structure:
346
- remove_tags.update(('head', 'html', 'title'))
347
- if self.embedded:
348
- # FIXME: is <layer> really embedded?
349
- # We should get rid of any <param> tags not inside <applet>;
350
- # These are not really valid anyway.
351
- for el in list(doc.iter('param')):
352
- parent = el.getparent()
353
- while parent is not None and parent.tag not in ('applet', 'object'):
354
- parent = parent.getparent()
355
- if parent is None:
356
- el.drop_tree()
357
- kill_tags.update(('applet',))
358
- # The alternate contents that are in an iframe are a good fallback:
359
- remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
360
- if self.frames:
361
- # FIXME: ideally we should look at the frame links, but
362
- # generally frames don't mix properly with an HTML
363
- # fragment anyway.
364
- kill_tags.update(defs.frame_tags)
365
- if self.forms:
366
- remove_tags.add('form')
367
- kill_tags.update(('button', 'input', 'select', 'textarea'))
368
- if self.annoying_tags:
369
- remove_tags.update(('blink', 'marquee'))
370
-
371
- _remove = []
372
- _kill = []
373
- for el in doc.iter():
374
- if el.tag in kill_tags:
375
- if self.allow_element(el):
376
- continue
377
- _kill.append(el)
378
- elif el.tag in remove_tags:
379
- if self.allow_element(el):
380
- continue
381
- _remove.append(el)
382
-
383
- if _remove and _remove[0] == doc:
384
- # We have to drop the parent-most tag, which we can't
385
- # do. Instead we'll rewrite it:
386
- el = _remove.pop(0)
387
- el.tag = 'div'
388
- el.attrib.clear()
389
- elif _kill and _kill[0] == doc:
390
- # We have to drop the parent-most element, which we can't
391
- # do. Instead we'll clear it:
392
- el = _kill.pop(0)
393
- if el.tag != 'html':
394
- el.tag = 'div'
395
- el.clear()
396
-
397
- _kill.reverse() # start with innermost tags
398
- for el in _kill:
399
- el.drop_tree()
400
- for el in _remove:
401
- el.drop_tag()
402
-
403
- if self.remove_unknown_tags:
404
- if allow_tags:
405
- raise ValueError(
406
- "It does not make sense to pass in both allow_tags and remove_unknown_tags")
407
- allow_tags = set(defs.tags)
408
- if allow_tags:
409
- # make sure we do not remove comments/PIs if users want them (which is rare enough)
410
- if not self.comments:
411
- allow_tags.add(etree.Comment)
412
- if not self.processing_instructions:
413
- allow_tags.add(etree.ProcessingInstruction)
414
-
415
- bad = []
416
- for el in doc.iter():
417
- if el.tag not in allow_tags:
418
- bad.append(el)
419
- if bad:
420
- if bad[0] is doc:
421
- el = bad.pop(0)
422
- el.tag = 'div'
423
- el.attrib.clear()
424
- for el in bad:
425
- el.drop_tag()
426
- if self.add_nofollow:
427
- for el in _find_external_links(doc):
428
- if not self.allow_follow(el):
429
- rel = el.get('rel')
430
- if rel:
431
- if ('nofollow' in rel
432
- and ' nofollow ' in (' %s ' % rel)):
433
- continue
434
- rel = '%s nofollow' % rel
435
- else:
436
- rel = 'nofollow'
437
- el.set('rel', rel)
438
-
439
- def allow_follow(self, anchor):
440
- """
441
- Override to suppress rel="nofollow" on some anchors.
442
- """
443
- return False
444
-
445
- def allow_element(self, el):
446
- """
447
- Decide whether an element is configured to be accepted or rejected.
448
-
449
- :param el: an element.
450
- :return: true to accept the element or false to reject/discard it.
451
- """
452
- if el.tag not in self._tag_link_attrs:
453
- return False
454
- attr = self._tag_link_attrs[el.tag]
455
- if isinstance(attr, (list, tuple)):
456
- for one_attr in attr:
457
- url = el.get(one_attr)
458
- if not url:
459
- return False
460
- if not self.allow_embedded_url(el, url):
461
- return False
462
- return True
463
- else:
464
- url = el.get(attr)
465
- if not url:
466
- return False
467
- return self.allow_embedded_url(el, url)
468
-
469
- def allow_embedded_url(self, el, url):
470
- """
471
- Decide whether a URL that was found in an element's attributes or text
472
- if configured to be accepted or rejected.
473
-
474
- :param el: an element.
475
- :param url: a URL found on the element.
476
- :return: true to accept the URL and false to reject it.
477
- """
478
- if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
479
- return False
480
- parts = urlsplit(url)
481
- if parts.scheme not in ('http', 'https'):
482
- return False
483
- if parts.hostname in self.host_whitelist:
484
- return True
485
- return False
486
-
487
- def kill_conditional_comments(self, doc):
488
- """
489
- IE conditional comments basically embed HTML that the parser
490
- doesn't normally see. We can't allow anything like that, so
491
- we'll kill any comments that could be conditional.
492
- """
493
- has_conditional_comment = _conditional_comment_re.search
494
- self._kill_elements(
495
- doc, lambda el: has_conditional_comment(el.text),
496
- etree.Comment)
497
-
498
- def _kill_elements(self, doc, condition, iterate=None):
499
- bad = []
500
- for el in doc.iter(iterate):
501
- if condition(el):
502
- bad.append(el)
503
- for el in bad:
504
- el.drop_tree()
505
-
506
- def _remove_javascript_link(self, link):
507
- # links like "j a v a s c r i p t:" might be interpreted in IE
508
- new = _substitute_whitespace('', unquote_plus(link))
509
- if _has_javascript_scheme(new):
510
- # FIXME: should this be None to delete?
511
- return ''
512
- return link
513
-
514
- _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
515
-
516
- def _has_sneaky_javascript(self, style):
517
- """
518
- Depending on the browser, stuff like ``e x p r e s s i o n(...)``
519
- can get interpreted, or ``expre/* stuff */ssion(...)``. This
520
- checks for attempt to do stuff like this.
521
-
522
- Typically the response will be to kill the entire style; if you
523
- have just a bit of Javascript in the style another rule will catch
524
- that and remove only the Javascript from the style; this catches
525
- more sneaky attempts.
526
- """
527
- style = self._substitute_comments('', style)
528
- style = style.replace('\\', '')
529
- style = _substitute_whitespace('', style)
530
- style = style.lower()
531
- if _has_javascript_scheme(style):
532
- return True
533
- if 'expression(' in style:
534
- return True
535
- if '@import' in style:
536
- return True
537
- if '</noscript' in style:
538
- # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
539
- return True
540
- if _looks_like_tag_content(style):
541
- # e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
542
- return True
543
- return False
544
-
545
- def clean_html(self, html):
546
- result_type = type(html)
547
- if isinstance(html, (str, bytes)):
548
- doc = fromstring(html)
549
- else:
550
- doc = copy.deepcopy(html)
551
- self(doc)
552
- return _transform_result(result_type, doc)
553
-
554
- clean = Cleaner()
555
- clean_html = clean.clean_html
556
-
557
- ############################################################
558
- ## Autolinking
559
- ############################################################
560
-
561
- _link_regexes = [
562
- re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
563
- # This is conservative, but autolinking can be a bit conservative:
564
- re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
565
- ]
566
-
567
- _avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
568
-
569
- _avoid_hosts = [
570
- re.compile(r'^localhost', re.I),
571
- re.compile(r'\bexample\.(?:com|org|net)$', re.I),
572
- re.compile(r'^127\.0\.0\.1$'),
3
+ """Backward-compatibility module for lxml_html_clean"""
4
+
5
+ try:
6
+ from lxml_html_clean import *
7
+
8
+ __all__ = [
9
+ "clean_html",
10
+ "clean",
11
+ "Cleaner",
12
+ "autolink",
13
+ "autolink_html",
14
+ "word_break",
15
+ "word_break_html",
573
16
  ]
574
-
575
- _avoid_classes = ['nolink']
576
-
577
- def autolink(el, link_regexes=_link_regexes,
578
- avoid_elements=_avoid_elements,
579
- avoid_hosts=_avoid_hosts,
580
- avoid_classes=_avoid_classes):
581
- """
582
- Turn any URLs into links.
583
-
584
- It will search for links identified by the given regular
585
- expressions (by default mailto and http(s) links).
586
-
587
- It won't link text in an element in avoid_elements, or an element
588
- with a class in avoid_classes. It won't link to anything with a
589
- host that matches one of the regular expressions in avoid_hosts
590
- (default localhost and 127.0.0.1).
591
-
592
- If you pass in an element, the element's tail will not be
593
- substituted, only the contents of the element.
594
- """
595
- if el.tag in avoid_elements:
596
- return
597
- class_name = el.get('class')
598
- if class_name:
599
- class_name = class_name.split()
600
- for match_class in avoid_classes:
601
- if match_class in class_name:
602
- return
603
- for child in list(el):
604
- autolink(child, link_regexes=link_regexes,
605
- avoid_elements=avoid_elements,
606
- avoid_hosts=avoid_hosts,
607
- avoid_classes=avoid_classes)
608
- if child.tail:
609
- text, tail_children = _link_text(
610
- child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
611
- if tail_children:
612
- child.tail = text
613
- index = el.index(child)
614
- el[index+1:index+1] = tail_children
615
- if el.text:
616
- text, pre_children = _link_text(
617
- el.text, link_regexes, avoid_hosts, factory=el.makeelement)
618
- if pre_children:
619
- el.text = text
620
- el[:0] = pre_children
621
-
622
- def _link_text(text, link_regexes, avoid_hosts, factory):
623
- leading_text = ''
624
- links = []
625
- last_pos = 0
626
- while 1:
627
- best_match, best_pos = None, None
628
- for regex in link_regexes:
629
- regex_pos = last_pos
630
- while 1:
631
- match = regex.search(text, pos=regex_pos)
632
- if match is None:
633
- break
634
- host = match.group('host')
635
- for host_regex in avoid_hosts:
636
- if host_regex.search(host):
637
- regex_pos = match.end()
638
- break
639
- else:
640
- break
641
- if match is None:
642
- continue
643
- if best_pos is None or match.start() < best_pos:
644
- best_match = match
645
- best_pos = match.start()
646
- if best_match is None:
647
- # No more matches
648
- if links:
649
- assert not links[-1].tail
650
- links[-1].tail = text
651
- else:
652
- assert not leading_text
653
- leading_text = text
654
- break
655
- link = best_match.group(0)
656
- end = best_match.end()
657
- if link.endswith('.') or link.endswith(','):
658
- # These punctuation marks shouldn't end a link
659
- end -= 1
660
- link = link[:-1]
661
- prev_text = text[:best_match.start()]
662
- if links:
663
- assert not links[-1].tail
664
- links[-1].tail = prev_text
665
- else:
666
- assert not leading_text
667
- leading_text = prev_text
668
- anchor = factory('a')
669
- anchor.set('href', link)
670
- body = best_match.group('body')
671
- if not body:
672
- body = link
673
- if body.endswith('.') or body.endswith(','):
674
- body = body[:-1]
675
- anchor.text = body
676
- links.append(anchor)
677
- text = text[end:]
678
- return leading_text, links
679
-
680
- def autolink_html(html, *args, **kw):
681
- result_type = type(html)
682
- if isinstance(html, (str, bytes)):
683
- doc = fromstring(html)
684
- else:
685
- doc = copy.deepcopy(html)
686
- autolink(doc, *args, **kw)
687
- return _transform_result(result_type, doc)
688
-
689
- autolink_html.__doc__ = autolink.__doc__
690
-
691
- ############################################################
692
- ## Word wrapping
693
- ############################################################
694
-
695
- _avoid_word_break_elements = ['pre', 'textarea', 'code']
696
- _avoid_word_break_classes = ['nobreak']
697
-
698
- def word_break(el, max_width=40,
699
- avoid_elements=_avoid_word_break_elements,
700
- avoid_classes=_avoid_word_break_classes,
701
- break_character=chr(0x200b)):
702
- """
703
- Breaks any long words found in the body of the text (not attributes).
704
-
705
- Doesn't effect any of the tags in avoid_elements, by default
706
- ``<textarea>`` and ``<pre>``
707
-
708
- Breaks words by inserting &#8203;, which is a unicode character
709
- for Zero Width Space character. This generally takes up no space
710
- in rendering, but does copy as a space, and in monospace contexts
711
- usually takes up space.
712
-
713
- See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
714
- """
715
- # Character suggestion of &#8203 comes from:
716
- # http://www.cs.tut.fi/~jkorpela/html/nobr.html
717
- if el.tag in _avoid_word_break_elements:
718
- return
719
- class_name = el.get('class')
720
- if class_name:
721
- dont_break = False
722
- class_name = class_name.split()
723
- for avoid in avoid_classes:
724
- if avoid in class_name:
725
- dont_break = True
726
- break
727
- if dont_break:
728
- return
729
- if el.text:
730
- el.text = _break_text(el.text, max_width, break_character)
731
- for child in el:
732
- word_break(child, max_width=max_width,
733
- avoid_elements=avoid_elements,
734
- avoid_classes=avoid_classes,
735
- break_character=break_character)
736
- if child.tail:
737
- child.tail = _break_text(child.tail, max_width, break_character)
738
-
739
- def word_break_html(html, *args, **kw):
740
- result_type = type(html)
741
- doc = fromstring(html)
742
- word_break(doc, *args, **kw)
743
- return _transform_result(result_type, doc)
744
-
745
- def _break_text(text, max_width, break_character):
746
- words = text.split()
747
- for word in words:
748
- if len(word) > max_width:
749
- replacement = _insert_break(word, max_width, break_character)
750
- text = text.replace(word, replacement)
751
- return text
752
-
753
- _break_prefer_re = re.compile(r'[^a-z]', re.I)
754
-
755
- def _insert_break(word, width, break_character):
756
- orig_word = word
757
- result = ''
758
- while len(word) > width:
759
- start = word[:width]
760
- breaks = list(_break_prefer_re.finditer(start))
761
- if breaks:
762
- last_break = breaks[-1]
763
- # Only walk back up to 10 characters to find a nice break:
764
- if last_break.end() > width-10:
765
- # FIXME: should the break character be at the end of the
766
- # chunk, or the beginning of the next chunk?
767
- start = word[:last_break.end()]
768
- result += start + break_character
769
- word = word[len(start):]
770
- result += word
771
- return result
772
-
17
+ except ImportError:
18
+ raise ImportError(
19
+ "lxml.html.clean module is now a separate project lxml_html_clean.\n"
20
+ "Install lxml[html_clean] or lxml_html_clean directly."
21
+ ) from None
@@ -1,3 +1,3 @@
1
1
  #ifndef LXML_VERSION_STRING
2
- #define LXML_VERSION_STRING "5.1.1"
2
+ #define LXML_VERSION_STRING "5.2.1"
3
3
  #endif
lxml/lxml.etree.h CHANGED
@@ -1,4 +1,4 @@
1
- /* Generated by Cython 3.0.9 */
1
+ /* Generated by Cython 3.0.10 */
2
2
 
3
3
  #ifndef __PYX_HAVE__lxml__etree
4
4
  #define __PYX_HAVE__lxml__etree
@@ -66,7 +66,7 @@ struct LxmlElementTree {
66
66
  struct LxmlElement *_context_node;
67
67
  };
68
68
 
69
- /* "lxml/etree.pyx":2645
69
+ /* "lxml/etree.pyx":2646
70
70
  *
71
71
  *
72
72
  * cdef public class _ElementTagMatcher [ object LxmlElementTagMatcher, # <<<<<<<<<<<<<<
@@ -82,7 +82,7 @@ struct LxmlElementTagMatcher {
82
82
  char *_name;
83
83
  };
84
84
 
85
- /* "lxml/etree.pyx":2676
85
+ /* "lxml/etree.pyx":2677
86
86
  * self._name = NULL
87
87
  *
88
88
  * cdef public class _ElementIterator(_ElementTagMatcher) [ # <<<<<<<<<<<<<<
lxml/lxml.etree_api.h CHANGED
@@ -1,4 +1,4 @@
1
- /* Generated by Cython 3.0.9 */
1
+ /* Generated by Cython 3.0.10 */
2
2
 
3
3
  #ifndef __PYX_HAVE_API__lxml__etree
4
4
  #define __PYX_HAVE_API__lxml__etree
@@ -98,9 +98,9 @@ static void (*__pyx_api_f_4lxml_5etree_initTagMatch)(struct LxmlElementTagMatche
98
98
  #define initTagMatch __pyx_api_f_4lxml_5etree_initTagMatch
99
99
  static xmlNs *(*__pyx_api_f_4lxml_5etree_findOrBuildNodeNsPrefix)(struct LxmlDocument *, xmlNode *, const xmlChar *, const xmlChar *) = 0;
100
100
  #define findOrBuildNodeNsPrefix __pyx_api_f_4lxml_5etree_findOrBuildNodeNsPrefix
101
- #ifndef __PYX_HAVE_RT_ImportFunction_3_0_9
102
- #define __PYX_HAVE_RT_ImportFunction_3_0_9
103
- static int __Pyx_ImportFunction_3_0_9(PyObject *module, const char *funcname, void (**f)(void), const char *sig) {
101
+ #ifndef __PYX_HAVE_RT_ImportFunction_3_0_10
102
+ #define __PYX_HAVE_RT_ImportFunction_3_0_10
103
+ static int __Pyx_ImportFunction_3_0_10(PyObject *module, const char *funcname, void (**f)(void), const char *sig) {
104
104
  PyObject *d = 0;
105
105
  PyObject *cobj = 0;
106
106
  union {
@@ -140,51 +140,51 @@ static int import_lxml__etree(void) {
140
140
  PyObject *module = 0;
141
141
  module = PyImport_ImportModule("lxml.etree");
142
142
  if (!module) goto bad;
143
- if (__Pyx_ImportFunction_3_0_9(module, "deepcopyNodeToDocument", (void (**)(void))&__pyx_api_f_4lxml_5etree_deepcopyNodeToDocument, "struct LxmlElement *(struct LxmlDocument *, xmlNode *)") < 0) goto bad;
144
- if (__Pyx_ImportFunction_3_0_9(module, "elementTreeFactory", (void (**)(void))&__pyx_api_f_4lxml_5etree_elementTreeFactory, "struct LxmlElementTree *(struct LxmlElement *)") < 0) goto bad;
145
- if (__Pyx_ImportFunction_3_0_9(module, "newElementTree", (void (**)(void))&__pyx_api_f_4lxml_5etree_newElementTree, "struct LxmlElementTree *(struct LxmlElement *, PyObject *)") < 0) goto bad;
146
- if (__Pyx_ImportFunction_3_0_9(module, "adoptExternalDocument", (void (**)(void))&__pyx_api_f_4lxml_5etree_adoptExternalDocument, "struct LxmlElementTree *(xmlDoc *, PyObject *, int)") < 0) goto bad;
147
- if (__Pyx_ImportFunction_3_0_9(module, "elementFactory", (void (**)(void))&__pyx_api_f_4lxml_5etree_elementFactory, "struct LxmlElement *(struct LxmlDocument *, xmlNode *)") < 0) goto bad;
148
- if (__Pyx_ImportFunction_3_0_9(module, "makeElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_makeElement, "struct LxmlElement *(PyObject *, struct LxmlDocument *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *)") < 0) goto bad;
149
- if (__Pyx_ImportFunction_3_0_9(module, "makeSubElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_makeSubElement, "struct LxmlElement *(struct LxmlElement *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *)") < 0) goto bad;
150
- if (__Pyx_ImportFunction_3_0_9(module, "setElementClassLookupFunction", (void (**)(void))&__pyx_api_f_4lxml_5etree_setElementClassLookupFunction, "void (_element_class_lookup_function, PyObject *)") < 0) goto bad;
151
- if (__Pyx_ImportFunction_3_0_9(module, "lookupDefaultElementClass", (void (**)(void))&__pyx_api_f_4lxml_5etree_lookupDefaultElementClass, "PyObject *(PyObject *, PyObject *, xmlNode *)") < 0) goto bad;
152
- if (__Pyx_ImportFunction_3_0_9(module, "lookupNamespaceElementClass", (void (**)(void))&__pyx_api_f_4lxml_5etree_lookupNamespaceElementClass, "PyObject *(PyObject *, PyObject *, xmlNode *)") < 0) goto bad;
153
- if (__Pyx_ImportFunction_3_0_9(module, "callLookupFallback", (void (**)(void))&__pyx_api_f_4lxml_5etree_callLookupFallback, "PyObject *(struct LxmlFallbackElementClassLookup *, struct LxmlDocument *, xmlNode *)") < 0) goto bad;
154
- if (__Pyx_ImportFunction_3_0_9(module, "tagMatches", (void (**)(void))&__pyx_api_f_4lxml_5etree_tagMatches, "int (xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
155
- if (__Pyx_ImportFunction_3_0_9(module, "documentOrRaise", (void (**)(void))&__pyx_api_f_4lxml_5etree_documentOrRaise, "struct LxmlDocument *(PyObject *)") < 0) goto bad;
156
- if (__Pyx_ImportFunction_3_0_9(module, "rootNodeOrRaise", (void (**)(void))&__pyx_api_f_4lxml_5etree_rootNodeOrRaise, "struct LxmlElement *(PyObject *)") < 0) goto bad;
157
- if (__Pyx_ImportFunction_3_0_9(module, "hasText", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasText, "int (xmlNode *)") < 0) goto bad;
158
- if (__Pyx_ImportFunction_3_0_9(module, "hasTail", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasTail, "int (xmlNode *)") < 0) goto bad;
159
- if (__Pyx_ImportFunction_3_0_9(module, "textOf", (void (**)(void))&__pyx_api_f_4lxml_5etree_textOf, "PyObject *(xmlNode *)") < 0) goto bad;
160
- if (__Pyx_ImportFunction_3_0_9(module, "tailOf", (void (**)(void))&__pyx_api_f_4lxml_5etree_tailOf, "PyObject *(xmlNode *)") < 0) goto bad;
161
- if (__Pyx_ImportFunction_3_0_9(module, "setNodeText", (void (**)(void))&__pyx_api_f_4lxml_5etree_setNodeText, "int (xmlNode *, PyObject *)") < 0) goto bad;
162
- if (__Pyx_ImportFunction_3_0_9(module, "setTailText", (void (**)(void))&__pyx_api_f_4lxml_5etree_setTailText, "int (xmlNode *, PyObject *)") < 0) goto bad;
163
- if (__Pyx_ImportFunction_3_0_9(module, "attributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_attributeValue, "PyObject *(xmlNode *, xmlAttr *)") < 0) goto bad;
164
- if (__Pyx_ImportFunction_3_0_9(module, "attributeValueFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_attributeValueFromNsName, "PyObject *(xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
165
- if (__Pyx_ImportFunction_3_0_9(module, "getAttributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_getAttributeValue, "PyObject *(struct LxmlElement *, PyObject *, PyObject *)") < 0) goto bad;
166
- if (__Pyx_ImportFunction_3_0_9(module, "iterattributes", (void (**)(void))&__pyx_api_f_4lxml_5etree_iterattributes, "PyObject *(struct LxmlElement *, int)") < 0) goto bad;
167
- if (__Pyx_ImportFunction_3_0_9(module, "collectAttributes", (void (**)(void))&__pyx_api_f_4lxml_5etree_collectAttributes, "PyObject *(xmlNode *, int)") < 0) goto bad;
168
- if (__Pyx_ImportFunction_3_0_9(module, "setAttributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_setAttributeValue, "int (struct LxmlElement *, PyObject *, PyObject *)") < 0) goto bad;
169
- if (__Pyx_ImportFunction_3_0_9(module, "delAttribute", (void (**)(void))&__pyx_api_f_4lxml_5etree_delAttribute, "int (struct LxmlElement *, PyObject *)") < 0) goto bad;
170
- if (__Pyx_ImportFunction_3_0_9(module, "delAttributeFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_delAttributeFromNsName, "int (xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
171
- if (__Pyx_ImportFunction_3_0_9(module, "hasChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasChild, "int (xmlNode *)") < 0) goto bad;
172
- if (__Pyx_ImportFunction_3_0_9(module, "findChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChild, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
173
- if (__Pyx_ImportFunction_3_0_9(module, "findChildForwards", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChildForwards, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
174
- if (__Pyx_ImportFunction_3_0_9(module, "findChildBackwards", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChildBackwards, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
175
- if (__Pyx_ImportFunction_3_0_9(module, "nextElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_nextElement, "xmlNode *(xmlNode *)") < 0) goto bad;
176
- if (__Pyx_ImportFunction_3_0_9(module, "previousElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_previousElement, "xmlNode *(xmlNode *)") < 0) goto bad;
177
- if (__Pyx_ImportFunction_3_0_9(module, "appendChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_appendChild, "void (struct LxmlElement *, struct LxmlElement *)") < 0) goto bad;
178
- if (__Pyx_ImportFunction_3_0_9(module, "appendChildToElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_appendChildToElement, "int (struct LxmlElement *, struct LxmlElement *)") < 0) goto bad;
179
- if (__Pyx_ImportFunction_3_0_9(module, "pyunicode", (void (**)(void))&__pyx_api_f_4lxml_5etree_pyunicode, "PyObject *(const xmlChar *)") < 0) goto bad;
180
- if (__Pyx_ImportFunction_3_0_9(module, "utf8", (void (**)(void))&__pyx_api_f_4lxml_5etree_utf8, "PyObject *(PyObject *)") < 0) goto bad;
181
- if (__Pyx_ImportFunction_3_0_9(module, "getNsTag", (void (**)(void))&__pyx_api_f_4lxml_5etree_getNsTag, "PyObject *(PyObject *)") < 0) goto bad;
182
- if (__Pyx_ImportFunction_3_0_9(module, "getNsTagWithEmptyNs", (void (**)(void))&__pyx_api_f_4lxml_5etree_getNsTagWithEmptyNs, "PyObject *(PyObject *)") < 0) goto bad;
183
- if (__Pyx_ImportFunction_3_0_9(module, "namespacedName", (void (**)(void))&__pyx_api_f_4lxml_5etree_namespacedName, "PyObject *(xmlNode *)") < 0) goto bad;
184
- if (__Pyx_ImportFunction_3_0_9(module, "namespacedNameFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_namespacedNameFromNsName, "PyObject *(const xmlChar *, const xmlChar *)") < 0) goto bad;
185
- if (__Pyx_ImportFunction_3_0_9(module, "iteratorStoreNext", (void (**)(void))&__pyx_api_f_4lxml_5etree_iteratorStoreNext, "void (struct LxmlElementIterator *, struct LxmlElement *)") < 0) goto bad;
186
- if (__Pyx_ImportFunction_3_0_9(module, "initTagMatch", (void (**)(void))&__pyx_api_f_4lxml_5etree_initTagMatch, "void (struct LxmlElementTagMatcher *, PyObject *)") < 0) goto bad;
187
- if (__Pyx_ImportFunction_3_0_9(module, "findOrBuildNodeNsPrefix", (void (**)(void))&__pyx_api_f_4lxml_5etree_findOrBuildNodeNsPrefix, "xmlNs *(struct LxmlDocument *, xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
143
+ if (__Pyx_ImportFunction_3_0_10(module, "deepcopyNodeToDocument", (void (**)(void))&__pyx_api_f_4lxml_5etree_deepcopyNodeToDocument, "struct LxmlElement *(struct LxmlDocument *, xmlNode *)") < 0) goto bad;
144
+ if (__Pyx_ImportFunction_3_0_10(module, "elementTreeFactory", (void (**)(void))&__pyx_api_f_4lxml_5etree_elementTreeFactory, "struct LxmlElementTree *(struct LxmlElement *)") < 0) goto bad;
145
+ if (__Pyx_ImportFunction_3_0_10(module, "newElementTree", (void (**)(void))&__pyx_api_f_4lxml_5etree_newElementTree, "struct LxmlElementTree *(struct LxmlElement *, PyObject *)") < 0) goto bad;
146
+ if (__Pyx_ImportFunction_3_0_10(module, "adoptExternalDocument", (void (**)(void))&__pyx_api_f_4lxml_5etree_adoptExternalDocument, "struct LxmlElementTree *(xmlDoc *, PyObject *, int)") < 0) goto bad;
147
+ if (__Pyx_ImportFunction_3_0_10(module, "elementFactory", (void (**)(void))&__pyx_api_f_4lxml_5etree_elementFactory, "struct LxmlElement *(struct LxmlDocument *, xmlNode *)") < 0) goto bad;
148
+ if (__Pyx_ImportFunction_3_0_10(module, "makeElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_makeElement, "struct LxmlElement *(PyObject *, struct LxmlDocument *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *)") < 0) goto bad;
149
+ if (__Pyx_ImportFunction_3_0_10(module, "makeSubElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_makeSubElement, "struct LxmlElement *(struct LxmlElement *, PyObject *, PyObject *, PyObject *, PyObject *, PyObject *)") < 0) goto bad;
150
+ if (__Pyx_ImportFunction_3_0_10(module, "setElementClassLookupFunction", (void (**)(void))&__pyx_api_f_4lxml_5etree_setElementClassLookupFunction, "void (_element_class_lookup_function, PyObject *)") < 0) goto bad;
151
+ if (__Pyx_ImportFunction_3_0_10(module, "lookupDefaultElementClass", (void (**)(void))&__pyx_api_f_4lxml_5etree_lookupDefaultElementClass, "PyObject *(PyObject *, PyObject *, xmlNode *)") < 0) goto bad;
152
+ if (__Pyx_ImportFunction_3_0_10(module, "lookupNamespaceElementClass", (void (**)(void))&__pyx_api_f_4lxml_5etree_lookupNamespaceElementClass, "PyObject *(PyObject *, PyObject *, xmlNode *)") < 0) goto bad;
153
+ if (__Pyx_ImportFunction_3_0_10(module, "callLookupFallback", (void (**)(void))&__pyx_api_f_4lxml_5etree_callLookupFallback, "PyObject *(struct LxmlFallbackElementClassLookup *, struct LxmlDocument *, xmlNode *)") < 0) goto bad;
154
+ if (__Pyx_ImportFunction_3_0_10(module, "tagMatches", (void (**)(void))&__pyx_api_f_4lxml_5etree_tagMatches, "int (xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
155
+ if (__Pyx_ImportFunction_3_0_10(module, "documentOrRaise", (void (**)(void))&__pyx_api_f_4lxml_5etree_documentOrRaise, "struct LxmlDocument *(PyObject *)") < 0) goto bad;
156
+ if (__Pyx_ImportFunction_3_0_10(module, "rootNodeOrRaise", (void (**)(void))&__pyx_api_f_4lxml_5etree_rootNodeOrRaise, "struct LxmlElement *(PyObject *)") < 0) goto bad;
157
+ if (__Pyx_ImportFunction_3_0_10(module, "hasText", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasText, "int (xmlNode *)") < 0) goto bad;
158
+ if (__Pyx_ImportFunction_3_0_10(module, "hasTail", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasTail, "int (xmlNode *)") < 0) goto bad;
159
+ if (__Pyx_ImportFunction_3_0_10(module, "textOf", (void (**)(void))&__pyx_api_f_4lxml_5etree_textOf, "PyObject *(xmlNode *)") < 0) goto bad;
160
+ if (__Pyx_ImportFunction_3_0_10(module, "tailOf", (void (**)(void))&__pyx_api_f_4lxml_5etree_tailOf, "PyObject *(xmlNode *)") < 0) goto bad;
161
+ if (__Pyx_ImportFunction_3_0_10(module, "setNodeText", (void (**)(void))&__pyx_api_f_4lxml_5etree_setNodeText, "int (xmlNode *, PyObject *)") < 0) goto bad;
162
+ if (__Pyx_ImportFunction_3_0_10(module, "setTailText", (void (**)(void))&__pyx_api_f_4lxml_5etree_setTailText, "int (xmlNode *, PyObject *)") < 0) goto bad;
163
+ if (__Pyx_ImportFunction_3_0_10(module, "attributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_attributeValue, "PyObject *(xmlNode *, xmlAttr *)") < 0) goto bad;
164
+ if (__Pyx_ImportFunction_3_0_10(module, "attributeValueFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_attributeValueFromNsName, "PyObject *(xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
165
+ if (__Pyx_ImportFunction_3_0_10(module, "getAttributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_getAttributeValue, "PyObject *(struct LxmlElement *, PyObject *, PyObject *)") < 0) goto bad;
166
+ if (__Pyx_ImportFunction_3_0_10(module, "iterattributes", (void (**)(void))&__pyx_api_f_4lxml_5etree_iterattributes, "PyObject *(struct LxmlElement *, int)") < 0) goto bad;
167
+ if (__Pyx_ImportFunction_3_0_10(module, "collectAttributes", (void (**)(void))&__pyx_api_f_4lxml_5etree_collectAttributes, "PyObject *(xmlNode *, int)") < 0) goto bad;
168
+ if (__Pyx_ImportFunction_3_0_10(module, "setAttributeValue", (void (**)(void))&__pyx_api_f_4lxml_5etree_setAttributeValue, "int (struct LxmlElement *, PyObject *, PyObject *)") < 0) goto bad;
169
+ if (__Pyx_ImportFunction_3_0_10(module, "delAttribute", (void (**)(void))&__pyx_api_f_4lxml_5etree_delAttribute, "int (struct LxmlElement *, PyObject *)") < 0) goto bad;
170
+ if (__Pyx_ImportFunction_3_0_10(module, "delAttributeFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_delAttributeFromNsName, "int (xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
171
+ if (__Pyx_ImportFunction_3_0_10(module, "hasChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_hasChild, "int (xmlNode *)") < 0) goto bad;
172
+ if (__Pyx_ImportFunction_3_0_10(module, "findChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChild, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
173
+ if (__Pyx_ImportFunction_3_0_10(module, "findChildForwards", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChildForwards, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
174
+ if (__Pyx_ImportFunction_3_0_10(module, "findChildBackwards", (void (**)(void))&__pyx_api_f_4lxml_5etree_findChildBackwards, "xmlNode *(xmlNode *, Py_ssize_t)") < 0) goto bad;
175
+ if (__Pyx_ImportFunction_3_0_10(module, "nextElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_nextElement, "xmlNode *(xmlNode *)") < 0) goto bad;
176
+ if (__Pyx_ImportFunction_3_0_10(module, "previousElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_previousElement, "xmlNode *(xmlNode *)") < 0) goto bad;
177
+ if (__Pyx_ImportFunction_3_0_10(module, "appendChild", (void (**)(void))&__pyx_api_f_4lxml_5etree_appendChild, "void (struct LxmlElement *, struct LxmlElement *)") < 0) goto bad;
178
+ if (__Pyx_ImportFunction_3_0_10(module, "appendChildToElement", (void (**)(void))&__pyx_api_f_4lxml_5etree_appendChildToElement, "int (struct LxmlElement *, struct LxmlElement *)") < 0) goto bad;
179
+ if (__Pyx_ImportFunction_3_0_10(module, "pyunicode", (void (**)(void))&__pyx_api_f_4lxml_5etree_pyunicode, "PyObject *(const xmlChar *)") < 0) goto bad;
180
+ if (__Pyx_ImportFunction_3_0_10(module, "utf8", (void (**)(void))&__pyx_api_f_4lxml_5etree_utf8, "PyObject *(PyObject *)") < 0) goto bad;
181
+ if (__Pyx_ImportFunction_3_0_10(module, "getNsTag", (void (**)(void))&__pyx_api_f_4lxml_5etree_getNsTag, "PyObject *(PyObject *)") < 0) goto bad;
182
+ if (__Pyx_ImportFunction_3_0_10(module, "getNsTagWithEmptyNs", (void (**)(void))&__pyx_api_f_4lxml_5etree_getNsTagWithEmptyNs, "PyObject *(PyObject *)") < 0) goto bad;
183
+ if (__Pyx_ImportFunction_3_0_10(module, "namespacedName", (void (**)(void))&__pyx_api_f_4lxml_5etree_namespacedName, "PyObject *(xmlNode *)") < 0) goto bad;
184
+ if (__Pyx_ImportFunction_3_0_10(module, "namespacedNameFromNsName", (void (**)(void))&__pyx_api_f_4lxml_5etree_namespacedNameFromNsName, "PyObject *(const xmlChar *, const xmlChar *)") < 0) goto bad;
185
+ if (__Pyx_ImportFunction_3_0_10(module, "iteratorStoreNext", (void (**)(void))&__pyx_api_f_4lxml_5etree_iteratorStoreNext, "void (struct LxmlElementIterator *, struct LxmlElement *)") < 0) goto bad;
186
+ if (__Pyx_ImportFunction_3_0_10(module, "initTagMatch", (void (**)(void))&__pyx_api_f_4lxml_5etree_initTagMatch, "void (struct LxmlElementTagMatcher *, PyObject *)") < 0) goto bad;
187
+ if (__Pyx_ImportFunction_3_0_10(module, "findOrBuildNodeNsPrefix", (void (**)(void))&__pyx_api_f_4lxml_5etree_findOrBuildNodeNsPrefix, "xmlNs *(struct LxmlDocument *, xmlNode *, const xmlChar *, const xmlChar *)") < 0) goto bad;
188
188
  Py_DECREF(module); module = 0;
189
189
  return 0;
190
190
  bad:
Binary file
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lxml
3
- Version: 5.1.1
3
+ Version: 5.2.1
4
4
  Summary: Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API.
5
5
  Home-page: https://lxml.de/
6
6
  Author: lxml dev team
@@ -34,10 +34,12 @@ Provides-Extra: cssselect
34
34
  Requires-Dist: cssselect >=0.7 ; extra == 'cssselect'
35
35
  Provides-Extra: html5
36
36
  Requires-Dist: html5lib ; extra == 'html5'
37
+ Provides-Extra: html_clean
38
+ Requires-Dist: lxml-html-clean ; extra == 'html_clean'
37
39
  Provides-Extra: htmlsoup
38
40
  Requires-Dist: BeautifulSoup4 ; extra == 'htmlsoup'
39
41
  Provides-Extra: source
40
- Requires-Dist: Cython >=3.0.9 ; extra == 'source'
42
+ Requires-Dist: Cython >=3.0.10 ; extra == 'source'
41
43
 
42
44
  lxml is a Pythonic, mature binding for the libxml2 and libxslt libraries. It
43
45
  provides safe and convenient access to these libraries using the ElementTree
@@ -62,34 +64,26 @@ an appropriate version of Cython installed.
62
64
 
63
65
  After an official release of a new stable series, bug fixes may become
64
66
  available at
65
- https://github.com/lxml/lxml/tree/lxml-5.1 .
66
- Running ``easy_install lxml==5.1bugfix`` will install
67
+ https://github.com/lxml/lxml/tree/lxml-5.2 .
68
+ Running ``easy_install lxml==5.2bugfix`` will install
67
69
  the unreleased branch state from
68
- https://github.com/lxml/lxml/tarball/lxml-5.1#egg=lxml-5.1bugfix
70
+ https://github.com/lxml/lxml/tarball/lxml-5.2#egg=lxml-5.2bugfix
69
71
  as soon as a maintenance branch has been established. Note that this
70
72
  requires Cython to be installed at an appropriate version for the build.
71
73
 
72
- 5.1.1 (2024-03-28)
74
+ 5.2.1 (2024-04-02)
73
75
  ==================
74
76
 
75
77
  Bugs fixed
76
78
  ----------
77
79
 
78
- * LP#2048920: ``iterlinks()`` in ``lxml.html`` rejected ``bytes`` input in 5.1.0.
80
+ * LP#2059910: The minimum CPU architecture for the Linux x86 binary wheels was set back to
81
+ "core2", but with SSE 4.2 enabled.
79
82
 
80
- * High source line numbers from the parser are no longer truncated
81
- (up to a C ``long``) when using libxml2 2.11 or later.
83
+ * LP#2059977: ``Element.iterfind("//absolute_path")`` failed with a ``SyntaxError``
84
+ where it should have issued a warning.
82
85
 
83
- * GH#407: A compatibility test was adapted to recent expat versions.
84
- Patch by Miro Hrončok.
85
-
86
- Other changes
87
- -------------
88
-
89
- * Binary wheels use the library versions libxml2 2.12.6 and libxslt 1.1.39.
90
-
91
- * Windows binary wheels use the library versions libxml2 2.11.7 and libxslt 1.1.39.
92
-
93
- * Built with Cython 3.0.9.
86
+ * GH#416: The documentation build was using the non-standard ``which`` command.
87
+ Patch by Michał Górny.
94
88
 
95
89
 
@@ -1,5 +1,5 @@
1
1
  lxml/ElementInclude.py,sha256=pEvLKSyhNWtZTRr6liUJD-1mIqmbxbcurxcqZv7vNHg,8804
2
- lxml/__init__.py,sha256=RXd2OOpJetd-S9VxFcgMJb-dN7BvAaYuMeiHbthroig,596
2
+ lxml/__init__.py,sha256=tzYdqsFAQjFVtpQ0r5v4iCKYBxYL3QCUaYNNBsKYgRc,596
3
3
  lxml/_elementpath.py,sha256=waZwwmgKbOlNaztLRUOXWBnHnZdQwDWwv7M0GgHwJ44,11229
4
4
  lxml/apihelpers.pxi,sha256=4S__cOXO4gq5tsr453GJPeRdbqI5B830SFYsxyUPok0,65403
5
5
  lxml/builder.py,sha256=5P0LKYgwJws2fmKAZOr9gkqsBtY-lUC9LDF2W5B5Aik,8332
@@ -10,16 +10,16 @@ lxml/debug.pxi,sha256=HdUEKw8hU1Wzz4nO3hheqrauZlPATsv7N4QPQxCdth4,3372
10
10
  lxml/docloader.pxi,sha256=tP6b_zZ5s0-Zj2kf2mFksI8U5whArwOVDOVuaZcbgtM,5950
11
11
  lxml/doctestcompare.py,sha256=1r23O3Ki1BypqzkVWh97FEdRq-ENvmFEWuABOWdE0Lo,18219
12
12
  lxml/dtd.pxi,sha256=-M93Bx2WbDuyjTW1Btl2eG0U7q7YtkAU9sz9ZQ_5pa4,15706
13
- lxml/etree.h,sha256=cJHoZm3ijtZPtr8H7LbOvA0PXCtYWTvMJkmo7yA5BNU,9911
14
- lxml/etree.pypy38-pp73-win_amd64.pyd,sha256=bXXiz-67-_5eYLcFzFelkYKofGYJNz-97-q7bDDwmDU,3973632
15
- lxml/etree.pyx,sha256=58IECKq3vwBdGgqwvAGnDclq6t7YALya1QKx4Sq3UIo,137562
16
- lxml/etree_api.h,sha256=NLn4SV8nwM2zqnFuV339sy-S3e2NOINiWKZJSN8l2Yk,17014
13
+ lxml/etree.h,sha256=NPhL2CztGkHk3YgMx2j7pkimDavjx_8CTmCRrdMxslM,9912
14
+ lxml/etree.pypy38-pp73-win_amd64.pyd,sha256=dLrxTDA_cSb6KxnGySwz1VE_0_LHuazpVuBHtLNi0TA,3972096
15
+ lxml/etree.pyx,sha256=NYTMRlUmCau1qaPlc9OEe7lsGUq6MVLkic8gZmjBEkg,137597
16
+ lxml/etree_api.h,sha256=XDHi7lgGoIcyoS8hB27di8OAb0RC6ubGs3ETzd3KDSE,17063
17
17
  lxml/extensions.pxi,sha256=s3IkovN6q7I1lvAEs3rh7F-8TtTjTGXRSkptYV9012I,32921
18
18
  lxml/iterparse.pxi,sha256=u-fbcHXv9WPz6aOnexe4Vwv5zHJz0-Gc8ScOL-qhcpQ,16965
19
- lxml/lxml.etree.h,sha256=FEusZqW8FOIvmzI5W_IpnqvSCbOPTQrFyQ-CXbUwt9c,10159
20
- lxml/lxml.etree_api.h,sha256=M7LaywvBBH99cgu8CtEayqEiygSqfuppNgz8qOJwYIQ,17214
19
+ lxml/lxml.etree.h,sha256=1pHssiGfgydjVlSwEGghrwF4_y2kyELiSojvGEhFHzw,10160
20
+ lxml/lxml.etree_api.h,sha256=vBxFPTxggSL251E5RupQ5ylcFTgKBxQJtF9dnmaofzI,17263
21
21
  lxml/nsclasses.pxi,sha256=9TiZjPbP73H8VN0Xu1RmuRB7G-D0pTTSqLPbdTJQt5U,9410
22
- lxml/objectify.pypy38-pp73-win_amd64.pyd,sha256=cvwhg_4PspkTWuKIi6hAwudlHRCluh7QH7QmdHpWxw8,1721344
22
+ lxml/objectify.pypy38-pp73-win_amd64.pyd,sha256=ADRhMhvU6hHiWSJHW1NL1WlNgEMncf1YEcHNcj4FwnU,1721344
23
23
  lxml/objectify.pyx,sha256=IngqdSCtxUav_NklHutFR92AuuNfr9nYchEY3Zprchc,77880
24
24
  lxml/objectpath.pxi,sha256=og7LX-a88RrgJI3lUAtuJ8Hvx4RpNf57k7pmafzCuCA,11782
25
25
  lxml/parser.pxi,sha256=i4OJWLw7PbCMteTM5n-ur7LOhgy3cTNus5h_iy5-Dfo,83860
@@ -47,7 +47,7 @@ lxml/html/_diffcommand.py,sha256=MfccaYAAKCtzCRe_MCXihC3vnuPUKiJbmOx85Dt37To,216
47
47
  lxml/html/_html5builder.py,sha256=XfqNFDQ5HUOWTqubeOe1m5qmIut6I_3Egye75wer7tE,3330
48
48
  lxml/html/_setmixin.py,sha256=6cUyIeiMIn5zUytcWHsdWZXyMJXVsfJVVQoAIIe9h7Q,1244
49
49
  lxml/html/builder.py,sha256=X4-ZNqczoi9h035AN-4BmfSYDfsVyKpXXGsYPUFAh48,4625
50
- lxml/html/clean.py,sha256=uBjahSAbXoCfXB8N3d13V3ei7bD4v7FSbnSnsh5Wp6I,28938
50
+ lxml/html/clean.py,sha256=B_rsm3Mz7pn_Z8LnkXeSocL257--sqWJGaxBp2LlmB8,524
51
51
  lxml/html/defs.py,sha256=w_8kGoMweUNZxTjQ9UlMeUo8wyTTvzjDG4ub24j8RRg,4371
52
52
  lxml/html/diff.py,sha256=Nygvg_a29fblqyj07CeCtl9_OV3AWPFTBjwqEOk8-PM,31271
53
53
  lxml/html/formfill.py,sha256=8yXFIO4DNVY-HG8q4O4OtxIQ8qmLpXYQ8T1rFUTkK8c,9980
@@ -62,7 +62,7 @@ lxml/includes/dtdvalid.pxd,sha256=VOt94bqe417bxvfkfbtbsd5kVnwQj4oYSHMfnV2pcsM,70
62
62
  lxml/includes/etree_defs.h,sha256=XXz1CliDqEBPvxGNtnmYEwE6D1Xtdib1sMk9JvvkHDM,14624
63
63
  lxml/includes/etreepublic.pxd,sha256=s2HRJSuZhwlDi3oHsSBPChA4dgTnWjWWMzBb3VxrtJw,10421
64
64
  lxml/includes/htmlparser.pxd,sha256=UEV2wIp8RHekwUAcPWMQe8KBfEv39u0c3LHOsdvRr0k,2858
65
- lxml/includes/lxml-version.h,sha256=NSdLWdmJLugVjJ1c96DK6f1SRE20OTvFbpjOvFp28YQ,74
65
+ lxml/includes/lxml-version.h,sha256=mC7fdlttvjzautEbr-o-68YHVS5tRvZJLPoqsjmXNy4,74
66
66
  lxml/includes/relaxng.pxd,sha256=H5W5XObI5YtRpULeiRy6K6ldUZC0tDQauAFZ7PJSqE4,2679
67
67
  lxml/includes/schematron.pxd,sha256=JmKDG9ElNPXJ7rZzOpQjPaIdzIZ1pzMEOT_GjZRK98U,1638
68
68
  lxml/includes/tree.pxd,sha256=Uo6hPybaSgh6P9YdQfpr0mftCrSyYZ_w_Z0FyCtOr1c,20885
@@ -163,9 +163,9 @@ lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl
163
163
  lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl,sha256=Vo1F576odRkdLBmuEEWrLa5LF3SMTX9B6EEGnxUbv_8,73560
164
164
  lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl,sha256=UI7xHx0leNXS9BaU6cyQ4CHeflufKmAE-zjOrihuH-E,20970
165
165
  lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt,sha256=OGLiFswuLJEW5EPYKOeoauuCJFEtVa6jyzBE1OcJI98,3310
166
- lxml-5.1.1.dist-info/LICENSE.txt,sha256=tircdrXghpYNqK2q-tqeBOMqrzXhzLFM71dCQFqu0nw,1517
167
- lxml-5.1.1.dist-info/LICENSES.txt,sha256=zlP1CNDLiL20Yh4jbKNIgaP6GAX5ffzPOJYBKKTi6tk,1543
168
- lxml-5.1.1.dist-info/METADATA,sha256=VvoexjEomv_ZU8-YM7gLjv1vYmQGkPE3lIQUy_U40Po,3616
169
- lxml-5.1.1.dist-info/WHEEL,sha256=fAsO8Y4sTZ_pYj32GOpGS8gjaRg1Ie4l9VhGLuQJiq4,107
170
- lxml-5.1.1.dist-info/top_level.txt,sha256=NjD988wqaKq512nshNdLt-uDxsjkp4Bh51m6N-dhUrk,5
171
- lxml-5.1.1.dist-info/RECORD,,
166
+ lxml-5.2.1.dist-info/LICENSE.txt,sha256=tircdrXghpYNqK2q-tqeBOMqrzXhzLFM71dCQFqu0nw,1517
167
+ lxml-5.2.1.dist-info/LICENSES.txt,sha256=zlP1CNDLiL20Yh4jbKNIgaP6GAX5ffzPOJYBKKTi6tk,1543
168
+ lxml-5.2.1.dist-info/METADATA,sha256=VmZHKN68I3oth5F5GL2yItClDMbCa-6Hxne8JLsILsc,3533
169
+ lxml-5.2.1.dist-info/WHEEL,sha256=fAsO8Y4sTZ_pYj32GOpGS8gjaRg1Ie4l9VhGLuQJiq4,107
170
+ lxml-5.2.1.dist-info/top_level.txt,sha256=NjD988wqaKq512nshNdLt-uDxsjkp4Bh51m6N-dhUrk,5
171
+ lxml-5.2.1.dist-info/RECORD,,
File without changes