lxml 5.2.0__cp310-cp310-win32.whl → 5.2.2__cp310-cp310-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lxml/ElementInclude.py +244 -244
- lxml/__init__.py +22 -22
- lxml/_elementpath.cp310-win32.pyd +0 -0
- lxml/_elementpath.py +341 -341
- lxml/apihelpers.pxi +1793 -1793
- lxml/builder.cp310-win32.pyd +0 -0
- lxml/builder.py +232 -232
- lxml/classlookup.pxi +580 -580
- lxml/cleanup.pxi +215 -215
- lxml/cssselect.py +101 -101
- lxml/debug.pxi +90 -90
- lxml/docloader.pxi +178 -178
- lxml/doctestcompare.py +488 -488
- lxml/dtd.pxi +478 -478
- lxml/etree.cp310-win32.pyd +0 -0
- lxml/etree.h +6 -6
- lxml/etree.pyx +3732 -3711
- lxml/extensions.pxi +833 -833
- lxml/html/ElementSoup.py +10 -10
- lxml/html/__init__.py +1923 -1923
- lxml/html/_diffcommand.py +86 -86
- lxml/html/_html5builder.py +100 -100
- lxml/html/_setmixin.py +56 -56
- lxml/html/builder.py +133 -133
- lxml/html/clean.py +21 -21
- lxml/html/defs.py +135 -135
- lxml/html/diff.cp310-win32.pyd +0 -0
- lxml/html/diff.py +878 -878
- lxml/html/formfill.py +299 -299
- lxml/html/html5parser.py +260 -260
- lxml/html/soupparser.py +314 -314
- lxml/html/usedoctest.py +13 -13
- lxml/includes/c14n.pxd +25 -25
- lxml/includes/config.pxd +3 -3
- lxml/includes/dtdvalid.pxd +18 -18
- lxml/includes/etree_defs.h +379 -379
- lxml/includes/etreepublic.pxd +237 -237
- lxml/includes/htmlparser.pxd +56 -56
- lxml/includes/lxml-version.h +1 -1
- lxml/includes/relaxng.pxd +64 -64
- lxml/includes/schematron.pxd +34 -34
- lxml/includes/tree.pxd +494 -494
- lxml/includes/uri.pxd +5 -5
- lxml/includes/xinclude.pxd +22 -22
- lxml/includes/xmlerror.pxd +852 -852
- lxml/includes/xmlparser.pxd +265 -265
- lxml/includes/xmlschema.pxd +35 -35
- lxml/includes/xpath.pxd +136 -136
- lxml/includes/xslt.pxd +190 -190
- lxml/isoschematron/__init__.py +348 -348
- lxml/isoschematron/resources/rng/iso-schematron.rng +709 -709
- lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl +75 -75
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl +312 -312
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl +1159 -1159
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl +54 -54
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl +1796 -1796
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl +588 -588
- lxml/iterparse.pxi +438 -438
- lxml/lxml.etree.h +6 -6
- lxml/nsclasses.pxi +281 -281
- lxml/objectify.cp310-win32.pyd +0 -0
- lxml/objectify.pyx +2145 -2145
- lxml/objectpath.pxi +332 -332
- lxml/parser.pxi +1994 -1994
- lxml/parsertarget.pxi +180 -180
- lxml/proxy.pxi +619 -619
- lxml/public-api.pxi +178 -178
- lxml/pyclasslookup.py +3 -3
- lxml/readonlytree.pxi +565 -565
- lxml/relaxng.pxi +165 -165
- lxml/sax.cp310-win32.pyd +0 -0
- lxml/sax.py +275 -275
- lxml/saxparser.pxi +875 -875
- lxml/schematron.pxi +168 -168
- lxml/serializer.pxi +1871 -1871
- lxml/usedoctest.py +13 -13
- lxml/xinclude.pxi +67 -67
- lxml/xmlerror.pxi +1654 -1654
- lxml/xmlid.pxi +179 -179
- lxml/xmlschema.pxi +215 -215
- lxml/xpath.pxi +487 -487
- lxml/xslt.pxi +950 -950
- lxml/xsltext.pxi +242 -242
- {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/LICENSE.txt +29 -29
- {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/LICENSES.txt +29 -29
- {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/METADATA +9 -17
- {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/RECORD +89 -89
- {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/WHEEL +0 -0
- {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/top_level.txt +0 -0
lxml/cleanup.pxi
CHANGED
@@ -1,215 +1,215 @@
|
|
1
|
-
# functions for tree cleanup and removing elements from subtrees
|
2
|
-
|
3
|
-
def cleanup_namespaces(tree_or_element, top_nsmap=None, keep_ns_prefixes=None):
|
4
|
-
"""cleanup_namespaces(tree_or_element, top_nsmap=None, keep_ns_prefixes=None)
|
5
|
-
|
6
|
-
Remove all namespace declarations from a subtree that are not used
|
7
|
-
by any of the elements or attributes in that tree.
|
8
|
-
|
9
|
-
If a 'top_nsmap' is provided, it must be a mapping from prefixes
|
10
|
-
to namespace URIs. These namespaces will be declared on the top
|
11
|
-
element of the subtree before running the cleanup, which allows
|
12
|
-
moving namespace declarations to the top of the tree.
|
13
|
-
|
14
|
-
If a 'keep_ns_prefixes' is provided, it must be a list of prefixes.
|
15
|
-
These prefixes will not be removed as part of the cleanup.
|
16
|
-
"""
|
17
|
-
element = _rootNodeOrRaise(tree_or_element)
|
18
|
-
c_element = element._c_node
|
19
|
-
|
20
|
-
if top_nsmap:
|
21
|
-
doc = element._doc
|
22
|
-
# declare namespaces from nsmap, then apply them to the subtree
|
23
|
-
_setNodeNamespaces(c_element, doc, None, top_nsmap)
|
24
|
-
moveNodeToDocument(doc, c_element.doc, c_element)
|
25
|
-
|
26
|
-
keep_ns_prefixes = (
|
27
|
-
set([_utf8(prefix) for prefix in keep_ns_prefixes])
|
28
|
-
if keep_ns_prefixes else None)
|
29
|
-
|
30
|
-
_removeUnusedNamespaceDeclarations(c_element, keep_ns_prefixes)
|
31
|
-
|
32
|
-
|
33
|
-
def strip_attributes(tree_or_element, *attribute_names):
|
34
|
-
"""strip_attributes(tree_or_element, *attribute_names)
|
35
|
-
|
36
|
-
Delete all attributes with the provided attribute names from an
|
37
|
-
Element (or ElementTree) and its descendants.
|
38
|
-
|
39
|
-
Attribute names can contain wildcards as in `_Element.iter`.
|
40
|
-
|
41
|
-
Example usage::
|
42
|
-
|
43
|
-
strip_attributes(root_element,
|
44
|
-
'simpleattr',
|
45
|
-
'{http://some/ns}attrname',
|
46
|
-
'{http://other/ns}*')
|
47
|
-
"""
|
48
|
-
cdef _MultiTagMatcher matcher
|
49
|
-
element = _rootNodeOrRaise(tree_or_element)
|
50
|
-
if not attribute_names:
|
51
|
-
return
|
52
|
-
|
53
|
-
matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, attribute_names)
|
54
|
-
matcher.cacheTags(element._doc)
|
55
|
-
if matcher.rejectsAllAttributes():
|
56
|
-
return
|
57
|
-
_strip_attributes(element._c_node, matcher)
|
58
|
-
|
59
|
-
|
60
|
-
cdef _strip_attributes(xmlNode* c_node, _MultiTagMatcher matcher):
|
61
|
-
cdef xmlAttr* c_attr
|
62
|
-
cdef xmlAttr* c_next_attr
|
63
|
-
tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
|
64
|
-
if c_node.type == tree.XML_ELEMENT_NODE:
|
65
|
-
c_attr = c_node.properties
|
66
|
-
while c_attr is not NULL:
|
67
|
-
c_next_attr = c_attr.next
|
68
|
-
if matcher.matchesAttribute(c_attr):
|
69
|
-
tree.xmlRemoveProp(c_attr)
|
70
|
-
c_attr = c_next_attr
|
71
|
-
tree.END_FOR_EACH_ELEMENT_FROM(c_node)
|
72
|
-
|
73
|
-
|
74
|
-
def strip_elements(tree_or_element, *tag_names, bint with_tail=True):
|
75
|
-
"""strip_elements(tree_or_element, *tag_names, with_tail=True)
|
76
|
-
|
77
|
-
Delete all elements with the provided tag names from a tree or
|
78
|
-
subtree. This will remove the elements and their entire subtree,
|
79
|
-
including all their attributes, text content and descendants. It
|
80
|
-
will also remove the tail text of the element unless you
|
81
|
-
explicitly set the ``with_tail`` keyword argument option to False.
|
82
|
-
|
83
|
-
Tag names can contain wildcards as in `_Element.iter`.
|
84
|
-
|
85
|
-
Note that this will not delete the element (or ElementTree root
|
86
|
-
element) that you passed even if it matches. It will only treat
|
87
|
-
its descendants. If you want to include the root element, check
|
88
|
-
its tag name directly before even calling this function.
|
89
|
-
|
90
|
-
Example usage::
|
91
|
-
|
92
|
-
strip_elements(some_element,
|
93
|
-
'simpletagname', # non-namespaced tag
|
94
|
-
'{http://some/ns}tagname', # namespaced tag
|
95
|
-
'{http://some/other/ns}*' # any tag from a namespace
|
96
|
-
lxml.etree.Comment # comments
|
97
|
-
)
|
98
|
-
"""
|
99
|
-
cdef _MultiTagMatcher matcher
|
100
|
-
doc = _documentOrRaise(tree_or_element)
|
101
|
-
element = _rootNodeOrRaise(tree_or_element)
|
102
|
-
if not tag_names:
|
103
|
-
return
|
104
|
-
|
105
|
-
matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag_names)
|
106
|
-
matcher.cacheTags(doc)
|
107
|
-
if matcher.rejectsAll():
|
108
|
-
return
|
109
|
-
|
110
|
-
if isinstance(tree_or_element, _ElementTree):
|
111
|
-
# include PIs and comments next to the root node
|
112
|
-
if matcher.matchesType(tree.XML_COMMENT_NODE):
|
113
|
-
_removeSiblings(element._c_node, tree.XML_COMMENT_NODE, with_tail)
|
114
|
-
if matcher.matchesType(tree.XML_PI_NODE):
|
115
|
-
_removeSiblings(element._c_node, tree.XML_PI_NODE, with_tail)
|
116
|
-
_strip_elements(doc, element._c_node, matcher, with_tail)
|
117
|
-
|
118
|
-
cdef _strip_elements(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher,
|
119
|
-
bint with_tail):
|
120
|
-
cdef xmlNode* c_child
|
121
|
-
cdef xmlNode* c_next
|
122
|
-
|
123
|
-
tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
|
124
|
-
if c_node.type == tree.XML_ELEMENT_NODE:
|
125
|
-
# we run through the children here to prevent any problems
|
126
|
-
# with the tree iteration which would occur if we unlinked the
|
127
|
-
# c_node itself
|
128
|
-
c_child = _findChildForwards(c_node, 0)
|
129
|
-
while c_child is not NULL:
|
130
|
-
c_next = _nextElement(c_child)
|
131
|
-
if matcher.matches(c_child):
|
132
|
-
if c_child.type == tree.XML_ELEMENT_NODE:
|
133
|
-
if not with_tail:
|
134
|
-
tree.xmlUnlinkNode(c_child)
|
135
|
-
_removeNode(doc, c_child)
|
136
|
-
else:
|
137
|
-
if with_tail:
|
138
|
-
_removeText(c_child.next)
|
139
|
-
tree.xmlUnlinkNode(c_child)
|
140
|
-
attemptDeallocation(c_child)
|
141
|
-
c_child = c_next
|
142
|
-
tree.END_FOR_EACH_ELEMENT_FROM(c_node)
|
143
|
-
|
144
|
-
|
145
|
-
def strip_tags(tree_or_element, *tag_names):
|
146
|
-
"""strip_tags(tree_or_element, *tag_names)
|
147
|
-
|
148
|
-
Delete all elements with the provided tag names from a tree or
|
149
|
-
subtree. This will remove the elements and their attributes, but
|
150
|
-
*not* their text/tail content or descendants. Instead, it will
|
151
|
-
merge the text content and children of the element into its
|
152
|
-
parent.
|
153
|
-
|
154
|
-
Tag names can contain wildcards as in `_Element.iter`.
|
155
|
-
|
156
|
-
Note that this will not delete the element (or ElementTree root
|
157
|
-
element) that you passed even if it matches. It will only treat
|
158
|
-
its descendants.
|
159
|
-
|
160
|
-
Example usage::
|
161
|
-
|
162
|
-
strip_tags(some_element,
|
163
|
-
'simpletagname', # non-namespaced tag
|
164
|
-
'{http://some/ns}tagname', # namespaced tag
|
165
|
-
'{http://some/other/ns}*' # any tag from a namespace
|
166
|
-
Comment # comments (including their text!)
|
167
|
-
)
|
168
|
-
"""
|
169
|
-
cdef _MultiTagMatcher matcher
|
170
|
-
doc = _documentOrRaise(tree_or_element)
|
171
|
-
element = _rootNodeOrRaise(tree_or_element)
|
172
|
-
if not tag_names:
|
173
|
-
return
|
174
|
-
|
175
|
-
matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag_names)
|
176
|
-
matcher.cacheTags(doc)
|
177
|
-
if matcher.rejectsAll():
|
178
|
-
return
|
179
|
-
|
180
|
-
if isinstance(tree_or_element, _ElementTree):
|
181
|
-
# include PIs and comments next to the root node
|
182
|
-
if matcher.matchesType(tree.XML_COMMENT_NODE):
|
183
|
-
_removeSiblings(element._c_node, tree.XML_COMMENT_NODE, 0)
|
184
|
-
if matcher.matchesType(tree.XML_PI_NODE):
|
185
|
-
_removeSiblings(element._c_node, tree.XML_PI_NODE, 0)
|
186
|
-
_strip_tags(doc, element._c_node, matcher)
|
187
|
-
|
188
|
-
cdef _strip_tags(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher):
|
189
|
-
cdef xmlNode* c_child
|
190
|
-
cdef xmlNode* c_next
|
191
|
-
|
192
|
-
tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
|
193
|
-
if c_node.type == tree.XML_ELEMENT_NODE:
|
194
|
-
# we run through the children here to prevent any problems
|
195
|
-
# with the tree iteration which would occur if we unlinked the
|
196
|
-
# c_node itself
|
197
|
-
c_child = _findChildForwards(c_node, 0)
|
198
|
-
while c_child is not NULL:
|
199
|
-
if not matcher.matches(c_child):
|
200
|
-
c_child = _nextElement(c_child)
|
201
|
-
continue
|
202
|
-
if c_child.type == tree.XML_ELEMENT_NODE:
|
203
|
-
c_next = _findChildForwards(c_child, 0) or _nextElement(c_child)
|
204
|
-
_replaceNodeByChildren(doc, c_child)
|
205
|
-
if not attemptDeallocation(c_child):
|
206
|
-
if c_child.nsDef is not NULL:
|
207
|
-
# make namespaces absolute
|
208
|
-
moveNodeToDocument(doc, doc._c_doc, c_child)
|
209
|
-
c_child = c_next
|
210
|
-
else:
|
211
|
-
c_next = _nextElement(c_child)
|
212
|
-
tree.xmlUnlinkNode(c_child)
|
213
|
-
attemptDeallocation(c_child)
|
214
|
-
c_child = c_next
|
215
|
-
tree.END_FOR_EACH_ELEMENT_FROM(c_node)
|
1
|
+
# functions for tree cleanup and removing elements from subtrees
|
2
|
+
|
3
|
+
def cleanup_namespaces(tree_or_element, top_nsmap=None, keep_ns_prefixes=None):
|
4
|
+
"""cleanup_namespaces(tree_or_element, top_nsmap=None, keep_ns_prefixes=None)
|
5
|
+
|
6
|
+
Remove all namespace declarations from a subtree that are not used
|
7
|
+
by any of the elements or attributes in that tree.
|
8
|
+
|
9
|
+
If a 'top_nsmap' is provided, it must be a mapping from prefixes
|
10
|
+
to namespace URIs. These namespaces will be declared on the top
|
11
|
+
element of the subtree before running the cleanup, which allows
|
12
|
+
moving namespace declarations to the top of the tree.
|
13
|
+
|
14
|
+
If a 'keep_ns_prefixes' is provided, it must be a list of prefixes.
|
15
|
+
These prefixes will not be removed as part of the cleanup.
|
16
|
+
"""
|
17
|
+
element = _rootNodeOrRaise(tree_or_element)
|
18
|
+
c_element = element._c_node
|
19
|
+
|
20
|
+
if top_nsmap:
|
21
|
+
doc = element._doc
|
22
|
+
# declare namespaces from nsmap, then apply them to the subtree
|
23
|
+
_setNodeNamespaces(c_element, doc, None, top_nsmap)
|
24
|
+
moveNodeToDocument(doc, c_element.doc, c_element)
|
25
|
+
|
26
|
+
keep_ns_prefixes = (
|
27
|
+
set([_utf8(prefix) for prefix in keep_ns_prefixes])
|
28
|
+
if keep_ns_prefixes else None)
|
29
|
+
|
30
|
+
_removeUnusedNamespaceDeclarations(c_element, keep_ns_prefixes)
|
31
|
+
|
32
|
+
|
33
|
+
def strip_attributes(tree_or_element, *attribute_names):
|
34
|
+
"""strip_attributes(tree_or_element, *attribute_names)
|
35
|
+
|
36
|
+
Delete all attributes with the provided attribute names from an
|
37
|
+
Element (or ElementTree) and its descendants.
|
38
|
+
|
39
|
+
Attribute names can contain wildcards as in `_Element.iter`.
|
40
|
+
|
41
|
+
Example usage::
|
42
|
+
|
43
|
+
strip_attributes(root_element,
|
44
|
+
'simpleattr',
|
45
|
+
'{http://some/ns}attrname',
|
46
|
+
'{http://other/ns}*')
|
47
|
+
"""
|
48
|
+
cdef _MultiTagMatcher matcher
|
49
|
+
element = _rootNodeOrRaise(tree_or_element)
|
50
|
+
if not attribute_names:
|
51
|
+
return
|
52
|
+
|
53
|
+
matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, attribute_names)
|
54
|
+
matcher.cacheTags(element._doc)
|
55
|
+
if matcher.rejectsAllAttributes():
|
56
|
+
return
|
57
|
+
_strip_attributes(element._c_node, matcher)
|
58
|
+
|
59
|
+
|
60
|
+
cdef _strip_attributes(xmlNode* c_node, _MultiTagMatcher matcher):
|
61
|
+
cdef xmlAttr* c_attr
|
62
|
+
cdef xmlAttr* c_next_attr
|
63
|
+
tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
|
64
|
+
if c_node.type == tree.XML_ELEMENT_NODE:
|
65
|
+
c_attr = c_node.properties
|
66
|
+
while c_attr is not NULL:
|
67
|
+
c_next_attr = c_attr.next
|
68
|
+
if matcher.matchesAttribute(c_attr):
|
69
|
+
tree.xmlRemoveProp(c_attr)
|
70
|
+
c_attr = c_next_attr
|
71
|
+
tree.END_FOR_EACH_ELEMENT_FROM(c_node)
|
72
|
+
|
73
|
+
|
74
|
+
def strip_elements(tree_or_element, *tag_names, bint with_tail=True):
|
75
|
+
"""strip_elements(tree_or_element, *tag_names, with_tail=True)
|
76
|
+
|
77
|
+
Delete all elements with the provided tag names from a tree or
|
78
|
+
subtree. This will remove the elements and their entire subtree,
|
79
|
+
including all their attributes, text content and descendants. It
|
80
|
+
will also remove the tail text of the element unless you
|
81
|
+
explicitly set the ``with_tail`` keyword argument option to False.
|
82
|
+
|
83
|
+
Tag names can contain wildcards as in `_Element.iter`.
|
84
|
+
|
85
|
+
Note that this will not delete the element (or ElementTree root
|
86
|
+
element) that you passed even if it matches. It will only treat
|
87
|
+
its descendants. If you want to include the root element, check
|
88
|
+
its tag name directly before even calling this function.
|
89
|
+
|
90
|
+
Example usage::
|
91
|
+
|
92
|
+
strip_elements(some_element,
|
93
|
+
'simpletagname', # non-namespaced tag
|
94
|
+
'{http://some/ns}tagname', # namespaced tag
|
95
|
+
'{http://some/other/ns}*' # any tag from a namespace
|
96
|
+
lxml.etree.Comment # comments
|
97
|
+
)
|
98
|
+
"""
|
99
|
+
cdef _MultiTagMatcher matcher
|
100
|
+
doc = _documentOrRaise(tree_or_element)
|
101
|
+
element = _rootNodeOrRaise(tree_or_element)
|
102
|
+
if not tag_names:
|
103
|
+
return
|
104
|
+
|
105
|
+
matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag_names)
|
106
|
+
matcher.cacheTags(doc)
|
107
|
+
if matcher.rejectsAll():
|
108
|
+
return
|
109
|
+
|
110
|
+
if isinstance(tree_or_element, _ElementTree):
|
111
|
+
# include PIs and comments next to the root node
|
112
|
+
if matcher.matchesType(tree.XML_COMMENT_NODE):
|
113
|
+
_removeSiblings(element._c_node, tree.XML_COMMENT_NODE, with_tail)
|
114
|
+
if matcher.matchesType(tree.XML_PI_NODE):
|
115
|
+
_removeSiblings(element._c_node, tree.XML_PI_NODE, with_tail)
|
116
|
+
_strip_elements(doc, element._c_node, matcher, with_tail)
|
117
|
+
|
118
|
+
cdef _strip_elements(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher,
|
119
|
+
bint with_tail):
|
120
|
+
cdef xmlNode* c_child
|
121
|
+
cdef xmlNode* c_next
|
122
|
+
|
123
|
+
tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
|
124
|
+
if c_node.type == tree.XML_ELEMENT_NODE:
|
125
|
+
# we run through the children here to prevent any problems
|
126
|
+
# with the tree iteration which would occur if we unlinked the
|
127
|
+
# c_node itself
|
128
|
+
c_child = _findChildForwards(c_node, 0)
|
129
|
+
while c_child is not NULL:
|
130
|
+
c_next = _nextElement(c_child)
|
131
|
+
if matcher.matches(c_child):
|
132
|
+
if c_child.type == tree.XML_ELEMENT_NODE:
|
133
|
+
if not with_tail:
|
134
|
+
tree.xmlUnlinkNode(c_child)
|
135
|
+
_removeNode(doc, c_child)
|
136
|
+
else:
|
137
|
+
if with_tail:
|
138
|
+
_removeText(c_child.next)
|
139
|
+
tree.xmlUnlinkNode(c_child)
|
140
|
+
attemptDeallocation(c_child)
|
141
|
+
c_child = c_next
|
142
|
+
tree.END_FOR_EACH_ELEMENT_FROM(c_node)
|
143
|
+
|
144
|
+
|
145
|
+
def strip_tags(tree_or_element, *tag_names):
|
146
|
+
"""strip_tags(tree_or_element, *tag_names)
|
147
|
+
|
148
|
+
Delete all elements with the provided tag names from a tree or
|
149
|
+
subtree. This will remove the elements and their attributes, but
|
150
|
+
*not* their text/tail content or descendants. Instead, it will
|
151
|
+
merge the text content and children of the element into its
|
152
|
+
parent.
|
153
|
+
|
154
|
+
Tag names can contain wildcards as in `_Element.iter`.
|
155
|
+
|
156
|
+
Note that this will not delete the element (or ElementTree root
|
157
|
+
element) that you passed even if it matches. It will only treat
|
158
|
+
its descendants.
|
159
|
+
|
160
|
+
Example usage::
|
161
|
+
|
162
|
+
strip_tags(some_element,
|
163
|
+
'simpletagname', # non-namespaced tag
|
164
|
+
'{http://some/ns}tagname', # namespaced tag
|
165
|
+
'{http://some/other/ns}*' # any tag from a namespace
|
166
|
+
Comment # comments (including their text!)
|
167
|
+
)
|
168
|
+
"""
|
169
|
+
cdef _MultiTagMatcher matcher
|
170
|
+
doc = _documentOrRaise(tree_or_element)
|
171
|
+
element = _rootNodeOrRaise(tree_or_element)
|
172
|
+
if not tag_names:
|
173
|
+
return
|
174
|
+
|
175
|
+
matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag_names)
|
176
|
+
matcher.cacheTags(doc)
|
177
|
+
if matcher.rejectsAll():
|
178
|
+
return
|
179
|
+
|
180
|
+
if isinstance(tree_or_element, _ElementTree):
|
181
|
+
# include PIs and comments next to the root node
|
182
|
+
if matcher.matchesType(tree.XML_COMMENT_NODE):
|
183
|
+
_removeSiblings(element._c_node, tree.XML_COMMENT_NODE, 0)
|
184
|
+
if matcher.matchesType(tree.XML_PI_NODE):
|
185
|
+
_removeSiblings(element._c_node, tree.XML_PI_NODE, 0)
|
186
|
+
_strip_tags(doc, element._c_node, matcher)
|
187
|
+
|
188
|
+
cdef _strip_tags(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher):
|
189
|
+
cdef xmlNode* c_child
|
190
|
+
cdef xmlNode* c_next
|
191
|
+
|
192
|
+
tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
|
193
|
+
if c_node.type == tree.XML_ELEMENT_NODE:
|
194
|
+
# we run through the children here to prevent any problems
|
195
|
+
# with the tree iteration which would occur if we unlinked the
|
196
|
+
# c_node itself
|
197
|
+
c_child = _findChildForwards(c_node, 0)
|
198
|
+
while c_child is not NULL:
|
199
|
+
if not matcher.matches(c_child):
|
200
|
+
c_child = _nextElement(c_child)
|
201
|
+
continue
|
202
|
+
if c_child.type == tree.XML_ELEMENT_NODE:
|
203
|
+
c_next = _findChildForwards(c_child, 0) or _nextElement(c_child)
|
204
|
+
_replaceNodeByChildren(doc, c_child)
|
205
|
+
if not attemptDeallocation(c_child):
|
206
|
+
if c_child.nsDef is not NULL:
|
207
|
+
# make namespaces absolute
|
208
|
+
moveNodeToDocument(doc, doc._c_doc, c_child)
|
209
|
+
c_child = c_next
|
210
|
+
else:
|
211
|
+
c_next = _nextElement(c_child)
|
212
|
+
tree.xmlUnlinkNode(c_child)
|
213
|
+
attemptDeallocation(c_child)
|
214
|
+
c_child = c_next
|
215
|
+
tree.END_FOR_EACH_ELEMENT_FROM(c_node)
|
lxml/cssselect.py
CHANGED
@@ -1,101 +1,101 @@
|
|
1
|
-
"""CSS Selectors based on XPath.
|
2
|
-
|
3
|
-
This module supports selecting XML/HTML tags based on CSS selectors.
|
4
|
-
See the `CSSSelector` class for details.
|
5
|
-
|
6
|
-
This is a thin wrapper around cssselect 0.7 or later.
|
7
|
-
"""
|
8
|
-
|
9
|
-
|
10
|
-
from . import etree
|
11
|
-
try:
|
12
|
-
import cssselect as external_cssselect
|
13
|
-
except ImportError:
|
14
|
-
raise ImportError(
|
15
|
-
'cssselect does not seem to be installed. '
|
16
|
-
'See https://pypi.org/project/cssselect/')
|
17
|
-
|
18
|
-
|
19
|
-
SelectorSyntaxError = external_cssselect.SelectorSyntaxError
|
20
|
-
ExpressionError = external_cssselect.ExpressionError
|
21
|
-
SelectorError = external_cssselect.SelectorError
|
22
|
-
|
23
|
-
|
24
|
-
__all__ = ['SelectorSyntaxError', 'ExpressionError', 'SelectorError',
|
25
|
-
'CSSSelector']
|
26
|
-
|
27
|
-
|
28
|
-
class LxmlTranslator(external_cssselect.GenericTranslator):
|
29
|
-
"""
|
30
|
-
A custom CSS selector to XPath translator with lxml-specific extensions.
|
31
|
-
"""
|
32
|
-
def xpath_contains_function(self, xpath, function):
|
33
|
-
# Defined there, removed in later drafts:
|
34
|
-
# http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors
|
35
|
-
if function.argument_types() not in (['STRING'], ['IDENT']):
|
36
|
-
raise ExpressionError(
|
37
|
-
"Expected a single string or ident for :contains(), got %r"
|
38
|
-
% function.arguments)
|
39
|
-
value = function.arguments[0].value
|
40
|
-
return xpath.add_condition(
|
41
|
-
'contains(__lxml_internal_css:lower-case(string(.)), %s)'
|
42
|
-
% self.xpath_literal(value.lower()))
|
43
|
-
|
44
|
-
|
45
|
-
class LxmlHTMLTranslator(LxmlTranslator, external_cssselect.HTMLTranslator):
|
46
|
-
"""
|
47
|
-
lxml extensions + HTML support.
|
48
|
-
"""
|
49
|
-
|
50
|
-
|
51
|
-
def _make_lower_case(context, s):
|
52
|
-
return s.lower()
|
53
|
-
|
54
|
-
ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/')
|
55
|
-
ns.prefix = '__lxml_internal_css'
|
56
|
-
ns['lower-case'] = _make_lower_case
|
57
|
-
|
58
|
-
|
59
|
-
class CSSSelector(etree.XPath):
|
60
|
-
"""A CSS selector.
|
61
|
-
|
62
|
-
Usage::
|
63
|
-
|
64
|
-
>>> from lxml import etree, cssselect
|
65
|
-
>>> select = cssselect.CSSSelector("a tag > child")
|
66
|
-
|
67
|
-
>>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
|
68
|
-
>>> [ el.tag for el in select(root) ]
|
69
|
-
['child']
|
70
|
-
|
71
|
-
To use CSS namespaces, you need to pass a prefix-to-namespace
|
72
|
-
mapping as ``namespaces`` keyword argument::
|
73
|
-
|
74
|
-
>>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
|
75
|
-
>>> select_ns = cssselect.CSSSelector('root > rdf|Description',
|
76
|
-
... namespaces={'rdf': rdfns})
|
77
|
-
|
78
|
-
>>> rdf = etree.XML((
|
79
|
-
... '<root xmlns:rdf="%s">'
|
80
|
-
... '<rdf:Description>blah</rdf:Description>'
|
81
|
-
... '</root>') % rdfns)
|
82
|
-
>>> [(el.tag, el.text) for el in select_ns(rdf)]
|
83
|
-
[('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')]
|
84
|
-
|
85
|
-
"""
|
86
|
-
def __init__(self, css, namespaces=None, translator='xml'):
|
87
|
-
if translator == 'xml':
|
88
|
-
translator = LxmlTranslator()
|
89
|
-
elif translator == 'html':
|
90
|
-
translator = LxmlHTMLTranslator()
|
91
|
-
elif translator == 'xhtml':
|
92
|
-
translator = LxmlHTMLTranslator(xhtml=True)
|
93
|
-
path = translator.css_to_xpath(css)
|
94
|
-
super().__init__(path, namespaces=namespaces)
|
95
|
-
self.css = css
|
96
|
-
|
97
|
-
def __repr__(self):
|
98
|
-
return '<%s %x for %r>' % (
|
99
|
-
self.__class__.__name__,
|
100
|
-
abs(id(self)),
|
101
|
-
self.css)
|
1
|
+
"""CSS Selectors based on XPath.
|
2
|
+
|
3
|
+
This module supports selecting XML/HTML tags based on CSS selectors.
|
4
|
+
See the `CSSSelector` class for details.
|
5
|
+
|
6
|
+
This is a thin wrapper around cssselect 0.7 or later.
|
7
|
+
"""
|
8
|
+
|
9
|
+
|
10
|
+
from . import etree
|
11
|
+
try:
|
12
|
+
import cssselect as external_cssselect
|
13
|
+
except ImportError:
|
14
|
+
raise ImportError(
|
15
|
+
'cssselect does not seem to be installed. '
|
16
|
+
'See https://pypi.org/project/cssselect/')
|
17
|
+
|
18
|
+
|
19
|
+
SelectorSyntaxError = external_cssselect.SelectorSyntaxError
|
20
|
+
ExpressionError = external_cssselect.ExpressionError
|
21
|
+
SelectorError = external_cssselect.SelectorError
|
22
|
+
|
23
|
+
|
24
|
+
__all__ = ['SelectorSyntaxError', 'ExpressionError', 'SelectorError',
|
25
|
+
'CSSSelector']
|
26
|
+
|
27
|
+
|
28
|
+
class LxmlTranslator(external_cssselect.GenericTranslator):
|
29
|
+
"""
|
30
|
+
A custom CSS selector to XPath translator with lxml-specific extensions.
|
31
|
+
"""
|
32
|
+
def xpath_contains_function(self, xpath, function):
|
33
|
+
# Defined there, removed in later drafts:
|
34
|
+
# http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors
|
35
|
+
if function.argument_types() not in (['STRING'], ['IDENT']):
|
36
|
+
raise ExpressionError(
|
37
|
+
"Expected a single string or ident for :contains(), got %r"
|
38
|
+
% function.arguments)
|
39
|
+
value = function.arguments[0].value
|
40
|
+
return xpath.add_condition(
|
41
|
+
'contains(__lxml_internal_css:lower-case(string(.)), %s)'
|
42
|
+
% self.xpath_literal(value.lower()))
|
43
|
+
|
44
|
+
|
45
|
+
class LxmlHTMLTranslator(LxmlTranslator, external_cssselect.HTMLTranslator):
|
46
|
+
"""
|
47
|
+
lxml extensions + HTML support.
|
48
|
+
"""
|
49
|
+
|
50
|
+
|
51
|
+
def _make_lower_case(context, s):
|
52
|
+
return s.lower()
|
53
|
+
|
54
|
+
ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/')
|
55
|
+
ns.prefix = '__lxml_internal_css'
|
56
|
+
ns['lower-case'] = _make_lower_case
|
57
|
+
|
58
|
+
|
59
|
+
class CSSSelector(etree.XPath):
|
60
|
+
"""A CSS selector.
|
61
|
+
|
62
|
+
Usage::
|
63
|
+
|
64
|
+
>>> from lxml import etree, cssselect
|
65
|
+
>>> select = cssselect.CSSSelector("a tag > child")
|
66
|
+
|
67
|
+
>>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
|
68
|
+
>>> [ el.tag for el in select(root) ]
|
69
|
+
['child']
|
70
|
+
|
71
|
+
To use CSS namespaces, you need to pass a prefix-to-namespace
|
72
|
+
mapping as ``namespaces`` keyword argument::
|
73
|
+
|
74
|
+
>>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
|
75
|
+
>>> select_ns = cssselect.CSSSelector('root > rdf|Description',
|
76
|
+
... namespaces={'rdf': rdfns})
|
77
|
+
|
78
|
+
>>> rdf = etree.XML((
|
79
|
+
... '<root xmlns:rdf="%s">'
|
80
|
+
... '<rdf:Description>blah</rdf:Description>'
|
81
|
+
... '</root>') % rdfns)
|
82
|
+
>>> [(el.tag, el.text) for el in select_ns(rdf)]
|
83
|
+
[('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')]
|
84
|
+
|
85
|
+
"""
|
86
|
+
def __init__(self, css, namespaces=None, translator='xml'):
|
87
|
+
if translator == 'xml':
|
88
|
+
translator = LxmlTranslator()
|
89
|
+
elif translator == 'html':
|
90
|
+
translator = LxmlHTMLTranslator()
|
91
|
+
elif translator == 'xhtml':
|
92
|
+
translator = LxmlHTMLTranslator(xhtml=True)
|
93
|
+
path = translator.css_to_xpath(css)
|
94
|
+
super().__init__(path, namespaces=namespaces)
|
95
|
+
self.css = css
|
96
|
+
|
97
|
+
def __repr__(self):
|
98
|
+
return '<%s %x for %r>' % (
|
99
|
+
self.__class__.__name__,
|
100
|
+
abs(id(self)),
|
101
|
+
self.css)
|