lxml 6.0.0__cp39-cp39-manylinux_2_31_armv7l.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lxml/ElementInclude.py +244 -0
- lxml/__init__.py +22 -0
- lxml/_elementpath.cpython-39-arm-linux-gnueabihf.so +0 -0
- lxml/_elementpath.py +343 -0
- lxml/apihelpers.pxi +1801 -0
- lxml/builder.cpython-39-arm-linux-gnueabihf.so +0 -0
- lxml/builder.py +243 -0
- lxml/classlookup.pxi +580 -0
- lxml/cleanup.pxi +215 -0
- lxml/cssselect.py +101 -0
- lxml/debug.pxi +36 -0
- lxml/docloader.pxi +178 -0
- lxml/doctestcompare.py +488 -0
- lxml/dtd.pxi +479 -0
- lxml/etree.cpython-39-arm-linux-gnueabihf.so +0 -0
- lxml/etree.h +244 -0
- lxml/etree.pyx +3853 -0
- lxml/etree_api.h +204 -0
- lxml/extensions.pxi +830 -0
- lxml/html/ElementSoup.py +10 -0
- lxml/html/__init__.py +1927 -0
- lxml/html/_diffcommand.py +86 -0
- lxml/html/_difflib.cpython-39-arm-linux-gnueabihf.so +0 -0
- lxml/html/_difflib.py +2106 -0
- lxml/html/_html5builder.py +100 -0
- lxml/html/_setmixin.py +56 -0
- lxml/html/builder.py +173 -0
- lxml/html/clean.py +21 -0
- lxml/html/defs.py +135 -0
- lxml/html/diff.cpython-39-arm-linux-gnueabihf.so +0 -0
- lxml/html/diff.py +972 -0
- lxml/html/formfill.py +299 -0
- lxml/html/html5parser.py +260 -0
- lxml/html/soupparser.py +314 -0
- lxml/html/usedoctest.py +13 -0
- lxml/includes/__init__.pxd +0 -0
- lxml/includes/__init__.py +0 -0
- lxml/includes/c14n.pxd +25 -0
- lxml/includes/config.pxd +3 -0
- lxml/includes/dtdvalid.pxd +18 -0
- lxml/includes/etree_defs.h +379 -0
- lxml/includes/etreepublic.pxd +237 -0
- lxml/includes/extlibs/__init__.py +0 -0
- lxml/includes/extlibs/libcharset.h +45 -0
- lxml/includes/extlibs/localcharset.h +137 -0
- lxml/includes/extlibs/zconf.h +543 -0
- lxml/includes/extlibs/zlib.h +1938 -0
- lxml/includes/htmlparser.pxd +56 -0
- lxml/includes/libexslt/__init__.py +0 -0
- lxml/includes/libexslt/exslt.h +108 -0
- lxml/includes/libexslt/exsltconfig.h +70 -0
- lxml/includes/libexslt/exsltexports.h +63 -0
- lxml/includes/libxml/HTMLparser.h +339 -0
- lxml/includes/libxml/HTMLtree.h +148 -0
- lxml/includes/libxml/SAX.h +18 -0
- lxml/includes/libxml/SAX2.h +170 -0
- lxml/includes/libxml/__init__.py +0 -0
- lxml/includes/libxml/c14n.h +115 -0
- lxml/includes/libxml/catalog.h +183 -0
- lxml/includes/libxml/chvalid.h +230 -0
- lxml/includes/libxml/debugXML.h +79 -0
- lxml/includes/libxml/dict.h +82 -0
- lxml/includes/libxml/encoding.h +307 -0
- lxml/includes/libxml/entities.h +147 -0
- lxml/includes/libxml/globals.h +25 -0
- lxml/includes/libxml/hash.h +251 -0
- lxml/includes/libxml/list.h +137 -0
- lxml/includes/libxml/nanoftp.h +16 -0
- lxml/includes/libxml/nanohttp.h +98 -0
- lxml/includes/libxml/parser.h +1633 -0
- lxml/includes/libxml/parserInternals.h +591 -0
- lxml/includes/libxml/relaxng.h +224 -0
- lxml/includes/libxml/schemasInternals.h +959 -0
- lxml/includes/libxml/schematron.h +143 -0
- lxml/includes/libxml/threads.h +81 -0
- lxml/includes/libxml/tree.h +1326 -0
- lxml/includes/libxml/uri.h +106 -0
- lxml/includes/libxml/valid.h +485 -0
- lxml/includes/libxml/xinclude.h +141 -0
- lxml/includes/libxml/xlink.h +193 -0
- lxml/includes/libxml/xmlIO.h +419 -0
- lxml/includes/libxml/xmlautomata.h +163 -0
- lxml/includes/libxml/xmlerror.h +962 -0
- lxml/includes/libxml/xmlexports.h +96 -0
- lxml/includes/libxml/xmlmemory.h +188 -0
- lxml/includes/libxml/xmlmodule.h +61 -0
- lxml/includes/libxml/xmlreader.h +444 -0
- lxml/includes/libxml/xmlregexp.h +116 -0
- lxml/includes/libxml/xmlsave.h +111 -0
- lxml/includes/libxml/xmlschemas.h +254 -0
- lxml/includes/libxml/xmlschemastypes.h +152 -0
- lxml/includes/libxml/xmlstring.h +140 -0
- lxml/includes/libxml/xmlunicode.h +15 -0
- lxml/includes/libxml/xmlversion.h +332 -0
- lxml/includes/libxml/xmlwriter.h +489 -0
- lxml/includes/libxml/xpath.h +569 -0
- lxml/includes/libxml/xpathInternals.h +639 -0
- lxml/includes/libxml/xpointer.h +48 -0
- lxml/includes/libxslt/__init__.py +0 -0
- lxml/includes/libxslt/attributes.h +39 -0
- lxml/includes/libxslt/documents.h +93 -0
- lxml/includes/libxslt/extensions.h +262 -0
- lxml/includes/libxslt/extra.h +72 -0
- lxml/includes/libxslt/functions.h +78 -0
- lxml/includes/libxslt/imports.h +75 -0
- lxml/includes/libxslt/keys.h +53 -0
- lxml/includes/libxslt/namespaces.h +68 -0
- lxml/includes/libxslt/numbersInternals.h +73 -0
- lxml/includes/libxslt/pattern.h +84 -0
- lxml/includes/libxslt/preproc.h +43 -0
- lxml/includes/libxslt/security.h +104 -0
- lxml/includes/libxslt/templates.h +77 -0
- lxml/includes/libxslt/transform.h +207 -0
- lxml/includes/libxslt/variables.h +118 -0
- lxml/includes/libxslt/xslt.h +110 -0
- lxml/includes/libxslt/xsltInternals.h +1995 -0
- lxml/includes/libxslt/xsltconfig.h +146 -0
- lxml/includes/libxslt/xsltexports.h +64 -0
- lxml/includes/libxslt/xsltlocale.h +44 -0
- lxml/includes/libxslt/xsltutils.h +343 -0
- lxml/includes/lxml-version.h +3 -0
- lxml/includes/relaxng.pxd +64 -0
- lxml/includes/schematron.pxd +34 -0
- lxml/includes/tree.pxd +492 -0
- lxml/includes/uri.pxd +5 -0
- lxml/includes/xinclude.pxd +22 -0
- lxml/includes/xmlerror.pxd +852 -0
- lxml/includes/xmlparser.pxd +303 -0
- lxml/includes/xmlschema.pxd +35 -0
- lxml/includes/xpath.pxd +136 -0
- lxml/includes/xslt.pxd +190 -0
- lxml/isoschematron/__init__.py +348 -0
- lxml/isoschematron/resources/rng/iso-schematron.rng +709 -0
- lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl +75 -0
- lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl +77 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl +313 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl +1160 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl +55 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl +1796 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl +588 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt +84 -0
- lxml/iterparse.pxi +438 -0
- lxml/lxml.etree.h +244 -0
- lxml/lxml.etree_api.h +204 -0
- lxml/nsclasses.pxi +281 -0
- lxml/objectify.cpython-39-arm-linux-gnueabihf.so +0 -0
- lxml/objectify.pyx +2149 -0
- lxml/objectpath.pxi +332 -0
- lxml/parser.pxi +2059 -0
- lxml/parsertarget.pxi +180 -0
- lxml/proxy.pxi +619 -0
- lxml/public-api.pxi +178 -0
- lxml/pyclasslookup.py +3 -0
- lxml/readonlytree.pxi +565 -0
- lxml/relaxng.pxi +165 -0
- lxml/sax.cpython-39-arm-linux-gnueabihf.so +0 -0
- lxml/sax.py +286 -0
- lxml/saxparser.pxi +875 -0
- lxml/schematron.pxi +173 -0
- lxml/serializer.pxi +1849 -0
- lxml/usedoctest.py +13 -0
- lxml/xinclude.pxi +67 -0
- lxml/xmlerror.pxi +1654 -0
- lxml/xmlid.pxi +179 -0
- lxml/xmlschema.pxi +215 -0
- lxml/xpath.pxi +487 -0
- lxml/xslt.pxi +957 -0
- lxml/xsltext.pxi +242 -0
- lxml-6.0.0.dist-info/METADATA +163 -0
- lxml-6.0.0.dist-info/RECORD +174 -0
- lxml-6.0.0.dist-info/WHEEL +5 -0
- lxml-6.0.0.dist-info/licenses/LICENSE.txt +31 -0
- lxml-6.0.0.dist-info/licenses/LICENSES.txt +29 -0
- lxml-6.0.0.dist-info/top_level.txt +1 -0
lxml/cleanup.pxi
ADDED
@@ -0,0 +1,215 @@
|
|
1
|
+
# functions for tree cleanup and removing elements from subtrees
|
2
|
+
|
3
|
+
def cleanup_namespaces(tree_or_element, top_nsmap=None, keep_ns_prefixes=None):
    """cleanup_namespaces(tree_or_element, top_nsmap=None, keep_ns_prefixes=None)

    Drop every namespace declaration in a subtree that none of the
    elements or attributes in that subtree actually uses.

    'top_nsmap', if given, must map prefixes to namespace URIs.  Those
    namespaces are declared on the top element of the subtree before the
    cleanup runs, which makes it possible to hoist namespace declarations
    to the top of the tree.

    'keep_ns_prefixes', if given, must be a list of prefixes.  Their
    declarations are retained by the cleanup.
    """
    root = _rootNodeOrRaise(tree_or_element)
    c_root = root._c_node

    if top_nsmap:
        owner_doc = root._doc
        # First declare the requested namespaces on the top node, then
        # re-apply them across the subtree.
        _setNodeNamespaces(c_root, owner_doc, None, top_nsmap)
        moveNodeToDocument(owner_doc, c_root.doc, c_root)

    # An empty/missing prefix list is passed on as None, not as an empty set.
    protected_prefixes = None
    if keep_ns_prefixes:
        protected_prefixes = {_utf8(prefix) for prefix in keep_ns_prefixes}

    _removeUnusedNamespaceDeclarations(c_root, protected_prefixes)
|
31
|
+
|
32
|
+
|
33
|
+
def strip_attributes(tree_or_element, *attribute_names):
    """strip_attributes(tree_or_element, *attribute_names)

    Remove all attributes carrying one of the given attribute names from
    an Element (or ElementTree) and from all of its descendants.

    Wildcards are accepted in attribute names, as in `_Element.iter`.

    Example usage::

        strip_attributes(root_element,
                         'simpleattr',
                         '{http://some/ns}attrname',
                         '{http://other/ns}*')
    """
    cdef _MultiTagMatcher attr_matcher
    # Resolved before the emptiness check, so whatever validation
    # _rootNodeOrRaise() performs still runs when no names are passed.
    root = _rootNodeOrRaise(tree_or_element)
    if attribute_names:
        attr_matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, attribute_names)
        attr_matcher.cacheTags(root._doc)
        # Skip the tree walk entirely when the matcher reports that it
        # rejects all attributes.
        if not attr_matcher.rejectsAllAttributes():
            _strip_attributes(root._c_node, attr_matcher)
|
58
|
+
|
59
|
+
|
60
|
+
cdef _strip_attributes(xmlNode* c_node, _MultiTagMatcher matcher):
    # Visit the nodes reached by the tree iteration macro starting at
    # c_node and remove every attribute that the matcher accepts.
    cdef xmlAttr* c_current
    cdef xmlAttr* c_following
    tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        c_current = c_node.properties
        while c_current is not NULL:
            # Remember the successor first: the current attribute may be
            # removed by xmlRemoveProp() below.
            c_following = c_current.next
            if matcher.matchesAttribute(c_current):
                tree.xmlRemoveProp(c_current)
            c_current = c_following
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
|
72
|
+
|
73
|
+
|
74
|
+
def strip_elements(tree_or_element, *tag_names, bint with_tail=True):
    """strip_elements(tree_or_element, *tag_names, with_tail=True)

    Delete all elements that carry one of the given tag names from a
    tree or subtree.  Each matching element goes away together with its
    whole subtree: attributes, text content and descendants.  Its tail
    text is removed as well, unless the ``with_tail`` keyword argument
    is explicitly set to False.

    Tag names can contain wildcards as in `_Element.iter`.

    Note that the element (or ElementTree root element) that you pass
    is never deleted, even if it matches; only its descendants are
    treated.  To include the root element, check its tag name yourself
    before calling this function.

    Example usage::

        strip_elements(some_element,
            'simpletagname',             # non-namespaced tag
            '{http://some/ns}tagname',   # namespaced tag
            '{http://some/other/ns}*',   # any tag from a namespace
            lxml.etree.Comment           # comments
            )
    """
    cdef _MultiTagMatcher tag_matcher
    owner_doc = _documentOrRaise(tree_or_element)
    root = _rootNodeOrRaise(tree_or_element)
    if not tag_names:
        return

    tag_matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag_names)
    tag_matcher.cacheTags(owner_doc)
    if tag_matcher.rejectsAll():
        return

    c_root = root._c_node
    if isinstance(tree_or_element, _ElementTree):
        # For a whole tree, comments and PIs that sit next to the root
        # node are candidates as well.
        if tag_matcher.matchesType(tree.XML_COMMENT_NODE):
            _removeSiblings(c_root, tree.XML_COMMENT_NODE, with_tail)
        if tag_matcher.matchesType(tree.XML_PI_NODE):
            _removeSiblings(c_root, tree.XML_PI_NODE, with_tail)
    _strip_elements(owner_doc, c_root, tag_matcher, with_tail)
|
117
|
+
|
118
|
+
cdef _strip_elements(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher,
|
119
|
+
bint with_tail):
# Remove all children accepted by 'matcher' from the nodes visited by the
# tree iteration macro below; 'with_tail' selects whether tail text goes too.
|
120
|
+
cdef xmlNode* c_child
|
121
|
+
cdef xmlNode* c_next
|
122
|
+
|
123
|
+
tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)  # loop body runs up to END_FOR_EACH_ELEMENT_FROM
|
124
|
+
if c_node.type == tree.XML_ELEMENT_NODE:
|
125
|
+
# we run through the children here to prevent any problems
|
126
|
+
# with the tree iteration which would occur if we unlinked the
|
127
|
+
# c_node itself
|
128
|
+
c_child = _findChildForwards(c_node, 0)
|
129
|
+
while c_child is not NULL:
|
130
|
+
c_next = _nextElement(c_child)  # fetched before c_child may be unlinked below
|
131
|
+
if matcher.matches(c_child):
|
132
|
+
if c_child.type == tree.XML_ELEMENT_NODE:
|
133
|
+
if not with_tail:
|
134
|
+
tree.xmlUnlinkNode(c_child)  # NOTE(review): presumably detaches the node so _removeNode() leaves its tail in place - confirm
|
135
|
+
_removeNode(doc, c_child)
|
136
|
+
else:
|
137
|
+
if with_tail:
|
138
|
+
_removeText(c_child.next)  # tail handling for matched non-element nodes
|
139
|
+
tree.xmlUnlinkNode(c_child)
|
140
|
+
attemptDeallocation(c_child)
|
141
|
+
c_child = c_next
|
142
|
+
tree.END_FOR_EACH_ELEMENT_FROM(c_node)
|
143
|
+
|
144
|
+
|
145
|
+
def strip_tags(tree_or_element, *tag_names):
    """strip_tags(tree_or_element, *tag_names)

    Delete all elements that carry one of the given tag names from a
    tree or subtree.  The elements and their attributes are removed,
    but *not* their text/tail content or descendants.  Instead, the
    text content and children of each removed element are merged into
    its parent.

    Tag names can contain wildcards as in `_Element.iter`.

    Note that the element (or ElementTree root element) that you pass
    is never deleted, even if it matches; only its descendants are
    treated.

    Example usage::

        strip_tags(some_element,
            'simpletagname',             # non-namespaced tag
            '{http://some/ns}tagname',   # namespaced tag
            '{http://some/other/ns}*',   # any tag from a namespace
            Comment                      # comments (including their text!)
            )
    """
    cdef _MultiTagMatcher tag_matcher
    owner_doc = _documentOrRaise(tree_or_element)
    root = _rootNodeOrRaise(tree_or_element)
    if not tag_names:
        return

    tag_matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag_names)
    tag_matcher.cacheTags(owner_doc)
    if tag_matcher.rejectsAll():
        return

    c_root = root._c_node
    if isinstance(tree_or_element, _ElementTree):
        # For a whole tree, comments and PIs that sit next to the root
        # node are candidates as well (always called with a tail flag of 0).
        if tag_matcher.matchesType(tree.XML_COMMENT_NODE):
            _removeSiblings(c_root, tree.XML_COMMENT_NODE, 0)
        if tag_matcher.matchesType(tree.XML_PI_NODE):
            _removeSiblings(c_root, tree.XML_PI_NODE, 0)
    _strip_tags(owner_doc, c_root, tag_matcher)
|
187
|
+
|
188
|
+
cdef _strip_tags(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher):
# For the nodes visited by the tree iteration macro below, replace matching
# child elements by their own children and drop matching non-element children.
|
189
|
+
cdef xmlNode* c_child
|
190
|
+
cdef xmlNode* c_next
|
191
|
+
|
192
|
+
tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)  # loop body runs up to END_FOR_EACH_ELEMENT_FROM
|
193
|
+
if c_node.type == tree.XML_ELEMENT_NODE:
|
194
|
+
# we run through the children here to prevent any problems
|
195
|
+
# with the tree iteration which would occur if we unlinked the
|
196
|
+
# c_node itself
|
197
|
+
c_child = _findChildForwards(c_node, 0)
|
198
|
+
while c_child is not NULL:
|
199
|
+
if not matcher.matches(c_child):
|
200
|
+
c_child = _nextElement(c_child)  # not a match: just advance
|
201
|
+
continue
|
202
|
+
if c_child.type == tree.XML_ELEMENT_NODE:
|
203
|
+
c_next = _findChildForwards(c_child, 0) or _nextElement(c_child)  # continue with the element's own first child if there is one, else with what follows it
|
204
|
+
_replaceNodeByChildren(doc, c_child)
|
205
|
+
if not attemptDeallocation(c_child):  # node was not freed and stays alive
|
206
|
+
if c_child.nsDef is not NULL:
|
207
|
+
# make namespaces absolute
|
208
|
+
moveNodeToDocument(doc, doc._c_doc, c_child)
|
209
|
+
c_child = c_next
|
210
|
+
else:
|
211
|
+
c_next = _nextElement(c_child)  # fetched before c_child is unlinked
|
212
|
+
tree.xmlUnlinkNode(c_child)
|
213
|
+
attemptDeallocation(c_child)
|
214
|
+
c_child = c_next
|
215
|
+
tree.END_FOR_EACH_ELEMENT_FROM(c_node)
|
lxml/cssselect.py
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
"""CSS Selectors based on XPath.
|
2
|
+
|
3
|
+
This module supports selecting XML/HTML tags based on CSS selectors.
|
4
|
+
See the `CSSSelector` class for details.
|
5
|
+
|
6
|
+
This is a thin wrapper around cssselect 0.7 or later.
|
7
|
+
"""
|
8
|
+
|
9
|
+
|
10
|
+
from . import etree
|
11
|
+
try:
|
12
|
+
import cssselect as external_cssselect
|
13
|
+
except ImportError:
|
14
|
+
raise ImportError(
|
15
|
+
'cssselect does not seem to be installed. '
|
16
|
+
'See https://pypi.org/project/cssselect/')
|
17
|
+
|
18
|
+
|
19
|
+
# Re-export the exception classes of the external cssselect package under
# this module's namespace.
SelectorSyntaxError = external_cssselect.SelectorSyntaxError
|
20
|
+
ExpressionError = external_cssselect.ExpressionError
|
21
|
+
SelectorError = external_cssselect.SelectorError
|
22
|
+
|
23
|
+
|
24
|
+
# Public names of this module: the three exception aliases plus CSSSelector.
__all__ = ['SelectorSyntaxError', 'ExpressionError', 'SelectorError',
|
25
|
+
'CSSSelector']
|
26
|
+
|
27
|
+
|
28
|
+
class LxmlTranslator(external_cssselect.GenericTranslator):
    """
    CSS selector to XPath translator that adds lxml-specific extensions
    on top of the generic cssselect translator.
    """
    def xpath_contains_function(self, xpath, function):
        """Translate ``:contains(...)`` into a case-insensitive XPath test.

        Raises ExpressionError unless the function received exactly one
        string or ident argument.
        """
        # Defined there, removed in later drafts:
        # http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors
        accepted_signatures = (['STRING'], ['IDENT'])
        if function.argument_types() not in accepted_signatures:
            message = (
                "Expected a single string or ident for :contains(), got %r"
                % function.arguments)
            raise ExpressionError(message)

        needle = function.arguments[0].value
        # Both sides are lower-cased: the node text through the registered
        # XPath extension function, the search term right here.
        condition = (
            'contains(__lxml_internal_css:lower-case(string(.)), %s)'
            % self.xpath_literal(needle.lower()))
        return xpath.add_condition(condition)
|
43
|
+
|
44
|
+
|
45
|
+
class LxmlHTMLTranslator(LxmlTranslator, external_cssselect.HTMLTranslator):
    """
    cssselect's HTML translator combined with the lxml extensions
    of `LxmlTranslator`.
    """
|
49
|
+
|
50
|
+
|
51
|
+
def _make_lower_case(context, s):
|
52
|
+
return s.lower()
|
53
|
+
|
54
|
+
# Register _make_lower_case as the 'lower-case' function of a dedicated
# function namespace.
ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/')
|
55
|
+
ns.prefix = '__lxml_internal_css'  # same prefix that LxmlTranslator.xpath_contains_function() puts into its XPath
|
56
|
+
ns['lower-case'] = _make_lower_case
|
57
|
+
|
58
|
+
|
59
|
+
class CSSSelector(etree.XPath):
    """A CSS selector.

    The CSS expression is translated to XPath once, at construction
    time, and the result is then used like any other `etree.XPath`.

    Usage::

        >>> from lxml import etree, cssselect
        >>> select = cssselect.CSSSelector("a tag > child")

        >>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
        >>> [ el.tag for el in select(root) ]
        ['child']

    To use CSS namespaces, you need to pass a prefix-to-namespace
    mapping as ``namespaces`` keyword argument::

        >>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
        >>> select_ns = cssselect.CSSSelector('root > rdf|Description',
        ...                                   namespaces={'rdf': rdfns})

        >>> rdf = etree.XML((
        ...     '<root xmlns:rdf="%s">'
        ...       '<rdf:Description>blah</rdf:Description>'
        ...     '</root>') % rdfns)
        >>> [(el.tag, el.text) for el in select_ns(rdf)]
        [('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')]

    """
    def __init__(self, css, namespaces=None, translator='xml'):
        # 'translator' is either one of the names 'xml', 'html', 'xhtml'
        # or an object that provides css_to_xpath() itself; anything that
        # is not one of the three names is used as given.
        if translator == 'xml':
            translator = LxmlTranslator()
        elif translator == 'html':
            translator = LxmlHTMLTranslator()
        elif translator == 'xhtml':
            translator = LxmlHTMLTranslator(xhtml=True)

        super().__init__(translator.css_to_xpath(css), namespaces=namespaces)
        # Keep the original CSS expression around, e.g. for __repr__().
        self.css = css

    def __repr__(self):
        class_name = self.__class__.__name__
        return f'<{class_name} {abs(id(self)):x} for {self.css!r}>'
|
lxml/debug.pxi
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
@cython.final
@cython.internal
cdef class _MemDebug:
    """Debugging support for the memory allocation in libxml2.
    """
    def bytes_used(self):
        """bytes_used(self)

        Returns the total amount of memory (in bytes) that libxml2
        currently uses.  libxml2 constrains this value to a C int,
        which limits its accuracy on 64 bit systems.
        """
        used = tree.xmlMemUsed()
        return used

    def blocks_used(self):
        """blocks_used(self)

        Returns the total number of memory blocks that libxml2 currently
        has allocated.  libxml2 constrains this value to a C int, which
        limits its accuracy on 64 bit systems.
        """
        blocks = tree.xmlMemBlocks()
        return blocks

    def dict_size(self):
        """dict_size(self)

        Returns the current size of the global name dictionary that
        libxml2 uses for the current thread.  Each thread has its own
        dictionary.
        """
        c_dict = __GLOBAL_PARSER_CONTEXT._getThreadDict(NULL)
        if c_dict is not NULL:
            return tree.xmlDictSize(c_dict)
        # No dictionary could be obtained for this thread.
        raise MemoryError()


memory_debugger = _MemDebug()
|
lxml/docloader.pxi
ADDED
@@ -0,0 +1,178 @@
|
|
1
|
+
# Custom resolver API
|
2
|
+
|
3
|
+
# Tags the kind of input that an _InputDocument refers to.
ctypedef enum _InputDocumentDataType:
|
4
|
+
PARSER_DATA_INVALID  # initial value assigned in _InputDocument.__cinit__()
|
5
|
+
PARSER_DATA_EMPTY  # assigned by Resolver.resolve_empty()
|
6
|
+
PARSER_DATA_STRING  # assigned by Resolver.resolve_string()
|
7
|
+
PARSER_DATA_FILENAME  # assigned by Resolver.resolve_filename()
|
8
|
+
PARSER_DATA_FILE  # assigned by Resolver.resolve_file()
|
9
|
+
|
10
|
+
# Result object created and filled in by the Resolver.resolve_*() methods.
@cython.final
|
11
|
+
@cython.internal
|
12
|
+
cdef class _InputDocument:
|
13
|
+
cdef _InputDocumentDataType _type  # which kind of input this object describes
|
14
|
+
cdef bytes _data_bytes  # set by Resolver.resolve_string(): the (UTF-8 encoded) input data
|
15
|
+
cdef object _filename  # encoded filename / base URL, set by resolve_string(), resolve_filename() and resolve_file()
|
16
|
+
cdef object _file  # set by Resolver.resolve_file(): the file-like object
|
17
|
+
cdef bint _close_file  # set by Resolver.resolve_file() from its 'close' argument
|
18
|
+
|
19
|
+
def __cinit__(self):
|
20
|
+
self._type = PARSER_DATA_INVALID  # stays invalid until a resolve_*() method assigns a real type
|
21
|
+
|
22
|
+
|
23
|
+
cdef class Resolver:
    "This is the base class of all resolvers."
    def resolve(self, system_url, public_id, context):
        """resolve(self, system_url, public_id, context)

        Override this method to resolve an external source identified
        by ``system_url`` and ``public_id``.  The third argument is an
        opaque context object.

        Return the result of one of the ``resolve_*()`` methods.  The
        base implementation returns None.
        """
        return None

    def resolve_empty(self, context):
        """resolve_empty(self, context)

        Return an empty input document.

        Pass context as parameter.
        """
        cdef _InputDocument input_doc = _InputDocument()
        input_doc._type = PARSER_DATA_EMPTY
        return input_doc

    def resolve_string(self, string, context, *, base_url=None):
        """resolve_string(self, string, context, base_url=None)

        Return a parsable string as input document.

        Pass data string and context as parameters.  The source URL or
        filename can be passed through the ``base_url`` keyword
        argument.

        Raises TypeError if ``string`` is neither a byte string nor a
        unicode string.
        """
        cdef _InputDocument input_doc
        if isinstance(string, unicode):
            # unicode input is stored as UTF-8 encoded bytes
            string = (<unicode>string).encode('utf8')
        elif not isinstance(string, bytes):
            raise TypeError("argument must be a byte string or unicode string")

        input_doc = _InputDocument()
        input_doc._type = PARSER_DATA_STRING
        input_doc._data_bytes = string
        if base_url is not None:
            input_doc._filename = _encodeFilename(base_url)
        return input_doc

    def resolve_filename(self, filename, context):
        """resolve_filename(self, filename, context)

        Return the name of a parsable file as input document.

        Pass filename and context as parameters.  You can also pass a
        URL with an HTTP, FTP or file target.
        """
        cdef _InputDocument input_doc = _InputDocument()
        input_doc._type = PARSER_DATA_FILENAME
        input_doc._filename = _encodeFilename(filename)
        return input_doc

    def resolve_file(self, f, context, *, base_url=None, bint close=True):
        """resolve_file(self, f, context, base_url=None, close=True)

        Return an open file-like object as input document.

        Pass open file and context as parameters.  The base URL or
        filename of the file can be passed through the ``base_url``
        keyword argument.  If the ``close`` flag is True (the default),
        the file will be closed after reading.

        Note that using ``.resolve_filename()`` is more efficient,
        especially in threaded environments.

        Raises TypeError if ``f`` has no ``read`` attribute.
        """
        cdef _InputDocument input_doc
        try:
            f.read
        except AttributeError:
            raise TypeError("Argument is not a file-like object")

        input_doc = _InputDocument()
        input_doc._type = PARSER_DATA_FILE
        if base_url is None:
            # no explicit base URL: derive a filename from the file object
            input_doc._filename = _getFilenameForFile(f)
        else:
            input_doc._filename = _encodeFilename(base_url)
        input_doc._file = f
        input_doc._close_file = close
        return input_doc
|
110
|
+
|
111
|
+
@cython.final
@cython.internal
cdef class _ResolverRegistry:
    # set of registered Resolver objects
    cdef object _resolvers
    # optional fallback that is asked when no registered resolver answers
    cdef Resolver _default_resolver
    def __cinit__(self, Resolver default_resolver=None):
        self._resolvers = set()
        self._default_resolver = default_resolver

    def add(self, Resolver resolver not None):
        """add(self, resolver)

        Register a resolver.

        For each requested entity, the 'resolve' method of the resolver
        is called and its result is passed to the parser.  If that method
        returns None, the request is delegated to other resolvers or to
        the default resolver.  Resolvers are tried in an arbitrary order
        until the first match is found.
        """
        self._resolvers.add(resolver)

    def remove(self, resolver):
        "remove(self, resolver)"
        self._resolvers.discard(resolver)

    cdef _ResolverRegistry _copy(self):
        # New registry with the same default resolver and a copy of the
        # resolver set.
        cdef _ResolverRegistry clone = _ResolverRegistry(self._default_resolver)
        clone._resolvers = self._resolvers.copy()
        return clone

    def copy(self):
        "copy(self)"
        return self._copy()

    def resolve(self, system_url, public_id, context):
        "resolve(self, system_url, public_id, context)"
        # The first registered resolver with a non-None answer wins.
        for candidate in self._resolvers:
            answer = candidate.resolve(system_url, public_id, context)
            if answer is not None:
                return answer
        fallback = self._default_resolver
        if fallback is not None:
            return fallback.resolve(system_url, public_id, context)
        return None

    def __repr__(self):
        return repr(self._resolvers)
|
159
|
+
|
160
|
+
|
161
|
+
@cython.internal
cdef class _ResolverContext(_ExceptionContext):
    # resolver registry assigned by _initResolverContext()
    cdef _ResolverRegistry _resolvers
    # temporary store assigned by _initResolverContext()
    cdef _TempStore _storage

    cdef int clear(self) except -1:
        # Run the base class clear() first, then empty the temporary store.
        # Returns 0; -1 is the declared error return value.
        _ExceptionContext.clear(self)
        self._storage.clear()
        return 0
|
170
|
+
|
171
|
+
|
172
|
+
cdef _initResolverContext(_ResolverContext context,
                          _ResolverRegistry resolvers):
    # Attach a resolver registry to the context (a fresh, empty one when
    # None is passed), followed by a new temporary store.
    if resolvers is not None:
        context._resolvers = resolvers
    else:
        context._resolvers = _ResolverRegistry()
    context._storage = _TempStore()
|