lxml 6.0.0__cp311-cp311-manylinux_2_31_armv7l.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lxml/ElementInclude.py +244 -0
- lxml/__init__.py +22 -0
- lxml/_elementpath.cpython-311-arm-linux-gnueabihf.so +0 -0
- lxml/_elementpath.py +343 -0
- lxml/apihelpers.pxi +1801 -0
- lxml/builder.cpython-311-arm-linux-gnueabihf.so +0 -0
- lxml/builder.py +243 -0
- lxml/classlookup.pxi +580 -0
- lxml/cleanup.pxi +215 -0
- lxml/cssselect.py +101 -0
- lxml/debug.pxi +36 -0
- lxml/docloader.pxi +178 -0
- lxml/doctestcompare.py +488 -0
- lxml/dtd.pxi +479 -0
- lxml/etree.cpython-311-arm-linux-gnueabihf.so +0 -0
- lxml/etree.h +244 -0
- lxml/etree.pyx +3853 -0
- lxml/etree_api.h +204 -0
- lxml/extensions.pxi +830 -0
- lxml/html/ElementSoup.py +10 -0
- lxml/html/__init__.py +1927 -0
- lxml/html/_diffcommand.py +86 -0
- lxml/html/_difflib.cpython-311-arm-linux-gnueabihf.so +0 -0
- lxml/html/_difflib.py +2106 -0
- lxml/html/_html5builder.py +100 -0
- lxml/html/_setmixin.py +56 -0
- lxml/html/builder.py +173 -0
- lxml/html/clean.py +21 -0
- lxml/html/defs.py +135 -0
- lxml/html/diff.cpython-311-arm-linux-gnueabihf.so +0 -0
- lxml/html/diff.py +972 -0
- lxml/html/formfill.py +299 -0
- lxml/html/html5parser.py +260 -0
- lxml/html/soupparser.py +314 -0
- lxml/html/usedoctest.py +13 -0
- lxml/includes/__init__.pxd +0 -0
- lxml/includes/__init__.py +0 -0
- lxml/includes/c14n.pxd +25 -0
- lxml/includes/config.pxd +3 -0
- lxml/includes/dtdvalid.pxd +18 -0
- lxml/includes/etree_defs.h +379 -0
- lxml/includes/etreepublic.pxd +237 -0
- lxml/includes/extlibs/__init__.py +0 -0
- lxml/includes/extlibs/libcharset.h +45 -0
- lxml/includes/extlibs/localcharset.h +137 -0
- lxml/includes/extlibs/zconf.h +543 -0
- lxml/includes/extlibs/zlib.h +1938 -0
- lxml/includes/htmlparser.pxd +56 -0
- lxml/includes/libexslt/__init__.py +0 -0
- lxml/includes/libexslt/exslt.h +108 -0
- lxml/includes/libexslt/exsltconfig.h +70 -0
- lxml/includes/libexslt/exsltexports.h +63 -0
- lxml/includes/libxml/HTMLparser.h +339 -0
- lxml/includes/libxml/HTMLtree.h +148 -0
- lxml/includes/libxml/SAX.h +18 -0
- lxml/includes/libxml/SAX2.h +170 -0
- lxml/includes/libxml/__init__.py +0 -0
- lxml/includes/libxml/c14n.h +115 -0
- lxml/includes/libxml/catalog.h +183 -0
- lxml/includes/libxml/chvalid.h +230 -0
- lxml/includes/libxml/debugXML.h +79 -0
- lxml/includes/libxml/dict.h +82 -0
- lxml/includes/libxml/encoding.h +307 -0
- lxml/includes/libxml/entities.h +147 -0
- lxml/includes/libxml/globals.h +25 -0
- lxml/includes/libxml/hash.h +251 -0
- lxml/includes/libxml/list.h +137 -0
- lxml/includes/libxml/nanoftp.h +16 -0
- lxml/includes/libxml/nanohttp.h +98 -0
- lxml/includes/libxml/parser.h +1633 -0
- lxml/includes/libxml/parserInternals.h +591 -0
- lxml/includes/libxml/relaxng.h +224 -0
- lxml/includes/libxml/schemasInternals.h +959 -0
- lxml/includes/libxml/schematron.h +143 -0
- lxml/includes/libxml/threads.h +81 -0
- lxml/includes/libxml/tree.h +1326 -0
- lxml/includes/libxml/uri.h +106 -0
- lxml/includes/libxml/valid.h +485 -0
- lxml/includes/libxml/xinclude.h +141 -0
- lxml/includes/libxml/xlink.h +193 -0
- lxml/includes/libxml/xmlIO.h +419 -0
- lxml/includes/libxml/xmlautomata.h +163 -0
- lxml/includes/libxml/xmlerror.h +962 -0
- lxml/includes/libxml/xmlexports.h +96 -0
- lxml/includes/libxml/xmlmemory.h +188 -0
- lxml/includes/libxml/xmlmodule.h +61 -0
- lxml/includes/libxml/xmlreader.h +444 -0
- lxml/includes/libxml/xmlregexp.h +116 -0
- lxml/includes/libxml/xmlsave.h +111 -0
- lxml/includes/libxml/xmlschemas.h +254 -0
- lxml/includes/libxml/xmlschemastypes.h +152 -0
- lxml/includes/libxml/xmlstring.h +140 -0
- lxml/includes/libxml/xmlunicode.h +15 -0
- lxml/includes/libxml/xmlversion.h +332 -0
- lxml/includes/libxml/xmlwriter.h +489 -0
- lxml/includes/libxml/xpath.h +569 -0
- lxml/includes/libxml/xpathInternals.h +639 -0
- lxml/includes/libxml/xpointer.h +48 -0
- lxml/includes/libxslt/__init__.py +0 -0
- lxml/includes/libxslt/attributes.h +39 -0
- lxml/includes/libxslt/documents.h +93 -0
- lxml/includes/libxslt/extensions.h +262 -0
- lxml/includes/libxslt/extra.h +72 -0
- lxml/includes/libxslt/functions.h +78 -0
- lxml/includes/libxslt/imports.h +75 -0
- lxml/includes/libxslt/keys.h +53 -0
- lxml/includes/libxslt/namespaces.h +68 -0
- lxml/includes/libxslt/numbersInternals.h +73 -0
- lxml/includes/libxslt/pattern.h +84 -0
- lxml/includes/libxslt/preproc.h +43 -0
- lxml/includes/libxslt/security.h +104 -0
- lxml/includes/libxslt/templates.h +77 -0
- lxml/includes/libxslt/transform.h +207 -0
- lxml/includes/libxslt/variables.h +118 -0
- lxml/includes/libxslt/xslt.h +110 -0
- lxml/includes/libxslt/xsltInternals.h +1995 -0
- lxml/includes/libxslt/xsltconfig.h +146 -0
- lxml/includes/libxslt/xsltexports.h +64 -0
- lxml/includes/libxslt/xsltlocale.h +44 -0
- lxml/includes/libxslt/xsltutils.h +343 -0
- lxml/includes/lxml-version.h +3 -0
- lxml/includes/relaxng.pxd +64 -0
- lxml/includes/schematron.pxd +34 -0
- lxml/includes/tree.pxd +492 -0
- lxml/includes/uri.pxd +5 -0
- lxml/includes/xinclude.pxd +22 -0
- lxml/includes/xmlerror.pxd +852 -0
- lxml/includes/xmlparser.pxd +303 -0
- lxml/includes/xmlschema.pxd +35 -0
- lxml/includes/xpath.pxd +136 -0
- lxml/includes/xslt.pxd +190 -0
- lxml/isoschematron/__init__.py +348 -0
- lxml/isoschematron/resources/rng/iso-schematron.rng +709 -0
- lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl +75 -0
- lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl +77 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl +313 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl +1160 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl +55 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl +1796 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl +588 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt +84 -0
- lxml/iterparse.pxi +438 -0
- lxml/lxml.etree.h +244 -0
- lxml/lxml.etree_api.h +204 -0
- lxml/nsclasses.pxi +281 -0
- lxml/objectify.cpython-311-arm-linux-gnueabihf.so +0 -0
- lxml/objectify.pyx +2149 -0
- lxml/objectpath.pxi +332 -0
- lxml/parser.pxi +2059 -0
- lxml/parsertarget.pxi +180 -0
- lxml/proxy.pxi +619 -0
- lxml/public-api.pxi +178 -0
- lxml/pyclasslookup.py +3 -0
- lxml/readonlytree.pxi +565 -0
- lxml/relaxng.pxi +165 -0
- lxml/sax.cpython-311-arm-linux-gnueabihf.so +0 -0
- lxml/sax.py +286 -0
- lxml/saxparser.pxi +875 -0
- lxml/schematron.pxi +173 -0
- lxml/serializer.pxi +1849 -0
- lxml/usedoctest.py +13 -0
- lxml/xinclude.pxi +67 -0
- lxml/xmlerror.pxi +1654 -0
- lxml/xmlid.pxi +179 -0
- lxml/xmlschema.pxi +215 -0
- lxml/xpath.pxi +487 -0
- lxml/xslt.pxi +957 -0
- lxml/xsltext.pxi +242 -0
- lxml-6.0.0.dist-info/METADATA +163 -0
- lxml-6.0.0.dist-info/RECORD +174 -0
- lxml-6.0.0.dist-info/WHEEL +5 -0
- lxml-6.0.0.dist-info/licenses/LICENSE.txt +31 -0
- lxml-6.0.0.dist-info/licenses/LICENSES.txt +29 -0
- lxml-6.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,100 @@
|
|
1
|
+
"""
|
2
|
+
Legacy module - don't use in new code!
|
3
|
+
|
4
|
+
html5lib now has its own proper implementation.
|
5
|
+
|
6
|
+
This module implements a tree builder for html5lib that generates lxml
|
7
|
+
html element trees. This module uses camelCase as it follows the
|
8
|
+
html5lib style guide.
|
9
|
+
"""
|
10
|
+
|
11
|
+
from html5lib.treebuilders import _base, etree as etree_builders
|
12
|
+
from lxml import html, etree
|
13
|
+
|
14
|
+
|
15
|
+
class DocumentType:
    """Plain record of a doctype declaration: a name plus public/system ids."""

    def __init__(self, name, publicId, systemId):
        # The three values are stored verbatim; nothing is validated here.
        self.name, self.publicId, self.systemId = name, publicId, systemId
|
21
|
+
|
22
|
+
class Document:
    """Holder for the element tree being built.

    ``_elementTree`` starts out as ``None`` and has to be assigned before
    ``appendChild`` can be used; ``childNodes`` starts out as an empty list.
    """

    def __init__(self):
        self.childNodes = []
        self._elementTree = None

    def appendChild(self, element):
        """Add ``element._element`` as the next sibling of the tree's root."""
        root = self._elementTree.getroot()
        root.addnext(element._element)
|
30
|
+
|
31
|
+
|
32
|
+
class TreeBuilder(_base.TreeBuilder):
    """Legacy html5lib tree builder that generates lxml.html element trees.

    ``elementClass`` and ``commentClass`` are placeholders at class level and
    are assigned per instance in ``__init__``.
    """
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = None
    commentClass = None
    fragmentClass = Document

    def __init__(self, *args, **kwargs):
        """Pick the element/comment wrapper classes, then run the base initialiser."""
        html_builder = etree_builders.getETreeModule(html, fullTree=False)
        etree_builder = etree_builders.getETreeModule(etree, fullTree=False)
        self.elementClass = html_builder.Element
        self.commentClass = etree_builder.Comment
        _base.TreeBuilder.__init__(self, *args, **kwargs)

    def reset(self):
        """Reset the base builder and clear the root/comment/doctype bookkeeping."""
        _base.TreeBuilder.reset(self)
        self.rootInserted = False
        self.initialComments = []
        self.doctype = None

    def getDocument(self):
        """Return the element tree stored on the current document object."""
        return self.document._elementTree

    def getFragment(self):
        """Return the first open element's content as a flat list.

        The list holds the element's leading text (if any), its child
        elements, and its tail text (if any), in that order.
        """
        fragment = []
        element = self.openElements[0]._element
        if element.text:
            fragment.append(element.text)
        # Iterating an element yields its children; this replaces the
        # deprecated getchildren() call.
        fragment.extend(element)
        if element.tail:
            fragment.append(element.tail)
        return fragment

    def insertDoctype(self, name, publicId, systemId):
        """Record the doctype; it is only serialised later, by insertRoot()."""
        doctype = self.doctypeClass(name, publicId, systemId)
        self.doctype = doctype

    def insertComment(self, data, parent=None):
        """Buffer comments seen before the root exists, otherwise delegate."""
        if not self.rootInserted:
            self.initialComments.append(data)
        else:
            _base.TreeBuilder.insertComment(self, data, parent)

    def insertRoot(self, name):
        """Create the root ``<html>`` document and register it with the builder.

        A DOCTYPE is emitted only when a doctype with a non-empty name was
        recorded.  Comments buffered before the root are placed in front of it.
        """
        buf = []
        if self.doctype and self.doctype.name:
            buf.append('<!DOCTYPE %s' % self.doctype.name)
            publicId = self.doctype.publicId
            systemId = self.doctype.systemId
            if publicId is not None or systemId is not None:
                # Only one of the two ids may be set.  Use an empty string for
                # the missing one so the literal text "None" never ends up in
                # the generated DOCTYPE.
                buf.append(' PUBLIC "%s" "%s"' % (
                    '' if publicId is None else publicId,
                    '' if systemId is None else systemId))
            buf.append('>')
        buf.append('<html></html>')
        root = html.fromstring(''.join(buf))

        # Append the initial comments; each one goes directly before the
        # root, so their original order is kept.
        for comment in self.initialComments:
            root.addprevious(etree.Comment(comment))

        # Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()

        # Add the root element to the internal child/open data structures
        root_element = self.elementClass(name)
        root_element._element = root
        self.document.childNodes.append(root_element)
        self.openElements.append(root_element)

        self.rootInserted = True
|
lxml/html/_setmixin.py
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
# A fallback import is pointless here: the former ``except ImportError``
# branch repeated this exact statement and could never succeed if it failed.
from collections.abc import MutableSet
|
5
|
+
|
6
|
+
|
7
|
+
class SetMixin(MutableSet):

    """
    Set mix-in on top of ``MutableSet``.  Subclasses provide ``__iter__``,
    ``add`` and ``remove``; everything else is derived from those three.
    """

    def __len__(self):
        # The size is obtained by walking the iterator once.
        return sum(1 for _ in self)

    def __contains__(self, item):
        # Linear scan comparing with ``==``; stops at the first match.
        return any(item == member for member in self)

    # Named-method spellings of the comparison operators from MutableSet.
    issubset = MutableSet.__le__
    issuperset = MutableSet.__ge__

    # Named-method spellings of the binary set operators from MutableSet.
    union = MutableSet.__or__
    intersection = MutableSet.__and__
    difference = MutableSet.__sub__
    symmetric_difference = MutableSet.__xor__

    def copy(self):
        """Return the current members as a plain built-in ``set``."""
        return set(self)

    def update(self, other):
        """In-place union with ``other``."""
        self |= other

    def intersection_update(self, other):
        """In-place intersection with ``other``."""
        self &= other

    def difference_update(self, other):
        """In-place difference with ``other``."""
        self -= other

    def symmetric_difference_update(self, other):
        """In-place symmetric difference with ``other``."""
        self ^= other

    def discard(self, item):
        """Remove ``item`` if present; a ``KeyError`` from ``remove`` is ignored."""
        try:
            self.remove(item)
        except KeyError:
            return

    @classmethod
    def _from_iterable(cls, it):
        # Results of set operations are built as plain ``set`` objects.
        return set(it)
|
lxml/html/builder.py
ADDED
@@ -0,0 +1,173 @@
|
|
1
|
+
# --------------------------------------------------------------------
|
2
|
+
# The ElementTree toolkit is
|
3
|
+
# Copyright (c) 1999-2004 by Fredrik Lundh
|
4
|
+
# --------------------------------------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
A set of HTML generator tags for building HTML documents.
|
8
|
+
|
9
|
+
Usage::
|
10
|
+
|
11
|
+
>>> from lxml.html.builder import *
|
12
|
+
>>> html = HTML(
|
13
|
+
... HEAD( TITLE("Hello World") ),
|
14
|
+
... BODY( CLASS("main"),
|
15
|
+
... H1("Hello World !")
|
16
|
+
... )
|
17
|
+
... )
|
18
|
+
|
19
|
+
>>> import lxml.etree
|
20
|
+
>>> print(lxml.etree.tostring(html, pretty_print=True, encoding='unicode'))
|
21
|
+
<html>
|
22
|
+
<head>
|
23
|
+
<title>Hello World</title>
|
24
|
+
</head>
|
25
|
+
<body class="main">
|
26
|
+
<h1>Hello World !</h1>
|
27
|
+
</body>
|
28
|
+
</html>
|
29
|
+
|
30
|
+
"""
|
31
|
+
|
32
|
+
from lxml.builder import ElementMaker
|
33
|
+
from lxml.html import html_parser
|
34
|
+
|
35
|
+
E = ElementMaker(makeelement=html_parser.makeelement)
|
36
|
+
|
37
|
+
# elements
|
38
|
+
A = E.a #: anchor
|
39
|
+
ABBR = E.abbr #: abbreviated form (e.g., WWW, HTTP, etc.)
|
40
|
+
ACRONYM = E.acronym #:
|
41
|
+
ADDRESS = E.address #: information on author
|
42
|
+
APPLET = E.applet #: Java applet (DEPRECATED)
|
43
|
+
AREA = E.area #: client-side image map area
|
44
|
+
ARTICLE = E.article #: self-contained article
|
45
|
+
ASIDE = E.aside #: indirectly-related content
|
46
|
+
AUDIO = E.audio #: embedded audio file
|
47
|
+
B = E.b #: bold text style
|
48
|
+
BASE = E.base #: document base URI
|
49
|
+
BASEFONT = E.basefont #: base font size (DEPRECATED)
|
50
|
+
BDI = E.bdi #: isolate bidirectional text
|
51
|
+
BDO = E.bdo #: I18N BiDi over-ride
|
52
|
+
BIG = E.big #: large text style
|
53
|
+
BLOCKQUOTE = E.blockquote #: long quotation
|
54
|
+
BODY = E.body #: document body
|
55
|
+
BR = E.br #: forced line break
|
56
|
+
BUTTON = E.button #: push button
|
57
|
+
CANVAS = E.canvas #: scriptable graphics container
|
58
|
+
CAPTION = E.caption #: table caption
|
59
|
+
CENTER = E.center #: shorthand for DIV align=center (DEPRECATED)
|
60
|
+
CITE = E.cite #: citation
|
61
|
+
CODE = E.code #: computer code fragment
|
62
|
+
COL = E.col #: table column
|
63
|
+
COLGROUP = E.colgroup #: table column group
|
64
|
+
DATA = E.data #: machine-readable translation
|
65
|
+
DATALIST = E.datalist #: list of options for an input
|
66
|
+
DD = E.dd #: definition description
|
67
|
+
DEL = getattr(E, 'del') #: deleted text
|
68
|
+
DETAILS = E.details #: expandable section
|
69
|
+
DFN = E.dfn #: instance definition
|
70
|
+
DIALOG = E.dialog #: dialog box
|
71
|
+
DIR = E.dir #: directory list (DEPRECATED)
|
72
|
+
DIV = E.div #: generic language/style container
|
73
|
+
DL = E.dl #: definition list
|
74
|
+
DT = E.dt #: definition term
|
75
|
+
EM = E.em #: emphasis
|
76
|
+
EMBED = E.embed #: embedded external content
|
77
|
+
FIELDSET = E.fieldset #: form control group
|
78
|
+
FIGCAPTION = E.figcaption #: figure caption
|
79
|
+
FIGURE = E.figure #: self-contained, possibly-captioned content
|
80
|
+
FONT = E.font #: local change to font (DEPRECATED)
|
81
|
+
FOOTER = E.footer #: footer for nearest ancestor
|
82
|
+
FORM = E.form #: interactive form
|
83
|
+
FRAME = E.frame #: subwindow
|
84
|
+
FRAMESET = E.frameset #: window subdivision
|
85
|
+
H1 = E.h1 #: heading
|
86
|
+
H2 = E.h2 #: heading
|
87
|
+
H3 = E.h3 #: heading
|
88
|
+
H4 = E.h4 #: heading
|
89
|
+
H5 = E.h5 #: heading
|
90
|
+
H6 = E.h6 #: heading
|
91
|
+
HEAD = E.head #: document head
|
92
|
+
HEADER = E.header #: heading content
|
93
|
+
HGROUP = E.hgroup #: heading group
|
94
|
+
HR = E.hr #: horizontal rule
|
95
|
+
HTML = E.html #: document root element
|
96
|
+
I = E.i #: italic text style
|
97
|
+
IFRAME = E.iframe #: inline subwindow
|
98
|
+
IMG = E.img #: Embedded image
|
99
|
+
INPUT = E.input #: form control
|
100
|
+
INS = E.ins #: inserted text
|
101
|
+
ISINDEX = E.isindex #: single line prompt (DEPRECATED)
|
102
|
+
KBD = E.kbd #: text to be entered by the user
|
103
|
+
LABEL = E.label #: form field label text
|
104
|
+
LEGEND = E.legend #: fieldset legend
|
105
|
+
LI = E.li #: list item
|
106
|
+
LINK = E.link #: a media-independent link
|
107
|
+
MAIN = E.main #: main content
|
108
|
+
MAP = E.map #: client-side image map
|
109
|
+
MARK = E.mark #: marked/highlighted text
|
110
|
+
MARQUEE = E.marquee #: scrolling text
|
111
|
+
MENU = E.menu #: menu list (DEPRECATED)
|
112
|
+
META = E.meta #: generic metainformation
|
113
|
+
METER = E.meter #: numerical value display
|
114
|
+
NAV = E.nav #: navigation section
|
115
|
+
NOBR = E.nobr #: prevent wrapping
|
116
|
+
NOFRAMES = E.noframes #: alternate content container for non frame-based rendering
|
117
|
+
NOSCRIPT = E.noscript #: alternate content container for non script-based rendering
|
118
|
+
OBJECT = E.object #: generic embedded object
|
119
|
+
OL = E.ol #: ordered list
|
120
|
+
OPTGROUP = E.optgroup #: option group
|
121
|
+
OPTION = E.option #: selectable choice
|
122
|
+
OUTPUT = E.output #: result of a calculation
|
123
|
+
P = E.p #: paragraph
|
124
|
+
PARAM = E.param #: named property value
|
125
|
+
PICTURE = E.picture #: picture with multiple sources
|
126
|
+
PORTAL = E.portal #: embedded preview
|
127
|
+
PRE = E.pre #: preformatted text
|
128
|
+
PROGRESS = E.progress #: progress bar
|
129
|
+
Q = E.q #: short inline quotation
|
130
|
+
RB = E.rb #: ruby base text
|
131
|
+
RP = E.rp #: ruby parentheses
|
132
|
+
RT = E.rt #: ruby text component
|
133
|
+
RTC = E.rtc #: ruby semantic annotation
|
134
|
+
RUBY = E.ruby #: ruby annotations
|
135
|
+
S = E.s #: strike-through text style (DEPRECATED)
|
136
|
+
SAMP = E.samp #: sample program output, scripts, etc.
|
137
|
+
SCRIPT = E.script #: script statements
|
138
|
+
SEARCH = E.search #: set of form controls for a search
|
139
|
+
SECTION = E.section #: generic standalone section
|
140
|
+
SELECT = E.select #: option selector
|
141
|
+
SLOT = E.slot #: placeholder for JS use
|
142
|
+
SMALL = E.small #: small text style
|
143
|
+
SOURCE = E.source #: source for picture/audio/video element
|
144
|
+
SPAN = E.span #: generic language/style container
|
145
|
+
STRIKE = E.strike #: strike-through text (DEPRECATED)
|
146
|
+
STRONG = E.strong #: strong emphasis
|
147
|
+
STYLE = E.style #: style info
|
148
|
+
SUB = E.sub #: subscript
|
149
|
+
SUMMARY = E.summary #: summary for <details>
|
150
|
+
SUP = E.sup #: superscript
|
151
|
+
TABLE = E.table #:
|
152
|
+
TBODY = E.tbody #: table body
|
153
|
+
TD = E.td #: table data cell
|
154
|
+
TEMPLATE = E.template #: fragment for JS use
|
155
|
+
TEXTAREA = E.textarea #: multi-line text field
|
156
|
+
TFOOT = E.tfoot #: table footer
|
157
|
+
TH = E.th #: table header cell
|
158
|
+
THEAD = E.thead #: table header
|
159
|
+
TIME = E.time #: date/time
|
160
|
+
TITLE = E.title #: document title
|
161
|
+
TR = E.tr #: table row
|
162
|
+
TRACK = E.track #: audio/video track
|
163
|
+
TT = E.tt #: teletype or monospaced text style
|
164
|
+
U = E.u #: underlined text style (DEPRECATED)
|
165
|
+
UL = E.ul #: unordered list
|
166
|
+
VAR = E.var #: instance of a variable or program argument
|
167
|
+
VIDEO = E.video #: embedded video file
|
168
|
+
WBR = E.wbr #: word break
|
169
|
+
|
170
|
+
# attributes (only reserved words are included here)
|
171
|
+
ATTR = dict
|
172
|
+
def CLASS(v):
    """Return a one-entry attribute dict mapping ``'class'`` to *v*."""
    attrs = {}
    attrs['class'] = v
    return attrs
|
173
|
+
def FOR(v):
    """Return a one-entry attribute dict mapping ``'for'`` to *v*."""
    attrs = {}
    attrs['for'] = v
    return attrs
|
lxml/html/clean.py
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# cython: language_level=3str
|
2
|
+
|
3
|
+
"""Backward-compatibility module for lxml_html_clean"""
|
4
|
+
|
5
|
+
try:
|
6
|
+
from lxml_html_clean import *
|
7
|
+
|
8
|
+
__all__ = [
|
9
|
+
"clean_html",
|
10
|
+
"clean",
|
11
|
+
"Cleaner",
|
12
|
+
"autolink",
|
13
|
+
"autolink_html",
|
14
|
+
"word_break",
|
15
|
+
"word_break_html",
|
16
|
+
]
|
17
|
+
except ImportError:
|
18
|
+
raise ImportError(
|
19
|
+
"lxml.html.clean module is now a separate project lxml_html_clean.\n"
|
20
|
+
"Install lxml[html_clean] or lxml_html_clean directly."
|
21
|
+
) from None
|
lxml/html/defs.py
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
# FIXME: this should all be confirmed against what a DTD says
|
2
|
+
# (probably in a test; this may not match the DTD exactly, but we
|
3
|
+
# should document just how it differs).
|
4
|
+
|
5
|
+
"""
|
6
|
+
Data taken from https://www.w3.org/TR/html401/index/elements.html
|
7
|
+
and https://html.spec.whatwg.org/multipage/syntax.html#elements-2
|
8
|
+
for html5_tags.
|
9
|
+
"""
|
10
|
+
|
11
|
+
empty_tags = frozenset([
|
12
|
+
'area', 'base', 'basefont', 'br', 'col', 'embed', 'frame', 'hr',
|
13
|
+
'img', 'input', 'isindex', 'link', 'meta', 'param', 'source', 'track', 'wbr'])
|
14
|
+
|
15
|
+
deprecated_tags = frozenset([
|
16
|
+
'applet', 'basefont', 'center', 'dir', 'font', 'isindex',
|
17
|
+
'menu', 's', 'strike', 'u'])
|
18
|
+
|
19
|
+
# archive actually takes a space-separated list of URIs
|
20
|
+
link_attrs = frozenset([
|
21
|
+
'action', 'archive', 'background', 'cite', 'classid',
|
22
|
+
'codebase', 'data', 'href', 'longdesc', 'profile', 'src',
|
23
|
+
'usemap',
|
24
|
+
# Not standard:
|
25
|
+
'dynsrc', 'lowsrc',
|
26
|
+
# HTML5 formaction
|
27
|
+
'formaction'
|
28
|
+
])
|
29
|
+
|
30
|
+
# Not in the HTML 4 spec:
|
31
|
+
# onerror, onresize
|
32
|
+
event_attrs = frozenset([
|
33
|
+
'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
|
34
|
+
'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
|
35
|
+
'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
|
36
|
+
'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit',
|
37
|
+
'onunload',
|
38
|
+
])
|
39
|
+
|
40
|
+
safe_attrs = frozenset([
|
41
|
+
'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
|
42
|
+
'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
|
43
|
+
'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan',
|
44
|
+
'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
|
45
|
+
'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id',
|
46
|
+
'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
|
47
|
+
'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
|
48
|
+
'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
|
49
|
+
'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
|
50
|
+
'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
|
51
|
+
|
52
|
+
# From http://htmlhelp.com/reference/html40/olist.html
|
53
|
+
top_level_tags = frozenset([
|
54
|
+
'html', 'head', 'body', 'frameset',
|
55
|
+
])
|
56
|
+
|
57
|
+
head_tags = frozenset([
|
58
|
+
'base', 'isindex', 'link', 'meta', 'script', 'style', 'title',
|
59
|
+
])
|
60
|
+
|
61
|
+
general_block_tags = frozenset([
|
62
|
+
'address',
|
63
|
+
'blockquote',
|
64
|
+
'center',
|
65
|
+
'del',
|
66
|
+
'div',
|
67
|
+
'h1',
|
68
|
+
'h2',
|
69
|
+
'h3',
|
70
|
+
'h4',
|
71
|
+
'h5',
|
72
|
+
'h6',
|
73
|
+
'hr',
|
74
|
+
'ins',
|
75
|
+
'isindex',
|
76
|
+
'noscript',
|
77
|
+
'p',
|
78
|
+
'pre',
|
79
|
+
])
|
80
|
+
|
81
|
+
list_tags = frozenset([
|
82
|
+
'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
|
83
|
+
])
|
84
|
+
|
85
|
+
table_tags = frozenset([
|
86
|
+
'table', 'caption', 'colgroup', 'col',
|
87
|
+
'thead', 'tfoot', 'tbody', 'tr', 'td', 'th',
|
88
|
+
])
|
89
|
+
|
90
|
+
# just this one from
|
91
|
+
# http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm
|
92
|
+
block_tags = general_block_tags | list_tags | table_tags | frozenset([
|
93
|
+
# Partial form tags
|
94
|
+
'fieldset', 'form', 'legend', 'optgroup', 'option',
|
95
|
+
])
|
96
|
+
|
97
|
+
form_tags = frozenset([
|
98
|
+
'form', 'button', 'fieldset', 'legend', 'input', 'label',
|
99
|
+
'select', 'optgroup', 'option', 'textarea',
|
100
|
+
])
|
101
|
+
|
102
|
+
special_inline_tags = frozenset([
|
103
|
+
'a', 'applet', 'basefont', 'bdo', 'br', 'embed', 'font', 'iframe',
|
104
|
+
'img', 'map', 'area', 'object', 'param', 'q', 'script',
|
105
|
+
'span', 'sub', 'sup',
|
106
|
+
])
|
107
|
+
|
108
|
+
phrase_tags = frozenset([
|
109
|
+
'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em',
|
110
|
+
'ins', 'kbd', 'samp', 'strong', 'var',
|
111
|
+
])
|
112
|
+
|
113
|
+
font_style_tags = frozenset([
|
114
|
+
'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u',
|
115
|
+
])
|
116
|
+
|
117
|
+
frame_tags = frozenset([
|
118
|
+
'frameset', 'frame', 'noframes',
|
119
|
+
])
|
120
|
+
|
121
|
+
html5_tags = frozenset([
|
122
|
+
'article', 'aside', 'audio', 'canvas', 'command', 'datalist',
|
123
|
+
'details', 'embed', 'figcaption', 'figure', 'footer', 'header',
|
124
|
+
'hgroup', 'keygen', 'mark', 'math', 'meter', 'nav', 'output',
|
125
|
+
'progress', 'rp', 'rt', 'ruby', 'section', 'source', 'summary',
|
126
|
+
'svg', 'time', 'track', 'video', 'wbr'
|
127
|
+
])
|
128
|
+
|
129
|
+
# These tags aren't standard
|
130
|
+
nonstandard_tags = frozenset(['blink', 'marquee'])
|
131
|
+
|
132
|
+
|
133
|
+
tags = (top_level_tags | head_tags | general_block_tags | list_tags
|
134
|
+
| table_tags | form_tags | special_inline_tags | phrase_tags
|
135
|
+
| font_style_tags | nonstandard_tags | html5_tags)
|
Binary file
|