lxml 5.3.2__cp310-cp310-win32.win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lxml/ElementInclude.py +244 -0
- lxml/__init__.py +22 -0
- lxml/_elementpath.cp310-win32.pyd +0 -0
- lxml/_elementpath.py +341 -0
- lxml/apihelpers.pxi +1793 -0
- lxml/builder.cp310-win32.pyd +0 -0
- lxml/builder.py +232 -0
- lxml/classlookup.pxi +580 -0
- lxml/cleanup.pxi +215 -0
- lxml/cssselect.py +101 -0
- lxml/debug.pxi +90 -0
- lxml/docloader.pxi +178 -0
- lxml/doctestcompare.py +488 -0
- lxml/dtd.pxi +479 -0
- lxml/etree.cp310-win32.pyd +0 -0
- lxml/etree.h +248 -0
- lxml/etree.pyx +3732 -0
- lxml/etree_api.h +195 -0
- lxml/extensions.pxi +833 -0
- lxml/html/ElementSoup.py +10 -0
- lxml/html/__init__.py +1923 -0
- lxml/html/_diffcommand.py +86 -0
- lxml/html/_html5builder.py +100 -0
- lxml/html/_setmixin.py +56 -0
- lxml/html/builder.py +133 -0
- lxml/html/clean.py +21 -0
- lxml/html/defs.py +135 -0
- lxml/html/diff.cp310-win32.pyd +0 -0
- lxml/html/diff.py +878 -0
- lxml/html/formfill.py +299 -0
- lxml/html/html5parser.py +260 -0
- lxml/html/soupparser.py +314 -0
- lxml/html/usedoctest.py +13 -0
- lxml/includes/__init__.pxd +0 -0
- lxml/includes/__init__.py +0 -0
- lxml/includes/c14n.pxd +25 -0
- lxml/includes/config.pxd +3 -0
- lxml/includes/dtdvalid.pxd +18 -0
- lxml/includes/etree_defs.h +379 -0
- lxml/includes/etreepublic.pxd +237 -0
- lxml/includes/extlibs/__init__.py +0 -0
- lxml/includes/extlibs/zconf.h +543 -0
- lxml/includes/extlibs/zlib.h +1938 -0
- lxml/includes/htmlparser.pxd +56 -0
- lxml/includes/libexslt/__init__.py +0 -0
- lxml/includes/libexslt/exslt.h +108 -0
- lxml/includes/libexslt/exsltconfig.h +70 -0
- lxml/includes/libexslt/exsltexports.h +63 -0
- lxml/includes/libexslt/libexslt.h +29 -0
- lxml/includes/libxml/HTMLparser.h +320 -0
- lxml/includes/libxml/HTMLtree.h +147 -0
- lxml/includes/libxml/SAX.h +204 -0
- lxml/includes/libxml/SAX2.h +173 -0
- lxml/includes/libxml/__init__.py +0 -0
- lxml/includes/libxml/c14n.h +128 -0
- lxml/includes/libxml/catalog.h +182 -0
- lxml/includes/libxml/chvalid.h +230 -0
- lxml/includes/libxml/debugXML.h +217 -0
- lxml/includes/libxml/dict.h +81 -0
- lxml/includes/libxml/encoding.h +233 -0
- lxml/includes/libxml/entities.h +151 -0
- lxml/includes/libxml/globals.h +529 -0
- lxml/includes/libxml/hash.h +236 -0
- lxml/includes/libxml/list.h +137 -0
- lxml/includes/libxml/nanoftp.h +186 -0
- lxml/includes/libxml/nanohttp.h +81 -0
- lxml/includes/libxml/parser.h +1265 -0
- lxml/includes/libxml/parserInternals.h +662 -0
- lxml/includes/libxml/pattern.h +100 -0
- lxml/includes/libxml/relaxng.h +218 -0
- lxml/includes/libxml/schemasInternals.h +958 -0
- lxml/includes/libxml/schematron.h +142 -0
- lxml/includes/libxml/threads.h +94 -0
- lxml/includes/libxml/tree.h +1314 -0
- lxml/includes/libxml/uri.h +94 -0
- lxml/includes/libxml/valid.h +448 -0
- lxml/includes/libxml/xinclude.h +129 -0
- lxml/includes/libxml/xlink.h +189 -0
- lxml/includes/libxml/xmlIO.h +369 -0
- lxml/includes/libxml/xmlautomata.h +146 -0
- lxml/includes/libxml/xmlerror.h +919 -0
- lxml/includes/libxml/xmlexports.h +50 -0
- lxml/includes/libxml/xmlmemory.h +228 -0
- lxml/includes/libxml/xmlmodule.h +57 -0
- lxml/includes/libxml/xmlreader.h +428 -0
- lxml/includes/libxml/xmlregexp.h +222 -0
- lxml/includes/libxml/xmlsave.h +88 -0
- lxml/includes/libxml/xmlschemas.h +246 -0
- lxml/includes/libxml/xmlschemastypes.h +152 -0
- lxml/includes/libxml/xmlstring.h +140 -0
- lxml/includes/libxml/xmlunicode.h +202 -0
- lxml/includes/libxml/xmlversion.h +526 -0
- lxml/includes/libxml/xmlwriter.h +488 -0
- lxml/includes/libxml/xpath.h +575 -0
- lxml/includes/libxml/xpathInternals.h +632 -0
- lxml/includes/libxml/xpointer.h +137 -0
- lxml/includes/libxslt/__init__.py +0 -0
- lxml/includes/libxslt/attributes.h +39 -0
- lxml/includes/libxslt/documents.h +93 -0
- lxml/includes/libxslt/extensions.h +262 -0
- lxml/includes/libxslt/extra.h +72 -0
- lxml/includes/libxslt/functions.h +78 -0
- lxml/includes/libxslt/imports.h +75 -0
- lxml/includes/libxslt/keys.h +53 -0
- lxml/includes/libxslt/libxslt.h +36 -0
- lxml/includes/libxslt/namespaces.h +68 -0
- lxml/includes/libxslt/numbersInternals.h +73 -0
- lxml/includes/libxslt/preproc.h +43 -0
- lxml/includes/libxslt/security.h +104 -0
- lxml/includes/libxslt/templates.h +77 -0
- lxml/includes/libxslt/transform.h +207 -0
- lxml/includes/libxslt/trio.h +216 -0
- lxml/includes/libxslt/triodef.h +220 -0
- lxml/includes/libxslt/variables.h +118 -0
- lxml/includes/libxslt/win32config.h +51 -0
- lxml/includes/libxslt/xslt.h +110 -0
- lxml/includes/libxslt/xsltInternals.h +1992 -0
- lxml/includes/libxslt/xsltconfig.h +179 -0
- lxml/includes/libxslt/xsltexports.h +64 -0
- lxml/includes/libxslt/xsltlocale.h +44 -0
- lxml/includes/libxslt/xsltutils.h +343 -0
- lxml/includes/lxml-version.h +3 -0
- lxml/includes/relaxng.pxd +64 -0
- lxml/includes/schematron.pxd +34 -0
- lxml/includes/tree.pxd +494 -0
- lxml/includes/uri.pxd +5 -0
- lxml/includes/xinclude.pxd +22 -0
- lxml/includes/xmlerror.pxd +852 -0
- lxml/includes/xmlparser.pxd +265 -0
- lxml/includes/xmlschema.pxd +35 -0
- lxml/includes/xpath.pxd +136 -0
- lxml/includes/xslt.pxd +190 -0
- lxml/isoschematron/__init__.py +348 -0
- lxml/isoschematron/resources/rng/iso-schematron.rng +709 -0
- lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl +75 -0
- lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl +77 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl +313 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl +1160 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl +55 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl +1796 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl +588 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt +84 -0
- lxml/iterparse.pxi +438 -0
- lxml/lxml.etree.h +248 -0
- lxml/lxml.etree_api.h +195 -0
- lxml/nsclasses.pxi +281 -0
- lxml/objectify.cp310-win32.pyd +0 -0
- lxml/objectify.pyx +2145 -0
- lxml/objectpath.pxi +332 -0
- lxml/parser.pxi +2000 -0
- lxml/parsertarget.pxi +180 -0
- lxml/proxy.pxi +619 -0
- lxml/public-api.pxi +178 -0
- lxml/pyclasslookup.py +3 -0
- lxml/readonlytree.pxi +565 -0
- lxml/relaxng.pxi +165 -0
- lxml/sax.cp310-win32.pyd +0 -0
- lxml/sax.py +275 -0
- lxml/saxparser.pxi +875 -0
- lxml/schematron.pxi +168 -0
- lxml/serializer.pxi +1781 -0
- lxml/usedoctest.py +13 -0
- lxml/xinclude.pxi +67 -0
- lxml/xmlerror.pxi +1654 -0
- lxml/xmlid.pxi +179 -0
- lxml/xmlschema.pxi +215 -0
- lxml/xpath.pxi +487 -0
- lxml/xslt.pxi +950 -0
- lxml/xsltext.pxi +242 -0
- lxml-5.3.2.dist-info/METADATA +100 -0
- lxml-5.3.2.dist-info/RECORD +175 -0
- lxml-5.3.2.dist-info/WHEEL +5 -0
- lxml-5.3.2.dist-info/licenses/LICENSE.txt +29 -0
- lxml-5.3.2.dist-info/licenses/LICENSES.txt +29 -0
- lxml-5.3.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,86 @@
|
|
1
|
+
import optparse
|
2
|
+
import sys
|
3
|
+
import re
|
4
|
+
import os
|
5
|
+
from .diff import htmldiff
|
6
|
+
|
7
|
+
# Text shown under the usage line in ``--help`` output (currently empty).
description = """\
"""

# Command-line interface of the HTML diff tool.  The second usage line names
# the long option exactly as it is registered below (``--annotation``) so the
# documented invocation is one the parser actually accepts.
parser = optparse.OptionParser(
    usage="%prog [OPTIONS] FILE1 FILE2\n"
          "%prog --annotation [OPTIONS] INFO1 FILE1 INFO2 FILE2 ...",
    description=description,
    )

# Destination of the diff; the default "-" is handled by main() as stdout.
parser.add_option(
    '-o', '--output',
    metavar="FILE",
    dest="output",
    default="-",
    help="File to write the difference to",
    )

# Switches main() into annotation mode (options.annotation stays None when
# the flag is absent, because no default is given).
parser.add_option(
    '-a', '--annotation',
    action="store_true",
    dest="annotation",
    help="Do an annotation")
|
29
|
+
|
30
|
+
def main(args=None):
    """Diff two HTML files and write the marked-up result.

    Args:
        args: command-line arguments without the program name; defaults to
            ``sys.argv[1:]`` when ``None``.

    Returns:
        Whatever ``annotate()`` returns when the annotation option is set,
        otherwise ``None``.

    Exits with status 1 (after printing the help text) unless exactly two
    positional arguments are given.
    """
    if args is None:
        args = sys.argv[1:]
    options, args = parser.parse_args(args)
    if options.annotation:
        return annotate(options, args)
    if len(args) != 2:
        print('Error: you must give two files')
        parser.print_help()
        sys.exit(1)
    file1, file2 = args
    input1 = read_file(file1)
    input2 = read_file(file2)
    # Only the <body> content is diffed; the text surrounding the body of the
    # second file is kept and wrapped around the result.
    body1 = split_body(input1)[1]
    pre, body2, post = split_body(input2)
    result = pre + htmldiff(body1, body2) + post
    if options.output == '-':
        if not result.endswith('\n'):
            result += '\n'
        sys.stdout.write(result)
    else:
        # The output file is opened in binary mode, so text has to be encoded
        # first; writing a str to it would raise TypeError.
        if isinstance(result, str):
            result = result.encode('utf-8')
        with open(options.output, 'wb') as f:
            f.write(result)
|
54
|
+
|
55
|
+
def read_file(filename):
    """Return the text content of *filename*, or of standard input for ``'-'``.

    File content is read as bytes and decoded as UTF-8, so both branches
    return ``str``.  Returning raw bytes for files (while stdin yields text)
    made the result unusable with text regular expressions and with string
    concatenation.

    Args:
        filename: path of the file to read, or ``'-'`` for standard input.

    Returns:
        The content as ``str``.

    Raises:
        OSError: if *filename* does not exist.
        UnicodeDecodeError: if the file content is not valid UTF-8.
    """
    if filename == '-':
        return sys.stdin.read()
    if not os.path.exists(filename):
        raise OSError(
            "Input file %s does not exist" % filename)
    with open(filename, 'rb') as f:
        data = f.read()
    return data.decode('utf-8')
|
65
|
+
|
66
|
+
# Opening and closing <body> tags: case-insensitive, and "." also matches
# newlines so a tag spread over several lines is still found.
body_start_re = re.compile(
    r"<body.*?>", re.I|re.S)
body_end_re = re.compile(
    r"</body.*?>", re.I|re.S)


def split_body(html):
    """Split *html* into ``(pre, body, post)`` around its <body> element.

    ``pre`` is everything up to and including the first opening body tag,
    ``post`` starts at the first closing body tag found after it, and
    ``body`` is what lies in between.  A part whose tag is not found is
    returned as an empty string and the text stays in ``body``.
    """
    head = tail = ''
    opening = body_start_re.search(html)
    if opening is not None:
        cut = opening.end()
        head, html = html[:cut], html[cut:]
    closing = body_end_re.search(html)
    if closing is not None:
        cut = closing.start()
        html, tail = html[:cut], html[cut:]
    return head, html, tail
|
82
|
+
|
83
|
+
def annotate(options, args):
    """Placeholder for annotation mode: print a notice and exit with status 1.

    Both arguments are accepted but unused.
    """
    print("Not yet implemented")
    raise SystemExit(1)
|
86
|
+
|
@@ -0,0 +1,100 @@
|
|
1
|
+
"""
|
2
|
+
Legacy module - don't use in new code!
|
3
|
+
|
4
|
+
html5lib now has its own proper implementation.
|
5
|
+
|
6
|
+
This module implements a tree builder for html5lib that generates lxml
|
7
|
+
html element trees. This module uses camelCase as it follows the
|
8
|
+
html5lib style guide.
|
9
|
+
"""
|
10
|
+
|
11
|
+
from html5lib.treebuilders import _base, etree as etree_builders
|
12
|
+
from lxml import html, etree
|
13
|
+
|
14
|
+
|
15
|
+
class DocumentType:
    """Plain record holding the three parts of a doctype declaration."""

    def __init__(self, name, publicId, systemId):
        """Store *name*, *publicId* and *systemId* unchanged as attributes."""
        self.name, self.publicId, self.systemId = name, publicId, systemId
|
21
|
+
|
22
|
+
class Document:
    """Document wrapper handed to the tree builder.

    ``_elementTree`` starts out as ``None`` and has to be assigned before
    ``appendChild`` is used; ``childNodes`` starts as an empty list.
    """

    def __init__(self):
        self.childNodes = []
        self._elementTree = None

    def appendChild(self, element):
        """Insert the wrapped node of *element* as next sibling of the root."""
        root = self._elementTree.getroot()
        root.addnext(element._element)
|
30
|
+
|
31
|
+
|
32
|
+
class TreeBuilder(_base.TreeBuilder):
    """html5lib tree builder whose root element is an lxml.html tree.

    Comments and the doctype seen before the root element are buffered and
    only attached once ``insertRoot`` creates the document.
    """
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = None
    commentClass = None
    fragmentClass = Document

    def __init__(self, *args, **kwargs):
        """Pick the element/comment wrapper classes, then run the base setup."""
        html_builder = etree_builders.getETreeModule(html, fullTree=False)
        etree_builder = etree_builders.getETreeModule(etree, fullTree=False)
        self.elementClass = html_builder.Element
        self.commentClass = etree_builder.Comment
        _base.TreeBuilder.__init__(self, *args, **kwargs)

    def reset(self):
        """Reset the base builder and clear the pre-root bookkeeping."""
        _base.TreeBuilder.reset(self)
        self.rootInserted = False
        self.initialComments = []
        self.doctype = None

    def getDocument(self):
        """Return the element tree stored on the current document."""
        return self.document._elementTree

    def getFragment(self):
        """Return the content of the first open element as a flat list.

        The list holds the element's leading text (if any), its child
        nodes, and its tail text (if any), in that order.
        """
        fragment = []
        element = self.openElements[0]._element
        if element.text:
            fragment.append(element.text)
        # Iterating an element yields its children; this replaces the
        # deprecated getchildren() call.
        fragment.extend(element)
        if element.tail:
            fragment.append(element.tail)
        return fragment

    def insertDoctype(self, name, publicId, systemId):
        """Remember the doctype; it is serialised later by ``insertRoot``."""
        doctype = self.doctypeClass(name, publicId, systemId)
        self.doctype = doctype

    def insertComment(self, data, parent=None):
        """Buffer comments seen before the root, otherwise defer to the base."""
        if not self.rootInserted:
            self.initialComments.append(data)
        else:
            _base.TreeBuilder.insertComment(self, data, parent)

    def insertRoot(self, name):
        """Create the document from an ``<html>`` skeleton and open its root.

        The stored doctype (if it has a name) is prepended to the skeleton,
        the buffered comments are inserted before the root element, and the
        root wrapper is pushed onto ``openElements``.
        """
        buf = []
        if self.doctype and self.doctype.name:
            buf.append('<!DOCTYPE %s' % self.doctype.name)
            public_id = self.doctype.publicId
            system_id = self.doctype.systemId
            if public_id is not None or system_id is not None:
                # A missing identifier must serialise as an empty string;
                # formatting None directly would emit the literal text "None".
                buf.append(' PUBLIC "%s" "%s"' % (public_id or '',
                                                  system_id or ''))
            buf.append('>')
        buf.append('<html></html>')
        root = html.fromstring(''.join(buf))

        # Append the initial comments:
        for comment in self.initialComments:
            root.addprevious(etree.Comment(comment))

        # Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()

        # Add the root element to the internal child/open data structures
        root_element = self.elementClass(name)
        root_element._element = root
        self.document.childNodes.append(root_element)
        self.openElements.append(root_element)

        self.rootInserted = True
|
lxml/html/_setmixin.py
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
try:
|
2
|
+
from collections.abc import MutableSet
|
3
|
+
except ImportError:
|
4
|
+
from collections.abc import MutableSet
|
5
|
+
|
6
|
+
|
7
|
+
class SetMixin(MutableSet):

    """
    Mix-in that fills in the set API on top of three primitives.

    Subclasses must define ``__iter__``, ``add`` and ``remove``.  Operations
    that build a new collection return a plain built-in ``set``.
    """

    def __len__(self):
        """Count the members by iterating over them."""
        return sum(1 for _ in self)

    def __contains__(self, item):
        """Linear search comparing *item* with ``==`` against each member."""
        return any(item == member for member in self)

    # Named aliases for the comparison operators.
    issubset = MutableSet.__le__
    issuperset = MutableSet.__ge__

    # Named aliases for the binary set operators.
    union = MutableSet.__or__
    intersection = MutableSet.__and__
    difference = MutableSet.__sub__
    symmetric_difference = MutableSet.__xor__

    def copy(self):
        """Return the members as a new built-in ``set``."""
        return set(self)

    def update(self, other):
        """In-place union with *other*."""
        self |= other

    def intersection_update(self, other):
        """In-place intersection with *other*."""
        self &= other

    def difference_update(self, other):
        """In-place difference with *other*."""
        self -= other

    def symmetric_difference_update(self, other):
        """In-place symmetric difference with *other*."""
        self ^= other

    def discard(self, item):
        """Remove *item*, ignoring the ``KeyError`` raised when it is absent."""
        try:
            self.remove(item)
        except KeyError:
            return

    @classmethod
    def _from_iterable(cls, it):
        """Build results of the set operators as a built-in ``set``."""
        return set(it)
|
lxml/html/builder.py
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
# --------------------------------------------------------------------
|
2
|
+
# The ElementTree toolkit is
|
3
|
+
# Copyright (c) 1999-2004 by Fredrik Lundh
|
4
|
+
# --------------------------------------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
A set of HTML generator tags for building HTML documents.
|
8
|
+
|
9
|
+
Usage::
|
10
|
+
|
11
|
+
>>> from lxml.html.builder import *
|
12
|
+
>>> html = HTML(
|
13
|
+
... HEAD( TITLE("Hello World") ),
|
14
|
+
... BODY( CLASS("main"),
|
15
|
+
... H1("Hello World !")
|
16
|
+
... )
|
17
|
+
... )
|
18
|
+
|
19
|
+
>>> import lxml.etree
|
20
|
+
>>> print(lxml.etree.tostring(html, pretty_print=True, encoding='unicode'))
|
21
|
+
<html>
|
22
|
+
<head>
|
23
|
+
<title>Hello World</title>
|
24
|
+
</head>
|
25
|
+
<body class="main">
|
26
|
+
<h1>Hello World !</h1>
|
27
|
+
</body>
|
28
|
+
</html>
|
29
|
+
|
30
|
+
"""
|
31
|
+
|
32
|
+
from lxml.builder import ElementMaker
|
33
|
+
from lxml.html import html_parser
|
34
|
+
|
35
|
+
E = ElementMaker(makeelement=html_parser.makeelement)
|
36
|
+
|
37
|
+
# elements
|
38
|
+
A = E.a #: anchor
|
39
|
+
ABBR = E.abbr #: abbreviated form (e.g., WWW, HTTP, etc.)
|
40
|
+
ACRONYM = E.acronym #:
|
41
|
+
ADDRESS = E.address #: information on author
|
42
|
+
APPLET = E.applet #: Java applet (DEPRECATED)
|
43
|
+
AREA = E.area #: client-side image map area
|
44
|
+
B = E.b #: bold text style
|
45
|
+
BASE = E.base #: document base URI
|
46
|
+
BASEFONT = E.basefont #: base font size (DEPRECATED)
|
47
|
+
BDO = E.bdo #: I18N BiDi over-ride
|
48
|
+
BIG = E.big #: large text style
|
49
|
+
BLOCKQUOTE = E.blockquote #: long quotation
|
50
|
+
BODY = E.body #: document body
|
51
|
+
BR = E.br #: forced line break
|
52
|
+
BUTTON = E.button #: push button
|
53
|
+
CAPTION = E.caption #: table caption
|
54
|
+
CENTER = E.center #: shorthand for DIV align=center (DEPRECATED)
|
55
|
+
CITE = E.cite #: citation
|
56
|
+
CODE = E.code #: computer code fragment
|
57
|
+
COL = E.col #: table column
|
58
|
+
COLGROUP = E.colgroup #: table column group
|
59
|
+
DD = E.dd #: definition description
|
60
|
+
DEL = getattr(E, 'del') #: deleted text
|
61
|
+
DFN = E.dfn #: instance definition
|
62
|
+
DIR = E.dir #: directory list (DEPRECATED)
|
63
|
+
DIV = E.div #: generic language/style container
|
64
|
+
DL = E.dl #: definition list
|
65
|
+
DT = E.dt #: definition term
|
66
|
+
EM = E.em #: emphasis
|
67
|
+
FIELDSET = E.fieldset #: form control group
|
68
|
+
FONT = E.font #: local change to font (DEPRECATED)
|
69
|
+
FORM = E.form #: interactive form
|
70
|
+
FRAME = E.frame #: subwindow
|
71
|
+
FRAMESET = E.frameset #: window subdivision
|
72
|
+
H1 = E.h1 #: heading
|
73
|
+
H2 = E.h2 #: heading
|
74
|
+
H3 = E.h3 #: heading
|
75
|
+
H4 = E.h4 #: heading
|
76
|
+
H5 = E.h5 #: heading
|
77
|
+
H6 = E.h6 #: heading
|
78
|
+
HEAD = E.head #: document head
|
79
|
+
HR = E.hr #: horizontal rule
|
80
|
+
HTML = E.html #: document root element
|
81
|
+
I = E.i #: italic text style
|
82
|
+
IFRAME = E.iframe #: inline subwindow
|
83
|
+
IMG = E.img #: Embedded image
|
84
|
+
INPUT = E.input #: form control
|
85
|
+
INS = E.ins #: inserted text
|
86
|
+
ISINDEX = E.isindex #: single line prompt (DEPRECATED)
|
87
|
+
KBD = E.kbd #: text to be entered by the user
|
88
|
+
LABEL = E.label #: form field label text
|
89
|
+
LEGEND = E.legend #: fieldset legend
|
90
|
+
LI = E.li #: list item
|
91
|
+
LINK = E.link #: a media-independent link
|
92
|
+
MAP = E.map #: client-side image map
|
93
|
+
MENU = E.menu #: menu list (DEPRECATED)
|
94
|
+
META = E.meta #: generic metainformation
|
95
|
+
NOFRAMES = E.noframes #: alternate content container for non frame-based rendering
|
96
|
+
NOSCRIPT = E.noscript #: alternate content container for non script-based rendering
|
97
|
+
OBJECT = E.object #: generic embedded object
|
98
|
+
OL = E.ol #: ordered list
|
99
|
+
OPTGROUP = E.optgroup #: option group
|
100
|
+
OPTION = E.option #: selectable choice
|
101
|
+
P = E.p #: paragraph
|
102
|
+
PARAM = E.param #: named property value
|
103
|
+
PRE = E.pre #: preformatted text
|
104
|
+
Q = E.q #: short inline quotation
|
105
|
+
S = E.s #: strike-through text style (DEPRECATED)
|
106
|
+
SAMP = E.samp #: sample program output, scripts, etc.
|
107
|
+
SCRIPT = E.script #: script statements
|
108
|
+
SELECT = E.select #: option selector
|
109
|
+
SMALL = E.small #: small text style
|
110
|
+
SPAN = E.span #: generic language/style container
|
111
|
+
STRIKE = E.strike #: strike-through text (DEPRECATED)
|
112
|
+
STRONG = E.strong #: strong emphasis
|
113
|
+
STYLE = E.style #: style info
|
114
|
+
SUB = E.sub #: subscript
|
115
|
+
SUP = E.sup #: superscript
|
116
|
+
TABLE = E.table #:
|
117
|
+
TBODY = E.tbody #: table body
|
118
|
+
TD = E.td #: table data cell
|
119
|
+
TEXTAREA = E.textarea #: multi-line text field
|
120
|
+
TFOOT = E.tfoot #: table footer
|
121
|
+
TH = E.th #: table header cell
|
122
|
+
THEAD = E.thead #: table header
|
123
|
+
TITLE = E.title #: document title
|
124
|
+
TR = E.tr #: table row
|
125
|
+
TT = E.tt #: teletype or monospaced text style
|
126
|
+
U = E.u #: underlined text style (DEPRECATED)
|
127
|
+
UL = E.ul #: unordered list
|
128
|
+
VAR = E.var #: instance of a variable or program argument
|
129
|
+
|
130
|
+
# attributes (only reserved words are included here)
|
131
|
+
ATTR = dict
|
132
|
+
def CLASS(v):
    """Return a one-item dict mapping the key ``'class'`` to *v*."""
    return {'class': v}
|
133
|
+
def FOR(v):
    """Return a one-item dict mapping the key ``'for'`` to *v*."""
    return {'for': v}
|
lxml/html/clean.py
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# cython: language_level=3str
|
2
|
+
|
3
|
+
"""Backward-compatibility module for lxml_html_clean"""
|
4
|
+
|
5
|
+
# Re-export the cleaner API from the separately distributed package.  Only the
# import itself is guarded; the export list is set up once it has succeeded.
try:
    from lxml_html_clean import *
except ImportError:
    # "from None" hides the original traceback so that only the
    # installation hint is shown.
    raise ImportError(
        "lxml.html.clean module is now a separate project lxml_html_clean.\n"
        "Install lxml[html_clean] or lxml_html_clean directly."
    ) from None
else:
    __all__ = [
        "clean_html",
        "clean",
        "Cleaner",
        "autolink",
        "autolink_html",
        "word_break",
        "word_break_html",
    ]
|
lxml/html/defs.py
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
# FIXME: this should all be confirmed against what a DTD says
|
2
|
+
# (probably in a test; this may not match the DTD exactly, but we
|
3
|
+
# should document just how it differs).
|
4
|
+
|
5
|
+
"""
|
6
|
+
Data taken from https://www.w3.org/TR/html401/index/elements.html
|
7
|
+
and https://www.w3.org/community/webed/wiki/HTML/New_HTML5_Elements
|
8
|
+
for html5_tags.
|
9
|
+
"""
|
10
|
+
|
11
|
+
# Elements listed here as having no content / no end tag.
empty_tags = frozenset({
    'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
    'img', 'input', 'isindex', 'link', 'meta', 'param', 'source', 'track',
})

deprecated_tags = frozenset({
    'applet', 'basefont', 'center', 'dir', 'font', 'isindex',
    'menu', 's', 'strike', 'u',
})

# archive actually takes a space-separated list of URIs
link_attrs = frozenset({
    'action', 'archive', 'background', 'cite', 'classid',
    'codebase', 'data', 'href', 'longdesc', 'profile', 'src',
    'usemap',
    # Not standard:
    'dynsrc', 'lowsrc',
    # HTML5 formaction
    'formaction',
})

# Not in the HTML 4 spec:
# onerror, onresize
event_attrs = frozenset({
    'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
    'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
    'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
    'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit',
    'onunload',
})

safe_attrs = frozenset({
    'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
    'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
    'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan',
    'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
    'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id',
    'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
    'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
    'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
    'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
    'type', 'usemap', 'valign', 'value', 'vspace', 'width',
})

# From http://htmlhelp.com/reference/html40/olist.html
top_level_tags = frozenset({
    'html', 'head', 'body', 'frameset',
})

head_tags = frozenset({
    'base', 'isindex', 'link', 'meta', 'script', 'style', 'title',
})

general_block_tags = frozenset({
    'address',
    'blockquote',
    'center',
    'del',
    'div',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'ins',
    'isindex',
    'noscript',
    'p',
    'pre',
})

list_tags = frozenset({
    'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
})

table_tags = frozenset({
    'table', 'caption', 'colgroup', 'col',
    'thead', 'tfoot', 'tbody', 'tr', 'td', 'th',
})

# just this one from
# http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm
block_tags = general_block_tags | list_tags | table_tags | frozenset({
    # Partial form tags
    'fieldset', 'form', 'legend', 'optgroup', 'option',
})

form_tags = frozenset({
    'form', 'button', 'fieldset', 'legend', 'input', 'label',
    'select', 'optgroup', 'option', 'textarea',
})

special_inline_tags = frozenset({
    'a', 'applet', 'basefont', 'bdo', 'br', 'embed', 'font', 'iframe',
    'img', 'map', 'area', 'object', 'param', 'q', 'script',
    'span', 'sub', 'sup',
})

phrase_tags = frozenset({
    'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em',
    'ins', 'kbd', 'samp', 'strong', 'var',
})

font_style_tags = frozenset({
    'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u',
})

frame_tags = frozenset({
    'frameset', 'frame', 'noframes',
})

html5_tags = frozenset({
    'article', 'aside', 'audio', 'canvas', 'command', 'datalist',
    'details', 'embed', 'figcaption', 'figure', 'footer', 'header',
    'hgroup', 'keygen', 'mark', 'math', 'meter', 'nav', 'output',
    'progress', 'rp', 'rt', 'ruby', 'section', 'source', 'summary',
    'svg', 'time', 'track', 'video', 'wbr',
})

# These tags aren't standard
nonstandard_tags = frozenset({'blink', 'marquee'})


# Union of the tag groups above; frame_tags, deprecated_tags and empty_tags
# are not merged in as groups.
tags = (
    top_level_tags
    | head_tags
    | general_block_tags
    | list_tags
    | table_tags
    | form_tags
    | special_inline_tags
    | phrase_tags
    | font_style_tags
    | nonstandard_tags
    | html5_tags
)
|
Binary file
|