lxml 5.2.0__cp310-cp310-win32.whl → 5.2.2__cp310-cp310-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lxml/ElementInclude.py +244 -244
- lxml/__init__.py +22 -22
- lxml/_elementpath.cp310-win32.pyd +0 -0
- lxml/_elementpath.py +341 -341
- lxml/apihelpers.pxi +1793 -1793
- lxml/builder.cp310-win32.pyd +0 -0
- lxml/builder.py +232 -232
- lxml/classlookup.pxi +580 -580
- lxml/cleanup.pxi +215 -215
- lxml/cssselect.py +101 -101
- lxml/debug.pxi +90 -90
- lxml/docloader.pxi +178 -178
- lxml/doctestcompare.py +488 -488
- lxml/dtd.pxi +478 -478
- lxml/etree.cp310-win32.pyd +0 -0
- lxml/etree.h +6 -6
- lxml/etree.pyx +3732 -3711
- lxml/extensions.pxi +833 -833
- lxml/html/ElementSoup.py +10 -10
- lxml/html/__init__.py +1923 -1923
- lxml/html/_diffcommand.py +86 -86
- lxml/html/_html5builder.py +100 -100
- lxml/html/_setmixin.py +56 -56
- lxml/html/builder.py +133 -133
- lxml/html/clean.py +21 -21
- lxml/html/defs.py +135 -135
- lxml/html/diff.cp310-win32.pyd +0 -0
- lxml/html/diff.py +878 -878
- lxml/html/formfill.py +299 -299
- lxml/html/html5parser.py +260 -260
- lxml/html/soupparser.py +314 -314
- lxml/html/usedoctest.py +13 -13
- lxml/includes/c14n.pxd +25 -25
- lxml/includes/config.pxd +3 -3
- lxml/includes/dtdvalid.pxd +18 -18
- lxml/includes/etree_defs.h +379 -379
- lxml/includes/etreepublic.pxd +237 -237
- lxml/includes/htmlparser.pxd +56 -56
- lxml/includes/lxml-version.h +1 -1
- lxml/includes/relaxng.pxd +64 -64
- lxml/includes/schematron.pxd +34 -34
- lxml/includes/tree.pxd +494 -494
- lxml/includes/uri.pxd +5 -5
- lxml/includes/xinclude.pxd +22 -22
- lxml/includes/xmlerror.pxd +852 -852
- lxml/includes/xmlparser.pxd +265 -265
- lxml/includes/xmlschema.pxd +35 -35
- lxml/includes/xpath.pxd +136 -136
- lxml/includes/xslt.pxd +190 -190
- lxml/isoschematron/__init__.py +348 -348
- lxml/isoschematron/resources/rng/iso-schematron.rng +709 -709
- lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl +75 -75
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl +312 -312
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl +1159 -1159
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl +54 -54
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl +1796 -1796
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl +588 -588
- lxml/iterparse.pxi +438 -438
- lxml/lxml.etree.h +6 -6
- lxml/nsclasses.pxi +281 -281
- lxml/objectify.cp310-win32.pyd +0 -0
- lxml/objectify.pyx +2145 -2145
- lxml/objectpath.pxi +332 -332
- lxml/parser.pxi +1994 -1994
- lxml/parsertarget.pxi +180 -180
- lxml/proxy.pxi +619 -619
- lxml/public-api.pxi +178 -178
- lxml/pyclasslookup.py +3 -3
- lxml/readonlytree.pxi +565 -565
- lxml/relaxng.pxi +165 -165
- lxml/sax.cp310-win32.pyd +0 -0
- lxml/sax.py +275 -275
- lxml/saxparser.pxi +875 -875
- lxml/schematron.pxi +168 -168
- lxml/serializer.pxi +1871 -1871
- lxml/usedoctest.py +13 -13
- lxml/xinclude.pxi +67 -67
- lxml/xmlerror.pxi +1654 -1654
- lxml/xmlid.pxi +179 -179
- lxml/xmlschema.pxi +215 -215
- lxml/xpath.pxi +487 -487
- lxml/xslt.pxi +950 -950
- lxml/xsltext.pxi +242 -242
- {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/LICENSE.txt +29 -29
- {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/LICENSES.txt +29 -29
- {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/METADATA +9 -17
- {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/RECORD +89 -89
- {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/WHEEL +0 -0
- {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/top_level.txt +0 -0
lxml/html/builder.py
CHANGED
@@ -1,133 +1,133 @@
|
|
1
|
-
# --------------------------------------------------------------------
|
2
|
-
# The ElementTree toolkit is
|
3
|
-
# Copyright (c) 1999-2004 by Fredrik Lundh
|
4
|
-
# --------------------------------------------------------------------
|
5
|
-
|
6
|
-
"""
|
7
|
-
A set of HTML generator tags for building HTML documents.
|
8
|
-
|
9
|
-
Usage::
|
10
|
-
|
11
|
-
>>> from lxml.html.builder import *
|
12
|
-
>>> html = HTML(
|
13
|
-
... HEAD( TITLE("Hello World") ),
|
14
|
-
... BODY( CLASS("main"),
|
15
|
-
... H1("Hello World !")
|
16
|
-
... )
|
17
|
-
... )
|
18
|
-
|
19
|
-
>>> import lxml.etree
|
20
|
-
>>> print lxml.etree.tostring(html, pretty_print=True)
|
21
|
-
<html>
|
22
|
-
<head>
|
23
|
-
<title>Hello World</title>
|
24
|
-
</head>
|
25
|
-
<body class="main">
|
26
|
-
<h1>Hello World !</h1>
|
27
|
-
</body>
|
28
|
-
</html>
|
29
|
-
|
30
|
-
"""
|
31
|
-
|
32
|
-
from lxml.builder import ElementMaker
|
33
|
-
from lxml.html import html_parser
|
34
|
-
|
35
|
-
E = ElementMaker(makeelement=html_parser.makeelement)
|
36
|
-
|
37
|
-
# elements
|
38
|
-
A = E.a #: anchor
|
39
|
-
ABBR = E.abbr #: abbreviated form (e.g., WWW, HTTP, etc.)
|
40
|
-
ACRONYM = E.acronym #:
|
41
|
-
ADDRESS = E.address #: information on author
|
42
|
-
APPLET = E.applet #: Java applet (DEPRECATED)
|
43
|
-
AREA = E.area #: client-side image map area
|
44
|
-
B = E.b #: bold text style
|
45
|
-
BASE = E.base #: document base URI
|
46
|
-
BASEFONT = E.basefont #: base font size (DEPRECATED)
|
47
|
-
BDO = E.bdo #: I18N BiDi over-ride
|
48
|
-
BIG = E.big #: large text style
|
49
|
-
BLOCKQUOTE = E.blockquote #: long quotation
|
50
|
-
BODY = E.body #: document body
|
51
|
-
BR = E.br #: forced line break
|
52
|
-
BUTTON = E.button #: push button
|
53
|
-
CAPTION = E.caption #: table caption
|
54
|
-
CENTER = E.center #: shorthand for DIV align=center (DEPRECATED)
|
55
|
-
CITE = E.cite #: citation
|
56
|
-
CODE = E.code #: computer code fragment
|
57
|
-
COL = E.col #: table column
|
58
|
-
COLGROUP = E.colgroup #: table column group
|
59
|
-
DD = E.dd #: definition description
|
60
|
-
DEL = getattr(E, 'del') #: deleted text
|
61
|
-
DFN = E.dfn #: instance definition
|
62
|
-
DIR = E.dir #: directory list (DEPRECATED)
|
63
|
-
DIV = E.div #: generic language/style container
|
64
|
-
DL = E.dl #: definition list
|
65
|
-
DT = E.dt #: definition term
|
66
|
-
EM = E.em #: emphasis
|
67
|
-
FIELDSET = E.fieldset #: form control group
|
68
|
-
FONT = E.font #: local change to font (DEPRECATED)
|
69
|
-
FORM = E.form #: interactive form
|
70
|
-
FRAME = E.frame #: subwindow
|
71
|
-
FRAMESET = E.frameset #: window subdivision
|
72
|
-
H1 = E.h1 #: heading
|
73
|
-
H2 = E.h2 #: heading
|
74
|
-
H3 = E.h3 #: heading
|
75
|
-
H4 = E.h4 #: heading
|
76
|
-
H5 = E.h5 #: heading
|
77
|
-
H6 = E.h6 #: heading
|
78
|
-
HEAD = E.head #: document head
|
79
|
-
HR = E.hr #: horizontal rule
|
80
|
-
HTML = E.html #: document root element
|
81
|
-
I = E.i #: italic text style
|
82
|
-
IFRAME = E.iframe #: inline subwindow
|
83
|
-
IMG = E.img #: Embedded image
|
84
|
-
INPUT = E.input #: form control
|
85
|
-
INS = E.ins #: inserted text
|
86
|
-
ISINDEX = E.isindex #: single line prompt (DEPRECATED)
|
87
|
-
KBD = E.kbd #: text to be entered by the user
|
88
|
-
LABEL = E.label #: form field label text
|
89
|
-
LEGEND = E.legend #: fieldset legend
|
90
|
-
LI = E.li #: list item
|
91
|
-
LINK = E.link #: a media-independent link
|
92
|
-
MAP = E.map #: client-side image map
|
93
|
-
MENU = E.menu #: menu list (DEPRECATED)
|
94
|
-
META = E.meta #: generic metainformation
|
95
|
-
NOFRAMES = E.noframes #: alternate content container for non frame-based rendering
|
96
|
-
NOSCRIPT = E.noscript #: alternate content container for non script-based rendering
|
97
|
-
OBJECT = E.object #: generic embedded object
|
98
|
-
OL = E.ol #: ordered list
|
99
|
-
OPTGROUP = E.optgroup #: option group
|
100
|
-
OPTION = E.option #: selectable choice
|
101
|
-
P = E.p #: paragraph
|
102
|
-
PARAM = E.param #: named property value
|
103
|
-
PRE = E.pre #: preformatted text
|
104
|
-
Q = E.q #: short inline quotation
|
105
|
-
S = E.s #: strike-through text style (DEPRECATED)
|
106
|
-
SAMP = E.samp #: sample program output, scripts, etc.
|
107
|
-
SCRIPT = E.script #: script statements
|
108
|
-
SELECT = E.select #: option selector
|
109
|
-
SMALL = E.small #: small text style
|
110
|
-
SPAN = E.span #: generic language/style container
|
111
|
-
STRIKE = E.strike #: strike-through text (DEPRECATED)
|
112
|
-
STRONG = E.strong #: strong emphasis
|
113
|
-
STYLE = E.style #: style info
|
114
|
-
SUB = E.sub #: subscript
|
115
|
-
SUP = E.sup #: superscript
|
116
|
-
TABLE = E.table #:
|
117
|
-
TBODY = E.tbody #: table body
|
118
|
-
TD = E.td #: table data cell
|
119
|
-
TEXTAREA = E.textarea #: multi-line text field
|
120
|
-
TFOOT = E.tfoot #: table footer
|
121
|
-
TH = E.th #: table header cell
|
122
|
-
THEAD = E.thead #: table header
|
123
|
-
TITLE = E.title #: document title
|
124
|
-
TR = E.tr #: table row
|
125
|
-
TT = E.tt #: teletype or monospaced text style
|
126
|
-
U = E.u #: underlined text style (DEPRECATED)
|
127
|
-
UL = E.ul #: unordered list
|
128
|
-
VAR = E.var #: instance of a variable or program argument
|
129
|
-
|
130
|
-
# attributes (only reserved words are included here)
|
131
|
-
ATTR = dict
|
132
|
-
def CLASS(v): return {'class': v}
|
133
|
-
def FOR(v): return {'for': v}
|
1
|
+
# --------------------------------------------------------------------
|
2
|
+
# The ElementTree toolkit is
|
3
|
+
# Copyright (c) 1999-2004 by Fredrik Lundh
|
4
|
+
# --------------------------------------------------------------------
|
5
|
+
|
6
|
+
"""
|
7
|
+
A set of HTML generator tags for building HTML documents.
|
8
|
+
|
9
|
+
Usage::
|
10
|
+
|
11
|
+
>>> from lxml.html.builder import *
|
12
|
+
>>> html = HTML(
|
13
|
+
... HEAD( TITLE("Hello World") ),
|
14
|
+
... BODY( CLASS("main"),
|
15
|
+
... H1("Hello World !")
|
16
|
+
... )
|
17
|
+
... )
|
18
|
+
|
19
|
+
>>> import lxml.etree
|
20
|
+
>>> print lxml.etree.tostring(html, pretty_print=True)
|
21
|
+
<html>
|
22
|
+
<head>
|
23
|
+
<title>Hello World</title>
|
24
|
+
</head>
|
25
|
+
<body class="main">
|
26
|
+
<h1>Hello World !</h1>
|
27
|
+
</body>
|
28
|
+
</html>
|
29
|
+
|
30
|
+
"""
|
31
|
+
|
32
|
+
from lxml.builder import ElementMaker
|
33
|
+
from lxml.html import html_parser
|
34
|
+
|
35
|
+
E = ElementMaker(makeelement=html_parser.makeelement)
|
36
|
+
|
37
|
+
# elements
|
38
|
+
A = E.a #: anchor
|
39
|
+
ABBR = E.abbr #: abbreviated form (e.g., WWW, HTTP, etc.)
|
40
|
+
ACRONYM = E.acronym #:
|
41
|
+
ADDRESS = E.address #: information on author
|
42
|
+
APPLET = E.applet #: Java applet (DEPRECATED)
|
43
|
+
AREA = E.area #: client-side image map area
|
44
|
+
B = E.b #: bold text style
|
45
|
+
BASE = E.base #: document base URI
|
46
|
+
BASEFONT = E.basefont #: base font size (DEPRECATED)
|
47
|
+
BDO = E.bdo #: I18N BiDi over-ride
|
48
|
+
BIG = E.big #: large text style
|
49
|
+
BLOCKQUOTE = E.blockquote #: long quotation
|
50
|
+
BODY = E.body #: document body
|
51
|
+
BR = E.br #: forced line break
|
52
|
+
BUTTON = E.button #: push button
|
53
|
+
CAPTION = E.caption #: table caption
|
54
|
+
CENTER = E.center #: shorthand for DIV align=center (DEPRECATED)
|
55
|
+
CITE = E.cite #: citation
|
56
|
+
CODE = E.code #: computer code fragment
|
57
|
+
COL = E.col #: table column
|
58
|
+
COLGROUP = E.colgroup #: table column group
|
59
|
+
DD = E.dd #: definition description
|
60
|
+
DEL = getattr(E, 'del') #: deleted text
|
61
|
+
DFN = E.dfn #: instance definition
|
62
|
+
DIR = E.dir #: directory list (DEPRECATED)
|
63
|
+
DIV = E.div #: generic language/style container
|
64
|
+
DL = E.dl #: definition list
|
65
|
+
DT = E.dt #: definition term
|
66
|
+
EM = E.em #: emphasis
|
67
|
+
FIELDSET = E.fieldset #: form control group
|
68
|
+
FONT = E.font #: local change to font (DEPRECATED)
|
69
|
+
FORM = E.form #: interactive form
|
70
|
+
FRAME = E.frame #: subwindow
|
71
|
+
FRAMESET = E.frameset #: window subdivision
|
72
|
+
H1 = E.h1 #: heading
|
73
|
+
H2 = E.h2 #: heading
|
74
|
+
H3 = E.h3 #: heading
|
75
|
+
H4 = E.h4 #: heading
|
76
|
+
H5 = E.h5 #: heading
|
77
|
+
H6 = E.h6 #: heading
|
78
|
+
HEAD = E.head #: document head
|
79
|
+
HR = E.hr #: horizontal rule
|
80
|
+
HTML = E.html #: document root element
|
81
|
+
I = E.i #: italic text style
|
82
|
+
IFRAME = E.iframe #: inline subwindow
|
83
|
+
IMG = E.img #: Embedded image
|
84
|
+
INPUT = E.input #: form control
|
85
|
+
INS = E.ins #: inserted text
|
86
|
+
ISINDEX = E.isindex #: single line prompt (DEPRECATED)
|
87
|
+
KBD = E.kbd #: text to be entered by the user
|
88
|
+
LABEL = E.label #: form field label text
|
89
|
+
LEGEND = E.legend #: fieldset legend
|
90
|
+
LI = E.li #: list item
|
91
|
+
LINK = E.link #: a media-independent link
|
92
|
+
MAP = E.map #: client-side image map
|
93
|
+
MENU = E.menu #: menu list (DEPRECATED)
|
94
|
+
META = E.meta #: generic metainformation
|
95
|
+
NOFRAMES = E.noframes #: alternate content container for non frame-based rendering
|
96
|
+
NOSCRIPT = E.noscript #: alternate content container for non script-based rendering
|
97
|
+
OBJECT = E.object #: generic embedded object
|
98
|
+
OL = E.ol #: ordered list
|
99
|
+
OPTGROUP = E.optgroup #: option group
|
100
|
+
OPTION = E.option #: selectable choice
|
101
|
+
P = E.p #: paragraph
|
102
|
+
PARAM = E.param #: named property value
|
103
|
+
PRE = E.pre #: preformatted text
|
104
|
+
Q = E.q #: short inline quotation
|
105
|
+
S = E.s #: strike-through text style (DEPRECATED)
|
106
|
+
SAMP = E.samp #: sample program output, scripts, etc.
|
107
|
+
SCRIPT = E.script #: script statements
|
108
|
+
SELECT = E.select #: option selector
|
109
|
+
SMALL = E.small #: small text style
|
110
|
+
SPAN = E.span #: generic language/style container
|
111
|
+
STRIKE = E.strike #: strike-through text (DEPRECATED)
|
112
|
+
STRONG = E.strong #: strong emphasis
|
113
|
+
STYLE = E.style #: style info
|
114
|
+
SUB = E.sub #: subscript
|
115
|
+
SUP = E.sup #: superscript
|
116
|
+
TABLE = E.table #:
|
117
|
+
TBODY = E.tbody #: table body
|
118
|
+
TD = E.td #: table data cell
|
119
|
+
TEXTAREA = E.textarea #: multi-line text field
|
120
|
+
TFOOT = E.tfoot #: table footer
|
121
|
+
TH = E.th #: table header cell
|
122
|
+
THEAD = E.thead #: table header
|
123
|
+
TITLE = E.title #: document title
|
124
|
+
TR = E.tr #: table row
|
125
|
+
TT = E.tt #: teletype or monospaced text style
|
126
|
+
U = E.u #: underlined text style (DEPRECATED)
|
127
|
+
UL = E.ul #: unordered list
|
128
|
+
VAR = E.var #: instance of a variable or program argument
|
129
|
+
|
130
|
+
# attributes (only reserved words are included here)
|
131
|
+
ATTR = dict
|
132
|
+
def CLASS(v): return {'class': v}
|
133
|
+
def FOR(v): return {'for': v}
|
lxml/html/clean.py
CHANGED
@@ -1,21 +1,21 @@
|
|
1
|
-
# cython: language_level=3str
|
2
|
-
|
3
|
-
"""Backward-compatibility module for lxml_html_clean"""
|
4
|
-
|
5
|
-
try:
|
6
|
-
from lxml_html_clean import *
|
7
|
-
|
8
|
-
__all__ = [
|
9
|
-
"clean_html",
|
10
|
-
"clean",
|
11
|
-
"Cleaner",
|
12
|
-
"autolink",
|
13
|
-
"autolink_html",
|
14
|
-
"word_break",
|
15
|
-
"word_break_html",
|
16
|
-
]
|
17
|
-
except ImportError:
|
18
|
-
raise ImportError(
|
19
|
-
"lxml.html.clean module is now a separate project lxml_html_clean.\n"
|
20
|
-
"Install lxml[html_clean] or lxml_html_clean directly."
|
21
|
-
) from None
|
1
|
+
# cython: language_level=3str
|
2
|
+
|
3
|
+
"""Backward-compatibility module for lxml_html_clean"""
|
4
|
+
|
5
|
+
try:
|
6
|
+
from lxml_html_clean import *
|
7
|
+
|
8
|
+
__all__ = [
|
9
|
+
"clean_html",
|
10
|
+
"clean",
|
11
|
+
"Cleaner",
|
12
|
+
"autolink",
|
13
|
+
"autolink_html",
|
14
|
+
"word_break",
|
15
|
+
"word_break_html",
|
16
|
+
]
|
17
|
+
except ImportError:
|
18
|
+
raise ImportError(
|
19
|
+
"lxml.html.clean module is now a separate project lxml_html_clean.\n"
|
20
|
+
"Install lxml[html_clean] or lxml_html_clean directly."
|
21
|
+
) from None
|
lxml/html/defs.py
CHANGED
@@ -1,135 +1,135 @@
|
|
1
|
-
# FIXME: this should all be confirmed against what a DTD says
|
2
|
-
# (probably in a test; this may not match the DTD exactly, but we
|
3
|
-
# should document just how it differs).
|
4
|
-
|
5
|
-
"""
|
6
|
-
Data taken from https://www.w3.org/TR/html401/index/elements.html
|
7
|
-
and https://www.w3.org/community/webed/wiki/HTML/New_HTML5_Elements
|
8
|
-
for html5_tags.
|
9
|
-
"""
|
10
|
-
|
11
|
-
empty_tags = frozenset([
|
12
|
-
'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
|
13
|
-
'img', 'input', 'isindex', 'link', 'meta', 'param', 'source', 'track'])
|
14
|
-
|
15
|
-
deprecated_tags = frozenset([
|
16
|
-
'applet', 'basefont', 'center', 'dir', 'font', 'isindex',
|
17
|
-
'menu', 's', 'strike', 'u'])
|
18
|
-
|
19
|
-
# archive actually takes a space-separated list of URIs
|
20
|
-
link_attrs = frozenset([
|
21
|
-
'action', 'archive', 'background', 'cite', 'classid',
|
22
|
-
'codebase', 'data', 'href', 'longdesc', 'profile', 'src',
|
23
|
-
'usemap',
|
24
|
-
# Not standard:
|
25
|
-
'dynsrc', 'lowsrc',
|
26
|
-
# HTML5 formaction
|
27
|
-
'formaction'
|
28
|
-
])
|
29
|
-
|
30
|
-
# Not in the HTML 4 spec:
|
31
|
-
# onerror, onresize
|
32
|
-
event_attrs = frozenset([
|
33
|
-
'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
|
34
|
-
'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
|
35
|
-
'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
|
36
|
-
'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit',
|
37
|
-
'onunload',
|
38
|
-
])
|
39
|
-
|
40
|
-
safe_attrs = frozenset([
|
41
|
-
'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
|
42
|
-
'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
|
43
|
-
'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan',
|
44
|
-
'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
|
45
|
-
'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id',
|
46
|
-
'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
|
47
|
-
'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
|
48
|
-
'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
|
49
|
-
'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
|
50
|
-
'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
|
51
|
-
|
52
|
-
# From http://htmlhelp.com/reference/html40/olist.html
|
53
|
-
top_level_tags = frozenset([
|
54
|
-
'html', 'head', 'body', 'frameset',
|
55
|
-
])
|
56
|
-
|
57
|
-
head_tags = frozenset([
|
58
|
-
'base', 'isindex', 'link', 'meta', 'script', 'style', 'title',
|
59
|
-
])
|
60
|
-
|
61
|
-
general_block_tags = frozenset([
|
62
|
-
'address',
|
63
|
-
'blockquote',
|
64
|
-
'center',
|
65
|
-
'del',
|
66
|
-
'div',
|
67
|
-
'h1',
|
68
|
-
'h2',
|
69
|
-
'h3',
|
70
|
-
'h4',
|
71
|
-
'h5',
|
72
|
-
'h6',
|
73
|
-
'hr',
|
74
|
-
'ins',
|
75
|
-
'isindex',
|
76
|
-
'noscript',
|
77
|
-
'p',
|
78
|
-
'pre',
|
79
|
-
])
|
80
|
-
|
81
|
-
list_tags = frozenset([
|
82
|
-
'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
|
83
|
-
])
|
84
|
-
|
85
|
-
table_tags = frozenset([
|
86
|
-
'table', 'caption', 'colgroup', 'col',
|
87
|
-
'thead', 'tfoot', 'tbody', 'tr', 'td', 'th',
|
88
|
-
])
|
89
|
-
|
90
|
-
# just this one from
|
91
|
-
# http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm
|
92
|
-
block_tags = general_block_tags | list_tags | table_tags | frozenset([
|
93
|
-
# Partial form tags
|
94
|
-
'fieldset', 'form', 'legend', 'optgroup', 'option',
|
95
|
-
])
|
96
|
-
|
97
|
-
form_tags = frozenset([
|
98
|
-
'form', 'button', 'fieldset', 'legend', 'input', 'label',
|
99
|
-
'select', 'optgroup', 'option', 'textarea',
|
100
|
-
])
|
101
|
-
|
102
|
-
special_inline_tags = frozenset([
|
103
|
-
'a', 'applet', 'basefont', 'bdo', 'br', 'embed', 'font', 'iframe',
|
104
|
-
'img', 'map', 'area', 'object', 'param', 'q', 'script',
|
105
|
-
'span', 'sub', 'sup',
|
106
|
-
])
|
107
|
-
|
108
|
-
phrase_tags = frozenset([
|
109
|
-
'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em',
|
110
|
-
'ins', 'kbd', 'samp', 'strong', 'var',
|
111
|
-
])
|
112
|
-
|
113
|
-
font_style_tags = frozenset([
|
114
|
-
'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u',
|
115
|
-
])
|
116
|
-
|
117
|
-
frame_tags = frozenset([
|
118
|
-
'frameset', 'frame', 'noframes',
|
119
|
-
])
|
120
|
-
|
121
|
-
html5_tags = frozenset([
|
122
|
-
'article', 'aside', 'audio', 'canvas', 'command', 'datalist',
|
123
|
-
'details', 'embed', 'figcaption', 'figure', 'footer', 'header',
|
124
|
-
'hgroup', 'keygen', 'mark', 'math', 'meter', 'nav', 'output',
|
125
|
-
'progress', 'rp', 'rt', 'ruby', 'section', 'source', 'summary',
|
126
|
-
'svg', 'time', 'track', 'video', 'wbr'
|
127
|
-
])
|
128
|
-
|
129
|
-
# These tags aren't standard
|
130
|
-
nonstandard_tags = frozenset(['blink', 'marquee'])
|
131
|
-
|
132
|
-
|
133
|
-
tags = (top_level_tags | head_tags | general_block_tags | list_tags
|
134
|
-
| table_tags | form_tags | special_inline_tags | phrase_tags
|
135
|
-
| font_style_tags | nonstandard_tags | html5_tags)
|
1
|
+
# FIXME: this should all be confirmed against what a DTD says
|
2
|
+
# (probably in a test; this may not match the DTD exactly, but we
|
3
|
+
# should document just how it differs).
|
4
|
+
|
5
|
+
"""
|
6
|
+
Data taken from https://www.w3.org/TR/html401/index/elements.html
|
7
|
+
and https://www.w3.org/community/webed/wiki/HTML/New_HTML5_Elements
|
8
|
+
for html5_tags.
|
9
|
+
"""
|
10
|
+
|
11
|
+
empty_tags = frozenset([
|
12
|
+
'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
|
13
|
+
'img', 'input', 'isindex', 'link', 'meta', 'param', 'source', 'track'])
|
14
|
+
|
15
|
+
deprecated_tags = frozenset([
|
16
|
+
'applet', 'basefont', 'center', 'dir', 'font', 'isindex',
|
17
|
+
'menu', 's', 'strike', 'u'])
|
18
|
+
|
19
|
+
# archive actually takes a space-separated list of URIs
|
20
|
+
link_attrs = frozenset([
|
21
|
+
'action', 'archive', 'background', 'cite', 'classid',
|
22
|
+
'codebase', 'data', 'href', 'longdesc', 'profile', 'src',
|
23
|
+
'usemap',
|
24
|
+
# Not standard:
|
25
|
+
'dynsrc', 'lowsrc',
|
26
|
+
# HTML5 formaction
|
27
|
+
'formaction'
|
28
|
+
])
|
29
|
+
|
30
|
+
# Not in the HTML 4 spec:
|
31
|
+
# onerror, onresize
|
32
|
+
event_attrs = frozenset([
|
33
|
+
'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
|
34
|
+
'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
|
35
|
+
'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
|
36
|
+
'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit',
|
37
|
+
'onunload',
|
38
|
+
])
|
39
|
+
|
40
|
+
safe_attrs = frozenset([
|
41
|
+
'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
|
42
|
+
'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
|
43
|
+
'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan',
|
44
|
+
'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
|
45
|
+
'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id',
|
46
|
+
'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
|
47
|
+
'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
|
48
|
+
'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
|
49
|
+
'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
|
50
|
+
'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
|
51
|
+
|
52
|
+
# From http://htmlhelp.com/reference/html40/olist.html
|
53
|
+
top_level_tags = frozenset([
|
54
|
+
'html', 'head', 'body', 'frameset',
|
55
|
+
])
|
56
|
+
|
57
|
+
head_tags = frozenset([
|
58
|
+
'base', 'isindex', 'link', 'meta', 'script', 'style', 'title',
|
59
|
+
])
|
60
|
+
|
61
|
+
general_block_tags = frozenset([
|
62
|
+
'address',
|
63
|
+
'blockquote',
|
64
|
+
'center',
|
65
|
+
'del',
|
66
|
+
'div',
|
67
|
+
'h1',
|
68
|
+
'h2',
|
69
|
+
'h3',
|
70
|
+
'h4',
|
71
|
+
'h5',
|
72
|
+
'h6',
|
73
|
+
'hr',
|
74
|
+
'ins',
|
75
|
+
'isindex',
|
76
|
+
'noscript',
|
77
|
+
'p',
|
78
|
+
'pre',
|
79
|
+
])
|
80
|
+
|
81
|
+
list_tags = frozenset([
|
82
|
+
'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
|
83
|
+
])
|
84
|
+
|
85
|
+
table_tags = frozenset([
|
86
|
+
'table', 'caption', 'colgroup', 'col',
|
87
|
+
'thead', 'tfoot', 'tbody', 'tr', 'td', 'th',
|
88
|
+
])
|
89
|
+
|
90
|
+
# just this one from
|
91
|
+
# http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm
|
92
|
+
block_tags = general_block_tags | list_tags | table_tags | frozenset([
|
93
|
+
# Partial form tags
|
94
|
+
'fieldset', 'form', 'legend', 'optgroup', 'option',
|
95
|
+
])
|
96
|
+
|
97
|
+
form_tags = frozenset([
|
98
|
+
'form', 'button', 'fieldset', 'legend', 'input', 'label',
|
99
|
+
'select', 'optgroup', 'option', 'textarea',
|
100
|
+
])
|
101
|
+
|
102
|
+
special_inline_tags = frozenset([
|
103
|
+
'a', 'applet', 'basefont', 'bdo', 'br', 'embed', 'font', 'iframe',
|
104
|
+
'img', 'map', 'area', 'object', 'param', 'q', 'script',
|
105
|
+
'span', 'sub', 'sup',
|
106
|
+
])
|
107
|
+
|
108
|
+
phrase_tags = frozenset([
|
109
|
+
'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em',
|
110
|
+
'ins', 'kbd', 'samp', 'strong', 'var',
|
111
|
+
])
|
112
|
+
|
113
|
+
font_style_tags = frozenset([
|
114
|
+
'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u',
|
115
|
+
])
|
116
|
+
|
117
|
+
frame_tags = frozenset([
|
118
|
+
'frameset', 'frame', 'noframes',
|
119
|
+
])
|
120
|
+
|
121
|
+
html5_tags = frozenset([
|
122
|
+
'article', 'aside', 'audio', 'canvas', 'command', 'datalist',
|
123
|
+
'details', 'embed', 'figcaption', 'figure', 'footer', 'header',
|
124
|
+
'hgroup', 'keygen', 'mark', 'math', 'meter', 'nav', 'output',
|
125
|
+
'progress', 'rp', 'rt', 'ruby', 'section', 'source', 'summary',
|
126
|
+
'svg', 'time', 'track', 'video', 'wbr'
|
127
|
+
])
|
128
|
+
|
129
|
+
# These tags aren't standard
|
130
|
+
nonstandard_tags = frozenset(['blink', 'marquee'])
|
131
|
+
|
132
|
+
|
133
|
+
tags = (top_level_tags | head_tags | general_block_tags | list_tags
|
134
|
+
| table_tags | form_tags | special_inline_tags | phrase_tags
|
135
|
+
| font_style_tags | nonstandard_tags | html5_tags)
|
lxml/html/diff.cp310-win32.pyd
CHANGED
Binary file
|