lxml 5.2.0__cp310-cp310-win32.whl → 5.2.2__cp310-cp310-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. lxml/ElementInclude.py +244 -244
  2. lxml/__init__.py +22 -22
  3. lxml/_elementpath.cp310-win32.pyd +0 -0
  4. lxml/_elementpath.py +341 -341
  5. lxml/apihelpers.pxi +1793 -1793
  6. lxml/builder.cp310-win32.pyd +0 -0
  7. lxml/builder.py +232 -232
  8. lxml/classlookup.pxi +580 -580
  9. lxml/cleanup.pxi +215 -215
  10. lxml/cssselect.py +101 -101
  11. lxml/debug.pxi +90 -90
  12. lxml/docloader.pxi +178 -178
  13. lxml/doctestcompare.py +488 -488
  14. lxml/dtd.pxi +478 -478
  15. lxml/etree.cp310-win32.pyd +0 -0
  16. lxml/etree.h +6 -6
  17. lxml/etree.pyx +3732 -3711
  18. lxml/extensions.pxi +833 -833
  19. lxml/html/ElementSoup.py +10 -10
  20. lxml/html/__init__.py +1923 -1923
  21. lxml/html/_diffcommand.py +86 -86
  22. lxml/html/_html5builder.py +100 -100
  23. lxml/html/_setmixin.py +56 -56
  24. lxml/html/builder.py +133 -133
  25. lxml/html/clean.py +21 -21
  26. lxml/html/defs.py +135 -135
  27. lxml/html/diff.cp310-win32.pyd +0 -0
  28. lxml/html/diff.py +878 -878
  29. lxml/html/formfill.py +299 -299
  30. lxml/html/html5parser.py +260 -260
  31. lxml/html/soupparser.py +314 -314
  32. lxml/html/usedoctest.py +13 -13
  33. lxml/includes/c14n.pxd +25 -25
  34. lxml/includes/config.pxd +3 -3
  35. lxml/includes/dtdvalid.pxd +18 -18
  36. lxml/includes/etree_defs.h +379 -379
  37. lxml/includes/etreepublic.pxd +237 -237
  38. lxml/includes/htmlparser.pxd +56 -56
  39. lxml/includes/lxml-version.h +1 -1
  40. lxml/includes/relaxng.pxd +64 -64
  41. lxml/includes/schematron.pxd +34 -34
  42. lxml/includes/tree.pxd +494 -494
  43. lxml/includes/uri.pxd +5 -5
  44. lxml/includes/xinclude.pxd +22 -22
  45. lxml/includes/xmlerror.pxd +852 -852
  46. lxml/includes/xmlparser.pxd +265 -265
  47. lxml/includes/xmlschema.pxd +35 -35
  48. lxml/includes/xpath.pxd +136 -136
  49. lxml/includes/xslt.pxd +190 -190
  50. lxml/isoschematron/__init__.py +348 -348
  51. lxml/isoschematron/resources/rng/iso-schematron.rng +709 -709
  52. lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl +75 -75
  53. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl +312 -312
  54. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl +1159 -1159
  55. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl +54 -54
  56. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl +1796 -1796
  57. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl +588 -588
  58. lxml/iterparse.pxi +438 -438
  59. lxml/lxml.etree.h +6 -6
  60. lxml/nsclasses.pxi +281 -281
  61. lxml/objectify.cp310-win32.pyd +0 -0
  62. lxml/objectify.pyx +2145 -2145
  63. lxml/objectpath.pxi +332 -332
  64. lxml/parser.pxi +1994 -1994
  65. lxml/parsertarget.pxi +180 -180
  66. lxml/proxy.pxi +619 -619
  67. lxml/public-api.pxi +178 -178
  68. lxml/pyclasslookup.py +3 -3
  69. lxml/readonlytree.pxi +565 -565
  70. lxml/relaxng.pxi +165 -165
  71. lxml/sax.cp310-win32.pyd +0 -0
  72. lxml/sax.py +275 -275
  73. lxml/saxparser.pxi +875 -875
  74. lxml/schematron.pxi +168 -168
  75. lxml/serializer.pxi +1871 -1871
  76. lxml/usedoctest.py +13 -13
  77. lxml/xinclude.pxi +67 -67
  78. lxml/xmlerror.pxi +1654 -1654
  79. lxml/xmlid.pxi +179 -179
  80. lxml/xmlschema.pxi +215 -215
  81. lxml/xpath.pxi +487 -487
  82. lxml/xslt.pxi +950 -950
  83. lxml/xsltext.pxi +242 -242
  84. {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/LICENSE.txt +29 -29
  85. {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/LICENSES.txt +29 -29
  86. {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/METADATA +9 -17
  87. {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/RECORD +89 -89
  88. {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/WHEEL +0 -0
  89. {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/top_level.txt +0 -0
lxml/html/builder.py CHANGED
@@ -1,133 +1,133 @@
1
- # --------------------------------------------------------------------
2
- # The ElementTree toolkit is
3
- # Copyright (c) 1999-2004 by Fredrik Lundh
4
- # --------------------------------------------------------------------
5
-
6
- """
7
- A set of HTML generator tags for building HTML documents.
8
-
9
- Usage::
10
-
11
- >>> from lxml.html.builder import *
12
- >>> html = HTML(
13
- ... HEAD( TITLE("Hello World") ),
14
- ... BODY( CLASS("main"),
15
- ... H1("Hello World !")
16
- ... )
17
- ... )
18
-
19
- >>> import lxml.etree
20
- >>> print lxml.etree.tostring(html, pretty_print=True)
21
- <html>
22
- <head>
23
- <title>Hello World</title>
24
- </head>
25
- <body class="main">
26
- <h1>Hello World !</h1>
27
- </body>
28
- </html>
29
-
30
- """
31
-
32
- from lxml.builder import ElementMaker
33
- from lxml.html import html_parser
34
-
35
- E = ElementMaker(makeelement=html_parser.makeelement)
36
-
37
- # elements
38
- A = E.a #: anchor
39
- ABBR = E.abbr #: abbreviated form (e.g., WWW, HTTP, etc.)
40
- ACRONYM = E.acronym #:
41
- ADDRESS = E.address #: information on author
42
- APPLET = E.applet #: Java applet (DEPRECATED)
43
- AREA = E.area #: client-side image map area
44
- B = E.b #: bold text style
45
- BASE = E.base #: document base URI
46
- BASEFONT = E.basefont #: base font size (DEPRECATED)
47
- BDO = E.bdo #: I18N BiDi over-ride
48
- BIG = E.big #: large text style
49
- BLOCKQUOTE = E.blockquote #: long quotation
50
- BODY = E.body #: document body
51
- BR = E.br #: forced line break
52
- BUTTON = E.button #: push button
53
- CAPTION = E.caption #: table caption
54
- CENTER = E.center #: shorthand for DIV align=center (DEPRECATED)
55
- CITE = E.cite #: citation
56
- CODE = E.code #: computer code fragment
57
- COL = E.col #: table column
58
- COLGROUP = E.colgroup #: table column group
59
- DD = E.dd #: definition description
60
- DEL = getattr(E, 'del') #: deleted text
61
- DFN = E.dfn #: instance definition
62
- DIR = E.dir #: directory list (DEPRECATED)
63
- DIV = E.div #: generic language/style container
64
- DL = E.dl #: definition list
65
- DT = E.dt #: definition term
66
- EM = E.em #: emphasis
67
- FIELDSET = E.fieldset #: form control group
68
- FONT = E.font #: local change to font (DEPRECATED)
69
- FORM = E.form #: interactive form
70
- FRAME = E.frame #: subwindow
71
- FRAMESET = E.frameset #: window subdivision
72
- H1 = E.h1 #: heading
73
- H2 = E.h2 #: heading
74
- H3 = E.h3 #: heading
75
- H4 = E.h4 #: heading
76
- H5 = E.h5 #: heading
77
- H6 = E.h6 #: heading
78
- HEAD = E.head #: document head
79
- HR = E.hr #: horizontal rule
80
- HTML = E.html #: document root element
81
- I = E.i #: italic text style
82
- IFRAME = E.iframe #: inline subwindow
83
- IMG = E.img #: Embedded image
84
- INPUT = E.input #: form control
85
- INS = E.ins #: inserted text
86
- ISINDEX = E.isindex #: single line prompt (DEPRECATED)
87
- KBD = E.kbd #: text to be entered by the user
88
- LABEL = E.label #: form field label text
89
- LEGEND = E.legend #: fieldset legend
90
- LI = E.li #: list item
91
- LINK = E.link #: a media-independent link
92
- MAP = E.map #: client-side image map
93
- MENU = E.menu #: menu list (DEPRECATED)
94
- META = E.meta #: generic metainformation
95
- NOFRAMES = E.noframes #: alternate content container for non frame-based rendering
96
- NOSCRIPT = E.noscript #: alternate content container for non script-based rendering
97
- OBJECT = E.object #: generic embedded object
98
- OL = E.ol #: ordered list
99
- OPTGROUP = E.optgroup #: option group
100
- OPTION = E.option #: selectable choice
101
- P = E.p #: paragraph
102
- PARAM = E.param #: named property value
103
- PRE = E.pre #: preformatted text
104
- Q = E.q #: short inline quotation
105
- S = E.s #: strike-through text style (DEPRECATED)
106
- SAMP = E.samp #: sample program output, scripts, etc.
107
- SCRIPT = E.script #: script statements
108
- SELECT = E.select #: option selector
109
- SMALL = E.small #: small text style
110
- SPAN = E.span #: generic language/style container
111
- STRIKE = E.strike #: strike-through text (DEPRECATED)
112
- STRONG = E.strong #: strong emphasis
113
- STYLE = E.style #: style info
114
- SUB = E.sub #: subscript
115
- SUP = E.sup #: superscript
116
- TABLE = E.table #:
117
- TBODY = E.tbody #: table body
118
- TD = E.td #: table data cell
119
- TEXTAREA = E.textarea #: multi-line text field
120
- TFOOT = E.tfoot #: table footer
121
- TH = E.th #: table header cell
122
- THEAD = E.thead #: table header
123
- TITLE = E.title #: document title
124
- TR = E.tr #: table row
125
- TT = E.tt #: teletype or monospaced text style
126
- U = E.u #: underlined text style (DEPRECATED)
127
- UL = E.ul #: unordered list
128
- VAR = E.var #: instance of a variable or program argument
129
-
130
- # attributes (only reserved words are included here)
131
- ATTR = dict
132
- def CLASS(v): return {'class': v}
133
- def FOR(v): return {'for': v}
1
+ # --------------------------------------------------------------------
2
+ # The ElementTree toolkit is
3
+ # Copyright (c) 1999-2004 by Fredrik Lundh
4
+ # --------------------------------------------------------------------
5
+
6
+ """
7
+ A set of HTML generator tags for building HTML documents.
8
+
9
+ Usage::
10
+
11
+ >>> from lxml.html.builder import *
12
+ >>> html = HTML(
13
+ ... HEAD( TITLE("Hello World") ),
14
+ ... BODY( CLASS("main"),
15
+ ... H1("Hello World !")
16
+ ... )
17
+ ... )
18
+
19
+ >>> import lxml.etree
20
+ >>> print lxml.etree.tostring(html, pretty_print=True)
21
+ <html>
22
+ <head>
23
+ <title>Hello World</title>
24
+ </head>
25
+ <body class="main">
26
+ <h1>Hello World !</h1>
27
+ </body>
28
+ </html>
29
+
30
+ """
31
+
32
+ from lxml.builder import ElementMaker
33
+ from lxml.html import html_parser
34
+
35
+ E = ElementMaker(makeelement=html_parser.makeelement)
36
+
37
+ # elements
38
+ A = E.a #: anchor
39
+ ABBR = E.abbr #: abbreviated form (e.g., WWW, HTTP, etc.)
40
+ ACRONYM = E.acronym #:
41
+ ADDRESS = E.address #: information on author
42
+ APPLET = E.applet #: Java applet (DEPRECATED)
43
+ AREA = E.area #: client-side image map area
44
+ B = E.b #: bold text style
45
+ BASE = E.base #: document base URI
46
+ BASEFONT = E.basefont #: base font size (DEPRECATED)
47
+ BDO = E.bdo #: I18N BiDi over-ride
48
+ BIG = E.big #: large text style
49
+ BLOCKQUOTE = E.blockquote #: long quotation
50
+ BODY = E.body #: document body
51
+ BR = E.br #: forced line break
52
+ BUTTON = E.button #: push button
53
+ CAPTION = E.caption #: table caption
54
+ CENTER = E.center #: shorthand for DIV align=center (DEPRECATED)
55
+ CITE = E.cite #: citation
56
+ CODE = E.code #: computer code fragment
57
+ COL = E.col #: table column
58
+ COLGROUP = E.colgroup #: table column group
59
+ DD = E.dd #: definition description
60
+ DEL = getattr(E, 'del') #: deleted text
61
+ DFN = E.dfn #: instance definition
62
+ DIR = E.dir #: directory list (DEPRECATED)
63
+ DIV = E.div #: generic language/style container
64
+ DL = E.dl #: definition list
65
+ DT = E.dt #: definition term
66
+ EM = E.em #: emphasis
67
+ FIELDSET = E.fieldset #: form control group
68
+ FONT = E.font #: local change to font (DEPRECATED)
69
+ FORM = E.form #: interactive form
70
+ FRAME = E.frame #: subwindow
71
+ FRAMESET = E.frameset #: window subdivision
72
+ H1 = E.h1 #: heading
73
+ H2 = E.h2 #: heading
74
+ H3 = E.h3 #: heading
75
+ H4 = E.h4 #: heading
76
+ H5 = E.h5 #: heading
77
+ H6 = E.h6 #: heading
78
+ HEAD = E.head #: document head
79
+ HR = E.hr #: horizontal rule
80
+ HTML = E.html #: document root element
81
+ I = E.i #: italic text style
82
+ IFRAME = E.iframe #: inline subwindow
83
+ IMG = E.img #: Embedded image
84
+ INPUT = E.input #: form control
85
+ INS = E.ins #: inserted text
86
+ ISINDEX = E.isindex #: single line prompt (DEPRECATED)
87
+ KBD = E.kbd #: text to be entered by the user
88
+ LABEL = E.label #: form field label text
89
+ LEGEND = E.legend #: fieldset legend
90
+ LI = E.li #: list item
91
+ LINK = E.link #: a media-independent link
92
+ MAP = E.map #: client-side image map
93
+ MENU = E.menu #: menu list (DEPRECATED)
94
+ META = E.meta #: generic metainformation
95
+ NOFRAMES = E.noframes #: alternate content container for non frame-based rendering
96
+ NOSCRIPT = E.noscript #: alternate content container for non script-based rendering
97
+ OBJECT = E.object #: generic embedded object
98
+ OL = E.ol #: ordered list
99
+ OPTGROUP = E.optgroup #: option group
100
+ OPTION = E.option #: selectable choice
101
+ P = E.p #: paragraph
102
+ PARAM = E.param #: named property value
103
+ PRE = E.pre #: preformatted text
104
+ Q = E.q #: short inline quotation
105
+ S = E.s #: strike-through text style (DEPRECATED)
106
+ SAMP = E.samp #: sample program output, scripts, etc.
107
+ SCRIPT = E.script #: script statements
108
+ SELECT = E.select #: option selector
109
+ SMALL = E.small #: small text style
110
+ SPAN = E.span #: generic language/style container
111
+ STRIKE = E.strike #: strike-through text (DEPRECATED)
112
+ STRONG = E.strong #: strong emphasis
113
+ STYLE = E.style #: style info
114
+ SUB = E.sub #: subscript
115
+ SUP = E.sup #: superscript
116
+ TABLE = E.table #:
117
+ TBODY = E.tbody #: table body
118
+ TD = E.td #: table data cell
119
+ TEXTAREA = E.textarea #: multi-line text field
120
+ TFOOT = E.tfoot #: table footer
121
+ TH = E.th #: table header cell
122
+ THEAD = E.thead #: table header
123
+ TITLE = E.title #: document title
124
+ TR = E.tr #: table row
125
+ TT = E.tt #: teletype or monospaced text style
126
+ U = E.u #: underlined text style (DEPRECATED)
127
+ UL = E.ul #: unordered list
128
+ VAR = E.var #: instance of a variable or program argument
129
+
130
+ # attributes (only reserved words are included here)
131
+ ATTR = dict
132
+ def CLASS(v): return {'class': v}
133
+ def FOR(v): return {'for': v}
lxml/html/clean.py CHANGED
@@ -1,21 +1,21 @@
1
- # cython: language_level=3str
2
-
3
- """Backward-compatibility module for lxml_html_clean"""
4
-
5
- try:
6
- from lxml_html_clean import *
7
-
8
- __all__ = [
9
- "clean_html",
10
- "clean",
11
- "Cleaner",
12
- "autolink",
13
- "autolink_html",
14
- "word_break",
15
- "word_break_html",
16
- ]
17
- except ImportError:
18
- raise ImportError(
19
- "lxml.html.clean module is now a separate project lxml_html_clean.\n"
20
- "Install lxml[html_clean] or lxml_html_clean directly."
21
- ) from None
1
+ # cython: language_level=3str
2
+
3
+ """Backward-compatibility module for lxml_html_clean"""
4
+
5
+ try:
6
+ from lxml_html_clean import *
7
+
8
+ __all__ = [
9
+ "clean_html",
10
+ "clean",
11
+ "Cleaner",
12
+ "autolink",
13
+ "autolink_html",
14
+ "word_break",
15
+ "word_break_html",
16
+ ]
17
+ except ImportError:
18
+ raise ImportError(
19
+ "lxml.html.clean module is now a separate project lxml_html_clean.\n"
20
+ "Install lxml[html_clean] or lxml_html_clean directly."
21
+ ) from None
lxml/html/defs.py CHANGED
@@ -1,135 +1,135 @@
1
- # FIXME: this should all be confirmed against what a DTD says
2
- # (probably in a test; this may not match the DTD exactly, but we
3
- # should document just how it differs).
4
-
5
- """
6
- Data taken from https://www.w3.org/TR/html401/index/elements.html
7
- and https://www.w3.org/community/webed/wiki/HTML/New_HTML5_Elements
8
- for html5_tags.
9
- """
10
-
11
- empty_tags = frozenset([
12
- 'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
13
- 'img', 'input', 'isindex', 'link', 'meta', 'param', 'source', 'track'])
14
-
15
- deprecated_tags = frozenset([
16
- 'applet', 'basefont', 'center', 'dir', 'font', 'isindex',
17
- 'menu', 's', 'strike', 'u'])
18
-
19
- # archive actually takes a space-separated list of URIs
20
- link_attrs = frozenset([
21
- 'action', 'archive', 'background', 'cite', 'classid',
22
- 'codebase', 'data', 'href', 'longdesc', 'profile', 'src',
23
- 'usemap',
24
- # Not standard:
25
- 'dynsrc', 'lowsrc',
26
- # HTML5 formaction
27
- 'formaction'
28
- ])
29
-
30
- # Not in the HTML 4 spec:
31
- # onerror, onresize
32
- event_attrs = frozenset([
33
- 'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
34
- 'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
35
- 'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
36
- 'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit',
37
- 'onunload',
38
- ])
39
-
40
- safe_attrs = frozenset([
41
- 'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
42
- 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
43
- 'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan',
44
- 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
45
- 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id',
46
- 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
47
- 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
48
- 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
49
- 'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
50
- 'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
51
-
52
- # From http://htmlhelp.com/reference/html40/olist.html
53
- top_level_tags = frozenset([
54
- 'html', 'head', 'body', 'frameset',
55
- ])
56
-
57
- head_tags = frozenset([
58
- 'base', 'isindex', 'link', 'meta', 'script', 'style', 'title',
59
- ])
60
-
61
- general_block_tags = frozenset([
62
- 'address',
63
- 'blockquote',
64
- 'center',
65
- 'del',
66
- 'div',
67
- 'h1',
68
- 'h2',
69
- 'h3',
70
- 'h4',
71
- 'h5',
72
- 'h6',
73
- 'hr',
74
- 'ins',
75
- 'isindex',
76
- 'noscript',
77
- 'p',
78
- 'pre',
79
- ])
80
-
81
- list_tags = frozenset([
82
- 'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
83
- ])
84
-
85
- table_tags = frozenset([
86
- 'table', 'caption', 'colgroup', 'col',
87
- 'thead', 'tfoot', 'tbody', 'tr', 'td', 'th',
88
- ])
89
-
90
- # just this one from
91
- # http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm
92
- block_tags = general_block_tags | list_tags | table_tags | frozenset([
93
- # Partial form tags
94
- 'fieldset', 'form', 'legend', 'optgroup', 'option',
95
- ])
96
-
97
- form_tags = frozenset([
98
- 'form', 'button', 'fieldset', 'legend', 'input', 'label',
99
- 'select', 'optgroup', 'option', 'textarea',
100
- ])
101
-
102
- special_inline_tags = frozenset([
103
- 'a', 'applet', 'basefont', 'bdo', 'br', 'embed', 'font', 'iframe',
104
- 'img', 'map', 'area', 'object', 'param', 'q', 'script',
105
- 'span', 'sub', 'sup',
106
- ])
107
-
108
- phrase_tags = frozenset([
109
- 'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em',
110
- 'ins', 'kbd', 'samp', 'strong', 'var',
111
- ])
112
-
113
- font_style_tags = frozenset([
114
- 'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u',
115
- ])
116
-
117
- frame_tags = frozenset([
118
- 'frameset', 'frame', 'noframes',
119
- ])
120
-
121
- html5_tags = frozenset([
122
- 'article', 'aside', 'audio', 'canvas', 'command', 'datalist',
123
- 'details', 'embed', 'figcaption', 'figure', 'footer', 'header',
124
- 'hgroup', 'keygen', 'mark', 'math', 'meter', 'nav', 'output',
125
- 'progress', 'rp', 'rt', 'ruby', 'section', 'source', 'summary',
126
- 'svg', 'time', 'track', 'video', 'wbr'
127
- ])
128
-
129
- # These tags aren't standard
130
- nonstandard_tags = frozenset(['blink', 'marquee'])
131
-
132
-
133
- tags = (top_level_tags | head_tags | general_block_tags | list_tags
134
- | table_tags | form_tags | special_inline_tags | phrase_tags
135
- | font_style_tags | nonstandard_tags | html5_tags)
1
+ # FIXME: this should all be confirmed against what a DTD says
2
+ # (probably in a test; this may not match the DTD exactly, but we
3
+ # should document just how it differs).
4
+
5
+ """
6
+ Data taken from https://www.w3.org/TR/html401/index/elements.html
7
+ and https://www.w3.org/community/webed/wiki/HTML/New_HTML5_Elements
8
+ for html5_tags.
9
+ """
10
+
11
+ empty_tags = frozenset([
12
+ 'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
13
+ 'img', 'input', 'isindex', 'link', 'meta', 'param', 'source', 'track'])
14
+
15
+ deprecated_tags = frozenset([
16
+ 'applet', 'basefont', 'center', 'dir', 'font', 'isindex',
17
+ 'menu', 's', 'strike', 'u'])
18
+
19
+ # archive actually takes a space-separated list of URIs
20
+ link_attrs = frozenset([
21
+ 'action', 'archive', 'background', 'cite', 'classid',
22
+ 'codebase', 'data', 'href', 'longdesc', 'profile', 'src',
23
+ 'usemap',
24
+ # Not standard:
25
+ 'dynsrc', 'lowsrc',
26
+ # HTML5 formaction
27
+ 'formaction'
28
+ ])
29
+
30
+ # Not in the HTML 4 spec:
31
+ # onerror, onresize
32
+ event_attrs = frozenset([
33
+ 'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
34
+ 'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
35
+ 'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
36
+ 'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit',
37
+ 'onunload',
38
+ ])
39
+
40
+ safe_attrs = frozenset([
41
+ 'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
42
+ 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
43
+ 'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan',
44
+ 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
45
+ 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id',
46
+ 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
47
+ 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
48
+ 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
49
+ 'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
50
+ 'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
51
+
52
+ # From http://htmlhelp.com/reference/html40/olist.html
53
+ top_level_tags = frozenset([
54
+ 'html', 'head', 'body', 'frameset',
55
+ ])
56
+
57
+ head_tags = frozenset([
58
+ 'base', 'isindex', 'link', 'meta', 'script', 'style', 'title',
59
+ ])
60
+
61
+ general_block_tags = frozenset([
62
+ 'address',
63
+ 'blockquote',
64
+ 'center',
65
+ 'del',
66
+ 'div',
67
+ 'h1',
68
+ 'h2',
69
+ 'h3',
70
+ 'h4',
71
+ 'h5',
72
+ 'h6',
73
+ 'hr',
74
+ 'ins',
75
+ 'isindex',
76
+ 'noscript',
77
+ 'p',
78
+ 'pre',
79
+ ])
80
+
81
+ list_tags = frozenset([
82
+ 'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
83
+ ])
84
+
85
+ table_tags = frozenset([
86
+ 'table', 'caption', 'colgroup', 'col',
87
+ 'thead', 'tfoot', 'tbody', 'tr', 'td', 'th',
88
+ ])
89
+
90
+ # just this one from
91
+ # http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm
92
+ block_tags = general_block_tags | list_tags | table_tags | frozenset([
93
+ # Partial form tags
94
+ 'fieldset', 'form', 'legend', 'optgroup', 'option',
95
+ ])
96
+
97
+ form_tags = frozenset([
98
+ 'form', 'button', 'fieldset', 'legend', 'input', 'label',
99
+ 'select', 'optgroup', 'option', 'textarea',
100
+ ])
101
+
102
+ special_inline_tags = frozenset([
103
+ 'a', 'applet', 'basefont', 'bdo', 'br', 'embed', 'font', 'iframe',
104
+ 'img', 'map', 'area', 'object', 'param', 'q', 'script',
105
+ 'span', 'sub', 'sup',
106
+ ])
107
+
108
+ phrase_tags = frozenset([
109
+ 'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em',
110
+ 'ins', 'kbd', 'samp', 'strong', 'var',
111
+ ])
112
+
113
+ font_style_tags = frozenset([
114
+ 'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u',
115
+ ])
116
+
117
+ frame_tags = frozenset([
118
+ 'frameset', 'frame', 'noframes',
119
+ ])
120
+
121
+ html5_tags = frozenset([
122
+ 'article', 'aside', 'audio', 'canvas', 'command', 'datalist',
123
+ 'details', 'embed', 'figcaption', 'figure', 'footer', 'header',
124
+ 'hgroup', 'keygen', 'mark', 'math', 'meter', 'nav', 'output',
125
+ 'progress', 'rp', 'rt', 'ruby', 'section', 'source', 'summary',
126
+ 'svg', 'time', 'track', 'video', 'wbr'
127
+ ])
128
+
129
+ # These tags aren't standard
130
+ nonstandard_tags = frozenset(['blink', 'marquee'])
131
+
132
+
133
+ tags = (top_level_tags | head_tags | general_block_tags | list_tags
134
+ | table_tags | form_tags | special_inline_tags | phrase_tags
135
+ | font_style_tags | nonstandard_tags | html5_tags)
Binary file