lxml 6.0.0__cp310-cp310-manylinux_2_31_armv7l.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. lxml/ElementInclude.py +244 -0
  2. lxml/__init__.py +22 -0
  3. lxml/_elementpath.cpython-310-arm-linux-gnueabihf.so +0 -0
  4. lxml/_elementpath.py +343 -0
  5. lxml/apihelpers.pxi +1801 -0
  6. lxml/builder.cpython-310-arm-linux-gnueabihf.so +0 -0
  7. lxml/builder.py +243 -0
  8. lxml/classlookup.pxi +580 -0
  9. lxml/cleanup.pxi +215 -0
  10. lxml/cssselect.py +101 -0
  11. lxml/debug.pxi +36 -0
  12. lxml/docloader.pxi +178 -0
  13. lxml/doctestcompare.py +488 -0
  14. lxml/dtd.pxi +479 -0
  15. lxml/etree.cpython-310-arm-linux-gnueabihf.so +0 -0
  16. lxml/etree.h +244 -0
  17. lxml/etree.pyx +3853 -0
  18. lxml/etree_api.h +204 -0
  19. lxml/extensions.pxi +830 -0
  20. lxml/html/ElementSoup.py +10 -0
  21. lxml/html/__init__.py +1927 -0
  22. lxml/html/_diffcommand.py +86 -0
  23. lxml/html/_difflib.cpython-310-arm-linux-gnueabihf.so +0 -0
  24. lxml/html/_difflib.py +2106 -0
  25. lxml/html/_html5builder.py +100 -0
  26. lxml/html/_setmixin.py +56 -0
  27. lxml/html/builder.py +173 -0
  28. lxml/html/clean.py +21 -0
  29. lxml/html/defs.py +135 -0
  30. lxml/html/diff.cpython-310-arm-linux-gnueabihf.so +0 -0
  31. lxml/html/diff.py +972 -0
  32. lxml/html/formfill.py +299 -0
  33. lxml/html/html5parser.py +260 -0
  34. lxml/html/soupparser.py +314 -0
  35. lxml/html/usedoctest.py +13 -0
  36. lxml/includes/__init__.pxd +0 -0
  37. lxml/includes/__init__.py +0 -0
  38. lxml/includes/c14n.pxd +25 -0
  39. lxml/includes/config.pxd +3 -0
  40. lxml/includes/dtdvalid.pxd +18 -0
  41. lxml/includes/etree_defs.h +379 -0
  42. lxml/includes/etreepublic.pxd +237 -0
  43. lxml/includes/extlibs/__init__.py +0 -0
  44. lxml/includes/extlibs/libcharset.h +45 -0
  45. lxml/includes/extlibs/localcharset.h +137 -0
  46. lxml/includes/extlibs/zconf.h +543 -0
  47. lxml/includes/extlibs/zlib.h +1938 -0
  48. lxml/includes/htmlparser.pxd +56 -0
  49. lxml/includes/libexslt/__init__.py +0 -0
  50. lxml/includes/libexslt/exslt.h +108 -0
  51. lxml/includes/libexslt/exsltconfig.h +70 -0
  52. lxml/includes/libexslt/exsltexports.h +63 -0
  53. lxml/includes/libxml/HTMLparser.h +339 -0
  54. lxml/includes/libxml/HTMLtree.h +148 -0
  55. lxml/includes/libxml/SAX.h +18 -0
  56. lxml/includes/libxml/SAX2.h +170 -0
  57. lxml/includes/libxml/__init__.py +0 -0
  58. lxml/includes/libxml/c14n.h +115 -0
  59. lxml/includes/libxml/catalog.h +183 -0
  60. lxml/includes/libxml/chvalid.h +230 -0
  61. lxml/includes/libxml/debugXML.h +79 -0
  62. lxml/includes/libxml/dict.h +82 -0
  63. lxml/includes/libxml/encoding.h +307 -0
  64. lxml/includes/libxml/entities.h +147 -0
  65. lxml/includes/libxml/globals.h +25 -0
  66. lxml/includes/libxml/hash.h +251 -0
  67. lxml/includes/libxml/list.h +137 -0
  68. lxml/includes/libxml/nanoftp.h +16 -0
  69. lxml/includes/libxml/nanohttp.h +98 -0
  70. lxml/includes/libxml/parser.h +1633 -0
  71. lxml/includes/libxml/parserInternals.h +591 -0
  72. lxml/includes/libxml/relaxng.h +224 -0
  73. lxml/includes/libxml/schemasInternals.h +959 -0
  74. lxml/includes/libxml/schematron.h +143 -0
  75. lxml/includes/libxml/threads.h +81 -0
  76. lxml/includes/libxml/tree.h +1326 -0
  77. lxml/includes/libxml/uri.h +106 -0
  78. lxml/includes/libxml/valid.h +485 -0
  79. lxml/includes/libxml/xinclude.h +141 -0
  80. lxml/includes/libxml/xlink.h +193 -0
  81. lxml/includes/libxml/xmlIO.h +419 -0
  82. lxml/includes/libxml/xmlautomata.h +163 -0
  83. lxml/includes/libxml/xmlerror.h +962 -0
  84. lxml/includes/libxml/xmlexports.h +96 -0
  85. lxml/includes/libxml/xmlmemory.h +188 -0
  86. lxml/includes/libxml/xmlmodule.h +61 -0
  87. lxml/includes/libxml/xmlreader.h +444 -0
  88. lxml/includes/libxml/xmlregexp.h +116 -0
  89. lxml/includes/libxml/xmlsave.h +111 -0
  90. lxml/includes/libxml/xmlschemas.h +254 -0
  91. lxml/includes/libxml/xmlschemastypes.h +152 -0
  92. lxml/includes/libxml/xmlstring.h +140 -0
  93. lxml/includes/libxml/xmlunicode.h +15 -0
  94. lxml/includes/libxml/xmlversion.h +332 -0
  95. lxml/includes/libxml/xmlwriter.h +489 -0
  96. lxml/includes/libxml/xpath.h +569 -0
  97. lxml/includes/libxml/xpathInternals.h +639 -0
  98. lxml/includes/libxml/xpointer.h +48 -0
  99. lxml/includes/libxslt/__init__.py +0 -0
  100. lxml/includes/libxslt/attributes.h +39 -0
  101. lxml/includes/libxslt/documents.h +93 -0
  102. lxml/includes/libxslt/extensions.h +262 -0
  103. lxml/includes/libxslt/extra.h +72 -0
  104. lxml/includes/libxslt/functions.h +78 -0
  105. lxml/includes/libxslt/imports.h +75 -0
  106. lxml/includes/libxslt/keys.h +53 -0
  107. lxml/includes/libxslt/namespaces.h +68 -0
  108. lxml/includes/libxslt/numbersInternals.h +73 -0
  109. lxml/includes/libxslt/pattern.h +84 -0
  110. lxml/includes/libxslt/preproc.h +43 -0
  111. lxml/includes/libxslt/security.h +104 -0
  112. lxml/includes/libxslt/templates.h +77 -0
  113. lxml/includes/libxslt/transform.h +207 -0
  114. lxml/includes/libxslt/variables.h +118 -0
  115. lxml/includes/libxslt/xslt.h +110 -0
  116. lxml/includes/libxslt/xsltInternals.h +1995 -0
  117. lxml/includes/libxslt/xsltconfig.h +146 -0
  118. lxml/includes/libxslt/xsltexports.h +64 -0
  119. lxml/includes/libxslt/xsltlocale.h +44 -0
  120. lxml/includes/libxslt/xsltutils.h +343 -0
  121. lxml/includes/lxml-version.h +3 -0
  122. lxml/includes/relaxng.pxd +64 -0
  123. lxml/includes/schematron.pxd +34 -0
  124. lxml/includes/tree.pxd +492 -0
  125. lxml/includes/uri.pxd +5 -0
  126. lxml/includes/xinclude.pxd +22 -0
  127. lxml/includes/xmlerror.pxd +852 -0
  128. lxml/includes/xmlparser.pxd +303 -0
  129. lxml/includes/xmlschema.pxd +35 -0
  130. lxml/includes/xpath.pxd +136 -0
  131. lxml/includes/xslt.pxd +190 -0
  132. lxml/isoschematron/__init__.py +348 -0
  133. lxml/isoschematron/resources/rng/iso-schematron.rng +709 -0
  134. lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl +75 -0
  135. lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl +77 -0
  136. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl +313 -0
  137. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl +1160 -0
  138. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl +55 -0
  139. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl +1796 -0
  140. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl +588 -0
  141. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt +84 -0
  142. lxml/iterparse.pxi +438 -0
  143. lxml/lxml.etree.h +244 -0
  144. lxml/lxml.etree_api.h +204 -0
  145. lxml/nsclasses.pxi +281 -0
  146. lxml/objectify.cpython-310-arm-linux-gnueabihf.so +0 -0
  147. lxml/objectify.pyx +2149 -0
  148. lxml/objectpath.pxi +332 -0
  149. lxml/parser.pxi +2059 -0
  150. lxml/parsertarget.pxi +180 -0
  151. lxml/proxy.pxi +619 -0
  152. lxml/public-api.pxi +178 -0
  153. lxml/pyclasslookup.py +3 -0
  154. lxml/readonlytree.pxi +565 -0
  155. lxml/relaxng.pxi +165 -0
  156. lxml/sax.cpython-310-arm-linux-gnueabihf.so +0 -0
  157. lxml/sax.py +286 -0
  158. lxml/saxparser.pxi +875 -0
  159. lxml/schematron.pxi +173 -0
  160. lxml/serializer.pxi +1849 -0
  161. lxml/usedoctest.py +13 -0
  162. lxml/xinclude.pxi +67 -0
  163. lxml/xmlerror.pxi +1654 -0
  164. lxml/xmlid.pxi +179 -0
  165. lxml/xmlschema.pxi +215 -0
  166. lxml/xpath.pxi +487 -0
  167. lxml/xslt.pxi +957 -0
  168. lxml/xsltext.pxi +242 -0
  169. lxml-6.0.0.dist-info/METADATA +163 -0
  170. lxml-6.0.0.dist-info/RECORD +174 -0
  171. lxml-6.0.0.dist-info/WHEEL +5 -0
  172. lxml-6.0.0.dist-info/licenses/LICENSE.txt +31 -0
  173. lxml-6.0.0.dist-info/licenses/LICENSES.txt +29 -0
  174. lxml-6.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,100 @@
1
+ """
2
+ Legacy module - don't use in new code!
3
+
4
+ html5lib now has its own proper implementation.
5
+
6
+ This module implements a tree builder for html5lib that generates lxml
7
+ html element trees. This module uses camelCase as it follows the
8
+ html5lib style guide.
9
+ """
10
+
11
+ from html5lib.treebuilders import _base, etree as etree_builders
12
+ from lxml import html, etree
13
+
14
+
15
class DocumentType:
    """Plain record of the three parts of a DOCTYPE declaration."""

    def __init__(self, name, publicId, systemId):
        # Store the values untouched; formatting happens where they are used.
        self.name, self.publicId, self.systemId = name, publicId, systemId
21
+
22
class Document:
    """Document node used by the tree builder.

    Holds the element tree in ``_elementTree`` (``None`` until it is
    assigned from outside) and a list of child nodes in ``childNodes``.
    """

    def __init__(self):
        self.childNodes = []
        self._elementTree = None

    def appendChild(self, element):
        """Pass the node wrapped by *element* to ``addnext()`` of the tree root."""
        tree_root = self._elementTree.getroot()
        tree_root.addnext(element._element)
30
+
31
+
32
class TreeBuilder(_base.TreeBuilder):
    """html5lib tree builder whose document is backed by an lxml element tree.

    The element and comment wrapper classes are taken from html5lib's
    etree builder modules, created for ``lxml.html`` and ``lxml.etree``
    respectively (see ``__init__``).
    """
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = None
    commentClass = None
    fragmentClass = Document

    def __init__(self, *args, **kwargs):
        """Select the element/comment classes, then initialise the base builder."""
        html_builder = etree_builders.getETreeModule(html, fullTree=False)
        etree_builder = etree_builders.getETreeModule(etree, fullTree=False)
        self.elementClass = html_builder.Element
        self.commentClass = etree_builder.Comment
        _base.TreeBuilder.__init__(self, *args, **kwargs)

    def reset(self):
        """Reset the base builder and clear the root/comment/doctype bookkeeping."""
        _base.TreeBuilder.reset(self)
        self.rootInserted = False
        self.initialComments = []
        self.doctype = None

    def getDocument(self):
        """Return the element tree stored on the current document."""
        return self.document._elementTree

    def getFragment(self):
        """Return the content of the first open element as a flat list.

        The list holds the element's text (if any), its children in
        document order, and its tail (if any).
        """
        fragment = []
        element = self.openElements[0]._element
        if element.text:
            fragment.append(element.text)
        # Iterating an element yields its direct children; this replaces
        # the deprecated ``getchildren()`` call.
        fragment.extend(element)
        if element.tail:
            fragment.append(element.tail)
        return fragment

    def insertDoctype(self, name, publicId, systemId):
        """Remember the doctype; it is serialised later by ``insertRoot``."""
        self.doctype = self.doctypeClass(name, publicId, systemId)

    def insertComment(self, data, parent=None):
        """Insert a comment, queueing it if the root does not exist yet."""
        if not self.rootInserted:
            # No tree to attach to yet: keep the text until insertRoot().
            self.initialComments.append(data)
        else:
            _base.TreeBuilder.insertComment(self, data, parent)

    def insertRoot(self, name):
        """Create the document root and register it with the builder.

        The tree is parsed from an optional DOCTYPE followed by
        ``<html></html>``; *name* is only used for the wrapper element
        that is pushed onto ``openElements``.
        """
        buf = []
        doctype = self.doctype
        if doctype and doctype.name:
            buf.append('<!DOCTYPE %s' % doctype.name)
            if doctype.publicId is not None or doctype.systemId is not None:
                # Only one of the two ids may be set.  Render the missing
                # one as an empty string instead of the literal text "None".
                buf.append(' PUBLIC "%s" "%s"' % (doctype.publicId or '',
                                                  doctype.systemId or ''))
            buf.append('>')
        buf.append('<html></html>')
        root = html.fromstring(''.join(buf))

        # Append the initial comments:
        for comment in self.initialComments:
            root.addprevious(etree.Comment(comment))

        # Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()

        # Add the root element to the internal child/open data structures
        root_element = self.elementClass(name)
        root_element._element = root
        self.document.childNodes.append(root_element)
        self.openElements.append(root_element)

        self.rootInserted = True
lxml/html/_setmixin.py ADDED
@@ -0,0 +1,56 @@
1
+ try:
2
+ from collections.abc import MutableSet
3
+ except ImportError:
4
+ from collections.abc import MutableSet
5
+
6
+
7
class SetMixin(MutableSet):

    """
    Mix-in that supplies the ``set`` method names on top of ``MutableSet``.

    Subclasses must define ``__iter__``, ``add`` and ``remove``.
    """

    def __len__(self):
        # There is no stored size: count by walking the iterator.
        return sum(1 for _ in self)

    def __contains__(self, item):
        # Linear scan comparing by equality.
        return any(item == member for member in self)

    # Named equivalents of the comparison operators.
    issubset = MutableSet.__le__
    issuperset = MutableSet.__ge__

    # Named equivalents of the binary set operators.
    union = MutableSet.__or__
    intersection = MutableSet.__and__
    difference = MutableSet.__sub__
    symmetric_difference = MutableSet.__xor__

    def copy(self):
        """Return the current members as a plain built-in ``set``."""
        return {*self}

    def update(self, other):
        """In-place union with *other*."""
        self |= other

    def intersection_update(self, other):
        """In-place intersection with *other*."""
        self &= other

    def difference_update(self, other):
        """In-place difference with *other*."""
        self -= other

    def symmetric_difference_update(self, other):
        """In-place symmetric difference with *other*."""
        self ^= other

    def discard(self, item):
        """Remove *item*, ignoring the ``KeyError`` raised when it is absent."""
        try:
            self.remove(item)
        except KeyError:
            return

    @classmethod
    def _from_iterable(cls, it):
        # Results of the binary operators are plain built-in sets.
        return {*it}
lxml/html/builder.py ADDED
@@ -0,0 +1,173 @@
1
+ # --------------------------------------------------------------------
2
+ # The ElementTree toolkit is
3
+ # Copyright (c) 1999-2004 by Fredrik Lundh
4
+ # --------------------------------------------------------------------
5
+
6
+ """
7
+ A set of HTML generator tags for building HTML documents.
8
+
9
+ Usage::
10
+
11
+ >>> from lxml.html.builder import *
12
+ >>> html = HTML(
13
+ ... HEAD( TITLE("Hello World") ),
14
+ ... BODY( CLASS("main"),
15
+ ... H1("Hello World !")
16
+ ... )
17
+ ... )
18
+
19
+ >>> import lxml.etree
20
+ >>> print(lxml.etree.tostring(html, pretty_print=True, encoding='unicode'))
21
+ <html>
22
+ <head>
23
+ <title>Hello World</title>
24
+ </head>
25
+ <body class="main">
26
+ <h1>Hello World !</h1>
27
+ </body>
28
+ </html>
29
+
30
+ """
31
+
32
+ from lxml.builder import ElementMaker
33
+ from lxml.html import html_parser
34
+
35
+ E = ElementMaker(makeelement=html_parser.makeelement)
36
+
37
+ # elements
38
+ A = E.a #: anchor
39
+ ABBR = E.abbr #: abbreviated form (e.g., WWW, HTTP, etc.)
40
+ ACRONYM = E.acronym #: acronym (obsolete in HTML5; use ABBR)
41
+ ADDRESS = E.address #: information on author
42
+ APPLET = E.applet #: Java applet (DEPRECATED)
43
+ AREA = E.area #: client-side image map area
44
+ ARTICLE = E.article #: self-contained article
45
+ ASIDE = E.aside #: indirectly-related content
46
+ AUDIO = E.audio #: embedded audio file
47
+ B = E.b #: bold text style
48
+ BASE = E.base #: document base URI
49
+ BASEFONT = E.basefont #: base font size (DEPRECATED)
50
+ BDI = E.bdi #: isolate bidirectional text
51
+ BDO = E.bdo #: I18N BiDi over-ride
52
+ BIG = E.big #: large text style
53
+ BLOCKQUOTE = E.blockquote #: long quotation
54
+ BODY = E.body #: document body
55
+ BR = E.br #: forced line break
56
+ BUTTON = E.button #: push button
57
+ CANVAS = E.canvas #: scriptable graphics container
58
+ CAPTION = E.caption #: table caption
59
+ CENTER = E.center #: shorthand for DIV align=center (DEPRECATED)
60
+ CITE = E.cite #: citation
61
+ CODE = E.code #: computer code fragment
62
+ COL = E.col #: table column
63
+ COLGROUP = E.colgroup #: table column group
64
+ DATA = E.data #: machine-readable translation
65
+ DATALIST = E.datalist #: list of options for an input
66
+ DD = E.dd #: definition description
67
+ DEL = getattr(E, 'del') #: deleted text
68
+ DETAILS = E.details #: expandable section
69
+ DFN = E.dfn #: instance definition
70
+ DIALOG = E.dialog #: dialog box
71
+ DIR = E.dir #: directory list (DEPRECATED)
72
+ DIV = E.div #: generic language/style container
73
+ DL = E.dl #: definition list
74
+ DT = E.dt #: definition term
75
+ EM = E.em #: emphasis
76
+ EMBED = E.embed #: embedded external content
77
+ FIELDSET = E.fieldset #: form control group
78
+ FIGCAPTION = E.figcaption #: figure caption
79
+ FIGURE = E.figure #: self-contained, possibly-captioned content
80
+ FONT = E.font #: local change to font (DEPRECATED)
81
+ FOOTER = E.footer #: footer for nearest ancestor
82
+ FORM = E.form #: interactive form
83
+ FRAME = E.frame #: subwindow
84
+ FRAMESET = E.frameset #: window subdivision
85
+ H1 = E.h1 #: heading
86
+ H2 = E.h2 #: heading
87
+ H3 = E.h3 #: heading
88
+ H4 = E.h4 #: heading
89
+ H5 = E.h5 #: heading
90
+ H6 = E.h6 #: heading
91
+ HEAD = E.head #: document head
92
+ HEADER = E.header #: heading content
93
+ HGROUP = E.hgroup #: heading group
94
+ HR = E.hr #: horizontal rule
95
+ HTML = E.html #: document root element
96
+ I = E.i #: italic text style
97
+ IFRAME = E.iframe #: inline subwindow
98
+ IMG = E.img #: Embedded image
99
+ INPUT = E.input #: form control
100
+ INS = E.ins #: inserted text
101
+ ISINDEX = E.isindex #: single line prompt (DEPRECATED)
102
+ KBD = E.kbd #: text to be entered by the user
103
+ LABEL = E.label #: form field label text
104
+ LEGEND = E.legend #: fieldset legend
105
+ LI = E.li #: list item
106
+ LINK = E.link #: a media-independent link
107
+ MAIN = E.main #: main content
108
+ MAP = E.map #: client-side image map
109
+ MARK = E.mark #: marked/highlighted text
110
+ MARQUEE = E.marquee #: scrolling text
111
+ MENU = E.menu #: menu list (DEPRECATED)
112
+ META = E.meta #: generic metainformation
113
+ METER = E.meter #: numerical value display
114
+ NAV = E.nav #: navigation section
115
+ NOBR = E.nobr #: prevent wrapping
116
+ NOFRAMES = E.noframes #: alternate content container for non frame-based rendering
117
+ NOSCRIPT = E.noscript #: alternate content container for non script-based rendering
118
+ OBJECT = E.object #: generic embedded object
119
+ OL = E.ol #: ordered list
120
+ OPTGROUP = E.optgroup #: option group
121
+ OPTION = E.option #: selectable choice
122
+ OUTPUT = E.output #: result of a calculation
123
+ P = E.p #: paragraph
124
+ PARAM = E.param #: named property value
125
+ PICTURE = E.picture #: picture with multiple sources
126
+ PORTAL = E.portal #: embedded preview
127
+ PRE = E.pre #: preformatted text
128
+ PROGRESS = E.progress #: progress bar
129
+ Q = E.q #: short inline quotation
130
+ RB = E.rb #: ruby base text
131
+ RP = E.rp #: ruby parentheses
132
+ RT = E.rt #: ruby text component
133
+ RTC = E.rtc #: ruby semantic annotation
134
+ RUBY = E.ruby #: ruby annotations
135
+ S = E.s #: strike-through text style (DEPRECATED)
136
+ SAMP = E.samp #: sample program output, scripts, etc.
137
+ SCRIPT = E.script #: script statements
138
+ SEARCH = E.search #: set of form controls for a search
139
+ SECTION = E.section #: generic standalone section
140
+ SELECT = E.select #: option selector
141
+ SLOT = E.slot #: placeholder for JS use
142
+ SMALL = E.small #: small text style
143
+ SOURCE = E.source #: source for picture/audio/video element
144
+ SPAN = E.span #: generic language/style container
145
+ STRIKE = E.strike #: strike-through text (DEPRECATED)
146
+ STRONG = E.strong #: strong emphasis
147
+ STYLE = E.style #: style info
148
+ SUB = E.sub #: subscript
149
+ SUMMARY = E.summary #: summary for <details>
150
+ SUP = E.sup #: superscript
151
+ TABLE = E.table #: table
152
+ TBODY = E.tbody #: table body
153
+ TD = E.td #: table data cell
154
+ TEMPLATE = E.template #: fragment for JS use
155
+ TEXTAREA = E.textarea #: multi-line text field
156
+ TFOOT = E.tfoot #: table footer
157
+ TH = E.th #: table header cell
158
+ THEAD = E.thead #: table header
159
+ TIME = E.time #: date/time
160
+ TITLE = E.title #: document title
161
+ TR = E.tr #: table row
162
+ TRACK = E.track #: audio/video track
163
+ TT = E.tt #: teletype or monospaced text style
164
+ U = E.u #: underlined text style (DEPRECATED)
165
+ UL = E.ul #: unordered list
166
+ VAR = E.var #: instance of a variable or program argument
167
+ VIDEO = E.video #: embedded video file
168
+ WBR = E.wbr #: word break
169
+
170
+ # attributes (only reserved words are included here)
171
+ ATTR = dict
172
def CLASS(v):
    """Return an attribute dict mapping the reserved word ``class`` to *v*."""
    attributes = {}
    attributes['class'] = v
    return attributes
173
def FOR(v):
    """Return an attribute dict mapping the reserved word ``for`` to *v*."""
    attributes = {}
    attributes['for'] = v
    return attributes
lxml/html/clean.py ADDED
@@ -0,0 +1,21 @@
1
+ # cython: language_level=3str
2
+
3
+ """Backward-compatibility module for lxml_html_clean"""
4
+
5
+ try:
6
+ from lxml_html_clean import *
7
+
8
+ __all__ = [
9
+ "clean_html",
10
+ "clean",
11
+ "Cleaner",
12
+ "autolink",
13
+ "autolink_html",
14
+ "word_break",
15
+ "word_break_html",
16
+ ]
17
+ except ImportError:
18
+ raise ImportError(
19
+ "lxml.html.clean module is now a separate project lxml_html_clean.\n"
20
+ "Install lxml[html_clean] or lxml_html_clean directly."
21
+ ) from None
lxml/html/defs.py ADDED
@@ -0,0 +1,135 @@
1
+ # FIXME: this should all be confirmed against what a DTD says
2
+ # (probably in a test; this may not match the DTD exactly, but we
3
+ # should document just how it differs).
4
+
5
+ """
6
+ Data taken from https://www.w3.org/TR/html401/index/elements.html
7
+ and https://html.spec.whatwg.org/multipage/syntax.html#elements-2
8
+ for html5_tags.
9
+ """
10
+
11
# Elements listed here as having no content.
empty_tags = frozenset({
    'area', 'base', 'basefont', 'br', 'col', 'embed', 'frame', 'hr',
    'img', 'input', 'isindex', 'link', 'meta', 'param', 'source',
    'track', 'wbr',
})

# Elements listed here as deprecated.
deprecated_tags = frozenset({
    'applet', 'basefont', 'center', 'dir', 'font', 'isindex',
    'menu', 's', 'strike', 'u',
})

# Attributes whose value is a link.
# Note: 'archive' actually takes a space-separated list of URIs.
link_attrs = frozenset({
    'action', 'archive', 'background', 'cite', 'classid',
    'codebase', 'data', 'href', 'longdesc', 'profile', 'src',
    'usemap',
    # Not standard:
    'dynsrc', 'lowsrc',
    # HTML5 formaction
    'formaction',
})

# Event handler attributes.
# 'onerror' and 'onresize' are not in the HTML 4 spec.
event_attrs = frozenset({
    'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
    'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
    'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
    'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit',
    'onunload',
})

# Attributes listed here as safe.
safe_attrs = frozenset({
    'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
    'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
    'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan',
    'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
    'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id',
    'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
    'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
    'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
    'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
    'type', 'usemap', 'valign', 'value', 'vspace', 'width',
})

# From http://htmlhelp.com/reference/html40/olist.html
top_level_tags = frozenset({'html', 'head', 'body', 'frameset'})

head_tags = frozenset({
    'base', 'isindex', 'link', 'meta', 'script', 'style', 'title',
})

general_block_tags = frozenset({
    'address', 'blockquote', 'center', 'del', 'div',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'hr', 'ins', 'isindex', 'noscript', 'p', 'pre',
})

list_tags = frozenset({
    'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
})

table_tags = frozenset({
    'table', 'caption', 'colgroup', 'col',
    'thead', 'tfoot', 'tbody', 'tr', 'td', 'th',
})

# just this one from
# http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm
block_tags = general_block_tags.union(
    list_tags,
    table_tags,
    # Partial form tags
    {'fieldset', 'form', 'legend', 'optgroup', 'option'},
)

form_tags = frozenset({
    'form', 'button', 'fieldset', 'legend', 'input', 'label',
    'select', 'optgroup', 'option', 'textarea',
})

special_inline_tags = frozenset({
    'a', 'applet', 'basefont', 'bdo', 'br', 'embed', 'font', 'iframe',
    'img', 'map', 'area', 'object', 'param', 'q', 'script',
    'span', 'sub', 'sup',
})

phrase_tags = frozenset({
    'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em',
    'ins', 'kbd', 'samp', 'strong', 'var',
})

font_style_tags = frozenset({
    'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u',
})

frame_tags = frozenset({'frameset', 'frame', 'noframes'})

html5_tags = frozenset({
    'article', 'aside', 'audio', 'canvas', 'command', 'datalist',
    'details', 'embed', 'figcaption', 'figure', 'footer', 'header',
    'hgroup', 'keygen', 'mark', 'math', 'meter', 'nav', 'output',
    'progress', 'rp', 'rt', 'ruby', 'section', 'source', 'summary',
    'svg', 'time', 'track', 'video', 'wbr',
})

# These tags aren't standard
nonstandard_tags = frozenset({'blink', 'marquee'})


# Union of the tag groups above; frame_tags is not one of the operands.
tags = top_level_tags.union(
    head_tags,
    general_block_tags,
    list_tags,
    table_tags,
    form_tags,
    special_inline_tags,
    phrase_tags,
    font_style_tags,
    nonstandard_tags,
    html5_tags,
)