lxml 5.3.2__cp310-cp310-win32.win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175) hide show
  1. lxml/ElementInclude.py +244 -0
  2. lxml/__init__.py +22 -0
  3. lxml/_elementpath.cp310-win32.pyd +0 -0
  4. lxml/_elementpath.py +341 -0
  5. lxml/apihelpers.pxi +1793 -0
  6. lxml/builder.cp310-win32.pyd +0 -0
  7. lxml/builder.py +232 -0
  8. lxml/classlookup.pxi +580 -0
  9. lxml/cleanup.pxi +215 -0
  10. lxml/cssselect.py +101 -0
  11. lxml/debug.pxi +90 -0
  12. lxml/docloader.pxi +178 -0
  13. lxml/doctestcompare.py +488 -0
  14. lxml/dtd.pxi +479 -0
  15. lxml/etree.cp310-win32.pyd +0 -0
  16. lxml/etree.h +248 -0
  17. lxml/etree.pyx +3732 -0
  18. lxml/etree_api.h +195 -0
  19. lxml/extensions.pxi +833 -0
  20. lxml/html/ElementSoup.py +10 -0
  21. lxml/html/__init__.py +1923 -0
  22. lxml/html/_diffcommand.py +86 -0
  23. lxml/html/_html5builder.py +100 -0
  24. lxml/html/_setmixin.py +56 -0
  25. lxml/html/builder.py +133 -0
  26. lxml/html/clean.py +21 -0
  27. lxml/html/defs.py +135 -0
  28. lxml/html/diff.cp310-win32.pyd +0 -0
  29. lxml/html/diff.py +878 -0
  30. lxml/html/formfill.py +299 -0
  31. lxml/html/html5parser.py +260 -0
  32. lxml/html/soupparser.py +314 -0
  33. lxml/html/usedoctest.py +13 -0
  34. lxml/includes/__init__.pxd +0 -0
  35. lxml/includes/__init__.py +0 -0
  36. lxml/includes/c14n.pxd +25 -0
  37. lxml/includes/config.pxd +3 -0
  38. lxml/includes/dtdvalid.pxd +18 -0
  39. lxml/includes/etree_defs.h +379 -0
  40. lxml/includes/etreepublic.pxd +237 -0
  41. lxml/includes/extlibs/__init__.py +0 -0
  42. lxml/includes/extlibs/zconf.h +543 -0
  43. lxml/includes/extlibs/zlib.h +1938 -0
  44. lxml/includes/htmlparser.pxd +56 -0
  45. lxml/includes/libexslt/__init__.py +0 -0
  46. lxml/includes/libexslt/exslt.h +108 -0
  47. lxml/includes/libexslt/exsltconfig.h +70 -0
  48. lxml/includes/libexslt/exsltexports.h +63 -0
  49. lxml/includes/libexslt/libexslt.h +29 -0
  50. lxml/includes/libxml/HTMLparser.h +320 -0
  51. lxml/includes/libxml/HTMLtree.h +147 -0
  52. lxml/includes/libxml/SAX.h +204 -0
  53. lxml/includes/libxml/SAX2.h +173 -0
  54. lxml/includes/libxml/__init__.py +0 -0
  55. lxml/includes/libxml/c14n.h +128 -0
  56. lxml/includes/libxml/catalog.h +182 -0
  57. lxml/includes/libxml/chvalid.h +230 -0
  58. lxml/includes/libxml/debugXML.h +217 -0
  59. lxml/includes/libxml/dict.h +81 -0
  60. lxml/includes/libxml/encoding.h +233 -0
  61. lxml/includes/libxml/entities.h +151 -0
  62. lxml/includes/libxml/globals.h +529 -0
  63. lxml/includes/libxml/hash.h +236 -0
  64. lxml/includes/libxml/list.h +137 -0
  65. lxml/includes/libxml/nanoftp.h +186 -0
  66. lxml/includes/libxml/nanohttp.h +81 -0
  67. lxml/includes/libxml/parser.h +1265 -0
  68. lxml/includes/libxml/parserInternals.h +662 -0
  69. lxml/includes/libxml/pattern.h +100 -0
  70. lxml/includes/libxml/relaxng.h +218 -0
  71. lxml/includes/libxml/schemasInternals.h +958 -0
  72. lxml/includes/libxml/schematron.h +142 -0
  73. lxml/includes/libxml/threads.h +94 -0
  74. lxml/includes/libxml/tree.h +1314 -0
  75. lxml/includes/libxml/uri.h +94 -0
  76. lxml/includes/libxml/valid.h +448 -0
  77. lxml/includes/libxml/xinclude.h +129 -0
  78. lxml/includes/libxml/xlink.h +189 -0
  79. lxml/includes/libxml/xmlIO.h +369 -0
  80. lxml/includes/libxml/xmlautomata.h +146 -0
  81. lxml/includes/libxml/xmlerror.h +919 -0
  82. lxml/includes/libxml/xmlexports.h +50 -0
  83. lxml/includes/libxml/xmlmemory.h +228 -0
  84. lxml/includes/libxml/xmlmodule.h +57 -0
  85. lxml/includes/libxml/xmlreader.h +428 -0
  86. lxml/includes/libxml/xmlregexp.h +222 -0
  87. lxml/includes/libxml/xmlsave.h +88 -0
  88. lxml/includes/libxml/xmlschemas.h +246 -0
  89. lxml/includes/libxml/xmlschemastypes.h +152 -0
  90. lxml/includes/libxml/xmlstring.h +140 -0
  91. lxml/includes/libxml/xmlunicode.h +202 -0
  92. lxml/includes/libxml/xmlversion.h +526 -0
  93. lxml/includes/libxml/xmlwriter.h +488 -0
  94. lxml/includes/libxml/xpath.h +575 -0
  95. lxml/includes/libxml/xpathInternals.h +632 -0
  96. lxml/includes/libxml/xpointer.h +137 -0
  97. lxml/includes/libxslt/__init__.py +0 -0
  98. lxml/includes/libxslt/attributes.h +39 -0
  99. lxml/includes/libxslt/documents.h +93 -0
  100. lxml/includes/libxslt/extensions.h +262 -0
  101. lxml/includes/libxslt/extra.h +72 -0
  102. lxml/includes/libxslt/functions.h +78 -0
  103. lxml/includes/libxslt/imports.h +75 -0
  104. lxml/includes/libxslt/keys.h +53 -0
  105. lxml/includes/libxslt/libxslt.h +36 -0
  106. lxml/includes/libxslt/namespaces.h +68 -0
  107. lxml/includes/libxslt/numbersInternals.h +73 -0
  108. lxml/includes/libxslt/preproc.h +43 -0
  109. lxml/includes/libxslt/security.h +104 -0
  110. lxml/includes/libxslt/templates.h +77 -0
  111. lxml/includes/libxslt/transform.h +207 -0
  112. lxml/includes/libxslt/trio.h +216 -0
  113. lxml/includes/libxslt/triodef.h +220 -0
  114. lxml/includes/libxslt/variables.h +118 -0
  115. lxml/includes/libxslt/win32config.h +51 -0
  116. lxml/includes/libxslt/xslt.h +110 -0
  117. lxml/includes/libxslt/xsltInternals.h +1992 -0
  118. lxml/includes/libxslt/xsltconfig.h +179 -0
  119. lxml/includes/libxslt/xsltexports.h +64 -0
  120. lxml/includes/libxslt/xsltlocale.h +44 -0
  121. lxml/includes/libxslt/xsltutils.h +343 -0
  122. lxml/includes/lxml-version.h +3 -0
  123. lxml/includes/relaxng.pxd +64 -0
  124. lxml/includes/schematron.pxd +34 -0
  125. lxml/includes/tree.pxd +494 -0
  126. lxml/includes/uri.pxd +5 -0
  127. lxml/includes/xinclude.pxd +22 -0
  128. lxml/includes/xmlerror.pxd +852 -0
  129. lxml/includes/xmlparser.pxd +265 -0
  130. lxml/includes/xmlschema.pxd +35 -0
  131. lxml/includes/xpath.pxd +136 -0
  132. lxml/includes/xslt.pxd +190 -0
  133. lxml/isoschematron/__init__.py +348 -0
  134. lxml/isoschematron/resources/rng/iso-schematron.rng +709 -0
  135. lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl +75 -0
  136. lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl +77 -0
  137. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl +313 -0
  138. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl +1160 -0
  139. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl +55 -0
  140. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl +1796 -0
  141. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl +588 -0
  142. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt +84 -0
  143. lxml/iterparse.pxi +438 -0
  144. lxml/lxml.etree.h +248 -0
  145. lxml/lxml.etree_api.h +195 -0
  146. lxml/nsclasses.pxi +281 -0
  147. lxml/objectify.cp310-win32.pyd +0 -0
  148. lxml/objectify.pyx +2145 -0
  149. lxml/objectpath.pxi +332 -0
  150. lxml/parser.pxi +2000 -0
  151. lxml/parsertarget.pxi +180 -0
  152. lxml/proxy.pxi +619 -0
  153. lxml/public-api.pxi +178 -0
  154. lxml/pyclasslookup.py +3 -0
  155. lxml/readonlytree.pxi +565 -0
  156. lxml/relaxng.pxi +165 -0
  157. lxml/sax.cp310-win32.pyd +0 -0
  158. lxml/sax.py +275 -0
  159. lxml/saxparser.pxi +875 -0
  160. lxml/schematron.pxi +168 -0
  161. lxml/serializer.pxi +1781 -0
  162. lxml/usedoctest.py +13 -0
  163. lxml/xinclude.pxi +67 -0
  164. lxml/xmlerror.pxi +1654 -0
  165. lxml/xmlid.pxi +179 -0
  166. lxml/xmlschema.pxi +215 -0
  167. lxml/xpath.pxi +487 -0
  168. lxml/xslt.pxi +950 -0
  169. lxml/xsltext.pxi +242 -0
  170. lxml-5.3.2.dist-info/METADATA +100 -0
  171. lxml-5.3.2.dist-info/RECORD +175 -0
  172. lxml-5.3.2.dist-info/WHEEL +5 -0
  173. lxml-5.3.2.dist-info/licenses/LICENSE.txt +29 -0
  174. lxml-5.3.2.dist-info/licenses/LICENSES.txt +29 -0
  175. lxml-5.3.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,86 @@
1
+ import optparse
2
+ import sys
3
+ import re
4
+ import os
5
+ from .diff import htmldiff
6
+
7
# Text shown below the usage line in ``--help`` output (currently empty).
description = """\
"""

# Command-line interface of the HTML diff tool.
parser = optparse.OptionParser(
    usage="%prog [OPTIONS] FILE1 FILE2\n"
          "%prog --annotate [OPTIONS] INFO1 FILE1 INFO2 FILE2 ...",
    description=description,
    )

# Destination of the diff; "-" (the default) is treated as stdout by main().
parser.add_option(
    '-o', '--output',
    metavar="FILE",
    dest="output",
    default="-",
    help="File to write the difference to",
    )

# The usage text above advertises ``--annotate`` while only ``--annotation``
# used to be registered, so the documented spelling was rejected as an
# unknown option.  Both spellings are accepted now and store into the same
# ``annotation`` destination.
parser.add_option(
    '-a', '--annotate', '--annotation',
    action="store_true",
    dest="annotation",
    help="Do an annotation")
29
+
30
def main(args=None):
    """Run the HTML diff command line tool.

    ``args`` is the list of command line arguments without the program
    name; when it is ``None``, ``sys.argv[1:]`` is used.

    With the annotation option set, the call is delegated to ``annotate()``.
    Otherwise exactly two input files are required: the bodies of both
    documents are diffed with ``htmldiff()`` and the result is wrapped in
    everything that surrounds the body of the *second* document.  The
    result goes to stdout when the output option is "-", otherwise to the
    named file.

    Exits with status 1 (after printing the help text) when the number of
    positional arguments is not two.
    """
    if args is None:
        args = sys.argv[1:]
    options, args = parser.parse_args(args)
    if options.annotation:
        return annotate(options, args)
    if len(args) != 2:
        print('Error: you must give two files')
        parser.print_help()
        sys.exit(1)
    file1, file2 = args
    input1 = read_file(file1)
    input2 = read_file(file2)
    # Only the body of the first document takes part in the diff.
    body1 = split_body(input1)[1]
    pre, body2, post = split_body(input2)
    result = htmldiff(body1, body2)
    result = pre + result + post
    if options.output == '-':
        if not result.endswith('\n'):
            result += '\n'
        sys.stdout.write(result)
    else:
        # The result is text, so the file must be opened in text mode
        # (binary mode rejected the str).  newline='' writes line endings
        # exactly as they appear in the result.
        with open(options.output, 'w', encoding='utf-8', newline='') as f:
            f.write(result)


def read_file(filename):
    """Return the content of ``filename`` as text.

    "-" reads from standard input.  Files are decoded as UTF-8 (undecodable
    bytes are replaced) because the body-splitting regular expressions are
    text patterns and cannot be applied to bytes.  newline='' keeps the
    original line endings untouched.

    Raises ``OSError`` when ``filename`` does not exist.
    """
    if filename == '-':
        return sys.stdin.read()
    if not os.path.exists(filename):
        raise OSError(
            "Input file %s does not exist" % filename)
    with open(filename, encoding='utf-8', errors='replace', newline='') as f:
        return f.read()
65
+
66
# Opening and closing <body> tags; case-insensitive, and the tag itself may
# span several lines.
body_start_re = re.compile(r"<body.*?>", re.IGNORECASE | re.DOTALL)
body_end_re = re.compile(r"</body.*?>", re.IGNORECASE | re.DOTALL)


def split_body(html):
    """Split ``html`` into ``(pre, body, post)``.

    ``pre`` is everything up to and including the first opening body tag,
    ``post`` starts at the first closing body tag found after that, and
    ``body`` is what lies in between.  A part whose tag is not found is
    returned as an empty string, leaving the text in ``body``.
    """
    opening = body_start_re.search(html)
    if opening is None:
        head, remainder = '', html
    else:
        cut = opening.end()
        head, remainder = html[:cut], html[cut:]

    closing = body_end_re.search(remainder)
    if closing is None:
        return head, remainder, ''
    cut = closing.start()
    return head, remainder[:cut], remainder[cut:]
82
+
83
def annotate(options, args):
    """Placeholder for the annotation mode.

    Neither argument is used: the function reports that the mode is not
    implemented and terminates with exit status 1.
    """
    print("Not yet implemented")
    raise SystemExit(1)
86
+
@@ -0,0 +1,100 @@
1
+ """
2
+ Legacy module - don't use in new code!
3
+
4
+ html5lib now has its own proper implementation.
5
+
6
+ This module implements a tree builder for html5lib that generates lxml
7
+ html element trees. This module uses camelCase as it follows the
8
+ html5lib style guide.
9
+ """
10
+
11
+ from html5lib.treebuilders import _base, etree as etree_builders
12
+ from lxml import html, etree
13
+
14
+
15
class DocumentType:
    """Plain record of a doctype: its name, public id and system id."""

    def __init__(self, name, publicId, systemId):
        self.name, self.publicId, self.systemId = name, publicId, systemId
21
+
22
class Document:
    """Document node wrapping an element tree.

    ``_elementTree`` starts out as ``None`` and has to be assigned before
    ``appendChild()`` can be used; ``childNodes`` starts out empty.
    """

    def __init__(self):
        self.childNodes = []
        self._elementTree = None

    def appendChild(self, element):
        """Add the element wrapped by ``element`` right after the tree root."""
        root = self._elementTree.getroot()
        root.addnext(element._element)
30
+
31
+
32
class TreeBuilder(_base.TreeBuilder):
    """html5lib tree builder that generates lxml.html element trees.

    Comments and the doctype seen before the root element are buffered and
    only materialised once ``insertRoot()`` creates the lxml document.
    """
    documentClass = Document
    doctypeClass = DocumentType
    # Both classes are resolved per instance in __init__().
    elementClass = None
    commentClass = None
    fragmentClass = Document

    def __init__(self, *args, **kwargs):
        """Resolve the element/comment wrapper classes, then initialise the base builder."""
        html_builder = etree_builders.getETreeModule(html, fullTree=False)
        etree_builder = etree_builders.getETreeModule(etree, fullTree=False)
        self.elementClass = html_builder.Element
        self.commentClass = etree_builder.Comment
        _base.TreeBuilder.__init__(self, *args, **kwargs)

    def reset(self):
        """Reset the base builder and the pre-root bookkeeping of this class."""
        _base.TreeBuilder.reset(self)
        self.rootInserted = False
        self.initialComments = []
        self.doctype = None

    def getDocument(self):
        """Return the element tree stored on the document."""
        return self.document._elementTree

    def getFragment(self):
        """Return the content of the first open element as a list.

        The list holds the element's leading text (if any), its child
        elements, and its tail text (if any), in that order.
        """
        fragment = []
        element = self.openElements[0]._element
        if element.text:
            fragment.append(element.text)
        # Iterating an element yields its children; this replaces the
        # deprecated getchildren() call.
        fragment.extend(element)
        if element.tail:
            fragment.append(element.tail)
        return fragment

    def insertDoctype(self, name, publicId, systemId):
        """Remember the doctype; it is emitted later by insertRoot()."""
        doctype = self.doctypeClass(name, publicId, systemId)
        self.doctype = doctype

    def insertComment(self, data, parent=None):
        """Insert a comment, buffering it while no root element exists yet."""
        if not self.rootInserted:
            self.initialComments.append(data)
        else:
            _base.TreeBuilder.insertComment(self, data, parent)

    def insertRoot(self, name):
        """Create the lxml document and register its root as the open element.

        The document is parsed from a synthetic ``<html></html>`` string,
        preceded by a DOCTYPE declaration when a named doctype was recorded.
        """
        buf = []
        if self.doctype and self.doctype.name:
            buf.append('<!DOCTYPE %s' % self.doctype.name)
            public_id = self.doctype.publicId
            system_id = self.doctype.systemId
            if public_id is not None or system_id is not None:
                # Only one of the two ids may be set; render the missing
                # one as an empty string rather than the literal "None".
                buf.append(' PUBLIC "%s" "%s"' % (public_id or '',
                                                  system_id or ''))
            buf.append('>')
        buf.append('<html></html>')
        root = html.fromstring(''.join(buf))

        # Append the initial comments:
        for comment in self.initialComments:
            root.addprevious(etree.Comment(comment))

        # Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()

        # Add the root element to the internal child/open data structures
        root_element = self.elementClass(name)
        root_element._element = root
        self.document.childNodes.append(root_element)
        self.openElements.append(root_element)

        self.rootInserted = True
lxml/html/_setmixin.py ADDED
@@ -0,0 +1,56 @@
1
+ try:
2
+ from collections.abc import MutableSet
3
+ except ImportError:
4
+ from collections.abc import MutableSet
5
+
6
+
7
class SetMixin(MutableSet):

    """
    Mix-in that supplies the ``set`` method API on top of ``MutableSet``.

    Subclasses must define ``__iter__``, ``add`` and ``remove``; length and
    membership are derived by iterating over the items.
    """

    def __len__(self):
        # Count the items by walking the iterator once.
        return sum(1 for _ in self)

    def __contains__(self, item):
        # Linear scan using equality.
        return any(item == member for member in self)

    # Named counterparts of the comparison operators.
    issubset = MutableSet.__le__
    issuperset = MutableSet.__ge__

    # Named counterparts of the binary set operators.
    union = MutableSet.__or__
    intersection = MutableSet.__and__
    difference = MutableSet.__sub__
    symmetric_difference = MutableSet.__xor__

    def copy(self):
        """Return the items as a plain built-in ``set``."""
        return set(self)

    def update(self, other):
        """In-place union with ``other``."""
        self |= other

    def intersection_update(self, other):
        """In-place intersection with ``other``."""
        self &= other

    def difference_update(self, other):
        """In-place difference with ``other``."""
        self -= other

    def symmetric_difference_update(self, other):
        """In-place symmetric difference with ``other``."""
        self ^= other

    def discard(self, item):
        """Remove ``item`` if present; a missing item is not an error."""
        try:
            self.remove(item)
        except KeyError:
            pass

    @classmethod
    def _from_iterable(cls, it):
        # Results of the binary set operators are plain built-in sets.
        return set(it)
lxml/html/builder.py ADDED
@@ -0,0 +1,133 @@
1
+ # --------------------------------------------------------------------
2
+ # The ElementTree toolkit is
3
+ # Copyright (c) 1999-2004 by Fredrik Lundh
4
+ # --------------------------------------------------------------------
5
+
6
+ """
7
+ A set of HTML generator tags for building HTML documents.
8
+
9
+ Usage::
10
+
11
+ >>> from lxml.html.builder import *
12
+ >>> html = HTML(
13
+ ... HEAD( TITLE("Hello World") ),
14
+ ... BODY( CLASS("main"),
15
+ ... H1("Hello World !")
16
+ ... )
17
+ ... )
18
+
19
+ >>> import lxml.etree
20
+ >>> print(lxml.etree.tostring(html, pretty_print=True, encoding='unicode'))
21
+ <html>
22
+ <head>
23
+ <title>Hello World</title>
24
+ </head>
25
+ <body class="main">
26
+ <h1>Hello World !</h1>
27
+ </body>
28
+ </html>
29
+
30
+ """
31
+
32
+ from lxml.builder import ElementMaker
33
+ from lxml.html import html_parser
34
+
35
+ E = ElementMaker(makeelement=html_parser.makeelement)
36
+
37
+ # elements
38
+ A = E.a #: anchor
39
+ ABBR = E.abbr #: abbreviated form (e.g., WWW, HTTP, etc.)
40
+ ACRONYM = E.acronym #: acronym
41
+ ADDRESS = E.address #: information on author
42
+ APPLET = E.applet #: Java applet (DEPRECATED)
43
+ AREA = E.area #: client-side image map area
44
+ B = E.b #: bold text style
45
+ BASE = E.base #: document base URI
46
+ BASEFONT = E.basefont #: base font size (DEPRECATED)
47
+ BDO = E.bdo #: I18N BiDi over-ride
48
+ BIG = E.big #: large text style
49
+ BLOCKQUOTE = E.blockquote #: long quotation
50
+ BODY = E.body #: document body
51
+ BR = E.br #: forced line break
52
+ BUTTON = E.button #: push button
53
+ CAPTION = E.caption #: table caption
54
+ CENTER = E.center #: shorthand for DIV align=center (DEPRECATED)
55
+ CITE = E.cite #: citation
56
+ CODE = E.code #: computer code fragment
57
+ COL = E.col #: table column
58
+ COLGROUP = E.colgroup #: table column group
59
+ DD = E.dd #: definition description
60
+ DEL = getattr(E, 'del') #: deleted text (getattr because ``del`` is a Python keyword)
61
+ DFN = E.dfn #: instance definition
62
+ DIR = E.dir #: directory list (DEPRECATED)
63
+ DIV = E.div #: generic language/style container
64
+ DL = E.dl #: definition list
65
+ DT = E.dt #: definition term
66
+ EM = E.em #: emphasis
67
+ FIELDSET = E.fieldset #: form control group
68
+ FONT = E.font #: local change to font (DEPRECATED)
69
+ FORM = E.form #: interactive form
70
+ FRAME = E.frame #: subwindow
71
+ FRAMESET = E.frameset #: window subdivision
72
+ H1 = E.h1 #: heading
73
+ H2 = E.h2 #: heading
74
+ H3 = E.h3 #: heading
75
+ H4 = E.h4 #: heading
76
+ H5 = E.h5 #: heading
77
+ H6 = E.h6 #: heading
78
+ HEAD = E.head #: document head
79
+ HR = E.hr #: horizontal rule
80
+ HTML = E.html #: document root element
81
+ I = E.i #: italic text style
82
+ IFRAME = E.iframe #: inline subwindow
83
+ IMG = E.img #: Embedded image
84
+ INPUT = E.input #: form control
85
+ INS = E.ins #: inserted text
86
+ ISINDEX = E.isindex #: single line prompt (DEPRECATED)
87
+ KBD = E.kbd #: text to be entered by the user
88
+ LABEL = E.label #: form field label text
89
+ LEGEND = E.legend #: fieldset legend
90
+ LI = E.li #: list item
91
+ LINK = E.link #: a media-independent link
92
+ MAP = E.map #: client-side image map
93
+ MENU = E.menu #: menu list (DEPRECATED)
94
+ META = E.meta #: generic metainformation
95
+ NOFRAMES = E.noframes #: alternate content container for non frame-based rendering
96
+ NOSCRIPT = E.noscript #: alternate content container for non script-based rendering
97
+ OBJECT = E.object #: generic embedded object
98
+ OL = E.ol #: ordered list
99
+ OPTGROUP = E.optgroup #: option group
100
+ OPTION = E.option #: selectable choice
101
+ P = E.p #: paragraph
102
+ PARAM = E.param #: named property value
103
+ PRE = E.pre #: preformatted text
104
+ Q = E.q #: short inline quotation
105
+ S = E.s #: strike-through text style (DEPRECATED)
106
+ SAMP = E.samp #: sample program output, scripts, etc.
107
+ SCRIPT = E.script #: script statements
108
+ SELECT = E.select #: option selector
109
+ SMALL = E.small #: small text style
110
+ SPAN = E.span #: generic language/style container
111
+ STRIKE = E.strike #: strike-through text (DEPRECATED)
112
+ STRONG = E.strong #: strong emphasis
113
+ STYLE = E.style #: style info
114
+ SUB = E.sub #: subscript
115
+ SUP = E.sup #: superscript
116
+ TABLE = E.table #: table
117
+ TBODY = E.tbody #: table body
118
+ TD = E.td #: table data cell
119
+ TEXTAREA = E.textarea #: multi-line text field
120
+ TFOOT = E.tfoot #: table footer
121
+ TH = E.th #: table header cell
122
+ THEAD = E.thead #: table header
123
+ TITLE = E.title #: document title
124
+ TR = E.tr #: table row
125
+ TT = E.tt #: teletype or monospaced text style
126
+ U = E.u #: underlined text style (DEPRECATED)
127
+ UL = E.ul #: unordered list
128
+ VAR = E.var #: instance of a variable or program argument
129
+
130
+ # attributes (only reserved words are included here)
131
# Generic attribute helper: ATTR(name=value, ...) is just dict(...).
ATTR = dict


def CLASS(v):
    """Return an attribute mapping that sets ``class`` to ``v``."""
    attributes = {'class': v}
    return attributes


def FOR(v):
    """Return an attribute mapping that sets ``for`` to ``v``."""
    attributes = {'for': v}
    return attributes
lxml/html/clean.py ADDED
@@ -0,0 +1,21 @@
1
+ # cython: language_level=3str
2
+
3
+ """Backward-compatibility module for lxml_html_clean"""
4
+
5
# Re-export the API of the standalone ``lxml_html_clean`` package under the
# old module name; without that package, importing this module fails with an
# ImportError explaining how to install it.
try:
    from lxml_html_clean import *
except ImportError:
    raise ImportError(
        "lxml.html.clean module is now a separate project lxml_html_clean.\n"
        "Install lxml[html_clean] or lxml_html_clean directly."
    ) from None
else:
    # Names exported by ``from lxml.html.clean import *``.
    __all__ = [
        "clean_html",
        "clean",
        "Cleaner",
        "autolink",
        "autolink_html",
        "word_break",
        "word_break_html",
    ]
lxml/html/defs.py ADDED
@@ -0,0 +1,135 @@
1
+ # FIXME: this should all be confirmed against what a DTD says
2
+ # (probably in a test; this may not match the DTD exactly, but we
3
+ # should document just how it differs).
4
+
5
+ """
6
+ Data taken from https://www.w3.org/TR/html401/index/elements.html
7
+ and https://www.w3.org/community/webed/wiki/HTML/New_HTML5_Elements
8
+ for html5_tags.
9
+ """
10
+
11
empty_tags = frozenset({
    'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
    'img', 'input', 'isindex', 'link', 'meta', 'param', 'source', 'track',
})

deprecated_tags = frozenset({
    'applet', 'basefont', 'center', 'dir', 'font', 'isindex',
    'menu', 's', 'strike', 'u',
})

# archive actually takes a space-separated list of URIs
link_attrs = frozenset({
    'action', 'archive', 'background', 'cite', 'classid',
    'codebase', 'data', 'href', 'longdesc', 'profile', 'src',
    'usemap',
    # Not standard:
    'dynsrc', 'lowsrc',
    # HTML5 formaction
    'formaction',
})

# Not in the HTML 4 spec:
# onerror, onresize
event_attrs = frozenset({
    'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
    'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
    'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
    'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit',
    'onunload',
})

safe_attrs = frozenset({
    'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
    'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
    'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan',
    'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
    'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id',
    'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
    'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
    'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
    'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
    'type', 'usemap', 'valign', 'value', 'vspace', 'width',
})

# From http://htmlhelp.com/reference/html40/olist.html
top_level_tags = frozenset({'html', 'head', 'body', 'frameset'})

head_tags = frozenset({
    'base', 'isindex', 'link', 'meta', 'script', 'style', 'title',
})

general_block_tags = frozenset({
    'address', 'blockquote', 'center', 'del', 'div',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'hr', 'ins', 'isindex', 'noscript', 'p', 'pre',
})

list_tags = frozenset({
    'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
})

table_tags = frozenset({
    'table', 'caption', 'colgroup', 'col',
    'thead', 'tfoot', 'tbody', 'tr', 'td', 'th',
})

# Partial form tags
_partial_form_block_tags = frozenset({
    'fieldset', 'form', 'legend', 'optgroup', 'option',
})

# just this one from
# http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm
block_tags = (general_block_tags | list_tags | table_tags
              | _partial_form_block_tags)

form_tags = frozenset({
    'form', 'button', 'fieldset', 'legend', 'input', 'label',
    'select', 'optgroup', 'option', 'textarea',
})

special_inline_tags = frozenset({
    'a', 'applet', 'basefont', 'bdo', 'br', 'embed', 'font', 'iframe',
    'img', 'map', 'area', 'object', 'param', 'q', 'script',
    'span', 'sub', 'sup',
})

phrase_tags = frozenset({
    'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em',
    'ins', 'kbd', 'samp', 'strong', 'var',
})

font_style_tags = frozenset({
    'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u',
})

frame_tags = frozenset({'frameset', 'frame', 'noframes'})

html5_tags = frozenset({
    'article', 'aside', 'audio', 'canvas', 'command', 'datalist',
    'details', 'embed', 'figcaption', 'figure', 'footer', 'header',
    'hgroup', 'keygen', 'mark', 'math', 'meter', 'nav', 'output',
    'progress', 'rp', 'rt', 'ruby', 'section', 'source', 'summary',
    'svg', 'time', 'track', 'video', 'wbr',
})

# These tags aren't standard
nonstandard_tags = frozenset({'blink', 'marquee'})


# Union of the tag groups above (frame_tags is not added separately).
tags = frozenset().union(
    top_level_tags, head_tags, general_block_tags, list_tags,
    table_tags, form_tags, special_inline_tags, phrase_tags,
    font_style_tags, nonstandard_tags, html5_tags,
)
Binary file