lxml 5.2.0__cp310-cp310-win32.whl → 5.2.2__cp310-cp310-win32.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. lxml/ElementInclude.py +244 -244
  2. lxml/__init__.py +22 -22
  3. lxml/_elementpath.cp310-win32.pyd +0 -0
  4. lxml/_elementpath.py +341 -341
  5. lxml/apihelpers.pxi +1793 -1793
  6. lxml/builder.cp310-win32.pyd +0 -0
  7. lxml/builder.py +232 -232
  8. lxml/classlookup.pxi +580 -580
  9. lxml/cleanup.pxi +215 -215
  10. lxml/cssselect.py +101 -101
  11. lxml/debug.pxi +90 -90
  12. lxml/docloader.pxi +178 -178
  13. lxml/doctestcompare.py +488 -488
  14. lxml/dtd.pxi +478 -478
  15. lxml/etree.cp310-win32.pyd +0 -0
  16. lxml/etree.h +6 -6
  17. lxml/etree.pyx +3732 -3711
  18. lxml/extensions.pxi +833 -833
  19. lxml/html/ElementSoup.py +10 -10
  20. lxml/html/__init__.py +1923 -1923
  21. lxml/html/_diffcommand.py +86 -86
  22. lxml/html/_html5builder.py +100 -100
  23. lxml/html/_setmixin.py +56 -56
  24. lxml/html/builder.py +133 -133
  25. lxml/html/clean.py +21 -21
  26. lxml/html/defs.py +135 -135
  27. lxml/html/diff.cp310-win32.pyd +0 -0
  28. lxml/html/diff.py +878 -878
  29. lxml/html/formfill.py +299 -299
  30. lxml/html/html5parser.py +260 -260
  31. lxml/html/soupparser.py +314 -314
  32. lxml/html/usedoctest.py +13 -13
  33. lxml/includes/c14n.pxd +25 -25
  34. lxml/includes/config.pxd +3 -3
  35. lxml/includes/dtdvalid.pxd +18 -18
  36. lxml/includes/etree_defs.h +379 -379
  37. lxml/includes/etreepublic.pxd +237 -237
  38. lxml/includes/htmlparser.pxd +56 -56
  39. lxml/includes/lxml-version.h +1 -1
  40. lxml/includes/relaxng.pxd +64 -64
  41. lxml/includes/schematron.pxd +34 -34
  42. lxml/includes/tree.pxd +494 -494
  43. lxml/includes/uri.pxd +5 -5
  44. lxml/includes/xinclude.pxd +22 -22
  45. lxml/includes/xmlerror.pxd +852 -852
  46. lxml/includes/xmlparser.pxd +265 -265
  47. lxml/includes/xmlschema.pxd +35 -35
  48. lxml/includes/xpath.pxd +136 -136
  49. lxml/includes/xslt.pxd +190 -190
  50. lxml/isoschematron/__init__.py +348 -348
  51. lxml/isoschematron/resources/rng/iso-schematron.rng +709 -709
  52. lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl +75 -75
  53. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl +312 -312
  54. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl +1159 -1159
  55. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl +54 -54
  56. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl +1796 -1796
  57. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl +588 -588
  58. lxml/iterparse.pxi +438 -438
  59. lxml/lxml.etree.h +6 -6
  60. lxml/nsclasses.pxi +281 -281
  61. lxml/objectify.cp310-win32.pyd +0 -0
  62. lxml/objectify.pyx +2145 -2145
  63. lxml/objectpath.pxi +332 -332
  64. lxml/parser.pxi +1994 -1994
  65. lxml/parsertarget.pxi +180 -180
  66. lxml/proxy.pxi +619 -619
  67. lxml/public-api.pxi +178 -178
  68. lxml/pyclasslookup.py +3 -3
  69. lxml/readonlytree.pxi +565 -565
  70. lxml/relaxng.pxi +165 -165
  71. lxml/sax.cp310-win32.pyd +0 -0
  72. lxml/sax.py +275 -275
  73. lxml/saxparser.pxi +875 -875
  74. lxml/schematron.pxi +168 -168
  75. lxml/serializer.pxi +1871 -1871
  76. lxml/usedoctest.py +13 -13
  77. lxml/xinclude.pxi +67 -67
  78. lxml/xmlerror.pxi +1654 -1654
  79. lxml/xmlid.pxi +179 -179
  80. lxml/xmlschema.pxi +215 -215
  81. lxml/xpath.pxi +487 -487
  82. lxml/xslt.pxi +950 -950
  83. lxml/xsltext.pxi +242 -242
  84. {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/LICENSE.txt +29 -29
  85. {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/LICENSES.txt +29 -29
  86. {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/METADATA +9 -17
  87. {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/RECORD +89 -89
  88. {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/WHEEL +0 -0
  89. {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/top_level.txt +0 -0
lxml/html/html5parser.py CHANGED
@@ -1,260 +1,260 @@
1
- """
2
- An interface to html5lib that mimics the lxml.html interface.
3
- """
4
- import sys
5
- import string
6
-
7
- from html5lib import HTMLParser as _HTMLParser
8
- from html5lib.treebuilders.etree_lxml import TreeBuilder
9
- from lxml import etree
10
- from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
11
-
12
- # python3 compatibility
13
- try:
14
- _strings = basestring
15
- except NameError:
16
- _strings = (bytes, str)
17
- try:
18
- from urllib2 import urlopen
19
- except ImportError:
20
- from urllib.request import urlopen
21
- try:
22
- from urlparse import urlparse
23
- except ImportError:
24
- from urllib.parse import urlparse
25
-
26
-
27
- class HTMLParser(_HTMLParser):
28
- """An html5lib HTML parser with lxml as tree."""
29
-
30
- def __init__(self, strict=False, **kwargs):
31
- _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
32
-
33
-
34
- try:
35
- from html5lib import XHTMLParser as _XHTMLParser
36
- except ImportError:
37
- pass
38
- else:
39
- class XHTMLParser(_XHTMLParser):
40
- """An html5lib XHTML Parser with lxml as tree."""
41
-
42
- def __init__(self, strict=False, **kwargs):
43
- _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
44
-
45
- xhtml_parser = XHTMLParser()
46
-
47
-
48
- def _find_tag(tree, tag):
49
- elem = tree.find(tag)
50
- if elem is not None:
51
- return elem
52
- return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
53
-
54
-
55
- def document_fromstring(html, guess_charset=None, parser=None):
56
- """
57
- Parse a whole document into a string.
58
-
59
- If `guess_charset` is true, or if the input is not Unicode but a
60
- byte string, the `chardet` library will perform charset guessing
61
- on the string.
62
- """
63
- if not isinstance(html, _strings):
64
- raise TypeError('string required')
65
-
66
- if parser is None:
67
- parser = html_parser
68
-
69
- options = {}
70
- if guess_charset is None and isinstance(html, bytes):
71
- # html5lib does not accept useChardet as an argument, if it
72
- # detected the html argument would produce unicode objects.
73
- guess_charset = True
74
- if guess_charset is not None:
75
- options['useChardet'] = guess_charset
76
- return parser.parse(html, **options).getroot()
77
-
78
-
79
- def fragments_fromstring(html, no_leading_text=False,
80
- guess_charset=None, parser=None):
81
- """Parses several HTML elements, returning a list of elements.
82
-
83
- The first item in the list may be a string. If no_leading_text is true,
84
- then it will be an error if there is leading text, and it will always be
85
- a list of only elements.
86
-
87
- If `guess_charset` is true, the `chardet` library will perform charset
88
- guessing on the string.
89
- """
90
- if not isinstance(html, _strings):
91
- raise TypeError('string required')
92
-
93
- if parser is None:
94
- parser = html_parser
95
-
96
- options = {}
97
- if guess_charset is None and isinstance(html, bytes):
98
- # html5lib does not accept useChardet as an argument, if it
99
- # detected the html argument would produce unicode objects.
100
- guess_charset = False
101
- if guess_charset is not None:
102
- options['useChardet'] = guess_charset
103
- children = parser.parseFragment(html, 'div', **options)
104
- if children and isinstance(children[0], _strings):
105
- if no_leading_text:
106
- if children[0].strip():
107
- raise etree.ParserError('There is leading text: %r' %
108
- children[0])
109
- del children[0]
110
- return children
111
-
112
-
113
- def fragment_fromstring(html, create_parent=False,
114
- guess_charset=None, parser=None):
115
- """Parses a single HTML element; it is an error if there is more than
116
- one element, or if anything but whitespace precedes or follows the
117
- element.
118
-
119
- If 'create_parent' is true (or is a tag name) then a parent node
120
- will be created to encapsulate the HTML in a single element. In
121
- this case, leading or trailing text is allowed.
122
-
123
- If `guess_charset` is true, the `chardet` library will perform charset
124
- guessing on the string.
125
- """
126
- if not isinstance(html, _strings):
127
- raise TypeError('string required')
128
-
129
- accept_leading_text = bool(create_parent)
130
-
131
- elements = fragments_fromstring(
132
- html, guess_charset=guess_charset, parser=parser,
133
- no_leading_text=not accept_leading_text)
134
-
135
- if create_parent:
136
- if not isinstance(create_parent, _strings):
137
- create_parent = 'div'
138
- new_root = Element(create_parent)
139
- if elements:
140
- if isinstance(elements[0], _strings):
141
- new_root.text = elements[0]
142
- del elements[0]
143
- new_root.extend(elements)
144
- return new_root
145
-
146
- if not elements:
147
- raise etree.ParserError('No elements found')
148
- if len(elements) > 1:
149
- raise etree.ParserError('Multiple elements found')
150
- result = elements[0]
151
- if result.tail and result.tail.strip():
152
- raise etree.ParserError('Element followed by text: %r' % result.tail)
153
- result.tail = None
154
- return result
155
-
156
-
157
- def fromstring(html, guess_charset=None, parser=None):
158
- """Parse the html, returning a single element/document.
159
-
160
- This tries to minimally parse the chunk of text, without knowing if it
161
- is a fragment or a document.
162
-
163
- 'base_url' will set the document's base_url attribute (and the tree's
164
- docinfo.URL)
165
-
166
- If `guess_charset` is true, or if the input is not Unicode but a
167
- byte string, the `chardet` library will perform charset guessing
168
- on the string.
169
- """
170
- if not isinstance(html, _strings):
171
- raise TypeError('string required')
172
- doc = document_fromstring(html, parser=parser,
173
- guess_charset=guess_charset)
174
-
175
- # document starts with doctype or <html>, full document!
176
- start = html[:50]
177
- if isinstance(start, bytes):
178
- # Allow text comparison in python3.
179
- # Decode as ascii, that also covers latin-1 and utf-8 for the
180
- # characters we need.
181
- start = start.decode('ascii', 'replace')
182
-
183
- start = start.lstrip().lower()
184
- if start.startswith('<html') or start.startswith('<!doctype'):
185
- return doc
186
-
187
- head = _find_tag(doc, 'head')
188
-
189
- # if the head is not empty we have a full document
190
- if len(head):
191
- return doc
192
-
193
- body = _find_tag(doc, 'body')
194
-
195
- # The body has just one element, so it was probably a single
196
- # element passed in
197
- if (len(body) == 1 and (not body.text or not body.text.strip())
198
- and (not body[-1].tail or not body[-1].tail.strip())):
199
- return body[0]
200
-
201
- # Now we have a body which represents a bunch of tags which have the
202
- # content that was passed in. We will create a fake container, which
203
- # is the body tag, except <body> implies too much structure.
204
- if _contains_block_level_tag(body):
205
- body.tag = 'div'
206
- else:
207
- body.tag = 'span'
208
- return body
209
-
210
-
211
- def parse(filename_url_or_file, guess_charset=None, parser=None):
212
- """Parse a filename, URL, or file-like object into an HTML document
213
- tree. Note: this returns a tree, not an element. Use
214
- ``parse(...).getroot()`` to get the document root.
215
-
216
- If ``guess_charset`` is true, the ``useChardet`` option is passed into
217
- html5lib to enable character detection. This option is on by default
218
- when parsing from URLs, off by default when parsing from file(-like)
219
- objects (which tend to return Unicode more often than not), and on by
220
- default when parsing from a file path (which is read in binary mode).
221
- """
222
- if parser is None:
223
- parser = html_parser
224
- if not isinstance(filename_url_or_file, _strings):
225
- fp = filename_url_or_file
226
- if guess_charset is None:
227
- # assume that file-like objects return Unicode more often than bytes
228
- guess_charset = False
229
- elif _looks_like_url(filename_url_or_file):
230
- fp = urlopen(filename_url_or_file)
231
- if guess_charset is None:
232
- # assume that URLs return bytes
233
- guess_charset = True
234
- else:
235
- fp = open(filename_url_or_file, 'rb')
236
- if guess_charset is None:
237
- guess_charset = True
238
-
239
- options = {}
240
- # html5lib does not accept useChardet as an argument, if it
241
- # detected the html argument would produce unicode objects.
242
- if guess_charset:
243
- options['useChardet'] = guess_charset
244
- return parser.parse(fp, **options)
245
-
246
-
247
- def _looks_like_url(str):
248
- scheme = urlparse(str)[0]
249
- if not scheme:
250
- return False
251
- elif (sys.platform == 'win32' and
252
- scheme in string.ascii_letters
253
- and len(scheme) == 1):
254
- # looks like a 'normal' absolute path
255
- return False
256
- else:
257
- return True
258
-
259
-
260
- html_parser = HTMLParser()
1
+ """
2
+ An interface to html5lib that mimics the lxml.html interface.
3
+ """
4
+ import sys
5
+ import string
6
+
7
+ from html5lib import HTMLParser as _HTMLParser
8
+ from html5lib.treebuilders.etree_lxml import TreeBuilder
9
+ from lxml import etree
10
+ from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
11
+
12
+ # python3 compatibility
13
+ try:
14
+ _strings = basestring
15
+ except NameError:
16
+ _strings = (bytes, str)
17
+ try:
18
+ from urllib2 import urlopen
19
+ except ImportError:
20
+ from urllib.request import urlopen
21
+ try:
22
+ from urlparse import urlparse
23
+ except ImportError:
24
+ from urllib.parse import urlparse
25
+
26
+
27
+ class HTMLParser(_HTMLParser):
28
+ """An html5lib HTML parser with lxml as tree."""
29
+
30
+ def __init__(self, strict=False, **kwargs):
31
+ _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
32
+
33
+
34
+ try:
35
+ from html5lib import XHTMLParser as _XHTMLParser
36
+ except ImportError:
37
+ pass
38
+ else:
39
+ class XHTMLParser(_XHTMLParser):
40
+ """An html5lib XHTML Parser with lxml as tree."""
41
+
42
+ def __init__(self, strict=False, **kwargs):
43
+ _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
44
+
45
+ xhtml_parser = XHTMLParser()
46
+
47
+
48
+ def _find_tag(tree, tag):
49
+ elem = tree.find(tag)
50
+ if elem is not None:
51
+ return elem
52
+ return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
53
+
54
+
55
+ def document_fromstring(html, guess_charset=None, parser=None):
56
+ """
57
+ Parse a whole document into a string.
58
+
59
+ If `guess_charset` is true, or if the input is not Unicode but a
60
+ byte string, the `chardet` library will perform charset guessing
61
+ on the string.
62
+ """
63
+ if not isinstance(html, _strings):
64
+ raise TypeError('string required')
65
+
66
+ if parser is None:
67
+ parser = html_parser
68
+
69
+ options = {}
70
+ if guess_charset is None and isinstance(html, bytes):
71
+ # html5lib does not accept useChardet as an argument, if it
72
+ # detected the html argument would produce unicode objects.
73
+ guess_charset = True
74
+ if guess_charset is not None:
75
+ options['useChardet'] = guess_charset
76
+ return parser.parse(html, **options).getroot()
77
+
78
+
79
+ def fragments_fromstring(html, no_leading_text=False,
80
+ guess_charset=None, parser=None):
81
+ """Parses several HTML elements, returning a list of elements.
82
+
83
+ The first item in the list may be a string. If no_leading_text is true,
84
+ then it will be an error if there is leading text, and it will always be
85
+ a list of only elements.
86
+
87
+ If `guess_charset` is true, the `chardet` library will perform charset
88
+ guessing on the string.
89
+ """
90
+ if not isinstance(html, _strings):
91
+ raise TypeError('string required')
92
+
93
+ if parser is None:
94
+ parser = html_parser
95
+
96
+ options = {}
97
+ if guess_charset is None and isinstance(html, bytes):
98
+ # html5lib does not accept useChardet as an argument, if it
99
+ # detected the html argument would produce unicode objects.
100
+ guess_charset = False
101
+ if guess_charset is not None:
102
+ options['useChardet'] = guess_charset
103
+ children = parser.parseFragment(html, 'div', **options)
104
+ if children and isinstance(children[0], _strings):
105
+ if no_leading_text:
106
+ if children[0].strip():
107
+ raise etree.ParserError('There is leading text: %r' %
108
+ children[0])
109
+ del children[0]
110
+ return children
111
+
112
+
113
+ def fragment_fromstring(html, create_parent=False,
114
+ guess_charset=None, parser=None):
115
+ """Parses a single HTML element; it is an error if there is more than
116
+ one element, or if anything but whitespace precedes or follows the
117
+ element.
118
+
119
+ If 'create_parent' is true (or is a tag name) then a parent node
120
+ will be created to encapsulate the HTML in a single element. In
121
+ this case, leading or trailing text is allowed.
122
+
123
+ If `guess_charset` is true, the `chardet` library will perform charset
124
+ guessing on the string.
125
+ """
126
+ if not isinstance(html, _strings):
127
+ raise TypeError('string required')
128
+
129
+ accept_leading_text = bool(create_parent)
130
+
131
+ elements = fragments_fromstring(
132
+ html, guess_charset=guess_charset, parser=parser,
133
+ no_leading_text=not accept_leading_text)
134
+
135
+ if create_parent:
136
+ if not isinstance(create_parent, _strings):
137
+ create_parent = 'div'
138
+ new_root = Element(create_parent)
139
+ if elements:
140
+ if isinstance(elements[0], _strings):
141
+ new_root.text = elements[0]
142
+ del elements[0]
143
+ new_root.extend(elements)
144
+ return new_root
145
+
146
+ if not elements:
147
+ raise etree.ParserError('No elements found')
148
+ if len(elements) > 1:
149
+ raise etree.ParserError('Multiple elements found')
150
+ result = elements[0]
151
+ if result.tail and result.tail.strip():
152
+ raise etree.ParserError('Element followed by text: %r' % result.tail)
153
+ result.tail = None
154
+ return result
155
+
156
+
157
+ def fromstring(html, guess_charset=None, parser=None):
158
+ """Parse the html, returning a single element/document.
159
+
160
+ This tries to minimally parse the chunk of text, without knowing if it
161
+ is a fragment or a document.
162
+
163
+ 'base_url' will set the document's base_url attribute (and the tree's
164
+ docinfo.URL)
165
+
166
+ If `guess_charset` is true, or if the input is not Unicode but a
167
+ byte string, the `chardet` library will perform charset guessing
168
+ on the string.
169
+ """
170
+ if not isinstance(html, _strings):
171
+ raise TypeError('string required')
172
+ doc = document_fromstring(html, parser=parser,
173
+ guess_charset=guess_charset)
174
+
175
+ # document starts with doctype or <html>, full document!
176
+ start = html[:50]
177
+ if isinstance(start, bytes):
178
+ # Allow text comparison in python3.
179
+ # Decode as ascii, that also covers latin-1 and utf-8 for the
180
+ # characters we need.
181
+ start = start.decode('ascii', 'replace')
182
+
183
+ start = start.lstrip().lower()
184
+ if start.startswith('<html') or start.startswith('<!doctype'):
185
+ return doc
186
+
187
+ head = _find_tag(doc, 'head')
188
+
189
+ # if the head is not empty we have a full document
190
+ if len(head):
191
+ return doc
192
+
193
+ body = _find_tag(doc, 'body')
194
+
195
+ # The body has just one element, so it was probably a single
196
+ # element passed in
197
+ if (len(body) == 1 and (not body.text or not body.text.strip())
198
+ and (not body[-1].tail or not body[-1].tail.strip())):
199
+ return body[0]
200
+
201
+ # Now we have a body which represents a bunch of tags which have the
202
+ # content that was passed in. We will create a fake container, which
203
+ # is the body tag, except <body> implies too much structure.
204
+ if _contains_block_level_tag(body):
205
+ body.tag = 'div'
206
+ else:
207
+ body.tag = 'span'
208
+ return body
209
+
210
+
211
+ def parse(filename_url_or_file, guess_charset=None, parser=None):
212
+ """Parse a filename, URL, or file-like object into an HTML document
213
+ tree. Note: this returns a tree, not an element. Use
214
+ ``parse(...).getroot()`` to get the document root.
215
+
216
+ If ``guess_charset`` is true, the ``useChardet`` option is passed into
217
+ html5lib to enable character detection. This option is on by default
218
+ when parsing from URLs, off by default when parsing from file(-like)
219
+ objects (which tend to return Unicode more often than not), and on by
220
+ default when parsing from a file path (which is read in binary mode).
221
+ """
222
+ if parser is None:
223
+ parser = html_parser
224
+ if not isinstance(filename_url_or_file, _strings):
225
+ fp = filename_url_or_file
226
+ if guess_charset is None:
227
+ # assume that file-like objects return Unicode more often than bytes
228
+ guess_charset = False
229
+ elif _looks_like_url(filename_url_or_file):
230
+ fp = urlopen(filename_url_or_file)
231
+ if guess_charset is None:
232
+ # assume that URLs return bytes
233
+ guess_charset = True
234
+ else:
235
+ fp = open(filename_url_or_file, 'rb')
236
+ if guess_charset is None:
237
+ guess_charset = True
238
+
239
+ options = {}
240
+ # html5lib does not accept useChardet as an argument, if it
241
+ # detected the html argument would produce unicode objects.
242
+ if guess_charset:
243
+ options['useChardet'] = guess_charset
244
+ return parser.parse(fp, **options)
245
+
246
+
247
+ def _looks_like_url(str):
248
+ scheme = urlparse(str)[0]
249
+ if not scheme:
250
+ return False
251
+ elif (sys.platform == 'win32' and
252
+ scheme in string.ascii_letters
253
+ and len(scheme) == 1):
254
+ # looks like a 'normal' absolute path
255
+ return False
256
+ else:
257
+ return True
258
+
259
+
260
+ html_parser = HTMLParser()