lxml 6.0.0__cp310-cp310-win_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (177) hide show
  1. lxml/ElementInclude.py +244 -0
  2. lxml/__init__.py +22 -0
  3. lxml/_elementpath.cp310-win_arm64.pyd +0 -0
  4. lxml/_elementpath.py +343 -0
  5. lxml/apihelpers.pxi +1801 -0
  6. lxml/builder.cp310-win_arm64.pyd +0 -0
  7. lxml/builder.py +243 -0
  8. lxml/classlookup.pxi +580 -0
  9. lxml/cleanup.pxi +215 -0
  10. lxml/cssselect.py +101 -0
  11. lxml/debug.pxi +36 -0
  12. lxml/docloader.pxi +178 -0
  13. lxml/doctestcompare.py +488 -0
  14. lxml/dtd.pxi +479 -0
  15. lxml/etree.cp310-win_arm64.pyd +0 -0
  16. lxml/etree.h +244 -0
  17. lxml/etree.pyx +3853 -0
  18. lxml/etree_api.h +204 -0
  19. lxml/extensions.pxi +830 -0
  20. lxml/html/ElementSoup.py +10 -0
  21. lxml/html/__init__.py +1927 -0
  22. lxml/html/_diffcommand.py +86 -0
  23. lxml/html/_difflib.cp310-win_arm64.pyd +0 -0
  24. lxml/html/_difflib.py +2106 -0
  25. lxml/html/_html5builder.py +100 -0
  26. lxml/html/_setmixin.py +56 -0
  27. lxml/html/builder.py +173 -0
  28. lxml/html/clean.py +21 -0
  29. lxml/html/defs.py +135 -0
  30. lxml/html/diff.cp310-win_arm64.pyd +0 -0
  31. lxml/html/diff.py +972 -0
  32. lxml/html/formfill.py +299 -0
  33. lxml/html/html5parser.py +260 -0
  34. lxml/html/soupparser.py +314 -0
  35. lxml/html/usedoctest.py +13 -0
  36. lxml/includes/__init__.pxd +0 -0
  37. lxml/includes/__init__.py +0 -0
  38. lxml/includes/c14n.pxd +25 -0
  39. lxml/includes/config.pxd +3 -0
  40. lxml/includes/dtdvalid.pxd +18 -0
  41. lxml/includes/etree_defs.h +379 -0
  42. lxml/includes/etreepublic.pxd +237 -0
  43. lxml/includes/extlibs/__init__.py +0 -0
  44. lxml/includes/extlibs/zconf.h +543 -0
  45. lxml/includes/extlibs/zlib.h +1938 -0
  46. lxml/includes/htmlparser.pxd +56 -0
  47. lxml/includes/libexslt/__init__.py +0 -0
  48. lxml/includes/libexslt/exslt.h +108 -0
  49. lxml/includes/libexslt/exsltconfig.h +70 -0
  50. lxml/includes/libexslt/exsltexports.h +63 -0
  51. lxml/includes/libexslt/libexslt.h +29 -0
  52. lxml/includes/libxml/HTMLparser.h +320 -0
  53. lxml/includes/libxml/HTMLtree.h +147 -0
  54. lxml/includes/libxml/SAX.h +204 -0
  55. lxml/includes/libxml/SAX2.h +173 -0
  56. lxml/includes/libxml/__init__.py +0 -0
  57. lxml/includes/libxml/c14n.h +128 -0
  58. lxml/includes/libxml/catalog.h +182 -0
  59. lxml/includes/libxml/chvalid.h +230 -0
  60. lxml/includes/libxml/debugXML.h +217 -0
  61. lxml/includes/libxml/dict.h +81 -0
  62. lxml/includes/libxml/encoding.h +233 -0
  63. lxml/includes/libxml/entities.h +151 -0
  64. lxml/includes/libxml/globals.h +529 -0
  65. lxml/includes/libxml/hash.h +236 -0
  66. lxml/includes/libxml/list.h +137 -0
  67. lxml/includes/libxml/nanoftp.h +186 -0
  68. lxml/includes/libxml/nanohttp.h +81 -0
  69. lxml/includes/libxml/parser.h +1265 -0
  70. lxml/includes/libxml/parserInternals.h +662 -0
  71. lxml/includes/libxml/pattern.h +100 -0
  72. lxml/includes/libxml/relaxng.h +218 -0
  73. lxml/includes/libxml/schemasInternals.h +958 -0
  74. lxml/includes/libxml/schematron.h +142 -0
  75. lxml/includes/libxml/threads.h +94 -0
  76. lxml/includes/libxml/tree.h +1314 -0
  77. lxml/includes/libxml/uri.h +94 -0
  78. lxml/includes/libxml/valid.h +448 -0
  79. lxml/includes/libxml/xinclude.h +129 -0
  80. lxml/includes/libxml/xlink.h +189 -0
  81. lxml/includes/libxml/xmlIO.h +369 -0
  82. lxml/includes/libxml/xmlautomata.h +146 -0
  83. lxml/includes/libxml/xmlerror.h +919 -0
  84. lxml/includes/libxml/xmlexports.h +50 -0
  85. lxml/includes/libxml/xmlmemory.h +228 -0
  86. lxml/includes/libxml/xmlmodule.h +57 -0
  87. lxml/includes/libxml/xmlreader.h +428 -0
  88. lxml/includes/libxml/xmlregexp.h +222 -0
  89. lxml/includes/libxml/xmlsave.h +88 -0
  90. lxml/includes/libxml/xmlschemas.h +246 -0
  91. lxml/includes/libxml/xmlschemastypes.h +152 -0
  92. lxml/includes/libxml/xmlstring.h +140 -0
  93. lxml/includes/libxml/xmlunicode.h +202 -0
  94. lxml/includes/libxml/xmlversion.h +526 -0
  95. lxml/includes/libxml/xmlwriter.h +488 -0
  96. lxml/includes/libxml/xpath.h +575 -0
  97. lxml/includes/libxml/xpathInternals.h +632 -0
  98. lxml/includes/libxml/xpointer.h +137 -0
  99. lxml/includes/libxslt/__init__.py +0 -0
  100. lxml/includes/libxslt/attributes.h +39 -0
  101. lxml/includes/libxslt/documents.h +93 -0
  102. lxml/includes/libxslt/extensions.h +262 -0
  103. lxml/includes/libxslt/extra.h +72 -0
  104. lxml/includes/libxslt/functions.h +78 -0
  105. lxml/includes/libxslt/imports.h +75 -0
  106. lxml/includes/libxslt/keys.h +53 -0
  107. lxml/includes/libxslt/libxslt.h +36 -0
  108. lxml/includes/libxslt/namespaces.h +68 -0
  109. lxml/includes/libxslt/numbersInternals.h +73 -0
  110. lxml/includes/libxslt/preproc.h +43 -0
  111. lxml/includes/libxslt/security.h +104 -0
  112. lxml/includes/libxslt/templates.h +77 -0
  113. lxml/includes/libxslt/transform.h +207 -0
  114. lxml/includes/libxslt/trio.h +216 -0
  115. lxml/includes/libxslt/triodef.h +220 -0
  116. lxml/includes/libxslt/variables.h +118 -0
  117. lxml/includes/libxslt/win32config.h +51 -0
  118. lxml/includes/libxslt/xslt.h +110 -0
  119. lxml/includes/libxslt/xsltInternals.h +1992 -0
  120. lxml/includes/libxslt/xsltconfig.h +179 -0
  121. lxml/includes/libxslt/xsltexports.h +64 -0
  122. lxml/includes/libxslt/xsltlocale.h +44 -0
  123. lxml/includes/libxslt/xsltutils.h +343 -0
  124. lxml/includes/lxml-version.h +3 -0
  125. lxml/includes/relaxng.pxd +64 -0
  126. lxml/includes/schematron.pxd +34 -0
  127. lxml/includes/tree.pxd +492 -0
  128. lxml/includes/uri.pxd +5 -0
  129. lxml/includes/xinclude.pxd +22 -0
  130. lxml/includes/xmlerror.pxd +852 -0
  131. lxml/includes/xmlparser.pxd +303 -0
  132. lxml/includes/xmlschema.pxd +35 -0
  133. lxml/includes/xpath.pxd +136 -0
  134. lxml/includes/xslt.pxd +190 -0
  135. lxml/isoschematron/__init__.py +348 -0
  136. lxml/isoschematron/resources/rng/iso-schematron.rng +709 -0
  137. lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl +75 -0
  138. lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl +77 -0
  139. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl +313 -0
  140. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl +1160 -0
  141. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl +55 -0
  142. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl +1796 -0
  143. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl +588 -0
  144. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt +84 -0
  145. lxml/iterparse.pxi +438 -0
  146. lxml/lxml.etree.h +244 -0
  147. lxml/lxml.etree_api.h +204 -0
  148. lxml/nsclasses.pxi +281 -0
  149. lxml/objectify.cp310-win_arm64.pyd +0 -0
  150. lxml/objectify.pyx +2149 -0
  151. lxml/objectpath.pxi +332 -0
  152. lxml/parser.pxi +2059 -0
  153. lxml/parsertarget.pxi +180 -0
  154. lxml/proxy.pxi +619 -0
  155. lxml/public-api.pxi +178 -0
  156. lxml/pyclasslookup.py +3 -0
  157. lxml/readonlytree.pxi +565 -0
  158. lxml/relaxng.pxi +165 -0
  159. lxml/sax.cp310-win_arm64.pyd +0 -0
  160. lxml/sax.py +286 -0
  161. lxml/saxparser.pxi +875 -0
  162. lxml/schematron.pxi +173 -0
  163. lxml/serializer.pxi +1849 -0
  164. lxml/usedoctest.py +13 -0
  165. lxml/xinclude.pxi +67 -0
  166. lxml/xmlerror.pxi +1654 -0
  167. lxml/xmlid.pxi +179 -0
  168. lxml/xmlschema.pxi +215 -0
  169. lxml/xpath.pxi +487 -0
  170. lxml/xslt.pxi +957 -0
  171. lxml/xsltext.pxi +242 -0
  172. lxml-6.0.0.dist-info/METADATA +163 -0
  173. lxml-6.0.0.dist-info/RECORD +177 -0
  174. lxml-6.0.0.dist-info/WHEEL +5 -0
  175. lxml-6.0.0.dist-info/licenses/LICENSE.txt +31 -0
  176. lxml-6.0.0.dist-info/licenses/LICENSES.txt +29 -0
  177. lxml-6.0.0.dist-info/top_level.txt +1 -0
lxml/html/formfill.py ADDED
@@ -0,0 +1,299 @@
1
+ from lxml.etree import XPath, ElementBase
2
+ from lxml.html import fromstring, XHTML_NAMESPACE
3
+ from lxml.html import _forms_xpath, _options_xpath, _nons, _transform_result
4
+ from lxml.html import defs
5
+ import copy
6
+
7
+ try:
8
+ basestring
9
+ except NameError:
10
+ # Python 3
11
+ basestring = str
12
+
13
+ __all__ = ['FormNotFound', 'fill_form', 'fill_form_html',
14
+ 'insert_errors', 'insert_errors_html',
15
+ 'DefaultErrorCreator']
16
+
17
class FormNotFound(LookupError):
    """Signal that the requested form could not be located."""
21
+
22
# Pre-compiled XPath expressions shared by the helpers in this module.
# Each one matches both plain HTML tags and tags in the XHTML namespace
# (bound to the ``x`` prefix).

# Forms whose ``name`` attribute equals $name.  The predicate must test
# ``@name`` (the attribute); a bare ``name`` would test for a child element
# called <name> and therefore never match a form by its name attribute.
_form_name_xpath = XPath(
    'descendant-or-self::form[@name=$name]'
    '|descendant-or-self::x:form[@name=$name]',
    namespaces={'x':XHTML_NAMESPACE})
# Every input, select and textarea element at or below the context node.
_input_xpath = XPath('|'.join(['descendant-or-self::'+_tag for _tag in ('input','select','textarea','x:input','x:select','x:textarea')]),
                     namespaces={'x':XHTML_NAMESPACE})
# Labels anywhere in the document whose ``for`` attribute equals $for_id.
_label_for_xpath = XPath('//label[@for=$for_id]|//x:label[@for=$for_id]',
                         namespaces={'x':XHTML_NAMESPACE})
# Any element at or below the context node whose ``name`` attribute equals $name.
_name_xpath = XPath('descendant-or-self::*[@name=$name]')
28
+
29
def fill_form(el, values, form_id=None, form_index=None):
    """Fill the fields of one form found in ``el`` with ``values``, in place.

    The form is selected by ``form_id`` or ``form_index``; when neither is
    given, the first form found is used.  Lookup failures are reported by
    ``_find_form``.
    """
    form = _find_form(el, form_id=form_id, form_index=form_index)
    _fill_form(form, values)
37
+
38
def fill_form_html(html, values, form_id=None, form_index=None):
    """Return a filled-in copy of ``html`` without modifying the input.

    A string is parsed with ``fromstring``; anything else is deep-copied.
    The result is converted back with ``_transform_result`` according to
    the type of the original argument.
    """
    result_type = type(html)
    doc = fromstring(html) if isinstance(html, basestring) else copy.deepcopy(html)
    fill_form(doc, values, form_id=form_id, form_index=form_index)
    return _transform_result(result_type, doc)
46
+
47
def _fill_form(el, values):
    """Write ``values`` into every named input, select and textarea under ``el``.

    Checkboxes, radio buttons and multi-selects always receive the list of
    values for their name.  For other fields that share a name, a list or
    tuple value is distributed positionally; a scalar value is only applied
    to the first field with that name.
    """
    if hasattr(values, 'mixed'):
        # For Paste request parameters
        values = values.mixed()
    # Number of single-valued fields already seen, per field name.
    seen = {}
    for field in _input_xpath(el):
        field_name = field.get('name')
        if not field_name:
            continue
        if _takes_multiple(field):
            chosen = values.get(field_name, [])
            if not isinstance(chosen, (list, tuple)):
                chosen = [chosen]
            _fill_multiple(field, chosen)
            continue
        if field_name not in values:
            continue
        position = seen.get(field_name, 0)
        seen[field_name] = position + 1
        field_value = values[field_name]
        if isinstance(field_value, (list, tuple)):
            try:
                field_value = field_value[position]
            except IndexError:
                # More fields than supplied values: leave the rest alone.
                continue
        elif position > 0:
            # A scalar value only fills the first field of that name.
            continue
        _fill_single(field, field_value)
76
+
77
def _takes_multiple(input):
    """Return True for fields that are filled from a list of values.

    That is a <select> carrying a truthy ``multiple`` attribute, or any
    element whose ``type`` is ``radio`` or ``checkbox`` (case-insensitive).
    """
    # FIXME: multiple="0"?
    if _nons(input.tag) == 'select' and input.get('multiple'):
        return True
    return input.get('type', '').lower() in ('radio', 'checkbox')
85
+
86
def _fill_multiple(input, value):
    """Check or select ``input`` according to the collection ``value``.

    * checkbox with a ``value`` attribute, or radio: checked iff that
      attribute value is contained in ``value``.
    * checkbox without a ``value`` attribute: checked iff ``value`` is
      non-empty and its first item is truthy.
    * anything else must be a <select>: each option is selected iff its
      value (or its text content when it has no ``value``) is in ``value``.
    """
    kind = input.get('type', '').lower()
    if kind == 'radio':
        _check(input, input.get('value') in value)
        return
    if kind == 'checkbox':
        own_value = input.get('value')
        if own_value is not None:
            _check(input, own_value in value)
            return
        if value:
            result = value[0]
            # NOTE(review): this tests ``value`` (the whole collection), not
            # ``result``; the only caller visible here always passes a list
            # or tuple, so the branch looks unreachable -- confirm intent.
            if isinstance(value, basestring):
                # The only valid "on" value for an unnamed checkbox is 'on'
                result = result == 'on'
        else:
            result = False
        _check(input, result)
        return
    assert _nons(input.tag) == 'select'
    for option in _options_xpath(input):
        option_value = option.get('value')
        if option_value is None:
            # This seems to be the default, at least on IE
            # FIXME: but I'm not sure
            option_value = option.text_content()
        _select(option, option_value in value)
113
+
114
+ def _check(el, check):
115
+ if check:
116
+ el.set('checked', '')
117
+ else:
118
+ if 'checked' in el.attrib:
119
+ del el.attrib['checked']
120
+
121
+ def _select(el, select):
122
+ if select:
123
+ el.set('selected', '')
124
+ else:
125
+ if 'selected' in el.attrib:
126
+ del el.attrib['selected']
127
+
128
def _fill_single(input, value):
    """Store ``value`` in a single-valued field.

    A <textarea> keeps its value as element text; every other element
    keeps it in the ``value`` attribute.
    """
    if _nons(input.tag) != 'textarea':
        input.set('value', value)
        return
    input.text = value
133
+
134
def _find_form(el, form_id=None, form_index=None):
    """Locate a form element inside ``el``.

    * With neither ``form_id`` nor ``form_index``: return the first form.
    * With ``form_id``: return the element with that id, falling back to
      the first form whose name matches ``form_id``.
    * With ``form_index`` (and no ``form_id``): return the form at that
      position in document order.

    Raises ``FormNotFound`` when no matching form exists.
    """
    if form_id is None and form_index is None:
        forms = _forms_xpath(el)
        for form in forms:
            return form
        raise FormNotFound(
            "No forms in page")
    if form_id is not None:
        # Pass an explicit default: without one, get_element_by_id() raises
        # KeyError for an unknown id, and the lookup by name below (as well
        # as the FormNotFound error) would never be reached.
        form = el.get_element_by_id(form_id, None)
        if form is not None:
            return form
        forms = _form_name_xpath(el, name=form_id)
        if forms:
            return forms[0]
        # Report the identifier that was searched for (``form_id``), not the
        # ``id`` builtin, alongside the forms that are actually available.
        raise FormNotFound(
            "No form with the name or id of %r (forms: %s)"
            % (form_id, ', '.join(_find_form_ids(el))))
    if form_index is not None:
        forms = _forms_xpath(el)
        try:
            return forms[form_index]
        except IndexError:
            raise FormNotFound(
                "There is no form with the index %r (%i forms found)"
                % (form_index, len(forms)))
160
+
161
def _find_form_ids(el):
    """Yield one human-readable label per form under ``el``.

    The label is "id or name", the id, the name, or a positional
    placeholder, depending on which attributes the form carries.  When
    there are no forms at all, the single label '(no forms)' is yielded.
    """
    forms = _forms_xpath(el)
    if not forms:
        yield '(no forms)'
        return
    for position, form in enumerate(forms):
        ident = form.get('id')
        label = form.get('name')
        if ident and label:
            yield '%s or %s' % (ident, label)
        elif ident:
            yield ident
        elif label:
            yield label
        else:
            yield '(unnamed form %s)' % position
177
+
178
+ ############################################################
179
+ ## Error filling
180
+ ############################################################
181
+
182
class DefaultErrorCreator:
    """Callable that builds an error element and attaches it next to a field.

    Every class attribute below is a setting that can be overridden per
    instance through a keyword argument of the same name.
    """

    # Place the error before (True) or after (False) the target content.
    insert_before = True
    # For block elements, put the error inside the element rather than beside it.
    block_inside = True
    # Tag name of the generated error element.
    error_container_tag = 'div'
    # CSS class set on every generated error element (skipped when empty).
    error_message_class = 'error-message'
    # Extra CSS class appended when the target is a block element.
    error_block_class = 'error-block'
    # Text used when the message is None or an empty string.
    default_message = "Invalid"

    def __init__(self, **kw):
        """Override settings; raise TypeError for an unknown setting name."""
        for option, setting in kw.items():
            if not hasattr(self, option):
                raise TypeError(
                    "Unexpected keyword argument: %s" % option)
            setattr(self, option, setting)

    def __call__(self, el, is_block, message):
        """Create the error element for ``message`` and insert it at ``el``.

        ``message`` may be a string, None, or an element; an element is
        appended as a child of the error element instead of being used as
        text.
        """
        error_el = el.makeelement(self.error_container_tag)
        if self.error_message_class:
            error_el.set('class', self.error_message_class)
        if is_block and self.error_block_class:
            error_el.set('class', error_el.get('class', '')+' '+self.error_block_class)
        if message is None or message == '':
            message = self.default_message
        if isinstance(message, ElementBase):
            error_el.append(message)
        else:
            assert isinstance(message, basestring), (
                "Bad message; should be a string or element: %r" % message)
            error_el.text = message or self.default_message

        if is_block and self.block_inside:
            if not self.insert_before:
                el.append(error_el)
                return
            # The error becomes the first child, so the element's leading
            # text has to move behind it to keep the reading order.
            error_el.tail = el.text
            el.text = None
            el.insert(0, error_el)
            return

        parent = el.getparent()
        pos = parent.index(el)
        if self.insert_before:
            parent.insert(pos, error_el)
            return
        # Inserted after ``el``: the text that followed ``el`` must now
        # follow the error element instead.
        error_el.tail = el.tail
        el.tail = None
        parent.insert(pos+1, error_el)
227
+
228
+ default_error_creator = DefaultErrorCreator()
229
+
230
+
231
def insert_errors(
    el,
    errors,
    form_id=None,
    form_index=None,
    error_class="error",
    error_creator=default_error_creator,
    ):
    """Insert error messages into one form found in ``el``, in place.

    ``errors`` maps a field name (see ``_find_elements_for_name`` for the
    accepted forms) to a message or a list of messages.  Entries whose
    value is None are ignored.
    """
    form = _find_form(el, form_id=form_id, form_index=form_index)
    acceptable = (basestring, type(None), ElementBase)
    for field_name, field_error in errors.items():
        if field_error is None:
            continue
        for target, message in _find_elements_for_name(form, field_name, field_error):
            assert isinstance(message, acceptable), (
                "Bad message: %r" % message)
            _insert_error(target, message, error_class, error_creator)
247
+
248
def insert_errors_html(html, values, **kw):
    """Return a copy of ``html`` with error messages inserted.

    A string is parsed with ``fromstring``; anything else is deep-copied.
    ``values`` and the keyword arguments are passed on to ``insert_errors``,
    and the result is converted back according to the input type.
    """
    result_type = type(html)
    doc = fromstring(html) if isinstance(html, basestring) else copy.deepcopy(html)
    insert_errors(doc, values, **kw)
    return _transform_result(result_type, doc)
256
+
257
def _insert_error(el, error, error_class, error_creator):
    """Mark ``el`` (and its labels) with ``error_class`` and attach ``error``.

    Empty tags and <textarea> are treated as non-block targets; everything
    else is a block.  The element itself is not given the class when it is
    a <form> or when ``error_class`` is falsy.
    """
    tag = _nons(el.tag)
    is_block = not (tag in defs.empty_tags or tag == 'textarea')
    if error_class and tag != 'form':
        _add_class(el, error_class)
    el_id = el.get('id')
    if el_id:
        for label in _label_for_xpath(el, for_id=el_id):
            _add_class(label, error_class)
    error_creator(el, is_block, error)
270
+
271
+ def _add_class(el, class_name):
272
+ if el.get('class'):
273
+ el.set('class', el.get('class')+' '+class_name)
274
+ else:
275
+ el.set('class', class_name)
276
+
277
+ def _find_elements_for_name(form, name, error):
278
+ if name is None:
279
+ # An error for the entire form
280
+ yield form, error
281
+ return
282
+ if name.startswith('#'):
283
+ # By id
284
+ el = form.get_element_by_id(name[1:])
285
+ if el is not None:
286
+ yield el, error
287
+ return
288
+ els = _name_xpath(form, name=name)
289
+ if not els:
290
+ # FIXME: should this raise an exception?
291
+ return
292
+ if not isinstance(error, (list, tuple)):
293
+ yield els[0], error
294
+ return
295
+ # FIXME: if error is longer than els, should it raise an error?
296
+ for el, err in zip(els, error):
297
+ if err is None:
298
+ continue
299
+ yield el, err
@@ -0,0 +1,260 @@
1
+ """
2
+ An interface to html5lib that mimics the lxml.html interface.
3
+ """
4
+ import sys
5
+ import string
6
+
7
+ from html5lib import HTMLParser as _HTMLParser
8
+ from html5lib.treebuilders.etree_lxml import TreeBuilder
9
+ from lxml import etree
10
+ from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
11
+
12
+ # python3 compatibility
13
+ try:
14
+ _strings = basestring
15
+ except NameError:
16
+ _strings = (bytes, str)
17
+ try:
18
+ from urllib2 import urlopen
19
+ except ImportError:
20
+ from urllib.request import urlopen
21
+ try:
22
+ from urlparse import urlparse
23
+ except ImportError:
24
+ from urllib.parse import urlparse
25
+
26
+
27
class HTMLParser(_HTMLParser):
    """html5lib's HTML parser, configured to build lxml trees."""

    def __init__(self, strict=False, **kwargs):
        # Always use the lxml tree builder; other options pass through.
        super().__init__(strict=strict, tree=TreeBuilder, **kwargs)
32
+
33
+
34
# XHTMLParser and the ``xhtml_parser`` instance are only defined when the
# installed html5lib provides an XHTMLParser class.
try:
    from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
    pass
else:
    class XHTMLParser(_XHTMLParser):
        """html5lib's XHTML parser, configured to build lxml trees."""

        def __init__(self, strict=False, **kwargs):
            # Always use the lxml tree builder; other options pass through.
            super().__init__(strict=strict, tree=TreeBuilder, **kwargs)

    xhtml_parser = XHTMLParser()
46
+
47
+
48
+ def _find_tag(tree, tag):
49
+ elem = tree.find(tag)
50
+ if elem is not None:
51
+ return elem
52
+ return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
53
+
54
+
55
def document_fromstring(html, guess_charset=None, parser=None):
    """
    Parse a string as a whole HTML document and return its root element.

    If `guess_charset` is left as None and the input is a byte string,
    charset guessing (html5lib's ``useChardet`` option) is switched on.
    An explicit `guess_charset` value is always passed on to the parser.

    Raises TypeError if `html` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    chosen_parser = html_parser if parser is None else parser

    if guess_charset is None and isinstance(html, bytes):
        # html5lib does not accept useChardet as an argument, if it
        # detected the html argument would produce unicode objects.
        guess_charset = True
    options = {} if guess_charset is None else {'useChardet': guess_charset}
    return chosen_parser.parse(html, **options).getroot()
77
+
78
+
79
def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=None, parser=None):
    """Parse several HTML elements and return them as a list.

    The first list item may be a string (leading text).  With
    `no_leading_text` set, non-blank leading text raises
    ``etree.ParserError`` and blank leading text is dropped, so the list
    then holds elements only.

    If `guess_charset` is left as None and the input is a byte string,
    charset guessing (``useChardet``) is explicitly switched off; an
    explicit value is always passed on to the parser.

    Raises TypeError if `html` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    chosen_parser = html_parser if parser is None else parser

    if guess_charset is None and isinstance(html, bytes):
        # html5lib does not accept useChardet as an argument, if it
        # detected the html argument would produce unicode objects.
        guess_charset = False
    options = {} if guess_charset is None else {'useChardet': guess_charset}

    children = chosen_parser.parseFragment(html, 'div', **options)
    starts_with_text = children and isinstance(children[0], _strings)
    if starts_with_text and no_leading_text:
        if children[0].strip():
            raise etree.ParserError('There is leading text: %r' %
                                    children[0])
        del children[0]
    return children
111
+
112
+
113
def fragment_fromstring(html, create_parent=False,
                        guess_charset=None, parser=None):
    """Parse a string that should contain exactly one HTML element.

    Without `create_parent`, ``etree.ParserError`` is raised when there is
    no element, more than one element, or non-blank text before or after
    the element; the returned element has its tail cleared.

    With a true `create_parent`, the parsed content (including leading
    text) is wrapped in a new parent element, which is returned.  The
    parent's tag is `create_parent` itself when it is a string, and
    ``div`` otherwise.

    `guess_charset` and `parser` are passed on to ``fragments_fromstring``.
    Raises TypeError if `html` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    wrap = bool(create_parent)
    elements = fragments_fromstring(
        html, guess_charset=guess_charset, parser=parser,
        no_leading_text=not wrap)

    if not create_parent:
        if not elements:
            raise etree.ParserError('No elements found')
        if len(elements) > 1:
            raise etree.ParserError('Multiple elements found')
        result = elements[0]
        if result.tail and result.tail.strip():
            raise etree.ParserError('Element followed by text: %r' % result.tail)
        result.tail = None
        return result

    if not isinstance(create_parent, _strings):
        create_parent = 'div'
    new_root = Element(create_parent)
    if elements:
        if isinstance(elements[0], _strings):
            # Leading text becomes the text of the new parent.
            new_root.text = elements[0]
            del elements[0]
        new_root.extend(elements)
    return new_root
155
+
156
+
157
def fromstring(html, guess_charset=None, parser=None):
    """Parse the html, returning a single element/document.

    The input is always parsed as a document first; the result is then
    reduced as far as the input appears to allow:

    * input starting with ``<html`` or ``<!doctype`` (case-insensitive,
      after leading whitespace), or a non-empty <head>: the document root;
    * a <body> with exactly one child and no surrounding non-blank text:
      that child;
    * otherwise the <body> itself, retagged as ``div`` when it contains a
      block-level tag and as ``span`` when it does not.

    `guess_charset` and `parser` are passed on to ``document_fromstring``.
    Raises TypeError if `html` is not a string.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # document starts with doctype or <html>, full document!
    prefix = html[:50]
    if isinstance(prefix, bytes):
        # Allow text comparison in python3.
        # Decode as ascii, that also covers latin-1 and utf-8 for the
        # characters we need.
        prefix = prefix.decode('ascii', 'replace')
    prefix = prefix.lstrip().lower()
    if prefix.startswith('<html') or prefix.startswith('<!doctype'):
        return doc

    # if the head is not empty we have a full document
    if len(_find_tag(doc, 'head')):
        return doc

    body = _find_tag(doc, 'body')

    # The body has just one element, so it was probably a single
    # element passed in
    if len(body) == 1:
        no_leading_text = not body.text or not body.text.strip()
        if no_leading_text:
            trailing = body[-1].tail
            if not trailing or not trailing.strip():
                return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body
209
+
210
+
211
def parse(filename_url_or_file, guess_charset=None, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).

    A file or URL handle that this function opens itself is closed again
    before returning; a file-like object supplied by the caller is left
    open.
    """
    if parser is None:
        parser = html_parser
    # Only handles opened here are ours to close.
    opened_here = False
    if not isinstance(filename_url_or_file, _strings):
        fp = filename_url_or_file
        if guess_charset is None:
            # assume that file-like objects return Unicode more often than bytes
            guess_charset = False
    elif _looks_like_url(filename_url_or_file):
        fp = urlopen(filename_url_or_file)
        opened_here = True
        if guess_charset is None:
            # assume that URLs return bytes
            guess_charset = True
    else:
        fp = open(filename_url_or_file, 'rb')
        opened_here = True
        if guess_charset is None:
            guess_charset = True

    options = {}
    # html5lib does not accept useChardet as an argument, if it
    # detected the html argument would produce unicode objects.
    if guess_charset:
        options['useChardet'] = guess_charset
    try:
        return parser.parse(fp, **options)
    finally:
        # Release the handle even when parsing fails.
        # NOTE(review): assumes parser.parse() has consumed the stream by
        # the time it returns, as html5lib's parser does -- confirm for
        # custom parsers.
        if opened_here:
            fp.close()
245
+
246
+
247
+ def _looks_like_url(str):
248
+ scheme = urlparse(str)[0]
249
+ if not scheme:
250
+ return False
251
+ elif (sys.platform == 'win32' and
252
+ scheme in string.ascii_letters
253
+ and len(scheme) == 1):
254
+ # looks like a 'normal' absolute path
255
+ return False
256
+ else:
257
+ return True
258
+
259
+
260
+ html_parser = HTMLParser()