lxml 5.2.0__cp310-cp310-win32.whl → 5.2.2__cp310-cp310-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. lxml/ElementInclude.py +244 -244
  2. lxml/__init__.py +22 -22
  3. lxml/_elementpath.cp310-win32.pyd +0 -0
  4. lxml/_elementpath.py +341 -341
  5. lxml/apihelpers.pxi +1793 -1793
  6. lxml/builder.cp310-win32.pyd +0 -0
  7. lxml/builder.py +232 -232
  8. lxml/classlookup.pxi +580 -580
  9. lxml/cleanup.pxi +215 -215
  10. lxml/cssselect.py +101 -101
  11. lxml/debug.pxi +90 -90
  12. lxml/docloader.pxi +178 -178
  13. lxml/doctestcompare.py +488 -488
  14. lxml/dtd.pxi +478 -478
  15. lxml/etree.cp310-win32.pyd +0 -0
  16. lxml/etree.h +6 -6
  17. lxml/etree.pyx +3732 -3711
  18. lxml/extensions.pxi +833 -833
  19. lxml/html/ElementSoup.py +10 -10
  20. lxml/html/__init__.py +1923 -1923
  21. lxml/html/_diffcommand.py +86 -86
  22. lxml/html/_html5builder.py +100 -100
  23. lxml/html/_setmixin.py +56 -56
  24. lxml/html/builder.py +133 -133
  25. lxml/html/clean.py +21 -21
  26. lxml/html/defs.py +135 -135
  27. lxml/html/diff.cp310-win32.pyd +0 -0
  28. lxml/html/diff.py +878 -878
  29. lxml/html/formfill.py +299 -299
  30. lxml/html/html5parser.py +260 -260
  31. lxml/html/soupparser.py +314 -314
  32. lxml/html/usedoctest.py +13 -13
  33. lxml/includes/c14n.pxd +25 -25
  34. lxml/includes/config.pxd +3 -3
  35. lxml/includes/dtdvalid.pxd +18 -18
  36. lxml/includes/etree_defs.h +379 -379
  37. lxml/includes/etreepublic.pxd +237 -237
  38. lxml/includes/htmlparser.pxd +56 -56
  39. lxml/includes/lxml-version.h +1 -1
  40. lxml/includes/relaxng.pxd +64 -64
  41. lxml/includes/schematron.pxd +34 -34
  42. lxml/includes/tree.pxd +494 -494
  43. lxml/includes/uri.pxd +5 -5
  44. lxml/includes/xinclude.pxd +22 -22
  45. lxml/includes/xmlerror.pxd +852 -852
  46. lxml/includes/xmlparser.pxd +265 -265
  47. lxml/includes/xmlschema.pxd +35 -35
  48. lxml/includes/xpath.pxd +136 -136
  49. lxml/includes/xslt.pxd +190 -190
  50. lxml/isoschematron/__init__.py +348 -348
  51. lxml/isoschematron/resources/rng/iso-schematron.rng +709 -709
  52. lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl +75 -75
  53. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl +312 -312
  54. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl +1159 -1159
  55. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl +54 -54
  56. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl +1796 -1796
  57. lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl +588 -588
  58. lxml/iterparse.pxi +438 -438
  59. lxml/lxml.etree.h +6 -6
  60. lxml/nsclasses.pxi +281 -281
  61. lxml/objectify.cp310-win32.pyd +0 -0
  62. lxml/objectify.pyx +2145 -2145
  63. lxml/objectpath.pxi +332 -332
  64. lxml/parser.pxi +1994 -1994
  65. lxml/parsertarget.pxi +180 -180
  66. lxml/proxy.pxi +619 -619
  67. lxml/public-api.pxi +178 -178
  68. lxml/pyclasslookup.py +3 -3
  69. lxml/readonlytree.pxi +565 -565
  70. lxml/relaxng.pxi +165 -165
  71. lxml/sax.cp310-win32.pyd +0 -0
  72. lxml/sax.py +275 -275
  73. lxml/saxparser.pxi +875 -875
  74. lxml/schematron.pxi +168 -168
  75. lxml/serializer.pxi +1871 -1871
  76. lxml/usedoctest.py +13 -13
  77. lxml/xinclude.pxi +67 -67
  78. lxml/xmlerror.pxi +1654 -1654
  79. lxml/xmlid.pxi +179 -179
  80. lxml/xmlschema.pxi +215 -215
  81. lxml/xpath.pxi +487 -487
  82. lxml/xslt.pxi +950 -950
  83. lxml/xsltext.pxi +242 -242
  84. {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/LICENSE.txt +29 -29
  85. {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/LICENSES.txt +29 -29
  86. {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/METADATA +9 -17
  87. {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/RECORD +89 -89
  88. {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/WHEEL +0 -0
  89. {lxml-5.2.0.dist-info → lxml-5.2.2.dist-info}/top_level.txt +0 -0
lxml/html/soupparser.py CHANGED
@@ -1,314 +1,314 @@
1
- """External interface to the BeautifulSoup HTML parser.
2
- """
3
-
4
- __all__ = ["fromstring", "parse", "convert_tree"]
5
-
6
- import re
7
- from lxml import etree, html
8
-
9
- try:
10
- from bs4 import (
11
- BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
12
- Declaration, Doctype)
13
- _DECLARATION_OR_DOCTYPE = (Declaration, Doctype)
14
- except ImportError:
15
- from BeautifulSoup import (
16
- BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
17
- Declaration)
18
- _DECLARATION_OR_DOCTYPE = Declaration
19
-
20
-
21
- def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
22
- """Parse a string of HTML data into an Element tree using the
23
- BeautifulSoup parser.
24
-
25
- Returns the root ``<html>`` Element of the tree.
26
-
27
- You can pass a different BeautifulSoup parser through the
28
- `beautifulsoup` keyword, and a diffent Element factory function
29
- through the `makeelement` keyword. By default, the standard
30
- ``BeautifulSoup`` class and the default factory of `lxml.html` are
31
- used.
32
- """
33
- return _parse(data, beautifulsoup, makeelement, **bsargs)
34
-
35
-
36
- def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
37
- """Parse a file into an ElemenTree using the BeautifulSoup parser.
38
-
39
- You can pass a different BeautifulSoup parser through the
40
- `beautifulsoup` keyword, and a diffent Element factory function
41
- through the `makeelement` keyword. By default, the standard
42
- ``BeautifulSoup`` class and the default factory of `lxml.html` are
43
- used.
44
- """
45
- if not hasattr(file, 'read'):
46
- file = open(file)
47
- root = _parse(file, beautifulsoup, makeelement, **bsargs)
48
- return etree.ElementTree(root)
49
-
50
-
51
- def convert_tree(beautiful_soup_tree, makeelement=None):
52
- """Convert a BeautifulSoup tree to a list of Element trees.
53
-
54
- Returns a list instead of a single root Element to support
55
- HTML-like soup with more than one root element.
56
-
57
- You can pass a different Element factory through the `makeelement`
58
- keyword.
59
- """
60
- root = _convert_tree(beautiful_soup_tree, makeelement)
61
- children = root.getchildren()
62
- for child in children:
63
- root.remove(child)
64
- return children
65
-
66
-
67
- # helpers
68
-
69
- def _parse(source, beautifulsoup, makeelement, **bsargs):
70
- if beautifulsoup is None:
71
- beautifulsoup = BeautifulSoup
72
- if hasattr(beautifulsoup, "HTML_ENTITIES"): # bs3
73
- if 'convertEntities' not in bsargs:
74
- bsargs['convertEntities'] = 'html'
75
- if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"): # bs4
76
- if 'features' not in bsargs:
77
- bsargs['features'] = 'html.parser' # use Python html parser
78
- tree = beautifulsoup(source, **bsargs)
79
- root = _convert_tree(tree, makeelement)
80
- # from ET: wrap the document in a html root element, if necessary
81
- if len(root) == 1 and root[0].tag == "html":
82
- return root[0]
83
- root.tag = "html"
84
- return root
85
-
86
-
87
- _parse_doctype_declaration = re.compile(
88
- r'(?:\s|[<!])*DOCTYPE\s*HTML'
89
- r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?'
90
- r'(?:\s+(\'[^\']*\'|"[^"]*"))?',
91
- re.IGNORECASE).match
92
-
93
-
94
- class _PseudoTag:
95
- # Minimal imitation of BeautifulSoup.Tag
96
- def __init__(self, contents):
97
- self.name = 'html'
98
- self.attrs = []
99
- self.contents = contents
100
-
101
- def __iter__(self):
102
- return self.contents.__iter__()
103
-
104
-
105
- def _convert_tree(beautiful_soup_tree, makeelement):
106
- if makeelement is None:
107
- makeelement = html.html_parser.makeelement
108
-
109
- # Split the tree into three parts:
110
- # i) everything before the root element: document type
111
- # declaration, comments, processing instructions, whitespace
112
- # ii) the root(s),
113
- # iii) everything after the root: comments, processing
114
- # instructions, whitespace
115
- first_element_idx = last_element_idx = None
116
- html_root = declaration = None
117
- for i, e in enumerate(beautiful_soup_tree):
118
- if isinstance(e, Tag):
119
- if first_element_idx is None:
120
- first_element_idx = i
121
- last_element_idx = i
122
- if html_root is None and e.name and e.name.lower() == 'html':
123
- html_root = e
124
- elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE):
125
- declaration = e
126
-
127
- # For a nice, well-formatted document, the variable roots below is
128
- # a list consisting of a single <html> element. However, the document
129
- # may be a soup like '<meta><head><title>Hello</head><body>Hi
130
- # all<\p>'. In this example roots is a list containing meta, head
131
- # and body elements.
132
- if first_element_idx is None:
133
- pre_root = post_root = []
134
- roots = beautiful_soup_tree.contents
135
- else:
136
- pre_root = beautiful_soup_tree.contents[:first_element_idx]
137
- roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1]
138
- post_root = beautiful_soup_tree.contents[last_element_idx+1:]
139
-
140
- # Reorganize so that there is one <html> root...
141
- if html_root is not None:
142
- # ... use existing one if possible, ...
143
- i = roots.index(html_root)
144
- html_root.contents = roots[:i] + html_root.contents + roots[i+1:]
145
- else:
146
- # ... otherwise create a new one.
147
- html_root = _PseudoTag(roots)
148
-
149
- convert_node = _init_node_converters(makeelement)
150
-
151
- # Process pre_root
152
- res_root = convert_node(html_root)
153
- prev = res_root
154
- for e in reversed(pre_root):
155
- converted = convert_node(e)
156
- if converted is not None:
157
- prev.addprevious(converted)
158
- prev = converted
159
-
160
- # ditto for post_root
161
- prev = res_root
162
- for e in post_root:
163
- converted = convert_node(e)
164
- if converted is not None:
165
- prev.addnext(converted)
166
- prev = converted
167
-
168
- if declaration is not None:
169
- try:
170
- # bs4 provides full Doctype string
171
- doctype_string = declaration.output_ready()
172
- except AttributeError:
173
- doctype_string = declaration.string
174
-
175
- match = _parse_doctype_declaration(doctype_string)
176
- if not match:
177
- # Something is wrong if we end up in here. Since soupparser should
178
- # tolerate errors, do not raise Exception, just let it pass.
179
- pass
180
- else:
181
- external_id, sys_uri = match.groups()
182
- docinfo = res_root.getroottree().docinfo
183
- # strip quotes and update DOCTYPE values (any of None, '', '...')
184
- docinfo.public_id = external_id and external_id[1:-1]
185
- docinfo.system_url = sys_uri and sys_uri[1:-1]
186
-
187
- return res_root
188
-
189
-
190
- def _init_node_converters(makeelement):
191
- converters = {}
192
- ordered_node_types = []
193
-
194
- def converter(*types):
195
- def add(handler):
196
- for t in types:
197
- converters[t] = handler
198
- ordered_node_types.append(t)
199
- return handler
200
- return add
201
-
202
- def find_best_converter(node):
203
- for t in ordered_node_types:
204
- if isinstance(node, t):
205
- return converters[t]
206
- return None
207
-
208
- def convert_node(bs_node, parent=None):
209
- # duplicated in convert_tag() below
210
- try:
211
- handler = converters[type(bs_node)]
212
- except KeyError:
213
- handler = converters[type(bs_node)] = find_best_converter(bs_node)
214
- if handler is None:
215
- return None
216
- return handler(bs_node, parent)
217
-
218
- def map_attrs(bs_attrs):
219
- if isinstance(bs_attrs, dict): # bs4
220
- attribs = {}
221
- for k, v in bs_attrs.items():
222
- if isinstance(v, list):
223
- v = " ".join(v)
224
- attribs[k] = unescape(v)
225
- else:
226
- attribs = {k: unescape(v) for k, v in bs_attrs}
227
- return attribs
228
-
229
- def append_text(parent, text):
230
- if len(parent) == 0:
231
- parent.text = (parent.text or '') + text
232
- else:
233
- parent[-1].tail = (parent[-1].tail or '') + text
234
-
235
- # converters are tried in order of their definition
236
-
237
- @converter(Tag, _PseudoTag)
238
- def convert_tag(bs_node, parent):
239
- attrs = bs_node.attrs
240
- if parent is not None:
241
- attribs = map_attrs(attrs) if attrs else None
242
- res = etree.SubElement(parent, bs_node.name, attrib=attribs)
243
- else:
244
- attribs = map_attrs(attrs) if attrs else {}
245
- res = makeelement(bs_node.name, attrib=attribs)
246
-
247
- for child in bs_node:
248
- # avoid double recursion by inlining convert_node(), see above
249
- try:
250
- handler = converters[type(child)]
251
- except KeyError:
252
- pass
253
- else:
254
- if handler is not None:
255
- handler(child, res)
256
- continue
257
- convert_node(child, res)
258
- return res
259
-
260
- @converter(Comment)
261
- def convert_comment(bs_node, parent):
262
- res = html.HtmlComment(bs_node)
263
- if parent is not None:
264
- parent.append(res)
265
- return res
266
-
267
- @converter(ProcessingInstruction)
268
- def convert_pi(bs_node, parent):
269
- if bs_node.endswith('?'):
270
- # The PI is of XML style (<?as df?>) but BeautifulSoup
271
- # interpreted it as being SGML style (<?as df>). Fix.
272
- bs_node = bs_node[:-1]
273
- res = etree.ProcessingInstruction(*bs_node.split(' ', 1))
274
- if parent is not None:
275
- parent.append(res)
276
- return res
277
-
278
- @converter(NavigableString)
279
- def convert_text(bs_node, parent):
280
- if parent is not None:
281
- append_text(parent, unescape(bs_node))
282
- return None
283
-
284
- return convert_node
285
-
286
-
287
- # copied from ET's ElementSoup
288
-
289
- try:
290
- from html.entities import name2codepoint # Python 3
291
- except ImportError:
292
- from htmlentitydefs import name2codepoint
293
-
294
-
295
- handle_entities = re.compile(r"&(\w+);").sub
296
-
297
-
298
- try:
299
- unichr
300
- except NameError:
301
- # Python 3
302
- unichr = chr
303
-
304
-
305
- def unescape(string):
306
- if not string:
307
- return ''
308
- # work around oddities in BeautifulSoup's entity handling
309
- def unescape_entity(m):
310
- try:
311
- return unichr(name2codepoint[m.group(1)])
312
- except KeyError:
313
- return m.group(0) # use as is
314
- return handle_entities(unescape_entity, string)
1
+ """External interface to the BeautifulSoup HTML parser.
2
+ """
3
+
4
+ __all__ = ["fromstring", "parse", "convert_tree"]
5
+
6
+ import re
7
+ from lxml import etree, html
8
+
9
+ try:
10
+ from bs4 import (
11
+ BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
12
+ Declaration, Doctype)
13
+ _DECLARATION_OR_DOCTYPE = (Declaration, Doctype)
14
+ except ImportError:
15
+ from BeautifulSoup import (
16
+ BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
17
+ Declaration)
18
+ _DECLARATION_OR_DOCTYPE = Declaration
19
+
20
+
21
def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a string of HTML data into an Element tree using the
    BeautifulSoup parser.

    Returns the root ``<html>`` Element of the tree.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.

    Any additional keyword arguments are passed through to the
    BeautifulSoup constructor.
    """
    return _parse(data, beautifulsoup, makeelement, **bsargs)
34
+
35
+
36
def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
    """Parse a file into an ElementTree using the BeautifulSoup parser.

    ``file`` may be a file(-like) object or a file path.

    You can pass a different BeautifulSoup parser through the
    `beautifulsoup` keyword, and a different Element factory function
    through the `makeelement` keyword.  By default, the standard
    ``BeautifulSoup`` class and the default factory of `lxml.html` are
    used.
    """
    if not hasattr(file, 'read'):
        # We opened the file ourselves, so close it again when done;
        # caller-provided file objects remain the caller's responsibility.
        with open(file) as f:
            root = _parse(f, beautifulsoup, makeelement, **bsargs)
    else:
        root = _parse(file, beautifulsoup, makeelement, **bsargs)
    return etree.ElementTree(root)
49
+
50
+
51
def convert_tree(beautiful_soup_tree, makeelement=None):
    """Convert a BeautifulSoup tree to a list of Element trees.

    Returns a list instead of a single root Element to support
    HTML-like soup with more than one root element.

    You can pass a different Element factory through the `makeelement`
    keyword.
    """
    root = _convert_tree(beautiful_soup_tree, makeelement)
    # Element.getchildren() is deprecated in the ElementTree API;
    # list(root) snapshots the children before we detach them.
    children = list(root)
    for child in children:
        root.remove(child)
    return children
65
+
66
+
67
+ # helpers
68
+
69
def _parse(source, beautifulsoup, makeelement, **bsargs):
    """Run BeautifulSoup over `source` and return a single lxml root Element."""
    bs_class = BeautifulSoup if beautifulsoup is None else beautifulsoup
    if hasattr(bs_class, "HTML_ENTITIES"):  # bs3
        bsargs.setdefault('convertEntities', 'html')
    if hasattr(bs_class, "DEFAULT_BUILDER_FEATURES"):  # bs4
        bsargs.setdefault('features', 'html.parser')  # use Python html parser
    soup = bs_class(source, **bsargs)
    converted_root = _convert_tree(soup, makeelement)
    # from ET: wrap the document in a html root element, if necessary
    if len(converted_root) == 1 and converted_root[0].tag == "html":
        return converted_root[0]
    converted_root.tag = "html"
    return converted_root
85
+
86
+
87
# Matches an (X)HTML DOCTYPE declaration and captures the optional quoted
# PUBLIC identifier (group 1) and system URI (group 2), quotes included.
# Deliberately lenient about leading '<!' and whitespace, since different
# BeautifulSoup versions keep different amounts of the original markup.
_parse_doctype_declaration = re.compile(
    r'(?:\s|[<!])*DOCTYPE\s*HTML'
    r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?'
    r'(?:\s+(\'[^\']*\'|"[^"]*"))?',
    re.IGNORECASE).match
92
+
93
+
94
+ class _PseudoTag:
95
+ # Minimal imitation of BeautifulSoup.Tag
96
+ def __init__(self, contents):
97
+ self.name = 'html'
98
+ self.attrs = []
99
+ self.contents = contents
100
+
101
+ def __iter__(self):
102
+ return self.contents.__iter__()
103
+
104
+
105
def _convert_tree(beautiful_soup_tree, makeelement):
    """Convert a whole BeautifulSoup tree into one lxml Element tree.

    Returns the converted root element.  Nodes found before/after the root
    (comments, processing instructions) are attached as its siblings, and a
    DOCTYPE, if present, is written into the result tree's docinfo.
    """
    if makeelement is None:
        makeelement = html.html_parser.makeelement

    # Split the tree into three parts:
    # i) everything before the root element: document type
    # declaration, comments, processing instructions, whitespace
    # ii) the root(s),
    # iii) everything after the root: comments, processing
    # instructions, whitespace
    first_element_idx = last_element_idx = None
    html_root = declaration = None
    for i, e in enumerate(beautiful_soup_tree):
        if isinstance(e, Tag):
            if first_element_idx is None:
                first_element_idx = i
            last_element_idx = i
            if html_root is None and e.name and e.name.lower() == 'html':
                html_root = e
        elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE):
            declaration = e

    # For a nice, well-formatted document, the variable roots below is
    # a list consisting of a single <html> element. However, the document
    # may be a soup like '<meta><head><title>Hello</head><body>Hi
    # all<\p>'. In this example roots is a list containing meta, head
    # and body elements.
    if first_element_idx is None:
        pre_root = post_root = []
        roots = beautiful_soup_tree.contents
    else:
        pre_root = beautiful_soup_tree.contents[:first_element_idx]
        roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1]
        post_root = beautiful_soup_tree.contents[last_element_idx+1:]

    # Reorganize so that there is one <html> root...
    if html_root is not None:
        # ... use existing one if possible, ...
        # Siblings of the existing <html> element are folded into it.
        i = roots.index(html_root)
        html_root.contents = roots[:i] + html_root.contents + roots[i+1:]
    else:
        # ... otherwise create a new one.
        html_root = _PseudoTag(roots)

    convert_node = _init_node_converters(makeelement)

    # Process pre_root
    # Walk backwards so each node is inserted directly before the previous
    # insertion, preserving the original document order.
    res_root = convert_node(html_root)
    prev = res_root
    for e in reversed(pre_root):
        converted = convert_node(e)
        if converted is not None:
            prev.addprevious(converted)
            prev = converted

    # ditto for post_root
    prev = res_root
    for e in post_root:
        converted = convert_node(e)
        if converted is not None:
            prev.addnext(converted)
            prev = converted

    if declaration is not None:
        try:
            # bs4 provides full Doctype string
            doctype_string = declaration.output_ready()
        except AttributeError:
            # bs3: fall back to the node's raw string content
            doctype_string = declaration.string

        match = _parse_doctype_declaration(doctype_string)
        if not match:
            # Something is wrong if we end up in here. Since soupparser should
            # tolerate errors, do not raise Exception, just let it pass.
            pass
        else:
            external_id, sys_uri = match.groups()
            docinfo = res_root.getroottree().docinfo
            # strip quotes and update DOCTYPE values (any of None, '', '...')
            docinfo.public_id = external_id and external_id[1:-1]
            docinfo.system_url = sys_uri and sys_uri[1:-1]

    return res_root
188
+
189
+
190
def _init_node_converters(makeelement):
    """Build and return a ``convert_node(bs_node, parent=None)`` closure.

    The closure dispatches on the BeautifulSoup node's concrete type to a
    registered converter function and returns the created lxml node, or
    None for plain text (which is merged into the parent's text/tail
    rather than becoming a node of its own).
    """
    converters = {}
    ordered_node_types = []

    def converter(*types):
        # Decorator: register `handler` for each type in `types`.
        # Registration order doubles as isinstance() fallback priority
        # in find_best_converter() below.
        def add(handler):
            for t in types:
                converters[t] = handler
                ordered_node_types.append(t)
            return handler
        return add

    def find_best_converter(node):
        # isinstance() scan for subclasses that were not registered
        # explicitly (BeautifulSoup defines several NavigableString
        # subclasses, for example).
        for t in ordered_node_types:
            if isinstance(node, t):
                return converters[t]
        return None

    def convert_node(bs_node, parent=None):
        # duplicated in convert_tag() below
        try:
            handler = converters[type(bs_node)]
        except KeyError:
            # Cache the isinstance-based lookup result (possibly None) so
            # every concrete type is resolved at most once.
            handler = converters[type(bs_node)] = find_best_converter(bs_node)
        if handler is None:
            return None
        return handler(bs_node, parent)

    def map_attrs(bs_attrs):
        if isinstance(bs_attrs, dict):  # bs4
            attribs = {}
            for k, v in bs_attrs.items():
                if isinstance(v, list):
                    # bs4 splits multi-valued attributes (e.g. class)
                    # into lists; rejoin them into one string.
                    v = " ".join(v)
                attribs[k] = unescape(v)
        else:
            # bs3 exposes attributes as a sequence of (key, value) pairs
            attribs = {k: unescape(v) for k, v in bs_attrs}
        return attribs

    def append_text(parent, text):
        # lxml stores character data as .text of the parent element or
        # as .tail of the preceding sibling.
        if len(parent) == 0:
            parent.text = (parent.text or '') + text
        else:
            parent[-1].tail = (parent[-1].tail or '') + text

    # converters are tried in order of their definition

    @converter(Tag, _PseudoTag)
    def convert_tag(bs_node, parent):
        attrs = bs_node.attrs
        if parent is not None:
            attribs = map_attrs(attrs) if attrs else None
            res = etree.SubElement(parent, bs_node.name, attrib=attribs)
        else:
            attribs = map_attrs(attrs) if attrs else {}
            res = makeelement(bs_node.name, attrib=attribs)

        for child in bs_node:
            # avoid double recursion by inlining convert_node(), see above
            try:
                handler = converters[type(child)]
            except KeyError:
                pass
            else:
                if handler is not None:
                    handler(child, res)
                continue
            convert_node(child, res)
        return res

    @converter(Comment)
    def convert_comment(bs_node, parent):
        res = html.HtmlComment(bs_node)
        if parent is not None:
            parent.append(res)
        return res

    @converter(ProcessingInstruction)
    def convert_pi(bs_node, parent):
        if bs_node.endswith('?'):
            # The PI is of XML style (<?as df?>) but BeautifulSoup
            # interpreted it as being SGML style (<?as df>). Fix.
            bs_node = bs_node[:-1]
        res = etree.ProcessingInstruction(*bs_node.split(' ', 1))
        if parent is not None:
            parent.append(res)
        return res

    @converter(NavigableString)
    def convert_text(bs_node, parent):
        # Text produces no element of its own; merge it into the parent.
        if parent is not None:
            append_text(parent, unescape(bs_node))
        return None

    return convert_node
285
+
286
+
287
+ # copied from ET's ElementSoup
288
+
289
+ try:
290
+ from html.entities import name2codepoint # Python 3
291
+ except ImportError:
292
+ from htmlentitydefs import name2codepoint
293
+
294
+
295
+ handle_entities = re.compile(r"&(\w+);").sub
296
+
297
+
298
+ try:
299
+ unichr
300
+ except NameError:
301
+ # Python 3
302
+ unichr = chr
303
+
304
+
305
def unescape(string):
    """Resolve named character references that BeautifulSoup left behind."""
    if not string:
        return ''
    # work around oddities in BeautifulSoup's entity handling
    def _substitute(match):
        try:
            return unichr(name2codepoint[match.group(1)])
        except KeyError:
            # unknown entity name: keep the reference text as is
            return match.group(0)
    return handle_entities(_substitute, string)
lxml/html/usedoctest.py CHANGED
@@ -1,13 +1,13 @@
1
- """Doctest module for HTML comparison.
2
-
3
- Usage::
4
-
5
- >>> import lxml.html.usedoctest
6
- >>> # now do your HTML doctests ...
7
-
8
- See `lxml.doctestcompare`.
9
- """
10
-
11
- from lxml import doctestcompare
12
-
13
- doctestcompare.temp_install(html=True, del_module=__name__)
1
"""Doctest module for HTML comparison.

Usage::

    >>> import lxml.html.usedoctest
    >>> # now do your HTML doctests ...

See `lxml.doctestcompare`.
"""

from lxml import doctestcompare

# Installing happens as an import side effect: html=True selects the
# HTML-aware comparison mode.  NOTE(review): del_module presumably removes
# this module from sys.modules so a later import re-triggers installation —
# confirm against doctestcompare.temp_install().
doctestcompare.temp_install(html=True, del_module=__name__)