forkparser 2026.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- feedparser/__init__.py +66 -0
- feedparser/api.py +376 -0
- feedparser/datetimes/__init__.py +73 -0
- feedparser/datetimes/asctime.py +80 -0
- feedparser/datetimes/greek.py +90 -0
- feedparser/datetimes/hungarian.py +66 -0
- feedparser/datetimes/iso8601.py +160 -0
- feedparser/datetimes/korean.py +94 -0
- feedparser/datetimes/perforce.py +63 -0
- feedparser/datetimes/rfc822.py +179 -0
- feedparser/datetimes/w3dtf.py +128 -0
- feedparser/encodings.py +649 -0
- feedparser/exceptions.py +55 -0
- feedparser/html.py +350 -0
- feedparser/http.py +74 -0
- feedparser/mixin.py +838 -0
- feedparser/namespaces/__init__.py +0 -0
- feedparser/namespaces/_base.py +547 -0
- feedparser/namespaces/admin.py +53 -0
- feedparser/namespaces/cc.py +70 -0
- feedparser/namespaces/dc.py +138 -0
- feedparser/namespaces/georss.py +682 -0
- feedparser/namespaces/itunes.py +113 -0
- feedparser/namespaces/mediarss.py +142 -0
- feedparser/namespaces/psc.py +74 -0
- feedparser/parsers/__init__.py +0 -0
- feedparser/parsers/json.py +135 -0
- feedparser/parsers/loose.py +75 -0
- feedparser/parsers/strict.py +141 -0
- feedparser/py.typed +0 -0
- feedparser/sanitizer.py +978 -0
- feedparser/sgml.py +98 -0
- feedparser/urls.py +233 -0
- feedparser/util.py +157 -0
- forkparser-2026.1.0.dist-info/METADATA +75 -0
- forkparser-2026.1.0.dist-info/RECORD +38 -0
- forkparser-2026.1.0.dist-info/WHEEL +4 -0
- forkparser-2026.1.0.dist-info/licenses/LICENSE +65 -0
feedparser/sanitizer.py
ADDED
|
@@ -0,0 +1,978 @@
|
|
|
1
|
+
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
|
|
2
|
+
# Copyright 2002-2008 Mark Pilgrim
|
|
3
|
+
# All rights reserved.
|
|
4
|
+
#
|
|
5
|
+
# This file is a part of feedparser.
|
|
6
|
+
#
|
|
7
|
+
# Redistribution and use in source and binary forms, with or without
|
|
8
|
+
# modification, are permitted provided that the following conditions are met:
|
|
9
|
+
#
|
|
10
|
+
# * Redistributions of source code must retain the above copyright notice,
|
|
11
|
+
# this list of conditions and the following disclaimer.
|
|
12
|
+
# * Redistributions in binary form must reproduce the above copyright notice,
|
|
13
|
+
# this list of conditions and the following disclaimer in the documentation
|
|
14
|
+
# and/or other materials provided with the distribution.
|
|
15
|
+
#
|
|
16
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
|
|
17
|
+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
18
|
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
19
|
+
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
20
|
+
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
21
|
+
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
22
|
+
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
23
|
+
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
24
|
+
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
25
|
+
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
26
|
+
# POSSIBILITY OF SUCH DAMAGE.
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
import re
|
|
31
|
+
|
|
32
|
+
from .html import BaseHTMLProcessor
|
|
33
|
+
from .urls import make_safe_absolute_uri
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class HTMLSanitizer(BaseHTMLProcessor):
|
|
37
|
+
# HTML element names that may be emitted by the sanitizer.
# Names are grouped by initial letter; membership is all that matters.
acceptable_elements = {
    "a", "abbr", "acronym", "address", "area", "article", "aside", "audio",
    "b", "big", "blockquote", "br", "button",
    "canvas", "caption", "center", "cite", "code", "col", "colgroup", "command",
    "datagrid", "datalist", "dd", "del", "details", "dfn", "dialog", "dir",
    "div", "dl", "dt",
    "em", "event-source",
    "fieldset", "figcaption", "figure", "font", "footer", "form",
    "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr",
    "i", "img", "input", "ins",
    "kbd", "keygen",
    "label", "legend", "li",
    "m", "map", "menu", "meter", "multicol",
    "nav", "nextid", "noscript",
    "ol", "optgroup", "option", "output",
    "p", "pre", "progress",
    "q",
    "rp", "rt", "ruby",
    "s", "samp", "section", "select", "small", "sound", "source", "spacer",
    "span", "strike", "strong", "sub", "sup",
    "table", "tbody", "td", "textarea", "tfoot", "th", "thead", "time", "tr",
    "tt",
    "u", "ul",
    "var", "video",
}
|
|
142
|
+
|
|
143
|
+
# Attribute names that may be kept on ordinary (non-MathML, non-SVG) elements.
# Names are grouped by initial letter; membership is all that matters.
acceptable_attributes = {
    "abbr", "accept", "accept-charset", "accesskey", "action", "align", "alt",
    "autocomplete", "autofocus", "axis",
    "background", "balance", "bgcolor", "bgproperties", "border",
    "bordercolor", "bordercolordark", "bordercolorlight", "bottompadding",
    "cellpadding", "cellspacing", "ch", "challenge", "char", "charoff",
    "charset", "checked", "choff", "cite", "class", "clear", "color", "cols",
    "colspan", "compact", "contenteditable", "controls", "coords",
    "data", "datafld", "datapagesize", "datasrc", "datetime", "default",
    "delay", "dir", "disabled", "draggable", "dynsrc",
    "enctype", "end",
    "face", "for", "form", "frame",
    "galleryimg", "gutter",
    "headers", "height", "hidden", "hidefocus", "high", "href", "hreflang",
    "hspace",
    "icon", "id", "inputmode", "ismap",
    "keytype",
    "label", "lang", "leftspacing", "list", "longdesc", "loop", "loopcount",
    "loopend", "loopstart", "low", "lowsrc",
    "max", "maxlength", "media", "method", "min", "multiple",
    "name", "nohref", "noshade", "nowrap",
    "open", "optimum",
    "pattern", "ping", "point-size", "poster", "pqg", "preload", "prompt",
    "radiogroup", "readonly", "rel", "repeat-max", "repeat-min", "replace",
    "required", "rev", "rightspacing", "rows", "rowspan", "rules",
    "scope", "selected", "shape", "size", "span", "src", "srcset", "start",
    "step", "style", "summary", "suppress",
    "tabindex", "target", "template", "title", "toppadding", "type",
    "unselectable", "urn", "usemap",
    "valign", "value", "variable", "volume", "vrml", "vspace",
    "width", "wrap",
    "xml:lang",
}
|
|
287
|
+
|
|
288
|
+
# Elements whose text content is suppressed as well as the tags themselves:
# unknown_starttag / unknown_endtag count them in ``unacceptablestack``.
unacceptable_elements_with_end_tag = {"applet", "script", "style"}
|
|
293
|
+
|
|
294
|
+
# CSS property names that sanitize_style keeps without inspecting the value.
acceptable_css_properties = {
    "azimuth",
    "background-color",
    "border-bottom-color", "border-collapse", "border-color",
    "border-left-color", "border-right-color", "border-top-color",
    "clear", "color", "cursor",
    "direction", "display",
    "elevation",
    "float", "font", "font-family", "font-size", "font-style", "font-variant",
    "font-weight",
    "height",
    "letter-spacing", "line-height",
    "overflow",
    "pause", "pause-after", "pause-before", "pitch", "pitch-range",
    "richness",
    "speak", "speak-header", "speak-numeral", "speak-punctuation",
    "speech-rate", "stress",
    "text-align", "text-decoration", "text-indent",
    "unicode-bidi",
    "vertical-align", "voice-family", "volume",
    "white-space", "width",
}
|
|
342
|
+
|
|
343
|
+
# survey of common keywords found in feeds
# (sanitize_style accepts these as values of background/border/margin/padding
# style properties)
acceptable_css_keywords = {
    "!important",
    "aqua", "auto",
    "black", "block", "blue", "bold", "both", "bottom", "brown",
    "center", "collapse",
    "dashed", "dotted",
    "fuchsia",
    "gray", "green",
    "italic",
    "left", "lime",
    "maroon", "medium",
    "navy", "none", "normal", "nowrap",
    "olive",
    "pointer", "purple",
    "red", "right",
    "silver", "solid",
    "teal", "top", "transparent",
    "underline",
    "white",
    "yellow",
}
|
|
385
|
+
|
|
386
|
+
# Whole-token matcher for CSS values that are not plain keywords.
# The three alternatives are joined with "|" inside one anchored group.
valid_css_values = re.compile(
    "^("
    + "|".join(
        [
            r"#[0-9a-f]+",  # Hex values
            r"rgb\(\d+%?,\d*%?,?\d*%?\)?",  # RGB values
            r"\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?",  # Sizes/widths
        ]
    )
    + ")$"
)
|
|
393
|
+
|
|
394
|
+
# Element names accepted while inside a namespaced <math> root.
mathml_elements = {
    "annotation", "annotation-xml",
    "maction", "maligngroup", "malignmark", "math", "menclose", "merror",
    "mfenced", "mfrac", "mglyph", "mi", "mlabeledtr", "mlongdiv",
    "mmultiscripts", "mn", "mo", "mover", "mpadded", "mphantom",
    "mprescripts", "mroot", "mrow", "ms", "mscarries", "mscarry", "msgroup",
    "msline", "mspace", "msqrt", "msrow", "mstack", "mstyle", "msub",
    "msubsup", "msup", "mtable", "mtd", "mtext", "mtr", "munder",
    "munderover",
    "none",
    "semantics",
}
|
|
440
|
+
|
|
441
|
+
# Attribute names kept on MathML elements (used instead of
# acceptable_attributes when the current tag is a MathML element).
mathml_attributes = {
    "accent", "accentunder", "actiontype", "align", "alignmentscope",
    "altimg", "altimg-height", "altimg-valign", "altimg-width", "alttext",
    "bevelled",
    "charalign", "close", "columnalign", "columnlines", "columnspacing",
    "columnspan", "columnwidth", "crossout",
    "decimalpoint", "denomalign", "depth", "dir", "display", "displaystyle",
    "edge", "encoding", "equalcolumns", "equalrows",
    "fence", "fontstyle", "fontweight", "form", "frame", "framespacing",
    "groupalign",
    "height", "href",
    "id", "indentalign", "indentalignfirst", "indentalignlast", "indentshift",
    "indentshiftfirst", "indentshiftlast", "indenttarget",
    "infixlinebreakstyle",
    "largeop", "length", "linebreak", "linebreakmultchar", "linebreakstyle",
    "lineleading", "linethickness", "location", "longdivstyle", "lquote",
    "lspace",
    "mathbackground", "mathcolor", "mathsize", "mathvariant", "maxsize",
    "minlabelspacing", "minsize", "movablelimits",
    "notation", "numalign",
    "open", "other", "overflow",
    "position",
    "rowalign", "rowlines", "rowspacing", "rowspan", "rquote", "rspace",
    "scriptlevel", "scriptminsize", "scriptsizemultiplier", "selection",
    "separator", "separators", "shift", "side", "src", "stackalign",
    "stretchy", "subscriptshift", "superscriptshift", "symmetric",
    "voffset",
    "width",
    "xlink:href", "xlink:show", "xlink:type", "xmlns", "xmlns:xlink",
}
|
|
542
|
+
|
|
543
|
+
# svgtiny - foreignObject + linearGradient + radialGradient + stop
# Names are stored in their canonical mixed-case spelling; unknown_starttag
# derives the lowercase lookup from this set the first time SVG is seen.
svg_elements = {
    "a", "animate", "animateColor", "animateMotion", "animateTransform",
    "circle",
    "defs", "desc",
    "ellipse",
    "font-face", "font-face-name", "font-face-src", "foreignObject",
    "g", "glyph",
    "hkern",
    "line", "linearGradient",
    "marker", "metadata", "missing-glyph", "mpath",
    "path", "polygon", "polyline",
    "radialGradient", "rect",
    "set", "stop", "svg", "switch",
    "text", "title", "tspan",
    "use",
}
|
|
581
|
+
|
|
582
|
+
# svgtiny + class + opacity + offset + xmlns + xmlns:xlink
# Names are stored in their canonical mixed-case spelling; unknown_starttag
# derives the lowercase lookup from this set the first time SVG is seen.
svg_attributes = {
    "accent-height", "accumulate", "additive", "alphabetic", "arabic-form",
    "ascent", "attributeName", "attributeType",
    "baseProfile", "bbox", "begin", "by",
    "calcMode", "cap-height", "class", "color", "color-rendering", "content",
    "cx", "cy",
    "d", "descent", "display", "dur", "dx", "dy",
    "end",
    "fill", "fill-opacity", "fill-rule", "font-family", "font-size",
    "font-stretch", "font-style", "font-variant", "font-weight", "from",
    "fx", "fy",
    "g1", "g2", "glyph-name", "gradientUnits",
    "hanging", "height", "horiz-adv-x", "horiz-origin-x",
    "id", "ideographic",
    "k", "keyPoints", "keySplines", "keyTimes",
    "lang",
    "marker-end", "marker-mid", "marker-start", "markerHeight", "markerUnits",
    "markerWidth", "mathematical", "max", "min",
    "name",
    "offset", "opacity", "orient", "origin", "overline-position",
    "overline-thickness",
    "panose-1", "path", "pathLength", "points", "preserveAspectRatio",
    "r", "refX", "refY", "repeatCount", "repeatDur", "requiredExtensions",
    "requiredFeatures", "restart", "rotate", "rx", "ry",
    "slope", "stemh", "stemv", "stop-color", "stop-opacity",
    "strikethrough-position", "strikethrough-thickness", "stroke",
    "stroke-dasharray", "stroke-dashoffset", "stroke-linecap",
    "stroke-linejoin", "stroke-miterlimit", "stroke-opacity", "stroke-width",
    "systemLanguage",
    "target", "text-anchor", "to", "transform", "type",
    "u1", "u2", "underline-position", "underline-thickness", "unicode",
    "unicode-range", "units-per-em",
    "values", "version", "viewBox", "visibility",
    "width", "widths",
    "x", "x-height", "x1", "x2", "xlink:actuate", "xlink:arcrole",
    "xlink:href", "xlink:role", "xlink:show", "xlink:title", "xlink:type",
    "xml:base", "xml:lang", "xml:space", "xmlns", "xmlns:xlink",
    "y", "y1", "y2",
    "zoomAndPan",
}
|
|
725
|
+
|
|
726
|
+
# Lowercase -> canonical-case lookups for SVG attribute and element names.
# Both start unset and are filled in by unknown_starttag on first SVG use.
svg_attr_map = svg_elem_map = None
|
|
728
|
+
|
|
729
|
+
# Style properties additionally allowed by sanitize_style while inside SVG.
acceptable_svg_properties = {
    "fill", "fill-opacity", "fill-rule",
    "stroke", "stroke-linecap", "stroke-linejoin", "stroke-opacity",
    "stroke-width",
}
|
|
739
|
+
|
|
740
|
+
def __init__(self, encoding=None, _type="application/xhtml+xml"):
    """Initialise the base processor, then zero the sanitizer counters.

    ``encoding`` and ``_type`` are passed straight through to the base class.
    """
    super().__init__(encoding, _type)

    # unacceptablestack: depth inside applet/script/style elements.
    # mathmlOK / svgOK: depth inside namespaced <math> / <svg> roots.
    self.unacceptablestack = self.mathmlOK = self.svgOK = 0
|
|
746
|
+
|
|
747
|
+
def reset(self):
    """Reset the base processor and return all nesting counters to zero."""
    super().reset()
    for counter in ("unacceptablestack", "mathmlOK", "svgOK"):
        setattr(self, counter, 0)
|
|
752
|
+
|
|
753
|
+
def unknown_starttag(self, tag, attrs):
|
|
754
|
+
acceptable_attributes = self.acceptable_attributes
|
|
755
|
+
keymap = {}
|
|
756
|
+
if tag not in self.acceptable_elements or self.svgOK:
|
|
757
|
+
if tag in self.unacceptable_elements_with_end_tag:
|
|
758
|
+
self.unacceptablestack += 1
|
|
759
|
+
|
|
760
|
+
# add implicit namespaces to html5 inline svg/mathml
|
|
761
|
+
if self._type.endswith("html"):
|
|
762
|
+
if not dict(attrs).get("xmlns"):
|
|
763
|
+
if tag == "svg":
|
|
764
|
+
attrs.append(("xmlns", "http://www.w3.org/2000/svg"))
|
|
765
|
+
if tag == "math":
|
|
766
|
+
attrs.append(("xmlns", "http://www.w3.org/1998/Math/MathML"))
|
|
767
|
+
|
|
768
|
+
# not otherwise acceptable, perhaps it is MathML or SVG?
|
|
769
|
+
if (
|
|
770
|
+
tag == "math"
|
|
771
|
+
and ("xmlns", "http://www.w3.org/1998/Math/MathML") in attrs
|
|
772
|
+
):
|
|
773
|
+
self.mathmlOK += 1
|
|
774
|
+
if tag == "svg" and ("xmlns", "http://www.w3.org/2000/svg") in attrs:
|
|
775
|
+
self.svgOK += 1
|
|
776
|
+
|
|
777
|
+
# chose acceptable attributes based on tag class, else bail
|
|
778
|
+
if self.mathmlOK and tag in self.mathml_elements:
|
|
779
|
+
acceptable_attributes = self.mathml_attributes
|
|
780
|
+
elif self.svgOK and tag in self.svg_elements:
|
|
781
|
+
# For most vocabularies, lowercasing is a good idea. Many
|
|
782
|
+
# svg elements, however, are camel case.
|
|
783
|
+
if not self.svg_attr_map:
|
|
784
|
+
lower = [attr.lower() for attr in self.svg_attributes]
|
|
785
|
+
mix = [a for a in self.svg_attributes if a not in lower]
|
|
786
|
+
self.svg_attributes = lower
|
|
787
|
+
self.svg_attr_map = {a.lower(): a for a in mix}
|
|
788
|
+
|
|
789
|
+
lower = [attr.lower() for attr in self.svg_elements]
|
|
790
|
+
mix = [a for a in self.svg_elements if a not in lower]
|
|
791
|
+
self.svg_elements = lower
|
|
792
|
+
self.svg_elem_map = {a.lower(): a for a in mix}
|
|
793
|
+
acceptable_attributes = self.svg_attributes
|
|
794
|
+
tag = self.svg_elem_map.get(tag, tag)
|
|
795
|
+
keymap = self.svg_attr_map
|
|
796
|
+
elif tag not in self.acceptable_elements:
|
|
797
|
+
return
|
|
798
|
+
|
|
799
|
+
# declare xlink namespace, if needed
|
|
800
|
+
if self.mathmlOK or self.svgOK:
|
|
801
|
+
if any(a for a in attrs if a[0].startswith("xlink:")):
|
|
802
|
+
if not ("xmlns:xlink", "http://www.w3.org/1999/xlink") in attrs:
|
|
803
|
+
attrs.append(("xmlns:xlink", "http://www.w3.org/1999/xlink"))
|
|
804
|
+
|
|
805
|
+
clean_attrs = []
|
|
806
|
+
for key, value in self.normalize_attrs(attrs):
|
|
807
|
+
if key == "style" and "style" in acceptable_attributes:
|
|
808
|
+
clean_value = self.sanitize_style(value)
|
|
809
|
+
if clean_value:
|
|
810
|
+
clean_attrs.append((key, clean_value))
|
|
811
|
+
elif key in acceptable_attributes:
|
|
812
|
+
key = keymap.get(key, key)
|
|
813
|
+
# make sure the uri uses an acceptable uri scheme
|
|
814
|
+
if key == "href":
|
|
815
|
+
value = make_safe_absolute_uri(value)
|
|
816
|
+
clean_attrs.append((key, value))
|
|
817
|
+
super().unknown_starttag(tag, clean_attrs)
|
|
818
|
+
|
|
819
|
+
def unknown_endtag(self, tag):
|
|
820
|
+
if tag not in self.acceptable_elements:
|
|
821
|
+
if tag in self.unacceptable_elements_with_end_tag:
|
|
822
|
+
self.unacceptablestack -= 1
|
|
823
|
+
if self.mathmlOK and tag in self.mathml_elements:
|
|
824
|
+
if tag == "math" and self.mathmlOK:
|
|
825
|
+
self.mathmlOK -= 1
|
|
826
|
+
elif self.svgOK and tag in self.svg_elements:
|
|
827
|
+
tag = self.svg_elem_map.get(tag, tag)
|
|
828
|
+
if tag == "svg" and self.svgOK:
|
|
829
|
+
self.svgOK -= 1
|
|
830
|
+
else:
|
|
831
|
+
return
|
|
832
|
+
super().unknown_endtag(tag)
|
|
833
|
+
|
|
834
|
+
def handle_pi(self, text):
    """Discard processing instructions; nothing is emitted for them."""
|
|
836
|
+
|
|
837
|
+
def handle_decl(self, text):
    """Discard markup declarations; nothing is emitted for them."""
|
|
839
|
+
|
|
840
|
+
def handle_data(self, text):
|
|
841
|
+
if not self.unacceptablestack:
|
|
842
|
+
super().handle_data(text)
|
|
843
|
+
|
|
844
|
+
def sanitize_style(self, style):
|
|
845
|
+
# disallow urls
|
|
846
|
+
style = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*").sub(" ", style)
|
|
847
|
+
|
|
848
|
+
# gauntlet
|
|
849
|
+
if not re.match(
|
|
850
|
+
r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""",
|
|
851
|
+
style,
|
|
852
|
+
):
|
|
853
|
+
return ""
|
|
854
|
+
# This replaced a regexp that used re.match and was prone to
|
|
855
|
+
# pathological back-tracking.
|
|
856
|
+
if re.sub(r"\s*[-\w]+\s*:\s*[^:;]*;?", "", style).strip():
|
|
857
|
+
return ""
|
|
858
|
+
|
|
859
|
+
clean = []
|
|
860
|
+
for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
|
|
861
|
+
if not value:
|
|
862
|
+
continue
|
|
863
|
+
if prop.lower() in self.acceptable_css_properties:
|
|
864
|
+
clean.append(prop + ": " + value + ";")
|
|
865
|
+
elif prop.split("-")[0].lower() in [
|
|
866
|
+
"background",
|
|
867
|
+
"border",
|
|
868
|
+
"margin",
|
|
869
|
+
"padding",
|
|
870
|
+
]:
|
|
871
|
+
for keyword in value.split():
|
|
872
|
+
if (
|
|
873
|
+
keyword not in self.acceptable_css_keywords
|
|
874
|
+
and not self.valid_css_values.match(keyword)
|
|
875
|
+
):
|
|
876
|
+
break
|
|
877
|
+
else:
|
|
878
|
+
clean.append(prop + ": " + value + ";")
|
|
879
|
+
elif self.svgOK and prop.lower() in self.acceptable_svg_properties:
|
|
880
|
+
clean.append(prop + ": " + value + ";")
|
|
881
|
+
|
|
882
|
+
return " ".join(clean)
|
|
883
|
+
|
|
884
|
+
def parse_comment(self, i, report=1):
    """Parse the comment starting at index ``i`` of ``self.rawdata``.

    Returns the base class result when it is non-negative. Otherwise the
    comment is treated as ending at the first ``--...>`` found from
    ``i + 4`` onwards, or at the end of the buffer if there is none.
    """
    end = super().parse_comment(i, report)
    if end >= 0:
        return end
    # if end == -1, this may be a malicious attempt to circumvent
    # sanitization, or a page-destroying unclosed comment
    closer = re.compile(r"--[^>]*>").search(self.rawdata, i + 4)
    if closer is None:
        # unclosed comment; deliberately fail to handle_data()
        return len(self.rawdata)
    return closer.end()
|
|
895
|
+
|
|
896
|
+
|
|
897
|
+
def sanitize_html(html_source, encoding, _type):
    """Run ``html_source`` through ``HTMLSanitizer`` and return the result.

    ``encoding`` and ``_type`` are passed to the ``HTMLSanitizer``
    constructor. The output is stripped of surrounding whitespace and has
    ``\\r\\n`` line endings normalised to ``\\n``.
    """
    p = HTMLSanitizer(encoding, _type)
    # Escape CDATA openers so they reach the parser as literal text instead
    # of starting a marked section. (Replacing the opener with itself, as
    # this line previously did, was a no-op.)
    html_source = html_source.replace("<![CDATA[", "&lt;![CDATA[")
    p.feed(html_source)
    data = p.output()
    data = data.strip().replace("\r\n", "\n")
    return data
|
|
904
|
+
|
|
905
|
+
|
|
906
|
+
# Match XML entity declarations.
# Example: <!ENTITY copyright "(C)">
RE_ENTITY_PATTERN = re.compile(rb"^\s*<!ENTITY([^>]*?)>", re.MULTILINE)

# Match XML DOCTYPE declarations.
# Example: <!DOCTYPE feed [ ]>
RE_DOCTYPE_PATTERN = re.compile(rb"^\s*<!DOCTYPE([^>]*?)>", re.MULTILINE)

# Match safe entity declarations.
# This will allow a single numeric character reference through,
# as well as reference-free text, but not arbitrary nested entities.
# Example: cubed "&#179;"
# Example: copyright "(C)"
# Forbidden: explode1 "&explode2;&explode2;"
RE_SAFE_ENTITY_PATTERN = re.compile(rb'\s+(\w+)\s+"(&#\w+;|[^&"]*)"')


def replace_doctype(data: bytes) -> tuple[str | None, bytes, dict[str, str]]:
    """Strip and replace the DOCTYPE.

    One RSS format -- Netscape's RSS 0.91 -- is identified by its DOCTYPE.
    Therefore, this function must identify that version while replacing the
    DOCTYPE.

    As a convenience to the loose XML parser, entities are pre-computed and
    returned.

    The tuple that is returned has the following values, in order:

    1. The version extracted from the XML DOCTYPE.
       The value will either be "rss091n" or None.
    2. Binary XML content with a replaced DOCTYPE.
    3. A dictionary of entities and replacements.

    If ``data`` does not start (after optional whitespace) with ``<``, it is
    returned unchanged with no version and no entities.
    """

    # Verify this looks like an XML feed.
    if not re.match(rb"^\s*<", data):
        return None, data, {}

    # Divide the document into two groups by finding the location
    # of the first element that doesn't begin with '<?' or '<!'.
    # The head keeps that element's opening '<'.
    match = re.search(rb"<\w", data)
    first_element = match.start() + 1 if match is not None else 0
    head, data = data[:first_element], data[first_element:]

    # Save, and then remove, any ENTITY declarations.
    entity_results = RE_ENTITY_PATTERN.findall(head)
    head = RE_ENTITY_PATTERN.sub(b"", head)

    # Find the DOCTYPE declaration and check the feed type.
    doctype_results = RE_DOCTYPE_PATTERN.findall(head)
    doctype = doctype_results[0] if doctype_results else b""
    version = "rss091n" if b"netscape" in doctype.lower() else None

    # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
    replacement = b""
    if len(doctype_results) == 1 and entity_results:
        safe_entities = [e for e in entity_results if RE_SAFE_ENTITY_PATTERN.match(e)]
        if safe_entities:
            # Each captured entity body starts with its own whitespace.
            replacement = (
                b"<!DOCTYPE feed [\n<!ENTITY"
                + b">\n<!ENTITY ".join(safe_entities)
                + b">\n]>"
            )
    # Use a callable so the replacement is inserted verbatim: entity values
    # come from the document and may contain backslashes, which re.sub would
    # otherwise interpret as escapes or group references in a template.
    data = RE_DOCTYPE_PATTERN.sub(lambda _match: replacement, head) + data

    # Precompute the safe entities for the loose parser.
    # NOTE(review): values are decoded as strict UTF-8 regardless of the
    # document's encoding; non-UTF-8 entity text raises UnicodeDecodeError.
    entities = {
        k.decode("utf-8"): v.decode("utf-8")
        for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement)
    }
    return version, data, entities
|