forkparser 2026.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,978 @@
1
+ # Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
2
+ # Copyright 2002-2008 Mark Pilgrim
3
+ # All rights reserved.
4
+ #
5
+ # This file is a part of feedparser.
6
+ #
7
+ # Redistribution and use in source and binary forms, with or without
8
+ # modification, are permitted provided that the following conditions are met:
9
+ #
10
+ # * Redistributions of source code must retain the above copyright notice,
11
+ # this list of conditions and the following disclaimer.
12
+ # * Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
17
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26
+ # POSSIBILITY OF SUCH DAMAGE.
27
+
28
+ from __future__ import annotations
29
+
30
+ import re
31
+
32
+ from .html import BaseHTMLProcessor
33
+ from .urls import make_safe_absolute_uri
34
+
35
+
36
class HTMLSanitizer(BaseHTMLProcessor):
    """Allow-list based sanitizer for HTML with embedded MathML and SVG.

    Start and end tags are forwarded to the base processor only when the
    element is in one of the allow-lists below; attributes are filtered the
    same way, ``style`` values are vetted by :meth:`sanitize_style`, and text
    inside ``applet``/``script``/``style`` elements is discarded.

    The allow-lists are plain class attributes so that they can be inspected
    or overridden by subclasses.
    """

    acceptable_elements = {
        "a",
        "abbr",
        "acronym",
        "address",
        "area",
        "article",
        "aside",
        "audio",
        "b",
        "big",
        "blockquote",
        "br",
        "button",
        "canvas",
        "caption",
        "center",
        "cite",
        "code",
        "col",
        "colgroup",
        "command",
        "datagrid",
        "datalist",
        "dd",
        "del",
        "details",
        "dfn",
        "dialog",
        "dir",
        "div",
        "dl",
        "dt",
        "em",
        "event-source",
        "fieldset",
        "figcaption",
        "figure",
        "font",
        "footer",
        "form",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "hr",
        "i",
        "img",
        "input",
        "ins",
        "kbd",
        "keygen",
        "label",
        "legend",
        "li",
        "m",
        "map",
        "menu",
        "meter",
        "multicol",
        "nav",
        "nextid",
        "noscript",
        "ol",
        "optgroup",
        "option",
        "output",
        "p",
        "pre",
        "progress",
        "q",
        "rp",
        "rt",
        "ruby",
        "s",
        "samp",
        "section",
        "select",
        "small",
        "sound",
        "source",
        "spacer",
        "span",
        "strike",
        "strong",
        "sub",
        "sup",
        "table",
        "tbody",
        "td",
        "textarea",
        "tfoot",
        "th",
        "thead",
        "time",
        "tr",
        "tt",
        "u",
        "ul",
        "var",
        "video",
    }

    acceptable_attributes = {
        "abbr",
        "accept",
        "accept-charset",
        "accesskey",
        "action",
        "align",
        "alt",
        "autocomplete",
        "autofocus",
        "axis",
        "background",
        "balance",
        "bgcolor",
        "bgproperties",
        "border",
        "bordercolor",
        "bordercolordark",
        "bordercolorlight",
        "bottompadding",
        "cellpadding",
        "cellspacing",
        "ch",
        "challenge",
        "char",
        "charoff",
        "charset",
        "checked",
        "choff",
        "cite",
        "class",
        "clear",
        "color",
        "cols",
        "colspan",
        "compact",
        "contenteditable",
        "controls",
        "coords",
        "data",
        "datafld",
        "datapagesize",
        "datasrc",
        "datetime",
        "default",
        "delay",
        "dir",
        "disabled",
        "draggable",
        "dynsrc",
        "enctype",
        "end",
        "face",
        "for",
        "form",
        "frame",
        "galleryimg",
        "gutter",
        "headers",
        "height",
        "hidden",
        "hidefocus",
        "high",
        "href",
        "hreflang",
        "hspace",
        "icon",
        "id",
        "inputmode",
        "ismap",
        "keytype",
        "label",
        "lang",
        "leftspacing",
        "list",
        "longdesc",
        "loop",
        "loopcount",
        "loopend",
        "loopstart",
        "low",
        "lowsrc",
        "max",
        "maxlength",
        "media",
        "method",
        "min",
        "multiple",
        "name",
        "nohref",
        "noshade",
        "nowrap",
        "open",
        "optimum",
        "pattern",
        "ping",
        "point-size",
        "poster",
        "pqg",
        "preload",
        "prompt",
        "radiogroup",
        "readonly",
        "rel",
        "repeat-max",
        "repeat-min",
        "replace",
        "required",
        "rev",
        "rightspacing",
        "rows",
        "rowspan",
        "rules",
        "scope",
        "selected",
        "shape",
        "size",
        "span",
        "src",
        "srcset",
        "start",
        "step",
        "style",
        "summary",
        "suppress",
        "tabindex",
        "target",
        "template",
        "title",
        "toppadding",
        "type",
        "unselectable",
        "urn",
        "usemap",
        "valign",
        "value",
        "variable",
        "volume",
        "vrml",
        "vspace",
        "width",
        "wrap",
        "xml:lang",
    }

    # Elements that are dropped together with the text they enclose.
    unacceptable_elements_with_end_tag = {
        "applet",
        "script",
        "style",
    }

    acceptable_css_properties = {
        "azimuth",
        "background-color",
        "border-bottom-color",
        "border-collapse",
        "border-color",
        "border-left-color",
        "border-right-color",
        "border-top-color",
        "clear",
        "color",
        "cursor",
        "direction",
        "display",
        "elevation",
        "float",
        "font",
        "font-family",
        "font-size",
        "font-style",
        "font-variant",
        "font-weight",
        "height",
        "letter-spacing",
        "line-height",
        "overflow",
        "pause",
        "pause-after",
        "pause-before",
        "pitch",
        "pitch-range",
        "richness",
        "speak",
        "speak-header",
        "speak-numeral",
        "speak-punctuation",
        "speech-rate",
        "stress",
        "text-align",
        "text-decoration",
        "text-indent",
        "unicode-bidi",
        "vertical-align",
        "voice-family",
        "volume",
        "white-space",
        "width",
    }

    # survey of common keywords found in feeds
    acceptable_css_keywords = {
        "!important",
        "aqua",
        "auto",
        "black",
        "block",
        "blue",
        "bold",
        "both",
        "bottom",
        "brown",
        "center",
        "collapse",
        "dashed",
        "dotted",
        "fuchsia",
        "gray",
        "green",
        "italic",
        "left",
        "lime",
        "maroon",
        "medium",
        "navy",
        "none",
        "normal",
        "nowrap",
        "olive",
        "pointer",
        "purple",
        "red",
        "right",
        "silver",
        "solid",
        "teal",
        "top",
        "transparent",
        "underline",
        "white",
        "yellow",
    }

    valid_css_values = re.compile(
        r"^("
        r"#[0-9a-f]+"  # Hex values
        r"|rgb\(\d+%?,\d*%?,?\d*%?\)?"  # RGB values
        r"|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?"  # Sizes/widths
        r")$"
    )

    mathml_elements = {
        "annotation",
        "annotation-xml",
        "maction",
        "maligngroup",
        "malignmark",
        "math",
        "menclose",
        "merror",
        "mfenced",
        "mfrac",
        "mglyph",
        "mi",
        "mlabeledtr",
        "mlongdiv",
        "mmultiscripts",
        "mn",
        "mo",
        "mover",
        "mpadded",
        "mphantom",
        "mprescripts",
        "mroot",
        "mrow",
        "ms",
        "mscarries",
        "mscarry",
        "msgroup",
        "msline",
        "mspace",
        "msqrt",
        "msrow",
        "mstack",
        "mstyle",
        "msub",
        "msubsup",
        "msup",
        "mtable",
        "mtd",
        "mtext",
        "mtr",
        "munder",
        "munderover",
        "none",
        "semantics",
    }

    mathml_attributes = {
        "accent",
        "accentunder",
        "actiontype",
        "align",
        "alignmentscope",
        "altimg",
        "altimg-height",
        "altimg-valign",
        "altimg-width",
        "alttext",
        "bevelled",
        "charalign",
        "close",
        "columnalign",
        "columnlines",
        "columnspacing",
        "columnspan",
        "columnwidth",
        "crossout",
        "decimalpoint",
        "denomalign",
        "depth",
        "dir",
        "display",
        "displaystyle",
        "edge",
        "encoding",
        "equalcolumns",
        "equalrows",
        "fence",
        "fontstyle",
        "fontweight",
        "form",
        "frame",
        "framespacing",
        "groupalign",
        "height",
        "href",
        "id",
        "indentalign",
        "indentalignfirst",
        "indentalignlast",
        "indentshift",
        "indentshiftfirst",
        "indentshiftlast",
        "indenttarget",
        "infixlinebreakstyle",
        "largeop",
        "length",
        "linebreak",
        "linebreakmultchar",
        "linebreakstyle",
        "lineleading",
        "linethickness",
        "location",
        "longdivstyle",
        "lquote",
        "lspace",
        "mathbackground",
        "mathcolor",
        "mathsize",
        "mathvariant",
        "maxsize",
        "minlabelspacing",
        "minsize",
        "movablelimits",
        "notation",
        "numalign",
        "open",
        "other",
        "overflow",
        "position",
        "rowalign",
        "rowlines",
        "rowspacing",
        "rowspan",
        "rquote",
        "rspace",
        "scriptlevel",
        "scriptminsize",
        "scriptsizemultiplier",
        "selection",
        "separator",
        "separators",
        "shift",
        "side",
        "src",
        "stackalign",
        "stretchy",
        "subscriptshift",
        "superscriptshift",
        "symmetric",
        "voffset",
        "width",
        "xlink:href",
        "xlink:show",
        "xlink:type",
        "xmlns",
        "xmlns:xlink",
    }

    # svgtiny - foreignObject + linearGradient + radialGradient + stop
    svg_elements = {
        "a",
        "animate",
        "animateColor",
        "animateMotion",
        "animateTransform",
        "circle",
        "defs",
        "desc",
        "ellipse",
        "font-face",
        "font-face-name",
        "font-face-src",
        "foreignObject",
        "g",
        "glyph",
        "hkern",
        "line",
        "linearGradient",
        "marker",
        "metadata",
        "missing-glyph",
        "mpath",
        "path",
        "polygon",
        "polyline",
        "radialGradient",
        "rect",
        "set",
        "stop",
        "svg",
        "switch",
        "text",
        "title",
        "tspan",
        "use",
    }

    # svgtiny + class + opacity + offset + xmlns + xmlns:xlink
    svg_attributes = {
        "accent-height",
        "accumulate",
        "additive",
        "alphabetic",
        "arabic-form",
        "ascent",
        "attributeName",
        "attributeType",
        "baseProfile",
        "bbox",
        "begin",
        "by",
        "calcMode",
        "cap-height",
        "class",
        "color",
        "color-rendering",
        "content",
        "cx",
        "cy",
        "d",
        "descent",
        "display",
        "dur",
        "dx",
        "dy",
        "end",
        "fill",
        "fill-opacity",
        "fill-rule",
        "font-family",
        "font-size",
        "font-stretch",
        "font-style",
        "font-variant",
        "font-weight",
        "from",
        "fx",
        "fy",
        "g1",
        "g2",
        "glyph-name",
        "gradientUnits",
        "hanging",
        "height",
        "horiz-adv-x",
        "horiz-origin-x",
        "id",
        "ideographic",
        "k",
        "keyPoints",
        "keySplines",
        "keyTimes",
        "lang",
        "marker-end",
        "marker-mid",
        "marker-start",
        "markerHeight",
        "markerUnits",
        "markerWidth",
        "mathematical",
        "max",
        "min",
        "name",
        "offset",
        "opacity",
        "orient",
        "origin",
        "overline-position",
        "overline-thickness",
        "panose-1",
        "path",
        "pathLength",
        "points",
        "preserveAspectRatio",
        "r",
        "refX",
        "refY",
        "repeatCount",
        "repeatDur",
        "requiredExtensions",
        "requiredFeatures",
        "restart",
        "rotate",
        "rx",
        "ry",
        "slope",
        "stemh",
        "stemv",
        "stop-color",
        "stop-opacity",
        "strikethrough-position",
        "strikethrough-thickness",
        "stroke",
        "stroke-dasharray",
        "stroke-dashoffset",
        "stroke-linecap",
        "stroke-linejoin",
        "stroke-miterlimit",
        "stroke-opacity",
        "stroke-width",
        "systemLanguage",
        "target",
        "text-anchor",
        "to",
        "transform",
        "type",
        "u1",
        "u2",
        "underline-position",
        "underline-thickness",
        "unicode",
        "unicode-range",
        "units-per-em",
        "values",
        "version",
        "viewBox",
        "visibility",
        "width",
        "widths",
        "x",
        "x-height",
        "x1",
        "x2",
        "xlink:actuate",
        "xlink:arcrole",
        "xlink:href",
        "xlink:role",
        "xlink:show",
        "xlink:title",
        "xlink:type",
        "xml:base",
        "xml:lang",
        "xml:space",
        "xmlns",
        "xmlns:xlink",
        "y",
        "y1",
        "y2",
        "zoomAndPan",
    }

    # Lower-case -> mixed-case lookup tables, built lazily by
    # _build_svg_maps() the first time an SVG element is accepted.
    svg_attr_map = None
    svg_elem_map = None

    acceptable_svg_properties = {
        "fill",
        "fill-opacity",
        "fill-rule",
        "stroke",
        "stroke-linecap",
        "stroke-linejoin",
        "stroke-opacity",
        "stroke-width",
    }

    # Patterns used by sanitize_style() and parse_comment(), compiled once.
    _css_url_pattern = re.compile(r"url\s*\(\s*[^\s)]+?\s*\)\s*")
    _css_gauntlet_pattern = re.compile(
        r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$"""
    )
    _css_declaration_pattern = re.compile(r"\s*[-\w]+\s*:\s*[^:;]*;?")
    _css_property_pattern = re.compile(r"([-\w]+)\s*:\s*([^:;]*)")
    _comment_end_pattern = re.compile(r"--[^>]*>")

    # Shorthand property families whose values are checked keyword by keyword.
    _css_shorthand_prefixes = ("background", "border", "margin", "padding")

    def __init__(self, encoding=None, _type="application/xhtml+xml"):
        """Create a sanitizer; both arguments are forwarded to the base class."""
        super().__init__(encoding, _type)

        # Nesting depth of applet/script/style elements currently open.
        self.unacceptablestack = 0
        # Nesting depth of namespaced <math> / <svg> elements currently open.
        self.mathmlOK = 0
        self.svgOK = 0

    def reset(self):
        """Reset the base processor and clear all nesting counters."""
        super().reset()
        self.unacceptablestack = 0
        self.mathmlOK = 0
        self.svgOK = 0

    def _build_svg_maps(self):
        """Lower-case the SVG allow-lists and remember the mixed-case spellings.

        Incoming names are matched against the lower-cased allow-lists; the
        maps translate a lower-case name back to its camelCase form. The
        results are stored on the instance as sets and dicts.
        """
        attributes = self.svg_attributes
        elements = self.svg_elements
        self.svg_attr_map = {a.lower(): a for a in attributes if a != a.lower()}
        self.svg_attributes = {a.lower() for a in attributes}
        self.svg_elem_map = {e.lower(): e for e in elements if e != e.lower()}
        self.svg_elements = {e.lower() for e in elements}

    def unknown_starttag(self, tag, attrs):
        """Forward an allowed start tag with only its allowed attributes.

        :param tag: element name as delivered by the parser.
        :param attrs: iterable of ``(name, value)`` pairs; it is not modified.

        Tags outside every allow-list are dropped (nothing is forwarded to the
        base class). Entering ``applet``/``script``/``style`` bumps
        ``unacceptablestack`` so that their text is suppressed.
        """
        # Work on a copy so the caller's attribute list is never mutated.
        attrs = list(attrs)
        acceptable_attributes = self.acceptable_attributes
        keymap = {}
        if tag not in self.acceptable_elements or self.svgOK:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1

            # add implicit namespaces to html5 inline svg/mathml
            if self._type.endswith("html"):
                if not dict(attrs).get("xmlns"):
                    if tag == "svg":
                        attrs.append(("xmlns", "http://www.w3.org/2000/svg"))
                    if tag == "math":
                        attrs.append(("xmlns", "http://www.w3.org/1998/Math/MathML"))

            # not otherwise acceptable, perhaps it is MathML or SVG?
            if (
                tag == "math"
                and ("xmlns", "http://www.w3.org/1998/Math/MathML") in attrs
            ):
                self.mathmlOK += 1
            if tag == "svg" and ("xmlns", "http://www.w3.org/2000/svg") in attrs:
                self.svgOK += 1

            # choose acceptable attributes based on tag class, else bail
            if self.mathmlOK and tag in self.mathml_elements:
                acceptable_attributes = self.mathml_attributes
            elif self.svgOK and tag in self.svg_elements:
                # For most vocabularies, lowercasing is a good idea. Many
                # svg elements, however, are camel case.
                if not self.svg_attr_map:
                    self._build_svg_maps()
                acceptable_attributes = self.svg_attributes
                tag = self.svg_elem_map.get(tag, tag)
                keymap = self.svg_attr_map
            elif tag not in self.acceptable_elements:
                return

        # declare xlink namespace, if needed
        if self.mathmlOK or self.svgOK:
            if any(name.startswith("xlink:") for name, _ in attrs):
                if ("xmlns:xlink", "http://www.w3.org/1999/xlink") not in attrs:
                    attrs.append(("xmlns:xlink", "http://www.w3.org/1999/xlink"))

        clean_attrs = []
        for key, value in self.normalize_attrs(attrs):
            if key == "style" and "style" in acceptable_attributes:
                clean_value = self.sanitize_style(value)
                if clean_value:
                    clean_attrs.append((key, clean_value))
            elif key in acceptable_attributes:
                key = keymap.get(key, key)
                # make sure the uri uses an acceptable uri scheme
                # NOTE(review): only "href" is checked here; other URI-bearing
                # attributes such as "src" and "xlink:href" pass through
                # unchanged -- confirm they are vetted elsewhere.
                if key == "href":
                    value = make_safe_absolute_uri(value)
                clean_attrs.append((key, value))
        super().unknown_starttag(tag, clean_attrs)

    def unknown_endtag(self, tag):
        """Forward an allowed end tag and unwind the nesting counters.

        End tags for elements outside every allow-list are dropped.
        """
        if tag not in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                # Never go below zero: a stray end tag (e.g. a lone
                # "</script>") must not leave a negative depth behind, which
                # would suppress all later text and let the text of the next
                # real <script> through once the counter returned to zero.
                if self.unacceptablestack > 0:
                    self.unacceptablestack -= 1
            if self.mathmlOK and tag in self.mathml_elements:
                if tag == "math":
                    self.mathmlOK -= 1
            elif self.svgOK and tag in self.svg_elements:
                tag = self.svg_elem_map.get(tag, tag)
                if tag == "svg":
                    self.svgOK -= 1
            else:
                return
        super().unknown_endtag(tag)

    def handle_pi(self, text):
        """Discard processing instructions."""
        pass

    def handle_decl(self, text):
        """Discard declarations."""
        pass

    def handle_data(self, text):
        """Forward text unless it sits inside an applet/script/style element."""
        if not self.unacceptablestack:
            super().handle_data(text)

    def sanitize_style(self, style):
        """Return the safe subset of an inline ``style`` attribute value.

        :param style: raw attribute value.
        :returns: the accepted declarations, each rendered as
            ``"prop: value;"`` and joined with single spaces, or ``""`` when
            the value fails the character or structure checks.
        """
        # disallow urls
        style = self._css_url_pattern.sub(" ", style)

        # gauntlet
        if not self._css_gauntlet_pattern.match(style):
            return ""
        # This replaced a regexp that used re.match and was prone to
        # pathological back-tracking.
        if self._css_declaration_pattern.sub("", style).strip():
            return ""

        clean = []
        for prop, value in self._css_property_pattern.findall(style):
            if not value:
                continue
            name = prop.lower()
            if name in self.acceptable_css_properties:
                clean.append(prop + ": " + value + ";")
            elif name.split("-")[0] in self._css_shorthand_prefixes:
                # Shorthand families are accepted only when every
                # whitespace-separated token is a known keyword or a
                # recognised value.
                if all(
                    keyword in self.acceptable_css_keywords
                    or self.valid_css_values.match(keyword)
                    for keyword in value.split()
                ):
                    clean.append(prop + ": " + value + ";")
            elif self.svgOK and name in self.acceptable_svg_properties:
                clean.append(prop + ": " + value + ";")

        return " ".join(clean)

    def parse_comment(self, i, report=1):
        """Parse a comment at offset ``i``, tolerating malformed terminators.

        :returns: the offset just past the comment. When the base parser
            cannot find the end, the comment is cut at the next ``--...>``
            sequence, or else the rest of the input is consumed.
        """
        ret = super().parse_comment(i, report)
        if ret >= 0:
            return ret
        # if ret == -1, this may be a malicious attempt to circumvent
        # sanitization, or a page-destroying unclosed comment
        match = self._comment_end_pattern.search(self.rawdata, i + 4)
        if match:
            return match.end()
        # unclosed comment; deliberately fail to handle_data()
        return len(self.rawdata)
895
+
896
+
897
def sanitize_html(html_source, encoding, _type):
    """Run ``html_source`` through :class:`HTMLSanitizer` and return the result.

    ``encoding`` and ``_type`` are handed to the sanitizer's constructor.
    The returned text is stripped of surrounding whitespace and uses ``\\n``
    in place of ``\\r\\n``.
    """
    sanitizer = HTMLSanitizer(encoding, _type)
    # Escape CDATA openers so they are fed to the parser as literal text.
    escaped_source = html_source.replace("<![CDATA[", "&lt;![CDATA[")
    sanitizer.feed(escaped_source)
    return sanitizer.output().strip().replace("\r\n", "\n")
904
+
905
+
906
+ # Match XML entity declarations.
907
+ # Example: <!ENTITY copyright "(C)">
908
+ RE_ENTITY_PATTERN = re.compile(rb"^\s*<!ENTITY([^>]*?)>", re.MULTILINE)
909
+
910
+ # Match XML DOCTYPE declarations.
911
+ # Example: <!DOCTYPE feed [ ]>
912
+ RE_DOCTYPE_PATTERN = re.compile(rb"^\s*<!DOCTYPE([^>]*?)>", re.MULTILINE)
913
+
914
+ # Match safe entity declarations.
915
+ # This will allow hexadecimal character references through,
916
+ # as well as text, but not arbitrary nested entities.
917
+ # Example: cubed "&#179;"
918
+ # Example: copyright "(C)"
919
+ # Forbidden: explode1 "&explode2;&explode2;"
920
+ RE_SAFE_ENTITY_PATTERN = re.compile(rb'\s+(\w+)\s+"(&#\w+;|[^&"]*)"')
921
+
922
+
923
+ def replace_doctype(data: bytes) -> tuple[str | None, bytes, dict[str, str]]:
924
+ """Strip and replaces the DOCTYPE.
925
+
926
+ One RSS format -- Netscape's RSS 0.91 -- is identified within the XML declaration.
927
+ Therefore, this function must identify that version while replacing the DOCTYPE.
928
+
929
+ As a convenience to the loose XML parser, entities are pre-computed and returned.
930
+
931
+ The tuple that is returned has the following values, in order:
932
+
933
+ 1. The version extracted from the XML DOCTYPE.
934
+ The value will either be "rss091n" or None.
935
+ 2. Binary XML content with a replaced DOCTYPE.
936
+ 3. A dictionary of entities and replacements.
937
+ """
938
+
939
+ # Verify this looks like an XML feed.
940
+ if not re.match(rb"^\s*<", data):
941
+ return None, data, {}
942
+
943
+ # Divide the document into two groups by finding the location
944
+ # of the first element that doesn't begin with '<?' or '<!'.
945
+ match = re.search(rb"<\w", data)
946
+ first_element = match.start() + 1 if match is not None else 0
947
+ head, data = data[:first_element], data[first_element:]
948
+
949
+ # Save, and then remove, any ENTITY declarations.
950
+ entity_results = RE_ENTITY_PATTERN.findall(head)
951
+ head = RE_ENTITY_PATTERN.sub(b"", head)
952
+
953
+ # Find the DOCTYPE declaration and check the feed type.
954
+ doctype_results = RE_DOCTYPE_PATTERN.findall(head)
955
+ doctype = doctype_results and doctype_results[0] or b""
956
+ if b"netscape" in doctype.lower():
957
+ version = "rss091n"
958
+ else:
959
+ version = None
960
+
961
+ # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
962
+ replacement = b""
963
+ if len(doctype_results) == 1 and entity_results:
964
+ safe_entities = [e for e in entity_results if RE_SAFE_ENTITY_PATTERN.match(e)]
965
+ if safe_entities:
966
+ replacement = (
967
+ b"<!DOCTYPE feed [\n<!ENTITY"
968
+ + b">\n<!ENTITY ".join(safe_entities)
969
+ + b">\n]>"
970
+ )
971
+ data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data
972
+
973
+ # Precompute the safe entities for the loose parser.
974
+ entities = {
975
+ k.decode("utf-8"): v.decode("utf-8")
976
+ for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement)
977
+ }
978
+ return version, data, entities