forkparser 2026.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
feedparser/mixin.py ADDED
@@ -0,0 +1,838 @@
1
+ # Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
2
+ # Copyright 2002-2008 Mark Pilgrim
3
+ # All rights reserved.
4
+ #
5
+ # This file is a part of feedparser.
6
+ #
7
+ # Redistribution and use in source and binary forms, with or without
8
+ # modification, are permitted provided that the following conditions are met:
9
+ #
10
+ # * Redistributions of source code must retain the above copyright notice,
11
+ # this list of conditions and the following disclaimer.
12
+ # * Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
17
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26
+ # POSSIBILITY OF SUCH DAMAGE.
27
+
28
+ import base64
29
+ import binascii
30
+ import copy
31
+ import html.entities
32
+ import re
33
+ import xml.sax.saxutils
34
+
35
+ from .html import _cp1252
36
+ from .namespaces import _base, cc, dc, georss, itunes, mediarss, psc
37
+ from .sanitizer import HTMLSanitizer, sanitize_html
38
+ from .urls import _urljoin, make_safe_absolute_uri, resolve_relative_uris
39
+ from .util import FeedParserDict
40
+
41
# Finds an e-mail address inside free-form author text.  The part after "@"
# may be a dotted host name or a dotted-quad address with an opening "["
# (the closing "]" is optional), and a trailing "?subject=..." is captured
# as part of the match.  Used by XMLParserMixin._sync_author_detail.
email_pattern = re.compile(
    r"(([a-zA-Z0-9_.+-]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)"
    r"|(([a-zA-Z0-9-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(]?))"
    r"(\?subject=\S+)?"
)
46
+
47
+
48
class XMLParserMixin(
    _base.Namespace,
    cc.Namespace,
    dc.Namespace,
    georss.Namespace,
    itunes.Namespace,
    mediarss.Namespace,
    psc.Namespace,
):
    """Parser state and element dispatch shared through the namespace mixins.

    Start and end tags are routed to ``_start_*`` / ``_end_*`` methods looked
    up by name on the instance; text is accumulated on ``elementstack`` and
    stored into the feed or entry dictionaries when an element is popped.
    """

    # Known namespace URIs mapped to the canonical prefix used when building
    # handler method names.  An empty prefix means elements in that namespace
    # are handled as if they were un-prefixed.
    namespaces = {
        "": "",
        "http://backend.userland.com/rss": "",
        "http://blogs.law.harvard.edu/tech/rss": "",
        "http://purl.org/rss/1.0/": "",
        "http://my.netscape.com/rdf/simple/0.9/": "",
        "http://example.com/newformat#": "",
        "http://example.com/necho": "",
        "http://purl.org/echo/": "",
        "uri/of/echo/namespace#": "",
        "http://purl.org/pie/": "",
        "http://purl.org/atom/ns#": "",
        "http://www.w3.org/2005/Atom": "",
        "http://purl.org/rss/1.0/modules/rss091#": "",
        "http://webns.net/mvcb/": "admin",
        "http://purl.org/rss/1.0/modules/aggregation/": "ag",
        "http://purl.org/rss/1.0/modules/annotate/": "annotate",
        "http://media.tangent.org/rss/1.0/": "audio",
        "http://backend.userland.com/blogChannelModule": "blogChannel",
        "http://creativecommons.org/ns#license": "cc",
        "http://web.resource.org/cc/": "cc",
        "http://cyber.law.harvard.edu/rss/creativeCommonsRssModule.html": (
            "creativeCommons"
        ),
        "http://backend.userland.com/creativeCommonsRssModule": "creativeCommons",
        "http://purl.org/rss/1.0/modules/company": "co",
        "http://purl.org/rss/1.0/modules/content/": "content",
        "http://my.theinfo.org/changed/1.0/rss/": "cp",
        "http://purl.org/dc/elements/1.1/": "dc",
        "http://purl.org/dc/terms/": "dcterms",
        "http://purl.org/rss/1.0/modules/email/": "email",
        "http://purl.org/rss/1.0/modules/event/": "ev",
        "http://rssnamespace.org/feedburner/ext/1.0": "feedburner",
        "http://freshmeat.net/rss/fm/": "fm",
        "http://xmlns.com/foaf/0.1/": "foaf",
        "http://www.w3.org/2003/01/geo/wgs84_pos#": "geo",
        "http://www.georss.org/georss": "georss",
        "http://www.opengis.net/gml": "gml",
        "http://postneo.com/icbm/": "icbm",
        "http://purl.org/rss/1.0/modules/image/": "image",
        "http://www.itunes.com/DTDs/PodCast-1.0.dtd": "itunes",
        "http://example.com/DTDs/PodCast-1.0.dtd": "itunes",
        "http://purl.org/rss/1.0/modules/link/": "l",
        "http://search.yahoo.com/mrss": "media",
        # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
        "http://search.yahoo.com/mrss/": "media",
        "http://madskills.com/public/xml/rss/module/pingback/": "pingback",
        "http://prismstandard.org/namespaces/1.2/basic/": "prism",
        "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
        "http://www.w3.org/2000/01/rdf-schema#": "rdfs",
        "http://purl.org/rss/1.0/modules/reference/": "ref",
        "http://purl.org/rss/1.0/modules/richequiv/": "reqv",
        "http://purl.org/rss/1.0/modules/search/": "search",
        "http://purl.org/rss/1.0/modules/slash/": "slash",
        "http://schemas.xmlsoap.org/soap/envelope/": "soap",
        "http://purl.org/rss/1.0/modules/servicestatus/": "ss",
        "http://hacks.benhammersley.com/rss/streaming/": "str",
        "http://purl.org/rss/1.0/modules/subscription/": "sub",
        "http://purl.org/rss/1.0/modules/syndication/": "sy",
        "http://schemas.pocketsoap.com/rss/myDescModule/": "szf",
        "http://purl.org/rss/1.0/modules/taxonomy/": "taxo",
        "http://purl.org/rss/1.0/modules/threading/": "thr",
        "http://purl.org/rss/1.0/modules/textinput/": "ti",
        "http://madskills.com/public/xml/rss/module/trackback/": "trackback",
        "http://wellformedweb.org/commentAPI/": "wfw",
        "http://purl.org/rss/1.0/modules/wiki/": "wiki",
        "http://www.w3.org/1999/xhtml": "xhtml",
        "http://www.w3.org/1999/xlink": "xlink",
        "http://www.w3.org/XML/1998/namespace": "xml",
        "http://podlove.org/simple-chapters": "psc",
    }
    # Lower-cased copy of `namespaces`.  It is a class-level dict that
    # __init__ fills in place the first time it finds it empty, so the table
    # is shared by every instance.
    _matchnamespaces: dict[str, str] = {}

    # Elements whose text is itself a URI; pop() resolves a non-empty value
    # against the current base URI.
    can_be_relative_uri = {
        "comments",
        "docs",
        "href",
        "icon",
        "id",
        "link",
        "logo",
        "url",
        "wfw_comment",
        "wfw_commentrss",
    }

    # Elements whose HTML-ish content has embedded relative URIs resolved by
    # pop() when `resolve_relative_uris` is enabled.
    can_contain_relative_uris = {
        "content",
        "copyright",
        "description",
        "info",
        "rights",
        "subtitle",
        "summary",
        "tagline",
        "title",
    }

    # Elements whose HTML-ish content is sanitized by pop() when
    # `sanitize_html` is enabled.
    can_contain_dangerous_markup = {
        "content",
        "copyright",
        "description",
        "info",
        "rights",
        "subtitle",
        "summary",
        "tagline",
        "title",
    }

    # Content types that pop() treats as HTML for URI resolution and
    # sanitizing.
    html_types = {
        "application/xhtml+xml",
        "text/html",
    }
171
+
172
def __init__(self):
    """Reset all per-parse state and prime the shared namespace lookup table.

    Reads ``self.lang``, which must already be set when this runs.
    """
    # The lower-cased lookup table lives on the class; build it only once.
    if not self._matchnamespaces:
        self._matchnamespaces.update(
            (uri.lower(), prefix) for uri, prefix in self.namespaces.items()
        )

    # Parse results.
    self.feeddata = FeedParserDict()  # feed-level data
    self.entries = []  # list of entry-level data
    self.version = ""  # feed type/version, see SUPPORTED_VERSIONS
    self.namespaces_in_use = {}  # dictionary of namespaces defined by the feed

    # Post-processing switches consulted when an element is popped.
    self.resolve_relative_uris = False
    self.sanitize_html = False

    # the following are used internally to track state;
    # this is really out of control and should be refactored
    self.infeed = self.inentry = self.incontent = 0
    self.intextinput = self.inimage = 0
    self.inauthor = self.incontributor = self.inpublisher = 0
    self.insource = self.isentrylink = 0

    self.sourcedata = FeedParserDict()
    self.contentparams = FeedParserDict()
    self._summaryKey = None
    self.namespacemap = {}

    # Stacks that grow and shrink with element nesting.
    self.elementstack = []
    self.basestack = []
    self.langstack = []

    self.svgOK = 0
    self.title_depth = -1
    self.depth = 0
    self.hasContent = 0

    initial_lang = self.lang
    if initial_lang:
        self.feeddata["language"] = initial_lang.replace("_", "-")

    # A map of the following form:
    #     {
    #         object_that_value_is_set_on: {
    #             property_name: depth_of_node_property_was_extracted_from,
    #             other_property: depth_of_node_property_was_extracted_from,
    #         },
    #     }
    self.property_depth_map = {}

    super().__init__()
219
+
220
+ def _normalize_attributes(self, kv):
221
+ raise NotImplementedError
222
+
223
def unknown_starttag(self, tag, attrs):
    """Handle an opening tag.

    Updates the depth counter, the xml:base / xml:lang stacks and namespace
    tracking, then either re-emits the tag as inline XHTML content or
    dispatches to a ``_start_<prefix>_<name>`` handler, falling back to
    generic storage when no handler is available.

    :param tag: element name as it appears in the feed, optionally
        ``prefix:name``
    :param attrs: iterable of attribute items; each is passed through
        ``_normalize_attributes`` and the results are used as
        ``(name, value)`` pairs
    :returns: the result of the selected handler, of ``handle_data`` or of
        ``push``; ``None`` on every other path
    """
    # increment depth counter
    self.depth += 1

    # normalize attrs
    attrs = [self._normalize_attributes(attr) for attr in attrs]

    # track xml:base and xml:lang
    attrs_d = dict(attrs)
    baseuri = attrs_d.get("xml:base", attrs_d.get("base")) or self.baseuri
    if isinstance(baseuri, bytes):
        baseuri = baseuri.decode(self.encoding, "ignore")
    # ensure that self.baseuri is always an absolute URI that
    # uses a whitelisted URI scheme (e.g. not `javascript:`)
    if self.baseuri:
        # keep the previous base when the new one is rejected (falsy result)
        self.baseuri = make_safe_absolute_uri(self.baseuri, baseuri) or self.baseuri
    else:
        self.baseuri = _urljoin(self.baseuri, baseuri)
    lang = attrs_d.get("xml:lang", attrs_d.get("lang"))
    if lang == "":
        # xml:lang could be explicitly set to '', we need to capture that
        lang = None
    elif lang is None:
        # if no xml:lang is specified, use parent lang
        lang = self.lang
    if lang:
        if tag in ("feed", "rss", "rdf:RDF"):
            self.feeddata["language"] = lang.replace("_", "-")
    self.lang = lang
    # one entry per open element; unknown_endtag pops them again
    self.basestack.append(self.baseuri)
    self.langstack.append(lang)

    # track namespaces
    for prefix, uri in attrs:
        if prefix.startswith("xmlns:"):
            self.track_namespace(prefix[6:], uri)
        elif prefix == "xmlns":
            self.track_namespace(None, uri)

    # track inline content
    if self.incontent and not self.contentparams.get("type", "xml").endswith("xml"):
        if tag in ("xhtml:div", "div"):
            return  # typepad does this 10/2007
        # element declared itself as escaped markup, but it isn't really
        self.contentparams["type"] = "application/xhtml+xml"
    if self.incontent and self.contentparams.get("type") == "application/xhtml+xml":
        if tag.find(":") != -1:
            prefix, tag = tag.split(":", 1)
            namespace = self.namespaces_in_use.get(prefix, "")
            # re-declare the namespace on the element itself, since the
            # prefix is dropped from the emitted tag
            if tag == "math" and namespace == "http://www.w3.org/1998/Math/MathML":
                attrs.append(("xmlns", namespace))
            if tag == "svg" and namespace == "http://www.w3.org/2000/svg":
                attrs.append(("xmlns", namespace))
        if tag == "svg":
            # counts open <svg> elements; unknown_endtag skips the
            # _end_* handlers while this is non-zero
            self.svgOK += 1
        # emit the tag verbatim into the enclosing element's text
        return self.handle_data(f"<{tag}{self.strattrs(attrs)}>", escape=0)

    # match namespaces
    if tag.find(":") != -1:
        prefix, suffix = tag.split(":", 1)
    else:
        prefix, suffix = "", tag
    prefix = self.namespacemap.get(prefix, prefix)
    if prefix:
        prefix = prefix + "_"

    # Special hack for better tracking of empty textinput/image elements in
    # illformed feeds.
    if (not prefix) and tag not in ("title", "link", "description", "name"):
        self.intextinput = 0
    if (not prefix) and tag not in (
        "title",
        "link",
        "description",
        "url",
        "href",
        "width",
        "height",
    ):
        self.inimage = 0

    # call special handler (if defined) or default handler
    methodname = "_start_" + prefix + suffix
    try:
        method = getattr(self, methodname)
        return method(attrs_d)
    except AttributeError:
        # Since there's no handler or something has gone wrong we
        # explicitly add the element and its attributes.
        # NOTE: an AttributeError raised inside the handler itself also
        # lands here.
        unknown_tag = prefix + suffix
        if len(attrs_d) == 0:
            # No attributes so merge it into the enclosing dictionary
            return self.push(unknown_tag, 1)
        else:
            # Has attributes so create it in its own dictionary
            context = self._get_context()
            context[unknown_tag] = attrs_d
319
+
320
+ def unknown_endtag(self, tag):
321
+ # match namespaces
322
+ if tag.find(":") != -1:
323
+ prefix, suffix = tag.split(":", 1)
324
+ else:
325
+ prefix, suffix = "", tag
326
+ prefix = self.namespacemap.get(prefix, prefix)
327
+ if prefix:
328
+ prefix = prefix + "_"
329
+ if suffix == "svg" and self.svgOK:
330
+ self.svgOK -= 1
331
+
332
+ # call special handler (if defined) or default handler
333
+ methodname = "_end_" + prefix + suffix
334
+ try:
335
+ if self.svgOK:
336
+ raise AttributeError()
337
+ method = getattr(self, methodname)
338
+ method()
339
+ except AttributeError:
340
+ self.pop(prefix + suffix)
341
+
342
+ # track inline content
343
+ if self.incontent and not self.contentparams.get("type", "xml").endswith("xml"):
344
+ # element declared itself as escaped markup, but it isn't really
345
+ if tag in ("xhtml:div", "div"):
346
+ return # typepad does this 10/2007
347
+ self.contentparams["type"] = "application/xhtml+xml"
348
+ if self.incontent and self.contentparams.get("type") == "application/xhtml+xml":
349
+ tag = tag.split(":")[-1]
350
+ self.handle_data("</%s>" % tag, escape=0)
351
+
352
+ # track xml:base and xml:lang going out of scope
353
+ if self.basestack:
354
+ self.basestack.pop()
355
+ if self.basestack and self.basestack[-1]:
356
+ self.baseuri = self.basestack[-1]
357
+ if self.langstack:
358
+ self.langstack.pop()
359
+ if self.langstack: # and (self.langstack[-1] is not None):
360
+ self.lang = self.langstack[-1]
361
+
362
+ self.depth -= 1
363
+
364
+ def handle_charref(self, ref):
365
+ # Called for each character reference, e.g. for '&#160;', ref is '160'
366
+ if not self.elementstack:
367
+ return
368
+ ref = ref.lower()
369
+ if ref in ("34", "38", "39", "60", "62", "x22", "x26", "x27", "x3c", "x3e"):
370
+ text = "&#%s;" % ref
371
+ else:
372
+ if ref[0] == "x":
373
+ c = int(ref[1:], 16)
374
+ else:
375
+ c = int(ref)
376
+ text = chr(c).encode("utf-8")
377
+ self.elementstack[-1][2].append(text)
378
+
379
+ def handle_entityref(self, ref):
380
+ # Called for each entity reference, e.g. for '&copy;', ref is 'copy'
381
+ if not self.elementstack:
382
+ return
383
+ if ref in ("lt", "gt", "quot", "amp", "apos"):
384
+ text = "&%s;" % ref
385
+ elif ref in self.entities:
386
+ text = self.entities[ref]
387
+ if text.startswith("&#") and text.endswith(";"):
388
+ return self.handle_entityref(text)
389
+ else:
390
+ try:
391
+ html.entities.name2codepoint[ref]
392
+ except KeyError:
393
+ text = "&%s;" % ref
394
+ else:
395
+ text = chr(html.entities.name2codepoint[ref]).encode("utf-8")
396
+ self.elementstack[-1][2].append(text)
397
+
398
+ def handle_data(self, text, escape=1):
399
+ # Called for each block of plain text, i.e. outside of any tag and
400
+ # not containing any character or entity references
401
+ if not self.elementstack:
402
+ return
403
+ if escape and self.contentparams.get("type") == "application/xhtml+xml":
404
+ text = xml.sax.saxutils.escape(text)
405
+ self.elementstack[-1][2].append(text)
406
+
407
def handle_comment(self, text):
    """Ignore a comment, e.g. ``<!-- insert message here -->``."""
410
+
411
def handle_pi(self, text):
    """Ignore a processing instruction, e.g. ``<?instruction>``."""
414
+
415
def handle_decl(self, text):
    """Ignore a declaration; nothing is recorded for it."""
417
+
418
+ def parse_declaration(self, i):
419
+ # Override internal declaration handler to handle CDATA blocks.
420
+ if self.rawdata[i : i + 9] == "<![CDATA[":
421
+ k = self.rawdata.find("]]>", i)
422
+ if k == -1:
423
+ # CDATA block began but didn't finish
424
+ k = len(self.rawdata)
425
+ return k
426
+ self.handle_data(xml.sax.saxutils.escape(self.rawdata[i + 9 : k]), 0)
427
+ return k + 3
428
+ k = self.rawdata.find(">", i)
429
+ if k >= 0:
430
+ return k + 1
431
+ # We have an incomplete CDATA block.
432
+ return k
433
+
434
+ @staticmethod
435
+ def map_content_type(content_type):
436
+ content_type = content_type.lower()
437
+ if content_type == "text" or content_type == "plain":
438
+ content_type = "text/plain"
439
+ elif content_type == "html":
440
+ content_type = "text/html"
441
+ elif content_type == "xhtml":
442
+ content_type = "application/xhtml+xml"
443
+ return content_type
444
+
445
+ def track_namespace(self, prefix, uri):
446
+ loweruri = uri.lower()
447
+ if not self.version:
448
+ if (prefix, loweruri) == (None, "http://my.netscape.com/rdf/simple/0.9/"):
449
+ self.version = "rss090"
450
+ elif loweruri == "http://purl.org/rss/1.0/":
451
+ self.version = "rss10"
452
+ elif loweruri == "http://www.w3.org/2005/atom":
453
+ self.version = "atom10"
454
+ if loweruri.find("backend.userland.com/rss") != -1:
455
+ # match any backend.userland.com namespace
456
+ uri = "http://backend.userland.com/rss"
457
+ loweruri = uri
458
+ if loweruri in self._matchnamespaces:
459
+ self.namespacemap[prefix] = self._matchnamespaces[loweruri]
460
+ self.namespaces_in_use[self._matchnamespaces[loweruri]] = uri
461
+ else:
462
+ self.namespaces_in_use[prefix or ""] = uri
463
+
464
def resolve_uri(self, uri):
    """Join *uri* onto the current base URI (an empty base if none is set)."""
    base = self.baseuri or ""
    return _urljoin(base, uri)
466
+
467
+ @staticmethod
468
+ def decode_entities(element, data):
469
+ return data
470
+
471
+ @staticmethod
472
+ def strattrs(attrs):
473
+ return "".join(
474
+ ' {}="{}"'.format(t[0], xml.sax.saxutils.escape(t[1], {'"': "&quot;"}))
475
+ for t in attrs
476
+ )
477
+
478
+ def push(self, element, expecting_text):
479
+ self.elementstack.append([element, expecting_text, []])
480
+
481
def pop(self, element, strip_whitespace=1):
    """Close *element*, post-process its collected text and store the result.

    :param element: name of the element being closed; it must match the name
        on top of ``elementstack`` or nothing happens
    :param strip_whitespace: when true, surrounding whitespace is stripped
        from the joined text
    :returns: ``None`` when the stack is empty or its top is a different
        element; otherwise the processed text.  When the frame was pushed
        with a false ``expecting_text`` the joined text is returned without
        any further processing or storage.
    """
    if not self.elementstack:
        return
    if self.elementstack[-1][0] != element:
        return

    element, expecting_text, pieces = self.elementstack.pop()

    # Ensure each piece is a str for Python 3
    # (handle_charref / handle_entityref append UTF-8 encoded bytes)
    for i, v in enumerate(pieces):
        if isinstance(v, bytes):
            pieces[i] = v.decode("utf-8")

    if (
        self.version == "atom10"
        and self.contentparams.get("type", "text") == "application/xhtml+xml"
    ):
        # remove enclosing child element, but only if it is a <div> and
        # only if all the remaining content is nested underneath it.
        # This means that the divs would be retained in the following:
        #    <div>foo</div><div>bar</div>
        while pieces and len(pieces) > 1 and not pieces[-1].strip():
            del pieces[-1]
        while pieces and len(pieces) > 1 and not pieces[0].strip():
            del pieces[0]
        if (
            pieces
            and (pieces[0] == "<div>" or pieces[0].startswith("<div "))
            and pieces[-1] == "</div>"
        ):
            depth = 0
            for piece in pieces[:-1]:
                if piece.startswith("</"):
                    depth -= 1
                    if depth == 0:
                        # the opening <div> closed before the end, so it
                        # does not wrap everything: keep the pieces as-is
                        break
                elif piece.startswith("<") and not piece.endswith("/>"):
                    depth += 1
            else:
                # loop finished without break: drop the wrapping <div>
                pieces = pieces[1:-1]

    output = "".join(pieces)
    if strip_whitespace:
        output = output.strip()
    if not expecting_text:
        return output

    # decode base64 content
    if base64 and self.contentparams.get("base64", 0):
        try:
            output = base64.decodebytes(output.encode("utf8")).decode("utf8")
        except (binascii.Error, binascii.Incomplete, UnicodeDecodeError):
            # best effort: keep the undecoded text
            pass

    # resolve relative URIs
    if (element in self.can_be_relative_uri) and output:
        # do not resolve guid elements with isPermalink="false"
        # NOTE(review): self.guidislink is not set in this class's __init__;
        # presumably provided elsewhere -- confirm
        if not element == "id" or self.guidislink:
            output = self.resolve_uri(output)

    # decode entities within embedded markup
    if not self.contentparams.get("base64", 0):
        output = self.decode_entities(element, output)

    # some feed formats require consumers to guess
    # whether the content is html or plain text
    if (
        not self.version.startswith("atom")
        and self.contentparams.get("type") == "text/plain"
    ):
        if self.looks_like_html(output):
            self.contentparams["type"] = "text/html"

    # remove temporary cruft from contentparams
    try:
        del self.contentparams["mode"]
    except KeyError:
        pass
    try:
        del self.contentparams["base64"]
    except KeyError:
        pass

    # a missing type counts as HTML here
    is_htmlish = (
        self.map_content_type(self.contentparams.get("type", "text/html"))
        in self.html_types
    )
    # resolve relative URIs within embedded markup
    if is_htmlish and self.resolve_relative_uris:
        if element in self.can_contain_relative_uris:
            output = resolve_relative_uris(
                output,
                self.baseuri,
                self.encoding,
                self.contentparams.get("type", "text/html"),
            )

    # sanitize embedded markup
    if is_htmlish and self.sanitize_html:
        if element in self.can_contain_dangerous_markup:
            output = sanitize_html(
                output, self.encoding, self.contentparams.get("type", "text/html")
            )

    if self.encoding and isinstance(output, bytes):
        output = output.decode(self.encoding, "ignore")

    # address common error where people take data that is already
    # utf-8, presume that it is iso-8859-1, and re-encode it.
    if self.encoding in ("utf-8", "utf-8_INVALID_PYTHON_3") and not isinstance(
        output, bytes
    ):
        try:
            output = output.encode("iso-8859-1").decode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            # the text was not double-encoded; leave it alone
            pass

    # map win-1252 extensions to the proper code points
    if not isinstance(output, bytes):
        output = output.translate(_cp1252)

    # categories/tags/keywords/whatever are handled in _end_category or
    # _end_tags or _end_itunes_keywords
    if element in ("category", "tags", "itunes_keywords"):
        return output

    # a title is returned without being stored when title_depth is set
    # (not -1) and is no greater than the current depth
    if element == "title" and -1 < self.title_depth <= self.depth:
        return output

    # store output in appropriate place(s)
    if self.inentry and not self.insource:
        if element == "content":
            self.entries[-1].setdefault(element, [])
            # copy so later changes to self.contentparams do not leak into
            # the stored value
            contentparams = copy.deepcopy(self.contentparams)
            contentparams["value"] = output
            self.entries[-1][element].append(contentparams)
        elif element == "link":
            if not self.inimage:
                # query variables in urls in link elements are improperly
                # converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
                # unhandled character references. fix this special case.
                output = output.replace("&amp;", "&")
                output = re.sub("&([A-Za-z0-9_]+);", r"&\g<1>", output)
                if self.isentrylink or not self.entries[-1].get(element):
                    self.entries[-1][element] = output
                if output:
                    self.entries[-1]["links"][-1]["href"] = output
        else:
            if element == "description":
                element = "summary"
            # only overwrite a value that came from the same or a deeper
            # nesting level than the current one
            old_value_depth = self.property_depth_map.setdefault(
                self.entries[-1], {}
            ).get(element)
            if old_value_depth is None or self.depth <= old_value_depth:
                self.property_depth_map[self.entries[-1]][element] = self.depth
                self.entries[-1][element] = output
            if self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams["value"] = output
                self.entries[-1][element + "_detail"] = contentparams
    elif (
        self.infeed or self.insource
    ):  # and (not self.intextinput) and (not self.inimage):
        context = self._get_context()
        if element == "description":
            element = "subtitle"
        context[element] = output
        if element == "link":
            # fix query variables; see above for the explanation
            output = re.sub("&([A-Za-z0-9_]+);", r"&\g<1>", output)
            context[element] = output
            context["links"][-1]["href"] = output
        elif self.incontent:
            contentparams = copy.deepcopy(self.contentparams)
            contentparams["value"] = output
            context[element + "_detail"] = contentparams
    return output
658
+
659
def push_content(self, tag, attrs_d, default_content_type, expecting_text):
    """Enter a content-bearing element and set up fresh ``contentparams``.

    :param tag: element name pushed onto the element stack
    :param attrs_d: the element's attributes; ``type`` and ``mode`` are read
    :param default_content_type: content type used when ``type`` is absent
    :param expecting_text: forwarded to ``push``
    """
    self.incontent += 1
    if self.lang:
        self.lang = self.lang.replace("_", "-")

    declared_type = attrs_d.get("type", default_content_type)
    params = FeedParserDict(
        {
            "type": self.map_content_type(declared_type),
            "language": self.lang,
            "base": self.baseuri,
        }
    )
    # publish the new params before asking whether the content is base64
    self.contentparams = params
    params["base64"] = self._is_base64(attrs_d, params)
    self.push(tag, expecting_text)
674
+
675
+ def pop_content(self, tag):
676
+ value = self.pop(tag)
677
+ self.incontent -= 1
678
+ self.contentparams.clear()
679
+ return value
680
+
681
+ # a number of elements in a number of RSS variants are nominally plain
682
+ # text, but this is routinely ignored. This is an attempt to detect
683
+ # the most common cases. As false positives often result in silent
684
+ # data loss, this function errs on the conservative side.
685
+ @staticmethod
686
+ def looks_like_html(s):
687
+ """
688
+ :type s: str
689
+ :rtype: bool
690
+ """
691
+
692
+ # must have a close tag or an entity reference to qualify
693
+ if not (re.search(r"</(\w+)>", s) or re.search(r"&#?\w+;", s)):
694
+ return False
695
+
696
+ # all tags must be in a restricted subset of valid HTML tags
697
+ if any(
698
+ t
699
+ for t in re.findall(r"</?(\w+)", s)
700
+ if t.lower() not in HTMLSanitizer.acceptable_elements
701
+ ):
702
+ return False
703
+
704
+ # all entities must have been defined as valid HTML entities
705
+ if any(
706
+ e for e in re.findall(r"&(\w+);", s) if e not in html.entities.entitydefs
707
+ ):
708
+ return False
709
+
710
+ return True
711
+
712
+ def _map_to_standard_prefix(self, name):
713
+ colonpos = name.find(":")
714
+ if colonpos != -1:
715
+ prefix = name[:colonpos]
716
+ suffix = name[colonpos + 1 :]
717
+ prefix = self.namespacemap.get(prefix, prefix)
718
+ name = prefix + ":" + suffix
719
+ return name
720
+
721
+ def _get_attribute(self, attrs_d, name):
722
+ return attrs_d.get(self._map_to_standard_prefix(name))
723
+
724
+ def _is_base64(self, attrs_d, contentparams):
725
+ if attrs_d.get("mode", "") == "base64":
726
+ return 1
727
+ if self.contentparams["type"].startswith("text/"):
728
+ return 0
729
+ if self.contentparams["type"].endswith("+xml"):
730
+ return 0
731
+ if self.contentparams["type"].endswith("/xml"):
732
+ return 0
733
+ return 1
734
+
735
+ @staticmethod
736
+ def _enforce_href(attrs_d):
737
+ href = attrs_d.get("url", attrs_d.get("uri", attrs_d.get("href", None)))
738
+ if href:
739
+ try:
740
+ del attrs_d["url"]
741
+ except KeyError:
742
+ pass
743
+ try:
744
+ del attrs_d["uri"]
745
+ except KeyError:
746
+ pass
747
+ attrs_d["href"] = href
748
+ return attrs_d
749
+
750
+ def _save(self, key, value, overwrite=False):
751
+ context = self._get_context()
752
+ if overwrite:
753
+ context[key] = value
754
+ else:
755
+ context.setdefault(key, value)
756
+
757
+ def _get_context(self):
758
+ if self.insource:
759
+ context = self.sourcedata
760
+ elif self.inimage and "image" in self.feeddata:
761
+ context = self.feeddata["image"]
762
+ elif self.intextinput:
763
+ context = self.feeddata["textinput"]
764
+ elif self.inentry:
765
+ context = self.entries[-1]
766
+ else:
767
+ context = self.feeddata
768
+ return context
769
+
770
def _save_author(self, key, value, prefix="author"):
    """Record one author field in the current context.

    The value is written to ``<prefix>_detail`` and to the last item of the
    ``authors`` list, creating either one when missing.

    :param key: field name within the detail dictionary
    :param value: field value
    :param prefix: base name of the ``*_detail`` entry to update
    """
    target = self._get_context()
    detail_key = prefix + "_detail"
    target.setdefault(detail_key, FeedParserDict())
    target[detail_key][key] = value
    # refresh the derived author fields before touching the authors list
    self._sync_author_detail()
    target.setdefault("authors", [FeedParserDict()])
    target["authors"][-1][key] = value
777
+
778
def _save_contributor(self, key, value):
    """Set *key* on the newest contributor of the current context.

    A ``contributors`` list holding one empty entry is created when missing.
    """
    target = self._get_context()
    target.setdefault("contributors", [FeedParserDict()])
    newest = target["contributors"][-1]
    newest[key] = value
782
+
783
def _sync_author_detail(self, key="author"):
    """Keep the flat author string and its structured detail in step.

    When the last item of the ``<key>s`` list has data, the flat
    ``context[key]`` string is rebuilt from its name and e-mail.  Otherwise
    the flat string is split into a name and an e-mail address, which are
    written to the detail dictionary.

    :param key: base name of the fields to synchronize
    """
    context = self._get_context()
    detail = context.get("%ss" % key, [FeedParserDict()])[-1]

    if detail:
        # structured data wins: derive the flat string from it
        name = detail.get("name")
        email = detail.get("email")
        if name and email:
            context[key] = f"{name} ({email})"
        elif name:
            context[key] = name
        elif email:
            context[key] = email
        return

    author = context.get(key)
    email = None
    if not author:
        return

    found = email_pattern.search(author)
    if found:
        email = found.group(0)
        # probably a better way to do the following, but it passes
        # all the tests
        author = author.replace(email, "")
        for leftover in ("()", "<>", "&lt;&gt;"):
            author = author.replace(leftover, "")
        author = author.strip()
        if author and author[0] == "(":
            author = author[1:]
        if author and author[-1] == ")":
            author = author[:-1]
        author = author.strip()

    if author or email:
        context.setdefault("%s_detail" % key, detail)
    if author:
        detail["name"] = author
    if email:
        detail["email"] = email
820
+
821
+ def _add_tag(self, term, scheme, label):
822
+ context = self._get_context()
823
+ tags = context.setdefault("tags", [])
824
+ if (not term) and (not scheme) and (not label):
825
+ return
826
+ value = FeedParserDict(term=term, scheme=scheme, label=label)
827
+ if value not in tags:
828
+ tags.append(value)
829
+
830
+ def _start_tags(self, attrs_d):
831
+ # This is a completely-made up element. Its semantics are determined
832
+ # only by a single feed that precipitated bug report 392 on Google Code.
833
+ # In short, this is junk code.
834
+ self.push("tags", 1)
835
+
836
def _end_tags(self):
    """Split the collected ``tags`` text on commas and add each term as a tag."""
    collected = self.pop("tags")
    for raw_term in collected.split(","):
        self._add_tag(raw_term.strip(), None, None)