forkparser 2026.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- feedparser/__init__.py +66 -0
- feedparser/api.py +376 -0
- feedparser/datetimes/__init__.py +73 -0
- feedparser/datetimes/asctime.py +80 -0
- feedparser/datetimes/greek.py +90 -0
- feedparser/datetimes/hungarian.py +66 -0
- feedparser/datetimes/iso8601.py +160 -0
- feedparser/datetimes/korean.py +94 -0
- feedparser/datetimes/perforce.py +63 -0
- feedparser/datetimes/rfc822.py +179 -0
- feedparser/datetimes/w3dtf.py +128 -0
- feedparser/encodings.py +649 -0
- feedparser/exceptions.py +55 -0
- feedparser/html.py +350 -0
- feedparser/http.py +74 -0
- feedparser/mixin.py +838 -0
- feedparser/namespaces/__init__.py +0 -0
- feedparser/namespaces/_base.py +547 -0
- feedparser/namespaces/admin.py +53 -0
- feedparser/namespaces/cc.py +70 -0
- feedparser/namespaces/dc.py +138 -0
- feedparser/namespaces/georss.py +682 -0
- feedparser/namespaces/itunes.py +113 -0
- feedparser/namespaces/mediarss.py +142 -0
- feedparser/namespaces/psc.py +74 -0
- feedparser/parsers/__init__.py +0 -0
- feedparser/parsers/json.py +135 -0
- feedparser/parsers/loose.py +75 -0
- feedparser/parsers/strict.py +141 -0
- feedparser/py.typed +0 -0
- feedparser/sanitizer.py +978 -0
- feedparser/sgml.py +98 -0
- feedparser/urls.py +233 -0
- feedparser/util.py +157 -0
- forkparser-2026.1.0.dist-info/METADATA +75 -0
- forkparser-2026.1.0.dist-info/RECORD +38 -0
- forkparser-2026.1.0.dist-info/WHEEL +4 -0
- forkparser-2026.1.0.dist-info/licenses/LICENSE +65 -0
feedparser/mixin.py
ADDED
|
@@ -0,0 +1,838 @@
|
|
|
1
|
+
# Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
|
|
2
|
+
# Copyright 2002-2008 Mark Pilgrim
|
|
3
|
+
# All rights reserved.
|
|
4
|
+
#
|
|
5
|
+
# This file is a part of feedparser.
|
|
6
|
+
#
|
|
7
|
+
# Redistribution and use in source and binary forms, with or without
|
|
8
|
+
# modification, are permitted provided that the following conditions are met:
|
|
9
|
+
#
|
|
10
|
+
# * Redistributions of source code must retain the above copyright notice,
|
|
11
|
+
# this list of conditions and the following disclaimer.
|
|
12
|
+
# * Redistributions in binary form must reproduce the above copyright notice,
|
|
13
|
+
# this list of conditions and the following disclaimer in the documentation
|
|
14
|
+
# and/or other materials provided with the distribution.
|
|
15
|
+
#
|
|
16
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
|
|
17
|
+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
18
|
+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
19
|
+
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
20
|
+
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
21
|
+
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
22
|
+
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
23
|
+
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
24
|
+
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
25
|
+
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
26
|
+
# POSSIBILITY OF SUCH DAMAGE.
|
|
27
|
+
|
|
28
|
+
import base64
|
|
29
|
+
import binascii
|
|
30
|
+
import copy
|
|
31
|
+
import html.entities
|
|
32
|
+
import re
|
|
33
|
+
import xml.sax.saxutils
|
|
34
|
+
|
|
35
|
+
from .html import _cp1252
|
|
36
|
+
from .namespaces import _base, cc, dc, georss, itunes, mediarss, psc
|
|
37
|
+
from .sanitizer import HTMLSanitizer, sanitize_html
|
|
38
|
+
from .urls import _urljoin, make_safe_absolute_uri, resolve_relative_uris
|
|
39
|
+
from .util import FeedParserDict
|
|
40
|
+
|
|
41
|
+
# Matches an e-mail address, optionally followed by a ``?subject=...`` query.
email_pattern = re.compile(
    # local part, then "@"
    r"(([a-zA-Z0-9_.+-]+)@"
    # either the first three octets of a bracketed IPv4 literal, or one or
    # more dot-terminated host labels
    r"((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9-]+\.)+))"
    # top-level domain (2-4 letters) or final IPv4 octet, optional "]"
    r"([a-zA-Z]{2,4}|[0-9]{1,3})(]?))"
    # optional mailto-style subject
    r"(\?subject=\S+)?"
)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class XMLParserMixin(
|
|
49
|
+
_base.Namespace,
|
|
50
|
+
cc.Namespace,
|
|
51
|
+
dc.Namespace,
|
|
52
|
+
georss.Namespace,
|
|
53
|
+
itunes.Namespace,
|
|
54
|
+
mediarss.Namespace,
|
|
55
|
+
psc.Namespace,
|
|
56
|
+
):
|
|
57
|
+
# Known namespace URIs mapped to the canonical prefix used when building
# handler method names (``_start_<prefix>_<element>``). An empty prefix
# means the element is handled without any prefix.
namespaces = {
    "": "",
    "http://backend.userland.com/rss": "",
    "http://blogs.law.harvard.edu/tech/rss": "",
    "http://purl.org/rss/1.0/": "",
    "http://my.netscape.com/rdf/simple/0.9/": "",
    "http://example.com/newformat#": "",
    "http://example.com/necho": "",
    "http://purl.org/echo/": "",
    "uri/of/echo/namespace#": "",
    "http://purl.org/pie/": "",
    "http://purl.org/atom/ns#": "",
    "http://www.w3.org/2005/Atom": "",
    "http://purl.org/rss/1.0/modules/rss091#": "",
    "http://webns.net/mvcb/": "admin",
    "http://purl.org/rss/1.0/modules/aggregation/": "ag",
    "http://purl.org/rss/1.0/modules/annotate/": "annotate",
    "http://media.tangent.org/rss/1.0/": "audio",
    "http://backend.userland.com/blogChannelModule": "blogChannel",
    "http://creativecommons.org/ns#license": "cc",
    "http://web.resource.org/cc/": "cc",
    "http://cyber.law.harvard.edu/rss/creativeCommonsRssModule.html": (
        "creativeCommons"
    ),
    "http://backend.userland.com/creativeCommonsRssModule": "creativeCommons",
    "http://purl.org/rss/1.0/modules/company": "co",
    "http://purl.org/rss/1.0/modules/content/": "content",
    "http://my.theinfo.org/changed/1.0/rss/": "cp",
    "http://purl.org/dc/elements/1.1/": "dc",
    "http://purl.org/dc/terms/": "dcterms",
    "http://purl.org/rss/1.0/modules/email/": "email",
    "http://purl.org/rss/1.0/modules/event/": "ev",
    "http://rssnamespace.org/feedburner/ext/1.0": "feedburner",
    "http://freshmeat.net/rss/fm/": "fm",
    "http://xmlns.com/foaf/0.1/": "foaf",
    "http://www.w3.org/2003/01/geo/wgs84_pos#": "geo",
    "http://www.georss.org/georss": "georss",
    "http://www.opengis.net/gml": "gml",
    "http://postneo.com/icbm/": "icbm",
    "http://purl.org/rss/1.0/modules/image/": "image",
    "http://www.itunes.com/DTDs/PodCast-1.0.dtd": "itunes",
    "http://example.com/DTDs/PodCast-1.0.dtd": "itunes",
    "http://purl.org/rss/1.0/modules/link/": "l",
    "http://search.yahoo.com/mrss": "media",
    # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
    "http://search.yahoo.com/mrss/": "media",
    "http://madskills.com/public/xml/rss/module/pingback/": "pingback",
    "http://prismstandard.org/namespaces/1.2/basic/": "prism",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://www.w3.org/2000/01/rdf-schema#": "rdfs",
    "http://purl.org/rss/1.0/modules/reference/": "ref",
    "http://purl.org/rss/1.0/modules/richequiv/": "reqv",
    "http://purl.org/rss/1.0/modules/search/": "search",
    "http://purl.org/rss/1.0/modules/slash/": "slash",
    "http://schemas.xmlsoap.org/soap/envelope/": "soap",
    "http://purl.org/rss/1.0/modules/servicestatus/": "ss",
    "http://hacks.benhammersley.com/rss/streaming/": "str",
    "http://purl.org/rss/1.0/modules/subscription/": "sub",
    "http://purl.org/rss/1.0/modules/syndication/": "sy",
    "http://schemas.pocketsoap.com/rss/myDescModule/": "szf",
    "http://purl.org/rss/1.0/modules/taxonomy/": "taxo",
    "http://purl.org/rss/1.0/modules/threading/": "thr",
    "http://purl.org/rss/1.0/modules/textinput/": "ti",
    "http://madskills.com/public/xml/rss/module/trackback/": "trackback",
    "http://wellformedweb.org/commentAPI/": "wfw",
    "http://purl.org/rss/1.0/modules/wiki/": "wiki",
    "http://www.w3.org/1999/xhtml": "xhtml",
    "http://www.w3.org/1999/xlink": "xlink",
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://podlove.org/simple-chapters": "psc",
}
# Lower-cased-URI view of `namespaces`; populated by __init__ the first time
# it finds this dict empty. It is a class attribute, so it is shared.
_matchnamespaces: dict[str, str] = {}

# Elements whose text value pop() resolves against the current base URI.
can_be_relative_uri = {
    "comments",
    "docs",
    "href",
    "icon",
    "id",
    "link",
    "logo",
    "url",
    "wfw_comment",
    "wfw_commentrss",
}

# Elements whose HTML-ish value pop() passes to resolve_relative_uris()
# when self.resolve_relative_uris is enabled.
can_contain_relative_uris = {
    "content",
    "copyright",
    "description",
    "info",
    "rights",
    "subtitle",
    "summary",
    "tagline",
    "title",
}

# Elements whose HTML-ish value pop() passes to sanitize_html() when
# self.sanitize_html is enabled.
can_contain_dangerous_markup = {
    "content",
    "copyright",
    "description",
    "info",
    "rights",
    "subtitle",
    "summary",
    "tagline",
    "title",
}

# Content types that pop() treats as HTML-ish.
html_types = {
    "application/xhtml+xml",
    "text/html",
}
|
|
171
|
+
|
|
172
|
+
def __init__(self):
    """Set up the per-parse state and the shared namespace lookup table.

    ``self.lang`` is read here but never assigned, so it is expected to
    exist before this initializer runs.
    """
    # Build the case-insensitive namespace table once; it lives on the
    # class-level dict and is therefore reused by later instances.
    if not self._matchnamespaces:
        self._matchnamespaces.update(
            {uri.lower(): prefix for uri, prefix in self.namespaces.items()}
        )

    # Public results.
    self.feeddata = FeedParserDict()  # feed-level data
    self.entries = []  # list of entry-level data
    self.version = ""  # feed type/version, see SUPPORTED_VERSIONS
    self.namespaces_in_use = {}  # dictionary of namespaces defined by the feed
    self.resolve_relative_uris = False
    self.sanitize_html = False

    # Internal "where am I" counters/flags, all starting cleared.
    self.infeed = 0
    self.inentry = 0
    self.incontent = 0
    self.intextinput = 0
    self.inimage = 0
    self.inauthor = 0
    self.incontributor = 0
    self.inpublisher = 0
    self.insource = 0
    self.isentrylink = 0

    # Working storage used while elements are open.
    self.sourcedata = FeedParserDict()
    self.contentparams = FeedParserDict()
    self._summaryKey = None
    self.namespacemap = {}
    self.elementstack = []
    self.basestack = []
    self.langstack = []
    self.svgOK = 0
    self.title_depth = -1
    self.depth = 0
    self.hasContent = 0

    if self.lang:
        self.feeddata["language"] = self.lang.replace("_", "-")

    # Maps each object a value is stored on to
    # {property_name: depth of the node the property was extracted from}.
    self.property_depth_map = {}

    super().__init__()
|
|
219
|
+
|
|
220
|
+
def _normalize_attributes(self, kv):
|
|
221
|
+
raise NotImplementedError
|
|
222
|
+
|
|
223
|
+
def unknown_starttag(self, tag, attrs):
    """Process an opening tag.

    Updates depth, xml:base and xml:lang tracking, records namespace
    declarations, passes inline XHTML through as text, and otherwise
    dispatches to ``_start_<prefix>_<name>`` or stores the element
    generically.
    """
    self.depth += 1

    # Normalize the attribute pairs and keep a dict view of them.
    attrs = [self._normalize_attributes(pair) for pair in attrs]
    attrs_d = dict(attrs)

    # --- xml:base -----------------------------------------------------
    baseuri = attrs_d.get("xml:base", attrs_d.get("base")) or self.baseuri
    if isinstance(baseuri, bytes):
        baseuri = baseuri.decode(self.encoding, "ignore")
    # ensure that self.baseuri is always an absolute URI that
    # uses a whitelisted URI scheme (e.g. not `javascript:`)
    if self.baseuri:
        self.baseuri = make_safe_absolute_uri(self.baseuri, baseuri) or self.baseuri
    else:
        self.baseuri = _urljoin(self.baseuri, baseuri)

    # --- xml:lang -----------------------------------------------------
    lang = attrs_d.get("xml:lang", attrs_d.get("lang"))
    if lang == "":
        # xml:lang could be explicitly set to '', we need to capture that
        lang = None
    elif lang is None:
        # if no xml:lang is specified, use parent lang
        lang = self.lang
    if lang and tag in ("feed", "rss", "rdf:RDF"):
        self.feeddata["language"] = lang.replace("_", "-")
    self.lang = lang
    self.basestack.append(self.baseuri)
    self.langstack.append(lang)

    # --- namespace declarations ----------------------------------------
    for attr_name, attr_value in attrs:
        if attr_name.startswith("xmlns:"):
            self.track_namespace(attr_name[6:], attr_value)
        elif attr_name == "xmlns":
            self.track_namespace(None, attr_value)

    # --- inline content ------------------------------------------------
    if self.incontent and not self.contentparams.get("type", "xml").endswith("xml"):
        if tag in ("xhtml:div", "div"):
            return  # typepad does this 10/2007
        # element declared itself as escaped markup, but it isn't really
        self.contentparams["type"] = "application/xhtml+xml"
    if self.incontent and self.contentparams.get("type") == "application/xhtml+xml":
        if ":" in tag:
            prefix, _, tag = tag.partition(":")
            namespace = self.namespaces_in_use.get(prefix, "")
            # Re-declare the namespace on MathML/SVG roots so the
            # serialized fragment carries it.
            if (tag, namespace) in (
                ("math", "http://www.w3.org/1998/Math/MathML"),
                ("svg", "http://www.w3.org/2000/svg"),
            ):
                attrs.append(("xmlns", namespace))
        if tag == "svg":
            self.svgOK += 1
        return self.handle_data(f"<{tag}{self.strattrs(attrs)}>", escape=0)

    # --- map the prefix to its canonical form --------------------------
    head, colon, tail = tag.partition(":")
    prefix, suffix = (head, tail) if colon else ("", tag)
    prefix = self.namespacemap.get(prefix, prefix)
    if prefix:
        prefix += "_"

    # Special hack for better tracking of empty textinput/image elements in
    # illformed feeds.
    if not prefix:
        if tag not in ("title", "link", "description", "name"):
            self.intextinput = 0
        if tag not in (
            "title",
            "link",
            "description",
            "url",
            "href",
            "width",
            "height",
        ):
            self.inimage = 0

    # --- dispatch ------------------------------------------------------
    handler_name = "_start_" + prefix + suffix
    try:
        handler = getattr(self, handler_name)
        return handler(attrs_d)
    except AttributeError:
        # Since there's no handler or something has gone wrong we
        # explicitly add the element and its attributes.
        unknown_tag = prefix + suffix
        if not attrs_d:
            # No attributes so merge it into the enclosing dictionary
            return self.push(unknown_tag, 1)
        # Has attributes so create it in its own dictionary
        self._get_context()[unknown_tag] = attrs_d
|
|
319
|
+
|
|
320
|
+
def unknown_endtag(self, tag):
|
|
321
|
+
# match namespaces
|
|
322
|
+
if tag.find(":") != -1:
|
|
323
|
+
prefix, suffix = tag.split(":", 1)
|
|
324
|
+
else:
|
|
325
|
+
prefix, suffix = "", tag
|
|
326
|
+
prefix = self.namespacemap.get(prefix, prefix)
|
|
327
|
+
if prefix:
|
|
328
|
+
prefix = prefix + "_"
|
|
329
|
+
if suffix == "svg" and self.svgOK:
|
|
330
|
+
self.svgOK -= 1
|
|
331
|
+
|
|
332
|
+
# call special handler (if defined) or default handler
|
|
333
|
+
methodname = "_end_" + prefix + suffix
|
|
334
|
+
try:
|
|
335
|
+
if self.svgOK:
|
|
336
|
+
raise AttributeError()
|
|
337
|
+
method = getattr(self, methodname)
|
|
338
|
+
method()
|
|
339
|
+
except AttributeError:
|
|
340
|
+
self.pop(prefix + suffix)
|
|
341
|
+
|
|
342
|
+
# track inline content
|
|
343
|
+
if self.incontent and not self.contentparams.get("type", "xml").endswith("xml"):
|
|
344
|
+
# element declared itself as escaped markup, but it isn't really
|
|
345
|
+
if tag in ("xhtml:div", "div"):
|
|
346
|
+
return # typepad does this 10/2007
|
|
347
|
+
self.contentparams["type"] = "application/xhtml+xml"
|
|
348
|
+
if self.incontent and self.contentparams.get("type") == "application/xhtml+xml":
|
|
349
|
+
tag = tag.split(":")[-1]
|
|
350
|
+
self.handle_data("</%s>" % tag, escape=0)
|
|
351
|
+
|
|
352
|
+
# track xml:base and xml:lang going out of scope
|
|
353
|
+
if self.basestack:
|
|
354
|
+
self.basestack.pop()
|
|
355
|
+
if self.basestack and self.basestack[-1]:
|
|
356
|
+
self.baseuri = self.basestack[-1]
|
|
357
|
+
if self.langstack:
|
|
358
|
+
self.langstack.pop()
|
|
359
|
+
if self.langstack: # and (self.langstack[-1] is not None):
|
|
360
|
+
self.lang = self.langstack[-1]
|
|
361
|
+
|
|
362
|
+
self.depth -= 1
|
|
363
|
+
|
|
364
|
+
def handle_charref(self, ref):
|
|
365
|
+
# Called for each character reference, e.g. for ' ', ref is '160'
|
|
366
|
+
if not self.elementstack:
|
|
367
|
+
return
|
|
368
|
+
ref = ref.lower()
|
|
369
|
+
if ref in ("34", "38", "39", "60", "62", "x22", "x26", "x27", "x3c", "x3e"):
|
|
370
|
+
text = "&#%s;" % ref
|
|
371
|
+
else:
|
|
372
|
+
if ref[0] == "x":
|
|
373
|
+
c = int(ref[1:], 16)
|
|
374
|
+
else:
|
|
375
|
+
c = int(ref)
|
|
376
|
+
text = chr(c).encode("utf-8")
|
|
377
|
+
self.elementstack[-1][2].append(text)
|
|
378
|
+
|
|
379
|
+
def handle_entityref(self, ref):
|
|
380
|
+
# Called for each entity reference, e.g. for '©', ref is 'copy'
|
|
381
|
+
if not self.elementstack:
|
|
382
|
+
return
|
|
383
|
+
if ref in ("lt", "gt", "quot", "amp", "apos"):
|
|
384
|
+
text = "&%s;" % ref
|
|
385
|
+
elif ref in self.entities:
|
|
386
|
+
text = self.entities[ref]
|
|
387
|
+
if text.startswith("&#") and text.endswith(";"):
|
|
388
|
+
return self.handle_entityref(text)
|
|
389
|
+
else:
|
|
390
|
+
try:
|
|
391
|
+
html.entities.name2codepoint[ref]
|
|
392
|
+
except KeyError:
|
|
393
|
+
text = "&%s;" % ref
|
|
394
|
+
else:
|
|
395
|
+
text = chr(html.entities.name2codepoint[ref]).encode("utf-8")
|
|
396
|
+
self.elementstack[-1][2].append(text)
|
|
397
|
+
|
|
398
|
+
def handle_data(self, text, escape=1):
|
|
399
|
+
# Called for each block of plain text, i.e. outside of any tag and
|
|
400
|
+
# not containing any character or entity references
|
|
401
|
+
if not self.elementstack:
|
|
402
|
+
return
|
|
403
|
+
if escape and self.contentparams.get("type") == "application/xhtml+xml":
|
|
404
|
+
text = xml.sax.saxutils.escape(text)
|
|
405
|
+
self.elementstack[-1][2].append(text)
|
|
406
|
+
|
|
407
|
+
def handle_comment(self, text):
|
|
408
|
+
# Called for each comment, e.g. <!-- insert message here -->
|
|
409
|
+
pass
|
|
410
|
+
|
|
411
|
+
def handle_pi(self, text):
|
|
412
|
+
# Called for each processing instruction, e.g. <?instruction>
|
|
413
|
+
pass
|
|
414
|
+
|
|
415
|
+
def handle_decl(self, text):
|
|
416
|
+
pass
|
|
417
|
+
|
|
418
|
+
def parse_declaration(self, i):
|
|
419
|
+
# Override internal declaration handler to handle CDATA blocks.
|
|
420
|
+
if self.rawdata[i : i + 9] == "<![CDATA[":
|
|
421
|
+
k = self.rawdata.find("]]>", i)
|
|
422
|
+
if k == -1:
|
|
423
|
+
# CDATA block began but didn't finish
|
|
424
|
+
k = len(self.rawdata)
|
|
425
|
+
return k
|
|
426
|
+
self.handle_data(xml.sax.saxutils.escape(self.rawdata[i + 9 : k]), 0)
|
|
427
|
+
return k + 3
|
|
428
|
+
k = self.rawdata.find(">", i)
|
|
429
|
+
if k >= 0:
|
|
430
|
+
return k + 1
|
|
431
|
+
# We have an incomplete CDATA block.
|
|
432
|
+
return k
|
|
433
|
+
|
|
434
|
+
@staticmethod
|
|
435
|
+
def map_content_type(content_type):
|
|
436
|
+
content_type = content_type.lower()
|
|
437
|
+
if content_type == "text" or content_type == "plain":
|
|
438
|
+
content_type = "text/plain"
|
|
439
|
+
elif content_type == "html":
|
|
440
|
+
content_type = "text/html"
|
|
441
|
+
elif content_type == "xhtml":
|
|
442
|
+
content_type = "application/xhtml+xml"
|
|
443
|
+
return content_type
|
|
444
|
+
|
|
445
|
+
def track_namespace(self, prefix, uri):
|
|
446
|
+
loweruri = uri.lower()
|
|
447
|
+
if not self.version:
|
|
448
|
+
if (prefix, loweruri) == (None, "http://my.netscape.com/rdf/simple/0.9/"):
|
|
449
|
+
self.version = "rss090"
|
|
450
|
+
elif loweruri == "http://purl.org/rss/1.0/":
|
|
451
|
+
self.version = "rss10"
|
|
452
|
+
elif loweruri == "http://www.w3.org/2005/atom":
|
|
453
|
+
self.version = "atom10"
|
|
454
|
+
if loweruri.find("backend.userland.com/rss") != -1:
|
|
455
|
+
# match any backend.userland.com namespace
|
|
456
|
+
uri = "http://backend.userland.com/rss"
|
|
457
|
+
loweruri = uri
|
|
458
|
+
if loweruri in self._matchnamespaces:
|
|
459
|
+
self.namespacemap[prefix] = self._matchnamespaces[loweruri]
|
|
460
|
+
self.namespaces_in_use[self._matchnamespaces[loweruri]] = uri
|
|
461
|
+
else:
|
|
462
|
+
self.namespaces_in_use[prefix or ""] = uri
|
|
463
|
+
|
|
464
|
+
def resolve_uri(self, uri):
    """Join ``uri`` onto the current base URI (empty string if unset)."""
    base = self.baseuri or ""
    return _urljoin(base, uri)
|
|
466
|
+
|
|
467
|
+
@staticmethod
|
|
468
|
+
def decode_entities(element, data):
|
|
469
|
+
return data
|
|
470
|
+
|
|
471
|
+
@staticmethod
|
|
472
|
+
def strattrs(attrs):
|
|
473
|
+
return "".join(
|
|
474
|
+
' {}="{}"'.format(t[0], xml.sax.saxutils.escape(t[1], {'"': """}))
|
|
475
|
+
for t in attrs
|
|
476
|
+
)
|
|
477
|
+
|
|
478
|
+
def push(self, element, expecting_text):
|
|
479
|
+
self.elementstack.append([element, expecting_text, []])
|
|
480
|
+
|
|
481
|
+
def pop(self, element, strip_whitespace=1):
|
|
482
|
+
if not self.elementstack:
|
|
483
|
+
return
|
|
484
|
+
if self.elementstack[-1][0] != element:
|
|
485
|
+
return
|
|
486
|
+
|
|
487
|
+
element, expecting_text, pieces = self.elementstack.pop()
|
|
488
|
+
|
|
489
|
+
# Ensure each piece is a str for Python 3
|
|
490
|
+
for i, v in enumerate(pieces):
|
|
491
|
+
if isinstance(v, bytes):
|
|
492
|
+
pieces[i] = v.decode("utf-8")
|
|
493
|
+
|
|
494
|
+
if (
|
|
495
|
+
self.version == "atom10"
|
|
496
|
+
and self.contentparams.get("type", "text") == "application/xhtml+xml"
|
|
497
|
+
):
|
|
498
|
+
# remove enclosing child element, but only if it is a <div> and
|
|
499
|
+
# only if all the remaining content is nested underneath it.
|
|
500
|
+
# This means that the divs would be retained in the following:
|
|
501
|
+
# <div>foo</div><div>bar</div>
|
|
502
|
+
while pieces and len(pieces) > 1 and not pieces[-1].strip():
|
|
503
|
+
del pieces[-1]
|
|
504
|
+
while pieces and len(pieces) > 1 and not pieces[0].strip():
|
|
505
|
+
del pieces[0]
|
|
506
|
+
if (
|
|
507
|
+
pieces
|
|
508
|
+
and (pieces[0] == "<div>" or pieces[0].startswith("<div "))
|
|
509
|
+
and pieces[-1] == "</div>"
|
|
510
|
+
):
|
|
511
|
+
depth = 0
|
|
512
|
+
for piece in pieces[:-1]:
|
|
513
|
+
if piece.startswith("</"):
|
|
514
|
+
depth -= 1
|
|
515
|
+
if depth == 0:
|
|
516
|
+
break
|
|
517
|
+
elif piece.startswith("<") and not piece.endswith("/>"):
|
|
518
|
+
depth += 1
|
|
519
|
+
else:
|
|
520
|
+
pieces = pieces[1:-1]
|
|
521
|
+
|
|
522
|
+
output = "".join(pieces)
|
|
523
|
+
if strip_whitespace:
|
|
524
|
+
output = output.strip()
|
|
525
|
+
if not expecting_text:
|
|
526
|
+
return output
|
|
527
|
+
|
|
528
|
+
# decode base64 content
|
|
529
|
+
if base64 and self.contentparams.get("base64", 0):
|
|
530
|
+
try:
|
|
531
|
+
output = base64.decodebytes(output.encode("utf8")).decode("utf8")
|
|
532
|
+
except (binascii.Error, binascii.Incomplete, UnicodeDecodeError):
|
|
533
|
+
pass
|
|
534
|
+
|
|
535
|
+
# resolve relative URIs
|
|
536
|
+
if (element in self.can_be_relative_uri) and output:
|
|
537
|
+
# do not resolve guid elements with isPermalink="false"
|
|
538
|
+
if not element == "id" or self.guidislink:
|
|
539
|
+
output = self.resolve_uri(output)
|
|
540
|
+
|
|
541
|
+
# decode entities within embedded markup
|
|
542
|
+
if not self.contentparams.get("base64", 0):
|
|
543
|
+
output = self.decode_entities(element, output)
|
|
544
|
+
|
|
545
|
+
# some feed formats require consumers to guess
|
|
546
|
+
# whether the content is html or plain text
|
|
547
|
+
if (
|
|
548
|
+
not self.version.startswith("atom")
|
|
549
|
+
and self.contentparams.get("type") == "text/plain"
|
|
550
|
+
):
|
|
551
|
+
if self.looks_like_html(output):
|
|
552
|
+
self.contentparams["type"] = "text/html"
|
|
553
|
+
|
|
554
|
+
# remove temporary cruft from contentparams
|
|
555
|
+
try:
|
|
556
|
+
del self.contentparams["mode"]
|
|
557
|
+
except KeyError:
|
|
558
|
+
pass
|
|
559
|
+
try:
|
|
560
|
+
del self.contentparams["base64"]
|
|
561
|
+
except KeyError:
|
|
562
|
+
pass
|
|
563
|
+
|
|
564
|
+
is_htmlish = (
|
|
565
|
+
self.map_content_type(self.contentparams.get("type", "text/html"))
|
|
566
|
+
in self.html_types
|
|
567
|
+
)
|
|
568
|
+
# resolve relative URIs within embedded markup
|
|
569
|
+
if is_htmlish and self.resolve_relative_uris:
|
|
570
|
+
if element in self.can_contain_relative_uris:
|
|
571
|
+
output = resolve_relative_uris(
|
|
572
|
+
output,
|
|
573
|
+
self.baseuri,
|
|
574
|
+
self.encoding,
|
|
575
|
+
self.contentparams.get("type", "text/html"),
|
|
576
|
+
)
|
|
577
|
+
|
|
578
|
+
# sanitize embedded markup
|
|
579
|
+
if is_htmlish and self.sanitize_html:
|
|
580
|
+
if element in self.can_contain_dangerous_markup:
|
|
581
|
+
output = sanitize_html(
|
|
582
|
+
output, self.encoding, self.contentparams.get("type", "text/html")
|
|
583
|
+
)
|
|
584
|
+
|
|
585
|
+
if self.encoding and isinstance(output, bytes):
|
|
586
|
+
output = output.decode(self.encoding, "ignore")
|
|
587
|
+
|
|
588
|
+
# address common error where people take data that is already
|
|
589
|
+
# utf-8, presume that it is iso-8859-1, and re-encode it.
|
|
590
|
+
if self.encoding in ("utf-8", "utf-8_INVALID_PYTHON_3") and not isinstance(
|
|
591
|
+
output, bytes
|
|
592
|
+
):
|
|
593
|
+
try:
|
|
594
|
+
output = output.encode("iso-8859-1").decode("utf-8")
|
|
595
|
+
except (UnicodeEncodeError, UnicodeDecodeError):
|
|
596
|
+
pass
|
|
597
|
+
|
|
598
|
+
# map win-1252 extensions to the proper code points
|
|
599
|
+
if not isinstance(output, bytes):
|
|
600
|
+
output = output.translate(_cp1252)
|
|
601
|
+
|
|
602
|
+
# categories/tags/keywords/whatever are handled in _end_category or
|
|
603
|
+
# _end_tags or _end_itunes_keywords
|
|
604
|
+
if element in ("category", "tags", "itunes_keywords"):
|
|
605
|
+
return output
|
|
606
|
+
|
|
607
|
+
if element == "title" and -1 < self.title_depth <= self.depth:
|
|
608
|
+
return output
|
|
609
|
+
|
|
610
|
+
# store output in appropriate place(s)
|
|
611
|
+
if self.inentry and not self.insource:
|
|
612
|
+
if element == "content":
|
|
613
|
+
self.entries[-1].setdefault(element, [])
|
|
614
|
+
contentparams = copy.deepcopy(self.contentparams)
|
|
615
|
+
contentparams["value"] = output
|
|
616
|
+
self.entries[-1][element].append(contentparams)
|
|
617
|
+
elif element == "link":
|
|
618
|
+
if not self.inimage:
|
|
619
|
+
# query variables in urls in link elements are improperly
|
|
620
|
+
# converted from `?a=1&b=2` to `?a=1&b;=2` as if they're
|
|
621
|
+
# unhandled character references. fix this special case.
|
|
622
|
+
output = output.replace("&", "&")
|
|
623
|
+
output = re.sub("&([A-Za-z0-9_]+);", r"&\g<1>", output)
|
|
624
|
+
if self.isentrylink or not self.entries[-1].get(element):
|
|
625
|
+
self.entries[-1][element] = output
|
|
626
|
+
if output:
|
|
627
|
+
self.entries[-1]["links"][-1]["href"] = output
|
|
628
|
+
else:
|
|
629
|
+
if element == "description":
|
|
630
|
+
element = "summary"
|
|
631
|
+
old_value_depth = self.property_depth_map.setdefault(
|
|
632
|
+
self.entries[-1], {}
|
|
633
|
+
).get(element)
|
|
634
|
+
if old_value_depth is None or self.depth <= old_value_depth:
|
|
635
|
+
self.property_depth_map[self.entries[-1]][element] = self.depth
|
|
636
|
+
self.entries[-1][element] = output
|
|
637
|
+
if self.incontent:
|
|
638
|
+
contentparams = copy.deepcopy(self.contentparams)
|
|
639
|
+
contentparams["value"] = output
|
|
640
|
+
self.entries[-1][element + "_detail"] = contentparams
|
|
641
|
+
elif (
|
|
642
|
+
self.infeed or self.insource
|
|
643
|
+
): # and (not self.intextinput) and (not self.inimage):
|
|
644
|
+
context = self._get_context()
|
|
645
|
+
if element == "description":
|
|
646
|
+
element = "subtitle"
|
|
647
|
+
context[element] = output
|
|
648
|
+
if element == "link":
|
|
649
|
+
# fix query variables; see above for the explanation
|
|
650
|
+
output = re.sub("&([A-Za-z0-9_]+);", r"&\g<1>", output)
|
|
651
|
+
context[element] = output
|
|
652
|
+
context["links"][-1]["href"] = output
|
|
653
|
+
elif self.incontent:
|
|
654
|
+
contentparams = copy.deepcopy(self.contentparams)
|
|
655
|
+
contentparams["value"] = output
|
|
656
|
+
context[element + "_detail"] = contentparams
|
|
657
|
+
return output
|
|
658
|
+
|
|
659
|
+
def push_content(self, tag, attrs_d, default_content_type, expecting_text):
    """Open a content-bearing element and set up ``self.contentparams``.

    The content type comes from the element's ``type`` attribute (falling
    back to ``default_content_type``) after short-name expansion; the
    current language and base URI are recorded alongside it, plus a
    ``base64`` flag computed by ``_is_base64``.
    """
    self.incontent += 1
    if self.lang:
        self.lang = self.lang.replace("_", "-")

    declared_type = attrs_d.get("type", default_content_type)
    self.contentparams = FeedParserDict(
        {
            "type": self.map_content_type(declared_type),
            "language": self.lang,
            "base": self.baseuri,
        }
    )
    is_base64 = self._is_base64(attrs_d, self.contentparams)
    self.contentparams["base64"] = is_base64
    self.push(tag, expecting_text)
|
|
674
|
+
|
|
675
|
+
def pop_content(self, tag):
|
|
676
|
+
value = self.pop(tag)
|
|
677
|
+
self.incontent -= 1
|
|
678
|
+
self.contentparams.clear()
|
|
679
|
+
return value
|
|
680
|
+
|
|
681
|
+
# a number of elements in a number of RSS variants are nominally plain
|
|
682
|
+
# text, but this is routinely ignored. This is an attempt to detect
|
|
683
|
+
# the most common cases. As false positives often result in silent
|
|
684
|
+
# data loss, this function errs on the conservative side.
|
|
685
|
+
@staticmethod
|
|
686
|
+
def looks_like_html(s):
|
|
687
|
+
"""
|
|
688
|
+
:type s: str
|
|
689
|
+
:rtype: bool
|
|
690
|
+
"""
|
|
691
|
+
|
|
692
|
+
# must have a close tag or an entity reference to qualify
|
|
693
|
+
if not (re.search(r"</(\w+)>", s) or re.search(r"&#?\w+;", s)):
|
|
694
|
+
return False
|
|
695
|
+
|
|
696
|
+
# all tags must be in a restricted subset of valid HTML tags
|
|
697
|
+
if any(
|
|
698
|
+
t
|
|
699
|
+
for t in re.findall(r"</?(\w+)", s)
|
|
700
|
+
if t.lower() not in HTMLSanitizer.acceptable_elements
|
|
701
|
+
):
|
|
702
|
+
return False
|
|
703
|
+
|
|
704
|
+
# all entities must have been defined as valid HTML entities
|
|
705
|
+
if any(
|
|
706
|
+
e for e in re.findall(r"&(\w+);", s) if e not in html.entities.entitydefs
|
|
707
|
+
):
|
|
708
|
+
return False
|
|
709
|
+
|
|
710
|
+
return True
|
|
711
|
+
|
|
712
|
+
def _map_to_standard_prefix(self, name):
|
|
713
|
+
colonpos = name.find(":")
|
|
714
|
+
if colonpos != -1:
|
|
715
|
+
prefix = name[:colonpos]
|
|
716
|
+
suffix = name[colonpos + 1 :]
|
|
717
|
+
prefix = self.namespacemap.get(prefix, prefix)
|
|
718
|
+
name = prefix + ":" + suffix
|
|
719
|
+
return name
|
|
720
|
+
|
|
721
|
+
def _get_attribute(self, attrs_d, name):
|
|
722
|
+
return attrs_d.get(self._map_to_standard_prefix(name))
|
|
723
|
+
|
|
724
|
+
def _is_base64(self, attrs_d, contentparams):
|
|
725
|
+
if attrs_d.get("mode", "") == "base64":
|
|
726
|
+
return 1
|
|
727
|
+
if self.contentparams["type"].startswith("text/"):
|
|
728
|
+
return 0
|
|
729
|
+
if self.contentparams["type"].endswith("+xml"):
|
|
730
|
+
return 0
|
|
731
|
+
if self.contentparams["type"].endswith("/xml"):
|
|
732
|
+
return 0
|
|
733
|
+
return 1
|
|
734
|
+
|
|
735
|
+
@staticmethod
|
|
736
|
+
def _enforce_href(attrs_d):
|
|
737
|
+
href = attrs_d.get("url", attrs_d.get("uri", attrs_d.get("href", None)))
|
|
738
|
+
if href:
|
|
739
|
+
try:
|
|
740
|
+
del attrs_d["url"]
|
|
741
|
+
except KeyError:
|
|
742
|
+
pass
|
|
743
|
+
try:
|
|
744
|
+
del attrs_d["uri"]
|
|
745
|
+
except KeyError:
|
|
746
|
+
pass
|
|
747
|
+
attrs_d["href"] = href
|
|
748
|
+
return attrs_d
|
|
749
|
+
|
|
750
|
+
def _save(self, key, value, overwrite=False):
|
|
751
|
+
context = self._get_context()
|
|
752
|
+
if overwrite:
|
|
753
|
+
context[key] = value
|
|
754
|
+
else:
|
|
755
|
+
context.setdefault(key, value)
|
|
756
|
+
|
|
757
|
+
def _get_context(self):
|
|
758
|
+
if self.insource:
|
|
759
|
+
context = self.sourcedata
|
|
760
|
+
elif self.inimage and "image" in self.feeddata:
|
|
761
|
+
context = self.feeddata["image"]
|
|
762
|
+
elif self.intextinput:
|
|
763
|
+
context = self.feeddata["textinput"]
|
|
764
|
+
elif self.inentry:
|
|
765
|
+
context = self.entries[-1]
|
|
766
|
+
else:
|
|
767
|
+
context = self.feeddata
|
|
768
|
+
return context
|
|
769
|
+
|
|
770
|
+
def _save_author(self, key, value, prefix="author"):
    """Record one author field (``key`` -> ``value``) in the current context.

    The value is written to ``<prefix>_detail`` (created on demand), then
    ``self._sync_author_detail()`` is invoked with its default key, and
    finally the value is written to the last item of ``authors`` (created
    with one empty item on demand).
    """
    target = self._get_context()

    detail_key = prefix + "_detail"
    detail = target.setdefault(detail_key, FeedParserDict())
    detail[key] = value

    # Must run before "authors" is touched, matching the original order.
    self._sync_author_detail()

    authors = target.setdefault("authors", [FeedParserDict()])
    authors[-1][key] = value
|
|
777
|
+
|
|
778
|
+
def _save_contributor(self, key, value):
    """Set ``key`` to ``value`` on the last contributor of the current context.

    The ``contributors`` list is created with a single empty item when the
    context does not have one yet.
    """
    contributors = self._get_context().setdefault(
        "contributors", [FeedParserDict()]
    )
    contributors[-1][key] = value
|
|
782
|
+
|
|
783
|
+
def _sync_author_detail(self, key="author"):
    """Keep ``context[key]`` and its structured detail dict consistent.

    :param key: singular field name; ``"<key>s"`` is read as the list of
        detail dicts and ``"<key>_detail"`` may be created

    When the last item of ``context["<key>s"]`` is non-empty, the flat
    string ``context[key]`` is rebuilt from its ``name`` / ``email``.
    Otherwise the flat string is parsed: the first match of
    ``email_pattern`` becomes the email, the remainder (minus wrapping
    brackets or parentheses) becomes the name, and both are stored on the
    detail dict.
    """
    context = self._get_context()
    detail = context.get("%ss" % key, [FeedParserDict()])[-1]
    if detail:
        # Structured data is available: derive the flat string from it.
        name = detail.get("name")
        email = detail.get("email")
        if name and email:
            context[key] = f"{name} ({email})"
        elif name:
            context[key] = name
        elif email:
            context[key] = email
    else:
        # Only the flat string (if any) is available: split it up.
        author, email = context.get(key), None
        if not author:
            return
        emailmatch = email_pattern.search(author)
        if emailmatch:
            email = emailmatch.group(0)
            author = author.replace(email, "")
            # Removing the email leaves its empty wrapper behind; strip the
            # literal and the entity-escaped angle-bracket forms alike.
            for leftover in ("()", "<>", "&lt;&gt;"):
                author = author.replace(leftover, "")
            author = author.strip()
            # Drop a dangling parenthesis at either end of the name.
            if author and (author[0] == "("):
                author = author[1:]
            if author and (author[-1] == ")"):
                author = author[:-1]
            author = author.strip()
        if author or email:
            # Has no effect when "<key>_detail" is already present.
            context.setdefault("%s_detail" % key, detail)
        if author:
            detail["name"] = author
        if email:
            detail["email"] = email
|
|
820
|
+
|
|
821
|
+
def _add_tag(self, term, scheme, label):
    """Append a tag to the current context's ``tags`` list, skipping duplicates.

    The ``tags`` list is always created if missing. Nothing is appended
    when ``term``, ``scheme`` and ``label`` are all falsy, or when an equal
    tag is already in the list.
    """
    known_tags = self._get_context().setdefault("tags", [])
    if not (term or scheme or label):
        return
    candidate = FeedParserDict(term=term, scheme=scheme, label=label)
    if candidate in known_tags:
        return
    known_tags.append(candidate)
|
|
829
|
+
|
|
830
|
+
def _start_tags(self, attrs_d):
|
|
831
|
+
# This is a completely-made up element. Its semantics are determined
|
|
832
|
+
# only by a single feed that precipitated bug report 392 on Google Code.
|
|
833
|
+
# In short, this is junk code.
|
|
834
|
+
self.push("tags", 1)
|
|
835
|
+
|
|
836
|
+
def _end_tags(self):
|
|
837
|
+
for term in self.pop("tags").split(","):
|
|
838
|
+
self._add_tag(term.strip(), None, None)
|