forkparser 2026.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
feedparser/html.py ADDED
@@ -0,0 +1,350 @@
1
+ # Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
2
+ # Copyright 2002-2008 Mark Pilgrim
3
+ # All rights reserved.
4
+ #
5
+ # This file is a part of feedparser.
6
+ #
7
+ # Redistribution and use in source and binary forms, with or without
8
+ # modification, are permitted provided that the following conditions are met:
9
+ #
10
+ # * Redistributions of source code must retain the above copyright notice,
11
+ # this list of conditions and the following disclaimer.
12
+ # * Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
17
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26
+ # POSSIBILITY OF SUCH DAMAGE.
27
+
28
+ import html.entities
29
+ import re
30
+
31
+ # These items must all be imported into this module due to .__code__ replacements.
32
+ from .sgml import ( # noqa: F401
33
+ attrfind,
34
+ charref,
35
+ endbracket,
36
+ entityref,
37
+ incomplete,
38
+ interesting,
39
+ sgmllib,
40
+ shorttag,
41
+ shorttagopen,
42
+ starttagopen,
43
+ tagfind,
44
+ )
45
+
46
# Translation table for the Windows-1252 code points 0x80-0x9F that are
# commonly (and incorrectly) used as numeric character references.
# Keys are the cp1252 byte values; values are the Unicode characters
# they stand for.  The five byte values that cp1252 leaves undefined
# (0x81, 0x8D, 0x8F, 0x90, 0x9D) are intentionally absent.
_cp1252 = {
    0x80: "\u20ac",  # EURO SIGN
    0x82: "\u201a",  # SINGLE LOW-9 QUOTATION MARK
    0x83: "\u0192",  # LATIN SMALL LETTER F WITH HOOK
    0x84: "\u201e",  # DOUBLE LOW-9 QUOTATION MARK
    0x85: "\u2026",  # HORIZONTAL ELLIPSIS
    0x86: "\u2020",  # DAGGER
    0x87: "\u2021",  # DOUBLE DAGGER
    0x88: "\u02c6",  # MODIFIER LETTER CIRCUMFLEX ACCENT
    0x89: "\u2030",  # PER MILLE SIGN
    0x8A: "\u0160",  # LATIN CAPITAL LETTER S WITH CARON
    0x8B: "\u2039",  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x8C: "\u0152",  # LATIN CAPITAL LIGATURE OE
    0x8E: "\u017d",  # LATIN CAPITAL LETTER Z WITH CARON
    0x91: "\u2018",  # LEFT SINGLE QUOTATION MARK
    0x92: "\u2019",  # RIGHT SINGLE QUOTATION MARK
    0x93: "\u201c",  # LEFT DOUBLE QUOTATION MARK
    0x94: "\u201d",  # RIGHT DOUBLE QUOTATION MARK
    0x95: "\u2022",  # BULLET
    0x96: "\u2013",  # EN DASH
    0x97: "\u2014",  # EM DASH
    0x98: "\u02dc",  # SMALL TILDE
    0x99: "\u2122",  # TRADE MARK SIGN
    0x9A: "\u0161",  # LATIN SMALL LETTER S WITH CARON
    0x9B: "\u203a",  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x9C: "\u0153",  # LATIN SMALL LIGATURE OE
    0x9E: "\u017e",  # LATIN SMALL LETTER Z WITH CARON
    0x9F: "\u0178",  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}
75
+
76
+
77
class BaseHTMLProcessor(sgmllib.SGMLParser):
    """SGML parser that re-serializes the markup it is fed.

    Each handler appends a reconstructed fragment to ``self.pieces``;
    :meth:`output` joins the fragments into the final string.
    """

    # Characters that are significant inside markup.
    special = re.compile("""[<>'"]""")
    # An ampersand that does not start a numeric, hexadecimal or named reference.
    bare_ampersand = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
    # Elements that are emitted as ``<tag />`` and never receive an end tag.
    elements_no_end_tag = {
        "area", "base", "basefont", "br", "col",
        "command", "embed", "frame", "hr", "img",
        "input", "isindex", "keygen", "link", "meta",
        "param", "source", "track", "wbr",
    }

    def __init__(self, encoding=None, _type="application/xhtml+xml"):
        """Set up the processor.

        ``self.encoding`` is only assigned when *encoding* is truthy.
        """

        self.pieces = []
        self._type = _type
        if encoding:
            self.encoding = encoding
        super().__init__()

    def reset(self):
        """Discard any accumulated output and reset the underlying parser."""

        self.pieces = []
        super().reset()

    def _shorttag_replace(self, match):
        """Expand a self-closing tag matched by the pattern used in feed().

        :type match: Match[str]
        :rtype: str
        """

        name = match.group(1)
        if name not in self.elements_no_end_tag:
            # Elements that may have content get an explicit end tag.
            return f"<{name}></{name}>"
        return f"<{name} />"

    # By declaring these methods and overriding their compiled code
    # with the code from sgmllib, the original code will execute in
    # feedparser's scope instead of sgmllib's. This means that the
    # `tagfind` and `charref` regular expressions will be found as
    # they're declared above, not as they're declared in sgmllib.
    def goahead(self, i):
        raise NotImplementedError

    # Replace goahead with SGMLParser's goahead() code object.
    goahead.__code__ = sgmllib.SGMLParser.goahead.__code__

    def __parse_starttag(self, i):
        raise NotImplementedError

    # Replace __parse_starttag with SGMLParser's parse_starttag() code object.
    __parse_starttag.__code__ = sgmllib.SGMLParser.parse_starttag.__code__

    def parse_starttag(self, i):
        """Parse a start tag and, for XHTML, close it if it ended in ``/>``.

        Returns whatever the wrapped start-tag parser returned.
        """

        end = self.__parse_starttag(i)
        if (
            self._type == "application/xhtml+xml"
            and end > 2
            and self.rawdata[end - 2 : end] == "/>"
        ):
            self.unknown_endtag(self.lasttag)
        return end

    def feed(self, data):
        """Pre-process *data*, feed it to the parser, then close the parser.

        :type data: str
        :rtype: None
        """

        # Escape "<!" unless it introduces a DOCTYPE, a comment or a "<![" section.
        data = re.sub(r"<!((?!DOCTYPE|--|\[))", r"&lt;!\1", data, flags=re.IGNORECASE)
        # Rewrite self-closing tags via _shorttag_replace().
        data = re.sub(r"<([^<>\s]+?)\s*/>", self._shorttag_replace, data)
        # Turn the numeric references for the two quote characters into literals.
        data = data.replace("&#39;", "'").replace("&#34;", '"')
        super().feed(data)
        super().close()

    @staticmethod
    def normalize_attrs(attrs):
        """Lowercase attribute names, collapse duplicates and sort the result.

        Utility method to be called by descendants.  The values of the
        ``rel`` and ``type`` attributes are lowercased as well.  A falsy
        *attrs* is returned unchanged.

        :type attrs: List[Tuple[str, str]]
        :rtype: List[Tuple[str, str]]
        """

        if not attrs:
            return attrs
        # A dictionary keyed on the lowercased name keeps only the last
        # value seen for each attribute.
        unique = {}
        for name, value in attrs:
            unique[name.lower()] = value
        return sorted(
            (name, value.lower() if name in ("rel", "type") else value)
            for name, value in unique.items()
        )

    def unknown_starttag(self, tag, attrs):
        """Append the reconstructed start tag to the output.

        Called for each start tag; *attrs* is a list of (attr, value)
        tuples, e.g. for <pre class='screen'>, tag='pre' and
        attrs=[('class', 'screen')].

        :type tag: str
        :type attrs: List[Tuple[str, str]]
        :rtype: None
        """

        rendered = []
        for key, value in attrs or ():
            value = (
                value.replace(">", "&gt;").replace("<", "&lt;").replace('"', "&quot;")
            )
            value = self.bare_ampersand.sub("&amp;", value)
            rendered.append(f' {key}="{value}"')
        strattrs = "".join(rendered)
        closing = " />" if tag in self.elements_no_end_tag else ">"
        self.pieces.append(f"<{tag}{strattrs}{closing}")

    def unknown_endtag(self, tag):
        """Append the reconstructed end tag, e.g. ``</pre>`` for tag 'pre'.

        Nothing is appended for elements listed in ``elements_no_end_tag``.

        :type tag: str
        :rtype: None
        """

        if tag in self.elements_no_end_tag:
            return
        self.pieces.append(f"</{tag}>")

    def handle_charref(self, ref):
        """Append a numeric character reference to the output.

        Called for each character reference, e.g. '&#160;' will extract
        '160'.  Values found in the cp1252 table are rewritten as the
        hexadecimal reference of the mapped character; everything else
        is re-emitted using the lowercased *ref*.

        :type ref: str
        :rtype: None
        """

        ref = ref.lower()
        codepoint = int(ref[1:], 16) if ref.startswith("x") else int(ref)

        mapped = _cp1252.get(codepoint)
        if mapped is None:
            self.pieces.append(f"&#{ref};")
        else:
            self.pieces.append(f"&#x{ord(mapped):x};")

    def handle_entityref(self, ref):
        """Append a named entity reference to the output.

        Called for each entity reference, e.g. '&copy;' will extract
        'copy'.  Names that are neither in ``html.entities.name2codepoint``
        nor equal to 'apos' are emitted with the ampersand escaped.

        :type ref: str
        :rtype: None
        """

        recognized = ref in html.entities.name2codepoint or ref == "apos"
        self.pieces.append(f"&{ref};" if recognized else f"&amp;{ref}")

    def handle_data(self, text):
        """Append a block of plain text to the output verbatim.

        :type text: str
        :rtype: None
        """

        self.pieces.append(text)

    def handle_comment(self, text):
        """Append the reconstructed comment, e.g. ``<!-- text -->``.

        :type text: str
        :rtype: None
        """

        self.pieces.append(f"<!--{text}-->")

    def handle_pi(self, text):
        """Append the reconstructed processing instruction, e.g. ``<?instruction>``.

        :type text: str
        :rtype: None
        """

        self.pieces.append(f"<?{text}>")

    def handle_decl(self, text):
        """Append the reconstructed declaration, e.g. a DOCTYPE.

        For example:
        <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
            "http://www.w3.org/TR/html4/loose.dtd">

        :type text: str
        :rtype: None
        """

        self.pieces.append(f"<!{text}>")

    # Matches a declaration name plus any whitespace that follows it.
    _new_declname_match = re.compile(r"[a-zA-Z][-_.a-zA-Z0-9:]*\s*").match

    def _scan_name(self, i, declstartpos):
        """Scan a declaration name starting at offset *i* of the raw data.

        Returns ``(name, end)`` with the lowercased name on success, or
        ``(None, -1)`` when the buffer is exhausted or no name is found.
        When no name is found the raw data is passed to handle_data().
        *declstartpos* is accepted but not used.

        :type i: int
        :type declstartpos: int
        :rtype: Tuple[Optional[str], int]
        """

        buffer = self.rawdata
        size = len(buffer)
        if i == size:
            return None, -1
        found = self._new_declname_match(buffer, i)
        if found is None:
            self.handle_data(buffer)
            return None, -1
        matched = found.group()
        if i + len(matched) == size:
            return None, -1  # end of buffer
        return matched.strip().lower(), found.end()

    def convert_charref(self, name):
        """Return *name* wrapped as a numeric character reference.

        :type name: str
        :rtype: str
        """

        return f"&#{name};"

    def convert_entityref(self, name):
        """Return *name* wrapped as a named entity reference.

        :type name: str
        :rtype: str
        """

        return f"&{name};"

    def output(self):
        """Return processed HTML as a single string.

        :rtype: str
        """

        return "".join(self.pieces)

    def parse_declaration(self, i):
        """Parse a declaration, escaping it when the base parser rejects it.

        :type i: int
        :rtype: int
        """

        try:
            end = sgmllib.SGMLParser.parse_declaration(self, i)
        except (AssertionError, sgmllib.SGMLParseError):
            # Escape the doctype declaration and continue parsing.
            self.handle_data("&lt;")
            return i + 1
        return end
feedparser/http.py ADDED
@@ -0,0 +1,74 @@
1
+ # Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
2
+ # Copyright 2002-2008 Mark Pilgrim
3
+ # All rights reserved.
4
+ #
5
+ # This file is a part of feedparser.
6
+ #
7
+ # Redistribution and use in source and binary forms, with or without
8
+ # modification, are permitted provided that the following conditions are met:
9
+ #
10
+ # * Redistributions of source code must retain the above copyright notice,
11
+ # this list of conditions and the following disclaimer.
12
+ # * Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
17
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26
+ # POSSIBILITY OF SUCH DAMAGE.
27
+
28
+ from __future__ import annotations
29
+
30
+ import typing
31
+
32
+ import requests
33
+
34
+ from .datetimes import _parse_date
35
+
36
+ # HTTP "Accept" header to send to servers when downloading feeds.
37
# Media types are listed in order; an entry without an explicit
# ";q=" parameter is followed directly by the next media type.
ACCEPT_HEADER: str = ",".join(
    (
        "application/atom+xml",
        "application/rdf+xml",
        "application/rss+xml",
        "application/x-netcdf",
        "application/xml;q=0.9",
        "text/xml;q=0.2",
        "*/*;q=0.1",
    )
)
47
+
48
+
49
def get(url: str, result: dict[str, typing.Any]) -> bytes:
    """Download *url* and record response metadata in *result*.

    On a ``requests.RequestException`` the keys ``bozo`` and
    ``bozo_exception`` are set on *result* and ``b""`` is returned.
    Otherwise *result* receives ``headers`` (with lowercased names),
    ``href`` and ``status``, plus ``etag``, ``modified`` and
    ``modified_parsed`` when the corresponding headers are present,
    and the response body is returned.
    """

    try:
        response = requests.get(url, headers={"Accept": ACCEPT_HEADER}, timeout=10)
    except requests.RequestException as error:
        result["bozo"] = True
        result["bozo_exception"] = error
        return b""

    # Lowercase the HTTP header keys for comparisons per RFC 2616.
    headers = {name.lower(): value for name, value in response.headers.items()}
    result["headers"] = headers

    # Save the caching-related HTTP headers.
    if "etag" in headers:
        result["etag"] = headers["etag"]
    last_modified = headers.get("last-modified")
    if last_modified:
        result["modified"] = last_modified
        result["modified_parsed"] = _parse_date(last_modified)

    result["href"] = response.url
    result["status"] = response.status_code
    return response.content