forkparser 2026.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
feedparser/__init__.py ADDED
@@ -0,0 +1,66 @@
1
+ # Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
2
+ # Copyright 2002-2008 Mark Pilgrim
3
+ # All rights reserved.
4
+ #
5
+ # This file is part of feedparser.
6
+ #
7
+ # Redistribution and use in source and binary forms, with or without modification,
8
+ # are permitted provided that the following conditions are met:
9
+ #
10
+ # * Redistributions of source code must retain the above copyright notice,
11
+ # this list of conditions and the following disclaimer.
12
+ # * Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
17
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26
+ # POSSIBILITY OF SUCH DAMAGE.
27
+
28
+ from .api import parse
29
+ from .datetimes import registerDateHandler
30
+ from .exceptions import (
31
+ CharacterEncodingOverride,
32
+ CharacterEncodingUnknown,
33
+ FeedparserError,
34
+ NonXMLContentType,
35
+ UndeclaredNamespace,
36
+ )
37
+ from .util import FeedParserDict
38
+
39
+ __author__ = "Kurt McKee <contactme@kurtmckee.org>"
40
+ __license__ = "BSD 2-clause"
41
+ __version__ = "6.0.12"
42
+
43
+ # If you want feedparser to automatically resolve all relative URIs, set this
44
+ # to 1.
45
+ RESOLVE_RELATIVE_URIS = 1
46
+
47
+ # If you want feedparser to automatically sanitize all potentially unsafe
48
+ # HTML content, set this to 1.
49
+ SANITIZE_HTML = 1
50
+
51
+
52
+ # If you want feedparser to use only a prefix of the feed to detect encodings
53
+ # (uses less memory), set this to 1.
54
+ OPTIMISTIC_ENCODING_DETECTION = 1
55
+
56
+
57
+ __all__ = (
58
+ "parse",
59
+ "registerDateHandler",
60
+ "FeedParserDict",
61
+ "FeedparserError",
62
+ "CharacterEncodingOverride",
63
+ "CharacterEncodingUnknown",
64
+ "NonXMLContentType",
65
+ "UndeclaredNamespace",
66
+ )
feedparser/api.py ADDED
@@ -0,0 +1,376 @@
1
+ # The public API for feedparser
2
+ # Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
3
+ # Copyright 2002-2008 Mark Pilgrim
4
+ # All rights reserved.
5
+ #
6
+ # This file is a part of feedparser.
7
+ #
8
+ # Redistribution and use in source and binary forms, with or without modification,
9
+ # are permitted provided that the following conditions are met:
10
+ #
11
+ # * Redistributions of source code must retain the above copyright notice,
12
+ # this list of conditions and the following disclaimer.
13
+ # * Redistributions in binary form must reproduce the above copyright notice,
14
+ # this list of conditions and the following disclaimer in the documentation
15
+ # and/or other materials provided with the distribution.
16
+ #
17
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
18
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27
+ # POSSIBILITY OF SUCH DAMAGE.
28
+
29
+ import io
30
+ import urllib.error
31
+ import urllib.parse
32
+ import xml.sax
33
+ from typing import IO
34
+
35
+ from . import http
36
+ from .encodings import MissingEncoding, convert_file_to_utf8
37
+ from .html import BaseHTMLProcessor
38
+ from .mixin import XMLParserMixin
39
+ from .parsers.json import JSONParser
40
+ from .parsers.loose import LooseXMLParser
41
+ from .parsers.strict import StrictXMLParser
42
+ from .sanitizer import replace_doctype
43
+ from .urls import make_safe_absolute_uri
44
+ from .util import FeedParserDict
45
+
46
+ # List of preferred XML parsers, by SAX driver name. These will be tried first,
47
+ # but if they're not installed, Python will keep searching through its own list
48
+ # of pre-installed parsers until it finds one that supports everything we need.
49
+ PREFERRED_XML_PARSERS = ["drv_libxml2"]
50
+
51
+ _XML_AVAILABLE = True
52
+
53
+ SUPPORTED_VERSIONS = {
54
+ "": "unknown",
55
+ "rss090": "RSS 0.90",
56
+ "rss091n": "RSS 0.91 (Netscape)",
57
+ "rss091u": "RSS 0.91 (Userland)",
58
+ "rss092": "RSS 0.92",
59
+ "rss093": "RSS 0.93",
60
+ "rss094": "RSS 0.94",
61
+ "rss20": "RSS 2.0",
62
+ "rss10": "RSS 1.0",
63
+ "rss": "RSS (unknown version)",
64
+ "atom01": "Atom 0.1",
65
+ "atom02": "Atom 0.2",
66
+ "atom03": "Atom 0.3",
67
+ "atom10": "Atom 1.0",
68
+ "atom": "Atom (unknown version)",
69
+ "cdf": "CDF",
70
+ "json1": "JSON feed 1",
71
+ }
72
+
73
+
74
+ def _open_resource(
75
+ url_file_stream_or_string,
76
+ result,
77
+ ):
78
+ """URL, filename, or string --> stream
79
+
80
+ This function lets you define parsers that take any input source
81
+ (URL, pathname to local or network file, or actual data as a string)
82
+ and deal with it in a uniform manner. Returned object is guaranteed
83
+ to have all the basic stdio read methods (read, readline, readlines).
84
+ Just .close() the object when you're done with it.
85
+
86
+ :return: A seekable, readable file object.
87
+ """
88
+
89
+ # Some notes on the history of the implementation of _open_resource().
90
+ #
91
+ # parse() might need to go over the feed content twice:
92
+ # if the strict parser fails, it tries again with the loose parser.
93
+ #
94
+ # In 5.2.0, this returned an open file, to be read() by parse().
95
+ # By 6.0.8, this returned bytes directly.
96
+ #
97
+ # Since #296 (>6.0.8), this once again returns an open file
98
+ # (to reduce memory usage, see convert_file_to_utf8() for details).
99
+ # However, to accommodate parse() needing the content twice,
100
+ # the returned file is guaranteed to be seekable.
101
+ # (If the underlying resource is not seekable,
102
+ # the content is read and wrapped in a io.BytesIO/StringIO.)
103
+
104
+ if callable(getattr(url_file_stream_or_string, "read", None)):
105
+ if callable(getattr(url_file_stream_or_string, "seekable", None)):
106
+ if url_file_stream_or_string.seekable():
107
+ return url_file_stream_or_string
108
+ return _to_in_memory_file(url_file_stream_or_string.read())
109
+
110
+ looks_like_url = isinstance(
111
+ url_file_stream_or_string, str
112
+ ) and urllib.parse.urlparse(url_file_stream_or_string)[0] in (
113
+ "http",
114
+ "https",
115
+ )
116
+ if looks_like_url:
117
+ data = http.get(url_file_stream_or_string, result)
118
+ return io.BytesIO(data)
119
+
120
+ # try to open with native open function (if url_file_stream_or_string is a filename)
121
+ try:
122
+ return open(url_file_stream_or_string, "rb")
123
+ except (OSError, TypeError, ValueError):
124
+ # if url_file_stream_or_string is a str object that
125
+ # cannot be converted to the encoding returned by
126
+ # sys.getfilesystemencoding(), a UnicodeEncodeError
127
+ # will be thrown
128
+ # If url_file_stream_or_string is a string that contains NULL
129
+ # (such as an XML document encoded in UTF-32), TypeError will
130
+ # be thrown.
131
+ pass
132
+
133
+ # treat url_file_stream_or_string as bytes/string
134
+ return _to_in_memory_file(url_file_stream_or_string)
135
+
136
+
137
+ def _to_in_memory_file(data):
138
+ if isinstance(data, str):
139
+ return io.StringIO(data)
140
+ return io.BytesIO(data)
141
+
142
+
143
class LooseFeedParser(LooseXMLParser, XMLParserMixin, BaseHTMLProcessor):
    """Feed parser combining the loose XML parser, the shared parser mixin,
    and the HTML processor; adds no behavior of its own."""
145
+
146
+
147
class StrictFeedParser(StrictXMLParser, XMLParserMixin, xml.sax.handler.ContentHandler):
    """Feed parser combining the strict XML parser, the shared parser mixin,
    and the SAX content-handler interface; adds no behavior of its own."""
149
+
150
+
151
def parse(
    url_file_stream_or_string,
    response_headers: dict[str, str] | None = None,
    resolve_relative_uris: bool | None = None,
    sanitize_html: bool | None = None,
    optimistic_encoding_detection: bool | None = None,
) -> FeedParserDict:
    """Parse a feed from a URL, file, stream, or string.

    :param url_file_stream_or_string:
        File-like object, URL, file path, or string. Both byte and text strings
        are accepted. If necessary, encoding will be derived from the response
        headers or automatically detected.

        Note that strings may trigger network I/O or filesystem access
        depending on the value. Wrap an untrusted string in
        a :class:`io.StringIO` or :class:`io.BytesIO` to avoid this. Do not
        pass untrusted strings to this function.

        When a URL is not passed the feed location to use in relative URL
        resolution should be passed in the ``Content-Location`` response header
        (see ``response_headers`` below).
    :param response_headers:
        A mapping of HTTP header name to HTTP header value. Multiple values may
        be joined with a comma. If a HTTP request was made, these headers
        override any matching headers in the response. Otherwise this specifies
        the entirety of the response headers.
    :param resolve_relative_uris:
        Should feedparser attempt to resolve relative URIs to absolute ones
        within HTML content? Defaults to the value of
        :data:`feedparser.RESOLVE_RELATIVE_URIS`, which is ``True``.
    :param sanitize_html:
        Should feedparser sanitize HTML content? Only disable this if you know
        what you are doing! Defaults to the value of
        :data:`feedparser.SANITIZE_HTML`, which is ``True``.
    :param optimistic_encoding_detection:
        Should feedparser use only a prefix of the feed to detect encodings
        (uses less memory, but the wrong encoding may be detected in rare cases).
        Defaults to the value of
        :data:`feedparser.OPTIMISTIC_ENCODING_DETECTION`, which is ``True``.
    :return:
        A :class:`FeedParserDict` that always contains ``bozo``, ``entries``,
        ``feed`` and ``headers``. If opening the resource raises
        :class:`urllib.error.URLError`, ``bozo`` is ``True`` and the error is
        stored under ``bozo_exception``. If the resource is empty, the initial
        result is returned unchanged.

    """

    result = FeedParserDict(
        bozo=False,
        entries=[],
        feed=FeedParserDict(),
        headers={},
    )

    try:
        file = _open_resource(
            url_file_stream_or_string,
            result,
        )
    except urllib.error.URLError as error:
        result.update(
            {
                "bozo": True,
                "bozo_exception": error,
            }
        )
        return result

    # Only close the file when it does not come from the user.
    opened_here = not hasattr(url_file_stream_or_string, "read")

    try:
        # at this point, the file is guaranteed to be seekable;
        # we read 1 byte/character to see if it's empty and return early
        # (this preserves the behavior in 6.0.8)
        #
        # This check lives inside the try block so that a file opened by
        # _open_resource() is also closed on the early return.
        initial_file_offset = file.tell()
        if not file.read(1):
            return result
        file.seek(initial_file_offset)

        # overwrite existing headers using response_headers
        result["headers"].update(response_headers or {})

        _parse_file_inplace(
            file,
            result,
            resolve_relative_uris=resolve_relative_uris,
            sanitize_html=sanitize_html,
            optimistic_encoding_detection=optimistic_encoding_detection,
        )
    finally:
        if opened_here:
            file.close()

    return result
240
+
241
+
242
def _parse_file_inplace(
    file: IO[bytes] | IO[str],
    result: dict,
    *,
    resolve_relative_uris: bool | None = None,
    sanitize_html: bool | None = None,
    optimistic_encoding_detection: bool | None = None,
) -> None:
    """Parse the feed in *file* and store the outcome in *result*.

    Parsers are tried in this order: the strict (SAX) XML parser, the loose
    XML parser if the strict one was not used or failed, and the JSON parser
    if the content type is JSON or the loose parser found nothing.

    :param file: The feed content; it is only accessed through the stream
        factory returned by ``convert_file_to_utf8()``.
    :param result: The result mapping, updated in place. It must already
        contain ``headers``. ``content-type`` and ``encoding`` are read after
        ``convert_file_to_utf8()`` runs (presumably set by it -- confirm
        against that function). ``version``, ``feed``, ``entries`` and
        ``namespaces`` are always written; ``bozo`` and ``bozo_exception``
        are written when a parser raises.
    :param resolve_relative_uris: Passed to the XML parsers; ``None`` means
        use ``feedparser.RESOLVE_RELATIVE_URIS``.
    :param sanitize_html: Passed to the XML parsers; ``None`` means use
        ``feedparser.SANITIZE_HTML``.
    :param optimistic_encoding_detection: Passed to
        ``convert_file_to_utf8()``; ``None`` means use
        ``feedparser.OPTIMISTIC_ENCODING_DETECTION``.
    """
    # Avoid a cyclic import.
    import feedparser

    if sanitize_html is None:
        sanitize_html = bool(feedparser.SANITIZE_HTML)
    if resolve_relative_uris is None:
        resolve_relative_uris = bool(feedparser.RESOLVE_RELATIVE_URIS)
    if optimistic_encoding_detection is None:
        optimistic_encoding_detection = bool(feedparser.OPTIMISTIC_ENCODING_DETECTION)

    stream_factory = convert_file_to_utf8(
        result["headers"], file, result, optimistic_encoding_detection
    )
    # We're done with file, all access must happen through stream_factory.
    del file

    # Some notes about the stream_factory.get_{text,binary}_file() methods:
    #
    # Calling them a second time will raise io.UnsupportedOperation
    # if the underlying file was not seekable.
    #
    # Calling close() on the returned file is ignored
    # (that is, the underlying file is *not* closed),
    # because the SAX parser closes the file when done;
    # we don't want that, since we might try again with the loose parser.

    use_json_parser = False
    if result["content-type"] in {"application/json", "application/feed+json"}:
        use_json_parser = True
    use_strict_parser = bool(result["encoding"])

    result["version"], stream_factory.prefix, entities = replace_doctype(
        stream_factory.prefix
    )

    # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
    contentloc = result["headers"].get("content-location", "")
    href = result.get("href", "")
    baseuri = (
        make_safe_absolute_uri(href, contentloc)
        or make_safe_absolute_uri(contentloc)
        or href
    )

    # The isinstance() check already excludes None, so no separate
    # None test is needed.
    baselang = result["headers"].get("content-language", None)
    if isinstance(baselang, bytes):
        baselang = baselang.decode("utf-8", "ignore")

    if not _XML_AVAILABLE:
        use_strict_parser = False

    feed_parser: JSONParser | StrictFeedParser | LooseFeedParser

    if use_strict_parser and not use_json_parser:
        # Initialize the SAX parser.
        feed_parser = StrictFeedParser(baseuri, baselang, "utf-8")
        feed_parser.resolve_relative_uris = resolve_relative_uris
        feed_parser.sanitize_html = sanitize_html
        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        try:
            # Disable downloading external doctype references, if possible.
            saxparser.setFeature(xml.sax.handler.feature_external_ges, 0)
        except xml.sax.SAXNotSupportedException:
            pass
        saxparser.setContentHandler(feed_parser)
        saxparser.setErrorHandler(feed_parser)  # type: ignore[arg-type]
        source = xml.sax.xmlreader.InputSource()

        # If an encoding was detected, decode the file on the fly;
        # otherwise, pass it as-is and let the SAX parser deal with it.
        try:
            source.setCharacterStream(stream_factory.get_text_file())
        except MissingEncoding:
            source.setByteStream(stream_factory.get_binary_file())

        try:
            saxparser.parse(source)
        except xml.sax.SAXException as e:
            result["bozo"] = 1
            result["bozo_exception"] = feed_parser.exc or e
            use_strict_parser = False

    # The loose XML parser will be tried if the strict XML parser was not used
    # (or if it failed to parse the feed).
    if not use_strict_parser and not use_json_parser:
        feed_parser = LooseFeedParser(baseuri, baselang, "utf-8", entities)
        feed_parser.resolve_relative_uris = resolve_relative_uris
        feed_parser.sanitize_html = sanitize_html

        # If an encoding was detected, use it; otherwise, assume utf-8 and do your best.
        # Will raise io.UnsupportedOperation if the underlying file is not seekable.
        data = stream_factory.get_text_file("utf-8", "replace").read()

        # As of 6.0.8, LooseFeedParser.feed() can be called exactly once
        # with the entire data (it does some re.sub() and str.replace() on it).
        #
        # SGMLParser (of which LooseFeedParser is a subclass)
        # *can* be fed in a streaming fashion,
        # by calling feed() repeatedly with chunks of text.
        #
        # When/if LooseFeedParser will support being fed chunks,
        # replace the read() call above with read(size)/feed() calls in a loop.

        feed_parser.feed(data)

        # If parsing with the loose XML parser resulted in no information,
        # flag that the JSON parser should be tried.
        if not (feed_parser.entries or feed_parser.feeddata or feed_parser.version):
            use_json_parser = True

    if use_json_parser:
        result["version"] = None
        feed_parser = JSONParser(baseuri, baselang, "utf-8")
        try:
            feed_parser.feed(stream_factory.get_file())
        except Exception as e:
            # Any JSON parsing failure is reported through the bozo fields
            # rather than raised to the caller.
            result["bozo"] = 1
            result["bozo_exception"] = e

    result["feed"] = feed_parser.feeddata
    result["entries"] = feed_parser.entries
    result["version"] = result["version"] or feed_parser.version
    if isinstance(feed_parser, JSONParser):
        result["namespaces"] = {}
    else:
        result["namespaces"] = feed_parser.namespaces_in_use
@@ -0,0 +1,73 @@
1
+ # Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
2
+ # Copyright 2002-2008 Mark Pilgrim
3
+ # All rights reserved.
4
+ #
5
+ # This file is a part of feedparser.
6
+ #
7
+ # Redistribution and use in source and binary forms, with or without
8
+ # modification, are permitted provided that the following conditions are met:
9
+ #
10
+ # * Redistributions of source code must retain the above copyright notice,
11
+ # this list of conditions and the following disclaimer.
12
+ # * Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
17
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26
+ # POSSIBILITY OF SUCH DAMAGE.
27
+
28
+ from collections.abc import Callable
29
+ from time import struct_time
30
+
31
+ from .asctime import _parse_date_asctime
32
+ from .greek import _parse_date_greek
33
+ from .hungarian import _parse_date_hungarian
34
+ from .iso8601 import _parse_date_iso8601
35
+ from .korean import _parse_date_nate, _parse_date_onblog
36
+ from .perforce import _parse_date_perforce
37
+ from .rfc822 import _parse_date_rfc822
38
+ from .w3dtf import _parse_date_w3dtf
39
+
40
+ _date_handlers: list[Callable[[str], struct_time | None]] = []
41
+
42
+
43
def registerDateHandler(func):
    """Register a date handler function (takes string, returns 9-tuple date in GMT).

    The handler is placed at the front of the handler list, so the most
    recently registered handler comes first.
    """
    _date_handlers[:0] = [func]
46
+
47
+
48
+ def _parse_date(date_string):
49
+ """Parses a variety of date formats into a 9-tuple in GMT"""
50
+ if not date_string:
51
+ return None
52
+ for handler in _date_handlers:
53
+ try:
54
+ date9tuple = handler(date_string)
55
+ except (KeyError, OverflowError, ValueError, AttributeError):
56
+ continue
57
+ if not date9tuple:
58
+ continue
59
+ if len(date9tuple) != 9:
60
+ continue
61
+ return date9tuple
62
+ return None
63
+
64
+
65
+ registerDateHandler(_parse_date_onblog)
66
+ registerDateHandler(_parse_date_nate)
67
+ registerDateHandler(_parse_date_greek)
68
+ registerDateHandler(_parse_date_hungarian)
69
+ registerDateHandler(_parse_date_perforce)
70
+ registerDateHandler(_parse_date_asctime)
71
+ registerDateHandler(_parse_date_iso8601)
72
+ registerDateHandler(_parse_date_rfc822)
73
+ registerDateHandler(_parse_date_w3dtf)
@@ -0,0 +1,80 @@
1
+ # Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
2
+ # Copyright 2002-2008 Mark Pilgrim
3
+ # All rights reserved.
4
+ #
5
+ # This file is a part of feedparser.
6
+ #
7
+ # Redistribution and use in source and binary forms, with or without
8
+ # modification, are permitted provided that the following conditions are met:
9
+ #
10
+ # * Redistributions of source code must retain the above copyright notice,
11
+ # this list of conditions and the following disclaimer.
12
+ # * Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
17
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26
+ # POSSIBILITY OF SUCH DAMAGE.
27
+
28
+ from .rfc822 import _parse_date_rfc822
29
+
30
+ _months = [
31
+ "jan",
32
+ "feb",
33
+ "mar",
34
+ "apr",
35
+ "may",
36
+ "jun",
37
+ "jul",
38
+ "aug",
39
+ "sep",
40
+ "oct",
41
+ "nov",
42
+ "dec",
43
+ ]
44
+
45
+
46
+ def _parse_date_asctime(dt):
47
+ """Parse asctime-style dates.
48
+
49
+ Converts asctime to RFC822-compatible dates and uses the RFC822 parser
50
+ to do the actual parsing.
51
+
52
+ Supported formats (format is standardized to the first one listed):
53
+
54
+ * {weekday name} {month name} dd hh:mm:ss {+-tz} yyyy
55
+ * {weekday name} {month name} dd hh:mm:ss yyyy
56
+ """
57
+
58
+ parts = dt.split()
59
+
60
+ # Insert a GMT timezone, if needed.
61
+ if len(parts) == 5:
62
+ parts.insert(4, "+0000")
63
+
64
+ # Exit if there are not six parts.
65
+ if len(parts) != 6:
66
+ return None
67
+
68
+ # Reassemble the parts in an RFC822-compatible order and parse them.
69
+ return _parse_date_rfc822(
70
+ " ".join(
71
+ [
72
+ parts[0],
73
+ parts[2],
74
+ parts[1],
75
+ parts[5],
76
+ parts[3],
77
+ parts[4],
78
+ ]
79
+ )
80
+ )
@@ -0,0 +1,90 @@
1
+ # Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
2
+ # Copyright 2002-2008 Mark Pilgrim
3
+ # All rights reserved.
4
+ #
5
+ # This file is a part of feedparser.
6
+ #
7
+ # Redistribution and use in source and binary forms, with or without
8
+ # modification, are permitted provided that the following conditions are met:
9
+ #
10
+ # * Redistributions of source code must retain the above copyright notice,
11
+ # this list of conditions and the following disclaimer.
12
+ # * Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ #
16
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
17
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26
+ # POSSIBILITY OF SUCH DAMAGE.
27
+
28
+ import re
29
+
30
+ from .rfc822 import _parse_date_rfc822
31
+
32
+ # Unicode strings for Greek date strings
33
+ _greek_months = {
34
+ "\u0399\u03b1\u03bd": "Jan", # c9e1ed in iso-8859-7
35
+ "\u03a6\u03b5\u03b2": "Feb", # d6e5e2 in iso-8859-7
36
+ "\u039c\u03ac\u03ce": "Mar", # ccdcfe in iso-8859-7
37
+ "\u039c\u03b1\u03ce": "Mar", # cce1fe in iso-8859-7
38
+ "\u0391\u03c0\u03c1": "Apr", # c1f0f1 in iso-8859-7
39
+ "\u039c\u03ac\u03b9": "May", # ccdce9 in iso-8859-7
40
+ "\u039c\u03b1\u03ca": "May", # cce1fa in iso-8859-7
41
+ "\u039c\u03b1\u03b9": "May", # cce1e9 in iso-8859-7
42
+ "\u0399\u03bf\u03cd\u03bd": "Jun", # c9effded in iso-8859-7
43
+ "\u0399\u03bf\u03bd": "Jun", # c9efed in iso-8859-7
44
+ "\u0399\u03bf\u03cd\u03bb": "Jul", # c9effdeb in iso-8859-7
45
+ "\u0399\u03bf\u03bb": "Jul", # c9efeb in iso-8859-7
46
+ "\u0391\u03cd\u03b3": "Aug", # c1fde3 in iso-8859-7
47
+ "\u0391\u03c5\u03b3": "Aug", # c1f5e3 in iso-8859-7
48
+ "\u03a3\u03b5\u03c0": "Sep", # d3e5f0 in iso-8859-7
49
+ "\u039f\u03ba\u03c4": "Oct", # cfeaf4 in iso-8859-7
50
+ "\u039d\u03bf\u03ad": "Nov", # cdefdd in iso-8859-7
51
+ "\u039d\u03bf\u03b5": "Nov", # cdefe5 in iso-8859-7
52
+ "\u0394\u03b5\u03ba": "Dec", # c4e5ea in iso-8859-7
53
+ }
54
+
55
+ _greek_wdays = {
56
+ "\u039a\u03c5\u03c1": "Sun", # caf5f1 in iso-8859-7
57
+ "\u0394\u03b5\u03c5": "Mon", # c4e5f5 in iso-8859-7
58
+ "\u03a4\u03c1\u03b9": "Tue", # d4f1e9 in iso-8859-7
59
+ "\u03a4\u03b5\u03c4": "Wed", # d4e5f4 in iso-8859-7
60
+ "\u03a0\u03b5\u03bc": "Thu", # d0e5ec in iso-8859-7
61
+ "\u03a0\u03b1\u03c1": "Fri", # d0e1f1 in iso-8859-7
62
+ "\u03a3\u03b1\u03b2": "Sat", # d3e1e2 in iso-8859-7
63
+ }
64
+
65
+ _greek_date_format_re = re.compile(
66
+ r"([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)"
67
+ )
68
+
69
+
70
def _parse_date_greek(date_string):
    """Parse a string according to a Greek 8-bit date format.

    The Greek weekday and month names are translated through the module's
    lookup tables, the date is rebuilt in RFC822 form, and the RFC822 parser
    produces the result.

    :return: Whatever the RFC822 parser returns, or ``None`` when the string
        does not match the Greek date pattern.
    :raises KeyError: If the weekday or month name is not in the lookup
        tables.
    """
    match = _greek_date_format_re.match(date_string)
    if match is None:
        return
    greek_wday, day, greek_month, year, hour, minute, second, offset = match.groups()
    fields = {
        "wday": _greek_wdays[greek_wday],
        "month": _greek_months[greek_month],
        "day": day,
        "year": year,
        "hour": hour,
        "minute": minute,
        "second": second,
        "offset": offset,
    }
    template = (
        "%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(offset)s"
    )
    return _parse_date_rfc822(template % fields)