forkparser 2026.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,649 @@
1
+ # Character encoding routines
2
+ # Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
3
+ # Copyright 2002-2008 Mark Pilgrim
4
+ # All rights reserved.
5
+ #
6
+ # This file is a part of feedparser.
7
+ #
8
+ # Redistribution and use in source and binary forms, with or without modification,
9
+ # are permitted provided that the following conditions are met:
10
+ #
11
+ # * Redistributions of source code must retain the above copyright notice,
12
+ # this list of conditions and the following disclaimer.
13
+ # * Redistributions in binary form must reproduce the above copyright notice,
14
+ # this list of conditions and the following disclaimer in the documentation
15
+ # and/or other materials provided with the distribution.
16
+ #
17
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
18
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27
+ # POSSIBILITY OF SUCH DAMAGE.
28
+
29
+ from __future__ import annotations
30
+
31
+ import codecs
32
+ import io
33
+ import re
34
+ import typing
35
+
36
# Character detection is optional: cchardet is tried first, then chardet.
# When neither can be imported, lazy_chardet_encoding is set to None.
try:
    try:
        import cchardet as chardet  # type: ignore[import]
    except ImportError:
        import chardet  # type: ignore[no-redef]
except ImportError:
    lazy_chardet_encoding = None
else:

    def lazy_chardet_encoding(data):
        """Return the encoding name detected for *data*, or "" if none."""
        detection = chardet.detect(data)
        return detection["encoding"] or ""
47
+
48
+
49
+ from .exceptions import (
50
+ CharacterEncodingOverride,
51
+ CharacterEncodingUnknown,
52
+ FeedparserError,
53
+ NonXMLContentType,
54
+ )
55
+
56
+ # Each marker represents some of the characters of the opening XML
57
+ # processing instruction ('<?xm') in the specified encoding.
58
+ EBCDIC_MARKER = b"\x4c\x6f\xa7\x94"
59
+ UTF16BE_MARKER = b"\x00\x3c\x00\x3f"
60
+ UTF16LE_MARKER = b"\x3c\x00\x3f\x00"
61
+ UTF32BE_MARKER = b"\x00\x00\x00\x3c"
62
+ UTF32LE_MARKER = b"\x3c\x00\x00\x00"
63
+
64
+ ZERO_BYTES = b"\x00\x00"
65
+
66
+ # Match the opening XML declaration.
67
+ # Example: <?xml version="1.0" encoding="utf-8"?>
68
+ RE_XML_DECLARATION = re.compile(r"^<\?xml[^>]*?>")
69
+
70
+ # Capture the value of the XML processing instruction's encoding attribute.
71
+ # Example: <?xml version="1.0" encoding="utf-8"?>
72
+ RE_XML_PI_ENCODING = re.compile(rb'^<\?.*encoding=[\'"](.*?)[\'"].*\?>')
73
+
74
+
75
def parse_content_type(line: str) -> tuple[str, str]:
    """Parse an HTTP Content-Type header.

    The return value is a tuple of two strings:
    the MIME type, and the value of the "charset" parameter
    (an empty string if there is none). If "charset" appears
    more than once, the last occurrence wins.

    This is a custom replacement for Python's cgi.parse_header();
    the cgi module was removed in Python 3.13.
    """

    # str.split() always returns at least one element, so the MIME type
    # is whatever precedes the first ";" (possibly an empty string).
    mime_type, *parameters = line.split(";")

    charset_value = ""
    for parameter in parameters:
        key, _, value = parameter.partition("=")
        if key.strip().lower() == "charset":
            # Drop surrounding whitespace first, then any quote characters.
            charset_value = value.strip().strip("\"'")

    return mime_type.strip(), charset_value
97
+
98
+
99
def convert_to_utf8(
    http_headers: dict[str, str], data: bytes, result: dict[str, typing.Any]
) -> bytes:
    """Detect the character encoding of *data* and convert it to UTF-8.

    Args:
        http_headers: The response headers; the "content-type" entry
            is consulted.
        data: The raw document bytes.
        result: The result dictionary. "content-type" and "encoding" are
            always set; "bozo" and "bozo_exception" are set when the content
            type is not acceptable, when no encoding could decode the
            document, or when the encoding used differs from the declared one.

    Returns:
        The document re-encoded as UTF-8 (for non-JSON documents, with the
        XML declaration rewritten or prepended). If no candidate encoding
        could decode the document, the input is returned with any byte order
        mark removed.
    """

    # This is so much trickier than it sounds, it's not even funny.
    # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    # is application/xml, application/*+xml,
    # application/xml-external-parsed-entity, or application/xml-dtd,
    # the encoding given in the charset parameter of the HTTP Content-Type
    # takes precedence over the encoding given in the XML prefix within the
    # document, and defaults to 'utf-8' if neither are specified. But, if
    # the HTTP Content-Type is text/xml, text/*+xml, or
    # text/xml-external-parsed-entity, the encoding given in the XML prefix
    # within the document is ALWAYS IGNORED and only the encoding given in
    # the charset parameter of the HTTP Content-Type header should be
    # respected, and it defaults to 'us-ascii' if not specified.

    # Furthermore, discussion on the atom-syntax mailing list with the
    # author of RFC 3023 leads me to the conclusion that any document
    # served with a Content-Type of text/* and no charset parameter
    # must be treated as us-ascii. (We now do this.) And also that it
    # must always be flagged as non-well-formed. (We now do this too.)

    # If Content-Type is unspecified (input was local file or non-HTTP source)
    # or unrecognized (server just got it totally wrong), then go by the
    # encoding given in the XML prefix of the document and default to
    # 'iso-8859-1' as per the HTTP specification (RFC 2616).

    # Then, assuming we didn't find a character encoding in the HTTP headers
    # (and the HTTP Content-type allowed us to look in the body), we need
    # to sniff the first few bytes of the XML data and try to determine
    # whether the encoding is ASCII-compatible. Section F of the XML
    # specification shows the way here:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    # If the sniffed encoding is not ASCII-compatible, we need to make it
    # ASCII compatible so that we can sniff further into the XML declaration
    # to find the encoding attribute, which will tell us the true encoding.

    # Of course, none of this guarantees that we will be able to parse the
    # feed in the declared character encoding (assuming it was declared
    # correctly, which many are not).

    bom_encoding = ""
    xml_encoding = ""

    # Look at the first few bytes of the document to guess what
    # its encoding may be. We only need to decode enough of the
    # document that we can use an ASCII-compatible regular
    # expression to search for an XML encoding declaration.
    # The heuristic follows the XML specification, section F:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    # Check for BOMs first. The UTF-32 checks must precede the UTF-16
    # ones because the UTF-32LE BOM starts with the UTF-16LE BOM.
    if data[:4] == codecs.BOM_UTF32_BE:
        bom_encoding = "utf-32be"
        data = data[4:]
    elif data[:4] == codecs.BOM_UTF32_LE:
        bom_encoding = "utf-32le"
        data = data[4:]
    elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
        bom_encoding = "utf-16be"
        data = data[2:]
    elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
        bom_encoding = "utf-16le"
        data = data[2:]
    elif data[:3] == codecs.BOM_UTF8:
        bom_encoding = "utf-8"
        data = data[3:]
    # Check for the characters '<?xm' in several encodings.
    elif data[:4] == EBCDIC_MARKER:
        bom_encoding = "cp037"
    elif data[:4] == UTF16BE_MARKER:
        bom_encoding = "utf-16be"
    elif data[:4] == UTF16LE_MARKER:
        bom_encoding = "utf-16le"
    elif data[:4] == UTF32BE_MARKER:
        bom_encoding = "utf-32be"
    elif data[:4] == UTF32LE_MARKER:
        bom_encoding = "utf-32le"

    # Make the document ASCII-compatible (if a BOM/marker told us how)
    # so that the bytes regular expression below can be applied to it.
    tempdata = data
    try:
        if bom_encoding:
            tempdata = data.decode(bom_encoding).encode("utf-8")
    except UnicodeDecodeError:
        xml_encoding_match = None
    else:
        xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)

    if xml_encoding_match:
        try:
            xml_encoding = xml_encoding_match.group(1).decode("utf-8").lower()
        except UnicodeDecodeError:
            # Without a BOM, tempdata is the raw document, so the declared
            # name may contain arbitrary bytes. A name that is not valid
            # UTF-8 cannot be used; treat the document as undeclared
            # instead of letting the exception escape.
            xml_encoding = ""
        # Normalize the xml_encoding if necessary.
        if bom_encoding and (
            xml_encoding
            in (
                "u16",
                "utf-16",
                "utf16",
                "utf_16",
                "u32",
                "utf-32",
                "utf32",
                "utf_32",
                "iso-10646-ucs-2",
                "iso-10646-ucs-4",
                "csucs4",
                "csunicode",
                "ucs-2",
                "ucs-4",
            )
        ):
            xml_encoding = bom_encoding

    # Find the HTTP Content-Type and, hopefully, a character
    # encoding provided by the server. The Content-Type is used
    # to choose the "correct" encoding among the BOM encoding,
    # XML declaration encoding, and HTTP encoding, following the
    # heuristic defined in RFC 3023.
    http_content_type = http_headers.get("content-type") or ""
    http_content_type, http_encoding = parse_content_type(http_content_type)

    acceptable_content_type = False
    application_content_types = (
        "application/xml",
        "application/xml-dtd",
        "application/xml-external-parsed-entity",
    )
    text_content_types = ("text/xml", "text/xml-external-parsed-entity")
    json_content_types = ("application/feed+json", "application/json")
    is_json = False
    if http_content_type in application_content_types or (
        http_content_type.startswith("application/")
        and http_content_type.endswith("+xml")
    ):
        acceptable_content_type = True
        rfc3023_encoding = http_encoding or xml_encoding or "utf-8"
    elif http_content_type in text_content_types or (
        http_content_type.startswith("text/") and http_content_type.endswith("+xml")
    ):
        acceptable_content_type = True
        rfc3023_encoding = http_encoding or "us-ascii"
    elif http_content_type in json_content_types or (
        not http_content_type and data and data.lstrip().startswith(b"{")
    ):
        http_content_type = json_content_types[0]
        acceptable_content_type = True
        is_json = True
        rfc3023_encoding = http_encoding or "utf-8"  # RFC 7159, 8.1.
    elif http_content_type.startswith("text/"):
        rfc3023_encoding = http_encoding or "us-ascii"
    elif http_headers and "content-type" not in http_headers:
        rfc3023_encoding = xml_encoding or "iso-8859-1"
    else:
        rfc3023_encoding = xml_encoding or "utf-8"
    # gb18030 is a superset of gb2312, so always replace gb2312
    # with gb18030 for greater compatibility.
    if rfc3023_encoding.lower() == "gb2312":
        rfc3023_encoding = "gb18030"
    if xml_encoding.lower() == "gb2312":
        xml_encoding = "gb18030"

    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - rfc3023_encoding is the actual encoding, as per RFC 3023
    #   and a variety of other conflicting specifications
    error: FeedparserError | None = None

    if http_headers and (not acceptable_content_type):
        if "content-type" in http_headers:
            msg = "%s is not an accepted media type" % http_headers["content-type"]
        else:
            msg = "no Content-type specified"
        error = NonXMLContentType(msg)

    # determine character encoding
    known_encoding = False
    tried_encodings = []
    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
    for encoding_to_try in (
        rfc3023_encoding,
        xml_encoding,
        bom_encoding,
        lazy_chardet_encoding,
        "utf-8",
        "windows-1252",
        "iso-8859-2",
    ):
        # lazy_chardet_encoding is either a function (called only if the
        # earlier candidates failed) or None when chardet is unavailable.
        if callable(encoding_to_try):
            proposed_encoding = encoding_to_try(data)
        else:
            proposed_encoding = encoding_to_try
        if not proposed_encoding:
            continue
        if proposed_encoding in tried_encodings:
            continue
        tried_encodings.append(proposed_encoding)
        try:
            text = data.decode(proposed_encoding)
        except (ValueError, LookupError):
            # ValueError covers UnicodeDecodeError (wrong encoding) as well
            # as malformed encoding names taken from untrusted input, such
            # as a name containing a NUL character; LookupError covers
            # unknown encoding names.
            continue

        known_encoding = True
        if not is_json:
            # Update the encoding in the opening XML processing instruction.
            new_declaration = """<?xml version='1.0' encoding='utf-8'?>"""
            if RE_XML_DECLARATION.search(text):
                text = RE_XML_DECLARATION.sub(new_declaration, text)
            else:
                text = new_declaration + "\n" + text
        data = text.encode("utf-8")
        break

    # if still no luck, give up
    if not known_encoding:
        error = CharacterEncodingUnknown(
            "document encoding unknown, I tried "
            + "%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked"
            % (rfc3023_encoding, xml_encoding)
        )
        rfc3023_encoding = ""
    elif proposed_encoding != rfc3023_encoding:
        error = CharacterEncodingOverride(
            "document declared as %s, but parsed as %s"
            % (rfc3023_encoding, proposed_encoding)
        )
        rfc3023_encoding = proposed_encoding

    result["content-type"] = http_content_type  # for selecting the parser
    result["encoding"] = rfc3023_encoding
    if error:
        result["bozo"] = True
        result["bozo_exception"] = error
    return data
335
+
336
+
337
+ # How much to read from a binary file in order to detect encoding.
338
+ # In initial tests, 4k was enough for ~160 mostly-English feeds;
339
+ # 64k seems like a safe margin.
340
+ CONVERT_FILE_PREFIX_LEN = 2**16
341
+
342
+ # How much to read from a text file and use as a UTF-8 bytes prefix.
343
+ # Note that no encoding detection is needed in this case.
344
+ CONVERT_FILE_STR_PREFIX_LEN = 2**13
345
+
346
+ CONVERT_FILE_TEST_CHUNK_LEN = 2**16
347
+
348
+
349
def convert_file_to_utf8(
    http_headers, file, result, optimistic_encoding_detection=True
):
    """Stream-oriented counterpart of convert_to_utf8().

    Rather than loading the whole file in memory, return a factory for
    text streams that decode the file on the fly. This should use
    noticeably less memory, since the entire content is not (repeatedly)
    converted from bytes to str and back.

    Only a prefix of the file is used to detect the encoding.
    In rare cases the encoding detected for that prefix may be wrong;
    pass optimistic_encoding_detection=False to detect on the entire
    content (equivalent to a plain convert_to_utf8() call).

    Args:
        http_headers (dict): The response headers.
        file (IO[bytes] or IO[str]): A read()-able (binary) stream.
        result (dict): The result dictionary.
        optimistic_encoding_detection (bool):
            If true, use only a prefix of the file content to detect encoding.

    Returns:
        StreamFactory: a stream factory, with the detected encoding set, if any

    """
    # All detection is delegated to convert_to_utf8(): its logic is too
    # involved to be re-implemented reliably for streams. The implementation
    # could be swapped transparently later (it may not be worth it, though).

    # A text stream needs no encoding detection, but a bytes prefix is
    # still required so that functions can be run on it for their side
    # effects: convert_to_utf8() to sniff / set result['content-type'], and
    # replace_doctype() to extract safe_entities.
    stream_is_text = isinstance(file.read(0), str)
    if stream_is_text:
        text_prefix = file.read(CONVERT_FILE_STR_PREFIX_LEN)
        prefix = convert_to_utf8(http_headers, text_prefix.encode("utf-8"), result)
        result["encoding"] = "utf-8"
        return StreamFactory(prefix, file, "utf-8")

    if optimistic_encoding_detection:
        prefix = convert_file_prefix_to_utf8(http_headers, file, result)
        factory = StreamFactory(prefix, file, result.get("encoding"))

        # Check that the whole file decodes before handing out the factory;
        # otherwise fall back to convert_to_utf8().
        #
        # Skipping this check means feedparser.parse() may raise
        # UnicodeDecodeError instead of setting bozo_exception to
        # CharacterEncodingOverride, breaking the 6.x API.
        try:
            probe = factory.get_text_file()
        except MissingEncoding:
            return factory

        try:
            # consume the stream chunk by chunk to limit memory usage
            chunk = probe.read(CONVERT_FILE_TEST_CHUNK_LEN)
            while chunk:
                chunk = probe.read(CONVERT_FILE_TEST_CHUNK_LEN)
        except UnicodeDecodeError:
            # decoding failed somewhere; redo detection on everything
            file = factory.get_binary_file()
        else:
            return factory

    # this shouldn't increase memory usage if file is BytesIO,
    # since BytesIO does copy-on-write; https://bugs.python.org/issue22003
    everything = file.read()
    data = convert_to_utf8(http_headers, everything, result)

    # the converted data *is* the prefix; nothing is left in the stream
    return StreamFactory(data, io.BytesIO(b""), result.get("encoding"))
423
+
424
+
425
def convert_file_prefix_to_utf8(
    http_headers,
    file: typing.IO[bytes],
    result,
    *,
    prefix_len: int = CONVERT_FILE_PREFIX_LEN,
    read_to_ascii_len: int = 2**8,
) -> bytes:
    """Run convert_to_utf8() on just the beginning of a binary file.

    *result* is updated the way convert_to_utf8() would update it.

    Returns the converted prefix, as bytes.

    """
    # The difficulty: convert_to_utf8() detects the wrong encoding
    # when the prefix ends in the middle of a code point:
    #
    #   '😀'.encode('utf-8')      -> utf-8
    #   '😀'.encode('utf-8')[:-1] -> windows-1252 + bozo

    def rank(candidate):
        # Sort key for failed attempts; the highest-ranked one is kept.
        outcome = candidate[-1]

        problem = outcome.get("bozo_exception")
        if isinstance(problem, NonXMLContentType):
            severity = 20
        elif isinstance(problem, CharacterEncodingOverride):
            severity = 10
        else:
            severity = 0

        # among equal scores, utf- encodings beat anything else
        return severity, outcome.get("encoding").startswith("utf-")

    prefix = file.read(prefix_len - 1)

    # stopping right after an ASCII byte makes it more likely
    # that the prefix ends on a code point boundary
    prefix += read_to_after_ascii_byte(file, read_to_ascii_len)

    # run convert_to_utf8() on up to 4 successively longer prefixes,
    # so that one of them eventually ends on a code point boundary
    candidates = []
    for attempt in range(4):
        byte = file.read(1)

        # end of file reached, and at least one conversion already ran
        if attempt and not byte:
            break

        prefix += byte

        fake_result: typing.Any = {}
        converted_prefix = convert_to_utf8(http_headers, prefix, fake_result)

        # clean detection, no need to look any further
        if not fake_result.get("bozo"):
            break

        candidates.append((file.tell(), converted_prefix, fake_result))

    # every attempt was flagged as bozo, settle for the "best" one
    else:
        offset, converted_prefix, fake_result = sorted(candidates, key=rank)[-1]

        # put the file back where that candidate's prefix ended
        file.seek(offset)

    result.update(fake_result)
    return converted_prefix
499
+
500
+
501
def read_to_after_ascii_byte(file: typing.IO[bytes], max_len: int) -> bytes:
    """Read from *file* up to and including the next ASCII byte.

    At most *max_len* bytes are read, one at a time. The bytes read are
    returned if an ASCII byte or the end of the file was reached. Otherwise
    the file is moved back to where it started and b"" is returned.
    """
    start = file.tell()
    collected = b""
    reads_left = max_len

    while reads_left > 0:
        reads_left -= 1
        byte = file.read(1)

        if not byte:
            # end of file: hand back whatever was gathered so far
            return collected

        collected += byte

        if byte < b"\x80":
            # an ASCII character ends the search
            return collected

    # no ASCII character within max_len bytes; undo the reads
    file.seek(start)
    return b""
524
+
525
+
526
class MissingEncoding(io.UnsupportedOperation):
    """A text stream was requested, but no encoding is available for it."""
528
+
529
+
530
class StreamFactory:
    """Create decoded views of a binary stream whose encoding *may* be known.

    The get_{text,binary}_file() methods can be called more than once,
    provided the underlying stream is seekable.

    """

    def __init__(self, prefix: bytes, file, encoding=None):
        self.encoding = encoding
        self.prefix = prefix
        self.file = ResetFileWrapper(file)
        # False until the first stream is handed out; later requests
        # must rewind the underlying file first.
        self.should_reset = False

    def get_text_file(self, fallback_encoding=None, errors="strict"):
        """Return a text stream made of the prefix followed by the file.

        Raises MissingEncoding if neither the factory's encoding
        nor fallback_encoding provides an encoding.
        """
        encoding = self.encoding or fallback_encoding
        if encoding is None:
            raise MissingEncoding("cannot create text stream without encoding")

        underlying_is_text = isinstance(self.file.read(0), str)
        if underlying_is_text:
            head = self.prefix.decode(encoding)
            tail = self.file
        else:
            # the prefix is decoded as UTF-8, the rest with the codec reader
            head = self.prefix.decode("utf-8", errors)
            tail = codecs.getreader(encoding)(self.file, errors)
        stitched = PrefixFileWrapper(head, tail)

        self.reset()
        return stitched

    def get_binary_file(self):
        """Return a binary stream made of the prefix followed by the file.

        Raises io.UnsupportedOperation if the underlying stream is text.
        """
        underlying_is_text = isinstance(self.file.read(0), str)
        if underlying_is_text:
            raise io.UnsupportedOperation(
                "underlying stream is text, not binary"
            ) from None

        stitched = PrefixFileWrapper(self.prefix, self.file)

        self.reset()
        return stitched

    def get_file(self):
        """Return a text stream if an encoding is set, else a binary one."""
        try:
            return self.get_text_file()
        except MissingEncoding:
            return self.get_binary_file()

    def reset(self):
        """Rewind the underlying file, except on the very first call."""
        already_handed_out = self.should_reset
        if already_handed_out:
            self.file.reset()
        self.should_reset = True
581
+
582
+
583
class ResetFileWrapper:
    """Given a seekable file, allow reading its content again
    (from the position it had when wrapped) by calling reset().

    If the file cannot report its position, the wrapper can still be
    read from, but reset() raises io.UnsupportedOperation.

    """

    def __init__(self, file):
        self.file = file
        try:
            self.file_initial_offset = file.tell()
        except OSError:
            # tell() failed, so there is no position to go back to;
            # None marks the stream as not resettable.
            self.file_initial_offset = None

    def read(self, size=-1):
        """Read from the underlying file; *size* is passed through as-is."""
        return self.file.read(size)

    def reset(self):
        """Move the underlying file back to its initial position.

        Raises io.UnsupportedOperation if the underlying stream
        is not seekable.
        """
        if self.file_initial_offset is None:
            # Without this check, seek(None) would raise a TypeError
            # instead of the documented io.UnsupportedOperation.
            raise io.UnsupportedOperation("underlying stream is not seekable")
        self.file.seek(self.file_initial_offset)
602
+
603
+
604
class PrefixFileWrapper:
    """Stitch a (possibly modified) prefix and a file into a new file object.

    >>> file = io.StringIO('abcdef')
    >>> file.read(2)
    'ab'
    >>> wrapped = PrefixFileWrapper(file.read(2).upper(), file)
    >>> wrapped.read()
    'CDef'

    The prefix must be of the same type (str or bytes)
    as what the file's read() returns.

    """

    def __init__(self, prefix, file):
        self.prefix = prefix
        self.file = file
        # Total amount handed out so far (prefix first, then file content).
        self.offset = 0

    def read(self, size=-1):
        """Read up to *size* items; read everything if size is None or negative.

        The prefix is served first, followed by the underlying file.
        """
        # Like standard file objects, treat None as "no limit".
        if size is None:
            size = -1

        # An empty read yields an empty str or bytes of the right type.
        buffer = self.file.read(0)

        if self.offset < len(self.prefix):
            if size < 0:
                chunk = self.prefix[self.offset :]
            else:
                chunk = self.prefix[self.offset : self.offset + size]
                size -= len(chunk)
            buffer += chunk
            self.offset += len(chunk)

        # Keep reading until the request is satisfied or the file is
        # exhausted; a single read may return less than asked for.
        while True:
            chunk = self.file.read(size)
            if not chunk:
                break
            buffer += chunk
            self.offset += len(chunk)

            # an unlimited read returns everything in one go
            if size <= 0:
                break

            size -= len(chunk)

        return buffer

    def close(self):
        # do not touch the underlying stream
        pass
@@ -0,0 +1,55 @@
1
+ # Exceptions used throughout feedparser
2
+ # Copyright 2010-2025 Kurt McKee <contactme@kurtmckee.org>
3
+ # Copyright 2002-2008 Mark Pilgrim
4
+ # All rights reserved.
5
+ #
6
+ # This file is a part of feedparser.
7
+ #
8
+ # Redistribution and use in source and binary forms, with or without modification,
9
+ # are permitted provided that the following conditions are met:
10
+ #
11
+ # * Redistributions of source code must retain the above copyright notice,
12
+ # this list of conditions and the following disclaimer.
13
+ # * Redistributions in binary form must reproduce the above copyright notice,
14
+ # this list of conditions and the following disclaimer in the documentation
15
+ # and/or other materials provided with the distribution.
16
+ #
17
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
18
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27
+ # POSSIBILITY OF SUCH DAMAGE.
28
+
29
+ __all__ = [
30
+ "FeedparserError",
31
+ "CharacterEncodingOverride",
32
+ "CharacterEncodingUnknown",
33
+ "NonXMLContentType",
34
+ "UndeclaredNamespace",
35
+ ]
36
+
37
+
38
class FeedparserError(Exception):
    """Common base class of the character encoding and content type errors."""


class CharacterEncodingOverride(FeedparserError):
    """The document was parsed with an encoding other than the declared one."""


class CharacterEncodingUnknown(FeedparserError):
    """None of the candidate encodings could decode the document."""


class NonXMLContentType(FeedparserError):
    """The Content-Type is missing or is not an accepted media type."""


class UndeclaredNamespace(Exception):
    """Namespace-related error; note it does not derive from FeedparserError.

    NOTE(review): presumably raised when a feed uses a namespace prefix
    that was never declared -- confirm against the code that raises it.
    """