everythingtohtml 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. everythingtohtml/__about__.py +3 -0
  2. everythingtohtml/__init__.py +34 -0
  3. everythingtohtml/__main__.py +130 -0
  4. everythingtohtml/_base_converter.py +78 -0
  5. everythingtohtml/_everything_to_html.py +408 -0
  6. everythingtohtml/_exceptions.py +63 -0
  7. everythingtohtml/_html_builder.py +106 -0
  8. everythingtohtml/_merge.py +145 -0
  9. everythingtohtml/_stream_info.py +46 -0
  10. everythingtohtml/_text_utils.py +46 -0
  11. everythingtohtml/converters/__init__.py +45 -0
  12. everythingtohtml/converters/_csv_converter.py +73 -0
  13. everythingtohtml/converters/_doc_converter.py +385 -0
  14. everythingtohtml/converters/_docx_converter.py +105 -0
  15. everythingtohtml/converters/_eml_converter.py +104 -0
  16. everythingtohtml/converters/_epub_converter.py +131 -0
  17. everythingtohtml/converters/_html_converter.py +66 -0
  18. everythingtohtml/converters/_ipynb_converter.py +96 -0
  19. everythingtohtml/converters/_json_converter.py +78 -0
  20. everythingtohtml/converters/_markdown_converter.py +57 -0
  21. everythingtohtml/converters/_odt_converter.py +171 -0
  22. everythingtohtml/converters/_pdf_converter.py +204 -0
  23. everythingtohtml/converters/_plain_text_converter.py +64 -0
  24. everythingtohtml/converters/_pptx_converter.py +233 -0
  25. everythingtohtml/converters/_rss_converter.py +146 -0
  26. everythingtohtml/converters/_rst_converter.py +57 -0
  27. everythingtohtml/converters/_xlsx_converter.py +84 -0
  28. everythingtohtml/converters/_yaml_converter.py +56 -0
  29. everythingtohtml/py.typed +0 -0
  30. everythingtohtml-0.1.2.dist-info/METADATA +294 -0
  31. everythingtohtml-0.1.2.dist-info/RECORD +34 -0
  32. everythingtohtml-0.1.2.dist-info/WHEEL +4 -0
  33. everythingtohtml-0.1.2.dist-info/entry_points.txt +3 -0
  34. everythingtohtml-0.1.2.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,3 @@
1
+ """Version information for everythingtohtml."""
2
+
3
+ __version__ = "0.1.2"
@@ -0,0 +1,34 @@
1
+ """everythingtohtml — convert (almost) any file into clean, self-contained HTML.
2
+
3
+ Quick start
4
+ -----------
5
+ >>> from everythingtohtml import EverythingToHtml
6
+ >>> eth = EverythingToHtml()
7
+ >>> result = eth.convert("document.docx") # doctest: +SKIP
8
+ >>> open("document.html", "w", encoding="utf-8").write(result.html) # doctest: +SKIP
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from .__about__ import __version__
14
+ from ._base_converter import DocumentConverter, DocumentConverterResult
15
+ from ._everything_to_html import EverythingToHtml
16
+ from ._exceptions import (
17
+ EverythingToHtmlException,
18
+ FileConversionException,
19
+ MissingDependencyException,
20
+ UnsupportedFormatException,
21
+ )
22
+ from ._stream_info import StreamInfo
23
+
24
+ __all__ = [
25
+ "__version__",
26
+ "EverythingToHtml",
27
+ "DocumentConverter",
28
+ "DocumentConverterResult",
29
+ "StreamInfo",
30
+ "EverythingToHtmlException",
31
+ "FileConversionException",
32
+ "MissingDependencyException",
33
+ "UnsupportedFormatException",
34
+ ]
@@ -0,0 +1,130 @@
1
+ """Command-line interface for everythingtohtml.
2
+
3
+ Examples
4
+ --------
5
+ everythingtohtml report.docx > report.html
6
+ everythingtohtml data.csv -o data.html
7
+ everythingtohtml https://example.com/feed.rss
8
+ cat notes.md | everythingtohtml --extension .md > notes.html
9
+
10
+ # merge several documents into one HTML (great for Word files)
11
+ everythingtohtml a.docx b.docx c.doc -o merged.html
12
+ everythingtohtml old.docx new.docx --columns -o side-by-side.html
13
+
14
+ # compare two documents with a highlighted line diff
15
+ everythingtohtml old.docx new.docx --diff -o changes.html
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import sys
22
+ from collections.abc import Sequence
23
+
24
+ from .__about__ import __version__
25
+ from ._everything_to_html import EverythingToHtml
26
+ from ._exceptions import EverythingToHtmlException
27
+ from ._stream_info import StreamInfo
28
+
29
+
30
+ def _build_parser() -> argparse.ArgumentParser:
31
+ parser = argparse.ArgumentParser(
32
+ prog="everythingtohtml",
33
+ description="Convert files, URLs, or stdin into clean, self-contained HTML.",
34
+ )
35
+ parser.add_argument(
36
+ "sources",
37
+ nargs="*",
38
+ help="One or more paths/URLs. Two or more are merged; omit (or '-') for stdin.",
39
+ )
40
+ parser.add_argument(
41
+ "-o",
42
+ "--output",
43
+ help="Write HTML to this file instead of stdout.",
44
+ )
45
+ parser.add_argument(
46
+ "-e",
47
+ "--extension",
48
+ help="Hint the source extension (e.g. '.md') when reading from stdin.",
49
+ )
50
+ parser.add_argument(
51
+ "-m",
52
+ "--mimetype",
53
+ help="Hint the source mimetype when it cannot be inferred.",
54
+ )
55
+ parser.add_argument(
56
+ "--charset",
57
+ help="Hint the source character encoding (e.g. 'utf-8').",
58
+ )
59
+ parser.add_argument(
60
+ "--columns",
61
+ action="store_true",
62
+ help="When merging multiple sources, lay them out side by side.",
63
+ )
64
+ parser.add_argument(
65
+ "--diff",
66
+ action="store_true",
67
+ help="Compare exactly two sources with a highlighted line diff.",
68
+ )
69
+ parser.add_argument(
70
+ "--no-toc",
71
+ action="store_true",
72
+ help="When merging, omit the table of contents.",
73
+ )
74
+ parser.add_argument(
75
+ "--use-plugins",
76
+ action="store_true",
77
+ help="Load third-party converter plugins registered via entry points.",
78
+ )
79
+ parser.add_argument(
80
+ "--version",
81
+ action="version",
82
+ version=f"everythingtohtml {__version__}",
83
+ )
84
+ return parser
85
+
86
+
87
+ def main(argv: Sequence[str] | None = None) -> int:
88
+ parser = _build_parser()
89
+ args = parser.parse_args(argv)
90
+
91
+ stream_info = StreamInfo(
92
+ extension=args.extension,
93
+ mimetype=args.mimetype,
94
+ charset=args.charset,
95
+ )
96
+ engine = EverythingToHtml(enable_plugins=args.use_plugins)
97
+ sources = args.sources
98
+
99
+ try:
100
+ if args.diff:
101
+ if len(sources) != 2:
102
+ parser.error("--diff requires exactly two sources")
103
+ result = engine.diff(sources[0], sources[1])
104
+ elif len(sources) > 1:
105
+ result = engine.merge(
106
+ sources,
107
+ layout="columns" if args.columns else "stacked",
108
+ include_toc=not args.no_toc,
109
+ )
110
+ elif len(sources) == 1 and sources[0] != "-":
111
+ result = engine.convert(sources[0], stream_info=stream_info)
112
+ else:
113
+ data = sys.stdin.buffer.read()
114
+ result = engine.convert(data, stream_info=stream_info)
115
+ except EverythingToHtmlException as exc:
116
+ print(f"error: {exc}", file=sys.stderr)
117
+ return 1
118
+
119
+ if args.output:
120
+ with open(args.output, "w", encoding="utf-8") as handle:
121
+ handle.write(result.html)
122
+ else:
123
+ sys.stdout.reconfigure(encoding="utf-8") # type: ignore[union-attr]
124
+ sys.stdout.write(result.html)
125
+
126
+ return 0
127
+
128
+
129
+ if __name__ == "__main__":
130
+ raise SystemExit(main())
@@ -0,0 +1,78 @@
1
+ """The converter contract and the result object every converter returns."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, BinaryIO
6
+
7
+ from ._stream_info import StreamInfo
8
+
9
+ __all__ = ["DocumentConverter", "DocumentConverterResult"]
10
+
11
+
12
class DocumentConverterResult:
    """The output of a successful conversion.

    The canonical payload is :attr:`html`. ``text_content`` is provided as an
    alias so code written against markdown-style converters keeps working when
    pointed at this library, and ``str(result)`` yields the HTML directly.

    Parameters
    ----------
    html:
        The converted HTML document.
    title:
        Optional document title detected by the converter.
    metadata:
        Optional mapping of extra information. The mapping is stored as-is
        (not copied); a fresh empty dict is created when ``None`` is passed.
    """

    def __init__(
        self,
        html: str,
        *,
        title: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> None:
        self.html = html
        self.title = title
        # Test for None explicitly: ``metadata or {}`` would also replace a
        # caller-supplied *empty* dict, breaking callers that fill it later.
        self.metadata = {} if metadata is None else metadata

    @property
    def text_content(self) -> str:
        """Alias for :attr:`html` (drop-in compatibility with markdown converters)."""
        return self.html

    @text_content.setter
    def text_content(self, value: str) -> None:
        self.html = value

    def __str__(self) -> str:
        return self.html
42
+
43
+
44
class DocumentConverter:
    """Contract implemented by every converter.

    Two hooks make up the contract:

    * :meth:`accepts` — a cheap, side-effect-free check: *can* this converter
      handle the stream?
    * :meth:`convert` — perform the conversion and return the HTML result.

    ``accepts`` must leave the stream usable: read whatever is needed to sniff
    the format, then ``seek(0)`` back. ``convert`` may read freely, because the
    engine rewinds the stream before handing it to each converter.

    Both hooks raise :class:`NotImplementedError` here; subclasses override them.
    """

    # Converters are tried in ascending order of ``priority``: format-specific
    # converters use the low value so they run first, while greedy catch-alls
    # (plain text) use the high value so they run last.
    PRIORITY_SPECIFIC_FILE_FORMAT = 0.0
    PRIORITY_GENERIC_FILE_FORMAT = 10.0

    priority: float = PRIORITY_SPECIFIC_FILE_FORMAT

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return whether this converter can handle ``file_stream``."""
        raise NotImplementedError

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert ``file_stream`` and return the resulting HTML document."""
        raise NotImplementedError
@@ -0,0 +1,408 @@
1
+ """The :class:`EverythingToHtml` engine: detection, dispatch, and plugins."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import mimetypes
7
+ import os
8
+ import sys
9
+ from collections.abc import Iterable
10
+ from typing import Any, BinaryIO
11
+ from urllib.parse import unquote, urlparse
12
+ from urllib.request import url2pathname
13
+
14
+ from ._base_converter import DocumentConverter, DocumentConverterResult
15
+ from ._exceptions import (
16
+ FailedConversionAttempt,
17
+ FileConversionException,
18
+ MissingDependencyException,
19
+ UnsupportedFormatException,
20
+ )
21
+ from ._stream_info import StreamInfo
22
+ from .converters import (
23
+ CsvConverter,
24
+ DocConverter,
25
+ DocxConverter,
26
+ EmlConverter,
27
+ EpubConverter,
28
+ HtmlConverter,
29
+ IpynbConverter,
30
+ JsonConverter,
31
+ MarkdownConverter,
32
+ OdtConverter,
33
+ PdfConverter,
34
+ PlainTextConverter,
35
+ PptxConverter,
36
+ RssConverter,
37
+ RstConverter,
38
+ XlsxConverter,
39
+ YamlConverter,
40
+ )
41
+
42
+ __all__ = ["EverythingToHtml"]
43
+
44
+ _PLUGIN_ENTRY_POINT_GROUP = "everythingtohtml.converter"
45
+
46
+ # Built-in converters, registered lightest-dependency first. Specific formats
47
+ # are registered after the plain-text catch-all so that they win priority ties.
48
+ _BUILTIN_CONVERTERS: tuple[type[DocumentConverter], ...] = (
49
+ PlainTextConverter,
50
+ HtmlConverter,
51
+ MarkdownConverter,
52
+ CsvConverter,
53
+ JsonConverter,
54
+ YamlConverter,
55
+ IpynbConverter,
56
+ RssConverter,
57
+ RstConverter,
58
+ EmlConverter,
59
+ EpubConverter,
60
+ OdtConverter,
61
+ DocConverter,
62
+ DocxConverter,
63
+ XlsxConverter,
64
+ PptxConverter,
65
+ PdfConverter,
66
+ )
67
+
68
+
69
class _Registration:
    """Registry entry pairing a converter with its ordering keys.

    ``priority`` is copied from the converter when the entry is created, and
    ``index`` records the registration sequence number supplied by the engine.
    """

    __slots__ = ("converter", "priority", "index")

    def __init__(self, converter: DocumentConverter, index: int) -> None:
        self.converter, self.index = converter, index
        self.priority = converter.priority
76
+
77
+
78
class EverythingToHtml:
    """Convert files, streams, and URIs into clean, self-contained HTML.

    Converters live in an internal registry. For each input they are tried in
    ascending ``priority`` order (most recently registered first on ties); the
    first converter that accepts the input and converts it without raising
    produces the result.

    Example
    -------
    >>> from everythingtohtml import EverythingToHtml
    >>> eth = EverythingToHtml()
    >>> result = eth.convert("README.md")  # doctest: +SKIP
    >>> result.html.startswith("<!DOCTYPE html>")  # doctest: +SKIP
    True
    """

    def __init__(
        self,
        *,
        enable_builtins: bool = True,
        enable_plugins: bool = False,
    ) -> None:
        """Create an engine.

        Parameters
        ----------
        enable_builtins:
            Register one instance of every built-in converter.
        enable_plugins:
            Also call :meth:`load_plugins` to register third-party converters.
        """
        self._registrations: list[_Registration] = []
        self._next_index = 0
        if enable_builtins:
            for converter_cls in _BUILTIN_CONVERTERS:
                self.register_converter(converter_cls())
        if enable_plugins:
            self.load_plugins()

    # -- registration ------------------------------------------------------

    def register_converter(self, converter: DocumentConverter) -> None:
        """Add a converter. Later registrations win ties against earlier ones."""
        self._registrations.append(_Registration(converter, self._next_index))
        self._next_index += 1

    def load_plugins(self) -> None:
        """Discover and register third-party converters via entry points.

        A plugin advertises a callable under the ``everythingtohtml.converter``
        entry-point group; the callable receives this engine and registers its
        own converters. See ``docs/PLUGINS.md``.

        Exceptions raised while loading or running a plugin propagate to the
        caller.
        """
        from importlib.metadata import entry_points

        for ep in entry_points(group=_PLUGIN_ENTRY_POINT_GROUP):
            register = ep.load()
            register(self)

    @property
    def _ordered(self) -> list[DocumentConverter]:
        """Converters in dispatch order, recomputed on every access."""
        # Lowest priority number first; for equal priority, most-recently
        # registered first (so plugins and later converters can override).
        ordered = sorted(self._registrations, key=lambda r: (r.priority, -r.index))
        return [r.converter for r in ordered]

    # -- public conversion API --------------------------------------------

    def convert(
        self,
        source: str | os.PathLike[str] | bytes | BinaryIO,
        *,
        stream_info: StreamInfo | None = None,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert ``source`` to HTML.

        ``source`` may be a local path, a URI (``http``, ``https``, ``file``,
        ``data``), raw ``bytes``, or an already-open binary stream.
        ``stream_info`` carries optional hints, and extra keyword arguments
        are forwarded to the converters.
        """
        if isinstance(source, (str, os.PathLike)):
            text = os.fspath(source)
            if _looks_like_uri(text):
                return self.convert_uri(text, stream_info=stream_info, **kwargs)
            return self.convert_local(text, stream_info=stream_info, **kwargs)
        if isinstance(source, (bytes, bytearray)):
            buffer = io.BytesIO(bytes(source))
            return self.convert_stream(buffer, stream_info=stream_info, **kwargs)
        # Anything else is treated as an open binary stream.
        return self.convert_stream(source, stream_info=stream_info, **kwargs)

    def convert_local(
        self,
        path: str | os.PathLike[str],
        *,
        stream_info: StreamInfo | None = None,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert a file on the local filesystem.

        Hints derived from the path (filename, extension, guessed mimetype)
        form the base, and ``stream_info`` is applied on top of them via
        ``copy_and_update``. Errors from opening the file (``OSError``)
        propagate.
        """
        path = os.fspath(path)
        base = StreamInfo(
            local_path=path,
            filename=os.path.basename(path),
            extension=_ext(path),
            mimetype=mimetypes.guess_type(path)[0],
        )
        guessed = base.copy_and_update(stream_info)
        with open(path, "rb") as stream:
            return self._convert(stream, guessed, **kwargs)

    def convert_stream(
        self,
        stream: BinaryIO,
        *,
        stream_info: StreamInfo | None = None,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert an open binary stream.

        The stream must be seekable; if it is not, it is buffered into memory.
        Conversion always reads from the beginning of the stream.
        """
        if not stream.seekable():
            stream = io.BytesIO(stream.read())
        guessed = stream_info or StreamInfo()
        return self._convert(stream, guessed, **kwargs)

    def convert_uri(
        self,
        uri: str,
        *,
        stream_info: StreamInfo | None = None,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert content addressed by a URI (``http(s)``, ``file``, ``data``).

        Raises
        ------
        UnsupportedFormatException
            If the URI scheme is not one of the supported ones.
        """
        uri = uri.strip()
        parsed = urlparse(uri)
        scheme = parsed.scheme.lower()

        if scheme == "file":
            local = url2pathname(parsed.path)
            return self.convert_local(local, stream_info=stream_info, **kwargs)
        if scheme == "data":
            stream, info = _read_data_uri(uri)
            return self._convert(stream, info.copy_and_update(stream_info), **kwargs)
        if scheme in ("http", "https"):
            stream, info = _fetch_http(uri)
            return self._convert(stream, info.copy_and_update(stream_info), **kwargs)
        raise UnsupportedFormatException(f"Unsupported URI scheme: {scheme!r}")

    # -- multi-document composition ---------------------------------------

    def merge(
        self,
        sources: Iterable[str | os.PathLike[str] | bytes | BinaryIO],
        *,
        title: str | None = None,
        layout: str = "stacked",
        include_toc: bool = True,
        labels: list[str] | None = None,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert several sources and combine them into one HTML document.

        ``layout="stacked"`` (default) renders them top-to-bottom with a table of
        contents; ``layout="columns"`` places them side by side for comparison.
        Each document's detected title is used as its heading unless ``labels`` is
        given. Great for collating or comparing a set of Word documents.

        Raises
        ------
        ValueError
            If ``sources`` is empty.
        """
        from ._merge import body_fragment, build_merged_html

        items: list[tuple[str, str]] = []
        for index, source in enumerate(sources):
            result = self.convert(source, **kwargs)
            # An explicit label wins; otherwise fall back to the detected
            # title, then to a label derived from the source itself.
            if labels and index < len(labels):
                label = labels[index]
            else:
                label = result.title or _source_label(source, index)
            items.append((label, body_fragment(result.html)))

        if not items:
            raise ValueError("merge() requires at least one source")

        html = build_merged_html(items, title=title, layout=layout, include_toc=include_toc)
        return DocumentConverterResult(html, title=title)

    def diff(
        self,
        left: str | os.PathLike[str] | bytes | BinaryIO,
        right: str | os.PathLike[str] | bytes | BinaryIO,
        *,
        title: str | None = None,
        left_label: str | None = None,
        right_label: str | None = None,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Render a side-by-side line diff of two sources' text content.

        Each side's label is the explicit ``*_label`` if given, else the
        detected title, else a label derived from the source. The result's
        title defaults to ``"Document comparison"``.
        """
        from ._merge import build_diff_html, plain_text_lines

        left_result = self.convert(left, **kwargs)
        right_result = self.convert(right, **kwargs)

        html = build_diff_html(
            left_label or left_result.title or _source_label(left, 0),
            plain_text_lines(left_result.html),
            right_label or right_result.title or _source_label(right, 1),
            plain_text_lines(right_result.html),
            title=title,
        )
        return DocumentConverterResult(html, title=title or "Document comparison")

    # -- internals ---------------------------------------------------------

    def _convert(
        self,
        stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Dispatch ``stream`` to the first converter that handles it.

        Raises
        ------
        MissingDependencyException
            If at least one accepting converter failed for lack of an optional
            dependency and no other converter succeeded (the first such error
            is re-raised).
        FileConversionException
            If converters accepted the input but all of them failed.
        UnsupportedFormatException
            If no converter accepted the input.
        """
        enriched = self._sniff(stream, stream_info)

        attempts: list[FailedConversionAttempt] = []
        missing_dependency: MissingDependencyException | None = None
        for converter in self._ordered:
            stream.seek(0)
            try:
                if not converter.accepts(stream, enriched, **kwargs):
                    continue
            except Exception:  # a misbehaving accepts() should not abort dispatch
                continue

            # accepts() may have moved the cursor; convert() starts from 0.
            stream.seek(0)
            try:
                return converter.convert(stream, enriched, **kwargs)
            except MissingDependencyException as exc:
                # Remember the first actionable "install this extra" signal so it
                # can be surfaced if nothing else handles the input.
                if missing_dependency is None:
                    missing_dependency = exc
                attempts.append(FailedConversionAttempt(converter, sys.exc_info()))
            except Exception:
                attempts.append(FailedConversionAttempt(converter, sys.exc_info()))

        # A missing optional dependency is the most useful thing to report, so it
        # takes precedence over the generic "all converters failed" message.
        if missing_dependency is not None:
            raise missing_dependency
        if attempts:
            raise FileConversionException(attempts=attempts)
        raise UnsupportedFormatException(
            "No converter could handle this input "
            f"(extension={enriched.extension!r}, mimetype={enriched.mimetype!r}). "
            "It may need an optional extra; see 'pip install everythingtohtml[all]'."
        )

    @staticmethod
    def _sniff(stream: BinaryIO, stream_info: StreamInfo) -> StreamInfo:
        """Fill in missing extension/mimetype hints from magic bytes.

        Returns ``stream_info`` unchanged when both hints are already present,
        when the stream is empty, or when ``puremagic`` is unavailable, fails,
        or finds no match. The stream position is restored before returning.
        """
        if stream_info.extension and stream_info.mimetype:
            return stream_info

        # Magic numbers sit at the start of the content, and dispatch always
        # rewinds to offset 0, so sniff from offset 0 as well rather than from
        # wherever the caller happened to leave the cursor.
        pos = stream.tell()
        stream.seek(0)
        try:
            header = stream.read(2048)
        finally:
            stream.seek(pos)
        if not header:
            return stream_info

        try:
            import puremagic

            matches = puremagic.magic_string(header)
        except Exception:
            # Best effort: a missing library or an unrecognised header simply
            # means no extra hints.
            return stream_info

        if not matches:
            return stream_info

        best = matches[0]
        updates: dict[str, object] = {}
        if not stream_info.extension and getattr(best, "extension", None):
            updates["extension"] = best.extension
        if not stream_info.mimetype and getattr(best, "mime_type", None):
            updates["mimetype"] = best.mime_type
        return stream_info.copy_and_update(**updates) if updates else stream_info
347
+
348
+
349
+ # -- module-level helpers --------------------------------------------------
350
+
351
+
352
def _ext(path: str) -> str | None:
    """Return the extension of ``path`` (leading dot included), or ``None`` if it has none."""
    _, suffix = os.path.splitext(path)
    return suffix if suffix else None
355
+
356
+
357
def _source_label(source: object, index: int) -> str:
    """Build a human-friendly label for a source in merged/diffed output.

    Path-like sources are labelled with their final path component, or with
    the whole path string when that component is empty. Any other source gets
    a one-based positional label.
    """
    if not isinstance(source, (str, os.PathLike)):
        return f"Document {index + 1}"
    name = os.path.basename(os.fspath(source))
    return name if name else os.fspath(source)
362
+
363
+
364
def _looks_like_uri(text: str) -> bool:
    """Report whether ``text`` carries a URI scheme instead of being a local path.

    A one-character "scheme" is taken to be a Windows drive letter
    (``C:\\...``) and so does *not* count as a URI. Any longer scheme counts,
    supported or not; :meth:`EverythingToHtml.convert_uri` is the place where
    unsupported schemes are rejected.
    """
    # An absent scheme has length 0 and a drive letter has length 1, so a
    # single length test covers both non-URI cases.
    return len(urlparse(text).scheme) > 1
373
+
374
+
375
def _read_data_uri(uri: str) -> tuple[BinaryIO, StreamInfo]:
    """Decode a ``data:`` URI into an in-memory stream plus stream hints.

    The part before the first comma is the header
    (``<mimetype>[;charset=...][;base64]``) and the rest is the payload.
    The payload is base64-decoded when the header ends with ``;base64``;
    otherwise its percent-escapes are decoded to raw bytes.

    Returns
    -------
    tuple
        A ``BytesIO`` with the decoded payload and a ``StreamInfo`` carrying
        the mimetype, charset, and an extension guessed from the mimetype
        (each ``None`` when unavailable).
    """
    import base64
    from urllib.parse import unquote_to_bytes

    header, _, data = uri[len("data:") :].partition(",")
    # The ";base64" marker and the "charset" parameter name are matched
    # case-insensitively.
    is_base64 = header.lower().endswith(";base64")
    mimetype = header.split(";", 1)[0] or None
    charset = None
    for part in header.split(";"):
        if part.lower().startswith("charset="):
            charset = part[len("charset=") :]

    # Percent-escapes denote raw octets. Decoding them to text and re-encoding
    # as UTF-8 would replace every byte sequence that is not valid UTF-8
    # (binary payloads, other charsets), so decode straight to bytes.
    raw = base64.b64decode(data) if is_base64 else unquote_to_bytes(data)
    ext = mimetypes.guess_extension(mimetype) if mimetype else None
    return io.BytesIO(raw), StreamInfo(mimetype=mimetype, charset=charset, extension=ext)
388
+
389
+
390
def _fetch_http(uri: str) -> tuple[BinaryIO, StreamInfo]:
    """Download ``uri`` and return its body with hints taken from the response.

    The whole body is read into memory. The returned ``StreamInfo`` carries
    the response's content type and charset (when the server declared them),
    plus the filename and extension derived from the path of the final URL as
    reported by the response.

    Errors raised by ``urlopen`` propagate to the caller.
    """
    from urllib.request import Request, urlopen

    request = Request(uri, headers={"User-Agent": "everythingtohtml"})
    with urlopen(request, timeout=30) as response:  # noqa: S310 - scheme checked by caller
        raw = response.read()
        headers = response.headers
        # get_content_type() never returns an empty value: without a
        # Content-Type header it reports the default "text/plain". Report
        # None instead so a missing header is not mistaken for a declared type.
        content_type = headers.get_content_type() if headers.get("Content-Type") else None
        charset = headers.get_content_charset()
        final_url = response.geturl()

    path = urlparse(final_url).path
    filename = os.path.basename(path) or None
    return io.BytesIO(raw), StreamInfo(
        mimetype=content_type,
        charset=charset,
        extension=_ext(path),
        filename=filename,
        url=final_url,
    )