everythingtohtml 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- everythingtohtml/__about__.py +3 -0
- everythingtohtml/__init__.py +34 -0
- everythingtohtml/__main__.py +130 -0
- everythingtohtml/_base_converter.py +78 -0
- everythingtohtml/_everything_to_html.py +408 -0
- everythingtohtml/_exceptions.py +63 -0
- everythingtohtml/_html_builder.py +106 -0
- everythingtohtml/_merge.py +145 -0
- everythingtohtml/_stream_info.py +46 -0
- everythingtohtml/_text_utils.py +46 -0
- everythingtohtml/converters/__init__.py +45 -0
- everythingtohtml/converters/_csv_converter.py +73 -0
- everythingtohtml/converters/_doc_converter.py +385 -0
- everythingtohtml/converters/_docx_converter.py +105 -0
- everythingtohtml/converters/_eml_converter.py +104 -0
- everythingtohtml/converters/_epub_converter.py +131 -0
- everythingtohtml/converters/_html_converter.py +66 -0
- everythingtohtml/converters/_ipynb_converter.py +96 -0
- everythingtohtml/converters/_json_converter.py +78 -0
- everythingtohtml/converters/_markdown_converter.py +57 -0
- everythingtohtml/converters/_odt_converter.py +171 -0
- everythingtohtml/converters/_pdf_converter.py +204 -0
- everythingtohtml/converters/_plain_text_converter.py +64 -0
- everythingtohtml/converters/_pptx_converter.py +233 -0
- everythingtohtml/converters/_rss_converter.py +146 -0
- everythingtohtml/converters/_rst_converter.py +57 -0
- everythingtohtml/converters/_xlsx_converter.py +84 -0
- everythingtohtml/converters/_yaml_converter.py +56 -0
- everythingtohtml/py.typed +0 -0
- everythingtohtml-0.1.2.dist-info/METADATA +294 -0
- everythingtohtml-0.1.2.dist-info/RECORD +34 -0
- everythingtohtml-0.1.2.dist-info/WHEEL +4 -0
- everythingtohtml-0.1.2.dist-info/entry_points.txt +3 -0
- everythingtohtml-0.1.2.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""everythingtohtml — convert (almost) any file into clean, self-contained HTML.
|
|
2
|
+
|
|
3
|
+
Quick start
|
|
4
|
+
-----------
|
|
5
|
+
>>> from everythingtohtml import EverythingToHtml
|
|
6
|
+
>>> eth = EverythingToHtml()
|
|
7
|
+
>>> result = eth.convert("document.docx") # doctest: +SKIP
|
|
8
|
+
>>> open("document.html", "w", encoding="utf-8").write(result.html) # doctest: +SKIP
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from .__about__ import __version__
|
|
14
|
+
from ._base_converter import DocumentConverter, DocumentConverterResult
|
|
15
|
+
from ._everything_to_html import EverythingToHtml
|
|
16
|
+
from ._exceptions import (
|
|
17
|
+
EverythingToHtmlException,
|
|
18
|
+
FileConversionException,
|
|
19
|
+
MissingDependencyException,
|
|
20
|
+
UnsupportedFormatException,
|
|
21
|
+
)
|
|
22
|
+
from ._stream_info import StreamInfo
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"__version__",
|
|
26
|
+
"EverythingToHtml",
|
|
27
|
+
"DocumentConverter",
|
|
28
|
+
"DocumentConverterResult",
|
|
29
|
+
"StreamInfo",
|
|
30
|
+
"EverythingToHtmlException",
|
|
31
|
+
"FileConversionException",
|
|
32
|
+
"MissingDependencyException",
|
|
33
|
+
"UnsupportedFormatException",
|
|
34
|
+
]
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Command-line interface for everythingtohtml.
|
|
2
|
+
|
|
3
|
+
Examples
|
|
4
|
+
--------
|
|
5
|
+
everythingtohtml report.docx > report.html
|
|
6
|
+
everythingtohtml data.csv -o data.html
|
|
7
|
+
everythingtohtml https://example.com/feed.rss
|
|
8
|
+
cat notes.md | everythingtohtml --extension .md > notes.html
|
|
9
|
+
|
|
10
|
+
# merge several documents into one HTML (great for Word files)
|
|
11
|
+
everythingtohtml a.docx b.docx c.doc -o merged.html
|
|
12
|
+
everythingtohtml old.docx new.docx --columns -o side-by-side.html
|
|
13
|
+
|
|
14
|
+
# compare two documents with a highlighted line diff
|
|
15
|
+
everythingtohtml old.docx new.docx --diff -o changes.html
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import argparse
|
|
21
|
+
import sys
|
|
22
|
+
from collections.abc import Sequence
|
|
23
|
+
|
|
24
|
+
from .__about__ import __version__
|
|
25
|
+
from ._everything_to_html import EverythingToHtml
|
|
26
|
+
from ._exceptions import EverythingToHtmlException
|
|
27
|
+
from ._stream_info import StreamInfo
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser used by the ``everythingtohtml`` command.

    The parser takes zero or more positional ``sources`` followed by a set of
    value-carrying hint options, a set of boolean switches, and ``--version``.
    Arguments are registered in a fixed order so the generated ``--help`` text
    stays stable.
    """
    cli = argparse.ArgumentParser(
        prog="everythingtohtml",
        description="Convert files, URLs, or stdin into clean, self-contained HTML.",
    )
    cli.add_argument(
        "sources",
        nargs="*",
        help="One or more paths/URLs. Two or more are merged; omit (or '-') for stdin.",
    )

    # Options that carry a value: (flag spellings, help text).
    value_options = (
        (("-o", "--output"), "Write HTML to this file instead of stdout."),
        (
            ("-e", "--extension"),
            "Hint the source extension (e.g. '.md') when reading from stdin.",
        ),
        (("-m", "--mimetype"), "Hint the source mimetype when it cannot be inferred."),
        (("--charset",), "Hint the source character encoding (e.g. 'utf-8')."),
    )
    for spellings, description in value_options:
        cli.add_argument(*spellings, help=description)

    # Plain on/off switches: (flag, help text).
    switches = (
        ("--columns", "When merging multiple sources, lay them out side by side."),
        ("--diff", "Compare exactly two sources with a highlighted line diff."),
        ("--no-toc", "When merging, omit the table of contents."),
        (
            "--use-plugins",
            "Load third-party converter plugins registered via entry points.",
        ),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)

    cli.add_argument(
        "--version",
        action="version",
        version=f"everythingtohtml {__version__}",
    )
    return cli
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def main(argv: Sequence[str] | None = None) -> int:
    """Run the command-line interface and return a process exit status.

    Parameters
    ----------
    argv:
        Argument list to parse; ``None`` lets :mod:`argparse` read
        ``sys.argv[1:]``.

    Returns
    -------
    int
        ``0`` on success, ``1`` when conversion or reading/writing fails. Usage
        errors are reported by :mod:`argparse`, which exits with status 2.

    Mode selection: ``--diff`` compares exactly two sources, two or more
    sources are merged, a single source other than ``"-"`` is converted on its
    own, and otherwise the document is read from standard input. The
    ``--extension``/``--mimetype``/``--charset`` hints are only forwarded in
    the two single-document modes.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)
    sources = args.sources

    # Validate the invocation before doing any work; parser.error() prints the
    # usage message and raises SystemExit(2).
    if args.diff and len(sources) != 2:
        parser.error("--diff requires exactly two sources")

    stream_info = StreamInfo(
        extension=args.extension,
        mimetype=args.mimetype,
        charset=args.charset,
    )
    engine = EverythingToHtml(enable_plugins=args.use_plugins)

    try:
        if args.diff:
            result = engine.diff(sources[0], sources[1])
        elif len(sources) > 1:
            result = engine.merge(
                sources,
                layout="columns" if args.columns else "stacked",
                include_toc=not args.no_toc,
            )
        elif len(sources) == 1 and sources[0] != "-":
            result = engine.convert(sources[0], stream_info=stream_info)
        else:
            data = sys.stdin.buffer.read()
            result = engine.convert(data, stream_info=stream_info)
    except (EverythingToHtmlException, OSError) as exc:
        # OSError covers missing/unreadable local files and urllib's URLError,
        # so those are reported as a one-line message instead of a traceback.
        print(f"error: {exc}", file=sys.stderr)
        return 1

    try:
        if args.output:
            with open(args.output, "w", encoding="utf-8") as handle:
                handle.write(result.html)
        else:
            # Only real text streams offer reconfigure(); a replaced stdout
            # (for example an io.StringIO) is written to as-is.
            reconfigure = getattr(sys.stdout, "reconfigure", None)
            if reconfigure is not None:
                reconfigure(encoding="utf-8")
            sys.stdout.write(result.html)
    except OSError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 1

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""The converter contract and the result object every converter returns."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, BinaryIO
|
|
6
|
+
|
|
7
|
+
from ._stream_info import StreamInfo
|
|
8
|
+
|
|
9
|
+
__all__ = ["DocumentConverter", "DocumentConverterResult"]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DocumentConverterResult:
    """Value object returned by every successful conversion.

    ``html`` holds the converted document. ``text_content`` reads and writes
    the very same value, so callers written for markdown-style converters can
    use this class unchanged, and ``str(result)`` also returns the HTML.
    """

    def __init__(
        self,
        html: str,
        *,
        title: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> None:
        """Store the HTML payload together with an optional title and metadata.

        A missing or empty ``metadata`` mapping is replaced by a new empty dict.
        """
        self.html = html
        self.title = title
        self.metadata = metadata if metadata else {}

    def _get_text_content(self) -> str:
        return self.html

    def _set_text_content(self, value: str) -> None:
        self.html = value

    text_content = property(
        _get_text_content,
        _set_text_content,
        doc="Alias for :attr:`html` (drop-in compatibility with markdown converters).",
    )

    def __str__(self) -> str:
        """Return the HTML payload."""
        return self.html
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class DocumentConverter:
    """Interface that every converter implements.

    Subclasses override two methods:

    * :meth:`accepts` — report whether this converter can handle the stream.
      It should be cheap and free of side effects; if it reads from the stream
      to sniff the content it must ``seek(0)`` back afterwards.
    * :meth:`convert` — perform the conversion and return the HTML result. It
      may read the stream freely because the engine rewinds the stream before
      calling each converter.

    Both methods raise :class:`NotImplementedError` in this base class.
    """

    # Converters with a LOWER priority value are tried first: format-specific
    # converters use the low value, greedy catch-alls (such as plain text) use
    # the high one.
    PRIORITY_SPECIFIC_FILE_FORMAT = 0.0
    PRIORITY_GENERIC_FILE_FORMAT = 10.0

    # Default for subclasses that do not set their own priority.
    priority: float = PRIORITY_SPECIFIC_FILE_FORMAT

    def accepts(self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any) -> bool:
        """Return ``True`` when this converter can handle ``file_stream``."""
        raise NotImplementedError()

    def convert(
        self, file_stream: BinaryIO, stream_info: StreamInfo, **kwargs: Any
    ) -> DocumentConverterResult:
        """Convert ``file_stream`` and return the resulting HTML document."""
        raise NotImplementedError()
|
|
@@ -0,0 +1,408 @@
|
|
|
1
|
+
"""The :class:`EverythingToHtml` engine: detection, dispatch, and plugins."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
import mimetypes
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
9
|
+
from collections.abc import Iterable
|
|
10
|
+
from typing import Any, BinaryIO
|
|
11
|
+
from urllib.parse import unquote, urlparse
|
|
12
|
+
from urllib.request import url2pathname
|
|
13
|
+
|
|
14
|
+
from ._base_converter import DocumentConverter, DocumentConverterResult
|
|
15
|
+
from ._exceptions import (
|
|
16
|
+
FailedConversionAttempt,
|
|
17
|
+
FileConversionException,
|
|
18
|
+
MissingDependencyException,
|
|
19
|
+
UnsupportedFormatException,
|
|
20
|
+
)
|
|
21
|
+
from ._stream_info import StreamInfo
|
|
22
|
+
from .converters import (
|
|
23
|
+
CsvConverter,
|
|
24
|
+
DocConverter,
|
|
25
|
+
DocxConverter,
|
|
26
|
+
EmlConverter,
|
|
27
|
+
EpubConverter,
|
|
28
|
+
HtmlConverter,
|
|
29
|
+
IpynbConverter,
|
|
30
|
+
JsonConverter,
|
|
31
|
+
MarkdownConverter,
|
|
32
|
+
OdtConverter,
|
|
33
|
+
PdfConverter,
|
|
34
|
+
PlainTextConverter,
|
|
35
|
+
PptxConverter,
|
|
36
|
+
RssConverter,
|
|
37
|
+
RstConverter,
|
|
38
|
+
XlsxConverter,
|
|
39
|
+
YamlConverter,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
__all__ = ["EverythingToHtml"]
|
|
43
|
+
|
|
44
|
+
_PLUGIN_ENTRY_POINT_GROUP = "everythingtohtml.converter"
|
|
45
|
+
|
|
46
|
+
# Built-in converters, registered in dependency-light first order. Specific
|
|
47
|
+
# formats are registered after the catch-all so that, on ties, they win.
|
|
48
|
+
_BUILTIN_CONVERTERS: tuple[type[DocumentConverter], ...] = (
|
|
49
|
+
PlainTextConverter,
|
|
50
|
+
HtmlConverter,
|
|
51
|
+
MarkdownConverter,
|
|
52
|
+
CsvConverter,
|
|
53
|
+
JsonConverter,
|
|
54
|
+
YamlConverter,
|
|
55
|
+
IpynbConverter,
|
|
56
|
+
RssConverter,
|
|
57
|
+
RstConverter,
|
|
58
|
+
EmlConverter,
|
|
59
|
+
EpubConverter,
|
|
60
|
+
OdtConverter,
|
|
61
|
+
DocConverter,
|
|
62
|
+
DocxConverter,
|
|
63
|
+
XlsxConverter,
|
|
64
|
+
PptxConverter,
|
|
65
|
+
PdfConverter,
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class _Registration:
    """Registry entry pairing a converter with its ordering keys.

    ``priority`` is copied from the converter when the entry is created, and
    ``index`` records the order in which the converter was registered.
    """

    __slots__ = ("converter", "priority", "index")

    def __init__(self, converter: DocumentConverter, index: int) -> None:
        self.converter, self.priority, self.index = (
            converter,
            converter.priority,
            index,
        )
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class EverythingToHtml:
    """Convert files, streams, and URIs into clean, self-contained HTML.

    The engine keeps a registry of converters. For each input it fills in
    missing type hints from the leading bytes, then offers the input to the
    converters in priority order; the first converter that accepts the input
    and converts it without raising provides the result.

    Example
    -------
    >>> from everythingtohtml import EverythingToHtml
    >>> eth = EverythingToHtml()
    >>> result = eth.convert("README.md")
    >>> result.html.startswith("<!DOCTYPE html>")
    True
    """

    def __init__(
        self,
        *,
        enable_builtins: bool = True,
        enable_plugins: bool = False,
    ) -> None:
        """Create an engine.

        Parameters
        ----------
        enable_builtins:
            Register one instance of every built-in converter.
        enable_plugins:
            Additionally call :meth:`load_plugins`.
        """
        self._registrations: list[_Registration] = []
        self._next_index = 0
        if enable_builtins:
            for converter_cls in _BUILTIN_CONVERTERS:
                self.register_converter(converter_cls())
        if enable_plugins:
            self.load_plugins()

    # -- registration ------------------------------------------------------

    def register_converter(self, converter: DocumentConverter) -> None:
        """Add a converter. Later registrations win ties against earlier ones."""
        self._registrations.append(_Registration(converter, self._next_index))
        self._next_index += 1

    def load_plugins(self) -> None:
        """Discover and register third-party converters via entry points.

        A plugin advertises a callable under the ``everythingtohtml.converter``
        entry-point group; the callable receives this engine and registers its
        own converters. See ``docs/PLUGINS.md``.
        """
        from importlib.metadata import entry_points

        for ep in entry_points(group=_PLUGIN_ENTRY_POINT_GROUP):
            register = ep.load()
            register(self)

    @property
    def _ordered(self) -> list[DocumentConverter]:
        """Registered converters in the order they should be tried."""
        # Lowest priority number first; for equal priority, most-recently
        # registered first (so plugins and later converters can override).
        ordered = sorted(self._registrations, key=lambda r: (r.priority, -r.index))
        return [r.converter for r in ordered]

    # -- public conversion API --------------------------------------------

    def convert(
        self,
        source: str | os.PathLike[str] | bytes | BinaryIO,
        *,
        stream_info: StreamInfo | None = None,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert ``source`` to HTML.

        ``source`` may be a local path, a URI (``http``, ``https``, ``file``,
        ``data``), raw ``bytes``, or an already-open binary stream.
        ``stream_info`` carries optional type hints and ``kwargs`` are passed
        through to the converters.
        """
        if isinstance(source, (str, os.PathLike)):
            text = os.fspath(source)
            if _looks_like_uri(text):
                return self.convert_uri(text, stream_info=stream_info, **kwargs)
            return self.convert_local(text, stream_info=stream_info, **kwargs)
        if isinstance(source, (bytes, bytearray)):
            return self.convert_stream(io.BytesIO(bytes(source)), stream_info=stream_info, **kwargs)
        # Anything else is treated as an open binary stream.
        return self.convert_stream(source, stream_info=stream_info, **kwargs)

    def convert_local(
        self,
        path: str | os.PathLike[str],
        *,
        stream_info: StreamInfo | None = None,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert a file on the local filesystem.

        Hints derived from the path (file name, extension, guessed MIME type)
        are combined with the caller's ``stream_info`` before dispatch.
        """
        path = os.fspath(path)
        base = StreamInfo(
            local_path=path,
            filename=os.path.basename(path),
            extension=_ext(path),
            mimetype=mimetypes.guess_type(path)[0],
        )
        # NOTE(review): presumably the caller's explicit hints override the
        # path-derived ones — confirm against StreamInfo.copy_and_update.
        guessed = base.copy_and_update(stream_info)
        with open(path, "rb") as stream:
            return self._convert(stream, guessed, **kwargs)

    def convert_stream(
        self,
        stream: BinaryIO,
        *,
        stream_info: StreamInfo | None = None,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert an open binary stream.

        The stream must be seekable; if it is not, it is buffered into memory.
        Converters always read a seekable stream from offset 0.
        """
        if not stream.seekable():
            stream = io.BytesIO(stream.read())
        guessed = stream_info or StreamInfo()
        return self._convert(stream, guessed, **kwargs)

    def convert_uri(
        self,
        uri: str,
        *,
        stream_info: StreamInfo | None = None,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert content addressed by a URI (``http(s)``, ``file``, ``data``).

        Raises
        ------
        UnsupportedFormatException
            If the URI scheme is none of the supported ones.
        """
        uri = uri.strip()
        parsed = urlparse(uri)
        scheme = parsed.scheme.lower()

        if scheme == "file":
            local = url2pathname(parsed.path)
            return self.convert_local(local, stream_info=stream_info, **kwargs)
        if scheme == "data":
            stream, info = _read_data_uri(uri)
            return self._convert(stream, info.copy_and_update(stream_info), **kwargs)
        if scheme in ("http", "https"):
            stream, info = _fetch_http(uri)
            return self._convert(stream, info.copy_and_update(stream_info), **kwargs)
        raise UnsupportedFormatException(f"Unsupported URI scheme: {scheme!r}")

    # -- multi-document composition ---------------------------------------

    def merge(
        self,
        sources: Iterable[str | os.PathLike[str] | bytes | BinaryIO],
        *,
        title: str | None = None,
        layout: str = "stacked",
        include_toc: bool = True,
        labels: list[str] | None = None,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert several sources and combine them into one HTML document.

        ``layout="stacked"`` (default) renders them top-to-bottom with a table of
        contents; ``layout="columns"`` places them side by side for comparison.
        Each document's detected title is used as its heading unless ``labels`` is
        given; when ``labels`` is shorter than ``sources`` the remaining
        documents fall back to their detected title or source name.

        Raises
        ------
        ValueError
            If ``sources`` is empty.
        """
        from ._merge import body_fragment, build_merged_html

        items: list[tuple[str, str]] = []
        for index, source in enumerate(sources):
            result = self.convert(source, **kwargs)
            label = (
                labels[index]
                if labels and index < len(labels)
                else (result.title or _source_label(source, index))
            )
            items.append((label, body_fragment(result.html)))

        if not items:
            raise ValueError("merge() requires at least one source")

        html = build_merged_html(items, title=title, layout=layout, include_toc=include_toc)
        return DocumentConverterResult(html, title=title)

    def diff(
        self,
        left: str | os.PathLike[str] | bytes | BinaryIO,
        right: str | os.PathLike[str] | bytes | BinaryIO,
        *,
        title: str | None = None,
        left_label: str | None = None,
        right_label: str | None = None,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Render a side-by-side line diff of two sources' text content.

        Each side is labelled with its explicit label, else the converted
        document's title, else a name derived from the source. The result's
        title defaults to ``"Document comparison"``.
        """
        from ._merge import build_diff_html, plain_text_lines

        left_result = self.convert(left, **kwargs)
        right_result = self.convert(right, **kwargs)

        html = build_diff_html(
            left_label or left_result.title or _source_label(left, 0),
            plain_text_lines(left_result.html),
            right_label or right_result.title or _source_label(right, 1),
            plain_text_lines(right_result.html),
            title=title,
        )
        return DocumentConverterResult(html, title=title or "Document comparison")

    # -- internals ---------------------------------------------------------

    def _convert(
        self,
        stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Dispatch ``stream`` to the first converter that can handle it.

        Raises
        ------
        MissingDependencyException
            If no converter succeeded and at least one reported a missing
            optional dependency (the first such error is re-raised).
        FileConversionException
            If converters accepted the input but all of them failed.
        UnsupportedFormatException
            If no converter accepted the input.
        """
        enriched = self._sniff(stream, stream_info)

        attempts: list[FailedConversionAttempt] = []
        missing_dependency: MissingDependencyException | None = None
        for converter in self._ordered:
            stream.seek(0)
            try:
                if not converter.accepts(stream, enriched, **kwargs):
                    continue
            except Exception:  # a misbehaving accepts() should not abort dispatch
                continue

            # accepts() may have read from the stream; rewind for convert().
            stream.seek(0)
            try:
                return converter.convert(stream, enriched, **kwargs)
            except MissingDependencyException as exc:
                # Remember the first actionable "install this extra" signal so it
                # can be surfaced if nothing else handles the input.
                if missing_dependency is None:
                    missing_dependency = exc
                attempts.append(FailedConversionAttempt(converter, sys.exc_info()))
            except Exception:
                attempts.append(FailedConversionAttempt(converter, sys.exc_info()))

        # A missing optional dependency is the most useful thing to report, so it
        # takes precedence over the generic "all converters failed" message.
        if missing_dependency is not None:
            raise missing_dependency
        if attempts:
            raise FileConversionException(attempts=attempts)
        raise UnsupportedFormatException(
            "No converter could handle this input "
            f"(extension={enriched.extension!r}, mimetype={enriched.mimetype!r}). "
            "It may need an optional extra; see 'pip install everythingtohtml[all]'."
        )

    @staticmethod
    def _sniff(stream: BinaryIO, stream_info: StreamInfo) -> StreamInfo:
        """Fill in missing extension/mimetype hints from magic bytes.

        Returns ``stream_info`` unchanged when both hints are already present,
        when the stream is empty, or when ``puremagic`` is unavailable, fails,
        or finds no match. The stream position is restored before returning.
        """
        if stream_info.extension and stream_info.mimetype:
            return stream_info

        # Converters are always handed the stream rewound to offset 0, so the
        # magic bytes must be read from offset 0 as well — not from wherever
        # the caller happened to leave the stream.
        pos = stream.tell()
        try:
            stream.seek(0)
            header = stream.read(2048)
        finally:
            stream.seek(pos)
        if not header:
            return stream_info

        try:
            import puremagic

            matches = puremagic.magic_string(header)
        except Exception:
            # Sniffing is best effort: a missing library or an unidentifiable
            # header simply leaves the hints as they were.
            return stream_info

        if not matches:
            return stream_info

        best = matches[0]
        updates: dict[str, object] = {}
        if not stream_info.extension and getattr(best, "extension", None):
            updates["extension"] = best.extension
        if not stream_info.mimetype and getattr(best, "mime_type", None):
            updates["mimetype"] = best.mime_type
        return stream_info.copy_and_update(**updates) if updates else stream_info
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
# -- module-level helpers --------------------------------------------------
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def _ext(path: str) -> str | None:
    """Return the extension of ``path`` (leading dot included), or ``None`` if it has none."""
    _, suffix = os.path.splitext(path)
    return suffix if suffix else None
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def _source_label(source: object, index: int) -> str:
    """Build a display label for a source shown in merged or diffed output.

    Path-like and string sources are labelled with their final path component,
    or with the whole string when that component is empty. Every other kind of
    source gets a 1-based ``"Document N"`` placeholder.
    """
    if not isinstance(source, (str, os.PathLike)):
        return f"Document {index + 1}"
    name = os.path.basename(os.fspath(source))
    return name if name else os.fspath(source)
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def _looks_like_uri(text: str) -> bool:
    """Tell a URI apart from a local filesystem path.

    ``text`` counts as a URI when it parses with a scheme of at least two
    characters. A one-character "scheme" is a Windows drive letter
    (``C:\\...``), so such strings are treated as paths. No check is made that
    the scheme is supported; :meth:`EverythingToHtml.convert_uri` rejects the
    ones it cannot handle.
    """
    return len(urlparse(text).scheme) > 1
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def _read_data_uri(uri: str) -> tuple[BinaryIO, StreamInfo]:
    """Decode a ``data:`` URI into an in-memory stream plus type hints.

    The URI has the form ``data:[<mediatype>][;base64],<data>``. The part
    before the first comma supplies the MIME type and an optional ``charset``
    parameter; the part after it is the payload.

    Returns
    -------
    tuple
        A ``BytesIO`` with the decoded payload and a ``StreamInfo`` holding the
        MIME type, charset, and an extension guessed from the MIME type (each
        ``None`` when unavailable).

    Raises
    ------
    binascii.Error
        If the payload is declared as base64 but cannot be decoded.
    """
    import base64
    from urllib.parse import unquote_to_bytes

    header, _, data = uri[len("data:") :].partition(",")
    params = [part.strip() for part in header.split(";")]

    # MIME types and the "base64" marker are case-insensitive.
    mimetype = params[0].lower() or None
    is_base64 = len(params) > 1 and params[-1].lower() == "base64"

    charset = None
    for part in params:
        if part.lower().startswith("charset="):
            charset = part[len("charset=") :]

    # Percent-escapes stand for raw bytes, so decode straight to bytes rather
    # than through str (which would mangle non-UTF-8 sequences such as %FF).
    payload = unquote_to_bytes(data)
    if is_base64:
        # Tolerate embedded whitespace and missing "=" padding, both of which
        # are common in hand-written or line-wrapped data URIs.
        compact = b"".join(payload.split())
        compact += b"=" * (-len(compact) % 4)
        raw = base64.b64decode(compact)
    else:
        raw = payload

    ext = mimetypes.guess_extension(mimetype) if mimetype else None
    return io.BytesIO(raw), StreamInfo(mimetype=mimetype, charset=charset, extension=ext)
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def _fetch_http(uri: str) -> tuple[BinaryIO, StreamInfo]:
    """Download ``uri`` over HTTP(S) and describe the response.

    The whole body is read into memory. The returned ``StreamInfo`` carries
    the response's MIME type and charset (when the server sent them), plus the
    extension, file name and URL taken from the final URL as reported by the
    response.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the request fails.
    """
    from urllib.request import Request, urlopen

    request = Request(uri, headers={"User-Agent": "everythingtohtml"})
    with urlopen(request, timeout=30) as response:  # noqa: S310 - scheme checked by caller
        raw = response.read()
        headers = response.headers
        # get_content_type() falls back to "text/plain" when the header is
        # missing; report "unknown" instead so the type can still be sniffed
        # from the content rather than being pinned to a made-up value.
        content_type = headers.get_content_type() if headers.get("Content-Type") else None
        charset = headers.get_content_charset()
        final_url = response.geturl()

    path = urlparse(final_url).path
    filename = os.path.basename(path) or None
    return io.BytesIO(raw), StreamInfo(
        mimetype=content_type,
        charset=charset,
        extension=_ext(path),
        filename=filename,
        url=final_url,
    )
|