everythingtohtml 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. everythingtohtml/__about__.py +3 -0
  2. everythingtohtml/__init__.py +34 -0
  3. everythingtohtml/__main__.py +130 -0
  4. everythingtohtml/_base_converter.py +78 -0
  5. everythingtohtml/_everything_to_html.py +408 -0
  6. everythingtohtml/_exceptions.py +63 -0
  7. everythingtohtml/_html_builder.py +106 -0
  8. everythingtohtml/_merge.py +145 -0
  9. everythingtohtml/_stream_info.py +46 -0
  10. everythingtohtml/_text_utils.py +46 -0
  11. everythingtohtml/converters/__init__.py +45 -0
  12. everythingtohtml/converters/_csv_converter.py +73 -0
  13. everythingtohtml/converters/_doc_converter.py +385 -0
  14. everythingtohtml/converters/_docx_converter.py +105 -0
  15. everythingtohtml/converters/_eml_converter.py +104 -0
  16. everythingtohtml/converters/_epub_converter.py +131 -0
  17. everythingtohtml/converters/_html_converter.py +66 -0
  18. everythingtohtml/converters/_ipynb_converter.py +96 -0
  19. everythingtohtml/converters/_json_converter.py +78 -0
  20. everythingtohtml/converters/_markdown_converter.py +57 -0
  21. everythingtohtml/converters/_odt_converter.py +171 -0
  22. everythingtohtml/converters/_pdf_converter.py +204 -0
  23. everythingtohtml/converters/_plain_text_converter.py +64 -0
  24. everythingtohtml/converters/_pptx_converter.py +233 -0
  25. everythingtohtml/converters/_rss_converter.py +146 -0
  26. everythingtohtml/converters/_rst_converter.py +57 -0
  27. everythingtohtml/converters/_xlsx_converter.py +84 -0
  28. everythingtohtml/converters/_yaml_converter.py +56 -0
  29. everythingtohtml/py.typed +0 -0
  30. everythingtohtml-0.1.2.dist-info/METADATA +294 -0
  31. everythingtohtml-0.1.2.dist-info/RECORD +34 -0
  32. everythingtohtml-0.1.2.dist-info/WHEEL +4 -0
  33. everythingtohtml-0.1.2.dist-info/entry_points.txt +3 -0
  34. everythingtohtml-0.1.2.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,63 @@
1
+ """Exception hierarchy for everythingtohtml.
2
+
3
+ The layout mirrors the ergonomics of well-behaved conversion libraries: a single
4
+ base class so callers can ``except EverythingToHtmlException`` and catch anything
5
+ the library raises on purpose.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ __all__ = [
11
+ "EverythingToHtmlException",
12
+ "UnsupportedFormatException",
13
+ "FileConversionException",
14
+ "MissingDependencyException",
15
+ ]
16
+
17
+
18
class EverythingToHtmlException(Exception):
    """Root of the everythingtohtml exception hierarchy.

    Every exception the library raises on purpose derives from this class, so
    ``except EverythingToHtmlException`` catches all of them at once.
    """
20
+
21
+
22
class MissingDependencyException(EverythingToHtmlException):
    """Signal that a converter's optional dependency is not installed.

    The exception message is expected to name the extra the user has to
    install, for example::

        pip install everythingtohtml[docx]
    """
29
+
30
+
31
class UnsupportedFormatException(EverythingToHtmlException):
    """Signal that none of the registered converters can handle the input."""
33
+
34
+
35
class FailedConversionAttempt:
    """Record of one converter that accepted the input and then failed.

    :class:`FileConversionException` carries a list of these records so that
    every failed attempt can be reported, not just the last traceback.

    Attributes are stored exactly as passed:

    * ``converter`` -- the converter that failed.
    * ``exc_info`` -- optional failure details (presumably a
      ``sys.exc_info()``-style value; the class itself does not inspect it).
    """

    def __init__(self, converter: object, exc_info: object | None = None) -> None:
        self.converter, self.exc_info = converter, exc_info
46
+
47
+
48
class FileConversionException(EverythingToHtmlException):
    """Signal that every converter which accepted the input failed on it.

    ``attempts`` holds the :class:`FailedConversionAttempt` records passed in
    (an empty list when none were given). When no ``message`` is supplied, one
    is generated: it lists the class names of the failed converters, or falls
    back to a generic sentence when there are no attempts.
    """

    def __init__(
        self,
        message: str | None = None,
        attempts: list[FailedConversionAttempt] | None = None,
    ) -> None:
        failed = attempts or []
        self.attempts = failed

        if message is not None:
            text = message
        elif failed:
            converter_names = [type(attempt.converter).__name__ for attempt in failed]
            joined = ", ".join(converter_names)
            text = f"All converters that accepted the input failed: {joined}"
        else:
            text = "File conversion failed."

        super().__init__(text)
@@ -0,0 +1,106 @@
1
+ """Helpers for assembling clean, self-contained HTML documents.
2
+
3
+ Every converter produces an HTML *fragment* (the meaningful body markup). This
4
+ module wraps fragments into a full document with a small, readable default
5
+ stylesheet so the output looks decent on its own while staying easy to restyle.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from html import escape as _escape
11
+
12
+ __all__ = ["escape_text", "escape_attr", "wrap_document", "DEFAULT_STYLESHEET"]
13
+
14
+
15
# Default stylesheet embedded by ``wrap_document``.
#
# The dark-scheme overrides are deliberately placed LAST. A media query adds no
# specificity, so when the dark block preceded the light rules for ``a``,
# ``table th``, ``table td`` and ``pre``/``code`` (all of equal specificity),
# the later light rules won and dark mode rendered light text on light
# backgrounds in table headers and code blocks.
DEFAULT_STYLESHEET = """\
:root { color-scheme: light dark; }
* { box-sizing: border-box; }
body {
  font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica,
    Arial, sans-serif;
  line-height: 1.6;
  max-width: 50rem;
  margin: 2rem auto;
  padding: 0 1rem;
  color: #1a1a1a;
  background: #ffffff;
}
h1, h2, h3 { line-height: 1.25; }
a { color: #0b5cff; }
/* Tables: scroll horizontally when wider than the page instead of overflowing. */
table {
  border-collapse: collapse;
  margin: 1rem 0;
  display: block;
  max-width: 100%;
  overflow-x: auto;
  font-size: 0.95em;
}
table td, table th {
  border: 1px solid #d4d4dc;
  padding: 0.4rem 0.65rem;
  text-align: left;
  vertical-align: top;
}
table th { background: #f5f5f5; font-weight: 600; }
table tr:nth-child(even) td { background: rgba(0,0,0,0.025); }
/* Office converters wrap cell content in <p>; keep cells compact. */
table td > p, table th > p { margin: 0.1rem 0; }
table td > p:only-child, table th > p:only-child { margin: 0; }
pre {
  background: #f5f5f5; padding: 1rem; overflow-x: auto; border-radius: 6px;
}
code { background: #f5f5f5; padding: 0.1rem 0.3rem; border-radius: 4px; }
pre code { padding: 0; background: none; }
blockquote {
  border-left: 4px solid #ddd; margin: 1rem 0; padding: 0.2rem 1rem; color: #666;
}
img { max-width: 100%; height: auto; }
/* Dark scheme: keep after the light rules so these overrides win the cascade. */
@media (prefers-color-scheme: dark) {
  body { color: #e6e6e6; background: #121212; }
  a { color: #6ea8fe; }
  table th { background: #1e1e1e; }
  table td, table th { border-color: #333; }
  table tr:nth-child(even) td { background: rgba(255,255,255,0.03); }
  pre, code { background: #1e1e1e; }
  pre code { background: none; }
}
"""
68
+
69
+
70
def escape_text(text: str) -> str:
    """Return *text* with ``&``, ``<`` and ``>`` escaped for use as element content.

    Quote characters are left untouched.
    """
    escaped = _escape(text, False)
    return escaped
73
+
74
+
75
def escape_attr(text: str) -> str:
    """Return *text* escaped for use inside a double-quoted HTML attribute.

    In addition to ``&``, ``<`` and ``>``, both quote characters are escaped.
    """
    escaped = _escape(text, True)
    return escaped
78
+
79
+
80
def wrap_document(
    body: str,
    *,
    title: str | None = None,
    lang: str = "en",
    include_style: bool = True,
    extra_head: str = "",
) -> str:
    """Build a complete HTML5 document around an HTML *fragment*.

    Args:
        body: Markup placed verbatim inside ``<body>``.
        title: Document title; escaped before use. A missing or empty title
            becomes ``"Converted Document"``.
        lang: Value of the ``lang`` attribute on ``<html>``; attribute-escaped.
        include_style: When true, ``DEFAULT_STYLESHEET`` is embedded in a
            ``<style>`` element.
        extra_head: Markup inserted verbatim at the end of ``<head>``, after
            the default style.

    Returns:
        The full document as a string, ending with a newline.
    """
    if title:
        safe_title = escape_text(title)
    else:
        safe_title = "Converted Document"

    style = ""
    if include_style:
        style = f"<style>\n{DEFAULT_STYLESHEET}</style>\n"

    head_lines = [
        "<!DOCTYPE html>",
        f'<html lang="{escape_attr(lang)}">',
        "<head>",
        '<meta charset="utf-8">',
        '<meta name="viewport" content="width=device-width, initial-scale=1">',
        '<meta name="generator" content="everythingtohtml">',
        f"<title>{safe_title}</title>",
    ]
    prologue = "\n".join(head_lines) + "\n"
    # body and extra_head are inserted as-is: callers pass ready-made markup.
    epilogue = "</head>\n" + "<body>\n" + f"{body}\n" + "</body>\n" + "</html>\n"
    return prologue + style + f"{extra_head}" + epilogue
@@ -0,0 +1,145 @@
1
+ """Combine and compare multiple converted documents in a single HTML page.
2
+
3
+ These helpers operate on the *output* of converters, so they work for any format
4
+ the library supports — merge a folder of Word docs, stack a PDF next to its
5
+ Markdown source, or diff two revisions of a spec.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import difflib
11
+
12
+ from ._html_builder import escape_attr, escape_text, wrap_document
13
+
14
+ __all__ = ["body_fragment", "plain_text_lines", "build_merged_html", "build_diff_html"]
15
+
16
+
17
+ _MERGE_STYLE = """
18
+ .merge-toc { border: 1px solid #ddd; border-radius: 6px; padding: 0.5rem 1rem; margin-bottom: 2rem; }
19
+ .merge-toc ol { margin: 0.3rem 0; }
20
+ .merge-doc { margin-bottom: 3rem; padding-bottom: 2rem; border-bottom: 2px solid #eee; }
21
+ .merge-doc:last-child { border-bottom: none; }
22
+ .merge-doc > h2.merge-title { margin-top: 0; }
23
+ .merge-columns { display: flex; gap: 2rem; align-items: flex-start; }
24
+ .merge-columns .merge-col { flex: 1 1 0; min-width: 0; }
25
+ @media (max-width: 48rem) { .merge-columns { flex-direction: column; } }
26
+ """
27
+
28
+ _DIFF_STYLE = """
29
+ table.diff { width: 100%; border-collapse: collapse; font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; font-size: 0.85rem; }
30
+ table.diff td { padding: 0 0.4rem; vertical-align: top; white-space: pre-wrap; word-break: break-word; }
31
+ .diff_header { color: #999; text-align: right; }
32
+ td.diff_header { padding-right: 0.6rem; }
33
+ .diff_next { background: #f3f3f3; }
34
+ .diff_add { background: #d6ffd6; }
35
+ .diff_chg { background: #fff5b1; }
36
+ .diff_sub { background: #ffd6d6; }
37
+ .diff-legend { font-size: 0.85rem; margin: 0.5rem 0 1rem; }
38
+ .diff-legend span { padding: 0.1rem 0.4rem; border-radius: 3px; margin-right: 0.5rem; }
39
+ @media (prefers-color-scheme: dark) {
40
+ .diff_add { background: #14532d; } .diff_sub { background: #5b1a1a; }
41
+ .diff_chg { background: #5c4d00; } .diff_next, .diff_header { color: #aaa; }
42
+ }
43
+ """
44
+
45
+
46
def body_fragment(html: str) -> str:
    """Return the markup found inside ``<body>`` of an HTML document.

    When the parsed input has no ``<body>`` element, the contents of the whole
    parsed document are returned instead.
    """
    from bs4 import BeautifulSoup

    document = BeautifulSoup(html, "html.parser")
    container = document.body
    if container:
        return container.decode_contents()
    return document.decode_contents()
53
+
54
+
55
def plain_text_lines(html: str) -> list[str]:
    """Return the text of an HTML document as a list of non-blank lines.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted. Each kept line has its trailing whitespace stripped;
    whitespace-only lines are dropped.
    """
    from bs4 import BeautifulSoup

    document = BeautifulSoup(html, "html.parser")
    for hidden in document(["script", "style"]):
        hidden.decompose()

    visible: list[str] = []
    for line in document.get_text("\n").splitlines():
        if line.strip():
            visible.append(line.rstrip())
    return visible
64
+
65
+
66
def build_merged_html(
    items: list[tuple[str, str]],
    *,
    title: str | None = None,
    layout: str = "stacked",
    include_toc: bool = True,
) -> str:
    """Assemble several ``(label, body_fragment)`` pairs into a single page.

    Args:
        items: Pairs of a label (escaped, used as the section heading) and an
            HTML fragment (inserted verbatim).
        title: Page title; defaults to ``"Merged document"``.
        layout: ``"columns"`` puts the documents side by side; any other value
            stacks them top-to-bottom, each in a section with id ``doc-<n>``
            (numbered from 1).
        include_toc: In the stacked layout, prepend a linked table of contents
            when there is more than one item.

    Returns:
        The complete HTML document produced by ``wrap_document``.
    """
    page_title = title or "Merged document"

    if layout == "columns":
        columns: list[str] = []
        for label, body in items:
            columns.append(
                f'<section class="merge-col"><h2 class="merge-title">{escape_text(label)}</h2>'
                f"{body}</section>"
            )
        joined_columns = "".join(columns)
        content = f'<div class="merge-columns">{joined_columns}</div>'
    else:
        sections: list[str] = []

        if include_toc and len(items) > 1:
            entries: list[str] = []
            for number, (label, _) in enumerate(items, start=1):
                entries.append(f'<li><a href="#doc-{number}">{escape_text(label)}</a></li>')
            links = "".join(entries)
            sections.append(
                f'<nav class="merge-toc"><strong>Contents</strong><ol>{links}</ol></nav>'
            )

        for number, (label, body) in enumerate(items, start=1):
            sections.append(
                f'<section class="merge-doc" id="doc-{number}">'
                f'<h2 class="merge-title">{escape_text(label)}</h2>{body}</section>'
            )

        content = "\n".join(sections)

    return wrap_document(
        content,
        title=page_title,
        extra_head=f"<style>{_MERGE_STYLE}</style>\n",
    )
112
+
113
+
114
def build_diff_html(
    left_label: str,
    left_lines: list[str],
    right_label: str,
    right_lines: list[str],
    *,
    title: str | None = None,
    context: bool = True,
    numlines: int = 3,
) -> str:
    """Produce an HTML page with a side-by-side line diff of two texts.

    Args:
        left_label: Column header for the left side (escaped before use).
        left_lines: Lines of the left document.
        right_label: Column header for the right side (escaped before use).
        right_lines: Lines of the right document.
        title: Page title and ``<h1>`` text; defaults to
            ``"Document comparison"``.
        context: Forwarded to ``difflib.HtmlDiff.make_table``.
        numlines: Forwarded to ``difflib.HtmlDiff.make_table``.

    Returns:
        The complete HTML document produced by ``wrap_document``.
    """
    heading = title or "Document comparison"

    # make_table inserts the descriptions into the header row as given,
    # so they are escaped here.
    diff_table = difflib.HtmlDiff(wrapcolumn=72).make_table(
        left_lines,
        right_lines,
        fromdesc=escape_attr(left_label),
        todesc=escape_attr(right_label),
        context=context,
        numlines=numlines,
    )

    legend = "".join(
        (
            '<p class="diff-legend">',
            '<span class="diff_add">added</span>',
            '<span class="diff_chg">changed</span>',
            '<span class="diff_sub">removed</span>',
            "</p>",
        )
    )

    page = f"<h1>{escape_text(heading)}</h1>{legend}{diff_table}"
    return wrap_document(
        page,
        title=heading,
        extra_head=f"<style>{_DIFF_STYLE}</style>\n",
    )
@@ -0,0 +1,46 @@
1
+ """Carries everything we know about an input stream as it flows through converters."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import asdict, dataclass, replace
6
+
7
+ __all__ = ["StreamInfo"]
8
+
9
+
10
+ @dataclass(frozen=True, kw_only=True)
11
+ class StreamInfo:
12
+ """Immutable bag of hints describing a binary stream.
13
+
14
+ Every field is optional. Converters use whichever signals are available
15
+ (extension, mimetype, magic bytes, declared charset) to decide whether they
16
+ can handle the stream. Because it is frozen, passing it between converters is
17
+ safe; use :meth:`copy_and_update` to derive a refined copy.
18
+ """
19
+
20
+ mimetype: str | None = None
21
+ extension: str | None = None
22
+ charset: str | None = None
23
+ filename: str | None = None
24
+ local_path: str | None = None
25
+ url: str | None = None
26
+
27
+ def copy_and_update(self, *args: StreamInfo | None, **kwargs: object) -> StreamInfo:
28
+ """Return a new ``StreamInfo`` with non-``None`` values layered on top.
29
+
30
+ Accepts other ``StreamInfo`` instances (applied in order) and/or keyword
31
+ overrides. ``None`` values never overwrite an existing value.
32
+ """
33
+ updates: dict[str, object] = {}
34
+ for arg in args:
35
+ if arg is None:
36
+ continue
37
+ updates.update({k: v for k, v in asdict(arg).items() if v is not None})
38
+ updates.update({k: v for k, v in kwargs.items() if v is not None})
39
+ return replace(self, **updates) # type: ignore[arg-type]
40
+
41
+ def normalized_extension(self) -> str | None:
42
+ """Lower-cased extension with a leading dot, or ``None``."""
43
+ if not self.extension:
44
+ return None
45
+ ext = self.extension.lower()
46
+ return ext if ext.startswith(".") else f".{ext}"
@@ -0,0 +1,46 @@
1
+ """Robust byte-stream-to-``str`` decoding shared by text-based converters."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import BinaryIO
6
+
7
+ from ._stream_info import StreamInfo
8
+
9
+ __all__ = ["read_text"]
10
+
11
+
12
def _strip_bom(text: str) -> str:
    """Drop a single leading U+FEFF (byte-order mark) left over from decoding."""
    return text[1:] if text.startswith("\ufeff") else text


def read_text(file_stream: BinaryIO, stream_info: StreamInfo) -> str:
    """Decode a binary stream to text.

    Order of preference:

    1. The charset declared on ``stream_info`` (skipped when it is unknown to
       Python or does not decode the data).
    2. UTF-8.
    3. ``charset_normalizer``'s best guess, when that package is usable.
    4. UTF-8 with replacement characters, so decoding never hard-fails.

    A leading byte-order mark is removed from decoded bytes. Files saved with
    a UTF-8 BOM would otherwise carry an invisible U+FEFF into the first
    character of the output (for CSV input, into the first header cell).

    Args:
        file_stream: Stream to read in full. If ``read()`` already returns
            ``str``, that value is returned unchanged.
        stream_info: Only its ``charset`` attribute is consulted.

    Returns:
        The decoded text.
    """
    raw = file_stream.read()
    if isinstance(raw, str):  # already-decoded stream
        return raw

    if stream_info.charset:
        try:
            return _strip_bom(raw.decode(stream_info.charset))
        except (LookupError, UnicodeDecodeError):
            # Unknown or wrong declared charset: fall through to detection.
            pass

    try:
        return _strip_bom(raw.decode("utf-8"))
    except UnicodeDecodeError:
        pass

    try:
        from charset_normalizer import from_bytes

        best = from_bytes(raw).best()
        if best is not None:
            return _strip_bom(str(best))
    except Exception:  # pragma: no cover - defensive, charset_normalizer is a dep
        # Deliberate best-effort: any detector failure falls back to the
        # lossy decode below.
        pass

    return _strip_bom(raw.decode("utf-8", errors="replace"))
@@ -0,0 +1,45 @@
1
+ """Built-in converters bundled with everythingtohtml.
2
+
3
+ Each converter is a small, self-contained class implementing the
4
+ :class:`~everythingtohtml._base_converter.DocumentConverter` contract.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from ._csv_converter import CsvConverter
10
+ from ._doc_converter import DocConverter
11
+ from ._docx_converter import DocxConverter
12
+ from ._eml_converter import EmlConverter
13
+ from ._epub_converter import EpubConverter
14
+ from ._html_converter import HtmlConverter
15
+ from ._ipynb_converter import IpynbConverter
16
+ from ._json_converter import JsonConverter
17
+ from ._markdown_converter import MarkdownConverter
18
+ from ._odt_converter import OdtConverter
19
+ from ._pdf_converter import PdfConverter
20
+ from ._plain_text_converter import PlainTextConverter
21
+ from ._pptx_converter import PptxConverter
22
+ from ._rss_converter import RssConverter
23
+ from ._rst_converter import RstConverter
24
+ from ._xlsx_converter import XlsxConverter
25
+ from ._yaml_converter import YamlConverter
26
+
27
+ __all__ = [
28
+ "CsvConverter",
29
+ "DocConverter",
30
+ "DocxConverter",
31
+ "EmlConverter",
32
+ "EpubConverter",
33
+ "HtmlConverter",
34
+ "IpynbConverter",
35
+ "JsonConverter",
36
+ "MarkdownConverter",
37
+ "OdtConverter",
38
+ "PdfConverter",
39
+ "PlainTextConverter",
40
+ "PptxConverter",
41
+ "RssConverter",
42
+ "RstConverter",
43
+ "XlsxConverter",
44
+ "YamlConverter",
45
+ ]
@@ -0,0 +1,73 @@
1
+ """CSV/TSV -> HTML ``<table>`` using the stdlib csv module with dialect sniffing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ import io
7
+ from typing import Any, BinaryIO
8
+
9
+ from .._base_converter import DocumentConverter, DocumentConverterResult
10
+ from .._html_builder import escape_text, wrap_document
11
+ from .._stream_info import StreamInfo
12
+ from .._text_utils import read_text
13
+
14
+ __all__ = ["CsvConverter"]
15
+
16
+ _ACCEPTED_EXTENSIONS = {".csv", ".tsv"}
17
+ _ACCEPTED_MIME_TYPES = {"text/csv", "text/tab-separated-values", "application/csv"}
18
+
19
+
20
class CsvConverter(DocumentConverter):
    """Render delimited data as an HTML table, treating the first row as headers."""

    priority = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return ``True`` when the extension or MIME type marks the stream as CSV/TSV.

        The MIME type is compared without parameters (anything after ``;``)
        and case-insensitively. The stream itself is not read.
        """
        ext = stream_info.normalized_extension()
        mimetype = (stream_info.mimetype or "").split(";", 1)[0].strip().lower()
        return ext in _ACCEPTED_EXTENSIONS or mimetype in _ACCEPTED_MIME_TYPES

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert delimited text into a full HTML document containing one table.

        ``.tsv`` input is always split on tabs. For anything else the
        delimiter is sniffed from the first 8192 characters (candidates:
        ``,`` ``;`` tab ``|``), falling back to a comma when sniffing fails.
        The first record becomes the header row; every cell is HTML-escaped.
        Input with no records yields a short "(empty file)" placeholder.
        The stream's filename is used as the document title.
        """
        text = read_text(file_stream, stream_info)
        ext = stream_info.normalized_extension()

        delimiter = "\t" if ext == ".tsv" else None
        if delimiter is None:
            try:
                dialect = csv.Sniffer().sniff(text[:8192], delimiters=",;\t|")
                delimiter = dialect.delimiter
            except csv.Error:
                delimiter = ","

        # newline="" follows the csv module's guidance: line splitting happens
        # on "\n", "\r" and "\r\n" without translation, so records terminated
        # by a lone "\r" are split correctly instead of reaching the parser as
        # a single line.
        reader = csv.reader(io.StringIO(text, newline=""), delimiter=delimiter)
        rows = list(reader)

        title = stream_info.filename
        if not rows:
            body = "<p><em>(empty file)</em></p>"
            return DocumentConverterResult(wrap_document(body, title=title), title=title)

        parts: list[str] = ["<table>"]
        header, *data_rows = rows
        parts.append("<thead><tr>")
        parts.extend(f"<th>{escape_text(cell)}</th>" for cell in header)
        parts.append("</tr></thead>")
        parts.append("<tbody>")
        for row in data_rows:
            parts.append("<tr>")
            parts.extend(f"<td>{escape_text(cell)}</td>" for cell in row)
            parts.append("</tr>")
        parts.append("</tbody></table>")

        html = wrap_document("".join(parts), title=title)
        return DocumentConverterResult(html, title=title)