everythingtohtml 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- everythingtohtml/__about__.py +3 -0
- everythingtohtml/__init__.py +34 -0
- everythingtohtml/__main__.py +130 -0
- everythingtohtml/_base_converter.py +78 -0
- everythingtohtml/_everything_to_html.py +408 -0
- everythingtohtml/_exceptions.py +63 -0
- everythingtohtml/_html_builder.py +106 -0
- everythingtohtml/_merge.py +145 -0
- everythingtohtml/_stream_info.py +46 -0
- everythingtohtml/_text_utils.py +46 -0
- everythingtohtml/converters/__init__.py +45 -0
- everythingtohtml/converters/_csv_converter.py +73 -0
- everythingtohtml/converters/_doc_converter.py +385 -0
- everythingtohtml/converters/_docx_converter.py +105 -0
- everythingtohtml/converters/_eml_converter.py +104 -0
- everythingtohtml/converters/_epub_converter.py +131 -0
- everythingtohtml/converters/_html_converter.py +66 -0
- everythingtohtml/converters/_ipynb_converter.py +96 -0
- everythingtohtml/converters/_json_converter.py +78 -0
- everythingtohtml/converters/_markdown_converter.py +57 -0
- everythingtohtml/converters/_odt_converter.py +171 -0
- everythingtohtml/converters/_pdf_converter.py +204 -0
- everythingtohtml/converters/_plain_text_converter.py +64 -0
- everythingtohtml/converters/_pptx_converter.py +233 -0
- everythingtohtml/converters/_rss_converter.py +146 -0
- everythingtohtml/converters/_rst_converter.py +57 -0
- everythingtohtml/converters/_xlsx_converter.py +84 -0
- everythingtohtml/converters/_yaml_converter.py +56 -0
- everythingtohtml/py.typed +0 -0
- everythingtohtml-0.1.2.dist-info/METADATA +294 -0
- everythingtohtml-0.1.2.dist-info/RECORD +34 -0
- everythingtohtml-0.1.2.dist-info/WHEEL +4 -0
- everythingtohtml-0.1.2.dist-info/entry_points.txt +3 -0
- everythingtohtml-0.1.2.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Exception hierarchy for everythingtohtml.
|
|
2
|
+
|
|
3
|
+
The layout mirrors the ergonomics of well-behaved conversion libraries: a single
|
|
4
|
+
base class so callers can ``except EverythingToHtmlException`` and catch anything
|
|
5
|
+
the library raises on purpose.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"EverythingToHtmlException",
|
|
12
|
+
"UnsupportedFormatException",
|
|
13
|
+
"FileConversionException",
|
|
14
|
+
"MissingDependencyException",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class EverythingToHtmlException(Exception):
    """Root of the everythingtohtml exception hierarchy.

    Catching this type catches every exception the library raises on purpose.
    """
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class MissingDependencyException(EverythingToHtmlException):
    """Signals that a converter's optional dependency is not installed.

    The message is expected to name the extra that provides it, for example::

        pip install everythingtohtml[docx]
    """
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class UnsupportedFormatException(EverythingToHtmlException):
    """Signals that none of the registered converters can handle the input."""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class FailedConversionAttempt:
    """Bookkeeping for a converter that accepted the input but then failed.

    Instances are collected so :class:`FileConversionException` can report every
    attempt rather than only the last traceback, which makes debugging
    multi-converter inputs far easier.

    Attributes:
        converter: The converter object that made the attempt.
        exc_info: Whatever error information the caller recorded for the
            attempt, or ``None``.  (Presumably a ``sys.exc_info()`` tuple;
            this class stores it untouched.)
    """

    def __init__(self, converter: object, exc_info: object | None = None) -> None:
        self.converter = converter
        self.exc_info = exc_info

    def __repr__(self) -> str:
        # This type exists purely for debugging, so a list of attempts must
        # print as something more useful than bare object addresses.
        return (
            f"{type(self).__name__}"
            f"(converter={self.converter!r}, exc_info={self.exc_info!r})"
        )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class FileConversionException(EverythingToHtmlException):
    """Signals that every converter which accepted the input went on to fail.

    Attributes:
        attempts: The recorded failed attempts (an empty list when none were
            supplied).
    """

    def __init__(
        self,
        message: str | None = None,
        attempts: list[FailedConversionAttempt] | None = None,
    ) -> None:
        """Store *attempts* and build a default message when none is given.

        Args:
            message: Explicit exception message.  When ``None``, one is derived
                from the class names of the attempted converters, or a generic
                sentence is used if there were no attempts.
            attempts: Failed attempts to attach to the exception.
        """
        recorded = attempts if attempts else []
        self.attempts = recorded

        if message is not None:
            text = message
        elif recorded:
            converter_names = [type(item.converter).__name__ for item in recorded]
            joined = ", ".join(converter_names)
            text = f"All converters that accepted the input failed: {joined}"
        else:
            text = "File conversion failed."

        super().__init__(text)
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Helpers for assembling clean, self-contained HTML documents.
|
|
2
|
+
|
|
3
|
+
Every converter produces an HTML *fragment* (the meaningful body markup). This
|
|
4
|
+
module wraps fragments into a full document with a small, readable default
|
|
5
|
+
stylesheet so the output looks decent on its own while staying easy to restyle.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from html import escape as _escape
|
|
11
|
+
|
|
12
|
+
__all__ = ["escape_text", "escape_attr", "wrap_document", "DEFAULT_STYLESHEET"]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
DEFAULT_STYLESHEET = """\
|
|
16
|
+
:root { color-scheme: light dark; }
|
|
17
|
+
* { box-sizing: border-box; }
|
|
18
|
+
body {
|
|
19
|
+
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica,
|
|
20
|
+
Arial, sans-serif;
|
|
21
|
+
line-height: 1.6;
|
|
22
|
+
max-width: 50rem;
|
|
23
|
+
margin: 2rem auto;
|
|
24
|
+
padding: 0 1rem;
|
|
25
|
+
color: #1a1a1a;
|
|
26
|
+
background: #ffffff;
|
|
27
|
+
}
|
|
28
|
+
@media (prefers-color-scheme: dark) {
|
|
29
|
+
body { color: #e6e6e6; background: #121212; }
|
|
30
|
+
a { color: #6ea8fe; }
|
|
31
|
+
table th { background: #1e1e1e; }
|
|
32
|
+
table td, table th { border-color: #333; }
|
|
33
|
+
table tr:nth-child(even) td { background: rgba(255,255,255,0.03); }
|
|
34
|
+
pre, code { background: #1e1e1e; }
|
|
35
|
+
}
|
|
36
|
+
h1, h2, h3 { line-height: 1.25; }
|
|
37
|
+
a { color: #0b5cff; }
|
|
38
|
+
/* Tables: scroll horizontally when wider than the page instead of overflowing. */
|
|
39
|
+
table {
|
|
40
|
+
border-collapse: collapse;
|
|
41
|
+
margin: 1rem 0;
|
|
42
|
+
display: block;
|
|
43
|
+
max-width: 100%;
|
|
44
|
+
overflow-x: auto;
|
|
45
|
+
font-size: 0.95em;
|
|
46
|
+
}
|
|
47
|
+
table td, table th {
|
|
48
|
+
border: 1px solid #d4d4dc;
|
|
49
|
+
padding: 0.4rem 0.65rem;
|
|
50
|
+
text-align: left;
|
|
51
|
+
vertical-align: top;
|
|
52
|
+
}
|
|
53
|
+
table th { background: #f5f5f5; font-weight: 600; }
|
|
54
|
+
table tr:nth-child(even) td { background: rgba(0,0,0,0.025); }
|
|
55
|
+
/* Office converters wrap cell content in <p>; keep cells compact. */
|
|
56
|
+
table td > p, table th > p { margin: 0.1rem 0; }
|
|
57
|
+
table td > p:only-child, table th > p:only-child { margin: 0; }
|
|
58
|
+
pre {
|
|
59
|
+
background: #f5f5f5; padding: 1rem; overflow-x: auto; border-radius: 6px;
|
|
60
|
+
}
|
|
61
|
+
code { background: #f5f5f5; padding: 0.1rem 0.3rem; border-radius: 4px; }
|
|
62
|
+
pre code { padding: 0; background: none; }
|
|
63
|
+
blockquote {
|
|
64
|
+
border-left: 4px solid #ddd; margin: 1rem 0; padding: 0.2rem 1rem; color: #666;
|
|
65
|
+
}
|
|
66
|
+
img { max-width: 100%; height: auto; }
|
|
67
|
+
"""
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def escape_text(text: str) -> str:
    """Return *text* with ``&``, ``<`` and ``>`` escaped for HTML element content.

    Quote characters are left untouched.
    """
    escaped = _escape(text, False)
    return escaped
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def escape_attr(text: str) -> str:
    """Return *text* escaped for use inside a double-quoted HTML attribute.

    In addition to ``&``, ``<`` and ``>``, both quote characters are escaped.
    """
    escaped = _escape(text, True)
    return escaped
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def wrap_document(
    body: str,
    *,
    title: str | None = None,
    lang: str = "en",
    include_style: bool = True,
    extra_head: str = "",
) -> str:
    """Build a standalone HTML5 document around an HTML *fragment*.

    Args:
        body: Markup placed verbatim inside ``<body>``.
        title: Document title; it is escaped before use.  A falsy value
            selects the fallback ``"Converted Document"``.
        lang: Value of the ``lang`` attribute on ``<html>`` (attribute-escaped).
        include_style: When true, embed ``DEFAULT_STYLESHEET`` in a ``<style>``
            element.
        extra_head: Markup inserted verbatim just before ``</head>``.

    Returns:
        The complete document as a string ending in a newline.
    """
    if title:
        page_title = escape_text(title)
    else:
        page_title = "Converted Document"

    if include_style:
        style_block = f"<style>\n{DEFAULT_STYLESHEET}</style>\n"
    else:
        style_block = ""

    # Each entry carries its own trailing newline; style_block and extra_head
    # are inserted as-is so callers control their own line breaks.
    pieces = [
        "<!DOCTYPE html>\n",
        f'<html lang="{escape_attr(lang)}">\n',
        "<head>\n",
        '<meta charset="utf-8">\n',
        '<meta name="viewport" content="width=device-width, initial-scale=1">\n',
        '<meta name="generator" content="everythingtohtml">\n',
        f"<title>{page_title}</title>\n",
        f"{style_block}",
        f"{extra_head}",
        "</head>\n",
        "<body>\n",
        f"{body}\n",
        "</body>\n",
        "</html>\n",
    ]
    return "".join(pieces)
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
"""Combine and compare multiple converted documents in a single HTML page.
|
|
2
|
+
|
|
3
|
+
These helpers operate on the *output* of converters, so they work for any format
|
|
4
|
+
the library supports — merge a folder of Word docs, stack a PDF next to its
|
|
5
|
+
Markdown source, or diff two revisions of a spec.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import difflib
|
|
11
|
+
|
|
12
|
+
from ._html_builder import escape_attr, escape_text, wrap_document
|
|
13
|
+
|
|
14
|
+
__all__ = ["body_fragment", "plain_text_lines", "build_merged_html", "build_diff_html"]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
_MERGE_STYLE = """
|
|
18
|
+
.merge-toc { border: 1px solid #ddd; border-radius: 6px; padding: 0.5rem 1rem; margin-bottom: 2rem; }
|
|
19
|
+
.merge-toc ol { margin: 0.3rem 0; }
|
|
20
|
+
.merge-doc { margin-bottom: 3rem; padding-bottom: 2rem; border-bottom: 2px solid #eee; }
|
|
21
|
+
.merge-doc:last-child { border-bottom: none; }
|
|
22
|
+
.merge-doc > h2.merge-title { margin-top: 0; }
|
|
23
|
+
.merge-columns { display: flex; gap: 2rem; align-items: flex-start; }
|
|
24
|
+
.merge-columns .merge-col { flex: 1 1 0; min-width: 0; }
|
|
25
|
+
@media (max-width: 48rem) { .merge-columns { flex-direction: column; } }
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
_DIFF_STYLE = """
|
|
29
|
+
table.diff { width: 100%; border-collapse: collapse; font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace; font-size: 0.85rem; }
|
|
30
|
+
table.diff td { padding: 0 0.4rem; vertical-align: top; white-space: pre-wrap; word-break: break-word; }
|
|
31
|
+
.diff_header { color: #999; text-align: right; }
|
|
32
|
+
td.diff_header { padding-right: 0.6rem; }
|
|
33
|
+
.diff_next { background: #f3f3f3; }
|
|
34
|
+
.diff_add { background: #d6ffd6; }
|
|
35
|
+
.diff_chg { background: #fff5b1; }
|
|
36
|
+
.diff_sub { background: #ffd6d6; }
|
|
37
|
+
.diff-legend { font-size: 0.85rem; margin: 0.5rem 0 1rem; }
|
|
38
|
+
.diff-legend span { padding: 0.1rem 0.4rem; border-radius: 3px; margin-right: 0.5rem; }
|
|
39
|
+
@media (prefers-color-scheme: dark) {
|
|
40
|
+
.diff_add { background: #14532d; } .diff_sub { background: #5b1a1a; }
|
|
41
|
+
.diff_chg { background: #5c4d00; } .diff_next, .diff_header { color: #aaa; }
|
|
42
|
+
}
|
|
43
|
+
"""
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def body_fragment(html: str) -> str:
    """Return the markup inside ``<body>`` of an HTML document.

    If the parsed input has no ``<body>`` element, the markup of the whole
    parsed input is returned instead.
    """
    # Imported lazily so that merely importing this module does not require bs4.
    from bs4 import BeautifulSoup

    parsed = BeautifulSoup(html, "html.parser")
    container = parsed.body
    if container:
        return container.decode_contents()
    return parsed.decode_contents()
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def plain_text_lines(html: str) -> list[str]:
    """Return the text of an HTML document as a list of non-blank lines.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted.  Trailing whitespace is trimmed from each line and lines that
    contain only whitespace are dropped.
    """
    # Imported lazily so that merely importing this module does not require bs4.
    from bs4 import BeautifulSoup

    document = BeautifulSoup(html, "html.parser")
    for hidden in document.find_all(["script", "style"]):
        hidden.decompose()

    lines: list[str] = []
    for raw_line in document.get_text("\n").splitlines():
        if raw_line.strip():
            lines.append(raw_line.rstrip())
    return lines
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def build_merged_html(
    items: list[tuple[str, str]],
    *,
    title: str | None = None,
    layout: str = "stacked",
    include_toc: bool = True,
) -> str:
    """Merge ``(label, body_fragment)`` pairs into a single HTML document.

    Args:
        items: Pairs of a label and an HTML fragment.  Labels are escaped;
            fragments are inserted verbatim.
        title: Document title; a falsy value selects ``"Merged document"``.
        layout: ``"columns"`` places the documents side by side.  Any other
            value renders them top-to-bottom.
        include_toc: In the top-to-bottom layout, emit a table of contents
            when there is more than one item.  Not used by ``"columns"``.

    Returns:
        The complete HTML document.
    """
    heading = title if title else "Merged document"
    head_css = f"<style>{_MERGE_STYLE}</style>\n"

    if layout == "columns":
        columns: list[str] = []
        for label, fragment in items:
            columns.append(
                f'<section class="merge-col"><h2 class="merge-title">{escape_text(label)}</h2>'
                f"{fragment}</section>"
            )
        page_body = f'<div class="merge-columns">{"".join(columns)}</div>'
    else:
        blocks: list[str] = []

        if include_toc and len(items) > 1:
            entries: list[str] = []
            for number, (label, _) in enumerate(items, start=1):
                entries.append(
                    f'<li><a href="#doc-{number}">{escape_text(label)}</a></li>'
                )
            toc_links = "".join(entries)
            blocks.append(
                f'<nav class="merge-toc"><strong>Contents</strong><ol>{toc_links}</ol></nav>'
            )

        # Section ids use the same 1-based numbering as the TOC anchors above.
        for number, (label, fragment) in enumerate(items, start=1):
            blocks.append(
                f'<section class="merge-doc" id="doc-{number}">'
                f'<h2 class="merge-title">{escape_text(label)}</h2>{fragment}</section>'
            )

        page_body = "\n".join(blocks)

    return wrap_document(page_body, title=heading, extra_head=head_css)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def build_diff_html(
    left_label: str,
    left_lines: list[str],
    right_label: str,
    right_lines: list[str],
    *,
    title: str | None = None,
    context: bool = True,
    numlines: int = 3,
) -> str:
    """Produce an HTML page with a side-by-side line diff of two texts.

    Args:
        left_label: Column heading for the left side.
        left_lines: Lines of the left document.
        right_label: Column heading for the right side.
        right_lines: Lines of the right document.
        title: Page title and ``<h1>`` text; a falsy value selects
            ``"Document comparison"``.
        context: Forwarded to ``difflib.HtmlDiff.make_table``.
        numlines: Forwarded to ``difflib.HtmlDiff.make_table``.

    Returns:
        The complete HTML document.
    """
    heading = title or "Document comparison"

    # The labels are escaped here before being handed to difflib as column
    # descriptions.
    diff_table = difflib.HtmlDiff(wrapcolumn=72).make_table(
        left_lines,
        right_lines,
        fromdesc=escape_attr(left_label),
        todesc=escape_attr(right_label),
        context=context,
        numlines=numlines,
    )

    legend_parts = [
        '<p class="diff-legend">',
        '<span class="diff_add">added</span>',
        '<span class="diff_chg">changed</span>',
        '<span class="diff_sub">removed</span></p>',
    ]
    legend = "".join(legend_parts)

    page_body = f"<h1>{escape_text(heading)}</h1>{legend}{diff_table}"
    head_css = f"<style>{_DIFF_STYLE}</style>\n"
    return wrap_document(page_body, title=heading, extra_head=head_css)
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Carries everything we know about an input stream as it flows through converters."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import asdict, dataclass, replace
|
|
6
|
+
|
|
7
|
+
__all__ = ["StreamInfo"]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True, kw_only=True)
|
|
11
|
+
class StreamInfo:
|
|
12
|
+
"""Immutable bag of hints describing a binary stream.
|
|
13
|
+
|
|
14
|
+
Every field is optional. Converters use whichever signals are available
|
|
15
|
+
(extension, mimetype, magic bytes, declared charset) to decide whether they
|
|
16
|
+
can handle the stream. Because it is frozen, passing it between converters is
|
|
17
|
+
safe; use :meth:`copy_and_update` to derive a refined copy.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
mimetype: str | None = None
|
|
21
|
+
extension: str | None = None
|
|
22
|
+
charset: str | None = None
|
|
23
|
+
filename: str | None = None
|
|
24
|
+
local_path: str | None = None
|
|
25
|
+
url: str | None = None
|
|
26
|
+
|
|
27
|
+
def copy_and_update(self, *args: StreamInfo | None, **kwargs: object) -> StreamInfo:
|
|
28
|
+
"""Return a new ``StreamInfo`` with non-``None`` values layered on top.
|
|
29
|
+
|
|
30
|
+
Accepts other ``StreamInfo`` instances (applied in order) and/or keyword
|
|
31
|
+
overrides. ``None`` values never overwrite an existing value.
|
|
32
|
+
"""
|
|
33
|
+
updates: dict[str, object] = {}
|
|
34
|
+
for arg in args:
|
|
35
|
+
if arg is None:
|
|
36
|
+
continue
|
|
37
|
+
updates.update({k: v for k, v in asdict(arg).items() if v is not None})
|
|
38
|
+
updates.update({k: v for k, v in kwargs.items() if v is not None})
|
|
39
|
+
return replace(self, **updates) # type: ignore[arg-type]
|
|
40
|
+
|
|
41
|
+
def normalized_extension(self) -> str | None:
|
|
42
|
+
"""Lower-cased extension with a leading dot, or ``None``."""
|
|
43
|
+
if not self.extension:
|
|
44
|
+
return None
|
|
45
|
+
ext = self.extension.lower()
|
|
46
|
+
return ext if ext.startswith(".") else f".{ext}"
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Robust byte-stream-to-``str`` decoding shared by text-based converters."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import BinaryIO
|
|
6
|
+
|
|
7
|
+
from ._stream_info import StreamInfo
|
|
8
|
+
|
|
9
|
+
__all__ = ["read_text"]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def read_text(file_stream: BinaryIO, stream_info: StreamInfo) -> str:
    """Decode a binary stream to text, dropping a leading byte-order mark.

    Order of preference:

    1. The charset declared on ``stream_info`` (e.g. parsed from an HTTP header).
    2. UTF-8, which is correct the overwhelming majority of the time.
    3. ``charset_normalizer``'s best guess for legacy encodings.
    4. UTF-8 with replacement as a last resort so we never hard-fail.

    Args:
        file_stream: Stream to read in full.  If its ``read()`` returns ``str``
            the text is returned unchanged.
        stream_info: Only its ``charset`` attribute is consulted.

    Returns:
        The decoded text.
    """
    raw = file_stream.read()
    if isinstance(raw, str):  # already-decoded stream
        return raw

    if stream_info.charset:
        try:
            declared = raw.decode(stream_info.charset)
        except (LookupError, UnicodeDecodeError):
            # Unknown codec name or bytes that do not match it: fall through.
            pass
        else:
            # Codecs such as plain "utf-8" keep the BOM as U+FEFF; it is not
            # content and would otherwise leak into the first line.
            return declared[1:] if declared.startswith("\ufeff") else declared

    try:
        # "utf-8-sig" is UTF-8 that additionally skips a leading BOM if present.
        return raw.decode("utf-8-sig")
    except UnicodeDecodeError:
        pass

    try:
        from charset_normalizer import from_bytes

        best = from_bytes(raw).best()
        if best is not None:
            return str(best)
    except Exception:  # pragma: no cover - defensive, charset_normalizer is a dep
        # Deliberate best-effort: any failure here falls back to lossy UTF-8.
        pass

    return raw.decode("utf-8-sig", errors="replace")
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Built-in converters bundled with everythingtohtml.
|
|
2
|
+
|
|
3
|
+
Each converter is a small, self-contained class implementing the
|
|
4
|
+
:class:`~everythingtohtml._base_converter.DocumentConverter` contract.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from ._csv_converter import CsvConverter
|
|
10
|
+
from ._doc_converter import DocConverter
|
|
11
|
+
from ._docx_converter import DocxConverter
|
|
12
|
+
from ._eml_converter import EmlConverter
|
|
13
|
+
from ._epub_converter import EpubConverter
|
|
14
|
+
from ._html_converter import HtmlConverter
|
|
15
|
+
from ._ipynb_converter import IpynbConverter
|
|
16
|
+
from ._json_converter import JsonConverter
|
|
17
|
+
from ._markdown_converter import MarkdownConverter
|
|
18
|
+
from ._odt_converter import OdtConverter
|
|
19
|
+
from ._pdf_converter import PdfConverter
|
|
20
|
+
from ._plain_text_converter import PlainTextConverter
|
|
21
|
+
from ._pptx_converter import PptxConverter
|
|
22
|
+
from ._rss_converter import RssConverter
|
|
23
|
+
from ._rst_converter import RstConverter
|
|
24
|
+
from ._xlsx_converter import XlsxConverter
|
|
25
|
+
from ._yaml_converter import YamlConverter
|
|
26
|
+
|
|
27
|
+
__all__ = [
|
|
28
|
+
"CsvConverter",
|
|
29
|
+
"DocConverter",
|
|
30
|
+
"DocxConverter",
|
|
31
|
+
"EmlConverter",
|
|
32
|
+
"EpubConverter",
|
|
33
|
+
"HtmlConverter",
|
|
34
|
+
"IpynbConverter",
|
|
35
|
+
"JsonConverter",
|
|
36
|
+
"MarkdownConverter",
|
|
37
|
+
"OdtConverter",
|
|
38
|
+
"PdfConverter",
|
|
39
|
+
"PlainTextConverter",
|
|
40
|
+
"PptxConverter",
|
|
41
|
+
"RssConverter",
|
|
42
|
+
"RstConverter",
|
|
43
|
+
"XlsxConverter",
|
|
44
|
+
"YamlConverter",
|
|
45
|
+
]
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""CSV/TSV -> HTML ``<table>`` using the stdlib csv module with dialect sniffing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import csv
|
|
6
|
+
import io
|
|
7
|
+
from typing import Any, BinaryIO
|
|
8
|
+
|
|
9
|
+
from .._base_converter import DocumentConverter, DocumentConverterResult
|
|
10
|
+
from .._html_builder import escape_text, wrap_document
|
|
11
|
+
from .._stream_info import StreamInfo
|
|
12
|
+
from .._text_utils import read_text
|
|
13
|
+
|
|
14
|
+
__all__ = ["CsvConverter"]
|
|
15
|
+
|
|
16
|
+
# Extensions this converter claims; compared against
# StreamInfo.normalized_extension(), i.e. lower-case with a leading dot.
_ACCEPTED_EXTENSIONS = {".csv", ".tsv"}
|
|
17
|
+
# Mimetypes this converter claims; compared against the lower-cased mimetype
# with any ";parameter" suffix removed.
_ACCEPTED_MIME_TYPES = {"text/csv", "text/tab-separated-values", "application/csv"}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class CsvConverter(DocumentConverter):
    """Render delimited data (CSV/TSV) as an HTML table.

    The first non-blank row is treated as the header row; every following
    non-blank row becomes one body row.
    """

    priority = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT

    @staticmethod
    def _base_mimetype(stream_info: StreamInfo) -> str:
        """Return the lower-cased mimetype without parameters (``""`` if unset)."""
        return (stream_info.mimetype or "").split(";", 1)[0].strip().lower()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return ``True`` when the extension or mimetype marks delimited data.

        The stream itself is not read.
        """
        ext = stream_info.normalized_extension()
        mimetype = self._base_mimetype(stream_info)
        return ext in _ACCEPTED_EXTENSIONS or mimetype in _ACCEPTED_MIME_TYPES

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Convert the stream into an HTML document containing one table.

        Args:
            file_stream: Stream holding the delimited data; read in full.
            stream_info: Supplies the extension and mimetype used to choose
                the delimiter, and the filename used as the document title.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            A ``DocumentConverterResult`` wrapping the generated document.  If
            there are no non-blank rows the document contains an
            "(empty file)" note instead of a table.

        Raises:
            csv.Error: If the ``csv`` module cannot parse the data.
        """
        text = read_text(file_stream, stream_info)
        ext = stream_info.normalized_extension()
        mimetype = self._base_mimetype(stream_info)

        # Input declared as TSV by extension, or by mimetype when the extension
        # does not say ".csv", is split on tabs rather than guessed: the
        # sniffer can fail on single-column or irregular data and would then
        # fall back to ",".
        declared_tsv = ext == ".tsv" or (
            ext != ".csv" and mimetype == "text/tab-separated-values"
        )
        if declared_tsv:
            delimiter = "\t"
        else:
            try:
                delimiter = csv.Sniffer().sniff(text[:8192], delimiters=",;\t|").delimiter
            except csv.Error:
                delimiter = ","

        # newline="" hands line endings to the csv module untouched, as its
        # documentation requires; this keeps bare "\r" row terminators working.
        reader = csv.reader(io.StringIO(text, newline=""), delimiter=delimiter)
        # csv.reader yields [] for blank lines; skip them so they do not become
        # empty <tr> elements or get mistaken for the header row.
        rows = [row for row in reader if row]

        title = stream_info.filename
        if not rows:
            body = "<p><em>(empty file)</em></p>"
            return DocumentConverterResult(wrap_document(body, title=title), title=title)

        header, *data_rows = rows
        parts: list[str] = ["<table>", "<thead><tr>"]
        parts.extend(f"<th>{escape_text(cell)}</th>" for cell in header)
        parts.append("</tr></thead>")
        parts.append("<tbody>")
        for row in data_rows:
            parts.append("<tr>")
            parts.extend(f"<td>{escape_text(cell)}</td>" for cell in row)
            parts.append("</tr>")
        parts.append("</tbody></table>")

        html = wrap_document("".join(parts), title=title)
        return DocumentConverterResult(html, title=title)
|