decant-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- decant/__init__.py +0 -0
- decant/cli/__init__.py +0 -0
- decant/cli/main.py +158 -0
- decant/core/__init__.py +0 -0
- decant/core/constants.py +65 -0
- decant/core/content_selector.py +77 -0
- decant/core/degradation.py +147 -0
- decant/core/model.py +139 -0
- decant/core/parser.py +1073 -0
- decant/core/renderer.py +578 -0
- decant/core/sanitizer.py +58 -0
- decant/io/__init__.py +0 -0
- decant/io/reader.py +31 -0
- decant/io/writer.py +26 -0
- decant_cli-0.1.0.dist-info/METADATA +63 -0
- decant_cli-0.1.0.dist-info/RECORD +20 -0
- decant_cli-0.1.0.dist-info/WHEEL +5 -0
- decant_cli-0.1.0.dist-info/entry_points.txt +2 -0
- decant_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
- decant_cli-0.1.0.dist-info/top_level.txt +1 -0
decant/core/renderer.py
ADDED
|
@@ -0,0 +1,578 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Document model to HTML renderer.
|
|
3
|
+
|
|
4
|
+
Generates self-contained readable HTML with inline CSS.
|
|
5
|
+
Pipeline: Step 5 (final - after model creation)
|
|
6
|
+
See decisions.md section 10 for rendering invariants.
|
|
7
|
+
"""
|
|
8
|
+
import html as html_module
|
|
9
|
+
|
|
10
|
+
from decant.core.model import (
|
|
11
|
+
Document, Section, Heading,
|
|
12
|
+
Paragraph, ListBlock, ListItem, Quote, Preformatted, Image, Table,
|
|
13
|
+
Text, Emphasis, Strong, Code, Link, LineBreak
|
|
14
|
+
)
|
|
15
|
+
from decant.core.constants import (
|
|
16
|
+
FONT_STACK, BODY_FONT_SIZE, HEADING_MULTIPLIERS,
|
|
17
|
+
LINE_HEIGHT, LETTER_SPACING, WORD_SPACING,
|
|
18
|
+
PARAGRAPH_SPACING, HEADING_MARGIN_TOP, HEADING_MARGIN_BOTTOM,
|
|
19
|
+
LIST_ITEM_SPACING,
|
|
20
|
+
BACKGROUND_COLOR, TEXT_COLOR, LINK_COLOR, LINK_HOVER_COLOR, LINK_VISITED_COLOR,
|
|
21
|
+
MAX_LINE_WIDTH, CONTAINER_PADDING,
|
|
22
|
+
PRINT_MIN_FONT_SIZE,
|
|
23
|
+
OPENDYSLEXIC_BASE64,
|
|
24
|
+
OPENDYSLEXIC_BOLD_BASE64
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _is_placeholder(paragraph: Paragraph) -> bool:
    """Tell whether a paragraph is a bracketed placeholder such as "[Table removed]".

    A placeholder is a paragraph holding exactly one ``Text`` inline whose
    text begins with "[" and finishes with "]".
    """
    inlines = paragraph.inlines
    if len(inlines) == 1 and isinstance(inlines[0], Text):
        text = inlines[0].text
        return text.startswith("[") and text.endswith("]")
    return False
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _placeholder_type(text: str) -> str:
    """Map placeholder text to its category name.

    Returns "table", "image" or "form" for text opening with "[Table",
    "[Image" or "[Form" respectively, "hr" for the exact text "[-]", and
    "other" for anything else.
    """
    # Checked in this order; the first matching prefix wins.
    prefix_categories = (
        ("[Table", "table"),
        ("[Image", "image"),
        ("[Form", "form"),
    )
    for prefix, category in prefix_categories:
        if text.startswith(prefix):
            return category
    return "hr" if text == "[-]" else "other"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def render_notice_banner(document: Document) -> str:
    """Build the notice banner summarising content replaced by placeholders.

    Table, image and form placeholders are tallied across every section,
    including placeholders nested inside block quotes: ``render_quote``
    renders those as marked notes as well, so the banner has to count them
    or it under-reports (or disappears entirely). Horizontal-rule ("hr")
    and unclassified ("other") placeholders are never reported.

    Args:
        document: Document model whose sections are scanned. When its
            ``source_url`` is set, the banner links to it.

    Returns:
        A ``<div class="decant-notice">`` element followed by a newline, or
        an empty string when there is nothing to report.
    """
    reported_types = ("table", "image", "form")
    counts: dict[str, int] = {}

    def tally(blocks) -> None:
        """Count reportable placeholders in *blocks*, descending into quotes."""
        for block in blocks:
            if isinstance(block, Quote):
                # Quotes are the only block type that nests other blocks.
                tally(block.blocks)
            elif isinstance(block, Paragraph) and _is_placeholder(block):
                ptype = _placeholder_type(block.inlines[0].text)
                if ptype in reported_types:
                    counts[ptype] = counts.get(ptype, 0) + 1

    for section in document.sections:
        tally(section.blocks)

    if not counts:
        return ""

    # Fixed reporting order (table, image, form); pluralise when count != 1.
    parts = []
    for ptype in reported_types:
        n = counts.get(ptype, 0)
        if n > 0:
            label = ptype + ("s" if n != 1 else "")
            parts.append(f"{n} {label}")

    # Natural-language list: "a", "a and b", or "a, b, and c".
    if len(parts) == 1:
        summary = parts[0]
    elif len(parts) == 2:
        summary = f"{parts[0]} and {parts[1]}"
    else:
        summary = ", ".join(parts[:-1]) + f", and {parts[-1]}"

    if document.source_url:
        # Escaped because the URL is placed inside an href attribute.
        escaped_url = html_module.escape(document.source_url)
        suffix = (
            f', or <a href="{escaped_url}">'
            f'view the original page</a> for the full content.'
        )
    else:
        suffix = " for details."

    return (
        f'<div class="decant-notice">'
        f'This document contains {summary} that could not be included. '
        f'Look for the marked notes below{suffix}'
        f'</div>\n'
    )
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def render(document: Document, use_opendyslexic: bool = False) -> str:
    """
    Turn a Document model into one self-contained HTML page.

    Args:
        document: Document model supplying the title, sections and source URL
        use_opendyslexic: If True, the stylesheet embeds the OpenDyslexic font

    Returns:
        Full HTML document as a string, with the stylesheet inlined
    """
    css = generate_css(use_opendyslexic)

    origin_url = document.source_url
    rendered_sections = [
        render_section(section, source_url=origin_url)
        for section in document.sections
    ]
    sections_html = "\n".join(rendered_sections)
    banner = render_notice_banner(document)
    page_title = html_module.escape(document.title)

    # Page skeleton: banner (possibly empty) goes directly before the sections
    return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>{page_title}</title>
<style>
{css}
</style>
</head>
<body>
<div class="container">
{banner}{sections_html}
</div>
</body>
</html>"""
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def generate_css(use_opendyslexic: bool) -> str:
    """
    Generate the page stylesheet from the typography/colour constants.

    The result is assembled from four pieces: optional @font-face rules,
    per-level heading rules, the fixed base rules, and optional overrides
    that are only emitted together with the OpenDyslexic font.

    Args:
        use_opendyslexic: If True (and OPENDYSLEXIC_BASE64 is non-empty),
            include @font-face rules for OpenDyslexic, put it first in the
            font stack, and add the emphasis/placeholder/table overrides

    Returns:
        CSS string for inline <style> block
    """
    # Font family - conditional on OpenDyslexic.
    # The font is only enabled when the embedded regular-weight data is
    # non-empty; otherwise the plain FONT_STACK is used.
    if use_opendyslexic and OPENDYSLEXIC_BASE64:
        font_family = "'OpenDyslexic', " + FONT_STACK
        # Two faces under the same family name: regular and bold weights,
        # both embedded as base64 data URLs.
        # NOTE(review): OPENDYSLEXIC_BOLD_BASE64 is not checked for emptiness
        # here - presumably it is always populated alongside the regular font.
        font_face = f"""
@font-face {{
font-family: 'OpenDyslexic';
src: url(data:font/woff2;base64,{OPENDYSLEXIC_BASE64}) format('woff2');
font-weight: normal;
font-style: normal;
}}

@font-face {{
font-family: 'OpenDyslexic';
src: url(data:font/woff2;base64,{OPENDYSLEXIC_BOLD_BASE64}) format('woff2');
font-weight: bold;
font-style: normal;
}}
"""
    else:
        font_family = FONT_STACK
        font_face = ""

    # Restyle <em> as bold (not italic) for dyslexic readers (BDA guidance).
    # Same condition as the font block above. Besides <em>, this block also
    # removes the italic from .placeholder and enlarges table cell padding
    # and line height. It is inserted after the base rules below, so it
    # overrides them.
    em_restyle = ""
    if use_opendyslexic and OPENDYSLEXIC_BASE64:
        em_restyle = """
em {
font-style: normal;
font-weight: bold;
}

.placeholder {
font-style: normal;
}

.decant-table th,
.decant-table td {
padding: 0.6em 0.85em;
line-height: 1.6;
}
"""

    # Generate heading styles: one rule per level h1..h6, each sized as
    # BODY_FONT_SIZE scaled by that level's entry in HEADING_MULTIPLIERS.
    heading_styles = ""
    for level in range(1, 7):
        multiplier = HEADING_MULTIPLIERS[level]
        heading_styles += f"""
h{level} {{
font-size: calc({BODY_FONT_SIZE} * {multiplier});
margin-top: {HEADING_MARGIN_TOP};
margin-bottom: {HEADING_MARGIN_BOTTOM};
font-weight: bold;
}}
"""

    # Main stylesheet. Doubled braces are literal CSS braces (f-string
    # escaping); single-brace fields are filled from the constants and the
    # pieces built above. Order: font faces, base rules, heading rules,
    # element rules, OpenDyslexic overrides, then print-only rules.
    css = f"""{font_face}
* {{
box-sizing: border-box;
}}

body {{
font-family: {font_family};
font-size: {BODY_FONT_SIZE};
line-height: {LINE_HEIGHT};
letter-spacing: {LETTER_SPACING};
word-spacing: {WORD_SPACING};
background-color: {BACKGROUND_COLOR};
color: {TEXT_COLOR};
margin: 0;
padding: 0;
}}

.container {{
max-width: {MAX_LINE_WIDTH};
margin: 0 auto;
padding: {CONTAINER_PADDING};
}}

{heading_styles}

p {{
margin: 0 0 {PARAGRAPH_SPACING} 0;
text-align: left;
}}

ul, ol {{
margin: 0 0 {PARAGRAPH_SPACING} 0;
padding-left: 2em;
}}

li {{
margin-bottom: {LIST_ITEM_SPACING};
}}

blockquote {{
margin: 0 0 {PARAGRAPH_SPACING} 2em;
padding-left: 1em;
border-left: 3px solid {TEXT_COLOR};
}}

pre {{
background-color: #f5f5f5;
padding: 1em;
margin: 0 0 {PARAGRAPH_SPACING} 0;
overflow-x: auto;
font-family: 'Courier New', monospace;
}}

code {{
background-color: #f5f5f5;
padding: 0.2em 0.4em;
font-family: 'Courier New', monospace;
}}

img {{
max-width: 100%;
height: auto;
display: block;
margin: 0 0 {PARAGRAPH_SPACING} 0;
}}

a {{
color: {LINK_COLOR};
text-decoration: underline;
}}

a:hover {{
color: {LINK_HOVER_COLOR};
}}

a:visited {{
color: {LINK_VISITED_COLOR};
}}

figure {{
margin: 1.5em 0;
padding: 0;
}}

figure img {{
display: block;
max-width: 100%;
height: auto;
}}

figcaption {{
font-size: 0.9em;
color: #555;
margin-top: 0.5em;
line-height: 1.4;
}}

.decant-notice {{
background-color: #f0f0e8;
border-left: 3px solid #b0a870;
padding: 0.75em 1em;
margin-bottom: 1.5em;
font-size: 0.9em;
color: #555;
line-height: 1.5;
}}

.placeholder {{
color: #666;
font-style: italic;
}}

.view-original {{
font-style: normal;
margin-left: 0.3em;
}}

.decant-table {{
border-collapse: collapse;
width: 100%;
max-width: 100%;
margin: 1.2em 0;
font-size: 0.95em;
overflow-x: auto;
}}
.decant-table th,
.decant-table td {{
border: 1px solid #ccc;
padding: 0.5em 0.75em;
text-align: left;
vertical-align: top;
}}
.decant-table th {{
background-color: #f5f5f5;
font-weight: bold;
}}
.decant-table tr:nth-child(even) td {{
background-color: #fafafa;
}}

{em_restyle}
@media print {{
body {{
font-size: {PRINT_MIN_FONT_SIZE};
}}
img {{
max-width: 100%;
page-break-inside: avoid;
}}
figcaption {{
color: #333;
}}
.decant-notice {{
border-left-color: #999;
}}
.decant-table {{
font-size: 0.9em;
}}
.decant-table th,
.decant-table td {{
border: 1px solid #999;
}}
.decant-table th {{
background-color: #eee !important;
}}
.decant-table tr:nth-child(even) td {{
background-color: transparent !important;
}}
}}
"""

    return css
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
def render_section(section: Section, source_url: str = "") -> str:
    """
    Produce the HTML for one section: its heading followed by its blocks.

    Args:
        section: Section carrying a heading and a sequence of blocks
        source_url: Optional source URL handed down for placeholder links

    Returns:
        Heading element plus the newline-joined block markup
    """
    heading_model = section.heading
    tag = f"h{heading_model.level}"
    heading_markup = f"<{tag}>{render_inlines(heading_model.inlines)}</{tag}>\n"

    rendered_blocks = [
        render_block(child, source_url=source_url)
        for child in section.blocks
    ]
    return heading_markup + "\n".join(rendered_blocks)
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def render_block(block, source_url: str = "") -> str:
    """
    Dispatch a block model object to the renderer for its type.

    Args:
        block: Block model object
        source_url: Optional source URL; only forwarded to the paragraph
            and quote renderers

    Returns:
        HTML string for the block, or "" for an unrecognised block type
    """
    # (model type, renderer, whether the renderer takes source_url),
    # tested in order; the first isinstance match wins.
    dispatch = (
        (Paragraph, render_paragraph, True),
        (ListBlock, render_list, False),
        (Quote, render_quote, True),
        (Preformatted, render_preformatted, False),
        (Image, render_image, False),
        (Table, render_table, False),
    )
    for model_type, renderer, takes_url in dispatch:
        if isinstance(block, model_type):
            if takes_url:
                return renderer(block, source_url=source_url)
            return renderer(block)

    # Unknown block type - skip
    return ""
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
def render_paragraph(para: Paragraph, source_url: str = "") -> str:
    """
    Produce the <p> element for a paragraph.

    Placeholder paragraphs get the "placeholder" class and a trailing
    "View original" link, but only when a source URL is available.

    Args:
        para: Paragraph with inlines
        source_url: Optional source URL for placeholder links

    Returns:
        HTML <p> element followed by a newline
    """
    # Ordinary paragraph, or a placeholder with nothing to link to.
    if not _is_placeholder(para) or not source_url:
        return "<p>" + render_inlines(para.inlines) + "</p>\n"

    note = html_module.escape(para.inlines[0].text)
    href = html_module.escape(source_url)
    return (
        f'<p class="placeholder">{note} '
        f'<a href="{href}" class="view-original">View original</a></p>\n'
    )
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
def render_list(list_block: ListBlock) -> str:
    """
    Produce the <ol>/<ul> element for a list, recursing into nested lists.

    Args:
        list_block: ListBlock with items; ``ordered`` selects <ol> over <ul>

    Returns:
        HTML <ul> or <ol> element followed by a newline
    """
    tag = "ol" if list_block.ordered else "ul"

    rendered_items = []
    for entry in list_block.items:
        text_html = render_inlines(entry.inlines)
        # Child lists are emitted inside the same <li>, after its text.
        sublists_html = "".join(render_list(sub) for sub in entry.children)
        rendered_items.append(f"<li>{text_html}{sublists_html}</li>\n")

    body = "".join(rendered_items)
    return f"<{tag}>\n{body}</{tag}>\n"
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
def render_quote(quote: Quote, source_url: str = "") -> str:
    """
    Produce the <blockquote> element for a quote; inner blocks go back
    through ``render_block``, so quotes may nest.

    Args:
        quote: Quote containing blocks
        source_url: Optional source URL handed down for placeholder links

    Returns:
        HTML <blockquote> element followed by a newline
    """
    inner = [
        render_block(child, source_url=source_url)
        for child in quote.blocks
    ]
    return "<blockquote>\n" + "\n".join(inner) + "</blockquote>\n"
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
def render_preformatted(pre: Preformatted) -> str:
    """
    Produce the <pre> element for preformatted text.

    Args:
        pre: Preformatted block; its ``text`` is HTML-escaped

    Returns:
        HTML <pre> element followed by a newline
    """
    return "<pre>" + html_module.escape(pre.text) + "</pre>\n"
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
def render_table(table: Table) -> str:
    """Produce a <table class="decant-table"> element with one <tr> per row."""

    def cell_html(cell) -> str:
        # Header cells become <th>, everything else <td>.
        tag = "th" if cell.is_header else "td"
        return f"<{tag}>{render_inlines(cell.inlines)}</{tag}>\n"

    rows_html = "".join(
        "<tr>\n" + "".join(cell_html(cell) for cell in row.cells) + "</tr>\n"
        for row in table.rows
    )
    return '<table class="decant-table">\n' + rows_html + "</table>\n"
|
|
523
|
+
|
|
524
|
+
|
|
525
|
+
def render_image(image: Image) -> str:
    """Produce an <img> tag; a captioned image is wrapped in <figure>/<figcaption>."""
    src = html_module.escape(image.src)
    alt = html_module.escape(image.alt)
    markup = f'<img src="{src}" alt="{alt}">'

    if not image.caption:
        return markup + '\n'

    caption = html_module.escape(image.caption)
    return f'<figure>{markup}\n<figcaption>{caption}</figcaption></figure>\n'
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
def render_inlines(inlines: list) -> str:
    """
    Render a sequence of inline elements and concatenate the results.

    Args:
        inlines: List of Inline model objects

    Returns:
        The inline markup joined with no separator ("" for empty input)
    """
    pieces = [render_inline(node) for node in inlines]
    return "".join(pieces)
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
def render_inline(inline) -> str:
    """
    Dispatch a single inline element to its HTML form.

    Args:
        inline: Inline model object

    Returns:
        HTML string for the element, or "" for an unrecognised inline type
    """
    if isinstance(inline, Text):
        return html_module.escape(inline.text)

    # Emphasis and Strong differ only in the tag wrapped around the children.
    for wrapper_type, tag in ((Emphasis, "em"), (Strong, "strong")):
        if isinstance(inline, wrapper_type):
            return f"<{tag}>{render_inlines(inline.children)}</{tag}>"

    if isinstance(inline, Code):
        return "<code>" + html_module.escape(inline.text) + "</code>"

    if isinstance(inline, Link):
        href = html_module.escape(inline.href)
        return f'<a href="{href}">{render_inlines(inline.children)}</a>'

    # Unknown inline types render as nothing
    return "<br>" if isinstance(inline, LineBreak) else ""
|
decant/core/sanitizer.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HTML sanitization using nh3 library.
|
|
3
|
+
|
|
4
|
+
Security boundary: strips active content, dangerous attributes, and unsafe URLs
|
|
5
|
+
before DOM parsing. See decisions.md section 9 for allowlist specification.
|
|
6
|
+
"""
|
|
7
|
+
import nh3
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Tags that survive sanitization (allowlist from decisions.md section 9)
ALLOWED_TAGS = {
    # Document structure
    "html", "head", "title", "body", "main", "article",
    # Headings h1 through h6
    *(f"h{level}" for level in range(1, 7)),
    # Block-level content
    "p", "ul", "ol", "li", "blockquote", "pre", "code",
    # Inline content
    "em", "i", "strong", "b", "a", "br",
    # Kept so the parser can see them and create placeholders (degradation)
    "table", "tr", "td", "th",
    "img", "figure", "figcaption", "graphic",
    "dl", "dt", "dd", "hr",
    "form", "input", "textarea", "select", "option", "button",
}

# Attributes kept on each tag; attributes not listed here are dropped
ALLOWED_ATTRIBUTES = {
    "a": {"href"},
    **{tag: {"alt", "src"} for tag in ("img", "graphic")},
    **{tag: {"colspan", "rowspan"} for tag in ("td", "th")},
}

# URL schemes permitted in links and sources (blocks javascript:, data:, etc.)
ALLOWED_URL_SCHEMES = {"https", "http"}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def sanitize(html: str) -> str:
    """
    Clean untrusted HTML with nh3, using this module's allowlists.

    Only the tags in ALLOWED_TAGS, the attributes in ALLOWED_ATTRIBUTES and
    URLs with a scheme in ALLOWED_URL_SCHEMES are passed to nh3 as permitted;
    HTML comments are stripped.

    Args:
        html: Raw HTML string (untrusted input)

    Returns:
        Sanitized HTML string as returned by ``nh3.clean``
    """
    clean_options = {
        "tags": ALLOWED_TAGS,
        "attributes": ALLOWED_ATTRIBUTES,
        "url_schemes": ALLOWED_URL_SCHEMES,
        "strip_comments": True,
    }
    return nh3.clean(html, **clean_options)
|
decant/io/__init__.py
ADDED
|
File without changes
|
decant/io/reader.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HTML input reader.
|
|
3
|
+
|
|
4
|
+
Handles file and stdin input for CLI.
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def read_html(input_path: str | None) -> str:
|
|
10
|
+
"""
|
|
11
|
+
Read HTML from file or stdin.
|
|
12
|
+
|
|
13
|
+
Args:
|
|
14
|
+
input_path: Path to HTML file, or None for stdin
|
|
15
|
+
|
|
16
|
+
Returns:
|
|
17
|
+
HTML string
|
|
18
|
+
|
|
19
|
+
Raises:
|
|
20
|
+
FileNotFoundError: If input file doesn't exist
|
|
21
|
+
IOError: If read fails
|
|
22
|
+
"""
|
|
23
|
+
if input_path:
|
|
24
|
+
# Read from file
|
|
25
|
+
with open(input_path, 'r', encoding='utf-8') as f:
|
|
26
|
+
return f.read()
|
|
27
|
+
else:
|
|
28
|
+
# Read from stdin
|
|
29
|
+
if sys.stdin.isatty():
|
|
30
|
+
raise IOError("No input provided (stdin is a TTY)")
|
|
31
|
+
return sys.stdin.read()
|
decant/io/writer.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HTML output writer.
|
|
3
|
+
|
|
4
|
+
Handles file and stdout output for CLI.
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def write_html(html: str, output_path: str | None) -> None:
|
|
10
|
+
"""
|
|
11
|
+
Write HTML to file or stdout.
|
|
12
|
+
|
|
13
|
+
Args:
|
|
14
|
+
html: HTML string to write
|
|
15
|
+
output_path: Path to output file, or None for stdout
|
|
16
|
+
|
|
17
|
+
Raises:
|
|
18
|
+
IOError: If write fails
|
|
19
|
+
"""
|
|
20
|
+
if output_path:
|
|
21
|
+
# Write to file
|
|
22
|
+
with open(output_path, 'w', encoding='utf-8') as f:
|
|
23
|
+
f.write(html)
|
|
24
|
+
else:
|
|
25
|
+
# Write to stdout
|
|
26
|
+
sys.stdout.write(html)
|