decant-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,578 @@
1
+ """
2
+ Document model to HTML renderer.
3
+
4
+ Generates self-contained readable HTML with inline CSS.
5
+ Pipeline: Step 5 (final - after model creation)
6
+ See decisions.md section 10 for rendering invariants.
7
+ """
8
+ import html as html_module
9
+
10
+ from decant.core.model import (
11
+ Document, Section, Heading,
12
+ Paragraph, ListBlock, ListItem, Quote, Preformatted, Image, Table,
13
+ Text, Emphasis, Strong, Code, Link, LineBreak
14
+ )
15
+ from decant.core.constants import (
16
+ FONT_STACK, BODY_FONT_SIZE, HEADING_MULTIPLIERS,
17
+ LINE_HEIGHT, LETTER_SPACING, WORD_SPACING,
18
+ PARAGRAPH_SPACING, HEADING_MARGIN_TOP, HEADING_MARGIN_BOTTOM,
19
+ LIST_ITEM_SPACING,
20
+ BACKGROUND_COLOR, TEXT_COLOR, LINK_COLOR, LINK_HOVER_COLOR, LINK_VISITED_COLOR,
21
+ MAX_LINE_WIDTH, CONTAINER_PADDING,
22
+ PRINT_MIN_FONT_SIZE,
23
+ OPENDYSLEXIC_BASE64,
24
+ OPENDYSLEXIC_BOLD_BASE64
25
+ )
26
+
27
+
28
def _is_placeholder(paragraph: Paragraph) -> bool:
    """
    Tell whether a paragraph is a bracketed placeholder such as [Table removed].

    A placeholder is a paragraph holding exactly one ``Text`` inline whose
    text starts with ``[`` and ends with ``]``.

    Args:
        paragraph: Paragraph to inspect

    Returns:
        True for a single bracketed Text inline, False otherwise
    """
    inlines = paragraph.inlines
    if len(inlines) == 1 and isinstance(inlines[0], Text):
        text = inlines[0].text
        return text.startswith("[") and text.endswith("]")
    return False
36
+
37
+
38
+ def _placeholder_type(text: str) -> str:
39
+ """Classify placeholder text into type category."""
40
+ if text.startswith("[Table"):
41
+ return "table"
42
+ if text.startswith("[Image"):
43
+ return "image"
44
+ if text.startswith("[Form"):
45
+ return "form"
46
+ if text == "[-]":
47
+ return "hr"
48
+ return "other"
49
+
50
+
51
def render_notice_banner(document: Document) -> str:
    """
    Generate the notice banner summarising removed content.

    Counts table, image and form placeholders in every section, including
    placeholders nested inside Quote blocks (render_quote renders those as
    placeholders too, so they must be reflected in the summary).
    Placeholders classified as "hr" or "other" are not counted.

    Args:
        document: Document model with sections and optional source_url

    Returns:
        HTML <div class="decant-notice"> element, or "" when the document
        has no table/image/form placeholders
    """
    counts: dict[str, int] = {}

    def tally(blocks) -> None:
        # Quotes are the only block type that nests other blocks.
        for block in blocks:
            if isinstance(block, Quote):
                tally(block.blocks)
            elif isinstance(block, Paragraph) and _is_placeholder(block):
                ptype = _placeholder_type(block.inlines[0].text)
                if ptype not in ("hr", "other"):
                    counts[ptype] = counts.get(ptype, 0) + 1

    for section in document.sections:
        tally(section.blocks)

    # Fixed order keeps the wording stable regardless of document order.
    parts = []
    for ptype in ("table", "image", "form"):
        n = counts.get(ptype, 0)
        if n > 0:
            label = ptype + ("s" if n != 1 else "")
            parts.append(f"{n} {label}")

    if not parts:
        return ""

    # "a", "a and b", or "a, b, and c"
    if len(parts) == 1:
        summary = parts[0]
    elif len(parts) == 2:
        summary = f"{parts[0]} and {parts[1]}"
    else:
        summary = ", ".join(parts[:-1]) + f", and {parts[-1]}"

    if document.source_url:
        escaped_url = html_module.escape(document.source_url)
        suffix = (
            f', or <a href="{escaped_url}">'
            f'view the original page</a> for the full content.'
        )
    else:
        suffix = " for details."

    return (
        f'<div class="decant-notice">'
        f'This document contains {summary} that could not be included. '
        f'Look for the marked notes below{suffix}'
        f'</div>\n'
    )
95
+
96
+
97
def render(document: Document, use_opendyslexic: bool = False) -> str:
    """
    Produce the complete, self-contained HTML page for a Document.

    Args:
        document: Document model providing title, sections and source_url
        use_opendyslexic: Passed to generate_css to request the embedded
            OpenDyslexic font

    Returns:
        Full HTML document string with the stylesheet inlined in <style>
    """
    css = generate_css(use_opendyslexic)

    source_url = document.source_url
    rendered_sections = [
        render_section(section, source_url=source_url)
        for section in document.sections
    ]
    sections_html = "\n".join(rendered_sections)
    banner = render_notice_banner(document)
    title = html_module.escape(document.title)

    # One entry per output line; the banner (when present) already ends
    # with a newline, so it is glued directly in front of the sections.
    page_lines = [
        "<!DOCTYPE html>",
        '<html lang="en">',
        "<head>",
        '<meta charset="utf-8">',
        f"<title>{title}</title>",
        "<style>",
        css,
        "</style>",
        "</head>",
        "<body>",
        '<div class="container">',
        banner + sections_html,
        "</div>",
        "</body>",
        "</html>",
    ]
    return "\n".join(page_lines)
135
+
136
+
137
def generate_css(use_opendyslexic: bool) -> str:
    """
    Generate CSS from constants.

    The OpenDyslexic font is embedded only when it is requested and the
    regular-weight font data is available. The bold face is declared only
    when its own data is available too, so an empty data URL is never
    emitted for it.

    Args:
        use_opendyslexic: If True, include @font-face for OpenDyslexic

    Returns:
        CSS string for inline <style> block
    """
    # Single switch for everything that depends on the embedded font.
    embed_font = bool(use_opendyslexic and OPENDYSLEXIC_BASE64)

    if embed_font:
        font_family = "'OpenDyslexic', " + FONT_STACK
        font_face = f"""
@font-face {{
font-family: 'OpenDyslexic';
src: url(data:font/woff2;base64,{OPENDYSLEXIC_BASE64}) format('woff2');
font-weight: normal;
font-style: normal;
}}
"""
        if OPENDYSLEXIC_BOLD_BASE64:
            font_face += f"""
@font-face {{
font-family: 'OpenDyslexic';
src: url(data:font/woff2;base64,{OPENDYSLEXIC_BOLD_BASE64}) format('woff2');
font-weight: bold;
font-style: normal;
}}
"""
    else:
        font_family = FONT_STACK
        font_face = ""

    # Restyle <em> as bold (not italic) for dyslexic readers (BDA guidance)
    em_restyle = ""
    if embed_font:
        em_restyle = """
em {
font-style: normal;
font-weight: bold;
}

.placeholder {
font-style: normal;
}

.decant-table th,
.decant-table td {
padding: 0.6em 0.85em;
line-height: 1.6;
}
"""

    # One rule per heading level h1..h6, sized relative to the body font.
    heading_styles = "".join(
        f"""
h{level} {{
font-size: calc({BODY_FONT_SIZE} * {HEADING_MULTIPLIERS[level]});
margin-top: {HEADING_MARGIN_TOP};
margin-bottom: {HEADING_MARGIN_BOTTOM};
font-weight: bold;
}}
"""
        for level in range(1, 7)
    )

    css = f"""{font_face}
* {{
box-sizing: border-box;
}}

body {{
font-family: {font_family};
font-size: {BODY_FONT_SIZE};
line-height: {LINE_HEIGHT};
letter-spacing: {LETTER_SPACING};
word-spacing: {WORD_SPACING};
background-color: {BACKGROUND_COLOR};
color: {TEXT_COLOR};
margin: 0;
padding: 0;
}}

.container {{
max-width: {MAX_LINE_WIDTH};
margin: 0 auto;
padding: {CONTAINER_PADDING};
}}

{heading_styles}

p {{
margin: 0 0 {PARAGRAPH_SPACING} 0;
text-align: left;
}}

ul, ol {{
margin: 0 0 {PARAGRAPH_SPACING} 0;
padding-left: 2em;
}}

li {{
margin-bottom: {LIST_ITEM_SPACING};
}}

blockquote {{
margin: 0 0 {PARAGRAPH_SPACING} 2em;
padding-left: 1em;
border-left: 3px solid {TEXT_COLOR};
}}

pre {{
background-color: #f5f5f5;
padding: 1em;
margin: 0 0 {PARAGRAPH_SPACING} 0;
overflow-x: auto;
font-family: 'Courier New', monospace;
}}

code {{
background-color: #f5f5f5;
padding: 0.2em 0.4em;
font-family: 'Courier New', monospace;
}}

img {{
max-width: 100%;
height: auto;
display: block;
margin: 0 0 {PARAGRAPH_SPACING} 0;
}}

a {{
color: {LINK_COLOR};
text-decoration: underline;
}}

a:hover {{
color: {LINK_HOVER_COLOR};
}}

a:visited {{
color: {LINK_VISITED_COLOR};
}}

figure {{
margin: 1.5em 0;
padding: 0;
}}

figure img {{
display: block;
max-width: 100%;
height: auto;
}}

figcaption {{
font-size: 0.9em;
color: #555;
margin-top: 0.5em;
line-height: 1.4;
}}

.decant-notice {{
background-color: #f0f0e8;
border-left: 3px solid #b0a870;
padding: 0.75em 1em;
margin-bottom: 1.5em;
font-size: 0.9em;
color: #555;
line-height: 1.5;
}}

.placeholder {{
color: #666;
font-style: italic;
}}

.view-original {{
font-style: normal;
margin-left: 0.3em;
}}

.decant-table {{
border-collapse: collapse;
width: 100%;
max-width: 100%;
margin: 1.2em 0;
font-size: 0.95em;
overflow-x: auto;
}}
.decant-table th,
.decant-table td {{
border: 1px solid #ccc;
padding: 0.5em 0.75em;
text-align: left;
vertical-align: top;
}}
.decant-table th {{
background-color: #f5f5f5;
font-weight: bold;
}}
.decant-table tr:nth-child(even) td {{
background-color: #fafafa;
}}

{em_restyle}
@media print {{
body {{
font-size: {PRINT_MIN_FONT_SIZE};
}}
img {{
max-width: 100%;
page-break-inside: avoid;
}}
figcaption {{
color: #333;
}}
.decant-notice {{
border-left-color: #999;
}}
.decant-table {{
font-size: 0.9em;
}}
.decant-table th,
.decant-table td {{
border: 1px solid #999;
}}
.decant-table th {{
background-color: #eee !important;
}}
.decant-table tr:nth-child(even) td {{
background-color: transparent !important;
}}
}}
"""

    return css
374
+
375
+
376
def render_section(section: Section, source_url: str = "") -> str:
    """
    Turn a Section into its heading element followed by its blocks.

    Args:
        section: Section with heading and blocks
        source_url: Optional source URL, forwarded to each block for
            placeholder links

    Returns:
        HTML string: <hN> heading line, then the blocks joined by newlines
    """
    heading = section.heading
    tag = f"h{heading.level}"
    title_html = render_inlines(heading.inlines)

    rendered_blocks = [
        render_block(block, source_url=source_url)
        for block in section.blocks
    ]

    return f"<{tag}>{title_html}</{tag}>\n" + "\n".join(rendered_blocks)
399
+
400
+
401
def render_block(block, source_url: str = "") -> str:
    """
    Dispatch a Block to the renderer for its type.

    Args:
        block: Block model object
        source_url: Optional source URL; only paragraphs and quotes use it
            (for placeholder links)

    Returns:
        HTML string for the block, or "" for an unrecognised block type
    """
    # Renderers that need the source URL for placeholder links.
    if isinstance(block, Paragraph):
        return render_paragraph(block, source_url=source_url)
    if isinstance(block, ListBlock):
        return render_list(block)
    if isinstance(block, Quote):
        return render_quote(block, source_url=source_url)

    # Remaining block types take the block alone.
    plain_renderers = (
        (Preformatted, render_preformatted),
        (Image, render_image),
        (Table, render_table),
    )
    for block_type, renderer in plain_renderers:
        if isinstance(block, block_type):
            return renderer(block)

    # Unknown block type - skip
    return ""
427
+
428
+
429
def render_paragraph(para: Paragraph, source_url: str = "") -> str:
    """
    Render Paragraph to HTML.

    Placeholder paragraphs (see _is_placeholder) are always marked with
    class="placeholder", so the "marked notes" the notice banner refers
    to are styled whether or not a source URL is known. A "View original"
    link is appended only when source_url is provided.

    Args:
        para: Paragraph with inlines
        source_url: Optional source URL for placeholder links

    Returns:
        HTML <p> element
    """
    if not _is_placeholder(para):
        content = render_inlines(para.inlines)
        return f"<p>{content}</p>\n"

    escaped_text = html_module.escape(para.inlines[0].text)
    if not source_url:
        # No original page to link to: mark the note without a link.
        return f'<p class="placeholder">{escaped_text}</p>\n'

    escaped_url = html_module.escape(source_url)
    return (
        f'<p class="placeholder">{escaped_text} '
        f'<a href="{escaped_url}" class="view-original">'
        f'View original</a></p>\n'
    )
450
+
451
+
452
def render_list(list_block: ListBlock) -> str:
    """
    Render a ListBlock, recursing into nested lists.

    Args:
        list_block: ListBlock with items; each item has inlines and
            child lists

    Returns:
        HTML <ol> element when list_block.ordered is truthy, else <ul>
    """
    tag = "ol" if list_block.ordered else "ul"

    rendered_items = []
    for item in list_block.items:
        text_html = render_inlines(item.inlines)
        # Child lists go inside the same <li>, after the item text.
        nested_html = "".join(render_list(child) for child in item.children)
        rendered_items.append(f"<li>{text_html}{nested_html}</li>\n")

    items_html = "".join(rendered_items)
    return f"<{tag}>\n{items_html}</{tag}>\n"
477
+
478
+
479
def render_quote(quote: Quote, source_url: str = "") -> str:
    """
    Render a Quote and the blocks it contains.

    Args:
        quote: Quote containing blocks
        source_url: Optional source URL, forwarded to nested blocks for
            placeholder links

    Returns:
        HTML <blockquote> element
    """
    rendered = [
        render_block(child, source_url=source_url)
        for child in quote.blocks
    ]
    inner_html = "\n".join(rendered)
    return "<blockquote>\n" + inner_html + "</blockquote>\n"
495
+
496
+
497
def render_preformatted(pre: Preformatted) -> str:
    """
    Render a Preformatted block with its text HTML-escaped.

    Args:
        pre: Preformatted with text

    Returns:
        HTML <pre> element followed by a newline
    """
    return "<pre>" + html_module.escape(pre.text) + "</pre>\n"
509
+
510
+
511
def render_table(table: Table) -> str:
    """
    Render Table to an HTML table with class "decant-table".

    Args:
        table: Table with rows; each row has cells carrying is_header
            and inlines

    Returns:
        HTML <table> element, using <th> for header cells and <td>
        for all others
    """
    chunks = ['<table class="decant-table">\n']
    for row in table.rows:
        cells_html = "".join(
            "<{0}>{1}</{0}>\n".format(
                "th" if cell.is_header else "td",
                render_inlines(cell.inlines),
            )
            for cell in row.cells
        )
        chunks.append("<tr>\n" + cells_html + "</tr>\n")
    chunks.append("</table>\n")
    return "".join(chunks)
523
+
524
+
525
def render_image(image: Image) -> str:
    """
    Render Image to an <img> tag, wrapped in <figure> when it has a caption.

    Args:
        image: Image with src, alt and caption

    Returns:
        HTML <img> element, or <figure> with <figcaption> if captioned
    """
    src = html_module.escape(image.src)
    alt = html_module.escape(image.alt)
    img_tag = f'<img src="{src}" alt="{alt}">'

    if not image.caption:
        return img_tag + '\n'

    caption = html_module.escape(image.caption)
    return f'<figure>{img_tag}\n<figcaption>{caption}</figcaption></figure>\n'
534
+
535
+
536
def render_inlines(inlines: list) -> str:
    """
    Concatenate the HTML of a sequence of Inline elements.

    Args:
        inlines: List of Inline model objects

    Returns:
        HTML string with the inlines rendered in order, no separator
    """
    fragments = [render_inline(node) for node in inlines]
    return "".join(fragments)
547
+
548
+
549
def render_inline(inline) -> str:
    """
    Dispatch a single Inline element to its HTML form.

    Text and Code content is HTML-escaped, as is a Link's href.
    Emphasis, Strong and Link render their children recursively.

    Args:
        inline: Inline model object

    Returns:
        HTML string for the inline, or "" for an unrecognised type
    """
    if isinstance(inline, Text):
        return html_module.escape(inline.text)
    if isinstance(inline, Emphasis):
        return "<em>" + render_inlines(inline.children) + "</em>"
    if isinstance(inline, Strong):
        return "<strong>" + render_inlines(inline.children) + "</strong>"
    if isinstance(inline, Code):
        return "<code>" + html_module.escape(inline.text) + "</code>"
    if isinstance(inline, Link):
        href = html_module.escape(inline.href)
        body = render_inlines(inline.children)
        return f'<a href="{href}">{body}</a>'
    if isinstance(inline, LineBreak):
        return "<br>"

    # Unknown inline type
    return ""
@@ -0,0 +1,58 @@
1
+ """
2
+ HTML sanitization using nh3 library.
3
+
4
+ Security boundary: strips active content, dangerous attributes, and unsafe URLs
5
+ before DOM parsing. See decisions.md section 9 for allowlist specification.
6
+ """
7
+ import nh3
8
+
9
+
10
# Tag allowlist (from decisions.md section 9)
ALLOWED_TAGS = {
    # Document structure
    "html", "head", "title", "body",
    "main", "article",
    # Headings
    "h1", "h2", "h3",
    "h4", "h5", "h6",
    # Block-level content
    "p", "blockquote", "pre", "code",
    "ul", "ol", "li",
    # Inline content
    "em", "i", "strong", "b",
    "a", "br",
    # Kept only so the parser can degrade them into placeholders
    "table", "tr", "th", "td",
    "img", "graphic", "figure", "figcaption",
    "dl", "dt", "dd", "hr",
    "form", "input", "textarea", "select", "option", "button",
}

# Attribute allowlist, keyed by tag name
ALLOWED_ATTRIBUTES = {
    "a": {"href"},
    "img": {"src", "alt"},
    "graphic": {"src", "alt"},
    "th": {"colspan", "rowspan"},
    "td": {"colspan", "rowspan"},
}

# URL scheme allowlist (blocks javascript:, data:, etc.)
ALLOWED_URL_SCHEMES = {"https", "http"}
37
+
38
+
39
def sanitize(html: str) -> str:
    """
    Clean untrusted HTML with nh3 using the module allowlists.

    Only tags in ALLOWED_TAGS, attributes in ALLOWED_ATTRIBUTES and URLs
    whose scheme is in ALLOWED_URL_SCHEMES are kept; HTML comments are
    stripped.

    Args:
        html: Raw HTML string (untrusted input)

    Returns:
        Sanitized HTML string safe for parsing
    """
    clean_options = {
        "tags": ALLOWED_TAGS,
        "attributes": ALLOWED_ATTRIBUTES,
        "url_schemes": ALLOWED_URL_SCHEMES,
        "strip_comments": True,
    }
    return nh3.clean(html, **clean_options)
decant/io/__init__.py ADDED
File without changes
decant/io/reader.py ADDED
@@ -0,0 +1,31 @@
1
+ """
2
+ HTML input reader.
3
+
4
+ Handles file and stdin input for CLI.
5
+ """
6
+ import sys
7
+
8
+
9
+ def read_html(input_path: str | None) -> str:
10
+ """
11
+ Read HTML from file or stdin.
12
+
13
+ Args:
14
+ input_path: Path to HTML file, or None for stdin
15
+
16
+ Returns:
17
+ HTML string
18
+
19
+ Raises:
20
+ FileNotFoundError: If input file doesn't exist
21
+ IOError: If read fails
22
+ """
23
+ if input_path:
24
+ # Read from file
25
+ with open(input_path, 'r', encoding='utf-8') as f:
26
+ return f.read()
27
+ else:
28
+ # Read from stdin
29
+ if sys.stdin.isatty():
30
+ raise IOError("No input provided (stdin is a TTY)")
31
+ return sys.stdin.read()
decant/io/writer.py ADDED
@@ -0,0 +1,26 @@
1
+ """
2
+ HTML output writer.
3
+
4
+ Handles file and stdout output for CLI.
5
+ """
6
+ import sys
7
+
8
+
9
+ def write_html(html: str, output_path: str | None) -> None:
10
+ """
11
+ Write HTML to file or stdout.
12
+
13
+ Args:
14
+ html: HTML string to write
15
+ output_path: Path to output file, or None for stdout
16
+
17
+ Raises:
18
+ IOError: If write fails
19
+ """
20
+ if output_path:
21
+ # Write to file
22
+ with open(output_path, 'w', encoding='utf-8') as f:
23
+ f.write(html)
24
+ else:
25
+ # Write to stdout
26
+ sys.stdout.write(html)