ima-claude 2.9.0 → 2.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,663 @@
1
+ """
2
+ Generate a styled IMA branded document PDF directly from a Word doc.
3
+
4
+ Usage:
5
+ python3 generate_pdf.py <path_to_docx> [--out <output.pdf>]
6
+
7
+ Required: pip install reportlab python-docx
8
+ """
9
+
10
+ import sys
11
+ import io
12
+ import re
13
+ import base64
14
+ import tempfile
15
+ import urllib.request
16
+ from pathlib import Path
17
+
18
+ sys.path.insert(0, str(Path(__file__).parent))
19
+ from extract_docx import extract_document
20
+
21
+ try:
22
+ from reportlab.lib.pagesizes import LETTER
23
+ from reportlab.lib.units import inch
24
+ from reportlab.lib.colors import HexColor, white, black
25
+ from reportlab.lib.styles import ParagraphStyle
26
+ from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_JUSTIFY
27
+ from reportlab.platypus import (
28
+ BaseDocTemplate, PageTemplate, Frame, Paragraph, Spacer,
29
+ ListFlowable, ListItem, HRFlowable, PageBreak, NextPageTemplate,
30
+ KeepTogether, Image
31
+ )
32
+ from reportlab.platypus.flowables import Flowable
33
+ from reportlab.pdfgen import canvas as pdfcanvas
34
+ from reportlab.pdfbase import pdfmetrics
35
+ from reportlab.pdfbase.ttfonts import TTFont
36
+ except ImportError:
37
+ print("ERROR: reportlab not installed. Run:")
38
+ print(" pip install reportlab")
39
+ sys.exit(1)
40
+
41
+
42
# ── Register Lato fonts ──────────────────────────────────────────────────────
# Directory holding the Lato TTF files: a "fonts" folder one level above the
# directory that contains this script.
FONT_DIR = Path(__file__).parent.parent / "fonts"

# The four Lato faces this script needs (regular, bold, italic, bold-italic).
LATO_FONTS = (
    "Lato-Regular.ttf",
    "Lato-Bold.ttf",
    "Lato-Italic.ttf",
    "Lato-BoldItalic.ttf",
)

# Base URL the missing font files are downloaded from (file name is appended).
LATO_BASE_URL = "https://github.com/google/fonts/raw/main/ofl/lato/"
46
+
47
+
48
def ensure_fonts():
    """Download any Lato font files that are missing from FONT_DIR.

    Each name in LATO_FONTS that does not yet exist under FONT_DIR is fetched
    from LATO_BASE_URL + name. Nothing is printed or created when every file
    is already present.

    The download goes to a sibling "<name>.part" file first and is renamed
    into place only once it has completed, so an interrupted download can
    never leave a truncated font that later runs would mistake for a valid
    one.

    Raises:
        Whatever urllib.request.urlretrieve or the filesystem calls raise
        (for example urllib.error.URLError when the network is unavailable);
        errors are propagated unchanged after the partial file is removed.
    """
    missing = [name for name in LATO_FONTS if not (FONT_DIR / name).exists()]
    if not missing:
        return

    print("Downloading Lato fonts...")
    FONT_DIR.mkdir(parents=True, exist_ok=True)

    for font_file in missing:
        dest = FONT_DIR / font_file
        partial = dest.with_name(dest.name + ".part")
        try:
            urllib.request.urlretrieve(LATO_BASE_URL + font_file, partial)
            partial.replace(dest)
        finally:
            # After a successful rename the partial file no longer exists;
            # after a failure this removes the incomplete download.
            if partial.exists():
                partial.unlink()
        print(f" {font_file}")
63
+
64
+
65
def register_fonts():
    """Register the Lato faces and the "Lato" font family with reportlab.

    Calls ensure_fonts() first, then registers every face whose TTF file
    exists under FONT_DIR; a missing file only produces a warning on stdout.
    The family registration is what lets <b> and <i> markup inside
    Paragraphs switch to the bold / italic faces.
    """
    ensure_fonts()

    face_files = (
        ("Lato", "Lato-Regular.ttf"),
        ("Lato-Bold", "Lato-Bold.ttf"),
        ("Lato-Italic", "Lato-Italic.ttf"),
        ("Lato-BoldItalic", "Lato-BoldItalic.ttf"),
    )
    for face_name, ttf_name in face_files:
        ttf_path = FONT_DIR / ttf_name
        if not ttf_path.exists():
            print(f"WARNING: Font not found: {ttf_path}")
            continue
        pdfmetrics.registerFont(TTFont(face_name, str(ttf_path)))

    # Register font family so <b> and <i> markup works in Paragraphs
    pdfmetrics.registerFontFamily(
        "Lato",
        normal="Lato",
        bold="Lato-Bold",
        italic="Lato-Italic",
        boldItalic="Lato-BoldItalic",
    )
90
+
91
+
92
# ── Brand colours (IMA Brand Book v4.0) ──────────────────────────────────────
NAVY = HexColor("#00066F")        # Trustworthy Indigo
GOLD = HexColor("#FFCC00")        # Vital Gold
BODY_TEXT = HexColor("#000000")   # body copy
GREY_LIGHT = HexColor("#CCCCCC")  # thin rules (footer, Q&A separator)
GREY_TEXT = HexColor("#666666")   # footer text

# Page geometry: US Letter with a half-inch margin on every side.
PAGE_W, PAGE_H = LETTER  # 612 × 792 pt
MARGIN = 0.5 * inch
101
+
102
+
103
+ # ── Styles (confirmed from Canva) ────────────────────────────────────────────
104
def build_styles():
    """Build every ParagraphStyle used by the document.

    Returns:
        dict mapping style name -> ParagraphStyle. The names are the keys the
        rest of this script looks up (cover_*, intro_heading, section_heading,
        sub_heading, body, bullet, warning, qa_*, ref_heading, reference,
        caption, disclaimer, content_disclaimer, footer).
    """
    S = {}

    def s(name, **kw):
        # Register one style under its own name.
        S[name] = ParagraphStyle(name=name, **kw)

    # ── Cover ────────────────────────────────────────────────────────────
    s("cover_title_1",
      fontName="Lato-Bold", fontSize=90.5, leading=95,
      textColor=white, alignment=TA_CENTER, spaceAfter=8)

    s("cover_title_2",
      fontName="Lato-Bold", fontSize=67, leading=72,
      textColor=white, alignment=TA_CENTER, spaceAfter=8)

    s("cover_subtitle",
      fontName="Lato-Bold", fontSize=24, leading=30,
      textColor=white, alignment=TA_CENTER, spaceAfter=6)

    s("cover_authors",
      fontName="Lato", fontSize=20, leading=26,
      textColor=white, alignment=TA_CENTER, spaceAfter=4)

    s("cover_disclaimer",
      fontName="Lato", fontSize=12, leading=16,
      textColor=white, alignment=TA_CENTER, spaceAfter=3)

    s("cover_date",
      fontName="Lato-Italic", fontSize=12, leading=16,
      textColor=white, alignment=TA_CENTER)

    # ── Content headings ─────────────────────────────────────────────────
    s("intro_heading",
      fontName="Lato-Bold", fontSize=15, leading=20,
      textColor=NAVY, alignment=TA_CENTER,
      spaceBefore=6, spaceAfter=10)

    s("section_heading",
      fontName="Lato-Bold", fontSize=15, leading=20,
      textColor=NAVY, spaceBefore=16, spaceAfter=5)

    s("sub_heading",
      fontName="Lato-Bold", fontSize=13, leading=17,
      textColor=NAVY, spaceBefore=10, spaceAfter=3)

    # ── Body text ────────────────────────────────────────────────────────
    s("body",
      fontName="Lato", fontSize=12, leading=14.5,
      textColor=BODY_TEXT, spaceBefore=0, spaceAfter=6,
      alignment=TA_JUSTIFY)

    s("bullet",
      fontName="Lato", fontSize=12, leading=14.5,
      textColor=BODY_TEXT, spaceBefore=1, spaceAfter=1)

    # ── Warning box — navy bg, white + Vital Gold text ───────────────────
    # The padding attribute is spelled "borderPadding"; ParagraphStyle accepts
    # unknown keywords silently, so the earlier "borderPad" spelling had no
    # effect and the navy box hugged the text.
    s("warning",
      fontName="Lato-Bold", fontSize=12, leading=16,
      textColor=white, alignment=TA_CENTER,
      backColor=NAVY, borderColor=NAVY,
      borderWidth=0, borderPadding=10,
      spaceBefore=10, spaceAfter=10)

    # ── Q&A ──────────────────────────────────────────────────────────────
    s("qa_question",
      fontName="Lato-Bold", fontSize=12, leading=16,
      textColor=NAVY, spaceBefore=10, spaceAfter=2)

    s("qa_answer",
      fontName="Lato", fontSize=12, leading=14.5,
      textColor=BODY_TEXT, spaceBefore=2, spaceAfter=4,
      alignment=TA_JUSTIFY)

    # ── References ───────────────────────────────────────────────────────
    s("ref_heading",
      fontName="Lato-Bold", fontSize=13, leading=17,
      textColor=NAVY, spaceBefore=10, spaceAfter=8)

    # Negative first-line indent gives each reference a hanging indent.
    s("reference",
      fontName="Lato", fontSize=8, leading=11,
      textColor=HexColor("#333333"), spaceBefore=1, spaceAfter=1,
      leftIndent=14, firstLineIndent=-14)

    # ── Captions ─────────────────────────────────────────────────────────
    s("caption",
      fontName="Lato", fontSize=12, leading=14.5,
      textColor=BODY_TEXT, spaceBefore=4, spaceAfter=4)

    s("disclaimer",
      fontName="Lato", fontSize=12, leading=16,
      textColor=white, alignment=TA_CENTER,
      spaceBefore=4, spaceAfter=4)

    s("content_disclaimer",
      fontName="Lato", fontSize=12, leading=16,
      textColor=BODY_TEXT, alignment=TA_JUSTIFY,
      spaceBefore=4, spaceAfter=6)

    # ── Footer ───────────────────────────────────────────────────────────
    s("footer",
      fontName="Lato", fontSize=10, leading=12,
      textColor=GREY_TEXT, alignment=TA_CENTER)

    return S
208
+
209
+
210
+ # ── Page canvas callbacks ─────────────────────────────────────────────────────
211
def make_cover_canvas(footer_h):
    """Return the onPage callback for the cover page template.

    The callback fills the whole page with NAVY. ``footer_h`` is accepted but
    not used by the callback.
    """
    def on_cover(canv, doc):
        # Paint the background inside a saved graphics state so the fill
        # colour does not leak into the flowables drawn afterwards.
        canv.saveState()
        canv.setFillColor(NAVY)
        canv.rect(0, 0, PAGE_W, PAGE_H, fill=1, stroke=0)
        canv.restoreState()

    return on_cover
219
+
220
+
221
def make_content_canvas(title_short, date_str, footer_h):
    """Return the onPage callback that draws the footer on content pages.

    The footer is a thin grey rule spanning the page between the margins and,
    below it, one centred line of text: ``title_short`` alone, or
    ``"title_short (date)"`` when ``date_str`` is non-empty (any "Updated "
    in the date is removed first).
    """
    footer_text = title_short
    if date_str:
        clean_date = date_str.replace("Updated ", "").strip()
        footer_text = f"{title_short} ({clean_date})"

    rule_y = footer_h + 8   # footer rule sits just above the text line
    text_y = footer_h - 2   # baseline of the footer text

    def on_content(canv, doc):
        canv.saveState()
        # Bottom footer rule
        canv.setStrokeColor(GREY_LIGHT)
        canv.setLineWidth(0.5)
        canv.line(MARGIN, rule_y, PAGE_W - MARGIN, rule_y)
        # Single centered footer line — Lato 10pt
        canv.setFont("Lato", 10)
        canv.setFillColor(GREY_TEXT)
        canv.drawCentredString(PAGE_W / 2, text_y, footer_text)
        canv.restoreState()

    return on_content
240
+
241
+
242
+ # ── Markup helpers ────────────────────────────────────────────────────────────
243
def safe(text):
    """Escape ``&``, ``<`` and ``>`` so text can be embedded in Paragraph markup."""
    # A single translate pass is equivalent to replacing "&" first and then
    # the angle brackets: the entities it inserts are never re-escaped.
    entity_table = str.maketrans({"&": "&amp;", "<": "&lt;", ">": "&gt;"})
    return text.translate(entity_table)
247
+
248
+
249
def runs_to_markup(runs):
    """Convert a list of run dicts into Paragraph markup.

    Each run's "text" is escaped with safe(); a truthy "italic" wraps it in
    <i>…</i> and a truthy "bold" wraps it in <b>…</b> (bold outermost when
    both are set). Returns "" for an empty or missing run list.
    """
    if not runs:
        return ""

    pieces = []
    for run in runs:
        fragment = safe(run.get("text", ""))
        if run.get("italic"):
            fragment = f"<i>{fragment}</i>"
        if run.get("bold"):
            fragment = f"<b>{fragment}</b>"
        pieces.append(fragment)
    return "".join(pieces)
264
+
265
+
266
def para_markup(entry):
    """Return markup for an entry: its formatted runs, else its escaped plain text."""
    # entry["text"] is only consulted when the runs produce no markup at all.
    return runs_to_markup(entry.get("runs", [])) or safe(entry["text"])
269
+
270
+
271
def warning_markup(text):
    """Return the markup used inside the warning box.

    The design calls for white text with Vital Gold emphasis, but for now the
    whole text is rendered in the style's single colour: this only escapes
    the text. Gold portions need manual Word markup.
    """
    escaped = safe(text)
    return escaped
276
+
277
+
278
+ # ── Extract cover metadata ────────────────────────────────────────────────────
279
def extract_cover_meta(sections):
    """Collect cover-page metadata from the start of the document.

    Scans at most the first 30 entries. The first "h1"/"heading_bold" entry
    is the guide title; scanning stops at the next such heading, or at the
    first "body" entry after the title, so section headings like
    'Introduction' don't bleed onto the cover.

    Returns:
        (title, authors, disclaimer, date_str) — all strings, "" when absent.
        ``authors`` is the escaped author lines joined with "<br/>"; the
        other three values are the stripped text, not escaped.
    """
    title = ""
    disclaimer = ""
    date_str = ""
    author_names = []
    title_seen = False

    for entry in sections[:30]:
        kind = entry["type"]
        text = entry["text"].strip()

        if kind in ("h1", "heading_bold"):
            if title_seen:
                break
            title = text
            title_seen = True
        elif kind == "author":
            author_names.append(text)
        elif kind == "disclaimer":
            disclaimer = text
        elif kind == "date":
            date_str = text
        elif kind == "body" and title_seen:
            break

    # Joining an empty list already yields "".
    authors = "<br/>".join(safe(name) for name in author_names)

    return title, authors, disclaimer, date_str
315
+
316
+
317
+ # ── Image extraction ─────────────────────────────────────────────────────────
318
def extract_images_from_docx(docx_path):
    """Extract embedded images from a DOCX into temporary files.

    Args:
        docx_path: path of the Word document to read.

    Returns:
        dict mapping paragraph index (position in ``doc.paragraphs``) to a
        list of temp-file paths, one per image found in that paragraph, in
        document order. The files are created with ``delete=False``, so the
        caller is responsible for removing them once they are no longer
        needed.
    """
    from docx import Document
    from docx.oxml.ns import qn

    doc = Document(docx_path)

    # rId -> (blob, content_type) for every image stored inside the package.
    # Linked (external) images have no target part to read bytes from, and
    # asking for one raises, so they are skipped.
    images = {
        rel.rId: (rel.target_part.blob, rel.target_part.content_type)
        for rel in doc.part.rels.values()
        if "image" in rel.reltype and not rel.is_external
    }

    result = {}
    for para_idx, para in enumerate(doc.paragraphs):
        for run in para.runs:
            for drawing in run._element.findall(qn("w:drawing")):
                for blip in drawing.findall(".//" + qn("a:blip")):
                    embed = blip.get(qn("r:embed"))
                    if not embed or embed not in images:
                        continue
                    blob, content_type = images[embed]
                    # Anything that is not PNG gets a .jpg suffix.
                    ext = ".png" if "png" in content_type else ".jpg"
                    # The context manager closes the handle even when the
                    # write fails.
                    with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tf:
                        tf.write(blob)
                    result.setdefault(para_idx, []).append(tf.name)

    return result
355
+
356
+
357
+ # ── Build story helpers ───────────────────────────────────────────────────────
358
def group_consecutive_bullets(sections):
    """Collapse each run of adjacent bullet entries into one group.

    Returns a new list in which every maximal run of {'type': 'bullet'}
    entries is replaced by a single {'type': 'bullet_group', 'items': [...]}
    dict holding those entries in order; all other entries are passed
    through unchanged.
    """
    grouped = []
    open_items = None  # items list of the bullet group currently being filled

    for entry in sections:
        if entry["type"] != "bullet":
            # Any non-bullet entry ends the current run.
            open_items = None
            grouped.append(entry)
            continue

        if open_items is None:
            open_items = []
            grouped.append({"type": "bullet_group", "items": open_items})
        open_items.append(entry)

    return grouped
377
+
378
+
379
def block_to_flowables(block, styles, intro_added, heading_count):
    """Translate one block dict into ReportLab flowables.

    Returns (flowables, intro_added, heading_count). The two state values
    are passed in and handed back so the caller can thread them through
    successive calls:

    * ``heading_count`` counts "heading_bold" blocks; the first one produces
      no flowables.
    * ``intro_added`` becomes True once an "Introduction" heading
      ("heading_bold" or "h2") has been rendered with the centred intro
      style; later ones use the normal section heading style.
    """
    kind = block["type"]

    # These types produce no flowables here.
    if kind in ("h1", "date", "author"):
        return [], intro_added, heading_count

    if kind in ("heading_bold", "h2"):
        if kind == "heading_bold":
            heading_count += 1
            if heading_count == 1:
                return [], intro_added, heading_count
        if not intro_added and block["text"].strip().lower() == "introduction":
            intro = [
                Spacer(1, 0.15 * inch),
                Paragraph("Introduction", styles["intro_heading"]),
                Spacer(1, 0.05 * inch),
            ]
            return intro, True, heading_count
        heading = Paragraph(para_markup(block), styles["section_heading"])
        return [heading], intro_added, heading_count

    if kind == "ref_heading":
        flowables = [
            PageBreak(),
            Paragraph("References", styles["ref_heading"]),
            HRFlowable(width="100%", color=NAVY, thickness=1.5, spaceAfter=8),
        ]
        return flowables, intro_added, heading_count

    if kind == "bullet_group":
        list_items = []
        for entry in block["items"]:
            list_items.append(
                ListItem(
                    Paragraph(para_markup(entry), styles["bullet"]),
                    bulletColor=NAVY, bulletFontSize=10, leftIndent=18,
                )
            )
        bullet_list = ListFlowable(
            list_items, bulletType="bullet",
            bulletFontName="Lato", bulletFontSize=10,
            leftIndent=18, bulletOffsetY=-1,
            spaceBefore=4, spaceAfter=6,
        )
        return [bullet_list], intro_added, heading_count

    # Every remaining type renders as exactly one Paragraph; only the markup
    # source and the style differ.
    if kind == "h3":
        markup, style_name = para_markup(block), "sub_heading"
    elif kind == "warning":
        markup, style_name = warning_markup(block["text"]), "warning"
    elif kind == "disclaimer":
        markup, style_name = safe(block["text"]), "content_disclaimer"
    elif kind in ("figure_caption", "table_caption"):
        markup, style_name = safe(block["text"]), "caption"
    else:
        # "body" and any unrecognised type
        markup, style_name = para_markup(block), "body"

    return [Paragraph(markup, styles[style_name])], intro_added, heading_count
460
+
461
+
462
+ # ── Build story ───────────────────────────────────────────────────────────────
463
def _image_flowables(paths, what):
    """Return [Spacer, Image, Spacer] flowables for each image file in ``paths``.

    Every image is scaled proportionally to fit inside the content width
    (minus half an inch) and the page height minus three inches. An image
    that cannot be loaded is skipped with a warning that names ``what``.
    """
    max_w = PAGE_W - 2 * MARGIN - 0.5 * inch
    max_h = PAGE_H - 3 * inch
    flowables = []
    for img_path in paths:
        try:
            # kind="proportional" scales to fit the box while keeping the
            # aspect ratio. Passing only a width keeps the native height and
            # stretches the picture.
            img = Image(img_path, width=max_w, height=max_h, kind="proportional")
            # Image loads lazily; this forces the file to be read now so a
            # broken image is reported here rather than during doc.build().
            img._restrictSize(max_w, max_h)
        except Exception as e:
            # Best effort: one unreadable image must not abort the document.
            print(f" Warning: Could not embed {what}: {e}")
            continue
        flowables.extend([Spacer(1, 6), img, Spacer(1, 6)])
    return flowables


def build_story(data, styles, title_short, date_str, image_positions=None):
    """Assemble the complete list of flowables for the PDF.

    Args:
        data: extracted document dict; reads "sections" and, when present,
            "qa_pairs" and "references".
        styles: dict of ParagraphStyles as returned by build_styles().
        title_short, date_str: accepted for interface compatibility; not
            used here (the footer callback receives them separately).
        image_positions: optional dict of paragraph index -> list of image
            file paths; images are inserted after the block whose "index"
            matches, and any left over are appended after the content.

    Returns:
        list of flowables: cover page, content, Q&A, then references.
    """
    story = []
    if image_positions is None:
        image_positions = {}

    # ── Cover page (all on navy background) ──────────────────────────────
    title, authors, disclaimer, cover_date = extract_cover_meta(
        data["sections"]
    )

    story.append(Spacer(1, 1.2 * inch))
    if title:
        # Split title visually: first half as hero text, second half as subtitle.
        # This is a placeholder cover — ima-cover-creator replaces it for final output.
        words = title.split()
        mid = max(1, len(words) // 2)
        line1 = " ".join(words[:mid]).upper()
        line2 = " ".join(words[mid:]).upper() if len(words) > mid else ""

        story.append(Paragraph(safe(line1), styles["cover_title_1"]))
        if line2:
            story.append(Spacer(1, 0.2 * inch))
            story.append(Paragraph(safe(line2), styles["cover_title_2"]))

    story.append(Spacer(1, 0.5 * inch))
    if authors:
        # authors is already escaped markup (lines joined with <br/>).
        story.append(Paragraph(authors, styles["cover_authors"]))
    if disclaimer:
        story.append(Spacer(1, 0.4 * inch))
        story.append(Paragraph(safe(disclaimer), styles["cover_disclaimer"]))
    if cover_date:
        story.append(Spacer(1, 0.15 * inch))
        story.append(Paragraph(safe(cover_date), styles["cover_date"]))

    story.append(NextPageTemplate("content"))
    story.append(PageBreak())

    # ── Content — pipeline: group bullets → map blocks → flatten ─────────
    grouped = group_consecutive_bullets(data["sections"])
    intro_added = False
    heading_count = 0
    rendered_images = set()

    def place_images(para_idx, what):
        # Insert the images recorded for this paragraph, at most once.
        if para_idx in image_positions and para_idx not in rendered_images:
            story.extend(_image_flowables(image_positions[para_idx], what))
            rendered_images.add(para_idx)

    for block in grouped:
        flowables, intro_added, heading_count = block_to_flowables(
            block, styles, intro_added, heading_count
        )
        story.extend(flowables)

        # Insert images at this paragraph position
        para_idx = block.get("index", -1)
        place_images(para_idx, f"image at para {para_idx}")

        # A bullet group has no index of its own; check each of its items.
        if block.get("type") == "bullet_group":
            for item in block.get("items", []):
                place_images(item.get("index", -1), "image")

    # Insert any remaining images not yet rendered (e.g. image-only paragraphs)
    for para_idx in image_positions:
        place_images(para_idx, "orphan image")

    # ── Q&A ───────────────────────────────────────────────────────────────
    if data.get("qa_pairs"):
        story.append(Spacer(1, 0.1 * inch))
        story.append(HRFlowable(width="100%", color=GREY_LIGHT,
                                thickness=0.5, spaceAfter=8))
        for qa in data["qa_pairs"]:
            qa_block = [Paragraph(para_markup(qa["question"]), styles["qa_question"])]
            for ans in qa["answer_parts"]:
                qa_block.append(Paragraph(para_markup(ans), styles["qa_answer"]))
            # Keep each question with its answer on the same page.
            story.append(KeepTogether(qa_block))

    # ── References ────────────────────────────────────────────────────────
    if data.get("references"):
        story.append(PageBreak())
        story.append(Paragraph("References", styles["ref_heading"]))
        story.append(HRFlowable(width="100%", color=NAVY, thickness=1.5,
                                spaceAfter=8))
        for ref in data["references"]:
            story.append(Paragraph(safe(ref["text"]), styles["reference"]))

    return story
583
+
584
+
585
+ # ── Main ──────────────────────────────────────────────────────────────────────
586
def generate_pdf(docx_path, out_path):
    """Render the Word document at ``docx_path`` as a branded PDF at ``out_path``.

    Steps: extract the document structure and its embedded images, register
    fonts and styles, set up the cover and content page templates, build the
    story and write the PDF. The temporary image files produced by
    extract_images_from_docx() are removed afterwards, whether or not the
    build succeeded.
    """
    print(f"Extracting: {docx_path}")
    data = extract_document(docx_path)

    # Extract images from DOCX
    print("Extracting images...")
    image_positions = extract_images_from_docx(docx_path)

    try:
        print(f" Found images at {len(image_positions)} paragraph positions")

        register_fonts()
        styles = build_styles()

        title, _authors, _disclaimer, date_str = extract_cover_meta(
            data["sections"]
        )
        # Keep the footer title to a single line.
        title_short = (title[:60] + "...") if len(title) > 60 else title

        footer_h = 0.4 * inch

        doc = BaseDocTemplate(
            str(out_path),
            pagesize=LETTER,
            leftMargin=MARGIN, rightMargin=MARGIN,
            topMargin=MARGIN, bottomMargin=MARGIN,
        )

        cover_frame = Frame(
            MARGIN, footer_h + 0.3 * inch,
            PAGE_W - 2 * MARGIN,
            PAGE_H - 2 * MARGIN - footer_h,
            id="cover_frame"
        )
        content_frame = Frame(
            MARGIN, footer_h + 0.4 * inch,
            PAGE_W - 2 * MARGIN,
            PAGE_H - 2 * MARGIN - footer_h,
            id="content_frame"
        )

        doc.addPageTemplates([
            PageTemplate(
                id="cover",
                frames=[cover_frame],
                onPage=make_cover_canvas(footer_h)
            ),
            PageTemplate(
                id="content",
                frames=[content_frame],
                onPage=make_content_canvas(title_short, date_str, footer_h)
            ),
        ])

        story = build_story(data, styles, title_short, date_str, image_positions)

        print("Building PDF...")
        doc.build(story)
    finally:
        # The extracted images are temp files created with delete=False;
        # nothing else removes them, so clean up here.
        for paths in image_positions.values():
            for tmp_path in paths:
                try:
                    Path(tmp_path).unlink()
                except OSError:
                    # Best-effort cleanup: a leftover temp file must not mask
                    # the real outcome of the build.
                    pass

    print(f"Done → {out_path}")
643
+
644
+
645
+ # ── Entry point ───────────────────────────────────────────────────────────────
646
if __name__ == "__main__":
    # Re-wrap stdout as UTF-8 so the "→" in progress messages cannot raise an
    # encoding error on consoles with a narrower default encoding.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")

    usage = "Usage: python generate_pdf.py <path_to_docx> [--out output.pdf]"

    if len(sys.argv) < 2:
        print(usage)
        sys.exit(1)

    docx_path = Path(sys.argv[1])
    if not docx_path.exists():
        print(f"Error: File not found: {docx_path}")
        sys.exit(1)

    if "--out" in sys.argv:
        out_index = sys.argv.index("--out") + 1
        # "--out" given as the last argument has no value to read.
        if out_index >= len(sys.argv):
            print("Error: --out requires an output path")
            print(usage)
            sys.exit(1)
        out_path = Path(sys.argv[out_index])
    else:
        # Default: same location and name as the input, with a .pdf suffix.
        out_path = docx_path.with_suffix(".pdf")

    generate_pdf(docx_path, out_path)