pdfgen-juanipis 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. pdfgen/__init__.py +17 -0
  2. pdfgen/api.py +69 -0
  3. pdfgen/assets/banner-clean.png +0 -0
  4. pdfgen/assets/banner.png +0 -0
  5. pdfgen/assets/fonts/BCDEEE_Calibri_5.ttf +0 -0
  6. pdfgen/assets/fonts/BCDFEE_CenturyGothic-Bold_9.ttf +0 -0
  7. pdfgen/assets/fonts/BCDGEE_CenturyGothic-Bold_14.ttf +0 -0
  8. pdfgen/assets/fonts/BCDHEE_Calibri-Bold_20.ttf +0 -0
  9. pdfgen/assets/fonts/BCDIEE_Calibri-Bold_25.ttf +0 -0
  10. pdfgen/assets/fonts/BCDJEE_Calibri_27.ttf +0 -0
  11. pdfgen/assets/fonts/BCDKEE_Calibri-Italic_33.ttf +0 -0
  12. pdfgen/assets/fonts/BCDLEE_Calibri-Italic_52.ttf +0 -0
  13. pdfgen/assets/fonts/BCDMEE_SegoeUI_54.ttf +0 -0
  14. pdfgen/assets/fonts/BCDNEE_SegoeUI_60.ttf +0 -0
  15. pdfgen/assets/fonts/BCDOEE_Aptos Narrow,Bold_142.ttf +0 -0
  16. pdfgen/assets/fonts/BCDPEE_Aptos Narrow,Bold_144.ttf +0 -0
  17. pdfgen/assets/fonts/BCEAEE_Aptos Narrow_149.ttf +0 -0
  18. pdfgen/assets/fonts/BCEBEE_Aptos Narrow_154.ttf +0 -0
  19. pdfgen/assets/fonts/TimesNewRomanPS-BoldMT_38.ttf +0 -0
  20. pdfgen/assets/logo.png +0 -0
  21. pdfgen/cli.py +106 -0
  22. pdfgen/pagination.py +1045 -0
  23. pdfgen/render.py +348 -0
  24. pdfgen/schema.json +126 -0
  25. pdfgen/templates/boletin.css +389 -0
  26. pdfgen/templates/boletin_template.html.jinja +129 -0
  27. pdfgen/validator.py +247 -0
  28. pdfgen_juanipis-0.1.3.dist-info/METADATA +170 -0
  29. pdfgen_juanipis-0.1.3.dist-info/RECORD +33 -0
  30. pdfgen_juanipis-0.1.3.dist-info/WHEEL +5 -0
  31. pdfgen_juanipis-0.1.3.dist-info/entry_points.txt +2 -0
  32. pdfgen_juanipis-0.1.3.dist-info/licenses/LICENSE +21 -0
  33. pdfgen_juanipis-0.1.3.dist-info/top_level.txt +1 -0
pdfgen/pagination.py ADDED
@@ -0,0 +1,1045 @@
1
+ import dataclasses
2
+ import logging
3
+ import math
4
+ import os
5
+ import re
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
8
+
9
# WeasyPrint is optional: it is only needed for exact height measurement.
# When the import fails for any reason, HTML/CSS are set to None and
# WEASYPRINT_AVAILABLE is False, which makes the measurer fall back to
# its heuristic estimates.
# NOTE(review): the broad ``except Exception`` presumably also covers
# non-ImportError failures while loading native libraries - confirm.
try:
    from weasyprint import HTML, CSS

    WEASYPRINT_AVAILABLE = True
except Exception:  # pragma: no cover - optional dependency for measurement
    HTML = None
    CSS = None
    WEASYPRINT_AVAILABLE = False

# Module-level logger used for measurement and pagination warnings.
LOGGER = logging.getLogger(__name__)
# Conversion factor from CSS pixels (96 per inch) to points (72 per inch).
CSS_PX_TO_PT = 72.0 / 96.0
20
+
21
+
22
@dataclasses.dataclass(frozen=True)
class LayoutConfig:
    """Immutable page geometry used by the paginator; all values are points.

    The ``*_bottom_pt`` values are added to measured footer heights to work
    out how much space is reserved at the bottom of a page, and the
    ``*_top_pt`` values are compared against measured header/intro bottoms
    to decide where content starts.
    """

    # Page size (612 x 792 pt is US Letter).
    page_width_pt: float = 612.0
    page_height_pt: float = 792.0

    # Horizontal placement and width of the content column.
    content_left_pt: float = 85.1
    content_width_pt: float = 444.0

    # Minimum top offsets for the intro and for the content area.
    default_intro_top_pt: float = 122.18
    default_content_top_pt: float = 150.0
    continuation_content_top_pt: float = 110.0

    # Header title / subtitle placement.
    header_title_top_pt: float = 73.7
    header_subtitle_top_pt: float = 90.77
    header_title_left_pt: float = 94.7
    header_title_width_pt: float = 430.0
    header_subtitle_left_pt: float = 260.8
    header_subtitle_width_pt: float = 220.0

    # Header logo / banner geometry and the lower bound for the title top.
    header_logo_top_pt: float = 13.95
    header_logo_height_pt: float = 36.75
    header_banner_height_pt: float = 79.5
    header_title_min_top_pt: float = 100.0

    # Footer offsets.
    footer_contact_bottom_pt: float = 32.0
    footer_page_bottom_pt: float = 134.0
    footer_meta_bottom_pt: float = 70.0
    footer_meta_gap_pt: float = 6.0

    # Gaps, safety padding and the floor for the usable content height.
    header_gap_pt: float = 6.0
    intro_gap_pt: float = 12.0
    header_subtitle_gap_pt: float = 2.0
    safety_pad_pt: float = 6.0
    min_content_height_pt: float = 48.0

    def to_template(self) -> Dict[str, float]:
        """Return the default content/intro top offsets keyed for a template."""
        return dict(
            content_top=self.default_content_top_pt,
            intro_top=self.default_intro_top_pt,
        )
56
+
57
+
58
# Vertical layout resolved for one kind of page (first or continuation),
# as produced by Paginator._compute_layout_state. All values are points.
@dataclasses.dataclass
class PageLayoutState:
    # Top offset of the intro text.
    intro_top_pt: float
    # Top offset at which the content blocks start.
    content_top_pt: float
    # Usable content height when no refs/notes area is reserved.
    content_height_base_pt: float
    # Usable content height when the refs/notes area is reserved.
    content_height_meta_pt: float
    # Space reserved at the page bottom for contact line / page number.
    reserved_base_pt: float
    # Bottom offset of the refs/notes (footer meta) area.
    footer_meta_bottom_pt: float
66
+
67
+
68
# One measured content block handled by the paginator.
@dataclasses.dataclass
class BlockItem:
    # Block payload passed through to the output page ("type" plus "html" or "table").
    data: Dict[str, Any]
    # Measured (or estimated) height of the block in points.
    height_pt: float
    # When True the paginator tries to keep this block on the same page as the next one.
    keep_with_next: bool = False
    # References and footer notes attached to this block.
    refs: List[str] = dataclasses.field(default_factory=list)
    notes: List[str] = dataclasses.field(default_factory=list)
75
+
76
+
77
# Intermediate result for one output page, assembled by
# Paginator._paginate_single_page before the final page dict is built.
@dataclasses.dataclass
class PageBuild:
    # Blocks placed on this page, in order.
    blocks: List[BlockItem]
    # Sum of the heights of the placed blocks, in points.
    height_pt: float
    # References and footer notes collected from the placed blocks.
    refs: List[str]
    notes: List[str]
83
+
84
+
85
class BlockMeasurer:
    """Measure the rendered height, in points, of HTML, text and tables.

    Heights come from a WeasyPrint render when the library is available;
    when it is missing, or when rendering fails, coarse heuristics based on
    tag, character and row counts are used instead. HTML, text and table
    measurements are memoized per instance.
    """

    def __init__(self, css_path: str, base_url: str, layout: "LayoutConfig"):
        """Store the stylesheet path, base URL and layout used for probes."""
        self.css_path = css_path
        self.base_url = base_url
        self.layout = layout
        self._height_cache: Dict[Tuple[Any, ...], float] = {}

    def measure_html(self, html_fragment: str) -> float:
        """Return the height of an HTML fragment laid out inside ``.content``."""
        key = ("html", html_fragment)
        cached = self._height_cache.get(key)
        if cached is not None:
            return cached

        height = self._measure_with_weasyprint(
            f"<div class=\"content\"><div id=\"probe\">{html_fragment}</div></div>",
            "probe",
        )
        if height is None:
            height = self._estimate_html_height(html_fragment)
        self._height_cache[key] = height
        return height

    def measure_text_block(self, text: str, class_name: str) -> float:
        """Return the height of ``text`` rendered in a div with ``class_name``."""
        key = ("text", class_name, text)
        cached = self._height_cache.get(key)
        if cached is not None:
            return cached

        html = f"<div id=\"probe\" class=\"{class_name}\">{text}</div>"
        height = self._measure_with_weasyprint(html, "probe")
        if height is None:
            height = self._estimate_text_height(text, class_name)
        self._height_cache[key] = height
        return height

    @staticmethod
    def _table_cache_key(table: Dict[str, Any], show_header: bool) -> Tuple[Any, ...]:
        """Build a cache key from everything that affects a table's layout.

        Besides the row contents this includes the column groups and the
        explicit widths: they determine the number and width of columns and
        the header text, so two tables with equal rows can differ in height.
        """
        rows_key = tuple(
            tuple(row.get("vals", [])) + (row.get("dep", ""),)
            for row in table.get("rows", [])
        )
        groups_key = tuple(
            (group.get("title", ""), tuple(group.get("months", [])))
            for group in table.get("groups", [])
        )
        return (
            "table",
            show_header,
            table.get("total_width"),
            table.get("dep_width"),
            groups_key,
            rows_key,
        )

    def measure_table(self, table: Dict[str, Any], show_header: bool) -> float:
        """Return the height of ``table`` rendered with or without its header.

        A table without a ``rows`` key is treated as having no rows.
        """
        key = self._table_cache_key(table, show_header)
        cached = self._height_cache.get(key)
        if cached is not None:
            return cached

        html = build_table_html(table, show_header=show_header)
        content_width = table.get("total_width") or self.layout.content_width_pt
        height = self._measure_with_weasyprint(html, "probe-table", content_width=content_width)
        if height is None:
            height = self._estimate_table_height(table, show_header)
        self._height_cache[key] = height
        return height

    def measure_footer_meta(self, refs: List[str], notes: List[str]) -> float:
        """Return the height of the refs/notes footer area (0.0 when both are empty)."""
        if not refs and not notes:
            return 0.0
        refs_html = "".join(f"<div class=\"refs-text\">{ref}</div>" for ref in refs)
        notes_html = "".join(f"<div>{note}</div>" for note in notes)
        html = """
<div id=\"probe\" class=\"footer-meta\">
{refs_block}
{notes_block}
</div>
""".format(
            refs_block=(
                f"<div class=\"refs\"><div class=\"refs-line\"></div>{refs_html}</div>"
                if refs
                else ""
            ),
            notes_block=(f"<div class=\"footer-notes\">{notes_html}</div>" if notes else ""),
        )
        height = self._measure_with_weasyprint(html, "probe")
        if height is None:
            height = self._estimate_refs_height(refs) + self._estimate_notes_height(notes)
        return height

    def measure_footer_contact(self, site: str, phone: str) -> float:
        """Return the height of the contact footer; 22.0 when it cannot be measured."""
        html = f"<div id=\"probe\" class=\"footer-contact\"><div>{site}</div><div>{phone}</div></div>"
        height = self._measure_with_weasyprint(html, "probe")
        if height is None:
            height = 22.0
        return height

    def measure_footer_page(self, page_number: str) -> float:
        """Return the height of the page-number footer; 0.0 for an empty number."""
        if not page_number:
            return 0.0
        html = f"<div id=\"probe\" class=\"footer-page\">{page_number}</div>"
        height = self._measure_with_weasyprint(html, "probe")
        if height is None:
            height = 8.0
        return height

    def _measure_with_weasyprint(
        self, body_html: str, probe_id: str, content_width: Optional[float] = None
    ) -> Optional[float]:
        """Render ``body_html`` and return the probe element's height in points.

        Returns ``None`` when WeasyPrint is unavailable, rendering raises,
        no page is produced, or no box carries ``probe_id``.
        """
        if not WEASYPRINT_AVAILABLE:
            return None

        if content_width is None:
            content_width = self.layout.content_width_pt
        measure_css = MEASURE_CSS.format(content_width=content_width)
        full_html = f"""
<!DOCTYPE html>
<html lang=\"es\">
<head>
<meta charset=\"utf-8\" />
</head>
<body>
<div class=\"measure-root\">{body_html}</div>
</body>
</html>
"""
        try:
            document = HTML(string=full_html, base_url=self.base_url).render(
                stylesheets=[
                    CSS(filename=str(self.css_path)),
                    CSS(string=measure_css),
                ]
            )
        except Exception as exc:  # pragma: no cover - runtime dependency may fail
            # Best effort: a failed render degrades to the heuristic estimate.
            LOGGER.warning("WeasyPrint measurement failed: %s", exc)
            return None

        if not document.pages:
            return None

        box = _find_box_by_id(document.pages[0], probe_id)
        if box is None:
            return None

        # Sum the box height with its vertical margins and paddings; missing
        # or falsy attributes count as zero.
        height = getattr(box, "height", 0.0) or 0.0
        height += getattr(box, "margin_top", 0.0) or 0.0
        height += getattr(box, "margin_bottom", 0.0) or 0.0
        height += getattr(box, "padding_top", 0.0) or 0.0
        height += getattr(box, "padding_bottom", 0.0) or 0.0
        return float(height) * CSS_PX_TO_PT

    def _estimate_html_height(self, html_fragment: str) -> float:
        """Estimate height from the number of ``<br``, ``</p>`` and ``</div>`` tags."""
        lines = (
            html_fragment.count("<br")
            + html_fragment.count("</p>")
            + html_fragment.count("</div>")
        )
        if "section-title" in html_fragment:
            return 20 + lines * 14
        return lines * 14 + 10

    def _estimate_text_height(self, text: str, class_name: str) -> float:
        """Estimate height from character count using per-class line widths."""
        chars_per_line = 80
        if class_name in {"header-title", "header-subtitle"}:
            chars_per_line = 45
        elif class_name == "intro":
            chars_per_line = 70
        lines = max(1, math.ceil(len(text) / chars_per_line))
        font_size = 12.0
        if class_name in {"header-title", "header-subtitle"}:
            font_size = 14.0
        line_height = 1.05 if class_name in {"header-title", "header-subtitle"} else 1.1
        return lines * font_size * line_height

    def _estimate_table_height(self, table: Dict[str, Any], show_header: bool) -> float:
        """Estimate height as header (40) + 16 per row + 16 of padding."""
        num_rows = len(table.get("rows", []))
        header_height = 40 if show_header else 0
        row_height = 16
        return header_height + (num_rows * row_height) + 16

    def _estimate_refs_height(self, refs: List[str]) -> float:
        """Estimate the refs area height: 6.0 plus one 8.8pt line per reference."""
        if not refs:
            return 0.0
        line_height = 8.0 * 1.1
        return 6.0 + len(refs) * line_height

    def _estimate_notes_height(self, notes: List[str]) -> float:
        """Estimate the notes area height: one 8.8pt line per note."""
        if not notes:
            return 0.0
        line_height = 8.0 * 1.1
        return len(notes) * line_height
259
+
260
+
261
# Stylesheet template applied on top of the document CSS when measuring.
# It is filled with str.format(content_width=...), so literal CSS braces are
# doubled. It removes page/body margins, forces the listed layout classes to
# static positioning and fixes the width of the content-like containers.
MEASURE_CSS = """
@page {{ size: Letter; margin: 0; }}
html, body {{ margin: 0; padding: 0; }}
.measure-root {{ margin: 0; padding: 0; }}
.page {{ position: static !important; width: auto; height: auto; }}
.content, .intro, .header-title, .header-subtitle, .footer-contact,
.footer-page, .footer-meta, .refs, .footer-notes {{
position: static !important;
}}
.content, .intro, .footer-meta, .footer-contact {{ width: {content_width}pt; }}
.table-wrap {{ margin-left: 0 !important; }}
"""
273
+
274
+
275
+ def _find_box_by_id(page: Any, element_id: str) -> Optional[Any]:
276
+ root = getattr(page, "_page_box", None)
277
+ if root is None:
278
+ return None
279
+
280
+ for box in _iter_boxes(root):
281
+ element = getattr(box, "element", None)
282
+ if element is not None and element.get("id") == element_id:
283
+ return box
284
+ return None
285
+
286
+
287
+ def _iter_boxes(box: Any) -> Iterable[Any]:
288
+ yield box
289
+ for child in getattr(box, "children", []) or []:
290
+ yield from _iter_boxes(child)
291
+
292
+
293
def build_table_html(table: Dict[str, Any], show_header: bool = True) -> str:
    """Render ``table`` as the HTML used for layout measurement.

    The first column has width ``dep_width`` (default 120.0) and the
    remaining width of ``total_width`` (default 532.66) is divided evenly
    among one column per month across all ``groups``. With ``show_header``
    a two-row ``<thead>`` is emitted: group titles spanning their months,
    then the month labels. Cell values are inserted as given (no escaping).
    """
    width = table.get("total_width") or 532.66
    first_col = table.get("dep_width") or 120.0
    column_groups = table.get("groups", [])
    month_count = sum(len(grp.get("months", [])) for grp in column_groups)
    # Guard against division by zero when no month columns exist.
    month_col = (width - first_col) / (month_count if month_count else 1)

    colgroup = f"<col style=\"width: {first_col:.2f}pt;\">" + (
        f"<col style=\"width: {month_col:.2f}pt;\">" * month_count
    )

    thead = ""
    if show_header:
        group_cells = "".join(
            f"<th class=\"col-num\" colspan=\"{len(grp.get('months', []))}\">{grp.get('title', '')}</th>"
            for grp in column_groups
        )
        month_cells = "".join(
            f"<th class=\"col-num\">{label}</th>"
            for grp in column_groups
            for label in grp.get("months", [])
        )
        thead = (
            "<thead>"
            "<tr>"
            "<th class=\"col-dep\" rowspan=\"2\">Departamento/Mes</th>"
            f"{group_cells}"
            "</tr>"
            f"<tr>{month_cells}</tr>"
            "</thead>"
        )

    rendered_rows = []
    for entry in table.get("rows", []):
        value_cells = "".join(f"<td>{value}</td>" for value in entry.get("vals", []))
        rendered_rows.append(
            f"<tr><td class=\"col-dep\">{entry.get('dep', '')}</td>{value_cells}</tr>"
        )
    tbody = "<tbody>" + "".join(rendered_rows) + "</tbody>"

    return "".join(
        [
            f"<div class=\"content\" style=\"width: {width:.2f}pt;\">",
            f"<div class=\"table-wrap\" style=\"width: {width:.2f}pt; margin-left: 0;\">",
            "<table id=\"probe-table\" class=\"tabla-abaco\">",
            f"<colgroup>{colgroup}</colgroup>",
            thead,
            tbody,
            "</table>",
            "</div>",
            "</div>",
        ]
    )
342
+
343
+
344
+ class Paginator:
345
def __init__(
    self,
    layout: "LayoutConfig",
    css_path: str,
    base_url: str,
    fonts_conf_path: Optional[str] = None,
):
    """Create the measurer and cache the height of a one-line header title.

    When ``fonts_conf_path`` is given it is exported as ``FONTCONFIG_FILE``
    unless that environment variable is already set.
    """
    if fonts_conf_path and "FONTCONFIG_FILE" not in os.environ:
        os.environ["FONTCONFIG_FILE"] = str(fonts_conf_path)

    self.layout = layout
    self.measurer = BlockMeasurer(css_path, base_url, layout)
    # Reference height used later to detect titles that wrap to several lines.
    self._header_single_line_height = self.measurer.measure_text_block("X", "header-title")
357
+
358
def paginate(self, pages_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Expand the source pages into the final list of output pages.

    Cover pages are copied through unchanged apart from default values for
    ``page_number`` ("") and ``show_header_titles`` (False). Every other
    page is handed to ``_paginate_single_page`` together with the pages
    produced so far, and may yield several output pages.
    """
    collected: List[Dict[str, Any]] = []
    for source in pages_data:
        if not source.get("cover"):
            collected.extend(self._paginate_single_page(source, collected))
            continue

        cover = {**source}
        cover.setdefault("page_number", "")
        cover.setdefault("show_header_titles", False)
        collected.append(cover)
    return collected
369
+
370
def _paginate_single_page(
    self, page: Dict[str, Any], accumulated_pages: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Split one source page into as many output pages as its blocks need.

    Args:
        page: Source page dict; reads ``blocks``, ``refs``, ``footer_notes``,
            ``refs_catalog`` and ``intro`` here, plus whatever the helper
            methods read.
        accumulated_pages: Output pages produced so far; only its length is
            used, for page numbering and to decide whether header titles
            are shown.

    Returns:
        The list of output page dicts. It is empty when the page has no
        blocks, because the placement loop then never runs.

    NOTE(review): the nesting of the keep-with-next branch below was
    reconstructed from a flattened listing - confirm against upstream.
    """
    blocks = page.get("blocks", [])
    refs = page.get("refs", [])
    notes = page.get("footer_notes", [])
    has_meta = bool(refs or notes)

    # Header geometry is computed twice: with titles and without titles.
    (
        header_title_top,
        header_subtitle_top,
        header_bottom,
        header_title_style,
        header_subtitle_style,
    ) = self._compute_header_positions(page, show_titles=True)
    (
        header_title_top_other,
        header_subtitle_top_other,
        header_bottom_other,
        header_title_style_other,
        header_subtitle_style_other,
    ) = self._compute_header_positions(page, show_titles=False)
    # First page carries the intro; continuation pages use the compact top.
    layout_first = self._compute_layout_state(
        page,
        header_bottom,
        include_intro=True,
        compact_top=False,
    )
    layout_other = self._compute_layout_state(
        page,
        header_bottom_other,
        include_intro=False,
        compact_top=True,
    )

    # Smallest usable height of any page variant; HTML blocks are pre-split
    # against it so each chunk can fit whichever page it lands on.
    min_page_height = min(
        layout_first.content_height_base_pt,
        layout_first.content_height_meta_pt,
        layout_other.content_height_base_pt,
        layout_other.content_height_meta_pt,
    )

    refs_catalog = page.get("refs_catalog", {})
    normalized_blocks = self._normalize_blocks(blocks, min_page_height, refs_catalog)
    pages_build: List[PageBuild] = []
    idx = 0
    page_idx = 0
    # Outer loop: one iteration per output page.
    while idx < len(normalized_blocks):
        layout_state = layout_first if page_idx == 0 else layout_other
        page_refs: List[str] = []
        page_notes: List[str] = []

        if has_meta:
            # Reserve the page-level refs/notes area only if everything that
            # is left fits on this page together with it.
            remaining_height = sum(block.height_pt for block in normalized_blocks[idx:])
            if remaining_height <= layout_state.content_height_meta_pt:
                limit = layout_state.content_height_meta_pt
            else:
                limit = layout_state.content_height_base_pt
        else:
            limit = layout_state.content_height_base_pt
        limit = max(limit, self.layout.min_content_height_pt)

        used = 0.0
        page_blocks: List[BlockItem] = []
        # Inner loop: place blocks on the current page until one does not fit.
        while idx < len(normalized_blocks):
            block = normalized_blocks[idx]
            block_height = block.height_pt
            block_refs = list(block.refs)
            block_notes = list(block.notes)

            if block.keep_with_next and idx + 1 < len(normalized_blocks):
                next_block = normalized_blocks[idx + 1]
                next_height = next_block.height_pt
                if next_block.data.get("type") == "table":
                    # A following table may be split, so only the rows that
                    # fit after this block need to stay with it.
                    available = limit - used - block_height
                    if available <= 0:
                        if page_blocks:
                            break
                    else:
                        next_table = next_block.data.get("table", {})
                        show_header = next_table.get("show_header", True)
                        max_rows = self._max_table_rows_that_fit(
                            next_table,
                            next_table.get("rows", []),
                            available,
                            show_header,
                        )
                        next_height = (
                            self.measurer.measure_table(
                                {
                                    "groups": next_table.get("groups", []),
                                    "rows": next_table.get("rows", [])[: max_rows or 1],
                                    "total_width": next_table.get("total_width"),
                                    "dep_width": next_table.get("dep_width"),
                                },
                                show_header,
                            )
                            if max_rows
                            else next_height
                        )
                if used + block_height + next_height > limit:
                    if page_blocks:
                        break
                    # Nothing placed yet on an intro page: close the page
                    # with no blocks and retry on a continuation page.
                    if page_idx == 0 and page.get("intro"):
                        break

            if block_refs or block_notes:
                # Block-level refs/notes enlarge the footer, shrinking the limit.
                new_limit = min(
                    limit,
                    self._content_height_with_meta(
                        layout_state, page_refs + block_refs, page_notes + block_notes
                    ),
                )
                if used > new_limit and page_blocks:
                    break
                limit = new_limit

            split_table = False
            if block.data.get("type") == "table":
                available_height = limit - used
                if available_height <= 0 and page_blocks:
                    break
                if available_height <= 0:
                    available_height = limit

                # May replace normalized_blocks[idx] with a shorter chunk and
                # insert the remainder at idx + 1.
                block, split_table = self._split_table_to_fit(
                    normalized_blocks,
                    idx,
                    available_height,
                )
                block_height = block.height_pt

            if used + block_height > limit and page_blocks:
                break

            if used + block_height > limit and not page_blocks:
                # An oversized block on an empty page is placed anyway so the
                # loop always makes progress.
                LOGGER.warning(
                    "Block exceeds page height limit (%.2f > %.2f); forcing placement.",
                    block_height,
                    limit,
                )

            page_blocks.append(block)
            used += block_height
            if block_refs:
                page_refs.extend(block_refs)
            if block_notes:
                page_notes.extend(block_notes)
            idx += 1

            # After a table split the remainder starts the next page.
            if split_table:
                break

        pages_build.append(PageBuild(blocks=page_blocks, height_pt=used, refs=page_refs, notes=page_notes))
        page_idx += 1

    output_pages: List[Dict[str, Any]] = []
    for build_idx, build in enumerate(pages_build):
        is_first = build_idx == 0
        is_last = build_idx == len(pages_build) - 1
        layout_state = layout_first if is_first else layout_other

        # Titles are shown only on the very first output page overall.
        show_header_titles = len(accumulated_pages) == 0 and is_first
        output_pages.append(
            self._build_page_dict(
                page,
                build,
                layout_state,
                include_intro=is_first,
                include_meta=(has_meta and is_last),
                page_number=str(len(accumulated_pages) + len(output_pages) + 1),
                header_title_top=header_title_top if show_header_titles else header_title_top_other,
                header_subtitle_top=header_subtitle_top if show_header_titles else header_subtitle_top_other,
                header_title_style=header_title_style if show_header_titles else header_title_style_other,
                header_subtitle_style=(
                    header_subtitle_style if show_header_titles else header_subtitle_style_other
                ),
                show_header_titles=show_header_titles,
            )
        )

    return output_pages
552
+
553
+ def _build_page_dict(
554
+ self,
555
+ source_page: Dict[str, Any],
556
+ build: PageBuild,
557
+ layout_state: PageLayoutState,
558
+ include_intro: bool,
559
+ include_meta: bool,
560
+ page_number: str,
561
+ header_title_top: float,
562
+ header_subtitle_top: float,
563
+ header_title_style: Dict[str, float],
564
+ header_subtitle_style: Dict[str, float],
565
+ show_header_titles: bool,
566
+ ) -> Dict[str, Any]:
567
+ refs = list(build.refs)
568
+ notes = list(build.notes)
569
+ if include_meta:
570
+ refs.extend(source_page.get("refs", []))
571
+ notes.extend(source_page.get("footer_notes", []))
572
+
573
+ banner_path = source_page["header_banner_path"]
574
+ banner_path_cont = source_page.get("header_banner_path_cont")
575
+ if not banner_path_cont:
576
+ try:
577
+ banner_file = Path(banner_path)
578
+ candidate = banner_file.with_name(f"{banner_file.stem}-clean{banner_file.suffix}")
579
+ if candidate.exists():
580
+ banner_path_cont = str(candidate)
581
+ except OSError:
582
+ banner_path_cont = None
583
+ if not banner_path_cont:
584
+ banner_path_cont = banner_path
585
+
586
+ # Prefer clean banner for all pages when available to avoid duplicated titles.
587
+ if banner_path_cont and banner_path_cont != banner_path:
588
+ banner_path = banner_path_cont
589
+
590
+ page_dict = {
591
+ "header_banner_path": banner_path,
592
+ "header_banner_path_cont": banner_path_cont,
593
+ "header_logo_path": source_page["header_logo_path"],
594
+ "title_line1": source_page["title_line1"],
595
+ "title_line2": source_page["title_line2"],
596
+ "intro": source_page.get("intro", "") if include_intro else "",
597
+ "blocks": [block.data for block in build.blocks],
598
+ "refs": refs,
599
+ "footer_notes": notes,
600
+ "page_number": page_number,
601
+ "footer_site": source_page.get("footer_site", ""),
602
+ "footer_phone": source_page.get("footer_phone", ""),
603
+ "intro_top": layout_state.intro_top_pt,
604
+ "content_top": layout_state.content_top_pt,
605
+ "header_title_top": header_title_top,
606
+ "header_subtitle_top": header_subtitle_top,
607
+ "header_title_left": header_title_style["left"],
608
+ "header_title_width": header_title_style["width"],
609
+ "header_title_align": header_title_style["align"],
610
+ "header_subtitle_left": header_subtitle_style["left"],
611
+ "header_subtitle_width": header_subtitle_style["width"],
612
+ "header_subtitle_align": header_subtitle_style["align"],
613
+ "show_header_titles": show_header_titles,
614
+ "footer_meta_bottom": layout_state.footer_meta_bottom_pt,
615
+ }
616
+ return page_dict
617
+
618
+ def _compute_header_positions(
619
+ self, page: Dict[str, Any], show_titles: bool = True
620
+ ) -> Tuple[float, float, float, Dict[str, float], Dict[str, float]]:
621
+ title_text = page.get("title_line1", "") if show_titles else ""
622
+ subtitle_text = page.get("title_line2", "") if show_titles else ""
623
+
624
+ title_height = self.measurer.measure_text_block(title_text, "header-title")
625
+ subtitle_height = self.measurer.measure_text_block(subtitle_text, "header-subtitle")
626
+
627
+ title_top = max(self.layout.header_title_top_pt, self.layout.header_title_min_top_pt)
628
+ subtitle_top = max(
629
+ self.layout.header_subtitle_top_pt,
630
+ title_top + title_height + self.layout.header_subtitle_gap_pt,
631
+ )
632
+
633
+ is_multi_line = title_height > self._header_single_line_height * 1.15
634
+ if is_multi_line:
635
+ title_style = {
636
+ "left": self.layout.content_left_pt,
637
+ "width": self.layout.content_width_pt,
638
+ "align": "center",
639
+ }
640
+ subtitle_style = {
641
+ "left": self.layout.content_left_pt,
642
+ "width": self.layout.content_width_pt,
643
+ "align": "center",
644
+ }
645
+ else:
646
+ title_style = {
647
+ "left": self.layout.header_title_left_pt,
648
+ "width": self.layout.header_title_width_pt,
649
+ "align": "left",
650
+ }
651
+ subtitle_style = {
652
+ "left": self.layout.header_subtitle_left_pt,
653
+ "width": self.layout.header_subtitle_width_pt,
654
+ "align": "left",
655
+ }
656
+
657
+ header_bottom = max(
658
+ self.layout.header_banner_height_pt,
659
+ self.layout.header_logo_top_pt + self.layout.header_logo_height_pt,
660
+ title_top + title_height,
661
+ subtitle_top + subtitle_height,
662
+ )
663
+ return title_top, subtitle_top, header_bottom, title_style, subtitle_style
664
+
665
def _compute_layout_state(
    self,
    page: Dict[str, Any],
    header_bottom: float,
    include_intro: bool,
    compact_top: bool,
) -> "PageLayoutState":
    """Resolve intro/content tops and usable content heights for a page kind.

    Args:
        page: Source page; its intro, footer site/phone, page number, refs
            and footer notes are measured.
        header_bottom: Bottom edge of the header, in points.
        include_intro: Whether the intro text occupies space on this page.
        compact_top: Use the continuation-page minimum content top.

    Returns:
        A ``PageLayoutState``; both content heights are clamped to
        ``min_content_height_pt`` (a warning is logged when the height
        with the refs/notes area falls below that floor).
    """
    cfg = self.layout

    intro_text = page.get("intro", "") if include_intro else ""
    intro_height = 0.0
    if intro_text:
        intro_height = self.measurer.measure_text_block(intro_text, "intro")

    below_header = header_bottom + cfg.header_gap_pt
    intro_top = max(cfg.default_intro_top_pt, below_header)

    floor_top = cfg.continuation_content_top_pt if compact_top else cfg.default_content_top_pt
    # intro_text is only non-empty when include_intro is set.
    if intro_text:
        content_top = max(floor_top, intro_top + intro_height + cfg.intro_gap_pt)
    else:
        content_top = max(floor_top, below_header)

    contact_height = self.measurer.measure_footer_contact(
        page.get("footer_site", ""),
        page.get("footer_phone", ""),
    )
    page_no_height = self.measurer.measure_footer_page(page.get("page_number", ""))
    meta_height = self.measurer.measure_footer_meta(
        page.get("refs", []),
        page.get("footer_notes", []),
    )

    contact_top_edge = cfg.footer_contact_bottom_pt + contact_height
    reserved_base = max(contact_top_edge, cfg.footer_page_bottom_pt + page_no_height)
    # The refs/notes area sits above the contact line, separated by a gap.
    footer_meta_bottom = max(cfg.footer_meta_bottom_pt, contact_top_edge + cfg.footer_meta_gap_pt)
    reserved_meta = max(reserved_base, footer_meta_bottom + meta_height)

    below_content_top = cfg.page_height_pt - content_top
    height_base = below_content_top - reserved_base - cfg.safety_pad_pt
    height_meta = below_content_top - reserved_meta - cfg.safety_pad_pt

    if height_meta < cfg.min_content_height_pt:
        LOGGER.warning(
            "Footer/meta area exceeds available page space; clamping content height to %.2fpt.",
            cfg.min_content_height_pt,
        )

    return PageLayoutState(
        intro_top_pt=intro_top,
        content_top_pt=content_top,
        content_height_base_pt=max(height_base, cfg.min_content_height_pt),
        content_height_meta_pt=max(height_meta, cfg.min_content_height_pt),
        reserved_base_pt=reserved_base,
        footer_meta_bottom_pt=footer_meta_bottom,
    )
737
+
738
def _normalize_blocks(
    self,
    blocks: List[Dict[str, Any]],
    max_height_pt: float,
    refs_catalog: Dict[str, str],
) -> List["BlockItem"]:
    """Turn raw block dicts into measured ``BlockItem`` objects.

    Table blocks are measured as a whole. Any other block is treated as
    HTML, split into chunks no taller than ``max_height_pt`` where possible,
    and each chunk becomes its own item. Explicit block refs and notes are
    attached to the first chunk only; without explicit refs, each chunk's
    refs are derived from its HTML via ``refs_catalog``.
    """
    items: List["BlockItem"] = []
    for raw in blocks:
        explicit_refs = raw.get("refs", [])
        explicit_notes = raw.get("footer_notes", [])

        if raw.get("type") == "table":
            table = raw.get("table", {})
            table_height = self.measurer.measure_table(table, table.get("show_header", True))
            items.append(
                BlockItem(data=raw, height_pt=table_height, refs=explicit_refs, notes=explicit_notes)
            )
            continue

        markup = raw.get("html", "")
        sticky = _needs_keep_with_next(markup)
        for position, piece in enumerate(self._split_html_block(markup, max_height_pt)):
            leading = position == 0
            if explicit_refs:
                piece_refs = explicit_refs if leading else []
            else:
                piece_refs = _refs_from_html(piece, refs_catalog)
            piece_height = self.measurer.measure_html(piece)
            items.append(
                BlockItem(
                    data={"type": "html", "html": piece},
                    height_pt=piece_height,
                    # Only the first chunk has to stay with what follows.
                    keep_with_next=sticky and leading,
                    refs=piece_refs,
                    notes=explicit_notes if leading else [],
                )
            )
    return items
775
+
776
+ def _split_table_block(self, block: Dict[str, Any], max_height_pt: float) -> List[Dict[str, Any]]:
777
+ if block.get("type") != "table":
778
+ return [block]
779
+
780
+ table = block["table"]
781
+ rows = table.get("rows", [])
782
+ if not rows:
783
+ return [block]
784
+
785
+ result_blocks: List[Dict[str, Any]] = []
786
+ start_idx = 0
787
+ first_chunk = True
788
+
789
+ while start_idx < len(rows):
790
+ show_header = first_chunk
791
+ max_rows = self._max_table_rows_that_fit(table, rows[start_idx:], max_height_pt, show_header)
792
+ if max_rows < 1:
793
+ max_rows = 1
794
+ chunk_rows = rows[start_idx : start_idx + max_rows]
795
+ result_blocks.append(
796
+ {
797
+ "type": "table",
798
+ "table": {
799
+ "groups": table.get("groups", []),
800
+ "rows": chunk_rows,
801
+ "total_width": table.get("total_width"),
802
+ "dep_width": table.get("dep_width"),
803
+ "show_header": show_header,
804
+ },
805
+ }
806
+ )
807
+ start_idx += max_rows
808
+ first_chunk = False
809
+
810
+ return result_blocks
811
+
812
+ def _split_table_to_fit(
813
+ self,
814
+ blocks: List[BlockItem],
815
+ idx: int,
816
+ max_height_pt: float,
817
+ ) -> Tuple[BlockItem, bool]:
818
+ block = blocks[idx]
819
+ table = block.data.get("table", {})
820
+ rows = table.get("rows", [])
821
+ if not rows:
822
+ return block, False
823
+
824
+ show_header = table.get("show_header", True)
825
+ if block.height_pt <= max_height_pt:
826
+ return block, False
827
+
828
+ max_rows = self._max_table_rows_that_fit(table, rows, max_height_pt, show_header)
829
+ if max_rows <= 0:
830
+ max_rows = 1
831
+
832
+ chunk_rows = rows[:max_rows]
833
+ remainder_rows = rows[max_rows:]
834
+
835
+ chunk_block = {
836
+ "type": "table",
837
+ "table": {
838
+ "groups": table.get("groups", []),
839
+ "rows": chunk_rows,
840
+ "total_width": table.get("total_width"),
841
+ "dep_width": table.get("dep_width"),
842
+ "show_header": show_header,
843
+ },
844
+ }
845
+
846
+ chunk_height = self.measurer.measure_table(chunk_block["table"], show_header)
847
+ blocks[idx] = BlockItem(
848
+ data=chunk_block,
849
+ height_pt=chunk_height,
850
+ refs=list(block.refs),
851
+ notes=list(block.notes),
852
+ )
853
+
854
+ if remainder_rows:
855
+ remainder_show_header = False if show_header else False
856
+ remainder_block = {
857
+ "type": "table",
858
+ "table": {
859
+ "groups": table.get("groups", []),
860
+ "rows": remainder_rows,
861
+ "total_width": table.get("total_width"),
862
+ "dep_width": table.get("dep_width"),
863
+ "show_header": remainder_show_header,
864
+ },
865
+ }
866
+ remainder_height = self.measurer.measure_table(
867
+ remainder_block["table"], remainder_show_header
868
+ )
869
+ blocks.insert(
870
+ idx + 1,
871
+ BlockItem(data=remainder_block, height_pt=remainder_height),
872
+ )
873
+
874
+ return blocks[idx], bool(remainder_rows)
875
+
876
def _max_table_rows_that_fit(
    self,
    table: Dict[str, Any],
    remaining_rows: List[Dict[str, Any]],
    max_height_pt: float,
    show_header: bool,
) -> int:
    """Find how many leading rows of ``remaining_rows`` fit in ``max_height_pt``.

    A binary search over the row count builds a trial table for each probe
    (copying ``groups``, ``total_width`` and ``dep_width`` from ``table``)
    and asks ``self.measurer.measure_table`` for its height.

    Args:
        table: Source table dict; only read through ``.get``.
        remaining_rows: Rows that are candidates for the current chunk.
        max_height_pt: Height budget the trial table must not exceed.
        show_header: Forwarded unchanged to ``measure_table``.

    Returns:
        The largest probed row count whose measured height is within the
        budget, or ``0`` when no probe fits (including an empty row list).

    NOTE(review): a binary search is only exact if the measured height
    never shrinks as rows are added; that is assumed here, not checked.
    """
    best_count = 0
    lo, hi = 1, len(remaining_rows)
    while lo <= hi:
        probe = (lo + hi) // 2
        trial_table = dict(
            groups=table.get("groups", []),
            rows=remaining_rows[:probe],
            total_width=table.get("total_width"),
            dep_width=table.get("dep_width"),
        )
        fits = self.measurer.measure_table(trial_table, show_header) <= max_height_pt
        if fits:
            # This many rows fit; try to take more.
            best_count, lo = probe, probe + 1
        else:
            hi = probe - 1
    return best_count
901
+
902
def _split_html_block(self, html: str, max_height_pt: float) -> List[str]:
    """Break ``html`` into pieces that each try to stay within ``max_height_pt``.

    If the whole fragment already measures within the budget, or
    ``split_html_into_chunks`` cannot divide it, the fragment is returned
    unchanged as a single-element list. Otherwise chunks are packed
    greedily: a chunk joins the current group while the joined HTML still
    measures within the budget, and a new group is started when it does not.

    A group always receives at least one chunk, so a single chunk that is
    taller than the budget is still emitted on its own.

    Args:
        html: HTML fragment to split.
        max_height_pt: Height budget checked via ``self.measurer.measure_html``.

    Returns:
        The list of HTML strings, in document order.
    """
    if self.measurer.measure_html(html) <= max_height_pt:
        return [html]

    pieces = split_html_into_chunks(html)
    if len(pieces) == 1:
        return [html]

    packed: List[str] = []
    group: List[str] = []
    for piece in pieces:
        trial_height = self.measurer.measure_html("".join([*group, piece]))
        # Only a non-empty group can overflow; an empty one must accept the piece.
        overflows = bool(group) and not trial_height <= max_height_pt
        if overflows:
            packed.append("".join(group))
            group = []
        group.append(piece)

    if group:
        packed.append("".join(group))

    return packed
927
+
928
def _content_height_with_meta(
    self, layout_state: PageLayoutState, refs: List[str], notes: List[str]
) -> float:
    """Return the content height available once footer refs/notes are reserved.

    With no refs and no notes the precomputed
    ``layout_state.content_height_base_pt`` is returned as-is. Otherwise the
    footer meta block is measured, the reserved area becomes the larger of
    ``reserved_base_pt`` and ``footer_meta_bottom_pt + measured height``, and
    the content height is the page height minus the content top, that
    reservation, and the layout's safety pad.

    Args:
        layout_state: Per-page layout values read by this method.
        refs: Reference texts shown in the footer meta area.
        notes: Note texts shown in the footer meta area.

    Returns:
        The computed height, or ``self.layout.min_content_height_pt`` (with
        a logged warning) when the computed value falls below that minimum.
    """
    if not (refs or notes):
        return layout_state.content_height_base_pt

    meta_height = self.measurer.measure_footer_meta(refs, notes)
    meta_reserve = layout_state.footer_meta_bottom_pt + meta_height
    reserved = max(layout_state.reserved_base_pt, meta_reserve)

    layout = self.layout
    available = (
        layout.page_height_pt
        - layout_state.content_top_pt
        - reserved
        - layout.safety_pad_pt
    )

    if not available < layout.min_content_height_pt:
        return available

    # The footer meta ate too much of the page: fall back to the minimum.
    LOGGER.warning(
        "Footer/meta area exceeds available page space; clamping content height to %.2fpt.",
        layout.min_content_height_pt,
    )
    return layout.min_content_height_pt
952
+
953
def _rejoin_split_parts(parts: List[str]) -> List[str]:
    """Glue each captured delimiter back onto the text that precedes it.

    ``parts`` must be the result of ``re.split`` with exactly one capturing
    group, so text segments sit at even indexes and the captured delimiters
    at odd indexes. Relying on position (rather than inspecting the text)
    means a text segment that merely *looks* like a delimiter is never
    mistaken for one. A trailing segment that is only whitespace is dropped.
    """
    chunks = [text + delimiter for text, delimiter in zip(parts[0::2], parts[1::2])]
    tail = parts[-1]
    if tail.strip():
        chunks.append(tail)
    return chunks


def split_html_into_chunks(html: str) -> List[str]:
    """Split an HTML fragment into smaller fragments at natural boundaries.

    Strategies are tried in order and the first one that produces more than
    one chunk wins:

    1. Split after a closing block tag, trying ``p``, ``div``, ``li`` and
       ``h1``-``h6`` in that order (case-insensitive). Each chunk keeps its
       closing tag.
    2. Split after ``<br>`` / ``<br/>`` tags; each chunk keeps its ``<br>``.
    3. Fallback for text longer than 800 characters, or text wrapped in a
       ``<p>``: strip all tags, split on sentence-ending punctuation and
       wrap every sentence in its own ``<p>``. Inline markup is lost here.

    For strategies 1 and 2 the chunks concatenate back to ``html`` except
    for a whitespace-only tail, which is dropped.

    Args:
        html: The HTML fragment to split.

    Returns:
        A list with at least two chunks, or ``[html]`` when no strategy
        could divide the input.
    """
    lowered = html.lower()

    for tag in ("p", "div", "li", "h1", "h2", "h3", "h4", "h5", "h6"):
        close_tag = f"</{tag}>"
        if close_tag not in lowered:
            continue
        parts = re.split(f"({re.escape(close_tag)})", html, flags=re.IGNORECASE)
        chunks = _rejoin_split_parts(parts)
        if len(chunks) > 1:
            return chunks

    if "<br" in lowered:
        # Delimiters are identified by position in the split result. The
        # previous prefix test (startswith("<br")) also fired on plain text
        # segments beginning with an unmatched tag such as <br class="x">,
        # which cut the chunk short and left the real <br> as its own chunk.
        parts = re.split(r"(<br\s*/?>)", html, flags=re.IGNORECASE)
        chunks = _rejoin_split_parts(parts)
        if len(chunks) > 1:
            return chunks

    # Fallback: split very long single-paragraph HTML by sentences.
    text_only = re.sub(r"\s+", " ", re.sub(r"<[^>]+>", " ", html)).strip()
    wraps_paragraph = "<p" in lowered and "</p>" in lowered
    if text_only and (len(text_only) > 800 or wraps_paragraph):
        sentences = re.split(r"(?<=[\.\?\!])\s+", text_only)
        chunks = [f"<p>{s.strip()}</p>" for s in sentences if s.strip()]
        if len(chunks) > 1:
            return chunks

    return [html]
996
+
997
+
998
def _needs_keep_with_next(html: str) -> bool:
    """Return True when ``html`` contains one of the section-heading markers.

    The check is a case-insensitive substring search for ``section-title``,
    ``section-title-serif`` or ``section-subtitle``.
    NOTE(review): presumably these are CSS class names and callers use the
    result to keep a heading with the following block; confirm at the call
    site.
    """
    haystack = html.lower()
    # "section-title-serif" is already covered by "section-title"; kept for parity.
    markers = ("section-title", "section-title-serif", "section-subtitle")
    return any(marker in haystack for marker in markers)
1001
+
1002
+
1003
def _refs_from_html(html: str, refs_catalog: Dict[str, str]) -> List[str]:
    """Resolve the reference ids cited in ``html`` to their catalog texts.

    Ids are taken from ``_extract_ref_ids(html)`` and looked up in
    ``refs_catalog``; ids that are missing or map to an empty value are
    skipped. The result follows the order in which ids were extracted.

    Args:
        html: HTML fragment that may contain bracketed citations.
        refs_catalog: Mapping from reference id to reference text.

    Returns:
        The matching reference texts, or ``[]`` when the catalog is empty.
    """
    if not refs_catalog:
        return []
    looked_up = (refs_catalog.get(ref_id) for ref_id in _extract_ref_ids(html))
    return [ref_text for ref_text in looked_up if ref_text]
1013
+
1014
+
1015
def _extract_ref_ids(html: str) -> List[str]:
    """Collect numeric reference ids from bracketed citations in ``html``.

    Every ``[...]`` group is split on ``,`` or ``;``. A token made only of
    digits is kept verbatim; a token of the form ``a-b`` (hyphen or en dash)
    is expanded to every integer from ``a`` to ``b`` inclusive, counting
    down when ``b < a``. Other tokens are ignored. Ids are returned as
    strings, de-duplicated, in first-seen order.

    NOTE(review): range expansion is unbounded, and single tokens keep any
    leading zeros while expanded ranges do not (``[01]`` -> ``"01"``,
    ``[01-02]`` -> ``"1", "2"``).
    """
    ordered: List[str] = []
    known = set()

    def remember(ref_id: str) -> None:
        # Record an id once, preserving the order of first appearance.
        if ref_id in known:
            return
        known.add(ref_id)
        ordered.append(ref_id)

    for bracket_body in re.findall(r"\[(.*?)\]", html):
        for raw_token in re.split(r"[;,]\s*", bracket_body.strip()):
            item = raw_token.strip()
            if not item:
                continue

            span = re.match(r"^(\d+)\s*[-–]\s*(\d+)$", item)
            if span is None:
                if re.match(r"^\d+$", item):
                    remember(item)
                continue

            first, last = int(span.group(1)), int(span.group(2))
            direction = 1 if last >= first else -1
            for number in range(first, last + direction, direction):
                remember(str(number))

    return ordered
1039
+
1040
+
1041
def _suffix_sums(values: List[float]) -> List[float]:
    """Return the suffix sums of ``values`` with a trailing ``0.0``.

    Element ``i`` of the result is ``sum(values[i:])`` and the result has
    ``len(values) + 1`` entries, so the last entry is always ``0.0``.
    """
    totals = [0.0]
    running = 0.0
    # Walk from the end so each step adds one value to the sum after it.
    for value in reversed(values):
        running = running + value
        totals.append(running)
    totals.reverse()
    return totals