docintel-platform 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. docintel/__init__.py +6 -0
  2. docintel/app.py +45 -0
  3. docintel/auth/__init__.py +12 -0
  4. docintel/auth/api_keys.py +48 -0
  5. docintel/auth/limiter.py +41 -0
  6. docintel/auth/middleware.py +34 -0
  7. docintel/auth/oidc.py +45 -0
  8. docintel/cli.py +21 -0
  9. docintel/client.py +193 -0
  10. docintel/config.py +20 -0
  11. docintel/jobs/__init__.py +16 -0
  12. docintel/jobs/helpers.py +38 -0
  13. docintel/jobs/models.py +78 -0
  14. docintel/jobs/queue.py +75 -0
  15. docintel/jobs/store.py +82 -0
  16. docintel/jobs/tasks.py +173 -0
  17. docintel/jobs/webhooks.py +32 -0
  18. docintel/openapi/__init__.py +1 -0
  19. docintel/openapi/openapi.yaml +380 -0
  20. docintel/ops/__init__.py +1 -0
  21. docintel/ops/logging.py +40 -0
  22. docintel/ops/metrics.py +57 -0
  23. docintel/ops/middleware.py +40 -0
  24. docintel/routes/__init__.py +1 -0
  25. docintel/routes/jobs.py +26 -0
  26. docintel/routes/match.py +43 -0
  27. docintel/routes/openapi_docs.py +57 -0
  28. docintel/routes/ops.py +22 -0
  29. docintel/routes/pdf.py +420 -0
  30. docintel/routes/text.py +41 -0
  31. docintel/services/__init__.py +1 -0
  32. docintel/services/matching/__init__.py +6 -0
  33. docintel/services/matching/models.py +19 -0
  34. docintel/services/matching/scorer.py +64 -0
  35. docintel/services/pdf/__init__.py +26 -0
  36. docintel/services/pdf/annotator.py +188 -0
  37. docintel/services/pdf/models.py +104 -0
  38. docintel/services/pdf/ocr.py +130 -0
  39. docintel/services/pdf/pii.py +105 -0
  40. docintel/services/pdf/presets.py +26 -0
  41. docintel/services/pdf/search.py +29 -0
  42. docintel/services/pdf/sensitive.py +212 -0
  43. docintel/services/pdf/structure.py +118 -0
  44. docintel/services/pdf/structure_llm.py +136 -0
  45. docintel/services/pdf/structure_render.py +136 -0
  46. docintel/services/pdf/structure_schema.py +99 -0
  47. docintel/services/summary/__init__.py +6 -0
  48. docintel/services/summary/models.py +21 -0
  49. docintel/services/summary/textrank.py +57 -0
  50. docintel/ui.py +347 -0
  51. docintel/wsgi.py +5 -0
  52. docintel_platform-1.0.2.dist-info/METADATA +607 -0
  53. docintel_platform-1.0.2.dist-info/RECORD +56 -0
  54. docintel_platform-1.0.2.dist-info/WHEEL +5 -0
  55. docintel_platform-1.0.2.dist-info/entry_points.txt +3 -0
  56. docintel_platform-1.0.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,26 @@
1
+ """Default Presidio entity presets (extend via API or custom recognizers)."""
2
+
3
# Presidio entity types used by default; a core set suited to HR, legal,
# and compliance workflows.
DEFAULT_PII_ENTITIES: tuple[str, ...] = (
    "EMAIL_ADDRESS",
    "PHONE_NUMBER",
    "US_SSN",
    "CREDIT_CARD",
    "US_BANK_NUMBER",
    "US_DRIVER_LICENSE",
    "US_ITIN",
    "US_PASSPORT",
    "PERSON",
    "LOCATION",
    "DATE_TIME",
    "IP_ADDRESS",
    "IBAN_CODE",
    "MEDICAL_LICENSE",
    "URL",
)

# A page yielding fewer extracted characters than this is treated as a scan
# and goes through the OCR fallback.
MIN_NATIVE_TEXT_CHARS = 20

# Scale factor used when rendering a page for EasyOCR. Larger values improve
# accuracy at the cost of more memory.
OCR_RENDER_SCALE = 2.0
@@ -0,0 +1,29 @@
1
+ """Text search helpers for PDF pages."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Iterable
7
+
8
+ import fitz
9
+
10
+
11
def search_for_text(lines: Iterable[str], pattern: str) -> Iterable[str]:
    """Yield every case-insensitive regex match found in ``lines``.

    Each yielded value is the full matched substring, so the result is always
    a string even when ``pattern`` contains capture groups.

    Args:
        lines: Text lines to scan, in order.
        pattern: Regular expression source; matched with ``re.IGNORECASE``.

    Yields:
        The matched text for each non-overlapping match, line by line.

    Raises:
        re.error: If ``pattern`` is not a valid regular expression (raised
            when the generator is first advanced).
    """
    # Compile once instead of re-resolving the pattern for every line.
    compiled = re.compile(pattern, re.IGNORECASE)
    for line in lines:
        # finditer + group(0) keeps the whole match; findall would return
        # group contents (or tuples) for patterns with capture groups.
        for match in compiled.finditer(line):
            yield match.group(0)
16
+
17
+
18
def extract_info(input_file: str) -> dict:
    """Return basic metadata for a PDF file.

    Args:
        input_file: Path of the PDF to inspect.

    Returns:
        A dict with ``file``, ``page_count`` and ``encrypted`` keys. For
        unencrypted documents, every non-empty entry of the document's
        metadata is merged in as well.
    """
    # The context manager closes the document even if an attribute access
    # below raises.
    with fitz.open(input_file) as pdf_doc:
        info = {
            "file": input_file,
            "page_count": pdf_doc.page_count,
            "encrypted": pdf_doc.is_encrypted,
        }
        if not pdf_doc.is_encrypted:
            # metadata may be None; treat that as "no metadata".
            metadata = pdf_doc.metadata or {}
            info.update({key: value for key, value in metadata.items() if value})
    return info
@@ -0,0 +1,212 @@
1
+ """Scanned and native PDF sensitive-data detection with OCR + Presidio."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from pathlib import Path
7
+ from typing import Protocol, Sequence
8
+
9
+ import fitz
10
+
11
+ from docintel.services.pdf.annotator import _open_pdf, _save_pdf, highlight_matches, redact_matches
12
+ from docintel.services.pdf.models import Action, PIIDetectionResult
13
+ from docintel.services.pdf.ocr import (
14
+ build_indexed_text,
15
+ embed_invisible_text_layer,
16
+ extract_page_ocr,
17
+ merge_rects,
18
+ page_has_native_text,
19
+ rects_for_char_range,
20
+ )
21
+ from docintel.services.pdf.pii import PIIHit, detect_pii_in_text
22
+ from docintel.services.pdf.search import search_for_text
23
+
24
+
25
class ProgressCallback(Protocol):
    """Structural type for callables that receive progress notifications.

    Implementations are invoked with keyword arguments only.
    """

    def __call__(self, *, stage: str, pages_done: int, pages_total: int, message: str) -> None:
        """Report that ``pages_done`` of ``pages_total`` pages finished in ``stage``."""
        ...
34
+
35
+
36
def _ensure_ocr_stack() -> None:
    """Fail early with an install hint when EasyOCR or Presidio is missing.

    Raises:
        RuntimeError: If either package cannot be imported; the original
            ``ImportError`` is attached as the cause.
    """
    install_hint = (
        "OCR and Presidio dependencies are not installed. "
        "Run: pip install -e '.[ocr]' && python -m spacy download en_core_web_sm"
    )
    try:
        # Imported only to probe availability; the names are not used here.
        import easyocr  # noqa: F401
        import presidio_analyzer  # noqa: F401
    except ImportError as missing:
        raise RuntimeError(install_hint) from missing
45
+
46
+
47
def _apply_rect_action(page: fitz.Page, rect: fitz.Rect, action: Action) -> bool:
    """Add the annotation matching ``action`` over ``rect`` on ``page``.

    Any action other than redact, frame, underline, squiggly or strikeout
    produces a highlight. Always returns ``True``.
    """
    if action == Action.REDACT:
        # This only marks the area; nothing is removed until the page's
        # redactions are applied.
        page.add_redact_annot(rect, text=" ", fill=(0, 0, 0))
        return True

    if action == Action.FRAME:
        markup = page.add_rect_annot(rect)
        markup.set_colors(stroke=fitz.utils.getColor("red"))
    elif action == Action.UNDERLINE:
        markup = page.add_underline_annot([rect])
    elif action == Action.SQUIGGLY:
        markup = page.add_squiggly_annot([rect])
    elif action == Action.STRIKEOUT:
        markup = page.add_strikeout_annot([rect])
    else:
        markup = page.add_highlight_annot([rect])

    markup.update()
    return True
72
+
73
+
74
def _annotate_rects(page: fitz.Page, rects: list[fitz.Rect], action: Action) -> int:
    """Annotate every rectangle on ``page`` and return how many were applied.

    For the redact action, the pending redactions are applied to the page
    once at least one rectangle has been marked.
    """
    count = sum(1 for area in rects if _apply_rect_action(page, area, action))
    if action == Action.REDACT:
        if count:
            page.apply_redactions()
    return count
82
+
83
+
84
def _native_rects_for_hit(page: fitz.Page, hit: PIIHit) -> list[fitz.Rect]:
    """Locate the hit's text on a native-text page via the page's text search."""
    needle = hit.text
    return page.search_for(needle)
86
+
87
+
88
def _ocr_rects_for_hit(hit: PIIHit, indexed) -> list[fitz.Rect]:
    """Map a hit's character range onto OCR geometry.

    The rectangles covering ``hit.start``..``hit.end`` are merged into one;
    an empty list is returned when the merge result is falsy.
    """
    span_rects = rects_for_char_range(hit.start, hit.end, indexed)
    union = merge_rects(span_rects)
    if not union:
        return []
    return [union]
92
+
93
+
94
def _regex_hits(page_text: str, pattern: str) -> list[PIIHit]:
    """Turn case-insensitive regex matches in ``page_text`` into ``PIIHit`` records.

    Each hit carries the whole matched text, its character offsets, the
    entity type ``"REGEX"`` and a fixed score of 1.0.
    """
    return [
        PIIHit(
            entity_type="REGEX",
            text=found.group(0),
            start=found.start(),
            end=found.end(),
            score=1.0,
        )
        for found in re.finditer(pattern, page_text, flags=re.IGNORECASE)
    ]
107
+
108
+
109
def detect_sensitive_pdf(
    input_file: str | Path,
    output_file: str | Path,
    *,
    entities: Sequence[str] | None = None,
    action: Action | str = Action.HIGHLIGHT,
    force_ocr: bool = False,
    add_text_layer: bool = True,
    pattern: str | None = None,
    min_score: float = 0.35,
    password: str | None = None,
    progress_callback: ProgressCallback | None = None,
) -> PIIDetectionResult:
    """
    Detect sensitive information with Presidio (and optional regex), annotate PDF.

    Uses native PDF text when available. Falls back to EasyOCR for scanned pages.

    Args:
        input_file: PDF to analyse.
        output_file: Destination path for the annotated PDF.
        entities: Entity types forwarded to ``detect_pii_in_text``.
        action: Annotation to apply, as an ``Action`` or a value accepted by
            ``Action.from_value``.
        force_ocr: Run OCR on every page, even when native text exists.
        add_text_layer: On OCR pages, embed the recognised text as an
            invisible layer when OCR produced any spans.
        pattern: Optional regex whose matches are annotated in addition to
            the Presidio hits (case-insensitive).
        min_score: Minimum score forwarded to ``detect_pii_in_text``.
        password: Password forwarded to ``_open_pdf``.
        progress_callback: Called before each page and once after the last.

    Returns:
        A ``PIIDetectionResult`` with the annotation count, the number of
        pages processed, the zero-based indexes of OCR pages and one finding
        dict per annotated hit.

    Raises:
        RuntimeError: If the OCR/Presidio dependencies are missing.
        ValueError: If ``action`` resolves to ``Action.REMOVE``.
    """
    _ensure_ocr_stack()
    selected_action = action if isinstance(action, Action) else Action.from_value(action)
    if selected_action == Action.REMOVE:
        raise ValueError("Action 'Remove' is not supported for sensitive detection.")

    pdf_doc = _open_pdf(input_file, password)
    total_annotations = 0
    pages_processed = 0
    ocr_pages: list[int] = []
    findings: list[dict] = []

    try:
        total_pages = pdf_doc.page_count
        for page_index in range(total_pages):
            if progress_callback is not None:
                progress_callback(
                    stage="detecting",
                    pages_done=page_index,
                    pages_total=total_pages,
                    message=f"Processing page {page_index + 1} of {total_pages}",
                )
            page = pdf_doc[page_index]
            pages_processed += 1
            use_ocr = force_ocr or not page_has_native_text(page)

            if use_ocr:
                ocr_pages.append(page_index)
                ocr_spans = extract_page_ocr(page)
                if add_text_layer and ocr_spans:
                    embed_invisible_text_layer(page, ocr_spans)
                page_text, indexed = build_indexed_text(ocr_spans)
            else:
                indexed = []
                page_text = page.get_text("text")

            hits = detect_pii_in_text(page_text, entities=entities, min_score=min_score)
            if pattern:
                hits.extend(_regex_hits(page_text, pattern))

            for hit in hits:
                # OCR pages locate hits through character offsets; native
                # pages search the page for the hit text.
                if use_ocr:
                    rects = _ocr_rects_for_hit(hit, indexed)
                else:
                    rects = _native_rects_for_hit(page, hit)

                if not rects:
                    continue

                total_annotations += _annotate_rects(page, rects, selected_action)
                findings.append(
                    {
                        "page": page_index,
                        "entity_type": hit.entity_type,
                        "text": hit.text,
                        "score": round(hit.score, 4),
                        "ocr_used": use_ocr,
                    }
                )

            # Legacy regex path for native PDFs when pattern matches line fragments
            # NOTE(review): regex matches were already annotated through
            # _regex_hits above, so this pass may annotate and count the same
            # text twice for non-redact actions — confirm this is intended.
            if pattern and not use_ocr:
                matched_values = list(search_for_text(page_text.split("\n"), pattern))
                if matched_values:
                    if selected_action == Action.REDACT:
                        total_annotations += redact_matches(page, matched_values)
                    else:
                        total_annotations += highlight_matches(
                            page, matched_values, selected_action
                        )

        if progress_callback is not None:
            progress_callback(
                stage="detecting",
                pages_done=total_pages,
                pages_total=total_pages,
                message="Detection complete",
            )

        _save_pdf(pdf_doc, Path(output_file))
    finally:
        # Release the document on every exit path. The guard avoids a second
        # close in case the save helper already closed it.
        if not getattr(pdf_doc, "is_closed", False):
            pdf_doc.close()

    return PIIDetectionResult(
        input_path=str(input_file),
        output_path=str(output_file),
        action=selected_action,
        matches=total_annotations,
        pages_processed=pages_processed,
        ocr_pages=ocr_pages,
        findings=findings,
    )
@@ -0,0 +1,118 @@
1
+ """LLM-backed PDF structuring: unstructured scan to curated digital PDF."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Callable, Protocol
7
+
8
+ import fitz
9
+
10
+ from docintel.services.pdf.annotator import _open_pdf
11
+ from docintel.services.pdf.models import StructureMode, StructureResult
12
+ from docintel.services.pdf.ocr import build_indexed_text, extract_page_ocr, page_has_native_text
13
+ from docintel.services.pdf.structure_llm import structure_document
14
+ from docintel.services.pdf.structure_render import render_curated_pdf, render_searchable_pdf
15
+ from docintel.services.pdf.structure_schema import StructuredDocument
16
+
17
+
18
class ProgressCallback(Protocol):
    """Structural type for callables that receive progress notifications.

    Implementations are invoked with keyword arguments only.
    """

    def __call__(self, *, stage: str, pages_done: int, pages_total: int, message: str) -> None:
        """Report that ``pages_done`` of ``pages_total`` pages finished in ``stage``."""
        ...
27
+
28
+
29
def _ensure_ocr_stack() -> None:
    """Fail early with an install hint when EasyOCR is missing.

    Raises:
        RuntimeError: If ``easyocr`` cannot be imported; the original
            ``ImportError`` is attached as the cause.
    """
    install_hint = "OCR dependencies are not installed. Run: pip install -e '.[ocr]'"
    try:
        # Imported only to probe availability; the name is not used here.
        import easyocr  # noqa: F401
    except ImportError as missing:
        raise RuntimeError(install_hint) from missing
36
+
37
+
38
def _extract_page_text(page: fitz.Page, force_ocr: bool) -> tuple[str, bool]:
    """Return ``(text, used_ocr)`` for one page.

    Native text is used unless ``force_ocr`` is set or the page has no
    native text; otherwise the page is run through OCR.
    """
    if not force_ocr and page_has_native_text(page):
        return page.get_text("text"), False

    # OCR path: check the optional dependency before doing any work.
    _ensure_ocr_stack()
    recognised = extract_page_ocr(page)
    ocr_text, _ = build_indexed_text(recognised)
    return ocr_text, True
46
+
47
+
48
def structure_pdf(
    input_file: str | Path,
    output_file: str | Path,
    *,
    mode: StructureMode | str = StructureMode.CURATE,
    force_ocr: bool = False,
    redact_before_llm: bool = False,
    structure_fn: Callable[[list[tuple[int, str]]], StructuredDocument] | None = None,
    progress_callback: ProgressCallback | None = None,
    password: str | None = None,
) -> StructureResult:
    """
    Convert an unstructured or scanned PDF into a curated structured PDF.

    Uses EasyOCR when native text is missing, then an LLM to clean and structure
    content before rendering a new PDF (curate) or embedding a searchable layer
    (searchable).

    Args:
        input_file: PDF to read.
        output_file: Destination path for the rendered PDF.
        mode: A ``StructureMode`` or a value accepted by
            ``StructureMode.from_value``.
        force_ocr: Run OCR on every page, even when native text exists.
        redact_before_llm: Mask PII in each non-blank page text before it is
            handed to the structuring step.
        structure_fn: Optional replacement for the default
            ``structure_document`` call; receives ``(page_index, text)`` pairs.
        progress_callback: Called before each page, once after extraction,
            and forwarded to ``structure_document`` when that is used.
        password: Password forwarded to ``_open_pdf``.

    Returns:
        A ``StructureResult`` describing the output path, mode, page count,
        OCR page indexes and the structured document's title.
    """
    selected_mode = mode if isinstance(mode, StructureMode) else StructureMode.from_value(mode)

    pdf_doc = _open_pdf(input_file, password)
    page_texts: list[tuple[int, str]] = []
    ocr_pages: list[int] = []

    try:
        total_pages = pdf_doc.page_count
        for page_index in range(total_pages):
            if progress_callback is not None:
                progress_callback(
                    stage="extracting",
                    pages_done=page_index,
                    pages_total=total_pages,
                    message=f"Extracting page {page_index + 1} of {total_pages}",
                )
            text, used_ocr = _extract_page_text(pdf_doc[page_index], force_ocr=force_ocr)
            if used_ocr:
                ocr_pages.append(page_index)
            if redact_before_llm and text.strip():
                # Imported lazily so the PII stack is only needed when
                # redaction is actually requested.
                from docintel.services.pdf.pii import mask_pii_in_text

                text, _ = mask_pii_in_text(text)
            page_texts.append((page_index, text))

        if progress_callback is not None:
            progress_callback(
                stage="extracting",
                pages_done=total_pages,
                pages_total=total_pages,
                message="Extraction complete",
            )

        if structure_fn is not None:
            document = structure_fn(page_texts)
        else:
            document = structure_document(page_texts, progress_callback=progress_callback)

        output_path = Path(output_file)
        pages_processed = pdf_doc.page_count
        if selected_mode == StructureMode.SEARCHABLE:
            # Searchable mode writes into the source document itself.
            render_searchable_pdf(pdf_doc, document.pages, output_path)
        else:
            # Curate mode builds a brand-new PDF, so the source is no longer
            # needed once its text has been extracted.
            pdf_doc.close()
            render_curated_pdf(document, output_path)
    finally:
        # Release the source document on every exit path, including failures
        # in extraction, structuring or rendering. The guard avoids a second
        # close when it was already closed above or by the save helper.
        if not getattr(pdf_doc, "is_closed", False):
            pdf_doc.close()

    return StructureResult(
        input_path=str(input_file),
        output_path=str(output_path),
        mode=selected_mode,
        pages_processed=pages_processed,
        ocr_pages=ocr_pages,
        document_title=document.title,
    )
@@ -0,0 +1,136 @@
1
+ """LLM structuring for OCR and native PDF text."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ from typing import Any
8
+
9
+ from docintel.services.pdf.structure_schema import StructuredDocument, StructuredPage
10
+
11
# System message: constrains the model to faithful clean-up and JSON output.
STRUCTURE_SYSTEM_PROMPT = """You convert noisy OCR or unstructured PDF text into clean structured JSON.
Rules:
- Fix OCR typos only when the intended word is obvious from context.
- Do not invent facts, numbers, names, or clauses that are not in the source.
- Preserve reading order.
- Use headings only when the source clearly has a section title.
- Return valid JSON only, matching the schema exactly."""

# Per-page user message. Filled with str.format, so literal braces in the
# schema example are doubled.
STRUCTURE_USER_TEMPLATE = """Page number: {page_number}
Source text:
---
{source_text}
---

Return JSON with this schema:
{{
"page_title": "document or section title if visible on this page, else empty string",
"plain_text": "full cleaned page text in reading order with paragraph breaks",
"sections": [
{{
"heading": "section heading or empty string",
"level": 1,
"paragraphs": ["paragraph text"],
"list_items": ["bullet items"],
"tables": [{{"headers": ["col"], "rows": [["value"]]}}]
}}
]
}}"""
39
+
40
+
41
def _ensure_llm_stack() -> None:
    """Fail early with an install hint when the ``openai`` package is missing.

    Raises:
        RuntimeError: If ``openai`` cannot be imported; the original
            ``ImportError`` is attached as the cause.
    """
    install_hint = "LLM dependencies are not installed. Run: pip install -e '.[llm]'"
    try:
        # Imported only to probe availability; the name is not used here.
        import openai  # noqa: F401
    except ImportError as missing:
        raise RuntimeError(install_hint) from missing
48
+
49
+
50
def _llm_settings() -> tuple[str, str, str | None]:
    """Read the LLM connection settings from the environment.

    Returns:
        ``(api_key, model, base_url)``. ``model`` falls back to
        ``"gpt-4o-mini"`` and ``base_url`` to ``None`` when the respective
        variable is unset or blank.

    Raises:
        RuntimeError: If ``DOCINTEL_LLM_API_KEY`` is unset or blank.
    """
    api_key = os.getenv("DOCINTEL_LLM_API_KEY", "").strip()
    if not api_key:
        raise RuntimeError(
            "DOCINTEL_LLM_API_KEY is not set. Configure an OpenAI-compatible API key."
        )
    # A variable that is set but blank must not yield an empty model name, so
    # the default is applied after stripping (same approach as base_url).
    model = os.getenv("DOCINTEL_LLM_MODEL", "").strip() or "gpt-4o-mini"
    base_url = os.getenv("DOCINTEL_LLM_BASE_URL", "").strip() or None
    return api_key, model, base_url
59
+
60
+
61
def _parse_json_response(raw: str) -> dict[str, Any]:
    """Decode an LLM reply into a JSON object, tolerating a Markdown code fence.

    Raises:
        ValueError: If the decoded JSON is not an object (invalid JSON raises
            ``json.JSONDecodeError``, itself a ``ValueError``).
    """
    text = raw.strip()
    if text.startswith("```"):
        fenced = text.splitlines()
        # Drop the opening fence line (which may carry a language tag) ...
        if fenced and fenced[0].startswith("```"):
            del fenced[0]
        # ... and the closing fence, when it sits on a line of its own.
        if fenced and fenced[-1].strip() == "```":
            fenced.pop()
        text = "\n".join(fenced).strip()

    parsed = json.loads(text)
    if isinstance(parsed, dict):
        return parsed
    raise ValueError("LLM response must be a JSON object.")
74
+
75
+
76
def structure_page_text(page_index: int, source_text: str) -> StructuredPage:
    """Send one page of source text to the LLM and return structured output.

    The page number shown to the model is one-based, and only the first
    12000 characters of ``source_text`` are sent.
    """
    _ensure_llm_stack()
    api_key, model, base_url = _llm_settings()

    from openai import OpenAI

    # Only pass base_url when one is configured.
    if base_url:
        options: dict[str, Any] = {"api_key": api_key, "base_url": base_url}
    else:
        options = {"api_key": api_key}
    client = OpenAI(**options)

    prompt = STRUCTURE_USER_TEMPLATE.format(
        page_number=page_index + 1,
        source_text=source_text[:12000],
    )
    conversation = [
        {"role": "system", "content": STRUCTURE_SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model=model,
        temperature=0.1,
        response_format={"type": "json_object"},
        messages=conversation,
    )
    # An empty reply is treated as an empty JSON object.
    reply = completion.choices[0].message.content or "{}"
    return StructuredPage.from_llm_payload(page_index, _parse_json_response(reply))
104
+
105
+
106
def structure_document(
    page_texts: list[tuple[int, str]],
    *,
    progress_callback=None,
) -> StructuredDocument:
    """Structure each page with the LLM and merge into one document model.

    Pages whose text is blank are not sent to the LLM; they become empty
    ``StructuredPage`` entries. ``progress_callback``, when given, is called
    before each page and once after the last.
    """
    page_count = len(page_texts)

    def _report(done: int, message: str) -> None:
        # Single place for the optional progress notification.
        if progress_callback is not None:
            progress_callback(
                stage="structuring",
                pages_done=done,
                pages_total=page_count,
                message=message,
            )

    results: list[StructuredPage] = []
    for position, (page_index, raw_text) in enumerate(page_texts):
        body = raw_text.strip()
        _report(position, f"Structuring page {page_index + 1} of {page_count}")
        if body:
            results.append(structure_page_text(page_index, body))
        else:
            results.append(
                StructuredPage(page_index=page_index, title="", sections=[], plain_text="")
            )

    _report(page_count, "Structuring complete")
    return StructuredDocument.from_pages(results)
@@ -0,0 +1,136 @@
1
+ """Render structured documents into curated or searchable PDFs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+
8
+ import fitz
9
+
10
+ from docintel.services.pdf.annotator import _save_pdf
11
+ from docintel.services.pdf.structure_schema import SectionBlock, StructuredDocument, StructuredPage
12
+
13
+
14
@dataclass
class _RenderContext:
    """Mutable cursor used while laying out the curated PDF."""

    # Document being built.
    pdf: fitz.Document
    # Page currently receiving text.
    page: fitz.Page
    # Margin used for the left offset and the top/bottom limits.
    margin: float
    # Vertical position where the next line is written.
    y: float
20
+
21
+
22
def _new_page(ctx: _RenderContext) -> None:
    """Append a page to the document and move the cursor to its top margin."""
    fresh_page = ctx.pdf.new_page()
    ctx.page = fresh_page
    ctx.y = ctx.margin
25
+
26
+
27
def _ensure_space(ctx: _RenderContext, height: float) -> None:
    """Start a new page when ``height`` more points would cross the bottom margin."""
    bottom_limit = ctx.page.rect.height - ctx.margin
    if ctx.y + height > bottom_limit:
        _new_page(ctx)
30
+
31
+
32
def _write_line(ctx: _RenderContext, text: str, fontsize: float, indent: float = 0) -> None:
    """Write one line at the cursor and advance it; blank text is ignored.

    The cursor advances by 1.45 times ``fontsize``, and a new page is started
    first when the line would not fit.
    """
    if text.strip():
        advance = fontsize * 1.45
        _ensure_space(ctx, advance)
        origin = (ctx.margin + indent, ctx.y)
        ctx.page.insert_text(origin, text, fontsize=fontsize, fontname="helv")
        ctx.y += advance
44
+
45
+
46
def _write_wrapped_paragraph(ctx: _RenderContext, text: str, fontsize: float = 11, indent: float = 0) -> None:
    """Greedy word-wrap ``text`` to the page width and write it line by line.

    The character budget per line is estimated from the usable width and
    ``fontsize`` (never below 40). Blank text is ignored; otherwise 4 points
    of spacing follow the paragraph.
    """
    if not text.strip():
        return

    usable_width = ctx.page.rect.width - (2 * ctx.margin) - indent
    budget = max(40, int(usable_width / (fontsize * 0.55)))

    # First split the words into lines, then emit them in order.
    wrapped: list[str] = []
    current: list[str] = []
    used = 0
    for token in text.split():
        # A joining space is only needed when the line already has a word.
        cost = len(token) + (1 if current else 0)
        if current and used + cost > budget:
            wrapped.append(" ".join(current))
            current = [token]
            used = len(token)
        else:
            current.append(token)
            used += cost
    if current:
        wrapped.append(" ".join(current))

    for row in wrapped:
        _write_line(ctx, row, fontsize, indent=indent)
    ctx.y += 4
66
+
67
+
68
def _write_section(ctx: _RenderContext, section: SectionBlock) -> None:
    """Render one section: heading, paragraphs, list items, then tables.

    Heading size shrinks with ``section.level`` but never drops below 12.
    Tables are written as pipe-separated text rows.
    """
    heading = section.heading
    if heading:
        _write_line(ctx, heading, max(12, 18 - section.level))
        ctx.y += 4

    for body in section.paragraphs:
        _write_wrapped_paragraph(ctx, body)

    for bullet in section.list_items:
        _write_wrapped_paragraph(ctx, f"- {bullet}", indent=12)

    for grid in section.tables:
        header_cells = grid.headers
        if header_cells:
            _write_line(ctx, " | ".join(header_cells), 10)
        for cells in grid.rows:
            _write_line(ctx, " | ".join(cells), 10)
        ctx.y += 6
83
+
84
+
85
def render_curated_pdf(document: StructuredDocument, output_path: Path) -> None:
    """Build a new typeset PDF from structured content.

    The document title comes first, followed by each page's sections. A page
    without sections falls back to its plain text, one paragraph per line.
    """
    pdf = fitz.open()
    ctx = _RenderContext(pdf=pdf, page=pdf.new_page(), margin=72, y=72)

    _write_line(ctx, document.title, 18)
    ctx.y += 10

    for page_model in document.pages:
        sections = page_model.sections
        for block in sections:
            _write_section(ctx, block)
        # Plain text is only used when the page produced no sections.
        if not sections and page_model.plain_text:
            for row in page_model.plain_text.splitlines():
                _write_wrapped_paragraph(ctx, row)

    _save_pdf(pdf, output_path)
104
+
105
+
106
def render_searchable_pdf(
    source_doc: fitz.Document,
    pages: list[StructuredPage],
    output_path: Path,
) -> None:
    """Keep original page layout and embed an invisible curated text layer.

    Entries whose page index is outside the document, or whose text is blank,
    are skipped. Lines are written from 72 points down in 13-point steps, and
    writing stops for a page once the bottom 72-point margin is passed.
    """
    for entry in pages:
        if entry.page_index >= source_doc.page_count:
            continue
        target = source_doc[entry.page_index]
        body = entry.plain_text.strip()
        if not body:
            continue

        baseline = 72
        for raw_line in body.splitlines():
            content = raw_line.strip()
            if not content:
                continue
            if baseline > target.rect.height - 72:
                # No room left on this page; remaining lines are dropped.
                break
            # render_mode=3 inserts the text invisibly.
            target.insert_text(
                (72, baseline),
                content,
                fontsize=10,
                fontname="helv",
                render_mode=3,
            )
            baseline += 13

    _save_pdf(source_doc, output_path)