docintel-platform 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docintel/__init__.py +6 -0
- docintel/app.py +45 -0
- docintel/auth/__init__.py +12 -0
- docintel/auth/api_keys.py +48 -0
- docintel/auth/limiter.py +41 -0
- docintel/auth/middleware.py +34 -0
- docintel/auth/oidc.py +45 -0
- docintel/cli.py +21 -0
- docintel/client.py +193 -0
- docintel/config.py +20 -0
- docintel/jobs/__init__.py +16 -0
- docintel/jobs/helpers.py +38 -0
- docintel/jobs/models.py +78 -0
- docintel/jobs/queue.py +75 -0
- docintel/jobs/store.py +82 -0
- docintel/jobs/tasks.py +173 -0
- docintel/jobs/webhooks.py +32 -0
- docintel/openapi/__init__.py +1 -0
- docintel/openapi/openapi.yaml +380 -0
- docintel/ops/__init__.py +1 -0
- docintel/ops/logging.py +40 -0
- docintel/ops/metrics.py +57 -0
- docintel/ops/middleware.py +40 -0
- docintel/routes/__init__.py +1 -0
- docintel/routes/jobs.py +26 -0
- docintel/routes/match.py +43 -0
- docintel/routes/openapi_docs.py +57 -0
- docintel/routes/ops.py +22 -0
- docintel/routes/pdf.py +420 -0
- docintel/routes/text.py +41 -0
- docintel/services/__init__.py +1 -0
- docintel/services/matching/__init__.py +6 -0
- docintel/services/matching/models.py +19 -0
- docintel/services/matching/scorer.py +64 -0
- docintel/services/pdf/__init__.py +26 -0
- docintel/services/pdf/annotator.py +188 -0
- docintel/services/pdf/models.py +104 -0
- docintel/services/pdf/ocr.py +130 -0
- docintel/services/pdf/pii.py +105 -0
- docintel/services/pdf/presets.py +26 -0
- docintel/services/pdf/search.py +29 -0
- docintel/services/pdf/sensitive.py +212 -0
- docintel/services/pdf/structure.py +118 -0
- docintel/services/pdf/structure_llm.py +136 -0
- docintel/services/pdf/structure_render.py +136 -0
- docintel/services/pdf/structure_schema.py +99 -0
- docintel/services/summary/__init__.py +6 -0
- docintel/services/summary/models.py +21 -0
- docintel/services/summary/textrank.py +57 -0
- docintel/ui.py +347 -0
- docintel/wsgi.py +5 -0
- docintel_platform-1.0.2.dist-info/METADATA +607 -0
- docintel_platform-1.0.2.dist-info/RECORD +56 -0
- docintel_platform-1.0.2.dist-info/WHEEL +5 -0
- docintel_platform-1.0.2.dist-info/entry_points.txt +3 -0
- docintel_platform-1.0.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Default Presidio entity presets (extend via API or custom recognizers)."""
|
|
2
|
+
|
|
3
|
+
# Presidio entity types requested when a caller does not pass its own list.
# The selection targets HR, legal, and compliance workflows.
DEFAULT_PII_ENTITIES: tuple[str, ...] = (
    # Contact details
    "EMAIL_ADDRESS",
    "PHONE_NUMBER",
    # US government and financial identifiers
    "US_SSN",
    "CREDIT_CARD",
    "US_BANK_NUMBER",
    "US_DRIVER_LICENSE",
    "US_ITIN",
    "US_PASSPORT",
    # Free-text entities
    "PERSON",
    "LOCATION",
    "DATE_TIME",
    # Network, banking, and professional identifiers
    "IP_ADDRESS",
    "IBAN_CODE",
    "MEDICAL_LICENSE",
    "URL",
)

# A page with fewer extracted characters than this is treated as scanned,
# which triggers the OCR fallback.
MIN_NATIVE_TEXT_CHARS = 20

# Render scale used for EasyOCR; a higher value improves accuracy but
# increases memory use.
OCR_RENDER_SCALE = 2.0
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Text search helpers for PDF pages."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Iterable
|
|
7
|
+
|
|
8
|
+
import fitz
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def search_for_text(lines: Iterable[str], pattern: str) -> Iterable[str]:
    """Yield regex matches from each line (case insensitive).

    Args:
        lines: Text lines to scan, in order.
        pattern: Regular expression source; matched with ``re.IGNORECASE``.

    Yields:
        Every ``re.findall`` result, line by line. When the pattern contains
        capture groups, ``findall`` yields the group contents (a string for
        one group, a tuple for several) rather than the whole match.

    Raises:
        re.error: If ``pattern`` is not a valid regular expression. The
            pattern is compiled once when iteration starts, so an invalid
            pattern is reported even if ``lines`` is empty.
    """
    # Compile once instead of resolving the pattern again for every line.
    compiled = re.compile(pattern, re.IGNORECASE)
    for line in lines:
        yield from compiled.findall(line)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def extract_info(input_file: str) -> dict:
    """Return basic metadata for a PDF file.

    Args:
        input_file: Path of the PDF to inspect.

    Returns:
        A dict with ``file``, ``page_count`` and ``encrypted`` keys. For
        documents that are not encrypted, every non-empty entry of the
        document metadata is merged in as well.
    """
    # The context manager closes the document even if reading a property
    # raises; previously the handle leaked on any error before close().
    with fitz.open(input_file) as pdf_doc:
        info = {
            "file": input_file,
            "page_count": pdf_doc.page_count,
            "encrypted": pdf_doc.is_encrypted,
        }
        if not pdf_doc.is_encrypted:
            # Tolerate a missing metadata mapping instead of failing on None.
            metadata = pdf_doc.metadata or {}
            info.update({key: value for key, value in metadata.items() if value})
    return info
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
"""Scanned and native PDF sensitive-data detection with OCR + Presidio."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Protocol, Sequence
|
|
8
|
+
|
|
9
|
+
import fitz
|
|
10
|
+
|
|
11
|
+
from docintel.services.pdf.annotator import _open_pdf, _save_pdf, highlight_matches, redact_matches
|
|
12
|
+
from docintel.services.pdf.models import Action, PIIDetectionResult
|
|
13
|
+
from docintel.services.pdf.ocr import (
|
|
14
|
+
build_indexed_text,
|
|
15
|
+
embed_invisible_text_layer,
|
|
16
|
+
extract_page_ocr,
|
|
17
|
+
merge_rects,
|
|
18
|
+
page_has_native_text,
|
|
19
|
+
rects_for_char_range,
|
|
20
|
+
)
|
|
21
|
+
from docintel.services.pdf.pii import PIIHit, detect_pii_in_text
|
|
22
|
+
from docintel.services.pdf.search import search_for_text
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ProgressCallback(Protocol):
    """Structural type for progress reporters accepted by this module.

    Implementations are invoked with keyword arguments only.
    """

    def __call__(
        self,
        *,
        stage: str,
        pages_done: int,
        pages_total: int,
        message: str,
    ) -> None:
        """Report that ``pages_done`` of ``pages_total`` pages finished ``stage``."""
        ...
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _ensure_ocr_stack() -> None:
|
|
37
|
+
try:
|
|
38
|
+
import easyocr # noqa: F401
|
|
39
|
+
import presidio_analyzer # noqa: F401
|
|
40
|
+
except ImportError as exc:
|
|
41
|
+
raise RuntimeError(
|
|
42
|
+
"OCR and Presidio dependencies are not installed. "
|
|
43
|
+
"Run: pip install -e '.[ocr]' && python -m spacy download en_core_web_sm"
|
|
44
|
+
) from exc
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _apply_rect_action(page: fitz.Page, rect: fitz.Rect, action: Action) -> bool:
    """Add one annotation for ``rect`` to ``page`` according to ``action``.

    Redaction annotations are only added here; applying them is left to the
    caller. Any action without a dedicated branch falls back to a highlight.

    Returns:
        Always ``True``: every path adds an annotation.
    """
    if action == Action.REDACT:
        page.add_redact_annot(rect, text=" ", fill=(0, 0, 0))
        return True

    if action == Action.FRAME:
        frame = page.add_rect_annot(rect)
        frame.set_colors(stroke=fitz.utils.getColor("red"))
        frame.update()
        return True

    # The remaining actions are text-markup annotations that differ only in
    # the page method used to create them; highlight is the fallback.
    markup_factories = (
        (Action.UNDERLINE, page.add_underline_annot),
        (Action.SQUIGGLY, page.add_squiggly_annot),
        (Action.STRIKEOUT, page.add_strikeout_annot),
    )
    add_markup = page.add_highlight_annot
    for candidate, factory in markup_factories:
        if action == candidate:
            add_markup = factory
            break

    markup = add_markup([rect])
    markup.update()
    return True
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _annotate_rects(page: fitz.Page, rects: list[fitz.Rect], action: Action) -> int:
    """Annotate every rectangle on ``page`` and return how many were applied.

    For the redact action, the pending redactions are applied to the page
    once after all rectangles have been marked.
    """
    applied = sum(1 for rect in rects if _apply_rect_action(page, rect, action))
    if applied and action == Action.REDACT:
        page.apply_redactions()
    return applied
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _native_rects_for_hit(page: fitz.Page, hit: PIIHit) -> list[fitz.Rect]:
|
|
85
|
+
return page.search_for(hit.text)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _ocr_rects_for_hit(hit: PIIHit, indexed) -> list[fitz.Rect]:
    """Map a hit's character range onto OCR boxes and merge them into one rect.

    Returns a single-element list with the merged rectangle, or an empty list
    when the merge result is falsy.
    """
    merged = merge_rects(rects_for_char_range(hit.start, hit.end, indexed))
    if merged:
        return [merged]
    return []
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _regex_hits(page_text: str, pattern: str) -> list[PIIHit]:
    """Wrap every case-insensitive match of ``pattern`` in a ``PIIHit``.

    Each hit uses the entity type ``"REGEX"``, a score of 1.0, and the
    character offsets of the whole match within ``page_text``.
    """
    return [
        PIIHit(
            entity_type="REGEX",
            text=found.group(0),
            start=found.start(),
            end=found.end(),
            score=1.0,
        )
        for found in re.finditer(pattern, page_text, flags=re.IGNORECASE)
    ]
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def detect_sensitive_pdf(
    input_file: str | Path,
    output_file: str | Path,
    *,
    entities: Sequence[str] | None = None,
    action: Action | str = Action.HIGHLIGHT,
    force_ocr: bool = False,
    add_text_layer: bool = True,
    pattern: str | None = None,
    min_score: float = 0.35,
    password: str | None = None,
    progress_callback: ProgressCallback | None = None,
) -> PIIDetectionResult:
    """
    Detect sensitive information with Presidio (and optional regex), annotate PDF.

    Uses native PDF text when available. Falls back to EasyOCR for scanned pages.

    Args:
        input_file: PDF to scan.
        output_file: Destination of the annotated PDF.
        entities: Entity types forwarded to ``detect_pii_in_text``.
        action: Annotation to apply to each finding; a string is converted
            with ``Action.from_value``.
        force_ocr: Run OCR on every page, even when native text exists.
        add_text_layer: Embed the OCR text as an invisible layer on OCR pages.
        pattern: Optional regular expression whose matches are added to the
            detector hits.
        min_score: Minimum score forwarded to ``detect_pii_in_text``.
        password: Password forwarded to ``_open_pdf``.
        progress_callback: Called before each page and once after the last.

    Returns:
        A ``PIIDetectionResult`` with the annotation count, the zero-based
        indices of OCR pages, and one finding dict per reported hit.

    Raises:
        RuntimeError: If the OCR/Presidio stack is not installed.
        ValueError: If ``action`` resolves to ``Action.REMOVE``.
    """
    _ensure_ocr_stack()
    selected_action = action if isinstance(action, Action) else Action.from_value(action)
    if selected_action == Action.REMOVE:
        raise ValueError("Action 'Remove' is not supported for sensitive detection.")

    pdf_doc = _open_pdf(input_file, password)
    total_annotations = 0
    pages_processed = 0
    ocr_pages: list[int] = []
    findings: list[dict] = []
    saved = False

    try:
        total_pages = pdf_doc.page_count
        for page_index in range(total_pages):
            if progress_callback is not None:
                progress_callback(
                    stage="detecting",
                    pages_done=page_index,
                    pages_total=total_pages,
                    message=f"Processing page {page_index + 1} of {total_pages}",
                )
            page = pdf_doc[page_index]
            pages_processed += 1
            use_ocr = force_ocr or not page_has_native_text(page)

            if use_ocr:
                ocr_pages.append(page_index)
                ocr_spans = extract_page_ocr(page)
                if add_text_layer and ocr_spans:
                    embed_invisible_text_layer(page, ocr_spans)
                page_text, indexed = build_indexed_text(ocr_spans)
            else:
                indexed = []
                page_text = page.get_text("text")

            hits = detect_pii_in_text(page_text, entities=entities, min_score=min_score)
            if pattern:
                hits.extend(_regex_hits(page_text, pattern))

            # The native-page search matches every occurrence of the text, so
            # a value that repeats on the page would otherwise be annotated
            # (and counted) once per hit instead of once per occurrence.
            annotated_native_texts: set[str] = set()

            for hit in hits:
                already_marked = False
                if use_ocr:
                    rects = _ocr_rects_for_hit(hit, indexed)
                else:
                    text_key = hit.text.lower()
                    already_marked = text_key in annotated_native_texts
                    rects = [] if already_marked else _native_rects_for_hit(page, hit)
                    if rects:
                        annotated_native_texts.add(text_key)

                if not rects and not already_marked:
                    continue

                if rects:
                    total_annotations += _annotate_rects(page, rects, selected_action)
                findings.append(
                    {
                        "page": page_index,
                        "entity_type": hit.entity_type,
                        "text": hit.text,
                        "score": round(hit.score, 4),
                        "ocr_used": use_ocr,
                    }
                )

            # Legacy regex path for native PDFs when pattern matches line fragments
            if pattern and not use_ocr:
                matched_values = list(search_for_text(page_text.split("\n"), pattern))
                if matched_values:
                    if selected_action == Action.REDACT:
                        total_annotations += redact_matches(page, matched_values)
                    else:
                        total_annotations += highlight_matches(page, matched_values, selected_action)

        if progress_callback is not None:
            progress_callback(
                stage="detecting",
                pages_done=total_pages,
                pages_total=total_pages,
                message="Detection complete",
            )

        _save_pdf(pdf_doc, Path(output_file))
        saved = True
    finally:
        # On failure, release the document instead of leaking the handle.
        # The success path is unchanged: _save_pdf keeps ownership of it.
        if not saved and not getattr(pdf_doc, "is_closed", True):
            pdf_doc.close()

    return PIIDetectionResult(
        input_path=str(input_file),
        output_path=str(output_file),
        action=selected_action,
        matches=total_annotations,
        pages_processed=pages_processed,
        ocr_pages=ocr_pages,
        findings=findings,
    )
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""LLM-backed PDF structuring: unstructured scan to curated digital PDF."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Callable, Protocol
|
|
7
|
+
|
|
8
|
+
import fitz
|
|
9
|
+
|
|
10
|
+
from docintel.services.pdf.annotator import _open_pdf
|
|
11
|
+
from docintel.services.pdf.models import StructureMode, StructureResult
|
|
12
|
+
from docintel.services.pdf.ocr import build_indexed_text, extract_page_ocr, page_has_native_text
|
|
13
|
+
from docintel.services.pdf.structure_llm import structure_document
|
|
14
|
+
from docintel.services.pdf.structure_render import render_curated_pdf, render_searchable_pdf
|
|
15
|
+
from docintel.services.pdf.structure_schema import StructuredDocument
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ProgressCallback(Protocol):
    """Structural type for progress reporters used while structuring a PDF.

    Implementations are invoked with keyword arguments only.
    """

    def __call__(
        self,
        *,
        stage: str,
        pages_done: int,
        pages_total: int,
        message: str,
    ) -> None:
        """Report that ``pages_done`` of ``pages_total`` pages finished ``stage``."""
        ...
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _ensure_ocr_stack() -> None:
|
|
30
|
+
try:
|
|
31
|
+
import easyocr # noqa: F401
|
|
32
|
+
except ImportError as exc:
|
|
33
|
+
raise RuntimeError(
|
|
34
|
+
"OCR dependencies are not installed. Run: pip install -e '.[ocr]'"
|
|
35
|
+
) from exc
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _extract_page_text(page: fitz.Page, force_ocr: bool) -> tuple[str, bool]:
    """Return the page text and whether OCR produced it.

    Native text is used unless ``force_ocr`` is set or the page has no native
    text; in those cases the page is OCR'd and the span text is joined.
    """
    if not force_ocr and page_has_native_text(page):
        return page.get_text("text"), False

    _ensure_ocr_stack()
    ocr_text, _ = build_indexed_text(extract_page_ocr(page))
    return ocr_text, True
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def structure_pdf(
    input_file: str | Path,
    output_file: str | Path,
    *,
    mode: StructureMode | str = StructureMode.CURATE,
    force_ocr: bool = False,
    redact_before_llm: bool = False,
    structure_fn: Callable[[list[tuple[int, str]]], StructuredDocument] | None = None,
    progress_callback: ProgressCallback | None = None,
    password: str | None = None,
) -> StructureResult:
    """
    Convert an unstructured or scanned PDF into a curated structured PDF.

    Uses EasyOCR when native text is missing, then an LLM to clean and structure
    content before rendering a new PDF (curate) or embedding a searchable layer
    (searchable).

    Args:
        input_file: PDF to read.
        output_file: Destination of the rendered PDF.
        mode: Output mode; a string is converted with
            ``StructureMode.from_value``.
        force_ocr: Run OCR on every page, even when native text exists.
        redact_before_llm: Mask PII in each page's text before structuring.
        structure_fn: Optional replacement for the default LLM structuring
            step; receives the ``(page_index, text)`` list.
        progress_callback: Called before each page, after extraction, and
            forwarded to the default structuring step.
        password: Password forwarded to ``_open_pdf``.

    Returns:
        A ``StructureResult`` describing the output, the zero-based indices
        of OCR pages, and the document title.
    """
    selected_mode = mode if isinstance(mode, StructureMode) else StructureMode.from_value(mode)

    pdf_doc = _open_pdf(input_file, password)
    page_texts: list[tuple[int, str]] = []
    ocr_pages: list[int] = []
    completed = False

    try:
        total_pages = pdf_doc.page_count
        for page_index in range(total_pages):
            if progress_callback is not None:
                progress_callback(
                    stage="extracting",
                    pages_done=page_index,
                    pages_total=total_pages,
                    message=f"Extracting page {page_index + 1} of {total_pages}",
                )
            text, used_ocr = _extract_page_text(pdf_doc[page_index], force_ocr=force_ocr)
            if used_ocr:
                ocr_pages.append(page_index)
            if redact_before_llm and text.strip():
                # Imported lazily so the PII stack is only needed when masking.
                from docintel.services.pdf.pii import mask_pii_in_text

                text, _ = mask_pii_in_text(text)
            page_texts.append((page_index, text))

        if progress_callback is not None:
            progress_callback(
                stage="extracting",
                pages_done=total_pages,
                pages_total=total_pages,
                message="Extraction complete",
            )

        if structure_fn is not None:
            document = structure_fn(page_texts)
        else:
            document = structure_document(page_texts, progress_callback=progress_callback)

        output_path = Path(output_file)
        pages_processed = pdf_doc.page_count
        if selected_mode == StructureMode.SEARCHABLE:
            render_searchable_pdf(pdf_doc, document.pages, output_path)
        else:
            # The curated output is a new document; the source is not needed.
            pdf_doc.close()
            render_curated_pdf(document, output_path)
        completed = True
    finally:
        # If any step failed, release the source document instead of leaking
        # it. The is_closed check avoids closing it twice on the curate path.
        if not completed and not getattr(pdf_doc, "is_closed", True):
            pdf_doc.close()

    return StructureResult(
        input_path=str(input_file),
        output_path=str(output_path),
        mode=selected_mode,
        pages_processed=pages_processed,
        ocr_pages=ocr_pages,
        document_title=document.title,
    )
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""LLM structuring for OCR and native PDF text."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from docintel.services.pdf.structure_schema import StructuredDocument, StructuredPage
|
|
10
|
+
|
|
11
|
+
# System message sent with every structuring request.
STRUCTURE_SYSTEM_PROMPT = """You convert noisy OCR or unstructured PDF text into clean structured JSON.
Rules:
- Fix OCR typos only when the intended word is obvious from context.
- Do not invent facts, numbers, names, or clauses that are not in the source.
- Preserve reading order.
- Use headings only when the source clearly has a section title.
- Return valid JSON only, matching the schema exactly."""

# Per-page user message, filled in with str.format(page_number=..., source_text=...).
# Literal braces of the JSON schema are doubled so that format() leaves them intact.
STRUCTURE_USER_TEMPLATE = """Page number: {page_number}
Source text:
---
{source_text}
---

Return JSON with this schema:
{{
"page_title": "document or section title if visible on this page, else empty string",
"plain_text": "full cleaned page text in reading order with paragraph breaks",
"sections": [
{{
"heading": "section heading or empty string",
"level": 1,
"paragraphs": ["paragraph text"],
"list_items": ["bullet items"],
"tables": [{{"headers": ["col"], "rows": [["value"]]}}]
}}
]
}}"""
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _ensure_llm_stack() -> None:
|
|
42
|
+
try:
|
|
43
|
+
import openai # noqa: F401
|
|
44
|
+
except ImportError as exc:
|
|
45
|
+
raise RuntimeError(
|
|
46
|
+
"LLM dependencies are not installed. Run: pip install -e '.[llm]'"
|
|
47
|
+
) from exc
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _llm_settings() -> tuple[str, str, str | None]:
|
|
51
|
+
api_key = os.getenv("DOCINTEL_LLM_API_KEY", "").strip()
|
|
52
|
+
if not api_key:
|
|
53
|
+
raise RuntimeError(
|
|
54
|
+
"DOCINTEL_LLM_API_KEY is not set. Configure an OpenAI-compatible API key."
|
|
55
|
+
)
|
|
56
|
+
model = os.getenv("DOCINTEL_LLM_MODEL", "gpt-4o-mini").strip()
|
|
57
|
+
base_url = os.getenv("DOCINTEL_LLM_BASE_URL", "").strip() or None
|
|
58
|
+
return api_key, model, base_url
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _parse_json_response(raw: str) -> dict[str, Any]:
|
|
62
|
+
cleaned = raw.strip()
|
|
63
|
+
if cleaned.startswith("```"):
|
|
64
|
+
lines = cleaned.splitlines()
|
|
65
|
+
if lines and lines[0].startswith("```"):
|
|
66
|
+
lines = lines[1:]
|
|
67
|
+
if lines and lines[-1].strip() == "```":
|
|
68
|
+
lines = lines[:-1]
|
|
69
|
+
cleaned = "\n".join(lines).strip()
|
|
70
|
+
payload = json.loads(cleaned)
|
|
71
|
+
if not isinstance(payload, dict):
|
|
72
|
+
raise ValueError("LLM response must be a JSON object.")
|
|
73
|
+
return payload
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def structure_page_text(page_index: int, source_text: str) -> StructuredPage:
    """Structure a single page of text with the configured chat model.

    Args:
        page_index: Zero-based page index; the prompt shows it one-based.
        source_text: Page text; only the first 12000 characters are sent.

    Returns:
        The ``StructuredPage`` built from the model's JSON reply.
    """
    _ensure_llm_stack()
    api_key, model, base_url = _llm_settings()

    from openai import OpenAI

    # Pass base_url only when an endpoint override is configured.
    client = OpenAI(api_key=api_key, base_url=base_url) if base_url else OpenAI(api_key=api_key)

    conversation = [
        {"role": "system", "content": STRUCTURE_SYSTEM_PROMPT},
        {
            "role": "user",
            "content": STRUCTURE_USER_TEMPLATE.format(
                page_number=page_index + 1,
                source_text=source_text[:12000],
            ),
        },
    ]
    response = client.chat.completions.create(
        model=model,
        temperature=0.1,
        response_format={"type": "json_object"},
        messages=conversation,
    )
    # An empty reply is treated as an empty JSON object.
    reply = response.choices[0].message.content or "{}"
    return StructuredPage.from_llm_payload(page_index, _parse_json_response(reply))
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def structure_document(
    page_texts: list[tuple[int, str]],
    *,
    progress_callback=None,
) -> StructuredDocument:
    """Structure every page with the LLM and combine them into one document.

    Pages whose text is blank are not sent to the model; they become empty
    ``StructuredPage`` placeholders so page positions are preserved.
    """
    page_count = len(page_texts)

    def report(done: int, message: str) -> None:
        # Forward progress only when a callback was supplied.
        if progress_callback is not None:
            progress_callback(
                stage="structuring",
                pages_done=done,
                pages_total=page_count,
                message=message,
            )

    results: list[StructuredPage] = []
    for position, (page_index, raw_text) in enumerate(page_texts):
        body = raw_text.strip()
        report(position, f"Structuring page {page_index + 1} of {page_count}")
        if body:
            results.append(structure_page_text(page_index, body))
        else:
            results.append(
                StructuredPage(page_index=page_index, title="", sections=[], plain_text="")
            )

    report(page_count, "Structuring complete")
    return StructuredDocument.from_pages(results)
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Render structured documents into curated or searchable PDFs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import fitz
|
|
9
|
+
|
|
10
|
+
from docintel.services.pdf.annotator import _save_pdf
|
|
11
|
+
from docintel.services.pdf.structure_schema import SectionBlock, StructuredDocument, StructuredPage
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
|
|
15
|
+
class _RenderContext:
|
|
16
|
+
pdf: fitz.Document
|
|
17
|
+
page: fitz.Page
|
|
18
|
+
margin: float
|
|
19
|
+
y: float
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _new_page(ctx: _RenderContext) -> None:
|
|
23
|
+
ctx.page = ctx.pdf.new_page()
|
|
24
|
+
ctx.y = ctx.margin
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _ensure_space(ctx: _RenderContext, height: float) -> None:
    """Start a new page when ``height`` would cross the bottom margin."""
    bottom_limit = ctx.page.rect.height - ctx.margin
    overflows = ctx.y + height > bottom_limit
    if overflows:
        _new_page(ctx)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _write_line(ctx: _RenderContext, text: str, fontsize: float, indent: float = 0) -> None:
    """Write one line of text at the cursor and advance the cursor.

    Blank text is skipped without moving the cursor. The line height is 1.45
    times the font size.
    """
    if text.strip():
        line_height = fontsize * 1.45
        _ensure_space(ctx, line_height)
        # Computed after the space check, which may have moved to a new page.
        origin = (ctx.margin + indent, ctx.y)
        ctx.page.insert_text(origin, text, fontsize=fontsize, fontname="helv")
        ctx.y += line_height
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _write_wrapped_paragraph(ctx: _RenderContext, text: str, fontsize: float = 11, indent: float = 0) -> None:
    """Write ``text`` as a greedily word-wrapped paragraph.

    The wrap width is a character estimate derived from the usable page width
    (never fewer than 40 characters). A 4-unit gap follows the paragraph.
    Blank text is skipped entirely.
    """
    if not text.strip():
        return

    usable_width = ctx.page.rect.width - (2 * ctx.margin) - indent
    max_chars = max(40, int(usable_width / (fontsize * 0.55)))

    # Wrapping does not depend on the cursor, so the rows are built first and
    # written afterwards in the same order.
    words = text.split()
    rows: list[str] = []
    current = words[0]
    for word in words[1:]:
        if len(current) + 1 + len(word) > max_chars:
            rows.append(current)
            current = word
        else:
            current = f"{current} {word}"
    rows.append(current)

    for row in rows:
        _write_line(ctx, row, fontsize, indent=indent)
    ctx.y += 4
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _write_section(ctx: _RenderContext, section: SectionBlock) -> None:
    """Render a section: heading, paragraphs, list items, then tables."""
    if section.heading:
        # Deeper levels get smaller headings, but never below 12.
        _write_line(ctx, section.heading, max(12, 18 - section.level))
        ctx.y += 4

    for paragraph in section.paragraphs:
        _write_wrapped_paragraph(ctx, paragraph)

    for item in section.list_items:
        _write_wrapped_paragraph(ctx, f"- {item}", indent=12)

    for table in section.tables:
        # The header row, when present, is rendered like a body row.
        table_rows = ([table.headers] if table.headers else []) + list(table.rows)
        for cells in table_rows:
            _write_line(ctx, " | ".join(cells), 10)
        ctx.y += 6
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def render_curated_pdf(document: StructuredDocument, output_path: Path) -> None:
    """Typeset the structured document into a brand-new PDF and save it.

    The title comes first. Each page then contributes its sections, or, when
    it has none, its plain text wrapped line by line.
    """
    pdf = fitz.open()
    ctx = _RenderContext(pdf=pdf, page=pdf.new_page(), margin=72, y=72)

    _write_line(ctx, document.title, 18)
    ctx.y += 10

    for structured_page in document.pages:
        if structured_page.sections:
            for section in structured_page.sections:
                _write_section(ctx, section)
        elif structured_page.plain_text:
            # Fallback for pages the model returned without sections.
            for text_line in structured_page.plain_text.splitlines():
                _write_wrapped_paragraph(ctx, text_line)

    _save_pdf(pdf, output_path)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def render_searchable_pdf(
    source_doc: fitz.Document,
    pages: list[StructuredPage],
    output_path: Path,
) -> None:
    """Overlay curated text on the original pages as an invisible layer.

    Text is inserted with ``render_mode=3`` starting 72 units from the top and
    left edges. Lines that would start below the bottom margin are dropped.
    Pages with an out-of-range index or no text are skipped.
    """
    for structured_page in pages:
        if structured_page.page_index >= source_doc.page_count:
            continue
        page = source_doc[structured_page.page_index]
        text = structured_page.plain_text.strip()
        if not text:
            continue

        bottom_limit = page.rect.height - 72
        cursor_y = 72
        stripped_lines = (raw_line.strip() for raw_line in text.splitlines())
        for cleaned in filter(None, stripped_lines):
            if cursor_y > bottom_limit:
                break
            page.insert_text(
                (72, cursor_y),
                cleaned,
                fontsize=10,
                fontname="helv",
                render_mode=3,
            )
            cursor_y += 13

    _save_pdf(source_doc, output_path)
|