docintel-platform 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docintel/__init__.py +6 -0
- docintel/app.py +45 -0
- docintel/auth/__init__.py +12 -0
- docintel/auth/api_keys.py +48 -0
- docintel/auth/limiter.py +41 -0
- docintel/auth/middleware.py +34 -0
- docintel/auth/oidc.py +45 -0
- docintel/cli.py +21 -0
- docintel/client.py +193 -0
- docintel/config.py +20 -0
- docintel/jobs/__init__.py +16 -0
- docintel/jobs/helpers.py +38 -0
- docintel/jobs/models.py +78 -0
- docintel/jobs/queue.py +75 -0
- docintel/jobs/store.py +82 -0
- docintel/jobs/tasks.py +173 -0
- docintel/jobs/webhooks.py +32 -0
- docintel/openapi/__init__.py +1 -0
- docintel/openapi/openapi.yaml +380 -0
- docintel/ops/__init__.py +1 -0
- docintel/ops/logging.py +40 -0
- docintel/ops/metrics.py +57 -0
- docintel/ops/middleware.py +40 -0
- docintel/routes/__init__.py +1 -0
- docintel/routes/jobs.py +26 -0
- docintel/routes/match.py +43 -0
- docintel/routes/openapi_docs.py +57 -0
- docintel/routes/ops.py +22 -0
- docintel/routes/pdf.py +420 -0
- docintel/routes/text.py +41 -0
- docintel/services/__init__.py +1 -0
- docintel/services/matching/__init__.py +6 -0
- docintel/services/matching/models.py +19 -0
- docintel/services/matching/scorer.py +64 -0
- docintel/services/pdf/__init__.py +26 -0
- docintel/services/pdf/annotator.py +188 -0
- docintel/services/pdf/models.py +104 -0
- docintel/services/pdf/ocr.py +130 -0
- docintel/services/pdf/pii.py +105 -0
- docintel/services/pdf/presets.py +26 -0
- docintel/services/pdf/search.py +29 -0
- docintel/services/pdf/sensitive.py +212 -0
- docintel/services/pdf/structure.py +118 -0
- docintel/services/pdf/structure_llm.py +136 -0
- docintel/services/pdf/structure_render.py +136 -0
- docintel/services/pdf/structure_schema.py +99 -0
- docintel/services/summary/__init__.py +6 -0
- docintel/services/summary/models.py +21 -0
- docintel/services/summary/textrank.py +57 -0
- docintel/ui.py +347 -0
- docintel/wsgi.py +5 -0
- docintel_platform-1.0.2.dist-info/METADATA +607 -0
- docintel_platform-1.0.2.dist-info/RECORD +56 -0
- docintel_platform-1.0.2.dist-info/WHEEL +5 -0
- docintel_platform-1.0.2.dist-info/entry_points.txt +3 -0
- docintel_platform-1.0.2.dist-info/top_level.txt +1 -0
docintel/routes/text.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Text summarization API routes."""
|
|
2
|
+
|
|
3
|
+
from flask import Blueprint, jsonify, request
|
|
4
|
+
|
|
5
|
+
from docintel.auth.limiter import limiter
|
|
6
|
+
from docintel.services.summary import summarize_text
|
|
7
|
+
from docintel.services.summary.textrank import DEFAULT_SENTENCE_COUNT, MAX_SENTENCE_COUNT
|
|
8
|
+
|
|
9
|
+
text_bp = Blueprint("text", __name__, url_prefix="/v1/text")


def _bad_request(message):
    """Build a 400 JSON response carrying *message* under the ``error`` key."""
    return jsonify({"error": message}), 400


@text_bp.post("/summarize")
@limiter.limit("100 per hour")
def summarize():
    """Summarize posted plain text and return the result as JSON.

    The request body must be a JSON object. ``text`` (default ``""``) must be
    a string; ``sentences`` (default ``DEFAULT_SENTENCE_COUNT``) is coerced
    with ``int()`` and must lie between 1 and ``MAX_SENTENCE_COUNT``.

    Returns:
        A ``(response, status)`` pair: 200 with ``status: "ok"`` merged with
        the summary's ``to_dict()`` output, or 400 with an ``error`` message
        when validation fails or the summarizer raises ``ValueError``.
    """
    body = request.get_json(silent=True)
    if not isinstance(body, dict):
        return _bad_request("Request body must be JSON.")

    source_text = body.get("text", "")
    requested = body.get("sentences", DEFAULT_SENTENCE_COUNT)

    if not isinstance(source_text, str):
        return _bad_request("Field 'text' must be a string.")

    try:
        wanted = int(requested)
    except (TypeError, ValueError):
        return _bad_request("Field 'sentences' must be an integer.")

    if not 1 <= wanted <= MAX_SENTENCE_COUNT:
        return _bad_request(
            f"Field 'sentences' must be between 1 and {MAX_SENTENCE_COUNT}."
        )

    try:
        summary = summarize_text(text=source_text, sentence_count=wanted)
    except ValueError as exc:
        # The summarizer's own validation message is passed to the client.
        return _bad_request(str(exc))

    return jsonify({"status": "ok", **summary.to_dict()}), 200
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Document processing services."""
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Types for resume matching."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass(frozen=True)
class MatchResult:
    """Immutable outcome of a resume match: a score plus two keyword lists."""

    score: float
    matched_keywords: list[str]
    missing_keywords: list[str]

    def to_dict(self) -> dict:
        """Return the three fields as a plain dict keyed by field name."""
        field_names = ("score", "matched_keywords", "missing_keywords")
        return {name: getattr(self, name) for name in field_names}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""TF-IDF resume matching engine."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
6
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
7
|
+
|
|
8
|
+
from docintel.services.matching.models import MatchResult
|
|
9
|
+
|
|
10
|
+
DEFAULT_TOP_KEYWORDS = 25

# A token starts with an ASCII letter, is at least two characters long and may
# contain digits, "+", "#" and ".".  It must end on a letter, digit, "+" or
# "#" that is not followed by another word character.  The previous pattern
# ended in r"\b", which can never succeed right after "+" or "#" when
# whitespace or punctuation follows, so skills such as "c++" and "c#" were
# silently dropped even though the character class listed those symbols.
TOKEN_PATTERN = r"(?u)\b[a-zA-Z][a-zA-Z0-9+#.]*[a-zA-Z0-9+#](?!\w)"


def _clean_text(text: str) -> str:
    """Trim *text* and collapse every run of whitespace to a single space."""
    return " ".join(text.strip().split())


def match_resume_to_job(
    resume: str,
    job_description: str,
    *,
    top_keywords: int = DEFAULT_TOP_KEYWORDS,
) -> MatchResult:
    """Score resume fit against a job description using TF-IDF cosine similarity.

    Both texts are whitespace-normalized, vectorized together with English
    stop words removed, and compared. Every term carrying weight in the job
    description is then classified as matched (also weighted in the resume)
    or missing.

    Args:
        resume: Resume text.
        job_description: Job description text.
        top_keywords: Maximum number of matched and of missing keywords to
            return; values below 1 are treated as 1.

    Returns:
        A ``MatchResult`` whose ``score`` is the cosine similarity scaled to
        0-100 and rounded to two decimals, with both keyword lists ordered by
        descending job-description weight.

    Raises:
        ValueError: If either text is empty after normalization.
    """
    resume_text = _clean_text(resume)
    job_text = _clean_text(job_description)

    if not resume_text:
        raise ValueError("Resume text is required.")
    if not job_text:
        raise ValueError("Job description text is required.")

    # NOTE(review): the vectorizer may raise on inputs that contain no usable
    # tokens at all; confirm how callers want that surfaced.
    vectorizer = TfidfVectorizer(
        stop_words="english",
        token_pattern=TOKEN_PATTERN,
    )
    matrix = vectorizer.fit_transform([resume_text, job_text])
    similarity = cosine_similarity(matrix[0:1], matrix[1:2])[0][0]
    score = round(float(similarity) * 100, 2)

    features = vectorizer.get_feature_names_out()
    resume_weights = matrix[0].toarray()[0]
    job_weights = matrix[1].toarray()[0]

    matched: list[tuple[str, float]] = []
    missing: list[tuple[str, float]] = []

    for term, resume_weight, job_weight in zip(features, resume_weights, job_weights):
        if job_weight <= 0:
            # Term occurs only in the resume; it says nothing about job fit.
            continue
        bucket = matched if resume_weight > 0 else missing
        bucket.append((term, job_weight))

    matched.sort(key=lambda item: item[1], reverse=True)
    missing.sort(key=lambda item: item[1], reverse=True)

    limit = max(1, top_keywords)
    return MatchResult(
        score=score,
        matched_keywords=[term for term, _ in matched[:limit]],
        missing_keywords=[term for term, _ in missing[:limit]],
    )
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""PDF search and annotation service."""
|
|
2
|
+
|
|
3
|
+
from docintel.services.pdf.annotator import PDFAnnotator, annotate_pdf
|
|
4
|
+
from docintel.services.pdf.models import Action, PIIDetectionResult, ProcessResult, StructureMode, StructureResult
|
|
5
|
+
from docintel.services.pdf.pii import detect_pii_in_text, list_supported_entities
|
|
6
|
+
from docintel.services.pdf.presets import DEFAULT_PII_ENTITIES
|
|
7
|
+
from docintel.services.pdf.search import extract_info, search_for_text
|
|
8
|
+
from docintel.services.pdf.sensitive import detect_sensitive_pdf
|
|
9
|
+
from docintel.services.pdf.structure import structure_pdf
|
|
10
|
+
|
|
11
|
+
# Public API of this package: every name listed here is imported above and is
# what ``from docintel.services.pdf import *`` exposes.
__all__ = [
    "Action",
    "DEFAULT_PII_ENTITIES",
    "PDFAnnotator",
    "PIIDetectionResult",
    "ProcessResult",
    "StructureMode",
    "StructureResult",
    "annotate_pdf",
    "detect_pii_in_text",
    "detect_sensitive_pdf",
    "extract_info",
    "list_supported_entities",
    "search_for_text",
    "structure_pdf",
]
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"""Core PDF annotation engine."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Iterable, Sequence
|
|
7
|
+
|
|
8
|
+
import fitz
|
|
9
|
+
|
|
10
|
+
from docintel.services.pdf.models import Action, ProcessResult
|
|
11
|
+
from docintel.services.pdf.search import search_for_text
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _normalize_pages(pages: Sequence[int | str] | None) -> list[int] | None:
    """Coerce page selectors to ``int``; ``None`` (no filter) passes through."""
    return None if pages is None else list(map(int, pages))
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _open_pdf(path: str | Path, password: str | None = None) -> fitz.Document:
    """Open the PDF at *path*, unlocking it with *password* when encrypted.

    Raises:
        FileNotFoundError: If *path* is not an existing file.
        PermissionError: If the document is encrypted and no password was
            given or authentication failed; the document is closed first.
    """
    source = Path(path)
    if not source.is_file():
        raise FileNotFoundError(f"PDF not found: {source}")

    document = fitz.open(source)
    if not document.is_encrypted:
        return document
    if password and document.authenticate(password):
        return document

    # Could not unlock: release the handle before reporting the failure.
    document.close()
    raise PermissionError(f"PDF is encrypted: {source}")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _save_pdf(pdf_doc: fitz.Document, output_path: Path) -> None:
    """Write *pdf_doc* to *output_path* and close the document.

    Missing parent directories of *output_path* are created. The document is
    closed whether or not saving succeeds, so a failed write no longer leaves
    the document open; the original exception still propagates.
    """
    try:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        pdf_doc.save(output_path)
    finally:
        pdf_doc.close()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def redact_matches(page: fitz.Page, matched_values: Iterable[str]) -> int:
    """Redact every occurrence of each matched value on *page*.

    Each located area gets a redaction annotation with a single-space
    replacement text and a ``(0, 0, 0)`` fill. Redactions are applied once at
    the end, and only if something was found.

    Returns:
        The number of values that were located at least once (not the number
        of areas).
    """
    located_values = 0
    for needle in matched_values:
        rects = page.search_for(needle)
        if rects:
            located_values += 1
            for rect in rects:
                page.add_redact_annot(rect, text=" ", fill=(0, 0, 0))
    if located_values:
        page.apply_redactions()
    return located_values
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def frame_matches(page: fitz.Page, matched_values: Iterable[str]) -> int:
    """Draw a red-stroked rectangle around every occurrence of each value.

    Returns:
        The number of values that were located at least once on *page*.
    """
    framed_values = 0
    for needle in matched_values:
        rects = page.search_for(needle)
        if rects:
            framed_values += 1
            for rect in rects:
                outline = page.add_rect_annot(rect)
                outline.set_colors(stroke=fitz.utils.getColor("red"))
                outline.update()
    return framed_values
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def highlight_matches(page: fitz.Page, matched_values: Iterable[str], action: Action) -> int:
    """Add one text-markup annotation per located value on *page*.

    *action* selects squiggly, underline or strikeout markup; any other
    action falls back to a highlight. All areas found for a value are passed
    to a single annotation call.

    Returns:
        The number of values that were located at least once on *page*.
    """
    marked_values = 0
    for needle in matched_values:
        rects = page.search_for(needle)
        if rects:
            marked_values += 1
            if action == Action.SQUIGGLY:
                add_markup = page.add_squiggly_annot
            elif action == Action.UNDERLINE:
                add_markup = page.add_underline_annot
            elif action == Action.STRIKEOUT:
                add_markup = page.add_strikeout_annot
            else:
                add_markup = page.add_highlight_annot
            add_markup(rects).update()
    return marked_values
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def remove_annotations(
    input_file: str | Path,
    output_file: str | Path,
    pages: Sequence[int] | None = None,
    password: str | None = None,
) -> ProcessResult:
    """Delete all annotations from the selected pages and save the result.

    Args:
        input_file: PDF to read.
        output_file: Destination for the cleaned PDF.
        pages: Page indexes to process, compared against the document's
            0-based page index; ``None`` processes every page.
        password: Password used if the PDF is encrypted.

    Returns:
        A ``ProcessResult`` with action ``Action.REMOVE`` whose ``matches`` is
        the number of annotations deleted.

    Raises:
        FileNotFoundError, PermissionError: Propagated from opening the PDF.
    """
    pdf_doc = _open_pdf(input_file, password)
    removed = 0
    pages_processed = 0
    try:
        page_list = _normalize_pages(pages)
        # A set gives a constant-time membership test per page.
        wanted = None if page_list is None else set(page_list)

        for page_index in range(pdf_doc.page_count):
            if wanted is not None and page_index not in wanted:
                continue
            pages_processed += 1
            page = pdf_doc[page_index]
            annot = page.first_annot
            while annot:
                removed += 1
                # Fetch the successor before deleting the current annotation.
                next_annot = annot.next
                page.delete_annot(annot)
                annot = next_annot
    except BaseException:
        # Do not leave the document open when processing fails part-way.
        pdf_doc.close()
        raise

    # _save_pdf closes the document itself, so it stays outside the guard
    # above to avoid closing it twice.
    _save_pdf(pdf_doc, Path(output_file))
    return ProcessResult(
        input_path=str(input_file),
        output_path=str(output_file),
        action=Action.REMOVE,
        matches=removed,
        pages_processed=pages_processed,
    )
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class PDFAnnotator:
    """Reusable annotation settings that can be applied to any number of PDFs."""

    def __init__(
        self,
        pattern: str,
        action: Action | str = Action.HIGHLIGHT,
        pages: Sequence[int | str] | None = None,
        password: str | None = None,
    ):
        """Store the search pattern, the action, the page filter and password.

        A string *action* is resolved through ``Action.from_value``; *pages*
        entries are coerced to ``int``.
        """
        self.pattern = pattern
        if isinstance(action, Action):
            self.action = action
        else:
            self.action = Action.from_value(action)
        self.pages = _normalize_pages(pages)
        self.password = password

    def annotate(self, input_file: str | Path, output_file: str | Path) -> ProcessResult:
        """Run ``annotate_pdf`` on *input_file* with the stored settings."""
        settings = {
            "pattern": self.pattern,
            "action": self.action,
            "pages": self.pages,
            "password": self.password,
        }
        return annotate_pdf(input_file=input_file, output_file=output_file, **settings)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def annotate_pdf(
    input_file: str | Path,
    output_file: str | Path,
    pattern: str,
    action: Action | str = Action.HIGHLIGHT,
    pages: Sequence[int | str] | None = None,
    password: str | None = None,
) -> ProcessResult:
    """Search a PDF and apply the requested annotation action.

    Args:
        input_file: PDF to read.
        output_file: Destination for the annotated PDF.
        pattern: Search pattern handed to ``search_for_text`` together with
            each page's text lines.
        action: An ``Action`` or its string value. ``Action.REMOVE``
            delegates to ``remove_annotations`` and ignores *pattern*.
        pages: Page indexes to process, compared against the document's
            0-based page index; ``None`` processes every page.
        password: Password used if the PDF is encrypted.

    Returns:
        A ``ProcessResult`` whose ``matches`` is the sum, over the processed
        pages, of the matched values that were located on that page.

    Raises:
        ValueError: If *action* is an unsupported string.
        FileNotFoundError, PermissionError: Propagated from opening the PDF.
    """
    selected_action = action if isinstance(action, Action) else Action.from_value(action)
    page_list = _normalize_pages(pages)

    if selected_action == Action.REMOVE:
        return remove_annotations(input_file, output_file, page_list, password)

    # A set gives a constant-time membership test per page.
    wanted = None if page_list is None else set(page_list)

    pdf_doc = _open_pdf(input_file, password)
    total_matches = 0
    pages_processed = 0
    try:
        for page_index in range(pdf_doc.page_count):
            if wanted is not None and page_index not in wanted:
                continue
            pages_processed += 1
            page = pdf_doc[page_index]
            page_lines = page.get_text("text").split("\n")
            matched_values = list(search_for_text(page_lines, pattern))
            if not matched_values:
                continue

            if selected_action == Action.REDACT:
                total_matches += redact_matches(page, matched_values)
            elif selected_action == Action.FRAME:
                total_matches += frame_matches(page, matched_values)
            else:
                total_matches += highlight_matches(page, matched_values, selected_action)
    except BaseException:
        # A failure while searching or annotating must not leave the
        # document open.
        pdf_doc.close()
        raise

    # _save_pdf closes the document itself, so it stays outside the guard
    # above to avoid closing it twice.
    _save_pdf(pdf_doc, Path(output_file))
    return ProcessResult(
        input_path=str(input_file),
        output_path=str(output_file),
        action=selected_action,
        matches=total_matches,
        pages_processed=pages_processed,
    )
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Shared types for PDF annotation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from enum import Enum
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class StructureMode(str, Enum):
    """Closed set of structure modes, stored as lowercase strings."""

    CURATE = "curate"
    SEARCHABLE = "searchable"

    @classmethod
    def from_value(cls, value: str) -> "StructureMode":
        """Return the member whose value equals *value*, ignoring case and
        surrounding whitespace.

        Raises:
            ValueError: If no member matches; the message lists valid values.
        """
        wanted = value.strip().lower()
        found = next((member for member in cls if member.value == wanted), None)
        if found is None:
            valid = ", ".join(member.value for member in cls)
            raise ValueError(f"Unsupported mode '{value}'. Choose from: {valid}")
        return found
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class Action(str, Enum):
    """Closed set of annotation actions, stored as capitalized strings."""

    HIGHLIGHT = "Highlight"
    SQUIGGLY = "Squiggly"
    UNDERLINE = "Underline"
    STRIKEOUT = "Strikeout"
    REDACT = "Redact"
    FRAME = "Frame"
    REMOVE = "Remove"

    @classmethod
    def choices(cls) -> tuple[str, ...]:
        """Return every action value in declaration order."""
        return tuple([member.value for member in cls])

    @classmethod
    def from_value(cls, value: str) -> "Action":
        """Return the member matching *value*, ignoring case and surrounding
        whitespace.

        Raises:
            ValueError: If no member matches; the message lists valid values.
        """
        by_lowered_value = {member.value.lower(): member for member in cls}
        found = by_lowered_value.get(value.strip().lower())
        if found is None:
            valid = ", ".join(cls.choices())
            raise ValueError(f"Unsupported action '{value}'. Choose from: {valid}")
        return found
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass(frozen=True)
class ProcessResult:
    """Immutable summary of one annotation run over a PDF."""

    input_path: str
    output_path: str
    action: Action
    matches: int
    pages_processed: int

    def to_dict(self) -> dict:
        """Return the fields as a plain dict, with the action as its value."""
        return dict(
            input_path=self.input_path,
            output_path=self.output_path,
            action=self.action.value,
            matches=self.matches,
            pages_processed=self.pages_processed,
        )

    def __str__(self) -> str:
        """Return a one-line human-readable summary of the run."""
        counts = f"{self.matches} matches annotated in {self.pages_processed} pages "
        destination = f"-> {self.output_path}"
        return counts + destination
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@dataclass(frozen=True)
class StructureResult:
    """Immutable summary of one structuring run over a PDF."""

    input_path: str
    output_path: str
    mode: StructureMode
    pages_processed: int
    ocr_pages: list[int]
    document_title: str

    def to_dict(self) -> dict:
        """Return the fields as a plain dict, with the mode as its value."""
        return dict(
            input_path=self.input_path,
            output_path=self.output_path,
            mode=self.mode.value,
            pages_processed=self.pages_processed,
            ocr_pages=self.ocr_pages,
            document_title=self.document_title,
        )
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@dataclass(frozen=True)
class PIIDetectionResult(ProcessResult):
    """A ``ProcessResult`` extended with OCR page numbers and PII findings."""

    ocr_pages: list[int]
    findings: list[dict]

    def to_dict(self) -> dict:
        """Return the base fields plus ``ocr_pages``, ``findings`` and a
        derived ``finding_count``."""
        return {
            **super().to_dict(),
            "ocr_pages": self.ocr_pages,
            "findings": self.findings,
            "finding_count": len(self.findings),
        }
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""EasyOCR extraction for scanned PDF pages."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from functools import lru_cache
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
import fitz
|
|
10
|
+
|
|
11
|
+
from docintel.services.pdf.presets import MIN_NATIVE_TEXT_CHARS, OCR_RENDER_SCALE
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
import numpy as np
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True)
class OCRSpan:
    """A text region detected by OCR with PDF coordinates."""

    # Recognized text of the region.
    text: str
    # Bounding rectangle of the region.
    rect: fitz.Rect
    # Confidence reported by the OCR engine for this region.
    confidence: float
    # Character offsets; both default to 0.
    # NOTE(review): presumably offsets into the joined page text — confirm
    # against the code that populates them.
    char_start: int = 0
    char_end: int = 0
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass(frozen=True)
class IndexedSpan:
    """Character offsets mapped to a PDF rectangle."""

    # Offset of the span's first character in the joined text.
    char_start: int
    # Offset one past the span's last character (exclusive end).
    char_end: int
    # Rectangle covering the span.
    rect: fitz.Rect
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def page_has_native_text(page: fitz.Page, min_chars: int = MIN_NATIVE_TEXT_CHARS) -> bool:
    """Tell whether the page's own text layer is long enough to skip OCR.

    The extracted text is stripped of surrounding whitespace and its length
    compared with *min_chars*.
    """
    native_text = page.get_text("text")
    visible_length = len(native_text.strip())
    return visible_length >= min_chars
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@lru_cache(maxsize=1)
def _easyocr_reader():
    """Return the cached English, CPU-only EasyOCR reader.

    EasyOCR is imported on first use, and ``lru_cache`` keeps the single
    reader instance for later calls.
    """
    from easyocr import Reader

    return Reader(["en"], gpu=False, verbose=False)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _pixmap_to_array(pix: fitz.Pixmap) -> np.ndarray:
    """View the pixmap's sample bytes as a ``(height, width, n)`` uint8 array."""
    import numpy as np

    shape = (pix.height, pix.width, pix.n)
    flat = np.frombuffer(pix.samples, dtype=np.uint8)
    return flat.reshape(shape)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def extract_page_ocr(page: fitz.Page, scale: float = OCR_RENDER_SCALE) -> list[OCRSpan]:
    """Render *page*, run EasyOCR on the image and return the text boxes.

    The page is rasterized at *scale* without an alpha channel. Each detected
    box is reduced to its axis-aligned bounds and divided by *scale* so the
    rectangle is expressed in the page's unscaled coordinates. Detections
    whose text is empty after stripping are dropped.
    """
    zoom = fitz.Matrix(scale, scale)
    rendered = page.get_pixmap(matrix=zoom, alpha=False)
    image = _pixmap_to_array(rendered)

    detections = _easyocr_reader().readtext(image)

    results: list[OCRSpan] = []
    for corners, raw_text, score in detections:
        label = str(raw_text).strip()
        if label:
            xs = [corner[0] for corner in corners]
            ys = [corner[1] for corner in corners]
            left, top = min(xs) / scale, min(ys) / scale
            right, bottom = max(xs) / scale, max(ys) / scale
            box = fitz.Rect(left, top, right, bottom)
            results.append(OCRSpan(text=label, rect=box, confidence=float(score)))

    return results
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def build_indexed_text(spans: list[OCRSpan]) -> tuple[str, list[IndexedSpan]]:
    """Join span texts with single spaces and record each span's offsets.

    Returns:
        The joined text and one ``IndexedSpan`` per input span whose
        ``char_start``/``char_end`` delimit that span's text in the joined
        string (end exclusive).
    """
    texts: list[str] = []
    offsets: list[IndexedSpan] = []
    cursor = 0

    for span in spans:
        if texts:
            # Account for the separating space inserted by the join below.
            cursor += 1
        begin = cursor
        cursor += len(span.text)
        texts.append(span.text)
        offsets.append(IndexedSpan(char_start=begin, char_end=cursor, rect=span.rect))

    return " ".join(texts), offsets
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def rects_for_char_range(start: int, end: int, indexed: list[IndexedSpan]) -> list[fitz.Rect]:
    """Return the rectangles of all indexed spans overlapping ``[start, end)``.

    A span overlaps when it begins before *end* and finishes after *start*;
    spans that merely touch the range boundary are excluded.
    """
    return [
        item.rect
        for item in indexed
        if item.char_end > start and item.char_start < end
    ]
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def merge_rects(rects: list[fitz.Rect]) -> fitz.Rect | None:
    """Return the bounding box enclosing all *rects*, or ``None`` if empty."""
    if rects:
        left = min(rect.x0 for rect in rects)
        top = min(rect.y0 for rect in rects)
        right = max(rect.x1 for rect in rects)
        bottom = max(rect.y1 for rect in rects)
        return fitz.Rect(left, top, right, bottom)
    return None
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def embed_invisible_text_layer(page: fitz.Page, spans: list[OCRSpan]) -> None:
    """Insert each OCR span's text on *page* using render mode 3.

    Text is anchored at the span rectangle's ``(x0, y1)`` corner, with a font
    size of 80% of the rectangle height but never below 6.
    """
    for span in spans:
        box = span.rect
        anchor = (box.x0, box.y1)
        page.insert_text(
            anchor,
            span.text,
            fontsize=max(6, box.height * 0.8),
            render_mode=3,
        )
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Microsoft Presidio PII detection."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from functools import lru_cache
|
|
7
|
+
from typing import Sequence
|
|
8
|
+
|
|
9
|
+
from docintel.services.pdf.presets import DEFAULT_PII_ENTITIES
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True)
class PIIHit:
    """One sensitive entity found in a text, with its offsets and score."""

    entity_type: str
    text: str
    start: int
    end: int
    score: float

    def to_dict(self) -> dict:
        """Return the fields as a plain dict, rounding the score to 4 places."""
        return dict(
            entity_type=self.entity_type,
            text=self.text,
            start=self.start,
            end=self.end,
            score=round(self.score, 4),
        )
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@lru_cache(maxsize=1)
def _analyzer_engine():
    """Return the cached Presidio ``AnalyzerEngine``.

    Presidio is imported on first use, and ``lru_cache`` keeps the single
    engine instance for later calls.
    """
    import presidio_analyzer

    return presidio_analyzer.AnalyzerEngine()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def detect_pii_in_text(
    text: str,
    *,
    entities: Sequence[str] | None = None,
    language: str = "en",
    min_score: float = 0.35,
) -> list[PIIHit]:
    """Analyze *text* with Presidio and return the hits that score high enough.

    Args:
        text: Plain text to scan; blank text yields an empty list without
            touching the analyzer.
        entities: Entity types to look for; falls back to
            ``DEFAULT_PII_ENTITIES`` when empty or ``None``.
        language: Language code passed to the analyzer.
        min_score: Results scoring below this value are discarded.

    Returns:
        One ``PIIHit`` per retained result, in the analyzer's order.
    """
    if not text.strip():
        return []

    wanted_entities = list(entities or DEFAULT_PII_ENTITIES)
    engine = _analyzer_engine()
    findings = engine.analyze(text=text, language=language, entities=wanted_entities)

    return [
        PIIHit(
            entity_type=found.entity_type,
            text=text[found.start : found.end],
            start=found.start,
            end=found.end,
            score=float(found.score),
        )
        for found in findings
        if not found.score < min_score
    ]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def list_supported_entities(language: str = "en") -> list[str]:
    """Return the analyzer's supported entity types for *language*, sorted."""
    engine = _analyzer_engine()
    names = list(engine.get_supported_entities(language=language))
    names.sort()
    return names
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _mask_spans(text: str, hits: Sequence[PIIHit], mask_template: str) -> str:
    """Replace each hit span in *text* with its redaction token.

    Overlapping or nested hits are first merged into one span that is
    labelled with the highest-scoring entity type. This keeps every
    replacement aligned with the original offsets, so no original character
    inside a detected span can survive.
    """
    merged: list[list] = []
    for hit in sorted(hits, key=lambda item: (item.start, -item.end)):
        if merged and hit.start < merged[-1][1]:
            current = merged[-1]
            current[1] = max(current[1], hit.end)
            if hit.score > current[3]:
                current[2], current[3] = hit.entity_type, hit.score
        else:
            merged.append([hit.start, hit.end, hit.entity_type, hit.score])

    pieces: list[str] = []
    cursor = 0
    for start, end, entity_type, _score in merged:
        pieces.append(text[cursor:start])
        pieces.append(mask_template.format(entity=entity_type))
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)


def mask_pii_in_text(
    text: str,
    *,
    entities: Sequence[str] | None = None,
    language: str = "en",
    min_score: float = 0.35,
    mask_template: str = "[REDACTED_{entity}]",
) -> tuple[str, int]:
    """
    Replace detected PII spans with redaction tokens before external LLM calls.

    Args:
        text: Plain text to mask.
        entities: Entity types to detect; forwarded to ``detect_pii_in_text``.
        language: Language code forwarded to ``detect_pii_in_text``.
        min_score: Minimum score forwarded to ``detect_pii_in_text``.
        mask_template: Token template; ``{entity}`` is replaced by the
            entity type.

    Returns masked text and the number of entities redacted. The count is the
    number of detected hits; hits that overlap share a single token in the
    masked text.
    """
    hits = detect_pii_in_text(
        text,
        entities=entities,
        language=language,
        min_score=min_score,
    )
    if not hits:
        return text, 0

    return _mask_spans(text, hits, mask_template), len(hits)
|