docintel-platform 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. docintel/__init__.py +6 -0
  2. docintel/app.py +45 -0
  3. docintel/auth/__init__.py +12 -0
  4. docintel/auth/api_keys.py +48 -0
  5. docintel/auth/limiter.py +41 -0
  6. docintel/auth/middleware.py +34 -0
  7. docintel/auth/oidc.py +45 -0
  8. docintel/cli.py +21 -0
  9. docintel/client.py +193 -0
  10. docintel/config.py +20 -0
  11. docintel/jobs/__init__.py +16 -0
  12. docintel/jobs/helpers.py +38 -0
  13. docintel/jobs/models.py +78 -0
  14. docintel/jobs/queue.py +75 -0
  15. docintel/jobs/store.py +82 -0
  16. docintel/jobs/tasks.py +173 -0
  17. docintel/jobs/webhooks.py +32 -0
  18. docintel/openapi/__init__.py +1 -0
  19. docintel/openapi/openapi.yaml +380 -0
  20. docintel/ops/__init__.py +1 -0
  21. docintel/ops/logging.py +40 -0
  22. docintel/ops/metrics.py +57 -0
  23. docintel/ops/middleware.py +40 -0
  24. docintel/routes/__init__.py +1 -0
  25. docintel/routes/jobs.py +26 -0
  26. docintel/routes/match.py +43 -0
  27. docintel/routes/openapi_docs.py +57 -0
  28. docintel/routes/ops.py +22 -0
  29. docintel/routes/pdf.py +420 -0
  30. docintel/routes/text.py +41 -0
  31. docintel/services/__init__.py +1 -0
  32. docintel/services/matching/__init__.py +6 -0
  33. docintel/services/matching/models.py +19 -0
  34. docintel/services/matching/scorer.py +64 -0
  35. docintel/services/pdf/__init__.py +26 -0
  36. docintel/services/pdf/annotator.py +188 -0
  37. docintel/services/pdf/models.py +104 -0
  38. docintel/services/pdf/ocr.py +130 -0
  39. docintel/services/pdf/pii.py +105 -0
  40. docintel/services/pdf/presets.py +26 -0
  41. docintel/services/pdf/search.py +29 -0
  42. docintel/services/pdf/sensitive.py +212 -0
  43. docintel/services/pdf/structure.py +118 -0
  44. docintel/services/pdf/structure_llm.py +136 -0
  45. docintel/services/pdf/structure_render.py +136 -0
  46. docintel/services/pdf/structure_schema.py +99 -0
  47. docintel/services/summary/__init__.py +6 -0
  48. docintel/services/summary/models.py +21 -0
  49. docintel/services/summary/textrank.py +57 -0
  50. docintel/ui.py +347 -0
  51. docintel/wsgi.py +5 -0
  52. docintel_platform-1.0.2.dist-info/METADATA +607 -0
  53. docintel_platform-1.0.2.dist-info/RECORD +56 -0
  54. docintel_platform-1.0.2.dist-info/WHEEL +5 -0
  55. docintel_platform-1.0.2.dist-info/entry_points.txt +3 -0
  56. docintel_platform-1.0.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,41 @@
1
+ """Text summarization API routes."""
2
+
3
+ from flask import Blueprint, jsonify, request
4
+
5
+ from docintel.auth.limiter import limiter
6
+ from docintel.services.summary import summarize_text
7
+ from docintel.services.summary.textrank import DEFAULT_SENTENCE_COUNT, MAX_SENTENCE_COUNT
8
+
9
+ text_bp = Blueprint("text", __name__, url_prefix="/v1/text")
10
+
11
+
12
@text_bp.post("/summarize")
@limiter.limit("100 per hour")
def summarize():
    """Extractively summarize plain text using TextRank sentence ranking.

    Expects a JSON object body with two optional fields:

    * ``text`` -- the string to summarize (defaults to ``""``).
    * ``sentences`` -- anything ``int()`` accepts; defaults to
      ``DEFAULT_SENTENCE_COUNT`` and must lie in ``1..MAX_SENTENCE_COUNT``.

    Returns:
        A ``(response, status)`` tuple. Status is 200 with ``"status": "ok"``
        merged with ``result.to_dict()`` on success, or 400 with an
        ``"error"`` message when the body or a field is invalid or when
        ``summarize_text`` raises ``ValueError``.
    """
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({"error": "Request body must be JSON."}), 400

    text = payload.get("text", "")
    if not isinstance(text, str):
        return jsonify({"error": "Field 'text' must be a string."}), 400

    try:
        sentences = int(payload.get("sentences", DEFAULT_SENTENCE_COUNT))
    except (TypeError, ValueError, OverflowError):
        # OverflowError: the JSON parser turns ``Infinity`` / ``1e999`` into
        # float infinity, and int(inf) raises OverflowError rather than
        # ValueError. Without this the request would fail with a 500.
        return jsonify({"error": "Field 'sentences' must be an integer."}), 400

    if sentences < 1 or sentences > MAX_SENTENCE_COUNT:
        return jsonify(
            {"error": f"Field 'sentences' must be between 1 and {MAX_SENTENCE_COUNT}."}
        ), 400

    try:
        result = summarize_text(text=text, sentence_count=sentences)
    except ValueError as exc:
        # The service reports invalid input via ValueError; surface its message.
        return jsonify({"error": str(exc)}), 400

    return jsonify({"status": "ok", **result.to_dict()}), 200
@@ -0,0 +1 @@
1
+ """Document processing services."""
@@ -0,0 +1,6 @@
1
+ """Resume-to-job matching service."""
2
+
3
+ from docintel.services.matching.models import MatchResult
4
+ from docintel.services.matching.scorer import match_resume_to_job
5
+
6
+ __all__ = ["MatchResult", "match_resume_to_job"]
@@ -0,0 +1,19 @@
1
+ """Types for resume matching."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+
8
@dataclass(frozen=True)
class MatchResult:
    """Immutable container for the outcome of a resume/job comparison."""

    # Numeric score, stored exactly as supplied by the caller.
    score: float
    # Keywords reported as matched.
    matched_keywords: list[str]
    # Keywords reported as missing.
    missing_keywords: list[str]

    def to_dict(self) -> dict:
        """Return the fields as a plain dict, keyed in declaration order."""
        exported = ("score", "matched_keywords", "missing_keywords")
        return {field_name: getattr(self, field_name) for field_name in exported}
@@ -0,0 +1,64 @@
1
+ """TF-IDF resume matching engine."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from sklearn.feature_extraction.text import TfidfVectorizer
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+
8
+ from docintel.services.matching.models import MatchResult
9
+
10
+ DEFAULT_TOP_KEYWORDS = 25
11
+
12
+
13
+ def _clean_text(text: str) -> str:
14
+ return " ".join(text.strip().split())
15
+
16
+
17
def match_resume_to_job(
    resume: str,
    job_description: str,
    *,
    top_keywords: int = DEFAULT_TOP_KEYWORDS,
) -> MatchResult:
    """Score a resume against a job description with TF-IDF cosine similarity.

    Both texts are whitespace-normalized, vectorized together, and compared.
    Every term with a positive weight in the job description is classified as
    matched (also weighted in the resume) or missing, and each list is ordered
    by descending job-description weight.

    Args:
        resume: Resume text.
        job_description: Job description text.
        top_keywords: Maximum keywords returned per list; values below 1 are
            treated as 1.

    Returns:
        A ``MatchResult`` whose score is the similarity times 100, rounded to
        two decimals.

    Raises:
        ValueError: If either text is empty after whitespace normalization.
    """
    resume_text = _clean_text(resume)
    job_text = _clean_text(job_description)

    if not resume_text:
        raise ValueError("Resume text is required.")
    if not job_text:
        raise ValueError("Job description text is required.")

    vectorizer = TfidfVectorizer(
        stop_words="english",
        token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z0-9+#.]+\b",
    )
    # Row 0 is the resume, row 1 the job description.
    tfidf = vectorizer.fit_transform([resume_text, job_text])
    cosine = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]
    score = round(float(cosine) * 100, 2)

    terms = vectorizer.get_feature_names_out()
    resume_weights = tfidf[0].toarray()[0]
    job_weights = tfidf[1].toarray()[0]

    present: list[tuple[str, float]] = []
    absent: list[tuple[str, float]] = []
    for term, job_weight, resume_weight in zip(terms, job_weights, resume_weights):
        if job_weight <= 0:
            # Only terms that appear in the job description are of interest.
            continue
        bucket = present if resume_weight > 0 else absent
        bucket.append((term, job_weight))

    present.sort(key=lambda pair: pair[1], reverse=True)
    absent.sort(key=lambda pair: pair[1], reverse=True)

    limit = max(1, top_keywords)
    return MatchResult(
        score=score,
        matched_keywords=[term for term, _ in present[:limit]],
        missing_keywords=[term for term, _ in absent[:limit]],
    )
@@ -0,0 +1,26 @@
1
+ """PDF search and annotation service."""
2
+
3
+ from docintel.services.pdf.annotator import PDFAnnotator, annotate_pdf
4
+ from docintel.services.pdf.models import Action, PIIDetectionResult, ProcessResult, StructureMode, StructureResult
5
+ from docintel.services.pdf.pii import detect_pii_in_text, list_supported_entities
6
+ from docintel.services.pdf.presets import DEFAULT_PII_ENTITIES
7
+ from docintel.services.pdf.search import extract_info, search_for_text
8
+ from docintel.services.pdf.sensitive import detect_sensitive_pdf
9
+ from docintel.services.pdf.structure import structure_pdf
10
+
11
+ __all__ = [
12
+ "Action",
13
+ "DEFAULT_PII_ENTITIES",
14
+ "PDFAnnotator",
15
+ "PIIDetectionResult",
16
+ "ProcessResult",
17
+ "StructureMode",
18
+ "StructureResult",
19
+ "annotate_pdf",
20
+ "detect_pii_in_text",
21
+ "detect_sensitive_pdf",
22
+ "extract_info",
23
+ "list_supported_entities",
24
+ "search_for_text",
25
+ "structure_pdf",
26
+ ]
@@ -0,0 +1,188 @@
1
+ """Core PDF annotation engine."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Iterable, Sequence
7
+
8
+ import fitz
9
+
10
+ from docintel.services.pdf.models import Action, ProcessResult
11
+ from docintel.services.pdf.search import search_for_text
12
+
13
+
14
+ def _normalize_pages(pages: Sequence[int | str] | None) -> list[int] | None:
15
+ if pages is None:
16
+ return None
17
+ return [int(page) for page in pages]
18
+
19
+
20
+ def _open_pdf(path: str | Path, password: str | None = None) -> fitz.Document:
21
+ pdf_path = Path(path)
22
+ if not pdf_path.is_file():
23
+ raise FileNotFoundError(f"PDF not found: {pdf_path}")
24
+
25
+ pdf_doc = fitz.open(pdf_path)
26
+ if pdf_doc.is_encrypted:
27
+ if not password or not pdf_doc.authenticate(password):
28
+ pdf_doc.close()
29
+ raise PermissionError(f"PDF is encrypted: {pdf_path}")
30
+ return pdf_doc
31
+
32
+
33
+ def _save_pdf(pdf_doc: fitz.Document, output_path: Path) -> None:
34
+ output_path.parent.mkdir(parents=True, exist_ok=True)
35
+ pdf_doc.save(output_path)
36
+ pdf_doc.close()
37
+
38
+
39
def redact_matches(page: fitz.Page, matched_values: Iterable[str]) -> int:
    """Redact every occurrence of each matched string on ``page``.

    Each located area gets a redaction annotation with a single-space
    replacement text and a black fill. Redactions are applied once, after all
    values were processed, and only if at least one value was found.

    Returns:
        The number of values located on the page (not the number of areas).
    """
    located = 0
    for needle in matched_values:
        hit_areas = page.search_for(needle)
        if hit_areas:
            located += 1
            for hit_area in hit_areas:
                page.add_redact_annot(hit_area, text=" ", fill=(0, 0, 0))
    if located:
        page.apply_redactions()
    return located
51
+
52
+
53
def frame_matches(page: fitz.Page, matched_values: Iterable[str]) -> int:
    """Draw a red-stroked rectangle annotation around each occurrence.

    Returns:
        The number of values located on the page (not the number of areas).
    """
    located = 0
    for needle in matched_values:
        hit_areas = page.search_for(needle)
        if hit_areas:
            located += 1
            for hit_area in hit_areas:
                frame = page.add_rect_annot(hit_area)
                frame.set_colors(stroke=fitz.utils.getColor("red"))
                frame.update()
    return located
65
+
66
+
67
def highlight_matches(page: fitz.Page, matched_values: Iterable[str], action: Action) -> int:
    """Mark each matched value with the text-markup annotation for ``action``.

    ``Action.SQUIGGLY``, ``Action.UNDERLINE`` and ``Action.STRIKEOUT`` select
    the corresponding annotation; any other action falls back to a highlight.
    One annotation covering all located areas is added per matched value.

    Returns:
        The number of values located on the page.
    """
    located = 0
    for needle in matched_values:
        hit_areas = page.search_for(needle)
        if not hit_areas:
            continue
        located += 1
        if action == Action.SQUIGGLY:
            add_markup = page.add_squiggly_annot
        elif action == Action.UNDERLINE:
            add_markup = page.add_underline_annot
        elif action == Action.STRIKEOUT:
            add_markup = page.add_strikeout_annot
        else:
            add_markup = page.add_highlight_annot
        add_markup(hit_areas).update()
    return located
84
+
85
+
86
def remove_annotations(
    input_file: str | Path,
    output_file: str | Path,
    pages: Sequence[int] | None = None,
    password: str | None = None,
) -> ProcessResult:
    """Delete all annotations from the selected pages and save the result.

    Args:
        input_file: PDF to read.
        output_file: Destination path for the cleaned PDF.
        pages: Zero-based page indices to process; ``None`` processes every
            page.
        password: Password used if the document is encrypted.

    Returns:
        A ``ProcessResult`` with ``action=Action.REMOVE`` whose ``matches`` is
        the number of annotations deleted.

    Raises:
        Whatever ``_open_pdf``, ``_normalize_pages`` or saving raise. The
        document is closed before the exception propagates.
    """
    pdf_doc = _open_pdf(input_file, password)
    try:
        # Normalizing inside the guard: a bad page value must not leak the
        # already-opened document.
        page_list = _normalize_pages(pages)
        removed = 0
        pages_processed = 0

        for page_index in range(pdf_doc.page_count):
            if page_list is not None and page_index not in page_list:
                continue
            pages_processed += 1
            page = pdf_doc[page_index]
            annot = page.first_annot
            while annot:
                removed += 1
                # Fetch the successor before deleting the current annotation.
                next_annot = annot.next
                page.delete_annot(annot)
                annot = next_annot

        _save_pdf(pdf_doc, Path(output_file))
    finally:
        # On success the save helper has already closed the document; closing
        # an already-closed document is avoided by checking first.
        if not pdf_doc.is_closed:
            pdf_doc.close()

    return ProcessResult(
        input_path=str(input_file),
        output_path=str(output_file),
        action=Action.REMOVE,
        matches=removed,
        pages_processed=pages_processed,
    )
117
+
118
+
119
class PDFAnnotator:
    """Reusable bundle of annotation settings applied through ``annotate_pdf``."""

    def __init__(
        self,
        pattern: str,
        action: Action | str = Action.HIGHLIGHT,
        pages: Sequence[int | str] | None = None,
        password: str | None = None,
    ):
        """Store the settings, coercing ``action`` and ``pages`` to canonical form.

        A non-``Action`` ``action`` is resolved with ``Action.from_value``;
        ``pages`` entries are converted to ``int`` (``None`` is kept).
        """
        self.pattern = pattern
        if isinstance(action, Action):
            self.action = action
        else:
            self.action = Action.from_value(action)
        self.pages = _normalize_pages(pages)
        self.password = password

    def annotate(self, input_file: str | Path, output_file: str | Path) -> ProcessResult:
        """Run ``annotate_pdf`` on ``input_file`` with the stored settings."""
        settings = {
            "pattern": self.pattern,
            "action": self.action,
            "pages": self.pages,
            "password": self.password,
        }
        return annotate_pdf(input_file=input_file, output_file=output_file, **settings)
143
+
144
+
145
def annotate_pdf(
    input_file: str | Path,
    output_file: str | Path,
    pattern: str,
    action: Action | str = Action.HIGHLIGHT,
    pages: Sequence[int | str] | None = None,
    password: str | None = None,
) -> ProcessResult:
    """Search a PDF and apply the requested annotation action.

    Args:
        input_file: PDF to read.
        output_file: Destination path for the annotated PDF.
        pattern: Search pattern handed to ``search_for_text`` for each page's
            text lines.
        action: An ``Action`` or a string resolved via ``Action.from_value``.
            ``Action.REMOVE`` delegates to ``remove_annotations``.
        pages: Zero-based page indices to process; ``None`` processes every
            page.
        password: Password used if the document is encrypted.

    Returns:
        A ``ProcessResult`` whose ``matches`` is the sum of the per-page
        counts returned by the action helpers.

    Raises:
        Whatever opening, searching, annotating or saving raise. The document
        is closed before the exception propagates.
    """
    selected_action = action if isinstance(action, Action) else Action.from_value(action)
    page_list = _normalize_pages(pages)

    if selected_action == Action.REMOVE:
        return remove_annotations(input_file, output_file, page_list, password)

    pdf_doc = _open_pdf(input_file, password)
    total_matches = 0
    pages_processed = 0
    try:
        for page_index in range(pdf_doc.page_count):
            if page_list is not None and page_index not in page_list:
                continue
            pages_processed += 1
            page = pdf_doc[page_index]
            page_lines = page.get_text("text").split("\n")
            matched_values = list(search_for_text(page_lines, pattern))
            if not matched_values:
                continue

            if selected_action == Action.REDACT:
                total_matches += redact_matches(page, matched_values)
            elif selected_action == Action.FRAME:
                total_matches += frame_matches(page, matched_values)
            else:
                total_matches += highlight_matches(page, matched_values, selected_action)

        _save_pdf(pdf_doc, Path(output_file))
    finally:
        # On success the save helper has already closed the document; this
        # only acts when processing or saving failed part-way through.
        if not pdf_doc.is_closed:
            pdf_doc.close()

    return ProcessResult(
        input_path=str(input_file),
        output_path=str(output_file),
        action=selected_action,
        matches=total_matches,
        pages_processed=pages_processed,
    )
@@ -0,0 +1,104 @@
1
+ """Shared types for PDF annotation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from enum import Enum
7
+
8
+
9
class StructureMode(str, Enum):
    """String-valued enum of the supported structure modes."""

    CURATE = "curate"
    SEARCHABLE = "searchable"

    @classmethod
    def from_value(cls, value: str) -> "StructureMode":
        """Resolve ``value`` to a member, ignoring case and outer whitespace.

        Raises:
            ValueError: If no member has the normalized value; the message
                lists the valid choices.
        """
        wanted = value.strip().lower()
        found = next((member for member in cls if member.value == wanted), None)
        if found is None:
            options = ", ".join(member.value for member in cls)
            raise ValueError(f"Unsupported mode '{value}'. Choose from: {options}")
        return found
21
+
22
+
23
class Action(str, Enum):
    """String-valued enum of the annotation actions."""

    HIGHLIGHT = "Highlight"
    SQUIGGLY = "Squiggly"
    UNDERLINE = "Underline"
    STRIKEOUT = "Strikeout"
    REDACT = "Redact"
    FRAME = "Frame"
    REMOVE = "Remove"

    @classmethod
    def choices(cls) -> tuple[str, ...]:
        """Return every member value, in declaration order."""
        return tuple([member.value for member in cls])

    @classmethod
    def from_value(cls, value: str) -> "Action":
        """Resolve ``value`` to a member, ignoring case and outer whitespace.

        Raises:
            ValueError: If nothing matches; the message lists ``choices()``.
        """
        wanted = value.strip().lower()
        found = next((member for member in cls if member.value.lower() == wanted), None)
        if found is None:
            options = ", ".join(cls.choices())
            raise ValueError(f"Unsupported action '{value}'. Choose from: {options}")
        return found
44
+
45
+
46
@dataclass(frozen=True)
class ProcessResult:
    """Immutable summary of one PDF processing run."""

    input_path: str
    output_path: str
    action: Action
    matches: int
    pages_processed: int

    def to_dict(self) -> dict:
        """Return the fields as a dict, with ``action`` reduced to its value."""
        return dict(
            input_path=self.input_path,
            output_path=self.output_path,
            action=self.action.value,
            matches=self.matches,
            pages_processed=self.pages_processed,
        )

    def __str__(self) -> str:
        """Return a one-line, human-readable summary of the run."""
        summary = f"{self.matches} matches annotated in {self.pages_processed} pages"
        return f"{summary} -> {self.output_path}"
68
+
69
+
70
@dataclass(frozen=True)
class StructureResult:
    """Immutable summary of one PDF structuring run."""

    input_path: str
    output_path: str
    mode: StructureMode
    pages_processed: int
    ocr_pages: list[int]
    document_title: str

    def to_dict(self) -> dict:
        """Return the fields as a dict, with ``mode`` reduced to its value."""
        return dict(
            input_path=self.input_path,
            output_path=self.output_path,
            mode=self.mode.value,
            pages_processed=self.pages_processed,
            ocr_pages=self.ocr_pages,
            document_title=self.document_title,
        )
88
+
89
+
90
@dataclass(frozen=True)
class PIIDetectionResult(ProcessResult):
    """``ProcessResult`` extended with an ``ocr_pages`` list and PII findings."""

    ocr_pages: list[int]
    findings: list[dict]

    def to_dict(self) -> dict:
        """Return the base dict plus ``ocr_pages``, ``findings`` and their count."""
        return {
            **super().to_dict(),
            "ocr_pages": self.ocr_pages,
            "findings": self.findings,
            "finding_count": len(self.findings),
        }
@@ -0,0 +1,130 @@
1
+ """EasyOCR extraction for scanned PDF pages."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from functools import lru_cache
7
+ from typing import TYPE_CHECKING
8
+
9
+ import fitz
10
+
11
+ from docintel.services.pdf.presets import MIN_NATIVE_TEXT_CHARS, OCR_RENDER_SCALE
12
+
13
+ if TYPE_CHECKING:
14
+ import numpy as np
15
+
16
+
17
@dataclass(frozen=True)
class OCRSpan:
    """One OCR-detected text region together with its rectangle on the page."""

    # Recognized text of the region.
    text: str
    # Bounding rectangle of the region.
    rect: fitz.Rect
    # Confidence value for the recognition.
    confidence: float
    # Character offsets; both default to 0 when not supplied.
    char_start: int = 0
    char_end: int = 0
26
+
27
+
28
@dataclass(frozen=True)
class IndexedSpan:
    """Pairs a character-offset range with the rectangle it belongs to."""

    # Offset of the first character of the range.
    char_start: int
    # Offset at which the range ends.
    char_end: int
    # Rectangle associated with the range.
    rect: fitz.Rect
35
+
36
+
37
def page_has_native_text(page: fitz.Page, min_chars: int = MIN_NATIVE_TEXT_CHARS) -> bool:
    """Tell whether the page's own text layer holds at least ``min_chars`` characters.

    The extracted text is stripped of surrounding whitespace before counting.
    """
    native_text = page.get_text("text").strip()
    return min_chars <= len(native_text)
40
+
41
+
42
@lru_cache(maxsize=1)
def _easyocr_reader():
    """Create the English, CPU-only EasyOCR reader; cached after the first call."""
    # Imported here so easyocr is loaded only when a reader is first requested.
    from easyocr import Reader

    return Reader(["en"], gpu=False, verbose=False)
47
+
48
+
49
+ def _pixmap_to_array(pix: fitz.Pixmap) -> np.ndarray:
50
+ import numpy as np
51
+
52
+ return np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
53
+
54
+
55
def extract_page_ocr(page: fitz.Page, scale: float = OCR_RENDER_SCALE) -> list[OCRSpan]:
    """Render ``page``, run EasyOCR on the image, and return the detected spans.

    The page is rendered without alpha at ``scale`` in both directions. Each
    detection's corner points are reduced to an axis-aligned box and divided
    by ``scale`` to map image pixels back to page coordinates. Detections
    whose text is empty after stripping are skipped.
    """
    zoom = fitz.Matrix(scale, scale)
    rendered = page.get_pixmap(matrix=zoom, alpha=False)
    image = _pixmap_to_array(rendered)

    detections = _easyocr_reader().readtext(image)

    results: list[OCRSpan] = []
    for corners, raw_text, confidence in detections:
        label = str(raw_text).strip()
        if label:
            xs = [corner[0] for corner in corners]
            ys = [corner[1] for corner in corners]
            box = fitz.Rect(
                min(xs) / scale,
                min(ys) / scale,
                max(xs) / scale,
                max(ys) / scale,
            )
            results.append(OCRSpan(text=label, rect=box, confidence=float(confidence)))
    return results
80
+
81
+
82
def build_indexed_text(spans: list[OCRSpan]) -> tuple[str, list[IndexedSpan]]:
    """Join span texts with single spaces and record each span's offsets.

    Returns:
        The joined text and, per span, an ``IndexedSpan`` holding the
        ``[char_start, char_end)`` range of that span's text in the joined
        string together with the span's rectangle.
    """
    indexed: list[IndexedSpan] = []
    cursor = 0
    for span in spans:
        if indexed:
            # Account for the single-space separator before every span but the first.
            cursor += 1
        stop = cursor + len(span.text)
        indexed.append(IndexedSpan(char_start=cursor, char_end=stop, rect=span.rect))
        cursor = stop
    return " ".join(span.text for span in spans), indexed
98
+
99
+
100
def rects_for_char_range(start: int, end: int, indexed: list[IndexedSpan]) -> list[fitz.Rect]:
    """Return the rectangles of all indexed spans that overlap ``[start, end)``.

    A span overlaps when it ends after ``start`` and begins before ``end``;
    rectangles are returned in the order of ``indexed``.
    """
    return [
        item.rect
        for item in indexed
        if item.char_end > start and item.char_start < end
    ]
108
+
109
+
110
def merge_rects(rects: list[fitz.Rect]) -> fitz.Rect | None:
    """Return the smallest rectangle enclosing all ``rects``, or ``None`` if empty."""
    if not rects:
        return None
    left = min(rect.x0 for rect in rects)
    top = min(rect.y0 for rect in rects)
    right = max(rect.x1 for rect in rects)
    bottom = max(rect.y1 for rect in rects)
    return fitz.Rect(left, top, right, bottom)
120
+
121
+
122
def embed_invisible_text_layer(page: fitz.Page, spans: list[OCRSpan]) -> None:
    """Insert each span's text on ``page`` with render mode 3 (invisible).

    Text is anchored at the bottom-left corner of the span rectangle and
    sized to 80% of the rectangle height, with a minimum font size of 6.
    """
    for span in spans:
        box = span.rect
        anchor = (box.x0, box.y1)
        size = max(6, box.height * 0.8)
        page.insert_text(anchor, span.text, fontsize=size, render_mode=3)
@@ -0,0 +1,105 @@
1
+ """Microsoft Presidio PII detection."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from functools import lru_cache
7
+ from typing import Sequence
8
+
9
+ from docintel.services.pdf.presets import DEFAULT_PII_ENTITIES
10
+
11
+
12
@dataclass(frozen=True)
class PIIHit:
    """One sensitive entity found in a piece of text."""

    entity_type: str
    text: str
    start: int
    end: int
    score: float

    def to_dict(self) -> dict:
        """Return the fields as a dict, with ``score`` rounded to 4 decimals."""
        return dict(
            entity_type=self.entity_type,
            text=self.text,
            start=self.start,
            end=self.end,
            score=round(self.score, 4),
        )
30
+
31
+
32
@lru_cache(maxsize=1)
def _analyzer_engine():
    """Create a default Presidio ``AnalyzerEngine``; cached after the first call."""
    # Imported here so presidio_analyzer is loaded only when first needed.
    import presidio_analyzer

    return presidio_analyzer.AnalyzerEngine()
37
+
38
+
39
def detect_pii_in_text(
    text: str,
    *,
    entities: Sequence[str] | None = None,
    language: str = "en",
    min_score: float = 0.35,
) -> list[PIIHit]:
    """Analyze ``text`` with Presidio and return the hits that pass ``min_score``.

    Args:
        text: Text to analyze; blank or whitespace-only text yields ``[]``
            without invoking the analyzer.
        entities: Entity types to look for; when empty or ``None`` the
            ``DEFAULT_PII_ENTITIES`` preset is used.
        language: Language code passed to the analyzer.
        min_score: Results scoring below this value are dropped.
    """
    if not text.strip():
        return []

    wanted_entities = list(entities or DEFAULT_PII_ENTITIES)
    findings = _analyzer_engine().analyze(
        text=text,
        language=language,
        entities=wanted_entities,
    )
    return [
        PIIHit(
            entity_type=finding.entity_type,
            text=text[finding.start : finding.end],
            start=finding.start,
            end=finding.end,
            score=float(finding.score),
        )
        for finding in findings
        if not finding.score < min_score
    ]
72
+
73
+
74
def list_supported_entities(language: str = "en") -> list[str]:
    """Return, sorted, the entity types the analyzer reports for ``language``."""
    supported = _analyzer_engine().get_supported_entities(language=language)
    return sorted(supported)
77
+
78
+
79
def _merge_overlapping_hits(hits: Sequence[PIIHit]) -> list[tuple[int, int, str]]:
    """Collapse overlapping hits into disjoint ``(start, end, entity_type)`` regions.

    Each region spans the union of its overlapping hits and is labelled with
    the entity type of its highest-scoring hit (ties go to the longer hit).
    Hits that merely touch (one ends where the next starts) stay separate.
    """
    regions: list[list] = []
    for hit in sorted(hits, key=lambda item: (item.start, -item.end)):
        if regions and hit.start < regions[-1][1]:
            region = regions[-1]
            region[1] = max(region[1], hit.end)
            leader = region[2]
            if (hit.score, hit.end - hit.start) > (leader.score, leader.end - leader.start):
                region[2] = hit
        else:
            regions.append([hit.start, hit.end, hit])
    return [(start, end, leader.entity_type) for start, end, leader in regions]


def mask_pii_in_text(
    text: str,
    *,
    entities: Sequence[str] | None = None,
    language: str = "en",
    min_score: float = 0.35,
    mask_template: str = "[REDACTED_{entity}]",
) -> tuple[str, int]:
    """
    Replace detected PII spans with redaction tokens before external LLM calls.

    Detected spans may overlap or nest (the same characters can be reported
    under several entity types). Overlapping spans are merged and replaced by
    a single token, so no fragment of a detected span survives and tokens are
    never spliced into one another.

    Args:
        text: Text to mask.
        entities: Entity types to detect; forwarded to ``detect_pii_in_text``.
        language: Language code forwarded to ``detect_pii_in_text``.
        min_score: Minimum score forwarded to ``detect_pii_in_text``.
        mask_template: Format string for the token; ``{entity}`` receives the
            entity type.

    Returns masked text and the number of entities redacted (every detected
    hit is counted, including hits merged into a shared token).
    """
    hits = detect_pii_in_text(
        text,
        entities=entities,
        language=language,
        min_score=min_score,
    )
    if not hits:
        return text, 0

    # Build the output left to right from the ORIGINAL text so that offsets
    # are never invalidated by an earlier replacement.
    pieces: list[str] = []
    cursor = 0
    for start, end, entity_type in _merge_overlapping_hits(hits):
        pieces.append(text[cursor:start])
        pieces.append(mask_template.format(entity=entity_type))
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces), len(hits)