docintel-platform 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. docintel/__init__.py +6 -0
  2. docintel/app.py +45 -0
  3. docintel/auth/__init__.py +12 -0
  4. docintel/auth/api_keys.py +48 -0
  5. docintel/auth/limiter.py +41 -0
  6. docintel/auth/middleware.py +34 -0
  7. docintel/auth/oidc.py +45 -0
  8. docintel/cli.py +21 -0
  9. docintel/client.py +193 -0
  10. docintel/config.py +20 -0
  11. docintel/jobs/__init__.py +16 -0
  12. docintel/jobs/helpers.py +38 -0
  13. docintel/jobs/models.py +78 -0
  14. docintel/jobs/queue.py +75 -0
  15. docintel/jobs/store.py +82 -0
  16. docintel/jobs/tasks.py +173 -0
  17. docintel/jobs/webhooks.py +32 -0
  18. docintel/openapi/__init__.py +1 -0
  19. docintel/openapi/openapi.yaml +380 -0
  20. docintel/ops/__init__.py +1 -0
  21. docintel/ops/logging.py +40 -0
  22. docintel/ops/metrics.py +57 -0
  23. docintel/ops/middleware.py +40 -0
  24. docintel/routes/__init__.py +1 -0
  25. docintel/routes/jobs.py +26 -0
  26. docintel/routes/match.py +43 -0
  27. docintel/routes/openapi_docs.py +57 -0
  28. docintel/routes/ops.py +22 -0
  29. docintel/routes/pdf.py +420 -0
  30. docintel/routes/text.py +41 -0
  31. docintel/services/__init__.py +1 -0
  32. docintel/services/matching/__init__.py +6 -0
  33. docintel/services/matching/models.py +19 -0
  34. docintel/services/matching/scorer.py +64 -0
  35. docintel/services/pdf/__init__.py +26 -0
  36. docintel/services/pdf/annotator.py +188 -0
  37. docintel/services/pdf/models.py +104 -0
  38. docintel/services/pdf/ocr.py +130 -0
  39. docintel/services/pdf/pii.py +105 -0
  40. docintel/services/pdf/presets.py +26 -0
  41. docintel/services/pdf/search.py +29 -0
  42. docintel/services/pdf/sensitive.py +212 -0
  43. docintel/services/pdf/structure.py +118 -0
  44. docintel/services/pdf/structure_llm.py +136 -0
  45. docintel/services/pdf/structure_render.py +136 -0
  46. docintel/services/pdf/structure_schema.py +99 -0
  47. docintel/services/summary/__init__.py +6 -0
  48. docintel/services/summary/models.py +21 -0
  49. docintel/services/summary/textrank.py +57 -0
  50. docintel/ui.py +347 -0
  51. docintel/wsgi.py +5 -0
  52. docintel_platform-1.0.2.dist-info/METADATA +607 -0
  53. docintel_platform-1.0.2.dist-info/RECORD +56 -0
  54. docintel_platform-1.0.2.dist-info/WHEEL +5 -0
  55. docintel_platform-1.0.2.dist-info/entry_points.txt +3 -0
  56. docintel_platform-1.0.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,57 @@
1
+ """In-process request metrics (per worker in multi-process deployments)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import threading
6
+ import time
7
+ from dataclasses import dataclass, field
8
+ from typing import Any
9
+
10
+
11
@dataclass
class MetricsStore:
    """Lock-guarded aggregate of HTTP request counts, statuses, and latencies.

    All mutation happens in :meth:`record` and all reads in :meth:`snapshot`;
    both hold ``_lock`` while touching the counters.
    """

    total_requests: int = 0
    total_errors: int = 0
    total_latency_ms: float = 0.0
    requests_by_endpoint: dict[str, int] = field(default_factory=dict)
    requests_by_status: dict[str, int] = field(default_factory=dict)
    latency_by_endpoint_ms: dict[str, float] = field(default_factory=dict)
    started_at: float = field(default_factory=time.time)
    _lock: threading.Lock = field(default_factory=threading.Lock, repr=False)

    def record(self, endpoint: str, status_code: int, duration_ms: float) -> None:
        """Fold one finished request into the counters.

        Args:
            endpoint: Label to count the request under; a falsy value is
                counted as ``"unknown"``.
            status_code: HTTP status; anything >= 400 also bumps the error total.
            duration_ms: Request duration, added to the global and the
                per-endpoint latency sums.
        """
        label = endpoint if endpoint else "unknown"
        status_key = str(status_code)
        with self._lock:
            self.total_requests += 1
            if status_code >= 400:
                self.total_errors += 1
            self.total_latency_ms += duration_ms

            per_endpoint = self.requests_by_endpoint
            per_endpoint[label] = per_endpoint.get(label, 0) + 1

            per_status = self.requests_by_status
            per_status[status_key] = per_status.get(status_key, 0) + 1

            latency_sums = self.latency_by_endpoint_ms
            latency_sums[label] = latency_sums.get(label, 0.0) + duration_ms

    def snapshot(self) -> dict[str, Any]:
        """Return a point-in-time copy of the counters as a plain dict.

        The per-endpoint latency values are cumulative sums (rounded to two
        decimals), not averages. Uptime is clamped to at least 1 ms so the
        requests-per-second division never hits zero.
        """
        elapsed = max(time.time() - self.started_at, 0.001)
        with self._lock:
            count = self.total_requests
            mean_latency = self.total_latency_ms / count if count else 0.0

            rounded_latency: dict[str, float] = {}
            for name, total_ms in self.latency_by_endpoint_ms.items():
                rounded_latency[name] = round(total_ms, 2)

            return {
                "total_requests": count,
                "total_errors": self.total_errors,
                "avg_latency_ms": round(mean_latency, 2),
                "requests_per_second": round(count / elapsed, 4),
                "uptime_seconds": round(elapsed, 2),
                "requests_by_endpoint": dict(self.requests_by_endpoint),
                "requests_by_status": dict(self.requests_by_status),
                "latency_by_endpoint_ms": rounded_latency,
            }
55
+
56
+
57
# Module-level instance imported by the request hooks (which call record())
# and by the /metrics route (which calls snapshot()).
metrics_store = MetricsStore()
@@ -0,0 +1,40 @@
1
+ """Request instrumentation hooks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import time
7
+
8
+ from flask import Flask, g, request
9
+
10
+ from docintel.ops.metrics import metrics_store
11
+
12
# Named logger used for the per-request "request completed" records.
logger = logging.getLogger("docintel.request")
13
+
14
+
15
def register_request_hooks(app: Flask) -> None:
    """Attach timing, logging, and metrics collection to each request.

    Registers a ``before_request`` hook that stamps the start time on ``g``
    and an ``after_request`` hook that records the request in
    ``metrics_store`` and emits a structured "request completed" log record.

    Args:
        app: Flask application to instrument.
    """

    @app.before_request
    def _start_timer() -> None:
        g.request_start = time.perf_counter()

    @app.after_request
    def _record_request(response):
        start = getattr(g, "request_start", None)
        # The start stamp is absent if the timer hook never ran for this
        # request; report a zero duration in that case.
        duration_ms = (time.perf_counter() - start) * 1000 if start is not None else 0.0
        endpoint = request.endpoint or request.path

        # Only routed endpoint names are used as metrics keys. Unrouted
        # requests (request.endpoint is None, e.g. 404s on arbitrary URLs)
        # are passed as "" so the store files them under its single
        # "unknown" bucket; keying on the raw path would let clients grow
        # the in-process dicts without bound.
        metrics_store.record(request.endpoint or "", response.status_code, duration_ms)

        # The log record still carries the raw path for unrouted requests.
        logger.info(
            "request completed",
            extra={
                "method": request.method,
                "path": request.path,
                "endpoint": endpoint,
                "status_code": response.status_code,
                "duration_ms": round(duration_ms, 2),
            },
        )
        return response
@@ -0,0 +1 @@
1
+ """HTTP route blueprints."""
@@ -0,0 +1,26 @@
1
+ """Async job status API."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from flask import Blueprint, jsonify
6
+
7
+ from docintel.auth.limiter import limiter
8
+ from docintel.jobs.store import get_job
9
+
10
# Blueprint for the job-status routes, mounted under /v1/jobs.
jobs_bp = Blueprint("jobs", __name__, url_prefix="/v1/jobs")
11
+
12
+
13
@jobs_bp.get("/<job_id>")
@limiter.limit("600 per hour")
def job_status(job_id: str):
    """Report the stored state of an async job.

    Responds 404 with an error body when ``get_job`` returns ``None``;
    otherwise 200 with the record's ``to_dict()`` fields plus ``poll_url``.
    """
    job = get_job(job_id)
    if job is not None:
        # Same merge order as a dict literal: keys from to_dict() may
        # override "status", and "poll_url" is always set last.
        body = {"status": "ok"}
        body.update(job.to_dict())
        body["poll_url"] = f"/v1/jobs/{job_id}"
        return jsonify(body), 200

    return jsonify({"error": f"Job not found: {job_id}"}), 404
@@ -0,0 +1,43 @@
1
+ """Resume matching API routes."""
2
+
3
+ from flask import Blueprint, jsonify, request
4
+
5
+ from docintel.auth.limiter import limiter
6
+ from docintel.services.matching import match_resume_to_job
7
+
8
# Blueprint for the resume-matching routes, mounted under /v1/match.
match_bp = Blueprint("match", __name__, url_prefix="/v1/match")
9
+
10
+
11
@match_bp.post("/resume")
@limiter.limit("100 per hour")
def match_resume():
    """Score how well a resume matches a job description.

    Expects a JSON object with string fields ``resume`` and
    ``job_description`` (both default to ``""``) and an optional
    ``top_keywords`` (default 25, must coerce to an integer in 1..100).

    Returns:
        200 with ``{"status": "ok", **result.to_dict()}`` on success, or
        400 with an ``error`` message for a non-JSON body, wrongly typed
        fields, an out-of-range ``top_keywords``, or a ``ValueError`` raised
        by ``match_resume_to_job``.
    """
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({"error": "Request body must be JSON."}), 400

    resume = payload.get("resume", "")
    job_description = payload.get("job_description", "")
    top_keywords = payload.get("top_keywords", 25)

    if not isinstance(resume, str) or not isinstance(job_description, str):
        return jsonify({"error": "Fields 'resume' and 'job_description' must be strings."}), 400

    try:
        top_keywords = int(top_keywords)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: Python's JSON decoder accepts Infinity, and
        # int(float("inf")) raises OverflowError rather than ValueError.
        return jsonify({"error": "Field 'top_keywords' must be an integer."}), 400

    if top_keywords < 1 or top_keywords > 100:
        return jsonify({"error": "Field 'top_keywords' must be between 1 and 100."}), 400

    try:
        result = match_resume_to_job(
            resume=resume,
            job_description=job_description,
            top_keywords=top_keywords,
        )
    except ValueError as exc:
        return jsonify({"error": str(exc)}), 400

    return jsonify({"status": "ok", **result.to_dict()}), 200
@@ -0,0 +1,57 @@
1
+ """OpenAPI specification and Swagger UI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import yaml
8
+ from flask import Blueprint, Response, jsonify, render_template_string
9
+
10
+ from docintel import __version__
11
+
12
# Blueprint for the OpenAPI document and Swagger UI routes (no URL prefix).
docs_bp = Blueprint("openapi", __name__)
13
+
14
# Static page served at /docs. It loads the Swagger UI stylesheet and bundle
# from the unpkg CDN (swagger-ui-dist, major version 5) and points the UI at
# this service's own /openapi.json.
# NOTE(review): the CDN assets are referenced by floating major version and
# without integrity attributes — confirm this is acceptable for deployments.
SWAGGER_UI_HTML = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Document Intelligence API</title>
<link rel="stylesheet" href="https://unpkg.com/swagger-ui-dist@5/swagger-ui.css"/>
</head>
<body>
<div id="swagger-ui"></div>
<script src="https://unpkg.com/swagger-ui-dist@5/swagger-ui-bundle.js"></script>
<script>
window.onload = () => {
SwaggerUIBundle({
url: '/openapi.json',
dom_id: '#swagger-ui',
deepLinking: true,
presets: [SwaggerUIBundle.presets.apis],
});
};
</script>
</body>
</html>
"""
37
+
38
+
39
def _load_openapi_spec() -> dict:
    """Parse the bundled ``openapi/openapi.yaml`` and stamp the package version.

    The YAML file is located relative to this module (one package level up,
    in the ``openapi`` directory) and is re-read on every call.
    ``info.version`` is always overwritten with ``__version__``.
    """
    package_root = Path(__file__).resolve().parents[1]
    yaml_file = package_root / "openapi" / "openapi.yaml"
    with yaml_file.open("r", encoding="utf-8") as stream:
        document = yaml.safe_load(stream)

    # setdefault returns the existing "info" mapping, or the fresh {} it
    # just inserted, so the version lands in the right place either way.
    document.setdefault("info", {})["version"] = __version__
    return document
46
+
47
+
48
@docs_bp.get("/openapi.json")
def openapi_json():
    """Serve the OpenAPI 3 document as JSON."""
    spec = _load_openapi_spec()
    return jsonify(spec)
52
+
53
+
54
@docs_bp.get("/docs")
def swagger_ui():
    """Serve the Swagger UI page that browses this service's REST API."""
    page = render_template_string(SWAGGER_UI_HTML)
    return Response(page, mimetype="text/html")
docintel/routes/ops.py ADDED
@@ -0,0 +1,22 @@
1
+ """Operations endpoints (metrics)."""
2
+
3
+ from flask import Blueprint, jsonify
4
+
5
+ from docintel import __version__
6
+ from docintel.ops.metrics import metrics_store
7
+
8
# Blueprint for operational endpoints (no URL prefix).
ops_bp = Blueprint("ops", __name__)
9
+
10
+
11
@ops_bp.get("/metrics")
def metrics():
    """Expose the in-process request counters and latency aggregates as JSON.

    The response carries fixed ``status``/``service``/``version`` fields
    followed by every key of ``metrics_store.snapshot()``.
    """
    counters = metrics_store.snapshot()
    body = {
        "status": "ok",
        "service": "document-intelligence-platform",
        "version": __version__,
    }
    body.update(counters)
    return jsonify(body)
docintel/routes/pdf.py ADDED
@@ -0,0 +1,420 @@
1
+ """PDF annotation API routes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import uuid
6
+ from pathlib import Path
7
+
8
+ from flask import Blueprint, current_app, jsonify, request, send_file
9
+ from werkzeug.utils import secure_filename
10
+
11
+ from docintel.auth.limiter import limiter
12
+ from docintel.services.pdf import (
13
+ Action,
14
+ DEFAULT_PII_ENTITIES,
15
+ StructureMode,
16
+ annotate_pdf,
17
+ detect_sensitive_pdf,
18
+ list_supported_entities,
19
+ structure_pdf,
20
+ )
21
+
22
# Blueprint for the PDF processing routes, mounted under /v1/pdf.
pdf_bp = Blueprint("pdf", __name__, url_prefix="/v1/pdf")
23
+
24
+
25
def _upload_dir() -> Path:
    """Return the configured ``UPLOAD_DIR`` as a path, creating it if missing."""
    upload_root = Path(current_app.config["UPLOAD_DIR"])
    upload_root.mkdir(parents=True, exist_ok=True)
    return upload_root
29
+
30
+
31
+ def _parse_pages(raw_pages: str | None) -> list[int] | None:
32
+ if not raw_pages or not raw_pages.strip():
33
+ return None
34
+ return [int(page.strip()) for page in raw_pages.split(",") if page.strip()]
35
+
36
+
37
@pdf_bp.post("/annotate")
@limiter.limit("60 per hour")
def annotate():
    """Search an uploaded PDF for a pattern and apply an annotation action.

    Form fields: ``file`` (required, must end in ``.pdf``), ``action``
    (defaults to highlight), ``pattern`` (required unless the action is
    remove), ``pages`` (optional comma-separated indexes). ``format`` may be
    given as a query or form field; ``json`` returns a JSON summary with a
    download URL, anything else streams the annotated PDF with
    ``X-Docintel-*`` headers.

    Validation failures return 400; exceptions from ``annotate_pdf`` are
    mapped to 404 (``FileNotFoundError``), 403 (``PermissionError``) or
    400 (``ValueError``).
    """
    uploaded = request.files.get("file")
    if uploaded is None or not uploaded.filename:
        return jsonify({"error": "Missing PDF file in form field 'file'."}), 400

    safe_name = secure_filename(uploaded.filename)
    if not safe_name.lower().endswith(".pdf"):
        return jsonify({"error": "Only PDF files are supported."}), 400

    requested_action = request.form.get("action", Action.HIGHLIGHT.value)
    try:
        action = Action.from_value(requested_action)
    except ValueError as exc:
        return jsonify({"error": str(exc)}), 400

    pattern = request.form.get("pattern", "")
    pattern_required = action != Action.REMOVE
    if pattern_required and not pattern.strip():
        return jsonify({"error": "Missing search pattern in form field 'pattern'."}), 400

    try:
        pages = _parse_pages(request.form.get("pages"))
    except ValueError:
        return jsonify({"error": "Invalid pages value. Use comma-separated page indexes."}), 400

    # Each request gets its own working directory under the upload root.
    job_id = uuid.uuid4().hex[:12]
    job_dir = _upload_dir() / job_id
    job_dir.mkdir(parents=True, exist_ok=True)

    source_pdf = job_dir / safe_name
    annotated_pdf = job_dir / f"annotated_{safe_name}"
    uploaded.save(source_pdf)

    try:
        result = annotate_pdf(
            input_file=source_pdf,
            output_file=annotated_pdf,
            pattern=pattern,
            action=action,
            pages=pages,
        )
    except (FileNotFoundError, PermissionError, ValueError) as exc:
        if isinstance(exc, FileNotFoundError):
            status_code = 404
        elif isinstance(exc, PermissionError):
            status_code = 403
        else:
            status_code = 400
        return jsonify({"error": str(exc)}), status_code

    # The query string wins over the form field; default is a file download.
    form_format = request.form.get("format", "file")
    wanted_format = request.args.get("format", form_format).lower()

    if wanted_format != "json":
        response = send_file(
            annotated_pdf,
            mimetype="application/pdf",
            as_attachment=True,
            download_name=annotated_pdf.name,
        )
        response.headers["X-Docintel-Matches"] = str(result.matches)
        response.headers["X-Docintel-Pages-Processed"] = str(result.pages_processed)
        response.headers["X-Docintel-Action"] = result.action.value
        return response

    body = {"status": "ok"}
    body.update(result.to_dict())
    body["download_url"] = f"/v1/pdf/files/{job_id}/{annotated_pdf.name}"
    return jsonify(body), 200
107
+
108
+
109
+ def _parse_entities(raw_entities: str | None) -> list[str] | None:
110
+ if not raw_entities or not raw_entities.strip():
111
+ return None
112
+ return [item.strip() for item in raw_entities.split(",") if item.strip()]
113
+
114
+
115
@pdf_bp.get("/entities")
@limiter.limit("120 per hour")
def supported_entities():
    """List the entity types available for sensitive detection.

    Any exception raised by ``list_supported_entities`` is reported as a
    503 whose ``error`` field is the exception text.
    """
    try:
        available = list_supported_entities()
    except Exception as exc:
        return jsonify({"error": str(exc)}), 503

    body = {
        "status": "ok",
        "default_entities": list(DEFAULT_PII_ENTITIES),
        "supported_entities": available,
    }
    return jsonify(body)
131
+
132
+
133
@pdf_bp.post("/detect-sensitive")
@limiter.limit("30 per hour")
def detect_sensitive():
    """
    Detect PII with Presidio and annotate the PDF.

    Auto-falls back to EasyOCR when native PDF text is empty (scanned documents).

    Form fields: ``file`` (required, must end in ``.pdf``), ``action``
    (defaults to highlight; remove is rejected), ``entities`` (optional
    comma-separated list), ``pattern``, ``force_ocr`` (default false),
    ``add_text_layer`` (default true), ``min_score`` (finite number, default
    0.35), ``callback_url``. ``async`` and ``format`` may be given as query or
    form fields.

    With ``async=true`` the job is queued and the enqueue response is
    returned. Otherwise the PDF is processed inline and returned as a file
    (default) or as JSON with a download URL when ``format=json``.
    Validation failures return 400; ``RuntimeError``, ``FileNotFoundError``,
    ``PermissionError`` and ``ValueError`` from the detector map to 503, 404,
    403 and 400.
    """
    # Function-scope import: keeps this block self-contained without
    # touching the module's import section.
    import math

    upload = request.files.get("file")
    if upload is None or not upload.filename:
        return jsonify({"error": "Missing PDF file in form field 'file'."}), 400

    filename = secure_filename(upload.filename)
    if not filename.lower().endswith(".pdf"):
        return jsonify({"error": "Only PDF files are supported."}), 400

    action_raw = request.form.get("action", Action.HIGHLIGHT.value)
    try:
        action = Action.from_value(action_raw)
    except ValueError as exc:
        return jsonify({"error": str(exc)}), 400

    if action == Action.REMOVE:
        return jsonify({"error": "Action 'Remove' is not supported for sensitive detection."}), 400

    entities = _parse_entities(request.form.get("entities"))
    pattern = request.form.get("pattern", "").strip() or None
    force_ocr = request.form.get("force_ocr", "false").lower() == "true"
    add_text_layer = request.form.get("add_text_layer", "true").lower() == "true"

    try:
        min_score = float(request.form.get("min_score", "0.35"))
    except ValueError:
        return jsonify({"error": "Field 'min_score' must be a number."}), 400
    # float() also parses "nan", "inf" and "-inf"; none of them is a usable
    # score threshold, so reject them with the same validation error.
    if not math.isfinite(min_score):
        return jsonify({"error": "Field 'min_score' must be a number."}), 400

    callback_url = request.form.get("callback_url", "").strip() or None
    run_async = _parse_async_flag()

    # Each request gets its own working directory under the upload root.
    job_id = uuid.uuid4().hex[:12]
    work_dir = _upload_dir() / job_id
    work_dir.mkdir(parents=True, exist_ok=True)

    input_path = work_dir / filename
    output_path = work_dir / f"sensitive_{filename}"
    upload.save(input_path)

    if run_async:
        return _enqueue_detect_sensitive_job(
            job_id=job_id,
            input_path=input_path,
            output_path=output_path,
            action=action,
            entities=entities,
            pattern=pattern,
            force_ocr=force_ocr,
            add_text_layer=add_text_layer,
            min_score=min_score,
            callback_url=callback_url,
        )

    try:
        result = detect_sensitive_pdf(
            input_file=input_path,
            output_file=output_path,
            entities=entities,
            action=action,
            force_ocr=force_ocr,
            add_text_layer=add_text_layer,
            pattern=pattern,
            min_score=min_score,
        )
    except RuntimeError as exc:
        return jsonify({"error": str(exc)}), 503
    except FileNotFoundError as exc:
        return jsonify({"error": str(exc)}), 404
    except PermissionError as exc:
        return jsonify({"error": str(exc)}), 403
    except ValueError as exc:
        return jsonify({"error": str(exc)}), 400

    # The query string wins over the form field; default is a file download.
    response_format = request.args.get("format", request.form.get("format", "file")).lower()

    if response_format == "json":
        payload = {
            "status": "ok",
            **result.to_dict(),
            "download_url": f"/v1/pdf/files/{job_id}/{output_path.name}",
        }
        return jsonify(payload), 200

    response = send_file(
        output_path,
        mimetype="application/pdf",
        as_attachment=True,
        download_name=output_path.name,
    )
    response.headers["X-Docintel-Matches"] = str(result.matches)
    response.headers["X-Docintel-Pages-Processed"] = str(result.pages_processed)
    response.headers["X-Docintel-OCR-Pages"] = ",".join(str(page) for page in result.ocr_pages)
    response.headers["X-Docintel-Action"] = result.action.value
    return response
234
+
235
+
236
def _parse_async_flag() -> bool:
    """Report whether the current request asked for async processing.

    Reads ``async`` from the query string, falling back to the form field
    and then to ``"false"``; only the case-insensitive value ``true`` counts.
    """
    form_value = request.form.get("async", "false")
    flag = request.args.get("async", form_value)
    return str(flag).lower() == "true"
239
+
240
+
241
@pdf_bp.post("/structure")
@limiter.limit("20 per hour")
def structure():
    """
    Structure an unstructured or scanned PDF with OCR and an LLM.

    Returns a curated typeset PDF (curate) or the original with a searchable
    invisible text layer (searchable).

    Use ``async=true`` to queue the job and poll ``GET /v1/jobs/<job_id>``.

    Form fields: ``file`` (required, must end in ``.pdf``), ``mode``
    (defaults to curate), ``force_ocr`` and ``redact_before_llm`` (default
    false), ``callback_url``. ``format=json`` (query or form) returns a JSON
    summary with a download URL instead of the PDF. Validation failures
    return 400; ``RuntimeError``, ``FileNotFoundError``, ``PermissionError``
    and ``ValueError`` from ``structure_pdf`` map to 503, 404, 403 and 400.
    """
    upload = request.files.get("file")
    if upload is None or not upload.filename:
        return jsonify({"error": "Missing PDF file in form field 'file'."}), 400

    filename = secure_filename(upload.filename)
    if not filename.lower().endswith(".pdf"):
        return jsonify({"error": "Only PDF files are supported."}), 400

    mode_raw = request.form.get("mode", StructureMode.CURATE.value)
    try:
        mode = StructureMode.from_value(mode_raw)
    except ValueError as exc:
        return jsonify({"error": str(exc)}), 400

    force_ocr = request.form.get("force_ocr", "false").lower() == "true"
    redact_before_llm = request.form.get("redact_before_llm", "false").lower() == "true"
    callback_url = request.form.get("callback_url", "").strip() or None
    run_async = _parse_async_flag()

    # Each request gets its own working directory under the upload root.
    job_id = uuid.uuid4().hex[:12]
    work_dir = _upload_dir() / job_id
    work_dir.mkdir(parents=True, exist_ok=True)

    input_path = work_dir / filename
    output_path = work_dir / f"structured_{filename}"
    upload.save(input_path)

    if run_async:
        return _enqueue_structure_job(
            job_id=job_id,
            input_path=input_path,
            output_path=output_path,
            mode=mode,
            force_ocr=force_ocr,
            redact_before_llm=redact_before_llm,
            callback_url=callback_url,
        )

    try:
        result = structure_pdf(
            input_file=input_path,
            output_file=output_path,
            mode=mode,
            force_ocr=force_ocr,
            redact_before_llm=redact_before_llm,
        )
    except RuntimeError as exc:
        return jsonify({"error": str(exc)}), 503
    except FileNotFoundError as exc:
        return jsonify({"error": str(exc)}), 404
    except PermissionError as exc:
        return jsonify({"error": str(exc)}), 403
    except ValueError as exc:
        return jsonify({"error": str(exc)}), 400

    # The query string wins over the form field; default is a file download.
    response_format = request.args.get("format", request.form.get("format", "file")).lower()

    if response_format == "json":
        payload = {
            "status": "ok",
            **result.to_dict(),
            "download_url": f"/v1/pdf/files/{job_id}/{output_path.name}",
        }
        return jsonify(payload), 200

    response = send_file(
        output_path,
        mimetype="application/pdf",
        as_attachment=True,
        download_name=output_path.name,
    )
    response.headers["X-Docintel-Pages-Processed"] = str(result.pages_processed)
    response.headers["X-Docintel-OCR-Pages"] = ",".join(str(page) for page in result.ocr_pages)
    response.headers["X-Docintel-Mode"] = result.mode.value

    # The title comes from document content, so it can hold line breaks or
    # characters outside Latin-1. Header values may contain neither: flatten
    # line breaks to spaces and replace unencodable characters with "?" so
    # the download does not fail on an unusual title. A missing title
    # becomes an empty header value.
    raw_title = str(result.document_title or "")
    flat_title = " ".join(raw_title.splitlines())
    response.headers["X-Docintel-Document-Title"] = flat_title.encode(
        "latin-1", "replace"
    ).decode("latin-1")
    return response
328
+
329
+
330
def _enqueue_structure_job(
    *,
    job_id: str,
    input_path: Path,
    output_path: Path,
    mode: StructureMode,
    force_ocr: bool,
    redact_before_llm: bool,
    callback_url: str | None,
):
    """Build the async-accepted response and queue a PDF structure job.

    The response from ``enqueue_async_response`` is returned unchanged. The
    job is only pushed onto the queue when the second element of that
    response is 202.
    """
    from docintel.jobs.helpers import enqueue_async_response
    from docintel.jobs.models import JobType
    from docintel.jobs.queue import enqueue_structure_job

    reply = enqueue_async_response(
        job_id=job_id,
        job_type=JobType.PDF_STRUCTURE,
        callback_url=callback_url,
    )
    status_code = reply[1]
    if status_code == 202:
        # Paths are handed to the queue as plain strings.
        enqueue_structure_job(
            job_id=job_id,
            input_path=str(input_path),
            output_path=str(output_path),
            mode=mode.value,
            force_ocr=force_ocr,
            output_filename=output_path.name,
            redact_before_llm=redact_before_llm,
        )
    return reply
362
+
363
+
364
def _enqueue_detect_sensitive_job(
    *,
    job_id: str,
    input_path: Path,
    output_path: Path,
    action: Action,
    entities: list[str] | None,
    pattern: str | None,
    force_ocr: bool,
    add_text_layer: bool,
    min_score: float,
    callback_url: str | None,
):
    """Build the async-accepted response and queue a sensitive-detection job.

    The response from ``enqueue_async_response`` is returned unchanged. The
    job is only pushed onto the queue when the second element of that
    response is 202.
    """
    from docintel.jobs.helpers import enqueue_async_response
    from docintel.jobs.models import JobType
    from docintel.jobs.queue import enqueue_detect_sensitive_job

    reply = enqueue_async_response(
        job_id=job_id,
        job_type=JobType.PDF_DETECT_SENSITIVE,
        callback_url=callback_url,
    )
    status_code = reply[1]
    if status_code == 202:
        # Paths and the action enum are handed to the queue as plain strings.
        enqueue_detect_sensitive_job(
            job_id=job_id,
            input_path=str(input_path),
            output_path=str(output_path),
            output_filename=output_path.name,
            action=action.value,
            force_ocr=force_ocr,
            add_text_layer=add_text_layer,
            min_score=min_score,
            entities=entities,
            pattern=pattern,
        )
    return reply
402
+
403
+
404
@pdf_bp.get("/files/<job_id>/<filename>")
@limiter.limit("200 per hour")
def download_file(job_id: str, filename: str):
    """Serve a file from a job's working directory as a PDF attachment.

    Both URL parts are passed through ``secure_filename`` before being joined
    to the upload root. Responds 404 when the resulting path is not a file.
    """
    job_part = secure_filename(job_id)
    name_part = secure_filename(filename)
    target = _upload_dir() / job_part / name_part

    if target.is_file():
        return send_file(
            target,
            mimetype="application/pdf",
            as_attachment=True,
            download_name=name_part,
        )

    return jsonify({"error": "Annotated PDF not found."}), 404