docintel-platform 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docintel/__init__.py +6 -0
- docintel/app.py +45 -0
- docintel/auth/__init__.py +12 -0
- docintel/auth/api_keys.py +48 -0
- docintel/auth/limiter.py +41 -0
- docintel/auth/middleware.py +34 -0
- docintel/auth/oidc.py +45 -0
- docintel/cli.py +21 -0
- docintel/client.py +193 -0
- docintel/config.py +20 -0
- docintel/jobs/__init__.py +16 -0
- docintel/jobs/helpers.py +38 -0
- docintel/jobs/models.py +78 -0
- docintel/jobs/queue.py +75 -0
- docintel/jobs/store.py +82 -0
- docintel/jobs/tasks.py +173 -0
- docintel/jobs/webhooks.py +32 -0
- docintel/openapi/__init__.py +1 -0
- docintel/openapi/openapi.yaml +380 -0
- docintel/ops/__init__.py +1 -0
- docintel/ops/logging.py +40 -0
- docintel/ops/metrics.py +57 -0
- docintel/ops/middleware.py +40 -0
- docintel/routes/__init__.py +1 -0
- docintel/routes/jobs.py +26 -0
- docintel/routes/match.py +43 -0
- docintel/routes/openapi_docs.py +57 -0
- docintel/routes/ops.py +22 -0
- docintel/routes/pdf.py +420 -0
- docintel/routes/text.py +41 -0
- docintel/services/__init__.py +1 -0
- docintel/services/matching/__init__.py +6 -0
- docintel/services/matching/models.py +19 -0
- docintel/services/matching/scorer.py +64 -0
- docintel/services/pdf/__init__.py +26 -0
- docintel/services/pdf/annotator.py +188 -0
- docintel/services/pdf/models.py +104 -0
- docintel/services/pdf/ocr.py +130 -0
- docintel/services/pdf/pii.py +105 -0
- docintel/services/pdf/presets.py +26 -0
- docintel/services/pdf/search.py +29 -0
- docintel/services/pdf/sensitive.py +212 -0
- docintel/services/pdf/structure.py +118 -0
- docintel/services/pdf/structure_llm.py +136 -0
- docintel/services/pdf/structure_render.py +136 -0
- docintel/services/pdf/structure_schema.py +99 -0
- docintel/services/summary/__init__.py +6 -0
- docintel/services/summary/models.py +21 -0
- docintel/services/summary/textrank.py +57 -0
- docintel/ui.py +347 -0
- docintel/wsgi.py +5 -0
- docintel_platform-1.0.2.dist-info/METADATA +607 -0
- docintel_platform-1.0.2.dist-info/RECORD +56 -0
- docintel_platform-1.0.2.dist-info/WHEEL +5 -0
- docintel_platform-1.0.2.dist-info/entry_points.txt +3 -0
- docintel_platform-1.0.2.dist-info/top_level.txt +1 -0
docintel/ops/metrics.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""In-process request metrics (per worker in multi-process deployments)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import threading
|
|
6
|
+
import time
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class MetricsStore:
    """Lock-guarded request counters and latency totals for one process.

    ``record`` folds a finished request into the totals; ``snapshot`` returns
    a plain-dict copy of the current state with derived averages and rates.
    """

    # Running totals across every recorded request.
    total_requests: int = 0
    total_errors: int = 0
    total_latency_ms: float = 0.0
    # Per-endpoint / per-status breakdowns (status codes are stored as strings).
    requests_by_endpoint: dict[str, int] = field(default_factory=dict)
    requests_by_status: dict[str, int] = field(default_factory=dict)
    latency_by_endpoint_ms: dict[str, float] = field(default_factory=dict)
    # Wall-clock creation time, used to derive uptime and request rate.
    started_at: float = field(default_factory=time.time)
    # Serialises every read-modify-write and the snapshot copy.
    _lock: threading.Lock = field(default_factory=threading.Lock, repr=False)

    def record(self, endpoint: str, status_code: int, duration_ms: float) -> None:
        """Fold one completed request into the counters.

        Args:
            endpoint: Label to aggregate under; a falsy value is stored as
                ``"unknown"``.
            status_code: HTTP status; values of 400 and above count as errors.
            duration_ms: Request duration in milliseconds.
        """
        label = endpoint if endpoint else "unknown"
        status_label = str(status_code)
        is_error = status_code >= 400

        with self._lock:
            self.total_requests += 1
            if is_error:
                self.total_errors += 1
            self.total_latency_ms += duration_ms

            endpoint_hits = self.requests_by_endpoint.get(label, 0)
            self.requests_by_endpoint[label] = endpoint_hits + 1

            status_hits = self.requests_by_status.get(status_label, 0)
            self.requests_by_status[status_label] = status_hits + 1

            endpoint_latency = self.latency_by_endpoint_ms.get(label, 0.0)
            self.latency_by_endpoint_ms[label] = endpoint_latency + duration_ms

    def snapshot(self) -> dict[str, Any]:
        """Return a copy of the current metrics plus derived values.

        Returns:
            A dict with the request/error totals, the rounded average latency,
            requests per second, uptime in seconds, and copies of the
            per-endpoint and per-status breakdowns.
        """
        # Clamp so the rate below never divides by zero right after start-up.
        uptime_seconds = max(time.time() - self.started_at, 0.001)

        with self._lock:
            total = self.total_requests
            errors = self.total_errors
            if total:
                avg_latency = self.total_latency_ms / total
            else:
                avg_latency = 0.0
            by_endpoint = dict(self.requests_by_endpoint)
            by_status = dict(self.requests_by_status)
            latency_by_endpoint = {}
            for label, value in self.latency_by_endpoint_ms.items():
                latency_by_endpoint[label] = round(value, 2)

        return {
            "total_requests": total,
            "total_errors": errors,
            "avg_latency_ms": round(avg_latency, 2),
            "requests_per_second": round(total / uptime_seconds, 4),
            "uptime_seconds": round(uptime_seconds, 2),
            "requests_by_endpoint": by_endpoint,
            "requests_by_status": by_status,
            "latency_by_endpoint_ms": latency_by_endpoint,
        }
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Module-level store instance; it lives in this process only, so each worker
# process keeps its own counters.
metrics_store = MetricsStore()
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Request instrumentation hooks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import time
|
|
7
|
+
|
|
8
|
+
from flask import Flask, g, request
|
|
9
|
+
|
|
10
|
+
from docintel.ops.metrics import metrics_store
|
|
11
|
+
|
|
12
|
+
# Dedicated logger name for the per-request completion records.
logger = logging.getLogger("docintel.request")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def register_request_hooks(app: Flask) -> None:
    """Attach timing, logging, and metrics collection to each request.

    Registers a ``before_request`` hook that stamps the start time on ``g``
    and an ``after_request`` hook that records the request in
    ``metrics_store`` and emits a "request completed" log record.

    Args:
        app: The Flask application to instrument.
    """

    @app.before_request
    def _start_timer() -> None:
        g.request_start = time.perf_counter()

    @app.after_request
    def _record_request(response):
        start = getattr(g, "request_start", None)
        # Fall back to 0.0 when no start stamp was set on ``g``.
        duration_ms = (time.perf_counter() - start) * 1000 if start is not None else 0.0
        endpoint = request.endpoint or request.path

        # Key metrics by the matched endpoint only. ``request.endpoint`` is
        # None when no route matched; using the raw path there would add one
        # permanent dict entry per distinct unmatched URL, letting arbitrary
        # 404 traffic grow the in-process store without bound.
        metrics_store.record(
            request.endpoint or "unknown",
            response.status_code,
            duration_ms,
        )

        # The log record still carries the concrete path for unmatched requests.
        logger.info(
            "request completed",
            extra={
                "method": request.method,
                "path": request.path,
                "endpoint": endpoint,
                "status_code": response.status_code,
                "duration_ms": round(duration_ms, 2),
            },
        )
        return response
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""HTTP route blueprints."""
|
docintel/routes/jobs.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Async job status API."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from flask import Blueprint, jsonify
|
|
6
|
+
|
|
7
|
+
from docintel.auth.limiter import limiter
|
|
8
|
+
from docintel.jobs.store import get_job
|
|
9
|
+
|
|
10
|
+
# Blueprint for the job endpoints; its routes are mounted under /v1/jobs.
jobs_bp = Blueprint("jobs", __name__, url_prefix="/v1/jobs")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@jobs_bp.get("/<job_id>")
@limiter.limit("600 per hour")
def job_status(job_id: str):
    """Report the stored state of an async job.

    Args:
        job_id: Identifier taken from the URL.

    Returns:
        ``(response, 200)`` with the job's ``to_dict()`` fields plus a
        ``poll_url``, or ``(response, 404)`` when ``get_job`` returns ``None``.
    """
    job = get_job(job_id)
    if job is not None:
        body = {"status": "ok"}
        # Fields from the job record are layered over the default status.
        body.update(job.to_dict())
        body["poll_url"] = f"/v1/jobs/{job_id}"
        return jsonify(body), 200

    not_found = {"error": f"Job not found: {job_id}"}
    return jsonify(not_found), 404
|
docintel/routes/match.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Resume matching API routes."""
|
|
2
|
+
|
|
3
|
+
from flask import Blueprint, jsonify, request
|
|
4
|
+
|
|
5
|
+
from docintel.auth.limiter import limiter
|
|
6
|
+
from docintel.services.matching import match_resume_to_job
|
|
7
|
+
|
|
8
|
+
# Blueprint for the matching endpoints; its routes are mounted under /v1/match.
match_bp = Blueprint("match", __name__, url_prefix="/v1/match")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@match_bp.post("/resume")
@limiter.limit("100 per hour")
def match_resume():
    """Score how well a resume matches a job description.

    Expects a JSON object body with string fields ``resume`` and
    ``job_description`` and an optional ``top_keywords`` (default 25, must be
    between 1 and 100 after integer conversion).

    Returns:
        ``(response, 200)`` with ``"status": "ok"`` merged with the matcher
        result's ``to_dict()``, or ``(response, 400)`` with an ``error``
        message when the body or a field is invalid or the matcher raises
        ``ValueError``.
    """
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({"error": "Request body must be JSON."}), 400

    resume = payload.get("resume", "")
    job_description = payload.get("job_description", "")
    top_keywords = payload.get("top_keywords", 25)

    if not isinstance(resume, str) or not isinstance(job_description, str):
        return jsonify({"error": "Fields 'resume' and 'job_description' must be strings."}), 400

    try:
        top_keywords = int(top_keywords)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers an infinite float: Python's json decoder accepts
        # the non-standard ``Infinity`` literal by default, and int(inf) raises
        # OverflowError rather than ValueError. Without it the request would
        # fail with a 500 instead of this validation error.
        return jsonify({"error": "Field 'top_keywords' must be an integer."}), 400

    if top_keywords < 1 or top_keywords > 100:
        return jsonify({"error": "Field 'top_keywords' must be between 1 and 100."}), 400

    try:
        result = match_resume_to_job(
            resume=resume,
            job_description=job_description,
            top_keywords=top_keywords,
        )
    except ValueError as exc:
        # The matcher signals unusable input with ValueError; surface its message.
        return jsonify({"error": str(exc)}), 400

    return jsonify({"status": "ok", **result.to_dict()}), 200
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""OpenAPI specification and Swagger UI."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
from flask import Blueprint, Response, jsonify, render_template_string
|
|
9
|
+
|
|
10
|
+
from docintel import __version__
|
|
11
|
+
|
|
12
|
+
# Blueprint for the API documentation routes; registered without a URL prefix.
docs_bp = Blueprint("openapi", __name__)
|
|
13
|
+
|
|
14
|
+
SWAGGER_UI_HTML = """<!DOCTYPE html>
|
|
15
|
+
<html lang="en">
|
|
16
|
+
<head>
|
|
17
|
+
<meta charset="utf-8"/>
|
|
18
|
+
<title>Document Intelligence API</title>
|
|
19
|
+
<link rel="stylesheet" href="https://unpkg.com/swagger-ui-dist@5/swagger-ui.css"/>
|
|
20
|
+
</head>
|
|
21
|
+
<body>
|
|
22
|
+
<div id="swagger-ui"></div>
|
|
23
|
+
<script src="https://unpkg.com/swagger-ui-dist@5/swagger-ui-bundle.js"></script>
|
|
24
|
+
<script>
|
|
25
|
+
window.onload = () => {
|
|
26
|
+
SwaggerUIBundle({
|
|
27
|
+
url: '/openapi.json',
|
|
28
|
+
dom_id: '#swagger-ui',
|
|
29
|
+
deepLinking: true,
|
|
30
|
+
presets: [SwaggerUIBundle.presets.apis],
|
|
31
|
+
});
|
|
32
|
+
};
|
|
33
|
+
</script>
|
|
34
|
+
</body>
|
|
35
|
+
</html>
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _load_openapi_spec() -> dict:
    """Read the bundled OpenAPI YAML and stamp the package version into it.

    The file is located at ``openapi/openapi.yaml`` relative to the parent
    package of this module and is parsed on every call.

    Returns:
        The parsed specification with ``info.version`` set to ``__version__``.
    """
    package_root = Path(__file__).resolve().parents[1]
    yaml_file = package_root / "openapi" / "openapi.yaml"

    with yaml_file.open("r", encoding="utf-8") as stream:
        document = yaml.safe_load(stream)

    # Create the "info" section if the file lacks one, then overwrite its version.
    info_section = document.setdefault("info", {})
    info_section["version"] = __version__
    return document
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@docs_bp.get("/openapi.json")
def openapi_json():
    """Serve the OpenAPI specification as JSON."""
    specification = _load_openapi_spec()
    return jsonify(specification)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@docs_bp.get("/docs")
def swagger_ui():
    """Serve the Swagger UI page for browsing the REST API."""
    page = render_template_string(SWAGGER_UI_HTML)
    return Response(page, mimetype="text/html")
|
docintel/routes/ops.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Operations endpoints (metrics)."""
|
|
2
|
+
|
|
3
|
+
from flask import Blueprint, jsonify
|
|
4
|
+
|
|
5
|
+
from docintel import __version__
|
|
6
|
+
from docintel.ops.metrics import metrics_store
|
|
7
|
+
|
|
8
|
+
# Blueprint for the operations endpoints; registered without a URL prefix.
ops_bp = Blueprint("ops", __name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@ops_bp.get("/metrics")
def metrics():
    """Expose the in-process request counters and latency aggregates as JSON."""
    snapshot = metrics_store.snapshot()
    body = {
        "status": "ok",
        "service": "document-intelligence-platform",
        "version": __version__,
    }
    # Snapshot fields are layered on top of the service identification.
    body.update(snapshot)
    return jsonify(body)
|
docintel/routes/pdf.py
ADDED
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
"""PDF annotation API routes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import uuid
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from flask import Blueprint, current_app, jsonify, request, send_file
|
|
9
|
+
from werkzeug.utils import secure_filename
|
|
10
|
+
|
|
11
|
+
from docintel.auth.limiter import limiter
|
|
12
|
+
from docintel.services.pdf import (
|
|
13
|
+
Action,
|
|
14
|
+
DEFAULT_PII_ENTITIES,
|
|
15
|
+
StructureMode,
|
|
16
|
+
annotate_pdf,
|
|
17
|
+
detect_sensitive_pdf,
|
|
18
|
+
list_supported_entities,
|
|
19
|
+
structure_pdf,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
# Blueprint for the PDF endpoints; its routes are mounted under /v1/pdf.
pdf_bp = Blueprint("pdf", __name__, url_prefix="/v1/pdf")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _upload_dir() -> Path:
    """Return the configured upload directory, creating it when missing.

    The location is read from the current app's ``UPLOAD_DIR`` config value.
    """
    upload_root = Path(current_app.config["UPLOAD_DIR"])
    upload_root.mkdir(exist_ok=True, parents=True)
    return upload_root
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _parse_pages(raw_pages: str | None) -> list[int] | None:
|
|
32
|
+
if not raw_pages or not raw_pages.strip():
|
|
33
|
+
return None
|
|
34
|
+
return [int(page.strip()) for page in raw_pages.split(",") if page.strip()]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@pdf_bp.post("/annotate")
@limiter.limit("60 per hour")
def annotate():
    """Search a PDF and apply highlight, redact, or other annotation actions.

    Form fields:
        file: The PDF upload (required).
        action: Annotation action; defaults to ``Action.HIGHLIGHT``.
        pattern: Search pattern; required unless the action is
            ``Action.REMOVE``.
        pages: Optional comma-separated page indexes.
        format: ``json`` (query string or form) returns metadata plus a
            ``download_url``; any other value streams the annotated PDF.

    Returns:
        The annotated PDF as an attachment with ``X-Docintel-*`` headers, a
        ``(response, 200)`` JSON payload in ``json`` mode, or a JSON error with
        status 400/403/404 when validation or processing fails.
    """
    upload = request.files.get("file")
    if upload is None or not upload.filename:
        return jsonify({"error": "Missing PDF file in form field 'file'."}), 400

    filename = secure_filename(upload.filename)
    if not filename.lower().endswith(".pdf"):
        # secure_filename() drops non-ASCII characters, so a name made up only
        # of such characters collapses to "pdf" and loses its extension. Only
        # reject when the client-supplied name is not a PDF either; otherwise
        # store the upload under a neutral name.
        if not upload.filename.strip().lower().endswith(".pdf"):
            return jsonify({"error": "Only PDF files are supported."}), 400
        filename = "document.pdf"

    action_raw = request.form.get("action", Action.HIGHLIGHT.value)
    try:
        action = Action.from_value(action_raw)
    except ValueError as exc:
        return jsonify({"error": str(exc)}), 400

    pattern = request.form.get("pattern", "")
    if action != Action.REMOVE and not pattern.strip():
        return jsonify({"error": "Missing search pattern in form field 'pattern'."}), 400

    try:
        pages = _parse_pages(request.form.get("pages"))
    except ValueError:
        return jsonify({"error": "Invalid pages value. Use comma-separated page indexes."}), 400

    # Each request gets its own working directory under the upload root.
    job_id = uuid.uuid4().hex[:12]
    work_dir = _upload_dir() / job_id
    work_dir.mkdir(parents=True, exist_ok=True)

    input_path = work_dir / filename
    output_path = work_dir / f"annotated_{filename}"
    upload.save(input_path)

    try:
        result = annotate_pdf(
            input_file=input_path,
            output_file=output_path,
            pattern=pattern,
            action=action,
            pages=pages,
        )
    except FileNotFoundError as exc:
        return jsonify({"error": str(exc)}), 404
    except PermissionError as exc:
        return jsonify({"error": str(exc)}), 403
    except ValueError as exc:
        return jsonify({"error": str(exc)}), 400

    # The query string takes precedence over the form field.
    response_format = request.args.get("format", request.form.get("format", "file")).lower()

    if response_format == "json":
        payload = {
            "status": "ok",
            **result.to_dict(),
            "download_url": f"/v1/pdf/files/{job_id}/{output_path.name}",
        }
        return jsonify(payload), 200

    response = send_file(
        output_path,
        mimetype="application/pdf",
        as_attachment=True,
        download_name=output_path.name,
    )
    response.headers["X-Docintel-Matches"] = str(result.matches)
    response.headers["X-Docintel-Pages-Processed"] = str(result.pages_processed)
    response.headers["X-Docintel-Action"] = result.action.value
    return response
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _parse_entities(raw_entities: str | None) -> list[str] | None:
|
|
110
|
+
if not raw_entities or not raw_entities.strip():
|
|
111
|
+
return None
|
|
112
|
+
return [item.strip() for item in raw_entities.split(",") if item.strip()]
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@pdf_bp.get("/entities")
@limiter.limit("120 per hour")
def supported_entities():
    """List the entity types available for sensitive detection.

    Returns:
        A JSON body with the default and the supported entity names, or
        ``(response, 503)`` carrying the exception message when the lookup
        raises.
    """
    try:
        available = list_supported_entities()
    except Exception as exc:
        failure = {"error": str(exc)}
        return jsonify(failure), 503

    body = {
        "status": "ok",
        "default_entities": list(DEFAULT_PII_ENTITIES),
        "supported_entities": available,
    }
    return jsonify(body)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
@pdf_bp.post("/detect-sensitive")
@limiter.limit("30 per hour")
def detect_sensitive():
    """
    Detect PII with Presidio and annotate the PDF.

    Auto-falls back to EasyOCR when native PDF text is empty (scanned documents).

    Form fields:
        file: The PDF upload (required).
        action: Annotation action; defaults to ``Action.HIGHLIGHT``.
            ``Action.REMOVE`` is rejected.
        entities: Optional comma-separated entity names.
        pattern: Optional extra search pattern.
        force_ocr: ``true`` to enable; defaults to ``false``.
        add_text_layer: ``true`` to enable; defaults to ``true``.
        min_score: Detection threshold, parsed as a float; defaults to 0.35.
        callback_url: Optional URL forwarded to the async job.
        async: ``true`` (query string or form) queues the job instead of
            running it inline.
        format: ``json`` returns metadata plus a ``download_url``; any other
            value streams the annotated PDF.

    Returns:
        The annotated PDF as an attachment with ``X-Docintel-*`` headers, a
        ``(response, 200)`` JSON payload in ``json`` mode, the enqueue helper's
        response in async mode, or a JSON error with status 400/403/404/503.
    """
    upload = request.files.get("file")
    if upload is None or not upload.filename:
        return jsonify({"error": "Missing PDF file in form field 'file'."}), 400

    filename = secure_filename(upload.filename)
    if not filename.lower().endswith(".pdf"):
        # secure_filename() drops non-ASCII characters, so a name made up only
        # of such characters collapses to "pdf" and loses its extension. Only
        # reject when the client-supplied name is not a PDF either; otherwise
        # store the upload under a neutral name.
        if not upload.filename.strip().lower().endswith(".pdf"):
            return jsonify({"error": "Only PDF files are supported."}), 400
        filename = "document.pdf"

    action_raw = request.form.get("action", Action.HIGHLIGHT.value)
    try:
        action = Action.from_value(action_raw)
    except ValueError as exc:
        return jsonify({"error": str(exc)}), 400

    if action == Action.REMOVE:
        return jsonify({"error": "Action 'Remove' is not supported for sensitive detection."}), 400

    entities = _parse_entities(request.form.get("entities"))
    # A blank pattern is normalised to None.
    pattern = request.form.get("pattern", "").strip() or None
    force_ocr = request.form.get("force_ocr", "false").lower() == "true"
    add_text_layer = request.form.get("add_text_layer", "true").lower() == "true"

    try:
        min_score = float(request.form.get("min_score", "0.35"))
    except ValueError:
        return jsonify({"error": "Field 'min_score' must be a number."}), 400

    callback_url = request.form.get("callback_url", "").strip() or None
    run_async = _parse_async_flag()

    # Each request gets its own working directory under the upload root.
    job_id = uuid.uuid4().hex[:12]
    work_dir = _upload_dir() / job_id
    work_dir.mkdir(parents=True, exist_ok=True)

    input_path = work_dir / filename
    output_path = work_dir / f"sensitive_{filename}"
    upload.save(input_path)

    if run_async:
        return _enqueue_detect_sensitive_job(
            job_id=job_id,
            input_path=input_path,
            output_path=output_path,
            action=action,
            entities=entities,
            pattern=pattern,
            force_ocr=force_ocr,
            add_text_layer=add_text_layer,
            min_score=min_score,
            callback_url=callback_url,
        )

    try:
        result = detect_sensitive_pdf(
            input_file=input_path,
            output_file=output_path,
            entities=entities,
            action=action,
            force_ocr=force_ocr,
            add_text_layer=add_text_layer,
            pattern=pattern,
            min_score=min_score,
        )
    except RuntimeError as exc:
        return jsonify({"error": str(exc)}), 503
    except FileNotFoundError as exc:
        return jsonify({"error": str(exc)}), 404
    except PermissionError as exc:
        return jsonify({"error": str(exc)}), 403
    except ValueError as exc:
        return jsonify({"error": str(exc)}), 400

    # The query string takes precedence over the form field.
    response_format = request.args.get("format", request.form.get("format", "file")).lower()

    if response_format == "json":
        payload = {
            "status": "ok",
            **result.to_dict(),
            "download_url": f"/v1/pdf/files/{job_id}/{output_path.name}",
        }
        return jsonify(payload), 200

    response = send_file(
        output_path,
        mimetype="application/pdf",
        as_attachment=True,
        download_name=output_path.name,
    )
    response.headers["X-Docintel-Matches"] = str(result.matches)
    response.headers["X-Docintel-Pages-Processed"] = str(result.pages_processed)
    response.headers["X-Docintel-OCR-Pages"] = ",".join(str(page) for page in result.ocr_pages)
    response.headers["X-Docintel-Action"] = result.action.value
    return response
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _parse_async_flag() -> bool:
    """Report whether the current request asked for async processing.

    Reads ``async`` from the query string first, then from the form, and
    treats only the case-insensitive value ``true`` as enabled.
    """
    form_value = request.form.get("async", "false")
    chosen = request.args.get("async", form_value)
    return str(chosen).lower() == "true"
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
@pdf_bp.post("/structure")
@limiter.limit("20 per hour")
def structure():
    """
    Structure an unstructured or scanned PDF with OCR and an LLM.

    Returns a curated typeset PDF (curate) or the original with a searchable
    invisible text layer (searchable).

    Use ``async=true`` to queue the job and poll ``GET /v1/jobs/<job_id>``.

    Form fields:
        file: The PDF upload (required).
        mode: Structuring mode; defaults to ``StructureMode.CURATE``.
        force_ocr: ``true`` to enable; defaults to ``false``.
        redact_before_llm: ``true`` to enable; defaults to ``false``.
        callback_url: Optional URL forwarded to the async job.
        format: ``json`` returns metadata plus a ``download_url``; any other
            value streams the resulting PDF.

    Returns:
        The structured PDF as an attachment with ``X-Docintel-*`` headers, a
        ``(response, 200)`` JSON payload in ``json`` mode, the enqueue helper's
        response in async mode, or a JSON error with status 400/403/404/503.
    """
    upload = request.files.get("file")
    if upload is None or not upload.filename:
        return jsonify({"error": "Missing PDF file in form field 'file'."}), 400

    filename = secure_filename(upload.filename)
    if not filename.lower().endswith(".pdf"):
        # secure_filename() drops non-ASCII characters, so a name made up only
        # of such characters collapses to "pdf" and loses its extension. Only
        # reject when the client-supplied name is not a PDF either; otherwise
        # store the upload under a neutral name.
        if not upload.filename.strip().lower().endswith(".pdf"):
            return jsonify({"error": "Only PDF files are supported."}), 400
        filename = "document.pdf"

    mode_raw = request.form.get("mode", StructureMode.CURATE.value)
    try:
        mode = StructureMode.from_value(mode_raw)
    except ValueError as exc:
        return jsonify({"error": str(exc)}), 400

    force_ocr = request.form.get("force_ocr", "false").lower() == "true"
    redact_before_llm = request.form.get("redact_before_llm", "false").lower() == "true"
    callback_url = request.form.get("callback_url", "").strip() or None
    run_async = _parse_async_flag()

    # Each request gets its own working directory under the upload root.
    job_id = uuid.uuid4().hex[:12]
    work_dir = _upload_dir() / job_id
    work_dir.mkdir(parents=True, exist_ok=True)

    input_path = work_dir / filename
    output_path = work_dir / f"structured_{filename}"
    upload.save(input_path)

    if run_async:
        return _enqueue_structure_job(
            job_id=job_id,
            input_path=input_path,
            output_path=output_path,
            mode=mode,
            force_ocr=force_ocr,
            redact_before_llm=redact_before_llm,
            callback_url=callback_url,
        )

    try:
        result = structure_pdf(
            input_file=input_path,
            output_file=output_path,
            mode=mode,
            force_ocr=force_ocr,
            redact_before_llm=redact_before_llm,
        )
    except RuntimeError as exc:
        return jsonify({"error": str(exc)}), 503
    except FileNotFoundError as exc:
        return jsonify({"error": str(exc)}), 404
    except PermissionError as exc:
        return jsonify({"error": str(exc)}), 403
    except ValueError as exc:
        return jsonify({"error": str(exc)}), 400

    # The query string takes precedence over the form field.
    response_format = request.args.get("format", request.form.get("format", "file")).lower()

    if response_format == "json":
        payload = {
            "status": "ok",
            **result.to_dict(),
            "download_url": f"/v1/pdf/files/{job_id}/{output_path.name}",
        }
        return jsonify(payload), 200

    response = send_file(
        output_path,
        mimetype="application/pdf",
        as_attachment=True,
        download_name=output_path.name,
    )
    response.headers["X-Docintel-Pages-Processed"] = str(result.pages_processed)
    response.headers["X-Docintel-OCR-Pages"] = ",".join(str(page) for page in result.ocr_pages)
    response.headers["X-Docintel-Mode"] = result.mode.value
    # The title is free text from the structuring result (presumably derived
    # from the document - confirm), so it may hold line breaks or characters
    # outside Latin-1. Header values must be single-line and Latin-1 encodable,
    # so flatten line breaks and replace anything that cannot be encoded
    # instead of failing the whole response.
    title = str(result.document_title).replace("\r", " ").replace("\n", " ")
    response.headers["X-Docintel-Document-Title"] = title.encode("latin-1", "replace").decode("latin-1")
    return response
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def _enqueue_structure_job(
    *,
    job_id: str,
    input_path: Path,
    output_path: Path,
    mode: StructureMode,
    force_ocr: bool,
    redact_before_llm: bool,
    callback_url: str | None,
):
    """Register an async structure job and hand it to the queue.

    Returns whatever ``enqueue_async_response`` produced. The job is queued
    only when the second element of that response (the status code) is 202.
    """
    from docintel.jobs.helpers import enqueue_async_response
    from docintel.jobs.models import JobType
    from docintel.jobs.queue import enqueue_structure_job

    accepted = enqueue_async_response(
        job_id=job_id,
        job_type=JobType.PDF_STRUCTURE,
        callback_url=callback_url,
    )
    status_code = accepted[1]
    if status_code == 202:
        # Paths and the enum are passed in their string forms.
        enqueue_structure_job(
            job_id=job_id,
            input_path=str(input_path),
            output_path=str(output_path),
            mode=mode.value,
            force_ocr=force_ocr,
            output_filename=output_path.name,
            redact_before_llm=redact_before_llm,
        )
    return accepted
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def _enqueue_detect_sensitive_job(
    *,
    job_id: str,
    input_path: Path,
    output_path: Path,
    action: Action,
    entities: list[str] | None,
    pattern: str | None,
    force_ocr: bool,
    add_text_layer: bool,
    min_score: float,
    callback_url: str | None,
):
    """Register an async sensitive-detection job and hand it to the queue.

    Returns whatever ``enqueue_async_response`` produced. The job is queued
    only when the second element of that response (the status code) is 202.
    """
    from docintel.jobs.helpers import enqueue_async_response
    from docintel.jobs.models import JobType
    from docintel.jobs.queue import enqueue_detect_sensitive_job

    accepted = enqueue_async_response(
        job_id=job_id,
        job_type=JobType.PDF_DETECT_SENSITIVE,
        callback_url=callback_url,
    )
    status_code = accepted[1]
    if status_code == 202:
        # Paths and the enum are passed in their string forms.
        enqueue_detect_sensitive_job(
            job_id=job_id,
            input_path=str(input_path),
            output_path=str(output_path),
            output_filename=output_path.name,
            action=action.value,
            force_ocr=force_ocr,
            add_text_layer=add_text_layer,
            min_score=min_score,
            entities=entities,
            pattern=pattern,
        )
    return accepted
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
@pdf_bp.get("/files/<job_id>/<filename>")
@limiter.limit("200 per hour")
def download_file(job_id: str, filename: str):
    """Download a previously generated PDF when using JSON response mode.

    Args:
        job_id: Job directory name taken from the URL.
        filename: File name inside that job directory.

    Returns:
        The file as a PDF attachment, or ``(response, 404)`` when either path
        segment sanitises to an empty string or no such file exists.
    """
    safe_job = secure_filename(job_id)
    safe_name = secure_filename(filename)

    # secure_filename() can return an empty string (e.g. for ".."). An empty
    # job segment would collapse the path and look the file up directly in the
    # upload root instead of inside a job directory, so refuse it up front.
    if not safe_job or not safe_name:
        return jsonify({"error": "Annotated PDF not found."}), 404

    file_path = _upload_dir() / safe_job / safe_name

    if not file_path.is_file():
        return jsonify({"error": "Annotated PDF not found."}), 404

    return send_file(
        file_path,
        mimetype="application/pdf",
        as_attachment=True,
        download_name=safe_name,
    )
|