docintel-platform 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docintel/__init__.py +6 -0
- docintel/app.py +45 -0
- docintel/auth/__init__.py +12 -0
- docintel/auth/api_keys.py +48 -0
- docintel/auth/limiter.py +41 -0
- docintel/auth/middleware.py +34 -0
- docintel/auth/oidc.py +45 -0
- docintel/cli.py +21 -0
- docintel/client.py +193 -0
- docintel/config.py +20 -0
- docintel/jobs/__init__.py +16 -0
- docintel/jobs/helpers.py +38 -0
- docintel/jobs/models.py +78 -0
- docintel/jobs/queue.py +75 -0
- docintel/jobs/store.py +82 -0
- docintel/jobs/tasks.py +173 -0
- docintel/jobs/webhooks.py +32 -0
- docintel/openapi/__init__.py +1 -0
- docintel/openapi/openapi.yaml +380 -0
- docintel/ops/__init__.py +1 -0
- docintel/ops/logging.py +40 -0
- docintel/ops/metrics.py +57 -0
- docintel/ops/middleware.py +40 -0
- docintel/routes/__init__.py +1 -0
- docintel/routes/jobs.py +26 -0
- docintel/routes/match.py +43 -0
- docintel/routes/openapi_docs.py +57 -0
- docintel/routes/ops.py +22 -0
- docintel/routes/pdf.py +420 -0
- docintel/routes/text.py +41 -0
- docintel/services/__init__.py +1 -0
- docintel/services/matching/__init__.py +6 -0
- docintel/services/matching/models.py +19 -0
- docintel/services/matching/scorer.py +64 -0
- docintel/services/pdf/__init__.py +26 -0
- docintel/services/pdf/annotator.py +188 -0
- docintel/services/pdf/models.py +104 -0
- docintel/services/pdf/ocr.py +130 -0
- docintel/services/pdf/pii.py +105 -0
- docintel/services/pdf/presets.py +26 -0
- docintel/services/pdf/search.py +29 -0
- docintel/services/pdf/sensitive.py +212 -0
- docintel/services/pdf/structure.py +118 -0
- docintel/services/pdf/structure_llm.py +136 -0
- docintel/services/pdf/structure_render.py +136 -0
- docintel/services/pdf/structure_schema.py +99 -0
- docintel/services/summary/__init__.py +6 -0
- docintel/services/summary/models.py +21 -0
- docintel/services/summary/textrank.py +57 -0
- docintel/ui.py +347 -0
- docintel/wsgi.py +5 -0
- docintel_platform-1.0.2.dist-info/METADATA +607 -0
- docintel_platform-1.0.2.dist-info/RECORD +56 -0
- docintel_platform-1.0.2.dist-info/WHEEL +5 -0
- docintel_platform-1.0.2.dist-info/entry_points.txt +3 -0
- docintel_platform-1.0.2.dist-info/top_level.txt +1 -0
docintel/__init__.py
ADDED
docintel/app.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Flask application factory."""
|
|
2
|
+
|
|
3
|
+
from flask import Flask, jsonify
|
|
4
|
+
|
|
5
|
+
from docintel import __version__
|
|
6
|
+
from docintel.config import Config
|
|
7
|
+
from docintel.auth.limiter import init_limiter
|
|
8
|
+
from docintel.auth.middleware import register_auth
|
|
9
|
+
from docintel.ops.logging import configure_logging
|
|
10
|
+
from docintel.ops.middleware import register_request_hooks
|
|
11
|
+
from docintel.routes.jobs import jobs_bp
|
|
12
|
+
from docintel.routes.openapi_docs import docs_bp
|
|
13
|
+
from docintel.routes.match import match_bp
|
|
14
|
+
from docintel.routes.ops import ops_bp
|
|
15
|
+
from docintel.routes.pdf import pdf_bp
|
|
16
|
+
from docintel.routes.text import text_bp
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def create_app(config: type[Config] = Config) -> Flask:
    """Build the Flask application and wire up every component.

    Args:
        config: Configuration class loaded into ``app.config``; its
            ``LOG_LEVEL`` attribute is also passed to ``configure_logging``.

    Returns:
        The configured ``Flask`` instance with request hooks, auth, the rate
        limiter, the ``/health`` route and all blueprints registered.
    """
    application = Flask(__name__)
    application.config.from_object(config)

    configure_logging(config.LOG_LEVEL)

    # Installation order is preserved: request hooks, then auth, then limiter.
    for install in (register_request_hooks, register_auth, init_limiter):
        install(application)

    @application.get("/health")
    def health():
        body = {
            "status": "ok",
            "service": "document-intelligence-platform",
            "version": __version__,
        }
        return jsonify(body)

    # Blueprints are registered in the same order as before.
    for blueprint in (docs_bp, pdf_bp, jobs_bp, match_bp, text_bp, ops_bp):
        application.register_blueprint(blueprint)

    return application
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""API authentication and rate limiting."""
|
|
2
|
+
|
|
3
|
+
from docintel.auth.api_keys import auth_required, extract_bearer_token, validate_credentials
|
|
4
|
+
from docintel.auth.limiter import init_limiter, limiter
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"auth_required",
|
|
8
|
+
"extract_bearer_token",
|
|
9
|
+
"init_limiter",
|
|
10
|
+
"limiter",
|
|
11
|
+
"validate_credentials",
|
|
12
|
+
]
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""API key authentication."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass(frozen=True)
class AuthContext:
    """Immutable description of how a request was authenticated."""

    # Label of the mechanism that accepted the credential.
    method: str
    # Identifier recorded for the authenticated caller.
    subject: str
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _configured_keys() -> set[str]:
|
|
16
|
+
raw = os.getenv("DOCINTEL_API_KEYS", "")
|
|
17
|
+
return {item.strip() for item in raw.split(",") if item.strip()}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def auth_required() -> bool:
    """Report whether API requests must present credentials.

    Returns ``True`` when ``DOCINTEL_AUTH_REQUIRED`` equals ``"true"``
    (case-insensitive) or, failing that, when at least one API key is
    configured.
    """
    forced = os.getenv("DOCINTEL_AUTH_REQUIRED", "false").lower() == "true"
    # ``or`` short-circuits, so the key list is only read when not forced.
    return forced or bool(_configured_keys())
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def extract_bearer_token() -> str | None:
    """Return the bearer credential from the current request, if any.

    Reads the ``Authorization`` header of the active Flask request. The
    scheme is matched case-insensitively; ``None`` is returned when the header
    is absent, uses another scheme, or carries an empty credential.
    """
    from flask import request

    authorization = request.headers.get("Authorization", "").strip()
    if authorization.lower().startswith("bearer "):
        # Skip the 7-character "bearer " prefix and trim the remainder.
        credential = authorization[7:].strip()
        if credential:
            return credential
    return None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def validate_api_key(token: str) -> AuthContext | None:
    """Check ``token`` against the configured API keys.

    Each configured key is compared with ``hmac.compare_digest`` so the time
    taken does not depend on how many leading characters of a guess are
    correct.

    Args:
        token: Credential presented by the caller.

    Returns:
        An ``AuthContext`` with ``method="api_key"`` and the first eight
        characters of the token as ``subject`` when the token matches a
        configured key, otherwise ``None``.
    """
    import hmac

    presented = token.encode("utf-8")
    matched = False
    for configured in _configured_keys():
        # Deliberately no early exit: every key is always compared.
        if hmac.compare_digest(presented, configured.encode("utf-8")):
            matched = True

    if not matched:
        return None
    # NOTE(review): the subject exposes the first 8 characters of the secret;
    # kept unchanged for compatibility with existing consumers.
    return AuthContext(method="api_key", subject=token[:8])
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def validate_credentials(token: str) -> AuthContext | None:
    """Authenticate ``token`` as an API key first, then as an OIDC token.

    Returns the first non-``None`` context produced, or whatever the OIDC
    validator returns when the API-key check does not match.
    """
    # Imported lazily, as in the original, rather than at module import time.
    from docintel.auth.oidc import validate_oidc_token

    context = validate_api_key(token)
    return context if context is not None else validate_oidc_token(token)
|
docintel/auth/limiter.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Redis-backed per-tenant rate limits."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
from flask_limiter import Limiter
|
|
8
|
+
from flask_limiter.util import get_remote_address
|
|
9
|
+
|
|
10
|
+
from docintel.auth.api_keys import extract_bearer_token
|
|
11
|
+
from docintel.jobs.store import redis_url
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _rate_limit_key() -> str:
    """Return the rate-limit bucket key for the current request.

    Requests carrying a bearer token are bucketed by a SHA-256 digest of the
    whole token; all other requests are bucketed by remote address.

    Hashing the full token (instead of using its first characters) keeps
    tokens that share a common prefix in separate buckets and avoids writing
    raw credential material to the limiter storage.
    """
    import hashlib

    token = extract_bearer_token()
    if token:
        digest = hashlib.sha256(token.encode("utf-8")).hexdigest()
        return f"key:{digest[:32]}"
    return get_remote_address()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def rate_limits_enabled() -> bool:
    """Return ``True`` unless ``DOCINTEL_RATE_LIMIT_ENABLED`` is set to something other than ``"true"``.

    The comparison is case-insensitive and the variable defaults to
    ``"true"`` when unset.
    """
    flag = os.getenv("DOCINTEL_RATE_LIMIT_ENABLED", "true")
    return flag.lower() == "true"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def storage_uri() -> str:
    """Choose the limiter storage URI.

    Returns the value of ``redis_url()`` when rate limits are enabled and the
    in-process ``"memory://"`` URI otherwise.
    """
    return redis_url() if rate_limits_enabled() else "memory://"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Shared limiter instance; it is bound to an application by ``init_limiter``.
#
# No ``storage_uri`` is passed here on purpose: Flask-Limiter gives constructor
# arguments priority over the matching Flask config value, so a hard-coded
# "memory://" here would override the ``RATELIMIT_STORAGE_URI`` that
# ``init_limiter`` sets and the Redis backend would never be used.
limiter = Limiter(
    key_func=_rate_limit_key,
    default_limits=[],
    strategy="fixed-window",
)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def init_limiter(app) -> None:
    """Attach the module-level ``limiter`` to ``app``.

    Stores the URI chosen by ``storage_uri()`` under
    ``RATELIMIT_STORAGE_URI`` in the app config before initialising.
    """
    app.config.update(RATELIMIT_STORAGE_URI=storage_uri())
    limiter.init_app(app)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Authentication middleware for protected API routes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from flask import Flask, g, jsonify, request
|
|
6
|
+
|
|
7
|
+
from docintel.auth.api_keys import auth_required, extract_bearer_token, validate_credentials
|
|
8
|
+
|
|
9
|
+
PUBLIC_PREFIXES = ("/health", "/docs", "/openapi.json", "/metrics")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def register_auth(app: Flask) -> None:
    """Install a ``before_request`` hook that enforces bearer authentication.

    The hook does nothing when authentication is not required, when the path
    matches one of ``PUBLIC_PREFIXES``, or when the path is outside ``/v1/``.
    Otherwise it answers 401 for a missing or rejected token and stores the
    resulting context on ``g.auth_context`` on success.
    """

    def _is_public(path: str) -> bool:
        # A prefix matches exactly or as a leading path segment.
        return any(
            path == prefix or path.startswith(prefix + "/")
            for prefix in PUBLIC_PREFIXES
        )

    @app.before_request
    def _enforce_api_auth():
        if not auth_required():
            return None

        path = request.path or ""
        if _is_public(path) or not path.startswith("/v1/"):
            return None

        token = extract_bearer_token()
        if not token:
            missing = {"error": "Missing Authorization: Bearer <api_key> header."}
            return jsonify(missing), 401

        context = validate_credentials(token)
        if context is None:
            return jsonify({"error": "Invalid API credentials."}), 401

        g.auth_context = context
        return None
|
docintel/auth/oidc.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Optional OIDC bearer token validation (Session 5 hook, no-op when unset)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
from docintel.auth.api_keys import AuthContext
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def oidc_enabled() -> bool:
    """Return ``True`` when ``DOCINTEL_OIDC_ISSUER`` holds a non-blank value."""
    issuer = os.getenv("DOCINTEL_OIDC_ISSUER", "")
    return len(issuer.strip()) > 0
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def validate_oidc_token(token: str) -> AuthContext | None:
    """Validate ``token`` as an OIDC-issued JWT.

    Configuration is read from ``DOCINTEL_OIDC_ISSUER``,
    ``DOCINTEL_OIDC_AUDIENCE`` and ``DOCINTEL_OIDC_JWKS_URL``. When no JWKS
    URL is set it is derived from the issuer as
    ``<issuer>/.well-known/jwks.json``.

    Args:
        token: Bearer credential presented by the caller.

    Returns:
        An ``AuthContext`` with ``method="oidc"`` when the token verifies, or
        ``None`` when OIDC is disabled, the token is not shaped like a JWT, or
        verification fails for any PyJWT-reported reason (bad signature,
        expiry, wrong issuer/audience, signing key lookup failure).

    Raises:
        RuntimeError: If OIDC is enabled but PyJWT is not installed.
    """
    if not oidc_enabled():
        return None
    # A compact JWT has exactly three dot-separated segments.
    if token.count(".") != 2:
        return None

    issuer = os.getenv("DOCINTEL_OIDC_ISSUER", "").strip()
    audience = os.getenv("DOCINTEL_OIDC_AUDIENCE", "").strip() or None
    jwks_url = os.getenv("DOCINTEL_OIDC_JWKS_URL", "").strip()
    if not jwks_url and issuer:
        jwks_url = issuer.rstrip("/") + "/.well-known/jwks.json"

    try:
        import jwt
        from jwt import PyJWKClient
    except ImportError as exc:
        raise RuntimeError(
            "OIDC auth requires PyJWT. Run: pip install -e '.[auth]'"
        ) from exc

    try:
        signing_key = PyJWKClient(jwks_url).get_signing_key_from_jwt(token)
        claims = jwt.decode(
            token,
            signing_key.key,
            algorithms=["RS256", "ES256"],
            audience=audience,
            issuer=issuer,
            options={"verify_aud": audience is not None},
        )
    except jwt.PyJWTError as exc:
        # An unverifiable token is an authentication failure, not a server
        # error: report "no match" so the caller can answer 401.
        import logging

        logging.getLogger(__name__).warning(
            "OIDC token rejected: %s", type(exc).__name__
        )
        return None

    subject = str(claims.get("sub") or claims.get("email") or "oidc-user")
    return AuthContext(method="oidc", subject=subject)
|
docintel/cli.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""CLI entry point."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
|
|
5
|
+
from docintel.app import create_app
|
|
6
|
+
from docintel.config import Config
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def main() -> None:
    """Parse command-line options and run the development server.

    ``--host``, ``--port`` and ``--debug`` all default to the corresponding
    ``Config`` values, so ``DOCINTEL_DEBUG=true`` is honoured when the flag is
    not given instead of being overridden with ``False``.
    """
    parser = argparse.ArgumentParser(description="Run the document intelligence API.")
    parser.add_argument("--host", default=Config.HOST)
    parser.add_argument("--port", type=int, default=Config.PORT)
    # The value is passed explicitly to ``app.run``, so its default must mirror
    # the configured value rather than a hard-coded False.
    parser.add_argument("--debug", action="store_true", default=Config.DEBUG)
    args = parser.parse_args()

    app = create_app()
    app.run(host=args.host, port=args.port, debug=args.debug)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
if __name__ == "__main__":
|
|
21
|
+
main()
|
docintel/client.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""Python client for the Document Intelligence Platform REST API."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import requests
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DocintelError(Exception):
    """Error raised by the client for failed API calls, failed jobs and poll timeouts."""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class DocintelClient:
    """HTTP client for ``/v1/*`` document intelligence endpoints.

    Wraps a ``requests.Session``. When an API key is supplied it is sent as an
    ``Authorization: Bearer`` header on every request made by the session.
    """

    def __init__(
        self,
        base_url: str = "http://127.0.0.1:5000",
        api_key: str | None = None,
        timeout: int = 120,
    ) -> None:
        """Create a client.

        Args:
            base_url: Server root; a trailing slash is removed.
            api_key: Optional bearer credential.
            timeout: Per-request timeout passed to ``requests``.
        """
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self._session = requests.Session()
        if api_key:
            self._session.headers["Authorization"] = f"Bearer {api_key}"

    def _url(self, path: str) -> str:
        """Join ``path`` onto the base URL, adding a leading slash if needed."""
        if not path.startswith("/"):
            path = f"/{path}"
        return f"{self.base_url}{path}"

    def _raise_for_status(self, response: requests.Response) -> None:
        """Raise ``DocintelError`` for a non-OK response.

        The message is the ``error`` field of a JSON object body when present,
        otherwise the raw body text, otherwise ``HTTP <status>``.
        """
        if response.ok:
            return
        try:
            payload = response.json()
        except ValueError:
            # Body is not JSON (JSON decode errors subclass ValueError).
            payload = None
        if isinstance(payload, dict):
            message = payload.get("error", response.text)
        else:
            message = response.text or f"HTTP {response.status_code}"
        raise DocintelError(message)

    def _resolve_upload_response(
        self, response: requests.Response, *, poll: bool
    ) -> dict[str, Any] | bytes:
        """Turn the response of an upload endpoint into the caller's result.

        A 202 response is treated as a queued job: its JSON is returned as-is
        when ``poll`` is false; otherwise the job is polled and, if the
        finished job has a ``download_url``, the downloaded bytes are
        returned, else the final job payload. Any other response is checked
        for errors and returned as bytes (PDF content type) or parsed JSON.
        """
        if response.status_code == 202:
            payload = response.json()
            if not poll:
                return payload
            payload = self.poll_job(payload["job_id"])
            if payload.get("download_url"):
                return self.download(payload["download_url"])
            return payload
        self._raise_for_status(response)
        if "application/pdf" in response.headers.get("Content-Type", ""):
            return response.content
        return response.json()

    def health(self) -> dict[str, Any]:
        """Call ``GET /health`` and return the parsed JSON body."""
        response = self._session.get(self._url("/health"), timeout=self.timeout)
        self._raise_for_status(response)
        return response.json()

    def get_job(self, job_id: str) -> dict[str, Any]:
        """Fetch the current record for ``job_id`` from ``/v1/jobs/<job_id>``."""
        response = self._session.get(self._url(f"/v1/jobs/{job_id}"), timeout=self.timeout)
        self._raise_for_status(response)
        return response.json()

    def poll_job(
        self,
        job_id: str,
        *,
        interval_seconds: float = 2.0,
        timeout_seconds: float = 600.0,
    ) -> dict[str, Any]:
        """Poll a job until it completes, fails, or the timeout elapses.

        The job is always queried at least once, and the deadline is measured
        on the monotonic clock so system clock adjustments cannot shorten or
        extend the wait.

        Args:
            job_id: Identifier of the job to poll.
            interval_seconds: Pause between polls.
            timeout_seconds: Maximum total time to wait.

        Returns:
            The job payload once ``job_status`` is ``"completed"``.

        Raises:
            DocintelError: If the job reports ``"failed"`` or the timeout
                elapses first.
        """
        deadline = time.monotonic() + timeout_seconds
        while True:
            payload = self.get_job(job_id)
            status = payload.get("job_status")
            if status == "completed":
                return payload
            if status == "failed":
                raise DocintelError(payload.get("error", "Job failed"))
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # Never sleep past the deadline.
            time.sleep(min(interval_seconds, remaining))
        raise DocintelError(f"Job {job_id} timed out after {timeout_seconds}s")

    def download(self, download_url: str) -> bytes:
        """Download ``download_url`` (joined onto the base URL) and return the raw bytes."""
        response = self._session.get(self._url(download_url), timeout=self.timeout)
        self._raise_for_status(response)
        return response.content

    def structure_pdf(
        self,
        pdf_path: str | Path,
        *,
        mode: str = "curate",
        force_ocr: bool = False,
        redact_before_llm: bool = False,
        async_job: bool = False,
        callback_url: str | None = None,
        poll: bool = True,
    ) -> dict[str, Any] | bytes:
        """Upload a PDF to ``/v1/pdf/structure``.

        Args:
            pdf_path: Local path of the PDF to upload.
            mode: Sent as the ``mode`` form field.
            force_ocr: Sent as ``"true"``/``"false"`` in ``force_ocr``.
            redact_before_llm: Sent as ``"true"``/``"false"`` in
                ``redact_before_llm``.
            async_job: When true, adds ``async=true`` to the query string.
            callback_url: Optional ``callback_url`` form field.
            poll: For a 202 response, whether to wait for the job.

        Returns:
            PDF bytes, or a JSON payload (queued/finished job record or direct
            JSON response). A finished job without a ``download_url`` yields
            its payload rather than raising ``KeyError``.

        Raises:
            DocintelError: On an error response, failed job or poll timeout.
        """
        path = Path(pdf_path)
        params = {"async": "true"} if async_job else {}
        data = {
            "mode": mode,
            "force_ocr": str(force_ocr).lower(),
            "redact_before_llm": str(redact_before_llm).lower(),
        }
        if callback_url:
            data["callback_url"] = callback_url
        with path.open("rb") as handle:
            response = self._session.post(
                self._url("/v1/pdf/structure"),
                params=params,
                files={"file": (path.name, handle, "application/pdf")},
                data=data,
                timeout=self.timeout,
            )
        return self._resolve_upload_response(response, poll=poll)

    def detect_sensitive(
        self,
        pdf_path: str | Path,
        *,
        action: str = "Highlight",
        entities: str | None = None,
        force_ocr: bool = False,
        add_text_layer: bool = True,
        async_job: bool = False,
        callback_url: str | None = None,
        response_format: str = "json",
        poll: bool = True,
    ) -> dict[str, Any] | bytes:
        """Upload a PDF to ``/v1/pdf/detect-sensitive``.

        Args:
            pdf_path: Local path of the PDF to upload.
            action: Sent as the ``action`` form field.
            entities: Optional ``entities`` form field.
            force_ocr: Sent as ``"true"``/``"false"`` in ``force_ocr``.
            add_text_layer: Sent as ``"true"``/``"false"`` in
                ``add_text_layer``.
            async_job: When true, adds ``async=true`` to the query string.
            callback_url: Optional ``callback_url`` form field.
            response_format: ``"json"`` adds ``format=json`` to the query
                string; any other value adds nothing.
            poll: For a 202 response, whether to wait for the job.

        Returns:
            PDF bytes, or a JSON payload (queued/finished job record or direct
            JSON response).

        Raises:
            DocintelError: On an error response, failed job or poll timeout.
        """
        path = Path(pdf_path)
        params: dict[str, str] = {}
        if async_job:
            params["async"] = "true"
        if response_format == "json":
            params["format"] = "json"
        data: dict[str, str] = {
            "action": action,
            "force_ocr": str(force_ocr).lower(),
            "add_text_layer": str(add_text_layer).lower(),
        }
        if entities:
            data["entities"] = entities
        if callback_url:
            data["callback_url"] = callback_url
        with path.open("rb") as handle:
            response = self._session.post(
                self._url("/v1/pdf/detect-sensitive"),
                params=params,
                files={"file": (path.name, handle, "application/pdf")},
                data=data,
                timeout=self.timeout,
            )
        return self._resolve_upload_response(response, poll=poll)

    def match_resume(
        self,
        resume: str,
        job_description: str,
        *,
        top_keywords: int = 25,
    ) -> dict[str, Any]:
        """POST a resume and job description to ``/v1/match/resume`` and return the JSON result."""
        response = self._session.post(
            self._url("/v1/match/resume"),
            json={
                "resume": resume,
                "job_description": job_description,
                "top_keywords": top_keywords,
            },
            timeout=self.timeout,
        )
        self._raise_for_status(response)
        return response.json()

    def summarize(self, text: str, *, sentences: int = 3) -> dict[str, Any]:
        """POST ``text`` to ``/v1/text/summarize`` and return the JSON result."""
        response = self._session.post(
            self._url("/v1/text/summarize"),
            json={"text": text, "sentences": sentences},
            timeout=self.timeout,
        )
        self._raise_for_status(response)
        return response.json()
|
docintel/config.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Application configuration."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _env_is_true(name: str, default: str) -> bool:
    """Return whether environment variable ``name`` equals ``"true"`` (case-insensitive)."""
    return os.getenv(name, default).lower() == "true"


class Config:
    """Settings read from ``DOCINTEL_*`` environment variables at import time."""

    # Server binding and debug switch.
    HOST = os.getenv("DOCINTEL_HOST", "127.0.0.1")
    PORT = int(os.getenv("DOCINTEL_PORT", "5000"))
    DEBUG = _env_is_true("DOCINTEL_DEBUG", "false")

    # Storage and logging.
    UPLOAD_DIR = os.getenv("DOCINTEL_UPLOAD_DIR", "uploads")
    LOG_LEVEL = os.getenv("DOCINTEL_LOG_LEVEL", "INFO")

    # Redis and job queue.
    REDIS_URL = os.getenv("DOCINTEL_REDIS_URL", "redis://localhost:6379/0")
    JOBS_ENABLED = _env_is_true("DOCINTEL_JOBS_ENABLED", "true")
    QUEUE_NAME = os.getenv("DOCINTEL_QUEUE_NAME", "docintel")

    # Authentication and rate limiting.
    API_KEYS = os.getenv("DOCINTEL_API_KEYS", "")
    AUTH_REQUIRED = _env_is_true("DOCINTEL_AUTH_REQUIRED", "false")
    RATE_LIMIT_ENABLED = _env_is_true("DOCINTEL_RATE_LIMIT_ENABLED", "true")

    # OIDC settings.
    OIDC_ISSUER = os.getenv("DOCINTEL_OIDC_ISSUER", "")
    OIDC_AUDIENCE = os.getenv("DOCINTEL_OIDC_AUDIENCE", "")
    OIDC_JWKS_URL = os.getenv("DOCINTEL_OIDC_JWKS_URL", "")
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Async job queue for long-running document tasks."""
|
|
2
|
+
|
|
3
|
+
from docintel.jobs.models import JobRecord, JobStatus, JobType
|
|
4
|
+
from docintel.jobs.store import get_job, jobs_enabled, ping_redis, save_job
|
|
5
|
+
from docintel.jobs.tasks import create_queued_job
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"JobRecord",
|
|
9
|
+
"JobStatus",
|
|
10
|
+
"JobType",
|
|
11
|
+
"create_queued_job",
|
|
12
|
+
"get_job",
|
|
13
|
+
"jobs_enabled",
|
|
14
|
+
"ping_redis",
|
|
15
|
+
"save_job",
|
|
16
|
+
]
|
docintel/jobs/helpers.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Shared helpers for async job enqueue from HTTP routes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from flask import jsonify
|
|
6
|
+
|
|
7
|
+
from docintel.jobs.models import JobType
|
|
8
|
+
from docintel.jobs.store import jobs_enabled, ping_redis
|
|
9
|
+
from docintel.jobs.tasks import create_queued_job
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def enqueue_async_response(
    *,
    job_id: str,
    job_type: JobType,
    callback_url: str | None,
):
    """Record a queued job and build the standard 202 response for it.

    Returns a ``(response, status)`` pair: 503 when async jobs are disabled or
    Redis does not answer the ping, otherwise 202 with the job identifiers and
    the URL to poll.
    """
    if not jobs_enabled():
        disabled = {"error": "Async jobs are disabled on this server."}
        return jsonify(disabled), 503

    if not ping_redis():
        unreachable = {
            "error": "Redis is not reachable. Start Redis or set DOCINTEL_REDIS_URL.",
            "hint": "Use async=false for synchronous processing without a queue.",
        }
        return jsonify(unreachable), 503

    create_queued_job(job_id, job_type=job_type, callback_url=callback_url)

    accepted = {
        "status": "ok",
        "job_id": job_id,
        "job_type": job_type.value,
        "job_status": "queued",
        "poll_url": f"/v1/jobs/{job_id}",
        "message": "Job queued. Poll poll_url until job_status is completed.",
    }
    return jsonify(accepted), 202
|
docintel/jobs/models.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Job status types for async document processing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from enum import Enum
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class JobStatus(str, Enum):
    """Lifecycle states of an async job; members compare equal to their string values."""

    QUEUED = "queued"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"

    @classmethod
    def from_value(cls, value: str) -> "JobStatus":
        """Look up a status by name, ignoring case and surrounding whitespace.

        Raises:
            ValueError: If ``value`` does not correspond to any member.
        """
        wanted = value.strip().lower()
        found = next((member for member in cls if member.value == wanted), None)
        if found is None:
            raise ValueError(f"Unknown job status: {value}")
        return found
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class JobType(str, Enum):
    """Kinds of async document job; members compare equal to their string values."""

    PDF_STRUCTURE = "pdf_structure"
    PDF_DETECT_SENSITIVE = "pdf_detect_sensitive"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class JobRecord:
    """Serializable state of one async job."""

    job_id: str
    job_type: JobType
    status: JobStatus
    progress: int = 0
    progress_message: str = ""
    pages_done: int = 0
    pages_total: int = 0
    callback_url: str | None = None
    download_url: str | None = None
    error: str | None = None
    result: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialize the record to a plain dict.

        The status is written under ``job_status``. ``callback_url``,
        ``download_url``, ``error`` and ``result`` are included only when
        truthy.
        """
        payload: dict[str, Any] = dict(
            job_id=self.job_id,
            job_type=self.job_type.value,
            job_status=self.status.value,
            progress=self.progress,
            progress_message=self.progress_message,
            pages_done=self.pages_done,
            pages_total=self.pages_total,
        )
        # Optional fields, emitted in a fixed order and only when set.
        for name in ("callback_url", "download_url", "error", "result"):
            value = getattr(self, name)
            if value:
                payload[name] = value
        return payload

    @classmethod
    def from_dict(cls, payload: dict[str, Any]) -> "JobRecord":
        """Rebuild a record from a dict such as the one ``to_dict`` produces.

        The status is read from ``job_status``, falling back to ``status``.
        Numeric and message fields default to ``0`` / ``""`` when absent.
        """
        lookup = payload.get
        return cls(
            job_id=str(payload["job_id"]),
            job_type=JobType(payload["job_type"]),
            status=JobStatus(lookup("job_status", lookup("status"))),
            progress=int(lookup("progress", 0)),
            progress_message=str(lookup("progress_message", "")),
            pages_done=int(lookup("pages_done", 0)),
            pages_total=int(lookup("pages_total", 0)),
            callback_url=lookup("callback_url"),
            download_url=lookup("download_url"),
            error=lookup("error"),
            result=dict(lookup("result") or {}),
        )
|
docintel/jobs/queue.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""RQ queue helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
from docintel.jobs.store import redis_url
|
|
8
|
+
|
|
9
|
+
QUEUE_NAME = os.getenv("DOCINTEL_QUEUE_NAME", "docintel")
|
|
10
|
+
DEFAULT_RESULT_TTL = 60 * 60 * 24
|
|
11
|
+
DEFAULT_FAILURE_TTL = 60 * 60 * 24
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_queue():
    """Return an RQ ``Queue`` named ``QUEUE_NAME`` on a fresh Redis connection.

    ``redis`` and ``rq`` are imported lazily inside the function.
    """
    from redis import Redis
    from rq import Queue

    return Queue(QUEUE_NAME, connection=Redis.from_url(redis_url()))
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def enqueue_structure_job(
    job_id: str,
    input_path: str,
    output_path: str,
    mode: str,
    force_ocr: bool,
    output_filename: str,
    redact_before_llm: bool = False,
) -> None:
    """Enqueue ``docintel.jobs.tasks.run_structure_pdf_job`` on the RQ queue.

    The task is referenced by dotted path and scheduled with a 1800-second
    job timeout and the module's default result and failure TTLs.
    """
    # NOTE(review): RQ's ``Queue.enqueue`` appears to treat ``job_id`` as one
    # of its own options (like job_timeout/result_ttl/failure_ttl) rather than
    # forwarding it to the task - confirm the task still receives the id.
    task_arguments = {
        "job_id": job_id,
        "input_path": input_path,
        "output_path": output_path,
        "mode": mode,
        "force_ocr": force_ocr,
        "output_filename": output_filename,
        "redact_before_llm": redact_before_llm,
    }
    get_queue().enqueue(
        "docintel.jobs.tasks.run_structure_pdf_job",
        **task_arguments,
        job_timeout=1800,
        result_ttl=DEFAULT_RESULT_TTL,
        failure_ttl=DEFAULT_FAILURE_TTL,
    )
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def enqueue_detect_sensitive_job(
    job_id: str,
    input_path: str,
    output_path: str,
    output_filename: str,
    action: str,
    force_ocr: bool,
    add_text_layer: bool,
    min_score: float,
    entities: list[str] | None = None,
    pattern: str | None = None,
) -> None:
    """Enqueue ``docintel.jobs.tasks.run_detect_sensitive_pdf_job`` on the RQ queue.

    The task is referenced by dotted path and scheduled with a 1800-second
    job timeout and the module's default result and failure TTLs.
    """
    # NOTE(review): RQ's ``Queue.enqueue`` appears to treat ``job_id`` as one
    # of its own options (like job_timeout/result_ttl/failure_ttl) rather than
    # forwarding it to the task - confirm the task still receives the id.
    task_arguments = {
        "job_id": job_id,
        "input_path": input_path,
        "output_path": output_path,
        "output_filename": output_filename,
        "action": action,
        "force_ocr": force_ocr,
        "add_text_layer": add_text_layer,
        "min_score": min_score,
        "entities": entities,
        "pattern": pattern,
    }
    get_queue().enqueue(
        "docintel.jobs.tasks.run_detect_sensitive_pdf_job",
        **task_arguments,
        job_timeout=1800,
        result_ttl=DEFAULT_RESULT_TTL,
        failure_ttl=DEFAULT_FAILURE_TTL,
    )
|