docintel-platform 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. docintel/__init__.py +6 -0
  2. docintel/app.py +45 -0
  3. docintel/auth/__init__.py +12 -0
  4. docintel/auth/api_keys.py +48 -0
  5. docintel/auth/limiter.py +41 -0
  6. docintel/auth/middleware.py +34 -0
  7. docintel/auth/oidc.py +45 -0
  8. docintel/cli.py +21 -0
  9. docintel/client.py +193 -0
  10. docintel/config.py +20 -0
  11. docintel/jobs/__init__.py +16 -0
  12. docintel/jobs/helpers.py +38 -0
  13. docintel/jobs/models.py +78 -0
  14. docintel/jobs/queue.py +75 -0
  15. docintel/jobs/store.py +82 -0
  16. docintel/jobs/tasks.py +173 -0
  17. docintel/jobs/webhooks.py +32 -0
  18. docintel/openapi/__init__.py +1 -0
  19. docintel/openapi/openapi.yaml +380 -0
  20. docintel/ops/__init__.py +1 -0
  21. docintel/ops/logging.py +40 -0
  22. docintel/ops/metrics.py +57 -0
  23. docintel/ops/middleware.py +40 -0
  24. docintel/routes/__init__.py +1 -0
  25. docintel/routes/jobs.py +26 -0
  26. docintel/routes/match.py +43 -0
  27. docintel/routes/openapi_docs.py +57 -0
  28. docintel/routes/ops.py +22 -0
  29. docintel/routes/pdf.py +420 -0
  30. docintel/routes/text.py +41 -0
  31. docintel/services/__init__.py +1 -0
  32. docintel/services/matching/__init__.py +6 -0
  33. docintel/services/matching/models.py +19 -0
  34. docintel/services/matching/scorer.py +64 -0
  35. docintel/services/pdf/__init__.py +26 -0
  36. docintel/services/pdf/annotator.py +188 -0
  37. docintel/services/pdf/models.py +104 -0
  38. docintel/services/pdf/ocr.py +130 -0
  39. docintel/services/pdf/pii.py +105 -0
  40. docintel/services/pdf/presets.py +26 -0
  41. docintel/services/pdf/search.py +29 -0
  42. docintel/services/pdf/sensitive.py +212 -0
  43. docintel/services/pdf/structure.py +118 -0
  44. docintel/services/pdf/structure_llm.py +136 -0
  45. docintel/services/pdf/structure_render.py +136 -0
  46. docintel/services/pdf/structure_schema.py +99 -0
  47. docintel/services/summary/__init__.py +6 -0
  48. docintel/services/summary/models.py +21 -0
  49. docintel/services/summary/textrank.py +57 -0
  50. docintel/ui.py +347 -0
  51. docintel/wsgi.py +5 -0
  52. docintel_platform-1.0.2.dist-info/METADATA +607 -0
  53. docintel_platform-1.0.2.dist-info/RECORD +56 -0
  54. docintel_platform-1.0.2.dist-info/WHEEL +5 -0
  55. docintel_platform-1.0.2.dist-info/entry_points.txt +3 -0
  56. docintel_platform-1.0.2.dist-info/top_level.txt +1 -0
docintel/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Document intelligence platform."""
2
+
3
+ from docintel.client import DocintelClient, DocintelError
4
+
5
# Must match the released distribution version (docintel_platform-1.0.2);
# the /health endpoint reports this value to callers.
__version__ = "1.0.2"
6
+ __all__ = ["DocintelClient", "DocintelError", "__version__"]
docintel/app.py ADDED
@@ -0,0 +1,45 @@
1
+ """Flask application factory."""
2
+
3
+ from flask import Flask, jsonify
4
+
5
+ from docintel import __version__
6
+ from docintel.config import Config
7
+ from docintel.auth.limiter import init_limiter
8
+ from docintel.auth.middleware import register_auth
9
+ from docintel.ops.logging import configure_logging
10
+ from docintel.ops.middleware import register_request_hooks
11
+ from docintel.routes.jobs import jobs_bp
12
+ from docintel.routes.openapi_docs import docs_bp
13
+ from docintel.routes.match import match_bp
14
+ from docintel.routes.ops import ops_bp
15
+ from docintel.routes.pdf import pdf_bp
16
+ from docintel.routes.text import text_bp
17
+
18
+
19
def create_app(config: type[Config] = Config) -> Flask:
    """Build the Flask application.

    Args:
        config: Configuration class; its attributes are loaded into
            ``app.config`` and its ``LOG_LEVEL`` configures logging.

    Returns:
        The Flask app with request hooks, auth, rate limiting, the
        ``/health`` route and every API blueprint registered.
    """
    application = Flask(__name__)
    application.config.from_object(config)

    configure_logging(config.LOG_LEVEL)
    # Installation order is kept exactly as before: hooks, auth, limiter.
    for install in (register_request_hooks, register_auth, init_limiter):
        install(application)

    @application.get("/health")
    def health():
        body = {
            "status": "ok",
            "service": "document-intelligence-platform",
            "version": __version__,
        }
        return jsonify(body)

    for blueprint in (docs_bp, pdf_bp, jobs_bp, match_bp, text_bp, ops_bp):
        application.register_blueprint(blueprint)

    return application
@@ -0,0 +1,12 @@
1
+ """API authentication and rate limiting."""
2
+
3
+ from docintel.auth.api_keys import auth_required, extract_bearer_token, validate_credentials
4
+ from docintel.auth.limiter import init_limiter, limiter
5
+
6
+ __all__ = [
7
+ "auth_required",
8
+ "extract_bearer_token",
9
+ "init_limiter",
10
+ "limiter",
11
+ "validate_credentials",
12
+ ]
@@ -0,0 +1,48 @@
1
+ """API key authentication."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from dataclasses import dataclass
7
+
8
+
9
@dataclass(frozen=True)
class AuthContext:
    """Immutable description of how a request was authenticated."""

    # Credential kind; this package produces "api_key" and "oidc".
    method: str
    # Identifier of the authenticated caller.
    subject: str
13
+
14
+
15
def _configured_keys() -> set[str]:
    """Return the API keys listed in ``DOCINTEL_API_KEYS``.

    The variable is read on every call and split on commas; surrounding
    whitespace is stripped and empty entries are dropped.
    """
    keys: set[str] = set()
    for candidate in os.getenv("DOCINTEL_API_KEYS", "").split(","):
        cleaned = candidate.strip()
        if cleaned:
            keys.add(cleaned)
    return keys
18
+
19
+
20
def auth_required() -> bool:
    """Tell whether API requests must be authenticated.

    True when ``DOCINTEL_AUTH_REQUIRED`` equals "true" (case-insensitive),
    or when at least one API key is configured.
    """
    forced = os.getenv("DOCINTEL_AUTH_REQUIRED", "false").lower() == "true"
    # ``or`` short-circuits, so keys are only read when not forced.
    return forced or bool(_configured_keys())
24
+
25
+
26
def extract_bearer_token() -> str | None:
    """Return the bearer token of the current Flask request, if any.

    Returns:
        The text following a case-insensitive ``Bearer `` scheme in the
        ``Authorization`` header, stripped of whitespace; ``None`` when
        the header is missing, uses another scheme, or carries no token.
    """
    from flask import request

    scheme = "bearer "
    raw = request.headers.get("Authorization", "").strip()
    if raw.lower().startswith(scheme):
        credential = raw[len(scheme):].strip()
        if credential:
            return credential
    return None
34
+
35
+
36
def validate_api_key(token: str) -> AuthContext | None:
    """Check *token* against the configured API keys.

    Args:
        token: Bearer token presented by the caller.

    Returns:
        An ``AuthContext`` with method ``"api_key"`` and the first eight
        characters of the token as subject, or ``None`` when the token
        matches no configured key.
    """
    import hmac

    presented = token.encode("utf-8")
    matched = False
    for key in _configured_keys():
        # Constant-time comparison, and no early exit, so response timing
        # does not reveal how much of a key was guessed correctly.
        # Bytes are used because compare_digest rejects non-ASCII str.
        if hmac.compare_digest(presented, key.encode("utf-8")):
            matched = True
    if not matched:
        return None
    return AuthContext(method="api_key", subject=token[:8])
40
+
41
+
42
def validate_credentials(token: str) -> AuthContext | None:
    """Authenticate *token* as an API key first, then as an OIDC token.

    Returns:
        The first successful ``AuthContext``, or ``None`` when both
        validators reject the token.
    """
    # Imported lazily; docintel.auth.oidc itself imports from this module,
    # presumably to avoid a circular import at load time.
    from docintel.auth.oidc import validate_oidc_token

    context = validate_api_key(token)
    if context is None:
        context = validate_oidc_token(token)
    return context
@@ -0,0 +1,41 @@
1
+ """Redis-backed per-tenant rate limits."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+
7
+ from flask_limiter import Limiter
8
+ from flask_limiter.util import get_remote_address
9
+
10
+ from docintel.auth.api_keys import extract_bearer_token
11
+ from docintel.jobs.store import redis_url
12
+
13
+
14
def _rate_limit_key() -> str:
    """Return the bucket key used to rate-limit the current request.

    Requests carrying a bearer token are keyed by ``key:`` plus the first
    twelve characters of that token; all others by remote address.
    """
    bearer = extract_bearer_token()
    return f"key:{bearer[:12]}" if bearer else get_remote_address()
19
+
20
+
21
def rate_limits_enabled() -> bool:
    """Report whether ``DOCINTEL_RATE_LIMIT_ENABLED`` is "true" (default)."""
    flag = os.getenv("DOCINTEL_RATE_LIMIT_ENABLED", "true")
    return flag.lower() == "true"
23
+
24
+
25
def storage_uri() -> str:
    """Pick the limiter storage URI.

    Returns the Redis URL when rate limits are enabled, otherwise the
    in-process ``memory://`` backend.
    """
    return redis_url() if rate_limits_enabled() else "memory://"
29
+
30
+
31
# Shared limiter instance; it is bound to the app in init_limiter().
#
# No ``storage_uri`` is passed here on purpose: Flask-Limiter gives
# constructor arguments priority over app config, so a hard-coded
# "memory://" would override the RATELIMIT_STORAGE_URI value that
# init_limiter() sets and the Redis backend would never be used.
limiter = Limiter(
    key_func=_rate_limit_key,
    default_limits=[],
    strategy="fixed-window",
)
37
+
38
+
39
def init_limiter(app) -> None:
    """Attach the shared limiter to *app*.

    Stores the selected storage URI under ``RATELIMIT_STORAGE_URI`` in the
    app config before initialising the limiter extension.
    """
    backend = storage_uri()
    app.config["RATELIMIT_STORAGE_URI"] = backend
    limiter.init_app(app)
@@ -0,0 +1,34 @@
1
+ """Authentication middleware for protected API routes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from flask import Flask, g, jsonify, request
6
+
7
+ from docintel.auth.api_keys import auth_required, extract_bearer_token, validate_credentials
8
+
9
+ PUBLIC_PREFIXES = ("/health", "/docs", "/openapi.json", "/metrics")
10
+
11
+
12
def register_auth(app: Flask) -> None:
    """Install a ``before_request`` hook that authenticates ``/v1/`` routes.

    The hook does nothing when auth is not required, when the path is one
    of ``PUBLIC_PREFIXES`` (or below it), or when the path is outside
    ``/v1/``. Otherwise it answers 401 for a missing or invalid bearer
    token and stores the resulting context on ``g.auth_context``.
    """

    def _is_public(path: str) -> bool:
        # Exact match or a sub-path; "/healthz" does not match "/health".
        return any(
            path == prefix or path.startswith(prefix + "/")
            for prefix in PUBLIC_PREFIXES
        )

    @app.before_request
    def _enforce_api_auth():
        if not auth_required():
            return None

        path = request.path or ""
        if _is_public(path) or not path.startswith("/v1/"):
            return None

        token = extract_bearer_token()
        if not token:
            return jsonify({"error": "Missing Authorization: Bearer <api_key> header."}), 401

        context = validate_credentials(token)
        if context is None:
            return jsonify({"error": "Invalid API credentials."}), 401

        g.auth_context = context
        return None
docintel/auth/oidc.py ADDED
@@ -0,0 +1,45 @@
1
+ """Optional OIDC bearer token validation (Session 5 hook, no-op when unset)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+
7
+ from docintel.auth.api_keys import AuthContext
8
+
9
+
10
def oidc_enabled() -> bool:
    """Report whether ``DOCINTEL_OIDC_ISSUER`` holds a non-blank value."""
    issuer = os.getenv("DOCINTEL_OIDC_ISSUER", "")
    return issuer.strip() != ""
12
+
13
+
14
def validate_oidc_token(token: str) -> AuthContext | None:
    """Validate *token* as an OIDC-issued JWT.

    Args:
        token: Bearer token presented by the caller.

    Returns:
        An ``AuthContext`` with method ``"oidc"`` whose subject is the
        ``sub`` claim, else the ``email`` claim, else ``"oidc-user"``.
        ``None`` when OIDC is not configured, the token is not shaped like
        a JWT, or signature/claim validation fails.

    Raises:
        RuntimeError: OIDC is configured but PyJWT is not installed.
    """
    import logging

    if not oidc_enabled():
        return None
    # A compact JWT has exactly three dot-separated segments; anything
    # else is rejected before any network call is made.
    if token.count(".") != 2:
        return None

    issuer = os.getenv("DOCINTEL_OIDC_ISSUER", "").strip()
    audience = os.getenv("DOCINTEL_OIDC_AUDIENCE", "").strip() or None
    jwks_url = os.getenv("DOCINTEL_OIDC_JWKS_URL", "").strip()
    if not jwks_url and issuer:
        jwks_url = issuer.rstrip("/") + "/.well-known/jwks.json"

    try:
        import jwt
        from jwt import PyJWKClient
    except ImportError as exc:
        raise RuntimeError(
            "OIDC auth requires PyJWT. Run: pip install -e '.[auth]'"
        ) from exc

    try:
        client = PyJWKClient(jwks_url)
        signing_key = client.get_signing_key_from_jwt(token)
        claims = jwt.decode(
            token,
            signing_key.key,
            algorithms=["RS256", "ES256"],
            audience=audience,
            issuer=issuer,
            # Audience is only enforced when one is configured.
            options={"verify_aud": audience is not None},
        )
    except jwt.PyJWTError:
        # Bad signature, expired token, wrong issuer/audience, unknown key
        # id or JWKS fetch failure: report "not authenticated" so the
        # caller answers 401 instead of crashing with a 500. The token is
        # deliberately not logged.
        logging.getLogger(__name__).warning(
            "OIDC token validation failed", exc_info=True
        )
        return None

    subject = str(claims.get("sub") or claims.get("email") or "oidc-user")
    return AuthContext(method="oidc", subject=subject)
docintel/cli.py ADDED
@@ -0,0 +1,21 @@
1
+ """CLI entry point."""
2
+
3
+ import argparse
4
+
5
+ from docintel.app import create_app
6
+ from docintel.config import Config
7
+
8
+
9
def main() -> None:
    """Parse command-line options and run the development server.

    ``--host`` and ``--port`` default to the values on ``Config``;
    ``--debug`` is off unless the flag is given.
    """
    cli = argparse.ArgumentParser(description="Run the document intelligence API.")
    cli.add_argument("--host", default=Config.HOST)
    cli.add_argument("--port", type=int, default=Config.PORT)
    # NOTE(review): Config.DEBUG (DOCINTEL_DEBUG) is not consulted here,
    # unlike HOST and PORT; confirm whether that is intentional.
    cli.add_argument("--debug", action="store_true")
    options = cli.parse_args()

    create_app().run(host=options.host, port=options.port, debug=options.debug)
18
+
19
+
20
+ if __name__ == "__main__":
21
+ main()
docintel/client.py ADDED
@@ -0,0 +1,193 @@
1
+ """Python client for the Document Intelligence Platform REST API."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import time
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import requests
10
+
11
+
12
class DocintelError(Exception):
    """Raised when the API returns an error, or a polled job fails or times out."""


class DocintelClient:
    """HTTP client for ``/v1/*`` document intelligence endpoints.

    One ``requests.Session`` is reused for every call. Use the client as a
    context manager, or call :meth:`close`, to release its connections.
    """

    def __init__(
        self,
        base_url: str = "http://127.0.0.1:5000",
        api_key: str | None = None,
        timeout: int = 120,
    ) -> None:
        """Create a client.

        Args:
            base_url: Server root; a trailing slash is removed.
            api_key: Optional key sent as ``Authorization: Bearer <key>``.
            timeout: Per-request timeout in seconds.
        """
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self._session = requests.Session()
        if api_key:
            self._session.headers["Authorization"] = f"Bearer {api_key}"

    def close(self) -> None:
        """Close the underlying HTTP session."""
        self._session.close()

    def __enter__(self) -> "DocintelClient":
        return self

    def __exit__(self, *exc_info: object) -> None:
        self.close()

    def _url(self, path: str) -> str:
        """Join *path* onto the base URL, adding a leading slash if needed."""
        if not path.startswith("/"):
            path = f"/{path}"
        return f"{self.base_url}{path}"

    def _raise_for_status(self, response: requests.Response) -> None:
        """Raise ``DocintelError`` for a non-2xx/3xx response.

        The message is the JSON body's ``error`` field when present and
        non-empty, otherwise the response text, otherwise ``HTTP <code>``.
        """
        if response.ok:
            return
        fallback = response.text or f"HTTP {response.status_code}"
        try:
            payload = response.json()
        except ValueError:
            # Body is not JSON (requests' JSON errors subclass ValueError).
            payload = None
        # The body may be valid JSON without being an object.
        message = payload.get("error") if isinstance(payload, dict) else None
        raise DocintelError(message or fallback)

    def health(self) -> dict[str, Any]:
        """Return the ``/health`` payload."""
        response = self._session.get(self._url("/health"), timeout=self.timeout)
        self._raise_for_status(response)
        return response.json()

    def get_job(self, job_id: str) -> dict[str, Any]:
        """Return the current status payload of job *job_id*."""
        response = self._session.get(self._url(f"/v1/jobs/{job_id}"), timeout=self.timeout)
        self._raise_for_status(response)
        return response.json()

    def poll_job(
        self,
        job_id: str,
        *,
        interval_seconds: float = 2.0,
        timeout_seconds: float = 600.0,
    ) -> dict[str, Any]:
        """Poll a job until it completes.

        Args:
            job_id: Identifier returned when the job was queued.
            interval_seconds: Pause between polls.
            timeout_seconds: Overall time budget.

        Returns:
            The job payload once ``job_status`` is ``"completed"``.

        Raises:
            DocintelError: The job failed, or did not complete in time.
        """
        # Monotonic clock: wall-clock adjustments must not stretch or
        # shorten the deadline.
        deadline = time.monotonic() + timeout_seconds
        while True:
            payload = self.get_job(job_id)
            status = payload.get("job_status")
            if status == "completed":
                return payload
            if status == "failed":
                raise DocintelError(payload.get("error", "Job failed"))
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # Never sleep past the deadline; the loop then polls one last
            # time so a job finishing during the final sleep is not missed.
            time.sleep(min(interval_seconds, remaining))
        raise DocintelError(f"Job {job_id} timed out after {timeout_seconds}s")

    def download(self, download_url: str) -> bytes:
        """Fetch *download_url* (a server-relative path) and return its bytes."""
        response = self._session.get(self._url(download_url), timeout=self.timeout)
        self._raise_for_status(response)
        return response.content

    def _post_pdf(
        self,
        endpoint: str,
        pdf_path: str | Path,
        *,
        params: dict[str, str],
        data: dict[str, str],
    ) -> requests.Response:
        """Upload the PDF at *pdf_path* to *endpoint* as multipart form data."""
        path = Path(pdf_path)
        with path.open("rb") as handle:
            return self._session.post(
                self._url(endpoint),
                params=params,
                files={"file": (path.name, handle, "application/pdf")},
                data=data,
                timeout=self.timeout,
            )

    def _finish_pdf_response(
        self, response: requests.Response, *, poll: bool
    ) -> dict[str, Any] | bytes:
        """Turn a PDF endpoint response into the caller-facing result."""
        if response.status_code == 202:
            payload = response.json()
            if not poll:
                return payload
            payload = self.poll_job(payload["job_id"])
            # A completed job may carry no file; return its payload then
            # instead of failing on a missing key.
            download_url = payload.get("download_url")
            if download_url:
                return self.download(download_url)
            return payload
        self._raise_for_status(response)
        if "application/pdf" in response.headers.get("Content-Type", ""):
            return response.content
        return response.json()

    def structure_pdf(
        self,
        pdf_path: str | Path,
        *,
        mode: str = "curate",
        force_ocr: bool = False,
        redact_before_llm: bool = False,
        async_job: bool = False,
        callback_url: str | None = None,
        poll: bool = True,
    ) -> dict[str, Any] | bytes:
        """Send a PDF to ``/v1/pdf/structure``.

        Args:
            pdf_path: Local PDF file to upload.
            mode: Value of the ``mode`` form field.
            force_ocr: Sent as the ``force_ocr`` form field.
            redact_before_llm: Sent as the ``redact_before_llm`` form field.
            async_job: Add ``async=true`` to the query string.
            callback_url: Optional ``callback_url`` form field.
            poll: On a 202 response, wait for the job and fetch its result.

        Returns:
            PDF bytes when the server returns or links a PDF, otherwise
            the decoded JSON payload.

        Raises:
            DocintelError: The request or the queued job failed.
        """
        params = {"async": "true"} if async_job else {}
        data = {
            "mode": mode,
            "force_ocr": str(force_ocr).lower(),
            "redact_before_llm": str(redact_before_llm).lower(),
        }
        if callback_url:
            data["callback_url"] = callback_url
        response = self._post_pdf(
            "/v1/pdf/structure", pdf_path, params=params, data=data
        )
        return self._finish_pdf_response(response, poll=poll)

    def detect_sensitive(
        self,
        pdf_path: str | Path,
        *,
        action: str = "Highlight",
        entities: str | None = None,
        force_ocr: bool = False,
        add_text_layer: bool = True,
        async_job: bool = False,
        callback_url: str | None = None,
        response_format: str = "json",
        poll: bool = True,
    ) -> dict[str, Any] | bytes:
        """Send a PDF to ``/v1/pdf/detect-sensitive``.

        Args:
            pdf_path: Local PDF file to upload.
            action: Value of the ``action`` form field.
            entities: Optional ``entities`` form field.
            force_ocr: Sent as the ``force_ocr`` form field.
            add_text_layer: Sent as the ``add_text_layer`` form field.
            async_job: Add ``async=true`` to the query string.
            callback_url: Optional ``callback_url`` form field.
            response_format: ``"json"`` adds ``format=json`` to the query.
            poll: On a 202 response, wait for the job and fetch its result.

        Returns:
            PDF bytes when the server returns or links a PDF, otherwise
            the decoded JSON payload.

        Raises:
            DocintelError: The request or the queued job failed.
        """
        params: dict[str, str] = {}
        if async_job:
            params["async"] = "true"
        if response_format == "json":
            params["format"] = "json"
        data: dict[str, str] = {
            "action": action,
            "force_ocr": str(force_ocr).lower(),
            "add_text_layer": str(add_text_layer).lower(),
        }
        if entities:
            data["entities"] = entities
        if callback_url:
            data["callback_url"] = callback_url
        response = self._post_pdf(
            "/v1/pdf/detect-sensitive", pdf_path, params=params, data=data
        )
        return self._finish_pdf_response(response, poll=poll)

    def match_resume(
        self,
        resume: str,
        job_description: str,
        *,
        top_keywords: int = 25,
    ) -> dict[str, Any]:
        """POST a resume and job description to ``/v1/match/resume``."""
        response = self._session.post(
            self._url("/v1/match/resume"),
            json={
                "resume": resume,
                "job_description": job_description,
                "top_keywords": top_keywords,
            },
            timeout=self.timeout,
        )
        self._raise_for_status(response)
        return response.json()

    def summarize(self, text: str, *, sentences: int = 3) -> dict[str, Any]:
        """POST *text* to ``/v1/text/summarize`` and return the JSON result."""
        response = self._session.post(
            self._url("/v1/text/summarize"),
            json={"text": text, "sentences": sentences},
            timeout=self.timeout,
        )
        self._raise_for_status(response)
        return response.json()
docintel/config.py ADDED
@@ -0,0 +1,20 @@
1
+ """Application configuration."""
2
+
3
+ import os
4
+
5
+
6
def _env_flag(name: str, default: str) -> bool:
    """Return True when environment variable *name* is "true" (any case)."""
    return os.getenv(name, default).lower() == "true"


class Config:
    """Settings read from ``DOCINTEL_*`` environment variables at import time."""

    # Server
    HOST = os.getenv("DOCINTEL_HOST", "127.0.0.1")
    PORT = int(os.getenv("DOCINTEL_PORT", "5000"))
    DEBUG = _env_flag("DOCINTEL_DEBUG", "false")
    UPLOAD_DIR = os.getenv("DOCINTEL_UPLOAD_DIR", "uploads")
    LOG_LEVEL = os.getenv("DOCINTEL_LOG_LEVEL", "INFO")

    # Job queue
    REDIS_URL = os.getenv("DOCINTEL_REDIS_URL", "redis://localhost:6379/0")
    JOBS_ENABLED = _env_flag("DOCINTEL_JOBS_ENABLED", "true")
    QUEUE_NAME = os.getenv("DOCINTEL_QUEUE_NAME", "docintel")

    # Authentication and rate limiting
    API_KEYS = os.getenv("DOCINTEL_API_KEYS", "")
    AUTH_REQUIRED = _env_flag("DOCINTEL_AUTH_REQUIRED", "false")
    RATE_LIMIT_ENABLED = _env_flag("DOCINTEL_RATE_LIMIT_ENABLED", "true")

    # OIDC
    OIDC_ISSUER = os.getenv("DOCINTEL_OIDC_ISSUER", "")
    OIDC_AUDIENCE = os.getenv("DOCINTEL_OIDC_AUDIENCE", "")
    OIDC_JWKS_URL = os.getenv("DOCINTEL_OIDC_JWKS_URL", "")
@@ -0,0 +1,16 @@
1
+ """Async job queue for long-running document tasks."""
2
+
3
+ from docintel.jobs.models import JobRecord, JobStatus, JobType
4
+ from docintel.jobs.store import get_job, jobs_enabled, ping_redis, save_job
5
+ from docintel.jobs.tasks import create_queued_job
6
+
7
+ __all__ = [
8
+ "JobRecord",
9
+ "JobStatus",
10
+ "JobType",
11
+ "create_queued_job",
12
+ "get_job",
13
+ "jobs_enabled",
14
+ "ping_redis",
15
+ "save_job",
16
+ ]
@@ -0,0 +1,38 @@
1
+ """Shared helpers for async job enqueue from HTTP routes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from flask import jsonify
6
+
7
+ from docintel.jobs.models import JobType
8
+ from docintel.jobs.store import jobs_enabled, ping_redis
9
+ from docintel.jobs.tasks import create_queued_job
10
+
11
+
12
def enqueue_async_response(
    *,
    job_id: str,
    job_type: JobType,
    callback_url: str | None,
):
    """Record a queued job and build the HTTP response for it.

    Args:
        job_id: Identifier of the job being queued.
        job_type: Kind of job; its value is echoed in the payload.
        callback_url: Optional callback URL stored with the job.

    Returns:
        A ``(response, status)`` pair: 503 when async jobs are disabled or
        Redis does not answer a ping, otherwise 202 with the job payload.
    """
    if not jobs_enabled():
        disabled = {"error": "Async jobs are disabled on this server."}
        return jsonify(disabled), 503

    if not ping_redis():
        unreachable = {
            "error": "Redis is not reachable. Start Redis or set DOCINTEL_REDIS_URL.",
            "hint": "Use async=false for synchronous processing without a queue.",
        }
        return jsonify(unreachable), 503

    create_queued_job(job_id, job_type=job_type, callback_url=callback_url)
    return jsonify(
        {
            "status": "ok",
            "job_id": job_id,
            "job_type": job_type.value,
            "job_status": "queued",
            "poll_url": f"/v1/jobs/{job_id}",
            "message": "Job queued. Poll poll_url until job_status is completed.",
        }
    ), 202
@@ -0,0 +1,78 @@
1
+ """Job status types for async document processing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from enum import Enum
7
+ from typing import Any
8
+
9
+
10
class JobStatus(str, Enum):
    """Lifecycle states of an async job."""

    QUEUED = "queued"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"

    @classmethod
    def from_value(cls, value: str) -> "JobStatus":
        """Look up a status by value, ignoring case and outer whitespace.

        Raises:
            ValueError: *value* matches no status.
        """
        wanted = value.strip().lower()
        found = next((member for member in cls if member.value == wanted), None)
        if found is None:
            raise ValueError(f"Unknown job status: {value}")
        return found
23
+
24
+
25
class JobType(str, Enum):
    """Kinds of async document job."""

    PDF_STRUCTURE = "pdf_structure"
    PDF_DETECT_SENSITIVE = "pdf_detect_sensitive"
28
+
29
+
30
@dataclass
class JobRecord:
    """State of one async job, convertible to and from a plain dict."""

    job_id: str
    job_type: JobType
    status: JobStatus
    progress: int = 0
    progress_message: str = ""
    pages_done: int = 0
    pages_total: int = 0
    callback_url: str | None = None
    download_url: str | None = None
    error: str | None = None
    result: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialise the record.

        Core fields are always present (the status under ``job_status``);
        ``callback_url``, ``download_url``, ``error`` and ``result`` are
        included only when truthy.
        """
        payload: dict[str, Any] = {
            "job_id": self.job_id,
            "job_type": self.job_type.value,
            "job_status": self.status.value,
            "progress": self.progress,
            "progress_message": self.progress_message,
            "pages_done": self.pages_done,
            "pages_total": self.pages_total,
        }
        for optional in ("callback_url", "download_url", "error", "result"):
            value = getattr(self, optional)
            if value:
                payload[optional] = value
        return payload

    @classmethod
    def from_dict(cls, payload: dict[str, Any]) -> "JobRecord":
        """Rebuild a record from a dict such as one made by ``to_dict``.

        The status is read from ``job_status``, falling back to ``status``.
        Missing numeric fields default to 0 and missing text to "".
        """
        read = payload.get
        raw_status = read("job_status", read("status"))
        return cls(
            job_id=str(payload["job_id"]),
            job_type=JobType(payload["job_type"]),
            status=JobStatus(raw_status),
            progress=int(read("progress", 0)),
            progress_message=str(read("progress_message", "")),
            pages_done=int(read("pages_done", 0)),
            pages_total=int(read("pages_total", 0)),
            callback_url=read("callback_url"),
            download_url=read("download_url"),
            error=read("error"),
            result=dict(read("result") or {}),
        )
docintel/jobs/queue.py ADDED
@@ -0,0 +1,75 @@
1
+ """RQ queue helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+
7
+ from docintel.jobs.store import redis_url
8
+
9
+ QUEUE_NAME = os.getenv("DOCINTEL_QUEUE_NAME", "docintel")
10
+ DEFAULT_RESULT_TTL = 60 * 60 * 24
11
+ DEFAULT_FAILURE_TTL = 60 * 60 * 24
12
+
13
+
14
def get_queue():
    """Return an RQ ``Queue`` named ``QUEUE_NAME`` on a new Redis connection.

    ``redis`` and ``rq`` are imported on first use rather than at module
    load.
    """
    from redis import Redis
    from rq import Queue

    return Queue(QUEUE_NAME, connection=Redis.from_url(redis_url()))
20
+
21
+
22
def enqueue_structure_job(
    job_id: str,
    input_path: str,
    output_path: str,
    mode: str,
    force_ocr: bool,
    output_filename: str,
    redact_before_llm: bool = False,
) -> None:
    """Queue ``docintel.jobs.tasks.run_structure_pdf_job``.

    All arguments are forwarded to the task by keyword, together with a
    1800-second job timeout and the default result and failure TTLs.
    """
    task_kwargs = {
        "job_id": job_id,
        "input_path": input_path,
        "output_path": output_path,
        "mode": mode,
        "force_ocr": force_ocr,
        "output_filename": output_filename,
        "redact_before_llm": redact_before_llm,
    }
    queue_options = {
        "job_timeout": 1800,
        "result_ttl": DEFAULT_RESULT_TTL,
        "failure_ttl": DEFAULT_FAILURE_TTL,
    }
    get_queue().enqueue(
        "docintel.jobs.tasks.run_structure_pdf_job",
        **task_kwargs,
        **queue_options,
    )
45
+
46
+
47
def enqueue_detect_sensitive_job(
    job_id: str,
    input_path: str,
    output_path: str,
    output_filename: str,
    action: str,
    force_ocr: bool,
    add_text_layer: bool,
    min_score: float,
    entities: list[str] | None = None,
    pattern: str | None = None,
) -> None:
    """Queue ``docintel.jobs.tasks.run_detect_sensitive_pdf_job``.

    All arguments are forwarded to the task by keyword, together with a
    1800-second job timeout and the default result and failure TTLs.
    """
    task_kwargs = {
        "job_id": job_id,
        "input_path": input_path,
        "output_path": output_path,
        "output_filename": output_filename,
        "action": action,
        "force_ocr": force_ocr,
        "add_text_layer": add_text_layer,
        "min_score": min_score,
        "entities": entities,
        "pattern": pattern,
    }
    queue_options = {
        "job_timeout": 1800,
        "result_ttl": DEFAULT_RESULT_TTL,
        "failure_ttl": DEFAULT_FAILURE_TTL,
    }
    get_queue().enqueue(
        "docintel.jobs.tasks.run_detect_sensitive_pdf_job",
        **task_kwargs,
        **queue_options,
    )