docintel-platform 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. docintel/__init__.py +6 -0
  2. docintel/app.py +45 -0
  3. docintel/auth/__init__.py +12 -0
  4. docintel/auth/api_keys.py +48 -0
  5. docintel/auth/limiter.py +41 -0
  6. docintel/auth/middleware.py +34 -0
  7. docintel/auth/oidc.py +45 -0
  8. docintel/cli.py +21 -0
  9. docintel/client.py +193 -0
  10. docintel/config.py +20 -0
  11. docintel/jobs/__init__.py +16 -0
  12. docintel/jobs/helpers.py +38 -0
  13. docintel/jobs/models.py +78 -0
  14. docintel/jobs/queue.py +75 -0
  15. docintel/jobs/store.py +82 -0
  16. docintel/jobs/tasks.py +173 -0
  17. docintel/jobs/webhooks.py +32 -0
  18. docintel/openapi/__init__.py +1 -0
  19. docintel/openapi/openapi.yaml +380 -0
  20. docintel/ops/__init__.py +1 -0
  21. docintel/ops/logging.py +40 -0
  22. docintel/ops/metrics.py +57 -0
  23. docintel/ops/middleware.py +40 -0
  24. docintel/routes/__init__.py +1 -0
  25. docintel/routes/jobs.py +26 -0
  26. docintel/routes/match.py +43 -0
  27. docintel/routes/openapi_docs.py +57 -0
  28. docintel/routes/ops.py +22 -0
  29. docintel/routes/pdf.py +420 -0
  30. docintel/routes/text.py +41 -0
  31. docintel/services/__init__.py +1 -0
  32. docintel/services/matching/__init__.py +6 -0
  33. docintel/services/matching/models.py +19 -0
  34. docintel/services/matching/scorer.py +64 -0
  35. docintel/services/pdf/__init__.py +26 -0
  36. docintel/services/pdf/annotator.py +188 -0
  37. docintel/services/pdf/models.py +104 -0
  38. docintel/services/pdf/ocr.py +130 -0
  39. docintel/services/pdf/pii.py +105 -0
  40. docintel/services/pdf/presets.py +26 -0
  41. docintel/services/pdf/search.py +29 -0
  42. docintel/services/pdf/sensitive.py +212 -0
  43. docintel/services/pdf/structure.py +118 -0
  44. docintel/services/pdf/structure_llm.py +136 -0
  45. docintel/services/pdf/structure_render.py +136 -0
  46. docintel/services/pdf/structure_schema.py +99 -0
  47. docintel/services/summary/__init__.py +6 -0
  48. docintel/services/summary/models.py +21 -0
  49. docintel/services/summary/textrank.py +57 -0
  50. docintel/ui.py +347 -0
  51. docintel/wsgi.py +5 -0
  52. docintel_platform-1.0.2.dist-info/METADATA +607 -0
  53. docintel_platform-1.0.2.dist-info/RECORD +56 -0
  54. docintel_platform-1.0.2.dist-info/WHEEL +5 -0
  55. docintel_platform-1.0.2.dist-info/entry_points.txt +3 -0
  56. docintel_platform-1.0.2.dist-info/top_level.txt +1 -0
docintel/jobs/store.py ADDED
@@ -0,0 +1,82 @@
1
+ """Redis-backed job metadata store."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ from functools import lru_cache
8
+
9
+ from docintel.jobs.models import JobRecord
10
+
11
+ JOB_KEY_PREFIX = "docintel:job:"
12
+ DEFAULT_JOB_TTL_SECONDS = 60 * 60 * 24 * 7
13
+
14
+
15
def redis_url() -> str:
    """Return the Redis connection URL used by the job store.

    The value comes from the ``DOCINTEL_REDIS_URL`` environment variable and
    falls back to a local instance when the variable is unset. Leading and
    trailing whitespace is removed from whichever value is used.
    """
    configured = os.environ.get("DOCINTEL_REDIS_URL")
    if configured is None:
        configured = "redis://localhost:6379/0"
    return configured.strip()
17
+
18
+
19
def jobs_enabled() -> bool:
    """Report whether asynchronous job processing is switched on.

    Reads the ``DOCINTEL_JOBS_ENABLED`` environment variable (default
    ``"true"``). The value is trimmed and compared case-insensitively, so
    stray whitespace from env files does not silently disable jobs. The
    spellings ``true``, ``1``, ``yes`` and ``on`` enable jobs; any other
    value (including an empty string) disables them.
    """
    flag = os.getenv("DOCINTEL_JOBS_ENABLED", "true").strip().lower()
    return flag in {"true", "1", "yes", "on"}
21
+
22
+
23
@lru_cache(maxsize=1)
def _redis_client():
    """Build the shared Redis client for the job store.

    The result is memoised by ``lru_cache``, so the client is constructed on
    the first call and reused afterwards until the cache is cleared. The
    ``redis`` package is imported lazily so that importing this module does
    not require it.
    """
    from redis import Redis

    url = redis_url()
    # decode_responses=True is passed so replies come back as text.
    return Redis.from_url(url, decode_responses=True)
28
+
29
+
30
def reset_redis_client_cache() -> None:
    """Drop the memoised Redis client so the next call rebuilds it (used in tests)."""
    # Guard: if the factory has been swapped for something without a cache
    # (e.g. a plain function), there is nothing to clear.
    if not hasattr(_redis_client, "cache_clear"):
        return
    _redis_client.cache_clear()
34
+
35
+
36
def _job_key(job_id: str) -> str:
    """Return the Redis key for ``job_id``: ``JOB_KEY_PREFIX`` followed by the id."""
    return "{}{}".format(JOB_KEY_PREFIX, job_id)
38
+
39
+
40
def save_job(record: JobRecord, ttl_seconds: int = DEFAULT_JOB_TTL_SECONDS) -> None:
    """Persist ``record`` in Redis as JSON.

    The record is serialised from ``record.to_dict()`` and stored under the
    key derived from ``record.job_id``. ``ttl_seconds`` is passed as the
    key's expiry (``ex=``), so every save sets the expiry again.
    """
    store = _redis_client()
    key = _job_key(record.job_id)
    body = json.dumps(record.to_dict())
    store.set(key, body, ex=ttl_seconds)
43
+
44
+
45
def get_job(job_id: str) -> JobRecord | None:
    """Load the job stored for ``job_id``.

    Returns ``None`` when Redis has no value (or an empty value) for the
    job's key; otherwise the stored JSON is decoded and handed to
    ``JobRecord.from_dict``.
    """
    payload = _redis_client().get(_job_key(job_id))
    return JobRecord.from_dict(json.loads(payload)) if payload else None
51
+
52
+
53
def update_job(job_id: str, **changes) -> JobRecord:
    """Apply ``changes`` to a stored job and save the result.

    Recognised keys are ``job_status``, ``progress``, ``progress_message``,
    ``pages_done``, ``pages_total``, ``callback_url``, ``download_url``,
    ``error`` and ``result``; any other key is ignored. Fields that are not
    mentioned keep the stored value. ``job_id`` and ``job_type`` can never
    be changed.

    The merged record is written back through ``save_job`` with its default
    TTL and then returned.

    Raises:
        KeyError: no job is stored for ``job_id``.
    """
    from docintel.jobs.models import JobStatus

    current = get_job(job_id)
    if current is None:
        raise KeyError(f"Job not found: {job_id}")

    def pick(field: str, fallback):
        # A key present in ``changes`` wins even when its value is None.
        return changes.get(field, fallback)

    merged = JobRecord(
        job_id=current.job_id,
        job_type=current.job_type,
        # Status travels as the enum's value and is re-validated here.
        status=JobStatus(pick("job_status", current.status.value)),
        progress=int(pick("progress", current.progress)),
        progress_message=str(pick("progress_message", current.progress_message)),
        pages_done=int(pick("pages_done", current.pages_done)),
        pages_total=int(pick("pages_total", current.pages_total)),
        callback_url=pick("callback_url", current.callback_url),
        download_url=pick("download_url", current.download_url),
        error=pick("error", current.error),
        result=pick("result", current.result),
    )
    save_job(merged)
    return merged
76
+
77
+
78
def ping_redis() -> bool:
    """Return True when Redis answers a PING, False on any failure.

    Every exception (client construction, connection, or the ping itself)
    is treated as "not reachable" so this can back a health check.
    """
    try:
        alive = bool(_redis_client().ping())
    except Exception:
        return False
    return alive
docintel/jobs/tasks.py ADDED
@@ -0,0 +1,173 @@
1
+ """Background worker tasks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from docintel.jobs.models import JobRecord, JobStatus, JobType
8
+ from docintel.jobs.store import get_job, save_job, update_job
9
+ from docintel.services.pdf.models import Action, StructureMode
10
+ from docintel.services.pdf.sensitive import detect_sensitive_pdf
11
+ from docintel.services.pdf.structure import structure_pdf
12
+
13
+
14
+ def _job_progress_callback(job_id: str):
15
+ def _callback(*, stage: str, pages_done: int, pages_total: int, message: str) -> None:
16
+ if pages_total <= 0:
17
+ progress = 10
18
+ elif stage == "rendering":
19
+ progress = 95
20
+ else:
21
+ progress = 10 + int(80 * pages_done / pages_total)
22
+ update_job(
23
+ job_id,
24
+ job_status=JobStatus.RUNNING.value,
25
+ progress=progress,
26
+ progress_message=message,
27
+ pages_done=pages_done,
28
+ pages_total=pages_total,
29
+ )
30
+
31
+ return _callback
32
+
33
+
34
def run_structure_pdf_job(
    *,
    job_id: str,
    input_path: str,
    output_path: str,
    mode: str,
    force_ocr: bool,
    output_filename: str,
    redact_before_llm: bool = False,
) -> dict:
    """Worker entrypoint: OCR + LLM structure, then update job metadata.

    Marks the job RUNNING, calls ``structure_pdf`` with a progress callback
    bound to ``job_id``, then marks the job COMPLETED with a download URL
    and the result payload. If the job has a ``callback_url``, the final
    record (completed or failed) is sent to it.

    Returns:
        ``result.to_dict()`` from ``structure_pdf``.

    Raises:
        KeyError: from the initial ``update_job`` when the job is not stored.
        Exception: whatever ``structure_pdf`` raised, re-raised unchanged
            after the job has been marked FAILED on a best-effort basis.
    """
    record = get_job(job_id)
    callback_url = record.callback_url if record else None

    update_job(
        job_id,
        job_status=JobStatus.RUNNING.value,
        progress=5,
        progress_message="Job started",
    )

    try:
        result = structure_pdf(
            input_file=Path(input_path),
            output_file=Path(output_path),
            mode=StructureMode.from_value(mode),
            force_ocr=force_ocr,
            redact_before_llm=redact_before_llm,
            progress_callback=_job_progress_callback(job_id),
        )
    except Exception as exc:
        # Failure bookkeeping is best-effort. If the job record expired or
        # Redis is unreachable, update_job raises too; that secondary error
        # must not replace the real processing failure seen by the worker.
        try:
            failed = update_job(
                job_id,
                job_status=JobStatus.FAILED.value,
                progress=100,
                progress_message="Job failed",
                error=str(exc),
            )
            _notify_webhook(callback_url, failed)
        except Exception:
            import logging

            logging.getLogger(__name__).exception(
                "could not record failure for job %s", job_id
            )
        raise

    download_url = f"/v1/pdf/files/{job_id}/{output_filename}"
    result_payload = result.to_dict()
    completed = update_job(
        job_id,
        job_status=JobStatus.COMPLETED.value,
        progress=100,
        progress_message="Job completed",
        download_url=download_url,
        result=result_payload,
    )
    _notify_webhook(callback_url, completed)
    return result_payload
87
+
88
+
89
+ def _notify_webhook(callback_url: str | None, record: JobRecord) -> None:
90
+ if not callback_url:
91
+ return
92
+ from docintel.jobs.webhooks import deliver_job_webhook
93
+
94
+ deliver_job_webhook(callback_url, record.to_dict())
95
+
96
+
97
def run_detect_sensitive_pdf_job(
    *,
    job_id: str,
    input_path: str,
    output_path: str,
    output_filename: str,
    action: str,
    force_ocr: bool,
    add_text_layer: bool,
    min_score: float,
    entities: list[str] | None = None,
    pattern: str | None = None,
) -> dict:
    """Worker entrypoint: OCR + Presidio sensitive PDF detection.

    Marks the job RUNNING, calls ``detect_sensitive_pdf`` with a progress
    callback bound to ``job_id``, then marks the job COMPLETED with a
    download URL and the result payload. If the job has a ``callback_url``,
    the final record (completed or failed) is sent to it.

    Returns:
        ``result.to_dict()`` from ``detect_sensitive_pdf``.

    Raises:
        KeyError: from the initial ``update_job`` when the job is not stored.
        Exception: whatever ``detect_sensitive_pdf`` raised, re-raised
            unchanged after the job has been marked FAILED on a best-effort
            basis.
    """
    record = get_job(job_id)
    callback_url = record.callback_url if record else None

    update_job(
        job_id,
        job_status=JobStatus.RUNNING.value,
        progress=5,
        progress_message="Sensitive detection started",
    )

    try:
        result = detect_sensitive_pdf(
            input_file=Path(input_path),
            output_file=Path(output_path),
            entities=entities,
            action=Action.from_value(action),
            force_ocr=force_ocr,
            add_text_layer=add_text_layer,
            pattern=pattern,
            min_score=min_score,
            progress_callback=_job_progress_callback(job_id),
        )
    except Exception as exc:
        # Failure bookkeeping is best-effort. If the job record expired or
        # Redis is unreachable, update_job raises too; that secondary error
        # must not replace the real processing failure seen by the worker.
        try:
            failed = update_job(
                job_id,
                job_status=JobStatus.FAILED.value,
                progress=100,
                progress_message="Job failed",
                error=str(exc),
            )
            _notify_webhook(callback_url, failed)
        except Exception:
            import logging

            logging.getLogger(__name__).exception(
                "could not record failure for job %s", job_id
            )
        raise

    download_url = f"/v1/pdf/files/{job_id}/{output_filename}"
    result_payload = result.to_dict()
    completed = update_job(
        job_id,
        job_status=JobStatus.COMPLETED.value,
        progress=100,
        progress_message="Job completed",
        download_url=download_url,
        result=result_payload,
    )
    _notify_webhook(callback_url, completed)
    return result_payload
156
+
157
+
158
def create_queued_job(
    job_id: str,
    *,
    job_type: JobType,
    callback_url: str | None = None,
) -> JobRecord:
    """Create, persist and return the initial record for a new job.

    The record starts in the QUEUED state with 0 progress and the message
    "Queued"; ``callback_url`` is stored on it as given.
    """
    queued = JobRecord(
        status=JobStatus.QUEUED,
        progress_message="Queued",
        progress=0,
        job_id=job_id,
        job_type=job_type,
        callback_url=callback_url,
    )
    save_job(queued)
    return queued
@@ -0,0 +1,32 @@
1
+ """Webhook delivery when async jobs finish."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import Any
7
+
8
+ logger = logging.getLogger("docintel.webhooks")
9
+
10
+
11
def deliver_job_webhook(
    callback_url: str, payload: dict[str, Any], timeout: float = 30
) -> bool:
    """POST the job result to the caller's webhook URL.

    Delivery is best-effort: every failure is logged as a warning and
    reported through the return value; nothing is raised to the caller.

    Args:
        callback_url: Destination URL. Surrounding whitespace is ignored.
            Only ``http`` and ``https`` URLs are attempted.
        payload: JSON-serialisable body sent as the request's JSON.
        timeout: Seconds passed to ``requests.post`` as its timeout.

    Returns:
        True when ``response.ok`` is true (requests defines that as a
        status code below 400); False for a blank or unsupported URL, a
        missing ``requests`` package, an error status, or any exception
        during delivery.
    """
    target = callback_url.strip() if callback_url else ""
    if not target:
        return False

    # The URL is supplied by the API caller, so refuse anything that is not
    # plain HTTP(S) before touching the network.
    from urllib.parse import urlsplit

    try:
        scheme = urlsplit(target).scheme.lower()
    except ValueError as exc:
        logger.warning("webhook not delivered: invalid URL: %s", exc)
        return False
    if scheme not in ("http", "https"):
        logger.warning("webhook not delivered: unsupported URL scheme %r", scheme)
        return False

    try:
        import requests
    except ImportError:
        logger.warning("requests not installed; webhook not delivered")
        return False

    try:
        response = requests.post(target, json=payload, timeout=timeout)
        if response.ok:
            return True
        logger.warning(
            "webhook delivery failed",
            extra={"status_code": response.status_code, "callback_url": target},
        )
    except Exception as exc:
        logger.warning("webhook delivery error: %s", exc)
    return False
@@ -0,0 +1 @@
1
+ """OpenAPI specification assets."""
@@ -0,0 +1,380 @@
1
+ openapi: 3.0.3
2
+ info:
3
+ title: Document Intelligence Platform API
4
+ version: 1.0.0
5
+ description: |
6
+ REST API for PDF annotation, scanned-document PII detection, LLM PDF structuring,
7
+ resume matching, and text summarization.
8
+
9
+ Long-running PDF tasks support async mode via Redis (`async=true`) and job polling
10
+ at `GET /v1/jobs/{job_id}`.
11
+
12
+ Authenticate with `Authorization: Bearer <token>` (an API key or OIDC JWT) when `DOCINTEL_AUTH_REQUIRED=true`.
13
+ license:
14
+ name: MIT
15
+ url: https://opensource.org/licenses/MIT
16
+ contact:
17
+ name: Babandeep Singh
18
+ url: https://github.com/baban9/document-intelligence-platform
19
+
20
+ servers:
21
+ - url: http://127.0.0.1:5000
22
+ description: Local development
23
+
24
+ tags:
25
+ - name: health
26
+ - name: pdf
27
+ - name: jobs
28
+ - name: match
29
+ - name: text
30
+ - name: ops
31
+
32
+ components:
33
+ securitySchemes:
34
+ bearerAuth:
35
+ type: http
36
+ scheme: bearer
37
+ bearerFormat: API key or OIDC JWT
38
+
39
+ schemas:
40
+ Error:
41
+ type: object
42
+ properties:
43
+ error:
44
+ type: string
45
+
46
+ JobStatus:
47
+ type: object
48
+ properties:
49
+ status:
50
+ type: string
51
+ example: ok
52
+ job_id:
53
+ type: string
54
+ job_type:
55
+ type: string
56
+ enum: [pdf_structure, pdf_detect_sensitive]
57
+ job_status:
58
+ type: string
59
+ enum: [queued, running, completed, failed]
60
+ progress:
61
+ type: integer
62
+ minimum: 0
63
+ maximum: 100
64
+ progress_message:
65
+ type: string
66
+ pages_done:
67
+ type: integer
68
+ pages_total:
69
+ type: integer
70
+ download_url:
71
+ type: string
72
+ poll_url:
73
+ type: string
74
+ callback_url:
75
+ type: string
76
+ error:
77
+ type: string
78
+ result:
79
+ type: object
80
+
81
+ AsyncAccepted:
82
+ type: object
83
+ properties:
84
+ status:
85
+ type: string
86
+ example: ok
87
+ job_id:
88
+ type: string
89
+ job_type:
90
+ type: string
91
+ job_status:
92
+ type: string
93
+ example: queued
94
+ poll_url:
95
+ type: string
96
+ message:
97
+ type: string
98
+
99
+ MatchRequest:
100
+ type: object
101
+ required: [resume, job_description]
102
+ properties:
103
+ resume:
104
+ type: string
105
+ job_description:
106
+ type: string
107
+ top_keywords:
108
+ type: integer
109
+ default: 25
110
+ minimum: 1
111
+ maximum: 100
112
+
113
+ SummarizeRequest:
114
+ type: object
115
+ required: [text]
116
+ properties:
117
+ text:
118
+ type: string
119
+ sentences:
120
+ type: integer
121
+ default: 3
122
+ minimum: 1
123
+ maximum: 20
124
+
125
+ security:
126
+ - bearerAuth: []
127
+
128
+ paths:
129
+ /health:
130
+ get:
131
+ tags: [health]
132
+ summary: Health check
133
+ security: []
134
+ responses:
135
+ "200":
136
+ description: Service is healthy
137
+
138
+ /openapi.json:
139
+ get:
140
+ tags: [ops]
141
+ summary: OpenAPI specification
142
+ security: []
143
+ responses:
144
+ "200":
145
+ description: OpenAPI 3 document
146
+
147
+ /docs:
148
+ get:
149
+ tags: [ops]
150
+ summary: Swagger UI
151
+ security: []
152
+ responses:
153
+ "200":
154
+ description: Interactive API documentation
155
+
156
+ /metrics:
157
+ get:
158
+ tags: [ops]
159
+ summary: Request metrics
160
+ security: []
161
+ responses:
162
+ "200":
163
+ description: In-process metrics snapshot
164
+
165
+ /v1/pdf/annotate:
166
+ post:
167
+ tags: [pdf]
168
+ summary: Regex search and annotate a PDF
169
+ requestBody:
170
+ required: true
171
+ content:
172
+ multipart/form-data:
173
+ schema:
174
+ type: object
175
+ required: [file, pattern]
176
+ properties:
177
+ file:
178
+ type: string
179
+ format: binary
180
+ pattern:
181
+ type: string
182
+ action:
183
+ type: string
184
+ enum: [Highlight, Redact, Frame, Underline, Squiggly, Strikeout, Remove]
185
+ pages:
186
+ type: string
187
+ description: Comma-separated zero-based page indexes
188
+ format:
189
+ type: string
190
+ enum: [file, json]
191
+ responses:
192
+ "200":
193
+ description: Annotated PDF or JSON metadata
194
+ "400":
195
+ description: Invalid request
196
+
197
+ /v1/pdf/entities:
198
+ get:
199
+ tags: [pdf]
200
+ summary: List Presidio entity types
201
+ responses:
202
+ "200":
203
+ description: Supported entities
204
+
205
+ /v1/pdf/detect-sensitive:
206
+ post:
207
+ tags: [pdf]
208
+ summary: Detect PII and annotate PDF (OCR + Presidio)
209
+ parameters:
210
+ - name: async
211
+ in: query
212
+ schema:
213
+ type: boolean
214
+ default: false
215
+ description: Queue job and return 202 when true
216
+ - name: format
217
+ in: query
218
+ schema:
219
+ type: string
220
+ enum: [file, json]
221
+ requestBody:
222
+ required: true
223
+ content:
224
+ multipart/form-data:
225
+ schema:
226
+ type: object
227
+ required: [file]
228
+ properties:
229
+ file:
230
+ type: string
231
+ format: binary
232
+ action:
233
+ type: string
234
+ entities:
235
+ type: string
236
+ description: Comma-separated Presidio entity types
237
+ pattern:
238
+ type: string
239
+ force_ocr:
240
+ type: boolean
241
+ add_text_layer:
242
+ type: boolean
243
+ min_score:
244
+ type: number
245
+ callback_url:
246
+ type: string
247
+ format: uri
248
+ async:
249
+ type: boolean
250
+ responses:
251
+ "200":
252
+ description: Processed PDF or JSON report
253
+ "202":
254
+ description: Job queued
255
+ content:
256
+ application/json:
257
+ schema:
258
+ $ref: "#/components/schemas/AsyncAccepted"
259
+
260
+ /v1/pdf/structure:
261
+ post:
262
+ tags: [pdf]
263
+ summary: Structure scanned PDF with OCR and LLM
264
+ parameters:
265
+ - name: async
266
+ in: query
267
+ schema:
268
+ type: boolean
269
+ default: false
270
+ - name: format
271
+ in: query
272
+ schema:
273
+ type: string
274
+ enum: [file, json]
275
+ requestBody:
276
+ required: true
277
+ content:
278
+ multipart/form-data:
279
+ schema:
280
+ type: object
281
+ required: [file]
282
+ properties:
283
+ file:
284
+ type: string
285
+ format: binary
286
+ mode:
287
+ type: string
288
+ enum: [curate, searchable]
289
+ force_ocr:
290
+ type: boolean
291
+ redact_before_llm:
292
+ type: boolean
293
+ callback_url:
294
+ type: string
295
+ format: uri
296
+ async:
297
+ type: boolean
298
+ responses:
299
+ "200":
300
+ description: Structured PDF or JSON metadata
301
+ "202":
302
+ description: Job queued
303
+ content:
304
+ application/json:
305
+ schema:
306
+ $ref: "#/components/schemas/AsyncAccepted"
307
+
308
+ /v1/pdf/files/{job_id}/{filename}:
309
+ get:
310
+ tags: [pdf]
311
+ summary: Download a generated PDF
312
+ parameters:
313
+ - name: job_id
314
+ in: path
315
+ required: true
316
+ schema:
317
+ type: string
318
+ - name: filename
319
+ in: path
320
+ required: true
321
+ schema:
322
+ type: string
323
+ responses:
324
+ "200":
325
+ description: PDF file
326
+ content:
327
+ application/pdf:
328
+ schema:
329
+ type: string
330
+ format: binary
331
+ "404":
332
+ description: File not found
333
+
334
+ /v1/jobs/{job_id}:
335
+ get:
336
+ tags: [jobs]
337
+ summary: Poll async job status
338
+ parameters:
339
+ - name: job_id
340
+ in: path
341
+ required: true
342
+ schema:
343
+ type: string
344
+ responses:
345
+ "200":
346
+ description: Job status
347
+ content:
348
+ application/json:
349
+ schema:
350
+ $ref: "#/components/schemas/JobStatus"
351
+ "404":
352
+ description: Job not found
353
+
354
+ /v1/match/resume:
355
+ post:
356
+ tags: [match]
357
+ summary: Score resume against job description
358
+ requestBody:
359
+ required: true
360
+ content:
361
+ application/json:
362
+ schema:
363
+ $ref: "#/components/schemas/MatchRequest"
364
+ responses:
365
+ "200":
366
+ description: Match score and keywords
367
+
368
+ /v1/text/summarize:
369
+ post:
370
+ tags: [text]
371
+ summary: Extractive TextRank summarization
372
+ requestBody:
373
+ required: true
374
+ content:
375
+ application/json:
376
+ schema:
377
+ $ref: "#/components/schemas/SummarizeRequest"
378
+ responses:
379
+ "200":
380
+ description: Summary sentences
@@ -0,0 +1 @@
1
+ """Operations: logging, metrics, and request instrumentation."""
@@ -0,0 +1,40 @@
1
+ """Structured JSON logging for the API."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ import sys
8
+ from datetime import datetime, timezone
9
+ from typing import Any
10
+
11
+
12
class JsonFormatter(logging.Formatter):
    """Format log records as single-line JSON objects."""

    def format(self, record: logging.LogRecord) -> str:
        """Serialise ``record`` to one JSON line.

        The payload always carries ``timestamp``, ``level``, ``logger`` and
        ``message``. The request attributes ``method``, ``path``,
        ``status_code``, ``duration_ms`` and ``endpoint`` are copied when
        present on the record (e.g. passed via ``extra=``), and
        ``exception`` holds the formatted traceback when the record has
        exception info. Values JSON cannot encode are stringified.
        """
        # Use the time the record was created, not the time it happens to
        # be formatted; the two differ when handlers buffer or queue records.
        emitted_at = datetime.fromtimestamp(record.created, tz=timezone.utc)
        payload: dict[str, Any] = {
            "timestamp": emitted_at.isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
        }
        for key in ("method", "path", "status_code", "duration_ms", "endpoint"):
            if hasattr(record, key):
                payload[key] = getattr(record, key)
        if record.exc_info:
            payload["exception"] = self.formatException(record.exc_info)
        return json.dumps(payload, default=str)
28
+
29
+
30
def configure_logging(level: str = "INFO") -> None:
    """Route all logging to stdout as JSON lines at the given level.

    Existing root handlers are discarded, the root level is set from
    ``level`` (upper-cased first), and a single stdout handler using
    ``JsonFormatter`` is attached. The ``werkzeug`` logger is raised to
    WARNING.
    """
    root_logger = logging.getLogger()
    del root_logger.handlers[:]
    root_logger.setLevel(level.upper())

    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(JsonFormatter())
    root_logger.addHandler(stdout_handler)

    werkzeug_logger = logging.getLogger("werkzeug")
    werkzeug_logger.setLevel(logging.WARNING)