PyPI - docintel-platform - Versions diffs - 1.0.2__py3-none-any.whl - Mend

docintel-platform 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

docintel/__init__.py +6 -0
docintel/app.py +45 -0
docintel/auth/__init__.py +12 -0
docintel/auth/api_keys.py +48 -0
docintel/auth/limiter.py +41 -0
docintel/auth/middleware.py +34 -0
docintel/auth/oidc.py +45 -0
docintel/cli.py +21 -0
docintel/client.py +193 -0
docintel/config.py +20 -0
docintel/jobs/__init__.py +16 -0
docintel/jobs/helpers.py +38 -0
docintel/jobs/models.py +78 -0
docintel/jobs/queue.py +75 -0
docintel/jobs/store.py +82 -0
docintel/jobs/tasks.py +173 -0
docintel/jobs/webhooks.py +32 -0
docintel/openapi/__init__.py +1 -0
docintel/openapi/openapi.yaml +380 -0
docintel/ops/__init__.py +1 -0
docintel/ops/logging.py +40 -0
docintel/ops/metrics.py +57 -0
docintel/ops/middleware.py +40 -0
docintel/routes/__init__.py +1 -0
docintel/routes/jobs.py +26 -0
docintel/routes/match.py +43 -0
docintel/routes/openapi_docs.py +57 -0
docintel/routes/ops.py +22 -0
docintel/routes/pdf.py +420 -0
docintel/routes/text.py +41 -0
docintel/services/__init__.py +1 -0
docintel/services/matching/__init__.py +6 -0
docintel/services/matching/models.py +19 -0
docintel/services/matching/scorer.py +64 -0
docintel/services/pdf/__init__.py +26 -0
docintel/services/pdf/annotator.py +188 -0
docintel/services/pdf/models.py +104 -0
docintel/services/pdf/ocr.py +130 -0
docintel/services/pdf/pii.py +105 -0
docintel/services/pdf/presets.py +26 -0
docintel/services/pdf/search.py +29 -0
docintel/services/pdf/sensitive.py +212 -0
docintel/services/pdf/structure.py +118 -0
docintel/services/pdf/structure_llm.py +136 -0
docintel/services/pdf/structure_render.py +136 -0
docintel/services/pdf/structure_schema.py +99 -0
docintel/services/summary/__init__.py +6 -0
docintel/services/summary/models.py +21 -0
docintel/services/summary/textrank.py +57 -0
docintel/ui.py +347 -0
docintel/wsgi.py +5 -0
docintel_platform-1.0.2.dist-info/METADATA +607 -0
docintel_platform-1.0.2.dist-info/RECORD +56 -0
docintel_platform-1.0.2.dist-info/WHEEL +5 -0
docintel_platform-1.0.2.dist-info/entry_points.txt +3 -0
docintel_platform-1.0.2.dist-info/top_level.txt +1 -0

docintel/jobs/store.py ADDED Viewed

@@ -0,0 +1,82 @@
+"""Redis-backed job metadata store."""
+from __future__ import annotations
+import json
+import os
+from functools import lru_cache
+from docintel.jobs.models import JobRecord
+JOB_KEY_PREFIX = "docintel:job:"
+DEFAULT_JOB_TTL_SECONDS = 60 * 60 * 24 * 7
+def redis_url() -> str:
+    """Return the Redis connection URL used for job metadata.
+
+    Reads ``DOCINTEL_REDIS_URL`` and strips surrounding whitespace. An unset,
+    empty, or whitespace-only value falls back to the local default so the
+    client is never built from an empty URL.
+    """
+    configured = os.getenv("DOCINTEL_REDIS_URL", "").strip()
+    return configured or "redis://localhost:6379/0"
+def jobs_enabled() -> bool:
+    """Return whether background jobs are enabled via ``DOCINTEL_JOBS_ENABLED``.
+
+    The flag defaults to enabled when the variable is unset. Surrounding
+    whitespace is ignored and the comparison is case-insensitive; the common
+    truthy spellings ``1``, ``true``, ``yes`` and ``on`` all enable jobs.
+    Any other value (including the empty string) disables them.
+    """
+    flag = os.getenv("DOCINTEL_JOBS_ENABLED", "true").strip().lower()
+    return flag in {"1", "true", "yes", "on"}
+@lru_cache(maxsize=1)
+def _redis_client():
+    """Build the Redis client on first use and return the cached one afterwards.
+
+    ``redis`` is imported inside the function, so importing this module does
+    not require the package. Responses are decoded to ``str``.
+    """
+    from redis import Redis
+    url = redis_url()
+    return Redis.from_url(url, decode_responses=True)
+def reset_redis_client_cache() -> None:
+    """Clear cached Redis client (used in tests)."""
+    # Nothing to do when ``_redis_client`` has been replaced by an object
+    # without the ``lru_cache`` API.
+    if not hasattr(_redis_client, "cache_clear"):
+        return
+    _redis_client.cache_clear()
+def _job_key(job_id: str) -> str:
+    """Return the Redis key for ``job_id`` (``JOB_KEY_PREFIX`` + id)."""
+    return "{}{}".format(JOB_KEY_PREFIX, job_id)
+def save_job(record: JobRecord, ttl_seconds: int = DEFAULT_JOB_TTL_SECONDS) -> None:
+    """Serialize ``record`` to JSON and store it under its job key.
+
+    The key is written with an expiry of ``ttl_seconds``, so every save
+    restarts the record's time-to-live.
+    """
+    store = _redis_client()
+    key = _job_key(record.job_id)
+    serialized = json.dumps(record.to_dict())
+    store.set(key, serialized, ex=ttl_seconds)
+def get_job(job_id: str) -> JobRecord | None:
+    """Load the stored record for ``job_id``.
+
+    Returns ``None`` when Redis holds no value (or an empty value) for the
+    job key; otherwise the JSON payload is decoded into a ``JobRecord``.
+    """
+    payload = _redis_client().get(_job_key(job_id))
+    return JobRecord.from_dict(json.loads(payload)) if payload else None
+def update_job(job_id: str, **changes) -> JobRecord:
+    """Overlay ``changes`` on the stored job, save the result, and return it.
+
+    Recognized keys are ``job_status``, ``progress``, ``progress_message``,
+    ``pages_done``, ``pages_total``, ``callback_url``, ``download_url``,
+    ``error`` and ``result``; any other key is ignored. ``job_id`` and
+    ``job_type`` always come from the stored record. The record is re-saved
+    with the default TTL.
+
+    Raises:
+        KeyError: if no record exists for ``job_id``.
+    """
+    from docintel.jobs.models import JobStatus
+    current = get_job(job_id)
+    if current is None:
+        raise KeyError(f"Job not found: {job_id}")
+
+    def _pick(name: str):
+        # A caller-supplied value wins; otherwise keep what is stored.
+        return changes.get(name, getattr(current, name))
+
+    # The status arrives as its string value under ``job_status`` and is
+    # converted back into the enum here.
+    new_status = JobStatus(changes.get("job_status", current.status.value))
+    merged = JobRecord(
+        job_id=current.job_id,
+        job_type=current.job_type,
+        status=new_status,
+        progress=int(_pick("progress")),
+        progress_message=str(_pick("progress_message")),
+        pages_done=int(_pick("pages_done")),
+        pages_total=int(_pick("pages_total")),
+        callback_url=_pick("callback_url"),
+        download_url=_pick("download_url"),
+        error=_pick("error"),
+        result=_pick("result"),
+    )
+    save_job(merged)
+    return merged
+def ping_redis() -> bool:
+    """Report whether Redis answers a ping.
+
+    Any exception while building the client or pinging is treated as
+    "not reachable" and yields ``False`` instead of propagating.
+    """
+    try:
+        reply = _redis_client().ping()
+        alive = bool(reply)
+    except Exception:
+        alive = False
+    return alive

docintel/jobs/tasks.py ADDED Viewed

@@ -0,0 +1,173 @@
+"""Background worker tasks."""
+from __future__ import annotations
+from pathlib import Path
+from docintel.jobs.models import JobRecord, JobStatus, JobType
+from docintel.jobs.store import get_job, save_job, update_job
+from docintel.services.pdf.models import Action, StructureMode
+from docintel.services.pdf.sensitive import detect_sensitive_pdf
+from docintel.services.pdf.structure import structure_pdf
+def _job_progress_callback(job_id: str):
+    """Return a progress callback that records pipeline progress on ``job_id``.
+
+    The returned callable takes keyword-only ``stage``, ``pages_done``,
+    ``pages_total`` and ``message`` and writes them to the job record with
+    status RUNNING. The percentage is derived as follows:
+
+    * ``pages_total <= 0`` (page count unknown): 10
+    * ``stage == "rendering"``: 95
+    * otherwise: 10..90, proportional to ``pages_done / pages_total``
+    """
+    def _callback(*, stage: str, pages_done: int, pages_total: int, message: str) -> None:
+        if pages_total <= 0:
+            progress = 10
+        elif stage == "rendering":
+            progress = 95
+        else:
+            # Clamp the page count so an over- or under-reporting pipeline
+            # cannot push the percentage outside the 10..90 band.
+            done = min(max(pages_done, 0), pages_total)
+            progress = 10 + int(80 * done / pages_total)
+        update_job(
+            job_id,
+            job_status=JobStatus.RUNNING.value,
+            progress=progress,
+            progress_message=message,
+            pages_done=pages_done,
+            pages_total=pages_total,
+        )
+    return _callback
+def run_structure_pdf_job(
+    *,
+    job_id: str,
+    input_path: str,
+    output_path: str,
+    mode: str,
+    force_ocr: bool,
+    output_filename: str,
+    redact_before_llm: bool = False,
+) -> dict:
+    """Worker entrypoint: OCR + LLM structure, then update job metadata.
+
+    Marks the job RUNNING, runs ``structure_pdf`` with a progress callback
+    bound to ``job_id``, then marks the job COMPLETED (with ``download_url``
+    and ``result``) or FAILED (with ``error``). When the job record carries a
+    ``callback_url``, the final record is sent to it in both outcomes.
+
+    Returns:
+        The ``to_dict()`` form of the ``structure_pdf`` result.
+
+    Raises:
+        Exception: whatever ``structure_pdf`` raised, re-raised after the
+            job has been marked FAILED.
+        KeyError: from ``update_job`` if the job record does not exist.
+    """
+    record = get_job(job_id)
+    # Captured up front so the webhook target is known on both exit paths.
+    callback_url = record.callback_url if record else None
+    update_job(
+        job_id,
+        job_status=JobStatus.RUNNING.value,
+        progress=5,
+        progress_message="Job started",
+    )
+    try:
+        result = structure_pdf(
+            input_file=Path(input_path),
+            output_file=Path(output_path),
+            mode=StructureMode.from_value(mode),
+            force_ocr=force_ocr,
+            redact_before_llm=redact_before_llm,
+            progress_callback=_job_progress_callback(job_id),
+        )
+    except Exception as exc:
+        failed = update_job(
+            job_id,
+            job_status=JobStatus.FAILED.value,
+            progress=100,
+            progress_message="Job failed",
+            # str(exc) is empty for exceptions raised without arguments;
+            # fall back to the class name so a failed job always has a reason.
+            error=str(exc) or type(exc).__name__,
+        )
+        _notify_webhook(callback_url, failed)
+        raise
+    download_url = f"/v1/pdf/files/{job_id}/{output_filename}"
+    result_payload = result.to_dict()
+    completed = update_job(
+        job_id,
+        job_status=JobStatus.COMPLETED.value,
+        progress=100,
+        progress_message="Job completed",
+        download_url=download_url,
+        result=result_payload,
+    )
+    _notify_webhook(callback_url, completed)
+    return result_payload
+def _notify_webhook(callback_url: str | None, record: JobRecord) -> None:
+    """Send ``record`` (as a dict) to ``callback_url``; do nothing without a URL."""
+    if callback_url:
+        # Imported on demand, only when a webhook is actually configured.
+        from docintel.jobs.webhooks import deliver_job_webhook
+        payload = record.to_dict()
+        deliver_job_webhook(callback_url, payload)
+def run_detect_sensitive_pdf_job(
+    *,
+    job_id: str,
+    input_path: str,
+    output_path: str,
+    output_filename: str,
+    action: str,
+    force_ocr: bool,
+    add_text_layer: bool,
+    min_score: float,
+    entities: list[str] | None = None,
+    pattern: str | None = None,
+) -> dict:
+    """Worker entrypoint: OCR + Presidio sensitive PDF detection.
+
+    Marks the job RUNNING, runs ``detect_sensitive_pdf`` with a progress
+    callback bound to ``job_id``, then marks the job COMPLETED (with
+    ``download_url`` and ``result``) or FAILED (with ``error``). When the job
+    record carries a ``callback_url``, the final record is sent to it in both
+    outcomes.
+
+    Returns:
+        The ``to_dict()`` form of the ``detect_sensitive_pdf`` result.
+
+    Raises:
+        Exception: whatever ``detect_sensitive_pdf`` raised, re-raised after
+            the job has been marked FAILED.
+        KeyError: from ``update_job`` if the job record does not exist.
+    """
+    record = get_job(job_id)
+    # Captured up front so the webhook target is known on both exit paths.
+    callback_url = record.callback_url if record else None
+    update_job(
+        job_id,
+        job_status=JobStatus.RUNNING.value,
+        progress=5,
+        progress_message="Sensitive detection started",
+    )
+    try:
+        result = detect_sensitive_pdf(
+            input_file=Path(input_path),
+            output_file=Path(output_path),
+            entities=entities,
+            action=Action.from_value(action),
+            force_ocr=force_ocr,
+            add_text_layer=add_text_layer,
+            pattern=pattern,
+            min_score=min_score,
+            progress_callback=_job_progress_callback(job_id),
+        )
+    except Exception as exc:
+        failed = update_job(
+            job_id,
+            job_status=JobStatus.FAILED.value,
+            progress=100,
+            progress_message="Job failed",
+            # str(exc) is empty for exceptions raised without arguments;
+            # fall back to the class name so a failed job always has a reason.
+            error=str(exc) or type(exc).__name__,
+        )
+        _notify_webhook(callback_url, failed)
+        raise
+    download_url = f"/v1/pdf/files/{job_id}/{output_filename}"
+    result_payload = result.to_dict()
+    completed = update_job(
+        job_id,
+        job_status=JobStatus.COMPLETED.value,
+        progress=100,
+        progress_message="Job completed",
+        download_url=download_url,
+        result=result_payload,
+    )
+    _notify_webhook(callback_url, completed)
+    return result_payload
+def create_queued_job(
+    job_id: str,
+    *,
+    job_type: JobType,
+    callback_url: str | None = None,
+) -> JobRecord:
+    """Create, persist, and return the initial QUEUED record for ``job_id``.
+
+    The record starts at progress 0 with the message "Queued" and keeps the
+    optional ``callback_url`` for later webhook delivery.
+    """
+    initial_fields = dict(
+        job_id=job_id,
+        job_type=job_type,
+        status=JobStatus.QUEUED,
+        progress=0,
+        progress_message="Queued",
+        callback_url=callback_url,
+    )
+    queued = JobRecord(**initial_fields)
+    save_job(queued)
+    return queued

docintel/jobs/webhooks.py ADDED Viewed

@@ -0,0 +1,32 @@
+"""Webhook delivery when async jobs finish."""
+from __future__ import annotations
+import logging
+from typing import Any
+logger = logging.getLogger("docintel.webhooks")
+def deliver_job_webhook(callback_url: str, payload: dict[str, Any]) -> bool:
+    """POST job result to the caller webhook URL. Returns True on HTTP 2xx.
+
+    Delivery is best-effort: a blank URL, a missing ``requests`` package, a
+    transport error, or a non-2xx response all yield ``False`` (the last three
+    with a warning logged) and never raise.
+
+    Args:
+        callback_url: Target URL; surrounding whitespace is ignored.
+        payload: JSON-serializable body sent as the request's JSON document.
+    """
+    target = callback_url.strip() if callback_url else ""
+    if not target:
+        return False
+    try:
+        import requests
+    except ImportError:
+        logger.warning("requests not installed; webhook not delivered")
+        return False
+    # NOTE(review): the URL appears to be caller-supplied (API form field), so
+    # this request can reach internal hosts (SSRF) - confirm it is validated
+    # or allow-listed upstream.
+    try:
+        response = requests.post(target, json=payload, timeout=30)
+    except Exception as exc:
+        logger.warning("webhook delivery error: %s", exc)
+        return False
+    # ``response.ok`` is true for every status below 400; check the 2xx range
+    # explicitly so the result matches the documented contract.
+    if 200 <= response.status_code < 300:
+        return True
+    logger.warning(
+        "webhook delivery failed",
+        extra={"status_code": response.status_code, "callback_url": callback_url},
+    )
+    return False

docintel/openapi/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """OpenAPI specification assets."""

docintel/openapi/openapi.yaml ADDED Viewed

@@ -0,0 +1,380 @@
+openapi: 3.0.3
+info:
+  title: Document Intelligence Platform API
+  version: 1.0.0
+  description: |
+    REST API for PDF annotation, scanned-document PII detection, LLM PDF structuring,
+    resume matching, and text summarization.
+    Long-running PDF tasks support async mode via Redis (`async=true`) and job polling
+    at `GET /v1/jobs/{job_id}`.
+    Authenticate with `Authorization: Bearer <api_key>` when `DOCINTEL_AUTH_REQUIRED=true`.
+  license:
+    name: MIT
+    url: https://opensource.org/licenses/MIT
+  contact:
+    name: Babandeep Singh
+    url: https://github.com/baban9/document-intelligence-platform
+servers:
+  - url: http://127.0.0.1:5000
+    description: Local development
+tags:
+  - name: health
+  - name: pdf
+  - name: jobs
+  - name: match
+  - name: text
+  - name: ops
+components:
+  securitySchemes:
+    # HTTP bearer authentication; bearerFormat is a documentation hint only.
+    bearerAuth:
+      type: http
+      scheme: bearer
+      bearerFormat: API key or OIDC JWT
+  schemas:
+    Error:
+      type: object
+      properties:
+        error:
+          type: string
+    # Body returned by GET /v1/jobs/{job_id}.
+    JobStatus:
+      type: object
+      properties:
+        # Envelope status of the HTTP call itself, distinct from job_status.
+        status:
+          type: string
+          example: ok
+        job_id:
+          type: string
+        job_type:
+          type: string
+          enum: [pdf_structure, pdf_detect_sensitive]
+        # Lifecycle state of the job; the worker updates it under this same key.
+        job_status:
+          type: string
+          enum: [queued, running, completed, failed]
+        # The worker sets 5 when a job starts and 100 when it completes or fails.
+        progress:
+          type: integer
+          minimum: 0
+          maximum: 100
+        progress_message:
+          type: string
+        pages_done:
+          type: integer
+        pages_total:
+          type: integer
+        # Set on completion to /v1/pdf/files/{job_id}/{output_filename}.
+        download_url:
+          type: string
+        poll_url:
+          type: string
+        # Webhook that receives the final job record (completed or failed).
+        callback_url:
+          type: string
+        # Failure reason recorded when the job ends in the failed state.
+        error:
+          type: string
+        result:
+          type: object
+    # 202 response body of the async-capable PDF endpoints.
+    AsyncAccepted:
+      type: object
+      properties:
+        status:
+          type: string
+          example: ok
+        job_id:
+          type: string
+        job_type:
+          type: string
+        job_status:
+          type: string
+          example: queued
+        poll_url:
+          type: string
+        message:
+          type: string
+    # Request body of POST /v1/match/resume.
+    MatchRequest:
+      type: object
+      required: [resume, job_description]
+      properties:
+        resume:
+          type: string
+        job_description:
+          type: string
+        top_keywords:
+          type: integer
+          default: 25
+          minimum: 1
+          maximum: 100
+    # Request body of POST /v1/text/summarize.
+    SummarizeRequest:
+      type: object
+      required: [text]
+      properties:
+        text:
+          type: string
+        sentences:
+          type: integer
+          default: 3
+          minimum: 1
+          maximum: 20
+# Global default: every operation requires bearerAuth unless it overrides
+# `security` itself.
+security:
+  - bearerAuth: []
+paths:
+  # The four operations below set `security: []`, which removes the global
+  # bearerAuth requirement and documents them as unauthenticated.
+  /health:
+    get:
+      tags: [health]
+      summary: Health check
+      security: []
+      responses:
+        "200":
+          description: Service is healthy
+  /openapi.json:
+    get:
+      tags: [ops]
+      summary: OpenAPI specification
+      security: []
+      responses:
+        "200":
+          description: OpenAPI 3 document
+  /docs:
+    get:
+      tags: [ops]
+      summary: Swagger UI
+      security: []
+      responses:
+        "200":
+          description: Interactive API documentation
+  /metrics:
+    get:
+      tags: [ops]
+      summary: Request metrics
+      security: []
+      responses:
+        "200":
+          description: In-process metrics snapshot
+  /v1/pdf/annotate:
+    post:
+      tags: [pdf]
+      summary: Regex search and annotate a PDF
+      requestBody:
+        required: true
+        content:
+          multipart/form-data:
+            schema:
+              type: object
+              required: [file, pattern]
+              properties:
+                file:
+                  type: string
+                  format: binary
+                pattern:
+                  type: string
+                action:
+                  type: string
+                  enum: [Highlight, Redact, Frame, Underline, Squiggly, Strikeout, Remove]
+                pages:
+                  type: string
+                  description: Comma-separated zero-based page indexes
+                format:
+                  type: string
+                  enum: [file, json]
+      responses:
+        "200":
+          description: Annotated PDF or JSON metadata
+        "400":
+          description: Invalid request
+  /v1/pdf/entities:
+    get:
+      tags: [pdf]
+      summary: List Presidio entity types
+      responses:
+        "200":
+          description: Supported entities
+  # Synchronous by default; with async=true the request is queued and answered
+  # with 202 + AsyncAccepted. Note that `async` is declared both as a query
+  # parameter and as a form field.
+  /v1/pdf/detect-sensitive:
+    post:
+      tags: [pdf]
+      summary: Detect PII and annotate PDF (OCR + Presidio)
+      parameters:
+        - name: async
+          in: query
+          schema:
+            type: boolean
+            default: false
+          description: Queue job and return 202 when true
+        - name: format
+          in: query
+          schema:
+            type: string
+            enum: [file, json]
+      requestBody:
+        required: true
+        content:
+          multipart/form-data:
+            schema:
+              type: object
+              required: [file]
+              properties:
+                file:
+                  type: string
+                  format: binary
+                # The fields from `action` through `min_score` share their
+                # names with the worker entrypoint's keyword arguments
+                # (presumably forwarded by the route; confirm against the
+                # handler).
+                action:
+                  type: string
+                entities:
+                  type: string
+                  description: Comma-separated Presidio entity types
+                pattern:
+                  type: string
+                force_ocr:
+                  type: boolean
+                add_text_layer:
+                  type: boolean
+                min_score:
+                  type: number
+                # Webhook that receives the final job record.
+                callback_url:
+                  type: string
+                  format: uri
+                async:
+                  type: boolean
+      responses:
+        "200":
+          description: Processed PDF or JSON report
+        "202":
+          description: Job queued
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/AsyncAccepted"
+  # Same sync/async contract as /v1/pdf/detect-sensitive: 200 for an inline
+  # result, 202 + AsyncAccepted when the job is queued. `async` is declared
+  # both as a query parameter and as a form field.
+  /v1/pdf/structure:
+    post:
+      tags: [pdf]
+      summary: Structure scanned PDF with OCR and LLM
+      parameters:
+        - name: async
+          in: query
+          schema:
+            type: boolean
+            default: false
+        - name: format
+          in: query
+          schema:
+            type: string
+            enum: [file, json]
+      requestBody:
+        required: true
+        content:
+          multipart/form-data:
+            schema:
+              type: object
+              required: [file]
+              properties:
+                file:
+                  type: string
+                  format: binary
+                mode:
+                  type: string
+                  enum: [curate, searchable]
+                force_ocr:
+                  type: boolean
+                # The worker entrypoint defaults this to false when omitted.
+                redact_before_llm:
+                  type: boolean
+                # Webhook that receives the final job record.
+                callback_url:
+                  type: string
+                  format: uri
+                async:
+                  type: boolean
+      responses:
+        "200":
+          description: Structured PDF or JSON metadata
+        "202":
+          description: Job queued
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/AsyncAccepted"
+  # This is the path shape the workers store as a completed job's
+  # `download_url`.
+  /v1/pdf/files/{job_id}/{filename}:
+    get:
+      tags: [pdf]
+      summary: Download a generated PDF
+      parameters:
+        - name: job_id
+          in: path
+          required: true
+          schema:
+            type: string
+        - name: filename
+          in: path
+          required: true
+          schema:
+            type: string
+      responses:
+        "200":
+          description: PDF file
+          content:
+            application/pdf:
+              schema:
+                type: string
+                format: binary
+        "404":
+          description: File not found
+  /v1/jobs/{job_id}:
+    get:
+      tags: [jobs]
+      summary: Poll async job status
+      parameters:
+        - name: job_id
+          in: path
+          required: true
+          schema:
+            type: string
+      responses:
+        "200":
+          description: Job status
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/JobStatus"
+        "404":
+          description: Job not found
+  /v1/match/resume:
+    post:
+      tags: [match]
+      summary: Score resume against job description
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/MatchRequest"
+      responses:
+        "200":
+          description: Match score and keywords
+  /v1/text/summarize:
+    post:
+      tags: [text]
+      summary: Extractive TextRank summarization
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/SummarizeRequest"
+      responses:
+        "200":
+          description: Summary sentences

docintel/ops/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Operations: logging, metrics, and request instrumentation."""

docintel/ops/logging.py ADDED Viewed

@@ -0,0 +1,40 @@
+"""Structured JSON logging for the API."""
+from __future__ import annotations
+import json
+import logging
+import sys
+from datetime import datetime, timezone
+from typing import Any
+class JsonFormatter(logging.Formatter):
+    """Format log records as single-line JSON objects."""
+    def format(self, record: logging.LogRecord) -> str:
+        """Serialize ``record`` to a JSON string.
+
+        The object always carries ``timestamp`` (UTC, ISO 8601), ``level``,
+        ``logger`` and ``message``. The request attributes ``method``,
+        ``path``, ``status_code``, ``duration_ms`` and ``endpoint`` are copied
+        when present on the record (e.g. passed via ``extra=``), and
+        ``exception`` holds the formatted traceback when ``exc_info`` is set.
+        Values that are not JSON-serializable are rendered with ``str``.
+        """
+        # Use the time the record was created, not the time it is formatted,
+        # so deferred or repeated formatting reports the real event time.
+        created = datetime.fromtimestamp(record.created, tz=timezone.utc)
+        payload: dict[str, Any] = {
+            "timestamp": created.isoformat(),
+            "level": record.levelname,
+            "logger": record.name,
+            "message": record.getMessage(),
+        }
+        for key in ("method", "path", "status_code", "duration_ms", "endpoint"):
+            if hasattr(record, key):
+                payload[key] = getattr(record, key)
+        if record.exc_info:
+            payload["exception"] = self.formatException(record.exc_info)
+        return json.dumps(payload, default=str)
+def configure_logging(level: str = "INFO") -> None:
+    """Configure root logging with JSON output to stdout.
+
+    Existing root handlers are removed and replaced by one stdout handler
+    using ``JsonFormatter``. ``level`` is upper-cased before it is applied,
+    and the ``werkzeug`` logger is raised to WARNING.
+    """
+    root_logger = logging.getLogger()
+    root_logger.handlers.clear()
+    root_logger.setLevel(level.upper())
+
+    stdout_handler = logging.StreamHandler(sys.stdout)
+    stdout_handler.setFormatter(JsonFormatter())
+    root_logger.addHandler(stdout_handler)
+
+    werkzeug_logger = logging.getLogger("werkzeug")
+    werkzeug_logger.setLevel(logging.WARNING)