PyPI - intextum-worker - Versions diffs - 0.1.0__py3-none-any.whl - Mend

intextum-worker 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

intextum_worker/__init__.py +1 -0
intextum_worker/config.py +92 -0
intextum_worker/logging_config.py +158 -0
intextum_worker/main.py +212 -0
intextum_worker/models.py +724 -0
intextum_worker/poll_enrichment.py +76 -0
intextum_worker/poll_loop.py +424 -0
intextum_worker/poll_runtime.py +327 -0
intextum_worker/processor_docling.py +113 -0
intextum_worker/processor_runtime.py +622 -0
intextum_worker/processors.py +446 -0
intextum_worker/runtime_info.py +201 -0
intextum_worker/services/__init__.py +0 -0
intextum_worker/services/api_client.py +537 -0
intextum_worker/services/api_client_api.py +99 -0
intextum_worker/services/api_client_uploads.py +110 -0
intextum_worker/services/content_enrichment/__init__.py +15 -0
intextum_worker/services/content_enrichment/batching.py +115 -0
intextum_worker/services/content_enrichment/chunk_selection.py +306 -0
intextum_worker/services/content_enrichment/classification.py +185 -0
intextum_worker/services/content_enrichment/console.py +23 -0
intextum_worker/services/content_enrichment/evidence_grounding.py +100 -0
intextum_worker/services/content_enrichment/json_response.py +188 -0
intextum_worker/services/content_enrichment/langgraph_provider.py +1036 -0
intextum_worker/services/content_enrichment/merge.py +197 -0
intextum_worker/services/content_enrichment/model_artifacts.py +170 -0
intextum_worker/services/content_enrichment/orchestration.py +65 -0
intextum_worker/services/content_enrichment/prompt.py +218 -0
intextum_worker/services/content_enrichment/registry.py +113 -0
intextum_worker/services/content_enrichment/repeated_fields.py +153 -0
intextum_worker/services/content_enrichment_training_runner.py +470 -0
intextum_worker/services/content_enrichment_utils.py +904 -0
intextum_worker/services/docling.py +372 -0
intextum_worker/services/docling_asr.py +250 -0
intextum_worker/services/docling_enrichment.py +204 -0
intextum_worker/services/docling_output.py +80 -0
intextum_worker/services/tokenizer.py +45 -0
intextum_worker/services/vector.py +216 -0
intextum_worker/version.py +55 -0
intextum_worker-0.1.0.dist-info/METADATA +115 -0
intextum_worker-0.1.0.dist-info/RECORD +44 -0
intextum_worker-0.1.0.dist-info/WHEEL +5 -0
intextum_worker-0.1.0.dist-info/entry_points.txt +2 -0
intextum_worker-0.1.0.dist-info/top_level.txt +1 -0

intextum_worker/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Worker package for file processing pipeline."""

intextum_worker/config.py ADDED Viewed

@@ -0,0 +1,92 @@
+"""Configuration settings for the worker service."""
+import json
+from collections.abc import Iterable
+from functools import lru_cache
+from pydantic import field_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
+# Every capability name a worker is allowed to advertise.
+VALID_WORKER_CAPABILITIES = frozenset({"document", "image", "video", "training"})
+# Capabilities used when none are configured; "training" is deliberately absent.
+DEFAULT_WORKER_CAPABILITIES = "document,video,image"
+def parse_capabilities(value: object) -> list[str]:
+    """Parse and validate worker capabilities from env or CLI input.
+    Args:
+        value: A comma-separated string, a JSON array encoded as a string
+            (detected by a leading "["), or a list/tuple/set/frozenset of
+            capability names.
+    Returns:
+        Lower-cased, whitespace-stripped capability names with blank entries
+        and duplicates removed, in first-seen order. An empty or
+        whitespace-only string yields an empty list.
+    Raises:
+        ValueError: If the input has an unsupported type, is malformed JSON,
+            or names a capability outside VALID_WORKER_CAPABILITIES.
+    """
+    if isinstance(value, str):
+        raw = value.strip()
+        if not raw:
+            return []
+        if raw.startswith("["):
+            try:
+                decoded = json.loads(raw)
+            except json.JSONDecodeError as exc:
+                raise ValueError(
+                    "CAPABILITIES must be a comma-separated string or JSON array"
+                ) from exc
+            # Re-enter with the decoded list so both spellings share one validation path.
+            return parse_capabilities(decoded)
+        values: Iterable[object] = raw.split(",")
+    elif isinstance(value, (list, tuple, set, frozenset)):
+        values = value
+    else:
+        raise ValueError("CAPABILITIES must be a comma-separated string or JSON array")
+    normalized = (str(item).strip().lower() for item in values)
+    # dict.fromkeys drops repeated names while keeping first-seen order.
+    capabilities = list(dict.fromkeys(name for name in normalized if name))
+    invalid = sorted(set(capabilities) - VALID_WORKER_CAPABILITIES)
+    if invalid:
+        allowed = ", ".join(sorted(VALID_WORKER_CAPABILITIES))
+        rejected = ", ".join(invalid)
+        raise ValueError(
+            f"Invalid CAPABILITIES value(s): {rejected}. Allowed: {allowed}"
+        )
+    return capabilities
+class Settings(BaseSettings):
+    """Worker configuration read from environment variables and a ``.env`` file."""
+    # Unknown environment keys are ignored rather than rejected.
+    model_config = SettingsConfigDict(env_file=".env", extra="ignore")
+    # Remote backend connection and local scratch directory.
+    API_URL: str = "http://api:8000"
+    WORKER_TOKEN: str = ""
+    WORK_DIR: str = "/tmp/worker"
+    # Poll loop. "training" is opt-in and has to be listed explicitly.
+    # Kept as a plain string; use ``parsed_capabilities`` for the validated list.
+    CAPABILITIES: str = DEFAULT_WORKER_CAPABILITIES
+    POLL_INTERVAL: float = 5.0
+    TASK_HEARTBEAT_INTERVAL_SECONDS: float = 60.0
+    CONTENT_ENRICHMENT_STAGE_TIMEOUT_SECONDS: float = 300.0
+    # Local processing settings.
+    CLASSIFICATION_DEVICE: str = "cpu"
+    DOCLING_THREADS: int = 4
+    DOCLING_OCR_ENGINE: str = "easyocr"
+    ASR_MODEL: str = "whisper_large_v3"
+    ASR_LANGUAGE: str = "de"
+    KEEP_MODELS_LOADED: bool = False
+    CUSTOM_FIELD_ID: int = 1
+    @property
+    def parsed_capabilities(self) -> list[str]:
+        """Validated capability list derived from the raw ``CAPABILITIES`` string.
+        Parsing happens here, on access, so pydantic's JSON env parsing is
+        never involved. Raises ValueError when the string is invalid.
+        """
+        return parse_capabilities(self.CAPABILITIES)
+    @field_validator("DOCLING_OCR_ENGINE", mode="before")
+    @classmethod
+    def normalize_docling_ocr_engine(cls, value: str) -> str:
+        """Lower-case and trim the OCR engine name, rejecting unsupported ones."""
+        supported = ("easyocr", "ocrmac", "rapidocr", "tesseract", "tesseract_cli")
+        candidate = str(value).strip().lower()
+        if candidate in supported:
+            return candidate
+        raise ValueError(
+            f"DOCLING_OCR_ENGINE must be one of: {', '.join(sorted(supported))}"
+        )
+@lru_cache
+def get_settings() -> Settings:
+    """Get cached settings instance.
+    The first call builds ``Settings()``; because of ``lru_cache`` every later
+    call returns that same object, so environment changes made after the
+    first call are not picked up.
+    """
+    return Settings()

intextum_worker/logging_config.py ADDED Viewed

@@ -0,0 +1,158 @@
+"""Structured logging configuration for the worker service."""
+import json
+import logging
+import sys
+import uuid
+from collections.abc import MutableMapping
+from contextvars import ContextVar, Token
+from datetime import UTC, datetime
+from typing import Any
+# Context-local correlation ID; None until set_correlation_id() or LoggingContext assigns one.
+correlation_id_var: ContextVar[str | None] = ContextVar("correlation_id", default=None)
+def generate_correlation_id() -> str:
+    """Return a fresh 8-character correlation ID taken from a random UUID4."""
+    # The first eight hex digits of a UUID precede its first dash, so
+    # slicing ``.hex`` gives the same prefix as slicing the dashed string form.
+    return uuid.uuid4().hex[:8]
+def get_correlation_id() -> str | None:
+    """Get the current correlation ID from context.
+    Returns:
+        The value stored in ``correlation_id_var``, or None when nothing has
+        been set in the current context.
+    """
+    return correlation_id_var.get()
+def set_correlation_id(correlation_id: str) -> None:
+    """Set the correlation ID in context.
+    The token returned by ``ContextVar.set`` is discarded, so this call cannot
+    be undone with ``reset``; use ``LoggingContext`` for a scoped assignment.
+    """
+    correlation_id_var.set(correlation_id)
+class StructuredFormatter(logging.Formatter):
+    """JSON formatter for structured logging.
+    Each record is rendered as one JSON object with ``timestamp``, ``level``,
+    ``logger`` and ``message`` keys, plus ``correlation_id`` when one is set in
+    the current context, ``exception`` when the record carries ``exc_info``,
+    and ``extra`` holding any non-standard record attributes.
+    """
+    # Standard LogRecord attributes that are never reported as "extra" fields.
+    _RESERVED_ATTRS = frozenset(
+        {
+            "name",
+            "msg",
+            "args",
+            "created",
+            "filename",
+            "funcName",
+            "levelname",
+            "levelno",
+            "lineno",
+            "module",
+            "msecs",
+            "pathname",
+            "process",
+            "processName",
+            "relativeCreated",
+            "stack_info",
+            "exc_info",
+            "exc_text",
+            "thread",
+            "threadName",
+            "taskName",
+            "message",
+        }
+    )
+    def format(self, record: logging.LogRecord) -> str:
+        """Serialize *record* to a JSON string.
+        Args:
+            record: The log record to render.
+        Returns:
+            The JSON document for the record. Values that JSON cannot encode
+            natively are rendered with ``str()``.
+        """
+        log_data: dict[str, Any] = {
+            # Time of formatting in UTC, not the record's creation time.
+            "timestamp": datetime.now(UTC).isoformat(),
+            "level": record.levelname,
+            "logger": record.name,
+            "message": record.getMessage(),
+        }
+        correlation_id = get_correlation_id()
+        if correlation_id:
+            log_data["correlation_id"] = correlation_id
+        if record.exc_info:
+            log_data["exception"] = self.formatException(record.exc_info)
+        extra_fields = {
+            key: value
+            for key, value in record.__dict__.items()
+            if key not in self._RESERVED_ATTRS
+        }
+        if extra_fields:
+            log_data["extra"] = extra_fields
+        # default=str: an extra value JSON cannot encode (exception, Path,
+        # datetime, ...) would otherwise raise TypeError and lose the record.
+        return json.dumps(log_data, default=str)
+class CorrelatedLogger(logging.LoggerAdapter):
+    """Logger adapter that includes correlation ID in all log messages."""
+    def __init__(self, logger: logging.Logger, correlation_id: str):
+        """Wrap *logger* and remember the correlation ID to attach to each call."""
+        super().__init__(logger, {})
+        self.correlation_id = correlation_id
+    def process(
+        self, msg: object, kwargs: MutableMapping[str, Any]
+    ) -> tuple[object, MutableMapping[str, Any]]:
+        """Add ``correlation_id`` to the ``extra`` mapping of a logging call.
+        Args:
+            msg: The log message, returned unchanged.
+            kwargs: Keyword arguments of the logging call; its ``extra`` entry
+                is replaced with an augmented copy.
+        Returns:
+            The ``(msg, kwargs)`` pair to forward to the wrapped logger.
+        """
+        supplied = kwargs.get("extra")
+        # Build a new dict instead of writing into the caller's, so an
+        # ``extra`` mapping shared between calls is never mutated. A
+        # non-dict ``extra`` is discarded.
+        extra = dict(supplied) if isinstance(supplied, dict) else {}
+        extra["correlation_id"] = self.correlation_id
+        kwargs["extra"] = extra
+        return msg, kwargs
+def get_logger(name: str, correlation_id: str | None = None) -> logging.LoggerAdapter:
+    """Build a correlation-aware adapter around the named logger.
+    Args:
+        name: Logger name (typically __name__)
+        correlation_id: Explicit correlation ID. When empty, the ID from the
+            current context is used; when that is empty too, a new one is
+            generated.
+    Returns:
+        A CorrelatedLogger carrying the chosen correlation ID
+    """
+    chosen = correlation_id
+    if not chosen:
+        chosen = get_correlation_id()
+    if not chosen:
+        chosen = generate_correlation_id()
+    return CorrelatedLogger(logging.getLogger(name), chosen)
+def configure_logging(json_format: bool = True, level: str = "INFO") -> None:
+    """Configure logging for the worker service.
+    Replaces every handler on the root logger with one stdout handler.
+    Args:
+        json_format: Use JSON structured logging if True, human-readable if False
+        level: Logging level name (DEBUG, INFO, WARNING, ERROR), case-insensitive
+    Raises:
+        ValueError: If *level* is not a level name known to ``logging``.
+    """
+    root_logger = logging.getLogger()
+    # setLevel validates the name itself and raises a clear ValueError; doing
+    # it first leaves the existing handlers untouched when the level is bad.
+    root_logger.setLevel(level.upper())
+    for stale in root_logger.handlers[:]:
+        root_logger.removeHandler(stale)
+        # Release resources (e.g. open files) held by the replaced handler.
+        stale.close()
+    handler = logging.StreamHandler(sys.stdout)
+    if json_format:
+        handler.setFormatter(StructuredFormatter())
+    else:
+        handler.setFormatter(
+            logging.Formatter(
+                "%(asctime)s - %(levelname)s - [%(correlation_id)s] %(name)s - %(message)s",
+                # Used for records that carry no correlation_id attribute.
+                defaults={"correlation_id": "no-correlation"},
+            )
+        )
+    root_logger.addHandler(handler)
+class LoggingContext:
+    """Bind a correlation ID to the current context for the span of a ``with`` block."""
+    def __init__(self, correlation_id: str | None = None):
+        """Remember the given ID, or generate one when none (or an empty one) is passed."""
+        # Set on entry with the token returned by ContextVar.set().
+        self._token: Token[str | None] | None = None
+        self.correlation_id = (
+            correlation_id if correlation_id else generate_correlation_id()
+        )
+    def __enter__(self) -> str:
+        """Install the correlation ID in the context variable and return it."""
+        token = correlation_id_var.set(self.correlation_id)
+        self._token = token
+        return self.correlation_id
+    def __exit__(self, *args) -> None:
+        """Restore the value the context variable held before entry."""
+        if self._token is None:
+            return
+        correlation_id_var.reset(self._token)

intextum_worker/main.py ADDED Viewed

@@ -0,0 +1,212 @@
+"""Worker entry point — HTTP poll loop replacing Celery."""
+import argparse
+import os
+import platform
+import sys
+from intextum_worker.config import get_settings, parse_capabilities
+from intextum_worker.logging_config import configure_logging, get_logger
+from intextum_worker.models import WorkerRuntimeMetadata
+from intextum_worker.runtime_info import (
+    build_runtime_metadata,
+    validate_accelerator,
+    validate_runtime_dependencies,
+)
+def _build_parser() -> argparse.ArgumentParser:
+    """Create the argument parser holding the worker's runtime overrides.
+    Every value option defaults to None so callers can tell "not given"
+    apart from an explicit value; ``--skip-device-check`` is a plain flag.
+    """
+    # (flag, value type, help text) for each option that takes a value.
+    value_options = (
+        (
+            "--capabilities",
+            str,
+            "Comma-separated capabilities, e.g. document,video,image,training",
+        ),
+        ("--poll-interval", float, "Seconds between poll attempts (default: 5)"),
+        (
+            "--api-url",
+            str,
+            "API URL override (otherwise API_URL or APP_SCHEME/APP_DOMAIN)",
+        ),
+        ("--work-dir", str, "Local worker directory override"),
+        (
+            "--classification-device",
+            str,
+            "Model device override (e.g. cpu, mps, cuda)",
+        ),
+        (
+            "--docling-ocr-engine",
+            str,
+            "Docling OCR engine override (easyocr, rapidocr, tesseract, tesseract_cli, ocrmac)",
+        ),
+    )
+    parser = argparse.ArgumentParser(description="intextum Worker")
+    for flag, value_type, help_text in value_options:
+        parser.add_argument(flag, type=value_type, default=None, help=help_text)
+    parser.add_argument(
+        "--skip-device-check",
+        action="store_true",
+        help="Skip startup accelerator validation",
+    )
+    return parser
+def _resolve_api_url(cli_api_url: str | None) -> None:
+    """Resolve API_URL from CLI/env/domain and export it for Settings.
+    Precedence: a non-blank CLI value, then an existing non-blank API_URL,
+    then a URL built from APP_SCHEME (default "http") and APP_DOMAIN. When
+    none of these is available the environment is left untouched.
+    Args:
+        cli_api_url: Value of ``--api-url``; None or blank means "not given".
+    """
+    # Strip like the other resolvers do, so a blank or padded CLI value is
+    # never exported verbatim as the API URL.
+    cli_value = (cli_api_url or "").strip()
+    if cli_value:
+        os.environ["API_URL"] = cli_value
+        return
+    if os.environ.get("API_URL", "").strip():
+        return
+    app_domain = os.environ.get("APP_DOMAIN", "").strip()
+    if app_domain:
+        app_scheme = os.environ.get("APP_SCHEME", "http").strip() or "http"
+        os.environ["API_URL"] = f"{app_scheme}://{app_domain}"
+def _resolve_work_dir(cli_work_dir: str | None) -> None:
+    """Resolve WORK_DIR from CLI/env for consistent worker file layout.
+    Precedence: a non-blank CLI value, then an existing non-blank WORK_DIR,
+    then the default "/tmp/worker".
+    Args:
+        cli_work_dir: Value of ``--work-dir``; None or blank means "not given".
+    """
+    # Strip like the other resolvers do, so a blank CLI value falls through
+    # to the env/default instead of becoming the working directory.
+    cli_value = (cli_work_dir or "").strip()
+    if cli_value:
+        os.environ["WORK_DIR"] = cli_value
+        return
+    if not os.environ.get("WORK_DIR", "").strip():
+        os.environ["WORK_DIR"] = "/tmp/worker"
+def _resolve_classification_device(cli_device: str | None) -> str:
+    """Pick the classification device, export it, and return it.
+    Precedence: non-blank CLI value, then non-blank CLASSIFICATION_DEVICE
+    from the environment, then "mps" on macOS and "cpu" elsewhere.
+    """
+    on_macos = platform.system() == "Darwin"
+    cli_choice = cli_device.strip() if cli_device else ""
+    env_choice = os.environ.get("CLASSIFICATION_DEVICE", "").strip()
+    device = cli_choice or env_choice or ("mps" if on_macos else "cpu")
+    os.environ["CLASSIFICATION_DEVICE"] = device
+    # Keep behavior parity with previous shell script on macOS.
+    if on_macos and device == "mps":
+        os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
+    return device
+def _resolve_docling_ocr_engine(cli_engine: str | None) -> None:
+    """Export DOCLING_OCR_ENGINE so Settings sees the CLI or default choice.
+    A non-blank CLI value wins; otherwise an existing non-blank environment
+    value is kept, and "easyocr" is written when neither is present.
+    """
+    requested = cli_engine.strip() if cli_engine else ""
+    if requested:
+        os.environ["DOCLING_OCR_ENGINE"] = requested
+    elif not os.environ.get("DOCLING_OCR_ENGINE", "").strip():
+        os.environ["DOCLING_OCR_ENGINE"] = "easyocr"
+def _report_runtime_metadata(settings, capabilities: list[str], logger) -> None:
+    """Best-effort runtime metadata report; polling can continue if it fails.
+    Args:
+        settings: Settings object handed to build_runtime_metadata.
+        capabilities: Capability names handed to build_runtime_metadata.
+        logger: Logger used to emit a warning when the report fails.
+    """
+    # NOTE(review): building and validating the metadata happens outside the
+    # try block, so an error here propagates to the caller; only the import
+    # and the report call below are best-effort. Confirm this is intended.
+    metadata = WorkerRuntimeMetadata.model_validate(
+        build_runtime_metadata(settings, capabilities)
+    )
+    try:
+        # Imported inside the try so an import failure is also downgraded to a warning.
+        # pylint: disable=import-outside-toplevel
+        from intextum_worker.services.api_client import ApiClient
+        ApiClient().report_runtime_metadata(metadata)
+    except Exception as exc:  # pylint: disable=broad-exception-caught
+        logger.warning("Failed to report worker runtime metadata: %s", exc)
+def main():
+    """Main entry point for the worker.
+    Exports CLI/env overrides into ``os.environ``, loads Settings, validates
+    the token, capabilities, runtime dependencies and accelerator, reports
+    runtime metadata, then hands control to the poll loop. Each validation
+    failure prints a message to stderr and exits with status 1.
+    """
+    parser = _build_parser()
+    args = parser.parse_args()
+    # These resolvers write to os.environ, so they must run before
+    # get_settings() reads the environment.
+    _resolve_api_url(args.api_url)
+    _resolve_work_dir(args.work_dir)
+    _resolve_classification_device(args.classification_device)
+    _resolve_docling_ocr_engine(args.docling_ocr_engine)
+    configure_logging()
+    logger = get_logger(__name__)
+    settings = get_settings()
+    if not settings.WORKER_TOKEN.strip():
+        print("Error: WORKER_TOKEN must be set and non-empty", file=sys.stderr)
+        sys.exit(1)
+    # Capabilities: CLI arg > env var.
+    try:
+        capabilities = (
+            parse_capabilities(args.capabilities)
+            if args.capabilities
+            else settings.parsed_capabilities
+        )
+    except ValueError as exc:
+        print(f"Error: {exc}", file=sys.stderr)
+        sys.exit(1)
+    # Poll interval: CLI arg > env var. Compare against None rather than
+    # relying on truthiness, so an explicit "--poll-interval 0" is honored
+    # (as POLL_INTERVAL=0 from the environment already is) instead of being
+    # silently replaced by the configured default.
+    poll_interval = (
+        args.poll_interval
+        if args.poll_interval is not None
+        else settings.POLL_INTERVAL
+    )
+    if not capabilities:
+        print("Error: no capabilities specified", file=sys.stderr)
+        sys.exit(1)
+    try:
+        validate_runtime_dependencies(capabilities)
+    except RuntimeError as exc:
+        print(f"Error: {exc}", file=sys.stderr)
+        sys.exit(1)
+    try:
+        validate_accelerator(
+            settings.CLASSIFICATION_DEVICE,
+            skip_check=args.skip_device_check,
+        )
+    except RuntimeError as exc:
+        print(f"Error: {exc}", file=sys.stderr)
+        sys.exit(1)
+    logger.info(
+        "Starting intextum worker",
+        extra={
+            "api_url": settings.API_URL,
+            "work_dir": settings.WORK_DIR,
+            "classification_device": settings.CLASSIFICATION_DEVICE,
+            "docling_ocr_engine": settings.DOCLING_OCR_ENGINE,
+            "asr_model": settings.ASR_MODEL,
+            "asr_language": settings.ASR_LANGUAGE,
+            "docling_threads": settings.DOCLING_THREADS,
+            "keep_models_loaded": settings.KEEP_MODELS_LOADED,
+            "content_enrichment_stage_timeout_seconds": (
+                settings.CONTENT_ENRICHMENT_STAGE_TIMEOUT_SECONDS
+            ),
+            "capabilities": capabilities,
+            "poll_interval_seconds": poll_interval,
+        },
+    )
+    _report_runtime_metadata(settings, capabilities, logger)
+    # Imported only after all startup validation has passed.
+    # pylint: disable=import-outside-toplevel
+    from intextum_worker.poll_loop import run_poll_loop
+    run_poll_loop(capabilities=capabilities, poll_interval=poll_interval)
+# Run the worker only when this module is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()