PyPI - intextum-worker - Versions diffs - 0.1.0__tar.gz - Mend

intextum-worker 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67) hide show

intextum_worker-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,115 @@
+Metadata-Version: 2.4
+Name: intextum-worker
+Version: 0.1.0
+Summary: Intextum processing worker: HTTP-polling Docling/FFmpeg document, image and audio pipeline.
+Author-email: Sebastian Alberternst <alberternst@gmail.com>
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/intextum/intextum
+Project-URL: Repository, https://github.com/intextum/intextum
+Keywords: intextum,docling,ocr,asr,document-processing,worker
+Requires-Python: <3.13,>=3.12
+Description-Content-Type: text/markdown
+Requires-Dist: requests<3.0.0,>=2.33.0
+Requires-Dist: pydantic-settings<3.0.0,>=2.14.0
+Requires-Dist: python-dotenv<2.0.0,>=1.2.2
+Requires-Dist: Pillow>=10.0.0
+Provides-Extra: document
+Requires-Dist: docling>=2.8.0; extra == "document"
+Requires-Dist: docling-core>=2.74.1; extra == "document"
+Requires-Dist: easyocr~=1.7.1; extra == "document"
+Requires-Dist: rapidocr-onnxruntime~=1.3.14; extra == "document"
+Requires-Dist: onnxruntime~=1.17.1; extra == "document"
+Provides-Extra: asr
+Requires-Dist: docling[asr]>=2.8.0; extra == "asr"
+Provides-Extra: enrichment
+Requires-Dist: transformers>=4.38.0; extra == "enrichment"
+Requires-Dist: gliner2>=0.1.0; extra == "enrichment"
+Requires-Dist: langgraph>=1.1.8; extra == "enrichment"
+Requires-Dist: sentencepiece; extra == "enrichment"
+Requires-Dist: protobuf; extra == "enrichment"
+Provides-Extra: full
+Requires-Dist: intextum-worker[asr,document,enrichment]; extra == "full"
+Provides-Extra: mps
+Requires-Dist: intextum-worker[full]; extra == "mps"
+Requires-Dist: torch==2.6.0; platform_system == "Darwin" and extra == "mps"
+Requires-Dist: torchvision==0.21.0; platform_system == "Darwin" and extra == "mps"
+Provides-Extra: cpu
+Requires-Dist: intextum-worker[full]; extra == "cpu"
+Requires-Dist: torch==2.6.0+cpu; (platform_system == "Linux" and platform_machine == "x86_64") and extra == "cpu"
+Requires-Dist: torchvision==0.21.0+cpu; (platform_system == "Linux" and platform_machine == "x86_64") and extra == "cpu"
+Requires-Dist: torch==2.6.0; (platform_system == "Linux" and platform_machine != "x86_64") and extra == "cpu"
+Requires-Dist: torchvision==0.21.0; (platform_system == "Linux" and platform_machine != "x86_64") and extra == "cpu"
+Provides-Extra: cpu-document
+Requires-Dist: intextum-worker[document]; extra == "cpu-document"
+Requires-Dist: torch==2.6.0+cpu; (platform_system == "Linux" and platform_machine == "x86_64") and extra == "cpu-document"
+Requires-Dist: torchvision==0.21.0+cpu; (platform_system == "Linux" and platform_machine == "x86_64") and extra == "cpu-document"
+Requires-Dist: torch==2.6.0; (platform_system == "Linux" and platform_machine != "x86_64") and extra == "cpu-document"
+Requires-Dist: torchvision==0.21.0; (platform_system == "Linux" and platform_machine != "x86_64") and extra == "cpu-document"
+Provides-Extra: cuda
+Requires-Dist: intextum-worker[full]; extra == "cuda"
+Requires-Dist: torch==2.6.0+cu126; platform_system == "Linux" and extra == "cuda"
+Requires-Dist: torchvision==0.21.0+cu126; platform_system == "Linux" and extra == "cuda"
+Provides-Extra: test
+Requires-Dist: pytest<10.0,>=9.0.3; extra == "test"
+Requires-Dist: pytest-asyncio<2.0.0,>=1.4.0; extra == "test"
+# intextum-worker
+The Intextum processing worker: an HTTP-polling worker that pulls tasks from an
+Intextum API instance and runs the Docling / FFmpeg document, image and audio
+pipeline (OCR, ASR, chunking, classification, content enrichment, embeddings).
+The worker is **always-remote**: it downloads source files from and uploads
+results to the API over HTTP, so it does not need a shared volume and can run
+anywhere — including on a host with a GPU while the rest of the stack runs in
+Docker.
+## Install
+Pick the bundle that matches your accelerator. The macOS (Apple MPS) Torch wheels
+are on PyPI, so the `mps` bundle installs with no extra flags:
+```bash
+pip install 'intextum-worker[mps]'
+```
+Linux CPU and NVIDIA CUDA pull their Torch build from the PyTorch index, so add
+the matching `--extra-index-url`:
+```bash
+# Linux, CPU only
+pip install 'intextum-worker[cpu]'  --extra-index-url https://download.pytorch.org/whl/cpu
+# Linux, NVIDIA CUDA 12.6
+pip install 'intextum-worker[cuda]' --extra-index-url https://download.pytorch.org/whl/cu126
+```
+Available extras: `mps`, `cpu`, `cuda`, `cpu-document` (document/image only), plus
+the granular `document`, `asr`, `enrichment` stacks.
+## Run
+```bash
+export API_URL="https://your-intextum-host"   # the API to poll
+export WORKER_TOKEN="<token from the Add Worker dialog>"
+intextum-worker --capabilities document,video,image
+```
+`intextum-worker --help` lists all flags. Every flag also has an environment
+variable (`API_URL`, `WORKER_TOKEN`, `WORK_DIR`, `CAPABILITIES`, `POLL_INTERVAL`,
+`CLASSIFICATION_DEVICE`, `DOCLING_OCR_ENGINE`, …); CLI flags take precedence.
+## Development
+This package uses a `src/` layout. The repo-root `VERSION` file is the single
+source of truth for the version; it is staged into `worker/VERSION` at build time
+(`worker/VERSION` is gitignored).
+```bash
+cp ../VERSION VERSION           # stage the version for an editable install
+pip install -e '.[mps,test]'    # or [cpu,test] / [cuda,test]
+pytest
+```
+On macOS, `scripts/setup-macos-mps.sh` does the venv + editable install for you,
+and `scripts/run-macos-mps.sh` launches the worker with MPS defaults.

intextum_worker-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,60 @@
+# intextum-worker
+The Intextum processing worker: an HTTP-polling worker that pulls tasks from an
+Intextum API instance and runs the Docling / FFmpeg document, image and audio
+pipeline (OCR, ASR, chunking, classification, content enrichment, embeddings).
+The worker is **always-remote**: it downloads source files from and uploads
+results to the API over HTTP, so it does not need a shared volume and can run
+anywhere — including on a host with a GPU while the rest of the stack runs in
+Docker.
+## Install
+Pick the bundle that matches your accelerator. The macOS (Apple MPS) Torch wheels
+are on PyPI, so the `mps` bundle installs with no extra flags:
+```bash
+pip install 'intextum-worker[mps]'
+```
+Linux CPU and NVIDIA CUDA pull their Torch build from the PyTorch index, so add
+the matching `--extra-index-url`:
+```bash
+# Linux, CPU only
+pip install 'intextum-worker[cpu]'  --extra-index-url https://download.pytorch.org/whl/cpu
+# Linux, NVIDIA CUDA 12.6
+pip install 'intextum-worker[cuda]' --extra-index-url https://download.pytorch.org/whl/cu126
+```
+Available extras: `mps`, `cpu`, `cuda`, `cpu-document` (document/image only), plus
+the granular `document`, `asr`, `enrichment` stacks.
+## Run
+```bash
+export API_URL="https://your-intextum-host"   # the API to poll
+export WORKER_TOKEN="<token from the Add Worker dialog>"
+intextum-worker --capabilities document,video,image
+```
+`intextum-worker --help` lists all flags. Every flag also has an environment
+variable (`API_URL`, `WORKER_TOKEN`, `WORK_DIR`, `CAPABILITIES`, `POLL_INTERVAL`,
+`CLASSIFICATION_DEVICE`, `DOCLING_OCR_ENGINE`, …); CLI flags take precedence.
+## Development
+This package uses a `src/` layout. The repo-root `VERSION` file is the single
+source of truth for the version; it is staged into `worker/VERSION` at build time
+(`worker/VERSION` is gitignored).
+```bash
+cp ../VERSION VERSION           # stage the version for an editable install
+pip install -e '.[mps,test]'    # or [cpu,test] / [cuda,test]
+pytest
+```
+On macOS, `scripts/setup-macos-mps.sh` does the venv + editable install for you,
+and `scripts/run-macos-mps.sh` launches the worker with MPS defaults.

intextum_worker-0.1.0/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.1.0

intextum_worker-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,96 @@
+[build-system]
+requires = ["setuptools>=77", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "intextum-worker"
+description = "Intextum processing worker: HTTP-polling Docling/FFmpeg document, image and audio pipeline."
+readme = "README.md"
+requires-python = ">=3.12,<3.13"
+license = "MIT"
+authors = [{ name = "Sebastian Alberternst", email = "alberternst@gmail.com" }]
+keywords = ["intextum", "docling", "ocr", "asr", "document-processing", "worker"]
+dynamic = ["version"]
+# Core runtime (mirrors requirements/base.txt). Feature stacks and the Torch
+# build variant live in the optional-dependency extras below.
+dependencies = [
+    "requests>=2.33.0,<3.0.0",
+    "pydantic-settings>=2.14.0,<3.0.0",
+    "python-dotenv>=1.2.2,<2.0.0",
+    "Pillow>=10.0.0",
+]
+[project.scripts]
+intextum-worker = "intextum_worker.main:main"
+[project.urls]
+Homepage = "https://github.com/intextum/intextum"
+Repository = "https://github.com/intextum/intextum"
+[project.optional-dependencies]
+# Feature stacks (mirror requirements/{document,asr,content-enrichment}.txt).
+document = [
+    "docling>=2.8.0",
+    "docling-core>=2.74.1",
+    "easyocr~=1.7.1",
+    "rapidocr-onnxruntime~=1.3.14",
+    "onnxruntime~=1.17.1",
+]
+asr = ["docling[asr]>=2.8.0"]
+enrichment = [
+    "transformers>=4.38.0",
+    "gliner2>=0.1.0",
+    "langgraph>=1.1.8",
+    "sentencepiece",
+    "protobuf",
+]
+full = ["intextum-worker[document,asr,enrichment]"]
+# Platform bundles add the right Torch build on top of a feature stack.
+#   macOS (Apple MPS): wheels are on PyPI, so this installs with no extra index:
+#       pip install 'intextum-worker[mps]'
+#   Linux CPU / NVIDIA CUDA: the +cpu / +cu126 wheels live on the PyTorch index,
+#   so the install command must add the matching --extra-index-url:
+#       pip install 'intextum-worker[cpu]'  --extra-index-url https://download.pytorch.org/whl/cpu
+#       pip install 'intextum-worker[cuda]' --extra-index-url https://download.pytorch.org/whl/cu126
+mps = [
+    "intextum-worker[full]",
+    "torch==2.6.0 ; platform_system == 'Darwin'",
+    "torchvision==0.21.0 ; platform_system == 'Darwin'",
+]
+cpu = [
+    "intextum-worker[full]",
+    "torch==2.6.0+cpu ; platform_system == 'Linux' and platform_machine == 'x86_64'",
+    "torchvision==0.21.0+cpu ; platform_system == 'Linux' and platform_machine == 'x86_64'",
+    "torch==2.6.0 ; platform_system == 'Linux' and platform_machine != 'x86_64'",
+    "torchvision==0.21.0 ; platform_system == 'Linux' and platform_machine != 'x86_64'",
+]
+cpu-document = [
+    "intextum-worker[document]",
+    "torch==2.6.0+cpu ; platform_system == 'Linux' and platform_machine == 'x86_64'",
+    "torchvision==0.21.0+cpu ; platform_system == 'Linux' and platform_machine == 'x86_64'",
+    "torch==2.6.0 ; platform_system == 'Linux' and platform_machine != 'x86_64'",
+    "torchvision==0.21.0 ; platform_system == 'Linux' and platform_machine != 'x86_64'",
+]
+cuda = [
+    "intextum-worker[full]",
+    "torch==2.6.0+cu126 ; platform_system == 'Linux'",
+    "torchvision==0.21.0+cu126 ; platform_system == 'Linux'",
+]
+test = [
+    "pytest>=9.0.3,<10.0",
+    "pytest-asyncio>=1.4.0,<2.0.0",
+]
+[tool.setuptools.dynamic]
+# Single source of truth: the repo-root VERSION file, staged into worker/ at
+# build time (see .github/workflows/release-worker.yml and worker/Dockerfile).
+version = { file = "VERSION" }
+[tool.setuptools.packages.find]
+where = ["src"]
+[tool.pytest.ini_options]
+pythonpath = ["src"]
+testpaths = ["tests"]

intextum_worker-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

intextum_worker-0.1.0/src/intextum_worker/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Worker package for file processing pipeline."""

intextum_worker-0.1.0/src/intextum_worker/config.py ADDED Viewed

@@ -0,0 +1,92 @@
+"""Configuration settings for the worker service."""
+import json
+from collections.abc import Iterable
+from functools import lru_cache
+from pydantic import field_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
+# Closed set of capability names; parse_capabilities() rejects anything else.
+VALID_WORKER_CAPABILITIES = frozenset({"document", "image", "video", "training"})
+# Default for Settings.CAPABILITIES. "training" is deliberately left out, so it
+# is only enabled when listed explicitly.
+DEFAULT_WORKER_CAPABILITIES = "document,video,image"
+def parse_capabilities(value: object) -> list[str]:
+    """Parse and validate worker capabilities from env or CLI input."""
+    if isinstance(value, str):
+        raw = value.strip()
+        if not raw:
+            return []
+        if raw.startswith("["):
+            try:
+                decoded = json.loads(raw)
+            except json.JSONDecodeError as exc:
+                raise ValueError(
+                    "CAPABILITIES must be a comma-separated string or JSON array"
+                ) from exc
+            return parse_capabilities(decoded)
+        values: Iterable[object] = raw.split(",")
+    elif isinstance(value, (list, tuple, set)):
+        values = value
+    else:
+        raise ValueError("CAPABILITIES must be a comma-separated string or JSON array")
+    capabilities = [str(item).strip().lower() for item in values if str(item).strip()]
+    invalid = sorted(set(capabilities) - VALID_WORKER_CAPABILITIES)
+    if invalid:
+        allowed = ", ".join(sorted(VALID_WORKER_CAPABILITIES))
+        rejected = ", ".join(invalid)
+        raise ValueError(
+            f"Invalid CAPABILITIES value(s): {rejected}. Allowed: {allowed}"
+        )
+    return capabilities
+class Settings(BaseSettings):
+    """Application settings loaded from environment variables."""
+    # Remote backend connection
+    API_URL: str = "http://api:8000"
+    WORKER_TOKEN: str = ""
+    WORK_DIR: str = "/tmp/worker"
+    # Poll loop. "training" stays opt-in and must be added explicitly.
+    CAPABILITIES: str = DEFAULT_WORKER_CAPABILITIES
+    @property
+    def parsed_capabilities(self) -> list[str]:
+        """Return validated capabilities without triggering pydantic JSON env parsing."""
+        return parse_capabilities(self.CAPABILITIES)
+    POLL_INTERVAL: float = 5.0
+    TASK_HEARTBEAT_INTERVAL_SECONDS: float = 60.0
+    CONTENT_ENRICHMENT_STAGE_TIMEOUT_SECONDS: float = 300.0
+    # Local processing settings
+    CLASSIFICATION_DEVICE: str = "cpu"
+    DOCLING_THREADS: int = 4
+    DOCLING_OCR_ENGINE: str = "easyocr"
+    ASR_MODEL: str = "whisper_large_v3"
+    ASR_LANGUAGE: str = "de"
+    KEEP_MODELS_LOADED: bool = False
+    CUSTOM_FIELD_ID: int = 1
+    model_config = SettingsConfigDict(env_file=".env", extra="ignore")
+    @field_validator("DOCLING_OCR_ENGINE", mode="before")
+    @classmethod
+    def normalize_docling_ocr_engine(cls, value: str) -> str:
+        """Normalize and validate configured Docling OCR engine."""
+        engine = str(value).strip().lower()
+        allowed = {"easyocr", "rapidocr", "tesseract", "tesseract_cli", "ocrmac"}
+        if engine not in allowed:
+            allowed_list = ", ".join(sorted(allowed))
+            raise ValueError(f"DOCLING_OCR_ENGINE must be one of: {allowed_list}")
+        return engine
+@lru_cache
+def get_settings() -> Settings:
+    """Get cached settings instance."""
+    return Settings()

intextum_worker-0.1.0/src/intextum_worker/logging_config.py ADDED Viewed

@@ -0,0 +1,158 @@
+"""Structured logging configuration for the worker service."""
+import json
+import logging
+import sys
+import uuid
+from collections.abc import MutableMapping
+from contextvars import ContextVar, Token
+from datetime import UTC, datetime
+from typing import Any
+# Context-local correlation ID; stays None until set_correlation_id() or
+# LoggingContext assigns one.
+correlation_id_var: ContextVar[str | None] = ContextVar("correlation_id", default=None)
+def generate_correlation_id() -> str:
+    """Generate a new correlation ID."""
+    return str(uuid.uuid4())[:8]
+def get_correlation_id() -> str | None:
+    """Get the current correlation ID from context."""
+    return correlation_id_var.get()
+def set_correlation_id(correlation_id: str) -> None:
+    """Set the correlation ID in context."""
+    correlation_id_var.set(correlation_id)
+class StructuredFormatter(logging.Formatter):
+    """JSON formatter for structured logging."""
+    def format(self, record: logging.LogRecord) -> str:
+        log_data: dict[str, Any] = {
+            "timestamp": datetime.now(UTC).isoformat(),
+            "level": record.levelname,
+            "logger": record.name,
+            "message": record.getMessage(),
+        }
+        correlation_id = get_correlation_id()
+        if correlation_id:
+            log_data["correlation_id"] = correlation_id
+        if record.exc_info:
+            log_data["exception"] = self.formatException(record.exc_info)
+        if hasattr(record, "__dict__"):
+            extra_fields = {
+                k: v
+                for k, v in record.__dict__.items()
+                if k
+                not in {
+                    "name",
+                    "msg",
+                    "args",
+                    "created",
+                    "filename",
+                    "funcName",
+                    "levelname",
+                    "levelno",
+                    "lineno",
+                    "module",
+                    "msecs",
+                    "pathname",
+                    "process",
+                    "processName",
+                    "relativeCreated",
+                    "stack_info",
+                    "exc_info",
+                    "exc_text",
+                    "thread",
+                    "threadName",
+                    "taskName",
+                    "message",
+                }
+            }
+            if extra_fields:
+                log_data["extra"] = extra_fields
+        return json.dumps(log_data)
+class CorrelatedLogger(logging.LoggerAdapter):
+    """Logger adapter that includes correlation ID in all log messages."""
+    def __init__(self, logger: logging.Logger, correlation_id: str):
+        super().__init__(logger, {})
+        self.correlation_id = correlation_id
+    def process(
+        self, msg: object, kwargs: MutableMapping[str, Any]
+    ) -> tuple[object, MutableMapping[str, Any]]:
+        extra = kwargs.get("extra")
+        if not isinstance(extra, dict):
+            extra = {}
+        extra["correlation_id"] = self.correlation_id
+        kwargs["extra"] = extra
+        return msg, kwargs
+def get_logger(name: str, correlation_id: str | None = None) -> logging.LoggerAdapter:
+    """Get a logger with optional correlation ID.
+    Args:
+        name: Logger name (typically __name__)
+        correlation_id: Optional correlation ID, generates new one if not provided
+    Returns:
+        Logger adapter with correlation context
+    """
+    logger = logging.getLogger(name)
+    cid = correlation_id or get_correlation_id() or generate_correlation_id()
+    return CorrelatedLogger(logger, cid)
+def configure_logging(json_format: bool = True, level: str = "INFO") -> None:
+    """Configure logging for the worker service.
+    Args:
+        json_format: Use JSON structured logging if True, human-readable if False
+        level: Logging level (DEBUG, INFO, WARNING, ERROR)
+    """
+    root_logger = logging.getLogger()
+    root_logger.setLevel(getattr(logging, level.upper()))
+    for handler in root_logger.handlers[:]:
+        root_logger.removeHandler(handler)
+    handler = logging.StreamHandler(sys.stdout)
+    if json_format:
+        handler.setFormatter(StructuredFormatter())
+    else:
+        handler.setFormatter(
+            logging.Formatter(
+                "%(asctime)s - %(levelname)s - [%(correlation_id)s] %(name)s - %(message)s",
+                defaults={"correlation_id": "no-correlation"},
+            )
+        )
+    root_logger.addHandler(handler)
+class LoggingContext:
+    """Context manager for scoped correlation IDs."""
+    def __init__(self, correlation_id: str | None = None):
+        self.correlation_id = correlation_id or generate_correlation_id()
+        self._token: Token[str | None] | None = None
+    def __enter__(self) -> str:
+        self._token = correlation_id_var.set(self.correlation_id)
+        return self.correlation_id
+    def __exit__(self, *args) -> None:
+        if self._token is not None:
+            correlation_id_var.reset(self._token)