intextum-worker 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. intextum_worker/__init__.py +1 -0
  2. intextum_worker/config.py +92 -0
  3. intextum_worker/logging_config.py +158 -0
  4. intextum_worker/main.py +212 -0
  5. intextum_worker/models.py +724 -0
  6. intextum_worker/poll_enrichment.py +76 -0
  7. intextum_worker/poll_loop.py +424 -0
  8. intextum_worker/poll_runtime.py +327 -0
  9. intextum_worker/processor_docling.py +113 -0
  10. intextum_worker/processor_runtime.py +622 -0
  11. intextum_worker/processors.py +446 -0
  12. intextum_worker/runtime_info.py +201 -0
  13. intextum_worker/services/__init__.py +0 -0
  14. intextum_worker/services/api_client.py +537 -0
  15. intextum_worker/services/api_client_api.py +99 -0
  16. intextum_worker/services/api_client_uploads.py +110 -0
  17. intextum_worker/services/content_enrichment/__init__.py +15 -0
  18. intextum_worker/services/content_enrichment/batching.py +115 -0
  19. intextum_worker/services/content_enrichment/chunk_selection.py +306 -0
  20. intextum_worker/services/content_enrichment/classification.py +185 -0
  21. intextum_worker/services/content_enrichment/console.py +23 -0
  22. intextum_worker/services/content_enrichment/evidence_grounding.py +100 -0
  23. intextum_worker/services/content_enrichment/json_response.py +188 -0
  24. intextum_worker/services/content_enrichment/langgraph_provider.py +1036 -0
  25. intextum_worker/services/content_enrichment/merge.py +197 -0
  26. intextum_worker/services/content_enrichment/model_artifacts.py +170 -0
  27. intextum_worker/services/content_enrichment/orchestration.py +65 -0
  28. intextum_worker/services/content_enrichment/prompt.py +218 -0
  29. intextum_worker/services/content_enrichment/registry.py +113 -0
  30. intextum_worker/services/content_enrichment/repeated_fields.py +153 -0
  31. intextum_worker/services/content_enrichment_training_runner.py +470 -0
  32. intextum_worker/services/content_enrichment_utils.py +904 -0
  33. intextum_worker/services/docling.py +372 -0
  34. intextum_worker/services/docling_asr.py +250 -0
  35. intextum_worker/services/docling_enrichment.py +204 -0
  36. intextum_worker/services/docling_output.py +80 -0
  37. intextum_worker/services/tokenizer.py +45 -0
  38. intextum_worker/services/vector.py +216 -0
  39. intextum_worker/version.py +55 -0
  40. intextum_worker-0.1.0.dist-info/METADATA +115 -0
  41. intextum_worker-0.1.0.dist-info/RECORD +44 -0
  42. intextum_worker-0.1.0.dist-info/WHEEL +5 -0
  43. intextum_worker-0.1.0.dist-info/entry_points.txt +2 -0
  44. intextum_worker-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1 @@
1
+ """Worker package for file processing pipeline."""
@@ -0,0 +1,92 @@
1
+ """Configuration settings for the worker service."""
2
+
3
+ import json
4
+ from collections.abc import Iterable
5
+ from functools import lru_cache
6
+
7
+ from pydantic import field_validator
8
+ from pydantic_settings import BaseSettings, SettingsConfigDict
9
+
10
# Capability names a worker may advertise to the backend.
VALID_WORKER_CAPABILITIES = frozenset({"document", "image", "video", "training"})
# Default capability set; "training" is intentionally absent from it.
DEFAULT_WORKER_CAPABILITIES = "document,video,image"

_CAPABILITIES_FORMAT_ERROR = "CAPABILITIES must be a comma-separated string or JSON array"


def parse_capabilities(value: object) -> list[str]:
    """Parse and validate worker capabilities from env or CLI input.

    Args:
        value: A comma-separated string (``"document,video"``), a JSON array
            encoded as a string (``'["document", "video"]'``), or an already
            decoded list/tuple/set/frozenset of capability names.

    Returns:
        Stripped, lower-cased capability names in first-seen order with
        duplicates removed. Blank entries are dropped, so an empty or
        whitespace-only string yields ``[]``.

    Raises:
        ValueError: If ``value`` has an unsupported type, looks like a JSON
            array but cannot be decoded, or contains a name that is not in
            ``VALID_WORKER_CAPABILITIES``.
    """
    if isinstance(value, str):
        raw = value.strip()
        if not raw:
            return []
        if raw.startswith("["):
            # A leading "[" signals JSON array syntax; decode it and validate
            # the resulting list through the same code path.
            try:
                decoded = json.loads(raw)
            except json.JSONDecodeError as exc:
                raise ValueError(_CAPABILITIES_FORMAT_ERROR) from exc
            return parse_capabilities(decoded)
        values: Iterable[object] = raw.split(",")
    elif isinstance(value, (list, tuple, set, frozenset)):
        values = value
    else:
        raise ValueError(_CAPABILITIES_FORMAT_ERROR)

    normalized = (str(item).strip().lower() for item in values)
    # dict.fromkeys de-duplicates while keeping first-seen order, so
    # "document,document" does not advertise the same capability twice.
    capabilities = list(dict.fromkeys(name for name in normalized if name))

    invalid = sorted(set(capabilities) - VALID_WORKER_CAPABILITIES)
    if invalid:
        allowed = ", ".join(sorted(VALID_WORKER_CAPABILITIES))
        rejected = ", ".join(invalid)
        raise ValueError(
            f"Invalid CAPABILITIES value(s): {rejected}. Allowed: {allowed}"
        )
    return capabilities
43
+
44
+
45
class Settings(BaseSettings):
    """Worker settings read from environment variables and an optional ``.env`` file.

    Unknown environment entries are ignored (``extra="ignore"``).
    """

    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    # Remote backend connection
    API_URL: str = "http://api:8000"
    WORKER_TOKEN: str = ""
    WORK_DIR: str = "/tmp/worker"

    # Poll loop. "training" stays opt-in and must be added explicitly.
    # Kept as a plain string; use ``parsed_capabilities`` for the validated list.
    CAPABILITIES: str = DEFAULT_WORKER_CAPABILITIES
    POLL_INTERVAL: float = 5.0
    TASK_HEARTBEAT_INTERVAL_SECONDS: float = 60.0
    CONTENT_ENRICHMENT_STAGE_TIMEOUT_SECONDS: float = 300.0

    # Local processing settings
    CLASSIFICATION_DEVICE: str = "cpu"
    DOCLING_THREADS: int = 4
    DOCLING_OCR_ENGINE: str = "easyocr"
    ASR_MODEL: str = "whisper_large_v3"
    ASR_LANGUAGE: str = "de"
    KEEP_MODELS_LOADED: bool = False

    CUSTOM_FIELD_ID: int = 1

    @property
    def parsed_capabilities(self) -> list[str]:
        """Validated capability list derived from the raw ``CAPABILITIES`` string.

        Parsing happens here rather than in a typed field so that pydantic's
        JSON decoding of environment values is not triggered.
        """
        return parse_capabilities(self.CAPABILITIES)

    @field_validator("DOCLING_OCR_ENGINE", mode="before")
    @classmethod
    def normalize_docling_ocr_engine(cls, value: str) -> str:
        """Lower-case and trim the OCR engine name, rejecting unsupported engines."""
        supported = {"easyocr", "rapidocr", "tesseract", "tesseract_cli", "ocrmac"}
        normalized = str(value).strip().lower()
        if normalized in supported:
            return normalized
        allowed_list = ", ".join(sorted(supported))
        raise ValueError(f"DOCLING_OCR_ENGINE must be one of: {allowed_list}")
87
+
88
+
89
@lru_cache
def get_settings() -> Settings:
    """Return the shared ``Settings`` instance.

    The first call builds the settings; ``lru_cache`` hands the same object
    back on every later call.
    """
    settings = Settings()
    return settings
@@ -0,0 +1,158 @@
1
+ """Structured logging configuration for the worker service."""
2
+
3
+ import json
4
+ import logging
5
+ import sys
6
+ import uuid
7
+ from collections.abc import MutableMapping
8
+ from contextvars import ContextVar, Token
9
+ from datetime import UTC, datetime
10
+ from typing import Any
11
+
12
+ correlation_id_var: ContextVar[str | None] = ContextVar("correlation_id", default=None)
13
+
14
+
15
+ def generate_correlation_id() -> str:
16
+ """Generate a new correlation ID."""
17
+ return str(uuid.uuid4())[:8]
18
+
19
+
20
+ def get_correlation_id() -> str | None:
21
+ """Get the current correlation ID from context."""
22
+ return correlation_id_var.get()
23
+
24
+
25
+ def set_correlation_id(correlation_id: str) -> None:
26
+ """Set the correlation ID in context."""
27
+ correlation_id_var.set(correlation_id)
28
+
29
+
30
class StructuredFormatter(logging.Formatter):
    """JSON formatter for structured logging.

    Each record becomes one JSON object with ``timestamp``, ``level``,
    ``logger`` and ``message`` keys, plus optional ``correlation_id``,
    ``exception`` and ``extra`` keys.
    """

    # Standard LogRecord attributes; anything else found on the record is
    # treated as caller-supplied ``extra`` data.
    _RESERVED_ATTRS = frozenset(
        {
            "name",
            "msg",
            "args",
            "created",
            "filename",
            "funcName",
            "levelname",
            "levelno",
            "lineno",
            "module",
            "msecs",
            "pathname",
            "process",
            "processName",
            "relativeCreated",
            "stack_info",
            "exc_info",
            "exc_text",
            "thread",
            "threadName",
            "taskName",
            "message",
        }
    )

    def format(self, record: logging.LogRecord) -> str:
        """Serialize ``record`` to a single-line JSON string.

        Args:
            record: The log record to render.

        Returns:
            JSON text describing the record.
        """
        log_data: dict[str, Any] = {
            # Use the record's own creation time so the timestamp reflects
            # when the event was logged, not when it happened to be formatted.
            "timestamp": datetime.fromtimestamp(record.created, UTC).isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
        }

        # Prefer the context-scoped ID; fall back to an ID attached to the
        # record itself (e.g. through ``extra``) when no context ID is set.
        correlation_id = get_correlation_id() or getattr(
            record, "correlation_id", None
        )
        if correlation_id:
            log_data["correlation_id"] = correlation_id

        if record.exc_info:
            log_data["exception"] = self.formatException(record.exc_info)

        extra_fields = {
            key: val
            for key, val in record.__dict__.items()
            if key not in self._RESERVED_ATTRS
        }
        if extra_fields:
            log_data["extra"] = extra_fields

        # default=str keeps non-JSON-native extras (paths, datetimes,
        # exceptions, ...) from raising TypeError and losing the log line.
        return json.dumps(log_data, default=str)
82
+
83
+
84
class CorrelatedLogger(logging.LoggerAdapter):
    """Logger adapter that includes correlation ID in all log messages."""

    def __init__(self, logger: logging.Logger, correlation_id: str):
        """Wrap ``logger`` and remember the correlation ID to attach.

        Args:
            logger: The underlying logger that receives the records.
            correlation_id: ID added to the ``extra`` of every log call.
        """
        super().__init__(logger, {})
        self.correlation_id = correlation_id

    def process(
        self, msg: object, kwargs: MutableMapping[str, Any]
    ) -> tuple[object, MutableMapping[str, Any]]:
        """Inject ``correlation_id`` into the ``extra`` mapping of a log call.

        Args:
            msg: The log message, passed through unchanged.
            kwargs: Keyword arguments of the logging call; its ``extra`` entry
                is replaced with a copy that carries the correlation ID.

        Returns:
            The unchanged message and the updated ``kwargs``.
        """
        caller_extra = kwargs.get("extra")
        # Copy instead of mutating: callers may reuse one ``extra`` dict
        # across several log calls (or several adapters).
        merged = dict(caller_extra) if isinstance(caller_extra, dict) else {}
        merged["correlation_id"] = self.correlation_id
        kwargs["extra"] = merged
        return msg, kwargs
100
+
101
+
102
def get_logger(name: str, correlation_id: str | None = None) -> logging.LoggerAdapter:
    """Build a correlation-aware logger adapter for ``name``.

    Args:
        name: Logger name (typically __name__)
        correlation_id: ID to attach; when missing or empty, the ID from the
            current context is used, and failing that a new one is generated

    Returns:
        Logger adapter that attaches the chosen correlation ID to every call
    """
    chosen_id = correlation_id
    if not chosen_id:
        chosen_id = get_correlation_id()
    if not chosen_id:
        chosen_id = generate_correlation_id()
    return CorrelatedLogger(logging.getLogger(name), chosen_id)
115
+
116
+
117
+ def configure_logging(json_format: bool = True, level: str = "INFO") -> None:
118
+ """Configure logging for the worker service.
119
+
120
+ Args:
121
+ json_format: Use JSON structured logging if True, human-readable if False
122
+ level: Logging level (DEBUG, INFO, WARNING, ERROR)
123
+ """
124
+ root_logger = logging.getLogger()
125
+ root_logger.setLevel(getattr(logging, level.upper()))
126
+
127
+ for handler in root_logger.handlers[:]:
128
+ root_logger.removeHandler(handler)
129
+
130
+ handler = logging.StreamHandler(sys.stdout)
131
+
132
+ if json_format:
133
+ handler.setFormatter(StructuredFormatter())
134
+ else:
135
+ handler.setFormatter(
136
+ logging.Formatter(
137
+ "%(asctime)s - %(levelname)s - [%(correlation_id)s] %(name)s - %(message)s",
138
+ defaults={"correlation_id": "no-correlation"},
139
+ )
140
+ )
141
+
142
+ root_logger.addHandler(handler)
143
+
144
+
145
+ class LoggingContext:
146
+ """Context manager for scoped correlation IDs."""
147
+
148
+ def __init__(self, correlation_id: str | None = None):
149
+ self.correlation_id = correlation_id or generate_correlation_id()
150
+ self._token: Token[str | None] | None = None
151
+
152
+ def __enter__(self) -> str:
153
+ self._token = correlation_id_var.set(self.correlation_id)
154
+ return self.correlation_id
155
+
156
+ def __exit__(self, *args) -> None:
157
+ if self._token is not None:
158
+ correlation_id_var.reset(self._token)
@@ -0,0 +1,212 @@
1
+ """Worker entry point — HTTP poll loop replacing Celery."""
2
+
3
+ import argparse
4
+ import os
5
+ import platform
6
+ import sys
7
+
8
+ from intextum_worker.config import get_settings, parse_capabilities
9
+ from intextum_worker.logging_config import configure_logging, get_logger
10
+ from intextum_worker.models import WorkerRuntimeMetadata
11
+ from intextum_worker.runtime_info import (
12
+ build_runtime_metadata,
13
+ validate_accelerator,
14
+ validate_runtime_dependencies,
15
+ )
16
+
17
+
18
+ def _build_parser() -> argparse.ArgumentParser:
19
+ """Build command-line parser for worker runtime overrides."""
20
+ parser = argparse.ArgumentParser(description="intextum Worker")
21
+ parser.add_argument(
22
+ "--capabilities",
23
+ type=str,
24
+ default=None,
25
+ help="Comma-separated capabilities, e.g. document,video,image,training",
26
+ )
27
+ parser.add_argument(
28
+ "--poll-interval",
29
+ type=float,
30
+ default=None,
31
+ help="Seconds between poll attempts (default: 5)",
32
+ )
33
+ parser.add_argument(
34
+ "--api-url",
35
+ type=str,
36
+ default=None,
37
+ help="API URL override (otherwise API_URL or APP_SCHEME/APP_DOMAIN)",
38
+ )
39
+ parser.add_argument(
40
+ "--work-dir",
41
+ type=str,
42
+ default=None,
43
+ help="Local worker directory override",
44
+ )
45
+ parser.add_argument(
46
+ "--classification-device",
47
+ type=str,
48
+ default=None,
49
+ help="Model device override (e.g. cpu, mps, cuda)",
50
+ )
51
+ parser.add_argument(
52
+ "--docling-ocr-engine",
53
+ type=str,
54
+ default=None,
55
+ help="Docling OCR engine override (easyocr, rapidocr, tesseract, tesseract_cli, ocrmac)",
56
+ )
57
+ parser.add_argument(
58
+ "--skip-device-check",
59
+ action="store_true",
60
+ help="Skip startup accelerator validation",
61
+ )
62
+ return parser
63
+
64
+
65
+ def _resolve_api_url(cli_api_url: str | None) -> None:
66
+ """Resolve API_URL from CLI/env/domain and export it for Settings."""
67
+ if cli_api_url:
68
+ os.environ["API_URL"] = cli_api_url
69
+ return
70
+
71
+ if os.environ.get("API_URL", "").strip():
72
+ return
73
+
74
+ app_domain = os.environ.get("APP_DOMAIN", "").strip()
75
+ if app_domain:
76
+ app_scheme = os.environ.get("APP_SCHEME", "http").strip() or "http"
77
+ os.environ["API_URL"] = f"{app_scheme}://{app_domain}"
78
+
79
+
80
+ def _resolve_work_dir(cli_work_dir: str | None) -> None:
81
+ """Resolve WORK_DIR from CLI/env for consistent worker file layout."""
82
+ if cli_work_dir:
83
+ os.environ["WORK_DIR"] = cli_work_dir
84
+ return
85
+
86
+ if not os.environ.get("WORK_DIR", "").strip():
87
+ os.environ["WORK_DIR"] = "/tmp/worker"
88
+
89
+
90
+ def _resolve_classification_device(cli_device: str | None) -> str:
91
+ """Resolve classification device with platform-aware defaults."""
92
+ if cli_device and cli_device.strip():
93
+ device = cli_device.strip()
94
+ elif os.environ.get("CLASSIFICATION_DEVICE", "").strip():
95
+ device = os.environ["CLASSIFICATION_DEVICE"].strip()
96
+ elif platform.system() == "Darwin":
97
+ device = "mps"
98
+ else:
99
+ device = "cpu"
100
+
101
+ os.environ["CLASSIFICATION_DEVICE"] = device
102
+
103
+ # Keep behavior parity with previous shell script on macOS.
104
+ if platform.system() == "Darwin" and device == "mps":
105
+ os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
106
+
107
+ return device
108
+
109
+
110
+ def _resolve_docling_ocr_engine(cli_engine: str | None) -> None:
111
+ """Resolve DOCLING_OCR_ENGINE from CLI/env and export it for Settings."""
112
+ if cli_engine and cli_engine.strip():
113
+ os.environ["DOCLING_OCR_ENGINE"] = cli_engine.strip()
114
+ return
115
+
116
+ if not os.environ.get("DOCLING_OCR_ENGINE", "").strip():
117
+ os.environ["DOCLING_OCR_ENGINE"] = "easyocr"
118
+
119
+
120
+ def _report_runtime_metadata(settings, capabilities: list[str], logger) -> None:
121
+ """Best-effort runtime metadata report; polling can continue if it fails."""
122
+ metadata = WorkerRuntimeMetadata.model_validate(
123
+ build_runtime_metadata(settings, capabilities)
124
+ )
125
+ try:
126
+ # pylint: disable=import-outside-toplevel
127
+ from intextum_worker.services.api_client import ApiClient
128
+
129
+ ApiClient().report_runtime_metadata(metadata)
130
+ except Exception as exc: # pylint: disable=broad-exception-caught
131
+ logger.warning("Failed to report worker runtime metadata: %s", exc)
132
+
133
+
134
def main():
    """Main entry point for the worker.

    Applies CLI overrides to the environment, loads settings, validates the
    token, capabilities, runtime dependencies and accelerator, reports runtime
    metadata, and then starts the poll loop. Any validation failure prints an
    ``Error: ...`` line to stderr and exits with status 1.
    """
    parser = _build_parser()
    args = parser.parse_args()

    # Export CLI/env overrides before Settings is instantiated so that
    # get_settings() sees the resolved values.
    _resolve_api_url(args.api_url)
    _resolve_work_dir(args.work_dir)
    _resolve_classification_device(args.classification_device)
    _resolve_docling_ocr_engine(args.docling_ocr_engine)

    configure_logging()
    logger = get_logger(__name__)

    settings = get_settings()

    if not settings.WORKER_TOKEN.strip():
        print("Error: WORKER_TOKEN must be set and non-empty", file=sys.stderr)
        sys.exit(1)

    # Capabilities: CLI arg > env var.
    try:
        capabilities = (
            parse_capabilities(args.capabilities)
            if args.capabilities
            else settings.parsed_capabilities
        )
    except ValueError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)

    # Compare against None rather than relying on truthiness, so an explicit
    # "--poll-interval 0" is honored instead of silently falling back.
    poll_interval = (
        args.poll_interval
        if args.poll_interval is not None
        else settings.POLL_INTERVAL
    )

    if not capabilities:
        print("Error: no capabilities specified", file=sys.stderr)
        sys.exit(1)

    try:
        validate_runtime_dependencies(capabilities)
    except RuntimeError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)

    try:
        validate_accelerator(
            settings.CLASSIFICATION_DEVICE,
            skip_check=args.skip_device_check,
        )
    except RuntimeError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)

    logger.info(
        "Starting intextum worker",
        extra={
            "api_url": settings.API_URL,
            "work_dir": settings.WORK_DIR,
            "classification_device": settings.CLASSIFICATION_DEVICE,
            "docling_ocr_engine": settings.DOCLING_OCR_ENGINE,
            "asr_model": settings.ASR_MODEL,
            "asr_language": settings.ASR_LANGUAGE,
            "docling_threads": settings.DOCLING_THREADS,
            "keep_models_loaded": settings.KEEP_MODELS_LOADED,
            "content_enrichment_stage_timeout_seconds": (
                settings.CONTENT_ENRICHMENT_STAGE_TIMEOUT_SECONDS
            ),
            "capabilities": capabilities,
            "poll_interval_seconds": poll_interval,
        },
    )
    _report_runtime_metadata(settings, capabilities, logger)

    # Imported only after all startup validation has passed.
    # pylint: disable=import-outside-toplevel
    from intextum_worker.poll_loop import run_poll_loop

    run_poll_loop(capabilities=capabilities, poll_interval=poll_interval)


if __name__ == "__main__":
    main()