intextum-worker 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. intextum_worker-0.1.0/PKG-INFO +115 -0
  2. intextum_worker-0.1.0/README.md +60 -0
  3. intextum_worker-0.1.0/VERSION +1 -0
  4. intextum_worker-0.1.0/pyproject.toml +96 -0
  5. intextum_worker-0.1.0/setup.cfg +4 -0
  6. intextum_worker-0.1.0/src/intextum_worker/__init__.py +1 -0
  7. intextum_worker-0.1.0/src/intextum_worker/config.py +92 -0
  8. intextum_worker-0.1.0/src/intextum_worker/logging_config.py +158 -0
  9. intextum_worker-0.1.0/src/intextum_worker/main.py +212 -0
  10. intextum_worker-0.1.0/src/intextum_worker/models.py +724 -0
  11. intextum_worker-0.1.0/src/intextum_worker/poll_enrichment.py +76 -0
  12. intextum_worker-0.1.0/src/intextum_worker/poll_loop.py +424 -0
  13. intextum_worker-0.1.0/src/intextum_worker/poll_runtime.py +327 -0
  14. intextum_worker-0.1.0/src/intextum_worker/processor_docling.py +113 -0
  15. intextum_worker-0.1.0/src/intextum_worker/processor_runtime.py +622 -0
  16. intextum_worker-0.1.0/src/intextum_worker/processors.py +446 -0
  17. intextum_worker-0.1.0/src/intextum_worker/runtime_info.py +201 -0
  18. intextum_worker-0.1.0/src/intextum_worker/services/__init__.py +0 -0
  19. intextum_worker-0.1.0/src/intextum_worker/services/api_client.py +537 -0
  20. intextum_worker-0.1.0/src/intextum_worker/services/api_client_api.py +99 -0
  21. intextum_worker-0.1.0/src/intextum_worker/services/api_client_uploads.py +110 -0
  22. intextum_worker-0.1.0/src/intextum_worker/services/content_enrichment/__init__.py +15 -0
  23. intextum_worker-0.1.0/src/intextum_worker/services/content_enrichment/batching.py +115 -0
  24. intextum_worker-0.1.0/src/intextum_worker/services/content_enrichment/chunk_selection.py +306 -0
  25. intextum_worker-0.1.0/src/intextum_worker/services/content_enrichment/classification.py +185 -0
  26. intextum_worker-0.1.0/src/intextum_worker/services/content_enrichment/console.py +23 -0
  27. intextum_worker-0.1.0/src/intextum_worker/services/content_enrichment/evidence_grounding.py +100 -0
  28. intextum_worker-0.1.0/src/intextum_worker/services/content_enrichment/json_response.py +188 -0
  29. intextum_worker-0.1.0/src/intextum_worker/services/content_enrichment/langgraph_provider.py +1036 -0
  30. intextum_worker-0.1.0/src/intextum_worker/services/content_enrichment/merge.py +197 -0
  31. intextum_worker-0.1.0/src/intextum_worker/services/content_enrichment/model_artifacts.py +170 -0
  32. intextum_worker-0.1.0/src/intextum_worker/services/content_enrichment/orchestration.py +65 -0
  33. intextum_worker-0.1.0/src/intextum_worker/services/content_enrichment/prompt.py +218 -0
  34. intextum_worker-0.1.0/src/intextum_worker/services/content_enrichment/registry.py +113 -0
  35. intextum_worker-0.1.0/src/intextum_worker/services/content_enrichment/repeated_fields.py +153 -0
  36. intextum_worker-0.1.0/src/intextum_worker/services/content_enrichment_training_runner.py +470 -0
  37. intextum_worker-0.1.0/src/intextum_worker/services/content_enrichment_utils.py +904 -0
  38. intextum_worker-0.1.0/src/intextum_worker/services/docling.py +372 -0
  39. intextum_worker-0.1.0/src/intextum_worker/services/docling_asr.py +250 -0
  40. intextum_worker-0.1.0/src/intextum_worker/services/docling_enrichment.py +204 -0
  41. intextum_worker-0.1.0/src/intextum_worker/services/docling_output.py +80 -0
  42. intextum_worker-0.1.0/src/intextum_worker/services/tokenizer.py +45 -0
  43. intextum_worker-0.1.0/src/intextum_worker/services/vector.py +216 -0
  44. intextum_worker-0.1.0/src/intextum_worker/version.py +55 -0
  45. intextum_worker-0.1.0/src/intextum_worker.egg-info/PKG-INFO +115 -0
  46. intextum_worker-0.1.0/src/intextum_worker.egg-info/SOURCES.txt +65 -0
  47. intextum_worker-0.1.0/src/intextum_worker.egg-info/dependency_links.txt +1 -0
  48. intextum_worker-0.1.0/src/intextum_worker.egg-info/entry_points.txt +2 -0
  49. intextum_worker-0.1.0/src/intextum_worker.egg-info/requires.txt +64 -0
  50. intextum_worker-0.1.0/src/intextum_worker.egg-info/top_level.txt +1 -0
  51. intextum_worker-0.1.0/tests/test_api_client.py +902 -0
  52. intextum_worker-0.1.0/tests/test_api_client_api.py +71 -0
  53. intextum_worker-0.1.0/tests/test_api_client_uploads.py +53 -0
  54. intextum_worker-0.1.0/tests/test_config.py +26 -0
  55. intextum_worker-0.1.0/tests/test_content_enrichment.py +199 -0
  56. intextum_worker-0.1.0/tests/test_content_enrichment_training_runner.py +280 -0
  57. intextum_worker-0.1.0/tests/test_content_enrichment_utils.py +385 -0
  58. intextum_worker-0.1.0/tests/test_langgraph_extraction.py +1215 -0
  59. intextum_worker-0.1.0/tests/test_logging.py +176 -0
  60. intextum_worker-0.1.0/tests/test_poll_runtime.py +264 -0
  61. intextum_worker-0.1.0/tests/test_processor_docling.py +218 -0
  62. intextum_worker-0.1.0/tests/test_processor_runtime.py +487 -0
  63. intextum_worker-0.1.0/tests/test_processors.py +566 -0
  64. intextum_worker-0.1.0/tests/test_requirements.py +44 -0
  65. intextum_worker-0.1.0/tests/test_runtime_info.py +148 -0
  66. intextum_worker-0.1.0/tests/test_services.py +882 -0
  67. intextum_worker-0.1.0/tests/test_tasks.py +726 -0
@@ -0,0 +1,115 @@
1
+ Metadata-Version: 2.4
2
+ Name: intextum-worker
3
+ Version: 0.1.0
4
+ Summary: Intextum processing worker: HTTP-polling Docling/FFmpeg document, image and audio pipeline.
5
+ Author-email: Sebastian Alberternst <alberternst@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/intextum/intextum
8
+ Project-URL: Repository, https://github.com/intextum/intextum
9
+ Keywords: intextum,docling,ocr,asr,document-processing,worker
10
+ Requires-Python: <3.13,>=3.12
11
+ Description-Content-Type: text/markdown
12
+ Requires-Dist: requests<3.0.0,>=2.33.0
13
+ Requires-Dist: pydantic-settings<3.0.0,>=2.14.0
14
+ Requires-Dist: python-dotenv<2.0.0,>=1.2.2
15
+ Requires-Dist: Pillow>=10.0.0
16
+ Provides-Extra: document
17
+ Requires-Dist: docling>=2.8.0; extra == "document"
18
+ Requires-Dist: docling-core>=2.74.1; extra == "document"
19
+ Requires-Dist: easyocr~=1.7.1; extra == "document"
20
+ Requires-Dist: rapidocr-onnxruntime~=1.3.14; extra == "document"
21
+ Requires-Dist: onnxruntime~=1.17.1; extra == "document"
22
+ Provides-Extra: asr
23
+ Requires-Dist: docling[asr]>=2.8.0; extra == "asr"
24
+ Provides-Extra: enrichment
25
+ Requires-Dist: transformers>=4.38.0; extra == "enrichment"
26
+ Requires-Dist: gliner2>=0.1.0; extra == "enrichment"
27
+ Requires-Dist: langgraph>=1.1.8; extra == "enrichment"
28
+ Requires-Dist: sentencepiece; extra == "enrichment"
29
+ Requires-Dist: protobuf; extra == "enrichment"
30
+ Provides-Extra: full
31
+ Requires-Dist: intextum-worker[asr,document,enrichment]; extra == "full"
32
+ Provides-Extra: mps
33
+ Requires-Dist: intextum-worker[full]; extra == "mps"
34
+ Requires-Dist: torch==2.6.0; platform_system == "Darwin" and extra == "mps"
35
+ Requires-Dist: torchvision==0.21.0; platform_system == "Darwin" and extra == "mps"
36
+ Provides-Extra: cpu
37
+ Requires-Dist: intextum-worker[full]; extra == "cpu"
38
+ Requires-Dist: torch==2.6.0+cpu; (platform_system == "Linux" and platform_machine == "x86_64") and extra == "cpu"
39
+ Requires-Dist: torchvision==0.21.0+cpu; (platform_system == "Linux" and platform_machine == "x86_64") and extra == "cpu"
40
+ Requires-Dist: torch==2.6.0; (platform_system == "Linux" and platform_machine != "x86_64") and extra == "cpu"
41
+ Requires-Dist: torchvision==0.21.0; (platform_system == "Linux" and platform_machine != "x86_64") and extra == "cpu"
42
+ Provides-Extra: cpu-document
43
+ Requires-Dist: intextum-worker[document]; extra == "cpu-document"
44
+ Requires-Dist: torch==2.6.0+cpu; (platform_system == "Linux" and platform_machine == "x86_64") and extra == "cpu-document"
45
+ Requires-Dist: torchvision==0.21.0+cpu; (platform_system == "Linux" and platform_machine == "x86_64") and extra == "cpu-document"
46
+ Requires-Dist: torch==2.6.0; (platform_system == "Linux" and platform_machine != "x86_64") and extra == "cpu-document"
47
+ Requires-Dist: torchvision==0.21.0; (platform_system == "Linux" and platform_machine != "x86_64") and extra == "cpu-document"
48
+ Provides-Extra: cuda
49
+ Requires-Dist: intextum-worker[full]; extra == "cuda"
50
+ Requires-Dist: torch==2.6.0+cu126; platform_system == "Linux" and extra == "cuda"
51
+ Requires-Dist: torchvision==0.21.0+cu126; platform_system == "Linux" and extra == "cuda"
52
+ Provides-Extra: test
53
+ Requires-Dist: pytest<10.0,>=9.0.3; extra == "test"
54
+ Requires-Dist: pytest-asyncio<2.0.0,>=1.4.0; extra == "test"
55
+
56
+ # intextum-worker
57
+
58
+ The Intextum processing worker: an HTTP-polling worker that pulls tasks from an
59
+ Intextum API instance and runs the Docling / FFmpeg document, image and audio
60
+ pipeline (OCR, ASR, chunking, classification, content enrichment, embeddings).
61
+
62
+ The worker is **always-remote**: it downloads source files from and uploads
63
+ results to the API over HTTP, so it does not need a shared volume and can run
64
+ anywhere — including on a host with a GPU while the rest of the stack runs in
65
+ Docker.
66
+
67
+ ## Install
68
+
69
+ Pick the bundle that matches your accelerator. The macOS (Apple MPS) wheels are
70
+ on PyPI, so the `mps` bundle installs with no extra flags:
71
+
72
+ ```bash
73
+ pip install 'intextum-worker[mps]'
74
+ ```
75
+
76
+ Linux CPU and NVIDIA CUDA pull their Torch build from the PyTorch index, so add
77
+ the matching `--extra-index-url`:
78
+
79
+ ```bash
80
+ # Linux, CPU only
81
+ pip install 'intextum-worker[cpu]' --extra-index-url https://download.pytorch.org/whl/cpu
82
+
83
+ # Linux, NVIDIA CUDA 12.6
84
+ pip install 'intextum-worker[cuda]' --extra-index-url https://download.pytorch.org/whl/cu126
85
+ ```
86
+
87
+ Available extras: `mps`, `cpu`, `cuda`, `cpu-document` (document/image only), plus
88
+ the granular `document`, `asr`, `enrichment` stacks.
89
+
90
+ ## Run
91
+
92
+ ```bash
93
+ export API_URL="https://your-intextum-host" # the API to poll
94
+ export WORKER_TOKEN="<token from the Add Worker dialog>"
95
+ intextum-worker --capabilities document,video,image
96
+ ```
97
+
98
+ `intextum-worker --help` lists all flags. Every flag also has an environment
99
+ variable (`API_URL`, `WORKER_TOKEN`, `WORK_DIR`, `CAPABILITIES`, `POLL_INTERVAL`,
100
+ `CLASSIFICATION_DEVICE`, `DOCLING_OCR_ENGINE`, …); CLI flags take precedence.
101
+
102
+ ## Development
103
+
104
+ This package uses a `src/` layout. The repo-root `VERSION` file is the single
105
+ source of truth for the version; it is staged into `worker/VERSION` at build time
106
+ (`worker/VERSION` is gitignored).
107
+
108
+ ```bash
109
+ cp ../VERSION VERSION # stage the version for an editable install
110
+ pip install -e '.[mps,test]' # or [cpu,test] / [cuda,test]
111
+ pytest
112
+ ```
113
+
114
+ On macOS, `scripts/setup-macos-mps.sh` does the venv + editable install for you,
115
+ and `scripts/run-macos-mps.sh` launches the worker with MPS defaults.
@@ -0,0 +1,60 @@
1
+ # intextum-worker
2
+
3
+ The Intextum processing worker: an HTTP-polling worker that pulls tasks from an
4
+ Intextum API instance and runs the Docling / FFmpeg document, image and audio
5
+ pipeline (OCR, ASR, chunking, classification, content enrichment, embeddings).
6
+
7
+ The worker is **always-remote**: it downloads source files from and uploads
8
+ results to the API over HTTP, so it does not need a shared volume and can run
9
+ anywhere — including on a host with a GPU while the rest of the stack runs in
10
+ Docker.
11
+
12
+ ## Install
13
+
14
+ Pick the bundle that matches your accelerator. The macOS (Apple MPS) wheels are
15
+ on PyPI, so the `mps` bundle installs with no extra flags:
16
+
17
+ ```bash
18
+ pip install 'intextum-worker[mps]'
19
+ ```
20
+
21
+ Linux CPU and NVIDIA CUDA pull their Torch build from the PyTorch index, so add
22
+ the matching `--extra-index-url`:
23
+
24
+ ```bash
25
+ # Linux, CPU only
26
+ pip install 'intextum-worker[cpu]' --extra-index-url https://download.pytorch.org/whl/cpu
27
+
28
+ # Linux, NVIDIA CUDA 12.6
29
+ pip install 'intextum-worker[cuda]' --extra-index-url https://download.pytorch.org/whl/cu126
30
+ ```
31
+
32
+ Available extras: `mps`, `cpu`, `cuda`, `cpu-document` (document/image only), plus
33
+ the granular `document`, `asr`, `enrichment` stacks.
34
+
35
+ ## Run
36
+
37
+ ```bash
38
+ export API_URL="https://your-intextum-host" # the API to poll
39
+ export WORKER_TOKEN="<token from the Add Worker dialog>"
40
+ intextum-worker --capabilities document,video,image
41
+ ```
42
+
43
+ `intextum-worker --help` lists all flags. Every flag also has an environment
44
+ variable (`API_URL`, `WORKER_TOKEN`, `WORK_DIR`, `CAPABILITIES`, `POLL_INTERVAL`,
45
+ `CLASSIFICATION_DEVICE`, `DOCLING_OCR_ENGINE`, …); CLI flags take precedence.
46
+
47
+ ## Development
48
+
49
+ This package uses a `src/` layout. The repo-root `VERSION` file is the single
50
+ source of truth for the version; it is staged into `worker/VERSION` at build time
51
+ (`worker/VERSION` is gitignored).
52
+
53
+ ```bash
54
+ cp ../VERSION VERSION # stage the version for an editable install
55
+ pip install -e '.[mps,test]' # or [cpu,test] / [cuda,test]
56
+ pytest
57
+ ```
58
+
59
+ On macOS, `scripts/setup-macos-mps.sh` does the venv + editable install for you,
60
+ and `scripts/run-macos-mps.sh` launches the worker with MPS defaults.
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,96 @@
1
+ [build-system]
2
+ requires = ["setuptools>=77", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "intextum-worker"
7
+ description = "Intextum processing worker: HTTP-polling Docling/FFmpeg document, image and audio pipeline."
8
+ readme = "README.md"
9
+ requires-python = ">=3.12,<3.13"
10
+ license = "MIT"
11
+ authors = [{ name = "Sebastian Alberternst", email = "alberternst@gmail.com" }]
12
+ keywords = ["intextum", "docling", "ocr", "asr", "document-processing", "worker"]
13
+ dynamic = ["version"]
14
+
15
+ # Core runtime (mirrors requirements/base.txt). Feature stacks and the Torch
16
+ # build variant live in the optional-dependency extras below.
17
+ dependencies = [
18
+ "requests>=2.33.0,<3.0.0",
19
+ "pydantic-settings>=2.14.0,<3.0.0",
20
+ "python-dotenv>=1.2.2,<2.0.0",
21
+ "Pillow>=10.0.0",
22
+ ]
23
+
24
+ [project.scripts]
25
+ intextum-worker = "intextum_worker.main:main"
26
+
27
+ [project.urls]
28
+ Homepage = "https://github.com/intextum/intextum"
29
+ Repository = "https://github.com/intextum/intextum"
30
+
31
+ [project.optional-dependencies]
32
+ # Feature stacks (mirror requirements/{document,asr,content-enrichment}.txt).
33
+ document = [
34
+ "docling>=2.8.0",
35
+ "docling-core>=2.74.1",
36
+ "easyocr~=1.7.1",
37
+ "rapidocr-onnxruntime~=1.3.14",
38
+ "onnxruntime~=1.17.1",
39
+ ]
40
+ asr = ["docling[asr]>=2.8.0"]
41
+ enrichment = [
42
+ "transformers>=4.38.0",
43
+ "gliner2>=0.1.0",
44
+ "langgraph>=1.1.8",
45
+ "sentencepiece",
46
+ "protobuf",
47
+ ]
48
+ full = ["intextum-worker[document,asr,enrichment]"]
49
+
50
+ # Platform bundles add the right Torch build on top of a feature stack.
51
+ # macOS (Apple MPS): wheels are on PyPI, so this installs with no extra index:
52
+ # pip install 'intextum-worker[mps]'
53
+ # Linux CPU / NVIDIA CUDA: the +cpu / +cu126 wheels live on the PyTorch index,
54
+ # so the install command must add the matching --extra-index-url:
55
+ # pip install 'intextum-worker[cpu]' --extra-index-url https://download.pytorch.org/whl/cpu
56
+ # pip install 'intextum-worker[cuda]' --extra-index-url https://download.pytorch.org/whl/cu126
57
+ mps = [
58
+ "intextum-worker[full]",
59
+ "torch==2.6.0 ; platform_system == 'Darwin'",
60
+ "torchvision==0.21.0 ; platform_system == 'Darwin'",
61
+ ]
62
+ cpu = [
63
+ "intextum-worker[full]",
64
+ "torch==2.6.0+cpu ; platform_system == 'Linux' and platform_machine == 'x86_64'",
65
+ "torchvision==0.21.0+cpu ; platform_system == 'Linux' and platform_machine == 'x86_64'",
66
+ "torch==2.6.0 ; platform_system == 'Linux' and platform_machine != 'x86_64'",
67
+ "torchvision==0.21.0 ; platform_system == 'Linux' and platform_machine != 'x86_64'",
68
+ ]
69
+ cpu-document = [
70
+ "intextum-worker[document]",
71
+ "torch==2.6.0+cpu ; platform_system == 'Linux' and platform_machine == 'x86_64'",
72
+ "torchvision==0.21.0+cpu ; platform_system == 'Linux' and platform_machine == 'x86_64'",
73
+ "torch==2.6.0 ; platform_system == 'Linux' and platform_machine != 'x86_64'",
74
+ "torchvision==0.21.0 ; platform_system == 'Linux' and platform_machine != 'x86_64'",
75
+ ]
76
+ cuda = [
77
+ "intextum-worker[full]",
78
+ "torch==2.6.0+cu126 ; platform_system == 'Linux'",
79
+ "torchvision==0.21.0+cu126 ; platform_system == 'Linux'",
80
+ ]
81
+ test = [
82
+ "pytest>=9.0.3,<10.0",
83
+ "pytest-asyncio>=1.4.0,<2.0.0",
84
+ ]
85
+
86
+ [tool.setuptools.dynamic]
87
+ # Single source of truth: the repo-root VERSION file, staged into worker/ at
88
+ # build time (see .github/workflows/release-worker.yml and worker/Dockerfile).
89
+ version = { file = "VERSION" }
90
+
91
+ [tool.setuptools.packages.find]
92
+ where = ["src"]
93
+
94
+ [tool.pytest.ini_options]
95
+ pythonpath = ["src"]
96
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1 @@
1
+ """Worker package for file processing pipeline."""
@@ -0,0 +1,92 @@
1
+ """Configuration settings for the worker service."""
2
+
3
+ import json
4
+ from collections.abc import Iterable
5
+ from functools import lru_cache
6
+
7
+ from pydantic import field_validator
8
+ from pydantic_settings import BaseSettings, SettingsConfigDict
9
+
10
VALID_WORKER_CAPABILITIES = frozenset({"document", "image", "video", "training"})
DEFAULT_WORKER_CAPABILITIES = "document,video,image"


def parse_capabilities(value: object) -> list[str]:
    """Parse and validate worker capabilities from env or CLI input.

    Args:
        value: Either a string (comma-separated names, or a JSON array when
            it starts with ``[``) or a list/tuple/set/frozenset of items.
            Items are stringified, stripped and lower-cased; blank items are
            dropped.

    Returns:
        The normalized capability names in first-seen order with duplicates
        removed. An empty or whitespace-only string yields ``[]``.

    Raises:
        ValueError: If ``value`` has an unsupported type, is a malformed JSON
            array, or names a capability outside ``VALID_WORKER_CAPABILITIES``.
    """
    if isinstance(value, str):
        raw = value.strip()
        if not raw:
            return []
        if raw.startswith("["):
            try:
                decoded = json.loads(raw)
            except json.JSONDecodeError as exc:
                raise ValueError(
                    "CAPABILITIES must be a comma-separated string or JSON array"
                ) from exc
            # Re-enter with the decoded value so it goes through the same
            # type check and validation as a directly supplied collection.
            return parse_capabilities(decoded)
        values: Iterable[object] = raw.split(",")
    elif isinstance(value, (list, tuple, set, frozenset)):
        values = value
    else:
        raise ValueError("CAPABILITIES must be a comma-separated string or JSON array")

    normalized = (str(item).strip().lower() for item in values)
    # dict.fromkeys drops repeated names while keeping first-seen order.
    capabilities = list(dict.fromkeys(name for name in normalized if name))
    invalid = sorted(set(capabilities) - VALID_WORKER_CAPABILITIES)
    if invalid:
        allowed = ", ".join(sorted(VALID_WORKER_CAPABILITIES))
        rejected = ", ".join(invalid)
        raise ValueError(
            f"Invalid CAPABILITIES value(s): {rejected}. Allowed: {allowed}"
        )
    return capabilities
43
+
44
+
45
class Settings(BaseSettings):
    """Worker configuration populated from environment variables.

    Values are also read from a local ``.env`` file; settings keys that do
    not match a declared field are ignored.
    """

    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    # Remote backend connection
    API_URL: str = "http://api:8000"
    WORKER_TOKEN: str = ""
    WORK_DIR: str = "/tmp/worker"

    # Poll loop. "training" stays opt-in and must be added explicitly.
    CAPABILITIES: str = DEFAULT_WORKER_CAPABILITIES
    POLL_INTERVAL: float = 5.0
    TASK_HEARTBEAT_INTERVAL_SECONDS: float = 60.0
    CONTENT_ENRICHMENT_STAGE_TIMEOUT_SECONDS: float = 300.0

    # Local processing settings
    CLASSIFICATION_DEVICE: str = "cpu"
    DOCLING_THREADS: int = 4
    DOCLING_OCR_ENGINE: str = "easyocr"
    ASR_MODEL: str = "whisper_large_v3"
    ASR_LANGUAGE: str = "de"
    KEEP_MODELS_LOADED: bool = False

    CUSTOM_FIELD_ID: int = 1

    @field_validator("DOCLING_OCR_ENGINE", mode="before")
    @classmethod
    def normalize_docling_ocr_engine(cls, value: str) -> str:
        """Lower-case and strip the configured OCR engine, rejecting unknown names."""
        supported = {"easyocr", "rapidocr", "tesseract", "tesseract_cli", "ocrmac"}
        candidate = str(value).strip().lower()
        if candidate in supported:
            return candidate
        raise ValueError(
            f"DOCLING_OCR_ENGINE must be one of: {', '.join(sorted(supported))}"
        )

    @property
    def parsed_capabilities(self) -> list[str]:
        """Validated capability list derived from the raw ``CAPABILITIES`` string.

        ``CAPABILITIES`` is kept as a plain string field and parsed here, so
        pydantic's JSON env parsing is not triggered for it.
        """
        return parse_capabilities(self.CAPABILITIES)
87
+
88
+
89
@lru_cache
def get_settings() -> Settings:
    """Return the cached ``Settings`` instance, constructing it on first call."""
    settings = Settings()
    return settings
@@ -0,0 +1,158 @@
1
+ """Structured logging configuration for the worker service."""
2
+
3
+ import json
4
+ import logging
5
+ import sys
6
+ import uuid
7
+ from collections.abc import MutableMapping
8
+ from contextvars import ContextVar, Token
9
+ from datetime import UTC, datetime
10
+ from typing import Any
11
+
12
+ correlation_id_var: ContextVar[str | None] = ContextVar("correlation_id", default=None)
13
+
14
+
15
def generate_correlation_id() -> str:
    """Return a new correlation ID: the first 8 hex digits of a random UUID4."""
    return uuid.uuid4().hex[:8]
18
+
19
+
20
def get_correlation_id() -> str | None:
    """Return the correlation ID stored in the current context, if any."""
    current = correlation_id_var.get()
    return current
23
+
24
+
25
def set_correlation_id(correlation_id: str) -> None:
    """Store ``correlation_id`` in the current context.

    The reset token returned by the context variable is discarded, so the
    previous value cannot be restored through this function.
    """
    _ = correlation_id_var.set(correlation_id)
28
+
29
+
30
class StructuredFormatter(logging.Formatter):
    """JSON formatter for structured logging.

    Each record is rendered as one JSON object with ``timestamp``, ``level``,
    ``logger`` and ``message`` keys, plus ``correlation_id``, ``exception``
    and ``extra`` keys when they apply.
    """

    # Record attributes that are never reported under the "extra" key.
    _NON_EXTRA_ATTRS = frozenset(
        {
            "name",
            "msg",
            "args",
            "created",
            "filename",
            "funcName",
            "levelname",
            "levelno",
            "lineno",
            "module",
            "msecs",
            "pathname",
            "process",
            "processName",
            "relativeCreated",
            "stack_info",
            "exc_info",
            "exc_text",
            "thread",
            "threadName",
            "taskName",
            "message",
        }
    )

    def format(self, record: logging.LogRecord) -> str:
        """Serialize ``record`` to a single JSON string.

        Args:
            record: The log record to render.

        Returns:
            A JSON document describing the record. Values that ``json`` cannot
            encode natively are rendered with ``str()``.
        """
        log_data: dict[str, Any] = {
            # Timestamp is taken when the record is formatted.
            "timestamp": datetime.now(UTC).isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
        }

        correlation_id = get_correlation_id()
        if correlation_id:
            log_data["correlation_id"] = correlation_id

        if record.exc_info:
            log_data["exception"] = self.formatException(record.exc_info)

        extra_fields = {
            key: value
            for key, value in vars(record).items()
            if key not in self._NON_EXTRA_ATTRS
        }
        if extra_fields:
            log_data["extra"] = extra_fields

        # Callers may attach arbitrary objects via ``extra=``; without a
        # fallback encoder json.dumps raises TypeError and the line is lost.
        return json.dumps(log_data, default=str)
82
+
83
+
84
class CorrelatedLogger(logging.LoggerAdapter):
    """Logger adapter that includes correlation ID in all log messages."""

    def __init__(self, logger: logging.Logger, correlation_id: str):
        """Wrap ``logger`` and remember the correlation ID to attach.

        Args:
            logger: The underlying logger that receives the records.
            correlation_id: Value injected as ``correlation_id`` on each call.
        """
        super().__init__(logger, {})
        self.correlation_id = correlation_id

    def process(
        self, msg: object, kwargs: MutableMapping[str, Any]
    ) -> tuple[object, MutableMapping[str, Any]]:
        """Attach this adapter's correlation ID to the call's ``extra`` mapping.

        Args:
            msg: The log message, returned unchanged.
            kwargs: Keyword arguments of the logging call; its ``extra`` entry
                is replaced with a new dict.

        Returns:
            ``(msg, kwargs)`` where ``kwargs["extra"]`` holds the caller's
            extra fields (when a dict was supplied) plus ``correlation_id``.
        """
        supplied = kwargs.get("extra")
        # Copy instead of mutating: the caller owns the dict it passed and may
        # reuse it for other logging calls.
        extra = dict(supplied) if isinstance(supplied, dict) else {}
        extra["correlation_id"] = self.correlation_id
        kwargs["extra"] = extra
        return msg, kwargs
100
+
101
+
102
def get_logger(name: str, correlation_id: str | None = None) -> logging.LoggerAdapter:
    """Build a correlation-aware logger adapter for ``name``.

    Args:
        name: Logger name (typically __name__)
        correlation_id: ID to attach; when missing or empty, the ID from the
            current context is used, and a new one is generated as a last
            resort

    Returns:
        A ``CorrelatedLogger`` wrapping the named logger
    """
    base_logger = logging.getLogger(name)
    if correlation_id:
        cid = correlation_id
    else:
        cid = get_correlation_id() or generate_correlation_id()
    return CorrelatedLogger(base_logger, cid)
115
+
116
+
117
+ def configure_logging(json_format: bool = True, level: str = "INFO") -> None:
118
+ """Configure logging for the worker service.
119
+
120
+ Args:
121
+ json_format: Use JSON structured logging if True, human-readable if False
122
+ level: Logging level (DEBUG, INFO, WARNING, ERROR)
123
+ """
124
+ root_logger = logging.getLogger()
125
+ root_logger.setLevel(getattr(logging, level.upper()))
126
+
127
+ for handler in root_logger.handlers[:]:
128
+ root_logger.removeHandler(handler)
129
+
130
+ handler = logging.StreamHandler(sys.stdout)
131
+
132
+ if json_format:
133
+ handler.setFormatter(StructuredFormatter())
134
+ else:
135
+ handler.setFormatter(
136
+ logging.Formatter(
137
+ "%(asctime)s - %(levelname)s - [%(correlation_id)s] %(name)s - %(message)s",
138
+ defaults={"correlation_id": "no-correlation"},
139
+ )
140
+ )
141
+
142
+ root_logger.addHandler(handler)
143
+
144
+
145
class LoggingContext:
    """Context manager that scopes a correlation ID to a ``with`` block."""

    def __init__(self, correlation_id: str | None = None):
        """Remember the ID to install; generate one when none (or empty) is given."""
        if correlation_id:
            self.correlation_id = correlation_id
        else:
            self.correlation_id = generate_correlation_id()
        self._token: Token[str | None] | None = None

    def __enter__(self) -> str:
        """Install the correlation ID in the context and return it."""
        token = correlation_id_var.set(self.correlation_id)
        self._token = token
        return self.correlation_id

    def __exit__(self, *args) -> None:
        """Reset the context variable using the token saved on entry."""
        token = self._token
        if token is None:
            return
        correlation_id_var.reset(token)