kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +212 -292
- kreuzberg/_document_classification.py +20 -47
- kreuzberg/_entity_extraction.py +1 -122
- kreuzberg/_extractors/_base.py +4 -71
- kreuzberg/_extractors/_email.py +1 -15
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -25
- kreuzberg/_extractors/_pandoc.py +10 -147
- kreuzberg/_extractors/_pdf.py +38 -94
- kreuzberg/_extractors/_presentation.py +0 -99
- kreuzberg/_extractors/_spread_sheet.py +13 -55
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -199
- kreuzberg/_language_detection.py +1 -36
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -19
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +124 -186
- kreuzberg/_ocr/_paddleocr.py +154 -224
- kreuzberg/_ocr/_table_extractor.py +184 -0
- kreuzberg/_ocr/_tesseract.py +797 -361
- kreuzberg/_playa.py +5 -31
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +588 -93
- kreuzberg/_utils/_cache.py +84 -138
- kreuzberg/_utils/_device.py +0 -74
- kreuzberg/_utils/_document_cache.py +0 -75
- kreuzberg/_utils/_errors.py +0 -50
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -16
- kreuzberg/_utils/_process_pool.py +17 -64
- kreuzberg/_utils/_quality.py +0 -60
- kreuzberg/_utils/_ref.py +32 -0
- kreuzberg/_utils/_serialization.py +0 -30
- kreuzberg/_utils/_string.py +9 -59
- kreuzberg/_utils/_sync.py +0 -77
- kreuzberg/_utils/_table.py +49 -101
- kreuzberg/_utils/_tmp.py +0 -9
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
- kreuzberg-3.13.1.dist-info/RECORD +57 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_config.py
CHANGED
@@ -1,10 +1,3 @@
|
|
1
|
-
"""Configuration discovery and loading for Kreuzberg.
|
2
|
-
|
3
|
-
This module provides configuration loading from both kreuzberg.toml and pyproject.toml files.
|
4
|
-
Configuration is automatically discovered by searching up the directory tree from the current
|
5
|
-
working directory.
|
6
|
-
"""
|
7
|
-
|
8
1
|
from __future__ import annotations
|
9
2
|
|
10
3
|
import sys
|
@@ -16,29 +9,145 @@ if sys.version_info >= (3, 11):
|
|
16
9
|
else: # pragma: no cover
|
17
10
|
import tomli as tomllib # type: ignore[import-not-found]
|
18
11
|
|
19
|
-
from kreuzberg.
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
12
|
+
from kreuzberg._types import (
|
13
|
+
EasyOCRConfig,
|
14
|
+
ExtractionConfig,
|
15
|
+
GMFTConfig,
|
16
|
+
HTMLToMarkdownConfig,
|
17
|
+
OcrBackendType,
|
18
|
+
PaddleOCRConfig,
|
19
|
+
PSMMode,
|
20
|
+
TesseractConfig,
|
21
|
+
)
|
24
22
|
from kreuzberg.exceptions import ValidationError
|
25
23
|
|
26
24
|
if TYPE_CHECKING:
|
27
25
|
from collections.abc import MutableMapping
|
28
26
|
|
27
|
+
_CONFIG_FIELDS = [
|
28
|
+
"force_ocr",
|
29
|
+
"chunk_content",
|
30
|
+
"extract_tables",
|
31
|
+
"max_chars",
|
32
|
+
"max_overlap",
|
33
|
+
"ocr_backend",
|
34
|
+
"extract_entities",
|
35
|
+
"extract_keywords",
|
36
|
+
"auto_detect_language",
|
37
|
+
"enable_quality_processing",
|
38
|
+
"auto_detect_document_type",
|
39
|
+
"document_type_confidence_threshold",
|
40
|
+
"document_classification_mode",
|
41
|
+
"keyword_count",
|
42
|
+
]
|
29
43
|
|
30
|
-
|
31
|
-
|
44
|
+
_VALID_OCR_BACKENDS = {"tesseract", "easyocr", "paddleocr"}
|
45
|
+
|
46
|
+
|
47
|
+
def _merge_file_config(config_dict: dict[str, Any], file_config: dict[str, Any]) -> None:
|
48
|
+
if not file_config:
|
49
|
+
return
|
50
|
+
for field in _CONFIG_FIELDS:
|
51
|
+
if field in file_config:
|
52
|
+
config_dict[field] = file_config[field]
|
53
|
+
|
54
|
+
|
55
|
+
def _merge_cli_args(config_dict: dict[str, Any], cli_args: MutableMapping[str, Any]) -> None:
|
56
|
+
for field in _CONFIG_FIELDS:
|
57
|
+
if field in cli_args and cli_args[field] is not None:
|
58
|
+
config_dict[field] = cli_args[field]
|
59
|
+
|
60
|
+
|
61
|
+
def _build_ocr_config_from_cli(
|
62
|
+
ocr_backend: str, cli_args: MutableMapping[str, Any]
|
63
|
+
) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
|
64
|
+
config_key = f"{ocr_backend}_config"
|
65
|
+
if not cli_args.get(config_key):
|
66
|
+
return None
|
67
|
+
|
68
|
+
backend_args = cli_args[config_key]
|
69
|
+
try:
|
70
|
+
match ocr_backend:
|
71
|
+
case "tesseract":
|
72
|
+
return TesseractConfig(**backend_args)
|
73
|
+
case "easyocr":
|
74
|
+
return EasyOCRConfig(**backend_args)
|
75
|
+
case "paddleocr":
|
76
|
+
return PaddleOCRConfig(**backend_args)
|
77
|
+
case _:
|
78
|
+
return None
|
79
|
+
except (TypeError, ValueError) as e:
|
80
|
+
raise ValidationError(
|
81
|
+
f"Invalid {ocr_backend} configuration from CLI: {e}",
|
82
|
+
context={"backend": ocr_backend, "config": backend_args, "error": str(e)},
|
83
|
+
) from e
|
84
|
+
|
85
|
+
|
86
|
+
def _configure_ocr_backend(
|
87
|
+
config_dict: dict[str, Any],
|
88
|
+
file_config: dict[str, Any],
|
89
|
+
cli_args: MutableMapping[str, Any],
|
90
|
+
) -> None:
|
91
|
+
ocr_backend = config_dict.get("ocr_backend")
|
92
|
+
if not ocr_backend or ocr_backend == "none":
|
93
|
+
return
|
94
|
+
|
95
|
+
ocr_config = _build_ocr_config_from_cli(ocr_backend, cli_args)
|
96
|
+
if not ocr_config and file_config:
|
97
|
+
ocr_config = parse_ocr_backend_config(file_config, ocr_backend)
|
98
|
+
|
99
|
+
if ocr_config:
|
100
|
+
config_dict["ocr_config"] = ocr_config
|
32
101
|
|
33
|
-
Args:
|
34
|
-
config_path: Path to the configuration file.
|
35
102
|
|
36
|
-
|
37
|
-
|
103
|
+
def _configure_gmft(
|
104
|
+
config_dict: dict[str, Any],
|
105
|
+
file_config: dict[str, Any],
|
106
|
+
cli_args: MutableMapping[str, Any],
|
107
|
+
) -> None:
|
108
|
+
if not config_dict.get("extract_tables"):
|
109
|
+
return
|
38
110
|
|
39
|
-
|
40
|
-
|
41
|
-
|
111
|
+
gmft_config = None
|
112
|
+
try:
|
113
|
+
if cli_args.get("gmft_config"):
|
114
|
+
gmft_config = GMFTConfig(**cli_args["gmft_config"])
|
115
|
+
elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
|
116
|
+
gmft_config = GMFTConfig(**file_config["gmft"])
|
117
|
+
except (TypeError, ValueError) as e:
|
118
|
+
raise ValidationError(
|
119
|
+
f"Invalid GMFT configuration: {e}",
|
120
|
+
context={"gmft_config": cli_args.get("gmft_config") or file_config.get("gmft"), "error": str(e)},
|
121
|
+
) from e
|
122
|
+
|
123
|
+
if gmft_config:
|
124
|
+
config_dict["gmft_config"] = gmft_config
|
125
|
+
|
126
|
+
|
127
|
+
def _create_ocr_config(
|
128
|
+
backend: str, backend_config: dict[str, Any]
|
129
|
+
) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig:
|
130
|
+
match backend:
|
131
|
+
case "tesseract":
|
132
|
+
processed_config = backend_config.copy()
|
133
|
+
if "psm" in processed_config and isinstance(processed_config["psm"], int):
|
134
|
+
try:
|
135
|
+
processed_config["psm"] = PSMMode(processed_config["psm"])
|
136
|
+
except ValueError as e:
|
137
|
+
raise ValidationError(
|
138
|
+
f"Invalid PSM mode value: {processed_config['psm']}",
|
139
|
+
context={"psm_value": processed_config["psm"], "error": str(e)},
|
140
|
+
) from e
|
141
|
+
return TesseractConfig(**processed_config)
|
142
|
+
case "easyocr":
|
143
|
+
return EasyOCRConfig(**backend_config)
|
144
|
+
case "paddleocr":
|
145
|
+
return PaddleOCRConfig(**backend_config)
|
146
|
+
case _:
|
147
|
+
raise ValueError(f"Unknown backend: {backend}")
|
148
|
+
|
149
|
+
|
150
|
+
def load_config_from_file(config_path: Path) -> dict[str, Any]:
|
42
151
|
try:
|
43
152
|
with config_path.open("rb") as f:
|
44
153
|
data = tomllib.load(f)
|
@@ -47,28 +156,16 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
|
|
47
156
|
except tomllib.TOMLDecodeError as e:
|
48
157
|
raise ValidationError(f"Invalid TOML in configuration file: {e}") from e
|
49
158
|
|
50
|
-
# Handle both kreuzberg.toml (root level) and pyproject.toml ([tool.kreuzberg])
|
51
159
|
if config_path.name == "kreuzberg.toml":
|
52
160
|
return data # type: ignore[no-any-return]
|
53
161
|
|
54
|
-
|
55
|
-
if config_path.name == "pyproject.toml" or ("tool" in data and "kreuzberg" in data.get("tool", {})):
|
162
|
+
if config_path.name == "pyproject.toml":
|
56
163
|
return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
|
57
164
|
|
58
|
-
# Otherwise assume root-level configuration
|
59
165
|
return data # type: ignore[no-any-return]
|
60
166
|
|
61
167
|
|
62
168
|
def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
|
63
|
-
"""Merge two configuration dictionaries recursively.
|
64
|
-
|
65
|
-
Args:
|
66
|
-
base: Base configuration dictionary.
|
67
|
-
override: Configuration dictionary to override base values.
|
68
|
-
|
69
|
-
Returns:
|
70
|
-
Merged configuration dictionary.
|
71
|
-
"""
|
72
169
|
result = base.copy()
|
73
170
|
for key, value in override.items():
|
74
171
|
if isinstance(value, dict) and key in result and isinstance(result[key], dict):
|
@@ -81,118 +178,101 @@ def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, A
|
|
81
178
|
def parse_ocr_backend_config(
|
82
179
|
config_dict: dict[str, Any], backend: OcrBackendType
|
83
180
|
) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
|
84
|
-
"""Parse OCR backend-specific configuration.
|
85
|
-
|
86
|
-
Args:
|
87
|
-
config_dict: Configuration dictionary.
|
88
|
-
backend: The OCR backend type.
|
89
|
-
|
90
|
-
Returns:
|
91
|
-
Backend-specific configuration object or None.
|
92
|
-
"""
|
93
181
|
if backend not in config_dict:
|
94
182
|
return None
|
95
183
|
|
96
184
|
backend_config = config_dict[backend]
|
97
185
|
if not isinstance(backend_config, dict):
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
# Convert psm integer to PSMMode enum if needed
|
103
|
-
processed_config = backend_config.copy()
|
104
|
-
if "psm" in processed_config and isinstance(processed_config["psm"], int):
|
105
|
-
from kreuzberg._ocr._tesseract import PSMMode # noqa: PLC0415
|
186
|
+
raise ValidationError(
|
187
|
+
f"Invalid configuration for OCR backend '{backend}': expected dict, got {type(backend_config).__name__}",
|
188
|
+
context={"backend": backend, "config_type": type(backend_config).__name__},
|
189
|
+
)
|
106
190
|
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
return None
|
191
|
+
try:
|
192
|
+
return _create_ocr_config(backend, backend_config)
|
193
|
+
except (TypeError, ValueError) as e:
|
194
|
+
raise ValidationError(
|
195
|
+
f"Invalid configuration for OCR backend '{backend}': {e}",
|
196
|
+
context={"backend": backend, "config": backend_config, "error": str(e)},
|
197
|
+
) from e
|
115
198
|
|
116
199
|
|
117
200
|
def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
|
118
|
-
|
119
|
-
|
120
|
-
Args:
|
121
|
-
config_dict: Configuration dictionary from TOML file.
|
122
|
-
|
123
|
-
Returns:
|
124
|
-
ExtractionConfig instance.
|
125
|
-
"""
|
126
|
-
extraction_config: dict[str, Any] = {}
|
127
|
-
|
128
|
-
# Copy basic configuration fields using dictionary comprehension
|
129
|
-
basic_fields = {
|
130
|
-
"force_ocr",
|
131
|
-
"chunk_content",
|
132
|
-
"extract_tables",
|
133
|
-
"max_chars",
|
134
|
-
"max_overlap",
|
135
|
-
"ocr_backend",
|
136
|
-
"extract_entities",
|
137
|
-
"extract_keywords",
|
138
|
-
"auto_detect_language",
|
139
|
-
"enable_quality_processing",
|
140
|
-
"auto_detect_document_type",
|
141
|
-
"document_type_confidence_threshold",
|
142
|
-
"document_classification_mode",
|
143
|
-
"keyword_count",
|
144
|
-
}
|
145
|
-
extraction_config = extraction_config | {
|
146
|
-
field: config_dict[field] for field in basic_fields if field in config_dict
|
147
|
-
}
|
148
|
-
|
149
|
-
# Handle OCR backend configuration
|
201
|
+
extraction_config: dict[str, Any] = {field: config_dict[field] for field in _CONFIG_FIELDS if field in config_dict}
|
202
|
+
|
150
203
|
ocr_backend = extraction_config.get("ocr_backend")
|
151
204
|
if ocr_backend and ocr_backend != "none":
|
152
|
-
|
153
|
-
valid_backends = {"tesseract", "easyocr", "paddleocr"}
|
154
|
-
if ocr_backend not in valid_backends:
|
205
|
+
if ocr_backend not in _VALID_OCR_BACKENDS:
|
155
206
|
raise ValidationError(
|
156
|
-
f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(
|
157
|
-
context={"provided": ocr_backend, "valid": sorted(
|
207
|
+
f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(_VALID_OCR_BACKENDS))} or 'none'",
|
208
|
+
context={"provided": ocr_backend, "valid": sorted(_VALID_OCR_BACKENDS)},
|
158
209
|
)
|
159
210
|
ocr_config = parse_ocr_backend_config(config_dict, ocr_backend)
|
160
211
|
if ocr_config:
|
161
212
|
extraction_config["ocr_config"] = ocr_config
|
162
213
|
|
163
|
-
# Handle GMFT configuration for table extraction
|
164
214
|
if extraction_config.get("extract_tables") and "gmft" in config_dict and isinstance(config_dict["gmft"], dict):
|
165
|
-
|
215
|
+
try:
|
216
|
+
extraction_config["gmft_config"] = GMFTConfig(**config_dict["gmft"])
|
217
|
+
except (TypeError, ValueError) as e:
|
218
|
+
raise ValidationError(
|
219
|
+
f"Invalid GMFT configuration: {e}",
|
220
|
+
context={"gmft_config": config_dict["gmft"], "error": str(e)},
|
221
|
+
) from e
|
222
|
+
|
223
|
+
if "html_to_markdown" in config_dict and isinstance(config_dict["html_to_markdown"], dict):
|
224
|
+
try:
|
225
|
+
extraction_config["html_to_markdown_config"] = HTMLToMarkdownConfig(**config_dict["html_to_markdown"])
|
226
|
+
except (TypeError, ValueError) as e:
|
227
|
+
raise ValidationError(
|
228
|
+
f"Invalid HTML to Markdown configuration: {e}",
|
229
|
+
context={"html_to_markdown_config": config_dict["html_to_markdown"], "error": str(e)},
|
230
|
+
) from e
|
166
231
|
|
167
|
-
# Convert "none" to None for ocr_backend
|
168
232
|
if extraction_config.get("ocr_backend") == "none":
|
169
233
|
extraction_config["ocr_backend"] = None
|
170
234
|
|
171
|
-
|
235
|
+
try:
|
236
|
+
return ExtractionConfig(**extraction_config)
|
237
|
+
except (TypeError, ValueError) as e:
|
238
|
+
raise ValidationError(
|
239
|
+
f"Invalid extraction configuration: {e}",
|
240
|
+
context={"config": extraction_config, "error": str(e)},
|
241
|
+
) from e
|
172
242
|
|
173
243
|
|
174
|
-
def
|
175
|
-
|
244
|
+
def build_extraction_config(
|
245
|
+
file_config: dict[str, Any],
|
246
|
+
cli_args: MutableMapping[str, Any],
|
247
|
+
) -> ExtractionConfig:
|
248
|
+
config_dict: dict[str, Any] = {}
|
249
|
+
|
250
|
+
_merge_file_config(config_dict, file_config)
|
251
|
+
_merge_cli_args(config_dict, cli_args)
|
176
252
|
|
177
|
-
|
178
|
-
|
179
|
-
|
253
|
+
_configure_ocr_backend(config_dict, file_config, cli_args)
|
254
|
+
_configure_gmft(config_dict, file_config, cli_args)
|
255
|
+
|
256
|
+
if config_dict.get("ocr_backend") == "none":
|
257
|
+
config_dict["ocr_backend"] = None
|
258
|
+
|
259
|
+
try:
|
260
|
+
return ExtractionConfig(**config_dict)
|
261
|
+
except (TypeError, ValueError) as e:
|
262
|
+
raise ValidationError(
|
263
|
+
f"Invalid extraction configuration: {e}",
|
264
|
+
context={"config": config_dict, "error": str(e)},
|
265
|
+
) from e
|
180
266
|
|
181
|
-
Args:
|
182
|
-
start_path: Directory to start searching from. Defaults to current working directory.
|
183
267
|
|
184
|
-
|
185
|
-
Path to the configuration file or None if not found.
|
186
|
-
"""
|
268
|
+
def find_config_file(start_path: Path | None = None) -> Path | None:
|
187
269
|
current = start_path or Path.cwd()
|
188
270
|
|
189
271
|
while current != current.parent:
|
190
|
-
# First, look for kreuzberg.toml
|
191
272
|
kreuzberg_toml = current / "kreuzberg.toml"
|
192
273
|
if kreuzberg_toml.exists():
|
193
274
|
return kreuzberg_toml
|
194
275
|
|
195
|
-
# Then, look for pyproject.toml with [tool.kreuzberg] section
|
196
276
|
pyproject_toml = current / "pyproject.toml"
|
197
277
|
if pyproject_toml.exists():
|
198
278
|
try:
|
@@ -200,65 +280,39 @@ def find_config_file(start_path: Path | None = None) -> Path | None:
|
|
200
280
|
data = tomllib.load(f)
|
201
281
|
if "tool" in data and "kreuzberg" in data["tool"]:
|
202
282
|
return pyproject_toml
|
203
|
-
except
|
204
|
-
|
283
|
+
except OSError as e:
|
284
|
+
raise ValidationError(
|
285
|
+
f"Failed to read pyproject.toml: {e}",
|
286
|
+
context={"file": str(pyproject_toml), "error": str(e)},
|
287
|
+
) from e
|
288
|
+
except tomllib.TOMLDecodeError as e:
|
289
|
+
raise ValidationError(
|
290
|
+
f"Invalid TOML in pyproject.toml: {e}",
|
291
|
+
context={"file": str(pyproject_toml), "error": str(e)},
|
292
|
+
) from e
|
205
293
|
|
206
294
|
current = current.parent
|
207
295
|
return None
|
208
296
|
|
209
297
|
|
210
298
|
def load_default_config(start_path: Path | None = None) -> ExtractionConfig | None:
|
211
|
-
"""Load the default configuration from discovered config file.
|
212
|
-
|
213
|
-
Args:
|
214
|
-
start_path: Directory to start searching from. Defaults to current working directory.
|
215
|
-
|
216
|
-
Returns:
|
217
|
-
ExtractionConfig instance or None if no configuration found.
|
218
|
-
"""
|
219
299
|
config_path = find_config_file(start_path)
|
220
300
|
if not config_path:
|
221
301
|
return None
|
222
302
|
|
223
|
-
|
224
|
-
|
225
|
-
if not config_dict:
|
226
|
-
return None
|
227
|
-
return build_extraction_config_from_dict(config_dict)
|
228
|
-
except Exception: # noqa: BLE001
|
229
|
-
# Silently ignore configuration errors for default loading
|
303
|
+
config_dict = load_config_from_file(config_path)
|
304
|
+
if not config_dict:
|
230
305
|
return None
|
306
|
+
return build_extraction_config_from_dict(config_dict)
|
231
307
|
|
232
308
|
|
233
309
|
def load_config_from_path(config_path: Path | str) -> ExtractionConfig:
|
234
|
-
"""Load configuration from a specific file path.
|
235
|
-
|
236
|
-
Args:
|
237
|
-
config_path: Path to the configuration file.
|
238
|
-
|
239
|
-
Returns:
|
240
|
-
ExtractionConfig instance.
|
241
|
-
|
242
|
-
Raises:
|
243
|
-
ValidationError: If the file cannot be read, parsed, or is invalid.
|
244
|
-
"""
|
245
310
|
path = Path(config_path)
|
246
311
|
config_dict = load_config_from_file(path)
|
247
312
|
return build_extraction_config_from_dict(config_dict)
|
248
313
|
|
249
314
|
|
250
315
|
def discover_and_load_config(start_path: Path | str | None = None) -> ExtractionConfig:
|
251
|
-
"""Load configuration by discovering config files in the directory tree.
|
252
|
-
|
253
|
-
Args:
|
254
|
-
start_path: Directory to start searching from. Defaults to current working directory.
|
255
|
-
|
256
|
-
Returns:
|
257
|
-
ExtractionConfig instance.
|
258
|
-
|
259
|
-
Raises:
|
260
|
-
ValidationError: If no configuration file is found or if the file is invalid.
|
261
|
-
"""
|
262
316
|
search_path = Path(start_path) if start_path else None
|
263
317
|
config_path = find_config_file(search_path)
|
264
318
|
|
@@ -278,152 +332,18 @@ def discover_and_load_config(start_path: Path | str | None = None) -> Extraction
|
|
278
332
|
return build_extraction_config_from_dict(config_dict)
|
279
333
|
|
280
334
|
|
281
|
-
def
|
282
|
-
|
283
|
-
|
284
|
-
Args:
|
285
|
-
start_path: Directory to start searching from. Defaults to current working directory.
|
335
|
+
def discover_config(start_path: Path | str | None = None) -> ExtractionConfig | None:
|
336
|
+
search_path = Path(start_path) if start_path else None
|
337
|
+
config_path = find_config_file(search_path)
|
286
338
|
|
287
|
-
|
288
|
-
ExtractionConfig instance or None if no configuration found.
|
289
|
-
"""
|
290
|
-
try:
|
291
|
-
return discover_and_load_config(start_path)
|
292
|
-
except ValidationError:
|
339
|
+
if not config_path:
|
293
340
|
return None
|
294
341
|
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
# Define common configuration fields to avoid repetition
|
299
|
-
_CONFIG_FIELDS = [
|
300
|
-
"force_ocr",
|
301
|
-
"chunk_content",
|
302
|
-
"extract_tables",
|
303
|
-
"max_chars",
|
304
|
-
"max_overlap",
|
305
|
-
"ocr_backend",
|
306
|
-
"extract_entities",
|
307
|
-
"extract_keywords",
|
308
|
-
"auto_detect_language",
|
309
|
-
"enable_quality_processing",
|
310
|
-
"auto_detect_document_type",
|
311
|
-
"document_type_confidence_threshold",
|
312
|
-
"document_classification_mode",
|
313
|
-
"keyword_count",
|
314
|
-
]
|
315
|
-
|
316
|
-
|
317
|
-
def _merge_file_config(config_dict: dict[str, Any], file_config: dict[str, Any]) -> None:
|
318
|
-
"""Merge file configuration into config dictionary."""
|
319
|
-
if not file_config:
|
320
|
-
return
|
321
|
-
|
322
|
-
for field in _CONFIG_FIELDS:
|
323
|
-
if field in file_config:
|
324
|
-
config_dict[field] = file_config[field]
|
325
|
-
|
326
|
-
|
327
|
-
def _merge_cli_args(config_dict: dict[str, Any], cli_args: MutableMapping[str, Any]) -> None:
|
328
|
-
"""Merge CLI arguments into config dictionary."""
|
329
|
-
for field in _CONFIG_FIELDS:
|
330
|
-
if field in cli_args and cli_args[field] is not None:
|
331
|
-
config_dict[field] = cli_args[field]
|
332
|
-
|
333
|
-
|
334
|
-
def _build_ocr_config_from_cli(
|
335
|
-
ocr_backend: str, cli_args: MutableMapping[str, Any]
|
336
|
-
) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
|
337
|
-
"""Build OCR config from CLI arguments."""
|
338
|
-
config_key = f"{ocr_backend}_config"
|
339
|
-
if not cli_args.get(config_key):
|
342
|
+
config_dict = load_config_from_file(config_path)
|
343
|
+
if not config_dict:
|
340
344
|
return None
|
341
|
-
|
342
|
-
backend_args = cli_args[config_key]
|
343
|
-
if ocr_backend == "tesseract":
|
344
|
-
return TesseractConfig(**backend_args)
|
345
|
-
if ocr_backend == "easyocr":
|
346
|
-
return EasyOCRConfig(**backend_args)
|
347
|
-
if ocr_backend == "paddleocr":
|
348
|
-
return PaddleOCRConfig(**backend_args)
|
349
|
-
return None
|
350
|
-
|
351
|
-
|
352
|
-
def _configure_ocr_backend(
|
353
|
-
config_dict: dict[str, Any],
|
354
|
-
file_config: dict[str, Any],
|
355
|
-
cli_args: MutableMapping[str, Any],
|
356
|
-
) -> None:
|
357
|
-
"""Configure OCR backend in config dictionary."""
|
358
|
-
ocr_backend = config_dict.get("ocr_backend")
|
359
|
-
if not ocr_backend or ocr_backend == "none":
|
360
|
-
return
|
361
|
-
|
362
|
-
# Try CLI config first, then file config
|
363
|
-
ocr_config = _build_ocr_config_from_cli(ocr_backend, cli_args)
|
364
|
-
if not ocr_config and file_config:
|
365
|
-
ocr_config = parse_ocr_backend_config(file_config, ocr_backend)
|
366
|
-
|
367
|
-
if ocr_config:
|
368
|
-
config_dict["ocr_config"] = ocr_config
|
369
|
-
|
370
|
-
|
371
|
-
def _configure_gmft(
|
372
|
-
config_dict: dict[str, Any],
|
373
|
-
file_config: dict[str, Any],
|
374
|
-
cli_args: MutableMapping[str, Any],
|
375
|
-
) -> None:
|
376
|
-
"""Configure GMFT in config dictionary."""
|
377
|
-
if not config_dict.get("extract_tables"):
|
378
|
-
return
|
379
|
-
|
380
|
-
gmft_config = None
|
381
|
-
if cli_args.get("gmft_config"):
|
382
|
-
gmft_config = GMFTConfig(**cli_args["gmft_config"])
|
383
|
-
elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
|
384
|
-
gmft_config = GMFTConfig(**file_config["gmft"])
|
385
|
-
|
386
|
-
if gmft_config:
|
387
|
-
config_dict["gmft_config"] = gmft_config
|
388
|
-
|
389
|
-
|
390
|
-
def build_extraction_config(
|
391
|
-
file_config: dict[str, Any],
|
392
|
-
cli_args: MutableMapping[str, Any],
|
393
|
-
) -> ExtractionConfig:
|
394
|
-
"""Build ExtractionConfig from file config and CLI arguments.
|
395
|
-
|
396
|
-
Args:
|
397
|
-
file_config: Configuration loaded from file.
|
398
|
-
cli_args: CLI arguments.
|
399
|
-
|
400
|
-
Returns:
|
401
|
-
ExtractionConfig instance.
|
402
|
-
"""
|
403
|
-
config_dict: dict[str, Any] = {}
|
404
|
-
|
405
|
-
# Merge configurations: file first, then CLI overrides
|
406
|
-
_merge_file_config(config_dict, file_config)
|
407
|
-
_merge_cli_args(config_dict, cli_args)
|
408
|
-
|
409
|
-
# Configure complex components
|
410
|
-
_configure_ocr_backend(config_dict, file_config, cli_args)
|
411
|
-
_configure_gmft(config_dict, file_config, cli_args)
|
412
|
-
|
413
|
-
# Convert "none" to None for ocr_backend
|
414
|
-
if config_dict.get("ocr_backend") == "none":
|
415
|
-
config_dict["ocr_backend"] = None
|
416
|
-
|
417
|
-
return ExtractionConfig(**config_dict)
|
345
|
+
return build_extraction_config_from_dict(config_dict)
|
418
346
|
|
419
347
|
|
420
348
|
def find_default_config() -> Path | None:
|
421
|
-
"""Find the default configuration file (pyproject.toml).
|
422
|
-
|
423
|
-
Returns:
|
424
|
-
Path to the configuration file or None if not found.
|
425
|
-
|
426
|
-
Note:
|
427
|
-
This function is deprecated. Use find_config_file() instead.
|
428
|
-
"""
|
429
349
|
return find_config_file()
|