kreuzberg 3.11.3__py3-none-any.whl → 3.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_config.py +248 -204
- kreuzberg/_document_classification.py +0 -8
- kreuzberg/_entity_extraction.py +1 -93
- kreuzberg/_extractors/_base.py +0 -5
- kreuzberg/_extractors/_email.py +1 -11
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -23
- kreuzberg/_extractors/_pandoc.py +10 -89
- kreuzberg/_extractors/_pdf.py +39 -92
- kreuzberg/_extractors/_presentation.py +0 -17
- kreuzberg/_extractors/_spread_sheet.py +13 -53
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -138
- kreuzberg/_language_detection.py +1 -22
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -2
- kreuzberg/_ocr/_easyocr.py +21 -108
- kreuzberg/_ocr/_paddleocr.py +16 -94
- kreuzberg/_ocr/_table_extractor.py +260 -0
- kreuzberg/_ocr/_tesseract.py +906 -264
- kreuzberg/_playa.py +5 -4
- kreuzberg/_types.py +638 -40
- kreuzberg/_utils/_cache.py +88 -90
- kreuzberg/_utils/_device.py +0 -18
- kreuzberg/_utils/_document_cache.py +0 -2
- kreuzberg/_utils/_errors.py +0 -3
- kreuzberg/_utils/_pdf_lock.py +0 -2
- kreuzberg/_utils/_process_pool.py +19 -19
- kreuzberg/_utils/_quality.py +0 -43
- kreuzberg/_utils/_ref.py +48 -0
- kreuzberg/_utils/_serialization.py +0 -5
- kreuzberg/_utils/_string.py +9 -39
- kreuzberg/_utils/_sync.py +0 -1
- kreuzberg/_utils/_table.py +50 -57
- kreuzberg/cli.py +55 -77
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.3.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
- kreuzberg-3.13.0.dist-info/RECORD +56 -0
- kreuzberg-3.11.3.dist-info/RECORD +0 -54
- {kreuzberg-3.11.3.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.3.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.3.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_config.py
CHANGED
@@ -1,10 +1,3 @@
|
|
1
|
-
"""Configuration discovery and loading for Kreuzberg.
|
2
|
-
|
3
|
-
This module provides configuration loading from both kreuzberg.toml and pyproject.toml files.
|
4
|
-
Configuration is automatically discovered by searching up the directory tree from the current
|
5
|
-
working directory.
|
6
|
-
"""
|
7
|
-
|
8
1
|
from __future__ import annotations
|
9
2
|
|
10
3
|
import sys
|
@@ -16,16 +9,143 @@ if sys.version_info >= (3, 11):
|
|
16
9
|
else: # pragma: no cover
|
17
10
|
import tomli as tomllib # type: ignore[import-not-found]
|
18
11
|
|
19
|
-
from kreuzberg.
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
12
|
+
from kreuzberg._types import (
|
13
|
+
EasyOCRConfig,
|
14
|
+
ExtractionConfig,
|
15
|
+
GMFTConfig,
|
16
|
+
HTMLToMarkdownConfig,
|
17
|
+
OcrBackendType,
|
18
|
+
PaddleOCRConfig,
|
19
|
+
PSMMode,
|
20
|
+
TesseractConfig,
|
21
|
+
)
|
24
22
|
from kreuzberg.exceptions import ValidationError
|
25
23
|
|
26
24
|
if TYPE_CHECKING:
|
27
25
|
from collections.abc import MutableMapping
|
28
26
|
|
27
|
+
_CONFIG_FIELDS = [
|
28
|
+
"force_ocr",
|
29
|
+
"chunk_content",
|
30
|
+
"extract_tables",
|
31
|
+
"max_chars",
|
32
|
+
"max_overlap",
|
33
|
+
"ocr_backend",
|
34
|
+
"extract_entities",
|
35
|
+
"extract_keywords",
|
36
|
+
"auto_detect_language",
|
37
|
+
"enable_quality_processing",
|
38
|
+
"auto_detect_document_type",
|
39
|
+
"document_type_confidence_threshold",
|
40
|
+
"document_classification_mode",
|
41
|
+
"keyword_count",
|
42
|
+
]
|
43
|
+
|
44
|
+
_VALID_OCR_BACKENDS = {"tesseract", "easyocr", "paddleocr"}
|
45
|
+
|
46
|
+
|
47
|
+
def _merge_file_config(config_dict: dict[str, Any], file_config: dict[str, Any]) -> None:
|
48
|
+
if not file_config:
|
49
|
+
return
|
50
|
+
for field in _CONFIG_FIELDS:
|
51
|
+
if field in file_config:
|
52
|
+
config_dict[field] = file_config[field]
|
53
|
+
|
54
|
+
|
55
|
+
def _merge_cli_args(config_dict: dict[str, Any], cli_args: MutableMapping[str, Any]) -> None:
|
56
|
+
for field in _CONFIG_FIELDS:
|
57
|
+
if field in cli_args and cli_args[field] is not None:
|
58
|
+
config_dict[field] = cli_args[field]
|
59
|
+
|
60
|
+
|
61
|
+
def _build_ocr_config_from_cli(
|
62
|
+
ocr_backend: str, cli_args: MutableMapping[str, Any]
|
63
|
+
) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
|
64
|
+
config_key = f"{ocr_backend}_config"
|
65
|
+
if not cli_args.get(config_key):
|
66
|
+
return None
|
67
|
+
|
68
|
+
backend_args = cli_args[config_key]
|
69
|
+
try:
|
70
|
+
match ocr_backend:
|
71
|
+
case "tesseract":
|
72
|
+
return TesseractConfig(**backend_args)
|
73
|
+
case "easyocr":
|
74
|
+
return EasyOCRConfig(**backend_args)
|
75
|
+
case "paddleocr":
|
76
|
+
return PaddleOCRConfig(**backend_args)
|
77
|
+
case _:
|
78
|
+
return None
|
79
|
+
except (TypeError, ValueError) as e:
|
80
|
+
raise ValidationError(
|
81
|
+
f"Invalid {ocr_backend} configuration from CLI: {e}",
|
82
|
+
context={"backend": ocr_backend, "config": backend_args, "error": str(e)},
|
83
|
+
) from e
|
84
|
+
|
85
|
+
|
86
|
+
def _configure_ocr_backend(
|
87
|
+
config_dict: dict[str, Any],
|
88
|
+
file_config: dict[str, Any],
|
89
|
+
cli_args: MutableMapping[str, Any],
|
90
|
+
) -> None:
|
91
|
+
ocr_backend = config_dict.get("ocr_backend")
|
92
|
+
if not ocr_backend or ocr_backend == "none":
|
93
|
+
return
|
94
|
+
|
95
|
+
ocr_config = _build_ocr_config_from_cli(ocr_backend, cli_args)
|
96
|
+
if not ocr_config and file_config:
|
97
|
+
ocr_config = parse_ocr_backend_config(file_config, ocr_backend)
|
98
|
+
|
99
|
+
if ocr_config:
|
100
|
+
config_dict["ocr_config"] = ocr_config
|
101
|
+
|
102
|
+
|
103
|
+
def _configure_gmft(
|
104
|
+
config_dict: dict[str, Any],
|
105
|
+
file_config: dict[str, Any],
|
106
|
+
cli_args: MutableMapping[str, Any],
|
107
|
+
) -> None:
|
108
|
+
if not config_dict.get("extract_tables"):
|
109
|
+
return
|
110
|
+
|
111
|
+
gmft_config = None
|
112
|
+
try:
|
113
|
+
if cli_args.get("gmft_config"):
|
114
|
+
gmft_config = GMFTConfig(**cli_args["gmft_config"])
|
115
|
+
elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
|
116
|
+
gmft_config = GMFTConfig(**file_config["gmft"])
|
117
|
+
except (TypeError, ValueError) as e:
|
118
|
+
raise ValidationError(
|
119
|
+
f"Invalid GMFT configuration: {e}",
|
120
|
+
context={"gmft_config": cli_args.get("gmft_config") or file_config.get("gmft"), "error": str(e)},
|
121
|
+
) from e
|
122
|
+
|
123
|
+
if gmft_config:
|
124
|
+
config_dict["gmft_config"] = gmft_config
|
125
|
+
|
126
|
+
|
127
|
+
def _create_ocr_config(
|
128
|
+
backend: str, backend_config: dict[str, Any]
|
129
|
+
) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig:
|
130
|
+
match backend:
|
131
|
+
case "tesseract":
|
132
|
+
processed_config = backend_config.copy()
|
133
|
+
if "psm" in processed_config and isinstance(processed_config["psm"], int):
|
134
|
+
try:
|
135
|
+
processed_config["psm"] = PSMMode(processed_config["psm"])
|
136
|
+
except ValueError as e:
|
137
|
+
raise ValidationError(
|
138
|
+
f"Invalid PSM mode value: {processed_config['psm']}",
|
139
|
+
context={"psm_value": processed_config["psm"], "error": str(e)},
|
140
|
+
) from e
|
141
|
+
return TesseractConfig(**processed_config)
|
142
|
+
case "easyocr":
|
143
|
+
return EasyOCRConfig(**backend_config)
|
144
|
+
case "paddleocr":
|
145
|
+
return PaddleOCRConfig(**backend_config)
|
146
|
+
case _:
|
147
|
+
raise ValueError(f"Unknown backend: {backend}")
|
148
|
+
|
29
149
|
|
30
150
|
def load_config_from_file(config_path: Path) -> dict[str, Any]:
|
31
151
|
"""Load configuration from a TOML file.
|
@@ -47,15 +167,12 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
|
|
47
167
|
except tomllib.TOMLDecodeError as e:
|
48
168
|
raise ValidationError(f"Invalid TOML in configuration file: {e}") from e
|
49
169
|
|
50
|
-
# Handle both kreuzberg.toml (root level) and pyproject.toml ([tool.kreuzberg])
|
51
170
|
if config_path.name == "kreuzberg.toml":
|
52
171
|
return data # type: ignore[no-any-return]
|
53
172
|
|
54
|
-
|
55
|
-
if config_path.name == "pyproject.toml" or ("tool" in data and "kreuzberg" in data.get("tool", {})):
|
173
|
+
if config_path.name == "pyproject.toml":
|
56
174
|
return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
|
57
175
|
|
58
|
-
# Otherwise assume root-level configuration
|
59
176
|
return data # type: ignore[no-any-return]
|
60
177
|
|
61
178
|
|
@@ -89,29 +206,27 @@ def parse_ocr_backend_config(
|
|
89
206
|
|
90
207
|
Returns:
|
91
208
|
Backend-specific configuration object or None.
|
209
|
+
|
210
|
+
Raises:
|
211
|
+
ValidationError: If the backend configuration is invalid.
|
92
212
|
"""
|
93
213
|
if backend not in config_dict:
|
94
214
|
return None
|
95
215
|
|
96
216
|
backend_config = config_dict[backend]
|
97
217
|
if not isinstance(backend_config, dict):
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
# Convert psm integer to PSMMode enum if needed
|
103
|
-
processed_config = backend_config.copy()
|
104
|
-
if "psm" in processed_config and isinstance(processed_config["psm"], int):
|
105
|
-
from kreuzberg._ocr._tesseract import PSMMode # noqa: PLC0415
|
218
|
+
raise ValidationError(
|
219
|
+
f"Invalid configuration for OCR backend '{backend}': expected dict, got {type(backend_config).__name__}",
|
220
|
+
context={"backend": backend, "config_type": type(backend_config).__name__},
|
221
|
+
)
|
106
222
|
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
return None
|
223
|
+
try:
|
224
|
+
return _create_ocr_config(backend, backend_config)
|
225
|
+
except (TypeError, ValueError) as e:
|
226
|
+
raise ValidationError(
|
227
|
+
f"Invalid configuration for OCR backend '{backend}': {e}",
|
228
|
+
context={"backend": backend, "config": backend_config, "error": str(e)},
|
229
|
+
) from e
|
115
230
|
|
116
231
|
|
117
232
|
def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
|
@@ -122,53 +237,87 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
|
|
122
237
|
|
123
238
|
Returns:
|
124
239
|
ExtractionConfig instance.
|
240
|
+
|
241
|
+
Raises:
|
242
|
+
ValidationError: If the configuration is invalid.
|
125
243
|
"""
|
126
|
-
extraction_config: dict[str, Any] = {}
|
127
|
-
|
128
|
-
# Copy basic configuration fields using dictionary comprehension
|
129
|
-
basic_fields = {
|
130
|
-
"force_ocr",
|
131
|
-
"chunk_content",
|
132
|
-
"extract_tables",
|
133
|
-
"max_chars",
|
134
|
-
"max_overlap",
|
135
|
-
"ocr_backend",
|
136
|
-
"extract_entities",
|
137
|
-
"extract_keywords",
|
138
|
-
"auto_detect_language",
|
139
|
-
"enable_quality_processing",
|
140
|
-
"auto_detect_document_type",
|
141
|
-
"document_type_confidence_threshold",
|
142
|
-
"document_classification_mode",
|
143
|
-
"keyword_count",
|
144
|
-
}
|
145
|
-
extraction_config = extraction_config | {
|
146
|
-
field: config_dict[field] for field in basic_fields if field in config_dict
|
147
|
-
}
|
148
|
-
|
149
|
-
# Handle OCR backend configuration
|
244
|
+
extraction_config: dict[str, Any] = {field: config_dict[field] for field in _CONFIG_FIELDS if field in config_dict}
|
245
|
+
|
150
246
|
ocr_backend = extraction_config.get("ocr_backend")
|
151
247
|
if ocr_backend and ocr_backend != "none":
|
152
|
-
|
153
|
-
valid_backends = {"tesseract", "easyocr", "paddleocr"}
|
154
|
-
if ocr_backend not in valid_backends:
|
248
|
+
if ocr_backend not in _VALID_OCR_BACKENDS:
|
155
249
|
raise ValidationError(
|
156
|
-
f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(valid_backends))} or 'none'",
|
157
|
-
context={"provided": ocr_backend, "valid": sorted(valid_backends)},
|
250
|
+
f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(_VALID_OCR_BACKENDS))} or 'none'",
|
251
|
+
context={"provided": ocr_backend, "valid": sorted(_VALID_OCR_BACKENDS)},
|
158
252
|
)
|
159
253
|
ocr_config = parse_ocr_backend_config(config_dict, ocr_backend)
|
160
254
|
if ocr_config:
|
161
255
|
extraction_config["ocr_config"] = ocr_config
|
162
256
|
|
163
|
-
# Handle GMFT configuration for table extraction
|
164
257
|
if extraction_config.get("extract_tables") and "gmft" in config_dict and isinstance(config_dict["gmft"], dict):
|
165
|
-
|
258
|
+
try:
|
259
|
+
extraction_config["gmft_config"] = GMFTConfig(**config_dict["gmft"])
|
260
|
+
except (TypeError, ValueError) as e:
|
261
|
+
raise ValidationError(
|
262
|
+
f"Invalid GMFT configuration: {e}",
|
263
|
+
context={"gmft_config": config_dict["gmft"], "error": str(e)},
|
264
|
+
) from e
|
265
|
+
|
266
|
+
if "html_to_markdown" in config_dict and isinstance(config_dict["html_to_markdown"], dict):
|
267
|
+
try:
|
268
|
+
extraction_config["html_to_markdown_config"] = HTMLToMarkdownConfig(**config_dict["html_to_markdown"])
|
269
|
+
except (TypeError, ValueError) as e:
|
270
|
+
raise ValidationError(
|
271
|
+
f"Invalid HTML to Markdown configuration: {e}",
|
272
|
+
context={"html_to_markdown_config": config_dict["html_to_markdown"], "error": str(e)},
|
273
|
+
) from e
|
166
274
|
|
167
|
-
# Convert "none" to None for ocr_backend
|
168
275
|
if extraction_config.get("ocr_backend") == "none":
|
169
276
|
extraction_config["ocr_backend"] = None
|
170
277
|
|
171
|
-
|
278
|
+
try:
|
279
|
+
return ExtractionConfig(**extraction_config)
|
280
|
+
except (TypeError, ValueError) as e:
|
281
|
+
raise ValidationError(
|
282
|
+
f"Invalid extraction configuration: {e}",
|
283
|
+
context={"config": extraction_config, "error": str(e)},
|
284
|
+
) from e
|
285
|
+
|
286
|
+
|
287
|
+
def build_extraction_config(
|
288
|
+
file_config: dict[str, Any],
|
289
|
+
cli_args: MutableMapping[str, Any],
|
290
|
+
) -> ExtractionConfig:
|
291
|
+
"""Build ExtractionConfig from file config and CLI arguments.
|
292
|
+
|
293
|
+
Args:
|
294
|
+
file_config: Configuration loaded from file.
|
295
|
+
cli_args: CLI arguments.
|
296
|
+
|
297
|
+
Returns:
|
298
|
+
ExtractionConfig instance.
|
299
|
+
|
300
|
+
Raises:
|
301
|
+
ValidationError: If the combined configuration is invalid.
|
302
|
+
"""
|
303
|
+
config_dict: dict[str, Any] = {}
|
304
|
+
|
305
|
+
_merge_file_config(config_dict, file_config)
|
306
|
+
_merge_cli_args(config_dict, cli_args)
|
307
|
+
|
308
|
+
_configure_ocr_backend(config_dict, file_config, cli_args)
|
309
|
+
_configure_gmft(config_dict, file_config, cli_args)
|
310
|
+
|
311
|
+
if config_dict.get("ocr_backend") == "none":
|
312
|
+
config_dict["ocr_backend"] = None
|
313
|
+
|
314
|
+
try:
|
315
|
+
return ExtractionConfig(**config_dict)
|
316
|
+
except (TypeError, ValueError) as e:
|
317
|
+
raise ValidationError(
|
318
|
+
f"Invalid extraction configuration: {e}",
|
319
|
+
context={"config": config_dict, "error": str(e)},
|
320
|
+
) from e
|
172
321
|
|
173
322
|
|
174
323
|
def find_config_file(start_path: Path | None = None) -> Path | None:
|
@@ -183,16 +332,17 @@ def find_config_file(start_path: Path | None = None) -> Path | None:
|
|
183
332
|
|
184
333
|
Returns:
|
185
334
|
Path to the configuration file or None if not found.
|
335
|
+
|
336
|
+
Raises:
|
337
|
+
ValidationError: If a config file exists but cannot be read or has invalid TOML.
|
186
338
|
"""
|
187
339
|
current = start_path or Path.cwd()
|
188
340
|
|
189
341
|
while current != current.parent:
|
190
|
-
# First, look for kreuzberg.toml
|
191
342
|
kreuzberg_toml = current / "kreuzberg.toml"
|
192
343
|
if kreuzberg_toml.exists():
|
193
344
|
return kreuzberg_toml
|
194
345
|
|
195
|
-
# Then, look for pyproject.toml with [tool.kreuzberg] section
|
196
346
|
pyproject_toml = current / "pyproject.toml"
|
197
347
|
if pyproject_toml.exists():
|
198
348
|
try:
|
@@ -200,8 +350,16 @@ def find_config_file(start_path: Path | None = None) -> Path | None:
|
|
200
350
|
data = tomllib.load(f)
|
201
351
|
if "tool" in data and "kreuzberg" in data["tool"]:
|
202
352
|
return pyproject_toml
|
203
|
-
except
|
204
|
-
|
353
|
+
except OSError as e:
|
354
|
+
raise ValidationError(
|
355
|
+
f"Failed to read pyproject.toml: {e}",
|
356
|
+
context={"file": str(pyproject_toml), "error": str(e)},
|
357
|
+
) from e
|
358
|
+
except tomllib.TOMLDecodeError as e:
|
359
|
+
raise ValidationError(
|
360
|
+
f"Invalid TOML in pyproject.toml: {e}",
|
361
|
+
context={"file": str(pyproject_toml), "error": str(e)},
|
362
|
+
) from e
|
205
363
|
|
206
364
|
current = current.parent
|
207
365
|
return None
|
@@ -215,19 +373,18 @@ def load_default_config(start_path: Path | None = None) -> ExtractionConfig | No
|
|
215
373
|
|
216
374
|
Returns:
|
217
375
|
ExtractionConfig instance or None if no configuration found.
|
376
|
+
|
377
|
+
Raises:
|
378
|
+
ValidationError: If configuration file exists but contains invalid configuration.
|
218
379
|
"""
|
219
380
|
config_path = find_config_file(start_path)
|
220
381
|
if not config_path:
|
221
382
|
return None
|
222
383
|
|
223
|
-
|
224
|
-
|
225
|
-
if not config_dict:
|
226
|
-
return None
|
227
|
-
return build_extraction_config_from_dict(config_dict)
|
228
|
-
except Exception: # noqa: BLE001
|
229
|
-
# Silently ignore configuration errors for default loading
|
384
|
+
config_dict = load_config_from_file(config_path)
|
385
|
+
if not config_dict:
|
230
386
|
return None
|
387
|
+
return build_extraction_config_from_dict(config_dict)
|
231
388
|
|
232
389
|
|
233
390
|
def load_config_from_path(config_path: Path | str) -> ExtractionConfig:
|
@@ -278,143 +435,30 @@ def discover_and_load_config(start_path: Path | str | None = None) -> Extraction
|
|
278
435
|
return build_extraction_config_from_dict(config_dict)
|
279
436
|
|
280
437
|
|
281
|
-
def
|
282
|
-
"""
|
438
|
+
def discover_config(start_path: Path | str | None = None) -> ExtractionConfig | None:
|
439
|
+
"""Discover and load configuration, returning None if no config file found.
|
440
|
+
|
441
|
+
If a config file is found, attempts to load it. Any errors during loading will bubble up.
|
283
442
|
|
284
443
|
Args:
|
285
444
|
start_path: Directory to start searching from. Defaults to current working directory.
|
286
445
|
|
287
446
|
Returns:
|
288
|
-
ExtractionConfig instance or None if no configuration found.
|
289
|
-
"""
|
290
|
-
try:
|
291
|
-
return discover_and_load_config(start_path)
|
292
|
-
except ValidationError:
|
293
|
-
return None
|
447
|
+
ExtractionConfig instance or None if no configuration file found.
|
294
448
|
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
# Define common configuration fields to avoid repetition
|
299
|
-
_CONFIG_FIELDS = [
|
300
|
-
"force_ocr",
|
301
|
-
"chunk_content",
|
302
|
-
"extract_tables",
|
303
|
-
"max_chars",
|
304
|
-
"max_overlap",
|
305
|
-
"ocr_backend",
|
306
|
-
"extract_entities",
|
307
|
-
"extract_keywords",
|
308
|
-
"auto_detect_language",
|
309
|
-
"enable_quality_processing",
|
310
|
-
"auto_detect_document_type",
|
311
|
-
"document_type_confidence_threshold",
|
312
|
-
"document_classification_mode",
|
313
|
-
"keyword_count",
|
314
|
-
]
|
315
|
-
|
316
|
-
|
317
|
-
def _merge_file_config(config_dict: dict[str, Any], file_config: dict[str, Any]) -> None:
|
318
|
-
"""Merge file configuration into config dictionary."""
|
319
|
-
if not file_config:
|
320
|
-
return
|
321
|
-
|
322
|
-
for field in _CONFIG_FIELDS:
|
323
|
-
if field in file_config:
|
324
|
-
config_dict[field] = file_config[field]
|
325
|
-
|
326
|
-
|
327
|
-
def _merge_cli_args(config_dict: dict[str, Any], cli_args: MutableMapping[str, Any]) -> None:
|
328
|
-
"""Merge CLI arguments into config dictionary."""
|
329
|
-
for field in _CONFIG_FIELDS:
|
330
|
-
if field in cli_args and cli_args[field] is not None:
|
331
|
-
config_dict[field] = cli_args[field]
|
332
|
-
|
333
|
-
|
334
|
-
def _build_ocr_config_from_cli(
|
335
|
-
ocr_backend: str, cli_args: MutableMapping[str, Any]
|
336
|
-
) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
|
337
|
-
"""Build OCR config from CLI arguments."""
|
338
|
-
config_key = f"{ocr_backend}_config"
|
339
|
-
if not cli_args.get(config_key):
|
340
|
-
return None
|
341
|
-
|
342
|
-
backend_args = cli_args[config_key]
|
343
|
-
if ocr_backend == "tesseract":
|
344
|
-
return TesseractConfig(**backend_args)
|
345
|
-
if ocr_backend == "easyocr":
|
346
|
-
return EasyOCRConfig(**backend_args)
|
347
|
-
if ocr_backend == "paddleocr":
|
348
|
-
return PaddleOCRConfig(**backend_args)
|
349
|
-
return None
|
350
|
-
|
351
|
-
|
352
|
-
def _configure_ocr_backend(
|
353
|
-
config_dict: dict[str, Any],
|
354
|
-
file_config: dict[str, Any],
|
355
|
-
cli_args: MutableMapping[str, Any],
|
356
|
-
) -> None:
|
357
|
-
"""Configure OCR backend in config dictionary."""
|
358
|
-
ocr_backend = config_dict.get("ocr_backend")
|
359
|
-
if not ocr_backend or ocr_backend == "none":
|
360
|
-
return
|
361
|
-
|
362
|
-
# Try CLI config first, then file config
|
363
|
-
ocr_config = _build_ocr_config_from_cli(ocr_backend, cli_args)
|
364
|
-
if not ocr_config and file_config:
|
365
|
-
ocr_config = parse_ocr_backend_config(file_config, ocr_backend)
|
366
|
-
|
367
|
-
if ocr_config:
|
368
|
-
config_dict["ocr_config"] = ocr_config
|
369
|
-
|
370
|
-
|
371
|
-
def _configure_gmft(
|
372
|
-
config_dict: dict[str, Any],
|
373
|
-
file_config: dict[str, Any],
|
374
|
-
cli_args: MutableMapping[str, Any],
|
375
|
-
) -> None:
|
376
|
-
"""Configure GMFT in config dictionary."""
|
377
|
-
if not config_dict.get("extract_tables"):
|
378
|
-
return
|
379
|
-
|
380
|
-
gmft_config = None
|
381
|
-
if cli_args.get("gmft_config"):
|
382
|
-
gmft_config = GMFTConfig(**cli_args["gmft_config"])
|
383
|
-
elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
|
384
|
-
gmft_config = GMFTConfig(**file_config["gmft"])
|
385
|
-
|
386
|
-
if gmft_config:
|
387
|
-
config_dict["gmft_config"] = gmft_config
|
388
|
-
|
389
|
-
|
390
|
-
def build_extraction_config(
|
391
|
-
file_config: dict[str, Any],
|
392
|
-
cli_args: MutableMapping[str, Any],
|
393
|
-
) -> ExtractionConfig:
|
394
|
-
"""Build ExtractionConfig from file config and CLI arguments.
|
395
|
-
|
396
|
-
Args:
|
397
|
-
file_config: Configuration loaded from file.
|
398
|
-
cli_args: CLI arguments.
|
399
|
-
|
400
|
-
Returns:
|
401
|
-
ExtractionConfig instance.
|
449
|
+
Raises:
|
450
|
+
ValidationError: If a configuration file exists but is invalid.
|
402
451
|
"""
|
403
|
-
|
404
|
-
|
405
|
-
# Merge configurations: file first, then CLI overrides
|
406
|
-
_merge_file_config(config_dict, file_config)
|
407
|
-
_merge_cli_args(config_dict, cli_args)
|
408
|
-
|
409
|
-
# Configure complex components
|
410
|
-
_configure_ocr_backend(config_dict, file_config, cli_args)
|
411
|
-
_configure_gmft(config_dict, file_config, cli_args)
|
452
|
+
search_path = Path(start_path) if start_path else None
|
453
|
+
config_path = find_config_file(search_path)
|
412
454
|
|
413
|
-
|
414
|
-
|
415
|
-
config_dict["ocr_backend"] = None
|
455
|
+
if not config_path:
|
456
|
+
return None
|
416
457
|
|
417
|
-
|
458
|
+
config_dict = load_config_from_file(config_path)
|
459
|
+
if not config_dict:
|
460
|
+
return None
|
461
|
+
return build_extraction_config_from_dict(config_dict)
|
418
462
|
|
419
463
|
|
420
464
|
def find_default_config() -> Path | None:
|
@@ -51,10 +51,8 @@ def _get_translated_text(result: ExtractionResult) -> str:
|
|
51
51
|
Raises:
|
52
52
|
MissingDependencyError: If the deep-translator package is not installed
|
53
53
|
"""
|
54
|
-
# Combine content with metadata for classification
|
55
54
|
text_to_classify = result.content
|
56
55
|
if result.metadata:
|
57
|
-
# Add metadata values to the text for classification
|
58
56
|
metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
|
59
57
|
text_to_classify = f"{text_to_classify} {metadata_text}"
|
60
58
|
|
@@ -68,7 +66,6 @@ def _get_translated_text(result: ExtractionResult) -> str:
|
|
68
66
|
try:
|
69
67
|
return str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
|
70
68
|
except Exception: # noqa: BLE001
|
71
|
-
# Fall back to original content in lowercase if translation fails
|
72
69
|
return text_to_classify.lower()
|
73
70
|
|
74
71
|
|
@@ -131,13 +128,10 @@ def classify_document_from_layout(
|
|
131
128
|
if not all(col in layout_df.columns for col in ["text", "top", "height"]):
|
132
129
|
return None, None
|
133
130
|
|
134
|
-
# Use layout text for classification, not the content
|
135
131
|
layout_text = " ".join(layout_df["text"].astype(str).tolist())
|
136
132
|
|
137
|
-
# Translate layout text directly for classification
|
138
133
|
text_to_classify = layout_text
|
139
134
|
if result.metadata:
|
140
|
-
# Add metadata values to the text for classification
|
141
135
|
metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
|
142
136
|
text_to_classify = f"{text_to_classify} {metadata_text}"
|
143
137
|
|
@@ -146,7 +140,6 @@ def classify_document_from_layout(
|
|
146
140
|
|
147
141
|
translated_text = str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
|
148
142
|
except Exception: # noqa: BLE001
|
149
|
-
# Fall back to original content in lowercase if translation fails
|
150
143
|
translated_text = text_to_classify.lower()
|
151
144
|
|
152
145
|
layout_df["translated_text"] = translated_text
|
@@ -184,7 +177,6 @@ def auto_detect_document_type(
|
|
184
177
|
layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
|
185
178
|
result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
|
186
179
|
elif result.layout is not None and not result.layout.empty:
|
187
|
-
# Use layout-based classification if layout data is available
|
188
180
|
result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
|
189
181
|
else:
|
190
182
|
result.document_type, result.document_type_confidence = classify_document(result, config)
|