kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_config.py +248 -204
  5. kreuzberg/_document_classification.py +0 -8
  6. kreuzberg/_entity_extraction.py +1 -93
  7. kreuzberg/_extractors/_base.py +0 -5
  8. kreuzberg/_extractors/_email.py +1 -11
  9. kreuzberg/_extractors/_html.py +9 -12
  10. kreuzberg/_extractors/_image.py +1 -23
  11. kreuzberg/_extractors/_pandoc.py +10 -89
  12. kreuzberg/_extractors/_pdf.py +39 -92
  13. kreuzberg/_extractors/_presentation.py +0 -17
  14. kreuzberg/_extractors/_spread_sheet.py +13 -53
  15. kreuzberg/_extractors/_structured.py +1 -4
  16. kreuzberg/_gmft.py +14 -138
  17. kreuzberg/_language_detection.py +1 -22
  18. kreuzberg/_mcp/__init__.py +0 -2
  19. kreuzberg/_mcp/server.py +3 -10
  20. kreuzberg/_mime_types.py +1 -2
  21. kreuzberg/_ocr/_easyocr.py +21 -108
  22. kreuzberg/_ocr/_paddleocr.py +16 -94
  23. kreuzberg/_ocr/_table_extractor.py +260 -0
  24. kreuzberg/_ocr/_tesseract.py +906 -264
  25. kreuzberg/_playa.py +5 -4
  26. kreuzberg/_types.py +638 -40
  27. kreuzberg/_utils/_cache.py +88 -90
  28. kreuzberg/_utils/_device.py +0 -18
  29. kreuzberg/_utils/_document_cache.py +0 -2
  30. kreuzberg/_utils/_errors.py +0 -3
  31. kreuzberg/_utils/_pdf_lock.py +0 -2
  32. kreuzberg/_utils/_process_pool.py +19 -19
  33. kreuzberg/_utils/_quality.py +0 -43
  34. kreuzberg/_utils/_ref.py +48 -0
  35. kreuzberg/_utils/_serialization.py +0 -5
  36. kreuzberg/_utils/_string.py +9 -39
  37. kreuzberg/_utils/_sync.py +0 -1
  38. kreuzberg/_utils/_table.py +50 -57
  39. kreuzberg/cli.py +54 -74
  40. kreuzberg/extraction.py +39 -32
  41. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
  42. kreuzberg-3.13.0.dist-info/RECORD +56 -0
  43. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  44. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
  45. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_config.py CHANGED
@@ -1,10 +1,3 @@
1
- """Configuration discovery and loading for Kreuzberg.
2
-
3
- This module provides configuration loading from both kreuzberg.toml and pyproject.toml files.
4
- Configuration is automatically discovered by searching up the directory tree from the current
5
- working directory.
6
- """
7
-
8
1
  from __future__ import annotations
9
2
 
10
3
  import sys
@@ -16,16 +9,143 @@ if sys.version_info >= (3, 11):
16
9
  else: # pragma: no cover
17
10
  import tomli as tomllib # type: ignore[import-not-found]
18
11
 
19
- from kreuzberg._gmft import GMFTConfig
20
- from kreuzberg._ocr._easyocr import EasyOCRConfig
21
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig
22
- from kreuzberg._ocr._tesseract import TesseractConfig
23
- from kreuzberg._types import ExtractionConfig, OcrBackendType
12
+ from kreuzberg._types import (
13
+ EasyOCRConfig,
14
+ ExtractionConfig,
15
+ GMFTConfig,
16
+ HTMLToMarkdownConfig,
17
+ OcrBackendType,
18
+ PaddleOCRConfig,
19
+ PSMMode,
20
+ TesseractConfig,
21
+ )
24
22
  from kreuzberg.exceptions import ValidationError
25
23
 
26
24
  if TYPE_CHECKING:
27
25
  from collections.abc import MutableMapping
28
26
 
27
+ _CONFIG_FIELDS = [
28
+ "force_ocr",
29
+ "chunk_content",
30
+ "extract_tables",
31
+ "max_chars",
32
+ "max_overlap",
33
+ "ocr_backend",
34
+ "extract_entities",
35
+ "extract_keywords",
36
+ "auto_detect_language",
37
+ "enable_quality_processing",
38
+ "auto_detect_document_type",
39
+ "document_type_confidence_threshold",
40
+ "document_classification_mode",
41
+ "keyword_count",
42
+ ]
43
+
44
+ _VALID_OCR_BACKENDS = {"tesseract", "easyocr", "paddleocr"}
45
+
46
+
47
+ def _merge_file_config(config_dict: dict[str, Any], file_config: dict[str, Any]) -> None:
48
+ if not file_config:
49
+ return
50
+ for field in _CONFIG_FIELDS:
51
+ if field in file_config:
52
+ config_dict[field] = file_config[field]
53
+
54
+
55
+ def _merge_cli_args(config_dict: dict[str, Any], cli_args: MutableMapping[str, Any]) -> None:
56
+ for field in _CONFIG_FIELDS:
57
+ if field in cli_args and cli_args[field] is not None:
58
+ config_dict[field] = cli_args[field]
59
+
60
+
61
+ def _build_ocr_config_from_cli(
62
+ ocr_backend: str, cli_args: MutableMapping[str, Any]
63
+ ) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
64
+ config_key = f"{ocr_backend}_config"
65
+ if not cli_args.get(config_key):
66
+ return None
67
+
68
+ backend_args = cli_args[config_key]
69
+ try:
70
+ match ocr_backend:
71
+ case "tesseract":
72
+ return TesseractConfig(**backend_args)
73
+ case "easyocr":
74
+ return EasyOCRConfig(**backend_args)
75
+ case "paddleocr":
76
+ return PaddleOCRConfig(**backend_args)
77
+ case _:
78
+ return None
79
+ except (TypeError, ValueError) as e:
80
+ raise ValidationError(
81
+ f"Invalid {ocr_backend} configuration from CLI: {e}",
82
+ context={"backend": ocr_backend, "config": backend_args, "error": str(e)},
83
+ ) from e
84
+
85
+
86
+ def _configure_ocr_backend(
87
+ config_dict: dict[str, Any],
88
+ file_config: dict[str, Any],
89
+ cli_args: MutableMapping[str, Any],
90
+ ) -> None:
91
+ ocr_backend = config_dict.get("ocr_backend")
92
+ if not ocr_backend or ocr_backend == "none":
93
+ return
94
+
95
+ ocr_config = _build_ocr_config_from_cli(ocr_backend, cli_args)
96
+ if not ocr_config and file_config:
97
+ ocr_config = parse_ocr_backend_config(file_config, ocr_backend)
98
+
99
+ if ocr_config:
100
+ config_dict["ocr_config"] = ocr_config
101
+
102
+
103
+ def _configure_gmft(
104
+ config_dict: dict[str, Any],
105
+ file_config: dict[str, Any],
106
+ cli_args: MutableMapping[str, Any],
107
+ ) -> None:
108
+ if not config_dict.get("extract_tables"):
109
+ return
110
+
111
+ gmft_config = None
112
+ try:
113
+ if cli_args.get("gmft_config"):
114
+ gmft_config = GMFTConfig(**cli_args["gmft_config"])
115
+ elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
116
+ gmft_config = GMFTConfig(**file_config["gmft"])
117
+ except (TypeError, ValueError) as e:
118
+ raise ValidationError(
119
+ f"Invalid GMFT configuration: {e}",
120
+ context={"gmft_config": cli_args.get("gmft_config") or file_config.get("gmft"), "error": str(e)},
121
+ ) from e
122
+
123
+ if gmft_config:
124
+ config_dict["gmft_config"] = gmft_config
125
+
126
+
127
+ def _create_ocr_config(
128
+ backend: str, backend_config: dict[str, Any]
129
+ ) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig:
130
+ match backend:
131
+ case "tesseract":
132
+ processed_config = backend_config.copy()
133
+ if "psm" in processed_config and isinstance(processed_config["psm"], int):
134
+ try:
135
+ processed_config["psm"] = PSMMode(processed_config["psm"])
136
+ except ValueError as e:
137
+ raise ValidationError(
138
+ f"Invalid PSM mode value: {processed_config['psm']}",
139
+ context={"psm_value": processed_config["psm"], "error": str(e)},
140
+ ) from e
141
+ return TesseractConfig(**processed_config)
142
+ case "easyocr":
143
+ return EasyOCRConfig(**backend_config)
144
+ case "paddleocr":
145
+ return PaddleOCRConfig(**backend_config)
146
+ case _:
147
+ raise ValueError(f"Unknown backend: {backend}")
148
+
29
149
 
30
150
  def load_config_from_file(config_path: Path) -> dict[str, Any]:
31
151
  """Load configuration from a TOML file.
@@ -47,15 +167,12 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
47
167
  except tomllib.TOMLDecodeError as e:
48
168
  raise ValidationError(f"Invalid TOML in configuration file: {e}") from e
49
169
 
50
- # Handle both kreuzberg.toml (root level) and pyproject.toml ([tool.kreuzberg])
51
170
  if config_path.name == "kreuzberg.toml":
52
171
  return data # type: ignore[no-any-return]
53
172
 
54
- # For other files, check if they have [tool.kreuzberg] section
55
- if config_path.name == "pyproject.toml" or ("tool" in data and "kreuzberg" in data.get("tool", {})):
173
+ if config_path.name == "pyproject.toml":
56
174
  return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
57
175
 
58
- # Otherwise assume root-level configuration
59
176
  return data # type: ignore[no-any-return]
60
177
 
61
178
 
@@ -89,29 +206,27 @@ def parse_ocr_backend_config(
89
206
 
90
207
  Returns:
91
208
  Backend-specific configuration object or None.
209
+
210
+ Raises:
211
+ ValidationError: If the backend configuration is invalid.
92
212
  """
93
213
  if backend not in config_dict:
94
214
  return None
95
215
 
96
216
  backend_config = config_dict[backend]
97
217
  if not isinstance(backend_config, dict):
98
- return None
99
-
100
- match backend:
101
- case "tesseract":
102
- # Convert psm integer to PSMMode enum if needed
103
- processed_config = backend_config.copy()
104
- if "psm" in processed_config and isinstance(processed_config["psm"], int):
105
- from kreuzberg._ocr._tesseract import PSMMode # noqa: PLC0415
218
+ raise ValidationError(
219
+ f"Invalid configuration for OCR backend '{backend}': expected dict, got {type(backend_config).__name__}",
220
+ context={"backend": backend, "config_type": type(backend_config).__name__},
221
+ )
106
222
 
107
- processed_config["psm"] = PSMMode(processed_config["psm"])
108
- return TesseractConfig(**processed_config)
109
- case "easyocr":
110
- return EasyOCRConfig(**backend_config)
111
- case "paddleocr":
112
- return PaddleOCRConfig(**backend_config)
113
- case _:
114
- return None
223
+ try:
224
+ return _create_ocr_config(backend, backend_config)
225
+ except (TypeError, ValueError) as e:
226
+ raise ValidationError(
227
+ f"Invalid configuration for OCR backend '{backend}': {e}",
228
+ context={"backend": backend, "config": backend_config, "error": str(e)},
229
+ ) from e
115
230
 
116
231
 
117
232
  def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
@@ -122,53 +237,87 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
122
237
 
123
238
  Returns:
124
239
  ExtractionConfig instance.
240
+
241
+ Raises:
242
+ ValidationError: If the configuration is invalid.
125
243
  """
126
- extraction_config: dict[str, Any] = {}
127
-
128
- # Copy basic configuration fields using dictionary comprehension
129
- basic_fields = {
130
- "force_ocr",
131
- "chunk_content",
132
- "extract_tables",
133
- "max_chars",
134
- "max_overlap",
135
- "ocr_backend",
136
- "extract_entities",
137
- "extract_keywords",
138
- "auto_detect_language",
139
- "enable_quality_processing",
140
- "auto_detect_document_type",
141
- "document_type_confidence_threshold",
142
- "document_classification_mode",
143
- "keyword_count",
144
- }
145
- extraction_config = extraction_config | {
146
- field: config_dict[field] for field in basic_fields if field in config_dict
147
- }
148
-
149
- # Handle OCR backend configuration
244
+ extraction_config: dict[str, Any] = {field: config_dict[field] for field in _CONFIG_FIELDS if field in config_dict}
245
+
150
246
  ocr_backend = extraction_config.get("ocr_backend")
151
247
  if ocr_backend and ocr_backend != "none":
152
- # Validate OCR backend
153
- valid_backends = {"tesseract", "easyocr", "paddleocr"}
154
- if ocr_backend not in valid_backends:
248
+ if ocr_backend not in _VALID_OCR_BACKENDS:
155
249
  raise ValidationError(
156
- f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(valid_backends))} or 'none'",
157
- context={"provided": ocr_backend, "valid": sorted(valid_backends)},
250
+ f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(_VALID_OCR_BACKENDS))} or 'none'",
251
+ context={"provided": ocr_backend, "valid": sorted(_VALID_OCR_BACKENDS)},
158
252
  )
159
253
  ocr_config = parse_ocr_backend_config(config_dict, ocr_backend)
160
254
  if ocr_config:
161
255
  extraction_config["ocr_config"] = ocr_config
162
256
 
163
- # Handle GMFT configuration for table extraction
164
257
  if extraction_config.get("extract_tables") and "gmft" in config_dict and isinstance(config_dict["gmft"], dict):
165
- extraction_config["gmft_config"] = GMFTConfig(**config_dict["gmft"])
258
+ try:
259
+ extraction_config["gmft_config"] = GMFTConfig(**config_dict["gmft"])
260
+ except (TypeError, ValueError) as e:
261
+ raise ValidationError(
262
+ f"Invalid GMFT configuration: {e}",
263
+ context={"gmft_config": config_dict["gmft"], "error": str(e)},
264
+ ) from e
265
+
266
+ if "html_to_markdown" in config_dict and isinstance(config_dict["html_to_markdown"], dict):
267
+ try:
268
+ extraction_config["html_to_markdown_config"] = HTMLToMarkdownConfig(**config_dict["html_to_markdown"])
269
+ except (TypeError, ValueError) as e:
270
+ raise ValidationError(
271
+ f"Invalid HTML to Markdown configuration: {e}",
272
+ context={"html_to_markdown_config": config_dict["html_to_markdown"], "error": str(e)},
273
+ ) from e
166
274
 
167
- # Convert "none" to None for ocr_backend
168
275
  if extraction_config.get("ocr_backend") == "none":
169
276
  extraction_config["ocr_backend"] = None
170
277
 
171
- return ExtractionConfig(**extraction_config)
278
+ try:
279
+ return ExtractionConfig(**extraction_config)
280
+ except (TypeError, ValueError) as e:
281
+ raise ValidationError(
282
+ f"Invalid extraction configuration: {e}",
283
+ context={"config": extraction_config, "error": str(e)},
284
+ ) from e
285
+
286
+
287
+ def build_extraction_config(
288
+ file_config: dict[str, Any],
289
+ cli_args: MutableMapping[str, Any],
290
+ ) -> ExtractionConfig:
291
+ """Build ExtractionConfig from file config and CLI arguments.
292
+
293
+ Args:
294
+ file_config: Configuration loaded from file.
295
+ cli_args: CLI arguments.
296
+
297
+ Returns:
298
+ ExtractionConfig instance.
299
+
300
+ Raises:
301
+ ValidationError: If the combined configuration is invalid.
302
+ """
303
+ config_dict: dict[str, Any] = {}
304
+
305
+ _merge_file_config(config_dict, file_config)
306
+ _merge_cli_args(config_dict, cli_args)
307
+
308
+ _configure_ocr_backend(config_dict, file_config, cli_args)
309
+ _configure_gmft(config_dict, file_config, cli_args)
310
+
311
+ if config_dict.get("ocr_backend") == "none":
312
+ config_dict["ocr_backend"] = None
313
+
314
+ try:
315
+ return ExtractionConfig(**config_dict)
316
+ except (TypeError, ValueError) as e:
317
+ raise ValidationError(
318
+ f"Invalid extraction configuration: {e}",
319
+ context={"config": config_dict, "error": str(e)},
320
+ ) from e
172
321
 
173
322
 
174
323
  def find_config_file(start_path: Path | None = None) -> Path | None:
@@ -183,16 +332,17 @@ def find_config_file(start_path: Path | None = None) -> Path | None:
183
332
 
184
333
  Returns:
185
334
  Path to the configuration file or None if not found.
335
+
336
+ Raises:
337
+ ValidationError: If a config file exists but cannot be read or has invalid TOML.
186
338
  """
187
339
  current = start_path or Path.cwd()
188
340
 
189
341
  while current != current.parent:
190
- # First, look for kreuzberg.toml
191
342
  kreuzberg_toml = current / "kreuzberg.toml"
192
343
  if kreuzberg_toml.exists():
193
344
  return kreuzberg_toml
194
345
 
195
- # Then, look for pyproject.toml with [tool.kreuzberg] section
196
346
  pyproject_toml = current / "pyproject.toml"
197
347
  if pyproject_toml.exists():
198
348
  try:
@@ -200,8 +350,16 @@ def find_config_file(start_path: Path | None = None) -> Path | None:
200
350
  data = tomllib.load(f)
201
351
  if "tool" in data and "kreuzberg" in data["tool"]:
202
352
  return pyproject_toml
203
- except Exception: # noqa: BLE001
204
- pass
353
+ except OSError as e:
354
+ raise ValidationError(
355
+ f"Failed to read pyproject.toml: {e}",
356
+ context={"file": str(pyproject_toml), "error": str(e)},
357
+ ) from e
358
+ except tomllib.TOMLDecodeError as e:
359
+ raise ValidationError(
360
+ f"Invalid TOML in pyproject.toml: {e}",
361
+ context={"file": str(pyproject_toml), "error": str(e)},
362
+ ) from e
205
363
 
206
364
  current = current.parent
207
365
  return None
@@ -215,19 +373,18 @@ def load_default_config(start_path: Path | None = None) -> ExtractionConfig | No
215
373
 
216
374
  Returns:
217
375
  ExtractionConfig instance or None if no configuration found.
376
+
377
+ Raises:
378
+ ValidationError: If configuration file exists but contains invalid configuration.
218
379
  """
219
380
  config_path = find_config_file(start_path)
220
381
  if not config_path:
221
382
  return None
222
383
 
223
- try:
224
- config_dict = load_config_from_file(config_path)
225
- if not config_dict:
226
- return None
227
- return build_extraction_config_from_dict(config_dict)
228
- except Exception: # noqa: BLE001
229
- # Silently ignore configuration errors for default loading
384
+ config_dict = load_config_from_file(config_path)
385
+ if not config_dict:
230
386
  return None
387
+ return build_extraction_config_from_dict(config_dict)
231
388
 
232
389
 
233
390
  def load_config_from_path(config_path: Path | str) -> ExtractionConfig:
@@ -278,143 +435,30 @@ def discover_and_load_config(start_path: Path | str | None = None) -> Extraction
278
435
  return build_extraction_config_from_dict(config_dict)
279
436
 
280
437
 
281
- def try_discover_config(start_path: Path | str | None = None) -> ExtractionConfig | None:
282
- """Try to discover and load configuration, returning None if not found.
438
+ def discover_config(start_path: Path | str | None = None) -> ExtractionConfig | None:
439
+ """Discover and load configuration, returning None if no config file found.
440
+
441
+ If a config file is found, attempts to load it. Any errors during loading will bubble up.
283
442
 
284
443
  Args:
285
444
  start_path: Directory to start searching from. Defaults to current working directory.
286
445
 
287
446
  Returns:
288
- ExtractionConfig instance or None if no configuration found.
289
- """
290
- try:
291
- return discover_and_load_config(start_path)
292
- except ValidationError:
293
- return None
447
+ ExtractionConfig instance or None if no configuration file found.
294
448
 
295
-
296
- # Legacy functions for backward compatibility with CLI
297
-
298
- # Define common configuration fields to avoid repetition
299
- _CONFIG_FIELDS = [
300
- "force_ocr",
301
- "chunk_content",
302
- "extract_tables",
303
- "max_chars",
304
- "max_overlap",
305
- "ocr_backend",
306
- "extract_entities",
307
- "extract_keywords",
308
- "auto_detect_language",
309
- "enable_quality_processing",
310
- "auto_detect_document_type",
311
- "document_type_confidence_threshold",
312
- "document_classification_mode",
313
- "keyword_count",
314
- ]
315
-
316
-
317
- def _merge_file_config(config_dict: dict[str, Any], file_config: dict[str, Any]) -> None:
318
- """Merge file configuration into config dictionary."""
319
- if not file_config:
320
- return
321
-
322
- for field in _CONFIG_FIELDS:
323
- if field in file_config:
324
- config_dict[field] = file_config[field]
325
-
326
-
327
- def _merge_cli_args(config_dict: dict[str, Any], cli_args: MutableMapping[str, Any]) -> None:
328
- """Merge CLI arguments into config dictionary."""
329
- for field in _CONFIG_FIELDS:
330
- if field in cli_args and cli_args[field] is not None:
331
- config_dict[field] = cli_args[field]
332
-
333
-
334
- def _build_ocr_config_from_cli(
335
- ocr_backend: str, cli_args: MutableMapping[str, Any]
336
- ) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
337
- """Build OCR config from CLI arguments."""
338
- config_key = f"{ocr_backend}_config"
339
- if not cli_args.get(config_key):
340
- return None
341
-
342
- backend_args = cli_args[config_key]
343
- if ocr_backend == "tesseract":
344
- return TesseractConfig(**backend_args)
345
- if ocr_backend == "easyocr":
346
- return EasyOCRConfig(**backend_args)
347
- if ocr_backend == "paddleocr":
348
- return PaddleOCRConfig(**backend_args)
349
- return None
350
-
351
-
352
- def _configure_ocr_backend(
353
- config_dict: dict[str, Any],
354
- file_config: dict[str, Any],
355
- cli_args: MutableMapping[str, Any],
356
- ) -> None:
357
- """Configure OCR backend in config dictionary."""
358
- ocr_backend = config_dict.get("ocr_backend")
359
- if not ocr_backend or ocr_backend == "none":
360
- return
361
-
362
- # Try CLI config first, then file config
363
- ocr_config = _build_ocr_config_from_cli(ocr_backend, cli_args)
364
- if not ocr_config and file_config:
365
- ocr_config = parse_ocr_backend_config(file_config, ocr_backend)
366
-
367
- if ocr_config:
368
- config_dict["ocr_config"] = ocr_config
369
-
370
-
371
- def _configure_gmft(
372
- config_dict: dict[str, Any],
373
- file_config: dict[str, Any],
374
- cli_args: MutableMapping[str, Any],
375
- ) -> None:
376
- """Configure GMFT in config dictionary."""
377
- if not config_dict.get("extract_tables"):
378
- return
379
-
380
- gmft_config = None
381
- if cli_args.get("gmft_config"):
382
- gmft_config = GMFTConfig(**cli_args["gmft_config"])
383
- elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
384
- gmft_config = GMFTConfig(**file_config["gmft"])
385
-
386
- if gmft_config:
387
- config_dict["gmft_config"] = gmft_config
388
-
389
-
390
- def build_extraction_config(
391
- file_config: dict[str, Any],
392
- cli_args: MutableMapping[str, Any],
393
- ) -> ExtractionConfig:
394
- """Build ExtractionConfig from file config and CLI arguments.
395
-
396
- Args:
397
- file_config: Configuration loaded from file.
398
- cli_args: CLI arguments.
399
-
400
- Returns:
401
- ExtractionConfig instance.
449
+ Raises:
450
+ ValidationError: If a configuration file exists but is invalid.
402
451
  """
403
- config_dict: dict[str, Any] = {}
404
-
405
- # Merge configurations: file first, then CLI overrides
406
- _merge_file_config(config_dict, file_config)
407
- _merge_cli_args(config_dict, cli_args)
408
-
409
- # Configure complex components
410
- _configure_ocr_backend(config_dict, file_config, cli_args)
411
- _configure_gmft(config_dict, file_config, cli_args)
452
+ search_path = Path(start_path) if start_path else None
453
+ config_path = find_config_file(search_path)
412
454
 
413
- # Convert "none" to None for ocr_backend
414
- if config_dict.get("ocr_backend") == "none":
415
- config_dict["ocr_backend"] = None
455
+ if not config_path:
456
+ return None
416
457
 
417
- return ExtractionConfig(**config_dict)
458
+ config_dict = load_config_from_file(config_path)
459
+ if not config_dict:
460
+ return None
461
+ return build_extraction_config_from_dict(config_dict)
418
462
 
419
463
 
420
464
  def find_default_config() -> Path | None:
kreuzberg/_document_classification.py CHANGED
@@ -51,10 +51,8 @@ def _get_translated_text(result: ExtractionResult) -> str:
51
51
  Raises:
52
52
  MissingDependencyError: If the deep-translator package is not installed
53
53
  """
54
- # Combine content with metadata for classification
55
54
  text_to_classify = result.content
56
55
  if result.metadata:
57
- # Add metadata values to the text for classification
58
56
  metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
59
57
  text_to_classify = f"{text_to_classify} {metadata_text}"
60
58
 
@@ -68,7 +66,6 @@ def _get_translated_text(result: ExtractionResult) -> str:
68
66
  try:
69
67
  return str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
70
68
  except Exception: # noqa: BLE001
71
- # Fall back to original content in lowercase if translation fails
72
69
  return text_to_classify.lower()
73
70
 
74
71
 
@@ -131,13 +128,10 @@ def classify_document_from_layout(
131
128
  if not all(col in layout_df.columns for col in ["text", "top", "height"]):
132
129
  return None, None
133
130
 
134
- # Use layout text for classification, not the content
135
131
  layout_text = " ".join(layout_df["text"].astype(str).tolist())
136
132
 
137
- # Translate layout text directly for classification
138
133
  text_to_classify = layout_text
139
134
  if result.metadata:
140
- # Add metadata values to the text for classification
141
135
  metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
142
136
  text_to_classify = f"{text_to_classify} {metadata_text}"
143
137
 
@@ -146,7 +140,6 @@ def classify_document_from_layout(
146
140
 
147
141
  translated_text = str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
148
142
  except Exception: # noqa: BLE001
149
- # Fall back to original content in lowercase if translation fails
150
143
  translated_text = text_to_classify.lower()
151
144
 
152
145
  layout_df["translated_text"] = translated_text
@@ -184,7 +177,6 @@ def auto_detect_document_type(
184
177
  layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
185
178
  result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
186
179
  elif result.layout is not None and not result.layout.empty:
187
- # Use layout-based classification if layout data is available
188
180
  result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
189
181
  else:
190
182
  result.document_type, result.document_type_confidence = classify_document(result, config)