kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_chunker.py +0 -15
  5. kreuzberg/_config.py +212 -292
  6. kreuzberg/_document_classification.py +20 -47
  7. kreuzberg/_entity_extraction.py +1 -122
  8. kreuzberg/_extractors/_base.py +4 -71
  9. kreuzberg/_extractors/_email.py +1 -15
  10. kreuzberg/_extractors/_html.py +9 -12
  11. kreuzberg/_extractors/_image.py +1 -25
  12. kreuzberg/_extractors/_pandoc.py +10 -147
  13. kreuzberg/_extractors/_pdf.py +38 -94
  14. kreuzberg/_extractors/_presentation.py +0 -99
  15. kreuzberg/_extractors/_spread_sheet.py +13 -55
  16. kreuzberg/_extractors/_structured.py +1 -4
  17. kreuzberg/_gmft.py +14 -199
  18. kreuzberg/_language_detection.py +1 -36
  19. kreuzberg/_mcp/__init__.py +0 -2
  20. kreuzberg/_mcp/server.py +3 -10
  21. kreuzberg/_mime_types.py +1 -19
  22. kreuzberg/_ocr/_base.py +4 -76
  23. kreuzberg/_ocr/_easyocr.py +124 -186
  24. kreuzberg/_ocr/_paddleocr.py +154 -224
  25. kreuzberg/_ocr/_table_extractor.py +184 -0
  26. kreuzberg/_ocr/_tesseract.py +797 -361
  27. kreuzberg/_playa.py +5 -31
  28. kreuzberg/_registry.py +0 -36
  29. kreuzberg/_types.py +588 -93
  30. kreuzberg/_utils/_cache.py +84 -138
  31. kreuzberg/_utils/_device.py +0 -74
  32. kreuzberg/_utils/_document_cache.py +0 -75
  33. kreuzberg/_utils/_errors.py +0 -50
  34. kreuzberg/_utils/_ocr_cache.py +136 -0
  35. kreuzberg/_utils/_pdf_lock.py +0 -16
  36. kreuzberg/_utils/_process_pool.py +17 -64
  37. kreuzberg/_utils/_quality.py +0 -60
  38. kreuzberg/_utils/_ref.py +32 -0
  39. kreuzberg/_utils/_serialization.py +0 -30
  40. kreuzberg/_utils/_string.py +9 -59
  41. kreuzberg/_utils/_sync.py +0 -77
  42. kreuzberg/_utils/_table.py +49 -101
  43. kreuzberg/_utils/_tmp.py +0 -9
  44. kreuzberg/cli.py +54 -74
  45. kreuzberg/extraction.py +39 -32
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
  47. kreuzberg-3.13.1.dist-info/RECORD +57 -0
  48. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  49. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
  50. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
  51. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_config.py CHANGED
@@ -1,10 +1,3 @@
1
- """Configuration discovery and loading for Kreuzberg.
2
-
3
- This module provides configuration loading from both kreuzberg.toml and pyproject.toml files.
4
- Configuration is automatically discovered by searching up the directory tree from the current
5
- working directory.
6
- """
7
-
8
1
  from __future__ import annotations
9
2
 
10
3
  import sys
@@ -16,29 +9,145 @@ if sys.version_info >= (3, 11):
16
9
  else: # pragma: no cover
17
10
  import tomli as tomllib # type: ignore[import-not-found]
18
11
 
19
- from kreuzberg._gmft import GMFTConfig
20
- from kreuzberg._ocr._easyocr import EasyOCRConfig
21
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig
22
- from kreuzberg._ocr._tesseract import TesseractConfig
23
- from kreuzberg._types import ExtractionConfig, OcrBackendType
12
+ from kreuzberg._types import (
13
+ EasyOCRConfig,
14
+ ExtractionConfig,
15
+ GMFTConfig,
16
+ HTMLToMarkdownConfig,
17
+ OcrBackendType,
18
+ PaddleOCRConfig,
19
+ PSMMode,
20
+ TesseractConfig,
21
+ )
24
22
  from kreuzberg.exceptions import ValidationError
25
23
 
26
24
  if TYPE_CHECKING:
27
25
  from collections.abc import MutableMapping
28
26
 
27
+ _CONFIG_FIELDS = [
28
+ "force_ocr",
29
+ "chunk_content",
30
+ "extract_tables",
31
+ "max_chars",
32
+ "max_overlap",
33
+ "ocr_backend",
34
+ "extract_entities",
35
+ "extract_keywords",
36
+ "auto_detect_language",
37
+ "enable_quality_processing",
38
+ "auto_detect_document_type",
39
+ "document_type_confidence_threshold",
40
+ "document_classification_mode",
41
+ "keyword_count",
42
+ ]
29
43
 
30
- def load_config_from_file(config_path: Path) -> dict[str, Any]:
31
- """Load configuration from a TOML file.
44
+ _VALID_OCR_BACKENDS = {"tesseract", "easyocr", "paddleocr"}
45
+
46
+
47
+ def _merge_file_config(config_dict: dict[str, Any], file_config: dict[str, Any]) -> None:
48
+ if not file_config:
49
+ return
50
+ for field in _CONFIG_FIELDS:
51
+ if field in file_config:
52
+ config_dict[field] = file_config[field]
53
+
54
+
55
+ def _merge_cli_args(config_dict: dict[str, Any], cli_args: MutableMapping[str, Any]) -> None:
56
+ for field in _CONFIG_FIELDS:
57
+ if field in cli_args and cli_args[field] is not None:
58
+ config_dict[field] = cli_args[field]
59
+
60
+
61
+ def _build_ocr_config_from_cli(
62
+ ocr_backend: str, cli_args: MutableMapping[str, Any]
63
+ ) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
64
+ config_key = f"{ocr_backend}_config"
65
+ if not cli_args.get(config_key):
66
+ return None
67
+
68
+ backend_args = cli_args[config_key]
69
+ try:
70
+ match ocr_backend:
71
+ case "tesseract":
72
+ return TesseractConfig(**backend_args)
73
+ case "easyocr":
74
+ return EasyOCRConfig(**backend_args)
75
+ case "paddleocr":
76
+ return PaddleOCRConfig(**backend_args)
77
+ case _:
78
+ return None
79
+ except (TypeError, ValueError) as e:
80
+ raise ValidationError(
81
+ f"Invalid {ocr_backend} configuration from CLI: {e}",
82
+ context={"backend": ocr_backend, "config": backend_args, "error": str(e)},
83
+ ) from e
84
+
85
+
86
+ def _configure_ocr_backend(
87
+ config_dict: dict[str, Any],
88
+ file_config: dict[str, Any],
89
+ cli_args: MutableMapping[str, Any],
90
+ ) -> None:
91
+ ocr_backend = config_dict.get("ocr_backend")
92
+ if not ocr_backend or ocr_backend == "none":
93
+ return
94
+
95
+ ocr_config = _build_ocr_config_from_cli(ocr_backend, cli_args)
96
+ if not ocr_config and file_config:
97
+ ocr_config = parse_ocr_backend_config(file_config, ocr_backend)
98
+
99
+ if ocr_config:
100
+ config_dict["ocr_config"] = ocr_config
32
101
 
33
- Args:
34
- config_path: Path to the configuration file.
35
102
 
36
- Returns:
37
- Dictionary containing the loaded configuration.
103
+ def _configure_gmft(
104
+ config_dict: dict[str, Any],
105
+ file_config: dict[str, Any],
106
+ cli_args: MutableMapping[str, Any],
107
+ ) -> None:
108
+ if not config_dict.get("extract_tables"):
109
+ return
38
110
 
39
- Raises:
40
- ValidationError: If the file cannot be read or parsed.
41
- """
111
+ gmft_config = None
112
+ try:
113
+ if cli_args.get("gmft_config"):
114
+ gmft_config = GMFTConfig(**cli_args["gmft_config"])
115
+ elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
116
+ gmft_config = GMFTConfig(**file_config["gmft"])
117
+ except (TypeError, ValueError) as e:
118
+ raise ValidationError(
119
+ f"Invalid GMFT configuration: {e}",
120
+ context={"gmft_config": cli_args.get("gmft_config") or file_config.get("gmft"), "error": str(e)},
121
+ ) from e
122
+
123
+ if gmft_config:
124
+ config_dict["gmft_config"] = gmft_config
125
+
126
+
127
+ def _create_ocr_config(
128
+ backend: str, backend_config: dict[str, Any]
129
+ ) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig:
130
+ match backend:
131
+ case "tesseract":
132
+ processed_config = backend_config.copy()
133
+ if "psm" in processed_config and isinstance(processed_config["psm"], int):
134
+ try:
135
+ processed_config["psm"] = PSMMode(processed_config["psm"])
136
+ except ValueError as e:
137
+ raise ValidationError(
138
+ f"Invalid PSM mode value: {processed_config['psm']}",
139
+ context={"psm_value": processed_config["psm"], "error": str(e)},
140
+ ) from e
141
+ return TesseractConfig(**processed_config)
142
+ case "easyocr":
143
+ return EasyOCRConfig(**backend_config)
144
+ case "paddleocr":
145
+ return PaddleOCRConfig(**backend_config)
146
+ case _:
147
+ raise ValueError(f"Unknown backend: {backend}")
148
+
149
+
150
+ def load_config_from_file(config_path: Path) -> dict[str, Any]:
42
151
  try:
43
152
  with config_path.open("rb") as f:
44
153
  data = tomllib.load(f)
@@ -47,28 +156,16 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
47
156
  except tomllib.TOMLDecodeError as e:
48
157
  raise ValidationError(f"Invalid TOML in configuration file: {e}") from e
49
158
 
50
- # Handle both kreuzberg.toml (root level) and pyproject.toml ([tool.kreuzberg])
51
159
  if config_path.name == "kreuzberg.toml":
52
160
  return data # type: ignore[no-any-return]
53
161
 
54
- # For other files, check if they have [tool.kreuzberg] section
55
- if config_path.name == "pyproject.toml" or ("tool" in data and "kreuzberg" in data.get("tool", {})):
162
+ if config_path.name == "pyproject.toml":
56
163
  return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
57
164
 
58
- # Otherwise assume root-level configuration
59
165
  return data # type: ignore[no-any-return]
60
166
 
61
167
 
62
168
  def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
63
- """Merge two configuration dictionaries recursively.
64
-
65
- Args:
66
- base: Base configuration dictionary.
67
- override: Configuration dictionary to override base values.
68
-
69
- Returns:
70
- Merged configuration dictionary.
71
- """
72
169
  result = base.copy()
73
170
  for key, value in override.items():
74
171
  if isinstance(value, dict) and key in result and isinstance(result[key], dict):
@@ -81,118 +178,101 @@ def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, A
81
178
  def parse_ocr_backend_config(
82
179
  config_dict: dict[str, Any], backend: OcrBackendType
83
180
  ) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
84
- """Parse OCR backend-specific configuration.
85
-
86
- Args:
87
- config_dict: Configuration dictionary.
88
- backend: The OCR backend type.
89
-
90
- Returns:
91
- Backend-specific configuration object or None.
92
- """
93
181
  if backend not in config_dict:
94
182
  return None
95
183
 
96
184
  backend_config = config_dict[backend]
97
185
  if not isinstance(backend_config, dict):
98
- return None
99
-
100
- match backend:
101
- case "tesseract":
102
- # Convert psm integer to PSMMode enum if needed
103
- processed_config = backend_config.copy()
104
- if "psm" in processed_config and isinstance(processed_config["psm"], int):
105
- from kreuzberg._ocr._tesseract import PSMMode # noqa: PLC0415
186
+ raise ValidationError(
187
+ f"Invalid configuration for OCR backend '{backend}': expected dict, got {type(backend_config).__name__}",
188
+ context={"backend": backend, "config_type": type(backend_config).__name__},
189
+ )
106
190
 
107
- processed_config["psm"] = PSMMode(processed_config["psm"])
108
- return TesseractConfig(**processed_config)
109
- case "easyocr":
110
- return EasyOCRConfig(**backend_config)
111
- case "paddleocr":
112
- return PaddleOCRConfig(**backend_config)
113
- case _:
114
- return None
191
+ try:
192
+ return _create_ocr_config(backend, backend_config)
193
+ except (TypeError, ValueError) as e:
194
+ raise ValidationError(
195
+ f"Invalid configuration for OCR backend '{backend}': {e}",
196
+ context={"backend": backend, "config": backend_config, "error": str(e)},
197
+ ) from e
115
198
 
116
199
 
117
200
  def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
118
- """Build ExtractionConfig from a configuration dictionary.
119
-
120
- Args:
121
- config_dict: Configuration dictionary from TOML file.
122
-
123
- Returns:
124
- ExtractionConfig instance.
125
- """
126
- extraction_config: dict[str, Any] = {}
127
-
128
- # Copy basic configuration fields using dictionary comprehension
129
- basic_fields = {
130
- "force_ocr",
131
- "chunk_content",
132
- "extract_tables",
133
- "max_chars",
134
- "max_overlap",
135
- "ocr_backend",
136
- "extract_entities",
137
- "extract_keywords",
138
- "auto_detect_language",
139
- "enable_quality_processing",
140
- "auto_detect_document_type",
141
- "document_type_confidence_threshold",
142
- "document_classification_mode",
143
- "keyword_count",
144
- }
145
- extraction_config = extraction_config | {
146
- field: config_dict[field] for field in basic_fields if field in config_dict
147
- }
148
-
149
- # Handle OCR backend configuration
201
+ extraction_config: dict[str, Any] = {field: config_dict[field] for field in _CONFIG_FIELDS if field in config_dict}
202
+
150
203
  ocr_backend = extraction_config.get("ocr_backend")
151
204
  if ocr_backend and ocr_backend != "none":
152
- # Validate OCR backend
153
- valid_backends = {"tesseract", "easyocr", "paddleocr"}
154
- if ocr_backend not in valid_backends:
205
+ if ocr_backend not in _VALID_OCR_BACKENDS:
155
206
  raise ValidationError(
156
- f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(valid_backends))} or 'none'",
157
- context={"provided": ocr_backend, "valid": sorted(valid_backends)},
207
+ f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(_VALID_OCR_BACKENDS))} or 'none'",
208
+ context={"provided": ocr_backend, "valid": sorted(_VALID_OCR_BACKENDS)},
158
209
  )
159
210
  ocr_config = parse_ocr_backend_config(config_dict, ocr_backend)
160
211
  if ocr_config:
161
212
  extraction_config["ocr_config"] = ocr_config
162
213
 
163
- # Handle GMFT configuration for table extraction
164
214
  if extraction_config.get("extract_tables") and "gmft" in config_dict and isinstance(config_dict["gmft"], dict):
165
- extraction_config["gmft_config"] = GMFTConfig(**config_dict["gmft"])
215
+ try:
216
+ extraction_config["gmft_config"] = GMFTConfig(**config_dict["gmft"])
217
+ except (TypeError, ValueError) as e:
218
+ raise ValidationError(
219
+ f"Invalid GMFT configuration: {e}",
220
+ context={"gmft_config": config_dict["gmft"], "error": str(e)},
221
+ ) from e
222
+
223
+ if "html_to_markdown" in config_dict and isinstance(config_dict["html_to_markdown"], dict):
224
+ try:
225
+ extraction_config["html_to_markdown_config"] = HTMLToMarkdownConfig(**config_dict["html_to_markdown"])
226
+ except (TypeError, ValueError) as e:
227
+ raise ValidationError(
228
+ f"Invalid HTML to Markdown configuration: {e}",
229
+ context={"html_to_markdown_config": config_dict["html_to_markdown"], "error": str(e)},
230
+ ) from e
166
231
 
167
- # Convert "none" to None for ocr_backend
168
232
  if extraction_config.get("ocr_backend") == "none":
169
233
  extraction_config["ocr_backend"] = None
170
234
 
171
- return ExtractionConfig(**extraction_config)
235
+ try:
236
+ return ExtractionConfig(**extraction_config)
237
+ except (TypeError, ValueError) as e:
238
+ raise ValidationError(
239
+ f"Invalid extraction configuration: {e}",
240
+ context={"config": extraction_config, "error": str(e)},
241
+ ) from e
172
242
 
173
243
 
174
- def find_config_file(start_path: Path | None = None) -> Path | None:
175
- """Find configuration file by searching up the directory tree.
244
+ def build_extraction_config(
245
+ file_config: dict[str, Any],
246
+ cli_args: MutableMapping[str, Any],
247
+ ) -> ExtractionConfig:
248
+ config_dict: dict[str, Any] = {}
249
+
250
+ _merge_file_config(config_dict, file_config)
251
+ _merge_cli_args(config_dict, cli_args)
176
252
 
177
- Searches for configuration files in the following order:
178
- 1. kreuzberg.toml
179
- 2. pyproject.toml (with [tool.kreuzberg] section)
253
+ _configure_ocr_backend(config_dict, file_config, cli_args)
254
+ _configure_gmft(config_dict, file_config, cli_args)
255
+
256
+ if config_dict.get("ocr_backend") == "none":
257
+ config_dict["ocr_backend"] = None
258
+
259
+ try:
260
+ return ExtractionConfig(**config_dict)
261
+ except (TypeError, ValueError) as e:
262
+ raise ValidationError(
263
+ f"Invalid extraction configuration: {e}",
264
+ context={"config": config_dict, "error": str(e)},
265
+ ) from e
180
266
 
181
- Args:
182
- start_path: Directory to start searching from. Defaults to current working directory.
183
267
 
184
- Returns:
185
- Path to the configuration file or None if not found.
186
- """
268
+ def find_config_file(start_path: Path | None = None) -> Path | None:
187
269
  current = start_path or Path.cwd()
188
270
 
189
271
  while current != current.parent:
190
- # First, look for kreuzberg.toml
191
272
  kreuzberg_toml = current / "kreuzberg.toml"
192
273
  if kreuzberg_toml.exists():
193
274
  return kreuzberg_toml
194
275
 
195
- # Then, look for pyproject.toml with [tool.kreuzberg] section
196
276
  pyproject_toml = current / "pyproject.toml"
197
277
  if pyproject_toml.exists():
198
278
  try:
@@ -200,65 +280,39 @@ def find_config_file(start_path: Path | None = None) -> Path | None:
200
280
  data = tomllib.load(f)
201
281
  if "tool" in data and "kreuzberg" in data["tool"]:
202
282
  return pyproject_toml
203
- except Exception: # noqa: BLE001
204
- pass
283
+ except OSError as e:
284
+ raise ValidationError(
285
+ f"Failed to read pyproject.toml: {e}",
286
+ context={"file": str(pyproject_toml), "error": str(e)},
287
+ ) from e
288
+ except tomllib.TOMLDecodeError as e:
289
+ raise ValidationError(
290
+ f"Invalid TOML in pyproject.toml: {e}",
291
+ context={"file": str(pyproject_toml), "error": str(e)},
292
+ ) from e
205
293
 
206
294
  current = current.parent
207
295
  return None
208
296
 
209
297
 
210
298
  def load_default_config(start_path: Path | None = None) -> ExtractionConfig | None:
211
- """Load the default configuration from discovered config file.
212
-
213
- Args:
214
- start_path: Directory to start searching from. Defaults to current working directory.
215
-
216
- Returns:
217
- ExtractionConfig instance or None if no configuration found.
218
- """
219
299
  config_path = find_config_file(start_path)
220
300
  if not config_path:
221
301
  return None
222
302
 
223
- try:
224
- config_dict = load_config_from_file(config_path)
225
- if not config_dict:
226
- return None
227
- return build_extraction_config_from_dict(config_dict)
228
- except Exception: # noqa: BLE001
229
- # Silently ignore configuration errors for default loading
303
+ config_dict = load_config_from_file(config_path)
304
+ if not config_dict:
230
305
  return None
306
+ return build_extraction_config_from_dict(config_dict)
231
307
 
232
308
 
233
309
  def load_config_from_path(config_path: Path | str) -> ExtractionConfig:
234
- """Load configuration from a specific file path.
235
-
236
- Args:
237
- config_path: Path to the configuration file.
238
-
239
- Returns:
240
- ExtractionConfig instance.
241
-
242
- Raises:
243
- ValidationError: If the file cannot be read, parsed, or is invalid.
244
- """
245
310
  path = Path(config_path)
246
311
  config_dict = load_config_from_file(path)
247
312
  return build_extraction_config_from_dict(config_dict)
248
313
 
249
314
 
250
315
  def discover_and_load_config(start_path: Path | str | None = None) -> ExtractionConfig:
251
- """Load configuration by discovering config files in the directory tree.
252
-
253
- Args:
254
- start_path: Directory to start searching from. Defaults to current working directory.
255
-
256
- Returns:
257
- ExtractionConfig instance.
258
-
259
- Raises:
260
- ValidationError: If no configuration file is found or if the file is invalid.
261
- """
262
316
  search_path = Path(start_path) if start_path else None
263
317
  config_path = find_config_file(search_path)
264
318
 
@@ -278,152 +332,18 @@ def discover_and_load_config(start_path: Path | str | None = None) -> Extraction
278
332
  return build_extraction_config_from_dict(config_dict)
279
333
 
280
334
 
281
- def try_discover_config(start_path: Path | str | None = None) -> ExtractionConfig | None:
282
- """Try to discover and load configuration, returning None if not found.
283
-
284
- Args:
285
- start_path: Directory to start searching from. Defaults to current working directory.
335
+ def discover_config(start_path: Path | str | None = None) -> ExtractionConfig | None:
336
+ search_path = Path(start_path) if start_path else None
337
+ config_path = find_config_file(search_path)
286
338
 
287
- Returns:
288
- ExtractionConfig instance or None if no configuration found.
289
- """
290
- try:
291
- return discover_and_load_config(start_path)
292
- except ValidationError:
339
+ if not config_path:
293
340
  return None
294
341
 
295
-
296
- # Legacy functions for backward compatibility with CLI
297
-
298
- # Define common configuration fields to avoid repetition
299
- _CONFIG_FIELDS = [
300
- "force_ocr",
301
- "chunk_content",
302
- "extract_tables",
303
- "max_chars",
304
- "max_overlap",
305
- "ocr_backend",
306
- "extract_entities",
307
- "extract_keywords",
308
- "auto_detect_language",
309
- "enable_quality_processing",
310
- "auto_detect_document_type",
311
- "document_type_confidence_threshold",
312
- "document_classification_mode",
313
- "keyword_count",
314
- ]
315
-
316
-
317
- def _merge_file_config(config_dict: dict[str, Any], file_config: dict[str, Any]) -> None:
318
- """Merge file configuration into config dictionary."""
319
- if not file_config:
320
- return
321
-
322
- for field in _CONFIG_FIELDS:
323
- if field in file_config:
324
- config_dict[field] = file_config[field]
325
-
326
-
327
- def _merge_cli_args(config_dict: dict[str, Any], cli_args: MutableMapping[str, Any]) -> None:
328
- """Merge CLI arguments into config dictionary."""
329
- for field in _CONFIG_FIELDS:
330
- if field in cli_args and cli_args[field] is not None:
331
- config_dict[field] = cli_args[field]
332
-
333
-
334
- def _build_ocr_config_from_cli(
335
- ocr_backend: str, cli_args: MutableMapping[str, Any]
336
- ) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
337
- """Build OCR config from CLI arguments."""
338
- config_key = f"{ocr_backend}_config"
339
- if not cli_args.get(config_key):
342
+ config_dict = load_config_from_file(config_path)
343
+ if not config_dict:
340
344
  return None
341
-
342
- backend_args = cli_args[config_key]
343
- if ocr_backend == "tesseract":
344
- return TesseractConfig(**backend_args)
345
- if ocr_backend == "easyocr":
346
- return EasyOCRConfig(**backend_args)
347
- if ocr_backend == "paddleocr":
348
- return PaddleOCRConfig(**backend_args)
349
- return None
350
-
351
-
352
- def _configure_ocr_backend(
353
- config_dict: dict[str, Any],
354
- file_config: dict[str, Any],
355
- cli_args: MutableMapping[str, Any],
356
- ) -> None:
357
- """Configure OCR backend in config dictionary."""
358
- ocr_backend = config_dict.get("ocr_backend")
359
- if not ocr_backend or ocr_backend == "none":
360
- return
361
-
362
- # Try CLI config first, then file config
363
- ocr_config = _build_ocr_config_from_cli(ocr_backend, cli_args)
364
- if not ocr_config and file_config:
365
- ocr_config = parse_ocr_backend_config(file_config, ocr_backend)
366
-
367
- if ocr_config:
368
- config_dict["ocr_config"] = ocr_config
369
-
370
-
371
- def _configure_gmft(
372
- config_dict: dict[str, Any],
373
- file_config: dict[str, Any],
374
- cli_args: MutableMapping[str, Any],
375
- ) -> None:
376
- """Configure GMFT in config dictionary."""
377
- if not config_dict.get("extract_tables"):
378
- return
379
-
380
- gmft_config = None
381
- if cli_args.get("gmft_config"):
382
- gmft_config = GMFTConfig(**cli_args["gmft_config"])
383
- elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
384
- gmft_config = GMFTConfig(**file_config["gmft"])
385
-
386
- if gmft_config:
387
- config_dict["gmft_config"] = gmft_config
388
-
389
-
390
- def build_extraction_config(
391
- file_config: dict[str, Any],
392
- cli_args: MutableMapping[str, Any],
393
- ) -> ExtractionConfig:
394
- """Build ExtractionConfig from file config and CLI arguments.
395
-
396
- Args:
397
- file_config: Configuration loaded from file.
398
- cli_args: CLI arguments.
399
-
400
- Returns:
401
- ExtractionConfig instance.
402
- """
403
- config_dict: dict[str, Any] = {}
404
-
405
- # Merge configurations: file first, then CLI overrides
406
- _merge_file_config(config_dict, file_config)
407
- _merge_cli_args(config_dict, cli_args)
408
-
409
- # Configure complex components
410
- _configure_ocr_backend(config_dict, file_config, cli_args)
411
- _configure_gmft(config_dict, file_config, cli_args)
412
-
413
- # Convert "none" to None for ocr_backend
414
- if config_dict.get("ocr_backend") == "none":
415
- config_dict["ocr_backend"] = None
416
-
417
- return ExtractionConfig(**config_dict)
345
+ return build_extraction_config_from_dict(config_dict)
418
346
 
419
347
 
420
348
  def find_default_config() -> Path | None:
421
- """Find the default configuration file (pyproject.toml).
422
-
423
- Returns:
424
- Path to the configuration file or None if not found.
425
-
426
- Note:
427
- This function is deprecated. Use find_config_file() instead.
428
- """
429
349
  return find_config_file()