kreuzberg-3.8.0-py3-none-any.whl → kreuzberg-3.8.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. kreuzberg/__init__.py +4 -0
  2. kreuzberg/_api/main.py +22 -1
  3. kreuzberg/_config.py +404 -0
  4. kreuzberg/_entity_extraction.py +4 -5
  5. kreuzberg/_extractors/_base.py +3 -5
  6. kreuzberg/_extractors/_image.py +18 -32
  7. kreuzberg/_extractors/_pandoc.py +3 -14
  8. kreuzberg/_extractors/_pdf.py +39 -57
  9. kreuzberg/_extractors/_spread_sheet.py +2 -3
  10. kreuzberg/_extractors/_structured.py +10 -7
  11. kreuzberg/_gmft.py +314 -10
  12. kreuzberg/_language_detection.py +1 -1
  13. kreuzberg/_mcp/server.py +58 -8
  14. kreuzberg/_ocr/__init__.py +1 -22
  15. kreuzberg/_ocr/_base.py +59 -0
  16. kreuzberg/_ocr/_easyocr.py +92 -1
  17. kreuzberg/_ocr/_paddleocr.py +90 -1
  18. kreuzberg/_ocr/_tesseract.py +556 -5
  19. kreuzberg/_playa.py +2 -3
  20. kreuzberg/_types.py +46 -24
  21. kreuzberg/_utils/_cache.py +35 -4
  22. kreuzberg/_utils/_device.py +10 -20
  23. kreuzberg/_utils/_errors.py +44 -45
  24. kreuzberg/_utils/_process_pool.py +2 -6
  25. kreuzberg/_utils/_quality.py +7 -11
  26. kreuzberg/_utils/_serialization.py +21 -16
  27. kreuzberg/_utils/_string.py +22 -12
  28. kreuzberg/_utils/_table.py +3 -4
  29. kreuzberg/cli.py +4 -5
  30. kreuzberg/exceptions.py +10 -0
  31. kreuzberg/extraction.py +6 -24
  32. kreuzberg-3.8.2.dist-info/METADATA +265 -0
  33. kreuzberg-3.8.2.dist-info/RECORD +53 -0
  34. kreuzberg/_cli_config.py +0 -175
  35. kreuzberg/_multiprocessing/__init__.py +0 -5
  36. kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
  37. kreuzberg/_ocr/_pool.py +0 -357
  38. kreuzberg/_ocr/_sync.py +0 -566
  39. kreuzberg-3.8.0.dist-info/METADATA +0 -313
  40. kreuzberg-3.8.0.dist-info/RECORD +0 -57
  41. {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/WHEEL +0 -0
  42. {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/entry_points.txt +0 -0
  43. {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_mcp/server.py CHANGED
@@ -3,11 +3,14 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import base64
6
+ import json
6
7
  from typing import Any
7
8
 
9
+ import msgspec
8
10
  from mcp.server import FastMCP
9
11
  from mcp.types import TextContent
10
12
 
13
+ from kreuzberg._config import try_discover_config
11
14
  from kreuzberg._types import ExtractionConfig, OcrBackendType
12
15
  from kreuzberg.extraction import extract_bytes_sync, extract_file_sync
13
16
 
@@ -15,6 +18,44 @@ from kreuzberg.extraction import extract_bytes_sync, extract_file_sync
15
18
  mcp = FastMCP("Kreuzberg Text Extraction")
16
19
 
17
20
 
21
+ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
22
+ """Create ExtractionConfig with discovered config as base and tool parameters as overrides.
23
+
24
+ Args:
25
+ **kwargs: Tool parameters to override defaults/discovered config.
26
+
27
+ Returns:
28
+ ExtractionConfig instance.
29
+ """
30
+ # Try to discover configuration from files
31
+ base_config = try_discover_config()
32
+
33
+ if base_config is None:
34
+ # No config file found, use defaults
35
+ return ExtractionConfig(**kwargs)
36
+
37
+ # Merge discovered config with tool parameters (tool params take precedence)
38
+ config_dict: dict[str, Any] = {
39
+ "force_ocr": base_config.force_ocr,
40
+ "chunk_content": base_config.chunk_content,
41
+ "extract_tables": base_config.extract_tables,
42
+ "extract_entities": base_config.extract_entities,
43
+ "extract_keywords": base_config.extract_keywords,
44
+ "ocr_backend": base_config.ocr_backend,
45
+ "max_chars": base_config.max_chars,
46
+ "max_overlap": base_config.max_overlap,
47
+ "keyword_count": base_config.keyword_count,
48
+ "auto_detect_language": base_config.auto_detect_language,
49
+ "ocr_config": base_config.ocr_config,
50
+ "gmft_config": base_config.gmft_config,
51
+ }
52
+
53
+ # Override with provided parameters
54
+ config_dict.update(kwargs)
55
+
56
+ return ExtractionConfig(**config_dict)
57
+
58
+
18
59
  @mcp.tool()
19
60
  def extract_document( # noqa: PLR0913
20
61
  file_path: str,
@@ -49,7 +90,7 @@ def extract_document( # noqa: PLR0913
49
90
  Returns:
50
91
  Extracted content with metadata, tables, chunks, entities, and keywords
51
92
  """
52
- config = ExtractionConfig(
93
+ config = _create_config_with_overrides(
53
94
  force_ocr=force_ocr,
54
95
  chunk_content=chunk_content,
55
96
  extract_tables=extract_tables,
@@ -63,7 +104,7 @@ def extract_document( # noqa: PLR0913
63
104
  )
64
105
 
65
106
  result = extract_file_sync(file_path, mime_type, config)
66
- return result.to_dict()
107
+ return result.to_dict(include_none=True)
67
108
 
68
109
 
69
110
  @mcp.tool()
@@ -102,7 +143,7 @@ def extract_bytes( # noqa: PLR0913
102
143
  """
103
144
  content_bytes = base64.b64decode(content_base64)
104
145
 
105
- config = ExtractionConfig(
146
+ config = _create_config_with_overrides(
106
147
  force_ocr=force_ocr,
107
148
  chunk_content=chunk_content,
108
149
  extract_tables=extract_tables,
@@ -116,7 +157,7 @@ def extract_bytes( # noqa: PLR0913
116
157
  )
117
158
 
118
159
  result = extract_bytes_sync(content_bytes, mime_type, config)
119
- return result.to_dict()
160
+ return result.to_dict(include_none=True)
120
161
 
121
162
 
122
163
  @mcp.tool()
@@ -133,7 +174,7 @@ def extract_simple(
133
174
  Returns:
134
175
  Extracted text content as a string
135
176
  """
136
- config = ExtractionConfig()
177
+ config = _create_config_with_overrides()
137
178
  result = extract_file_sync(file_path, mime_type, config)
138
179
  return result.content
139
180
 
@@ -142,7 +183,16 @@ def extract_simple(
142
183
  def get_default_config() -> str:
143
184
  """Get the default extraction configuration."""
144
185
  config = ExtractionConfig()
145
- return str(config.__dict__)
186
+ return json.dumps(msgspec.to_builtins(config, order="deterministic"), indent=2)
187
+
188
+
189
+ @mcp.resource("config://discovered")
190
+ def get_discovered_config() -> str:
191
+ """Get the discovered configuration from config files."""
192
+ config = try_discover_config()
193
+ if config is None:
194
+ return "No configuration file found"
195
+ return json.dumps(msgspec.to_builtins(config, order="deterministic"), indent=2)
146
196
 
147
197
 
148
198
  @mcp.resource("config://available-backends")
@@ -175,7 +225,7 @@ def extract_and_summarize(file_path: str) -> list[TextContent]:
175
225
  Returns:
176
226
  Extracted content with summarization prompt
177
227
  """
178
- result = extract_file_sync(file_path, None, ExtractionConfig())
228
+ result = extract_file_sync(file_path, None, _create_config_with_overrides())
179
229
 
180
230
  return [
181
231
  TextContent(
@@ -195,7 +245,7 @@ def extract_structured(file_path: str) -> list[TextContent]:
195
245
  Returns:
196
246
  Extracted content with structured analysis prompt
197
247
  """
198
- config = ExtractionConfig(
248
+ config = _create_config_with_overrides(
199
249
  extract_entities=True,
200
250
  extract_keywords=True,
201
251
  extract_tables=True,
kreuzberg/_ocr/__init__.py CHANGED
@@ -4,19 +4,7 @@ from typing import Any
4
4
  from kreuzberg._ocr._base import OCRBackend
5
5
  from kreuzberg._ocr._easyocr import EasyOCRBackend
6
6
  from kreuzberg._ocr._paddleocr import PaddleBackend
7
- from kreuzberg._ocr._pool import TesseractProcessPool
8
- from kreuzberg._ocr._sync import (
9
- process_batch_images_process_pool,
10
- process_batch_images_sync,
11
- process_batch_images_threaded,
12
- process_image_bytes_easyocr_sync,
13
- process_image_bytes_paddleocr_sync,
14
- process_image_bytes_tesseract_sync,
15
- process_image_easyocr_sync,
16
- process_image_paddleocr_sync,
17
- process_image_tesseract_sync,
18
- )
19
- from kreuzberg._ocr._tesseract import TesseractBackend
7
+ from kreuzberg._ocr._tesseract import TesseractBackend, TesseractProcessPool
20
8
  from kreuzberg._types import OcrBackendType
21
9
 
22
10
  __all__ = [
@@ -26,15 +14,6 @@ __all__ = [
26
14
  "TesseractBackend",
27
15
  "TesseractProcessPool",
28
16
  "get_ocr_backend",
29
- "process_batch_images_process_pool",
30
- "process_batch_images_sync",
31
- "process_batch_images_threaded",
32
- "process_image_bytes_easyocr_sync",
33
- "process_image_bytes_paddleocr_sync",
34
- "process_image_bytes_tesseract_sync",
35
- "process_image_easyocr_sync",
36
- "process_image_paddleocr_sync",
37
- "process_image_tesseract_sync",
38
17
  ]
39
18
 
40
19
 
kreuzberg/_ocr/_base.py CHANGED
@@ -49,6 +49,65 @@ class OCRBackend(ABC, Generic[T]):
49
49
  """
50
50
  ...
51
51
 
52
+ @abstractmethod
53
+ def process_image_sync(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult:
54
+ """Synchronously process an image and extract its text and metadata.
55
+
56
+ Args:
57
+ image: An instance of PIL.Image representing the input image.
58
+ **kwargs: Any kwargs related to the given backend
59
+
60
+ Returns:
61
+ The extraction result object
62
+ """
63
+ ...
64
+
65
+ @abstractmethod
66
+ def process_file_sync(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult:
67
+ """Synchronously process a file and extract its text and metadata.
68
+
69
+ Args:
70
+ path: A Path object representing the file to be processed.
71
+ **kwargs: Any kwargs related to the given backend
72
+
73
+ Returns:
74
+ The extraction result object
75
+ """
76
+ ...
77
+
78
+ def process_batch_sync(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
79
+ """Synchronously process a batch of files and extract their text and metadata.
80
+
81
+ Default implementation processes files sequentially. Backends can override
82
+ for more efficient batch processing.
83
+
84
+ Args:
85
+ paths: List of Path objects representing files to be processed.
86
+ **kwargs: Any kwargs related to the given backend
87
+
88
+ Returns:
89
+ List of extraction result objects in the same order as input paths
90
+ """
91
+ return [self.process_file_sync(path, **kwargs) for path in paths]
92
+
93
+ async def process_batch(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
94
+ """Asynchronously process a batch of files and extract their text and metadata.
95
+
96
+ Default implementation processes files concurrently. Backends can override
97
+ for more efficient batch processing.
98
+
99
+ Args:
100
+ paths: List of Path objects representing files to be processed.
101
+ **kwargs: Any kwargs related to the given backend
102
+
103
+ Returns:
104
+ List of extraction result objects in the same order as input paths
105
+ """
106
+ from kreuzberg._utils._sync import run_taskgroup
107
+
108
+ tasks = [self.process_file(path, **kwargs) for path in paths]
109
+ return await run_taskgroup(*tasks)
110
+
52
111
  def __hash__(self) -> int:
53
112
  """Hash function for allowing caching."""
54
113
  return hash(type(self).__name__)
kreuzberg/_ocr/_easyocr.py CHANGED
@@ -4,6 +4,7 @@ import warnings
4
4
  from dataclasses import dataclass
5
5
  from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
6
6
 
7
+ import numpy as np
7
8
  from PIL import Image
8
9
 
9
10
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
@@ -110,7 +111,7 @@ EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
110
111
  }
111
112
 
112
113
 
113
- @dataclass(unsafe_hash=True, frozen=True)
114
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
114
115
  class EasyOCRConfig:
115
116
  """Configuration options for EasyOCR."""
116
117
 
@@ -440,3 +441,93 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
440
441
  )
441
442
 
442
443
  return languages
444
+
445
+ def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
446
+ """Synchronously process an image and extract its text and metadata using EasyOCR.
447
+
448
+ Args:
449
+ image: An instance of PIL.Image representing the input image.
450
+ **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
451
+
452
+ Returns:
453
+ ExtractionResult: The extraction result containing text content, mime type, and metadata.
454
+
455
+ Raises:
456
+ OCRError: If OCR processing fails.
457
+ """
458
+ self._init_easyocr_sync(**kwargs)
459
+
460
+ beam_width = kwargs.pop("beam_width")
461
+ kwargs.pop("language", None)
462
+ kwargs.pop("use_gpu", None)
463
+
464
+ try:
465
+ result = self._reader.readtext(
466
+ np.array(image),
467
+ beamWidth=beam_width,
468
+ **kwargs,
469
+ )
470
+
471
+ return self._process_easyocr_result(result, image)
472
+ except Exception as e:
473
+ raise OCRError(f"Failed to OCR using EasyOCR: {e}") from e
474
+
475
+ def process_file_sync(self, path: Path, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
476
+ """Synchronously process a file and extract its text and metadata using EasyOCR.
477
+
478
+ Args:
479
+ path: A Path object representing the file to be processed.
480
+ **kwargs: Configuration parameters for EasyOCR including language, detection thresholds, etc.
481
+
482
+ Returns:
483
+ ExtractionResult: The extraction result containing text content, mime type, and metadata.
484
+
485
+ Raises:
486
+ OCRError: If file loading or OCR processing fails.
487
+ """
488
+ self._init_easyocr_sync(**kwargs)
489
+ try:
490
+ image = Image.open(path)
491
+ return self.process_image_sync(image, **kwargs)
492
+ except Exception as e:
493
+ raise OCRError(f"Failed to load or process image using EasyOCR: {e}") from e
494
+
495
+ @classmethod
496
+ def _init_easyocr_sync(cls, **kwargs: Unpack[EasyOCRConfig]) -> None:
497
+ """Synchronously initialize EasyOCR with the provided configuration.
498
+
499
+ Args:
500
+ **kwargs: Configuration parameters for EasyOCR including language, etc.
501
+
502
+ Raises:
503
+ MissingDependencyError: If EasyOCR is not installed.
504
+ OCRError: If initialization fails.
505
+ """
506
+ if cls._reader is not None:
507
+ return
508
+
509
+ try:
510
+ import easyocr
511
+ except ImportError as e:
512
+ raise MissingDependencyError.create_for_package(
513
+ dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
514
+ ) from e
515
+
516
+ languages = cls._validate_language_code(kwargs.pop("language", "en"))
517
+
518
+ device_info = cls._resolve_device_config(**kwargs)
519
+ use_gpu = device_info.device_type in ("cuda", "mps")
520
+
521
+ kwargs.setdefault("detector", True)
522
+ kwargs.setdefault("recognizer", True)
523
+ kwargs.setdefault("download_enabled", True)
524
+ kwargs.setdefault("recog_network", "standard")
525
+
526
+ try:
527
+ cls._reader = easyocr.Reader(
528
+ languages,
529
+ gpu=use_gpu,
530
+ verbose=False,
531
+ )
532
+ except Exception as e:
533
+ raise OCRError(f"Failed to initialize EasyOCR: {e}") from e
kreuzberg/_ocr/_paddleocr.py CHANGED
@@ -4,8 +4,10 @@ import platform
4
4
  import warnings
5
5
  from dataclasses import dataclass
6
6
  from importlib.util import find_spec
7
+ from pathlib import Path
7
8
  from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
8
9
 
10
+ import numpy as np
9
11
  from PIL import Image
10
12
 
11
13
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
@@ -29,7 +31,7 @@ except ImportError: # pragma: no cover
29
31
  PADDLEOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {"ch", "en", "french", "german", "japan", "korean"}
30
32
 
31
33
 
32
- @dataclass(unsafe_hash=True, frozen=True)
34
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
33
35
  class PaddleOCRConfig:
34
36
  """Configuration options for PaddleOCR.
35
37
 
@@ -364,3 +366,90 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
364
366
  "supported_languages": ",".join(sorted(PADDLEOCR_SUPPORTED_LANGUAGE_CODES)),
365
367
  },
366
368
  )
369
+
370
+ def process_image_sync(self, image: Image.Image, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
371
+ """Synchronously process an image and extract its text and metadata using PaddleOCR.
372
+
373
+ Args:
374
+ image: An instance of PIL.Image representing the input image.
375
+ **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
376
+
377
+ Returns:
378
+ ExtractionResult: The extraction result containing text content, mime type, and metadata.
379
+
380
+ Raises:
381
+ OCRError: If OCR processing fails.
382
+ """
383
+ self._init_paddle_ocr_sync(**kwargs)
384
+
385
+ if image.mode != "RGB":
386
+ image = image.convert("RGB")
387
+
388
+ image_np = np.array(image)
389
+ try:
390
+ result = self._paddle_ocr.ocr(image_np, cls=kwargs.get("use_angle_cls", True))
391
+ return self._process_paddle_result(result, image)
392
+ except Exception as e:
393
+ raise OCRError(f"Failed to OCR using PaddleOCR: {e}") from e
394
+
395
+ def process_file_sync(self, path: Path, **kwargs: Unpack[PaddleOCRConfig]) -> ExtractionResult:
396
+ """Synchronously process a file and extract its text and metadata using PaddleOCR.
397
+
398
+ Args:
399
+ path: A Path object representing the file to be processed.
400
+ **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
401
+
402
+ Returns:
403
+ ExtractionResult: The extraction result containing text content, mime type, and metadata.
404
+
405
+ Raises:
406
+ OCRError: If file loading or OCR processing fails.
407
+ """
408
+ self._init_paddle_ocr_sync(**kwargs)
409
+ try:
410
+ image = Image.open(path)
411
+ return self.process_image_sync(image, **kwargs)
412
+ except Exception as e:
413
+ raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
414
+
415
+ @classmethod
416
+ def _init_paddle_ocr_sync(cls, **kwargs: Unpack[PaddleOCRConfig]) -> None:
417
+ """Synchronously initialize PaddleOCR with the provided configuration.
418
+
419
+ Args:
420
+ **kwargs: Configuration parameters for PaddleOCR including language, detection thresholds, etc.
421
+
422
+ Raises:
423
+ MissingDependencyError: If PaddleOCR is not installed.
424
+ OCRError: If initialization fails.
425
+ """
426
+ if cls._paddle_ocr is not None:
427
+ return
428
+
429
+ try:
430
+ from paddleocr import PaddleOCR
431
+ except ImportError as e:
432
+ raise MissingDependencyError.create_for_package(
433
+ dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
434
+ ) from e
435
+
436
+ language = cls._validate_language_code(kwargs.pop("language", "en"))
437
+
438
+ device_info = cls._resolve_device_config(**kwargs)
439
+ use_gpu = device_info.device_type == "cuda"
440
+
441
+ has_gpu_package = bool(find_spec("paddlepaddle_gpu"))
442
+ kwargs.setdefault("use_angle_cls", True)
443
+ kwargs["use_gpu"] = use_gpu and has_gpu_package
444
+ kwargs.setdefault("enable_mkldnn", cls._is_mkldnn_supported() and not (use_gpu and has_gpu_package))
445
+ kwargs.setdefault("det_db_thresh", 0.3)
446
+ kwargs.setdefault("det_db_box_thresh", 0.5)
447
+ kwargs.setdefault("det_db_unclip_ratio", 1.6)
448
+
449
+ if device_info.device_type == "cuda" and kwargs.get("gpu_memory_limit"):
450
+ kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)
451
+
452
+ try:
453
+ cls._paddle_ocr = PaddleOCR(lang=language, show_log=False, **kwargs)
454
+ except Exception as e:
455
+ raise OCRError(f"Failed to initialize PaddleOCR: {e}") from e