kreuzberg 3.1.7__py3-none-any.whl → 3.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. kreuzberg/__init__.py +3 -0
  2. kreuzberg/__main__.py +8 -0
  3. kreuzberg/_cli_config.py +175 -0
  4. kreuzberg/_extractors/_image.py +39 -4
  5. kreuzberg/_extractors/_pandoc.py +158 -18
  6. kreuzberg/_extractors/_pdf.py +199 -19
  7. kreuzberg/_extractors/_presentation.py +1 -1
  8. kreuzberg/_extractors/_spread_sheet.py +65 -7
  9. kreuzberg/_gmft.py +222 -16
  10. kreuzberg/_mime_types.py +62 -16
  11. kreuzberg/_multiprocessing/__init__.py +6 -0
  12. kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
  13. kreuzberg/_multiprocessing/process_manager.py +188 -0
  14. kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
  15. kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
  16. kreuzberg/_ocr/_easyocr.py +66 -10
  17. kreuzberg/_ocr/_paddleocr.py +86 -7
  18. kreuzberg/_ocr/_tesseract.py +136 -46
  19. kreuzberg/_playa.py +43 -0
  20. kreuzberg/_utils/_cache.py +372 -0
  21. kreuzberg/_utils/_device.py +356 -0
  22. kreuzberg/_utils/_document_cache.py +220 -0
  23. kreuzberg/_utils/_errors.py +232 -0
  24. kreuzberg/_utils/_pdf_lock.py +72 -0
  25. kreuzberg/_utils/_process_pool.py +100 -0
  26. kreuzberg/_utils/_serialization.py +82 -0
  27. kreuzberg/_utils/_string.py +1 -1
  28. kreuzberg/_utils/_sync.py +21 -0
  29. kreuzberg/cli.py +338 -0
  30. kreuzberg/extraction.py +247 -36
  31. {kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/METADATA +95 -34
  32. kreuzberg-3.3.0.dist-info/RECORD +48 -0
  33. {kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/WHEEL +1 -2
  34. kreuzberg-3.3.0.dist-info/entry_points.txt +2 -0
  35. kreuzberg-3.1.7.dist-info/RECORD +0 -33
  36. kreuzberg-3.1.7.dist-info/top_level.txt +0 -1
  37. {kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/__init__.py CHANGED
@@ -18,6 +18,8 @@ from .extraction import (
18
18
  extract_file_sync,
19
19
  )
20
20
 
21
+ __version__ = "3.2.0"
22
+
21
23
  __all__ = [
22
24
  "EasyOCRConfig",
23
25
  "ExtractionConfig",
@@ -34,6 +36,7 @@ __all__ = [
34
36
  "TableData",
35
37
  "TesseractConfig",
36
38
  "ValidationError",
39
+ "__version__",
37
40
  "batch_extract_bytes",
38
41
  "batch_extract_bytes_sync",
39
42
  "batch_extract_file",
kreuzberg/__main__.py ADDED
@@ -0,0 +1,8 @@
1
+ """Entry point for running kreuzberg as a module with python -m kreuzberg."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from kreuzberg.cli import cli
6
+
7
+ if __name__ == "__main__":
8
+ cli()
@@ -0,0 +1,175 @@
1
+ """Configuration parsing for the CLI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING, Any
8
+
9
+ if sys.version_info >= (3, 11):
10
+ import tomllib
11
+ else:
12
+ import tomli as tomllib # type: ignore[import-not-found]
13
+
14
+ from kreuzberg._gmft import GMFTConfig
15
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
16
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
17
+ from kreuzberg._ocr._tesseract import TesseractConfig
18
+ from kreuzberg._types import ExtractionConfig, OcrBackendType
19
+ from kreuzberg.exceptions import ValidationError
20
+
21
+ if TYPE_CHECKING:
22
+ from collections.abc import MutableMapping
23
+
24
+
25
+ def load_config_from_file(config_path: Path) -> dict[str, Any]:
26
+ """Load configuration from a TOML file.
27
+
28
+ Args:
29
+ config_path: Path to the configuration file.
30
+
31
+ Returns:
32
+ Dictionary containing the loaded configuration.
33
+
34
+ Raises:
35
+ ValidationError: If the file cannot be read or parsed.
36
+ """
37
+ try:
38
+ with config_path.open("rb") as f:
39
+ data = tomllib.load(f)
40
+ except FileNotFoundError as e:
41
+ raise ValidationError(f"Configuration file not found: {config_path}") from e
42
+ except tomllib.TOMLDecodeError as e:
43
+ raise ValidationError(f"Invalid TOML in configuration file: {e}") from e
44
+
45
+ return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
46
+
47
+
48
def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
    """Merge two configuration dictionaries recursively.

    Keys present in both inputs are merged key by key when both values are
    dictionaries; otherwise the value from ``override`` wins. Neither input is
    modified, and nested dictionaries in the result are fresh copies, so
    mutating the merged configuration never leaks back into ``base`` or
    ``override``. Non-dictionary values (including lists) are carried over by
    reference.

    Args:
        base: Base configuration dictionary.
        override: Configuration dictionary to override base values.

    Returns:
        Merged configuration dictionary.
    """
    # Copy nested dicts from base up front: a plain ``base.copy()`` is shallow
    # and would leave untouched sub-tables shared with the caller's dictionary.
    result: dict[str, Any] = {
        key: merge_configs(value, {}) if isinstance(value, dict) else value for key, value in base.items()
    }

    for key, value in override.items():
        if isinstance(value, dict):
            current = result.get(key)
            # Merging into an empty dict copies ``value`` when there is nothing to merge with.
            result[key] = merge_configs(current if isinstance(current, dict) else {}, value)
        else:
            result[key] = value

    return result
65
+
66
+
67
def parse_ocr_backend_config(
    config_dict: dict[str, Any], backend: OcrBackendType
) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
    """Build the backend-specific OCR configuration object from a config mapping.

    Args:
        config_dict: Configuration dictionary; the backend's settings are looked
            up under the key equal to ``backend``.
        backend: The OCR backend type.

    Returns:
        A TesseractConfig, EasyOCRConfig or PaddleOCRConfig built from the
        backend's settings, or None when the key is missing, its value is not
        a dictionary, or the backend name is not one of the three known ones.
    """
    settings = config_dict.get(backend)
    if not isinstance(settings, dict):
        # Covers both "no entry for this backend" and "entry is not a table".
        return None

    constructors = (
        ("tesseract", TesseractConfig),
        ("easyocr", EasyOCRConfig),
        ("paddleocr", PaddleOCRConfig),
    )
    for backend_name, constructor in constructors:
        if backend == backend_name:
            return constructor(**settings)
    return None
93
+
94
+
95
def build_extraction_config(  # noqa: C901, PLR0912
    file_config: dict[str, Any],
    cli_args: MutableMapping[str, Any],
) -> ExtractionConfig:
    """Combine file configuration and CLI arguments into an ExtractionConfig.

    Simple fields are taken from ``file_config`` first and then overwritten by
    any CLI argument of the same name whose value is not None. The OCR backend
    configuration comes from the CLI's ``<backend>_config`` entry when present
    and truthy, otherwise from the backend's table in ``file_config``. A GMFT
    configuration is only built when ``extract_tables`` is truthy, again
    preferring the CLI value over the file's ``gmft`` table. An ``ocr_backend``
    of ``"none"`` is translated to None before construction.

    Args:
        file_config: Configuration loaded from file.
        cli_args: CLI arguments.

    Returns:
        ExtractionConfig instance.
    """
    simple_fields = ("force_ocr", "chunk_content", "extract_tables", "max_chars", "max_overlap", "ocr_backend")

    settings: dict[str, Any] = {}
    if file_config:
        settings.update((name, file_config[name]) for name in simple_fields if name in file_config)

    # CLI values take precedence, but an explicit None means "not given".
    for name in simple_fields:
        if name not in cli_args:
            continue
        cli_value = cli_args[name]
        if cli_value is not None:
            settings[name] = cli_value

    backend = settings.get("ocr_backend")
    if backend and backend != "none":
        backend_config = None
        cli_backend_key = f"{backend}_config"

        if cli_args.get(cli_backend_key):
            backend_kwargs = cli_args[cli_backend_key]
            known_backends = (
                ("tesseract", TesseractConfig),
                ("easyocr", EasyOCRConfig),
                ("paddleocr", PaddleOCRConfig),
            )
            for backend_name, config_cls in known_backends:
                if backend == backend_name:
                    backend_config = config_cls(**backend_kwargs)
                    break

        # Fall back to the file's backend table only when the CLI supplied nothing usable.
        if not backend_config and file_config:
            backend_config = parse_ocr_backend_config(file_config, backend)  # type: ignore[assignment]

        if backend_config:
            settings["ocr_config"] = backend_config

    if settings.get("extract_tables"):
        table_config = None

        if cli_args.get("gmft_config"):
            table_config = GMFTConfig(**cli_args["gmft_config"])
        elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
            table_config = GMFTConfig(**file_config["gmft"])

        if table_config:
            settings["gmft_config"] = table_config

    if backend == "none":
        settings["ocr_backend"] = None

    return ExtractionConfig(**settings)
155
+
156
+
157
+ def find_default_config() -> Path | None:
158
+ """Find the default configuration file (pyproject.toml).
159
+
160
+ Returns:
161
+ Path to the configuration file or None if not found.
162
+ """
163
+ current = Path.cwd()
164
+ while current != current.parent:
165
+ config_path = current / "pyproject.toml"
166
+ if config_path.exists():
167
+ try:
168
+ with config_path.open("rb") as f:
169
+ data = tomllib.load(f)
170
+ if "tool" in data and "kreuzberg" in data["tool"]:
171
+ return config_path
172
+ except Exception: # noqa: BLE001
173
+ pass
174
+ current = current.parent
175
+ return None
@@ -2,7 +2,6 @@ from __future__ import annotations
2
2
 
3
3
  from typing import TYPE_CHECKING, ClassVar
4
4
 
5
- import anyio
6
5
  from anyio import Path as AsyncPath
7
6
 
8
7
  from kreuzberg._extractors._base import Extractor
@@ -13,10 +12,12 @@ from kreuzberg.exceptions import ValidationError
13
12
 
14
13
  if TYPE_CHECKING: # pragma: no cover
15
14
  from collections.abc import Mapping
16
- from pathlib import Path
17
15
 
18
16
  from kreuzberg._types import ExtractionResult
19
17
 
18
+ import contextlib
19
+ from pathlib import Path
20
+
20
21
 
21
22
  class ImageExtractor(Extractor):
22
23
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = IMAGE_MIME_TYPES
@@ -58,10 +59,44 @@ class ImageExtractor(Extractor):
58
59
  return await get_ocr_backend(self.config.ocr_backend).process_file(path, **self.config.get_config_dict())
59
60
 
60
61
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
61
- return anyio.run(self.extract_bytes_async, content)
62
+ """Pure sync implementation of extract_bytes."""
63
+ import os
64
+ import tempfile
65
+
66
+ extension = self._get_extension_from_mime_type(self.mime_type)
67
+ fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
68
+
69
+ try:
70
+ with os.fdopen(fd, "wb") as f:
71
+ f.write(content)
72
+
73
+ return self.extract_path_sync(Path(temp_path))
74
+ finally:
75
+ with contextlib.suppress(OSError):
76
+ Path(temp_path).unlink()
62
77
 
63
78
  def extract_path_sync(self, path: Path) -> ExtractionResult:
64
- return anyio.run(self.extract_path_async, path)
79
+ """Pure sync implementation of extract_path."""
80
+ if self.config.ocr_backend is None:
81
+ raise ValidationError("ocr_backend is None, cannot perform OCR")
82
+
83
+ from kreuzberg._ocr._tesseract import TesseractConfig
84
+ from kreuzberg._types import ExtractionResult
85
+
86
+ if self.config.ocr_backend == "tesseract":
87
+ from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
88
+
89
+ if isinstance(self.config.ocr_config, TesseractConfig):
90
+ config = self.config.ocr_config
91
+ else:
92
+ config = TesseractConfig()
93
+
94
+ results = process_batch_images_sync_pure([str(path)], config)
95
+ if results:
96
+ return results[0]
97
+ return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])
98
+
99
+ raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
65
100
 
66
101
  def _get_extension_from_mime_type(self, mime_type: str) -> str:
67
102
  if mime_type in self.IMAGE_MIME_TYPE_EXT_MAP:
@@ -1,11 +1,12 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import contextlib
3
4
  import re
4
5
  import sys
5
6
  from json import JSONDecodeError, loads
7
+ from pathlib import Path
6
8
  from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal, cast
7
9
 
8
- import anyio
9
10
  from anyio import Path as AsyncPath
10
11
  from anyio import run_process
11
12
 
@@ -21,7 +22,7 @@ from kreuzberg.exceptions import MissingDependencyError, ParsingError, Validatio
21
22
  if TYPE_CHECKING: # pragma: no cover
22
23
  from collections.abc import Mapping
23
24
  from os import PathLike
24
- from pathlib import Path
25
+
25
26
 
26
27
  if sys.version_info < (3, 11): # pragma: no cover
27
28
  from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
@@ -194,7 +195,7 @@ class PandocExtractor(Extractor):
194
195
  raise ParsingError("Failed to process file", context={"file": str(path), "errors": eg.exceptions}) from eg
195
196
 
196
197
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
197
- """Synchronous version of extract_bytes_async.
198
+ """Pure sync implementation of extract_bytes.
198
199
 
199
200
  Args:
200
201
  content: The content bytes to process.
@@ -202,18 +203,46 @@ class PandocExtractor(Extractor):
202
203
  Returns:
203
204
  ExtractionResult with the extracted text and metadata.
204
205
  """
205
- return anyio.run(self.extract_bytes_async, content)
206
+ import os
207
+ import tempfile
208
+ from pathlib import Path
209
+
210
+ extension = self._get_pandoc_type_from_mime_type(self.mime_type)
211
+ fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
212
+
213
+ try:
214
+ with os.fdopen(fd, "wb") as f:
215
+ f.write(content)
216
+
217
+ return self.extract_path_sync(Path(temp_path))
218
+ finally:
219
+ with contextlib.suppress(OSError):
220
+ Path(temp_path).unlink()
206
221
 
207
222
  def extract_path_sync(self, path: Path) -> ExtractionResult:
208
- """Synchronous version of extract_path_async.
223
+ """Pure sync implementation of extract_path.
209
224
 
210
225
  Args:
211
226
  path: The path to the file to process.
212
227
 
213
228
  Returns:
214
229
  ExtractionResult with the extracted text and metadata.
230
+
231
+ Raises:
232
+ ParsingError: When file processing fails.
215
233
  """
216
- return anyio.run(self.extract_path_async, path)
234
+ self._validate_pandoc_version_sync()
235
+ self._get_pandoc_type_from_mime_type(self.mime_type)
236
+
237
+ try:
238
+ metadata = self._extract_metadata_sync(path)
239
+ content = self._extract_file_sync(path)
240
+
241
+ return ExtractionResult(
242
+ content=normalize_spaces(content), metadata=metadata, mime_type=MARKDOWN_MIME_TYPE, chunks=[]
243
+ )
244
+ except Exception as e:
245
+ raise ParsingError("Failed to process file", context={"file": str(path), "error": str(e)}) from e
217
246
 
218
247
  async def _validate_pandoc_version(self) -> None:
219
248
  """Validate that the installed Pandoc version meets the minimum requirement.
@@ -229,36 +258,26 @@ class PandocExtractor(Extractor):
229
258
  result = await run_process(command)
230
259
  stdout = result.stdout.decode()
231
260
 
232
- # Try more inclusive patterns to detect the pandoc version
233
- # Try common formats first
234
261
  version_match = re.search(
235
262
  r"pandoc(?:\.exe)?(?:\s+|\s+v|\s+version\s+)(\d+)\.(\d+)(?:\.(\d+))?", stdout, re.IGNORECASE
236
263
  )
237
264
 
238
- # Try version in parentheses format
239
265
  if not version_match:
240
266
  version_match = re.search(r"pandoc\s+\(version\s+(\d+)\.(\d+)(?:\.(\d+))?\)", stdout, re.IGNORECASE)
241
267
 
242
- # Try hyphenated format
243
268
  if not version_match:
244
269
  version_match = re.search(r"pandoc-(\d+)\.(\d+)(?:\.(\d+))?", stdout)
245
270
 
246
- # If still no match, check for version at the beginning of the output or any line
247
271
  if not version_match:
248
- # Match version at the start of a line (like in the test case "2.9.2.1\npandoc-types 1.20")
249
272
  version_match = re.search(r"^(\d+)\.(\d+)(?:\.(\d+)(?:\.(\d+))?)?", stdout, re.MULTILINE)
250
273
 
251
- # Try finding version-like patterns elsewhere in the text
252
274
  if not version_match:
253
- # Search for version-like patterns at the beginning of lines or after spaces
254
275
  version_match = re.search(r"(?:^|\s)(\d+)\.(\d+)(?:\.(\d+))?(?:\s|$)", stdout)
255
276
 
256
- # As a last resort, check any sequence of digits that might be a version
257
277
  if not version_match:
258
278
  out_lines = stdout.splitlines()
259
279
  for line in out_lines:
260
280
  for token in line.split():
261
- # Match standalone version patterns like 2.11 or 2.11.4
262
281
  version_pattern = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?$", token)
263
282
  if version_pattern:
264
283
  version_match = version_pattern
@@ -266,12 +285,10 @@ class PandocExtractor(Extractor):
266
285
  if version_match:
267
286
  break
268
287
 
269
- # If we found a version, check that the major version is at least the minimum required
270
288
  if version_match and int(version_match.group(1)) >= MINIMAL_SUPPORTED_PANDOC_VERSION:
271
289
  self._checked_version = True
272
290
  return
273
291
 
274
- # If we get here, we either didn't find a version or it's too low
275
292
  raise MissingDependencyError(
276
293
  "Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
277
294
  )
@@ -560,6 +577,129 @@ class PandocExtractor(Extractor):
560
577
 
561
578
  return None
562
579
 
580
def _validate_pandoc_version_sync(self) -> None:
    """Synchronous version of _validate_pandoc_version.

    Runs ``pandoc --version`` and looks for a version number in its output
    using the same sequence of fallbacks as the async validator, so both
    entry points accept and reject the same installations. A successful
    check is recorded in ``self._checked_version`` and later calls return
    immediately.

    Raises:
        MissingDependencyError: If pandoc cannot be executed, exits with a
            non-zero status, or reports no version whose major component is
            at least ``MINIMAL_SUPPORTED_PANDOC_VERSION``.
    """
    import subprocess

    if self._checked_version:
        return

    missing_message = (
        "Pandoc version 2 or above is a required system dependency. "
        "Please install it on your system and make sure its available in $PATH."
    )

    # Only the process launch can raise the errors translated here.
    try:
        result = subprocess.run(["pandoc", "--version"], capture_output=True, text=True, check=False)  # noqa: S607
    except (subprocess.SubprocessError, FileNotFoundError) as e:
        raise MissingDependencyError(missing_message) from e

    if result.returncode != 0:
        raise MissingDependencyError(missing_message)

    stdout = result.stdout

    # Ordered from most to least specific; the first pattern that matches wins.
    search_patterns = (
        (r"pandoc(?:\.exe)?(?:\s+|\s+v|\s+version\s+)(\d+)\.(\d+)(?:\.(\d+))?", re.IGNORECASE),
        (r"pandoc\s+\(version\s+(\d+)\.(\d+)(?:\.(\d+))?\)", re.IGNORECASE),
        (r"pandoc-(\d+)\.(\d+)(?:\.(\d+))?", 0),
        (r"^(\d+)\.(\d+)(?:\.(\d+)(?:\.(\d+))?)?", re.MULTILINE),
        (r"(?:^|\s)(\d+)\.(\d+)(?:\.(\d+))?(?:\s|$)", 0),
    )
    version_match = None
    for pattern, flags in search_patterns:
        version_match = re.search(pattern, stdout, flags)
        if version_match:
            break

    if not version_match:
        # Last resort: any whitespace-separated token that is itself a bare version.
        for token in stdout.split():
            version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?$", token)
            if version_match:
                break

    if version_match and int(version_match.group(1)) >= MINIMAL_SUPPORTED_PANDOC_VERSION:
        self._checked_version = True
        return

    raise MissingDependencyError(missing_message)
625
+
626
def _extract_metadata_sync(self, path: Path) -> Metadata:
    """Extract document metadata with a blocking pandoc call.

    Pandoc converts the file to its JSON representation in a temporary
    file; that JSON is parsed and handed to ``_extract_metadata``. The
    temporary file is removed afterwards.

    Args:
        path: The path to the file to process.

    Returns:
        The metadata produced by ``_extract_metadata``.

    Raises:
        ParsingError: If pandoc exits with a non-zero status, or if reading
            or decoding its JSON output fails.
    """
    import os
    import subprocess
    import tempfile

    source_format = self._get_pandoc_type_from_mime_type(self.mime_type)
    descriptor, json_path = tempfile.mkstemp(suffix=".json")
    # Pandoc writes the file itself; only the reserved name is needed.
    os.close(descriptor)
    json_file = Path(json_path)

    try:
        completed = subprocess.run(
            [
                "pandoc",
                str(path),
                f"--from={source_format}",
                "--to=json",
                "--standalone",
                "--quiet",
                "--output",
                str(json_path),
            ],
            capture_output=True,
            text=True,
            check=False,
        )
        if completed.returncode != 0:
            raise ParsingError("Failed to extract file data", context={"file": str(path), "error": completed.stderr})

        return self._extract_metadata(loads(json_file.read_text(encoding="utf-8")))
    except (OSError, JSONDecodeError) as e:
        raise ParsingError("Failed to extract file data", context={"file": str(path)}) from e
    finally:
        with contextlib.suppress(OSError):
            json_file.unlink()
663
+
664
def _extract_file_sync(self, path: Path) -> str:
    """Convert a file to Markdown with a blocking pandoc call.

    Pandoc writes Markdown to a temporary file, which is read back, passed
    through ``normalize_spaces`` and then removed.

    Args:
        path: The path to the file to process.

    Returns:
        The Markdown text after ``normalize_spaces``.

    Raises:
        ParsingError: If pandoc exits with a non-zero status, or if reading
            its output file fails with an OSError.
    """
    import os
    import subprocess
    import tempfile

    source_format = self._get_pandoc_type_from_mime_type(self.mime_type)
    descriptor, markdown_path = tempfile.mkstemp(suffix=".md")
    # Pandoc writes the file itself; only the reserved name is needed.
    os.close(descriptor)
    markdown_file = Path(markdown_path)

    try:
        completed = subprocess.run(
            [
                "pandoc",
                str(path),
                f"--from={source_format}",
                "--to=markdown",
                "--standalone",
                "--wrap=preserve",
                "--quiet",
                "--output",
                str(markdown_path),
            ],
            capture_output=True,
            text=True,
            check=False,
        )
        if completed.returncode != 0:
            raise ParsingError("Failed to extract file data", context={"file": str(path), "error": completed.stderr})

        return normalize_spaces(markdown_file.read_text(encoding="utf-8"))
    except OSError as e:
        raise ParsingError("Failed to extract file data", context={"file": str(path)}) from e
    finally:
        with contextlib.suppress(OSError):
            markdown_file.unlink()
702
+
563
703
 
564
704
  class MarkdownExtractor(PandocExtractor):
565
705
  """Extractor for Markdown-based document formats."""