kreuzberg-3.13.0-py3-none-any.whl → kreuzberg-3.13.1-py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. kreuzberg/_chunker.py +0 -15
  2. kreuzberg/_config.py +0 -124
  3. kreuzberg/_document_classification.py +20 -39
  4. kreuzberg/_entity_extraction.py +0 -29
  5. kreuzberg/_extractors/_base.py +4 -66
  6. kreuzberg/_extractors/_email.py +0 -4
  7. kreuzberg/_extractors/_image.py +0 -2
  8. kreuzberg/_extractors/_pandoc.py +0 -58
  9. kreuzberg/_extractors/_pdf.py +0 -3
  10. kreuzberg/_extractors/_presentation.py +0 -82
  11. kreuzberg/_extractors/_spread_sheet.py +0 -2
  12. kreuzberg/_gmft.py +0 -61
  13. kreuzberg/_language_detection.py +0 -14
  14. kreuzberg/_mime_types.py +0 -17
  15. kreuzberg/_ocr/_base.py +4 -76
  16. kreuzberg/_ocr/_easyocr.py +110 -85
  17. kreuzberg/_ocr/_paddleocr.py +146 -138
  18. kreuzberg/_ocr/_table_extractor.py +0 -76
  19. kreuzberg/_ocr/_tesseract.py +0 -206
  20. kreuzberg/_playa.py +0 -27
  21. kreuzberg/_registry.py +0 -36
  22. kreuzberg/_types.py +16 -119
  23. kreuzberg/_utils/_cache.py +0 -52
  24. kreuzberg/_utils/_device.py +0 -56
  25. kreuzberg/_utils/_document_cache.py +0 -73
  26. kreuzberg/_utils/_errors.py +0 -47
  27. kreuzberg/_utils/_ocr_cache.py +136 -0
  28. kreuzberg/_utils/_pdf_lock.py +0 -14
  29. kreuzberg/_utils/_process_pool.py +0 -47
  30. kreuzberg/_utils/_quality.py +0 -17
  31. kreuzberg/_utils/_ref.py +0 -16
  32. kreuzberg/_utils/_serialization.py +0 -25
  33. kreuzberg/_utils/_string.py +0 -20
  34. kreuzberg/_utils/_sync.py +0 -76
  35. kreuzberg/_utils/_table.py +0 -45
  36. kreuzberg/_utils/_tmp.py +0 -9
  37. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +3 -2
  38. kreuzberg-3.13.1.dist-info/RECORD +57 -0
  39. kreuzberg-3.13.0.dist-info/RECORD +0 -56
  40. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
  41. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
  42. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_pdf.py CHANGED
@@ -93,7 +93,6 @@ class PDFExtractor(Extractor):
93
93
  return self._apply_quality_processing(result)
94
94
 
95
95
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
96
- """Pure sync implementation of PDF extraction from bytes."""
97
96
  fd, temp_path = tempfile.mkstemp(suffix=".pdf")
98
97
  try:
99
98
  with os.fdopen(fd, "wb") as f:
@@ -110,7 +109,6 @@ class PDFExtractor(Extractor):
110
109
  Path(temp_path).unlink()
111
110
 
112
111
  def extract_path_sync(self, path: Path) -> ExtractionResult:
113
- """Pure sync implementation of PDF extraction from path."""
114
112
  try:
115
113
  text = self._extract_pdf_searchable_text_sync(path)
116
114
  except ParsingError:
@@ -330,7 +328,6 @@ class PDFExtractor(Extractor):
330
328
  return "\n\n".join(result.content for result in results)
331
329
 
332
330
  def _process_pdf_images_with_ocr_direct(self, images: list[Image]) -> str:
333
- """Process PIL images directly without temp files."""
334
331
  backend = get_ocr_backend(self.config.ocr_backend)
335
332
 
336
333
  match self.config.ocr_backend:
kreuzberg/_extractors/_presentation.py CHANGED
@@ -25,94 +25,23 @@ _NON_WORD_PATTERN = re.compile(r"\W")
25
25
 
26
26
 
27
27
  class PresentationExtractor(Extractor):
28
- """Extractor for PowerPoint (.pptx) files.
29
-
30
- This extractor processes PowerPoint presentations and converts their content into Markdown format.
31
- It handles slides, shapes, images, tables, and slide notes, preserving the structure and content
32
- of the presentation in a readable text format.
33
-
34
- The extractor provides both synchronous and asynchronous methods for processing files either
35
- from disk or from bytes in memory.
36
- """
37
-
38
28
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {POWER_POINT_MIME_TYPE}
39
29
 
40
30
  async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
41
- """Asynchronously extract content from PowerPoint file bytes.
42
-
43
- Args:
44
- content: Raw bytes of the PowerPoint file to process.
45
-
46
- Returns:
47
- ExtractionResult: Contains the extracted content in Markdown format,
48
- the MIME type, and any additional metadata.
49
- """
50
31
  return self._extract_pptx(content)
51
32
 
52
33
  async def extract_path_async(self, path: Path) -> ExtractionResult:
53
- """Asynchronously extract content from a PowerPoint file on disk.
54
-
55
- Args:
56
- path: Path to the PowerPoint file to process.
57
-
58
- Returns:
59
- ExtractionResult: Contains the extracted content in Markdown format,
60
- the MIME type, and any additional metadata.
61
- """
62
34
  content = await AsyncPath(path).read_bytes()
63
35
  return self._extract_pptx(content)
64
36
 
65
37
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
66
- """Synchronously extract content from PowerPoint file bytes.
67
-
68
- Args:
69
- content: Raw bytes of the PowerPoint file to process.
70
-
71
- Returns:
72
- ExtractionResult: Contains the extracted content in Markdown format,
73
- the MIME type, and any additional metadata.
74
- """
75
38
  return self._extract_pptx(content)
76
39
 
77
40
  def extract_path_sync(self, path: Path) -> ExtractionResult:
78
- """Synchronously extract content from a PowerPoint file on disk.
79
-
80
- Args:
81
- path: Path to the PowerPoint file to process.
82
-
83
- Returns:
84
- ExtractionResult: Contains the extracted content in Markdown format,
85
- the MIME type, and any additional metadata.
86
- """
87
41
  content = Path(path).read_bytes()
88
42
  return self._extract_pptx(content)
89
43
 
90
44
  def _extract_pptx(self, file_contents: bytes) -> ExtractionResult:
91
- """Process PowerPoint file contents and convert to Markdown.
92
-
93
- This method handles the core logic of extracting content from a PowerPoint file.
94
- It processes:
95
- - Slide titles and content
96
- - Images (with alt text if available)
97
- - Tables (converted to HTML format)
98
- - Text frames
99
- - Slide notes
100
-
101
- Args:
102
- file_contents: Raw bytes of the PowerPoint file to process.
103
-
104
- Returns:
105
- ExtractionResult: Contains the extracted content in Markdown format,
106
- the MIME type, and any additional metadata.
107
-
108
- Notes:
109
- The extraction preserves the following elements:
110
- - Slide numbers (as HTML comments)
111
- - Images (converted to Markdown image syntax with alt text)
112
- - Tables (converted to HTML table syntax)
113
- - Text content (with titles properly formatted)
114
- - Slide notes (under a dedicated section for each slide)
115
- """
116
45
  md_content = ""
117
46
  presentation = pptx.Presentation(BytesIO(file_contents))
118
47
 
@@ -181,14 +110,6 @@ class PresentationExtractor(Extractor):
181
110
 
182
111
  @staticmethod
183
112
  def _extract_presentation_metadata(presentation: Presentation) -> Metadata:
184
- """Extract metadata from a presentation instance.
185
-
186
- Args:
187
- presentation: A `Presentation` object representing the PowerPoint file.
188
-
189
- Returns:
190
- PresentationMetadata: Object containing presentation-specific metadata fields.
191
- """
192
113
  metadata: Metadata = {}
193
114
 
194
115
  PresentationExtractor._extract_core_properties(presentation, metadata)
@@ -203,7 +124,6 @@ class PresentationExtractor(Extractor):
203
124
 
204
125
  @staticmethod
205
126
  def _extract_core_properties(presentation: Presentation, metadata: Metadata) -> None:
206
- """Extract core document properties from presentation."""
207
127
  property_mapping = [
208
128
  ("authors", "author"),
209
129
  ("comments", "comments"),
@@ -230,7 +150,6 @@ class PresentationExtractor(Extractor):
230
150
 
231
151
  @staticmethod
232
152
  def _extract_fonts(presentation: Presentation) -> set[str]:
233
- """Extract all fonts used in the presentation."""
234
153
  fonts = set()
235
154
  for slide in presentation.slides:
236
155
  for shape in slide.shapes:
@@ -245,7 +164,6 @@ class PresentationExtractor(Extractor):
245
164
 
246
165
  @staticmethod
247
166
  def _add_presentation_structure_info(presentation: Presentation, metadata: Metadata, fonts: set[str]) -> None:
248
- """Add structural information about the presentation."""
249
167
  slide_count = len(presentation.slides)
250
168
  if slide_count == 0:
251
169
  return
kreuzberg/_extractors/_spread_sheet.py CHANGED
@@ -72,7 +72,6 @@ class SpreadSheetExtractor(Extractor):
72
72
  ) from e
73
73
 
74
74
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
75
- """Pure sync implementation of extract_bytes."""
76
75
  fd, temp_path = tempfile.mkstemp(suffix=".xlsx")
77
76
 
78
77
  try:
@@ -85,7 +84,6 @@ class SpreadSheetExtractor(Extractor):
85
84
  Path(temp_path).unlink()
86
85
 
87
86
  def extract_path_sync(self, path: Path) -> ExtractionResult:
88
- """Pure sync implementation of extract_path."""
89
87
  try:
90
88
  workbook = CalamineWorkbook.from_path(str(path))
91
89
  results = []
kreuzberg/_gmft.py CHANGED
@@ -31,23 +31,6 @@ if TYPE_CHECKING:
31
31
  async def extract_tables(
32
32
  file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
33
33
  ) -> list[TableData]:
34
- """Extracts tables from a PDF file.
35
-
36
- This function takes a file path to a PDF file, and an optional configuration object.
37
- It returns a list of strings, where each string is a markdown-formatted table.
38
-
39
- Args:
40
- file_path: The path to the PDF file.
41
- config: An optional configuration object.
42
- use_isolated_process: Whether to use an isolated process for extraction.
43
- If None, uses environment variable KREUZBERG_GMFT_ISOLATED (default: True).
44
-
45
- Raises:
46
- MissingDependencyError: Raised when the required dependencies are not installed.
47
-
48
- Returns:
49
- A list of table data dictionaries.
50
- """
51
34
  # Determine if we should use isolated process # ~keep
52
35
  if use_isolated_process is None:
53
36
  use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
@@ -164,17 +147,6 @@ async def extract_tables(
164
147
  def extract_tables_sync(
165
148
  file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
166
149
  ) -> list[TableData]:
167
- """Synchronous wrapper for extract_tables.
168
-
169
- Args:
170
- file_path: The path to the PDF file.
171
- config: An optional configuration object.
172
- use_isolated_process: Whether to use an isolated process for extraction.
173
- If None, uses environment variable KREUZBERG_GMFT_ISOLATED (default: True).
174
-
175
- Returns:
176
- A list of table data dictionaries.
177
- """
178
150
  # Determine if we should use isolated process # ~keep
179
151
  if use_isolated_process is None:
180
152
  use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
@@ -276,13 +248,6 @@ def _extract_tables_in_process(
276
248
  config_dict: dict[str, Any],
277
249
  result_queue: queue.Queue[tuple[bool, Any]],
278
250
  ) -> None:
279
- """Extract tables in an isolated process to handle potential segfaults.
280
-
281
- Args:
282
- file_path: Path to the PDF file
283
- config_dict: Serialized GMFTConfig as a dict
284
- result_queue: Queue to put results or errors
285
- """
286
251
  signal.signal(signal.SIGINT, signal.SIG_IGN)
287
252
 
288
253
  try:
@@ -366,19 +331,6 @@ def _extract_tables_isolated(
366
331
  config: GMFTConfig | None = None,
367
332
  timeout: float = 300.0,
368
333
  ) -> list[TableData]:
369
- """Extract tables using an isolated process to handle segfaults.
370
-
371
- Args:
372
- file_path: Path to the PDF file
373
- config: GMFT configuration
374
- timeout: Maximum time to wait for extraction
375
-
376
- Returns:
377
- List of extracted tables
378
-
379
- Raises:
380
- RuntimeError: If extraction fails or times out
381
- """
382
334
  config = config or GMFTConfig()
383
335
  config_dict = msgspec.to_builtins(config)
384
336
 
@@ -477,19 +429,6 @@ async def _extract_tables_isolated_async(
477
429
  config: GMFTConfig | None = None,
478
430
  timeout: float = 300.0, # noqa: ASYNC109
479
431
  ) -> list[TableData]:
480
- """Async version of extract_tables_isolated using asyncio.
481
-
482
- Args:
483
- file_path: Path to the PDF file
484
- config: GMFT configuration
485
- timeout: Maximum time to wait for extraction
486
-
487
- Returns:
488
- List of extracted tables
489
-
490
- Raises:
491
- RuntimeError: If extraction fails or times out
492
- """
493
432
  config = config or GMFTConfig()
494
433
  config_dict = msgspec.to_builtins(config)
495
434
 
kreuzberg/_language_detection.py CHANGED
@@ -24,7 +24,6 @@ _CACHE_SIZE = 128
24
24
 
25
25
 
26
26
  def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangDetectConfig | None:
27
- """Create FastLangDetectConfig from our config."""
28
27
  if not HAS_FAST_LANGDETECT or FastLangDetectConfig is None:
29
28
  return None
30
29
 
@@ -39,19 +38,6 @@ def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangD
39
38
 
40
39
  @lru_cache(maxsize=_CACHE_SIZE)
41
40
  def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -> list[str] | None:
42
- """Detect the most probable languages in the given text using fast-langdetect.
43
-
44
- Args:
45
- text: The text to analyze.
46
- config: Configuration for language detection. If None, uses defaults.
47
-
48
- Returns:
49
- A list of detected language codes in lowercase (e.g., ['en', 'de', 'fr']),
50
- or None if detection fails.
51
-
52
- Raises:
53
- MissingDependencyError: If fast-langdetect is not installed.
54
- """
55
41
  if not HAS_FAST_LANGDETECT or detect is None or detect_multilingual is None:
56
42
  raise MissingDependencyError.create_for_package(
57
43
  dependency_group="langdetect", functionality="language detection", package_name="fast-langdetect"
kreuzberg/_mime_types.py CHANGED
@@ -173,21 +173,6 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
173
173
  def validate_mime_type(
174
174
  *, file_path: PathLike[str] | str | None = None, mime_type: str | None = None, check_file_exists: bool = True
175
175
  ) -> str:
176
- """Validate and detect the MIME type for a given file.
177
-
178
- Args:
179
- file_path: The path to the file.
180
- mime_type: Optional explicit MIME type. If provided, this will be validated.
181
- If not provided, the function will attempt to detect the MIME type.
182
- check_file_exists: Whether to check if the file exists. Default is True.
183
- Set to False in tests where you want to validate a mime type without an actual file.
184
-
185
- Raises:
186
- ValidationError: If the MIME type is not supported or cannot be determined.
187
-
188
- Returns:
189
- The validated MIME type.
190
- """
191
176
  if mime_type:
192
177
  return _validate_explicit_mime_type(mime_type)
193
178
 
@@ -227,7 +212,6 @@ def validate_mime_type(
227
212
 
228
213
 
229
214
  def _validate_explicit_mime_type(mime_type: str) -> str:
230
- """Validate an explicitly provided MIME type."""
231
215
  if mime_type in SUPPORTED_MIME_TYPES:
232
216
  return mime_type
233
217
 
@@ -242,7 +226,6 @@ def _validate_explicit_mime_type(mime_type: str) -> str:
242
226
 
243
227
 
244
228
  def _detect_mime_type_uncached(file_path: PathLike[str] | str | None = None, check_file_exists: bool = True) -> str:
245
- """Detect MIME type without caching (internal function)."""
246
229
  if file_path and check_file_exists:
247
230
  path = Path(file_path)
248
231
  if not path.exists():
kreuzberg/_ocr/_base.py CHANGED
@@ -16,98 +16,26 @@ T = TypeVar("T")
16
16
 
17
17
 
18
18
  class OCRBackend(ABC, Generic[T]):
19
- """Abstract base class for Optical Character Recognition (OCR) backend implementations.
20
-
21
- This class provides the blueprint for OCR backend implementations,
22
- offering both synchronous and asynchronous methods to process images
23
- and files for text extraction.
24
- """
25
-
26
19
  @abstractmethod
27
- async def process_image(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult:
28
- """Asynchronously process an image and extract its text and metadata.
29
-
30
- Args:
31
- image: An instance of PIL.Image representing the input image.
32
- **kwargs: Any kwargs related to the given backend
33
-
34
- Returns:
35
- The extraction result object
36
- """
37
- ...
20
+ async def process_image(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult: ...
38
21
 
39
22
  @abstractmethod
40
- async def process_file(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult:
41
- """Asynchronously process a file and extract its text and metadata.
42
-
43
- Args:
44
- path: A Path object representing the file to be processed.
45
- **kwargs: Any kwargs related to the given backend
46
-
47
- Returns:
48
- The extraction result object
49
- """
50
- ...
23
+ async def process_file(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult: ...
51
24
 
52
25
  @abstractmethod
53
- def process_image_sync(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult:
54
- """Synchronously process an image and extract its text and metadata.
55
-
56
- Args:
57
- image: An instance of PIL.Image representing the input image.
58
- **kwargs: Any kwargs related to the given backend
59
-
60
- Returns:
61
- The extraction result object
62
- """
63
- ...
26
+ def process_image_sync(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult: ...
64
27
 
65
28
  @abstractmethod
66
- def process_file_sync(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult:
67
- """Synchronously process a file and extract its text and metadata.
68
-
69
- Args:
70
- path: A Path object representing the file to be processed.
71
- **kwargs: Any kwargs related to the given backend
72
-
73
- Returns:
74
- The extraction result object
75
- """
76
- ...
29
+ def process_file_sync(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult: ...
77
30
 
78
31
  def process_batch_sync(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
79
- """Synchronously process a batch of files and extract their text and metadata.
80
-
81
- Default implementation processes files sequentially. Backends can override
82
- for more efficient batch processing.
83
-
84
- Args:
85
- paths: List of Path objects representing files to be processed.
86
- **kwargs: Any kwargs related to the given backend
87
-
88
- Returns:
89
- List of extraction result objects in the same order as input paths
90
- """
91
32
  return [self.process_file_sync(path, **kwargs) for path in paths] # pragma: no cover
92
33
 
93
34
  async def process_batch(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
94
- """Asynchronously process a batch of files and extract their text and metadata.
95
-
96
- Default implementation processes files concurrently. Backends can override
97
- for more efficient batch processing.
98
-
99
- Args:
100
- paths: List of Path objects representing files to be processed.
101
- **kwargs: Any kwargs related to the given backend
102
-
103
- Returns:
104
- List of extraction result objects in the same order as input paths
105
- """
106
35
  from kreuzberg._utils._sync import run_taskgroup # noqa: PLC0415
107
36
 
108
37
  tasks = [self.process_file(path, **kwargs) for path in paths]
109
38
  return await run_taskgroup(*tasks) # pragma: no cover
110
39
 
111
40
  def __hash__(self) -> int:
112
- """Hash function for allowing caching."""
113
41
  return hash(type(self).__name__) # pragma: no cover