kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. kreuzberg/__init__.py +10 -0
  2. kreuzberg/_api/_config_cache.py +247 -0
  3. kreuzberg/_api/main.py +74 -45
  4. kreuzberg/_chunker.py +7 -6
  5. kreuzberg/_config.py +11 -1
  6. kreuzberg/_constants.py +2 -0
  7. kreuzberg/_document_classification.py +5 -7
  8. kreuzberg/_entity_extraction.py +9 -4
  9. kreuzberg/_extractors/_base.py +269 -3
  10. kreuzberg/_extractors/_email.py +101 -27
  11. kreuzberg/_extractors/_html.py +112 -7
  12. kreuzberg/_extractors/_image.py +23 -22
  13. kreuzberg/_extractors/_pandoc.py +106 -75
  14. kreuzberg/_extractors/_pdf.py +208 -99
  15. kreuzberg/_extractors/_presentation.py +76 -8
  16. kreuzberg/_extractors/_spread_sheet.py +24 -30
  17. kreuzberg/_extractors/_structured.py +83 -15
  18. kreuzberg/_gmft.py +5 -0
  19. kreuzberg/_mcp/server.py +324 -25
  20. kreuzberg/_mime_types.py +42 -0
  21. kreuzberg/_ocr/_easyocr.py +53 -21
  22. kreuzberg/_ocr/_paddleocr.py +1 -1
  23. kreuzberg/_ocr/_tesseract.py +88 -37
  24. kreuzberg/_types.py +291 -61
  25. kreuzberg/_utils/_cache.py +10 -4
  26. kreuzberg/_utils/_device.py +2 -4
  27. kreuzberg/_utils/_html_streaming.py +20 -0
  28. kreuzberg/_utils/_image_preprocessing.py +12 -39
  29. kreuzberg/_utils/_process_pool.py +29 -8
  30. kreuzberg/_utils/_quality.py +7 -2
  31. kreuzberg/_utils/_resource_managers.py +65 -0
  32. kreuzberg/_utils/_serialization.py +13 -6
  33. kreuzberg/_utils/_sync.py +39 -10
  34. kreuzberg/_utils/_tmp.py +37 -1
  35. kreuzberg/cli.py +34 -20
  36. kreuzberg/extraction.py +44 -28
  37. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
  38. kreuzberg-3.16.0.dist-info/RECORD +61 -0
  39. kreuzberg-3.14.1.dist-info/RECORD +0 -58
  40. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
  41. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
  42. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,38 +1,60 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import contextlib
4
+ import io
5
+ import logging
4
6
  import os
5
7
  import tempfile
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
9
  from dataclasses import asdict
10
+ from itertools import count
7
11
  from multiprocessing import cpu_count
8
12
  from pathlib import Path
9
13
  from re import Pattern
10
14
  from re import compile as compile_regex
11
- from typing import TYPE_CHECKING, ClassVar, cast
15
+ from typing import TYPE_CHECKING, Any, ClassVar, cast
12
16
 
13
17
  import anyio
14
18
  import pypdfium2
15
19
  from anyio import Path as AsyncPath
16
20
  from playa import parse
21
+ from playa.document import Document
22
+ from playa.image import get_image_suffix_and_writer
17
23
 
24
+ from kreuzberg._constants import PDF_POINTS_PER_INCH
18
25
  from kreuzberg._extractors._base import Extractor
19
26
  from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
20
27
  from kreuzberg._ocr import get_ocr_backend
21
28
  from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
22
- from kreuzberg._types import EasyOCRConfig, ExtractionResult, Metadata, OcrBackendType, PaddleOCRConfig, TesseractConfig
29
+ from kreuzberg._types import (
30
+ EasyOCRConfig,
31
+ ExtractedImage,
32
+ ExtractionResult,
33
+ ImageOCRResult,
34
+ Metadata,
35
+ OcrBackendType,
36
+ PaddleOCRConfig,
37
+ TesseractConfig,
38
+ )
23
39
  from kreuzberg._utils._errors import create_error_context, should_retry
24
40
  from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
25
- from kreuzberg._utils._pdf_lock import pypdfium_file_lock
41
+ from kreuzberg._utils._resource_managers import pdf_document, pdf_document_sync, pdf_resources_sync
26
42
  from kreuzberg._utils._string import normalize_spaces
27
- from kreuzberg._utils._sync import run_sync, run_taskgroup_batched
43
+ from kreuzberg._utils._sync import run_maybe_async, run_taskgroup, run_taskgroup_batched
28
44
  from kreuzberg._utils._table import generate_table_summary
29
- from kreuzberg._utils._tmp import create_temp_file
45
+ from kreuzberg._utils._tmp import temporary_file, temporary_file_sync
30
46
  from kreuzberg.exceptions import ParsingError
31
47
 
32
48
  if TYPE_CHECKING: # pragma: no cover
33
49
  from PIL.Image import Image
34
50
  from playa.document import Document
35
51
 
52
+ logger = logging.getLogger(__name__)
53
+
54
+ PDF_MAX_WORKERS = 8
55
+ PDF_MAX_RETRY_ATTEMPTS = 3
56
+ PDF_RETRY_DELAY_BASE = 0.5
57
+
36
58
 
37
59
  class PDFExtractor(Extractor):
38
60
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {PDF_MIME_TYPE}
@@ -41,27 +63,26 @@ class PDFExtractor(Extractor):
41
63
  MINIMUM_CORRUPTED_RESULTS: ClassVar[int] = 2
42
64
 
43
65
  async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
44
- file_path, unlink = await create_temp_file(".pdf")
45
- await AsyncPath(file_path).write_bytes(content)
46
- try:
66
+ async with temporary_file(".pdf", content) as file_path:
47
67
  metadata = await self._extract_metadata_with_password_attempts(content)
48
68
  result = await self.extract_path_async(file_path)
49
-
50
69
  result.metadata = metadata
51
70
  return result
52
- finally:
53
- await unlink()
54
71
 
55
72
  async def extract_path_async(self, path: Path) -> ExtractionResult:
56
73
  content_bytes = await AsyncPath(path).read_bytes()
57
74
 
58
75
  result: ExtractionResult | None = None
59
76
 
77
+ document: Document | None = None
78
+ if self.config.extract_images or self.config.extract_tables:
79
+ document = self._parse_with_password_attempts(content_bytes)
80
+
60
81
  if not self.config.force_ocr:
61
82
  try:
62
83
  content = await self._extract_pdf_searchable_text(path)
63
84
  if self._validate_extracted_text(content):
64
- result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
85
+ result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
65
86
  except ParsingError:
66
87
  pass
67
88
 
@@ -69,16 +90,18 @@ class PDFExtractor(Extractor):
69
90
  result = await self._extract_pdf_text_with_ocr(path, self.config.ocr_backend)
70
91
 
71
92
  if not result:
72
- result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
93
+ result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
73
94
 
74
- result.metadata = await self._extract_metadata_with_password_attempts(content_bytes)
95
+ metadata = await self._extract_metadata_with_password_attempts(content_bytes)
96
+ result.metadata = metadata
75
97
 
76
98
  if self.config.extract_tables:
77
99
  # GMFT is optional dependency ~keep
78
100
  try:
79
101
  from kreuzberg._gmft import extract_tables # noqa: PLC0415
80
102
 
81
- result.tables = await extract_tables(path, self.config.gmft_config)
103
+ tables = await extract_tables(path, self.config.gmft_config)
104
+ result.tables = tables
82
105
  except ImportError: # pragma: no cover
83
106
  result.tables = []
84
107
 
@@ -91,25 +114,30 @@ class PDFExtractor(Extractor):
91
114
  f"{table_summary['total_rows']} total rows",
92
115
  }
93
116
 
117
+ if self.config.extract_images and document:
118
+ images = await self._extract_images_from_playa(document)
119
+ images = self._check_image_memory_limits(images)
120
+ result.images = images
121
+ if self.config.ocr_extracted_images:
122
+ image_ocr_results = await self._process_images_with_ocr(result.images)
123
+ result.image_ocr_results = image_ocr_results
124
+
94
125
  return self._apply_quality_processing(result)
95
126
 
96
127
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
97
- fd, temp_path = tempfile.mkstemp(suffix=".pdf")
98
- try:
99
- with os.fdopen(fd, "wb") as f:
100
- f.write(content)
101
-
102
- result = self.extract_path_sync(Path(temp_path))
103
-
128
+ with temporary_file_sync(".pdf", content) as temp_path:
129
+ result = self.extract_path_sync(temp_path)
104
130
  metadata = self._extract_metadata_with_password_attempts_sync(content)
105
131
  result.metadata = metadata
106
-
107
132
  return result
108
- finally:
109
- with contextlib.suppress(OSError):
110
- Path(temp_path).unlink()
111
133
 
112
134
  def extract_path_sync(self, path: Path) -> ExtractionResult:
135
+ content_bytes = path.read_bytes()
136
+
137
+ document: Document | None = None
138
+ if self.config.extract_images or self.config.extract_tables:
139
+ document = self._parse_with_password_attempts(content_bytes)
140
+
113
141
  try:
114
142
  text = self._extract_pdf_searchable_text_sync(path)
115
143
  except ParsingError:
@@ -137,8 +165,7 @@ class PDFExtractor(Extractor):
137
165
  content=text,
138
166
  mime_type=PLAIN_TEXT_MIME_TYPE,
139
167
  metadata={},
140
- tables=tables,
141
- chunks=[],
168
+ tables=list(tables),
142
169
  )
143
170
 
144
171
  if tables:
@@ -150,6 +177,14 @@ class PDFExtractor(Extractor):
150
177
  f"{table_summary['total_rows']} total rows",
151
178
  }
152
179
 
180
+ if self.config.extract_images and document:
181
+ images = self._extract_images_from_playa_sync(document)
182
+ images = self._check_image_memory_limits(images)
183
+ result.images = images
184
+ if self.config.ocr_extracted_images:
185
+ image_ocr_results: list[ImageOCRResult] = run_maybe_async(self._process_images_with_ocr, result.images)
186
+ result.image_ocr_results = image_ocr_results
187
+
153
188
  return self._apply_quality_processing(result)
154
189
 
155
190
  def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
@@ -163,14 +198,95 @@ class PDFExtractor(Extractor):
163
198
 
164
199
  return (len(corruption_matches) / len(text)) < corruption_threshold
165
200
 
201
+ async def _extract_images_from_playa(self, doc: Document) -> list[ExtractedImage]:
202
+ async def extract_single_image(page_num: int, img_index: int, img_obj: Any) -> ExtractedImage | None:
203
+ try:
204
+ suffix, writer = get_image_suffix_and_writer(img_obj.stream)
205
+
206
+ buffer = io.BytesIO()
207
+ writer(buffer)
208
+
209
+ filename = f"page_{page_num}_image_{img_index}{suffix}"
210
+
211
+ return ExtractedImage(
212
+ data=buffer.getvalue(),
213
+ format=suffix[1:],
214
+ filename=filename,
215
+ page_number=page_num,
216
+ dimensions=img_obj.srcsize,
217
+ colorspace=img_obj.colorspace.name if img_obj.colorspace else None,
218
+ bits_per_component=img_obj.bits,
219
+ is_mask=img_obj.imagemask,
220
+ )
221
+ except Exception as e: # noqa: BLE001
222
+ logger.warning("Failed to extract image on page %s: %s", page_num, e)
223
+ return None
224
+
225
+ tasks = []
226
+ img_counter = 1
227
+ for page_num, page in enumerate(doc.pages, 1):
228
+ for img_obj in page.images:
229
+ tasks.append(extract_single_image(page_num, img_counter, img_obj))
230
+ img_counter += 1
231
+
232
+ if tasks:
233
+ results = await run_taskgroup(*tasks)
234
+ return [img for img in results if img is not None]
235
+
236
+ return []
237
+
238
+ def _extract_images_from_playa_sync(self, doc: Document) -> list[ExtractedImage]:
239
+ def extract_single_image(page_num: int, img_index: int, img_obj: Any) -> ExtractedImage | None:
240
+ try:
241
+ suffix, writer = get_image_suffix_and_writer(img_obj.stream)
242
+
243
+ buffer = io.BytesIO()
244
+ writer(buffer)
245
+
246
+ filename = f"page_{page_num}_image_{img_index}{suffix}"
247
+
248
+ return ExtractedImage(
249
+ data=buffer.getvalue(),
250
+ format=suffix[1:],
251
+ filename=filename,
252
+ page_number=page_num,
253
+ dimensions=img_obj.srcsize,
254
+ colorspace=img_obj.colorspace.name if img_obj.colorspace else None,
255
+ bits_per_component=img_obj.bits,
256
+ is_mask=img_obj.imagemask,
257
+ )
258
+ except Exception as e: # noqa: BLE001
259
+ logger.warning("Failed to extract image on page %s: %s", page_num, e)
260
+ return None
261
+
262
+ img_counter = count(1)
263
+ jobs = [
264
+ (page_num, next(img_counter), img_obj)
265
+ for page_num, page in enumerate(doc.pages, 1)
266
+ for img_obj in page.images
267
+ ]
268
+
269
+ if not jobs:
270
+ return []
271
+
272
+ images = []
273
+ max_workers = min(PDF_MAX_WORKERS, len(jobs))
274
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
275
+ futures = {executor.submit(extract_single_image, *job): i for i, job in enumerate(jobs)}
276
+ for future in as_completed(futures):
277
+ result = future.result()
278
+ if result:
279
+ images.append(result)
280
+
281
+ images.sort(key=lambda x: int((x.filename or "page_0_image_0.jpg").split("_")[-1].split(".")[0]))
282
+ return images
283
+
166
284
  async def _convert_pdf_to_images(self, input_file: Path) -> list[Image]:
167
- document: pypdfium2.PdfDocument | None = None
168
285
  last_error = None
169
286
 
170
- for attempt in range(3): # Try up to 3 times # ~keep
287
+ for attempt in range(PDF_MAX_RETRY_ATTEMPTS): # ~keep
171
288
  try:
172
- with pypdfium_file_lock(input_file):
173
- document = await run_sync(pypdfium2.PdfDocument, str(input_file))
289
+ async with pdf_document(input_file) as document:
174
290
  images = []
175
291
  for page in cast("pypdfium2.PdfDocument", document):
176
292
  width, height = page.get_size()
@@ -187,9 +303,12 @@ class PDFExtractor(Extractor):
187
303
  else:
188
304
  optimal_dpi = self.config.target_dpi
189
305
 
190
- scale = optimal_dpi / 72.0
306
+ scale = optimal_dpi / PDF_POINTS_PER_INCH
191
307
 
192
- images.append(page.render(scale=scale).to_pil())
308
+ bitmap = page.render(scale=scale)
309
+ image = bitmap.to_pil()
310
+ with pdf_resources_sync(bitmap):
311
+ images.append(image)
193
312
  return images
194
313
  except pypdfium2.PdfiumError as e: # noqa: PERF203
195
314
  last_error = e
@@ -204,11 +323,7 @@ class PDFExtractor(Extractor):
204
323
  ),
205
324
  ) from e
206
325
  # Wait before retry with exponential backoff # ~keep
207
- await anyio.sleep(0.5 * (attempt + 1))
208
- finally:
209
- if document:
210
- with pypdfium_file_lock(input_file), contextlib.suppress(Exception):
211
- await run_sync(document.close)
326
+ await anyio.sleep(PDF_RETRY_DELAY_BASE * (attempt + 1))
212
327
 
213
328
  # All retries failed # ~keep
214
329
  raise ParsingError(
@@ -217,7 +332,7 @@ class PDFExtractor(Extractor):
217
332
  operation="convert_pdf_to_images",
218
333
  file_path=input_file,
219
334
  error=last_error,
220
- attempts=3,
335
+ attempts=PDF_MAX_RETRY_ATTEMPTS,
221
336
  ),
222
337
  ) from last_error
223
338
 
@@ -230,14 +345,12 @@ class PDFExtractor(Extractor):
230
345
  )
231
346
  content = "\n".join(result.content for result in ocr_results)
232
347
 
233
- return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
348
+ return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
234
349
 
235
350
  @staticmethod
236
351
  async def _extract_pdf_searchable_text(input_file: Path) -> str:
237
- document: pypdfium2.PdfDocument | None = None
238
352
  try:
239
- with pypdfium_file_lock(input_file):
240
- document = await run_sync(pypdfium2.PdfDocument, str(input_file))
353
+ async with pdf_document(input_file) as document:
241
354
  pages_content = []
242
355
  page_errors = []
243
356
 
@@ -246,6 +359,8 @@ class PDFExtractor(Extractor):
246
359
  text_page = page.get_textpage()
247
360
  page_content = text_page.get_text_bounded()
248
361
  pages_content.append(page_content)
362
+ with pdf_resources_sync(text_page):
363
+ pass
249
364
  except Exception as e: # noqa: PERF203, BLE001
250
365
  page_errors.append({"page": i + 1, "error": str(e)})
251
366
  pages_content.append(f"[Error extracting page {i + 1}]")
@@ -275,52 +390,67 @@ class PDFExtractor(Extractor):
275
390
  error=e,
276
391
  ),
277
392
  ) from e
278
- finally:
279
- if document:
280
- with pypdfium_file_lock(input_file), contextlib.suppress(Exception):
281
- await run_sync(document.close)
282
393
 
283
394
  def _extract_pdf_searchable_text_sync(self, path: Path) -> str:
284
- pdf = None
285
395
  try:
286
- with pypdfium_file_lock(path):
287
- pdf = pypdfium2.PdfDocument(str(path))
396
+ with pdf_document_sync(path) as pdf:
288
397
  pages_text = []
289
398
  for page in pdf:
290
399
  text_page = page.get_textpage()
291
400
  text = text_page.get_text_bounded()
292
401
  pages_text.append(text)
293
- text_page.close()
294
- page.close()
402
+ with pdf_resources_sync(text_page, page):
403
+ pass
295
404
  return "\n".join(pages_text)
296
405
  except Exception as e:
297
406
  raise ParsingError(f"Failed to extract PDF text: {e}") from e
298
- finally:
299
- if pdf:
300
- with pypdfium_file_lock(path), contextlib.suppress(Exception):
301
- pdf.close()
302
407
 
303
408
  def _extract_pdf_with_ocr_sync(self, path: Path) -> str:
304
- pdf = None
409
+ temp_files: list[Path] = []
305
410
  try:
306
- images = []
307
- with pypdfium_file_lock(path):
308
- pdf = pypdfium2.PdfDocument(str(path))
411
+ with pdf_document_sync(path) as pdf:
309
412
  for page in pdf:
310
- bitmap = page.render(scale=200 / 72)
413
+ width, height = page.get_size()
414
+
415
+ if self.config.auto_adjust_dpi:
416
+ optimal_dpi = calculate_optimal_dpi(
417
+ page_width=width,
418
+ page_height=height,
419
+ target_dpi=self.config.target_dpi,
420
+ max_dimension=self.config.max_image_dimension,
421
+ min_dpi=self.config.min_dpi,
422
+ max_dpi=self.config.max_dpi,
423
+ )
424
+ else:
425
+ optimal_dpi = self.config.target_dpi
426
+
427
+ scale = optimal_dpi / PDF_POINTS_PER_INCH
428
+
429
+ bitmap = page.render(scale=scale)
311
430
  pil_image = bitmap.to_pil()
312
- images.append(pil_image)
313
- bitmap.close()
314
- page.close()
315
431
 
316
- return self._process_pdf_images_with_ocr_direct(images)
432
+ fd, tmp = tempfile.mkstemp(suffix=".png")
433
+ try:
434
+ os.close(fd)
435
+ tmp_path = Path(tmp)
436
+ pil_image.save(tmp_path)
437
+ temp_files.append(tmp_path)
438
+ except Exception:
439
+ with contextlib.suppress(OSError):
440
+ os.close(fd)
441
+ raise
442
+ finally:
443
+ with pdf_resources_sync(bitmap, page):
444
+ pil_image.close()
445
+
446
+ return self._process_pdf_images_with_ocr([str(p) for p in temp_files])
317
447
 
318
448
  except Exception as e:
319
449
  raise ParsingError(f"Failed to OCR PDF: {e}") from e
320
450
  finally:
321
- if pdf:
322
- with pypdfium_file_lock(path), contextlib.suppress(Exception):
323
- pdf.close()
451
+ for p in temp_files:
452
+ with contextlib.suppress(OSError):
453
+ p.unlink()
324
454
 
325
455
  def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
326
456
  backend = get_ocr_backend(self.config.ocr_backend)
@@ -348,35 +478,12 @@ class PDFExtractor(Extractor):
348
478
  return "\n\n".join(result.content for result in results)
349
479
 
350
480
  def _process_pdf_images_with_ocr_direct(self, images: list[Image]) -> str:
481
+ if not self.config.ocr_backend:
482
+ raise ValueError("OCR backend must be specified")
351
483
  backend = get_ocr_backend(self.config.ocr_backend)
484
+ config = self._prepare_ocr_config(self.config.ocr_backend)
352
485
 
353
- match self.config.ocr_backend:
354
- case "tesseract":
355
- config = (
356
- self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
357
- )
358
- results = []
359
- for image in images:
360
- result = backend.process_image_sync(image, **asdict(config))
361
- results.append(result)
362
- case "paddleocr":
363
- paddle_config = (
364
- self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
365
- )
366
- results = []
367
- for image in images:
368
- result = backend.process_image_sync(image, **asdict(paddle_config))
369
- results.append(result)
370
- case "easyocr":
371
- easy_config = (
372
- self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
373
- )
374
- results = []
375
- for image in images:
376
- result = backend.process_image_sync(image, **asdict(easy_config))
377
- results.append(result)
378
- case _:
379
- raise NotImplementedError(f"Direct image OCR not implemented for {self.config.ocr_backend}")
486
+ results = [backend.process_image_sync(image, **config) for image in images]
380
487
 
381
488
  return "\n\n".join(result.content for result in results)
382
489
 
@@ -390,9 +497,11 @@ class PDFExtractor(Extractor):
390
497
  for password in passwords:
391
498
  try:
392
499
  return parse(content, max_workers=1, password=password)
393
- except Exception as e: # noqa: PERF203, BLE001
500
+ except (ValueError, TypeError, KeyError, RuntimeError) as e: # noqa: PERF203
394
501
  last_exception = e
395
502
  continue
503
+ except OSError as e:
504
+ raise ParsingError(f"Failed to parse PDF: {e}") from e
396
505
 
397
506
  if last_exception:
398
507
  raise last_exception from None
@@ -411,7 +520,7 @@ class PDFExtractor(Extractor):
411
520
  for password in passwords:
412
521
  try:
413
522
  return await extract_pdf_metadata(content, password=password)
414
- except Exception as e: # noqa: PERF203, BLE001
523
+ except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203
415
524
  last_exception = e
416
525
  continue
417
526
 
@@ -429,7 +538,7 @@ class PDFExtractor(Extractor):
429
538
  for password in passwords:
430
539
  try:
431
540
  return extract_pdf_metadata_sync(content, password=password)
432
- except Exception as e: # noqa: PERF203, BLE001
541
+ except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203
433
542
  last_exception = e
434
543
  continue
435
544
 
@@ -1,11 +1,12 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import logging
3
4
  import re
4
5
  from contextlib import suppress
5
6
  from html import escape
6
7
  from io import BytesIO
7
8
  from pathlib import Path
8
- from typing import TYPE_CHECKING, ClassVar
9
+ from typing import TYPE_CHECKING, Any, ClassVar
9
10
 
10
11
  import pptx
11
12
  from anyio import Path as AsyncPath
@@ -13,8 +14,9 @@ from pptx.enum.shapes import MSO_SHAPE_TYPE
13
14
 
14
15
  from kreuzberg._extractors._base import Extractor
15
16
  from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, POWER_POINT_MIME_TYPE
16
- from kreuzberg._types import ExtractionResult
17
+ from kreuzberg._types import ExtractedImage, ExtractionResult, ImageOCRResult
17
18
  from kreuzberg._utils._string import normalize_spaces
19
+ from kreuzberg._utils._sync import run_maybe_async
18
20
 
19
21
  if TYPE_CHECKING: # pragma: no cover
20
22
  from pptx.presentation import Presentation
@@ -23,23 +25,41 @@ if TYPE_CHECKING: # pragma: no cover
23
25
 
24
26
  _NON_WORD_PATTERN = re.compile(r"\W")
25
27
 
28
+ logger = logging.getLogger(__name__)
29
+
26
30
 
27
31
  class PresentationExtractor(Extractor):
28
32
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {POWER_POINT_MIME_TYPE}
29
33
 
30
34
  async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
31
- return self._extract_pptx(content)
35
+ result = self._extract_pptx(content)
36
+ if self.config.extract_images and self.config.ocr_extracted_images and result.images:
37
+ image_ocr_results = await self._process_images_with_ocr(result.images)
38
+ result.image_ocr_results = image_ocr_results
39
+ return result
32
40
 
33
41
  async def extract_path_async(self, path: Path) -> ExtractionResult:
34
42
  content = await AsyncPath(path).read_bytes()
35
- return self._extract_pptx(content)
43
+ result = self._extract_pptx(content)
44
+ if self.config.extract_images and self.config.ocr_extracted_images and result.images:
45
+ image_ocr_results = await self._process_images_with_ocr(result.images)
46
+ result.image_ocr_results = image_ocr_results
47
+ return result
36
48
 
37
49
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
38
- return self._extract_pptx(content)
50
+ result = self._extract_pptx(content)
51
+ if self.config.extract_images and self.config.ocr_extracted_images and result.images:
52
+ image_ocr_results: list[ImageOCRResult] = run_maybe_async(self._process_images_with_ocr, result.images)
53
+ result.image_ocr_results = image_ocr_results
54
+ return result
39
55
 
40
56
  def extract_path_sync(self, path: Path) -> ExtractionResult:
41
57
  content = Path(path).read_bytes()
42
- return self._extract_pptx(content)
58
+ result = self._extract_pptx(content)
59
+ if self.config.extract_images and self.config.ocr_extracted_images and result.images:
60
+ image_ocr_results: list[ImageOCRResult] = run_maybe_async(self._process_images_with_ocr, result.images)
61
+ result.image_ocr_results = image_ocr_results
62
+ return result
43
63
 
44
64
  def _extract_pptx(self, file_contents: bytes) -> ExtractionResult:
45
65
  md_content = ""
@@ -63,8 +83,10 @@ class PresentationExtractor(Extractor):
63
83
  with suppress(AttributeError):
64
84
  alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") # noqa: SLF001
65
85
 
66
- filename = _NON_WORD_PATTERN.sub("", shape.name) + ".jpg"
67
- md_content += f"\n![{alt_text if alt_text else shape.name}]({filename})\n"
86
+ name_val = shape.name if isinstance(getattr(shape, "name", None), str) else "image"
87
+ filename = _NON_WORD_PATTERN.sub("", name_val) + ".jpg"
88
+ label = alt_text if alt_text else name_val
89
+ md_content += f"\n![{label}]({filename})\n"
68
90
 
69
91
  elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
70
92
  html_table = "<table>"
@@ -106,8 +128,54 @@ class PresentationExtractor(Extractor):
106
128
  chunks=[],
107
129
  )
108
130
 
131
+ if self.config.extract_images:
132
+ images = self._extract_images_from_pptx(presentation)
133
+ result.images = images
134
+
109
135
  return self._apply_quality_processing(result)
110
136
 
137
+ def _extract_images_from_pptx(self, presentation: Presentation) -> list[ExtractedImage]:
138
+ images: list[ExtractedImage] = []
139
+
140
+ for slide_num, slide in enumerate(presentation.slides, 1):
141
+ for shape in slide.shapes:
142
+ if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
143
+ try:
144
+ image = shape.image
145
+ if not image.blob or not isinstance(image.blob, bytes):
146
+ continue
147
+ filename = f"slide_{slide_num}_image_{len(images) + 1}.{image.ext}"
148
+
149
+ images.append(
150
+ ExtractedImage(data=image.blob, format=image.ext, filename=filename, page_number=slide_num)
151
+ )
152
+ except Exception as e: # noqa: BLE001
153
+ logger.warning("Failed to extract image from slide %s: %s", slide_num, e)
154
+ continue
155
+
156
+ elif shape.shape_type == MSO_SHAPE_TYPE.GROUP:
157
+ images.extend(self._extract_from_grouped_shapes(shape, slide_num, len(images)))
158
+
159
+ return images
160
+
161
+ def _extract_from_grouped_shapes(self, group_shape: Any, slide_num: int, image_count: int) -> list[ExtractedImage]:
162
+ images: list[ExtractedImage] = []
163
+ for shape in group_shape.shapes:
164
+ if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
165
+ try:
166
+ image = shape.image
167
+ if not image.blob or not isinstance(image.blob, bytes):
168
+ continue
169
+ filename = f"slide_{slide_num}_group_image_{image_count + len(images) + 1}.{image.ext}"
170
+ images.append(
171
+ ExtractedImage(data=image.blob, format=image.ext, filename=filename, page_number=slide_num)
172
+ )
173
+ except Exception as e: # noqa: BLE001
174
+ logger.warning("Failed to extract grouped image: %s", e)
175
+ elif shape.shape_type == MSO_SHAPE_TYPE.GROUP:
176
+ images.extend(self._extract_from_grouped_shapes(shape, slide_num, image_count + len(images)))
177
+ return images
178
+
111
179
  @staticmethod
112
180
  def _extract_presentation_metadata(presentation: Presentation) -> Metadata:
113
181
  metadata: Metadata = {}