kreuzberg 3.14.0__py3-none-any.whl → 3.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. kreuzberg/__init__.py +6 -0
  2. kreuzberg/_api/_config_cache.py +247 -0
  3. kreuzberg/_api/main.py +156 -30
  4. kreuzberg/_chunker.py +7 -6
  5. kreuzberg/_constants.py +2 -0
  6. kreuzberg/_document_classification.py +4 -6
  7. kreuzberg/_entity_extraction.py +9 -4
  8. kreuzberg/_extractors/_base.py +269 -3
  9. kreuzberg/_extractors/_email.py +95 -27
  10. kreuzberg/_extractors/_html.py +85 -7
  11. kreuzberg/_extractors/_image.py +23 -22
  12. kreuzberg/_extractors/_pandoc.py +106 -75
  13. kreuzberg/_extractors/_pdf.py +209 -99
  14. kreuzberg/_extractors/_presentation.py +72 -8
  15. kreuzberg/_extractors/_spread_sheet.py +25 -30
  16. kreuzberg/_mcp/server.py +345 -25
  17. kreuzberg/_mime_types.py +42 -0
  18. kreuzberg/_ocr/_easyocr.py +2 -2
  19. kreuzberg/_ocr/_paddleocr.py +1 -1
  20. kreuzberg/_ocr/_tesseract.py +74 -34
  21. kreuzberg/_types.py +182 -23
  22. kreuzberg/_utils/_cache.py +10 -4
  23. kreuzberg/_utils/_device.py +2 -4
  24. kreuzberg/_utils/_image_preprocessing.py +12 -39
  25. kreuzberg/_utils/_process_pool.py +29 -8
  26. kreuzberg/_utils/_quality.py +7 -2
  27. kreuzberg/_utils/_resource_managers.py +65 -0
  28. kreuzberg/_utils/_sync.py +36 -6
  29. kreuzberg/_utils/_tmp.py +37 -1
  30. kreuzberg/cli.py +34 -20
  31. kreuzberg/extraction.py +43 -27
  32. {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/METADATA +2 -1
  33. kreuzberg-3.15.0.dist-info/RECORD +60 -0
  34. kreuzberg-3.14.0.dist-info/RECORD +0 -58
  35. {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/WHEEL +0 -0
  36. {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/entry_points.txt +0 -0
  37. {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,38 +1,61 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import asyncio
3
4
  import contextlib
5
+ import io
6
+ import logging
4
7
  import os
5
8
  import tempfile
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
10
  from dataclasses import asdict
11
+ from itertools import count
7
12
  from multiprocessing import cpu_count
8
13
  from pathlib import Path
9
14
  from re import Pattern
10
15
  from re import compile as compile_regex
11
- from typing import TYPE_CHECKING, ClassVar, cast
16
+ from typing import TYPE_CHECKING, Any, ClassVar, cast
12
17
 
13
18
  import anyio
14
19
  import pypdfium2
15
20
  from anyio import Path as AsyncPath
16
21
  from playa import parse
22
+ from playa.document import Document
23
+ from playa.image import get_image_suffix_and_writer
17
24
 
25
+ from kreuzberg._constants import PDF_POINTS_PER_INCH
18
26
  from kreuzberg._extractors._base import Extractor
19
27
  from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
20
28
  from kreuzberg._ocr import get_ocr_backend
21
29
  from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
22
- from kreuzberg._types import EasyOCRConfig, ExtractionResult, Metadata, OcrBackendType, PaddleOCRConfig, TesseractConfig
30
+ from kreuzberg._types import (
31
+ EasyOCRConfig,
32
+ ExtractedImage,
33
+ ExtractionResult,
34
+ ImageOCRResult,
35
+ Metadata,
36
+ OcrBackendType,
37
+ PaddleOCRConfig,
38
+ TesseractConfig,
39
+ )
23
40
  from kreuzberg._utils._errors import create_error_context, should_retry
24
41
  from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
25
- from kreuzberg._utils._pdf_lock import pypdfium_file_lock
42
+ from kreuzberg._utils._resource_managers import pdf_document, pdf_document_sync, pdf_resources_sync
26
43
  from kreuzberg._utils._string import normalize_spaces
27
- from kreuzberg._utils._sync import run_sync, run_taskgroup_batched
44
+ from kreuzberg._utils._sync import run_maybe_async, run_taskgroup_batched
28
45
  from kreuzberg._utils._table import generate_table_summary
29
- from kreuzberg._utils._tmp import create_temp_file
46
+ from kreuzberg._utils._tmp import temporary_file, temporary_file_sync
30
47
  from kreuzberg.exceptions import ParsingError
31
48
 
32
49
  if TYPE_CHECKING: # pragma: no cover
33
50
  from PIL.Image import Image
34
51
  from playa.document import Document
35
52
 
53
+ logger = logging.getLogger(__name__)
54
+
55
+ PDF_MAX_WORKERS = 8
56
+ PDF_MAX_RETRY_ATTEMPTS = 3
57
+ PDF_RETRY_DELAY_BASE = 0.5
58
+
36
59
 
37
60
  class PDFExtractor(Extractor):
38
61
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {PDF_MIME_TYPE}
@@ -41,27 +64,26 @@ class PDFExtractor(Extractor):
41
64
  MINIMUM_CORRUPTED_RESULTS: ClassVar[int] = 2
42
65
 
43
66
  async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
44
- file_path, unlink = await create_temp_file(".pdf")
45
- await AsyncPath(file_path).write_bytes(content)
46
- try:
67
+ async with temporary_file(".pdf", content) as file_path:
47
68
  metadata = await self._extract_metadata_with_password_attempts(content)
48
69
  result = await self.extract_path_async(file_path)
49
-
50
70
  result.metadata = metadata
51
71
  return result
52
- finally:
53
- await unlink()
54
72
 
55
73
  async def extract_path_async(self, path: Path) -> ExtractionResult:
56
74
  content_bytes = await AsyncPath(path).read_bytes()
57
75
 
58
76
  result: ExtractionResult | None = None
59
77
 
78
+ document: Document | None = None
79
+ if self.config.extract_images or self.config.extract_tables:
80
+ document = self._parse_with_password_attempts(content_bytes)
81
+
60
82
  if not self.config.force_ocr:
61
83
  try:
62
84
  content = await self._extract_pdf_searchable_text(path)
63
85
  if self._validate_extracted_text(content):
64
- result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
86
+ result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
65
87
  except ParsingError:
66
88
  pass
67
89
 
@@ -69,16 +91,18 @@ class PDFExtractor(Extractor):
69
91
  result = await self._extract_pdf_text_with_ocr(path, self.config.ocr_backend)
70
92
 
71
93
  if not result:
72
- result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
94
+ result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
73
95
 
74
- result.metadata = await self._extract_metadata_with_password_attempts(content_bytes)
96
+ metadata = await self._extract_metadata_with_password_attempts(content_bytes)
97
+ result.metadata = metadata
75
98
 
76
99
  if self.config.extract_tables:
77
100
  # GMFT is optional dependency ~keep
78
101
  try:
79
102
  from kreuzberg._gmft import extract_tables # noqa: PLC0415
80
103
 
81
- result.tables = await extract_tables(path, self.config.gmft_config)
104
+ tables = await extract_tables(path, self.config.gmft_config)
105
+ result.tables = tables
82
106
  except ImportError: # pragma: no cover
83
107
  result.tables = []
84
108
 
@@ -91,25 +115,30 @@ class PDFExtractor(Extractor):
91
115
  f"{table_summary['total_rows']} total rows",
92
116
  }
93
117
 
118
+ if self.config.extract_images and document:
119
+ images = await self._extract_images_from_playa(document)
120
+ images = self._check_image_memory_limits(images)
121
+ result.images = images
122
+ if self.config.ocr_extracted_images:
123
+ image_ocr_results = await self._process_images_with_ocr(result.images)
124
+ result.image_ocr_results = image_ocr_results
125
+
94
126
  return self._apply_quality_processing(result)
95
127
 
96
128
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
97
- fd, temp_path = tempfile.mkstemp(suffix=".pdf")
98
- try:
99
- with os.fdopen(fd, "wb") as f:
100
- f.write(content)
101
-
102
- result = self.extract_path_sync(Path(temp_path))
103
-
129
+ with temporary_file_sync(".pdf", content) as temp_path:
130
+ result = self.extract_path_sync(temp_path)
104
131
  metadata = self._extract_metadata_with_password_attempts_sync(content)
105
132
  result.metadata = metadata
106
-
107
133
  return result
108
- finally:
109
- with contextlib.suppress(OSError):
110
- Path(temp_path).unlink()
111
134
 
112
135
  def extract_path_sync(self, path: Path) -> ExtractionResult:
136
+ content_bytes = path.read_bytes()
137
+
138
+ document: Document | None = None
139
+ if self.config.extract_images or self.config.extract_tables:
140
+ document = self._parse_with_password_attempts(content_bytes)
141
+
113
142
  try:
114
143
  text = self._extract_pdf_searchable_text_sync(path)
115
144
  except ParsingError:
@@ -137,8 +166,7 @@ class PDFExtractor(Extractor):
137
166
  content=text,
138
167
  mime_type=PLAIN_TEXT_MIME_TYPE,
139
168
  metadata={},
140
- tables=tables,
141
- chunks=[],
169
+ tables=list(tables),
142
170
  )
143
171
 
144
172
  if tables:
@@ -150,6 +178,14 @@ class PDFExtractor(Extractor):
150
178
  f"{table_summary['total_rows']} total rows",
151
179
  }
152
180
 
181
+ if self.config.extract_images and document:
182
+ images = self._extract_images_from_playa_sync(document)
183
+ images = self._check_image_memory_limits(images)
184
+ result.images = images
185
+ if self.config.ocr_extracted_images:
186
+ image_ocr_results: list[ImageOCRResult] = run_maybe_async(self._process_images_with_ocr, result.images)
187
+ result.image_ocr_results = image_ocr_results
188
+
153
189
  return self._apply_quality_processing(result)
154
190
 
155
191
  def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
@@ -163,14 +199,95 @@ class PDFExtractor(Extractor):
163
199
 
164
200
  return (len(corruption_matches) / len(text)) < corruption_threshold
165
201
 
202
async def _extract_images_from_playa(self, doc: Document) -> list[ExtractedImage]:
    """Collect the images found on every page of a parsed playa document.

    Pages are numbered from 1. Images share one running index (also starting
    at 1) that continues across pages and is embedded in each generated
    filename as ``page_<page>_image_<index><suffix>``.

    Args:
        doc: The parsed document whose ``pages`` are scanned for ``images``.

    Returns:
        The successfully extracted images, in page/image order. An image
        whose extraction raises is logged as a warning and left out.
    """

    async def render_image(page_no: int, ordinal: int, image_obj: Any) -> ExtractedImage | None:
        # This coroutine contains no awaits of its own; all the work is the
        # synchronous write of the image stream into an in-memory buffer.
        try:
            extension, write_to = get_image_suffix_and_writer(image_obj.stream)

            sink = io.BytesIO()
            write_to(sink)

            return ExtractedImage(
                data=sink.getvalue(),
                # The suffix carries a leading separator; drop its first character.
                format=extension[1:],
                filename=f"page_{page_no}_image_{ordinal}{extension}",
                page_number=page_no,
                dimensions=image_obj.srcsize,
                colorspace=image_obj.colorspace.name if image_obj.colorspace else None,
                bits_per_component=image_obj.bits,
                is_mask=image_obj.imagemask,
            )
        except Exception as exc:  # noqa: BLE001
            logger.warning("Failed to extract image on page %s: %s", page_no, exc)
            return None

    ordinals = count(1)
    pending = [
        render_image(page_no, next(ordinals), image_obj)
        for page_no, page in enumerate(doc.pages, 1)
        for image_obj in page.images
    ]

    if not pending:
        return []

    # gather() returns results in the order the coroutines were passed in.
    extracted = await asyncio.gather(*pending)
    return [image for image in extracted if image is not None]
238
+
239
def _extract_images_from_playa_sync(self, doc: Document) -> list[ExtractedImage]:
    """Collect the images found on every page of a parsed playa document.

    Synchronous counterpart of ``_extract_images_from_playa``. Pages are
    numbered from 1 and images share one running index (starting at 1) that
    continues across pages and appears in each generated filename as
    ``page_<page>_image_<index><suffix>``. Individual images are written out
    on a thread pool capped at ``PDF_MAX_WORKERS`` workers.

    Args:
        doc: The parsed document whose ``pages`` are scanned for ``images``.

    Returns:
        The successfully extracted images ordered by their running index.
        An image whose extraction raises is logged as a warning and left out.
    """

    def extract_single_image(page_num: int, img_index: int, img_obj: Any) -> ExtractedImage | None:
        try:
            suffix, writer = get_image_suffix_and_writer(img_obj.stream)

            buffer = io.BytesIO()
            writer(buffer)

            return ExtractedImage(
                data=buffer.getvalue(),
                # The suffix carries a leading separator; drop its first character.
                format=suffix[1:],
                filename=f"page_{page_num}_image_{img_index}{suffix}",
                page_number=page_num,
                dimensions=img_obj.srcsize,
                colorspace=img_obj.colorspace.name if img_obj.colorspace else None,
                bits_per_component=img_obj.bits,
                is_mask=img_obj.imagemask,
            )
        except Exception as e:  # noqa: BLE001
            logger.warning("Failed to extract image on page %s: %s", page_num, e)
            return None

    img_counter = count(1)
    jobs = [
        (page_num, next(img_counter), img_obj)
        for page_num, page in enumerate(doc.pages, 1)
        for img_obj in page.images
    ]

    if not jobs:
        return []

    max_workers = min(PDF_MAX_WORKERS, len(jobs))
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # map() yields results in submission order, so the running image index
        # ordering is kept without re-parsing it out of the generated filenames.
        results = list(executor.map(lambda job: extract_single_image(*job), jobs))

    return [image for image in results if image is not None]
284
+
166
285
  async def _convert_pdf_to_images(self, input_file: Path) -> list[Image]:
167
- document: pypdfium2.PdfDocument | None = None
168
286
  last_error = None
169
287
 
170
- for attempt in range(3): # Try up to 3 times # ~keep
288
+ for attempt in range(PDF_MAX_RETRY_ATTEMPTS): # ~keep
171
289
  try:
172
- with pypdfium_file_lock(input_file):
173
- document = await run_sync(pypdfium2.PdfDocument, str(input_file))
290
+ async with pdf_document(input_file) as document:
174
291
  images = []
175
292
  for page in cast("pypdfium2.PdfDocument", document):
176
293
  width, height = page.get_size()
@@ -187,9 +304,12 @@ class PDFExtractor(Extractor):
187
304
  else:
188
305
  optimal_dpi = self.config.target_dpi
189
306
 
190
- scale = optimal_dpi / 72.0
307
+ scale = optimal_dpi / PDF_POINTS_PER_INCH
191
308
 
192
- images.append(page.render(scale=scale).to_pil())
309
+ bitmap = page.render(scale=scale)
310
+ image = bitmap.to_pil()
311
+ with pdf_resources_sync(bitmap):
312
+ images.append(image)
193
313
  return images
194
314
  except pypdfium2.PdfiumError as e: # noqa: PERF203
195
315
  last_error = e
@@ -204,11 +324,7 @@ class PDFExtractor(Extractor):
204
324
  ),
205
325
  ) from e
206
326
  # Wait before retry with exponential backoff # ~keep
207
- await anyio.sleep(0.5 * (attempt + 1))
208
- finally:
209
- if document:
210
- with pypdfium_file_lock(input_file), contextlib.suppress(Exception):
211
- await run_sync(document.close)
327
+ await anyio.sleep(PDF_RETRY_DELAY_BASE * (attempt + 1))
212
328
 
213
329
  # All retries failed # ~keep
214
330
  raise ParsingError(
@@ -217,7 +333,7 @@ class PDFExtractor(Extractor):
217
333
  operation="convert_pdf_to_images",
218
334
  file_path=input_file,
219
335
  error=last_error,
220
- attempts=3,
336
+ attempts=PDF_MAX_RETRY_ATTEMPTS,
221
337
  ),
222
338
  ) from last_error
223
339
 
@@ -230,14 +346,12 @@ class PDFExtractor(Extractor):
230
346
  )
231
347
  content = "\n".join(result.content for result in ocr_results)
232
348
 
233
- return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
349
+ return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
234
350
 
235
351
  @staticmethod
236
352
  async def _extract_pdf_searchable_text(input_file: Path) -> str:
237
- document: pypdfium2.PdfDocument | None = None
238
353
  try:
239
- with pypdfium_file_lock(input_file):
240
- document = await run_sync(pypdfium2.PdfDocument, str(input_file))
354
+ async with pdf_document(input_file) as document:
241
355
  pages_content = []
242
356
  page_errors = []
243
357
 
@@ -246,6 +360,8 @@ class PDFExtractor(Extractor):
246
360
  text_page = page.get_textpage()
247
361
  page_content = text_page.get_text_bounded()
248
362
  pages_content.append(page_content)
363
+ with pdf_resources_sync(text_page):
364
+ pass
249
365
  except Exception as e: # noqa: PERF203, BLE001
250
366
  page_errors.append({"page": i + 1, "error": str(e)})
251
367
  pages_content.append(f"[Error extracting page {i + 1}]")
@@ -275,52 +391,67 @@ class PDFExtractor(Extractor):
275
391
  error=e,
276
392
  ),
277
393
  ) from e
278
- finally:
279
- if document:
280
- with pypdfium_file_lock(input_file), contextlib.suppress(Exception):
281
- await run_sync(document.close)
282
394
 
283
395
  def _extract_pdf_searchable_text_sync(self, path: Path) -> str:
284
- pdf = None
285
396
  try:
286
- with pypdfium_file_lock(path):
287
- pdf = pypdfium2.PdfDocument(str(path))
397
+ with pdf_document_sync(path) as pdf:
288
398
  pages_text = []
289
399
  for page in pdf:
290
400
  text_page = page.get_textpage()
291
401
  text = text_page.get_text_bounded()
292
402
  pages_text.append(text)
293
- text_page.close()
294
- page.close()
403
+ with pdf_resources_sync(text_page, page):
404
+ pass
295
405
  return "\n".join(pages_text)
296
406
  except Exception as e:
297
407
  raise ParsingError(f"Failed to extract PDF text: {e}") from e
298
- finally:
299
- if pdf:
300
- with pypdfium_file_lock(path), contextlib.suppress(Exception):
301
- pdf.close()
302
408
 
303
409
  def _extract_pdf_with_ocr_sync(self, path: Path) -> str:
304
- pdf = None
410
+ temp_files: list[Path] = []
305
411
  try:
306
- images = []
307
- with pypdfium_file_lock(path):
308
- pdf = pypdfium2.PdfDocument(str(path))
412
+ with pdf_document_sync(path) as pdf:
309
413
  for page in pdf:
310
- bitmap = page.render(scale=200 / 72)
414
+ width, height = page.get_size()
415
+
416
+ if self.config.auto_adjust_dpi:
417
+ optimal_dpi = calculate_optimal_dpi(
418
+ page_width=width,
419
+ page_height=height,
420
+ target_dpi=self.config.target_dpi,
421
+ max_dimension=self.config.max_image_dimension,
422
+ min_dpi=self.config.min_dpi,
423
+ max_dpi=self.config.max_dpi,
424
+ )
425
+ else:
426
+ optimal_dpi = self.config.target_dpi
427
+
428
+ scale = optimal_dpi / PDF_POINTS_PER_INCH
429
+
430
+ bitmap = page.render(scale=scale)
311
431
  pil_image = bitmap.to_pil()
312
- images.append(pil_image)
313
- bitmap.close()
314
- page.close()
315
432
 
316
- return self._process_pdf_images_with_ocr_direct(images)
433
+ fd, tmp = tempfile.mkstemp(suffix=".png")
434
+ try:
435
+ os.close(fd)
436
+ tmp_path = Path(tmp)
437
+ pil_image.save(tmp_path)
438
+ temp_files.append(tmp_path)
439
+ except Exception:
440
+ with contextlib.suppress(OSError):
441
+ os.close(fd)
442
+ raise
443
+ finally:
444
+ with pdf_resources_sync(bitmap, page):
445
+ pil_image.close()
446
+
447
+ return self._process_pdf_images_with_ocr([str(p) for p in temp_files])
317
448
 
318
449
  except Exception as e:
319
450
  raise ParsingError(f"Failed to OCR PDF: {e}") from e
320
451
  finally:
321
- if pdf:
322
- with pypdfium_file_lock(path), contextlib.suppress(Exception):
323
- pdf.close()
452
+ for p in temp_files:
453
+ with contextlib.suppress(OSError):
454
+ p.unlink()
324
455
 
325
456
  def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
326
457
  backend = get_ocr_backend(self.config.ocr_backend)
@@ -348,35 +479,12 @@ class PDFExtractor(Extractor):
348
479
  return "\n\n".join(result.content for result in results)
349
480
 
350
481
  def _process_pdf_images_with_ocr_direct(self, images: list[Image]) -> str:
482
+ if not self.config.ocr_backend:
483
+ raise ValueError("OCR backend must be specified")
351
484
  backend = get_ocr_backend(self.config.ocr_backend)
485
+ config = self._prepare_ocr_config(self.config.ocr_backend)
352
486
 
353
- match self.config.ocr_backend:
354
- case "tesseract":
355
- config = (
356
- self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
357
- )
358
- results = []
359
- for image in images:
360
- result = backend.process_image_sync(image, **asdict(config))
361
- results.append(result)
362
- case "paddleocr":
363
- paddle_config = (
364
- self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
365
- )
366
- results = []
367
- for image in images:
368
- result = backend.process_image_sync(image, **asdict(paddle_config))
369
- results.append(result)
370
- case "easyocr":
371
- easy_config = (
372
- self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
373
- )
374
- results = []
375
- for image in images:
376
- result = backend.process_image_sync(image, **asdict(easy_config))
377
- results.append(result)
378
- case _:
379
- raise NotImplementedError(f"Direct image OCR not implemented for {self.config.ocr_backend}")
487
+ results = [backend.process_image_sync(image, **config) for image in images]
380
488
 
381
489
  return "\n\n".join(result.content for result in results)
382
490
 
@@ -390,9 +498,11 @@ class PDFExtractor(Extractor):
390
498
  for password in passwords:
391
499
  try:
392
500
  return parse(content, max_workers=1, password=password)
393
- except Exception as e: # noqa: PERF203, BLE001
501
+ except (ValueError, TypeError, KeyError, RuntimeError) as e: # noqa: PERF203
394
502
  last_exception = e
395
503
  continue
504
+ except OSError as e:
505
+ raise ParsingError(f"Failed to parse PDF: {e}") from e
396
506
 
397
507
  if last_exception:
398
508
  raise last_exception from None
@@ -411,7 +521,7 @@ class PDFExtractor(Extractor):
411
521
  for password in passwords:
412
522
  try:
413
523
  return await extract_pdf_metadata(content, password=password)
414
- except Exception as e: # noqa: PERF203, BLE001
524
+ except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203
415
525
  last_exception = e
416
526
  continue
417
527
 
@@ -429,7 +539,7 @@ class PDFExtractor(Extractor):
429
539
  for password in passwords:
430
540
  try:
431
541
  return extract_pdf_metadata_sync(content, password=password)
432
- except Exception as e: # noqa: PERF203, BLE001
542
+ except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203
433
543
  last_exception = e
434
544
  continue
435
545
 
@@ -1,11 +1,12 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import logging
3
4
  import re
4
5
  from contextlib import suppress
5
6
  from html import escape
6
7
  from io import BytesIO
7
8
  from pathlib import Path
8
- from typing import TYPE_CHECKING, ClassVar
9
+ from typing import TYPE_CHECKING, Any, ClassVar
9
10
 
10
11
  import pptx
11
12
  from anyio import Path as AsyncPath
@@ -13,8 +14,9 @@ from pptx.enum.shapes import MSO_SHAPE_TYPE
13
14
 
14
15
  from kreuzberg._extractors._base import Extractor
15
16
  from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, POWER_POINT_MIME_TYPE
16
- from kreuzberg._types import ExtractionResult
17
+ from kreuzberg._types import ExtractedImage, ExtractionResult, ImageOCRResult
17
18
  from kreuzberg._utils._string import normalize_spaces
19
+ from kreuzberg._utils._sync import run_maybe_async
18
20
 
19
21
  if TYPE_CHECKING: # pragma: no cover
20
22
  from pptx.presentation import Presentation
@@ -23,23 +25,41 @@ if TYPE_CHECKING: # pragma: no cover
23
25
 
24
26
  _NON_WORD_PATTERN = re.compile(r"\W")
25
27
 
28
+ logger = logging.getLogger(__name__)
29
+
26
30
 
27
31
  class PresentationExtractor(Extractor):
28
32
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {POWER_POINT_MIME_TYPE}
29
33
 
30
34
  async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
31
- return self._extract_pptx(content)
35
+ result = self._extract_pptx(content)
36
+ if self.config.extract_images and self.config.ocr_extracted_images and result.images:
37
+ image_ocr_results = await self._process_images_with_ocr(result.images)
38
+ result.image_ocr_results = image_ocr_results
39
+ return result
32
40
 
33
41
  async def extract_path_async(self, path: Path) -> ExtractionResult:
34
42
  content = await AsyncPath(path).read_bytes()
35
- return self._extract_pptx(content)
43
+ result = self._extract_pptx(content)
44
+ if self.config.extract_images and self.config.ocr_extracted_images and result.images:
45
+ image_ocr_results = await self._process_images_with_ocr(result.images)
46
+ result.image_ocr_results = image_ocr_results
47
+ return result
36
48
 
37
49
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
38
- return self._extract_pptx(content)
50
+ result = self._extract_pptx(content)
51
+ if self.config.extract_images and self.config.ocr_extracted_images and result.images:
52
+ image_ocr_results: list[ImageOCRResult] = run_maybe_async(self._process_images_with_ocr, result.images)
53
+ result.image_ocr_results = image_ocr_results
54
+ return result
39
55
 
40
56
  def extract_path_sync(self, path: Path) -> ExtractionResult:
41
57
  content = Path(path).read_bytes()
42
- return self._extract_pptx(content)
58
+ result = self._extract_pptx(content)
59
+ if self.config.extract_images and self.config.ocr_extracted_images and result.images:
60
+ image_ocr_results: list[ImageOCRResult] = run_maybe_async(self._process_images_with_ocr, result.images)
61
+ result.image_ocr_results = image_ocr_results
62
+ return result
43
63
 
44
64
  def _extract_pptx(self, file_contents: bytes) -> ExtractionResult:
45
65
  md_content = ""
@@ -63,8 +83,10 @@ class PresentationExtractor(Extractor):
63
83
  with suppress(AttributeError):
64
84
  alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") # noqa: SLF001
65
85
 
66
- filename = _NON_WORD_PATTERN.sub("", shape.name) + ".jpg"
67
- md_content += f"\n![{alt_text if alt_text else shape.name}]({filename})\n"
86
+ name_val = shape.name if isinstance(getattr(shape, "name", None), str) else "image"
87
+ filename = _NON_WORD_PATTERN.sub("", name_val) + ".jpg"
88
+ label = alt_text if alt_text else name_val
89
+ md_content += f"\n![{label}]({filename})\n"
68
90
 
69
91
  elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
70
92
  html_table = "<table>"
@@ -106,8 +128,50 @@ class PresentationExtractor(Extractor):
106
128
  chunks=[],
107
129
  )
108
130
 
131
+ if self.config.extract_images:
132
+ images = self._extract_images_from_pptx(presentation)
133
+ result.images = images
134
+
109
135
  return self._apply_quality_processing(result)
110
136
 
137
def _extract_images_from_pptx(self, presentation: Presentation) -> list[ExtractedImage]:
    """Collect picture shapes from every slide of a presentation.

    Slides are numbered from 1. Top-level pictures are named
    ``slide_<slide>_image_<n>.<ext>``, where ``n`` is one more than the number
    of images collected so far across all slides. Group shapes are handed to
    ``_extract_from_grouped_shapes`` together with that running count.

    Args:
        presentation: The presentation whose ``slides`` are walked.

    Returns:
        The extracted images in slide/shape order. A picture whose extraction
        raises is logged as a warning and left out.
    """
    collected: list[ExtractedImage] = []

    for slide_no, slide in enumerate(presentation.slides, 1):
        for shape in slide.shapes:
            kind = shape.shape_type

            if kind == MSO_SHAPE_TYPE.GROUP:
                nested = self._extract_from_grouped_shapes(shape, slide_no, len(collected))
                collected.extend(nested)
                continue

            if kind != MSO_SHAPE_TYPE.PICTURE:
                continue

            try:
                picture = shape.image
                name = f"slide_{slide_no}_image_{len(collected) + 1}.{picture.ext}"
                extracted = ExtractedImage(
                    data=picture.blob,
                    format=picture.ext,
                    filename=name,
                    page_number=slide_no,
                )
                collected.append(extracted)
            except Exception as exc:  # noqa: BLE001
                logger.warning("Failed to extract image from slide %s: %s", slide_no, exc)

    return collected
158
+
159
def _extract_from_grouped_shapes(self, group_shape: Any, slide_num: int, image_count: int) -> list[ExtractedImage]:
    """Collect picture shapes nested inside a group shape, recursing into sub-groups.

    Args:
        group_shape: The group whose ``shapes`` are walked.
        slide_num: Slide number recorded on each image and used in its filename.
        image_count: Number of images already collected by the caller; used as
            the offset for the number embedded in each filename.

    Returns:
        The images found in this group and its nested groups, named
        ``slide_<slide>_group_image_<n>.<ext>``. A picture whose extraction
        raises is logged as a warning and left out.
    """
    found: list[ExtractedImage] = []

    for member in group_shape.shapes:
        member_type = member.shape_type

        if member_type == MSO_SHAPE_TYPE.GROUP:
            # Pass the running total down so nested numbering continues from here.
            deeper = self._extract_from_grouped_shapes(member, slide_num, image_count + len(found))
            found.extend(deeper)
        elif member_type == MSO_SHAPE_TYPE.PICTURE:
            try:
                picture = member.image
                name = f"slide_{slide_num}_group_image_{image_count + len(found) + 1}.{picture.ext}"
                found.append(
                    ExtractedImage(data=picture.blob, format=picture.ext, filename=name, page_number=slide_num)
                )
            except Exception as exc:  # noqa: BLE001
                logger.warning("Failed to extract grouped image: %s", exc)

    return found
174
+
111
175
  @staticmethod
112
176
  def _extract_presentation_metadata(presentation: Presentation) -> Metadata:
113
177
  metadata: Metadata = {}