kreuzberg 3.6.2__py3-none-any.whl → 3.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. kreuzberg/_extractors/_base.py +40 -0
  2. kreuzberg/_extractors/_email.py +149 -0
  3. kreuzberg/_extractors/_html.py +15 -3
  4. kreuzberg/_extractors/_image.py +17 -18
  5. kreuzberg/_extractors/_pdf.py +68 -14
  6. kreuzberg/_extractors/_presentation.py +62 -10
  7. kreuzberg/_extractors/_spread_sheet.py +179 -4
  8. kreuzberg/_extractors/_structured.py +148 -0
  9. kreuzberg/_gmft.py +2 -2
  10. kreuzberg/_mcp/__init__.py +5 -0
  11. kreuzberg/_mcp/server.py +227 -0
  12. kreuzberg/_mime_types.py +27 -1
  13. kreuzberg/_multiprocessing/__init__.py +2 -3
  14. kreuzberg/_ocr/__init__.py +30 -0
  15. kreuzberg/{_multiprocessing/tesseract_pool.py → _ocr/_pool.py} +3 -5
  16. kreuzberg/_ocr/_sync.py +566 -0
  17. kreuzberg/_ocr/_tesseract.py +6 -2
  18. kreuzberg/_registry.py +4 -0
  19. kreuzberg/_types.py +131 -0
  20. kreuzberg/_utils/_cache.py +17 -2
  21. kreuzberg/_utils/_process_pool.py +178 -1
  22. kreuzberg/_utils/_quality.py +237 -0
  23. kreuzberg/_utils/_serialization.py +4 -2
  24. kreuzberg/_utils/_string.py +153 -10
  25. kreuzberg/_utils/_sync.py +5 -2
  26. kreuzberg/_utils/_table.py +261 -0
  27. {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/METADATA +116 -48
  28. kreuzberg-3.8.0.dist-info/RECORD +57 -0
  29. {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/entry_points.txt +1 -0
  30. kreuzberg/_multiprocessing/process_manager.py +0 -189
  31. kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
  32. kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
  33. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  34. kreuzberg-3.6.2.dist-info/RECORD +0 -54
  35. {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/WHEEL +0 -0
  36. {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/licenses/LICENSE +0 -0
@@ -90,3 +90,43 @@ class Extractor(ABC):
90
90
  return mime_type in cls.SUPPORTED_MIME_TYPES or any(
91
91
  mime_type.startswith(supported_type) for supported_type in cls.SUPPORTED_MIME_TYPES
92
92
  )
93
+
94
+ def _apply_quality_processing(self, result: ExtractionResult) -> ExtractionResult:
95
+ """Apply quality post-processing to extraction result if enabled.
96
+
97
+ Args:
98
+ result: The raw extraction result
99
+
100
+ Returns:
101
+ Enhanced extraction result with quality improvements (if enabled)
102
+ """
103
+ # Only apply quality processing if enabled in config
104
+ if not self.config.enable_quality_processing:
105
+ return result
106
+
107
+ from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text
108
+
109
+ if not result.content:
110
+ return result
111
+
112
+ # Clean the content
113
+ cleaned_content = clean_extracted_text(result.content)
114
+
115
+ # Calculate quality score
116
+ quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
117
+
118
+ # Add quality metadata
119
+ enhanced_metadata = dict(result.metadata) if result.metadata else {}
120
+ enhanced_metadata["quality_score"] = quality_score
121
+
122
+ # Return enhanced result
123
+ from kreuzberg._types import ExtractionResult, normalize_metadata
124
+
125
+ return ExtractionResult(
126
+ content=cleaned_content,
127
+ mime_type=result.mime_type,
128
+ metadata=normalize_metadata(enhanced_metadata),
129
+ chunks=result.chunks,
130
+ detected_languages=result.detected_languages,
131
+ tables=result.tables,
132
+ )
@@ -0,0 +1,149 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from html import unescape
5
+ from typing import TYPE_CHECKING, Any, ClassVar
6
+
7
+ from anyio import Path as AsyncPath
8
+
9
+ from kreuzberg._extractors._base import Extractor
10
+ from kreuzberg._mime_types import EML_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
11
+ from kreuzberg._types import ExtractionResult, normalize_metadata
12
+ from kreuzberg._utils._string import normalize_spaces
13
+ from kreuzberg._utils._sync import run_sync
14
+ from kreuzberg.exceptions import MissingDependencyError
15
+
16
+ if TYPE_CHECKING:
17
+ from pathlib import Path
18
+
19
+ # Import optional dependencies at module level with proper error handling
20
+ try:
21
+ import mailparse
22
+ except ImportError:
23
+ mailparse = None
24
+
25
+ try:
26
+ import html2text # type: ignore[import-not-found]
27
+ except ImportError:
28
+ html2text = None
29
+
30
+ # Compile regex pattern once at module level
31
+ _HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
32
+
33
+
34
class EmailExtractor(Extractor):
    """Extractor that renders an EML message as plain text plus header metadata.

    Parsing is delegated to the optional ``mailparse`` package. The output text is built
    from the headers, then the body (plain text preferred over HTML), then a line listing
    attachment names.
    """

    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {EML_MIME_TYPE}

    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
        """Delegate to :meth:`extract_bytes_sync` through ``run_sync``."""
        return await run_sync(self.extract_bytes_sync, content)

    async def extract_path_async(self, path: Path) -> ExtractionResult:
        """Read the file at ``path`` asynchronously and extract it from its bytes."""
        content = await AsyncPath(path).read_bytes()
        return await self.extract_bytes_async(content)

    def _extract_email_headers(
        self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
    ) -> None:
        """Copy the known headers into ``metadata`` and append a text line for each.

        Args:
            parsed_email: The decoded message mapping.
            text_parts: Output list of text fragments; mutated in place.
            metadata: Output metadata mapping; mutated in place.
        """
        subject = parsed_email.get("subject")
        if subject:
            metadata["subject"] = subject
            text_parts.append(f"Subject: {subject}")

        from_info = parsed_email.get("from")
        if from_info:
            # The sender may be a mapping with an "email" key or any other value, which is stringified.
            from_email = from_info.get("email", "") if isinstance(from_info, dict) else str(from_info)
            metadata["email_from"] = from_email
            text_parts.append(f"From: {from_email}")

        to_info = parsed_email.get("to")
        if to_info:
            # Only the first recipient is recorded when a list is given.
            if isinstance(to_info, list) and to_info:
                to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
            elif isinstance(to_info, dict):
                to_email = to_info.get("email", "")
            else:
                to_email = str(to_info)
            metadata["email_to"] = to_email
            text_parts.append(f"To: {to_email}")

        date = parsed_email.get("date")
        if date:
            metadata["date"] = date
            text_parts.append(f"Date: {date}")

        cc = parsed_email.get("cc")
        if cc:
            metadata["email_cc"] = cc
            text_parts.append(f"CC: {cc}")

        bcc = parsed_email.get("bcc")
        if bcc:
            metadata["email_bcc"] = bcc
            text_parts.append(f"BCC: {bcc}")

    def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
        """Append the message body to ``text_parts``, preferring plain text over HTML.

        Args:
            parsed_email: The decoded message mapping.
            text_parts: Output list of text fragments; mutated in place.
        """
        text_content = parsed_email.get("text")
        if text_content:
            text_parts.append(f"\n{text_content}")
            return  # If we have text, prefer it over HTML

        html_content = parsed_email.get("html")
        if not html_content:
            return

        if html2text is not None:
            # html2text is optional; when present it converts the HTML, dropping links and images.
            h = html2text.HTML2Text()
            h.ignore_links = True
            h.ignore_images = True
            converted_text = h.handle(html_content)
            text_parts.append(f"\n{converted_text}")
        else:
            # Fallback: strip HTML tags and unescape entities
            clean_html = _HTML_TAG_PATTERN.sub("", html_content)
            clean_html = unescape(clean_html)
            text_parts.append(f"\n{clean_html}")

    def _extract_email_attachments(
        self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
    ) -> None:
        """Record attachment names in ``metadata`` and append an "Attachments:" line.

        Args:
            parsed_email: The decoded message mapping.
            text_parts: Output list of text fragments; mutated in place.
            metadata: Output metadata mapping; mutated in place.
        """
        attachments = parsed_email.get("attachments")
        if not attachments:
            return

        # ``dict.get`` only uses its default when the key is absent. An attachment whose
        # name is present but None (or empty) would otherwise make ``str.join`` raise and
        # fail the whole extraction, so fall back with ``or`` instead.
        attachment_names = [att.get("name") or "unknown" for att in attachments]
        metadata["attachments"] = attachment_names
        if attachment_names:
            text_parts.append(f"\nAttachments: {', '.join(attachment_names)}")

    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        """Parse raw EML bytes and build the plain-text extraction result.

        Args:
            content: The raw bytes of the email message.

        Returns:
            An ``ExtractionResult`` with plain-text MIME type, the space-normalized text and
            the normalized header/attachment metadata.

        Raises:
            MissingDependencyError: If ``mailparse`` could not be imported.
            RuntimeError: If decoding or assembling the message fails for any reason;
                the original exception is chained as the cause.
        """
        if mailparse is None:
            msg = "mailparse is required for email extraction. Install with: pip install 'kreuzberg[additional-extensions]'"
            raise MissingDependencyError(msg)

        try:
            parsed_email = mailparse.EmailDecode.load(content)
            text_parts: list[str] = []
            metadata: dict[str, Any] = {}

            # Extract headers, body, and attachments
            self._extract_email_headers(parsed_email, text_parts, metadata)
            self._extract_email_body(parsed_email, text_parts)
            self._extract_email_attachments(parsed_email, text_parts, metadata)

            combined_text = "\n".join(text_parts)

            return ExtractionResult(
                content=normalize_spaces(combined_text),
                mime_type=PLAIN_TEXT_MIME_TYPE,
                metadata=normalize_metadata(metadata),
                chunks=[],
            )

        except Exception as e:
            msg = f"Failed to parse email content: {e}"
            raise RuntimeError(msg) from e

    def extract_path_sync(self, path: Path) -> ExtractionResult:
        """Read the file at ``path`` and extract it from its bytes."""
        content = path.read_bytes()
        return self.extract_bytes_sync(content)
@@ -8,7 +8,7 @@ from anyio import Path as AsyncPath
8
8
  from kreuzberg._extractors._base import Extractor
9
9
  from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
10
10
  from kreuzberg._types import ExtractionResult
11
- from kreuzberg._utils._string import normalize_spaces, safe_decode
11
+ from kreuzberg._utils._string import safe_decode
12
12
  from kreuzberg._utils._sync import run_sync
13
13
 
14
14
  if TYPE_CHECKING:
@@ -26,8 +26,20 @@ class HTMLExtractor(Extractor):
26
26
  return await run_sync(self.extract_bytes_sync, content)
27
27
 
28
28
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
29
- result = html_to_markdown.convert_to_markdown(safe_decode(content))
30
- return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
29
+ # Use html-to-markdown with script/nav removal for better quality
30
+ result = html_to_markdown.convert_to_markdown(
31
+ safe_decode(content),
32
+ preprocess_html=True,
33
+ preprocessing_preset="aggressive",
34
+ remove_navigation=True,
35
+ remove_forms=True,
36
+ )
37
+
38
+ # Skip normalize_spaces since quality processing will handle whitespace
39
+ extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
40
+
41
+ # Apply quality processing which includes normalization
42
+ return self._apply_quality_processing(extraction_result)
31
43
 
32
44
  def extract_path_sync(self, path: Path) -> ExtractionResult:
33
45
  content = path.read_bytes()
@@ -1,5 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import contextlib
4
+ import os
5
+ import tempfile
6
+ from pathlib import Path
3
7
  from typing import TYPE_CHECKING, ClassVar
4
8
 
5
9
  from anyio import Path as AsyncPath
@@ -7,17 +11,13 @@ from anyio import Path as AsyncPath
7
11
  from kreuzberg._extractors._base import Extractor
8
12
  from kreuzberg._mime_types import IMAGE_MIME_TYPES
9
13
  from kreuzberg._ocr import get_ocr_backend
14
+ from kreuzberg._types import ExtractionResult
10
15
  from kreuzberg._utils._tmp import create_temp_file
11
16
  from kreuzberg.exceptions import ValidationError
12
17
 
13
18
  if TYPE_CHECKING: # pragma: no cover
14
19
  from collections.abc import Mapping
15
20
 
16
- from kreuzberg._types import ExtractionResult
17
-
18
- import contextlib
19
- from pathlib import Path
20
-
21
21
 
22
22
  class ImageExtractor(Extractor):
23
23
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = IMAGE_MIME_TYPES
@@ -56,13 +56,11 @@ class ImageExtractor(Extractor):
56
56
  if self.config.ocr_backend is None:
57
57
  raise ValidationError("ocr_backend is None, cannot perform OCR")
58
58
 
59
- return await get_ocr_backend(self.config.ocr_backend).process_file(path, **self.config.get_config_dict())
59
+ result = await get_ocr_backend(self.config.ocr_backend).process_file(path, **self.config.get_config_dict())
60
+ return self._apply_quality_processing(result)
60
61
 
61
62
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
62
63
  """Pure sync implementation of extract_bytes."""
63
- import os
64
- import tempfile
65
-
66
64
  extension = self._get_extension_from_mime_type(self.mime_type)
67
65
  fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
68
66
 
@@ -80,10 +78,8 @@ class ImageExtractor(Extractor):
80
78
  if self.config.ocr_backend is None:
81
79
  raise ValidationError("ocr_backend is None, cannot perform OCR")
82
80
 
83
- from kreuzberg._types import ExtractionResult
84
-
85
81
  if self.config.ocr_backend == "tesseract":
86
- from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
82
+ from kreuzberg._ocr._sync import process_batch_images_sync
87
83
  from kreuzberg._ocr._tesseract import TesseractConfig
88
84
 
89
85
  if isinstance(self.config.ocr_config, TesseractConfig):
@@ -91,30 +87,33 @@ class ImageExtractor(Extractor):
91
87
  else:
92
88
  config = TesseractConfig()
93
89
 
94
- results = process_batch_images_sync_pure([str(path)], config)
90
+ results = process_batch_images_sync([str(path)], config, backend="tesseract")
95
91
  if results:
96
- return results[0]
92
+ result = results[0]
93
+ return self._apply_quality_processing(result)
97
94
  return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])
98
95
 
99
96
  if self.config.ocr_backend == "paddleocr":
100
- from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
101
97
  from kreuzberg._ocr._paddleocr import PaddleOCRConfig
98
+ from kreuzberg._ocr._sync import process_image_paddleocr_sync as paddle_process
102
99
 
103
100
  paddle_config = (
104
101
  self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
105
102
  )
106
103
 
107
- return paddle_process(path, paddle_config)
104
+ result = paddle_process(path, paddle_config)
105
+ return self._apply_quality_processing(result)
108
106
 
109
107
  if self.config.ocr_backend == "easyocr":
110
- from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
111
108
  from kreuzberg._ocr._easyocr import EasyOCRConfig
109
+ from kreuzberg._ocr._sync import process_image_easyocr_sync as easy_process
112
110
 
113
111
  easy_config = (
114
112
  self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
115
113
  )
116
114
 
117
- return easy_process(path, easy_config)
115
+ result = easy_process(path, easy_config)
116
+ return self._apply_quality_processing(result)
118
117
 
119
118
  raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
120
119
 
@@ -10,15 +10,17 @@ from typing import TYPE_CHECKING, ClassVar, cast
10
10
  import anyio
11
11
  import pypdfium2
12
12
  from anyio import Path as AsyncPath
13
+ from playa import parse
13
14
 
14
15
  from kreuzberg._extractors._base import Extractor
15
16
  from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
16
17
  from kreuzberg._ocr import get_ocr_backend
17
- from kreuzberg._playa import extract_pdf_metadata
18
+ from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
18
19
  from kreuzberg._types import ExtractionResult, OcrBackendType
19
20
  from kreuzberg._utils._pdf_lock import pypdfium_file_lock
20
21
  from kreuzberg._utils._string import normalize_spaces
21
22
  from kreuzberg._utils._sync import run_sync, run_taskgroup_batched
23
+ from kreuzberg._utils._table import generate_table_summary
22
24
  from kreuzberg._utils._tmp import create_temp_file
23
25
  from kreuzberg.exceptions import ParsingError
24
26
 
@@ -63,11 +65,27 @@ class PDFExtractor(Extractor):
63
65
  result.metadata = await extract_pdf_metadata(content_bytes)
64
66
 
65
67
  if self.config.extract_tables:
66
- from kreuzberg._gmft import extract_tables
67
-
68
- result.tables = await extract_tables(path, self.config.gmft_config)
68
+ # GMFT is optional dependency
69
+ try:
70
+ from kreuzberg._gmft import extract_tables
69
71
 
70
- return result
72
+ result.tables = await extract_tables(path, self.config.gmft_config)
73
+ except ImportError:
74
+ result.tables = []
75
+
76
+ # Enhance metadata with table information
77
+ if result.tables:
78
+ table_summary = generate_table_summary(result.tables)
79
+ result.metadata.update(
80
+ {
81
+ "table_count": table_summary["table_count"],
82
+ "tables_summary": f"Document contains {table_summary['table_count']} tables "
83
+ f"across {table_summary['pages_with_tables']} pages with "
84
+ f"{table_summary['total_rows']} total rows",
85
+ }
86
+ )
87
+
88
+ return self._apply_quality_processing(result)
71
89
 
72
90
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
73
91
  """Pure sync implementation of PDF extraction from bytes."""
@@ -81,8 +99,6 @@ class PDFExtractor(Extractor):
81
99
 
82
100
  result = self.extract_path_sync(Path(temp_path))
83
101
 
84
- from kreuzberg._playa import extract_pdf_metadata_sync
85
-
86
102
  metadata = extract_pdf_metadata_sync(content)
87
103
  result.metadata = metadata
88
104
 
@@ -100,16 +116,21 @@ class PDFExtractor(Extractor):
100
116
 
101
117
  tables = []
102
118
  if self.config.extract_tables:
119
+ # GMFT is optional dependency
103
120
  try:
104
121
  from kreuzberg._gmft import extract_tables_sync
105
122
 
106
123
  tables = extract_tables_sync(path)
107
124
  except ImportError:
108
- pass
125
+ tables = []
126
+
127
+ # Use playa for better text structure preservation when not using OCR
128
+ if not self.config.force_ocr and self._validate_extracted_text(text):
129
+ text = self._extract_with_playa_sync(path, fallback_text=text)
109
130
 
110
131
  text = normalize_spaces(text)
111
132
 
112
- return ExtractionResult(
133
+ result = ExtractionResult(
113
134
  content=text,
114
135
  mime_type=PLAIN_TEXT_MIME_TYPE,
115
136
  metadata={},
@@ -117,6 +138,21 @@ class PDFExtractor(Extractor):
117
138
  chunks=[],
118
139
  )
119
140
 
141
+ # Enhance metadata with table information
142
+ if tables:
143
+ table_summary = generate_table_summary(tables)
144
+ result.metadata.update(
145
+ {
146
+ "table_count": table_summary["table_count"],
147
+ "tables_summary": f"Document contains {table_summary['table_count']} tables "
148
+ f"across {table_summary['pages_with_tables']} pages with "
149
+ f"{table_summary['total_rows']} total rows",
150
+ }
151
+ )
152
+
153
+ # Apply quality processing
154
+ return self._apply_quality_processing(result)
155
+
120
156
  def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
121
157
  """Check if text extracted from PDF is valid or corrupted.
122
158
 
@@ -283,7 +319,7 @@ class PDFExtractor(Extractor):
283
319
  text_parts = []
284
320
  for page in pdf:
285
321
  text_page = page.get_textpage()
286
- text = text_page.get_text_range()
322
+ text = text_page.get_text_bounded()
287
323
  text_parts.append(text)
288
324
  text_page.close()
289
325
  page.close()
@@ -340,19 +376,19 @@ class PDFExtractor(Extractor):
340
376
  def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
341
377
  """Process PDF images with the configured OCR backend."""
342
378
  if self.config.ocr_backend == "tesseract":
343
- from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
379
+ from kreuzberg._ocr._sync import process_batch_images_sync
344
380
  from kreuzberg._ocr._tesseract import TesseractConfig
345
381
 
346
382
  tesseract_config = (
347
383
  self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
348
384
  )
349
- results = process_batch_images_sync_pure([str(p) for p in image_paths], tesseract_config)
385
+ results = process_batch_images_sync([str(p) for p in image_paths], tesseract_config, backend="tesseract")
350
386
  text_parts = [r.content for r in results]
351
387
  return "\n\n".join(text_parts)
352
388
 
353
389
  if self.config.ocr_backend == "paddleocr":
354
- from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
355
390
  from kreuzberg._ocr._paddleocr import PaddleOCRConfig
391
+ from kreuzberg._ocr._sync import process_image_paddleocr_sync as paddle_process
356
392
 
357
393
  paddle_config = (
358
394
  self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
@@ -365,8 +401,8 @@ class PDFExtractor(Extractor):
365
401
  return "\n\n".join(text_parts)
366
402
 
367
403
  if self.config.ocr_backend == "easyocr":
368
- from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
369
404
  from kreuzberg._ocr._easyocr import EasyOCRConfig
405
+ from kreuzberg._ocr._sync import process_image_easyocr_sync as easy_process
370
406
 
371
407
  easy_config = (
372
408
  self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
@@ -379,3 +415,21 @@ class PDFExtractor(Extractor):
379
415
  return "\n\n".join(text_parts)
380
416
 
381
417
  raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
418
+
419
+ def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
420
+ """Extract text using playa for better structure preservation."""
421
+ with contextlib.suppress(Exception):
422
+ content = path.read_bytes()
423
+ document = parse(content, max_workers=1)
424
+
425
+ text_parts = []
426
+ for page in document.pages:
427
+ # Extract text while preserving structure
428
+ page_text = page.extract_text()
429
+ if page_text and page_text.strip():
430
+ text_parts.append(page_text)
431
+
432
+ if text_parts:
433
+ return "\n\n".join(text_parts)
434
+
435
+ return fallback_text
@@ -30,6 +30,9 @@ if TYPE_CHECKING: # pragma: no cover
30
30
 
31
31
  from kreuzberg._types import Metadata
32
32
 
33
+ # Pre-compiled regex patterns for performance
34
+ _NON_WORD_PATTERN = re.compile(r"\W")
35
+
33
36
 
34
37
  class PresentationExtractor(Extractor):
35
38
  """Extractor for PowerPoint (.pptx) files.
@@ -141,7 +144,7 @@ class PresentationExtractor(Extractor):
141
144
  with suppress(AttributeError):
142
145
  alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") # noqa: SLF001
143
146
 
144
- filename = re.sub(r"\W", "", shape.name) + ".jpg"
147
+ filename = _NON_WORD_PATTERN.sub("", shape.name) + ".jpg"
145
148
  md_content += f"\n![{alt_text if alt_text else shape.name}]({filename})\n"
146
149
 
147
150
  elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
@@ -162,7 +165,10 @@ class PresentationExtractor(Extractor):
162
165
  md_content += "\n" + html_table + "\n"
163
166
 
164
167
  elif shape.has_text_frame:
165
- md_content += "# " + shape.text.lstrip() + "\n" if shape == title else shape.text + "\n"
168
+ if shape == title:
169
+ md_content += "# " + shape.text.lstrip() + "\n"
170
+ else:
171
+ md_content += shape.text + "\n"
166
172
 
167
173
  md_content = md_content.strip()
168
174
  if slide.has_notes_slide:
@@ -174,13 +180,15 @@ class PresentationExtractor(Extractor):
174
180
 
175
181
  md_content = md_content.strip()
176
182
 
177
- return ExtractionResult(
183
+ result = ExtractionResult(
178
184
  content=normalize_spaces(md_content),
179
185
  mime_type=MARKDOWN_MIME_TYPE,
180
186
  metadata=self._extract_presentation_metadata(presentation),
181
187
  chunks=[],
182
188
  )
183
189
 
190
+ return self._apply_quality_processing(result)
191
+
184
192
  @staticmethod
185
193
  def _extract_presentation_metadata(presentation: Presentation) -> Metadata:
186
194
  """Extract metadata from a presentation instance.
@@ -193,7 +201,24 @@ class PresentationExtractor(Extractor):
193
201
  """
194
202
  metadata: Metadata = {}
195
203
 
196
- for metadata_key, core_property_key in [
204
+ # Extract core properties
205
+ PresentationExtractor._extract_core_properties(presentation, metadata)
206
+
207
+ # Extract fonts used in presentation
208
+ fonts = PresentationExtractor._extract_fonts(presentation)
209
+ if fonts:
210
+ metadata["fonts"] = list(fonts)
211
+
212
+ # Add structural information
213
+ PresentationExtractor._add_presentation_structure_info(presentation, metadata, fonts)
214
+
215
+ return metadata
216
+
217
+ @staticmethod
218
+ def _extract_core_properties(presentation: Presentation, metadata: Metadata) -> None:
219
+ """Extract core document properties from presentation."""
220
+ # Property mapping for core metadata
221
+ property_mapping = [
197
222
  ("authors", "author"),
198
223
  ("comments", "comments"),
199
224
  ("status", "content_status"),
@@ -205,17 +230,22 @@ class PresentationExtractor(Extractor):
205
230
  ("version", "revision"),
206
231
  ("subject", "subject"),
207
232
  ("title", "title"),
208
- ("version", "version"),
209
- ]:
233
+ ]
234
+
235
+ for metadata_key, core_property_key in property_mapping:
210
236
  if core_property := getattr(presentation.core_properties, core_property_key, None):
211
237
  metadata[metadata_key] = core_property # type: ignore[literal-required]
212
238
 
239
+ # Handle special list properties
213
240
  if presentation.core_properties.language:
214
241
  metadata["languages"] = [presentation.core_properties.language]
215
242
 
216
243
  if presentation.core_properties.category:
217
244
  metadata["categories"] = [presentation.core_properties.category]
218
245
 
246
+ @staticmethod
247
+ def _extract_fonts(presentation: Presentation) -> set[str]:
248
+ """Extract all fonts used in the presentation."""
219
249
  fonts = set()
220
250
  for slide in presentation.slides:
221
251
  for shape in slide.shapes:
@@ -226,8 +256,30 @@ class PresentationExtractor(Extractor):
226
256
  for run in paragraph.runs:
227
257
  if hasattr(run, "font") and run.font.name:
228
258
  fonts.add(run.font.name)
259
+ return fonts
229
260
 
230
- if fonts:
231
- metadata["fonts"] = list(fonts)
232
-
233
- return metadata
261
+ @staticmethod
262
+ def _add_presentation_structure_info(presentation: Presentation, metadata: Metadata, fonts: set[str]) -> None:
263
+ """Add structural information about the presentation."""
264
+ slide_count = len(presentation.slides)
265
+ if slide_count == 0:
266
+ return
267
+
268
+ # Build description
269
+ structure_info = f"Presentation with {slide_count} slide{'s' if slide_count != 1 else ''}"
270
+
271
+ slides_with_notes = sum(1 for slide in presentation.slides if slide.has_notes_slide)
272
+ if slides_with_notes > 0:
273
+ structure_info += f", {slides_with_notes} with notes"
274
+
275
+ metadata["description"] = structure_info
276
+
277
+ # Build summary if not already present
278
+ if "summary" not in metadata:
279
+ summary_parts = [f"PowerPoint presentation with {slide_count} slides"]
280
+ if slides_with_notes > 0:
281
+ summary_parts.append(f"{slides_with_notes} slides have notes")
282
+ if fonts:
283
+ summary_parts.append(f"uses {len(fonts)} font{'s' if len(fonts) != 1 else ''}")
284
+
285
+ metadata["summary"] = f"{'. '.join(summary_parts)}."