kreuzberg 3.6.2__py3-none-any.whl → 3.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors/_base.py +40 -0
- kreuzberg/_extractors/_email.py +149 -0
- kreuzberg/_extractors/_html.py +15 -3
- kreuzberg/_extractors/_image.py +17 -18
- kreuzberg/_extractors/_pdf.py +68 -14
- kreuzberg/_extractors/_presentation.py +62 -10
- kreuzberg/_extractors/_spread_sheet.py +179 -4
- kreuzberg/_extractors/_structured.py +148 -0
- kreuzberg/_gmft.py +2 -2
- kreuzberg/_mcp/__init__.py +5 -0
- kreuzberg/_mcp/server.py +227 -0
- kreuzberg/_mime_types.py +27 -1
- kreuzberg/_multiprocessing/__init__.py +2 -3
- kreuzberg/_ocr/__init__.py +30 -0
- kreuzberg/{_multiprocessing/tesseract_pool.py → _ocr/_pool.py} +3 -5
- kreuzberg/_ocr/_sync.py +566 -0
- kreuzberg/_ocr/_tesseract.py +6 -2
- kreuzberg/_registry.py +4 -0
- kreuzberg/_types.py +131 -0
- kreuzberg/_utils/_cache.py +17 -2
- kreuzberg/_utils/_process_pool.py +178 -1
- kreuzberg/_utils/_quality.py +237 -0
- kreuzberg/_utils/_serialization.py +4 -2
- kreuzberg/_utils/_string.py +153 -10
- kreuzberg/_utils/_sync.py +5 -2
- kreuzberg/_utils/_table.py +261 -0
- {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/METADATA +116 -48
- kreuzberg-3.8.0.dist-info/RECORD +57 -0
- {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/entry_points.txt +1 -0
- kreuzberg/_multiprocessing/process_manager.py +0 -189
- kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
- kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
- kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
- kreuzberg-3.6.2.dist-info/RECORD +0 -54
- {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_base.py
CHANGED
@@ -90,3 +90,43 @@ class Extractor(ABC):
|
|
90
90
|
return mime_type in cls.SUPPORTED_MIME_TYPES or any(
|
91
91
|
mime_type.startswith(supported_type) for supported_type in cls.SUPPORTED_MIME_TYPES
|
92
92
|
)
|
93
|
+
|
94
|
+
def _apply_quality_processing(self, result: ExtractionResult) -> ExtractionResult:
|
95
|
+
"""Apply quality post-processing to extraction result if enabled.
|
96
|
+
|
97
|
+
Args:
|
98
|
+
result: The raw extraction result
|
99
|
+
|
100
|
+
Returns:
|
101
|
+
Enhanced extraction result with quality improvements (if enabled)
|
102
|
+
"""
|
103
|
+
# Only apply quality processing if enabled in config
|
104
|
+
if not self.config.enable_quality_processing:
|
105
|
+
return result
|
106
|
+
|
107
|
+
from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text
|
108
|
+
|
109
|
+
if not result.content:
|
110
|
+
return result
|
111
|
+
|
112
|
+
# Clean the content
|
113
|
+
cleaned_content = clean_extracted_text(result.content)
|
114
|
+
|
115
|
+
# Calculate quality score
|
116
|
+
quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
|
117
|
+
|
118
|
+
# Add quality metadata
|
119
|
+
enhanced_metadata = dict(result.metadata) if result.metadata else {}
|
120
|
+
enhanced_metadata["quality_score"] = quality_score
|
121
|
+
|
122
|
+
# Return enhanced result
|
123
|
+
from kreuzberg._types import ExtractionResult, normalize_metadata
|
124
|
+
|
125
|
+
return ExtractionResult(
|
126
|
+
content=cleaned_content,
|
127
|
+
mime_type=result.mime_type,
|
128
|
+
metadata=normalize_metadata(enhanced_metadata),
|
129
|
+
chunks=result.chunks,
|
130
|
+
detected_languages=result.detected_languages,
|
131
|
+
tables=result.tables,
|
132
|
+
)
|
@@ -0,0 +1,149 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import re
|
4
|
+
from html import unescape
|
5
|
+
from typing import TYPE_CHECKING, Any, ClassVar
|
6
|
+
|
7
|
+
from anyio import Path as AsyncPath
|
8
|
+
|
9
|
+
from kreuzberg._extractors._base import Extractor
|
10
|
+
from kreuzberg._mime_types import EML_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
|
11
|
+
from kreuzberg._types import ExtractionResult, normalize_metadata
|
12
|
+
from kreuzberg._utils._string import normalize_spaces
|
13
|
+
from kreuzberg._utils._sync import run_sync
|
14
|
+
from kreuzberg.exceptions import MissingDependencyError
|
15
|
+
|
16
|
+
if TYPE_CHECKING:
|
17
|
+
from pathlib import Path
|
18
|
+
|
19
|
+
# Import optional dependencies at module level with proper error handling
|
20
|
+
try:
|
21
|
+
import mailparse
|
22
|
+
except ImportError:
|
23
|
+
mailparse = None
|
24
|
+
|
25
|
+
try:
|
26
|
+
import html2text # type: ignore[import-not-found]
|
27
|
+
except ImportError:
|
28
|
+
html2text = None
|
29
|
+
|
30
|
+
# Compile regex pattern once at module level
|
31
|
+
_HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
|
32
|
+
|
33
|
+
|
34
|
+
class EmailExtractor(Extractor):
|
35
|
+
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {EML_MIME_TYPE}
|
36
|
+
|
37
|
+
async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
|
38
|
+
return await run_sync(self.extract_bytes_sync, content)
|
39
|
+
|
40
|
+
async def extract_path_async(self, path: Path) -> ExtractionResult:
|
41
|
+
content = await AsyncPath(path).read_bytes()
|
42
|
+
return await self.extract_bytes_async(content)
|
43
|
+
|
44
|
+
def _extract_email_headers(
|
45
|
+
self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
|
46
|
+
) -> None:
|
47
|
+
"""Extract and process email headers."""
|
48
|
+
# Use single dict access where possible to avoid repeated lookups
|
49
|
+
subject = parsed_email.get("subject")
|
50
|
+
if subject:
|
51
|
+
metadata["subject"] = subject
|
52
|
+
text_parts.append(f"Subject: {subject}")
|
53
|
+
|
54
|
+
from_info = parsed_email.get("from")
|
55
|
+
if from_info:
|
56
|
+
from_email = from_info.get("email", "") if isinstance(from_info, dict) else str(from_info)
|
57
|
+
metadata["email_from"] = from_email
|
58
|
+
text_parts.append(f"From: {from_email}")
|
59
|
+
|
60
|
+
to_info = parsed_email.get("to")
|
61
|
+
if to_info:
|
62
|
+
if isinstance(to_info, list) and to_info:
|
63
|
+
to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
|
64
|
+
elif isinstance(to_info, dict):
|
65
|
+
to_email = to_info.get("email", "")
|
66
|
+
else:
|
67
|
+
to_email = str(to_info)
|
68
|
+
metadata["email_to"] = to_email
|
69
|
+
text_parts.append(f"To: {to_email}")
|
70
|
+
|
71
|
+
date = parsed_email.get("date")
|
72
|
+
if date:
|
73
|
+
metadata["date"] = date
|
74
|
+
text_parts.append(f"Date: {date}")
|
75
|
+
|
76
|
+
cc = parsed_email.get("cc")
|
77
|
+
if cc:
|
78
|
+
metadata["email_cc"] = cc
|
79
|
+
text_parts.append(f"CC: {cc}")
|
80
|
+
|
81
|
+
bcc = parsed_email.get("bcc")
|
82
|
+
if bcc:
|
83
|
+
metadata["email_bcc"] = bcc
|
84
|
+
text_parts.append(f"BCC: {bcc}")
|
85
|
+
|
86
|
+
def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
|
87
|
+
"""Extract and process email body content."""
|
88
|
+
text_content = parsed_email.get("text")
|
89
|
+
if text_content:
|
90
|
+
text_parts.append(f"\n{text_content}")
|
91
|
+
return # If we have text, prefer it over HTML
|
92
|
+
|
93
|
+
html_content = parsed_email.get("html")
|
94
|
+
if html_content:
|
95
|
+
if html2text is not None:
|
96
|
+
# Use html2text if available (faster path)
|
97
|
+
h = html2text.HTML2Text()
|
98
|
+
h.ignore_links = True
|
99
|
+
h.ignore_images = True
|
100
|
+
converted_text = h.handle(html_content)
|
101
|
+
text_parts.append(f"\n{converted_text}")
|
102
|
+
else:
|
103
|
+
# Fallback: strip HTML tags and unescape entities
|
104
|
+
clean_html = _HTML_TAG_PATTERN.sub("", html_content)
|
105
|
+
clean_html = unescape(clean_html)
|
106
|
+
text_parts.append(f"\n{clean_html}")
|
107
|
+
|
108
|
+
def _extract_email_attachments(
|
109
|
+
self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
|
110
|
+
) -> None:
|
111
|
+
"""Extract and process email attachments info."""
|
112
|
+
if parsed_email.get("attachments"):
|
113
|
+
attachment_names = [att.get("name", "unknown") for att in parsed_email["attachments"]]
|
114
|
+
metadata["attachments"] = attachment_names
|
115
|
+
if attachment_names:
|
116
|
+
text_parts.append(f"\nAttachments: {', '.join(attachment_names)}")
|
117
|
+
|
118
|
+
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
119
|
+
if mailparse is None:
|
120
|
+
msg = "mailparse is required for email extraction. Install with: pip install 'kreuzberg[additional-extensions]'"
|
121
|
+
raise MissingDependencyError(msg)
|
122
|
+
|
123
|
+
try:
|
124
|
+
parsed_email = mailparse.EmailDecode.load(content)
|
125
|
+
text_parts: list[str] = []
|
126
|
+
metadata: dict[str, Any] = {}
|
127
|
+
|
128
|
+
# Extract headers, body, and attachments
|
129
|
+
self._extract_email_headers(parsed_email, text_parts, metadata)
|
130
|
+
self._extract_email_body(parsed_email, text_parts)
|
131
|
+
self._extract_email_attachments(parsed_email, text_parts, metadata)
|
132
|
+
|
133
|
+
# Join efficiently
|
134
|
+
combined_text = "\n".join(text_parts)
|
135
|
+
|
136
|
+
return ExtractionResult(
|
137
|
+
content=normalize_spaces(combined_text),
|
138
|
+
mime_type=PLAIN_TEXT_MIME_TYPE,
|
139
|
+
metadata=normalize_metadata(metadata),
|
140
|
+
chunks=[],
|
141
|
+
)
|
142
|
+
|
143
|
+
except Exception as e:
|
144
|
+
msg = f"Failed to parse email content: {e}"
|
145
|
+
raise RuntimeError(msg) from e
|
146
|
+
|
147
|
+
def extract_path_sync(self, path: Path) -> ExtractionResult:
|
148
|
+
content = path.read_bytes()
|
149
|
+
return self.extract_bytes_sync(content)
|
kreuzberg/_extractors/_html.py
CHANGED
@@ -8,7 +8,7 @@ from anyio import Path as AsyncPath
|
|
8
8
|
from kreuzberg._extractors._base import Extractor
|
9
9
|
from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
|
10
10
|
from kreuzberg._types import ExtractionResult
|
11
|
-
from kreuzberg._utils._string import
|
11
|
+
from kreuzberg._utils._string import safe_decode
|
12
12
|
from kreuzberg._utils._sync import run_sync
|
13
13
|
|
14
14
|
if TYPE_CHECKING:
|
@@ -26,8 +26,20 @@ class HTMLExtractor(Extractor):
|
|
26
26
|
return await run_sync(self.extract_bytes_sync, content)
|
27
27
|
|
28
28
|
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
29
|
-
|
30
|
-
|
29
|
+
# Use html-to-markdown with script/nav removal for better quality
|
30
|
+
result = html_to_markdown.convert_to_markdown(
|
31
|
+
safe_decode(content),
|
32
|
+
preprocess_html=True,
|
33
|
+
preprocessing_preset="aggressive",
|
34
|
+
remove_navigation=True,
|
35
|
+
remove_forms=True,
|
36
|
+
)
|
37
|
+
|
38
|
+
# Skip normalize_spaces since quality processing will handle whitespace
|
39
|
+
extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
|
40
|
+
|
41
|
+
# Apply quality processing which includes normalization
|
42
|
+
return self._apply_quality_processing(extraction_result)
|
31
43
|
|
32
44
|
def extract_path_sync(self, path: Path) -> ExtractionResult:
|
33
45
|
content = path.read_bytes()
|
kreuzberg/_extractors/_image.py
CHANGED
@@ -1,5 +1,9 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import contextlib
|
4
|
+
import os
|
5
|
+
import tempfile
|
6
|
+
from pathlib import Path
|
3
7
|
from typing import TYPE_CHECKING, ClassVar
|
4
8
|
|
5
9
|
from anyio import Path as AsyncPath
|
@@ -7,17 +11,13 @@ from anyio import Path as AsyncPath
|
|
7
11
|
from kreuzberg._extractors._base import Extractor
|
8
12
|
from kreuzberg._mime_types import IMAGE_MIME_TYPES
|
9
13
|
from kreuzberg._ocr import get_ocr_backend
|
14
|
+
from kreuzberg._types import ExtractionResult
|
10
15
|
from kreuzberg._utils._tmp import create_temp_file
|
11
16
|
from kreuzberg.exceptions import ValidationError
|
12
17
|
|
13
18
|
if TYPE_CHECKING: # pragma: no cover
|
14
19
|
from collections.abc import Mapping
|
15
20
|
|
16
|
-
from kreuzberg._types import ExtractionResult
|
17
|
-
|
18
|
-
import contextlib
|
19
|
-
from pathlib import Path
|
20
|
-
|
21
21
|
|
22
22
|
class ImageExtractor(Extractor):
|
23
23
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = IMAGE_MIME_TYPES
|
@@ -56,13 +56,11 @@ class ImageExtractor(Extractor):
|
|
56
56
|
if self.config.ocr_backend is None:
|
57
57
|
raise ValidationError("ocr_backend is None, cannot perform OCR")
|
58
58
|
|
59
|
-
|
59
|
+
result = await get_ocr_backend(self.config.ocr_backend).process_file(path, **self.config.get_config_dict())
|
60
|
+
return self._apply_quality_processing(result)
|
60
61
|
|
61
62
|
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
62
63
|
"""Pure sync implementation of extract_bytes."""
|
63
|
-
import os
|
64
|
-
import tempfile
|
65
|
-
|
66
64
|
extension = self._get_extension_from_mime_type(self.mime_type)
|
67
65
|
fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
|
68
66
|
|
@@ -80,10 +78,8 @@ class ImageExtractor(Extractor):
|
|
80
78
|
if self.config.ocr_backend is None:
|
81
79
|
raise ValidationError("ocr_backend is None, cannot perform OCR")
|
82
80
|
|
83
|
-
from kreuzberg._types import ExtractionResult
|
84
|
-
|
85
81
|
if self.config.ocr_backend == "tesseract":
|
86
|
-
from kreuzberg.
|
82
|
+
from kreuzberg._ocr._sync import process_batch_images_sync
|
87
83
|
from kreuzberg._ocr._tesseract import TesseractConfig
|
88
84
|
|
89
85
|
if isinstance(self.config.ocr_config, TesseractConfig):
|
@@ -91,30 +87,33 @@ class ImageExtractor(Extractor):
|
|
91
87
|
else:
|
92
88
|
config = TesseractConfig()
|
93
89
|
|
94
|
-
results =
|
90
|
+
results = process_batch_images_sync([str(path)], config, backend="tesseract")
|
95
91
|
if results:
|
96
|
-
|
92
|
+
result = results[0]
|
93
|
+
return self._apply_quality_processing(result)
|
97
94
|
return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])
|
98
95
|
|
99
96
|
if self.config.ocr_backend == "paddleocr":
|
100
|
-
from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
|
101
97
|
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
98
|
+
from kreuzberg._ocr._sync import process_image_paddleocr_sync as paddle_process
|
102
99
|
|
103
100
|
paddle_config = (
|
104
101
|
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
105
102
|
)
|
106
103
|
|
107
|
-
|
104
|
+
result = paddle_process(path, paddle_config)
|
105
|
+
return self._apply_quality_processing(result)
|
108
106
|
|
109
107
|
if self.config.ocr_backend == "easyocr":
|
110
|
-
from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
|
111
108
|
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
109
|
+
from kreuzberg._ocr._sync import process_image_easyocr_sync as easy_process
|
112
110
|
|
113
111
|
easy_config = (
|
114
112
|
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
115
113
|
)
|
116
114
|
|
117
|
-
|
115
|
+
result = easy_process(path, easy_config)
|
116
|
+
return self._apply_quality_processing(result)
|
118
117
|
|
119
118
|
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
120
119
|
|
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -10,15 +10,17 @@ from typing import TYPE_CHECKING, ClassVar, cast
|
|
10
10
|
import anyio
|
11
11
|
import pypdfium2
|
12
12
|
from anyio import Path as AsyncPath
|
13
|
+
from playa import parse
|
13
14
|
|
14
15
|
from kreuzberg._extractors._base import Extractor
|
15
16
|
from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
|
16
17
|
from kreuzberg._ocr import get_ocr_backend
|
17
|
-
from kreuzberg._playa import extract_pdf_metadata
|
18
|
+
from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
|
18
19
|
from kreuzberg._types import ExtractionResult, OcrBackendType
|
19
20
|
from kreuzberg._utils._pdf_lock import pypdfium_file_lock
|
20
21
|
from kreuzberg._utils._string import normalize_spaces
|
21
22
|
from kreuzberg._utils._sync import run_sync, run_taskgroup_batched
|
23
|
+
from kreuzberg._utils._table import generate_table_summary
|
22
24
|
from kreuzberg._utils._tmp import create_temp_file
|
23
25
|
from kreuzberg.exceptions import ParsingError
|
24
26
|
|
@@ -63,11 +65,27 @@ class PDFExtractor(Extractor):
|
|
63
65
|
result.metadata = await extract_pdf_metadata(content_bytes)
|
64
66
|
|
65
67
|
if self.config.extract_tables:
|
66
|
-
|
67
|
-
|
68
|
-
|
68
|
+
# GMFT is optional dependency
|
69
|
+
try:
|
70
|
+
from kreuzberg._gmft import extract_tables
|
69
71
|
|
70
|
-
|
72
|
+
result.tables = await extract_tables(path, self.config.gmft_config)
|
73
|
+
except ImportError:
|
74
|
+
result.tables = []
|
75
|
+
|
76
|
+
# Enhance metadata with table information
|
77
|
+
if result.tables:
|
78
|
+
table_summary = generate_table_summary(result.tables)
|
79
|
+
result.metadata.update(
|
80
|
+
{
|
81
|
+
"table_count": table_summary["table_count"],
|
82
|
+
"tables_summary": f"Document contains {table_summary['table_count']} tables "
|
83
|
+
f"across {table_summary['pages_with_tables']} pages with "
|
84
|
+
f"{table_summary['total_rows']} total rows",
|
85
|
+
}
|
86
|
+
)
|
87
|
+
|
88
|
+
return self._apply_quality_processing(result)
|
71
89
|
|
72
90
|
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
73
91
|
"""Pure sync implementation of PDF extraction from bytes."""
|
@@ -81,8 +99,6 @@ class PDFExtractor(Extractor):
|
|
81
99
|
|
82
100
|
result = self.extract_path_sync(Path(temp_path))
|
83
101
|
|
84
|
-
from kreuzberg._playa import extract_pdf_metadata_sync
|
85
|
-
|
86
102
|
metadata = extract_pdf_metadata_sync(content)
|
87
103
|
result.metadata = metadata
|
88
104
|
|
@@ -100,16 +116,21 @@ class PDFExtractor(Extractor):
|
|
100
116
|
|
101
117
|
tables = []
|
102
118
|
if self.config.extract_tables:
|
119
|
+
# GMFT is optional dependency
|
103
120
|
try:
|
104
121
|
from kreuzberg._gmft import extract_tables_sync
|
105
122
|
|
106
123
|
tables = extract_tables_sync(path)
|
107
124
|
except ImportError:
|
108
|
-
|
125
|
+
tables = []
|
126
|
+
|
127
|
+
# Use playa for better text structure preservation when not using OCR
|
128
|
+
if not self.config.force_ocr and self._validate_extracted_text(text):
|
129
|
+
text = self._extract_with_playa_sync(path, fallback_text=text)
|
109
130
|
|
110
131
|
text = normalize_spaces(text)
|
111
132
|
|
112
|
-
|
133
|
+
result = ExtractionResult(
|
113
134
|
content=text,
|
114
135
|
mime_type=PLAIN_TEXT_MIME_TYPE,
|
115
136
|
metadata={},
|
@@ -117,6 +138,21 @@ class PDFExtractor(Extractor):
|
|
117
138
|
chunks=[],
|
118
139
|
)
|
119
140
|
|
141
|
+
# Enhance metadata with table information
|
142
|
+
if tables:
|
143
|
+
table_summary = generate_table_summary(tables)
|
144
|
+
result.metadata.update(
|
145
|
+
{
|
146
|
+
"table_count": table_summary["table_count"],
|
147
|
+
"tables_summary": f"Document contains {table_summary['table_count']} tables "
|
148
|
+
f"across {table_summary['pages_with_tables']} pages with "
|
149
|
+
f"{table_summary['total_rows']} total rows",
|
150
|
+
}
|
151
|
+
)
|
152
|
+
|
153
|
+
# Apply quality processing
|
154
|
+
return self._apply_quality_processing(result)
|
155
|
+
|
120
156
|
def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
|
121
157
|
"""Check if text extracted from PDF is valid or corrupted.
|
122
158
|
|
@@ -283,7 +319,7 @@ class PDFExtractor(Extractor):
|
|
283
319
|
text_parts = []
|
284
320
|
for page in pdf:
|
285
321
|
text_page = page.get_textpage()
|
286
|
-
text = text_page.
|
322
|
+
text = text_page.get_text_bounded()
|
287
323
|
text_parts.append(text)
|
288
324
|
text_page.close()
|
289
325
|
page.close()
|
@@ -340,19 +376,19 @@ class PDFExtractor(Extractor):
|
|
340
376
|
def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
|
341
377
|
"""Process PDF images with the configured OCR backend."""
|
342
378
|
if self.config.ocr_backend == "tesseract":
|
343
|
-
from kreuzberg.
|
379
|
+
from kreuzberg._ocr._sync import process_batch_images_sync
|
344
380
|
from kreuzberg._ocr._tesseract import TesseractConfig
|
345
381
|
|
346
382
|
tesseract_config = (
|
347
383
|
self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
|
348
384
|
)
|
349
|
-
results =
|
385
|
+
results = process_batch_images_sync([str(p) for p in image_paths], tesseract_config, backend="tesseract")
|
350
386
|
text_parts = [r.content for r in results]
|
351
387
|
return "\n\n".join(text_parts)
|
352
388
|
|
353
389
|
if self.config.ocr_backend == "paddleocr":
|
354
|
-
from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
|
355
390
|
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
391
|
+
from kreuzberg._ocr._sync import process_image_paddleocr_sync as paddle_process
|
356
392
|
|
357
393
|
paddle_config = (
|
358
394
|
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
@@ -365,8 +401,8 @@ class PDFExtractor(Extractor):
|
|
365
401
|
return "\n\n".join(text_parts)
|
366
402
|
|
367
403
|
if self.config.ocr_backend == "easyocr":
|
368
|
-
from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
|
369
404
|
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
405
|
+
from kreuzberg._ocr._sync import process_image_easyocr_sync as easy_process
|
370
406
|
|
371
407
|
easy_config = (
|
372
408
|
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
@@ -379,3 +415,21 @@ class PDFExtractor(Extractor):
|
|
379
415
|
return "\n\n".join(text_parts)
|
380
416
|
|
381
417
|
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
418
|
+
|
419
|
+
def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
|
420
|
+
"""Extract text using playa for better structure preservation."""
|
421
|
+
with contextlib.suppress(Exception):
|
422
|
+
content = path.read_bytes()
|
423
|
+
document = parse(content, max_workers=1)
|
424
|
+
|
425
|
+
text_parts = []
|
426
|
+
for page in document.pages:
|
427
|
+
# Extract text while preserving structure
|
428
|
+
page_text = page.extract_text()
|
429
|
+
if page_text and page_text.strip():
|
430
|
+
text_parts.append(page_text)
|
431
|
+
|
432
|
+
if text_parts:
|
433
|
+
return "\n\n".join(text_parts)
|
434
|
+
|
435
|
+
return fallback_text
|
@@ -30,6 +30,9 @@ if TYPE_CHECKING: # pragma: no cover
|
|
30
30
|
|
31
31
|
from kreuzberg._types import Metadata
|
32
32
|
|
33
|
+
# Pre-compiled regex patterns for performance
|
34
|
+
_NON_WORD_PATTERN = re.compile(r"\W")
|
35
|
+
|
33
36
|
|
34
37
|
class PresentationExtractor(Extractor):
|
35
38
|
"""Extractor for PowerPoint (.pptx) files.
|
@@ -141,7 +144,7 @@ class PresentationExtractor(Extractor):
|
|
141
144
|
with suppress(AttributeError):
|
142
145
|
alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") # noqa: SLF001
|
143
146
|
|
144
|
-
filename =
|
147
|
+
filename = _NON_WORD_PATTERN.sub("", shape.name) + ".jpg"
|
145
148
|
md_content += f"\n\n"
|
146
149
|
|
147
150
|
elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
|
@@ -162,7 +165,10 @@ class PresentationExtractor(Extractor):
|
|
162
165
|
md_content += "\n" + html_table + "\n"
|
163
166
|
|
164
167
|
elif shape.has_text_frame:
|
165
|
-
|
168
|
+
if shape == title:
|
169
|
+
md_content += "# " + shape.text.lstrip() + "\n"
|
170
|
+
else:
|
171
|
+
md_content += shape.text + "\n"
|
166
172
|
|
167
173
|
md_content = md_content.strip()
|
168
174
|
if slide.has_notes_slide:
|
@@ -174,13 +180,15 @@ class PresentationExtractor(Extractor):
|
|
174
180
|
|
175
181
|
md_content = md_content.strip()
|
176
182
|
|
177
|
-
|
183
|
+
result = ExtractionResult(
|
178
184
|
content=normalize_spaces(md_content),
|
179
185
|
mime_type=MARKDOWN_MIME_TYPE,
|
180
186
|
metadata=self._extract_presentation_metadata(presentation),
|
181
187
|
chunks=[],
|
182
188
|
)
|
183
189
|
|
190
|
+
return self._apply_quality_processing(result)
|
191
|
+
|
184
192
|
@staticmethod
|
185
193
|
def _extract_presentation_metadata(presentation: Presentation) -> Metadata:
|
186
194
|
"""Extract metadata from a presentation instance.
|
@@ -193,7 +201,24 @@ class PresentationExtractor(Extractor):
|
|
193
201
|
"""
|
194
202
|
metadata: Metadata = {}
|
195
203
|
|
196
|
-
|
204
|
+
# Extract core properties
|
205
|
+
PresentationExtractor._extract_core_properties(presentation, metadata)
|
206
|
+
|
207
|
+
# Extract fonts used in presentation
|
208
|
+
fonts = PresentationExtractor._extract_fonts(presentation)
|
209
|
+
if fonts:
|
210
|
+
metadata["fonts"] = list(fonts)
|
211
|
+
|
212
|
+
# Add structural information
|
213
|
+
PresentationExtractor._add_presentation_structure_info(presentation, metadata, fonts)
|
214
|
+
|
215
|
+
return metadata
|
216
|
+
|
217
|
+
@staticmethod
|
218
|
+
def _extract_core_properties(presentation: Presentation, metadata: Metadata) -> None:
|
219
|
+
"""Extract core document properties from presentation."""
|
220
|
+
# Property mapping for core metadata
|
221
|
+
property_mapping = [
|
197
222
|
("authors", "author"),
|
198
223
|
("comments", "comments"),
|
199
224
|
("status", "content_status"),
|
@@ -205,17 +230,22 @@ class PresentationExtractor(Extractor):
|
|
205
230
|
("version", "revision"),
|
206
231
|
("subject", "subject"),
|
207
232
|
("title", "title"),
|
208
|
-
|
209
|
-
|
233
|
+
]
|
234
|
+
|
235
|
+
for metadata_key, core_property_key in property_mapping:
|
210
236
|
if core_property := getattr(presentation.core_properties, core_property_key, None):
|
211
237
|
metadata[metadata_key] = core_property # type: ignore[literal-required]
|
212
238
|
|
239
|
+
# Handle special list properties
|
213
240
|
if presentation.core_properties.language:
|
214
241
|
metadata["languages"] = [presentation.core_properties.language]
|
215
242
|
|
216
243
|
if presentation.core_properties.category:
|
217
244
|
metadata["categories"] = [presentation.core_properties.category]
|
218
245
|
|
246
|
+
@staticmethod
|
247
|
+
def _extract_fonts(presentation: Presentation) -> set[str]:
|
248
|
+
"""Extract all fonts used in the presentation."""
|
219
249
|
fonts = set()
|
220
250
|
for slide in presentation.slides:
|
221
251
|
for shape in slide.shapes:
|
@@ -226,8 +256,30 @@ class PresentationExtractor(Extractor):
|
|
226
256
|
for run in paragraph.runs:
|
227
257
|
if hasattr(run, "font") and run.font.name:
|
228
258
|
fonts.add(run.font.name)
|
259
|
+
return fonts
|
229
260
|
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
261
|
+
@staticmethod
|
262
|
+
def _add_presentation_structure_info(presentation: Presentation, metadata: Metadata, fonts: set[str]) -> None:
|
263
|
+
"""Add structural information about the presentation."""
|
264
|
+
slide_count = len(presentation.slides)
|
265
|
+
if slide_count == 0:
|
266
|
+
return
|
267
|
+
|
268
|
+
# Build description
|
269
|
+
structure_info = f"Presentation with {slide_count} slide{'s' if slide_count != 1 else ''}"
|
270
|
+
|
271
|
+
slides_with_notes = sum(1 for slide in presentation.slides if slide.has_notes_slide)
|
272
|
+
if slides_with_notes > 0:
|
273
|
+
structure_info += f", {slides_with_notes} with notes"
|
274
|
+
|
275
|
+
metadata["description"] = structure_info
|
276
|
+
|
277
|
+
# Build summary if not already present
|
278
|
+
if "summary" not in metadata:
|
279
|
+
summary_parts = [f"PowerPoint presentation with {slide_count} slides"]
|
280
|
+
if slides_with_notes > 0:
|
281
|
+
summary_parts.append(f"{slides_with_notes} slides have notes")
|
282
|
+
if fonts:
|
283
|
+
summary_parts.append(f"uses {len(fonts)} font{'s' if len(fonts) != 1 else ''}")
|
284
|
+
|
285
|
+
metadata["summary"] = f"{'. '.join(summary_parts)}."
|