kreuzberg 3.14.1__py3-none-any.whl → 3.15.0__py3-none-any.whl
- kreuzberg/__init__.py +6 -0
- kreuzberg/_api/_config_cache.py +247 -0
- kreuzberg/_api/main.py +127 -45
- kreuzberg/_chunker.py +7 -6
- kreuzberg/_constants.py +2 -0
- kreuzberg/_document_classification.py +4 -6
- kreuzberg/_entity_extraction.py +9 -4
- kreuzberg/_extractors/_base.py +269 -3
- kreuzberg/_extractors/_email.py +95 -27
- kreuzberg/_extractors/_html.py +85 -7
- kreuzberg/_extractors/_image.py +23 -22
- kreuzberg/_extractors/_pandoc.py +106 -75
- kreuzberg/_extractors/_pdf.py +209 -99
- kreuzberg/_extractors/_presentation.py +72 -8
- kreuzberg/_extractors/_spread_sheet.py +25 -30
- kreuzberg/_mcp/server.py +345 -25
- kreuzberg/_mime_types.py +42 -0
- kreuzberg/_ocr/_easyocr.py +2 -2
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +74 -34
- kreuzberg/_types.py +180 -21
- kreuzberg/_utils/_cache.py +10 -4
- kreuzberg/_utils/_device.py +2 -4
- kreuzberg/_utils/_image_preprocessing.py +12 -39
- kreuzberg/_utils/_process_pool.py +29 -8
- kreuzberg/_utils/_quality.py +7 -2
- kreuzberg/_utils/_resource_managers.py +65 -0
- kreuzberg/_utils/_sync.py +36 -6
- kreuzberg/_utils/_tmp.py +37 -1
- kreuzberg/cli.py +34 -20
- kreuzberg/extraction.py +43 -27
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.15.0.dist-info}/METADATA +2 -1
- kreuzberg-3.15.0.dist-info/RECORD +60 -0
- kreuzberg-3.14.1.dist-info/RECORD +0 -58
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.15.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.15.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.15.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -1,38 +1,61 @@
 from __future__ import annotations
 
+import asyncio
 import contextlib
+import io
+import logging
 import os
 import tempfile
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import asdict
+from itertools import count
 from multiprocessing import cpu_count
 from pathlib import Path
 from re import Pattern
 from re import compile as compile_regex
-from typing import TYPE_CHECKING, ClassVar, cast
+from typing import TYPE_CHECKING, Any, ClassVar, cast
 
 import anyio
 import pypdfium2
 from anyio import Path as AsyncPath
 from playa import parse
+from playa.document import Document
+from playa.image import get_image_suffix_and_writer
 
+from kreuzberg._constants import PDF_POINTS_PER_INCH
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr import get_ocr_backend
 from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
-from kreuzberg._types import
+from kreuzberg._types import (
+    EasyOCRConfig,
+    ExtractedImage,
+    ExtractionResult,
+    ImageOCRResult,
+    Metadata,
+    OcrBackendType,
+    PaddleOCRConfig,
+    TesseractConfig,
+)
 from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
-from kreuzberg._utils.
+from kreuzberg._utils._resource_managers import pdf_document, pdf_document_sync, pdf_resources_sync
 from kreuzberg._utils._string import normalize_spaces
-from kreuzberg._utils._sync import
+from kreuzberg._utils._sync import run_maybe_async, run_taskgroup_batched
 from kreuzberg._utils._table import generate_table_summary
-from kreuzberg._utils._tmp import
+from kreuzberg._utils._tmp import temporary_file, temporary_file_sync
 from kreuzberg.exceptions import ParsingError
 
 if TYPE_CHECKING:  # pragma: no cover
     from PIL.Image import Image
     from playa.document import Document
 
+logger = logging.getLogger(__name__)
+
+PDF_MAX_WORKERS = 8
+PDF_MAX_RETRY_ATTEMPTS = 3
+PDF_RETRY_DELAY_BASE = 0.5
+
 
 class PDFExtractor(Extractor):
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {PDF_MIME_TYPE}
@@ -41,27 +64,26 @@ class PDFExtractor(Extractor):
     MINIMUM_CORRUPTED_RESULTS: ClassVar[int] = 2
 
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
-
-        await AsyncPath(file_path).write_bytes(content)
-        try:
+        async with temporary_file(".pdf", content) as file_path:
             metadata = await self._extract_metadata_with_password_attempts(content)
             result = await self.extract_path_async(file_path)
-
             result.metadata = metadata
             return result
-        finally:
-            await unlink()
 
     async def extract_path_async(self, path: Path) -> ExtractionResult:
         content_bytes = await AsyncPath(path).read_bytes()
 
         result: ExtractionResult | None = None
 
+        document: Document | None = None
+        if self.config.extract_images or self.config.extract_tables:
+            document = self._parse_with_password_attempts(content_bytes)
+
         if not self.config.force_ocr:
             try:
                 content = await self._extract_pdf_searchable_text(path)
                 if self._validate_extracted_text(content):
-                    result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}
+                    result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
             except ParsingError:
                 pass
 
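The `async with temporary_file(".pdf", content)` block above replaces the old manual write-then-`unlink()` sequence. The helpers themselves live in kreuzberg/_utils/_tmp.py (+37 lines in this release, not shown in this diff); a minimal sketch of the assumed shape, inferred only from the call sites and the removed code:

# Hypothetical sketch of temporary_file; the packaged implementation
# in kreuzberg/_utils/_tmp.py may differ.
import contextlib
import os
import tempfile
from pathlib import Path

from anyio import Path as AsyncPath


@contextlib.asynccontextmanager
async def temporary_file(suffix: str, content: bytes):
    # create the file, hand its path to the caller, always unlink afterwards
    fd, raw_path = tempfile.mkstemp(suffix=suffix)
    os.close(fd)
    path = Path(raw_path)
    try:
        await AsyncPath(path).write_bytes(content)
        yield path
    finally:
        with contextlib.suppress(OSError):
            path.unlink()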
@@ -69,16 +91,18 @@ class PDFExtractor(Extractor):
             result = await self._extract_pdf_text_with_ocr(path, self.config.ocr_backend)
 
         if not result:
-            result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}
+            result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
 
-
+        metadata = await self._extract_metadata_with_password_attempts(content_bytes)
+        result.metadata = metadata
 
         if self.config.extract_tables:
             # GMFT is optional dependency ~keep
             try:
                 from kreuzberg._gmft import extract_tables  # noqa: PLC0415
 
-
+                tables = await extract_tables(path, self.config.gmft_config)
+                result.tables = tables
             except ImportError:  # pragma: no cover
                 result.tables = []
 
@@ -91,25 +115,30 @@ class PDFExtractor(Extractor):
                 f"{table_summary['total_rows']} total rows",
             }
 
+        if self.config.extract_images and document:
+            images = await self._extract_images_from_playa(document)
+            images = self._check_image_memory_limits(images)
+            result.images = images
+            if self.config.ocr_extracted_images:
+                image_ocr_results = await self._process_images_with_ocr(result.images)
+                result.image_ocr_results = image_ocr_results
+
         return self._apply_quality_processing(result)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-
-
-        with os.fdopen(fd, "wb") as f:
-            f.write(content)
-
-        result = self.extract_path_sync(Path(temp_path))
-
+        with temporary_file_sync(".pdf", content) as temp_path:
+            result = self.extract_path_sync(temp_path)
             metadata = self._extract_metadata_with_password_attempts_sync(content)
             result.metadata = metadata
-
             return result
-        finally:
-            with contextlib.suppress(OSError):
-                Path(temp_path).unlink()
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
+        content_bytes = path.read_bytes()
+
+        document: Document | None = None
+        if self.config.extract_images or self.config.extract_tables:
+            document = self._parse_with_password_attempts(content_bytes)
+
         try:
             text = self._extract_pdf_searchable_text_sync(path)
         except ParsingError:
@@ -137,8 +166,7 @@ class PDFExtractor(Extractor):
             content=text,
             mime_type=PLAIN_TEXT_MIME_TYPE,
             metadata={},
-            tables=tables,
-            chunks=[],
+            tables=list(tables),
         )
 
         if tables:
@@ -150,6 +178,14 @@ class PDFExtractor(Extractor):
                 f"{table_summary['total_rows']} total rows",
             }
 
+        if self.config.extract_images and document:
+            images = self._extract_images_from_playa_sync(document)
+            images = self._check_image_memory_limits(images)
+            result.images = images
+            if self.config.ocr_extracted_images:
+                image_ocr_results: list[ImageOCRResult] = run_maybe_async(self._process_images_with_ocr, result.images)
+                result.image_ocr_results = image_ocr_results
+
         return self._apply_quality_processing(result)
 
     def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
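The sync path above reuses the async OCR pipeline via `run_maybe_async`, imported from kreuzberg._utils._sync (which grows by 36 lines in this release but is not shown here). A plausible sketch of its semantics, assuming it simply dispatches on whether it received a coroutine function:

# Hypothetical sketch of run_maybe_async; the real helper may differ,
# e.g. by reusing an already-running event loop instead of asyncio.run.
import asyncio
import inspect
from typing import Any, Callable


def run_maybe_async(fn: Callable[..., Any], *args: Any, **kwargs: Any) -> Any:
    if inspect.iscoroutinefunction(fn):
        return asyncio.run(fn(*args, **kwargs))
    return fn(*args, **kwargs)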
@@ -163,14 +199,95 @@ class PDFExtractor(Extractor):
 
         return (len(corruption_matches) / len(text)) < corruption_threshold
 
+    async def _extract_images_from_playa(self, doc: Document) -> list[ExtractedImage]:
+        async def extract_single_image(page_num: int, img_index: int, img_obj: Any) -> ExtractedImage | None:
+            try:
+                suffix, writer = get_image_suffix_and_writer(img_obj.stream)
+
+                buffer = io.BytesIO()
+                writer(buffer)
+
+                filename = f"page_{page_num}_image_{img_index}{suffix}"
+
+                return ExtractedImage(
+                    data=buffer.getvalue(),
+                    format=suffix[1:],
+                    filename=filename,
+                    page_number=page_num,
+                    dimensions=img_obj.srcsize,
+                    colorspace=img_obj.colorspace.name if img_obj.colorspace else None,
+                    bits_per_component=img_obj.bits,
+                    is_mask=img_obj.imagemask,
+                )
+            except Exception as e:  # noqa: BLE001
+                logger.warning("Failed to extract image on page %s: %s", page_num, e)
+                return None
+
+        tasks = []
+        img_counter = 1
+        for page_num, page in enumerate(doc.pages, 1):
+            for img_obj in page.images:
+                tasks.append(extract_single_image(page_num, img_counter, img_obj))
+                img_counter += 1
+
+        if tasks:
+            results = await asyncio.gather(*tasks)
+            return [img for img in results if img is not None]
+
+        return []
+
+    def _extract_images_from_playa_sync(self, doc: Document) -> list[ExtractedImage]:
+        def extract_single_image(page_num: int, img_index: int, img_obj: Any) -> ExtractedImage | None:
+            try:
+                suffix, writer = get_image_suffix_and_writer(img_obj.stream)
+
+                buffer = io.BytesIO()
+                writer(buffer)
+
+                filename = f"page_{page_num}_image_{img_index}{suffix}"
+
+                return ExtractedImage(
+                    data=buffer.getvalue(),
+                    format=suffix[1:],
+                    filename=filename,
+                    page_number=page_num,
+                    dimensions=img_obj.srcsize,
+                    colorspace=img_obj.colorspace.name if img_obj.colorspace else None,
+                    bits_per_component=img_obj.bits,
+                    is_mask=img_obj.imagemask,
+                )
+            except Exception as e:  # noqa: BLE001
+                logger.warning("Failed to extract image on page %s: %s", page_num, e)
+                return None
+
+        img_counter = count(1)
+        jobs = [
+            (page_num, next(img_counter), img_obj)
+            for page_num, page in enumerate(doc.pages, 1)
+            for img_obj in page.images
+        ]
+
+        if not jobs:
+            return []
+
+        images = []
+        max_workers = min(PDF_MAX_WORKERS, len(jobs))
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = {executor.submit(extract_single_image, *job): i for i, job in enumerate(jobs)}
+            for future in as_completed(futures):
+                result = future.result()
+                if result:
+                    images.append(result)
+
+        images.sort(key=lambda x: int((x.filename or "page_0_image_0.jpg").split("_")[-1].split(".")[0]))
+        return images
+
     async def _convert_pdf_to_images(self, input_file: Path) -> list[Image]:
-        document: pypdfium2.PdfDocument | None = None
         last_error = None
 
-        for attempt in range(
+        for attempt in range(PDF_MAX_RETRY_ATTEMPTS):  # ~keep
             try:
-                with
-                    document = await run_sync(pypdfium2.PdfDocument, str(input_file))
+                async with pdf_document(input_file) as document:
                     images = []
                     for page in cast("pypdfium2.PdfDocument", document):
                         width, height = page.get_size()
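`pdf_document` and `pdf_document_sync` come from the new kreuzberg/_utils/_resource_managers.py (+65 lines, not shown in this diff) and subsume the open/close/`pypdfium_file_lock` boilerplate the old `finally:` blocks carried. A sketch of what the sync variant presumably looks like, for orientation only:

# Hypothetical sketch of pdf_document_sync; the actual context manager
# likely also takes the per-file lock the old code acquired explicitly.
import contextlib
from pathlib import Path

import pypdfium2


@contextlib.contextmanager
def pdf_document_sync(path: Path):
    pdf = pypdfium2.PdfDocument(str(path))
    try:
        yield pdf
    finally:
        with contextlib.suppress(Exception):
            pdf.close()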
@@ -187,9 +304,12 @@ class PDFExtractor(Extractor):
                         else:
                             optimal_dpi = self.config.target_dpi
 
-                        scale = optimal_dpi /
+                        scale = optimal_dpi / PDF_POINTS_PER_INCH
 
-
+                        bitmap = page.render(scale=scale)
+                        image = bitmap.to_pil()
+                        with pdf_resources_sync(bitmap):
+                            images.append(image)
                     return images
             except pypdfium2.PdfiumError as e:  # noqa: PERF203
                 last_error = e
@@ -204,11 +324,7 @@ class PDFExtractor(Extractor):
                     ),
                 ) from e
                 # Wait before retry with exponential backoff  # ~keep
-                await anyio.sleep(
-            finally:
-                if document:
-                    with pypdfium_file_lock(input_file), contextlib.suppress(Exception):
-                        await run_sync(document.close)
+                await anyio.sleep(PDF_RETRY_DELAY_BASE * (attempt + 1))
 
         # All retries failed  # ~keep
         raise ParsingError(
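Note the delay schedule: with `PDF_RETRY_DELAY_BASE = 0.5`, the sleep `PDF_RETRY_DELAY_BASE * (attempt + 1)` grows linearly, 0.5 s after the first failed attempt and 1.0 s after the second, even though the retained comment says exponential backoff; the final failed attempt raises without sleeping. A worked example of the implied schedule:

# Illustrative snippet, not code from the package: the delays implied
# by the module constants above, with attempts 0-indexed.
PDF_MAX_RETRY_ATTEMPTS = 3
PDF_RETRY_DELAY_BASE = 0.5

delays = [PDF_RETRY_DELAY_BASE * (attempt + 1) for attempt in range(PDF_MAX_RETRY_ATTEMPTS - 1)]
print(delays)  # [0.5, 1.0] -- no sleep after the final attempt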
@@ -217,7 +333,7 @@ class PDFExtractor(Extractor):
                 operation="convert_pdf_to_images",
                 file_path=input_file,
                 error=last_error,
-                attempts=
+                attempts=PDF_MAX_RETRY_ATTEMPTS,
             ),
         ) from last_error
 
@@ -230,14 +346,12 @@ class PDFExtractor(Extractor):
         )
         content = "\n".join(result.content for result in ocr_results)
 
-        return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}
+        return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
 
     @staticmethod
     async def _extract_pdf_searchable_text(input_file: Path) -> str:
-        document: pypdfium2.PdfDocument | None = None
         try:
-            with
-                document = await run_sync(pypdfium2.PdfDocument, str(input_file))
+            async with pdf_document(input_file) as document:
                 pages_content = []
                 page_errors = []
 
@@ -246,6 +360,8 @@ class PDFExtractor(Extractor):
                         text_page = page.get_textpage()
                         page_content = text_page.get_text_bounded()
                         pages_content.append(page_content)
+                        with pdf_resources_sync(text_page):
+                            pass
                     except Exception as e:  # noqa: PERF203, BLE001
                         page_errors.append({"page": i + 1, "error": str(e)})
                         pages_content.append(f"[Error extracting page {i + 1}]")
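The `with pdf_resources_sync(text_page): pass` idiom looks odd but is deliberate: the block body is empty because the work happens in `__exit__`, which closes the handles it was given, replacing the bare `text_page.close()` / `page.close()` calls removed elsewhere in this file. A sketch of the assumed helper:

# Hypothetical sketch of pdf_resources_sync from the new
# kreuzberg/_utils/_resource_managers.py: close every handle on exit.
import contextlib
from typing import Any


@contextlib.contextmanager
def pdf_resources_sync(*resources: Any):
    try:
        yield resources
    finally:
        for resource in resources:
            with contextlib.suppress(Exception):
                resource.close()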
@@ -275,52 +391,67 @@ class PDFExtractor(Extractor):
                     error=e,
                 ),
             ) from e
-        finally:
-            if document:
-                with pypdfium_file_lock(input_file), contextlib.suppress(Exception):
-                    await run_sync(document.close)
 
     def _extract_pdf_searchable_text_sync(self, path: Path) -> str:
-        pdf = None
         try:
-            with
-                pdf = pypdfium2.PdfDocument(str(path))
+            with pdf_document_sync(path) as pdf:
                 pages_text = []
                 for page in pdf:
                     text_page = page.get_textpage()
                     text = text_page.get_text_bounded()
                     pages_text.append(text)
-                    text_page
-
+                    with pdf_resources_sync(text_page, page):
+                        pass
                 return "\n".join(pages_text)
         except Exception as e:
             raise ParsingError(f"Failed to extract PDF text: {e}") from e
-        finally:
-            if pdf:
-                with pypdfium_file_lock(path), contextlib.suppress(Exception):
-                    pdf.close()
 
     def _extract_pdf_with_ocr_sync(self, path: Path) -> str:
-
+        temp_files: list[Path] = []
         try:
-
-            with pypdfium_file_lock(path):
-                pdf = pypdfium2.PdfDocument(str(path))
+            with pdf_document_sync(path) as pdf:
                 for page in pdf:
-
+                    width, height = page.get_size()
+
+                    if self.config.auto_adjust_dpi:
+                        optimal_dpi = calculate_optimal_dpi(
+                            page_width=width,
+                            page_height=height,
+                            target_dpi=self.config.target_dpi,
+                            max_dimension=self.config.max_image_dimension,
+                            min_dpi=self.config.min_dpi,
+                            max_dpi=self.config.max_dpi,
+                        )
+                    else:
+                        optimal_dpi = self.config.target_dpi
+
+                    scale = optimal_dpi / PDF_POINTS_PER_INCH
+
+                    bitmap = page.render(scale=scale)
                     pil_image = bitmap.to_pil()
-                    images.append(pil_image)
-                    bitmap.close()
-                    page.close()
 
-
+                    fd, tmp = tempfile.mkstemp(suffix=".png")
+                    try:
+                        os.close(fd)
+                        tmp_path = Path(tmp)
+                        pil_image.save(tmp_path)
+                        temp_files.append(tmp_path)
+                    except Exception:
+                        with contextlib.suppress(OSError):
+                            os.close(fd)
+                        raise
+                    finally:
+                        with pdf_resources_sync(bitmap, page):
+                            pil_image.close()
+
+            return self._process_pdf_images_with_ocr([str(p) for p in temp_files])
 
         except Exception as e:
             raise ParsingError(f"Failed to OCR PDF: {e}") from e
         finally:
-
-            with
-
+            for p in temp_files:
+                with contextlib.suppress(OSError):
+                    p.unlink()
 
     def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
         backend = get_ocr_backend(self.config.ocr_backend)
@@ -348,35 +479,12 @@ class PDFExtractor(Extractor):
         return "\n\n".join(result.content for result in results)
 
     def _process_pdf_images_with_ocr_direct(self, images: list[Image]) -> str:
+        if not self.config.ocr_backend:
+            raise ValueError("OCR backend must be specified")
         backend = get_ocr_backend(self.config.ocr_backend)
+        config = self._prepare_ocr_config(self.config.ocr_backend)
 
-
-            case "tesseract":
-                config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
-                )
-                results = []
-                for image in images:
-                    result = backend.process_image_sync(image, **asdict(config))
-                    results.append(result)
-            case "paddleocr":
-                paddle_config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
-                )
-                results = []
-                for image in images:
-                    result = backend.process_image_sync(image, **asdict(paddle_config))
-                    results.append(result)
-            case "easyocr":
-                easy_config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
-                )
-                results = []
-                for image in images:
-                    result = backend.process_image_sync(image, **asdict(easy_config))
-                    results.append(result)
-            case _:
-                raise NotImplementedError(f"Direct image OCR not implemented for {self.config.ocr_backend}")
+        results = [backend.process_image_sync(image, **config) for image in images]
 
         return "\n\n".join(result.content for result in results)
 
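The deleted `match`/`case` repeated the same select-a-config-then-`asdict` dance for each backend. The replacement delegates to `self._prepare_ocr_config`, newly added to kreuzberg/_extractors/_base.py (+269 lines, not shown here); judging from the removed branches, it presumably reduces to something like:

# Hypothetical reconstruction based on the deleted branches; the real
# _prepare_ocr_config method in _base.py is not part of this diff.
from dataclasses import asdict
from typing import Any

from kreuzberg._types import EasyOCRConfig, PaddleOCRConfig, TesseractConfig

_OCR_CONFIG_TYPES = {
    "tesseract": TesseractConfig,
    "paddleocr": PaddleOCRConfig,
    "easyocr": EasyOCRConfig,
}


def prepare_ocr_config(ocr_config: Any, backend: str) -> dict[str, Any]:
    # pick the backend's config dataclass (falling back to its defaults)
    # and flatten it to kwargs for backend.process_image_sync(image, **kwargs)
    config_type = _OCR_CONFIG_TYPES[backend]
    config = ocr_config if isinstance(ocr_config, config_type) else config_type()
    return asdict(config)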
@@ -390,9 +498,11 @@ class PDFExtractor(Extractor):
         for password in passwords:
             try:
                 return parse(content, max_workers=1, password=password)
-            except
+            except (ValueError, TypeError, KeyError, RuntimeError) as e:  # noqa: PERF203
                 last_exception = e
                 continue
+            except OSError as e:
+                raise ParsingError(f"Failed to parse PDF: {e}") from e
 
         if last_exception:
             raise last_exception from None
@@ -411,7 +521,7 @@ class PDFExtractor(Extractor):
         for password in passwords:
             try:
                 return await extract_pdf_metadata(content, password=password)
-            except
+            except (ParsingError, ValueError, TypeError, OSError) as e:  # noqa: PERF203
                 last_exception = e
                 continue
 
@@ -429,7 +539,7 @@ class PDFExtractor(Extractor):
         for password in passwords:
             try:
                 return extract_pdf_metadata_sync(content, password=password)
-            except
+            except (ParsingError, ValueError, TypeError, OSError) as e:  # noqa: PERF203
                 last_exception = e
                 continue
 
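The new behavior in this file is gated by two config flags read throughout the extractor. A usage sketch, assuming `extract_images` and `ocr_extracted_images` (whose attribute names this diff confirms) are accepted by `ExtractionConfig`:

# Usage sketch; the flag names come from the attribute reads in this
# diff (self.config.extract_images / self.config.ocr_extracted_images).
from kreuzberg import ExtractionConfig, extract_file_sync

config = ExtractionConfig(extract_images=True, ocr_extracted_images=True)
result = extract_file_sync("report.pdf", config=config)

for image in result.images:
    print(image.filename, image.page_number, image.dimensions)
for ocr_result in result.image_ocr_results:
    print(ocr_result)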
kreuzberg/_extractors/_presentation.py
CHANGED
@@ -1,11 +1,12 @@
 from __future__ import annotations
 
+import logging
 import re
 from contextlib import suppress
 from html import escape
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar
 
 import pptx
 from anyio import Path as AsyncPath
@@ -13,8 +14,9 @@ from pptx.enum.shapes import MSO_SHAPE_TYPE
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, POWER_POINT_MIME_TYPE
-from kreuzberg._types import ExtractionResult
+from kreuzberg._types import ExtractedImage, ExtractionResult, ImageOCRResult
 from kreuzberg._utils._string import normalize_spaces
+from kreuzberg._utils._sync import run_maybe_async
 
 if TYPE_CHECKING:  # pragma: no cover
     from pptx.presentation import Presentation
@@ -23,23 +25,41 @@ if TYPE_CHECKING:  # pragma: no cover
 
 _NON_WORD_PATTERN = re.compile(r"\W")
 
+logger = logging.getLogger(__name__)
+
 
 class PresentationExtractor(Extractor):
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {POWER_POINT_MIME_TYPE}
 
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
-
+        result = self._extract_pptx(content)
+        if self.config.extract_images and self.config.ocr_extracted_images and result.images:
+            image_ocr_results = await self._process_images_with_ocr(result.images)
+            result.image_ocr_results = image_ocr_results
+        return result
 
     async def extract_path_async(self, path: Path) -> ExtractionResult:
         content = await AsyncPath(path).read_bytes()
-
+        result = self._extract_pptx(content)
+        if self.config.extract_images and self.config.ocr_extracted_images and result.images:
+            image_ocr_results = await self._process_images_with_ocr(result.images)
+            result.image_ocr_results = image_ocr_results
+        return result
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-
+        result = self._extract_pptx(content)
+        if self.config.extract_images and self.config.ocr_extracted_images and result.images:
+            image_ocr_results: list[ImageOCRResult] = run_maybe_async(self._process_images_with_ocr, result.images)
+            result.image_ocr_results = image_ocr_results
+        return result
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
         content = Path(path).read_bytes()
-
+        result = self._extract_pptx(content)
+        if self.config.extract_images and self.config.ocr_extracted_images and result.images:
+            image_ocr_results: list[ImageOCRResult] = run_maybe_async(self._process_images_with_ocr, result.images)
+            result.image_ocr_results = image_ocr_results
+        return result
 
     def _extract_pptx(self, file_contents: bytes) -> ExtractionResult:
         md_content = ""
@@ -63,8 +83,10 @@ class PresentationExtractor(Extractor):
                 with suppress(AttributeError):
                     alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")  # noqa: SLF001
 
-
-
+                name_val = shape.name if isinstance(getattr(shape, "name", None), str) else "image"
+                filename = _NON_WORD_PATTERN.sub("", name_val) + ".jpg"
+                label = alt_text if alt_text else name_val
+                md_content += f"\n\n![{label}]({filename})\n\n"
 
             elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
                 html_table = "<table>"
@@ -106,8 +128,50 @@ class PresentationExtractor(Extractor):
             chunks=[],
         )
 
+        if self.config.extract_images:
+            images = self._extract_images_from_pptx(presentation)
+            result.images = images
+
         return self._apply_quality_processing(result)
 
+    def _extract_images_from_pptx(self, presentation: Presentation) -> list[ExtractedImage]:
+        images: list[ExtractedImage] = []
+
+        for slide_num, slide in enumerate(presentation.slides, 1):
+            for shape in slide.shapes:
+                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
+                    try:
+                        image = shape.image
+                        filename = f"slide_{slide_num}_image_{len(images) + 1}.{image.ext}"
+
+                        images.append(
+                            ExtractedImage(data=image.blob, format=image.ext, filename=filename, page_number=slide_num)
+                        )
+                    except Exception as e:  # noqa: BLE001
+                        logger.warning("Failed to extract image from slide %s: %s", slide_num, e)
+                        continue
+
+                elif shape.shape_type == MSO_SHAPE_TYPE.GROUP:
+                    images.extend(self._extract_from_grouped_shapes(shape, slide_num, len(images)))
+
+        return images
+
+    def _extract_from_grouped_shapes(self, group_shape: Any, slide_num: int, image_count: int) -> list[ExtractedImage]:
+        images: list[ExtractedImage] = []
+        for shape in group_shape.shapes:
+            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
+                try:
+                    image = shape.image
+                    filename = f"slide_{slide_num}_group_image_{image_count + len(images) + 1}.{image.ext}"
+                    images.append(
+                        ExtractedImage(data=image.blob, format=image.ext, filename=filename, page_number=slide_num)
+                    )
+                except Exception as e:  # noqa: BLE001
+                    logger.warning("Failed to extract grouped image: %s", e)
+            elif shape.shape_type == MSO_SHAPE_TYPE.GROUP:
+                images.extend(self._extract_from_grouped_shapes(shape, slide_num, image_count + len(images)))
+        return images
+
     @staticmethod
     def _extract_presentation_metadata(presentation: Presentation) -> Metadata:
         metadata: Metadata = {}
|