kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- kreuzberg/__init__.py +10 -0
- kreuzberg/_api/_config_cache.py +247 -0
- kreuzberg/_api/main.py +74 -45
- kreuzberg/_chunker.py +7 -6
- kreuzberg/_config.py +11 -1
- kreuzberg/_constants.py +2 -0
- kreuzberg/_document_classification.py +5 -7
- kreuzberg/_entity_extraction.py +9 -4
- kreuzberg/_extractors/_base.py +269 -3
- kreuzberg/_extractors/_email.py +101 -27
- kreuzberg/_extractors/_html.py +112 -7
- kreuzberg/_extractors/_image.py +23 -22
- kreuzberg/_extractors/_pandoc.py +106 -75
- kreuzberg/_extractors/_pdf.py +208 -99
- kreuzberg/_extractors/_presentation.py +76 -8
- kreuzberg/_extractors/_spread_sheet.py +24 -30
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +5 -0
- kreuzberg/_mcp/server.py +324 -25
- kreuzberg/_mime_types.py +42 -0
- kreuzberg/_ocr/_easyocr.py +53 -21
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +88 -37
- kreuzberg/_types.py +291 -61
- kreuzberg/_utils/_cache.py +10 -4
- kreuzberg/_utils/_device.py +2 -4
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_image_preprocessing.py +12 -39
- kreuzberg/_utils/_process_pool.py +29 -8
- kreuzberg/_utils/_quality.py +7 -2
- kreuzberg/_utils/_resource_managers.py +65 -0
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +39 -10
- kreuzberg/_utils/_tmp.py +37 -1
- kreuzberg/cli.py +34 -20
- kreuzberg/extraction.py +44 -28
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
- kreuzberg-3.16.0.dist-info/RECORD +61 -0
- kreuzberg-3.14.1.dist-info/RECORD +0 -58
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_pdf.py
CHANGED
```diff
@@ -1,38 +1,60 @@
 from __future__ import annotations
 
 import contextlib
+import io
+import logging
 import os
 import tempfile
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import asdict
+from itertools import count
 from multiprocessing import cpu_count
 from pathlib import Path
 from re import Pattern
 from re import compile as compile_regex
-from typing import TYPE_CHECKING, ClassVar, cast
+from typing import TYPE_CHECKING, Any, ClassVar, cast
 
 import anyio
 import pypdfium2
 from anyio import Path as AsyncPath
 from playa import parse
+from playa.document import Document
+from playa.image import get_image_suffix_and_writer
 
+from kreuzberg._constants import PDF_POINTS_PER_INCH
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr import get_ocr_backend
 from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
-from kreuzberg._types import
+from kreuzberg._types import (
+    EasyOCRConfig,
+    ExtractedImage,
+    ExtractionResult,
+    ImageOCRResult,
+    Metadata,
+    OcrBackendType,
+    PaddleOCRConfig,
+    TesseractConfig,
+)
 from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
-from kreuzberg._utils.
+from kreuzberg._utils._resource_managers import pdf_document, pdf_document_sync, pdf_resources_sync
 from kreuzberg._utils._string import normalize_spaces
-from kreuzberg._utils._sync import
+from kreuzberg._utils._sync import run_maybe_async, run_taskgroup, run_taskgroup_batched
 from kreuzberg._utils._table import generate_table_summary
-from kreuzberg._utils._tmp import
+from kreuzberg._utils._tmp import temporary_file, temporary_file_sync
 from kreuzberg.exceptions import ParsingError
 
 if TYPE_CHECKING:  # pragma: no cover
     from PIL.Image import Image
     from playa.document import Document
 
+logger = logging.getLogger(__name__)
+
+PDF_MAX_WORKERS = 8
+PDF_MAX_RETRY_ATTEMPTS = 3
+PDF_RETRY_DELAY_BASE = 0.5
+
 
 class PDFExtractor(Extractor):
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {PDF_MIME_TYPE}
```
```diff
@@ -41,27 +63,26 @@ class PDFExtractor(Extractor):
     MINIMUM_CORRUPTED_RESULTS: ClassVar[int] = 2
 
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
-
-        await AsyncPath(file_path).write_bytes(content)
-        try:
+        async with temporary_file(".pdf", content) as file_path:
             metadata = await self._extract_metadata_with_password_attempts(content)
             result = await self.extract_path_async(file_path)
-
             result.metadata = metadata
             return result
-        finally:
-            await unlink()
 
     async def extract_path_async(self, path: Path) -> ExtractionResult:
         content_bytes = await AsyncPath(path).read_bytes()
 
         result: ExtractionResult | None = None
 
+        document: Document | None = None
+        if self.config.extract_images or self.config.extract_tables:
+            document = self._parse_with_password_attempts(content_bytes)
+
         if not self.config.force_ocr:
             try:
                 content = await self._extract_pdf_searchable_text(path)
                 if self._validate_extracted_text(content):
-                    result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}
+                    result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
             except ParsingError:
                 pass
 
```
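The ad-hoc write-then-unlink dance is replaced by `temporary_file`/`temporary_file_sync` from `kreuzberg._utils._tmp`, whose source is not part of this diff. A minimal sketch consistent with the call sites above (bodies assumed, only the names and signatures are taken from the diff):

```python
# Sketch only: inferred from the call sites in this diff, not the actual
# kreuzberg._utils._tmp source.
import contextlib
import os
import tempfile
from contextlib import asynccontextmanager, contextmanager
from pathlib import Path

from anyio import Path as AsyncPath


@asynccontextmanager
async def temporary_file(suffix: str, content: bytes):
    """Write content to a temp file, yield its Path, and always unlink it."""
    fd, name = tempfile.mkstemp(suffix=suffix)
    os.close(fd)
    try:
        await AsyncPath(name).write_bytes(content)
        yield Path(name)
    finally:
        with contextlib.suppress(OSError):
            os.unlink(name)


@contextmanager
def temporary_file_sync(suffix: str, content: bytes):
    """Synchronous counterpart used by extract_bytes_sync below."""
    fd, name = tempfile.mkstemp(suffix=suffix)
    with os.fdopen(fd, "wb") as handle:
        handle.write(content)
    try:
        yield Path(name)
    finally:
        with contextlib.suppress(OSError):
            os.unlink(name)
```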
```diff
@@ -69,16 +90,18 @@ class PDFExtractor(Extractor):
             result = await self._extract_pdf_text_with_ocr(path, self.config.ocr_backend)
 
         if not result:
-            result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}
+            result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
 
-
+        metadata = await self._extract_metadata_with_password_attempts(content_bytes)
+        result.metadata = metadata
 
         if self.config.extract_tables:
             # GMFT is optional dependency ~keep
             try:
                 from kreuzberg._gmft import extract_tables  # noqa: PLC0415
 
-
+                tables = await extract_tables(path, self.config.gmft_config)
+                result.tables = tables
             except ImportError:  # pragma: no cover
                 result.tables = []
 
@@ -91,25 +114,30 @@ class PDFExtractor(Extractor):
                 f"{table_summary['total_rows']} total rows",
             }
 
+        if self.config.extract_images and document:
+            images = await self._extract_images_from_playa(document)
+            images = self._check_image_memory_limits(images)
+            result.images = images
+            if self.config.ocr_extracted_images:
+                image_ocr_results = await self._process_images_with_ocr(result.images)
+                result.image_ocr_results = image_ocr_results
+
         return self._apply_quality_processing(result)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-
-
-        with os.fdopen(fd, "wb") as f:
-            f.write(content)
-
-        result = self.extract_path_sync(Path(temp_path))
-
+        with temporary_file_sync(".pdf", content) as temp_path:
+            result = self.extract_path_sync(temp_path)
             metadata = self._extract_metadata_with_password_attempts_sync(content)
             result.metadata = metadata
-
             return result
-        finally:
-            with contextlib.suppress(OSError):
-                Path(temp_path).unlink()
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
+        content_bytes = path.read_bytes()
+
+        document: Document | None = None
+        if self.config.extract_images or self.config.extract_tables:
+            document = self._parse_with_password_attempts(content_bytes)
+
         try:
             text = self._extract_pdf_searchable_text_sync(path)
         except ParsingError:
```
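Both extraction paths now pass the images through `_check_image_memory_limits`, which lives in `kreuzberg._extractors._base` (+269 lines this release) and is not shown here. A plausible shape, with the byte budget purely a placeholder:

```python
# Assumed behavior: cap the total bytes of extracted images kept in memory.
# The real method and its actual limit are in kreuzberg._extractors._base.
from kreuzberg._types import ExtractedImage

MAX_TOTAL_IMAGE_BYTES = 100 * 1024 * 1024  # hypothetical budget


def check_image_memory_limits(images: list[ExtractedImage]) -> list[ExtractedImage]:
    kept: list[ExtractedImage] = []
    total = 0
    for image in images:
        if total + len(image.data) > MAX_TOTAL_IMAGE_BYTES:
            break  # drop the remainder once the budget is exhausted
        total += len(image.data)
        kept.append(image)
    return kept
```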
```diff
@@ -137,8 +165,7 @@ class PDFExtractor(Extractor):
             content=text,
             mime_type=PLAIN_TEXT_MIME_TYPE,
             metadata={},
-            tables=tables,
-            chunks=[],
+            tables=list(tables),
         )
 
         if tables:
@@ -150,6 +177,14 @@ class PDFExtractor(Extractor):
                 f"{table_summary['total_rows']} total rows",
             }
 
+        if self.config.extract_images and document:
+            images = self._extract_images_from_playa_sync(document)
+            images = self._check_image_memory_limits(images)
+            result.images = images
+            if self.config.ocr_extracted_images:
+                image_ocr_results: list[ImageOCRResult] = run_maybe_async(self._process_images_with_ocr, result.images)
+                result.image_ocr_results = image_ocr_results
+
         return self._apply_quality_processing(result)
 
     def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
```
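`_process_images_with_ocr` is an async helper, so the sync path reaches it through `run_maybe_async` from `kreuzberg._utils._sync`. A minimal sketch of that bridge (assumed; the shipped helper presumably also copes with being called while an event loop is already running):

```python
import inspect

import anyio


def run_maybe_async(fn, *args):
    """Call fn; if it is a coroutine function, drive it on a fresh event loop."""
    if inspect.iscoroutinefunction(fn):
        return anyio.run(fn, *args)
    return fn(*args)
```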
```diff
@@ -163,14 +198,95 @@ class PDFExtractor(Extractor):
 
         return (len(corruption_matches) / len(text)) < corruption_threshold
 
+    async def _extract_images_from_playa(self, doc: Document) -> list[ExtractedImage]:
+        async def extract_single_image(page_num: int, img_index: int, img_obj: Any) -> ExtractedImage | None:
+            try:
+                suffix, writer = get_image_suffix_and_writer(img_obj.stream)
+
+                buffer = io.BytesIO()
+                writer(buffer)
+
+                filename = f"page_{page_num}_image_{img_index}{suffix}"
+
+                return ExtractedImage(
+                    data=buffer.getvalue(),
+                    format=suffix[1:],
+                    filename=filename,
+                    page_number=page_num,
+                    dimensions=img_obj.srcsize,
+                    colorspace=img_obj.colorspace.name if img_obj.colorspace else None,
+                    bits_per_component=img_obj.bits,
+                    is_mask=img_obj.imagemask,
+                )
+            except Exception as e:  # noqa: BLE001
+                logger.warning("Failed to extract image on page %s: %s", page_num, e)
+                return None
+
+        tasks = []
+        img_counter = 1
+        for page_num, page in enumerate(doc.pages, 1):
+            for img_obj in page.images:
+                tasks.append(extract_single_image(page_num, img_counter, img_obj))
+                img_counter += 1
+
+        if tasks:
+            results = await run_taskgroup(*tasks)
+            return [img for img in results if img is not None]
+
+        return []
+
+    def _extract_images_from_playa_sync(self, doc: Document) -> list[ExtractedImage]:
+        def extract_single_image(page_num: int, img_index: int, img_obj: Any) -> ExtractedImage | None:
+            try:
+                suffix, writer = get_image_suffix_and_writer(img_obj.stream)
+
+                buffer = io.BytesIO()
+                writer(buffer)
+
+                filename = f"page_{page_num}_image_{img_index}{suffix}"
+
+                return ExtractedImage(
+                    data=buffer.getvalue(),
+                    format=suffix[1:],
+                    filename=filename,
+                    page_number=page_num,
+                    dimensions=img_obj.srcsize,
+                    colorspace=img_obj.colorspace.name if img_obj.colorspace else None,
+                    bits_per_component=img_obj.bits,
+                    is_mask=img_obj.imagemask,
+                )
+            except Exception as e:  # noqa: BLE001
+                logger.warning("Failed to extract image on page %s: %s", page_num, e)
+                return None
+
+        img_counter = count(1)
+        jobs = [
+            (page_num, next(img_counter), img_obj)
+            for page_num, page in enumerate(doc.pages, 1)
+            for img_obj in page.images
+        ]
+
+        if not jobs:
+            return []
+
+        images = []
+        max_workers = min(PDF_MAX_WORKERS, len(jobs))
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = {executor.submit(extract_single_image, *job): i for i, job in enumerate(jobs)}
+            for future in as_completed(futures):
+                result = future.result()
+                if result:
+                    images.append(result)
+
+        images.sort(key=lambda x: int((x.filename or "page_0_image_0.jpg").split("_")[-1].split(".")[0]))
+        return images
+
     async def _convert_pdf_to_images(self, input_file: Path) -> list[Image]:
-        document: pypdfium2.PdfDocument | None = None
         last_error = None
 
-        for attempt in range(
+        for attempt in range(PDF_MAX_RETRY_ATTEMPTS):  # ~keep
             try:
-                with
-                    document = await run_sync(pypdfium2.PdfDocument, str(input_file))
+                async with pdf_document(input_file) as document:
                     images = []
                     for page in cast("pypdfium2.PdfDocument", document):
                         width, height = page.get_size()
```
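The async variant fans the per-image coroutines out through `run_taskgroup`. Assuming gather-like semantics over an anyio task group (the diff also imports `run_taskgroup_batched`, not sketched here), it could look like:

```python
import anyio


async def run_taskgroup(*coroutines):
    """Await all coroutines concurrently, returning results in submission order."""
    results = [None] * len(coroutines)

    async def runner(index, coro):
        results[index] = await coro

    async with anyio.create_task_group() as tg:
        for i, coro in enumerate(coroutines):
            tg.start_soon(runner, i, coro)
    return results
```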
```diff
@@ -187,9 +303,12 @@ class PDFExtractor(Extractor):
                     else:
                         optimal_dpi = self.config.target_dpi
 
-                    scale = optimal_dpi /
+                    scale = optimal_dpi / PDF_POINTS_PER_INCH
 
-
+                    bitmap = page.render(scale=scale)
+                    image = bitmap.to_pil()
+                    with pdf_resources_sync(bitmap):
+                        images.append(image)
                 return images
             except pypdfium2.PdfiumError as e:  # noqa: PERF203
                 last_error = e
@@ -204,11 +323,7 @@ class PDFExtractor(Extractor):
                 ),
             ) from e
             # Wait before retry with exponential backoff # ~keep
-            await anyio.sleep(
-        finally:
-            if document:
-                with pypdfium_file_lock(input_file), contextlib.suppress(Exception):
-                    await run_sync(document.close)
+            await anyio.sleep(PDF_RETRY_DELAY_BASE * (attempt + 1))
 
         # All retries failed # ~keep
         raise ParsingError(
@@ -217,7 +332,7 @@ class PDFExtractor(Extractor):
                 operation="convert_pdf_to_images",
                 file_path=input_file,
                 error=last_error,
-                attempts=
+                attempts=PDF_MAX_RETRY_ATTEMPTS,
             ),
         ) from last_error
 
```
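The magic numbers are gone: the loop retries `PDF_MAX_RETRY_ATTEMPTS` times and sleeps `PDF_RETRY_DELAY_BASE * (attempt + 1)` between attempts. Note that despite the inherited comment, this schedule grows linearly (0.5 s, 1.0 s, 1.5 s), not exponentially. Stripped of the pypdfium2 specifics, the pattern is (a generic sketch; `operation` and `should_retry` are stand-ins):

```python
import anyio

PDF_MAX_RETRY_ATTEMPTS = 3
PDF_RETRY_DELAY_BASE = 0.5


async def render_with_retries(operation, should_retry):
    """Retry a transiently failing async operation with a linearly growing delay."""
    last_error = None
    for attempt in range(PDF_MAX_RETRY_ATTEMPTS):
        try:
            return await operation()
        except Exception as exc:
            last_error = exc
            if not should_retry(exc):
                raise
            if attempt < PDF_MAX_RETRY_ATTEMPTS - 1:
                # 0.5s, 1.0s, 1.5s -- linear growth, mirroring the diff above
                await anyio.sleep(PDF_RETRY_DELAY_BASE * (attempt + 1))
    raise RuntimeError("all retries failed") from last_error
```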
```diff
@@ -230,14 +345,12 @@ class PDFExtractor(Extractor):
         )
         content = "\n".join(result.content for result in ocr_results)
 
-        return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}
+        return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
 
     @staticmethod
     async def _extract_pdf_searchable_text(input_file: Path) -> str:
-        document: pypdfium2.PdfDocument | None = None
         try:
-            with
-                document = await run_sync(pypdfium2.PdfDocument, str(input_file))
+            async with pdf_document(input_file) as document:
                 pages_content = []
                 page_errors = []
 
@@ -246,6 +359,8 @@ class PDFExtractor(Extractor):
                         text_page = page.get_textpage()
                         page_content = text_page.get_text_bounded()
                         pages_content.append(page_content)
+                        with pdf_resources_sync(text_page):
+                            pass
                     except Exception as e:  # noqa: PERF203, BLE001
                         page_errors.append({"page": i + 1, "error": str(e)})
                         pages_content.append(f"[Error extracting page {i + 1}]")
@@ -275,52 +390,67 @@ class PDFExtractor(Extractor):
                     error=e,
                 ),
             ) from e
-        finally:
-            if document:
-                with pypdfium_file_lock(input_file), contextlib.suppress(Exception):
-                    await run_sync(document.close)
 
     def _extract_pdf_searchable_text_sync(self, path: Path) -> str:
-        pdf = None
         try:
-            with
-                pdf = pypdfium2.PdfDocument(str(path))
+            with pdf_document_sync(path) as pdf:
                 pages_text = []
                 for page in pdf:
                     text_page = page.get_textpage()
                     text = text_page.get_text_bounded()
                     pages_text.append(text)
-                    text_page
-
+                    with pdf_resources_sync(text_page, page):
+                        pass
                 return "\n".join(pages_text)
         except Exception as e:
             raise ParsingError(f"Failed to extract PDF text: {e}") from e
-        finally:
-            if pdf:
-                with pypdfium_file_lock(path), contextlib.suppress(Exception):
-                    pdf.close()
 
     def _extract_pdf_with_ocr_sync(self, path: Path) -> str:
-
+        temp_files: list[Path] = []
         try:
-
-            with pypdfium_file_lock(path):
-                pdf = pypdfium2.PdfDocument(str(path))
+            with pdf_document_sync(path) as pdf:
                 for page in pdf:
-
+                    width, height = page.get_size()
+
+                    if self.config.auto_adjust_dpi:
+                        optimal_dpi = calculate_optimal_dpi(
+                            page_width=width,
+                            page_height=height,
+                            target_dpi=self.config.target_dpi,
+                            max_dimension=self.config.max_image_dimension,
+                            min_dpi=self.config.min_dpi,
+                            max_dpi=self.config.max_dpi,
+                        )
+                    else:
+                        optimal_dpi = self.config.target_dpi
+
+                    scale = optimal_dpi / PDF_POINTS_PER_INCH
+
+                    bitmap = page.render(scale=scale)
                     pil_image = bitmap.to_pil()
-                    images.append(pil_image)
-                    bitmap.close()
-                    page.close()
 
-
+                    fd, tmp = tempfile.mkstemp(suffix=".png")
+                    try:
+                        os.close(fd)
+                        tmp_path = Path(tmp)
+                        pil_image.save(tmp_path)
+                        temp_files.append(tmp_path)
+                    except Exception:
+                        with contextlib.suppress(OSError):
+                            os.close(fd)
+                        raise
+                    finally:
+                        with pdf_resources_sync(bitmap, page):
+                            pil_image.close()
+
+            return self._process_pdf_images_with_ocr([str(p) for p in temp_files])
 
         except Exception as e:
             raise ParsingError(f"Failed to OCR PDF: {e}") from e
         finally:
-
-            with
-
+            for p in temp_files:
+                with contextlib.suppress(OSError):
+                    p.unlink()
 
     def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
         backend = get_ocr_backend(self.config.ocr_backend)
```
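Every open/close pair around pypdfium2 handles is now funneled through `kreuzberg._utils._resource_managers`. The diff only shows the call sites; context managers consistent with them might look like this (sketches under those assumptions, not the shipped code):

```python
import contextlib
from pathlib import Path

import pypdfium2
from anyio import to_thread


@contextlib.contextmanager
def pdf_document_sync(path: Path):
    """Open a pypdfium2 document and guarantee it is closed."""
    pdf = pypdfium2.PdfDocument(str(path))
    try:
        yield pdf
    finally:
        with contextlib.suppress(Exception):
            pdf.close()


@contextlib.asynccontextmanager
async def pdf_document(path: Path):
    """Async variant: do the blocking open/close in a worker thread."""
    pdf = await to_thread.run_sync(pypdfium2.PdfDocument, str(path))
    try:
        yield pdf
    finally:
        with contextlib.suppress(Exception):
            await to_thread.run_sync(pdf.close)


@contextlib.contextmanager
def pdf_resources_sync(*resources):
    """Close page/bitmap/text-page handles on exit, ignoring close errors."""
    try:
        yield
    finally:
        for resource in resources:
            with contextlib.suppress(Exception):
                resource.close()
```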
```diff
@@ -348,35 +478,12 @@ class PDFExtractor(Extractor):
         return "\n\n".join(result.content for result in results)
 
     def _process_pdf_images_with_ocr_direct(self, images: list[Image]) -> str:
+        if not self.config.ocr_backend:
+            raise ValueError("OCR backend must be specified")
         backend = get_ocr_backend(self.config.ocr_backend)
+        config = self._prepare_ocr_config(self.config.ocr_backend)
 
-
-            case "tesseract":
-                config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
-                )
-                results = []
-                for image in images:
-                    result = backend.process_image_sync(image, **asdict(config))
-                    results.append(result)
-            case "paddleocr":
-                paddle_config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
-                )
-                results = []
-                for image in images:
-                    result = backend.process_image_sync(image, **asdict(paddle_config))
-                    results.append(result)
-            case "easyocr":
-                easy_config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
-                )
-                results = []
-                for image in images:
-                    result = backend.process_image_sync(image, **asdict(easy_config))
-                    results.append(result)
-            case _:
-                raise NotImplementedError(f"Direct image OCR not implemented for {self.config.ocr_backend}")
+        results = [backend.process_image_sync(image, **config) for image in images]
 
         return "\n\n".join(result.content for result in results)
 
```
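The per-backend match statement collapses into `_prepare_ocr_config`, presumably a base-class helper that picks the right config dataclass and returns its kwargs. The helper itself is not in this diff; a sketch of the logic it replaces (everything except the function name appears in the removed branches):

```python
from dataclasses import asdict

from kreuzberg._types import EasyOCRConfig, PaddleOCRConfig, TesseractConfig

_DEFAULT_CONFIGS = {
    "tesseract": TesseractConfig,
    "paddleocr": PaddleOCRConfig,
    "easyocr": EasyOCRConfig,
}


def prepare_ocr_config(backend: str, ocr_config: object | None) -> dict:
    """Return kwargs for backend.process_image_sync, defaulting per backend."""
    config_cls = _DEFAULT_CONFIGS[backend]
    config = ocr_config if isinstance(ocr_config, config_cls) else config_cls()
    return asdict(config)
```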
```diff
@@ -390,9 +497,11 @@ class PDFExtractor(Extractor):
         for password in passwords:
             try:
                 return parse(content, max_workers=1, password=password)
-            except
+            except (ValueError, TypeError, KeyError, RuntimeError) as e:  # noqa: PERF203
                 last_exception = e
                 continue
+            except OSError as e:
+                raise ParsingError(f"Failed to parse PDF: {e}") from e
 
         if last_exception:
             raise last_exception from None
@@ -411,7 +520,7 @@ class PDFExtractor(Extractor):
         for password in passwords:
             try:
                 return await extract_pdf_metadata(content, password=password)
-            except
+            except (ParsingError, ValueError, TypeError, OSError) as e:  # noqa: PERF203
                 last_exception = e
                 continue
 
@@ -429,7 +538,7 @@ class PDFExtractor(Extractor):
         for password in passwords:
             try:
                 return extract_pdf_metadata_sync(content, password=password)
-            except
+            except (ParsingError, ValueError, TypeError, OSError) as e:  # noqa: PERF203
                 last_exception = e
                 continue
 
```
kreuzberg/_extractors/_presentation.py
CHANGED
```diff
@@ -1,11 +1,12 @@
 from __future__ import annotations
 
+import logging
 import re
 from contextlib import suppress
 from html import escape
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar
 
 import pptx
 from anyio import Path as AsyncPath
@@ -13,8 +14,9 @@ from pptx.enum.shapes import MSO_SHAPE_TYPE
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, POWER_POINT_MIME_TYPE
-from kreuzberg._types import ExtractionResult
+from kreuzberg._types import ExtractedImage, ExtractionResult, ImageOCRResult
 from kreuzberg._utils._string import normalize_spaces
+from kreuzberg._utils._sync import run_maybe_async
 
 if TYPE_CHECKING:  # pragma: no cover
     from pptx.presentation import Presentation
```
```diff
@@ -23,23 +25,41 @@ if TYPE_CHECKING:  # pragma: no cover
 
 _NON_WORD_PATTERN = re.compile(r"\W")
 
+logger = logging.getLogger(__name__)
+
 
 class PresentationExtractor(Extractor):
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {POWER_POINT_MIME_TYPE}
 
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
-
+        result = self._extract_pptx(content)
+        if self.config.extract_images and self.config.ocr_extracted_images and result.images:
+            image_ocr_results = await self._process_images_with_ocr(result.images)
+            result.image_ocr_results = image_ocr_results
+        return result
 
     async def extract_path_async(self, path: Path) -> ExtractionResult:
         content = await AsyncPath(path).read_bytes()
-
+        result = self._extract_pptx(content)
+        if self.config.extract_images and self.config.ocr_extracted_images and result.images:
+            image_ocr_results = await self._process_images_with_ocr(result.images)
+            result.image_ocr_results = image_ocr_results
+        return result
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-
+        result = self._extract_pptx(content)
+        if self.config.extract_images and self.config.ocr_extracted_images and result.images:
+            image_ocr_results: list[ImageOCRResult] = run_maybe_async(self._process_images_with_ocr, result.images)
+            result.image_ocr_results = image_ocr_results
+        return result
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
         content = Path(path).read_bytes()
-
+        result = self._extract_pptx(content)
+        if self.config.extract_images and self.config.ocr_extracted_images and result.images:
+            image_ocr_results: list[ImageOCRResult] = run_maybe_async(self._process_images_with_ocr, result.images)
+            result.image_ocr_results = image_ocr_results
+        return result
 
     def _extract_pptx(self, file_contents: bytes) -> ExtractionResult:
         md_content = ""
```
```diff
@@ -63,8 +83,10 @@ class PresentationExtractor(Extractor):
                 with suppress(AttributeError):
                     alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")  # noqa: SLF001
 
-
-
+                name_val = shape.name if isinstance(getattr(shape, "name", None), str) else "image"
+                filename = _NON_WORD_PATTERN.sub("", name_val) + ".jpg"
+                label = alt_text if alt_text else name_val
+                md_content += f"![{label}]({filename})\n\n"
 
             elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
                 html_table = "<table>"
```
```diff
@@ -106,8 +128,54 @@ class PresentationExtractor(Extractor):
             chunks=[],
         )
 
+        if self.config.extract_images:
+            images = self._extract_images_from_pptx(presentation)
+            result.images = images
+
         return self._apply_quality_processing(result)
 
+    def _extract_images_from_pptx(self, presentation: Presentation) -> list[ExtractedImage]:
+        images: list[ExtractedImage] = []
+
+        for slide_num, slide in enumerate(presentation.slides, 1):
+            for shape in slide.shapes:
+                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
+                    try:
+                        image = shape.image
+                        if not image.blob or not isinstance(image.blob, bytes):
+                            continue
+                        filename = f"slide_{slide_num}_image_{len(images) + 1}.{image.ext}"
+
+                        images.append(
+                            ExtractedImage(data=image.blob, format=image.ext, filename=filename, page_number=slide_num)
+                        )
+                    except Exception as e:  # noqa: BLE001
+                        logger.warning("Failed to extract image from slide %s: %s", slide_num, e)
+                        continue
+
+                elif shape.shape_type == MSO_SHAPE_TYPE.GROUP:
+                    images.extend(self._extract_from_grouped_shapes(shape, slide_num, len(images)))
+
+        return images
+
+    def _extract_from_grouped_shapes(self, group_shape: Any, slide_num: int, image_count: int) -> list[ExtractedImage]:
+        images: list[ExtractedImage] = []
+        for shape in group_shape.shapes:
+            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
+                try:
+                    image = shape.image
+                    if not image.blob or not isinstance(image.blob, bytes):
+                        continue
+                    filename = f"slide_{slide_num}_group_image_{image_count + len(images) + 1}.{image.ext}"
+                    images.append(
+                        ExtractedImage(data=image.blob, format=image.ext, filename=filename, page_number=slide_num)
+                    )
+                except Exception as e:  # noqa: BLE001
+                    logger.warning("Failed to extract grouped image: %s", e)
+            elif shape.shape_type == MSO_SHAPE_TYPE.GROUP:
+                images.extend(self._extract_from_grouped_shapes(shape, slide_num, image_count + len(images)))
+        return images
+
     @staticmethod
     def _extract_presentation_metadata(presentation: Presentation) -> Metadata:
         metadata: Metadata = {}
```
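Taken together, the two extractors mean image extraction and image OCR can now be switched on per call. A usage sketch, assuming kreuzberg's public `extract_file_sync`/`ExtractionConfig` API with the option names this diff reads from `self.config`:

```python
from kreuzberg import ExtractionConfig, extract_file_sync

# extract_images collects embedded images; ocr_extracted_images runs the
# configured OCR backend over each of them.
config = ExtractionConfig(extract_images=True, ocr_extracted_images=True)
result = extract_file_sync("slides.pptx", config=config)

print(f"{len(result.images)} images extracted")
for image in result.images:
    print(image.filename, image.format, image.page_number)
```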
|