kreuzberg 3.2.0__py3-none-any.whl → 3.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +3 -0
- kreuzberg/__main__.py +8 -0
- kreuzberg/_api/__init__.py +0 -0
- kreuzberg/_api/main.py +87 -0
- kreuzberg/_cli_config.py +175 -0
- kreuzberg/_extractors/_image.py +39 -4
- kreuzberg/_extractors/_pandoc.py +158 -18
- kreuzberg/_extractors/_pdf.py +199 -19
- kreuzberg/_extractors/_presentation.py +1 -1
- kreuzberg/_extractors/_spread_sheet.py +65 -7
- kreuzberg/_gmft.py +222 -16
- kreuzberg/_mime_types.py +62 -16
- kreuzberg/_multiprocessing/__init__.py +6 -0
- kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
- kreuzberg/_multiprocessing/process_manager.py +188 -0
- kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
- kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
- kreuzberg/_ocr/_easyocr.py +6 -12
- kreuzberg/_ocr/_paddleocr.py +15 -13
- kreuzberg/_ocr/_tesseract.py +136 -46
- kreuzberg/_playa.py +43 -0
- kreuzberg/_types.py +4 -0
- kreuzberg/_utils/_cache.py +372 -0
- kreuzberg/_utils/_device.py +10 -27
- kreuzberg/_utils/_document_cache.py +220 -0
- kreuzberg/_utils/_errors.py +232 -0
- kreuzberg/_utils/_pdf_lock.py +72 -0
- kreuzberg/_utils/_process_pool.py +100 -0
- kreuzberg/_utils/_serialization.py +82 -0
- kreuzberg/_utils/_string.py +1 -1
- kreuzberg/_utils/_sync.py +21 -0
- kreuzberg/cli.py +338 -0
- kreuzberg/extraction.py +247 -36
- kreuzberg-3.4.0.dist-info/METADATA +290 -0
- kreuzberg-3.4.0.dist-info/RECORD +50 -0
- {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/WHEEL +1 -2
- kreuzberg-3.4.0.dist-info/entry_points.txt +2 -0
- kreuzberg-3.2.0.dist-info/METADATA +0 -166
- kreuzberg-3.2.0.dist-info/RECORD +0 -34
- kreuzberg-3.2.0.dist-info/top_level.txt +0 -1
- {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_easyocr.py
CHANGED
@@ -57,7 +57,7 @@ EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
|
|
57
57
|
"hr",
|
58
58
|
"hu",
|
59
59
|
"id",
|
60
|
-
"inh",
|
60
|
+
"inh",
|
61
61
|
"is",
|
62
62
|
"it",
|
63
63
|
"ja",
|
@@ -97,7 +97,7 @@ EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
|
|
97
97
|
"sw",
|
98
98
|
"ta",
|
99
99
|
"tab",
|
100
|
-
"te",
|
100
|
+
"te",
|
101
101
|
"th",
|
102
102
|
"tjk",
|
103
103
|
"tl",
|
@@ -261,11 +261,12 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
261
261
|
content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
|
262
262
|
)
|
263
263
|
|
264
|
+
# Group text boxes by lines based on Y coordinate # ~keep
|
264
265
|
sorted_results = sorted(result, key=lambda x: x[0][0][1] + x[0][2][1])
|
265
266
|
line_groups: list[list[Any]] = []
|
266
267
|
current_line: list[Any] = []
|
267
268
|
prev_y_center: float | None = None
|
268
|
-
line_height_threshold = 20
|
269
|
+
line_height_threshold = 20 # Minimum distance to consider as new line # ~keep
|
269
270
|
|
270
271
|
for item in sorted_results:
|
271
272
|
box, text, confidence = item
|
@@ -288,7 +289,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
288
289
|
confidence_count = 0
|
289
290
|
|
290
291
|
for line in line_groups:
|
291
|
-
line_sorted = sorted(line, key=lambda x: x[0][0][0])
|
292
|
+
line_sorted = sorted(line, key=lambda x: x[0][0][0]) # Sort boxes by X coordinate within line # ~keep
|
292
293
|
|
293
294
|
for item in line_sorted:
|
294
295
|
_, text, confidence = item
|
@@ -345,7 +346,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
345
346
|
|
346
347
|
languages = cls._validate_language_code(kwargs.pop("language", "en"))
|
347
348
|
|
348
|
-
# Handle device selection with backward compatibility
|
349
349
|
device_info = cls._resolve_device_config(**kwargs)
|
350
350
|
use_gpu = device_info.device_type in ("cuda", "mps")
|
351
351
|
|
@@ -377,13 +377,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
377
377
|
Raises:
|
378
378
|
ValidationError: If requested device is not available and fallback is disabled.
|
379
379
|
"""
|
380
|
-
# Handle deprecated use_gpu parameter
|
381
380
|
use_gpu = kwargs.get("use_gpu", False)
|
382
381
|
device = kwargs.get("device", "auto")
|
383
382
|
memory_limit = kwargs.get("gpu_memory_limit")
|
384
383
|
fallback_to_cpu = kwargs.get("fallback_to_cpu", True)
|
385
384
|
|
386
|
-
# Check for deprecated parameter usage
|
387
385
|
if use_gpu and device == "auto":
|
388
386
|
warnings.warn(
|
389
387
|
"The 'use_gpu' parameter is deprecated and will be removed in a future version. "
|
@@ -391,7 +389,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
391
389
|
DeprecationWarning,
|
392
390
|
stacklevel=4,
|
393
391
|
)
|
394
|
-
|
392
|
+
|
395
393
|
device = "auto" if use_gpu else "cpu"
|
396
394
|
elif use_gpu and device != "auto":
|
397
395
|
warnings.warn(
|
@@ -401,7 +399,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
401
399
|
stacklevel=4,
|
402
400
|
)
|
403
401
|
|
404
|
-
# Validate and get device info
|
405
402
|
try:
|
406
403
|
return validate_device_request(
|
407
404
|
device,
|
@@ -410,7 +407,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
410
407
|
fallback_to_cpu=fallback_to_cpu,
|
411
408
|
)
|
412
409
|
except ValidationError:
|
413
|
-
# If device validation fails and we're using deprecated use_gpu=False, fallback to CPU
|
414
410
|
if not use_gpu and device == "cpu":
|
415
411
|
return DeviceInfo(device_type="cpu", name="CPU")
|
416
412
|
raise
|
@@ -429,10 +425,8 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
429
425
|
A list with the normalized language codes.
|
430
426
|
"""
|
431
427
|
if isinstance(language_codes, str):
|
432
|
-
# Handle comma-separated language codes
|
433
428
|
languages = [lang.strip().lower() for lang in language_codes.split(",")]
|
434
429
|
else:
|
435
|
-
# Handle list of language codes
|
436
430
|
languages = [lang.lower() for lang in language_codes]
|
437
431
|
|
438
432
|
unsupported_langs = [lang for lang in languages if lang not in EASYOCR_SUPPORTED_LANGUAGE_CODES]
|
kreuzberg/_ocr/_paddleocr.py
CHANGED
@@ -125,6 +125,10 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
125
125
|
import numpy as np
|
126
126
|
|
127
127
|
await self._init_paddle_ocr(**kwargs)
|
128
|
+
|
129
|
+
if image.mode != "RGB":
|
130
|
+
image = image.convert("RGB")
|
131
|
+
|
128
132
|
image_np = np.array(image)
|
129
133
|
try:
|
130
134
|
result = await run_sync(self._paddle_ocr.ocr, image_np, cls=kwargs.get("use_angle_cls", True))
|
@@ -153,7 +157,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
153
157
|
raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
|
154
158
|
|
155
159
|
@staticmethod
|
156
|
-
def _process_paddle_result(result: list[Any], image: Image.Image) -> ExtractionResult:
|
160
|
+
def _process_paddle_result(result: list[Any] | Any, image: Image.Image) -> ExtractionResult:
|
157
161
|
"""Process PaddleOCR result into an ExtractionResult with metadata.
|
158
162
|
|
159
163
|
Args:
|
@@ -171,6 +175,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
171
175
|
if not page_result:
|
172
176
|
continue
|
173
177
|
|
178
|
+
# Group text boxes by lines based on Y coordinate # ~keep
|
174
179
|
sorted_boxes = sorted(page_result, key=lambda x: x[0][0][1])
|
175
180
|
line_groups: list[list[Any]] = []
|
176
181
|
current_line: list[Any] = []
|
@@ -179,7 +184,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
179
184
|
for box in sorted_boxes:
|
180
185
|
box_points, (_, _) = box
|
181
186
|
current_y = sum(point[1] for point in box_points) / 4
|
182
|
-
min_box_distance = 20
|
187
|
+
min_box_distance = 20 # Minimum distance to consider as new line # ~keep
|
183
188
|
|
184
189
|
if prev_y is None or abs(current_y - prev_y) > min_box_distance:
|
185
190
|
if current_line:
|
@@ -194,7 +199,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
194
199
|
line_groups.append(current_line)
|
195
200
|
|
196
201
|
for line in line_groups:
|
197
|
-
line_sorted = sorted(line, key=lambda x: x[0][0][0])
|
202
|
+
line_sorted = sorted(line, key=lambda x: x[0][0][0]) # Sort boxes by X coordinate within line # ~keep
|
198
203
|
|
199
204
|
for box in line_sorted:
|
200
205
|
_, (text, confidence) = box
|
@@ -205,7 +210,11 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
205
210
|
|
206
211
|
text_content += "\n"
|
207
212
|
|
208
|
-
|
213
|
+
if hasattr(image, "width") and hasattr(image, "height"):
|
214
|
+
width = image.width
|
215
|
+
height = image.height
|
216
|
+
else:
|
217
|
+
width, height = image.size
|
209
218
|
metadata = Metadata(
|
210
219
|
width=width,
|
211
220
|
height=height,
|
@@ -257,7 +266,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
257
266
|
|
258
267
|
language = cls._validate_language_code(kwargs.pop("language", "en"))
|
259
268
|
|
260
|
-
# Handle device selection with backward compatibility
|
261
269
|
device_info = cls._resolve_device_config(**kwargs)
|
262
270
|
use_gpu = device_info.device_type == "cuda"
|
263
271
|
|
@@ -269,9 +277,8 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
269
277
|
kwargs.setdefault("det_db_box_thresh", 0.5)
|
270
278
|
kwargs.setdefault("det_db_unclip_ratio", 1.6)
|
271
279
|
|
272
|
-
# Set GPU memory limit if specified
|
273
280
|
if device_info.device_type == "cuda" and kwargs.get("gpu_memory_limit"):
|
274
|
-
kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)
|
281
|
+
kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)
|
275
282
|
|
276
283
|
try:
|
277
284
|
cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, show_log=False, **kwargs)
|
@@ -291,13 +298,11 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
291
298
|
Raises:
|
292
299
|
ValidationError: If requested device is not available and fallback is disabled.
|
293
300
|
"""
|
294
|
-
# Handle deprecated use_gpu parameter
|
295
301
|
use_gpu = kwargs.get("use_gpu", False)
|
296
302
|
device = kwargs.get("device", "auto")
|
297
303
|
memory_limit = kwargs.get("gpu_memory_limit")
|
298
304
|
fallback_to_cpu = kwargs.get("fallback_to_cpu", True)
|
299
305
|
|
300
|
-
# Check for deprecated parameter usage
|
301
306
|
if use_gpu and device == "auto":
|
302
307
|
warnings.warn(
|
303
308
|
"The 'use_gpu' parameter is deprecated and will be removed in a future version. "
|
@@ -305,7 +310,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
305
310
|
DeprecationWarning,
|
306
311
|
stacklevel=4,
|
307
312
|
)
|
308
|
-
|
313
|
+
|
309
314
|
device = "auto" if use_gpu else "cpu"
|
310
315
|
elif use_gpu and device != "auto":
|
311
316
|
warnings.warn(
|
@@ -315,7 +320,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
315
320
|
stacklevel=4,
|
316
321
|
)
|
317
322
|
|
318
|
-
# PaddlePaddle doesn't support MPS, so warn if requested
|
319
323
|
if device == "mps":
|
320
324
|
warnings.warn(
|
321
325
|
"PaddlePaddle does not support MPS (Apple Silicon) acceleration. Falling back to CPU.",
|
@@ -324,7 +328,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
324
328
|
)
|
325
329
|
device = "cpu"
|
326
330
|
|
327
|
-
# Validate and get device info
|
328
331
|
try:
|
329
332
|
return validate_device_request(
|
330
333
|
device,
|
@@ -333,7 +336,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
333
336
|
fallback_to_cpu=fallback_to_cpu,
|
334
337
|
)
|
335
338
|
except ValidationError:
|
336
|
-
# If device validation fails and we're using deprecated use_gpu=False, fallback to CPU
|
337
339
|
if not use_gpu and device == "cpu":
|
338
340
|
return DeviceInfo(device_type="cpu", name="CPU")
|
339
341
|
raise
|
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import hashlib
|
3
4
|
import re
|
4
5
|
import sys
|
5
6
|
from dataclasses import dataclass
|
@@ -144,7 +145,7 @@ TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
|
|
144
145
|
"tel",
|
145
146
|
"tgk",
|
146
147
|
"tgl",
|
147
|
-
"tha",
|
148
|
+
"tha",
|
148
149
|
"tir",
|
149
150
|
"ton",
|
150
151
|
"tur",
|
@@ -153,7 +154,7 @@ TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
|
|
153
154
|
"urd",
|
154
155
|
"uzb",
|
155
156
|
"uzb_cyrl",
|
156
|
-
"vie",
|
157
|
+
"vie",
|
157
158
|
"yid",
|
158
159
|
"yor",
|
159
160
|
}
|
@@ -227,62 +228,151 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
227
228
|
image: Image,
|
228
229
|
**kwargs: Unpack[TesseractConfig],
|
229
230
|
) -> ExtractionResult:
|
230
|
-
|
231
|
-
|
232
|
-
|
231
|
+
import io
|
232
|
+
|
233
|
+
from kreuzberg._utils._cache import get_ocr_cache
|
234
|
+
|
235
|
+
image_buffer = io.BytesIO()
|
236
|
+
await run_sync(image.save, image_buffer, format="PNG")
|
237
|
+
image_content = image_buffer.getvalue()
|
238
|
+
|
239
|
+
cache_kwargs = {
|
240
|
+
"image_hash": hashlib.sha256(image_content).hexdigest()[:16],
|
241
|
+
"ocr_backend": "tesseract",
|
242
|
+
"ocr_config": str(sorted(kwargs.items())),
|
243
|
+
}
|
244
|
+
|
245
|
+
ocr_cache = get_ocr_cache()
|
246
|
+
cached_result = await ocr_cache.aget(**cache_kwargs)
|
247
|
+
if cached_result is not None:
|
248
|
+
return cached_result
|
249
|
+
|
250
|
+
if ocr_cache.is_processing(**cache_kwargs):
|
251
|
+
import anyio
|
252
|
+
|
253
|
+
event = ocr_cache.mark_processing(**cache_kwargs)
|
254
|
+
await anyio.to_thread.run_sync(event.wait)
|
255
|
+
|
256
|
+
# Try cache again after waiting for other process to complete # ~keep
|
257
|
+
cached_result = await ocr_cache.aget(**cache_kwargs)
|
258
|
+
if cached_result is not None:
|
259
|
+
return cached_result
|
260
|
+
|
261
|
+
ocr_cache.mark_processing(**cache_kwargs)
|
262
|
+
|
233
263
|
try:
|
234
|
-
|
264
|
+
await self._validate_tesseract_version()
|
265
|
+
image_path, unlink = await create_temp_file(".png")
|
266
|
+
await run_sync(image.save, str(image_path), format="PNG")
|
267
|
+
try:
|
268
|
+
result = await self.process_file(image_path, **kwargs)
|
269
|
+
|
270
|
+
await ocr_cache.aset(result, **cache_kwargs)
|
271
|
+
|
272
|
+
return result
|
273
|
+
finally:
|
274
|
+
await unlink()
|
235
275
|
finally:
|
236
|
-
|
276
|
+
ocr_cache.mark_complete(**cache_kwargs)
|
237
277
|
|
238
278
|
async def process_file(
    self,
    path: Path,
    **kwargs: Unpack[TesseractConfig],
) -> ExtractionResult:
    """Run the ``tesseract`` CLI on an image file, reusing a cached result when one exists.

    Args:
        path: Path of the image file passed to ``tesseract``.
        **kwargs: Tesseract configuration. ``language`` (default ``"eng"``) and
            ``psm`` (default ``PSMMode.AUTO``) are consumed here; every remaining
            entry is forwarded as a ``-c <name>=<0|1>`` flag.

    Returns:
        The extraction result holding the normalized plain-text OCR output.

    Raises:
        OCRError: If tesseract exits with a non-zero return code, or if a
            ``RuntimeError``/``OSError`` occurs while running it.
    """
    from kreuzberg._utils._cache import get_ocr_cache

    # The cache key includes path, size and mtime so a modified file is not
    # served a stale result. If the file cannot be stat'ed, fall back to a
    # key built from the raw path only.
    try:
        stat = path.stat()
        file_info = {
            "path": str(path.resolve()),
            "size": stat.st_size,
            "mtime": stat.st_mtime,
        }
    except OSError:
        file_info = {
            "path": str(path),
            "size": 0,
            "mtime": 0,
        }

    # Built BEFORE ``language``/``psm`` are popped from kwargs, so the key
    # reflects exactly what the caller passed.
    cache_kwargs = {
        "file_info": str(sorted(file_info.items())),
        "ocr_backend": "tesseract",
        "ocr_config": str(sorted(kwargs.items())),
    }

    ocr_cache = get_ocr_cache()
    cached_result = await ocr_cache.aget(**cache_kwargs)
    if cached_result is not None:
        return cached_result

    if ocr_cache.is_processing(**cache_kwargs):
        import anyio

        event = ocr_cache.mark_processing(**cache_kwargs)
        await anyio.to_thread.run_sync(event.wait)

        # Try cache again after waiting for other process to complete  # ~keep
        cached_result = await ocr_cache.aget(**cache_kwargs)
        if cached_result is not None:
            return cached_result

    ocr_cache.mark_processing(**cache_kwargs)

    try:
        await self._validate_tesseract_version()
        output_path, unlink = await create_temp_file(".txt")
        language = self._validate_language_code(kwargs.pop("language", "eng"))
        psm = kwargs.pop("psm", PSMMode.AUTO)
        try:
            # tesseract appends ".txt" to the output base itself, so strip only
            # the trailing suffix (a plain ``replace`` would also mangle any
            # ".txt" appearing earlier in the temp path).
            output_str = str(output_path)
            output_base = output_str[: -len(".txt")] if output_str.endswith(".txt") else output_str
            command = [
                "tesseract",
                str(path),
                output_base,
                "-l",
                language,
                "--psm",
                str(psm.value),
                "--oem",
                "1",
                "--loglevel",
                "OFF",
            ]
            # Remaining options are treated as boolean config variables.
            for kwarg, value in kwargs.items():
                command.extend(["-c", f"{kwarg}={1 if value else 0}"])

            env: dict[str, Any] | None = None
            if sys.platform.startswith("linux"):
                env = {"OMP_THREAD_LIMIT": "1"}

            result = await run_process(command, env=env)

            if result.returncode != 0:
                raise OCRError(
                    "OCR failed with a non-0 return code.",
                    context={
                        "error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr
                    },
                )

            output = await AsyncPath(output_path).read_text("utf-8")
            extraction_result = ExtractionResult(
                content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
            )

            # Store under the SAME key used for the lookup above. Writing under
            # a key with ``language``/``psm`` forced in would never be found by
            # a later call that relies on the defaults.
            await ocr_cache.aset(extraction_result, **cache_kwargs)

            return extraction_result
        except (RuntimeError, OSError) as e:
            raise OCRError(f"Failed to OCR using tesseract: {e}") from e
        finally:
            await unlink()
    finally:
        # Always release the in-progress marker, on success and on failure.
        ocr_cache.mark_complete(**cache_kwargs)
|
286
376
|
|
287
377
|
@classmethod
|
288
378
|
async def _validate_tesseract_version(cls) -> None:
|
kreuzberg/_playa.py
CHANGED
@@ -274,3 +274,46 @@ def _extract_structure_information(document: Document, result: Metadata) -> None
|
|
274
274
|
|
275
275
|
if subtitle and "title" in result and subtitle != result["title"]:
|
276
276
|
result["subtitle"] = subtitle
|
277
|
+
|
278
|
+
|
279
|
+
def extract_pdf_metadata_sync(pdf_content: bytes) -> Metadata:
    """Extract metadata from a PDF using only blocking calls (no async/await).

    Args:
        pdf_content: The bytes of the PDF document.

    Raises:
        ParsingError: If any step of parsing or metadata extraction fails.

    Returns:
        A dictionary of metadata extracted from the PDF.
    """
    try:
        pdf = parse(pdf_content, max_workers=1)
        collected: Metadata = {}

        # Applied in this order to every info dictionary of the document.
        info_extractors = (
            _extract_basic_metadata,
            _extract_author_metadata,
            _extract_keyword_metadata,
            _extract_category_metadata,
            _extract_date_metadata,
            _extract_creator_metadata,
        )

        for info_entry in pdf.info:
            # Keys are lower-cased before being handed to the extractors.
            lowered = {key.lower(): value for key, value in asobj(info_entry).items()}
            for extractor in info_extractors:
                extractor(lowered, collected)

        if pdf.pages:
            _extract_document_dimensions(pdf, collected)

        # Only derive a description from the outline when none was found above.
        if pdf.outline:
            if "description" not in collected:
                collected["description"] = _generate_outline_description(pdf)

        if "summary" not in collected:
            collected["summary"] = _generate_document_summary(pdf)

        _extract_structure_information(pdf, collected)

        return collected
    except Exception as e:
        raise ParsingError(f"Failed to extract PDF metadata: {e!s}") from e
|
kreuzberg/_types.py
CHANGED
@@ -114,6 +114,10 @@ class ExtractionResult:
|
|
114
114
|
chunks: list[str] = field(default_factory=list)
|
115
115
|
"""The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
|
116
116
|
|
117
|
+
def to_dict(self) -> dict[str, Any]:
    """Return this result's dataclass fields as a plain dictionary.

    Returns:
        The mapping produced by ``dataclasses.asdict`` for this instance.
    """
    as_mapping: dict[str, Any] = asdict(self)
    return as_mapping
|
120
|
+
|
117
121
|
|
118
122
|
PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
|
119
123
|
ValidationHook = Callable[[ExtractionResult], Union[None, Awaitable[None]]]
|