kreuzberg 3.2.0__py3-none-any.whl → 3.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. kreuzberg/__init__.py +3 -0
  2. kreuzberg/__main__.py +8 -0
  3. kreuzberg/_api/__init__.py +0 -0
  4. kreuzberg/_api/main.py +87 -0
  5. kreuzberg/_cli_config.py +175 -0
  6. kreuzberg/_extractors/_image.py +39 -4
  7. kreuzberg/_extractors/_pandoc.py +158 -18
  8. kreuzberg/_extractors/_pdf.py +199 -19
  9. kreuzberg/_extractors/_presentation.py +1 -1
  10. kreuzberg/_extractors/_spread_sheet.py +65 -7
  11. kreuzberg/_gmft.py +222 -16
  12. kreuzberg/_mime_types.py +62 -16
  13. kreuzberg/_multiprocessing/__init__.py +6 -0
  14. kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
  15. kreuzberg/_multiprocessing/process_manager.py +188 -0
  16. kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
  17. kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
  18. kreuzberg/_ocr/_easyocr.py +6 -12
  19. kreuzberg/_ocr/_paddleocr.py +15 -13
  20. kreuzberg/_ocr/_tesseract.py +136 -46
  21. kreuzberg/_playa.py +43 -0
  22. kreuzberg/_types.py +4 -0
  23. kreuzberg/_utils/_cache.py +372 -0
  24. kreuzberg/_utils/_device.py +10 -27
  25. kreuzberg/_utils/_document_cache.py +220 -0
  26. kreuzberg/_utils/_errors.py +232 -0
  27. kreuzberg/_utils/_pdf_lock.py +72 -0
  28. kreuzberg/_utils/_process_pool.py +100 -0
  29. kreuzberg/_utils/_serialization.py +82 -0
  30. kreuzberg/_utils/_string.py +1 -1
  31. kreuzberg/_utils/_sync.py +21 -0
  32. kreuzberg/cli.py +338 -0
  33. kreuzberg/extraction.py +247 -36
  34. kreuzberg-3.4.0.dist-info/METADATA +290 -0
  35. kreuzberg-3.4.0.dist-info/RECORD +50 -0
  36. {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/WHEEL +1 -2
  37. kreuzberg-3.4.0.dist-info/entry_points.txt +2 -0
  38. kreuzberg-3.2.0.dist-info/METADATA +0 -166
  39. kreuzberg-3.2.0.dist-info/RECORD +0 -34
  40. kreuzberg-3.2.0.dist-info/top_level.txt +0 -1
  41. {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -57,7 +57,7 @@ EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
57
57
  "hr",
58
58
  "hu",
59
59
  "id",
60
- "inh", # codespell:ignore
60
+ "inh",
61
61
  "is",
62
62
  "it",
63
63
  "ja",
@@ -97,7 +97,7 @@ EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
97
97
  "sw",
98
98
  "ta",
99
99
  "tab",
100
- "te", # codespell:ignore
100
+ "te",
101
101
  "th",
102
102
  "tjk",
103
103
  "tl",
@@ -261,11 +261,12 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
261
261
  content=normalize_spaces(text_content), mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[]
262
262
  )
263
263
 
264
+ # Group text boxes by lines based on Y coordinate # ~keep
264
265
  sorted_results = sorted(result, key=lambda x: x[0][0][1] + x[0][2][1])
265
266
  line_groups: list[list[Any]] = []
266
267
  current_line: list[Any] = []
267
268
  prev_y_center: float | None = None
268
- line_height_threshold = 20
269
+ line_height_threshold = 20 # Minimum distance to consider as new line # ~keep
269
270
 
270
271
  for item in sorted_results:
271
272
  box, text, confidence = item
@@ -288,7 +289,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
288
289
  confidence_count = 0
289
290
 
290
291
  for line in line_groups:
291
- line_sorted = sorted(line, key=lambda x: x[0][0][0])
292
+ line_sorted = sorted(line, key=lambda x: x[0][0][0]) # Sort boxes by X coordinate within line # ~keep
292
293
 
293
294
  for item in line_sorted:
294
295
  _, text, confidence = item
@@ -345,7 +346,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
345
346
 
346
347
  languages = cls._validate_language_code(kwargs.pop("language", "en"))
347
348
 
348
- # Handle device selection with backward compatibility
349
349
  device_info = cls._resolve_device_config(**kwargs)
350
350
  use_gpu = device_info.device_type in ("cuda", "mps")
351
351
 
@@ -377,13 +377,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
377
377
  Raises:
378
378
  ValidationError: If requested device is not available and fallback is disabled.
379
379
  """
380
- # Handle deprecated use_gpu parameter
381
380
  use_gpu = kwargs.get("use_gpu", False)
382
381
  device = kwargs.get("device", "auto")
383
382
  memory_limit = kwargs.get("gpu_memory_limit")
384
383
  fallback_to_cpu = kwargs.get("fallback_to_cpu", True)
385
384
 
386
- # Check for deprecated parameter usage
387
385
  if use_gpu and device == "auto":
388
386
  warnings.warn(
389
387
  "The 'use_gpu' parameter is deprecated and will be removed in a future version. "
@@ -391,7 +389,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
391
389
  DeprecationWarning,
392
390
  stacklevel=4,
393
391
  )
394
- # Convert deprecated use_gpu=True to device="auto"
392
+
395
393
  device = "auto" if use_gpu else "cpu"
396
394
  elif use_gpu and device != "auto":
397
395
  warnings.warn(
@@ -401,7 +399,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
401
399
  stacklevel=4,
402
400
  )
403
401
 
404
- # Validate and get device info
405
402
  try:
406
403
  return validate_device_request(
407
404
  device,
@@ -410,7 +407,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
410
407
  fallback_to_cpu=fallback_to_cpu,
411
408
  )
412
409
  except ValidationError:
413
- # If device validation fails and we're using deprecated use_gpu=False, fallback to CPU
414
410
  if not use_gpu and device == "cpu":
415
411
  return DeviceInfo(device_type="cpu", name="CPU")
416
412
  raise
@@ -429,10 +425,8 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
429
425
  A list with the normalized language codes.
430
426
  """
431
427
  if isinstance(language_codes, str):
432
- # Handle comma-separated language codes
433
428
  languages = [lang.strip().lower() for lang in language_codes.split(",")]
434
429
  else:
435
- # Handle list of language codes
436
430
  languages = [lang.lower() for lang in language_codes]
437
431
 
438
432
  unsupported_langs = [lang for lang in languages if lang not in EASYOCR_SUPPORTED_LANGUAGE_CODES]
@@ -125,6 +125,10 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
125
125
  import numpy as np
126
126
 
127
127
  await self._init_paddle_ocr(**kwargs)
128
+
129
+ if image.mode != "RGB":
130
+ image = image.convert("RGB")
131
+
128
132
  image_np = np.array(image)
129
133
  try:
130
134
  result = await run_sync(self._paddle_ocr.ocr, image_np, cls=kwargs.get("use_angle_cls", True))
@@ -153,7 +157,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
153
157
  raise OCRError(f"Failed to load or process image using PaddleOCR: {e}") from e
154
158
 
155
159
  @staticmethod
156
- def _process_paddle_result(result: list[Any], image: Image.Image) -> ExtractionResult:
160
+ def _process_paddle_result(result: list[Any] | Any, image: Image.Image) -> ExtractionResult:
157
161
  """Process PaddleOCR result into an ExtractionResult with metadata.
158
162
 
159
163
  Args:
@@ -171,6 +175,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
171
175
  if not page_result:
172
176
  continue
173
177
 
178
+ # Group text boxes by lines based on Y coordinate # ~keep
174
179
  sorted_boxes = sorted(page_result, key=lambda x: x[0][0][1])
175
180
  line_groups: list[list[Any]] = []
176
181
  current_line: list[Any] = []
@@ -179,7 +184,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
179
184
  for box in sorted_boxes:
180
185
  box_points, (_, _) = box
181
186
  current_y = sum(point[1] for point in box_points) / 4
182
- min_box_distance = 20
187
+ min_box_distance = 20 # Minimum distance to consider as new line # ~keep
183
188
 
184
189
  if prev_y is None or abs(current_y - prev_y) > min_box_distance:
185
190
  if current_line:
@@ -194,7 +199,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
194
199
  line_groups.append(current_line)
195
200
 
196
201
  for line in line_groups:
197
- line_sorted = sorted(line, key=lambda x: x[0][0][0])
202
+ line_sorted = sorted(line, key=lambda x: x[0][0][0]) # Sort boxes by X coordinate within line # ~keep
198
203
 
199
204
  for box in line_sorted:
200
205
  _, (text, confidence) = box
@@ -205,7 +210,11 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
205
210
 
206
211
  text_content += "\n"
207
212
 
208
- width, height = image.size
213
+ if hasattr(image, "width") and hasattr(image, "height"):
214
+ width = image.width
215
+ height = image.height
216
+ else:
217
+ width, height = image.size
209
218
  metadata = Metadata(
210
219
  width=width,
211
220
  height=height,
@@ -257,7 +266,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
257
266
 
258
267
  language = cls._validate_language_code(kwargs.pop("language", "en"))
259
268
 
260
- # Handle device selection with backward compatibility
261
269
  device_info = cls._resolve_device_config(**kwargs)
262
270
  use_gpu = device_info.device_type == "cuda"
263
271
 
@@ -269,9 +277,8 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
269
277
  kwargs.setdefault("det_db_box_thresh", 0.5)
270
278
  kwargs.setdefault("det_db_unclip_ratio", 1.6)
271
279
 
272
- # Set GPU memory limit if specified
273
280
  if device_info.device_type == "cuda" and kwargs.get("gpu_memory_limit"):
274
- kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024) # Convert GB to MB
281
+ kwargs["gpu_mem"] = int(kwargs["gpu_memory_limit"] * 1024)
275
282
 
276
283
  try:
277
284
  cls._paddle_ocr = await run_sync(PaddleOCR, lang=language, show_log=False, **kwargs)
@@ -291,13 +298,11 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
291
298
  Raises:
292
299
  ValidationError: If requested device is not available and fallback is disabled.
293
300
  """
294
- # Handle deprecated use_gpu parameter
295
301
  use_gpu = kwargs.get("use_gpu", False)
296
302
  device = kwargs.get("device", "auto")
297
303
  memory_limit = kwargs.get("gpu_memory_limit")
298
304
  fallback_to_cpu = kwargs.get("fallback_to_cpu", True)
299
305
 
300
- # Check for deprecated parameter usage
301
306
  if use_gpu and device == "auto":
302
307
  warnings.warn(
303
308
  "The 'use_gpu' parameter is deprecated and will be removed in a future version. "
@@ -305,7 +310,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
305
310
  DeprecationWarning,
306
311
  stacklevel=4,
307
312
  )
308
- # Convert deprecated use_gpu=True to device="auto"
313
+
309
314
  device = "auto" if use_gpu else "cpu"
310
315
  elif use_gpu and device != "auto":
311
316
  warnings.warn(
@@ -315,7 +320,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
315
320
  stacklevel=4,
316
321
  )
317
322
 
318
- # PaddlePaddle doesn't support MPS, so warn if requested
319
323
  if device == "mps":
320
324
  warnings.warn(
321
325
  "PaddlePaddle does not support MPS (Apple Silicon) acceleration. Falling back to CPU.",
@@ -324,7 +328,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
324
328
  )
325
329
  device = "cpu"
326
330
 
327
- # Validate and get device info
328
331
  try:
329
332
  return validate_device_request(
330
333
  device,
@@ -333,7 +336,6 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
333
336
  fallback_to_cpu=fallback_to_cpu,
334
337
  )
335
338
  except ValidationError:
336
- # If device validation fails and we're using deprecated use_gpu=False, fallback to CPU
337
339
  if not use_gpu and device == "cpu":
338
340
  return DeviceInfo(device_type="cpu", name="CPU")
339
341
  raise
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import hashlib
3
4
  import re
4
5
  import sys
5
6
  from dataclasses import dataclass
@@ -144,7 +145,7 @@ TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
144
145
  "tel",
145
146
  "tgk",
146
147
  "tgl",
147
- "tha", # codespell:ignore
148
+ "tha",
148
149
  "tir",
149
150
  "ton",
150
151
  "tur",
@@ -153,7 +154,7 @@ TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
153
154
  "urd",
154
155
  "uzb",
155
156
  "uzb_cyrl",
156
- "vie", # codespell:ignore
157
+ "vie",
157
158
  "yid",
158
159
  "yor",
159
160
  }
@@ -227,62 +228,151 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
227
228
  image: Image,
228
229
  **kwargs: Unpack[TesseractConfig],
229
230
  ) -> ExtractionResult:
230
- await self._validate_tesseract_version()
231
- image_path, unlink = await create_temp_file(".png")
232
- await run_sync(image.save, str(image_path), format="PNG")
231
+ import io
232
+
233
+ from kreuzberg._utils._cache import get_ocr_cache
234
+
235
+ image_buffer = io.BytesIO()
236
+ await run_sync(image.save, image_buffer, format="PNG")
237
+ image_content = image_buffer.getvalue()
238
+
239
+ cache_kwargs = {
240
+ "image_hash": hashlib.sha256(image_content).hexdigest()[:16],
241
+ "ocr_backend": "tesseract",
242
+ "ocr_config": str(sorted(kwargs.items())),
243
+ }
244
+
245
+ ocr_cache = get_ocr_cache()
246
+ cached_result = await ocr_cache.aget(**cache_kwargs)
247
+ if cached_result is not None:
248
+ return cached_result
249
+
250
+ if ocr_cache.is_processing(**cache_kwargs):
251
+ import anyio
252
+
253
+ event = ocr_cache.mark_processing(**cache_kwargs)
254
+ await anyio.to_thread.run_sync(event.wait)
255
+
256
+ # Try cache again after waiting for other process to complete # ~keep
257
+ cached_result = await ocr_cache.aget(**cache_kwargs)
258
+ if cached_result is not None:
259
+ return cached_result
260
+
261
+ ocr_cache.mark_processing(**cache_kwargs)
262
+
233
263
  try:
234
- return await self.process_file(image_path, **kwargs)
264
+ await self._validate_tesseract_version()
265
+ image_path, unlink = await create_temp_file(".png")
266
+ await run_sync(image.save, str(image_path), format="PNG")
267
+ try:
268
+ result = await self.process_file(image_path, **kwargs)
269
+
270
+ await ocr_cache.aset(result, **cache_kwargs)
271
+
272
+ return result
273
+ finally:
274
+ await unlink()
235
275
  finally:
236
- await unlink()
276
+ ocr_cache.mark_complete(**cache_kwargs)
237
277
 
238
278
  async def process_file(
239
279
  self,
240
280
  path: Path,
241
281
  **kwargs: Unpack[TesseractConfig],
242
282
  ) -> ExtractionResult:
243
- await self._validate_tesseract_version()
244
- output_path, unlink = await create_temp_file(".txt")
245
- language = self._validate_language_code(kwargs.pop("language", "eng"))
246
- psm = kwargs.pop("psm", PSMMode.AUTO)
283
+ from kreuzberg._utils._cache import get_ocr_cache
284
+
247
285
  try:
248
- output_base = str(output_path).replace(".txt", "")
249
- command = [
250
- "tesseract",
251
- str(path),
252
- output_base,
253
- "-l",
254
- language,
255
- "--psm",
256
- str(psm.value),
257
- "--oem",
258
- "1",
259
- "--loglevel",
260
- "OFF",
261
- ]
262
- for kwarg, value in kwargs.items():
263
- command.extend(["-c", f"{kwarg}={1 if value else 0}"])
264
-
265
- env: dict[str, Any] | None = None
266
- if sys.platform.startswith("linux"):
267
- # we have to prevent multithreading this way otherwise we will get deadlocks
268
- env = {"OMP_THREAD_LIMIT": "1"}
269
-
270
- result = await run_process(command, env=env)
271
-
272
- if not result.returncode == 0:
273
- raise OCRError(
274
- "OCR failed with a non-0 return code.",
275
- context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
286
+ stat = path.stat()
287
+ file_info = {
288
+ "path": str(path.resolve()),
289
+ "size": stat.st_size,
290
+ "mtime": stat.st_mtime,
291
+ }
292
+ except OSError:
293
+ file_info = {
294
+ "path": str(path),
295
+ "size": 0,
296
+ "mtime": 0,
297
+ }
298
+
299
+ cache_kwargs = {
300
+ "file_info": str(sorted(file_info.items())),
301
+ "ocr_backend": "tesseract",
302
+ "ocr_config": str(sorted(kwargs.items())),
303
+ }
304
+
305
+ ocr_cache = get_ocr_cache()
306
+ cached_result = await ocr_cache.aget(**cache_kwargs)
307
+ if cached_result is not None:
308
+ return cached_result
309
+
310
+ if ocr_cache.is_processing(**cache_kwargs):
311
+ import anyio
312
+
313
+ event = ocr_cache.mark_processing(**cache_kwargs)
314
+ await anyio.to_thread.run_sync(event.wait)
315
+
316
+ # Try cache again after waiting for other process to complete # ~keep
317
+ cached_result = await ocr_cache.aget(**cache_kwargs)
318
+ if cached_result is not None:
319
+ return cached_result
320
+
321
+ ocr_cache.mark_processing(**cache_kwargs)
322
+
323
+ try:
324
+ await self._validate_tesseract_version()
325
+ output_path, unlink = await create_temp_file(".txt")
326
+ language = self._validate_language_code(kwargs.pop("language", "eng"))
327
+ psm = kwargs.pop("psm", PSMMode.AUTO)
328
+ try:
329
+ output_base = str(output_path).replace(".txt", "")
330
+ command = [
331
+ "tesseract",
332
+ str(path),
333
+ output_base,
334
+ "-l",
335
+ language,
336
+ "--psm",
337
+ str(psm.value),
338
+ "--oem",
339
+ "1",
340
+ "--loglevel",
341
+ "OFF",
342
+ ]
343
+ for kwarg, value in kwargs.items():
344
+ command.extend(["-c", f"{kwarg}={1 if value else 0}"])
345
+
346
+ env: dict[str, Any] | None = None
347
+ if sys.platform.startswith("linux"):
348
+ env = {"OMP_THREAD_LIMIT": "1"}
349
+
350
+ result = await run_process(command, env=env)
351
+
352
+ if not result.returncode == 0:
353
+ raise OCRError(
354
+ "OCR failed with a non-0 return code.",
355
+ context={
356
+ "error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr
357
+ },
358
+ )
359
+
360
+ output = await AsyncPath(output_path).read_text("utf-8")
361
+ extraction_result = ExtractionResult(
362
+ content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
276
363
  )
277
364
 
278
- output = await AsyncPath(output_path).read_text("utf-8")
279
- return ExtractionResult(
280
- content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
281
- )
282
- except (RuntimeError, OSError) as e:
283
- raise OCRError(f"Failed to OCR using tesseract: {e}") from e
365
+ final_cache_kwargs = cache_kwargs.copy()
366
+ final_cache_kwargs["ocr_config"] = str(sorted({**kwargs, "language": language, "psm": psm}.items()))
367
+ await ocr_cache.aset(extraction_result, **final_cache_kwargs)
368
+
369
+ return extraction_result
370
+ except (RuntimeError, OSError) as e:
371
+ raise OCRError(f"Failed to OCR using tesseract: {e}") from e
372
+ finally:
373
+ await unlink()
284
374
  finally:
285
- await unlink()
375
+ ocr_cache.mark_complete(**cache_kwargs)
286
376
 
287
377
  @classmethod
288
378
  async def _validate_tesseract_version(cls) -> None:
kreuzberg/_playa.py CHANGED
@@ -274,3 +274,46 @@ def _extract_structure_information(document: Document, result: Metadata) -> None
274
274
 
275
275
  if subtitle and "title" in result and subtitle != result["title"]:
276
276
  result["subtitle"] = subtitle
277
+
278
+
279
+ def extract_pdf_metadata_sync(pdf_content: bytes) -> Metadata:
280
+ """Synchronous version of extract_pdf_metadata.
281
+
282
+ Extract metadata from a PDF document without using async/await.
283
+
284
+ Args:
285
+ pdf_content: The bytes of the PDF document.
286
+
287
+ Raises:
288
+ ParsingError: If the PDF metadata could not be extracted.
289
+
290
+ Returns:
291
+ A dictionary of metadata extracted from the PDF.
292
+ """
293
+ try:
294
+ document = parse(pdf_content, max_workers=1)
295
+ metadata: Metadata = {}
296
+
297
+ for raw_info in document.info:
298
+ pdf_info = {k.lower(): v for k, v in asobj(raw_info).items()}
299
+ _extract_basic_metadata(pdf_info, metadata)
300
+ _extract_author_metadata(pdf_info, metadata)
301
+ _extract_keyword_metadata(pdf_info, metadata)
302
+ _extract_category_metadata(pdf_info, metadata)
303
+ _extract_date_metadata(pdf_info, metadata)
304
+ _extract_creator_metadata(pdf_info, metadata)
305
+
306
+ if document.pages:
307
+ _extract_document_dimensions(document, metadata)
308
+
309
+ if document.outline and "description" not in metadata:
310
+ metadata["description"] = _generate_outline_description(document)
311
+
312
+ if "summary" not in metadata:
313
+ metadata["summary"] = _generate_document_summary(document)
314
+
315
+ _extract_structure_information(document, metadata)
316
+
317
+ return metadata
318
+ except Exception as e:
319
+ raise ParsingError(f"Failed to extract PDF metadata: {e!s}") from e
kreuzberg/_types.py CHANGED
@@ -114,6 +114,10 @@ class ExtractionResult:
114
114
  chunks: list[str] = field(default_factory=list)
115
115
  """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
116
116
 
117
+ def to_dict(self) -> dict[str, Any]:
118
+ """Converts the ExtractionResult to a dictionary."""
119
+ return asdict(self)
120
+
117
121
 
118
122
  PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
119
123
  ValidationHook = Callable[[ExtractionResult], Union[None, Awaitable[None]]]