kreuzberg-3.8.0-py3-none-any.whl → kreuzberg-3.8.2-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (43)
  1. kreuzberg/__init__.py +4 -0
  2. kreuzberg/_api/main.py +22 -1
  3. kreuzberg/_config.py +404 -0
  4. kreuzberg/_entity_extraction.py +4 -5
  5. kreuzberg/_extractors/_base.py +3 -5
  6. kreuzberg/_extractors/_image.py +18 -32
  7. kreuzberg/_extractors/_pandoc.py +3 -14
  8. kreuzberg/_extractors/_pdf.py +39 -57
  9. kreuzberg/_extractors/_spread_sheet.py +2 -3
  10. kreuzberg/_extractors/_structured.py +10 -7
  11. kreuzberg/_gmft.py +314 -10
  12. kreuzberg/_language_detection.py +1 -1
  13. kreuzberg/_mcp/server.py +58 -8
  14. kreuzberg/_ocr/__init__.py +1 -22
  15. kreuzberg/_ocr/_base.py +59 -0
  16. kreuzberg/_ocr/_easyocr.py +92 -1
  17. kreuzberg/_ocr/_paddleocr.py +90 -1
  18. kreuzberg/_ocr/_tesseract.py +556 -5
  19. kreuzberg/_playa.py +2 -3
  20. kreuzberg/_types.py +46 -24
  21. kreuzberg/_utils/_cache.py +35 -4
  22. kreuzberg/_utils/_device.py +10 -20
  23. kreuzberg/_utils/_errors.py +44 -45
  24. kreuzberg/_utils/_process_pool.py +2 -6
  25. kreuzberg/_utils/_quality.py +7 -11
  26. kreuzberg/_utils/_serialization.py +21 -16
  27. kreuzberg/_utils/_string.py +22 -12
  28. kreuzberg/_utils/_table.py +3 -4
  29. kreuzberg/cli.py +4 -5
  30. kreuzberg/exceptions.py +10 -0
  31. kreuzberg/extraction.py +6 -24
  32. kreuzberg-3.8.2.dist-info/METADATA +265 -0
  33. kreuzberg-3.8.2.dist-info/RECORD +53 -0
  34. kreuzberg/_cli_config.py +0 -175
  35. kreuzberg/_multiprocessing/__init__.py +0 -5
  36. kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
  37. kreuzberg/_ocr/_pool.py +0 -357
  38. kreuzberg/_ocr/_sync.py +0 -566
  39. kreuzberg-3.8.0.dist-info/METADATA +0 -313
  40. kreuzberg-3.8.0.dist-info/RECORD +0 -57
  41. {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/WHEEL +0 -0
  42. {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/entry_points.txt +0 -0
  43. {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_pdf.py CHANGED
@@ -1,6 +1,8 @@
 from __future__ import annotations

 import contextlib
+import os
+import tempfile
 from multiprocessing import cpu_count
 from pathlib import Path
 from re import Pattern
@@ -15,8 +17,12 @@ from playa import parse
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr import get_ocr_backend
+from kreuzberg._ocr._easyocr import EasyOCRConfig
+from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+from kreuzberg._ocr._tesseract import TesseractConfig
 from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
 from kreuzberg._types import ExtractionResult, OcrBackendType
+from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._pdf_lock import pypdfium_file_lock
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup_batched
@@ -89,9 +95,6 @@ class PDFExtractor(Extractor):

     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         """Pure sync implementation of PDF extraction from bytes."""
-        import os
-        import tempfile
-
         fd, temp_path = tempfile.mkstemp(suffix=".pdf")
         try:
             with os.fdopen(fd, "wb") as f:
@@ -191,8 +194,6 @@ class PDFExtractor(Extractor):
         Returns:
             A list of Pillow Images.
         """
-        from kreuzberg._utils._errors import create_error_context, should_retry
-
         document: pypdfium2.PdfDocument | None = None
         last_error = None

@@ -247,9 +248,10 @@
             *[backend.process_image(image, **self.config.get_config_dict()) for image in images],
             batch_size=cpu_count(),
         )
-        return ExtractionResult(
-            content="\n".join([v.content for v in ocr_results]), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
-        )
+        # Use list comprehension and join for efficient string building
+        content = "\n".join(result.content for result in ocr_results)
+
+        return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])

     @staticmethod
     async def _extract_pdf_searchable_text(input_file: Path) -> str:
@@ -264,28 +266,28 @@
         Returns:
             The extracted text.
         """
-        from kreuzberg._utils._errors import create_error_context
-
         document: pypdfium2.PdfDocument | None = None
         try:
             with pypdfium_file_lock(input_file):
                 document = await run_sync(pypdfium2.PdfDocument, str(input_file))
-            text_parts = []
+            pages_content = []
             page_errors = []

             for i, page in enumerate(cast("pypdfium2.PdfDocument", document)):
                 try:
                     text_page = page.get_textpage()
-                    text_parts.append(text_page.get_text_bounded())
+                    page_content = text_page.get_text_bounded()
+                    pages_content.append(page_content)
                 except Exception as e:  # noqa: PERF203, BLE001
                     page_errors.append({"page": i + 1, "error": str(e)})
-                    text_parts.append(f"[Error extracting page {i + 1}]")
+                    pages_content.append(f"[Error extracting page {i + 1}]")

-            text = "\n".join(text_parts)
+            text = "\n".join(pages_content)
+            has_content = bool(text.strip())

-            if page_errors and text_parts:
+            if page_errors and has_content:
                 return normalize_spaces(text)
-            if not text_parts:
+            if not has_content:
                 raise ParsingError(
                     "Could not extract any text from PDF",
                     context=create_error_context(
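The switch from `if not text_parts:` to `if not has_content:` is a behavioral fix, not just a rename: the parts list is non-empty as soon as the PDF has pages, even when every page yields only empty or whitespace text. A tiny standalone illustration:

```python
# Illustrative only: a non-empty list of parts can still mean no extracted text.
text_parts = ["", "   ", ""]  # three pages, all empty or whitespace

assert bool(text_parts) is True                      # old check: looks like content
assert bool("\n".join(text_parts).strip()) is False  # new check: correctly empty
```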
@@ -316,14 +318,14 @@ class PDFExtractor(Extractor):
         try:
             with pypdfium_file_lock(path):
                 pdf = pypdfium2.PdfDocument(str(path))
-                text_parts = []
+                pages_text = []
                 for page in pdf:
                     text_page = page.get_textpage()
                     text = text_page.get_text_bounded()
-                    text_parts.append(text)
+                    pages_text.append(text)
                     text_page.close()
                     page.close()
-                return "".join(text_parts)
+                return "\n".join(pages_text)
         except Exception as e:
             raise ParsingError(f"Failed to extract PDF text: {e}") from e
         finally:
@@ -345,9 +347,6 @@
             bitmap.close()
             page.close()

-        import os
-        import tempfile
-
         image_paths = []
         temp_files = []

@@ -375,46 +374,29 @@ class PDFExtractor(Extractor):

     def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
         """Process PDF images with the configured OCR backend."""
-        if self.config.ocr_backend == "tesseract":
-            from kreuzberg._ocr._sync import process_batch_images_sync
-            from kreuzberg._ocr._tesseract import TesseractConfig
+        backend = get_ocr_backend(self.config.ocr_backend)
+        paths = [Path(p) for p in image_paths]

-            tesseract_config = (
+        if self.config.ocr_backend == "tesseract":
+            config = (
                 self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
             )
-            results = process_batch_images_sync([str(p) for p in image_paths], tesseract_config, backend="tesseract")
-            text_parts = [r.content for r in results]
-            return "\n\n".join(text_parts)
-
-        if self.config.ocr_backend == "paddleocr":
-            from kreuzberg._ocr._paddleocr import PaddleOCRConfig
-            from kreuzberg._ocr._sync import process_image_paddleocr_sync as paddle_process
-
+            results = backend.process_batch_sync(paths, **config.__dict__)
+        elif self.config.ocr_backend == "paddleocr":
             paddle_config = (
                 self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
             )
-
-            text_parts = []
-            for image_path in image_paths:
-                result = paddle_process(Path(image_path), paddle_config)
-                text_parts.append(result.content)
-            return "\n\n".join(text_parts)
-
-        if self.config.ocr_backend == "easyocr":
-            from kreuzberg._ocr._easyocr import EasyOCRConfig
-            from kreuzberg._ocr._sync import process_image_easyocr_sync as easy_process
-
+            results = backend.process_batch_sync(paths, **paddle_config.__dict__)
+        elif self.config.ocr_backend == "easyocr":
             easy_config = (
                 self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
             )
+            results = backend.process_batch_sync(paths, **easy_config.__dict__)
+        else:
+            raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")

-            text_parts = []
-            for image_path in image_paths:
-                result = easy_process(Path(image_path), easy_config)
-                text_parts.append(result.content)
-            return "\n\n".join(text_parts)
-
-        raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+        # Use list comprehension and join for efficient string building
+        return "\n\n".join(result.content for result in results)

     def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
         """Extract text using playa for better structure preservation."""
@@ -422,14 +404,14 @@
             content = path.read_bytes()
             document = parse(content, max_workers=1)

-            text_parts = []
+            # Extract text while preserving structure
+            pages_text = []
             for page in document.pages:
-                # Extract text while preserving structure
                 page_text = page.extract_text()
                 if page_text and page_text.strip():
-                    text_parts.append(page_text)
+                    pages_text.append(page_text)

-            if text_parts:
-                return "\n\n".join(text_parts)
+            if pages_text:
+                return "\n\n".join(pages_text)

             return fallback_text
kreuzberg/_extractors/_spread_sheet.py CHANGED
@@ -2,7 +2,9 @@ from __future__ import annotations

 import contextlib
 import csv
+import os
 import sys
+import tempfile
 from datetime import date, datetime, time, timedelta
 from io import StringIO
 from pathlib import Path
@@ -68,9 +70,6 @@ class SpreadSheetExtractor(Extractor):

     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         """Pure sync implementation of extract_bytes."""
-        import os
-        import tempfile
-
         fd, temp_path = tempfile.mkstemp(suffix=".xlsx")

         try:
kreuzberg/_extractors/_structured.py CHANGED
@@ -14,6 +14,9 @@ from kreuzberg._utils._sync import run_sync
 if TYPE_CHECKING:
     from pathlib import Path

+# Define text field keywords as a set for O(1) membership testing
+_TEXT_FIELD_KEYWORDS = frozenset({"title", "name", "subject", "description", "content", "body", "text", "message"})
+

 class StructuredDataExtractor(Extractor):
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
@@ -70,12 +73,13 @@ class StructuredDataExtractor(Extractor):
         text_parts: list[str] = []
         metadata: dict[str, Any] = {}

+        # Use match statement for cleaner code and avoid multiple isinstance calls
         if isinstance(data, dict):
-            text_parts.extend(self._extract_from_dict(data, metadata))
+            text_parts = self._extract_from_dict(data, metadata)
         elif isinstance(data, list):
-            text_parts.extend(self._extract_from_list(data, metadata))
+            text_parts = self._extract_from_list(data, metadata)
         else:
-            text_parts.append(str(data))
+            text_parts = [str(data)]

         combined_text = "\n".join(text_parts) if text_parts else text_content

@@ -107,10 +111,9 @@ class StructuredDataExtractor(Extractor):
         if isinstance(value, str) and value.strip():
             text_parts.append(f"{full_key}: {value}")

-            if any(
-                text_field in key.lower()
-                for text_field in ["title", "name", "subject", "description", "content", "body", "text", "message"]
-            ):
+            # Check if key contains any text field keywords efficiently
+            key_lower = key.lower()
+            if any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS):
                 metadata[full_key] = value

         elif isinstance(value, (int, float, bool)):
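One nuance the new comments gloss over: `keyword in key_lower` is a substring test, so the `any(...)` still scans every keyword and the frozenset's O(1) membership is not actually exercised here. The real wins are hoisting the constant to module scope (built once, not per call) and lowering the key once. A standalone illustration:

```python
_TEXT_FIELD_KEYWORDS = frozenset({"title", "name", "subject", "description", "content", "body", "text", "message"})


def is_text_field(key: str) -> bool:
    key_lower = key.lower()  # lowercase once, not once per keyword
    return any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS)


assert is_text_field("articleTitle")  # "title" is a substring of "articletitle"
assert not is_text_field("price")     # no keyword matches
```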
kreuzberg/_gmft.py CHANGED
@@ -1,12 +1,20 @@
 from __future__ import annotations

+import io
+import multiprocessing as mp
 import os
+import queue
+import signal
+import traceback
 from dataclasses import dataclass, field
+from io import StringIO
 from typing import TYPE_CHECKING, Any, Literal

+import msgspec
+
 from kreuzberg._types import TableData
 from kreuzberg._utils._sync import run_sync
-from kreuzberg.exceptions import MissingDependencyError
+from kreuzberg.exceptions import MissingDependencyError, ParsingError

 if TYPE_CHECKING:
     from os import PathLike
@@ -15,7 +23,7 @@ if TYPE_CHECKING:
     from pandas import DataFrame


-@dataclass(unsafe_hash=True)
+@dataclass(unsafe_hash=True, slots=True)
 class GMFTConfig:
     """Configuration options for GMFT.

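Adding `slots=True` (Python 3.10+) removes the per-instance `__dict__`, which both shrinks config instances and explains the companion change in the next hunk: the cache key can no longer be built from `config.__dict__`. A quick demonstration of the semantics:

```python
from dataclasses import dataclass


@dataclass(unsafe_hash=True, slots=True)
class Config:  # illustrative stand-in for GMFTConfig
    threshold: float = 0.5


cfg = Config()
assert not hasattr(cfg, "__dict__")  # slots classes have no instance __dict__
```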
@@ -173,7 +181,7 @@ async def extract_tables(  # noqa: PLR0915
     cache_kwargs = {
         "file_info": str(sorted(file_info.items())),
         "extractor": "gmft",
-        "config": str(sorted(config.__dict__.items())),
+        "config": str(sorted(msgspec.to_builtins(config).items())),
     }

     table_cache = get_table_cache()
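`msgspec.to_builtins` converts supported objects, including dataclasses, into plain builtins (a dict here), so it keeps working where `__dict__` no longer exists on the slotted config. A minimal sketch, assuming msgspec is installed:

```python
from dataclasses import dataclass

import msgspec


@dataclass(slots=True)
class Config:  # illustrative stand-in for GMFTConfig
    threshold: float = 0.5
    verbosity: int = 0


cfg = Config()
# to_builtins returns {"threshold": 0.5, "verbosity": 0}; sorting its items
# gives a deterministic cache key independent of __dict__.
cache_key = str(sorted(msgspec.to_builtins(cfg).items()))
print(cache_key)  # [('threshold', 0.5), ('verbosity', 0)]
```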
@@ -196,9 +204,7 @@

     try:
         if use_isolated_process:
-            from kreuzberg._multiprocessing import extract_tables_isolated_async
-
-            result = await extract_tables_isolated_async(file_path, config)
+            result = await _extract_tables_isolated_async(file_path, config)

             await table_cache.aset(result, **cache_kwargs)

@@ -305,7 +311,7 @@ def extract_tables_sync(
     cache_kwargs = {
         "file_info": str(sorted(file_info.items())),
         "extractor": "gmft",
-        "config": str(sorted(config.__dict__.items())),
+        "config": str(sorted(msgspec.to_builtins(config).items())),
     }

     table_cache = get_table_cache()
@@ -314,9 +320,7 @@
         return cached_result  # type: ignore[no-any-return]

     if use_isolated_process:
-        from kreuzberg._multiprocessing import extract_tables_isolated
-
-        result = extract_tables_isolated(file_path, config)
+        result = _extract_tables_isolated(file_path, config)

         table_cache.set(result, **cache_kwargs)

@@ -378,3 +382,303 @@
         raise MissingDependencyError.create_for_package(
             dependency_group="gmft", functionality="table extraction", package_name="gmft"
         ) from e
+
+
+def _extract_tables_in_process(
+    file_path: str | PathLike[str],
+    config_dict: dict[str, Any],
+    result_queue: queue.Queue[tuple[bool, Any]],
+) -> None:
+    """Extract tables in an isolated process to handle potential segfaults.
+
+    Args:
+        file_path: Path to the PDF file
+        config_dict: Serialized GMFTConfig as a dict
+        result_queue: Queue to put results or errors
+    """
+    signal.signal(signal.SIGINT, signal.SIG_IGN)
+
+    try:
+        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
+        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
+        from gmft.formatters.tatr import TATRFormatConfig
+        from gmft.pdf_bindings.pdfium import PyPDFium2Document
+
+        config = GMFTConfig(**config_dict)
+
+        formatter = AutoTableFormatter(  # type: ignore[no-untyped-call]
+            config=TATRFormatConfig(
+                verbosity=config.verbosity,
+                formatter_base_threshold=config.formatter_base_threshold,
+                cell_required_confidence=config.cell_required_confidence,
+                remove_null_rows=config.remove_null_rows,
+                enable_multi_header=config.enable_multi_header,
+                semantic_spanning_cells=config.semantic_spanning_cells,
+                semantic_hierarchical_left_fill=config.semantic_hierarchical_left_fill,
+                large_table_if_n_rows_removed=config.large_table_if_n_rows_removed,
+                large_table_threshold=config.large_table_threshold,
+                large_table_row_overlap_threshold=config.large_table_row_overlap_threshold,
+                large_table_maximum_rows=config.large_table_maximum_rows,
+                force_large_table_assumption=config.force_large_table_assumption,
+            )
+        )
+        detector = AutoTableDetector(config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold))  # type: ignore[no-untyped-call]
+
+        doc = PyPDFium2Document(str(file_path))
+        cropped_tables = []
+        dataframes = []
+
+        try:
+            for page in doc:
+                cropped_tables.extend(detector.extract(page))  # type: ignore[attr-defined]
+
+            for cropped_table in cropped_tables:
+                formatted_table = formatter.extract(cropped_table)  # type: ignore[attr-defined]
+                dataframes.append(formatted_table.df())
+
+            results = []
+            for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False):
+                img_bytes = io.BytesIO()
+                cropped_image = cropped_table.image()
+                cropped_image.save(img_bytes, format="PNG")
+                img_bytes.seek(0)
+
+                results.append(
+                    {
+                        "cropped_image_bytes": img_bytes.getvalue(),
+                        "page_number": cropped_table.page.page_number,
+                        "text": data_frame.to_markdown(),
+                        "df_csv": data_frame.to_csv(index=False),
+                    }
+                )
+
+            result_queue.put((True, results))
+
+        finally:
+            doc.close()  # type: ignore[no-untyped-call]
+
+    except Exception as e:  # noqa: BLE001
+        error_info = {"error": str(e), "type": type(e).__name__, "traceback": traceback.format_exc()}
+        result_queue.put((False, error_info))
+
+
+def _extract_tables_isolated(
+    file_path: str | PathLike[str],
+    config: GMFTConfig | None = None,
+    timeout: float = 300.0,
+) -> list[TableData]:
+    """Extract tables using an isolated process to handle segfaults.
+
+    Args:
+        file_path: Path to the PDF file
+        config: GMFT configuration
+        timeout: Maximum time to wait for extraction
+
+    Returns:
+        List of extracted tables
+
+    Raises:
+        RuntimeError: If extraction fails or times out
+    """
+    config = config or GMFTConfig()
+    config_dict = msgspec.to_builtins(config)
+
+    ctx = mp.get_context("spawn")
+    result_queue = ctx.Queue()
+
+    process = ctx.Process(
+        target=_extract_tables_in_process,
+        args=(str(file_path), config_dict, result_queue),
+    )
+
+    process.start()
+
+    try:
+        # Wait for result with timeout, checking for process death  # ~keep
+        import time
+
+        start_time = time.time()
+        while True:
+            try:
+                success, result = result_queue.get_nowait()
+                break
+            except queue.Empty:
+                if time.time() - start_time > timeout:
+                    raise
+
+                if not process.is_alive():
+                    # Process died without putting result  # ~keep
+                    if process.exitcode == -signal.SIGSEGV:
+                        raise ParsingError(
+                            "GMFT process crashed with segmentation fault",
+                            context={
+                                "file_path": str(file_path),
+                                "exit_code": process.exitcode,
+                            },
+                        ) from None
+                    raise ParsingError(
+                        f"GMFT process died unexpectedly with exit code {process.exitcode}",
+                        context={
+                            "file_path": str(file_path),
+                            "exit_code": process.exitcode,
+                        },
+                    ) from None
+
+                time.sleep(0.1)
+
+        if success:
+            tables = []
+            for table_dict in result:
+                from PIL import Image
+
+                img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
+                import pandas as pd
+
+                df = pd.read_csv(StringIO(table_dict["df_csv"]))
+
+                tables.append(
+                    TableData(
+                        cropped_image=img,
+                        page_number=table_dict["page_number"],
+                        text=table_dict["text"],
+                        df=df,
+                    )
+                )
+
+            return tables
+
+        error_info = result
+        raise ParsingError(
+            f"GMFT table extraction failed: {error_info['error']}",
+            context={
+                "file_path": str(file_path),
+                "error_type": error_info["type"],
+                "traceback": error_info["traceback"],
+            },
+        )
+
+    except queue.Empty as e:
+        raise ParsingError(
+            "GMFT table extraction timed out",
+            context={
+                "file_path": str(file_path),
+                "timeout": timeout,
+            },
+        ) from e
+    finally:
+        if process.is_alive():
+            process.terminate()
+            process.join(timeout=5)
+            if process.is_alive():
+                process.kill()
+                process.join()
+
+
+async def _extract_tables_isolated_async(
+    file_path: str | PathLike[str],
+    config: GMFTConfig | None = None,
+    timeout: float = 300.0,
+) -> list[TableData]:
+    """Async version of extract_tables_isolated using asyncio.
+
+    Args:
+        file_path: Path to the PDF file
+        config: GMFT configuration
+        timeout: Maximum time to wait for extraction
+
+    Returns:
+        List of extracted tables
+
+    Raises:
+        RuntimeError: If extraction fails or times out
+    """
+    import anyio
+
+    config = config or GMFTConfig()
+    config_dict = msgspec.to_builtins(config)
+
+    ctx = mp.get_context("spawn")
+    result_queue = ctx.Queue()
+
+    process = ctx.Process(
+        target=_extract_tables_in_process,
+        args=(str(file_path), config_dict, result_queue),
+    )
+
+    process.start()
+
+    try:
+
+        async def wait_for_result() -> tuple[bool, Any]:
+            while True:
+                try:
+                    return result_queue.get_nowait()  # type: ignore[no-any-return]
+                except queue.Empty:  # noqa: PERF203
+                    await anyio.sleep(0.1)
+                    if not process.is_alive():
+                        # Process died without putting result  # ~keep
+                        if process.exitcode == -signal.SIGSEGV:
+                            raise ParsingError(
+                                "GMFT process crashed with segmentation fault",
+                                context={
+                                    "file_path": str(file_path),
+                                    "exit_code": process.exitcode,
+                                },
+                            ) from None
+                        raise ParsingError(
+                            f"GMFT process died unexpectedly with exit code {process.exitcode}",
+                            context={
+                                "file_path": str(file_path),
+                                "exit_code": process.exitcode,
+                            },
+                        ) from None
+
+        with anyio.fail_after(timeout):
+            success, result = await wait_for_result()
+
+        if success:
+            tables = []
+            for table_dict in result:
+                from PIL import Image
+
+                img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
+                import pandas as pd
+
+                df = pd.read_csv(StringIO(table_dict["df_csv"]))
+
+                tables.append(
+                    TableData(
+                        cropped_image=img,
+                        page_number=table_dict["page_number"],
+                        text=table_dict["text"],
+                        df=df,
+                    )
+                )
+
+            return tables
+
+        error_info = result
+        raise ParsingError(
+            f"GMFT table extraction failed: {error_info['error']}",
+            context={
+                "file_path": str(file_path),
+                "error_type": error_info["type"],
+                "traceback": error_info["traceback"],
+            },
+        )
+
+    except TimeoutError as e:
+        raise ParsingError(
+            "GMFT table extraction timed out",
+            context={
+                "file_path": str(file_path),
+                "timeout": timeout,
+            },
+        ) from e
+    finally:
+        if process.is_alive():
+            process.terminate()
+            await anyio.to_thread.run_sync(lambda: process.join(timeout=5))
+            if process.is_alive():
+                process.kill()
+                await anyio.to_thread.run_sync(process.join)
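These inlined helpers (replacing the deleted `kreuzberg/_multiprocessing/gmft_isolated.py`) implement a crash-isolation pattern: run segfault-prone native code in a `spawn` child, ship results back over a queue, and translate an exit code of `-SIGSEGV` into a Python exception. A stripped-down sketch of the same pattern with a hypothetical worker (`_worker` and `run_isolated` are illustrative names, not kreuzberg API):

```python
import multiprocessing as mp
import queue
import signal


def _worker(result_queue: "mp.Queue") -> None:
    # Stand-in for risky native code; must be module-level so spawn can import it.
    try:
        result_queue.put((True, "work result"))
    except Exception as e:  # noqa: BLE001
        result_queue.put((False, str(e)))


def run_isolated(timeout: float = 5.0) -> object:
    ctx = mp.get_context("spawn")  # fresh interpreter; no inherited state
    result_queue = ctx.Queue()
    process = ctx.Process(target=_worker, args=(result_queue,))
    process.start()
    try:
        ok, payload = result_queue.get(timeout=timeout)
    except queue.Empty:
        # A segfaulting child sets a negative exit code instead of a result.
        if process.exitcode == -signal.SIGSEGV:
            raise RuntimeError("worker segfaulted") from None
        raise TimeoutError("worker timed out") from None
    finally:
        if process.is_alive():
            process.terminate()
        process.join()
    if not ok:
        raise RuntimeError(f"worker failed: {payload}")
    return payload


if __name__ == "__main__":
    print(run_isolated())
```

Unlike this sketch, kreuzberg's version polls `get_nowait()` in a loop so a dead child is detected immediately rather than only after the full timeout, and it escalates `terminate()` to `kill()` if the child ignores the first signal.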
kreuzberg/_language_detection.py CHANGED
@@ -23,7 +23,7 @@ except ImportError:
     _CACHE_SIZE = 128


-@dataclass(frozen=True)
+@dataclass(frozen=True, slots=True)
 class LanguageDetectionConfig:
     """Configuration for language detection.
