kreuzberg-3.8.0-py3-none-any.whl → kreuzberg-3.8.2-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- kreuzberg/__init__.py +4 -0
- kreuzberg/_api/main.py +22 -1
- kreuzberg/_config.py +404 -0
- kreuzberg/_entity_extraction.py +4 -5
- kreuzberg/_extractors/_base.py +3 -5
- kreuzberg/_extractors/_image.py +18 -32
- kreuzberg/_extractors/_pandoc.py +3 -14
- kreuzberg/_extractors/_pdf.py +39 -57
- kreuzberg/_extractors/_spread_sheet.py +2 -3
- kreuzberg/_extractors/_structured.py +10 -7
- kreuzberg/_gmft.py +314 -10
- kreuzberg/_language_detection.py +1 -1
- kreuzberg/_mcp/server.py +58 -8
- kreuzberg/_ocr/__init__.py +1 -22
- kreuzberg/_ocr/_base.py +59 -0
- kreuzberg/_ocr/_easyocr.py +92 -1
- kreuzberg/_ocr/_paddleocr.py +90 -1
- kreuzberg/_ocr/_tesseract.py +556 -5
- kreuzberg/_playa.py +2 -3
- kreuzberg/_types.py +46 -24
- kreuzberg/_utils/_cache.py +35 -4
- kreuzberg/_utils/_device.py +10 -20
- kreuzberg/_utils/_errors.py +44 -45
- kreuzberg/_utils/_process_pool.py +2 -6
- kreuzberg/_utils/_quality.py +7 -11
- kreuzberg/_utils/_serialization.py +21 -16
- kreuzberg/_utils/_string.py +22 -12
- kreuzberg/_utils/_table.py +3 -4
- kreuzberg/cli.py +4 -5
- kreuzberg/exceptions.py +10 -0
- kreuzberg/extraction.py +6 -24
- kreuzberg-3.8.2.dist-info/METADATA +265 -0
- kreuzberg-3.8.2.dist-info/RECORD +53 -0
- kreuzberg/_cli_config.py +0 -175
- kreuzberg/_multiprocessing/__init__.py +0 -5
- kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
- kreuzberg/_ocr/_pool.py +0 -357
- kreuzberg/_ocr/_sync.py +0 -566
- kreuzberg-3.8.0.dist-info/METADATA +0 -313
- kreuzberg-3.8.0.dist-info/RECORD +0 -57
- {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/WHEEL +0 -0
- {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -1,6 +1,8 @@
 from __future__ import annotations

 import contextlib
+import os
+import tempfile
 from multiprocessing import cpu_count
 from pathlib import Path
 from re import Pattern
@@ -15,8 +17,12 @@ from playa import parse
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr import get_ocr_backend
+from kreuzberg._ocr._easyocr import EasyOCRConfig
+from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+from kreuzberg._ocr._tesseract import TesseractConfig
 from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
 from kreuzberg._types import ExtractionResult, OcrBackendType
+from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._pdf_lock import pypdfium_file_lock
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup_batched
@@ -89,9 +95,6 @@ class PDFExtractor(Extractor):

     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         """Pure sync implementation of PDF extraction from bytes."""
-        import os
-        import tempfile
-
         fd, temp_path = tempfile.mkstemp(suffix=".pdf")
         try:
             with os.fdopen(fd, "wb") as f:
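Note: this hunk relies on `os` and `tempfile` now being imported at module scope (first hunk). The temp-file round-trip itself is a standard pattern; a minimal self-contained sketch of the same shape, with `extract_path` standing in for the path-based extractor:

import contextlib
import os
import tempfile
from typing import Callable

def extract_bytes_via_tempfile(content: bytes, extract_path: Callable[[str], str]) -> str:
    # mkstemp returns an OS-level handle plus a path; os.fdopen adopts the
    # handle so it is closed exactly once.
    fd, temp_path = tempfile.mkstemp(suffix=".pdf")
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(content)
        return extract_path(temp_path)
    finally:
        with contextlib.suppress(OSError):  # best-effort cleanup of the temp file
            os.unlink(temp_path)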
@@ -191,8 +194,6 @@ class PDFExtractor(Extractor):
         Returns:
             A list of Pillow Images.
         """
-        from kreuzberg._utils._errors import create_error_context, should_retry
-
         document: pypdfium2.PdfDocument | None = None
         last_error = None

@@ -247,9 +248,10 @@ class PDFExtractor(Extractor):
             *[backend.process_image(image, **self.config.get_config_dict()) for image in images],
             batch_size=cpu_count(),
         )
-
-
-
+        # Use list comprehension and join for efficient string building
+        content = "\n".join(result.content for result in ocr_results)
+
+        return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])

     @staticmethod
     async def _extract_pdf_searchable_text(input_file: Path) -> str:
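The added lines replace string accumulation with a single join. A quick sketch of why join is preferred (stand-in strings, not the kreuzberg API):

ocr_pages = ["page one text", "page two text"]  # stand-ins for OCR results

# Repeated += copies the accumulated string on every iteration (quadratic).
content = ""
for page in ocr_pages:
    content += page + "\n"

# str.join computes the total size and allocates the result once (linear).
content = "\n".join(ocr_pages)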
@@ -264,28 +266,28 @@ class PDFExtractor(Extractor):
         Returns:
             The extracted text.
         """
-        from kreuzberg._utils._errors import create_error_context
-
         document: pypdfium2.PdfDocument | None = None
         try:
             with pypdfium_file_lock(input_file):
                 document = await run_sync(pypdfium2.PdfDocument, str(input_file))
-
+            pages_content = []
             page_errors = []

             for i, page in enumerate(cast("pypdfium2.PdfDocument", document)):
                 try:
                     text_page = page.get_textpage()
-
+                    page_content = text_page.get_text_bounded()
+                    pages_content.append(page_content)
                 except Exception as e:  # noqa: PERF203, BLE001
                     page_errors.append({"page": i + 1, "error": str(e)})
-
+                    pages_content.append(f"[Error extracting page {i + 1}]")

-            text = "\n".join(
+            text = "\n".join(pages_content)
+            has_content = bool(text.strip())

-            if page_errors and
+            if page_errors and has_content:
                 return normalize_spaces(text)
-            if not
+            if not has_content:
                 raise ParsingError(
                     "Could not extract any text from PDF",
                     context=create_error_context(
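The reworked loop degrades gracefully: a failed page contributes a placeholder line plus an error record instead of aborting the whole document, and the extractor raises only when nothing at all was recovered. The same strategy distilled into a standalone sketch (generic names and an assumed per-page `get_text()` accessor, not the kreuzberg API):

def extract_pages(pages: list) -> str:
    contents, errors = [], []
    for i, page in enumerate(pages):
        try:
            contents.append(page.get_text())  # assumed per-page text accessor
        except Exception as e:
            errors.append({"page": i + 1, "error": str(e)})
            contents.append(f"[Error extracting page {i + 1}]")  # keep page order intact
    text = "\n".join(contents)
    if not text.strip():
        raise RuntimeError("no text extracted from any page")
    return text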
@@ -316,14 +318,14 @@ class PDFExtractor(Extractor):
         try:
             with pypdfium_file_lock(path):
                 pdf = pypdfium2.PdfDocument(str(path))
-
+                pages_text = []
                 for page in pdf:
                     text_page = page.get_textpage()
                     text = text_page.get_text_bounded()
-
+                    pages_text.append(text)
                     text_page.close()
                     page.close()
-                return "".join(
+                return "\n".join(pages_text)
         except Exception as e:
             raise ParsingError(f"Failed to extract PDF text: {e}") from e
         finally:
@@ -345,9 +347,6 @@ class PDFExtractor(Extractor):
             bitmap.close()
             page.close()

-        import os
-        import tempfile
-
         image_paths = []
         temp_files = []

@@ -375,46 +374,29 @@ class PDFExtractor(Extractor):

     def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
         """Process PDF images with the configured OCR backend."""
-
-
-        from kreuzberg._ocr._tesseract import TesseractConfig
+        backend = get_ocr_backend(self.config.ocr_backend)
+        paths = [Path(p) for p in image_paths]

-
+        if self.config.ocr_backend == "tesseract":
+            config = (
                 self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
             )
-            results =
-
-            return "\n\n".join(text_parts)
-
-        if self.config.ocr_backend == "paddleocr":
-            from kreuzberg._ocr._paddleocr import PaddleOCRConfig
-            from kreuzberg._ocr._sync import process_image_paddleocr_sync as paddle_process
-
+            results = backend.process_batch_sync(paths, **config.__dict__)
+        elif self.config.ocr_backend == "paddleocr":
             paddle_config = (
                 self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
             )
-
-
-            for image_path in image_paths:
-                result = paddle_process(Path(image_path), paddle_config)
-                text_parts.append(result.content)
-            return "\n\n".join(text_parts)
-
-        if self.config.ocr_backend == "easyocr":
-            from kreuzberg._ocr._easyocr import EasyOCRConfig
-            from kreuzberg._ocr._sync import process_image_easyocr_sync as easy_process
-
+            results = backend.process_batch_sync(paths, **paddle_config.__dict__)
+        elif self.config.ocr_backend == "easyocr":
             easy_config = (
                 self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
             )
+            results = backend.process_batch_sync(paths, **easy_config.__dict__)
+        else:
+            raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")

-
-
-            result = easy_process(Path(image_path), easy_config)
-            text_parts.append(result.content)
-            return "\n\n".join(text_parts)
-
-        raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+        # Use list comprehension and join for efficient string building
+        return "\n\n".join(result.content for result in results)

     def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
         """Extract text using playa for better structure preservation."""
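Instead of one hand-rolled sync path per OCR engine (the removed `_ocr/_sync.py` helpers), every backend is now dispatched through a common `process_batch_sync` method. A sketch of the resulting call shape, assuming the names and signatures visible in this diff:

from pathlib import Path

from kreuzberg._ocr import get_ocr_backend
from kreuzberg._ocr._tesseract import TesseractConfig

# Assumed usage based on the diff: each backend exposes process_batch_sync
# and its config dataclass is splatted as keyword arguments.
backend = get_ocr_backend("tesseract")
config = TesseractConfig()
results = backend.process_batch_sync([Path("page-1.png")], **config.__dict__)
text = "\n\n".join(result.content for result in results)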
@@ -422,14 +404,14 @@ class PDFExtractor(Extractor):
         content = path.read_bytes()
         document = parse(content, max_workers=1)

-
+        # Extract text while preserving structure
+        pages_text = []
         for page in document.pages:
-            # Extract text while preserving structure
             page_text = page.extract_text()
             if page_text and page_text.strip():
-
+                pages_text.append(page_text)

-        if
-        return "\n\n".join(
+        if pages_text:
+            return "\n\n".join(pages_text)

         return fallback_text
kreuzberg/_extractors/_spread_sheet.py
CHANGED
@@ -2,7 +2,9 @@ from __future__ import annotations

 import contextlib
 import csv
+import os
 import sys
+import tempfile
 from datetime import date, datetime, time, timedelta
 from io import StringIO
 from pathlib import Path
@@ -68,9 +70,6 @@ class SpreadSheetExtractor(Extractor):

     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         """Pure sync implementation of extract_bytes."""
-        import os
-        import tempfile
-
         fd, temp_path = tempfile.mkstemp(suffix=".xlsx")

         try:
kreuzberg/_extractors/_structured.py
CHANGED
@@ -14,6 +14,9 @@ from kreuzberg._utils._sync import run_sync
 if TYPE_CHECKING:
     from pathlib import Path

+# Define text field keywords as a set for O(1) membership testing
+_TEXT_FIELD_KEYWORDS = frozenset({"title", "name", "subject", "description", "content", "body", "text", "message"})
+

 class StructuredDataExtractor(Extractor):
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
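Hoisting `_TEXT_FIELD_KEYWORDS` to module level means the set is built once at import time rather than on every call; note that the check used later in this file is a substring scan over the set, not a direct set lookup. A self-contained sketch of that check:

_TEXT_FIELD_KEYWORDS = frozenset({"title", "name", "subject", "description"})

def is_text_field(key: str) -> bool:
    key_lower = key.lower()
    # "document_title" matches because it contains the keyword "title"
    return any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS)

print(is_text_field("Document_Title"))  # True
print(is_text_field("page_count"))      # False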
@@ -70,12 +73,13 @@ class StructuredDataExtractor(Extractor):
         text_parts: list[str] = []
         metadata: dict[str, Any] = {}

+        # Use match statement for cleaner code and avoid multiple isinstance calls
         if isinstance(data, dict):
-            text_parts
+            text_parts = self._extract_from_dict(data, metadata)
         elif isinstance(data, list):
-            text_parts
+            text_parts = self._extract_from_list(data, metadata)
         else:
-            text_parts
+            text_parts = [str(data)]

         combined_text = "\n".join(text_parts) if text_parts else text_content

@@ -107,10 +111,9 @@ class StructuredDataExtractor(Extractor):
         if isinstance(value, str) and value.strip():
             text_parts.append(f"{full_key}: {value}")

-            if any
-
-
-            ):
+            # Check if key contains any text field keywords efficiently
+            key_lower = key.lower()
+            if any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS):
                 metadata[full_key] = value

         elif isinstance(value, (int, float, bool)):
kreuzberg/_gmft.py
CHANGED
@@ -1,12 +1,20 @@
 from __future__ import annotations

+import io
+import multiprocessing as mp
 import os
+import queue
+import signal
+import traceback
 from dataclasses import dataclass, field
+from io import StringIO
 from typing import TYPE_CHECKING, Any, Literal

+import msgspec
+
 from kreuzberg._types import TableData
 from kreuzberg._utils._sync import run_sync
-from kreuzberg.exceptions import MissingDependencyError
+from kreuzberg.exceptions import MissingDependencyError, ParsingError

 if TYPE_CHECKING:
     from os import PathLike
@@ -15,7 +23,7 @@ if TYPE_CHECKING:
     from pandas import DataFrame


-@dataclass(unsafe_hash=True)
+@dataclass(unsafe_hash=True, slots=True)
 class GMFTConfig:
     """Configuration options for GMFT.

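`slots=True` (Python 3.10+) makes the generated dataclass define `__slots__`, dropping the per-instance `__dict__` and rejecting undeclared attributes, which trims memory for config objects created in bulk. A minimal illustration:

from dataclasses import dataclass

@dataclass(unsafe_hash=True, slots=True)
class DemoConfig:
    threshold: float = 0.5

cfg = DemoConfig()
try:
    cfg.thresold = 0.9  # typo is caught instead of silently creating an attribute
except AttributeError as exc:
    print(exc)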
@@ -173,7 +181,7 @@ async def extract_tables(  # noqa: PLR0915
     cache_kwargs = {
         "file_info": str(sorted(file_info.items())),
         "extractor": "gmft",
-        "config": str(sorted(config.
+        "config": str(sorted(msgspec.to_builtins(config).items())),
     }

     table_cache = get_table_cache()
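`msgspec.to_builtins` converts the config dataclass into plain builtin types, so the sorted items yield a deterministic cache-key string without depending on `__dict__` ordering or repr details. A sketch with a stand-in dataclass:

from dataclasses import dataclass

import msgspec

@dataclass
class DemoConfig:  # stand-in for GMFTConfig
    verbosity: int = 0
    remove_null_rows: bool = True

cache_key = str(sorted(msgspec.to_builtins(DemoConfig()).items()))
print(cache_key)  # [('remove_null_rows', True), ('verbosity', 0)]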
@@ -196,9 +204,7 @@ async def extract_tables(  # noqa: PLR0915

     try:
         if use_isolated_process:
-
-
-            result = await extract_tables_isolated_async(file_path, config)
+            result = await _extract_tables_isolated_async(file_path, config)

             await table_cache.aset(result, **cache_kwargs)

@@ -305,7 +311,7 @@ def extract_tables_sync(
     cache_kwargs = {
         "file_info": str(sorted(file_info.items())),
         "extractor": "gmft",
-        "config": str(sorted(config.
+        "config": str(sorted(msgspec.to_builtins(config).items())),
     }

     table_cache = get_table_cache()
@@ -314,9 +320,7 @@ def extract_tables_sync(
         return cached_result  # type: ignore[no-any-return]

     if use_isolated_process:
-
-
-        result = extract_tables_isolated(file_path, config)
+        result = _extract_tables_isolated(file_path, config)

         table_cache.set(result, **cache_kwargs)

@@ -378,3 +382,303 @@ def extract_tables_sync(
         raise MissingDependencyError.create_for_package(
             dependency_group="gmft", functionality="table extraction", package_name="gmft"
         ) from e
+
+
+def _extract_tables_in_process(
+    file_path: str | PathLike[str],
+    config_dict: dict[str, Any],
+    result_queue: queue.Queue[tuple[bool, Any]],
+) -> None:
+    """Extract tables in an isolated process to handle potential segfaults.
+
+    Args:
+        file_path: Path to the PDF file
+        config_dict: Serialized GMFTConfig as a dict
+        result_queue: Queue to put results or errors
+    """
+    signal.signal(signal.SIGINT, signal.SIG_IGN)
+
+    try:
+        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
+        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
+        from gmft.formatters.tatr import TATRFormatConfig
+        from gmft.pdf_bindings.pdfium import PyPDFium2Document
+
+        config = GMFTConfig(**config_dict)
+
+        formatter = AutoTableFormatter(  # type: ignore[no-untyped-call]
+            config=TATRFormatConfig(
+                verbosity=config.verbosity,
+                formatter_base_threshold=config.formatter_base_threshold,
+                cell_required_confidence=config.cell_required_confidence,
+                remove_null_rows=config.remove_null_rows,
+                enable_multi_header=config.enable_multi_header,
+                semantic_spanning_cells=config.semantic_spanning_cells,
+                semantic_hierarchical_left_fill=config.semantic_hierarchical_left_fill,
+                large_table_if_n_rows_removed=config.large_table_if_n_rows_removed,
+                large_table_threshold=config.large_table_threshold,
+                large_table_row_overlap_threshold=config.large_table_row_overlap_threshold,
+                large_table_maximum_rows=config.large_table_maximum_rows,
+                force_large_table_assumption=config.force_large_table_assumption,
+            )
+        )
+        detector = AutoTableDetector(config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold))  # type: ignore[no-untyped-call]
+
+        doc = PyPDFium2Document(str(file_path))
+        cropped_tables = []
+        dataframes = []
+
+        try:
+            for page in doc:
+                cropped_tables.extend(detector.extract(page))  # type: ignore[attr-defined]
+
+            for cropped_table in cropped_tables:
+                formatted_table = formatter.extract(cropped_table)  # type: ignore[attr-defined]
+                dataframes.append(formatted_table.df())
+
+            results = []
+            for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False):
+                img_bytes = io.BytesIO()
+                cropped_image = cropped_table.image()
+                cropped_image.save(img_bytes, format="PNG")
+                img_bytes.seek(0)
+
+                results.append(
+                    {
+                        "cropped_image_bytes": img_bytes.getvalue(),
+                        "page_number": cropped_table.page.page_number,
+                        "text": data_frame.to_markdown(),
+                        "df_csv": data_frame.to_csv(index=False),
+                    }
+                )
+
+            result_queue.put((True, results))
+
+        finally:
+            doc.close()  # type: ignore[no-untyped-call]
+
+    except Exception as e:  # noqa: BLE001
+        error_info = {"error": str(e), "type": type(e).__name__, "traceback": traceback.format_exc()}
+        result_queue.put((False, error_info))
+
+
+def _extract_tables_isolated(
+    file_path: str | PathLike[str],
+    config: GMFTConfig | None = None,
+    timeout: float = 300.0,
+) -> list[TableData]:
+    """Extract tables using an isolated process to handle segfaults.
+
+    Args:
+        file_path: Path to the PDF file
+        config: GMFT configuration
+        timeout: Maximum time to wait for extraction
+
+    Returns:
+        List of extracted tables
+
+    Raises:
+        RuntimeError: If extraction fails or times out
+    """
+    config = config or GMFTConfig()
+    config_dict = msgspec.to_builtins(config)
+
+    ctx = mp.get_context("spawn")
+    result_queue = ctx.Queue()
+
+    process = ctx.Process(
+        target=_extract_tables_in_process,
+        args=(str(file_path), config_dict, result_queue),
+    )
+
+    process.start()
+
+    try:
+        # Wait for result with timeout, checking for process death # ~keep
+        import time
+
+        start_time = time.time()
+        while True:
+            try:
+                success, result = result_queue.get_nowait()
+                break
+            except queue.Empty:
+                if time.time() - start_time > timeout:
+                    raise
+
+                if not process.is_alive():
+                    # Process died without putting result # ~keep
+                    if process.exitcode == -signal.SIGSEGV:
+                        raise ParsingError(
+                            "GMFT process crashed with segmentation fault",
+                            context={
+                                "file_path": str(file_path),
+                                "exit_code": process.exitcode,
+                            },
+                        ) from None
+                    raise ParsingError(
+                        f"GMFT process died unexpectedly with exit code {process.exitcode}",
+                        context={
+                            "file_path": str(file_path),
+                            "exit_code": process.exitcode,
+                        },
+                    ) from None
+
+                time.sleep(0.1)
+
+        if success:
+            tables = []
+            for table_dict in result:
+                from PIL import Image
+
+                img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
+                import pandas as pd
+
+                df = pd.read_csv(StringIO(table_dict["df_csv"]))
+
+                tables.append(
+                    TableData(
+                        cropped_image=img,
+                        page_number=table_dict["page_number"],
+                        text=table_dict["text"],
+                        df=df,
+                    )
+                )
+
+            return tables
+
+        error_info = result
+        raise ParsingError(
+            f"GMFT table extraction failed: {error_info['error']}",
+            context={
+                "file_path": str(file_path),
+                "error_type": error_info["type"],
+                "traceback": error_info["traceback"],
+            },
+        )
+
+    except queue.Empty as e:
+        raise ParsingError(
+            "GMFT table extraction timed out",
+            context={
+                "file_path": str(file_path),
+                "timeout": timeout,
+            },
+        ) from e
+    finally:
+        if process.is_alive():
+            process.terminate()
+            process.join(timeout=5)
+            if process.is_alive():
+                process.kill()
+                process.join()
+
+
+async def _extract_tables_isolated_async(
+    file_path: str | PathLike[str],
+    config: GMFTConfig | None = None,
+    timeout: float = 300.0,
+) -> list[TableData]:
+    """Async version of extract_tables_isolated using asyncio.
+
+    Args:
+        file_path: Path to the PDF file
+        config: GMFT configuration
+        timeout: Maximum time to wait for extraction
+
+    Returns:
+        List of extracted tables
+
+    Raises:
+        RuntimeError: If extraction fails or times out
+    """
+    import anyio
+
+    config = config or GMFTConfig()
+    config_dict = msgspec.to_builtins(config)
+
+    ctx = mp.get_context("spawn")
+    result_queue = ctx.Queue()
+
+    process = ctx.Process(
+        target=_extract_tables_in_process,
+        args=(str(file_path), config_dict, result_queue),
+    )
+
+    process.start()
+
+    try:
+
+        async def wait_for_result() -> tuple[bool, Any]:
+            while True:
+                try:
+                    return result_queue.get_nowait()  # type: ignore[no-any-return]
+                except queue.Empty:  # noqa: PERF203
+                    await anyio.sleep(0.1)
+                    if not process.is_alive():
+                        # Process died without putting result # ~keep
+                        if process.exitcode == -signal.SIGSEGV:
+                            raise ParsingError(
+                                "GMFT process crashed with segmentation fault",
+                                context={
+                                    "file_path": str(file_path),
+                                    "exit_code": process.exitcode,
+                                },
+                            ) from None
+                        raise ParsingError(
+                            f"GMFT process died unexpectedly with exit code {process.exitcode}",
+                            context={
+                                "file_path": str(file_path),
+                                "exit_code": process.exitcode,
+                            },
+                        ) from None
+
+        with anyio.fail_after(timeout):
+            success, result = await wait_for_result()
+
+        if success:
+            tables = []
+            for table_dict in result:
+                from PIL import Image
+
+                img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
+                import pandas as pd
+
+                df = pd.read_csv(StringIO(table_dict["df_csv"]))
+
+                tables.append(
+                    TableData(
+                        cropped_image=img,
+                        page_number=table_dict["page_number"],
+                        text=table_dict["text"],
+                        df=df,
+                    )
+                )
+
+            return tables
+
+        error_info = result
+        raise ParsingError(
+            f"GMFT table extraction failed: {error_info['error']}",
+            context={
+                "file_path": str(file_path),
+                "error_type": error_info["type"],
+                "traceback": error_info["traceback"],
+            },
+        )
+
+    except TimeoutError as e:
+        raise ParsingError(
+            "GMFT table extraction timed out",
+            context={
+                "file_path": str(file_path),
+                "timeout": timeout,
+            },
+        ) from e
+    finally:
+        if process.is_alive():
+            process.terminate()
+            await anyio.to_thread.run_sync(lambda: process.join(timeout=5))
+            if process.is_alive():
+                process.kill()
+                await anyio.to_thread.run_sync(process.join)
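These two helpers replace the deleted kreuzberg/_multiprocessing/gmft_isolated.py module and share one shape: run the crash-prone work in a spawned child, poll a queue for the result, translate child death or timeout into a ParsingError, and always reap the process. A stripped-down stdlib sketch of that shape (simplified to a blocking Queue.get with a timeout instead of polling for child death):

import multiprocessing as mp
import queue

def _worker(result_queue) -> None:
    # Real work goes here; a crash (even a segfault) stays in the child process.
    result_queue.put((True, "done"))

def run_isolated(timeout: float = 5.0) -> str:
    ctx = mp.get_context("spawn")  # spawn avoids inheriting fragile C-library state
    result_queue = ctx.Queue()
    process = ctx.Process(target=_worker, args=(result_queue,))
    process.start()
    try:
        success, payload = result_queue.get(timeout=timeout)
        if not success:
            raise RuntimeError(payload)
        return payload
    except queue.Empty as e:
        raise TimeoutError("isolated worker timed out") from e
    finally:
        if process.is_alive():
            process.terminate()
        process.join()

if __name__ == "__main__":
    print(run_isolated())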