kreuzberg 3.3.0__py3-none-any.whl → 3.8.1__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- kreuzberg/__init__.py +9 -2
- kreuzberg/_api/__init__.py +0 -0
- kreuzberg/_api/main.py +87 -0
- kreuzberg/_entity_extraction.py +238 -0
- kreuzberg/_extractors/_base.py +39 -1
- kreuzberg/_extractors/_email.py +149 -0
- kreuzberg/_extractors/_html.py +15 -3
- kreuzberg/_extractors/_image.py +27 -22
- kreuzberg/_extractors/_pandoc.py +3 -14
- kreuzberg/_extractors/_pdf.py +97 -34
- kreuzberg/_extractors/_presentation.py +62 -10
- kreuzberg/_extractors/_spread_sheet.py +181 -6
- kreuzberg/_extractors/_structured.py +148 -0
- kreuzberg/_gmft.py +318 -11
- kreuzberg/_language_detection.py +95 -0
- kreuzberg/_mcp/__init__.py +5 -0
- kreuzberg/_mcp/server.py +227 -0
- kreuzberg/_mime_types.py +27 -1
- kreuzberg/_ocr/__init__.py +10 -1
- kreuzberg/_ocr/_base.py +59 -0
- kreuzberg/_ocr/_easyocr.py +92 -1
- kreuzberg/_ocr/_paddleocr.py +89 -0
- kreuzberg/_ocr/_tesseract.py +569 -5
- kreuzberg/_registry.py +4 -0
- kreuzberg/_types.py +181 -4
- kreuzberg/_utils/_cache.py +52 -4
- kreuzberg/_utils/_device.py +2 -2
- kreuzberg/_utils/_errors.py +3 -7
- kreuzberg/_utils/_process_pool.py +182 -9
- kreuzberg/_utils/_quality.py +237 -0
- kreuzberg/_utils/_serialization.py +4 -2
- kreuzberg/_utils/_string.py +153 -10
- kreuzberg/_utils/_sync.py +6 -7
- kreuzberg/_utils/_table.py +261 -0
- kreuzberg/_utils/_tmp.py +2 -2
- kreuzberg/cli.py +1 -2
- kreuzberg/extraction.py +43 -34
- kreuzberg-3.8.1.dist-info/METADATA +301 -0
- kreuzberg-3.8.1.dist-info/RECORD +53 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
- kreuzberg/_multiprocessing/__init__.py +0 -6
- kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
- kreuzberg/_multiprocessing/process_manager.py +0 -188
- kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
- kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
- kreuzberg-3.3.0.dist-info/METADATA +0 -235
- kreuzberg-3.3.0.dist-info/RECORD +0 -48
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_structured.py
ADDED
@@ -0,0 +1,148 @@
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING, Any, ClassVar
+
+from anyio import Path as AsyncPath
+
+from kreuzberg._extractors._base import Extractor
+from kreuzberg._mime_types import JSON_MIME_TYPE, PLAIN_TEXT_MIME_TYPE, TOML_MIME_TYPE, YAML_MIME_TYPE
+from kreuzberg._types import ExtractionResult, normalize_metadata
+from kreuzberg._utils._string import normalize_spaces, safe_decode
+from kreuzberg._utils._sync import run_sync
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+class StructuredDataExtractor(Extractor):
+    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
+        JSON_MIME_TYPE,
+        "text/json",
+        YAML_MIME_TYPE,
+        "text/yaml",
+        "text/x-yaml",
+        "application/yaml",
+        TOML_MIME_TYPE,
+        "text/toml",
+    }
+
+    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
+        return await run_sync(self.extract_bytes_sync, content)
+
+    async def extract_path_async(self, path: Path) -> ExtractionResult:
+        content = await AsyncPath(path).read_bytes()
+        return await self.extract_bytes_async(content)
+
+    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
+        text_content = safe_decode(content)
+
+        try:
+            if self.mime_type in {JSON_MIME_TYPE, "text/json"}:
+                data = json.loads(text_content)
+            elif self.mime_type in {TOML_MIME_TYPE, "text/toml"}:
+                try:
+                    import tomllib  # type: ignore[import-not-found]
+                except ImportError:
+                    try:
+                        import tomli as tomllib  # type: ignore[import-not-found]
+                    except ImportError:
+                        return ExtractionResult(
+                            content=normalize_spaces(text_content),
+                            mime_type=PLAIN_TEXT_MIME_TYPE,
+                            metadata={"warning": "tomllib/tomli not available, returning raw text"},
+                            chunks=[],
+                        )
+                data = tomllib.loads(text_content)
+            else:
+                try:
+                    import yaml
+
+                    data = yaml.safe_load(text_content)
+                except ImportError:
+                    return ExtractionResult(
+                        content=normalize_spaces(text_content),
+                        mime_type=PLAIN_TEXT_MIME_TYPE,
+                        metadata={"warning": "PyYAML not available, returning raw text"},
+                        chunks=[],
+                    )
+
+            text_parts: list[str] = []
+            metadata: dict[str, Any] = {}
+
+            if isinstance(data, dict):
+                text_parts.extend(self._extract_from_dict(data, metadata))
+            elif isinstance(data, list):
+                text_parts.extend(self._extract_from_list(data, metadata))
+            else:
+                text_parts.append(str(data))
+
+            combined_text = "\n".join(text_parts) if text_parts else text_content
+
+            return ExtractionResult(
+                content=normalize_spaces(combined_text),
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata=normalize_metadata(metadata),
+                chunks=[],
+            )
+
+        except (ValueError, TypeError, KeyError, AttributeError, UnicodeDecodeError) as e:
+            return ExtractionResult(
+                content=normalize_spaces(text_content),
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata={"parse_error": str(e)},
+                chunks=[],
+            )
+
+    def extract_path_sync(self, path: Path) -> ExtractionResult:
+        content = path.read_bytes()
+        return self.extract_bytes_sync(content)
+
+    def _extract_from_dict(self, data: dict[str, Any], metadata: dict[str, Any], prefix: str = "") -> list[str]:
+        text_parts = []
+
+        for key, value in data.items():
+            full_key = f"{prefix}.{key}" if prefix else key
+
+            if isinstance(value, str) and value.strip():
+                text_parts.append(f"{full_key}: {value}")
+
+                if any(
+                    text_field in key.lower()
+                    for text_field in ["title", "name", "subject", "description", "content", "body", "text", "message"]
+                ):
+                    metadata[full_key] = value
+
+            elif isinstance(value, (int, float, bool)):
+                text_parts.append(f"{full_key}: {value}")
+
+            elif isinstance(value, dict):
+                text_parts.extend(self._extract_from_dict(value, metadata, full_key))
+
+            elif isinstance(value, list):
+                text_parts.extend(self._extract_from_list(value, metadata, full_key))
+
+            elif value is not None:
+                text_parts.append(f"{full_key}: {value!s}")
+
+        return text_parts
+
+    def _extract_from_list(self, data: list[Any], metadata: dict[str, Any], prefix: str = "") -> list[str]:
+        text_parts = []
+
+        for i, item in enumerate(data):
+            item_key = f"{prefix}[{i}]" if prefix else f"item_{i}"
+
+            if isinstance(item, str) and item.strip():
+                text_parts.append(f"{item_key}: {item}")
+
+            elif isinstance(item, dict):
+                text_parts.extend(self._extract_from_dict(item, metadata, item_key))
+
+            elif isinstance(item, list):
+                text_parts.extend(self._extract_from_list(item, metadata, item_key))
+
+            elif item is not None:
+                text_parts.append(f"{item_key}: {item!s}")
+
+        return text_parts
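For orientation, a minimal usage sketch of the new extractor. The constructor call is an assumption (this diff does not show how the `Extractor` base class is built; a `mime_type` plus `ExtractionConfig` is assumed here), so treat the setup lines as illustrative:

import json

from kreuzberg import ExtractionConfig  # assumed top-level export
from kreuzberg._extractors._structured import StructuredDataExtractor

extractor = StructuredDataExtractor(mime_type="application/json", config=ExtractionConfig())

# Nested keys are flattened to dotted paths; keys containing "title", "name",
# "description", etc. are also copied into the result metadata.
payload = json.dumps({"title": "Q3 Report", "stats": {"pages": 12}}).encode()
result = extractor.extract_bytes_sync(payload)
print(result.content)   # expected (subject to whitespace normalization): "title: Q3 Report" / "stats.pages: 12"
print(result.metadata)  # expected to include {"title": "Q3 Report"}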
kreuzberg/_gmft.py
CHANGED
@@ -1,12 +1,17 @@
 from __future__ import annotations
 
+import multiprocessing as mp
 import os
+import queue
+import signal
+import traceback
 from dataclasses import dataclass, field
+from io import StringIO
 from typing import TYPE_CHECKING, Any, Literal
 
 from kreuzberg._types import TableData
 from kreuzberg._utils._sync import run_sync
-from kreuzberg.exceptions import MissingDependencyError
+from kreuzberg.exceptions import MissingDependencyError, ParsingError
 
 if TYPE_CHECKING:
     from os import PathLike
@@ -196,9 +201,7 @@ async def extract_tables(  # noqa: PLR0915
 
     try:
         if use_isolated_process:
-
-
-            result = await extract_tables_isolated_async(file_path, config)
+            result = await _extract_tables_isolated_async(file_path, config)
 
             await table_cache.aset(result, **cache_kwargs)
 
@@ -210,7 +213,7 @@ async def extract_tables(  # noqa: PLR0915
         from gmft.formatters.tatr import TATRFormatConfig
         from gmft.pdf_bindings.pdfium import PyPDFium2Document
 
-        formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]
+        formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call] # type: ignore[no-untyped-call]
            config=TATRFormatConfig(
                 verbosity=config.verbosity,
                 formatter_base_threshold=config.formatter_base_threshold,
@@ -226,7 +229,7 @@ async def extract_tables(  # noqa: PLR0915
                 force_large_table_assumption=config.force_large_table_assumption,
             )
         )
-        detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call]
+        detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call] # type: ignore[no-untyped-call]
             config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
         )
         doc = await run_sync(PyPDFium2Document, str(file_path))
@@ -247,7 +250,7 @@ async def extract_tables(  # noqa: PLR0915
                     text=data_frame.to_markdown(),
                     df=data_frame,
                 )
-                for data_frame, cropped_table in zip(dataframes, cropped_tables)
+                for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
             ]
 
             await table_cache.aset(result, **cache_kwargs)
@@ -314,9 +317,7 @@ def extract_tables_sync(
         return cached_result  # type: ignore[no-any-return]
 
     if use_isolated_process:
-
-
-        result = extract_tables_isolated(file_path, config)
+        result = _extract_tables_isolated(file_path, config)
 
        table_cache.set(result, **cache_kwargs)
 
@@ -365,7 +366,7 @@ def extract_tables_sync(
                 text=data_frame.to_markdown(),
                 df=data_frame,
             )
-            for data_frame, cropped_table in zip(dataframes, cropped_tables)
+            for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
         ]
 
         table_cache.set(result, **cache_kwargs)
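Both comprehensions now pass `strict=False` explicitly, pinning the historical truncating behavior of `zip()` while satisfying lint rules that require the keyword. A one-line illustration:

# strict=False (the historical default) truncates to the shorter input;
# strict=True would raise ValueError on a length mismatch (Python 3.10+).
pairs = list(zip([1, 2, 3], ["a", "b"], strict=False))  # [(1, 'a'), (2, 'b')]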
@@ -378,3 +379,309 @@ def extract_tables_sync(
         raise MissingDependencyError.create_for_package(
             dependency_group="gmft", functionality="table extraction", package_name="gmft"
         ) from e
+
+
+def _extract_tables_in_process(
+    file_path: str | PathLike[str],
+    config_dict: dict[str, Any],
+    result_queue: queue.Queue[tuple[bool, Any]],
+) -> None:
+    """Extract tables in an isolated process to handle potential segfaults.
+
+    Args:
+        file_path: Path to the PDF file
+        config_dict: Serialized GMFTConfig as a dict
+        result_queue: Queue to put results or errors
+    """
+    signal.signal(signal.SIGINT, signal.SIG_IGN)
+
+    try:
+        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
+        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
+        from gmft.formatters.tatr import TATRFormatConfig
+        from gmft.pdf_bindings.pdfium import PyPDFium2Document
+
+        config = GMFTConfig(**config_dict)
+
+        formatter = AutoTableFormatter(  # type: ignore[no-untyped-call]
+            config=TATRFormatConfig(
+                verbosity=config.verbosity,
+                formatter_base_threshold=config.formatter_base_threshold,
+                cell_required_confidence=config.cell_required_confidence,
+                remove_null_rows=config.remove_null_rows,
+                enable_multi_header=config.enable_multi_header,
+                semantic_spanning_cells=config.semantic_spanning_cells,
+                semantic_hierarchical_left_fill=config.semantic_hierarchical_left_fill,
+                large_table_if_n_rows_removed=config.large_table_if_n_rows_removed,
+                large_table_threshold=config.large_table_threshold,
+                large_table_row_overlap_threshold=config.large_table_row_overlap_threshold,
+                large_table_maximum_rows=config.large_table_maximum_rows,
+                force_large_table_assumption=config.force_large_table_assumption,
+            )
+        )
+        detector = AutoTableDetector(config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold))  # type: ignore[no-untyped-call]
+
+        doc = PyPDFium2Document(str(file_path))
+        cropped_tables = []
+        dataframes = []
+
+        try:
+            for page in doc:
+                cropped_tables.extend(detector.extract(page))  # type: ignore[attr-defined]
+
+            for cropped_table in cropped_tables:
+                formatted_table = formatter.extract(cropped_table)  # type: ignore[attr-defined]
+                dataframes.append(formatted_table.df())
+
+            results = []
+            for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False):
+                import io
+
+                img_bytes = io.BytesIO()
+                cropped_image = cropped_table.image()
+                cropped_image.save(img_bytes, format="PNG")
+                img_bytes.seek(0)
+
+                results.append(
+                    {
+                        "cropped_image_bytes": img_bytes.getvalue(),
+                        "page_number": cropped_table.page.page_number,
+                        "text": data_frame.to_markdown(),
+                        "df_csv": data_frame.to_csv(index=False),
+                    }
+                )
+
+            result_queue.put((True, results))
+
+        finally:
+            doc.close()  # type: ignore[no-untyped-call]
+
+    except Exception as e:  # noqa: BLE001
+        error_info = {"error": str(e), "type": type(e).__name__, "traceback": traceback.format_exc()}
+        result_queue.put((False, error_info))
+
+
+def _extract_tables_isolated(
+    file_path: str | PathLike[str],
+    config: GMFTConfig | None = None,
+    timeout: float = 300.0,
+) -> list[TableData]:
+    """Extract tables using an isolated process to handle segfaults.
+
+    Args:
+        file_path: Path to the PDF file
+        config: GMFT configuration
+        timeout: Maximum time to wait for extraction
+
+    Returns:
+        List of extracted tables
+
+    Raises:
+        RuntimeError: If extraction fails or times out
+    """
+    config = config or GMFTConfig()
+    config_dict = config.__dict__.copy()
+
+    ctx = mp.get_context("spawn")
+    result_queue = ctx.Queue()
+
+    process = ctx.Process(
+        target=_extract_tables_in_process,
+        args=(str(file_path), config_dict, result_queue),
+    )
+
+    process.start()
+
+    try:
+        # Wait for result with timeout, checking for process death  # ~keep
+        import time
+
+        start_time = time.time()
+        while True:
+            try:
+                success, result = result_queue.get_nowait()
+                break
+            except queue.Empty:
+                if time.time() - start_time > timeout:
+                    raise
+
+                if not process.is_alive():
+                    # Process died without putting result  # ~keep
+                    if process.exitcode == -signal.SIGSEGV:
+                        raise ParsingError(
+                            "GMFT process crashed with segmentation fault",
+                            context={
+                                "file_path": str(file_path),
+                                "exit_code": process.exitcode,
+                            },
+                        ) from None
+                    raise ParsingError(
+                        f"GMFT process died unexpectedly with exit code {process.exitcode}",
+                        context={
+                            "file_path": str(file_path),
+                            "exit_code": process.exitcode,
+                        },
+                    ) from None
+
+                time.sleep(0.1)
+
+        if success:
+            tables = []
+            for table_dict in result:
+                import io
+
+                from PIL import Image
+
+                img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
+                import pandas as pd
+
+                df = pd.read_csv(StringIO(table_dict["df_csv"]))
+
+                tables.append(
+                    TableData(
+                        cropped_image=img,
+                        page_number=table_dict["page_number"],
+                        text=table_dict["text"],
+                        df=df,
+                    )
+                )
+
+            return tables
+
+        error_info = result
+        raise ParsingError(
+            f"GMFT table extraction failed: {error_info['error']}",
+            context={
+                "file_path": str(file_path),
+                "error_type": error_info["type"],
+                "traceback": error_info["traceback"],
+            },
+        )
+
+    except queue.Empty as e:
+        raise ParsingError(
+            "GMFT table extraction timed out",
+            context={
+                "file_path": str(file_path),
+                "timeout": timeout,
+            },
+        ) from e
+    finally:
+        if process.is_alive():
+            process.terminate()
+            process.join(timeout=5)
+            if process.is_alive():
+                process.kill()
+                process.join()
+
+
+async def _extract_tables_isolated_async(
+    file_path: str | PathLike[str],
+    config: GMFTConfig | None = None,
+    timeout: float = 300.0,
+) -> list[TableData]:
+    """Async version of extract_tables_isolated using asyncio.
+
+    Args:
+        file_path: Path to the PDF file
+        config: GMFT configuration
+        timeout: Maximum time to wait for extraction
+
+    Returns:
+        List of extracted tables
+
+    Raises:
+        RuntimeError: If extraction fails or times out
+    """
+    import anyio
+
+    config = config or GMFTConfig()
+    config_dict = config.__dict__.copy()
+
+    ctx = mp.get_context("spawn")
+    result_queue = ctx.Queue()
+
+    process = ctx.Process(
+        target=_extract_tables_in_process,
+        args=(str(file_path), config_dict, result_queue),
+    )
+
+    process.start()
+
+    try:
+
+        async def wait_for_result() -> tuple[bool, Any]:
+            while True:
+                try:
+                    return result_queue.get_nowait()  # type: ignore[no-any-return]
+                except queue.Empty:  # noqa: PERF203
+                    await anyio.sleep(0.1)
+                    if not process.is_alive():
+                        # Process died without putting result  # ~keep
+                        if process.exitcode == -signal.SIGSEGV:
+                            raise ParsingError(
+                                "GMFT process crashed with segmentation fault",
+                                context={
+                                    "file_path": str(file_path),
+                                    "exit_code": process.exitcode,
+                                },
+                            ) from None
+                        raise ParsingError(
+                            f"GMFT process died unexpectedly with exit code {process.exitcode}",
+                            context={
+                                "file_path": str(file_path),
+                                "exit_code": process.exitcode,
+                            },
+                        ) from None
+
+        with anyio.fail_after(timeout):
+            success, result = await wait_for_result()
+
+        if success:
+            tables = []
+            for table_dict in result:
+                import io
+
+                from PIL import Image
+
+                img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
+                import pandas as pd
+
+                df = pd.read_csv(StringIO(table_dict["df_csv"]))
+
+                tables.append(
+                    TableData(
+                        cropped_image=img,
+                        page_number=table_dict["page_number"],
+                        text=table_dict["text"],
+                        df=df,
+                    )
+                )
+
+            return tables
+
+        error_info = result
+        raise ParsingError(
+            f"GMFT table extraction failed: {error_info['error']}",
+            context={
+                "file_path": str(file_path),
+                "error_type": error_info["type"],
+                "traceback": error_info["traceback"],
+            },
+        )
+
+    except TimeoutError as e:
+        raise ParsingError(
+            "GMFT table extraction timed out",
+            context={
+                "file_path": str(file_path),
+                "timeout": timeout,
+            },
+        ) from e
+    finally:
+        if process.is_alive():
+            process.terminate()
+            await anyio.to_thread.run_sync(lambda: process.join(timeout=5))
+            if process.is_alive():
+                process.kill()
+                await anyio.to_thread.run_sync(process.join)
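The pattern behind these helpers generalizes: run crash-prone native code in a spawned child process and treat a missing result as a crash. A stripped-down sketch under that assumption (illustrative names only, not part of kreuzberg's API):

import multiprocessing as mp
import queue
import time
from typing import Any


def _risky_work(result_queue: Any) -> None:
    # Native code that may segfault would run here; report success via the queue.
    result_queue.put((True, "ok"))


def run_isolated(timeout: float = 30.0) -> Any:
    ctx = mp.get_context("spawn")  # fresh interpreter: no inherited locks or state
    result_queue = ctx.Queue()
    process = ctx.Process(target=_risky_work, args=(result_queue,))
    process.start()
    deadline = time.time() + timeout
    try:
        while True:
            try:
                success, payload = result_queue.get_nowait()
                break
            except queue.Empty:
                if time.time() > deadline:
                    raise TimeoutError("child did not respond in time") from None
                if not process.is_alive():
                    # Child died without reporting; a negative exitcode is the signal number.
                    raise RuntimeError(f"child died with exit code {process.exitcode}")
                time.sleep(0.1)
        if not success:
            raise RuntimeError(f"child failed: {payload}")
        return payload
    finally:
        if process.is_alive():
            process.terminate()
            process.join(timeout=5)


if __name__ == "__main__":
    print(run_isolated())  # -> "ok"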
kreuzberg/_language_detection.py
ADDED
@@ -0,0 +1,95 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import TYPE_CHECKING, Any
+
+from kreuzberg.exceptions import MissingDependencyError
+
+if TYPE_CHECKING:
+    from fast_langdetect import LangDetectConfig as FastLangDetectConfig
+
+try:
+    from fast_langdetect import LangDetectConfig as FastLangDetectConfig
+    from fast_langdetect import detect, detect_multilingual
+
+    HAS_FAST_LANGDETECT = True
+except ImportError:
+    HAS_FAST_LANGDETECT = False
+    detect = None
+    detect_multilingual = None
+    FastLangDetectConfig = None
+
+_CACHE_SIZE = 128
+
+
+@dataclass(frozen=True)
+class LanguageDetectionConfig:
+    """Configuration for language detection.
+
+    Attributes:
+        low_memory: If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
+            Defaults to True for better memory efficiency.
+        top_k: Maximum number of languages to return for multilingual detection. Defaults to 3.
+        multilingual: If True, uses multilingual detection to handle mixed-language text.
+            If False, uses single language detection. Defaults to False.
+        cache_dir: Custom directory for model cache. If None, uses system default.
+        allow_fallback: If True, falls back to small model if large model fails. Defaults to True.
+    """
+
+    low_memory: bool = True
+    top_k: int = 3
+    multilingual: bool = False
+    cache_dir: str | None = None
+    allow_fallback: bool = True
+
+
+def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangDetectConfig | None:
+    """Create FastLangDetectConfig from our config."""
+    if not HAS_FAST_LANGDETECT or FastLangDetectConfig is None:
+        return None
+
+    kwargs: dict[str, Any] = {
+        "allow_fallback": config.allow_fallback,
+    }
+    if config.cache_dir is not None:
+        kwargs["cache_dir"] = config.cache_dir
+
+    return FastLangDetectConfig(**kwargs)
+
+
+@lru_cache(maxsize=_CACHE_SIZE)
+def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -> list[str] | None:
+    """Detect the most probable languages in the given text using fast-langdetect.
+
+    Args:
+        text: The text to analyze.
+        config: Configuration for language detection. If None, uses defaults.
+
+    Returns:
+        A list of detected language codes in lowercase (e.g., ['en', 'de', 'fr']),
+        or None if detection fails.
+
+    Raises:
+        MissingDependencyError: If fast-langdetect is not installed.
+    """
+    if not HAS_FAST_LANGDETECT or detect is None or detect_multilingual is None:
+        raise MissingDependencyError.create_for_package(
+            dependency_group="langdetect", functionality="language detection", package_name="fast-langdetect"
+        )
+
+    if config is None:
+        config = LanguageDetectionConfig()
+
+    try:
+        if config.multilingual:
+            results = detect_multilingual(text, low_memory=config.low_memory, k=config.top_k)
+
+            return [result["lang"].lower() for result in results if result.get("lang")]
+
+        result = detect(text, low_memory=config.low_memory)
+        if result and result.get("lang"):
+            return [result["lang"].lower()]
+        return None
+    except Exception:  # noqa: BLE001
+        return None
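A quick usage sketch for the new helper, using only names defined in this file (requires the optional `langdetect` dependency group; the exact codes returned depend on the fast-langdetect model):

from kreuzberg._language_detection import LanguageDetectionConfig, detect_languages

config = LanguageDetectionConfig(multilingual=True, top_k=2)
langs = detect_languages("Das ist ein Test. This is a test.", config)
print(langs)  # e.g. ['de', 'en']

Because `LanguageDetectionConfig` is a frozen (hashable) dataclass, it can be passed to the `lru_cache`-wrapped `detect_languages`, so repeated calls with the same text and config hit the cache.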