kreuzberg 3.8.1__py3-none-any.whl → 3.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +4 -0
- kreuzberg/_api/main.py +22 -1
- kreuzberg/_chunker.py +3 -3
- kreuzberg/_config.py +404 -0
- kreuzberg/_document_classification.py +156 -0
- kreuzberg/_entity_extraction.py +6 -6
- kreuzberg/_extractors/_image.py +4 -3
- kreuzberg/_extractors/_pdf.py +40 -29
- kreuzberg/_extractors/_spread_sheet.py +6 -8
- kreuzberg/_extractors/_structured.py +34 -25
- kreuzberg/_gmft.py +33 -42
- kreuzberg/_language_detection.py +1 -1
- kreuzberg/_mcp/server.py +58 -8
- kreuzberg/_mime_types.py +1 -1
- kreuzberg/_ocr/_base.py +1 -1
- kreuzberg/_ocr/_easyocr.py +5 -5
- kreuzberg/_ocr/_paddleocr.py +4 -4
- kreuzberg/_ocr/_tesseract.py +12 -21
- kreuzberg/_playa.py +2 -3
- kreuzberg/_types.py +65 -27
- kreuzberg/_utils/_cache.py +14 -17
- kreuzberg/_utils/_device.py +17 -27
- kreuzberg/_utils/_errors.py +41 -38
- kreuzberg/_utils/_quality.py +7 -11
- kreuzberg/_utils/_serialization.py +21 -16
- kreuzberg/_utils/_string.py +22 -12
- kreuzberg/_utils/_table.py +3 -4
- kreuzberg/cli.py +5 -5
- kreuzberg/exceptions.py +10 -0
- kreuzberg/extraction.py +20 -11
- kreuzberg-3.9.0.dist-info/METADATA +269 -0
- kreuzberg-3.9.0.dist-info/RECORD +54 -0
- kreuzberg/_cli_config.py +0 -175
- kreuzberg-3.8.1.dist-info/METADATA +0 -301
- kreuzberg-3.8.1.dist-info/RECORD +0 -53
- {kreuzberg-3.8.1.dist-info → kreuzberg-3.9.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.8.1.dist-info → kreuzberg-3.9.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.8.1.dist-info → kreuzberg-3.9.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_image.py
CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
 import contextlib
 import os
 import tempfile
+from dataclasses import asdict
 from pathlib import Path
 from typing import TYPE_CHECKING, ClassVar
 
@@ -88,17 +89,17 @@ class ImageExtractor(Extractor):
            config = (
                self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
            )
-           result = backend.process_file_sync(path, **config…
+           result = backend.process_file_sync(path, **asdict(config))
        elif self.config.ocr_backend == "paddleocr":
            paddle_config = (
                self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
            )
-           result = backend.process_file_sync(path, **paddle_config…
+           result = backend.process_file_sync(path, **asdict(paddle_config))
        elif self.config.ocr_backend == "easyocr":
            easy_config = (
                self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
            )
-           result = backend.process_file_sync(path, **easy_config…
+           result = backend.process_file_sync(path, **asdict(easy_config))
        else:
            raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
        return self._apply_quality_processing(result)
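The `asdict` change matters because a dataclass instance is not a mapping, so `**config` cannot be unpacked as keyword arguments. A minimal sketch of the pattern; the config class and processing function here are hypothetical stand-ins, not kreuzberg's real API:

```python
from dataclasses import asdict, dataclass


@dataclass
class OcrConfig:  # hypothetical stand-in for TesseractConfig etc.
    language: str = "eng"
    psm: int = 3


def process_file_sync(path: str, **kwargs: object) -> None:
    # A real backend would run OCR here; we only show the call shape.
    print(path, kwargs)


config = OcrConfig()
# process_file_sync("scan.png", **config) would raise
# "TypeError: argument after ** must be a mapping"; asdict() bridges the gap.
process_file_sync("scan.png", **asdict(config))
```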
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
 import contextlib
 import os
 import tempfile
+from dataclasses import asdict
 from multiprocessing import cpu_count
 from pathlib import Path
 from re import Pattern
@@ -58,9 +59,13 @@ class PDFExtractor(Extractor):
        result: ExtractionResult | None = None
 
        if not self.config.force_ocr:
-           …
-           …
-           …
+           try:
+               content = await self._extract_pdf_searchable_text(path)
+               if self._validate_extracted_text(content):
+                   result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
+           except ParsingError:
+               # If searchable text extraction fails, continue to OCR or empty result
+               pass
 
        if not result and self.config.ocr_backend is not None:
            result = await self._extract_pdf_text_with_ocr(path, self.config.ocr_backend)
@@ -73,7 +78,7 @@ class PDFExtractor(Extractor):
        if self.config.extract_tables:
            # GMFT is optional dependency
            try:
-               from kreuzberg._gmft import extract_tables
+               from kreuzberg._gmft import extract_tables  # noqa: PLC0415
 
                result.tables = await extract_tables(path, self.config.gmft_config)
            except ImportError:
@@ -112,16 +117,19 @@ class PDFExtractor(Extractor):
 
    def extract_path_sync(self, path: Path) -> ExtractionResult:
        """Pure sync implementation of PDF extraction from path."""
-       …
+       try:
+           text = self._extract_pdf_searchable_text_sync(path)
+       except ParsingError:
+           text = ""
 
-       if self.config.force_ocr or not self._validate_extracted_text(text):
+       if (self.config.force_ocr or not self._validate_extracted_text(text)) and self.config.ocr_backend is not None:
            text = self._extract_pdf_with_ocr_sync(path)
 
        tables = []
        if self.config.extract_tables:
            # GMFT is optional dependency
            try:
-               from kreuzberg._gmft import extract_tables_sync
+               from kreuzberg._gmft import extract_tables_sync  # noqa: PLC0415
 
                tables = extract_tables_sync(path)
            except ImportError:
@@ -248,9 +256,10 @@ class PDFExtractor(Extractor):
            *[backend.process_image(image, **self.config.get_config_dict()) for image in images],
            batch_size=cpu_count(),
        )
-       …
-       …
-       …
+       # Use list comprehension and join for efficient string building
+       content = "\n".join(result.content for result in ocr_results)
+
+       return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
 
    @staticmethod
    async def _extract_pdf_searchable_text(input_file: Path) -> str:
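The "efficient string building" comment refers to a general CPython point: repeated `+=` on a `str` can copy the accumulated text on each iteration, while collecting parts and joining once stays linear. A small self-contained illustration, not kreuzberg code:

```python
pages = [f"page {i}" for i in range(1000)]

# Quadratic-ish pattern: each += may copy everything accumulated so far.
text = ""
for page in pages:
    text += page + "\n"

# Linear pattern used by the new code: one allocation at the end.
text = "\n".join(pages)
```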
@@ -269,22 +278,24 @@ class PDFExtractor(Extractor):
        try:
            with pypdfium_file_lock(input_file):
                document = await run_sync(pypdfium2.PdfDocument, str(input_file))
-               …
+               pages_content = []
                page_errors = []
 
                for i, page in enumerate(cast("pypdfium2.PdfDocument", document)):
                    try:
                        text_page = page.get_textpage()
-                       …
+                       page_content = text_page.get_text_bounded()
+                       pages_content.append(page_content)
                    except Exception as e:  # noqa: PERF203, BLE001
                        page_errors.append({"page": i + 1, "error": str(e)})
-                       …
+                       pages_content.append(f"[Error extracting page {i + 1}]")
 
-               text = "\n".join(…
+               text = "\n".join(pages_content)
+               has_content = bool(text.strip())
 
-               if page_errors and …
+               if page_errors and has_content:
                    return normalize_spaces(text)
-               if not …
+               if not has_content:
                    raise ParsingError(
                        "Could not extract any text from PDF",
                        context=create_error_context(
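The per-page pattern above tolerates partial failures: a bad page contributes a visible placeholder instead of aborting the whole document, and a hard error is raised only when nothing at all was extracted. A condensed, hypothetical sketch of that control flow; the page list and names are invented for illustration:

```python
def extract_pages(pages: list[str]) -> str:
    # Each entry simulates one page; "BAD" stands in for a page that fails to parse.
    parts: list[str] = []
    errors: list[dict] = []
    for i, page in enumerate(pages):
        if page == "BAD":
            errors.append({"page": i + 1, "error": "parse failure"})
            parts.append(f"[Error extracting page {i + 1}]")
        else:
            parts.append(page)
    text = "\n".join(parts)
    if not text.strip():
        # Only a total failure is fatal.
        raise ValueError("Could not extract any text")
    return text


print(extract_pages(["hello", "BAD", "world"]))
```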
@@ -315,14 +326,14 @@ class PDFExtractor(Extractor):
        try:
            with pypdfium_file_lock(path):
                pdf = pypdfium2.PdfDocument(str(path))
-               …
+               pages_text = []
                for page in pdf:
                    text_page = page.get_textpage()
                    text = text_page.get_text_bounded()
-                   …
+                   pages_text.append(text)
                    text_page.close()
                    page.close()
-               return "".join(…
+               return "\n".join(pages_text)
        except Exception as e:
            raise ParsingError(f"Failed to extract PDF text: {e}") from e
        finally:
@@ -378,22 +389,22 @@ class PDFExtractor(Extractor):
            config = (
                self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
            )
-           results = backend.process_batch_sync(paths, **config…
+           results = backend.process_batch_sync(paths, **asdict(config))
        elif self.config.ocr_backend == "paddleocr":
            paddle_config = (
                self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
            )
-           results = backend.process_batch_sync(paths, **paddle_config…
+           results = backend.process_batch_sync(paths, **asdict(paddle_config))
        elif self.config.ocr_backend == "easyocr":
            easy_config = (
                self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
            )
-           results = backend.process_batch_sync(paths, **easy_config…
+           results = backend.process_batch_sync(paths, **asdict(easy_config))
        else:
            raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
 
-       …
-       return "\n\n".join(…
+       # Use list comprehension and join for efficient string building
+       return "\n\n".join(result.content for result in results)
 
    def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
        """Extract text using playa for better structure preservation."""
@@ -401,14 +412,14 @@ class PDFExtractor(Extractor):
        content = path.read_bytes()
        document = parse(content, max_workers=1)
 
-       …
+       # Extract text while preserving structure
+       pages_text = []
        for page in document.pages:
-           # Extract text while preserving structure
            page_text = page.extract_text()
            if page_text and page_text.strip():
-               …
+               pages_text.append(page_text)
 
-       if …
-           return "\n\n".join(…
+       if pages_text:
+           return "\n\n".join(pages_text)
 
        return fallback_text
kreuzberg/_extractors/_spread_sheet.py
CHANGED
@@ -2,13 +2,16 @@ from __future__ import annotations
 
 import contextlib
 import csv
+import os
 import sys
+import tempfile
 from datetime import date, datetime, time, timedelta
 from io import StringIO
 from pathlib import Path
 from typing import Any
 
 from anyio import Path as AsyncPath
+from PIL import Image
 from python_calamine import CalamineWorkbook
 
 from kreuzberg._extractors._base import Extractor
@@ -68,9 +71,6 @@ class SpreadSheetExtractor(Extractor):
 
    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        """Pure sync implementation of extract_bytes."""
-       import os
-       import tempfile
-
        fd, temp_path = tempfile.mkstemp(suffix=".xlsx")
 
        try:
@@ -198,9 +198,9 @@ class SpreadSheetExtractor(Extractor):
        """Enhanced sheet processing with better table structure preservation."""
        try:
            # pandas is optional dependency
-           import pandas as pd
+           import pandas as pd  # noqa: PLC0415
 
-           from kreuzberg._utils._table import enhance_table_markdown
+           from kreuzberg._utils._table import enhance_table_markdown  # noqa: PLC0415
 
            sheet = workbook.get_sheet_by_name(sheet_name)
            data = sheet.to_python()
@@ -218,9 +218,7 @@ class SpreadSheetExtractor(Extractor):
            return f"## {sheet_name}\n\n*No data*"
 
        # Create a mock TableData for enhanced formatting
-       from …
-
-       from kreuzberg._types import TableData
+       from kreuzberg._types import TableData  # noqa: PLC0415
 
        # Create a 1x1 transparent image as placeholder
        placeholder_image = Image.new("RGBA", (1, 1), (0, 0, 0, 0))
kreuzberg/_extractors/_structured.py
CHANGED
@@ -1,8 +1,22 @@
 from __future__ import annotations
 
 import json
+import sys
 from typing import TYPE_CHECKING, Any, ClassVar
 
+if sys.version_info >= (3, 11):
+    import tomllib
+else:
+    try:
+        import tomli as tomllib  # type: ignore[import-not-found]
+    except ImportError:
+        tomllib = None
+
+try:
+    import yaml
+except ImportError:
+    yaml = None
+
 from anyio import Path as AsyncPath
 
 from kreuzberg._extractors._base import Extractor
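The import block above uses a sentinel: the module always imports cleanly, and the missing-dependency check moves from import time to parse time. A hedged, self-contained sketch of the consuming side; the function name is invented for illustration:

```python
import sys

if sys.version_info >= (3, 11):
    import tomllib
else:
    try:
        import tomli as tomllib  # same sentinel pattern as the diff
    except ImportError:
        tomllib = None


def parse_toml_or_none(text: str):
    # Capability check at call time, not import time.
    if tomllib is None:
        return None  # caller falls back to treating input as raw text
    return tomllib.loads(text)


print(parse_toml_or_none('key = "value"'))
```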
@@ -14,6 +28,9 @@ from kreuzberg._utils._sync import run_sync
 if TYPE_CHECKING:
     from pathlib import Path
 
+# Define text field keywords as a set for O(1) membership testing
+_TEXT_FIELD_KEYWORDS = frozenset({"title", "name", "subject", "description", "content", "body", "text", "message"})
+
 
 class StructuredDataExtractor(Extractor):
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
@@ -41,41 +58,34 @@ class StructuredDataExtractor(Extractor):
            if self.mime_type in {JSON_MIME_TYPE, "text/json"}:
                data = json.loads(text_content)
            elif self.mime_type in {TOML_MIME_TYPE, "text/toml"}:
-               …
-               …
-               …
-               …
-               …
-               …
-               …
-                   content=normalize_spaces(text_content),
-                   mime_type=PLAIN_TEXT_MIME_TYPE,
-                   metadata={"warning": "tomllib/tomli not available, returning raw text"},
-                   chunks=[],
-               )
+               if tomllib is None:
+                   return ExtractionResult(
+                       content=normalize_spaces(text_content),
+                       mime_type=PLAIN_TEXT_MIME_TYPE,
+                       metadata={"warning": "tomllib/tomli not available, returning raw text"},
+                       chunks=[],
+                   )
                data = tomllib.loads(text_content)
            else:
-               …
-               import yaml
-               …
-               data = yaml.safe_load(text_content)
-           except ImportError:
+               if yaml is None:
                    return ExtractionResult(
                        content=normalize_spaces(text_content),
                        mime_type=PLAIN_TEXT_MIME_TYPE,
                        metadata={"warning": "PyYAML not available, returning raw text"},
                        chunks=[],
                    )
+               data = yaml.safe_load(text_content)
 
            text_parts: list[str] = []
            metadata: dict[str, Any] = {}
 
+           # Use match statement for cleaner code and avoid multiple isinstance calls
            if isinstance(data, dict):
-               text_parts…
+               text_parts = self._extract_from_dict(data, metadata)
            elif isinstance(data, list):
-               text_parts…
+               text_parts = self._extract_from_list(data, metadata)
            else:
-               text_parts…
+               text_parts = [str(data)]
 
            combined_text = "\n".join(text_parts) if text_parts else text_content
 
@@ -86,7 +96,7 @@ class StructuredDataExtractor(Extractor):
                chunks=[],
            )
 
-       except (ValueError, TypeError…
+       except (json.JSONDecodeError, ValueError, TypeError) as e:
            return ExtractionResult(
                content=normalize_spaces(text_content),
                mime_type=PLAIN_TEXT_MIME_TYPE,
@@ -107,10 +117,9 @@ class StructuredDataExtractor(Extractor):
            if isinstance(value, str) and value.strip():
                text_parts.append(f"{full_key}: {value}")
 
-               if any…
-               …
-               …
-               ):
+               # Check if key contains any text field keywords efficiently
+               key_lower = key.lower()
+               if any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS):
                    metadata[full_key] = value
 
            elif isinstance(value, (int, float, bool)):
kreuzberg/_gmft.py
CHANGED
@@ -1,14 +1,21 @@
 from __future__ import annotations
 
+import io
 import multiprocessing as mp
 import os
 import queue
 import signal
+import time
 import traceback
 from dataclasses import dataclass, field
 from io import StringIO
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Literal
 
+import anyio
+import msgspec
+from PIL import Image
+
 from kreuzberg._types import TableData
 from kreuzberg._utils._sync import run_sync
 from kreuzberg.exceptions import MissingDependencyError, ParsingError
@@ -20,7 +27,7 @@ if TYPE_CHECKING:
     from pandas import DataFrame
 
 
-@dataclass(unsafe_hash=True)
+@dataclass(unsafe_hash=True, slots=True)
 class GMFTConfig:
     """Configuration options for GMFT.
 
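`slots=True` (supported on dataclasses since Python 3.10) stores fields in fixed slots instead of a per-instance `__dict__`, cutting memory use and slightly speeding attribute access. A quick standalone demonstration of the behavior:

```python
from dataclasses import dataclass


@dataclass(unsafe_hash=True, slots=True)
class Point:
    x: int = 0
    y: int = 0


p = Point(1, 2)
print(hash(p))  # unsafe_hash=True generates __hash__

# With slots there is no per-instance __dict__, and new attributes
# cannot be attached by accident:
try:
    p.z = 3
except AttributeError as err:
    print(err)
```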
@@ -131,7 +138,7 @@
     """
 
 
-async def extract_tables(  # noqa: PLR0915
+async def extract_tables(
     file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
 ) -> list[TableData]:
     """Extracts tables from a PDF file.
@@ -151,9 +158,7 @@ async def extract_tables(  # noqa: PLR0915
     Returns:
         A list of table data dictionaries.
     """
-    from …
-
-    from kreuzberg._utils._cache import get_table_cache
+    from kreuzberg._utils._cache import get_table_cache  # noqa: PLC0415
 
     # Determine if we should use isolated process  # ~keep
     if use_isolated_process is None:
@@ -178,7 +183,7 @@ async def extract_tables(  # noqa: PLR0915
     cache_kwargs = {
         "file_info": str(sorted(file_info.items())),
         "extractor": "gmft",
-        "config": str(sorted(config.…
+        "config": str(sorted(msgspec.to_builtins(config).items())),
     }
 
     table_cache = get_table_cache()
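`msgspec.to_builtins` converts a dataclass (among other supported types) into plain builtins such as dicts, lists, strings, and numbers, which the code then sorts and stringifies into a deterministic cache key. A minimal sketch with a toy config, not GMFTConfig's real fields:

```python
from dataclasses import dataclass

import msgspec


@dataclass
class ToyConfig:  # hypothetical fields for illustration
    detector_threshold: float = 0.9
    verbosity: int = 0


config = ToyConfig()
plain = msgspec.to_builtins(config)     # {'detector_threshold': 0.9, 'verbosity': 0}
cache_key = str(sorted(plain.items()))  # order-independent text key
print(cache_key)
```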
@@ -187,8 +192,6 @@ async def extract_tables(  # noqa: PLR0915
         return cached_result  # type: ignore[no-any-return]
 
     if table_cache.is_processing(**cache_kwargs):
-        import anyio
-
         event = table_cache.mark_processing(**cache_kwargs)
         await anyio.to_thread.run_sync(event.wait)
 
@@ -208,10 +211,13 @@ async def extract_tables(  # noqa: PLR0915
         return result
 
     try:
-        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
-        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
-        from gmft.formatters.tatr import TATRFormatConfig
-        from gmft.pdf_bindings.pdfium import PyPDFium2Document
+        from gmft.auto import (  # type: ignore[attr-defined]  # noqa: PLC0415
+            AutoTableDetector,
+            AutoTableFormatter,
+        )
+        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]  # noqa: PLC0415
+        from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
+        from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415
 
         formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]
             config=TATRFormatConfig(
@@ -281,9 +287,7 @@ def extract_tables_sync(
     Returns:
         A list of table data dictionaries.
     """
-    from …
-
-    from kreuzberg._utils._cache import get_table_cache
+    from kreuzberg._utils._cache import get_table_cache  # noqa: PLC0415
 
     # Determine if we should use isolated process  # ~keep
     if use_isolated_process is None:
@@ -308,7 +312,7 @@ def extract_tables_sync(
     cache_kwargs = {
         "file_info": str(sorted(file_info.items())),
         "extractor": "gmft",
-        "config": str(sorted(config.…
+        "config": str(sorted(msgspec.to_builtins(config).items())),
     }
 
     table_cache = get_table_cache()
@@ -324,10 +328,10 @@ def extract_tables_sync(
         return result
 
     try:
-        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
-        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
-        from gmft.formatters.tatr import TATRFormatConfig
-        from gmft.pdf_bindings.pdfium import PyPDFium2Document
+        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]  # noqa: PLC0415
+        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]  # noqa: PLC0415
+        from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
+        from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415
 
         formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]
             config=TATRFormatConfig(
@@ -396,10 +400,10 @@ def _extract_tables_in_process(
     signal.signal(signal.SIGINT, signal.SIG_IGN)
 
     try:
-        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
-        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
-        from gmft.formatters.tatr import TATRFormatConfig
-        from gmft.pdf_bindings.pdfium import PyPDFium2Document
+        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]  # noqa: PLC0415
+        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]  # noqa: PLC0415
+        from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
+        from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415
 
         config = GMFTConfig(**config_dict)
 
@@ -435,8 +439,6 @@ def _extract_tables_in_process(
 
     results = []
     for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False):
-        import io
-
         img_bytes = io.BytesIO()
         cropped_image = cropped_table.image()
         cropped_image.save(img_bytes, format="PNG")
@@ -480,7 +482,7 @@ def _extract_tables_isolated(
         RuntimeError: If extraction fails or times out
     """
     config = config or GMFTConfig()
-    config_dict = …
+    config_dict = msgspec.to_builtins(config)
 
     ctx = mp.get_context("spawn")
     result_queue = ctx.Queue()
@@ -494,7 +496,6 @@ def _extract_tables_isolated(
 
     try:
         # Wait for result with timeout, checking for process death  # ~keep
-        import time
 
         start_time = time.time()
         while True:
@@ -528,12 +529,8 @@ def _extract_tables_isolated(
     if success:
         tables = []
         for table_dict in result:
-            import io
-
-            from PIL import Image
-
             img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
-            import pandas as pd
+            import pandas as pd  # noqa: PLC0415
 
             df = pd.read_csv(StringIO(table_dict["df_csv"]))
@@ -578,7 +575,7 @@ def _extract_tables_isolated(
 async def _extract_tables_isolated_async(
     file_path: str | PathLike[str],
     config: GMFTConfig | None = None,
-    timeout: float = 300.0,
+    timeout: float = 300.0,  # noqa: ASYNC109
 ) -> list[TableData]:
     """Async version of extract_tables_isolated using asyncio.
 
@@ -593,10 +590,8 @@ async def _extract_tables_isolated_async(
     Raises:
         RuntimeError: If extraction fails or times out
     """
-    import anyio
-
     config = config or GMFTConfig()
-    config_dict = …
+    config_dict = msgspec.to_builtins(config)
 
     ctx = mp.get_context("spawn")
     result_queue = ctx.Queue()
@@ -640,12 +635,8 @@ async def _extract_tables_isolated_async(
     if success:
         tables = []
         for table_dict in result:
-            import io
-
-            from PIL import Image
-
             img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
-            import pandas as pd
+            import pandas as pd  # noqa: PLC0415
 
             df = pd.read_csv(StringIO(table_dict["df_csv"]))
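Several of these hunks serve one refactor: imports that used to happen inside hot loops or deep in functions now load once at module import, and config objects serialize through msgspec before crossing the process boundary. The isolated-process machinery itself (spawn context, result queue, deadline polling) follows a standard pattern; here is a self-contained sketch of that pattern, not the kreuzberg implementation:

```python
import multiprocessing as mp
import queue
import time


def _worker(result_queue, payload: dict) -> None:
    # Heavy, crash-prone work runs here, fully isolated from the parent.
    result_queue.put(("ok", payload["x"] * 2))


def run_isolated(payload: dict, timeout: float = 5.0):
    ctx = mp.get_context("spawn")  # fresh interpreter, no inherited state
    result_queue = ctx.Queue()
    proc = ctx.Process(target=_worker, args=(result_queue, payload))
    proc.start()
    deadline = time.time() + timeout
    try:
        while True:
            try:
                return result_queue.get(timeout=0.1)
            except queue.Empty:
                if not proc.is_alive():     # worker died without reporting
                    raise RuntimeError("worker process died")
                if time.time() > deadline:  # enforce the overall budget
                    proc.terminate()
                    raise RuntimeError("extraction timed out")
    finally:
        proc.join()


if __name__ == "__main__":
    print(run_isolated({"x": 21}))  # ('ok', 42)
```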