kreuzberg 3.2.0__py3-none-any.whl → 3.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +3 -0
- kreuzberg/__main__.py +8 -0
- kreuzberg/_api/__init__.py +0 -0
- kreuzberg/_api/main.py +87 -0
- kreuzberg/_cli_config.py +175 -0
- kreuzberg/_extractors/_image.py +39 -4
- kreuzberg/_extractors/_pandoc.py +158 -18
- kreuzberg/_extractors/_pdf.py +199 -19
- kreuzberg/_extractors/_presentation.py +1 -1
- kreuzberg/_extractors/_spread_sheet.py +65 -7
- kreuzberg/_gmft.py +222 -16
- kreuzberg/_mime_types.py +62 -16
- kreuzberg/_multiprocessing/__init__.py +6 -0
- kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
- kreuzberg/_multiprocessing/process_manager.py +188 -0
- kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
- kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
- kreuzberg/_ocr/_easyocr.py +6 -12
- kreuzberg/_ocr/_paddleocr.py +15 -13
- kreuzberg/_ocr/_tesseract.py +136 -46
- kreuzberg/_playa.py +43 -0
- kreuzberg/_types.py +4 -0
- kreuzberg/_utils/_cache.py +372 -0
- kreuzberg/_utils/_device.py +10 -27
- kreuzberg/_utils/_document_cache.py +220 -0
- kreuzberg/_utils/_errors.py +232 -0
- kreuzberg/_utils/_pdf_lock.py +72 -0
- kreuzberg/_utils/_process_pool.py +100 -0
- kreuzberg/_utils/_serialization.py +82 -0
- kreuzberg/_utils/_string.py +1 -1
- kreuzberg/_utils/_sync.py +21 -0
- kreuzberg/cli.py +338 -0
- kreuzberg/extraction.py +247 -36
- kreuzberg-3.4.0.dist-info/METADATA +290 -0
- kreuzberg-3.4.0.dist-info/RECORD +50 -0
- {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/WHEEL +1 -2
- kreuzberg-3.4.0.dist-info/entry_points.txt +2 -0
- kreuzberg-3.2.0.dist-info/METADATA +0 -166
- kreuzberg-3.2.0.dist-info/RECORD +0 -34
- kreuzberg-3.2.0.dist-info/top_level.txt +0 -1
- {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
+import contextlib
 from multiprocessing import cpu_count
+from pathlib import Path
 from re import Pattern
 from re import compile as compile_regex
 from typing import TYPE_CHECKING, ClassVar, cast
@@ -14,14 +16,13 @@ from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr import get_ocr_backend
 from kreuzberg._playa import extract_pdf_metadata
 from kreuzberg._types import ExtractionResult, OcrBackendType
+from kreuzberg._utils._pdf_lock import pypdfium_file_lock
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup_batched
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ParsingError
 
 if TYPE_CHECKING:  # pragma: no cover
-    from pathlib import Path
-
     from PIL.Image import Image
 
 
@@ -69,10 +70,52 @@ class PDFExtractor(Extractor):
         return result
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-
+        """Pure sync implementation of PDF extraction from bytes."""
+        import os
+        import tempfile
+
+        fd, temp_path = tempfile.mkstemp(suffix=".pdf")
+        try:
+            with os.fdopen(fd, "wb") as f:
+                f.write(content)
+
+            result = self.extract_path_sync(Path(temp_path))
+
+            from kreuzberg._playa import extract_pdf_metadata_sync
+
+            metadata = extract_pdf_metadata_sync(content)
+            result.metadata = metadata
+
+            return result
+        finally:
+            with contextlib.suppress(OSError):
+                Path(temp_path).unlink()
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
-
+        """Pure sync implementation of PDF extraction from path."""
+        text = self._extract_pdf_searchable_text_sync(path)
+
+        if self.config.force_ocr or not self._validate_extracted_text(text):
+            text = self._extract_pdf_with_ocr_sync(path)
+
+        tables = []
+        if self.config.extract_tables:
+            try:
+                from kreuzberg._gmft import extract_tables_sync
+
+                tables = extract_tables_sync(path)
+            except ImportError:
+                pass
+
+        text = normalize_spaces(text)
+
+        return ExtractionResult(
+            content=text,
+            mime_type=PLAIN_TEXT_MIME_TYPE,
+            metadata={},
+            tables=tables,
+            chunks=[],
+        )
 
     def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
         """Check if text extracted from PDF is valid or corrupted.
@@ -112,17 +155,45 @@ class PDFExtractor(Extractor):
         Returns:
             A list of Pillow Images.
         """
+        from kreuzberg._utils._errors import create_error_context, should_retry
+
         document: pypdfium2.PdfDocument | None = None
-
-
-
-
-
-
-
-
-
-
+        last_error = None
+
+        for attempt in range(3):  # Try up to 3 times  # ~keep
+            try:
+                with pypdfium_file_lock(input_file):
+                    document = await run_sync(pypdfium2.PdfDocument, str(input_file))
+                    return [page.render(scale=4.25).to_pil() for page in cast("pypdfium2.PdfDocument", document)]
+            except pypdfium2.PdfiumError as e:  # noqa: PERF203
+                last_error = e
+                if not should_retry(e, attempt + 1):
+                    raise ParsingError(
+                        "Could not convert PDF to images",
+                        context=create_error_context(
+                            operation="convert_pdf_to_images",
+                            file_path=input_file,
+                            error=e,
+                            attempt=attempt + 1,
+                        ),
+                    ) from e
+                # Wait before retry with exponential backoff  # ~keep
+                await anyio.sleep(0.5 * (attempt + 1))
+            finally:
+                if document:
+                    with pypdfium_file_lock(input_file), contextlib.suppress(Exception):
+                        await run_sync(document.close)
+
+        # All retries failed  # ~keep
+        raise ParsingError(
+            "Could not convert PDF to images after retries",
+            context=create_error_context(
+                operation="convert_pdf_to_images",
+                file_path=input_file,
+                error=last_error,
+                attempts=3,
+            ),
+        ) from last_error
 
     async def _extract_pdf_text_with_ocr(self, input_file: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
         """Extract text from a scanned PDF file using OCR.
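Note on the retry loop above: `should_retry` classifies the `PdfiumError`, and the delay `0.5 * (attempt + 1)` grows linearly (0.5s, then 1.0s) despite the "exponential backoff" comment. A minimal standalone sketch of the same pattern, with a hypothetical `open_document` callable standing in for `pypdfium2.PdfDocument`:

import anyio


async def open_with_retry(open_document, path: str, attempts: int = 3):
    """Retry a flaky native call, sleeping longer after each failure."""
    last_error: Exception | None = None
    for attempt in range(attempts):
        try:
            return open_document(path)
        except OSError as e:  # stand-in for pypdfium2.PdfiumError
            last_error = e
            if attempt + 1 == attempts:
                break
            await anyio.sleep(0.5 * (attempt + 1))  # 0.5s, then 1.0s, ...
    raise RuntimeError(f"failed after {attempts} attempts") from last_error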
@@ -157,15 +228,124 @@ class PDFExtractor(Extractor):
         Returns:
             The extracted text.
         """
+        from kreuzberg._utils._errors import create_error_context
+
         document: pypdfium2.PdfDocument | None = None
         try:
-
-
-
+            with pypdfium_file_lock(input_file):
+                document = await run_sync(pypdfium2.PdfDocument, str(input_file))
+            text_parts = []
+            page_errors = []
+
+            for i, page in enumerate(cast("pypdfium2.PdfDocument", document)):
+                try:
+                    text_page = page.get_textpage()
+                    text_parts.append(text_page.get_text_bounded())
+                except Exception as e:  # noqa: PERF203, BLE001
+                    page_errors.append({"page": i + 1, "error": str(e)})
+                    text_parts.append(f"[Error extracting page {i + 1}]")
+
+            text = "\n".join(text_parts)
+
+            if page_errors and text_parts:
+                return normalize_spaces(text)
+            if not text_parts:
+                raise ParsingError(
+                    "Could not extract any text from PDF",
+                    context=create_error_context(
+                        operation="extract_pdf_searchable_text",
+                        file_path=input_file,
+                        page_errors=page_errors,
+                    ),
+                )
+
+            return normalize_spaces(text)
         except pypdfium2.PdfiumError as e:
             raise ParsingError(
-                "Could not extract text from PDF file",
+                "Could not extract text from PDF file",
+                context=create_error_context(
+                    operation="extract_pdf_searchable_text",
+                    file_path=input_file,
+                    error=e,
+                ),
             ) from e
         finally:
             if document:
-
+                with pypdfium_file_lock(input_file), contextlib.suppress(Exception):
+                    await run_sync(document.close)
+
+    def _extract_pdf_searchable_text_sync(self, path: Path) -> str:
+        """Extract searchable text from PDF using pypdfium2 (sync version)."""
+        pdf = None
+        try:
+            with pypdfium_file_lock(path):
+                pdf = pypdfium2.PdfDocument(str(path))
+                text_parts = []
+                for page in pdf:
+                    text_page = page.get_textpage()
+                    text = text_page.get_text_range()
+                    text_parts.append(text)
+                    text_page.close()
+                    page.close()
+                return "".join(text_parts)
+        except Exception as e:
+            raise ParsingError(f"Failed to extract PDF text: {e}") from e
+        finally:
+            if pdf:
+                with pypdfium_file_lock(path), contextlib.suppress(Exception):
+                    pdf.close()
+
+    def _extract_pdf_with_ocr_sync(self, path: Path) -> str:
+        """Extract text from PDF using OCR (sync version)."""
+        pdf = None
+        try:
+            from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
+
+            images = []
+            with pypdfium_file_lock(path):
+                pdf = pypdfium2.PdfDocument(str(path))
+                for page in pdf:
+                    bitmap = page.render(scale=200 / 72)
+                    pil_image = bitmap.to_pil()
+                    images.append(pil_image)
+                    bitmap.close()
+                    page.close()
+
+            import os
+            import tempfile
+
+            image_paths = []
+            temp_files = []
+
+            try:
+                for i, img in enumerate(images):
+                    fd, temp_path = tempfile.mkstemp(suffix=f"_page_{i}.png")
+                    temp_files.append((fd, temp_path))
+                    img.save(temp_path, format="PNG")
+                    os.close(fd)
+                    image_paths.append(temp_path)
+
+                if self.config.ocr_backend == "tesseract":
+                    from kreuzberg._ocr._tesseract import TesseractConfig
+
+                    if isinstance(self.config.ocr_config, TesseractConfig):
+                        config = self.config.ocr_config
+                    else:
+                        config = TesseractConfig()
+                    results = process_batch_images_sync_pure([str(p) for p in image_paths], config)
+                    text_parts = [r.content for r in results]
+                    return "\n\n".join(text_parts)
+
+                raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+
+            finally:
+                for _, temp_path in temp_files:
+                    with contextlib.suppress(OSError):
+                        Path(temp_path).unlink()
+
+        except Exception as e:
+            raise ParsingError(f"Failed to OCR PDF: {e}") from e
+        finally:
+            if pdf:
+                with pypdfium_file_lock(path), contextlib.suppress(Exception):
+                    pdf.close()
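Taken together, these hunks give PDFExtractor a pure-sync code path: pypdfium2 text extraction with an OCR fallback, table extraction via the new extract_tables_sync, and bytes input routed through a tempfile.mkstemp round-trip, all serialized around pypdfium2 by pypdfium_file_lock. A usage sketch; the constructor arguments and the ExtractionConfig import are assumed from the surrounding package, not shown in this diff, and "scanned-report.pdf" is a placeholder:

from pathlib import Path

from kreuzberg import ExtractionConfig  # assumed export
from kreuzberg._extractors._pdf import PDFExtractor

extractor = PDFExtractor(mime_type="application/pdf", config=ExtractionConfig())

# Path input: searchable text first, OCR only if the text layer looks corrupted.
result = extractor.extract_path_sync(Path("scanned-report.pdf"))
print(result.mime_type, len(result.content))

# Bytes input: written to a temp .pdf, extracted, then unlinked.
result = extractor.extract_bytes_sync(Path("scanned-report.pdf").read_bytes())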
kreuzberg/_extractors/_presentation.py
CHANGED
@@ -202,7 +202,7 @@ class PresentationExtractor(Extractor):
         ("keywords", "keywords"),
         ("modified_by", "last_modified_by"),
         ("modified_at", "modified"),
-        ("version", "revision"),
+        ("version", "revision"),
         ("subject", "subject"),
         ("title", "title"),
         ("version", "version"),
kreuzberg/_extractors/_spread_sheet.py
CHANGED
@@ -1,12 +1,13 @@
 from __future__ import annotations
 
+import contextlib
 import csv
 import sys
 from datetime import date, datetime, time, timedelta
 from io import StringIO
-from typing import TYPE_CHECKING, Any, Union
+from pathlib import Path
+from typing import Any, Union
 
-import anyio
 from anyio import Path as AsyncPath
 from python_calamine import CalamineWorkbook
 
@@ -18,9 +19,6 @@ from kreuzberg._utils._sync import run_sync, run_taskgroup
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ParsingError
 
-if TYPE_CHECKING:  # pragma: no cover
-    from pathlib import Path
-
 if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
 
@@ -64,10 +62,37 @@ class SpreadSheetExtractor(Extractor):
         ) from e
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-
+        """Pure sync implementation of extract_bytes."""
+        import os
+        import tempfile
+
+        fd, temp_path = tempfile.mkstemp(suffix=".xlsx")
+
+        try:
+            with os.fdopen(fd, "wb") as f:
+                f.write(content)
+
+            return self.extract_path_sync(Path(temp_path))
+        finally:
+            with contextlib.suppress(OSError):
+                Path(temp_path).unlink()
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
-
+        """Pure sync implementation of extract_path."""
+        try:
+            workbook = CalamineWorkbook.from_path(str(path))
+            results = []
+
+            for sheet_name in workbook.sheet_names:
+                sheet_text = self._convert_sheet_to_text_sync(workbook, sheet_name)
+                results.append(sheet_text)
+
+            return ExtractionResult(content="\n\n".join(results), mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
+        except Exception as e:
+            raise ParsingError(
+                "Failed to extract file data",
+                context={"file": str(path), "error": str(e)},
+            ) from e
 
     @staticmethod
     def _convert_cell_to_str(value: Any) -> str:
@@ -123,3 +148,36 @@ class SpreadSheetExtractor(Extractor):
 
         await unlink()
         return f"## {sheet_name}\n\n{normalize_spaces(result)}"
+
+    def _convert_sheet_to_text_sync(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
+        """Synchronous version of _convert_sheet_to_text."""
+        values = workbook.get_sheet_by_name(sheet_name).to_python()
+
+        csv_buffer = StringIO()
+        writer = csv.writer(csv_buffer)
+
+        for row in values:
+            writer.writerow([self._convert_cell_to_str(cell) for cell in row])
+
+        csv_data = csv_buffer.getvalue()
+        csv_buffer.close()
+
+        csv_reader = csv.reader(StringIO(csv_data))
+        rows = list(csv_reader)
+        result = ""
+
+        if rows:
+            header = rows[0]
+            markdown_lines: list[str] = [
+                "| " + " | ".join(header) + " |",
+                "| " + " | ".join(["---" for _ in header]) + " |",
+            ]
+
+            for row in rows[1:]:  # type: ignore[assignment]
+                while len(row) < len(header):
+                    row.append("")
+                markdown_lines.append("| " + " | ".join(row) + " |")  # type: ignore[arg-type]
+
+            result = "\n".join(markdown_lines)
+
+        return f"## {sheet_name}\n\n{normalize_spaces(result)}"
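The sync sheet converter above goes sheet → CSV → markdown pipe table, padding ragged rows out to the header width. The core transform, reduced to a standalone function for illustration:

def rows_to_markdown(sheet_name: str, rows: list[list[str]]) -> str:
    """Render parsed rows as '## <sheet>' plus a pipe table, padding short rows."""
    if not rows:
        return f"## {sheet_name}\n\n"
    header = rows[0]
    lines = [
        "| " + " | ".join(header) + " |",
        "| " + " | ".join("---" for _ in header) + " |",
    ]
    for row in rows[1:]:
        padded = row + [""] * (len(header) - len(row))  # pad without mutating input
        lines.append("| " + " | ".join(padded) + " |")
    return f"## {sheet_name}\n\n" + "\n".join(lines)


print(rows_to_markdown("Sheet1", [["name", "qty"], ["apples", "3"], ["pears"]]))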
kreuzberg/_gmft.py
CHANGED
@@ -1,7 +1,8 @@
 from __future__ import annotations
 
+import os
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Literal
+from typing import TYPE_CHECKING, Any, Literal
 
 from kreuzberg._types import TableData
 from kreuzberg._utils._sync import run_sync
@@ -69,7 +70,7 @@ class GMFTConfig:
     """
     [Experimental] Enable semantic spanning cells, which often encode hierarchical multi-level indices.
     """
-    semantic_hierarchical_left_fill:
+    semantic_hierarchical_left_fill: Literal["algorithm", "deep"] | None = "algorithm"
     """
     [Experimental] When semantic spanning cells is enabled, when a left header is detected which might represent a group of rows, that same value is reduplicated for each row.
 
@@ -103,9 +104,31 @@ class GMFTConfig:
     """
     Force the large table assumption to be applied, regardless of the number of rows and overlap.
     """
+    total_overlap_reject_threshold: float = 0.9
+    """
+    Reject if total overlap is > 90% of table area.
+    """
+    total_overlap_warn_threshold: float = 0.1
+    """
+    Warn if total overlap is > 10% of table area.
+    """
+    nms_warn_threshold: int = 5
+    """
+    Warn if non maxima suppression removes > 5 rows.
+    """
+    iob_reject_threshold: float = 0.05
+    """
+    Reject if iob between textbox and cell is < 5%.
+    """
+    iob_warn_threshold: float = 0.5
+    """
+    Warn if iob between textbox and cell is < 50%.
+    """
 
 
-async def extract_tables(file_path: str | PathLike[str], config: GMFTConfig | None = None) -> list[TableData]:
+async def extract_tables(  # noqa: PLR0915
+    file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
+) -> list[TableData]:
     """Extracts tables from a PDF file.
 
     This function takes a file path to a PDF file, and an optional configuration object.
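The five new thresholds are plain dataclass fields on GMFTConfig, so overriding one is just a constructor argument. A sketch; the values here are illustrative, not recommendations:

from kreuzberg._gmft import GMFTConfig

# Loosen textbox/cell IOB rejection from 5% to 2% and warn earlier on overlap.
config = GMFTConfig(iob_reject_threshold=0.02, total_overlap_warn_threshold=0.05)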
@@ -114,6 +137,8 @@ async def extract_tables(file_path: str | PathLike[str], config: GMFTConfig | No
     Args:
         file_path: The path to the PDF file.
         config: An optional configuration object.
+        use_isolated_process: Whether to use an isolated process for extraction.
+            If None, uses environment variable KREUZBERG_GMFT_ISOLATED (default: True).
 
     Raises:
         MissingDependencyError: Raised when the required dependencies are not installed.
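The use_isolated_process=None default defers to the environment; both the async and sync functions below repeat the same resolution, which amounts to:

import os


def resolve_isolated(flag: bool | None) -> bool:
    """Explicit argument wins; otherwise KREUZBERG_GMFT_ISOLATED decides (default: isolated)."""
    if flag is not None:
        return flag
    return os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")


assert resolve_isolated(False) is False      # caller override beats the environment
os.environ["KREUZBERG_GMFT_ISOLATED"] = "0"  # anything outside true/1/yes disables isolation
assert resolve_isolated(None) is False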
@@ -121,14 +146,189 @@ async def extract_tables(file_path: str | PathLike[str], config: GMFTConfig | No
     Returns:
         A list of table data dictionaries.
     """
+    from pathlib import Path
+
+    from kreuzberg._utils._cache import get_table_cache
+
+    # Determine if we should use isolated process  # ~keep
+    if use_isolated_process is None:
+        use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
+
+    path = Path(file_path)
+    try:
+        stat = path.stat()
+        file_info = {
+            "path": str(path.resolve()),
+            "size": stat.st_size,
+            "mtime": stat.st_mtime,
+        }
+    except OSError:
+        file_info = {
+            "path": str(path),
+            "size": 0,
+            "mtime": 0,
+        }
+
+    config = config or GMFTConfig()
+    cache_kwargs = {
+        "file_info": str(sorted(file_info.items())),
+        "extractor": "gmft",
+        "config": str(sorted(config.__dict__.items())),
+    }
+
+    table_cache = get_table_cache()
+    cached_result = await table_cache.aget(**cache_kwargs)
+    if cached_result is not None:
+        return cached_result  # type: ignore[no-any-return]
+
+    if table_cache.is_processing(**cache_kwargs):
+        import anyio
+
+        event = table_cache.mark_processing(**cache_kwargs)
+        await anyio.to_thread.run_sync(event.wait)
+
+        # Try cache again after waiting for other process to complete  # ~keep
+        cached_result = await table_cache.aget(**cache_kwargs)
+        if cached_result is not None:
+            return cached_result  # type: ignore[no-any-return]
+
+    table_cache.mark_processing(**cache_kwargs)
+
+    try:
+        if use_isolated_process:
+            from kreuzberg._multiprocessing.gmft_isolated import extract_tables_isolated_async
+
+            result = await extract_tables_isolated_async(file_path, config)
+
+            await table_cache.aset(result, **cache_kwargs)
+
+            return result
+
+        try:
+            from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
+            from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
+            from gmft.formatters.tatr import TATRFormatConfig
+            from gmft.pdf_bindings.pdfium import PyPDFium2Document
+
+            formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]
+                config=TATRFormatConfig(
+                    verbosity=config.verbosity,
+                    formatter_base_threshold=config.formatter_base_threshold,
+                    cell_required_confidence=config.cell_required_confidence,
+                    remove_null_rows=config.remove_null_rows,
+                    enable_multi_header=config.enable_multi_header,
+                    semantic_spanning_cells=config.semantic_spanning_cells,
+                    semantic_hierarchical_left_fill=config.semantic_hierarchical_left_fill,
+                    large_table_if_n_rows_removed=config.large_table_if_n_rows_removed,
+                    large_table_threshold=config.large_table_threshold,
+                    large_table_row_overlap_threshold=config.large_table_row_overlap_threshold,
+                    large_table_maximum_rows=config.large_table_maximum_rows,
+                    force_large_table_assumption=config.force_large_table_assumption,
+                )
+            )
+            detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call]
+                config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
+            )
+            doc = await run_sync(PyPDFium2Document, str(file_path))
+            cropped_tables: list[CroppedTable] = []
+            dataframes: list[DataFrame] = []
+            try:
+                for page in doc:
+                    cropped_tables.extend(await run_sync(detector.extract, page))
+
+                for cropped_table in cropped_tables:
+                    formatted_table = await run_sync(formatter.extract, cropped_table)
+                    dataframes.append(await run_sync(formatted_table.df))
+
+                result = [
+                    TableData(
+                        cropped_image=cropped_table.image(),
+                        page_number=cropped_table.page.page_number,
+                        text=data_frame.to_markdown(),
+                        df=data_frame,
+                    )
+                    for data_frame, cropped_table in zip(dataframes, cropped_tables)
+                ]
+
+                await table_cache.aset(result, **cache_kwargs)
+
+                return result
+            finally:
+                await run_sync(doc.close)
+
+        except ImportError as e:
+            raise MissingDependencyError.create_for_package(
+                dependency_group="gmft", functionality="table extraction", package_name="gmft"
+            ) from e
+    finally:
+        table_cache.mark_complete(**cache_kwargs)
+
+
+def extract_tables_sync(
+    file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
+) -> list[TableData]:
+    """Synchronous wrapper for extract_tables.
+
+    Args:
+        file_path: The path to the PDF file.
+        config: An optional configuration object.
+        use_isolated_process: Whether to use an isolated process for extraction.
+            If None, uses environment variable KREUZBERG_GMFT_ISOLATED (default: True).
+
+    Returns:
+        A list of table data dictionaries.
+    """
+    from pathlib import Path
+
+    from kreuzberg._utils._cache import get_table_cache
+
+    # Determine if we should use isolated process  # ~keep
+    if use_isolated_process is None:
+        use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
+
+    path = Path(file_path)
     try:
-
-
+        stat = path.stat()
+        file_info = {
+            "path": str(path.resolve()),
+            "size": stat.st_size,
+            "mtime": stat.st_mtime,
+        }
+    except OSError:
+        file_info = {
+            "path": str(path),
+            "size": 0,
+            "mtime": 0,
+        }
+
+    config = config or GMFTConfig()
+    cache_kwargs = {
+        "file_info": str(sorted(file_info.items())),
+        "extractor": "gmft",
+        "config": str(sorted(config.__dict__.items())),
+    }
+
+    table_cache = get_table_cache()
+    cached_result = table_cache.get(**cache_kwargs)
+    if cached_result is not None:
+        return cached_result  # type: ignore[no-any-return]
+
+    if use_isolated_process:
+        from kreuzberg._multiprocessing.gmft_isolated import extract_tables_isolated
+
+        result = extract_tables_isolated(file_path, config)
+
+        table_cache.set(result, **cache_kwargs)
+
+        return result
+
+    try:
+        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
+        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
         from gmft.formatters.tatr import TATRFormatConfig
         from gmft.pdf_bindings.pdfium import PyPDFium2Document
 
-
-        formatter = AutoTableFormatter(
+        formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]
             config=TATRFormatConfig(
                 verbosity=config.verbosity,
                 formatter_base_threshold=config.formatter_base_threshold,
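Both the async and sync paths key the new table cache on file identity plus the full config, so a touched file (mtime) and a changed threshold each invalidate independently; mark_processing/is_processing additionally deduplicate concurrent extractions of the same file. A sketch of the key derivation in isolation (the real storage lives behind get_table_cache() in _utils/_cache.py, not shown in this diff):

from pathlib import Path


def table_cache_key(path: Path, config_fields: dict) -> tuple[str, str, str]:
    """File identity (resolved path, size, mtime) plus sorted config fields."""
    try:
        stat = path.stat()
        file_info = {"path": str(path.resolve()), "size": stat.st_size, "mtime": stat.st_mtime}
    except OSError:  # unreadable file still gets a (weak) key
        file_info = {"path": str(path), "size": 0, "mtime": 0}
    return (str(sorted(file_info.items())), "gmft", str(sorted(config_fields.items())))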
@@ -144,19 +344,21 @@ async def extract_tables(file_path: str | PathLike[str], config: GMFTConfig | No
                     force_large_table_assumption=config.force_large_table_assumption,
                 )
             )
-        detector = AutoTableDetector(
-
-
-
+        detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call]
+            config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
+        )
+        doc = PyPDFium2Document(str(file_path))
+        cropped_tables: list[Any] = []
+        dataframes: list[Any] = []
         try:
             for page in doc:
-                cropped_tables.extend(await run_sync(detector.extract, page))
+                cropped_tables.extend(detector.extract(page))
 
             for cropped_table in cropped_tables:
-                formatted_table = await run_sync(formatter.extract, cropped_table)
-                dataframes.append(await run_sync(formatted_table.df))
+                formatted_table = formatter.extract(cropped_table)
+                dataframes.append(formatted_table.df())
 
-            return [
+            result = [
                 TableData(
                     cropped_image=cropped_table.image(),
                     page_number=cropped_table.page.page_number,
@@ -165,8 +367,12 @@ async def extract_tables(file_path: str | PathLike[str], config: GMFTConfig | No
                 )
                 for data_frame, cropped_table in zip(dataframes, cropped_tables)
             ]
+
+            table_cache.set(result, **cache_kwargs)
+
+            return result
         finally:
-            await run_sync(doc.close)
+            doc.close()  # type: ignore[no-untyped-call]
 
     except ImportError as e:
         raise MissingDependencyError.create_for_package(