kreuzberg 3.2.0__py3-none-any.whl → 3.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. kreuzberg/__init__.py +3 -0
  2. kreuzberg/__main__.py +8 -0
  3. kreuzberg/_api/__init__.py +0 -0
  4. kreuzberg/_api/main.py +87 -0
  5. kreuzberg/_cli_config.py +175 -0
  6. kreuzberg/_extractors/_image.py +39 -4
  7. kreuzberg/_extractors/_pandoc.py +158 -18
  8. kreuzberg/_extractors/_pdf.py +199 -19
  9. kreuzberg/_extractors/_presentation.py +1 -1
  10. kreuzberg/_extractors/_spread_sheet.py +65 -7
  11. kreuzberg/_gmft.py +222 -16
  12. kreuzberg/_mime_types.py +62 -16
  13. kreuzberg/_multiprocessing/__init__.py +6 -0
  14. kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
  15. kreuzberg/_multiprocessing/process_manager.py +188 -0
  16. kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
  17. kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
  18. kreuzberg/_ocr/_easyocr.py +6 -12
  19. kreuzberg/_ocr/_paddleocr.py +15 -13
  20. kreuzberg/_ocr/_tesseract.py +136 -46
  21. kreuzberg/_playa.py +43 -0
  22. kreuzberg/_types.py +4 -0
  23. kreuzberg/_utils/_cache.py +372 -0
  24. kreuzberg/_utils/_device.py +10 -27
  25. kreuzberg/_utils/_document_cache.py +220 -0
  26. kreuzberg/_utils/_errors.py +232 -0
  27. kreuzberg/_utils/_pdf_lock.py +72 -0
  28. kreuzberg/_utils/_process_pool.py +100 -0
  29. kreuzberg/_utils/_serialization.py +82 -0
  30. kreuzberg/_utils/_string.py +1 -1
  31. kreuzberg/_utils/_sync.py +21 -0
  32. kreuzberg/cli.py +338 -0
  33. kreuzberg/extraction.py +247 -36
  34. kreuzberg-3.4.0.dist-info/METADATA +290 -0
  35. kreuzberg-3.4.0.dist-info/RECORD +50 -0
  36. {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/WHEEL +1 -2
  37. kreuzberg-3.4.0.dist-info/entry_points.txt +2 -0
  38. kreuzberg-3.2.0.dist-info/METADATA +0 -166
  39. kreuzberg-3.2.0.dist-info/RECORD +0 -34
  40. kreuzberg-3.2.0.dist-info/top_level.txt +0 -1
  41. {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/licenses/LICENSE +0 -0
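The most consequential change in this release: the synchronous entry points are no longer thin `anyio.run` wrappers around the async implementations but pure sync code paths (see the `_pdf.py`, `_spread_sheet.py`, and `_gmft.py` hunks below), alongside a new CLI, an HTTP API module, result caching, and isolated-process GMFT table extraction. A minimal usage sketch, assuming the existing 3.x top-level exports `extract_file_sync` and `ExtractionConfig` are unchanged:

    from kreuzberg import ExtractionConfig, extract_file_sync

    # Runs entirely on the pure-sync path added in this release;
    # no event loop is started behind the scenes.
    result = extract_file_sync("report.pdf", config=ExtractionConfig(extract_tables=True))
    print(result.content[:200])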
kreuzberg/_extractors/_pdf.py CHANGED
@@ -1,6 +1,8 @@
  from __future__ import annotations

+ import contextlib
  from multiprocessing import cpu_count
+ from pathlib import Path
  from re import Pattern
  from re import compile as compile_regex
  from typing import TYPE_CHECKING, ClassVar, cast
@@ -14,14 +16,13 @@ from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
  from kreuzberg._ocr import get_ocr_backend
  from kreuzberg._playa import extract_pdf_metadata
  from kreuzberg._types import ExtractionResult, OcrBackendType
+ from kreuzberg._utils._pdf_lock import pypdfium_file_lock
  from kreuzberg._utils._string import normalize_spaces
  from kreuzberg._utils._sync import run_sync, run_taskgroup_batched
  from kreuzberg._utils._tmp import create_temp_file
  from kreuzberg.exceptions import ParsingError

  if TYPE_CHECKING: # pragma: no cover
- from pathlib import Path
-
  from PIL.Image import Image

@@ -69,10 +70,52 @@ class PDFExtractor(Extractor):
  return result

  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
- return anyio.run(self.extract_bytes_async, content)
+ """Pure sync implementation of PDF extraction from bytes."""
+ import os
+ import tempfile
+
+ fd, temp_path = tempfile.mkstemp(suffix=".pdf")
+ try:
+ with os.fdopen(fd, "wb") as f:
+ f.write(content)
+
+ result = self.extract_path_sync(Path(temp_path))
+
+ from kreuzberg._playa import extract_pdf_metadata_sync
+
+ metadata = extract_pdf_metadata_sync(content)
+ result.metadata = metadata
+
+ return result
+ finally:
+ with contextlib.suppress(OSError):
+ Path(temp_path).unlink()

  def extract_path_sync(self, path: Path) -> ExtractionResult:
- return anyio.run(self.extract_path_async, path)
+ """Pure sync implementation of PDF extraction from path."""
+ text = self._extract_pdf_searchable_text_sync(path)
+
+ if self.config.force_ocr or not self._validate_extracted_text(text):
+ text = self._extract_pdf_with_ocr_sync(path)
+
+ tables = []
+ if self.config.extract_tables:
+ try:
+ from kreuzberg._gmft import extract_tables_sync
+
+ tables = extract_tables_sync(path)
+ except ImportError:
+ pass
+
+ text = normalize_spaces(text)
+
+ return ExtractionResult(
+ content=text,
+ mime_type=PLAIN_TEXT_MIME_TYPE,
+ metadata={},
+ tables=tables,
+ chunks=[],
+ )

  def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
  """Check if text extracted from PDF is valid or corrupted.
@@ -112,17 +155,45 @@ class PDFExtractor(Extractor):
  Returns:
  A list of Pillow Images.
  """
+ from kreuzberg._utils._errors import create_error_context, should_retry
+
  document: pypdfium2.PdfDocument | None = None
- try:
- document = await run_sync(pypdfium2.PdfDocument, str(input_file))
- return [page.render(scale=4.25).to_pil() for page in cast("pypdfium2.PdfDocument", document)]
- except pypdfium2.PdfiumError as e:
- raise ParsingError(
- "Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
- ) from e
- finally:
- if document:
- await run_sync(document.close)
+ last_error = None
+
+ for attempt in range(3): # Try up to 3 times # ~keep
+ try:
+ with pypdfium_file_lock(input_file):
+ document = await run_sync(pypdfium2.PdfDocument, str(input_file))
+ return [page.render(scale=4.25).to_pil() for page in cast("pypdfium2.PdfDocument", document)]
+ except pypdfium2.PdfiumError as e: # noqa: PERF203
+ last_error = e
+ if not should_retry(e, attempt + 1):
+ raise ParsingError(
+ "Could not convert PDF to images",
+ context=create_error_context(
+ operation="convert_pdf_to_images",
+ file_path=input_file,
+ error=e,
+ attempt=attempt + 1,
+ ),
+ ) from e
+ # Wait before retry with exponential backoff # ~keep
+ await anyio.sleep(0.5 * (attempt + 1))
+ finally:
+ if document:
+ with pypdfium_file_lock(input_file), contextlib.suppress(Exception):
+ await run_sync(document.close)
+
+ # All retries failed # ~keep
+ raise ParsingError(
+ "Could not convert PDF to images after retries",
+ context=create_error_context(
+ operation="convert_pdf_to_images",
+ file_path=input_file,
+ error=last_error,
+ attempts=3,
+ ),
+ ) from last_error

  async def _extract_pdf_text_with_ocr(self, input_file: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
  """Extract text from a scanned PDF file using OCR.
@@ -157,15 +228,124 @@ class PDFExtractor(Extractor):
  Returns:
  The extracted text.
  """
+ from kreuzberg._utils._errors import create_error_context
+
  document: pypdfium2.PdfDocument | None = None
  try:
- document = await run_sync(pypdfium2.PdfDocument, str(input_file))
- text = "\n".join(page.get_textpage().get_text_bounded() for page in cast("pypdfium2.PdfDocument", document))
- return normalize_spaces(text)
+ with pypdfium_file_lock(input_file):
+ document = await run_sync(pypdfium2.PdfDocument, str(input_file))
+ text_parts = []
+ page_errors = []
+
+ for i, page in enumerate(cast("pypdfium2.PdfDocument", document)):
+ try:
+ text_page = page.get_textpage()
+ text_parts.append(text_page.get_text_bounded())
+ except Exception as e: # noqa: PERF203, BLE001
+ page_errors.append({"page": i + 1, "error": str(e)})
+ text_parts.append(f"[Error extracting page {i + 1}]")
+
+ text = "\n".join(text_parts)
+
+ if page_errors and text_parts:
+ return normalize_spaces(text)
+ if not text_parts:
+ raise ParsingError(
+ "Could not extract any text from PDF",
+ context=create_error_context(
+ operation="extract_pdf_searchable_text",
+ file_path=input_file,
+ page_errors=page_errors,
+ ),
+ )
+
+ return normalize_spaces(text)
  except pypdfium2.PdfiumError as e:
  raise ParsingError(
- "Could not extract text from PDF file", context={"file_path": str(input_file), "error": str(e)}
+ "Could not extract text from PDF file",
+ context=create_error_context(
+ operation="extract_pdf_searchable_text",
+ file_path=input_file,
+ error=e,
+ ),
  ) from e
  finally:
  if document:
- await run_sync(document.close)
+ with pypdfium_file_lock(input_file), contextlib.suppress(Exception):
+ await run_sync(document.close)
+
+ def _extract_pdf_searchable_text_sync(self, path: Path) -> str:
+ """Extract searchable text from PDF using pypdfium2 (sync version)."""
+ pdf = None
+ try:
+ with pypdfium_file_lock(path):
+ pdf = pypdfium2.PdfDocument(str(path))
+ text_parts = []
+ for page in pdf:
+ text_page = page.get_textpage()
+ text = text_page.get_text_range()
+ text_parts.append(text)
+ text_page.close()
+ page.close()
+ return "".join(text_parts)
+ except Exception as e:
+ raise ParsingError(f"Failed to extract PDF text: {e}") from e
+ finally:
+ if pdf:
+ with pypdfium_file_lock(path), contextlib.suppress(Exception):
+ pdf.close()
+
+ def _extract_pdf_with_ocr_sync(self, path: Path) -> str:
+ """Extract text from PDF using OCR (sync version)."""
+ pdf = None
+ try:
+ from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
+
+ images = []
+ with pypdfium_file_lock(path):
+ pdf = pypdfium2.PdfDocument(str(path))
+ for page in pdf:
+ bitmap = page.render(scale=200 / 72)
+ pil_image = bitmap.to_pil()
+ images.append(pil_image)
+ bitmap.close()
+ page.close()
+
+ import os
+ import tempfile
+
+ image_paths = []
+ temp_files = []
+
+ try:
+ for i, img in enumerate(images):
+ fd, temp_path = tempfile.mkstemp(suffix=f"_page_{i}.png")
+ temp_files.append((fd, temp_path))
+ img.save(temp_path, format="PNG")
+ os.close(fd)
+ image_paths.append(temp_path)
+
+ if self.config.ocr_backend == "tesseract":
+ from kreuzberg._ocr._tesseract import TesseractConfig
+
+ if isinstance(self.config.ocr_config, TesseractConfig):
+ config = self.config.ocr_config
+ else:
+ config = TesseractConfig()
+ results = process_batch_images_sync_pure([str(p) for p in image_paths], config)
+ text_parts = [r.content for r in results]
+ return "\n\n".join(text_parts)
+
+ raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+
+ finally:
+ for _, temp_path in temp_files:
+ with contextlib.suppress(OSError):
+ Path(temp_path).unlink()
+
+ except Exception as e:
+ raise ParsingError(f"Failed to OCR PDF: {e}") from e
+ finally:
+ if pdf:
+ with pypdfium_file_lock(path), contextlib.suppress(Exception):
+ pdf.close()
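`pypdfium_file_lock` ships in the new `kreuzberg/_utils/_pdf_lock.py` (file 27 above), which this diff omits. Judging from the call sites, where every pypdfium2 open, render, and close is wrapped in it, it serializes access to a given PDF file, since pypdfium2 is not thread-safe. A plausible shape, offered purely as a sketch of the idea rather than the actual implementation:

    import threading
    from collections import defaultdict
    from contextlib import contextmanager
    from pathlib import Path

    _locks: defaultdict[str, threading.RLock] = defaultdict(threading.RLock)
    _registry_lock = threading.Lock()

    @contextmanager
    def pypdfium_file_lock(path):
        """Per-file re-entrant lock keyed on the resolved path (sketch)."""
        with _registry_lock:  # guard the lock registry itself
            lock = _locks[str(Path(path).resolve())]
        with lock:
            yield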
kreuzberg/_extractors/_presentation.py CHANGED
@@ -202,7 +202,7 @@ class PresentationExtractor(Extractor):
  ("keywords", "keywords"),
  ("modified_by", "last_modified_by"),
  ("modified_at", "modified"),
- ("version", "revision"), # if version and revision are given, version overwrites
+ ("version", "revision"),
  ("subject", "subject"),
  ("title", "title"),
  ("version", "version"),
kreuzberg/_extractors/_spread_sheet.py CHANGED
@@ -1,12 +1,13 @@
  from __future__ import annotations

+ import contextlib
  import csv
  import sys
  from datetime import date, datetime, time, timedelta
  from io import StringIO
- from typing import TYPE_CHECKING, Any, Union
+ from pathlib import Path
+ from typing import Any, Union

- import anyio
  from anyio import Path as AsyncPath
  from python_calamine import CalamineWorkbook

@@ -18,9 +19,6 @@ from kreuzberg._utils._sync import run_sync, run_taskgroup
  from kreuzberg._utils._tmp import create_temp_file
  from kreuzberg.exceptions import ParsingError

- if TYPE_CHECKING: # pragma: no cover
- from pathlib import Path
-
  if sys.version_info < (3, 11): # pragma: no cover
  from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]

@@ -64,10 +62,37 @@ class SpreadSheetExtractor(Extractor):
  ) from e

  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
- return anyio.run(self.extract_bytes_async, content)
+ """Pure sync implementation of extract_bytes."""
+ import os
+ import tempfile
+
+ fd, temp_path = tempfile.mkstemp(suffix=".xlsx")
+
+ try:
+ with os.fdopen(fd, "wb") as f:
+ f.write(content)
+
+ return self.extract_path_sync(Path(temp_path))
+ finally:
+ with contextlib.suppress(OSError):
+ Path(temp_path).unlink()

  def extract_path_sync(self, path: Path) -> ExtractionResult:
- return anyio.run(self.extract_path_async, path)
+ """Pure sync implementation of extract_path."""
+ try:
+ workbook = CalamineWorkbook.from_path(str(path))
+ results = []
+
+ for sheet_name in workbook.sheet_names:
+ sheet_text = self._convert_sheet_to_text_sync(workbook, sheet_name)
+ results.append(sheet_text)
+
+ return ExtractionResult(content="\n\n".join(results), mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
+ except Exception as e:
+ raise ParsingError(
+ "Failed to extract file data",
+ context={"file": str(path), "error": str(e)},
+ ) from e

  @staticmethod
  def _convert_cell_to_str(value: Any) -> str:
@@ -123,3 +148,36 @@ class SpreadSheetExtractor(Extractor):

  await unlink()
  return f"## {sheet_name}\n\n{normalize_spaces(result)}"
+
+ def _convert_sheet_to_text_sync(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
+ """Synchronous version of _convert_sheet_to_text."""
+ values = workbook.get_sheet_by_name(sheet_name).to_python()
+
+ csv_buffer = StringIO()
+ writer = csv.writer(csv_buffer)
+
+ for row in values:
+ writer.writerow([self._convert_cell_to_str(cell) for cell in row])
+
+ csv_data = csv_buffer.getvalue()
+ csv_buffer.close()
+
+ csv_reader = csv.reader(StringIO(csv_data))
+ rows = list(csv_reader)
+ result = ""
+
+ if rows:
+ header = rows[0]
+ markdown_lines: list[str] = [
+ "| " + " | ".join(header) + " |",
+ "| " + " | ".join(["---" for _ in header]) + " |",
+ ]
+
+ for row in rows[1:]: # type: ignore[assignment]
+ while len(row) < len(header):
+ row.append("")
+ markdown_lines.append("| " + " | ".join(row) + " |") # type: ignore[arg-type]
+
+ result = "\n".join(markdown_lines)
+
+ return f"## {sheet_name}\n\n{normalize_spaces(result)}"
kreuzberg/_gmft.py CHANGED
@@ -1,7 +1,8 @@
  from __future__ import annotations

+ import os
  from dataclasses import dataclass, field
- from typing import TYPE_CHECKING, Literal
+ from typing import TYPE_CHECKING, Any, Literal

  from kreuzberg._types import TableData
  from kreuzberg._utils._sync import run_sync
@@ -69,7 +70,7 @@ class GMFTConfig:
  """
  [Experimental] Enable semantic spanning cells, which often encode hierarchical multi-level indices.
  """
- semantic_hierarchical_left_fill: str | None = "algorithm"
+ semantic_hierarchical_left_fill: Literal["algorithm", "deep"] | None = "algorithm"
  """
  [Experimental] When semantic spanning cells is enabled, when a left header is detected which might represent a group of rows, that same value is reduplicated for each row.

@@ -103,9 +104,31 @@ class GMFTConfig:
  """
  Force the large table assumption to be applied, regardless of the number of rows and overlap.
  """
+ total_overlap_reject_threshold: float = 0.9
+ """
+ Reject if total overlap is > 90% of table area.
+ """
+ total_overlap_warn_threshold: float = 0.1
+ """
+ Warn if total overlap is > 10% of table area.
+ """
+ nms_warn_threshold: int = 5
+ """
+ Warn if non maxima suppression removes > 5 rows.
+ """
+ iob_reject_threshold: float = 0.05
+ """
+ Reject if iob between textbox and cell is < 5%.
+ """
+ iob_warn_threshold: float = 0.5
+ """
+ Warn if iob between textbox and cell is < 50%.
+ """


- async def extract_tables(file_path: str | PathLike[str], config: GMFTConfig | None = None) -> list[TableData]:
+ async def extract_tables( # noqa: PLR0915
+ file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
+ ) -> list[TableData]:
  """Extracts tables from a PDF file.

  This function takes a file path to a PDF file, and an optional configuration object.
@@ -114,6 +137,8 @@ async def extract_tables(file_path: str | PathLike[str], config: GMFTConfig | No
  Args:
  file_path: The path to the PDF file.
  config: An optional configuration object.
+ use_isolated_process: Whether to use an isolated process for extraction.
+ If None, uses environment variable KREUZBERG_GMFT_ISOLATED (default: True).

  Raises:
  MissingDependencyError: Raised when the required dependencies are not installed.
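Isolated-process extraction is the new default: the table-transformer models load in a subprocess, which presumably shields the caller from native crashes and memory growth in the model stack. The flag resolves as explicit argument first, then the environment variable, then `True`. Opting out, using the `extract_tables_sync` wrapper added below:

    import os

    from kreuzberg._gmft import extract_tables_sync

    # Per call ...
    tables = extract_tables_sync("report.pdf", use_isolated_process=False)

    # ... or process-wide; the variable is read at call time.
    os.environ["KREUZBERG_GMFT_ISOLATED"] = "false"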
@@ -121,14 +146,189 @@ async def extract_tables(file_path: str | PathLike[str], config: GMFTConfig | No
  Returns:
  A list of table data dictionaries.
  """
+ from pathlib import Path
+
+ from kreuzberg._utils._cache import get_table_cache
+
+ # Determine if we should use isolated process # ~keep
+ if use_isolated_process is None:
+ use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
+
+ path = Path(file_path)
+ try:
+ stat = path.stat()
+ file_info = {
+ "path": str(path.resolve()),
+ "size": stat.st_size,
+ "mtime": stat.st_mtime,
+ }
+ except OSError:
+ file_info = {
+ "path": str(path),
+ "size": 0,
+ "mtime": 0,
+ }
+
+ config = config or GMFTConfig()
+ cache_kwargs = {
+ "file_info": str(sorted(file_info.items())),
+ "extractor": "gmft",
+ "config": str(sorted(config.__dict__.items())),
+ }
+
+ table_cache = get_table_cache()
+ cached_result = await table_cache.aget(**cache_kwargs)
+ if cached_result is not None:
+ return cached_result # type: ignore[no-any-return]
+
+ if table_cache.is_processing(**cache_kwargs):
+ import anyio
+
+ event = table_cache.mark_processing(**cache_kwargs)
+ await anyio.to_thread.run_sync(event.wait)
+
+ # Try cache again after waiting for other process to complete # ~keep
+ cached_result = await table_cache.aget(**cache_kwargs)
+ if cached_result is not None:
+ return cached_result # type: ignore[no-any-return]
+
+ table_cache.mark_processing(**cache_kwargs)
+
+ try:
+ if use_isolated_process:
+ from kreuzberg._multiprocessing.gmft_isolated import extract_tables_isolated_async
+
+ result = await extract_tables_isolated_async(file_path, config)
+
+ await table_cache.aset(result, **cache_kwargs)
+
+ return result
+
+ try:
+ from gmft.auto import AutoTableDetector, AutoTableFormatter # type: ignore[attr-defined]
+ from gmft.detectors.tatr import TATRDetectorConfig # type: ignore[attr-defined]
+ from gmft.formatters.tatr import TATRFormatConfig
+ from gmft.pdf_bindings.pdfium import PyPDFium2Document
+
+ formatter: Any = AutoTableFormatter( # type: ignore[no-untyped-call]
+ config=TATRFormatConfig(
+ verbosity=config.verbosity,
+ formatter_base_threshold=config.formatter_base_threshold,
+ cell_required_confidence=config.cell_required_confidence,
+ remove_null_rows=config.remove_null_rows,
+ enable_multi_header=config.enable_multi_header,
+ semantic_spanning_cells=config.semantic_spanning_cells,
+ semantic_hierarchical_left_fill=config.semantic_hierarchical_left_fill,
+ large_table_if_n_rows_removed=config.large_table_if_n_rows_removed,
+ large_table_threshold=config.large_table_threshold,
+ large_table_row_overlap_threshold=config.large_table_row_overlap_threshold,
+ large_table_maximum_rows=config.large_table_maximum_rows,
+ force_large_table_assumption=config.force_large_table_assumption,
+ )
+ )
+ detector: Any = AutoTableDetector( # type: ignore[no-untyped-call]
+ config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
+ )
+ doc = await run_sync(PyPDFium2Document, str(file_path))
+ cropped_tables: list[CroppedTable] = []
+ dataframes: list[DataFrame] = []
+ try:
+ for page in doc:
+ cropped_tables.extend(await run_sync(detector.extract, page))
+
+ for cropped_table in cropped_tables:
+ formatted_table = await run_sync(formatter.extract, cropped_table)
+ dataframes.append(await run_sync(formatted_table.df))
+
+ result = [
+ TableData(
+ cropped_image=cropped_table.image(),
+ page_number=cropped_table.page.page_number,
+ text=data_frame.to_markdown(),
+ df=data_frame,
+ )
+ for data_frame, cropped_table in zip(dataframes, cropped_tables)
+ ]
+
+ await table_cache.aset(result, **cache_kwargs)
+
+ return result
+ finally:
+ await run_sync(doc.close)
+
+ except ImportError as e:
+ raise MissingDependencyError.create_for_package(
+ dependency_group="gmft", functionality="table extraction", package_name="gmft"
+ ) from e
+ finally:
+ table_cache.mark_complete(**cache_kwargs)
+
+
+ def extract_tables_sync(
+ file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
+ ) -> list[TableData]:
+ """Synchronous wrapper for extract_tables.
+
+ Args:
+ file_path: The path to the PDF file.
+ config: An optional configuration object.
+ use_isolated_process: Whether to use an isolated process for extraction.
+ If None, uses environment variable KREUZBERG_GMFT_ISOLATED (default: True).
+
+ Returns:
+ A list of table data dictionaries.
+ """
+ from pathlib import Path
+
+ from kreuzberg._utils._cache import get_table_cache
+
+ # Determine if we should use isolated process # ~keep
+ if use_isolated_process is None:
+ use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
+
+ path = Path(file_path)
  try:
- from gmft.auto import AutoTableDetector, AutoTableFormatter
- from gmft.detectors.tatr import TATRDetectorConfig
+ stat = path.stat()
+ file_info = {
+ "path": str(path.resolve()),
+ "size": stat.st_size,
+ "mtime": stat.st_mtime,
+ }
+ except OSError:
+ file_info = {
+ "path": str(path),
+ "size": 0,
+ "mtime": 0,
+ }
+
+ config = config or GMFTConfig()
+ cache_kwargs = {
+ "file_info": str(sorted(file_info.items())),
+ "extractor": "gmft",
+ "config": str(sorted(config.__dict__.items())),
+ }
+
+ table_cache = get_table_cache()
+ cached_result = table_cache.get(**cache_kwargs)
+ if cached_result is not None:
+ return cached_result # type: ignore[no-any-return]
+
+ if use_isolated_process:
+ from kreuzberg._multiprocessing.gmft_isolated import extract_tables_isolated
+
+ result = extract_tables_isolated(file_path, config)
+
+ table_cache.set(result, **cache_kwargs)
+
+ return result
+
+ try:
+ from gmft.auto import AutoTableDetector, AutoTableFormatter # type: ignore[attr-defined]
+ from gmft.detectors.tatr import TATRDetectorConfig # type: ignore[attr-defined]
  from gmft.formatters.tatr import TATRFormatConfig
  from gmft.pdf_bindings.pdfium import PyPDFium2Document

- config = config or GMFTConfig()
- formatter = AutoTableFormatter(
+ formatter: Any = AutoTableFormatter( # type: ignore[no-untyped-call]
  config=TATRFormatConfig(
  verbosity=config.verbosity,
  formatter_base_threshold=config.formatter_base_threshold,
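Both the async and the sync path build the same cache key: file identity is the resolved path plus size and mtime, and the configuration is flattened via `__dict__`, so a cached result is reused only while neither the file nor the `GMFTConfig` changes. A reduced sketch of the keying scheme as it appears in both functions:

    from pathlib import Path

    def table_cache_kwargs(file_path: str, config) -> dict[str, str]:
        """Mirror of the cache key built in extract_tables / extract_tables_sync."""
        path = Path(file_path)
        try:
            stat = path.stat()
            file_info = {"path": str(path.resolve()), "size": stat.st_size, "mtime": stat.st_mtime}
        except OSError:  # unreadable paths still produce a (degenerate) key
            file_info = {"path": str(path), "size": 0, "mtime": 0}
        return {
            "file_info": str(sorted(file_info.items())),
            "extractor": "gmft",
            "config": str(sorted(config.__dict__.items())),
        }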
@@ -144,19 +344,21 @@ async def extract_tables(file_path: str | PathLike[str], config: GMFTConfig | No
  force_large_table_assumption=config.force_large_table_assumption,
  )
  )
- detector = AutoTableDetector(config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold))
- doc = await run_sync(PyPDFium2Document, str(file_path))
- cropped_tables: list[CroppedTable] = []
- dataframes: list[DataFrame] = []
+ detector: Any = AutoTableDetector( # type: ignore[no-untyped-call]
+ config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
+ )
+ doc = PyPDFium2Document(str(file_path))
+ cropped_tables: list[Any] = []
+ dataframes: list[Any] = []
  try:
  for page in doc:
- cropped_tables.extend(await run_sync(detector.extract, page))
+ cropped_tables.extend(detector.extract(page))

  for cropped_table in cropped_tables:
- formatted_table = await run_sync(formatter.extract, cropped_table)
- dataframes.append(await run_sync(formatted_table.df))
+ formatted_table = formatter.extract(cropped_table)
+ dataframes.append(formatted_table.df())

- return [
+ result = [
  TableData(
  cropped_image=cropped_table.image(),
  page_number=cropped_table.page.page_number,
@@ -165,8 +367,12 @@ async def extract_tables(file_path: str | PathLike[str], config: GMFTConfig | No
  )
  for data_frame, cropped_table in zip(dataframes, cropped_tables)
  ]
+
+ table_cache.set(result, **cache_kwargs)
+
+ return result
  finally:
- await run_sync(doc.close)
+ doc.close() # type: ignore[no-untyped-call]

  except ImportError as e:
  raise MissingDependencyError.create_for_package(