kreuzberg 3.8.1 → 3.9.0 (py3-none-any.whl)

This diff shows the contents of publicly released package versions as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
@@ -3,6 +3,7 @@ from __future__ import annotations
 import contextlib
 import os
 import tempfile
+from dataclasses import asdict
 from pathlib import Path
 from typing import TYPE_CHECKING, ClassVar
 
@@ -88,17 +89,17 @@ class ImageExtractor(Extractor):
             config = (
                 self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
             )
-            result = backend.process_file_sync(path, **config.__dict__)
+            result = backend.process_file_sync(path, **asdict(config))
         elif self.config.ocr_backend == "paddleocr":
             paddle_config = (
                 self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
             )
-            result = backend.process_file_sync(path, **paddle_config.__dict__)
+            result = backend.process_file_sync(path, **asdict(paddle_config))
         elif self.config.ocr_backend == "easyocr":
             easy_config = (
                 self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
             )
-            result = backend.process_file_sync(path, **easy_config.__dict__)
+            result = backend.process_file_sync(path, **asdict(easy_config))
         else:
             raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
         return self._apply_quality_processing(result)
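
The `asdict()` switch above is not just cosmetic: `dataclasses.asdict()` recurses into nested dataclass fields and, more importantly, keeps working when a config class is declared with `slots=True` (as `GMFTConfig` is in this release), since slotted instances have no `__dict__` at all. A minimal sketch of the difference, using a hypothetical `DemoConfig` rather than kreuzberg's real config classes:

    from dataclasses import asdict, dataclass

    @dataclass(slots=True)
    class DemoConfig:  # hypothetical stand-in for TesseractConfig and friends
        language: str = "eng"
        psm: int = 3

    config = DemoConfig()
    print(asdict(config))  # {'language': 'eng', 'psm': 3}

    # The old pattern breaks on a slots dataclass:
    try:
        config.__dict__
    except AttributeError:
        print("slots dataclasses have no __dict__")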
@@ -3,6 +3,7 @@ from __future__ import annotations
 import contextlib
 import os
 import tempfile
+from dataclasses import asdict
 from multiprocessing import cpu_count
 from pathlib import Path
 from re import Pattern
@@ -58,9 +59,13 @@ class PDFExtractor(Extractor):
         result: ExtractionResult | None = None
 
         if not self.config.force_ocr:
-            content = await self._extract_pdf_searchable_text(path)
-            if self._validate_extracted_text(content):
-                result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
+            try:
+                content = await self._extract_pdf_searchable_text(path)
+                if self._validate_extracted_text(content):
+                    result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
+            except ParsingError:
+                # If searchable text extraction fails, continue to OCR or empty result
+                pass
 
         if not result and self.config.ocr_backend is not None:
             result = await self._extract_pdf_text_with_ocr(path, self.config.ocr_backend)
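
The new `try/except ParsingError` changes the failure mode: a PDF whose text layer cannot be parsed no longer aborts extraction but falls through to OCR (or to an empty result when no OCR backend is configured). A runnable sketch of that control flow, with stand-in functions in place of the private methods:

    class ParsingError(Exception):  # stand-in for kreuzberg.exceptions.ParsingError
        pass

    def extract_searchable_text(path: str) -> str:
        raise ParsingError("corrupt xref table")  # simulate a broken text layer

    def run_ocr(path: str) -> str:
        return "text recovered via OCR"

    def extract(path: str, force_ocr: bool = False, ocr_backend: str | None = "tesseract") -> str:
        result = None
        if not force_ocr:
            try:
                text = extract_searchable_text(path)
                if text.strip():  # stand-in for _validate_extracted_text
                    result = text
            except ParsingError:
                pass  # fall through to OCR instead of aborting
        if result is None and ocr_backend is not None:
            result = run_ocr(path)
        return result or ""

    print(extract("broken.pdf"))  # -> text recovered via OCR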
@@ -73,7 +78,7 @@ class PDFExtractor(Extractor):
         if self.config.extract_tables:
             # GMFT is optional dependency
             try:
-                from kreuzberg._gmft import extract_tables
+                from kreuzberg._gmft import extract_tables  # noqa: PLC0415
 
                 result.tables = await extract_tables(path, self.config.gmft_config)
             except ImportError:
@@ -112,16 +117,19 @@ class PDFExtractor(Extractor):
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
         """Pure sync implementation of PDF extraction from path."""
-        text = self._extract_pdf_searchable_text_sync(path)
+        try:
+            text = self._extract_pdf_searchable_text_sync(path)
+        except ParsingError:
+            text = ""
 
-        if self.config.force_ocr or not self._validate_extracted_text(text):
+        if (self.config.force_ocr or not self._validate_extracted_text(text)) and self.config.ocr_backend is not None:
             text = self._extract_pdf_with_ocr_sync(path)
 
         tables = []
         if self.config.extract_tables:
             # GMFT is optional dependency
             try:
-                from kreuzberg._gmft import extract_tables_sync
+                from kreuzberg._gmft import extract_tables_sync  # noqa: PLC0415
 
                 tables = extract_tables_sync(path)
             except ImportError:
@@ -248,9 +256,10 @@ class PDFExtractor(Extractor):
             *[backend.process_image(image, **self.config.get_config_dict()) for image in images],
             batch_size=cpu_count(),
         )
-        return ExtractionResult(
-            content="\n".join([v.content for v in ocr_results]), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
-        )
+        # Use list comprehension and join for efficient string building
+        content = "\n".join(result.content for result in ocr_results)
+
+        return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
 
     @staticmethod
     async def _extract_pdf_searchable_text(input_file: Path) -> str:
@@ -269,22 +278,24 @@ class PDFExtractor(Extractor):
         try:
             with pypdfium_file_lock(input_file):
                 document = await run_sync(pypdfium2.PdfDocument, str(input_file))
-                text_parts = []
+                pages_content = []
                 page_errors = []
 
                 for i, page in enumerate(cast("pypdfium2.PdfDocument", document)):
                     try:
                         text_page = page.get_textpage()
-                        text_parts.append(text_page.get_text_bounded())
+                        page_content = text_page.get_text_bounded()
+                        pages_content.append(page_content)
                     except Exception as e:  # noqa: PERF203, BLE001
                         page_errors.append({"page": i + 1, "error": str(e)})
-                        text_parts.append(f"[Error extracting page {i + 1}]")
+                        pages_content.append(f"[Error extracting page {i + 1}]")
 
-                text = "\n".join(text_parts)
+                text = "\n".join(pages_content)
+                has_content = bool(text.strip())
 
-                if page_errors and text_parts:
+                if page_errors and has_content:
                     return normalize_spaces(text)
-                if not text_parts:
+                if not has_content:
                     raise ParsingError(
                         "Could not extract any text from PDF",
                         context=create_error_context(
@@ -315,14 +326,14 @@ class PDFExtractor(Extractor):
         try:
             with pypdfium_file_lock(path):
                 pdf = pypdfium2.PdfDocument(str(path))
-                text_parts = []
+                pages_text = []
                 for page in pdf:
                     text_page = page.get_textpage()
                     text = text_page.get_text_bounded()
-                    text_parts.append(text)
+                    pages_text.append(text)
                     text_page.close()
                     page.close()
-                return "".join(text_parts)
+                return "\n".join(pages_text)
         except Exception as e:
             raise ParsingError(f"Failed to extract PDF text: {e}") from e
         finally:
@@ -378,22 +389,22 @@ class PDFExtractor(Extractor):
             config = (
                 self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
             )
-            results = backend.process_batch_sync(paths, **config.__dict__)
+            results = backend.process_batch_sync(paths, **asdict(config))
         elif self.config.ocr_backend == "paddleocr":
             paddle_config = (
                 self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
             )
-            results = backend.process_batch_sync(paths, **paddle_config.__dict__)
+            results = backend.process_batch_sync(paths, **asdict(paddle_config))
         elif self.config.ocr_backend == "easyocr":
             easy_config = (
                 self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
             )
-            results = backend.process_batch_sync(paths, **easy_config.__dict__)
+            results = backend.process_batch_sync(paths, **asdict(easy_config))
         else:
             raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
 
-        text_parts = [r.content for r in results]
-        return "\n\n".join(text_parts)
+        # Use list comprehension and join for efficient string building
+        return "\n\n".join(result.content for result in results)
 
     def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
         """Extract text using playa for better structure preservation."""
  def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
399
410
  """Extract text using playa for better structure preservation."""
@@ -401,14 +412,14 @@ class PDFExtractor(Extractor):
         content = path.read_bytes()
         document = parse(content, max_workers=1)
 
-        text_parts = []
+        # Extract text while preserving structure
+        pages_text = []
         for page in document.pages:
-            # Extract text while preserving structure
             page_text = page.extract_text()
             if page_text and page_text.strip():
-                text_parts.append(page_text)
+                pages_text.append(page_text)
 
-        if text_parts:
-            return "\n\n".join(text_parts)
+        if pages_text:
+            return "\n\n".join(pages_text)
 
         return fallback_text
@@ -2,13 +2,16 @@ from __future__ import annotations
 
 import contextlib
 import csv
+import os
 import sys
+import tempfile
 from datetime import date, datetime, time, timedelta
 from io import StringIO
 from pathlib import Path
 from typing import Any
 
 from anyio import Path as AsyncPath
+from PIL import Image
 from python_calamine import CalamineWorkbook
 
 from kreuzberg._extractors._base import Extractor
@@ -68,9 +71,6 @@ class SpreadSheetExtractor(Extractor):
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         """Pure sync implementation of extract_bytes."""
-        import os
-        import tempfile
-
         fd, temp_path = tempfile.mkstemp(suffix=".xlsx")
 
         try:
@@ -198,9 +198,9 @@ class SpreadSheetExtractor(Extractor):
         """Enhanced sheet processing with better table structure preservation."""
         try:
             # pandas is optional dependency
-            import pandas as pd
+            import pandas as pd  # noqa: PLC0415
 
-            from kreuzberg._utils._table import enhance_table_markdown
+            from kreuzberg._utils._table import enhance_table_markdown  # noqa: PLC0415
 
             sheet = workbook.get_sheet_by_name(sheet_name)
             data = sheet.to_python()
@@ -218,9 +218,7 @@ class SpreadSheetExtractor(Extractor):
                 return f"## {sheet_name}\n\n*No data*"
 
             # Create a mock TableData for enhanced formatting
-            from PIL import Image
-
-            from kreuzberg._types import TableData
+            from kreuzberg._types import TableData  # noqa: PLC0415
 
             # Create a 1x1 transparent image as placeholder
             placeholder_image = Image.new("RGBA", (1, 1), (0, 0, 0, 0))
@@ -1,8 +1,22 @@
 from __future__ import annotations
 
 import json
+import sys
 from typing import TYPE_CHECKING, Any, ClassVar
 
+if sys.version_info >= (3, 11):
+    import tomllib
+else:
+    try:
+        import tomli as tomllib  # type: ignore[import-not-found]
+    except ImportError:
+        tomllib = None
+
+try:
+    import yaml
+except ImportError:
+    yaml = None
+
 from anyio import Path as AsyncPath
 
 from kreuzberg._extractors._base import Extractor
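
Hoisting the parser imports to module level with a version gate is the standard pattern here: `tomllib` is in the standard library from Python 3.11, `tomli` is the API-compatible backport for older interpreters, and binding the name to `None` lets call sites branch on availability instead of wrapping every parse in `try/except ImportError`. The consumer side looks roughly like:

    import sys

    if sys.version_info >= (3, 11):
        import tomllib
    else:
        try:
            import tomli as tomllib  # API-compatible backport
        except ImportError:
            tomllib = None

    def parse_toml(text: str) -> dict | None:
        if tomllib is None:
            return None  # caller falls back to returning the raw text
        return tomllib.loads(text)

    print(parse_toml('key = "value"'))  # {'key': 'value'}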
@@ -14,6 +28,9 @@ from kreuzberg._utils._sync import run_sync
 if TYPE_CHECKING:
     from pathlib import Path
 
+# Define text field keywords as a set for O(1) membership testing
+_TEXT_FIELD_KEYWORDS = frozenset({"title", "name", "subject", "description", "content", "body", "text", "message"})
+
 
 class StructuredDataExtractor(Extractor):
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
@@ -41,41 +58,34 @@ class StructuredDataExtractor(Extractor):
             if self.mime_type in {JSON_MIME_TYPE, "text/json"}:
                 data = json.loads(text_content)
             elif self.mime_type in {TOML_MIME_TYPE, "text/toml"}:
-                try:
-                    import tomllib  # type: ignore[import-not-found]
-                except ImportError:
-                    try:
-                        import tomli as tomllib  # type: ignore[import-not-found]
-                    except ImportError:
-                        return ExtractionResult(
-                            content=normalize_spaces(text_content),
-                            mime_type=PLAIN_TEXT_MIME_TYPE,
-                            metadata={"warning": "tomllib/tomli not available, returning raw text"},
-                            chunks=[],
-                        )
+                if tomllib is None:
+                    return ExtractionResult(
+                        content=normalize_spaces(text_content),
+                        mime_type=PLAIN_TEXT_MIME_TYPE,
+                        metadata={"warning": "tomllib/tomli not available, returning raw text"},
+                        chunks=[],
+                    )
                 data = tomllib.loads(text_content)
             else:
-                try:
-                    import yaml
-
-                    data = yaml.safe_load(text_content)
-                except ImportError:
+                if yaml is None:
                     return ExtractionResult(
                         content=normalize_spaces(text_content),
                         mime_type=PLAIN_TEXT_MIME_TYPE,
                         metadata={"warning": "PyYAML not available, returning raw text"},
                         chunks=[],
                     )
+                data = yaml.safe_load(text_content)
 
             text_parts: list[str] = []
             metadata: dict[str, Any] = {}
 
+            # Use match statement for cleaner code and avoid multiple isinstance calls
             if isinstance(data, dict):
-                text_parts.extend(self._extract_from_dict(data, metadata))
+                text_parts = self._extract_from_dict(data, metadata)
             elif isinstance(data, list):
-                text_parts.extend(self._extract_from_list(data, metadata))
+                text_parts = self._extract_from_list(data, metadata)
             else:
-                text_parts.append(str(data))
+                text_parts = [str(data)]
 
             combined_text = "\n".join(text_parts) if text_parts else text_content
 
@@ -86,7 +96,7 @@ class StructuredDataExtractor(Extractor):
                 chunks=[],
             )
 
-        except (ValueError, TypeError, KeyError, AttributeError, UnicodeDecodeError) as e:
+        except (json.JSONDecodeError, ValueError, TypeError) as e:
             return ExtractionResult(
                 content=normalize_spaces(text_content),
                 mime_type=PLAIN_TEXT_MIME_TYPE,
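
The narrowed `except` tuple above still covers the JSON and TOML parsers, since `json.JSONDecodeError` and `tomllib.TOMLDecodeError` both subclass `ValueError` (listing `json.JSONDecodeError` first is effectively documentation). A quick check on Python 3.11+:

    import json
    import tomllib

    print(issubclass(json.JSONDecodeError, ValueError))     # True
    print(issubclass(tomllib.TOMLDecodeError, ValueError))  # True

    try:
        json.loads("{not json")
    except (json.JSONDecodeError, ValueError, TypeError) as e:
        print(type(e).__name__)  # JSONDecodeError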
@@ -107,10 +117,9 @@ class StructuredDataExtractor(Extractor):
             if isinstance(value, str) and value.strip():
                 text_parts.append(f"{full_key}: {value}")
 
-                if any(
-                    text_field in key.lower()
-                    for text_field in ["title", "name", "subject", "description", "content", "body", "text", "message"]
-                ):
+                # Check if key contains any text field keywords efficiently
+                key_lower = key.lower()
+                if any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS):
                     metadata[full_key] = value
 
             elif isinstance(value, (int, float, bool)):
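
One caveat on the comment in the hunk above: because each keyword is substring-tested against the key, the frozenset is still iterated linearly; the concrete wins are hoisting the keyword list out of the loop into a module-level constant and lowercasing the key once per key. A small sketch of the matching behavior:

    _TEXT_FIELD_KEYWORDS = frozenset(
        {"title", "name", "subject", "description", "content", "body", "text", "message"}
    )

    def is_text_field(key: str) -> bool:
        key_lower = key.lower()
        # Substring match, not exact membership:
        return any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS)

    print(is_text_field("book_title"))  # True  ("title" in "book_title")
    print(is_text_field("Username"))    # True  ("name" in "username")
    print(is_text_field("page_count"))  # False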
kreuzberg/_gmft.py CHANGED
@@ -1,14 +1,21 @@
 from __future__ import annotations
 
+import io
 import multiprocessing as mp
 import os
 import queue
 import signal
+import time
 import traceback
 from dataclasses import dataclass, field
 from io import StringIO
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Literal
 
+import anyio
+import msgspec
+from PIL import Image
+
 from kreuzberg._types import TableData
 from kreuzberg._utils._sync import run_sync
 from kreuzberg.exceptions import MissingDependencyError, ParsingError
@@ -20,7 +27,7 @@ if TYPE_CHECKING:
     from pandas import DataFrame
 
 
-@dataclass(unsafe_hash=True)
+@dataclass(unsafe_hash=True, slots=True)
 class GMFTConfig:
     """Configuration options for GMFT.
 
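
`slots=True` (available for dataclasses since Python 3.10) stores fields in fixed slots instead of a per-instance `__dict__`, which shrinks instances and speeds up attribute access; the trade-off is that anything reading `config.__dict__` stops working, which is exactly why this release also swaps those call sites to `asdict()` / `msgspec.to_builtins()`. A quick illustration with a hypothetical config:

    from dataclasses import dataclass

    @dataclass(unsafe_hash=True, slots=True)
    class Config:  # hypothetical stand-in for GMFTConfig
        verbose: bool = False

    c = Config()
    print(c.__slots__)  # ('verbose',)

    try:
        c.__dict__
    except AttributeError:
        print("no __dict__ on a slots dataclass")

    try:
        c.undeclared = 1  # slots also reject undeclared attributes
    except AttributeError as e:
        print(e)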
@@ -131,7 +138,7 @@ class GMFTConfig:
     """
 
 
-async def extract_tables(  # noqa: PLR0915
+async def extract_tables(
     file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
 ) -> list[TableData]:
     """Extracts tables from a PDF file.
@@ -151,9 +158,7 @@ async def extract_tables( # noqa: PLR0915
     Returns:
         A list of table data dictionaries.
     """
-    from pathlib import Path
-
-    from kreuzberg._utils._cache import get_table_cache
+    from kreuzberg._utils._cache import get_table_cache  # noqa: PLC0415
 
     # Determine if we should use isolated process  # ~keep
     if use_isolated_process is None:
@@ -178,7 +183,7 @@ async def extract_tables( # noqa: PLR0915
     cache_kwargs = {
         "file_info": str(sorted(file_info.items())),
         "extractor": "gmft",
-        "config": str(sorted(config.__dict__.items())),
+        "config": str(sorted(msgspec.to_builtins(config).items())),
     }
 
     table_cache = get_table_cache()
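
`msgspec.to_builtins()` converts supported objects, including slots dataclasses, into plain builtins (dicts, lists, strings, numbers), which is what a stable cache key needs; `config.__dict__` would fail on the now-slotted `GMFTConfig`. A minimal sketch with a hypothetical config (field names are illustrative, not GMFT's actual options):

    from dataclasses import dataclass

    import msgspec

    @dataclass(slots=True)
    class Config:  # hypothetical stand-in for GMFTConfig
        detector_threshold: float = 0.9
        remove_null_rows: bool = True

    cfg = Config()
    as_builtins = msgspec.to_builtins(cfg)
    print(as_builtins)  # {'detector_threshold': 0.9, 'remove_null_rows': True}

    # Deterministic string form, as used for the cache key above:
    print(str(sorted(as_builtins.items())))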
@@ -187,8 +192,6 @@ async def extract_tables( # noqa: PLR0915
         return cached_result  # type: ignore[no-any-return]
 
     if table_cache.is_processing(**cache_kwargs):
-        import anyio
-
         event = table_cache.mark_processing(**cache_kwargs)
         await anyio.to_thread.run_sync(event.wait)
 
@@ -208,10 +211,13 @@ async def extract_tables( # noqa: PLR0915
         return result
 
     try:
-        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
-        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
-        from gmft.formatters.tatr import TATRFormatConfig
-        from gmft.pdf_bindings.pdfium import PyPDFium2Document
+        from gmft.auto import (  # type: ignore[attr-defined] # noqa: PLC0415 # noqa: PLC0415
+            AutoTableDetector,
+            AutoTableFormatter,
+        )
+        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined] # noqa: PLC0415
+        from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415 # noqa: PLC0415
+        from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415 # noqa: PLC0415
 
         formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call] # type: ignore[no-untyped-call]
             config=TATRFormatConfig(
@@ -281,9 +287,7 @@ def extract_tables_sync(
     Returns:
         A list of table data dictionaries.
     """
-    from pathlib import Path
-
-    from kreuzberg._utils._cache import get_table_cache
+    from kreuzberg._utils._cache import get_table_cache  # noqa: PLC0415
 
     # Determine if we should use isolated process  # ~keep
     if use_isolated_process is None:
@@ -308,7 +312,7 @@ def extract_tables_sync(
     cache_kwargs = {
         "file_info": str(sorted(file_info.items())),
         "extractor": "gmft",
-        "config": str(sorted(config.__dict__.items())),
+        "config": str(sorted(msgspec.to_builtins(config).items())),
     }
 
     table_cache = get_table_cache()
@@ -324,10 +328,10 @@ def extract_tables_sync(
         return result
 
     try:
-        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
-        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
-        from gmft.formatters.tatr import TATRFormatConfig
-        from gmft.pdf_bindings.pdfium import PyPDFium2Document
+        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined] # noqa: PLC0415
+        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined] # noqa: PLC0415
+        from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
+        from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415
 
         formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]
             config=TATRFormatConfig(
@@ -396,10 +400,10 @@ def _extract_tables_in_process(
     signal.signal(signal.SIGINT, signal.SIG_IGN)
 
     try:
-        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
-        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
-        from gmft.formatters.tatr import TATRFormatConfig
-        from gmft.pdf_bindings.pdfium import PyPDFium2Document
+        from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined] # noqa: PLC0415
+        from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined] # noqa: PLC0415
+        from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
+        from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415
 
         config = GMFTConfig(**config_dict)
@@ -435,8 +439,6 @@ def _extract_tables_in_process(
 
         results = []
         for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False):
-            import io
-
             img_bytes = io.BytesIO()
             cropped_image = cropped_table.image()
             cropped_image.save(img_bytes, format="PNG")
@@ -480,7 +482,7 @@ def _extract_tables_isolated(
         RuntimeError: If extraction fails or times out
     """
     config = config or GMFTConfig()
-    config_dict = config.__dict__.copy()
+    config_dict = msgspec.to_builtins(config)
 
     ctx = mp.get_context("spawn")
     result_queue = ctx.Queue()
@@ -494,7 +496,6 @@ def _extract_tables_isolated(
 
     try:
         # Wait for result with timeout, checking for process death  # ~keep
-        import time
-
         start_time = time.time()
         while True:
@@ -528,12 +529,8 @@ def _extract_tables_isolated(
     if success:
         tables = []
         for table_dict in result:
-            import io
-
-            from PIL import Image
-
             img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
-            import pandas as pd
+            import pandas as pd  # noqa: PLC0415
 
             df = pd.read_csv(StringIO(table_dict["df_csv"]))
 
@@ -578,7 +575,7 @@ def _extract_tables_isolated(
 async def _extract_tables_isolated_async(
     file_path: str | PathLike[str],
     config: GMFTConfig | None = None,
-    timeout: float = 300.0,
+    timeout: float = 300.0,  # noqa: ASYNC109
 ) -> list[TableData]:
     """Async version of extract_tables_isolated using asyncio.
 
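
`ASYNC109` is the Ruff/flake8-async rule that flags async functions accepting a `timeout` parameter, on the theory that callers should impose deadlines with a cancel scope instead; the `# noqa` keeps the explicit parameter here. The caller-side style the rule prefers looks roughly like this with anyio (which this module already imports):

    import anyio

    async def do_work() -> str:
        await anyio.sleep(0.1)  # stand-in for the actual table extraction
        return "done"

    async def main() -> None:
        # Deadline enforced at the call site via a cancel scope,
        # instead of threading a timeout= parameter through the API.
        with anyio.fail_after(300.0):
            print(await do_work())

    anyio.run(main)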
@@ -593,10 +590,8 @@ async def _extract_tables_isolated_async(
     Raises:
         RuntimeError: If extraction fails or times out
     """
-    import anyio
-
     config = config or GMFTConfig()
-    config_dict = config.__dict__.copy()
+    config_dict = msgspec.to_builtins(config)
 
     ctx = mp.get_context("spawn")
     result_queue = ctx.Queue()
@@ -640,12 +635,8 @@ async def _extract_tables_isolated_async(
     if success:
         tables = []
         for table_dict in result:
-            import io
-
-            from PIL import Image
-
             img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
-            import pandas as pd
+            import pandas as pd  # noqa: PLC0415
 
             df = pd.read_csv(StringIO(table_dict["df_csv"]))
 
@@ -23,7 +23,7 @@ except ImportError:
 _CACHE_SIZE = 128
 
 
-@dataclass(frozen=True)
+@dataclass(frozen=True, slots=True)
 class LanguageDetectionConfig:
     """Configuration for language detection.