kreuzberg-3.15.0-py3-none-any.whl → kreuzberg-3.16.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/__init__.py CHANGED
@@ -8,8 +8,10 @@ from ._types import (
     ExtractionConfig,
     ExtractionResult,
     GMFTConfig,
+    HTMLToMarkdownConfig,
     ImageOCRConfig,
     ImageOCRResult,
+    JSONExtractionConfig,
     LanguageDetectionConfig,
     Metadata,
     PaddleOCRConfig,
@@ -40,8 +42,10 @@ __all__ = [
     "ExtractionResult",
     "ExtractorRegistry",
     "GMFTConfig",
+    "HTMLToMarkdownConfig",
     "ImageOCRConfig",
     "ImageOCRResult",
+    "JSONExtractionConfig",
     "KreuzbergError",
     "LanguageDetectionConfig",
     "Metadata",
kreuzberg/_api/main.py CHANGED
@@ -13,10 +13,8 @@ from typing_extensions import TypedDict
 
 from kreuzberg import (
     EasyOCRConfig,
-    ExtractedImage,
     ExtractionConfig,
     ExtractionResult,
-    ImageOCRResult,
     KreuzbergError,
     MissingDependencyError,
     PaddleOCRConfig,
@@ -40,30 +38,6 @@ if TYPE_CHECKING:
     from litestar.datastructures import UploadFile
 
 
-class ExtractedImageDict(TypedDict):
-    """TypedDict for extracted image JSON representation."""
-
-    data: str
-    format: str
-    filename: str | None
-    page_number: int | None
-    dimensions: tuple[int, int] | None
-    colorspace: str | None
-    bits_per_component: int | None
-    is_mask: bool
-    description: str | None
-
-
-class ImageOCRResultDict(TypedDict):
-    """TypedDict for image OCR result JSON representation."""
-
-    image: ExtractedImageDict
-    ocr_result: Any
-    confidence_score: float | None
-    processing_time: float | None
-    skipped_reason: str | None
-
-
 class HealthResponse(TypedDict):
     """Response model for health check endpoint."""
 
@@ -384,31 +358,6 @@ def _pil_image_encoder(obj: Any) -> str:
     return f"data:image/png;base64,{img_str}"
 
 
-def _extracted_image_encoder(obj: ExtractedImage) -> ExtractedImageDict:
-    encoded_data = base64.b64encode(obj.data).decode()
-    return ExtractedImageDict(
-        data=f"data:image/{obj.format};base64,{encoded_data}",
-        format=obj.format,
-        filename=obj.filename,
-        page_number=obj.page_number,
-        dimensions=obj.dimensions,
-        colorspace=obj.colorspace,
-        bits_per_component=obj.bits_per_component,
-        is_mask=obj.is_mask,
-        description=obj.description,
-    )
-
-
-def _image_ocr_result_encoder(obj: ImageOCRResult) -> ImageOCRResultDict:
-    return ImageOCRResultDict(
-        image=_extracted_image_encoder(obj.image),
-        ocr_result=obj.ocr_result,
-        confidence_score=obj.confidence_score,
-        processing_time=obj.processing_time,
-        skipped_reason=obj.skipped_reason,
-    )
-
-
 openapi_config = OpenAPIConfig(
     title="Kreuzberg API",
     version="3.14.0",
@@ -428,8 +377,6 @@ openapi_config = OpenAPIConfig(
 type_encoders = {
     pl.DataFrame: _polars_dataframe_encoder,
     Image.Image: _pil_image_encoder,
-    ExtractedImage: _extracted_image_encoder,
-    ImageOCRResult: _image_ocr_result_encoder,
 }
 
 app = Litestar(
kreuzberg/_config.py CHANGED
@@ -69,7 +69,17 @@ def _build_ocr_config_from_cli(
    try:
        match ocr_backend:
            case "tesseract":
-                return TesseractConfig(**backend_args)
+                # Handle PSM mode conversion from int to enum
+                processed_args = backend_args.copy()
+                if "psm" in processed_args and isinstance(processed_args["psm"], int):
+                    try:
+                        processed_args["psm"] = PSMMode(processed_args["psm"])
+                    except ValueError as e:
+                        raise ValidationError(
+                            f"Invalid PSM mode value: {processed_args['psm']}",
+                            context={"psm_value": processed_args["psm"], "error": str(e)},
+                        ) from e
+                return TesseractConfig(**processed_args)
            case "easyocr":
                return EasyOCRConfig(**backend_args)
            case "paddleocr":
@@ -132,7 +132,7 @@ def classify_document_from_layout(
        if not found_words.is_empty():
            scores[doc_type] += 1.0
            word_top = found_words[0, "top"]
-            if word_top < page_height * 0.3:
+            if word_top is not None and word_top < page_height * 0.3:
                scores[doc_type] += 0.5

    total_score = sum(scores.values())
@@ -27,6 +27,8 @@ except ImportError:  # pragma: no cover
    html2text = None

_HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
+_UNICODE_QUOTES_PATTERN = re.compile(r"[\u201c\u201d]")
+_UNICODE_SINGLE_QUOTES_PATTERN = re.compile(r"[\u2018\u2019]")


class EmailExtractor(Extractor):
@@ -86,7 +88,14 @@ class EmailExtractor(Extractor):
    def _format_email_field(self, field: Any) -> str:
        match field:
            case list():
-                return ", ".join(str(item.get("email", "")) if isinstance(item, dict) else str(item) for item in field)
+                emails = []
+                for item in field:
+                    if isinstance(item, dict):
+                        if email := item.get("email", ""):
+                            emails.append(str(email))
+                    else:
+                        emails.append(str(item))
+                return ", ".join(emails)
            case dict():
                return str(field.get("email", ""))
            case _:
@@ -111,12 +120,8 @@
        cleaned = re.sub(r"<style[^>]*>.*?</style>", "", cleaned, flags=re.IGNORECASE | re.DOTALL)
        clean_html = _HTML_TAG_PATTERN.sub("", cleaned)
        clean_html = unescape(clean_html)
-        clean_html = (
-            clean_html.replace("\u201c", '"')
-            .replace("\u201d", '"')
-            .replace("\u2019", "'")
-            .replace("\u2018", "'")
-        )
+        clean_html = _UNICODE_QUOTES_PATTERN.sub('"', clean_html)
+        clean_html = _UNICODE_SINGLE_QUOTES_PATTERN.sub("'", clean_html)
        text_parts.append(clean_html)

    def _extract_email_attachments(
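The two precompiled patterns are a behavior-preserving replacement for the removed chain of str.replace calls, normalizing curly quotes to their ASCII equivalents. A quick equivalence check:

    import re

    _UNICODE_QUOTES_PATTERN = re.compile(r"[\u201c\u201d]")
    _UNICODE_SINGLE_QUOTES_PATTERN = re.compile(r"[\u2018\u2019]")

    s = "\u201cHello\u201d \u2018world\u2019"  # “Hello” ‘world’
    s = _UNICODE_QUOTES_PATTERN.sub('"', s)
    s = _UNICODE_SINGLE_QUOTES_PATTERN.sub("'", s)
    assert s == "\"Hello\" 'world'"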
@@ -129,12 +134,12 @@
        for att in attachments:
            name_val: str = "unknown"
            if isinstance(att, dict):
-                n = att.get("name")
+                n = att.get("name") or att.get("filename")
                if isinstance(n, str) and n:
                    name_val = n
            names.append(name_val)
-        metadata["attachments"] = names
        if names:
+            metadata["attachments"] = names
            text_parts.append("Attachments: " + ", ".join(names))

    def _extract_images_from_attachments(self, parsed_email: dict[str, Any]) -> list[ExtractedImage]:
@@ -151,7 +156,8 @@
            if not isinstance(mime, str) or not mime.startswith("image/"):
                continue

-            name = att.get("name") if isinstance(att.get("name"), str) else None
+            name = att.get("name") or att.get("filename")
+            name = name if isinstance(name, str) else None
            data = att.get("data") or att.get("content") or att.get("payload")
            raw: bytes | None = None
            if isinstance(data, (bytes, bytearray)):
@@ -1,16 +1,20 @@
 from __future__ import annotations
 
 import base64
+import binascii
+import io
 import logging
 from typing import TYPE_CHECKING, ClassVar
 
 import html_to_markdown
 from anyio import Path as AsyncPath
 from bs4 import BeautifulSoup
+from PIL import Image
 
 from kreuzberg._extractors._base import MAX_SINGLE_IMAGE_SIZE, Extractor
 from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
 from kreuzberg._types import ExtractedImage, ExtractionResult, HTMLToMarkdownConfig
+from kreuzberg._utils._html_streaming import should_use_streaming
 from kreuzberg._utils._string import safe_decode
 from kreuzberg._utils._sync import run_maybe_async, run_sync
 
@@ -44,6 +48,11 @@ class HTMLExtractor(Extractor):
        config_dict = config.to_dict()

        html_content = safe_decode(content)
+
+        use_streaming, chunk_size = should_use_streaming(len(content))
+        config_dict["stream_processing"] = use_streaming
+        config_dict["chunk_size"] = chunk_size
+
        result = html_to_markdown.convert_to_markdown(html_content, **config_dict)

        extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={})
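`should_use_streaming` itself is not part of this diff; from the call site, it maps the raw content size to a `(use_streaming, chunk_size)` pair that is forwarded to `html_to_markdown`. A minimal sketch of that contract, with entirely hypothetical numbers:

    def should_use_streaming(content_length: int) -> tuple[bool, int]:
        # Hypothetical thresholds: the real values live in
        # kreuzberg._utils._html_streaming and are not shown in this diff.
        threshold = 1024 * 1024  # stream HTML larger than ~1 MiB
        chunk_size = 64 * 1024   # process in 64 KiB chunks
        return content_length > threshold, chunk_size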
@@ -89,6 +98,13 @@
                    )
                    continue

+                dimensions = None
+                try:
+                    with Image.open(io.BytesIO(image_data)) as pil_img:
+                        dimensions = pil_img.size
+                except (OSError, ValueError) as e:
+                    logger.debug("Could not determine image dimensions for %s: %s", format_name, e)
+
                alt_val = img.get("alt")  # type: ignore[union-attr]
                desc = alt_val if isinstance(alt_val, str) else None
                images.append(
@@ -97,25 +113,36 @@
                        format=format_name,
                        filename=f"embedded_image_{len(images) + 1}.{format_name}",
                        description=desc,
+                        dimensions=dimensions,
                    )
                )
-            except Exception as e:  # noqa: BLE001
+            except (ValueError, binascii.Error) as e:
                logger.warning("Failed to extract base64 image: %s", e)

-        for svg in soup.find_all("svg"):
+        def extract_svg_safe(svg_element: object) -> ExtractedImage | None:
            try:
-                svg_content = str(svg).encode("utf-8")
-                title_or_aria = svg.get("title") or svg.get("aria-label")  # type: ignore[union-attr]
+                svg_content = str(svg_element).encode("utf-8")
+
+                def _get_attr_safe(obj: object, attr: str) -> str | None:
+                    get_method = getattr(obj, "get", None)
+                    if callable(get_method):
+                        result = get_method(attr)
+                        return result if isinstance(result, str) else None
+                    return None
+
+                title_or_aria = _get_attr_safe(svg_element, "title") or _get_attr_safe(svg_element, "aria-label")
                desc_svg = title_or_aria if isinstance(title_or_aria, str) else None
-                images.append(
-                    ExtractedImage(
-                        data=svg_content,
-                        format="svg",
-                        filename=f"inline_svg_{len(images) + 1}.svg",
-                        description=desc_svg,
-                    )
+                return ExtractedImage(
+                    data=svg_content,
+                    format="svg",
+                    filename=f"inline_svg_{len(images) + 1}.svg",
+                    description=desc_svg,
                )
-            except Exception as e:  # noqa: BLE001, PERF203
+            except (UnicodeEncodeError, AttributeError) as e:
                logger.warning("Failed to extract SVG: %s", e)
+                return None
+
+        svg_images = [extract_svg_safe(svg) for svg in soup.find_all("svg")]
+        images.extend(img for img in svg_images if img is not None)

        return images
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import asyncio
 import contextlib
 import io
 import logging
@@ -41,7 +40,7 @@ from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
 from kreuzberg._utils._resource_managers import pdf_document, pdf_document_sync, pdf_resources_sync
 from kreuzberg._utils._string import normalize_spaces
-from kreuzberg._utils._sync import run_maybe_async, run_taskgroup_batched
+from kreuzberg._utils._sync import run_maybe_async, run_taskgroup, run_taskgroup_batched
 from kreuzberg._utils._table import generate_table_summary
 from kreuzberg._utils._tmp import temporary_file, temporary_file_sync
 from kreuzberg.exceptions import ParsingError
@@ -231,7 +230,7 @@ class PDFExtractor(Extractor):
                    img_counter += 1

        if tasks:
-            results = await asyncio.gather(*tasks)
+            results = await run_taskgroup(*tasks)
            return [img for img in results if img is not None]

        return []
@@ -142,6 +142,8 @@ class PresentationExtractor(Extractor):
            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                try:
                    image = shape.image
+                    if not image.blob or not isinstance(image.blob, bytes):
+                        continue
                    filename = f"slide_{slide_num}_image_{len(images) + 1}.{image.ext}"

                    images.append(
@@ -162,6 +164,8 @@
                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                    try:
                        image = shape.image
+                        if not image.blob or not isinstance(image.blob, bytes):
+                            continue
                        filename = f"slide_{slide_num}_group_image_{image_count + len(images) + 1}.{image.ext}"
                        images.append(
                            ExtractedImage(data=image.blob, format=image.ext, filename=filename, page_number=slide_num)
@@ -197,7 +197,6 @@ class SpreadSheetExtractor(Extractor):
        if not data or not any(row for row in data):
            return f"## {sheet_name}\n\n*Empty sheet*"

-        # Normalize row lengths to avoid polars ShapeError
        if data:
            max_cols = max(len(row) if row else 0 for row in data)
            data = [row + [None] * (max_cols - len(row)) if row else [None] * max_cols for row in data]  # type: ignore[list-item]
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import json
 import sys
 from typing import TYPE_CHECKING, Any, ClassVar
 
@@ -17,11 +16,13 @@ try:
 except ImportError:  # pragma: no cover
     yaml = None
 
+
 from anyio import Path as AsyncPath
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import JSON_MIME_TYPE, PLAIN_TEXT_MIME_TYPE, TOML_MIME_TYPE, YAML_MIME_TYPE
-from kreuzberg._types import ExtractionResult, normalize_metadata
+from kreuzberg._types import ExtractionResult, JSONExtractionConfig, normalize_metadata
+from kreuzberg._utils._serialization import deserialize
 from kreuzberg._utils._string import normalize_spaces, safe_decode
 from kreuzberg._utils._sync import run_sync
 
@@ -43,6 +44,42 @@ class StructuredDataExtractor(Extractor):
        "text/toml",
    }

+    @property
+    def _json_config(self) -> JSONExtractionConfig | None:
+        return self.config.json_config
+
+    def _get_text_field_keywords(self) -> frozenset[str]:
+        json_config = self._json_config
+        if json_config and json_config.custom_text_field_patterns:
+            return _TEXT_FIELD_KEYWORDS | json_config.custom_text_field_patterns
+        return _TEXT_FIELD_KEYWORDS
+
+    def _extract_json_schema(self, data: Any, path: str = "", depth: int = 0) -> dict[str, Any]:
+        json_config = self._json_config
+        if not json_config or not json_config.extract_schema:
+            return {}
+
+        if depth >= json_config.max_depth:
+            return {"max_depth_reached": True}
+
+        schema_info: dict[str, Any] = {"type": type(data).__name__}
+
+        if isinstance(data, dict):
+            schema_info["properties"] = {}
+            for key, value in data.items():
+                key_path = f"{path}.{key}" if path else key
+                schema_info["properties"][key] = self._extract_json_schema(value, key_path, depth + 1)
+        elif isinstance(data, list) and data:
+            if len(data) <= json_config.array_item_limit:
+                schema_info["items"] = self._extract_json_schema(data[0], f"{path}[0]", depth + 1)
+                schema_info["length"] = len(data)
+            else:
+                schema_info["items"] = {"type": "truncated"}
+                schema_info["length"] = len(data)
+                schema_info["truncated"] = True
+
+        return schema_info
+
    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
        return await run_sync(self.extract_bytes_sync, content)

@@ -51,12 +88,12 @@
        return await self.extract_bytes_async(content)

    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        text_content = safe_decode(content)
-
+        text_content: None | str = None
        try:
            if self.mime_type in {JSON_MIME_TYPE, "text/json"}:
-                data = json.loads(text_content)
+                data = deserialize(content, dict, json=True)
            elif self.mime_type in {TOML_MIME_TYPE, "text/toml"}:
+                text_content = safe_decode(content)
                if tomllib is None:
                    return ExtractionResult(
                        content=normalize_spaces(text_content),
@@ -66,6 +103,7 @@
                    )
                data = tomllib.loads(text_content)
            else:
+                text_content = safe_decode(content)
                if yaml is None:
                    return ExtractionResult(
                        content=normalize_spaces(text_content),
@@ -75,9 +113,17 @@
                    )
                data = yaml.safe_load(text_content)

-            text_parts: list[str] = []
            metadata: dict[str, Any] = {}

+            if (
+                self.mime_type in {JSON_MIME_TYPE, "text/json"}
+                and self._json_config
+                and self._json_config.extract_schema
+            ):
+                schema_info = self._extract_json_schema(data)
+                if schema_info:
+                    metadata["json_schema"] = schema_info
+
            if isinstance(data, dict):
                text_parts = self._extract_from_dict(data, metadata)
            elif isinstance(data, list):
@@ -85,7 +131,7 @@
                text_parts = self._extract_from_list(data, metadata)
            else:
                text_parts = [str(data)]
-            combined_text = "\n".join(text_parts) if text_parts else text_content
+            combined_text = "\n".join(text_parts) if text_parts else (text_content or safe_decode(content))

            return ExtractionResult(
                content=normalize_spaces(combined_text),
@@ -96,7 +142,7 @@

        except (ValueError, TypeError) as e:
            return ExtractionResult(
-                content=normalize_spaces(text_content),
+                content=normalize_spaces(text_content or safe_decode(content)),
                mime_type=PLAIN_TEXT_MIME_TYPE,
                metadata={"parse_error": str(e)},
                chunks=[],
@@ -113,23 +159,38 @@
            full_key = f"{prefix}.{key}" if prefix else key

            if isinstance(value, str) and value.strip():
-                text_parts.append(f"{full_key}: {value}")
+                if self._json_config and self._json_config.include_type_info:
+                    text_parts.append(f"{full_key} (string): {value}")
+                else:
+                    text_parts.append(f"{full_key}: {value}")

                key_lower = key.lower()
-                if any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS):
+                text_field_keywords = self._get_text_field_keywords()
+                if any(keyword in key_lower for keyword in text_field_keywords):
                    metadata[full_key] = value

            elif isinstance(value, (int, float, bool)):
-                text_parts.append(f"{full_key}: {value}")
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(value).__name__
+                    text_parts.append(f"{full_key} ({type_name}): {value}")
+                else:
+                    text_parts.append(f"{full_key}: {value}")

            elif isinstance(value, dict):
-                text_parts.extend(self._extract_from_dict(value, metadata, full_key))
+                if self._json_config and not self._json_config.flatten_nested_objects:
+                    text_parts.append(f"{full_key}: [nested object with {len(value)} properties]")
+                else:
+                    text_parts.extend(self._extract_from_dict(value, metadata, full_key))

            elif isinstance(value, list):
                text_parts.extend(self._extract_from_list(value, metadata, full_key))

            elif value is not None:
-                text_parts.append(f"{full_key}: {value!s}")
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(value).__name__
+                    text_parts.append(f"{full_key} ({type_name}): {value!s}")
+                else:
+                    text_parts.append(f"{full_key}: {value!s}")

        return text_parts

@@ -140,7 +201,10 @@
            item_key = f"{prefix}[{i}]" if prefix else f"item_{i}"

            if isinstance(item, str) and item.strip():
-                text_parts.append(f"{item_key}: {item}")
+                if self._json_config and self._json_config.include_type_info:
+                    text_parts.append(f"{item_key} (string): {item}")
+                else:
+                    text_parts.append(f"{item_key}: {item}")

            elif isinstance(item, dict):
                text_parts.extend(self._extract_from_dict(item, metadata, item_key))
@@ -149,6 +213,10 @@
                text_parts.extend(self._extract_from_list(item, metadata, item_key))

            elif item is not None:
-                text_parts.append(f"{item_key}: {item!s}")
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(item).__name__
+                    text_parts.append(f"{item_key} ({type_name}): {item!s}")
+                else:
+                    text_parts.append(f"{item_key}: {item!s}")

        return text_parts
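Tracing the new code paths end to end: with `extract_schema=True` and `include_type_info=True` (and assuming the `max_depth` and `array_item_limit` defaults are large enough for this input), a small JSON document is rendered roughly as follows:

    # Input: {"title": "Q3 Report", "pages": 12, "tags": ["finance", "internal"]}
    #
    # Text content (_extract_from_dict / _extract_from_list):
    #   title (string): Q3 Report
    #   pages (int): 12
    #   tags[0] (string): finance
    #   tags[1] (string): internal
    #
    # metadata["json_schema"] (_extract_json_schema):
    #   {"type": "dict", "properties": {
    #       "title": {"type": "str"},
    #       "pages": {"type": "int"},
    #       "tags": {"type": "list", "items": {"type": "str"}, "length": 2}}}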
kreuzberg/_gmft.py CHANGED
@@ -312,6 +312,11 @@ def _extract_tables_in_process(
    from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
    from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415

+    if "cell_required_confidence" in config_dict:
+        cell_config = config_dict["cell_required_confidence"]
+        if isinstance(cell_config, dict) and cell_config:
+            config_dict["cell_required_confidence"] = {int(k): v for k, v in cell_config.items()}
+
    config = GMFTConfig(**config_dict)

    formatter = AutoTableFormatter(  # type: ignore[no-untyped-call]
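This guard appears to exist because the config dict reaches `_extract_tables_in_process` in serialized form, and a JSON-style round-trip turns integer dict keys into strings, while GMFTConfig expects them as ints. The conversion in isolation:

    cell_config = {"0": 0.3, "1": 0.5}  # keys stringified by serialization
    restored = {int(k): v for k, v in cell_config.items()}
    assert restored == {0: 0.3, 1: 0.5}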