kreuzberg-3.15.0-py3-none-any.whl → kreuzberg-3.17.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. kreuzberg/__init__.py +6 -0
  2. kreuzberg/_api/main.py +0 -53
  3. kreuzberg/_config.py +17 -8
  4. kreuzberg/_document_classification.py +1 -1
  5. kreuzberg/_extractors/_base.py +0 -46
  6. kreuzberg/_extractors/_email.py +16 -10
  7. kreuzberg/_extractors/_html.py +39 -12
  8. kreuzberg/_extractors/_pandoc.py +2 -2
  9. kreuzberg/_extractors/_pdf.py +6 -7
  10. kreuzberg/_extractors/_presentation.py +4 -0
  11. kreuzberg/_extractors/_spread_sheet.py +0 -1
  12. kreuzberg/_extractors/_structured.py +83 -15
  13. kreuzberg/_gmft.py +7 -2
  14. kreuzberg/_mcp/server.py +1 -22
  15. kreuzberg/_mime_types.py +1 -1
  16. kreuzberg/_ocr/_easyocr.py +47 -20
  17. kreuzberg/_ocr/_paddleocr.py +1 -1
  18. kreuzberg/_ocr/_tesseract.py +27 -26
  19. kreuzberg/_token_reduction/__init__.py +11 -0
  20. kreuzberg/_token_reduction/_reducer.py +439 -0
  21. kreuzberg/_token_reduction/_stopwords.py +116 -0
  22. kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
  23. kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
  24. kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
  25. kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
  26. kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
  27. kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
  28. kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
  29. kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
  30. kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
  31. kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
  32. kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
  33. kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
  34. kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
  35. kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
  36. kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
  37. kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
  38. kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
  39. kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
  40. kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
  41. kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
  42. kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
  43. kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
  44. kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
  45. kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
  46. kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
  47. kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
  48. kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
  49. kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
  50. kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
  51. kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
  52. kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
  53. kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
  54. kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
  55. kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
  56. kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
  57. kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
  58. kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
  59. kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
  60. kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
  61. kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
  62. kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
  63. kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
  64. kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
  65. kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
  66. kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
  67. kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
  68. kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
  69. kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
  70. kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
  71. kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
  72. kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
  73. kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
  74. kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
  75. kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
  76. kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
  77. kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
  78. kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
  79. kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
  80. kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
  81. kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
  82. kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
  83. kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
  84. kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
  85. kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
  86. kreuzberg/_types.py +146 -43
  87. kreuzberg/_utils/_html_streaming.py +20 -0
  88. kreuzberg/_utils/_image_preprocessing.py +1 -1
  89. kreuzberg/_utils/_ref.py +14 -6
  90. kreuzberg/_utils/_serialization.py +13 -6
  91. kreuzberg/_utils/_sync.py +15 -16
  92. kreuzberg/exceptions.py +0 -1
  93. kreuzberg/extraction.py +27 -11
  94. {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/METADATA +15 -13
  95. kreuzberg-3.17.0.dist-info/RECORD +128 -0
  96. kreuzberg-3.15.0.dist-info/RECORD +0 -60
  97. {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/WHEEL +0 -0
  98. {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/entry_points.txt +0 -0
  99. {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_structured.py CHANGED
@@ -1,6 +1,5 @@
  from __future__ import annotations
 
- import json
  import sys
  from typing import TYPE_CHECKING, Any, ClassVar
 
@@ -17,11 +16,13 @@ try:
  except ImportError:  # pragma: no cover
      yaml = None
 
+
  from anyio import Path as AsyncPath
 
  from kreuzberg._extractors._base import Extractor
  from kreuzberg._mime_types import JSON_MIME_TYPE, PLAIN_TEXT_MIME_TYPE, TOML_MIME_TYPE, YAML_MIME_TYPE
- from kreuzberg._types import ExtractionResult, normalize_metadata
+ from kreuzberg._types import ExtractionResult, JSONExtractionConfig, normalize_metadata
+ from kreuzberg._utils._serialization import deserialize
  from kreuzberg._utils._string import normalize_spaces, safe_decode
  from kreuzberg._utils._sync import run_sync
 
@@ -43,6 +44,42 @@ class StructuredDataExtractor(Extractor):
          "text/toml",
      }
 
+     @property
+     def _json_config(self) -> JSONExtractionConfig | None:
+         return self.config.json_config
+
+     def _get_text_field_keywords(self) -> frozenset[str]:
+         json_config = self._json_config
+         if json_config and json_config.custom_text_field_patterns:
+             return _TEXT_FIELD_KEYWORDS | json_config.custom_text_field_patterns
+         return _TEXT_FIELD_KEYWORDS
+
+     def _extract_json_schema(self, data: Any, path: str = "", depth: int = 0) -> dict[str, Any]:
+         json_config = self._json_config
+         if not json_config or not json_config.extract_schema:
+             return {}
+
+         if depth >= json_config.max_depth:
+             return {"max_depth_reached": True}
+
+         schema_info: dict[str, Any] = {"type": type(data).__name__}
+
+         if isinstance(data, dict):
+             schema_info["properties"] = {}
+             for key, value in data.items():
+                 key_path = f"{path}.{key}" if path else key
+                 schema_info["properties"][key] = self._extract_json_schema(value, key_path, depth + 1)
+         elif isinstance(data, list) and data:
+             if len(data) <= json_config.array_item_limit:
+                 schema_info["items"] = self._extract_json_schema(data[0], f"{path}[0]", depth + 1)
+                 schema_info["length"] = len(data)
+             else:
+                 schema_info["items"] = {"type": "truncated"}
+                 schema_info["length"] = len(data)
+                 schema_info["truncated"] = True
+
+         return schema_info
+
      async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
          return await run_sync(self.extract_bytes_sync, content)
 
@@ -51,12 +88,12 @@ class StructuredDataExtractor(Extractor):
          return await self.extract_bytes_async(content)
 
      def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-         text_content = safe_decode(content)
-
+         text_content: None | str = None
          try:
              if self.mime_type in {JSON_MIME_TYPE, "text/json"}:
-                 data = json.loads(text_content)
+                 data = deserialize(content, dict, json=True)
              elif self.mime_type in {TOML_MIME_TYPE, "text/toml"}:
+                 text_content = safe_decode(content)
                  if tomllib is None:
                      return ExtractionResult(
                          content=normalize_spaces(text_content),
@@ -66,6 +103,7 @@ class StructuredDataExtractor(Extractor):
                      )
                  data = tomllib.loads(text_content)
              else:
+                 text_content = safe_decode(content)
                  if yaml is None:
                      return ExtractionResult(
                          content=normalize_spaces(text_content),
@@ -75,9 +113,17 @@ class StructuredDataExtractor(Extractor):
                      )
                  data = yaml.safe_load(text_content)
 
-             text_parts: list[str] = []
              metadata: dict[str, Any] = {}
 
+             if (
+                 self.mime_type in {JSON_MIME_TYPE, "text/json"}
+                 and self._json_config
+                 and self._json_config.extract_schema
+             ):
+                 schema_info = self._extract_json_schema(data)
+                 if schema_info:
+                     metadata["json_schema"] = schema_info
+
              if isinstance(data, dict):
                  text_parts = self._extract_from_dict(data, metadata)
              elif isinstance(data, list):
@@ -85,7 +131,7 @@ class StructuredDataExtractor(Extractor):
              else:
                  text_parts = [str(data)]
 
-             combined_text = "\n".join(text_parts) if text_parts else text_content
+             combined_text = "\n".join(text_parts) if text_parts else (text_content or safe_decode(content))
 
              return ExtractionResult(
                  content=normalize_spaces(combined_text),
@@ -96,7 +142,7 @@ class StructuredDataExtractor(Extractor):
 
          except (ValueError, TypeError) as e:
              return ExtractionResult(
-                 content=normalize_spaces(text_content),
+                 content=normalize_spaces(text_content or safe_decode(content)),
                  mime_type=PLAIN_TEXT_MIME_TYPE,
                  metadata={"parse_error": str(e)},
                  chunks=[],
@@ -113,23 +159,38 @@ class StructuredDataExtractor(Extractor):
              full_key = f"{prefix}.{key}" if prefix else key
 
              if isinstance(value, str) and value.strip():
-                 text_parts.append(f"{full_key}: {value}")
+                 if self._json_config and self._json_config.include_type_info:
+                     text_parts.append(f"{full_key} (string): {value}")
+                 else:
+                     text_parts.append(f"{full_key}: {value}")
 
                  key_lower = key.lower()
-                 if any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS):
+                 text_field_keywords = self._get_text_field_keywords()
+                 if any(keyword in key_lower for keyword in text_field_keywords):
                      metadata[full_key] = value
 
              elif isinstance(value, (int, float, bool)):
-                 text_parts.append(f"{full_key}: {value}")
+                 if self._json_config and self._json_config.include_type_info:
+                     type_name = type(value).__name__
+                     text_parts.append(f"{full_key} ({type_name}): {value}")
+                 else:
+                     text_parts.append(f"{full_key}: {value}")
 
              elif isinstance(value, dict):
-                 text_parts.extend(self._extract_from_dict(value, metadata, full_key))
+                 if self._json_config and not self._json_config.flatten_nested_objects:
+                     text_parts.append(f"{full_key}: [nested object with {len(value)} properties]")
+                 else:
+                     text_parts.extend(self._extract_from_dict(value, metadata, full_key))
 
              elif isinstance(value, list):
                  text_parts.extend(self._extract_from_list(value, metadata, full_key))
 
              elif value is not None:
-                 text_parts.append(f"{full_key}: {value!s}")
+                 if self._json_config and self._json_config.include_type_info:
+                     type_name = type(value).__name__
+                     text_parts.append(f"{full_key} ({type_name}): {value!s}")
+                 else:
+                     text_parts.append(f"{full_key}: {value!s}")
 
          return text_parts
 
@@ -140,7 +201,10 @@ class StructuredDataExtractor(Extractor):
              item_key = f"{prefix}[{i}]" if prefix else f"item_{i}"
 
              if isinstance(item, str) and item.strip():
-                 text_parts.append(f"{item_key}: {item}")
+                 if self._json_config and self._json_config.include_type_info:
+                     text_parts.append(f"{item_key} (string): {item}")
+                 else:
+                     text_parts.append(f"{item_key}: {item}")
 
              elif isinstance(item, dict):
                  text_parts.extend(self._extract_from_dict(item, metadata, item_key))
@@ -149,6 +213,10 @@ class StructuredDataExtractor(Extractor):
                  text_parts.extend(self._extract_from_list(item, metadata, item_key))
 
              elif item is not None:
-                 text_parts.append(f"{item_key}: {item!s}")
+                 if self._json_config and self._json_config.include_type_info:
+                     type_name = type(item).__name__
+                     text_parts.append(f"{item_key} ({type_name}): {item!s}")
+                 else:
+                     text_parts.append(f"{item_key}: {item!s}")
 
          return text_parts
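Note: the new `JSONExtractionConfig` options above change both the flattened text and the result metadata. A minimal sketch of how they might be wired up — assuming the fields this diff reads (`extract_schema`, `include_type_info`, `flatten_nested_objects`) are constructor keywords, and that `ExtractionConfig` exposes the `json_config` attribute read via `self.config.json_config`; the public import path may differ in the released API:

```python
from kreuzberg import ExtractionConfig, extract_bytes_sync
from kreuzberg._types import JSONExtractionConfig  # defined here per this diff

config = ExtractionConfig(
    json_config=JSONExtractionConfig(
        extract_schema=True,           # populates metadata["json_schema"]
        include_type_info=True,        # "pages (int): 12" instead of "pages: 12"
        flatten_nested_objects=False,  # summarize nested dicts instead of recursing
    )
)

result = extract_bytes_sync(b'{"title": "Report", "meta": {"pages": 12}}', "application/json", config)
print(result.content)                      # e.g. 'title (string): Report' ...
print(result.metadata.get("json_schema"))  # {'type': 'dict', 'properties': {...}}
```

With `flatten_nested_objects=False`, the `meta` key above would surface as `meta: [nested object with 1 properties]` per the branch added in `_extract_from_dict`.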
kreuzberg/_gmft.py CHANGED
@@ -99,7 +99,7 @@ async def extract_tables(
              "size": stat.st_size,
              "mtime": stat.st_mtime,
          }
-     except OSError:
+     except OSError:  # pragma: no cover
          file_info = {
              "path": str(path),
              "size": 0,
@@ -215,7 +215,7 @@ def extract_tables_sync(
              "size": stat.st_size,
              "mtime": stat.st_mtime,
          }
-     except OSError:
+     except OSError:  # pragma: no cover
          file_info = {
              "path": str(path),
              "size": 0,
@@ -312,6 +312,11 @@ def _extract_tables_in_process(
      from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
      from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415
 
+     if "cell_required_confidence" in config_dict:
+         cell_config = config_dict["cell_required_confidence"]
+         if isinstance(cell_config, dict) and cell_config:
+             config_dict["cell_required_confidence"] = {int(k): v for k, v in cell_config.items()}
+
      config = GMFTConfig(**config_dict)
 
      formatter = AutoTableFormatter(  # type: ignore[no-untyped-call]
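Note: the `cell_required_confidence` coercion above guards against a serialization round-trip. The config reaches `_extract_tables_in_process` as a plain dict, and JSON-style encoders stringify integer mapping keys, so `GMFTConfig` would otherwise receive `{"0": ...}` where it expects `{0: ...}`. A self-contained illustration with hypothetical values:

```python
import json

# GMFT's cell_required_confidence maps integer class ids to thresholds.
original = {0: 0.3, 1: 0.3, 2: 0.3}

# A JSON round-trip (as happens when the config is shipped to the worker
# process) turns the int keys into strings.
wire = json.loads(json.dumps(original))
assert list(wire) == ["0", "1", "2"]

# The hunk above restores them before GMFTConfig(**config_dict) is built.
restored = {int(k): v for k, v in wire.items()}
assert restored == original
```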
kreuzberg/_mcp/server.py CHANGED
@@ -22,7 +22,6 @@ from kreuzberg.extraction import (
 
  mcp = FastMCP("Kreuzberg Text Extraction")
 
- # Security and performance limits
  MAX_BATCH_SIZE = 100
 
 
@@ -40,13 +39,12 @@ def _validate_file_path(file_path: str) -> Path:
      """
      try:
          path = Path(file_path).resolve()
-     except (OSError, ValueError) as e:
+     except (OSError, ValueError) as e:  # pragma: no cover
          raise ValidationError(
              f"Invalid file path: {file_path}",
              context={"file_path": file_path, "error": str(e)},
          ) from e
 
-     # Check for path traversal attempts
      if ".." in file_path and not file_path.startswith("/"):
          raise ValidationError(
              "Path traversal detected in file path",
@@ -73,7 +71,6 @@ def _validate_file_path_with_context(file_path: str, index: int, total: int) ->
      try:
          return _validate_file_path(file_path)
      except ValidationError as e:
-         # Add context about which file in the batch failed
          e.context = e.context or {}
          e.context["batch_index"] = index
          e.context["total_files"] = total
@@ -99,7 +96,6 @@ def _validate_base64_content(content_base64: str, context_info: str | None = Non
              context={"context": context_info},
          )
 
-     # Check for whitespace-only content
      if not content_base64.strip():
          raise ValidationError(
              "Base64 content cannot be whitespace only",
@@ -126,7 +122,6 @@ def _validate_base64_content(content_base64: str, context_info: str | None = Non
  def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
      base_config = discover_config()
 
-     # Extract Tesseract-specific parameters from kwargs first
      tesseract_lang = kwargs.pop("tesseract_lang", None)
      tesseract_psm = kwargs.pop("tesseract_psm", None)
      tesseract_output_format = kwargs.pop("tesseract_output_format", None)
@@ -151,7 +146,6 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
      }
      config_dict = config_dict | kwargs
 
-     # Handle Tesseract OCR configuration
      ocr_backend = config_dict.get("ocr_backend")
      if ocr_backend == "tesseract" and (
          tesseract_lang or tesseract_psm is not None or tesseract_output_format or enable_table_detection
@@ -174,10 +168,8 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
              tesseract_config_dict["enable_table_detection"] = True
 
          if tesseract_config_dict:
-             # Merge with existing tesseract config if present
              existing_ocr_config = config_dict.get("ocr_config")
              if existing_ocr_config and isinstance(existing_ocr_config, TesseractConfig):
-                 # Convert existing config to dict, merge, and recreate
                  existing_dict = existing_ocr_config.to_dict()
                  merged_dict = existing_dict | tesseract_config_dict
                  config_dict["ocr_config"] = TesseractConfig(**merged_dict)
@@ -206,7 +198,6 @@ def extract_document(  # noqa: PLR0913
      tesseract_output_format: str | None = None,
      enable_table_detection: bool | None = None,
  ) -> dict[str, Any]:
-     # Validate file path for security
      validated_path = _validate_file_path(file_path)
      config = _create_config_with_overrides(
          force_ocr=force_ocr,
@@ -289,7 +280,6 @@ def batch_extract_document(  # noqa: PLR0913
      tesseract_output_format: str | None = None,
      enable_table_detection: bool | None = None,
  ) -> list[dict[str, Any]]:
-     # Validate batch size
      if len(file_paths) > MAX_BATCH_SIZE:
          raise ValidationError(
              f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
@@ -302,7 +292,6 @@ def batch_extract_document(  # noqa: PLR0913
              context={"file_paths": file_paths},
          )
 
-     # Validate all file paths for security
      validated_paths = []
      for i, file_path in enumerate(file_paths):
          validated_path = _validate_file_path_with_context(file_path, i, len(file_paths))
@@ -346,7 +335,6 @@ def batch_extract_bytes(  # noqa: PLR0913
      tesseract_output_format: str | None = None,
      enable_table_detection: bool | None = None,
  ) -> list[dict[str, Any]]:
-     # Validate input
      if not content_items:
          raise ValidationError("content_items cannot be empty", context={"content_items": content_items})
 
@@ -355,7 +343,6 @@ def batch_extract_bytes(  # noqa: PLR0913
          "content_items must be a list", context={"content_items_type": type(content_items).__name__}
      )
 
-     # Validate batch size
      if len(content_items) > MAX_BATCH_SIZE:
          raise ValidationError(
              f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
@@ -379,17 +366,14 @@ def batch_extract_bytes(  # noqa: PLR0913
          enable_table_detection=enable_table_detection,
      )
 
-     # Convert list of dicts to list of tuples (bytes, mime_type)
      contents = []
      for i, item in enumerate(content_items):
-         # Validate item structure
          if not isinstance(item, dict):
              raise ValidationError(
                  f"Item at index {i} must be a dictionary",
                  context={"item_index": i, "item_type": type(item).__name__, "item": item},
              )
 
-         # Check for required keys
          if "content_base64" not in item:
              raise ValidationError(
                  f"Item at index {i} is missing required key 'content_base64'",
@@ -405,11 +389,9 @@ def batch_extract_bytes(  # noqa: PLR0913
          content_base64 = item["content_base64"]
          mime_type = item["mime_type"]
 
-         # Validate base64 content
          try:
              content_bytes = _validate_base64_content(content_base64, f"batch_extract_bytes item {i}")
          except ValidationError as e:
-             # Add batch-specific context
              e.context = e.context or {}
              e.context["item_index"] = i
              e.context["total_items"] = len(content_items)
@@ -426,7 +408,6 @@ def extract_simple(
      file_path: str,
      mime_type: str | None = None,
  ) -> str:
-     # Validate file path for security
      validated_path = _validate_file_path(file_path)
      config = _create_config_with_overrides()
      result = extract_file_sync(str(validated_path), mime_type, config)
@@ -467,7 +448,6 @@ def get_supported_formats() -> str:
 
  @mcp.prompt()
  def extract_and_summarize(file_path: str) -> list[TextContent]:
-     # Validate file path for security
      validated_path = _validate_file_path(file_path)
      result = extract_file_sync(str(validated_path), None, _create_config_with_overrides())
 
@@ -481,7 +461,6 @@ def extract_and_summarize(file_path: str) -> list[TextContent]:
 
  @mcp.prompt()
  def extract_structured(file_path: str) -> list[TextContent]:
-     # Validate file path for security
      validated_path = _validate_file_path(file_path)
      config = _create_config_with_overrides(
          extract_entities=True,
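Behavior note on the merges in `_create_config_with_overrides`: both `config_dict | kwargs` and `existing_dict | tesseract_config_dict` use dict union, where the right-hand operand wins on key conflicts, so per-call overrides beat the discovered config. A small sketch with illustrative field names (not the exact `TesseractConfig` schema):

```python
# Right-hand side wins on conflicts, so per-call overrides take precedence.
existing_dict = {"language": "eng", "psm": 3}  # from existing_ocr_config.to_dict()
tesseract_config_dict = {"psm": 6, "enable_table_detection": True}  # per-call overrides

merged_dict = existing_dict | tesseract_config_dict
assert merged_dict == {"language": "eng", "psm": 6, "enable_table_detection": True}
# config_dict["ocr_config"] = TesseractConfig(**merged_dict)  # as in the hunk above
```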
kreuzberg/_mime_types.py CHANGED
@@ -229,7 +229,7 @@ def validate_mime_type(
              "mtime": stat.st_mtime if stat else 0,
              "check_file_exists": check_file_exists,
          }
-     except OSError:
+     except OSError:  # pragma: no cover
          file_info = {
              "path": str(path),
              "size": 0,
kreuzberg/_ocr/_easyocr.py CHANGED
@@ -33,22 +33,37 @@ except ImportError:  # pragma: no cover
 
  if TYPE_CHECKING:
      import easyocr
-     import numpy as np
      import torch
+ else:
+     easyocr: Any = None
+     torch: Any = None
+
+ HAS_EASYOCR: bool = False
+
+
+ def _import_easyocr() -> tuple[Any, Any]:
+     global HAS_EASYOCR, easyocr, torch
+
+     if easyocr is not None:
+         return easyocr, torch
+
+     if not HAS_EASYOCR and easyocr is None:
+         return None, None
 
- HAS_EASYOCR: bool
- if not TYPE_CHECKING:
      try:
-         import easyocr
-         import numpy as np
-         import torch
+         import easyocr as _easyocr  # noqa: PLC0415
 
+         try:
+             import torch as _torch  # noqa: PLC0415
+         except ImportError:  # pragma: no cover
+             _torch = None  # type: ignore[assignment]
+
+         easyocr = _easyocr
+         torch = _torch
          HAS_EASYOCR = True
-     except ImportError:
-         HAS_EASYOCR = False
-         easyocr: Any = None
-         np: Any = None
-         torch: Any = None
+         return easyocr, torch
+     except ImportError:  # pragma: no cover
+         return None, None
 
 
  EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
@@ -142,6 +157,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
      _reader: ClassVar[Any] = None
 
      async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+         try:
+             import numpy as np  # noqa: PLC0415
+         except ImportError as e:  # pragma: no cover
+             raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
+
          use_cache = kwargs.pop("use_cache", True)
 
          cache_kwargs = None
@@ -292,7 +312,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
 
      @classmethod
      def _is_gpu_available(cls) -> bool:
-         if not HAS_EASYOCR or torch is None:
+         if torch is None:
              return False
          return bool(torch.cuda.is_available())
 
@@ -301,13 +321,14 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
          if cls._reader is not None:
              return
 
-         if not HAS_EASYOCR or easyocr is None:
+         languages = cls._validate_language_code(kwargs.pop("language", "en"))
+
+         easyocr_module, _ = _import_easyocr()
+         if easyocr_module is None:
              raise MissingDependencyError.create_for_package(
                  dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
              )
 
-         languages = cls._validate_language_code(kwargs.pop("language", "en"))
-
          device_info = cls._resolve_device_config(**kwargs)
          use_gpu = device_info.device_type in ("cuda", "mps")
 
@@ -318,7 +339,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
 
          try:
              cls._reader = await run_sync(
-                 easyocr.Reader,
+                 easyocr_module.Reader,
                  languages,
                  gpu=use_gpu,
                  verbose=False,
@@ -382,6 +403,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
          return languages
 
      def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+         try:
+             import numpy as np  # noqa: PLC0415
+         except ImportError as e:  # pragma: no cover
+             raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
+
          use_cache = kwargs.pop("use_cache", True)
 
          cache_kwargs = None
@@ -453,13 +479,14 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
          if cls._reader is not None:
              return
 
-         if not HAS_EASYOCR or easyocr is None:
+         languages = cls._validate_language_code(kwargs.pop("language", "en"))
+
+         easyocr_module, _ = _import_easyocr()
+         if easyocr_module is None:
              raise MissingDependencyError.create_for_package(
                  dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
              )
 
-         languages = cls._validate_language_code(kwargs.pop("language", "en"))
-
          device_info = cls._resolve_device_config(**kwargs)
          use_gpu = device_info.device_type in ("cuda", "mps")
 
@@ -469,7 +496,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
          kwargs.setdefault("recog_network", "standard")
 
          try:
-             cls._reader = easyocr.Reader(
+             cls._reader = easyocr_module.Reader(
                  languages,
                  gpu=use_gpu,
                  verbose=False,
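Note: these EasyOCR changes replace the import-time `try: import easyocr` with a cached lazy importer, so `import kreuzberg` no longer pays the easyocr/torch startup cost, and a missing dependency surfaces as `MissingDependencyError` only when a reader is first initialized. A minimal standalone version of the same cache-then-remember-failure pattern (generic names, not the module's exact code):

```python
from __future__ import annotations

import importlib
from typing import Any

_module: Any = None
_import_failed = False


def lazy_import(name: str = "easyocr") -> Any | None:
    """Import on first use, cache the module, and remember failures."""
    global _module, _import_failed
    if _module is not None:   # cached success: skip the import machinery
        return _module
    if _import_failed:        # cached failure: don't retry on every call
        return None
    try:
        _module = importlib.import_module(name)
        return _module
    except ImportError:
        _import_failed = True
        return None
```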
kreuzberg/_ocr/_paddleocr.py CHANGED
@@ -60,7 +60,7 @@ def _import_paddleocr() -> tuple[Any, Any]:
          PaddleOCR = _PaddleOCR
          HAS_PADDLEOCR = True
          return np, PaddleOCR
-     except ImportError:
+     except ImportError:  # pragma: no cover
          return None, None
 