kreuzberg-3.15.0-py3-none-any.whl → kreuzberg-3.17.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. kreuzberg/__init__.py +6 -0
  2. kreuzberg/_api/main.py +0 -53
  3. kreuzberg/_config.py +17 -8
  4. kreuzberg/_document_classification.py +1 -1
  5. kreuzberg/_extractors/_base.py +0 -46
  6. kreuzberg/_extractors/_email.py +16 -10
  7. kreuzberg/_extractors/_html.py +39 -12
  8. kreuzberg/_extractors/_pandoc.py +2 -2
  9. kreuzberg/_extractors/_pdf.py +6 -7
  10. kreuzberg/_extractors/_presentation.py +4 -0
  11. kreuzberg/_extractors/_spread_sheet.py +0 -1
  12. kreuzberg/_extractors/_structured.py +83 -15
  13. kreuzberg/_gmft.py +7 -2
  14. kreuzberg/_mcp/server.py +1 -22
  15. kreuzberg/_mime_types.py +1 -1
  16. kreuzberg/_ocr/_easyocr.py +47 -20
  17. kreuzberg/_ocr/_paddleocr.py +1 -1
  18. kreuzberg/_ocr/_tesseract.py +27 -26
  19. kreuzberg/_token_reduction/__init__.py +11 -0
  20. kreuzberg/_token_reduction/_reducer.py +439 -0
  21. kreuzberg/_token_reduction/_stopwords.py +116 -0
  22. kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
  23. kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
  24. kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
  25. kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
  26. kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
  27. kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
  28. kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
  29. kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
  30. kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
  31. kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
  32. kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
  33. kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
  34. kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
  35. kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
  36. kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
  37. kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
  38. kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
  39. kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
  40. kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
  41. kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
  42. kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
  43. kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
  44. kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
  45. kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
  46. kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
  47. kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
  48. kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
  49. kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
  50. kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
  51. kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
  52. kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
  53. kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
  54. kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
  55. kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
  56. kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
  57. kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
  58. kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
  59. kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
  60. kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
  61. kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
  62. kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
  63. kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
  64. kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
  65. kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
  66. kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
  67. kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
  68. kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
  69. kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
  70. kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
  71. kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
  72. kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
  73. kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
  74. kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
  75. kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
  76. kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
  77. kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
  78. kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
  79. kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
  80. kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
  81. kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
  82. kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
  83. kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
  84. kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
  85. kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
  86. kreuzberg/_types.py +146 -43
  87. kreuzberg/_utils/_html_streaming.py +20 -0
  88. kreuzberg/_utils/_image_preprocessing.py +1 -1
  89. kreuzberg/_utils/_ref.py +14 -6
  90. kreuzberg/_utils/_serialization.py +13 -6
  91. kreuzberg/_utils/_sync.py +15 -16
  92. kreuzberg/exceptions.py +0 -1
  93. kreuzberg/extraction.py +27 -11
  94. {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/METADATA +15 -13
  95. kreuzberg-3.17.0.dist-info/RECORD +128 -0
  96. kreuzberg-3.15.0.dist-info/RECORD +0 -60
  97. {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/WHEEL +0 -0
  98. {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/entry_points.txt +0 -0
  99. {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/__init__.py CHANGED
@@ -8,8 +8,10 @@ from ._types import (
8
8
  ExtractionConfig,
9
9
  ExtractionResult,
10
10
  GMFTConfig,
11
+ HTMLToMarkdownConfig,
11
12
  ImageOCRConfig,
12
13
  ImageOCRResult,
14
+ JSONExtractionConfig,
13
15
  LanguageDetectionConfig,
14
16
  Metadata,
15
17
  PaddleOCRConfig,
@@ -17,6 +19,7 @@ from ._types import (
17
19
  SpacyEntityExtractionConfig,
18
20
  TableData,
19
21
  TesseractConfig,
22
+ TokenReductionConfig,
20
23
  )
21
24
  from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
22
25
  from .extraction import (
@@ -40,8 +43,10 @@ __all__ = [
40
43
  "ExtractionResult",
41
44
  "ExtractorRegistry",
42
45
  "GMFTConfig",
46
+ "HTMLToMarkdownConfig",
43
47
  "ImageOCRConfig",
44
48
  "ImageOCRResult",
49
+ "JSONExtractionConfig",
45
50
  "KreuzbergError",
46
51
  "LanguageDetectionConfig",
47
52
  "Metadata",
@@ -53,6 +58,7 @@ __all__ = [
53
58
  "SpacyEntityExtractionConfig",
54
59
  "TableData",
55
60
  "TesseractConfig",
61
+ "TokenReductionConfig",
56
62
  "ValidationError",
57
63
  "__version__",
58
64
  "batch_extract_bytes",
kreuzberg/_api/main.py CHANGED
@@ -13,10 +13,8 @@ from typing_extensions import TypedDict
13
13
 
14
14
  from kreuzberg import (
15
15
  EasyOCRConfig,
16
- ExtractedImage,
17
16
  ExtractionConfig,
18
17
  ExtractionResult,
19
- ImageOCRResult,
20
18
  KreuzbergError,
21
19
  MissingDependencyError,
22
20
  PaddleOCRConfig,
@@ -40,30 +38,6 @@ if TYPE_CHECKING:
40
38
  from litestar.datastructures import UploadFile
41
39
 
42
40
 
43
- class ExtractedImageDict(TypedDict):
44
- """TypedDict for extracted image JSON representation."""
45
-
46
- data: str
47
- format: str
48
- filename: str | None
49
- page_number: int | None
50
- dimensions: tuple[int, int] | None
51
- colorspace: str | None
52
- bits_per_component: int | None
53
- is_mask: bool
54
- description: str | None
55
-
56
-
57
- class ImageOCRResultDict(TypedDict):
58
- """TypedDict for image OCR result JSON representation."""
59
-
60
- image: ExtractedImageDict
61
- ocr_result: Any
62
- confidence_score: float | None
63
- processing_time: float | None
64
- skipped_reason: str | None
65
-
66
-
67
41
  class HealthResponse(TypedDict):
68
42
  """Response model for health check endpoint."""
69
43
 
@@ -384,31 +358,6 @@ def _pil_image_encoder(obj: Any) -> str:
384
358
  return f"data:image/png;base64,{img_str}"
385
359
 
386
360
 
387
- def _extracted_image_encoder(obj: ExtractedImage) -> ExtractedImageDict:
388
- encoded_data = base64.b64encode(obj.data).decode()
389
- return ExtractedImageDict(
390
- data=f"data:image/{obj.format};base64,{encoded_data}",
391
- format=obj.format,
392
- filename=obj.filename,
393
- page_number=obj.page_number,
394
- dimensions=obj.dimensions,
395
- colorspace=obj.colorspace,
396
- bits_per_component=obj.bits_per_component,
397
- is_mask=obj.is_mask,
398
- description=obj.description,
399
- )
400
-
401
-
402
- def _image_ocr_result_encoder(obj: ImageOCRResult) -> ImageOCRResultDict:
403
- return ImageOCRResultDict(
404
- image=_extracted_image_encoder(obj.image),
405
- ocr_result=obj.ocr_result,
406
- confidence_score=obj.confidence_score,
407
- processing_time=obj.processing_time,
408
- skipped_reason=obj.skipped_reason,
409
- )
410
-
411
-
412
361
  openapi_config = OpenAPIConfig(
413
362
  title="Kreuzberg API",
414
363
  version="3.14.0",
@@ -428,8 +377,6 @@ openapi_config = OpenAPIConfig(
428
377
  type_encoders = {
429
378
  pl.DataFrame: _polars_dataframe_encoder,
430
379
  Image.Image: _pil_image_encoder,
431
- ExtractedImage: _extracted_image_encoder,
432
- ImageOCRResult: _image_ocr_result_encoder,
433
380
  }
434
381
 
435
382
  app = Litestar(
kreuzberg/_config.py CHANGED
@@ -69,12 +69,21 @@ def _build_ocr_config_from_cli(
69
69
  try:
70
70
  match ocr_backend:
71
71
  case "tesseract":
72
- return TesseractConfig(**backend_args)
72
+ processed_args = backend_args.copy()
73
+ if "psm" in processed_args and isinstance(processed_args["psm"], int):
74
+ try:
75
+ processed_args["psm"] = PSMMode(processed_args["psm"])
76
+ except ValueError as e: # pragma: no cover
77
+ raise ValidationError(
78
+ f"Invalid PSM mode value: {processed_args['psm']}",
79
+ context={"psm_value": processed_args["psm"], "error": str(e)},
80
+ ) from e
81
+ return TesseractConfig(**processed_args)
73
82
  case "easyocr":
74
83
  return EasyOCRConfig(**backend_args)
75
84
  case "paddleocr":
76
85
  return PaddleOCRConfig(**backend_args)
77
- case _:
86
+ case _: # pragma: no cover
78
87
  return None
79
88
  except (TypeError, ValueError) as e:
80
89
  raise ValidationError(
@@ -112,7 +121,7 @@ def _configure_gmft(
112
121
  try:
113
122
  if cli_args.get("gmft_config"):
114
123
  gmft_config = GMFTConfig(**cli_args["gmft_config"])
115
- elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
124
+ elif "gmft" in file_config and isinstance(file_config["gmft"], dict): # pragma: no cover
116
125
  gmft_config = GMFTConfig(**file_config["gmft"])
117
126
  except (TypeError, ValueError) as e:
118
127
  raise ValidationError(
@@ -120,7 +129,7 @@ def _configure_gmft(
120
129
  context={"gmft_config": cli_args.get("gmft_config") or file_config.get("gmft"), "error": str(e)},
121
130
  ) from e
122
131
 
123
- if gmft_config:
132
+ if gmft_config: # pragma: no cover
124
133
  config_dict["gmft_config"] = gmft_config
125
134
 
126
135
 
@@ -151,7 +160,7 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
151
160
  try:
152
161
  with config_path.open("rb") as f:
153
162
  data = tomllib.load(f)
154
- except FileNotFoundError as e:
163
+ except FileNotFoundError as e: # pragma: no cover
155
164
  raise ValidationError(f"Configuration file not found: {config_path}") from e
156
165
  except tomllib.TOMLDecodeError as e:
157
166
  raise ValidationError(f"Invalid TOML in configuration file: {e}") from e
@@ -237,7 +246,7 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
237
246
 
238
247
  try:
239
248
  return ExtractionConfig(**extraction_config)
240
- except (TypeError, ValueError) as e:
249
+ except (TypeError, ValueError) as e: # pragma: no cover
241
250
  raise ValidationError(
242
251
  f"Invalid extraction configuration: {e}",
243
252
  context={"config": extraction_config, "error": str(e)},
@@ -261,7 +270,7 @@ def build_extraction_config(
261
270
 
262
271
  try:
263
272
  return ExtractionConfig(**config_dict)
264
- except (TypeError, ValueError) as e:
273
+ except (TypeError, ValueError) as e: # pragma: no cover
265
274
  raise ValidationError(
266
275
  f"Invalid extraction configuration: {e}",
267
276
  context={"config": config_dict, "error": str(e)},
@@ -283,7 +292,7 @@ def find_config_file(start_path: Path | None = None) -> Path | None:
283
292
  data = tomllib.load(f)
284
293
  if "tool" in data and "kreuzberg" in data["tool"]:
285
294
  return pyproject_toml
286
- except OSError as e:
295
+ except OSError as e: # pragma: no cover
287
296
  raise ValidationError(
288
297
  f"Failed to read pyproject.toml: {e}",
289
298
  context={"file": str(pyproject_toml), "error": str(e)},
kreuzberg/_document_classification.py CHANGED
@@ -132,7 +132,7 @@ def classify_document_from_layout(
132
132
  if not found_words.is_empty():
133
133
  scores[doc_type] += 1.0
134
134
  word_top = found_words[0, "top"]
135
- if word_top < page_height * 0.3:
135
+ if word_top is not None and word_top < page_height * 0.3:
136
136
  scores[doc_type] += 0.5
137
137
 
138
138
  total_score = sum(scores.values())
kreuzberg/_extractors/_base.py CHANGED
@@ -96,7 +96,6 @@ class Extractor(ABC):
96
96
  )
97
97
 
98
98
  def _check_image_memory_limits(self, images: list[ExtractedImage]) -> list[ExtractedImage]:
99
- """Filter images based on memory safety limits."""
100
99
  if not images:
101
100
  return []
102
101
 
@@ -142,17 +141,6 @@ class Extractor(ABC):
142
141
  _HASH_SAMPLE_SIZE = 512
143
142
 
144
143
  def _compute_image_hash(self, img: ExtractedImage) -> int:
145
- """Compute hash for image deduplication using progressive hashing.
146
-
147
- For small images (<1KB), hash the entire content.
148
- For larger images, use size + first/last bytes for quick comparison.
149
-
150
- Args:
151
- img: Image to hash
152
-
153
- Returns:
154
- Hash value for deduplication
155
- """
156
144
  data_len = len(img.data)
157
145
 
158
146
  if data_len < self._SMALL_IMAGE_THRESHOLD:
@@ -189,14 +177,6 @@ class Extractor(ABC):
189
177
  return unique_images
190
178
 
191
179
  def _prepare_ocr_config(self, backend_name: str) -> dict[str, Any]:
192
- """Prepare OCR configuration for the specified backend.
193
-
194
- Args:
195
- backend_name: Name of the OCR backend
196
-
197
- Returns:
198
- Configuration dictionary for the backend
199
- """
200
180
  default_config: TesseractConfig | EasyOCRConfig | PaddleOCRConfig
201
181
  config_class: type[TesseractConfig | EasyOCRConfig | PaddleOCRConfig]
202
182
 
@@ -222,14 +202,6 @@ class Extractor(ABC):
222
202
  return cfg
223
203
 
224
204
  def _validate_image_for_ocr(self, img: ExtractedImage) -> str | None:
225
- """Validate if an image is suitable for OCR processing.
226
-
227
- Args:
228
- img: Image to validate
229
-
230
- Returns:
231
- Reason for skipping if invalid, None if valid
232
- """
233
205
  fmt = img.format.lower()
234
206
  if fmt not in self.config.image_ocr_formats:
235
207
  return f"Unsupported format: {img.format}"
@@ -247,16 +219,6 @@ class Extractor(ABC):
247
219
  return None
248
220
 
249
221
  async def _ocr_single_image(self, target: ExtractedImage, backend: Any, cfg: dict[str, Any]) -> ImageOCRResult:
250
- """Process a single image with OCR.
251
-
252
- Args:
253
- target: Image to process
254
- backend: OCR backend instance
255
- cfg: Configuration for the backend
256
-
257
- Returns:
258
- OCR result for the image
259
- """
260
222
  try:
261
223
  start = time.time()
262
224
  pil_img = Image.open(io.BytesIO(target.data))
@@ -284,14 +246,6 @@ class Extractor(ABC):
284
246
  async def _process_images_with_ocr(
285
247
  self, images: tuple[ExtractedImage, ...] | list[ExtractedImage]
286
248
  ) -> list[ImageOCRResult]:
287
- """Process multiple images with OCR.
288
-
289
- Args:
290
- images: Tuple or list of images to process
291
-
292
- Returns:
293
- List of OCR results
294
- """
295
249
  if not images or not self.config.ocr_extracted_images:
296
250
  return []
297
251
 
kreuzberg/_extractors/_email.py CHANGED
@@ -27,6 +27,8 @@ except ImportError: # pragma: no cover
27
27
  html2text = None
28
28
 
29
29
  _HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
30
+ _UNICODE_QUOTES_PATTERN = re.compile(r"[\u201c\u201d]")
31
+ _UNICODE_SINGLE_QUOTES_PATTERN = re.compile(r"[\u2018\u2019]")
30
32
 
31
33
 
32
34
  class EmailExtractor(Extractor):
@@ -86,7 +88,14 @@ class EmailExtractor(Extractor):
86
88
  def _format_email_field(self, field: Any) -> str:
87
89
  match field:
88
90
  case list():
89
- return ", ".join(str(item.get("email", "")) if isinstance(item, dict) else str(item) for item in field)
91
+ emails = []
92
+ for item in field:
93
+ if isinstance(item, dict):
94
+ if email := item.get("email", ""):
95
+ emails.append(str(email))
96
+ else:
97
+ emails.append(str(item))
98
+ return ", ".join(emails)
90
99
  case dict():
91
100
  return str(field.get("email", ""))
92
101
  case _:
@@ -111,12 +120,8 @@ class EmailExtractor(Extractor):
111
120
  cleaned = re.sub(r"<style[^>]*>.*?</style>", "", cleaned, flags=re.IGNORECASE | re.DOTALL)
112
121
  clean_html = _HTML_TAG_PATTERN.sub("", cleaned)
113
122
  clean_html = unescape(clean_html)
114
- clean_html = (
115
- clean_html.replace("\u201c", '"')
116
- .replace("\u201d", '"')
117
- .replace("\u2019", "'")
118
- .replace("\u2018", "'")
119
- )
123
+ clean_html = _UNICODE_QUOTES_PATTERN.sub('"', clean_html)
124
+ clean_html = _UNICODE_SINGLE_QUOTES_PATTERN.sub("'", clean_html)
120
125
  text_parts.append(clean_html)
121
126
 
122
127
  def _extract_email_attachments(
@@ -129,12 +134,12 @@ class EmailExtractor(Extractor):
129
134
  for att in attachments:
130
135
  name_val: str = "unknown"
131
136
  if isinstance(att, dict):
132
- n = att.get("name")
137
+ n = att.get("name") or att.get("filename")
133
138
  if isinstance(n, str) and n:
134
139
  name_val = n
135
140
  names.append(name_val)
136
- metadata["attachments"] = names
137
141
  if names:
142
+ metadata["attachments"] = names
138
143
  text_parts.append("Attachments: " + ", ".join(names))
139
144
 
140
145
  def _extract_images_from_attachments(self, parsed_email: dict[str, Any]) -> list[ExtractedImage]:
@@ -151,7 +156,8 @@ class EmailExtractor(Extractor):
151
156
  if not isinstance(mime, str) or not mime.startswith("image/"):
152
157
  continue
153
158
 
154
- name = att.get("name") if isinstance(att.get("name"), str) else None
159
+ name = att.get("name") or att.get("filename")
160
+ name = name if isinstance(name, str) else None
155
161
  data = att.get("data") or att.get("content") or att.get("payload")
156
162
  raw: bytes | None = None
157
163
  if isinstance(data, (bytes, bytearray)):
kreuzberg/_extractors/_html.py CHANGED
@@ -1,16 +1,20 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import base64
4
+ import binascii
5
+ import io
4
6
  import logging
5
7
  from typing import TYPE_CHECKING, ClassVar
6
8
 
7
9
  import html_to_markdown
8
10
  from anyio import Path as AsyncPath
9
11
  from bs4 import BeautifulSoup
12
+ from PIL import Image
10
13
 
11
14
  from kreuzberg._extractors._base import MAX_SINGLE_IMAGE_SIZE, Extractor
12
15
  from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
13
16
  from kreuzberg._types import ExtractedImage, ExtractionResult, HTMLToMarkdownConfig
17
+ from kreuzberg._utils._html_streaming import should_use_streaming
14
18
  from kreuzberg._utils._string import safe_decode
15
19
  from kreuzberg._utils._sync import run_maybe_async, run_sync
16
20
 
@@ -44,6 +48,11 @@ class HTMLExtractor(Extractor):
44
48
  config_dict = config.to_dict()
45
49
 
46
50
  html_content = safe_decode(content)
51
+
52
+ use_streaming, chunk_size = should_use_streaming(len(content))
53
+ config_dict["stream_processing"] = use_streaming
54
+ config_dict["chunk_size"] = chunk_size
55
+
47
56
  result = html_to_markdown.convert_to_markdown(html_content, **config_dict)
48
57
 
49
58
  extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={})
@@ -89,6 +98,13 @@ class HTMLExtractor(Extractor):
89
98
  )
90
99
  continue
91
100
 
101
+ dimensions = None
102
+ try:
103
+ with Image.open(io.BytesIO(image_data)) as pil_img:
104
+ dimensions = pil_img.size
105
+ except (OSError, ValueError) as e: # pragma: no cover
106
+ logger.debug("Could not determine image dimensions for %s: %s", format_name, e)
107
+
92
108
  alt_val = img.get("alt") # type: ignore[union-attr]
93
109
  desc = alt_val if isinstance(alt_val, str) else None
94
110
  images.append(
@@ -97,25 +113,36 @@ class HTMLExtractor(Extractor):
97
113
  format=format_name,
98
114
  filename=f"embedded_image_{len(images) + 1}.{format_name}",
99
115
  description=desc,
116
+ dimensions=dimensions,
100
117
  )
101
118
  )
102
- except Exception as e: # noqa: BLE001
119
+ except (ValueError, binascii.Error) as e:
103
120
  logger.warning("Failed to extract base64 image: %s", e)
104
121
 
105
- for svg in soup.find_all("svg"):
122
+ def extract_svg_safe(svg_element: object) -> ExtractedImage | None:
106
123
  try:
107
- svg_content = str(svg).encode("utf-8")
108
- title_or_aria = svg.get("title") or svg.get("aria-label") # type: ignore[union-attr]
124
+ svg_content = str(svg_element).encode("utf-8")
125
+
126
+ def _get_attr_safe(obj: object, attr: str) -> str | None:
127
+ get_method = getattr(obj, "get", None)
128
+ if callable(get_method):
129
+ result = get_method(attr)
130
+ return result if isinstance(result, str) else None
131
+ return None
132
+
133
+ title_or_aria = _get_attr_safe(svg_element, "title") or _get_attr_safe(svg_element, "aria-label")
109
134
  desc_svg = title_or_aria if isinstance(title_or_aria, str) else None
110
- images.append(
111
- ExtractedImage(
112
- data=svg_content,
113
- format="svg",
114
- filename=f"inline_svg_{len(images) + 1}.svg",
115
- description=desc_svg,
116
- )
135
+ return ExtractedImage(
136
+ data=svg_content,
137
+ format="svg",
138
+ filename=f"inline_svg_{len(images) + 1}.svg",
139
+ description=desc_svg,
117
140
  )
118
- except Exception as e: # noqa: BLE001, PERF203
141
+ except (UnicodeEncodeError, AttributeError) as e:
119
142
  logger.warning("Failed to extract SVG: %s", e)
143
+ return None
144
+
145
+ svg_images = [extract_svg_safe(svg) for svg in soup.find_all("svg")]
146
+ images.extend(img for img in svg_images if img is not None)
120
147
 
121
148
  return images
kreuzberg/_extractors/_pandoc.py CHANGED
@@ -253,7 +253,7 @@ class PandocExtractor(Extractor):
253
253
  "Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
254
254
  )
255
255
 
256
- except FileNotFoundError as e:
256
+ except FileNotFoundError as e: # pragma: no cover
257
257
  raise MissingDependencyError(
258
258
  "Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
259
259
  ) from e
@@ -491,7 +491,7 @@ class PandocExtractor(Extractor):
491
491
  "Please install it on your system and make sure its available in $PATH."
492
492
  )
493
493
 
494
- except (subprocess.SubprocessError, FileNotFoundError) as e:
494
+ except (subprocess.SubprocessError, FileNotFoundError) as e: # pragma: no cover
495
495
  raise MissingDependencyError(
496
496
  "Pandoc version 2 or above is a required system dependency. "
497
497
  "Please install it on your system and make sure its available in $PATH."
kreuzberg/_extractors/_pdf.py CHANGED
@@ -1,6 +1,5 @@
1
1
  from __future__ import annotations
2
2
 
3
- import asyncio
4
3
  import contextlib
5
4
  import io
6
5
  import logging
@@ -41,7 +40,7 @@ from kreuzberg._utils._errors import create_error_context, should_retry
41
40
  from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
42
41
  from kreuzberg._utils._resource_managers import pdf_document, pdf_document_sync, pdf_resources_sync
43
42
  from kreuzberg._utils._string import normalize_spaces
44
- from kreuzberg._utils._sync import run_maybe_async, run_taskgroup_batched
43
+ from kreuzberg._utils._sync import run_maybe_async, run_taskgroup, run_taskgroup_batched
45
44
  from kreuzberg._utils._table import generate_table_summary
46
45
  from kreuzberg._utils._tmp import temporary_file, temporary_file_sync
47
46
  from kreuzberg.exceptions import ParsingError
@@ -154,7 +153,7 @@ class PDFExtractor(Extractor):
154
153
  from kreuzberg._gmft import extract_tables_sync # noqa: PLC0415
155
154
 
156
155
  tables = extract_tables_sync(path)
157
- except ImportError:
156
+ except ImportError: # pragma: no cover
158
157
  tables = []
159
158
 
160
159
  if not self.config.force_ocr and self._validate_extracted_text(text):
@@ -231,7 +230,7 @@ class PDFExtractor(Extractor):
231
230
  img_counter += 1
232
231
 
233
232
  if tasks:
234
- results = await asyncio.gather(*tasks)
233
+ results = await run_taskgroup(*tasks)
235
234
  return [img for img in results if img is not None]
236
235
 
237
236
  return []
@@ -501,7 +500,7 @@ class PDFExtractor(Extractor):
501
500
  except (ValueError, TypeError, KeyError, RuntimeError) as e: # noqa: PERF203
502
501
  last_exception = e
503
502
  continue
504
- except OSError as e:
503
+ except OSError as e: # pragma: no cover
505
504
  raise ParsingError(f"Failed to parse PDF: {e}") from e
506
505
 
507
506
  if last_exception:
@@ -521,7 +520,7 @@ class PDFExtractor(Extractor):
521
520
  for password in passwords:
522
521
  try:
523
522
  return await extract_pdf_metadata(content, password=password)
524
- except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203
523
+ except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203 # pragma: no cover
525
524
  last_exception = e
526
525
  continue
527
526
 
@@ -539,7 +538,7 @@ class PDFExtractor(Extractor):
539
538
  for password in passwords:
540
539
  try:
541
540
  return extract_pdf_metadata_sync(content, password=password)
542
- except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203
541
+ except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203 # pragma: no cover
543
542
  last_exception = e
544
543
  continue
545
544
 
kreuzberg/_extractors/_presentation.py CHANGED
@@ -142,6 +142,8 @@ class PresentationExtractor(Extractor):
142
142
  if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
143
143
  try:
144
144
  image = shape.image
145
+ if not image.blob or not isinstance(image.blob, bytes):
146
+ continue
145
147
  filename = f"slide_{slide_num}_image_{len(images) + 1}.{image.ext}"
146
148
 
147
149
  images.append(
@@ -162,6 +164,8 @@ class PresentationExtractor(Extractor):
162
164
  if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
163
165
  try:
164
166
  image = shape.image
167
+ if not image.blob or not isinstance(image.blob, bytes):
168
+ continue
165
169
  filename = f"slide_{slide_num}_group_image_{image_count + len(images) + 1}.{image.ext}"
166
170
  images.append(
167
171
  ExtractedImage(data=image.blob, format=image.ext, filename=filename, page_number=slide_num)
kreuzberg/_extractors/_spread_sheet.py CHANGED
@@ -197,7 +197,6 @@ class SpreadSheetExtractor(Extractor):
197
197
  if not data or not any(row for row in data):
198
198
  return f"## {sheet_name}\n\n*Empty sheet*"
199
199
 
200
- # Normalize row lengths to avoid polars ShapeError
201
200
  if data:
202
201
  max_cols = max(len(row) if row else 0 for row in data)
203
202
  data = [row + [None] * (max_cols - len(row)) if row else [None] * max_cols for row in data] # type: ignore[list-item]