kreuzberg 3.16.0__py3-none-any.whl → 3.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. kreuzberg/__init__.py +2 -0
  2. kreuzberg/_config.py +8 -9
  3. kreuzberg/_extractors/_base.py +0 -46
  4. kreuzberg/_extractors/_html.py +1 -1
  5. kreuzberg/_extractors/_pandoc.py +2 -2
  6. kreuzberg/_extractors/_pdf.py +4 -4
  7. kreuzberg/_gmft.py +2 -2
  8. kreuzberg/_mcp/server.py +1 -1
  9. kreuzberg/_mime_types.py +1 -1
  10. kreuzberg/_ocr/_easyocr.py +4 -9
  11. kreuzberg/_ocr/_paddleocr.py +1 -1
  12. kreuzberg/_ocr/_tesseract.py +15 -25
  13. kreuzberg/_token_reduction/__init__.py +11 -0
  14. kreuzberg/_token_reduction/_reducer.py +439 -0
  15. kreuzberg/_token_reduction/_stopwords.py +116 -0
  16. kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
  17. kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
  18. kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
  19. kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
  20. kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
  21. kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
  22. kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
  23. kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
  24. kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
  25. kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
  26. kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
  27. kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
  28. kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
  29. kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
  30. kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
  31. kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
  32. kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
  33. kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
  34. kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
  35. kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
  36. kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
  37. kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
  38. kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
  39. kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
  40. kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
  41. kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
  42. kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
  43. kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
  44. kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
  45. kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
  46. kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
  47. kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
  48. kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
  49. kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
  50. kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
  51. kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
  52. kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
  53. kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
  54. kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
  55. kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
  56. kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
  57. kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
  58. kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
  59. kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
  60. kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
  61. kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
  62. kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
  63. kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
  64. kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
  65. kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
  66. kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
  67. kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
  68. kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
  69. kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
  70. kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
  71. kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
  72. kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
  73. kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
  74. kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
  75. kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
  76. kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
  77. kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
  78. kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
  79. kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
  80. kreuzberg/_types.py +35 -3
  81. kreuzberg/_utils/_image_preprocessing.py +1 -1
  82. kreuzberg/_utils/_ref.py +14 -6
  83. kreuzberg/exceptions.py +0 -1
  84. kreuzberg/extraction.py +25 -9
  85. {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/METADATA +4 -3
  86. kreuzberg-3.17.0.dist-info/RECORD +128 -0
  87. kreuzberg-3.16.0.dist-info/RECORD +0 -61
  88. {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/WHEEL +0 -0
  89. {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/entry_points.txt +0 -0
  90. {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/__init__.py CHANGED
@@ -19,6 +19,7 @@ from ._types import (
19
19
  SpacyEntityExtractionConfig,
20
20
  TableData,
21
21
  TesseractConfig,
22
+ TokenReductionConfig,
22
23
  )
23
24
  from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
24
25
  from .extraction import (
@@ -57,6 +58,7 @@ __all__ = [
57
58
  "SpacyEntityExtractionConfig",
58
59
  "TableData",
59
60
  "TesseractConfig",
61
+ "TokenReductionConfig",
60
62
  "ValidationError",
61
63
  "__version__",
62
64
  "batch_extract_bytes",
kreuzberg/_config.py CHANGED
@@ -69,12 +69,11 @@ def _build_ocr_config_from_cli(
69
69
  try:
70
70
  match ocr_backend:
71
71
  case "tesseract":
72
- # Handle PSM mode conversion from int to enum
73
72
  processed_args = backend_args.copy()
74
73
  if "psm" in processed_args and isinstance(processed_args["psm"], int):
75
74
  try:
76
75
  processed_args["psm"] = PSMMode(processed_args["psm"])
77
- except ValueError as e:
76
+ except ValueError as e: # pragma: no cover
78
77
  raise ValidationError(
79
78
  f"Invalid PSM mode value: {processed_args['psm']}",
80
79
  context={"psm_value": processed_args["psm"], "error": str(e)},
@@ -84,7 +83,7 @@ def _build_ocr_config_from_cli(
84
83
  return EasyOCRConfig(**backend_args)
85
84
  case "paddleocr":
86
85
  return PaddleOCRConfig(**backend_args)
87
- case _:
86
+ case _: # pragma: no cover
88
87
  return None
89
88
  except (TypeError, ValueError) as e:
90
89
  raise ValidationError(
@@ -122,7 +121,7 @@ def _configure_gmft(
122
121
  try:
123
122
  if cli_args.get("gmft_config"):
124
123
  gmft_config = GMFTConfig(**cli_args["gmft_config"])
125
- elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
124
+ elif "gmft" in file_config and isinstance(file_config["gmft"], dict): # pragma: no cover
126
125
  gmft_config = GMFTConfig(**file_config["gmft"])
127
126
  except (TypeError, ValueError) as e:
128
127
  raise ValidationError(
@@ -130,7 +129,7 @@ def _configure_gmft(
130
129
  context={"gmft_config": cli_args.get("gmft_config") or file_config.get("gmft"), "error": str(e)},
131
130
  ) from e
132
131
 
133
- if gmft_config:
132
+ if gmft_config: # pragma: no cover
134
133
  config_dict["gmft_config"] = gmft_config
135
134
 
136
135
 
@@ -161,7 +160,7 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
161
160
  try:
162
161
  with config_path.open("rb") as f:
163
162
  data = tomllib.load(f)
164
- except FileNotFoundError as e:
163
+ except FileNotFoundError as e: # pragma: no cover
165
164
  raise ValidationError(f"Configuration file not found: {config_path}") from e
166
165
  except tomllib.TOMLDecodeError as e:
167
166
  raise ValidationError(f"Invalid TOML in configuration file: {e}") from e
@@ -247,7 +246,7 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
247
246
 
248
247
  try:
249
248
  return ExtractionConfig(**extraction_config)
250
- except (TypeError, ValueError) as e:
249
+ except (TypeError, ValueError) as e: # pragma: no cover
251
250
  raise ValidationError(
252
251
  f"Invalid extraction configuration: {e}",
253
252
  context={"config": extraction_config, "error": str(e)},
@@ -271,7 +270,7 @@ def build_extraction_config(
271
270
 
272
271
  try:
273
272
  return ExtractionConfig(**config_dict)
274
- except (TypeError, ValueError) as e:
273
+ except (TypeError, ValueError) as e: # pragma: no cover
275
274
  raise ValidationError(
276
275
  f"Invalid extraction configuration: {e}",
277
276
  context={"config": config_dict, "error": str(e)},
@@ -293,7 +292,7 @@ def find_config_file(start_path: Path | None = None) -> Path | None:
293
292
  data = tomllib.load(f)
294
293
  if "tool" in data and "kreuzberg" in data["tool"]:
295
294
  return pyproject_toml
296
- except OSError as e:
295
+ except OSError as e: # pragma: no cover
297
296
  raise ValidationError(
298
297
  f"Failed to read pyproject.toml: {e}",
299
298
  context={"file": str(pyproject_toml), "error": str(e)},
@@ -96,7 +96,6 @@ class Extractor(ABC):
96
96
  )
97
97
 
98
98
  def _check_image_memory_limits(self, images: list[ExtractedImage]) -> list[ExtractedImage]:
99
- """Filter images based on memory safety limits."""
100
99
  if not images:
101
100
  return []
102
101
 
@@ -142,17 +141,6 @@ class Extractor(ABC):
142
141
  _HASH_SAMPLE_SIZE = 512
143
142
 
144
143
  def _compute_image_hash(self, img: ExtractedImage) -> int:
145
- """Compute hash for image deduplication using progressive hashing.
146
-
147
- For small images (<1KB), hash the entire content.
148
- For larger images, use size + first/last bytes for quick comparison.
149
-
150
- Args:
151
- img: Image to hash
152
-
153
- Returns:
154
- Hash value for deduplication
155
- """
156
144
  data_len = len(img.data)
157
145
 
158
146
  if data_len < self._SMALL_IMAGE_THRESHOLD:
@@ -189,14 +177,6 @@ class Extractor(ABC):
189
177
  return unique_images
190
178
 
191
179
  def _prepare_ocr_config(self, backend_name: str) -> dict[str, Any]:
192
- """Prepare OCR configuration for the specified backend.
193
-
194
- Args:
195
- backend_name: Name of the OCR backend
196
-
197
- Returns:
198
- Configuration dictionary for the backend
199
- """
200
180
  default_config: TesseractConfig | EasyOCRConfig | PaddleOCRConfig
201
181
  config_class: type[TesseractConfig | EasyOCRConfig | PaddleOCRConfig]
202
182
 
@@ -222,14 +202,6 @@ class Extractor(ABC):
222
202
  return cfg
223
203
 
224
204
  def _validate_image_for_ocr(self, img: ExtractedImage) -> str | None:
225
- """Validate if an image is suitable for OCR processing.
226
-
227
- Args:
228
- img: Image to validate
229
-
230
- Returns:
231
- Reason for skipping if invalid, None if valid
232
- """
233
205
  fmt = img.format.lower()
234
206
  if fmt not in self.config.image_ocr_formats:
235
207
  return f"Unsupported format: {img.format}"
@@ -247,16 +219,6 @@ class Extractor(ABC):
247
219
  return None
248
220
 
249
221
  async def _ocr_single_image(self, target: ExtractedImage, backend: Any, cfg: dict[str, Any]) -> ImageOCRResult:
250
- """Process a single image with OCR.
251
-
252
- Args:
253
- target: Image to process
254
- backend: OCR backend instance
255
- cfg: Configuration for the backend
256
-
257
- Returns:
258
- OCR result for the image
259
- """
260
222
  try:
261
223
  start = time.time()
262
224
  pil_img = Image.open(io.BytesIO(target.data))
@@ -284,14 +246,6 @@ class Extractor(ABC):
284
246
  async def _process_images_with_ocr(
285
247
  self, images: tuple[ExtractedImage, ...] | list[ExtractedImage]
286
248
  ) -> list[ImageOCRResult]:
287
- """Process multiple images with OCR.
288
-
289
- Args:
290
- images: Tuple or list of images to process
291
-
292
- Returns:
293
- List of OCR results
294
- """
295
249
  if not images or not self.config.ocr_extracted_images:
296
250
  return []
297
251
 
@@ -102,7 +102,7 @@ class HTMLExtractor(Extractor):
102
102
  try:
103
103
  with Image.open(io.BytesIO(image_data)) as pil_img:
104
104
  dimensions = pil_img.size
105
- except (OSError, ValueError) as e:
105
+ except (OSError, ValueError) as e: # pragma: no cover
106
106
  logger.debug("Could not determine image dimensions for %s: %s", format_name, e)
107
107
 
108
108
  alt_val = img.get("alt") # type: ignore[union-attr]
@@ -253,7 +253,7 @@ class PandocExtractor(Extractor):
253
253
  "Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
254
254
  )
255
255
 
256
- except FileNotFoundError as e:
256
+ except FileNotFoundError as e: # pragma: no cover
257
257
  raise MissingDependencyError(
258
258
  "Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
259
259
  ) from e
@@ -491,7 +491,7 @@ class PandocExtractor(Extractor):
491
491
  "Please install it on your system and make sure its available in $PATH."
492
492
  )
493
493
 
494
- except (subprocess.SubprocessError, FileNotFoundError) as e:
494
+ except (subprocess.SubprocessError, FileNotFoundError) as e: # pragma: no cover
495
495
  raise MissingDependencyError(
496
496
  "Pandoc version 2 or above is a required system dependency. "
497
497
  "Please install it on your system and make sure its available in $PATH."
@@ -153,7 +153,7 @@ class PDFExtractor(Extractor):
153
153
  from kreuzberg._gmft import extract_tables_sync # noqa: PLC0415
154
154
 
155
155
  tables = extract_tables_sync(path)
156
- except ImportError:
156
+ except ImportError: # pragma: no cover
157
157
  tables = []
158
158
 
159
159
  if not self.config.force_ocr and self._validate_extracted_text(text):
@@ -500,7 +500,7 @@ class PDFExtractor(Extractor):
500
500
  except (ValueError, TypeError, KeyError, RuntimeError) as e: # noqa: PERF203
501
501
  last_exception = e
502
502
  continue
503
- except OSError as e:
503
+ except OSError as e: # pragma: no cover
504
504
  raise ParsingError(f"Failed to parse PDF: {e}") from e
505
505
 
506
506
  if last_exception:
@@ -520,7 +520,7 @@ class PDFExtractor(Extractor):
520
520
  for password in passwords:
521
521
  try:
522
522
  return await extract_pdf_metadata(content, password=password)
523
- except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203
523
+ except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203 # pragma: no cover
524
524
  last_exception = e
525
525
  continue
526
526
 
@@ -538,7 +538,7 @@ class PDFExtractor(Extractor):
538
538
  for password in passwords:
539
539
  try:
540
540
  return extract_pdf_metadata_sync(content, password=password)
541
- except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203
541
+ except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203 # pragma: no cover
542
542
  last_exception = e
543
543
  continue
544
544
 
kreuzberg/_gmft.py CHANGED
@@ -99,7 +99,7 @@ async def extract_tables(
99
99
  "size": stat.st_size,
100
100
  "mtime": stat.st_mtime,
101
101
  }
102
- except OSError:
102
+ except OSError: # pragma: no cover
103
103
  file_info = {
104
104
  "path": str(path),
105
105
  "size": 0,
@@ -215,7 +215,7 @@ def extract_tables_sync(
215
215
  "size": stat.st_size,
216
216
  "mtime": stat.st_mtime,
217
217
  }
218
- except OSError:
218
+ except OSError: # pragma: no cover
219
219
  file_info = {
220
220
  "path": str(path),
221
221
  "size": 0,
kreuzberg/_mcp/server.py CHANGED
@@ -39,7 +39,7 @@ def _validate_file_path(file_path: str) -> Path:
39
39
  """
40
40
  try:
41
41
  path = Path(file_path).resolve()
42
- except (OSError, ValueError) as e:
42
+ except (OSError, ValueError) as e: # pragma: no cover
43
43
  raise ValidationError(
44
44
  f"Invalid file path: {file_path}",
45
45
  context={"file_path": file_path, "error": str(e)},
kreuzberg/_mime_types.py CHANGED
@@ -229,7 +229,7 @@ def validate_mime_type(
229
229
  "mtime": stat.st_mtime if stat else 0,
230
230
  "check_file_exists": check_file_exists,
231
231
  }
232
- except OSError:
232
+ except OSError: # pragma: no cover
233
233
  file_info = {
234
234
  "path": str(path),
235
235
  "size": 0,
@@ -44,11 +44,9 @@ HAS_EASYOCR: bool = False
44
44
  def _import_easyocr() -> tuple[Any, Any]:
45
45
  global HAS_EASYOCR, easyocr, torch
46
46
 
47
- # If easyocr is already set (either real module or mock), return it
48
47
  if easyocr is not None:
49
48
  return easyocr, torch
50
49
 
51
- # If explicitly disabled for testing
52
50
  if not HAS_EASYOCR and easyocr is None:
53
51
  return None, None
54
52
 
@@ -57,14 +55,14 @@ def _import_easyocr() -> tuple[Any, Any]:
57
55
 
58
56
  try:
59
57
  import torch as _torch # noqa: PLC0415
60
- except ImportError:
58
+ except ImportError: # pragma: no cover
61
59
  _torch = None # type: ignore[assignment]
62
60
 
63
61
  easyocr = _easyocr
64
62
  torch = _torch
65
63
  HAS_EASYOCR = True
66
64
  return easyocr, torch
67
- except ImportError:
65
+ except ImportError: # pragma: no cover
68
66
  return None, None
69
67
 
70
68
 
@@ -161,7 +159,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
161
159
  async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
162
160
  try:
163
161
  import numpy as np # noqa: PLC0415
164
- except ImportError as e:
162
+ except ImportError as e: # pragma: no cover
165
163
  raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
166
164
 
167
165
  use_cache = kwargs.pop("use_cache", True)
@@ -314,7 +312,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
314
312
 
315
313
  @classmethod
316
314
  def _is_gpu_available(cls) -> bool:
317
- # Use the module-level torch variable directly to respect patches
318
315
  if torch is None:
319
316
  return False
320
317
  return bool(torch.cuda.is_available())
@@ -324,7 +321,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
324
321
  if cls._reader is not None:
325
322
  return
326
323
 
327
- # Validate language first before attempting import
328
324
  languages = cls._validate_language_code(kwargs.pop("language", "en"))
329
325
 
330
326
  easyocr_module, _ = _import_easyocr()
@@ -409,7 +405,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
409
405
  def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
410
406
  try:
411
407
  import numpy as np # noqa: PLC0415
412
- except ImportError as e:
408
+ except ImportError as e: # pragma: no cover
413
409
  raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
414
410
 
415
411
  use_cache = kwargs.pop("use_cache", True)
@@ -483,7 +479,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
483
479
  if cls._reader is not None:
484
480
  return
485
481
 
486
- # Validate language first before attempting import
487
482
  languages = cls._validate_language_code(kwargs.pop("language", "en"))
488
483
 
489
484
  easyocr_module, _ = _import_easyocr()
@@ -60,7 +60,7 @@ def _import_paddleocr() -> tuple[Any, Any]:
60
60
  PaddleOCR = _PaddleOCR
61
61
  HAS_PADDLEOCR = True
62
62
  return np, PaddleOCR
63
- except ImportError:
63
+ except ImportError: # pragma: no cover
64
64
  return None, None
65
65
 
66
66
 
@@ -215,7 +215,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
215
215
 
216
216
  try:
217
217
  await run_sync(save_image.save, str(image_path), format="PNG")
218
- except OSError as e:
218
+ except OSError as e: # pragma: no cover
219
219
  if "cannot write mode" not in str(e):
220
220
  raise
221
221
  save_image = image.convert("RGB")
@@ -357,7 +357,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
357
357
  try:
358
358
  stat = path.stat()
359
359
  file_info = {"path": str(path.resolve()), "size": stat.st_size, "mtime": stat.st_mtime}
360
- except OSError:
360
+ except OSError: # pragma: no cover
361
361
  file_info = {"path": str(path), "size": 0, "mtime": 0}
362
362
 
363
363
  cache_kwargs = {
@@ -399,7 +399,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
399
399
  await ocr_cache.aset(extraction_result, **final_cache_kwargs)
400
400
 
401
401
  return extraction_result
402
- except (RuntimeError, OSError) as e:
402
+ except (RuntimeError, OSError) as e: # pragma: no cover
403
403
  raise OCRError(f"Failed to OCR using tesseract: {e}") from e
404
404
  finally:
405
405
  await unlink()
@@ -432,7 +432,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
432
432
 
433
433
  try:
434
434
  df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
435
- except (ImportError, IndexError):
435
+ except (ImportError, IndexError): # pragma: no cover
436
436
  df = None
437
437
 
438
438
  table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None} # type: ignore[typeddict-item]
@@ -444,7 +444,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
444
444
  tables=[table],
445
445
  chunks=text_result.chunks,
446
446
  )
447
- except (ValueError, KeyError, ImportError):
447
+ except (ValueError, KeyError, ImportError): # pragma: no cover
448
448
  pass
449
449
 
450
450
  return text_result
@@ -507,12 +507,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
507
507
  table_min_confidence: float = 30.0,
508
508
  **_kwargs: Any,
509
509
  ) -> ExtractionResult:
510
- config = html_to_markdown_config or HTMLToMarkdownConfig(
511
- escape_asterisks=False,
512
- escape_underscores=False,
513
- extract_metadata=False,
514
- strip=["meta", "title"],
515
- )
510
+ config = html_to_markdown_config or HTMLToMarkdownConfig()
516
511
 
517
512
  tables: list[TableData] = []
518
513
  if enable_table_detection:
@@ -678,10 +673,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
678
673
 
679
674
  html_config = HTMLToMarkdownConfig(
680
675
  custom_converters=converters,
681
- escape_asterisks=False,
682
- escape_underscores=False,
683
- extract_metadata=False,
684
- strip=["meta", "title"],
685
676
  )
686
677
 
687
678
  config_dict = html_config.to_dict()
@@ -761,7 +752,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
761
752
 
762
753
  try:
763
754
  df = pl.DataFrame(table_data[1:], schema=table_data[0])
764
- except (ImportError, IndexError):
755
+ except (ImportError, IndexError): # pragma: no cover
765
756
  df = None
766
757
 
767
758
  table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None} # type: ignore[typeddict-item]
@@ -773,7 +764,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
773
764
  tables=[table],
774
765
  chunks=text_result.chunks,
775
766
  )
776
- except (ValueError, KeyError, ImportError):
767
+ except (ValueError, KeyError, ImportError): # pragma: no cover
777
768
  pass
778
769
 
779
770
  return text_result
@@ -810,7 +801,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
810
801
 
811
802
  try:
812
803
  df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
813
- except (ImportError, IndexError):
804
+ except (ImportError, IndexError): # pragma: no cover
814
805
  df = None
815
806
 
816
807
  dummy_image = Image.new("RGB", (1, 1), "white")
@@ -823,7 +814,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
823
814
  "metadata": {"bbox": (min_x, min_y, max_x, max_y)},
824
815
  } # type: ignore[typeddict-unknown-key]
825
816
  tables.append(table)
826
- except (ValueError, KeyError, ImportError):
817
+ except (ValueError, KeyError, ImportError): # pragma: no cover
827
818
  pass
828
819
 
829
820
  return tables
@@ -879,7 +870,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
879
870
  env = {"OMP_THREAD_LIMIT": "1"} if sys.platform.startswith("linux") else None
880
871
  try:
881
872
  result = await run_process(command, env=env)
882
- except (subprocess.CalledProcessError, FileNotFoundError) as e:
873
+ except (subprocess.CalledProcessError, FileNotFoundError) as e: # pragma: no cover
883
874
  raise MissingDependencyError(
884
875
  "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
885
876
  ) from e
@@ -890,7 +881,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
890
881
  )
891
882
 
892
883
  cls._version_checked = True
893
- except FileNotFoundError as e:
884
+ except FileNotFoundError as e: # pragma: no cover
894
885
  raise MissingDependencyError(
895
886
  "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
896
887
  ) from e
@@ -1087,7 +1078,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
1087
1078
  "size": stat.st_size,
1088
1079
  "mtime": stat.st_mtime,
1089
1080
  }
1090
- except OSError:
1081
+ except OSError: # pragma: no cover
1091
1082
  return {
1092
1083
  "path": str(path),
1093
1084
  "size": 0,
@@ -1095,7 +1086,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
1095
1086
  }
1096
1087
 
1097
1088
  def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
1098
- """Convert a worker result dict to ExtractionResult."""
1099
1089
  if result_dict.get("success"):
1100
1090
  return ExtractionResult(
1101
1091
  content=str(result_dict.get("text", "")),
@@ -1189,7 +1179,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
1189
1179
  command = ["tesseract", "--version"]
1190
1180
  try:
1191
1181
  result = subprocess.run(command, capture_output=True, text=True, check=True, encoding="utf-8")
1192
- except (subprocess.CalledProcessError, FileNotFoundError) as e:
1182
+ except (subprocess.CalledProcessError, FileNotFoundError) as e: # pragma: no cover
1193
1183
  raise MissingDependencyError(
1194
1184
  "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
1195
1185
  ) from e
@@ -1200,7 +1190,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
1200
1190
  )
1201
1191
 
1202
1192
  cls._version_checked = True
1203
- except FileNotFoundError as e:
1193
+ except FileNotFoundError as e: # pragma: no cover
1204
1194
  raise MissingDependencyError(
1205
1195
  "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
1206
1196
  ) from e
@@ -0,0 +1,11 @@
1
+ from __future__ import annotations
2
+
3
+ from kreuzberg._token_reduction._reducer import ReductionStats, get_reduction_stats, reduce_tokens
4
+ from kreuzberg._token_reduction._stopwords import StopwordsManager
5
+
6
+ __all__ = [
7
+ "ReductionStats",
8
+ "StopwordsManager",
9
+ "get_reduction_stats",
10
+ "reduce_tokens",
11
+ ]