PyPI - kreuzberg - Versions diffs - 3.16.0__py3-none-any.whl → 3.17.0__py3-none-any.whl - Mend

kreuzberg 3.16.0__py3-none-any.whl → 3.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90) hide show

kreuzberg/__init__.py +2 -0
kreuzberg/_config.py +8 -9
kreuzberg/_extractors/_base.py +0 -46
kreuzberg/_extractors/_html.py +1 -1
kreuzberg/_extractors/_pandoc.py +2 -2
kreuzberg/_extractors/_pdf.py +4 -4
kreuzberg/_gmft.py +2 -2
kreuzberg/_mcp/server.py +1 -1
kreuzberg/_mime_types.py +1 -1
kreuzberg/_ocr/_easyocr.py +4 -9
kreuzberg/_ocr/_paddleocr.py +1 -1
kreuzberg/_ocr/_tesseract.py +15 -25
kreuzberg/_token_reduction/__init__.py +11 -0
kreuzberg/_token_reduction/_reducer.py +439 -0
kreuzberg/_token_reduction/_stopwords.py +116 -0
kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
kreuzberg/_types.py +35 -3
kreuzberg/_utils/_image_preprocessing.py +1 -1
kreuzberg/_utils/_ref.py +14 -6
kreuzberg/exceptions.py +0 -1
kreuzberg/extraction.py +25 -9
{kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/METADATA +4 -3
kreuzberg-3.17.0.dist-info/RECORD +128 -0
kreuzberg-3.16.0.dist-info/RECORD +0 -61
{kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/WHEEL +0 -0
{kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/__init__.py CHANGED Viewed

@@ -19,6 +19,7 @@ from ._types import (
     SpacyEntityExtractionConfig,
     TableData,
     TesseractConfig,
+    TokenReductionConfig,
 )
 from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
 from .extraction import (
@@ -57,6 +58,7 @@ __all__ = [
     "SpacyEntityExtractionConfig",
     "TableData",
     "TesseractConfig",
+    "TokenReductionConfig",
     "ValidationError",
     "__version__",
     "batch_extract_bytes",

kreuzberg/_config.py CHANGED Viewed

@@ -69,12 +69,11 @@ def _build_ocr_config_from_cli(
     try:
         match ocr_backend:
             case "tesseract":
-                # Handle PSM mode conversion from int to enum
                 processed_args = backend_args.copy()
                 if "psm" in processed_args and isinstance(processed_args["psm"], int):
                     try:
                         processed_args["psm"] = PSMMode(processed_args["psm"])
-                    except ValueError as e:
+                    except ValueError as e:  # pragma: no cover
                         raise ValidationError(
                             f"Invalid PSM mode value: {processed_args['psm']}",
                             context={"psm_value": processed_args["psm"], "error": str(e)},
@@ -84,7 +83,7 @@ def _build_ocr_config_from_cli(
                 return EasyOCRConfig(**backend_args)
             case "paddleocr":
                 return PaddleOCRConfig(**backend_args)
-            case _:
+            case _:  # pragma: no cover
                 return None
     except (TypeError, ValueError) as e:
         raise ValidationError(
@@ -122,7 +121,7 @@ def _configure_gmft(
     try:
         if cli_args.get("gmft_config"):
             gmft_config = GMFTConfig(**cli_args["gmft_config"])
-        elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
+        elif "gmft" in file_config and isinstance(file_config["gmft"], dict):  # pragma: no cover
             gmft_config = GMFTConfig(**file_config["gmft"])
     except (TypeError, ValueError) as e:
         raise ValidationError(
@@ -130,7 +129,7 @@ def _configure_gmft(
             context={"gmft_config": cli_args.get("gmft_config") or file_config.get("gmft"), "error": str(e)},
         ) from e
-    if gmft_config:
+    if gmft_config:  # pragma: no cover
         config_dict["gmft_config"] = gmft_config
@@ -161,7 +160,7 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
     try:
         with config_path.open("rb") as f:
             data = tomllib.load(f)
-    except FileNotFoundError as e:
+    except FileNotFoundError as e:  # pragma: no cover
         raise ValidationError(f"Configuration file not found: {config_path}") from e
     except tomllib.TOMLDecodeError as e:
         raise ValidationError(f"Invalid TOML in configuration file: {e}") from e
@@ -247,7 +246,7 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
     try:
         return ExtractionConfig(**extraction_config)
-    except (TypeError, ValueError) as e:
+    except (TypeError, ValueError) as e:  # pragma: no cover
         raise ValidationError(
             f"Invalid extraction configuration: {e}",
             context={"config": extraction_config, "error": str(e)},
@@ -271,7 +270,7 @@ def build_extraction_config(
     try:
         return ExtractionConfig(**config_dict)
-    except (TypeError, ValueError) as e:
+    except (TypeError, ValueError) as e:  # pragma: no cover
         raise ValidationError(
             f"Invalid extraction configuration: {e}",
             context={"config": config_dict, "error": str(e)},
@@ -293,7 +292,7 @@ def find_config_file(start_path: Path | None = None) -> Path | None:
                     data = tomllib.load(f)
                 if "tool" in data and "kreuzberg" in data["tool"]:
                     return pyproject_toml
-            except OSError as e:
+            except OSError as e:  # pragma: no cover
                 raise ValidationError(
                     f"Failed to read pyproject.toml: {e}",
                     context={"file": str(pyproject_toml), "error": str(e)},

kreuzberg/_extractors/_base.py CHANGED Viewed

@@ -96,7 +96,6 @@ class Extractor(ABC):
         )
     def _check_image_memory_limits(self, images: list[ExtractedImage]) -> list[ExtractedImage]:
-        """Filter images based on memory safety limits."""
         if not images:
             return []
@@ -142,17 +141,6 @@ class Extractor(ABC):
     _HASH_SAMPLE_SIZE = 512
     def _compute_image_hash(self, img: ExtractedImage) -> int:
-        """Compute hash for image deduplication using progressive hashing.
-        For small images (<1KB), hash the entire content.
-        For larger images, use size + first/last bytes for quick comparison.
-        Args:
-            img: Image to hash
-        Returns:
-            Hash value for deduplication
-        """
         data_len = len(img.data)
         if data_len < self._SMALL_IMAGE_THRESHOLD:
@@ -189,14 +177,6 @@ class Extractor(ABC):
         return unique_images
     def _prepare_ocr_config(self, backend_name: str) -> dict[str, Any]:
-        """Prepare OCR configuration for the specified backend.
-        Args:
-            backend_name: Name of the OCR backend
-        Returns:
-            Configuration dictionary for the backend
-        """
         default_config: TesseractConfig | EasyOCRConfig | PaddleOCRConfig
         config_class: type[TesseractConfig | EasyOCRConfig | PaddleOCRConfig]
@@ -222,14 +202,6 @@ class Extractor(ABC):
         return cfg
     def _validate_image_for_ocr(self, img: ExtractedImage) -> str | None:
-        """Validate if an image is suitable for OCR processing.
-        Args:
-            img: Image to validate
-        Returns:
-            Reason for skipping if invalid, None if valid
-        """
         fmt = img.format.lower()
         if fmt not in self.config.image_ocr_formats:
             return f"Unsupported format: {img.format}"
@@ -247,16 +219,6 @@ class Extractor(ABC):
         return None
     async def _ocr_single_image(self, target: ExtractedImage, backend: Any, cfg: dict[str, Any]) -> ImageOCRResult:
-        """Process a single image with OCR.
-        Args:
-            target: Image to process
-            backend: OCR backend instance
-            cfg: Configuration for the backend
-        Returns:
-            OCR result for the image
-        """
         try:
             start = time.time()
             pil_img = Image.open(io.BytesIO(target.data))
@@ -284,14 +246,6 @@ class Extractor(ABC):
     async def _process_images_with_ocr(
         self, images: tuple[ExtractedImage, ...] | list[ExtractedImage]
     ) -> list[ImageOCRResult]:
-        """Process multiple images with OCR.
-        Args:
-            images: Tuple or list of images to process
-        Returns:
-            List of OCR results
-        """
         if not images or not self.config.ocr_extracted_images:
             return []

kreuzberg/_extractors/_html.py CHANGED Viewed

@@ -102,7 +102,7 @@ class HTMLExtractor(Extractor):
                     try:
                         with Image.open(io.BytesIO(image_data)) as pil_img:
                             dimensions = pil_img.size
-                    except (OSError, ValueError) as e:
+                    except (OSError, ValueError) as e:  # pragma: no cover
                         logger.debug("Could not determine image dimensions for %s: %s", format_name, e)
                     alt_val = img.get("alt")  # type: ignore[union-attr]

kreuzberg/_extractors/_pandoc.py CHANGED Viewed

@@ -253,7 +253,7 @@ class PandocExtractor(Extractor):
                 "Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
             )
-        except FileNotFoundError as e:
+        except FileNotFoundError as e:  # pragma: no cover
             raise MissingDependencyError(
                 "Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
             ) from e
@@ -491,7 +491,7 @@ class PandocExtractor(Extractor):
                 "Please install it on your system and make sure its available in $PATH."
             )
-        except (subprocess.SubprocessError, FileNotFoundError) as e:
+        except (subprocess.SubprocessError, FileNotFoundError) as e:  # pragma: no cover
             raise MissingDependencyError(
                 "Pandoc version 2 or above is a required system dependency. "
                 "Please install it on your system and make sure its available in $PATH."

kreuzberg/_extractors/_pdf.py CHANGED Viewed

@@ -153,7 +153,7 @@ class PDFExtractor(Extractor):
                 from kreuzberg._gmft import extract_tables_sync  # noqa: PLC0415
                 tables = extract_tables_sync(path)
-            except ImportError:
+            except ImportError:  # pragma: no cover
                 tables = []
         if not self.config.force_ocr and self._validate_extracted_text(text):
@@ -500,7 +500,7 @@ class PDFExtractor(Extractor):
             except (ValueError, TypeError, KeyError, RuntimeError) as e:  # noqa: PERF203
                 last_exception = e
                 continue
-            except OSError as e:
+            except OSError as e:  # pragma: no cover
                 raise ParsingError(f"Failed to parse PDF: {e}") from e
         if last_exception:
@@ -520,7 +520,7 @@ class PDFExtractor(Extractor):
         for password in passwords:
             try:
                 return await extract_pdf_metadata(content, password=password)
-            except (ParsingError, ValueError, TypeError, OSError) as e:  # noqa: PERF203
+            except (ParsingError, ValueError, TypeError, OSError) as e:  # noqa: PERF203  # pragma: no cover
                 last_exception = e
                 continue
@@ -538,7 +538,7 @@ class PDFExtractor(Extractor):
         for password in passwords:
             try:
                 return extract_pdf_metadata_sync(content, password=password)
-            except (ParsingError, ValueError, TypeError, OSError) as e:  # noqa: PERF203
+            except (ParsingError, ValueError, TypeError, OSError) as e:  # noqa: PERF203  # pragma: no cover
                 last_exception = e
                 continue

kreuzberg/_gmft.py CHANGED Viewed

@@ -99,7 +99,7 @@ async def extract_tables(
             "size": stat.st_size,
             "mtime": stat.st_mtime,
         }
-    except OSError:
+    except OSError:  # pragma: no cover
         file_info = {
             "path": str(path),
             "size": 0,
@@ -215,7 +215,7 @@ def extract_tables_sync(
             "size": stat.st_size,
             "mtime": stat.st_mtime,
         }
-    except OSError:
+    except OSError:  # pragma: no cover
         file_info = {
             "path": str(path),
             "size": 0,

kreuzberg/_mcp/server.py CHANGED Viewed

@@ -39,7 +39,7 @@ def _validate_file_path(file_path: str) -> Path:
     """
     try:
         path = Path(file_path).resolve()
-    except (OSError, ValueError) as e:
+    except (OSError, ValueError) as e:  # pragma: no cover
         raise ValidationError(
             f"Invalid file path: {file_path}",
             context={"file_path": file_path, "error": str(e)},

kreuzberg/_mime_types.py CHANGED Viewed

@@ -229,7 +229,7 @@ def validate_mime_type(
                 "mtime": stat.st_mtime if stat else 0,
                 "check_file_exists": check_file_exists,
             }
-        except OSError:
+        except OSError:  # pragma: no cover
             file_info = {
                 "path": str(path),
                 "size": 0,

kreuzberg/_ocr/_easyocr.py CHANGED Viewed

@@ -44,11 +44,9 @@ HAS_EASYOCR: bool = False
 def _import_easyocr() -> tuple[Any, Any]:
     global HAS_EASYOCR, easyocr, torch
-    # If easyocr is already set (either real module or mock), return it
     if easyocr is not None:
         return easyocr, torch
-    # If explicitly disabled for testing
     if not HAS_EASYOCR and easyocr is None:
         return None, None
@@ -57,14 +55,14 @@ def _import_easyocr() -> tuple[Any, Any]:
         try:
             import torch as _torch  # noqa: PLC0415
-        except ImportError:
+        except ImportError:  # pragma: no cover
             _torch = None  # type: ignore[assignment]
         easyocr = _easyocr
         torch = _torch
         HAS_EASYOCR = True
         return easyocr, torch
-    except ImportError:
+    except ImportError:  # pragma: no cover
         return None, None
@@ -161,7 +159,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
     async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
         try:
             import numpy as np  # noqa: PLC0415
-        except ImportError as e:
+        except ImportError as e:  # pragma: no cover
             raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
         use_cache = kwargs.pop("use_cache", True)
@@ -314,7 +312,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
     @classmethod
     def _is_gpu_available(cls) -> bool:
-        # Use the module-level torch variable directly to respect patches
         if torch is None:
             return False
         return bool(torch.cuda.is_available())
@@ -324,7 +321,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         if cls._reader is not None:
             return
-        # Validate language first before attempting import
         languages = cls._validate_language_code(kwargs.pop("language", "en"))
         easyocr_module, _ = _import_easyocr()
@@ -409,7 +405,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
     def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
         try:
             import numpy as np  # noqa: PLC0415
-        except ImportError as e:
+        except ImportError as e:  # pragma: no cover
             raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
         use_cache = kwargs.pop("use_cache", True)
@@ -483,7 +479,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         if cls._reader is not None:
             return
-        # Validate language first before attempting import
         languages = cls._validate_language_code(kwargs.pop("language", "en"))
         easyocr_module, _ = _import_easyocr()

kreuzberg/_ocr/_paddleocr.py CHANGED Viewed

@@ -60,7 +60,7 @@ def _import_paddleocr() -> tuple[Any, Any]:
         PaddleOCR = _PaddleOCR
         HAS_PADDLEOCR = True
         return np, PaddleOCR
-    except ImportError:
+    except ImportError:  # pragma: no cover
         return None, None

kreuzberg/_ocr/_tesseract.py CHANGED Viewed

@@ -215,7 +215,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             try:
                 await run_sync(save_image.save, str(image_path), format="PNG")
-            except OSError as e:
+            except OSError as e:  # pragma: no cover
                 if "cannot write mode" not in str(e):
                     raise
                 save_image = image.convert("RGB")
@@ -357,7 +357,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         try:
             stat = path.stat()
             file_info = {"path": str(path.resolve()), "size": stat.st_size, "mtime": stat.st_mtime}
-        except OSError:
+        except OSError:  # pragma: no cover
             file_info = {"path": str(path), "size": 0, "mtime": 0}
         cache_kwargs = {
@@ -399,7 +399,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                     await ocr_cache.aset(extraction_result, **final_cache_kwargs)
                 return extraction_result
-            except (RuntimeError, OSError) as e:
+            except (RuntimeError, OSError) as e:  # pragma: no cover
                 raise OCRError(f"Failed to OCR using tesseract: {e}") from e
             finally:
                 await unlink()
@@ -432,7 +432,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                 try:
                     df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
-                except (ImportError, IndexError):
+                except (ImportError, IndexError):  # pragma: no cover
                     df = None
                 table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None}  # type: ignore[typeddict-item]
@@ -444,7 +444,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                     tables=[table],
                     chunks=text_result.chunks,
                 )
-        except (ValueError, KeyError, ImportError):
+        except (ValueError, KeyError, ImportError):  # pragma: no cover
             pass
         return text_result
@@ -507,12 +507,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         table_min_confidence: float = 30.0,
         **_kwargs: Any,
     ) -> ExtractionResult:
-        config = html_to_markdown_config or HTMLToMarkdownConfig(
-            escape_asterisks=False,
-            escape_underscores=False,
-            extract_metadata=False,
-            strip=["meta", "title"],
-        )
+        config = html_to_markdown_config or HTMLToMarkdownConfig()
         tables: list[TableData] = []
         if enable_table_detection:
@@ -678,10 +673,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             html_config = HTMLToMarkdownConfig(
                 custom_converters=converters,
-                escape_asterisks=False,
-                escape_underscores=False,
-                extract_metadata=False,
-                strip=["meta", "title"],
             )
             config_dict = html_config.to_dict()
@@ -761,7 +752,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                 try:
                     df = pl.DataFrame(table_data[1:], schema=table_data[0])
-                except (ImportError, IndexError):
+                except (ImportError, IndexError):  # pragma: no cover
                     df = None
                 table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None}  # type: ignore[typeddict-item]
@@ -773,7 +764,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                     tables=[table],
                     chunks=text_result.chunks,
                 )
-        except (ValueError, KeyError, ImportError):
+        except (ValueError, KeyError, ImportError):  # pragma: no cover
             pass
         return text_result
@@ -810,7 +801,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                 try:
                     df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
-                except (ImportError, IndexError):
+                except (ImportError, IndexError):  # pragma: no cover
                     df = None
                 dummy_image = Image.new("RGB", (1, 1), "white")
@@ -823,7 +814,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                     "metadata": {"bbox": (min_x, min_y, max_x, max_y)},
                 }  # type: ignore[typeddict-unknown-key]
                 tables.append(table)
-        except (ValueError, KeyError, ImportError):
+        except (ValueError, KeyError, ImportError):  # pragma: no cover
             pass
         return tables
@@ -879,7 +870,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             env = {"OMP_THREAD_LIMIT": "1"} if sys.platform.startswith("linux") else None
             try:
                 result = await run_process(command, env=env)
-            except (subprocess.CalledProcessError, FileNotFoundError) as e:
+            except (subprocess.CalledProcessError, FileNotFoundError) as e:  # pragma: no cover
                 raise MissingDependencyError(
                     "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
                 ) from e
@@ -890,7 +881,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                 )
             cls._version_checked = True
-        except FileNotFoundError as e:
+        except FileNotFoundError as e:  # pragma: no cover
             raise MissingDependencyError(
                 "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
             ) from e
@@ -1087,7 +1078,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                 "size": stat.st_size,
                 "mtime": stat.st_mtime,
             }
-        except OSError:
+        except OSError:  # pragma: no cover
             return {
                 "path": str(path),
                 "size": 0,
@@ -1095,7 +1086,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             }
     def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
-        """Convert a worker result dict to ExtractionResult."""
         if result_dict.get("success"):
             return ExtractionResult(
                 content=str(result_dict.get("text", "")),
@@ -1189,7 +1179,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             command = ["tesseract", "--version"]
             try:
                 result = subprocess.run(command, capture_output=True, text=True, check=True, encoding="utf-8")
-            except (subprocess.CalledProcessError, FileNotFoundError) as e:
+            except (subprocess.CalledProcessError, FileNotFoundError) as e:  # pragma: no cover
                 raise MissingDependencyError(
                     "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
                 ) from e
@@ -1200,7 +1190,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                 )
             cls._version_checked = True
-        except FileNotFoundError as e:
+        except FileNotFoundError as e:  # pragma: no cover
             raise MissingDependencyError(
                 "Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
             ) from e

kreuzberg/_token_reduction/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+from __future__ import annotations
+from kreuzberg._token_reduction._reducer import ReductionStats, get_reduction_stats, reduce_tokens
+from kreuzberg._token_reduction._stopwords import StopwordsManager
+__all__ = [
+    "ReductionStats",
+    "StopwordsManager",
+    "get_reduction_stats",
+    "reduce_tokens",
+]

kreuzberg 3.16.0__py3-none-any.whl → 3.17.0__py3-none-any.whl

kreuzberg 3.16.0__py3-none-any.whl → 3.17.0__py3-none-any.whl