PyPI - kreuzberg - Versions diffs - 3.13.0__py3-none-any.whl → 3.13.2__py3-none-any.whl - Mend

kreuzberg 3.13.0__py3-none-any.whl → 3.13.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

kreuzberg/_chunker.py +0 -15
kreuzberg/_config.py +0 -124
kreuzberg/_document_classification.py +20 -39
kreuzberg/_entity_extraction.py +0 -29
kreuzberg/_extractors/_base.py +4 -66
kreuzberg/_extractors/_email.py +0 -4
kreuzberg/_extractors/_image.py +0 -2
kreuzberg/_extractors/_pandoc.py +0 -58
kreuzberg/_extractors/_pdf.py +0 -3
kreuzberg/_extractors/_presentation.py +0 -82
kreuzberg/_extractors/_spread_sheet.py +0 -2
kreuzberg/_gmft.py +0 -61
kreuzberg/_language_detection.py +0 -14
kreuzberg/_mime_types.py +0 -17
kreuzberg/_ocr/_base.py +4 -76
kreuzberg/_ocr/_easyocr.py +110 -85
kreuzberg/_ocr/_paddleocr.py +146 -138
kreuzberg/_ocr/_table_extractor.py +0 -76
kreuzberg/_ocr/_tesseract.py +0 -206
kreuzberg/_playa.py +0 -27
kreuzberg/_registry.py +0 -36
kreuzberg/_types.py +16 -119
kreuzberg/_utils/_cache.py +0 -52
kreuzberg/_utils/_device.py +0 -56
kreuzberg/_utils/_document_cache.py +0 -73
kreuzberg/_utils/_errors.py +0 -47
kreuzberg/_utils/_ocr_cache.py +136 -0
kreuzberg/_utils/_pdf_lock.py +0 -14
kreuzberg/_utils/_process_pool.py +0 -47
kreuzberg/_utils/_quality.py +0 -17
kreuzberg/_utils/_ref.py +0 -16
kreuzberg/_utils/_serialization.py +0 -25
kreuzberg/_utils/_string.py +0 -20
kreuzberg/_utils/_sync.py +0 -76
kreuzberg/_utils/_table.py +0 -45
kreuzberg/_utils/_tmp.py +0 -9
kreuzberg/cli.py +2 -2
{kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/METADATA +3 -2
kreuzberg-3.13.2.dist-info/RECORD +57 -0
kreuzberg-3.13.0.dist-info/RECORD +0 -56
{kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/WHEEL +0 -0
{kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_extractors/_pdf.py CHANGED Viewed

@@ -93,7 +93,6 @@ class PDFExtractor(Extractor):
         return self._apply_quality_processing(result)
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        """Pure sync implementation of PDF extraction from bytes."""
         fd, temp_path = tempfile.mkstemp(suffix=".pdf")
         try:
             with os.fdopen(fd, "wb") as f:
@@ -110,7 +109,6 @@ class PDFExtractor(Extractor):
                 Path(temp_path).unlink()
     def extract_path_sync(self, path: Path) -> ExtractionResult:
-        """Pure sync implementation of PDF extraction from path."""
         try:
             text = self._extract_pdf_searchable_text_sync(path)
         except ParsingError:
@@ -330,7 +328,6 @@ class PDFExtractor(Extractor):
         return "\n\n".join(result.content for result in results)
     def _process_pdf_images_with_ocr_direct(self, images: list[Image]) -> str:
-        """Process PIL images directly without temp files."""
         backend = get_ocr_backend(self.config.ocr_backend)
         match self.config.ocr_backend:

kreuzberg/_extractors/_presentation.py CHANGED Viewed

@@ -25,94 +25,23 @@ _NON_WORD_PATTERN = re.compile(r"\W")
 class PresentationExtractor(Extractor):
-    """Extractor for PowerPoint (.pptx) files.
-    This extractor processes PowerPoint presentations and converts their content into Markdown format.
-    It handles slides, shapes, images, tables, and slide notes, preserving the structure and content
-    of the presentation in a readable text format.
-    The extractor provides both synchronous and asynchronous methods for processing files either
-    from disk or from bytes in memory.
-    """
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {POWER_POINT_MIME_TYPE}
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
-        """Asynchronously extract content from PowerPoint file bytes.
-        Args:
-            content: Raw bytes of the PowerPoint file to process.
-        Returns:
-            ExtractionResult: Contains the extracted content in Markdown format,
-                the MIME type, and any additional metadata.
-        """
         return self._extract_pptx(content)
     async def extract_path_async(self, path: Path) -> ExtractionResult:
-        """Asynchronously extract content from a PowerPoint file on disk.
-        Args:
-            path: Path to the PowerPoint file to process.
-        Returns:
-            ExtractionResult: Contains the extracted content in Markdown format,
-                the MIME type, and any additional metadata.
-        """
         content = await AsyncPath(path).read_bytes()
         return self._extract_pptx(content)
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        """Synchronously extract content from PowerPoint file bytes.
-        Args:
-            content: Raw bytes of the PowerPoint file to process.
-        Returns:
-            ExtractionResult: Contains the extracted content in Markdown format,
-                the MIME type, and any additional metadata.
-        """
         return self._extract_pptx(content)
     def extract_path_sync(self, path: Path) -> ExtractionResult:
-        """Synchronously extract content from a PowerPoint file on disk.
-        Args:
-            path: Path to the PowerPoint file to process.
-        Returns:
-            ExtractionResult: Contains the extracted content in Markdown format,
-                the MIME type, and any additional metadata.
-        """
         content = Path(path).read_bytes()
         return self._extract_pptx(content)
     def _extract_pptx(self, file_contents: bytes) -> ExtractionResult:
-        """Process PowerPoint file contents and convert to Markdown.
-        This method handles the core logic of extracting content from a PowerPoint file.
-        It processes:
-        - Slide titles and content
-        - Images (with alt text if available)
-        - Tables (converted to HTML format)
-        - Text frames
-        - Slide notes
-        Args:
-            file_contents: Raw bytes of the PowerPoint file to process.
-        Returns:
-            ExtractionResult: Contains the extracted content in Markdown format,
-                the MIME type, and any additional metadata.
-        Notes:
-            The extraction preserves the following elements:
-            - Slide numbers (as HTML comments)
-            - Images (converted to Markdown image syntax with alt text)
-            - Tables (converted to HTML table syntax)
-            - Text content (with titles properly formatted)
-            - Slide notes (under a dedicated section for each slide)
-        """
         md_content = ""
         presentation = pptx.Presentation(BytesIO(file_contents))
@@ -181,14 +110,6 @@ class PresentationExtractor(Extractor):
     @staticmethod
     def _extract_presentation_metadata(presentation: Presentation) -> Metadata:
-        """Extract metadata from a presentation instance.
-        Args:
-            presentation: A `Presentation` object representing the PowerPoint file.
-        Returns:
-            PresentationMetadata: Object containing presentation-specific metadata fields.
-        """
         metadata: Metadata = {}
         PresentationExtractor._extract_core_properties(presentation, metadata)
@@ -203,7 +124,6 @@ class PresentationExtractor(Extractor):
     @staticmethod
     def _extract_core_properties(presentation: Presentation, metadata: Metadata) -> None:
-        """Extract core document properties from presentation."""
         property_mapping = [
             ("authors", "author"),
             ("comments", "comments"),
@@ -230,7 +150,6 @@ class PresentationExtractor(Extractor):
     @staticmethod
     def _extract_fonts(presentation: Presentation) -> set[str]:
-        """Extract all fonts used in the presentation."""
         fonts = set()
         for slide in presentation.slides:
             for shape in slide.shapes:
@@ -245,7 +164,6 @@ class PresentationExtractor(Extractor):
     @staticmethod
     def _add_presentation_structure_info(presentation: Presentation, metadata: Metadata, fonts: set[str]) -> None:
-        """Add structural information about the presentation."""
         slide_count = len(presentation.slides)
         if slide_count == 0:
             return

kreuzberg/_extractors/_spread_sheet.py CHANGED Viewed

@@ -72,7 +72,6 @@ class SpreadSheetExtractor(Extractor):
             ) from e
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        """Pure sync implementation of extract_bytes."""
         fd, temp_path = tempfile.mkstemp(suffix=".xlsx")
         try:
@@ -85,7 +84,6 @@ class SpreadSheetExtractor(Extractor):
                 Path(temp_path).unlink()
     def extract_path_sync(self, path: Path) -> ExtractionResult:
-        """Pure sync implementation of extract_path."""
         try:
             workbook = CalamineWorkbook.from_path(str(path))
             results = []

kreuzberg/_gmft.py CHANGED Viewed

@@ -31,23 +31,6 @@ if TYPE_CHECKING:
 async def extract_tables(
     file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
 ) -> list[TableData]:
-    """Extracts tables from a PDF file.
-    This function takes a file path to a PDF file, and an optional configuration object.
-    It returns a list of strings, where each string is a markdown-formatted table.
-    Args:
-        file_path: The path to the PDF file.
-        config: An optional configuration object.
-        use_isolated_process: Whether to use an isolated process for extraction.
-            If None, uses environment variable KREUZBERG_GMFT_ISOLATED (default: True).
-    Raises:
-        MissingDependencyError: Raised when the required dependencies are not installed.
-    Returns:
-        A list of table data dictionaries.
-    """
     # Determine if we should use isolated process  # ~keep
     if use_isolated_process is None:
         use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
@@ -164,17 +147,6 @@ async def extract_tables(
 def extract_tables_sync(
     file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
 ) -> list[TableData]:
-    """Synchronous wrapper for extract_tables.
-    Args:
-        file_path: The path to the PDF file.
-        config: An optional configuration object.
-        use_isolated_process: Whether to use an isolated process for extraction.
-            If None, uses environment variable KREUZBERG_GMFT_ISOLATED (default: True).
-    Returns:
-        A list of table data dictionaries.
-    """
     # Determine if we should use isolated process  # ~keep
     if use_isolated_process is None:
         use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
@@ -276,13 +248,6 @@ def _extract_tables_in_process(
     config_dict: dict[str, Any],
     result_queue: queue.Queue[tuple[bool, Any]],
 ) -> None:
-    """Extract tables in an isolated process to handle potential segfaults.
-    Args:
-        file_path: Path to the PDF file
-        config_dict: Serialized GMFTConfig as a dict
-        result_queue: Queue to put results or errors
-    """
     signal.signal(signal.SIGINT, signal.SIG_IGN)
     try:
@@ -366,19 +331,6 @@ def _extract_tables_isolated(
     config: GMFTConfig | None = None,
     timeout: float = 300.0,
 ) -> list[TableData]:
-    """Extract tables using an isolated process to handle segfaults.
-    Args:
-        file_path: Path to the PDF file
-        config: GMFT configuration
-        timeout: Maximum time to wait for extraction
-    Returns:
-        List of extracted tables
-    Raises:
-        RuntimeError: If extraction fails or times out
-    """
     config = config or GMFTConfig()
     config_dict = msgspec.to_builtins(config)
@@ -477,19 +429,6 @@ async def _extract_tables_isolated_async(
     config: GMFTConfig | None = None,
     timeout: float = 300.0,  # noqa: ASYNC109
 ) -> list[TableData]:
-    """Async version of extract_tables_isolated using asyncio.
-    Args:
-        file_path: Path to the PDF file
-        config: GMFT configuration
-        timeout: Maximum time to wait for extraction
-    Returns:
-        List of extracted tables
-    Raises:
-        RuntimeError: If extraction fails or times out
-    """
     config = config or GMFTConfig()
     config_dict = msgspec.to_builtins(config)

kreuzberg/_language_detection.py CHANGED Viewed

@@ -24,7 +24,6 @@ _CACHE_SIZE = 128
 def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangDetectConfig | None:
-    """Create FastLangDetectConfig from our config."""
     if not HAS_FAST_LANGDETECT or FastLangDetectConfig is None:
         return None
@@ -39,19 +38,6 @@ def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangD
 @lru_cache(maxsize=_CACHE_SIZE)
 def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -> list[str] | None:
-    """Detect the most probable languages in the given text using fast-langdetect.
-    Args:
-        text: The text to analyze.
-        config: Configuration for language detection. If None, uses defaults.
-    Returns:
-        A list of detected language codes in lowercase (e.g., ['en', 'de', 'fr']),
-        or None if detection fails.
-    Raises:
-        MissingDependencyError: If fast-langdetect is not installed.
-    """
     if not HAS_FAST_LANGDETECT or detect is None or detect_multilingual is None:
         raise MissingDependencyError.create_for_package(
             dependency_group="langdetect", functionality="language detection", package_name="fast-langdetect"

kreuzberg/_mime_types.py CHANGED Viewed

@@ -173,21 +173,6 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
 def validate_mime_type(
     *, file_path: PathLike[str] | str | None = None, mime_type: str | None = None, check_file_exists: bool = True
 ) -> str:
-    """Validate and detect the MIME type for a given file.
-    Args:
-        file_path: The path to the file.
-        mime_type: Optional explicit MIME type. If provided, this will be validated.
-            If not provided, the function will attempt to detect the MIME type.
-        check_file_exists: Whether to check if the file exists. Default is True.
-            Set to False in tests where you want to validate a mime type without an actual file.
-    Raises:
-        ValidationError: If the MIME type is not supported or cannot be determined.
-    Returns:
-        The validated MIME type.
-    """
     if mime_type:
         return _validate_explicit_mime_type(mime_type)
@@ -227,7 +212,6 @@ def validate_mime_type(
 def _validate_explicit_mime_type(mime_type: str) -> str:
-    """Validate an explicitly provided MIME type."""
     if mime_type in SUPPORTED_MIME_TYPES:
         return mime_type
@@ -242,7 +226,6 @@ def _validate_explicit_mime_type(mime_type: str) -> str:
 def _detect_mime_type_uncached(file_path: PathLike[str] | str | None = None, check_file_exists: bool = True) -> str:
-    """Detect MIME type without caching (internal function)."""
     if file_path and check_file_exists:
         path = Path(file_path)
         if not path.exists():

kreuzberg/_ocr/_base.py CHANGED Viewed

@@ -16,98 +16,26 @@ T = TypeVar("T")
 class OCRBackend(ABC, Generic[T]):
-    """Abstract base class for Optical Character Recognition (OCR) backend implementations.
-    This class provides the blueprint for OCR backend implementations,
-    offering both synchronous and asynchronous methods to process images
-    and files for text extraction.
-    """
     @abstractmethod
-    async def process_image(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult:
-        """Asynchronously process an image and extract its text and metadata.
-        Args:
-            image: An instance of PIL.Image representing the input image.
-            **kwargs: Any kwargs related to the given backend
-        Returns:
-            The extraction result object
-        """
-        ...
+    async def process_image(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult: ...
     @abstractmethod
-    async def process_file(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult:
-        """Asynchronously process a file and extract its text and metadata.
-        Args:
-            path: A Path object representing the file to be processed.
-            **kwargs: Any kwargs related to the given backend
-        Returns:
-            The extraction result object
-        """
-        ...
+    async def process_file(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult: ...
     @abstractmethod
-    def process_image_sync(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult:
-        """Synchronously process an image and extract its text and metadata.
-        Args:
-            image: An instance of PIL.Image representing the input image.
-            **kwargs: Any kwargs related to the given backend
-        Returns:
-            The extraction result object
-        """
-        ...
+    def process_image_sync(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult: ...
     @abstractmethod
-    def process_file_sync(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult:
-        """Synchronously process a file and extract its text and metadata.
-        Args:
-            path: A Path object representing the file to be processed.
-            **kwargs: Any kwargs related to the given backend
-        Returns:
-            The extraction result object
-        """
-        ...
+    def process_file_sync(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult: ...
     def process_batch_sync(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
-        """Synchronously process a batch of files and extract their text and metadata.
-        Default implementation processes files sequentially. Backends can override
-        for more efficient batch processing.
-        Args:
-            paths: List of Path objects representing files to be processed.
-            **kwargs: Any kwargs related to the given backend
-        Returns:
-            List of extraction result objects in the same order as input paths
-        """
         return [self.process_file_sync(path, **kwargs) for path in paths]  # pragma: no cover
     async def process_batch(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
-        """Asynchronously process a batch of files and extract their text and metadata.
-        Default implementation processes files concurrently. Backends can override
-        for more efficient batch processing.
-        Args:
-            paths: List of Path objects representing files to be processed.
-            **kwargs: Any kwargs related to the given backend
-        Returns:
-            List of extraction result objects in the same order as input paths
-        """
         from kreuzberg._utils._sync import run_taskgroup  # noqa: PLC0415
         tasks = [self.process_file(path, **kwargs) for path in paths]
         return await run_taskgroup(*tasks)  # pragma: no cover
     def __hash__(self) -> int:
-        """Hash function for allowing caching."""
         return hash(type(self).__name__)  # pragma: no cover

kreuzberg 3.13.0__py3-none-any.whl → 3.13.2__py3-none-any.whl

kreuzberg 3.13.0__py3-none-any.whl → 3.13.2__py3-none-any.whl