PyPI - kreuzberg - Versions diffs - 3.2.0__py3-none-any.whl → 3.3.0__py3-none-any.whl - Mend

kreuzberg 3.2.0__py3-none-any.whl → 3.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

kreuzberg/__init__.py +3 -0
kreuzberg/__main__.py +8 -0
kreuzberg/_cli_config.py +175 -0
kreuzberg/_extractors/_image.py +39 -4
kreuzberg/_extractors/_pandoc.py +158 -18
kreuzberg/_extractors/_pdf.py +199 -19
kreuzberg/_extractors/_presentation.py +1 -1
kreuzberg/_extractors/_spread_sheet.py +65 -7
kreuzberg/_gmft.py +222 -16
kreuzberg/_mime_types.py +62 -16
kreuzberg/_multiprocessing/__init__.py +6 -0
kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
kreuzberg/_multiprocessing/process_manager.py +188 -0
kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
kreuzberg/_ocr/_easyocr.py +6 -12
kreuzberg/_ocr/_paddleocr.py +15 -13
kreuzberg/_ocr/_tesseract.py +136 -46
kreuzberg/_playa.py +43 -0
kreuzberg/_utils/_cache.py +372 -0
kreuzberg/_utils/_device.py +10 -27
kreuzberg/_utils/_document_cache.py +220 -0
kreuzberg/_utils/_errors.py +232 -0
kreuzberg/_utils/_pdf_lock.py +72 -0
kreuzberg/_utils/_process_pool.py +100 -0
kreuzberg/_utils/_serialization.py +82 -0
kreuzberg/_utils/_string.py +1 -1
kreuzberg/_utils/_sync.py +21 -0
kreuzberg/cli.py +338 -0
kreuzberg/extraction.py +247 -36
{kreuzberg-3.2.0.dist-info → kreuzberg-3.3.0.dist-info}/METADATA +93 -24
kreuzberg-3.3.0.dist-info/RECORD +48 -0
{kreuzberg-3.2.0.dist-info → kreuzberg-3.3.0.dist-info}/WHEEL +1 -2
kreuzberg-3.3.0.dist-info/entry_points.txt +2 -0
kreuzberg-3.2.0.dist-info/RECORD +0 -34
kreuzberg-3.2.0.dist-info/top_level.txt +0 -1
{kreuzberg-3.2.0.dist-info → kreuzberg-3.3.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/extraction.py CHANGED Viewed

@@ -13,7 +13,8 @@ from kreuzberg._mime_types import (
 from kreuzberg._registry import ExtractorRegistry
 from kreuzberg._types import ExtractionConfig
 from kreuzberg._utils._string import safe_decode
-from kreuzberg._utils._sync import run_maybe_async, run_maybe_sync
+from kreuzberg._utils._sync import run_maybe_sync, run_sync_only
+from kreuzberg.exceptions import ValidationError
 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -42,7 +43,7 @@ async def _validate_and_post_process_async(result: ExtractionResult, config: Ext
 def _validate_and_post_process_sync(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
     for validator in config.validators or []:
-        run_maybe_async(validator, result)
+        run_sync_only(validator, result)
     if config.chunk_content:
         result.chunks = _handle_chunk_content(
@@ -52,7 +53,7 @@ def _validate_and_post_process_sync(result: ExtractionResult, config: Extraction
         )
     for post_processor in config.post_processing_hooks or []:
-        result = run_maybe_async(post_processor, result)
+        result = run_sync_only(post_processor, result)
     return result
@@ -104,22 +105,57 @@ async def extract_file(
     Returns:
         The extracted content and the mime type of the content.
+    Raises:
+        ValidationError: If the file path or configuration is invalid.
     """
-    mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
-    if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
-        result = await extractor.extract_path_async(Path(file_path))
-    else:
-        result = ExtractionResult(
-            content=safe_decode(await anyio.Path(file_path).read_bytes()), chunks=[], mime_type=mime_type, metadata={}
-        )
+    from kreuzberg._utils._document_cache import get_document_cache
-    return await _validate_and_post_process_async(result=result, config=config)
+    cache = get_document_cache()
+    path = Path(file_path)
+    cached_result = cache.get(path, config)
+    if cached_result is not None:
+        return cached_result
+    if cache.is_processing(path, config):
+        event = cache.mark_processing(path, config)
+        await anyio.to_thread.run_sync(event.wait)  # pragma: no cover
+        # Try cache again after waiting for other process to complete  # ~keep
+        cached_result = cache.get(path, config)  # pragma: no cover
+        if cached_result is not None:  # pragma: no cover
+            return cached_result
+    cache.mark_processing(path, config)
+    try:
+        if not path.exists():
+            raise ValidationError("The file does not exist", context={"file_path": str(path)})
+        mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
+        if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
+            result = await extractor.extract_path_async(Path(file_path))
+        else:
+            result = ExtractionResult(
+                content=safe_decode(await anyio.Path(file_path).read_bytes()),
+                chunks=[],
+                mime_type=mime_type,
+                metadata={},
+            )
+        result = await _validate_and_post_process_async(result=result, config=config)
+        cache.set(path, config, result)
+        return result
+    finally:
+        cache.mark_complete(path, config)
 async def batch_extract_file(
     file_paths: Sequence[PathLike[str] | str], config: ExtractionConfig = DEFAULT_CONFIG
 ) -> list[ExtractionResult]:
-    """Extract text from multiple files concurrently.
+    """Extract text from multiple files concurrently with optimizations.
     Args:
         file_paths: A sequence of paths to files to extract text from.
@@ -128,15 +164,43 @@ async def batch_extract_file(
     Returns:
         A list of extraction results in the same order as the input paths.
     """
+    if not file_paths:
+        return []
+    import multiprocessing as mp
+    max_concurrency = min(len(file_paths), mp.cpu_count() * 2)
+    semaphore = anyio.Semaphore(max_concurrency)
     results = cast("list[ExtractionResult]", ([None] * len(file_paths)))
     async def _extract_file(path: PathLike[str] | str, index: int) -> None:
-        result = await extract_file(
-            path,
-            None,
-            config,
-        )
-        results[index] = result
+        async with semaphore:
+            try:
+                result = await extract_file(
+                    path,
+                    None,
+                    config,
+                )
+                results[index] = result
+            except Exception as e:  # noqa: BLE001
+                from kreuzberg._utils._errors import create_error_context
+                error_result = ExtractionResult(
+                    content=f"Error: {type(e).__name__}: {e!s}",
+                    mime_type="text/plain",
+                    metadata={  # type: ignore[typeddict-unknown-key]
+                        "error": True,
+                        "error_context": create_error_context(
+                            operation="batch_extract_file",
+                            file_path=path,
+                            error=e,
+                            index=index,
+                        ),
+                    },
+                    chunks=[],
+                )
+                results[index] = error_result
     async with anyio.create_task_group() as tg:
         for i, path in enumerate(file_paths):
@@ -148,7 +212,7 @@ async def batch_extract_file(
 async def batch_extract_bytes(
     contents: Sequence[tuple[bytes, str]], config: ExtractionConfig = DEFAULT_CONFIG
 ) -> list[ExtractionResult]:
-    """Extract text from multiple byte contents concurrently.
+    """Extract text from multiple byte contents concurrently with optimizations.
     Args:
         contents: A sequence of tuples containing (content, mime_type) pairs.
@@ -157,11 +221,40 @@ async def batch_extract_bytes(
     Returns:
         A list of extraction results in the same order as the input contents.
     """
+    if not contents:
+        return []
+    import multiprocessing as mp
+    max_concurrency = min(len(contents), mp.cpu_count() * 2)
+    semaphore = anyio.Semaphore(max_concurrency)
     results = cast("list[ExtractionResult]", [None] * len(contents))
     async def _extract_bytes(content: bytes, mime_type: str, index: int) -> None:
-        result = await extract_bytes(content, mime_type, config)
-        results[index] = result
+        async with semaphore:
+            try:
+                result = await extract_bytes(content, mime_type, config)
+                results[index] = result
+            except Exception as e:  # noqa: BLE001
+                from kreuzberg._utils._errors import create_error_context
+                error_result = ExtractionResult(
+                    content=f"Error: {type(e).__name__}: {e!s}",
+                    mime_type="text/plain",
+                    metadata={  # type: ignore[typeddict-unknown-key]
+                        "error": True,
+                        "error_context": create_error_context(
+                            operation="batch_extract_bytes",
+                            error=e,
+                            index=index,
+                            mime_type=mime_type,
+                            content_size=len(content),
+                        ),
+                    },
+                    chunks=[],
+                )
+                results[index] = error_result
     async with anyio.create_task_group() as tg:
         for i, (content, mime_type) in enumerate(contents):
@@ -207,24 +300,57 @@ def extract_file_sync(
     Returns:
         The extracted content and the mime type of the content.
+    Raises:
+        ValidationError: If the file path or configuration is invalid.
     """
-    mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
-    if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
-        result = extractor.extract_path_sync(Path(file_path))
-    else:
-        result = ExtractionResult(
-            content=Path(file_path).read_text(),
-            chunks=[],
-            mime_type=mime_type,
-            metadata={},
-        )
-    return _validate_and_post_process_sync(result=result, config=config)
+    from kreuzberg._utils._document_cache import get_document_cache
+    cache = get_document_cache()
+    path = Path(file_path)
+    cached_result = cache.get(path, config)
+    if cached_result is not None:
+        return cached_result
+    if cache.is_processing(path, config):
+        event = cache.mark_processing(path, config)
+        event.wait()  # pragma: no cover
+        # Try cache again after waiting for other process to complete  # ~keep
+        cached_result = cache.get(path, config)  # pragma: no cover
+        if cached_result is not None:  # pragma: no cover
+            return cached_result
+    cache.mark_processing(path, config)
+    try:
+        if not path.exists():
+            raise ValidationError("The file does not exist", context={"file_path": str(path)})
+        mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
+        if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
+            result = extractor.extract_path_sync(Path(file_path))
+        else:
+            result = ExtractionResult(
+                content=Path(file_path).read_text(),
+                chunks=[],
+                mime_type=mime_type,
+                metadata={},
+            )
+        result = _validate_and_post_process_sync(result=result, config=config)
+        cache.set(path, config, result)
+        return result
+    finally:
+        cache.mark_complete(path, config)
 def batch_extract_file_sync(
     file_paths: Sequence[PathLike[str] | str], config: ExtractionConfig = DEFAULT_CONFIG
 ) -> list[ExtractionResult]:
-    """Synchronous version of batch_extract_file.
+    """Synchronous version of batch_extract_file with parallel processing.
     Args:
         file_paths: A sequence of paths to files to extract text from.
@@ -233,13 +359,54 @@ def batch_extract_file_sync(
     Returns:
         A list of extraction results in the same order as the input paths.
     """
-    return [extract_file_sync(file_path=Path(file_path), mime_type=None, config=config) for file_path in file_paths]
+    if len(file_paths) <= 1:
+        return [extract_file_sync(file_path=Path(file_path), mime_type=None, config=config) for file_path in file_paths]
+    import multiprocessing as mp
+    from concurrent.futures import ThreadPoolExecutor, as_completed
+    max_workers = min(len(file_paths), mp.cpu_count())
+    def extract_single(file_path: PathLike[str] | str) -> tuple[int, ExtractionResult]:
+        """Extract single file with index for ordering."""
+        try:
+            return (
+                file_paths.index(file_path),
+                extract_file_sync(file_path=Path(file_path), mime_type=None, config=config),
+            )
+        except Exception as e:  # noqa: BLE001
+            from kreuzberg._utils._errors import create_error_context
+            error_result = ExtractionResult(
+                content=f"Error: {type(e).__name__}: {e!s}",
+                mime_type="text/plain",
+                metadata={  # type: ignore[typeddict-unknown-key]
+                    "error": True,
+                    "error_context": create_error_context(
+                        operation="batch_extract_file_sync",
+                        file_path=file_path,
+                        error=e,
+                    ),
+                },
+                chunks=[],
+            )
+            return (file_paths.index(file_path), error_result)
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_index = {executor.submit(extract_single, fp): i for i, fp in enumerate(file_paths)}
+        results: list[ExtractionResult] = [None] * len(file_paths)  # type: ignore[list-item]
+        for future in as_completed(future_to_index):
+            index, result = future.result()
+            results[index] = result
+    return results
 def batch_extract_bytes_sync(
     contents: Sequence[tuple[bytes, str]], config: ExtractionConfig = DEFAULT_CONFIG
 ) -> list[ExtractionResult]:
-    """Synchronous version of batch_extract_bytes.
+    """Synchronous version of batch_extract_bytes with parallel processing.
     Args:
         contents: A sequence of tuples containing (content, mime_type) pairs.
@@ -248,4 +415,48 @@ def batch_extract_bytes_sync(
     Returns:
         A list of extraction results in the same order as the input contents.
     """
-    return [extract_bytes_sync(content=content, mime_type=mime_type, config=config) for content, mime_type in contents]
+    if len(contents) <= 1:
+        return [
+            extract_bytes_sync(content=content, mime_type=mime_type, config=config) for content, mime_type in contents
+        ]
+    import multiprocessing as mp
+    from concurrent.futures import ThreadPoolExecutor, as_completed
+    max_workers = min(len(contents), mp.cpu_count())
+    def extract_single(index_and_content: tuple[int, tuple[bytes, str]]) -> tuple[int, ExtractionResult]:
+        """Extract single content with index for ordering."""
+        index, (content, mime_type) = index_and_content
+        try:
+            return (index, extract_bytes_sync(content=content, mime_type=mime_type, config=config))
+        except Exception as e:  # noqa: BLE001
+            from kreuzberg._utils._errors import create_error_context
+            error_result = ExtractionResult(
+                content=f"Error: {type(e).__name__}: {e!s}",
+                mime_type="text/plain",
+                metadata={  # type: ignore[typeddict-unknown-key]
+                    "error": True,
+                    "error_context": create_error_context(
+                        operation="batch_extract_bytes_sync",
+                        error=e,
+                        index=index,
+                        mime_type=mime_type,
+                        content_size=len(content),
+                    ),
+                },
+                chunks=[],
+            )
+            return (index, error_result)
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        indexed_contents = list(enumerate(contents))
+        future_to_index = {executor.submit(extract_single, ic): i for i, ic in enumerate(indexed_contents)}
+        results: list[ExtractionResult] = [None] * len(contents)  # type: ignore[list-item]
+        for future in as_completed(future_to_index):
+            index, result = future.result()
+            results[index] = result
+    return results

{kreuzberg-3.2.0.dist-info → kreuzberg-3.3.0.dist-info}/METADATA RENAMED Viewed

@@ -1,56 +1,60 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.2.0
+Version: 3.3.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
+Project-URL: homepage, https://github.com/Goldziher/kreuzberg
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
-Project-URL: homepage, https://github.com/Goldziher/kreuzberg
+License-File: LICENSE
 Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Topic :: Text Processing :: General
 Classifier: Topic :: Utilities
 Classifier: Typing :: Typed
-Requires-Python: >=3.9
-Description-Content-Type: text/markdown
-License-File: LICENSE
+Requires-Python: >=3.13
 Requires-Dist: anyio>=4.9.0
 Requires-Dist: charset-normalizer>=3.4.2
-Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
+Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
 Requires-Dist: html-to-markdown>=1.4.0
+Requires-Dist: msgspec>=0.18.0
 Requires-Dist: playa-pdf>=0.6.1
+Requires-Dist: psutil>=7.0.0
 Requires-Dist: pypdfium2==4.30.0
 Requires-Dist: python-calamine>=0.3.2
 Requires-Dist: python-pptx>=1.0.2
-Requires-Dist: typing-extensions>=4.14.0; python_version < "3.12"
+Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
 Provides-Extra: all
-Requires-Dist: easyocr>=1.7.2; extra == "all"
-Requires-Dist: gmft>=0.4.1; extra == "all"
-Requires-Dist: paddleocr>=3.0.2; extra == "all"
-Requires-Dist: paddlepaddle>=3.0.0; extra == "all"
-Requires-Dist: semantic-text-splitter>=0.27.0; extra == "all"
-Requires-Dist: setuptools>=80.9.0; extra == "all"
+Requires-Dist: click>=8.2.1; extra == 'all'
+Requires-Dist: easyocr>=1.7.2; extra == 'all'
+Requires-Dist: gmft>=0.4.2; extra == 'all'
+Requires-Dist: paddleocr>=3.1.0; extra == 'all'
+Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
+Requires-Dist: rich>=14.0.0; extra == 'all'
+Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
+Requires-Dist: setuptools>=80.9.0; extra == 'all'
+Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
 Provides-Extra: chunking
-Requires-Dist: semantic-text-splitter>=0.27.0; extra == "chunking"
+Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
+Provides-Extra: cli
+Requires-Dist: click>=8.2.1; extra == 'cli'
+Requires-Dist: rich>=14.0.0; extra == 'cli'
+Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
 Provides-Extra: easyocr
-Requires-Dist: easyocr>=1.7.2; extra == "easyocr"
+Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
 Provides-Extra: gmft
-Requires-Dist: gmft>=0.4.1; extra == "gmft"
+Requires-Dist: gmft>=0.4.2; extra == 'gmft'
 Provides-Extra: paddleocr
-Requires-Dist: paddleocr>=3.0.2; extra == "paddleocr"
-Requires-Dist: paddlepaddle>=3.0.0; extra == "paddleocr"
-Requires-Dist: setuptools>=80.9.0; extra == "paddleocr"
-Dynamic: license-file
+Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
+Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
+Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
+Description-Content-Type: text/markdown
 # Kreuzberg
@@ -68,6 +72,7 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
 - **Resource Efficient**: Lightweight processing without GPU requirements
 - **Format Support**: Comprehensive support for documents, images, and text formats
 - **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
+- **Command Line Interface**: Powerful CLI for batch processing and automation
 - **Metadata Extraction**: Get document metadata alongside text content
 - **Table Extraction**: Extract tables from documents using the excellent GMFT library
 - **Modern Python**: Built with async/await, type hints, and a functional-first approach
@@ -77,6 +82,9 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
 ```bash
 pip install kreuzberg
+# Or install with CLI support
+pip install "kreuzberg[cli]"
 ```
 Install pandoc:
@@ -126,12 +134,53 @@ async def main():
 asyncio.run(main())
 ```
+## Command Line Interface
+Kreuzberg includes a powerful CLI for processing documents from the command line:
+```bash
+# Extract text from a file
+kreuzberg extract document.pdf
+# Extract with JSON output and metadata
+kreuzberg extract document.pdf --output-format json --show-metadata
+# Extract from stdin
+cat document.html | kreuzberg extract
+# Use specific OCR backend
+kreuzberg extract image.png --ocr-backend easyocr --easyocr-languages en,de
+# Extract with configuration file
+kreuzberg extract document.pdf --config config.toml
+```
+### CLI Configuration
+Configure via `pyproject.toml`:
+```toml
+[tool.kreuzberg]
+force_ocr = true
+chunk_content = false
+extract_tables = true
+max_chars = 4000
+ocr_backend = "tesseract"
+[tool.kreuzberg.tesseract]
+language = "eng+deu"
+psm = 3
+```
+For full CLI documentation, see the [CLI Guide](https://goldziher.github.io/kreuzberg/cli/).
 ## Documentation
 For comprehensive documentation, visit our [GitHub Pages](https://goldziher.github.io/kreuzberg/):
 - [Getting Started](https://goldziher.github.io/kreuzberg/getting-started/) - Installation and basic usage
 - [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - In-depth usage information
+- [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line interface documentation
 - [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Detailed API documentation
 - [Examples](https://goldziher.github.io/kreuzberg/examples/) - Code examples for common use cases
 - [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - Configure OCR engines
@@ -157,6 +206,26 @@ Kreuzberg supports multiple OCR engines:
 For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
+## Performance
+Kreuzberg offers both sync and async APIs. Choose the right one based on your use case:
+| Operation              | Sync Time | Async Time | Async Advantage    |
+| ---------------------- | --------- | ---------- | ------------------ |
+| Simple text (Markdown) | 0.4ms     | 17.5ms     | **❌ 41x slower**  |
+| HTML documents         | 1.6ms     | 1.1ms      | **✅ 1.5x faster** |
+| Complex PDFs           | 39.0s     | 8.5s       | **✅ 4.6x faster** |
+| OCR processing         | 0.4s      | 0.7s       | **✅ 1.7x faster** |
+| Batch operations       | 38.6s     | 8.5s       | **✅ 4.5x faster** |
+**Rule of thumb:**
+- Use **sync** for simple documents and CLI applications
+- Use **async** for complex PDFs, OCR, and batch processing
+- Use **batch operations** for multiple files
+For detailed benchmarks and methodology, see our [Performance Documentation](https://goldziher.github.io/kreuzberg/advanced/performance/).
 ## Contributing
 We welcome contributions! Please see our [Contributing Guide](docs/contributing.md) for details on setting up your development environment and submitting pull requests.

kreuzberg-3.3.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,48 @@
+kreuzberg/__init__.py,sha256=jRm2U-loiKWwJpgOFgZ8Ev2mfz9sI1qJOZ2V3OoJUlg,1258
+kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
+kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
+kreuzberg/_cli_config.py,sha256=WD_seFjbuay_NJv77vGLBW6BVV9WZNujdzf3zQkhzPc,5691
+kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
+kreuzberg/_gmft.py,sha256=6liCjedPxH5Xbe7V-AmrZIq5Y9Dejn7D-LSCbgYs2Sg,14762
+kreuzberg/_mime_types.py,sha256=QgX-k8aI4lTKArObDM0TFPt7DUjUVwWrdIaIZDh_XQY,7815
+kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
+kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
+kreuzberg/_types.py,sha256=G7UQ5ZUWcpgwHoasexW7f2te3gKe3PHHi_3Fm1cju-w,7503
+kreuzberg/cli.py,sha256=S0w2nGXBWPFn1NhxppW7dpUwB9f_3ymFuWSAB2aRu9g,12465
+kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
+kreuzberg/extraction.py,sha256=z8sht8Yw9v6bE_WgLdWx-phu4T58eExME296DV_41VU,16551
+kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
+kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
+kreuzberg/_extractors/_image.py,sha256=Vks6WEDoW5AlGqIGVSeuhZzvJNwS8V6wxeD46Fxxogw,3947
+kreuzberg/_extractors/_pandoc.py,sha256=oQ4DgQSPoX1LXjGAKh_A40JHqiKWb91LeRBYSS_6EUA,26750
+kreuzberg/_extractors/_pdf.py,sha256=qgYwGvAlvyZzb94lXGcKGIhzmSFpP6YGzYc7fs8b-yw,13432
+kreuzberg/_extractors/_presentation.py,sha256=ZX-EKQppHwvKtyKk0-IQVF6QAqJi0SfGgCiiyqMQh0w,8701
+kreuzberg/_extractors/_spread_sheet.py,sha256=ToLZIK_PO72IYbsdtSQkHOwTUhDwptjOfSX--e1UdSM,6487
+kreuzberg/_multiprocessing/__init__.py,sha256=nwYQpKH7ixHwzkQbTMFCstOCBKktmbNq5dTrwI2Mn94,203
+kreuzberg/_multiprocessing/gmft_isolated.py,sha256=wpZ5br5dL9P6hhGjAYckHbz8IvXrDdEvajJ7fxbFmAU,11199
+kreuzberg/_multiprocessing/process_manager.py,sha256=dvO9JBWYnH1KCpzwn9h3Tz-wAoihMwTLE6OS-DF_sK0,6030
+kreuzberg/_multiprocessing/sync_tesseract.py,sha256=Ck1PvHGWOMQWUcC7RyVrBt8K9VDFQ0lQcwFkwYzl3rE,8240
+kreuzberg/_multiprocessing/tesseract_pool.py,sha256=UN7BtS_ib1ux9xuR6d6AB3PY7UEUhd-5Ti1n1H0UnYw,10945
+kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
+kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
+kreuzberg/_ocr/_easyocr.py,sha256=QSd6Bw7RBsOyL5ry-6lFLD7gJxcpK1P3AD_RRK4TPWs,13734
+kreuzberg/_ocr/_paddleocr.py,sha256=UvugDdZd7RojHUiFeBaI8aqz36ecegPLj2v6oT6c42g,13776
+kreuzberg/_ocr/_tesseract.py,sha256=NAHklkHvDKMgHVqjhgYfxC3DIJuQn8fXPkvnmQxUiV8,12784
+kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+kreuzberg/_utils/_cache.py,sha256=JGiwwcNBoD950IbsPUUAD5gAGS7byUuz0BqYSneVakc,13088
+kreuzberg/_utils/_device.py,sha256=Dk4g-LzUMJ-WMM-9czNQJj3mUI43l2w7t6MJcERYb2U,10264
+kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
+kreuzberg/_utils/_errors.py,sha256=AV3oaRQDgJxe1YUZd9pCQUysUv9KW8Ib37MvnyFOZ4o,6386
+kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
+kreuzberg/_utils/_process_pool.py,sha256=7n5UN3d-xeYHU5TiRI62u-JenERPinJzFhbRUq-zL9k,2895
+kreuzberg/_utils/_serialization.py,sha256=AhZvyAu4KsjAqyZDh--Kn2kSWGgCuH7udio8lTklO0g,2132
+kreuzberg/_utils/_string.py,sha256=owIVkUtP0__GiJD9RIJzPdvyIigT5sQho3mOXPbsnW0,958
+kreuzberg/_utils/_sync.py,sha256=IsKkR_YmseZKY6Asz6w3k-dgMXcrVaI06jWfDY7Bol4,4842
+kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
+kreuzberg-3.3.0.dist-info/METADATA,sha256=beRlFJzCsZNcQ_DsRyzRc2WDT-UkBCfBvY6vTWiOxp0,8748
+kreuzberg-3.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+kreuzberg-3.3.0.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
+kreuzberg-3.3.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-3.3.0.dist-info/RECORD,,

{kreuzberg-3.2.0.dist-info → kreuzberg-3.3.0.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,4 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: hatchling 1.27.0
 Root-Is-Purelib: true
 Tag: py3-none-any

kreuzberg-3.3.0.dist-info/entry_points.txt ADDED Viewed

@@ -0,0 +1,2 @@
+[console_scripts]
+kreuzberg = kreuzberg.cli:cli

kreuzberg-3.2.0.dist-info/RECORD DELETED Viewed

@@ -1,34 +0,0 @@
-kreuzberg/__init__.py,sha256=lT9OwIdf5CEhSX7IVmtSFPgRhz6B2z2A-RE8Zdm0PH4,1216
-kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
-kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
-kreuzberg/_gmft.py,sha256=qLhfepQuaROjPOdI-tDRqqqnOcqDY1D411ZXzoywnpg,7229
-kreuzberg/_mime_types.py,sha256=pKtxBPDoye2knyou_VODDMPIt3eXotP-ak4MAKFI2SU,6310
-kreuzberg/_playa.py,sha256=agHdhKfKLNtiP37XdNncbCP65v3Qv3m1Gn2KTRUkVx8,10396
-kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
-kreuzberg/_types.py,sha256=G7UQ5ZUWcpgwHoasexW7f2te3gKe3PHHi_3Fm1cju-w,7503
-kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
-kreuzberg/extraction.py,sha256=0sjvbunx5srbR5lzjOAQjGK5JY3bCUHw-dRFmHjFz7o,8671
-kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
-kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
-kreuzberg/_extractors/_image.py,sha256=VQgSFSzXIMX3A52-DyvuKgfTRXUJIjYn6IX4-sQWWdg,2626
-kreuzberg/_extractors/_pandoc.py,sha256=OAbWvfzEx3rjim9uNMS9yBRnvkI71rYJKlgVzndsvyc,22157
-kreuzberg/_extractors/_pdf.py,sha256=eNFws_UxLgWSTC_VC_zJmVojpyQvioOXgNjSHQzBq5c,6607
-kreuzberg/_extractors/_presentation.py,sha256=7W6RHTk-zksuHoSk0i6UaSBf5NatnPo17MxepQoI6XI,8758
-kreuzberg/_extractors/_spread_sheet.py,sha256=1ejRZk8AE1dXS1tRIdg2S0J9Vo0wG81iKkW2IF6PjlE,4445
-kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
-kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
-kreuzberg/_ocr/_easyocr.py,sha256=1OG2IbLdg4cXouV0FVzMnCkYYh6GN1pvXqXWw40PUz8,14054
-kreuzberg/_ocr/_paddleocr.py,sha256=K6D3B2cn-JIhipI5UHMa0Kn2M-GKtyUFCahs8wJQZcA,13855
-kreuzberg/_ocr/_tesseract.py,sha256=KcJMK4o__2H2ftibk1lC7HVqEfpaE_jVZgLhUXkxTvk,9773
-kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kreuzberg/_utils/_device.py,sha256=Ja28S2psgEwWzjdO05ZI11RFb3MSlUZDT19sC4SAyVE,10955
-kreuzberg/_utils/_string.py,sha256=oNO0cmwjVNG0jAzaqNCjYtzvM_nxH5TW2KV-Uh3oEUU,978
-kreuzberg/_utils/_sync.py,sha256=lycobEMXk0tBMWLwkuMdOuNMStDwPKMC0V1Qgp_oi6k,4071
-kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
-kreuzberg-3.2.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
-kreuzberg-3.2.0.dist-info/METADATA,sha256=xffQAGQur7sCgUT9RDqZpfkYTdthsuYIhCvbUDKFnmA,6504
-kreuzberg-3.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-kreuzberg-3.2.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
-kreuzberg-3.2.0.dist-info/RECORD,,

kreuzberg-3.2.0.dist-info/top_level.txt DELETED Viewed

@@ -1 +0,0 @@
-kreuzberg

{kreuzberg-3.2.0.dist-info → kreuzberg-3.3.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

kreuzberg 3.2.0__py3-none-any.whl → 3.3.0__py3-none-any.whl

kreuzberg 3.2.0__py3-none-any.whl → 3.3.0__py3-none-any.whl