PyPI - kreuzberg - Versions diffs - 3.14.1__tar.gz → 3.16.0__tar.gz - Mend

kreuzberg 3.14.1__tar.gz → 3.16.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (311)

{kreuzberg-3.14.1 → kreuzberg-3.16.0}/.github/workflows/ci.yaml RENAMED Viewed

@@ -212,7 +212,7 @@ jobs:
         uses: actions/checkout@v5
       - name: Download Coverage Artifacts
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v5
         with:
           pattern: coverage-*-${{ github.sha }}
           merge-multiple: true

{kreuzberg-3.14.1 → kreuzberg-3.16.0}/.github/workflows/docker-e2e-tests.yml RENAMED Viewed

@@ -99,7 +99,7 @@ jobs:
         run: |
           mkdir -p tests/e2e/logs
           echo "Running E2E tests for ${{ matrix.image.name }}..."
-          python3 tests/e2e/docker_e2e_test.py --image ${{ matrix.image.name }}
+          python3 tests/e2e/docker_e2e.py --image ${{ matrix.image.name }}
       - name: Generate test report - ${{ matrix.image.name }}
         if: always()

{kreuzberg-3.14.1 → kreuzberg-3.16.0}/.gitignore RENAMED Viewed

@@ -1,18 +1,20 @@
 *$py.class
 *.Cache
-.clause/
 *.cscfg
 *.egg-info/
 *.log
 *.py[cod]
 *.suo
+*.tar.gz
+*.temp
+*.tmp
 *.user
+*.whl
 *temp/
+.cache/
+.claude/
 .coverage
 .coverage*
-coverage.lcov
-htmlcov/
-.claude/
 .cursorrules
 .dist/
 .DS_store
@@ -20,47 +22,47 @@ htmlcov/
 .idea/
 .kreuzberg/
 .mypy_cache/
+.nox/
 .pytest_cache/
 .python-version
 .ropeproject
 .ruff_cache/
 .run/
+.task/
+.tmp/
+.tox/
 .venv/
 .vscode/
 .windsurfrules
 __pycache__/
+AGENTS.md
 benchmark_results.json
+benchmarks/results/
+build/
 CLAUDE.md
+coverage.lcov
 coverage.xml
+dist/
 docker-compose.yaml
+docs/_build/
+docs/build/
 GEMINI.md
+htmlcov/
+node_modules/
+npm-debug.log*
+output.txt
 prompt_template.egg-info/
 requirements.txt
+share/python-wheels/
 site/
-.cache/
-dist/
-build/
-.task/
-tests/e2e/test_report.json
+test_report.json
 tests/e2e/logs/
-# Additional build artifacts
-*.whl
-*.tar.gz
-.tox/
-.nox/
+tests/e2e/test_report.json
 wheels/
-share/python-wheels/
-# Documentation builds
-docs/_build/
-docs/build/
-# Node.js (if any frontend tools are used)
-node_modules/
-npm-debug.log*
 yarn-debug.log*
 yarn-error.log*
+todo.md
+TODO.md
 # Temporary files
 *.tmp
@@ -69,3 +71,4 @@ yarn-error.log*
 # AI Rules generated files
 .claude/agents/
+AGENTS.md

{kreuzberg-3.14.1 → kreuzberg-3.16.0}/.pre-commit-config.yaml RENAMED Viewed

@@ -11,7 +11,7 @@ repos:
       - id: name-tests-test
         args:
           - --pytest
-        exclude: factories|test_utils|completion.py|test_data
+        exclude: factories|test_utils|completion.py|test_data|docker_e2e.py
       - id: trailing-whitespace
       - id: end-of-file-fixer
       - id: check-toml

{kreuzberg-3.14.1 → kreuzberg-3.16.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.14.1
+Version: 3.16.0
 Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
 Project-URL: documentation, https://kreuzberg.dev
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -31,7 +31,7 @@ Requires-Python: >=3.10
 Requires-Dist: anyio>=4.10.0
 Requires-Dist: chardetng-py>=0.3.5
 Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
-Requires-Dist: html-to-markdown[lxml]>=1.11.0
+Requires-Dist: html-to-markdown[lxml]>=1.13.0
 Requires-Dist: mcp>=1.14.0
 Requires-Dist: msgspec>=0.18.0
 Requires-Dist: numpy>=2.0.0
@@ -107,8 +107,9 @@ Description-Content-Type: text/markdown
 ### Document Intelligence Capabilities
 - **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
+- **Image Extraction**: Extract embedded images from PDFs, presentations, HTML, and Office documents with optional OCR
 - **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
-- **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
+- **Format Support**: 21 document types including PDF, Microsoft Office, images, HTML, and structured data formats
 - **OCR Integration**: Tesseract OCR with markdown output (default) and table extraction from scanned documents
 - **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
@@ -226,14 +227,15 @@ claude mcp add kreuzberg uvx kreuzberg-mcp
 ## Supported Formats
-| Category          | Formats                        |
-| ----------------- | ------------------------------ |
-| **Documents**     | PDF, DOCX, DOC, RTF, TXT, EPUB |
-| **Images**        | JPG, PNG, TIFF, BMP, GIF, WEBP |
-| **Spreadsheets**  | XLSX, XLS, CSV, ODS            |
-| **Presentations** | PPTX, PPT, ODP                 |
-| **Web**           | HTML, XML, MHTML               |
-| **Archives**      | Support via extraction         |
+| Category            | Formats                        |
+| ------------------- | ------------------------------ |
+| **Documents**       | PDF, DOCX, DOC, RTF, TXT, EPUB |
+| **Images**          | JPG, PNG, TIFF, BMP, GIF, WEBP |
+| **Spreadsheets**    | XLSX, XLS, CSV, ODS            |
+| **Presentations**   | PPTX, PPT, ODP                 |
+| **Web**             | HTML, XML, MHTML               |
+| **Structured Data** | JSON, YAML, TOML               |
+| **Archives**        | Support via extraction         |
 ## 📊 Performance Characteristics

{kreuzberg-3.14.1 → kreuzberg-3.16.0}/README.md RENAMED Viewed

@@ -16,8 +16,9 @@
 ### Document Intelligence Capabilities
 - **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
+- **Image Extraction**: Extract embedded images from PDFs, presentations, HTML, and Office documents with optional OCR
 - **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
-- **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
+- **Format Support**: 21 document types including PDF, Microsoft Office, images, HTML, and structured data formats
 - **OCR Integration**: Tesseract OCR with markdown output (default) and table extraction from scanned documents
 - **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
@@ -135,14 +136,15 @@ claude mcp add kreuzberg uvx kreuzberg-mcp
 ## Supported Formats
-| Category          | Formats                        |
-| ----------------- | ------------------------------ |
-| **Documents**     | PDF, DOCX, DOC, RTF, TXT, EPUB |
-| **Images**        | JPG, PNG, TIFF, BMP, GIF, WEBP |
-| **Spreadsheets**  | XLSX, XLS, CSV, ODS            |
-| **Presentations** | PPTX, PPT, ODP                 |
-| **Web**           | HTML, XML, MHTML               |
-| **Archives**      | Support via extraction         |
+| Category            | Formats                        |
+| ------------------- | ------------------------------ |
+| **Documents**       | PDF, DOCX, DOC, RTF, TXT, EPUB |
+| **Images**          | JPG, PNG, TIFF, BMP, GIF, WEBP |
+| **Spreadsheets**    | XLSX, XLS, CSV, ODS            |
+| **Presentations**   | PPTX, PPT, ODP                 |
+| **Web**             | HTML, XML, MHTML               |
+| **Structured Data** | JSON, YAML, TOML               |
+| **Archives**        | Support via extraction         |
 ## 📊 Performance Characteristics

kreuzberg-3.16.0/Taskfile.yml ADDED Viewed

@@ -0,0 +1,50 @@
+version: "3"
+env:
+  DOCKER_BUILDKIT: 1
+  BUILDKIT_PROGRESS: plain
+tasks:
+  setup:
+    desc: "Install dependencies with uv"
+    cmds:
+      - uv sync --all-extras --all-packages
+      - pre-commit install && pre-commit install -hook-type commit-msg
+  update:
+    desc: "Update the dependencies"
+    cmds:
+      - uv run uv-bump
+      - cd benchmarks && uv run uv-bump && cd -
+      - uv sync --all-extras --all-packages --upgrade
+      - pre-commit autoupdate
+  test:
+    desc: "Run tests with pytest"
+    cmds:
+      - uv run pytest
+  test:cov:
+    desc: "Run tests with coverage"
+    cmds:
+      - uv run pytest --cov
+  lint:
+    desc: "Lint code with ruff and docs with markdownlint"
+    cmds:
+      - pre-commit run --all-files
+  docs:build:
+    desc: "Build documentation"
+    cmds:
+      - uv run mkdocs build --clean --strict
+  docs:serve:
+    desc: "Serve documentation locally"
+    cmds:
+      - uv run mkdocs serve
+  default:
+    desc: "Show available tasks"
+    cmds:
+      - task --list

{kreuzberg-3.14.1 → kreuzberg-3.16.0}/ai-rulez.yaml RENAMED Viewed

@@ -385,6 +385,7 @@ rules:
       - NEVER proactively create documentation files (*.md) or README files
       - Only create documentation files if explicitly requested by the User
       - All builtin imports should be at the top level (except for cyclical or optional dependencies)
+      - All config dataclasses must be hashable, frozen, and use slots: `@dataclass(unsafe_hash=True, frozen=True, slots=True)`
       - When committing, always use the format specified in the CLAUDE.md
     name: Important Instructions
     priority: critical

{kreuzberg-3.14.1 → kreuzberg-3.16.0}/benchmarks/README.md RENAMED Viewed

@@ -87,6 +87,18 @@ uv run python -m benchmarks.src run --stress
 # Run backend comparison benchmarks
 uv run python -m benchmarks.src run --backend-comparison
+# Include Tesseract OCR benchmarks (sync)
+uv run python -m benchmarks.src run --tesseract
+# Include expanded Tesseract variant matrix (formats/PSM)
+uv run python -m benchmarks.src run --tesseract --tesseract-matrix
+# Compare Tesseract architectures (threads vs processes)
+uv run python -m benchmarks.src run --sync-only --tesseract --tesseract-arch
+# Compare with custom worker counts (e.g., 1,4,8)
+uv run python -m benchmarks.src run --sync-only --tesseract --tesseract-arch --workers 1,4,8
 # Custom test files directory
 uv run python -m benchmarks.src run --test-files-dir /path/to/test/files
@@ -232,3 +244,21 @@ uv run python -m benchmarks.src run --sync-only --suite-name main_baseline
 uv run python -m benchmarks.src run --sync-only --suite-name pr_test
 uv run python -m benchmarks.src compare results/main_baseline.json results/pr_test.json
 ```
+### Tesseract Benchmarks
+The suite includes focused Tesseract OCR benchmarks:
+- `--tesseract` adds thread-based batch OCR and a process-pool placeholder for A/B comparisons.
+- `--tesseract-matrix` expands with a small matrix across output formats (`text`, `markdown`, `tsv`) and PSM modes
+    (`AUTO`, `SINGLE_BLOCK`, `SINGLE_LINE`) to quantify overhead of richer outputs and segmentation strategies.
+Examples:
+```bash
+# Minimal Tesseract batch OCR benchmarks
+uv run python -m benchmarks.src run --sync-only --tesseract
+# Full Tesseract config matrix
+uv run python -m benchmarks.src run --sync-only --tesseract --tesseract-matrix
+```

kreuzberg-3.16.0/benchmarks/batch_size_benchmark.py ADDED Viewed

@@ -0,0 +1,179 @@
+import json
+import shutil
+import tempfile
+import time
+from concurrent.futures import ProcessPoolExecutor
+from pathlib import Path
+from typing import Any
+from PIL import Image, ImageDraw
+from kreuzberg import extract_file_sync
+from kreuzberg._ocr._tesseract import _process_image_with_tesseract
+from kreuzberg._types import ExtractionConfig
+from kreuzberg._utils._process_pool import get_optimal_worker_count, process_pool
+def create_test_images(sizes: list[tuple[int, int]], output_dir: Path) -> list[Path]:
+    output_dir.mkdir(exist_ok=True)
+    image_paths = []
+    for i, (width, height) in enumerate(sizes):
+        img = Image.new("RGB", (width, height), color="white")
+        draw = ImageDraw.Draw(img)
+        for y in range(0, height, 50):
+            for x in range(0, width, 100):
+                draw.text((x, y), f"Test {i}", fill="black")
+        path = output_dir / f"test_{width}x{height}_{i}.png"
+        img.save(path)
+        image_paths.append(path)
+    return image_paths
+def benchmark_batch_fixed_workers(images: list[Path], num_workers: int) -> dict[str, Any]:
+    start = time.perf_counter()
+    config_dict = {"language": "eng", "psm": 3}
+    with ProcessPoolExecutor(max_workers=num_workers) as pool:
+        futures = [pool.submit(_process_image_with_tesseract, str(p), config_dict) for p in images]
+        [f.result() for f in futures]
+    duration = time.perf_counter() - start
+    return {
+        "strategy": "fixed",
+        "workers": num_workers,
+        "batch_size": len(images),
+        "duration": duration,
+        "per_image": duration / len(images) if images else 0,
+    }
+def benchmark_batch_dynamic_workers(images: list[Path]) -> dict[str, Any]:
+    start = time.perf_counter()
+    config_dict = {"language": "eng", "psm": 3}
+    optimal_workers = get_optimal_worker_count(len(images), cpu_intensive=True)
+    with ProcessPoolExecutor(max_workers=optimal_workers) as pool:
+        futures = [pool.submit(_process_image_with_tesseract, str(p), config_dict) for p in images]
+        [f.result() for f in futures]
+    duration = time.perf_counter() - start
+    return {
+        "strategy": "dynamic",
+        "workers": optimal_workers,
+        "batch_size": len(images),
+        "duration": duration,
+        "per_image": duration / len(images) if images else 0,
+    }
+def benchmark_batch_shared_pool(images: list[Path]) -> dict[str, Any]:
+    start = time.perf_counter()
+    config_dict = {"language": "eng", "psm": 3}
+    with process_pool() as pool:
+        futures = [pool.submit(_process_image_with_tesseract, str(p), config_dict) for p in images]
+        [f.result() for f in futures]
+    duration = time.perf_counter() - start
+    return {
+        "strategy": "shared_pool",
+        "workers": 14,
+        "batch_size": len(images),
+        "duration": duration,
+        "per_image": duration / len(images) if images else 0,
+    }
+def benchmark_extraction_api(images: list[Path]) -> dict[str, Any]:
+    start = time.perf_counter()
+    config = ExtractionConfig(use_cache=False, force_ocr=True)
+    for image_path in images:
+        extract_file_sync(image_path, config=config)
+    duration = time.perf_counter() - start
+    return {
+        "strategy": "extraction_api",
+        "workers": "auto",
+        "batch_size": len(images),
+        "duration": duration,
+        "per_image": duration / len(images) if images else 0,
+    }
+def main() -> None:
+    batch_sizes = [1, 2, 5, 10, 20]
+    image_sizes = [
+        (640, 480),
+        (1024, 768),
+        (1920, 1080),
+    ]
+    test_dir = Path(tempfile.mkdtemp(prefix="kreuzberg_bench_"))
+    results = []
+    for img_width, img_height in image_sizes:
+        max_batch = max(batch_sizes)
+        images = create_test_images([(img_width, img_height)] * max_batch, test_dir)
+        for batch_size in batch_sizes:
+            batch = images[:batch_size]
+            strategies = []
+            fixed_result = benchmark_batch_fixed_workers(batch, 14)
+            strategies.append(fixed_result)
+            dynamic_result = benchmark_batch_dynamic_workers(batch)
+            strategies.append(dynamic_result)
+            shared_result = benchmark_batch_shared_pool(batch)
+            strategies.append(shared_result)
+            if batch_size <= 10:
+                api_result = benchmark_extraction_api(batch)
+                strategies.append(api_result)
+            baseline = fixed_result["duration"]
+            if baseline > 0:
+                for strategy in strategies[1:]:
+                    improvement = ((baseline - strategy["duration"]) / baseline) * 100
+                    strategy["improvement_pct"] = improvement
+            result_entry = {
+                "image_size": f"{img_width}x{img_height}",
+                "batch_size": batch_size,
+                "strategies": strategies,
+            }
+            results.append(result_entry)
+    output_file = Path("results/batch_size_benchmarks.json")
+    output_file.parent.mkdir(exist_ok=True)
+    with output_file.open("w") as f:
+        json.dump({"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), "results": results}, f, indent=2)
+    for img_size in image_sizes:
+        size_str = f"{img_size[0]}x{img_size[1]}"
+        size_results = [r for r in results if r["image_size"] == size_str]
+        for result in size_results:
+            batch_size = result["batch_size"]  # type: ignore[assignment]
+            strategies = result["strategies"]  # type: ignore[assignment]
+            dynamic = next((s for s in strategies if s["strategy"] == "dynamic"), None)
+            if dynamic and "improvement_pct" in dynamic:
+                pass
+    shutil.rmtree(test_dir)
+if __name__ == "__main__":
+    main()

kreuzberg-3.16.0/benchmarks/batch_validation_benchmark.py ADDED Viewed

@@ -0,0 +1,83 @@
+import json
+import time
+from pathlib import Path
+from typing import Any
+from kreuzberg import extract_file_sync
+from kreuzberg._types import ExtractionConfig
+def benchmark_real_world_scenario(file_paths: list[Path], scenario_name: str) -> dict[str, Any]:
+    config = ExtractionConfig(use_cache=False)
+    start = time.perf_counter()
+    results = []
+    for path in file_paths:
+        result = extract_file_sync(path, config=config)
+        results.append(len(result.content))
+    duration = time.perf_counter() - start
+    return {
+        "scenario": scenario_name,
+        "file_count": len(file_paths),
+        "duration": duration,
+        "per_file": duration / len(file_paths),
+        "total_chars": sum(results),
+    }
+def main() -> None:
+    test_dir = Path("/Users/naamanhirschfeld/workspace/kreuzberg/tests/test_source_files")
+    scenarios = []
+    mixed_files = []
+    for ext in ["*.pdf", "*.docx", "*.xlsx", "*.pptx"]:
+        mixed_files.extend(list(test_dir.glob(ext))[:2])
+    if mixed_files:
+        result = benchmark_real_world_scenario(mixed_files, "Mixed Office Documents")
+        scenarios.append(result)
+    image_files = []
+    for ext in ["*.png", "*.jpg", "*.jpeg"]:
+        image_files.extend(list(test_dir.glob(ext))[:3])
+    if image_files:
+        result = benchmark_real_world_scenario(image_files, "Image Batch Processing")
+        scenarios.append(result)
+    pdf_files = list(test_dir.glob("*.pdf"))[:5]
+    if pdf_files:
+        result = benchmark_real_world_scenario(pdf_files, "PDF Document Processing")
+        scenarios.append(result)
+    small_files = []
+    for ext in ["*.txt", "*.md", "*.html"]:
+        small_files.extend(list(test_dir.glob(ext))[:3])
+    if small_files:
+        result = benchmark_real_world_scenario(small_files, "Small Text Files")
+        scenarios.append(result)
+    total_files = sum(s["file_count"] for s in scenarios)
+    total_time = sum(s["duration"] for s in scenarios)
+    total_chars = sum(s["total_chars"] for s in scenarios)
+    output = {
+        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+        "scenarios": scenarios,
+        "summary": {
+            "total_files": total_files,
+            "total_time": total_time,
+            "avg_per_file": total_time / total_files if total_files > 0 else 0,
+            "total_chars": total_chars,
+            "throughput": total_chars / total_time if total_time > 0 else 0,
+        },
+    }
+    output_file = Path("results/final_batch_validation.json")
+    with output_file.open("w") as f:
+        json.dump(output, f, indent=2)
+if __name__ == "__main__":
+    main()

{kreuzberg-3.14.1 → kreuzberg-3.16.0}/benchmarks/pyproject.toml RENAMED Viewed

@@ -17,18 +17,13 @@ classifiers = [
 # kreuzberg-bench = "src.cli:app"
 dependencies = [
+  "click>=8.2.1",
   "kreuzberg",
-  "matplotlib>=3.7",
-  "memory-profiler>=0.61",
-  "pandas>=2",
+  "msgpack>=1.1.1",
   "psutil>=5.9",
   "py-spy>=0.3.14",
   "rich>=13",
-  "typer>=0.9",
 ]
-[tool.ruff]
-lint.extend-ignore = [ "ARG002", "B008", "B904", "BLE001", "E722", "PLR2004", "PYI036", "SLF001" ]
 [tool.uv.sources]
 kreuzberg = { workspace = true }

kreuzberg-3.16.0/benchmarks/src/__main__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .cli import cli
+if __name__ == "__main__":
+    cli()

kreuzberg 3.14.1__tar.gz → 3.16.0__tar.gz

kreuzberg 3.14.1__tar.gz → 3.16.0__tar.gz