kreuzberg 3.4.0__tar.gz → 3.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg-3.4.1/.github/workflows/docs.yml +66 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/.github/workflows/publish-docker.yml +19 -9
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/.gitignore +1 -0
- kreuzberg-3.4.1/PKG-INFO +233 -0
- kreuzberg-3.4.1/README.md +168 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/ai-rulez.yaml +56 -3
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/user-guide/docker.md +6 -10
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/__init__.py +3 -1
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/pyproject.toml +7 -3
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/uv.lock +2 -11
- kreuzberg-3.4.0/.github/benchmarks/README.md +0 -15
- kreuzberg-3.4.0/PKG-INFO +0 -290
- kreuzberg-3.4.0/README.md +0 -229
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/.commitlintrc +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/.docker/Dockerfile +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/.docker/README.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/.dockerignore +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/.markdownlint.yaml +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/.pre-commit-config.yaml +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/LICENSE +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/README.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/benchmark_baseline.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/advanced/index.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/assets/logo.png +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/changelog.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/cli.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/contributing.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/css/extra.css +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/examples/index.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/index.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_cli_config.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_extractors/_pdf.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_gmft.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_multiprocessing/__init__.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_multiprocessing/gmft_isolated.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_multiprocessing/process_manager.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_multiprocessing/sync_tesseract.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_multiprocessing/tesseract_pool.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_types.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/mkdocs.yaml +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/__init__.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/api/__init__.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/api/main_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/chunker_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/cli_integration_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/cli_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/conftest.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/extraction_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/extractors/pdf_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/gmft_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/hooks_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/multiprocessing/sync_tesseract_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/playa_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/registry_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/types_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.4.0 → kreuzberg-3.4.1}/tests/utils/tmp_test.py +0 -0
@@ -0,0 +1,66 @@
|
|
1
|
+
name: Deploy Documentation
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
branches:
|
6
|
+
- main
|
7
|
+
paths:
|
8
|
+
- 'docs/**'
|
9
|
+
- 'mkdocs.yaml'
|
10
|
+
- '.github/workflows/docs.yml'
|
11
|
+
workflow_dispatch:
|
12
|
+
|
13
|
+
permissions:
|
14
|
+
contents: read
|
15
|
+
pages: write
|
16
|
+
id-token: write
|
17
|
+
|
18
|
+
concurrency:
|
19
|
+
group: "pages"
|
20
|
+
cancel-in-progress: false
|
21
|
+
|
22
|
+
jobs:
|
23
|
+
build:
|
24
|
+
runs-on: ubuntu-latest
|
25
|
+
steps:
|
26
|
+
- name: Checkout repository
|
27
|
+
uses: actions/checkout@v4
|
28
|
+
with:
|
29
|
+
fetch-depth: 0
|
30
|
+
|
31
|
+
- name: Setup Python
|
32
|
+
uses: actions/setup-python@v5
|
33
|
+
with:
|
34
|
+
python-version: '3.11'
|
35
|
+
|
36
|
+
- name: Install uv
|
37
|
+
uses: astral-sh/setup-uv@v6
|
38
|
+
with:
|
39
|
+
enable-cache: true
|
40
|
+
|
41
|
+
- name: Install dependencies
|
42
|
+
run: |
|
43
|
+
uv sync --group doc
|
44
|
+
|
45
|
+
- name: Setup Pages
|
46
|
+
uses: actions/configure-pages@v5
|
47
|
+
|
48
|
+
- name: Build documentation
|
49
|
+
run: |
|
50
|
+
uv run mkdocs build --clean --strict
|
51
|
+
|
52
|
+
- name: Upload artifact
|
53
|
+
uses: actions/upload-pages-artifact@v3
|
54
|
+
with:
|
55
|
+
path: ./site
|
56
|
+
|
57
|
+
deploy:
|
58
|
+
environment:
|
59
|
+
name: github-pages
|
60
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
61
|
+
runs-on: ubuntu-latest
|
62
|
+
needs: build
|
63
|
+
steps:
|
64
|
+
- name: Deploy to GitHub Pages
|
65
|
+
id: deployment
|
66
|
+
uses: actions/deploy-pages@v4
|
@@ -9,11 +9,12 @@ on:
|
|
9
9
|
- completed
|
10
10
|
branches:
|
11
11
|
- main
|
12
|
+
workflow_dispatch:
|
12
13
|
|
13
14
|
jobs:
|
14
15
|
build-and-push:
|
15
16
|
runs-on: ubuntu-latest
|
16
|
-
if: ${{ github.event.workflow_run.conclusion == 'success' }}
|
17
|
+
if: ${{ github.event.workflow_run.conclusion == 'success' || github.event_name == 'workflow_dispatch' }}
|
17
18
|
permissions:
|
18
19
|
contents: read
|
19
20
|
packages: write
|
@@ -41,20 +42,28 @@ jobs:
|
|
41
42
|
- name: Checkout repository
|
42
43
|
uses: actions/checkout@v4
|
43
44
|
with:
|
44
|
-
ref: ${{ github.event.workflow_run.head_branch }}
|
45
|
+
ref: ${{ github.event.workflow_run.head_branch || github.ref }}
|
45
46
|
|
46
47
|
- name: Get release version
|
47
48
|
id: get_version
|
48
49
|
run: |
|
49
|
-
|
50
|
-
|
51
|
-
if [[ "${{ github.event.workflow_run.head_branch }}" =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]; then
|
52
|
-
echo "VERSION=${{ github.event.workflow_run.head_branch }}" >> $GITHUB_OUTPUT
|
53
|
-
else
|
54
|
-
# Get the latest tag
|
50
|
+
if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
|
51
|
+
# For manual dispatch, get the latest tag by listing all tags
|
55
52
|
git fetch --tags
|
56
|
-
|
53
|
+
VERSION=$(git tag --sort=-version:refname | head -n1)
|
54
|
+
else
|
55
|
+
# For workflow_run, use the head branch
|
56
|
+
VERSION="${{ github.event.workflow_run.head_branch }}"
|
57
|
+
# If triggered by a tag, extract version
|
58
|
+
if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]; then
|
59
|
+
VERSION="$VERSION"
|
60
|
+
else
|
61
|
+
# Get the latest tag by listing all tags
|
62
|
+
git fetch --tags
|
63
|
+
VERSION=$(git tag --sort=-version:refname | head -n1)
|
64
|
+
fi
|
57
65
|
fi
|
66
|
+
echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
|
58
67
|
|
59
68
|
- name: Set up QEMU
|
60
69
|
uses: docker/setup-qemu-action@v3
|
@@ -94,6 +103,7 @@ jobs:
|
|
94
103
|
- name: Update Docker Hub README
|
95
104
|
uses: peter-evans/dockerhub-description@v4
|
96
105
|
if: matrix.name == 'core'
|
106
|
+
continue-on-error: true
|
97
107
|
with:
|
98
108
|
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
99
109
|
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
kreuzberg-3.4.1/PKG-INFO
ADDED
@@ -0,0 +1,233 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: kreuzberg
|
3
|
+
Version: 3.4.1
|
4
|
+
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
|
+
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
|
+
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
7
|
+
License: MIT
|
8
|
+
License-File: LICENSE
|
9
|
+
Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
|
10
|
+
Classifier: Development Status :: 5 - Production/Stable
|
11
|
+
Classifier: Intended Audience :: Developers
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
13
|
+
Classifier: Operating System :: OS Independent
|
14
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
21
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
22
|
+
Classifier: Topic :: Text Processing :: General
|
23
|
+
Classifier: Topic :: Utilities
|
24
|
+
Classifier: Typing :: Typed
|
25
|
+
Requires-Python: >=3.9
|
26
|
+
Requires-Dist: anyio>=4.9.0
|
27
|
+
Requires-Dist: charset-normalizer>=3.4.2
|
28
|
+
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
29
|
+
Requires-Dist: html-to-markdown>=1.4.0
|
30
|
+
Requires-Dist: msgspec>=0.18.0
|
31
|
+
Requires-Dist: playa-pdf>=0.6.1
|
32
|
+
Requires-Dist: psutil>=7.0.0
|
33
|
+
Requires-Dist: pypdfium2==4.30.0
|
34
|
+
Requires-Dist: python-calamine>=0.3.2
|
35
|
+
Requires-Dist: python-pptx>=1.0.2
|
36
|
+
Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
|
37
|
+
Provides-Extra: all
|
38
|
+
Requires-Dist: click>=8.2.1; extra == 'all'
|
39
|
+
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
40
|
+
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
41
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'all'
|
42
|
+
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
43
|
+
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
44
|
+
Requires-Dist: rich>=14.0.0; extra == 'all'
|
45
|
+
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
46
|
+
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
47
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
48
|
+
Provides-Extra: api
|
49
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.1.6; extra == 'api'
|
50
|
+
Provides-Extra: chunking
|
51
|
+
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
52
|
+
Provides-Extra: cli
|
53
|
+
Requires-Dist: click>=8.2.1; extra == 'cli'
|
54
|
+
Requires-Dist: rich>=14.0.0; extra == 'cli'
|
55
|
+
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
56
|
+
Provides-Extra: easyocr
|
57
|
+
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
58
|
+
Provides-Extra: gmft
|
59
|
+
Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
60
|
+
Provides-Extra: paddleocr
|
61
|
+
Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
|
62
|
+
Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
|
63
|
+
Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
|
64
|
+
Description-Content-Type: text/markdown
|
65
|
+
|
66
|
+
# Kreuzberg
|
67
|
+
|
68
|
+
[Discord](https://discord.gg/pXxagNK2zN)
|
69
|
+
[PyPI version](https://badge.fury.io/py/kreuzberg)
|
70
|
+
[Documentation](https://goldziher.github.io/kreuzberg/)
|
71
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
72
|
+
|
73
|
+
**High-performance Python library for text extraction from documents.** Extract text from PDFs, images, office documents, and more with both async and sync APIs.
|
74
|
+
|
75
|
+
📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
|
76
|
+
|
77
|
+
## Why Kreuzberg?
|
78
|
+
|
79
|
+
- **🚀 Fastest Performance**: [Benchmarked](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) as the fastest text extraction library
|
80
|
+
- **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+)
|
81
|
+
- **⚡ Dual APIs**: Only library with both sync and async support
|
82
|
+
- **🔧 Zero Configuration**: Works out of the box with sane defaults
|
83
|
+
- **🏠 Local Processing**: No cloud dependencies or external API calls
|
84
|
+
- **📦 Rich Format Support**: PDFs, images, Office docs, HTML, and more
|
85
|
+
- **🔍 Multiple OCR Engines**: Tesseract, EasyOCR, and PaddleOCR support
|
86
|
+
- **🐳 Production Ready**: CLI, REST API, and Docker images included
|
87
|
+
|
88
|
+
## Quick Start
|
89
|
+
|
90
|
+
### Installation
|
91
|
+
|
92
|
+
```bash
|
93
|
+
# Basic installation
|
94
|
+
pip install kreuzberg
|
95
|
+
|
96
|
+
# With optional features
|
97
|
+
pip install "kreuzberg[cli,api]" # CLI + REST API
|
98
|
+
pip install "kreuzberg[easyocr,gmft]" # EasyOCR + table extraction
|
99
|
+
pip install "kreuzberg[all]" # Everything
|
100
|
+
```
|
101
|
+
|
102
|
+
### System Dependencies
|
103
|
+
|
104
|
+
```bash
|
105
|
+
# Ubuntu/Debian
|
106
|
+
sudo apt-get install tesseract-ocr pandoc
|
107
|
+
|
108
|
+
# macOS
|
109
|
+
brew install tesseract pandoc
|
110
|
+
|
111
|
+
# Windows
|
112
|
+
choco install tesseract pandoc
|
113
|
+
```
|
114
|
+
|
115
|
+
### Basic Usage
|
116
|
+
|
117
|
+
```python
|
118
|
+
import asyncio
|
119
|
+
from kreuzberg import extract_file
|
120
|
+
|
121
|
+
async def main():
|
122
|
+
# Extract from any document type
|
123
|
+
result = await extract_file("document.pdf")
|
124
|
+
print(result.content)
|
125
|
+
print(result.metadata)
|
126
|
+
|
127
|
+
asyncio.run(main())
|
128
|
+
```
|
129
|
+
|
130
|
+
## Deployment Options
|
131
|
+
|
132
|
+
### 🐳 Docker (Recommended)
|
133
|
+
|
134
|
+
```bash
|
135
|
+
# Run API server
|
136
|
+
docker run -p 8000:8000 goldziher/kreuzberg:3.4.0
|
137
|
+
|
138
|
+
# Extract files
|
139
|
+
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
140
|
+
```
|
141
|
+
|
142
|
+
Available variants: `3.4.0`, `3.4.0-easyocr`, `3.4.0-paddle`, `3.4.0-gmft`, `3.4.0-all`
|
143
|
+
|
144
|
+
### 🌐 REST API
|
145
|
+
|
146
|
+
```bash
|
147
|
+
# Install and run
|
148
|
+
pip install "kreuzberg[api]"
|
149
|
+
litestar --app kreuzberg._api.main:app run
|
150
|
+
|
151
|
+
# Health check
|
152
|
+
curl http://localhost:8000/health
|
153
|
+
|
154
|
+
# Extract files
|
155
|
+
curl -X POST http://localhost:8000/extract -F "data=@file.pdf"
|
156
|
+
```
|
157
|
+
|
158
|
+
### 💻 Command Line
|
159
|
+
|
160
|
+
```bash
|
161
|
+
# Install CLI
|
162
|
+
pip install "kreuzberg[cli]"
|
163
|
+
|
164
|
+
# Extract to stdout
|
165
|
+
kreuzberg extract document.pdf
|
166
|
+
|
167
|
+
# JSON output with metadata
|
168
|
+
kreuzberg extract document.pdf --output-format json --show-metadata
|
169
|
+
|
170
|
+
# Batch processing
|
171
|
+
kreuzberg extract *.pdf --output-dir ./extracted/
|
172
|
+
```
|
173
|
+
|
174
|
+
## Supported Formats
|
175
|
+
|
176
|
+
| Category | Formats |
|
177
|
+
| ----------------- | ------------------------------ |
|
178
|
+
| **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
|
179
|
+
| **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
|
180
|
+
| **Spreadsheets** | XLSX, XLS, CSV, ODS |
|
181
|
+
| **Presentations** | PPTX, PPT, ODP |
|
182
|
+
| **Web** | HTML, XML, MHTML |
|
183
|
+
| **Archives** | Support via extraction |
|
184
|
+
|
185
|
+
## Performance
|
186
|
+
|
187
|
+
**Fastest extraction speeds** with minimal resource usage:
|
188
|
+
|
189
|
+
| Library | Speed | Memory | Size | Success Rate |
|
190
|
+
| ------------- | -------------- | ------------- | ----------- | ------------ |
|
191
|
+
| **Kreuzberg** | ⚡ **Fastest** | 💾 **Lowest** | 📦 **71MB** | ✅ **100%** |
|
192
|
+
| Unstructured | 2-3x slower | 2x higher | 146MB | 95% |
|
193
|
+
| MarkItDown | 3-4x slower | 3x higher | 251MB | 90% |
|
194
|
+
| Docling | 4-5x slower | 10x higher | 1,032MB | 85% |
|
195
|
+
|
196
|
+
> **Rule of thumb**: Use async API for complex documents and batch processing (up to 4.5x faster)
|
197
|
+
|
198
|
+
## Documentation
|
199
|
+
|
200
|
+
### Quick Links
|
201
|
+
|
202
|
+
- [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
|
203
|
+
- [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
|
204
|
+
- [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
|
205
|
+
- [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
|
206
|
+
- [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
|
207
|
+
- [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
|
208
|
+
- [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
|
209
|
+
|
210
|
+
## Advanced Features
|
211
|
+
|
212
|
+
- **📊 Table Extraction**: Extract tables from PDFs with GMFT
|
213
|
+
- **🧩 Content Chunking**: Split documents for RAG applications
|
214
|
+
- **🎯 Custom Extractors**: Extend with your own document handlers
|
215
|
+
- **🔧 Configuration**: Flexible TOML-based configuration
|
216
|
+
- **🪝 Hooks**: Pre/post-processing customization
|
217
|
+
- **🌍 Multi-language OCR**: 100+ languages supported
|
218
|
+
- **⚙️ Metadata Extraction**: Rich document metadata
|
219
|
+
- **🔄 Batch Processing**: Efficient bulk document processing
|
220
|
+
|
221
|
+
## License
|
222
|
+
|
223
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
224
|
+
|
225
|
+
______________________________________________________________________
|
226
|
+
|
227
|
+
<div align="center">
|
228
|
+
|
229
|
+
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Discord](https://discord.gg/pXxagNK2zN)**
|
230
|
+
|
231
|
+
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
232
|
+
|
233
|
+
</div>
|
@@ -0,0 +1,168 @@
|
|
1
|
+
# Kreuzberg
|
2
|
+
|
3
|
+
[Discord](https://discord.gg/pXxagNK2zN)
|
4
|
+
[PyPI version](https://badge.fury.io/py/kreuzberg)
|
5
|
+
[Documentation](https://goldziher.github.io/kreuzberg/)
|
6
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
7
|
+
|
8
|
+
**High-performance Python library for text extraction from documents.** Extract text from PDFs, images, office documents, and more with both async and sync APIs.
|
9
|
+
|
10
|
+
📖 **[Complete Documentation](https://goldziher.github.io/kreuzberg/)**
|
11
|
+
|
12
|
+
## Why Kreuzberg?
|
13
|
+
|
14
|
+
- **🚀 Fastest Performance**: [Benchmarked](https://github.com/Goldziher/python-text-extraction-libs-benchmarks) as the fastest text extraction library
|
15
|
+
- **💾 Memory Efficient**: 14x smaller than alternatives (71MB vs 1GB+)
|
16
|
+
- **⚡ Dual APIs**: Only library with both sync and async support
|
17
|
+
- **🔧 Zero Configuration**: Works out of the box with sane defaults
|
18
|
+
- **🏠 Local Processing**: No cloud dependencies or external API calls
|
19
|
+
- **📦 Rich Format Support**: PDFs, images, Office docs, HTML, and more
|
20
|
+
- **🔍 Multiple OCR Engines**: Tesseract, EasyOCR, and PaddleOCR support
|
21
|
+
- **🐳 Production Ready**: CLI, REST API, and Docker images included
|
22
|
+
|
23
|
+
## Quick Start
|
24
|
+
|
25
|
+
### Installation
|
26
|
+
|
27
|
+
```bash
|
28
|
+
# Basic installation
|
29
|
+
pip install kreuzberg
|
30
|
+
|
31
|
+
# With optional features
|
32
|
+
pip install "kreuzberg[cli,api]" # CLI + REST API
|
33
|
+
pip install "kreuzberg[easyocr,gmft]" # EasyOCR + table extraction
|
34
|
+
pip install "kreuzberg[all]" # Everything
|
35
|
+
```
|
36
|
+
|
37
|
+
### System Dependencies
|
38
|
+
|
39
|
+
```bash
|
40
|
+
# Ubuntu/Debian
|
41
|
+
sudo apt-get install tesseract-ocr pandoc
|
42
|
+
|
43
|
+
# macOS
|
44
|
+
brew install tesseract pandoc
|
45
|
+
|
46
|
+
# Windows
|
47
|
+
choco install tesseract pandoc
|
48
|
+
```
|
49
|
+
|
50
|
+
### Basic Usage
|
51
|
+
|
52
|
+
```python
|
53
|
+
import asyncio
|
54
|
+
from kreuzberg import extract_file
|
55
|
+
|
56
|
+
async def main():
|
57
|
+
# Extract from any document type
|
58
|
+
result = await extract_file("document.pdf")
|
59
|
+
print(result.content)
|
60
|
+
print(result.metadata)
|
61
|
+
|
62
|
+
asyncio.run(main())
|
63
|
+
```
|
64
|
+
|
65
|
+
## Deployment Options
|
66
|
+
|
67
|
+
### 🐳 Docker (Recommended)
|
68
|
+
|
69
|
+
```bash
|
70
|
+
# Run API server
|
71
|
+
docker run -p 8000:8000 goldziher/kreuzberg:v3.4.0
|
72
|
+
|
73
|
+
# Extract files
|
74
|
+
curl -X POST http://localhost:8000/extract -F "data=@document.pdf"
|
75
|
+
```
|
76
|
+
|
77
|
+
Available variants: `v3.4.0`, `v3.4.0-easyocr`, `v3.4.0-paddle`, `v3.4.0-gmft`, `v3.4.0-all`
|
78
|
+
|
79
|
+
### 🌐 REST API
|
80
|
+
|
81
|
+
```bash
|
82
|
+
# Install and run
|
83
|
+
pip install "kreuzberg[api]"
|
84
|
+
litestar --app kreuzberg._api.main:app run
|
85
|
+
|
86
|
+
# Health check
|
87
|
+
curl http://localhost:8000/health
|
88
|
+
|
89
|
+
# Extract files
|
90
|
+
curl -X POST http://localhost:8000/extract -F "data=@file.pdf"
|
91
|
+
```
|
92
|
+
|
93
|
+
### 💻 Command Line
|
94
|
+
|
95
|
+
```bash
|
96
|
+
# Install CLI
|
97
|
+
pip install "kreuzberg[cli]"
|
98
|
+
|
99
|
+
# Extract to stdout
|
100
|
+
kreuzberg extract document.pdf
|
101
|
+
|
102
|
+
# JSON output with metadata
|
103
|
+
kreuzberg extract document.pdf --output-format json --show-metadata
|
104
|
+
|
105
|
+
# Batch processing
|
106
|
+
kreuzberg extract *.pdf --output-dir ./extracted/
|
107
|
+
```
|
108
|
+
|
109
|
+
## Supported Formats
|
110
|
+
|
111
|
+
| Category | Formats |
|
112
|
+
| ----------------- | ------------------------------ |
|
113
|
+
| **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
|
114
|
+
| **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
|
115
|
+
| **Spreadsheets** | XLSX, XLS, CSV, ODS |
|
116
|
+
| **Presentations** | PPTX, PPT, ODP |
|
117
|
+
| **Web** | HTML, XML, MHTML |
|
118
|
+
| **Archives** | Support via extraction |
|
119
|
+
|
120
|
+
## Performance
|
121
|
+
|
122
|
+
**Fastest extraction speeds** with minimal resource usage:
|
123
|
+
|
124
|
+
| Library | Speed | Memory | Size | Success Rate |
|
125
|
+
| ------------- | -------------- | ------------- | ----------- | ------------ |
|
126
|
+
| **Kreuzberg** | ⚡ **Fastest** | 💾 **Lowest** | 📦 **71MB** | ✅ **100%** |
|
127
|
+
| Unstructured | 2-3x slower | 2x higher | 146MB | 95% |
|
128
|
+
| MarkItDown | 3-4x slower | 3x higher | 251MB | 90% |
|
129
|
+
| Docling | 4-5x slower | 10x higher | 1,032MB | 85% |
|
130
|
+
|
131
|
+
> **Rule of thumb**: Use the async API for complex documents and batch processing (up to 4.5x faster)
|
132
|
+
|
133
|
+
## Documentation
|
134
|
+
|
135
|
+
### Quick Links
|
136
|
+
|
137
|
+
- [Installation Guide](https://goldziher.github.io/kreuzberg/getting-started/installation/) - Setup and dependencies
|
138
|
+
- [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - Comprehensive usage guide
|
139
|
+
- [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Complete API documentation
|
140
|
+
- [Docker Guide](https://goldziher.github.io/kreuzberg/user-guide/docker/) - Container deployment
|
141
|
+
- [REST API](https://goldziher.github.io/kreuzberg/user-guide/api-server/) - HTTP endpoints
|
142
|
+
- [CLI Guide](https://goldziher.github.io/kreuzberg/cli/) - Command-line usage
|
143
|
+
- [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - OCR engine setup
|
144
|
+
|
145
|
+
## Advanced Features
|
146
|
+
|
147
|
+
- **📊 Table Extraction**: Extract tables from PDFs with GMFT
|
148
|
+
- **🧩 Content Chunking**: Split documents for RAG applications
|
149
|
+
- **🎯 Custom Extractors**: Extend with your own document handlers
|
150
|
+
- **🔧 Configuration**: Flexible TOML-based configuration
|
151
|
+
- **🪝 Hooks**: Pre/post-processing customization
|
152
|
+
- **🌍 Multi-language OCR**: 100+ languages supported
|
153
|
+
- **⚙️ Metadata Extraction**: Rich document metadata
|
154
|
+
- **🔄 Batch Processing**: Efficient bulk document processing
|
155
|
+
|
156
|
+
## License
|
157
|
+
|
158
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
159
|
+
|
160
|
+
______________________________________________________________________
|
161
|
+
|
162
|
+
<div align="center">
|
163
|
+
|
164
|
+
**[Documentation](https://goldziher.github.io/kreuzberg/) • [PyPI](https://pypi.org/project/kreuzberg/) • [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg) • [Discord](https://discord.gg/pXxagNK2zN)**
|
165
|
+
|
166
|
+
Made with ❤️ by the [Kreuzberg contributors](https://github.com/Goldziher/kreuzberg/graphs/contributors)
|
167
|
+
|
168
|
+
</div>
|
@@ -1,6 +1,6 @@
|
|
1
1
|
metadata:
|
2
2
|
name: "Kreuzberg"
|
3
|
-
version: "3.
|
3
|
+
version: "3.4.0"
|
4
4
|
description: "A text extraction library supporting PDFs, images, office documents and more"
|
5
5
|
|
6
6
|
outputs:
|
@@ -96,8 +96,9 @@ rules:
|
|
96
96
|
- Run manually: `pre-commit run --all-files`
|
97
97
|
|
98
98
|
### Documentation
|
99
|
-
- Build docs: `mkdocs build`
|
100
|
-
- Serve docs locally: `mkdocs serve`
|
99
|
+
- Build docs: `uv run mkdocs build --clean --strict`
|
100
|
+
- Serve docs locally: `uv run mkdocs serve`
|
101
|
+
- Install doc dependencies: `uv sync --group doc`
|
101
102
|
|
102
103
|
- name: "Architecture"
|
103
104
|
priority: 9
|
@@ -115,6 +116,8 @@ rules:
|
|
115
116
|
- **GMFT Integration**: Table extraction using GMFT library for PDFs
|
116
117
|
- **Chunking**: Text splitting functionality in `_chunker.py`
|
117
118
|
- **Async/Sync**: Primary async implementation with sync wrappers in `_utils/_sync.py`
|
119
|
+
- **API Server**: REST API using Litestar framework in `_api/main.py`
|
120
|
+
- **CLI**: Command-line interface for batch processing and automation
|
118
121
|
|
119
122
|
### Adding New Features
|
120
123
|
- New extractors: Inherit from `BaseExtractor` and register with `ExtractorRegistry`
|
@@ -153,6 +156,56 @@ rules:
|
|
153
156
|
- All builtin imports should be at the top level (except for cyclical or optional dependencies)
|
154
157
|
- When committing, always use the format specified in the CLAUDE.md
|
155
158
|
|
159
|
+
- name: "CI/CD and Deployment"
|
160
|
+
priority: 6
|
161
|
+
content: |
|
162
|
+
### GitHub Actions Workflows
|
163
|
+
- **Release**: Automated PyPI publishing via GitHub releases
|
164
|
+
- **Docker**: Multi-platform Docker builds (linux/amd64, linux/arm64)
|
165
|
+
- **Documentation**: Auto-deploy to GitHub Pages on docs changes
|
166
|
+
|
167
|
+
### Docker Variants
|
168
|
+
- **Core** (`goldziher/kreuzberg:v3.4.0`): API + Tesseract OCR
|
169
|
+
- **EasyOCR** (`goldziher/kreuzberg:v3.4.0-easyocr`): Core + EasyOCR
|
170
|
+
- **PaddleOCR** (`goldziher/kreuzberg:v3.4.0-paddle`): Core + PaddleOCR
|
171
|
+
- **GMFT** (`goldziher/kreuzberg:v3.4.0-gmft`): Core + table extraction
|
172
|
+
- **All** (`goldziher/kreuzberg:v3.4.0-all`): All features included
|
173
|
+
|
174
|
+
### Manual Triggers
|
175
|
+
- Docker builds: `gh workflow run "Publish Docker Images"`
|
176
|
+
- Documentation: Auto-deploys on docs/ changes
|
177
|
+
|
178
|
+
### Common Issues
|
179
|
+
- **Docker version detection**: Use `git tag --sort=-version:refname | head -n1` not `git describe`
|
180
|
+
- **Docs dependencies**: Use `uv sync --group doc` for proper mkdocs-material[imaging] support
|
181
|
+
- **Docker Hub README**: May fail due to permissions, use `continue-on-error: true`
|
182
|
+
|
183
|
+
- name: "Package Management"
|
184
|
+
priority: 6
|
185
|
+
content: |
|
186
|
+
### Optional Dependencies Structure
|
187
|
+
```toml
|
188
|
+
[project.optional-dependencies]
|
189
|
+
api = ["litestar[standard,structlog,opentelemetry]>=2.1.6"]
|
190
|
+
cli = ["click>=8.2.1", "rich>=14.0.0", "tomli>=2.0.0; python_version<'3.11'"]
|
191
|
+
chunking = ["semantic-text-splitter>=0.27.0"]
|
192
|
+
easyocr = ["easyocr>=1.7.2"]
|
193
|
+
gmft = ["gmft>=0.4.2"]
|
194
|
+
paddleocr = ["paddleocr>=3.1.0", "paddlepaddle>=3.1.0", "setuptools>=80.9.0"]
|
195
|
+
all = ["kreuzberg[api,chunking,cli,easyocr,gmft,paddleocr]"]
|
196
|
+
```
|
197
|
+
|
198
|
+
### Installation Patterns
|
199
|
+
- Basic: `pip install kreuzberg`
|
200
|
+
- With features: `pip install "kreuzberg[api,cli]"`
|
201
|
+
- All features: `pip install "kreuzberg[all]"`
|
202
|
+
- Development: `uv sync --all-extras`
|
203
|
+
|
204
|
+
### Dependencies
|
205
|
+
- **Core**: pypdfium2, playa-pdf, python-pptx, etc.
|
206
|
+
- **System**: tesseract-ocr, pandoc (via package manager)
|
207
|
+
- **Development**: Uses dependency groups in pyproject.toml
|
208
|
+
|
156
209
|
sections:
|
157
210
|
- title: "Planned Features"
|
158
211
|
content: |
|
@@ -7,11 +7,12 @@ Kreuzberg provides official Docker images for easy deployment and containerized
|
|
7
7
|
Docker images are available on [Docker Hub](https://hub.docker.com/r/goldziher/kreuzberg):
|
8
8
|
|
9
9
|
- `goldziher/kreuzberg:latest` - Core image with API server and Tesseract OCR
|
10
|
-
- `goldziher/kreuzberg:
|
11
|
-
- `goldziher/kreuzberg:
|
12
|
-
- `goldziher/kreuzberg:
|
13
|
-
- `goldziher/kreuzberg:
|
14
|
-
|
10
|
+
- `goldziher/kreuzberg:latest-easyocr` - With EasyOCR support
|
11
|
+
- `goldziher/kreuzberg:latest-paddle` - With PaddleOCR support
|
12
|
+
- `goldziher/kreuzberg:latest-gmft` - With GMFT table extraction
|
13
|
+
- `goldziher/kreuzberg:latest-all` - With all optional dependencies
|
14
|
+
|
15
|
+
> **Note**: Specific version tags are also available (e.g., `v3.4.0`, `v3.4.0-easyocr`)
|
15
16
|
|
16
17
|
## Quick Start
|
17
18
|
|
@@ -45,8 +46,6 @@ curl -X POST http://localhost:8000/extract \
|
|
45
46
|
Create a `docker-compose.yml`:
|
46
47
|
|
47
48
|
```yaml
|
48
|
-
version: '3.8'
|
49
|
-
|
50
49
|
services:
|
51
50
|
kreuzberg:
|
52
51
|
image: goldziher/kreuzberg:latest
|
@@ -54,9 +53,6 @@ services:
|
|
54
53
|
- "8000:8000"
|
55
54
|
environment:
|
56
55
|
- PYTHONUNBUFFERED=1
|
57
|
-
volumes:
|
58
|
-
# Optional: Mount local directory for file access
|
59
|
-
- ./documents:/app/documents
|
60
56
|
restart: unless-stopped
|
61
57
|
```
|
62
58
|
|
@@ -1,3 +1,5 @@
|
|
1
|
+
from importlib.metadata import version
|
2
|
+
|
1
3
|
from kreuzberg._gmft import GMFTConfig
|
2
4
|
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
3
5
|
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
@@ -18,7 +20,7 @@ from .extraction import (
|
|
18
20
|
extract_file_sync,
|
19
21
|
)
|
20
22
|
|
21
|
-
__version__ = "
|
23
|
+
__version__ = version("kreuzberg")
|
22
24
|
|
23
25
|
__all__ = [
|
24
26
|
"EasyOCRConfig",
|