kreuzberg 3.4.2__tar.gz → 3.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.github/workflows/publish-docker.yml +7 -27
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.github/workflows/release.yaml +12 -0
- kreuzberg-3.6.0/.gitmodules +3 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/PKG-INFO +12 -4
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/ai-rulez.yaml +25 -9
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/benchmark_baseline.py +1 -1
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/final_benchmark.py +1 -1
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/serialization_benchmark.py +0 -1
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/advanced/performance.md +46 -9
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/api-reference/types.md +18 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/changelog.md +2 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/examples/extraction-examples.md +118 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/getting-started/installation.md +33 -1
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/user-guide/docker.md +49 -0
- kreuzberg-3.6.0/docs/user-guide/extraction-configuration.md +343 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/user-guide/ocr-configuration.md +86 -19
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/__init__.py +6 -1
- kreuzberg-3.6.0/kreuzberg/_entity_extraction.py +239 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_extractors/_image.py +21 -1
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_extractors/_pdf.py +44 -14
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_extractors/_spread_sheet.py +2 -2
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_gmft.py +4 -4
- kreuzberg-3.6.0/kreuzberg/_language_detection.py +95 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_multiprocessing/gmft_isolated.py +2 -4
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_multiprocessing/process_manager.py +2 -1
- kreuzberg-3.6.0/kreuzberg/_multiprocessing/sync_easyocr.py +235 -0
- kreuzberg-3.6.0/kreuzberg/_multiprocessing/sync_paddleocr.py +199 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_ocr/_easyocr.py +1 -1
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_ocr/_tesseract.py +7 -3
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_types.py +46 -4
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_device.py +2 -2
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_process_pool.py +2 -2
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_sync.py +1 -5
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_tmp.py +2 -2
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/extraction.py +39 -12
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/mkdocs.yaml +1 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/pyproject.toml +27 -3
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/api/main_test.py +2 -5
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/cli_integration_test.py +9 -1
- kreuzberg-3.6.0/tests/entity_extraction_test.py +102 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extraction_test.py +9 -2
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extractors/image_test.py +17 -4
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extractors/pdf_test.py +7 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/gmft_extended_test.py +6 -17
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/gmft_test.py +0 -3
- kreuzberg-3.6.0/tests/language_detection_test.py +237 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/multiprocessing/sync_tesseract_test.py +2 -3
- kreuzberg-3.6.0/tests/test_source_files/french-text.txt +2 -0
- kreuzberg-3.6.0/tests/test_source_files/german-text.txt +2 -0
- kreuzberg-3.6.0/tests/test_source_files/spanish-text.txt +2 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/cache_test.py +0 -3
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/errors_test.py +0 -1
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/process_pool_test.py +0 -3
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/sync_test.py +0 -7
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/uv.lock +466 -66
- kreuzberg-3.4.2/docs/user-guide/extraction-configuration.md +0 -162
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.commitlintrc +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.docker/Dockerfile +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.docker/README.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.dockerignore +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.gitignore +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.markdownlint.yaml +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/.pre-commit-config.yaml +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/LICENSE +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/README.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/README.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/advanced/index.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/assets/logo.png +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/cli.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/contributing.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/css/extra.css +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/examples/index.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/index.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_cli_config.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_multiprocessing/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_multiprocessing/sync_tesseract.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_multiprocessing/tesseract_pool.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/api/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/chunker_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/cli_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/conftest.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/hooks_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/playa_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/registry_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/types_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.4.2 → kreuzberg-3.6.0}/tests/utils/tmp_test.py +0 -0
@@ -1,20 +1,13 @@
|
|
1
|
-
# .github/workflows/publish-docker.yml
|
2
1
|
|
3
2
|
name: Publish Docker Images
|
4
3
|
|
5
4
|
on:
|
6
|
-
workflow_run:
|
7
|
-
workflows: ["Release"]
|
8
|
-
types:
|
9
|
-
- completed
|
10
|
-
branches:
|
11
|
-
- main
|
12
5
|
workflow_dispatch:
|
13
6
|
|
14
7
|
jobs:
|
15
8
|
build-and-push:
|
16
9
|
runs-on: ubuntu-latest
|
17
|
-
if: ${{ github.
|
10
|
+
if: ${{ github.event_name == 'workflow_dispatch' }}
|
18
11
|
permissions:
|
19
12
|
contents: read
|
20
13
|
packages: write
|
@@ -24,7 +17,7 @@ jobs:
|
|
24
17
|
include:
|
25
18
|
- name: core
|
26
19
|
extras: ""
|
27
|
-
tag_suffix: ""
|
20
|
+
tag_suffix: ""
|
28
21
|
- name: easyocr
|
29
22
|
extras: "easyocr"
|
30
23
|
tag_suffix: "-easyocr"
|
@@ -42,27 +35,14 @@ jobs:
|
|
42
35
|
- name: Checkout repository
|
43
36
|
uses: actions/checkout@v4
|
44
37
|
with:
|
45
|
-
ref: ${{ github.
|
38
|
+
ref: ${{ github.ref }}
|
46
39
|
|
47
40
|
- name: Get release version
|
48
41
|
id: get_version
|
49
42
|
run: |
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
VERSION=$(git tag --sort=-version:refname | head -n1)
|
54
|
-
else
|
55
|
-
# For workflow_run, use the head branch
|
56
|
-
VERSION="${{ github.event.workflow_run.head_branch }}"
|
57
|
-
# If triggered by a tag, extract version
|
58
|
-
if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]; then
|
59
|
-
VERSION="$VERSION"
|
60
|
-
else
|
61
|
-
# Get the latest tag by listing all tags
|
62
|
-
git fetch --tags
|
63
|
-
VERSION=$(git tag --sort=-version:refname | head -n1)
|
64
|
-
fi
|
65
|
-
fi
|
43
|
+
# Get the latest tag by listing all tags
|
44
|
+
git fetch --tags
|
45
|
+
VERSION=$(git tag --sort=-version:refname | head -n1)
|
66
46
|
echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
|
67
47
|
|
68
48
|
- name: Set up QEMU
|
@@ -89,7 +69,7 @@ jobs:
|
|
89
69
|
type=raw,value=latest${{ matrix.tag_suffix }}
|
90
70
|
|
91
71
|
- name: Build and push Docker image
|
92
|
-
uses: docker/build-push-action@
|
72
|
+
uses: docker/build-push-action@v6
|
93
73
|
with:
|
94
74
|
context: .
|
95
75
|
file: ./.docker/Dockerfile
|
@@ -29,3 +29,15 @@ jobs:
|
|
29
29
|
|
30
30
|
- name: Publish
|
31
31
|
uses: pypa/gh-action-pypi-publish@release/v1
|
32
|
+
|
33
|
+
- name: Trigger Docker Build
|
34
|
+
uses: actions/github-script@v7
|
35
|
+
with:
|
36
|
+
github-token: ${{ secrets.GITHUB_TOKEN }}
|
37
|
+
script: |
|
38
|
+
await github.rest.actions.createWorkflowDispatch({
|
39
|
+
owner: context.repo.owner,
|
40
|
+
repo: context.repo.repo,
|
41
|
+
workflow_id: 'publish-docker.yml',
|
42
|
+
ref: 'main'
|
43
|
+
});
|
@@ -1,12 +1,12 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.4.2
|
3
|
+
Version: 3.6.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
6
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
7
7
|
License: MIT
|
8
8
|
License-File: LICENSE
|
9
|
-
Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
|
9
|
+
Keywords: document-processing,entity-extraction,image-to-text,keyword-extraction,named-entity-recognition,ner,ocr,pandoc,pdf-extraction,rag,spacy,table-extraction,tesseract,text-extraction,text-processing
|
10
10
|
Classifier: Development Status :: 5 - Production/Stable
|
11
11
|
Classifier: Intended Audience :: Developers
|
12
12
|
Classifier: License :: OSI Approved :: MIT License
|
@@ -36,16 +36,19 @@ Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
|
|
36
36
|
Provides-Extra: all
|
37
37
|
Requires-Dist: click>=8.2.1; extra == 'all'
|
38
38
|
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
39
|
+
Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
|
39
40
|
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
40
|
-
Requires-Dist:
|
41
|
+
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
42
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
|
41
43
|
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
42
44
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
43
45
|
Requires-Dist: rich>=14.0.0; extra == 'all'
|
44
46
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
45
47
|
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
48
|
+
Requires-Dist: spacy>=3.8.7; extra == 'all'
|
46
49
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
47
50
|
Provides-Extra: api
|
48
|
-
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.
|
51
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
|
49
52
|
Provides-Extra: chunking
|
50
53
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
51
54
|
Provides-Extra: cli
|
@@ -54,8 +57,13 @@ Requires-Dist: rich>=14.0.0; extra == 'cli'
|
|
54
57
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
55
58
|
Provides-Extra: easyocr
|
56
59
|
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
60
|
+
Provides-Extra: entity-extraction
|
61
|
+
Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
|
62
|
+
Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
|
57
63
|
Provides-Extra: gmft
|
58
64
|
Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
65
|
+
Provides-Extra: langdetect
|
66
|
+
Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
|
59
67
|
Provides-Extra: paddleocr
|
60
68
|
Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
|
61
69
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
|
@@ -1,6 +1,6 @@
|
|
1
1
|
metadata:
|
2
2
|
name: "Kreuzberg"
|
3
|
-
version: "3.
|
3
|
+
version: "3.5.0"
|
4
4
|
description: "A text extraction library supporting PDFs, images, office documents and more"
|
5
5
|
|
6
6
|
outputs:
|
@@ -115,6 +115,7 @@ rules:
|
|
115
115
|
- **OCR Backends**: Pluggable OCR engines with separate configuration classes
|
116
116
|
- **GMFT Integration**: Table extraction using GMFT library for PDFs
|
117
117
|
- **Chunking**: Text splitting functionality in `_chunker.py`
|
118
|
+
- **Language Detection**: Automatic language detection using fast-langdetect
|
118
119
|
- **Async/Sync**: Primary async implementation with sync wrappers in `_utils/_sync.py`
|
119
120
|
- **API Server**: REST API using Litestar framework in `_api/main.py`
|
120
121
|
- **CLI**: Command-line interface for batch processing and automation
|
@@ -144,6 +145,8 @@ rules:
|
|
144
145
|
- Mock OCR responses for predictable testing
|
145
146
|
- Both sync and async test variants
|
146
147
|
- Comprehensive error case coverage
|
148
|
+
- OCR tests marked as `xfail` in CI environments for resilience
|
149
|
+
- Integration tests use timeouts and retry logic where appropriate
|
147
150
|
|
148
151
|
- name: "Important Instructions"
|
149
152
|
priority: 10
|
@@ -160,16 +163,17 @@ rules:
|
|
160
163
|
priority: 6
|
161
164
|
content: |
|
162
165
|
### GitHub Actions Workflows
|
163
|
-
- **Release**: Automated PyPI publishing via GitHub releases
|
164
|
-
- **Docker**: Multi-platform Docker builds (linux/amd64, linux/arm64)
|
166
|
+
- **Release**: Automated PyPI publishing via GitHub releases, triggers Docker builds
|
167
|
+
- **Docker**: Multi-platform Docker builds (linux/amd64, linux/arm64), triggered by releases
|
165
168
|
- **Documentation**: Auto-deploy to GitHub Pages on docs changes
|
169
|
+
- **CI**: Comprehensive testing across multiple Python versions and platforms
|
166
170
|
|
167
171
|
### Docker Variants
|
168
|
-
- **Core** (`goldziher/kreuzberg:v3.
|
169
|
-
- **EasyOCR** (`goldziher/kreuzberg:v3.
|
170
|
-
- **PaddleOCR** (`goldziher/kreuzberg:v3.
|
171
|
-
- **GMFT** (`goldziher/kreuzberg:v3.
|
172
|
-
- **All** (`goldziher/kreuzberg:v3.
|
172
|
+
- **Core** (`goldziher/kreuzberg:v3.5.0`): API + Tesseract OCR
|
173
|
+
- **EasyOCR** (`goldziher/kreuzberg:v3.5.0-easyocr`): Core + EasyOCR
|
174
|
+
- **PaddleOCR** (`goldziher/kreuzberg:v3.5.0-paddle`): Core + PaddleOCR
|
175
|
+
- **GMFT** (`goldziher/kreuzberg:v3.5.0-gmft`): Core + table extraction
|
176
|
+
- **All** (`goldziher/kreuzberg:v3.5.0-all`): All features included
|
173
177
|
|
174
178
|
### Manual Triggers
|
175
179
|
- Docker builds: `gh workflow run "Publish Docker Images"`
|
@@ -191,8 +195,9 @@ rules:
|
|
191
195
|
chunking = ["semantic-text-splitter>=0.27.0"]
|
192
196
|
easyocr = ["easyocr>=1.7.2"]
|
193
197
|
gmft = ["gmft>=0.4.2"]
|
198
|
+
langdetect = ["fast-langdetect>=0.2.0"]
|
194
199
|
paddleocr = ["paddleocr>=3.1.0", "paddlepaddle>=3.1.0", "setuptools>=80.9.0"]
|
195
|
-
all = ["kreuzberg[api,chunking,cli,easyocr,gmft,paddleocr]"]
|
200
|
+
all = ["kreuzberg[api,chunking,cli,easyocr,gmft,langdetect,paddleocr]"]
|
196
201
|
```
|
197
202
|
|
198
203
|
### Installation Patterns
|
@@ -207,6 +212,17 @@ rules:
|
|
207
212
|
- **Development**: Uses dependency groups in pyproject.toml
|
208
213
|
|
209
214
|
sections:
|
215
|
+
- title: "Language Detection"
|
216
|
+
content: |
|
217
|
+
### Automatic Language Detection (v3.5.0+)
|
218
|
+
- **Feature**: Automatically detect languages in extracted text
|
219
|
+
- **Implementation**: Uses fast-langdetect library for high-performance detection
|
220
|
+
- **Configuration**:
|
221
|
+
- Enable with `auto_detect_language=True` in `ExtractionConfig`
|
222
|
+
- Configure via `LanguageDetectionConfig` for confidence thresholds
|
223
|
+
- **Output**: Results available in `ExtractionResult.detected_languages`
|
224
|
+
- **Integration**: Works with all extraction methods and file types
|
225
|
+
|
210
226
|
- title: "Planned Features"
|
211
227
|
content: |
|
212
228
|
### Structured Extraction (Issue #55)
|
@@ -8,7 +8,7 @@ from kreuzberg import ExtractionConfig, batch_extract_file, extract_file_sync
|
|
8
8
|
from kreuzberg._utils._document_cache import clear_document_cache, get_document_cache
|
9
9
|
|
10
10
|
|
11
|
-
async def run_baseline_benchmark() -> dict[str, object] | None:
|
11
|
+
async def run_baseline_benchmark() -> dict[str, object] | None:
|
12
12
|
"""Run comprehensive baseline benchmark."""
|
13
13
|
test_files_dir = Path("tests/test_source_files")
|
14
14
|
test_files = list(test_files_dir.glob("*.pdf"))
|
@@ -15,7 +15,7 @@ from kreuzberg._utils._cache import (
|
|
15
15
|
)
|
16
16
|
|
17
17
|
|
18
|
-
async def run_final_benchmark() -> dict[str, object] | None:
|
18
|
+
async def run_final_benchmark() -> dict[str, object] | None:
|
19
19
|
"""Run comprehensive benchmark of all caching improvements."""
|
20
20
|
test_files_dir = Path("tests/test_source_files")
|
21
21
|
pdf_files = list(test_files_dir.glob("*.pdf"))
|
@@ -101,7 +101,6 @@ def benchmark_serialization() -> dict[str, object]:
|
|
101
101
|
json_serialize = analyze_times(json_serialize_times, "JSON Serialize")
|
102
102
|
json_deserialize = analyze_times(json_deserialize_times, "JSON Deserialize")
|
103
103
|
|
104
|
-
# Type casting for arithmetic operations
|
105
104
|
json_ser_mean = json_serialize["mean"]
|
106
105
|
json_deser_mean = json_deserialize["mean"]
|
107
106
|
msgpack_ser_mean = msgpack_serialize["mean"]
|
@@ -125,22 +125,59 @@ The async API leverages Python's asyncio with intelligent task scheduling:
|
|
125
125
|
1. **Configure OCR appropriately** for your document types
|
126
126
|
1. **Profile your specific workload** - results vary by content
|
127
127
|
|
128
|
-
### Configuration
|
128
|
+
### Optimized Default Configuration
|
129
|
+
|
130
|
+
Kreuzberg's default configuration is **optimized out-of-the-box for modern PDFs and standard documents**:
|
129
131
|
|
130
132
|
```python
|
131
|
-
from kreuzberg import ExtractionConfig
|
132
|
-
from kreuzberg._ocr import TesseractConfig
|
133
|
+
from kreuzberg import ExtractionConfig
|
133
134
|
|
134
|
-
#
|
135
|
-
|
135
|
+
# Default configuration - already optimized for modern documents
|
136
|
+
config = ExtractionConfig() # Uses optimized defaults:
|
137
|
+
# - PSM: AUTO_ONLY (fast without orientation detection)
|
138
|
+
# - Language model: Disabled for performance
|
139
|
+
# - Dictionary correction: Enabled for accuracy
|
140
|
+
```
|
136
141
|
|
137
|
-
|
138
|
-
accurate_config = ExtractionConfig(ocr_backend="tesseract", ocr_config=TesseractConfig(psm=1)) # Auto page segmentation
|
142
|
+
### Advanced Configuration Examples
|
139
143
|
|
140
|
-
|
141
|
-
|
144
|
+
```python
|
145
|
+
from kreuzberg import ExtractionConfig, extract_file_sync
|
146
|
+
from kreuzberg._ocr._tesseract import TesseractConfig, PSMMode
|
147
|
+
|
148
|
+
# Maximum speed configuration (for high-volume processing)
|
149
|
+
speed_config = ExtractionConfig(
|
150
|
+
ocr_backend="tesseract",
|
151
|
+
ocr_config=TesseractConfig(
|
152
|
+
psm=PSMMode.SINGLE_BLOCK, # Assume simple layout
|
153
|
+
language_model_ngram_on=False, # Already disabled by default
|
154
|
+
tessedit_enable_dict_correction=False, # Disable for maximum speed
|
155
|
+
),
|
156
|
+
)
|
157
|
+
|
158
|
+
# Maximum accuracy configuration (for degraded documents)
|
159
|
+
accuracy_config = ExtractionConfig(
|
160
|
+
ocr_backend="tesseract",
|
161
|
+
ocr_config=TesseractConfig(
|
162
|
+
psm=PSMMode.AUTO, # Full analysis with orientation detection
|
163
|
+
language_model_ngram_on=True, # Enable for historical/degraded text
|
164
|
+
tessedit_enable_dict_correction=True, # Default - keep enabled
|
165
|
+
),
|
166
|
+
)
|
167
|
+
|
168
|
+
# No OCR configuration (text documents only)
|
169
|
+
text_only_config = ExtractionConfig(ocr_backend=None, force_ocr=False)
|
142
170
|
```
|
143
171
|
|
172
|
+
### Performance Optimization Tips
|
173
|
+
|
174
|
+
Based on comprehensive benchmarking with 138+ documents:
|
175
|
+
|
176
|
+
1. **Disable OCR for text documents**: Setting `ocr_backend=None` provides a significant speedup for documents with text layers
|
177
|
+
1. **Use PSM `AUTO_ONLY` (default)**: Optimized for modern documents without orientation detection overhead
|
178
|
+
1. **Language model trade-offs**: Disabling `language_model_ngram_on` can provide a 30x+ speedup with minimal quality impact on clean documents
|
179
|
+
1. **Dictionary correction**: Disabling `tessedit_enable_dict_correction` speeds up processing for technical documents
|
180
|
+
|
144
181
|
### Batch Processing Best Practices
|
145
182
|
|
146
183
|
```python
|
@@ -40,10 +40,28 @@ Configuration options for the GMFT table extraction engine:
|
|
40
40
|
|
41
41
|
::: kreuzberg.GMFTConfig
|
42
42
|
|
43
|
+
## Entity Extraction Configuration
|
44
|
+
|
45
|
+
Configuration options for spaCy-based entity extraction:
|
46
|
+
|
47
|
+
::: kreuzberg.SpacyEntityExtractionConfig
|
48
|
+
|
49
|
+
## Language Detection Configuration
|
50
|
+
|
51
|
+
Configuration options for automatic language detection:
|
52
|
+
|
53
|
+
::: kreuzberg.LanguageDetectionConfig
|
54
|
+
|
43
55
|
## PSMMode (Page Segmentation Mode)
|
44
56
|
|
45
57
|
::: kreuzberg.PSMMode
|
46
58
|
|
59
|
+
## Entity
|
60
|
+
|
61
|
+
Represents an extracted named entity:
|
62
|
+
|
63
|
+
::: kreuzberg.Entity
|
64
|
+
|
47
65
|
## Metadata
|
48
66
|
|
49
67
|
A TypedDict that contains optional metadata fields extracted from documents:
|
@@ -12,6 +12,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
12
12
|
- Documentation site with comprehensive examples and API reference
|
13
13
|
- Improved configuration for all OCR backends
|
14
14
|
- Added hooks system for validation and post-processing
|
15
|
+
- Language detection feature with `auto_detect_language` configuration option
|
16
|
+
- New optional dependency group `langdetect` for automatic language detection
|
15
17
|
|
16
18
|
### Changed
|
17
19
|
|
@@ -79,6 +79,47 @@ async def extract_with_different_backends():
|
|
79
79
|
print(f"No OCR result: {result.content[:100]}...")
|
80
80
|
```
|
81
81
|
|
82
|
+
## Language Detection
|
83
|
+
|
84
|
+
```python
|
85
|
+
from kreuzberg import extract_file, ExtractionConfig, LanguageDetectionConfig
|
86
|
+
|
87
|
+
async def detect_document_language():
|
88
|
+
# Simple automatic language detection
|
89
|
+
result = await extract_file("document.pdf", config=ExtractionConfig(auto_detect_language=True))
|
90
|
+
|
91
|
+
# Access detected languages
|
92
|
+
if result.detected_languages:
|
93
|
+
print(f"Detected languages: {', '.join(result.detected_languages)}")
|
94
|
+
# Example output: "Detected languages: en, de, fr"
|
95
|
+
|
96
|
+
async def detect_multilingual_document():
|
97
|
+
# Advanced multilingual detection with custom configuration
|
98
|
+
lang_config = LanguageDetectionConfig(
|
99
|
+
multilingual=True, # Detect multiple languages in mixed text
|
100
|
+
top_k=5, # Return top 5 languages
|
101
|
+
low_memory=False, # Use high accuracy mode
|
102
|
+
)
|
103
|
+
|
104
|
+
result = await extract_file(
|
105
|
+
"multilingual_document.pdf", config=ExtractionConfig(auto_detect_language=True, language_detection_config=lang_config)
|
106
|
+
)
|
107
|
+
|
108
|
+
if result.detected_languages:
|
109
|
+
print(f"Detected languages: {result.detected_languages}")
|
110
|
+
|
111
|
+
# Use detected languages for OCR
|
112
|
+
from kreuzberg import TesseractConfig
|
113
|
+
|
114
|
+
# Create language string for Tesseract, which expects codes like "eng+deu+fra"; detected codes such as "en" may need mapping to Tesseract codes first
|
115
|
+
tesseract_langs = "+".join(result.detected_languages[:3])
|
116
|
+
|
117
|
+
result_with_ocr = await extract_file(
|
118
|
+
"multilingual_document.pdf",
|
119
|
+
config=ExtractionConfig(force_ocr=True, ocr_config=TesseractConfig(language=tesseract_langs)),
|
120
|
+
)
|
121
|
+
```
|
122
|
+
|
82
123
|
## Table Extraction
|
83
124
|
|
84
125
|
```python
|
@@ -148,6 +189,83 @@ async def process_upload(file_content: bytes, mime_type: str):
|
|
148
189
|
print(f"{key}: {value}")
|
149
190
|
```
|
150
191
|
|
192
|
+
## Keywords
|
193
|
+
|
194
|
+
Kreuzberg supports keyword and regex extraction as follows:
|
195
|
+
|
196
|
+
```python
|
197
|
+
from kreuzberg import ExtractionConfig, extract_file
|
198
|
+
|
199
|
+
async def extract_keywords():
|
200
|
+
config = ExtractionConfig(
|
201
|
+
extract_keywords=True,
|
202
|
+
keyword_count=5, # defaults to 10 if not set
|
203
|
+
)
|
204
|
+
result = await extract_file(
|
205
|
+
"document.pdf",
|
206
|
+
config=config,
|
207
|
+
)
|
208
|
+
print(f"Keywords: {result.keywords}")
|
209
|
+
```
|
210
|
+
|
211
|
+
## Entity and Keyword Extraction
|
212
|
+
|
213
|
+
Kreuzberg can extract named entities using spaCy and keywords using KeyBERT. It automatically detects entities such as people, organizations, and locations, and it also supports custom regex patterns:
|
214
|
+
|
215
|
+
```python
|
216
|
+
from kreuzberg import ExtractionConfig, extract_file, SpacyEntityExtractionConfig
|
217
|
+
|
218
|
+
async def extract_entities_and_keywords():
|
219
|
+
# Basic extraction
|
220
|
+
config = ExtractionConfig(
|
221
|
+
extract_entities=True,
|
222
|
+
extract_keywords=True,
|
223
|
+
keyword_count=5,
|
224
|
+
custom_entity_patterns={
|
225
|
+
"INVOICE_ID": r"INV-\d+",
|
226
|
+
"EMAIL": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",
|
227
|
+
},
|
228
|
+
)
|
229
|
+
result = await extract_file("document.pdf", config=config)
|
230
|
+
|
231
|
+
# Print extracted entities
|
232
|
+
if result.entities:
|
233
|
+
for entity in result.entities:
|
234
|
+
print(f"{entity.type}: {entity.text}")
|
235
|
+
|
236
|
+
# Print extracted keywords
|
237
|
+
if result.keywords:
|
238
|
+
for keyword, score in result.keywords:
|
239
|
+
print(f"Keyword: {keyword} (score: {score:.3f})")
|
240
|
+
|
241
|
+
async def extract_multilingual_entities():
|
242
|
+
# Configure spaCy for multiple languages
|
243
|
+
spacy_config = SpacyEntityExtractionConfig(
|
244
|
+
language_models={
|
245
|
+
"en": "en_core_web_sm",
|
246
|
+
"de": "de_core_news_sm",
|
247
|
+
"fr": "fr_core_news_sm",
|
248
|
+
},
|
249
|
+
fallback_to_multilingual=True,
|
250
|
+
)
|
251
|
+
|
252
|
+
config = ExtractionConfig(
|
253
|
+
auto_detect_language=True, # Automatically detect document languages
|
254
|
+
extract_entities=True,
|
255
|
+
spacy_entity_extraction_config=spacy_config,
|
256
|
+
)
|
257
|
+
|
258
|
+
result = await extract_file("multilingual_document.pdf", config=config)
|
259
|
+
|
260
|
+
if result.detected_languages:
|
261
|
+
print(f"Detected languages: {result.detected_languages}")
|
262
|
+
|
263
|
+
if result.entities:
|
264
|
+
print(f"Extracted {len(result.entities)} entities")
|
265
|
+
for entity in result.entities:
|
266
|
+
print(f" {entity.type}: {entity.text}")
|
267
|
+
```
|
268
|
+
|
151
269
|
## Synchronous API
|
152
270
|
|
153
271
|
For cases where async isn't needed or available:
|
@@ -102,6 +102,38 @@ Table extraction is an optional feature that allows Kreuzberg to extract tables
|
|
102
102
|
pip install "kreuzberg[gmft]"
|
103
103
|
```
|
104
104
|
|
105
|
+
### Language Detection
|
106
|
+
|
107
|
+
Language detection is an optional feature that automatically detects the language of extracted text. It uses the [fast-langdetect](https://github.com/LlmKira/fast-langdetect) package. To install Kreuzberg with language detection support, you can use:
|
108
|
+
|
109
|
+
```shell
|
110
|
+
pip install "kreuzberg[langdetect]"
|
111
|
+
```
|
112
|
+
|
113
|
+
### Entity and Keyword Extraction
|
114
|
+
|
115
|
+
Entity and keyword extraction are optional features that extract named entities and keywords from documents. Entity extraction uses [spaCy](https://spacy.io/) for multilingual named entity recognition, while keyword extraction uses [KeyBERT](https://github.com/MaartenGr/KeyBERT) for semantic keyword extraction:
|
116
|
+
|
117
|
+
```shell
|
118
|
+
pip install "kreuzberg[entity-extraction]"
|
119
|
+
```
|
120
|
+
|
121
|
+
After installation, you'll need to download the spaCy language models you plan to use:
|
122
|
+
|
123
|
+
```shell
|
124
|
+
# Download English model (most common)
|
125
|
+
python -m spacy download en_core_web_sm
|
126
|
+
|
127
|
+
# Download other language models as needed
|
128
|
+
python -m spacy download de_core_news_sm # German
|
129
|
+
python -m spacy download fr_core_news_sm # French
|
130
|
+
python -m spacy download es_core_news_sm # Spanish
|
131
|
+
```
|
132
|
+
|
133
|
+
!!! note "Language Model Requirements"
|
134
|
+
|
135
|
+
spaCy language models are large (50-500MB each) and are downloaded separately. Only download the models for languages you actually need to process. See the [spaCy models documentation](https://spacy.io/models) for a complete list of available models.
|
136
|
+
|
105
137
|
### All Optional Dependencies
|
106
138
|
|
107
139
|
To install Kreuzberg with all optional dependencies, you can use the `all` extra group:
|
@@ -113,5 +145,5 @@ pip install "kreuzberg[all]"
|
|
113
145
|
This is equivalent to:
|
114
146
|
|
115
147
|
```shell
|
116
|
-
pip install "kreuzberg[chunking,easyocr,gmft,paddleocr]"
|
148
|
+
pip install "kreuzberg[chunking,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
|
117
149
|
```
|
@@ -124,6 +124,34 @@ Additional dependencies by variant:
|
|
124
124
|
- **gmft**: GMFT for table extraction
|
125
125
|
- **all**: All optional dependencies
|
126
126
|
|
127
|
+
### Health Check
|
128
|
+
|
129
|
+
All Docker images include a health check endpoint:
|
130
|
+
|
131
|
+
```bash
|
132
|
+
# Check API health
|
133
|
+
curl http://localhost:8000/health
|
134
|
+
```
|
135
|
+
|
136
|
+
The endpoint returns a JSON response with service status and version information.
|
137
|
+
|
138
|
+
### Observability
|
139
|
+
|
140
|
+
The Docker images include built-in OpenTelemetry instrumentation via Litestar:
|
141
|
+
|
142
|
+
- **Tracing**: Automatic request/response tracing
|
143
|
+
- **Metrics**: Performance and usage metrics
|
144
|
+
- **Logging**: Structured JSON logging
|
145
|
+
|
146
|
+
Configure via standard OpenTelemetry environment variables:
|
147
|
+
|
148
|
+
```bash
|
149
|
+
docker run -p 8000:8000 \
|
150
|
+
-e OTEL_SERVICE_NAME=kreuzberg-api \
|
151
|
+
-e OTEL_EXPORTER_OTLP_ENDPOINT=http://your-collector:4317 \
|
152
|
+
goldziher/kreuzberg:latest
|
153
|
+
```
|
154
|
+
|
127
155
|
### Environment Variables
|
128
156
|
|
129
157
|
- `PYTHONUNBUFFERED=1` - Ensures proper logging output
|
@@ -150,6 +178,12 @@ server {
|
|
150
178
|
client_max_body_size 100M;
|
151
179
|
proxy_read_timeout 300s;
|
152
180
|
}
|
181
|
+
|
182
|
+
# Health check endpoint
|
183
|
+
location /health {
|
184
|
+
proxy_pass http://localhost:8000/health;
|
185
|
+
access_log off;
|
186
|
+
}
|
153
187
|
}
|
154
188
|
```
|
155
189
|
|
@@ -175,6 +209,21 @@ spec:
|
|
175
209
|
image: goldziher/kreuzberg:latest
|
176
210
|
ports:
|
177
211
|
- containerPort: 8000
|
212
|
+
livenessProbe:
|
213
|
+
httpGet:
|
214
|
+
path: /health
|
215
|
+
port: 8000
|
216
|
+
initialDelaySeconds: 30
|
217
|
+
periodSeconds: 10
|
218
|
+
readinessProbe:
|
219
|
+
httpGet:
|
220
|
+
path: /health
|
221
|
+
port: 8000
|
222
|
+
initialDelaySeconds: 5
|
223
|
+
periodSeconds: 5
|
224
|
+
env:
|
225
|
+
- name: OTEL_SERVICE_NAME
|
226
|
+
value: "kreuzberg-api"
|
178
227
|
resources:
|
179
228
|
requests:
|
180
229
|
memory: "512Mi"
|