kreuzberg 3.5.0__tar.gz → 3.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.github/workflows/publish-docker.yml +10 -21
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.github/workflows/release.yaml +6 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/PKG-INFO +11 -5
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/ai-rulez.yaml +25 -9
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/api-reference/types.md +18 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/examples/extraction-examples.md +77 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/getting-started/installation.md +25 -1
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/extraction-configuration.md +128 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/__init__.py +4 -1
- kreuzberg-3.6.1/kreuzberg/_entity_extraction.py +239 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_types.py +35 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/extraction.py +39 -22
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/pyproject.toml +13 -10
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/cli_integration_test.py +1 -1
- kreuzberg-3.6.1/tests/entity_extraction_test.py +102 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extraction_test.py +0 -1
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extractors/pdf_test.py +0 -1
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/uv.lock +401 -67
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.commitlintrc +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.docker/Dockerfile +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.docker/README.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.dockerignore +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.gitignore +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.gitmodules +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.markdownlint.yaml +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/.pre-commit-config.yaml +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/LICENSE +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/README.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/README.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/benchmark_baseline.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/end_to_end_benchmark.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/final_benchmark.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/baseline_results.json +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/benchmark_msgpack_20250702_003800.json +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/comprehensive_caching_results.json +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/final_benchmark_results.json +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/mime_caching_results.json +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/msgspec_caching_results.json +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/ocr_caching_results.json +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/serialization_benchmark_results.json +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/statistical_benchmark_results.json +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/results/table_caching_results.json +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/serialization_benchmark.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/src/kreuzberg_benchmarks/__init__.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/src/kreuzberg_benchmarks/__main__.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/src/kreuzberg_benchmarks/cli.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/src/kreuzberg_benchmarks/models.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/src/kreuzberg_benchmarks/profiler.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/src/kreuzberg_benchmarks/runner.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/benchmarks/statistical_benchmark.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/advanced/index.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/assets/logo.png +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/changelog.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/cli.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/contributing.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/css/extra.css +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/examples/index.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/index.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_cli_config.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_extractors/_pdf.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_gmft.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_multiprocessing/__init__.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_multiprocessing/gmft_isolated.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_multiprocessing/process_manager.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_multiprocessing/sync_easyocr.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_multiprocessing/sync_paddleocr.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_multiprocessing/sync_tesseract.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_multiprocessing/tesseract_pool.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/mkdocs.yaml +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/__init__.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/api/__init__.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/api/main_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/chunker_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/cli_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/conftest.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/exceptions_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extraction_batch_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/extractors/spreed_sheet_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/gmft_extended_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/gmft_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/hooks_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/language_detection_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/mime_types_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/multiprocessing/sync_tesseract_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/playa_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/registry_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/types_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.5.0 → kreuzberg-3.6.1}/tests/utils/tmp_test.py +0 -0
@@ -2,18 +2,14 @@
|
|
2
2
|
name: Publish Docker Images
|
3
3
|
|
4
4
|
on:
|
5
|
-
workflow_run:
|
6
|
-
workflows: ["Release"]
|
7
|
-
types:
|
8
|
-
- completed
|
9
|
-
branches:
|
10
|
-
- main
|
11
5
|
workflow_dispatch:
|
6
|
+
release:
|
7
|
+
types: [published]
|
12
8
|
|
13
9
|
jobs:
|
14
10
|
build-and-push:
|
15
11
|
runs-on: ubuntu-latest
|
16
|
-
if: ${{ github.
|
12
|
+
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'release' }}
|
17
13
|
permissions:
|
18
14
|
contents: read
|
19
15
|
packages: write
|
@@ -41,28 +37,21 @@ jobs:
|
|
41
37
|
- name: Checkout repository
|
42
38
|
uses: actions/checkout@v4
|
43
39
|
with:
|
44
|
-
ref: ${{ github.
|
40
|
+
ref: ${{ github.ref }}
|
45
41
|
|
46
42
|
- name: Get release version
|
47
43
|
id: get_version
|
48
44
|
run: |
|
49
|
-
if [
|
50
|
-
# For
|
45
|
+
if [ "${{ github.event_name }}" = "release" ]; then
|
46
|
+
# For release events, use the release tag
|
47
|
+
VERSION="${{ github.event.release.tag_name }}"
|
48
|
+
else
|
49
|
+
# For workflow_dispatch, get the latest tag
|
51
50
|
git fetch --tags
|
52
51
|
VERSION=$(git tag --sort=-version:refname | head -n1)
|
53
|
-
else
|
54
|
-
# For workflow_run, use the head branch
|
55
|
-
VERSION="${{ github.event.workflow_run.head_branch }}"
|
56
|
-
# If triggered by a tag, extract version
|
57
|
-
if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+ ]]; then
|
58
|
-
VERSION="$VERSION"
|
59
|
-
else
|
60
|
-
# Get the latest tag by listing all tags
|
61
|
-
git fetch --tags
|
62
|
-
VERSION=$(git tag --sort=-version:refname | head -n1)
|
63
|
-
fi
|
64
52
|
fi
|
65
53
|
echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
|
54
|
+
echo "Using version: $VERSION"
|
66
55
|
|
67
56
|
- name: Set up QEMU
|
68
57
|
uses: docker/setup-qemu-action@v3
|
@@ -10,6 +10,7 @@ jobs:
|
|
10
10
|
environment: pypi
|
11
11
|
permissions:
|
12
12
|
id-token: write
|
13
|
+
contents: read
|
13
14
|
steps:
|
14
15
|
- name: Checkout
|
15
16
|
uses: actions/checkout@v4
|
@@ -29,3 +30,8 @@ jobs:
|
|
29
30
|
|
30
31
|
- name: Publish
|
31
32
|
uses: pypa/gh-action-pypi-publish@release/v1
|
33
|
+
|
34
|
+
- name: Docker Build Info
|
35
|
+
run: |
|
36
|
+
echo "Docker images will be built automatically by the publish-docker.yml workflow"
|
37
|
+
echo "triggered by this release event. No manual triggering needed."
|
@@ -1,12 +1,12 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.5.0
|
3
|
+
Version: 3.6.1
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
6
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
7
7
|
License: MIT
|
8
8
|
License-File: LICENSE
|
9
|
-
Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
|
9
|
+
Keywords: document-processing,entity-extraction,image-to-text,keyword-extraction,named-entity-recognition,ner,ocr,pandoc,pdf-extraction,rag,spacy,table-extraction,tesseract,text-extraction,text-processing
|
10
10
|
Classifier: Development Status :: 5 - Production/Stable
|
11
11
|
Classifier: Intended Audience :: Developers
|
12
12
|
Classifier: License :: OSI Approved :: MIT License
|
@@ -36,16 +36,19 @@ Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
|
|
36
36
|
Provides-Extra: all
|
37
37
|
Requires-Dist: click>=8.2.1; extra == 'all'
|
38
38
|
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
39
|
+
Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
|
39
40
|
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
40
|
-
Requires-Dist:
|
41
|
+
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
42
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
|
41
43
|
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
42
44
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
43
45
|
Requires-Dist: rich>=14.0.0; extra == 'all'
|
44
46
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
45
47
|
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
48
|
+
Requires-Dist: spacy>=3.8.7; extra == 'all'
|
46
49
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
47
50
|
Provides-Extra: api
|
48
|
-
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.
|
51
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
|
49
52
|
Provides-Extra: chunking
|
50
53
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
51
54
|
Provides-Extra: cli
|
@@ -54,10 +57,13 @@ Requires-Dist: rich>=14.0.0; extra == 'cli'
|
|
54
57
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
55
58
|
Provides-Extra: easyocr
|
56
59
|
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
60
|
+
Provides-Extra: entity-extraction
|
61
|
+
Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
|
62
|
+
Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
|
57
63
|
Provides-Extra: gmft
|
58
64
|
Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
59
65
|
Provides-Extra: langdetect
|
60
|
-
Requires-Dist: fast-langdetect>=0.2
|
66
|
+
Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
|
61
67
|
Provides-Extra: paddleocr
|
62
68
|
Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
|
63
69
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
|
@@ -1,6 +1,6 @@
|
|
1
1
|
metadata:
|
2
2
|
name: "Kreuzberg"
|
3
|
-
version: "3.
|
3
|
+
version: "3.5.0"
|
4
4
|
description: "A text extraction library supporting PDFs, images, office documents and more"
|
5
5
|
|
6
6
|
outputs:
|
@@ -115,6 +115,7 @@ rules:
|
|
115
115
|
- **OCR Backends**: Pluggable OCR engines with separate configuration classes
|
116
116
|
- **GMFT Integration**: Table extraction using GMFT library for PDFs
|
117
117
|
- **Chunking**: Text splitting functionality in `_chunker.py`
|
118
|
+
- **Language Detection**: Automatic language detection using fast-langdetect
|
118
119
|
- **Async/Sync**: Primary async implementation with sync wrappers in `_utils/_sync.py`
|
119
120
|
- **API Server**: REST API using Litestar framework in `_api/main.py`
|
120
121
|
- **CLI**: Command-line interface for batch processing and automation
|
@@ -144,6 +145,8 @@ rules:
|
|
144
145
|
- Mock OCR responses for predictable testing
|
145
146
|
- Both sync and async test variants
|
146
147
|
- Comprehensive error case coverage
|
148
|
+
- OCR tests marked as `xfail` in CI environments for resilience
|
149
|
+
- Integration tests use timeouts and retry logic where appropriate
|
147
150
|
|
148
151
|
- name: "Important Instructions"
|
149
152
|
priority: 10
|
@@ -160,16 +163,17 @@ rules:
|
|
160
163
|
priority: 6
|
161
164
|
content: |
|
162
165
|
### GitHub Actions Workflows
|
163
|
-
- **Release**: Automated PyPI publishing via GitHub releases
|
164
|
-
- **Docker**: Multi-platform Docker builds (linux/amd64, linux/arm64)
|
166
|
+
- **Release**: Automated PyPI publishing via GitHub releases, triggers Docker builds
|
167
|
+
- **Docker**: Multi-platform Docker builds (linux/amd64, linux/arm64), triggered by releases
|
165
168
|
- **Documentation**: Auto-deploy to GitHub Pages on docs changes
|
169
|
+
- **CI**: Comprehensive testing across multiple Python versions and platforms
|
166
170
|
|
167
171
|
### Docker Variants
|
168
|
-
- **Core** (`goldziher/kreuzberg:v3.
|
169
|
-
- **EasyOCR** (`goldziher/kreuzberg:v3.
|
170
|
-
- **PaddleOCR** (`goldziher/kreuzberg:v3.
|
171
|
-
- **GMFT** (`goldziher/kreuzberg:v3.
|
172
|
-
- **All** (`goldziher/kreuzberg:v3.
|
172
|
+
- **Core** (`goldziher/kreuzberg:v3.5.0`): API + Tesseract OCR
|
173
|
+
- **EasyOCR** (`goldziher/kreuzberg:v3.5.0-easyocr`): Core + EasyOCR
|
174
|
+
- **PaddleOCR** (`goldziher/kreuzberg:v3.5.0-paddle`): Core + PaddleOCR
|
175
|
+
- **GMFT** (`goldziher/kreuzberg:v3.5.0-gmft`): Core + table extraction
|
176
|
+
- **All** (`goldziher/kreuzberg:v3.5.0-all`): All features included
|
173
177
|
|
174
178
|
### Manual Triggers
|
175
179
|
- Docker builds: `gh workflow run "Publish Docker Images"`
|
@@ -191,8 +195,9 @@ rules:
|
|
191
195
|
chunking = ["semantic-text-splitter>=0.27.0"]
|
192
196
|
easyocr = ["easyocr>=1.7.2"]
|
193
197
|
gmft = ["gmft>=0.4.2"]
|
198
|
+
langdetect = ["fast-langdetect>=0.2.0"]
|
194
199
|
paddleocr = ["paddleocr>=3.1.0", "paddlepaddle>=3.1.0", "setuptools>=80.9.0"]
|
195
|
-
all = ["kreuzberg[api,chunking,cli,easyocr,gmft,paddleocr]"]
|
200
|
+
all = ["kreuzberg[api,chunking,cli,easyocr,gmft,langdetect,paddleocr]"]
|
196
201
|
```
|
197
202
|
|
198
203
|
### Installation Patterns
|
@@ -207,6 +212,17 @@ rules:
|
|
207
212
|
- **Development**: Uses dependency groups in pyproject.toml
|
208
213
|
|
209
214
|
sections:
|
215
|
+
- title: "Language Detection"
|
216
|
+
content: |
|
217
|
+
### Automatic Language Detection (v3.5.0+)
|
218
|
+
- **Feature**: Automatically detect languages in extracted text
|
219
|
+
- **Implementation**: Uses fast-langdetect library for high-performance detection
|
220
|
+
- **Configuration**:
|
221
|
+
- Enable with `auto_detect_language=True` in `ExtractionConfig`
|
222
|
+
- Configure via `LanguageDetectionConfig` for confidence thresholds
|
223
|
+
- **Output**: Results available in `ExtractionResult.detected_languages`
|
224
|
+
- **Integration**: Works with all extraction methods and file types
|
225
|
+
|
210
226
|
- title: "Planned Features"
|
211
227
|
content: |
|
212
228
|
### Structured Extraction (Issue #55)
|
@@ -40,10 +40,28 @@ Configuration options for the GMFT table extraction engine:
|
|
40
40
|
|
41
41
|
::: kreuzberg.GMFTConfig
|
42
42
|
|
43
|
+
## Entity Extraction Configuration
|
44
|
+
|
45
|
+
Configuration options for spaCy-based entity extraction:
|
46
|
+
|
47
|
+
::: kreuzberg.SpacyEntityExtractionConfig
|
48
|
+
|
49
|
+
## Language Detection Configuration
|
50
|
+
|
51
|
+
Configuration options for automatic language detection:
|
52
|
+
|
53
|
+
::: kreuzberg.LanguageDetectionConfig
|
54
|
+
|
43
55
|
## PSMMode (Page Segmentation Mode)
|
44
56
|
|
45
57
|
::: kreuzberg.PSMMode
|
46
58
|
|
59
|
+
## Entity
|
60
|
+
|
61
|
+
Represents an extracted named entity:
|
62
|
+
|
63
|
+
::: kreuzberg.Entity
|
64
|
+
|
47
65
|
## Metadata
|
48
66
|
|
49
67
|
A TypedDict that contains optional metadata fields extracted from documents:
|
@@ -189,6 +189,83 @@ async def process_upload(file_content: bytes, mime_type: str):
|
|
189
189
|
print(f"{key}: {value}")
|
190
190
|
```
|
191
191
|
|
192
|
+
## Keywords
|
193
|
+
|
194
|
+
Kreuzberg supports keyword extraction as follows (regex-based custom entity patterns are covered under "Entity and Keyword Extraction" below):
|
195
|
+
|
196
|
+
```python
|
197
|
+
from kreuzberg import ExtractionConfig, extract_file
|
198
|
+
|
199
|
+
async def extract_keywords():
|
200
|
+
config = ExtractionConfig(
|
201
|
+
extract_keywords=True,
|
202
|
+
keyword_count=5, # defaults to 10 if not set
|
203
|
+
)
|
204
|
+
result = await extract_file(
|
205
|
+
"document.pdf",
|
206
|
+
config=config,
|
207
|
+
)
|
208
|
+
print(f"Keywords: {result.keywords}")
|
209
|
+
```
|
210
|
+
|
211
|
+
## Entity and Keyword Extraction
|
212
|
+
|
213
|
+
Kreuzberg can extract named entities using spaCy and keywords using KeyBERT. It automatically detects entities like people, organizations, locations, and more, plus supports custom regex patterns:
|
214
|
+
|
215
|
+
```python
|
216
|
+
from kreuzberg import ExtractionConfig, extract_file, SpacyEntityExtractionConfig
|
217
|
+
|
218
|
+
async def extract_entities_and_keywords():
|
219
|
+
# Basic extraction
|
220
|
+
config = ExtractionConfig(
|
221
|
+
extract_entities=True,
|
222
|
+
extract_keywords=True,
|
223
|
+
keyword_count=5,
|
224
|
+
custom_entity_patterns={
|
225
|
+
"INVOICE_ID": r"INV-\d+",
|
226
|
+
"EMAIL": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",
|
227
|
+
},
|
228
|
+
)
|
229
|
+
result = await extract_file("document.pdf", config=config)
|
230
|
+
|
231
|
+
# Print extracted entities
|
232
|
+
if result.entities:
|
233
|
+
for entity in result.entities:
|
234
|
+
print(f"{entity.type}: {entity.text}")
|
235
|
+
|
236
|
+
# Print extracted keywords
|
237
|
+
if result.keywords:
|
238
|
+
for keyword, score in result.keywords:
|
239
|
+
print(f"Keyword: {keyword} (score: {score:.3f})")
|
240
|
+
|
241
|
+
async def extract_multilingual_entities():
|
242
|
+
# Configure spaCy for multiple languages
|
243
|
+
spacy_config = SpacyEntityExtractionConfig(
|
244
|
+
language_models={
|
245
|
+
"en": "en_core_web_sm",
|
246
|
+
"de": "de_core_news_sm",
|
247
|
+
"fr": "fr_core_news_sm",
|
248
|
+
},
|
249
|
+
fallback_to_multilingual=True,
|
250
|
+
)
|
251
|
+
|
252
|
+
config = ExtractionConfig(
|
253
|
+
auto_detect_language=True, # Automatically detect document languages
|
254
|
+
extract_entities=True,
|
255
|
+
spacy_entity_extraction_config=spacy_config,
|
256
|
+
)
|
257
|
+
|
258
|
+
result = await extract_file("multilingual_document.pdf", config=config)
|
259
|
+
|
260
|
+
if result.detected_languages:
|
261
|
+
print(f"Detected languages: {result.detected_languages}")
|
262
|
+
|
263
|
+
if result.entities:
|
264
|
+
print(f"Extracted {len(result.entities)} entities")
|
265
|
+
for entity in result.entities:
|
266
|
+
print(f" {entity.type}: {entity.text}")
|
267
|
+
```
|
268
|
+
|
192
269
|
## Synchronous API
|
193
270
|
|
194
271
|
For cases where async isn't needed or available:
|
@@ -110,6 +110,30 @@ Language detection is an optional feature that automatically detects the languag
|
|
110
110
|
pip install "kreuzberg[langdetect]"
|
111
111
|
```
|
112
112
|
|
113
|
+
### Entity and Keyword Extraction
|
114
|
+
|
115
|
+
Entity and keyword extraction are optional features that extract named entities and keywords from documents. Entity extraction uses [spaCy](https://spacy.io/) for multilingual named entity recognition, while keyword extraction uses [KeyBERT](https://github.com/MaartenGr/KeyBERT) for semantic keyword extraction:
|
116
|
+
|
117
|
+
```shell
|
118
|
+
pip install "kreuzberg[entity-extraction]"
|
119
|
+
```
|
120
|
+
|
121
|
+
After installation, you'll need to download the spaCy language models you plan to use:
|
122
|
+
|
123
|
+
```shell
|
124
|
+
# Download English model (most common)
|
125
|
+
python -m spacy download en_core_web_sm
|
126
|
+
|
127
|
+
# Download other language models as needed
|
128
|
+
python -m spacy download de_core_news_sm # German
|
129
|
+
python -m spacy download fr_core_news_sm # French
|
130
|
+
python -m spacy download es_core_news_sm # Spanish
|
131
|
+
```
|
132
|
+
|
133
|
+
!!! note "Language Model Requirements"
|
134
|
+
|
135
|
+
spaCy language models are large (50-500MB each) and are downloaded separately. Only download the models for languages you actually need to process. See the [spaCy models documentation](https://spacy.io/models) for a complete list of available models.
|
136
|
+
|
113
137
|
### All Optional Dependencies
|
114
138
|
|
115
139
|
To install Kreuzberg with all optional dependencies, you can use the `all` extra group:
|
@@ -121,5 +145,5 @@ pip install "kreuzberg[all]"
|
|
121
145
|
This is equivalent to:
|
122
146
|
|
123
147
|
```shell
|
124
|
-
pip install "kreuzberg[chunking,easyocr,gmft,langdetect,paddleocr]"
|
148
|
+
pip install "kreuzberg[api,chunking,cli,easyocr,entity-extraction,gmft,langdetect,paddleocr]"
|
125
149
|
```
|
@@ -153,6 +153,134 @@ The feature requires the `langdetect` dependency:
|
|
153
153
|
pip install "kreuzberg[langdetect]"
|
154
154
|
```
|
155
155
|
|
156
|
+
### Entity and Keyword Extraction
|
157
|
+
|
158
|
+
Kreuzberg can extract named entities and keywords from documents using spaCy for entity recognition and KeyBERT for keyword extraction:
|
159
|
+
|
160
|
+
```python
|
161
|
+
from kreuzberg import extract_file, ExtractionConfig, SpacyEntityExtractionConfig
|
162
|
+
|
163
|
+
# Basic entity and keyword extraction
|
164
|
+
result = await extract_file(
|
165
|
+
"document.pdf",
|
166
|
+
config=ExtractionConfig(
|
167
|
+
extract_entities=True,
|
168
|
+
extract_keywords=True,
|
169
|
+
keyword_count=10, # Number of keywords to extract (default: 10)
|
170
|
+
),
|
171
|
+
)
|
172
|
+
|
173
|
+
# Access extracted entities and keywords
|
174
|
+
if result.entities:
|
175
|
+
for entity in result.entities:
|
176
|
+
print(f"{entity.type}: {entity.text} (position {entity.start}-{entity.end})")
|
177
|
+
# Example: "PERSON: John Doe (position 0-8)"
|
178
|
+
|
179
|
+
if result.keywords:
|
180
|
+
for keyword, score in result.keywords:
|
181
|
+
print(f"{keyword}: {score:.3f}")
|
182
|
+
# Example: "artificial intelligence: 0.845"
|
183
|
+
```
|
184
|
+
|
185
|
+
#### Entity Extraction with Language Support
|
186
|
+
|
187
|
+
spaCy supports entity extraction in multiple languages. You can configure language-specific models:
|
188
|
+
|
189
|
+
```python
|
190
|
+
from kreuzberg import extract_file, ExtractionConfig, SpacyEntityExtractionConfig
|
191
|
+
|
192
|
+
# Configure spaCy for specific languages
|
193
|
+
spacy_config = SpacyEntityExtractionConfig(
|
194
|
+
language_models={
|
195
|
+
"en": "en_core_web_sm", # English
|
196
|
+
"de": "de_core_news_sm", # German
|
197
|
+
"fr": "fr_core_news_sm", # French
|
198
|
+
"es": "es_core_news_sm", # Spanish
|
199
|
+
},
|
200
|
+
model_cache_dir="/tmp/spacy_models", # Custom model cache directory
|
201
|
+
fallback_to_multilingual=True, # Use multilingual model if language-specific model fails
|
202
|
+
)
|
203
|
+
|
204
|
+
# Extract with language detection to automatically choose the right model
|
205
|
+
result = await extract_file(
|
206
|
+
"multilingual_document.pdf",
|
207
|
+
config=ExtractionConfig(
|
208
|
+
auto_detect_language=True, # Enable language detection
|
209
|
+
extract_entities=True,
|
210
|
+
spacy_entity_extraction_config=spacy_config,
|
211
|
+
),
|
212
|
+
)
|
213
|
+
|
214
|
+
# The system will automatically use the appropriate spaCy model based on detected languages
|
215
|
+
if result.detected_languages and result.entities:
|
216
|
+
print(f"Detected languages: {result.detected_languages}")
|
217
|
+
print(f"Extracted {len(result.entities)} entities")
|
218
|
+
```
|
219
|
+
|
220
|
+
#### Custom Entity Patterns
|
221
|
+
|
222
|
+
You can define custom entity patterns using regular expressions:
|
223
|
+
|
224
|
+
```python
|
225
|
+
result = await extract_file(
|
226
|
+
"invoice.pdf",
|
227
|
+
config=ExtractionConfig(
|
228
|
+
extract_entities=True,
|
229
|
+
custom_entity_patterns={
|
230
|
+
"INVOICE_ID": r"INV-\d{4,}", # Invoice numbers
|
231
|
+
"PHONE": r"\+?\d{1,3}[-.\s]?\d{3,4}[-.\s]?\d{3,4}[-.\s]?\d{3,4}", # Phone numbers
|
232
|
+
"EMAIL": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", # Email addresses
|
233
|
+
},
|
234
|
+
),
|
235
|
+
)
|
236
|
+
|
237
|
+
# Custom patterns are combined with spaCy's standard entity types
|
238
|
+
for entity in result.entities or []:
|
239
|
+
if entity.type in ["INVOICE_ID", "PHONE", "EMAIL"]:
|
240
|
+
print(f"Custom entity - {entity.type}: {entity.text}")
|
241
|
+
else:
|
242
|
+
print(f"Standard entity - {entity.type}: {entity.text}")
|
243
|
+
```
|
244
|
+
|
245
|
+
#### Supported Entity Types
|
246
|
+
|
247
|
+
spaCy automatically detects these standard entity types:
|
248
|
+
|
249
|
+
- **PERSON**: People's names
|
250
|
+
- **ORG**: Organizations, companies, agencies
|
251
|
+
- **GPE**: Countries, cities, states (Geopolitical entities)
|
252
|
+
- **MONEY**: Monetary values
|
253
|
+
- **DATE**: Date expressions
|
254
|
+
- **TIME**: Time expressions
|
255
|
+
- **PERCENT**: Percentage values
|
256
|
+
- **CARDINAL**: Numerals that do not fall under another type
|
257
|
+
|
258
|
+
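The exact label set depends on the model. Language-specific models may support additional entity types relevant to that language, and many non-English and multilingual models use a smaller scheme (for example `PER`, `LOC`, `ORG`, and `MISC`) instead of the labels listed above.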
|
259
|
+
|
260
|
+
#### spaCy Configuration Options
|
261
|
+
|
262
|
+
- `language_models`: Dict mapping language codes to spaCy model names
|
263
|
+
- `model_cache_dir`: Custom directory for caching spaCy models
|
264
|
+
- `fallback_to_multilingual`: Whether to use the multilingual model (`xx_ent_wiki_sm`) as a fallback when the language-specific model fails
|
265
|
+
- `max_doc_length`: Maximum document length for spaCy processing (default: 1,000,000 characters)
|
266
|
+
- `batch_size`: Batch size for processing multiple texts (default: 1,000)
|
267
|
+
|
268
|
+
#### Installation Requirements
|
269
|
+
|
270
|
+
Entity and keyword extraction require additional dependencies:
|
271
|
+
|
272
|
+
```shell
|
273
|
+
# For entity and keyword extraction (spaCy and KeyBERT)
|
274
|
+
pip install "kreuzberg[entity-extraction]"
|
275
|
+
|
276
|
+
# Install specific spaCy language models as needed
|
277
|
+
python -m spacy download en_core_web_sm # English
|
278
|
+
python -m spacy download de_core_news_sm # German
|
279
|
+
python -m spacy download fr_core_news_sm # French
|
280
|
+
```
|
281
|
+
|
282
|
+
Available spaCy models include: `en_core_web_sm`, `de_core_news_sm`, `fr_core_news_sm`, `es_core_news_sm`, `pt_core_news_sm`, `it_core_news_sm`, `nl_core_news_sm`, `zh_core_web_sm`, `ja_core_news_sm`, `ko_core_news_sm`, `ru_core_news_sm`, and many others.
|
283
|
+
|
156
284
|
### Batch Processing
|
157
285
|
|
158
286
|
```python
|
@@ -1,5 +1,6 @@
|
|
1
1
|
from importlib.metadata import version
|
2
2
|
|
3
|
+
from kreuzberg._entity_extraction import SpacyEntityExtractionConfig
|
3
4
|
from kreuzberg._gmft import GMFTConfig
|
4
5
|
from kreuzberg._language_detection import LanguageDetectionConfig
|
5
6
|
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
@@ -8,7 +9,7 @@ from kreuzberg._ocr._tesseract import TesseractConfig
|
|
8
9
|
|
9
10
|
from ._ocr._tesseract import PSMMode
|
10
11
|
from ._registry import ExtractorRegistry
|
11
|
-
from ._types import ExtractionConfig, ExtractionResult, Metadata, TableData
|
12
|
+
from ._types import Entity, ExtractionConfig, ExtractionResult, Metadata, TableData
|
12
13
|
from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
|
13
14
|
from .extraction import (
|
14
15
|
batch_extract_bytes,
|
@@ -25,6 +26,7 @@ __version__ = version("kreuzberg")
|
|
25
26
|
|
26
27
|
__all__ = [
|
27
28
|
"EasyOCRConfig",
|
29
|
+
"Entity",
|
28
30
|
"ExtractionConfig",
|
29
31
|
"ExtractionResult",
|
30
32
|
"ExtractorRegistry",
|
@@ -37,6 +39,7 @@ __all__ = [
|
|
37
39
|
"PSMMode",
|
38
40
|
"PaddleOCRConfig",
|
39
41
|
"ParsingError",
|
42
|
+
"SpacyEntityExtractionConfig",
|
40
43
|
"TableData",
|
41
44
|
"TesseractConfig",
|
42
45
|
"ValidationError",
|