kreuzberg 3.17.1__tar.gz → 3.17.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/.github/workflows/ci.yaml +15 -9
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/PKG-INFO +2 -2
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/Taskfile.yml +3 -3
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/ai-rulez.yaml +4 -3
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/contributing.md +9 -5
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_entity_extraction.py +38 -10
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_language_detection.py +0 -2
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/extraction.py +0 -1
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/pyproject.toml +2 -3
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/features/entity_extraction_test.py +68 -12
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/features/language_detection_test.py +1 -12
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/uv.lock +261 -294
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/.commitlintrc +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/.deepsource.toml +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/.docker/Dockerfile +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/.docker/README.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/.dockerignore +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/.github/workflows/docker-e2e-tests.yml +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/.github/workflows/publish-docker.yml +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/.github/workflows/test-docker-builds.yml +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/.gitignore +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/.markdownlint.yaml +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/.pre-commit-config.yaml +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/.prettierignore +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/ATTRIBUTIONS.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/LICENSE +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/README.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/benchmarks/README.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/benchmarks/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/benchmarks/batch_size_benchmark.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/benchmarks/batch_validation_benchmark.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/benchmarks/py.typed +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/benchmarks/src/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/benchmarks/src/__main__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/benchmarks/src/benchmarks.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/benchmarks/src/cli.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/benchmarks/src/models.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/benchmarks/src/profiler.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/benchmarks/src/runner.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/benchmarks/token_reduction_compression_benchmark.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/advanced/index.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/assets/logo.png +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/cli.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/css/extra.css +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/examples/index.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/index.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/user-guide/document-classification.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/docs/user-guide/token-reduction.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_api/_config_cache.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_config.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_document_classification.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_extractors/_email.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_extractors/_pdf.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_extractors/_structured.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_gmft.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_mcp/server.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_ocr/_table_extractor.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/_reducer.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/_stopwords.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/af_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/ar_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/bg_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/bn_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/br_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/ca_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/cs_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/da_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/de_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/el_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/en_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/eo_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/es_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/et_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/eu_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/fa_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/fi_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/fr_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/ga_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/gl_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/gu_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/ha_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/he_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/hi_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/hr_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/hu_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/hy_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/id_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/it_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/ja_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/kn_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/ko_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/ku_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/la_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/lt_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/lv_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/ml_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/mr_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/ms_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/ne_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/nl_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/no_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/pl_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/pt_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/ro_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/ru_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/si_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/sk_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/sl_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/so_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/st_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/sv_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/sw_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/ta_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/te_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/th_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/tl_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/tr_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/uk_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/ur_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/vi_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/yo_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/zh_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_token_reduction/stopwords/zu_stopwords.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_types.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_utils/_html_streaming.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_utils/_image_preprocessing.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_utils/_ocr_cache.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_utils/_ref.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_utils/_resource_managers.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/mkdocs.yaml +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/api/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/api/config_cache_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/api/conftest.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/api/header_config_hashing_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/api/image_extraction_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/api/main_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/api/runtime_config_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/conftest.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/core/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/core/comprehensive_config_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/core/config_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/core/constants_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/core/dpi_configuration_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/core/exceptions_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/core/extraction_batch_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/core/extraction_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/core/html_to_markdown_config_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/core/image_ocr_result_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/core/init_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/core/main_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/core/mime_types_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/core/registry_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/core/types_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/e2e/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/e2e/docker_e2e.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/README_image_tests.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/base_extractor_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/base_memory_limits_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/base_ocr_processing_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/base_ocr_simple_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/email_error_paths_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/email_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/html_invalid_base64_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/image_deduplication_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/image_error_handling_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/image_error_simple_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/json_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/pdf_images_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/pdf_sync_images_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/pdf_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/spreadsheet_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/extractors/structured_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/features/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/features/chunker_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/features/document_classification_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/features/gmft_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/features/hooks_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/features/table_extraction_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/features/token_reduction_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/integration/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/integration/all_extractors_images_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/integration/api/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/integration/api/large_file_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/integration/api/mounted_config_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/integration/dpi_integration_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/integration/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/integration/ocr/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/integration/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/integration/pandoc_images_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/integration/pdf_images_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/integration/pdf_real_images_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/integration/pptx_complex_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/integration/pptx_images_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/integration/regression_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/integration/token_reduction_integration_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/interfaces/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/interfaces/cli_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/interfaces/mcp_server_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/mcp/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/mcp/mcp_server_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/multiprocessing/gmft_isolated_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/ocr/tesseract_tsv_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/performance/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/performance/large_pdf_perf_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/flower-no-text.jpg +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/google-doc-document.pdf +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/json/complex_nested.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/json/real_world/aws_policy.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/json/real_world/earthquakes.geojson +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/json/real_world/github_emojis.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/json/real_world/iss_location.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/json/real_world/openapi_spec.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/json/real_world/package.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/json/real_world/rick_morty_character.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/json/schema_test.json +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/sharable-web-guide.pdf +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/tables/borderless_table.png +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/tables/complex_document.png +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/tables/simple_table.png +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/test-excel.xls +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/utils/ocr_cache_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/utils/playa_helpers_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/utils/playa_metadata_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/utils/playa_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/utils/quality_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/utils/ref_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.17.1 → kreuzberg-3.17.3}/tests/utils/tmp_test.py +0 -0
@@ -44,15 +44,21 @@ jobs:
|
|
44
44
|
uv sync --all-extras --dev
|
45
45
|
shell: bash
|
46
46
|
|
47
|
-
- name:
|
48
|
-
|
47
|
+
- name: Install prek
|
48
|
+
run: |
|
49
|
+
# Install prek using uv (recommended method)
|
50
|
+
uv tool install prek
|
51
|
+
echo "$HOME/.local/bin" >> $GITHUB_PATH
|
52
|
+
|
53
|
+
- name: Load Cached Prek Dependencies
|
54
|
+
id: cached-prek-dependencies
|
49
55
|
uses: actions/cache@v4
|
50
56
|
with:
|
51
|
-
path: ~/.cache/
|
52
|
-
key:
|
57
|
+
path: ~/.cache/prek/
|
58
|
+
key: prek|${{ env.pythonLocation }}|${{ hashFiles('.pre-commit-config.yaml') }}
|
53
59
|
|
54
|
-
- name: Execute
|
55
|
-
run:
|
60
|
+
- name: Execute Prek
|
61
|
+
run: prek run --show-diff-on-failure --color=always --all-files
|
56
62
|
|
57
63
|
coverage:
|
58
64
|
needs: validate
|
@@ -119,7 +125,7 @@ jobs:
|
|
119
125
|
shell: bash
|
120
126
|
|
121
127
|
- name: Upload Coverage to DeepSource
|
122
|
-
if: always()
|
128
|
+
if: always() && secrets.DEEPSOURCE_DSN != '' && needs.test-pr.result == 'success'
|
123
129
|
env:
|
124
130
|
DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
|
125
131
|
run: |
|
@@ -214,7 +220,7 @@ jobs:
|
|
214
220
|
|
215
221
|
coverage-pr:
|
216
222
|
needs: test-pr
|
217
|
-
if: github.event_name == 'pull_request' &&
|
223
|
+
if: github.event_name == 'pull_request' && needs.test-pr.result == 'success'
|
218
224
|
runs-on: ubuntu-latest
|
219
225
|
timeout-minutes: 10
|
220
226
|
steps:
|
@@ -227,7 +233,7 @@ jobs:
|
|
227
233
|
name: coverage-pr-${{ github.sha }}
|
228
234
|
|
229
235
|
- name: Upload Coverage to DeepSource
|
230
|
-
if: always()
|
236
|
+
if: always() && secrets.DEEPSOURCE_DSN != '' && needs.test-pr.result == 'success'
|
231
237
|
env:
|
232
238
|
DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
|
233
239
|
run: |
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.17.
|
3
|
+
Version: 3.17.3
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -31,7 +31,7 @@ Requires-Python: >=3.10
|
|
31
31
|
Requires-Dist: anyio>=4.10.0
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
|
-
Requires-Dist: html-to-markdown[lxml]>=1.
|
34
|
+
Requires-Dist: html-to-markdown[lxml]>=1.14.0
|
35
35
|
Requires-Dist: langcodes>=3.5.0
|
36
36
|
Requires-Dist: mcp>=1.14.1
|
37
37
|
Requires-Dist: msgspec>=0.18.0
|
@@ -9,7 +9,7 @@ tasks:
|
|
9
9
|
desc: "Install dependencies with uv"
|
10
10
|
cmds:
|
11
11
|
- uv sync --all-extras --all-packages
|
12
|
-
-
|
12
|
+
- prek install && prek install --hook-type commit-msg
|
13
13
|
|
14
14
|
update:
|
15
15
|
desc: "Update the dependencies"
|
@@ -17,7 +17,7 @@ tasks:
|
|
17
17
|
- uv run uv-bump
|
18
18
|
- cd benchmarks && uv run uv-bump && cd -
|
19
19
|
- uv sync --all-extras --all-packages --upgrade
|
20
|
-
-
|
20
|
+
- prek autoupdate
|
21
21
|
|
22
22
|
test:
|
23
23
|
desc: "Run tests with pytest"
|
@@ -32,7 +32,7 @@ tasks:
|
|
32
32
|
lint:
|
33
33
|
desc: "Lint code with ruff and docs with markdownlint"
|
34
34
|
cmds:
|
35
|
-
-
|
35
|
+
- prek run --all-files
|
36
36
|
|
37
37
|
docs:build:
|
38
38
|
desc: "Build documentation"
|
@@ -368,9 +368,10 @@ rules:
|
|
368
368
|
- Fix linting issues: `ruff check --fix`
|
369
369
|
- Type check: `mypy`
|
370
370
|
|
371
|
-
###
|
372
|
-
- Install
|
373
|
-
-
|
371
|
+
### Prek
|
372
|
+
- Install prek: `uv tool install prek`
|
373
|
+
- Install hooks: `prek install && prek install --hook-type commit-msg`
|
374
|
+
- Run manually: `prek run --all-files`
|
374
375
|
|
375
376
|
### Documentation
|
376
377
|
- Build docs: `uv run mkdocs build --clean --strict`
|
@@ -18,10 +18,14 @@ Thank you for contributing to Kreuzberg!
|
|
18
18
|
uv sync --all-extras --dev
|
19
19
|
```
|
20
20
|
|
21
|
-
1. **Install
|
21
|
+
1. **Install prek and hooks**:
|
22
22
|
|
23
23
|
```bash
|
24
|
-
|
24
|
+
# Install prek using uv (recommended)
|
25
|
+
uv tool install prek
|
26
|
+
|
27
|
+
# Install git hooks
|
28
|
+
prek install && prek install --hook-type commit-msg
|
25
29
|
```
|
26
30
|
|
27
31
|
## Development
|
@@ -42,8 +46,8 @@ uv run ruff check # Lint
|
|
42
46
|
uv run ruff check --fix # Auto-fix issues
|
43
47
|
uv run mypy # Type check
|
44
48
|
|
45
|
-
#
|
46
|
-
|
49
|
+
# Prek
|
50
|
+
prek run --all-files # Run all checks manually
|
47
51
|
|
48
52
|
# Documentation
|
49
53
|
uv run mkdocs serve # Serve docs locally
|
@@ -70,7 +74,7 @@ Use [Conventional Commits](https://www.conventionalcommits.org/):
|
|
70
74
|
|
71
75
|
- Python 3.10-3.13 supported
|
72
76
|
- System dependencies (optional): Tesseract, Pandoc
|
73
|
-
-
|
77
|
+
- Prek runs automatically on commit
|
74
78
|
- Join our [Discord](https://discord.gg/pXxagNK2zN) for help
|
75
79
|
|
76
80
|
## License
|
@@ -2,12 +2,14 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import os
|
4
4
|
import re
|
5
|
+
import subprocess
|
6
|
+
import sys
|
5
7
|
from functools import lru_cache
|
6
8
|
from itertools import chain
|
7
9
|
from typing import TYPE_CHECKING, Any
|
8
10
|
|
9
11
|
from kreuzberg._types import Entity, SpacyEntityExtractionConfig
|
10
|
-
from kreuzberg.exceptions import MissingDependencyError
|
12
|
+
from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
|
11
13
|
|
12
14
|
if TYPE_CHECKING:
|
13
15
|
from collections.abc import Sequence
|
@@ -49,8 +51,6 @@ def extract_entities(
|
|
49
51
|
return entities
|
50
52
|
|
51
53
|
nlp = _load_spacy_model(model_name, spacy_config)
|
52
|
-
if not nlp:
|
53
|
-
return entities
|
54
54
|
|
55
55
|
if len(text) > spacy_config.max_doc_length:
|
56
56
|
text = text[: spacy_config.max_doc_length]
|
@@ -77,17 +77,45 @@ def extract_entities(
|
|
77
77
|
def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
|
78
78
|
try:
|
79
79
|
import spacy # noqa: PLC0415
|
80
|
+
except ImportError:
|
81
|
+
return None
|
80
82
|
|
81
|
-
|
82
|
-
|
83
|
+
if spacy_config.model_cache_dir:
|
84
|
+
os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
|
83
85
|
|
86
|
+
try:
|
84
87
|
nlp = spacy.load(model_name)
|
88
|
+
except OSError:
|
89
|
+
result = subprocess.run(
|
90
|
+
[sys.executable, "-m", "spacy", "download", model_name],
|
91
|
+
capture_output=True,
|
92
|
+
text=True,
|
93
|
+
check=False,
|
94
|
+
)
|
85
95
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
96
|
+
if result.returncode != 0:
|
97
|
+
error_msg = (
|
98
|
+
f"Failed to download spaCy model '{model_name}'. "
|
99
|
+
f"Please install it manually with: python -m spacy download {model_name}"
|
100
|
+
)
|
101
|
+
if result.stderr:
|
102
|
+
error_msg += f"\nError details: {result.stderr}"
|
103
|
+
raise KreuzbergError(
|
104
|
+
error_msg, context={"model": model_name, "stderr": result.stderr, "return_code": result.returncode}
|
105
|
+
) from None
|
106
|
+
|
107
|
+
try:
|
108
|
+
nlp = spacy.load(model_name)
|
109
|
+
except OSError as e:
|
110
|
+
raise KreuzbergError(
|
111
|
+
f"Failed to load spaCy model '{model_name}' even after successful download. "
|
112
|
+
f"Please verify your spaCy installation and try reinstalling the model.",
|
113
|
+
context={"model": model_name, "error": str(e)},
|
114
|
+
) from e
|
115
|
+
|
116
|
+
nlp.max_length = spacy_config.max_doc_length
|
117
|
+
|
118
|
+
return nlp
|
91
119
|
|
92
120
|
|
93
121
|
def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
|
@@ -23,9 +23,7 @@ def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -
|
|
23
23
|
config = LanguageDetectionConfig()
|
24
24
|
|
25
25
|
try:
|
26
|
-
# detect always returns a list, use k parameter for multiple languages
|
27
26
|
k = config.top_k if config.multilingual else 1
|
28
|
-
# Use the model from config directly
|
29
27
|
model = config.model
|
30
28
|
results = detect(text, model=model, k=k)
|
31
29
|
|
@@ -76,7 +76,6 @@ def _validate_and_post_process_helper(
|
|
76
76
|
result.keywords = None
|
77
77
|
|
78
78
|
if config.auto_detect_language:
|
79
|
-
# Use provided config or create one with the model from ExtractionConfig
|
80
79
|
lang_config = config.language_detection_config
|
81
80
|
if lang_config is None:
|
82
81
|
from kreuzberg._types import LanguageDetectionConfig # noqa: PLC0415
|
@@ -5,7 +5,7 @@ requires = [ "hatchling" ]
|
|
5
5
|
|
6
6
|
[project]
|
7
7
|
name = "kreuzberg"
|
8
|
-
version = "3.17.
|
8
|
+
version = "3.17.3"
|
9
9
|
description = "Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats"
|
10
10
|
readme = "README.md"
|
11
11
|
keywords = [
|
@@ -60,7 +60,7 @@ dependencies = [
|
|
60
60
|
"anyio>=4.10.0",
|
61
61
|
"chardetng-py>=0.3.5",
|
62
62
|
"exceptiongroup>=1.2.2; python_version<'3.11'",
|
63
|
-
"html-to-markdown[lxml]>=1.
|
63
|
+
"html-to-markdown[lxml]>=1.14.0",
|
64
64
|
"langcodes>=3.5.0",
|
65
65
|
"mcp>=1.14.1",
|
66
66
|
"msgspec>=0.18.0",
|
@@ -111,7 +111,6 @@ scripts.kreuzberg-mcp = "kreuzberg._mcp.server:main"
|
|
111
111
|
dev = [
|
112
112
|
"covdefaults>=2.3.0",
|
113
113
|
"mypy>=1.18.2",
|
114
|
-
"pre-commit>=4.3.0",
|
115
114
|
"pytest>=8.4.2",
|
116
115
|
"pytest-cov>=7.0.0",
|
117
116
|
"pytest-mock>=3.15.1",
|
@@ -11,7 +11,7 @@ from kreuzberg._entity_extraction import (
|
|
11
11
|
extract_keywords,
|
12
12
|
)
|
13
13
|
from kreuzberg._types import SpacyEntityExtractionConfig
|
14
|
-
from kreuzberg.exceptions import MissingDependencyError
|
14
|
+
from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
|
15
15
|
|
16
16
|
|
17
17
|
def test_extract_entities_with_custom_patterns_only() -> None:
|
@@ -58,11 +58,11 @@ def test_extract_entities_spacy_load_fails() -> None:
|
|
58
58
|
|
59
59
|
with (
|
60
60
|
patch("kreuzberg._entity_extraction._select_spacy_model", return_value="en_core_web_sm"),
|
61
|
-
patch("kreuzberg._entity_extraction._load_spacy_model"
|
61
|
+
patch("kreuzberg._entity_extraction._load_spacy_model") as mock_load,
|
62
62
|
):
|
63
|
-
|
64
|
-
|
65
|
-
|
63
|
+
mock_load.side_effect = KreuzbergError("Model download failed")
|
64
|
+
with pytest.raises(KreuzbergError, match="Model download failed"):
|
65
|
+
extract_entities(text)
|
66
66
|
|
67
67
|
|
68
68
|
def test_extract_entities_with_spacy_success() -> None:
|
@@ -216,26 +216,82 @@ def test_load_spacy_model_with_cache_dir() -> None:
|
|
216
216
|
os.environ.update(original_env)
|
217
217
|
|
218
218
|
|
219
|
-
def
|
219
|
+
def test_load_spacy_model_auto_download_success() -> None:
|
220
220
|
config = SpacyEntityExtractionConfig()
|
221
221
|
|
222
222
|
mock_spacy = Mock()
|
223
|
-
|
223
|
+
mock_nlp = Mock()
|
224
|
+
mock_spacy.load.side_effect = [OSError("Model not found"), mock_nlp]
|
224
225
|
|
225
|
-
|
226
|
+
mock_subprocess = Mock()
|
227
|
+
mock_subprocess.run.return_value.returncode = 0
|
228
|
+
|
229
|
+
with (
|
230
|
+
patch.dict("sys.modules", {"spacy": mock_spacy}),
|
231
|
+
patch("subprocess.run", mock_subprocess.run),
|
232
|
+
):
|
226
233
|
_load_spacy_model.cache_clear()
|
227
234
|
result = _load_spacy_model("en_core_web_sm", config)
|
228
235
|
|
229
|
-
assert result
|
236
|
+
assert result == mock_nlp
|
237
|
+
assert mock_nlp.max_length == config.max_doc_length
|
238
|
+
assert mock_spacy.load.call_count == 2
|
239
|
+
mock_subprocess.run.assert_called_once()
|
230
240
|
|
241
|
+
call_args = mock_subprocess.run.call_args[0][0]
|
242
|
+
assert "-m" in call_args
|
243
|
+
assert "spacy" in call_args
|
244
|
+
assert "download" in call_args
|
245
|
+
assert "en_core_web_sm" in call_args
|
231
246
|
|
232
|
-
|
247
|
+
|
248
|
+
def test_load_spacy_model_download_then_load_failure() -> None:
|
233
249
|
config = SpacyEntityExtractionConfig()
|
234
250
|
|
235
251
|
mock_spacy = Mock()
|
236
|
-
mock_spacy.load.side_effect =
|
252
|
+
mock_spacy.load.side_effect = [OSError("Model not found"), OSError("Load failed")]
|
237
253
|
|
238
|
-
|
254
|
+
mock_subprocess = Mock()
|
255
|
+
mock_subprocess.run.return_value.returncode = 0
|
256
|
+
|
257
|
+
with (
|
258
|
+
patch.dict("sys.modules", {"spacy": mock_spacy}),
|
259
|
+
patch("subprocess.run", mock_subprocess.run),
|
260
|
+
):
|
261
|
+
_load_spacy_model.cache_clear()
|
262
|
+
with pytest.raises(KreuzbergError, match="Failed to load spaCy model"):
|
263
|
+
_load_spacy_model("en_core_web_sm", config)
|
264
|
+
|
265
|
+
assert mock_spacy.load.call_count == 2
|
266
|
+
mock_subprocess.run.assert_called_once()
|
267
|
+
|
268
|
+
|
269
|
+
def test_load_spacy_model_auto_download_failure() -> None:
|
270
|
+
config = SpacyEntityExtractionConfig()
|
271
|
+
|
272
|
+
mock_spacy = Mock()
|
273
|
+
mock_spacy.load.side_effect = OSError("Model not found")
|
274
|
+
|
275
|
+
mock_subprocess = Mock()
|
276
|
+
mock_subprocess.run.return_value.returncode = 1
|
277
|
+
mock_subprocess.run.return_value.stderr = "Download error"
|
278
|
+
|
279
|
+
with (
|
280
|
+
patch.dict("sys.modules", {"spacy": mock_spacy}),
|
281
|
+
patch("subprocess.run", mock_subprocess.run),
|
282
|
+
):
|
283
|
+
_load_spacy_model.cache_clear()
|
284
|
+
with pytest.raises(KreuzbergError, match="Failed to download spaCy model"):
|
285
|
+
_load_spacy_model("en_core_web_sm", config)
|
286
|
+
|
287
|
+
mock_spacy.load.assert_called_once_with("en_core_web_sm")
|
288
|
+
mock_subprocess.run.assert_called_once()
|
289
|
+
|
290
|
+
|
291
|
+
def test_load_spacy_model_import_error() -> None:
|
292
|
+
config = SpacyEntityExtractionConfig()
|
293
|
+
|
294
|
+
with patch.dict("sys.modules", {"spacy": None}):
|
239
295
|
_load_spacy_model.cache_clear()
|
240
296
|
result = _load_spacy_model("en_core_web_sm", config)
|
241
297
|
|
@@ -23,7 +23,6 @@ def clear_language_detection_cache() -> Generator[None, None, None]:
|
|
23
23
|
def test_detect_languages_when_library_missing() -> None:
|
24
24
|
text = "This is some English text."
|
25
25
|
|
26
|
-
# Mock the import statement inside the function
|
27
26
|
with patch.dict("sys.modules", {"fast_langdetect": None}):
|
28
27
|
with pytest.raises(MissingDependencyError) as exc_info:
|
29
28
|
detect_languages(text)
|
@@ -224,7 +223,6 @@ def test_detect_languages_cache_different_configs() -> None:
|
|
224
223
|
mock_detect.assert_any_call(text, model="auto", k=2)
|
225
224
|
|
226
225
|
|
227
|
-
# Real integration tests without mocks
|
228
226
|
def test_detect_languages_real_single_language() -> None:
|
229
227
|
text = "This is definitely an English text with multiple sentences. It should be detected as English."
|
230
228
|
result = detect_languages(text)
|
@@ -235,24 +233,20 @@ def test_detect_languages_real_single_language() -> None:
|
|
235
233
|
|
236
234
|
|
237
235
|
def test_detect_languages_real_multilingual() -> None:
|
238
|
-
# Text with mixed languages
|
239
236
|
text = "Hello world. Bonjour le monde. Hola mundo. Ciao mondo."
|
240
237
|
config = LanguageDetectionConfig(multilingual=True, top_k=4)
|
241
238
|
result = detect_languages(text, config)
|
242
239
|
|
243
240
|
assert result is not None
|
244
241
|
assert len(result) >= 1
|
245
|
-
# The exact languages detected may vary, but we should get at least one
|
246
242
|
assert all(isinstance(lang, str) for lang in result)
|
247
|
-
assert all(len(lang) == 2 for lang in result)
|
243
|
+
assert all(len(lang) == 2 for lang in result)
|
248
244
|
|
249
245
|
|
250
246
|
def test_detect_languages_real_empty_text() -> None:
|
251
247
|
text = ""
|
252
248
|
result = detect_languages(text)
|
253
249
|
|
254
|
-
# Empty text should return None or raise an exception (caught and returns None)
|
255
|
-
# Note: fast_langdetect may return a default language for empty text
|
256
250
|
assert result is None or (isinstance(result, list) and len(result) <= 1)
|
257
251
|
|
258
252
|
|
@@ -272,7 +266,6 @@ def test_detect_languages_real_french_text() -> None:
|
|
272
266
|
|
273
267
|
assert result is not None
|
274
268
|
assert len(result) == 1
|
275
|
-
# Note: Model accuracy may vary, checking that we get a valid language code
|
276
269
|
assert isinstance(result[0], str)
|
277
270
|
assert len(result[0]) == 2
|
278
271
|
|
@@ -283,7 +276,6 @@ def test_detect_languages_real_german_text() -> None:
|
|
283
276
|
|
284
277
|
assert result is not None
|
285
278
|
assert len(result) == 1
|
286
|
-
# Note: Model accuracy may vary, checking that we get a valid language code
|
287
279
|
assert isinstance(result[0], str)
|
288
280
|
assert len(result[0]) == 2
|
289
281
|
|
@@ -294,19 +286,16 @@ def test_detect_languages_real_spanish_text() -> None:
|
|
294
286
|
|
295
287
|
assert result is not None
|
296
288
|
assert len(result) == 1
|
297
|
-
# Note: Model accuracy may vary, checking that we get a valid language code
|
298
289
|
assert isinstance(result[0], str)
|
299
290
|
assert len(result[0]) == 2
|
300
291
|
|
301
292
|
|
302
293
|
def test_detect_languages_real_mixed_languages_with_top_k() -> None:
|
303
|
-
# Text with multiple languages - should detect top languages
|
304
294
|
text = "English text. Texte français. Deutscher Text. Texto español."
|
305
295
|
config = LanguageDetectionConfig(multilingual=True, top_k=2)
|
306
296
|
result = detect_languages(text, config)
|
307
297
|
|
308
298
|
assert result is not None
|
309
|
-
# Should detect at least 1, up to 2 languages
|
310
299
|
assert 1 <= len(result) <= 2
|
311
300
|
assert all(isinstance(lang, str) for lang in result)
|
312
301
|
assert all(len(lang) == 2 for lang in result)
|