kreuzberg 3.17.0__tar.gz → 3.17.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/.pre-commit-config.yaml +1 -1
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/PKG-INFO +3 -3
- kreuzberg-3.17.1/kreuzberg/_language_detection.py +37 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_types.py +15 -6
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/extraction.py +8 -1
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/pyproject.toml +5 -5
- kreuzberg-3.17.1/tests/features/language_detection_test.py +354 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/uv.lock +176 -147
- kreuzberg-3.17.0/kreuzberg/_language_detection.py +0 -60
- kreuzberg-3.17.0/tests/features/language_detection_test.py +0 -387
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/.commitlintrc +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/.deepsource.toml +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/.docker/Dockerfile +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/.docker/README.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/.dockerignore +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/.github/workflows/docker-e2e-tests.yml +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/.github/workflows/publish-docker.yml +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/.github/workflows/test-docker-builds.yml +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/.gitignore +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/.markdownlint.yaml +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/.prettierignore +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/ATTRIBUTIONS.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/LICENSE +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/README.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/Taskfile.yml +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/ai-rulez.yaml +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/benchmarks/README.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/benchmarks/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/benchmarks/batch_size_benchmark.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/benchmarks/batch_validation_benchmark.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/benchmarks/py.typed +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/benchmarks/src/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/benchmarks/src/__main__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/benchmarks/src/benchmarks.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/benchmarks/src/cli.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/benchmarks/src/models.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/benchmarks/src/profiler.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/benchmarks/src/runner.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/benchmarks/token_reduction_compression_benchmark.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/advanced/index.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/assets/logo.png +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/cli.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/contributing.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/css/extra.css +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/examples/index.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/index.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/user-guide/document-classification.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/docs/user-guide/token-reduction.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_api/_config_cache.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_config.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_document_classification.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_entity_extraction.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_email.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_pdf.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_structured.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_gmft.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_mcp/server.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_ocr/_table_extractor.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/_reducer.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/_stopwords.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/af_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/ar_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/bg_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/bn_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/br_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/ca_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/cs_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/da_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/de_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/el_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/en_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/eo_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/es_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/et_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/eu_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/fa_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/fi_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/fr_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/ga_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/gl_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/gu_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/ha_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/he_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/hi_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/hr_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/hu_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/hy_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/id_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/it_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/ja_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/kn_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/ko_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/ku_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/la_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/lt_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/lv_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/ml_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/mr_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/ms_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/ne_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/nl_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/no_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/pl_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/pt_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/ro_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/ru_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/si_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/sk_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/sl_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/so_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/st_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/sv_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/sw_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/ta_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/te_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/th_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/tl_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/tr_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/uk_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/ur_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/vi_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/yo_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/zh_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_token_reduction/stopwords/zu_stopwords.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_html_streaming.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_image_preprocessing.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_ocr_cache.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_ref.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_resource_managers.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/mkdocs.yaml +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/api/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/api/config_cache_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/api/conftest.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/api/header_config_hashing_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/api/image_extraction_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/api/main_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/api/runtime_config_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/conftest.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/core/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/core/comprehensive_config_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/core/config_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/core/constants_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/core/dpi_configuration_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/core/exceptions_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/core/extraction_batch_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/core/extraction_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/core/html_to_markdown_config_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/core/image_ocr_result_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/core/init_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/core/main_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/core/mime_types_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/core/registry_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/core/types_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/e2e/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/e2e/docker_e2e.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/README_image_tests.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/base_extractor_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/base_memory_limits_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/base_ocr_processing_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/base_ocr_simple_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/email_error_paths_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/email_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/html_invalid_base64_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/image_deduplication_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/image_error_handling_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/image_error_simple_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/json_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/pdf_images_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/pdf_sync_images_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/pdf_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/spreadsheet_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/extractors/structured_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/features/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/features/chunker_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/features/document_classification_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/features/entity_extraction_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/features/gmft_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/features/hooks_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/features/table_extraction_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/features/token_reduction_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/integration/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/integration/all_extractors_images_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/integration/api/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/integration/api/large_file_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/integration/api/mounted_config_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/integration/dpi_integration_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/integration/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/integration/ocr/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/integration/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/integration/pandoc_images_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/integration/pdf_images_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/integration/pdf_real_images_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/integration/pptx_complex_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/integration/pptx_images_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/integration/regression_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/integration/token_reduction_integration_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/interfaces/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/interfaces/cli_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/interfaces/mcp_server_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/mcp/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/mcp/mcp_server_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/multiprocessing/gmft_isolated_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/ocr/tesseract_tsv_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/performance/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/performance/large_pdf_perf_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/flower-no-text.jpg +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/google-doc-document.pdf +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/json/complex_nested.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/aws_policy.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/earthquakes.geojson +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/github_emojis.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/iss_location.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/openapi_spec.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/package.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/rick_morty_character.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/json/schema_test.json +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/sharable-web-guide.pdf +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/tables/borderless_table.png +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/tables/complex_document.png +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/tables/simple_table.png +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/test-excel.xls +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/utils/ocr_cache_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/utils/playa_helpers_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/utils/playa_metadata_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/utils/playa_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/utils/quality_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/utils/ref_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.17.0 → kreuzberg-3.17.1}/tests/utils/tmp_test.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.17.0
|
3
|
+
Version: 3.17.1
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -33,12 +33,12 @@ Requires-Dist: chardetng-py>=0.3.5
|
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
34
|
Requires-Dist: html-to-markdown[lxml]>=1.13.0
|
35
35
|
Requires-Dist: langcodes>=3.5.0
|
36
|
-
Requires-Dist: mcp>=1.14.0
|
36
|
+
Requires-Dist: mcp>=1.14.1
|
37
37
|
Requires-Dist: msgspec>=0.18.0
|
38
38
|
Requires-Dist: numpy>=2.0.0
|
39
39
|
Requires-Dist: playa-pdf>=0.7.0
|
40
40
|
Requires-Dist: polars>=1.33.1
|
41
|
-
Requires-Dist: psutil>=7.0.0
|
41
|
+
Requires-Dist: psutil>=7.1.0
|
42
42
|
Requires-Dist: pypdfium2==4.30.0
|
43
43
|
Requires-Dist: python-calamine>=0.5.3
|
44
44
|
Requires-Dist: python-pptx>=1.0.2
|
@@ -0,0 +1,37 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from functools import lru_cache
|
4
|
+
|
5
|
+
from kreuzberg._types import LanguageDetectionConfig
|
6
|
+
from kreuzberg.exceptions import MissingDependencyError
|
7
|
+
|
8
|
+
_CACHE_SIZE = 128
|
9
|
+
|
10
|
+
|
11
|
+
@lru_cache(maxsize=_CACHE_SIZE)
|
12
|
+
def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -> list[str] | None:
|
13
|
+
try:
|
14
|
+
from fast_langdetect import detect # noqa: PLC0415
|
15
|
+
except ImportError as e:
|
16
|
+
raise MissingDependencyError.create_for_package(
|
17
|
+
dependency_group="langdetect",
|
18
|
+
functionality="language detection",
|
19
|
+
package_name="fast-langdetect",
|
20
|
+
) from e
|
21
|
+
|
22
|
+
if config is None:
|
23
|
+
config = LanguageDetectionConfig()
|
24
|
+
|
25
|
+
try:
|
26
|
+
# detect always returns a list, use k parameter for multiple languages
|
27
|
+
k = config.top_k if config.multilingual else 1
|
28
|
+
# Use the model from config directly
|
29
|
+
model = config.model
|
30
|
+
results = detect(text, model=model, k=k)
|
31
|
+
|
32
|
+
if results:
|
33
|
+
langs = [result["lang"].lower() for result in results if result.get("lang")]
|
34
|
+
return langs if langs else None
|
35
|
+
return None
|
36
|
+
except Exception: # noqa: BLE001
|
37
|
+
return None
|
@@ -402,9 +402,12 @@ class ImageOCRConfig(ConfigDict):
|
|
402
402
|
|
403
403
|
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
404
404
|
class LanguageDetectionConfig(ConfigDict):
|
405
|
-
|
406
|
-
"""
|
407
|
-
|
405
|
+
model: Literal["lite", "full", "auto"] = "auto"
|
406
|
+
"""Language detection model to use:
|
407
|
+
- 'lite': Smaller, faster model with good accuracy
|
408
|
+
- 'full': Larger model with highest accuracy
|
409
|
+
- 'auto': Automatically choose based on memory availability (default)
|
410
|
+
"""
|
408
411
|
top_k: int = 3
|
409
412
|
"""Maximum number of languages to return for multilingual detection."""
|
410
413
|
multilingual: bool = False
|
@@ -412,8 +415,8 @@ class LanguageDetectionConfig(ConfigDict):
|
|
412
415
|
If False, uses single language detection."""
|
413
416
|
cache_dir: str | None = None
|
414
417
|
"""Custom directory for model cache. If None, uses system default."""
|
415
|
-
|
416
|
-
"""
|
418
|
+
low_memory: bool = True
|
419
|
+
"""Deprecated. Use 'model' parameter instead. If True, uses 'lite' model."""
|
417
420
|
|
418
421
|
|
419
422
|
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
@@ -983,8 +986,14 @@ class ExtractionConfig(ConfigDict):
|
|
983
986
|
"""Custom entity patterns as a frozenset of (entity_type, regex_pattern) tuples."""
|
984
987
|
auto_detect_language: bool = False
|
985
988
|
"""Whether to automatically detect language and configure OCR accordingly."""
|
989
|
+
language_detection_model: Literal["lite", "full", "auto"] = "auto"
|
990
|
+
"""Language detection model to use when auto_detect_language is True.
|
991
|
+
- 'lite': Smaller, faster model with good accuracy
|
992
|
+
- 'full': Larger model with highest accuracy
|
993
|
+
- 'auto': Automatically choose based on memory availability (default)
|
994
|
+
"""
|
986
995
|
language_detection_config: LanguageDetectionConfig | None = None
|
987
|
-
"""Configuration for language detection. If None, uses default settings."""
|
996
|
+
"""Configuration for language detection. If None, uses default settings with language_detection_model."""
|
988
997
|
spacy_entity_extraction_config: SpacyEntityExtractionConfig | None = None
|
989
998
|
"""Configuration for spaCy entity extraction. If None, uses default settings."""
|
990
999
|
auto_detect_document_type: bool = False
|
@@ -76,9 +76,16 @@ def _validate_and_post_process_helper(
|
|
76
76
|
result.keywords = None
|
77
77
|
|
78
78
|
if config.auto_detect_language:
|
79
|
+
# Use provided config or create one with the model from ExtractionConfig
|
80
|
+
lang_config = config.language_detection_config
|
81
|
+
if lang_config is None:
|
82
|
+
from kreuzberg._types import LanguageDetectionConfig # noqa: PLC0415
|
83
|
+
|
84
|
+
lang_config = LanguageDetectionConfig(model=config.language_detection_model)
|
85
|
+
|
79
86
|
result.detected_languages = detect_languages(
|
80
87
|
result.content,
|
81
|
-
config=config.language_detection_config,
|
88
|
+
config=lang_config,
|
82
89
|
)
|
83
90
|
|
84
91
|
if config.auto_detect_document_type:
|
@@ -5,7 +5,7 @@ requires = [ "hatchling" ]
|
|
5
5
|
|
6
6
|
[project]
|
7
7
|
name = "kreuzberg"
|
8
|
-
version = "3.17.0"
|
8
|
+
version = "3.17.1"
|
9
9
|
description = "Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats"
|
10
10
|
readme = "README.md"
|
11
11
|
keywords = [
|
@@ -62,12 +62,12 @@ dependencies = [
|
|
62
62
|
"exceptiongroup>=1.2.2; python_version<'3.11'",
|
63
63
|
"html-to-markdown[lxml]>=1.13.0",
|
64
64
|
"langcodes>=3.5.0",
|
65
|
-
"mcp>=1.14.0",
|
65
|
+
"mcp>=1.14.1",
|
66
66
|
"msgspec>=0.18.0",
|
67
67
|
"numpy>=2.0.0",
|
68
68
|
"playa-pdf>=0.7.0",
|
69
69
|
"polars>=1.33.1",
|
70
|
-
"psutil>=7.0.0",
|
70
|
+
"psutil>=7.1.0",
|
71
71
|
"pypdfium2==4.30.0", # pinned due to bug in 4.30.1, until v5 is stable
|
72
72
|
"python-calamine>=0.5.3",
|
73
73
|
"python-pptx>=1.0.2",
|
@@ -110,7 +110,7 @@ scripts.kreuzberg-mcp = "kreuzberg._mcp.server:main"
|
|
110
110
|
[dependency-groups]
|
111
111
|
dev = [
|
112
112
|
"covdefaults>=2.3.0",
|
113
|
-
"mypy>=1.18.1",
|
113
|
+
"mypy>=1.18.2",
|
114
114
|
"pre-commit>=4.3.0",
|
115
115
|
"pytest>=8.4.2",
|
116
116
|
"pytest-cov>=7.0.0",
|
@@ -118,7 +118,7 @@ dev = [
|
|
118
118
|
"pytest-rerunfailures>=16.0.1",
|
119
119
|
"pytest-timeout>=2.4.0",
|
120
120
|
"rich>=14.1.0",
|
121
|
-
"ruff>=0.13.0",
|
121
|
+
"ruff>=0.13.1",
|
122
122
|
"tabulate>=0.9.0",
|
123
123
|
"trio>=0.31.0",
|
124
124
|
"uv-bump",
|
@@ -0,0 +1,354 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import TYPE_CHECKING
|
4
|
+
from unittest.mock import Mock, patch
|
5
|
+
|
6
|
+
import pytest
|
7
|
+
|
8
|
+
from kreuzberg._language_detection import detect_languages
|
9
|
+
from kreuzberg._types import LanguageDetectionConfig
|
10
|
+
from kreuzberg.exceptions import MissingDependencyError
|
11
|
+
|
12
|
+
if TYPE_CHECKING:
|
13
|
+
from collections.abc import Generator
|
14
|
+
|
15
|
+
|
16
|
+
@pytest.fixture(autouse=True)
|
17
|
+
def clear_language_detection_cache() -> Generator[None, None, None]:
|
18
|
+
detect_languages.cache_clear()
|
19
|
+
yield
|
20
|
+
detect_languages.cache_clear()
|
21
|
+
|
22
|
+
|
23
|
+
def test_detect_languages_when_library_missing() -> None:
|
24
|
+
text = "This is some English text."
|
25
|
+
|
26
|
+
# Mock the import statement inside the function
|
27
|
+
with patch.dict("sys.modules", {"fast_langdetect": None}):
|
28
|
+
with pytest.raises(MissingDependencyError) as exc_info:
|
29
|
+
detect_languages(text)
|
30
|
+
|
31
|
+
error = exc_info.value
|
32
|
+
assert "fast-langdetect" in str(error)
|
33
|
+
assert "language detection" in str(error)
|
34
|
+
|
35
|
+
|
36
|
+
def test_detect_languages_single_language_success() -> None:
|
37
|
+
text = "This is some English text."
|
38
|
+
mock_detect_result = [{"lang": "EN", "score": 0.99}]
|
39
|
+
|
40
|
+
mock_detect = Mock(return_value=mock_detect_result)
|
41
|
+
|
42
|
+
with patch("fast_langdetect.detect", mock_detect):
|
43
|
+
result = detect_languages(text)
|
44
|
+
|
45
|
+
assert result == ["en"]
|
46
|
+
mock_detect.assert_called_once_with(text, model="auto", k=1)
|
47
|
+
|
48
|
+
|
49
|
+
def test_detect_languages_single_language_no_lang_key() -> None:
|
50
|
+
text = "This is some text."
|
51
|
+
mock_detect_result = [{"score": 0.50}]
|
52
|
+
|
53
|
+
mock_detect = Mock(return_value=mock_detect_result)
|
54
|
+
|
55
|
+
with patch("fast_langdetect.detect", mock_detect):
|
56
|
+
result = detect_languages(text)
|
57
|
+
|
58
|
+
assert result is None
|
59
|
+
mock_detect.assert_called_once_with(text, model="auto", k=1)
|
60
|
+
|
61
|
+
|
62
|
+
def test_detect_languages_single_language_empty_lang() -> None:
|
63
|
+
text = "This is some text."
|
64
|
+
mock_detect_result = [{"lang": "", "score": 0.50}]
|
65
|
+
|
66
|
+
mock_detect = Mock(return_value=mock_detect_result)
|
67
|
+
|
68
|
+
with patch("fast_langdetect.detect", mock_detect):
|
69
|
+
result = detect_languages(text)
|
70
|
+
|
71
|
+
assert result is None
|
72
|
+
mock_detect.assert_called_once_with(text, model="auto", k=1)
|
73
|
+
|
74
|
+
|
75
|
+
def test_detect_languages_single_language_none_result() -> None:
|
76
|
+
text = "This is some text."
|
77
|
+
mock_detect = Mock(return_value=None)
|
78
|
+
|
79
|
+
with patch("fast_langdetect.detect", mock_detect):
|
80
|
+
result = detect_languages(text)
|
81
|
+
|
82
|
+
assert result is None
|
83
|
+
mock_detect.assert_called_once_with(text, model="auto", k=1)
|
84
|
+
|
85
|
+
|
86
|
+
def test_detect_languages_multilingual_success() -> None:
|
87
|
+
text = "Hello world. Bonjour le monde."
|
88
|
+
config = LanguageDetectionConfig(multilingual=True, top_k=3)
|
89
|
+
|
90
|
+
mock_multilingual_results = [
|
91
|
+
{"lang": "EN", "score": 0.8},
|
92
|
+
{"lang": "FR", "score": 0.7},
|
93
|
+
{"lang": "ES", "score": 0.1},
|
94
|
+
]
|
95
|
+
|
96
|
+
mock_detect = Mock(return_value=mock_multilingual_results)
|
97
|
+
|
98
|
+
with patch("fast_langdetect.detect", mock_detect):
|
99
|
+
result = detect_languages(text, config)
|
100
|
+
|
101
|
+
assert result == ["en", "fr", "es"]
|
102
|
+
mock_detect.assert_called_once_with(text, model="auto", k=3)
|
103
|
+
|
104
|
+
|
105
|
+
def test_detect_languages_multilingual_with_top_k() -> None:
|
106
|
+
text = "Hello world. Bonjour le monde."
|
107
|
+
config = LanguageDetectionConfig(multilingual=True, top_k=2)
|
108
|
+
|
109
|
+
mock_multilingual_results = [{"lang": "EN", "score": 0.8}, {"lang": "FR", "score": 0.7}]
|
110
|
+
|
111
|
+
mock_detect = Mock(return_value=mock_multilingual_results)
|
112
|
+
|
113
|
+
with patch("fast_langdetect.detect", mock_detect):
|
114
|
+
result = detect_languages(text, config)
|
115
|
+
|
116
|
+
assert result == ["en", "fr"]
|
117
|
+
mock_detect.assert_called_once_with(text, model="auto", k=2)
|
118
|
+
|
119
|
+
|
120
|
+
def test_detect_languages_multilingual_results_missing_lang() -> None:
|
121
|
+
text = "Mixed language text."
|
122
|
+
config = LanguageDetectionConfig(multilingual=True)
|
123
|
+
|
124
|
+
mock_multilingual_results = [
|
125
|
+
{"lang": "EN", "score": 0.8},
|
126
|
+
{"score": 0.6},
|
127
|
+
{"lang": "", "score": 0.4},
|
128
|
+
{"lang": "FR", "score": 0.3},
|
129
|
+
]
|
130
|
+
|
131
|
+
mock_detect = Mock(return_value=mock_multilingual_results)
|
132
|
+
|
133
|
+
with patch("fast_langdetect.detect", mock_detect):
|
134
|
+
result = detect_languages(text, config)
|
135
|
+
|
136
|
+
assert result == ["en", "fr"]
|
137
|
+
|
138
|
+
|
139
|
+
def test_detect_languages_with_default_config() -> None:
|
140
|
+
text = "This is English text."
|
141
|
+
mock_detect_result = [{"lang": "EN", "score": 0.95}]
|
142
|
+
|
143
|
+
mock_detect = Mock(return_value=mock_detect_result)
|
144
|
+
|
145
|
+
with patch("fast_langdetect.detect", mock_detect):
|
146
|
+
result = detect_languages(text, config=None)
|
147
|
+
|
148
|
+
assert result == ["en"]
|
149
|
+
mock_detect.assert_called_once_with(text, model="auto", k=1)
|
150
|
+
|
151
|
+
|
152
|
+
def test_detect_languages_single_language_with_config() -> None:
|
153
|
+
text = "This is English text."
|
154
|
+
config = LanguageDetectionConfig(multilingual=False)
|
155
|
+
mock_detect_result = [{"lang": "EN", "score": 0.95}]
|
156
|
+
|
157
|
+
mock_detect = Mock(return_value=mock_detect_result)
|
158
|
+
|
159
|
+
with patch("fast_langdetect.detect", mock_detect):
|
160
|
+
result = detect_languages(text, config)
|
161
|
+
|
162
|
+
assert result == ["en"]
|
163
|
+
mock_detect.assert_called_once_with(text, model="auto", k=1)
|
164
|
+
|
165
|
+
|
166
|
+
def test_detect_languages_exception_handling() -> None:
|
167
|
+
text = "This is some text."
|
168
|
+
|
169
|
+
mock_detect = Mock(side_effect=RuntimeError("Detection failed"))
|
170
|
+
|
171
|
+
with patch("fast_langdetect.detect", mock_detect):
|
172
|
+
result = detect_languages(text)
|
173
|
+
|
174
|
+
assert result is None
|
175
|
+
mock_detect.assert_called_once_with(text, model="auto", k=1)
|
176
|
+
|
177
|
+
|
178
|
+
def test_detect_languages_multilingual_exception_handling() -> None:
|
179
|
+
text = "Mixed language text."
|
180
|
+
config = LanguageDetectionConfig(multilingual=True)
|
181
|
+
|
182
|
+
mock_detect = Mock(side_effect=ValueError("Multilingual detection failed"))
|
183
|
+
|
184
|
+
with patch("fast_langdetect.detect", mock_detect):
|
185
|
+
result = detect_languages(text, config)
|
186
|
+
|
187
|
+
assert result is None
|
188
|
+
mock_detect.assert_called_once_with(text, model="auto", k=3)
|
189
|
+
|
190
|
+
|
191
|
+
def test_detect_languages_caching_behavior() -> None:
|
192
|
+
text = "This is English text."
|
193
|
+
mock_detect_result = [{"lang": "EN", "score": 0.95}]
|
194
|
+
|
195
|
+
mock_detect = Mock(return_value=mock_detect_result)
|
196
|
+
|
197
|
+
with patch("fast_langdetect.detect", mock_detect):
|
198
|
+
config = LanguageDetectionConfig()
|
199
|
+
result1 = detect_languages(text, config)
|
200
|
+
result2 = detect_languages(text, config)
|
201
|
+
|
202
|
+
assert result1 == ["en"]
|
203
|
+
assert result2 == ["en"]
|
204
|
+
mock_detect.assert_called_once_with(text, model="auto", k=1)
|
205
|
+
|
206
|
+
|
207
|
+
def test_detect_languages_cache_different_configs() -> None:
|
208
|
+
text = "This is English text."
|
209
|
+
mock_detect_result = [{"lang": "EN", "score": 0.95}]
|
210
|
+
|
211
|
+
mock_detect = Mock(return_value=mock_detect_result)
|
212
|
+
|
213
|
+
with patch("fast_langdetect.detect", mock_detect):
|
214
|
+
config1 = LanguageDetectionConfig(multilingual=False)
|
215
|
+
config2 = LanguageDetectionConfig(multilingual=True, top_k=2)
|
216
|
+
|
217
|
+
result1 = detect_languages(text, config1)
|
218
|
+
result2 = detect_languages(text, config2)
|
219
|
+
|
220
|
+
assert result1 == ["en"]
|
221
|
+
assert result2 == ["en"]
|
222
|
+
assert mock_detect.call_count == 2
|
223
|
+
mock_detect.assert_any_call(text, model="auto", k=1)
|
224
|
+
mock_detect.assert_any_call(text, model="auto", k=2)
|
225
|
+
|
226
|
+
|
227
|
+
# Real integration tests without mocks
|
228
|
+
def test_detect_languages_real_single_language() -> None:
|
229
|
+
text = "This is definitely an English text with multiple sentences. It should be detected as English."
|
230
|
+
result = detect_languages(text)
|
231
|
+
|
232
|
+
assert result is not None
|
233
|
+
assert len(result) == 1
|
234
|
+
assert result[0] == "en"
|
235
|
+
|
236
|
+
|
237
|
+
def test_detect_languages_real_multilingual() -> None:
|
238
|
+
# Text with mixed languages
|
239
|
+
text = "Hello world. Bonjour le monde. Hola mundo. Ciao mondo."
|
240
|
+
config = LanguageDetectionConfig(multilingual=True, top_k=4)
|
241
|
+
result = detect_languages(text, config)
|
242
|
+
|
243
|
+
assert result is not None
|
244
|
+
assert len(result) >= 1
|
245
|
+
# The exact languages detected may vary, but we should get at least one
|
246
|
+
assert all(isinstance(lang, str) for lang in result)
|
247
|
+
assert all(len(lang) == 2 for lang in result) # Language codes should be 2 chars
|
248
|
+
|
249
|
+
|
250
|
+
def test_detect_languages_real_empty_text() -> None:
|
251
|
+
text = ""
|
252
|
+
result = detect_languages(text)
|
253
|
+
|
254
|
+
# Empty text should return None or raise an exception (caught and returns None)
|
255
|
+
# Note: fast_langdetect may return a default language for empty text
|
256
|
+
assert result is None or (isinstance(result, list) and len(result) <= 1)
|
257
|
+
|
258
|
+
|
259
|
+
def test_detect_languages_real_with_config() -> None:
|
260
|
+
text = "This is an English sentence."
|
261
|
+
config = LanguageDetectionConfig(multilingual=False)
|
262
|
+
result = detect_languages(text, config)
|
263
|
+
|
264
|
+
assert result is not None
|
265
|
+
assert len(result) == 1
|
266
|
+
assert result[0] == "en"
|
267
|
+
|
268
|
+
|
269
|
+
def test_detect_languages_real_french_text() -> None:
|
270
|
+
text = "Ceci est un texte en français. Il devrait être détecté comme français."
|
271
|
+
result = detect_languages(text)
|
272
|
+
|
273
|
+
assert result is not None
|
274
|
+
assert len(result) == 1
|
275
|
+
# Note: Model accuracy may vary, checking that we get a valid language code
|
276
|
+
assert isinstance(result[0], str)
|
277
|
+
assert len(result[0]) == 2
|
278
|
+
|
279
|
+
|
280
|
+
def test_detect_languages_real_german_text() -> None:
|
281
|
+
text = "Dies ist ein deutscher Text. Es sollte als Deutsch erkannt werden."
|
282
|
+
result = detect_languages(text)
|
283
|
+
|
284
|
+
assert result is not None
|
285
|
+
assert len(result) == 1
|
286
|
+
# Note: Model accuracy may vary, checking that we get a valid language code
|
287
|
+
assert isinstance(result[0], str)
|
288
|
+
assert len(result[0]) == 2
|
289
|
+
|
290
|
+
|
291
|
+
def test_detect_languages_real_spanish_text() -> None:
|
292
|
+
text = "Este es un texto en español. Debería ser detectado como español."
|
293
|
+
result = detect_languages(text)
|
294
|
+
|
295
|
+
assert result is not None
|
296
|
+
assert len(result) == 1
|
297
|
+
# Note: Model accuracy may vary, checking that we get a valid language code
|
298
|
+
assert isinstance(result[0], str)
|
299
|
+
assert len(result[0]) == 2
|
300
|
+
|
301
|
+
|
302
|
+
def test_detect_languages_real_mixed_languages_with_top_k() -> None:
|
303
|
+
# Text with multiple languages - should detect top languages
|
304
|
+
text = "English text. Texte français. Deutscher Text. Texto español."
|
305
|
+
config = LanguageDetectionConfig(multilingual=True, top_k=2)
|
306
|
+
result = detect_languages(text, config)
|
307
|
+
|
308
|
+
assert result is not None
|
309
|
+
# Should detect at least 1, up to 2 languages
|
310
|
+
assert 1 <= len(result) <= 2
|
311
|
+
assert all(isinstance(lang, str) for lang in result)
|
312
|
+
assert all(len(lang) == 2 for lang in result)
|
313
|
+
|
314
|
+
|
315
|
+
def test_detect_languages_with_lite_model() -> None:
|
316
|
+
text = "This is English text."
|
317
|
+
config = LanguageDetectionConfig(model="lite")
|
318
|
+
mock_detect_result = [{"lang": "EN", "score": 0.95}]
|
319
|
+
|
320
|
+
mock_detect = Mock(return_value=mock_detect_result)
|
321
|
+
|
322
|
+
with patch("fast_langdetect.detect", mock_detect):
|
323
|
+
result = detect_languages(text, config)
|
324
|
+
|
325
|
+
assert result == ["en"]
|
326
|
+
mock_detect.assert_called_once_with(text, model="lite", k=1)
|
327
|
+
|
328
|
+
|
329
|
+
def test_detect_languages_with_full_model() -> None:
|
330
|
+
text = "This is English text."
|
331
|
+
config = LanguageDetectionConfig(model="full")
|
332
|
+
mock_detect_result = [{"lang": "EN", "score": 0.95}]
|
333
|
+
|
334
|
+
mock_detect = Mock(return_value=mock_detect_result)
|
335
|
+
|
336
|
+
with patch("fast_langdetect.detect", mock_detect):
|
337
|
+
result = detect_languages(text, config)
|
338
|
+
|
339
|
+
assert result == ["en"]
|
340
|
+
mock_detect.assert_called_once_with(text, model="full", k=1)
|
341
|
+
|
342
|
+
|
343
|
+
def test_detect_languages_with_auto_model() -> None:
|
344
|
+
text = "This is English text."
|
345
|
+
config = LanguageDetectionConfig(model="auto")
|
346
|
+
mock_detect_result = [{"lang": "EN", "score": 0.95}]
|
347
|
+
|
348
|
+
mock_detect = Mock(return_value=mock_detect_result)
|
349
|
+
|
350
|
+
with patch("fast_langdetect.detect", mock_detect):
|
351
|
+
result = detect_languages(text, config)
|
352
|
+
|
353
|
+
assert result == ["en"]
|
354
|
+
mock_detect.assert_called_once_with(text, model="auto", k=1)
|