kreuzberg 3.17.3__tar.gz → 3.18.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.pre-commit-config.yaml +1 -1
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/PKG-INFO +4 -4
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/api-server.md +32 -1
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_api/main.py +43 -3
- kreuzberg-3.18.0/kreuzberg/_entity_extraction.py +244 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/pyproject.toml +5 -5
- kreuzberg-3.18.0/tests/api/environment_config_test.py +154 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/pdf_test.py +74 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/features/entity_extraction_test.py +32 -157
- kreuzberg-3.18.0/tests/test_source_files/image-only-german-pdf.pdf +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/uv.lock +165 -141
- kreuzberg-3.17.3/kreuzberg/_entity_extraction.py +0 -150
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.commitlintrc +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.deepsource.toml +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.docker/Dockerfile +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.docker/README.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.dockerignore +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.github/workflows/ci.yaml +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.github/workflows/docker-e2e-tests.yml +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.github/workflows/publish-docker.yml +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.github/workflows/test-docker-builds.yml +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.gitignore +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.markdownlint.yaml +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/.prettierignore +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/ATTRIBUTIONS.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/LICENSE +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/README.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/Taskfile.yml +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/ai-rulez.yaml +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/README.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/batch_size_benchmark.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/batch_validation_benchmark.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/py.typed +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/src/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/src/__main__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/src/benchmarks.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/src/cli.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/src/models.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/src/profiler.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/src/runner.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/benchmarks/token_reduction_compression_benchmark.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/advanced/index.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/api-reference/types.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/assets/logo.png +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/cli.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/contributing.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/css/extra.css +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/examples/index.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/index.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/document-classification.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/docs/user-guide/token-reduction.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_api/_config_cache.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_config.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_document_classification.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/_email.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/_pdf.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_extractors/_structured.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_gmft.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_mcp/server.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_ocr/_table_extractor.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/_reducer.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/_stopwords.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/af_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ar_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/bg_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/bn_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/br_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ca_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/cs_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/da_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/de_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/el_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/en_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/eo_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/es_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/et_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/eu_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/fa_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/fi_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/fr_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ga_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/gl_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/gu_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ha_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/he_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/hi_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/hr_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/hu_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/hy_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/id_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/it_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ja_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/kn_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ko_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ku_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/la_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/lt_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/lv_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ml_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/mr_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ms_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ne_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/nl_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/no_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/pl_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/pt_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ro_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ru_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/si_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/sk_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/sl_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/so_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/st_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/sv_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/sw_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ta_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/te_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/th_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/tl_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/tr_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/uk_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/ur_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/vi_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/yo_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/zh_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_token_reduction/stopwords/zu_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_types.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_html_streaming.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_image_preprocessing.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_ocr_cache.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_ref.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_resource_managers.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/mkdocs.yaml +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/api/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/api/config_cache_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/api/conftest.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/api/header_config_hashing_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/api/image_extraction_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/api/main_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/api/runtime_config_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/conftest.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/comprehensive_config_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/config_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/constants_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/dpi_configuration_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/exceptions_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/extraction_batch_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/extraction_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/html_to_markdown_config_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/image_ocr_result_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/init_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/main_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/mime_types_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/registry_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/core/types_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/e2e/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/e2e/docker_e2e.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/README_image_tests.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/base_extractor_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/base_memory_limits_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/base_ocr_processing_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/base_ocr_simple_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/email_error_paths_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/email_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/html_invalid_base64_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/image_deduplication_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/image_error_handling_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/image_error_simple_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/json_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/pdf_images_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/pdf_sync_images_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/spreadsheet_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/extractors/structured_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/features/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/features/chunker_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/features/document_classification_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/features/gmft_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/features/hooks_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/features/language_detection_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/features/table_extraction_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/features/token_reduction_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/all_extractors_images_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/api/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/api/large_file_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/api/mounted_config_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/dpi_integration_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/ocr/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/pandoc_images_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/pdf_images_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/pdf_real_images_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/pptx_complex_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/pptx_images_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/regression_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/integration/token_reduction_integration_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/interfaces/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/interfaces/cli_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/interfaces/mcp_server_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/mcp/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/mcp/mcp_server_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/multiprocessing/gmft_isolated_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/ocr/tesseract_tsv_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/performance/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/performance/large_pdf_perf_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/flower-no-text.jpg +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/google-doc-document.pdf +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/complex_nested.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/aws_policy.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/earthquakes.geojson +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/github_emojis.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/iss_location.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/openapi_spec.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/package.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/real_world/rick_morty_character.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/json/schema_test.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/sharable-web-guide.pdf +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/tables/borderless_table.png +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/tables/complex_document.png +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/tables/simple_table.png +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/test-excel.xls +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/ocr_cache_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/playa_helpers_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/playa_metadata_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/playa_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/quality_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/ref_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.18.0}/tests/utils/tmp_test.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.17.3
|
3
|
+
Version: 3.18.0
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -28,12 +28,12 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
28
28
|
Classifier: Topic :: Text Processing :: General
|
29
29
|
Classifier: Typing :: Typed
|
30
30
|
Requires-Python: >=3.10
|
31
|
-
Requires-Dist: anyio>=4.
|
31
|
+
Requires-Dist: anyio>=4.11.0
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
|
-
Requires-Dist: html-to-markdown[lxml]>=1.
|
34
|
+
Requires-Dist: html-to-markdown[lxml]>=1.16.0
|
35
35
|
Requires-Dist: langcodes>=3.5.0
|
36
|
-
Requires-Dist: mcp>=1.
|
36
|
+
Requires-Dist: mcp>=1.15.0
|
37
37
|
Requires-Dist: msgspec>=0.18.0
|
38
38
|
Requires-Dist: numpy>=2.0.0
|
39
39
|
Requires-Dist: playa-pdf>=0.7.0
|
@@ -62,7 +62,7 @@ Extract text from one or more files.
|
|
62
62
|
- Method: `POST`
|
63
63
|
- Content-Type: `multipart/form-data`
|
64
64
|
- Body: One or more files with field name `data`
|
65
|
-
- **Maximum file size: 1GB per file**
|
65
|
+
- **Maximum file size: Configurable via `KREUZBERG_MAX_UPLOAD_SIZE` environment variable (default: 1GB per file)**
|
66
66
|
|
67
67
|
**Response:**
|
68
68
|
|
@@ -463,6 +463,37 @@ The API server uses the default Kreuzberg extraction configuration:
|
|
463
463
|
- PDF, image, and document extraction is supported
|
464
464
|
- Table extraction with GMFT (if installed)
|
465
465
|
|
466
|
+
### Environment Variables
|
467
|
+
|
468
|
+
The API server can be configured using environment variables for production deployments:
|
469
|
+
|
470
|
+
#### Server Configuration
|
471
|
+
|
472
|
+
| Variable | Description | Default | Example |
|
473
|
+
| -------------------------------- | ---------------------------- | ------------------ | ------------------ |
|
474
|
+
| `KREUZBERG_MAX_UPLOAD_SIZE` | Maximum upload size in bytes | `1073741824` (1GB) | `2147483648` (2GB) |
|
475
|
+
| `KREUZBERG_ENABLE_OPENTELEMETRY` | Enable OpenTelemetry tracing | `true` | `false` |
|
476
|
+
|
477
|
+
#### Usage Examples
|
478
|
+
|
479
|
+
```bash
|
480
|
+
# Set 2GB upload limit
|
481
|
+
export KREUZBERG_MAX_UPLOAD_SIZE=2147483648
|
482
|
+
litestar --app kreuzberg._api.main:app run
|
483
|
+
|
484
|
+
# Disable telemetry
|
485
|
+
export KREUZBERG_ENABLE_OPENTELEMETRY=false
|
486
|
+
uvicorn kreuzberg._api.main:app --host 0.0.0.0 --port 8000
|
487
|
+
|
488
|
+
# Production settings with Docker
|
489
|
+
docker run -p 8000:8000 \
|
490
|
+
-e KREUZBERG_MAX_UPLOAD_SIZE=5368709120 \
|
491
|
+
-e KREUZBERG_ENABLE_OPENTELEMETRY=true \
|
492
|
+
goldziher/kreuzberg:latest
|
493
|
+
```
|
494
|
+
|
495
|
+
**Note**: Boolean environment variables accept `true`/`false`, `1`/`0`, `yes`/`no`, or `on`/`off` values (case-insensitive). Any other value, including an empty string, is treated as `false`.
|
496
|
+
|
466
497
|
To use custom configuration, modify the extraction call in your own API wrapper:
|
467
498
|
|
468
499
|
```python
|
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import base64
|
4
4
|
import io
|
5
|
+
import os
|
5
6
|
import traceback
|
6
7
|
from json import dumps
|
7
8
|
from typing import TYPE_CHECKING, Annotated, Any, Literal
|
@@ -100,6 +101,36 @@ def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError
|
|
100
101
|
)
|
101
102
|
|
102
103
|
|
104
|
+
def _get_max_upload_size() -> int:
|
105
|
+
"""Get the maximum upload size from environment variable.
|
106
|
+
|
107
|
+
Returns:
|
108
|
+
Maximum upload size in bytes. Defaults to 1GB if not set.
|
109
|
+
|
110
|
+
Environment Variables:
|
111
|
+
KREUZBERG_MAX_UPLOAD_SIZE: Maximum upload size in bytes (default: 1073741824 = 1GB)
|
112
|
+
"""
|
113
|
+
default_size = 1024 * 1024 * 1024 # 1GB
|
114
|
+
try:
|
115
|
+
size = int(os.environ.get("KREUZBERG_MAX_UPLOAD_SIZE", default_size))
|
116
|
+
# Return default if negative
|
117
|
+
return size if size >= 0 else default_size
|
118
|
+
except ValueError:
|
119
|
+
return default_size
|
120
|
+
|
121
|
+
|
122
|
+
def _is_opentelemetry_enabled() -> bool:
|
123
|
+
"""Check if OpenTelemetry should be enabled.
|
124
|
+
|
125
|
+
Returns:
|
126
|
+
True if OpenTelemetry should be enabled, False otherwise.
|
127
|
+
|
128
|
+
Environment Variables:
|
129
|
+
KREUZBERG_ENABLE_OPENTELEMETRY: Enable OpenTelemetry tracing (true/false) (default: true)
|
130
|
+
"""
|
131
|
+
return os.environ.get("KREUZBERG_ENABLE_OPENTELEMETRY", "true").lower() in ("true", "1", "yes", "on")
|
132
|
+
|
133
|
+
|
103
134
|
def general_exception_handler(request: Request[Any, Any, Any], exception: Exception) -> Response[Any]:
|
104
135
|
error_type = type(exception).__name__
|
105
136
|
error_message = str(exception)
|
@@ -242,7 +273,7 @@ async def handle_files_upload( # noqa: PLR0913
|
|
242
273
|
- Language detection (if enabled)
|
243
274
|
|
244
275
|
Supports various file formats including PDF, Office documents, images, and more.
|
245
|
-
Maximum file size: 1GB per file.
|
276
|
+
Maximum file size: Configurable via KREUZBERG_MAX_UPLOAD_SIZE environment variable (default: 1GB per file).
|
246
277
|
|
247
278
|
Args:
|
248
279
|
request: The HTTP request object
|
@@ -379,9 +410,18 @@ type_encoders = {
|
|
379
410
|
Image.Image: _pil_image_encoder,
|
380
411
|
}
|
381
412
|
|
413
|
+
|
414
|
+
def _get_plugins() -> list[Any]:
|
415
|
+
"""Get configured plugins based on environment variables."""
|
416
|
+
plugins = []
|
417
|
+
if _is_opentelemetry_enabled():
|
418
|
+
plugins.append(OpenTelemetryPlugin(OpenTelemetryConfig()))
|
419
|
+
return plugins
|
420
|
+
|
421
|
+
|
382
422
|
app = Litestar(
|
383
423
|
route_handlers=[handle_files_upload, health_check, get_configuration],
|
384
|
-
plugins=
|
424
|
+
plugins=_get_plugins(),
|
385
425
|
logging_config=StructLoggingConfig(),
|
386
426
|
openapi_config=openapi_config,
|
387
427
|
exception_handlers={
|
@@ -389,5 +429,5 @@ app = Litestar(
|
|
389
429
|
Exception: general_exception_handler,
|
390
430
|
},
|
391
431
|
type_encoders=type_encoders,
|
392
|
-
request_max_body_size=
|
432
|
+
request_max_body_size=_get_max_upload_size(),
|
393
433
|
)
|
@@ -0,0 +1,244 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import os
|
4
|
+
import re
|
5
|
+
import shutil
|
6
|
+
import subprocess
|
7
|
+
from functools import lru_cache
|
8
|
+
from itertools import chain
|
9
|
+
from typing import TYPE_CHECKING, Any
|
10
|
+
|
11
|
+
import anyio
|
12
|
+
|
13
|
+
from kreuzberg._types import Entity, SpacyEntityExtractionConfig
|
14
|
+
from kreuzberg._utils._sync import run_sync
|
15
|
+
from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
|
16
|
+
|
17
|
+
if TYPE_CHECKING:
|
18
|
+
from collections.abc import Sequence
|
19
|
+
|
20
|
+
|
21
|
+
def is_uv_available() -> bool:
|
22
|
+
"""Check if uv is available in the environment."""
|
23
|
+
return shutil.which("uv") is not None
|
24
|
+
|
25
|
+
|
26
|
+
def get_spacy_model_url(model_name: str, version: str = "3.8.0") -> str:
|
27
|
+
"""Get the direct download URL for a spaCy model.
|
28
|
+
|
29
|
+
Args:
|
30
|
+
model_name: Name of the spaCy model (e.g., 'en_core_web_sm')
|
31
|
+
version: Model version to download (default: 3.8.0)
|
32
|
+
|
33
|
+
Returns:
|
34
|
+
Direct download URL for the model
|
35
|
+
"""
|
36
|
+
return f"https://github.com/explosion/spacy-models/releases/download/{model_name}-{version}/{model_name}-{version}-py3-none-any.whl"
|
37
|
+
|
38
|
+
|
39
|
+
async def install_spacy_model_with_uv(model_name: str) -> subprocess.CompletedProcess[str]:
|
40
|
+
"""Install spaCy model using uv.
|
41
|
+
|
42
|
+
Args:
|
43
|
+
model_name: Name of the spaCy model to install
|
44
|
+
|
45
|
+
Returns:
|
46
|
+
Completed process result
|
47
|
+
"""
|
48
|
+
model_url = get_spacy_model_url(model_name)
|
49
|
+
return await run_sync(
|
50
|
+
subprocess.run,
|
51
|
+
["uv", "pip", "install", model_url],
|
52
|
+
capture_output=True,
|
53
|
+
text=True,
|
54
|
+
check=False,
|
55
|
+
)
|
56
|
+
|
57
|
+
|
58
|
+
async def install_spacy_model_with_spacy(model_name: str) -> bool:
|
59
|
+
"""Install spaCy model using spacy download function.
|
60
|
+
|
61
|
+
Args:
|
62
|
+
model_name: Name of the spaCy model to install
|
63
|
+
|
64
|
+
Returns:
|
65
|
+
True if successful, False otherwise
|
66
|
+
"""
|
67
|
+
try:
|
68
|
+
import spacy.cli.download # noqa: PLC0415
|
69
|
+
|
70
|
+
await run_sync(spacy.cli.download, model_name) # type: ignore[attr-defined]
|
71
|
+
return True
|
72
|
+
except (ImportError, OSError, RuntimeError):
|
73
|
+
return False
|
74
|
+
|
75
|
+
|
76
|
+
def extract_entities(
|
77
|
+
text: str,
|
78
|
+
entity_types: Sequence[str] = ("PERSON", "ORGANIZATION", "LOCATION", "DATE", "EMAIL", "PHONE"),
|
79
|
+
custom_patterns: frozenset[tuple[str, str]] | None = None,
|
80
|
+
languages: list[str] | None = None,
|
81
|
+
spacy_config: SpacyEntityExtractionConfig | None = None,
|
82
|
+
) -> list[Entity]:
|
83
|
+
entities: list[Entity] = []
|
84
|
+
if custom_patterns:
|
85
|
+
entities.extend(
|
86
|
+
chain.from_iterable(
|
87
|
+
(
|
88
|
+
Entity(type=ent_type, text=match.group(), start=match.start(), end=match.end())
|
89
|
+
for match in re.finditer(pattern, text)
|
90
|
+
)
|
91
|
+
for ent_type, pattern in custom_patterns
|
92
|
+
)
|
93
|
+
)
|
94
|
+
|
95
|
+
if spacy_config is None:
|
96
|
+
spacy_config = SpacyEntityExtractionConfig()
|
97
|
+
|
98
|
+
try:
|
99
|
+
import spacy # noqa: F401, PLC0415
|
100
|
+
except ImportError as e: # pragma: no cover
|
101
|
+
raise MissingDependencyError.create_for_package(
|
102
|
+
package_name="spacy",
|
103
|
+
dependency_group="entity-extraction",
|
104
|
+
functionality="Entity Extraction",
|
105
|
+
) from e
|
106
|
+
|
107
|
+
model_name = select_spacy_model(languages, spacy_config)
|
108
|
+
if not model_name:
|
109
|
+
return entities
|
110
|
+
|
111
|
+
nlp = load_spacy_model(model_name, spacy_config)
|
112
|
+
|
113
|
+
if len(text) > spacy_config.max_doc_length:
|
114
|
+
text = text[: spacy_config.max_doc_length]
|
115
|
+
|
116
|
+
doc = nlp(text)
|
117
|
+
|
118
|
+
entity_type_mapping = {etype.upper() for etype in entity_types}
|
119
|
+
|
120
|
+
entities.extend(
|
121
|
+
Entity(
|
122
|
+
type=ent.label_,
|
123
|
+
text=ent.text,
|
124
|
+
start=ent.start_char,
|
125
|
+
end=ent.end_char,
|
126
|
+
)
|
127
|
+
for ent in doc.ents
|
128
|
+
if ent.label_ in entity_type_mapping or ent.label_.upper() in entity_type_mapping
|
129
|
+
)
|
130
|
+
|
131
|
+
return entities
|
132
|
+
|
133
|
+
|
134
|
+
@lru_cache(maxsize=32)
|
135
|
+
def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
|
136
|
+
try:
|
137
|
+
import spacy # noqa: PLC0415
|
138
|
+
except ImportError:
|
139
|
+
return None
|
140
|
+
|
141
|
+
if spacy_config.model_cache_dir:
|
142
|
+
os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
|
143
|
+
|
144
|
+
try:
|
145
|
+
nlp = spacy.load(model_name)
|
146
|
+
except OSError:
|
147
|
+
# Try to download the model automatically
|
148
|
+
async def install_model() -> tuple[bool, str | None]:
|
149
|
+
"""Install model and return success status and error message."""
|
150
|
+
# First try spaCy's built-in download
|
151
|
+
try:
|
152
|
+
success = await install_spacy_model_with_spacy(model_name)
|
153
|
+
if success:
|
154
|
+
return True, None
|
155
|
+
except (ImportError, OSError, RuntimeError) as e:
|
156
|
+
spacy_error = str(e)
|
157
|
+
else:
|
158
|
+
spacy_error = "spaCy download failed"
|
159
|
+
|
160
|
+
# If spaCy download failed and uv is available, try uv as fallback
|
161
|
+
if is_uv_available():
|
162
|
+
try:
|
163
|
+
result = await install_spacy_model_with_uv(model_name)
|
164
|
+
return result.returncode == 0, result.stderr
|
165
|
+
except (OSError, subprocess.SubprocessError) as e:
|
166
|
+
return False, f"spaCy: {spacy_error}, uv: {e!s}"
|
167
|
+
|
168
|
+
return False, spacy_error
|
169
|
+
|
170
|
+
# Run the async installation in a sync context
|
171
|
+
try:
|
172
|
+
success, error_details = anyio.run(install_model)
|
173
|
+
except (OSError, RuntimeError) as e:
|
174
|
+
success, error_details = False, str(e)
|
175
|
+
|
176
|
+
if not success:
|
177
|
+
# Generate appropriate error message based on available tools
|
178
|
+
if is_uv_available():
|
179
|
+
model_url = get_spacy_model_url(model_name)
|
180
|
+
manual_install_cmd = f"uv pip install {model_url}"
|
181
|
+
else:
|
182
|
+
manual_install_cmd = f"python -m spacy download {model_name}"
|
183
|
+
|
184
|
+
error_msg = (
|
185
|
+
f"Failed to download spaCy model '{model_name}'. Please install it manually with: {manual_install_cmd}"
|
186
|
+
)
|
187
|
+
|
188
|
+
if error_details:
|
189
|
+
error_msg += f"\nError details: {error_details}"
|
190
|
+
|
191
|
+
raise KreuzbergError(
|
192
|
+
error_msg,
|
193
|
+
context={
|
194
|
+
"model": model_name,
|
195
|
+
"manual_install_cmd": manual_install_cmd,
|
196
|
+
"error_details": error_details,
|
197
|
+
"uv_available": is_uv_available(),
|
198
|
+
},
|
199
|
+
) from None
|
200
|
+
|
201
|
+
try:
|
202
|
+
nlp = spacy.load(model_name)
|
203
|
+
except OSError as e:
|
204
|
+
raise KreuzbergError(
|
205
|
+
f"Failed to load spaCy model '{model_name}' even after successful download. "
|
206
|
+
f"Please verify your spaCy installation and try reinstalling the model.",
|
207
|
+
context={"model": model_name, "error": str(e)},
|
208
|
+
) from e
|
209
|
+
|
210
|
+
nlp.max_length = spacy_config.max_doc_length
|
211
|
+
|
212
|
+
return nlp
|
213
|
+
|
214
|
+
|
215
|
+
def select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
|
216
|
+
if not languages:
|
217
|
+
return spacy_config.get_model_for_language("en")
|
218
|
+
|
219
|
+
for lang in languages:
|
220
|
+
model_name = spacy_config.get_model_for_language(lang)
|
221
|
+
if model_name:
|
222
|
+
return model_name
|
223
|
+
|
224
|
+
return spacy_config.get_fallback_model()
|
225
|
+
|
226
|
+
|
227
|
+
def extract_keywords(
|
228
|
+
text: str,
|
229
|
+
keyword_count: int = 10,
|
230
|
+
) -> list[tuple[str, float]]:
|
231
|
+
try:
|
232
|
+
from keybert import KeyBERT # noqa: PLC0415
|
233
|
+
|
234
|
+
kw_model = KeyBERT()
|
235
|
+
keywords = kw_model.extract_keywords(text, top_n=keyword_count)
|
236
|
+
return [(kw, float(score)) for kw, score in keywords]
|
237
|
+
except (RuntimeError, OSError, ValueError):
|
238
|
+
return []
|
239
|
+
except ImportError as e: # pragma: no cover
|
240
|
+
raise MissingDependencyError.create_for_package(
|
241
|
+
package_name="keybert",
|
242
|
+
dependency_group="entity-extraction",
|
243
|
+
functionality="Keyword Extraction",
|
244
|
+
) from e
|
@@ -5,7 +5,7 @@ requires = [ "hatchling" ]
|
|
5
5
|
|
6
6
|
[project]
|
7
7
|
name = "kreuzberg"
|
8
|
-
version = "3.17.3"
|
8
|
+
version = "3.18.0"
|
9
9
|
description = "Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats"
|
10
10
|
readme = "README.md"
|
11
11
|
keywords = [
|
@@ -57,12 +57,12 @@ classifiers = [
|
|
57
57
|
]
|
58
58
|
|
59
59
|
dependencies = [
|
60
|
-
"anyio>=4.
|
60
|
+
"anyio>=4.11.0",
|
61
61
|
"chardetng-py>=0.3.5",
|
62
62
|
"exceptiongroup>=1.2.2; python_version<'3.11'",
|
63
|
-
"html-to-markdown[lxml]>=1.
|
63
|
+
"html-to-markdown[lxml]>=1.16.0",
|
64
64
|
"langcodes>=3.5.0",
|
65
|
-
"mcp>=1.
|
65
|
+
"mcp>=1.15.0",
|
66
66
|
"msgspec>=0.18.0",
|
67
67
|
"numpy>=2.0.0",
|
68
68
|
"playa-pdf>=0.7.0",
|
@@ -117,7 +117,7 @@ dev = [
|
|
117
117
|
"pytest-rerunfailures>=16.0.1",
|
118
118
|
"pytest-timeout>=2.4.0",
|
119
119
|
"rich>=14.1.0",
|
120
|
-
"ruff>=0.13.
|
120
|
+
"ruff>=0.13.2",
|
121
121
|
"tabulate>=0.9.0",
|
122
122
|
"trio>=0.31.0",
|
123
123
|
"uv-bump",
|
@@ -0,0 +1,154 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import os
|
4
|
+
from typing import TYPE_CHECKING, Any
|
5
|
+
from unittest.mock import patch
|
6
|
+
|
7
|
+
import pytest
|
8
|
+
|
9
|
+
if TYPE_CHECKING:
|
10
|
+
from litestar.testing import AsyncTestClient
|
11
|
+
|
12
|
+
|
13
|
+
def test_get_max_upload_size_default() -> None:
|
14
|
+
from kreuzberg._api.main import _get_max_upload_size
|
15
|
+
|
16
|
+
with patch.dict(os.environ, {}, clear=True):
|
17
|
+
assert _get_max_upload_size() == 1024 * 1024 * 1024 # 1GB
|
18
|
+
|
19
|
+
|
20
|
+
def test_get_max_upload_size_custom() -> None:
|
21
|
+
from kreuzberg._api.main import _get_max_upload_size
|
22
|
+
|
23
|
+
custom_size = 2 * 1024 * 1024 * 1024 # 2GB
|
24
|
+
with patch.dict(os.environ, {"KREUZBERG_MAX_UPLOAD_SIZE": str(custom_size)}):
|
25
|
+
assert _get_max_upload_size() == custom_size
|
26
|
+
|
27
|
+
|
28
|
+
def test_get_max_upload_size_invalid_value() -> None:
|
29
|
+
from kreuzberg._api.main import _get_max_upload_size
|
30
|
+
|
31
|
+
with patch.dict(os.environ, {"KREUZBERG_MAX_UPLOAD_SIZE": "invalid"}):
|
32
|
+
assert _get_max_upload_size() == 1024 * 1024 * 1024 # Falls back to default
|
33
|
+
|
34
|
+
|
35
|
+
def test_is_opentelemetry_enabled_default() -> None:
|
36
|
+
from kreuzberg._api.main import _is_opentelemetry_enabled
|
37
|
+
|
38
|
+
with patch.dict(os.environ, {}, clear=True):
|
39
|
+
assert _is_opentelemetry_enabled() is True
|
40
|
+
|
41
|
+
|
42
|
+
def test_is_opentelemetry_enabled_false() -> None:
|
43
|
+
from kreuzberg._api.main import _is_opentelemetry_enabled
|
44
|
+
|
45
|
+
test_cases = ["false", "False", "FALSE", "0", "no", "No", "off", "Off"]
|
46
|
+
for value in test_cases:
|
47
|
+
with patch.dict(os.environ, {"KREUZBERG_ENABLE_OPENTELEMETRY": value}):
|
48
|
+
assert _is_opentelemetry_enabled() is False, f"Failed for value: {value}"
|
49
|
+
|
50
|
+
|
51
|
+
def test_is_opentelemetry_enabled_true() -> None:
|
52
|
+
from kreuzberg._api.main import _is_opentelemetry_enabled
|
53
|
+
|
54
|
+
test_cases = ["true", "True", "TRUE", "1", "yes", "Yes", "on", "On"]
|
55
|
+
for value in test_cases:
|
56
|
+
with patch.dict(os.environ, {"KREUZBERG_ENABLE_OPENTELEMETRY": value}):
|
57
|
+
assert _is_opentelemetry_enabled() is True, f"Failed for value: {value}"
|
58
|
+
|
59
|
+
|
60
|
+
def test_get_plugins_with_opentelemetry_enabled() -> None:
|
61
|
+
from kreuzberg._api.main import _get_plugins
|
62
|
+
|
63
|
+
with patch.dict(os.environ, {"KREUZBERG_ENABLE_OPENTELEMETRY": "true"}):
|
64
|
+
plugins = _get_plugins()
|
65
|
+
assert len(plugins) == 1
|
66
|
+
assert type(plugins[0]).__name__ == "OpenTelemetryPlugin"
|
67
|
+
|
68
|
+
|
69
|
+
def test_get_plugins_with_opentelemetry_disabled() -> None:
|
70
|
+
from kreuzberg._api.main import _get_plugins
|
71
|
+
|
72
|
+
with patch.dict(os.environ, {"KREUZBERG_ENABLE_OPENTELEMETRY": "false"}):
|
73
|
+
plugins = _get_plugins()
|
74
|
+
assert len(plugins) == 0
|
75
|
+
|
76
|
+
|
77
|
+
@pytest.mark.anyio
|
78
|
+
async def test_app_configuration_with_custom_upload_size() -> None:
|
79
|
+
"""Test that _get_max_upload_size returns the size configured via KREUZBERG_MAX_UPLOAD_SIZE"""
|
80
|
+
from kreuzberg._api.main import _get_max_upload_size
|
81
|
+
|
82
|
+
custom_size = 512 * 1024 * 1024 # 512MB
|
83
|
+
|
84
|
+
with patch.dict(os.environ, {"KREUZBERG_MAX_UPLOAD_SIZE": str(custom_size)}):
|
85
|
+
assert _get_max_upload_size() == custom_size
|
86
|
+
|
87
|
+
|
88
|
+
@pytest.mark.anyio
|
89
|
+
async def test_large_file_upload_respected(test_client: AsyncTestClient[Any], tmp_path: Any) -> None:
|
90
|
+
"""Test that a 2MB file upload succeeds under the default 1GB upload limit"""
|
91
|
+
|
92
|
+
# Create a test file that would exceed a small upload limit
|
93
|
+
test_file = tmp_path / "large_test.txt"
|
94
|
+
large_content = "x" * (2 * 1024 * 1024) # 2MB content
|
95
|
+
test_file.write_text(large_content)
|
96
|
+
|
97
|
+
# Test with original app (should work with default 1GB limit)
|
98
|
+
with test_file.open("rb") as f:
|
99
|
+
response = await test_client.post("/extract", files=[("data", (test_file.name, f.read(), "text/plain"))])
|
100
|
+
|
101
|
+
# Should succeed with default 1GB limit
|
102
|
+
assert response.status_code == 201
|
103
|
+
|
104
|
+
|
105
|
+
def test_environment_variable_combinations() -> None:
|
106
|
+
"""Test various combinations of environment variables"""
|
107
|
+
from kreuzberg._api.main import _get_max_upload_size, _is_opentelemetry_enabled
|
108
|
+
|
109
|
+
test_env = {
|
110
|
+
"KREUZBERG_MAX_UPLOAD_SIZE": "5368709120", # 5GB
|
111
|
+
"KREUZBERG_ENABLE_OPENTELEMETRY": "false",
|
112
|
+
}
|
113
|
+
|
114
|
+
with patch.dict(os.environ, test_env):
|
115
|
+
assert _get_max_upload_size() == 5368709120
|
116
|
+
assert _is_opentelemetry_enabled() is False
|
117
|
+
|
118
|
+
|
119
|
+
def test_edge_cases_for_upload_size() -> None:
|
120
|
+
"""Test edge cases for upload size configuration"""
|
121
|
+
from kreuzberg._api.main import _get_max_upload_size
|
122
|
+
|
123
|
+
# Test zero
|
124
|
+
with patch.dict(os.environ, {"KREUZBERG_MAX_UPLOAD_SIZE": "0"}):
|
125
|
+
assert _get_max_upload_size() == 0
|
126
|
+
|
127
|
+
# Test very large number
|
128
|
+
large_size = str(10 * 1024 * 1024 * 1024) # 10GB
|
129
|
+
with patch.dict(os.environ, {"KREUZBERG_MAX_UPLOAD_SIZE": large_size}):
|
130
|
+
assert _get_max_upload_size() == int(large_size)
|
131
|
+
|
132
|
+
# Test negative number (should fall back to default)
|
133
|
+
with patch.dict(os.environ, {"KREUZBERG_MAX_UPLOAD_SIZE": "-1"}):
|
134
|
+
assert _get_max_upload_size() == 1024 * 1024 * 1024
|
135
|
+
|
136
|
+
|
137
|
+
def test_edge_cases_for_opentelemetry() -> None:
|
138
|
+
"""Test edge cases for OpenTelemetry boolean configuration"""
|
139
|
+
from kreuzberg._api.main import _is_opentelemetry_enabled
|
140
|
+
|
141
|
+
# Test empty string (set but not a recognized truthy value, so it evaluates to false)
|
142
|
+
with patch.dict(os.environ, {"KREUZBERG_ENABLE_OPENTELEMETRY": ""}):
|
143
|
+
assert _is_opentelemetry_enabled() is False
|
144
|
+
|
145
|
+
# Test random string (should default to false)
|
146
|
+
with patch.dict(os.environ, {"KREUZBERG_ENABLE_OPENTELEMETRY": "random"}):
|
147
|
+
assert _is_opentelemetry_enabled() is False
|
148
|
+
|
149
|
+
# Test numeric strings
|
150
|
+
with patch.dict(os.environ, {"KREUZBERG_ENABLE_OPENTELEMETRY": "2"}):
|
151
|
+
assert _is_opentelemetry_enabled() is False
|
152
|
+
|
153
|
+
with patch.dict(os.environ, {"KREUZBERG_ENABLE_OPENTELEMETRY": "1"}):
|
154
|
+
assert _is_opentelemetry_enabled() is True
|
@@ -903,3 +903,77 @@ async def test_pdf_extract_path_async_table_import_error(
|
|
903
903
|
|
904
904
|
assert result.content == "Text content"
|
905
905
|
assert result.tables == []
|
906
|
+
|
907
|
+
|
908
|
+
@pytest.fixture
|
909
|
+
def german_image_pdf() -> Path:
|
910
|
+
"""Path to German image-only PDF that previously caused EmptyHtmlError."""
|
911
|
+
return Path(__file__).parent.parent / "test_source_files" / "image-only-german-pdf.pdf"
|
912
|
+
|
913
|
+
|
914
|
+
@pytest.mark.anyio
|
915
|
+
async def test_extract_german_image_pdf_async_with_force_ocr(german_image_pdf: Path) -> None:
|
916
|
+
"""Test that German image-only PDF extracts successfully with force_ocr=True.
|
917
|
+
|
918
|
+
This test reproduces issue #149 where an image-only German PDF would fail
|
919
|
+
with EmptyHtmlError when using Tesseract OCR.
|
920
|
+
"""
|
921
|
+
from kreuzberg._types import PSMMode, TesseractConfig
|
922
|
+
|
923
|
+
config = ExtractionConfig(
|
924
|
+
force_ocr=True, ocr_backend="tesseract", ocr_config=TesseractConfig(language="deu", psm=PSMMode.SINGLE_BLOCK)
|
925
|
+
)
|
926
|
+
extractor = PDFExtractor(mime_type="application/pdf", config=config)
|
927
|
+
|
928
|
+
result = await extractor.extract_path_async(german_image_pdf)
|
929
|
+
|
930
|
+
# Should extract German text successfully
|
931
|
+
assert result.content.strip(), "Should extract text content from German image PDF"
|
932
|
+
assert result.mime_type == "text/plain"
|
933
|
+
assert len(result.content) > 50, "Should extract meaningful amount of text"
|
934
|
+
|
935
|
+
|
936
|
+
def test_extract_german_image_pdf_sync_with_force_ocr(german_image_pdf: Path) -> None:
|
937
|
+
"""Test that German image-only PDF extracts successfully with force_ocr=True (sync).
|
938
|
+
|
939
|
+
This test reproduces issue #149 where an image-only German PDF would fail
|
940
|
+
with EmptyHtmlError when using Tesseract OCR.
|
941
|
+
"""
|
942
|
+
from kreuzberg._types import PSMMode, TesseractConfig
|
943
|
+
|
944
|
+
config = ExtractionConfig(
|
945
|
+
force_ocr=True, ocr_backend="tesseract", ocr_config=TesseractConfig(language="deu", psm=PSMMode.SINGLE_BLOCK)
|
946
|
+
)
|
947
|
+
extractor = PDFExtractor(mime_type="application/pdf", config=config)
|
948
|
+
|
949
|
+
result = extractor.extract_path_sync(german_image_pdf)
|
950
|
+
|
951
|
+
# Should extract German text successfully
|
952
|
+
assert result.content.strip(), "Should extract text content from German image PDF"
|
953
|
+
assert result.mime_type == "text/plain"
|
954
|
+
assert len(result.content) > 50, "Should extract meaningful amount of text"
|
955
|
+
|
956
|
+
|
957
|
+
@pytest.mark.anyio
|
958
|
+
async def test_extract_german_image_pdf_async_default_config(german_image_pdf: Path) -> None:
|
959
|
+
"""Test that German image-only PDF extracts with default OCR config."""
|
960
|
+
config = ExtractionConfig(ocr_backend="tesseract")
|
961
|
+
extractor = PDFExtractor(mime_type="application/pdf", config=config)
|
962
|
+
|
963
|
+
result = await extractor.extract_path_async(german_image_pdf)
|
964
|
+
|
965
|
+
# Should extract some text content even with default config
|
966
|
+
assert result.content.strip(), "Should extract text content with default config"
|
967
|
+
assert result.mime_type == "text/plain"
|
968
|
+
|
969
|
+
|
970
|
+
def test_extract_german_image_pdf_sync_default_config(german_image_pdf: Path) -> None:
|
971
|
+
"""Test that German image-only PDF extracts with default OCR config (sync)."""
|
972
|
+
config = ExtractionConfig(ocr_backend="tesseract")
|
973
|
+
extractor = PDFExtractor(mime_type="application/pdf", config=config)
|
974
|
+
|
975
|
+
result = extractor.extract_path_sync(german_image_pdf)
|
976
|
+
|
977
|
+
# Should extract some text content even with default config
|
978
|
+
assert result.content.strip(), "Should extract text content with default config"
|
979
|
+
assert result.mime_type == "text/plain"
|