kreuzberg 3.16.0__tar.gz → 3.17.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.github/workflows/ci.yaml +42 -72
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.github/workflows/docker-e2e-tests.yml +4 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.github/workflows/docs.yml +1 -1
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.github/workflows/test-docker-builds.yml +4 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.pre-commit-config.yaml +4 -3
- kreuzberg-3.17.1/.prettierignore +1 -0
- kreuzberg-3.17.1/ATTRIBUTIONS.md +47 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/PKG-INFO +6 -5
- kreuzberg-3.17.1/benchmarks/token_reduction_compression_benchmark.py +268 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/api-reference/types.md +6 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/index.md +1 -0
- kreuzberg-3.17.1/docs/user-guide/token-reduction.md +251 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/__init__.py +2 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_config.py +8 -9
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_base.py +0 -46
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_html.py +1 -1
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_pandoc.py +2 -2
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_pdf.py +4 -4
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_gmft.py +2 -2
- kreuzberg-3.17.1/kreuzberg/_language_detection.py +37 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_mcp/server.py +1 -1
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_mime_types.py +1 -1
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_ocr/_easyocr.py +4 -9
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_ocr/_paddleocr.py +1 -1
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_ocr/_tesseract.py +15 -25
- kreuzberg-3.17.1/kreuzberg/_token_reduction/__init__.py +11 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/_reducer.py +439 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/_stopwords.py +116 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
- kreuzberg-3.17.1/kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_types.py +50 -9
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_image_preprocessing.py +1 -1
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_ref.py +14 -6
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/exceptions.py +0 -1
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/extraction.py +33 -10
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/pyproject.toml +10 -8
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/api/config_cache_test.py +3 -27
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/comprehensive_config_test.py +61 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/types_test.py +62 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/base_extractor_test.py +1 -1
- kreuzberg-3.17.1/tests/features/language_detection_test.py +354 -0
- kreuzberg-3.17.1/tests/features/token_reduction_test.py +813 -0
- kreuzberg-3.17.1/tests/integration/token_reduction_integration_test.py +173 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/ocr/tesseract_test.py +64 -0
- kreuzberg-3.17.1/tests/utils/playa_helpers_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/uv.lock +304 -255
- kreuzberg-3.16.0/docker-logs/docker-info.txt +0 -60
- kreuzberg-3.16.0/docker-logs/docker-version.txt +0 -27
- kreuzberg-3.16.0/kreuzberg/_language_detection.py +0 -60
- kreuzberg-3.16.0/tests/features/language_detection_test.py +0 -387
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.commitlintrc +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.deepsource.toml +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.docker/Dockerfile +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.docker/README.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.dockerignore +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.github/workflows/publish-docker.yml +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.gitignore +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/.markdownlint.yaml +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/LICENSE +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/README.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/Taskfile.yml +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/ai-rulez.yaml +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/README.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/batch_size_benchmark.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/batch_validation_benchmark.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/py.typed +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/src/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/src/__main__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/src/benchmarks.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/src/cli.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/src/models.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/src/profiler.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/benchmarks/src/runner.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/advanced/index.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/assets/logo.png +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/cli.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/contributing.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/css/extra.css +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/examples/extraction-examples.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/examples/index.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/index.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/document-classification.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/extraction-configuration.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/metadata-extraction.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/docs/user-guide/supported-formats.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_api/_config_cache.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_api/main.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_document_classification.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_entity_extraction.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_email.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_extractors/_structured.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_ocr/_table_extractor.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_html_streaming.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_ocr_cache.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_resource_managers.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/mkdocs.yaml +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/api/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/api/conftest.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/api/header_config_hashing_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/api/image_extraction_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/api/main_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/api/runtime_config_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/conftest.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/config_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/constants_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/dpi_configuration_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/exceptions_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/extraction_batch_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/extraction_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/html_to_markdown_config_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/image_ocr_result_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/init_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/main_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/mime_types_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/core/registry_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/e2e/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/e2e/docker_e2e.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/README_image_tests.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/base_memory_limits_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/base_ocr_processing_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/base_ocr_simple_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/email_error_paths_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/email_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/html_invalid_base64_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/image_deduplication_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/image_error_handling_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/image_error_simple_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/image_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/json_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/pandoc_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/pdf_images_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/pdf_sync_images_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/pdf_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/spreadsheet_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/extractors/structured_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/features/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/features/chunker_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/features/document_classification_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/features/entity_extraction_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/features/gmft_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/features/hooks_test.py +0 -0
- /kreuzberg-3.16.0/tests/integration/__init__.py → /kreuzberg-3.17.1/tests/features/table_extraction_test.py +0 -0
- {kreuzberg-3.16.0/tests/integration/api → kreuzberg-3.17.1/tests/integration}/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/all_extractors_images_test.py +0 -0
- {kreuzberg-3.16.0/tests/integration/multiprocessing → kreuzberg-3.17.1/tests/integration/api}/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/api/large_file_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/api/mounted_config_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/dpi_integration_test.py +0 -0
- {kreuzberg-3.16.0/tests/integration/ocr → kreuzberg-3.17.1/tests/integration/multiprocessing}/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.16.0/tests/interfaces → kreuzberg-3.17.1/tests/integration/ocr}/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/pandoc_images_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/pdf_images_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/pdf_real_images_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/pptx_complex_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/pptx_images_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/integration/regression_test.py +0 -0
- {kreuzberg-3.16.0/tests/mcp → kreuzberg-3.17.1/tests/interfaces}/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/interfaces/cli_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/interfaces/mcp_server_test.py +0 -0
- {kreuzberg-3.16.0/tests/multiprocessing → kreuzberg-3.17.1/tests/mcp}/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/mcp/mcp_server_test.py +0 -0
- {kreuzberg-3.16.0/tests/ocr → kreuzberg-3.17.1/tests/multiprocessing}/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/multiprocessing/gmft_isolated_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.16.0/tests/performance → kreuzberg-3.17.1/tests/ocr}/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/ocr/easyocr_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/ocr/paddleocr_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/ocr/tesseract_tsv_test.py +0 -0
- {kreuzberg-3.16.0/tests/utils → kreuzberg-3.17.1/tests/performance}/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/performance/large_pdf_perf_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/flower-no-text.jpg +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/google-doc-document.pdf +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/complex_nested.json +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/aws_policy.json +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/earthquakes.geojson +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/github_emojis.json +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/iss_location.json +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/openapi_spec.json +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/package.json +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/real_world/rick_morty_character.json +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/json/schema_test.json +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/sharable-web-guide.pdf +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/tables/borderless_table.png +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/tables/complex_document.png +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/tables/simple_table.png +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/test-excel.xls +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- /kreuzberg-3.16.0/tests/utils/playa_helpers_test.py → /kreuzberg-3.17.1/tests/utils/__init__.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/ocr_cache_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/playa_metadata_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/playa_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/process_pool_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/quality_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/ref_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/serialization_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.16.0 → kreuzberg-3.17.1}/tests/utils/tmp_test.py +0 -0
@@ -8,6 +8,10 @@ on:
|
|
8
8
|
branches:
|
9
9
|
- main
|
10
10
|
|
11
|
+
concurrency:
|
12
|
+
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
|
13
|
+
cancel-in-progress: true
|
14
|
+
|
11
15
|
jobs:
|
12
16
|
validate:
|
13
17
|
runs-on: ubuntu-latest
|
@@ -138,27 +142,7 @@ jobs:
|
|
138
142
|
needs: validate
|
139
143
|
if: github.event_name == 'pull_request' && needs.validate.result == 'success'
|
140
144
|
runs-on: ubuntu-latest
|
141
|
-
|
142
|
-
fail-fast: false
|
143
|
-
matrix:
|
144
|
-
test-category:
|
145
|
-
- name: "core"
|
146
|
-
path: "tests/core,tests/utils"
|
147
|
-
system-deps: false
|
148
|
-
timeout: 15
|
149
|
-
- name: "extractors"
|
150
|
-
path: "tests/extractors"
|
151
|
-
system-deps: true
|
152
|
-
timeout: 20
|
153
|
-
- name: "integration"
|
154
|
-
path: "tests/integration,tests/api"
|
155
|
-
system-deps: true
|
156
|
-
timeout: 25
|
157
|
-
- name: "features"
|
158
|
-
path: "tests/features,tests/interfaces,tests/mcp,tests/multiprocessing,tests/ocr"
|
159
|
-
system-deps: true
|
160
|
-
timeout: 20
|
161
|
-
timeout-minutes: ${{ matrix.test-category.timeout }}
|
145
|
+
timeout-minutes: 45
|
162
146
|
steps:
|
163
147
|
- name: Checkout
|
164
148
|
uses: actions/checkout@v5
|
@@ -170,36 +154,62 @@ jobs:
|
|
170
154
|
|
171
155
|
- name: Install Python
|
172
156
|
uses: actions/setup-python@v6
|
157
|
+
id: setup-python
|
173
158
|
with:
|
174
159
|
python-version: "3.13"
|
175
160
|
|
176
161
|
- name: Cache Python Dependencies
|
162
|
+
id: python-cache
|
177
163
|
uses: actions/cache@v4
|
178
164
|
with:
|
179
165
|
path: |
|
180
166
|
~/.cache/uv
|
181
167
|
.venv
|
182
|
-
key: python-dependencies-ubuntu-latest-3.13-${{
|
168
|
+
key: python-dependencies-ubuntu-latest-3.13-${{ hashFiles('uv.lock') }}
|
183
169
|
restore-keys: |
|
184
170
|
python-dependencies-ubuntu-latest-3.13-
|
185
171
|
|
186
172
|
- name: Install Dependencies
|
187
|
-
|
173
|
+
uses: nick-fields/retry@v3
|
174
|
+
with:
|
175
|
+
timeout_minutes: 5
|
176
|
+
max_attempts: 3
|
177
|
+
retry_wait_seconds: 30
|
178
|
+
command: |
|
179
|
+
uv sync --all-extras --dev
|
180
|
+
shell: bash
|
188
181
|
|
189
182
|
- name: Install System Dependencies
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
183
|
+
uses: nick-fields/retry@v3
|
184
|
+
with:
|
185
|
+
timeout_minutes: 5
|
186
|
+
max_attempts: 3
|
187
|
+
retry_wait_seconds: 30
|
188
|
+
command: |
|
189
|
+
sudo apt-get update
|
190
|
+
sudo apt-get install -y tesseract-ocr tesseract-ocr-deu pandoc
|
191
|
+
shell: bash
|
194
192
|
|
195
|
-
- name: Run Tests
|
196
|
-
|
193
|
+
- name: Run All Tests with Coverage
|
194
|
+
uses: nick-fields/retry@v3
|
195
|
+
with:
|
196
|
+
timeout_minutes: 15
|
197
|
+
max_attempts: 3
|
198
|
+
retry_wait_seconds: 10
|
199
|
+
command: |
|
200
|
+
uv run coverage erase
|
201
|
+
uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml --reruns 2 --reruns-delay 1
|
202
|
+
uv run coverage report --precision=2
|
203
|
+
shell: bash
|
197
204
|
|
198
205
|
- name: Upload Coverage Artifacts
|
206
|
+
if: always()
|
199
207
|
uses: actions/upload-artifact@v4
|
200
208
|
with:
|
201
|
-
name: coverage
|
202
|
-
path:
|
209
|
+
name: coverage-pr-${{ github.sha }}
|
210
|
+
path: |
|
211
|
+
coverage.lcov
|
212
|
+
.coverage
|
203
213
|
retention-days: 1
|
204
214
|
|
205
215
|
coverage-pr:
|
@@ -214,47 +224,7 @@ jobs:
|
|
214
224
|
- name: Download Coverage Artifacts
|
215
225
|
uses: actions/download-artifact@v5
|
216
226
|
with:
|
217
|
-
|
218
|
-
merge-multiple: true
|
219
|
-
|
220
|
-
- name: Install uv
|
221
|
-
uses: astral-sh/setup-uv@v6
|
222
|
-
with:
|
223
|
-
enable-cache: true
|
224
|
-
|
225
|
-
- name: Install Python
|
226
|
-
uses: actions/setup-python@v6
|
227
|
-
with:
|
228
|
-
python-version: "3.13"
|
229
|
-
|
230
|
-
- name: Install Dependencies
|
231
|
-
run: uv sync --dev
|
232
|
-
|
233
|
-
- name: Combine Coverage Reports
|
234
|
-
run: |
|
235
|
-
# Install lcov for combining reports
|
236
|
-
sudo apt-get update && sudo apt-get install -y lcov
|
237
|
-
|
238
|
-
# List available coverage files
|
239
|
-
echo "Available coverage files:"
|
240
|
-
find . -name "coverage-*.lcov" -type f || echo "No coverage files found"
|
241
|
-
|
242
|
-
# Combine all lcov files if they exist
|
243
|
-
coverage_files=($(find . -name "coverage-*.lcov" -type f))
|
244
|
-
if [ ${#coverage_files[@]} -gt 0 ]; then
|
245
|
-
echo "Combining ${#coverage_files[@]} coverage files..."
|
246
|
-
if [ ${#coverage_files[@]} -eq 1 ]; then
|
247
|
-
# Only one file, just copy it
|
248
|
-
cp "${coverage_files[0]}" coverage.lcov
|
249
|
-
else
|
250
|
-
# Multiple files, combine them
|
251
|
-
lcov --rc branch_coverage=1 $(printf " -a %s" "${coverage_files[@]}") -o coverage.lcov
|
252
|
-
fi
|
253
|
-
else
|
254
|
-
echo "No coverage files to combine, creating empty coverage.lcov"
|
255
|
-
echo "TN:" > coverage.lcov
|
256
|
-
echo "end_of_record" >> coverage.lcov
|
257
|
-
fi
|
227
|
+
name: coverage-pr-${{ github.sha }}
|
258
228
|
|
259
229
|
- name: Upload Coverage to DeepSource
|
260
230
|
if: always()
|
@@ -26,7 +26,7 @@ repos:
|
|
26
26
|
hooks:
|
27
27
|
- id: mdformat
|
28
28
|
additional_dependencies:
|
29
|
-
- mdformat-mkdocs==4.
|
29
|
+
- mdformat-mkdocs==4.1.0
|
30
30
|
- repo: https://github.com/igorshubovych/markdownlint-cli
|
31
31
|
rev: v0.45.0
|
32
32
|
hooks:
|
@@ -36,6 +36,7 @@ repos:
|
|
36
36
|
hooks:
|
37
37
|
- id: blacken-docs
|
38
38
|
args: ["--pyi", "--line-length", "130"]
|
39
|
+
exclude: tests/features/token_reduction_test.py
|
39
40
|
additional_dependencies:
|
40
41
|
- black==25.1.0
|
41
42
|
- repo: https://github.com/rbubley/mirrors-prettier
|
@@ -48,7 +49,7 @@ repos:
|
|
48
49
|
hooks:
|
49
50
|
- id: pyproject-fmt
|
50
51
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
51
|
-
rev: v0.13.0
|
52
|
+
rev: v0.13.1
|
52
53
|
hooks:
|
53
54
|
- id: ruff
|
54
55
|
args: ["--fix", "--unsafe-fixes"]
|
@@ -59,7 +60,7 @@ repos:
|
|
59
60
|
- id: codespell
|
60
61
|
exclude: ^tests|^scripts|^kreuzberg/_tesseract|^kreuzberg/_mime_types
|
61
62
|
additional_dependencies:
|
62
|
-
- tomli
|
63
|
+
- tomli==2.2.1
|
63
64
|
- repo: https://github.com/jsh9/pydoclint
|
64
65
|
rev: 0.7.3
|
65
66
|
hooks:
|
@@ -0,0 +1 @@
|
|
1
|
+
ATTRIBUTIONS.md
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# Third-Party Attributions
|
2
|
+
|
3
|
+
This file contains attributions for third-party code, data, and libraries used in Kreuzberg.
|
4
|
+
|
5
|
+
## Stopwords Data
|
6
|
+
|
7
|
+
The stopwords data in `kreuzberg/_token_reduction/stop_words.json` is derived from the [stopwords-iso](https://github.com/stopwords-iso/stopwords-iso) project.
|
8
|
+
|
9
|
+
**Original Author:** Gene Diaz and contributors
|
10
|
+
**License:** MIT License
|
11
|
+
**Source:** <https://github.com/stopwords-iso/stopwords-iso>
|
12
|
+
|
13
|
+
### MIT License (stopwords-iso)
|
14
|
+
|
15
|
+
```text
|
16
|
+
MIT License
|
17
|
+
|
18
|
+
Copyright (c) stopwords-iso contributors
|
19
|
+
|
20
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
21
|
+
of this software and associated documentation files (the "Software"), to deal
|
22
|
+
in the Software without restriction, including without limitation the rights
|
23
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
24
|
+
copies of the Software, and to permit persons to whom the Software is
|
25
|
+
furnished to do so, subject to the following conditions:
|
26
|
+
|
27
|
+
The above copyright notice and this permission notice shall be included in all
|
28
|
+
copies or substantial portions of the Software.
|
29
|
+
|
30
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
31
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
32
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
33
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
34
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
35
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
36
|
+
SOFTWARE.
|
37
|
+
```
|
38
|
+
|
39
|
+
### Changes Made
|
40
|
+
|
41
|
+
The original stopwords-iso data was used as-is with no modifications to the word lists themselves. The data was packaged into Kreuzberg's `_token_reduction` module for use in the token reduction feature.
|
42
|
+
|
43
|
+
______________________________________________________________________
|
44
|
+
|
45
|
+
## Other Third-Party Dependencies
|
46
|
+
|
47
|
+
All other third-party dependencies are listed in `pyproject.toml` with their respective licenses. This section is specifically for bundled/vendored code and data.
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.16.0
|
3
|
+
Version: 3.17.1
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -32,12 +32,13 @@ Requires-Dist: anyio>=4.10.0
|
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
34
|
Requires-Dist: html-to-markdown[lxml]>=1.13.0
|
35
|
-
Requires-Dist:
|
35
|
+
Requires-Dist: langcodes>=3.5.0
|
36
|
+
Requires-Dist: mcp>=1.14.1
|
36
37
|
Requires-Dist: msgspec>=0.18.0
|
37
38
|
Requires-Dist: numpy>=2.0.0
|
38
39
|
Requires-Dist: playa-pdf>=0.7.0
|
39
40
|
Requires-Dist: polars>=1.33.1
|
40
|
-
Requires-Dist: psutil>=7.
|
41
|
+
Requires-Dist: psutil>=7.1.0
|
41
42
|
Requires-Dist: pypdfium2==4.30.0
|
42
43
|
Requires-Dist: python-calamine>=0.5.3
|
43
44
|
Requires-Dist: python-pptx>=1.0.2
|
@@ -49,7 +50,7 @@ Provides-Extra: all
|
|
49
50
|
Requires-Dist: click>=8.2.1; extra == 'all'
|
50
51
|
Requires-Dist: deep-translator>=1.11.4; extra == 'all'
|
51
52
|
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
52
|
-
Requires-Dist: fast-langdetect>=0.
|
53
|
+
Requires-Dist: fast-langdetect>=1.0.0; extra == 'all'
|
53
54
|
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
54
55
|
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
55
56
|
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
|
@@ -82,7 +83,7 @@ Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
|
|
82
83
|
Provides-Extra: gmft
|
83
84
|
Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
84
85
|
Provides-Extra: langdetect
|
85
|
-
Requires-Dist: fast-langdetect>=0.
|
86
|
+
Requires-Dist: fast-langdetect>=1.0.0; extra == 'langdetect'
|
86
87
|
Provides-Extra: paddleocr
|
87
88
|
Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
|
88
89
|
Requires-Dist: paddlepaddle>=3.2.0; extra == 'paddleocr'
|
@@ -0,0 +1,268 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import json
|
4
|
+
import statistics
|
5
|
+
import time
|
6
|
+
from dataclasses import dataclass, field
|
7
|
+
from pathlib import Path
|
8
|
+
from typing import Any, Literal
|
9
|
+
|
10
|
+
from kreuzberg import extract_bytes_sync
|
11
|
+
from kreuzberg._token_reduction import get_reduction_stats, reduce_tokens
|
12
|
+
from kreuzberg._types import ExtractionConfig, TokenReductionConfig
|
13
|
+
|
14
|
+
|
15
|
+
@dataclass
class CompressionResult:
    """Measurements from one token-reduction run over a single sample text.

    The ``*_ratio`` fields hold the raw ratios; the ``*_percent`` properties
    expose the same values multiplied by 100.
    """

    text_type: str  # label of the sample text that was reduced
    mode: str  # reduction mode used for the run
    original_length: int  # character count before reduction
    reduced_length: int  # character count after reduction
    original_tokens: int
    reduced_tokens: int
    character_reduction_ratio: float
    token_reduction_ratio: float
    processing_time_ms: float  # duration of the reduction call, in milliseconds

    @property
    def character_compression_percent(self) -> float:
        """Return ``character_reduction_ratio`` scaled by 100."""
        ratio = self.character_reduction_ratio
        return ratio * 100

    @property
    def token_compression_percent(self) -> float:
        """Return ``token_reduction_ratio`` scaled by 100."""
        ratio = self.token_reduction_ratio
        return ratio * 100
|
34
|
+
|
35
|
+
|
36
|
+
@dataclass
class CompressionBenchmarkSuite:
    """Collects ``CompressionResult`` records and summarises them per mode."""

    results: list[CompressionResult] = field(default_factory=list)
    total_tests: int = 0  # number of results added through add_result
    total_time_ms: float = 0.0  # sum of processing_time_ms over added results

    def add_result(self, result: CompressionResult) -> None:
        """Store *result* and update the running count and total time."""
        self.results.append(result)
        self.total_tests = self.total_tests + 1
        self.total_time_ms = self.total_time_ms + result.processing_time_ms

    @staticmethod
    def _spread(values: list[float]) -> dict[str, float]:
        """Return mean, median, stdev, min and max of *values*.

        ``stdev`` is reported as 0.0 when there is only one value, because
        ``statistics.stdev`` needs at least two data points.
        """
        deviation = statistics.stdev(values) if len(values) > 1 else 0.0
        return {
            "mean": statistics.mean(values),
            "median": statistics.median(values),
            "stdev": deviation,
            "min": min(values),
            "max": max(values),
        }

    def get_summary(self) -> dict[str, Any]:
        """Build a JSON-serialisable summary of all collected results.

        Returns:
            An empty dict when no results were collected; otherwise a dict
            with ``"summary"`` (overall totals), ``"by_mode"`` (statistics
            grouped by reduction mode) and ``"detailed_results"`` (one entry
            per result, in insertion order).
        """
        if not self.results:
            return {}

        # Group results by mode, keeping first-seen order of the modes.
        grouped: dict[str, Any] = {}
        for entry in self.results:
            grouped.setdefault(entry.mode, []).append(entry)

        mode_stats = {}
        for mode, entries in grouped.items():
            char_percents = [e.character_compression_percent for e in entries]
            token_percents = [e.token_compression_percent for e in entries]
            durations = [e.processing_time_ms for e in entries]

            mode_stats[mode] = {
                "tests": len(entries),
                "character_compression": self._spread(char_percents),
                "token_compression": self._spread(token_percents),
                "performance": {
                    "avg_time_ms": statistics.mean(durations),
                    "total_time_ms": sum(durations),
                },
            }

        if self.total_tests > 0:
            avg_per_test = self.total_time_ms / self.total_tests
        else:
            avg_per_test = 0.0

        detailed = [
            {
                "text_type": item.text_type,
                "mode": item.mode,
                "original_length": item.original_length,
                "reduced_length": item.reduced_length,
                "original_tokens": item.original_tokens,
                "reduced_tokens": item.reduced_tokens,
                "character_compression_percent": item.character_compression_percent,
                "token_compression_percent": item.token_compression_percent,
                "processing_time_ms": item.processing_time_ms,
            }
            for item in self.results
        ]

        return {
            "summary": {
                "total_tests": self.total_tests,
                "total_time_ms": self.total_time_ms,
                "avg_time_per_test_ms": avg_per_test,
            },
            "by_mode": mode_stats,
            "detailed_results": detailed,
        }
|
107
|
+
|
108
|
+
|
109
|
+
class TokenReductionCompressionBenchmark:
    """Runs token reduction over fixed English sample texts in each mode."""

    def __init__(self) -> None:
        """Load the sample texts and set the modes to benchmark."""
        self.test_texts = self._create_test_texts()
        self.modes: list[Literal["light", "moderate"]] = ["light", "moderate"]

    def _create_test_texts(self) -> dict[str, str]:
        """Return the sample texts keyed by label, each stripped of outer whitespace."""
        raw_samples = {
            "formal_document": """
The quarterly financial report demonstrates significant improvements in operational efficiency and market positioning.
Our comprehensive analysis reveals that the implementation of strategic initiatives has resulted in measurable outcomes
across multiple key performance indicators. The organization's commitment to excellence and continuous improvement
is evident in these results. Furthermore, the systematic approach to risk management and quality assurance has
contributed to enhanced stakeholder confidence and sustainable growth trajectory. The board of directors acknowledges
the exceptional efforts of the management team and all employees in achieving these remarkable results.
""",
            "casual_conversation": """
Hey there! I was just thinking about that amazing movie we watched last weekend. It was really incredible, wasn't it?
The way they told the story was so compelling and the characters were just wonderful. I think it's one of the best
films I've seen this year. What did you think about it? I'd love to hear your thoughts and maybe we could discuss
some of the themes that really stood out to me. There were so many interesting elements that I'm still thinking about.
""",
            "technical_manual": """
Configure the system parameters by accessing the administrative interface through the main configuration panel.
Navigate to Settings > Advanced > Network Configuration and verify that all connection parameters are correctly
initialized. The TCP/IP stack must be properly configured with appropriate DNS resolution settings and gateway
routing tables. Execute the diagnostic utilities to validate network connectivity and ensure that all protocols
are functioning within acceptable performance thresholds. Document any configuration changes in the system log
for future reference and troubleshooting procedures.
""",
            "news_article": """
Local authorities announced today that the new public transportation system will begin operations next month,
connecting several major districts across the metropolitan area. The project, which has been in development for
over three years, represents a significant investment in sustainable urban infrastructure. City officials expect
the system to reduce traffic congestion and provide affordable transportation options for thousands of daily
commuters. Environmental impact studies indicate that the implementation will contribute to reduced carbon emissions
and improved air quality throughout the region.
""",
            "literature_excerpt": """
The old lighthouse stood majestically against the stormy horizon, its weathered stones bearing witness to countless
tempests and countless ships that had sought its guiding light. Sarah approached the ancient structure with a sense
of reverence, knowing that within its walls lay the stories of generations of lighthouse keepers who had dedicated
their lives to the safety of maritime travelers. The wind howled through the nearby cliffs, carrying with it the
salt spray of crashing waves and the whispered secrets of the sea itself.
""",
            "scientific_abstract": """
This study investigates the relationship between cognitive load and working memory performance in multilingual
individuals under various experimental conditions. Participants (n=127) completed a series of standardized
assessments while neural activity was monitored using electroencephalography. Results indicate significant
correlations between language switching frequency and executive control efficiency (p<0.001). The findings suggest
that bilingual advantages in cognitive flexibility extend to domain-general executive functions, with implications
for educational policy and cognitive training interventions.
""",
            "stopword_heavy": """
And so it was that he went to the store, and then he bought some things that he needed for the house. But when
he got back to the place where he lived, he realized that he had forgotten to get the most important thing that
he had originally planned to purchase. So he had to go back to the store again, and this time he made sure to
get everything that he needed. It was a bit frustrating, but in the end, everything worked out just fine.
""",
            "technical_jargon": """
The microservices architecture implements a distributed system pattern utilizing containerized deployments
orchestrated through Kubernetes clusters. API gateways facilitate service discovery and load balancing across
multiple availability zones. The event-driven messaging infrastructure leverages Apache Kafka for asynchronous
communication between bounded contexts. Monitoring and observability are achieved through OpenTelemetry
instrumentation with Prometheus metrics collection and Grafana visualization dashboards.
""",
            "minimal_stopwords": """
Python programming language offers powerful features. Machine learning algorithms require extensive datasets.
Neural networks demonstrate remarkable performance capabilities. Developers utilize frameworks like TensorFlow.
Data preprocessing involves cleaning, transformation, validation procedures. Model training requires computational
resources, optimization techniques. Evaluation metrics include accuracy, precision, recall measurements.
Production deployment considerations encompass scalability, monitoring, maintenance requirements.
""",
        }
        # Strip once here so every sample starts and ends on visible text.
        return {label: body.strip() for label, body in raw_samples.items()}

    def test_compression_effectiveness(
        self, text: str, text_type: str, mode: Literal["light", "moderate"]
    ) -> CompressionResult:
        """Reduce *text* in *mode* and return timing plus reduction statistics.

        Markdown preservation is disabled and the language is fixed to ``"en"``.
        Only the ``reduce_tokens`` call is timed.
        """
        reduction_config = TokenReductionConfig(mode=mode, preserve_markdown=False)

        began = time.perf_counter()
        reduced = reduce_tokens(text, config=reduction_config, language="en")
        elapsed_ms = (time.perf_counter() - began) * 1000

        stats = get_reduction_stats(text, reduced)

        return CompressionResult(
            text_type=text_type,
            mode=mode,
            original_length=len(text),
            reduced_length=len(reduced),
            original_tokens=stats["original_tokens"],
            reduced_tokens=stats["reduced_tokens"],
            character_reduction_ratio=stats["character_reduction_ratio"],
            token_reduction_ratio=stats["token_reduction_ratio"],
            processing_time_ms=elapsed_ms,
        )

    def run_comprehensive_benchmark(self) -> CompressionBenchmarkSuite:
        """Benchmark every sample text in every mode and collect the results."""
        collected = CompressionBenchmarkSuite()

        for label, sample in self.test_texts.items():
            for mode in self.modes:
                collected.add_result(self.test_compression_effectiveness(sample, label, mode))

        return collected

    def run_pipeline_integration_test(self) -> dict[str, Any]:
        """Run the ``formal_document`` sample through ``extract_bytes_sync`` per mode.

        Returns:
            A dict keyed by mode with the original and reduced lengths, the
            elapsed time in milliseconds, the ``token_reduction`` metadata
            entry (``{}`` when absent) and whether that entry was present.
        """
        sample = self.test_texts["formal_document"]
        outcome: dict[str, Any] = {}

        for mode in self.modes:
            extraction_config = ExtractionConfig(token_reduction=TokenReductionConfig(mode=mode))

            began = time.perf_counter()
            result = extract_bytes_sync(sample.encode("utf-8"), "text/plain", extraction_config)
            elapsed_ms = (time.perf_counter() - began) * 1000

            outcome[mode] = {
                "original_length": len(sample),
                "reduced_length": len(result.content),
                "processing_time_ms": elapsed_ms,
                "reduction_stats": result.metadata.get("token_reduction", {}),
                "metadata_present": "token_reduction" in result.metadata,
            }

        return outcome
|
239
|
+
|
240
|
+
|
241
|
+
def main() -> None:
    """Run both benchmarks and write the combined results as JSON.

    The output file is ``benchmarks/results/token_reduction_compression.json``,
    resolved against the current working directory. It holds the compression
    summary, the pipeline integration results and a Unix timestamp.
    """
    benchmark = TokenReductionCompressionBenchmark()

    suite = benchmark.run_comprehensive_benchmark()
    pipeline_results = benchmark.run_pipeline_integration_test()
    summary = suite.get_summary()

    output_dir = Path("benchmarks/results")
    # parents=True so the run also works when "benchmarks/" does not exist
    # under the current working directory.
    output_dir.mkdir(parents=True, exist_ok=True)

    full_results = {
        "compression_benchmark": summary,
        "pipeline_integration": pipeline_results,
        "timestamp": time.time(),
    }

    output_file = output_dir / "token_reduction_compression.json"
    # Explicit encoding keeps the output independent of the platform default.
    with output_file.open("w", encoding="utf-8") as f:
        json.dump(full_results, f, indent=2)


if __name__ == "__main__":
    main()
|
@@ -84,6 +84,12 @@ Configuration options for converting HTML content to Markdown:
|
|
84
84
|
|
85
85
|
::: kreuzberg.HTMLToMarkdownConfig
|
86
86
|
|
87
|
+
## Token Reduction Configuration
|
88
|
+
|
89
|
+
Configuration options for token reduction and text optimization:
|
90
|
+
|
91
|
+
::: kreuzberg.TokenReductionConfig
|
92
|
+
|
87
93
|
## PSMMode (Page Segmentation Mode)
|
88
94
|
|
89
95
|
::: kreuzberg.PSMMode
|
@@ -8,6 +8,7 @@ This guide provides comprehensive documentation for the Kreuzberg document intel
|
|
8
8
|
- [Extraction Configuration](extraction-configuration.md) - Configure the extraction process ([API](../api-reference/types.md#extractionconfig))
|
9
9
|
- [Metadata Extraction](metadata-extraction.md) - Document metadata extraction ([API](../api-reference/types.md#metadata))
|
10
10
|
- [Content Chunking](chunking.md) - Split documents into manageable chunks
|
11
|
+
- [Token Reduction](token-reduction.md) - Optimize text for LLMs and storage ([API](../api-reference/types.md#tokenreductionconfig))
|
11
12
|
- [Document Classification](document-classification.md) - Automatic document type detection
|
12
13
|
- [OCR Configuration](ocr-configuration.md) - Configure OCR settings ([API](../api-reference/ocr-configuration.md))
|
13
14
|
- [OCR Backends](ocr-backends.md) - Choose and configure different OCR engines
|