kreuzberg 3.17.3__tar.gz → 3.20.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/PKG-INFO +33 -46
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_api/main.py +45 -3
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_entity_extraction.py +108 -18
- kreuzberg-3.20.1/kreuzberg/_error_handling.py +182 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_extractors/_base.py +2 -2
- kreuzberg-3.20.1/kreuzberg/_extractors/_html.py +138 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_extractors/_pdf.py +33 -54
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_extractors/_structured.py +1 -1
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_language_detection.py +2 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_ocr/_tesseract.py +76 -297
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_types.py +143 -47
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/cli.py +36 -22
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/extraction.py +251 -107
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/pyproject.toml +59 -75
- kreuzberg-3.17.3/.commitlintrc +0 -1
- kreuzberg-3.17.3/.deepsource.toml +0 -54
- kreuzberg-3.17.3/.docker/Dockerfile +0 -79
- kreuzberg-3.17.3/.docker/README.md +0 -190
- kreuzberg-3.17.3/.dockerignore +0 -15
- kreuzberg-3.17.3/.github/dependabot.yaml +0 -6
- kreuzberg-3.17.3/.github/workflows/ci.yaml +0 -381
- kreuzberg-3.17.3/.github/workflows/docker-e2e-tests.yml +0 -150
- kreuzberg-3.17.3/.github/workflows/docs.yml +0 -66
- kreuzberg-3.17.3/.github/workflows/pr-title.yaml +0 -20
- kreuzberg-3.17.3/.github/workflows/publish-docker.yml +0 -163
- kreuzberg-3.17.3/.github/workflows/release.yaml +0 -37
- kreuzberg-3.17.3/.github/workflows/test-docker-builds.yml +0 -101
- kreuzberg-3.17.3/.gitignore +0 -74
- kreuzberg-3.17.3/.markdownlint.yaml +0 -17
- kreuzberg-3.17.3/.pre-commit-config.yaml +0 -82
- kreuzberg-3.17.3/.prettierignore +0 -1
- kreuzberg-3.17.3/ATTRIBUTIONS.md +0 -47
- kreuzberg-3.17.3/LICENSE +0 -7
- kreuzberg-3.17.3/Taskfile.yml +0 -50
- kreuzberg-3.17.3/ai-rulez.yaml +0 -586
- kreuzberg-3.17.3/benchmarks/README.md +0 -264
- kreuzberg-3.17.3/benchmarks/batch_size_benchmark.py +0 -179
- kreuzberg-3.17.3/benchmarks/batch_validation_benchmark.py +0 -83
- kreuzberg-3.17.3/benchmarks/pyproject.toml +0 -29
- kreuzberg-3.17.3/benchmarks/src/__init__.py +0 -1
- kreuzberg-3.17.3/benchmarks/src/__main__.py +0 -4
- kreuzberg-3.17.3/benchmarks/src/benchmarks.py +0 -703
- kreuzberg-3.17.3/benchmarks/src/cli.py +0 -723
- kreuzberg-3.17.3/benchmarks/src/models.py +0 -195
- kreuzberg-3.17.3/benchmarks/src/profiler.py +0 -161
- kreuzberg-3.17.3/benchmarks/src/runner.py +0 -367
- kreuzberg-3.17.3/benchmarks/token_reduction_compression_benchmark.py +0 -268
- kreuzberg-3.17.3/docs/advanced/custom-extractors.md +0 -203
- kreuzberg-3.17.3/docs/advanced/custom-hooks.md +0 -148
- kreuzberg-3.17.3/docs/advanced/error-handling.md +0 -181
- kreuzberg-3.17.3/docs/advanced/index.md +0 -41
- kreuzberg-3.17.3/docs/advanced/performance.md +0 -306
- kreuzberg-3.17.3/docs/api-reference/exceptions.md +0 -33
- kreuzberg-3.17.3/docs/api-reference/extraction-functions.md +0 -59
- kreuzberg-3.17.3/docs/api-reference/extractor-registry.md +0 -5
- kreuzberg-3.17.3/docs/api-reference/index.md +0 -51
- kreuzberg-3.17.3/docs/api-reference/ocr-configuration.md +0 -27
- kreuzberg-3.17.3/docs/api-reference/types.md +0 -120
- kreuzberg-3.17.3/docs/assets/favicon.png +0 -0
- kreuzberg-3.17.3/docs/assets/logo.png +0 -0
- kreuzberg-3.17.3/docs/cli.md +0 -225
- kreuzberg-3.17.3/docs/contributing.md +0 -82
- kreuzberg-3.17.3/docs/css/extra.css +0 -56
- kreuzberg-3.17.3/docs/examples/extraction-examples.md +0 -763
- kreuzberg-3.17.3/docs/examples/index.md +0 -48
- kreuzberg-3.17.3/docs/getting-started/index.md +0 -20
- kreuzberg-3.17.3/docs/getting-started/installation.md +0 -154
- kreuzberg-3.17.3/docs/getting-started/quick-start.md +0 -111
- kreuzberg-3.17.3/docs/index.md +0 -60
- kreuzberg-3.17.3/docs/user-guide/api-server.md +0 -500
- kreuzberg-3.17.3/docs/user-guide/basic-usage.md +0 -161
- kreuzberg-3.17.3/docs/user-guide/chunking.md +0 -124
- kreuzberg-3.17.3/docs/user-guide/docker.md +0 -548
- kreuzberg-3.17.3/docs/user-guide/document-classification.md +0 -61
- kreuzberg-3.17.3/docs/user-guide/extraction-configuration.md +0 -966
- kreuzberg-3.17.3/docs/user-guide/index.md +0 -45
- kreuzberg-3.17.3/docs/user-guide/mcp-server.md +0 -586
- kreuzberg-3.17.3/docs/user-guide/metadata-extraction.md +0 -125
- kreuzberg-3.17.3/docs/user-guide/ocr-backends.md +0 -247
- kreuzberg-3.17.3/docs/user-guide/ocr-configuration.md +0 -414
- kreuzberg-3.17.3/docs/user-guide/supported-formats.md +0 -71
- kreuzberg-3.17.3/docs/user-guide/token-reduction.md +0 -251
- kreuzberg-3.17.3/kreuzberg/_extractors/_html.py +0 -148
- kreuzberg-3.17.3/kreuzberg/_utils/__init__.py +0 -0
- kreuzberg-3.17.3/kreuzberg/_utils/_html_streaming.py +0 -20
- kreuzberg-3.17.3/kreuzberg/py.typed +0 -0
- kreuzberg-3.17.3/mkdocs.yaml +0 -160
- kreuzberg-3.17.3/tests/__init__.py +0 -0
- kreuzberg-3.17.3/tests/api/__init__.py +0 -0
- kreuzberg-3.17.3/tests/api/config_cache_test.py +0 -224
- kreuzberg-3.17.3/tests/api/conftest.py +0 -18
- kreuzberg-3.17.3/tests/api/header_config_hashing_test.py +0 -29
- kreuzberg-3.17.3/tests/api/image_extraction_test.py +0 -59
- kreuzberg-3.17.3/tests/api/main_test.py +0 -817
- kreuzberg-3.17.3/tests/api/runtime_config_test.py +0 -374
- kreuzberg-3.17.3/tests/conftest.py +0 -219
- kreuzberg-3.17.3/tests/core/__init__.py +0 -0
- kreuzberg-3.17.3/tests/core/comprehensive_config_test.py +0 -664
- kreuzberg-3.17.3/tests/core/config_test.py +0 -15
- kreuzberg-3.17.3/tests/core/constants_test.py +0 -22
- kreuzberg-3.17.3/tests/core/dpi_configuration_test.py +0 -319
- kreuzberg-3.17.3/tests/core/exceptions_test.py +0 -159
- kreuzberg-3.17.3/tests/core/extraction_batch_test.py +0 -389
- kreuzberg-3.17.3/tests/core/extraction_test.py +0 -494
- kreuzberg-3.17.3/tests/core/html_to_markdown_config_test.py +0 -0
- kreuzberg-3.17.3/tests/core/image_ocr_result_test.py +0 -27
- kreuzberg-3.17.3/tests/core/init_test.py +0 -85
- kreuzberg-3.17.3/tests/core/main_test.py +0 -35
- kreuzberg-3.17.3/tests/core/mime_types_test.py +0 -242
- kreuzberg-3.17.3/tests/core/registry_test.py +0 -225
- kreuzberg-3.17.3/tests/core/types_test.py +0 -465
- kreuzberg-3.17.3/tests/e2e/__init__.py +0 -0
- kreuzberg-3.17.3/tests/e2e/docker_e2e.py +0 -481
- kreuzberg-3.17.3/tests/extractors/README_image_tests.md +0 -85
- kreuzberg-3.17.3/tests/extractors/__init__.py +0 -0
- kreuzberg-3.17.3/tests/extractors/base_extractor_test.py +0 -420
- kreuzberg-3.17.3/tests/extractors/base_memory_limits_test.py +0 -100
- kreuzberg-3.17.3/tests/extractors/base_ocr_processing_test.py +0 -276
- kreuzberg-3.17.3/tests/extractors/base_ocr_simple_test.py +0 -64
- kreuzberg-3.17.3/tests/extractors/email_error_paths_test.py +0 -39
- kreuzberg-3.17.3/tests/extractors/email_test.py +0 -948
- kreuzberg-3.17.3/tests/extractors/html_invalid_base64_test.py +0 -11
- kreuzberg-3.17.3/tests/extractors/html_test.py +0 -52
- kreuzberg-3.17.3/tests/extractors/image_deduplication_test.py +0 -87
- kreuzberg-3.17.3/tests/extractors/image_error_handling_test.py +0 -253
- kreuzberg-3.17.3/tests/extractors/image_error_simple_test.py +0 -75
- kreuzberg-3.17.3/tests/extractors/image_test.py +0 -766
- kreuzberg-3.17.3/tests/extractors/json_test.py +0 -427
- kreuzberg-3.17.3/tests/extractors/pandoc_metadata_test.py +0 -323
- kreuzberg-3.17.3/tests/extractors/pandoc_test.py +0 -1995
- kreuzberg-3.17.3/tests/extractors/pdf_images_test.py +0 -52
- kreuzberg-3.17.3/tests/extractors/pdf_sync_images_test.py +0 -217
- kreuzberg-3.17.3/tests/extractors/pdf_test.py +0 -905
- kreuzberg-3.17.3/tests/extractors/presentation_test.py +0 -967
- kreuzberg-3.17.3/tests/extractors/spreadsheet_test.py +0 -1140
- kreuzberg-3.17.3/tests/extractors/structured_test.py +0 -304
- kreuzberg-3.17.3/tests/features/__init__.py +0 -0
- kreuzberg-3.17.3/tests/features/chunker_test.py +0 -94
- kreuzberg-3.17.3/tests/features/document_classification_test.py +0 -747
- kreuzberg-3.17.3/tests/features/entity_extraction_test.py +0 -404
- kreuzberg-3.17.3/tests/features/gmft_test.py +0 -1496
- kreuzberg-3.17.3/tests/features/hooks_test.py +0 -0
- kreuzberg-3.17.3/tests/features/language_detection_test.py +0 -343
- kreuzberg-3.17.3/tests/features/table_extraction_test.py +0 -0
- kreuzberg-3.17.3/tests/features/token_reduction_test.py +0 -813
- kreuzberg-3.17.3/tests/integration/__init__.py +0 -0
- kreuzberg-3.17.3/tests/integration/all_extractors_images_test.py +0 -252
- kreuzberg-3.17.3/tests/integration/api/__init__.py +0 -0
- kreuzberg-3.17.3/tests/integration/api/large_file_test.py +0 -0
- kreuzberg-3.17.3/tests/integration/api/mounted_config_test.py +0 -0
- kreuzberg-3.17.3/tests/integration/dpi_integration_test.py +0 -209
- kreuzberg-3.17.3/tests/integration/multiprocessing/__init__.py +0 -0
- kreuzberg-3.17.3/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
- kreuzberg-3.17.3/tests/integration/ocr/__init__.py +0 -0
- kreuzberg-3.17.3/tests/integration/ocr/device_integration_test.py +0 -0
- kreuzberg-3.17.3/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
- kreuzberg-3.17.3/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
- kreuzberg-3.17.3/tests/integration/pandoc_images_test.py +0 -30
- kreuzberg-3.17.3/tests/integration/pdf_images_test.py +0 -18
- kreuzberg-3.17.3/tests/integration/pdf_real_images_test.py +0 -52
- kreuzberg-3.17.3/tests/integration/pptx_complex_test.py +0 -22
- kreuzberg-3.17.3/tests/integration/pptx_images_test.py +0 -18
- kreuzberg-3.17.3/tests/integration/regression_test.py +0 -134
- kreuzberg-3.17.3/tests/integration/token_reduction_integration_test.py +0 -173
- kreuzberg-3.17.3/tests/interfaces/__init__.py +0 -0
- kreuzberg-3.17.3/tests/interfaces/cli_test.py +0 -527
- kreuzberg-3.17.3/tests/interfaces/mcp_server_test.py +0 -1116
- kreuzberg-3.17.3/tests/mcp/__init__.py +0 -0
- kreuzberg-3.17.3/tests/mcp/mcp_server_test.py +0 -0
- kreuzberg-3.17.3/tests/multiprocessing/__init__.py +0 -0
- kreuzberg-3.17.3/tests/multiprocessing/gmft_isolated_test.py +0 -449
- kreuzberg-3.17.3/tests/multiprocessing/process_manager_test.py +0 -273
- kreuzberg-3.17.3/tests/multiprocessing/tesseract_pool_test.py +0 -331
- kreuzberg-3.17.3/tests/ocr/__init__.py +0 -0
- kreuzberg-3.17.3/tests/ocr/base_test.py +0 -80
- kreuzberg-3.17.3/tests/ocr/easyocr_test.py +0 -517
- kreuzberg-3.17.3/tests/ocr/init_test.py +0 -35
- kreuzberg-3.17.3/tests/ocr/paddleocr_test.py +0 -835
- kreuzberg-3.17.3/tests/ocr/tesseract_test.py +0 -1314
- kreuzberg-3.17.3/tests/ocr/tesseract_tsv_test.py +0 -409
- kreuzberg-3.17.3/tests/performance/__init__.py +0 -0
- kreuzberg-3.17.3/tests/performance/large_pdf_perf_test.py +0 -29
- kreuzberg-3.17.3/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
- kreuzberg-3.17.3/tests/test_source_files/contract.txt +0 -1
- kreuzberg-3.17.3/tests/test_source_files/contract_test.txt +0 -4
- kreuzberg-3.17.3/tests/test_source_files/document.docx +0 -0
- kreuzberg-3.17.3/tests/test_source_files/email/sample-email.eml +0 -11
- kreuzberg-3.17.3/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- kreuzberg-3.17.3/tests/test_source_files/excel.xlsx +0 -0
- kreuzberg-3.17.3/tests/test_source_files/flower-no-text.jpg +0 -0
- kreuzberg-3.17.3/tests/test_source_files/form_test.txt +0 -5
- kreuzberg-3.17.3/tests/test_source_files/french-text.txt +0 -2
- kreuzberg-3.17.3/tests/test_source_files/german-text.txt +0 -2
- kreuzberg-3.17.3/tests/test_source_files/google-doc-document.pdf +0 -0
- kreuzberg-3.17.3/tests/test_source_files/html.html +0 -10
- kreuzberg-3.17.3/tests/test_source_files/images/test_hello_world.png +0 -0
- kreuzberg-3.17.3/tests/test_source_files/invoice_image.png +0 -0
- kreuzberg-3.17.3/tests/test_source_files/invoice_test.txt +0 -4
- kreuzberg-3.17.3/tests/test_source_files/json/complex_nested.json +0 -41
- kreuzberg-3.17.3/tests/test_source_files/json/real_world/aws_policy.json +0 -43
- kreuzberg-3.17.3/tests/test_source_files/json/real_world/earthquakes.geojson +0 -6
- kreuzberg-3.17.3/tests/test_source_files/json/real_world/github_emojis.json +0 -111
- kreuzberg-3.17.3/tests/test_source_files/json/real_world/iss_location.json +0 -1
- kreuzberg-3.17.3/tests/test_source_files/json/real_world/openapi_spec.json +0 -84
- kreuzberg-3.17.3/tests/test_source_files/json/real_world/package.json +0 -33
- kreuzberg-3.17.3/tests/test_source_files/json/real_world/rick_morty_character.json +0 -1
- kreuzberg-3.17.3/tests/test_source_files/json/sample-document.json +0 -1
- kreuzberg-3.17.3/tests/test_source_files/json/schema_test.json +0 -25
- kreuzberg-3.17.3/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- kreuzberg-3.17.3/tests/test_source_files/markdown.md +0 -1
- kreuzberg-3.17.3/tests/test_source_files/non-ascii-text.pdf +0 -0
- kreuzberg-3.17.3/tests/test_source_files/non-searchable.pdf +0 -0
- kreuzberg-3.17.3/tests/test_source_files/ocr-image.jpg +0 -0
- kreuzberg-3.17.3/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- kreuzberg-3.17.3/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- kreuzberg-3.17.3/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- kreuzberg-3.17.3/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- kreuzberg-3.17.3/tests/test_source_files/receipt_test.txt +0 -5
- kreuzberg-3.17.3/tests/test_source_files/report_test.txt +0 -4
- kreuzberg-3.17.3/tests/test_source_files/sample-contract.pdf +0 -0
- kreuzberg-3.17.3/tests/test_source_files/scanned.pdf +0 -0
- kreuzberg-3.17.3/tests/test_source_files/searchable.pdf +0 -0
- kreuzberg-3.17.3/tests/test_source_files/sharable-web-guide.pdf +0 -0
- kreuzberg-3.17.3/tests/test_source_files/spanish-text.txt +0 -2
- kreuzberg-3.17.3/tests/test_source_files/tables/borderless_table.png +0 -0
- kreuzberg-3.17.3/tests/test_source_files/tables/complex_document.png +0 -0
- kreuzberg-3.17.3/tests/test_source_files/tables/simple_table.png +0 -0
- kreuzberg-3.17.3/tests/test_source_files/test-article.pdf +0 -0
- kreuzberg-3.17.3/tests/test_source_files/test-excel.xls +0 -0
- kreuzberg-3.17.3/tests/test_source_files/yaml/sample-config.yaml +0 -15
- kreuzberg-3.17.3/tests/utils/__init__.py +0 -0
- kreuzberg-3.17.3/tests/utils/cache_test.py +0 -427
- kreuzberg-3.17.3/tests/utils/device_test.py +0 -347
- kreuzberg-3.17.3/tests/utils/errors_test.py +0 -343
- kreuzberg-3.17.3/tests/utils/ocr_cache_test.py +0 -286
- kreuzberg-3.17.3/tests/utils/pdf_lock_test.py +0 -215
- kreuzberg-3.17.3/tests/utils/playa_helpers_test.py +0 -0
- kreuzberg-3.17.3/tests/utils/playa_metadata_test.py +0 -753
- kreuzberg-3.17.3/tests/utils/playa_test.py +0 -315
- kreuzberg-3.17.3/tests/utils/process_pool_test.py +0 -223
- kreuzberg-3.17.3/tests/utils/quality_test.py +0 -121
- kreuzberg-3.17.3/tests/utils/ref_test.py +0 -90
- kreuzberg-3.17.3/tests/utils/serialization_test.py +0 -379
- kreuzberg-3.17.3/tests/utils/string_test.py +0 -251
- kreuzberg-3.17.3/tests/utils/sync_test.py +0 -259
- kreuzberg-3.17.3/tests/utils/table_test.py +0 -353
- kreuzberg-3.17.3/tests/utils/tmp_test.py +0 -50
- kreuzberg-3.17.3/uv.lock +0 -6184
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/README.md +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.17.3/benchmarks → kreuzberg-3.20.1/kreuzberg/_api}/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_api/_config_cache.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_config.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_document_classification.py +0 -0
- {kreuzberg-3.17.3/kreuzberg/_api → kreuzberg-3.20.1/kreuzberg/_extractors}/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_extractors/_email.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_gmft.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_mcp/server.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_ocr/_table_extractor.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/_reducer.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/_stopwords.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/af_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ar_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/bg_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/bn_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/br_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ca_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/cs_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/da_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/de_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/el_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/en_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/eo_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/es_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/et_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/eu_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/fa_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/fi_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/fr_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ga_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/gl_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/gu_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ha_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/he_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/hi_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/hr_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/hu_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/hy_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/id_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/it_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ja_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/kn_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ko_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ku_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/la_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/lt_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/lv_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ml_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/mr_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ms_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ne_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/nl_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/no_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/pl_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/pt_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ro_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ru_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/si_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/sk_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/sl_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/so_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/st_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/sv_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/sw_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ta_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/te_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/th_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/tl_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/tr_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/uk_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ur_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/vi_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/yo_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/zh_stopwords.json +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/zu_stopwords.json +0 -0
- {kreuzberg-3.17.3/kreuzberg/_extractors → kreuzberg-3.20.1/kreuzberg/_utils}/__init__.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_image_preprocessing.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_ocr_cache.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_ref.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_resource_managers.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.17.3 → kreuzberg-3.20.1}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.17.3/benchmarks → kreuzberg-3.20.1/kreuzberg}/py.typed +0 -0
@@ -1,13 +1,11 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.3
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.17.3
|
3
|
+
Version: 3.20.1
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
|
-
|
6
|
-
|
5
|
+
Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
|
6
|
+
Author: Na'aman Hirschfeld
|
7
7
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
8
8
|
License: MIT
|
9
|
-
License-File: LICENSE
|
10
|
-
Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
|
11
9
|
Classifier: Development Status :: 5 - Production/Stable
|
12
10
|
Classifier: Intended Audience :: Developers
|
13
11
|
Classifier: Intended Audience :: Information Technology
|
@@ -27,67 +25,56 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
27
25
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
28
26
|
Classifier: Topic :: Text Processing :: General
|
29
27
|
Classifier: Typing :: Typed
|
30
|
-
Requires-Python: >=3.10
|
31
|
-
Requires-Dist: anyio>=4.10.0
|
28
|
+
Requires-Dist: anyio>=4.11.0
|
32
29
|
Requires-Dist: chardetng-py>=0.3.5
|
33
|
-
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
|
-
Requires-Dist: html-to-markdown
|
30
|
+
Requires-Dist: exceptiongroup>=1.2.2 ; python_full_version < '3.11'
|
31
|
+
Requires-Dist: html-to-markdown>=2.1.0
|
35
32
|
Requires-Dist: langcodes>=3.5.0
|
36
|
-
Requires-Dist: mcp>=1.
|
33
|
+
Requires-Dist: mcp>=1.17.0
|
37
34
|
Requires-Dist: msgspec>=0.18.0
|
38
35
|
Requires-Dist: numpy>=2.0.0
|
39
36
|
Requires-Dist: playa-pdf>=0.7.0
|
40
|
-
Requires-Dist: polars>=1.
|
37
|
+
Requires-Dist: polars>=1.34.0
|
41
38
|
Requires-Dist: psutil>=7.1.0
|
42
39
|
Requires-Dist: pypdfium2==4.30.0
|
43
40
|
Requires-Dist: python-calamine>=0.5.3
|
44
41
|
Requires-Dist: python-pptx>=1.0.2
|
45
|
-
Requires-Dist:
|
42
|
+
Requires-Dist: transformers>=4.55.0
|
43
|
+
Requires-Dist: typing-extensions>=4.15.0 ; python_full_version < '3.12'
|
44
|
+
Requires-Dist: mailparse>=1.0.15 ; extra == 'additional-extensions'
|
45
|
+
Requires-Dist: tomli>=2.0.0 ; python_full_version < '3.11' and extra == 'additional-extensions'
|
46
|
+
Requires-Dist: kreuzberg[additional-extensions,api,chunking,cli,crypto,document-classification,easyocr,entity-extraction,gmft,langdetect,paddleocr] ; extra == 'all'
|
47
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.18.0 ; extra == 'api'
|
48
|
+
Requires-Dist: semantic-text-splitter>=0.28.0 ; extra == 'chunking'
|
49
|
+
Requires-Dist: click>=8.3.0 ; extra == 'cli'
|
50
|
+
Requires-Dist: rich>=14.2.0 ; extra == 'cli'
|
51
|
+
Requires-Dist: tomli>=2.0.0 ; python_full_version < '3.11' and extra == 'cli'
|
52
|
+
Requires-Dist: playa-pdf[crypto]>=0.7.0 ; extra == 'crypto'
|
53
|
+
Requires-Dist: deep-translator>=1.11.4 ; extra == 'document-classification'
|
54
|
+
Requires-Dist: easyocr>=1.7.2 ; python_full_version < '3.14' and extra == 'easyocr'
|
55
|
+
Requires-Dist: keybert>=0.9.0 ; extra == 'entity-extraction'
|
56
|
+
Requires-Dist: spacy>=3.8.7 ; python_full_version < '3.14' and extra == 'entity-extraction'
|
57
|
+
Requires-Dist: gmft>=0.4.2 ; extra == 'gmft'
|
58
|
+
Requires-Dist: transformers>=4.57.0 ; extra == 'gmft'
|
59
|
+
Requires-Dist: fast-langdetect>=1.0.0 ; extra == 'langdetect'
|
60
|
+
Requires-Dist: paddleocr>=3.2.0 ; python_full_version < '3.14' and extra == 'paddleocr'
|
61
|
+
Requires-Dist: paddlepaddle>=3.2.0 ; python_full_version < '3.14' and extra == 'paddleocr'
|
62
|
+
Requires-Dist: setuptools>=80.9.0 ; extra == 'paddleocr'
|
63
|
+
Requires-Python: >=3.10
|
64
|
+
Project-URL: documentation, https://kreuzberg.dev
|
65
|
+
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
46
66
|
Provides-Extra: additional-extensions
|
47
|
-
Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
|
48
|
-
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
|
49
67
|
Provides-Extra: all
|
50
|
-
Requires-Dist: click>=8.2.1; extra == 'all'
|
51
|
-
Requires-Dist: deep-translator>=1.11.4; extra == 'all'
|
52
|
-
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
53
|
-
Requires-Dist: fast-langdetect>=1.0.0; extra == 'all'
|
54
|
-
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
55
|
-
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
56
|
-
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
|
57
|
-
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
58
|
-
Requires-Dist: paddleocr>=3.2.0; extra == 'all'
|
59
|
-
Requires-Dist: paddlepaddle>=3.2.0; extra == 'all'
|
60
|
-
Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
|
61
|
-
Requires-Dist: rich>=14.1.0; extra == 'all'
|
62
|
-
Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'all'
|
63
|
-
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
64
|
-
Requires-Dist: spacy>=3.8.7; extra == 'all'
|
65
|
-
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
66
68
|
Provides-Extra: api
|
67
|
-
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
|
68
69
|
Provides-Extra: chunking
|
69
|
-
Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'chunking'
|
70
70
|
Provides-Extra: cli
|
71
|
-
Requires-Dist: click>=8.2.1; extra == 'cli'
|
72
|
-
Requires-Dist: rich>=14.1.0; extra == 'cli'
|
73
|
-
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
74
71
|
Provides-Extra: crypto
|
75
|
-
Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'crypto'
|
76
72
|
Provides-Extra: document-classification
|
77
|
-
Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
|
78
73
|
Provides-Extra: easyocr
|
79
|
-
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
80
74
|
Provides-Extra: entity-extraction
|
81
|
-
Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
|
82
|
-
Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
|
83
75
|
Provides-Extra: gmft
|
84
|
-
Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
85
76
|
Provides-Extra: langdetect
|
86
|
-
Requires-Dist: fast-langdetect>=1.0.0; extra == 'langdetect'
|
87
77
|
Provides-Extra: paddleocr
|
88
|
-
Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
|
89
|
-
Requires-Dist: paddlepaddle>=3.2.0; extra == 'paddleocr'
|
90
|
-
Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
|
91
78
|
Description-Content-Type: text/markdown
|
92
79
|
|
93
80
|
# Kreuzberg
|
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import base64
|
4
4
|
import io
|
5
|
+
import os
|
5
6
|
import traceback
|
6
7
|
from json import dumps
|
7
8
|
from typing import TYPE_CHECKING, Annotated, Any, Literal
|
@@ -100,6 +101,35 @@ def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError
|
|
100
101
|
)
|
101
102
|
|
102
103
|
|
104
|
+
def _get_max_upload_size() -> int:
|
105
|
+
"""Get the maximum upload size from environment variable.
|
106
|
+
|
107
|
+
Returns:
|
108
|
+
Maximum upload size in bytes. Defaults to 1GB if not set.
|
109
|
+
|
110
|
+
Environment Variables:
|
111
|
+
KREUZBERG_MAX_UPLOAD_SIZE: Maximum upload size in bytes (default: 1073741824 = 1GB)
|
112
|
+
"""
|
113
|
+
default_size = 1024 * 1024 * 1024
|
114
|
+
try:
|
115
|
+
size = int(os.environ.get("KREUZBERG_MAX_UPLOAD_SIZE", default_size))
|
116
|
+
return size if size >= 0 else default_size
|
117
|
+
except ValueError:
|
118
|
+
return default_size
|
119
|
+
|
120
|
+
|
121
|
+
def _is_opentelemetry_enabled() -> bool:
|
122
|
+
"""Check if OpenTelemetry should be enabled.
|
123
|
+
|
124
|
+
Returns:
|
125
|
+
True if OpenTelemetry should be enabled, False otherwise.
|
126
|
+
|
127
|
+
Environment Variables:
|
128
|
+
KREUZBERG_ENABLE_OPENTELEMETRY: Enable OpenTelemetry tracing (true/false) (default: true)
|
129
|
+
"""
|
130
|
+
return os.environ.get("KREUZBERG_ENABLE_OPENTELEMETRY", "true").lower() in ("true", "1", "yes", "on")
|
131
|
+
|
132
|
+
|
103
133
|
def general_exception_handler(request: Request[Any, Any, Any], exception: Exception) -> Response[Any]:
|
104
134
|
error_type = type(exception).__name__
|
105
135
|
error_message = str(exception)
|
@@ -242,7 +272,7 @@ async def handle_files_upload( # noqa: PLR0913
|
|
242
272
|
- Language detection (if enabled)
|
243
273
|
|
244
274
|
Supports various file formats including PDF, Office documents, images, and more.
|
245
|
-
Maximum file size: 1GB per file.
|
275
|
+
Maximum file size: Configurable via KREUZBERG_MAX_UPLOAD_SIZE environment variable (default: 1GB per file).
|
246
276
|
|
247
277
|
Args:
|
248
278
|
request: The HTTP request object
|
@@ -280,6 +310,9 @@ async def handle_files_upload( # noqa: PLR0913
|
|
280
310
|
"""
|
281
311
|
static_config = discover_config_cached()
|
282
312
|
|
313
|
+
if not data:
|
314
|
+
raise ValidationError("No files provided for extraction", context={"file_count": 0})
|
315
|
+
|
283
316
|
min_dims = _create_dimension_tuple(image_ocr_min_width, image_ocr_min_height)
|
284
317
|
max_dims = _create_dimension_tuple(image_ocr_max_width, image_ocr_max_height)
|
285
318
|
|
@@ -379,9 +412,18 @@ type_encoders = {
|
|
379
412
|
Image.Image: _pil_image_encoder,
|
380
413
|
}
|
381
414
|
|
415
|
+
|
416
|
+
def _get_plugins() -> list[Any]:
    """Build the plugin list for the application from environment settings.

    Returns:
        A list holding an OpenTelemetry plugin when tracing is enabled,
        otherwise an empty list.
    """
    if not _is_opentelemetry_enabled():
        return []
    return [OpenTelemetryPlugin(OpenTelemetryConfig())]
|
422
|
+
|
423
|
+
|
382
424
|
app = Litestar(
|
383
425
|
route_handlers=[handle_files_upload, health_check, get_configuration],
|
384
|
-
plugins=
|
426
|
+
plugins=_get_plugins(),
|
385
427
|
logging_config=StructLoggingConfig(),
|
386
428
|
openapi_config=openapi_config,
|
387
429
|
exception_handlers={
|
@@ -389,5 +431,5 @@ app = Litestar(
|
|
389
431
|
Exception: general_exception_handler,
|
390
432
|
},
|
391
433
|
type_encoders=type_encoders,
|
392
|
-
request_max_body_size=
|
434
|
+
request_max_body_size=_get_max_upload_size(),
|
393
435
|
)
|
@@ -2,19 +2,77 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import os
|
4
4
|
import re
|
5
|
+
import shutil
|
5
6
|
import subprocess
|
6
|
-
import sys
|
7
7
|
from functools import lru_cache
|
8
8
|
from itertools import chain
|
9
9
|
from typing import TYPE_CHECKING, Any
|
10
10
|
|
11
|
+
import anyio
|
12
|
+
|
11
13
|
from kreuzberg._types import Entity, SpacyEntityExtractionConfig
|
14
|
+
from kreuzberg._utils._sync import run_sync
|
12
15
|
from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
|
13
16
|
|
14
17
|
if TYPE_CHECKING:
|
15
18
|
from collections.abc import Sequence
|
16
19
|
|
17
20
|
|
21
|
+
def is_uv_available() -> bool:
    """Report whether a ``uv`` executable can be found on the search path."""
    uv_path = shutil.which("uv")
    return uv_path is not None
|
24
|
+
|
25
|
+
|
26
|
+
def get_spacy_model_url(model_name: str, version: str = "3.8.0") -> str:
    """Build the direct wheel download URL for a spaCy model release.

    Args:
        model_name: Name of the spaCy model (e.g., 'en_core_web_sm')
        version: Model version to download (default: 3.8.0)

    Returns:
        URL of the model wheel under the spacy-models GitHub releases.
    """
    # The release tag and the wheel file name share the "<name>-<version>" stem.
    release = f"{model_name}-{version}"
    return f"https://github.com/explosion/spacy-models/releases/download/{release}/{release}-py3-none-any.whl"
|
37
|
+
|
38
|
+
|
39
|
+
async def install_spacy_model_with_uv(model_name: str) -> subprocess.CompletedProcess[str]:
    """Install a spaCy model wheel through ``uv pip install``.

    The command is executed via ``run_sync`` with output captured as text.
    A non-zero exit status does not raise (``check=False``); callers inspect
    the returned process object.

    Args:
        model_name: Name of the spaCy model to install

    Returns:
        Completed process result
    """
    command = ["uv", "pip", "install", get_spacy_model_url(model_name)]
    completed = await run_sync(
        subprocess.run,
        command,
        capture_output=True,
        text=True,
        check=False,
    )
    return completed
|
56
|
+
|
57
|
+
|
58
|
+
async def install_spacy_model_with_spacy(model_name: str) -> bool:
|
59
|
+
"""Install spaCy model using spacy download function.
|
60
|
+
|
61
|
+
Args:
|
62
|
+
model_name: Name of the spaCy model to install
|
63
|
+
|
64
|
+
Returns:
|
65
|
+
True if successful, False otherwise
|
66
|
+
"""
|
67
|
+
try:
|
68
|
+
import spacy.cli.download # noqa: PLC0415
|
69
|
+
|
70
|
+
await run_sync(spacy.cli.download, model_name) # type: ignore[attr-defined]
|
71
|
+
return True
|
72
|
+
except (ImportError, OSError, RuntimeError):
|
73
|
+
return False
|
74
|
+
|
75
|
+
|
18
76
|
def extract_entities(
|
19
77
|
text: str,
|
20
78
|
entity_types: Sequence[str] = ("PERSON", "ORGANIZATION", "LOCATION", "DATE", "EMAIL", "PHONE"),
|
@@ -46,11 +104,11 @@ def extract_entities(
|
|
46
104
|
functionality="Entity Extraction",
|
47
105
|
) from e
|
48
106
|
|
49
|
-
model_name =
|
107
|
+
model_name = select_spacy_model(languages, spacy_config)
|
50
108
|
if not model_name:
|
51
109
|
return entities
|
52
110
|
|
53
|
-
nlp =
|
111
|
+
nlp = load_spacy_model(model_name, spacy_config)
|
54
112
|
|
55
113
|
if len(text) > spacy_config.max_doc_length:
|
56
114
|
text = text[: spacy_config.max_doc_length]
|
@@ -74,7 +132,7 @@ def extract_entities(
|
|
74
132
|
|
75
133
|
|
76
134
|
@lru_cache(maxsize=32)
|
77
|
-
def
|
135
|
+
def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
|
78
136
|
try:
|
79
137
|
import spacy # noqa: PLC0415
|
80
138
|
except ImportError:
|
@@ -86,22 +144,54 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
|
|
86
144
|
try:
|
87
145
|
nlp = spacy.load(model_name)
|
88
146
|
except OSError:
|
89
|
-
result = subprocess.run(
|
90
|
-
[sys.executable, "-m", "spacy", "download", model_name],
|
91
|
-
capture_output=True,
|
92
|
-
text=True,
|
93
|
-
check=False,
|
94
|
-
)
|
95
147
|
|
96
|
-
|
148
|
+
async def install_model() -> tuple[bool, str | None]:
|
149
|
+
"""Install model and return success status and error message."""
|
150
|
+
try:
|
151
|
+
success = await install_spacy_model_with_spacy(model_name)
|
152
|
+
if success:
|
153
|
+
return True, None
|
154
|
+
except (ImportError, OSError, RuntimeError) as e:
|
155
|
+
spacy_error = str(e)
|
156
|
+
else:
|
157
|
+
spacy_error = "spaCy download failed"
|
158
|
+
|
159
|
+
if is_uv_available():
|
160
|
+
try:
|
161
|
+
result = await install_spacy_model_with_uv(model_name)
|
162
|
+
return result.returncode == 0, result.stderr
|
163
|
+
except (OSError, subprocess.SubprocessError) as e:
|
164
|
+
return False, f"spaCy: {spacy_error}, uv: {e!s}"
|
165
|
+
|
166
|
+
return False, spacy_error
|
167
|
+
|
168
|
+
try:
|
169
|
+
success, error_details = anyio.run(install_model)
|
170
|
+
except SystemExit as e:
|
171
|
+
success, error_details = False, f"spaCy CLI exit code: {e.code}"
|
172
|
+
|
173
|
+
if not success:
|
174
|
+
if is_uv_available():
|
175
|
+
model_url = get_spacy_model_url(model_name)
|
176
|
+
manual_install_cmd = f"uv pip install {model_url}"
|
177
|
+
else:
|
178
|
+
manual_install_cmd = f"python -m spacy download {model_name}"
|
179
|
+
|
97
180
|
error_msg = (
|
98
|
-
f"Failed to download spaCy model '{model_name}'. "
|
99
|
-
f"Please install it manually with: python -m spacy download {model_name}"
|
181
|
+
f"Failed to download spaCy model '{model_name}'. Please install it manually with: {manual_install_cmd}"
|
100
182
|
)
|
101
|
-
|
102
|
-
|
183
|
+
|
184
|
+
if error_details:
|
185
|
+
error_msg += f"\nError details: {error_details}"
|
186
|
+
|
103
187
|
raise KreuzbergError(
|
104
|
-
error_msg,
|
188
|
+
error_msg,
|
189
|
+
context={
|
190
|
+
"model": model_name,
|
191
|
+
"manual_install_cmd": manual_install_cmd,
|
192
|
+
"error_details": error_details,
|
193
|
+
"uv_available": is_uv_available(),
|
194
|
+
},
|
105
195
|
) from None
|
106
196
|
|
107
197
|
try:
|
@@ -118,7 +208,7 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
|
|
118
208
|
return nlp
|
119
209
|
|
120
210
|
|
121
|
-
def
|
211
|
+
def select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
|
122
212
|
if not languages:
|
123
213
|
return spacy_config.get_model_for_language("en")
|
124
214
|
|
@@ -140,7 +230,7 @@ def extract_keywords(
|
|
140
230
|
kw_model = KeyBERT()
|
141
231
|
keywords = kw_model.extract_keywords(text, top_n=keyword_count)
|
142
232
|
return [(kw, float(score)) for kw, score in keywords]
|
143
|
-
except
|
233
|
+
except ValueError:
|
144
234
|
return []
|
145
235
|
except ImportError as e: # pragma: no cover
|
146
236
|
raise MissingDependencyError.create_for_package(
|
@@ -0,0 +1,182 @@
|
|
1
|
+
"""Type-safe error handling utilities for extraction pipeline."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import traceback
|
6
|
+
from typing import TYPE_CHECKING, Any
|
7
|
+
|
8
|
+
if TYPE_CHECKING:
|
9
|
+
from collections.abc import Callable
|
10
|
+
|
11
|
+
from kreuzberg._types import ErrorContextType, ExtractionResult, Metadata, ProcessingErrorDict
|
12
|
+
from kreuzberg.exceptions import KreuzbergError, MissingDependencyError, ValidationError
|
13
|
+
|
14
|
+
|
15
|
+
def should_exception_bubble_up(exception: Exception, context: ErrorContextType = "unknown") -> bool:
    """Determine if an exception should bubble up or be handled gracefully.

    Decision table (evaluated top to bottom):

    * System-level errors (SystemExit, KeyboardInterrupt, MemoryError,
      OSError, RuntimeError) and MissingDependencyError always bubble up.
    * In "batch_processing" every remaining exception is handled gracefully.
    * In "optional_feature" ValidationError, KreuzbergError and ImportError
      are handled gracefully; anything else bubbles up.
    * In any other context the exception bubbles up.

    Args:
        exception: The exception to classify
        context: The context where the exception occurred (e.g., "batch_processing", "single_extraction", "optional_feature")

    Returns:
        True if the exception should bubble up, False if it should be handled gracefully
    """
    if isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError)):
        return True

    if isinstance(exception, MissingDependencyError):
        return True

    # Past this point no system-level error is left, so re-testing for those
    # types (or for IOError, which is an alias of OSError) can never match.
    if context == "batch_processing":
        return False

    if context == "optional_feature":
        return not isinstance(exception, (ValidationError, KreuzbergError, ImportError))

    return True
|
44
|
+
|
45
|
+
|
46
|
+
class FeatureProcessingError:
    """Type-safe processing error for extraction features.

    Wraps the exception raised by a feature together with the feature name and
    a formatted traceback, and exposes them through read-only properties.
    """

    def __init__(self, feature: str, error: Exception) -> None:
        """Record the failing feature and the exception it raised.

        Args:
            feature: Name of the feature that failed
            error: The exception raised by the feature
        """
        self._feature = feature
        self._error = error
        # Format the traceback from the exception object itself rather than
        # from the currently handled exception, so the recorded text always
        # belongs to `error`, even when this object is built outside the
        # `except` block that caught it.
        self._traceback = "".join(traceback.format_exception(type(error), error, error.__traceback__))

    @property
    def feature(self) -> str:
        """Name of the feature that failed."""
        return self._feature

    @property
    def error_type(self) -> str:
        """Class name of the wrapped exception."""
        return type(self._error).__name__

    @property
    def error_message(self) -> str:
        """String form of the wrapped exception."""
        return str(self._error)

    @property
    def traceback(self) -> str:
        """Formatted traceback of the wrapped exception."""
        return self._traceback

    def to_dict(self) -> ProcessingErrorDict:
        """Return the error as a plain dictionary suitable for result metadata."""
        return {
            "feature": self.feature,
            "error_type": self.error_type,
            "error_message": self.error_message,
            "traceback": self.traceback,
        }
|
77
|
+
|
78
|
+
|
79
|
+
def safe_feature_execution(
    feature_name: str,
    execution_func: Callable[[], Any],
    default_value: Any,
    result: ExtractionResult,
    context: ErrorContextType = "optional_feature",
) -> Any:
    """Run a feature callable, converting tolerable failures into recorded errors.

    Args:
        feature_name: Name of the feature being executed
        execution_func: Zero-argument callable that performs the feature work
        default_value: Value returned when the failure is handled gracefully
        result: ExtractionResult whose metadata receives the error record
        context: The context used to decide whether a failure must propagate

    Returns:
        The callable's return value, or ``default_value`` after a handled failure.

    Raises:
        Exception: Re-raises the original exception when
            ``should_exception_bubble_up`` says it must propagate.
    """
    try:
        outcome = execution_func()
    except Exception as exc:
        if not should_exception_bubble_up(exc, context):
            # Tolerable failure: note it on the result and fall back.
            _add_processing_error(result, FeatureProcessingError(feature_name, exc))
            return default_value
        raise
    return outcome
|
106
|
+
|
107
|
+
|
108
|
+
def _add_processing_error(result: ExtractionResult, error: FeatureProcessingError) -> None:
|
109
|
+
"""Add a processing error to the result metadata in a type-safe way."""
|
110
|
+
if result.metadata is None:
|
111
|
+
result.metadata = {}
|
112
|
+
|
113
|
+
if "processing_errors" not in result.metadata:
|
114
|
+
result.metadata["processing_errors"] = []
|
115
|
+
|
116
|
+
errors_list = result.metadata["processing_errors"]
|
117
|
+
if isinstance(errors_list, list):
|
118
|
+
errors_list.append(error.to_dict())
|
119
|
+
else:
|
120
|
+
result.metadata["processing_errors"] = [error.to_dict()]
|
121
|
+
|
122
|
+
|
123
|
+
def preserve_result_with_errors(
    result: ExtractionResult,
    errors: list[FeatureProcessingError],
) -> ExtractionResult:
    """Attach optional-feature errors to an otherwise successful result.

    Used when core extraction succeeded but one or more optional features
    failed; the result is updated in place and returned.

    Args:
        result: The successful extraction result
        errors: Errors collected during optional processing

    Returns:
        The same result object, with each error recorded in its metadata
    """
    for failure in errors:
        _add_processing_error(result, failure)
    return result
|
142
|
+
|
143
|
+
|
144
|
+
def create_error_result(
    content: str,
    mime_type: str,
    errors: list[FeatureProcessingError],
    **metadata_kwargs: Any,
) -> ExtractionResult:
    """Build an ExtractionResult that describes a set of processing errors.

    Args:
        content: Error content to include
        mime_type: MIME type of the result
        errors: List of errors that occurred
        **metadata_kwargs: Extra entries merged into the ``error_context`` mapping

    Returns:
        An ExtractionResult whose metadata carries the error summary and whose
        collection fields are all empty
    """
    error_count = len(errors)
    summary = f"Multiple processing errors occurred: {error_count} errors"

    # Keyword extras are merged last, so they win on key collisions.
    error_context = {
        "error_count": error_count,
        "errors": [item.to_dict() for item in errors],
        **metadata_kwargs,
    }

    metadata: Metadata = {
        "error": summary,
        "error_context": error_context,
        "processing_errors": [item.to_dict() for item in errors],
    }

    return ExtractionResult(
        content=content,
        chunks=[],
        mime_type=mime_type,
        metadata=metadata,
        entities=[],
        keywords=[],
        detected_languages=[],
        tables=[],
        images=[],
        image_ocr_results=[],
    )
|
@@ -230,13 +230,13 @@ class Extractor(ABC):
|
|
230
230
|
confidence_score=None,
|
231
231
|
processing_time=duration,
|
232
232
|
)
|
233
|
-
except
|
233
|
+
except ValueError as e: # pragma: no cover
|
234
234
|
return ImageOCRResult(
|
235
235
|
image=target,
|
236
236
|
ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
|
237
237
|
skipped_reason=f"OCR failed: {type(e).__name__}: {e}",
|
238
238
|
)
|
239
|
-
except
|
239
|
+
except TypeError as e: # pragma: no cover
|
240
240
|
return ImageOCRResult(
|
241
241
|
image=target,
|
242
242
|
ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
|