kreuzberg 3.18.0__tar.gz → 3.20.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/PKG-INFO +32 -45
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_api/main.py +4 -2
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_entity_extraction.py +4 -8
- kreuzberg-3.20.1/kreuzberg/_error_handling.py +182 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_extractors/_base.py +2 -2
- kreuzberg-3.20.1/kreuzberg/_extractors/_html.py +138 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_extractors/_pdf.py +33 -54
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_extractors/_structured.py +1 -1
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_language_detection.py +2 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_ocr/_tesseract.py +76 -297
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_types.py +143 -47
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/cli.py +36 -22
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/extraction.py +251 -107
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/pyproject.toml +58 -74
- kreuzberg-3.18.0/.commitlintrc +0 -1
- kreuzberg-3.18.0/.deepsource.toml +0 -54
- kreuzberg-3.18.0/.docker/Dockerfile +0 -79
- kreuzberg-3.18.0/.docker/README.md +0 -190
- kreuzberg-3.18.0/.dockerignore +0 -15
- kreuzberg-3.18.0/.github/dependabot.yaml +0 -6
- kreuzberg-3.18.0/.github/workflows/ci.yaml +0 -381
- kreuzberg-3.18.0/.github/workflows/docker-e2e-tests.yml +0 -150
- kreuzberg-3.18.0/.github/workflows/docs.yml +0 -66
- kreuzberg-3.18.0/.github/workflows/pr-title.yaml +0 -20
- kreuzberg-3.18.0/.github/workflows/publish-docker.yml +0 -163
- kreuzberg-3.18.0/.github/workflows/release.yaml +0 -37
- kreuzberg-3.18.0/.github/workflows/test-docker-builds.yml +0 -101
- kreuzberg-3.18.0/.gitignore +0 -74
- kreuzberg-3.18.0/.markdownlint.yaml +0 -17
- kreuzberg-3.18.0/.pre-commit-config.yaml +0 -82
- kreuzberg-3.18.0/.prettierignore +0 -1
- kreuzberg-3.18.0/ATTRIBUTIONS.md +0 -47
- kreuzberg-3.18.0/LICENSE +0 -7
- kreuzberg-3.18.0/Taskfile.yml +0 -50
- kreuzberg-3.18.0/ai-rulez.yaml +0 -586
- kreuzberg-3.18.0/benchmarks/README.md +0 -264
- kreuzberg-3.18.0/benchmarks/batch_size_benchmark.py +0 -179
- kreuzberg-3.18.0/benchmarks/batch_validation_benchmark.py +0 -83
- kreuzberg-3.18.0/benchmarks/pyproject.toml +0 -29
- kreuzberg-3.18.0/benchmarks/src/__init__.py +0 -1
- kreuzberg-3.18.0/benchmarks/src/__main__.py +0 -4
- kreuzberg-3.18.0/benchmarks/src/benchmarks.py +0 -703
- kreuzberg-3.18.0/benchmarks/src/cli.py +0 -723
- kreuzberg-3.18.0/benchmarks/src/models.py +0 -195
- kreuzberg-3.18.0/benchmarks/src/profiler.py +0 -161
- kreuzberg-3.18.0/benchmarks/src/runner.py +0 -367
- kreuzberg-3.18.0/benchmarks/token_reduction_compression_benchmark.py +0 -268
- kreuzberg-3.18.0/docs/advanced/custom-extractors.md +0 -203
- kreuzberg-3.18.0/docs/advanced/custom-hooks.md +0 -148
- kreuzberg-3.18.0/docs/advanced/error-handling.md +0 -181
- kreuzberg-3.18.0/docs/advanced/index.md +0 -41
- kreuzberg-3.18.0/docs/advanced/performance.md +0 -306
- kreuzberg-3.18.0/docs/api-reference/exceptions.md +0 -33
- kreuzberg-3.18.0/docs/api-reference/extraction-functions.md +0 -59
- kreuzberg-3.18.0/docs/api-reference/extractor-registry.md +0 -5
- kreuzberg-3.18.0/docs/api-reference/index.md +0 -51
- kreuzberg-3.18.0/docs/api-reference/ocr-configuration.md +0 -27
- kreuzberg-3.18.0/docs/api-reference/types.md +0 -120
- kreuzberg-3.18.0/docs/assets/favicon.png +0 -0
- kreuzberg-3.18.0/docs/assets/logo.png +0 -0
- kreuzberg-3.18.0/docs/cli.md +0 -225
- kreuzberg-3.18.0/docs/contributing.md +0 -82
- kreuzberg-3.18.0/docs/css/extra.css +0 -56
- kreuzberg-3.18.0/docs/examples/extraction-examples.md +0 -763
- kreuzberg-3.18.0/docs/examples/index.md +0 -48
- kreuzberg-3.18.0/docs/getting-started/index.md +0 -20
- kreuzberg-3.18.0/docs/getting-started/installation.md +0 -154
- kreuzberg-3.18.0/docs/getting-started/quick-start.md +0 -111
- kreuzberg-3.18.0/docs/index.md +0 -60
- kreuzberg-3.18.0/docs/user-guide/api-server.md +0 -531
- kreuzberg-3.18.0/docs/user-guide/basic-usage.md +0 -161
- kreuzberg-3.18.0/docs/user-guide/chunking.md +0 -124
- kreuzberg-3.18.0/docs/user-guide/docker.md +0 -548
- kreuzberg-3.18.0/docs/user-guide/document-classification.md +0 -61
- kreuzberg-3.18.0/docs/user-guide/extraction-configuration.md +0 -966
- kreuzberg-3.18.0/docs/user-guide/index.md +0 -45
- kreuzberg-3.18.0/docs/user-guide/mcp-server.md +0 -586
- kreuzberg-3.18.0/docs/user-guide/metadata-extraction.md +0 -125
- kreuzberg-3.18.0/docs/user-guide/ocr-backends.md +0 -247
- kreuzberg-3.18.0/docs/user-guide/ocr-configuration.md +0 -414
- kreuzberg-3.18.0/docs/user-guide/supported-formats.md +0 -71
- kreuzberg-3.18.0/docs/user-guide/token-reduction.md +0 -251
- kreuzberg-3.18.0/kreuzberg/_extractors/_html.py +0 -148
- kreuzberg-3.18.0/kreuzberg/_utils/__init__.py +0 -0
- kreuzberg-3.18.0/kreuzberg/_utils/_html_streaming.py +0 -20
- kreuzberg-3.18.0/kreuzberg/py.typed +0 -0
- kreuzberg-3.18.0/mkdocs.yaml +0 -160
- kreuzberg-3.18.0/tests/__init__.py +0 -0
- kreuzberg-3.18.0/tests/api/__init__.py +0 -0
- kreuzberg-3.18.0/tests/api/config_cache_test.py +0 -224
- kreuzberg-3.18.0/tests/api/conftest.py +0 -18
- kreuzberg-3.18.0/tests/api/environment_config_test.py +0 -154
- kreuzberg-3.18.0/tests/api/header_config_hashing_test.py +0 -29
- kreuzberg-3.18.0/tests/api/image_extraction_test.py +0 -59
- kreuzberg-3.18.0/tests/api/main_test.py +0 -817
- kreuzberg-3.18.0/tests/api/runtime_config_test.py +0 -374
- kreuzberg-3.18.0/tests/conftest.py +0 -219
- kreuzberg-3.18.0/tests/core/__init__.py +0 -0
- kreuzberg-3.18.0/tests/core/comprehensive_config_test.py +0 -664
- kreuzberg-3.18.0/tests/core/config_test.py +0 -15
- kreuzberg-3.18.0/tests/core/constants_test.py +0 -22
- kreuzberg-3.18.0/tests/core/dpi_configuration_test.py +0 -319
- kreuzberg-3.18.0/tests/core/exceptions_test.py +0 -159
- kreuzberg-3.18.0/tests/core/extraction_batch_test.py +0 -389
- kreuzberg-3.18.0/tests/core/extraction_test.py +0 -494
- kreuzberg-3.18.0/tests/core/html_to_markdown_config_test.py +0 -0
- kreuzberg-3.18.0/tests/core/image_ocr_result_test.py +0 -27
- kreuzberg-3.18.0/tests/core/init_test.py +0 -85
- kreuzberg-3.18.0/tests/core/main_test.py +0 -35
- kreuzberg-3.18.0/tests/core/mime_types_test.py +0 -242
- kreuzberg-3.18.0/tests/core/registry_test.py +0 -225
- kreuzberg-3.18.0/tests/core/types_test.py +0 -465
- kreuzberg-3.18.0/tests/e2e/__init__.py +0 -0
- kreuzberg-3.18.0/tests/e2e/docker_e2e.py +0 -481
- kreuzberg-3.18.0/tests/extractors/README_image_tests.md +0 -85
- kreuzberg-3.18.0/tests/extractors/__init__.py +0 -0
- kreuzberg-3.18.0/tests/extractors/base_extractor_test.py +0 -420
- kreuzberg-3.18.0/tests/extractors/base_memory_limits_test.py +0 -100
- kreuzberg-3.18.0/tests/extractors/base_ocr_processing_test.py +0 -276
- kreuzberg-3.18.0/tests/extractors/base_ocr_simple_test.py +0 -64
- kreuzberg-3.18.0/tests/extractors/email_error_paths_test.py +0 -39
- kreuzberg-3.18.0/tests/extractors/email_test.py +0 -948
- kreuzberg-3.18.0/tests/extractors/html_invalid_base64_test.py +0 -11
- kreuzberg-3.18.0/tests/extractors/html_test.py +0 -52
- kreuzberg-3.18.0/tests/extractors/image_deduplication_test.py +0 -87
- kreuzberg-3.18.0/tests/extractors/image_error_handling_test.py +0 -253
- kreuzberg-3.18.0/tests/extractors/image_error_simple_test.py +0 -75
- kreuzberg-3.18.0/tests/extractors/image_test.py +0 -766
- kreuzberg-3.18.0/tests/extractors/json_test.py +0 -427
- kreuzberg-3.18.0/tests/extractors/pandoc_metadata_test.py +0 -323
- kreuzberg-3.18.0/tests/extractors/pandoc_test.py +0 -1995
- kreuzberg-3.18.0/tests/extractors/pdf_images_test.py +0 -52
- kreuzberg-3.18.0/tests/extractors/pdf_sync_images_test.py +0 -217
- kreuzberg-3.18.0/tests/extractors/pdf_test.py +0 -979
- kreuzberg-3.18.0/tests/extractors/presentation_test.py +0 -967
- kreuzberg-3.18.0/tests/extractors/spreadsheet_test.py +0 -1140
- kreuzberg-3.18.0/tests/extractors/structured_test.py +0 -304
- kreuzberg-3.18.0/tests/features/__init__.py +0 -0
- kreuzberg-3.18.0/tests/features/chunker_test.py +0 -94
- kreuzberg-3.18.0/tests/features/document_classification_test.py +0 -747
- kreuzberg-3.18.0/tests/features/entity_extraction_test.py +0 -279
- kreuzberg-3.18.0/tests/features/gmft_test.py +0 -1496
- kreuzberg-3.18.0/tests/features/hooks_test.py +0 -0
- kreuzberg-3.18.0/tests/features/language_detection_test.py +0 -343
- kreuzberg-3.18.0/tests/features/table_extraction_test.py +0 -0
- kreuzberg-3.18.0/tests/features/token_reduction_test.py +0 -813
- kreuzberg-3.18.0/tests/integration/__init__.py +0 -0
- kreuzberg-3.18.0/tests/integration/all_extractors_images_test.py +0 -252
- kreuzberg-3.18.0/tests/integration/api/__init__.py +0 -0
- kreuzberg-3.18.0/tests/integration/api/large_file_test.py +0 -0
- kreuzberg-3.18.0/tests/integration/api/mounted_config_test.py +0 -0
- kreuzberg-3.18.0/tests/integration/dpi_integration_test.py +0 -209
- kreuzberg-3.18.0/tests/integration/multiprocessing/__init__.py +0 -0
- kreuzberg-3.18.0/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
- kreuzberg-3.18.0/tests/integration/ocr/__init__.py +0 -0
- kreuzberg-3.18.0/tests/integration/ocr/device_integration_test.py +0 -0
- kreuzberg-3.18.0/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
- kreuzberg-3.18.0/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
- kreuzberg-3.18.0/tests/integration/pandoc_images_test.py +0 -30
- kreuzberg-3.18.0/tests/integration/pdf_images_test.py +0 -18
- kreuzberg-3.18.0/tests/integration/pdf_real_images_test.py +0 -52
- kreuzberg-3.18.0/tests/integration/pptx_complex_test.py +0 -22
- kreuzberg-3.18.0/tests/integration/pptx_images_test.py +0 -18
- kreuzberg-3.18.0/tests/integration/regression_test.py +0 -134
- kreuzberg-3.18.0/tests/integration/token_reduction_integration_test.py +0 -173
- kreuzberg-3.18.0/tests/interfaces/__init__.py +0 -0
- kreuzberg-3.18.0/tests/interfaces/cli_test.py +0 -527
- kreuzberg-3.18.0/tests/interfaces/mcp_server_test.py +0 -1116
- kreuzberg-3.18.0/tests/mcp/__init__.py +0 -0
- kreuzberg-3.18.0/tests/mcp/mcp_server_test.py +0 -0
- kreuzberg-3.18.0/tests/multiprocessing/__init__.py +0 -0
- kreuzberg-3.18.0/tests/multiprocessing/gmft_isolated_test.py +0 -449
- kreuzberg-3.18.0/tests/multiprocessing/process_manager_test.py +0 -273
- kreuzberg-3.18.0/tests/multiprocessing/tesseract_pool_test.py +0 -331
- kreuzberg-3.18.0/tests/ocr/__init__.py +0 -0
- kreuzberg-3.18.0/tests/ocr/base_test.py +0 -80
- kreuzberg-3.18.0/tests/ocr/easyocr_test.py +0 -517
- kreuzberg-3.18.0/tests/ocr/init_test.py +0 -35
- kreuzberg-3.18.0/tests/ocr/paddleocr_test.py +0 -835
- kreuzberg-3.18.0/tests/ocr/tesseract_test.py +0 -1314
- kreuzberg-3.18.0/tests/ocr/tesseract_tsv_test.py +0 -409
- kreuzberg-3.18.0/tests/performance/__init__.py +0 -0
- kreuzberg-3.18.0/tests/performance/large_pdf_perf_test.py +0 -29
- kreuzberg-3.18.0/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
- kreuzberg-3.18.0/tests/test_source_files/contract.txt +0 -1
- kreuzberg-3.18.0/tests/test_source_files/contract_test.txt +0 -4
- kreuzberg-3.18.0/tests/test_source_files/document.docx +0 -0
- kreuzberg-3.18.0/tests/test_source_files/email/sample-email.eml +0 -11
- kreuzberg-3.18.0/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- kreuzberg-3.18.0/tests/test_source_files/excel.xlsx +0 -0
- kreuzberg-3.18.0/tests/test_source_files/flower-no-text.jpg +0 -0
- kreuzberg-3.18.0/tests/test_source_files/form_test.txt +0 -5
- kreuzberg-3.18.0/tests/test_source_files/french-text.txt +0 -2
- kreuzberg-3.18.0/tests/test_source_files/german-text.txt +0 -2
- kreuzberg-3.18.0/tests/test_source_files/google-doc-document.pdf +0 -0
- kreuzberg-3.18.0/tests/test_source_files/html.html +0 -10
- kreuzberg-3.18.0/tests/test_source_files/image-only-german-pdf.pdf +0 -0
- kreuzberg-3.18.0/tests/test_source_files/images/test_hello_world.png +0 -0
- kreuzberg-3.18.0/tests/test_source_files/invoice_image.png +0 -0
- kreuzberg-3.18.0/tests/test_source_files/invoice_test.txt +0 -4
- kreuzberg-3.18.0/tests/test_source_files/json/complex_nested.json +0 -41
- kreuzberg-3.18.0/tests/test_source_files/json/real_world/aws_policy.json +0 -43
- kreuzberg-3.18.0/tests/test_source_files/json/real_world/earthquakes.geojson +0 -6
- kreuzberg-3.18.0/tests/test_source_files/json/real_world/github_emojis.json +0 -111
- kreuzberg-3.18.0/tests/test_source_files/json/real_world/iss_location.json +0 -1
- kreuzberg-3.18.0/tests/test_source_files/json/real_world/openapi_spec.json +0 -84
- kreuzberg-3.18.0/tests/test_source_files/json/real_world/package.json +0 -33
- kreuzberg-3.18.0/tests/test_source_files/json/real_world/rick_morty_character.json +0 -1
- kreuzberg-3.18.0/tests/test_source_files/json/sample-document.json +0 -1
- kreuzberg-3.18.0/tests/test_source_files/json/schema_test.json +0 -25
- kreuzberg-3.18.0/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- kreuzberg-3.18.0/tests/test_source_files/markdown.md +0 -1
- kreuzberg-3.18.0/tests/test_source_files/non-ascii-text.pdf +0 -0
- kreuzberg-3.18.0/tests/test_source_files/non-searchable.pdf +0 -0
- kreuzberg-3.18.0/tests/test_source_files/ocr-image.jpg +0 -0
- kreuzberg-3.18.0/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- kreuzberg-3.18.0/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- kreuzberg-3.18.0/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- kreuzberg-3.18.0/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- kreuzberg-3.18.0/tests/test_source_files/receipt_test.txt +0 -5
- kreuzberg-3.18.0/tests/test_source_files/report_test.txt +0 -4
- kreuzberg-3.18.0/tests/test_source_files/sample-contract.pdf +0 -0
- kreuzberg-3.18.0/tests/test_source_files/scanned.pdf +0 -0
- kreuzberg-3.18.0/tests/test_source_files/searchable.pdf +0 -0
- kreuzberg-3.18.0/tests/test_source_files/sharable-web-guide.pdf +0 -0
- kreuzberg-3.18.0/tests/test_source_files/spanish-text.txt +0 -2
- kreuzberg-3.18.0/tests/test_source_files/tables/borderless_table.png +0 -0
- kreuzberg-3.18.0/tests/test_source_files/tables/complex_document.png +0 -0
- kreuzberg-3.18.0/tests/test_source_files/tables/simple_table.png +0 -0
- kreuzberg-3.18.0/tests/test_source_files/test-article.pdf +0 -0
- kreuzberg-3.18.0/tests/test_source_files/test-excel.xls +0 -0
- kreuzberg-3.18.0/tests/test_source_files/yaml/sample-config.yaml +0 -15
- kreuzberg-3.18.0/tests/utils/__init__.py +0 -0
- kreuzberg-3.18.0/tests/utils/cache_test.py +0 -427
- kreuzberg-3.18.0/tests/utils/device_test.py +0 -347
- kreuzberg-3.18.0/tests/utils/errors_test.py +0 -343
- kreuzberg-3.18.0/tests/utils/ocr_cache_test.py +0 -286
- kreuzberg-3.18.0/tests/utils/pdf_lock_test.py +0 -215
- kreuzberg-3.18.0/tests/utils/playa_helpers_test.py +0 -0
- kreuzberg-3.18.0/tests/utils/playa_metadata_test.py +0 -753
- kreuzberg-3.18.0/tests/utils/playa_test.py +0 -315
- kreuzberg-3.18.0/tests/utils/process_pool_test.py +0 -223
- kreuzberg-3.18.0/tests/utils/quality_test.py +0 -121
- kreuzberg-3.18.0/tests/utils/ref_test.py +0 -90
- kreuzberg-3.18.0/tests/utils/serialization_test.py +0 -379
- kreuzberg-3.18.0/tests/utils/string_test.py +0 -251
- kreuzberg-3.18.0/tests/utils/sync_test.py +0 -259
- kreuzberg-3.18.0/tests/utils/table_test.py +0 -353
- kreuzberg-3.18.0/tests/utils/tmp_test.py +0 -50
- kreuzberg-3.18.0/uv.lock +0 -6208
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/README.md +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/__init__.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.18.0/benchmarks → kreuzberg-3.20.1/kreuzberg/_api}/__init__.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_api/_config_cache.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_config.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_document_classification.py +0 -0
- {kreuzberg-3.18.0/kreuzberg/_api → kreuzberg-3.20.1/kreuzberg/_extractors}/__init__.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_extractors/_email.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_gmft.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_mcp/server.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_ocr/_table_extractor.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/__init__.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/_reducer.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/_stopwords.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/af_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ar_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/bg_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/bn_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/br_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ca_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/cs_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/da_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/de_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/el_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/en_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/eo_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/es_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/et_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/eu_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/fa_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/fi_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/fr_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ga_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/gl_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/gu_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ha_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/he_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/hi_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/hr_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/hu_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/hy_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/id_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/it_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ja_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/kn_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ko_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ku_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/la_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/lt_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/lv_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ml_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/mr_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ms_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ne_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/nl_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/no_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/pl_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/pt_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ro_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ru_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/si_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/sk_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/sl_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/so_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/st_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/sv_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/sw_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ta_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/te_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/th_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/tl_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/tr_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/uk_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/ur_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/vi_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/yo_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/zh_stopwords.json +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_token_reduction/stopwords/zu_stopwords.json +0 -0
- {kreuzberg-3.18.0/kreuzberg/_extractors → kreuzberg-3.20.1/kreuzberg/_utils}/__init__.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_image_preprocessing.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_ocr_cache.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_ref.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_resource_managers.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_serialization.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.18.0 → kreuzberg-3.20.1}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.18.0/benchmarks → kreuzberg-3.20.1/kreuzberg}/py.typed +0 -0
@@ -1,13 +1,11 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.3
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.
|
3
|
+
Version: 3.20.1
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
|
-
|
6
|
-
|
5
|
+
Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
|
6
|
+
Author: Na'aman Hirschfeld
|
7
7
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
8
8
|
License: MIT
|
9
|
-
License-File: LICENSE
|
10
|
-
Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
|
11
9
|
Classifier: Development Status :: 5 - Production/Stable
|
12
10
|
Classifier: Intended Audience :: Developers
|
13
11
|
Classifier: Intended Audience :: Information Technology
|
@@ -27,67 +25,56 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
27
25
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
28
26
|
Classifier: Topic :: Text Processing :: General
|
29
27
|
Classifier: Typing :: Typed
|
30
|
-
Requires-Python: >=3.10
|
31
28
|
Requires-Dist: anyio>=4.11.0
|
32
29
|
Requires-Dist: chardetng-py>=0.3.5
|
33
|
-
Requires-Dist: exceptiongroup>=1.2.2;
|
34
|
-
Requires-Dist: html-to-markdown
|
30
|
+
Requires-Dist: exceptiongroup>=1.2.2 ; python_full_version < '3.11'
|
31
|
+
Requires-Dist: html-to-markdown>=2.1.0
|
35
32
|
Requires-Dist: langcodes>=3.5.0
|
36
|
-
Requires-Dist: mcp>=1.
|
33
|
+
Requires-Dist: mcp>=1.17.0
|
37
34
|
Requires-Dist: msgspec>=0.18.0
|
38
35
|
Requires-Dist: numpy>=2.0.0
|
39
36
|
Requires-Dist: playa-pdf>=0.7.0
|
40
|
-
Requires-Dist: polars>=1.
|
37
|
+
Requires-Dist: polars>=1.34.0
|
41
38
|
Requires-Dist: psutil>=7.1.0
|
42
39
|
Requires-Dist: pypdfium2==4.30.0
|
43
40
|
Requires-Dist: python-calamine>=0.5.3
|
44
41
|
Requires-Dist: python-pptx>=1.0.2
|
45
|
-
Requires-Dist:
|
42
|
+
Requires-Dist: transformers>=4.55.0
|
43
|
+
Requires-Dist: typing-extensions>=4.15.0 ; python_full_version < '3.12'
|
44
|
+
Requires-Dist: mailparse>=1.0.15 ; extra == 'additional-extensions'
|
45
|
+
Requires-Dist: tomli>=2.0.0 ; python_full_version < '3.11' and extra == 'additional-extensions'
|
46
|
+
Requires-Dist: kreuzberg[additional-extensions,api,chunking,cli,crypto,document-classification,easyocr,entity-extraction,gmft,langdetect,paddleocr] ; extra == 'all'
|
47
|
+
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.18.0 ; extra == 'api'
|
48
|
+
Requires-Dist: semantic-text-splitter>=0.28.0 ; extra == 'chunking'
|
49
|
+
Requires-Dist: click>=8.3.0 ; extra == 'cli'
|
50
|
+
Requires-Dist: rich>=14.2.0 ; extra == 'cli'
|
51
|
+
Requires-Dist: tomli>=2.0.0 ; python_full_version < '3.11' and extra == 'cli'
|
52
|
+
Requires-Dist: playa-pdf[crypto]>=0.7.0 ; extra == 'crypto'
|
53
|
+
Requires-Dist: deep-translator>=1.11.4 ; extra == 'document-classification'
|
54
|
+
Requires-Dist: easyocr>=1.7.2 ; python_full_version < '3.14' and extra == 'easyocr'
|
55
|
+
Requires-Dist: keybert>=0.9.0 ; extra == 'entity-extraction'
|
56
|
+
Requires-Dist: spacy>=3.8.7 ; python_full_version < '3.14' and extra == 'entity-extraction'
|
57
|
+
Requires-Dist: gmft>=0.4.2 ; extra == 'gmft'
|
58
|
+
Requires-Dist: transformers>=4.57.0 ; extra == 'gmft'
|
59
|
+
Requires-Dist: fast-langdetect>=1.0.0 ; extra == 'langdetect'
|
60
|
+
Requires-Dist: paddleocr>=3.2.0 ; python_full_version < '3.14' and extra == 'paddleocr'
|
61
|
+
Requires-Dist: paddlepaddle>=3.2.0 ; python_full_version < '3.14' and extra == 'paddleocr'
|
62
|
+
Requires-Dist: setuptools>=80.9.0 ; extra == 'paddleocr'
|
63
|
+
Requires-Python: >=3.10
|
64
|
+
Project-URL: documentation, https://kreuzberg.dev
|
65
|
+
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
46
66
|
Provides-Extra: additional-extensions
|
47
|
-
Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
|
48
|
-
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
|
49
67
|
Provides-Extra: all
|
50
|
-
Requires-Dist: click>=8.2.1; extra == 'all'
|
51
|
-
Requires-Dist: deep-translator>=1.11.4; extra == 'all'
|
52
|
-
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
53
|
-
Requires-Dist: fast-langdetect>=1.0.0; extra == 'all'
|
54
|
-
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
55
|
-
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
56
|
-
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
|
57
|
-
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
58
|
-
Requires-Dist: paddleocr>=3.2.0; extra == 'all'
|
59
|
-
Requires-Dist: paddlepaddle>=3.2.0; extra == 'all'
|
60
|
-
Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
|
61
|
-
Requires-Dist: rich>=14.1.0; extra == 'all'
|
62
|
-
Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'all'
|
63
|
-
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
64
|
-
Requires-Dist: spacy>=3.8.7; extra == 'all'
|
65
|
-
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
66
68
|
Provides-Extra: api
|
67
|
-
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
|
68
69
|
Provides-Extra: chunking
|
69
|
-
Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'chunking'
|
70
70
|
Provides-Extra: cli
|
71
|
-
Requires-Dist: click>=8.2.1; extra == 'cli'
|
72
|
-
Requires-Dist: rich>=14.1.0; extra == 'cli'
|
73
|
-
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
74
71
|
Provides-Extra: crypto
|
75
|
-
Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'crypto'
|
76
72
|
Provides-Extra: document-classification
|
77
|
-
Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
|
78
73
|
Provides-Extra: easyocr
|
79
|
-
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
80
74
|
Provides-Extra: entity-extraction
|
81
|
-
Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
|
82
|
-
Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
|
83
75
|
Provides-Extra: gmft
|
84
|
-
Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
85
76
|
Provides-Extra: langdetect
|
86
|
-
Requires-Dist: fast-langdetect>=1.0.0; extra == 'langdetect'
|
87
77
|
Provides-Extra: paddleocr
|
88
|
-
Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
|
89
|
-
Requires-Dist: paddlepaddle>=3.2.0; extra == 'paddleocr'
|
90
|
-
Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
|
91
78
|
Description-Content-Type: text/markdown
|
92
79
|
|
93
80
|
# Kreuzberg
|
@@ -110,10 +110,9 @@ def _get_max_upload_size() -> int:
|
|
110
110
|
Environment Variables:
|
111
111
|
KREUZBERG_MAX_UPLOAD_SIZE: Maximum upload size in bytes (default: 1073741824 = 1GB)
|
112
112
|
"""
|
113
|
-
default_size = 1024 * 1024 * 1024
|
113
|
+
default_size = 1024 * 1024 * 1024
|
114
114
|
try:
|
115
115
|
size = int(os.environ.get("KREUZBERG_MAX_UPLOAD_SIZE", default_size))
|
116
|
-
# Return default if negative
|
117
116
|
return size if size >= 0 else default_size
|
118
117
|
except ValueError:
|
119
118
|
return default_size
|
@@ -311,6 +310,9 @@ async def handle_files_upload( # noqa: PLR0913
|
|
311
310
|
"""
|
312
311
|
static_config = discover_config_cached()
|
313
312
|
|
313
|
+
if not data:
|
314
|
+
raise ValidationError("No files provided for extraction", context={"file_count": 0})
|
315
|
+
|
314
316
|
min_dims = _create_dimension_tuple(image_ocr_min_width, image_ocr_min_height)
|
315
317
|
max_dims = _create_dimension_tuple(image_ocr_max_width, image_ocr_max_height)
|
316
318
|
|
@@ -144,10 +144,9 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)
|
|
144
144
|
try:
|
145
145
|
nlp = spacy.load(model_name)
|
146
146
|
except OSError:
|
147
|
-
|
147
|
+
|
148
148
|
async def install_model() -> tuple[bool, str | None]:
|
149
149
|
"""Install model and return success status and error message."""
|
150
|
-
# First try spaCy's built-in download
|
151
150
|
try:
|
152
151
|
success = await install_spacy_model_with_spacy(model_name)
|
153
152
|
if success:
|
@@ -157,7 +156,6 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)
|
|
157
156
|
else:
|
158
157
|
spacy_error = "spaCy download failed"
|
159
158
|
|
160
|
-
# If spaCy download failed and uv is available, try uv as fallback
|
161
159
|
if is_uv_available():
|
162
160
|
try:
|
163
161
|
result = await install_spacy_model_with_uv(model_name)
|
@@ -167,14 +165,12 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)
|
|
167
165
|
|
168
166
|
return False, spacy_error
|
169
167
|
|
170
|
-
# Run the async installation in a sync context
|
171
168
|
try:
|
172
169
|
success, error_details = anyio.run(install_model)
|
173
|
-
except
|
174
|
-
success, error_details = False,
|
170
|
+
except SystemExit as e:
|
171
|
+
success, error_details = False, f"spaCy CLI exit code: {e.code}"
|
175
172
|
|
176
173
|
if not success:
|
177
|
-
# Generate appropriate error message based on available tools
|
178
174
|
if is_uv_available():
|
179
175
|
model_url = get_spacy_model_url(model_name)
|
180
176
|
manual_install_cmd = f"uv pip install {model_url}"
|
@@ -234,7 +230,7 @@ def extract_keywords(
|
|
234
230
|
kw_model = KeyBERT()
|
235
231
|
keywords = kw_model.extract_keywords(text, top_n=keyword_count)
|
236
232
|
return [(kw, float(score)) for kw, score in keywords]
|
237
|
-
except
|
233
|
+
except ValueError:
|
238
234
|
return []
|
239
235
|
except ImportError as e: # pragma: no cover
|
240
236
|
raise MissingDependencyError.create_for_package(
|
@@ -0,0 +1,182 @@
|
|
1
|
+
"""Type-safe error handling utilities for extraction pipeline."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import traceback
|
6
|
+
from typing import TYPE_CHECKING, Any
|
7
|
+
|
8
|
+
if TYPE_CHECKING:
|
9
|
+
from collections.abc import Callable
|
10
|
+
|
11
|
+
from kreuzberg._types import ErrorContextType, ExtractionResult, Metadata, ProcessingErrorDict
|
12
|
+
from kreuzberg.exceptions import KreuzbergError, MissingDependencyError, ValidationError
|
13
|
+
|
14
|
+
|
15
|
+
def should_exception_bubble_up(exception: Exception, context: ErrorContextType = "unknown") -> bool:
    """Determine if an exception should bubble up or be handled gracefully.

    The checks run in order and the first match wins:

    1. ``SystemExit``, ``KeyboardInterrupt``, ``MemoryError``, ``OSError`` and
       ``RuntimeError`` (including their subclasses) always bubble up.
    2. ``MissingDependencyError`` always bubbles up.
    3. ``ValidationError`` bubbles up unless the context is
       ``"batch_processing"`` or ``"optional_feature"``.
    4. Any other ``KreuzbergError`` is handled gracefully in the
       ``"optional_feature"`` context.
    5. Every remaining exception is handled gracefully in the
       ``"batch_processing"`` context.
    6. ``ImportError`` is handled gracefully in the ``"optional_feature"``
       context; anything else bubbles up.

    Args:
        exception: The exception to classify
        context: The context where the exception occurred (e.g., "batch_processing", "single_extraction", "optional_feature")

    Returns:
        True if the exception should bubble up, False if it should be handled gracefully
    """
    if isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError)):
        return True

    if isinstance(exception, MissingDependencyError):
        return True

    if isinstance(exception, ValidationError):
        return context not in ("batch_processing", "optional_feature")

    if isinstance(exception, KreuzbergError) and context == "optional_feature":
        return False

    if context == "batch_processing":
        # Critical exception types already returned True above, so nothing
        # that reaches this point bubbles up during batch processing.
        return False

    # IOError is an alias of OSError, which was handled by the first check,
    # so ImportError is the only type left to tolerate for optional features.
    return not (context == "optional_feature" and isinstance(exception, ImportError))
|
44
|
+
|
45
|
+
|
46
|
+
class FeatureProcessingError:
    """Type-safe processing error for extraction features.

    Wraps the exception raised while running a named feature and exposes
    read-only views of it, plus a dictionary form via ``to_dict``.
    """

    def __init__(self, feature: str, error: Exception) -> None:
        """Record the failing feature, the exception and its formatted traceback.

        Args:
            feature: Name of the feature that failed
            error: The exception raised by the feature
        """
        self._feature = feature
        self._error = error
        # Format from the exception's own traceback instead of
        # traceback.format_exc(): the latter reports whichever exception is
        # currently being handled, which yields "NoneType: None" when this
        # object is built outside an ``except`` block.
        self._traceback = "".join(traceback.format_exception(type(error), error, error.__traceback__))

    @property
    def feature(self) -> str:
        """Name of the feature that failed."""
        return self._feature

    @property
    def error_type(self) -> str:
        """Class name of the wrapped exception."""
        return type(self._error).__name__

    @property
    def error_message(self) -> str:
        """String form of the wrapped exception."""
        return str(self._error)

    @property
    def traceback(self) -> str:
        """Formatted traceback of the wrapped exception."""
        return self._traceback

    def to_dict(self) -> ProcessingErrorDict:
        """Return the error as a dictionary with feature, type, message and traceback."""
        return {
            "feature": self.feature,
            "error_type": self.error_type,
            "error_message": self.error_message,
            "traceback": self.traceback,
        }
|
77
|
+
|
78
|
+
|
79
|
+
def safe_feature_execution(
    feature_name: str,
    execution_func: Callable[[], Any],
    default_value: Any,
    result: ExtractionResult,
    context: ErrorContextType = "optional_feature",
) -> Any:
    """Run a feature callable, recording tolerated failures on the result.

    Args:
        feature_name: Label stored with any recorded error
        execution_func: Zero-argument callable that performs the feature work
        default_value: Value returned when the callable fails and the failure is tolerated
        result: ExtractionResult whose metadata receives the error record
        context: Context passed to ``should_exception_bubble_up`` to decide whether to re-raise

    Returns:
        The callable's return value, or ``default_value`` after a tolerated failure

    Raises:
        Exception: Re-raises the original exception when ``should_exception_bubble_up`` returns True
    """
    try:
        outcome = execution_func()
    except Exception as exc:
        if should_exception_bubble_up(exc, context):
            raise

        # Tolerated failure: keep the result usable and note what went wrong.
        failure = FeatureProcessingError(feature_name, exc)
        _add_processing_error(result, failure)
        return default_value

    return outcome
|
106
|
+
|
107
|
+
|
108
|
+
def _add_processing_error(result: ExtractionResult, error: FeatureProcessingError) -> None:
|
109
|
+
"""Add a processing error to the result metadata in a type-safe way."""
|
110
|
+
if result.metadata is None:
|
111
|
+
result.metadata = {}
|
112
|
+
|
113
|
+
if "processing_errors" not in result.metadata:
|
114
|
+
result.metadata["processing_errors"] = []
|
115
|
+
|
116
|
+
errors_list = result.metadata["processing_errors"]
|
117
|
+
if isinstance(errors_list, list):
|
118
|
+
errors_list.append(error.to_dict())
|
119
|
+
else:
|
120
|
+
result.metadata["processing_errors"] = [error.to_dict()]
|
121
|
+
|
122
|
+
|
123
|
+
def preserve_result_with_errors(
    result: ExtractionResult,
    errors: list[FeatureProcessingError],
) -> ExtractionResult:
    """Attach optional-feature failures to an otherwise successful result.

    Intended for the case where core extraction succeeded but optional
    features failed.

    Args:
        result: The extraction result to annotate; it is modified in place
        errors: Failures to record in the result's metadata

    Returns:
        The same ``result`` object, with each error added to its metadata
    """
    for failure in errors:
        _add_processing_error(result, failure)
    return result
|
142
|
+
|
143
|
+
|
144
|
+
def create_error_result(
    content: str,
    mime_type: str,
    errors: list[FeatureProcessingError],
    **metadata_kwargs: Any,
) -> ExtractionResult:
    """Build an ExtractionResult that carries only error information.

    Args:
        content: Text to place in the result's ``content`` field
        mime_type: MIME type to report on the result
        errors: Failures to serialise into the result metadata
        **metadata_kwargs: Extra entries merged into ``metadata["error_context"]``;
            they override ``error_count`` / ``errors`` on a key clash

    Returns:
        An ExtractionResult whose collection fields are all empty and whose
        metadata holds ``error``, ``error_context`` and ``processing_errors``
    """
    summary = f"Multiple processing errors occurred: {len(errors)} errors"

    error_context = {
        "error_count": len(errors),
        "errors": [failure.to_dict() for failure in errors],
    }
    error_context.update(metadata_kwargs)

    # Serialised again on purpose so the two lists do not share dict objects.
    serialised = [failure.to_dict() for failure in errors]

    metadata: Metadata = {
        "error": summary,
        "error_context": error_context,
        "processing_errors": serialised,
    }

    return ExtractionResult(
        content=content,
        chunks=[],
        mime_type=mime_type,
        metadata=metadata,
        entities=[],
        keywords=[],
        detected_languages=[],
        tables=[],
        images=[],
        image_ocr_results=[],
    )
|
@@ -230,13 +230,13 @@ class Extractor(ABC):
|
|
230
230
|
confidence_score=None,
|
231
231
|
processing_time=duration,
|
232
232
|
)
|
233
|
-
except
|
233
|
+
except ValueError as e: # pragma: no cover
|
234
234
|
return ImageOCRResult(
|
235
235
|
image=target,
|
236
236
|
ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
|
237
237
|
skipped_reason=f"OCR failed: {type(e).__name__}: {e}",
|
238
238
|
)
|
239
|
-
except
|
239
|
+
except TypeError as e: # pragma: no cover
|
240
240
|
return ImageOCRResult(
|
241
241
|
image=target,
|
242
242
|
ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
|
@@ -0,0 +1,138 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import logging
|
4
|
+
from typing import TYPE_CHECKING, Any, ClassVar
|
5
|
+
|
6
|
+
from anyio import Path as AsyncPath
|
7
|
+
from html_to_markdown import HtmlToMarkdownError
|
8
|
+
from html_to_markdown._html_to_markdown import (
|
9
|
+
InlineImageConfig,
|
10
|
+
convert_with_inline_images,
|
11
|
+
)
|
12
|
+
from html_to_markdown._html_to_markdown import (
|
13
|
+
convert as rust_convert,
|
14
|
+
)
|
15
|
+
|
16
|
+
from kreuzberg._extractors._base import MAX_SINGLE_IMAGE_SIZE, Extractor
|
17
|
+
from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
|
18
|
+
from kreuzberg._types import ExtractedImage, ExtractionResult, HTMLToMarkdownConfig
|
19
|
+
from kreuzberg._utils._string import safe_decode
|
20
|
+
from kreuzberg._utils._sync import run_maybe_async, run_sync
|
21
|
+
|
22
|
+
if TYPE_CHECKING:
|
23
|
+
from pathlib import Path
|
24
|
+
|
25
|
+
logger = logging.getLogger(__name__)
|
26
|
+
|
27
|
+
|
28
|
+
class HTMLExtractor(Extractor):
    """Extractor that converts HTML to Markdown, optionally capturing inline images.

    Inline images are collected when ``config.extract_images`` is set, and are
    passed through OCR when ``config.ocr_extracted_images`` is also set.
    """

    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {HTML_MIME_TYPE}

    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
        """Convert HTML bytes to Markdown, running the conversion via ``run_sync``.

        OCR is awaited here exactly once; the worker call deliberately skips
        it so the images are not processed twice.
        """
        result = await run_sync(self._extract_without_ocr, content)
        if self._should_ocr_images() and result.images:
            result.image_ocr_results = await self._process_images_with_ocr(result.images)
        return result

    async def extract_path_async(self, path: Path) -> ExtractionResult:
        """Read ``path`` asynchronously and extract its HTML content."""
        content = await AsyncPath(path).read_bytes()
        return await self.extract_bytes_async(content)

    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        """Convert HTML bytes to Markdown, including OCR of inline images when configured."""
        extraction_result = self._convert(content)
        if self._should_ocr_images() and extraction_result.images:
            extraction_result.image_ocr_results = run_maybe_async(
                self._process_images_with_ocr,
                extraction_result.images,
            )
        return self._apply_quality_processing(extraction_result)

    def extract_path_sync(self, path: Path) -> ExtractionResult:
        """Read ``path`` and extract its HTML content synchronously."""
        content = path.read_bytes()
        return self.extract_bytes_sync(content)

    def _should_ocr_images(self) -> bool:
        """Return True when both image extraction and OCR of extracted images are enabled."""
        config = self.config
        return bool(config and config.extract_images and config.ocr_extracted_images)

    def _extract_without_ocr(self, content: bytes) -> ExtractionResult:
        """Convert and quality-process ``content`` without running OCR."""
        return self._apply_quality_processing(self._convert(content))

    def _convert(self, content: bytes) -> ExtractionResult:
        """Convert HTML bytes to a Markdown result and attach any inline images.

        A conversion failure (``HtmlToMarkdownError`` or ``ValueError``) is
        logged and produces a result with empty content rather than raising.
        """
        extraction_config = self.config
        html_content = safe_decode(content)
        if extraction_config and extraction_config.html_to_markdown_config is not None:
            html_config = extraction_config.html_to_markdown_config
        else:
            html_config = HTMLToMarkdownConfig()
        conversion_options, _ = html_config.to_options()

        extract_inline_images = bool(extraction_config and extraction_config.extract_images)
        inline_image_config = None
        if extract_inline_images:
            inline_image_config = InlineImageConfig(
                max_decoded_size_bytes=MAX_SINGLE_IMAGE_SIZE,
                filename_prefix=None,
                capture_svg=True,
                infer_dimensions=True,
            )

        images_payload: list[Any] = []
        warnings: list[Any] = []
        try:
            if extract_inline_images:
                markdown, images_payload, warnings = convert_with_inline_images(
                    html_content,
                    options=conversion_options,
                    image_config=inline_image_config,
                )
            else:
                markdown = rust_convert(
                    html_content,
                    conversion_options,
                )
        except (HtmlToMarkdownError, ValueError) as exc:
            logger.exception("Failed to convert HTML to Markdown: %s", exc)
            markdown = ""
            images_payload = []
            warnings = []

        for warning in warnings:
            self._log_inline_warning(warning)

        extraction_result = ExtractionResult(content=markdown, mime_type=MARKDOWN_MIME_TYPE, metadata={})

        inline_images = [self._build_extracted_image(image) for image in images_payload]
        if inline_images:
            extraction_result.images = inline_images

        return extraction_result

    @staticmethod
    def _build_extracted_image(image: dict[str, Any]) -> ExtractedImage:
        """Build an ExtractedImage from one inline-image payload dict.

        ``data`` and ``format`` are required keys; ``filename``,
        ``description`` and ``dimensions`` are optional.
        """
        dimensions_value = image.get("dimensions")
        dimensions = tuple(dimensions_value) if dimensions_value else None
        return ExtractedImage(
            data=image["data"],
            format=image["format"],
            filename=image.get("filename"),
            description=image.get("description"),
            dimensions=dimensions,
        )

    @staticmethod
    def _log_inline_warning(warning: Any) -> None:
        """Log one inline-image warning, given either as a dict or as an object with attributes."""
        if isinstance(warning, dict):
            index = warning.get("index")
            message = warning.get("message")
        else:
            index = getattr(warning, "index", None)
            message = getattr(warning, "message", None)

        if not message:
            logger.warning("Inline image warning received with no message")
        elif index is not None:
            logger.warning("Inline image %s: %s", index, message)
        else:
            logger.warning("Inline image warning: %s", message)
|
@@ -6,7 +6,6 @@ import logging
|
|
6
6
|
import os
|
7
7
|
import tempfile
|
8
8
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
9
|
-
from dataclasses import asdict
|
10
9
|
from itertools import count
|
11
10
|
from multiprocessing import cpu_count
|
12
11
|
from pathlib import Path
|
@@ -27,14 +26,11 @@ from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
|
|
27
26
|
from kreuzberg._ocr import get_ocr_backend
|
28
27
|
from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
|
29
28
|
from kreuzberg._types import (
|
30
|
-
EasyOCRConfig,
|
31
29
|
ExtractedImage,
|
32
30
|
ExtractionResult,
|
33
31
|
ImageOCRResult,
|
34
32
|
Metadata,
|
35
33
|
OcrBackendType,
|
36
|
-
PaddleOCRConfig,
|
37
|
-
TesseractConfig,
|
38
34
|
)
|
39
35
|
from kreuzberg._utils._errors import create_error_context, should_retry
|
40
36
|
from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
|
@@ -134,48 +130,47 @@ class PDFExtractor(Extractor):
|
|
134
130
|
def extract_path_sync(self, path: Path) -> ExtractionResult:
|
135
131
|
content_bytes = path.read_bytes()
|
136
132
|
|
133
|
+
result: ExtractionResult | None = None
|
134
|
+
|
137
135
|
document: Document | None = None
|
138
136
|
if self.config.extract_images or self.config.extract_tables:
|
139
137
|
document = self._parse_with_password_attempts(content_bytes)
|
140
138
|
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
139
|
+
if not self.config.force_ocr:
|
140
|
+
try:
|
141
|
+
content = self._extract_pdf_searchable_text_sync(path)
|
142
|
+
if self._validate_extracted_text(content):
|
143
|
+
result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
|
144
|
+
except ParsingError:
|
145
|
+
pass
|
145
146
|
|
146
|
-
if
|
147
|
-
|
147
|
+
if not result and self.config.ocr_backend is not None:
|
148
|
+
result = self._extract_pdf_text_with_ocr_sync(path, self.config.ocr_backend)
|
149
|
+
|
150
|
+
if not result:
|
151
|
+
result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
|
152
|
+
|
153
|
+
metadata = self._extract_metadata_with_password_attempts_sync(content_bytes)
|
154
|
+
result.metadata = metadata
|
148
155
|
|
149
|
-
tables = []
|
150
156
|
if self.config.extract_tables:
|
151
157
|
# GMFT is optional dependency ~keep
|
152
158
|
try:
|
153
159
|
from kreuzberg._gmft import extract_tables_sync # noqa: PLC0415
|
154
160
|
|
155
161
|
tables = extract_tables_sync(path)
|
162
|
+
result.tables = tables
|
156
163
|
except ImportError: # pragma: no cover
|
157
|
-
tables = []
|
158
|
-
|
159
|
-
if not self.config.force_ocr and self._validate_extracted_text(text):
|
160
|
-
text = self._extract_with_playa_sync(path, fallback_text=text)
|
161
|
-
|
162
|
-
text = normalize_spaces(text)
|
163
|
-
|
164
|
-
result = ExtractionResult(
|
165
|
-
content=text,
|
166
|
-
mime_type=PLAIN_TEXT_MIME_TYPE,
|
167
|
-
metadata={},
|
168
|
-
tables=list(tables),
|
169
|
-
)
|
164
|
+
result.tables = []
|
170
165
|
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
166
|
+
if result.tables:
|
167
|
+
table_summary = generate_table_summary(result.tables)
|
168
|
+
result.metadata = result.metadata | {
|
169
|
+
"table_count": table_summary["table_count"],
|
170
|
+
"tables_summary": f"Document contains {table_summary['table_count']} tables "
|
171
|
+
f"across {table_summary['pages_with_tables']} pages with "
|
172
|
+
f"{table_summary['total_rows']} total rows",
|
173
|
+
}
|
179
174
|
|
180
175
|
if self.config.extract_images and document:
|
181
176
|
images = self._extract_images_from_playa_sync(document)
|
@@ -405,7 +400,7 @@ class PDFExtractor(Extractor):
|
|
405
400
|
except Exception as e:
|
406
401
|
raise ParsingError(f"Failed to extract PDF text: {e}") from e
|
407
402
|
|
408
|
-
def
|
403
|
+
def _extract_pdf_text_with_ocr_sync(self, path: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
|
409
404
|
temp_files: list[Path] = []
|
410
405
|
try:
|
411
406
|
with pdf_document_sync(path) as pdf:
|
@@ -443,7 +438,8 @@ class PDFExtractor(Extractor):
|
|
443
438
|
with pdf_resources_sync(bitmap, page):
|
444
439
|
pil_image.close()
|
445
440
|
|
446
|
-
|
441
|
+
content = self._process_pdf_images_with_ocr([str(p) for p in temp_files], ocr_backend)
|
442
|
+
return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
|
447
443
|
|
448
444
|
except Exception as e:
|
449
445
|
raise ParsingError(f"Failed to OCR PDF: {e}") from e
|
@@ -452,28 +448,11 @@ class PDFExtractor(Extractor):
|
|
452
448
|
with contextlib.suppress(OSError):
|
453
449
|
p.unlink()
|
454
450
|
|
455
|
-
def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
|
456
|
-
backend = get_ocr_backend(
|
451
|
+
def _process_pdf_images_with_ocr(self, image_paths: list[str], ocr_backend: OcrBackendType) -> str:
|
452
|
+
backend = get_ocr_backend(ocr_backend)
|
457
453
|
paths = [Path(p) for p in image_paths]
|
458
454
|
|
459
|
-
|
460
|
-
case "tesseract":
|
461
|
-
config = (
|
462
|
-
self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
|
463
|
-
)
|
464
|
-
results = backend.process_batch_sync(paths, **asdict(config))
|
465
|
-
case "paddleocr":
|
466
|
-
paddle_config = (
|
467
|
-
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
468
|
-
)
|
469
|
-
results = backend.process_batch_sync(paths, **asdict(paddle_config))
|
470
|
-
case "easyocr":
|
471
|
-
easy_config = (
|
472
|
-
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
473
|
-
)
|
474
|
-
results = backend.process_batch_sync(paths, **asdict(easy_config))
|
475
|
-
case _:
|
476
|
-
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
455
|
+
results = backend.process_batch_sync(paths, **self.config.get_config_dict())
|
477
456
|
|
478
457
|
return "\n\n".join(result.content for result in results)
|
479
458
|
|