kreuzberg 3.15.0__tar.gz → 3.16.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.github/workflows/ci.yaml +1 -1
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.pre-commit-config.yaml +1 -1
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/PKG-INFO +12 -11
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/README.md +10 -9
- kreuzberg-3.16.0/Taskfile.yml +50 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/api-reference/types.md +12 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/examples/extraction-examples.md +83 -1
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/extraction-configuration.md +68 -1
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/metadata-extraction.md +51 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/supported-formats.md +14 -1
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/__init__.py +4 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_api/main.py +0 -53
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_config.py +11 -1
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_document_classification.py +1 -1
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/_email.py +16 -10
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/_html.py +39 -12
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/_pdf.py +2 -3
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/_presentation.py +4 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/_spread_sheet.py +0 -1
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/_structured.py +83 -15
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_gmft.py +5 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_mcp/server.py +0 -21
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_ocr/_easyocr.py +51 -19
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_ocr/_tesseract.py +14 -3
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_types.py +111 -40
- kreuzberg-3.16.0/kreuzberg/_utils/_html_streaming.py +20 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_serialization.py +13 -6
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_sync.py +15 -16
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/extraction.py +2 -2
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/pyproject.toml +3 -3
- kreuzberg-3.16.0/tests/api/config_cache_test.py +248 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/api/image_extraction_test.py +4 -1
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/api/main_test.py +7 -7
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/api/runtime_config_test.py +4 -1
- kreuzberg-3.16.0/tests/core/comprehensive_config_test.py +603 -0
- kreuzberg-3.16.0/tests/core/constants_test.py +22 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/core/dpi_configuration_test.py +19 -0
- kreuzberg-3.16.0/tests/core/exceptions_test.py +159 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/core/extraction_batch_test.py +8 -65
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/core/extraction_test.py +75 -38
- kreuzberg-3.16.0/tests/core/init_test.py +85 -0
- kreuzberg-3.16.0/tests/core/main_test.py +35 -0
- kreuzberg-3.16.0/tests/core/mime_types_test.py +242 -0
- kreuzberg-3.16.0/tests/core/registry_test.py +225 -0
- kreuzberg-3.16.0/tests/core/types_test.py +403 -0
- kreuzberg-3.16.0/tests/extractors/base_extractor_test.py +420 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/base_ocr_processing_test.py +6 -18
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/email_test.py +1 -1
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/image_error_handling_test.py +5 -3
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/image_test.py +2 -19
- kreuzberg-3.16.0/tests/extractors/json_test.py +427 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/pandoc_test.py +27 -29
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/pdf_test.py +12 -7
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/spreadsheet_test.py +17 -13
- kreuzberg-3.16.0/tests/features/chunker_test.py +94 -0
- kreuzberg-3.16.0/tests/features/document_classification_test.py +747 -0
- kreuzberg-3.16.0/tests/features/entity_extraction_test.py +348 -0
- kreuzberg-3.16.0/tests/features/gmft_test.py +1496 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/features/language_detection_test.py +6 -34
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/all_extractors_images_test.py +45 -24
- kreuzberg-3.16.0/tests/interfaces/cli_test.py +527 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/interfaces/mcp_server_test.py +44 -203
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/multiprocessing/gmft_isolated_test.py +1 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/ocr/easyocr_test.py +6 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/ocr/paddleocr_test.py +1 -0
- kreuzberg-3.16.0/tests/test_source_files/json/complex_nested.json +41 -0
- kreuzberg-3.16.0/tests/test_source_files/json/real_world/aws_policy.json +43 -0
- kreuzberg-3.16.0/tests/test_source_files/json/real_world/earthquakes.geojson +6 -0
- kreuzberg-3.16.0/tests/test_source_files/json/real_world/github_emojis.json +111 -0
- kreuzberg-3.16.0/tests/test_source_files/json/real_world/iss_location.json +1 -0
- kreuzberg-3.16.0/tests/test_source_files/json/real_world/openapi_spec.json +84 -0
- kreuzberg-3.16.0/tests/test_source_files/json/real_world/package.json +33 -0
- kreuzberg-3.16.0/tests/test_source_files/json/real_world/rick_morty_character.json +1 -0
- kreuzberg-3.16.0/tests/test_source_files/json/schema_test.json +25 -0
- kreuzberg-3.16.0/tests/utils/playa_metadata_test.py +753 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/playa_test.py +68 -17
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/process_pool_test.py +1 -1
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/serialization_test.py +82 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/uv.lock +23 -23
- kreuzberg-3.15.0/Taskfile.yml +0 -161
- kreuzberg-3.15.0/tests/core/exceptions_test.py +0 -0
- kreuzberg-3.15.0/tests/core/mime_types_test.py +0 -0
- kreuzberg-3.15.0/tests/core/registry_test.py +0 -0
- kreuzberg-3.15.0/tests/core/types_test.py +0 -23
- kreuzberg-3.15.0/tests/features/chunker_test.py +0 -0
- kreuzberg-3.15.0/tests/features/document_classification_test.py +0 -0
- kreuzberg-3.15.0/tests/features/entity_extraction_test.py +0 -0
- kreuzberg-3.15.0/tests/features/gmft_test.py +0 -528
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.commitlintrc +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.deepsource.toml +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.docker/Dockerfile +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.docker/README.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.dockerignore +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.github/dependabot.yaml +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.github/workflows/docker-e2e-tests.yml +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.github/workflows/docs.yml +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.github/workflows/pr-title.yaml +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.github/workflows/publish-docker.yml +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.github/workflows/release.yaml +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.github/workflows/test-docker-builds.yml +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.gitignore +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/.markdownlint.yaml +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/LICENSE +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/ai-rulez.yaml +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/README.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/batch_size_benchmark.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/batch_validation_benchmark.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/py.typed +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/pyproject.toml +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/src/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/src/__main__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/src/benchmarks.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/src/cli.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/src/models.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/src/profiler.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/benchmarks/src/runner.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docker-logs/docker-info.txt +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docker-logs/docker-version.txt +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/advanced/custom-extractors.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/advanced/custom-hooks.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/advanced/error-handling.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/advanced/index.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/advanced/performance.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/api-reference/exceptions.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/api-reference/extraction-functions.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/api-reference/extractor-registry.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/api-reference/index.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/api-reference/ocr-configuration.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/assets/favicon.png +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/assets/logo.png +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/cli.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/contributing.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/css/extra.css +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/examples/index.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/getting-started/index.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/getting-started/installation.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/getting-started/quick-start.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/index.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/api-server.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/basic-usage.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/chunking.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/docker.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/document-classification.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/index.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/mcp-server.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/ocr-backends.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/docs/user-guide/ocr-configuration.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/__main__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_api/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_api/_config_cache.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_entity_extraction.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_language_detection.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_mcp/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_ocr/_paddleocr.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_ocr/_table_extractor.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_cache.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_device.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_document_cache.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_errors.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_image_preprocessing.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_ocr_cache.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_pdf_lock.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_process_pool.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_quality.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_ref.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_resource_managers.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_table.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/cli.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/mkdocs.yaml +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/api/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/api/conftest.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/api/header_config_hashing_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/conftest.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/core/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/core/config_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/core/html_to_markdown_config_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/core/image_ocr_result_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/e2e/__init__.py +0 -0
- /kreuzberg-3.15.0/tests/e2e/docker_e2e_test.py → /kreuzberg-3.16.0/tests/e2e/docker_e2e.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/README_image_tests.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/base_memory_limits_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/base_ocr_simple_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/email_error_paths_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/html_invalid_base64_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/html_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/image_deduplication_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/image_error_simple_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/pandoc_metadata_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/pdf_images_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/pdf_sync_images_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/presentation_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/extractors/structured_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/features/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/features/hooks_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/api/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/api/large_file_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/api/mounted_config_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/dpi_integration_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/multiprocessing/gmft_integration_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/ocr/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/ocr/device_integration_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/ocr/tesseract_sync_formats_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/ocr/tesseract_tsv_integration_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/pandoc_images_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/pdf_images_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/pdf_real_images_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/pptx_complex_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/pptx_images_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/integration/regression_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/interfaces/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/mcp/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/mcp/mcp_server_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/multiprocessing/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/multiprocessing/process_manager_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/multiprocessing/tesseract_pool_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/ocr/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/ocr/base_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/ocr/init_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/ocr/tesseract_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/ocr/tesseract_tsv_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/performance/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/performance/large_pdf_perf_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/Xerox_AltaLink_series_mfp_sag_en-US 2.pdf +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/contract.txt +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/contract_test.txt +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/document.docx +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/email/sample-email.eml +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/excel-multi-sheet.xlsx +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/excel.xlsx +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/flower-no-text.jpg +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/form_test.txt +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/french-text.txt +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/german-text.txt +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/google-doc-document.pdf +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/html.html +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/images/test_hello_world.png +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/invoice_image.png +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/invoice_test.txt +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/json/sample-document.json +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/layout-parser-ocr.jpg +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/markdown.md +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/non-ascii-text.pdf +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/non-searchable.pdf +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/ocr-image.jpg +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/pdfs_with_tables/large.pdf +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/pdfs_with_tables/medium.pdf +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/pdfs_with_tables/tiny.pdf +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/pitch-deck-presentation.pptx +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/receipt_test.txt +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/report_test.txt +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/sample-contract.pdf +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/scanned.pdf +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/searchable.pdf +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/sharable-web-guide.pdf +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/spanish-text.txt +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/tables/borderless_table.png +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/tables/complex_document.png +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/tables/simple_table.png +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/test-article.pdf +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/test-excel.xls +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/test_source_files/yaml/sample-config.yaml +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/__init__.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/cache_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/device_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/errors_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/ocr_cache_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/pdf_lock_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/playa_helpers_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/quality_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/ref_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/string_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/sync_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/table_test.py +0 -0
- {kreuzberg-3.15.0 → kreuzberg-3.16.0}/tests/utils/tmp_test.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.15.0
|
3
|
+
Version: 3.16.0
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -31,7 +31,7 @@ Requires-Python: >=3.10
|
|
31
31
|
Requires-Dist: anyio>=4.10.0
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
|
-
Requires-Dist: html-to-markdown[lxml]>=1.
|
34
|
+
Requires-Dist: html-to-markdown[lxml]>=1.13.0
|
35
35
|
Requires-Dist: mcp>=1.14.0
|
36
36
|
Requires-Dist: msgspec>=0.18.0
|
37
37
|
Requires-Dist: numpy>=2.0.0
|
@@ -109,7 +109,7 @@ Description-Content-Type: text/markdown
|
|
109
109
|
- **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
|
110
110
|
- **Image Extraction**: Extract embedded images from PDFs, presentations, HTML, and Office documents with optional OCR
|
111
111
|
- **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
|
112
|
-
- **Format Support**:
|
112
|
+
- **Format Support**: 21 document types including PDF, Microsoft Office, images, HTML, and structured data formats
|
113
113
|
- **OCR Integration**: Tesseract OCR with markdown output (default) and table extraction from scanned documents
|
114
114
|
- **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
|
115
115
|
|
@@ -227,14 +227,15 @@ claude mcp add kreuzberg uvx kreuzberg-mcp
|
|
227
227
|
|
228
228
|
## Supported Formats
|
229
229
|
|
230
|
-
| Category
|
231
|
-
|
|
232
|
-
| **Documents**
|
233
|
-
| **Images**
|
234
|
-
| **Spreadsheets**
|
235
|
-
| **Presentations**
|
236
|
-
| **Web**
|
237
|
-
| **
|
230
|
+
| Category | Formats |
|
231
|
+
| ------------------- | ------------------------------ |
|
232
|
+
| **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
|
233
|
+
| **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
|
234
|
+
| **Spreadsheets** | XLSX, XLS, CSV, ODS |
|
235
|
+
| **Presentations** | PPTX, PPT, ODP |
|
236
|
+
| **Web** | HTML, XML, MHTML |
|
237
|
+
| **Structured Data** | JSON, YAML, TOML |
|
238
|
+
| **Archives** | Support via extraction |
|
238
239
|
|
239
240
|
## 📊 Performance Characteristics
|
240
241
|
|
@@ -18,7 +18,7 @@
|
|
18
18
|
- **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
|
19
19
|
- **Image Extraction**: Extract embedded images from PDFs, presentations, HTML, and Office documents with optional OCR
|
20
20
|
- **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
|
21
|
-
- **Format Support**:
|
21
|
+
- **Format Support**: 21 document types including PDF, Microsoft Office, images, HTML, and structured data formats
|
22
22
|
- **OCR Integration**: Tesseract OCR with markdown output (default) and table extraction from scanned documents
|
23
23
|
- **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
|
24
24
|
|
@@ -136,14 +136,15 @@ claude mcp add kreuzberg uvx kreuzberg-mcp
|
|
136
136
|
|
137
137
|
## Supported Formats
|
138
138
|
|
139
|
-
| Category
|
140
|
-
|
|
141
|
-
| **Documents**
|
142
|
-
| **Images**
|
143
|
-
| **Spreadsheets**
|
144
|
-
| **Presentations**
|
145
|
-
| **Web**
|
146
|
-
| **
|
139
|
+
| Category | Formats |
|
140
|
+
| ------------------- | ------------------------------ |
|
141
|
+
| **Documents** | PDF, DOCX, DOC, RTF, TXT, EPUB |
|
142
|
+
| **Images** | JPG, PNG, TIFF, BMP, GIF, WEBP |
|
143
|
+
| **Spreadsheets** | XLSX, XLS, CSV, ODS |
|
144
|
+
| **Presentations** | PPTX, PPT, ODP |
|
145
|
+
| **Web** | HTML, XML, MHTML |
|
146
|
+
| **Structured Data** | JSON, YAML, TOML |
|
147
|
+
| **Archives** | Support via extraction |
|
147
148
|
|
148
149
|
## 📊 Performance Characteristics
|
149
150
|
|
@@ -0,0 +1,50 @@
|
|
1
|
+
version: "3"
|
2
|
+
|
3
|
+
env:
|
4
|
+
DOCKER_BUILDKIT: 1
|
5
|
+
BUILDKIT_PROGRESS: plain
|
6
|
+
|
7
|
+
tasks:
|
8
|
+
setup:
|
9
|
+
desc: "Install dependencies with uv"
|
10
|
+
cmds:
|
11
|
+
- uv sync --all-extras --all-packages
|
12
|
+
- pre-commit install && pre-commit install --hook-type commit-msg
|
13
|
+
|
14
|
+
update:
|
15
|
+
desc: "Update the dependencies"
|
16
|
+
cmds:
|
17
|
+
- uv run uv-bump
|
18
|
+
- cd benchmarks && uv run uv-bump && cd -
|
19
|
+
- uv sync --all-extras --all-packages --upgrade
|
20
|
+
- pre-commit autoupdate
|
21
|
+
|
22
|
+
test:
|
23
|
+
desc: "Run tests with pytest"
|
24
|
+
cmds:
|
25
|
+
- uv run pytest
|
26
|
+
|
27
|
+
test:cov:
|
28
|
+
desc: "Run tests with coverage"
|
29
|
+
cmds:
|
30
|
+
- uv run pytest --cov
|
31
|
+
|
32
|
+
lint:
|
33
|
+
desc: "Lint code with ruff and docs with markdownlint"
|
34
|
+
cmds:
|
35
|
+
- pre-commit run --all-files
|
36
|
+
|
37
|
+
docs:build:
|
38
|
+
desc: "Build documentation"
|
39
|
+
cmds:
|
40
|
+
- uv run mkdocs build --clean --strict
|
41
|
+
|
42
|
+
docs:serve:
|
43
|
+
desc: "Serve documentation locally"
|
44
|
+
cmds:
|
45
|
+
- uv run mkdocs serve
|
46
|
+
|
47
|
+
default:
|
48
|
+
desc: "Show available tasks"
|
49
|
+
cmds:
|
50
|
+
- task --list
|
@@ -72,6 +72,18 @@ Configuration options for automatic language detection:
|
|
72
72
|
|
73
73
|
::: kreuzberg.LanguageDetectionConfig
|
74
74
|
|
75
|
+
## JSON Extraction Configuration
|
76
|
+
|
77
|
+
Configuration for enhanced JSON document processing:
|
78
|
+
|
79
|
+
::: kreuzberg.JSONExtractionConfig
|
80
|
+
|
81
|
+
## HTML to Markdown Configuration
|
82
|
+
|
83
|
+
Configuration options for converting HTML content to Markdown:
|
84
|
+
|
85
|
+
::: kreuzberg.HTMLToMarkdownConfig
|
86
|
+
|
75
87
|
## PSMMode (Page Segmentation Mode)
|
76
88
|
|
77
89
|
::: kreuzberg.PSMMode
|
@@ -525,13 +525,95 @@ async def comprehensive_extraction():
|
|
525
525
|
print(f"Total text (including OCR): {len(all_text)} characters")
|
526
526
|
```
|
527
527
|
|
528
|
+
## JSON and Structured Data Extraction
|
529
|
+
|
530
|
+
### Basic JSON Extraction
|
531
|
+
|
532
|
+
```python
|
533
|
+
from kreuzberg import extract_file_sync
|
534
|
+
|
535
|
+
# Simple JSON extraction
|
536
|
+
result = extract_file_sync("data.json")
|
537
|
+
print(result.content)
|
538
|
+
|
539
|
+
# Metadata includes detected text fields
|
540
|
+
print(f"Title: {result.metadata.get('title')}")
|
541
|
+
print(f"Description: {result.metadata.get('description')}")
|
542
|
+
```
|
543
|
+
|
544
|
+
### Advanced JSON with Schema Extraction
|
545
|
+
|
546
|
+
```python
|
547
|
+
from kreuzberg import extract_file_sync, ExtractionConfig, JSONExtractionConfig
|
548
|
+
|
549
|
+
# Configure advanced JSON extraction
|
550
|
+
json_config = JSONExtractionConfig(
|
551
|
+
extract_schema=True, # Extract JSON structure
|
552
|
+
custom_text_field_patterns=frozenset({"summary", "abstract"}), # Custom fields
|
553
|
+
include_type_info=True, # Add type annotations
|
554
|
+
flatten_nested_objects=True, # Flatten nested structures
|
555
|
+
max_depth=5, # Limit schema depth
|
556
|
+
array_item_limit=100, # Limit array processing
|
557
|
+
)
|
558
|
+
|
559
|
+
config = ExtractionConfig(json_config=json_config)
|
560
|
+
result = extract_file_sync("complex.json", config=config)
|
561
|
+
|
562
|
+
# Access schema information
|
563
|
+
if "json_schema" in result.metadata:
|
564
|
+
schema = result.metadata["json_schema"]
|
565
|
+
print(f"Root type: {schema['type']}")
|
566
|
+
print(f"Properties: {list(schema.get('properties', {}).keys())}")
|
567
|
+
|
568
|
+
# Access nested attributes with dotted notation
|
569
|
+
if "attributes" in result.metadata:
|
570
|
+
attrs = result.metadata["attributes"]
|
571
|
+
# Nested fields like {"info": {"title": "Example"}} become "info.title"
|
572
|
+
print(f"Nested title: {attrs.get('info.title')}")
|
573
|
+
```
|
574
|
+
|
575
|
+
### YAML and TOML Processing
|
576
|
+
|
577
|
+
```python
|
578
|
+
from kreuzberg import extract_file_sync
|
579
|
+
|
580
|
+
# YAML extraction (similar to JSON)
|
581
|
+
yaml_result = extract_file_sync("config.yaml")
|
582
|
+
print(yaml_result.content)
|
583
|
+
|
584
|
+
# TOML extraction
|
585
|
+
toml_result = extract_file_sync("pyproject.toml")
|
586
|
+
print(toml_result.content)
|
587
|
+
|
588
|
+
# Both formats support the same metadata extraction as JSON
|
589
|
+
print(f"Package name: {toml_result.metadata.get('name')}")
|
590
|
+
```
|
591
|
+
|
592
|
+
### Working with API Responses
|
593
|
+
|
594
|
+
```python
|
595
|
+
import httpx
|
596
|
+
from kreuzberg import extract_bytes_sync, ExtractionConfig, JSONExtractionConfig
|
597
|
+
|
598
|
+
# Fetch JSON from API
|
599
|
+
response = httpx.get("https://api.example.com/data")
|
600
|
+
|
601
|
+
# Extract with schema
|
602
|
+
config = ExtractionConfig(json_config=JSONExtractionConfig(extract_schema=True))
|
603
|
+
|
604
|
+
result = extract_bytes_sync(response.content, mime_type="application/json", config=config)
|
605
|
+
|
606
|
+
print(f"API Response: {result.content}")
|
607
|
+
print(f"Schema: {result.metadata.get('json_schema')}")
|
608
|
+
```
|
609
|
+
|
528
610
|
## Batch Processing
|
529
611
|
|
530
612
|
```python
|
531
613
|
from kreuzberg import batch_extract_file, ExtractionConfig
|
532
614
|
|
533
615
|
async def process_documents():
|
534
|
-
file_paths = ["document1.pdf", "document2.docx", "image.jpg"]
|
616
|
+
file_paths = ["document1.pdf", "document2.docx", "data.json", "image.jpg"]
|
535
617
|
config = ExtractionConfig() # Optional: configure extraction options
|
536
618
|
results = await batch_extract_file(file_paths, config=config)
|
537
619
|
|
@@ -94,6 +94,14 @@ strong_em_symbol = "_"
|
|
94
94
|
escape_underscores = false
|
95
95
|
wrap = true
|
96
96
|
wrap_width = 100
|
97
|
+
list_indent_width = 2 # Use 2 spaces for Discord/Slack compatibility
|
98
|
+
list_indent_type = "spaces" # Use spaces instead of tabs
|
99
|
+
whitespace_mode = "normalized" # Handle whitespace intelligently
|
100
|
+
br_in_tables = false # Use spaces instead of <br> in tables
|
101
|
+
highlight_style = "double-equal" # Style for highlighted text
|
102
|
+
newline_style = "spaces" # Style for line breaks
|
103
|
+
preprocess_html = true # Clean messy HTML before conversion
|
104
|
+
preprocessing_preset = "standard" # Level of HTML cleaning
|
97
105
|
```
|
98
106
|
|
99
107
|
### pyproject.toml Example
|
@@ -623,6 +631,58 @@ For better performance in production:
|
|
623
631
|
- Enable deduplication to avoid redundant processing
|
624
632
|
- Use selective extraction based on document types
|
625
633
|
|
634
|
+
### JSON Extraction Configuration
|
635
|
+
|
636
|
+
Kreuzberg provides enhanced JSON document processing with schema extraction and customizable field detection:
|
637
|
+
|
638
|
+
```python
|
639
|
+
from kreuzberg import extract_file, ExtractionConfig, JSONExtractionConfig
|
640
|
+
|
641
|
+
# Advanced JSON extraction with schema
|
642
|
+
result = await extract_file(
|
643
|
+
"data.json",
|
644
|
+
config=ExtractionConfig(
|
645
|
+
json_config=JSONExtractionConfig(
|
646
|
+
extract_schema=True, # Extract JSON structure schema
|
647
|
+
include_type_info=True, # Add type annotations to output
|
648
|
+
flatten_nested_objects=True, # Flatten nested objects in output
|
649
|
+
custom_text_field_patterns=frozenset({"summary", "abstract"}), # Additional text fields
|
650
|
+
max_depth=10, # Maximum nesting depth for schema
|
651
|
+
array_item_limit=1000, # Limit array processing for performance
|
652
|
+
)
|
653
|
+
),
|
654
|
+
)
|
655
|
+
|
656
|
+
# Access schema and nested attributes
|
657
|
+
if result.metadata.get("json_schema"):
|
658
|
+
print(f"JSON Schema: {result.metadata['json_schema']}")
|
659
|
+
if result.metadata.get("attributes"):
|
660
|
+
print(f"Nested fields: {result.metadata['attributes']}")
|
661
|
+
```
|
662
|
+
|
663
|
+
#### Configuration File Support
|
664
|
+
|
665
|
+
Add JSON configuration to your `kreuzberg.toml`:
|
666
|
+
|
667
|
+
```toml
|
668
|
+
[json_config]
|
669
|
+
extract_schema = true # Extract JSON structure schema
|
670
|
+
include_type_info = false # Add type annotations to output
|
671
|
+
flatten_nested_objects = true # Flatten nested objects in output
|
672
|
+
custom_text_field_patterns = ["summary", "abstract"] # Additional text fields to extract
|
673
|
+
max_depth = 10 # Maximum nesting depth for schema extraction
|
674
|
+
array_item_limit = 1000 # Limit array processing for performance
|
675
|
+
```
|
676
|
+
|
677
|
+
#### Key Features
|
678
|
+
|
679
|
+
- **High Performance**: Uses msgspec for fast JSON parsing, significantly faster than the standard library
|
680
|
+
- **Schema Extraction**: Automatically extracts the structure of your JSON data, useful for understanding complex documents
|
681
|
+
- **Custom Field Detection**: Configure additional text fields beyond defaults (title, name, description, content, body, text, message)
|
682
|
+
- **Type Information**: Optionally include data type annotations in extracted content for better understanding
|
683
|
+
- **Nested Object Control**: Choose between flattened or hierarchical output based on your needs
|
684
|
+
- **Memory Protection**: Array item limits prevent memory issues with large datasets
|
685
|
+
|
626
686
|
### Entity and Keyword Extraction
|
627
687
|
|
628
688
|
Kreuzberg can extract named entities and keywords from documents using spaCy for entity recognition and KeyBERT for keyword extraction:
|
@@ -833,7 +893,14 @@ html_config = HTMLToMarkdownConfig(
|
|
833
893
|
escape_underscores=False,
|
834
894
|
wrap=True,
|
835
895
|
wrap_width=100,
|
836
|
-
|
896
|
+
list_indent_width=2, # Discord/Slack compatible spacing
|
897
|
+
list_indent_type="spaces", # Use spaces for indentation
|
898
|
+
whitespace_mode="normalized", # Smart whitespace handling
|
899
|
+
br_in_tables=False, # Use spaces in table cells
|
900
|
+
highlight_style="double-equal", # ==highlighted== text style
|
901
|
+
newline_style="spaces", # Line break style
|
902
|
+
preprocess_html=True, # Clean HTML before conversion
|
903
|
+
preprocessing_preset="standard", # HTML cleaning level
|
837
904
|
)
|
838
905
|
|
839
906
|
result = await extract_file(
|
@@ -49,6 +49,54 @@ For PDF documents, Kreuzberg extracts a rich set of metadata including:
|
|
49
49
|
|
50
50
|
If a PDF document contains UTF-16BE encoded strings (often present in PDF metadata with a byte order mark `\xfe\xff`), Kreuzberg will automatically detect and decode these properly.
|
51
51
|
|
52
|
+
## Structured Data Metadata
|
53
|
+
|
54
|
+
For JSON, YAML, and TOML files, Kreuzberg provides specialized metadata extraction:
|
55
|
+
|
56
|
+
### Text Field Detection
|
57
|
+
|
58
|
+
Kreuzberg automatically identifies and extracts common text fields:
|
59
|
+
|
60
|
+
- **Default fields**: `title`, `name`, `description`, `content`, `body`, `text`, `message`
|
61
|
+
- **Custom fields**: Configure additional patterns via `JSONExtractionConfig`
|
62
|
+
|
63
|
+
### Nested Attributes
|
64
|
+
|
65
|
+
Complex nested fields are stored under the `attributes` key of `result.metadata`, using dotted key notation:
|
66
|
+
|
67
|
+
```python
|
68
|
+
from kreuzberg import extract_file_sync
|
69
|
+
|
70
|
+
# Example JSON with nested structure
|
71
|
+
result = extract_file_sync("complex.json")
|
72
|
+
|
73
|
+
# Access nested fields via attributes
|
74
|
+
if "attributes" in result.metadata:
|
75
|
+
# Nested fields like {"info": {"title": "Example"}} become "info.title"
|
76
|
+
nested_title = result.metadata["attributes"].get("info.title")
|
77
|
+
|
78
|
+
# Array items are indexed: {"items": [{"name": "first"}]} becomes "items[0].name"
|
79
|
+
first_item = result.metadata["attributes"].get("items[0].name")
|
80
|
+
```
|
81
|
+
|
82
|
+
### Schema Extraction
|
83
|
+
|
84
|
+
When `extract_schema=True` is set on `JSONExtractionConfig`, Kreuzberg extracts the JSON structure and stores it under the `json_schema` metadata key:
|
85
|
+
|
86
|
+
```python
|
87
|
+
from kreuzberg import extract_file_sync, ExtractionConfig, JSONExtractionConfig
|
88
|
+
|
89
|
+
config = ExtractionConfig(json_config=JSONExtractionConfig(extract_schema=True))
|
90
|
+
result = extract_file_sync("data.json", config=config)
|
91
|
+
|
92
|
+
# Access the schema
|
93
|
+
if "json_schema" in result.metadata:
|
94
|
+
schema = result.metadata["json_schema"]
|
95
|
+
print(f"Root type: {schema['type']}")
|
96
|
+
if "properties" in schema:
|
97
|
+
print(f"Properties: {list(schema['properties'].keys())}")
|
98
|
+
```
|
99
|
+
|
52
100
|
## Working with Multiple Document Types
|
53
101
|
|
54
102
|
When working with multiple document types, it's important to remember that different document formats may provide different metadata fields. Always use defensive programming (like using `.get()` with a default value) when accessing metadata fields:
|
@@ -57,6 +105,9 @@ When working with multiple document types, it's important to remember that diffe
|
|
57
105
|
# Safe way to access metadata across different document types
|
58
106
|
author = result.metadata.get("authors", ["Unknown"])[0] if "authors" in result.metadata else "Unknown"
|
59
107
|
creation_date = result.metadata.get("created_at", "Unknown date")
|
108
|
+
|
109
|
+
# For structured data with nested attributes
|
110
|
+
nested_fields = result.metadata.get("attributes", {})
|
60
111
|
```
|
61
112
|
|
62
113
|
## Viewing Available Metadata
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# Supported Formats
|
2
2
|
|
3
|
-
Kreuzberg handles a wide range of document, image, and text formats.
|
3
|
+
Kreuzberg handles a wide range of document, image, text, and structured data formats.
|
4
4
|
|
5
5
|
## Document Formats
|
6
6
|
|
@@ -36,6 +36,19 @@ Kreuzberg handles a wide range of document, image, and text formats.
|
|
36
36
|
- EndNote and JATS XML (`.xml`)
|
37
37
|
- RIS (`.ris`)
|
38
38
|
|
39
|
+
## Structured Data Formats
|
40
|
+
|
41
|
+
- JSON (`.json`) - High-performance extraction using msgspec with schema analysis
|
42
|
+
- YAML (`.yaml`, `.yml`) - Full YAML 1.2 support with nested structure extraction
|
43
|
+
- TOML (`.toml`) - Configuration and metadata files with type-aware processing
|
44
|
+
|
45
|
+
These formats benefit from:
|
46
|
+
|
47
|
+
- **Schema extraction**: Automatically analyze and extract the structure of your data
|
48
|
+
- **Custom field detection**: Configure additional text fields for specialized extraction
|
49
|
+
- **Type information**: Optionally include data type annotations in extracted content
|
50
|
+
- **Performance optimization**: Uses msgspec for efficient JSON parsing
|
51
|
+
|
39
52
|
## Image Formats
|
40
53
|
|
41
54
|
- JPEG (`.jpg`, `.jpeg`, `.pjpeg`)
|
@@ -8,8 +8,10 @@ from ._types import (
|
|
8
8
|
ExtractionConfig,
|
9
9
|
ExtractionResult,
|
10
10
|
GMFTConfig,
|
11
|
+
HTMLToMarkdownConfig,
|
11
12
|
ImageOCRConfig,
|
12
13
|
ImageOCRResult,
|
14
|
+
JSONExtractionConfig,
|
13
15
|
LanguageDetectionConfig,
|
14
16
|
Metadata,
|
15
17
|
PaddleOCRConfig,
|
@@ -40,8 +42,10 @@ __all__ = [
|
|
40
42
|
"ExtractionResult",
|
41
43
|
"ExtractorRegistry",
|
42
44
|
"GMFTConfig",
|
45
|
+
"HTMLToMarkdownConfig",
|
43
46
|
"ImageOCRConfig",
|
44
47
|
"ImageOCRResult",
|
48
|
+
"JSONExtractionConfig",
|
45
49
|
"KreuzbergError",
|
46
50
|
"LanguageDetectionConfig",
|
47
51
|
"Metadata",
|
@@ -13,10 +13,8 @@ from typing_extensions import TypedDict
|
|
13
13
|
|
14
14
|
from kreuzberg import (
|
15
15
|
EasyOCRConfig,
|
16
|
-
ExtractedImage,
|
17
16
|
ExtractionConfig,
|
18
17
|
ExtractionResult,
|
19
|
-
ImageOCRResult,
|
20
18
|
KreuzbergError,
|
21
19
|
MissingDependencyError,
|
22
20
|
PaddleOCRConfig,
|
@@ -40,30 +38,6 @@ if TYPE_CHECKING:
|
|
40
38
|
from litestar.datastructures import UploadFile
|
41
39
|
|
42
40
|
|
43
|
-
class ExtractedImageDict(TypedDict):
|
44
|
-
"""TypedDict for extracted image JSON representation."""
|
45
|
-
|
46
|
-
data: str
|
47
|
-
format: str
|
48
|
-
filename: str | None
|
49
|
-
page_number: int | None
|
50
|
-
dimensions: tuple[int, int] | None
|
51
|
-
colorspace: str | None
|
52
|
-
bits_per_component: int | None
|
53
|
-
is_mask: bool
|
54
|
-
description: str | None
|
55
|
-
|
56
|
-
|
57
|
-
class ImageOCRResultDict(TypedDict):
|
58
|
-
"""TypedDict for image OCR result JSON representation."""
|
59
|
-
|
60
|
-
image: ExtractedImageDict
|
61
|
-
ocr_result: Any
|
62
|
-
confidence_score: float | None
|
63
|
-
processing_time: float | None
|
64
|
-
skipped_reason: str | None
|
65
|
-
|
66
|
-
|
67
41
|
class HealthResponse(TypedDict):
|
68
42
|
"""Response model for health check endpoint."""
|
69
43
|
|
@@ -384,31 +358,6 @@ def _pil_image_encoder(obj: Any) -> str:
|
|
384
358
|
return f"data:image/png;base64,{img_str}"
|
385
359
|
|
386
360
|
|
387
|
-
def _extracted_image_encoder(obj: ExtractedImage) -> ExtractedImageDict:
|
388
|
-
encoded_data = base64.b64encode(obj.data).decode()
|
389
|
-
return ExtractedImageDict(
|
390
|
-
data=f"data:image/{obj.format};base64,{encoded_data}",
|
391
|
-
format=obj.format,
|
392
|
-
filename=obj.filename,
|
393
|
-
page_number=obj.page_number,
|
394
|
-
dimensions=obj.dimensions,
|
395
|
-
colorspace=obj.colorspace,
|
396
|
-
bits_per_component=obj.bits_per_component,
|
397
|
-
is_mask=obj.is_mask,
|
398
|
-
description=obj.description,
|
399
|
-
)
|
400
|
-
|
401
|
-
|
402
|
-
def _image_ocr_result_encoder(obj: ImageOCRResult) -> ImageOCRResultDict:
|
403
|
-
return ImageOCRResultDict(
|
404
|
-
image=_extracted_image_encoder(obj.image),
|
405
|
-
ocr_result=obj.ocr_result,
|
406
|
-
confidence_score=obj.confidence_score,
|
407
|
-
processing_time=obj.processing_time,
|
408
|
-
skipped_reason=obj.skipped_reason,
|
409
|
-
)
|
410
|
-
|
411
|
-
|
412
361
|
openapi_config = OpenAPIConfig(
|
413
362
|
title="Kreuzberg API",
|
414
363
|
version="3.14.0",
|
@@ -428,8 +377,6 @@ openapi_config = OpenAPIConfig(
|
|
428
377
|
type_encoders = {
|
429
378
|
pl.DataFrame: _polars_dataframe_encoder,
|
430
379
|
Image.Image: _pil_image_encoder,
|
431
|
-
ExtractedImage: _extracted_image_encoder,
|
432
|
-
ImageOCRResult: _image_ocr_result_encoder,
|
433
380
|
}
|
434
381
|
|
435
382
|
app = Litestar(
|
@@ -69,7 +69,17 @@ def _build_ocr_config_from_cli(
|
|
69
69
|
try:
|
70
70
|
match ocr_backend:
|
71
71
|
case "tesseract":
|
72
|
-
|
72
|
+
# Handle PSM mode conversion from int to enum
|
73
|
+
processed_args = backend_args.copy()
|
74
|
+
if "psm" in processed_args and isinstance(processed_args["psm"], int):
|
75
|
+
try:
|
76
|
+
processed_args["psm"] = PSMMode(processed_args["psm"])
|
77
|
+
except ValueError as e:
|
78
|
+
raise ValidationError(
|
79
|
+
f"Invalid PSM mode value: {processed_args['psm']}",
|
80
|
+
context={"psm_value": processed_args["psm"], "error": str(e)},
|
81
|
+
) from e
|
82
|
+
return TesseractConfig(**processed_args)
|
73
83
|
case "easyocr":
|
74
84
|
return EasyOCRConfig(**backend_args)
|
75
85
|
case "paddleocr":
|
@@ -132,7 +132,7 @@ def classify_document_from_layout(
|
|
132
132
|
if not found_words.is_empty():
|
133
133
|
scores[doc_type] += 1.0
|
134
134
|
word_top = found_words[0, "top"]
|
135
|
-
if word_top < page_height * 0.3:
|
135
|
+
if word_top is not None and word_top < page_height * 0.3:
|
136
136
|
scores[doc_type] += 0.5
|
137
137
|
|
138
138
|
total_score = sum(scores.values())
|
@@ -27,6 +27,8 @@ except ImportError: # pragma: no cover
|
|
27
27
|
html2text = None
|
28
28
|
|
29
29
|
_HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
|
30
|
+
_UNICODE_QUOTES_PATTERN = re.compile(r"[\u201c\u201d]")
|
31
|
+
_UNICODE_SINGLE_QUOTES_PATTERN = re.compile(r"[\u2018\u2019]")
|
30
32
|
|
31
33
|
|
32
34
|
class EmailExtractor(Extractor):
|
@@ -86,7 +88,14 @@ class EmailExtractor(Extractor):
|
|
86
88
|
def _format_email_field(self, field: Any) -> str:
|
87
89
|
match field:
|
88
90
|
case list():
|
89
|
-
|
91
|
+
emails = []
|
92
|
+
for item in field:
|
93
|
+
if isinstance(item, dict):
|
94
|
+
if email := item.get("email", ""):
|
95
|
+
emails.append(str(email))
|
96
|
+
else:
|
97
|
+
emails.append(str(item))
|
98
|
+
return ", ".join(emails)
|
90
99
|
case dict():
|
91
100
|
return str(field.get("email", ""))
|
92
101
|
case _:
|
@@ -111,12 +120,8 @@ class EmailExtractor(Extractor):
|
|
111
120
|
cleaned = re.sub(r"<style[^>]*>.*?</style>", "", cleaned, flags=re.IGNORECASE | re.DOTALL)
|
112
121
|
clean_html = _HTML_TAG_PATTERN.sub("", cleaned)
|
113
122
|
clean_html = unescape(clean_html)
|
114
|
-
clean_html = (
|
115
|
-
clean_html.replace("\u201c", '"')
|
116
|
-
.replace("\u201d", '"')
|
117
|
-
.replace("\u2019", "'")
|
118
|
-
.replace("\u2018", "'")
|
119
|
-
)
|
123
|
+
clean_html = _UNICODE_QUOTES_PATTERN.sub('"', clean_html)
|
124
|
+
clean_html = _UNICODE_SINGLE_QUOTES_PATTERN.sub("'", clean_html)
|
120
125
|
text_parts.append(clean_html)
|
121
126
|
|
122
127
|
def _extract_email_attachments(
|
@@ -129,12 +134,12 @@ class EmailExtractor(Extractor):
|
|
129
134
|
for att in attachments:
|
130
135
|
name_val: str = "unknown"
|
131
136
|
if isinstance(att, dict):
|
132
|
-
n = att.get("name")
|
137
|
+
n = att.get("name") or att.get("filename")
|
133
138
|
if isinstance(n, str) and n:
|
134
139
|
name_val = n
|
135
140
|
names.append(name_val)
|
136
|
-
metadata["attachments"] = names
|
137
141
|
if names:
|
142
|
+
metadata["attachments"] = names
|
138
143
|
text_parts.append("Attachments: " + ", ".join(names))
|
139
144
|
|
140
145
|
def _extract_images_from_attachments(self, parsed_email: dict[str, Any]) -> list[ExtractedImage]:
|
@@ -151,7 +156,8 @@ class EmailExtractor(Extractor):
|
|
151
156
|
if not isinstance(mime, str) or not mime.startswith("image/"):
|
152
157
|
continue
|
153
158
|
|
154
|
-
name = att.get("name")
|
159
|
+
name = att.get("name") or att.get("filename")
|
160
|
+
name = name if isinstance(name, str) else None
|
155
161
|
data = att.get("data") or att.get("content") or att.get("payload")
|
156
162
|
raw: bytes | None = None
|
157
163
|
if isinstance(data, (bytes, bytearray)):
|