classifyre-cli 0.4.10__tar.gz → 0.4.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/.turbo/turbo-build.log +1 -1
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/PKG-INFO +1 -1
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/package.json +1 -1
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/pyproject.toml +8 -2
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/runners/_base.py +28 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/runners/_image_classification.py +41 -31
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/runners/_llm.py +71 -6
- classifyre_cli-0.4.12/src/detectors/custom/runners/_object_detection.py +121 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/models/generated_detectors.py +4 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/models/generated_input.py +32 -4
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/outputs/rest.py +4 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/pipeline/detector_pipeline.py +13 -32
- classifyre_cli-0.4.12/src/sandbox/runner.py +308 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/databricks/source.py +61 -8
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/object_storage/base.py +81 -5
- classifyre_cli-0.4.12/src/utils/embedded_images.py +222 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/utils/file_parser.py +65 -38
- classifyre_cli-0.4.12/src/utils/file_to_images.py +134 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/custom/test_llm_runner.py +82 -3
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/custom/test_transformer_runners.py +3 -3
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_outputs.py +1 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_s3_compatible_storage_source.py +68 -0
- classifyre_cli-0.4.12/tests/test_sandbox_runner.py +214 -0
- classifyre_cli-0.4.12/tests/utils/test_embedded_images.py +129 -0
- classifyre_cli-0.4.12/tests/utils/test_file_to_images.py +99 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/uv.lock +363 -271
- classifyre_cli-0.4.10/src/detectors/custom/runners/_object_detection.py +0 -107
- classifyre_cli-0.4.10/src/sandbox/runner.py +0 -145
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/.gitignore +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/.python-version +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/README.md +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/main.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/scripts/generate_models.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/base.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/broken_links/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/broken_links/detector.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/config.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/detector.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/extractor.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/runners/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/runners/_factory.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/runners/_feature_extraction.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/runners/_gliner2.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/runners/_regex.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/runners/_text_classification.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/custom/trainer.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/dependencies.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/pii/detector.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/secrets/detector.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/threat/code_security_detector.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/detectors/threat/yara_detector.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/main.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/models/generated_single_asset_scan_results.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/outputs/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/outputs/base.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/outputs/console.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/outputs/factory.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/outputs/file.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/pipeline/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/pipeline/content_provider.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/pipeline/parsed_content_provider.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/pipeline/worker_pool.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sandbox/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/atlassian_common.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/azure_blob_storage/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/azure_blob_storage/source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/base.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/confluence/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/confluence/source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/databricks/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/dependencies.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/google_cloud_storage/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/google_cloud_storage/source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/hive/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/hive/source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/jira/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/jira/source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/mongodb/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/mongodb/source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/mssql/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/mssql/source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/mysql/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/mysql/source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/neo4j/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/neo4j/source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/oracle/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/oracle/source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/postgresql/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/postgresql/source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/powerbi/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/powerbi/source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/s3_compatible_storage/README.md +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/s3_compatible_storage/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/s3_compatible_storage/source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/servicedesk/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/servicedesk/source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/slack/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/slack/source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/snowflake/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/snowflake/source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/sqlite/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/sqlite/source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/tableau/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/tableau/source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/tabular_base.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/tabular_utils.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/wordpress/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/sources/wordpress/source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/telemetry.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/utils/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/utils/content_extraction.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/utils/hashing.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/utils/uv_sync.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/src/utils/validation.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/conftest.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/conftest.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/custom/conftest.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/custom/test_invoice_extraction.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/custom/test_pipeline_integration.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/custom/test_regex_runner.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/pii/conftest.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/pii/sample_invoice.pdf +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/pii/test_pii_detector.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/secrets/test_secrets_detector.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/test_base_detector.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/test_detector_catalog_commercial.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/test_detector_pipeline_types.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/test_detector_schema_examples.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/test_detector_types.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/test_phase2_detectors.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/test_registry.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/threat/test_code_security_detector.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/detectors/threat/test_yara_detector.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/integration/test_wordpress_links_assets.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/pipeline/test_detector_pipeline.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/pipeline/test_worker_pool.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_azure_blob_storage_source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_base_source_attachment.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_base_source_sampling.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_confluence_source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_custom_extractor.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_databricks_source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_google_cloud_storage_source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_hashing.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_hive_source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_jira_source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_mongodb_source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_mssql_source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_mysql_source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_neo4j_source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_oracle_source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_postgresql_source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_powerbi_source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_servicedesk_source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_slack_source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_snowflake_source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_source_dependency_groups.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_sqlite_source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_tableau_source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_tabular_utils.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/test_wordpress_source.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/utils/test_content_extraction.py +0 -0
- {classifyre_cli-0.4.10 → classifyre_cli-0.4.12}/tests/utils/test_file_parser.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "classifyre-cli"
|
|
3
|
-
version = "0.4.10"
|
|
3
|
+
version = "0.4.12"
|
|
4
4
|
description = "Classifyre CLI — scan and classify unstructured data sources"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.12"
|
|
@@ -47,7 +47,7 @@ privacy = [
|
|
|
47
47
|
# mid-run in frozen/venv contexts. 8.x eagerly loads all data at import time,
|
|
48
48
|
# avoiding ModuleNotFoundError during Presidio phone number analysis.
|
|
49
49
|
"phonenumbers>=8.13.0,<10.0.0",
|
|
50
|
-
"numpy>=1.26.0,<
|
|
50
|
+
"numpy>=1.26.0,<3.0.0",
|
|
51
51
|
]
|
|
52
52
|
security = [
|
|
53
53
|
"detect-secrets>=1.5.0",
|
|
@@ -93,6 +93,10 @@ regex = [
|
|
|
93
93
|
]
|
|
94
94
|
llm = [
|
|
95
95
|
"litellm>=1.86.2",
|
|
96
|
+
# Pure-wheel PDF renderer (permissive license, no system binaries) used to
|
|
97
|
+
# rasterise PDF pages to images for vision-capable LLM detectors.
|
|
98
|
+
"pypdfium2>=4.30.0",
|
|
99
|
+
"pillow>=12.2.0",
|
|
96
100
|
]
|
|
97
101
|
detectors = [
|
|
98
102
|
{ include-group = "file-processing" },
|
|
@@ -270,6 +274,8 @@ module = [
|
|
|
270
274
|
"setfit",
|
|
271
275
|
"litellm.*",
|
|
272
276
|
"litellm",
|
|
277
|
+
"pypdfium2.*",
|
|
278
|
+
"pypdfium2",
|
|
273
279
|
"sklearn.*",
|
|
274
280
|
"sklearn",
|
|
275
281
|
"numpy",
|
|
@@ -2,6 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import io
|
|
6
|
+
import logging
|
|
5
7
|
import re
|
|
6
8
|
from abc import ABC, abstractmethod
|
|
7
9
|
from datetime import UTC, datetime
|
|
@@ -38,6 +40,32 @@ _IMAGE_CONTENT_TYPES = [
|
|
|
38
40
|
"image/bmp",
|
|
39
41
|
"image/tiff",
|
|
40
42
|
]
|
|
43
|
+
# Content types HuggingFace image detectors accept. Non-image renderable files
|
|
44
|
+
# (PDFs) are rasterised page-by-page via render_to_images before classification,
|
|
45
|
+
# mirroring the vision LLM detector's input handling.
|
|
46
|
+
_IMAGE_INPUT_CONTENT_TYPES = [*_IMAGE_CONTENT_TYPES, "application/pdf"]
|
|
47
|
+
|
|
48
|
+
logger = logging.getLogger(__name__)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _load_input_images(content: bytes, content_type: str, pil: Any) -> list[tuple[int, Any]]:
|
|
52
|
+
"""Return ``(page_index, PIL.Image)`` tuples for an image or renderable file.
|
|
53
|
+
|
|
54
|
+
Image MIME types open directly; PDFs (and any type ``render_to_images`` supports)
|
|
55
|
+
are rasterised to one image per page. Unsupported types return ``[]``.
|
|
56
|
+
"""
|
|
57
|
+
from ....utils.file_to_images import render_to_images, supported_mime_type
|
|
58
|
+
|
|
59
|
+
normalized = content_type.split(";", 1)[0].strip().lower()
|
|
60
|
+
try:
|
|
61
|
+
if normalized.startswith("image/"):
|
|
62
|
+
return [(0, pil.open(io.BytesIO(content)))]
|
|
63
|
+
if supported_mime_type(content_type):
|
|
64
|
+
pages = render_to_images(content, content_type)
|
|
65
|
+
return [(idx, pil.open(io.BytesIO(png))) for idx, png in enumerate(pages)]
|
|
66
|
+
except Exception as exc: # pragma: no cover - defensive
|
|
67
|
+
logger.warning("Failed to load input images (%s): %s", normalized, exc)
|
|
68
|
+
return []
|
|
41
69
|
|
|
42
70
|
|
|
43
71
|
def _resolve_pipeline_severity(
|
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
-
import io
|
|
6
5
|
import logging
|
|
7
6
|
from typing import Any
|
|
8
7
|
|
|
@@ -11,8 +10,9 @@ from ....models.generated_single_asset_scan_results import DetectionResult
|
|
|
11
10
|
from ...dependencies import ensure_torch, require_module
|
|
12
11
|
from ._base import (
|
|
13
12
|
_DEFAULT_IMAGE_CLASSIFICATION_MODEL,
|
|
14
|
-
|
|
13
|
+
_IMAGE_INPUT_CONTENT_TYPES,
|
|
15
14
|
BaseRunner,
|
|
15
|
+
_load_input_images,
|
|
16
16
|
_resolve_pipeline_severity,
|
|
17
17
|
)
|
|
18
18
|
|
|
@@ -54,45 +54,55 @@ class ImageClassificationRunner(BaseRunner):
|
|
|
54
54
|
raise NotImplementedError("ImageClassificationRunner uses detect() directly")
|
|
55
55
|
|
|
56
56
|
def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
|
|
57
|
-
if not content_type.startswith("image/"):
|
|
58
|
-
return []
|
|
59
57
|
if isinstance(content, str):
|
|
60
58
|
logger.warning("image_classification: received string content, expected bytes")
|
|
61
59
|
return []
|
|
62
60
|
|
|
61
|
+
# image/* opens directly; PDFs are rasterised to one image per page.
|
|
62
|
+
images = _load_input_images(content, content_type, self._pil)
|
|
63
|
+
if not images:
|
|
64
|
+
return []
|
|
65
|
+
|
|
63
66
|
schema = self._schema
|
|
64
67
|
threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.0
|
|
68
|
+
multi_page = len(images) > 1
|
|
65
69
|
results: list[DetectionResult] = []
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
"
|
|
86
|
-
"
|
|
87
|
-
|
|
70
|
+
for page_index, image in images:
|
|
71
|
+
try:
|
|
72
|
+
predictions: list[dict[str, Any]] = self._pipe(image) or []
|
|
73
|
+
for pred in predictions:
|
|
74
|
+
label: str = pred.get("label", "unknown")
|
|
75
|
+
score: float = float(pred.get("score", 0.0))
|
|
76
|
+
if score < threshold:
|
|
77
|
+
continue
|
|
78
|
+
severity = _resolve_pipeline_severity(label, schema.severity_map)
|
|
79
|
+
page_suffix = f" (page {page_index + 1})" if multi_page else ""
|
|
80
|
+
metadata: dict[str, Any] = {
|
|
81
|
+
"image_size": f"{image.size[0]}x{image.size[1]}",
|
|
82
|
+
"image_mode": image.mode,
|
|
83
|
+
"model": self._model_id,
|
|
84
|
+
}
|
|
85
|
+
if multi_page:
|
|
86
|
+
metadata["page"] = page_index + 1
|
|
87
|
+
results.append(
|
|
88
|
+
self._make_result(
|
|
89
|
+
finding_type=f"classification:{label}",
|
|
90
|
+
category="CONTENT",
|
|
91
|
+
severity=severity,
|
|
92
|
+
confidence=score,
|
|
93
|
+
matched_content=(
|
|
94
|
+
f"Image classified as: {label} ({score:.3f}){page_suffix}"
|
|
95
|
+
),
|
|
96
|
+
location=None,
|
|
97
|
+
metadata=metadata,
|
|
98
|
+
)
|
|
88
99
|
)
|
|
100
|
+
except Exception as exc:
|
|
101
|
+
logger.error(
|
|
102
|
+
"image_classification error (model=%s): %s", self._model_id, exc, exc_info=True
|
|
89
103
|
)
|
|
90
|
-
except Exception as exc:
|
|
91
|
-
logger.error(
|
|
92
|
-
"image_classification error (model=%s): %s", self._model_id, exc, exc_info=True
|
|
93
|
-
)
|
|
94
104
|
results.sort(key=lambda r: r.confidence, reverse=True)
|
|
95
105
|
return results
|
|
96
106
|
|
|
97
107
|
def get_supported_content_types(self) -> list[str]:
|
|
98
|
-
return list(_IMAGE_CONTENT_TYPES)
|
|
108
|
+
return list(_IMAGE_INPUT_CONTENT_TYPES)
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import base64
|
|
5
6
|
import json
|
|
6
7
|
import logging
|
|
7
8
|
import os
|
|
@@ -17,8 +18,9 @@ from ....models.generated_single_asset_scan_results import (
|
|
|
17
18
|
DetectionResult,
|
|
18
19
|
DetectorType,
|
|
19
20
|
)
|
|
21
|
+
from ....utils.file_to_images import render_to_images, supported_mime_type
|
|
20
22
|
from ...dependencies import require_module
|
|
21
|
-
from ._base import _TEXT_CONTENT_TYPES, BaseRunner, _resolve_pipeline_severity
|
|
23
|
+
from ._base import _IMAGE_CONTENT_TYPES, _TEXT_CONTENT_TYPES, BaseRunner, _resolve_pipeline_severity
|
|
22
24
|
|
|
23
25
|
logger = logging.getLogger(__name__)
|
|
24
26
|
|
|
@@ -29,6 +31,14 @@ _PROVIDER_PREFIX: dict[str, str] = {
|
|
|
29
31
|
"OPENAI_COMPATIBLE": "openai",
|
|
30
32
|
}
|
|
31
33
|
|
|
34
|
+
# Content types a vision-capable LLM detector renders to images and sends to the
|
|
35
|
+
# model directly. PDFs are rasterised page-by-page; images pass through.
|
|
36
|
+
_VISION_CONTENT_TYPES = [*_IMAGE_CONTENT_TYPES, "application/pdf"]
|
|
37
|
+
|
|
38
|
+
# Cap the number of rendered page images sent in a single completion to bound
|
|
39
|
+
# token cost and request size for multi-page PDFs.
|
|
40
|
+
_MAX_VISION_IMAGES = 20
|
|
41
|
+
|
|
32
42
|
|
|
33
43
|
class LLMRunner(BaseRunner):
|
|
34
44
|
"""AI detector — sends content to a configured LLM provider for classification + extraction."""
|
|
@@ -60,7 +70,7 @@ class LLMRunner(BaseRunner):
|
|
|
60
70
|
|
|
61
71
|
def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
|
|
62
72
|
if isinstance(content, bytes):
|
|
63
|
-
return []
|
|
73
|
+
return self._detect_vision(content, content_type)
|
|
64
74
|
if content_type not in _TEXT_CONTENT_TYPES:
|
|
65
75
|
return []
|
|
66
76
|
text = content.strip()
|
|
@@ -75,7 +85,48 @@ class LLMRunner(BaseRunner):
|
|
|
75
85
|
{"role": "system", "content": self._build_system_prompt()},
|
|
76
86
|
{"role": "user", "content": snippet},
|
|
77
87
|
]
|
|
88
|
+
return self._complete_and_parse(messages, snippet)
|
|
89
|
+
|
|
90
|
+
def _detect_vision(self, content: bytes, content_type: str) -> list[DetectionResult]:
|
|
91
|
+
"""Render a binary file (image/PDF) to images and classify via the model."""
|
|
92
|
+
if not self._vision_enabled():
|
|
93
|
+
return []
|
|
94
|
+
if not supported_mime_type(content_type):
|
|
95
|
+
return []
|
|
78
96
|
|
|
97
|
+
images = render_to_images(
|
|
98
|
+
content,
|
|
99
|
+
content_type,
|
|
100
|
+
max_pages=_MAX_VISION_IMAGES,
|
|
101
|
+
)
|
|
102
|
+
if not images:
|
|
103
|
+
return []
|
|
104
|
+
|
|
105
|
+
image_blocks = [
|
|
106
|
+
{
|
|
107
|
+
"type": "image_url",
|
|
108
|
+
"image_url": {
|
|
109
|
+
"url": f"data:image/png;base64,{base64.b64encode(png).decode('ascii')}"
|
|
110
|
+
},
|
|
111
|
+
}
|
|
112
|
+
for png in images[:_MAX_VISION_IMAGES]
|
|
113
|
+
]
|
|
114
|
+
messages = [
|
|
115
|
+
{"role": "system", "content": self._build_system_prompt()},
|
|
116
|
+
{"role": "user", "content": image_blocks},
|
|
117
|
+
]
|
|
118
|
+
# matched_content fallback descriptor — there is no text snippet for files.
|
|
119
|
+
descriptor = f"[{content_type}, {len(image_blocks)} page image(s)]"
|
|
120
|
+
return self._complete_and_parse(messages, descriptor, vision_pages=len(image_blocks))
|
|
121
|
+
|
|
122
|
+
def _complete_and_parse(
|
|
123
|
+
self,
|
|
124
|
+
messages: list[dict[str, Any]],
|
|
125
|
+
snippet: str,
|
|
126
|
+
*,
|
|
127
|
+
vision_pages: int | None = None,
|
|
128
|
+
) -> list[DetectionResult]:
|
|
129
|
+
schema = self._schema
|
|
79
130
|
try:
|
|
80
131
|
response = self._litellm.completion(
|
|
81
132
|
model=self._model_string(),
|
|
@@ -98,10 +149,16 @@ class LLMRunner(BaseRunner):
|
|
|
98
149
|
)
|
|
99
150
|
return []
|
|
100
151
|
|
|
101
|
-
return self._results_from_payload(snippet, parsed)
|
|
152
|
+
return self._results_from_payload(snippet, parsed, vision_pages=vision_pages)
|
|
153
|
+
|
|
154
|
+
def _vision_enabled(self) -> bool:
|
|
155
|
+
return bool(getattr(self._runtime, "supports_vision", False))
|
|
102
156
|
|
|
103
157
|
def get_supported_content_types(self) -> list[str]:
|
|
104
|
-
|
|
158
|
+
types = list(_TEXT_CONTENT_TYPES)
|
|
159
|
+
if self._vision_enabled():
|
|
160
|
+
types.extend(_VISION_CONTENT_TYPES)
|
|
161
|
+
return types
|
|
105
162
|
|
|
106
163
|
# ── Internals ────────────────────────────────────────────────────────────
|
|
107
164
|
|
|
@@ -175,7 +232,13 @@ class LLMRunner(BaseRunner):
|
|
|
175
232
|
return {}
|
|
176
233
|
return parsed if isinstance(parsed, dict) else {}
|
|
177
234
|
|
|
178
|
-
def _results_from_payload(
|
|
235
|
+
def _results_from_payload(
|
|
236
|
+
self,
|
|
237
|
+
snippet: str,
|
|
238
|
+
payload: dict[str, Any],
|
|
239
|
+
*,
|
|
240
|
+
vision_pages: int | None = None,
|
|
241
|
+
) -> list[DetectionResult]:
|
|
179
242
|
schema = self._schema
|
|
180
243
|
threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.5
|
|
181
244
|
default_severity = schema.severity or Severity.info
|
|
@@ -201,7 +264,7 @@ class LLMRunner(BaseRunner):
|
|
|
201
264
|
results.append(
|
|
202
265
|
DetectionResult(
|
|
203
266
|
detector_type=DetectorType.CUSTOM,
|
|
204
|
-
finding_type=
|
|
267
|
+
finding_type=label,
|
|
205
268
|
category="CLASSIFICATION",
|
|
206
269
|
severity=severity,
|
|
207
270
|
confidence=min(0.99, confidence),
|
|
@@ -216,6 +279,8 @@ class LLMRunner(BaseRunner):
|
|
|
216
279
|
"model": self._runtime.model,
|
|
217
280
|
"label": label,
|
|
218
281
|
"fields": extracted,
|
|
282
|
+
"input": "vision" if vision_pages is not None else "text",
|
|
283
|
+
**({"vision_pages": vision_pages} if vision_pages is not None else {}),
|
|
219
284
|
},
|
|
220
285
|
extracted_data=extracted or None,
|
|
221
286
|
extraction_method="LLM",
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""Object detection pipeline runner."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from ....models.generated_detectors import ObjectDetectionPipelineSchema
|
|
9
|
+
from ....models.generated_single_asset_scan_results import DetectionResult, Location
|
|
10
|
+
from ...dependencies import MissingDependencyError, ensure_torch, require_module
|
|
11
|
+
from ._base import (
|
|
12
|
+
_IMAGE_INPUT_CONTENT_TYPES,
|
|
13
|
+
BaseRunner,
|
|
14
|
+
_load_input_images,
|
|
15
|
+
_resolve_pipeline_severity,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ObjectDetectionRunner(BaseRunner):
    """Object detection via a single HuggingFace object-detection pipeline.

    The runner loads one ``transformers`` ``object-detection`` pipeline at
    construction time and applies it to every image produced from the input
    (one image for ``image/*`` content, one image per page for PDFs).
    Detections are filtered by confidence and optional minimum box area,
    sorted by confidence and optionally truncated to ``top_k``.
    """

    def __init__(
        self,
        schema: ObjectDetectionPipelineSchema,
        detector_key: str = "",
        detector_name: str = "",
    ) -> None:
        """Build the underlying pipeline from ``schema``.

        Args:
            schema: Pipeline configuration (model, device, thresholds, ...).
            detector_key: Key of the detector this runner belongs to.
            detector_name: Display name of the detector.

        Raises:
            MissingDependencyError: If building the pipeline raises an
                ``ImportError``.
        """
        self._schema = schema
        self._detector_key = detector_key
        self._detector_name = detector_name
        ensure_torch("object_detection", ["custom", "detectors"])
        transformers = require_module("transformers", "object_detection", ["custom", "detectors"])
        self._pil = require_module("PIL.Image", "object_detection", ["custom", "detectors"])
        pipeline_kwargs: dict[str, Any] = {
            "model": schema.model,
            "device": schema.device or "cpu",
        }
        if schema.model_revision:
            pipeline_kwargs["revision"] = schema.model_revision
        # The HF object-detection pipeline's ``threshold`` is its *score*
        # cut-off and defaults to 0.5.  If we leave it unset, detections below
        # 0.5 are dropped inside the pipeline and a lower
        # ``confidence_threshold`` can never take effect in ``detect()``.
        # NOTE(review): ``nms_threshold`` is forwarded as that same score
        # cut-off (kept for backward compatibility) -- confirm this mapping is
        # intended, the pipeline does not expose an NMS/IoU parameter.
        nms = self._plain(schema.nms_threshold)
        pipeline_kwargs["threshold"] = nms if nms is not None else self._score_threshold()
        try:
            self._pipe: Any = transformers.pipeline("object-detection", **pipeline_kwargs)
        except ImportError as exc:
            raise MissingDependencyError(
                "object_detection",
                ["custom", "detectors"],
                f"ObjectDetectionRunner requires additional dependencies: {exc}",
            ) from exc

    @staticmethod
    def _plain(value: Any) -> Any:
        """Return ``value.root`` when ``value`` is a root-model wrapper, else ``value``."""
        return getattr(value, "root", value)

    def _score_threshold(self) -> float:
        """Return the configured confidence threshold, defaulting to 0.5."""
        configured = self._plain(self._schema.confidence_threshold)
        return configured if configured is not None else 0.5

    def run(self, text: str) -> None:  # type: ignore[override]  # pragma: no cover
        """Unsupported: this runner works on binary content via ``detect()``."""
        raise NotImplementedError("ObjectDetectionRunner uses detect() directly")

    def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
        """Run object detection over ``content`` and return the findings.

        Args:
            content: Raw file bytes.  String content is rejected with a
                warning and yields no results.
            content_type: MIME type used to decide how images are loaded.

        Returns:
            Findings sorted by descending confidence, truncated to
            ``schema.top_k`` when that is set.  Pages whose processing raises
            are logged and skipped.
        """
        if isinstance(content, str):
            logger.warning("object_detection: received string content, expected bytes")
            return []

        # image/* opens directly; PDFs are rasterised to one image per page.
        images = _load_input_images(content, content_type, self._pil)
        if not images:
            return []

        schema = self._schema
        threshold = self._score_threshold()
        min_box_area = self._plain(schema.min_box_area)
        top_k = self._plain(schema.top_k)
        multi_page = len(images) > 1
        results: list[DetectionResult] = []
        for page_index, image in images:
            try:
                # Per-page invariants, computed once instead of per detection.
                page_prefix = f"page {page_index + 1} " if multi_page else ""
                image_size = f"{image.size[0]}x{image.size[1]}"
                detections: list[dict[str, Any]] = self._pipe(image) or []
                for det in detections:
                    label: str = det.get("label", "unknown")
                    score: float = float(det.get("score", 0.0))
                    # Treat an explicit ``None`` box like a missing one so a
                    # single odd detection does not abort the whole page.
                    box: dict[str, int] = det.get("box") or {}
                    if score < threshold:
                        continue
                    if min_box_area is not None:
                        w = max(0, box.get("xmax", 0) - box.get("xmin", 0))
                        h = max(0, box.get("ymax", 0) - box.get("ymin", 0))
                        if w * h < min_box_area:
                            continue
                    severity = _resolve_pipeline_severity(label, schema.severity_map)
                    metadata: dict[str, Any] = {
                        "box": box,
                        "score": score,
                        "image_size": image_size,
                        "model": schema.model,
                    }
                    if multi_page:
                        metadata["page"] = page_index + 1
                    results.append(
                        self._make_result(
                            finding_type=label,
                            category="CONTENT",
                            severity=severity,
                            confidence=score,
                            matched_content=label,
                            location=Location(
                                description=(
                                    f"{page_prefix}box xmin={box.get('xmin')} ymin={box.get('ymin')}"
                                    f" xmax={box.get('xmax')} ymax={box.get('ymax')}"
                                ),
                            ),
                            metadata=metadata,
                        )
                    )
            except Exception as exc:
                # Best effort: a failing page is logged and the remaining
                # pages are still processed.
                logger.error(
                    "object_detection error (model=%s): %s", schema.model, exc, exc_info=True
                )
        results.sort(key=lambda r: r.confidence, reverse=True)
        if top_k is not None:
            results = results[:top_k]
        return results

    def get_supported_content_types(self) -> list[str]:
        """Return the content types this runner accepts."""
        return list(_IMAGE_INPUT_CONTENT_TYPES)
|
|
@@ -1027,6 +1027,10 @@ class LLMProviderRuntime(BaseModel):
|
|
|
1027
1027
|
context_size: int | None = Field(
|
|
1028
1028
|
None, description='Optional context window size configured for the provider.'
|
|
1029
1029
|
)
|
|
1030
|
+
supports_vision: bool | None = Field(
|
|
1031
|
+
False,
|
|
1032
|
+
description='Whether the resolved provider/model accepts image/PDF input. When true the detector renders supported files to images and sends them to the model as multimodal input instead of extracting text.',
|
|
1033
|
+
)
|
|
1030
1034
|
|
|
1031
1035
|
|
|
1032
1036
|
class Type4(StrEnum):
|
|
@@ -1078,9 +1078,10 @@ class DatabricksAuthMode(StrEnum):
|
|
|
1078
1078
|
|
|
1079
1079
|
PAT_TOKEN = 'PAT_TOKEN'
|
|
1080
1080
|
SERVICE_PRINCIPAL = 'SERVICE_PRINCIPAL'
|
|
1081
|
+
AZURE_SERVICE_PRINCIPAL = 'AZURE_SERVICE_PRINCIPAL'
|
|
1081
1082
|
|
|
1082
1083
|
|
|
1083
|
-
class
|
|
1084
|
+
class PersonalAccessToken(BaseModel):
|
|
1084
1085
|
model_config = ConfigDict(
|
|
1085
1086
|
extra='forbid',
|
|
1086
1087
|
)
|
|
@@ -1094,7 +1095,7 @@ class DatabricksRequiredPat(BaseModel):
|
|
|
1094
1095
|
)
|
|
1095
1096
|
|
|
1096
1097
|
|
|
1097
|
-
class
|
|
1098
|
+
class ServicePrincipalOAuthM2M(BaseModel):
|
|
1098
1099
|
model_config = ConfigDict(
|
|
1099
1100
|
extra='forbid',
|
|
1100
1101
|
)
|
|
@@ -1109,6 +1110,24 @@ class DatabricksRequiredServicePrincipal(BaseModel):
|
|
|
1109
1110
|
client_id: str = Field(..., description='Databricks service principal client ID')
|
|
1110
1111
|
|
|
1111
1112
|
|
|
1113
|
+
class AzureServicePrincipal(BaseModel):
    # Required (non-secret) connection settings for the
    # AZURE_SERVICE_PRINCIPAL Databricks auth mode.  Unknown keys are rejected.
    model_config = ConfigDict(extra='forbid')

    # Discriminator: only the literal 'AZURE_SERVICE_PRINCIPAL' is accepted.
    auth_mode: Literal['AZURE_SERVICE_PRINCIPAL']
    workspace_url: AnyUrl = Field(
        ...,
        description='Azure Databricks workspace URL (for example, https://adb-1234567890123456.7.azuredatabricks.net)',
    )
    warehouse_id: str = Field(
        ...,
        description='Databricks SQL warehouse ID used for sampling queries',
    )
    client_id: str = Field(
        ...,
        description='Azure AD application (client) ID for the service principal',
    )
    tenant_id: str = Field(
        ...,
        description='Azure AD tenant ID',
    )
|
|
1129
|
+
|
|
1130
|
+
|
|
1112
1131
|
class DatabricksMaskedPat(BaseModel):
|
|
1113
1132
|
model_config = ConfigDict(
|
|
1114
1133
|
extra='forbid',
|
|
@@ -1125,6 +1144,15 @@ class DatabricksMaskedServicePrincipal(BaseModel):
|
|
|
1125
1144
|
)
|
|
1126
1145
|
|
|
1127
1146
|
|
|
1147
|
+
class DatabricksMaskedAzureServicePrincipal(BaseModel):
    # Secret part of the Azure service principal credentials; unknown keys
    # are rejected.
    model_config = ConfigDict(extra='forbid')

    client_secret: str = Field(
        ...,
        description='Azure AD client secret for the service principal',
    )
|
|
1154
|
+
|
|
1155
|
+
|
|
1128
1156
|
class DatabricksOptionalConnection(BaseModel):
|
|
1129
1157
|
"""
|
|
1130
1158
|
Databricks API and SQL statement execution tuning options.
|
|
@@ -2020,8 +2048,8 @@ class DatabricksInput(CoreInput):
|
|
|
2020
2048
|
type: Literal['DATABRICKS'] = Field(
|
|
2021
2049
|
'DATABRICKS', description='Type of the asset or source'
|
|
2022
2050
|
)
|
|
2023
|
-
required:
|
|
2024
|
-
..., title='DatabricksRequired'
|
|
2051
|
+
required: PersonalAccessToken | ServicePrincipalOAuthM2M | AzureServicePrincipal = (
|
|
2052
|
+
Field(..., title='DatabricksRequired')
|
|
2025
2053
|
)
|
|
2026
2054
|
masked: DatabricksMaskedPat | DatabricksMaskedServicePrincipal = Field(
|
|
2027
2055
|
..., title='DatabricksMasked'
|
|
@@ -131,6 +131,10 @@ class RestOutputSink:
|
|
|
131
131
|
self.base_url = base_url.rstrip("/")
|
|
132
132
|
self.timeout_sec = timeout_sec
|
|
133
133
|
self.session = requests.Session()
|
|
134
|
+
# Disable keep-alive so stale pooled connections are never reused after
|
|
135
|
+
# a pod restart or server-side keep-alive timeout. Each request opens
|
|
136
|
+
# a fresh TCP connection, which is cheap enough for our batch cadence.
|
|
137
|
+
self.session.headers.update({"Connection": "close"})
|
|
134
138
|
adapter = HTTPAdapter(max_retries=_RETRY_POLICY)
|
|
135
139
|
self.session.mount("http://", adapter)
|
|
136
140
|
self.session.mount("https://", adapter)
|
|
@@ -410,7 +410,7 @@ class DetectorPipeline:
|
|
|
410
410
|
)
|
|
411
411
|
return page_findings, page_types, page_errors, page_content, page_num
|
|
412
412
|
|
|
413
|
-
async def _collect_done_and_flush() -> None:
|
|
413
|
+
async def _collect_done_and_flush(min_findings: int = 1) -> None:
|
|
414
414
|
nonlocal detector_types_run, unflushed_count
|
|
415
415
|
done = {t for t in pending_tasks if t.done()}
|
|
416
416
|
for task in done:
|
|
@@ -430,7 +430,7 @@ class DetectorPipeline:
|
|
|
430
430
|
)
|
|
431
431
|
unflushed_count += len(page_findings)
|
|
432
432
|
|
|
433
|
-
if unflushed_count >=
|
|
433
|
+
if unflushed_count >= min_findings and unflushed_count > 0:
|
|
434
434
|
logger.debug(
|
|
435
435
|
" %s flushing %d findings (%d total)",
|
|
436
436
|
asset.name,
|
|
@@ -449,36 +449,17 @@ class DetectorPipeline:
|
|
|
449
449
|
if not text_content:
|
|
450
450
|
continue
|
|
451
451
|
|
|
452
|
+
# Bound the number of detector tasks in flight. While the buffer is
|
|
453
|
+
# full we batch flushes by ``findings_flush_size`` to avoid hammering
|
|
454
|
+
# the API when pages pile up faster than detectors can drain them.
|
|
452
455
|
while len(pending_tasks) >= max_pending:
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
for finding in page_findings:
|
|
461
|
-
self.content_provider.enrich_finding_location(
|
|
462
|
-
finding,
|
|
463
|
-
asset,
|
|
464
|
-
page_content,
|
|
465
|
-
)
|
|
466
|
-
findings.extend(page_findings)
|
|
467
|
-
errors.extend(page_errors)
|
|
468
|
-
detector_types_run = self._merge_detector_types(
|
|
469
|
-
detector_types_run,
|
|
470
|
-
page_types,
|
|
471
|
-
)
|
|
472
|
-
unflushed_count += len(page_findings)
|
|
473
|
-
if unflushed_count >= findings_flush_size and unflushed_count > 0:
|
|
474
|
-
logger.info(
|
|
475
|
-
" %s flushing %d findings (%d total)",
|
|
476
|
-
asset.name,
|
|
477
|
-
unflushed_count,
|
|
478
|
-
len(findings),
|
|
479
|
-
)
|
|
480
|
-
await on_findings_flushed(list(findings))
|
|
481
|
-
unflushed_count = 0
|
|
456
|
+
await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED)
|
|
457
|
+
await _collect_done_and_flush(findings_flush_size)
|
|
458
|
+
|
|
459
|
+
# Steady state: flush findings from any page that has already
|
|
460
|
+
# finished as soon as they are available, so real findings stream to
|
|
461
|
+
# the API per page instead of only once the whole asset is processed.
|
|
462
|
+
await _collect_done_and_flush()
|
|
482
463
|
|
|
483
464
|
task = asyncio.create_task(_detect_page(text_content, page_index))
|
|
484
465
|
pending_tasks.add(task)
|
|
@@ -652,7 +633,7 @@ class DetectorPipeline:
|
|
|
652
633
|
detected_at = datetime.now(UTC)
|
|
653
634
|
|
|
654
635
|
for i, (detector, result) in enumerate(zip(runnable_detectors, results, strict=False)):
|
|
655
|
-
detector_name = detector
|
|
636
|
+
detector_name = self._detector_log_label(detector)
|
|
656
637
|
via = task_via[i]
|
|
657
638
|
loc = f"{asset_name}:{page_tag}" if page_tag else asset_name
|
|
658
639
|
|