classifyre-cli 0.4.8__tar.gz → 0.4.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/.turbo/turbo-build.log +1 -1
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/PKG-INFO +1 -1
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/package.json +1 -1
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/pyproject.toml +7 -1
- classifyre_cli-0.4.10/src/detectors/custom/runners/_llm.py +230 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/models/generated_detectors.py +143 -5
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/outputs/rest.py +71 -0
- classifyre_cli-0.4.10/tests/detectors/custom/test_llm_runner.py +157 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_outputs.py +3 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/uv.lock +446 -167
- classifyre_cli-0.4.8/src/detectors/custom/runners/_llm.py +0 -22
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/.gitignore +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/.python-version +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/README.md +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/main.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/scripts/generate_models.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/base.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/broken_links/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/broken_links/detector.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/config.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/detector.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/extractor.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/runners/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/runners/_base.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/runners/_factory.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/runners/_feature_extraction.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/runners/_gliner2.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/runners/_image_classification.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/runners/_object_detection.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/runners/_regex.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/runners/_text_classification.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/custom/trainer.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/dependencies.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/pii/detector.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/secrets/detector.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/threat/code_security_detector.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/detectors/threat/yara_detector.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/main.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/models/generated_input.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/models/generated_single_asset_scan_results.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/outputs/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/outputs/base.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/outputs/console.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/outputs/factory.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/outputs/file.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/pipeline/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/pipeline/content_provider.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/pipeline/detector_pipeline.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/pipeline/parsed_content_provider.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/pipeline/worker_pool.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sandbox/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sandbox/runner.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/atlassian_common.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/azure_blob_storage/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/azure_blob_storage/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/base.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/confluence/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/confluence/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/databricks/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/databricks/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/dependencies.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/google_cloud_storage/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/google_cloud_storage/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/hive/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/hive/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/jira/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/jira/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/mongodb/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/mongodb/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/mssql/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/mssql/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/mysql/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/mysql/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/neo4j/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/neo4j/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/object_storage/base.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/oracle/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/oracle/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/postgresql/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/postgresql/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/powerbi/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/powerbi/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/s3_compatible_storage/README.md +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/s3_compatible_storage/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/s3_compatible_storage/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/servicedesk/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/servicedesk/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/slack/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/slack/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/snowflake/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/snowflake/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/sqlite/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/sqlite/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/tableau/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/tableau/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/tabular_base.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/tabular_utils.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/wordpress/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/sources/wordpress/source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/telemetry.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/utils/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/utils/content_extraction.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/utils/file_parser.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/utils/hashing.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/utils/uv_sync.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/src/utils/validation.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/conftest.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/conftest.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/custom/conftest.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/custom/test_invoice_extraction.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/custom/test_pipeline_integration.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/custom/test_regex_runner.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/custom/test_transformer_runners.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/pii/conftest.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/pii/sample_invoice.pdf +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/pii/test_pii_detector.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/secrets/test_secrets_detector.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/test_base_detector.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/test_detector_catalog_commercial.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/test_detector_pipeline_types.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/test_detector_schema_examples.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/test_detector_types.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/test_phase2_detectors.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/test_registry.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/threat/test_code_security_detector.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/detectors/threat/test_yara_detector.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/integration/test_wordpress_links_assets.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/pipeline/test_detector_pipeline.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/pipeline/test_worker_pool.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_azure_blob_storage_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_base_source_attachment.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_base_source_sampling.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_confluence_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_custom_extractor.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_databricks_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_google_cloud_storage_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_hashing.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_hive_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_jira_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_mongodb_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_mssql_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_mysql_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_neo4j_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_oracle_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_postgresql_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_powerbi_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_s3_compatible_storage_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_servicedesk_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_slack_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_snowflake_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_source_dependency_groups.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_sqlite_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_tableau_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_tabular_utils.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/test_wordpress_source.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/utils/test_content_extraction.py +0 -0
- {classifyre_cli-0.4.8 → classifyre_cli-0.4.10}/tests/utils/test_file_parser.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "classifyre-cli"
|
|
3
|
-
version = "0.4.8"
|
|
3
|
+
version = "0.4.10"
|
|
4
4
|
description = "Classifyre CLI — scan and classify unstructured data sources"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.12"
|
|
@@ -91,6 +91,9 @@ custom = [
|
|
|
91
91
|
regex = [
|
|
92
92
|
"google-re2>=1.1",
|
|
93
93
|
]
|
|
94
|
+
llm = [
|
|
95
|
+
"litellm>=1.86.2",
|
|
96
|
+
]
|
|
94
97
|
detectors = [
|
|
95
98
|
{ include-group = "file-processing" },
|
|
96
99
|
{ include-group = "privacy" },
|
|
@@ -101,6 +104,7 @@ detectors = [
|
|
|
101
104
|
{ include-group = "classification" },
|
|
102
105
|
{ include-group = "custom" },
|
|
103
106
|
{ include-group = "regex" },
|
|
107
|
+
{ include-group = "llm" },
|
|
104
108
|
]
|
|
105
109
|
file-processing = [
|
|
106
110
|
"filetype>=1.2.0",
|
|
@@ -264,6 +268,8 @@ module = [
|
|
|
264
268
|
"datasets",
|
|
265
269
|
"setfit.*",
|
|
266
270
|
"setfit",
|
|
271
|
+
"litellm.*",
|
|
272
|
+
"litellm",
|
|
267
273
|
"sklearn.*",
|
|
268
274
|
"sklearn",
|
|
269
275
|
"numpy",
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"""AI/LLM pipeline runner — prompt-driven classification and field extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
from datetime import UTC, datetime
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
# Quiet litellm's import-time provider preload warnings (bedrock/sagemaker need
|
|
12
|
+
# botocore, which we don't install) before the library is ever imported.
|
|
13
|
+
os.environ.setdefault("LITELLM_LOG", "ERROR")
|
|
14
|
+
|
|
15
|
+
from ....models.generated_detectors import LLMPipelineSchema, Severity
|
|
16
|
+
from ....models.generated_single_asset_scan_results import (
|
|
17
|
+
DetectionResult,
|
|
18
|
+
DetectorType,
|
|
19
|
+
)
|
|
20
|
+
from ...dependencies import require_module
|
|
21
|
+
from ._base import _TEXT_CONTENT_TYPES, BaseRunner, _resolve_pipeline_severity
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
# Map the stored AI provider type onto the litellm model-string convention.
|
|
26
|
+
_PROVIDER_PREFIX: dict[str, str] = {
|
|
27
|
+
"CLAUDE": "anthropic",
|
|
28
|
+
"GEMINI": "gemini",
|
|
29
|
+
"OPENAI_COMPATIBLE": "openai",
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class LLMRunner(BaseRunner):
    """AI detector — sends content to a configured LLM provider for classification + extraction.

    The runner builds a system prompt from the pipeline schema (labels, output
    fields, optional response example), asks the provider for a JSON object and
    turns every returned label that clears the confidence threshold into a
    ``DetectionResult``. Extracted fields are attached to each result.
    """

    def __init__(
        self, schema: LLMPipelineSchema, detector_key: str = "", detector_name: str = ""
    ) -> None:
        """Store the schema and configure litellm.

        Args:
            schema: Pipeline definition; must carry ``provider_runtime``.
            detector_key: Key copied onto every emitted result.
            detector_name: Display name copied onto every emitted result.

        Raises:
            ValueError: If ``schema.provider_runtime`` is ``None``.
        """
        self._schema = schema
        self._detector_key = detector_key
        self._detector_name = detector_name

        runtime = schema.provider_runtime
        if runtime is None:
            raise ValueError(
                f"AI detector '{detector_key}' is missing provider_runtime — the API must "
                "inject resolved provider credentials before dispatch."
            )
        self._runtime = runtime
        self._litellm = require_module("litellm", "llm", ["llm"])
        # Let litellm silently drop params an endpoint doesn't support (e.g.
        # response_format / temperature on some OpenAI-compatible gateways)
        # instead of raising. Keep its own logging quiet.
        self._litellm.drop_params = True
        self._litellm.suppress_debug_info = True
        logging.getLogger("LiteLLM").setLevel(logging.ERROR)

    def run(self, text: str) -> None:  # type: ignore[override]  # pragma: no cover
        """Unsupported entry point; this runner is driven through ``detect()``."""
        raise NotImplementedError("LLMRunner uses detect() directly")

    def detect(self, content: str | bytes, content_type: str) -> list[DetectionResult]:
        """Classify ``content`` with the configured LLM.

        Args:
            content: Asset content. ``bytes`` input is ignored.
            content_type: Content type; must be one of ``_TEXT_CONTENT_TYPES``.

        Returns:
            Findings sorted by descending confidence. Empty when the content is
            not text, is blank, or the provider call / response parsing fails.
        """
        if isinstance(content, bytes):
            return []
        if content_type not in _TEXT_CONTENT_TYPES:
            return []
        text = content.strip()
        if not text:
            return []

        schema = self._schema
        content_limit = schema.content_limit or 8000
        snippet = text[:content_limit]

        messages = [
            {"role": "system", "content": self._build_system_prompt()},
            {"role": "user", "content": snippet},
        ]

        # Best-effort boundary: any provider/transport failure is logged and
        # reported as "no findings" rather than propagated to the caller.
        try:
            response = self._litellm.completion(
                model=self._model_string(),
                api_key=self._runtime.api_key,
                api_base=self._runtime.base_url or None,
                temperature=schema.temperature if schema.temperature is not None else 0.0,
                max_tokens=self._max_tokens(),
                messages=messages,
                response_format={"type": "json_object"},
            )
            raw = response.choices[0].message.content or "{}"
            parsed = self._parse_json(raw)
        except Exception as exc:
            logger.error(
                "llm detector error (detector=%s, model=%s): %s",
                self._detector_key,
                self._runtime.model,
                exc,
                exc_info=True,
            )
            return []

        return self._results_from_payload(snippet, parsed)

    def get_supported_content_types(self) -> list[str]:
        """Return the text content types this runner accepts."""
        return list(_TEXT_CONTENT_TYPES)

    # ── Internals ────────────────────────────────────────────────────────────

    def _max_tokens(self) -> int | None:
        """Return the configured token cap as a plain value, or ``None`` when unset."""
        # `max_tokens` is generated as a RootModel[int] wrapper, so unwrap `.root`
        # before handing it to litellm — passing the model object serialises to an
        # invalid request body and fails the whole completion.
        raw = self._schema.max_tokens
        if raw is None:
            return None
        return getattr(raw, "root", raw)

    def _model_string(self) -> str:
        """Build the ``<prefix>/<model>`` string; unknown providers use ``openai``."""
        prefix = _PROVIDER_PREFIX.get(self._runtime.provider.value, "openai")
        return f"{prefix}/{self._runtime.model}"

    def _build_system_prompt(self) -> str:
        """Assemble the system prompt from the schema.

        Sections, joined by blank lines: the user's system prompt, the label
        list (if any), the output field list (if any), the fixed JSON response
        contract, and the optional response example.
        """
        schema = self._schema
        parts: list[str] = [schema.system_prompt.strip()]

        labels = schema.labels or []
        if labels:
            label_lines = "\n".join(
                f"- {lbl.name}: {lbl.description}" if lbl.description else f"- {lbl.name}"
                for lbl in labels
            )
            parts.append(
                "Classify the content using these labels:\n"
                + label_lines
                + (
                    "\nMultiple labels may apply."
                    if schema.multi_label
                    else "\nChoose the single best label."
                )
            )

        fields = schema.output_fields or []
        if fields:
            field_lines = "\n".join(
                f"- {f.name} ({f.type.value if f.type else 'string'}): {f.description}"
                if f.description
                else f"- {f.name} ({f.type.value if f.type else 'string'})"
                for f in fields
            )
            parts.append("Also extract these fields:\n" + field_lines)

        parts.append(
            "Respond with a JSON object of the form: "
            '{"labels": [{"name": "<label>", "confidence": <0-1>, '
            '"matched_content": "<relevant snippet>"}], "fields": {<field name>: <value>}}. '
            "Use only the labels listed above. Return an empty labels array when none apply."
        )

        if schema.response_example:
            parts.append("Example response:\n" + schema.response_example.strip())

        return "\n\n".join(parts)

    @staticmethod
    def _parse_json(raw: str) -> dict[str, Any]:
        """Parse the model reply into a dict, tolerating text around the JSON.

        Attempts, in order: the whole string; the slice from the first ``{`` to
        the last ``}``; the first JSON value starting at the first ``{``. The
        last step recovers replies such as ``{"labels": []} Note: {...}`` where
        trailing text contains braces.

        Returns:
            The decoded object, or ``{}`` if nothing decodes to a dict.
        """
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError:
            start = raw.find("{")
            end = raw.rfind("}")
            # `end <= start` also covers a missing closing brace (end == -1).
            if start == -1 or end <= start:
                return {}
            try:
                parsed = json.loads(raw[start : end + 1])
            except json.JSONDecodeError:
                try:
                    parsed, _ = json.JSONDecoder().raw_decode(raw, start)
                except json.JSONDecodeError:
                    return {}
        return parsed if isinstance(parsed, dict) else {}

    @staticmethod
    def _coerce_confidence(raw: Any) -> float:
        """Convert a model-supplied confidence to ``float`` without raising.

        Falsy values, non-numeric values (e.g. ``"high"``, a list) and NaN all
        become ``0.0`` so that one malformed entry cannot abort the whole run.
        """
        import math  # function-scope import: the module import block is left untouched

        try:
            value = float(raw or 0.0)
        except (TypeError, ValueError):
            return 0.0
        # NaN compares False against the threshold and would otherwise be
        # clamped to a 0.99-confidence finding.
        return 0.0 if math.isnan(value) else value

    def _results_from_payload(self, snippet: str, payload: dict[str, Any]) -> list[DetectionResult]:
        """Turn a parsed model reply into detection results.

        Args:
            snippet: The text that was sent to the model; its first 320
                characters are used when an entry has no ``matched_content``.
            payload: Parsed reply; ``labels`` and ``fields`` are read from it.

        Returns:
            One result per label entry that has a name and a confidence at or
            above the threshold, sorted by descending confidence.
        """
        schema = self._schema
        threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.5
        default_severity = schema.severity or Severity.info
        extracted = self._coerce_fields(payload.get("fields"))

        raw_labels = payload.get("labels")
        label_entries: list[dict[str, Any]] = (
            [lbl for lbl in raw_labels if isinstance(lbl, dict)]
            if isinstance(raw_labels, list)
            else []
        )

        results: list[DetectionResult] = []
        for entry in label_entries:
            raw_name = entry.get("name")
            # A JSON null must not become the literal label "None".
            label = "" if raw_name is None else str(raw_name).strip()
            if not label:
                continue
            # A missing confidence still defaults to 1.0.
            confidence = self._coerce_confidence(entry.get("confidence", 1.0))
            if confidence < threshold:
                continue
            severity = _resolve_pipeline_severity(label, schema.severity_map, default_severity)
            matched = str(entry.get("matched_content") or "").strip() or snippet[:320]
            results.append(
                DetectionResult(
                    detector_type=DetectorType.CUSTOM,
                    finding_type=f"llm:{label}",
                    category="CLASSIFICATION",
                    severity=severity,
                    confidence=min(0.99, confidence),
                    matched_content=matched,
                    location=None,
                    custom_detector_key=self._detector_key,
                    custom_detector_name=self._detector_name,
                    detected_at=datetime.now(UTC),
                    metadata={
                        "runner": "LLM",
                        "provider": self._runtime.provider.value,
                        "model": self._runtime.model,
                        "label": label,
                        "fields": extracted,
                    },
                    extracted_data=extracted or None,
                    extraction_method="LLM",
                )
            )

        results.sort(key=lambda r: r.confidence, reverse=True)
        return results

    @staticmethod
    def _coerce_fields(raw: Any) -> dict[str, Any]:
        """Return ``raw`` as a dict with string keys, or ``{}`` if it is not a dict."""
        return {str(k): v for k, v in raw.items()} if isinstance(raw, dict) else {}
|
|
@@ -189,7 +189,7 @@ class DetectorCatalog(RootModel[list[DetectorCatalogEntry]]):
|
|
|
189
189
|
'categories': ['CLASSIFICATION', 'COMPLIANCE'],
|
|
190
190
|
'supported_asset_types': ['TXT', 'TABLE', 'URL', 'IMAGE'],
|
|
191
191
|
'recommended_model': 'mDeBERTa-v3 + SetFit + GLiNER + HuggingFace transformers',
|
|
192
|
-
'notes': 'User-defined rules and pipelines tailored to specific business needs. Supports regex, GLiNER2, LLM, text classification, image classification, feature extraction, and object detection pipelines.',
|
|
192
|
+
'notes': 'User-defined rules and pipelines tailored to specific business needs. Supports regex, GLiNER2, AI/LLM (prompt-driven classification + extraction via a configured provider), text classification, image classification, feature extraction, and object detection pipelines.',
|
|
193
193
|
},
|
|
194
194
|
],
|
|
195
195
|
description='Detector capability catalog used for planning and runtime routing',
|
|
@@ -954,18 +954,156 @@ class RegexPipelineSchema(BaseModel):
|
|
|
954
954
|
validation: PipelineValidationConfig | None = None
|
|
955
955
|
|
|
956
956
|
|
|
957
|
+
class LLMLabelDefinition(BaseModel):
|
|
958
|
+
"""
|
|
959
|
+
One classification label the AI detector may assign to content.
|
|
960
|
+
"""
|
|
961
|
+
|
|
962
|
+
model_config = ConfigDict(
|
|
963
|
+
extra='forbid',
|
|
964
|
+
)
|
|
965
|
+
name: str = Field(
|
|
966
|
+
...,
|
|
967
|
+
description="Label name returned by the model (e.g. 'good', 'bad', 'violent').",
|
|
968
|
+
)
|
|
969
|
+
description: str | None = Field(
|
|
970
|
+
'', description='Guidance describing when this label applies.'
|
|
971
|
+
)
|
|
972
|
+
|
|
973
|
+
|
|
957
974
|
class Type3(StrEnum):
|
|
975
|
+
string = 'string'
|
|
976
|
+
number = 'number'
|
|
977
|
+
boolean = 'boolean'
|
|
978
|
+
list_string_ = 'list[string]'
|
|
979
|
+
list_number_ = 'list[number]'
|
|
980
|
+
|
|
981
|
+
|
|
982
|
+
class LLMOutputField(BaseModel):
|
|
983
|
+
"""
|
|
984
|
+
One structured property the AI detector extracts and stores in finding metadata and extracted_data.
|
|
985
|
+
"""
|
|
986
|
+
|
|
987
|
+
model_config = ConfigDict(
|
|
988
|
+
extra='forbid',
|
|
989
|
+
)
|
|
990
|
+
name: str = Field(
|
|
991
|
+
..., description='Output field name — becomes a key in extracted_data JSON.'
|
|
992
|
+
)
|
|
993
|
+
description: str | None = Field(
|
|
994
|
+
'', description='Hint for what this field captures.'
|
|
995
|
+
)
|
|
996
|
+
type: Type3 | None = 'string'
|
|
997
|
+
|
|
998
|
+
|
|
999
|
+
class Provider(StrEnum):
|
|
1000
|
+
"""
|
|
1001
|
+
Resolved AI provider type.
|
|
1002
|
+
"""
|
|
1003
|
+
|
|
1004
|
+
OPENAI_COMPATIBLE = 'OPENAI_COMPATIBLE'
|
|
1005
|
+
CLAUDE = 'CLAUDE'
|
|
1006
|
+
GEMINI = 'GEMINI'
|
|
1007
|
+
|
|
1008
|
+
|
|
1009
|
+
class LLMProviderRuntime(BaseModel):
|
|
1010
|
+
"""
|
|
1011
|
+
Runtime-only provider credentials injected by the API at dispatch time. Never persisted with the detector config and rejected on create/update.
|
|
1012
|
+
"""
|
|
1013
|
+
|
|
1014
|
+
model_config = ConfigDict(
|
|
1015
|
+
extra='forbid',
|
|
1016
|
+
)
|
|
1017
|
+
provider: Provider = Field(..., description='Resolved AI provider type.')
|
|
1018
|
+
model: str = Field(
|
|
1019
|
+
...,
|
|
1020
|
+
description='Resolved model identifier (e.g. gpt-4o, claude-sonnet-4-5, gemini-2.0-flash).',
|
|
1021
|
+
)
|
|
1022
|
+
api_key: str = Field(..., description='Decrypted provider API key.')
|
|
1023
|
+
base_url: str | None = Field(
|
|
1024
|
+
None,
|
|
1025
|
+
description='Base URL for OpenAI-compatible endpoints. Null for managed providers.',
|
|
1026
|
+
)
|
|
1027
|
+
context_size: int | None = Field(
|
|
1028
|
+
None, description='Optional context window size configured for the provider.'
|
|
1029
|
+
)
|
|
1030
|
+
|
|
1031
|
+
|
|
1032
|
+
class Type4(StrEnum):
|
|
958
1033
|
LLM = 'LLM'
|
|
959
1034
|
|
|
960
1035
|
|
|
1036
|
+
class MaxTokens(RootModel[int]):
|
|
1037
|
+
root: int = Field(
|
|
1038
|
+
None,
|
|
1039
|
+
description='Maximum tokens to generate. Provider default when null.',
|
|
1040
|
+
ge=1,
|
|
1041
|
+
)
|
|
1042
|
+
|
|
1043
|
+
|
|
961
1044
|
class LLMPipelineSchema(BaseModel):
    """
    AI detector pipeline. Sends content to a configured LLM provider with a system prompt, classifies it against a label set, and extracts structured fields. Predicted labels become findings (severity via severity_map); extracted fields are stored in finding metadata and extracted_data.
    """

    # Unknown keys are rejected rather than silently ignored.
    model_config = ConfigDict(extra='forbid')

    # Fixed tag for this pipeline variant.
    type: Literal['LLM'] = 'LLM'

    # --- Prompting -------------------------------------------------------
    system_prompt: str = Field(
        default=...,
        description='Instruction describing what the model should detect, classify, and extract.',
    )
    response_example: str | None = Field(
        default=None,
        description='Optional few-shot example of the JSON the model should return.',
    )

    # --- Generation parameters -------------------------------------------
    temperature: float | None = Field(
        default=0.0,
        ge=0.0,
        le=2.0,
        description='Sampling temperature. Lower is more deterministic.',
    )
    max_tokens: MaxTokens | None = Field(
        default=None,
        description='Maximum tokens to generate. Provider default when null.',
    )

    # --- Classification ---------------------------------------------------
    labels: list[LLMLabelDefinition] | None = Field(
        default=[],
        validate_default=True,
        description='Classification taxonomy the model assigns to content.',
    )
    multi_label: bool | None = Field(
        default=False,
        description='Allow more than one label per asset.',
    )

    # --- Severity and confidence -----------------------------------------
    severity: Severity | None = Field(
        default='info',
        description='Default severity when no severity_map rule matches a predicted label.',
    )
    severity_map: list[PipelineSeverityRule] | None = Field(
        default=None,
        description='Ordered rules mapping predicted labels to severity levels. First matching rule wins.',
    )
    confidence_threshold: float | None = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description='Minimum model confidence to report a label as a finding (0-1).',
    )

    # --- Extraction and input sizing -------------------------------------
    output_fields: list[LLMOutputField] | None = Field(
        default=[],
        validate_default=True,
        description='Structured properties the model extracts. Stored in finding metadata and extracted_data.',
    )
    content_limit: int | None = Field(
        default=8000,
        ge=1,
        description='Maximum characters of content sent to the model.',
    )

    # --- Dispatch-time credentials ---------------------------------------
    provider_runtime: LLMProviderRuntime | None = Field(
        default=None,
        description='Runtime-only credentials injected by the API at dispatch. Never persisted; rejected on create/update.',
    )
|
|
966
1104
|
|
|
967
1105
|
|
|
968
|
-
class
|
|
1106
|
+
class Type5(StrEnum):
    # Single-member string enum; its only value is the literal
    # 'TEXT_CLASSIFICATION'.
    TEXT_CLASSIFICATION = 'TEXT_CLASSIFICATION'
|
|
970
1108
|
|
|
971
1109
|
|
|
@@ -1055,7 +1193,7 @@ class TextClassificationPipelineSchema(BaseModel):
|
|
|
1055
1193
|
)
|
|
1056
1194
|
|
|
1057
1195
|
|
|
1058
|
-
class
|
|
1196
|
+
class Type6(StrEnum):
    # Single-member string enum; its only value is the literal
    # 'IMAGE_CLASSIFICATION'.
    IMAGE_CLASSIFICATION = 'IMAGE_CLASSIFICATION'
|
|
1060
1198
|
|
|
1061
1199
|
|
|
@@ -1108,7 +1246,7 @@ class ImageClassificationPipelineSchema(BaseModel):
|
|
|
1108
1246
|
)
|
|
1109
1247
|
|
|
1110
1248
|
|
|
1111
|
-
class
|
|
1249
|
+
class Type7(StrEnum):
    # Single-member string enum; its only value is the literal
    # 'FEATURE_EXTRACTION'.
    FEATURE_EXTRACTION = 'FEATURE_EXTRACTION'
|
|
1113
1251
|
|
|
1114
1252
|
|
|
@@ -1180,7 +1318,7 @@ class FeatureExtractionPipelineSchema(BaseModel):
|
|
|
1180
1318
|
)
|
|
1181
1319
|
|
|
1182
1320
|
|
|
1183
|
-
class
|
|
1321
|
+
class Type8(StrEnum):
    # Single-member string enum; its only value is the literal
    # 'OBJECT_DETECTION'.
    OBJECT_DETECTION = 'OBJECT_DETECTION'
|
|
1185
1323
|
|
|
1186
1324
|
|
|
@@ -1,17 +1,85 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
+
import random
|
|
4
5
|
from typing import Any, Literal, cast
|
|
5
6
|
from urllib.parse import urljoin
|
|
6
7
|
|
|
7
8
|
import requests # type: ignore[import-untyped]
|
|
8
9
|
from pydantic import BaseModel, ConfigDict, Field
|
|
10
|
+
from requests.adapters import HTTPAdapter
|
|
11
|
+
from urllib3.util.retry import Retry # type: ignore[import-untyped]
|
|
9
12
|
|
|
10
13
|
from .base import OutputRuntimeContext, OutputType
|
|
11
14
|
|
|
12
15
|
logger = logging.getLogger(__name__)
|
|
13
16
|
|
|
14
17
|
|
|
18
|
+
class _JitteredRetry(Retry):
    """urllib3 ``Retry`` variant whose backoff delay is randomised by ±25 %.

    Spreading the delay keeps several concurrent CLI jobs from retrying at
    the very same instant (thundering-herd mitigation).

    The random factor is applied to the value returned by the parent class,
    i.e. after the exponential formula and the ``backoff_max`` cap have been
    evaluated, so the final delay never exceeds ``backoff_max * 1.25``.
    """

    # Half-width of the jitter window, as a fraction of the base delay.
    _JITTER_FACTOR: float = 0.25

    def get_backoff_time(self) -> float:  # type: ignore[override]
        """Return the parent's backoff delay, jittered uniformly by ±25 %.

        A base delay of zero is passed through unchanged as ``0.0`` so an
        immediate retry stays immediate.
        """
        delay = super().get_backoff_time()
        if delay == 0:
            return 0.0

        spread = self._JITTER_FACTOR
        shortest = delay * (1 - spread)
        longest = delay * (1 + spread)
        return random.uniform(shortest, longest)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# Retry policy for CLI → API REST calls.
|
|
40
|
+
#
|
|
41
|
+
# What we retry and why:
|
|
42
|
+
# connect=8 — pod restarted / not yet ready (RemoteDisconnected, ConnectionReset,
|
|
43
|
+
# ConnectTimeout). Request never reached the application.
|
|
44
|
+
# read=8 — API is under load and slow to respond (ReadTimeout). Safe to retry
|
|
45
|
+
# because all endpoints are idempotent (bulk ingest is upsert-based,
|
|
46
|
+
# status/findings updates are set-operations).
|
|
47
|
+
# status=8 — transient HTTP errors from an overloaded or restarting API:
|
|
48
|
+
# 408 Request Timeout - API-level timeout
|
|
49
|
+
# 429 Too Many Requests - rate-limited / backpressure
|
|
50
|
+
# 502 Bad Gateway - proxy has no upstream yet
|
|
51
|
+
# 503 Service Unavail. - under-pressure / pod not ready
|
|
52
|
+
# 504 Gateway Timeout - upstream took too long
|
|
53
|
+
#
|
|
54
|
+
# backoff_factor=2, backoff_max=60: exponential cap at 60 s, with ±25 % jitter
|
|
55
|
+
# (see _JitteredRetry). Approximate wait schedule between attempts:
|
|
56
|
+
# retry 1 → immediate (0 s)
|
|
57
|
+
# retry 2 → ~4 s
|
|
58
|
+
# retry 3 → ~8 s
|
|
59
|
+
# retry 4 → ~16 s
|
|
60
|
+
# retry 5 → ~32 s
|
|
61
|
+
# retry 6 → ~60 s (capped)
|
|
62
|
+
# retry 7 → ~60 s (capped)
|
|
63
|
+
# retry 8 → ~60 s (capped)
|
|
64
|
+
# Total extra wait: ~240 s (~4 min) — covers extended load spikes on a
|
|
65
|
+
# single-node VPS before event-loop pressure drops. Worst-case a single
|
|
66
|
+
# call costs 9 * 120 s + 240 s = ~22 min, acceptable for long-running scans.
|
|
67
|
+
#
|
|
68
|
+
# POST and PATCH are explicitly allowed: without this urllib3 only retries
|
|
69
|
+
# idempotent methods (GET/HEAD) by default.
|
|
70
|
+
# Shared retry policy; RestOutputSink mounts it on the session's http:// and
# https:// adapters.
_RETRY_POLICY = _JitteredRetry(
    # Eligibility: every HTTP verb may be retried (urllib3 would otherwise
    # skip POST/PATCH), and these response codes trigger a retry.
    allowed_methods={"GET", "POST", "PUT", "PATCH", "DELETE", "HEAD", "OPTIONS"},
    status_forcelist={408, 429, 502, 503, 504},
    # Budgets: up to 8 retries overall and per failure category.
    total=8,
    connect=8,
    read=8,
    status=8,
    # Exponential backoff, capped at 60 s before jitter is applied.
    backoff_factor=2,
    backoff_max=60,
    # When status retries run out, return the last response instead of
    # raising, so the caller decides how to handle the final status code.
    raise_on_status=False,
)
|
|
81
|
+
|
|
82
|
+
|
|
15
83
|
def _drop_none_recursive(value: Any) -> Any:
|
|
16
84
|
if isinstance(value, dict):
|
|
17
85
|
return {key: _drop_none_recursive(item) for key, item in value.items() if item is not None}
|
|
@@ -63,6 +131,9 @@ class RestOutputSink:
|
|
|
63
131
|
self.base_url = base_url.rstrip("/")
|
|
64
132
|
self.timeout_sec = timeout_sec
|
|
65
133
|
self.session = requests.Session()
|
|
134
|
+
adapter = HTTPAdapter(max_retries=_RETRY_POLICY)
|
|
135
|
+
self.session.mount("http://", adapter)
|
|
136
|
+
self.session.mount("https://", adapter)
|
|
66
137
|
self._runner_id = context.runner_id
|
|
67
138
|
self._seen_hashes: set[str] = set()
|
|
68
139
|
|