classifyre-cli 0.4.2__tar.gz → 0.4.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.4/.turbo/turbo-build.log +3 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/PKG-INFO +1 -1
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/package.json +1 -1
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/pyproject.toml +1 -1
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/detector.py +6 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/secrets/detector.py +3 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/threat/code_security_detector.py +3 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/threat/yara_detector.py +8 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/main.py +105 -68
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/models/generated_input.py +63 -5
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/pipeline/detector_pipeline.py +353 -182
- classifyre_cli-0.4.4/src/pipeline/worker_pool.py +294 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/azure_blob_storage/source.py +3 -6
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/confluence/source.py +0 -7
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/databricks/source.py +287 -672
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/google_cloud_storage/source.py +2 -8
- classifyre_cli-0.4.4/src/sources/hive/source.py +304 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/jira/source.py +0 -7
- classifyre_cli-0.4.4/src/sources/mssql/source.py +621 -0
- classifyre_cli-0.4.4/src/sources/mysql/source.py +303 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/object_storage/base.py +3 -16
- classifyre_cli-0.4.4/src/sources/oracle/source.py +632 -0
- classifyre_cli-0.4.4/src/sources/postgresql/source.py +214 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/s3_compatible_storage/source.py +3 -10
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/servicedesk/source.py +0 -7
- classifyre_cli-0.4.4/src/sources/snowflake/source.py +624 -0
- classifyre_cli-0.4.4/src/sources/sqlite/source.py +212 -0
- classifyre_cli-0.4.4/src/sources/tabular_base.py +793 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/tabular_utils.py +36 -0
- classifyre_cli-0.4.4/tests/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/pipeline/test_detector_pipeline.py +5 -8
- classifyre_cli-0.4.4/tests/pipeline/test_worker_pool.py +480 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_azure_blob_storage_source.py +0 -1
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_databricks_source.py +9 -9
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_google_cloud_storage_source.py +0 -1
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_hive_source.py +8 -8
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_mssql_source.py +5 -5
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_mysql_source.py +8 -8
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_oracle_source.py +27 -41
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_postgresql_source.py +3 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_s3_compatible_storage_source.py +2 -5
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_snowflake_source.py +2 -2
- classifyre_cli-0.4.4/tests/test_sqlite_source.py +336 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/uv.lock +6 -2
- classifyre_cli-0.4.2/.turbo/turbo-build.log +0 -3
- classifyre_cli-0.4.2/src/sources/hive/source.py +0 -709
- classifyre_cli-0.4.2/src/sources/mssql/source.py +0 -1034
- classifyre_cli-0.4.2/src/sources/mysql/source.py +0 -797
- classifyre_cli-0.4.2/src/sources/oracle/source.py +0 -982
- classifyre_cli-0.4.2/src/sources/postgresql/source.py +0 -774
- classifyre_cli-0.4.2/src/sources/snowflake/source.py +0 -912
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/.gitignore +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/.python-version +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/README.md +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/main.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/scripts/generate_models.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/base.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/broken_links/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/broken_links/detector.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/config.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_base.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_factory.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_feature_extraction.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_gliner2.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_image_classification.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_llm.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_object_detection.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_regex.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_text_classification.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/custom/trainer.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/dependencies.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/pii/detector.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/models/generated_detectors.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/models/generated_single_asset_scan_results.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/outputs/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/outputs/base.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/outputs/console.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/outputs/factory.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/outputs/file.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/outputs/rest.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/pipeline/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/pipeline/content_provider.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/pipeline/parsed_content_provider.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sandbox/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sandbox/runner.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/atlassian_common.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/azure_blob_storage/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/base.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/confluence/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/databricks/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/dependencies.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/google_cloud_storage/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/hive/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/jira/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/mongodb/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/mongodb/source.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/mssql/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/mysql/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/neo4j/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/neo4j/source.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/oracle/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/postgresql/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/powerbi/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/powerbi/source.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/s3_compatible_storage/README.md +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/s3_compatible_storage/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/servicedesk/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/slack/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/slack/source.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/snowflake/__init__.py +0 -0
- {classifyre_cli-0.4.2/tests → classifyre_cli-0.4.4/src/sources/sqlite}/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/tableau/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/tableau/source.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/wordpress/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/sources/wordpress/source.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/telemetry.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/utils/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/utils/content_extraction.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/utils/file_parser.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/utils/hashing.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/utils/uv_sync.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/utils/validation.py +0 -0
- {classifyre_cli-0.4.2/tests/detectors → classifyre_cli-0.4.4/tests}/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/conftest.py +0 -0
- {classifyre_cli-0.4.2/tests/detectors/content → classifyre_cli-0.4.4/tests/detectors}/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/conftest.py +0 -0
- {classifyre_cli-0.4.2/tests/detectors/custom → classifyre_cli-0.4.4/tests/detectors/content}/__init__.py +0 -0
- {classifyre_cli-0.4.2/tests/detectors/pii → classifyre_cli-0.4.4/tests/detectors/custom}/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/custom/conftest.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/custom/test_invoice_extraction.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/custom/test_pipeline_integration.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/custom/test_regex_runner.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/custom/test_transformer_runners.py +0 -0
- {classifyre_cli-0.4.2/tests/detectors/secrets → classifyre_cli-0.4.4/tests/detectors/pii}/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/pii/conftest.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/pii/sample_invoice.pdf +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/pii/test_pii_detector.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
- {classifyre_cli-0.4.2/tests/detectors/threat → classifyre_cli-0.4.4/tests/detectors/secrets}/__init__.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/secrets/test_secrets_detector.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/test_base_detector.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/test_detector_catalog_commercial.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/test_detector_pipeline_types.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/test_detector_schema_examples.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/test_detector_types.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/test_phase2_detectors.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/test_registry.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/threat/test_code_security_detector.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/detectors/threat/test_yara_detector.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/integration/test_wordpress_links_assets.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_base_source_attachment.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_base_source_sampling.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_confluence_source.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_hashing.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_jira_source.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_mongodb_source.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_neo4j_source.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_outputs.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_powerbi_source.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_servicedesk_source.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_slack_source.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_source_dependency_groups.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_tableau_source.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_tabular_utils.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/test_wordpress_source.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/utils/test_content_extraction.py +0 -0
- {classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/tests/utils/test_file_parser.py +0 -0
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import asyncio
|
|
5
6
|
import logging
|
|
6
7
|
|
|
7
8
|
from ...models.generated_detectors import (
|
|
@@ -34,6 +35,11 @@ class CustomDetector(BaseDetector):
|
|
|
34
35
|
|
|
35
36
|
async def detect(
|
|
36
37
|
self, content: str | bytes, content_type: str = "text/plain"
|
|
38
|
+
) -> list[DetectionResult]:
|
|
39
|
+
return await asyncio.to_thread(self._detect_sync, content, content_type)
|
|
40
|
+
|
|
41
|
+
def _detect_sync(
|
|
42
|
+
self, content: str | bytes, content_type: str = "text/plain"
|
|
37
43
|
) -> list[DetectionResult]:
|
|
38
44
|
findings = self._runner.detect(content, content_type)
|
|
39
45
|
max_findings = self.custom_config.max_findings
|
|
@@ -5,6 +5,7 @@ plugin's ``analyze_line`` directly. No temp files, no global Settings state,
|
|
|
5
5
|
and no ``SecretsCollection`` needed.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
+
import asyncio
|
|
8
9
|
import importlib
|
|
9
10
|
import logging
|
|
10
11
|
import pkgutil
|
|
@@ -304,7 +305,9 @@ class SecretsDetector(BaseDetector):
|
|
|
304
305
|
len(content),
|
|
305
306
|
)
|
|
306
307
|
return []
|
|
308
|
+
return await asyncio.to_thread(self._detect_sync, content)
|
|
307
309
|
|
|
310
|
+
def _detect_sync(self, content: str) -> list[DetectionResult]:
|
|
308
311
|
plugins = self._build_plugins()
|
|
309
312
|
if not plugins:
|
|
310
313
|
return []
|
{classifyre_cli-0.4.2 → classifyre_cli-0.4.4}/src/detectors/threat/code_security_detector.py
RENAMED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Code security detector using Bandit static analysis."""
|
|
2
2
|
|
|
3
|
+
import asyncio
|
|
3
4
|
import json
|
|
4
5
|
import logging
|
|
5
6
|
import subprocess
|
|
@@ -137,7 +138,9 @@ class CodeSecurityDetector(BaseDetector):
|
|
|
137
138
|
return []
|
|
138
139
|
if not content.strip():
|
|
139
140
|
return []
|
|
141
|
+
return await asyncio.to_thread(self._detect_sync, content)
|
|
140
142
|
|
|
143
|
+
def _detect_sync(self, content: str) -> list[DetectionResult]:
|
|
141
144
|
threshold = self._cfg.confidence_threshold or 0.7
|
|
142
145
|
max_findings = self._cfg.max_findings or 25
|
|
143
146
|
findings: list[DetectionResult] = []
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""YARA-based threat detector — compiles structured rule objects into a live ruleset."""
|
|
2
2
|
|
|
3
|
+
import asyncio
|
|
3
4
|
import logging
|
|
4
5
|
import re
|
|
5
6
|
|
|
@@ -89,6 +90,13 @@ class YaraDetector(BaseDetector):
|
|
|
89
90
|
|
|
90
91
|
async def detect(
|
|
91
92
|
self, content: str | bytes, content_type: str = "text/plain"
|
|
93
|
+
) -> list[DetectionResult]:
|
|
94
|
+
if self._rules is None:
|
|
95
|
+
return []
|
|
96
|
+
return await asyncio.to_thread(self._detect_sync, content, content_type)
|
|
97
|
+
|
|
98
|
+
def _detect_sync(
|
|
99
|
+
self, content: str | bytes, content_type: str = "text/plain"
|
|
92
100
|
) -> list[DetectionResult]:
|
|
93
101
|
if self._rules is None:
|
|
94
102
|
return []
|
|
@@ -179,15 +179,32 @@ async def run_command_async(args: argparse.Namespace, recipe: dict[str, Any]) ->
|
|
|
179
179
|
sink_started = True
|
|
180
180
|
|
|
181
181
|
from .pipeline.detector_pipeline import DetectorPipeline
|
|
182
|
+
from .pipeline.worker_pool import (
|
|
183
|
+
DetectorWorkerPool,
|
|
184
|
+
compute_pool_workers,
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
pool_workers = compute_pool_workers(
|
|
188
|
+
override=args.max_pool_workers,
|
|
189
|
+
)
|
|
190
|
+
worker_pool: DetectorWorkerPool | None = None
|
|
182
191
|
|
|
183
192
|
pipeline = DetectorPipeline.from_recipe(
|
|
184
193
|
recipe,
|
|
185
194
|
source,
|
|
186
195
|
runner_id,
|
|
187
|
-
max_concurrent_assets=args.detector_max_concurrent,
|
|
188
196
|
)
|
|
189
197
|
has_detectors = bool(pipeline.detectors)
|
|
190
198
|
|
|
199
|
+
if has_detectors:
|
|
200
|
+
worker_pool = DetectorWorkerPool(max_workers=pool_workers)
|
|
201
|
+
pipeline = DetectorPipeline.from_recipe(
|
|
202
|
+
recipe,
|
|
203
|
+
source,
|
|
204
|
+
runner_id,
|
|
205
|
+
worker_pool=worker_pool,
|
|
206
|
+
)
|
|
207
|
+
|
|
191
208
|
# --- Phase 1: Discovery ---
|
|
192
209
|
source.set_discovery_only(True)
|
|
193
210
|
all_stubs: list[Any] = []
|
|
@@ -225,74 +242,84 @@ async def run_command_async(args: argparse.Namespace, recipe: dict[str, Any]) ->
|
|
|
225
242
|
if has_detectors and all_stubs:
|
|
226
243
|
import asyncio as _asyncio
|
|
227
244
|
|
|
228
|
-
workers = args.processing_workers
|
|
229
|
-
semaphore = _asyncio.Semaphore(workers)
|
|
230
245
|
processed_count = 0
|
|
246
|
+
_pw = worker_pool.max_workers if worker_pool else 4
|
|
247
|
+
max_concurrent = args.max_concurrent_assets or (_pw * 2)
|
|
248
|
+
max_concurrent = max(1, max_concurrent)
|
|
249
|
+
_asset_semaphore = _asyncio.Semaphore(max_concurrent)
|
|
250
|
+
logger.info(
|
|
251
|
+
"Phase 2 starting: %d assets, pool_workers=%s, max_concurrent_assets=%d",
|
|
252
|
+
len(all_stubs),
|
|
253
|
+
worker_pool.max_workers if worker_pool else "none",
|
|
254
|
+
max_concurrent,
|
|
255
|
+
)
|
|
231
256
|
error_count = 0
|
|
232
257
|
|
|
233
258
|
async def _process_one(asset: Any) -> None:
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
asset_hash = getattr(asset, "hash", None) or ""
|
|
237
|
-
try:
|
|
238
|
-
if hasattr(sink, "update_asset_status"):
|
|
239
|
-
await sink.update_asset_status(asset_hash, "PROCESSING")
|
|
240
|
-
|
|
241
|
-
async def _on_findings_flushed(partial: list[Any]) -> None:
|
|
242
|
-
# partial is the full accumulated findings list from the pipeline
|
|
243
|
-
stub_payload = _asset_to_payload(asset)
|
|
244
|
-
stub_payload["findings"] = [
|
|
245
|
-
f.model_dump(mode="json", exclude_none=True)
|
|
246
|
-
if hasattr(f, "model_dump")
|
|
247
|
-
else f
|
|
248
|
-
for f in partial
|
|
249
|
-
]
|
|
250
|
-
await sink.emit_batch([stub_payload], skip_findings=False)
|
|
251
|
-
if hasattr(sink, "update_asset_status"):
|
|
252
|
-
f_total, f_by_sev, f_by_det = _compute_findings_counts(
|
|
253
|
-
partial
|
|
254
|
-
)
|
|
255
|
-
await sink.update_asset_status(
|
|
256
|
-
asset_hash,
|
|
257
|
-
"PROCESSING",
|
|
258
|
-
findings_total=f_total,
|
|
259
|
-
findings_by_severity=f_by_sev,
|
|
260
|
-
findings_by_detector=f_by_det,
|
|
261
|
-
)
|
|
262
|
-
|
|
263
|
-
result = await pipeline.process_single_asset(
|
|
264
|
-
asset,
|
|
265
|
-
on_findings_flushed=_on_findings_flushed,
|
|
266
|
-
findings_flush_size=args.detector_flush_batch_size,
|
|
267
|
-
)
|
|
268
|
-
payload = _asset_to_payload(result)
|
|
269
|
-
await sink.emit_batch([payload], skip_findings=False)
|
|
259
|
+
async with _asset_semaphore:
|
|
260
|
+
await _process_one_inner(asset)
|
|
270
261
|
|
|
262
|
+
async def _process_one_inner(asset: Any) -> None:
|
|
263
|
+
nonlocal processed_count, error_count
|
|
264
|
+
asset_hash = getattr(asset, "hash", None) or ""
|
|
265
|
+
try:
|
|
266
|
+
if hasattr(sink, "update_asset_status"):
|
|
267
|
+
await sink.update_asset_status(asset_hash, "PROCESSING")
|
|
268
|
+
|
|
269
|
+
async def _on_findings_flushed(partial: list[Any]) -> None:
|
|
270
|
+
stub_payload = _asset_to_payload(asset)
|
|
271
|
+
stub_payload["findings"] = [
|
|
272
|
+
f.model_dump(mode="json", exclude_none=True)
|
|
273
|
+
if hasattr(f, "model_dump")
|
|
274
|
+
else f
|
|
275
|
+
for f in partial
|
|
276
|
+
]
|
|
277
|
+
await sink.emit_batch([stub_payload], skip_findings=False)
|
|
271
278
|
if hasattr(sink, "update_asset_status"):
|
|
272
279
|
f_total, f_by_sev, f_by_det = _compute_findings_counts(
|
|
273
|
-
|
|
280
|
+
partial
|
|
274
281
|
)
|
|
275
282
|
await sink.update_asset_status(
|
|
276
283
|
asset_hash,
|
|
277
|
-
"
|
|
284
|
+
"PROCESSING",
|
|
278
285
|
findings_total=f_total,
|
|
279
286
|
findings_by_severity=f_by_sev,
|
|
280
287
|
findings_by_detector=f_by_det,
|
|
281
288
|
)
|
|
282
289
|
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
290
|
+
result = await pipeline.process_single_asset(
|
|
291
|
+
asset,
|
|
292
|
+
on_findings_flushed=_on_findings_flushed,
|
|
293
|
+
findings_flush_size=args.detector_flush_batch_size,
|
|
294
|
+
)
|
|
295
|
+
payload = _asset_to_payload(result)
|
|
296
|
+
await sink.emit_batch([payload], skip_findings=False)
|
|
297
|
+
|
|
298
|
+
if hasattr(sink, "update_asset_status"):
|
|
299
|
+
f_total, f_by_sev, f_by_det = _compute_findings_counts(
|
|
300
|
+
result.findings or []
|
|
301
|
+
)
|
|
302
|
+
await sink.update_asset_status(
|
|
303
|
+
asset_hash,
|
|
304
|
+
"PROCESSED",
|
|
305
|
+
findings_total=f_total,
|
|
306
|
+
findings_by_severity=f_by_sev,
|
|
307
|
+
findings_by_detector=f_by_det,
|
|
308
|
+
)
|
|
309
|
+
|
|
310
|
+
source.evict_asset_cache(asset_hash)
|
|
311
|
+
processed_count += 1
|
|
312
|
+
except Exception as exc:
|
|
313
|
+
error_count += 1
|
|
314
|
+
logger.error("Asset %s failed: %s", asset_hash, exc)
|
|
315
|
+
if hasattr(sink, "update_asset_status"):
|
|
316
|
+
try:
|
|
317
|
+
error_msg = str(exc) or type(exc).__name__
|
|
318
|
+
await sink.update_asset_status(
|
|
319
|
+
asset_hash, "ERROR", error_msg
|
|
320
|
+
)
|
|
321
|
+
except Exception:
|
|
322
|
+
pass
|
|
296
323
|
|
|
297
324
|
tasks = [_asyncio.create_task(_process_one(a)) for a in all_stubs]
|
|
298
325
|
await _asyncio.gather(*tasks, return_exceptions=True)
|
|
@@ -340,6 +367,9 @@ async def run_command_async(args: argparse.Namespace, recipe: dict[str, Any]) ->
|
|
|
340
367
|
"Failed to mark sink failure: %s", sink_error, exc_info=True
|
|
341
368
|
)
|
|
342
369
|
raise
|
|
370
|
+
finally:
|
|
371
|
+
if worker_pool is not None:
|
|
372
|
+
worker_pool.shutdown(wait=True)
|
|
343
373
|
|
|
344
374
|
except Exception as e:
|
|
345
375
|
logger.debug("Traceback for %s failure:", args.command, exc_info=True)
|
|
@@ -527,16 +557,16 @@ def main() -> None:
|
|
|
527
557
|
help="How many detector-processed assets to accumulate before pushing findings to the API (default: 5, env: CLASSIFYRE_DETECTOR_FLUSH_BATCH_SIZE)",
|
|
528
558
|
)
|
|
529
559
|
parser.add_argument(
|
|
530
|
-
"--
|
|
560
|
+
"--max-pool-workers",
|
|
531
561
|
type=int,
|
|
532
562
|
default=None,
|
|
533
|
-
help="Max
|
|
563
|
+
help="Max OS processes in the detector pool. Auto-sized from CPU/memory when omitted (env: CLASSIFYRE_MAX_POOL_WORKERS)",
|
|
534
564
|
)
|
|
535
565
|
parser.add_argument(
|
|
536
|
-
"--
|
|
566
|
+
"--max-concurrent-assets",
|
|
537
567
|
type=int,
|
|
538
568
|
default=None,
|
|
539
|
-
help="
|
|
569
|
+
help="Max assets processed concurrently in Phase 2. Controls DB connection usage. Defaults to pool_workers*2 (env: CLASSIFYRE_MAX_CONCURRENT_ASSETS)",
|
|
540
570
|
)
|
|
541
571
|
|
|
542
572
|
args = parser.parse_args()
|
|
@@ -549,21 +579,19 @@ def main() -> None:
|
|
|
549
579
|
args.detector_flush_batch_size = 5
|
|
550
580
|
args.detector_flush_batch_size = max(args.detector_flush_batch_size, 1)
|
|
551
581
|
|
|
552
|
-
if args.
|
|
553
|
-
env_val = os.environ.get("
|
|
582
|
+
if args.max_pool_workers is None:
|
|
583
|
+
env_val = os.environ.get("CLASSIFYRE_MAX_POOL_WORKERS")
|
|
554
584
|
try:
|
|
555
|
-
args.
|
|
585
|
+
args.max_pool_workers = int(env_val) if env_val else None
|
|
556
586
|
except ValueError:
|
|
557
|
-
args.
|
|
558
|
-
args.detector_max_concurrent = max(args.detector_max_concurrent, 1)
|
|
587
|
+
args.max_pool_workers = None
|
|
559
588
|
|
|
560
|
-
if args.
|
|
561
|
-
env_val = os.environ.get("
|
|
589
|
+
if args.max_concurrent_assets is None:
|
|
590
|
+
env_val = os.environ.get("CLASSIFYRE_MAX_CONCURRENT_ASSETS")
|
|
562
591
|
try:
|
|
563
|
-
args.
|
|
592
|
+
args.max_concurrent_assets = int(env_val) if env_val else None
|
|
564
593
|
except ValueError:
|
|
565
|
-
args.
|
|
566
|
-
args.processing_workers = max(args.processing_workers, 1)
|
|
594
|
+
args.max_concurrent_assets = None
|
|
567
595
|
|
|
568
596
|
if args.debug:
|
|
569
597
|
logging.getLogger().setLevel(logging.DEBUG)
|
|
@@ -585,6 +613,15 @@ def main() -> None:
|
|
|
585
613
|
|
|
586
614
|
recipe = load_recipe(args.recipe)
|
|
587
615
|
|
|
616
|
+
# Resolve resource overrides from recipe when CLI args / env vars are not set
|
|
617
|
+
recipe_resources = recipe.get("resources") or {}
|
|
618
|
+
if args.max_pool_workers is None and isinstance(recipe_resources.get("max_pool_workers"), int):
|
|
619
|
+
args.max_pool_workers = recipe_resources["max_pool_workers"]
|
|
620
|
+
if args.max_concurrent_assets is None and isinstance(
|
|
621
|
+
recipe_resources.get("max_concurrent_assets"), int
|
|
622
|
+
):
|
|
623
|
+
args.max_concurrent_assets = recipe_resources["max_concurrent_assets"]
|
|
624
|
+
|
|
588
625
|
source_type = recipe.get("type", "").lower()
|
|
589
626
|
if not source_type:
|
|
590
627
|
logger.error(
|
|
@@ -41,6 +41,7 @@ class AssetType(StrEnum):
|
|
|
41
41
|
CONFLUENCE = 'CONFLUENCE'
|
|
42
42
|
JIRA = 'JIRA'
|
|
43
43
|
SERVICEDESK = 'SERVICEDESK'
|
|
44
|
+
SQLITE = 'SQLITE'
|
|
44
45
|
|
|
45
46
|
|
|
46
47
|
class SourceCategory(StrEnum):
|
|
@@ -163,15 +164,15 @@ class ResourceOverrides(BaseModel):
|
|
|
163
164
|
ge=60,
|
|
164
165
|
le=86400,
|
|
165
166
|
)
|
|
166
|
-
|
|
167
|
+
max_pool_workers: int | None = Field(
|
|
167
168
|
None,
|
|
168
|
-
description='
|
|
169
|
+
description='Max OS processes in the detector pool. Auto-sized from CPU/memory limits when omitted.',
|
|
169
170
|
ge=1,
|
|
170
|
-
le=
|
|
171
|
+
le=16,
|
|
171
172
|
)
|
|
172
|
-
|
|
173
|
+
max_concurrent_assets: int | None = Field(
|
|
173
174
|
None,
|
|
174
|
-
description='Max
|
|
175
|
+
description='Max assets processed concurrently. Controls parallel DB connections. Defaults to pool_workers * 2 when omitted.',
|
|
175
176
|
ge=1,
|
|
176
177
|
le=50,
|
|
177
178
|
)
|
|
@@ -1842,6 +1843,7 @@ class Type(StrEnum):
|
|
|
1842
1843
|
CONFLUENCE = 'CONFLUENCE'
|
|
1843
1844
|
JIRA = 'JIRA'
|
|
1844
1845
|
SERVICEDESK = 'SERVICEDESK'
|
|
1846
|
+
SQLITE = 'SQLITE'
|
|
1845
1847
|
|
|
1846
1848
|
|
|
1847
1849
|
class SlackInput(CoreInput):
|
|
@@ -2628,6 +2630,7 @@ class Type17(StrEnum):
|
|
|
2628
2630
|
CONFLUENCE = 'CONFLUENCE'
|
|
2629
2631
|
JIRA = 'JIRA'
|
|
2630
2632
|
SERVICEDESK = 'SERVICEDESK'
|
|
2633
|
+
SQLITE = 'SQLITE'
|
|
2631
2634
|
|
|
2632
2635
|
|
|
2633
2636
|
class ConfluenceInput(CoreInput):
|
|
@@ -2682,6 +2685,59 @@ class ServiceDeskInput(CoreInput):
|
|
|
2682
2685
|
resources: ResourceOverrides | None = None
|
|
2683
2686
|
|
|
2684
2687
|
|
|
2688
|
+
class SQLiteRequired(BaseModel):
|
|
2689
|
+
model_config = ConfigDict(
|
|
2690
|
+
extra='forbid',
|
|
2691
|
+
)
|
|
2692
|
+
database_path: str = Field(
|
|
2693
|
+
...,
|
|
2694
|
+
description='Absolute or relative path to the SQLite database file (e.g. /data/app.db)',
|
|
2695
|
+
)
|
|
2696
|
+
|
|
2697
|
+
|
|
2698
|
+
class SQLiteOptionalScope(BaseModel):
|
|
2699
|
+
"""
|
|
2700
|
+
Table selection scope.
|
|
2701
|
+
"""
|
|
2702
|
+
|
|
2703
|
+
model_config = ConfigDict(
|
|
2704
|
+
extra='forbid',
|
|
2705
|
+
)
|
|
2706
|
+
include_tables: list[str] | None = Field(
|
|
2707
|
+
None,
|
|
2708
|
+
description='Optional table allowlist. Only tables in this list will be scanned.',
|
|
2709
|
+
)
|
|
2710
|
+
table_limit: int | None = Field(
|
|
2711
|
+
None, description='Optional cap on number of table assets extracted', ge=1
|
|
2712
|
+
)
|
|
2713
|
+
|
|
2714
|
+
|
|
2715
|
+
class SQLiteOptional(BaseModel):
|
|
2716
|
+
model_config = ConfigDict(
|
|
2717
|
+
extra='forbid',
|
|
2718
|
+
)
|
|
2719
|
+
scope: SQLiteOptionalScope | None = None
|
|
2720
|
+
|
|
2721
|
+
|
|
2722
|
+
class SQLiteInput(CoreInput):
|
|
2723
|
+
type: Literal['SQLITE'] = Field('SQLITE', description='Type of the asset or source')
|
|
2724
|
+
required: SQLiteRequired
|
|
2725
|
+
masked: dict[str, Any] | None = Field(
|
|
2726
|
+
None,
|
|
2727
|
+
description='SQLite has no credentials; this section is intentionally empty.',
|
|
2728
|
+
)
|
|
2729
|
+
optional: SQLiteOptional | None = None
|
|
2730
|
+
detectors: list[Detector] | None = Field(
|
|
2731
|
+
None, description='Detectors to run on ingested content'
|
|
2732
|
+
)
|
|
2733
|
+
custom_detectors: list[CustomDetectorSelection] | None = Field(
|
|
2734
|
+
None,
|
|
2735
|
+
description='Reusable custom detector IDs selected from the custom detector catalog.',
|
|
2736
|
+
)
|
|
2737
|
+
sampling: SamplingConfig
|
|
2738
|
+
resources: ResourceOverrides | None = None
|
|
2739
|
+
|
|
2740
|
+
|
|
2685
2741
|
class SourceInput(
|
|
2686
2742
|
RootModel[
|
|
2687
2743
|
SlackInput
|
|
@@ -2703,6 +2759,7 @@ class SourceInput(
|
|
|
2703
2759
|
| ConfluenceInput
|
|
2704
2760
|
| JiraInput
|
|
2705
2761
|
| ServiceDeskInput
|
|
2762
|
+
| SQLiteInput
|
|
2706
2763
|
]
|
|
2707
2764
|
):
|
|
2708
2765
|
root: (
|
|
@@ -2725,6 +2782,7 @@ class SourceInput(
|
|
|
2725
2782
|
| ConfluenceInput
|
|
2726
2783
|
| JiraInput
|
|
2727
2784
|
| ServiceDeskInput
|
|
2785
|
+
| SQLiteInput
|
|
2728
2786
|
) = Field(
|
|
2729
2787
|
...,
|
|
2730
2788
|
description='Merged configuration schema with all source types and common definitions',
|