classifyre-cli 0.4.3__tar.gz → 0.4.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/.turbo/turbo-build.log +1 -1
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/PKG-INFO +1 -1
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/package.json +1 -1
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/pyproject.toml +1 -1
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/main.py +32 -1
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/models/generated_input.py +64 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/pipeline/detector_pipeline.py +60 -35
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/pipeline/worker_pool.py +17 -10
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/databricks/source.py +287 -672
- classifyre_cli-0.4.4/src/sources/hive/source.py +304 -0
- classifyre_cli-0.4.4/src/sources/mssql/source.py +621 -0
- classifyre_cli-0.4.4/src/sources/mysql/source.py +303 -0
- classifyre_cli-0.4.4/src/sources/oracle/source.py +632 -0
- classifyre_cli-0.4.4/src/sources/postgresql/source.py +214 -0
- classifyre_cli-0.4.4/src/sources/snowflake/source.py +624 -0
- classifyre_cli-0.4.4/src/sources/sqlite/source.py +212 -0
- classifyre_cli-0.4.4/src/sources/tabular_base.py +793 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/tabular_utils.py +36 -0
- classifyre_cli-0.4.4/tests/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/pipeline/test_detector_pipeline.py +1 -4
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/pipeline/test_worker_pool.py +1 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_databricks_source.py +9 -9
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_hive_source.py +8 -8
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_mssql_source.py +5 -5
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_mysql_source.py +8 -8
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_oracle_source.py +27 -41
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_postgresql_source.py +3 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_snowflake_source.py +2 -2
- classifyre_cli-0.4.4/tests/test_sqlite_source.py +336 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/uv.lock +2 -2
- classifyre_cli-0.4.3/src/sources/hive/source.py +0 -709
- classifyre_cli-0.4.3/src/sources/mssql/source.py +0 -1034
- classifyre_cli-0.4.3/src/sources/mysql/source.py +0 -797
- classifyre_cli-0.4.3/src/sources/oracle/source.py +0 -982
- classifyre_cli-0.4.3/src/sources/postgresql/source.py +0 -774
- classifyre_cli-0.4.3/src/sources/snowflake/source.py +0 -912
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/.gitignore +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/.python-version +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/README.md +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/main.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/scripts/generate_models.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/base.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/broken_links/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/broken_links/detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/config.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_base.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_factory.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_feature_extraction.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_gliner2.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_image_classification.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_llm.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_object_detection.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_regex.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/runners/_text_classification.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/custom/trainer.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/dependencies.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/pii/detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/secrets/detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/threat/code_security_detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/detectors/threat/yara_detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/models/generated_detectors.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/models/generated_single_asset_scan_results.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/outputs/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/outputs/base.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/outputs/console.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/outputs/factory.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/outputs/file.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/outputs/rest.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/pipeline/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/pipeline/content_provider.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/pipeline/parsed_content_provider.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sandbox/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sandbox/runner.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/atlassian_common.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/azure_blob_storage/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/azure_blob_storage/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/base.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/confluence/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/confluence/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/databricks/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/dependencies.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/google_cloud_storage/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/google_cloud_storage/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/hive/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/jira/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/jira/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/mongodb/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/mongodb/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/mssql/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/mysql/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/neo4j/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/neo4j/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/object_storage/base.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/oracle/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/postgresql/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/powerbi/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/powerbi/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/s3_compatible_storage/README.md +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/s3_compatible_storage/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/s3_compatible_storage/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/servicedesk/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/servicedesk/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/slack/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/slack/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/snowflake/__init__.py +0 -0
- {classifyre_cli-0.4.3/tests → classifyre_cli-0.4.4/src/sources/sqlite}/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/tableau/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/tableau/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/wordpress/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/sources/wordpress/source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/telemetry.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/utils/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/utils/content_extraction.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/utils/file_parser.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/utils/hashing.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/utils/uv_sync.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/src/utils/validation.py +0 -0
- {classifyre_cli-0.4.3/tests/detectors → classifyre_cli-0.4.4/tests}/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/conftest.py +0 -0
- {classifyre_cli-0.4.3/tests/detectors/content → classifyre_cli-0.4.4/tests/detectors}/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/conftest.py +0 -0
- {classifyre_cli-0.4.3/tests/detectors/custom → classifyre_cli-0.4.4/tests/detectors/content}/__init__.py +0 -0
- {classifyre_cli-0.4.3/tests/detectors/pii → classifyre_cli-0.4.4/tests/detectors/custom}/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/custom/conftest.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/custom/test_invoice_extraction.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/custom/test_pipeline_integration.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/custom/test_regex_runner.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/custom/test_transformer_runners.py +0 -0
- {classifyre_cli-0.4.3/tests/detectors/secrets → classifyre_cli-0.4.4/tests/detectors/pii}/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/pii/conftest.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/pii/sample_invoice.pdf +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/pii/test_pii_detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
- {classifyre_cli-0.4.3/tests/detectors/threat → classifyre_cli-0.4.4/tests/detectors/secrets}/__init__.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/secrets/test_secrets_detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/test_base_detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/test_detector_catalog_commercial.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/test_detector_pipeline_types.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/test_detector_schema_examples.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/test_detector_types.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/test_phase2_detectors.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/test_registry.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/threat/test_code_security_detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/detectors/threat/test_yara_detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/integration/test_wordpress_links_assets.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_azure_blob_storage_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_base_source_attachment.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_base_source_sampling.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_confluence_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_google_cloud_storage_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_hashing.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_jira_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_mongodb_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_neo4j_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_outputs.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_powerbi_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_s3_compatible_storage_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_servicedesk_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_slack_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_source_dependency_groups.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_tableau_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_tabular_utils.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/test_wordpress_source.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/utils/test_content_extraction.py +0 -0
- {classifyre_cli-0.4.3 → classifyre_cli-0.4.4}/tests/utils/test_file_parser.py +0 -0
|
@@ -243,14 +243,23 @@ async def run_command_async(args: argparse.Namespace, recipe: dict[str, Any]) ->
|
|
|
243
243
|
import asyncio as _asyncio
|
|
244
244
|
|
|
245
245
|
processed_count = 0
|
|
246
|
+
_pw = worker_pool.max_workers if worker_pool else 4
|
|
247
|
+
max_concurrent = args.max_concurrent_assets or (_pw * 2)
|
|
248
|
+
max_concurrent = max(1, max_concurrent)
|
|
249
|
+
_asset_semaphore = _asyncio.Semaphore(max_concurrent)
|
|
246
250
|
logger.info(
|
|
247
|
-
"Phase 2 starting: %d assets, pool_workers=%s",
|
|
251
|
+
"Phase 2 starting: %d assets, pool_workers=%s, max_concurrent_assets=%d",
|
|
248
252
|
len(all_stubs),
|
|
249
253
|
worker_pool.max_workers if worker_pool else "none",
|
|
254
|
+
max_concurrent,
|
|
250
255
|
)
|
|
251
256
|
error_count = 0
|
|
252
257
|
|
|
253
258
|
async def _process_one(asset: Any) -> None:
|
|
259
|
+
async with _asset_semaphore:
|
|
260
|
+
await _process_one_inner(asset)
|
|
261
|
+
|
|
262
|
+
async def _process_one_inner(asset: Any) -> None:
|
|
254
263
|
nonlocal processed_count, error_count
|
|
255
264
|
asset_hash = getattr(asset, "hash", None) or ""
|
|
256
265
|
try:
|
|
@@ -553,6 +562,12 @@ def main() -> None:
|
|
|
553
562
|
default=None,
|
|
554
563
|
help="Max OS processes in the detector pool. Auto-sized from CPU/memory when omitted (env: CLASSIFYRE_MAX_POOL_WORKERS)",
|
|
555
564
|
)
|
|
565
|
+
parser.add_argument(
|
|
566
|
+
"--max-concurrent-assets",
|
|
567
|
+
type=int,
|
|
568
|
+
default=None,
|
|
569
|
+
help="Max assets processed concurrently in Phase 2. Controls DB connection usage. Defaults to pool_workers*2 (env: CLASSIFYRE_MAX_CONCURRENT_ASSETS)",
|
|
570
|
+
)
|
|
556
571
|
|
|
557
572
|
args = parser.parse_args()
|
|
558
573
|
|
|
@@ -571,6 +586,13 @@ def main() -> None:
|
|
|
571
586
|
except ValueError:
|
|
572
587
|
args.max_pool_workers = None
|
|
573
588
|
|
|
589
|
+
if args.max_concurrent_assets is None:
|
|
590
|
+
env_val = os.environ.get("CLASSIFYRE_MAX_CONCURRENT_ASSETS")
|
|
591
|
+
try:
|
|
592
|
+
args.max_concurrent_assets = int(env_val) if env_val else None
|
|
593
|
+
except ValueError:
|
|
594
|
+
args.max_concurrent_assets = None
|
|
595
|
+
|
|
574
596
|
if args.debug:
|
|
575
597
|
logging.getLogger().setLevel(logging.DEBUG)
|
|
576
598
|
|
|
@@ -591,6 +613,15 @@ def main() -> None:
|
|
|
591
613
|
|
|
592
614
|
recipe = load_recipe(args.recipe)
|
|
593
615
|
|
|
616
|
+
# Resolve resource overrides from recipe when CLI args / env vars are not set
|
|
617
|
+
recipe_resources = recipe.get("resources") or {}
|
|
618
|
+
if args.max_pool_workers is None and isinstance(recipe_resources.get("max_pool_workers"), int):
|
|
619
|
+
args.max_pool_workers = recipe_resources["max_pool_workers"]
|
|
620
|
+
if args.max_concurrent_assets is None and isinstance(
|
|
621
|
+
recipe_resources.get("max_concurrent_assets"), int
|
|
622
|
+
):
|
|
623
|
+
args.max_concurrent_assets = recipe_resources["max_concurrent_assets"]
|
|
624
|
+
|
|
594
625
|
source_type = recipe.get("type", "").lower()
|
|
595
626
|
if not source_type:
|
|
596
627
|
logger.error(
|
|
@@ -41,6 +41,7 @@ class AssetType(StrEnum):
|
|
|
41
41
|
CONFLUENCE = 'CONFLUENCE'
|
|
42
42
|
JIRA = 'JIRA'
|
|
43
43
|
SERVICEDESK = 'SERVICEDESK'
|
|
44
|
+
SQLITE = 'SQLITE'
|
|
44
45
|
|
|
45
46
|
|
|
46
47
|
class SourceCategory(StrEnum):
|
|
@@ -169,6 +170,12 @@ class ResourceOverrides(BaseModel):
|
|
|
169
170
|
ge=1,
|
|
170
171
|
le=16,
|
|
171
172
|
)
|
|
173
|
+
max_concurrent_assets: int | None = Field(
|
|
174
|
+
None,
|
|
175
|
+
description='Max assets processed concurrently. Controls parallel DB connections. Defaults to pool_workers * 2 when omitted.',
|
|
176
|
+
ge=1,
|
|
177
|
+
le=50,
|
|
178
|
+
)
|
|
172
179
|
|
|
173
180
|
|
|
174
181
|
class Detector(BaseModel):
|
|
@@ -1836,6 +1843,7 @@ class Type(StrEnum):
|
|
|
1836
1843
|
CONFLUENCE = 'CONFLUENCE'
|
|
1837
1844
|
JIRA = 'JIRA'
|
|
1838
1845
|
SERVICEDESK = 'SERVICEDESK'
|
|
1846
|
+
SQLITE = 'SQLITE'
|
|
1839
1847
|
|
|
1840
1848
|
|
|
1841
1849
|
class SlackInput(CoreInput):
|
|
@@ -2622,6 +2630,7 @@ class Type17(StrEnum):
|
|
|
2622
2630
|
CONFLUENCE = 'CONFLUENCE'
|
|
2623
2631
|
JIRA = 'JIRA'
|
|
2624
2632
|
SERVICEDESK = 'SERVICEDESK'
|
|
2633
|
+
SQLITE = 'SQLITE'
|
|
2625
2634
|
|
|
2626
2635
|
|
|
2627
2636
|
class ConfluenceInput(CoreInput):
|
|
@@ -2676,6 +2685,59 @@ class ServiceDeskInput(CoreInput):
|
|
|
2676
2685
|
resources: ResourceOverrides | None = None
|
|
2677
2686
|
|
|
2678
2687
|
|
|
2688
|
+
class SQLiteRequired(BaseModel):
|
|
2689
|
+
model_config = ConfigDict(
|
|
2690
|
+
extra='forbid',
|
|
2691
|
+
)
|
|
2692
|
+
database_path: str = Field(
|
|
2693
|
+
...,
|
|
2694
|
+
description='Absolute or relative path to the SQLite database file (e.g. /data/app.db)',
|
|
2695
|
+
)
|
|
2696
|
+
|
|
2697
|
+
|
|
2698
|
+
class SQLiteOptionalScope(BaseModel):
|
|
2699
|
+
"""
|
|
2700
|
+
Table selection scope.
|
|
2701
|
+
"""
|
|
2702
|
+
|
|
2703
|
+
model_config = ConfigDict(
|
|
2704
|
+
extra='forbid',
|
|
2705
|
+
)
|
|
2706
|
+
include_tables: list[str] | None = Field(
|
|
2707
|
+
None,
|
|
2708
|
+
description='Optional table allowlist. Only tables in this list will be scanned.',
|
|
2709
|
+
)
|
|
2710
|
+
table_limit: int | None = Field(
|
|
2711
|
+
None, description='Optional cap on number of table assets extracted', ge=1
|
|
2712
|
+
)
|
|
2713
|
+
|
|
2714
|
+
|
|
2715
|
+
class SQLiteOptional(BaseModel):
|
|
2716
|
+
model_config = ConfigDict(
|
|
2717
|
+
extra='forbid',
|
|
2718
|
+
)
|
|
2719
|
+
scope: SQLiteOptionalScope | None = None
|
|
2720
|
+
|
|
2721
|
+
|
|
2722
|
+
class SQLiteInput(CoreInput):
|
|
2723
|
+
type: Literal['SQLITE'] = Field('SQLITE', description='Type of the asset or source')
|
|
2724
|
+
required: SQLiteRequired
|
|
2725
|
+
masked: dict[str, Any] | None = Field(
|
|
2726
|
+
None,
|
|
2727
|
+
description='SQLite has no credentials; this section is intentionally empty.',
|
|
2728
|
+
)
|
|
2729
|
+
optional: SQLiteOptional | None = None
|
|
2730
|
+
detectors: list[Detector] | None = Field(
|
|
2731
|
+
None, description='Detectors to run on ingested content'
|
|
2732
|
+
)
|
|
2733
|
+
custom_detectors: list[CustomDetectorSelection] | None = Field(
|
|
2734
|
+
None,
|
|
2735
|
+
description='Reusable custom detector IDs selected from the custom detector catalog.',
|
|
2736
|
+
)
|
|
2737
|
+
sampling: SamplingConfig
|
|
2738
|
+
resources: ResourceOverrides | None = None
|
|
2739
|
+
|
|
2740
|
+
|
|
2679
2741
|
class SourceInput(
|
|
2680
2742
|
RootModel[
|
|
2681
2743
|
SlackInput
|
|
@@ -2697,6 +2759,7 @@ class SourceInput(
|
|
|
2697
2759
|
| ConfluenceInput
|
|
2698
2760
|
| JiraInput
|
|
2699
2761
|
| ServiceDeskInput
|
|
2762
|
+
| SQLiteInput
|
|
2700
2763
|
]
|
|
2701
2764
|
):
|
|
2702
2765
|
root: (
|
|
@@ -2719,6 +2782,7 @@ class SourceInput(
|
|
|
2719
2782
|
| ConfluenceInput
|
|
2720
2783
|
| JiraInput
|
|
2721
2784
|
| ServiceDeskInput
|
|
2785
|
+
| SQLiteInput
|
|
2722
2786
|
) = Field(
|
|
2723
2787
|
...,
|
|
2724
2788
|
description='Merged configuration schema with all source types and common definitions',
|
|
@@ -66,9 +66,7 @@ class DetectorPipeline:
|
|
|
66
66
|
self.content_provider = ParsedContentProvider(source)
|
|
67
67
|
self.init_warnings: list[str] = []
|
|
68
68
|
|
|
69
|
-
def _register_detector_info(
|
|
70
|
-
self, detector: BaseDetector, info: _DetectorInfo
|
|
71
|
-
) -> None:
|
|
69
|
+
def _register_detector_info(self, detector: BaseDetector, info: _DetectorInfo) -> None:
|
|
72
70
|
self._detector_info[id(detector)] = info
|
|
73
71
|
|
|
74
72
|
def _get_detector_info(self, detector: BaseDetector) -> _DetectorInfo | None:
|
|
@@ -156,9 +154,7 @@ class DetectorPipeline:
|
|
|
156
154
|
all_active = text_detectors + binary_detectors + link_detectors
|
|
157
155
|
detector_names = [self._detector_log_label(d) for d in all_active]
|
|
158
156
|
pool_tag = "[pool]" if self._worker_pool else "[in-process]"
|
|
159
|
-
logger.info(
|
|
160
|
-
"%s Scanning %s [%s]", pool_tag, asset.name, ", ".join(detector_names)
|
|
161
|
-
)
|
|
157
|
+
logger.info("%s Scanning %s [%s]", pool_tag, asset.name, ", ".join(detector_names))
|
|
162
158
|
|
|
163
159
|
findings: list[DetectionResult] = []
|
|
164
160
|
detector_types_run: list[DetectorType] = []
|
|
@@ -241,12 +237,13 @@ class DetectorPipeline:
|
|
|
241
237
|
if findings:
|
|
242
238
|
logger.info(
|
|
243
239
|
"%s Scanned %s: %d finding(s) in %dms",
|
|
244
|
-
pool_tag,
|
|
240
|
+
pool_tag,
|
|
241
|
+
asset.name,
|
|
242
|
+
len(findings),
|
|
243
|
+
scan_duration,
|
|
245
244
|
)
|
|
246
245
|
else:
|
|
247
|
-
logger.info(
|
|
248
|
-
"%s Scanned %s: no findings (%dms)", pool_tag, asset.name, scan_duration
|
|
249
|
-
)
|
|
246
|
+
logger.info("%s Scanned %s: no findings (%dms)", pool_tag, asset.name, scan_duration)
|
|
250
247
|
|
|
251
248
|
return asset
|
|
252
249
|
|
|
@@ -299,7 +296,10 @@ class DetectorPipeline:
|
|
|
299
296
|
elapsed = int((time.monotonic() - t0) * 1000)
|
|
300
297
|
logger.info(
|
|
301
298
|
" %s page %d done: %d findings (%dms)",
|
|
302
|
-
asset.name,
|
|
299
|
+
asset.name,
|
|
300
|
+
page_num,
|
|
301
|
+
len(page_findings),
|
|
302
|
+
elapsed,
|
|
303
303
|
)
|
|
304
304
|
return page_findings, page_types, page_errors, page_content, page_num
|
|
305
305
|
|
|
@@ -322,9 +322,7 @@ class DetectorPipeline:
|
|
|
322
322
|
page_content,
|
|
323
323
|
)
|
|
324
324
|
|
|
325
|
-
max_pending = max(
|
|
326
|
-
2, self._worker_pool.max_workers * 2 if self._worker_pool else 4
|
|
327
|
-
)
|
|
325
|
+
max_pending = max(2, self._worker_pool.max_workers * 2 if self._worker_pool else 4)
|
|
328
326
|
|
|
329
327
|
async for text_content in self._iter_text_content_pages(asset):
|
|
330
328
|
page_index += 1
|
|
@@ -335,18 +333,22 @@ class DetectorPipeline:
|
|
|
335
333
|
|
|
336
334
|
while len(pending_tasks) >= max_pending:
|
|
337
335
|
done, pending_tasks = await asyncio.wait(
|
|
338
|
-
pending_tasks,
|
|
336
|
+
pending_tasks,
|
|
337
|
+
return_when=asyncio.FIRST_COMPLETED,
|
|
339
338
|
)
|
|
340
339
|
for task in done:
|
|
341
340
|
page_findings, page_types, page_errors, page_content, _pn = task.result()
|
|
342
341
|
findings.extend(page_findings)
|
|
343
342
|
errors.extend(page_errors)
|
|
344
343
|
detector_types_run = self._merge_detector_types(
|
|
345
|
-
detector_types_run,
|
|
344
|
+
detector_types_run,
|
|
345
|
+
page_types,
|
|
346
346
|
)
|
|
347
347
|
for finding in page_findings:
|
|
348
348
|
self.content_provider.enrich_finding_location(
|
|
349
|
-
finding,
|
|
349
|
+
finding,
|
|
350
|
+
asset,
|
|
351
|
+
page_content,
|
|
350
352
|
)
|
|
351
353
|
|
|
352
354
|
task = asyncio.create_task(_detect_page(text_content, page_index))
|
|
@@ -401,7 +403,10 @@ class DetectorPipeline:
|
|
|
401
403
|
elapsed = int((time.monotonic() - t0) * 1000)
|
|
402
404
|
logger.info(
|
|
403
405
|
" %s page %d done: %d findings (%dms)",
|
|
404
|
-
asset.name,
|
|
406
|
+
asset.name,
|
|
407
|
+
page_num,
|
|
408
|
+
len(page_findings),
|
|
409
|
+
elapsed,
|
|
405
410
|
)
|
|
406
411
|
return page_findings, page_types, page_errors, page_content, page_num
|
|
407
412
|
|
|
@@ -413,26 +418,29 @@ class DetectorPipeline:
|
|
|
413
418
|
page_findings, page_types, page_errors, page_content, _pn = task.result()
|
|
414
419
|
for finding in page_findings:
|
|
415
420
|
self.content_provider.enrich_finding_location(
|
|
416
|
-
finding,
|
|
421
|
+
finding,
|
|
422
|
+
asset,
|
|
423
|
+
page_content,
|
|
417
424
|
)
|
|
418
425
|
findings.extend(page_findings)
|
|
419
426
|
errors.extend(page_errors)
|
|
420
427
|
detector_types_run = self._merge_detector_types(
|
|
421
|
-
detector_types_run,
|
|
428
|
+
detector_types_run,
|
|
429
|
+
page_types,
|
|
422
430
|
)
|
|
423
431
|
unflushed_count += len(page_findings)
|
|
424
432
|
|
|
425
433
|
if unflushed_count >= findings_flush_size and unflushed_count > 0:
|
|
426
434
|
logger.debug(
|
|
427
435
|
" %s flushing %d findings (%d total)",
|
|
428
|
-
asset.name,
|
|
436
|
+
asset.name,
|
|
437
|
+
unflushed_count,
|
|
438
|
+
len(findings),
|
|
429
439
|
)
|
|
430
440
|
await on_findings_flushed(list(findings))
|
|
431
441
|
unflushed_count = 0
|
|
432
442
|
|
|
433
|
-
max_pending = max(
|
|
434
|
-
2, self._worker_pool.max_workers * 2 if self._worker_pool else 4
|
|
435
|
-
)
|
|
443
|
+
max_pending = max(2, self._worker_pool.max_workers * 2 if self._worker_pool else 4)
|
|
436
444
|
|
|
437
445
|
async for text_content in self._iter_text_content_pages(asset):
|
|
438
446
|
page_index += 1
|
|
@@ -443,25 +451,31 @@ class DetectorPipeline:
|
|
|
443
451
|
|
|
444
452
|
while len(pending_tasks) >= max_pending:
|
|
445
453
|
done, pending_tasks_set = await asyncio.wait(
|
|
446
|
-
pending_tasks,
|
|
454
|
+
pending_tasks,
|
|
455
|
+
return_when=asyncio.FIRST_COMPLETED,
|
|
447
456
|
)
|
|
448
457
|
pending_tasks = pending_tasks_set
|
|
449
458
|
for task in done:
|
|
450
459
|
page_findings, page_types, page_errors, page_content, _pn = task.result()
|
|
451
460
|
for finding in page_findings:
|
|
452
461
|
self.content_provider.enrich_finding_location(
|
|
453
|
-
finding,
|
|
462
|
+
finding,
|
|
463
|
+
asset,
|
|
464
|
+
page_content,
|
|
454
465
|
)
|
|
455
466
|
findings.extend(page_findings)
|
|
456
467
|
errors.extend(page_errors)
|
|
457
468
|
detector_types_run = self._merge_detector_types(
|
|
458
|
-
detector_types_run,
|
|
469
|
+
detector_types_run,
|
|
470
|
+
page_types,
|
|
459
471
|
)
|
|
460
472
|
unflushed_count += len(page_findings)
|
|
461
473
|
if unflushed_count >= findings_flush_size and unflushed_count > 0:
|
|
462
474
|
logger.info(
|
|
463
475
|
" %s flushing %d findings (%d total)",
|
|
464
|
-
asset.name,
|
|
476
|
+
asset.name,
|
|
477
|
+
unflushed_count,
|
|
478
|
+
len(findings),
|
|
465
479
|
)
|
|
466
480
|
await on_findings_flushed(list(findings))
|
|
467
481
|
unflushed_count = 0
|
|
@@ -637,9 +651,7 @@ class DetectorPipeline:
|
|
|
637
651
|
errors: list[str] = []
|
|
638
652
|
detected_at = datetime.now(UTC)
|
|
639
653
|
|
|
640
|
-
for i, (detector, result) in enumerate(
|
|
641
|
-
zip(runnable_detectors, results, strict=False)
|
|
642
|
-
):
|
|
654
|
+
for i, (detector, result) in enumerate(zip(runnable_detectors, results, strict=False)):
|
|
643
655
|
detector_name = detector.__class__.__name__
|
|
644
656
|
via = task_via[i]
|
|
645
657
|
loc = f"{asset_name}:{page_tag}" if page_tag else asset_name
|
|
@@ -648,7 +660,11 @@ class DetectorPipeline:
|
|
|
648
660
|
wall_ms = int((time.monotonic() - task_start_times[i]) * 1000)
|
|
649
661
|
logger.error(
|
|
650
662
|
" [%s] %s on %s: FAILED in %dms — %s",
|
|
651
|
-
via,
|
|
663
|
+
via,
|
|
664
|
+
detector_name,
|
|
665
|
+
loc,
|
|
666
|
+
wall_ms,
|
|
667
|
+
result,
|
|
652
668
|
)
|
|
653
669
|
errors.append(f"{detector_name}: {result}")
|
|
654
670
|
continue
|
|
@@ -677,12 +693,19 @@ class DetectorPipeline:
|
|
|
677
693
|
if detector_findings:
|
|
678
694
|
logger.info(
|
|
679
695
|
" [%s] %s on %s: %d finding(s) in %dms",
|
|
680
|
-
pid_tag,
|
|
696
|
+
pid_tag,
|
|
697
|
+
detector_name,
|
|
698
|
+
loc,
|
|
699
|
+
len(detector_findings),
|
|
700
|
+
worker_elapsed,
|
|
681
701
|
)
|
|
682
702
|
else:
|
|
683
703
|
logger.info(
|
|
684
704
|
" [%s] %s on %s: clean (%dms)",
|
|
685
|
-
pid_tag,
|
|
705
|
+
pid_tag,
|
|
706
|
+
detector_name,
|
|
707
|
+
loc,
|
|
708
|
+
worker_elapsed,
|
|
686
709
|
)
|
|
687
710
|
|
|
688
711
|
all_findings.extend(detector_findings)
|
|
@@ -835,7 +858,9 @@ class DetectorPipeline:
|
|
|
835
858
|
|
|
836
859
|
if not detector_configs:
|
|
837
860
|
return cls(
|
|
838
|
-
detectors=[],
|
|
861
|
+
detectors=[],
|
|
862
|
+
source=source,
|
|
863
|
+
runner_id=runner_id,
|
|
839
864
|
worker_pool=worker_pool,
|
|
840
865
|
)
|
|
841
866
|
|
|
@@ -37,9 +37,7 @@ class _WorkerResult:
|
|
|
37
37
|
|
|
38
38
|
__slots__ = ("elapsed_ms", "findings", "worker_pid")
|
|
39
39
|
|
|
40
|
-
def __init__(
|
|
41
|
-
self, findings: list[dict[str, Any]], worker_pid: int, elapsed_ms: int
|
|
42
|
-
) -> None:
|
|
40
|
+
def __init__(self, findings: list[dict[str, Any]], worker_pid: int, elapsed_ms: int) -> None:
|
|
43
41
|
self.findings = findings
|
|
44
42
|
self.worker_pid = worker_pid
|
|
45
43
|
self.elapsed_ms = elapsed_ms
|
|
@@ -80,9 +78,7 @@ def _detect_in_worker(
|
|
|
80
78
|
from ..detectors import get_detector
|
|
81
79
|
from ..detectors.config import parse_detector_config
|
|
82
80
|
|
|
83
|
-
name, typed_config = parse_detector_config(
|
|
84
|
-
detector_type, json.loads(config_json)
|
|
85
|
-
)
|
|
81
|
+
name, typed_config = parse_detector_config(detector_type, json.loads(config_json))
|
|
86
82
|
detector = get_detector(name, typed_config)
|
|
87
83
|
_worker_detector_cache[cache_key] = detector
|
|
88
84
|
logging.getLogger(__name__).info(
|
|
@@ -103,7 +99,9 @@ def _detect_in_worker(
|
|
|
103
99
|
elif detector_name == "custom":
|
|
104
100
|
results = detector._detect_sync(content, content_type)
|
|
105
101
|
else:
|
|
106
|
-
text = content if isinstance(content, str) else content.decode("utf-8", errors="replace")
|
|
102
|
+
text = (
|
|
103
|
+
content if isinstance(content, str) else content.decode("utf-8", errors="replace")
|
|
104
|
+
)
|
|
107
105
|
results = detector._detect_sync(text)
|
|
108
106
|
else:
|
|
109
107
|
results = asyncio.run(detector.detect(content, content_type))
|
|
@@ -111,7 +109,10 @@ def _detect_in_worker(
|
|
|
111
109
|
elapsed_ms = int((time.monotonic() - t0) * 1000)
|
|
112
110
|
logging.getLogger(__name__).info(
|
|
113
111
|
"Worker %d ran %s: %d findings in %dms",
|
|
114
|
-
pid,
|
|
112
|
+
pid,
|
|
113
|
+
detector_name,
|
|
114
|
+
len(results),
|
|
115
|
+
elapsed_ms,
|
|
115
116
|
)
|
|
116
117
|
|
|
117
118
|
findings = [
|
|
@@ -209,7 +210,11 @@ def compute_pool_workers(override: int | None = None) -> int:
|
|
|
209
210
|
|
|
210
211
|
logger.info(
|
|
211
212
|
"Pool sizing: cpu_budget=%d (cpus=%d), mem_budget=%d (%dMB), effective=%d",
|
|
212
|
-
cpu_budget,
|
|
213
|
+
cpu_budget,
|
|
214
|
+
cpus,
|
|
215
|
+
mem_budget,
|
|
216
|
+
mem_mb,
|
|
217
|
+
effective,
|
|
213
218
|
)
|
|
214
219
|
return effective
|
|
215
220
|
|
|
@@ -238,7 +243,9 @@ class DetectorWorkerPool:
|
|
|
238
243
|
self._shutdown = False
|
|
239
244
|
logger.info(
|
|
240
245
|
"Detector pool started: %d workers (method=%s, pid=%d)",
|
|
241
|
-
effective_workers,
|
|
246
|
+
effective_workers,
|
|
247
|
+
mp_start_method,
|
|
248
|
+
os.getpid(),
|
|
242
249
|
)
|
|
243
250
|
|
|
244
251
|
@property
|