classifyre-cli 0.4.23__tar.gz → 0.4.25__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/.turbo/turbo-build.log +1 -1
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/PKG-INFO +1 -1
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/package.json +1 -1
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/pyproject.toml +1 -1
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/models/generated_input.py +56 -36
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/pipeline/detector_pipeline.py +38 -2
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/pipeline/parsed_content_provider.py +22 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/pipeline/worker_pool.py +3 -57
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/object_storage/base.py +72 -14
- classifyre_cli-0.4.25/src/utils/resources.py +65 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/transcription.py +61 -3
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/uv.lock +158 -172
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/.gitignore +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/.python-version +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/README.md +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/main.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/scripts/generate_models.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/config.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/base.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/broken_links/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/broken_links/detector.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/config.py +0 -0
- {classifyre_cli-0.4.23/tests/detectors/threat → classifyre_cli-0.4.25/src/detectors/content}/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/detector.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/extractor.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/_base.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/_factory.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/_feature_extraction.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/_gliner2.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/_image_classification.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/_llm.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/_object_detection.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/_regex.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/runners/_text_classification.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/custom/trainer.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/dependencies.py +0 -0
- {classifyre_cli-0.4.23/tests/detectors/secrets → classifyre_cli-0.4.25/src/detectors/pii}/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/pii/detector.py +0 -0
- {classifyre_cli-0.4.23/tests/detectors/pii → classifyre_cli-0.4.25/src/detectors/secrets}/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/secrets/detector.py +0 -0
- {classifyre_cli-0.4.23/tests/detectors/custom → classifyre_cli-0.4.25/src/detectors/threat}/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/threat/code_security_detector.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/detectors/threat/yara_detector.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/main.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/models/generated_detectors.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/models/generated_single_asset_scan_results.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/outputs/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/outputs/base.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/outputs/console.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/outputs/factory.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/outputs/file.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/outputs/rest.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/pipeline/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/pipeline/content_provider.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sandbox/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sandbox/runner.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/asset_metadata.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/atlassian_common.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/azure_blob_storage/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/azure_blob_storage/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/base.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/confluence/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/confluence/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/databricks/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/databricks/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/dependencies.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/email/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/email/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/google_cloud_storage/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/google_cloud_storage/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/hive/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/hive/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/jira/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/jira/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/mongodb/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/mongodb/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/mssql/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/mssql/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/mysql/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/mysql/source.py +0 -0
- {classifyre_cli-0.4.23/tests/detectors/content → classifyre_cli-0.4.25/src/sources/neo4j}/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/neo4j/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/notion/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/notion/client.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/notion/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/oracle/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/oracle/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/postgresql/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/postgresql/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/powerbi/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/powerbi/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/s3_compatible_storage/README.md +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/s3_compatible_storage/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/s3_compatible_storage/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/servicedesk/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/servicedesk/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/slack/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/slack/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/snowflake/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/snowflake/source.py +0 -0
- {classifyre_cli-0.4.23/tests/detectors → classifyre_cli-0.4.25/src/sources/sqlite}/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/sqlite/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/tableau/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/tableau/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/tabular_base.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/tabular_utils.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/wordpress/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/wordpress/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/youtube/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/sources/youtube/source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/telemetry.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/content_extraction.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/dependency_groups.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/embedded_images.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/file_metadata.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/file_parser.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/file_to_images.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/hashing.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/uv_sync.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/src/utils/validation.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/conftest.py +0 -0
- {classifyre_cli-0.4.23/src/sources/sqlite → classifyre_cli-0.4.25/tests/detectors}/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/conftest.py +0 -0
- {classifyre_cli-0.4.23/src/sources/neo4j → classifyre_cli-0.4.25/tests/detectors/content}/__init__.py +0 -0
- {classifyre_cli-0.4.23/src/detectors/threat → classifyre_cli-0.4.25/tests/detectors/custom}/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/custom/conftest.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/custom/test_invoice_extraction.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/custom/test_llm_runner.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/custom/test_pipeline_integration.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/custom/test_regex_runner.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/custom/test_transformer_runners.py +0 -0
- {classifyre_cli-0.4.23/src/detectors/secrets → classifyre_cli-0.4.25/tests/detectors/pii}/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/pii/conftest.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/pii/sample_invoice.pdf +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/pii/test_pii_detector.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
- {classifyre_cli-0.4.23/src/detectors/pii → classifyre_cli-0.4.25/tests/detectors/secrets}/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/secrets/test_secrets_detector.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/test_base_detector.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/test_detector_catalog_commercial.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/test_detector_pipeline_types.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/test_detector_schema_examples.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/test_detector_types.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/test_phase2_detectors.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/test_registry.py +0 -0
- {classifyre_cli-0.4.23/src/detectors/content → classifyre_cli-0.4.25/tests/detectors/threat}/__init__.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/threat/test_code_security_detector.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/detectors/threat/test_yara_detector.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/integration/test_wordpress_links_assets.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/pipeline/test_detector_pipeline.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/pipeline/test_worker_pool.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_assets_metadata_catalog.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_azure_blob_storage_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_base_source_attachment.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_base_source_sampling.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_config.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_confluence_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_custom_extractor.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_databricks_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_dependency_groups.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_email_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_google_cloud_storage_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_hashing.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_hive_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_jira_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_mongodb_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_mssql_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_mysql_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_neo4j_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_notion_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_oracle_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_outputs.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_postgresql_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_powerbi_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_s3_compatible_storage_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_sandbox_runner.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_servicedesk_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_slack_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_snowflake_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_source_dependency_groups.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_sqlite_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_tableau_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_tabular_utils.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_uv_sync.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_wordpress_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_youtube_source.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/test_youtube_source_integration.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/utils/test_content_extraction.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/utils/test_embedded_images.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/utils/test_file_metadata.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/utils/test_file_parser.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/utils/test_file_to_images.py +0 -0
- {classifyre_cli-0.4.23 → classifyre_cli-0.4.25}/tests/utils/test_transcription.py +0 -0
|
@@ -2032,7 +2032,9 @@ class CoreInput(BaseModel):
|
|
|
2032
2032
|
|
|
2033
2033
|
|
|
2034
2034
|
class SlackInput(CoreInput):
|
|
2035
|
-
type: Literal['SLACK'] = Field(
|
|
2035
|
+
type: Literal['SLACK'] | None = Field(
|
|
2036
|
+
None, description='Type of the asset or source'
|
|
2037
|
+
)
|
|
2036
2038
|
required: SlackRequired
|
|
2037
2039
|
masked: SlackMaskedBotToken | SlackMaskedUserToken | SlackMaskedToken = Field(
|
|
2038
2040
|
..., title='SlackMasked'
|
|
@@ -2050,7 +2052,9 @@ class SlackInput(CoreInput):
|
|
|
2050
2052
|
|
|
2051
2053
|
|
|
2052
2054
|
class EmailInput(CoreInput):
|
|
2053
|
-
type: Literal['EMAIL'] = Field(
|
|
2055
|
+
type: Literal['EMAIL'] | None = Field(
|
|
2056
|
+
None, description='Type of the asset or source'
|
|
2057
|
+
)
|
|
2054
2058
|
required: EmailRequired
|
|
2055
2059
|
masked: EmailMasked
|
|
2056
2060
|
optional: EmailOptional | None = None
|
|
@@ -2066,8 +2070,8 @@ class EmailInput(CoreInput):
|
|
|
2066
2070
|
|
|
2067
2071
|
|
|
2068
2072
|
class S3CompatibleStorageInput(CoreInput):
|
|
2069
|
-
type: Literal['S3_COMPATIBLE_STORAGE'] = Field(
|
|
2070
|
-
|
|
2073
|
+
type: Literal['S3_COMPATIBLE_STORAGE'] | None = Field(
|
|
2074
|
+
None, description='Type of the asset or source'
|
|
2071
2075
|
)
|
|
2072
2076
|
required: S3CompatibleStorageRequired
|
|
2073
2077
|
masked: S3CompatibleStorageMasked | None = None
|
|
@@ -2084,8 +2088,8 @@ class S3CompatibleStorageInput(CoreInput):
|
|
|
2084
2088
|
|
|
2085
2089
|
|
|
2086
2090
|
class AzureBlobStorageInput(CoreInput):
|
|
2087
|
-
type: Literal['AZURE_BLOB_STORAGE'] = Field(
|
|
2088
|
-
|
|
2091
|
+
type: Literal['AZURE_BLOB_STORAGE'] | None = Field(
|
|
2092
|
+
None, description='Type of the asset or source'
|
|
2089
2093
|
)
|
|
2090
2094
|
required: AzureBlobStorageRequired
|
|
2091
2095
|
masked: AzureBlobStorageMasked | None = None
|
|
@@ -2102,8 +2106,8 @@ class AzureBlobStorageInput(CoreInput):
|
|
|
2102
2106
|
|
|
2103
2107
|
|
|
2104
2108
|
class GoogleCloudStorageInput(CoreInput):
|
|
2105
|
-
type: Literal['GOOGLE_CLOUD_STORAGE'] = Field(
|
|
2106
|
-
|
|
2109
|
+
type: Literal['GOOGLE_CLOUD_STORAGE'] | None = Field(
|
|
2110
|
+
None, description='Type of the asset or source'
|
|
2107
2111
|
)
|
|
2108
2112
|
required: GoogleCloudStorageRequired
|
|
2109
2113
|
masked: GoogleCloudStorageMasked | None = None
|
|
@@ -2120,8 +2124,8 @@ class GoogleCloudStorageInput(CoreInput):
|
|
|
2120
2124
|
|
|
2121
2125
|
|
|
2122
2126
|
class WordPressInput(CoreInput):
|
|
2123
|
-
type: Literal['WORDPRESS'] = Field(
|
|
2124
|
-
|
|
2127
|
+
type: Literal['WORDPRESS'] | None = Field(
|
|
2128
|
+
None, description='Type of the asset or source'
|
|
2125
2129
|
)
|
|
2126
2130
|
required: WordPressRequired
|
|
2127
2131
|
masked: WordPressMasked
|
|
@@ -2138,8 +2142,8 @@ class WordPressInput(CoreInput):
|
|
|
2138
2142
|
|
|
2139
2143
|
|
|
2140
2144
|
class PostgreSQLInput(CoreInput):
|
|
2141
|
-
type: Literal['POSTGRESQL'] = Field(
|
|
2142
|
-
|
|
2145
|
+
type: Literal['POSTGRESQL'] | None = Field(
|
|
2146
|
+
None, description='Type of the asset or source'
|
|
2143
2147
|
)
|
|
2144
2148
|
required: PostgreSQLRequired
|
|
2145
2149
|
masked: PostgreSQLMasked
|
|
@@ -2156,7 +2160,9 @@ class PostgreSQLInput(CoreInput):
|
|
|
2156
2160
|
|
|
2157
2161
|
|
|
2158
2162
|
class MySQLInput(CoreInput):
|
|
2159
|
-
type: Literal['MYSQL'] = Field(
|
|
2163
|
+
type: Literal['MYSQL'] | None = Field(
|
|
2164
|
+
None, description='Type of the asset or source'
|
|
2165
|
+
)
|
|
2160
2166
|
required: MySQLRequired
|
|
2161
2167
|
masked: MySQLMasked
|
|
2162
2168
|
optional: MySQLOptional | None = None
|
|
@@ -2172,7 +2178,9 @@ class MySQLInput(CoreInput):
|
|
|
2172
2178
|
|
|
2173
2179
|
|
|
2174
2180
|
class MSSQLInput(CoreInput):
|
|
2175
|
-
type: Literal['MSSQL'] = Field(
|
|
2181
|
+
type: Literal['MSSQL'] | None = Field(
|
|
2182
|
+
None, description='Type of the asset or source'
|
|
2183
|
+
)
|
|
2176
2184
|
required: MSSQLRequired
|
|
2177
2185
|
masked: MSSQLMasked
|
|
2178
2186
|
optional: MSSQLOptional | None = None
|
|
@@ -2188,7 +2196,9 @@ class MSSQLInput(CoreInput):
|
|
|
2188
2196
|
|
|
2189
2197
|
|
|
2190
2198
|
class OracleInput(CoreInput):
|
|
2191
|
-
type: Literal['ORACLE'] = Field(
|
|
2199
|
+
type: Literal['ORACLE'] | None = Field(
|
|
2200
|
+
None, description='Type of the asset or source'
|
|
2201
|
+
)
|
|
2192
2202
|
required: OracleRequired
|
|
2193
2203
|
masked: OracleMasked
|
|
2194
2204
|
optional: OracleOptional | None = None
|
|
@@ -2204,7 +2214,9 @@ class OracleInput(CoreInput):
|
|
|
2204
2214
|
|
|
2205
2215
|
|
|
2206
2216
|
class HiveInput(CoreInput):
|
|
2207
|
-
type: Literal['HIVE'] = Field(
|
|
2217
|
+
type: Literal['HIVE'] | None = Field(
|
|
2218
|
+
None, description='Type of the asset or source'
|
|
2219
|
+
)
|
|
2208
2220
|
required: HiveRequired
|
|
2209
2221
|
masked: HiveMasked
|
|
2210
2222
|
optional: HiveOptional | None = None
|
|
@@ -2220,8 +2232,8 @@ class HiveInput(CoreInput):
|
|
|
2220
2232
|
|
|
2221
2233
|
|
|
2222
2234
|
class DatabricksInput(CoreInput):
|
|
2223
|
-
type: Literal['DATABRICKS'] = Field(
|
|
2224
|
-
|
|
2235
|
+
type: Literal['DATABRICKS'] | None = Field(
|
|
2236
|
+
None, description='Type of the asset or source'
|
|
2225
2237
|
)
|
|
2226
2238
|
required: PersonalAccessToken | ServicePrincipalOAuthM2M | AzureServicePrincipal = (
|
|
2227
2239
|
Field(..., title='DatabricksRequired')
|
|
@@ -2242,8 +2254,8 @@ class DatabricksInput(CoreInput):
|
|
|
2242
2254
|
|
|
2243
2255
|
|
|
2244
2256
|
class SnowflakeInput(CoreInput):
|
|
2245
|
-
type: Literal['SNOWFLAKE'] = Field(
|
|
2246
|
-
|
|
2257
|
+
type: Literal['SNOWFLAKE'] | None = Field(
|
|
2258
|
+
None, description='Type of the asset or source'
|
|
2247
2259
|
)
|
|
2248
2260
|
required: (
|
|
2249
2261
|
SnowflakeRequiredDefaultAuthenticator
|
|
@@ -2270,8 +2282,8 @@ class SnowflakeInput(CoreInput):
|
|
|
2270
2282
|
|
|
2271
2283
|
|
|
2272
2284
|
class MongoDBInput(CoreInput):
|
|
2273
|
-
type: Literal['MONGODB'] = Field(
|
|
2274
|
-
|
|
2285
|
+
type: Literal['MONGODB'] | None = Field(
|
|
2286
|
+
None, description='Type of the asset or source'
|
|
2275
2287
|
)
|
|
2276
2288
|
required: MongoDBRequiredAtlas | MongoDBRequiredOnPrem = Field(
|
|
2277
2289
|
..., title='MongoDBRequired'
|
|
@@ -2398,7 +2410,9 @@ class Neo4jOptional(BaseModel):
|
|
|
2398
2410
|
|
|
2399
2411
|
|
|
2400
2412
|
class Neo4jInput(CoreInput):
|
|
2401
|
-
type: Literal['NEO4J'] = Field(
|
|
2413
|
+
type: Literal['NEO4J'] | None = Field(
|
|
2414
|
+
None, description='Type of the asset or source'
|
|
2415
|
+
)
|
|
2402
2416
|
required: Neo4jRequired
|
|
2403
2417
|
masked: Neo4jMaskedUsernamePassword | Neo4jMaskedNone = Field(
|
|
2404
2418
|
..., title='Neo4jMasked'
|
|
@@ -2416,8 +2430,8 @@ class Neo4jInput(CoreInput):
|
|
|
2416
2430
|
|
|
2417
2431
|
|
|
2418
2432
|
class PowerBIInput(CoreInput):
|
|
2419
|
-
type: Literal['POWERBI'] = Field(
|
|
2420
|
-
|
|
2433
|
+
type: Literal['POWERBI'] | None = Field(
|
|
2434
|
+
None, description='Type of the asset or source'
|
|
2421
2435
|
)
|
|
2422
2436
|
required: PowerBIRequiredServicePrincipal | PowerBIRequiredAccessToken = Field(
|
|
2423
2437
|
..., title='PowerBIRequired'
|
|
@@ -2438,8 +2452,8 @@ class PowerBIInput(CoreInput):
|
|
|
2438
2452
|
|
|
2439
2453
|
|
|
2440
2454
|
class TableauInput(CoreInput):
|
|
2441
|
-
type: Literal['TABLEAU'] = Field(
|
|
2442
|
-
|
|
2455
|
+
type: Literal['TABLEAU'] | None = Field(
|
|
2456
|
+
None, description='Type of the asset or source'
|
|
2443
2457
|
)
|
|
2444
2458
|
required: TableauRequiredUsernamePassword | TableauRequiredPersonalAccessToken = (
|
|
2445
2459
|
Field(..., title='TableauRequired')
|
|
@@ -2838,8 +2852,8 @@ class Type19(StrEnum):
|
|
|
2838
2852
|
|
|
2839
2853
|
|
|
2840
2854
|
class ConfluenceInput(CoreInput):
|
|
2841
|
-
type: Literal['CONFLUENCE'] = Field(
|
|
2842
|
-
|
|
2855
|
+
type: Literal['CONFLUENCE'] | None = Field(
|
|
2856
|
+
None, description='Type of the asset or source'
|
|
2843
2857
|
)
|
|
2844
2858
|
required: ConfluenceRequired
|
|
2845
2859
|
masked: ConfluenceMasked
|
|
@@ -2856,7 +2870,9 @@ class ConfluenceInput(CoreInput):
|
|
|
2856
2870
|
|
|
2857
2871
|
|
|
2858
2872
|
class JiraInput(CoreInput):
|
|
2859
|
-
type: Literal['JIRA'] = Field(
|
|
2873
|
+
type: Literal['JIRA'] | None = Field(
|
|
2874
|
+
None, description='Type of the asset or source'
|
|
2875
|
+
)
|
|
2860
2876
|
required: JiraRequired
|
|
2861
2877
|
masked: JiraMasked
|
|
2862
2878
|
optional: JiraOptional | None = None
|
|
@@ -2872,8 +2888,8 @@ class JiraInput(CoreInput):
|
|
|
2872
2888
|
|
|
2873
2889
|
|
|
2874
2890
|
class ServiceDeskInput(CoreInput):
|
|
2875
|
-
type: Literal['SERVICEDESK'] = Field(
|
|
2876
|
-
|
|
2891
|
+
type: Literal['SERVICEDESK'] | None = Field(
|
|
2892
|
+
None, description='Type of the asset or source'
|
|
2877
2893
|
)
|
|
2878
2894
|
required: ServiceDeskRequired
|
|
2879
2895
|
masked: ServiceDeskMasked
|
|
@@ -2924,7 +2940,9 @@ class SQLiteOptional(BaseModel):
|
|
|
2924
2940
|
|
|
2925
2941
|
|
|
2926
2942
|
class SQLiteInput(CoreInput):
|
|
2927
|
-
type: Literal['SQLITE'] = Field(
|
|
2943
|
+
type: Literal['SQLITE'] | None = Field(
|
|
2944
|
+
None, description='Type of the asset or source'
|
|
2945
|
+
)
|
|
2928
2946
|
required: SQLiteRequired
|
|
2929
2947
|
masked: dict[str, Any] | None = Field(
|
|
2930
2948
|
None,
|
|
@@ -3057,7 +3075,9 @@ class NotionOptional(BaseModel):
|
|
|
3057
3075
|
|
|
3058
3076
|
|
|
3059
3077
|
class NotionInput(CoreInput):
|
|
3060
|
-
type: Literal['NOTION'] = Field(
|
|
3078
|
+
type: Literal['NOTION'] | None = Field(
|
|
3079
|
+
None, description='Type of the asset or source'
|
|
3080
|
+
)
|
|
3061
3081
|
required: NotionRequired
|
|
3062
3082
|
masked: NotionMasked
|
|
3063
3083
|
optional: NotionOptional | None = None
|
|
@@ -3073,8 +3093,8 @@ class NotionInput(CoreInput):
|
|
|
3073
3093
|
|
|
3074
3094
|
|
|
3075
3095
|
class YouTubeInput(CoreInput):
|
|
3076
|
-
type: Literal['YOUTUBE'] = Field(
|
|
3077
|
-
|
|
3096
|
+
type: Literal['YOUTUBE'] | None = Field(
|
|
3097
|
+
None, description='Type of the asset or source'
|
|
3078
3098
|
)
|
|
3079
3099
|
required: YouTubeRequired
|
|
3080
3100
|
masked: YouTubeMasked | None = None
|
|
@@ -297,13 +297,23 @@ class DetectorPipeline:
|
|
|
297
297
|
page_num=page_num,
|
|
298
298
|
)
|
|
299
299
|
elapsed = int((time.monotonic() - t0) * 1000)
|
|
300
|
+
snippet = page_content[:120].replace("\n", "\\n") if page_content else ""
|
|
300
301
|
logger.info(
|
|
301
|
-
" %s page %d
|
|
302
|
+
" %s page %d: %d findings in %dms — snippet: %s",
|
|
302
303
|
asset.name,
|
|
303
304
|
page_num,
|
|
304
305
|
len(page_findings),
|
|
305
306
|
elapsed,
|
|
307
|
+
snippet,
|
|
306
308
|
)
|
|
309
|
+
if page_findings:
|
|
310
|
+
for f in page_findings[:5]:
|
|
311
|
+
logger.info(
|
|
312
|
+
" finding: type=%s detector=%s matched=%.100s",
|
|
313
|
+
f.finding_type,
|
|
314
|
+
f.detector_type,
|
|
315
|
+
f.matched_content[:100].replace("\n", " "),
|
|
316
|
+
)
|
|
307
317
|
return page_findings, page_types, page_errors, page_content, page_num
|
|
308
318
|
|
|
309
319
|
def _collect_done() -> None:
|
|
@@ -404,13 +414,23 @@ class DetectorPipeline:
|
|
|
404
414
|
page_num=page_num,
|
|
405
415
|
)
|
|
406
416
|
elapsed = int((time.monotonic() - t0) * 1000)
|
|
417
|
+
snippet = page_content[:120].replace("\n", "\\n") if page_content else ""
|
|
407
418
|
logger.info(
|
|
408
|
-
" %s page %d
|
|
419
|
+
" %s page %d: %d findings in %dms — snippet: %s",
|
|
409
420
|
asset.name,
|
|
410
421
|
page_num,
|
|
411
422
|
len(page_findings),
|
|
412
423
|
elapsed,
|
|
424
|
+
snippet,
|
|
413
425
|
)
|
|
426
|
+
if page_findings:
|
|
427
|
+
for f in page_findings[:5]:
|
|
428
|
+
logger.info(
|
|
429
|
+
" finding: type=%s detector=%s matched=%.100s",
|
|
430
|
+
f.finding_type,
|
|
431
|
+
f.detector_type,
|
|
432
|
+
f.matched_content[:100].replace("\n", " "),
|
|
433
|
+
)
|
|
414
434
|
return page_findings, page_types, page_errors, page_content, page_num
|
|
415
435
|
|
|
416
436
|
async def _collect_done_and_flush(min_findings: int = 1) -> None:
|
|
@@ -491,6 +511,12 @@ class DetectorPipeline:
|
|
|
491
511
|
continue
|
|
492
512
|
candidate_ids.append(value)
|
|
493
513
|
|
|
514
|
+
logger.info(
|
|
515
|
+
"_iter_text_content_pages(%s): trying candidates %s",
|
|
516
|
+
asset.name,
|
|
517
|
+
candidate_ids,
|
|
518
|
+
)
|
|
519
|
+
|
|
494
520
|
for candidate_id in candidate_ids:
|
|
495
521
|
saw_candidate_content = False
|
|
496
522
|
async for text_content in self.content_provider.fetch_text_pages(candidate_id):
|
|
@@ -502,6 +528,16 @@ class DetectorPipeline:
|
|
|
502
528
|
if saw_candidate_content:
|
|
503
529
|
return
|
|
504
530
|
|
|
531
|
+
# If fetch_content_pages ran the full bytes-path extraction (even
|
|
532
|
+
# yielding 0 text, e.g. silent audio), the source already did the
|
|
533
|
+
# expensive work. Don't re-process with another candidate ID for
|
|
534
|
+
# the same asset.
|
|
535
|
+
source = getattr(self.content_provider, "_source", None)
|
|
536
|
+
if source is not None:
|
|
537
|
+
processed: set[str] = getattr(source, "_content_pages_processed", set())
|
|
538
|
+
if candidate_id in processed:
|
|
539
|
+
return
|
|
540
|
+
|
|
505
541
|
async def _run_binary_detectors_for_asset(
|
|
506
542
|
self,
|
|
507
543
|
*,
|
|
@@ -3,11 +3,14 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import asyncio
|
|
6
|
+
import logging
|
|
6
7
|
from collections.abc import AsyncGenerator
|
|
7
8
|
|
|
8
9
|
from ..models.generated_single_asset_scan_results import DetectionResult, SingleAssetScanResults
|
|
9
10
|
from ..sources.base import BaseSource
|
|
10
11
|
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
11
14
|
|
|
12
15
|
class ParsedContentProvider:
|
|
13
16
|
"""
|
|
@@ -32,11 +35,30 @@ class ParsedContentProvider:
|
|
|
32
35
|
if saw_text:
|
|
33
36
|
return
|
|
34
37
|
|
|
38
|
+
# If fetch_content_pages already ran the full extraction pipeline for
|
|
39
|
+
# this asset (tracked via _content_pages_processed), skip the fallback
|
|
40
|
+
# iter_asset_pages call. Without this, an all-silence audio file would
|
|
41
|
+
# trigger a redundant second transcription pass.
|
|
42
|
+
pages_processed: set[str] | None = getattr(self._source, "_content_pages_processed", None)
|
|
43
|
+
if isinstance(pages_processed, set) and asset_id in pages_processed:
|
|
44
|
+
logger.info(
|
|
45
|
+
"fetch_text_pages(%s): source already processed, skipping fallback",
|
|
46
|
+
asset_id,
|
|
47
|
+
)
|
|
48
|
+
return
|
|
49
|
+
|
|
35
50
|
result = await self._source.fetch_content_bytes(asset_id)
|
|
36
51
|
if result is None:
|
|
52
|
+
logger.info("fetch_text_pages(%s): fetch_content_bytes returned None", asset_id)
|
|
37
53
|
return
|
|
38
54
|
|
|
39
55
|
raw_bytes, mime = result
|
|
56
|
+
logger.info(
|
|
57
|
+
"fetch_text_pages(%s): fallback iter_asset_pages path (%s, %d bytes)",
|
|
58
|
+
asset_id,
|
|
59
|
+
mime,
|
|
60
|
+
len(raw_bytes),
|
|
61
|
+
)
|
|
40
62
|
pages: list[str] = await asyncio.to_thread(
|
|
41
63
|
list,
|
|
42
64
|
self._source.iter_asset_pages(raw_bytes, mime),
|
|
@@ -21,6 +21,9 @@ from concurrent.futures import ProcessPoolExecutor
|
|
|
21
21
|
from typing import Any
|
|
22
22
|
|
|
23
23
|
from ..models.generated_single_asset_scan_results import DetectionResult
|
|
24
|
+
from ..utils.resources import get_effective_cpu_count, get_effective_memory_mb
|
|
25
|
+
|
|
26
|
+
__all__ = ["get_effective_cpu_count", "get_effective_memory_mb"]
|
|
24
27
|
|
|
25
28
|
logger = logging.getLogger(__name__)
|
|
26
29
|
|
|
@@ -130,63 +133,6 @@ def is_io_bound_detector(detector_name: str) -> bool:
|
|
|
130
133
|
return detector_name in _IO_BOUND_DETECTORS
|
|
131
134
|
|
|
132
135
|
|
|
133
|
-
def get_effective_cpu_count() -> int:
    """Return the number of usable CPUs, respecting cgroup limits (K8s/Docker).

    ``os.cpu_count()`` returns the *host* CPU count, which can be much larger
    than what the container is allowed to use. Sources are tried in order:

    1. cgroup v2 ``/sys/fs/cgroup/cpu.max`` (``"<quota> <period>"``; a quota
       of ``"max"`` means unlimited and is skipped).
    2. cgroup v1 ``cpu.cfs_quota_us`` / ``cpu.cfs_period_us``.
    3. ``os.cpu_count()``, or ``4`` when that returns nothing usable.

    A quota worth less than half a CPU is ignored and the next source is
    tried. Unreadable or malformed files are treated as "no limit found".

    Returns:
        The CPU allocation rounded down, never less than 1.
    """
    # cgroup v2: single file holding "<quota> <period>".
    try:
        with open("/sys/fs/cgroup/cpu.max") as fh:
            quota_str, period_str = fh.read().strip().split()
        if quota_str != "max":
            cpus = int(quota_str) / int(period_str)
            if cpus >= 0.5:
                return max(1, int(cpus))
    except (OSError, ValueError, ZeroDivisionError):
        # ZeroDivisionError: a "0" period would otherwise crash the caller.
        pass

    # cgroup v1: quota and period live in separate files.
    try:
        with open("/sys/fs/cgroup/cpu/cpu.cfs_quota_us") as fh:
            quota = int(fh.read().strip())
        with open("/sys/fs/cgroup/cpu/cpu.cfs_period_us") as fh:
            period = int(fh.read().strip())
        if quota > 0 and period > 0:
            cpus = quota / period
            if cpus >= 0.5:
                return max(1, int(cpus))
    except (OSError, ValueError):
        pass

    return os.cpu_count() or 4
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
def get_effective_memory_mb() -> int:
    """Return usable memory in MB, respecting cgroup limits.

    Sources are tried in order: cgroup v2 ``memory.max``, cgroup v1
    ``memory.limit_in_bytes``, then ``MemTotal`` from ``/proc/meminfo``.
    A cgroup value of ``2**50`` bytes or more is treated as "no limit" and
    the next source is tried. Unreadable or malformed files are skipped.

    Returns:
        The memory limit in MB, never less than 256; ``4096`` when no source
        yields a value.
    """
    cgroup_limit_files = (
        "/sys/fs/cgroup/memory.max",  # cgroup v2
        "/sys/fs/cgroup/memory/memory.limit_in_bytes",  # cgroup v1
    )
    for limit_file in cgroup_limit_files:
        try:
            with open(limit_file) as fh:
                mem_bytes = int(fh.read().strip())
        except (OSError, ValueError):
            # Missing file, or a non-numeric value such as "max".
            continue
        if mem_bytes < 2**50:
            return max(256, mem_bytes // (1024 * 1024))

    try:
        with open("/proc/meminfo") as fh:
            for line in fh:
                if line.startswith("MemTotal:"):
                    # The value is divided by 1024 to obtain MB.
                    return max(256, int(line.split()[1]) // 1024)
    except (OSError, ValueError, IndexError):
        # IndexError: a "MemTotal:" line with no value field.
        pass

    return 4096
|
|
188
|
-
|
|
189
|
-
|
|
190
136
|
def compute_pool_workers(override: int | None = None) -> int:
|
|
191
137
|
"""Compute optimal pool size from actual resource limits.
|
|
192
138
|
|
|
@@ -135,6 +135,11 @@ class ObjectStorageSourceBase(BaseSource, ABC):
|
|
|
135
135
|
# Keyed by both asset_hash and external_url for O(1) lookup from either.
|
|
136
136
|
self._bytes_cache: dict[str, bytes] = {}
|
|
137
137
|
self._mime_cache: dict[str, str] = {}
|
|
138
|
+
# asset_ids for which fetch_content_pages ran the full bytes path
|
|
139
|
+
# (even if it produced no text, e.g. all-silence audio). Checked by
|
|
140
|
+
# ParsedContentProvider to skip its fallback iter_asset_pages path,
|
|
141
|
+
# which would otherwise re-run an expensive transcription a second time.
|
|
142
|
+
self._content_pages_processed: set[str] = set()
|
|
138
143
|
# Child IMAGE assets queued while transforming the current object.
|
|
139
144
|
self._pending_child_assets: list[SingleAssetScanResults] = []
|
|
140
145
|
|
|
@@ -302,6 +307,15 @@ class ObjectStorageSourceBase(BaseSource, ABC):
|
|
|
302
307
|
|
|
303
308
|
return OutputAssetType.OTHER
|
|
304
309
|
|
|
310
|
+
@staticmethod
def _asset_kind_for_asset_type(asset_type: OutputAssetType) -> str:
    """Translate an output asset type into its asset-kind label.

    IMAGE, AUDIO and VIDEO map to ``"image"``, ``"audio"`` and ``"video"``;
    every other asset type maps to ``"file"``.
    """
    kind_by_type: dict[OutputAssetType, str] = {
        OutputAssetType.IMAGE: "image",
        OutputAssetType.AUDIO: "audio",
        OutputAssetType.VIDEO: "video",
    }
    try:
        return kind_by_type[asset_type]
    except KeyError:
        # Anything that is not a media type is handled as a generic file.
        return "file"
|
|
318
|
+
|
|
305
319
|
def _ensure_file_processing_dependencies(self) -> None:
|
|
306
320
|
if self._file_processing_deps_checked:
|
|
307
321
|
return
|
|
@@ -446,7 +460,7 @@ class ObjectStorageSourceBase(BaseSource, ABC):
|
|
|
446
460
|
created_at=ref.last_modified,
|
|
447
461
|
updated_at=ref.last_modified,
|
|
448
462
|
runner_id=self.runner_id,
|
|
449
|
-
**self.metadata_fields(
|
|
463
|
+
**self.metadata_fields(self._asset_kind_for_asset_type(asset_type), asset_metadata),
|
|
450
464
|
)
|
|
451
465
|
self._hash_to_uri[asset_hash] = external_url
|
|
452
466
|
self._object_ref_by_hash[asset_hash] = ref
|
|
@@ -549,6 +563,7 @@ class ObjectStorageSourceBase(BaseSource, ABC):
|
|
|
549
563
|
self._object_ref_by_hash = {}
|
|
550
564
|
self._bytes_cache = {}
|
|
551
565
|
self._mime_cache = {}
|
|
566
|
+
self._content_pages_processed = set()
|
|
552
567
|
self._pending_child_assets = []
|
|
553
568
|
|
|
554
569
|
refs = self._list_objects()
|
|
@@ -628,26 +643,69 @@ class ObjectStorageSourceBase(BaseSource, ABC):
|
|
|
628
643
|
raw_bytes = self._bytes_cache.get(asset_id)
|
|
629
644
|
mime = self._mime_cache.get(asset_id, "")
|
|
630
645
|
|
|
646
|
+
logger.info(
|
|
647
|
+
"fetch_content_pages(%s): raw_bytes=%s mime=%s processed=%s",
|
|
648
|
+
asset_id,
|
|
649
|
+
f"{len(raw_bytes)} bytes" if raw_bytes is not None else "MISS",
|
|
650
|
+
mime or "MISS",
|
|
651
|
+
asset_id in self._content_pages_processed,
|
|
652
|
+
)
|
|
653
|
+
|
|
631
654
|
if raw_bytes is not None:
|
|
632
655
|
sampling = self.config.sampling
|
|
633
656
|
batch_size = int(sampling.rows_per_page or 100)
|
|
634
657
|
include_col_names = bool(
|
|
635
658
|
sampling.include_column_names if sampling.include_column_names is not None else True
|
|
636
659
|
)
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
pages
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
660
|
+
file_name = self._file_name_for_asset_id(asset_id)
|
|
661
|
+
|
|
662
|
+
# Stream pages from a thread instead of materializing via list().
|
|
663
|
+
# For transcription this lets detectors start working on the first
|
|
664
|
+
# chunk while later chunks are still being transcribed.
|
|
665
|
+
loop = asyncio.get_running_loop()
|
|
666
|
+
queue: asyncio.Queue[str | None] = asyncio.Queue()
|
|
667
|
+
|
|
668
|
+
exc_info: list[BaseException | None] = [None]
|
|
669
|
+
|
|
670
|
+
page_count: int = 0
|
|
671
|
+
|
|
672
|
+
def _produce() -> None:
|
|
673
|
+
nonlocal page_count
|
|
674
|
+
try:
|
|
675
|
+
for page in self.iter_asset_pages(
|
|
676
|
+
raw_bytes,
|
|
677
|
+
mime,
|
|
678
|
+
batch_size,
|
|
679
|
+
include_col_names,
|
|
680
|
+
file_name=file_name,
|
|
681
|
+
):
|
|
682
|
+
loop.call_soon_threadsafe(queue.put_nowait, page)
|
|
683
|
+
page_count += 1
|
|
684
|
+
except BaseException as exc:
|
|
685
|
+
exc_info[0] = exc
|
|
686
|
+
finally:
|
|
687
|
+
loop.call_soon_threadsafe(queue.put_nowait, None)
|
|
688
|
+
|
|
689
|
+
task = loop.run_in_executor(None, _produce)
|
|
690
|
+
|
|
691
|
+
while True:
|
|
692
|
+
page = await queue.get()
|
|
693
|
+
if page is None:
|
|
694
|
+
break
|
|
695
|
+
yield "", page
|
|
696
|
+
|
|
697
|
+
await task
|
|
698
|
+
if exc_info[0] is not None:
|
|
699
|
+
raise exc_info[0] # type: ignore[misc]
|
|
700
|
+
|
|
701
|
+
logger.info(
|
|
702
|
+
"fetch_content_pages(%s): streamed %d page(s) from %s",
|
|
703
|
+
asset_id,
|
|
704
|
+
page_count,
|
|
705
|
+
file_name,
|
|
648
706
|
)
|
|
649
|
-
|
|
650
|
-
|
|
707
|
+
|
|
708
|
+
self._content_pages_processed.add(asset_id)
|
|
651
709
|
return
|
|
652
710
|
|
|
653
711
|
result = await self.fetch_content(asset_id)
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Cgroup-aware CPU and memory introspection.
|
|
2
|
+
|
|
3
|
+
Shared by the detector worker pool (to size the process pool) and the
|
|
4
|
+
transcription pipeline (to select the right Whisper model at runtime).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_effective_cpu_count() -> int:
    """Return usable CPUs, respecting cgroup limits (K8s / Docker).

    ``os.cpu_count()`` returns the *host* count, which is usually much larger
    than the container's CPU quota. Sources are tried in order:

    1. cgroup v2 ``/sys/fs/cgroup/cpu.max`` (``"<quota> <period>"``; a quota
       of ``"max"`` means unlimited and is skipped).
    2. cgroup v1 ``cpu.cfs_quota_us`` / ``cpu.cfs_period_us``.
    3. ``os.cpu_count()``, or ``4`` when that returns nothing usable.

    A quota worth less than half a CPU is ignored and the next source is
    tried. Unreadable or malformed files are treated as "no limit found".

    Returns:
        The CPU allocation rounded down, never less than 1.
    """
    # cgroup v2: single file holding "<quota> <period>".
    try:
        with open("/sys/fs/cgroup/cpu.max") as fh:
            quota_str, period_str = fh.read().strip().split()
        if quota_str != "max":
            cpus = int(quota_str) / int(period_str)
            if cpus >= 0.5:
                return max(1, int(cpus))
    except (OSError, ValueError, ZeroDivisionError):
        # ZeroDivisionError: a "0" period would otherwise crash the caller.
        pass

    # cgroup v1: quota and period live in separate files.
    try:
        with open("/sys/fs/cgroup/cpu/cpu.cfs_quota_us") as fh:
            quota = int(fh.read().strip())
        with open("/sys/fs/cgroup/cpu/cpu.cfs_period_us") as fh:
            period = int(fh.read().strip())
        if quota > 0 and period > 0:
            cpus = quota / period
            if cpus >= 0.5:
                return max(1, int(cpus))
    except (OSError, ValueError):
        pass

    return os.cpu_count() or 4
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_effective_memory_mb() -> int:
    """Return usable memory in MB, respecting cgroup limits (K8s / Docker).

    Sources are tried in order: cgroup v2 ``memory.max``, cgroup v1
    ``memory.limit_in_bytes``, then ``MemTotal`` from ``/proc/meminfo``.
    A cgroup value of ``2**50`` bytes or more is treated as "no limit" and
    the next source is tried. Unreadable or malformed files are skipped.

    Returns:
        The memory limit in MB, never less than 256; ``4096`` when no source
        yields a value.
    """
    cgroup_limit_files = (
        "/sys/fs/cgroup/memory.max",  # cgroup v2
        "/sys/fs/cgroup/memory/memory.limit_in_bytes",  # cgroup v1
    )
    for limit_file in cgroup_limit_files:
        try:
            with open(limit_file) as fh:
                mem_bytes = int(fh.read().strip())
        except (OSError, ValueError):
            # Missing file, or a non-numeric value such as "max".
            continue
        if mem_bytes < 2**50:
            return max(256, mem_bytes // (1024 * 1024))

    try:
        with open("/proc/meminfo") as fh:
            for line in fh:
                if line.startswith("MemTotal:"):
                    # The value is divided by 1024 to obtain MB.
                    return max(256, int(line.split()[1]) // 1024)
    except (OSError, ValueError, IndexError):
        # IndexError: a "MemTotal:" line with no value field.
        pass

    return 4096
|