classifyre-cli 0.4.30__tar.gz → 0.4.32__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.32/.turbo/turbo-build.log +3 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/PKG-INFO +1 -1
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/package.json +1 -1
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/pyproject.toml +1 -1
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/main.py +6 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/models/generated_input.py +4 -3
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/outputs/rest.py +12 -1
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/base.py +120 -1
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/confluence/source.py +14 -8
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/databricks/source.py +5 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/email/source.py +62 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/jira/source.py +18 -12
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/mongodb/source.py +8 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/neo4j/source.py +8 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/notion/source.py +12 -6
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/object_storage/base.py +5 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/powerbi/source.py +17 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/recipe_normalizer.py +2 -2
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/servicedesk/source.py +5 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/slack/source.py +59 -30
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/snowflake/source.py +9 -1
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/tableau/source.py +17 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/tabular_base.py +124 -1
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/wordpress/source.py +70 -1
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/youtube/source.py +3 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_outputs.py +27 -0
- classifyre_cli-0.4.32/tests/test_sampling_automatic.py +143 -0
- classifyre_cli-0.4.32/tests/test_tabular_automatic_sampling.py +188 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/uv.lock +79 -45
- classifyre_cli-0.4.30/.turbo/turbo-build.log +0 -3
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/.gitignore +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/.python-version +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/README.md +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/main.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/scripts/generate_models.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/config.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/base.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/broken_links/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/broken_links/detector.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/config.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/custom/detector.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/custom/extractor.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/custom/runners/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/custom/runners/_base.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/custom/runners/_factory.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/custom/runners/_feature_extraction.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/custom/runners/_gliner2.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/custom/runners/_image_classification.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/custom/runners/_llm.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/custom/runners/_object_detection.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/custom/runners/_regex.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/custom/runners/_text_classification.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/custom/trainer.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/dependencies.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/pii/detector.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/secrets/detector.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/threat/code_security_detector.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/detectors/threat/yara_detector.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/models/generated_detectors.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/models/generated_single_asset_scan_results.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/outputs/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/outputs/base.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/outputs/console.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/outputs/factory.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/outputs/file.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/pipeline/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/pipeline/content_provider.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/pipeline/detector_pipeline.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/pipeline/parsed_content_provider.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/pipeline/worker_pool.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sandbox/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sandbox/runner.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/asset_metadata.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/atlassian_common.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/azure_blob_storage/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/azure_blob_storage/source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/confluence/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/databricks/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/dependencies.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/email/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/google_cloud_storage/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/google_cloud_storage/source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/hive/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/hive/source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/jira/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/mongodb/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/mssql/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/mssql/source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/mysql/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/mysql/source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/neo4j/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/notion/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/notion/client.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/oracle/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/oracle/source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/postgresql/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/postgresql/source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/powerbi/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/s3_compatible_storage/README.md +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/s3_compatible_storage/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/s3_compatible_storage/source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/servicedesk/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/slack/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/snowflake/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/sqlite/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/sqlite/source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/tableau/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/tabular_utils.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/wordpress/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/sources/youtube/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/telemetry.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/utils/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/utils/content_extraction.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/utils/dependency_groups.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/utils/embedded_images.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/utils/file_metadata.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/utils/file_parser.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/utils/file_to_images.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/utils/hashing.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/utils/resources.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/utils/transcription.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/utils/uv_sync.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/src/utils/validation.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/conftest.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/broken_links/test_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/conftest.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/custom/conftest.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/custom/test_invoice_extraction.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/custom/test_llm_runner.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/custom/test_pipeline_integration.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/custom/test_regex_runner.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/custom/test_transformer_runners.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/pii/conftest.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/pii/sample_invoice.pdf +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/pii/test_pii_detector.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/secrets/test_secrets_detector.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/test_base_detector.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/test_detector_catalog_commercial.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/test_detector_pipeline_types.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/test_detector_schema_examples.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/test_detector_types.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/test_phase2_detectors.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/test_registry.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/threat/test_code_security_detector.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/detectors/threat/test_yara_detector.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/integration/test_wordpress_links_assets.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/pipeline/test_detector_pipeline.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/pipeline/test_worker_pool.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_assets_metadata_catalog.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_azure_blob_storage_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_base_source_attachment.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_base_source_sampling.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_config.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_confluence_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_custom_extractor.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_databricks_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_dependency_groups.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_email_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_google_cloud_storage_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_hashing.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_hive_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_jira_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_mongodb_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_mssql_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_mysql_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_neo4j_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_notion_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_oracle_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_postgresql_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_powerbi_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_s3_compatible_storage_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_sandbox_runner.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_servicedesk_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_slack_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_snowflake_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_source_dependency_groups.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_sqlite_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_tableau_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_tabular_utils.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_uv_sync.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_wordpress_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_youtube_source.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/test_youtube_source_integration.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/utils/test_content_extraction.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/utils/test_embedded_images.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/utils/test_file_metadata.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/utils/test_file_parser.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/utils/test_file_to_images.py +0 -0
- {classifyre_cli-0.4.30 → classifyre_cli-0.4.32}/tests/utils/test_transcription.py +0 -0
|
@@ -366,6 +366,12 @@ async def run_command_async(args: argparse.Namespace, recipe: dict[str, Any]) ->
|
|
|
366
366
|
len(all_stubs),
|
|
367
367
|
)
|
|
368
368
|
|
|
369
|
+
# Persist the advanced AUTOMATIC sampling cursor (no-op for
|
|
370
|
+
# other strategies, which return None). Only on the normal
|
|
371
|
+
# completion path — a timed-out run must not advance it.
|
|
372
|
+
if hasattr(sink, "set_sampling_cursor"):
|
|
373
|
+
sink.set_sampling_cursor(source.current_sampling_cursor())
|
|
374
|
+
|
|
369
375
|
await sink.finish()
|
|
370
376
|
logger.info(
|
|
371
377
|
"Extraction completed: %s assets in %s batches",
|
|
@@ -85,9 +85,10 @@ class SlackChannelType(StrEnum):
|
|
|
85
85
|
|
|
86
86
|
class SamplingStrategy(StrEnum):
|
|
87
87
|
"""
|
|
88
|
-
Sampling strategy
|
|
88
|
+
Sampling strategy. AUTOMATIC (recommended default) incrementally ingests a new slice of not-yet-seen data on every run, remembering its position between runs and wrapping around to re-scan from the start once everything has been covered — eventually ingesting everything at a bounded cost per run. RANDOM samples items randomly. LATEST prioritises the most recently modified/created items. ALL scans every item with no limit.
|
|
89
89
|
"""
|
|
90
90
|
|
|
91
|
+
AUTOMATIC = 'AUTOMATIC'
|
|
91
92
|
RANDOM = 'RANDOM'
|
|
92
93
|
LATEST = 'LATEST'
|
|
93
94
|
ALL = 'ALL'
|
|
@@ -95,7 +96,7 @@ class SamplingStrategy(StrEnum):
|
|
|
95
96
|
|
|
96
97
|
class SamplingConfig(BaseModel):
|
|
97
98
|
"""
|
|
98
|
-
Controls how content is extracted from each source. For tabular sources rows_per_page controls both sample size for RANDOM/LATEST and pagination batch size for ALL.
|
|
99
|
+
Controls how content is extracted from each source. For tabular sources rows_per_page controls both sample size for AUTOMATIC/RANDOM/LATEST and pagination batch size for ALL.
|
|
99
100
|
"""
|
|
100
101
|
|
|
101
102
|
model_config = ConfigDict(
|
|
@@ -124,7 +125,7 @@ class SamplingConfig(BaseModel):
|
|
|
124
125
|
)
|
|
125
126
|
rows_per_page: int | None = Field(
|
|
126
127
|
100,
|
|
127
|
-
description='Tabular sources only. Number of rows per sample (RANDOM/LATEST) or per pagination batch (ALL). Controls memory usage during large table scans.',
|
|
128
|
+
description='Tabular sources only. Number of rows per sample (AUTOMATIC/RANDOM/LATEST) or per pagination batch (ALL). For AUTOMATIC this is the size of the incremental slice ingested each run. Controls memory usage during large table scans.',
|
|
128
129
|
ge=10,
|
|
129
130
|
le=10000,
|
|
130
131
|
)
|
|
@@ -127,6 +127,11 @@ class FinalizeIngestRunRequest(BaseModel):
|
|
|
127
127
|
|
|
128
128
|
runner_id: str = Field(serialization_alias="runnerId")
|
|
129
129
|
seen_hashes: list[str] = Field(serialization_alias="seenHashes")
|
|
130
|
+
# AUTOMATIC sampling cursor to persist on the source for the next run.
|
|
131
|
+
# Omitted (None) for other strategies so the stored cursor is left untouched.
|
|
132
|
+
sampling_cursor: dict[str, Any] | None = Field(
|
|
133
|
+
None, serialization_alias="samplingCursor"
|
|
134
|
+
)
|
|
130
135
|
|
|
131
136
|
|
|
132
137
|
class UpdateRunnerStatusRequest(BaseModel):
|
|
@@ -165,6 +170,11 @@ class RestOutputSink:
|
|
|
165
170
|
self.session.mount("https://", adapter)
|
|
166
171
|
self._runner_id = context.runner_id
|
|
167
172
|
self._seen_hashes: set[str] = set()
|
|
173
|
+
self._sampling_cursor: dict[str, Any] | None = None
|
|
174
|
+
|
|
175
|
+
def set_sampling_cursor(self, cursor: dict[str, Any] | None) -> None:
|
|
176
|
+
"""Record the AUTOMATIC sampling cursor to persist on finalize."""
|
|
177
|
+
self._sampling_cursor = cursor
|
|
168
178
|
|
|
169
179
|
async def start(self) -> None:
|
|
170
180
|
if not self.context.source_id:
|
|
@@ -244,11 +254,12 @@ class RestOutputSink:
|
|
|
244
254
|
payload = FinalizeIngestRunRequest(
|
|
245
255
|
runner_id=runner_id,
|
|
246
256
|
seen_hashes=sorted(self._seen_hashes),
|
|
257
|
+
sampling_cursor=self._sampling_cursor,
|
|
247
258
|
)
|
|
248
259
|
self._request_json(
|
|
249
260
|
"POST",
|
|
250
261
|
f"/sources/{source_id}/assets/finalize",
|
|
251
|
-
payload.model_dump(mode="json", by_alias=True),
|
|
262
|
+
payload.model_dump(mode="json", by_alias=True, exclude_none=True),
|
|
252
263
|
)
|
|
253
264
|
|
|
254
265
|
status_payload = UpdateRunnerStatusRequest(status="COMPLETED")
|
|
@@ -1,7 +1,11 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
1
4
|
import os
|
|
5
|
+
import threading
|
|
2
6
|
from abc import ABC, abstractmethod
|
|
3
7
|
from collections.abc import AsyncGenerator, Generator
|
|
4
|
-
from typing import TYPE_CHECKING, Any
|
|
8
|
+
from typing import TYPE_CHECKING, Any, TypeVar
|
|
5
9
|
|
|
6
10
|
from ..models.generated_single_asset_scan_results import DetectionResult, SingleAssetScanResults
|
|
7
11
|
from ..outputs.rest import IngestEdge
|
|
@@ -12,6 +16,10 @@ from ..utils.hashing import calculate_checksum, normalize_http_url
|
|
|
12
16
|
from ..utils.validation import validate_output
|
|
13
17
|
from .recipe_normalizer import normalize_source_recipe
|
|
14
18
|
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
_T = TypeVar("_T")
|
|
22
|
+
|
|
15
23
|
|
|
16
24
|
class BaseSource(ABC):
|
|
17
25
|
"""
|
|
@@ -26,6 +34,10 @@ class BaseSource(ABC):
|
|
|
26
34
|
# Default batch size for streaming asset results
|
|
27
35
|
BATCH_SIZE: int = 50
|
|
28
36
|
HAS_SUCCESSFUL_RUN_ENV = "CLASSIFYRE_SOURCE_HAS_SUCCESSFUL_RUN"
|
|
37
|
+
# The API injects the saved AUTOMATIC sampling cursor here (base64-encoded
|
|
38
|
+
# JSON) before launching the CLI job. The recipe itself cannot carry it
|
|
39
|
+
# because every source schema sets ``additionalProperties: false``.
|
|
40
|
+
SAMPLING_CURSOR_ENV = "CLASSIFYRE_SAMPLING_CURSOR"
|
|
29
41
|
|
|
30
42
|
def __init__(
|
|
31
43
|
self,
|
|
@@ -42,6 +54,11 @@ class BaseSource(ABC):
|
|
|
42
54
|
runner_id: Optional runner ID (for API runs)
|
|
43
55
|
"""
|
|
44
56
|
normalized_recipe = normalize_source_recipe(recipe, recipe.get("type"))
|
|
57
|
+
# Cursor carried over from the previous run (AUTOMATIC strategy). Read
|
|
58
|
+
# before the override hook so subclasses can consult it there if needed.
|
|
59
|
+
self._sampling_cursor: dict[str, Any] = self._load_sampling_cursor()
|
|
60
|
+
self._next_sampling_cursor: dict[str, Any] | None = None
|
|
61
|
+
self._sampling_cursor_lock = threading.Lock()
|
|
45
62
|
self._apply_initial_sampling_override(normalized_recipe)
|
|
46
63
|
recipe.clear()
|
|
47
64
|
recipe.update(normalized_recipe)
|
|
@@ -55,6 +72,108 @@ class BaseSource(ABC):
|
|
|
55
72
|
def _apply_initial_sampling_override(self, recipe: dict[str, Any]) -> None:
|
|
56
73
|
pass
|
|
57
74
|
|
|
75
|
+
# ── AUTOMATIC sampling cursor ────────────────────────────────────────
|
|
76
|
+
#
|
|
77
|
+
# AUTOMATIC sampling keeps a small, opaque, source-defined cursor in the
|
|
78
|
+
# API between runs. Each run reads the prior cursor (``sampling_cursor``),
|
|
79
|
+
# ingests the next slice of not-yet-seen data, then records the advanced
|
|
80
|
+
# cursor (``set_next_sampling_cursor``). The output sink persists it back to
|
|
81
|
+
# the API on finalize via ``current_sampling_cursor``. When a source has
|
|
82
|
+
# ingested everything it should reset the cursor so the next run wraps
|
|
83
|
+
# around and re-ingests from the start (data is not stale).
|
|
84
|
+
|
|
85
|
+
def _load_sampling_cursor(self) -> dict[str, Any]:
|
|
86
|
+
raw = os.environ.get(self.SAMPLING_CURSOR_ENV)
|
|
87
|
+
if not raw:
|
|
88
|
+
return {}
|
|
89
|
+
try:
|
|
90
|
+
decoded = base64.b64decode(raw).decode("utf-8")
|
|
91
|
+
data = json.loads(decoded)
|
|
92
|
+
except Exception as exc:
|
|
93
|
+
logger.warning("Ignoring malformed %s: %s", self.SAMPLING_CURSOR_ENV, exc)
|
|
94
|
+
return {}
|
|
95
|
+
return data if isinstance(data, dict) else {}
|
|
96
|
+
|
|
97
|
+
def sampling_cursor(self) -> dict[str, Any]:
|
|
98
|
+
"""Return the cursor saved by the previous run (empty on first run)."""
|
|
99
|
+
return self._sampling_cursor
|
|
100
|
+
|
|
101
|
+
def set_next_sampling_cursor(self, cursor: dict[str, Any]) -> None:
|
|
102
|
+
"""Record the advanced cursor to persist at the end of this run."""
|
|
103
|
+
self._next_sampling_cursor = cursor
|
|
104
|
+
|
|
105
|
+
def current_sampling_cursor(self) -> dict[str, Any] | None:
|
|
106
|
+
"""Cursor to persist for the next run, or None to leave it unchanged.
|
|
107
|
+
|
|
108
|
+
Returns None unless this run advanced the cursor (i.e. AUTOMATIC
|
|
109
|
+
sampling actually ran), so non-AUTOMATIC runs never touch the stored
|
|
110
|
+
cursor.
|
|
111
|
+
"""
|
|
112
|
+
return self._next_sampling_cursor
|
|
113
|
+
|
|
114
|
+
def sampling_window_size(self, default: int = 100) -> int:
|
|
115
|
+
"""The per-run AUTOMATIC slice size (``rows_per_page``)."""
|
|
116
|
+
config = getattr(self, "config", None)
|
|
117
|
+
sampling = getattr(config, "sampling", None) if config is not None else None
|
|
118
|
+
size = getattr(sampling, "rows_per_page", None)
|
|
119
|
+
try:
|
|
120
|
+
return int(size) if size else default
|
|
121
|
+
except (TypeError, ValueError):
|
|
122
|
+
return default
|
|
123
|
+
|
|
124
|
+
def _record_cursor_key(self, key: str, value: Any) -> None:
|
|
125
|
+
"""Thread-safely set ``key`` in the cursor to persist for the next run."""
|
|
126
|
+
with self._sampling_cursor_lock:
|
|
127
|
+
nxt = self._next_sampling_cursor if isinstance(self._next_sampling_cursor, dict) else {}
|
|
128
|
+
nxt = {**nxt, key: value}
|
|
129
|
+
self._next_sampling_cursor = nxt
|
|
130
|
+
|
|
131
|
+
def automatic_offset(self, key: str) -> int:
|
|
132
|
+
"""Return the saved offset for a keyed AUTOMATIC DB cursor (0 on first run)."""
|
|
133
|
+
saved = self._sampling_cursor.get(key)
|
|
134
|
+
return saved if isinstance(saved, int) and saved >= 0 else 0
|
|
135
|
+
|
|
136
|
+
def record_automatic_offset(
|
|
137
|
+
self, key: str, *, prev_offset: int, fetched: int
|
|
138
|
+
) -> None:
|
|
139
|
+
"""Advance a keyed offset cursor; wrap to 0 once a page underfills.
|
|
140
|
+
|
|
141
|
+
Used by sources that page rows directly from the backing store
|
|
142
|
+
(``skip``/``OFFSET``) rather than materialising a full list.
|
|
143
|
+
"""
|
|
144
|
+
size = self.sampling_window_size()
|
|
145
|
+
next_offset = 0 if fetched < size else prev_offset + fetched
|
|
146
|
+
self._record_cursor_key(key, next_offset)
|
|
147
|
+
|
|
148
|
+
def automatic_window(self, items: list[_T], *, key: str = "items") -> list[_T]:
|
|
149
|
+
"""Return the next AUTOMATIC slice of a stably-ordered in-memory list.
|
|
150
|
+
|
|
151
|
+
Non-tabular sources fetch a list of item references, then call this to
|
|
152
|
+
ingest only the next ``rows_per_page`` window. A per-``key`` offset is
|
|
153
|
+
remembered between runs and wraps back to the start once the list has
|
|
154
|
+
been fully covered (data is not stale, so re-ingesting is desired).
|
|
155
|
+
|
|
156
|
+
Callers must pass the items in a **stable order** across runs (e.g. by
|
|
157
|
+
id or timestamp) so the cursor stays meaningful.
|
|
158
|
+
"""
|
|
159
|
+
total = len(items)
|
|
160
|
+
if total == 0:
|
|
161
|
+
return []
|
|
162
|
+
|
|
163
|
+
saved = self._sampling_cursor.get(key)
|
|
164
|
+
offset = saved if isinstance(saved, int) and 0 <= saved < total else 0
|
|
165
|
+
|
|
166
|
+
size = self.sampling_window_size()
|
|
167
|
+
window = items[offset : offset + size]
|
|
168
|
+
|
|
169
|
+
next_offset = offset + len(window)
|
|
170
|
+
if next_offset >= total:
|
|
171
|
+
next_offset = 0 # wrap around on the next run
|
|
172
|
+
|
|
173
|
+
self._record_cursor_key(key, next_offset)
|
|
174
|
+
|
|
175
|
+
return window
|
|
176
|
+
|
|
58
177
|
@staticmethod
|
|
59
178
|
def _read_bool_env(name: str) -> bool | None:
|
|
60
179
|
raw = os.environ.get(name)
|
|
@@ -242,11 +242,24 @@ class ConfluenceSource(BaseSource):
|
|
|
242
242
|
params["labels"] = ",".join(str(v) for v in spaces_filter.labels)
|
|
243
243
|
return self.client.iter_confluence_results("/wiki/api/v2/spaces", params=params)
|
|
244
244
|
|
|
245
|
+
def _sorted_page_refs(self, refs: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
|
246
|
+
return sorted(
|
|
247
|
+
refs,
|
|
248
|
+
key=lambda ref: parse_datetime(
|
|
249
|
+
str(ref.get("version_created_at") or ref.get("created_at") or "")
|
|
250
|
+
),
|
|
251
|
+
reverse=True,
|
|
252
|
+
)
|
|
253
|
+
|
|
245
254
|
def _sample_page_refs(self, refs: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
|
246
255
|
sampling = self.config.sampling
|
|
247
256
|
if sampling.strategy == SamplingStrategy.ALL:
|
|
248
257
|
return refs
|
|
249
258
|
|
|
259
|
+
if sampling.strategy == SamplingStrategy.AUTOMATIC:
|
|
260
|
+
# Newest-first stable order; window advances each run and wraps around.
|
|
261
|
+
return self.automatic_window(self._sorted_page_refs(refs), key="pages")
|
|
262
|
+
|
|
250
263
|
limit = int(sampling.rows_per_page or 100)
|
|
251
264
|
if limit >= len(refs):
|
|
252
265
|
return refs
|
|
@@ -254,14 +267,7 @@ class ConfluenceSource(BaseSource):
|
|
|
254
267
|
if sampling.strategy == SamplingStrategy.RANDOM:
|
|
255
268
|
return deterministic_sample(refs, limit)
|
|
256
269
|
|
|
257
|
-
refs_sorted = sorted(
|
|
258
|
-
refs,
|
|
259
|
-
key=lambda ref: parse_datetime(
|
|
260
|
-
str(ref.get("version_created_at") or ref.get("created_at") or "")
|
|
261
|
-
),
|
|
262
|
-
reverse=True,
|
|
263
|
-
)
|
|
264
|
-
return refs_sorted[:limit]
|
|
270
|
+
return self._sorted_page_refs(refs)[:limit]
|
|
265
271
|
|
|
266
272
|
def _extract_page_assets(self, ref: dict[str, Any]) -> list[SingleAssetScanResults]:
|
|
267
273
|
page_id = str(ref["page_id"])
|
|
@@ -423,6 +423,11 @@ class DatabricksSource(BaseTabularSource):
|
|
|
423
423
|
return value.isoformat()
|
|
424
424
|
return str(value)
|
|
425
425
|
|
|
426
|
+
def _automatic_supports_keyset(self) -> bool:
|
|
427
|
+
# Databricks builds inline (parameter-less) queries; AUTOMATIC uses OFFSET
|
|
428
|
+
# paging through _fetch_one_page rather than keyset WHERE clauses.
|
|
429
|
+
return False
|
|
430
|
+
|
|
426
431
|
# ── Databricks pagination (inline LIMIT/OFFSET) ──────────────────────
|
|
427
432
|
|
|
428
433
|
def _fetch_one_page(
|
|
@@ -195,6 +195,11 @@ class EmailSource(BaseSource):
|
|
|
195
195
|
total = 0
|
|
196
196
|
|
|
197
197
|
try:
|
|
198
|
+
if strategy == SamplingStrategy.AUTOMATIC:
|
|
199
|
+
async for batch in self._extract_automatic(mod, criteria):
|
|
200
|
+
yield batch
|
|
201
|
+
return
|
|
202
|
+
|
|
198
203
|
for folder in self.folders:
|
|
199
204
|
if self._aborted or (limit is not None and total >= limit):
|
|
200
205
|
break
|
|
@@ -236,6 +241,63 @@ class EmailSource(BaseSource):
|
|
|
236
241
|
finally:
|
|
237
242
|
logger.info("Extracted %s email messages", total)
|
|
238
243
|
|
|
244
|
+
async def _extract_automatic(
    self, mod: Any, criteria: Any
) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """AUTOMATIC sampling: page through each folder's messages by UID.

    Listing UIDs is cheap (no body fetch); we window the UID list (newest
    first) so each run ingests the next ``rows_per_page`` slice per folder
    and wraps around once the folder has been fully covered.

    Args:
        mod: Object exposing ``AND(uid=...)`` for building the fetch
            criteria (presumably the IMAP helper module - confirm at caller).
        criteria: Search criteria passed unchanged to ``self._mailbox.uids``.

    Yields:
        Lists of assets holding exactly ``self.BATCH_SIZE`` items, followed
        by one final shorter list when a remainder is left over.
    """
    # NOTE(review): indentation was lost in the diff this block was recovered
    # from. The nesting of the batch-flush ``while`` and of ``total += 1``
    # below is reconstructed from the log text ("email messages") - confirm
    # against the packaged source.
    pending: list[SingleAssetScanResults] = []
    total = 0
    for folder in self.folders:
        if self._aborted:
            break
        try:
            self._mailbox.folder.set(folder)
        except Exception as e:
            # Best effort: a folder that cannot be selected is skipped.
            logger.warning("Skipping folder %s: %s", folder, e)
            continue

        try:
            # Highest UID first, so the window starts at the newest messages.
            uid_ints = sorted((int(u) for u in self._mailbox.uids(criteria)), reverse=True)
        except Exception as e:
            logger.warning("Could not list UIDs for folder %s: %s", folder, e)
            continue
        if not uid_ints:
            continue

        # One cursor per folder: the key carries the folder name.
        window = self.automatic_window([str(u) for u in uid_ints], key=f"folder:{folder}")
        if not window:
            continue

        # Fetch only the messages whose UIDs fall inside this run's window.
        for msg in self._mailbox.fetch(
            mod.AND(uid=",".join(window)),
            mark_seen=False,
            bulk=self.BATCH_SIZE,
        ):
            if self._aborted:
                break
            try:
                assets = self._message_to_assets(msg, folder)
            except Exception as e:
                # A message that fails to transform is logged and skipped.
                logger.error(
                    "Failed to transform message uid=%s: %s", getattr(msg, "uid", "?"), e
                )
                continue
            for asset in assets:
                pending.append(asset)
                # Flush full batches as soon as enough assets have accumulated.
                while len(pending) >= self.BATCH_SIZE:
                    yield pending[: self.BATCH_SIZE]
                    pending = pending[self.BATCH_SIZE :]
            total += 1

    # Emit whatever is left after the last full batch.
    if pending:
        yield pending
    logger.info("Extracted %s email messages (AUTOMATIC)", total)
|
|
300
|
+
|
|
239
301
|
def _message_to_assets(self, msg: Any, folder: str) -> list[SingleAssetScanResults]:
|
|
240
302
|
message_id = self._message_id(msg, folder)
|
|
241
303
|
email_hash = self.generate_hash_id(message_id)
|
|
@@ -193,11 +193,28 @@ class JiraSource(BaseSource):
|
|
|
193
193
|
return f"{query} ORDER BY updated DESC"
|
|
194
194
|
return query
|
|
195
195
|
|
|
196
|
+
def _sorted_issues(self, issues: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Return a new list of issues in descending ``fields.updated`` order.

    The sort key is ``parse_datetime`` applied to the issue's
    ``fields["updated"]`` value. An issue whose ``fields`` entry is not a
    dict, or whose ``updated`` value is missing or empty, is keyed on the
    empty string, the same placeholder the other sources pass for a missing
    timestamp. The input list is not modified.
    """

    def _updated_key(issue: dict[str, Any]) -> Any:
        fields = issue.get("fields")
        raw = fields.get("updated") if isinstance(fields, dict) else None
        # ``or ""`` keeps a missing value from being stringified to the
        # literal text "None" before it reaches parse_datetime.
        return parse_datetime(str(raw or ""))

    return sorted(issues, key=_updated_key, reverse=True)
|
|
208
|
+
|
|
196
209
|
def _sample_issues(self, issues: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
|
197
210
|
sampling = self.config.sampling
|
|
198
211
|
if sampling.strategy == SamplingStrategy.ALL:
|
|
199
212
|
return issues
|
|
200
213
|
|
|
214
|
+
if sampling.strategy == SamplingStrategy.AUTOMATIC:
|
|
215
|
+
# Newest-first stable order; window advances each run and wraps around.
|
|
216
|
+
return self.automatic_window(self._sorted_issues(issues), key="issues")
|
|
217
|
+
|
|
201
218
|
limit = int(sampling.rows_per_page or 100)
|
|
202
219
|
if limit >= len(issues):
|
|
203
220
|
return issues
|
|
@@ -205,18 +222,7 @@ class JiraSource(BaseSource):
|
|
|
205
222
|
if sampling.strategy == SamplingStrategy.RANDOM:
|
|
206
223
|
return deterministic_sample(issues, limit)
|
|
207
224
|
|
|
208
|
-
sorted_issues = sorted(
|
|
209
|
-
issues,
|
|
210
|
-
key=lambda issue: parse_datetime(
|
|
211
|
-
str(
|
|
212
|
-
issue.get("fields", {}).get("updated")
|
|
213
|
-
if isinstance(issue.get("fields"), dict)
|
|
214
|
-
else ""
|
|
215
|
-
)
|
|
216
|
-
),
|
|
217
|
-
reverse=True,
|
|
218
|
-
)
|
|
219
|
-
return sorted_issues[:limit]
|
|
225
|
+
return self._sorted_issues(issues)[:limit]
|
|
220
226
|
|
|
221
227
|
def _extract_issue_assets(self, issue: dict[str, Any]) -> list[SingleAssetScanResults]:
|
|
222
228
|
fields = issue.get("fields", {})
|
|
@@ -407,6 +407,14 @@ class MongoDBSource(BaseSource):
|
|
|
407
407
|
if strategy == SamplingStrategy.ALL:
|
|
408
408
|
return list(collection.find({}).limit(rows_per_page))
|
|
409
409
|
|
|
410
|
+
if strategy == SamplingStrategy.AUTOMATIC:
|
|
411
|
+
# Page forward through the collection each run; wrap when exhausted.
|
|
412
|
+
key = f"collection:{collection_ref.database}.{collection_ref.collection}"
|
|
413
|
+
offset = self.automatic_offset(key)
|
|
414
|
+
documents = list(collection.find({}).skip(offset).limit(rows_per_page))
|
|
415
|
+
self.record_automatic_offset(key, prev_offset=offset, fetched=len(documents))
|
|
416
|
+
return documents
|
|
417
|
+
|
|
410
418
|
if strategy == SamplingStrategy.RANDOM:
|
|
411
419
|
return self._sample_random_documents(collection, rows_per_page)
|
|
412
420
|
|
|
@@ -392,6 +392,14 @@ class Neo4jSource(BaseSource):
|
|
|
392
392
|
strategy = sampling.strategy
|
|
393
393
|
rows = int(sampling.rows_per_page or 100)
|
|
394
394
|
|
|
395
|
+
if strategy == SamplingStrategy.AUTOMATIC:
|
|
396
|
+
# Page forward through this label's nodes each run; wrap when exhausted.
|
|
397
|
+
key = f"label:{ref.label}"
|
|
398
|
+
offset = self.automatic_offset(key)
|
|
399
|
+
page = self._fetch_nodes_page(ref, skip=offset, limit=rows)
|
|
400
|
+
self.record_automatic_offset(key, prev_offset=offset, fetched=len(page))
|
|
401
|
+
return page
|
|
402
|
+
|
|
395
403
|
if strategy == SamplingStrategy.RANDOM:
|
|
396
404
|
cypher = (
|
|
397
405
|
f"MATCH (n:{_escape_label(ref.label)}) "
|
|
@@ -338,11 +338,22 @@ class NotionSource(BaseSource):
|
|
|
338
338
|
"edited": obj.get("last_edited_time") or obj.get("created_time") or "",
|
|
339
339
|
}
|
|
340
340
|
|
|
341
|
+
def _sorted_refs(self, refs: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Return a new list of refs in descending ``edited`` order.

    Each ref's ``edited`` value (an empty string when missing or falsy) is
    stringified and passed to ``parse_datetime``; the result is the sort key.
    The input list is not modified.
    """

    def _edited_at(item: dict[str, Any]) -> Any:
        raw = item.get("edited") or ""
        return parse_datetime(str(raw))

    ordered = list(refs)
    # Descending sort: the largest parsed timestamp ends up first.
    ordered.sort(key=_edited_at, reverse=True)
    return ordered
|
|
347
|
+
|
|
341
348
|
def _sample_refs(self, refs: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
|
342
349
|
sampling = self.config.sampling
|
|
343
350
|
if sampling.strategy == SamplingStrategy.ALL:
|
|
344
351
|
return refs
|
|
345
352
|
|
|
353
|
+
if sampling.strategy == SamplingStrategy.AUTOMATIC:
|
|
354
|
+
# Newest-first stable order; window advances each run and wraps around.
|
|
355
|
+
return self.automatic_window(self._sorted_refs(refs), key="refs")
|
|
356
|
+
|
|
346
357
|
limit = int(sampling.rows_per_page or 100)
|
|
347
358
|
if limit >= len(refs):
|
|
348
359
|
return refs
|
|
@@ -350,12 +361,7 @@ class NotionSource(BaseSource):
|
|
|
350
361
|
if sampling.strategy == SamplingStrategy.RANDOM:
|
|
351
362
|
return deterministic_sample(refs, limit)
|
|
352
363
|
|
|
353
|
-
refs_sorted = sorted(
|
|
354
|
-
refs,
|
|
355
|
-
key=lambda ref: parse_datetime(str(ref.get("edited") or "")),
|
|
356
|
-
reverse=True,
|
|
357
|
-
)
|
|
358
|
-
return refs_sorted[:limit]
|
|
364
|
+
return self._sorted_refs(refs)[:limit]
|
|
359
365
|
|
|
360
366
|
# ------------------------------------------------------------------- pages
|
|
361
367
|
def _extract_page_assets(self, page: dict[str, Any]) -> list[SingleAssetScanResults]:
|
|
@@ -271,6 +271,11 @@ class ObjectStorageSourceBase(BaseSource, ABC):
|
|
|
271
271
|
|
|
272
272
|
materialized = list(refs)
|
|
273
273
|
|
|
274
|
+
if strategy == SamplingStrategy.AUTOMATIC:
|
|
275
|
+
# Newest-first stable order; window advances each run and wraps around.
|
|
276
|
+
materialized.sort(key=lambda ref: ref.last_modified, reverse=True)
|
|
277
|
+
return self.automatic_window(materialized, key="objects")
|
|
278
|
+
|
|
274
279
|
if strategy == SamplingStrategy.RANDOM:
|
|
275
280
|
if limit >= len(materialized):
|
|
276
281
|
return materialized
|
|
@@ -569,11 +569,28 @@ class PowerBISource(BaseSource):
|
|
|
569
569
|
return parsed
|
|
570
570
|
return None
|
|
571
571
|
|
|
572
|
+
def _ordered_refs_for_automatic(
|
|
573
|
+
self, refs: list[PowerBIAssetRef], order_field: str
|
|
574
|
+
) -> list[PowerBIAssetRef]:
|
|
575
|
+
values = [self._sampling_sort_datetime(ref, order_field) for ref in refs]
|
|
576
|
+
scored: list[tuple[bool, datetime, PowerBIAssetRef]] = []
|
|
577
|
+
for ref, parsed in zip(refs, values, strict=False):
|
|
578
|
+
effective = parsed or ref.updated_at
|
|
579
|
+
scored.append((parsed is not None, effective, ref))
|
|
580
|
+
scored.sort(key=lambda item: (item[0], item[1]), reverse=True)
|
|
581
|
+
return [item[2] for item in scored]
|
|
582
|
+
|
|
572
583
|
def _sample_refs(self, refs: list[PowerBIAssetRef]) -> list[PowerBIAssetRef]:
|
|
573
584
|
sampling = self._sampling()
|
|
574
585
|
if sampling.strategy == SamplingStrategy.ALL:
|
|
575
586
|
return refs
|
|
576
587
|
|
|
588
|
+
if sampling.strategy == SamplingStrategy.AUTOMATIC:
|
|
589
|
+
# Newest-first stable order; window advances each run and wraps around.
|
|
590
|
+
order_field = sampling.order_by_column or "modifiedDateTime"
|
|
591
|
+
ordered = self._ordered_refs_for_automatic(refs, order_field)
|
|
592
|
+
return self.automatic_window(ordered, key="refs")
|
|
593
|
+
|
|
577
594
|
if sampling.strategy == SamplingStrategy.RANDOM:
|
|
578
595
|
limit = int(sampling.rows_per_page or 100)
|
|
579
596
|
if limit >= len(refs):
|
|
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
|
3
3
|
from copy import deepcopy
|
|
4
4
|
from typing import Any
|
|
5
5
|
|
|
6
|
-
_VALID_SAMPLING_STRATEGIES = {"RANDOM", "LATEST", "ALL"}
|
|
6
|
+
_VALID_SAMPLING_STRATEGIES = {"AUTOMATIC", "RANDOM", "LATEST", "ALL"}
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
def _as_dict(value: Any) -> dict[str, Any]:
|
|
@@ -130,7 +130,7 @@ def normalize_source_recipe(
|
|
|
130
130
|
_normalize_sampling_strategy(sampling.get("strategy")),
|
|
131
131
|
_normalize_sampling_strategy(optional_sampling.get("strategy")),
|
|
132
132
|
_normalize_sampling_strategy(optional_sampling.get("mode")),
|
|
133
|
-
"
|
|
133
|
+
"AUTOMATIC",
|
|
134
134
|
)
|
|
135
135
|
|
|
136
136
|
sampling["strategy"] = strategy
|
|
@@ -204,6 +204,11 @@ class ServiceDeskSource(BaseSource):
|
|
|
204
204
|
if sampling.strategy == SamplingStrategy.ALL:
|
|
205
205
|
return requests
|
|
206
206
|
|
|
207
|
+
if sampling.strategy == SamplingStrategy.AUTOMATIC:
|
|
208
|
+
# Newest-first stable order; window advances each run and wraps around.
|
|
209
|
+
sorted_requests = sorted(requests, key=self._request_sort_timestamp, reverse=True)
|
|
210
|
+
return self.automatic_window(sorted_requests, key="requests")
|
|
211
|
+
|
|
207
212
|
limit = int(sampling.rows_per_page or 100)
|
|
208
213
|
if limit >= len(requests):
|
|
209
214
|
return requests
|