classifyre-cli 0.4.11__tar.gz → 0.4.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/.turbo/turbo-build.log +1 -1
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/PKG-INFO +1 -1
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/package.json +1 -1
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/pyproject.toml +1 -1
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/scripts/generate_models.py +25 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/broken_links/detector.py +7 -10
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_llm.py +1 -1
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_text_classification.py +1 -1
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/models/generated_detectors.py +2 -2
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/models/generated_input.py +166 -4
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/models/generated_single_asset_scan_results.py +13 -1
- classifyre_cli-0.4.13/src/sources/asset_metadata.py +138 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/base.py +21 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/confluence/source.py +30 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/databricks/source.py +63 -8
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/jira/source.py +65 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/mongodb/source.py +18 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/mysql/source.py +19 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/neo4j/source.py +27 -0
- classifyre_cli-0.4.13/src/sources/notion/__init__.py +3 -0
- classifyre_cli-0.4.13/src/sources/notion/client.py +223 -0
- classifyre_cli-0.4.13/src/sources/notion/source.py +987 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/object_storage/base.py +23 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/postgresql/source.py +27 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/powerbi/source.py +27 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/servicedesk/source.py +29 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/slack/source.py +12 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/sqlite/source.py +10 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/tableau/source.py +19 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/tabular_base.py +106 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/wordpress/source.py +25 -0
- classifyre_cli-0.4.13/src/utils/file_metadata.py +236 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/broken_links/test_broken_links_detector.py +87 -0
- classifyre_cli-0.4.13/tests/test_assets_metadata_catalog.py +73 -0
- classifyre_cli-0.4.13/tests/test_notion_source.py +227 -0
- classifyre_cli-0.4.13/tests/utils/test_file_metadata.py +79 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/uv.lock +224 -153
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/.gitignore +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/.python-version +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/README.md +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/main.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/base.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/broken_links/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/config.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/detector.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/extractor.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_base.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_factory.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_feature_extraction.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_gliner2.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_image_classification.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_object_detection.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_regex.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/trainer.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/dependencies.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/pii/detector.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/secrets/detector.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/threat/code_security_detector.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/threat/yara_detector.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/main.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/outputs/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/outputs/base.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/outputs/console.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/outputs/factory.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/outputs/file.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/outputs/rest.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/pipeline/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/pipeline/content_provider.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/pipeline/detector_pipeline.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/pipeline/parsed_content_provider.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/pipeline/worker_pool.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sandbox/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sandbox/runner.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/atlassian_common.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/azure_blob_storage/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/azure_blob_storage/source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/confluence/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/databricks/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/dependencies.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/google_cloud_storage/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/google_cloud_storage/source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/hive/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/hive/source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/jira/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/mongodb/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/mssql/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/mssql/source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/mysql/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/neo4j/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/oracle/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/oracle/source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/postgresql/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/powerbi/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/s3_compatible_storage/README.md +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/s3_compatible_storage/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/s3_compatible_storage/source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/servicedesk/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/slack/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/snowflake/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/snowflake/source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/sqlite/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/tableau/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/tabular_utils.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/sources/wordpress/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/telemetry.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/utils/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/utils/content_extraction.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/utils/embedded_images.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/utils/file_parser.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/utils/file_to_images.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/utils/hashing.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/utils/uv_sync.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/utils/validation.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/conftest.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/conftest.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/custom/conftest.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/custom/test_invoice_extraction.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/custom/test_llm_runner.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/custom/test_pipeline_integration.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/custom/test_regex_runner.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/custom/test_transformer_runners.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/pii/conftest.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/pii/sample_invoice.pdf +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/pii/test_pii_detector.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/secrets/test_secrets_detector.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/test_base_detector.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/test_detector_catalog_commercial.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/test_detector_pipeline_types.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/test_detector_schema_examples.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/test_detector_types.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/test_phase2_detectors.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/test_registry.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/threat/test_code_security_detector.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/detectors/threat/test_yara_detector.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/integration/test_wordpress_links_assets.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/pipeline/test_detector_pipeline.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/pipeline/test_worker_pool.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_azure_blob_storage_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_base_source_attachment.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_base_source_sampling.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_confluence_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_custom_extractor.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_databricks_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_google_cloud_storage_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_hashing.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_hive_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_jira_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_mongodb_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_mssql_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_mysql_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_neo4j_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_oracle_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_outputs.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_postgresql_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_powerbi_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_s3_compatible_storage_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_sandbox_runner.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_servicedesk_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_slack_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_snowflake_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_source_dependency_groups.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_sqlite_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_tableau_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_tabular_utils.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/test_wordpress_source.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/utils/test_content_extraction.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/utils/test_embedded_images.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/utils/test_file_parser.py +0 -0
- {classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/tests/utils/test_file_to_images.py +0 -0
|
@@ -16,6 +16,14 @@ _PIPELINE_TYPE_DEFAULTS: dict[str, str] = {
|
|
|
16
16
|
"LLMPipelineSchema": "LLM",
|
|
17
17
|
}
|
|
18
18
|
|
|
19
|
+
# Pipeline schema classes whose `severity` field has a string default from JSON schema
|
|
20
|
+
# ('info') but must be an enum instance to avoid Pydantic serialization warnings
|
|
21
|
+
# ("Expected `enum` - serialized value may not be as expected").
|
|
22
|
+
_SEVERITY_ENUM_DEFAULT_CLASSES = {
|
|
23
|
+
"LLMPipelineSchema",
|
|
24
|
+
"TextClassificationPipelineSchema",
|
|
25
|
+
}
|
|
26
|
+
|
|
19
27
|
|
|
20
28
|
def _patch_pipeline_type_defaults(source: str) -> str:
|
|
21
29
|
"""Add `= 'X'` default to discriminator `type` fields on pipeline schemas."""
|
|
@@ -28,6 +36,22 @@ def _patch_pipeline_type_defaults(source: str) -> str:
|
|
|
28
36
|
return source
|
|
29
37
|
|
|
30
38
|
|
|
39
|
+
def _patch_severity_enum_defaults(source: str) -> str:
|
|
40
|
+
"""Replace string 'info' severity Field defaults with Severity.info enum instances.
|
|
41
|
+
|
|
42
|
+
datamodel-codegen emits Field('info', ...) from the JSON schema default, but
|
|
43
|
+
Pydantic v2 warns at serialization time when the stored value is a plain string
|
|
44
|
+
rather than a Severity enum member. This patch rewrites only the severity field
|
|
45
|
+
inside each affected class so the fix survives future codegen runs.
|
|
46
|
+
"""
|
|
47
|
+
for cls_name in _SEVERITY_ENUM_DEFAULT_CLASSES:
|
|
48
|
+
# Match from the class definition up through the severity Field default string.
|
|
49
|
+
pattern = rf"(class {re.escape(cls_name)}\(.*?severity: Severity \| None = Field\(\n\s+)'info'(\s*,)"
|
|
50
|
+
replacement = rf"\1Severity.info\2"
|
|
51
|
+
source = re.sub(pattern, replacement, source, flags=re.DOTALL)
|
|
52
|
+
return source
|
|
53
|
+
|
|
54
|
+
|
|
31
55
|
def run_codegen(input_file):
|
|
32
56
|
"""Generate Pydantic models from a single JSON schema file."""
|
|
33
57
|
cmd = [
|
|
@@ -74,6 +98,7 @@ def main():
|
|
|
74
98
|
detector_schema = SCHEMA_DIR / "all_detectors.json"
|
|
75
99
|
content = run_codegen(detector_schema)
|
|
76
100
|
content = _patch_pipeline_type_defaults(content)
|
|
101
|
+
content = _patch_severity_enum_defaults(content)
|
|
77
102
|
(MODEL_DIR / "generated_detectors.py").write_text(content)
|
|
78
103
|
print("Wrote src/models/generated_detectors.py")
|
|
79
104
|
|
|
@@ -156,6 +156,10 @@ class BrokenLinksDetector(BaseDetector):
|
|
|
156
156
|
if status_code in {405, 501}:
|
|
157
157
|
return self._scan_with_get(url, line, start, end, "head_not_supported")
|
|
158
158
|
|
|
159
|
+
# Some servers block HEAD (403) but serve content via GET.
|
|
160
|
+
if status_code == 403:
|
|
161
|
+
return self._scan_with_get(url, line, start, end, "head_forbidden")
|
|
162
|
+
|
|
159
163
|
if status_code >= 400:
|
|
160
164
|
return LinkScanResult(
|
|
161
165
|
url=url,
|
|
@@ -169,17 +173,10 @@ class BrokenLinksDetector(BaseDetector):
|
|
|
169
173
|
|
|
170
174
|
content_length = self._parse_content_length(head_response.headers)
|
|
171
175
|
if content_length == 0:
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
start=start,
|
|
176
|
-
end=end,
|
|
177
|
-
finding_type="empty_content",
|
|
178
|
-
confidence=0.9,
|
|
179
|
-
metadata={"status_code": status_code, "reason": "empty_head_content_length"},
|
|
180
|
-
)
|
|
176
|
+
# Content-Length: 0 on HEAD can be misleading (e.g., YouTube, TikTok).
|
|
177
|
+
# Fall back to GET to verify the page truly has no content.
|
|
178
|
+
return self._scan_with_get(url, line, start, end, "empty_head_content_length")
|
|
181
179
|
|
|
182
|
-
# Some servers omit Content-Length, so perform a lightweight GET check.
|
|
183
180
|
if content_length is None:
|
|
184
181
|
return self._scan_with_get(url, line, start, end, "missing_content_length")
|
|
185
182
|
|
|
@@ -241,7 +241,7 @@ class LLMRunner(BaseRunner):
|
|
|
241
241
|
) -> list[DetectionResult]:
|
|
242
242
|
schema = self._schema
|
|
243
243
|
threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.5
|
|
244
|
-
default_severity = schema.severity
|
|
244
|
+
default_severity = schema.severity if schema.severity is not None else Severity.info
|
|
245
245
|
extracted = self._coerce_fields(payload.get("fields"))
|
|
246
246
|
|
|
247
247
|
raw_labels = payload.get("labels")
|
{classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_text_classification.py
RENAMED
|
@@ -66,7 +66,7 @@ class TextClassificationRunner(BaseRunner):
|
|
|
66
66
|
chunk_overlap: int = getattr(schema.chunk_overlap, "root", schema.chunk_overlap) or 0
|
|
67
67
|
max_length: int | None = getattr(schema.max_length, "root", schema.max_length)
|
|
68
68
|
threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.7
|
|
69
|
-
default_severity = schema.severity
|
|
69
|
+
default_severity = schema.severity if schema.severity is not None else Severity.info
|
|
70
70
|
|
|
71
71
|
best_scores: dict[str, float] = {}
|
|
72
72
|
try:
|
|
@@ -1080,7 +1080,7 @@ class LLMPipelineSchema(BaseModel):
|
|
|
1080
1080
|
False, description='Allow more than one label per asset.'
|
|
1081
1081
|
)
|
|
1082
1082
|
severity: Severity | None = Field(
|
|
1083
|
-
'info',
|
|
1083
|
+
Severity.info,
|
|
1084
1084
|
description='Default severity when no severity_map rule matches a predicted label.',
|
|
1085
1085
|
)
|
|
1086
1086
|
severity_map: list[PipelineSeverityRule] | None = Field(
|
|
@@ -1177,7 +1177,7 @@ class TextClassificationPipelineSchema(BaseModel):
|
|
|
1177
1177
|
le=1.0,
|
|
1178
1178
|
)
|
|
1179
1179
|
severity: Severity | None = Field(
|
|
1180
|
-
'info', description='Default severity when no severity_map rule matches.'
|
|
1180
|
+
Severity.info, description='Default severity when no severity_map rule matches.'
|
|
1181
1181
|
)
|
|
1182
1182
|
severity_map: list[PipelineSeverityRule] | None = Field(
|
|
1183
1183
|
None,
|
|
@@ -42,6 +42,7 @@ class AssetType(StrEnum):
|
|
|
42
42
|
JIRA = 'JIRA'
|
|
43
43
|
SERVICEDESK = 'SERVICEDESK'
|
|
44
44
|
SQLITE = 'SQLITE'
|
|
45
|
+
NOTION = 'NOTION'
|
|
45
46
|
|
|
46
47
|
|
|
47
48
|
class SourceCategory(StrEnum):
|
|
@@ -1078,9 +1079,10 @@ class DatabricksAuthMode(StrEnum):
|
|
|
1078
1079
|
|
|
1079
1080
|
PAT_TOKEN = 'PAT_TOKEN'
|
|
1080
1081
|
SERVICE_PRINCIPAL = 'SERVICE_PRINCIPAL'
|
|
1082
|
+
AZURE_SERVICE_PRINCIPAL = 'AZURE_SERVICE_PRINCIPAL'
|
|
1081
1083
|
|
|
1082
1084
|
|
|
1083
|
-
class DatabricksRequiredPat(BaseModel):
|
|
1085
|
+
class PersonalAccessToken(BaseModel):
|
|
1084
1086
|
model_config = ConfigDict(
|
|
1085
1087
|
extra='forbid',
|
|
1086
1088
|
)
|
|
@@ -1094,7 +1096,7 @@ class DatabricksRequiredPat(BaseModel):
|
|
|
1094
1096
|
)
|
|
1095
1097
|
|
|
1096
1098
|
|
|
1097
|
-
class DatabricksRequiredServicePrincipal(BaseModel):
|
|
1099
|
+
class ServicePrincipalOAuthM2M(BaseModel):
|
|
1098
1100
|
model_config = ConfigDict(
|
|
1099
1101
|
extra='forbid',
|
|
1100
1102
|
)
|
|
@@ -1109,6 +1111,24 @@ class DatabricksRequiredServicePrincipal(BaseModel):
|
|
|
1109
1111
|
client_id: str = Field(..., description='Databricks service principal client ID')
|
|
1110
1112
|
|
|
1111
1113
|
|
|
1114
|
+
class AzureServicePrincipal(BaseModel):
|
|
1115
|
+
model_config = ConfigDict(
|
|
1116
|
+
extra='forbid',
|
|
1117
|
+
)
|
|
1118
|
+
auth_mode: Literal['AZURE_SERVICE_PRINCIPAL']
|
|
1119
|
+
workspace_url: AnyUrl = Field(
|
|
1120
|
+
...,
|
|
1121
|
+
description='Azure Databricks workspace URL (for example, https://adb-1234567890123456.7.azuredatabricks.net)',
|
|
1122
|
+
)
|
|
1123
|
+
warehouse_id: str = Field(
|
|
1124
|
+
..., description='Databricks SQL warehouse ID used for sampling queries'
|
|
1125
|
+
)
|
|
1126
|
+
client_id: str = Field(
|
|
1127
|
+
..., description='Azure AD application (client) ID for the service principal'
|
|
1128
|
+
)
|
|
1129
|
+
tenant_id: str = Field(..., description='Azure AD tenant ID')
|
|
1130
|
+
|
|
1131
|
+
|
|
1112
1132
|
class DatabricksMaskedPat(BaseModel):
|
|
1113
1133
|
model_config = ConfigDict(
|
|
1114
1134
|
extra='forbid',
|
|
@@ -1125,6 +1145,15 @@ class DatabricksMaskedServicePrincipal(BaseModel):
|
|
|
1125
1145
|
)
|
|
1126
1146
|
|
|
1127
1147
|
|
|
1148
|
+
class DatabricksMaskedAzureServicePrincipal(BaseModel):
|
|
1149
|
+
model_config = ConfigDict(
|
|
1150
|
+
extra='forbid',
|
|
1151
|
+
)
|
|
1152
|
+
client_secret: str = Field(
|
|
1153
|
+
..., description='Azure AD client secret for the service principal'
|
|
1154
|
+
)
|
|
1155
|
+
|
|
1156
|
+
|
|
1128
1157
|
class DatabricksOptionalConnection(BaseModel):
|
|
1129
1158
|
"""
|
|
1130
1159
|
Databricks API and SQL statement execution tuning options.
|
|
@@ -1842,6 +1871,7 @@ class Type(StrEnum):
|
|
|
1842
1871
|
JIRA = 'JIRA'
|
|
1843
1872
|
SERVICEDESK = 'SERVICEDESK'
|
|
1844
1873
|
SQLITE = 'SQLITE'
|
|
1874
|
+
NOTION = 'NOTION'
|
|
1845
1875
|
|
|
1846
1876
|
|
|
1847
1877
|
class SlackInput(CoreInput):
|
|
@@ -2020,8 +2050,8 @@ class DatabricksInput(CoreInput):
|
|
|
2020
2050
|
type: Literal['DATABRICKS'] = Field(
|
|
2021
2051
|
'DATABRICKS', description='Type of the asset or source'
|
|
2022
2052
|
)
|
|
2023
|
-
required: DatabricksRequiredPat | DatabricksRequiredServicePrincipal = Field(
|
|
2024
|
-
..., title='DatabricksRequired'
|
|
2053
|
+
required: PersonalAccessToken | ServicePrincipalOAuthM2M | AzureServicePrincipal = (
|
|
2054
|
+
Field(..., title='DatabricksRequired')
|
|
2025
2055
|
)
|
|
2026
2056
|
masked: DatabricksMaskedPat | DatabricksMaskedServicePrincipal = Field(
|
|
2027
2057
|
..., title='DatabricksMasked'
|
|
@@ -2629,6 +2659,7 @@ class Type17(StrEnum):
|
|
|
2629
2659
|
JIRA = 'JIRA'
|
|
2630
2660
|
SERVICEDESK = 'SERVICEDESK'
|
|
2631
2661
|
SQLITE = 'SQLITE'
|
|
2662
|
+
NOTION = 'NOTION'
|
|
2632
2663
|
|
|
2633
2664
|
|
|
2634
2665
|
class ConfluenceInput(CoreInput):
|
|
@@ -2737,6 +2768,135 @@ class SQLiteInput(CoreInput):
|
|
|
2737
2768
|
resources: ResourceOverrides | None = None
|
|
2738
2769
|
|
|
2739
2770
|
|
|
2771
|
+
class NotionRequired(BaseModel):
|
|
2772
|
+
"""
|
|
2773
|
+
Notion has no required connection fields; the integration token lives in the masked section.
|
|
2774
|
+
"""
|
|
2775
|
+
|
|
2776
|
+
model_config = ConfigDict(
|
|
2777
|
+
extra='forbid',
|
|
2778
|
+
)
|
|
2779
|
+
|
|
2780
|
+
|
|
2781
|
+
class NotionMasked(BaseModel):
|
|
2782
|
+
model_config = ConfigDict(
|
|
2783
|
+
extra='forbid',
|
|
2784
|
+
)
|
|
2785
|
+
notion_token: str = Field(
|
|
2786
|
+
...,
|
|
2787
|
+
description='Notion API token used as a Bearer credential. Accepts an internal integration secret (ntn_...) or an OAuth public-integration access token.',
|
|
2788
|
+
)
|
|
2789
|
+
|
|
2790
|
+
|
|
2791
|
+
class NotionOptionalConnection(BaseModel):
|
|
2792
|
+
"""
|
|
2793
|
+
HTTP, version, and retry settings for Notion API calls.
|
|
2794
|
+
"""
|
|
2795
|
+
|
|
2796
|
+
model_config = ConfigDict(
|
|
2797
|
+
extra='forbid',
|
|
2798
|
+
)
|
|
2799
|
+
notion_version: str | None = Field(
|
|
2800
|
+
'2025-09-03',
|
|
2801
|
+
description='Notion-Version header sent with every request. Defaults to the data-sources API version.',
|
|
2802
|
+
)
|
|
2803
|
+
request_timeout_seconds: float | None = Field(
|
|
2804
|
+
30, description='HTTP request timeout for Notion API calls', ge=1.0
|
|
2805
|
+
)
|
|
2806
|
+
rate_limit_delay_seconds: float | None = Field(
|
|
2807
|
+
0,
|
|
2808
|
+
description='Additional delay between API requests to reduce rate-limit pressure',
|
|
2809
|
+
ge=0.0,
|
|
2810
|
+
)
|
|
2811
|
+
max_retries: int | None = Field(
|
|
2812
|
+
3,
|
|
2813
|
+
description='Maximum retry attempts for transient API failures and rate limits',
|
|
2814
|
+
ge=0,
|
|
2815
|
+
le=10,
|
|
2816
|
+
)
|
|
2817
|
+
|
|
2818
|
+
|
|
2819
|
+
class NotionOptionalScope(BaseModel):
|
|
2820
|
+
"""
|
|
2821
|
+
Optional Notion scope filters. When omitted, all content shared with the integration is eligible for sampling.
|
|
2822
|
+
"""
|
|
2823
|
+
|
|
2824
|
+
model_config = ConfigDict(
|
|
2825
|
+
extra='forbid',
|
|
2826
|
+
)
|
|
2827
|
+
page_ids: list[str] | None = Field(
|
|
2828
|
+
None,
|
|
2829
|
+
description='Restrict extraction to specific page IDs (up to 250)',
|
|
2830
|
+
max_length=250,
|
|
2831
|
+
)
|
|
2832
|
+
data_source_ids: list[str] | None = Field(
|
|
2833
|
+
None,
|
|
2834
|
+
description='Restrict extraction to specific data source IDs (up to 250)',
|
|
2835
|
+
max_length=250,
|
|
2836
|
+
)
|
|
2837
|
+
search_query: str | None = Field(
|
|
2838
|
+
None,
|
|
2839
|
+
description='Optional full-text query passed to the Notion search endpoint to narrow discovery',
|
|
2840
|
+
min_length=1,
|
|
2841
|
+
)
|
|
2842
|
+
|
|
2843
|
+
|
|
2844
|
+
class NotionOptionalContent(BaseModel):
|
|
2845
|
+
"""
|
|
2846
|
+
Notion content extraction controls.
|
|
2847
|
+
"""
|
|
2848
|
+
|
|
2849
|
+
model_config = ConfigDict(
|
|
2850
|
+
extra='forbid',
|
|
2851
|
+
)
|
|
2852
|
+
include_comments: bool | None = Field(
|
|
2853
|
+
True,
|
|
2854
|
+
description='Include page and block comments and aggregate them into a per-page comments asset',
|
|
2855
|
+
)
|
|
2856
|
+
include_files: bool | None = Field(
|
|
2857
|
+
True,
|
|
2858
|
+
description='Materialize files from file/image/pdf/video blocks, file properties, and page icon/cover as related assets',
|
|
2859
|
+
)
|
|
2860
|
+
include_linked_pages: bool | None = Field(
|
|
2861
|
+
True,
|
|
2862
|
+
description='Wire parent, relation, and mention references between pages into the asset links graph',
|
|
2863
|
+
)
|
|
2864
|
+
include_data_sources: bool | None = Field(
|
|
2865
|
+
True,
|
|
2866
|
+
description='Emit Notion data sources (databases) as assets with their schema and link their row pages',
|
|
2867
|
+
)
|
|
2868
|
+
file_max_bytes: int | None = Field(
|
|
2869
|
+
5242880,
|
|
2870
|
+
description='Maximum bytes downloaded per file for MIME inference and text extraction',
|
|
2871
|
+
ge=1024,
|
|
2872
|
+
)
|
|
2873
|
+
|
|
2874
|
+
|
|
2875
|
+
class NotionOptional(BaseModel):
|
|
2876
|
+
model_config = ConfigDict(
|
|
2877
|
+
extra='forbid',
|
|
2878
|
+
)
|
|
2879
|
+
connection: NotionOptionalConnection | None = None
|
|
2880
|
+
scope: NotionOptionalScope | None = None
|
|
2881
|
+
content: NotionOptionalContent | None = None
|
|
2882
|
+
|
|
2883
|
+
|
|
2884
|
+
class NotionInput(CoreInput):
|
|
2885
|
+
type: Literal['NOTION'] = Field('NOTION', description='Type of the asset or source')
|
|
2886
|
+
required: NotionRequired
|
|
2887
|
+
masked: NotionMasked
|
|
2888
|
+
optional: NotionOptional | None = None
|
|
2889
|
+
detectors: list[Detector] | None = Field(
|
|
2890
|
+
None, description='Detectors to run on ingested content'
|
|
2891
|
+
)
|
|
2892
|
+
custom_detectors: list[CustomDetectorSelection] | None = Field(
|
|
2893
|
+
None,
|
|
2894
|
+
description='Reusable custom detector IDs selected from the custom detector catalog.',
|
|
2895
|
+
)
|
|
2896
|
+
sampling: SamplingConfig
|
|
2897
|
+
resources: ResourceOverrides | None = None
|
|
2898
|
+
|
|
2899
|
+
|
|
2740
2900
|
class SourceInput(
|
|
2741
2901
|
RootModel[
|
|
2742
2902
|
SlackInput
|
|
@@ -2759,6 +2919,7 @@ class SourceInput(
|
|
|
2759
2919
|
| JiraInput
|
|
2760
2920
|
| ServiceDeskInput
|
|
2761
2921
|
| SQLiteInput
|
|
2922
|
+
| NotionInput
|
|
2762
2923
|
]
|
|
2763
2924
|
):
|
|
2764
2925
|
root: (
|
|
@@ -2782,6 +2943,7 @@ class SourceInput(
|
|
|
2782
2943
|
| JiraInput
|
|
2783
2944
|
| ServiceDeskInput
|
|
2784
2945
|
| SQLiteInput
|
|
2946
|
+
| NotionInput
|
|
2785
2947
|
) = Field(
|
|
2786
2948
|
...,
|
|
2787
2949
|
description='Merged configuration schema with all source types and common definitions',
|
{classifyre_cli-0.4.11 → classifyre_cli-0.4.13}/src/models/generated_single_asset_scan_results.py
RENAMED
|
@@ -210,7 +210,14 @@ class SingleAssetScanResults(BaseModel):
|
|
|
210
210
|
..., description='Linked asset hashes referenced by this asset', title='Links'
|
|
211
211
|
)
|
|
212
212
|
asset_type: AssetType = Field(
|
|
213
|
-
...,
|
|
213
|
+
...,
|
|
214
|
+
description='Canonical asset content type (used for detector routing)',
|
|
215
|
+
title='Asset Type',
|
|
216
|
+
)
|
|
217
|
+
asset_kind: str | None = Field(
|
|
218
|
+
None,
|
|
219
|
+
description='Catalog asset kind discriminator (file, image, page, comment, table, ...). Persisted as the asset type for display/filtering.',
|
|
220
|
+
title='Asset Kind',
|
|
214
221
|
)
|
|
215
222
|
source_id: str | None = Field(
|
|
216
223
|
None,
|
|
@@ -238,3 +245,8 @@ class SingleAssetScanResults(BaseModel):
|
|
|
238
245
|
description='Statistics about the detector scan for this asset',
|
|
239
246
|
title='Scan Stats',
|
|
240
247
|
)
|
|
248
|
+
metadata: dict[str, Any] | None = Field(
|
|
249
|
+
None,
|
|
250
|
+
description='Source-specific asset metadata using normalized keys (size_bytes, row_count, etc.) where applicable',
|
|
251
|
+
title='Metadata',
|
|
252
|
+
)
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""Asset-metadata contract: the single source of truth for what each source
|
|
2
|
+
extracts is the ``x-asset-metadata`` catalog embedded in
|
|
3
|
+
``packages/schemas/src/schemas/all_input_sources.json``.
|
|
4
|
+
|
|
5
|
+
This module loads/resolves that catalog and validates metadata dicts against it.
|
|
6
|
+
Validation is strict (raises) under pytest or when ``CLASSIFYRE_STRICT_METADATA``
|
|
7
|
+
is set, and otherwise logs a warning during real ingestion — so drift between a
|
|
8
|
+
source's emitted keys and the declared catalog is caught either in CI or at runtime.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
import os
|
|
15
|
+
from functools import cache
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
from ..utils.validation import _load_schema
|
|
19
|
+
|
|
20
|
+
# Module-level logger; outside strict mode, contract problems are reported
# through it as warnings.
logger = logging.getLogger(__name__)

# Top-level key of the merged schema under which the metadata catalog lives.
_CATALOG_KEY = "x-asset-metadata"

# One resolved catalog field, as produced by ``resolve_fields``.
ResolvedField = dict[str, Any]  # {name, type, description, required}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class AssetMetadataContractError(AssertionError):
    """Signal that metadata, or the catalog itself, breaks the declared contract.

    Raised when the ``x-asset-metadata`` catalog or one of its entries is
    missing, and — in strict mode — when an emitted metadata dict has
    undeclared keys or lacks required ones.
    """
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _strict_mode() -> bool:
    """Return whether contract violations should raise instead of warn.

    Strict mode is on when ``PYTEST_CURRENT_TEST`` is non-empty (i.e. under
    pytest) or when ``CLASSIFYRE_STRICT_METADATA`` holds an enabling value.
    An empty value and the explicit "off" spellings ``0``, ``false``, ``no``
    and ``off`` (case-insensitive, surrounding whitespace ignored) leave it
    disabled, so ``CLASSIFYRE_STRICT_METADATA=0`` does not switch strict mode
    on by accident.
    """
    if os.environ.get("PYTEST_CURRENT_TEST"):
        return True
    flag = os.environ.get("CLASSIFYRE_STRICT_METADATA", "").strip().lower()
    return flag not in {"", "0", "false", "no", "off"}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@cache
def load_catalog() -> dict[str, Any]:
    """Return the ``x-asset-metadata`` section of the merged input schema.

    The result is memoised by ``functools.cache``, so the schema is loaded
    only on the first successful call.

    Raises:
        AssetMetadataContractError: the schema has no dict under the
            catalog key.
    """
    merged_schema = _load_schema("all_input_sources.json")
    section = merged_schema.get(_CATALOG_KEY)
    if isinstance(section, dict):
        return section
    raise AssetMetadataContractError(
        f"Missing '{_CATALOG_KEY}' catalog in all_input_sources.json"
    )
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _source_key(source_type: str) -> str:
    """Translate a source identifier into the key used by the catalog."""
    # Catalog keys mirror the AssetType enum (uppercased source_type).
    catalog_key = source_type.upper()
    return catalog_key
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def describe_type(prop_schema: dict[str, Any]) -> str:
    """Render a JSON-Schema property type as a short display string.

    Args:
        prop_schema: A JSON-Schema property definition.

    Returns:
        ``"<item>[]"`` for arrays (``"string[]"`` when the item type is not
        declared), the type name for scalar types, and ``"string"`` when no
        type is declared. A list-valued ``type`` — a JSON-Schema union such
        as ``["string", "null"]`` — is rendered as ``"string | null"`` rather
        than as a Python list repr; a union used as an array item type is
        parenthesised, e.g. ``"(string | integer)[]"``.
    """

    def label(raw: Any) -> str:
        # JSON Schema allows ``type`` to be either one name or a list of names.
        if isinstance(raw, (list, tuple)):
            names = [str(name) for name in raw if name]
            return " | ".join(names) if names else "string"
        return str(raw) if raw else "string"

    json_type = prop_schema.get("type")
    if json_type == "array":
        items = prop_schema.get("items", {})
        # ``items`` may be a boolean schema or absent; fall back to "string".
        item_label = label(items.get("type")) if isinstance(items, dict) else "string"
        if " | " in item_label:
            # Keep the "[]" suffix unambiguous for union item types.
            item_label = f"({item_label})"
        return f"{item_label}[]"
    return label(json_type)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def resolve_fields(source_type: str, asset_kind: str) -> list[ResolvedField]:
    """Return the fields the catalog declares for one (source, asset kind).

    A catalog entry is layered: first every reusable ``contentTypes`` entry
    named in its ``use`` list, in order, then the entry's own ``properties``
    and ``required``. Later layers override earlier ones for a property of
    the same name; the required names are the union over all layers. A
    ``use`` name missing from ``contentTypes`` contributes nothing.

    Returns:
        One ``{name, type, description, required}`` dict per merged property.

    Raises:
        AssetMetadataContractError: the catalog has no entry for this
            source / asset kind.
    """
    catalog = load_catalog()
    per_source = catalog.get("sources", {}).get(_source_key(source_type))
    if not (isinstance(per_source, dict) and asset_kind in per_source):
        raise AssetMetadataContractError(
            f"No catalog entry for source '{source_type}' asset kind '{asset_kind}'"
        )

    entry = per_source[asset_kind]
    reusable = catalog.get("contentTypes", {})

    # Shared content types first, the entry itself last so it wins on clashes.
    layers = [reusable.get(type_name, {}) for type_name in entry.get("use", [])]
    layers.append(entry)

    merged_props: dict[str, dict[str, Any]] = {}
    required_names: set[str] = set()
    for layer in layers:
        merged_props.update(layer.get("properties", {}))
        required_names.update(layer.get("required", []))

    resolved: list[ResolvedField] = []
    for field_name, field_schema in merged_props.items():
        resolved.append(
            {
                "name": field_name,
                "type": describe_type(field_schema),
                "description": field_schema.get("description", ""),
                "required": field_name in required_names,
            }
        )
    return resolved
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def validate_metadata(
    source_type: str,
    asset_kind: str,
    data: dict[str, Any],
) -> dict[str, Any]:
    """Check an emitted metadata dict against the catalog and hand it back.

    Two things are checked: every key in ``data`` is declared for this
    source / asset kind, and every required field is present with a value
    other than ``None``. ``data`` is returned unchanged.

    Raises:
        AssetMetadataContractError: only in strict mode, when the catalog
            entry cannot be resolved or either check fails. Outside strict
            mode the problem is logged as a warning instead.
    """
    try:
        fields = resolve_fields(source_type, asset_kind)
    except AssetMetadataContractError as exc:
        if not _strict_mode():
            logger.warning("Asset metadata contract: %s", exc)
            return data
        raise

    declared_names: set[str] = set()
    required_names: set[str] = set()
    for spec in fields:
        declared_names.add(spec["name"])
        if spec["required"]:
            required_names.add(spec["name"])

    undeclared = sorted(key for key in data if key not in declared_names)
    # A required key counts as missing when it is absent or mapped to None.
    missing_required = sorted(
        name for name in required_names if data.get(name) is None
    )

    if not (undeclared or missing_required):
        return data

    message = (
        f"[{source_type}/{asset_kind}] "
        f"undeclared={undeclared} missing_required={missing_required}"
    )
    if _strict_mode():
        raise AssetMetadataContractError(message)
    logger.warning("Asset metadata contract drift: %s", message)
    return data
|
|
@@ -17,6 +17,11 @@ class BaseSource(ABC):
|
|
|
17
17
|
Abstract base class for all metadata extraction sources.
|
|
18
18
|
"""
|
|
19
19
|
|
|
20
|
+
# Stable source identifier, overridden by each concrete source (e.g.
|
|
21
|
+
# "postgresql", "wordpress"). Uppercased it maps to the AssetType enum and
|
|
22
|
+
# the x-asset-metadata catalog key.
|
|
23
|
+
source_type: str = ""
|
|
24
|
+
|
|
20
25
|
# Default batch size for streaming asset results
|
|
21
26
|
BATCH_SIZE: int = 50
|
|
22
27
|
HAS_SUCCESSFUL_RUN_ENV = "CLASSIFYRE_SOURCE_HAS_SUCCESSFUL_RUN"
|
|
@@ -130,6 +135,22 @@ class BaseSource(ABC):
|
|
|
130
135
|
"""
|
|
131
136
|
return calculate_checksum(data)
|
|
132
137
|
|
|
138
|
+
def metadata_fields(self, asset_kind: str, data: dict[str, Any]) -> dict[str, Any]:
    """Return the ``asset_kind`` and ``metadata`` kwargs for SingleAssetScanResults.

    Meant to be spread into the constructor call, e.g.
    ``**self.metadata_fields("page", {...})``. ``data`` is passed through
    ``validate_metadata`` together with ``self.source_type`` and
    ``asset_kind``; that call raises on a contract violation in strict mode
    (pytest / ``CLASSIFYRE_STRICT_METADATA``) and otherwise only logs a
    warning.
    """
    from .asset_metadata import validate_metadata

    checked = validate_metadata(self.source_type, asset_kind, data)
    return {"asset_kind": asset_kind, "metadata": checked}
|
|
153
|
+
|
|
133
154
|
@abstractmethod
|
|
134
155
|
def abort(self) -> None:
|
|
135
156
|
"""
|
|
@@ -313,6 +313,20 @@ class ConfluenceSource(BaseSource):
|
|
|
313
313
|
"status": page.get("status"),
|
|
314
314
|
"links_count": len(related_hashes),
|
|
315
315
|
}
|
|
316
|
+
asset_metadata: dict[str, Any] = {
|
|
317
|
+
"page_id": page_id,
|
|
318
|
+
"title": title,
|
|
319
|
+
"links_count": len(related_hashes),
|
|
320
|
+
}
|
|
321
|
+
space_id = page.get("spaceId")
|
|
322
|
+
if space_id is not None:
|
|
323
|
+
asset_metadata["space_key"] = str(space_id)
|
|
324
|
+
status = page.get("status")
|
|
325
|
+
if isinstance(status, str) and status:
|
|
326
|
+
asset_metadata["status"] = status
|
|
327
|
+
author_id = page.get("authorId")
|
|
328
|
+
if isinstance(author_id, str) and author_id:
|
|
329
|
+
asset_metadata["author"] = author_id
|
|
316
330
|
page_asset = SingleAssetScanResults(
|
|
317
331
|
hash=page_hash,
|
|
318
332
|
checksum=self.calculate_checksum(page_metadata),
|
|
@@ -330,6 +344,7 @@ class ConfluenceSource(BaseSource):
|
|
|
330
344
|
)
|
|
331
345
|
),
|
|
332
346
|
runner_id=self.runner_id,
|
|
347
|
+
**self.metadata_fields("page", asset_metadata),
|
|
333
348
|
)
|
|
334
349
|
|
|
335
350
|
return [page_asset, *related_assets]
|
|
@@ -368,6 +383,15 @@ class ConfluenceSource(BaseSource):
|
|
|
368
383
|
if download_url:
|
|
369
384
|
self._attachment_download_url_by_hash[attachment_hash] = download_url
|
|
370
385
|
|
|
386
|
+
attachment_metadata: dict[str, Any] = {
|
|
387
|
+
"title": attachment_name,
|
|
388
|
+
"page_hash": page_hash,
|
|
389
|
+
}
|
|
390
|
+
if mime:
|
|
391
|
+
attachment_metadata["mime_type"] = mime
|
|
392
|
+
file_size = attachment.get("fileSize")
|
|
393
|
+
if isinstance(file_size, int):
|
|
394
|
+
attachment_metadata["size_bytes"] = file_size
|
|
371
395
|
assets.append(
|
|
372
396
|
SingleAssetScanResults(
|
|
373
397
|
hash=attachment_hash,
|
|
@@ -380,6 +404,7 @@ class ConfluenceSource(BaseSource):
|
|
|
380
404
|
created_at=now,
|
|
381
405
|
updated_at=now,
|
|
382
406
|
runner_id=self.runner_id,
|
|
407
|
+
**self.metadata_fields("attachment", attachment_metadata),
|
|
383
408
|
)
|
|
384
409
|
)
|
|
385
410
|
hashes.append(attachment_hash)
|
|
@@ -453,6 +478,10 @@ class ConfluenceSource(BaseSource):
|
|
|
453
478
|
created_at=now,
|
|
454
479
|
updated_at=now,
|
|
455
480
|
runner_id=self.runner_id,
|
|
481
|
+
**self.metadata_fields(
|
|
482
|
+
"comments",
|
|
483
|
+
{"page_id": page_id, "comments_count": len(comment_items)},
|
|
484
|
+
),
|
|
456
485
|
)
|
|
457
486
|
return comments_asset, [comments_hash]
|
|
458
487
|
|
|
@@ -578,6 +607,7 @@ class ConfluenceSource(BaseSource):
|
|
|
578
607
|
created_at=now,
|
|
579
608
|
updated_at=now,
|
|
580
609
|
runner_id=self.runner_id,
|
|
610
|
+
**self.metadata_fields("linked_file", {"referenced_by": page_hash}),
|
|
581
611
|
)
|
|
582
612
|
|
|
583
613
|
def _display_name_from_url(self, url: str) -> str:
|