classifyre-cli 0.4.12__tar.gz → 0.4.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/.turbo/turbo-build.log +1 -1
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/PKG-INFO +1 -1
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/package.json +1 -1
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/pyproject.toml +1 -1
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/scripts/generate_models.py +25 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/broken_links/detector.py +7 -10
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_llm.py +1 -1
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_text_classification.py +1 -1
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/models/generated_detectors.py +2 -2
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/models/generated_input.py +134 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/models/generated_single_asset_scan_results.py +13 -1
- classifyre_cli-0.4.13/src/sources/asset_metadata.py +138 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/base.py +21 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/confluence/source.py +30 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/databricks/source.py +2 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/jira/source.py +65 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/mongodb/source.py +18 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/mysql/source.py +19 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/neo4j/source.py +27 -0
- classifyre_cli-0.4.13/src/sources/notion/__init__.py +3 -0
- classifyre_cli-0.4.13/src/sources/notion/client.py +223 -0
- classifyre_cli-0.4.13/src/sources/notion/source.py +987 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/object_storage/base.py +23 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/postgresql/source.py +27 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/powerbi/source.py +27 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/servicedesk/source.py +29 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/slack/source.py +12 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/sqlite/source.py +10 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/tableau/source.py +19 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/tabular_base.py +106 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/wordpress/source.py +25 -0
- classifyre_cli-0.4.13/src/utils/file_metadata.py +236 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/broken_links/test_broken_links_detector.py +87 -0
- classifyre_cli-0.4.13/tests/test_assets_metadata_catalog.py +73 -0
- classifyre_cli-0.4.13/tests/test_notion_source.py +227 -0
- classifyre_cli-0.4.13/tests/utils/test_file_metadata.py +79 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/uv.lock +182 -141
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/.gitignore +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/.python-version +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/README.md +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/main.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/base.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/broken_links/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/config.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/custom/detector.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/custom/extractor.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/custom/runners/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_base.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_factory.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_feature_extraction.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_gliner2.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_image_classification.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_object_detection.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_regex.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/custom/trainer.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/dependencies.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/pii/detector.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/secrets/detector.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/threat/code_security_detector.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/threat/yara_detector.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/main.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/outputs/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/outputs/base.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/outputs/console.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/outputs/factory.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/outputs/file.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/outputs/rest.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/pipeline/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/pipeline/content_provider.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/pipeline/detector_pipeline.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/pipeline/parsed_content_provider.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/pipeline/worker_pool.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sandbox/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sandbox/runner.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/atlassian_common.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/azure_blob_storage/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/azure_blob_storage/source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/confluence/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/databricks/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/dependencies.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/google_cloud_storage/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/google_cloud_storage/source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/hive/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/hive/source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/jira/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/mongodb/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/mssql/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/mssql/source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/mysql/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/neo4j/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/oracle/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/oracle/source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/postgresql/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/powerbi/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/s3_compatible_storage/README.md +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/s3_compatible_storage/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/s3_compatible_storage/source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/servicedesk/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/slack/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/snowflake/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/snowflake/source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/sqlite/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/tableau/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/tabular_utils.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/sources/wordpress/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/telemetry.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/utils/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/utils/content_extraction.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/utils/embedded_images.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/utils/file_parser.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/utils/file_to_images.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/utils/hashing.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/utils/uv_sync.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/utils/validation.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/conftest.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/conftest.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/content/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/custom/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/custom/conftest.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/custom/test_invoice_extraction.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/custom/test_llm_runner.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/custom/test_pipeline_integration.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/custom/test_regex_runner.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/custom/test_transformer_runners.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/pii/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/pii/conftest.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/pii/sample_invoice.pdf +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/pii/test_pii_detector.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/pii/test_pii_detector_extended.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/secrets/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/secrets/test_secrets_detector.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/secrets/test_secrets_detector_extended.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/test_base_detector.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/test_custom_detector_examples_runtime.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/test_detector_catalog_commercial.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/test_detector_pipeline_types.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/test_detector_schema_examples.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/test_detector_types.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/test_phase2_detectors.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/test_registry.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/threat/__init__.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/threat/test_code_security_detector.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/detectors/threat/test_yara_detector.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/integration/test_wordpress_broken_links_detector.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/integration/test_wordpress_links_assets.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/pipeline/test_detector_pipeline.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/pipeline/test_worker_pool.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_azure_blob_storage_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_base_source_attachment.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_base_source_sampling.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_confluence_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_custom_extractor.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_databricks_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_google_cloud_storage_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_hashing.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_hive_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_jira_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_mongodb_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_mssql_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_mysql_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_neo4j_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_oracle_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_outputs.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_postgresql_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_powerbi_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_recipe_normalizer.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_s3_compatible_storage_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_sandbox_runner.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_servicedesk_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_slack_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_snowflake_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_source_dependency_groups.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_sqlite_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_tableau_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_tabular_utils.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/test_wordpress_source.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/utils/test_content_extraction.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/utils/test_embedded_images.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/utils/test_file_parser.py +0 -0
- {classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/tests/utils/test_file_to_images.py +0 -0
|
@@ -16,6 +16,14 @@ _PIPELINE_TYPE_DEFAULTS: dict[str, str] = {
|
|
|
16
16
|
"LLMPipelineSchema": "LLM",
|
|
17
17
|
}
|
|
18
18
|
|
|
19
|
+
# Pipeline schema classes whose `severity` field has a string default from JSON schema
|
|
20
|
+
# ('info') but must be an enum instance to avoid Pydantic serialization warnings
|
|
21
|
+
# ("Expected `enum` - serialized value may not be as expected").
|
|
22
|
+
_SEVERITY_ENUM_DEFAULT_CLASSES = {
|
|
23
|
+
"LLMPipelineSchema",
|
|
24
|
+
"TextClassificationPipelineSchema",
|
|
25
|
+
}
|
|
26
|
+
|
|
19
27
|
|
|
20
28
|
def _patch_pipeline_type_defaults(source: str) -> str:
|
|
21
29
|
"""Add `= 'X'` default to discriminator `type` fields on pipeline schemas."""
|
|
@@ -28,6 +36,22 @@ def _patch_pipeline_type_defaults(source: str) -> str:
|
|
|
28
36
|
return source
|
|
29
37
|
|
|
30
38
|
|
|
39
|
+
def _patch_severity_enum_defaults(source: str) -> str:
|
|
40
|
+
"""Replace string 'info' severity Field defaults with Severity.info enum instances.
|
|
41
|
+
|
|
42
|
+
datamodel-codegen emits Field('info', ...) from the JSON schema default, but
|
|
43
|
+
Pydantic v2 warns at serialization time when the stored value is a plain string
|
|
44
|
+
rather than a Severity enum member. This patch rewrites only the severity field
|
|
45
|
+
inside each affected class so the fix survives future codegen runs.
|
|
46
|
+
"""
|
|
47
|
+
for cls_name in _SEVERITY_ENUM_DEFAULT_CLASSES:
|
|
48
|
+
# Match from the class definition up through the severity Field default string.
|
|
49
|
+
pattern = rf"(class {re.escape(cls_name)}\(.*?severity: Severity \| None = Field\(\n\s+)'info'(\s*,)"
|
|
50
|
+
replacement = rf"\1Severity.info\2"
|
|
51
|
+
source = re.sub(pattern, replacement, source, flags=re.DOTALL)
|
|
52
|
+
return source
|
|
53
|
+
|
|
54
|
+
|
|
31
55
|
def run_codegen(input_file):
|
|
32
56
|
"""Generate Pydantic models from a single JSON schema file."""
|
|
33
57
|
cmd = [
|
|
@@ -74,6 +98,7 @@ def main():
|
|
|
74
98
|
detector_schema = SCHEMA_DIR / "all_detectors.json"
|
|
75
99
|
content = run_codegen(detector_schema)
|
|
76
100
|
content = _patch_pipeline_type_defaults(content)
|
|
101
|
+
content = _patch_severity_enum_defaults(content)
|
|
77
102
|
(MODEL_DIR / "generated_detectors.py").write_text(content)
|
|
78
103
|
print("Wrote src/models/generated_detectors.py")
|
|
79
104
|
|
|
@@ -156,6 +156,10 @@ class BrokenLinksDetector(BaseDetector):
|
|
|
156
156
|
if status_code in {405, 501}:
|
|
157
157
|
return self._scan_with_get(url, line, start, end, "head_not_supported")
|
|
158
158
|
|
|
159
|
+
# Some servers block HEAD (403) but serve content via GET.
|
|
160
|
+
if status_code == 403:
|
|
161
|
+
return self._scan_with_get(url, line, start, end, "head_forbidden")
|
|
162
|
+
|
|
159
163
|
if status_code >= 400:
|
|
160
164
|
return LinkScanResult(
|
|
161
165
|
url=url,
|
|
@@ -169,17 +173,10 @@ class BrokenLinksDetector(BaseDetector):
|
|
|
169
173
|
|
|
170
174
|
content_length = self._parse_content_length(head_response.headers)
|
|
171
175
|
if content_length == 0:
|
|
172
|
-
return LinkScanResult(
|
|
173
|
-
url=url,
|
|
174
|
-
line=line,
|
|
175
|
-
start=start,
|
|
176
|
-
end=end,
|
|
177
|
-
finding_type="empty_content",
|
|
178
|
-
confidence=0.9,
|
|
179
|
-
metadata={"status_code": status_code, "reason": "empty_head_content_length"},
|
|
180
|
-
)
|
|
176
|
+
# Content-Length: 0 on HEAD can be misleading (e.g., YouTube, TikTok).
|
|
177
|
+
# Fall back to GET to verify the page truly has no content.
|
|
178
|
+
return self._scan_with_get(url, line, start, end, "empty_head_content_length")
|
|
181
179
|
|
|
182
|
-
# Some servers omit Content-Length, so perform a lightweight GET check.
|
|
183
180
|
if content_length is None:
|
|
184
181
|
return self._scan_with_get(url, line, start, end, "missing_content_length")
|
|
185
182
|
|
|
@@ -241,7 +241,7 @@ class LLMRunner(BaseRunner):
|
|
|
241
241
|
) -> list[DetectionResult]:
|
|
242
242
|
schema = self._schema
|
|
243
243
|
threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.5
|
|
244
|
-
default_severity = schema.severity
|
|
244
|
+
default_severity = schema.severity if schema.severity is not None else Severity.info
|
|
245
245
|
extracted = self._coerce_fields(payload.get("fields"))
|
|
246
246
|
|
|
247
247
|
raw_labels = payload.get("labels")
|
{classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/detectors/custom/runners/_text_classification.py
RENAMED
|
@@ -66,7 +66,7 @@ class TextClassificationRunner(BaseRunner):
|
|
|
66
66
|
chunk_overlap: int = getattr(schema.chunk_overlap, "root", schema.chunk_overlap) or 0
|
|
67
67
|
max_length: int | None = getattr(schema.max_length, "root", schema.max_length)
|
|
68
68
|
threshold = schema.confidence_threshold if schema.confidence_threshold is not None else 0.7
|
|
69
|
-
default_severity = schema.severity
|
|
69
|
+
default_severity = schema.severity if schema.severity is not None else Severity.info
|
|
70
70
|
|
|
71
71
|
best_scores: dict[str, float] = {}
|
|
72
72
|
try:
|
|
@@ -1080,7 +1080,7 @@ class LLMPipelineSchema(BaseModel):
|
|
|
1080
1080
|
False, description='Allow more than one label per asset.'
|
|
1081
1081
|
)
|
|
1082
1082
|
severity: Severity | None = Field(
|
|
1083
|
-
'info',
|
|
1083
|
+
Severity.info,
|
|
1084
1084
|
description='Default severity when no severity_map rule matches a predicted label.',
|
|
1085
1085
|
)
|
|
1086
1086
|
severity_map: list[PipelineSeverityRule] | None = Field(
|
|
@@ -1177,7 +1177,7 @@ class TextClassificationPipelineSchema(BaseModel):
|
|
|
1177
1177
|
le=1.0,
|
|
1178
1178
|
)
|
|
1179
1179
|
severity: Severity | None = Field(
|
|
1180
|
-
'info', description='Default severity when no severity_map rule matches.'
|
|
1180
|
+
Severity.info, description='Default severity when no severity_map rule matches.'
|
|
1181
1181
|
)
|
|
1182
1182
|
severity_map: list[PipelineSeverityRule] | None = Field(
|
|
1183
1183
|
None,
|
|
@@ -42,6 +42,7 @@ class AssetType(StrEnum):
|
|
|
42
42
|
JIRA = 'JIRA'
|
|
43
43
|
SERVICEDESK = 'SERVICEDESK'
|
|
44
44
|
SQLITE = 'SQLITE'
|
|
45
|
+
NOTION = 'NOTION'
|
|
45
46
|
|
|
46
47
|
|
|
47
48
|
class SourceCategory(StrEnum):
|
|
@@ -1870,6 +1871,7 @@ class Type(StrEnum):
|
|
|
1870
1871
|
JIRA = 'JIRA'
|
|
1871
1872
|
SERVICEDESK = 'SERVICEDESK'
|
|
1872
1873
|
SQLITE = 'SQLITE'
|
|
1874
|
+
NOTION = 'NOTION'
|
|
1873
1875
|
|
|
1874
1876
|
|
|
1875
1877
|
class SlackInput(CoreInput):
|
|
@@ -2657,6 +2659,7 @@ class Type17(StrEnum):
|
|
|
2657
2659
|
JIRA = 'JIRA'
|
|
2658
2660
|
SERVICEDESK = 'SERVICEDESK'
|
|
2659
2661
|
SQLITE = 'SQLITE'
|
|
2662
|
+
NOTION = 'NOTION'
|
|
2660
2663
|
|
|
2661
2664
|
|
|
2662
2665
|
class ConfluenceInput(CoreInput):
|
|
@@ -2765,6 +2768,135 @@ class SQLiteInput(CoreInput):
|
|
|
2765
2768
|
resources: ResourceOverrides | None = None
|
|
2766
2769
|
|
|
2767
2770
|
|
|
2771
|
+
class NotionRequired(BaseModel):
|
|
2772
|
+
"""
|
|
2773
|
+
Notion has no required connection fields; the integration token lives in the masked section.
|
|
2774
|
+
"""
|
|
2775
|
+
|
|
2776
|
+
model_config = ConfigDict(
|
|
2777
|
+
extra='forbid',
|
|
2778
|
+
)
|
|
2779
|
+
|
|
2780
|
+
|
|
2781
|
+
class NotionMasked(BaseModel):
|
|
2782
|
+
model_config = ConfigDict(
|
|
2783
|
+
extra='forbid',
|
|
2784
|
+
)
|
|
2785
|
+
notion_token: str = Field(
|
|
2786
|
+
...,
|
|
2787
|
+
description='Notion API token used as a Bearer credential. Accepts an internal integration secret (ntn_...) or an OAuth public-integration access token.',
|
|
2788
|
+
)
|
|
2789
|
+
|
|
2790
|
+
|
|
2791
|
+
class NotionOptionalConnection(BaseModel):
|
|
2792
|
+
"""
|
|
2793
|
+
HTTP, version, and retry settings for Notion API calls.
|
|
2794
|
+
"""
|
|
2795
|
+
|
|
2796
|
+
model_config = ConfigDict(
|
|
2797
|
+
extra='forbid',
|
|
2798
|
+
)
|
|
2799
|
+
notion_version: str | None = Field(
|
|
2800
|
+
'2025-09-03',
|
|
2801
|
+
description='Notion-Version header sent with every request. Defaults to the data-sources API version.',
|
|
2802
|
+
)
|
|
2803
|
+
request_timeout_seconds: float | None = Field(
|
|
2804
|
+
30, description='HTTP request timeout for Notion API calls', ge=1.0
|
|
2805
|
+
)
|
|
2806
|
+
rate_limit_delay_seconds: float | None = Field(
|
|
2807
|
+
0,
|
|
2808
|
+
description='Additional delay between API requests to reduce rate-limit pressure',
|
|
2809
|
+
ge=0.0,
|
|
2810
|
+
)
|
|
2811
|
+
max_retries: int | None = Field(
|
|
2812
|
+
3,
|
|
2813
|
+
description='Maximum retry attempts for transient API failures and rate limits',
|
|
2814
|
+
ge=0,
|
|
2815
|
+
le=10,
|
|
2816
|
+
)
|
|
2817
|
+
|
|
2818
|
+
|
|
2819
|
+
class NotionOptionalScope(BaseModel):
|
|
2820
|
+
"""
|
|
2821
|
+
Optional Notion scope filters. When omitted, all content shared with the integration is eligible for sampling.
|
|
2822
|
+
"""
|
|
2823
|
+
|
|
2824
|
+
model_config = ConfigDict(
|
|
2825
|
+
extra='forbid',
|
|
2826
|
+
)
|
|
2827
|
+
page_ids: list[str] | None = Field(
|
|
2828
|
+
None,
|
|
2829
|
+
description='Restrict extraction to specific page IDs (up to 250)',
|
|
2830
|
+
max_length=250,
|
|
2831
|
+
)
|
|
2832
|
+
data_source_ids: list[str] | None = Field(
|
|
2833
|
+
None,
|
|
2834
|
+
description='Restrict extraction to specific data source IDs (up to 250)',
|
|
2835
|
+
max_length=250,
|
|
2836
|
+
)
|
|
2837
|
+
search_query: str | None = Field(
|
|
2838
|
+
None,
|
|
2839
|
+
description='Optional full-text query passed to the Notion search endpoint to narrow discovery',
|
|
2840
|
+
min_length=1,
|
|
2841
|
+
)
|
|
2842
|
+
|
|
2843
|
+
|
|
2844
|
+
class NotionOptionalContent(BaseModel):
|
|
2845
|
+
"""
|
|
2846
|
+
Notion content extraction controls.
|
|
2847
|
+
"""
|
|
2848
|
+
|
|
2849
|
+
model_config = ConfigDict(
|
|
2850
|
+
extra='forbid',
|
|
2851
|
+
)
|
|
2852
|
+
include_comments: bool | None = Field(
|
|
2853
|
+
True,
|
|
2854
|
+
description='Include page and block comments and aggregate them into a per-page comments asset',
|
|
2855
|
+
)
|
|
2856
|
+
include_files: bool | None = Field(
|
|
2857
|
+
True,
|
|
2858
|
+
description='Materialize files from file/image/pdf/video blocks, file properties, and page icon/cover as related assets',
|
|
2859
|
+
)
|
|
2860
|
+
include_linked_pages: bool | None = Field(
|
|
2861
|
+
True,
|
|
2862
|
+
description='Wire parent, relation, and mention references between pages into the asset links graph',
|
|
2863
|
+
)
|
|
2864
|
+
include_data_sources: bool | None = Field(
|
|
2865
|
+
True,
|
|
2866
|
+
description='Emit Notion data sources (databases) as assets with their schema and link their row pages',
|
|
2867
|
+
)
|
|
2868
|
+
file_max_bytes: int | None = Field(
|
|
2869
|
+
5242880,
|
|
2870
|
+
description='Maximum bytes downloaded per file for MIME inference and text extraction',
|
|
2871
|
+
ge=1024,
|
|
2872
|
+
)
|
|
2873
|
+
|
|
2874
|
+
|
|
2875
|
+
class NotionOptional(BaseModel):
|
|
2876
|
+
model_config = ConfigDict(
|
|
2877
|
+
extra='forbid',
|
|
2878
|
+
)
|
|
2879
|
+
connection: NotionOptionalConnection | None = None
|
|
2880
|
+
scope: NotionOptionalScope | None = None
|
|
2881
|
+
content: NotionOptionalContent | None = None
|
|
2882
|
+
|
|
2883
|
+
|
|
2884
|
+
class NotionInput(CoreInput):
|
|
2885
|
+
type: Literal['NOTION'] = Field('NOTION', description='Type of the asset or source')
|
|
2886
|
+
required: NotionRequired
|
|
2887
|
+
masked: NotionMasked
|
|
2888
|
+
optional: NotionOptional | None = None
|
|
2889
|
+
detectors: list[Detector] | None = Field(
|
|
2890
|
+
None, description='Detectors to run on ingested content'
|
|
2891
|
+
)
|
|
2892
|
+
custom_detectors: list[CustomDetectorSelection] | None = Field(
|
|
2893
|
+
None,
|
|
2894
|
+
description='Reusable custom detector IDs selected from the custom detector catalog.',
|
|
2895
|
+
)
|
|
2896
|
+
sampling: SamplingConfig
|
|
2897
|
+
resources: ResourceOverrides | None = None
|
|
2898
|
+
|
|
2899
|
+
|
|
2768
2900
|
class SourceInput(
|
|
2769
2901
|
RootModel[
|
|
2770
2902
|
SlackInput
|
|
@@ -2787,6 +2919,7 @@ class SourceInput(
|
|
|
2787
2919
|
| JiraInput
|
|
2788
2920
|
| ServiceDeskInput
|
|
2789
2921
|
| SQLiteInput
|
|
2922
|
+
| NotionInput
|
|
2790
2923
|
]
|
|
2791
2924
|
):
|
|
2792
2925
|
root: (
|
|
@@ -2810,6 +2943,7 @@ class SourceInput(
|
|
|
2810
2943
|
| JiraInput
|
|
2811
2944
|
| ServiceDeskInput
|
|
2812
2945
|
| SQLiteInput
|
|
2946
|
+
| NotionInput
|
|
2813
2947
|
) = Field(
|
|
2814
2948
|
...,
|
|
2815
2949
|
description='Merged configuration schema with all source types and common definitions',
|
{classifyre_cli-0.4.12 → classifyre_cli-0.4.13}/src/models/generated_single_asset_scan_results.py
RENAMED
|
@@ -210,7 +210,14 @@ class SingleAssetScanResults(BaseModel):
|
|
|
210
210
|
..., description='Linked asset hashes referenced by this asset', title='Links'
|
|
211
211
|
)
|
|
212
212
|
asset_type: AssetType = Field(
|
|
213
|
-
...,
|
|
213
|
+
...,
|
|
214
|
+
description='Canonical asset content type (used for detector routing)',
|
|
215
|
+
title='Asset Type',
|
|
216
|
+
)
|
|
217
|
+
asset_kind: str | None = Field(
|
|
218
|
+
None,
|
|
219
|
+
description='Catalog asset kind discriminator (file, image, page, comment, table, ...). Persisted as the asset type for display/filtering.',
|
|
220
|
+
title='Asset Kind',
|
|
214
221
|
)
|
|
215
222
|
source_id: str | None = Field(
|
|
216
223
|
None,
|
|
@@ -238,3 +245,8 @@ class SingleAssetScanResults(BaseModel):
|
|
|
238
245
|
description='Statistics about the detector scan for this asset',
|
|
239
246
|
title='Scan Stats',
|
|
240
247
|
)
|
|
248
|
+
metadata: dict[str, Any] | None = Field(
|
|
249
|
+
None,
|
|
250
|
+
description='Source-specific asset metadata using normalized keys (size_bytes, row_count, etc.) where applicable',
|
|
251
|
+
title='Metadata',
|
|
252
|
+
)
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""Asset-metadata contract: the single source of truth for what each source
|
|
2
|
+
extracts is the ``x-asset-metadata`` catalog embedded in
|
|
3
|
+
``packages/schemas/src/schemas/all_input_sources.json``.
|
|
4
|
+
|
|
5
|
+
This module loads/resolves that catalog and validates metadata dicts against it.
|
|
6
|
+
Validation is strict (raises) under pytest or when ``CLASSIFYRE_STRICT_METADATA``
|
|
7
|
+
is set, and otherwise logs a warning during real ingestion — so drift between a
|
|
8
|
+
source's emitted keys and the declared catalog is caught either in CI or at runtime.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
import os
|
|
15
|
+
from functools import cache
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
from ..utils.validation import _load_schema
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
_CATALOG_KEY = "x-asset-metadata"
|
|
23
|
+
|
|
24
|
+
ResolvedField = dict[str, Any] # {name, type, description, required}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class AssetMetadataContractError(AssertionError):
|
|
28
|
+
"""Raised (in strict mode) when emitted metadata violates the catalog."""
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _strict_mode() -> bool:
|
|
32
|
+
return bool(
|
|
33
|
+
os.environ.get("PYTEST_CURRENT_TEST") or os.environ.get("CLASSIFYRE_STRICT_METADATA")
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@cache
|
|
38
|
+
def load_catalog() -> dict[str, Any]:
|
|
39
|
+
"""Load and cache the ``x-asset-metadata`` catalog from the merged schema."""
|
|
40
|
+
schema = _load_schema("all_input_sources.json")
|
|
41
|
+
catalog = schema.get(_CATALOG_KEY)
|
|
42
|
+
if not isinstance(catalog, dict):
|
|
43
|
+
raise AssetMetadataContractError(
|
|
44
|
+
f"Missing '{_CATALOG_KEY}' catalog in all_input_sources.json"
|
|
45
|
+
)
|
|
46
|
+
return catalog
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _source_key(source_type: str) -> str:
|
|
50
|
+
# Catalog keys mirror the AssetType enum (uppercased source_type).
|
|
51
|
+
return source_type.upper()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def describe_type(prop_schema: dict[str, Any]) -> str:
|
|
55
|
+
"""Render a JSON-Schema property type as a short display string."""
|
|
56
|
+
json_type = prop_schema.get("type")
|
|
57
|
+
if json_type == "array":
|
|
58
|
+
items = prop_schema.get("items", {})
|
|
59
|
+
item_type = items.get("type", "string") if isinstance(items, dict) else "string"
|
|
60
|
+
return f"{item_type}[]"
|
|
61
|
+
return str(json_type) if json_type else "string"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def resolve_fields(source_type: str, asset_kind: str) -> list[ResolvedField]:
|
|
65
|
+
"""Resolve the declared fields for a (source, asset kind).
|
|
66
|
+
|
|
67
|
+
The asset entry composes one or more reusable ``contentTypes`` via ``use``
|
|
68
|
+
plus its own ``properties``; ``required`` is the union of each used content
|
|
69
|
+
type's required list and the entry's own. Raises if the entry is absent.
|
|
70
|
+
"""
|
|
71
|
+
catalog = load_catalog()
|
|
72
|
+
sources = catalog.get("sources", {})
|
|
73
|
+
source_entry = sources.get(_source_key(source_type))
|
|
74
|
+
if not isinstance(source_entry, dict) or asset_kind not in source_entry:
|
|
75
|
+
raise AssetMetadataContractError(
|
|
76
|
+
f"No catalog entry for source '{source_type}' asset kind '{asset_kind}'"
|
|
77
|
+
)
|
|
78
|
+
entry = source_entry[asset_kind]
|
|
79
|
+
content_types = catalog.get("contentTypes", {})
|
|
80
|
+
|
|
81
|
+
properties: dict[str, dict[str, Any]] = {}
|
|
82
|
+
required: set[str] = set()
|
|
83
|
+
|
|
84
|
+
for content_type_name in entry.get("use", []):
|
|
85
|
+
content_type = content_types.get(content_type_name, {})
|
|
86
|
+
properties.update(content_type.get("properties", {}))
|
|
87
|
+
required.update(content_type.get("required", []))
|
|
88
|
+
|
|
89
|
+
properties.update(entry.get("properties", {}))
|
|
90
|
+
required.update(entry.get("required", []))
|
|
91
|
+
|
|
92
|
+
return [
|
|
93
|
+
{
|
|
94
|
+
"name": name,
|
|
95
|
+
"type": describe_type(prop),
|
|
96
|
+
"description": prop.get("description", ""),
|
|
97
|
+
"required": name in required,
|
|
98
|
+
}
|
|
99
|
+
for name, prop in properties.items()
|
|
100
|
+
]
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def validate_metadata(
|
|
104
|
+
source_type: str,
|
|
105
|
+
asset_kind: str,
|
|
106
|
+
data: dict[str, Any],
|
|
107
|
+
) -> dict[str, Any]:
|
|
108
|
+
"""Validate an emitted metadata dict against the catalog and return it.
|
|
109
|
+
|
|
110
|
+
Strict mode raises ``AssetMetadataContractError``; otherwise it logs a
|
|
111
|
+
warning. Checks: no undeclared keys, and every required field is present
|
|
112
|
+
with a non-null value.
|
|
113
|
+
"""
|
|
114
|
+
try:
|
|
115
|
+
fields = resolve_fields(source_type, asset_kind)
|
|
116
|
+
except AssetMetadataContractError as exc:
|
|
117
|
+
if _strict_mode():
|
|
118
|
+
raise
|
|
119
|
+
logger.warning("Asset metadata contract: %s", exc)
|
|
120
|
+
return data
|
|
121
|
+
|
|
122
|
+
declared = {field["name"] for field in fields}
|
|
123
|
+
required = {field["name"] for field in fields if field["required"]}
|
|
124
|
+
present_non_null = {key for key, value in data.items() if value is not None}
|
|
125
|
+
|
|
126
|
+
undeclared = sorted(set(data) - declared)
|
|
127
|
+
missing_required = sorted(required - present_non_null)
|
|
128
|
+
|
|
129
|
+
if undeclared or missing_required:
|
|
130
|
+
message = (
|
|
131
|
+
f"[{source_type}/{asset_kind}] "
|
|
132
|
+
f"undeclared={undeclared} missing_required={missing_required}"
|
|
133
|
+
)
|
|
134
|
+
if _strict_mode():
|
|
135
|
+
raise AssetMetadataContractError(message)
|
|
136
|
+
logger.warning("Asset metadata contract drift: %s", message)
|
|
137
|
+
|
|
138
|
+
return data
|
|
@@ -17,6 +17,11 @@ class BaseSource(ABC):
|
|
|
17
17
|
Abstract base class for all metadata extraction sources.
|
|
18
18
|
"""
|
|
19
19
|
|
|
20
|
+
# Stable source identifier, overridden by each concrete source (e.g.
|
|
21
|
+
# "postgresql", "wordpress"). Uppercased it maps to the AssetType enum and
|
|
22
|
+
# the x-asset-metadata catalog key.
|
|
23
|
+
source_type: str = ""
|
|
24
|
+
|
|
20
25
|
# Default batch size for streaming asset results
|
|
21
26
|
BATCH_SIZE: int = 50
|
|
22
27
|
HAS_SUCCESSFUL_RUN_ENV = "CLASSIFYRE_SOURCE_HAS_SUCCESSFUL_RUN"
|
|
@@ -130,6 +135,22 @@ class BaseSource(ABC):
|
|
|
130
135
|
"""
|
|
131
136
|
return calculate_checksum(data)
|
|
132
137
|
|
|
138
|
+
def metadata_fields(self, asset_kind: str, data: dict[str, Any]) -> dict[str, Any]:
|
|
139
|
+
"""Build the ``asset_kind`` + ``metadata`` kwargs for SingleAssetScanResults.
|
|
140
|
+
|
|
141
|
+
Spread into the constructor: ``**self.metadata_fields("page", {...})``.
|
|
142
|
+
``asset_kind`` is the catalog discriminator (persisted as the asset type
|
|
143
|
+
for display); ``metadata`` is validated against ``x-asset-metadata`` for
|
|
144
|
+
this source/kind — strict (raises) under pytest / ``CLASSIFYRE_STRICT_METADATA``,
|
|
145
|
+
otherwise a warning.
|
|
146
|
+
"""
|
|
147
|
+
from .asset_metadata import validate_metadata
|
|
148
|
+
|
|
149
|
+
return {
|
|
150
|
+
"asset_kind": asset_kind,
|
|
151
|
+
"metadata": validate_metadata(self.source_type, asset_kind, data),
|
|
152
|
+
}
|
|
153
|
+
|
|
133
154
|
@abstractmethod
|
|
134
155
|
def abort(self) -> None:
|
|
135
156
|
"""
|
|
@@ -313,6 +313,20 @@ class ConfluenceSource(BaseSource):
|
|
|
313
313
|
"status": page.get("status"),
|
|
314
314
|
"links_count": len(related_hashes),
|
|
315
315
|
}
|
|
316
|
+
asset_metadata: dict[str, Any] = {
|
|
317
|
+
"page_id": page_id,
|
|
318
|
+
"title": title,
|
|
319
|
+
"links_count": len(related_hashes),
|
|
320
|
+
}
|
|
321
|
+
space_id = page.get("spaceId")
|
|
322
|
+
if space_id is not None:
|
|
323
|
+
asset_metadata["space_key"] = str(space_id)
|
|
324
|
+
status = page.get("status")
|
|
325
|
+
if isinstance(status, str) and status:
|
|
326
|
+
asset_metadata["status"] = status
|
|
327
|
+
author_id = page.get("authorId")
|
|
328
|
+
if isinstance(author_id, str) and author_id:
|
|
329
|
+
asset_metadata["author"] = author_id
|
|
316
330
|
page_asset = SingleAssetScanResults(
|
|
317
331
|
hash=page_hash,
|
|
318
332
|
checksum=self.calculate_checksum(page_metadata),
|
|
@@ -330,6 +344,7 @@ class ConfluenceSource(BaseSource):
|
|
|
330
344
|
)
|
|
331
345
|
),
|
|
332
346
|
runner_id=self.runner_id,
|
|
347
|
+
**self.metadata_fields("page", asset_metadata),
|
|
333
348
|
)
|
|
334
349
|
|
|
335
350
|
return [page_asset, *related_assets]
|
|
@@ -368,6 +383,15 @@ class ConfluenceSource(BaseSource):
|
|
|
368
383
|
if download_url:
|
|
369
384
|
self._attachment_download_url_by_hash[attachment_hash] = download_url
|
|
370
385
|
|
|
386
|
+
attachment_metadata: dict[str, Any] = {
|
|
387
|
+
"title": attachment_name,
|
|
388
|
+
"page_hash": page_hash,
|
|
389
|
+
}
|
|
390
|
+
if mime:
|
|
391
|
+
attachment_metadata["mime_type"] = mime
|
|
392
|
+
file_size = attachment.get("fileSize")
|
|
393
|
+
if isinstance(file_size, int):
|
|
394
|
+
attachment_metadata["size_bytes"] = file_size
|
|
371
395
|
assets.append(
|
|
372
396
|
SingleAssetScanResults(
|
|
373
397
|
hash=attachment_hash,
|
|
@@ -380,6 +404,7 @@ class ConfluenceSource(BaseSource):
|
|
|
380
404
|
created_at=now,
|
|
381
405
|
updated_at=now,
|
|
382
406
|
runner_id=self.runner_id,
|
|
407
|
+
**self.metadata_fields("attachment", attachment_metadata),
|
|
383
408
|
)
|
|
384
409
|
)
|
|
385
410
|
hashes.append(attachment_hash)
|
|
@@ -453,6 +478,10 @@ class ConfluenceSource(BaseSource):
|
|
|
453
478
|
created_at=now,
|
|
454
479
|
updated_at=now,
|
|
455
480
|
runner_id=self.runner_id,
|
|
481
|
+
**self.metadata_fields(
|
|
482
|
+
"comments",
|
|
483
|
+
{"page_id": page_id, "comments_count": len(comment_items)},
|
|
484
|
+
),
|
|
456
485
|
)
|
|
457
486
|
return comments_asset, [comments_hash]
|
|
458
487
|
|
|
@@ -578,6 +607,7 @@ class ConfluenceSource(BaseSource):
|
|
|
578
607
|
created_at=now,
|
|
579
608
|
updated_at=now,
|
|
580
609
|
runner_id=self.runner_id,
|
|
610
|
+
**self.metadata_fields("linked_file", {"referenced_by": page_hash}),
|
|
581
611
|
)
|
|
582
612
|
|
|
583
613
|
def _display_name_from_url(self, url: str) -> str:
|
|
@@ -719,6 +719,7 @@ class DatabricksSource(BaseTabularSource):
|
|
|
719
719
|
created_at=now,
|
|
720
720
|
updated_at=now,
|
|
721
721
|
runner_id=self.runner_id,
|
|
722
|
+
**self.metadata_fields("notebook", metadata),
|
|
722
723
|
)
|
|
723
724
|
|
|
724
725
|
# ── Pipelines ────────────────────────────────────────────────────────
|
|
@@ -789,6 +790,7 @@ class DatabricksSource(BaseTabularSource):
|
|
|
789
790
|
created_at=now,
|
|
790
791
|
updated_at=now,
|
|
791
792
|
runner_id=self.runner_id,
|
|
793
|
+
**self.metadata_fields("pipeline", metadata),
|
|
792
794
|
)
|
|
793
795
|
|
|
794
796
|
# ── Custom extract_raw (tables + notebooks + pipelines) ──────────────
|