acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/common/serialized_value.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +18 -3
- datahub/api/entities/datajob/datajob.py +24 -4
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataproduct/dataproduct.py +32 -3
- datahub/api/entities/dataset/dataset.py +47 -72
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +724 -0
- datahub/api/entities/external/external_tag.py +147 -0
- datahub/api/entities/external/lake_formation_external_entites.py +162 -0
- datahub/api/entities/external/restricted_text.py +172 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
- datahub/api/entities/forms/forms.py +37 -37
- datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/base.py +8 -6
- datahub/api/graphql/operation.py +14 -10
- datahub/cli/check_cli.py +91 -9
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/config_utils.py +20 -12
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +133 -34
- datahub/cli/docker_check.py +110 -14
- datahub/cli/docker_cli.py +155 -231
- datahub/cli/exists_cli.py +2 -3
- datahub/cli/get_cli.py +2 -3
- datahub/cli/graphql_cli.py +1422 -0
- datahub/cli/iceberg_cli.py +11 -5
- datahub/cli/ingest_cli.py +25 -26
- datahub/cli/migrate.py +12 -9
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +4 -6
- datahub/cli/quickstart_versioning.py +53 -10
- datahub/cli/specific/assertions_cli.py +39 -7
- datahub/cli/specific/datacontract_cli.py +57 -9
- datahub/cli/specific/dataproduct_cli.py +12 -24
- datahub/cli/specific/dataset_cli.py +31 -21
- datahub/cli/specific/forms_cli.py +2 -5
- datahub/cli/specific/group_cli.py +2 -3
- datahub/cli/specific/structuredproperties_cli.py +5 -7
- datahub/cli/specific/user_cli.py +174 -4
- datahub/cli/state_cli.py +2 -3
- datahub/cli/timeline_cli.py +2 -3
- datahub/configuration/common.py +46 -2
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/env_vars.py +331 -0
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/kafka.py +21 -1
- datahub/configuration/pydantic_migration_helpers.py +6 -13
- datahub/configuration/source_common.py +4 -3
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +8 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/emitter/mce_builder.py +12 -8
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/mcp_builder.py +12 -0
- datahub/emitter/request_helper.py +138 -15
- datahub/emitter/response_helper.py +111 -19
- datahub/emitter/rest_emitter.py +399 -163
- datahub/entrypoints.py +10 -5
- datahub/errors.py +12 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
- datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +381 -3
- datahub/ingestion/api/sink.py +27 -2
- datahub/ingestion/api/source.py +174 -62
- datahub/ingestion/api/source_helpers.py +41 -3
- datahub/ingestion/api/source_protocols.py +23 -0
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3652 -0
- datahub/ingestion/autogenerated/lineage.json +402 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +31 -5
- datahub/ingestion/glossary/classification_mixin.py +9 -2
- datahub/ingestion/graph/client.py +492 -55
- datahub/ingestion/graph/config.py +18 -2
- datahub/ingestion/graph/filters.py +96 -32
- datahub/ingestion/graph/links.py +55 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
- datahub/ingestion/run/pipeline.py +90 -23
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/sink/datahub_kafka.py +1 -0
- datahub/ingestion/sink/datahub_rest.py +31 -23
- datahub/ingestion/sink/file.py +1 -0
- datahub/ingestion/source/abs/config.py +1 -1
- datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
- datahub/ingestion/source/abs/source.py +15 -30
- datahub/ingestion/source/apply/datahub_apply.py +6 -5
- datahub/ingestion/source/aws/aws_common.py +185 -13
- datahub/ingestion/source/aws/glue.py +517 -244
- datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/aws/tag_entities.py +270 -0
- datahub/ingestion/source/azure/azure_common.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
- datahub/ingestion/source/cassandra/cassandra.py +7 -18
- datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
- datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
- datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
- datahub/ingestion/source/common/subtypes.py +73 -1
- datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
- datahub/ingestion/source/data_lake_common/object_store.py +732 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
- datahub/ingestion/source/datahub/config.py +19 -5
- datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
- datahub/ingestion/source/datahub/datahub_source.py +11 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
- datahub/ingestion/source/dbt/dbt_common.py +270 -26
- datahub/ingestion/source/dbt/dbt_core.py +88 -47
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/delta_lake/config.py +9 -5
- datahub/ingestion/source/delta_lake/source.py +8 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_config.py +5 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
- datahub/ingestion/source/dremio/dremio_source.py +228 -215
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/feast.py +12 -14
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/config.py +67 -8
- datahub/ingestion/source/fivetran/fivetran.py +228 -43
- datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
- datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
- datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
- datahub/ingestion/source/fivetran/response_models.py +97 -0
- datahub/ingestion/source/gc/datahub_gc.py +0 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/gcs/gcs_source.py +53 -10
- datahub/ingestion/source/gcs/gcs_utils.py +36 -9
- datahub/ingestion/source/ge_data_profiler.py +146 -33
- datahub/ingestion/source/ge_profiling_config.py +26 -11
- datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
- datahub/ingestion/source/grafana/field_utils.py +307 -0
- datahub/ingestion/source/grafana/grafana_api.py +142 -0
- datahub/ingestion/source/grafana/grafana_config.py +104 -0
- datahub/ingestion/source/grafana/grafana_source.py +522 -84
- datahub/ingestion/source/grafana/lineage.py +202 -0
- datahub/ingestion/source/grafana/models.py +137 -0
- datahub/ingestion/source/grafana/report.py +90 -0
- datahub/ingestion/source/grafana/types.py +16 -0
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +402 -0
- datahub/ingestion/source/hex/constants.py +8 -0
- datahub/ingestion/source/hex/hex.py +311 -0
- datahub/ingestion/source/hex/mapper.py +412 -0
- datahub/ingestion/source/hex/model.py +78 -0
- datahub/ingestion/source/hex/query_fetcher.py +307 -0
- datahub/ingestion/source/iceberg/iceberg.py +385 -164
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +28 -71
- datahub/ingestion/source/kafka/kafka_config.py +78 -0
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
- datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +216 -86
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
- datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
- datahub/ingestion/source/looker/looker_source.py +539 -555
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +31 -3
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +103 -118
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/metabase.py +32 -6
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +11 -10
- datahub/ingestion/source/mlflow.py +254 -23
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/mode.py +359 -181
- datahub/ingestion/source/mongodb.py +11 -1
- datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
- datahub/ingestion/source/nifi.py +5 -5
- datahub/ingestion/source/openapi.py +85 -38
- datahub/ingestion/source/openapi_parser.py +59 -40
- datahub/ingestion/source/powerbi/config.py +92 -27
- datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +66 -32
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
- datahub/ingestion/source/preset.py +3 -3
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +15 -9
- datahub/ingestion/source/redshift/datashares.py +1 -1
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/redshift/query.py +24 -20
- datahub/ingestion/source/redshift/redshift.py +52 -111
- datahub/ingestion/source/redshift/redshift_schema.py +17 -12
- datahub/ingestion/source/redshift/report.py +0 -2
- datahub/ingestion/source/redshift/usage.py +13 -11
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +515 -244
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +28 -13
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/schema_inference/object.py +22 -6
- datahub/ingestion/source/sigma/config.py +75 -8
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/sigma/sigma.py +36 -7
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +403 -140
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/constants.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
- datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
- datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
- datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
- datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
- datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
- datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +219 -26
- datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
- datahub/ingestion/source/sql/clickhouse.py +29 -9
- datahub/ingestion/source/sql/cockroachdb.py +5 -4
- datahub/ingestion/source/sql/druid.py +9 -4
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive.py +28 -8
- datahub/ingestion/source/sql/hive_metastore.py +24 -25
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/job_models.py +18 -2
- datahub/ingestion/source/sql/mssql/source.py +376 -62
- datahub/ingestion/source/sql/mysql.py +154 -4
- datahub/ingestion/source/sql/oracle.py +62 -11
- datahub/ingestion/source/sql/postgres.py +142 -6
- datahub/ingestion/source/sql/presto.py +20 -2
- datahub/ingestion/source/sql/sql_common.py +281 -49
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/sql_types.py +27 -2
- datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
- datahub/ingestion/source/sql/teradata.py +1028 -245
- datahub/ingestion/source/sql/trino.py +43 -10
- datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
- datahub/ingestion/source/sql/vertica.py +14 -7
- datahub/ingestion/source/sql_queries.py +219 -121
- datahub/ingestion/source/state/checkpoint.py +8 -29
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/superset.py +810 -126
- datahub/ingestion/source/tableau/tableau.py +172 -69
- datahub/ingestion/source/tableau/tableau_common.py +11 -4
- datahub/ingestion/source/tableau/tableau_constant.py +1 -4
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +161 -40
- datahub/ingestion/source/unity/connection.py +61 -0
- datahub/ingestion/source/unity/connection_test.py +1 -0
- datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
- datahub/ingestion/source/unity/proxy.py +794 -51
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +36 -2
- datahub/ingestion/source/unity/report.py +15 -3
- datahub/ingestion/source/unity/source.py +465 -131
- datahub/ingestion/source/unity/tag_entities.py +197 -0
- datahub/ingestion/source/unity/usage.py +46 -4
- datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
- datahub/ingestion/source/usage/usage_common.py +4 -68
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1367 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/source_report/ingestion_stage.py +50 -11
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
- datahub/ingestion/transformer/base_transformer.py +8 -5
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/integrations/assertion/common.py +3 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
- datahub/metadata/_urns/urn_defs.py +1866 -1582
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
- datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
- datahub/metadata/schema.avsc +18404 -16617
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/Actors.avsc +38 -1
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/AssetSettings.avsc +63 -0
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
- datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +3 -1
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/IncidentInfo.avsc +3 -3
- datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
- datahub/metadata/schemas/LogicalParent.avsc +145 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
- datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
- datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/Operation.avsc +21 -2
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/QueryProperties.avsc +24 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
- datahub/metadata/schemas/SystemMetadata.avsc +147 -0
- datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +7 -0
- datahub/sdk/_all_entities.py +15 -0
- datahub/sdk/_shared.py +393 -10
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/chart.py +386 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +453 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +180 -4
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +154 -12
- datahub/sdk/lineage_client.py +943 -0
- datahub/sdk/main_client.py +83 -8
- datahub/sdk/mlmodel.py +383 -0
- datahub/sdk/mlmodelgroup.py +240 -0
- datahub/sdk/search_client.py +85 -8
- datahub/sdk/search_filters.py +393 -68
- datahub/secret/datahub_secret_store.py +5 -1
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/specific/chart.py +1 -1
- datahub/specific/datajob.py +15 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/specific/dataset.py +51 -59
- datahub/sql_parsing/_sqlglot_patch.py +1 -2
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/split_statements.py +30 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
- datahub/sql_parsing/sqlglot_lineage.py +517 -44
- datahub/sql_parsing/sqlglot_utils.py +30 -18
- datahub/sql_parsing/tool_meta_extractor.py +25 -2
- datahub/telemetry/telemetry.py +30 -16
- datahub/testing/check_imports.py +1 -1
- datahub/testing/docker_utils.py +8 -2
- datahub/testing/mce_helpers.py +421 -0
- datahub/testing/mcp_diff.py +17 -21
- datahub/testing/sdk_v2_helpers.py +18 -0
- datahub/upgrade/upgrade.py +86 -30
- datahub/utilities/file_backed_collections.py +14 -15
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/utilities/is_pytest.py +3 -2
- datahub/utilities/logging_manager.py +30 -7
- datahub/utilities/mapping.py +29 -2
- datahub/utilities/sample_data.py +5 -4
- datahub/utilities/server_config_util.py +298 -10
- datahub/utilities/sqlalchemy_query_combiner.py +6 -4
- datahub/utilities/stats_collections.py +4 -0
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/utilities/urn_encoder.py +1 -1
- datahub/utilities/urns/urn.py +41 -2
- datahub/emitter/sql_parsing_builder.py +0 -306
- datahub/ingestion/source/redshift/lineage_v2.py +0 -458
- datahub/ingestion/source/vertexai.py +0 -697
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
|
@@ -1,697 +0,0 @@
|
|
|
1
|
-
import dataclasses
|
|
2
|
-
import logging
|
|
3
|
-
from datetime import datetime
|
|
4
|
-
from typing import Any, Dict, Iterable, List, Optional, TypeVar
|
|
5
|
-
|
|
6
|
-
from google.api_core.exceptions import GoogleAPICallError
|
|
7
|
-
from google.cloud import aiplatform
|
|
8
|
-
from google.cloud.aiplatform import (
|
|
9
|
-
AutoMLForecastingTrainingJob,
|
|
10
|
-
AutoMLImageTrainingJob,
|
|
11
|
-
AutoMLTabularTrainingJob,
|
|
12
|
-
AutoMLTextTrainingJob,
|
|
13
|
-
AutoMLVideoTrainingJob,
|
|
14
|
-
Endpoint,
|
|
15
|
-
)
|
|
16
|
-
from google.cloud.aiplatform.base import VertexAiResourceNoun
|
|
17
|
-
from google.cloud.aiplatform.models import Model, VersionInfo
|
|
18
|
-
from google.oauth2 import service_account
|
|
19
|
-
from pydantic import PrivateAttr
|
|
20
|
-
from pydantic.fields import Field
|
|
21
|
-
|
|
22
|
-
import datahub.emitter.mce_builder as builder
|
|
23
|
-
from datahub.configuration.source_common import EnvConfigMixin
|
|
24
|
-
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
25
|
-
from datahub.emitter.mcp_builder import ProjectIdKey, gen_containers
|
|
26
|
-
from datahub.ingestion.api.common import PipelineContext
|
|
27
|
-
from datahub.ingestion.api.decorators import (
|
|
28
|
-
SupportStatus,
|
|
29
|
-
capability,
|
|
30
|
-
config_class,
|
|
31
|
-
platform_name,
|
|
32
|
-
support_status,
|
|
33
|
-
)
|
|
34
|
-
from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
|
|
35
|
-
from datahub.ingestion.api.source_helpers import auto_workunit
|
|
36
|
-
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
|
37
|
-
from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
|
|
38
|
-
from datahub.metadata.com.linkedin.pegasus2avro.ml.metadata import (
|
|
39
|
-
MLTrainingRunProperties,
|
|
40
|
-
)
|
|
41
|
-
from datahub.metadata.schema_classes import (
|
|
42
|
-
AuditStampClass,
|
|
43
|
-
ContainerClass,
|
|
44
|
-
DataProcessInstanceInputClass,
|
|
45
|
-
DataProcessInstancePropertiesClass,
|
|
46
|
-
DatasetPropertiesClass,
|
|
47
|
-
MLModelDeploymentPropertiesClass,
|
|
48
|
-
MLModelGroupPropertiesClass,
|
|
49
|
-
MLModelPropertiesClass,
|
|
50
|
-
SubTypesClass,
|
|
51
|
-
TimeStampClass,
|
|
52
|
-
VersionTagClass,
|
|
53
|
-
)
|
|
54
|
-
from datahub.utilities.str_enum import StrEnum
|
|
55
|
-
from datahub.utilities.time import datetime_to_ts_millis
|
|
56
|
-
|
|
57
|
-
T = TypeVar("T")
|
|
58
|
-
|
|
59
|
-
logger = logging.getLogger(__name__)
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
class VertexAIConfig(EnvConfigMixin):
|
|
63
|
-
credential: Optional[GCPCredential] = Field(
|
|
64
|
-
default=None, description="GCP credential information"
|
|
65
|
-
)
|
|
66
|
-
project_id: str = Field(description=("Project ID in Google Cloud Platform"))
|
|
67
|
-
region: str = Field(
|
|
68
|
-
description=("Region of your project in Google Cloud Platform"),
|
|
69
|
-
)
|
|
70
|
-
bucket_uri: Optional[str] = Field(
|
|
71
|
-
default=None,
|
|
72
|
-
description=("Bucket URI used in your project"),
|
|
73
|
-
)
|
|
74
|
-
vertexai_url: Optional[str] = Field(
|
|
75
|
-
default="https://console.cloud.google.com/vertex-ai",
|
|
76
|
-
description=("VertexUI URI"),
|
|
77
|
-
)
|
|
78
|
-
|
|
79
|
-
_credentials_path: Optional[str] = PrivateAttr(None)
|
|
80
|
-
|
|
81
|
-
def __init__(self, **data: Any):
|
|
82
|
-
super().__init__(**data)
|
|
83
|
-
if self.credential:
|
|
84
|
-
self._credentials_path = self.credential.create_credential_temp_file(
|
|
85
|
-
project_id=self.project_id
|
|
86
|
-
)
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
class MLTypes(StrEnum):
|
|
90
|
-
TRAINING_JOB = "Training Job"
|
|
91
|
-
MODEL = "ML Model"
|
|
92
|
-
MODEL_GROUP = "ML Model Group"
|
|
93
|
-
ENDPOINT = "Endpoint"
|
|
94
|
-
DATASET = "Dataset"
|
|
95
|
-
PROJECT = "Project"
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
@dataclasses.dataclass
class TrainingJobMetadata:
    """
    A training job together with the optional input dataset and output model
    (plus its version) that were resolved for it.
    """

    # The training job resource itself (always present).
    job: VertexAiResourceNoun
    # Dataset the job was trained on, when one could be resolved.
    input_dataset: Optional[VertexAiResourceNoun] = dataclasses.field(default=None)
    # Model produced by the job, when one could be resolved.
    output_model: Optional[Model] = dataclasses.field(default=None)
    # Version of the produced model, when one could be resolved.
    output_model_version: Optional[VersionInfo] = dataclasses.field(default=None)
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
@dataclasses.dataclass
class ModelMetadata:
    """
    A model and one of its versions, with the optional training job URN and
    endpoints associated with it.
    """

    # The registered model.
    model: Model
    # The specific version of the model being described.
    model_version: VersionInfo
    # URN of the training job that produced this model, if known.
    training_job_urn: Optional[str] = dataclasses.field(default=None)
    # Endpoints associated with the model, if any were found.
    endpoints: Optional[List[Endpoint]] = dataclasses.field(default=None)
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
@platform_name("Vertex AI", id="vertexai")
|
|
115
|
-
@config_class(VertexAIConfig)
|
|
116
|
-
@support_status(SupportStatus.TESTING)
|
|
117
|
-
@capability(
|
|
118
|
-
SourceCapability.DESCRIPTIONS,
|
|
119
|
-
"Extract descriptions for Vertex AI Registered Models and Model Versions",
|
|
120
|
-
)
|
|
121
|
-
@capability(SourceCapability.TAGS, "Extract tags for Vertex AI Registered Model Stages")
|
|
122
|
-
class VertexAISource(Source):
|
|
123
|
-
platform: str = "vertexai"
|
|
124
|
-
|
|
125
|
-
def __init__(self, ctx: PipelineContext, config: VertexAIConfig):
    """
    Set up the source: store the config, create the report, and initialize
    the aiplatform SDK for the configured project and region.
    """
    super().__init__(ctx)
    self.config = config
    self.report = SourceReport()

    # Explicit service-account credentials are only built when a credential
    # was configured; otherwise None is handed to aiplatform.init.
    credentials = None
    if self.config.credential:
        credentials = service_account.Credentials.from_service_account_file(
            self.config._credentials_path
        )

    aiplatform.init(
        project=config.project_id,
        location=config.region,
        credentials=credentials,
    )
    self.client = aiplatform

    # Both start as None and are assigned later by other methods of this class.
    self.endpoints: Optional[Dict[str, List[Endpoint]]] = None
    self.datasets: Optional[Dict[str, VertexAiResourceNoun]] = None
|
|
144
|
-
|
|
145
|
-
def get_report(self) -> SourceReport:
    """
    Return the report object held by this source.
    """
    report = self.report
    return report
|
|
147
|
-
|
|
148
|
-
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
    """
    Entry point that yields work units for the supported Vertex AI resources,
    in this order:
    - the project container
    - Models and Model Versions from the Model Registry
    - Training Jobs
    """

    # The project container comes first.
    yield from self._gen_project_workunits()

    # Models / model versions, then training jobs; each producer yields mcps
    # that are wrapped into work units.
    for produce_mcps in (self._get_ml_models_mcps, self._get_training_jobs_mcps):
        yield from auto_workunit(produce_mcps())

    # TODO Fetch Experiments and Experiment Runs
|
|
162
|
-
|
|
163
|
-
def _gen_project_workunits(self) -> Iterable[MetadataWorkUnit]:
    """
    Yield the container work units for the configured project.
    """
    project_key = self._get_project_container()
    project_name = self.config.project_id
    yield from gen_containers(
        container_key=project_key,
        name=project_name,
        sub_types=[MLTypes.PROJECT],
    )
|
|
169
|
-
|
|
170
|
-
def _get_ml_models_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
    """
    List the models in the Model Registry and yield mcps for each model
    (as a model group) and for each of its versions (as a model).
    """
    for registered_model in self.client.Model.list():
        # A Vertex AI Model is emitted as a Model Group.
        yield from self._gen_ml_group_mcps(registered_model)

        # Each Vertex AI Model Version is emitted as a Model.
        for version in registered_model.versioning_registry.list_versions():
            logger.info(
                f"Ingesting a model (name: {registered_model.display_name} id:{registered_model.name})"
            )
            yield from self._get_ml_model_mcps(
                model=registered_model, model_version=version
            )
|
|
187
|
-
|
|
188
|
-
def _get_ml_model_mcps(
|
|
189
|
-
self, model: Model, model_version: VersionInfo
|
|
190
|
-
) -> Iterable[MetadataChangeProposalWrapper]:
|
|
191
|
-
model_meta: ModelMetadata = self._get_ml_model_metadata(model, model_version)
|
|
192
|
-
# Create ML Model Entity
|
|
193
|
-
yield from self._gen_ml_model_mcps(model_meta)
|
|
194
|
-
# Create Endpoint Entity
|
|
195
|
-
yield from self._gen_endpoints_mcps(model_meta)
|
|
196
|
-
|
|
197
|
-
def _get_ml_model_metadata(
    self, model: Model, model_version: VersionInfo
) -> ModelMetadata:
    """
    Build the ModelMetadata for a model version, attaching the endpoints
    found for the model.
    """
    metadata = ModelMetadata(model=model, model_version=model_version)
    # Endpoints are looked up per model, not per version.
    metadata.endpoints = self._search_endpoint(model)
    return metadata
|
|
205
|
-
|
|
206
|
-
def _get_training_jobs_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
    """
    Fetch training jobs from Vertex AI and yield the mcps generated for each one.

    The job classes queried are CustomJob, CustomTrainingJob,
    CustomContainerTrainingJob, CustomPythonPackageTrainingJob,
    AutoMLTabularTrainingJob, AutoMLTextTrainingJob, AutoMLImageTrainingJob,
    AutoMLVideoTrainingJob and AutoMLForecastingTrainingJob. For every job the
    mcps cover the job itself, its inputs and its outputs.
    """
    job_class_names = (
        "CustomJob",
        "CustomTrainingJob",
        "CustomContainerTrainingJob",
        "CustomPythonPackageTrainingJob",
        "AutoMLTabularTrainingJob",
        "AutoMLTextTrainingJob",
        "AutoMLImageTrainingJob",
        "AutoMLVideoTrainingJob",
        "AutoMLForecastingTrainingJob",
    )
    for class_name in job_class_names:
        logger.info(f"Fetching a list of {class_name}s from VertexAI server")
        # Each job class is looked up by name on the client and listed.
        job_class = getattr(self.client, class_name)
        for training_job in job_class.list():
            yield from self._get_training_job_mcps(training_job)
|
|
231
|
-
|
|
232
|
-
def _get_training_job_mcps(
|
|
233
|
-
self, job: VertexAiResourceNoun
|
|
234
|
-
) -> Iterable[MetadataChangeProposalWrapper]:
|
|
235
|
-
job_meta: TrainingJobMetadata = self._get_training_job_metadata(job)
|
|
236
|
-
# Create DataProcessInstance for the training job
|
|
237
|
-
yield from self._gen_training_job_mcps(job_meta)
|
|
238
|
-
# Create Dataset entity for Input Dataset of Training job
|
|
239
|
-
yield from self._get_input_dataset_mcps(job_meta)
|
|
240
|
-
# Create ML Model entity for output ML model of this training job
|
|
241
|
-
yield from self._gen_output_model_mcps(job_meta)
|
|
242
|
-
|
|
243
|
-
def _gen_output_model_mcps(
    self, job_meta: TrainingJobMetadata
) -> Iterable[MetadataChangeProposalWrapper]:
    """
    Yield the ML Model mcps for the model a training job produced, linking
    the model back to the job. Yields nothing unless both the output model
    and its version are known.
    """
    output_model = job_meta.output_model
    output_version = job_meta.output_model_version
    if not (output_model and output_version):
        return

    training_job = job_meta.job
    training_job_name = self._make_vertexai_job_name(entity_id=training_job.name)
    training_job_urn = builder.make_data_process_instance_urn(training_job_name)

    model_meta = ModelMetadata(
        model=output_model,
        model_version=output_version,
        training_job_urn=training_job_urn,
    )
    yield from self._gen_ml_model_mcps(model_meta)
|
|
259
|
-
|
|
260
|
-
def _gen_training_job_mcps(
    self, job_meta: TrainingJobMetadata
) -> Iterable[MetadataChangeProposalWrapper]:
    """
    Yield the mcps describing a Vertex AI training job as a data process instance.
    """
    job = job_meta.job
    job_id = self._make_vertexai_job_name(entity_id=job.name)
    job_urn = builder.make_data_process_instance_urn(job_id)

    # Fall back to the current time when the job carries no creation time.
    if job.create_time:
        created_time = datetime_to_ts_millis(job.create_time)
    else:
        created_time = datetime_to_ts_millis(datetime.now())
    created_actor = "urn:li:corpuser:datahub"

    # URN of the input dataset, only when the job has one.
    dataset_urn = None
    if job_meta.input_dataset:
        dataset_urn = builder.make_dataset_urn(
            platform=self.platform,
            name=self._make_vertexai_dataset_name(
                entity_id=job_meta.input_dataset.name
            ),
            env=self.config.env,
        )

    external_url = self._make_job_external_url(job)

    properties_aspect = DataProcessInstancePropertiesClass(
        name=job_id,
        created=AuditStampClass(time=created_time, actor=created_actor),
        externalUrl=external_url,
        customProperties={
            "displayName": job.display_name,
            "jobType": job.__class__.__name__,
        },
    )
    training_run_aspect = MLTrainingRunProperties(
        externalUrl=external_url, id=job.name
    )
    subtype_aspect = SubTypesClass(typeNames=[MLTypes.TRAINING_JOB])
    container_aspect = ContainerClass(
        container=self._get_project_container().as_urn()
    )
    # The input aspect is left as None when there is no input dataset.
    input_aspect = (
        DataProcessInstanceInputClass(inputs=[dataset_urn]) if dataset_urn else None
    )

    yield from MetadataChangeProposalWrapper.construct_many(
        job_urn,
        aspects=[
            properties_aspect,
            training_run_aspect,
            subtype_aspect,
            container_aspect,
            input_aspect,
        ],
    )
|
|
315
|
-
|
|
316
|
-
def _gen_ml_group_mcps(
    self,
    model: Model,
) -> Iterable[MetadataChangeProposalWrapper]:
    """
    Yield the MLModelGroup mcps for a Vertex AI Model.
    """
    group_urn = self._make_ml_model_group_urn(model)
    group_name = self._make_vertexai_model_group_name(model.name)

    # Timestamps are only emitted when the model carries them.
    created = None
    if model.create_time:
        created = TimeStampClass(time=datetime_to_ts_millis(model.create_time))

    last_modified = None
    if model.update_time:
        last_modified = TimeStampClass(time=datetime_to_ts_millis(model.update_time))

    group_properties = MLModelGroupPropertiesClass(
        name=group_name,
        description=model.description,
        created=created,
        lastModified=last_modified,
        customProperties={"displayName": model.display_name},
    )

    yield from MetadataChangeProposalWrapper.construct_many(
        group_urn,
        aspects=[
            group_properties,
            # TODO add following when metadata model for mlgroup is updated (these aspects not supported currently)
            # SubTypesClass(typeNames=[MLTypes.MODEL_GROUP]),
            # ContainerClass(container=self._get_project_container().as_urn())
        ],
    )
|
|
348
|
-
|
|
349
|
-
def _make_ml_model_group_urn(self, model: Model) -> str:
    """
    Build the model group URN for a Vertex AI Model from its name.
    """
    group_name = self._make_vertexai_model_group_name(model.name)
    return builder.make_ml_model_group_urn(
        platform=self.platform,
        group_name=group_name,
        env=self.config.env,
    )
|
|
356
|
-
|
|
357
|
-
def _get_project_container(self) -> ProjectIdKey:
    """
    Return the container key for the configured project on this platform.
    """
    project_key = ProjectIdKey(
        project_id=self.config.project_id,
        platform=self.platform,
    )
    return project_key
|
|
359
|
-
|
|
360
|
-
def _is_automl_job(self, job: VertexAiResourceNoun) -> bool:
    """
    Tell whether the job is an instance of one of the AutoML training job classes.
    """
    automl_job_classes = (
        AutoMLTabularTrainingJob,
        AutoMLTextTrainingJob,
        AutoMLImageTrainingJob,
        AutoMLVideoTrainingJob,
        AutoMLForecastingTrainingJob,
    )
    return isinstance(job, automl_job_classes)
|
|
368
|
-
|
|
369
|
-
def _search_model_version(
|
|
370
|
-
self, model: Model, version_id: str
|
|
371
|
-
) -> Optional[VersionInfo]:
|
|
372
|
-
for version in model.versioning_registry.list_versions():
|
|
373
|
-
if version.version_id == version_id:
|
|
374
|
-
return version
|
|
375
|
-
return None
|
|
376
|
-
|
|
377
|
-
def _search_dataset(self, dataset_id: str) -> Optional[VertexAiResourceNoun]:
|
|
378
|
-
"""
|
|
379
|
-
Search for a dataset by its ID in Vertex AI.
|
|
380
|
-
This method iterates through different types of datasets (Text, Tabular, Image,
|
|
381
|
-
TimeSeries, and Video) to find a dataset that matches the given dataset ID.
|
|
382
|
-
"""
|
|
383
|
-
|
|
384
|
-
dataset_types = [
|
|
385
|
-
"TextDataset",
|
|
386
|
-
"TabularDataset",
|
|
387
|
-
"ImageDataset",
|
|
388
|
-
"TimeSeriesDataset",
|
|
389
|
-
"VideoDataset",
|
|
390
|
-
]
|
|
391
|
-
|
|
392
|
-
if self.datasets is None:
|
|
393
|
-
self.datasets = {}
|
|
394
|
-
|
|
395
|
-
for dtype in dataset_types:
|
|
396
|
-
dataset_class = getattr(self.client.datasets, dtype)
|
|
397
|
-
for ds in dataset_class.list():
|
|
398
|
-
self.datasets[ds.name] = ds
|
|
399
|
-
|
|
400
|
-
return self.datasets.get(dataset_id)
|
|
401
|
-
|
|
402
|
-
def _get_input_dataset_mcps(
    self, job_meta: TrainingJobMetadata
) -> Iterable[MetadataChangeProposalWrapper]:
    """
    Yield the Dataset mcps for the input dataset of a training job.
    Yields nothing when the job has no input dataset.
    """
    input_dataset = job_meta.input_dataset
    if not input_dataset:
        return

    # The same generated name is used for the URN and for the properties aspect.
    dataset_name = self._make_vertexai_dataset_name(entity_id=input_dataset.name)
    dataset_urn = builder.make_dataset_urn(
        platform=self.platform,
        name=dataset_name,
        env=self.config.env,
    )

    created = None
    if input_dataset.create_time:
        created = TimeStampClass(
            time=datetime_to_ts_millis(input_dataset.create_time)
        )

    dataset_properties = DatasetPropertiesClass(
        name=dataset_name,
        created=created,
        description=f"Dataset: {input_dataset.display_name}",
        customProperties={
            "displayName": input_dataset.display_name,
            "resourceName": input_dataset.resource_name,
        },
        qualifiedName=input_dataset.resource_name,
    )

    yield from MetadataChangeProposalWrapper.construct_many(
        dataset_urn,
        aspects=[
            dataset_properties,
            SubTypesClass(typeNames=[MLTypes.DATASET]),
            ContainerClass(container=self._get_project_container().as_urn()),
        ],
    )
|
|
440
|
-
|
|
441
|
-
def _get_training_job_metadata(
    self, job: VertexAiResourceNoun
) -> TrainingJobMetadata:
    """
    Retrieve metadata for a given Vertex AI training job.

    The job's dict representation is inspected for an input dataset
    ("inputDataConfig" -> "datasetId") and an output model
    ("modelToUpload" -> "name" / "versionId"). Whatever can be resolved is
    attached to the returned TrainingJobMetadata; missing or empty entries
    are simply skipped.

    A GoogleAPICallError raised while resolving the output model is recorded
    on the source report instead of being propagated.
    """
    job_meta = TrainingJobMetadata(job=job)
    job_conf = job.to_dict()

    # Input dataset: tolerate a missing or null "inputDataConfig" section.
    input_conf = job_conf.get("inputDataConfig") or {}
    if "datasetId" in input_conf:
        dataset_id = input_conf["datasetId"]
        logger.info(
            f"Found input dataset (id: {dataset_id}) for training job ({job.display_name})"
        )

        if dataset_id:
            input_ds = self._search_dataset(dataset_id)
            if input_ds:
                logger.info(
                    f"Found the name of input dataset ({input_ds.display_name}) with dataset id ({dataset_id})"
                )
                job_meta.input_dataset = input_ds

    # Output model: both the name and the version id are required. Using .get
    # avoids a KeyError for jobs whose "modelToUpload" has no "versionId".
    model_conf = job_conf.get("modelToUpload") or {}
    model_name = model_conf.get("name")
    model_version_str = model_conf.get("versionId")
    if model_name and model_version_str:
        try:
            model = Model(model_name=model_name)
            model_version = self._search_model_version(model, model_version_str)
            if model and model_version:
                logger.info(
                    f"Found output model (name:{model.display_name} id:{model_version_str}) "
                    f"for training job: {job.display_name}"
                )
                job_meta.output_model = model
                job_meta.output_model_version = model_version
        except GoogleAPICallError as e:
            self.report.report_failure(
                title="Unable to fetch model and model version",
                message="Encountered an error while fetching output model and model version which training job generates",
                exc=e,
            )

    return job_meta
|
|
496
|
-
|
|
497
|
-
def _gen_endpoints_mcps(
    self, model_meta: ModelMetadata
) -> Iterable[MetadataChangeProposalWrapper]:
    """
    Yield the deployment mcps for every endpoint attached to the model metadata.
    Yields nothing when there are no endpoints.
    """
    model: Model = model_meta.model
    model_version: VersionInfo = model_meta.model_version

    if not model_meta.endpoints:
        return

    for endpoint in model_meta.endpoints:
        # The deployment URN is derived from the endpoint's name.
        deployment_name = self._make_vertexai_endpoint_name(entity_id=endpoint.name)
        endpoint_urn = builder.make_ml_model_deployment_urn(
            platform=self.platform,
            deployment_name=deployment_name,
            env=self.config.env,
        )

        deployment_properties = MLModelDeploymentPropertiesClass(
            description=model.description,
            createdAt=datetime_to_ts_millis(endpoint.create_time),
            version=VersionTagClass(versionTag=str(model_version.version_id)),
            customProperties={"displayName": endpoint.display_name},
        )

        yield from MetadataChangeProposalWrapper.construct_many(
            endpoint_urn,
            aspects=[
                deployment_properties,
                # TODO add followings when metadata for MLModelDeployment is updated (these aspects not supported currently)
                # ContainerClass(container=self._get_project_container().as_urn()),
                # SubTypesClass(typeNames=[MLTypes.ENDPOINT])
            ],
        )
|
|
529
|
-
|
|
530
|
-
def _gen_ml_model_mcps(
    self, ModelMetadata: ModelMetadata
) -> Iterable[MetadataChangeProposalWrapper]:
    """
    Generate the MLModel mcps for a Vertex AI Model Version.

    The emitted model is linked to its model group, to the training job that
    produced it (when known) and to the deployments of its endpoints.

    NOTE: the parameter name shadows the ModelMetadata class inside this
    method; it is kept unchanged so existing callers keep working.
    """
    model_meta = ModelMetadata
    model: Model = model_meta.model
    model_version: VersionInfo = model_meta.model_version
    training_job_urn: Optional[str] = model_meta.training_job_urn
    endpoints: Optional[List[Endpoint]] = model_meta.endpoints
    endpoint_urns: List[str] = []

    # Use the module logger (not the root logging module) like the rest of the file.
    logger.info(f"generating model mcp for {model.name}")

    # Build the deployment URNs. They must be derived from endpoint.name, the
    # same key _gen_endpoints_mcps uses for the deployment entities; deriving
    # them from display_name would link the model to URNs that are never emitted.
    for endpoint in endpoints or []:
        logger.info(
            f"found endpoint ({endpoint.display_name}) for model ({model.resource_name})"
        )
        endpoint_urns.append(
            builder.make_ml_model_deployment_urn(
                platform=self.platform,
                deployment_name=self._make_vertexai_endpoint_name(
                    entity_id=endpoint.name
                ),
                env=self.config.env,
            )
        )

    # Create URN for Model and Model Version
    model_group_urn = self._make_ml_model_group_urn(model)
    model_name = self._make_vertexai_model_name(entity_id=model.name)
    model_version_name = f"{model_name}_{model_version.version_id}"
    model_urn = self._make_ml_model_urn(model_version, model_name=model_name)

    yield from MetadataChangeProposalWrapper.construct_many(
        entityUrn=model_urn,
        aspects=[
            MLModelPropertiesClass(
                name=model_version_name,
                description=model_version.version_description,
                customProperties={
                    "displayName": f"{model_version.model_display_name}",
                    "versionId": f"{model_version.version_id}",
                    "resourceName": model.resource_name,
                },
                created=(
                    TimeStampClass(
                        datetime_to_ts_millis(model_version.version_create_time)
                    )
                    if model_version.version_create_time
                    else None
                ),
                lastModified=(
                    TimeStampClass(
                        datetime_to_ts_millis(model_version.version_update_time)
                    )
                    if model_version.version_update_time
                    else None
                ),
                version=VersionTagClass(versionTag=str(model_version.version_id)),
                groups=[model_group_urn],  # link model version to model group
                trainingJobs=(
                    [training_job_urn] if training_job_urn else None
                ),  # link to training job
                deployments=endpoint_urns,
                externalUrl=self._make_model_version_external_url(model),
                type="ML Model",
            ),
            # TODO Add a container for Project as parent of the dataset
            # ContainerClass(
            #     container=self._get_project_container().as_urn(),
            # )
        ],
    )
|
|
607
|
-
|
|
608
|
-
def _search_endpoint(self, model: Model) -> List[Endpoint]:
|
|
609
|
-
"""
|
|
610
|
-
Search for an endpoint associated with the model.
|
|
611
|
-
"""
|
|
612
|
-
if self.endpoints is None:
|
|
613
|
-
endpoint_dict: Dict[str, List[Endpoint]] = {}
|
|
614
|
-
for endpoint in self.client.Endpoint.list():
|
|
615
|
-
for resource in endpoint.list_models():
|
|
616
|
-
if resource.model not in endpoint_dict:
|
|
617
|
-
endpoint_dict[resource.model] = []
|
|
618
|
-
endpoint_dict[resource.model].append(endpoint)
|
|
619
|
-
self.endpoints = endpoint_dict
|
|
620
|
-
|
|
621
|
-
endpoints = (
|
|
622
|
-
self.endpoints[model.resource_name]
|
|
623
|
-
if model.resource_name in self.endpoints
|
|
624
|
-
else []
|
|
625
|
-
)
|
|
626
|
-
return endpoints
|
|
627
|
-
|
|
628
|
-
def _make_ml_model_urn(self, model_version: VersionInfo, model_name: str) -> str:
    """
    Build the ML model URN for a model version; the URN name is the model
    name and the version id joined by an underscore.
    """
    versioned_name = f"{model_name}_{model_version.version_id}"
    return builder.make_ml_model_urn(
        platform=self.platform,
        model_name=versioned_name,
        env=self.config.env,
    )
|
|
635
|
-
|
|
636
|
-
def _make_training_job_urn(self, job: VertexAiResourceNoun) -> str:
    """
    Build the data process instance URN for a training job from its name.
    """
    return builder.make_data_process_instance_urn(
        dataProcessInstanceId=self._make_vertexai_job_name(entity_id=job.name)
    )
|
|
640
|
-
|
|
641
|
-
def _make_vertexai_model_group_name(
|
|
642
|
-
self,
|
|
643
|
-
entity_id: str,
|
|
644
|
-
) -> str:
|
|
645
|
-
return f"{self.config.project_id}.model_group.{entity_id}"
|
|
646
|
-
|
|
647
|
-
def _make_vertexai_endpoint_name(self, entity_id: str) -> str:
|
|
648
|
-
return f"{self.config.project_id}.endpoint.{entity_id}"
|
|
649
|
-
|
|
650
|
-
def _make_vertexai_model_name(self, entity_id: str) -> str:
|
|
651
|
-
return f"{self.config.project_id}.model.{entity_id}"
|
|
652
|
-
|
|
653
|
-
def _make_vertexai_dataset_name(self, entity_id: str) -> str:
|
|
654
|
-
return f"{self.config.project_id}.dataset.{entity_id}"
|
|
655
|
-
|
|
656
|
-
def _make_vertexai_job_name(
|
|
657
|
-
self,
|
|
658
|
-
entity_id: Optional[str],
|
|
659
|
-
) -> str:
|
|
660
|
-
return f"{self.config.project_id}.job.{entity_id}"
|
|
661
|
-
|
|
662
|
-
def _make_job_external_url(self, job: VertexAiResourceNoun) -> str:
|
|
663
|
-
"""
|
|
664
|
-
Model external URL in Vertex AI
|
|
665
|
-
Sample URLs:
|
|
666
|
-
https://console.cloud.google.com/vertex-ai/training/training-pipelines?project=acryl-poc&trainingPipelineId=5401695018589093888
|
|
667
|
-
"""
|
|
668
|
-
external_url: str = (
|
|
669
|
-
f"{self.config.vertexai_url}/training/training-pipelines?trainingPipelineId={job.name}"
|
|
670
|
-
f"?project={self.config.project_id}"
|
|
671
|
-
)
|
|
672
|
-
return external_url
|
|
673
|
-
|
|
674
|
-
def _make_model_external_url(self, model: Model) -> str:
|
|
675
|
-
"""
|
|
676
|
-
Model external URL in Vertex AI
|
|
677
|
-
Sample URL:
|
|
678
|
-
https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/812468724182286336?project=acryl-poc
|
|
679
|
-
"""
|
|
680
|
-
external_url: str = (
|
|
681
|
-
f"{self.config.vertexai_url}/models/locations/{self.config.region}/models/{model.name}"
|
|
682
|
-
f"?project={self.config.project_id}"
|
|
683
|
-
)
|
|
684
|
-
return external_url
|
|
685
|
-
|
|
686
|
-
def _make_model_version_external_url(self, model: Model) -> str:
|
|
687
|
-
"""
|
|
688
|
-
Model Version external URL in Vertex AI
|
|
689
|
-
Sample URL:
|
|
690
|
-
https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/812468724182286336/versions/1?project=acryl-poc
|
|
691
|
-
"""
|
|
692
|
-
external_url: str = (
|
|
693
|
-
f"{self.config.vertexai_url}/models/locations/{self.config.region}/models/{model.name}"
|
|
694
|
-
f"/versions/{model.version_id}"
|
|
695
|
-
f"?project={self.config.project_id}"
|
|
696
|
-
)
|
|
697
|
-
return external_url
|