acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
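
The largest single addition above is the new Vertex AI source (entry 338), whose diff follows. As a side note, a file-level comparison like the table above can be reproduced locally with nothing but the standard library; a minimal sketch, assuming both wheels have already been downloaded (e.g. via pip download --no-deps; the paths are placeholders):

import zipfile

def wheel_files(path: str) -> set:
    # A wheel is a zip archive; its member names are the packaged files.
    with zipfile.ZipFile(path) as whl:
        return set(whl.namelist())

old = wheel_files("acryl_datahub-1.0.0rc18-py3-none-any.whl")
new = wheel_files("acryl_datahub-1.3.0.1rc9-py3-none-any.whl")

print("added:", len(new - old))
print("removed:", len(old - new))
print("common (possibly modified):", len(old & new))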
datahub/ingestion/source/vertexai/vertexai.py (new file)
@@ -0,0 +1,1367 @@
+ import dataclasses
+ import logging
+ from datetime import datetime, timedelta
+ from typing import Dict, Iterable, List, Optional, Tuple, TypeVar, Union
+
+ from google.api_core.exceptions import GoogleAPICallError
+ from google.cloud import aiplatform
+ from google.cloud.aiplatform import (
+     AutoMLForecastingTrainingJob,
+     AutoMLImageTrainingJob,
+     AutoMLTabularTrainingJob,
+     AutoMLTextTrainingJob,
+     AutoMLVideoTrainingJob,
+     Endpoint,
+     ExperimentRun,
+     PipelineJob,
+ )
+ from google.cloud.aiplatform.base import VertexAiResourceNoun
+ from google.cloud.aiplatform.metadata.execution import Execution
+ from google.cloud.aiplatform.metadata.experiment_resources import Experiment
+ from google.cloud.aiplatform.models import Model, VersionInfo
+ from google.cloud.aiplatform.training_jobs import _TrainingJob
+ from google.cloud.aiplatform_v1.types import (
+     PipelineJob as PipelineJobType,
+     PipelineTaskDetail,
+ )
+ from google.oauth2 import service_account
+ from google.protobuf import timestamp_pb2
+
+ import datahub.emitter.mce_builder as builder
+ from datahub.api.entities.datajob import DataFlow, DataJob
+ from datahub.emitter.mcp import MetadataChangeProposalWrapper
+ from datahub.emitter.mcp_builder import (
+     ExperimentKey,
+     ProjectIdKey,
+     gen_containers,
+ )
+ from datahub.ingestion.api.common import PipelineContext
+ from datahub.ingestion.api.decorators import (
+     SupportStatus,
+     capability,
+     config_class,
+     platform_name,
+     support_status,
+ )
+ from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
+ from datahub.ingestion.api.source_helpers import auto_workunit
+ from datahub.ingestion.api.workunit import MetadataWorkUnit
+ from datahub.ingestion.source.common.subtypes import MLAssetSubTypes
+ from datahub.ingestion.source.vertexai.vertexai_config import VertexAIConfig
+ from datahub.ingestion.source.vertexai.vertexai_result_type_utils import (
+     get_execution_result_status,
+     get_job_result_status,
+     get_pipeline_task_result_status,
+     is_status_for_run_event_class,
+ )
+ from datahub.metadata.com.linkedin.pegasus2avro.dataprocess import (
+     DataProcessInstanceRelationships,
+ )
+ from datahub.metadata.com.linkedin.pegasus2avro.ml.metadata import (
+     MLTrainingRunProperties,
+ )
+ from datahub.metadata.schema_classes import (
+     AuditStampClass,
+     ContainerClass,
+     DataPlatformInstanceClass,
+     DataProcessInstanceInputClass,
+     DataProcessInstanceOutputClass,
+     DataProcessInstancePropertiesClass,
+     DataProcessInstanceRunEventClass,
+     DataProcessInstanceRunResultClass,
+     DataProcessRunStatusClass,
+     DatasetPropertiesClass,
+     EdgeClass,
+     MetadataAttributionClass,
+     MLHyperParamClass,
+     MLMetricClass,
+     MLModelDeploymentPropertiesClass,
+     MLModelGroupPropertiesClass,
+     MLModelPropertiesClass,
+     MLTrainingRunPropertiesClass,
+     RunResultTypeClass,
+     SubTypesClass,
+     TimeStampClass,
+     VersionPropertiesClass,
+     VersionTagClass,
+ )
+ from datahub.metadata.urns import (
+     DataFlowUrn,
+     DataJobUrn,
+     DataPlatformUrn,
+     MlModelUrn,
+     VersionSetUrn,
+ )
+ from datahub.utilities.time import datetime_to_ts_millis
+
+ T = TypeVar("T")
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclasses.dataclass
+ class TrainingJobMetadata:
+     job: VertexAiResourceNoun
+     input_dataset: Optional[VertexAiResourceNoun] = None
+     output_model: Optional[Model] = None
+     output_model_version: Optional[VersionInfo] = None
+
+
+ @dataclasses.dataclass
+ class ModelMetadata:
+     model: Model
+     model_version: VersionInfo
+     training_job_urn: Optional[str] = None
+     endpoints: Optional[List[Endpoint]] = None
+
+
+ @dataclasses.dataclass
+ class PipelineTaskMetadata:
+     name: str
+     urn: DataJobUrn
+     id: Optional[int] = None
+     type: Optional[str] = None
+     state: Optional[PipelineTaskDetail.State] = None
+     start_time: Optional[timestamp_pb2.Timestamp] = None
+     create_time: Optional[timestamp_pb2.Timestamp] = None
+     end_time: Optional[timestamp_pb2.Timestamp] = None
+     upstreams: Optional[List[DataJobUrn]] = None
+     duration: Optional[int] = None
+
+
+ @dataclasses.dataclass
+ class PipelineMetadata:
+     name: str
+     resource_name: str
+     tasks: List[PipelineTaskMetadata]
+     urn: DataFlowUrn
+     id: Optional[str] = None
+     labels: Optional[Dict[str, str]] = None
+     create_time: Optional[datetime] = None
+     update_time: Optional[datetime] = None
+     duration: Optional[timedelta] = None
+     region: Optional[str] = None
+
+
+ @platform_name("Vertex AI", id="vertexai")
+ @config_class(VertexAIConfig)
+ @support_status(SupportStatus.INCUBATING)
+ @capability(
+     SourceCapability.DESCRIPTIONS,
+     "Extract descriptions for Vertex AI Registered Models and Model Versions",
+ )
+ class VertexAISource(Source):
+     platform: str = "vertexai"
+
+     def __init__(self, ctx: PipelineContext, config: VertexAIConfig):
+         super().__init__(ctx)
+         self.config = config
+         self.report = SourceReport()
+
+         creds = self.config.get_credentials()
+         credentials = (
+             service_account.Credentials.from_service_account_info(creds)
+             if creds
+             else None
+         )
+
+         aiplatform.init(
+             project=config.project_id, location=config.region, credentials=credentials
+         )
+         self.client = aiplatform
+         self.endpoints: Optional[Dict[str, List[Endpoint]]] = None
+         self.datasets: Optional[Dict[str, VertexAiResourceNoun]] = None
+         self.experiments: Optional[List[Experiment]] = None
+
+     def get_report(self) -> SourceReport:
+         return self.report
+
+     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+         """
+         Main function to fetch and yield MCPs for the various Vertex AI resources:
+         - Models and Model Versions from the Model Registry
+         - Training Jobs
+         """
+
+         # Ingest the project container
+         yield from self._gen_project_workunits()
+         # Fetch and ingest Models and Model Versions from the Model Registry
+         yield from auto_workunit(self._get_ml_models_mcps())
+         # Fetch and ingest Training Jobs
+         yield from auto_workunit(self._get_training_jobs_mcps())
+         # Fetch and ingest Experiments
+         yield from self._get_experiments_workunits()
+         # Fetch and ingest Experiment Runs
+         yield from auto_workunit(self._get_experiment_runs_mcps())
+         # Fetch Pipelines and Tasks
+         yield from auto_workunit(self._get_pipelines_mcps())
+
+     def _get_pipelines_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
+         """
+         Fetch pipelines from Vertex AI and generate corresponding MCPs.
+         """
+
+         pipeline_jobs = self.client.PipelineJob.list()
+
+         for pipeline in pipeline_jobs:
+             logger.info(f"fetching pipeline ({pipeline.name})")
+             pipeline_meta = self._get_pipeline_metadata(pipeline)
+             yield from self._get_pipeline_mcps(pipeline_meta)
+             yield from self._gen_pipeline_task_mcps(pipeline_meta)
+
+     def _get_pipeline_tasks_metadata(
+         self, pipeline: PipelineJob, pipeline_urn: DataFlowUrn
+     ) -> List[PipelineTaskMetadata]:
+         tasks: List[PipelineTaskMetadata] = list()
+         task_map: Dict[str, PipelineTaskDetail] = dict()
+         for task in pipeline.task_details:
+             task_map[task.task_name] = task
+
+         resource = pipeline.gca_resource
+         if isinstance(resource, PipelineJobType):
+             for task_name in resource.pipeline_spec["root"]["dag"]["tasks"]:
+                 logger.debug(
+                     f"fetching pipeline task ({task_name}) in pipeline ({pipeline.name})"
+                 )
+                 task_urn = DataJobUrn.create_from_ids(
+                     data_flow_urn=str(pipeline_urn),
+                     job_id=self._make_vertexai_pipeline_task_id(task_name),
+                 )
+                 task_meta = PipelineTaskMetadata(name=task_name, urn=task_urn)
+                 if (
+                     "dependentTasks"
+                     in resource.pipeline_spec["root"]["dag"]["tasks"][task_name]
+                 ):
+                     upstream_tasks = resource.pipeline_spec["root"]["dag"]["tasks"][
+                         task_name
+                     ]["dependentTasks"]
+                     upstream_urns = [
+                         DataJobUrn.create_from_ids(
+                             data_flow_urn=str(pipeline_urn),
+                             job_id=self._make_vertexai_pipeline_task_id(upstream_task),
+                         )
+                         for upstream_task in upstream_tasks
+                     ]
+                     task_meta.upstreams = upstream_urns
+
+                 task_detail = task_map.get(task_name)
+                 if task_detail:
+                     task_meta.id = task_detail.task_id
+                     task_meta.state = task_detail.state
+                     task_meta.start_time = task_detail.start_time
+                     task_meta.create_time = task_detail.create_time
+                     if task_detail.end_time and task_detail.start_time:
+                         task_meta.end_time = task_detail.end_time
+                         task_meta.duration = int(
+                             (
+                                 task_meta.end_time.timestamp()
+                                 - task_meta.start_time.timestamp()
+                             )
+                             * 1000
+                         )
+
+                 tasks.append(task_meta)
+         return tasks
+
+     def _get_pipeline_metadata(self, pipeline: PipelineJob) -> PipelineMetadata:
+         dataflow_urn = DataFlowUrn.create_from_ids(
+             orchestrator=self.platform,
+             env=self.config.env,
+             flow_id=self._make_vertexai_pipeline_id(pipeline.name),
+             platform_instance=self.platform,
+         )
+         tasks = self._get_pipeline_tasks_metadata(
+             pipeline=pipeline, pipeline_urn=dataflow_urn
+         )
+
+         pipeline_meta = PipelineMetadata(
+             name=pipeline.name,
+             resource_name=pipeline.resource_name,
+             urn=dataflow_urn,
+             tasks=tasks,
+         )
+         pipeline_meta.labels = pipeline.labels
+         pipeline_meta.create_time = pipeline.create_time
+         pipeline_meta.region = pipeline.location
+         if pipeline.update_time:
+             pipeline_meta.update_time = pipeline.update_time
+             pipeline_meta.duration = timedelta(
+                 milliseconds=datetime_to_ts_millis(pipeline.update_time)
+                 - datetime_to_ts_millis(pipeline.create_time)
+             )
+         return pipeline_meta
+
+     def _gen_pipeline_task_run_mcps(
+         self, task: PipelineTaskMetadata, datajob: DataJob, pipeline: PipelineMetadata
+     ) -> Iterable[MetadataChangeProposalWrapper]:
+         dpi_urn = builder.make_data_process_instance_urn(
+             self._make_vertexai_pipeline_task_run_id(entity_id=task.name)
+         )
+         result_status: Union[str, RunResultTypeClass] = get_pipeline_task_result_status(
+             task.state
+         )
+
+         yield from MetadataChangeProposalWrapper.construct_many(
+             dpi_urn,
+             aspects=[
+                 DataProcessInstancePropertiesClass(
+                     name=task.name,
+                     created=AuditStampClass(
+                         time=(
+                             int(task.create_time.timestamp() * 1000)
+                             if task.create_time
+                             else 0
+                         ),
+                         actor="urn:li:corpuser:datahub",
+                     ),
+                     externalUrl=self._make_pipeline_external_url(pipeline.name),
+                     customProperties={},
+                 ),
+                 SubTypesClass(typeNames=[MLAssetSubTypes.VERTEX_PIPELINE_TASK_RUN]),
+                 ContainerClass(container=self._get_project_container().as_urn()),
+                 DataPlatformInstanceClass(platform=str(DataPlatformUrn(self.platform))),
+                 DataProcessInstanceRelationships(
+                     upstreamInstances=[], parentTemplate=str(datajob.urn)
+                 ),
+                 (
+                     DataProcessInstanceRunEventClass(
+                         status=DataProcessRunStatusClass.COMPLETE,
+                         timestampMillis=(
+                             int(task.create_time.timestamp() * 1000)
+                             if task.create_time
+                             else 0
+                         ),
+                         result=DataProcessInstanceRunResultClass(
+                             type=result_status,
+                             nativeResultType=self.platform,
+                         ),
+                         durationMillis=task.duration,
+                     )
+                     if is_status_for_run_event_class(result_status) and task.duration
+                     else None
+                 ),
+             ],
+         )
+
+     def _gen_pipeline_task_mcps(
+         self, pipeline: PipelineMetadata
+     ) -> Iterable[MetadataChangeProposalWrapper]:
+         dataflow_urn = pipeline.urn
+
+         for task in pipeline.tasks:
+             datajob = DataJob(
+                 id=self._make_vertexai_pipeline_task_id(task.name),
+                 flow_urn=dataflow_urn,
+                 name=task.name,
+                 properties={},
+                 owners={"urn:li:corpuser:datahub"},
+                 upstream_urns=task.upstreams if task.upstreams else [],
+                 url=self._make_pipeline_external_url(pipeline.name),
+             )
+             yield from MetadataChangeProposalWrapper.construct_many(
+                 entityUrn=str(datajob.urn),
+                 aspects=[
+                     ContainerClass(container=self._get_project_container().as_urn()),
+                     SubTypesClass(typeNames=[MLAssetSubTypes.VERTEX_PIPELINE_TASK]),
+                 ],
+             )
+             yield from datajob.generate_mcp()
+             yield from self._gen_pipeline_task_run_mcps(task, datajob, pipeline)
+
+     def _format_pipeline_duration(self, td: timedelta) -> str:
+         days = td.days
+         hours, remainder = divmod(td.seconds, 3600)
+         minutes, seconds = divmod(remainder, 60)
+         milliseconds = td.microseconds // 1000
+
+         parts = []
+         if days:
+             parts.append(f"{days}d")
+         if hours:
+             parts.append(f"{hours}h")
+         if minutes:
+             parts.append(f"{minutes}m")
+         if seconds:
+             parts.append(f"{seconds}s")
+         if milliseconds:
+             parts.append(f"{milliseconds}ms")
+         return " ".join(parts) if parts else "0s"
+
+     def _get_pipeline_task_properties(
+         self, task: PipelineTaskMetadata
+     ) -> Dict[str, str]:
+         return {
+             "created_time": (
+                 task.create_time.strftime("%Y-%m-%d %H:%M:%S")
+                 if task.create_time
+                 else ""
+             )
+         }
+
+     def _get_pipeline_properties(self, pipeline: PipelineMetadata) -> Dict[str, str]:
+         return {
+             "resource_name": pipeline.resource_name if pipeline.resource_name else "",
+             "create_time": (
+                 pipeline.create_time.isoformat() if pipeline.create_time else ""
+             ),
+             "update_time": (
+                 pipeline.update_time.isoformat() if pipeline.update_time else ""
+             ),
+             "duration": (
+                 self._format_pipeline_duration(pipeline.duration)
+                 if pipeline.duration
+                 else ""
+             ),
+             "location": (pipeline.region if pipeline.region else ""),
+             "labels": ",".join([f"{k}:{v}" for k, v in pipeline.labels.items()])
+             if pipeline.labels
+             else "",
+         }
+
+     def _get_pipeline_mcps(
+         self, pipeline: PipelineMetadata
+     ) -> Iterable[MetadataChangeProposalWrapper]:
+         dataflow = DataFlow(
+             orchestrator=self.platform,
+             id=self._make_vertexai_pipeline_id(pipeline.name),
+             env=self.config.env,
+             name=pipeline.name,
+             platform_instance=self.platform,
+             properties=self._get_pipeline_properties(pipeline),
+             owners={"urn:li:corpuser:datahub"},
+             url=self._make_pipeline_external_url(pipeline_name=pipeline.name),
+         )
+
+         yield from dataflow.generate_mcp()
+
+         yield from MetadataChangeProposalWrapper.construct_many(
+             entityUrn=str(dataflow.urn),
+             aspects=[
+                 ContainerClass(container=self._get_project_container().as_urn()),
+                 SubTypesClass(typeNames=[MLAssetSubTypes.VERTEX_PIPELINE]),
+             ],
+         )
+
+     def _get_experiments_workunits(self) -> Iterable[MetadataWorkUnit]:
+         # List all experiments
+         self.experiments = aiplatform.Experiment.list()
+
+         logger.info("Fetching experiments from VertexAI server")
+         for experiment in self.experiments:
+             yield from self._gen_experiment_workunits(experiment)
+
+     def _get_experiment_runs_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
+         if self.experiments is None:
+             self.experiments = aiplatform.Experiment.list()
+         for experiment in self.experiments:
+             logger.info(f"Fetching experiment runs for experiment {experiment.name}")
+             experiment_runs = aiplatform.ExperimentRun.list(experiment=experiment.name)
+             for run in experiment_runs:
+                 yield from self._gen_experiment_run_mcps(experiment, run)
+
+     def _gen_experiment_workunits(
+         self, experiment: Experiment
+     ) -> Iterable[MetadataWorkUnit]:
+         yield from gen_containers(
+             parent_container_key=self._get_project_container(),
+             container_key=ExperimentKey(
+                 platform=self.platform,
+                 id=self._make_vertexai_experiment_id(experiment.name),
+             ),
+             name=experiment.name,
+             sub_types=[MLAssetSubTypes.VERTEX_EXPERIMENT],
+             extra_properties={
+                 "name": experiment.name,
+                 "resourceName": experiment.resource_name,
+                 "dashboardURL": experiment.dashboard_url
+                 if experiment.dashboard_url
+                 else "",
+             },
+             external_url=self._make_experiment_external_url(experiment),
+         )
+
+     def _get_experiment_run_params(self, run: ExperimentRun) -> List[MLHyperParamClass]:
+         return [
+             MLHyperParamClass(name=k, value=str(v)) for k, v in run.get_params().items()
+         ]
+
+     def _get_experiment_run_metrics(self, run: ExperimentRun) -> List[MLMetricClass]:
+         return [
+             MLMetricClass(name=k, value=str(v)) for k, v in run.get_metrics().items()
+         ]
+
+     def _get_run_timestamps(
+         self, run: ExperimentRun
+     ) -> Tuple[Optional[int], Optional[int]]:
+         executions = run.get_executions()
+         if len(executions) == 1:
+             create_time = executions[0].create_time
+             update_time = executions[0].update_time
+             duration = update_time.timestamp() * 1000 - create_time.timestamp() * 1000
+             return int(create_time.timestamp() * 1000), int(duration)
+         # When no execution context has started, the start time and duration are not available.
+         # When multiple execution contexts have started on a run, there is no way to know
+         # which context to use for create_time and duration.
+         else:
+             return None, None
+
+     def _get_run_result_status(self, status: str) -> Union[str, RunResultTypeClass]:
+         if status == "COMPLETE":
+             return RunResultTypeClass.SUCCESS
+         elif status == "FAILED":
+             return RunResultTypeClass.FAILURE
+         elif status == "RUNNING":  # No corresponding RunResultTypeClass for RUNNING
+             return "RUNNING"
+         else:
+             return "UNKNOWN"
+
+     def _make_custom_properties_for_run(
+         self, experiment: Experiment, run: ExperimentRun
+     ) -> dict:
+         properties: Dict[str, str] = dict()
+         properties["externalUrl"] = self._make_experiment_run_external_url(
+             experiment, run
+         )
+         for execution in run.get_executions():
+             exec_name = execution.name
+             properties[f"created time ({exec_name})"] = str(execution.create_time)
+             properties[f"update time ({exec_name})"] = str(execution.update_time)
+         return properties
+
+     def _make_custom_properties_for_execution(self, execution: Execution) -> dict:
+         properties: Dict[str, Optional[str]] = dict()
+         for input_artifact in execution.get_input_artifacts():
+             properties[f"input artifact ({input_artifact.name})"] = input_artifact.uri
+         for output_artifact in execution.get_output_artifacts():
+             properties[f"output artifact ({output_artifact.name})"] = output_artifact.uri
+
+         return properties
+
+     def _gen_run_execution(
+         self, execution: Execution, run: ExperimentRun, exp: Experiment
+     ) -> Iterable[MetadataChangeProposalWrapper]:
+         create_time = execution.create_time
+         update_time = execution.update_time
+         duration = datetime_to_ts_millis(update_time) - datetime_to_ts_millis(
+             create_time
+         )
+         result_status: Union[str, RunResultTypeClass] = get_execution_result_status(
+             execution.state
+         )
+         execution_urn = builder.make_data_process_instance_urn(
+             self._make_vertexai_run_execution_name(execution.name)
+         )
+
+         yield from MetadataChangeProposalWrapper.construct_many(
+             entityUrn=str(execution_urn),
+             aspects=[
+                 DataProcessInstancePropertiesClass(
+                     name=execution.name,
+                     created=AuditStampClass(
+                         time=datetime_to_ts_millis(create_time),
+                         actor="urn:li:corpuser:datahub",
+                     ),
+                     externalUrl=self._make_artifact_external_url(
+                         experiment=exp, run=run
+                     ),
+                     customProperties=self._make_custom_properties_for_execution(
+                         execution
+                     ),
+                 ),
+                 DataPlatformInstanceClass(platform=str(DataPlatformUrn(self.platform))),
+                 SubTypesClass(typeNames=[MLAssetSubTypes.VERTEX_EXECUTION]),
+                 (
+                     DataProcessInstanceRunEventClass(
+                         status=DataProcessRunStatusClass.COMPLETE,
+                         timestampMillis=datetime_to_ts_millis(create_time),
+                         result=DataProcessInstanceRunResultClass(
+                             type=result_status,
+                             nativeResultType=self.platform,
+                         ),
+                         durationMillis=int(duration),
+                     )
+                     if is_status_for_run_event_class(result_status) and duration
+                     else None
+                 ),
+                 DataProcessInstanceRelationships(
+                     upstreamInstances=[self._make_experiment_run_urn(exp, run)],
+                     parentInstance=self._make_experiment_run_urn(exp, run),
+                 ),
+                 DataProcessInstanceInputClass(
+                     inputs=[],
+                     inputEdges=[
+                         EdgeClass(
+                             destinationUrn=self._make_experiment_run_urn(exp, run)
+                         ),
+                     ],
+                 ),
+             ],
+         )
+
+     def _gen_experiment_run_mcps(
+         self, experiment: Experiment, run: ExperimentRun
+     ) -> Iterable[MetadataChangeProposalWrapper]:
+         experiment_key = ExperimentKey(
+             platform=self.platform,
+             id=self._make_vertexai_experiment_id(experiment.name),
+         )
+         run_urn = self._make_experiment_run_urn(experiment, run)
+         created_time, duration = self._get_run_timestamps(run)
+         created_actor = "urn:li:corpuser:datahub"
+         run_result_type = self._get_run_result_status(run.get_state())
+
+         # generating mcps for run executions
+         for execution in run.get_executions():
+             yield from self._gen_run_execution(
+                 execution=execution, exp=experiment, run=run
+             )
+
+         # generating mcps for the run
+         yield from MetadataChangeProposalWrapper.construct_many(
+             entityUrn=run_urn,
+             aspects=[
+                 DataProcessInstancePropertiesClass(
+                     name=run.name,
+                     created=AuditStampClass(
+                         time=created_time if created_time else 0,
+                         actor=created_actor,
+                     ),
+                     externalUrl=self._make_experiment_run_external_url(experiment, run),
+                     customProperties=self._make_custom_properties_for_run(
+                         experiment, run
+                     ),
+                 ),
+                 ContainerClass(container=experiment_key.as_urn()),
+                 MLTrainingRunPropertiesClass(
+                     hyperParams=self._get_experiment_run_params(run),
+                     trainingMetrics=self._get_experiment_run_metrics(run),
+                     externalUrl=self._make_experiment_run_external_url(experiment, run),
+                     id=f"{experiment.name}-{run.name}",
+                 ),
+                 DataPlatformInstanceClass(platform=str(DataPlatformUrn(self.platform))),
+                 SubTypesClass(typeNames=[MLAssetSubTypes.VERTEX_EXPERIMENT_RUN]),
+                 (
+                     DataProcessInstanceRunEventClass(
+                         status=DataProcessRunStatusClass.COMPLETE,
+                         timestampMillis=created_time
+                         if created_time
+                         else 0,  # None is not allowed, 0 as default value
+                         result=DataProcessInstanceRunResultClass(
+                             type=run_result_type,
+                             nativeResultType=self.platform,
+                         ),
+                         durationMillis=duration if duration else None,
+                     )
+                     if is_status_for_run_event_class(run_result_type)
+                     else None
+                 ),
+             ],
+         )
+
661
+ def _gen_project_workunits(self) -> Iterable[MetadataWorkUnit]:
662
+ yield from gen_containers(
663
+ container_key=self._get_project_container(),
664
+ name=self.config.project_id,
665
+ sub_types=[MLAssetSubTypes.VERTEX_PROJECT],
666
+ )
667
+
668
+ def _get_ml_models_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
669
+ """
670
+ Fetch List of Models in Model Registry and generate a corresponding mcp.
671
+ """
672
+ registered_models = self.client.Model.list()
673
+ for model in registered_models:
674
+ # create mcp for Model Group (= Model in VertexAI)
675
+ yield from self._gen_ml_group_mcps(model)
676
+ model_versions = model.versioning_registry.list_versions()
677
+ for model_version in model_versions:
678
+ # create mcp for Model (= Model Version in VertexAI)
679
+ logger.info(
680
+ f"Ingesting a model (name: {model.display_name} id:{model.name})"
681
+ )
682
+ yield from self._get_ml_model_mcps(
683
+ model=model, model_version=model_version
684
+ )
685
+
+    def _get_ml_model_mcps(
+        self, model: Model, model_version: VersionInfo
+    ) -> Iterable[MetadataChangeProposalWrapper]:
+        model_meta: ModelMetadata = self._get_ml_model_metadata(model, model_version)
+        # Create the ML Model entity
+        yield from self._gen_ml_model_mcps(model_meta)
+        # Create the Endpoint entity
+        yield from self._gen_endpoints_mcps(model_meta)
+
+    def _get_ml_model_metadata(
+        self, model: Model, model_version: VersionInfo
+    ) -> ModelMetadata:
+        model_meta = ModelMetadata(model=model, model_version=model_version)
+        # Search for endpoints associated with the model
+        endpoints = self._search_endpoint(model)
+        model_meta.endpoints = endpoints
+        return model_meta
+
+    def _get_training_jobs_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
+        """
+        Fetch training jobs from Vertex AI and generate corresponding MCPs.
+        This method retrieves various types of training jobs from Vertex AI, including
+        CustomJob, CustomTrainingJob, CustomContainerTrainingJob, CustomPythonPackageTrainingJob,
+        AutoMLTabularTrainingJob, AutoMLTextTrainingJob, AutoMLImageTrainingJob, AutoMLVideoTrainingJob,
+        and AutoMLForecastingTrainingJob. For each job, it generates MCPs containing metadata
+        about the job, its inputs, and its outputs.
+        """
+        class_names = [
+            "CustomJob",
+            "CustomTrainingJob",
+            "CustomContainerTrainingJob",
+            "CustomPythonPackageTrainingJob",
+            "AutoMLTabularTrainingJob",
+            "AutoMLTextTrainingJob",
+            "AutoMLImageTrainingJob",
+            "AutoMLVideoTrainingJob",
+            "AutoMLForecastingTrainingJob",
+        ]
+        # Iterate over the class names and call each class's list() method
+        for class_name in class_names:
+            logger.info(f"Fetching a list of {class_name}s from the Vertex AI server")
+            for job in getattr(self.client, class_name).list():
+                yield from self._get_training_job_mcps(job)
+
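+    # Illustrative sketch (editorial, not in the source): the getattr-based
+    # dispatch above is equivalent to enumerating each job class explicitly:
+    #
+    #     for job in self.client.CustomJob.list():
+    #         yield from self._get_training_job_mcps(job)
+    #     for job in self.client.AutoMLTabularTrainingJob.list():
+    #         yield from self._get_training_job_mcps(job)
+    #     # ... and so on for the remaining classes
+    #
+    # Keeping the class names in a single list makes it easy to add or drop
+    # supported job types in one place.
+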
+    def _get_training_job_mcps(
+        self, job: VertexAiResourceNoun
+    ) -> Iterable[MetadataChangeProposalWrapper]:
+        job_meta: TrainingJobMetadata = self._get_training_job_metadata(job)
+        # Create a DataProcessInstance for the training job
+        yield from self._gen_training_job_mcps(job_meta)
+        # Create a Dataset entity for the input dataset of the training job
+        yield from self._get_input_dataset_mcps(job_meta)
+        # Create an ML Model entity for the output model of this training job
+        yield from self._gen_output_model_mcps(job_meta)
+
+    def _gen_output_model_mcps(
+        self, job_meta: TrainingJobMetadata
+    ) -> Iterable[MetadataChangeProposalWrapper]:
+        if job_meta.output_model and job_meta.output_model_version:
+            job = job_meta.job
+            job_urn = builder.make_data_process_instance_urn(
+                self._make_vertexai_job_name(entity_id=job.name)
+            )
+
+            yield from self._gen_ml_model_mcps(
+                ModelMetadata(
+                    model=job_meta.output_model,
+                    model_version=job_meta.output_model_version,
+                    training_job_urn=job_urn,
+                )
+            )
+
+    def _get_job_duration_millis(self, job: VertexAiResourceNoun) -> Optional[int]:
+        create_time = job.create_time
+        duration = None
+        if isinstance(job, _TrainingJob) and job.create_time and job.end_time:
+            end_time = job.end_time
+            duration = datetime_to_ts_millis(end_time) - datetime_to_ts_millis(
+                create_time
+            )
+
+        return int(duration) if duration else None
+
+    def _gen_training_job_mcps(
+        self, job_meta: TrainingJobMetadata
+    ) -> Iterable[MetadataChangeProposalWrapper]:
+        """
+        Generate MCPs for a Vertex AI training job.
+        """
+        job = job_meta.job
+        job_id = self._make_vertexai_job_name(entity_id=job.name)
+        job_urn = builder.make_data_process_instance_urn(job_id)
+
+        created_time = datetime_to_ts_millis(job.create_time) if job.create_time else 0
+        duration = self._get_job_duration_millis(job)
+
+        # If the training job has an input dataset
+        dataset_urn = (
+            builder.make_dataset_urn(
+                platform=self.platform,
+                name=self._make_vertexai_dataset_name(
+                    entity_id=job_meta.input_dataset.name
+                ),
+                env=self.config.env,
+            )
+            if job_meta.input_dataset
+            else None
+        )
+        # If the training job has an output model
+        model_urn = (
+            self._make_ml_model_urn(
+                model_version=job_meta.output_model_version,
+                model_name=self._make_vertexai_model_name(
+                    entity_id=job_meta.output_model.name
+                ),
+            )
+            if job_meta.output_model and job_meta.output_model_version
+            else None
+        )
+
+        result_type = get_job_result_status(job)
+
+        yield from MetadataChangeProposalWrapper.construct_many(
+            job_urn,
+            aspects=[
+                DataProcessInstancePropertiesClass(
+                    name=job.display_name,
+                    created=AuditStampClass(
+                        time=created_time,
+                        actor="urn:li:corpuser:datahub",
+                    ),
+                    externalUrl=self._make_job_external_url(job),
+                    customProperties={
+                        "jobType": job.__class__.__name__,
+                    },
+                ),
+                MLTrainingRunProperties(
+                    externalUrl=self._make_job_external_url(job), id=job.name
+                ),
+                SubTypesClass(typeNames=[MLAssetSubTypes.VERTEX_TRAINING_JOB]),
+                ContainerClass(container=self._get_project_container().as_urn()),
+                DataPlatformInstanceClass(platform=str(DataPlatformUrn(self.platform))),
+                (
+                    DataProcessInstanceInputClass(
+                        inputs=[],
+                        inputEdges=[
+                            EdgeClass(destinationUrn=dataset_urn),
+                        ],
+                    )
+                    if dataset_urn
+                    else None
+                ),
+                (
+                    DataProcessInstanceOutputClass(
+                        outputs=[],
+                        outputEdges=[
+                            EdgeClass(destinationUrn=model_urn),
+                        ],
+                    )
+                    if model_urn
+                    else None
+                ),
+                (
+                    DataProcessInstanceRunEventClass(
+                        status=DataProcessRunStatusClass.COMPLETE,
+                        timestampMillis=created_time,
+                        result=DataProcessInstanceRunResultClass(
+                            type=result_type,
+                            nativeResultType=self.platform,
+                        ),
+                        durationMillis=duration,
+                    )
+                    if is_status_for_run_event_class(result_type) and duration
+                    else None
+                ),
+            ],
+        )
+
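+    # Note (editorial): MetadataChangeProposalWrapper.construct_many emits one
+    # MCP per non-None aspect, so the conditional expressions above that
+    # evaluate to None (e.g. when a job has no input dataset or no output
+    # model) are simply skipped rather than producing empty aspects.
+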
+    def _gen_ml_group_mcps(
+        self,
+        model: Model,
+    ) -> Iterable[MetadataChangeProposalWrapper]:
+        """
+        Generate MLModelGroup MCPs for a Vertex AI Model.
+        """
+        ml_model_group_urn = self._make_ml_model_group_urn(model)
+
+        yield from MetadataChangeProposalWrapper.construct_many(
+            ml_model_group_urn,
+            aspects=[
+                MLModelGroupPropertiesClass(
+                    name=model.display_name,
+                    description=model.description,
+                    created=(
+                        TimeStampClass(
+                            time=datetime_to_ts_millis(model.create_time),
+                            actor="urn:li:corpuser:datahub",
+                        )
+                        if model.create_time
+                        else None
+                    ),
+                    lastModified=(
+                        TimeStampClass(
+                            time=datetime_to_ts_millis(model.update_time),
+                            actor="urn:li:corpuser:datahub",
+                        )
+                        if model.update_time
+                        else None
+                    ),
+                    customProperties=None,
+                    externalUrl=self._make_model_external_url(model),
+                ),
+                SubTypesClass(typeNames=[MLAssetSubTypes.VERTEX_MODEL_GROUP]),
+                ContainerClass(container=self._get_project_container().as_urn()),
+                DataPlatformInstanceClass(platform=str(DataPlatformUrn(self.platform))),
+            ],
+        )
+
+    def _make_ml_model_group_urn(self, model: Model) -> str:
+        urn = builder.make_ml_model_group_urn(
+            platform=self.platform,
+            group_name=self._make_vertexai_model_group_name(model.name),
+            env=self.config.env,
+        )
+        return urn
+
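+    # Illustrative example (editorial, hypothetical values): for project
+    # "my-project", platform "vertexai", env "PROD", and a model whose resource
+    # ID is "1234567890", the helper above would produce a URN of the form:
+    #
+    #     urn:li:mlModelGroup:(urn:li:dataPlatform:vertexai,my-project.model_group.1234567890,PROD)
+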
+    def _get_project_container(self) -> ProjectIdKey:
+        return ProjectIdKey(project_id=self.config.project_id, platform=self.platform)
+
+    def _is_automl_job(self, job: VertexAiResourceNoun) -> bool:
+        return isinstance(
+            job,
+            (
+                AutoMLTabularTrainingJob,
+                AutoMLTextTrainingJob,
+                AutoMLImageTrainingJob,
+                AutoMLVideoTrainingJob,
+                AutoMLForecastingTrainingJob,
+            ),
+        )
+
+    def _search_model_version(
+        self, model: Model, version_id: str
+    ) -> Optional[VersionInfo]:
+        for version in model.versioning_registry.list_versions():
+            if version.version_id == version_id:
+                return version
+        return None
+
+    def _search_dataset(self, dataset_id: str) -> Optional[VertexAiResourceNoun]:
+        """
+        Search for a dataset by its ID in Vertex AI.
+        This method iterates through the different dataset types (Text, Tabular,
+        Image, TimeSeries, and Video) to find a dataset matching the given ID,
+        caching the full listing on first use.
+        """
+
+        dataset_types = [
+            "TextDataset",
+            "TabularDataset",
+            "ImageDataset",
+            "TimeSeriesDataset",
+            "VideoDataset",
+        ]
+
+        if self.datasets is None:
+            self.datasets = {}
+
+            for dtype in dataset_types:
+                dataset_class = getattr(self.client.datasets, dtype)
+                for ds in dataset_class.list():
+                    self.datasets[ds.name] = ds
+
+        return self.datasets.get(dataset_id)
+
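+    # Note (editorial): the dataset cache above is populated lazily; the first
+    # lookup lists every dataset of every type once, and later lookups are
+    # dictionary hits keyed by the dataset's resource ID (ds.name). A sketch of
+    # the resulting behavior (hypothetical IDs):
+    #
+    #     self._search_dataset("111")  # lists all datasets, returns dataset 111
+    #     self._search_dataset("222")  # served from self.datasets, no API calls
+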
+    def _get_input_dataset_mcps(
+        self, job_meta: TrainingJobMetadata
+    ) -> Iterable[MetadataChangeProposalWrapper]:
+        """
+        Generate MCPs for the input dataset of a training job.
+        """
+        ds = job_meta.input_dataset
+
+        if ds:
+            # Create the URN of the input dataset for the training job
+            dataset_name = self._make_vertexai_dataset_name(entity_id=ds.name)
+            dataset_urn = builder.make_dataset_urn(
+                platform=self.platform,
+                name=dataset_name,
+                env=self.config.env,
+            )
+
+            yield from MetadataChangeProposalWrapper.construct_many(
+                dataset_urn,
+                aspects=[
+                    DatasetPropertiesClass(
+                        name=ds.display_name,
+                        created=(
+                            TimeStampClass(time=datetime_to_ts_millis(ds.create_time))
+                            if ds.create_time
+                            else None
+                        ),
+                        description=ds.display_name,
+                        customProperties={
+                            "resourceName": ds.resource_name,
+                        },
+                        qualifiedName=ds.resource_name,
+                    ),
+                    SubTypesClass(typeNames=[MLAssetSubTypes.VERTEX_DATASET]),
+                    ContainerClass(container=self._get_project_container().as_urn()),
+                    DataPlatformInstanceClass(
+                        platform=str(DataPlatformUrn(self.platform))
+                    ),
+                ],
+            )
+
+    def _get_training_job_metadata(
+        self, job: VertexAiResourceNoun
+    ) -> TrainingJobMetadata:
+        """
+        Retrieve metadata for a given Vertex AI training job.
+        This method extracts metadata for a Vertex AI training job, including the
+        input dataset and the output model. It checks whether the job is an AutoML
+        job and, if so, retrieves the relevant input dataset and output model
+        information from the job configuration.
+        """
+        job_meta = TrainingJobMetadata(job=job)
+        # Check if the job is an AutoML job
+        if self._is_automl_job(job):
+            job_conf = job.to_dict()
+            # Check if an input dataset is present in the job configuration
+            if (
+                "inputDataConfig" in job_conf
+                and "datasetId" in job_conf["inputDataConfig"]
+            ):
+                # Create the URN of the input dataset for the training job
+                dataset_id = job_conf["inputDataConfig"]["datasetId"]
+                logger.info(
+                    f"Found input dataset (id: {dataset_id}) for training job ({job.display_name})"
+                )
+
+                if dataset_id:
+                    input_ds = self._search_dataset(dataset_id)
+                    if input_ds:
+                        logger.info(
+                            f"Found input dataset ({input_ds.display_name}) for dataset id ({dataset_id})"
+                        )
+                        job_meta.input_dataset = input_ds
+
+            # Check if an output model is present in the job configuration
+            if (
+                "modelToUpload" in job_conf
+                and "name" in job_conf["modelToUpload"]
+                and job_conf["modelToUpload"]["name"]
+                and job_conf["modelToUpload"]["versionId"]
+            ):
+                model_name = job_conf["modelToUpload"]["name"]
+                model_version_str = job_conf["modelToUpload"]["versionId"]
+                try:
+                    model = Model(model_name=model_name)
+                    model_version = self._search_model_version(model, model_version_str)
+                    if model and model_version:
+                        logger.info(
+                            f"Found output model (name: {model.display_name} id: {model_version_str}) "
+                            f"for training job: {job.display_name}"
+                        )
+                        job_meta.output_model = model
+                        job_meta.output_model_version = model_version
+                except GoogleAPICallError as e:
+                    self.report.report_failure(
+                        title="Unable to fetch model and model version",
+                        message="Encountered an error while fetching the output model and model version that the training job produces",
+                        exc=e,
+                    )
+
+        return job_meta
+
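+    # Illustrative shape (editorial): the job configuration consulted above is
+    # the training pipeline's dict form; only the keys this method reads are
+    # shown, with hypothetical values:
+    #
+    #     {
+    #         "inputDataConfig": {"datasetId": "1111111111"},
+    #         "modelToUpload": {
+    #             "name": "projects/.../models/2222222222",
+    #             "versionId": "1",
+    #         },
+    #     }
+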
+    def _gen_endpoints_mcps(
+        self, model_meta: ModelMetadata
+    ) -> Iterable[MetadataChangeProposalWrapper]:
+        model: Model = model_meta.model
+        model_version: VersionInfo = model_meta.model_version
+
+        if model_meta.endpoints:
+            for endpoint in model_meta.endpoints:
+                endpoint_urn = builder.make_ml_model_deployment_urn(
+                    platform=self.platform,
+                    deployment_name=self._make_vertexai_endpoint_name(
+                        entity_id=endpoint.name
+                    ),
+                    env=self.config.env,
+                )
+
+                yield from MetadataChangeProposalWrapper.construct_many(
+                    entityUrn=endpoint_urn,
+                    aspects=[
+                        MLModelDeploymentPropertiesClass(
+                            description=model.description,
+                            createdAt=datetime_to_ts_millis(endpoint.create_time),
+                            version=VersionTagClass(
+                                versionTag=str(model_version.version_id)
+                            ),
+                            customProperties={"displayName": endpoint.display_name},
+                        ),
+                        ContainerClass(
+                            container=self._get_project_container().as_urn()
+                        ),
+                        # TODO: add a subtype once the MLModelDeployment metadata model supports it
+                        # SubTypesClass(typeNames=[MLTypes.ENDPOINT])
+                    ],
+                )
+
+    def _gen_ml_model_mcps(
+        self, model_meta: ModelMetadata
+    ) -> Iterable[MetadataChangeProposalWrapper]:
+        """
+        Generate MLModel MCPs for a Vertex AI model version.
+        """
+
+        model: Model = model_meta.model
+        model_version: VersionInfo = model_meta.model_version
+        training_job_urn: Optional[str] = model_meta.training_job_urn
+        endpoints: Optional[List[Endpoint]] = model_meta.endpoints
+        endpoint_urns: List[str] = []
+
+        logger.info(f"Generating model MCPs for {model.name}")
+
+        # Generate the list of endpoint URNs
+        if endpoints:
+            for endpoint in endpoints:
+                logger.info(
+                    f"found endpoint ({endpoint.display_name}) for model ({model.resource_name})"
+                )
+                endpoint_urns.append(
+                    builder.make_ml_model_deployment_urn(
+                        platform=self.platform,
+                        deployment_name=self._make_vertexai_endpoint_name(
+                            entity_id=endpoint.display_name
+                        ),
+                        env=self.config.env,
+                    )
+                )
+
+        # Create URNs for the Model Group and Model Version
+        model_group_urn = self._make_ml_model_group_urn(model)
+        model_name = self._make_vertexai_model_name(entity_id=model.name)
+        model_urn = self._make_ml_model_urn(model_version, model_name=model_name)
+
+        yield from MetadataChangeProposalWrapper.construct_many(
+            entityUrn=model_urn,
+            aspects=[
+                MLModelPropertiesClass(
+                    name=f"{model.display_name}_{model_version.version_id}",
+                    description=model_version.version_description,
+                    customProperties={
+                        "versionId": f"{model_version.version_id}",
+                        "resourceName": model.resource_name,
+                    },
+                    created=(
+                        TimeStampClass(
+                            time=datetime_to_ts_millis(
+                                model_version.version_create_time
+                            ),
+                            actor="urn:li:corpuser:datahub",
+                        )
+                        if model_version.version_create_time
+                        else None
+                    ),
+                    lastModified=(
+                        TimeStampClass(
+                            time=datetime_to_ts_millis(
+                                model_version.version_update_time
+                            ),
+                            actor="urn:li:corpuser:datahub",
+                        )
+                        if model_version.version_update_time
+                        else None
+                    ),
+                    version=VersionTagClass(versionTag=str(model_version.version_id)),
+                    groups=[model_group_urn],  # link model version to model group
+                    trainingJobs=(
+                        [training_job_urn] if training_job_urn else None
+                    ),  # link to training job
+                    deployments=endpoint_urns,
+                    externalUrl=self._make_model_version_external_url(model),
+                    type="ML Model",
+                ),
+                ContainerClass(
+                    container=self._get_project_container().as_urn(),
+                ),
+                SubTypesClass(typeNames=[MLAssetSubTypes.VERTEX_MODEL]),
+                DataPlatformInstanceClass(platform=str(DataPlatformUrn(self.platform))),
+                VersionPropertiesClass(
+                    version=VersionTagClass(
+                        versionTag=str(model_version.version_id),
+                        metadataAttribution=(
+                            MetadataAttributionClass(
+                                time=int(
+                                    model_version.version_create_time.timestamp() * 1000
+                                ),
+                                actor="urn:li:corpuser:datahub",
+                            )
+                            if model_version.version_create_time
+                            else None
+                        ),
+                    ),
+                    versionSet=str(self._get_version_set_urn(model)),
+                    sortId=str(model_version.version_id).zfill(10),
+                    aliases=None,
+                ),
+            ],
+        )
+
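+    # Note (editorial): sortId zero-pads the numeric version ID so that
+    # lexicographic ordering matches numeric ordering, e.g. "2" -> "0000000002"
+    # sorts before "10" -> "0000000010".
+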
+    def _get_version_set_urn(self, model: Model) -> VersionSetUrn:
+        guid_dict = {"platform": self.platform, "name": model.name}
+        version_set_urn = VersionSetUrn(
+            id=builder.datahub_guid(guid_dict),
+            entity_type=MlModelUrn.ENTITY_TYPE,
+        )
+        return version_set_urn
+
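+    # Note (editorial): builder.datahub_guid derives a deterministic GUID from
+    # the dict contents, so every ingestion run (and every version of the same
+    # model) resolves to the same version-set URN without any server-side
+    # lookup.
+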
+    def _search_endpoint(self, model: Model) -> List[Endpoint]:
+        """
+        Search for the endpoints associated with the model.
+        """
+        if self.endpoints is None:
+            endpoint_dict: Dict[str, List[Endpoint]] = {}
+            for endpoint in self.client.Endpoint.list():
+                for resource in endpoint.list_models():
+                    if resource.model not in endpoint_dict:
+                        endpoint_dict[resource.model] = []
+                    endpoint_dict[resource.model].append(endpoint)
+            self.endpoints = endpoint_dict
+
+        return self.endpoints.get(model.resource_name, [])
+
+    def _make_experiment_run_urn(
+        self, experiment: Experiment, run: ExperimentRun
+    ) -> str:
+        return builder.make_data_process_instance_urn(
+            self._make_vertexai_experiment_run_name(
+                entity_id=f"{experiment.name}-{run.name}"
+            )
+        )
+
+    def _make_ml_model_urn(self, model_version: VersionInfo, model_name: str) -> str:
+        urn = builder.make_ml_model_urn(
+            platform=self.platform,
+            model_name=f"{model_name}_{model_version.version_id}",
+            env=self.config.env,
+        )
+        return urn
+
+    def _make_training_job_urn(self, job: VertexAiResourceNoun) -> str:
+        job_id = self._make_vertexai_job_name(entity_id=job.name)
+        urn = builder.make_data_process_instance_urn(dataProcessInstanceId=job_id)
+        return urn
+
+    def _make_vertexai_model_group_name(
+        self,
+        entity_id: str,
+    ) -> str:
+        return f"{self.config.project_id}.model_group.{entity_id}"
+
+    def _make_vertexai_endpoint_name(self, entity_id: str) -> str:
+        return f"{self.config.project_id}.endpoint.{entity_id}"
+
+    def _make_vertexai_model_name(self, entity_id: str) -> str:
+        return f"{self.config.project_id}.model.{entity_id}"
+
+    def _make_vertexai_dataset_name(self, entity_id: str) -> str:
+        return f"{self.config.project_id}.dataset.{entity_id}"
+
+    def _make_vertexai_job_name(
+        self,
+        entity_id: Optional[str],
+    ) -> str:
+        return f"{self.config.project_id}.job.{entity_id}"
+
+    def _make_vertexai_experiment_id(self, entity_id: Optional[str]) -> str:
+        return f"{self.config.project_id}.experiment.{entity_id}"
+
+    def _make_vertexai_experiment_run_name(self, entity_id: Optional[str]) -> str:
+        return f"{self.config.project_id}.experiment_run.{entity_id}"
+
+    def _make_vertexai_run_execution_name(self, entity_id: Optional[str]) -> str:
+        return f"{self.config.project_id}.execution.{entity_id}"
+
+    def _make_vertexai_pipeline_id(self, entity_id: Optional[str]) -> str:
+        return f"{self.config.project_id}.pipeline.{entity_id}"
+
+    def _make_vertexai_pipeline_task_id(self, entity_id: Optional[str]) -> str:
+        return f"{self.config.project_id}.pipeline_task.{entity_id}"
+
+    def _make_vertexai_pipeline_task_run_id(self, entity_id: Optional[str]) -> str:
+        return f"{self.config.project_id}.pipeline_task_run.{entity_id}"
+
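+    # Illustrative examples (editorial, hypothetical project "my-project" and
+    # entity ID "123"): every helper above follows the same
+    # "<project_id>.<entity_type>.<entity_id>" scheme, e.g.:
+    #
+    #     _make_vertexai_model_name("123")  -> "my-project.model.123"
+    #     _make_vertexai_job_name("123")    -> "my-project.job.123"
+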
+    def _make_artifact_external_url(
+        self, experiment: Experiment, run: ExperimentRun
+    ) -> str:
+        """
+        Artifact external URL in Vertex AI
+        Sample URL:
+        https://console.cloud.google.com/vertex-ai/experiments/locations/us-west2/experiments/test-experiment-job-metadata/runs/test-experiment-job-metadata-run-3/artifacts?project=acryl-poc
+        """
+        external_url: str = (
+            f"{self.config.vertexai_url}/experiments/locations/{self.config.region}/experiments/{experiment.name}/runs/{experiment.name}-{run.name}/artifacts"
+            f"?project={self.config.project_id}"
+        )
+        return external_url
+
+    def _make_job_external_url(self, job: VertexAiResourceNoun) -> str:
+        """
+        Training job external URL in Vertex AI
+        Sample URL:
+        https://console.cloud.google.com/vertex-ai/training/training-pipelines?project=acryl-poc&trainingPipelineId=5401695018589093888
+        """
+        external_url: str = (
+            f"{self.config.vertexai_url}/training/training-pipelines?trainingPipelineId={job.name}"
+            f"&project={self.config.project_id}"
+        )
+        return external_url
+
+    def _make_model_external_url(self, model: Model) -> str:
+        """
+        Model external URL in Vertex AI
+        Sample URL:
+        https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/812468724182286336?project=acryl-poc
+        """
+        external_url: str = (
+            f"{self.config.vertexai_url}/models/locations/{self.config.region}/models/{model.name}"
+            f"?project={self.config.project_id}"
+        )
+        return external_url
+
+    def _make_model_version_external_url(self, model: Model) -> str:
+        """
+        Model version external URL in Vertex AI
+        Sample URL:
+        https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/812468724182286336/versions/1?project=acryl-poc
+        """
+        external_url: str = (
+            f"{self.config.vertexai_url}/models/locations/{self.config.region}/models/{model.name}"
+            f"/versions/{model.version_id}"
+            f"?project={self.config.project_id}"
+        )
+        return external_url
+
+    def _make_experiment_external_url(self, experiment: Experiment) -> str:
+        """
+        Experiment external URL in Vertex AI
+        Sample URL:
+        https://console.cloud.google.com/vertex-ai/experiments/locations/us-west2/experiments/experiment-run-with-automljob-1/runs?project=acryl-poc
+        """
+
+        external_url: str = (
+            f"{self.config.vertexai_url}/experiments/locations/{self.config.region}/experiments/{experiment.name}"
+            f"/runs?project={self.config.project_id}"
+        )
+        return external_url
+
+    def _make_experiment_run_external_url(
+        self, experiment: Experiment, run: ExperimentRun
+    ) -> str:
+        """
+        Experiment run external URL in Vertex AI
+        Sample URL:
+        https://console.cloud.google.com/vertex-ai/experiments/locations/us-west2/experiments/experiment-run-with-automljob-1/runs/experiment-run-with-automljob-1-automl-job-with-run-1/charts?project=acryl-poc
+        """
+
+        external_url: str = (
+            f"{self.config.vertexai_url}/experiments/locations/{self.config.region}/experiments/{experiment.name}"
+            f"/runs/{experiment.name}-{run.name}/charts?project={self.config.project_id}"
+        )
+        return external_url
+
+    def _make_pipeline_external_url(self, pipeline_name: str) -> str:
+        """
+        Pipeline run external URL in Vertex AI
+        Sample URL:
+        https://console.cloud.google.com/vertex-ai/pipelines/locations/us-west2/runs/pipeline-example-more-tasks-3-20250320210739?project=acryl-poc
+        """
+        external_url: str = (
+            f"{self.config.vertexai_url}/pipelines/locations/{self.config.region}/runs/{pipeline_name}"
+            f"?project={self.config.project_id}"
+        )
+        return external_url