acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -1,697 +0,0 @@
1
- import dataclasses
2
- import logging
3
- from datetime import datetime
4
- from typing import Any, Dict, Iterable, List, Optional, TypeVar
5
-
6
- from google.api_core.exceptions import GoogleAPICallError
7
- from google.cloud import aiplatform
8
- from google.cloud.aiplatform import (
9
- AutoMLForecastingTrainingJob,
10
- AutoMLImageTrainingJob,
11
- AutoMLTabularTrainingJob,
12
- AutoMLTextTrainingJob,
13
- AutoMLVideoTrainingJob,
14
- Endpoint,
15
- )
16
- from google.cloud.aiplatform.base import VertexAiResourceNoun
17
- from google.cloud.aiplatform.models import Model, VersionInfo
18
- from google.oauth2 import service_account
19
- from pydantic import PrivateAttr
20
- from pydantic.fields import Field
21
-
22
- import datahub.emitter.mce_builder as builder
23
- from datahub.configuration.source_common import EnvConfigMixin
24
- from datahub.emitter.mcp import MetadataChangeProposalWrapper
25
- from datahub.emitter.mcp_builder import ProjectIdKey, gen_containers
26
- from datahub.ingestion.api.common import PipelineContext
27
- from datahub.ingestion.api.decorators import (
28
- SupportStatus,
29
- capability,
30
- config_class,
31
- platform_name,
32
- support_status,
33
- )
34
- from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
35
- from datahub.ingestion.api.source_helpers import auto_workunit
36
- from datahub.ingestion.api.workunit import MetadataWorkUnit
37
- from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
38
- from datahub.metadata.com.linkedin.pegasus2avro.ml.metadata import (
39
- MLTrainingRunProperties,
40
- )
41
- from datahub.metadata.schema_classes import (
42
- AuditStampClass,
43
- ContainerClass,
44
- DataProcessInstanceInputClass,
45
- DataProcessInstancePropertiesClass,
46
- DatasetPropertiesClass,
47
- MLModelDeploymentPropertiesClass,
48
- MLModelGroupPropertiesClass,
49
- MLModelPropertiesClass,
50
- SubTypesClass,
51
- TimeStampClass,
52
- VersionTagClass,
53
- )
54
- from datahub.utilities.str_enum import StrEnum
55
- from datahub.utilities.time import datetime_to_ts_millis
56
-
57
- T = TypeVar("T")
58
-
59
- logger = logging.getLogger(__name__)
60
-
61
-
62
class VertexAIConfig(EnvConfigMixin):
    """Settings accepted by the Vertex AI ingestion source.

    When ``credential`` is provided, construction additionally calls
    ``credential.create_credential_temp_file(project_id=...)`` and keeps the
    returned value in the private ``_credentials_path`` attribute.
    """

    credential: Optional[GCPCredential] = Field(
        default=None,
        description="GCP credential information",
    )
    project_id: str = Field(description="Project ID in Google Cloud Platform")
    region: str = Field(description="Region of your project in Google Cloud Platform")
    bucket_uri: Optional[str] = Field(
        default=None,
        description="Bucket URI used in your project",
    )
    vertexai_url: Optional[str] = Field(
        default="https://console.cloud.google.com/vertex-ai",
        description="VertexUI URI",
    )

    # Populated by __init__ only when `credential` is set; otherwise stays None.
    _credentials_path: Optional[str] = PrivateAttr(None)

    def __init__(self, **data: Any):
        """Validate ``data`` via the parent initializer, then resolve the credential path."""
        super().__init__(**data)
        if not self.credential:
            return
        self._credentials_path = self.credential.create_credential_temp_file(
            project_id=self.project_id
        )
87
-
88
-
89
class MLTypes(StrEnum):
    """Sub-type labels used by the Vertex AI source when emitting entities."""

    TRAINING_JOB = "Training Job"
    MODEL = "ML Model"
    MODEL_GROUP = "ML Model Group"
    ENDPOINT = "Endpoint"
    DATASET = "Dataset"
    PROJECT = "Project"
96
-
97
-
98
@dataclasses.dataclass
class TrainingJobMetadata:
    """A training job bundled with the optional resources resolved for it."""

    # The training job resource itself.
    job: VertexAiResourceNoun
    # Dataset the job reads from, when one could be resolved.
    input_dataset: Optional[VertexAiResourceNoun] = None
    # Model (and the specific version) the job produced, when resolved.
    output_model: Optional[Model] = None
    output_model_version: Optional[VersionInfo] = None
104
-
105
-
106
@dataclasses.dataclass
class ModelMetadata:
    """A model version bundled with its optional training job and endpoints."""

    # The registered model and the version being described.
    model: Model
    model_version: VersionInfo
    # URN of the training job that produced this version, if known.
    training_job_urn: Optional[str] = None
    # Endpoints associated with the model, if any were found.
    endpoints: Optional[List[Endpoint]] = None
112
-
113
-
114
@platform_name("Vertex AI", id="vertexai")
@config_class(VertexAIConfig)
@support_status(SupportStatus.TESTING)
@capability(
    SourceCapability.DESCRIPTIONS,
    "Extract descriptions for Vertex AI Registered Models and Model Versions",
)
@capability(SourceCapability.TAGS, "Extract tags for Vertex AI Registered Model Stages")
class VertexAISource(Source):
    """Ingestion source that emits metadata for Vertex AI resources.

    It emits a project container, model groups / model versions (with the
    endpoints they are deployed to) and training jobs (with their input
    dataset and output model when those can be resolved).
    """

    platform: str = "vertexai"

    def __init__(self, ctx: PipelineContext, config: VertexAIConfig):
        """Initialise the aiplatform SDK for the configured project and region."""
        super().__init__(ctx)
        self.config = config
        self.report = SourceReport()

        credentials = (
            service_account.Credentials.from_service_account_file(
                self.config._credentials_path
            )
            if self.config.credential
            else None
        )

        aiplatform.init(
            project=config.project_id, location=config.region, credentials=credentials
        )
        self.client = aiplatform
        # Lazily built caches; None means "not fetched yet".
        # endpoints: keyed by the `model` value of each deployed-model entry.
        # datasets: keyed by dataset `name`.
        self.endpoints: Optional[Dict[str, List[Endpoint]]] = None
        self.datasets: Optional[Dict[str, VertexAiResourceNoun]] = None

    def get_report(self) -> SourceReport:
        """Return the report object collecting failures for this run."""
        return self.report

    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
        """
        Main Function to fetch and yields mcps for various VertexAI resources.
        - Project container
        - Models and Model Versions from the Model Registry
        - Training Jobs
        """
        # Ingest Project
        yield from self._gen_project_workunits()
        # Fetch and Ingest Models, Model Versions from Model Registry
        yield from auto_workunit(self._get_ml_models_mcps())
        # Fetch and Ingest Training Jobs
        yield from auto_workunit(self._get_training_jobs_mcps())
        # TODO Fetch Experiments and Experiment Runs

    def _gen_project_workunits(self) -> Iterable[MetadataWorkUnit]:
        """Yield the container work units for the configured project."""
        yield from gen_containers(
            container_key=self._get_project_container(),
            name=self.config.project_id,
            sub_types=[MLTypes.PROJECT],
        )

    def _get_ml_models_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
        """
        Fetch List of Models in Model Registry and generate a corresponding mcp.
        """
        for model in self.client.Model.list():
            # create mcp for Model Group (= Model in VertexAI)
            yield from self._gen_ml_group_mcps(model)
            for model_version in model.versioning_registry.list_versions():
                # create mcp for Model (= Model Version in VertexAI)
                logger.info(
                    f"Ingesting a model (name: {model.display_name} id:{model.name})"
                )
                yield from self._get_ml_model_mcps(
                    model=model, model_version=model_version
                )

    def _get_ml_model_mcps(
        self, model: Model, model_version: VersionInfo
    ) -> Iterable[MetadataChangeProposalWrapper]:
        """Yield the ML Model mcps and the Endpoint mcps for one model version."""
        model_meta: ModelMetadata = self._get_ml_model_metadata(model, model_version)
        # Create ML Model Entity
        yield from self._gen_ml_model_mcps(model_meta)
        # Create Endpoint Entity
        yield from self._gen_endpoints_mcps(model_meta)

    def _get_ml_model_metadata(
        self, model: Model, model_version: VersionInfo
    ) -> ModelMetadata:
        """Build a ModelMetadata carrying the endpoints found for ``model``."""
        return ModelMetadata(
            model=model,
            model_version=model_version,
            endpoints=self._search_endpoint(model),
        )

    def _get_training_jobs_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
        """
        Fetches training jobs from Vertex AI and generates corresponding mcps.

        For every class name listed below, ``list()`` is called on the matching
        attribute of the aiplatform client and each returned job is turned into
        mcps describing the job, its input dataset and its output model.
        """
        class_names = (
            "CustomJob",
            "CustomTrainingJob",
            "CustomContainerTrainingJob",
            "CustomPythonPackageTrainingJob",
            "AutoMLTabularTrainingJob",
            "AutoMLTextTrainingJob",
            "AutoMLImageTrainingJob",
            "AutoMLVideoTrainingJob",
            "AutoMLForecastingTrainingJob",
        )
        for class_name in class_names:
            logger.info(f"Fetching a list of {class_name}s from VertexAI server")
            for job in getattr(self.client, class_name).list():
                yield from self._get_training_job_mcps(job)

    def _get_training_job_mcps(
        self, job: VertexAiResourceNoun
    ) -> Iterable[MetadataChangeProposalWrapper]:
        """Yield every mcp derived from a single training job."""
        job_meta: TrainingJobMetadata = self._get_training_job_metadata(job)
        # Create DataProcessInstance for the training job
        yield from self._gen_training_job_mcps(job_meta)
        # Create Dataset entity for Input Dataset of Training job
        yield from self._get_input_dataset_mcps(job_meta)
        # Create ML Model entity for output ML model of this training job
        yield from self._gen_output_model_mcps(job_meta)

    def _gen_output_model_mcps(
        self, job_meta: TrainingJobMetadata
    ) -> Iterable[MetadataChangeProposalWrapper]:
        """Yield ML Model mcps for the job's output model, linked back to the job.

        Nothing is yielded unless both the output model and its version were
        resolved.
        """
        if not (job_meta.output_model and job_meta.output_model_version):
            return

        yield from self._gen_ml_model_mcps(
            ModelMetadata(
                model=job_meta.output_model,
                model_version=job_meta.output_model_version,
                training_job_urn=self._make_training_job_urn(job_meta.job),
            )
        )

    def _gen_training_job_mcps(
        self, job_meta: TrainingJobMetadata
    ) -> Iterable[MetadataChangeProposalWrapper]:
        """
        Generate a mcp for VertexAI Training Job
        """
        job = job_meta.job
        job_id = self._make_vertexai_job_name(entity_id=job.name)
        job_urn = self._make_training_job_urn(job)
        external_url = self._make_job_external_url(job)

        # Fall back to "now" when the job carries no creation time.
        created_time = datetime_to_ts_millis(
            job.create_time if job.create_time else datetime.now()
        )
        created_actor = "urn:li:corpuser:datahub"

        # If Training job has Input Dataset
        dataset_urn = (
            builder.make_dataset_urn(
                platform=self.platform,
                name=self._make_vertexai_dataset_name(
                    entity_id=job_meta.input_dataset.name
                ),
                env=self.config.env,
            )
            if job_meta.input_dataset
            else None
        )

        yield from MetadataChangeProposalWrapper.construct_many(
            job_urn,
            aspects=[
                DataProcessInstancePropertiesClass(
                    name=job_id,
                    created=AuditStampClass(
                        time=created_time,
                        actor=created_actor,
                    ),
                    externalUrl=external_url,
                    customProperties={
                        "displayName": job.display_name,
                        "jobType": job.__class__.__name__,
                    },
                ),
                MLTrainingRunProperties(externalUrl=external_url, id=job.name),
                SubTypesClass(typeNames=[MLTypes.TRAINING_JOB]),
                ContainerClass(container=self._get_project_container().as_urn()),
                # The input aspect is only meaningful when a dataset was resolved.
                DataProcessInstanceInputClass(inputs=[dataset_urn])
                if dataset_urn
                else None,
            ],
        )

    def _gen_ml_group_mcps(
        self,
        model: Model,
    ) -> Iterable[MetadataChangeProposalWrapper]:
        """
        Generate an MLModelGroup mcp for a VertexAI Model.
        """
        ml_model_group_urn = self._make_ml_model_group_urn(model)

        yield from MetadataChangeProposalWrapper.construct_many(
            ml_model_group_urn,
            aspects=[
                MLModelGroupPropertiesClass(
                    name=self._make_vertexai_model_group_name(model.name),
                    description=model.description,
                    created=(
                        TimeStampClass(time=datetime_to_ts_millis(model.create_time))
                        if model.create_time
                        else None
                    ),
                    lastModified=(
                        TimeStampClass(time=datetime_to_ts_millis(model.update_time))
                        if model.update_time
                        else None
                    ),
                    customProperties={"displayName": model.display_name},
                ),
                # TODO add following when metadata model for mlgroup is updated (these aspects not supported currently)
                # SubTypesClass(typeNames=[MLTypes.MODEL_GROUP]),
                # ContainerClass(container=self._get_project_container().as_urn())
            ],
        )

    def _make_ml_model_group_urn(self, model: Model) -> str:
        """Return the model-group URN for ``model``."""
        return builder.make_ml_model_group_urn(
            platform=self.platform,
            group_name=self._make_vertexai_model_group_name(model.name),
            env=self.config.env,
        )

    def _get_project_container(self) -> ProjectIdKey:
        """Return the container key identifying the configured project."""
        return ProjectIdKey(project_id=self.config.project_id, platform=self.platform)

    def _is_automl_job(self, job: VertexAiResourceNoun) -> bool:
        """Return True when ``job`` is an instance of one of the AutoML job classes."""
        return isinstance(
            job,
            (
                AutoMLTabularTrainingJob,
                AutoMLTextTrainingJob,
                AutoMLImageTrainingJob,
                AutoMLVideoTrainingJob,
                AutoMLForecastingTrainingJob,
            ),
        )

    def _search_model_version(
        self, model: Model, version_id: str
    ) -> Optional[VersionInfo]:
        """Return the version of ``model`` whose ``version_id`` matches, or None."""
        for version in model.versioning_registry.list_versions():
            if version.version_id == version_id:
                return version
        return None

    def _search_dataset(self, dataset_id: str) -> Optional[VertexAiResourceNoun]:
        """
        Search for a dataset by its ID in Vertex AI.

        On the first call, every dataset of the types listed below is fetched
        and cached by its ``name``; later calls are answered from that cache.
        Returns None when no cached dataset has the given ID.
        """
        dataset_types = (
            "TextDataset",
            "TabularDataset",
            "ImageDataset",
            "TimeSeriesDataset",
            "VideoDataset",
        )

        if self.datasets is None:
            self.datasets = {}
            for dtype in dataset_types:
                dataset_class = getattr(self.client.datasets, dtype)
                for ds in dataset_class.list():
                    self.datasets[ds.name] = ds

        return self.datasets.get(dataset_id)

    def _get_input_dataset_mcps(
        self, job_meta: TrainingJobMetadata
    ) -> Iterable[MetadataChangeProposalWrapper]:
        """
        Yield dataset properties, sub-type and container mcps for the job's
        input dataset. Yields nothing when the job has no resolved input dataset.
        """
        ds = job_meta.input_dataset
        if not ds:
            return

        # Create URN of Input Dataset for Training Job
        dataset_name = self._make_vertexai_dataset_name(entity_id=ds.name)
        dataset_urn = builder.make_dataset_urn(
            platform=self.platform,
            name=dataset_name,
            env=self.config.env,
        )

        yield from MetadataChangeProposalWrapper.construct_many(
            dataset_urn,
            aspects=[
                DatasetPropertiesClass(
                    name=dataset_name,
                    created=(
                        TimeStampClass(time=datetime_to_ts_millis(ds.create_time))
                        if ds.create_time
                        else None
                    ),
                    description=f"Dataset: {ds.display_name}",
                    customProperties={
                        "displayName": ds.display_name,
                        "resourceName": ds.resource_name,
                    },
                    qualifiedName=ds.resource_name,
                ),
                SubTypesClass(typeNames=[MLTypes.DATASET]),
                ContainerClass(container=self._get_project_container().as_urn()),
            ],
        )

    def _get_training_job_metadata(
        self, job: VertexAiResourceNoun
    ) -> TrainingJobMetadata:
        """
        Retrieve metadata for a given Vertex AI training job.

        Reads ``job.to_dict()`` and, when present, resolves the input dataset
        (``inputDataConfig.datasetId``) and the output model version
        (``modelToUpload.name`` / ``modelToUpload.versionId``). A
        GoogleAPICallError raised while resolving the output model is recorded
        on the report instead of being propagated.
        """
        job_meta = TrainingJobMetadata(job=job)
        job_conf = job.to_dict()

        # Missing or empty sections are treated the same: nothing to resolve.
        dataset_id = (job_conf.get("inputDataConfig") or {}).get("datasetId")
        if dataset_id:
            logger.info(
                f"Found input dataset (id: {dataset_id}) for training job ({job.display_name})"
            )
            input_ds = self._search_dataset(dataset_id)
            if input_ds:
                logger.info(
                    f"Found the name of input dataset ({input_ds.display_name}) with dataset id ({dataset_id})"
                )
                job_meta.input_dataset = input_ds

        # `.get` avoids a KeyError when "versionId" is absent from modelToUpload.
        model_conf = job_conf.get("modelToUpload") or {}
        model_name = model_conf.get("name")
        model_version_str = model_conf.get("versionId")
        if model_name and model_version_str:
            try:
                model = Model(model_name=model_name)
                model_version = self._search_model_version(model, model_version_str)
                if model and model_version:
                    logger.info(
                        f"Found output model (name:{model.display_name} id:{model_version_str}) "
                        f"for training job: {job.display_name}"
                    )
                    job_meta.output_model = model
                    job_meta.output_model_version = model_version
            except GoogleAPICallError as e:
                self.report.report_failure(
                    title="Unable to fetch model and model version",
                    message="Encountered an error while fetching output model and model version which training job generates",
                    exc=e,
                )

        return job_meta

    def _gen_endpoints_mcps(
        self, model_meta: ModelMetadata
    ) -> Iterable[MetadataChangeProposalWrapper]:
        """Yield a deployment-properties mcp for each endpoint of the model.

        The deployment URN is derived from ``endpoint.name``.
        """
        model: Model = model_meta.model
        model_version: VersionInfo = model_meta.model_version

        for endpoint in model_meta.endpoints or []:
            endpoint_urn = builder.make_ml_model_deployment_urn(
                platform=self.platform,
                deployment_name=self._make_vertexai_endpoint_name(
                    entity_id=endpoint.name
                ),
                env=self.config.env,
            )

            yield from MetadataChangeProposalWrapper.construct_many(
                endpoint_urn,
                aspects=[
                    MLModelDeploymentPropertiesClass(
                        description=model.description,
                        # Guarded like the other timestamps in this class.
                        createdAt=(
                            datetime_to_ts_millis(endpoint.create_time)
                            if endpoint.create_time
                            else None
                        ),
                        version=VersionTagClass(
                            versionTag=str(model_version.version_id)
                        ),
                        customProperties={"displayName": endpoint.display_name},
                    ),
                    # TODO add followings when metadata for MLModelDeployment is updated (these aspects not supported currently)
                    # ContainerClass(container=self._get_project_container().as_urn()),
                    # SubTypesClass(typeNames=[MLTypes.ENDPOINT])
                ],
            )

    def _gen_ml_model_mcps(
        self, ModelMetadata: ModelMetadata
    ) -> Iterable[MetadataChangeProposalWrapper]:
        """
        Generate an MLModel mcp for a VertexAI Model Version.

        The emitted properties link the model version to its model group, to
        the training job (when ``training_job_urn`` is set) and to the
        deployment URNs of its endpoints.
        """
        # The parameter name shadows the ModelMetadata class inside this method;
        # it is kept for backward compatibility and aliased for readability.
        model_meta = ModelMetadata
        model: Model = model_meta.model
        model_version: VersionInfo = model_meta.model_version
        training_job_urn: Optional[str] = model_meta.training_job_urn
        endpoints: Optional[List[Endpoint]] = model_meta.endpoints
        endpoint_urns: List[str] = []

        logger.info(f"generating model mcp for {model.name}")

        # Generate list of endpoint URNs
        for endpoint in endpoints or []:
            logger.info(
                f"found endpoint ({endpoint.display_name}) for model ({model.resource_name})"
            )
            endpoint_urns.append(
                builder.make_ml_model_deployment_urn(
                    platform=self.platform,
                    # Must match the key used in _gen_endpoints_mcps (endpoint.name),
                    # otherwise the link targets a URN that is never emitted.
                    deployment_name=self._make_vertexai_endpoint_name(
                        entity_id=endpoint.name
                    ),
                    env=self.config.env,
                )
            )

        # Create URN for Model and Model Version
        model_group_urn = self._make_ml_model_group_urn(model)
        model_name = self._make_vertexai_model_name(entity_id=model.name)
        model_version_name = f"{model_name}_{model_version.version_id}"
        model_urn = self._make_ml_model_urn(model_version, model_name=model_name)

        yield from MetadataChangeProposalWrapper.construct_many(
            entityUrn=model_urn,
            aspects=[
                MLModelPropertiesClass(
                    name=model_version_name,
                    description=model_version.version_description,
                    customProperties={
                        "displayName": f"{model_version.model_display_name}",
                        "versionId": f"{model_version.version_id}",
                        "resourceName": model.resource_name,
                    },
                    created=(
                        TimeStampClass(
                            datetime_to_ts_millis(model_version.version_create_time)
                        )
                        if model_version.version_create_time
                        else None
                    ),
                    lastModified=(
                        TimeStampClass(
                            datetime_to_ts_millis(model_version.version_update_time)
                        )
                        if model_version.version_update_time
                        else None
                    ),
                    version=VersionTagClass(versionTag=str(model_version.version_id)),
                    groups=[model_group_urn],  # link model version to model group
                    trainingJobs=(
                        [training_job_urn] if training_job_urn else None
                    ),  # link to training job
                    deployments=endpoint_urns,
                    externalUrl=self._make_model_version_external_url(model),
                    type="ML Model",
                ),
                # TODO Add a container for Project as parent of the dataset
                # ContainerClass(
                #     container=self._get_project_container().as_urn(),
                # )
            ],
        )

    def _search_endpoint(self, model: Model) -> List[Endpoint]:
        """
        Return the endpoints associated with the model (empty list if none).

        On the first call, all endpoints are listed and indexed by the
        ``model`` value of each entry returned by ``endpoint.list_models()``;
        later calls reuse that index.
        """
        if self.endpoints is None:
            endpoint_dict: Dict[str, List[Endpoint]] = {}
            for endpoint in self.client.Endpoint.list():
                for resource in endpoint.list_models():
                    endpoint_dict.setdefault(resource.model, []).append(endpoint)
            self.endpoints = endpoint_dict

        return self.endpoints.get(model.resource_name, [])

    def _make_ml_model_urn(self, model_version: VersionInfo, model_name: str) -> str:
        """Return the ML model URN for ``model_name`` at ``model_version``."""
        return builder.make_ml_model_urn(
            platform=self.platform,
            model_name=f"{model_name}_{model_version.version_id}",
            env=self.config.env,
        )

    def _make_training_job_urn(self, job: VertexAiResourceNoun) -> str:
        """Return the data-process-instance URN for a training job."""
        job_id = self._make_vertexai_job_name(entity_id=job.name)
        return builder.make_data_process_instance_urn(dataProcessInstanceId=job_id)

    def _make_vertexai_model_group_name(
        self,
        entity_id: str,
    ) -> str:
        """Return ``<project_id>.model_group.<entity_id>``."""
        return f"{self.config.project_id}.model_group.{entity_id}"

    def _make_vertexai_endpoint_name(self, entity_id: str) -> str:
        """Return ``<project_id>.endpoint.<entity_id>``."""
        return f"{self.config.project_id}.endpoint.{entity_id}"

    def _make_vertexai_model_name(self, entity_id: str) -> str:
        """Return ``<project_id>.model.<entity_id>``."""
        return f"{self.config.project_id}.model.{entity_id}"

    def _make_vertexai_dataset_name(self, entity_id: str) -> str:
        """Return ``<project_id>.dataset.<entity_id>``."""
        return f"{self.config.project_id}.dataset.{entity_id}"

    def _make_vertexai_job_name(
        self,
        entity_id: Optional[str],
    ) -> str:
        """Return ``<project_id>.job.<entity_id>``."""
        return f"{self.config.project_id}.job.{entity_id}"

    def _make_job_external_url(self, job: VertexAiResourceNoun) -> str:
        """
        Training job external URL in Vertex AI
        Sample URLs:
        https://console.cloud.google.com/vertex-ai/training/training-pipelines?project=acryl-poc&trainingPipelineId=5401695018589093888
        """
        # Query parameters are joined with "&"; only the first is introduced by "?".
        external_url: str = (
            f"{self.config.vertexai_url}/training/training-pipelines?trainingPipelineId={job.name}"
            f"&project={self.config.project_id}"
        )
        return external_url

    def _make_model_external_url(self, model: Model) -> str:
        """
        Model external URL in Vertex AI
        Sample URL:
        https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/812468724182286336?project=acryl-poc
        """
        external_url: str = (
            f"{self.config.vertexai_url}/models/locations/{self.config.region}/models/{model.name}"
            f"?project={self.config.project_id}"
        )
        return external_url

    def _make_model_version_external_url(self, model: Model) -> str:
        """
        Model Version external URL in Vertex AI
        Sample URL:
        https://console.cloud.google.com/vertex-ai/models/locations/us-west2/models/812468724182286336/versions/1?project=acryl-poc
        """
        external_url: str = (
            f"{self.config.vertexai_url}/models/locations/{self.config.region}/models/{model.name}"
            f"/versions/{model.version_id}"
            f"?project={self.config.project_id}"
        )
        return external_url