acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (503) hide show
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,29 @@
1
+ from typing import Dict, Optional
2
+
3
+ from pydantic import Field
4
+
5
+ from datahub.configuration.source_common import EnvConfigMixin
6
+ from datahub.ingestion.source.common.gcp_credentials_config import GCPCredential
7
+
8
+
9
class VertexAIConfig(EnvConfigMixin):
    """Settings model for connecting to Vertex AI on Google Cloud Platform.

    Holds an optional GCP credential plus the project, region, bucket and
    console URL values declared below.
    """

    # Optional credential block; when absent, get_credentials() returns None.
    credential: Optional[GCPCredential] = Field(
        description="GCP credential information",
        default=None,
    )

    # Required: no default is supplied for project_id or region.
    project_id: str = Field(description="Project ID in Google Cloud Platform")
    region: str = Field(description="Region of your project in Google Cloud Platform")

    bucket_uri: Optional[str] = Field(
        description="Bucket URI used in your project",
        default=None,
    )

    # Defaults to the public Vertex AI console address.
    vertexai_url: Optional[str] = Field(
        description="VertexUI URI",
        default="https://console.cloud.google.com/vertex-ai",
    )

    def get_credentials(self) -> Optional[Dict[str, str]]:
        """Return the configured credential as a dict, or None if unset.

        The dict is produced by ``GCPCredential.to_dict`` called with this
        config's ``project_id``.
        """
        if not self.credential:
            return None
        return self.credential.to_dict(self.project_id)
@@ -0,0 +1,89 @@
1
+ from typing import Optional, Union
2
+
3
+ from google.cloud.aiplatform.base import VertexAiResourceNoun
4
+ from google.cloud.aiplatform.jobs import _RunnableJob
5
+ from google.cloud.aiplatform.training_jobs import _TrainingJob
6
+ from google.cloud.aiplatform_v1.types import JobState, PipelineState, PipelineTaskDetail
7
+
8
+ from datahub.metadata.schema_classes import RunResultTypeClass
9
+
10
+
11
def get_automl_job_result_type(state: PipelineState) -> Union[str, RunResultTypeClass]:
    """Translate a ``PipelineState`` into a run-result value.

    Args:
        state: Pipeline state to translate.

    Returns:
        ``RunResultTypeClass.SUCCESS`` / ``RunResultTypeClass.FAILURE`` for the
        succeeded and failed states, a descriptive string for the other listed
        states, and ``"UNKNOWN"`` for any state that is not listed.
    """
    result_by_state = {
        PipelineState.PIPELINE_STATE_SUCCEEDED: RunResultTypeClass.SUCCESS,
        PipelineState.PIPELINE_STATE_FAILED: RunResultTypeClass.FAILURE,
        PipelineState.PIPELINE_STATE_CANCELLED: "Cancelled",
        PipelineState.PIPELINE_STATE_PAUSED: "Paused",
        PipelineState.PIPELINE_STATE_QUEUED: "Queued",
        PipelineState.PIPELINE_STATE_RUNNING: "Running",
        PipelineState.PIPELINE_STATE_UNSPECIFIED: "Unspecific",
    }
    if state in result_by_state:
        return result_by_state[state]
    return "UNKNOWN"
23
+
24
+
25
def get_custom_job_result_type(state: JobState) -> Union[str, RunResultTypeClass]:
    """Translate a ``JobState`` into a run-result value.

    Args:
        state: Job state to translate.

    Returns:
        ``RunResultTypeClass.SUCCESS`` / ``RunResultTypeClass.FAILURE`` for the
        succeeded and failed states, a descriptive string for the other listed
        states, and ``"UNKNOWN"`` for any state that is not listed.
    """
    result_by_state = {
        JobState.JOB_STATE_SUCCEEDED: RunResultTypeClass.SUCCESS,
        JobState.JOB_STATE_FAILED: RunResultTypeClass.FAILURE,
        JobState.JOB_STATE_CANCELLED: "Cancelled",
        JobState.JOB_STATE_PAUSED: "Paused",
        JobState.JOB_STATE_QUEUED: "Queued",
        JobState.JOB_STATE_RUNNING: "Running",
        JobState.JOB_STATE_CANCELLING: "Cancelling",
        JobState.JOB_STATE_EXPIRED: "Expired",
        JobState.JOB_STATE_UPDATING: "Updating",
    }
    return result_by_state[state] if state in result_by_state else "UNKNOWN"
38
+
39
+
40
def get_job_result_status(job: VertexAiResourceNoun) -> Union[str, RunResultTypeClass]:
    """Return the run-result value for a job based on its type and state.

    A ``_TrainingJob`` whose state is a ``PipelineState`` is handled by
    ``get_automl_job_result_type``; a ``_RunnableJob`` whose state is a
    ``JobState`` is handled by ``get_custom_job_result_type``. Anything else
    yields ``"UNKNOWN"``.
    """
    # ``job.state`` is only read after the corresponding type check passes.
    is_automl_job = isinstance(job, _TrainingJob) and isinstance(
        job.state, PipelineState
    )
    if is_automl_job:
        return get_automl_job_result_type(job.state)

    is_custom_job = isinstance(job, _RunnableJob) and isinstance(job.state, JobState)
    if is_custom_job:
        return get_custom_job_result_type(job.state)

    return "UNKNOWN"
46
+
47
+
48
def get_execution_result_status(status: int) -> Union[str, RunResultTypeClass]:
    """Translate a numeric execution state code into a run-result value.

    Code table:
        0 -> "STATE_UNSPECIFIED"
        1 -> "PENDING"
        2 -> "RUNNING"
        3 -> RunResultTypeClass.SUCCESS (succeeded)
        4 -> RunResultTypeClass.FAILURE (failed)

    Any other code yields ``"UNKNOWN"``.
    """
    result_by_code = {
        0: "STATE_UNSPECIFIED",
        1: "PENDING",
        2: "RUNNING",
        3: RunResultTypeClass.SUCCESS,
        4: RunResultTypeClass.FAILURE,
    }
    try:
        return result_by_code[status]
    except KeyError:
        return "UNKNOWN"
65
+
66
+
67
def get_pipeline_task_result_status(
    status: Optional[PipelineTaskDetail.State],
) -> Union[str, RunResultTypeClass]:
    """Translate a pipeline task state into a run-result value.

    Args:
        status: Task state, or ``None`` when no state is available.

    Returns:
        The matching ``RunResultTypeClass`` constant for the succeeded, failed
        and skipped states; ``"UNKNOWN"`` for ``None`` and every other state.
    """
    if status is None:
        return "UNKNOWN"

    # TODO: DataProcessInstanceRunResultClass fails with status string except for SUCCESS, FAILURE, SKIPPED,
    #  which will be fixed in the future. Until then the remaining states
    #  (STATE_UNSPECIFIED, PENDING, RUNNING, CANCEL_PENDING, CANCELLING,
    #  NOT_TRIGGERED) are deliberately left unmapped.
    result_by_state = {
        PipelineTaskDetail.State.SUCCEEDED: RunResultTypeClass.SUCCESS,
        PipelineTaskDetail.State.FAILED: RunResultTypeClass.FAILURE,
        PipelineTaskDetail.State.SKIPPED: RunResultTypeClass.SKIPPED,
    }
    return result_by_state.get(status, "UNKNOWN")
86
+
87
+
88
def is_status_for_run_event_class(status: Union[str, RunResultTypeClass]) -> bool:
    """Return ``True`` only when ``status`` is the SUCCESS or FAILURE result type."""
    accepted = (RunResultTypeClass.SUCCESS, RunResultTypeClass.FAILURE)
    return status in accepted
@@ -2,6 +2,7 @@ import re
2
2
  from typing import Dict, List, Optional, Union
3
3
  from urllib.parse import urlparse
4
4
 
5
+ import pydantic
5
6
  from pydantic import Field, validator
6
7
 
7
8
  from datahub.configuration.common import AllowDenyPattern
@@ -121,7 +122,8 @@ class PulsarSourceConfig(
121
122
  )
122
123
  return client_secret
123
124
 
124
- @validator("web_service_url")
125
+ @pydantic.field_validator("web_service_url", mode="after")
126
+ @classmethod
125
127
  def web_service_url_scheme_host_port(cls, val: str) -> str:
126
128
  # Tokenize the web url
127
129
  url = urlparse(val)
@@ -1,7 +1,9 @@
1
1
  import logging
2
+ from collections import defaultdict
2
3
  from contextlib import AbstractContextManager
3
4
  from dataclasses import dataclass, field
4
5
  from datetime import datetime, timezone
6
+ from enum import Enum
5
7
 
6
8
  from datahub.utilities.perf_timer import PerfTimer
7
9
  from datahub.utilities.stats_collections import TopKDict
@@ -20,31 +22,68 @@ QUERIES_EXTRACTION = "Queries Extraction"
20
22
  PROFILING = "Profiling"
21
23
 
22
24
 
25
+ class IngestionHighStage(Enum):
26
+ """
27
+ The high-level stages at the framework level
28
+ Team to add more stages as needed
29
+ """
30
+
31
+ PROFILING = "Profiling"
32
+ _UNDEFINED = "Ingestion"
33
+
34
+
23
35
  @dataclass
24
36
  class IngestionStageReport:
37
+ ingestion_high_stage_seconds: dict[IngestionHighStage, float] = field(
38
+ default_factory=lambda: defaultdict(float)
39
+ )
25
40
  ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict)
26
41
 
27
- def new_stage(self, stage: str) -> "IngestionStageContext":
28
- return IngestionStageContext(stage, self)
42
+ def new_stage(
43
+ self, stage: str, high_stage: IngestionHighStage = IngestionHighStage._UNDEFINED
44
+ ) -> "IngestionStageContext":
45
+ return IngestionStageContext(stage, self, high_stage)
46
+
47
+ def new_high_stage(self, stage: IngestionHighStage) -> "IngestionStageContext":
48
+ return IngestionStageContext("", self, stage)
29
49
 
30
50
 
31
51
  @dataclass
32
52
  class IngestionStageContext(AbstractContextManager):
33
- def __init__(self, stage: str, report: IngestionStageReport):
34
- self._ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
53
+ def __init__(
54
+ self,
55
+ stage: str,
56
+ report: IngestionStageReport,
57
+ high_stage: IngestionHighStage = IngestionHighStage._UNDEFINED,
58
+ ):
59
+ self._high_stage = high_stage
60
+ self._ingestion_stage = (
61
+ f"{stage} at {datetime.now(timezone.utc)}" if stage else ""
62
+ )
35
63
  self._timer: PerfTimer = PerfTimer()
36
64
  self._report = report
37
65
 
38
66
  def __enter__(self) -> "IngestionStageContext":
39
- logger.info(f"Stage started: {self._ingestion_stage}")
67
+ if self._ingestion_stage:
68
+ logger.info(f"Stage started: {self._ingestion_stage}")
69
+ else:
70
+ logger.info(f"High stage started: {self._high_stage.value}")
40
71
  self._timer.start()
41
72
  return self
42
73
 
43
74
  def __exit__(self, exc_type, exc_val, exc_tb):
44
75
  elapsed = self._timer.elapsed_seconds(digits=2)
45
- logger.info(
46
- f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds",
47
- stacklevel=2,
48
- )
49
- self._report.ingestion_stage_durations[self._ingestion_stage] = elapsed
50
- return None
76
+ if self._ingestion_stage:
77
+ logger.info(
78
+ f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds",
79
+ stacklevel=2,
80
+ )
81
+ # Store tuple as string to avoid serialization errors
82
+ key = f"({self._high_stage.value}, {self._ingestion_stage})"
83
+ self._report.ingestion_stage_durations[key] = elapsed
84
+ else:
85
+ logger.info(
86
+ f"Time spent in stage <{self._high_stage.value}>: {elapsed} seconds",
87
+ stacklevel=2,
88
+ )
89
+ self._report.ingestion_high_stage_seconds[self._high_stage] += elapsed
@@ -54,7 +54,7 @@ class AddDatasetDataProduct(DatasetDataproductTransformer):
54
54
  data_products_container: Dict[str, DataProductPatchBuilder] = {}
55
55
  logger.debug("Generating dataproducts")
56
56
  is_container = self.config.is_container
57
- for entity_urn in self.entity_map.keys():
57
+ for entity_urn in self.entity_map:
58
58
  data_product_urn = self.config.get_data_product_to_add(entity_urn)
59
59
  if data_product_urn:
60
60
  if data_product_urn not in data_products:
@@ -71,8 +71,24 @@ class AddDatasetOwnership(OwnershipTransformer):
71
71
 
72
72
  server_ownership = graph.get_ownership(entity_urn=urn)
73
73
  if server_ownership:
74
- owners = {owner.owner: owner for owner in server_ownership.owners}
75
- owners.update({owner.owner: owner for owner in mce_ownership.owners})
74
+ owners = {
75
+ (
76
+ owner.owner,
77
+ owner.type,
78
+ owner.typeUrn,
79
+ ): owner
80
+ for owner in server_ownership.owners
81
+ }
82
+ owners.update(
83
+ {
84
+ (
85
+ owner.owner,
86
+ owner.type,
87
+ owner.typeUrn,
88
+ ): owner
89
+ for owner in mce_ownership.owners
90
+ }
91
+ )
76
92
  mce_ownership.owners = list(owners.values())
77
93
 
78
94
  return mce_ownership
@@ -86,7 +102,7 @@ class AddDatasetOwnership(OwnershipTransformer):
86
102
  logger.debug("Generating Ownership for containers")
87
103
  ownership_container_mapping: Dict[str, List[OwnerClass]] = {}
88
104
  for entity_urn, data_ownerships in (
89
- (urn, self.config.get_owners_to_add(urn)) for urn in self.entity_map.keys()
105
+ (urn, self.config.get_owners_to_add(urn)) for urn in self.entity_map
90
106
  ):
91
107
  if not data_ownerships:
92
108
  continue
@@ -281,11 +281,14 @@ class BaseTransformer(Transformer, metaclass=ABCMeta):
281
281
  )
282
282
  )
283
283
 
284
- record_metadata = _update_work_unit_id(
285
- envelope=envelope,
286
- aspect_name=mcp.aspect.get_aspect_name(), # type: ignore
287
- urn=mcp.entityUrn,
288
- )
284
+ if mcp.entityUrn:
285
+ record_metadata = _update_work_unit_id(
286
+ envelope=envelope,
287
+ aspect_name=mcp.aspect.get_aspect_name(), # type: ignore
288
+ urn=mcp.entityUrn,
289
+ )
290
+ else:
291
+ record_metadata = envelope.metadata.copy()
289
292
 
290
293
  yield RecordEnvelope(
291
294
  record=mcp,
@@ -125,7 +125,7 @@ class AddDatasetDomain(DatasetDomainTransformer):
125
125
  return domain_mcps
126
126
 
127
127
  for entity_urn, domain_to_add in (
128
- (urn, self.config.get_domains_to_add(urn)) for urn in self.entity_map.keys()
128
+ (urn, self.config.get_domains_to_add(urn)) for urn in self.entity_map
129
129
  ):
130
130
  if not domain_to_add or not domain_to_add.domains:
131
131
  continue
@@ -0,0 +1,112 @@
1
+ import re
2
+ from collections import defaultdict
3
+ from typing import Dict, List, Optional, cast
4
+
5
+ from datahub.configuration.common import (
6
+ TransformerSemanticsConfigModel,
7
+ )
8
+ from datahub.emitter.mce_builder import Aspect
9
+ from datahub.ingestion.api.common import PipelineContext
10
+ from datahub.ingestion.transformer.base_transformer import (
11
+ BaseTransformer,
12
+ SingleAspectTransformer,
13
+ )
14
+ from datahub.metadata.schema_classes import (
15
+ BrowsePathEntryClass,
16
+ BrowsePathsV2Class,
17
+ )
18
+ from datahub.utilities.urns.urn import guess_entity_type
19
+
20
+
21
class SetBrowsePathTransformerConfig(TransformerSemanticsConfigModel):
    # Ordered list of browse-path node templates. Each entry is either used
    # literally or, when it has the form ``$<name>[<index or *>]``, replaced by
    # URNs taken from the entity's existing browse path.
    path: List[str]
23
+
24
+
25
class SetBrowsePathTransformer(BaseTransformer, SingleAspectTransformer):
    """Builds a ``browsePathsV2`` aspect from the configured path templates.

    Templates of the form ``$<entityType>[<index>]`` or ``$<entityType>[*]``
    are replaced with URNs found in the entity's existing browse path; every
    other template is used as a literal node.
    """

    ctx: PipelineContext
    config: SetBrowsePathTransformerConfig

    def __init__(self, config: SetBrowsePathTransformerConfig, ctx: PipelineContext):
        super().__init__()
        self.ctx = ctx
        self.config = config

    def aspect_name(self) -> str:
        """Name of the aspect this transformer operates on."""
        return "browsePathsV2"

    def entity_types(self) -> List[str]:
        """Entity types this transformer applies to."""
        # This is an arbitrary list, might be adjusted if it makes sense. It might be reasonable to make it configurable
        return ["dataset", "dataJob", "dataFlow", "chart", "dashboard", "container"]

    @classmethod
    def create(
        cls, config_dict: dict, ctx: PipelineContext
    ) -> "SetBrowsePathTransformer":
        """Parse ``config_dict`` into the config model and build the transformer."""
        return cls(SetBrowsePathTransformerConfig.parse_obj(config_dict), ctx)

    @staticmethod
    def _build_model(existing_browse_paths: BrowsePathsV2Class) -> Dict[str, List[str]]:
        """Build the template-variable table from an existing browse path.

        URN-bearing entries are grouped by entity type in path order. Each
        type contributes ``"<type>[*]"`` (all of its URNs) and
        ``"<type>[<i>]"`` (a one-element list holding the i-th URN).
        """
        urns_by_type: Dict[str, List[str]] = defaultdict(list)
        for entry in existing_browse_paths.path or []:
            if not entry.urn:
                # Entries without a URN cannot be referenced by a variable.
                continue
            urns_by_type[guess_entity_type(entry.urn)].append(entry.urn)

        variables: Dict[str, List[str]] = {}
        for entity_type, urns in urns_by_type.items():
            variables[f"{entity_type}[*]"] = urns
            variables.update(
                {f"{entity_type}[{idx}]": [urn] for idx, urn in enumerate(urns)}
            )
        return variables

    @classmethod
    def _expand_nodes(
        cls, templates: List[str], template_vars: Dict[str, List[str]]
    ) -> BrowsePathsV2Class:
        """Resolve every template and wrap the resulting nodes in a browse path.

        Empty and whitespace-only nodes are dropped. A node starting with
        ``"urn:"`` is stored as both the entry's id and urn; any other node
        gets an id only.
        """
        resolved: List[str] = [
            node
            for template in templates
            for node in cls._resolve_template_to_nodes(template, template_vars)
        ]
        entries: List[BrowsePathEntryClass] = [
            BrowsePathEntryClass(id=node, urn=node if node.startswith("urn:") else None)
            for node in resolved
            if node and not node.isspace()
        ]
        return BrowsePathsV2Class(path=entries)

    def transform_aspect(
        self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect]
    ) -> Optional[Aspect]:
        """Produce the new browse path for an entity.

        Variables are derived from the existing aspect when one is present.
        Unless ``replace_existing`` is set on the config, the existing path
        entries are appended after the newly generated ones.
        """
        if aspect is None:
            template_vars: Dict[str, List[str]] = {}
        else:
            assert isinstance(aspect, BrowsePathsV2Class)
            template_vars = self._build_model(aspect)

        new_browse_paths: BrowsePathsV2Class = self._expand_nodes(
            self.config.path, template_vars
        )
        if aspect is not None and not self.config.replace_existing:
            new_browse_paths.path.extend(aspect.path)

        return cast(Aspect, new_browse_paths)

    @staticmethod
    def _resolve_template_to_nodes(
        template_str: str, template_vars: Dict[str, List[str]]
    ) -> List[str]:
        """Resolve one template into the list of nodes it stands for.

        A stripped template that is not a variable reference is returned as a
        single literal node. A variable reference returns the URNs registered
        under that variable, or an empty list when the variable is unknown.
        """
        # This mechanism can be made simpler (match against known variables only) or more complex (e.g. by using a
        # proper templating engine, like jinja).
        candidate = template_str.strip()
        variable_names = re.findall(r"^\$([a-zA-Z]+\[[0-9*]+]$)", candidate)

        if variable_names:
            return template_vars.get(variable_names[0], [])
        return [candidate]
@@ -3,6 +3,7 @@ from typing import List, Optional, Tuple, TypedDict
3
3
 
4
4
  from datahub.api.entities.assertion.assertion import BaseEntityAssertion
5
5
  from datahub.ingestion.graph.client import get_default_graph
6
+ from datahub.ingestion.graph.config import ClientMode
6
7
  from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProperties
7
8
  from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaMetadata
8
9
  from datahub.utilities.urns.urn import Urn
@@ -15,7 +16,7 @@ class ColumnDict(TypedDict):
15
16
 
16
17
  @lru_cache
17
18
  def get_qualified_name_from_datahub(urn: str) -> Optional[str]:
18
- with get_default_graph() as graph:
19
+ with get_default_graph(ClientMode.CLI) as graph:
19
20
  props: Optional[DatasetProperties] = graph.get_aspect(urn, DatasetProperties)
20
21
  if props is not None:
21
22
  return props.qualifiedName
@@ -24,7 +25,7 @@ def get_qualified_name_from_datahub(urn: str) -> Optional[str]:
24
25
 
25
26
  @lru_cache
26
27
  def get_schema_from_datahub(urn: str) -> Optional[List[ColumnDict]]:
27
- with get_default_graph() as graph:
28
+ with get_default_graph(ClientMode.INGESTION) as graph:
28
29
  schema: Optional[SchemaMetadata] = graph.get_aspect(urn, SchemaMetadata)
29
30
  if schema is not None:
30
31
  return [
@@ -84,9 +84,10 @@ class SnowflakeAssertionCompiler(AssertionCompiler):
84
84
 
85
85
  dmf_definitions_path = self.output_dir / DMF_DEFINITIONS_FILE_NAME
86
86
  dmf_associations_path = self.output_dir / DMF_ASSOCIATIONS_FILE_NAME
87
- with (dmf_definitions_path).open("w") as definitions, (
88
- dmf_associations_path
89
- ).open("w") as associations:
87
+ with (
88
+ (dmf_definitions_path).open("w") as definitions,
89
+ (dmf_associations_path).open("w") as associations,
90
+ ):
90
91
  for assertion_spec in assertion_config_spec.assertions:
91
92
  result.report.num_processed += 1
92
93
  try:
datahub/lite/lite_util.py CHANGED
@@ -99,7 +99,7 @@ def get_datahub_lite(config_dict: dict, read_only: bool = False) -> "DataHubLite
99
99
  lite_class = lite_registry.get(lite_type)
100
100
  except KeyError as e:
101
101
  raise Exception(
102
- f"Failed to find a registered lite implementation for {lite_type}. Valid values are {[k for k in lite_registry.mapping.keys()]}"
102
+ f"Failed to find a registered lite implementation for {lite_type}. Valid values are {[k for k in lite_registry.mapping]}"
103
103
  ) from e
104
104
 
105
105
  lite_specific_config = lite_class.get_config_class().parse_obj(
@@ -127,7 +127,7 @@ def get_datahub_lite(config_dict: dict, read_only: bool = False) -> "DataHubLite
127
127
  return lite
128
128
  else:
129
129
  raise Exception(
130
- f"Failed to find a registered forwarding sink for type {lite_local_config.forward_to.type}. Valid values are {[k for k in sink_registry.mapping.keys()]}"
130
+ f"Failed to find a registered forwarding sink for type {lite_local_config.forward_to.type}. Valid values are {[k for k in sink_registry.mapping]}"
131
131
  )
132
132
  else:
133
133
  return lite