acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. The information is provided for informational purposes only.

Potentially problematic release.



Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/unity/tag_entities.py (new file):
@@ -0,0 +1,197 @@
+import logging
+from typing import TYPE_CHECKING, List, Optional
+
+if TYPE_CHECKING:
+    from datahub.ingestion.source.unity.platform_resource_repository import (
+        UnityCatalogPlatformResourceRepository,
+    )
+
+from pydantic import BaseModel
+
+from datahub.api.entities.external.external_entities import (
+    ExternalEntity,
+    ExternalEntityId,
+    LinkedResourceSet,
+)
+from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
+from datahub.api.entities.platformresource.platform_resource import (
+    PlatformResource,
+    PlatformResourceKey,
+)
+from datahub.ingestion.graph.client import DataHubGraph
+from datahub.metadata.urns import TagUrn
+from datahub.utilities.urns.urn import Urn
+
+
+class UnityCatalogTagSyncContext(BaseModel):
+    # it is intentionally empty
+    platform_instance: Optional[str] = None
+
+
+logger = logging.getLogger(__name__)
+
+
+class UnityCatalogTagPlatformResourceId(ExternalEntityId):
+    """
+    A Unity Catalog tag platform resource ID.
+    """
+
+    tag_key: str
+    tag_value: Optional[str] = None
+    platform_instance: Optional[str] = None
+    exists_in_unity_catalog: bool = False
+    persisted: bool = False
+
+    # this is a hack to make sure the property is a string and not private pydantic field
+    @staticmethod
+    def _RESOURCE_TYPE() -> str:
+        return "UnityCatalogTagPlatformResource"
+
+    def to_platform_resource_key(self) -> PlatformResourceKey:
+        return PlatformResourceKey(
+            platform="databricks",
+            resource_type=str(UnityCatalogTagPlatformResourceId._RESOURCE_TYPE()),
+            primary_key=f"{self.tag_key}:{self.tag_value}",
+            platform_instance=self.platform_instance,
+        )
+
+    @classmethod
+    def get_or_create_from_tag(
+        cls,
+        tag: UnityCatalogTag,
+        platform_resource_repository: "UnityCatalogPlatformResourceRepository",
+        exists_in_unity_catalog: bool = False,
+    ) -> "UnityCatalogTagPlatformResourceId":
+        """
+        Creates a UnityCatalogTagPlatformResourceId from a UnityCatalogTag.
+        """
+
+        existing_platform_resource = platform_resource_repository.search_entity_by_urn(
+            tag.to_datahub_tag_urn().urn()
+        )
+        if existing_platform_resource:
+            logger.debug(
+                f"Found existing UnityCatalogTagPlatformResourceId for tag {tag.key.raw_text}: {existing_platform_resource}"
+            )
+            return existing_platform_resource
+
+        return UnityCatalogTagPlatformResourceId(
+            tag_key=tag.key.raw_text,
+            tag_value=tag.value.raw_text if tag.value is not None else None,
+            platform_instance=platform_resource_repository.platform_instance,
+            exists_in_unity_catalog=exists_in_unity_catalog,
+            persisted=False,
+        )
+
+    @classmethod
+    def from_datahub_urn(
+        cls,
+        urn: str,
+        tag_sync_context: UnityCatalogTagSyncContext,
+        platform_resource_repository: "UnityCatalogPlatformResourceRepository",
+        graph: DataHubGraph,
+    ) -> "UnityCatalogTagPlatformResourceId":
+        """
+        Creates a UnityCatalogTagPlatformResourceId from a DataHub URN.
+        """
+        existing_platform_resource_id = (
+            platform_resource_repository.search_entity_by_urn(urn)
+        )
+        if existing_platform_resource_id:
+            return existing_platform_resource_id
+
+        new_unity_catalog_tag_id = cls.generate_tag_id(graph, tag_sync_context, urn)
+        if new_unity_catalog_tag_id:
+            resource_key = platform_resource_repository.get(
+                new_unity_catalog_tag_id.to_platform_resource_key()
+            )
+            if resource_key:
+                # Create a new ID with the correct state instead of mutating
+                return UnityCatalogTagPlatformResourceId(
+                    tag_key=new_unity_catalog_tag_id.tag_key,
+                    tag_value=new_unity_catalog_tag_id.tag_value,
+                    platform_instance=new_unity_catalog_tag_id.platform_instance,
+                    exists_in_unity_catalog=True,  # This tag exists in Unity Catalog
+                    persisted=new_unity_catalog_tag_id.persisted,
+                )
+            return new_unity_catalog_tag_id
+        raise ValueError(
+            f"Unable to create Unity Catalog tag ID from DataHub URN: {urn}"
+        )
+
+    @classmethod
+    def generate_tag_id(
+        cls, graph: DataHubGraph, tag_sync_context: UnityCatalogTagSyncContext, urn: str
+    ) -> "UnityCatalogTagPlatformResourceId":
+        parsed_urn = Urn.from_string(urn)
+        entity_type = parsed_urn.entity_type
+        if entity_type == "tag":
+            return UnityCatalogTagPlatformResourceId.from_datahub_tag(
+                TagUrn.from_string(urn), tag_sync_context
+            )
+        else:
+            raise ValueError(f"Unsupported entity type {entity_type} for URN {urn}")
+
+    @classmethod
+    def from_datahub_tag(
+        cls, tag_urn: TagUrn, tag_sync_context: UnityCatalogTagSyncContext
+    ) -> "UnityCatalogTagPlatformResourceId":
+        uc_tag = UnityCatalogTag.from_urn(tag_urn)
+
+        return UnityCatalogTagPlatformResourceId(
+            tag_key=str(uc_tag.key),
+            tag_value=str(uc_tag.value) if uc_tag.value is not None else None,
+            platform_instance=tag_sync_context.platform_instance,
+            exists_in_unity_catalog=False,
+        )
+
+
+class UnityCatalogTagPlatformResource(ExternalEntity):
+    datahub_urns: LinkedResourceSet
+    managed_by_datahub: bool
+    id: UnityCatalogTagPlatformResourceId
+    allowed_values: Optional[List[str]] = None
+
+    def get_id(self) -> ExternalEntityId:
+        return self.id
+
+    def is_managed_by_datahub(self) -> bool:
+        return self.managed_by_datahub
+
+    def datahub_linked_resources(self) -> LinkedResourceSet:
+        return self.datahub_urns
+
+    def as_platform_resource(self) -> PlatformResource:
+        return PlatformResource.create(
+            key=self.id.to_platform_resource_key(),
+            secondary_keys=[u for u in self.datahub_urns.urns],
+            value=self,
+        )
+
+    @classmethod
+    def create_default(
+        cls,
+        entity_id: ExternalEntityId,
+        managed_by_datahub: bool,
+    ) -> "UnityCatalogTagPlatformResource":
+        """Create a default Unity Catalog tag entity when none found in DataHub."""
+        # Type narrowing: we know this will be a UnityCatalogTagPlatformResourceId
+        assert isinstance(entity_id, UnityCatalogTagPlatformResourceId), (
+            f"Expected UnityCatalogTagPlatformResourceId, got {type(entity_id)}"
+        )
+
+        # Create a new entity ID with correct default state instead of mutating
+        default_entity_id = UnityCatalogTagPlatformResourceId(
+            tag_key=entity_id.tag_key,
+            tag_value=entity_id.tag_value,
+            platform_instance=entity_id.platform_instance,
+            exists_in_unity_catalog=False,  # New entities don't exist in Unity Catalog yet
+            persisted=False,  # New entities are not persisted yet
+        )
+
+        return cls(
+            id=default_entity_id,
+            datahub_urns=LinkedResourceSet(urns=[]),
+            managed_by_datahub=managed_by_datahub,
+            allowed_values=None,
+        )
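
A minimal sketch (not from the package) of how the new ID type maps onto a platform resource key; the tag key and value below are placeholders, and only the constructor fields and method shown in the hunk above are used:

from datahub.ingestion.source.unity.tag_entities import (
    UnityCatalogTagPlatformResourceId,
)

# Hypothetical tag; "pii"/"email" are illustrative values, not from the diff.
tag_id = UnityCatalogTagPlatformResourceId(
    tag_key="pii",
    tag_value="email",
    platform_instance=None,
)
key = tag_id.to_platform_resource_key()
# key.platform == "databricks"
# key.resource_type == "UnityCatalogTagPlatformResource"
# key.primary_key == "pii:email"
print(key.primary_key)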

datahub/ingestion/source/unity/usage.py:
@@ -11,7 +11,10 @@ from databricks.sdk.service.sql import QueryStatementType
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.source_helpers import auto_empty_dataset_usage_statistics
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.unity.config import UnityCatalogSourceConfig
+from datahub.ingestion.source.unity.config import (
+    UnityCatalogSourceConfig,
+    UsageDataSource,
+)
 from datahub.ingestion.source.unity.proxy import UnityCatalogApiProxy
 from datahub.ingestion.source.unity.proxy_types import (
     OPERATION_STATEMENT_TYPES,
@@ -164,11 +167,50 @@ class UnityCatalogUsageExtractor:
             aspect=operation_aspect,
         ).as_workunit()

+    def _validate_usage_data_source_config(self) -> None:
+        """Validate usage data source configuration before execution."""
+        usage_data_source = self.config.usage_data_source
+
+        if (
+            usage_data_source == UsageDataSource.SYSTEM_TABLES
+            and not self.proxy.warehouse_id
+        ):
+            raise ValueError(
+                "usage_data_source is set to SYSTEM_TABLES but warehouse_id is not configured. "
+                "Either set warehouse_id or use AUTO/API mode."
+            )
+
     def _get_queries(self) -> Iterable[Query]:
         try:
-            yield from self.proxy.query_history(
-                self.config.start_time, self.config.end_time
-            )
+            self._validate_usage_data_source_config()
+            usage_data_source = self.config.usage_data_source
+
+            if usage_data_source == UsageDataSource.AUTO:
+                if self.proxy.warehouse_id:
+                    logger.info(
+                        "Using system tables for usage query history (AUTO mode)"
+                    )
+                    yield from self.proxy.get_query_history_via_system_tables(
+                        self.config.start_time, self.config.end_time
+                    )
+                else:
+                    logger.info(
+                        "Using API for usage query history (AUTO mode, no warehouse)"
+                    )
+                    yield from self.proxy.query_history(
+                        self.config.start_time, self.config.end_time
+                    )
+            elif usage_data_source == UsageDataSource.SYSTEM_TABLES:
+                logger.info("Using system tables for usage query history (forced)")
+                yield from self.proxy.get_query_history_via_system_tables(
+                    self.config.start_time, self.config.end_time
+                )
+            elif usage_data_source == UsageDataSource.API:
+                logger.info("Using API for usage query history (forced)")
+                yield from self.proxy.query_history(
+                    self.config.start_time, self.config.end_time
+                )
+
         except Exception as e:
            logger.warning("Error getting queries", exc_info=True)
            self.report.report_warning("get-queries", str(e))
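
A hedged sketch of how the new selector might be set in a recipe: `usage_data_source` and `warehouse_id` come from this diff, while the surrounding recipe layout, the other field names, and the exact string form of the enum values are assumptions.

# Hypothetical unity-catalog recipe fragment (Python dict form).
recipe_source = {
    "type": "unity-catalog",
    "config": {
        "workspace_url": "https://example.cloud.databricks.com",  # assumed field name
        "token": "<redacted>",  # assumed field name
        "warehouse_id": "<sql-warehouse-id>",  # required when forcing SYSTEM_TABLES
        "usage_data_source": "SYSTEM_TABLES",  # or "AUTO" / "API"; casing assumed
    },
}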

datahub/ingestion/source/usage/clickhouse_usage.py:
@@ -85,8 +85,11 @@ class ClickHouseUsageConfig(ClickHouseConfig, BaseUsageConfig, EnvConfigMixin):
 @platform_name("ClickHouse")
 @config_class(ClickHouseUsageConfig)
 @support_status(SupportStatus.CERTIFIED)
-@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
+@capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
 @dataclasses.dataclass
 class ClickHouseUsageSource(Source):
     """

datahub/ingestion/source/usage/starburst_trino_usage.py:
@@ -15,7 +15,9 @@ from sqlalchemy.engine import Engine
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.time_window_config import get_time_bucket
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -58,7 +60,7 @@ AggregatedDataset = GenericAggregatedDataset[TrinoTableRef]

 class TrinoConnectorInfo(BaseModel):
     partitionIds: List[str]
-    truncated: Optional[bool]
+    truncated: Optional[bool] = None


 class TrinoAccessedMetadata(BaseModel):
@@ -78,7 +80,7 @@ class TrinoJoinedAccessEvent(BaseModel):
     table: Optional[str] = None
     accessed_metadata: List[TrinoAccessedMetadata]
     starttime: datetime = Field(alias="create_time")
-    endtime: Optional[datetime] = Field(alias="end_time")
+    endtime: Optional[datetime] = Field(None, alias="end_time")


 class EnvBasedSourceBaseConfig:
@@ -112,6 +114,7 @@ class TrinoUsageReport(SourceReport):
 @platform_name("Trino")
 @config_class(TrinoUsageConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
 @dataclasses.dataclass
 class TrinoUsageSource(Source):
     """

datahub/ingestion/source/usage/usage_common.py:
@@ -18,7 +18,7 @@ import pydantic
 from pydantic.fields import Field

 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.time_window_config import (
     BaseTimeWindowConfig,
     BucketDuration,
@@ -194,13 +194,13 @@ class GenericAggregatedDataset(Generic[ResourceType]):


 class BaseUsageConfig(BaseTimeWindowConfig):
-    queries_character_limit: int = Field(
+    queries_character_limit: HiddenFromDocs[int] = Field(
+        # Hidden since we don't want to encourage people to break elasticsearch.
         default=DEFAULT_QUERIES_CHARACTER_LIMIT,
         description=(
             "Total character limit for all queries in a single usage aspect."
             " Queries will be truncated to length `queries_character_limit / top_n_queries`."
         ),
-        hidden_from_docs=True,  # Don't want to encourage people to break elasticsearch
     )

     top_n_queries: pydantic.PositiveInt = Field(
@@ -268,6 +268,7 @@ class UsageAggregator(Generic[ResourceType]):
                 user,
                 query,
                 fields,
+                user_email_pattern=self.config.user_email_pattern,
                 count=count,
             )


datahub/ingestion/source/vertexai/vertexai.py:
@@ -145,7 +145,7 @@ class PipelineMetadata:

 @platform_name("Vertex AI", id="vertexai")
 @config_class(VertexAIConfig)
-@support_status(SupportStatus.TESTING)
+@support_status(SupportStatus.INCUBATING)
 @capability(
     SourceCapability.DESCRIPTIONS,
     "Extract descriptions for Vertex AI Registered Models and Model Versions",

datahub/ingestion/source_config/pulsar.py:
@@ -2,6 +2,7 @@ import re
 from typing import Dict, List, Optional, Union
 from urllib.parse import urlparse

+import pydantic
 from pydantic import Field, validator

 from datahub.configuration.common import AllowDenyPattern
@@ -121,7 +122,8 @@ class PulsarSourceConfig(
         )
         return client_secret

-    @validator("web_service_url")
+    @pydantic.field_validator("web_service_url", mode="after")
+    @classmethod
     def web_service_url_scheme_host_port(cls, val: str) -> str:
         # Tokenize the web url
         url = urlparse(val)

datahub/ingestion/source_report/ingestion_stage.py:
@@ -1,7 +1,9 @@
 import logging
+from collections import defaultdict
 from contextlib import AbstractContextManager
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
+from enum import Enum

 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.stats_collections import TopKDict
@@ -20,31 +22,68 @@ QUERIES_EXTRACTION = "Queries Extraction"
 PROFILING = "Profiling"


+class IngestionHighStage(Enum):
+    """
+    The high-level stages at the framework level
+    Team to add more stages as needed
+    """
+
+    PROFILING = "Profiling"
+    _UNDEFINED = "Ingestion"
+
+
 @dataclass
 class IngestionStageReport:
+    ingestion_high_stage_seconds: dict[IngestionHighStage, float] = field(
+        default_factory=lambda: defaultdict(float)
+    )
     ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict)

-    def new_stage(self, stage: str) -> "IngestionStageContext":
-        return IngestionStageContext(stage, self)
+    def new_stage(
+        self, stage: str, high_stage: IngestionHighStage = IngestionHighStage._UNDEFINED
+    ) -> "IngestionStageContext":
+        return IngestionStageContext(stage, self, high_stage)
+
+    def new_high_stage(self, stage: IngestionHighStage) -> "IngestionStageContext":
+        return IngestionStageContext("", self, stage)


 @dataclass
 class IngestionStageContext(AbstractContextManager):
-    def __init__(self, stage: str, report: IngestionStageReport):
-        self._ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}"
+    def __init__(
+        self,
+        stage: str,
+        report: IngestionStageReport,
+        high_stage: IngestionHighStage = IngestionHighStage._UNDEFINED,
+    ):
+        self._high_stage = high_stage
+        self._ingestion_stage = (
+            f"{stage} at {datetime.now(timezone.utc)}" if stage else ""
+        )
         self._timer: PerfTimer = PerfTimer()
         self._report = report

     def __enter__(self) -> "IngestionStageContext":
-        logger.info(f"Stage started: {self._ingestion_stage}")
+        if self._ingestion_stage:
+            logger.info(f"Stage started: {self._ingestion_stage}")
+        else:
+            logger.info(f"High stage started: {self._high_stage.value}")
         self._timer.start()
         return self

     def __exit__(self, exc_type, exc_val, exc_tb):
         elapsed = self._timer.elapsed_seconds(digits=2)
-        logger.info(
-            f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds",
-            stacklevel=2,
-        )
-        self._report.ingestion_stage_durations[self._ingestion_stage] = elapsed
-        return None
+        if self._ingestion_stage:
+            logger.info(
+                f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds",
+                stacklevel=2,
+            )
+            # Store tuple as string to avoid serialization errors
+            key = f"({self._high_stage.value}, {self._ingestion_stage})"
+            self._report.ingestion_stage_durations[key] = elapsed
+        else:
+            logger.info(
+                f"Time spent in stage <{self._high_stage.value}>: {elapsed} seconds",
+                stacklevel=2,
+            )
+            self._report.ingestion_high_stage_seconds[self._high_stage] += elapsed
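
A small usage sketch of the extended report, using only the names defined in this hunk (the stage label is made up):

from datahub.ingestion.source_report.ingestion_stage import (
    IngestionHighStage,
    IngestionStageReport,
)

report = IngestionStageReport()

# Named sub-stage attributed to the Profiling high-level stage;
# its duration lands in ingestion_stage_durations under a "(Profiling, ...)" key.
with report.new_stage("column profiling: db.schema.table", IngestionHighStage.PROFILING):
    pass  # profiling work goes here

# Aggregate-only timing for a high-level stage, with no named sub-stage;
# its duration is accumulated in ingestion_high_stage_seconds.
with report.new_high_stage(IngestionHighStage.PROFILING):
    pass  # more profiling work

print(report.ingestion_high_stage_seconds[IngestionHighStage.PROFILING])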

datahub/ingestion/transformer/add_dataset_ownership.py:
@@ -71,8 +71,24 @@ class AddDatasetOwnership(OwnershipTransformer):

         server_ownership = graph.get_ownership(entity_urn=urn)
         if server_ownership:
-            owners = {owner.owner: owner for owner in server_ownership.owners}
-            owners.update({owner.owner: owner for owner in mce_ownership.owners})
+            owners = {
+                (
+                    owner.owner,
+                    owner.type,
+                    owner.typeUrn,
+                ): owner
+                for owner in server_ownership.owners
+            }
+            owners.update(
+                {
+                    (
+                        owner.owner,
+                        owner.type,
+                        owner.typeUrn,
+                    ): owner
+                    for owner in mce_ownership.owners
+                }
+            )
             mce_ownership.owners = list(owners.values())

         return mce_ownership
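
The effect of keying on (owner, type, typeUrn) instead of owner alone, sketched with plain OwnerClass instances (the URNs and ownership types are placeholders):

from datahub.metadata.schema_classes import OwnerClass, OwnershipTypeClass

server_owners = [
    OwnerClass(owner="urn:li:corpuser:jdoe", type=OwnershipTypeClass.TECHNICAL_OWNER)
]
patch_owners = [
    OwnerClass(owner="urn:li:corpuser:jdoe", type=OwnershipTypeClass.BUSINESS_OWNER)
]

# Old behaviour: keyed by owner URN only, so the second entry replaced the first.
# New behaviour: both survive, because the key now includes type and typeUrn.
merged = {(o.owner, o.type, o.typeUrn): o for o in server_owners}
merged.update({(o.owner, o.type, o.typeUrn): o for o in patch_owners})
assert len(merged) == 2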

datahub/ingestion/transformer/base_transformer.py:
@@ -281,11 +281,14 @@ class BaseTransformer(Transformer, metaclass=ABCMeta):
                 )
             )

-            record_metadata = _update_work_unit_id(
-                envelope=envelope,
-                aspect_name=mcp.aspect.get_aspect_name(),  # type: ignore
-                urn=mcp.entityUrn,
-            )
+            if mcp.entityUrn:
+                record_metadata = _update_work_unit_id(
+                    envelope=envelope,
+                    aspect_name=mcp.aspect.get_aspect_name(),  # type: ignore
+                    urn=mcp.entityUrn,
+                )
+            else:
+                record_metadata = envelope.metadata.copy()

             yield RecordEnvelope(
                 record=mcp,

datahub/ingestion/transformer/set_browse_path.py (new file):
@@ -0,0 +1,112 @@
+import re
+from collections import defaultdict
+from typing import Dict, List, Optional, cast
+
+from datahub.configuration.common import (
+    TransformerSemanticsConfigModel,
+)
+from datahub.emitter.mce_builder import Aspect
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.transformer.base_transformer import (
+    BaseTransformer,
+    SingleAspectTransformer,
+)
+from datahub.metadata.schema_classes import (
+    BrowsePathEntryClass,
+    BrowsePathsV2Class,
+)
+from datahub.utilities.urns.urn import guess_entity_type
+
+
+class SetBrowsePathTransformerConfig(TransformerSemanticsConfigModel):
+    path: List[str]
+
+
+class SetBrowsePathTransformer(BaseTransformer, SingleAspectTransformer):
+    ctx: PipelineContext
+    config: SetBrowsePathTransformerConfig
+
+    def __init__(self, config: SetBrowsePathTransformerConfig, ctx: PipelineContext):
+        super().__init__()
+        self.ctx = ctx
+        self.config = config
+
+    def aspect_name(self) -> str:
+        return "browsePathsV2"
+
+    def entity_types(self) -> List[str]:
+        # This is an arbitrary list, might be adjusted if it makes sense. It might be reasonable to make it configurable
+        return ["dataset", "dataJob", "dataFlow", "chart", "dashboard", "container"]
+
+    @classmethod
+    def create(
+        cls, config_dict: dict, ctx: PipelineContext
+    ) -> "SetBrowsePathTransformer":
+        config = SetBrowsePathTransformerConfig.parse_obj(config_dict)
+        return cls(config, ctx)
+
+    @staticmethod
+    def _build_model(existing_browse_paths: BrowsePathsV2Class) -> Dict[str, List[str]]:
+        template_vars: Dict[str, List[str]] = {}
+        model: Dict[str, List[str]] = defaultdict(list)
+        for entry in existing_browse_paths.path or []:
+            if entry.urn:
+                entity_type = guess_entity_type(entry.urn)
+                model[entity_type].append(entry.urn)
+
+        for entity_type, urns in model.items():
+            template_vars[f"{entity_type}[*]"] = urns
+            for i, urn in enumerate(urns):
+                template_vars[f"{entity_type}[{i}]"] = [urn]
+
+        return template_vars
+
+    @classmethod
+    def _expand_nodes(
+        cls, templates: List[str], template_vars: Dict[str, List[str]]
+    ) -> BrowsePathsV2Class:
+        expanded_nodes: List[str] = []
+        for node in templates:
+            resolved_nodes = cls._resolve_template_to_nodes(node, template_vars)
+            expanded_nodes.extend(resolved_nodes)
+
+        processed_entries: List[BrowsePathEntryClass] = []
+        for node in expanded_nodes:
+            if not node or node.isspace():
+                continue
+            processed_entries.append(
+                BrowsePathEntryClass(
+                    id=node, urn=node if node.startswith("urn:") else None
+                )
+            )
+        return BrowsePathsV2Class(path=processed_entries)
+
+    def transform_aspect(
+        self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect]
+    ) -> Optional[Aspect]:
+        template_vars: Dict[str, List[str]] = {}
+        if aspect is not None:
+            assert isinstance(aspect, BrowsePathsV2Class)
+            template_vars = self._build_model(aspect)
+        new_browse_paths: BrowsePathsV2Class = self._expand_nodes(
+            self.config.path, template_vars
+        )
+        if aspect is not None and not self.config.replace_existing:
+            for node in aspect.path:
+                new_browse_paths.path.append(node)
+
+        return cast(Aspect, new_browse_paths)
+
+    @staticmethod
+    def _resolve_template_to_nodes(
+        template_str: str, template_vars: Dict[str, List[str]]
+    ) -> List[str]:
+        # This mechanism can be made simpler (match against known variables only) or more complex (e.g. by using a
+        # proper templating engine, like jinja).
+        template_str = template_str.strip()
+        var_pattern = re.findall(r"^\$([a-zA-Z]+\[[0-9*]+]$)", template_str)
+
+        if not var_pattern:
+            return [template_str]
+
+        return template_vars.get(var_pattern[0], [])
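
A direct-call sketch of the template expansion (not a full pipeline run); the URNs and path entries are made up, and only names defined in this file plus PipelineContext and the browse-path classes are used:

from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.transformer.set_browse_path import (
    SetBrowsePathTransformer,
    SetBrowsePathTransformerConfig,
)
from datahub.metadata.schema_classes import BrowsePathEntryClass, BrowsePathsV2Class

transformer = SetBrowsePathTransformer(
    config=SetBrowsePathTransformerConfig(path=["prod", "$container[*]"]),
    ctx=PipelineContext(run_id="browse-path-sketch"),
)

existing = BrowsePathsV2Class(
    path=[BrowsePathEntryClass(id="urn:li:container:abc", urn="urn:li:container:abc")]
)

# "prod" becomes a literal node; "$container[*]" expands to the container URNs found
# in the existing aspect. With replace_existing left at its default, the original
# entries are appended after the new ones.
new_aspect = transformer.transform_aspect(
    entity_urn="urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",
    aspect_name="browsePathsV2",
    aspect=existing,  # type: ignore[arg-type]
)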

datahub/integrations/assertion/snowflake/compiler.py:
@@ -84,9 +84,10 @@ class SnowflakeAssertionCompiler(AssertionCompiler):

         dmf_definitions_path = self.output_dir / DMF_DEFINITIONS_FILE_NAME
         dmf_associations_path = self.output_dir / DMF_ASSOCIATIONS_FILE_NAME
-        with (dmf_definitions_path).open("w") as definitions, (
-            dmf_associations_path
-        ).open("w") as associations:
+        with (
+            (dmf_definitions_path).open("w") as definitions,
+            (dmf_associations_path).open("w") as associations,
+        ):
             for assertion_spec in assertion_config_spec.assertions:
                 result.report.num_processed += 1
                 try: