acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

This version of acryl-datahub has been flagged as potentially problematic.

Files changed (414)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2615 -2547
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +412 -338
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +5 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +1 -1
  7. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  8. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  9. datahub/api/entities/dataset/dataset.py +26 -23
  10. datahub/api/entities/external/__init__.py +0 -0
  11. datahub/api/entities/external/external_entities.py +724 -0
  12. datahub/api/entities/external/external_tag.py +147 -0
  13. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  14. datahub/api/entities/external/restricted_text.py +172 -0
  15. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  16. datahub/api/entities/forms/forms.py +3 -3
  17. datahub/api/entities/structuredproperties/structuredproperties.py +4 -4
  18. datahub/api/graphql/operation.py +10 -6
  19. datahub/cli/check_cli.py +88 -7
  20. datahub/cli/cli_utils.py +63 -0
  21. datahub/cli/config_utils.py +18 -10
  22. datahub/cli/container_cli.py +5 -0
  23. datahub/cli/delete_cli.py +125 -27
  24. datahub/cli/docker_check.py +110 -14
  25. datahub/cli/docker_cli.py +153 -229
  26. datahub/cli/exists_cli.py +0 -2
  27. datahub/cli/get_cli.py +0 -2
  28. datahub/cli/graphql_cli.py +1422 -0
  29. datahub/cli/iceberg_cli.py +5 -0
  30. datahub/cli/ingest_cli.py +3 -15
  31. datahub/cli/migrate.py +2 -0
  32. datahub/cli/put_cli.py +1 -4
  33. datahub/cli/quickstart_versioning.py +53 -10
  34. datahub/cli/specific/assertions_cli.py +37 -6
  35. datahub/cli/specific/datacontract_cli.py +54 -7
  36. datahub/cli/specific/dataproduct_cli.py +2 -15
  37. datahub/cli/specific/dataset_cli.py +1 -8
  38. datahub/cli/specific/forms_cli.py +0 -4
  39. datahub/cli/specific/group_cli.py +0 -2
  40. datahub/cli/specific/structuredproperties_cli.py +1 -4
  41. datahub/cli/specific/user_cli.py +172 -3
  42. datahub/cli/state_cli.py +0 -2
  43. datahub/cli/timeline_cli.py +0 -2
  44. datahub/configuration/common.py +40 -1
  45. datahub/configuration/connection_resolver.py +5 -2
  46. datahub/configuration/env_vars.py +331 -0
  47. datahub/configuration/import_resolver.py +7 -4
  48. datahub/configuration/kafka.py +21 -1
  49. datahub/configuration/pydantic_migration_helpers.py +6 -13
  50. datahub/configuration/source_common.py +3 -2
  51. datahub/configuration/validate_field_deprecation.py +5 -2
  52. datahub/configuration/validate_field_removal.py +8 -2
  53. datahub/configuration/validate_field_rename.py +6 -5
  54. datahub/configuration/validate_multiline_string.py +5 -2
  55. datahub/emitter/mce_builder.py +8 -4
  56. datahub/emitter/rest_emitter.py +103 -30
  57. datahub/entrypoints.py +6 -3
  58. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +297 -1
  59. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  60. datahub/ingestion/api/decorators.py +15 -3
  61. datahub/ingestion/api/report.py +381 -3
  62. datahub/ingestion/api/sink.py +27 -2
  63. datahub/ingestion/api/source.py +165 -58
  64. datahub/ingestion/api/source_protocols.py +23 -0
  65. datahub/ingestion/autogenerated/__init__.py +0 -0
  66. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  67. datahub/ingestion/autogenerated/lineage.json +402 -0
  68. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  69. datahub/ingestion/extractor/schema_util.py +13 -4
  70. datahub/ingestion/glossary/classification_mixin.py +5 -0
  71. datahub/ingestion/graph/client.py +330 -25
  72. datahub/ingestion/graph/config.py +3 -2
  73. datahub/ingestion/graph/filters.py +30 -11
  74. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  75. datahub/ingestion/run/pipeline.py +81 -11
  76. datahub/ingestion/run/pipeline_config.py +2 -2
  77. datahub/ingestion/sink/datahub_kafka.py +1 -0
  78. datahub/ingestion/sink/datahub_rest.py +13 -5
  79. datahub/ingestion/sink/file.py +1 -0
  80. datahub/ingestion/source/abs/config.py +1 -1
  81. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  82. datahub/ingestion/source/abs/source.py +15 -30
  83. datahub/ingestion/source/aws/aws_common.py +185 -13
  84. datahub/ingestion/source/aws/glue.py +517 -244
  85. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  86. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  87. datahub/ingestion/source/aws/tag_entities.py +270 -0
  88. datahub/ingestion/source/azure/azure_common.py +3 -3
  89. datahub/ingestion/source/bigquery_v2/bigquery.py +67 -24
  90. datahub/ingestion/source/bigquery_v2/bigquery_config.py +47 -19
  91. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
  92. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -0
  93. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  94. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  95. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  96. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  97. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  98. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  99. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  100. datahub/ingestion/source/cassandra/cassandra.py +6 -8
  101. datahub/ingestion/source/cassandra/cassandra_api.py +17 -1
  102. datahub/ingestion/source/cassandra/cassandra_config.py +5 -0
  103. datahub/ingestion/source/cassandra/cassandra_profiling.py +7 -6
  104. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  105. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  106. datahub/ingestion/source/common/subtypes.py +53 -0
  107. datahub/ingestion/source/data_lake_common/data_lake_utils.py +37 -0
  108. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  109. datahub/ingestion/source/data_lake_common/path_spec.py +72 -43
  110. datahub/ingestion/source/datahub/config.py +12 -9
  111. datahub/ingestion/source/datahub/datahub_database_reader.py +26 -11
  112. datahub/ingestion/source/datahub/datahub_source.py +10 -0
  113. datahub/ingestion/source/dbt/dbt_cloud.py +16 -5
  114. datahub/ingestion/source/dbt/dbt_common.py +224 -9
  115. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  116. datahub/ingestion/source/debug/__init__.py +0 -0
  117. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  118. datahub/ingestion/source/delta_lake/config.py +9 -5
  119. datahub/ingestion/source/delta_lake/source.py +8 -0
  120. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  121. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  122. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  123. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  124. datahub/ingestion/source/dremio/dremio_source.py +132 -98
  125. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  126. datahub/ingestion/source/dynamodb/dynamodb.py +11 -8
  127. datahub/ingestion/source/excel/__init__.py +0 -0
  128. datahub/ingestion/source/excel/config.py +92 -0
  129. datahub/ingestion/source/excel/excel_file.py +539 -0
  130. datahub/ingestion/source/excel/profiling.py +308 -0
  131. datahub/ingestion/source/excel/report.py +49 -0
  132. datahub/ingestion/source/excel/source.py +662 -0
  133. datahub/ingestion/source/excel/util.py +18 -0
  134. datahub/ingestion/source/feast.py +8 -10
  135. datahub/ingestion/source/file.py +3 -0
  136. datahub/ingestion/source/fivetran/config.py +66 -7
  137. datahub/ingestion/source/fivetran/fivetran.py +227 -43
  138. datahub/ingestion/source/fivetran/fivetran_log_api.py +37 -8
  139. datahub/ingestion/source/fivetran/fivetran_query.py +51 -29
  140. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  141. datahub/ingestion/source/fivetran/response_models.py +97 -0
  142. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  143. datahub/ingestion/source/gcs/gcs_source.py +32 -4
  144. datahub/ingestion/source/ge_data_profiler.py +108 -31
  145. datahub/ingestion/source/ge_profiling_config.py +26 -11
  146. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  147. datahub/ingestion/source/grafana/field_utils.py +307 -0
  148. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  149. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  150. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  151. datahub/ingestion/source/grafana/lineage.py +202 -0
  152. datahub/ingestion/source/grafana/models.py +137 -0
  153. datahub/ingestion/source/grafana/report.py +90 -0
  154. datahub/ingestion/source/grafana/types.py +16 -0
  155. datahub/ingestion/source/hex/api.py +28 -1
  156. datahub/ingestion/source/hex/hex.py +16 -5
  157. datahub/ingestion/source/hex/mapper.py +16 -2
  158. datahub/ingestion/source/hex/model.py +2 -0
  159. datahub/ingestion/source/hex/query_fetcher.py +1 -1
  160. datahub/ingestion/source/iceberg/iceberg.py +123 -59
  161. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  162. datahub/ingestion/source/identity/azure_ad.py +1 -1
  163. datahub/ingestion/source/identity/okta.py +1 -14
  164. datahub/ingestion/source/kafka/kafka.py +16 -0
  165. datahub/ingestion/source/kafka_connect/common.py +2 -2
  166. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  167. datahub/ingestion/source/kafka_connect/source_connectors.py +62 -4
  168. datahub/ingestion/source/looker/looker_common.py +148 -79
  169. datahub/ingestion/source/looker/looker_config.py +15 -4
  170. datahub/ingestion/source/looker/looker_constant.py +4 -0
  171. datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
  172. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  173. datahub/ingestion/source/looker/looker_source.py +503 -547
  174. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  175. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  176. datahub/ingestion/source/looker/lookml_config.py +31 -3
  177. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  178. datahub/ingestion/source/looker/lookml_source.py +96 -117
  179. datahub/ingestion/source/looker/view_upstream.py +494 -1
  180. datahub/ingestion/source/metabase.py +32 -6
  181. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  182. datahub/ingestion/source/metadata/lineage.py +9 -9
  183. datahub/ingestion/source/mlflow.py +12 -2
  184. datahub/ingestion/source/mock_data/__init__.py +0 -0
  185. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  186. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  187. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  188. datahub/ingestion/source/mode.py +26 -5
  189. datahub/ingestion/source/mongodb.py +11 -1
  190. datahub/ingestion/source/neo4j/neo4j_source.py +83 -144
  191. datahub/ingestion/source/nifi.py +2 -2
  192. datahub/ingestion/source/openapi.py +1 -1
  193. datahub/ingestion/source/powerbi/config.py +47 -21
  194. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  195. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  196. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  197. datahub/ingestion/source/powerbi/powerbi.py +10 -6
  198. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  199. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  200. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  201. datahub/ingestion/source/preset.py +3 -3
  202. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  203. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  204. datahub/ingestion/source/redash.py +1 -1
  205. datahub/ingestion/source/redshift/config.py +15 -9
  206. datahub/ingestion/source/redshift/datashares.py +1 -1
  207. datahub/ingestion/source/redshift/lineage.py +386 -687
  208. datahub/ingestion/source/redshift/query.py +23 -19
  209. datahub/ingestion/source/redshift/redshift.py +52 -111
  210. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  211. datahub/ingestion/source/redshift/report.py +0 -2
  212. datahub/ingestion/source/redshift/usage.py +6 -5
  213. datahub/ingestion/source/s3/report.py +4 -2
  214. datahub/ingestion/source/s3/source.py +449 -248
  215. datahub/ingestion/source/sac/sac.py +3 -1
  216. datahub/ingestion/source/salesforce.py +28 -13
  217. datahub/ingestion/source/schema/json_schema.py +14 -14
  218. datahub/ingestion/source/schema_inference/object.py +22 -6
  219. datahub/ingestion/source/sigma/data_classes.py +3 -0
  220. datahub/ingestion/source/sigma/sigma.py +7 -1
  221. datahub/ingestion/source/slack/slack.py +10 -16
  222. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  223. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  224. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  225. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  226. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  227. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  228. datahub/ingestion/source/snowflake/constants.py +3 -0
  229. datahub/ingestion/source/snowflake/snowflake_config.py +76 -23
  230. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -8
  231. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +19 -6
  232. datahub/ingestion/source/snowflake/snowflake_queries.py +464 -97
  233. datahub/ingestion/source/snowflake/snowflake_query.py +77 -5
  234. datahub/ingestion/source/snowflake/snowflake_report.py +1 -2
  235. datahub/ingestion/source/snowflake/snowflake_schema.py +352 -16
  236. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +51 -10
  237. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  238. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  239. datahub/ingestion/source/snowflake/snowflake_utils.py +36 -15
  240. datahub/ingestion/source/snowflake/snowflake_v2.py +39 -4
  241. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  242. datahub/ingestion/source/sql/athena.py +217 -25
  243. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  244. datahub/ingestion/source/sql/clickhouse.py +24 -8
  245. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  246. datahub/ingestion/source/sql/druid.py +2 -2
  247. datahub/ingestion/source/sql/hana.py +3 -1
  248. datahub/ingestion/source/sql/hive.py +4 -3
  249. datahub/ingestion/source/sql/hive_metastore.py +19 -20
  250. datahub/ingestion/source/sql/mariadb.py +0 -1
  251. datahub/ingestion/source/sql/mssql/job_models.py +3 -1
  252. datahub/ingestion/source/sql/mssql/source.py +336 -57
  253. datahub/ingestion/source/sql/mysql.py +154 -4
  254. datahub/ingestion/source/sql/oracle.py +5 -5
  255. datahub/ingestion/source/sql/postgres.py +142 -6
  256. datahub/ingestion/source/sql/presto.py +2 -1
  257. datahub/ingestion/source/sql/sql_common.py +281 -49
  258. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  259. datahub/ingestion/source/sql/sql_types.py +22 -0
  260. datahub/ingestion/source/sql/sqlalchemy_uri.py +39 -7
  261. datahub/ingestion/source/sql/teradata.py +1028 -245
  262. datahub/ingestion/source/sql/trino.py +11 -1
  263. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  264. datahub/ingestion/source/sql/vertica.py +14 -7
  265. datahub/ingestion/source/sql_queries.py +219 -121
  266. datahub/ingestion/source/state/checkpoint.py +8 -29
  267. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  268. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  269. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  270. datahub/ingestion/source/superset.py +314 -67
  271. datahub/ingestion/source/tableau/tableau.py +135 -59
  272. datahub/ingestion/source/tableau/tableau_common.py +9 -2
  273. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  274. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  275. datahub/ingestion/source/unity/config.py +160 -40
  276. datahub/ingestion/source/unity/connection.py +61 -0
  277. datahub/ingestion/source/unity/connection_test.py +1 -0
  278. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  279. datahub/ingestion/source/unity/proxy.py +794 -51
  280. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  281. datahub/ingestion/source/unity/proxy_types.py +36 -2
  282. datahub/ingestion/source/unity/report.py +15 -3
  283. datahub/ingestion/source/unity/source.py +465 -131
  284. datahub/ingestion/source/unity/tag_entities.py +197 -0
  285. datahub/ingestion/source/unity/usage.py +46 -4
  286. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  287. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -2
  288. datahub/ingestion/source/usage/usage_common.py +4 -3
  289. datahub/ingestion/source/vertexai/vertexai.py +1 -1
  290. datahub/ingestion/source_config/pulsar.py +3 -1
  291. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  292. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  293. datahub/ingestion/transformer/base_transformer.py +8 -5
  294. datahub/ingestion/transformer/set_browse_path.py +112 -0
  295. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  296. datahub/metadata/_internal_schema_classes.py +6806 -4871
  297. datahub/metadata/_urns/urn_defs.py +1767 -1539
  298. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  299. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  300. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  301. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  302. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  303. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +6 -0
  304. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  305. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  306. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  307. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  308. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  309. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  310. datahub/metadata/schema.avsc +18395 -16979
  311. datahub/metadata/schemas/Actors.avsc +38 -1
  312. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  313. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  314. datahub/metadata/schemas/Applications.avsc +38 -0
  315. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  316. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  317. datahub/metadata/schemas/ChartKey.avsc +1 -0
  318. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  319. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  320. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  321. datahub/metadata/schemas/CorpUserSettings.avsc +50 -0
  322. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  323. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  324. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  325. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  326. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  327. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  328. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  329. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  330. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  331. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  332. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  333. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  334. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  335. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  336. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  337. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  338. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  339. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  340. datahub/metadata/schemas/DomainKey.avsc +2 -1
  341. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  342. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  343. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  344. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  345. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  346. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  347. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  348. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  349. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  350. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  351. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  352. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  353. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  354. datahub/metadata/schemas/MetadataChangeEvent.avsc +151 -47
  355. datahub/metadata/schemas/MetadataChangeLog.avsc +62 -44
  356. datahub/metadata/schemas/MetadataChangeProposal.avsc +61 -0
  357. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  358. datahub/metadata/schemas/Operation.avsc +4 -2
  359. datahub/metadata/schemas/Ownership.avsc +69 -0
  360. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  361. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  362. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  363. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  364. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  365. datahub/metadata/schemas/SystemMetadata.avsc +61 -0
  366. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  367. datahub/sdk/__init__.py +2 -0
  368. datahub/sdk/_all_entities.py +7 -0
  369. datahub/sdk/_shared.py +249 -5
  370. datahub/sdk/chart.py +386 -0
  371. datahub/sdk/container.py +7 -0
  372. datahub/sdk/dashboard.py +453 -0
  373. datahub/sdk/dataflow.py +7 -0
  374. datahub/sdk/datajob.py +45 -13
  375. datahub/sdk/dataset.py +56 -2
  376. datahub/sdk/entity_client.py +111 -9
  377. datahub/sdk/lineage_client.py +663 -82
  378. datahub/sdk/main_client.py +50 -16
  379. datahub/sdk/mlmodel.py +120 -38
  380. datahub/sdk/mlmodelgroup.py +7 -0
  381. datahub/sdk/search_client.py +7 -3
  382. datahub/sdk/search_filters.py +304 -36
  383. datahub/secret/datahub_secret_store.py +3 -0
  384. datahub/secret/environment_secret_store.py +29 -0
  385. datahub/secret/file_secret_store.py +49 -0
  386. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  387. datahub/specific/aspect_helpers/siblings.py +73 -0
  388. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  389. datahub/specific/chart.py +1 -1
  390. datahub/specific/datajob.py +15 -1
  391. datahub/specific/dataproduct.py +4 -0
  392. datahub/specific/dataset.py +39 -59
  393. datahub/sql_parsing/split_statements.py +13 -0
  394. datahub/sql_parsing/sql_parsing_aggregator.py +70 -26
  395. datahub/sql_parsing/sqlglot_lineage.py +196 -42
  396. datahub/sql_parsing/sqlglot_utils.py +12 -4
  397. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  398. datahub/telemetry/telemetry.py +28 -14
  399. datahub/testing/sdk_v2_helpers.py +7 -1
  400. datahub/upgrade/upgrade.py +73 -17
  401. datahub/utilities/file_backed_collections.py +8 -9
  402. datahub/utilities/is_pytest.py +3 -2
  403. datahub/utilities/logging_manager.py +22 -6
  404. datahub/utilities/mapping.py +29 -2
  405. datahub/utilities/sample_data.py +5 -4
  406. datahub/utilities/server_config_util.py +10 -1
  407. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  408. datahub/utilities/stats_collections.py +4 -0
  409. datahub/utilities/urns/urn.py +41 -2
  410. datahub/emitter/sql_parsing_builder.py +0 -306
  411. datahub/ingestion/source/redshift/lineage_v2.py +0 -466
  412. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +0 -0
  413. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/licenses/LICENSE +0 -0
  414. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/state/redundant_run_skip_handler.py

@@ -244,3 +244,24 @@ class RedundantUsageRunSkipHandler(RedundantRunSkipHandler):
             cur_state.begin_timestamp_millis = datetime_to_ts_millis(start_time)
             cur_state.end_timestamp_millis = datetime_to_ts_millis(end_time)
             cur_state.bucket_duration = bucket_duration
+
+
+class RedundantQueriesRunSkipHandler(RedundantRunSkipHandler):
+    """
+    Handler for stateful ingestion of queries v2 extraction.
+    Manages the time window for audit log extraction that combines
+    lineage, usage, operations, and queries.
+    """
+
+    def get_job_name_suffix(self):
+        return "_audit_window"
+
+    def update_state(
+        self, start_time: datetime, end_time: datetime, bucket_duration: BucketDuration
+    ) -> None:
+        cur_checkpoint = self.get_current_checkpoint()
+        if cur_checkpoint:
+            cur_state = cast(BaseTimeWindowCheckpointState, cur_checkpoint.state)
+            cur_state.begin_timestamp_millis = datetime_to_ts_millis(start_time)
+            cur_state.end_timestamp_millis = datetime_to_ts_millis(end_time)
+            cur_state.bucket_duration = bucket_duration
datahub/ingestion/source/state/stateful_ingestion_base.py

@@ -10,6 +10,7 @@ from datahub.configuration.common import (
     ConfigModel,
     ConfigurationError,
     DynamicTypedConfig,
+    HiddenFromDocs,
 )
 from datahub.configuration.pydantic_migration_helpers import GenericModel
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
@@ -55,25 +56,21 @@ class StatefulIngestionConfig(ConfigModel):
         description="Whether or not to enable stateful ingest. "
         "Default: True if a pipeline_name is set and either a datahub-rest sink or `datahub_api` is specified, otherwise False",
     )
-    max_checkpoint_state_size: pydantic.PositiveInt = Field(
+    max_checkpoint_state_size: HiddenFromDocs[pydantic.PositiveInt] = Field(
         default=2**24,  # 16 MB
         description="The maximum size of the checkpoint state in bytes. Default is 16MB",
-        hidden_from_docs=True,
     )
-    state_provider: Optional[DynamicTypedStateProviderConfig] = Field(
+    state_provider: HiddenFromDocs[Optional[DynamicTypedStateProviderConfig]] = Field(
         default=None,
         description="The ingestion state provider configuration.",
-        hidden_from_docs=True,
     )
-    ignore_old_state: bool = Field(
+    ignore_old_state: HiddenFromDocs[bool] = Field(
         default=False,
         description="If set to True, ignores the previous checkpoint state.",
-        hidden_from_docs=True,
     )
-    ignore_new_state: bool = Field(
+    ignore_new_state: HiddenFromDocs[bool] = Field(
         default=False,
         description="If set to True, ignores the current checkpoint state.",
-        hidden_from_docs=True,
     )

     @pydantic.root_validator(skip_on_failure=True)
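The fields above move documentation visibility out of a per-field hidden_from_docs=True keyword and into the type annotation itself (HiddenFromDocs[...]). The diff does not show how HiddenFromDocs is defined in datahub.configuration.common, so the following is only a minimal sketch of how such a marker can be built on typing.Annotated; _Hidden, ExampleConfig, and is_hidden are illustrative names, not DataHub APIs.

from typing import Annotated, TypeVar, get_args, get_type_hints

import pydantic

T = TypeVar("T")


class _Hidden:
    """Marker carried in the Annotated metadata; a docs generator can look for it."""


# Assumed shape of the marker alias; HiddenFromDocs[bool] then resolves to Annotated[bool, _Hidden()].
HiddenFromDocs = Annotated[T, _Hidden()]


class ExampleConfig(pydantic.BaseModel):
    visible: int = 1
    internal: HiddenFromDocs[bool] = False  # still validated as a plain bool


def is_hidden(model: type, field_name: str) -> bool:
    # Inspect the annotation metadata instead of a Field(hidden_from_docs=...) kwarg.
    hints = get_type_hints(model, include_extras=True)
    return any(isinstance(meta, _Hidden) for meta in get_args(hints[field_name])[1:])


print(is_hidden(ExampleConfig, "internal"))  # True
print(is_hidden(ExampleConfig, "visible"))   # False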
@@ -104,7 +101,9 @@ class StatefulLineageConfigMixin(ConfigModel):
         default=True,
         description="Enable stateful lineage ingestion."
         " This will store lineage window timestamps after successful lineage ingestion. "
-        "and will not run lineage ingestion for same timestamps in subsequent run. ",
+        "and will not run lineage ingestion for same timestamps in subsequent run. "
+        "NOTE: This only works with use_queries_v2=False (legacy extraction path). "
+        "For queries v2, use enable_stateful_time_window instead.",
     )

     _store_last_lineage_extraction_timestamp = pydantic_renamed_field(
@@ -153,7 +152,9 @@ class StatefulUsageConfigMixin(BaseTimeWindowConfig):
         default=True,
         description="Enable stateful lineage ingestion."
         " This will store usage window timestamps after successful usage ingestion. "
-        "and will not run usage ingestion for same timestamps in subsequent run. ",
+        "and will not run usage ingestion for same timestamps in subsequent run. "
+        "NOTE: This only works with use_queries_v2=False (legacy extraction path). "
+        "For queries v2, use enable_stateful_time_window instead.",
     )

     _store_last_usage_extraction_timestamp = pydantic_renamed_field(
@@ -172,6 +173,30 @@ class StatefulUsageConfigMixin(BaseTimeWindowConfig):
         return values


+class StatefulTimeWindowConfigMixin(BaseTimeWindowConfig):
+    enable_stateful_time_window: bool = Field(
+        default=False,
+        description="Enable stateful time window tracking."
+        " This will store the time window after successful extraction "
+        "and adjust the time window in subsequent runs to avoid reprocessing. "
+        "NOTE: This is ONLY applicable when using queries v2 (use_queries_v2=True). "
+        "This replaces enable_stateful_lineage_ingestion and enable_stateful_usage_ingestion "
+        "for the queries v2 extraction path, since queries v2 extracts lineage, usage, operations, "
+        "and queries together from a single audit log and uses a unified time window.",
+    )
+
+    @root_validator(skip_on_failure=True)
+    def time_window_stateful_option_validator(cls, values: Dict) -> Dict:
+        sti = values.get("stateful_ingestion")
+        if not sti or not sti.enabled:
+            if values.get("enable_stateful_time_window"):
+                logger.warning(
+                    "Stateful ingestion is disabled, disabling enable_stateful_time_window config option as well"
+                )
+                values["enable_stateful_time_window"] = False
+        return values
+
+
 @dataclass
 class StatefulIngestionReport(SourceReport):
     pass
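The new mixin keeps enable_stateful_time_window coupled to stateful_ingestion.enabled via a root validator. A minimal, self-contained reproduction of that coupling, assuming pydantic v1-style validators as used in the diff; the model names here are illustrative, not DataHub classes.

from typing import Dict, Optional

import pydantic


class StatefulIngestionSettings(pydantic.BaseModel):
    enabled: bool = False


class TimeWindowConfig(pydantic.BaseModel):
    stateful_ingestion: Optional[StatefulIngestionSettings] = None
    enable_stateful_time_window: bool = False

    @pydantic.root_validator(skip_on_failure=True)
    def _require_stateful_ingestion(cls, values: Dict) -> Dict:
        # Mirrors the warn-and-disable behaviour added above: the time-window flag
        # is forced back to False when stateful ingestion itself is off.
        sti = values.get("stateful_ingestion")
        if (not sti or not sti.enabled) and values.get("enable_stateful_time_window"):
            values["enable_stateful_time_window"] = False
        return values


print(TimeWindowConfig(enable_stateful_time_window=True).enable_stateful_time_window)  # False
print(
    TimeWindowConfig(
        stateful_ingestion=StatefulIngestionSettings(enabled=True),
        enable_stateful_time_window=True,
    ).enable_stateful_time_window
)  # True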
@@ -179,7 +204,7 @@ class StatefulIngestionReport(SourceReport):

 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class StatefulIngestionSourceBase(Source):
datahub/ingestion/source/superset.py

@@ -8,9 +8,11 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

 import dateutil.parser as dp
 import requests
-from pydantic import BaseModel
-from pydantic.class_validators import root_validator, validator
+import sqlglot
+from pydantic import BaseModel, root_validator, validator
 from pydantic.fields import Field
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry

 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
@@ -75,6 +77,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaFieldDataType,
     SchemaMetadata,
     StringTypeClass,
+    TimeTypeClass,
 )
 from datahub.metadata.schema_classes import (
     AuditStampClass,
@@ -107,6 +110,12 @@ logger = logging.getLogger(__name__)

 PAGE_SIZE = 25

+# Retry configuration constants
+RETRY_MAX_TIMES = 3
+RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
+RETRY_BACKOFF_FACTOR = 1
+RETRY_ALLOWED_METHODS = ["GET"]
+

 chart_type_from_viz_type = {
     "line": ChartTypeClass.LINE,
@@ -131,8 +140,11 @@ FIELD_TYPE_MAPPING = {
     "STRING": StringTypeClass,
     "FLOAT": NumberTypeClass,
     "DATETIME": DateTypeClass,
+    "TIMESTAMP": TimeTypeClass,
     "BOOLEAN": BooleanTypeClass,
     "SQL": StringTypeClass,
+    "NUMERIC": NumberTypeClass,
+    "TEXT": StringTypeClass,
 }


@@ -149,6 +161,7 @@ class SupersetDataset(BaseModel):
     table_name: str
     changed_on_utc: Optional[str] = None
     explore_url: Optional[str] = ""
+    description: Optional[str] = ""

     @property
     def modified_dt(self) -> Optional[datetime]:
@@ -272,10 +285,11 @@ def get_filter_name(filter_obj):
 @config_class(SupersetConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(
-    SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 @capability(SourceCapability.DOMAINS, "Enabled by `domain` config to assign domain_key")
 @capability(SourceCapability.LINEAGE_COARSE, "Supported by default")
+@capability(SourceCapability.TAGS, "Supported by default")
 class SupersetSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
@@ -321,6 +335,19 @@ class SupersetSource(StatefulIngestionSourceBase):
         logger.debug("Got access token from superset")

         requests_session = requests.Session()
+
+        # Configure retry strategy for transient failures
+        retry_strategy = Retry(
+            total=RETRY_MAX_TIMES,
+            status_forcelist=RETRY_STATUS_CODES,
+            backoff_factor=RETRY_BACKOFF_FACTOR,
+            allowed_methods=RETRY_ALLOWED_METHODS,
+            raise_on_status=False,
+        )
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        requests_session.mount("http://", adapter)
+        requests_session.mount("https://", adapter)
+
         requests_session.headers.update(
             {
                 "Authorization": f"Bearer {self.access_token}",
@@ -353,8 +380,13 @@ class SupersetSource(StatefulIngestionSourceBase):
            )

            if response.status_code != 200:
-                logger.warning(f"Failed to get {entity_type} data: {response.text}")
-                continue
+                self.report.warning(
+                    title="Failed to fetch data from Superset API",
+                    message="Incomplete metadata extraction due to Superset API failure",
+                    context=f"Entity Type: {entity_type}, HTTP Status Code: {response.status_code}, Page: {current_page}. Response: {response.text}",
+                )
+                # we stop pagination for this entity type and we continue the overall ingestion
+                break

            payload = response.json()
            # Update total_items with the actual count from the response
@@ -515,6 +547,11 @@ class SupersetSource(StatefulIngestionSourceBase):
         )
         dashboard_snapshot.aspects.append(owners_info)

+        superset_tags = self._extract_and_map_tags(dashboard_data.get("tags", []))
+        tags = self._merge_tags_with_existing(dashboard_urn, superset_tags)
+        if tags:
+            dashboard_snapshot.aspects.append(tags)
+
         return dashboard_snapshot

     def _process_dashboard(self, dashboard_data: Any) -> Iterable[MetadataWorkUnit]:
@@ -633,62 +670,130 @@ class SupersetSource(StatefulIngestionSourceBase):

         return input_fields

-    def construct_chart_cll(
-        self,
-        chart_data: dict,
-        datasource_urn: Union[str, None],
-        datasource_id: Union[Any, int],
-    ) -> List[InputField]:
-        column_data: List[Union[str, dict]] = chart_data.get("form_data", {}).get(
-            "all_columns", []
-        )
-
-        # the second field represents whether its a SQL expression,
-        # false being just regular column and true being SQL col
-        chart_column_data: List[Tuple[str, bool]] = [
-            (column, False)
-            if isinstance(column, str)
-            else (column.get("label", ""), True)
-            for column in column_data
-        ]
+    def _extract_columns_from_sql(self, sql_expr: Optional[str]) -> List[str]:
+        if not sql_expr:
+            return []

-        dataset_columns: List[Tuple[str, str, str]] = []
+        try:
+            parsed_expr = sqlglot.parse_one(sql_expr)

-        # parses the superset dataset's column info, to build type and description info
-        if datasource_id:
-            dataset_info = self.get_dataset_info(datasource_id).get("result", {})
-            dataset_column_info = dataset_info.get("columns", [])
+            column_refs = set()
+            for node in parsed_expr.walk():
+                if isinstance(node, sqlglot.exp.Column):
+                    column_name = node.name
+                    column_refs.add(column_name)

-            for column in dataset_column_info:
-                col_name = column.get("column_name", "")
-                col_type = column.get("type", "")
-                col_description = column.get("description", "")
+            return list(column_refs)
+        except Exception as e:
+            self.report.warning(f"Failed to parse SQL expression '{sql_expr}': {e}")
+            return []

-                # if missing column name or column type, cannot construct the column,
-                # so we skip this column, missing description is fine
-                if col_name == "" or col_type == "":
-                    logger.info(f"could not construct column lineage for {column}")
-                    continue
+    def _process_column_item(
+        self, item: Union[str, dict], unique_columns: Dict[str, bool]
+    ) -> None:
+        """Process a single column item and add to unique_columns."""

-                dataset_columns.append((col_name, col_type, col_description))
+        def add_column(col_name: str, is_sql: bool) -> None:
+            if not col_name:
+                return
+            # Always set to False if any non-SQL seen, else keep as is_sql
+            unique_columns[col_name] = unique_columns.get(col_name, True) and is_sql
+
+        if isinstance(item, str):
+            add_column(item, False)
+        elif isinstance(item, dict):
+            if item.get("expressionType") == "SIMPLE":
+                # For metrics with SIMPLE expression type
+                add_column(item.get("column", {}).get("column_name", ""), False)
+            elif item.get("expressionType") == "SQL":
+                sql_expr = item.get("sqlExpression")
+                column_refs = self._extract_columns_from_sql(sql_expr)
+                for col in column_refs:
+                    add_column(col, False)
+                if not column_refs:
+                    add_column(item.get("label", ""), True)
+
+    def _collect_all_unique_columns(self, form_data: dict) -> Dict[str, bool]:
+        """Collect all unique column names from form_data, distinguishing SQL vs non-SQL."""
+        unique_columns: Dict[str, bool] = {}
+
+        # Process regular columns
+        for column in form_data.get("all_columns", []):
+            self._process_column_item(column, unique_columns)
+
+        # Process metrics
+        # For charts with a single metric, the metric is stored in the form_data as a string in the 'metric' key
+        # For charts with multiple metrics, the metrics are stored in the form_data as a list of strings in the 'metrics' key
+        if "metric" in form_data:
+            metrics_data = [form_data.get("metric")]
         else:
-            # if no datasource id, cannot build cll, just return
+            metrics_data = form_data.get("metrics", [])
+
+        for metric in metrics_data:
+            if metric is not None:
+                self._process_column_item(metric, unique_columns)
+
+        # Process group by columns
+        for group in form_data.get("groupby", []):
+            self._process_column_item(group, unique_columns)
+
+        # Process x-axis columns
+        x_axis_data = form_data.get("x_axis")
+        if x_axis_data is not None:
+            self._process_column_item(x_axis_data, unique_columns)
+
+        return unique_columns
+
+    def _fetch_dataset_columns(
+        self, datasource_id: Union[Any, int]
+    ) -> List[Tuple[str, str, str]]:
+        """Fetch dataset columns and metrics from Superset API."""
+        if not datasource_id:
             logger.warning(
                 "no datasource id was found, cannot build column level lineage"
             )
             return []

+        dataset_info = self.get_dataset_info(datasource_id).get("result", {})
+        dataset_column_info = dataset_info.get("columns", [])
+        dataset_metric_info = dataset_info.get("metrics", [])
+
+        dataset_columns: List[Tuple[str, str, str]] = []
+        for column in dataset_column_info:
+            col_name = column.get("column_name", "")
+            col_type = column.get("type", "")
+            col_description = column.get("description", "")
+
+            if col_name == "" or col_type == "":
+                logger.info(f"could not construct column lineage for {column}")
+                continue
+
+            dataset_columns.append((col_name, col_type, col_description))
+
+        for metric in dataset_metric_info:
+            metric_name = metric.get("metric_name", "")
+            metric_type = metric.get("metric_type", "")
+            metric_description = metric.get("description", "")
+
+            if metric_name == "" or metric_type == "":
+                logger.info(f"could not construct metric lineage for {metric}")
+                continue
+
+            dataset_columns.append((metric_name, metric_type, metric_description))
+
+        return dataset_columns
+
+    def _match_chart_columns_with_dataset(
+        self,
+        unique_chart_columns: Dict[str, bool],
+        dataset_columns: List[Tuple[str, str, str]],
+    ) -> List[Tuple[str, str, str]]:
+        """Match chart columns with dataset columns, preserving SQL/non-SQL status."""
         chart_columns: List[Tuple[str, str, str]] = []
-        for chart_col in chart_column_data:
-            chart_col_name, is_sql = chart_col
+
+        for chart_col_name, is_sql in unique_chart_columns.items():
             if is_sql:
-                chart_columns.append(
-                    (
-                        chart_col_name,
-                        "SQL",
-                        "",
-                    )
-                )
+                chart_columns.append((chart_col_name, "SQL", ""))
                 continue

             # find matching upstream column
@@ -699,13 +804,36 @@ class SupersetSource(StatefulIngestionSourceBase):
                if dataset_col_name == chart_col_name:
                    chart_columns.append(
                        (chart_col_name, dataset_col_type, dataset_col_description)
-                    )  # column name, column type, description
+                    )
                    break
-
-            # if no matching upstream column was found
-            if len(chart_columns) == 0 or chart_columns[-1][0] != chart_col_name:
+            else:
                chart_columns.append((chart_col_name, "", ""))

+        return chart_columns
+
+    def construct_chart_cll(
+        self,
+        chart_data: dict,
+        datasource_urn: Union[str, None],
+        datasource_id: Union[Any, int],
+    ) -> List[InputField]:
+        """Construct column-level lineage for a chart."""
+        form_data = chart_data.get("form_data", {})
+
+        # Extract and process all columns in one go
+        unique_columns = self._collect_all_unique_columns(form_data)
+
+        # Fetch dataset columns
+        dataset_columns = self._fetch_dataset_columns(datasource_id)
+        if not dataset_columns:
+            return []
+
+        # Match chart columns with dataset columns
+        chart_columns = self._match_chart_columns_with_dataset(
+            unique_columns, dataset_columns
+        )
+
+        # Build input fields
         return self.build_input_fields(chart_columns, datasource_urn)

     def construct_chart_from_chart_data(
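A standalone illustration of the sqlglot-based column extraction that _extract_columns_from_sql performs on SQL metric and column expressions. The expression below is made up, and find_all is used instead of the explicit walk() loop from the source, but it visits the same Column nodes.

from sqlglot import exp, parse_one

sql_expr = "CASE WHEN status = 'closed' THEN amount ELSE 0 END"
parsed = parse_one(sql_expr)

# Collect every column referenced anywhere in the parsed expression tree.
columns = sorted({node.name for node in parsed.find_all(exp.Column)})
print(columns)  # ['amount', 'status']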
@@ -822,6 +950,12 @@ class SupersetSource(StatefulIngestionSourceBase):
             lastModified=last_modified,
         )
         chart_snapshot.aspects.append(owners_info)
+
+        superset_tags = self._extract_and_map_tags(chart_data.get("tags", []))
+        tags = self._merge_tags_with_existing(chart_urn, superset_tags)
+        if tags:
+            chart_snapshot.aspects.append(tags)
+
         yield MetadataWorkUnit(
             id=chart_urn, mce=MetadataChangeEvent(proposedSnapshot=chart_snapshot)
         )
@@ -966,7 +1100,27 @@ class SupersetSource(StatefulIngestionSourceBase):
                 fieldPath=col.get("column_name", ""),
                 type=SchemaFieldDataType(data_type),
                 nativeDataType="",
-                description=col.get("column_name", ""),
+                description=col.get("description") or col.get("column_name", ""),
+                nullable=True,
+            )
+            schema_fields.append(field)
+        return schema_fields
+
+    def gen_metric_schema_fields(
+        self, metric_data: List[Dict[str, Any]]
+    ) -> List[SchemaField]:
+        schema_fields: List[SchemaField] = []
+        for metric in metric_data:
+            metric_type = metric.get("metric_type", "")
+            data_type = resolve_sql_type(metric_type)
+            if data_type is None:
+                data_type = NullType()
+
+            field = SchemaField(
+                fieldPath=metric.get("metric_name", ""),
+                type=SchemaFieldDataType(data_type),
+                nativeDataType=metric_type or "",
+                description=metric.get("description", ""),
                 nullable=True,
             )
             schema_fields.append(field)
@@ -978,13 +1132,18 @@ class SupersetSource(StatefulIngestionSourceBase):
     ) -> SchemaMetadata:
         dataset_response = dataset_response.get("result", {})
         column_data = dataset_response.get("columns", [])
+        metric_data = dataset_response.get("metrics", [])
+
+        column_fields = self.gen_schema_fields(column_data)
+        metric_fields = self.gen_metric_schema_fields(metric_data)
+
         schema_metadata = SchemaMetadata(
             schemaName=dataset_response.get("table_name", ""),
             platform=make_data_platform_urn(self.platform),
             version=0,
             hash="",
             platformSchema=MySqlDDL(tableSchema=""),
-            fields=self.gen_schema_fields(column_data),
+            fields=column_fields + metric_fields,
         )
         return schema_metadata

@@ -1049,6 +1208,8 @@ class SupersetSource(StatefulIngestionSourceBase):
         # To generate column level lineage, we can manually decode the metadata
         # to produce the ColumnLineageInfo
         columns = dataset_response.get("result", {}).get("columns", [])
+        metrics = dataset_response.get("result", {}).get("metrics", [])
+
         fine_grained_lineages: List[FineGrainedLineageClass] = []

         for column in columns:
@@ -1067,6 +1228,22 @@ class SupersetSource(StatefulIngestionSourceBase):
                 )
             )

+        for metric in metrics:
+            metric_name = metric.get("metric_name", "")
+            if not metric_name:
+                continue
+
+            downstream = [make_schema_field_urn(datasource_urn, metric_name)]
+            upstreams = [make_schema_field_urn(upstream_dataset, metric_name)]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
+
         upstream_lineage = UpstreamLineageClass(
             upstreams=[
                 UpstreamClass(
@@ -1087,7 +1264,7 @@ class SupersetSource(StatefulIngestionSourceBase):
         datasource_urn = self.get_datasource_urn_from_id(
             dataset_response, self.platform
         )
-        dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"
+        dataset_url = f"{self.config.display_uri}/explore/?datasource_type=table&datasource_id={dataset.id}"

         modified_actor = f"urn:li:corpuser:{self.owner_info.get((dataset_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
         now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
@@ -1144,21 +1321,22 @@ class SupersetSource(StatefulIngestionSourceBase):

         dataset_info = DatasetPropertiesClass(
             name=dataset.table_name,
-            description="",
+            description=dataset.description or "",
             externalUrl=dataset_url,
             lastModified=TimeStamp(time=modified_ts),
         )
-        global_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
-
-        aspects_items: List[Any] = []
-        aspects_items.extend(
-            [
-                self.gen_schema_metadata(dataset_response),
-                dataset_info,
-                upstream_lineage,
-                global_tags,
-            ]
-        )
+
+        dataset_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
+        tags = self._merge_tags_with_existing(datasource_urn, dataset_tags)
+
+        aspects_items: List[Any] = [
+            self.gen_schema_metadata(dataset_response),
+            dataset_info,
+            upstream_lineage,
+        ]
+
+        if tags:
+            aspects_items.append(tags)

         dataset_snapshot = DatasetSnapshot(
             urn=datasource_urn,
@@ -1180,6 +1358,75 @@ class SupersetSource(StatefulIngestionSourceBase):

         return dataset_snapshot

+    def _extract_and_map_tags(
+        self, raw_tags: List[Dict[str, Any]]
+    ) -> Optional[GlobalTagsClass]:
+        """Extract and map Superset tags to DataHub GlobalTagsClass.
+
+        Filters out system-generated tags (type != 1) and only processes user-defined tags
+        from the Superset API response.
+
+        Args:
+            raw_tags: List of tag dictionaries from Superset API
+
+        Returns:
+            GlobalTagsClass with user-defined tags, or None if no tags found
+        """
+        user_tags = [
+            tag.get("name", "")
+            for tag in raw_tags
+            if tag.get("type") == 1 and tag.get("name")
+        ]
+
+        if not user_tags:
+            return None
+
+        tag_urns = [builder.make_tag_urn(tag) for tag in user_tags]
+        return GlobalTagsClass(
+            tags=[TagAssociationClass(tag=tag_urn) for tag_urn in tag_urns]
+        )
+
+    def _merge_tags_with_existing(
+        self, entity_urn: str, new_tags: Optional[GlobalTagsClass]
+    ) -> Optional[GlobalTagsClass]:
+        """Merge new tags with existing ones from DataHub to preserve manually added tags.
+
+        This method ensures that tags manually added via DataHub UI are not overwritten
+        during ingestion. It fetches existing tags from the graph and merges them with
+        new tags from the source system, avoiding duplicates.
+
+        Args:
+            entity_urn: URN of the entity to check for existing tags
+            new_tags: New tags to add as GlobalTagsClass object
+
+        Returns:
+            GlobalTagsClass with merged tags preserving existing ones, or None if no tags
+        """
+        if not new_tags or not new_tags.tags:
+            return None
+
+        # Fetch existing tags from DataHub
+        existing_global_tags = None
+        if self.ctx.graph:
+            existing_global_tags = self.ctx.graph.get_aspect(
+                entity_urn=entity_urn, aspect_type=GlobalTagsClass
+            )
+
+        # Merge existing tags with new ones, avoiding duplicates
+        all_tags = []
+        existing_tag_urns = set()
+
+        if existing_global_tags and existing_global_tags.tags:
+            all_tags.extend(existing_global_tags.tags)
+            existing_tag_urns = {tag.tag for tag in existing_global_tags.tags}
+
+        # Add new tags that don't already exist
+        for new_tag in new_tags.tags:
+            if new_tag.tag not in existing_tag_urns:
+                all_tags.append(new_tag)
+
+        return GlobalTagsClass(tags=all_tags) if all_tags else None
+
     def _process_dataset(self, dataset_data: Any) -> Iterable[MetadataWorkUnit]:
         dataset_name = ""
         try:
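The merge semantics of _merge_tags_with_existing can be seen in isolation with the aspect classes it uses; the sketch below keeps existing tags, skips duplicates, and appends genuinely new ones. The tag URNs are examples.

from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass

existing = GlobalTagsClass(
    tags=[TagAssociationClass(tag="urn:li:tag:pii")]  # e.g. added manually in the DataHub UI
)
from_superset = GlobalTagsClass(
    tags=[
        TagAssociationClass(tag="urn:li:tag:pii"),      # duplicate, will be skipped
        TagAssociationClass(tag="urn:li:tag:finance"),  # new tag from the source system
    ]
)

merged = list(existing.tags)
seen = {t.tag for t in merged}
merged.extend(t for t in from_superset.tags if t.tag not in seen)

print([t.tag for t in merged])  # ['urn:li:tag:pii', 'urn:li:tag:finance']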