acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.3.0.1rc9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (503)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/METADATA +2686 -2563
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/RECORD +499 -392
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/entry_points.txt +7 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  7. datahub/api/entities/assertion/assertion.py +1 -1
  8. datahub/api/entities/common/serialized_value.py +1 -1
  9. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  10. datahub/api/entities/datacontract/datacontract.py +35 -3
  11. datahub/api/entities/datajob/dataflow.py +18 -3
  12. datahub/api/entities/datajob/datajob.py +24 -4
  13. datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
  14. datahub/api/entities/dataproduct/dataproduct.py +32 -3
  15. datahub/api/entities/dataset/dataset.py +47 -72
  16. datahub/api/entities/external/__init__.py +0 -0
  17. datahub/api/entities/external/external_entities.py +724 -0
  18. datahub/api/entities/external/external_tag.py +147 -0
  19. datahub/api/entities/external/lake_formation_external_entites.py +162 -0
  20. datahub/api/entities/external/restricted_text.py +172 -0
  21. datahub/api/entities/external/unity_catalog_external_entites.py +172 -0
  22. datahub/api/entities/forms/forms.py +37 -37
  23. datahub/api/entities/structuredproperties/structuredproperties.py +6 -6
  24. datahub/api/graphql/assertion.py +1 -1
  25. datahub/api/graphql/base.py +8 -6
  26. datahub/api/graphql/operation.py +14 -10
  27. datahub/cli/check_cli.py +91 -9
  28. datahub/cli/cli_utils.py +63 -0
  29. datahub/cli/config_utils.py +20 -12
  30. datahub/cli/container_cli.py +5 -0
  31. datahub/cli/delete_cli.py +133 -34
  32. datahub/cli/docker_check.py +110 -14
  33. datahub/cli/docker_cli.py +155 -231
  34. datahub/cli/exists_cli.py +2 -3
  35. datahub/cli/get_cli.py +2 -3
  36. datahub/cli/graphql_cli.py +1422 -0
  37. datahub/cli/iceberg_cli.py +11 -5
  38. datahub/cli/ingest_cli.py +25 -26
  39. datahub/cli/migrate.py +12 -9
  40. datahub/cli/migration_utils.py +4 -3
  41. datahub/cli/put_cli.py +4 -6
  42. datahub/cli/quickstart_versioning.py +53 -10
  43. datahub/cli/specific/assertions_cli.py +39 -7
  44. datahub/cli/specific/datacontract_cli.py +57 -9
  45. datahub/cli/specific/dataproduct_cli.py +12 -24
  46. datahub/cli/specific/dataset_cli.py +31 -21
  47. datahub/cli/specific/forms_cli.py +2 -5
  48. datahub/cli/specific/group_cli.py +2 -3
  49. datahub/cli/specific/structuredproperties_cli.py +5 -7
  50. datahub/cli/specific/user_cli.py +174 -4
  51. datahub/cli/state_cli.py +2 -3
  52. datahub/cli/timeline_cli.py +2 -3
  53. datahub/configuration/common.py +46 -2
  54. datahub/configuration/connection_resolver.py +5 -2
  55. datahub/configuration/env_vars.py +331 -0
  56. datahub/configuration/import_resolver.py +7 -4
  57. datahub/configuration/kafka.py +21 -1
  58. datahub/configuration/pydantic_migration_helpers.py +6 -13
  59. datahub/configuration/source_common.py +4 -3
  60. datahub/configuration/validate_field_deprecation.py +5 -2
  61. datahub/configuration/validate_field_removal.py +8 -2
  62. datahub/configuration/validate_field_rename.py +6 -5
  63. datahub/configuration/validate_multiline_string.py +5 -2
  64. datahub/emitter/mce_builder.py +12 -8
  65. datahub/emitter/mcp.py +20 -5
  66. datahub/emitter/mcp_builder.py +12 -0
  67. datahub/emitter/request_helper.py +138 -15
  68. datahub/emitter/response_helper.py +111 -19
  69. datahub/emitter/rest_emitter.py +399 -163
  70. datahub/entrypoints.py +10 -5
  71. datahub/errors.py +12 -0
  72. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +299 -2
  73. datahub/ingestion/api/auto_work_units/auto_validate_input_fields.py +87 -0
  74. datahub/ingestion/api/common.py +9 -0
  75. datahub/ingestion/api/decorators.py +15 -3
  76. datahub/ingestion/api/report.py +381 -3
  77. datahub/ingestion/api/sink.py +27 -2
  78. datahub/ingestion/api/source.py +174 -62
  79. datahub/ingestion/api/source_helpers.py +41 -3
  80. datahub/ingestion/api/source_protocols.py +23 -0
  81. datahub/ingestion/autogenerated/__init__.py +0 -0
  82. datahub/ingestion/autogenerated/capability_summary.json +3652 -0
  83. datahub/ingestion/autogenerated/lineage.json +402 -0
  84. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  85. datahub/ingestion/extractor/schema_util.py +31 -5
  86. datahub/ingestion/glossary/classification_mixin.py +9 -2
  87. datahub/ingestion/graph/client.py +492 -55
  88. datahub/ingestion/graph/config.py +18 -2
  89. datahub/ingestion/graph/filters.py +96 -32
  90. datahub/ingestion/graph/links.py +55 -0
  91. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +21 -11
  92. datahub/ingestion/run/pipeline.py +90 -23
  93. datahub/ingestion/run/pipeline_config.py +3 -3
  94. datahub/ingestion/sink/datahub_kafka.py +1 -0
  95. datahub/ingestion/sink/datahub_rest.py +31 -23
  96. datahub/ingestion/sink/file.py +1 -0
  97. datahub/ingestion/source/abs/config.py +1 -1
  98. datahub/ingestion/source/abs/datalake_profiler_config.py +1 -1
  99. datahub/ingestion/source/abs/source.py +15 -30
  100. datahub/ingestion/source/apply/datahub_apply.py +6 -5
  101. datahub/ingestion/source/aws/aws_common.py +185 -13
  102. datahub/ingestion/source/aws/glue.py +517 -244
  103. datahub/ingestion/source/aws/platform_resource_repository.py +30 -0
  104. datahub/ingestion/source/aws/s3_boto_utils.py +100 -5
  105. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  106. datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
  107. datahub/ingestion/source/aws/tag_entities.py +270 -0
  108. datahub/ingestion/source/azure/azure_common.py +3 -3
  109. datahub/ingestion/source/bigquery_v2/bigquery.py +51 -7
  110. datahub/ingestion/source/bigquery_v2/bigquery_config.py +51 -81
  111. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +81 -0
  112. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +6 -1
  113. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -2
  114. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
  115. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +20 -5
  116. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  117. datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
  118. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  119. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  120. datahub/ingestion/source/bigquery_v2/queries_extractor.py +45 -9
  121. datahub/ingestion/source/cassandra/cassandra.py +7 -18
  122. datahub/ingestion/source/cassandra/cassandra_api.py +36 -0
  123. datahub/ingestion/source/cassandra/cassandra_config.py +20 -0
  124. datahub/ingestion/source/cassandra/cassandra_profiling.py +26 -24
  125. datahub/ingestion/source/cassandra/cassandra_utils.py +1 -2
  126. datahub/ingestion/source/common/data_platforms.py +23 -0
  127. datahub/ingestion/source/common/gcp_credentials_config.py +9 -1
  128. datahub/ingestion/source/common/subtypes.py +73 -1
  129. datahub/ingestion/source/data_lake_common/data_lake_utils.py +59 -10
  130. datahub/ingestion/source/data_lake_common/object_store.py +732 -0
  131. datahub/ingestion/source/data_lake_common/path_spec.py +87 -38
  132. datahub/ingestion/source/datahub/config.py +19 -5
  133. datahub/ingestion/source/datahub/datahub_database_reader.py +205 -36
  134. datahub/ingestion/source/datahub/datahub_source.py +11 -1
  135. datahub/ingestion/source/dbt/dbt_cloud.py +17 -10
  136. datahub/ingestion/source/dbt/dbt_common.py +270 -26
  137. datahub/ingestion/source/dbt/dbt_core.py +88 -47
  138. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  139. datahub/ingestion/source/debug/__init__.py +0 -0
  140. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  141. datahub/ingestion/source/delta_lake/config.py +9 -5
  142. datahub/ingestion/source/delta_lake/source.py +8 -0
  143. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  144. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  145. datahub/ingestion/source/dremio/dremio_config.py +5 -4
  146. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  147. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  148. datahub/ingestion/source/dremio/dremio_reporting.py +22 -3
  149. datahub/ingestion/source/dremio/dremio_source.py +228 -215
  150. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  151. datahub/ingestion/source/dynamodb/dynamodb.py +19 -13
  152. datahub/ingestion/source/excel/__init__.py +0 -0
  153. datahub/ingestion/source/excel/config.py +92 -0
  154. datahub/ingestion/source/excel/excel_file.py +539 -0
  155. datahub/ingestion/source/excel/profiling.py +308 -0
  156. datahub/ingestion/source/excel/report.py +49 -0
  157. datahub/ingestion/source/excel/source.py +662 -0
  158. datahub/ingestion/source/excel/util.py +18 -0
  159. datahub/ingestion/source/feast.py +12 -14
  160. datahub/ingestion/source/file.py +3 -0
  161. datahub/ingestion/source/fivetran/config.py +67 -8
  162. datahub/ingestion/source/fivetran/fivetran.py +228 -43
  163. datahub/ingestion/source/fivetran/fivetran_log_api.py +42 -9
  164. datahub/ingestion/source/fivetran/fivetran_query.py +58 -36
  165. datahub/ingestion/source/fivetran/fivetran_rest_api.py +65 -0
  166. datahub/ingestion/source/fivetran/response_models.py +97 -0
  167. datahub/ingestion/source/gc/datahub_gc.py +0 -2
  168. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  169. datahub/ingestion/source/gcs/gcs_source.py +53 -10
  170. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  171. datahub/ingestion/source/ge_data_profiler.py +146 -33
  172. datahub/ingestion/source/ge_profiling_config.py +26 -11
  173. datahub/ingestion/source/grafana/entity_mcp_builder.py +272 -0
  174. datahub/ingestion/source/grafana/field_utils.py +307 -0
  175. datahub/ingestion/source/grafana/grafana_api.py +142 -0
  176. datahub/ingestion/source/grafana/grafana_config.py +104 -0
  177. datahub/ingestion/source/grafana/grafana_source.py +522 -84
  178. datahub/ingestion/source/grafana/lineage.py +202 -0
  179. datahub/ingestion/source/grafana/models.py +137 -0
  180. datahub/ingestion/source/grafana/report.py +90 -0
  181. datahub/ingestion/source/grafana/types.py +16 -0
  182. datahub/ingestion/source/hex/__init__.py +0 -0
  183. datahub/ingestion/source/hex/api.py +402 -0
  184. datahub/ingestion/source/hex/constants.py +8 -0
  185. datahub/ingestion/source/hex/hex.py +311 -0
  186. datahub/ingestion/source/hex/mapper.py +412 -0
  187. datahub/ingestion/source/hex/model.py +78 -0
  188. datahub/ingestion/source/hex/query_fetcher.py +307 -0
  189. datahub/ingestion/source/iceberg/iceberg.py +385 -164
  190. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  191. datahub/ingestion/source/iceberg/iceberg_profiler.py +25 -20
  192. datahub/ingestion/source/identity/azure_ad.py +1 -1
  193. datahub/ingestion/source/identity/okta.py +1 -14
  194. datahub/ingestion/source/kafka/kafka.py +28 -71
  195. datahub/ingestion/source/kafka/kafka_config.py +78 -0
  196. datahub/ingestion/source/kafka_connect/common.py +2 -2
  197. datahub/ingestion/source/kafka_connect/sink_connectors.py +157 -48
  198. datahub/ingestion/source/kafka_connect/source_connectors.py +63 -5
  199. datahub/ingestion/source/ldap.py +1 -1
  200. datahub/ingestion/source/looker/looker_common.py +216 -86
  201. datahub/ingestion/source/looker/looker_config.py +15 -4
  202. datahub/ingestion/source/looker/looker_constant.py +4 -0
  203. datahub/ingestion/source/looker/looker_lib_wrapper.py +37 -4
  204. datahub/ingestion/source/looker/looker_liquid_tag.py +56 -5
  205. datahub/ingestion/source/looker/looker_source.py +539 -555
  206. datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
  207. datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
  208. datahub/ingestion/source/looker/lookml_config.py +31 -3
  209. datahub/ingestion/source/looker/lookml_refinement.py +1 -1
  210. datahub/ingestion/source/looker/lookml_source.py +103 -118
  211. datahub/ingestion/source/looker/view_upstream.py +494 -1
  212. datahub/ingestion/source/metabase.py +32 -6
  213. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  214. datahub/ingestion/source/metadata/lineage.py +11 -10
  215. datahub/ingestion/source/mlflow.py +254 -23
  216. datahub/ingestion/source/mock_data/__init__.py +0 -0
  217. datahub/ingestion/source/mock_data/datahub_mock_data.py +533 -0
  218. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  219. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  220. datahub/ingestion/source/mode.py +359 -181
  221. datahub/ingestion/source/mongodb.py +11 -1
  222. datahub/ingestion/source/neo4j/neo4j_source.py +122 -153
  223. datahub/ingestion/source/nifi.py +5 -5
  224. datahub/ingestion/source/openapi.py +85 -38
  225. datahub/ingestion/source/openapi_parser.py +59 -40
  226. datahub/ingestion/source/powerbi/config.py +92 -27
  227. datahub/ingestion/source/powerbi/m_query/data_classes.py +3 -0
  228. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  229. datahub/ingestion/source/powerbi/m_query/parser.py +2 -2
  230. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +358 -14
  231. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  232. datahub/ingestion/source/powerbi/powerbi.py +66 -32
  233. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  234. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -12
  235. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  236. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +2 -4
  237. datahub/ingestion/source/preset.py +3 -3
  238. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  239. datahub/ingestion/source/qlik_sense/qlik_sense.py +2 -1
  240. datahub/ingestion/source/redash.py +1 -1
  241. datahub/ingestion/source/redshift/config.py +15 -9
  242. datahub/ingestion/source/redshift/datashares.py +1 -1
  243. datahub/ingestion/source/redshift/lineage.py +386 -687
  244. datahub/ingestion/source/redshift/profile.py +2 -2
  245. datahub/ingestion/source/redshift/query.py +24 -20
  246. datahub/ingestion/source/redshift/redshift.py +52 -111
  247. datahub/ingestion/source/redshift/redshift_schema.py +17 -12
  248. datahub/ingestion/source/redshift/report.py +0 -2
  249. datahub/ingestion/source/redshift/usage.py +13 -11
  250. datahub/ingestion/source/s3/report.py +4 -2
  251. datahub/ingestion/source/s3/source.py +515 -244
  252. datahub/ingestion/source/sac/sac.py +3 -1
  253. datahub/ingestion/source/salesforce.py +28 -13
  254. datahub/ingestion/source/schema/json_schema.py +14 -14
  255. datahub/ingestion/source/schema_inference/object.py +22 -6
  256. datahub/ingestion/source/sigma/config.py +75 -8
  257. datahub/ingestion/source/sigma/data_classes.py +3 -0
  258. datahub/ingestion/source/sigma/sigma.py +36 -7
  259. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  260. datahub/ingestion/source/slack/slack.py +403 -140
  261. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  262. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  263. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  264. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  265. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  266. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  267. datahub/ingestion/source/snowflake/constants.py +4 -0
  268. datahub/ingestion/source/snowflake/snowflake_config.py +103 -34
  269. datahub/ingestion/source/snowflake/snowflake_connection.py +47 -25
  270. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +25 -6
  271. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  272. datahub/ingestion/source/snowflake/snowflake_queries.py +511 -107
  273. datahub/ingestion/source/snowflake/snowflake_query.py +100 -72
  274. datahub/ingestion/source/snowflake/snowflake_report.py +4 -2
  275. datahub/ingestion/source/snowflake/snowflake_schema.py +381 -16
  276. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +163 -52
  277. datahub/ingestion/source/snowflake/snowflake_summary.py +7 -1
  278. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  279. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  280. datahub/ingestion/source/snowflake/snowflake_utils.py +62 -17
  281. datahub/ingestion/source/snowflake/snowflake_v2.py +56 -10
  282. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  283. datahub/ingestion/source/sql/athena.py +219 -26
  284. datahub/ingestion/source/sql/athena_properties_extractor.py +795 -0
  285. datahub/ingestion/source/sql/clickhouse.py +29 -9
  286. datahub/ingestion/source/sql/cockroachdb.py +5 -4
  287. datahub/ingestion/source/sql/druid.py +9 -4
  288. datahub/ingestion/source/sql/hana.py +3 -1
  289. datahub/ingestion/source/sql/hive.py +28 -8
  290. datahub/ingestion/source/sql/hive_metastore.py +24 -25
  291. datahub/ingestion/source/sql/mariadb.py +0 -1
  292. datahub/ingestion/source/sql/mssql/job_models.py +18 -2
  293. datahub/ingestion/source/sql/mssql/source.py +376 -62
  294. datahub/ingestion/source/sql/mysql.py +154 -4
  295. datahub/ingestion/source/sql/oracle.py +62 -11
  296. datahub/ingestion/source/sql/postgres.py +142 -6
  297. datahub/ingestion/source/sql/presto.py +20 -2
  298. datahub/ingestion/source/sql/sql_common.py +281 -49
  299. datahub/ingestion/source/sql/sql_config.py +1 -34
  300. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  301. datahub/ingestion/source/sql/sql_types.py +27 -2
  302. datahub/ingestion/source/sql/sqlalchemy_uri.py +68 -0
  303. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  304. datahub/ingestion/source/sql/stored_procedures/base.py +253 -0
  305. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +2 -29
  306. datahub/ingestion/source/sql/teradata.py +1028 -245
  307. datahub/ingestion/source/sql/trino.py +43 -10
  308. datahub/ingestion/source/sql/two_tier_sql_source.py +3 -4
  309. datahub/ingestion/source/sql/vertica.py +14 -7
  310. datahub/ingestion/source/sql_queries.py +219 -121
  311. datahub/ingestion/source/state/checkpoint.py +8 -29
  312. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  313. datahub/ingestion/source/state/redundant_run_skip_handler.py +21 -0
  314. datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
  315. datahub/ingestion/source/state/stateful_ingestion_base.py +36 -11
  316. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  317. datahub/ingestion/source/superset.py +810 -126
  318. datahub/ingestion/source/tableau/tableau.py +172 -69
  319. datahub/ingestion/source/tableau/tableau_common.py +11 -4
  320. datahub/ingestion/source/tableau/tableau_constant.py +1 -4
  321. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  322. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  323. datahub/ingestion/source/unity/config.py +161 -40
  324. datahub/ingestion/source/unity/connection.py +61 -0
  325. datahub/ingestion/source/unity/connection_test.py +1 -0
  326. datahub/ingestion/source/unity/platform_resource_repository.py +19 -0
  327. datahub/ingestion/source/unity/proxy.py +794 -51
  328. datahub/ingestion/source/unity/proxy_patch.py +321 -0
  329. datahub/ingestion/source/unity/proxy_types.py +36 -2
  330. datahub/ingestion/source/unity/report.py +15 -3
  331. datahub/ingestion/source/unity/source.py +465 -131
  332. datahub/ingestion/source/unity/tag_entities.py +197 -0
  333. datahub/ingestion/source/unity/usage.py +46 -4
  334. datahub/ingestion/source/usage/clickhouse_usage.py +11 -4
  335. datahub/ingestion/source/usage/starburst_trino_usage.py +10 -5
  336. datahub/ingestion/source/usage/usage_common.py +4 -68
  337. datahub/ingestion/source/vertexai/__init__.py +0 -0
  338. datahub/ingestion/source/vertexai/vertexai.py +1367 -0
  339. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  340. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +89 -0
  341. datahub/ingestion/source_config/pulsar.py +3 -1
  342. datahub/ingestion/source_report/ingestion_stage.py +50 -11
  343. datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
  344. datahub/ingestion/transformer/add_dataset_ownership.py +19 -3
  345. datahub/ingestion/transformer/base_transformer.py +8 -5
  346. datahub/ingestion/transformer/dataset_domain.py +1 -1
  347. datahub/ingestion/transformer/set_browse_path.py +112 -0
  348. datahub/integrations/assertion/common.py +3 -2
  349. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  350. datahub/lite/lite_util.py +2 -2
  351. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +3095 -631
  352. datahub/metadata/_urns/urn_defs.py +1866 -1582
  353. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  354. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  355. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  356. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  357. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  358. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  359. datahub/metadata/com/linkedin/pegasus2avro/file/__init__.py +19 -0
  360. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  361. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  362. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +8 -0
  363. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +31 -0
  364. datahub/metadata/com/linkedin/pegasus2avro/platform/event/v1/__init__.py +4 -0
  365. datahub/metadata/com/linkedin/pegasus2avro/role/__init__.py +2 -0
  366. datahub/metadata/com/linkedin/pegasus2avro/settings/asset/__init__.py +19 -0
  367. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +8 -0
  368. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +31 -0
  369. datahub/metadata/schema.avsc +18404 -16617
  370. datahub/metadata/schema_classes.py +3 -3
  371. datahub/metadata/schemas/Actors.avsc +38 -1
  372. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  373. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  374. datahub/metadata/schemas/Applications.avsc +38 -0
  375. datahub/metadata/schemas/AssetSettings.avsc +63 -0
  376. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  377. datahub/metadata/schemas/ChartKey.avsc +1 -0
  378. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  379. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  380. datahub/metadata/schemas/CorpUserEditableInfo.avsc +15 -1
  381. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  382. datahub/metadata/schemas/CorpUserSettings.avsc +145 -0
  383. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  384. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  385. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  386. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  387. datahub/metadata/schemas/DataHubFileInfo.avsc +230 -0
  388. datahub/metadata/schemas/DataHubFileKey.avsc +21 -0
  389. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  390. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  391. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  392. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +298 -0
  393. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  394. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +251 -0
  395. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  396. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  397. datahub/metadata/schemas/DataJobInputOutput.avsc +8 -0
  398. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  399. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  400. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  401. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  402. datahub/metadata/schemas/DataProductKey.avsc +3 -1
  403. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  404. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  405. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  406. datahub/metadata/schemas/DatasetUsageStatistics.avsc +8 -0
  407. datahub/metadata/schemas/Deprecation.avsc +2 -0
  408. datahub/metadata/schemas/DomainKey.avsc +2 -1
  409. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  410. datahub/metadata/schemas/FormInfo.avsc +5 -0
  411. datahub/metadata/schemas/GlobalSettingsInfo.avsc +134 -0
  412. datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
  413. datahub/metadata/schemas/GlossaryTermKey.avsc +3 -1
  414. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  415. datahub/metadata/schemas/IncidentInfo.avsc +3 -3
  416. datahub/metadata/schemas/InstitutionalMemory.avsc +31 -0
  417. datahub/metadata/schemas/LogicalParent.avsc +145 -0
  418. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  419. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  420. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  421. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  422. datahub/metadata/schemas/MLModelGroupKey.avsc +11 -1
  423. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  424. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  425. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  426. datahub/metadata/schemas/MetadataChangeEvent.avsc +189 -47
  427. datahub/metadata/schemas/MetadataChangeLog.avsc +65 -44
  428. datahub/metadata/schemas/MetadataChangeProposal.avsc +64 -0
  429. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  430. datahub/metadata/schemas/Operation.avsc +21 -2
  431. datahub/metadata/schemas/Ownership.avsc +69 -0
  432. datahub/metadata/schemas/QueryProperties.avsc +24 -2
  433. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  434. datahub/metadata/schemas/RelationshipChangeEvent.avsc +215 -0
  435. datahub/metadata/schemas/SchemaFieldKey.avsc +4 -1
  436. datahub/metadata/schemas/Siblings.avsc +2 -0
  437. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  438. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  439. datahub/metadata/schemas/StructuredPropertySettings.avsc +9 -0
  440. datahub/metadata/schemas/SystemMetadata.avsc +147 -0
  441. datahub/metadata/schemas/UpstreamLineage.avsc +9 -0
  442. datahub/metadata/schemas/__init__.py +3 -3
  443. datahub/sdk/__init__.py +7 -0
  444. datahub/sdk/_all_entities.py +15 -0
  445. datahub/sdk/_shared.py +393 -10
  446. datahub/sdk/_utils.py +4 -0
  447. datahub/sdk/chart.py +386 -0
  448. datahub/sdk/container.py +7 -0
  449. datahub/sdk/dashboard.py +453 -0
  450. datahub/sdk/dataflow.py +309 -0
  451. datahub/sdk/datajob.py +367 -0
  452. datahub/sdk/dataset.py +180 -4
  453. datahub/sdk/entity.py +99 -3
  454. datahub/sdk/entity_client.py +154 -12
  455. datahub/sdk/lineage_client.py +943 -0
  456. datahub/sdk/main_client.py +83 -8
  457. datahub/sdk/mlmodel.py +383 -0
  458. datahub/sdk/mlmodelgroup.py +240 -0
  459. datahub/sdk/search_client.py +85 -8
  460. datahub/sdk/search_filters.py +393 -68
  461. datahub/secret/datahub_secret_store.py +5 -1
  462. datahub/secret/environment_secret_store.py +29 -0
  463. datahub/secret/file_secret_store.py +49 -0
  464. datahub/specific/aspect_helpers/fine_grained_lineage.py +76 -0
  465. datahub/specific/aspect_helpers/siblings.py +73 -0
  466. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  467. datahub/specific/chart.py +1 -1
  468. datahub/specific/datajob.py +15 -1
  469. datahub/specific/dataproduct.py +4 -0
  470. datahub/specific/dataset.py +51 -59
  471. datahub/sql_parsing/_sqlglot_patch.py +1 -2
  472. datahub/sql_parsing/fingerprint_utils.py +6 -0
  473. datahub/sql_parsing/split_statements.py +30 -3
  474. datahub/sql_parsing/sql_parsing_aggregator.py +144 -63
  475. datahub/sql_parsing/sqlglot_lineage.py +517 -44
  476. datahub/sql_parsing/sqlglot_utils.py +30 -18
  477. datahub/sql_parsing/tool_meta_extractor.py +25 -2
  478. datahub/telemetry/telemetry.py +30 -16
  479. datahub/testing/check_imports.py +1 -1
  480. datahub/testing/docker_utils.py +8 -2
  481. datahub/testing/mce_helpers.py +421 -0
  482. datahub/testing/mcp_diff.py +17 -21
  483. datahub/testing/sdk_v2_helpers.py +18 -0
  484. datahub/upgrade/upgrade.py +86 -30
  485. datahub/utilities/file_backed_collections.py +14 -15
  486. datahub/utilities/hive_schema_to_avro.py +2 -2
  487. datahub/utilities/ingest_utils.py +2 -2
  488. datahub/utilities/is_pytest.py +3 -2
  489. datahub/utilities/logging_manager.py +30 -7
  490. datahub/utilities/mapping.py +29 -2
  491. datahub/utilities/sample_data.py +5 -4
  492. datahub/utilities/server_config_util.py +298 -10
  493. datahub/utilities/sqlalchemy_query_combiner.py +6 -4
  494. datahub/utilities/stats_collections.py +4 -0
  495. datahub/utilities/threaded_iterator_executor.py +16 -3
  496. datahub/utilities/urn_encoder.py +1 -1
  497. datahub/utilities/urns/urn.py +41 -2
  498. datahub/emitter/sql_parsing_builder.py +0 -306
  499. datahub/ingestion/source/redshift/lineage_v2.py +0 -458
  500. datahub/ingestion/source/vertexai.py +0 -697
  501. datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
  502. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info/licenses}/LICENSE +0 -0
  503. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.3.0.1rc9.dist-info}/top_level.txt +0 -0
@@ -1,16 +1,20 @@
 import json
 import logging
+import os
 from dataclasses import dataclass, field
 from datetime import datetime
 from functools import lru_cache
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

 import dateutil.parser as dp
 import requests
-from pydantic import BaseModel
-from pydantic.class_validators import root_validator, validator
+import sqlglot
+from pydantic import BaseModel, root_validator, validator
 from pydantic.fields import Field
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry

+import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import (
     EnvConfigMixin,
@@ -23,8 +27,10 @@ from datahub.emitter.mce_builder import (
     make_dataset_urn,
     make_dataset_urn_with_platform_instance,
     make_domain_urn,
+    make_schema_field_urn,
     make_user_urn,
 )
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import add_domain_to_entity_wu
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -49,6 +55,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     ChangeAuditStamps,
+    InputField,
+    InputFields,
     Status,
     TimeStamp,
 )
@@ -59,11 +67,17 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
+    BooleanTypeClass,
+    DateTypeClass,
     MySqlDDL,
     NullType,
+    NullTypeClass,
+    NumberTypeClass,
     SchemaField,
     SchemaFieldDataType,
     SchemaMetadata,
+    StringTypeClass,
+    TimeTypeClass,
 )
 from datahub.metadata.schema_classes import (
     AuditStampClass,
@@ -72,6 +86,9 @@ from datahub.metadata.schema_classes import (
     DashboardInfoClass,
     DatasetLineageTypeClass,
     DatasetPropertiesClass,
+    FineGrainedLineageClass,
+    FineGrainedLineageDownstreamTypeClass,
+    FineGrainedLineageUpstreamTypeClass,
     GlobalTagsClass,
     OwnerClass,
     OwnershipClass,
@@ -80,14 +97,25 @@ from datahub.metadata.schema_classes import (
     UpstreamClass,
     UpstreamLineageClass,
 )
+from datahub.sql_parsing.sqlglot_lineage import (
+    SqlParsingResult,
+    create_lineage_sql_parsed_result,
+)
 from datahub.utilities import config_clean
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
+from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

 logger = logging.getLogger(__name__)

 PAGE_SIZE = 25

+# Retry configuration constants
+RETRY_MAX_TIMES = 3
+RETRY_STATUS_CODES = [429, 500, 502, 503, 504]
+RETRY_BACKOFF_FACTOR = 1
+RETRY_ALLOWED_METHODS = ["GET"]
+

 chart_type_from_viz_type = {
     "line": ChartTypeClass.LINE,
@@ -105,9 +133,20 @@ chart_type_from_viz_type = {
     "box_plot": ChartTypeClass.BAR,
 }

-
 platform_without_databases = ["druid"]

+FIELD_TYPE_MAPPING = {
+    "INT": NumberTypeClass,
+    "STRING": StringTypeClass,
+    "FLOAT": NumberTypeClass,
+    "DATETIME": DateTypeClass,
+    "TIMESTAMP": TimeTypeClass,
+    "BOOLEAN": BooleanTypeClass,
+    "SQL": StringTypeClass,
+    "NUMERIC": NumberTypeClass,
+    "TEXT": StringTypeClass,
+}
+

 @dataclass
 class SupersetSourceReport(StaleEntityRemovalSourceReport):
@@ -122,6 +161,7 @@ class SupersetDataset(BaseModel):
     table_name: str
     changed_on_utc: Optional[str] = None
     explore_url: Optional[str] = ""
+    description: Optional[str] = ""

     @property
     def modified_dt(self) -> Optional[datetime]:
@@ -139,6 +179,7 @@ class SupersetDataset(BaseModel):
 class SupersetConfig(
     StatefulIngestionConfigBase, EnvConfigMixin, PlatformInstanceConfigMixin
 ):
+    # TODO: Add support for missing dataPlatformInstance/containers
     # See the Superset /security/login endpoint for details
     # https://superset.apache.org/docs/rest-api
     connect_uri: str = Field(
@@ -150,7 +191,7 @@ class SupersetConfig(
     )
     domain: Dict[str, AllowDenyPattern] = Field(
         default=dict(),
-        description="regex patterns for tables to filter to assign domain_key. ",
+        description="Regex patterns for tables to filter to assign domain_key. ",
     )
     dataset_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
@@ -164,6 +205,10 @@ class SupersetConfig(
         AllowDenyPattern.allow_all(),
         description="Patterns for selecting dashboard names that are to be included",
     )
+    database_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for databases to filter in ingestion.",
+    )
     username: Optional[str] = Field(default=None, description="Superset username.")
     password: Optional[str] = Field(default=None, description="Superset password.")
     # Configuration for stateful ingestion
@@ -181,6 +226,15 @@ class SupersetConfig(
     provider: str = Field(default="db", description="Superset provider.")
     options: Dict = Field(default={}, description="")

+    timeout: int = Field(
+        default=10, description="Timeout of single API call to superset."
+    )
+
+    max_threads: int = Field(
+        default_factory=lambda: os.cpu_count() or 40,
+        description="Max parallelism for API calls. Defaults to cpuCount or 40",
+    )
+
     # TODO: Check and remove this if no longer needed.
     # Config database_alias is removed from sql sources.
     database_alias: Dict[str, str] = Field(
@@ -231,10 +285,11 @@ def get_filter_name(filter_obj):
 @config_class(SupersetConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(
-    SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 @capability(SourceCapability.DOMAINS, "Enabled by `domain` config to assign domain_key")
 @capability(SourceCapability.LINEAGE_COARSE, "Supported by default")
+@capability(SourceCapability.TAGS, "Supported by default")
 class SupersetSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
@@ -261,6 +316,9 @@ class SupersetSource(StatefulIngestionSourceBase):
         )
         self.session = self.login()
         self.owner_info = self.parse_owner_info()
+        self.filtered_dataset_to_database: Dict[int, str] = {}
+        self.filtered_chart_to_database: Dict[int, str] = {}
+        self.processed_charts: Dict[int, Tuple[Optional[str], bool]] = {}

     def login(self) -> requests.Session:
         login_response = requests.post(
@@ -277,6 +335,19 @@ class SupersetSource(StatefulIngestionSourceBase):
         logger.debug("Got access token from superset")

         requests_session = requests.Session()
+
+        # Configure retry strategy for transient failures
+        retry_strategy = Retry(
+            total=RETRY_MAX_TIMES,
+            status_forcelist=RETRY_STATUS_CODES,
+            backoff_factor=RETRY_BACKOFF_FACTOR,
+            allowed_methods=RETRY_ALLOWED_METHODS,
+            raise_on_status=False,
+        )
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        requests_session.mount("http://", adapter)
+        requests_session.mount("https://", adapter)
+
         requests_session.headers.update(
             {
                 "Authorization": f"Bearer {self.access_token}",
@@ -285,13 +356,16 @@ class SupersetSource(StatefulIngestionSourceBase):
             }
         )

-        # Test the connection
         test_response = requests_session.get(
-            f"{self.config.connect_uri}/api/v1/dashboard/"
+            f"{self.config.connect_uri}/api/v1/dashboard/",
+            timeout=self.config.timeout,
         )
-        if test_response.status_code == 200:
-            pass
-            # TODO(Gabe): how should we message about this error?
+        if test_response.status_code != 200:
+            # throw an error and terminate ingestion,
+            # cannot proceed without access token
+            logger.error(
+                f"Failed to log in to Superset with status: {test_response.status_code}"
+            )
         return requests_session

     def paginate_entity_api_results(self, entity_type, page_size=100):
@@ -302,10 +376,17 @@ class SupersetSource(StatefulIngestionSourceBase):
             response = self.session.get(
                 f"{self.config.connect_uri}/api/v1/{entity_type}",
                 params={"q": f"(page:{current_page},page_size:{page_size})"},
+                timeout=self.config.timeout,
             )

             if response.status_code != 200:
-                logger.warning(f"Failed to get {entity_type} data: {response.text}")
+                self.report.warning(
+                    title="Failed to fetch data from Superset API",
+                    message="Incomplete metadata extraction due to Superset API failure",
+                    context=f"Entity Type: {entity_type}, HTTP Status Code: {response.status_code}, Page: {current_page}. Response: {response.text}",
+                )
+                # we stop pagination for this entity type and we continue the overall ingestion
+                break

             payload = response.json()
             # Update total_items with the actual count from the response
@@ -339,10 +420,11 @@ class SupersetSource(StatefulIngestionSourceBase):
     def get_dataset_info(self, dataset_id: int) -> dict:
         dataset_response = self.session.get(
             f"{self.config.connect_uri}/api/v1/dataset/{dataset_id}",
+            timeout=self.config.timeout,
         )
         if dataset_response.status_code != 200:
             logger.warning(f"Failed to get dataset info: {dataset_response.text}")
-            dataset_response.raise_for_status()
+            return {}
         return dataset_response.json()

     def get_datasource_urn_from_id(
@@ -393,8 +475,9 @@ class SupersetSource(StatefulIngestionSourceBase):
         )

         modified_actor = f"urn:li:corpuser:{self.owner_info.get((dashboard_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
         modified_ts = int(
-            dp.parse(dashboard_data.get("changed_on_utc", "now")).timestamp() * 1000
+            dp.parse(dashboard_data.get("changed_on_utc", now)).timestamp() * 1000
         )
         title = dashboard_data.get("dashboard_title", "")
         # note: the API does not currently supply created_by usernames due to a bug
@@ -464,37 +547,298 @@ class SupersetSource(StatefulIngestionSourceBase):
         )
         dashboard_snapshot.aspects.append(owners_info)

+        superset_tags = self._extract_and_map_tags(dashboard_data.get("tags", []))
+        tags = self._merge_tags_with_existing(dashboard_urn, superset_tags)
+        if tags:
+            dashboard_snapshot.aspects.append(tags)
+
         return dashboard_snapshot

-    def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
-        for dashboard_data in self.paginate_entity_api_results("dashboard/", PAGE_SIZE):
-            try:
-                dashboard_id = str(dashboard_data.get("id"))
-                dashboard_title = dashboard_data.get("dashboard_title", "")
+    def _process_dashboard(self, dashboard_data: Any) -> Iterable[MetadataWorkUnit]:
+        dashboard_title = ""
+        try:
+            dashboard_id = str(dashboard_data.get("id"))
+            dashboard_title = dashboard_data.get("dashboard_title", "")
+            if not self.config.dashboard_pattern.allowed(dashboard_title):
+                self.report.report_dropped(
+                    f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
+                )
+                return
+
+            if self.config.database_pattern != AllowDenyPattern.allow_all():
+                raw_position_data = dashboard_data.get("position_json", "{}")
+                position_data = (
+                    json.loads(raw_position_data)
+                    if raw_position_data is not None
+                    else {}
+                )

-                if not self.config.dashboard_pattern.allowed(dashboard_title):
-                    self.report.report_dropped(
-                        f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
+                chart_ids = []
+                for key, value in position_data.items():
+                    if not key.startswith("CHART-"):
+                        continue
+                    chart_id = value.get("meta", {}).get("chartId")
+                    if chart_id:
+                        chart_ids.append(chart_id)
+
+                for chart_id in chart_ids:
+                    if chart_id in self.processed_charts:
+                        database_name, is_filtered = self.processed_charts[chart_id]
+                        if is_filtered:
+                            self.report.warning(
+                                message="Dashboard contains charts using datasets from a filtered database. Set the dashboard pattern to deny ingestion.",
+                                context=str(
+                                    dict(
+                                        dashboard_id=dashboard_id,
+                                        dashboard_title=dashboard_title,
+                                        chart_id=chart_id,
+                                        database_name=database_name,
+                                    )
+                                ),
+                                title="Incomplete Ingestion",
+                            )
+
+            dashboard_snapshot = self.construct_dashboard_from_api_data(dashboard_data)
+
+        except Exception as e:
+            self.report.warning(
+                message="Failed to construct dashboard snapshot. This dashboard will not be ingested.",
+                context=str(
+                    dict(
+                        dashboard_id=dashboard_id,
+                        dashboard_title=dashboard_title,
+                        error=str(e),
                     )
-                    continue
+                ),
+                title="Dashboard Construction Failed",
+                exc=e,
+            )
+            return

-                dashboard_snapshot = self.construct_dashboard_from_api_data(
-                    dashboard_data
-                )
-            except Exception as e:
-                self.report.warning(
-                    f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
-                )
+        mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
+        yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
+        yield from self._get_domain_wu(
+            title=dashboard_title, entity_urn=dashboard_snapshot.urn
+        )
+
+    def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
+        dashboard_data_list = [
+            (dashboard_data,)
+            for dashboard_data in self.paginate_entity_api_results(
+                "dashboard/", PAGE_SIZE
+            )
+        ]
+
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_dashboard,
+            args_list=dashboard_data_list,
+            max_workers=self.config.max_threads,
+        )
+
+    def build_input_fields(
+        self,
+        chart_columns: List[Tuple[str, str, str]],
+        datasource_urn: Union[str, None],
+    ) -> List[InputField]:
+        input_fields: List[InputField] = []
+
+        for column in chart_columns:
+            col_name, col_type, description = column
+            if not col_type or not datasource_urn:
                 continue
-            # Emit the dashboard
-            mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
-            yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
-            yield from self._get_domain_wu(
-                title=dashboard_title,
-                entity_urn=dashboard_snapshot.urn,
+
+            type_class = FIELD_TYPE_MAPPING.get(
+                col_type.upper(), NullTypeClass
+            )  # gets the type mapping
+
+            input_fields.append(
+                InputField(
+                    schemaFieldUrn=builder.make_schema_field_urn(
+                        parent_urn=str(datasource_urn),
+                        field_path=col_name,
+                    ),
+                    schemaField=SchemaField(
+                        fieldPath=col_name,
+                        type=SchemaFieldDataType(type=type_class()),  # type: ignore
+                        description=(description if description != "null" else ""),
+                        nativeDataType=col_type,
+                        globalTags=None,
+                        nullable=True,
+                    ),
+                )
+            )
+
+        return input_fields
+
+    def _extract_columns_from_sql(self, sql_expr: Optional[str]) -> List[str]:
+        if not sql_expr:
+            return []
+
+        try:
+            parsed_expr = sqlglot.parse_one(sql_expr)
+
+            column_refs = set()
+            for node in parsed_expr.walk():
+                if isinstance(node, sqlglot.exp.Column):
+                    column_name = node.name
+                    column_refs.add(column_name)
+
+            return list(column_refs)
+        except Exception as e:
+            self.report.warning(f"Failed to parse SQL expression '{sql_expr}': {e}")
+            return []
+
+    def _process_column_item(
+        self, item: Union[str, dict], unique_columns: Dict[str, bool]
+    ) -> None:
+        """Process a single column item and add to unique_columns."""
+
+        def add_column(col_name: str, is_sql: bool) -> None:
+            if not col_name:
+                return
+            # Always set to False if any non-SQL seen, else keep as is_sql
+            unique_columns[col_name] = unique_columns.get(col_name, True) and is_sql
+
+        if isinstance(item, str):
+            add_column(item, False)
+        elif isinstance(item, dict):
+            if item.get("expressionType") == "SIMPLE":
+                # For metrics with SIMPLE expression type
+                add_column(item.get("column", {}).get("column_name", ""), False)
+            elif item.get("expressionType") == "SQL":
+                sql_expr = item.get("sqlExpression")
+                column_refs = self._extract_columns_from_sql(sql_expr)
+                for col in column_refs:
+                    add_column(col, False)
+                if not column_refs:
+                    add_column(item.get("label", ""), True)
+
+    def _collect_all_unique_columns(self, form_data: dict) -> Dict[str, bool]:
+        """Collect all unique column names from form_data, distinguishing SQL vs non-SQL."""
+        unique_columns: Dict[str, bool] = {}
+
+        # Process regular columns
+        for column in form_data.get("all_columns", []):
+            self._process_column_item(column, unique_columns)
+
+        # Process metrics
+        # For charts with a single metric, the metric is stored in the form_data as a string in the 'metric' key
+        # For charts with multiple metrics, the metrics are stored in the form_data as a list of strings in the 'metrics' key
+        if "metric" in form_data:
+            metrics_data = [form_data.get("metric")]
+        else:
+            metrics_data = form_data.get("metrics", [])
+
+        for metric in metrics_data:
+            if metric is not None:
+                self._process_column_item(metric, unique_columns)
+
+        # Process group by columns
+        for group in form_data.get("groupby", []):
+            self._process_column_item(group, unique_columns)
+
+        # Process x-axis columns
+        x_axis_data = form_data.get("x_axis")
+        if x_axis_data is not None:
+            self._process_column_item(x_axis_data, unique_columns)
+
+        return unique_columns
+
+    def _fetch_dataset_columns(
+        self, datasource_id: Union[Any, int]
+    ) -> List[Tuple[str, str, str]]:
+        """Fetch dataset columns and metrics from Superset API."""
+        if not datasource_id:
+            logger.warning(
+                "no datasource id was found, cannot build column level lineage"
             )
+            return []

-    def construct_chart_from_chart_data(self, chart_data: dict) -> ChartSnapshot:
+        dataset_info = self.get_dataset_info(datasource_id).get("result", {})
+        dataset_column_info = dataset_info.get("columns", [])
+        dataset_metric_info = dataset_info.get("metrics", [])
+
+        dataset_columns: List[Tuple[str, str, str]] = []
+        for column in dataset_column_info:
+            col_name = column.get("column_name", "")
+            col_type = column.get("type", "")
+            col_description = column.get("description", "")
+
+            if col_name == "" or col_type == "":
+                logger.info(f"could not construct column lineage for {column}")
+                continue
+
+            dataset_columns.append((col_name, col_type, col_description))
+
+        for metric in dataset_metric_info:
+            metric_name = metric.get("metric_name", "")
+            metric_type = metric.get("metric_type", "")
+            metric_description = metric.get("description", "")
+
+            if metric_name == "" or metric_type == "":
+                logger.info(f"could not construct metric lineage for {metric}")
+                continue
+
+            dataset_columns.append((metric_name, metric_type, metric_description))
+
+        return dataset_columns
+
+    def _match_chart_columns_with_dataset(
+        self,
+        unique_chart_columns: Dict[str, bool],
+        dataset_columns: List[Tuple[str, str, str]],
+    ) -> List[Tuple[str, str, str]]:
+        """Match chart columns with dataset columns, preserving SQL/non-SQL status."""
+        chart_columns: List[Tuple[str, str, str]] = []
+
+        for chart_col_name, is_sql in unique_chart_columns.items():
+            if is_sql:
+                chart_columns.append((chart_col_name, "SQL", ""))
+                continue
+
+            # find matching upstream column
+            for dataset_col in dataset_columns:
+                dataset_col_name, dataset_col_type, dataset_col_description = (
+                    dataset_col
+                )
+                if dataset_col_name == chart_col_name:
+                    chart_columns.append(
+                        (chart_col_name, dataset_col_type, dataset_col_description)
+                    )
+                    break
+            else:
+                chart_columns.append((chart_col_name, "", ""))
+
+        return chart_columns
+
+    def construct_chart_cll(
+        self,
+        chart_data: dict,
+        datasource_urn: Union[str, None],
+        datasource_id: Union[Any, int],
+    ) -> List[InputField]:
+        """Construct column-level lineage for a chart."""
+        form_data = chart_data.get("form_data", {})
+
+        # Extract and process all columns in one go
+        unique_columns = self._collect_all_unique_columns(form_data)
+
+        # Fetch dataset columns
+        dataset_columns = self._fetch_dataset_columns(datasource_id)
+        if not dataset_columns:
+            return []
+
+        # Match chart columns with dataset columns
+        chart_columns = self._match_chart_columns_with_dataset(
+            unique_columns, dataset_columns
+        )
+
+        # Build input fields
+        return self.build_input_fields(chart_columns, datasource_urn)
+
+    def construct_chart_from_chart_data(
+        self, chart_data: dict
+    ) -> Iterable[MetadataWorkUnit]:
         chart_urn = make_chart_urn(
             platform=self.platform,
             name=str(chart_data["id"]),
@@ -506,8 +850,9 @@ class SupersetSource(StatefulIngestionSourceBase):
         )

         modified_actor = f"urn:li:corpuser:{self.owner_info.get((chart_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
         modified_ts = int(
-            dp.parse(chart_data.get("changed_on_utc", "now")).timestamp() * 1000
+            dp.parse(chart_data.get("changed_on_utc", now)).timestamp() * 1000
         )
         title = chart_data.get("slice_name", "")

@@ -581,6 +926,18 @@ class SupersetSource(StatefulIngestionSourceBase):
         )
         chart_snapshot.aspects.append(chart_info)

+        input_fields = self.construct_chart_cll(
+            chart_data, datasource_urn, datasource_id
+        )
+
+        if input_fields:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=chart_urn,
+                aspect=InputFields(
+                    fields=sorted(input_fields, key=lambda x: x.schemaFieldUrn)
+                ),
+            ).as_workunit()
+
         chart_owners_list = self.build_owner_urn(chart_data)
         owners_info = OwnershipClass(
             owners=[
@@ -593,50 +950,143 @@ class SupersetSource(StatefulIngestionSourceBase):
  lastModified=last_modified,
  )
  chart_snapshot.aspects.append(owners_info)
- return chart_snapshot

- def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
- for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE):
- try:
- chart_id = str(chart_data.get("id"))
- chart_name = chart_data.get("slice_name", "")
+ superset_tags = self._extract_and_map_tags(chart_data.get("tags", []))
+ tags = self._merge_tags_with_existing(chart_urn, superset_tags)
+ if tags:
+ chart_snapshot.aspects.append(tags)

- if not self.config.chart_pattern.allowed(chart_name):
- self.report.report_dropped(
- f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
- )
- continue
+ yield MetadataWorkUnit(
+ id=chart_urn, mce=MetadataChangeEvent(proposedSnapshot=chart_snapshot)
+ )

- # Emit a warning if charts use data from a dataset that will be filtered out
- if self.config.dataset_pattern != AllowDenyPattern.allow_all():
- datasource_id = chart_data.get("datasource_id")
- if datasource_id:
- dataset_response = self.get_dataset_info(datasource_id)
- dataset_name = dataset_response.get("result", {}).get(
- "table_name", ""
+ yield from self._get_domain_wu(
+ title=chart_data.get("slice_name", ""),
+ entity_urn=chart_urn,
+ )
+
+ def _process_chart(self, chart_data: Any) -> Iterable[MetadataWorkUnit]:
+ chart_name = ""
+ database_name = None
+ try:
+ chart_id = chart_data.get("id")
+ chart_name = chart_data.get("slice_name", "")
+ if not self.config.chart_pattern.allowed(chart_name):
+ self.report.report_dropped(
+ f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
+ )
+ return
+
+ # TODO: Make helper methods for database_pattern
+ if self.config.database_pattern != AllowDenyPattern.allow_all():
+ datasource_id = chart_data.get("datasource_id")
+
+ if datasource_id:
+ if datasource_id in self.filtered_dataset_to_database:
+ database_name = self.filtered_dataset_to_database[datasource_id]
+ self.filtered_chart_to_database[chart_id] = database_name
+
+ is_filtered = not self.config.database_pattern.allowed(
+ database_name
  )
+ self.processed_charts[chart_id] = (database_name, is_filtered)

- if dataset_name and not self.config.dataset_pattern.allowed(
- dataset_name
- ):
+ if is_filtered:
  self.report.warning(
- f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
+ message="Chart uses a dataset from a filtered database. Set the chart pattern to deny ingestion.",
+ context=str(
+ dict(
+ chart_id=chart_id,
+ chart_name=chart_name,
+ database_name=database_name,
+ )
+ ),
+ title="Incomplete Ingestion",
  )

- chart_snapshot = self.construct_chart_from_chart_data(chart_data)
+ else:
+ dataset_response = self.get_dataset_info(datasource_id)
+ database_name = (
+ dataset_response.get("result", {})
+ .get("database", {})
+ .get("database_name")
+ )
+
+ if database_name:
+ is_filtered = not self.config.database_pattern.allowed(
+ database_name
+ )
+ if is_filtered:
+ self.filtered_chart_to_database[chart_id] = (
+ database_name
+ )
+ self.filtered_dataset_to_database[datasource_id] = (
+ database_name
+ )
+ self.processed_charts[chart_id] = (
+ database_name,
+ is_filtered,
+ )

- mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
- except Exception as e:
- self.report.warning(
- f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
- )
- continue
- # Emit the chart
- yield MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
- yield from self._get_domain_wu(
- title=chart_data.get("slice_name", ""),
- entity_urn=chart_snapshot.urn,
+ if is_filtered:
+ self.report.warning(
+ message="Chart uses a dataset from a filtered database. Set the chart pattern to deny ingestion.",
+ context=str(
+ dict(
+ chart_id=chart_id,
+ chart_name=chart_name,
+ database_name=database_name,
+ )
+ ),
+ title="Incomplete Ingestion",
+ )
+
+ if self.config.dataset_pattern != AllowDenyPattern.allow_all():
+ datasource_id = chart_data.get("datasource_id")
+ if datasource_id:
+ dataset_response = self.get_dataset_info(datasource_id)
+ dataset_name = dataset_response.get("result", {}).get(
+ "table_name", ""
+ )
+ if dataset_name and not self.config.dataset_pattern.allowed(
+ dataset_name
+ ):
+ self.report.warning(
+ message="Chart uses a dataset that was filtered by dataset pattern. Update your dataset pattern to include this dataset.",
+ context=str(
+ dict(
+ chart_id=chart_id,
+ chart_name=chart_name,
+ dataset_name=dataset_name,
+ )
+ ),
+ title="Incomplete Ingestion",
+ )
+ if chart_id not in self.processed_charts:
+ self.processed_charts[chart_id] = (database_name, False)
+
+ yield from self.construct_chart_from_chart_data(chart_data)
+ except Exception as e:
+ self.report.warning(
+ message="Failed to construct chart snapshot. This chart will not be ingested.",
+ context=str(
+ dict(chart_id=chart_id, chart_name=chart_name, error=str(e))
+ ),
+ title="Chart Construction Failed",
+ exc=e,
  )
+ return
+
+ def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
+ chart_data_list = [
+ (chart_data,)
+ for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE)
+ ]
+ yield from ThreadedIteratorExecutor.process(
+ worker_func=self._process_chart,
+ args_list=chart_data_list,
+ max_workers=self.config.max_threads,
+ )

  def gen_schema_fields(self, column_data: List[Dict[str, str]]) -> List[SchemaField]:
  schema_fields: List[SchemaField] = []
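emit_chart_mces now fans chart processing out over a thread pool instead of looping serially; each chart dict is wrapped in a one-element tuple because args_list holds the positional arguments for a single worker call. A minimal sketch of the same pattern with a toy worker (the import path is an assumption based on recent acryl-datahub layouts):

from typing import Dict, Iterable, List, Tuple

# Assumed import path for the helper referenced in the diff.
from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor


def process_one(payload: Dict[str, int]) -> Iterable[str]:
    # Toy worker: a generator, mirroring _process_chart / _process_dataset.
    yield f"workunit-for-{payload['id']}"


args_list: List[Tuple[Dict[str, int]]] = [({"id": i},) for i in range(5)]

# Results are yielded as workers finish, so ordering is not guaranteed.
for result in ThreadedIteratorExecutor.process(
    worker_func=process_one,
    args_list=args_list,
    max_workers=3,
):
    print(result)

Parallelizing here helps because each chart may trigger extra REST calls (for example get_dataset_info), so the work is I/O-bound.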
@@ -650,7 +1100,27 @@ class SupersetSource(StatefulIngestionSourceBase):
  fieldPath=col.get("column_name", ""),
  type=SchemaFieldDataType(data_type),
  nativeDataType="",
- description=col.get("column_name", ""),
+ description=col.get("description") or col.get("column_name", ""),
+ nullable=True,
+ )
+ schema_fields.append(field)
+ return schema_fields
+
+ def gen_metric_schema_fields(
+ self, metric_data: List[Dict[str, Any]]
+ ) -> List[SchemaField]:
+ schema_fields: List[SchemaField] = []
+ for metric in metric_data:
+ metric_type = metric.get("metric_type", "")
+ data_type = resolve_sql_type(metric_type)
+ if data_type is None:
+ data_type = NullType()
+
+ field = SchemaField(
+ fieldPath=metric.get("metric_name", ""),
+ type=SchemaFieldDataType(data_type),
+ nativeDataType=metric_type or "",
+ description=metric.get("description", ""),
  nullable=True,
  )
  schema_fields.append(field)
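gen_metric_schema_fields mirrors gen_schema_fields, so a Superset dataset's calculated metrics show up as schema fields alongside its columns, falling back to a null/unknown type when the metric type cannot be resolved. A small illustration of the input shape and the resulting field paths (the payload values are made up):

# Illustrative Superset dataset payload (shape only; values are invented).
dataset_result = {
    "columns": [
        {"column_name": "order_id", "type": "BIGINT", "description": "Order key"},
    ],
    "metrics": [
        {"metric_name": "total_revenue", "metric_type": "sum", "description": "SUM(price)"},
        {"metric_name": "order_count", "metric_type": None},
    ],
}

# Columns and metrics both become schema fields; metrics with no metric_type
# fall back to a null type, as in gen_metric_schema_fields.
field_paths = [c["column_name"] for c in dataset_result["columns"]] + [
    m["metric_name"] for m in dataset_result["metrics"]
]
print(field_paths)  # ['order_id', 'total_revenue', 'order_count']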
@@ -662,13 +1132,18 @@ class SupersetSource(StatefulIngestionSourceBase):
  ) -> SchemaMetadata:
  dataset_response = dataset_response.get("result", {})
  column_data = dataset_response.get("columns", [])
+ metric_data = dataset_response.get("metrics", [])
+
+ column_fields = self.gen_schema_fields(column_data)
+ metric_fields = self.gen_metric_schema_fields(metric_data)
+
  schema_metadata = SchemaMetadata(
  schemaName=dataset_response.get("table_name", ""),
  platform=make_data_platform_urn(self.platform),
  version=0,
  hash="",
  platformSchema=MySqlDDL(tableSchema=""),
- fields=self.gen_schema_fields(column_data),
+ fields=column_fields + metric_fields,
  )
  return schema_metadata

@@ -680,6 +1155,106 @@ class SupersetSource(StatefulIngestionSourceBase):
  env=self.config.env,
  )

+ def generate_virtual_dataset_lineage(
+ self,
+ parsed_query_object: SqlParsingResult,
+ datasource_urn: str,
+ ) -> UpstreamLineageClass:
+ cll = (
+ parsed_query_object.column_lineage
+ if parsed_query_object.column_lineage is not None
+ else []
+ )
+
+ fine_grained_lineages: List[FineGrainedLineageClass] = []
+
+ for cll_info in cll:
+ downstream = (
+ [make_schema_field_urn(datasource_urn, cll_info.downstream.column)]
+ if cll_info.downstream and cll_info.downstream.column
+ else []
+ )
+ upstreams = [
+ make_schema_field_urn(column_ref.table, column_ref.column)
+ for column_ref in cll_info.upstreams
+ ]
+ fine_grained_lineages.append(
+ FineGrainedLineageClass(
+ downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+ downstreams=downstream,
+ upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+ upstreams=upstreams,
+ )
+ )
+
+ upstream_lineage = UpstreamLineageClass(
+ upstreams=[
+ UpstreamClass(
+ type=DatasetLineageTypeClass.TRANSFORMED,
+ dataset=input_table_urn,
+ )
+ for input_table_urn in parsed_query_object.in_tables
+ ],
+ fineGrainedLineages=fine_grained_lineages,
+ )
+ return upstream_lineage
+
+ def generate_physical_dataset_lineage(
+ self,
+ dataset_response: dict,
+ upstream_dataset: str,
+ datasource_urn: str,
+ ) -> UpstreamLineageClass:
+ # To generate column level lineage, we can manually decode the metadata
+ # to produce the ColumnLineageInfo
+ columns = dataset_response.get("result", {}).get("columns", [])
+ metrics = dataset_response.get("result", {}).get("metrics", [])
+
+ fine_grained_lineages: List[FineGrainedLineageClass] = []
+
+ for column in columns:
+ column_name = column.get("column_name", "")
+ if not column_name:
+ continue
+
+ downstream = [make_schema_field_urn(datasource_urn, column_name)]
+ upstreams = [make_schema_field_urn(upstream_dataset, column_name)]
+ fine_grained_lineages.append(
+ FineGrainedLineageClass(
+ downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+ downstreams=downstream,
+ upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+ upstreams=upstreams,
+ )
+ )
+
+ for metric in metrics:
+ metric_name = metric.get("metric_name", "")
+ if not metric_name:
+ continue
+
+ downstream = [make_schema_field_urn(datasource_urn, metric_name)]
+ upstreams = [make_schema_field_urn(upstream_dataset, metric_name)]
+ fine_grained_lineages.append(
+ FineGrainedLineageClass(
+ downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+ downstreams=downstream,
+ upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+ upstreams=upstreams,
+ )
+ )
+
+ upstream_lineage = UpstreamLineageClass(
+ upstreams=[
+ UpstreamClass(
+ type=DatasetLineageTypeClass.TRANSFORMED,
+ dataset=upstream_dataset,
+ )
+ ],
+ fineGrainedLineages=fine_grained_lineages,
+ )
+ return upstream_lineage
+
  def construct_dataset_from_dataset_data(
  self, dataset_data: dict
  ) -> DatasetSnapshot:
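Both new helpers return an UpstreamLineageClass carrying fine-grained (column-level) edges; the physical variant simply maps every column and metric one-to-one onto the upstream warehouse table. A sketch of that one-to-one case with placeholder URNs and field names:

from datahub.emitter.mce_builder import make_dataset_urn, make_schema_field_urn
from datahub.metadata.schema_classes import (
    DatasetLineageTypeClass,
    FineGrainedLineageClass,
    FineGrainedLineageDownstreamTypeClass,
    FineGrainedLineageUpstreamTypeClass,
    UpstreamClass,
    UpstreamLineageClass,
)

# Placeholder URNs standing in for the Superset dataset and its warehouse table.
datasource_urn = make_dataset_urn("superset", "examples.revenue_dataset", "PROD")
upstream_dataset = make_dataset_urn("postgres", "public.revenue", "PROD")

fine_grained = [
    FineGrainedLineageClass(
        downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
        downstreams=[make_schema_field_urn(datasource_urn, field)],
        upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
        upstreams=[make_schema_field_urn(upstream_dataset, field)],
    )
    for field in ["order_id", "total_revenue"]  # columns and metrics, mapped 1:1
]

lineage = UpstreamLineageClass(
    upstreams=[
        UpstreamClass(type=DatasetLineageTypeClass.TRANSFORMED, dataset=upstream_dataset)
    ],
    fineGrainedLineages=fine_grained,
)
print(len(lineage.fineGrainedLineages or []))

The virtual variant builds the same structure, but takes its upstream tables and column references from the SQL parser result instead of assuming a 1:1 mapping.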
@@ -689,17 +1264,26 @@ class SupersetSource(StatefulIngestionSourceBase):
  datasource_urn = self.get_datasource_urn_from_id(
  dataset_response, self.platform
  )
- dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"
+ dataset_url = f"{self.config.display_uri}/explore/?datasource_type=table&datasource_id={dataset.id}"

  modified_actor = f"urn:li:corpuser:{self.owner_info.get((dataset_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+ now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
  modified_ts = int(
- dp.parse(dataset_data.get("changed_on_utc", "now")).timestamp() * 1000
+ dp.parse(dataset_data.get("changed_on_utc", now)).timestamp() * 1000
  )
  last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)

  upstream_warehouse_platform = (
  dataset_response.get("result", {}).get("database", {}).get("backend")
  )
+ upstream_warehouse_db_name = (
+ dataset_response.get("result", {}).get("database", {}).get("database_name")
+ )
+
+ # if we have rendered sql, we always use that and fall back to regular sql otherwise
+ sql = dataset_response.get("result", {}).get(
+ "rendered_sql"
+ ) or dataset_response.get("result", {}).get("sql")

  # Preset has a way of naming their platforms differently than
  # how datahub names them, so map the platform name to the correct naming
@@ -712,40 +1296,47 @@ class SupersetSource(StatefulIngestionSourceBase):
  if upstream_warehouse_platform in warehouse_naming:
  upstream_warehouse_platform = warehouse_naming[upstream_warehouse_platform]

- # TODO: Categorize physical vs virtual upstream dataset
- # mark all upstream dataset as physical for now, in the future we would ideally like
- # to differentiate physical vs virtual upstream datasets
- tag_urn = f"urn:li:tag:{self.platform}:physical"
  upstream_dataset = self.get_datasource_urn_from_id(
  dataset_response, upstream_warehouse_platform
  )
- upstream_lineage = UpstreamLineageClass(
- upstreams=[
- UpstreamClass(
- type=DatasetLineageTypeClass.TRANSFORMED,
- dataset=upstream_dataset,
- properties={"externalUrl": dataset_url},
- )
- ]
- )
+
+ # Sometimes the field will be null instead of not existing
+ if sql == "null" or not sql:
+ tag_urn = f"urn:li:tag:{self.platform}:physical"
+ upstream_lineage = self.generate_physical_dataset_lineage(
+ dataset_response, upstream_dataset, datasource_urn
+ )
+ else:
+ tag_urn = f"urn:li:tag:{self.platform}:virtual"
+ parsed_query_object = create_lineage_sql_parsed_result(
+ query=sql,
+ default_db=upstream_warehouse_db_name,
+ platform=upstream_warehouse_platform,
+ platform_instance=None,
+ env=self.config.env,
+ )
+ upstream_lineage = self.generate_virtual_dataset_lineage(
+ parsed_query_object, datasource_urn
+ )

  dataset_info = DatasetPropertiesClass(
  name=dataset.table_name,
- description="",
+ description=dataset.description or "",
  externalUrl=dataset_url,
  lastModified=TimeStamp(time=modified_ts),
  )
- global_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])

- aspects_items: List[Any] = []
- aspects_items.extend(
- [
- self.gen_schema_metadata(dataset_response),
- dataset_info,
- upstream_lineage,
- global_tags,
- ]
- )
+ dataset_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
+ tags = self._merge_tags_with_existing(datasource_urn, dataset_tags)
+
+ aspects_items: List[Any] = [
+ self.gen_schema_metadata(dataset_response),
+ dataset_info,
+ upstream_lineage,
+ ]
+
+ if tags:
+ aspects_items.append(tags)

  dataset_snapshot = DatasetSnapshot(
  urn=datasource_urn,
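Each dataset is tagged :physical or :virtual depending on whether it is SQL-backed, and _merge_tags_with_existing first reads the entity's current globalTags aspect so tags added in the DataHub UI survive re-ingestion. A sketch of that read-then-merge step against a graph connection (the server URL, URN, and tag name are placeholders):

from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig
from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))  # placeholder server
entity_urn = "urn:li:dataset:(urn:li:dataPlatform:superset,examples.revenue_dataset,PROD)"

new_tags = GlobalTagsClass(tags=[TagAssociationClass(tag="urn:li:tag:superset:virtual")])

# Same merge idea as _merge_tags_with_existing: keep what is already on the
# entity, then append only the new associations that are not present yet.
existing = graph.get_aspect(entity_urn=entity_urn, aspect_type=GlobalTagsClass)
merged = list(existing.tags) if existing and existing.tags else []
seen = {assoc.tag for assoc in merged}
merged.extend(assoc for assoc in new_tags.tags if assoc.tag not in seen)
print([assoc.tag for assoc in merged])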
@@ -767,41 +1358,134 @@ class SupersetSource(StatefulIngestionSourceBase):

  return dataset_snapshot

- def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
- for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE):
- try:
- dataset_name = dataset_data.get("table_name", "")
+ def _extract_and_map_tags(
+ self, raw_tags: List[Dict[str, Any]]
+ ) -> Optional[GlobalTagsClass]:
+ """Extract and map Superset tags to DataHub GlobalTagsClass.

- # Check if dataset should be filtered by dataset name
- if not self.config.dataset_pattern.allowed(dataset_name):
- self.report.report_dropped(
- f"Dataset '{dataset_name}' filtered by dataset_pattern"
- )
- continue
+ Filters out system-generated tags (type != 1) and only processes user-defined tags
+ from the Superset API response.
+
+ Args:
+ raw_tags: List of tag dictionaries from Superset API

- dataset_snapshot = self.construct_dataset_from_dataset_data(
- dataset_data
+ Returns:
+ GlobalTagsClass with user-defined tags, or None if no tags found
+ """
+ user_tags = [
+ tag.get("name", "")
+ for tag in raw_tags
+ if tag.get("type") == 1 and tag.get("name")
+ ]
+
+ if not user_tags:
+ return None
+
+ tag_urns = [builder.make_tag_urn(tag) for tag in user_tags]
+ return GlobalTagsClass(
+ tags=[TagAssociationClass(tag=tag_urn) for tag_urn in tag_urns]
+ )
+
+ def _merge_tags_with_existing(
+ self, entity_urn: str, new_tags: Optional[GlobalTagsClass]
+ ) -> Optional[GlobalTagsClass]:
+ """Merge new tags with existing ones from DataHub to preserve manually added tags.
+
+ This method ensures that tags manually added via DataHub UI are not overwritten
+ during ingestion. It fetches existing tags from the graph and merges them with
+ new tags from the source system, avoiding duplicates.
+
+ Args:
+ entity_urn: URN of the entity to check for existing tags
+ new_tags: New tags to add as GlobalTagsClass object
+
+ Returns:
+ GlobalTagsClass with merged tags preserving existing ones, or None if no tags
+ """
+ if not new_tags or not new_tags.tags:
+ return None
+
+ # Fetch existing tags from DataHub
+ existing_global_tags = None
+ if self.ctx.graph:
+ existing_global_tags = self.ctx.graph.get_aspect(
+ entity_urn=entity_urn, aspect_type=GlobalTagsClass
+ )
+
+ # Merge existing tags with new ones, avoiding duplicates
+ all_tags = []
+ existing_tag_urns = set()
+
+ if existing_global_tags and existing_global_tags.tags:
+ all_tags.extend(existing_global_tags.tags)
+ existing_tag_urns = {tag.tag for tag in existing_global_tags.tags}
+
+ # Add new tags that don't already exist
+ for new_tag in new_tags.tags:
+ if new_tag.tag not in existing_tag_urns:
+ all_tags.append(new_tag)
+
+ return GlobalTagsClass(tags=all_tags) if all_tags else None
+
+ def _process_dataset(self, dataset_data: Any) -> Iterable[MetadataWorkUnit]:
+ dataset_name = ""
+ try:
+ dataset_id = dataset_data.get("id")
+ dataset_name = dataset_data.get("table_name", "")
+ if not self.config.dataset_pattern.allowed(dataset_name):
+ self.report.report_dropped(
+ f"Dataset '{dataset_name}' filtered by dataset_pattern"
  )
- mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
- except Exception as e:
- self.report.warning(
- f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
+ return
+ if self.config.database_pattern != AllowDenyPattern.allow_all():
+ dataset_response = self.get_dataset_info(dataset_id)
+ database_name = (
+ dataset_response.get("result", {})
+ .get("database", {})
+ .get("database_name")
  )
- continue
- # Emit the dataset
- yield MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
- yield from self._get_domain_wu(
- title=dataset_data.get("table_name", ""),
- entity_urn=dataset_snapshot.urn,
+
+ if database_name and not self.config.database_pattern.allowed(
+ database_name
+ ):
+ self.filtered_dataset_to_database[dataset_id] = database_name
+ self.report.report_dropped(
+ f"Dataset '{dataset_name}' filtered by database_pattern with database '{database_name}'"
+ )
+ return
+
+ dataset_snapshot = self.construct_dataset_from_dataset_data(dataset_data)
+ mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
+ except Exception as e:
+ self.report.warning(
+ f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
  )
+ return
+ yield MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
+ yield from self._get_domain_wu(
+ title=dataset_data.get("table_name", ""),
+ entity_urn=dataset_snapshot.urn,
+ )
+
+ def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
+ dataset_data_list = [
+ (dataset_data,)
+ for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE)
+ ]
+ yield from ThreadedIteratorExecutor.process(
+ worker_func=self._process_dataset,
+ args_list=dataset_data_list,
+ max_workers=self.config.max_threads,
+ )

  def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
- if self.config.ingest_dashboards:
- yield from self.emit_dashboard_mces()
- if self.config.ingest_charts:
- yield from self.emit_chart_mces()
+ # TODO: Possibly change ingestion order to minimize API calls
  if self.config.ingest_datasets:
  yield from self.emit_dataset_mces()
+ if self.config.ingest_charts:
+ yield from self.emit_chart_mces()
+ if self.config.ingest_dashboards:
+ yield from self.emit_dashboard_mces()

  def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
  return [