acryl-datahub 1.1.1rc3__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (226)
  1. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2559 -2532
  2. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +226 -190
  3. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +2 -1
  7. datahub/api/entities/external/__init__.py +0 -0
  8. datahub/api/entities/external/external_entities.py +239 -0
  9. datahub/api/entities/external/external_tag.py +145 -0
  10. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  11. datahub/api/entities/external/restricted_text.py +247 -0
  12. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  13. datahub/cli/check_cli.py +88 -7
  14. datahub/cli/cli_utils.py +63 -0
  15. datahub/cli/container_cli.py +5 -0
  16. datahub/cli/delete_cli.py +124 -27
  17. datahub/cli/docker_check.py +107 -12
  18. datahub/cli/docker_cli.py +149 -227
  19. datahub/cli/exists_cli.py +0 -2
  20. datahub/cli/get_cli.py +0 -2
  21. datahub/cli/iceberg_cli.py +5 -0
  22. datahub/cli/ingest_cli.py +12 -16
  23. datahub/cli/migrate.py +2 -0
  24. datahub/cli/put_cli.py +1 -4
  25. datahub/cli/quickstart_versioning.py +50 -7
  26. datahub/cli/specific/assertions_cli.py +0 -4
  27. datahub/cli/specific/datacontract_cli.py +0 -3
  28. datahub/cli/specific/dataproduct_cli.py +0 -11
  29. datahub/cli/specific/dataset_cli.py +1 -8
  30. datahub/cli/specific/forms_cli.py +0 -4
  31. datahub/cli/specific/group_cli.py +0 -2
  32. datahub/cli/specific/structuredproperties_cli.py +1 -4
  33. datahub/cli/specific/user_cli.py +0 -2
  34. datahub/cli/state_cli.py +0 -2
  35. datahub/cli/timeline_cli.py +0 -2
  36. datahub/emitter/response_helper.py +86 -1
  37. datahub/emitter/rest_emitter.py +71 -13
  38. datahub/entrypoints.py +4 -3
  39. datahub/ingestion/api/decorators.py +15 -3
  40. datahub/ingestion/api/report.py +332 -3
  41. datahub/ingestion/api/sink.py +3 -0
  42. datahub/ingestion/api/source.py +48 -44
  43. datahub/ingestion/autogenerated/__init__.py +0 -0
  44. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  45. datahub/ingestion/autogenerated/lineage.json +401 -0
  46. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  47. datahub/ingestion/extractor/schema_util.py +13 -4
  48. datahub/ingestion/glossary/classification_mixin.py +5 -0
  49. datahub/ingestion/graph/client.py +100 -15
  50. datahub/ingestion/graph/config.py +1 -0
  51. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  52. datahub/ingestion/run/pipeline.py +54 -2
  53. datahub/ingestion/sink/datahub_rest.py +13 -0
  54. datahub/ingestion/source/abs/source.py +1 -1
  55. datahub/ingestion/source/aws/aws_common.py +4 -0
  56. datahub/ingestion/source/aws/glue.py +489 -244
  57. datahub/ingestion/source/aws/tag_entities.py +292 -0
  58. datahub/ingestion/source/azure/azure_common.py +2 -2
  59. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  60. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  61. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  62. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  63. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  64. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  65. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  66. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  67. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  68. datahub/ingestion/source/common/subtypes.py +45 -0
  69. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  70. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  71. datahub/ingestion/source/datahub/config.py +11 -0
  72. datahub/ingestion/source/datahub/datahub_database_reader.py +187 -35
  73. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  74. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  75. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  76. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  77. datahub/ingestion/source/debug/__init__.py +0 -0
  78. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  79. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  80. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  81. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  82. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  83. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  84. datahub/ingestion/source/file.py +3 -0
  85. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  86. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  87. datahub/ingestion/source/ge_data_profiler.py +76 -28
  88. datahub/ingestion/source/ge_profiling_config.py +11 -0
  89. datahub/ingestion/source/hex/api.py +26 -1
  90. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  91. datahub/ingestion/source/identity/azure_ad.py +1 -1
  92. datahub/ingestion/source/identity/okta.py +1 -14
  93. datahub/ingestion/source/kafka/kafka.py +16 -0
  94. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  95. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  96. datahub/ingestion/source/looker/looker_source.py +1 -0
  97. datahub/ingestion/source/mlflow.py +11 -1
  98. datahub/ingestion/source/mock_data/__init__.py +0 -0
  99. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  100. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  101. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  102. datahub/ingestion/source/nifi.py +1 -1
  103. datahub/ingestion/source/openapi.py +12 -0
  104. datahub/ingestion/source/openapi_parser.py +56 -37
  105. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  106. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  107. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  108. datahub/ingestion/source/preset.py +2 -2
  109. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  110. datahub/ingestion/source/redshift/redshift.py +21 -1
  111. datahub/ingestion/source/redshift/usage.py +4 -3
  112. datahub/ingestion/source/s3/report.py +4 -2
  113. datahub/ingestion/source/s3/source.py +367 -115
  114. datahub/ingestion/source/sac/sac.py +3 -1
  115. datahub/ingestion/source/salesforce.py +6 -3
  116. datahub/ingestion/source/sigma/sigma.py +7 -1
  117. datahub/ingestion/source/slack/slack.py +2 -1
  118. datahub/ingestion/source/snowflake/snowflake_config.py +43 -7
  119. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  120. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  121. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  122. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  123. datahub/ingestion/source/snowflake/snowflake_v2.py +33 -8
  124. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  125. datahub/ingestion/source/sql/athena.py +119 -11
  126. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  127. datahub/ingestion/source/sql/clickhouse.py +3 -1
  128. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  129. datahub/ingestion/source/sql/hana.py +3 -1
  130. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  131. datahub/ingestion/source/sql/mariadb.py +0 -1
  132. datahub/ingestion/source/sql/mssql/source.py +239 -34
  133. datahub/ingestion/source/sql/mysql.py +0 -1
  134. datahub/ingestion/source/sql/oracle.py +1 -1
  135. datahub/ingestion/source/sql/postgres.py +0 -1
  136. datahub/ingestion/source/sql/sql_common.py +121 -34
  137. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  138. datahub/ingestion/source/sql/teradata.py +997 -235
  139. datahub/ingestion/source/sql/vertica.py +10 -6
  140. datahub/ingestion/source/sql_queries.py +2 -2
  141. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  142. datahub/ingestion/source/superset.py +58 -3
  143. datahub/ingestion/source/tableau/tableau.py +58 -37
  144. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  145. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  146. datahub/ingestion/source/unity/config.py +5 -0
  147. datahub/ingestion/source/unity/proxy.py +118 -0
  148. datahub/ingestion/source/unity/source.py +195 -17
  149. datahub/ingestion/source/unity/tag_entities.py +295 -0
  150. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  151. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  152. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  153. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  154. datahub/metadata/_internal_schema_classes.py +1446 -559
  155. datahub/metadata/_urns/urn_defs.py +1721 -1553
  156. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  158. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  159. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  160. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  161. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  162. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  163. datahub/metadata/schema.avsc +18055 -17802
  164. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  165. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  166. datahub/metadata/schemas/Applications.avsc +38 -0
  167. datahub/metadata/schemas/ChartKey.avsc +1 -0
  168. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  169. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  170. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  171. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  172. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  175. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  176. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  177. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  178. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  179. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  180. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  181. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  182. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  183. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  184. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  185. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  186. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  187. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  188. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  189. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  190. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  191. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  192. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  193. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  194. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  195. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  196. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  197. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  198. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  199. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  200. datahub/sdk/__init__.py +6 -0
  201. datahub/sdk/_all_entities.py +11 -0
  202. datahub/sdk/_shared.py +118 -1
  203. datahub/sdk/chart.py +315 -0
  204. datahub/sdk/container.py +7 -0
  205. datahub/sdk/dashboard.py +432 -0
  206. datahub/sdk/dataflow.py +309 -0
  207. datahub/sdk/datajob.py +367 -0
  208. datahub/sdk/dataset.py +8 -2
  209. datahub/sdk/entity_client.py +90 -2
  210. datahub/sdk/lineage_client.py +683 -82
  211. datahub/sdk/main_client.py +46 -16
  212. datahub/sdk/mlmodel.py +101 -38
  213. datahub/sdk/mlmodelgroup.py +7 -0
  214. datahub/sdk/search_client.py +4 -3
  215. datahub/specific/chart.py +1 -1
  216. datahub/specific/dataproduct.py +4 -0
  217. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  218. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  219. datahub/telemetry/telemetry.py +17 -11
  220. datahub/testing/sdk_v2_helpers.py +7 -1
  221. datahub/upgrade/upgrade.py +46 -13
  222. datahub/utilities/server_config_util.py +8 -0
  223. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  224. datahub/utilities/stats_collections.py +4 -0
  225. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
  226. {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
@@ -25,6 +25,10 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.data_reader import DataReader
+from datahub.ingestion.source.common.subtypes import (
+    DatasetSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     SqlWorkUnit,
@@ -41,7 +45,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     DatasetLineageTypeClass,
     DatasetPropertiesClass,
     SubTypesClass,
@@ -113,10 +116,14 @@ class VerticaConfig(BasicSQLAlchemyConfig):
 @capability(
     SourceCapability.LINEAGE_COARSE,
     "Enabled by default, can be disabled via configuration `include_view_lineage` and `include_projection_lineage`",
+    subtype_modifier=[
+        SourceCapabilityModifier.VIEW,
+        SourceCapabilityModifier.PROJECTIONS,
+    ],
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class VerticaSource(SQLAlchemySource):
@@ -493,11 +500,8 @@ class VerticaSource(SQLAlchemySource):
         if dpi_aspect:
             yield dpi_aspect
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="subTypes",
-            aspect=SubTypesClass(typeNames=["Projections"]),
+            aspect=SubTypesClass(typeNames=[DatasetSubTypes.PROJECTIONS]),
         ).as_workunit()

         if self.config.domain:
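Context for the trimmed MetadataChangeProposalWrapper calls (not part of the diff): the wrapper infers the entity type from the URN and the aspect name from the aspect class, and defaults to an UPSERT change, so only the URN and aspect are needed. A minimal sketch, assuming the standard import paths:

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import SubTypesClass

# entityType, aspectName, and changeType (UPSERT) are inferred, so the shorter
# call in this release is equivalent to the old explicit one.
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:vertica,db.schema.my_projection,PROD)",  # illustrative URN
    aspect=SubTypesClass(typeNames=["Projections"]),
)
work_unit = mcp.as_workunit()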
@@ -66,7 +66,7 @@ class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
         description="The default schema to use for unqualified table names",
         default=None,
     )
-    default_dialect: Optional[str] = Field(
+    override_dialect: Optional[str] = Field(
         description="The SQL dialect to use when parsing queries. Overrides automatic dialect detection.",
         default=None,
     )
@@ -181,7 +181,7 @@ class SqlQueriesSource(Source):
             schema_resolver=self.schema_resolver,
             default_db=self.config.default_db,
             default_schema=self.config.default_schema,
-            default_dialect=self.config.default_dialect,
+            override_dialect=self.config.override_dialect,
         )
         if result.debug_info.table_error:
             logger.info(f"Error parsing table lineage, {result.debug_info.table_error}")
@@ -179,7 +179,7 @@ class StatefulIngestionReport(SourceReport):

 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class StatefulIngestionSourceBase(Source):
@@ -272,7 +272,7 @@ def get_filter_name(filter_obj):
 @config_class(SupersetConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(
-    SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 @capability(SourceCapability.DOMAINS, "Enabled by `domain` config to assign domain_key")
 @capability(SourceCapability.LINEAGE_COARSE, "Supported by default")
@@ -658,6 +658,7 @@ class SupersetSource(StatefulIngestionSourceBase):
         if datasource_id:
             dataset_info = self.get_dataset_info(datasource_id).get("result", {})
             dataset_column_info = dataset_info.get("columns", [])
+            dataset_metric_info = dataset_info.get("metrics", [])

             for column in dataset_column_info:
                 col_name = column.get("column_name", "")
@@ -671,6 +672,17 @@ class SupersetSource(StatefulIngestionSourceBase):
                     continue

                 dataset_columns.append((col_name, col_type, col_description))
+
+            for metric in dataset_metric_info:
+                metric_name = metric.get("metric_name", "")
+                metric_type = metric.get("metric_type", "")
+                metric_description = metric.get("description", "")
+
+                if metric_name == "" or metric_type == "":
+                    logger.info(f"could not construct metric lineage for {metric}")
+                    continue
+
+                dataset_columns.append((metric_name, metric_type, metric_description))
         else:
             # if no datasource id, cannot build cll, just return
             logger.warning(
@@ -972,19 +984,44 @@ class SupersetSource(StatefulIngestionSourceBase):
             schema_fields.append(field)
         return schema_fields

+    def gen_metric_schema_fields(
+        self, metric_data: List[Dict[str, Any]]
+    ) -> List[SchemaField]:
+        schema_fields: List[SchemaField] = []
+        for metric in metric_data:
+            metric_type = metric.get("metric_type", "")
+            data_type = resolve_sql_type(metric_type)
+            if data_type is None:
+                data_type = NullType()
+
+            field = SchemaField(
+                fieldPath=metric.get("metric_name", ""),
+                type=SchemaFieldDataType(data_type),
+                nativeDataType=metric_type or "",
+                description=metric.get("description", ""),
+                nullable=True,
+            )
+            schema_fields.append(field)
+        return schema_fields
+
     def gen_schema_metadata(
         self,
         dataset_response: dict,
     ) -> SchemaMetadata:
         dataset_response = dataset_response.get("result", {})
         column_data = dataset_response.get("columns", [])
+        metric_data = dataset_response.get("metrics", [])
+
+        column_fields = self.gen_schema_fields(column_data)
+        metric_fields = self.gen_metric_schema_fields(metric_data)
+
         schema_metadata = SchemaMetadata(
             schemaName=dataset_response.get("table_name", ""),
             platform=make_data_platform_urn(self.platform),
             version=0,
             hash="",
             platformSchema=MySqlDDL(tableSchema=""),
-            fields=self.gen_schema_fields(column_data),
+            fields=column_fields + metric_fields,
         )
         return schema_metadata

@@ -1049,6 +1086,8 @@ class SupersetSource(StatefulIngestionSourceBase):
         # To generate column level lineage, we can manually decode the metadata
         # to produce the ColumnLineageInfo
         columns = dataset_response.get("result", {}).get("columns", [])
+        metrics = dataset_response.get("result", {}).get("metrics", [])
+
         fine_grained_lineages: List[FineGrainedLineageClass] = []

         for column in columns:
@@ -1067,6 +1106,22 @@ class SupersetSource(StatefulIngestionSourceBase):
                 )
             )

+        for metric in metrics:
+            metric_name = metric.get("metric_name", "")
+            if not metric_name:
+                continue
+
+            downstream = [make_schema_field_urn(datasource_urn, metric_name)]
+            upstreams = [make_schema_field_urn(upstream_dataset, metric_name)]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
+
         upstream_lineage = UpstreamLineageClass(
             upstreams=[
                 UpstreamClass(
@@ -1087,7 +1142,7 @@ class SupersetSource(StatefulIngestionSourceBase):
         datasource_urn = self.get_datasource_urn_from_id(
             dataset_response, self.platform
         )
-        dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"
+        dataset_url = f"{self.config.display_uri}/explore/?datasource_type=table&datasource_id={dataset.id}"

         modified_actor = f"urn:li:corpuser:{self.owner_info.get((dataset_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
         now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
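Illustration (not part of the diff) of what the new Superset metric handling does: a metric entry from the dataset API is mapped to a SchemaField alongside the regular columns. The payload below is hypothetical; metric types such as "SUM" usually won't resolve to a SQL type, so the field falls back to NullType.

metric_data = [
    {
        "metric_name": "total_revenue",
        "metric_type": "SUM",
        "description": "Sum of order revenue",
    }
]
# fields = superset_source.gen_metric_schema_fields(metric_data)
# -> one SchemaField with fieldPath="total_revenue", nativeDataType="SUM",
#    a NullType data type (resolve_sql_type returned None), and nullable=True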
@@ -80,6 +80,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import (
     BIContainerSubTypes,
     DatasetSubTypes,
+    SourceCapabilityModifier,
 )
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -148,7 +149,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
 )
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
-    ChangeTypeClass,
     ChartInfoClass,
     ChartUsageStatisticsClass,
     DashboardInfoClass,
@@ -528,6 +528,14 @@ class TableauConfig(
         default=False,
         description="Ingest details for tables external to (not embedded in) tableau as entities.",
     )
+    emit_all_published_datasources: bool = Field(
+        default=False,
+        description="Ingest all published data sources. When False (default), only ingest published data sources that belong to an ingested workbook.",
+    )
+    emit_all_embedded_datasources: bool = Field(
+        default=False,
+        description="Ingest all embedded data sources. When False (default), only ingest embedded data sources that belong to an ingested workbook.",
+    )

     env: str = Field(
         default=builder.DEFAULT_ENV,
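For recipe authors (not part of the diff): a hypothetical Tableau recipe fragment, written as a Python dict, showing the two new toggles; the connection details are illustrative.

source_config = {
    "connect_uri": "https://tableau.example.com",  # illustrative
    "emit_all_published_datasources": True,   # default False: only data sources used by ingested workbooks
    "emit_all_embedded_datasources": False,   # default False
}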
@@ -861,16 +869,29 @@ def report_user_role(report: TableauSourceReport, server: Server) -> None:
 @platform_name("Tableau")
 @config_class(TableauConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.TABLEAU_PROJECT,
+        SourceCapabilityModifier.TABLEAU_SITE,
+        SourceCapabilityModifier.TABLEAU_WORKBOOK,
+    ],
+)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Requires transformer", supported=False)
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 @capability(
     SourceCapability.USAGE_STATS,
     "Dashboard/Chart view counts, enabled using extract_usage_stats config",
+    subtype_modifier=[
+        SourceCapabilityModifier.DASHBOARD,
+        SourceCapabilityModifier.CHART,
+    ],
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Enabled by default when stateful ingestion is turned on.",
+    "Enabled by default via stateful ingestion.",
 )
 @capability(SourceCapability.OWNERSHIP, "Requires recipe configuration")
 @capability(SourceCapability.TAGS, "Requires recipe configuration")
@@ -879,6 +900,7 @@ def report_user_role(report: TableauSourceReport, server: Server) -> None:
     SourceCapability.LINEAGE_FINE,
     "Enabled by default, configure using `extract_column_level_lineage`",
 )
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class TableauSource(StatefulIngestionSourceBase, TestableSource):
     platform = "tableau"

@@ -2174,32 +2196,32 @@ class TableauSiteSource:
             else []
         )

-        # The Tableau SQL parser much worse than our sqlglot based parser,
-        # so relying on metadata parsed by Tableau from SQL queries can be
-        # less accurate. This option allows us to ignore Tableau's parser and
-        # only use our own.
-        if self.config.force_extraction_of_lineage_from_custom_sql_queries:
-            logger.debug("Extracting TLL & CLL from custom sql (forced)")
+        tableau_table_list = csql.get(c.TABLES, [])
+        if self.config.force_extraction_of_lineage_from_custom_sql_queries or (
+            not tableau_table_list
+            and self.config.extract_lineage_from_unsupported_custom_sql_queries
+        ):
+            if not tableau_table_list:
+                # custom sql tables may contain unsupported sql, causing incomplete lineage
+                # we extract the lineage from the raw queries
+                logger.debug(
+                    "Parsing TLL & CLL from custom sql (tableau metadata incomplete)"
+                )
+            else:
+                # The Tableau SQL parser is much worse than our sqlglot based parser,
+                # so relying on metadata parsed by Tableau from SQL queries can be
+                # less accurate. This option allows us to ignore Tableau's parser and
+                # only use our own.
+                logger.debug("Parsing TLL & CLL from custom sql (forced)")
+
             yield from self._create_lineage_from_unsupported_csql(
                 csql_urn, csql, columns
             )
         else:
-            tables = csql.get(c.TABLES, [])
-
-            if tables:
-                # lineage from custom sql -> datasets/tables #
-                yield from self._create_lineage_to_upstream_tables(
-                    csql_urn, tables, datasource
-                )
-            elif (
-                self.config.extract_lineage_from_unsupported_custom_sql_queries
-            ):
-                logger.debug("Extracting TLL & CLL from custom sql")
-                # custom sql tables may contain unsupported sql, causing incomplete lineage
-                # we extract the lineage from the raw queries
-                yield from self._create_lineage_from_unsupported_csql(
-                    csql_urn, csql, columns
-                )
+            # lineage from custom sql -> datasets/tables #
+            yield from self._create_lineage_to_upstream_tables(
+                csql_urn, tableau_table_list, datasource
+            )

         # Schema Metadata
         schema_metadata = self.get_schema_metadata_for_custom_sql(columns)
@@ -2237,7 +2259,6 @@ class TableauSiteSource:
         yield self.get_metadata_change_event(dataset_snapshot)
         yield self.get_metadata_change_proposal(
             dataset_snapshot.urn,
-            aspect_name=c.SUB_TYPES,
             aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW, c.CUSTOM_SQL]),
         )

@@ -2402,7 +2423,6 @@ class TableauSiteSource:
         upstream_lineage = UpstreamLineage(upstreams=upstream_tables)
         yield self.get_metadata_change_proposal(
             csql_urn,
-            aspect_name=c.UPSTREAM_LINEAGE,
             aspect=upstream_lineage,
         )
         self.report.num_tables_with_upstream_lineage += 1
@@ -2588,7 +2608,6 @@ class TableauSiteSource:
         )
         yield self.get_metadata_change_proposal(
             csql_urn,
-            aspect_name=c.UPSTREAM_LINEAGE,
             aspect=upstream_lineage,
         )
         self.report.num_tables_with_upstream_lineage += 1
@@ -2634,14 +2653,10 @@ class TableauSiteSource:
     def get_metadata_change_proposal(
         self,
         urn: str,
-        aspect_name: str,
         aspect: Union["UpstreamLineage", "SubTypesClass"],
     ) -> MetadataWorkUnit:
         return MetadataChangeProposalWrapper(
-            entityType=c.DATASET,
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=urn,
-            aspectName=aspect_name,
             aspect=aspect,
         ).as_workunit()

@@ -2749,7 +2764,6 @@ class TableauSiteSource:
         )
         yield self.get_metadata_change_proposal(
             datasource_urn,
-            aspect_name=c.UPSTREAM_LINEAGE,
             aspect=upstream_lineage,
         )
         self.report.num_tables_with_upstream_lineage += 1
@@ -2768,7 +2782,6 @@ class TableauSiteSource:
         yield self.get_metadata_change_event(dataset_snapshot)
         yield self.get_metadata_change_proposal(
             dataset_snapshot.urn,
-            aspect_name=c.SUB_TYPES,
             aspect=SubTypesClass(
                 typeNames=(
                     ["Embedded Data Source"]
@@ -2854,7 +2867,11 @@ class TableauSiteSource:
         return datasource

     def emit_published_datasources(self) -> Iterable[MetadataWorkUnit]:
-        datasource_filter = {c.ID_WITH_IN: self.datasource_ids_being_used}
+        datasource_filter = (
+            {}
+            if self.config.emit_all_published_datasources
+            else {c.ID_WITH_IN: self.datasource_ids_being_used}
+        )

         for datasource in self.get_connection_objects(
             query=published_datasource_graphql_query,
@@ -3547,7 +3564,11 @@ class TableauSiteSource:
         return browse_paths

     def emit_embedded_datasources(self) -> Iterable[MetadataWorkUnit]:
-        datasource_filter = {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}
+        datasource_filter = (
+            {}
+            if self.config.emit_all_embedded_datasources
+            else {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}
+        )

         for datasource in self.get_connection_objects(
             query=embedded_datasource_graphql_query,
@@ -3659,7 +3680,7 @@ class TableauSiteSource:
             container_key=project_key,
             name=project_.name,
             description=project_.description,
-            sub_types=[c.PROJECT],
+            sub_types=[BIContainerSubTypes.TABLEAU_PROJECT],
             parent_container_key=parent_project_key,
         )

@@ -3677,7 +3698,7 @@ class TableauSiteSource:
         yield from gen_containers(
             container_key=self.gen_site_key(self.site_id),
             name=self.site.name or "Default",
-            sub_types=[c.SITE],
+            sub_types=[BIContainerSubTypes.TABLEAU_SITE],
         )

     def _fetch_groups(self):
@@ -579,10 +579,12 @@ def get_platform(connection_type: str) -> str:
         platform = "oracle"
     elif connection_type in ("tbio", "teradata"):
         platform = "teradata"
-    elif connection_type in ("sqlserver"):
+    elif connection_type in ("sqlserver",):
         platform = "mssql"
-    elif connection_type in ("athena"):
+    elif connection_type in ("athena",):
        platform = "athena"
+    elif connection_type in ("googlebigquery",):
+        platform = "bigquery"
     elif connection_type.endswith("_jdbc"):
         # e.g. convert trino_jdbc -> trino
         platform = connection_type[: -len("_jdbc")]
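Why the trailing commas matter (illustrative sketch, not from the package): without the comma, ("sqlserver") is just a parenthesized string, so `in` performs a substring test rather than tuple membership.

>>> "sql" in ("sqlserver")      # plain string -> substring check
True
>>> "sql" in ("sqlserver",)     # one-element tuple -> membership check
False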
@@ -50,7 +50,6 @@ TABLES = "tables"
 DESCRIPTION = "description"
 SQL = "SQL"
 QUERY = "query"
-SUB_TYPES = "subTypes"
 VIEW = "view"
 CUSTOM_SQL = "Custom SQL"
 REMOTE_TYPE = "remoteType"
@@ -58,7 +57,6 @@ UNKNOWN = "UNKNOWN"
 PUBLISHED_DATA_SOURCE = "PublishedDatasource"
 LUID = "luid"
 EMBEDDED_DATA_SOURCE = "EmbeddedDatasource"
-UPSTREAM_LINEAGE = "upstreamLineage"
 OWNER = "owner"
 USERNAME = "username"
 HAS_EXTRACTS = "hasExtracts"
@@ -78,8 +76,6 @@ CHART = "chart"
 DASHBOARD = "dashboard"
 DASHBOARDS_CONNECTION = "dashboardsConnection"
 EMBEDDED_DATA_SOURCES_CONNECTION = "embeddedDatasourcesConnection"
-PROJECT = "Project"
-SITE = "Site"
 IS_UNSUPPORTED_CUSTOM_SQL = "isUnsupportedCustomSql"
 SITE_PERMISSION = "sitePermission"
 ROLE_SITE_ADMIN_EXPLORER = "SiteAdministratorExplorer"
@@ -229,6 +229,11 @@ class UnityCatalogSourceConfig(
         description="Option to enable/disable ownership generation for metastores, catalogs, schemas, and tables.",
     )

+    include_tags: bool = pydantic.Field(
+        default=True,
+        description="Option to enable/disable column/table tag extraction.",
+    )
+
     _rename_table_ownership = pydantic_renamed_field(
         "include_table_ownership", "include_ownership"
    )
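For recipe authors (not part of the diff): tag extraction is on by default. A hypothetical unity-catalog recipe fragment, as a Python dict, turning it off; the connection keys are illustrative.

source_config = {
    "workspace_url": "https://example.cloud.databricks.com",
    "token": "${DATABRICKS_TOKEN}",
    "include_tags": False,   # default True
}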
@@ -8,6 +8,8 @@ from datetime import datetime
 from typing import Any, Dict, Iterable, List, Optional, Union, cast
 from unittest.mock import patch

+import cachetools
+from cachetools import cached
 from databricks.sdk import WorkspaceClient
 from databricks.sdk.service.catalog import (
     CatalogInfo,
@@ -25,8 +27,10 @@ from databricks.sdk.service.sql import (
     QueryStatus,
 )
 from databricks.sdk.service.workspace import ObjectType
+from databricks.sql import connect

 from datahub._version import nice_version_name
+from datahub.api.entities.external.unity_catalog_external_entites import UnityCatalogTag
 from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.source.unity.hive_metastore_proxy import HiveMetastoreProxy
 from datahub.ingestion.source.unity.proxy_profiling import (
@@ -108,6 +112,13 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
         self.warehouse_id = warehouse_id or ""
         self.report = report
         self.hive_metastore_proxy = hive_metastore_proxy
+        self._sql_connection_params = {
+            "server_hostname": self._workspace_client.config.host.replace(
+                "https://", ""
+            ),
+            "http_path": f"/sql/1.0/warehouses/{self.warehouse_id}",
+            "access_token": self._workspace_client.config.token,
+        }

     def check_basic_connectivity(self) -> bool:
         return bool(self._workspace_client.catalogs.list(include_browse=True))
@@ -492,3 +503,110 @@
             executed_as_user_id=info.executed_as_user_id,
             executed_as_user_name=info.executed_as_user_name,
         )
+
+    def _execute_sql_query(self, query: str) -> List[List[str]]:
+        """Execute SQL query using databricks-sql connector for better performance"""
+        try:
+            with (
+                connect(**self._sql_connection_params) as connection,
+                connection.cursor() as cursor,
+            ):
+                cursor.execute(query)
+                return cursor.fetchall()
+
+        except Exception as e:
+            logger.warning(f"Failed to execute SQL query: {e}")
+            return []
+
+    @cached(cachetools.FIFOCache(maxsize=100))
+    def get_schema_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching schema tags for catalog: {catalog}")
+
+        query = f"SELECT * FROM {catalog}.information_schema.schema_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, schema_name, tag_name, tag_value = row
+            schema_key = f"{catalog_name}.{schema_name}"
+
+            if schema_key not in result_dict:
+                result_dict[schema_key] = []
+
+            result_dict[schema_key].append(
+                UnityCatalogTag(key=tag_name, value=tag_value)
+            )
+
+        return result_dict
+
+    @cached(cachetools.FIFOCache(maxsize=100))
+    def get_catalog_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching table tags for catalog: {catalog}")
+
+        query = f"SELECT * FROM {catalog}.information_schema.catalog_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, tag_name, tag_value = row
+
+            if catalog_name not in result_dict:
+                result_dict[catalog_name] = []
+
+            result_dict[catalog_name].append(
+                UnityCatalogTag(key=tag_name, value=tag_value)
+            )
+
+        return result_dict
+
+    @cached(cachetools.FIFOCache(maxsize=100))
+    def get_table_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching table tags for catalog: {catalog}")
+
+        query = f"SELECT * FROM {catalog}.information_schema.table_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, schema_name, table_name, tag_name, tag_value = row
+            table_key = f"{catalog_name}.{schema_name}.{table_name}"
+
+            if table_key not in result_dict:
+                result_dict[table_key] = []
+
+            result_dict[table_key].append(
+                UnityCatalogTag(key=tag_name, value=tag_value if tag_value else None)
+            )
+
+        return result_dict
+
+    @cached(cachetools.FIFOCache(maxsize=100))
+    def get_column_tags(self, catalog: str) -> Dict[str, List[UnityCatalogTag]]:
+        """Optimized version using databricks-sql"""
+        logger.info(f"Fetching column tags for catalog: {catalog}")
+
+        query = f"SELECT * FROM {catalog}.information_schema.column_tags"
+        rows = self._execute_sql_query(query)
+
+        result_dict: Dict[str, List[UnityCatalogTag]] = {}
+
+        for row in rows:
+            catalog_name, schema_name, table_name, column_name, tag_name, tag_value = (
+                row
+            )
+            column_key = f"{catalog_name}.{schema_name}.{table_name}.{column_name}"
+
+            if column_key not in result_dict:
+                result_dict[column_key] = []
+
+            result_dict[column_key].append(
+                UnityCatalogTag(key=tag_name, value=tag_value if tag_value else None)
+            )
+
+        return result_dict
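Usage sketch (not part of the diff), assuming an already-constructed UnityCatalogApiProxy: each get_*_tags call issues one information_schema query per catalog and is memoized in a shared FIFO cache keyed on the proxy instance and catalog name, so repeated lookups for the same catalog reuse the first result.

# assume `proxy` is an already-constructed UnityCatalogApiProxy
table_tags = proxy.get_table_tags("main")   # one SQL query, then FIFO-cached
for tag in table_tags.get("main.analytics.orders", []):   # hypothetical table key
    print(tag.key, tag.value)               # UnityCatalogTag entries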