acryl-datahub 1.1.0.4rc2__py3-none-any.whl → 1.1.0.5__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (156)
  1. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/METADATA +2528 -2530
  2. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/RECORD +156 -138
  3. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/cli/check_cli.py +65 -11
  7. datahub/cli/cli_utils.py +63 -0
  8. datahub/cli/container_cli.py +5 -0
  9. datahub/cli/delete_cli.py +3 -4
  10. datahub/cli/docker_check.py +107 -12
  11. datahub/cli/docker_cli.py +149 -227
  12. datahub/cli/exists_cli.py +0 -2
  13. datahub/cli/get_cli.py +0 -2
  14. datahub/cli/iceberg_cli.py +5 -0
  15. datahub/cli/ingest_cli.py +3 -15
  16. datahub/cli/migrate.py +2 -0
  17. datahub/cli/put_cli.py +1 -4
  18. datahub/cli/quickstart_versioning.py +50 -7
  19. datahub/cli/specific/assertions_cli.py +0 -4
  20. datahub/cli/specific/datacontract_cli.py +0 -3
  21. datahub/cli/specific/dataproduct_cli.py +0 -11
  22. datahub/cli/specific/dataset_cli.py +1 -8
  23. datahub/cli/specific/forms_cli.py +0 -4
  24. datahub/cli/specific/group_cli.py +0 -2
  25. datahub/cli/specific/structuredproperties_cli.py +1 -4
  26. datahub/cli/specific/user_cli.py +0 -2
  27. datahub/cli/state_cli.py +0 -2
  28. datahub/cli/timeline_cli.py +0 -2
  29. datahub/emitter/rest_emitter.py +41 -8
  30. datahub/entrypoints.py +4 -3
  31. datahub/ingestion/api/decorators.py +15 -3
  32. datahub/ingestion/api/report.py +332 -3
  33. datahub/ingestion/api/sink.py +3 -0
  34. datahub/ingestion/api/source.py +47 -45
  35. datahub/ingestion/autogenerated/__init__.py +0 -0
  36. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  37. datahub/ingestion/autogenerated/lineage.json +401 -0
  38. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  39. datahub/ingestion/extractor/schema_util.py +13 -4
  40. datahub/ingestion/graph/client.py +73 -30
  41. datahub/ingestion/run/pipeline.py +54 -2
  42. datahub/ingestion/sink/datahub_rest.py +12 -0
  43. datahub/ingestion/source/abs/source.py +1 -1
  44. datahub/ingestion/source/aws/glue.py +1 -1
  45. datahub/ingestion/source/azure/azure_common.py +2 -2
  46. datahub/ingestion/source/bigquery_v2/bigquery.py +49 -23
  47. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  48. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  49. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  50. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  51. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  52. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  53. datahub/ingestion/source/common/subtypes.py +45 -0
  54. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  55. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  56. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  57. datahub/ingestion/source/dbt/dbt_cloud.py +7 -2
  58. datahub/ingestion/source/dbt/dbt_common.py +3 -1
  59. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  60. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  61. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  62. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  63. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  64. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  65. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  66. datahub/ingestion/source/ge_data_profiler.py +76 -28
  67. datahub/ingestion/source/hex/api.py +26 -1
  68. datahub/ingestion/source/identity/azure_ad.py +1 -1
  69. datahub/ingestion/source/identity/okta.py +1 -14
  70. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  71. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  72. datahub/ingestion/source/mlflow.py +11 -1
  73. datahub/ingestion/source/mock_data/__init__.py +0 -0
  74. datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
  75. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  76. datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
  77. datahub/ingestion/source/powerbi/powerbi.py +0 -5
  78. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  79. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  80. datahub/ingestion/source/preset.py +2 -2
  81. datahub/ingestion/source/redshift/redshift.py +17 -0
  82. datahub/ingestion/source/redshift/usage.py +4 -3
  83. datahub/ingestion/source/s3/report.py +4 -2
  84. datahub/ingestion/source/s3/source.py +367 -115
  85. datahub/ingestion/source/salesforce.py +6 -3
  86. datahub/ingestion/source/sigma/sigma.py +6 -1
  87. datahub/ingestion/source/slack/slack.py +2 -1
  88. datahub/ingestion/source/snowflake/snowflake_config.py +27 -1
  89. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  90. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  91. datahub/ingestion/source/snowflake/snowflake_v2.py +14 -2
  92. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  93. datahub/ingestion/source/sql/athena.py +119 -12
  94. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  95. datahub/ingestion/source/sql/hive_metastore.py +0 -10
  96. datahub/ingestion/source/sql/mssql/source.py +24 -15
  97. datahub/ingestion/source/sql/oracle.py +1 -1
  98. datahub/ingestion/source/sql/sql_common.py +11 -0
  99. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  100. datahub/ingestion/source/sql/teradata.py +997 -235
  101. datahub/ingestion/source/sql/vertica.py +10 -6
  102. datahub/ingestion/source/sql_queries.py +2 -2
  103. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  104. datahub/ingestion/source/superset.py +57 -2
  105. datahub/ingestion/source/tableau/tableau.py +57 -37
  106. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  107. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  108. datahub/ingestion/source/unity/proxy.py +4 -3
  109. datahub/ingestion/source/unity/source.py +56 -30
  110. datahub/ingestion/source/usage/clickhouse_usage.py +1 -0
  111. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  112. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  113. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  114. datahub/metadata/_internal_schema_classes.py +1253 -536
  115. datahub/metadata/_urns/urn_defs.py +1797 -1685
  116. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  117. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  118. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  119. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
  120. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  121. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  122. datahub/metadata/schema.avsc +16614 -16538
  123. datahub/metadata/schemas/ContainerProperties.avsc +2 -0
  124. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  125. datahub/metadata/schemas/DataFlowInfo.avsc +2 -0
  126. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  127. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
  128. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  129. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  130. datahub/metadata/schemas/DataJobInfo.avsc +2 -0
  131. datahub/metadata/schemas/DataProcessKey.avsc +2 -0
  132. datahub/metadata/schemas/DatasetKey.avsc +4 -1
  133. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  134. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +2 -0
  135. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  136. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -0
  137. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -0
  138. datahub/metadata/schemas/MLModelKey.avsc +2 -0
  139. datahub/metadata/schemas/MetadataChangeEvent.avsc +2 -0
  140. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  141. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  142. datahub/sdk/datajob.py +39 -15
  143. datahub/sdk/lineage_client.py +2 -0
  144. datahub/sdk/main_client.py +14 -2
  145. datahub/sdk/search_client.py +4 -3
  146. datahub/specific/dataproduct.py +4 -0
  147. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  148. datahub/sql_parsing/sqlglot_lineage.py +40 -13
  149. datahub/telemetry/telemetry.py +17 -11
  150. datahub/upgrade/upgrade.py +46 -13
  151. datahub/utilities/server_config_util.py +8 -0
  152. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  153. datahub/utilities/stats_collections.py +4 -0
  154. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/WHEEL +0 -0
  155. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/licenses/LICENSE +0 -0
  156. {acryl_datahub-1.1.0.4rc2.dist-info → acryl_datahub-1.1.0.5.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/vertica.py
@@ -25,6 +25,10 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.data_reader import DataReader
+from datahub.ingestion.source.common.subtypes import (
+    DatasetSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     SqlWorkUnit,
@@ -41,7 +45,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
-    ChangeTypeClass,
     DatasetLineageTypeClass,
     DatasetPropertiesClass,
     SubTypesClass,
@@ -113,10 +116,14 @@ class VerticaConfig(BasicSQLAlchemyConfig):
 @capability(
     SourceCapability.LINEAGE_COARSE,
     "Enabled by default, can be disabled via configuration `include_view_lineage` and `include_projection_lineage`",
+    subtype_modifier=[
+        SourceCapabilityModifier.VIEW,
+        SourceCapabilityModifier.PROJECTIONS,
+    ],
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class VerticaSource(SQLAlchemySource):
@@ -493,11 +500,8 @@ class VerticaSource(SQLAlchemySource):
         if dpi_aspect:
             yield dpi_aspect
         yield MetadataChangeProposalWrapper(
-            entityType="dataset",
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=dataset_urn,
-            aspectName="subTypes",
-            aspect=SubTypesClass(typeNames=["Projections"]),
+            aspect=SubTypesClass(typeNames=[DatasetSubTypes.PROJECTIONS]),
         ).as_workunit()

         if self.config.domain:
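The dropped keyword arguments lean on MetadataChangeProposalWrapper inferring the entity type from the URN, the aspect name from the aspect class, and defaulting the change type to UPSERT. A minimal sketch of the same pattern outside the source (the dataset URN below is a made-up example):

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import SubTypesClass

# entityType, changeType, and aspectName are all inferred by the wrapper.
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:vertica,public.my_projection,PROD)",
    aspect=SubTypesClass(typeNames=["Projections"]),
)
workunit = mcp.as_workunit()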
datahub/ingestion/source/sql_queries.py
@@ -66,7 +66,7 @@ class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
         description="The default schema to use for unqualified table names",
         default=None,
     )
-    default_dialect: Optional[str] = Field(
+    override_dialect: Optional[str] = Field(
         description="The SQL dialect to use when parsing queries. Overrides automatic dialect detection.",
         default=None,
     )
@@ -181,7 +181,7 @@ class SqlQueriesSource(Source):
             schema_resolver=self.schema_resolver,
             default_db=self.config.default_db,
             default_schema=self.config.default_schema,
-            default_dialect=self.config.default_dialect,
+            override_dialect=self.config.override_dialect,
         )
         if result.debug_info.table_error:
             logger.info(f"Error parsing table lineage, {result.debug_info.table_error}")
datahub/ingestion/source/state/stateful_ingestion_base.py
@@ -179,7 +179,7 @@ class StatefulIngestionReport(SourceReport):

 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 class StatefulIngestionSourceBase(Source):
datahub/ingestion/source/superset.py
@@ -272,7 +272,7 @@ def get_filter_name(filter_obj):
 @config_class(SupersetConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(
-    SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 @capability(SourceCapability.DOMAINS, "Enabled by `domain` config to assign domain_key")
 @capability(SourceCapability.LINEAGE_COARSE, "Supported by default")
@@ -658,6 +658,7 @@ class SupersetSource(StatefulIngestionSourceBase):
         if datasource_id:
             dataset_info = self.get_dataset_info(datasource_id).get("result", {})
             dataset_column_info = dataset_info.get("columns", [])
+            dataset_metric_info = dataset_info.get("metrics", [])

             for column in dataset_column_info:
                 col_name = column.get("column_name", "")
@@ -671,6 +672,17 @@ class SupersetSource(StatefulIngestionSourceBase):
                     continue

                 dataset_columns.append((col_name, col_type, col_description))
+
+            for metric in dataset_metric_info:
+                metric_name = metric.get("metric_name", "")
+                metric_type = metric.get("metric_type", "")
+                metric_description = metric.get("description", "")
+
+                if metric_name == "" or metric_type == "":
+                    logger.info(f"could not construct metric lineage for {metric}")
+                    continue
+
+                dataset_columns.append((metric_name, metric_type, metric_description))
         else:
             # if no datasource id, cannot build cll, just return
             logger.warning(
@@ -972,19 +984,44 @@ class SupersetSource(StatefulIngestionSourceBase):
             schema_fields.append(field)
         return schema_fields

+    def gen_metric_schema_fields(
+        self, metric_data: List[Dict[str, Any]]
+    ) -> List[SchemaField]:
+        schema_fields: List[SchemaField] = []
+        for metric in metric_data:
+            metric_type = metric.get("metric_type", "")
+            data_type = resolve_sql_type(metric_type)
+            if data_type is None:
+                data_type = NullType()
+
+            field = SchemaField(
+                fieldPath=metric.get("metric_name", ""),
+                type=SchemaFieldDataType(data_type),
+                nativeDataType=metric_type or "",
+                description=metric.get("description", ""),
+                nullable=True,
+            )
+            schema_fields.append(field)
+        return schema_fields
+
     def gen_schema_metadata(
         self,
         dataset_response: dict,
     ) -> SchemaMetadata:
         dataset_response = dataset_response.get("result", {})
         column_data = dataset_response.get("columns", [])
+        metric_data = dataset_response.get("metrics", [])
+
+        column_fields = self.gen_schema_fields(column_data)
+        metric_fields = self.gen_metric_schema_fields(metric_data)
+
         schema_metadata = SchemaMetadata(
             schemaName=dataset_response.get("table_name", ""),
             platform=make_data_platform_urn(self.platform),
             version=0,
             hash="",
             platformSchema=MySqlDDL(tableSchema=""),
-            fields=self.gen_schema_fields(column_data),
+            fields=column_fields + metric_fields,
         )
         return schema_metadata

@@ -1049,6 +1086,8 @@ class SupersetSource(StatefulIngestionSourceBase):
         # To generate column level lineage, we can manually decode the metadata
         # to produce the ColumnLineageInfo
         columns = dataset_response.get("result", {}).get("columns", [])
+        metrics = dataset_response.get("result", {}).get("metrics", [])
+
         fine_grained_lineages: List[FineGrainedLineageClass] = []

         for column in columns:
@@ -1067,6 +1106,22 @@ class SupersetSource(StatefulIngestionSourceBase):
                 )
             )

+        for metric in metrics:
+            metric_name = metric.get("metric_name", "")
+            if not metric_name:
+                continue
+
+            downstream = [make_schema_field_urn(datasource_urn, metric_name)]
+            upstreams = [make_schema_field_urn(upstream_dataset, metric_name)]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
+
         upstream_lineage = UpstreamLineageClass(
             upstreams=[
                 UpstreamClass(
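The metric handling above reads the same Superset dataset REST payload that the column logic already uses; judging by the keys accessed (metric_name, metric_type, description), each entry under result.metrics needs at least those fields to contribute a schema field and a lineage edge. A hypothetical response fragment, shown only to illustrate the expected shape:

# Hypothetical Superset dataset API payload fragment; only the keys the
# connector reads are shown, and the values are made up.
dataset_response = {
    "result": {
        "table_name": "orders",
        "columns": [
            {"column_name": "order_id", "type": "BIGINT", "description": "Primary key"},
        ],
        "metrics": [
            {
                "metric_name": "total_revenue",
                "metric_type": "DOUBLE",
                "description": "Sum of order amounts",
            },
        ],
    }
}

# gen_schema_metadata would then emit order_id via gen_schema_fields and
# total_revenue via the new gen_metric_schema_fields, concatenated together.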
datahub/ingestion/source/tableau/tableau.py
@@ -80,6 +80,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import (
     BIContainerSubTypes,
     DatasetSubTypes,
+    SourceCapabilityModifier,
 )
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -148,7 +149,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
 )
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
-    ChangeTypeClass,
     ChartInfoClass,
     ChartUsageStatisticsClass,
     DashboardInfoClass,
@@ -528,6 +528,14 @@ class TableauConfig(
         default=False,
         description="Ingest details for tables external to (not embedded in) tableau as entities.",
     )
+    emit_all_published_datasources: bool = Field(
+        default=False,
+        description="Ingest all published data sources. When False (default), only ingest published data sources that belong to an ingested workbook.",
+    )
+    emit_all_embedded_datasources: bool = Field(
+        default=False,
+        description="Ingest all embedded data sources. When False (default), only ingest embedded data sources that belong to an ingested workbook.",
+    )

     env: str = Field(
         default=builder.DEFAULT_ENV,
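These two flags are consulted later in emit_published_datasources and emit_embedded_datasources: when either is set, the corresponding GraphQL query drops its ID filter and returns every data source on the site rather than only those referenced by ingested workbooks. A hedged source-config fragment for a recipe; the connection values are placeholders:

# Illustrative Tableau source config; connection details are placeholders.
tableau_source_config = {
    "type": "tableau",
    "config": {
        "connect_uri": "https://tableau.example.com",
        "site": "my_site",
        "token_name": "my_token",
        "token_value": "${TABLEAU_TOKEN}",
        # New in this release: ingest data sources even when their workbook
        # is not part of the ingestion run.
        "emit_all_published_datasources": True,
        "emit_all_embedded_datasources": True,
    },
}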
@@ -861,16 +869,29 @@ def report_user_role(report: TableauSourceReport, server: Server) -> None:
 @platform_name("Tableau")
 @config_class(TableauConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.TABLEAU_PROJECT,
+        SourceCapabilityModifier.TABLEAU_SITE,
+        SourceCapabilityModifier.TABLEAU_WORKBOOK,
+    ],
+)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Requires transformer", supported=False)
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 @capability(
     SourceCapability.USAGE_STATS,
     "Dashboard/Chart view counts, enabled using extract_usage_stats config",
+    subtype_modifier=[
+        SourceCapabilityModifier.DASHBOARD,
+        SourceCapabilityModifier.CHART,
+    ],
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Enabled by default when stateful ingestion is turned on.",
+    "Enabled by default via stateful ingestion.",
 )
 @capability(SourceCapability.OWNERSHIP, "Requires recipe configuration")
 @capability(SourceCapability.TAGS, "Requires recipe configuration")
@@ -2175,32 +2196,32 @@ class TableauSiteSource:
                 else []
             )

-            # The Tableau SQL parser much worse than our sqlglot based parser,
-            # so relying on metadata parsed by Tableau from SQL queries can be
-            # less accurate. This option allows us to ignore Tableau's parser and
-            # only use our own.
-            if self.config.force_extraction_of_lineage_from_custom_sql_queries:
-                logger.debug("Extracting TLL & CLL from custom sql (forced)")
+            tableau_table_list = csql.get(c.TABLES, [])
+            if self.config.force_extraction_of_lineage_from_custom_sql_queries or (
+                not tableau_table_list
+                and self.config.extract_lineage_from_unsupported_custom_sql_queries
+            ):
+                if not tableau_table_list:
+                    # custom sql tables may contain unsupported sql, causing incomplete lineage
+                    # we extract the lineage from the raw queries
+                    logger.debug(
+                        "Parsing TLL & CLL from custom sql (tableau metadata incomplete)"
+                    )
+                else:
+                    # The Tableau SQL parser is much worse than our sqlglot based parser,
+                    # so relying on metadata parsed by Tableau from SQL queries can be
+                    # less accurate. This option allows us to ignore Tableau's parser and
+                    # only use our own.
+                    logger.debug("Parsing TLL & CLL from custom sql (forced)")
+
                 yield from self._create_lineage_from_unsupported_csql(
                     csql_urn, csql, columns
                 )
             else:
-                tables = csql.get(c.TABLES, [])
-
-                if tables:
-                    # lineage from custom sql -> datasets/tables #
-                    yield from self._create_lineage_to_upstream_tables(
-                        csql_urn, tables, datasource
-                    )
-                elif (
-                    self.config.extract_lineage_from_unsupported_custom_sql_queries
-                ):
-                    logger.debug("Extracting TLL & CLL from custom sql")
-                    # custom sql tables may contain unsupported sql, causing incomplete lineage
-                    # we extract the lineage from the raw queries
-                    yield from self._create_lineage_from_unsupported_csql(
-                        csql_urn, csql, columns
-                    )
+                # lineage from custom sql -> datasets/tables #
+                yield from self._create_lineage_to_upstream_tables(
+                    csql_urn, tableau_table_list, datasource
+                )

             # Schema Metadata
             schema_metadata = self.get_schema_metadata_for_custom_sql(columns)
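The restructured branch boils down to: parse the raw query with DataHub's own parser when forced, or when Tableau returned no upstream tables and unsupported-SQL extraction is enabled; otherwise keep the table list Tableau resolved. A simplified sketch of that decision, using a stand-in helper name rather than the real methods:

def choose_custom_sql_lineage_strategy(
    force_sql_parsing: bool,
    extract_unsupported: bool,
    tableau_table_list: list,
) -> str:
    """Stand-in helper mirroring the branch above; returns which path is taken."""
    if force_sql_parsing or (not tableau_table_list and extract_unsupported):
        # parse the raw query with DataHub's sqlglot-based parser
        return "parse_raw_sql"
    # trust the tables Tableau resolved from the custom SQL
    return "lineage_from_tableau_tables"

assert choose_custom_sql_lineage_strategy(True, False, ["t1"]) == "parse_raw_sql"
assert choose_custom_sql_lineage_strategy(False, True, []) == "parse_raw_sql"
assert choose_custom_sql_lineage_strategy(False, True, ["t1"]) == "lineage_from_tableau_tables"
assert choose_custom_sql_lineage_strategy(False, False, []) == "lineage_from_tableau_tables"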
@@ -2238,7 +2259,6 @@ class TableauSiteSource:
         yield self.get_metadata_change_event(dataset_snapshot)
         yield self.get_metadata_change_proposal(
             dataset_snapshot.urn,
-            aspect_name=c.SUB_TYPES,
             aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW, c.CUSTOM_SQL]),
         )

@@ -2403,7 +2423,6 @@ class TableauSiteSource:
         upstream_lineage = UpstreamLineage(upstreams=upstream_tables)
         yield self.get_metadata_change_proposal(
             csql_urn,
-            aspect_name=c.UPSTREAM_LINEAGE,
             aspect=upstream_lineage,
         )
         self.report.num_tables_with_upstream_lineage += 1
@@ -2589,7 +2608,6 @@ class TableauSiteSource:
         )
         yield self.get_metadata_change_proposal(
             csql_urn,
-            aspect_name=c.UPSTREAM_LINEAGE,
             aspect=upstream_lineage,
         )
         self.report.num_tables_with_upstream_lineage += 1
@@ -2635,14 +2653,10 @@ class TableauSiteSource:
     def get_metadata_change_proposal(
         self,
         urn: str,
-        aspect_name: str,
         aspect: Union["UpstreamLineage", "SubTypesClass"],
     ) -> MetadataWorkUnit:
         return MetadataChangeProposalWrapper(
-            entityType=c.DATASET,
-            changeType=ChangeTypeClass.UPSERT,
             entityUrn=urn,
-            aspectName=aspect_name,
             aspect=aspect,
         ).as_workunit()

@@ -2750,7 +2764,6 @@ class TableauSiteSource:
         )
         yield self.get_metadata_change_proposal(
             datasource_urn,
-            aspect_name=c.UPSTREAM_LINEAGE,
             aspect=upstream_lineage,
         )
         self.report.num_tables_with_upstream_lineage += 1
@@ -2769,7 +2782,6 @@ class TableauSiteSource:
         yield self.get_metadata_change_event(dataset_snapshot)
         yield self.get_metadata_change_proposal(
             dataset_snapshot.urn,
-            aspect_name=c.SUB_TYPES,
             aspect=SubTypesClass(
                 typeNames=(
                     ["Embedded Data Source"]
@@ -2855,7 +2867,11 @@ class TableauSiteSource:
         return datasource

     def emit_published_datasources(self) -> Iterable[MetadataWorkUnit]:
-        datasource_filter = {c.ID_WITH_IN: self.datasource_ids_being_used}
+        datasource_filter = (
+            {}
+            if self.config.emit_all_published_datasources
+            else {c.ID_WITH_IN: self.datasource_ids_being_used}
+        )

         for datasource in self.get_connection_objects(
             query=published_datasource_graphql_query,
@@ -3548,7 +3564,11 @@ class TableauSiteSource:
         return browse_paths

     def emit_embedded_datasources(self) -> Iterable[MetadataWorkUnit]:
-        datasource_filter = {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}
+        datasource_filter = (
+            {}
+            if self.config.emit_all_embedded_datasources
+            else {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}
+        )

         for datasource in self.get_connection_objects(
             query=embedded_datasource_graphql_query,
@@ -3660,7 +3680,7 @@ class TableauSiteSource:
             container_key=project_key,
             name=project_.name,
             description=project_.description,
-            sub_types=[c.PROJECT],
+            sub_types=[BIContainerSubTypes.TABLEAU_PROJECT],
             parent_container_key=parent_project_key,
         )

@@ -3678,7 +3698,7 @@ class TableauSiteSource:
         yield from gen_containers(
             container_key=self.gen_site_key(self.site_id),
             name=self.site.name or "Default",
-            sub_types=[c.SITE],
+            sub_types=[BIContainerSubTypes.TABLEAU_SITE],
         )

     def _fetch_groups(self):
datahub/ingestion/source/tableau/tableau_common.py
@@ -579,10 +579,12 @@ def get_platform(connection_type: str) -> str:
         platform = "oracle"
     elif connection_type in ("tbio", "teradata"):
         platform = "teradata"
-    elif connection_type in ("sqlserver"):
+    elif connection_type in ("sqlserver",):
         platform = "mssql"
-    elif connection_type in ("athena"):
+    elif connection_type in ("athena",):
        platform = "athena"
+    elif connection_type in ("googlebigquery",):
+        platform = "bigquery"
     elif connection_type.endswith("_jdbc"):
         # e.g. convert trino_jdbc -> trino
         platform = connection_type[: -len("_jdbc")]
datahub/ingestion/source/tableau/tableau_constant.py
@@ -50,7 +50,6 @@ TABLES = "tables"
 DESCRIPTION = "description"
 SQL = "SQL"
 QUERY = "query"
-SUB_TYPES = "subTypes"
 VIEW = "view"
 CUSTOM_SQL = "Custom SQL"
 REMOTE_TYPE = "remoteType"
@@ -58,7 +57,6 @@ UNKNOWN = "UNKNOWN"
 PUBLISHED_DATA_SOURCE = "PublishedDatasource"
 LUID = "luid"
 EMBEDDED_DATA_SOURCE = "EmbeddedDatasource"
-UPSTREAM_LINEAGE = "upstreamLineage"
 OWNER = "owner"
 USERNAME = "username"
 HAS_EXTRACTS = "hasExtracts"
@@ -78,8 +76,6 @@ CHART = "chart"
 DASHBOARD = "dashboard"
 DASHBOARDS_CONNECTION = "dashboardsConnection"
 EMBEDDED_DATA_SOURCES_CONNECTION = "embeddedDatasourcesConnection"
-PROJECT = "Project"
-SITE = "Site"
 IS_UNSUPPORTED_CUSTOM_SQL = "isUnsupportedCustomSql"
 SITE_PERMISSION = "sitePermission"
 ROLE_SITE_ADMIN_EXPLORER = "SiteAdministratorExplorer"
datahub/ingestion/source/unity/proxy.py
@@ -507,9 +507,10 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
     def _execute_sql_query(self, query: str) -> List[List[str]]:
         """Execute SQL query using databricks-sql connector for better performance"""
         try:
-            with connect(
-                **self._sql_connection_params
-            ) as connection, connection.cursor() as cursor:
+            with (
+                connect(**self._sql_connection_params) as connection,
+                connection.cursor() as cursor,
+            ):
                 cursor.execute(query)
                 return cursor.fetchall()

datahub/ingestion/source/unity/source.py
@@ -56,6 +56,7 @@ from datahub.ingestion.source.aws.s3_util import (
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
     DatasetSubTypes,
+    SourceCapabilityModifier,
 )
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -152,14 +153,21 @@ logger: logging.Logger = logging.getLogger(__name__)
 @capability(SourceCapability.USAGE_STATS, "Enabled by default")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
-@capability(SourceCapability.CONTAINERS, "Enabled by default")
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.CATALOG,
+        SourceCapabilityModifier.SCHEMA,
+    ],
+)
 @capability(SourceCapability.OWNERSHIP, "Supported via the `include_ownership` config")
 @capability(
     SourceCapability.DATA_PROFILING, "Supported via the `profiling.enabled` config"
 )
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+    "Enabled by default via stateful ingestion",
     supported=True,
 )
 @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
@@ -768,10 +776,11 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):

     def gen_schema_containers(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
         domain_urn = self._gen_domain_urn(f"{schema.catalog.name}.{schema.name}")
-        schema_tags = self.unity_catalog_api_proxy.get_schema_tags(
-            schema.catalog.name
-        ).get(f"{schema.catalog.name}.{schema.name}", [])
-        if schema_tags:
+        schema_tags = []
+        if self.config.include_tags:
+            schema_tags = self.unity_catalog_api_proxy.get_schema_tags(
+                schema.catalog.name
+            ).get(f"{schema.catalog.name}.{schema.name}", [])
         logger.debug(f"Schema tags for {schema.name}: {schema_tags}")
         # Generate platform resources for schema tags
         yield from self.gen_platform_resources(schema_tags)
@@ -809,10 +818,11 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):

     def gen_catalog_containers(self, catalog: Catalog) -> Iterable[MetadataWorkUnit]:
         domain_urn = self._gen_domain_urn(catalog.name)
-        catalog_tags = self.unity_catalog_api_proxy.get_catalog_tags(catalog.name).get(
-            catalog.name, []
-        )
-        if catalog_tags:
+        catalog_tags = []
+        if self.config.include_tags:
+            catalog_tags = self.unity_catalog_api_proxy.get_catalog_tags(
+                catalog.name
+            ).get(catalog.name, [])
         logger.debug(f"Schema tags for {catalog.name}: {catalog_tags}")
         # Generate platform resources for schema tags
         yield from self.gen_platform_resources(catalog_tags)
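Both container generators now check include_tags before touching the tag APIs, so turning tag extraction off also avoids those extra requests. A hedged source-config fragment; the source type name and workspace values are illustrative:

# Illustrative Unity Catalog source config; workspace values are placeholders.
unity_catalog_source_config = {
    "type": "unity-catalog",
    "config": {
        "workspace_url": "https://my-workspace.cloud.databricks.com",
        "token": "${DATABRICKS_TOKEN}",
        # With include_tags disabled, the catalog/schema tag lookups above are
        # skipped entirely and empty tag lists are passed downstream.
        "include_tags": False,
    },
}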
@@ -1020,29 +1030,45 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
     ) -> Iterable[MetadataWorkUnit]:
         if self.ctx.graph and self.platform_resource_repository:
             for tag in tags:
-                platform_resource_id = UnityCatalogTagPlatformResourceId.from_tag(
-                    platform_instance=self.platform_instance_name,
-                    platform_resource_repository=self.platform_resource_repository,
-                    tag=tag,
-                )
-                logger.debug(f"Created platform resource {platform_resource_id}")
+                try:
+                    platform_resource_id = UnityCatalogTagPlatformResourceId.from_tag(
+                        platform_instance=self.platform_instance_name,
+                        platform_resource_repository=self.platform_resource_repository,
+                        tag=tag,
+                    )
+                    logger.debug(f"Created platform resource {platform_resource_id}")

-                unity_catalog_tag = UnityCatalogTagPlatformResource.get_from_datahub(
-                    platform_resource_id, self.platform_resource_repository, False
-                )
-                if (
-                    tag.to_datahub_tag_urn().urn()
-                    not in unity_catalog_tag.datahub_linked_resources().urns
-                ):
-                    unity_catalog_tag.datahub_linked_resources().add(
-                        tag.to_datahub_tag_urn().urn()
+                    unity_catalog_tag = (
+                        UnityCatalogTagPlatformResource.get_from_datahub(
+                            platform_resource_id,
+                            self.platform_resource_repository,
+                            False,
+                        )
                     )
-                platform_resource = unity_catalog_tag.as_platform_resource()
-                for mcp in platform_resource.to_mcps():
-                    yield MetadataWorkUnit(
-                        id=f"platform_resource-{platform_resource.id}",
-                        mcp=mcp,
+                    if (
+                        tag.to_datahub_tag_urn().urn()
+                        not in unity_catalog_tag.datahub_linked_resources().urns
+                    ):
+                        unity_catalog_tag.datahub_linked_resources().add(
+                            tag.to_datahub_tag_urn().urn()
                         )
+                    platform_resource = unity_catalog_tag.as_platform_resource()
+                    for mcp in platform_resource.to_mcps():
+                        yield MetadataWorkUnit(
+                            id=f"platform_resource-{platform_resource.id}",
+                            mcp=mcp,
+                        )
+                except Exception as e:
+                    logger.exception(
+                        f"Error processing platform resource for tag {tag}"
+                    )
+                    self.report.report_warning(
+                        message="Error processing platform resource for tag",
+                        context=str(tag),
+                        title="Error processing platform resource for tag",
+                        exc=e,
+                    )
+                    continue

     def _create_schema_metadata_aspect(
         self, table: Table
datahub/ingestion/source/usage/clickhouse_usage.py
@@ -89,6 +89,7 @@ class ClickHouseUsageConfig(ClickHouseConfig, BaseUsageConfig, EnvConfigMixin):
     SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
 )
 @capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration")
+@capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
 @dataclasses.dataclass
 class ClickHouseUsageSource(Source):
     """
datahub/ingestion/source/usage/starburst_trino_usage.py
@@ -15,7 +15,9 @@ from sqlalchemy.engine import Engine
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.time_window_config import get_time_bucket
 from datahub.ingestion.api.decorators import (
+    SourceCapability,
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
@@ -112,6 +114,7 @@ class TrinoUsageReport(SourceReport):
 @platform_name("Trino")
 @config_class(TrinoUsageConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.USAGE_STATS, "Enabled by default to get usage stats")
 @dataclasses.dataclass
 class TrinoUsageSource(Source):
     """
datahub/ingestion/transformer/add_dataset_ownership.py
@@ -71,8 +71,24 @@ class AddDatasetOwnership(OwnershipTransformer):

         server_ownership = graph.get_ownership(entity_urn=urn)
         if server_ownership:
-            owners = {owner.owner: owner for owner in server_ownership.owners}
-            owners.update({owner.owner: owner for owner in mce_ownership.owners})
+            owners = {
+                (
+                    owner.owner,
+                    owner.type,
+                    owner.typeUrn,
+                ): owner
+                for owner in server_ownership.owners
+            }
+            owners.update(
+                {
+                    (
+                        owner.owner,
+                        owner.type,
+                        owner.typeUrn,
+                    ): owner
+                    for owner in mce_ownership.owners
+                }
+            )
             mce_ownership.owners = list(owners.values())

         return mce_ownership
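Keying the merge on (owner, type, typeUrn) instead of just the owner URN means the same principal can now hold several ownership types at once; previously the later entry would silently overwrite the earlier one. A small self-contained sketch of the new dedup behavior:

from datahub.metadata.schema_classes import OwnerClass, OwnershipTypeClass

server_owners = [
    OwnerClass(owner="urn:li:corpuser:alice", type=OwnershipTypeClass.TECHNICAL_OWNER)
]
mce_owners = [
    OwnerClass(owner="urn:li:corpuser:alice", type=OwnershipTypeClass.BUSINESS_OWNER)
]

# Keying on (owner, type, typeUrn) keeps both entries; the old owner-only key
# would have collapsed them into a single ownership record.
merged = {(o.owner, o.type, o.typeUrn): o for o in server_owners}
merged.update({(o.owner, o.type, o.typeUrn): o for o in mce_owners})
assert len(merged) == 2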
datahub/integrations/assertion/snowflake/compiler.py
@@ -84,9 +84,10 @@ class SnowflakeAssertionCompiler(AssertionCompiler):

         dmf_definitions_path = self.output_dir / DMF_DEFINITIONS_FILE_NAME
         dmf_associations_path = self.output_dir / DMF_ASSOCIATIONS_FILE_NAME
-        with (dmf_definitions_path).open("w") as definitions, (
-            dmf_associations_path
-        ).open("w") as associations:
+        with (
+            (dmf_definitions_path).open("w") as definitions,
+            (dmf_associations_path).open("w") as associations,
+        ):
             for assertion_spec in assertion_config_spec.assertions:
                 result.report.num_processed += 1
                 try: