acryl-datahub 1.1.0.5rc6__py3-none-any.whl → 1.1.0.5rc8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic. Click here for more details.

Files changed (78)
  1. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/METADATA +2515 -2517
  2. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/RECORD +78 -75
  3. datahub/_version.py +1 -1
  4. datahub/cli/check_cli.py +0 -7
  5. datahub/cli/cli_utils.py +73 -0
  6. datahub/cli/delete_cli.py +0 -6
  7. datahub/cli/docker_check.py +107 -12
  8. datahub/cli/docker_cli.py +148 -228
  9. datahub/cli/exists_cli.py +0 -4
  10. datahub/cli/get_cli.py +0 -4
  11. datahub/cli/ingest_cli.py +1 -20
  12. datahub/cli/put_cli.py +0 -6
  13. datahub/cli/quickstart_versioning.py +50 -5
  14. datahub/cli/specific/assertions_cli.py +0 -6
  15. datahub/cli/specific/datacontract_cli.py +0 -6
  16. datahub/cli/specific/dataproduct_cli.py +0 -22
  17. datahub/cli/specific/dataset_cli.py +0 -11
  18. datahub/cli/specific/forms_cli.py +0 -6
  19. datahub/cli/specific/group_cli.py +0 -4
  20. datahub/cli/specific/structuredproperties_cli.py +0 -7
  21. datahub/cli/specific/user_cli.py +0 -4
  22. datahub/cli/state_cli.py +0 -4
  23. datahub/cli/timeline_cli.py +0 -4
  24. datahub/entrypoints.py +4 -3
  25. datahub/ingestion/api/report.py +183 -35
  26. datahub/ingestion/autogenerated/capability_summary.json +3431 -0
  27. datahub/ingestion/autogenerated/lineage.json +401 -0
  28. datahub/ingestion/autogenerated/lineage_helper.py +30 -128
  29. datahub/ingestion/extractor/schema_util.py +13 -4
  30. datahub/ingestion/graph/client.py +2 -2
  31. datahub/ingestion/run/pipeline.py +47 -1
  32. datahub/ingestion/source/bigquery_v2/bigquery.py +32 -23
  33. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  34. datahub/ingestion/source/common/subtypes.py +1 -1
  35. datahub/ingestion/source/data_lake_common/object_store.py +40 -0
  36. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  37. datahub/ingestion/source/dremio/dremio_source.py +7 -7
  38. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  39. datahub/ingestion/source/ge_data_profiler.py +28 -20
  40. datahub/ingestion/source/identity/okta.py +0 -13
  41. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  42. datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
  43. datahub/ingestion/source/powerbi/powerbi.py +0 -5
  44. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  45. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  46. datahub/ingestion/source/redshift/usage.py +4 -3
  47. datahub/ingestion/source/s3/source.py +19 -3
  48. datahub/ingestion/source/sigma/sigma.py +6 -1
  49. datahub/ingestion/source/snowflake/snowflake_config.py +11 -0
  50. datahub/ingestion/source/snowflake/snowflake_queries.py +147 -61
  51. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  52. datahub/ingestion/source/snowflake/snowflake_v2.py +11 -1
  53. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  54. datahub/ingestion/source/sql/hive_metastore.py +0 -10
  55. datahub/ingestion/source/sql/sql_common.py +4 -0
  56. datahub/ingestion/source/sql/vertica.py +0 -4
  57. datahub/ingestion/source/sql_queries.py +2 -2
  58. datahub/ingestion/source/superset.py +56 -1
  59. datahub/ingestion/source/tableau/tableau.py +40 -34
  60. datahub/ingestion/source/tableau/tableau_constant.py +0 -2
  61. datahub/ingestion/source/unity/proxy.py +4 -3
  62. datahub/ingestion/source/unity/source.py +19 -9
  63. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  64. datahub/metadata/_internal_schema_classes.py +85 -4
  65. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
  66. datahub/metadata/schema.avsc +54 -1
  67. datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
  68. datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
  69. datahub/sdk/lineage_client.py +2 -0
  70. datahub/sql_parsing/sql_parsing_aggregator.py +24 -15
  71. datahub/sql_parsing/sqlglot_lineage.py +40 -13
  72. datahub/upgrade/upgrade.py +46 -13
  73. datahub/utilities/server_config_util.py +8 -0
  74. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  75. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/WHEEL +0 -0
  76. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/entry_points.txt +0 -0
  77. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/licenses/LICENSE +0 -0
  78. {acryl_datahub-1.1.0.5rc6.dist-info → acryl_datahub-1.1.0.5rc8.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,143 @@
1
+ import dataclasses
2
+ from dataclasses import dataclass
3
+ from datetime import datetime
4
+ from typing import Any, Iterable, List, Optional
5
+
6
+ from datahub.ingestion.api.closeable import Closeable
7
+ from datahub.metadata.urns import CorpUserUrn
8
+ from datahub.sql_parsing.sql_parsing_aggregator import (
9
+ PreparsedQuery,
10
+ UrnStr,
11
+ )
12
+ from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
13
+ from datahub.utilities.file_backed_collections import FileBackedDict
14
+
15
+
16
@dataclass
class StoredProcCall:
    """One stored procedure invocation to be tracked for lineage.

    ``snowflake_root_query_id`` is the key under which
    ``StoredProcLineageTracker`` stores the call and against which related
    queries are later matched.
    """

    snowflake_root_query_id: str

    # Text of the invoking statement; typically looks like:
    # "CALL SALES_FORECASTING.CUSTOMER_ANALYSIS_PROC();"
    query_text: str

    # Copied onto the lineage entries generated for this call.
    timestamp: datetime
    user: CorpUserUrn

    default_db: str
    default_schema: str
28
+
29
+
30
@dataclass
class StoredProcExecutionLineage:
    """Table-level lineage accumulated for one stored procedure call."""

    # The invocation this lineage belongs to.
    call: StoredProcCall

    # Upstream and downstream URNs collected from the queries attributed to
    # the call; both lists start empty and grow as related queries are added.
    inputs: List[UrnStr]
    outputs: List[UrnStr]
36
+
37
+
38
@dataclass
class StoredProcLineageReport:
    """Counters describing stored procedure lineage tracking; all start at 0."""

    # Incremented while calls and their related queries are registered.
    num_stored_proc_calls: int = 0
    num_related_queries: int = 0
    num_related_queries_without_proc_call: int = 0

    # Incremented at generation/build time.
    num_stored_proc_lineage_entries: int = 0
    num_stored_proc_calls_with_no_inputs: int = 0
    num_stored_proc_calls_with_no_outputs: int = 0
48
+
49
+
50
class StoredProcLineageTracker(Closeable):
    """
    Tracks table-level lineage for Snowflake stored procedures.

    Stored procedures in Snowflake trigger multiple SQL queries during execution.
    Snowflake assigns each stored procedure call a unique query_id and uses this as the
    root_query_id for all subsequent queries executed within that procedure. This allows
    us to trace which queries belong to a specific stored procedure execution and build
    table-level lineage by aggregating inputs/outputs from all related queries.

    A call must be registered via `add_stored_proc_call` before its queries are passed
    to `add_related_query`; queries whose root query id is not (yet) registered are
    counted in the report and rejected.
    """

    def __init__(self, platform: str, shared_connection: Optional[Any] = None):
        """Create an empty tracker.

        Args:
            platform: Platform name, forwarded to `get_query_fingerprint` when
                lineage entries are built.
            shared_connection: Passed through unchanged as the first argument of
                the backing `FileBackedDict`.
        """
        self.platform = platform
        self.report = StoredProcLineageReport()

        # { root_query_id -> StoredProcExecutionLineage }
        self._stored_proc_execution_lineage: FileBackedDict[
            StoredProcExecutionLineage
        ] = FileBackedDict(shared_connection, tablename="stored_proc_lineage")

    def add_stored_proc_call(self, call: StoredProcCall) -> None:
        """Add a stored procedure call to track.

        Registering the same root query id again replaces the stored entry with
        a fresh, empty one.
        """
        self._stored_proc_execution_lineage[call.snowflake_root_query_id] = (
            StoredProcExecutionLineage(
                call=call,
                # Will be populated by subsequent queries.
                inputs=[],
                outputs=[],
            )
        )
        self.report.num_stored_proc_calls += 1

    def add_related_query(self, query: PreparsedQuery) -> bool:
        """Add a query that might be related to a stored procedure execution.

        Returns True if the query was added to a stored procedure execution, False otherwise.
        """
        snowflake_root_query_id = (query.extra_info or {}).get(
            "snowflake_root_query_id"
        )

        # No root query id recorded: nothing to attribute this query to.
        if not snowflake_root_query_id:
            return False

        if snowflake_root_query_id not in self._stored_proc_execution_lineage:
            self.report.num_related_queries_without_proc_call += 1
            return False

        # for_mutation() so that the in-place list updates below are persisted
        # by the file-backed dict.
        stored_proc_execution = self._stored_proc_execution_lineage.for_mutation(
            snowflake_root_query_id
        )
        stored_proc_execution.inputs.extend(query.upstreams)
        if query.downstream is not None:
            stored_proc_execution.outputs.append(query.downstream)
        self.report.num_related_queries += 1
        return True

    def build_merged_lineage_entries(self) -> Iterable[PreparsedQuery]:
        """Yield one lineage entry per distinct output table of each tracked call.

        Calls with no collected inputs, or with no collected outputs, produce no
        entries and are only counted in the report. The same table may have been
        recorded several times (one record per related query), so inputs and
        outputs are de-duplicated, preserving first-seen order, before entries
        are built.
        """
        # For stored procedures, we can only get table-level lineage from the audit log.
        # We represent these as PreparsedQuery objects for now. Eventually we'll want to
        # create dataJobInputOutput lineage instead.

        for stored_proc_execution in self._stored_proc_execution_lineage.values():
            if not stored_proc_execution.inputs:
                self.report.num_stored_proc_calls_with_no_inputs += 1
                continue

            if not stored_proc_execution.outputs:
                # Without a downstream there is nothing to attach lineage to.
                self.report.num_stored_proc_calls_with_no_outputs += 1
                continue

            # dict.fromkeys() drops repeats while keeping insertion order
            # (assumes URNs are hashable strings, as UrnStr suggests).
            upstreams = list(dict.fromkeys(stored_proc_execution.inputs))

            for downstream in dict.fromkeys(stored_proc_execution.outputs):
                # The downstream is mixed into the fingerprint so that each
                # output table of the same CALL statement gets its own id.
                stored_proc_query_id = get_query_fingerprint(
                    stored_proc_execution.call.query_text,
                    self.platform,
                    fast=True,
                    secondary_id=downstream,
                )

                lineage_entry = PreparsedQuery(
                    query_id=stored_proc_query_id,
                    query_text=stored_proc_execution.call.query_text,
                    upstreams=upstreams,
                    downstream=downstream,
                    query_count=0,
                    user=stored_proc_execution.call.user,
                    timestamp=stored_proc_execution.call.timestamp,
                )

                self.report.num_stored_proc_lineage_entries += 1
                yield lineage_entry

    def close(self) -> None:
        """Close the backing file-backed dict."""
        self._stored_proc_execution_lineage.close()
@@ -52,7 +52,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import Dataset
52
52
  from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
53
53
  from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
54
54
  from datahub.metadata.schema_classes import (
55
- ChangeTypeClass,
56
55
  DatasetPropertiesClass,
57
56
  SubTypesClass,
58
57
  ViewPropertiesClass,
@@ -601,10 +600,7 @@ class HiveMetastoreSource(SQLAlchemySource):
601
600
  yield dpi_aspect
602
601
 
603
602
  yield MetadataChangeProposalWrapper(
604
- entityType="dataset",
605
- changeType=ChangeTypeClass.UPSERT,
606
603
  entityUrn=dataset_urn,
607
- aspectName="subTypes",
608
604
  aspect=SubTypesClass(typeNames=[self.table_subtype]),
609
605
  ).as_workunit()
610
606
 
@@ -810,10 +806,7 @@ class HiveMetastoreSource(SQLAlchemySource):
810
806
 
811
807
  # Add views subtype
812
808
  yield MetadataChangeProposalWrapper(
813
- entityType="dataset",
814
- changeType=ChangeTypeClass.UPSERT,
815
809
  entityUrn=dataset_urn,
816
- aspectName="subTypes",
817
810
  aspect=SubTypesClass(typeNames=[self.view_subtype]),
818
811
  ).as_workunit()
819
812
 
@@ -824,10 +817,7 @@ class HiveMetastoreSource(SQLAlchemySource):
824
817
  viewLogic=dataset.view_definition if dataset.view_definition else "",
825
818
  )
826
819
  yield MetadataChangeProposalWrapper(
827
- entityType="dataset",
828
- changeType=ChangeTypeClass.UPSERT,
829
820
  entityUrn=dataset_urn,
830
- aspectName="viewProperties",
831
821
  aspect=view_properties_aspect,
832
822
  ).as_workunit()
833
823
 
@@ -292,6 +292,10 @@ class ProfileMetadata:
292
292
  SourceCapability.CONTAINERS,
293
293
  "Enabled by default",
294
294
  supported=True,
295
+ subtype_modifier=[
296
+ SourceCapabilityModifier.DATABASE,
297
+ SourceCapabilityModifier.SCHEMA,
298
+ ],
295
299
  )
296
300
  @capability(
297
301
  SourceCapability.DESCRIPTIONS,
@@ -45,7 +45,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
45
45
  from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
46
46
  from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
47
47
  from datahub.metadata.schema_classes import (
48
- ChangeTypeClass,
49
48
  DatasetLineageTypeClass,
50
49
  DatasetPropertiesClass,
51
50
  SubTypesClass,
@@ -501,10 +500,7 @@ class VerticaSource(SQLAlchemySource):
501
500
  if dpi_aspect:
502
501
  yield dpi_aspect
503
502
  yield MetadataChangeProposalWrapper(
504
- entityType="dataset",
505
- changeType=ChangeTypeClass.UPSERT,
506
503
  entityUrn=dataset_urn,
507
- aspectName="subTypes",
508
504
  aspect=SubTypesClass(typeNames=[DatasetSubTypes.PROJECTIONS]),
509
505
  ).as_workunit()
510
506
 
@@ -66,7 +66,7 @@ class SqlQueriesSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
66
66
  description="The default schema to use for unqualified table names",
67
67
  default=None,
68
68
  )
69
- default_dialect: Optional[str] = Field(
69
+ override_dialect: Optional[str] = Field(
70
70
  description="The SQL dialect to use when parsing queries. Overrides automatic dialect detection.",
71
71
  default=None,
72
72
  )
@@ -181,7 +181,7 @@ class SqlQueriesSource(Source):
181
181
  schema_resolver=self.schema_resolver,
182
182
  default_db=self.config.default_db,
183
183
  default_schema=self.config.default_schema,
184
- default_dialect=self.config.default_dialect,
184
+ override_dialect=self.config.override_dialect,
185
185
  )
186
186
  if result.debug_info.table_error:
187
187
  logger.info(f"Error parsing table lineage, {result.debug_info.table_error}")
@@ -658,6 +658,7 @@ class SupersetSource(StatefulIngestionSourceBase):
658
658
  if datasource_id:
659
659
  dataset_info = self.get_dataset_info(datasource_id).get("result", {})
660
660
  dataset_column_info = dataset_info.get("columns", [])
661
+ dataset_metric_info = dataset_info.get("metrics", [])
661
662
 
662
663
  for column in dataset_column_info:
663
664
  col_name = column.get("column_name", "")
@@ -671,6 +672,17 @@ class SupersetSource(StatefulIngestionSourceBase):
671
672
  continue
672
673
 
673
674
  dataset_columns.append((col_name, col_type, col_description))
675
+
676
+ for metric in dataset_metric_info:
677
+ metric_name = metric.get("metric_name", "")
678
+ metric_type = metric.get("metric_type", "")
679
+ metric_description = metric.get("description", "")
680
+
681
+ if metric_name == "" or metric_type == "":
682
+ logger.info(f"could not construct metric lineage for {metric}")
683
+ continue
684
+
685
+ dataset_columns.append((metric_name, metric_type, metric_description))
674
686
  else:
675
687
  # if no datasource id, cannot build cll, just return
676
688
  logger.warning(
@@ -972,19 +984,44 @@ class SupersetSource(StatefulIngestionSourceBase):
972
984
  schema_fields.append(field)
973
985
  return schema_fields
974
986
 
987
def gen_metric_schema_fields(
    self, metric_data: List[Dict[str, Any]]
) -> List[SchemaField]:
    """Build one nullable SchemaField per entry of ``metric_data``.

    Each entry is read through its "metric_name", "metric_type" and
    "description" keys. A metric type that `resolve_sql_type` cannot resolve
    is represented as `NullType`.
    """

    def _as_schema_field(entry: Dict[str, Any]) -> SchemaField:
        native_type = entry.get("metric_type", "")
        resolved_type = resolve_sql_type(native_type)
        return SchemaField(
            fieldPath=entry.get("metric_name", ""),
            type=SchemaFieldDataType(
                NullType() if resolved_type is None else resolved_type
            ),
            nativeDataType=native_type or "",
            description=entry.get("description", ""),
            nullable=True,
        )

    return [_as_schema_field(entry) for entry in metric_data]
1006
+
975
1007
  def gen_schema_metadata(
976
1008
  self,
977
1009
  dataset_response: dict,
978
1010
  ) -> SchemaMetadata:
979
1011
  dataset_response = dataset_response.get("result", {})
980
1012
  column_data = dataset_response.get("columns", [])
1013
+ metric_data = dataset_response.get("metrics", [])
1014
+
1015
+ column_fields = self.gen_schema_fields(column_data)
1016
+ metric_fields = self.gen_metric_schema_fields(metric_data)
1017
+
981
1018
  schema_metadata = SchemaMetadata(
982
1019
  schemaName=dataset_response.get("table_name", ""),
983
1020
  platform=make_data_platform_urn(self.platform),
984
1021
  version=0,
985
1022
  hash="",
986
1023
  platformSchema=MySqlDDL(tableSchema=""),
987
- fields=self.gen_schema_fields(column_data),
1024
+ fields=column_fields + metric_fields,
988
1025
  )
989
1026
  return schema_metadata
990
1027
 
@@ -1049,6 +1086,8 @@ class SupersetSource(StatefulIngestionSourceBase):
1049
1086
  # To generate column level lineage, we can manually decode the metadata
1050
1087
  # to produce the ColumnLineageInfo
1051
1088
  columns = dataset_response.get("result", {}).get("columns", [])
1089
+ metrics = dataset_response.get("result", {}).get("metrics", [])
1090
+
1052
1091
  fine_grained_lineages: List[FineGrainedLineageClass] = []
1053
1092
 
1054
1093
  for column in columns:
@@ -1067,6 +1106,22 @@ class SupersetSource(StatefulIngestionSourceBase):
1067
1106
  )
1068
1107
  )
1069
1108
 
1109
+ for metric in metrics:
1110
+ metric_name = metric.get("metric_name", "")
1111
+ if not metric_name:
1112
+ continue
1113
+
1114
+ downstream = [make_schema_field_urn(datasource_urn, metric_name)]
1115
+ upstreams = [make_schema_field_urn(upstream_dataset, metric_name)]
1116
+ fine_grained_lineages.append(
1117
+ FineGrainedLineageClass(
1118
+ downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
1119
+ downstreams=downstream,
1120
+ upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
1121
+ upstreams=upstreams,
1122
+ )
1123
+ )
1124
+
1070
1125
  upstream_lineage = UpstreamLineageClass(
1071
1126
  upstreams=[
1072
1127
  UpstreamClass(
@@ -149,7 +149,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
149
149
  )
150
150
  from datahub.metadata.schema_classes import (
151
151
  BrowsePathsClass,
152
- ChangeTypeClass,
153
152
  ChartInfoClass,
154
153
  ChartUsageStatisticsClass,
155
154
  DashboardInfoClass,
@@ -529,6 +528,14 @@ class TableauConfig(
529
528
  default=False,
530
529
  description="Ingest details for tables external to (not embedded in) tableau as entities.",
531
530
  )
531
+ emit_all_published_datasources: bool = Field(
532
+ default=False,
533
+ description="Ingest all published data sources. When False (default), only ingest published data sources that belong to an ingested workbook.",
534
+ )
535
+ emit_all_embedded_datasources: bool = Field(
536
+ default=False,
537
+ description="Ingest all embedded data sources. When False (default), only ingest embedded data sources that belong to an ingested workbook.",
538
+ )
532
539
 
533
540
  env: str = Field(
534
541
  default=builder.DEFAULT_ENV,
@@ -2180,32 +2187,32 @@ class TableauSiteSource:
2180
2187
  else []
2181
2188
  )
2182
2189
 
2183
- # The Tableau SQL parser much worse than our sqlglot based parser,
2184
- # so relying on metadata parsed by Tableau from SQL queries can be
2185
- # less accurate. This option allows us to ignore Tableau's parser and
2186
- # only use our own.
2187
- if self.config.force_extraction_of_lineage_from_custom_sql_queries:
2188
- logger.debug("Extracting TLL & CLL from custom sql (forced)")
2190
+ tableau_table_list = csql.get(c.TABLES, [])
2191
+ if self.config.force_extraction_of_lineage_from_custom_sql_queries or (
2192
+ not tableau_table_list
2193
+ and self.config.extract_lineage_from_unsupported_custom_sql_queries
2194
+ ):
2195
+ if not tableau_table_list:
2196
+ # custom sql tables may contain unsupported sql, causing incomplete lineage
2197
+ # we extract the lineage from the raw queries
2198
+ logger.debug(
2199
+ "Parsing TLL & CLL from custom sql (tableau metadata incomplete)"
2200
+ )
2201
+ else:
2202
+ # The Tableau SQL parser is much worse than our sqlglot based parser,
2203
+ # so relying on metadata parsed by Tableau from SQL queries can be
2204
+ # less accurate. This option allows us to ignore Tableau's parser and
2205
+ # only use our own.
2206
+ logger.debug("Parsing TLL & CLL from custom sql (forced)")
2207
+
2189
2208
  yield from self._create_lineage_from_unsupported_csql(
2190
2209
  csql_urn, csql, columns
2191
2210
  )
2192
2211
  else:
2193
- tables = csql.get(c.TABLES, [])
2194
-
2195
- if tables:
2196
- # lineage from custom sql -> datasets/tables #
2197
- yield from self._create_lineage_to_upstream_tables(
2198
- csql_urn, tables, datasource
2199
- )
2200
- elif (
2201
- self.config.extract_lineage_from_unsupported_custom_sql_queries
2202
- ):
2203
- logger.debug("Extracting TLL & CLL from custom sql")
2204
- # custom sql tables may contain unsupported sql, causing incomplete lineage
2205
- # we extract the lineage from the raw queries
2206
- yield from self._create_lineage_from_unsupported_csql(
2207
- csql_urn, csql, columns
2208
- )
2212
+ # lineage from custom sql -> datasets/tables #
2213
+ yield from self._create_lineage_to_upstream_tables(
2214
+ csql_urn, tableau_table_list, datasource
2215
+ )
2209
2216
 
2210
2217
  # Schema Metadata
2211
2218
  schema_metadata = self.get_schema_metadata_for_custom_sql(columns)
@@ -2243,7 +2250,6 @@ class TableauSiteSource:
2243
2250
  yield self.get_metadata_change_event(dataset_snapshot)
2244
2251
  yield self.get_metadata_change_proposal(
2245
2252
  dataset_snapshot.urn,
2246
- aspect_name=c.SUB_TYPES,
2247
2253
  aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW, c.CUSTOM_SQL]),
2248
2254
  )
2249
2255
 
@@ -2408,7 +2414,6 @@ class TableauSiteSource:
2408
2414
  upstream_lineage = UpstreamLineage(upstreams=upstream_tables)
2409
2415
  yield self.get_metadata_change_proposal(
2410
2416
  csql_urn,
2411
- aspect_name=c.UPSTREAM_LINEAGE,
2412
2417
  aspect=upstream_lineage,
2413
2418
  )
2414
2419
  self.report.num_tables_with_upstream_lineage += 1
@@ -2594,7 +2599,6 @@ class TableauSiteSource:
2594
2599
  )
2595
2600
  yield self.get_metadata_change_proposal(
2596
2601
  csql_urn,
2597
- aspect_name=c.UPSTREAM_LINEAGE,
2598
2602
  aspect=upstream_lineage,
2599
2603
  )
2600
2604
  self.report.num_tables_with_upstream_lineage += 1
@@ -2640,14 +2644,10 @@ class TableauSiteSource:
2640
2644
  def get_metadata_change_proposal(
2641
2645
  self,
2642
2646
  urn: str,
2643
- aspect_name: str,
2644
2647
  aspect: Union["UpstreamLineage", "SubTypesClass"],
2645
2648
  ) -> MetadataWorkUnit:
2646
2649
  return MetadataChangeProposalWrapper(
2647
- entityType=c.DATASET,
2648
- changeType=ChangeTypeClass.UPSERT,
2649
2650
  entityUrn=urn,
2650
- aspectName=aspect_name,
2651
2651
  aspect=aspect,
2652
2652
  ).as_workunit()
2653
2653
 
@@ -2755,7 +2755,6 @@ class TableauSiteSource:
2755
2755
  )
2756
2756
  yield self.get_metadata_change_proposal(
2757
2757
  datasource_urn,
2758
- aspect_name=c.UPSTREAM_LINEAGE,
2759
2758
  aspect=upstream_lineage,
2760
2759
  )
2761
2760
  self.report.num_tables_with_upstream_lineage += 1
@@ -2774,7 +2773,6 @@ class TableauSiteSource:
2774
2773
  yield self.get_metadata_change_event(dataset_snapshot)
2775
2774
  yield self.get_metadata_change_proposal(
2776
2775
  dataset_snapshot.urn,
2777
- aspect_name=c.SUB_TYPES,
2778
2776
  aspect=SubTypesClass(
2779
2777
  typeNames=(
2780
2778
  ["Embedded Data Source"]
@@ -2860,7 +2858,11 @@ class TableauSiteSource:
2860
2858
  return datasource
2861
2859
 
2862
2860
  def emit_published_datasources(self) -> Iterable[MetadataWorkUnit]:
2863
- datasource_filter = {c.ID_WITH_IN: self.datasource_ids_being_used}
2861
+ datasource_filter = (
2862
+ {}
2863
+ if self.config.emit_all_published_datasources
2864
+ else {c.ID_WITH_IN: self.datasource_ids_being_used}
2865
+ )
2864
2866
 
2865
2867
  for datasource in self.get_connection_objects(
2866
2868
  query=published_datasource_graphql_query,
@@ -3553,7 +3555,11 @@ class TableauSiteSource:
3553
3555
  return browse_paths
3554
3556
 
3555
3557
  def emit_embedded_datasources(self) -> Iterable[MetadataWorkUnit]:
3556
- datasource_filter = {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}
3558
+ datasource_filter = (
3559
+ {}
3560
+ if self.config.emit_all_embedded_datasources
3561
+ else {c.ID_WITH_IN: self.embedded_datasource_ids_being_used}
3562
+ )
3557
3563
 
3558
3564
  for datasource in self.get_connection_objects(
3559
3565
  query=embedded_datasource_graphql_query,
@@ -50,7 +50,6 @@ TABLES = "tables"
50
50
  DESCRIPTION = "description"
51
51
  SQL = "SQL"
52
52
  QUERY = "query"
53
- SUB_TYPES = "subTypes"
54
53
  VIEW = "view"
55
54
  CUSTOM_SQL = "Custom SQL"
56
55
  REMOTE_TYPE = "remoteType"
@@ -58,7 +57,6 @@ UNKNOWN = "UNKNOWN"
58
57
  PUBLISHED_DATA_SOURCE = "PublishedDatasource"
59
58
  LUID = "luid"
60
59
  EMBEDDED_DATA_SOURCE = "EmbeddedDatasource"
61
- UPSTREAM_LINEAGE = "upstreamLineage"
62
60
  OWNER = "owner"
63
61
  USERNAME = "username"
64
62
  HAS_EXTRACTS = "hasExtracts"
@@ -507,9 +507,10 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
507
507
  def _execute_sql_query(self, query: str) -> List[List[str]]:
508
508
  """Execute SQL query using databricks-sql connector for better performance"""
509
509
  try:
510
- with connect(
511
- **self._sql_connection_params
512
- ) as connection, connection.cursor() as cursor:
510
+ with (
511
+ connect(**self._sql_connection_params) as connection,
512
+ connection.cursor() as cursor,
513
+ ):
513
514
  cursor.execute(query)
514
515
  return cursor.fetchall()
515
516
 
@@ -56,6 +56,7 @@ from datahub.ingestion.source.aws.s3_util import (
56
56
  from datahub.ingestion.source.common.subtypes import (
57
57
  DatasetContainerSubTypes,
58
58
  DatasetSubTypes,
59
+ SourceCapabilityModifier,
59
60
  )
60
61
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
61
62
  StaleEntityRemovalHandler,
@@ -152,7 +153,14 @@ logger: logging.Logger = logging.getLogger(__name__)
152
153
  @capability(SourceCapability.USAGE_STATS, "Enabled by default")
153
154
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
154
155
  @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
155
- @capability(SourceCapability.CONTAINERS, "Enabled by default")
156
+ @capability(
157
+ SourceCapability.CONTAINERS,
158
+ "Enabled by default",
159
+ subtype_modifier=[
160
+ SourceCapabilityModifier.CATALOG,
161
+ SourceCapabilityModifier.SCHEMA,
162
+ ],
163
+ )
156
164
  @capability(SourceCapability.OWNERSHIP, "Supported via the `include_ownership` config")
157
165
  @capability(
158
166
  SourceCapability.DATA_PROFILING, "Supported via the `profiling.enabled` config"
@@ -768,10 +776,11 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
768
776
 
769
777
  def gen_schema_containers(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
770
778
  domain_urn = self._gen_domain_urn(f"{schema.catalog.name}.{schema.name}")
771
- schema_tags = self.unity_catalog_api_proxy.get_schema_tags(
772
- schema.catalog.name
773
- ).get(f"{schema.catalog.name}.{schema.name}", [])
774
- if schema_tags:
779
+ schema_tags = []
780
+ if self.config.include_tags:
781
+ schema_tags = self.unity_catalog_api_proxy.get_schema_tags(
782
+ schema.catalog.name
783
+ ).get(f"{schema.catalog.name}.{schema.name}", [])
775
784
  logger.debug(f"Schema tags for {schema.name}: {schema_tags}")
776
785
  # Generate platform resources for schema tags
777
786
  yield from self.gen_platform_resources(schema_tags)
@@ -809,10 +818,11 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
809
818
 
810
819
  def gen_catalog_containers(self, catalog: Catalog) -> Iterable[MetadataWorkUnit]:
811
820
  domain_urn = self._gen_domain_urn(catalog.name)
812
- catalog_tags = self.unity_catalog_api_proxy.get_catalog_tags(catalog.name).get(
813
- catalog.name, []
814
- )
815
- if catalog_tags:
821
+ catalog_tags = []
822
+ if self.config.include_tags:
823
+ catalog_tags = self.unity_catalog_api_proxy.get_catalog_tags(
824
+ catalog.name
825
+ ).get(catalog.name, [])
816
826
  logger.debug(f"Schema tags for {catalog.name}: {catalog_tags}")
817
827
  # Generate platform resources for schema tags
818
828
  yield from self.gen_platform_resources(catalog_tags)
@@ -84,9 +84,10 @@ class SnowflakeAssertionCompiler(AssertionCompiler):
84
84
 
85
85
  dmf_definitions_path = self.output_dir / DMF_DEFINITIONS_FILE_NAME
86
86
  dmf_associations_path = self.output_dir / DMF_ASSOCIATIONS_FILE_NAME
87
- with (dmf_definitions_path).open("w") as definitions, (
88
- dmf_associations_path
89
- ).open("w") as associations:
87
+ with (
88
+ (dmf_definitions_path).open("w") as definitions,
89
+ (dmf_associations_path).open("w") as associations,
90
+ ):
90
91
  for assertion_spec in assertion_config_spec.assertions:
91
92
  result.report.num_processed += 1
92
93
  try: