acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Files changed (205)
  1. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
  2. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
  3. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +31 -7
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +8 -5
  46. datahub/ingestion/source/dbt/dbt_core.py +11 -9
  47. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  48. datahub/ingestion/source/delta_lake/config.py +8 -1
  49. datahub/ingestion/source/delta_lake/report.py +4 -2
  50. datahub/ingestion/source/delta_lake/source.py +20 -5
  51. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  52. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  53. datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
  54. datahub/ingestion/source/elastic_search.py +26 -6
  55. datahub/ingestion/source/feast.py +27 -8
  56. datahub/ingestion/source/file.py +6 -3
  57. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  58. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  59. datahub/ingestion/source/ge_data_profiler.py +12 -15
  60. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  61. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  62. datahub/ingestion/source/identity/okta.py +37 -7
  63. datahub/ingestion/source/kafka/kafka.py +1 -1
  64. datahub/ingestion/source/kafka_connect/common.py +2 -7
  65. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  66. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  67. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  68. datahub/ingestion/source/looker/looker_common.py +6 -5
  69. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  70. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  71. datahub/ingestion/source/looker/looker_source.py +1 -1
  72. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  73. datahub/ingestion/source/looker/lookml_source.py +3 -2
  74. datahub/ingestion/source/metabase.py +57 -35
  75. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  76. datahub/ingestion/source/metadata/lineage.py +2 -2
  77. datahub/ingestion/source/mlflow.py +365 -35
  78. datahub/ingestion/source/mode.py +18 -8
  79. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  80. datahub/ingestion/source/nifi.py +37 -11
  81. datahub/ingestion/source/openapi.py +1 -1
  82. datahub/ingestion/source/openapi_parser.py +49 -17
  83. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  84. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  85. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  86. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  87. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  88. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  89. datahub/ingestion/source/preset.py +7 -4
  90. datahub/ingestion/source/pulsar.py +3 -2
  91. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  92. datahub/ingestion/source/redash.py +31 -7
  93. datahub/ingestion/source/redshift/config.py +4 -0
  94. datahub/ingestion/source/redshift/datashares.py +236 -0
  95. datahub/ingestion/source/redshift/lineage.py +6 -2
  96. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  97. datahub/ingestion/source/redshift/profile.py +1 -1
  98. datahub/ingestion/source/redshift/query.py +133 -33
  99. datahub/ingestion/source/redshift/redshift.py +46 -73
  100. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  101. datahub/ingestion/source/redshift/report.py +3 -0
  102. datahub/ingestion/source/s3/config.py +5 -5
  103. datahub/ingestion/source/s3/source.py +20 -41
  104. datahub/ingestion/source/salesforce.py +550 -275
  105. datahub/ingestion/source/schema_inference/object.py +1 -1
  106. datahub/ingestion/source/sigma/sigma.py +1 -1
  107. datahub/ingestion/source/slack/slack.py +31 -10
  108. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  109. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  110. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  111. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  112. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  113. datahub/ingestion/source/sql/athena.py +10 -16
  114. datahub/ingestion/source/sql/druid.py +1 -5
  115. datahub/ingestion/source/sql/hive.py +15 -6
  116. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  117. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  118. datahub/ingestion/source/sql/mssql/source.py +11 -5
  119. datahub/ingestion/source/sql/oracle.py +127 -63
  120. datahub/ingestion/source/sql/sql_common.py +16 -18
  121. datahub/ingestion/source/sql/sql_types.py +2 -2
  122. datahub/ingestion/source/sql/teradata.py +19 -5
  123. datahub/ingestion/source/sql/trino.py +2 -2
  124. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  125. datahub/ingestion/source/superset.py +222 -62
  126. datahub/ingestion/source/tableau/tableau.py +22 -6
  127. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  128. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  129. datahub/ingestion/source/unity/source.py +11 -1
  130. datahub/ingestion/source/vertexai.py +697 -0
  131. datahub/ingestion/source_config/pulsar.py +3 -1
  132. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  133. datahub/lite/duckdb_lite.py +3 -10
  134. datahub/lite/lite_local.py +1 -1
  135. datahub/lite/lite_util.py +4 -3
  136. datahub/metadata/_schema_classes.py +714 -417
  137. datahub/metadata/_urns/urn_defs.py +1673 -1649
  138. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  139. datahub/metadata/schema.avsc +16438 -16603
  140. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  141. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  142. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  143. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  144. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  145. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  146. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  147. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  148. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  149. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  150. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  151. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  152. datahub/metadata/schemas/DomainKey.avsc +2 -1
  153. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  154. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  155. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  156. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  157. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  158. datahub/metadata/schemas/InputFields.avsc +3 -1
  159. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  160. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  162. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  163. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  164. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  165. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  166. datahub/metadata/schemas/PostKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  168. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  169. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  170. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  171. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  172. datahub/pydantic/__init__.py +0 -0
  173. datahub/pydantic/compat.py +58 -0
  174. datahub/sdk/__init__.py +30 -12
  175. datahub/sdk/_all_entities.py +1 -1
  176. datahub/sdk/_attribution.py +4 -0
  177. datahub/sdk/_shared.py +258 -16
  178. datahub/sdk/_utils.py +35 -0
  179. datahub/sdk/container.py +30 -6
  180. datahub/sdk/dataset.py +118 -20
  181. datahub/sdk/{_entity.py → entity.py} +24 -1
  182. datahub/sdk/entity_client.py +1 -1
  183. datahub/sdk/main_client.py +23 -0
  184. datahub/sdk/resolver_client.py +17 -29
  185. datahub/sdk/search_client.py +50 -0
  186. datahub/sdk/search_filters.py +374 -0
  187. datahub/specific/dataset.py +3 -4
  188. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  189. datahub/sql_parsing/schema_resolver.py +1 -1
  190. datahub/sql_parsing/split_statements.py +220 -126
  191. datahub/sql_parsing/sql_parsing_common.py +7 -0
  192. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  193. datahub/sql_parsing/sqlglot_utils.py +1 -4
  194. datahub/testing/check_sql_parser_result.py +5 -6
  195. datahub/testing/compare_metadata_json.py +7 -6
  196. datahub/testing/pytest_hooks.py +56 -0
  197. datahub/upgrade/upgrade.py +2 -2
  198. datahub/utilities/file_backed_collections.py +3 -14
  199. datahub/utilities/ingest_utils.py +106 -0
  200. datahub/utilities/mapping.py +1 -1
  201. datahub/utilities/memory_footprint.py +3 -2
  202. datahub/utilities/sentinels.py +22 -0
  203. datahub/utilities/unified_diff.py +5 -1
  204. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  205. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/superset.py

@@ -1,5 +1,6 @@
  import json
  import logging
+ from dataclasses import dataclass, field
  from datetime import datetime
  from functools import lru_cache
  from typing import Any, Dict, Iterable, List, Optional
@@ -22,6 +23,7 @@ from datahub.emitter.mce_builder import (
      make_dataset_urn,
      make_dataset_urn_with_platform_instance,
      make_domain_urn,
+     make_user_urn,
  )
  from datahub.emitter.mcp_builder import add_domain_to_entity_wu
  from datahub.ingestion.api.common import PipelineContext
@@ -36,9 +38,6 @@ from datahub.ingestion.api.decorators import (
  from datahub.ingestion.api.source import MetadataWorkUnitProcessor
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.sql.sql_types import resolve_sql_type
- from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
-     get_platform_from_sqlalchemy_uri,
- )
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
      StaleEntityRemovalHandler,
      StaleEntityRemovalSourceReport,
@@ -49,7 +48,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
      StatefulIngestionSourceBase,
  )
  from datahub.metadata.com.linkedin.pegasus2avro.common import (
-     AuditStamp,
      ChangeAuditStamps,
      Status,
      TimeStamp,
@@ -68,12 +66,22 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
      SchemaMetadata,
  )
  from datahub.metadata.schema_classes import (
+     AuditStampClass,
      ChartInfoClass,
      ChartTypeClass,
      DashboardInfoClass,
+     DatasetLineageTypeClass,
      DatasetPropertiesClass,
+     GlobalTagsClass,
+     OwnerClass,
+     OwnershipClass,
+     OwnershipTypeClass,
+     TagAssociationClass,
+     UpstreamClass,
+     UpstreamLineageClass,
  )
  from datahub.utilities import config_clean
+ from datahub.utilities.lossy_collections import LossyList
  from datahub.utilities.registries.domain_registry import DomainRegistry

  logger = logging.getLogger(__name__)
@@ -101,6 +109,14 @@ chart_type_from_viz_type = {
  platform_without_databases = ["druid"]


+ @dataclass
+ class SupersetSourceReport(StaleEntityRemovalSourceReport):
+     filtered: LossyList[str] = field(default_factory=LossyList)
+
+     def report_dropped(self, name: str) -> None:
+         self.filtered.append(name)
+
+
  class SupersetDataset(BaseModel):
      id: int
      table_name: str
@@ -136,6 +152,18 @@ class SupersetConfig(
          default=dict(),
          description="regex patterns for tables to filter to assign domain_key. ",
      )
+     dataset_pattern: AllowDenyPattern = Field(
+         default=AllowDenyPattern.allow_all(),
+         description="Regex patterns for dataset to filter in ingestion.",
+     )
+     chart_pattern: AllowDenyPattern = Field(
+         AllowDenyPattern.allow_all(),
+         description="Patterns for selecting chart names that are to be included",
+     )
+     dashboard_pattern: AllowDenyPattern = Field(
+         AllowDenyPattern.allow_all(),
+         description="Patterns for selecting dashboard names that are to be included",
+     )
      username: Optional[str] = Field(default=None, description="Superset username.")
      password: Optional[str] = Field(default=None, description="Superset password.")
      # Configuration for stateful ingestion
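
Illustration (not part of the diff): the new dataset_pattern, chart_pattern, and dashboard_pattern fields are ordinary AllowDenyPattern filters, so they can be exercised directly; the regexes below are made-up examples.

    from datahub.configuration.common import AllowDenyPattern

    # Keep only dashboards whose title starts with "Sales"; anything else is
    # reported via SupersetSourceReport.report_dropped() and skipped.
    dashboard_pattern = AllowDenyPattern(allow=["Sales.*"])
    assert dashboard_pattern.allowed("Sales Overview")
    assert not dashboard_pattern.allowed("Internal Debug Board")

datahub/ingestion/source/superset.py (continued)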
@@ -216,7 +244,7 @@ class SupersetSource(StatefulIngestionSourceBase):
      """

      config: SupersetConfig
-     report: StaleEntityRemovalSourceReport
+     report: SupersetSourceReport
      platform = "superset"

      def __hash__(self):
@@ -225,13 +253,14 @@ class SupersetSource(StatefulIngestionSourceBase):
      def __init__(self, ctx: PipelineContext, config: SupersetConfig):
          super().__init__(config, ctx)
          self.config = config
-         self.report = StaleEntityRemovalSourceReport()
+         self.report = SupersetSourceReport()
          if self.config.domain:
              self.domain_registry = DomainRegistry(
                  cached_domains=[domain_id for domain_id in self.config.domain],
                  graph=self.ctx.graph,
              )
          self.session = self.login()
+         self.owner_info = self.parse_owner_info()

      def login(self) -> requests.Session:
          login_response = requests.post(
@@ -271,7 +300,7 @@ class SupersetSource(StatefulIngestionSourceBase):

          while current_page * page_size < total_items:
              response = self.session.get(
-                 f"{self.config.connect_uri}/api/v1/{entity_type}/",
+                 f"{self.config.connect_uri}/api/v1/{entity_type}",
                  params={"q": f"(page:{current_page},page_size:{page_size})"},
              )

@@ -287,25 +316,24 @@ class SupersetSource(StatefulIngestionSourceBase):

              current_page += 1

-     @lru_cache(maxsize=None)
-     def get_platform_from_database_id(self, database_id):
-         database_response = self.session.get(
-             f"{self.config.connect_uri}/api/v1/database/{database_id}"
-         ).json()
-         sqlalchemy_uri = database_response.get("result", {}).get("sqlalchemy_uri")
-         if sqlalchemy_uri is None:
-             platform_name = database_response.get("result", {}).get(
-                 "backend", "external"
-             )
-         else:
-             platform_name = get_platform_from_sqlalchemy_uri(sqlalchemy_uri)
-         if platform_name == "awsathena":
-             return "athena"
-         if platform_name == "clickhousedb":
-             return "clickhouse"
-         if platform_name == "postgresql":
-             return "postgres"
-         return platform_name
+     def parse_owner_info(self) -> Dict[str, Any]:
+         entity_types = ["dataset", "dashboard", "chart"]
+         owners_info = {}
+
+         for entity in entity_types:
+             for owner in self.paginate_entity_api_results(f"{entity}/related/owners"):
+                 owner_id = owner.get("value")
+                 if owner_id:
+                     owners_info[owner_id] = owner.get("extra", {}).get("email", "")
+
+         return owners_info
+
+     def build_owner_urn(self, data: Dict[str, Any]) -> List[str]:
+         return [
+             make_user_urn(self.owner_info.get(owner.get("id"), ""))
+             for owner in data.get("owners", [])
+             if owner.get("id")
+         ]

      @lru_cache(maxsize=None)
      def get_dataset_info(self, dataset_id: int) -> dict:
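
Illustration (not part of the diff): build_owner_urn() resolves Superset owner ids to the emails collected by parse_owner_info() and wraps them as corpuser URNs via make_user_urn. With a hypothetical email:

    from datahub.emitter.mce_builder import make_user_urn

    assert make_user_urn("jdoe@example.com") == "urn:li:corpuser:jdoe@example.com"

datahub/ingestion/source/superset.py (continued)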
@@ -323,8 +351,6 @@ class SupersetSource(StatefulIngestionSourceBase):
          schema_name = dataset_response.get("result", {}).get("schema")
          table_name = dataset_response.get("result", {}).get("table_name")
          database_id = dataset_response.get("result", {}).get("database", {}).get("id")
-         platform = self.get_platform_from_database_id(database_id)
-
          database_name = (
              dataset_response.get("result", {}).get("database", {}).get("database_name")
          )
@@ -333,21 +359,24 @@ class SupersetSource(StatefulIngestionSourceBase):
          # Druid do not have a database concept and has a limited schema concept, but they are nonetheless reported
          # from superset. There is only one database per platform instance, and one schema named druid, so it would be
          # redundant to systemically store them both in the URN.
-         if platform in platform_without_databases:
+         if platform_instance in platform_without_databases:
              database_name = None

-         if platform == "druid" and schema_name == "druid":
+         if platform_instance == "druid" and schema_name == "druid":
              # Follow DataHub's druid source convention.
              schema_name = None

-         if database_id and table_name:
+         # If the information about the datasource is already contained in the dataset response,
+         # can just return the urn directly
+         if table_name and database_id:
              return make_dataset_urn(
-                 platform=platform,
+                 platform=platform_instance,
                  name=".".join(
                      name for name in [database_name, schema_name, table_name] if name
                  ),
                  env=self.config.env,
              )
+
          raise ValueError("Could not construct dataset URN")

      def construct_dashboard_from_api_data(
@@ -355,7 +384,7 @@ class SupersetSource(StatefulIngestionSourceBase):
      ) -> DashboardSnapshot:
          dashboard_urn = make_dashboard_urn(
              platform=self.platform,
-             name=dashboard_data["id"],
+             name=str(dashboard_data["id"]),
              platform_instance=self.config.platform_instance,
          )
          dashboard_snapshot = DashboardSnapshot(
@@ -363,15 +392,16 @@ class SupersetSource(StatefulIngestionSourceBase):
              aspects=[Status(removed=False)],
          )

-         modified_actor = f"urn:li:corpuser:{(dashboard_data.get('changed_by') or {}).get('username', 'unknown')}"
+         modified_actor = f"urn:li:corpuser:{self.owner_info.get((dashboard_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
          modified_ts = int(
              dp.parse(dashboard_data.get("changed_on_utc", "now")).timestamp() * 1000
          )
          title = dashboard_data.get("dashboard_title", "")
          # note: the API does not currently supply created_by usernames due to a bug
-         last_modified = ChangeAuditStamps(
-             created=None,
-             lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
+         last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
+
+         change_audit_stamps = ChangeAuditStamps(
+             created=None, lastModified=last_modified
          )
          dashboard_url = f"{self.config.display_uri}{dashboard_data.get('url', '')}"

@@ -386,7 +416,7 @@ class SupersetSource(StatefulIngestionSourceBase):
              chart_urns.append(
                  make_chart_urn(
                      platform=self.platform,
-                     name=value.get("meta", {}).get("chartId", "unknown"),
+                     name=str(value.get("meta", {}).get("chartId", "unknown")),
                      platform_instance=self.config.platform_instance,
                  )
              )
@@ -397,13 +427,11 @@ class SupersetSource(StatefulIngestionSourceBase):
              "IsPublished": str(dashboard_data.get("published", False)).lower(),
              "Owners": ", ".join(
                  map(
-                     lambda owner: owner.get("username", "unknown"),
+                     lambda owner: self.owner_info.get(owner.get("id", -1), "unknown"),
                      dashboard_data.get("owners", []),
                  )
              ),
-             "IsCertified": str(
-                 True if dashboard_data.get("certified_by") else False
-             ).lower(),
+             "IsCertified": str(bool(dashboard_data.get("certified_by"))).lower(),
          }

          if dashboard_data.get("certified_by"):
@@ -417,16 +445,39 @@ class SupersetSource(StatefulIngestionSourceBase):
              description="",
              title=title,
              charts=chart_urns,
-             lastModified=last_modified,
              dashboardUrl=dashboard_url,
              customProperties=custom_properties,
+             lastModified=change_audit_stamps,
          )
          dashboard_snapshot.aspects.append(dashboard_info)
+
+         dashboard_owners_list = self.build_owner_urn(dashboard_data)
+         owners_info = OwnershipClass(
+             owners=[
+                 OwnerClass(
+                     owner=urn,
+                     type=OwnershipTypeClass.TECHNICAL_OWNER,
+                 )
+                 for urn in (dashboard_owners_list or [])
+             ],
+             lastModified=last_modified,
+         )
+         dashboard_snapshot.aspects.append(owners_info)
+
          return dashboard_snapshot

      def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
-         for dashboard_data in self.paginate_entity_api_results("dashboard", PAGE_SIZE):
+         for dashboard_data in self.paginate_entity_api_results("dashboard/", PAGE_SIZE):
              try:
+                 dashboard_id = str(dashboard_data.get("id"))
+                 dashboard_title = dashboard_data.get("dashboard_title", "")
+
+                 if not self.config.dashboard_pattern.allowed(dashboard_title):
+                     self.report.report_dropped(
+                         f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
+                     )
+                     continue
+
                  dashboard_snapshot = self.construct_dashboard_from_api_data(
                      dashboard_data
                  )
@@ -439,14 +490,14 @@ class SupersetSource(StatefulIngestionSourceBase):
              mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
              yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
              yield from self._get_domain_wu(
-                 title=dashboard_data.get("dashboard_title", ""),
+                 title=dashboard_title,
                  entity_urn=dashboard_snapshot.urn,
              )

      def construct_chart_from_chart_data(self, chart_data: dict) -> ChartSnapshot:
          chart_urn = make_chart_urn(
              platform=self.platform,
-             name=chart_data["id"],
+             name=str(chart_data["id"]),
              platform_instance=self.config.platform_instance,
          )
          chart_snapshot = ChartSnapshot(
@@ -454,25 +505,33 @@ class SupersetSource(StatefulIngestionSourceBase):
              aspects=[Status(removed=False)],
          )

-         modified_actor = f"urn:li:corpuser:{(chart_data.get('changed_by') or {}).get('username', 'unknown')}"
+         modified_actor = f"urn:li:corpuser:{self.owner_info.get((chart_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
          modified_ts = int(
              dp.parse(chart_data.get("changed_on_utc", "now")).timestamp() * 1000
          )
          title = chart_data.get("slice_name", "")

          # note: the API does not currently supply created_by usernames due to a bug
-         last_modified = ChangeAuditStamps(
-             created=None,
-             lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
+         last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
+
+         change_audit_stamps = ChangeAuditStamps(
+             created=None, lastModified=last_modified
          )
+
          chart_type = chart_type_from_viz_type.get(chart_data.get("viz_type", ""))
          chart_url = f"{self.config.display_uri}{chart_data.get('url', '')}"

          datasource_id = chart_data.get("datasource_id")
-         dataset_response = self.get_dataset_info(datasource_id)
-         datasource_urn = self.get_datasource_urn_from_id(
-             dataset_response, self.platform
-         )
+         if not datasource_id:
+             logger.debug(
+                 f"chart {chart_data['id']} has no datasource_id, skipping fetching dataset info"
+             )
+             datasource_urn = None
+         else:
+             dataset_response = self.get_dataset_info(datasource_id)
+             datasource_urn = self.get_datasource_urn_from_id(
+                 dataset_response, self.platform
+             )

          params = json.loads(chart_data.get("params", "{}"))
          metrics = [
@@ -515,23 +574,61 @@ class SupersetSource(StatefulIngestionSourceBase):
              type=chart_type,
              description="",
              title=title,
-             lastModified=last_modified,
              chartUrl=chart_url,
              inputs=[datasource_urn] if datasource_urn else None,
              customProperties=custom_properties,
+             lastModified=change_audit_stamps,
          )
          chart_snapshot.aspects.append(chart_info)
+
+         chart_owners_list = self.build_owner_urn(chart_data)
+         owners_info = OwnershipClass(
+             owners=[
+                 OwnerClass(
+                     owner=urn,
+                     type=OwnershipTypeClass.TECHNICAL_OWNER,
+                 )
+                 for urn in (chart_owners_list or [])
+             ],
+             lastModified=last_modified,
+         )
+         chart_snapshot.aspects.append(owners_info)
          return chart_snapshot

      def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
-         for chart_data in self.paginate_entity_api_results("chart", PAGE_SIZE):
+         for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE):
              try:
+                 chart_id = str(chart_data.get("id"))
+                 chart_name = chart_data.get("slice_name", "")
+
+                 if not self.config.chart_pattern.allowed(chart_name):
+                     self.report.report_dropped(
+                         f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
+                     )
+                     continue
+
+                 # Emit a warning if charts use data from a dataset that will be filtered out
+                 if self.config.dataset_pattern != AllowDenyPattern.allow_all():
+                     datasource_id = chart_data.get("datasource_id")
+                     if datasource_id:
+                         dataset_response = self.get_dataset_info(datasource_id)
+                         dataset_name = dataset_response.get("result", {}).get(
+                             "table_name", ""
+                         )
+
+                         if dataset_name and not self.config.dataset_pattern.allowed(
+                             dataset_name
+                         ):
+                             self.report.warning(
+                                 f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
+                             )
+
                  chart_snapshot = self.construct_chart_from_chart_data(chart_data)

                  mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
              except Exception as e:
                  self.report.warning(
-                     f"Failed to construct chart snapshot. Chart name: {chart_data.get('table_name')}. Error: \n{e}"
+                     f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
                  )
                  continue
              # Emit the chart
@@ -588,25 +685,65 @@ class SupersetSource(StatefulIngestionSourceBase):
      ) -> DatasetSnapshot:
          dataset_response = self.get_dataset_info(dataset_data.get("id"))
          dataset = SupersetDataset(**dataset_response["result"])
+
          datasource_urn = self.get_datasource_urn_from_id(
              dataset_response, self.platform
          )
+         dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"
+
+         modified_actor = f"urn:li:corpuser:{self.owner_info.get((dataset_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+         modified_ts = int(
+             dp.parse(dataset_data.get("changed_on_utc", "now")).timestamp() * 1000
+         )
+         last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
+
+         upstream_warehouse_platform = (
+             dataset_response.get("result", {}).get("database", {}).get("backend")
+         )

-         dataset_url = f"{self.config.display_uri}{dataset.explore_url or ''}"
+         # Preset has a way of naming their platforms differently than
+         # how datahub names them, so map the platform name to the correct naming
+         warehouse_naming = {
+             "awsathena": "athena",
+             "clickhousedb": "clickhouse",
+             "postgresql": "postgres",
+         }
+
+         if upstream_warehouse_platform in warehouse_naming:
+             upstream_warehouse_platform = warehouse_naming[upstream_warehouse_platform]
+
+         # TODO: Categorize physical vs virtual upstream dataset
+         # mark all upstream dataset as physical for now, in the future we would ideally like
+         # to differentiate physical vs virtual upstream datasets
+         tag_urn = f"urn:li:tag:{self.platform}:physical"
+         upstream_dataset = self.get_datasource_urn_from_id(
+             dataset_response, upstream_warehouse_platform
+         )
+         upstream_lineage = UpstreamLineageClass(
+             upstreams=[
+                 UpstreamClass(
+                     type=DatasetLineageTypeClass.TRANSFORMED,
+                     dataset=upstream_dataset,
+                     properties={"externalUrl": dataset_url},
+                 )
+             ]
+         )

          dataset_info = DatasetPropertiesClass(
              name=dataset.table_name,
              description="",
-             lastModified=TimeStamp(time=dataset.modified_ts)
-             if dataset.modified_ts
-             else None,
              externalUrl=dataset_url,
+             lastModified=TimeStamp(time=modified_ts),
          )
+         global_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
+
          aspects_items: List[Any] = []
          aspects_items.extend(
              [
                  self.gen_schema_metadata(dataset_response),
                  dataset_info,
+                 upstream_lineage,
+                 global_tags,
              ]
          )

@@ -614,11 +751,34 @@ class SupersetSource(StatefulIngestionSourceBase):
              urn=datasource_urn,
              aspects=aspects_items,
          )
+
+         dataset_owners_list = self.build_owner_urn(dataset_data)
+         owners_info = OwnershipClass(
+             owners=[
+                 OwnerClass(
+                     owner=urn,
+                     type=OwnershipTypeClass.TECHNICAL_OWNER,
+                 )
+                 for urn in (dataset_owners_list or [])
+             ],
+             lastModified=last_modified,
+         )
+         aspects_items.append(owners_info)
+
          return dataset_snapshot

      def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
-         for dataset_data in self.paginate_entity_api_results("dataset", PAGE_SIZE):
+         for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE):
              try:
+                 dataset_name = dataset_data.get("table_name", "")
+
+                 # Check if dataset should be filtered by dataset name
+                 if not self.config.dataset_pattern.allowed(dataset_name):
+                     self.report.report_dropped(
+                         f"Dataset '{dataset_name}' filtered by dataset_pattern"
+                     )
+                     continue
+
                  dataset_snapshot = self.construct_dataset_from_dataset_data(
                      dataset_data
                  )
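
Illustration (not part of the diff): with the warehouse_naming remap above, a Superset dataset backed by an "awsathena" database gets its upstream URN built on the "athena" platform. Using a hypothetical table name:

    from datahub.emitter.mce_builder import make_dataset_urn

    urn = make_dataset_urn(platform="athena", name="my_db.my_schema.my_table", env="PROD")
    # urn:li:dataset:(urn:li:dataPlatform:athena,my_db.my_schema.my_table,PROD)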

datahub/ingestion/source/tableau/tableau.py

@@ -1562,8 +1562,9 @@ class TableauSiteSource:
          query: str,
          connection_type: str,
          page_size: int,
-         query_filter: dict = {},
+         query_filter: Optional[dict] = None,
      ) -> Iterable[dict]:
+         query_filter = query_filter or {}
          query_filter = optimize_query_filter(query_filter)

          # Calls the get_connection_object_page function to get the objects,
@@ -1910,11 +1911,7 @@ class TableauSiteSource:
                  if upstream_col.get(c.TABLE)
                  else None
              )
-             if (
-                 name
-                 and upstream_table_id
-                 and upstream_table_id in table_id_to_urn.keys()
-             ):
+             if name and upstream_table_id and upstream_table_id in table_id_to_urn:
                  parent_dataset_urn = table_id_to_urn[upstream_table_id]
                  if (
                      self.is_snowflake_urn(parent_dataset_urn)
@@ -2190,6 +2187,10 @@ class TableauSiteSource:
                  dataset_snapshot.aspects.append(browse_paths)
              else:
                  logger.debug(f"Browse path not set for Custom SQL table {csql_id}")
+                 logger.warning(
+                     f"Skipping Custom SQL table {csql_id} due to filtered downstream"
+                 )
+                 continue

              dataset_properties = DatasetPropertiesClass(
                  name=csql.get(c.NAME),
@@ -2628,6 +2629,15 @@ class TableauSiteSource:
              datasource_info = datasource

          browse_path = self._get_project_browse_path_name(datasource)
+         if (
+             not is_embedded_ds
+             and self._get_published_datasource_project_luid(datasource) is None
+         ):
+             logger.warning(
+                 f"Skip ingesting published datasource {datasource.get(c.NAME)} because of filtered project"
+             )
+             return
+
          logger.debug(f"datasource {datasource.get(c.NAME)} browse-path {browse_path}")
          datasource_id = datasource[c.ID]
          datasource_urn = builder.make_dataset_urn_with_platform_instance(
@@ -2851,6 +2861,11 @@ class TableauSiteSource:
              query_filter=tables_filter,
              page_size=self.config.effective_database_table_page_size,
          ):
+             if tableau_database_table_id_to_urn_map.get(tableau_table[c.ID]) is None:
+                 logger.warning(
+                     f"Skipping table {tableau_table[c.ID]} due to filtered out published datasource"
+                 )
+                 continue
              database_table = self.database_tables[
                  tableau_database_table_id_to_urn_map[tableau_table[c.ID]]
              ]
@@ -2905,6 +2920,7 @@ class TableauSiteSource:
              dataset_snapshot.aspects.append(browse_paths)
          else:
              logger.debug(f"Browse path not set for table {database_table.urn}")
+             return

          schema_metadata = self.get_schema_metadata_for_table(
              tableau_columns, database_table.parsed_columns

datahub/ingestion/source/tableau/tableau_common.py

@@ -514,7 +514,8 @@ FIELD_TYPE_MAPPING = {
  }


- def get_tags_from_params(params: List[str] = []) -> GlobalTagsClass:
+ def get_tags_from_params(params: Optional[List[str]] = None) -> GlobalTagsClass:
+     params = params or []
      tags = [
          TagAssociationClass(tag=builder.make_tag_urn(tag.upper()))
          for tag in params
@@ -901,7 +902,7 @@ def get_unique_custom_sql(custom_sql_list: List[dict]) -> List[dict]:
              "name": custom_sql.get("name"),
              # We assume that this is unsupported custom sql if "actual tables that this query references"
              # are missing from api result.
-             "isUnsupportedCustomSql": True if not custom_sql.get("tables") else False,
+             "isUnsupportedCustomSql": not custom_sql.get("tables"),
              "query": custom_sql.get("query"),
              "connectionType": custom_sql.get("connectionType"),
              "columns": custom_sql.get("columns"),

datahub/ingestion/source/unity/ge_profiler.py

@@ -1,3 +1,4 @@
+ import concurrent.futures
  import logging
  from concurrent.futures import ThreadPoolExecutor, as_completed
  from dataclasses import dataclass, field
@@ -91,7 +92,7 @@ class UnityCatalogGEProfiler(GenericProfiler):
                  profile_requests.append(profile_request)
                  if i > 0 and i % 100 == 0:
                      logger.info(f"Finished table-level profiling for {i} tables")
-             except TimeoutError:
+             except (TimeoutError, concurrent.futures.TimeoutError):
                  logger.warning("Timed out waiting to complete table-level profiling.")

          if len(profile_requests) == 0:
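
Illustration (not part of the diff): on Python versions before 3.11, concurrent.futures.TimeoutError is a distinct class from the builtin TimeoutError (in 3.11+ it is an alias), so catching both is what makes the timeout handling above reliable across versions. A minimal sketch:

    import concurrent.futures
    import time

    with concurrent.futures.ThreadPoolExecutor() as pool:
        future = pool.submit(time.sleep, 1)
        try:
            future.result(timeout=0.01)
        except (TimeoutError, concurrent.futures.TimeoutError):
            print("timed out waiting for the future")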

datahub/ingestion/source/unity/source.py

@@ -464,7 +464,17 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):

          with self.report.new_stage(f"Ingest schema {schema.id}"):
              yield from self.gen_schema_containers(schema)
-             yield from self.process_tables(schema)
+             try:
+                 yield from self.process_tables(schema)
+             except Exception as e:
+                 logger.exception(f"Error parsing schema {schema}")
+                 self.report.report_warning(
+                     message="Missed schema because of parsing issues",
+                     context=str(schema),
+                     title="Error parsing schema",
+                     exc=e,
+                 )
+                 continue

          self.report.schemas.processed(schema.id)