acryl-datahub 0.15.0rc12__py3-none-any.whl → 0.15.0rc14__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (26)
  1. {acryl_datahub-0.15.0rc12.dist-info → acryl_datahub-0.15.0rc14.dist-info}/METADATA +2499 -2499
  2. {acryl_datahub-0.15.0rc12.dist-info → acryl_datahub-0.15.0rc14.dist-info}/RECORD +26 -26
  3. datahub/__init__.py +1 -1
  4. datahub/ingestion/source/aws/aws_common.py +13 -1
  5. datahub/ingestion/source/aws/sagemaker.py +8 -0
  6. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  7. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  8. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  9. datahub/ingestion/source/gc/dataprocess_cleanup.py +20 -11
  10. datahub/ingestion/source/powerbi/m_query/data_classes.py +2 -13
  11. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +19 -27
  12. datahub/ingestion/source/powerbi/m_query/resolver.py +8 -10
  13. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  14. datahub/ingestion/source/preset.py +1 -0
  15. datahub/ingestion/source/snowflake/snowflake_config.py +4 -3
  16. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  17. datahub/ingestion/source/snowflake/snowflake_query.py +2 -2
  18. datahub/ingestion/source/sql/mssql/source.py +0 -2
  19. datahub/ingestion/source/sql/sql_common.py +34 -21
  20. datahub/ingestion/source/sql/sql_report.py +1 -0
  21. datahub/ingestion/source/superset.py +215 -65
  22. datahub/ingestion/source/unity/source.py +2 -0
  23. datahub/sql_parsing/sqlglot_lineage.py +7 -1
  24. {acryl_datahub-0.15.0rc12.dist-info → acryl_datahub-0.15.0rc14.dist-info}/WHEEL +0 -0
  25. {acryl_datahub-0.15.0rc12.dist-info → acryl_datahub-0.15.0rc14.dist-info}/entry_points.txt +0 -0
  26. {acryl_datahub-0.15.0rc12.dist-info → acryl_datahub-0.15.0rc14.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/preset.py

@@ -85,6 +85,7 @@ class PresetSource(SupersetSource):
         super().__init__(ctx, config)
         self.config = config
         self.report = StaleEntityRemovalSourceReport()
+        self.platform = "preset"
 
     def login(self):
         try:

datahub/ingestion/source/snowflake/snowflake_config.py

@@ -1,7 +1,7 @@
 import logging
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Set, cast
+from typing import Dict, List, Optional, Set
 
 import pydantic
 from pydantic import Field, SecretStr, root_validator, validator
@@ -118,9 +118,10 @@ class SnowflakeFilterConfig(SQLFilterConfig):
         )
 
         # Always exclude reporting metadata for INFORMATION_SCHEMA schema
-        if schema_pattern is not None and schema_pattern:
+        if schema_pattern:
             logger.debug("Adding deny for INFORMATION_SCHEMA to schema_pattern.")
-            cast(AllowDenyPattern, schema_pattern).deny.append(r".*INFORMATION_SCHEMA$")
+            assert isinstance(schema_pattern, AllowDenyPattern)
+            schema_pattern.deny.append(r".*INFORMATION_SCHEMA$")
 
         return values
 

datahub/ingestion/source/snowflake/snowflake_connection.py

@@ -43,6 +43,7 @@ _VALID_AUTH_TYPES: Dict[str, str] = {
     "EXTERNAL_BROWSER_AUTHENTICATOR": EXTERNAL_BROWSER_AUTHENTICATOR,
     "KEY_PAIR_AUTHENTICATOR": KEY_PAIR_AUTHENTICATOR,
     "OAUTH_AUTHENTICATOR": OAUTH_AUTHENTICATOR,
+    "OAUTH_AUTHENTICATOR_TOKEN": OAUTH_AUTHENTICATOR,
 }
 
 _SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com"
@@ -104,6 +105,10 @@ class SnowflakeConnectionConfig(ConfigModel):
         description="Connect args to pass to Snowflake SqlAlchemy driver",
         exclude=True,
     )
+    token: Optional[str] = pydantic.Field(
+        default=None,
+        description="OAuth token from external identity provider. Not recommended for most use cases because it will not be able to refresh once expired.",
+    )
 
     def get_account(self) -> str:
         assert self.account_id
@@ -148,6 +153,18 @@ class SnowflakeConnectionConfig(ConfigModel):
         logger.info(f"using authenticator type '{v}'")
         return v
 
+    @pydantic.validator("token", always=True)
+    def validate_token_oauth_config(cls, v, values):
+        auth_type = values.get("authentication_type")
+        if auth_type == "OAUTH_AUTHENTICATOR_TOKEN":
+            if not v:
+                raise ValueError("Token required for OAUTH_AUTHENTICATOR_TOKEN.")
+        elif v is not None:
+            raise ValueError(
+                "Token can only be provided when using OAUTH_AUTHENTICATOR_TOKEN"
+            )
+        return v
+
     @staticmethod
     def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None:
         if oauth_config is None:
@@ -333,6 +350,17 @@ class SnowflakeConnectionConfig(ConfigModel):
                 application=_APPLICATION_NAME,
                 **connect_args,
             )
+        elif self.authentication_type == "OAUTH_AUTHENTICATOR_TOKEN":
+            return snowflake.connector.connect(
+                user=self.username,
+                account=self.account_id,
+                authenticator="oauth",
+                token=self.token,  # Token generated externally and provided directly to the recipe
+                warehouse=self.warehouse,
+                role=self.role,
+                application=_APPLICATION_NAME,
+                **connect_args,
+            )
         elif self.authentication_type == "OAUTH_AUTHENTICATOR":
             return self.get_oauth_connection()
         elif self.authentication_type == "KEY_PAIR_AUTHENTICATOR":

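Note: a minimal sketch of how the new OAUTH_AUTHENTICATOR_TOKEN path might be exercised, assuming the config can be constructed directly with the keyword fields shown (account_id, username, warehouse, role, authentication_type, and token all appear elsewhere in SnowflakeConnectionConfig); all values are placeholders.

from datahub.ingestion.source.snowflake.snowflake_connection import (
    SnowflakeConnectionConfig,
)

# The token must be generated externally (e.g. by an identity provider); per the
# field description above, the connector cannot refresh it once it expires.
config = SnowflakeConnectionConfig(
    account_id="my_account",                          # placeholder
    username="ingest_user",                           # placeholder
    authentication_type="OAUTH_AUTHENTICATOR_TOKEN",
    token="<externally-generated-oauth-token>",       # placeholder
    warehouse="COMPUTE_WH",                           # placeholder
    role="INGEST_ROLE",                               # placeholder
)

# Constructing the config without a token for this authentication type (or with a
# token under a different one) raises the ValueError from validate_token_oauth_config.
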
datahub/ingestion/source/snowflake/snowflake_query.py

@@ -132,7 +132,7 @@ class SnowflakeQuery:
            auto_clustering_on AS "AUTO_CLUSTERING_ON"
        FROM {db_clause}information_schema.tables t
        WHERE table_schema != 'INFORMATION_SCHEMA'
-        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE')
+        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
        order by table_schema, table_name"""
 
     @staticmethod
@@ -152,7 +152,7 @@ class SnowflakeQuery:
            auto_clustering_on AS "AUTO_CLUSTERING_ON"
        FROM {db_clause}information_schema.tables t
        where table_schema='{schema_name}'
-        and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
+        and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
        order by table_schema, table_name"""
 
     @staticmethod

datahub/ingestion/source/sql/mssql/source.py

@@ -5,8 +5,6 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import pydantic
 import sqlalchemy.dialects.mssql
-
-# This import verifies that the dependencies are available.
 from pydantic.fields import Field
 from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine.base import Connection

datahub/ingestion/source/sql/sql_common.py

@@ -582,6 +582,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             generate_operations=False,
         )
         for dataset_name in self._view_definition_cache.keys():
+            # TODO: Ensure that the lineage generated from the view definition
+            # matches the dataset_name.
             view_definition = self._view_definition_cache[dataset_name]
             result = self._run_sql_parser(
                 dataset_name,
@@ -1059,6 +1061,20 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                 exc=e,
             )
 
+    def _get_view_definition(self, inspector: Inspector, schema: str, view: str) -> str:
+        try:
+            view_definition = inspector.get_view_definition(view, schema)
+            if view_definition is None:
+                view_definition = ""
+            else:
+                # Some dialects return a TextClause instead of a raw string,
+                # so we need to convert them to a string.
+                view_definition = str(view_definition)
+        except NotImplementedError:
+            view_definition = ""
+
+        return view_definition
+
     def _process_view(
         self,
         dataset_name: str,
@@ -1077,7 +1093,10 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             columns = inspector.get_columns(view, schema)
         except KeyError:
             # For certain types of views, we are unable to fetch the list of columns.
-            self.warn(logger, dataset_name, "unable to get schema for this view")
+            self.report.warning(
+                message="Unable to get schema for a view",
+                context=f"{dataset_name}",
+            )
             schema_metadata = None
         else:
             schema_fields = self.get_schema_fields(dataset_name, columns, inspector)
@@ -1091,19 +1110,12 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         if self._save_schema_to_resolver():
             self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
             self.discovered_datasets.add(dataset_name)
+
         description, properties, _ = self.get_table_properties(inspector, schema, view)
-        try:
-            view_definition = inspector.get_view_definition(view, schema)
-            if view_definition is None:
-                view_definition = ""
-            else:
-                # Some dialects return a TextClause instead of a raw string,
-                # so we need to convert them to a string.
-                view_definition = str(view_definition)
-        except NotImplementedError:
-            view_definition = ""
-        properties["view_definition"] = view_definition
         properties["is_view"] = "True"
+
+        view_definition = self._get_view_definition(inspector, schema, view)
+        properties["view_definition"] = view_definition
         if view_definition and self.config.include_view_lineage:
             self._view_definition_cache[dataset_name] = view_definition
 
@@ -1135,15 +1147,14 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             entityUrn=dataset_urn,
             aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW]),
         ).as_workunit()
-        if "view_definition" in properties:
-            view_definition_string = properties["view_definition"]
-            view_properties_aspect = ViewPropertiesClass(
-                materialized=False, viewLanguage="SQL", viewLogic=view_definition_string
-            )
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=view_properties_aspect,
-            ).as_workunit()
+
+        view_properties_aspect = ViewPropertiesClass(
+            materialized=False, viewLanguage="SQL", viewLogic=view_definition
+        )
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn,
+            aspect=view_properties_aspect,
+        ).as_workunit()
 
         if self.config.domain and self.domain_registry:
             yield from get_domain_wu(
@@ -1197,6 +1208,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             )
         else:
             self.report.num_view_definitions_parsed += 1
+            if raw_lineage.out_tables != [view_urn]:
+                self.report.num_view_definitions_view_urn_mismatch += 1
         return view_definition_lineage_helper(raw_lineage, view_urn)
 
     def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]:

datahub/ingestion/source/sql/sql_report.py

@@ -48,6 +48,7 @@ class SQLSourceReport(
     query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None
 
     num_view_definitions_parsed: int = 0
+    num_view_definitions_view_urn_mismatch: int = 0
     num_view_definitions_failed_parsing: int = 0
     num_view_definitions_failed_column_parsing: int = 0
     view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)

datahub/ingestion/source/superset.py

@@ -1,10 +1,12 @@
 import json
 import logging
+from datetime import datetime
 from functools import lru_cache
-from typing import Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional
 
 import dateutil.parser as dp
 import requests
+from pydantic import BaseModel
 from pydantic.class_validators import root_validator, validator
 from pydantic.fields import Field
 
@@ -16,7 +18,9 @@ from datahub.configuration.source_common import (
 from datahub.emitter.mce_builder import (
     make_chart_urn,
     make_dashboard_urn,
+    make_data_platform_urn,
     make_dataset_urn,
+    make_dataset_urn_with_platform_instance,
     make_domain_urn,
 )
 from datahub.emitter.mcp_builder import add_domain_to_entity_wu
@@ -31,6 +35,7 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.sql.sql_types import resolve_sql_type
 from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
     get_platform_from_sqlalchemy_uri,
 )
@@ -47,16 +52,26 @@ from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,
     Status,
+    TimeStamp,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
     ChartSnapshot,
     DashboardSnapshot,
+    DatasetSnapshot,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
+from datahub.metadata.com.linkedin.pegasus2avro.schema import (
+    MySqlDDL,
+    NullType,
+    SchemaField,
+    SchemaFieldDataType,
+    SchemaMetadata,
+)
 from datahub.metadata.schema_classes import (
     ChartInfoClass,
     ChartTypeClass,
     DashboardInfoClass,
+    DatasetPropertiesClass,
 )
 from datahub.utilities import config_clean
 from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -82,9 +97,29 @@ chart_type_from_viz_type = {
     "box_plot": ChartTypeClass.BAR,
 }
 
+
 platform_without_databases = ["druid"]
 
 
+class SupersetDataset(BaseModel):
+    id: int
+    table_name: str
+    changed_on_utc: Optional[str] = None
+    explore_url: Optional[str] = ""
+
+    @property
+    def modified_dt(self) -> Optional[datetime]:
+        if self.changed_on_utc:
+            return dp.parse(self.changed_on_utc)
+        return None
+
+    @property
+    def modified_ts(self) -> Optional[int]:
+        if self.modified_dt:
+            return int(self.modified_dt.timestamp() * 1000)
+        return None
+
+
 class SupersetConfig(
     StatefulIngestionConfigBase, EnvConfigMixin, PlatformInstanceConfigMixin
 ):
@@ -103,15 +138,17 @@ class SupersetConfig(
     )
     username: Optional[str] = Field(default=None, description="Superset username.")
     password: Optional[str] = Field(default=None, description="Superset password.")
-    api_key: Optional[str] = Field(default=None, description="Preset.io API key.")
-    api_secret: Optional[str] = Field(default=None, description="Preset.io API secret.")
-    manager_uri: str = Field(
-        default="https://api.app.preset.io", description="Preset.io API URL"
-    )
     # Configuration for stateful ingestion
     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
         default=None, description="Superset Stateful Ingestion Config."
     )
+    ingest_dashboards: bool = Field(
+        default=True, description="Enable to ingest dashboards."
+    )
+    ingest_charts: bool = Field(default=True, description="Enable to ingest charts.")
+    ingest_datasets: bool = Field(
+        default=False, description="Enable to ingest datasets."
+    )
 
     provider: str = Field(default="db", description="Superset provider.")
     options: Dict = Field(default={}, description="")
@@ -123,6 +160,10 @@ class SupersetConfig(
         description="Can be used to change mapping for database names in superset to what you have in datahub",
     )
 
+    class Config:
+        # This is required to allow preset configs to get parsed
+        extra = "allow"
+
     @validator("connect_uri", "display_uri")
     def remove_trailing_slash(cls, v):
         return config_clean.remove_trailing_slashes(v)
@@ -229,6 +270,28 @@ class SupersetSource(StatefulIngestionSourceBase):
         config = SupersetConfig.parse_obj(config_dict)
         return cls(ctx, config)
 
+    def paginate_entity_api_results(self, entity_type, page_size=100):
+        current_page = 0
+        total_items = page_size
+
+        while current_page * page_size < total_items:
+            response = self.session.get(
+                f"{self.config.connect_uri}/api/v1/{entity_type}/",
+                params={"q": f"(page:{current_page},page_size:{page_size})"},
+            )
+
+            if response.status_code != 200:
+                logger.warning(f"Failed to get {entity_type} data: {response.text}")
+
+            payload = response.json()
+            # Update total_items with the actual count from the response
+            total_items = payload.get("count", total_items)
+            # Yield each item in the result, this gets passed into the construct functions
+            for item in payload.get("result", []):
+                yield item
+
+            current_page += 1
+
     @lru_cache(maxsize=None)
     def get_platform_from_database_id(self, database_id):
         database_response = self.session.get(

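Note: a standalone sketch of the pagination pattern that paginate_entity_api_results introduces, kept next to the hunk for reference. BASE_URL and the requests.Session setup are hypothetical; the (page:N,page_size:M) "q" parameter and the count-driven loop mirror the code above.

import requests

BASE_URL = "https://superset.example.com"  # hypothetical Superset instance


def paginate_entity_api_results(session: requests.Session, entity_type: str, page_size: int = 100):
    current_page = 0
    total_items = page_size  # replaced by the real "count" after the first response
    while current_page * page_size < total_items:
        response = session.get(
            f"{BASE_URL}/api/v1/{entity_type}/",
            params={"q": f"(page:{current_page},page_size:{page_size})"},
        )
        response.raise_for_status()
        payload = response.json()
        total_items = payload.get("count", total_items)
        yield from payload.get("result", [])  # each item feeds a construct_* function
        current_page += 1


# Example: iterate every dashboard definition.
# for dashboard in paginate_entity_api_results(requests.Session(), "dashboard"):
#     print(dashboard["id"])
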
datahub/ingestion/source/superset.py (continued)

@@ -250,11 +313,18 @@ class SupersetSource(StatefulIngestionSourceBase):
         return platform_name
 
     @lru_cache(maxsize=None)
-    def get_datasource_urn_from_id(self, datasource_id):
+    def get_dataset_info(self, dataset_id: int) -> dict:
         dataset_response = self.session.get(
-            f"{self.config.connect_uri}/api/v1/dataset/{datasource_id}"
-        ).json()
-
+            f"{self.config.connect_uri}/api/v1/dataset/{dataset_id}",
+        )
+        if dataset_response.status_code != 200:
+            logger.warning(f"Failed to get dataset info: {dataset_response.text}")
+            dataset_response.raise_for_status()
+        return dataset_response.json()
+
+    def get_datasource_urn_from_id(
+        self, dataset_response: dict, platform_instance: str
+    ) -> str:
         schema_name = dataset_response.get("result", {}).get("schema")
         table_name = dataset_response.get("result", {}).get("table_name")
         database_id = dataset_response.get("result", {}).get("database", {}).get("id")
@@ -283,9 +353,11 @@ class SupersetSource(StatefulIngestionSourceBase):
                 ),
                 env=self.config.env,
             )
-        return None
+        raise ValueError("Could not construct dataset URN")
 
-    def construct_dashboard_from_api_data(self, dashboard_data):
+    def construct_dashboard_from_api_data(
+        self, dashboard_data: dict
+    ) -> DashboardSnapshot:
         dashboard_urn = make_dashboard_urn(
             platform=self.platform,
             name=dashboard_data["id"],
@@ -340,7 +412,7 @@ class SupersetSource(StatefulIngestionSourceBase):
         }
 
         if dashboard_data.get("certified_by"):
-            custom_properties["CertifiedBy"] = dashboard_data.get("certified_by")
+            custom_properties["CertifiedBy"] = dashboard_data.get("certified_by", "")
             custom_properties["CertificationDetails"] = str(
                 dashboard_data.get("certification_details")
             )
@@ -358,38 +430,25 @@ class SupersetSource(StatefulIngestionSourceBase):
         return dashboard_snapshot
 
     def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
-        current_dashboard_page = 0
-        # we will set total dashboards to the actual number after we get the response
-        total_dashboards = PAGE_SIZE
-
-        while current_dashboard_page * PAGE_SIZE <= total_dashboards:
-            dashboard_response = self.session.get(
-                f"{self.config.connect_uri}/api/v1/dashboard/",
-                params=f"q=(page:{current_dashboard_page},page_size:{PAGE_SIZE})",
-            )
-            if dashboard_response.status_code != 200:
-                logger.warning(
-                    f"Failed to get dashboard data: {dashboard_response.text}"
-                )
-            dashboard_response.raise_for_status()
-
-            payload = dashboard_response.json()
-            total_dashboards = payload.get("count") or 0
-
-            current_dashboard_page += 1
-
-            for dashboard_data in payload["result"]:
+        for dashboard_data in self.paginate_entity_api_results("dashboard", PAGE_SIZE):
+            try:
                 dashboard_snapshot = self.construct_dashboard_from_api_data(
                     dashboard_data
                 )
-                mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
-                yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
-                yield from self._get_domain_wu(
-                    title=dashboard_data.get("dashboard_title", ""),
-                    entity_urn=dashboard_snapshot.urn,
+            except Exception as e:
+                self.report.warning(
+                    f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
                 )
+                continue
+            # Emit the dashboard
+            mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
+            yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
+            yield from self._get_domain_wu(
+                title=dashboard_data.get("dashboard_title", ""),
+                entity_urn=dashboard_snapshot.urn,
+            )
 
-    def construct_chart_from_chart_data(self, chart_data):
+    def construct_chart_from_chart_data(self, chart_data: dict) -> ChartSnapshot:
         chart_urn = make_chart_urn(
             platform=self.platform,
             name=chart_data["id"],
@@ -415,9 +474,12 @@ class SupersetSource(StatefulIngestionSourceBase):
         chart_url = f"{self.config.display_uri}{chart_data.get('url', '')}"
 
         datasource_id = chart_data.get("datasource_id")
-        datasource_urn = self.get_datasource_urn_from_id(datasource_id)
+        dataset_response = self.get_dataset_info(datasource_id)
+        datasource_urn = self.get_datasource_urn_from_id(
+            dataset_response, self.platform
+        )
 
-        params = json.loads(chart_data.get("params"))
+        params = json.loads(chart_data.get("params", "{}"))
         metrics = [
             get_metric_name(metric)
             for metric in (params.get("metrics", []) or [params.get("metric")])
@@ -467,36 +529,124 @@ class SupersetSource(StatefulIngestionSourceBase):
         return chart_snapshot
 
     def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
-        current_chart_page = 0
-        # we will set total charts to the actual number after we get the response
-        total_charts = PAGE_SIZE
-
-        while current_chart_page * PAGE_SIZE <= total_charts:
-            chart_response = self.session.get(
-                f"{self.config.connect_uri}/api/v1/chart/",
-                params=f"q=(page:{current_chart_page},page_size:{PAGE_SIZE})",
+        for chart_data in self.paginate_entity_api_results("chart", PAGE_SIZE):
+            try:
+                chart_snapshot = self.construct_chart_from_chart_data(chart_data)
+
+                mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
+            except Exception as e:
+                self.report.warning(
+                    f"Failed to construct chart snapshot. Chart name: {chart_data.get('table_name')}. Error: \n{e}"
+                )
+                continue
+            # Emit the chart
+            yield MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
+            yield from self._get_domain_wu(
+                title=chart_data.get("slice_name", ""),
+                entity_urn=chart_snapshot.urn,
             )
-            if chart_response.status_code != 200:
-                logger.warning(f"Failed to get chart data: {chart_response.text}")
-                chart_response.raise_for_status()
 
-            current_chart_page += 1
+    def gen_schema_fields(self, column_data: List[Dict[str, str]]) -> List[SchemaField]:
+        schema_fields: List[SchemaField] = []
+        for col in column_data:
+            col_type = (col.get("type") or "").lower()
+            data_type = resolve_sql_type(col_type)
+            if data_type is None:
+                data_type = NullType()
+
+            field = SchemaField(
+                fieldPath=col.get("column_name", ""),
+                type=SchemaFieldDataType(data_type),
+                nativeDataType="",
+                description=col.get("column_name", ""),
+                nullable=True,
+            )
+            schema_fields.append(field)
+        return schema_fields
+
+    def gen_schema_metadata(
+        self,
+        dataset_response: dict,
+    ) -> SchemaMetadata:
+        dataset_response = dataset_response.get("result", {})
+        column_data = dataset_response.get("columns", [])
+        schema_metadata = SchemaMetadata(
+            schemaName=dataset_response.get("table_name", ""),
+            platform=make_data_platform_urn(self.platform),
+            version=0,
+            hash="",
+            platformSchema=MySqlDDL(tableSchema=""),
+            fields=self.gen_schema_fields(column_data),
+        )
+        return schema_metadata
 
-            payload = chart_response.json()
-            total_charts = payload["count"]
-            for chart_data in payload["result"]:
-                chart_snapshot = self.construct_chart_from_chart_data(chart_data)
+    def gen_dataset_urn(self, datahub_dataset_name: str) -> str:
+        return make_dataset_urn_with_platform_instance(
+            platform=self.platform,
+            name=datahub_dataset_name,
+            platform_instance=self.config.platform_instance,
+            env=self.config.env,
+        )
 
-                mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
-                yield MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
-                yield from self._get_domain_wu(
-                    title=chart_data.get("slice_name", ""),
-                    entity_urn=chart_snapshot.urn,
+    def construct_dataset_from_dataset_data(
+        self, dataset_data: dict
+    ) -> DatasetSnapshot:
+        dataset_response = self.get_dataset_info(dataset_data.get("id"))
+        dataset = SupersetDataset(**dataset_response["result"])
+        datasource_urn = self.get_datasource_urn_from_id(
+            dataset_response, self.platform
+        )
+
+        dataset_url = f"{self.config.display_uri}{dataset.explore_url or ''}"
+
+        dataset_info = DatasetPropertiesClass(
+            name=dataset.table_name,
+            description="",
+            lastModified=TimeStamp(time=dataset.modified_ts)
+            if dataset.modified_ts
+            else None,
+            externalUrl=dataset_url,
+        )
+        aspects_items: List[Any] = []
+        aspects_items.extend(
+            [
+                self.gen_schema_metadata(dataset_response),
+                dataset_info,
+            ]
+        )
+
+        dataset_snapshot = DatasetSnapshot(
+            urn=datasource_urn,
+            aspects=aspects_items,
+        )
+        return dataset_snapshot
+
+    def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
+        for dataset_data in self.paginate_entity_api_results("dataset", PAGE_SIZE):
+            try:
+                dataset_snapshot = self.construct_dataset_from_dataset_data(
+                    dataset_data
                 )
+                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
+            except Exception as e:
+                self.report.warning(
+                    f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
+                )
+                continue
+            # Emit the dataset
+            yield MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
+            yield from self._get_domain_wu(
+                title=dataset_data.get("table_name", ""),
+                entity_urn=dataset_snapshot.urn,
+            )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-        yield from self.emit_dashboard_mces()
-        yield from self.emit_chart_mces()
+        if self.config.ingest_dashboards:
+            yield from self.emit_dashboard_mces()
+        if self.config.ingest_charts:
+            yield from self.emit_chart_mces()
+        if self.config.ingest_datasets:
+            yield from self.emit_dataset_mces()
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [

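Note: a minimal sketch of opting into the new dataset ingestion; connection values are placeholders and other SupersetConfig fields keep their defaults. The three ingest_* flags gate which emit_* generators run in get_workunits_internal, and ingest_datasets defaults to False.

from datahub.ingestion.source.superset import SupersetConfig

config = SupersetConfig.parse_obj(
    {
        "connect_uri": "http://localhost:8088",  # placeholder Superset URL
        "username": "admin",                     # placeholder credentials
        "password": "admin",
        "ingest_dashboards": True,  # default True
        "ingest_charts": True,      # default True
        "ingest_datasets": True,    # default False; opt in explicitly
    }
)
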
datahub/ingestion/source/unity/source.py

@@ -974,6 +974,8 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             )
         else:
             self.report.num_view_definitions_parsed += 1
+            if raw_lineage.out_tables != [view_urn]:
+                self.report.num_view_definitions_view_urn_mismatch += 1
         return view_definition_lineage_helper(raw_lineage, view_urn)
 
     def get_view_lineage(self) -> Iterable[MetadataWorkUnit]:

datahub/sql_parsing/sqlglot_lineage.py

@@ -1243,13 +1243,19 @@ def infer_output_schema(result: SqlParsingResult) -> Optional[List[SchemaFieldClass]]:
 def view_definition_lineage_helper(
     result: SqlParsingResult, view_urn: str
 ) -> SqlParsingResult:
-    if result.query_type is QueryType.SELECT:
+    if result.query_type is QueryType.SELECT or (
+        result.out_tables and result.out_tables != [view_urn]
+    ):
         # Some platforms (e.g. postgres) store only <select statement> from view definition
         # `create view V as <select statement>` . For such view definitions, `result.out_tables` and
         # `result.column_lineage[].downstream` are empty in `sqlglot_lineage` response, whereas upstream
         # details and downstream column details are extracted correctly.
         # Here, we inject view V's urn in `result.out_tables` and `result.column_lineage[].downstream`
        # to get complete lineage result.
+
+        # Some platforms(e.g. mssql) may have slightly different view name in view definition than
+        # actual view name used elsewhere. Therefore we overwrite downstream table for such cases as well.
+
         result.out_tables = [view_urn]
         if result.column_lineage:
             for col_result in result.column_lineage: