acryl-datahub 0.15.0rc12__py3-none-any.whl → 0.15.0rc14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0rc12.dist-info → acryl_datahub-0.15.0rc14.dist-info}/METADATA +2499 -2499
- {acryl_datahub-0.15.0rc12.dist-info → acryl_datahub-0.15.0rc14.dist-info}/RECORD +26 -26
- datahub/__init__.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +13 -1
- datahub/ingestion/source/aws/sagemaker.py +8 -0
- datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
- datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
- datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
- datahub/ingestion/source/gc/dataprocess_cleanup.py +20 -11
- datahub/ingestion/source/powerbi/m_query/data_classes.py +2 -13
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +19 -27
- datahub/ingestion/source/powerbi/m_query/resolver.py +8 -10
- datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
- datahub/ingestion/source/preset.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +4 -3
- datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_query.py +2 -2
- datahub/ingestion/source/sql/mssql/source.py +0 -2
- datahub/ingestion/source/sql/sql_common.py +34 -21
- datahub/ingestion/source/sql/sql_report.py +1 -0
- datahub/ingestion/source/superset.py +215 -65
- datahub/ingestion/source/unity/source.py +2 -0
- datahub/sql_parsing/sqlglot_lineage.py +7 -1
- {acryl_datahub-0.15.0rc12.dist-info → acryl_datahub-0.15.0rc14.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc12.dist-info → acryl_datahub-0.15.0rc14.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0rc12.dist-info → acryl_datahub-0.15.0rc14.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_config.py
@@ -1,7 +1,7 @@
 import logging
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Set
+from typing import Dict, List, Optional, Set
 
 import pydantic
 from pydantic import Field, SecretStr, root_validator, validator
@@ -118,9 +118,10 @@ class SnowflakeFilterConfig(SQLFilterConfig):
         )
 
         # Always exclude reporting metadata for INFORMATION_SCHEMA schema
-        if schema_pattern
+        if schema_pattern:
             logger.debug("Adding deny for INFORMATION_SCHEMA to schema_pattern.")
-
+            assert isinstance(schema_pattern, AllowDenyPattern)
+            schema_pattern.deny.append(r".*INFORMATION_SCHEMA$")
 
         return values
 
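The net effect of the change above is that every schema whose fully qualified name ends in INFORMATION_SCHEMA is denied by default. A minimal sketch of that filtering behavior, assuming DataHub's AllowDenyPattern helper and its allowed() method; the schema names are made up:

from datahub.configuration.common import AllowDenyPattern

# Mirrors the pattern mutated in the validator above: the deny regex is
# matched against the fully qualified schema name.
pattern = AllowDenyPattern(deny=[r".*INFORMATION_SCHEMA$"])

assert pattern.allowed("MY_DB.PUBLIC")
assert not pattern.allowed("MY_DB.INFORMATION_SCHEMA")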
datahub/ingestion/source/snowflake/snowflake_connection.py
@@ -43,6 +43,7 @@ _VALID_AUTH_TYPES: Dict[str, str] = {
     "EXTERNAL_BROWSER_AUTHENTICATOR": EXTERNAL_BROWSER_AUTHENTICATOR,
     "KEY_PAIR_AUTHENTICATOR": KEY_PAIR_AUTHENTICATOR,
     "OAUTH_AUTHENTICATOR": OAUTH_AUTHENTICATOR,
+    "OAUTH_AUTHENTICATOR_TOKEN": OAUTH_AUTHENTICATOR,
 }
 
 _SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com"
@@ -104,6 +105,10 @@ class SnowflakeConnectionConfig(ConfigModel):
         description="Connect args to pass to Snowflake SqlAlchemy driver",
         exclude=True,
     )
+    token: Optional[str] = pydantic.Field(
+        default=None,
+        description="OAuth token from external identity provider. Not recommended for most use cases because it will not be able to refresh once expired.",
+    )
 
     def get_account(self) -> str:
         assert self.account_id
@@ -148,6 +153,18 @@ class SnowflakeConnectionConfig(ConfigModel):
         logger.info(f"using authenticator type '{v}'")
         return v
 
+    @pydantic.validator("token", always=True)
+    def validate_token_oauth_config(cls, v, values):
+        auth_type = values.get("authentication_type")
+        if auth_type == "OAUTH_AUTHENTICATOR_TOKEN":
+            if not v:
+                raise ValueError("Token required for OAUTH_AUTHENTICATOR_TOKEN.")
+        elif v is not None:
+            raise ValueError(
+                "Token can only be provided when using OAUTH_AUTHENTICATOR_TOKEN"
+            )
+        return v
+
     @staticmethod
     def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None:
         if oauth_config is None:
@@ -333,6 +350,17 @@ class SnowflakeConnectionConfig(ConfigModel):
                 application=_APPLICATION_NAME,
                 **connect_args,
             )
+        elif self.authentication_type == "OAUTH_AUTHENTICATOR_TOKEN":
+            return snowflake.connector.connect(
+                user=self.username,
+                account=self.account_id,
+                authenticator="oauth",
+                token=self.token,  # Token generated externally and provided directly to the recipe
+                warehouse=self.warehouse,
+                role=self.role,
+                application=_APPLICATION_NAME,
+                **connect_args,
+            )
         elif self.authentication_type == "OAUTH_AUTHENTICATOR":
             return self.get_oauth_connection()
         elif self.authentication_type == "KEY_PAIR_AUTHENTICATOR":
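For orientation, a minimal sketch of what the new OAUTH_AUTHENTICATOR_TOKEN branch does at connect time. All identifier values below are placeholders; the token is assumed to be minted by an external identity provider, and since the framework never refreshes it, it must remain valid for the whole ingestion run:

import snowflake.connector

# Same call shape as the new branch above: the externally generated OAuth
# token is passed straight through with authenticator="oauth".
conn = snowflake.connector.connect(
    user="INGEST_USER",                            # placeholder
    account="my_org-my_account",                   # placeholder
    authenticator="oauth",
    token="<externally-generated-access-token>",   # placeholder
    warehouse="COMPUTE_WH",                        # placeholder
    role="DATAHUB_ROLE",                           # placeholder
)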
datahub/ingestion/source/snowflake/snowflake_query.py
@@ -132,7 +132,7 @@ class SnowflakeQuery:
             auto_clustering_on AS "AUTO_CLUSTERING_ON"
         FROM {db_clause}information_schema.tables t
         WHERE table_schema != 'INFORMATION_SCHEMA'
-        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE')
+        and table_type in ( 'BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
         order by table_schema, table_name"""
 
     @staticmethod
@@ -152,7 +152,7 @@ class SnowflakeQuery:
             auto_clustering_on AS "AUTO_CLUSTERING_ON"
         FROM {db_clause}information_schema.tables t
         where table_schema='{schema_name}'
-        and table_type in ('BASE TABLE', 'EXTERNAL TABLE')
+        and table_type in ('BASE TABLE', 'EXTERNAL TABLE', 'HYBRID TABLE')
         order by table_schema, table_name"""
 
     @staticmethod
datahub/ingestion/source/sql/mssql/source.py
@@ -5,8 +5,6 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import pydantic
 import sqlalchemy.dialects.mssql
-
-# This import verifies that the dependencies are available.
 from pydantic.fields import Field
 from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine.base import Connection
datahub/ingestion/source/sql/sql_common.py
@@ -582,6 +582,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             generate_operations=False,
         )
         for dataset_name in self._view_definition_cache.keys():
+            # TODO: Ensure that the lineage generated from the view definition
+            # matches the dataset_name.
             view_definition = self._view_definition_cache[dataset_name]
             result = self._run_sql_parser(
                 dataset_name,
@@ -1059,6 +1061,20 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                 exc=e,
             )
 
+    def _get_view_definition(self, inspector: Inspector, schema: str, view: str) -> str:
+        try:
+            view_definition = inspector.get_view_definition(view, schema)
+            if view_definition is None:
+                view_definition = ""
+            else:
+                # Some dialects return a TextClause instead of a raw string,
+                # so we need to convert them to a string.
+                view_definition = str(view_definition)
+        except NotImplementedError:
+            view_definition = ""
+
+        return view_definition
+
     def _process_view(
         self,
         dataset_name: str,
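The TextClause comment in the extracted _get_view_definition refers to SQLAlchemy's wrapper around raw SQL text. A quick sketch of why str() normalizes both return shapes; the SQL string is made up:

from sqlalchemy import text

# Some dialects' get_view_definition() returns a TextClause rather than str;
# str() on a TextClause yields the underlying SQL, so both cases normalize.
clause = text("SELECT a, b FROM my_table")
assert str(clause) == "SELECT a, b FROM my_table"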
@@ -1077,7 +1093,10 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             columns = inspector.get_columns(view, schema)
         except KeyError:
             # For certain types of views, we are unable to fetch the list of columns.
-            self.
+            self.report.warning(
+                message="Unable to get schema for a view",
+                context=f"{dataset_name}",
+            )
             schema_metadata = None
         else:
             schema_fields = self.get_schema_fields(dataset_name, columns, inspector)
@@ -1091,19 +1110,12 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         if self._save_schema_to_resolver():
             self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
             self.discovered_datasets.add(dataset_name)
+
         description, properties, _ = self.get_table_properties(inspector, schema, view)
-        try:
-            view_definition = inspector.get_view_definition(view, schema)
-            if view_definition is None:
-                view_definition = ""
-            else:
-                # Some dialects return a TextClause instead of a raw string,
-                # so we need to convert them to a string.
-                view_definition = str(view_definition)
-        except NotImplementedError:
-            view_definition = ""
-        properties["view_definition"] = view_definition
         properties["is_view"] = "True"
+
+        view_definition = self._get_view_definition(inspector, schema, view)
+        properties["view_definition"] = view_definition
         if view_definition and self.config.include_view_lineage:
             self._view_definition_cache[dataset_name] = view_definition
 
@@ -1135,15 +1147,14 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             entityUrn=dataset_urn,
             aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW]),
         ).as_workunit()
-
-
-
-
-
-
-
-
-        ).as_workunit()
+
+        view_properties_aspect = ViewPropertiesClass(
+            materialized=False, viewLanguage="SQL", viewLogic=view_definition
+        )
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn,
+            aspect=view_properties_aspect,
+        ).as_workunit()
 
         if self.config.domain and self.domain_registry:
             yield from get_domain_wu(
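The consolidated emission above boils down to one MetadataChangeProposal per view. A standalone sketch with a hypothetical view URN; the viewLogic value is a stand-in for the real view definition:

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import ViewPropertiesClass

# Hypothetical URN for illustration; viewLogic carries the raw SQL definition.
view_urn = "urn:li:dataset:(urn:li:dataPlatform:postgres,mydb.public.my_view,PROD)"
mcp = MetadataChangeProposalWrapper(
    entityUrn=view_urn,
    aspect=ViewPropertiesClass(
        materialized=False, viewLanguage="SQL", viewLogic="SELECT 1"
    ),
)
workunit = mcp.as_workunit()  # same call chain as in the hunk above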
@@ -1197,6 +1208,8 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             )
         else:
             self.report.num_view_definitions_parsed += 1
+            if raw_lineage.out_tables != [view_urn]:
+                self.report.num_view_definitions_view_urn_mismatch += 1
         return view_definition_lineage_helper(raw_lineage, view_urn)
 
     def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]:
datahub/ingestion/source/sql/sql_report.py
@@ -48,6 +48,7 @@ class SQLSourceReport(
     query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None
 
     num_view_definitions_parsed: int = 0
+    num_view_definitions_view_urn_mismatch: int = 0
     num_view_definitions_failed_parsing: int = 0
     num_view_definitions_failed_column_parsing: int = 0
     view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)
datahub/ingestion/source/superset.py
@@ -1,10 +1,12 @@
 import json
 import logging
+from datetime import datetime
 from functools import lru_cache
-from typing import Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional
 
 import dateutil.parser as dp
 import requests
+from pydantic import BaseModel
 from pydantic.class_validators import root_validator, validator
 from pydantic.fields import Field
 
@@ -16,7 +18,9 @@ from datahub.configuration.source_common import (
 from datahub.emitter.mce_builder import (
     make_chart_urn,
     make_dashboard_urn,
+    make_data_platform_urn,
     make_dataset_urn,
+    make_dataset_urn_with_platform_instance,
     make_domain_urn,
 )
 from datahub.emitter.mcp_builder import add_domain_to_entity_wu
@@ -31,6 +35,7 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.sql.sql_types import resolve_sql_type
 from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
     get_platform_from_sqlalchemy_uri,
 )
@@ -47,16 +52,26 @@ from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,
     Status,
+    TimeStamp,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
     ChartSnapshot,
     DashboardSnapshot,
+    DatasetSnapshot,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
+from datahub.metadata.com.linkedin.pegasus2avro.schema import (
+    MySqlDDL,
+    NullType,
+    SchemaField,
+    SchemaFieldDataType,
+    SchemaMetadata,
+)
 from datahub.metadata.schema_classes import (
     ChartInfoClass,
     ChartTypeClass,
     DashboardInfoClass,
+    DatasetPropertiesClass,
 )
 from datahub.utilities import config_clean
 from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -82,9 +97,29 @@ chart_type_from_viz_type = {
     "box_plot": ChartTypeClass.BAR,
 }
 
+
 platform_without_databases = ["druid"]
 
 
+class SupersetDataset(BaseModel):
+    id: int
+    table_name: str
+    changed_on_utc: Optional[str] = None
+    explore_url: Optional[str] = ""
+
+    @property
+    def modified_dt(self) -> Optional[datetime]:
+        if self.changed_on_utc:
+            return dp.parse(self.changed_on_utc)
+        return None
+
+    @property
+    def modified_ts(self) -> Optional[int]:
+        if self.modified_dt:
+            return int(self.modified_dt.timestamp() * 1000)
+        return None
+
+
 class SupersetConfig(
     StatefulIngestionConfigBase, EnvConfigMixin, PlatformInstanceConfigMixin
 ):
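A quick usage sketch of the new SupersetDataset model: modified_ts converts the API's changed_on_utc string into epoch milliseconds for the lastModified aspect. The values below are made up, and the import path assumes the class lives in the module this diff modifies:

from datahub.ingestion.source.superset import SupersetDataset

ds = SupersetDataset(
    id=42,  # placeholder values throughout
    table_name="orders",
    changed_on_utc="2024-01-01T00:00:00+00:00",
)
# dateutil parses the timestamp; modified_ts is epoch milliseconds.
assert ds.modified_ts == 1704067200000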
@@ -103,15 +138,17 @@ class SupersetConfig(
     )
     username: Optional[str] = Field(default=None, description="Superset username.")
     password: Optional[str] = Field(default=None, description="Superset password.")
-    api_key: Optional[str] = Field(default=None, description="Preset.io API key.")
-    api_secret: Optional[str] = Field(default=None, description="Preset.io API secret.")
-    manager_uri: str = Field(
-        default="https://api.app.preset.io", description="Preset.io API URL"
-    )
     # Configuration for stateful ingestion
     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
         default=None, description="Superset Stateful Ingestion Config."
     )
+    ingest_dashboards: bool = Field(
+        default=True, description="Enable to ingest dashboards."
+    )
+    ingest_charts: bool = Field(default=True, description="Enable to ingest charts.")
+    ingest_datasets: bool = Field(
+        default=False, description="Enable to ingest datasets."
+    )
 
     provider: str = Field(default="db", description="Superset provider.")
     options: Dict = Field(default={}, description="")
@@ -123,6 +160,10 @@ class SupersetConfig(
         description="Can be used to change mapping for database names in superset to what you have in datahub",
     )
 
+    class Config:
+        # This is required to allow preset configs to get parsed
+        extra = "allow"
+
     @validator("connect_uri", "display_uri")
     def remove_trailing_slash(cls, v):
         return config_clean.remove_trailing_slashes(v)
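Taken together, the new fields let a recipe toggle each entity type independently, while `extra = "allow"` is what lets Preset-specific keys (now removed from this class) pass through parsing. A minimal sketch; the connect_uri value is a placeholder:

from datahub.ingestion.source.superset import SupersetConfig

config = SupersetConfig.parse_obj(
    {
        "connect_uri": "http://localhost:8088",  # placeholder
        "ingest_dashboards": True,
        "ingest_charts": True,
        "ingest_datasets": True,  # opt-in: defaults to False
    }
)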
@@ -229,6 +270,28 @@ class SupersetSource(StatefulIngestionSourceBase):
         config = SupersetConfig.parse_obj(config_dict)
         return cls(ctx, config)
 
+    def paginate_entity_api_results(self, entity_type, page_size=100):
+        current_page = 0
+        total_items = page_size
+
+        while current_page * page_size < total_items:
+            response = self.session.get(
+                f"{self.config.connect_uri}/api/v1/{entity_type}/",
+                params={"q": f"(page:{current_page},page_size:{page_size})"},
+            )
+
+            if response.status_code != 200:
+                logger.warning(f"Failed to get {entity_type} data: {response.text}")
+
+            payload = response.json()
+            # Update total_items with the actual count from the response
+            total_items = payload.get("count", total_items)
+            # Yield each item in the result, this gets passed into the construct functions
+            for item in payload.get("result", []):
+                yield item
+
+            current_page += 1
+
     @lru_cache(maxsize=None)
     def get_platform_from_database_id(self, database_id):
         database_response = self.session.get(
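The generator above drives all three emit_* methods: Superset's REST API takes a Rison-encoded `q=(page:N,page_size:M)` parameter and reports the total row count in the `count` field of each response. A standalone sketch of the same loop against a plain requests.Session, with placeholder URL and entity type:

import requests

def paginate(session: requests.Session, base_url: str, entity_type: str,
             page_size: int = 100):
    page, total = 0, page_size  # seed total so the first request always fires
    while page * page_size < total:
        resp = session.get(
            f"{base_url}/api/v1/{entity_type}/",
            params={"q": f"(page:{page},page_size:{page_size})"},
        )
        payload = resp.json()
        total = payload.get("count", total)  # actual count from the response
        yield from payload.get("result", [])
        page += 1

# e.g. for dashboard in paginate(session, "http://localhost:8088", "dashboard"): ...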
@@ -250,11 +313,18 @@ class SupersetSource(StatefulIngestionSourceBase):
         return platform_name
 
     @lru_cache(maxsize=None)
-    def
+    def get_dataset_info(self, dataset_id: int) -> dict:
         dataset_response = self.session.get(
-            f"{self.config.connect_uri}/api/v1/dataset/{
-        )
-
+            f"{self.config.connect_uri}/api/v1/dataset/{dataset_id}",
+        )
+        if dataset_response.status_code != 200:
+            logger.warning(f"Failed to get dataset info: {dataset_response.text}")
+            dataset_response.raise_for_status()
+        return dataset_response.json()
+
+    def get_datasource_urn_from_id(
+        self, dataset_response: dict, platform_instance: str
+    ) -> str:
         schema_name = dataset_response.get("result", {}).get("schema")
         table_name = dataset_response.get("result", {}).get("table_name")
         database_id = dataset_response.get("result", {}).get("database", {}).get("id")
@@ -283,9 +353,11 @@ class SupersetSource(StatefulIngestionSourceBase):
             ),
             env=self.config.env,
         )
-
+        raise ValueError("Could not construct dataset URN")
 
-    def construct_dashboard_from_api_data(
+    def construct_dashboard_from_api_data(
+        self, dashboard_data: dict
+    ) -> DashboardSnapshot:
         dashboard_urn = make_dashboard_urn(
             platform=self.platform,
             name=dashboard_data["id"],
@@ -340,7 +412,7 @@ class SupersetSource(StatefulIngestionSourceBase):
         }
 
         if dashboard_data.get("certified_by"):
-            custom_properties["CertifiedBy"] = dashboard_data.get("certified_by")
+            custom_properties["CertifiedBy"] = dashboard_data.get("certified_by", "")
             custom_properties["CertificationDetails"] = str(
                 dashboard_data.get("certification_details")
             )
@@ -358,38 +430,25 @@ class SupersetSource(StatefulIngestionSourceBase):
         return dashboard_snapshot
 
     def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
-
-
-        total_dashboards = PAGE_SIZE
-
-        while current_dashboard_page * PAGE_SIZE <= total_dashboards:
-            dashboard_response = self.session.get(
-                f"{self.config.connect_uri}/api/v1/dashboard/",
-                params=f"q=(page:{current_dashboard_page},page_size:{PAGE_SIZE})",
-            )
-            if dashboard_response.status_code != 200:
-                logger.warning(
-                    f"Failed to get dashboard data: {dashboard_response.text}"
-                )
-                dashboard_response.raise_for_status()
-
-            payload = dashboard_response.json()
-            total_dashboards = payload.get("count") or 0
-
-            current_dashboard_page += 1
-
-            for dashboard_data in payload["result"]:
+        for dashboard_data in self.paginate_entity_api_results("dashboard", PAGE_SIZE):
+            try:
                 dashboard_snapshot = self.construct_dashboard_from_api_data(
                     dashboard_data
                 )
-
-
-
-                title=dashboard_data.get("dashboard_title", ""),
-                entity_urn=dashboard_snapshot.urn,
+            except Exception as e:
+                self.report.warning(
+                    f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
                 )
+                continue
+            # Emit the dashboard
+            mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
+            yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
+            yield from self._get_domain_wu(
+                title=dashboard_data.get("dashboard_title", ""),
+                entity_urn=dashboard_snapshot.urn,
+            )
 
-    def construct_chart_from_chart_data(self, chart_data):
+    def construct_chart_from_chart_data(self, chart_data: dict) -> ChartSnapshot:
         chart_urn = make_chart_urn(
             platform=self.platform,
             name=chart_data["id"],
@@ -415,9 +474,12 @@ class SupersetSource(StatefulIngestionSourceBase):
         chart_url = f"{self.config.display_uri}{chart_data.get('url', '')}"
 
         datasource_id = chart_data.get("datasource_id")
-
+        dataset_response = self.get_dataset_info(datasource_id)
+        datasource_urn = self.get_datasource_urn_from_id(
+            dataset_response, self.platform
+        )
 
-        params = json.loads(chart_data.get("params"))
+        params = json.loads(chart_data.get("params", "{}"))
         metrics = [
             get_metric_name(metric)
             for metric in (params.get("metrics", []) or [params.get("metric")])
@@ -467,36 +529,124 @@ class SupersetSource(StatefulIngestionSourceBase):
         return chart_snapshot
 
     def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
-
-
-
-
-
-
-
-
+        for chart_data in self.paginate_entity_api_results("chart", PAGE_SIZE):
+            try:
+                chart_snapshot = self.construct_chart_from_chart_data(chart_data)
+
+                mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
+            except Exception as e:
+                self.report.warning(
+                    f"Failed to construct chart snapshot. Chart name: {chart_data.get('table_name')}. Error: \n{e}"
+                )
+                continue
+            # Emit the chart
+            yield MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
+            yield from self._get_domain_wu(
+                title=chart_data.get("slice_name", ""),
+                entity_urn=chart_snapshot.urn,
             )
-            if chart_response.status_code != 200:
-                logger.warning(f"Failed to get chart data: {chart_response.text}")
-                chart_response.raise_for_status()
 
-
+    def gen_schema_fields(self, column_data: List[Dict[str, str]]) -> List[SchemaField]:
+        schema_fields: List[SchemaField] = []
+        for col in column_data:
+            col_type = (col.get("type") or "").lower()
+            data_type = resolve_sql_type(col_type)
+            if data_type is None:
+                data_type = NullType()
+
+            field = SchemaField(
+                fieldPath=col.get("column_name", ""),
+                type=SchemaFieldDataType(data_type),
+                nativeDataType="",
+                description=col.get("column_name", ""),
+                nullable=True,
+            )
+            schema_fields.append(field)
+        return schema_fields
+
+    def gen_schema_metadata(
+        self,
+        dataset_response: dict,
+    ) -> SchemaMetadata:
+        dataset_response = dataset_response.get("result", {})
+        column_data = dataset_response.get("columns", [])
+        schema_metadata = SchemaMetadata(
+            schemaName=dataset_response.get("table_name", ""),
+            platform=make_data_platform_urn(self.platform),
+            version=0,
+            hash="",
+            platformSchema=MySqlDDL(tableSchema=""),
+            fields=self.gen_schema_fields(column_data),
+        )
+        return schema_metadata
 
-
-
-
-
+    def gen_dataset_urn(self, datahub_dataset_name: str) -> str:
+        return make_dataset_urn_with_platform_instance(
+            platform=self.platform,
+            name=datahub_dataset_name,
+            platform_instance=self.config.platform_instance,
+            env=self.config.env,
+        )
 
-
-
-
-
-
+    def construct_dataset_from_dataset_data(
+        self, dataset_data: dict
+    ) -> DatasetSnapshot:
+        dataset_response = self.get_dataset_info(dataset_data.get("id"))
+        dataset = SupersetDataset(**dataset_response["result"])
+        datasource_urn = self.get_datasource_urn_from_id(
+            dataset_response, self.platform
+        )
+
+        dataset_url = f"{self.config.display_uri}{dataset.explore_url or ''}"
+
+        dataset_info = DatasetPropertiesClass(
+            name=dataset.table_name,
+            description="",
+            lastModified=TimeStamp(time=dataset.modified_ts)
+            if dataset.modified_ts
+            else None,
+            externalUrl=dataset_url,
+        )
+        aspects_items: List[Any] = []
+        aspects_items.extend(
+            [
+                self.gen_schema_metadata(dataset_response),
+                dataset_info,
+            ]
+        )
+
+        dataset_snapshot = DatasetSnapshot(
+            urn=datasource_urn,
+            aspects=aspects_items,
+        )
+        return dataset_snapshot
+
+    def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
+        for dataset_data in self.paginate_entity_api_results("dataset", PAGE_SIZE):
+            try:
+                dataset_snapshot = self.construct_dataset_from_dataset_data(
+                    dataset_data
                 )
+                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
+            except Exception as e:
+                self.report.warning(
+                    f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
+                )
+                continue
+            # Emit the dataset
+            yield MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
+            yield from self._get_domain_wu(
+                title=dataset_data.get("table_name", ""),
+                entity_urn=dataset_snapshot.urn,
+            )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
-
-
+        if self.config.ingest_dashboards:
+            yield from self.emit_dashboard_mces()
+        if self.config.ingest_charts:
+            yield from self.emit_chart_mces()
+        if self.config.ingest_datasets:
+            yield from self.emit_dataset_mces()
 
     def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
         return [
datahub/ingestion/source/unity/source.py
@@ -974,6 +974,8 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             )
         else:
             self.report.num_view_definitions_parsed += 1
+            if raw_lineage.out_tables != [view_urn]:
+                self.report.num_view_definitions_view_urn_mismatch += 1
         return view_definition_lineage_helper(raw_lineage, view_urn)
 
     def get_view_lineage(self) -> Iterable[MetadataWorkUnit]:
datahub/sql_parsing/sqlglot_lineage.py
@@ -1243,13 +1243,19 @@ def infer_output_schema(result: SqlParsingResult) -> Optional[List[SchemaFieldCl
 def view_definition_lineage_helper(
     result: SqlParsingResult, view_urn: str
 ) -> SqlParsingResult:
-    if result.query_type is QueryType.SELECT
+    if result.query_type is QueryType.SELECT or (
+        result.out_tables and result.out_tables != [view_urn]
+    ):
         # Some platforms (e.g. postgres) store only <select statement> from view definition
         # `create view V as <select statement>` . For such view definitions, `result.out_tables` and
         # `result.column_lineage[].downstream` are empty in `sqlglot_lineage` response, whereas upstream
         # details and downstream column details are extracted correctly.
         # Here, we inject view V's urn in `result.out_tables` and `result.column_lineage[].downstream`
        # to get complete lineage result.
+
+        # Some platforms(e.g. mssql) may have slightly different view name in view definition than
+        # actual view name used elsewhere. Therefore we overwrite downstream table for such cases as well.
+
         result.out_tables = [view_urn]
         if result.column_lineage:
             for col_result in result.column_lineage:
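A small sketch of the rewrite that view_definition_lineage_helper now performs in the mssql-style mismatch case; the URNs are illustrative:

# Parsed output table differs from the registered view URN only by casing,
# so the helper overwrites it; upstreams and column lineage stay as parsed.
parsed_out_tables = ["urn:li:dataset:(urn:li:dataPlatform:mssql,db.dbo.MyView,PROD)"]
view_urn = "urn:li:dataset:(urn:li:dataPlatform:mssql,db.dbo.myview,PROD)"

if parsed_out_tables and parsed_out_tables != [view_urn]:
    parsed_out_tables = [view_urn]

assert parsed_out_tables == [view_urn]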
File without changes: {acryl_datahub-0.15.0rc12.dist-info → acryl_datahub-0.15.0rc14.dist-info}/WHEEL
File without changes: {acryl_datahub-0.15.0rc12.dist-info → acryl_datahub-0.15.0rc14.dist-info}/entry_points.txt
File without changes: {acryl_datahub-0.15.0rc12.dist-info → acryl_datahub-0.15.0rc14.dist-info}/top_level.txt