acryl-datahub 1.0.0rc7__py3-none-any.whl → 1.0.0rc9__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/METADATA +2487 -2487
- {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/RECORD +88 -84
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/git.py +1 -3
- datahub/ingestion/glossary/classification_mixin.py +1 -1
- datahub/ingestion/graph/client.py +16 -12
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +1 -1
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/dbt/dbt_common.py +1 -1
- datahub/ingestion/source/file.py +5 -2
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/ge_data_profiler.py +11 -14
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +31 -20
- datahub/ingestion/source/identity/okta.py +1 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +4 -7
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +2 -1
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +6 -3
- datahub/ingestion/source/openapi_parser.py +2 -2
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +2 -1
- datahub/ingestion/source/s3/config.py +2 -4
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +1 -1
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_connection.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +2 -2
- datahub/ingestion/source/sql/sql_common.py +2 -2
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +4 -2
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/superset.py +218 -56
- datahub/ingestion/source/tableau/tableau.py +1 -5
- datahub/lite/duckdb_lite.py +3 -9
- datahub/metadata/_schema_classes.py +157 -14
- datahub/metadata/_urns/urn_defs.py +58 -58
- datahub/metadata/schema.avsc +23 -10
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_shared.py +88 -3
- datahub/sdk/container.py +7 -1
- datahub/sdk/dataset.py +10 -4
- datahub/sdk/{_entity.py → entity.py} +4 -0
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +7 -1
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc7.dist-info → acryl_datahub-1.0.0rc9.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/sql_common.py
CHANGED

@@ -204,7 +204,7 @@ def get_column_type(
     """

     TypeClass: Optional[Type] = None
-    for sql_type in _field_type_mapping
+    for sql_type in _field_type_mapping:
         if isinstance(column_type, sql_type):
             TypeClass = _field_type_mapping[sql_type]
             break

@@ -973,7 +973,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                     inspector=inspector,
                 )
             ),
-            description=column.get("comment"
+            description=column.get("comment"),
             nullable=column["nullable"],
             recursive=False,
             globalTags=gtc,
datahub/ingestion/source/sql/sql_types.py
CHANGED

@@ -317,10 +317,10 @@ def resolve_snowflake_modified_type(type_string: str) -> Any:
     match = re.match(r"([a-zA-Z_]+)\(\d+,\s\d+\)", type_string)
     if match:
         modified_type_base = match.group(1)  # Extract the base type
-        return SNOWFLAKE_TYPES_MAP.get(modified_type_base
+        return SNOWFLAKE_TYPES_MAP.get(modified_type_base)

     # Fallback for types without precision/scale
-    return SNOWFLAKE_TYPES_MAP.get(type_string
+    return SNOWFLAKE_TYPES_MAP.get(type_string)


 # see https://github.com/googleapis/python-bigquery-sqlalchemy/blob/main/sqlalchemy_bigquery/_types.py#L32
datahub/ingestion/source/sql/teradata.py
CHANGED

@@ -180,10 +180,11 @@ def optimized_get_columns(
     connection: Connection,
     table_name: str,
     schema: Optional[str] = None,
-    tables_cache: MutableMapping[str, List[TeradataTable]] =
+    tables_cache: Optional[MutableMapping[str, List[TeradataTable]]] = None,
     use_qvci: bool = False,
     **kw: Dict[str, Any],
 ) -> List[Dict]:
+    tables_cache = tables_cache or {}
     if schema is None:
         schema = self.default_schema_name


@@ -314,9 +315,10 @@ def optimized_get_view_definition(
     connection: Connection,
     view_name: str,
     schema: Optional[str] = None,
-    tables_cache: MutableMapping[str, List[TeradataTable]] =
+    tables_cache: Optional[MutableMapping[str, List[TeradataTable]]] = None,
     **kw: Dict[str, Any],
 ) -> Optional[str]:
+    tables_cache = tables_cache or {}
     if schema is None:
         schema = self.default_schema_name

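Both signatures previously took a mutable mapping as a default argument; the new form defaults to None and normalizes inside the function body. Python evaluates default values once, at definition time, so a mutable default is silently shared across calls. A minimal sketch of the pitfall and the fix, with hypothetical names:

from typing import Optional

def count_bad(key: str, cache: dict = {}) -> int:
    # the same dict object is reused by every call
    cache[key] = cache.get(key, 0) + 1
    return cache[key]

def count_good(key: str, cache: Optional[dict] = None) -> int:
    cache = cache or {}  # fresh dict per call unless one is passed in
    cache[key] = cache.get(key, 0) + 1
    return cache[key]

assert count_bad("x") == 1
assert count_bad("x") == 2   # state leaked between calls
assert count_good("x") == 1
assert count_good("x") == 1  # no shared state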
datahub/ingestion/source/sql/trino.py
CHANGED

@@ -142,7 +142,7 @@ def get_table_comment(self, connection, table_name: str, schema: str = None, **k
                 if col_value is not None:
                     properties[col_name] = col_value

-                return {"text": properties.get("comment"
+                return {"text": properties.get("comment"), "properties": properties}
             else:
                 return self.get_table_comment_default(connection, table_name, schema)
         except Exception:
datahub/ingestion/source/sql/athena.py
CHANGED

@@ -483,7 +483,7 @@ def _parse_struct_fields(parts):


 def _parse_basic_datatype(s):
-    for sql_type in _all_atomic_types
+    for sql_type in _all_atomic_types:
         if isinstance(s, sql_type):
             return {
                 "type": _all_atomic_types[sql_type],
datahub/ingestion/source/superset.py
CHANGED

@@ -1,5 +1,6 @@
 import json
 import logging
+from dataclasses import dataclass, field
 from datetime import datetime
 from functools import lru_cache
 from typing import Any, Dict, Iterable, List, Optional

@@ -22,6 +23,7 @@ from datahub.emitter.mce_builder import (
     make_dataset_urn,
     make_dataset_urn_with_platform_instance,
     make_domain_urn,
+    make_user_urn,
 )
 from datahub.emitter.mcp_builder import add_domain_to_entity_wu
 from datahub.ingestion.api.common import PipelineContext

@@ -36,9 +38,6 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.sql_types import resolve_sql_type
-from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
-    get_platform_from_sqlalchemy_uri,
-)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,

@@ -49,7 +48,6 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
-    AuditStamp,
     ChangeAuditStamps,
     Status,
     TimeStamp,

@@ -68,12 +66,22 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
+    AuditStampClass,
     ChartInfoClass,
     ChartTypeClass,
     DashboardInfoClass,
+    DatasetLineageTypeClass,
     DatasetPropertiesClass,
+    GlobalTagsClass,
+    OwnerClass,
+    OwnershipClass,
+    OwnershipTypeClass,
+    TagAssociationClass,
+    UpstreamClass,
+    UpstreamLineageClass,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry

 logger = logging.getLogger(__name__)

@@ -101,6 +109,14 @@ chart_type_from_viz_type = {
 platform_without_databases = ["druid"]


+@dataclass
+class SupersetSourceReport(StaleEntityRemovalSourceReport):
+    filtered: LossyList[str] = field(default_factory=LossyList)
+
+    def report_dropped(self, name: str) -> None:
+        self.filtered.append(name)
+
+
 class SupersetDataset(BaseModel):
     id: int
     table_name: str
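The new SupersetSourceReport gives the source a place to record entities skipped by the filter patterns introduced below. LossyList (from datahub.utilities.lossy_collections) appears to be a size-bounded list that keeps only a sample of its entries, so the ingestion report stays small even when many names are filtered. A rough usage sketch under that assumption:

# Rough sketch; assumes LossyList behaves as a size-bounded list.
report = SupersetSourceReport()
report.report_dropped("Dashboard 'scratch' (id: 42) filtered by dashboard_pattern")
report.report_dropped("Chart 'tmp_plot' (id: 7) filtered by chart_pattern")
print(report.filtered)  # a (possibly truncated) sample of everything skipped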
@@ -136,6 +152,18 @@ class SupersetConfig(
         default=dict(),
         description="regex patterns for tables to filter to assign domain_key. ",
     )
+    dataset_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns for dataset to filter in ingestion.",
+    )
+    chart_pattern: AllowDenyPattern = Field(
+        AllowDenyPattern.allow_all(),
+        description="Patterns for selecting chart names that are to be included",
+    )
+    dashboard_pattern: AllowDenyPattern = Field(
+        AllowDenyPattern.allow_all(),
+        description="Patterns for selecting dashboard names that are to be included",
+    )
     username: Optional[str] = Field(default=None, description="Superset username.")
     password: Optional[str] = Field(default=None, description="Superset password.")
     # Configuration for stateful ingestion
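These three AllowDenyPattern fields gate ingestion by entity name. AllowDenyPattern comes from datahub.configuration.common: it holds regex allow and deny lists, and allowed(name) passes a name that matches some allow pattern and no deny pattern. A minimal sketch:

from datahub.configuration.common import AllowDenyPattern

# Allow everything except names starting with "tmp_".
pattern = AllowDenyPattern(allow=[".*"], deny=[r"tmp_.*"])

assert pattern.allowed("Sales Overview")
assert not pattern.allowed("tmp_scratchpad")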
@@ -216,7 +244,7 @@ class SupersetSource(StatefulIngestionSourceBase):
     """

     config: SupersetConfig
-    report:
+    report: SupersetSourceReport
     platform = "superset"

     def __hash__(self):

@@ -225,13 +253,14 @@ class SupersetSource(StatefulIngestionSourceBase):
     def __init__(self, ctx: PipelineContext, config: SupersetConfig):
         super().__init__(config, ctx)
         self.config = config
-        self.report =
+        self.report = SupersetSourceReport()
         if self.config.domain:
             self.domain_registry = DomainRegistry(
                 cached_domains=[domain_id for domain_id in self.config.domain],
                 graph=self.ctx.graph,
             )
         self.session = self.login()
+        self.owner_info = self.parse_owner_info()

     def login(self) -> requests.Session:
         login_response = requests.post(

@@ -271,7 +300,7 @@ class SupersetSource(StatefulIngestionSourceBase):

         while current_page * page_size < total_items:
             response = self.session.get(
-                f"{self.config.connect_uri}/api/v1/{entity_type}
+                f"{self.config.connect_uri}/api/v1/{entity_type}",
                 params={"q": f"(page:{current_page},page_size:{page_size})"},
             )

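For reference, each page request produced by this loop looks like GET {connect_uri}/api/v1/dashboard/?q=(page:0,page_size:25); Superset's REST API takes its paging arguments in a Rison-encoded q parameter. A standalone sketch of the same loop shape (the function name is invented; the count/result keys follow Superset's response format):

import requests

def paginate(session: requests.Session, base_url: str, entity_type: str, page_size: int = 25):
    current_page, total_items = 0, 1  # total is corrected by the first response
    while current_page * page_size < total_items:
        response = session.get(
            f"{base_url}/api/v1/{entity_type}",
            params={"q": f"(page:{current_page},page_size:{page_size})"},
        )
        payload = response.json()
        total_items = payload.get("count", 0)
        yield from payload.get("result", [])
        current_page += 1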
@@ -287,25 +316,24 @@ class SupersetSource(StatefulIngestionSourceBase):

             current_page += 1

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return platform_name
+    def parse_owner_info(self) -> Dict[str, Any]:
+        entity_types = ["dataset", "dashboard", "chart"]
+        owners_info = {}
+
+        for entity in entity_types:
+            for owner in self.paginate_entity_api_results(f"{entity}/related/owners"):
+                owner_id = owner.get("value")
+                if owner_id:
+                    owners_info[owner_id] = owner.get("extra", {}).get("email", "")
+
+        return owners_info
+
+    def build_owner_urn(self, data: Dict[str, Any]) -> List[str]:
+        return [
+            make_user_urn(self.owner_info.get(owner.get("id"), ""))
+            for owner in data.get("owners", [])
+            if owner.get("id")
+        ]

     @lru_cache(maxsize=None)
     def get_dataset_info(self, dataset_id: int) -> dict:
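parse_owner_info bulk-loads an id-to-email map once per run from the /related/owners endpoints, and build_owner_urn then resolves each entity's owner ids into DataHub corpuser URNs. The mapping in isolation, using the real make_user_urn helper (the sample data is invented):

from datahub.emitter.mce_builder import make_user_urn

owner_info = {1: "alice@example.com", 2: "bob@example.com"}  # from /related/owners
data = {"owners": [{"id": 1}, {"id": 2}, {"first_name": "no-id"}]}  # entity payload

urns = [
    make_user_urn(owner_info.get(owner.get("id"), ""))
    for owner in data.get("owners", [])
    if owner.get("id")
]
assert urns == [
    "urn:li:corpuser:alice@example.com",
    "urn:li:corpuser:bob@example.com",
]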
@@ -323,8 +351,6 @@
         schema_name = dataset_response.get("result", {}).get("schema")
         table_name = dataset_response.get("result", {}).get("table_name")
         database_id = dataset_response.get("result", {}).get("database", {}).get("id")
-        platform = self.get_platform_from_database_id(database_id)
-
         database_name = (
             dataset_response.get("result", {}).get("database", {}).get("database_name")
         )

@@ -333,21 +359,24 @@
         # Druid do not have a database concept and has a limited schema concept, but they are nonetheless reported
         # from superset. There is only one database per platform instance, and one schema named druid, so it would be
         # redundant to systemically store them both in the URN.
-        if
+        if platform_instance in platform_without_databases:
             database_name = None

-        if
+        if platform_instance == "druid" and schema_name == "druid":
             # Follow DataHub's druid source convention.
             schema_name = None

-
+        # If the information about the datasource is already contained in the dataset response,
+        # can just return the urn directly
+        if table_name and database_id:
             return make_dataset_urn(
-                platform=
+                platform=platform_instance,
                 name=".".join(
                     name for name in [database_name, schema_name, table_name] if name
                 ),
                 env=self.config.env,
             )
+
         raise ValueError("Could not construct dataset URN")

     def construct_dashboard_from_api_data(
@@ -363,15 +392,16 @@
             aspects=[Status(removed=False)],
         )

-        modified_actor = f"urn:li:corpuser:{(dashboard_data.get('changed_by') or {}).get('
+        modified_actor = f"urn:li:corpuser:{self.owner_info.get((dashboard_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
         modified_ts = int(
             dp.parse(dashboard_data.get("changed_on_utc", "now")).timestamp() * 1000
         )
         title = dashboard_data.get("dashboard_title", "")
         # note: the API does not currently supply created_by usernames due to a bug
-        last_modified =
-
-
+        last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
+
+        change_audit_stamps = ChangeAuditStamps(
+            created=None, lastModified=last_modified
         )
         dashboard_url = f"{self.config.display_uri}{dashboard_data.get('url', '')}"

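The modification metadata is now built from concrete aspect classes instead of the removed AuditStamp import: changed_on_utc is parsed with dateutil and converted to epoch milliseconds, the unit DataHub timestamps use. The conversion in isolation (timestamp and actor are invented):

import dateutil.parser as dp

from datahub.metadata.com.linkedin.pegasus2avro.common import ChangeAuditStamps
from datahub.metadata.schema_classes import AuditStampClass

modified_ts = int(dp.parse("2025-02-14T09:30:00+00:00").timestamp() * 1000)
assert modified_ts == 1739525400000  # epoch milliseconds

last_modified = AuditStampClass(time=modified_ts, actor="urn:li:corpuser:alice@example.com")
change_audit_stamps = ChangeAuditStamps(created=None, lastModified=last_modified)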
@@ -397,7 +427,7 @@
             "IsPublished": str(dashboard_data.get("published", False)).lower(),
             "Owners": ", ".join(
                 map(
-                    lambda owner: owner.get("
+                    lambda owner: self.owner_info.get(owner.get("id", -1), "unknown"),
                     dashboard_data.get("owners", []),
                 )
             ),

@@ -417,16 +447,39 @@
             description="",
             title=title,
             charts=chart_urns,
-            lastModified=last_modified,
             dashboardUrl=dashboard_url,
             customProperties=custom_properties,
+            lastModified=change_audit_stamps,
         )
         dashboard_snapshot.aspects.append(dashboard_info)
+
+        dashboard_owners_list = self.build_owner_urn(dashboard_data)
+        owners_info = OwnershipClass(
+            owners=[
+                OwnerClass(
+                    owner=urn,
+                    type=OwnershipTypeClass.TECHNICAL_OWNER,
+                )
+                for urn in (dashboard_owners_list or [])
+            ],
+            lastModified=last_modified,
+        )
+        dashboard_snapshot.aspects.append(owners_info)
+
         return dashboard_snapshot

     def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
-        for dashboard_data in self.paginate_entity_api_results("dashboard", PAGE_SIZE):
+        for dashboard_data in self.paginate_entity_api_results("dashboard/", PAGE_SIZE):
             try:
+                dashboard_id = str(dashboard_data.get("id"))
+                dashboard_title = dashboard_data.get("dashboard_title", "")
+
+                if not self.config.dashboard_pattern.allowed(dashboard_title):
+                    self.report.report_dropped(
+                        f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
+                    )
+                    continue
+
                 dashboard_snapshot = self.construct_dashboard_from_api_data(
                     dashboard_data
                 )
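The same OwnershipClass aspect is attached here and, further down, to charts and datasets. For a sense of how such an aspect looks outside the snapshot/MCE style this source uses, a hedged sketch of emitting it as a standalone proposal (the URN is a placeholder):

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
    OwnerClass,
    OwnershipClass,
    OwnershipTypeClass,
)

ownership = OwnershipClass(
    owners=[
        OwnerClass(
            owner="urn:li:corpuser:alice@example.com",
            type=OwnershipTypeClass.TECHNICAL_OWNER,
        )
    ]
)
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dashboard:(superset,42)",  # placeholder URN
    aspect=ownership,
)
# mcp can then be handed to any DataHub emitter (REST, Kafka, file).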
@@ -439,7 +492,7 @@
                 mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
                 yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
                 yield from self._get_domain_wu(
-                    title=
+                    title=dashboard_title,
                     entity_urn=dashboard_snapshot.urn,
                 )

@@ -454,25 +507,33 @@
             aspects=[Status(removed=False)],
         )

-        modified_actor = f"urn:li:corpuser:{(chart_data.get('changed_by') or {}).get('
+        modified_actor = f"urn:li:corpuser:{self.owner_info.get((chart_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
         modified_ts = int(
             dp.parse(chart_data.get("changed_on_utc", "now")).timestamp() * 1000
         )
         title = chart_data.get("slice_name", "")

         # note: the API does not currently supply created_by usernames due to a bug
-        last_modified =
-
-
+        last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
+
+        change_audit_stamps = ChangeAuditStamps(
+            created=None, lastModified=last_modified
         )
+
         chart_type = chart_type_from_viz_type.get(chart_data.get("viz_type", ""))
         chart_url = f"{self.config.display_uri}{chart_data.get('url', '')}"

         datasource_id = chart_data.get("datasource_id")
-
-
-
-
+        if not datasource_id:
+            logger.debug(
+                f"chart {chart_data['id']} has no datasource_id, skipping fetching dataset info"
+            )
+            datasource_urn = None
+        else:
+            dataset_response = self.get_dataset_info(datasource_id)
+            datasource_urn = self.get_datasource_urn_from_id(
+                dataset_response, self.platform
+            )

         params = json.loads(chart_data.get("params", "{}"))
         metrics = [
@@ -515,23 +576,61 @@
             type=chart_type,
             description="",
             title=title,
-            lastModified=last_modified,
             chartUrl=chart_url,
             inputs=[datasource_urn] if datasource_urn else None,
             customProperties=custom_properties,
+            lastModified=change_audit_stamps,
         )
         chart_snapshot.aspects.append(chart_info)
+
+        chart_owners_list = self.build_owner_urn(chart_data)
+        owners_info = OwnershipClass(
+            owners=[
+                OwnerClass(
+                    owner=urn,
+                    type=OwnershipTypeClass.TECHNICAL_OWNER,
+                )
+                for urn in (chart_owners_list or [])
+            ],
+            lastModified=last_modified,
+        )
+        chart_snapshot.aspects.append(owners_info)
         return chart_snapshot

     def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
-        for chart_data in self.paginate_entity_api_results("chart", PAGE_SIZE):
+        for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE):
             try:
+                chart_id = str(chart_data.get("id"))
+                chart_name = chart_data.get("slice_name", "")
+
+                if not self.config.chart_pattern.allowed(chart_name):
+                    self.report.report_dropped(
+                        f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
+                    )
+                    continue
+
+                # Emit a warning if charts use data from a dataset that will be filtered out
+                if self.config.dataset_pattern != AllowDenyPattern.allow_all():
+                    datasource_id = chart_data.get("datasource_id")
+                    if datasource_id:
+                        dataset_response = self.get_dataset_info(datasource_id)
+                        dataset_name = dataset_response.get("result", {}).get(
+                            "table_name", ""
+                        )
+
+                        if dataset_name and not self.config.dataset_pattern.allowed(
+                            dataset_name
+                        ):
+                            self.report.warning(
+                                f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
+                            )
+
                 chart_snapshot = self.construct_chart_from_chart_data(chart_data)

                 mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
             except Exception as e:
                 self.report.warning(
-                    f"Failed to construct chart snapshot. Chart name: {
+                    f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
                 )
                 continue
             # Emit the chart
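Note that get_dataset_info may run twice per chart: once for the dataset-pattern warning above and again inside construct_chart_from_chart_data. Because the method is decorated with @lru_cache(maxsize=None), the second call for the same id is served from memory rather than hitting the Superset API. The caching behavior in miniature:

from functools import lru_cache

calls = 0

@lru_cache(maxsize=None)
def get_dataset_info(dataset_id: int) -> dict:
    global calls
    calls += 1  # stands in for an HTTP GET to /api/v1/dataset/{id}
    return {"result": {"table_name": f"table_{dataset_id}"}}

get_dataset_info(7)
get_dataset_info(7)  # cache hit, no second "request"
assert calls == 1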
@@ -588,25 +687,65 @@
     ) -> DatasetSnapshot:
         dataset_response = self.get_dataset_info(dataset_data.get("id"))
         dataset = SupersetDataset(**dataset_response["result"])
+
         datasource_urn = self.get_datasource_urn_from_id(
             dataset_response, self.platform
         )
+        dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"
+
+        modified_actor = f"urn:li:corpuser:{self.owner_info.get((dataset_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        modified_ts = int(
+            dp.parse(dataset_data.get("changed_on_utc", "now")).timestamp() * 1000
+        )
+        last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)
+
+        upstream_warehouse_platform = (
+            dataset_response.get("result", {}).get("database", {}).get("backend")
+        )

-
+        # Preset has a way of naming their platforms differently than
+        # how datahub names them, so map the platform name to the correct naming
+        warehouse_naming = {
+            "awsathena": "athena",
+            "clickhousedb": "clickhouse",
+            "postgresql": "postgres",
+        }
+
+        if upstream_warehouse_platform in warehouse_naming:
+            upstream_warehouse_platform = warehouse_naming[upstream_warehouse_platform]
+
+        # TODO: Categorize physical vs virtual upstream dataset
+        # mark all upstream dataset as physical for now, in the future we would ideally like
+        # to differentiate physical vs virtual upstream datasets
+        tag_urn = f"urn:li:tag:{self.platform}:physical"
+        upstream_dataset = self.get_datasource_urn_from_id(
+            dataset_response, upstream_warehouse_platform
+        )
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    type=DatasetLineageTypeClass.TRANSFORMED,
+                    dataset=upstream_dataset,
+                    properties={"externalUrl": dataset_url},
+                )
+            ]
+        )

         dataset_info = DatasetPropertiesClass(
             name=dataset.table_name,
             description="",
-            lastModified=TimeStamp(time=dataset.modified_ts)
-            if dataset.modified_ts
-            else None,
             externalUrl=dataset_url,
+            lastModified=TimeStamp(time=modified_ts),
         )
+        global_tags = GlobalTagsClass(tags=[TagAssociationClass(tag=tag_urn)])
+
         aspects_items: List[Any] = []
         aspects_items.extend(
             [
                 self.gen_schema_metadata(dataset_response),
                 dataset_info,
+                upstream_lineage,
+                global_tags,
             ]
         )

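The backend string reported by Superset/Preset (e.g. postgresql) is remapped to DataHub's platform id (postgres) before the upstream URN is built, so the lineage edge lands on the same URN the warehouse's own ingestion source would emit. The effect on the URN, using the real make_dataset_urn helper (names are invented):

from datahub.emitter.mce_builder import make_dataset_urn

warehouse_naming = {"awsathena": "athena", "clickhousedb": "clickhouse", "postgresql": "postgres"}

backend = "postgresql"  # as returned by the Superset database API
platform = warehouse_naming.get(backend, backend)

urn = make_dataset_urn(platform=platform, name="analytics.public.orders", env="PROD")
assert urn == "urn:li:dataset:(urn:li:dataPlatform:postgres,analytics.public.orders,PROD)"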
@@ -614,11 +753,34 @@
             urn=datasource_urn,
             aspects=aspects_items,
         )
+
+        dataset_owners_list = self.build_owner_urn(dataset_data)
+        owners_info = OwnershipClass(
+            owners=[
+                OwnerClass(
+                    owner=urn,
+                    type=OwnershipTypeClass.TECHNICAL_OWNER,
+                )
+                for urn in (dataset_owners_list or [])
+            ],
+            lastModified=last_modified,
+        )
+        aspects_items.append(owners_info)
+
         return dataset_snapshot

     def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
-        for dataset_data in self.paginate_entity_api_results("dataset", PAGE_SIZE):
+        for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE):
             try:
+                dataset_name = dataset_data.get("table_name", "")
+
+                # Check if dataset should be filtered by dataset name
+                if not self.config.dataset_pattern.allowed(dataset_name):
+                    self.report.report_dropped(
+                        f"Dataset '{dataset_name}' filtered by dataset_pattern"
+                    )
+                    continue
+
                 dataset_snapshot = self.construct_dataset_from_dataset_data(
                     dataset_data
                 )
datahub/ingestion/source/tableau/tableau.py
CHANGED

@@ -1911,11 +1911,7 @@ class TableauSiteSource:
                 if upstream_col.get(c.TABLE)
                 else None
             )
-            if (
-                name
-                and upstream_table_id
-                and upstream_table_id in table_id_to_urn.keys()
-            ):
+            if name and upstream_table_id and upstream_table_id in table_id_to_urn:
                 parent_dataset_urn = table_id_to_urn[upstream_table_id]
                 if (
                     self.is_snowflake_urn(parent_dataset_urn)
datahub/lite/duckdb_lite.py
CHANGED
@@ -760,15 +760,9 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
                 entity_id=[str(data_platform_urn), data_platform_instance],
             )
             self._create_edges_from_data_platform_instance(data_platform_instance_urn)
-        elif isinstance(aspect, ChartInfoClass)
-
-
-                entity_urn,
-                "name",
-                aspect.title + f" ({urn.get_entity_id()[-1]})",
-                remove_existing=True,
-            )
-        elif isinstance(aspect, DashboardInfoClass):
+        elif isinstance(aspect, ChartInfoClass) or isinstance(
+            aspect, DashboardInfoClass
+        ):
             urn = Urn.from_string(entity_urn)
             self.add_edge(
                 entity_urn,