acryl-datahub 1.0.0rc17__py3-none-any.whl → 1.0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2426 -2427
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +106 -89
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -28
- datahub/cli/specific/dataset_cli.py +26 -10
- datahub/emitter/mce_builder.py +1 -3
- datahub/emitter/mcp_builder.py +8 -0
- datahub/emitter/request_helper.py +19 -14
- datahub/emitter/response_helper.py +25 -18
- datahub/emitter/rest_emitter.py +23 -7
- datahub/errors.py +8 -0
- datahub/ingestion/api/source.py +7 -2
- datahub/ingestion/api/source_helpers.py +14 -2
- datahub/ingestion/extractor/schema_util.py +1 -0
- datahub/ingestion/graph/client.py +26 -20
- datahub/ingestion/graph/filters.py +62 -17
- datahub/ingestion/sink/datahub_rest.py +2 -2
- datahub/ingestion/source/cassandra/cassandra.py +1 -10
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
- datahub/ingestion/source/common/subtypes.py +17 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
- datahub/ingestion/source/dbt/dbt_common.py +6 -4
- datahub/ingestion/source/dbt/dbt_core.py +4 -6
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_source.py +96 -117
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/ge_data_profiler.py +11 -1
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +394 -0
- datahub/ingestion/source/hex/constants.py +3 -0
- datahub/ingestion/source/hex/hex.py +167 -0
- datahub/ingestion/source/hex/mapper.py +372 -0
- datahub/ingestion/source/hex/model.py +68 -0
- datahub/ingestion/source/iceberg/iceberg.py +193 -140
- datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
- datahub/ingestion/source/mlflow.py +217 -8
- datahub/ingestion/source/mode.py +11 -1
- datahub/ingestion/source/openapi.py +69 -34
- datahub/ingestion/source/powerbi/config.py +31 -4
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +41 -24
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
- datahub/ingestion/source/redshift/lineage_v2.py +9 -1
- datahub/ingestion/source/redshift/query.py +1 -1
- datahub/ingestion/source/s3/source.py +11 -0
- datahub/ingestion/source/sigma/config.py +3 -4
- datahub/ingestion/source/sigma/sigma.py +10 -6
- datahub/ingestion/source/slack/slack.py +399 -82
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +16 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
- datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
- datahub/ingestion/source/sql/mssql/job_models.py +15 -1
- datahub/ingestion/source/sql/mssql/source.py +8 -4
- datahub/ingestion/source/sql/oracle.py +51 -4
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
- datahub/ingestion/source/superset.py +291 -35
- datahub/ingestion/source/usage/usage_common.py +0 -65
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1055 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
- datahub/metadata/_schema_classes.py +472 -1
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/schema.avsc +313 -2
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
- datahub/metadata/schemas/QueryProperties.avsc +20 -0
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/dataset.py +122 -0
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +27 -3
- datahub/sdk/main_client.py +24 -1
- datahub/sdk/search_client.py +81 -8
- datahub/sdk/search_filters.py +94 -37
- datahub/sql_parsing/split_statements.py +17 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
- datahub/sql_parsing/tool_meta_extractor.py +27 -2
- datahub/testing/mcp_diff.py +1 -18
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/ingestion/source/vertexai.py +0 -697
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/superset.py

@@ -3,7 +3,7 @@ import logging
 from dataclasses import dataclass, field
 from datetime import datetime
 from functools import lru_cache
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

 import dateutil.parser as dp
 import requests
@@ -11,6 +11,7 @@ from pydantic import BaseModel
 from pydantic.class_validators import root_validator, validator
 from pydantic.fields import Field

+import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import (
     EnvConfigMixin,
@@ -23,8 +24,10 @@ from datahub.emitter.mce_builder import (
     make_dataset_urn,
     make_dataset_urn_with_platform_instance,
     make_domain_urn,
+    make_schema_field_urn,
     make_user_urn,
 )
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import add_domain_to_entity_wu
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -49,6 +52,8 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     ChangeAuditStamps,
+    InputField,
+    InputFields,
     Status,
     TimeStamp,
 )
@@ -59,11 +64,16 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
 )
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
+    BooleanTypeClass,
+    DateTypeClass,
     MySqlDDL,
     NullType,
+    NullTypeClass,
+    NumberTypeClass,
     SchemaField,
     SchemaFieldDataType,
     SchemaMetadata,
+    StringTypeClass,
 )
 from datahub.metadata.schema_classes import (
     AuditStampClass,
@@ -72,6 +82,9 @@ from datahub.metadata.schema_classes import (
     DashboardInfoClass,
     DatasetLineageTypeClass,
     DatasetPropertiesClass,
+    FineGrainedLineageClass,
+    FineGrainedLineageDownstreamTypeClass,
+    FineGrainedLineageUpstreamTypeClass,
     GlobalTagsClass,
     OwnerClass,
     OwnershipClass,
@@ -80,6 +93,10 @@ from datahub.metadata.schema_classes import (
     UpstreamClass,
     UpstreamLineageClass,
 )
+from datahub.sql_parsing.sqlglot_lineage import (
+    SqlParsingResult,
+    create_lineage_sql_parsed_result,
+)
 from datahub.utilities import config_clean
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
@@ -105,9 +122,17 @@ chart_type_from_viz_type = {
     "box_plot": ChartTypeClass.BAR,
 }

-
 platform_without_databases = ["druid"]

+FIELD_TYPE_MAPPING = {
+    "INT": NumberTypeClass,
+    "STRING": StringTypeClass,
+    "FLOAT": NumberTypeClass,
+    "DATETIME": DateTypeClass,
+    "BOOLEAN": BooleanTypeClass,
+    "SQL": StringTypeClass,
+}
+

 @dataclass
 class SupersetSourceReport(StaleEntityRemovalSourceReport):
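For readers skimming the Superset changes: the new FIELD_TYPE_MAPPING above drives how chart column types are translated into DataHub schema types. A minimal standalone sketch of that lookup; the `resolve_field_type` helper is illustrative, not part of the package:

```python
# Illustrative only: resolving a Superset native column type into a DataHub
# schema type via the FIELD_TYPE_MAPPING shown in the hunk above.
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
    BooleanTypeClass,
    DateTypeClass,
    NullTypeClass,
    NumberTypeClass,
    SchemaFieldDataType,
    StringTypeClass,
)

FIELD_TYPE_MAPPING = {
    "INT": NumberTypeClass,
    "STRING": StringTypeClass,
    "FLOAT": NumberTypeClass,
    "DATETIME": DateTypeClass,
    "BOOLEAN": BooleanTypeClass,
    "SQL": StringTypeClass,
}


def resolve_field_type(native_type: str) -> SchemaFieldDataType:
    # Unknown native types fall back to NullTypeClass, mirroring the source.
    type_class = FIELD_TYPE_MAPPING.get(native_type.upper(), NullTypeClass)
    return SchemaFieldDataType(type=type_class())


resolve_field_type("float")    # wraps NumberTypeClass
resolve_field_type("varchar")  # falls back to NullTypeClass
```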
@@ -181,6 +206,10 @@ class SupersetConfig(
     provider: str = Field(default="db", description="Superset provider.")
     options: Dict = Field(default={}, description="")

+    timeout: int = Field(
+        default=10, description="Timeout of single API call to superset."
+    )
+
     # TODO: Check and remove this if no longer needed.
     # Config database_alias is removed from sql sources.
     database_alias: Dict[str, str] = Field(
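The new `timeout` field is threaded into every Superset API call in the hunks that follow. A rough sketch of the idea, using a stand-in config class (`MiniSupersetConfig` and `fetch_dashboards` are illustrative, not the package's code):

```python
# Minimal sketch: a per-call timeout bounded by a config field, as the source
# now does for its Superset REST calls.
import requests
from pydantic import BaseModel, Field


class MiniSupersetConfig(BaseModel):
    connect_uri: str = "http://localhost:8088"
    timeout: int = Field(
        default=10, description="Timeout of single API call to superset."
    )


def fetch_dashboards(config: MiniSupersetConfig) -> dict:
    # Requires a reachable Superset instance; raises requests.exceptions.Timeout
    # if the call takes longer than config.timeout seconds.
    response = requests.get(
        f"{config.connect_uri}/api/v1/dashboard/",
        timeout=config.timeout,
    )
    response.raise_for_status()
    return response.json()
```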
@@ -285,13 +314,16 @@ class SupersetSource(StatefulIngestionSourceBase):
             }
         )

-        # Test the connection
         test_response = requests_session.get(
-            f"{self.config.connect_uri}/api/v1/dashboard/"
+            f"{self.config.connect_uri}/api/v1/dashboard/",
+            timeout=self.config.timeout,
         )
-        if test_response.status_code
-
-        #
+        if test_response.status_code != 200:
+            # throw an error and terminate ingestion,
+            # cannot proceed without access token
+            logger.error(
+                f"Failed to log in to Superset with status: {test_response.status_code}"
+            )
         return requests_session

     def paginate_entity_api_results(self, entity_type, page_size=100):
@@ -302,6 +334,7 @@ class SupersetSource(StatefulIngestionSourceBase):
             response = self.session.get(
                 f"{self.config.connect_uri}/api/v1/{entity_type}",
                 params={"q": f"(page:{current_page},page_size:{page_size})"},
+                timeout=self.config.timeout,
             )

             if response.status_code != 200:
@@ -339,10 +372,11 @@ class SupersetSource(StatefulIngestionSourceBase):
     def get_dataset_info(self, dataset_id: int) -> dict:
         dataset_response = self.session.get(
             f"{self.config.connect_uri}/api/v1/dataset/{dataset_id}",
+            timeout=self.config.timeout,
         )
         if dataset_response.status_code != 200:
             logger.warning(f"Failed to get dataset info: {dataset_response.text}")
-
+            return {}
         return dataset_response.json()

     def get_datasource_urn_from_id(
@@ -393,8 +427,9 @@ class SupersetSource(StatefulIngestionSourceBase):
         )

         modified_actor = f"urn:li:corpuser:{self.owner_info.get((dashboard_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
         modified_ts = int(
-            dp.parse(dashboard_data.get("changed_on_utc",
+            dp.parse(dashboard_data.get("changed_on_utc", now)).timestamp() * 1000
         )
         title = dashboard_data.get("dashboard_title", "")
         # note: the API does not currently supply created_by usernames due to a bug
@@ -494,7 +529,119 @@ class SupersetSource(StatefulIngestionSourceBase):
             entity_urn=dashboard_snapshot.urn,
         )

-    def
+    def build_input_fields(
+        self,
+        chart_columns: List[Tuple[str, str, str]],
+        datasource_urn: Union[str, None],
+    ) -> List[InputField]:
+        input_fields: List[InputField] = []
+
+        for column in chart_columns:
+            col_name, col_type, description = column
+            if not col_type or not datasource_urn:
+                continue
+
+            type_class = FIELD_TYPE_MAPPING.get(
+                col_type.upper(), NullTypeClass
+            )  # gets the type mapping
+
+            input_fields.append(
+                InputField(
+                    schemaFieldUrn=builder.make_schema_field_urn(
+                        parent_urn=str(datasource_urn),
+                        field_path=col_name,
+                    ),
+                    schemaField=SchemaField(
+                        fieldPath=col_name,
+                        type=SchemaFieldDataType(type=type_class()),  # type: ignore
+                        description=(description if description != "null" else ""),
+                        nativeDataType=col_type,
+                        globalTags=None,
+                        nullable=True,
+                    ),
+                )
+            )
+
+        return input_fields
+
+    def construct_chart_cll(
+        self,
+        chart_data: dict,
+        datasource_urn: Union[str, None],
+        datasource_id: Union[Any, int],
+    ) -> List[InputField]:
+        column_data: List[Union[str, dict]] = chart_data.get("form_data", {}).get(
+            "all_columns", []
+        )
+
+        # the second field represents whether its a SQL expression,
+        # false being just regular column and true being SQL col
+        chart_column_data: List[Tuple[str, bool]] = [
+            (column, False)
+            if isinstance(column, str)
+            else (column.get("label", ""), True)
+            for column in column_data
+        ]
+
+        dataset_columns: List[Tuple[str, str, str]] = []
+
+        # parses the superset dataset's column info, to build type and description info
+        if datasource_id:
+            dataset_info = self.get_dataset_info(datasource_id).get("result", {})
+            dataset_column_info = dataset_info.get("columns", [])
+
+            for column in dataset_column_info:
+                col_name = column.get("column_name", "")
+                col_type = column.get("type", "")
+                col_description = column.get("description", "")
+
+                # if missing column name or column type, cannot construct the column,
+                # so we skip this column, missing description is fine
+                if col_name == "" or col_type == "":
+                    logger.info(f"could not construct column lineage for {column}")
+                    continue
+
+                dataset_columns.append((col_name, col_type, col_description))
+        else:
+            # if no datasource id, cannot build cll, just return
+            logger.warning(
+                "no datasource id was found, cannot build column level lineage"
+            )
+            return []
+
+        chart_columns: List[Tuple[str, str, str]] = []
+        for chart_col in chart_column_data:
+            chart_col_name, is_sql = chart_col
+            if is_sql:
+                chart_columns.append(
+                    (
+                        chart_col_name,
+                        "SQL",
+                        "",
+                    )
+                )
+                continue
+
+            # find matching upstream column
+            for dataset_col in dataset_columns:
+                dataset_col_name, dataset_col_type, dataset_col_description = (
+                    dataset_col
+                )
+                if dataset_col_name == chart_col_name:
+                    chart_columns.append(
+                        (chart_col_name, dataset_col_type, dataset_col_description)
+                    )  # column name, column type, description
+                    break
+
+            # if no matching upstream column was found
+            if len(chart_columns) == 0 or chart_columns[-1][0] != chart_col_name:
+                chart_columns.append((chart_col_name, "", ""))
+
+        return self.build_input_fields(chart_columns, datasource_urn)
+
+    def construct_chart_from_chart_data(
+        self, chart_data: dict
+    ) -> Iterable[MetadataWorkUnit]:
         chart_urn = make_chart_urn(
             platform=self.platform,
             name=str(chart_data["id"]),
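The new column-level-lineage helpers above match chart columns from `form_data.all_columns` against the Superset dataset's column metadata. A toy, dependency-free walkthrough of that matching, with invented sample data:

```python
# Toy walkthrough: chart columns are either plain column names or dicts that
# describe SQL expressions; plain names inherit type/description from the
# matching dataset column, SQL expressions are typed as "SQL".
form_data_columns = ["price", {"label": "price * qty", "sqlExpression": "price * qty"}]
dataset_columns = [("price", "FLOAT", "unit price"), ("qty", "INT", "")]

chart_columns = []
for col in form_data_columns:
    if isinstance(col, dict):
        # SQL expression columns get the "SQL" type and no description
        chart_columns.append((col.get("label", ""), "SQL", ""))
        continue
    # plain columns fall back to empty type/description when there is no match
    match = next((d for d in dataset_columns if d[0] == col), (col, "", ""))
    chart_columns.append(match)

print(chart_columns)
# [('price', 'FLOAT', 'unit price'), ('price * qty', 'SQL', '')]
```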
@@ -506,8 +653,9 @@ class SupersetSource(StatefulIngestionSourceBase):
         )

         modified_actor = f"urn:li:corpuser:{self.owner_info.get((chart_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
         modified_ts = int(
-            dp.parse(chart_data.get("changed_on_utc",
+            dp.parse(chart_data.get("changed_on_utc", now)).timestamp() * 1000
         )
         title = chart_data.get("slice_name", "")

@@ -581,6 +729,18 @@ class SupersetSource(StatefulIngestionSourceBase):
         )
         chart_snapshot.aspects.append(chart_info)

+        input_fields = self.construct_chart_cll(
+            chart_data, datasource_urn, datasource_id
+        )
+
+        if input_fields:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=chart_urn,
+                aspect=InputFields(
+                    fields=sorted(input_fields, key=lambda x: x.schemaFieldUrn)
+                ),
+            ).as_workunit()
+
         chart_owners_list = self.build_owner_urn(chart_data)
         owners_info = OwnershipClass(
             owners=[
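Chart work units now also carry an InputFields aspect emitted as a metadata change proposal, as shown above. A hedged sketch of that emission pattern; the chart urn and the empty field list are placeholders:

```python
# Sketch only: attaching an InputFields aspect to a chart urn and converting it
# into a work unit, mirroring the pattern used in the hunk above.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.com.linkedin.pegasus2avro.common import InputFields

mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:chart:(superset,42)",  # placeholder chart urn
    aspect=InputFields(fields=[]),  # the source passes its sorted InputField list here
)
workunit = mcp.as_workunit()  # MetadataWorkUnit consumed by the ingestion pipeline
```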
@@ -593,7 +753,14 @@ class SupersetSource(StatefulIngestionSourceBase):
             lastModified=last_modified,
         )
         chart_snapshot.aspects.append(owners_info)
-
+        yield MetadataWorkUnit(
+            id=chart_urn, mce=MetadataChangeEvent(proposedSnapshot=chart_snapshot)
+        )
+
+        yield from self._get_domain_wu(
+            title=chart_data.get("slice_name", ""),
+            entity_urn=chart_urn,
+        )

     def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
         for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE):
@@ -623,20 +790,12 @@ class SupersetSource(StatefulIngestionSourceBase):
                         f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
                     )

-
-
-                mce = MetadataChangeEvent(proposedSnapshot=chart_snapshot)
+                yield from self.construct_chart_from_chart_data(chart_data)
             except Exception as e:
                 self.report.warning(
                     f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
                 )
                 continue
-            # Emit the chart
-            yield MetadataWorkUnit(id=chart_snapshot.urn, mce=mce)
-            yield from self._get_domain_wu(
-                title=chart_data.get("slice_name", ""),
-                entity_urn=chart_snapshot.urn,
-            )

     def gen_schema_fields(self, column_data: List[Dict[str, str]]) -> List[SchemaField]:
         schema_fields: List[SchemaField] = []
@@ -680,6 +839,88 @@ class SupersetSource(StatefulIngestionSourceBase):
             env=self.config.env,
         )

+    def generate_virtual_dataset_lineage(
+        self,
+        parsed_query_object: SqlParsingResult,
+        datasource_urn: str,
+    ) -> UpstreamLineageClass:
+        cll = (
+            parsed_query_object.column_lineage
+            if parsed_query_object.column_lineage is not None
+            else []
+        )
+
+        fine_grained_lineages: List[FineGrainedLineageClass] = []
+
+        for cll_info in cll:
+            downstream = (
+                [make_schema_field_urn(datasource_urn, cll_info.downstream.column)]
+                if cll_info.downstream and cll_info.downstream.column
+                else []
+            )
+            upstreams = [
+                make_schema_field_urn(column_ref.table, column_ref.column)
+                for column_ref in cll_info.upstreams
+            ]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
+
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    type=DatasetLineageTypeClass.TRANSFORMED,
+                    dataset=input_table_urn,
+                )
+                for input_table_urn in parsed_query_object.in_tables
+            ],
+            fineGrainedLineages=fine_grained_lineages,
+        )
+        return upstream_lineage
+
+    def generate_physical_dataset_lineage(
+        self,
+        dataset_response: dict,
+        upstream_dataset: str,
+        datasource_urn: str,
+    ) -> UpstreamLineageClass:
+        # To generate column level lineage, we can manually decode the metadata
+        # to produce the ColumnLineageInfo
+        columns = dataset_response.get("result", {}).get("columns", [])
+        fine_grained_lineages: List[FineGrainedLineageClass] = []
+
+        for column in columns:
+            column_name = column.get("column_name", "")
+            if not column_name:
+                continue
+
+            downstream = [make_schema_field_urn(datasource_urn, column_name)]
+            upstreams = [make_schema_field_urn(upstream_dataset, column_name)]
+            fine_grained_lineages.append(
+                FineGrainedLineageClass(
+                    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
+                    downstreams=downstream,
+                    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
+                    upstreams=upstreams,
+                )
+            )
+
+        upstream_lineage = UpstreamLineageClass(
+            upstreams=[
+                UpstreamClass(
+                    type=DatasetLineageTypeClass.TRANSFORMED,
+                    dataset=upstream_dataset,
+                )
+            ],
+            fineGrainedLineages=fine_grained_lineages,
+        )
+        return upstream_lineage
+
     def construct_dataset_from_dataset_data(
         self, dataset_data: dict
     ) -> DatasetSnapshot:
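Both new lineage builders above emit one FIELD_SET-to-FIELD edge per column. A small illustration of a single edge; the dataset urns are invented:

```python
# Illustration of the column-level lineage edge shape produced by the new
# generate_*_dataset_lineage helpers.
from datahub.emitter.mce_builder import make_schema_field_urn
from datahub.metadata.schema_classes import (
    FineGrainedLineageClass,
    FineGrainedLineageDownstreamTypeClass,
    FineGrainedLineageUpstreamTypeClass,
)

upstream_dataset = (
    "urn:li:dataset:(urn:li:dataPlatform:postgres,analytics.public.orders,PROD)"
)
datasource_urn = "urn:li:dataset:(urn:li:dataPlatform:superset,orders,PROD)"

edge = FineGrainedLineageClass(
    downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
    downstreams=[make_schema_field_urn(datasource_urn, "total")],
    upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
    upstreams=[make_schema_field_urn(upstream_dataset, "total")],
)
```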
@@ -692,14 +933,23 @@ class SupersetSource(StatefulIngestionSourceBase):
         dataset_url = f"{self.config.display_uri}{dataset_response.get('result', {}).get('url', '')}"

         modified_actor = f"urn:li:corpuser:{self.owner_info.get((dataset_data.get('changed_by') or {}).get('id', -1), 'unknown')}"
+        now = datetime.now().strftime("%I:%M%p on %B %d, %Y")
         modified_ts = int(
-            dp.parse(dataset_data.get("changed_on_utc",
+            dp.parse(dataset_data.get("changed_on_utc", now)).timestamp() * 1000
         )
         last_modified = AuditStampClass(time=modified_ts, actor=modified_actor)

         upstream_warehouse_platform = (
             dataset_response.get("result", {}).get("database", {}).get("backend")
         )
+        upstream_warehouse_db_name = (
+            dataset_response.get("result", {}).get("database", {}).get("database_name")
+        )
+
+        # if we have rendered sql, we always use that and defualt back to regular sql
+        sql = dataset_response.get("result", {}).get(
+            "rendered_sql"
+        ) or dataset_response.get("result", {}).get("sql")

         # Preset has a way of naming their platforms differently than
         # how datahub names them, so map the platform name to the correct naming
@@ -712,22 +962,28 @@ class SupersetSource(StatefulIngestionSourceBase):
         if upstream_warehouse_platform in warehouse_naming:
             upstream_warehouse_platform = warehouse_naming[upstream_warehouse_platform]

-        # TODO: Categorize physical vs virtual upstream dataset
-        # mark all upstream dataset as physical for now, in the future we would ideally like
-        # to differentiate physical vs virtual upstream datasets
-        tag_urn = f"urn:li:tag:{self.platform}:physical"
         upstream_dataset = self.get_datasource_urn_from_id(
             dataset_response, upstream_warehouse_platform
         )
-
-
-
-
-
-
-
-
-
+
+        # Sometimes the field will be null instead of not existing
+        if sql == "null" or not sql:
+            tag_urn = f"urn:li:tag:{self.platform}:physical"
+            upstream_lineage = self.generate_physical_dataset_lineage(
+                dataset_response, upstream_dataset, datasource_urn
+            )
+        else:
+            tag_urn = f"urn:li:tag:{self.platform}:virtual"
+            parsed_query_object = create_lineage_sql_parsed_result(
+                query=sql,
+                default_db=upstream_warehouse_db_name,
+                platform=upstream_warehouse_platform,
+                platform_instance=None,
+                env=self.config.env,
+            )
+            upstream_lineage = self.generate_virtual_dataset_lineage(
+                parsed_query_object, datasource_urn
+            )

         dataset_info = DatasetPropertiesClass(
             name=dataset.table_name,
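For virtual (SQL-defined) datasets, upstreams now come from SQL parsing via create_lineage_sql_parsed_result, as called above. A hedged usage sketch; the query, database, and platform values here are invented:

```python
# Sketch of calling the SQL parser the new code relies on for virtual datasets.
from datahub.sql_parsing.sqlglot_lineage import create_lineage_sql_parsed_result

result = create_lineage_sql_parsed_result(
    query="SELECT o.id, o.total FROM orders o",
    default_db="analytics",
    platform="postgres",
    platform_instance=None,
    env="PROD",
)
# result.in_tables feeds the UpstreamClass entries; result.column_lineage (which
# may be None without schema information) feeds the FineGrainedLineageClass edges.
print(result.in_tables)
```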
datahub/ingestion/source/usage/usage_common.py

@@ -12,11 +12,9 @@ from typing import (
     Optional,
     Tuple,
     TypeVar,
-    Union,
 )

 import pydantic
-from deprecated import deprecated
 from pydantic.fields import Field

 import datahub.emitter.mce_builder as builder
@@ -28,19 +26,13 @@ from datahub.configuration.time_window_config import (
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetUsageStatistics
 from datahub.metadata.schema_classes import (
-    CalendarIntervalClass,
     DatasetFieldUsageCountsClass,
     DatasetUsageStatisticsClass,
     DatasetUserUsageCountsClass,
     TimeWindowSizeClass,
-    UsageAggregationClass,
-    WindowDurationClass,
 )
 from datahub.utilities.sql_formatter import format_sql_query, trim_query
-from datahub.utilities.urns.dataset_urn import DatasetUrn
-from datahub.utilities.urns.urn import guess_entity_type

 logger = logging.getLogger(__name__)

@@ -295,60 +287,3 @@
             user_urn_builder=user_urn_builder,
             queries_character_limit=self.config.queries_character_limit,
         )
-
-
-@deprecated
-def convert_usage_aggregation_class(
-    obj: UsageAggregationClass,
-) -> MetadataChangeProposalWrapper:
-    # Legacy usage aggregation only supported dataset usage stats
-    if guess_entity_type(obj.resource) == DatasetUrn.ENTITY_TYPE:
-        aspect = DatasetUsageStatistics(
-            timestampMillis=obj.bucket,
-            eventGranularity=TimeWindowSizeClass(
-                unit=convert_window_to_interval(obj.duration)
-            ),
-            uniqueUserCount=obj.metrics.uniqueUserCount,
-            totalSqlQueries=obj.metrics.totalSqlQueries,
-            topSqlQueries=obj.metrics.topSqlQueries,
-            userCounts=(
-                [
-                    DatasetUserUsageCountsClass(
-                        user=u.user, count=u.count, userEmail=u.userEmail
-                    )
-                    for u in obj.metrics.users
-                    if u.user is not None
-                ]
-                if obj.metrics.users
-                else None
-            ),
-            fieldCounts=(
-                [
-                    DatasetFieldUsageCountsClass(fieldPath=f.fieldName, count=f.count)
-                    for f in obj.metrics.fields
-                ]
-                if obj.metrics.fields
-                else None
-            ),
-        )
-        return MetadataChangeProposalWrapper(entityUrn=obj.resource, aspect=aspect)
-    else:
-        raise Exception(
-            f"Skipping unsupported usage aggregation - invalid entity type: {obj}"
-        )
-
-
-@deprecated
-def convert_window_to_interval(window: Union[str, WindowDurationClass]) -> str:
-    if window == WindowDurationClass.YEAR:
-        return CalendarIntervalClass.YEAR
-    elif window == WindowDurationClass.MONTH:
-        return CalendarIntervalClass.MONTH
-    elif window == WindowDurationClass.WEEK:
-        return CalendarIntervalClass.WEEK
-    elif window == WindowDurationClass.DAY:
-        return CalendarIntervalClass.DAY
-    elif window == WindowDurationClass.HOUR:
-        return CalendarIntervalClass.HOUR
-    else:
-        raise Exception(f"Unsupported window duration: {window}")
|