acryl-datahub 1.0.0.1rc7__py3-none-any.whl → 1.0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/METADATA +2561 -2561
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/RECORD +75 -73
- datahub/_version.py +1 -1
- datahub/api/entities/datajob/dataflow.py +15 -0
- datahub/api/entities/datajob/datajob.py +17 -0
- datahub/api/entities/dataprocess/dataprocess_instance.py +4 -0
- datahub/api/entities/dataset/dataset.py +2 -2
- datahub/api/entities/structuredproperties/structuredproperties.py +1 -1
- datahub/cli/ingest_cli.py +4 -4
- datahub/cli/migrate.py +6 -6
- datahub/configuration/common.py +1 -1
- datahub/emitter/mcp_builder.py +4 -0
- datahub/ingestion/api/common.py +9 -0
- datahub/ingestion/api/source.py +4 -1
- datahub/ingestion/api/source_helpers.py +26 -1
- datahub/ingestion/graph/client.py +104 -0
- datahub/ingestion/run/pipeline.py +0 -6
- datahub/ingestion/source/aws/sagemaker_processors/models.py +4 -4
- datahub/ingestion/source/bigquery_v2/lineage.py +1 -1
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/fivetran/fivetran.py +1 -0
- datahub/ingestion/source/fivetran/fivetran_log_api.py +1 -1
- datahub/ingestion/source/hex/constants.py +5 -0
- datahub/ingestion/source/hex/hex.py +150 -22
- datahub/ingestion/source/hex/mapper.py +28 -2
- datahub/ingestion/source/hex/model.py +10 -2
- datahub/ingestion/source/hex/query_fetcher.py +300 -0
- datahub/ingestion/source/iceberg/iceberg.py +106 -18
- datahub/ingestion/source/kafka/kafka.py +1 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +1 -1
- datahub/ingestion/source/kafka_connect/source_connectors.py +1 -1
- datahub/ingestion/source/looker/looker_source.py +2 -3
- datahub/ingestion/source/mlflow.py +6 -7
- datahub/ingestion/source/mode.py +2 -2
- datahub/ingestion/source/nifi.py +3 -3
- datahub/ingestion/source/openapi.py +3 -3
- datahub/ingestion/source/openapi_parser.py +8 -8
- datahub/ingestion/source/powerbi/config.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +16 -3
- datahub/ingestion/source/redshift/profile.py +2 -2
- datahub/ingestion/source/sigma/sigma.py +6 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -1
- datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
- datahub/ingestion/source/sql/trino.py +4 -3
- datahub/ingestion/source/state/stale_entity_removal_handler.py +0 -1
- datahub/ingestion/source/superset.py +108 -81
- datahub/ingestion/source/tableau/tableau.py +4 -4
- datahub/ingestion/source/tableau/tableau_common.py +2 -2
- datahub/ingestion/source/unity/source.py +1 -1
- datahub/ingestion/source/vertexai/vertexai.py +7 -7
- datahub/ingestion/transformer/add_dataset_dataproduct.py +1 -1
- datahub/ingestion/transformer/add_dataset_ownership.py +1 -1
- datahub/ingestion/transformer/dataset_domain.py +1 -1
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/_schema_classes.py +47 -2
- datahub/metadata/_urns/urn_defs.py +56 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +121 -85
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
- datahub/metadata/schemas/QueryProperties.avsc +4 -2
- datahub/metadata/schemas/SystemMetadata.avsc +86 -0
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +6 -6
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/ingest_utils.py +2 -2
- datahub/ingestion/transformer/system_metadata_transformer.py +0 -45
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.1rc7.dist-info → acryl_datahub-1.0.0.2.dist-info}/top_level.txt +0 -0
@@ -16,7 +16,7 @@ from datahub.api.entities.dataprocess.dataprocess_instance import (
 )
 from datahub.configuration.source_common import EnvConfigMixin
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import
+from datahub.emitter.mcp_builder import ExperimentKey
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -36,6 +36,7 @@ from datahub.ingestion.source.common.subtypes import MLAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
+    StatefulStaleMetadataRemovalConfig,
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
@@ -77,10 +78,6 @@ from datahub.sdk.dataset import Dataset
 T = TypeVar("T")
 
 
-class ContainerKeyWithId(ContainerKey):
-    id: str
-
-
 class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     tracking_uri: Optional[str] = Field(
         default=None,
@@ -123,6 +120,8 @@ class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
         default=None, description="Password for MLflow authentication"
     )
 
+    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
+
 
 @dataclass
 class MLflowRegisteredModelStageInfo:
@@ -252,7 +251,7 @@ class MLflowSource(StatefulIngestionSourceBase):
         self, experiment: Experiment
     ) -> Iterable[MetadataWorkUnit]:
         experiment_container = Container(
-            container_key=
+            container_key=ExperimentKey(
                 platform=str(DataPlatformUrn(platform_name=self.platform)),
                 id=experiment.name,
             ),
@@ -470,7 +469,7 @@ class MLflowSource(StatefulIngestionSourceBase):
     def _get_run_workunits(
         self, experiment: Experiment, run: Run
     ) -> Iterable[MetadataWorkUnit]:
-        experiment_key =
+        experiment_key = ExperimentKey(
             platform=str(DataPlatformUrn(self.platform)), id=experiment.name
         )
 
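The new stateful_ingestion field on MLflowConfig means the mlflow source can now opt into stale-entity removal like other stateful sources. A minimal recipe sketch, assuming a reachable MLflow tracking server and DataHub instance (the URLs and pipeline_name below are placeholders, not values from this release):

from datahub.ingestion.run.pipeline import Pipeline

# Hypothetical recipe: mlflow source with stale-entity removal enabled.
# Stateful ingestion needs a pipeline_name and a DataHub-backed sink to store state.
pipeline = Pipeline.create(
    {
        "pipeline_name": "mlflow_stateful_demo",
        "source": {
            "type": "mlflow",
            "config": {
                "tracking_uri": "http://localhost:5000",
                "stateful_ingestion": {"enabled": True},
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()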
datahub/ingestion/source/mode.py
CHANGED
@@ -899,7 +899,7 @@ class ModeSource(StatefulIngestionSourceBase):
         for match in matches:
             definition = Template(source=match).render()
             parameters = yaml.safe_load(definition)
-            for key in parameters
+            for key in parameters:
                 jinja_params[key] = parameters[key].get("default", "")
 
         normalized_query = re.sub(
@@ -1601,7 +1601,7 @@ class ModeSource(StatefulIngestionSourceBase):
 
     def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
         # Space/collection -> report -> query -> Chart
-        for space_token in self.space_tokens
+        for space_token in self.space_tokens:
             reports = self._get_reports(space_token)
             for report in reports:
                 report_token = report.get("token", "")
datahub/ingestion/source/nifi.py
CHANGED
@@ -703,7 +703,7 @@ class NifiSource(StatefulIngestionSourceBase):
             if (
                 component.nifi_type is NifiType.PROCESSOR
                 and component.type
-                not in NifiProcessorProvenanceEventAnalyzer.KNOWN_INGRESS_EGRESS_PROCESORS
+                not in NifiProcessorProvenanceEventAnalyzer.KNOWN_INGRESS_EGRESS_PROCESORS
             ) or component.nifi_type not in [
                 NifiType.PROCESSOR,
                 NifiType.REMOTE_INPUT_PORT,
@@ -977,7 +977,7 @@ class NifiSource(StatefulIngestionSourceBase):
         )
 
         for incoming_from in incoming:
-            if incoming_from in self.nifi_flow.remotely_accessible_ports
+            if incoming_from in self.nifi_flow.remotely_accessible_ports:
                 dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[incoming_from].name}"
                 dataset_urn = builder.make_dataset_urn(
                     NIFI, dataset_name, self.config.env
@@ -994,7 +994,7 @@ class NifiSource(StatefulIngestionSourceBase):
         )
 
         for outgoing_to in outgoing:
-            if outgoing_to in self.nifi_flow.remotely_accessible_ports
+            if outgoing_to in self.nifi_flow.remotely_accessible_ports:
                 dataset_name = f"{self.config.site_name}.{self.nifi_flow.remotely_accessible_ports[outgoing_to].name}"
                 dataset_urn = builder.make_dataset_urn(
                     NIFI, dataset_name, self.config.env
datahub/ingestion/source/openapi.py
CHANGED
@@ -102,7 +102,7 @@ class OpenApiConfig(ConfigModel):
             # details there once, and then use that session for all requests.
             self.token = f"Bearer {self.bearer_token}"
         else:
-            assert "url_complement" in self.get_token
+            assert "url_complement" in self.get_token, (
                 "When 'request_type' is set to 'get', an url_complement is needed for the request."
             )
             if self.get_token["request_type"] == "get":
@@ -317,7 +317,7 @@ class APISource(Source, ABC):
                 yield wu
 
                 # Handle schema metadata if available
-                if "data" in endpoint_dets
+                if "data" in endpoint_dets:
                     # we are lucky! data is defined in the swagger for this endpoint
                     schema_metadata = set_metadata(dataset_name, endpoint_dets["data"])
                     wu = MetadataWorkUnit(
@@ -371,7 +371,7 @@ class APISource(Source, ABC):
                 else:
                     self.report_bad_responses(response.status_code, type=endpoint_k)
             else:
-                if endpoint_k not in config.forced_examples
+                if endpoint_k not in config.forced_examples:
                     # start guessing...
                     url_guess = try_guessing(endpoint_k, root_dataset_samples)
                     tot_url = clean_url(config.url + self.url_basepath + url_guess)
datahub/ingestion/source/openapi_parser.py
CHANGED
@@ -128,18 +128,18 @@ def get_endpoints(sw_dict: dict) -> dict:
 
     for p_k, p_o in sw_dict["paths"].items():
         method = list(p_o)[0]
-        if "200" in p_o[method]["responses"]
+        if "200" in p_o[method]["responses"]:
             base_res = p_o[method]["responses"]["200"]
-        elif 200 in p_o[method]["responses"]
+        elif 200 in p_o[method]["responses"]:
             # if you read a plain yml file the 200 will be an integer
             base_res = p_o[method]["responses"][200]
         else:
             # the endpoint does not have a 200 response
             continue
 
-        if "description" in p_o[method]
+        if "description" in p_o[method]:
             desc = p_o[method]["description"]
-        elif "summary" in p_o[method]
+        elif "summary" in p_o[method]:
             desc = p_o[method]["summary"]
         else:  # still testing
             desc = ""
@@ -156,7 +156,7 @@ def get_endpoints(sw_dict: dict) -> dict:
             url_details[p_k]["data"] = example_data
 
         # checking whether there are defined parameters to execute the call...
-        if "parameters" in p_o[method]
+        if "parameters" in p_o[method]:
             url_details[p_k]["parameters"] = p_o[method]["parameters"]
 
     return dict(sorted(url_details.items()))
@@ -169,7 +169,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
     data = {}
     if "content" in base_res:
         res_cont = base_res["content"]
-        if "application/json" in res_cont
+        if "application/json" in res_cont:
             ex_field = None
             if "example" in res_cont["application/json"]:
                 ex_field = "example"
@@ -186,7 +186,7 @@ def check_for_api_example_data(base_res: dict, key: str) -> dict:
                 logger.warning(
                     f"Field in swagger file does not give consistent data --- {key}"
                 )
-        elif "text/csv" in res_cont
+        elif "text/csv" in res_cont:
             data = res_cont["text/csv"]["schema"]
     elif "examples" in base_res:
         data = base_res["examples"]["application/json"]
@@ -239,7 +239,7 @@ def guessing_url_name(url: str, examples: dict) -> str:
 
     # substituting the parameter's name w the value
     for name, clean_name in zip(needed_n, cleaned_needed_n):
-        if clean_name in examples[ex2use]
+        if clean_name in examples[ex2use]:
             guessed_url = re.sub(name, str(examples[ex2use][clean_name]), guessed_url)
 
     return guessed_url
datahub/ingestion/source/powerbi/config.py
CHANGED
@@ -555,7 +555,7 @@ class PowerBiDashboardSourceConfig(
     def map_data_platform(cls, value):
         # For backward compatibility convert input PostgreSql to PostgreSQL
         # PostgreSQL is name of the data-platform in M-Query
-        if "PostgreSql" in value
+        if "PostgreSql" in value:
             platform_name = value["PostgreSql"]
             del value["PostgreSql"]
             value["PostgreSQL"] = platform_name
datahub/ingestion/source/powerbi/powerbi.py
CHANGED
@@ -94,7 +94,7 @@ from datahub.metadata.schema_classes import (
     UpstreamLineageClass,
     ViewPropertiesClass,
 )
-from datahub.metadata.urns import ChartUrn
+from datahub.metadata.urns import ChartUrn, DatasetUrn
 from datahub.sql_parsing.sqlglot_lineage import ColumnLineageInfo
 from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.urns.urn_iter import lowercase_dataset_urn
@@ -263,7 +263,7 @@ class Mapper:
         for upstream_dpt in lineage.upstreams:
             if (
                 upstream_dpt.data_platform_pair.powerbi_data_platform_name
-                not in self.__config.dataset_type_mapping
+                not in self.__config.dataset_type_mapping
             ):
                 logger.debug(
                     f"Skipping upstream table for {ds_urn}. The platform {upstream_dpt.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping",
@@ -1083,6 +1083,7 @@ class Mapper:
         report: powerbi_data_classes.Report,
         chart_mcps: List[MetadataChangeProposalWrapper],
         user_mcps: List[MetadataChangeProposalWrapper],
+        dataset_edges: List[EdgeClass],
     ) -> List[MetadataChangeProposalWrapper]:
         """
         Map PowerBi report to Datahub dashboard
@@ -1104,6 +1105,7 @@ class Mapper:
             charts=chart_urn_list,
             lastModified=ChangeAuditStamps(),
             dashboardUrl=report.webUrl,
+            datasetEdges=dataset_edges,
         )
 
         info_mcp = self.new_mcp(
@@ -1197,12 +1199,23 @@ class Mapper:
         ds_mcps = self.to_datahub_dataset(report.dataset, workspace)
         chart_mcps = self.pages_to_chart(report.pages, workspace, ds_mcps)
 
+        # collect all upstream datasets; using a set to retain unique urns
+        dataset_urns = {
+            dataset.entityUrn
+            for dataset in ds_mcps
+            if dataset.entityType == DatasetUrn.ENTITY_TYPE and dataset.entityUrn
+        }
+        dataset_edges = [
+            EdgeClass(destinationUrn=dataset_urn) for dataset_urn in dataset_urns
+        ]
+
         # Let's convert report to datahub dashboard
         report_mcps = self.report_to_dashboard(
             workspace=workspace,
             report=report,
             chart_mcps=chart_mcps,
             user_mcps=user_mcps,
+            dataset_edges=dataset_edges,
         )
 
         # Now add MCPs in sequence
@@ -1340,7 +1353,7 @@ class PowerBiDashboardSource(StatefulIngestionSourceBase, TestableSource):
             for data_platform in SupportedDataPlatform
         ]
 
-        for key in self.source_config.dataset_type_mapping
+        for key in self.source_config.dataset_type_mapping:
             if key not in powerbi_data_platforms:
                 raise ValueError(f"PowerBI DataPlatform {key} is not supported")
 
datahub/ingestion/source/redshift/profile.py
CHANGED
@@ -42,9 +42,9 @@ class RedshiftProfiler(GenericProfiler):
                 "max_overflow", self.config.profiling.max_workers
             )
 
-        for db in tables
+        for db in tables:
             profile_requests = []
-            for schema in tables.get(db, {})
+            for schema in tables.get(db, {}):
                 if not self.config.schema_pattern.allowed(schema):
                     continue
                 for table in tables[db].get(schema, {}):
datahub/ingestion/source/sigma/sigma.py
CHANGED
@@ -170,7 +170,9 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
             if self.config.workspace_pattern.allowed(workspace.name):
                 allowed_workspaces.append(workspace)
             else:
-                self.reporter.workspaces.dropped(
+                self.reporter.workspaces.dropped(
+                    f"{workspace.name} ({workspace.workspaceId})"
+                )
         logger.info(f"Number of allowed workspaces = {len(allowed_workspaces)}")
 
         return allowed_workspaces
@@ -661,7 +663,9 @@ class SigmaSource(StatefulIngestionSourceBase, TestableSource):
             yield from self._gen_workbook_workunit(workbook)
 
         for workspace in self._get_allowed_workspaces():
-            self.reporter.workspaces.processed(
+            self.reporter.workspaces.processed(
+                f"{workspace.name} ({workspace.workspaceId})"
+            )
             yield from self._gen_workspace_workunit(workspace)
         yield from self._gen_sigma_dataset_upstream_lineage_workunit()
 
datahub/ingestion/source/snowflake/snowflake_utils.py
CHANGED
@@ -77,7 +77,7 @@ class SnowsightUrlBuilder:
         region: str,
     ) -> Tuple[str, str]:
         cloud: str
-        if region in SNOWFLAKE_REGION_CLOUD_REGION_MAPPING
+        if region in SNOWFLAKE_REGION_CLOUD_REGION_MAPPING:
            cloud, cloud_region_id = SNOWFLAKE_REGION_CLOUD_REGION_MAPPING[region]
        elif region.startswith(("aws_", "gcp_", "azure_")):
            # e.g. aws_us_west_2, gcp_us_central1, azure_northeurope
datahub/ingestion/source/sql/stored_procedures/base.py
CHANGED
@@ -26,6 +26,7 @@ from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,
     DataTransformClass,
     DataTransformLogicClass,
+    QueryLanguageClass,
     QueryStatementClass,
     SubTypesClass,
 )
@@ -176,7 +177,17 @@ def _generate_job_workunits(
             DataTransformClass(
                 queryStatement=QueryStatementClass(
                     value=procedure.procedure_definition,
-                    language=
+                    language=(
+                        QueryLanguageClass.SQL
+                        if procedure.language == "SQL"
+                        # The language field uses a pretty limited enum.
+                        # The "UNKNOWN" enum value is pretty new, so we don't want to
+                        # emit it until it has broader server-side support. As a
+                        # short-term solution, we map all languages to "SQL".
+                        # TODO: Once we've released server 1.1.0, we should change
+                        # this to be "UNKNOWN" for all languages except "SQL".
+                        else QueryLanguageClass.SQL
+                    ),
                 ),
             )
         ]
datahub/ingestion/source/sql/trino.py
CHANGED
@@ -128,9 +128,10 @@ def get_table_comment(self, connection, table_name: str, schema: str = None, **k
     if catalog_name is None:
         raise exc.NoSuchTableError("catalog is required in connection")
     connector_name = get_catalog_connector_name(connection.engine, catalog_name)
-    if
-
-
+    if (
+        connector_name is not None
+        and connector_name in PROPERTIES_TABLE_SUPPORTED_CONNECTORS
+    ):
         properties_table = self._get_full_table(f"{table_name}$properties", schema)
         query = f"SELECT * FROM {properties_table}"
         row = connection.execute(sql.text(query)).fetchone()
datahub/ingestion/source/state/stale_entity_removal_handler.py
CHANGED
@@ -45,7 +45,6 @@ class StatefulStaleMetadataRemovalConfig(StatefulIngestionConfig):
         description="Prevents large amount of soft deletes & the state from committing from accidental changes to the source configuration if the relative change percent in entities compared to the previous state is above the 'fail_safe_threshold'.",
         le=100.0,
         ge=0.0,
-        hidden_from_docs=True,
     )
 
 
datahub/ingestion/source/superset.py
CHANGED
@@ -1,5 +1,6 @@
 import json
 import logging
+import os
 from dataclasses import dataclass, field
 from datetime import datetime
 from functools import lru_cache
@@ -100,6 +101,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
 from datahub.utilities import config_clean
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
+from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
 
 logger = logging.getLogger(__name__)
 
@@ -210,6 +212,11 @@ class SupersetConfig(
         default=10, description="Timeout of single API call to superset."
     )
 
+    max_threads: int = Field(
+        default_factory=lambda: os.cpu_count() or 40,
+        description="Max parallelism for API calls. Defaults to cpuCount or 40",
+    )
+
     # TODO: Check and remove this if no longer needed.
     # Config database_alias is removed from sql sources.
     database_alias: Dict[str, str] = Field(
@@ -339,6 +346,7 @@ class SupersetSource(StatefulIngestionSourceBase):
 
         if response.status_code != 200:
             logger.warning(f"Failed to get {entity_type} data: {response.text}")
+            continue
 
         payload = response.json()
         # Update total_items with the actual count from the response
@@ -501,33 +509,41 @@ class SupersetSource(StatefulIngestionSourceBase):
 
         return dashboard_snapshot
 
-    def
-
-
-
-
-
-
-
-                    f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
-                )
-                continue
-
-            dashboard_snapshot = self.construct_dashboard_from_api_data(
-                dashboard_data
-            )
-        except Exception as e:
-            self.report.warning(
-                f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
-            )
-
-
-
-
-
-
+    def _process_dashboard(self, dashboard_data: Any) -> Iterable[MetadataWorkUnit]:
+        dashboard_title = ""
+        try:
+            dashboard_id = str(dashboard_data.get("id"))
+            dashboard_title = dashboard_data.get("dashboard_title", "")
+            if not self.config.dashboard_pattern.allowed(dashboard_title):
+                self.report.report_dropped(
+                    f"Dashboard '{dashboard_title}' (id: {dashboard_id}) filtered by dashboard_pattern"
+                )
+                return
+            dashboard_snapshot = self.construct_dashboard_from_api_data(dashboard_data)
+        except Exception as e:
+            self.report.warning(
+                f"Failed to construct dashboard snapshot. Dashboard name: {dashboard_data.get('dashboard_title')}. Error: \n{e}"
+            )
+            return
+        mce = MetadataChangeEvent(proposedSnapshot=dashboard_snapshot)
+        yield MetadataWorkUnit(id=dashboard_snapshot.urn, mce=mce)
+        yield from self._get_domain_wu(
+            title=dashboard_title, entity_urn=dashboard_snapshot.urn
+        )
+
+    def emit_dashboard_mces(self) -> Iterable[MetadataWorkUnit]:
+        dashboard_data_list = [
+            (dashboard_data,)
+            for dashboard_data in self.paginate_entity_api_results(
+                "dashboard/", PAGE_SIZE
+            )
+        ]
+
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_dashboard,
+            args_list=dashboard_data_list,
+            max_workers=self.config.max_threads,
+        )
 
     def build_input_fields(
         self,
@@ -762,40 +778,46 @@ class SupersetSource(StatefulIngestionSourceBase):
             entity_urn=chart_urn,
         )
 
-    def
-
-
-
-
-
-
-
-            if datasource_id:
-                dataset_response = self.get_dataset_info(datasource_id)
-                dataset_name = dataset_response.get("result", {}).get(
-                    "table_name", ""
-                )
-
-
-
-
-
-
-
-
-
-
-                f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
-            )
-            continue
+    def _process_chart(self, chart_data: Any) -> Iterable[MetadataWorkUnit]:
+        chart_name = ""
+        try:
+            chart_id = str(chart_data.get("id"))
+            chart_name = chart_data.get("slice_name", "")
+            if not self.config.chart_pattern.allowed(chart_name):
+                self.report.report_dropped(
+                    f"Chart '{chart_name}' (id: {chart_id}) filtered by chart_pattern"
+                )
+                return
+            if self.config.dataset_pattern != AllowDenyPattern.allow_all():
+                datasource_id = chart_data.get("datasource_id")
+                if datasource_id:
+                    dataset_response = self.get_dataset_info(datasource_id)
+                    dataset_name = dataset_response.get("result", {}).get(
+                        "table_name", ""
+                    )
+                    if dataset_name and not self.config.dataset_pattern.allowed(
+                        dataset_name
+                    ):
+                        self.report.warning(
+                            f"Chart '{chart_name}' (id: {chart_id}) uses dataset '{dataset_name}' which is filtered by dataset_pattern"
+                        )
+            yield from self.construct_chart_from_chart_data(chart_data)
+        except Exception as e:
+            self.report.warning(
+                f"Failed to construct chart snapshot. Chart name: {chart_name}. Error: \n{e}"
+            )
+            return
 
+    def emit_chart_mces(self) -> Iterable[MetadataWorkUnit]:
+        chart_data_list = [
+            (chart_data,)
+            for chart_data in self.paginate_entity_api_results("chart/", PAGE_SIZE)
+        ]
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_chart,
+            args_list=chart_data_list,
+            max_workers=self.config.max_threads,
+        )
 
     def gen_schema_fields(self, column_data: List[Dict[str, str]]) -> List[SchemaField]:
         schema_fields: List[SchemaField] = []
@@ -1023,33 +1045,38 @@ class SupersetSource(StatefulIngestionSourceBase):
 
         return dataset_snapshot
 
-    def
-
-
-
-
-
-                self.report.report_dropped(
-                    f"Dataset '{dataset_name}' filtered by dataset_pattern"
-                )
-                continue
-
-            dataset_snapshot = self.construct_dataset_from_dataset_data(
-                dataset_data
-            )
-            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-        except Exception as e:
-            self.report.warning(
-                f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
-            )
-
-
-
-
-
+    def _process_dataset(self, dataset_data: Any) -> Iterable[MetadataWorkUnit]:
+        dataset_name = ""
+        try:
+            dataset_name = dataset_data.get("table_name", "")
+            if not self.config.dataset_pattern.allowed(dataset_name):
+                self.report.report_dropped(
+                    f"Dataset '{dataset_name}' filtered by dataset_pattern"
+                )
+                return
+            dataset_snapshot = self.construct_dataset_from_dataset_data(dataset_data)
+            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
+        except Exception as e:
+            self.report.warning(
+                f"Failed to construct dataset snapshot. Dataset name: {dataset_data.get('table_name')}. Error: \n{e}"
+            )
+            return
+        yield MetadataWorkUnit(id=dataset_snapshot.urn, mce=mce)
+        yield from self._get_domain_wu(
+            title=dataset_data.get("table_name", ""),
+            entity_urn=dataset_snapshot.urn,
+        )
+
+    def emit_dataset_mces(self) -> Iterable[MetadataWorkUnit]:
+        dataset_data_list = [
+            (dataset_data,)
+            for dataset_data in self.paginate_entity_api_results("dataset/", PAGE_SIZE)
+        ]
+        yield from ThreadedIteratorExecutor.process(
+            worker_func=self._process_dataset,
+            args_list=dataset_data_list,
+            max_workers=self.config.max_threads,
+        )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         if self.config.ingest_dashboards:
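The three emit_*_mces rewrites above share one pattern: build a list of single-element argument tuples, then fan the per-entity generator out over a thread pool with ThreadedIteratorExecutor.process, capped by the new max_threads config. A minimal sketch of that pattern outside the Superset source, assuming the executor forwards whatever each worker yields (fetch_page and the page numbers are made up for illustration):

from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor

def fetch_page(page: int):
    # A worker is a generator; whatever it yields is forwarded to the caller.
    # Here we just yield a label per page to keep the sketch self-contained.
    yield f"processed page {page}"

# Each tuple in args_list is unpacked into one worker_func call, mirroring the
# (dashboard_data,) / (chart_data,) / (dataset_data,) tuples built above.
for result in ThreadedIteratorExecutor.process(
    worker_func=fetch_page,
    args_list=[(page,) for page in range(5)],
    max_workers=3,
):
    print(result)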
datahub/ingestion/source/tableau/tableau.py
CHANGED
@@ -1623,7 +1623,7 @@ class TableauSiteSource:
         # if multiple project has name C. Ideal solution is to use projectLuidWithin to avoid duplicate project,
         # however Tableau supports projectLuidWithin in Tableau Cloud June 2022 / Server 2022.3 and later.
         project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
-        if project_luid not in self.tableau_project_registry
+        if project_luid not in self.tableau_project_registry:
             wrk_name: Optional[str] = workbook.get(c.NAME)
             wrk_id: Optional[str] = workbook.get(c.ID)
             prj_name: Optional[str] = workbook.get(c.PROJECT_NAME)
@@ -2253,7 +2253,7 @@ class TableauSiteSource:
         # It is possible due to https://github.com/tableau/server-client-python/issues/1210
         if (
             ds.get(c.LUID)
-            and ds[c.LUID] not in self.datasource_project_map
+            and ds[c.LUID] not in self.datasource_project_map
             and self.report.get_all_datasources_query_failed
         ):
             logger.debug(
@@ -2265,7 +2265,7 @@ class TableauSiteSource:
 
         if (
             ds.get(c.LUID)
-            and ds[c.LUID] in self.datasource_project_map
+            and ds[c.LUID] in self.datasource_project_map
             and self.datasource_project_map[ds[c.LUID]] in self.tableau_project_registry
         ):
             return self.datasource_project_map[ds[c.LUID]]
@@ -3252,7 +3252,7 @@ class TableauSiteSource:
 
         parent_key = None
         project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
-        if project_luid and project_luid in self.tableau_project_registry
+        if project_luid and project_luid in self.tableau_project_registry:
            parent_key = self.gen_project_key(project_luid)
        else:
            workbook_id: Optional[str] = workbook.get(c.ID)