acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2391 -2392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +105 -88
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -28
- datahub/cli/specific/dataset_cli.py +26 -10
- datahub/emitter/mce_builder.py +1 -3
- datahub/emitter/mcp_builder.py +8 -0
- datahub/emitter/request_helper.py +19 -14
- datahub/emitter/response_helper.py +25 -18
- datahub/emitter/rest_emitter.py +23 -7
- datahub/errors.py +8 -0
- datahub/ingestion/api/source.py +7 -2
- datahub/ingestion/api/source_helpers.py +14 -2
- datahub/ingestion/extractor/schema_util.py +1 -0
- datahub/ingestion/graph/client.py +26 -20
- datahub/ingestion/graph/filters.py +62 -17
- datahub/ingestion/sink/datahub_rest.py +2 -2
- datahub/ingestion/source/cassandra/cassandra.py +1 -10
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
- datahub/ingestion/source/common/subtypes.py +17 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
- datahub/ingestion/source/dbt/dbt_common.py +6 -4
- datahub/ingestion/source/dbt/dbt_core.py +4 -6
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_source.py +96 -117
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/ge_data_profiler.py +11 -1
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +394 -0
- datahub/ingestion/source/hex/constants.py +3 -0
- datahub/ingestion/source/hex/hex.py +167 -0
- datahub/ingestion/source/hex/mapper.py +372 -0
- datahub/ingestion/source/hex/model.py +68 -0
- datahub/ingestion/source/iceberg/iceberg.py +193 -140
- datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
- datahub/ingestion/source/mlflow.py +217 -8
- datahub/ingestion/source/mode.py +11 -1
- datahub/ingestion/source/openapi.py +69 -34
- datahub/ingestion/source/powerbi/config.py +31 -4
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +41 -24
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
- datahub/ingestion/source/redshift/lineage_v2.py +9 -1
- datahub/ingestion/source/redshift/query.py +1 -1
- datahub/ingestion/source/s3/source.py +11 -0
- datahub/ingestion/source/sigma/config.py +3 -4
- datahub/ingestion/source/sigma/sigma.py +10 -6
- datahub/ingestion/source/slack/slack.py +399 -82
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
- datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
- datahub/ingestion/source/sql/mssql/job_models.py +15 -1
- datahub/ingestion/source/sql/mssql/source.py +8 -4
- datahub/ingestion/source/sql/oracle.py +51 -4
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
- datahub/ingestion/source/superset.py +291 -35
- datahub/ingestion/source/usage/usage_common.py +0 -65
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1055 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
- datahub/metadata/_schema_classes.py +472 -1
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/schema.avsc +313 -2
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
- datahub/metadata/schemas/QueryProperties.avsc +20 -0
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/dataset.py +122 -0
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +27 -3
- datahub/sdk/main_client.py +24 -1
- datahub/sdk/search_client.py +81 -8
- datahub/sdk/search_filters.py +94 -37
- datahub/sql_parsing/split_statements.py +17 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
- datahub/sql_parsing/tool_meta_extractor.py +27 -2
- datahub/testing/mcp_diff.py +1 -18
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/ingestion/source/vertexai.py +0 -697
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/mlflow.py CHANGED

@@ -1,9 +1,11 @@
+import json
+import os
 import time
 from dataclasses import dataclass
-from typing import Any, Callable, Iterable, List, Optional, TypeVar, Union
+from typing import Any, Callable, Iterable, List, Optional, Tuple, TypeVar, Union

 from mlflow import MlflowClient
-from mlflow.entities import Experiment, Run
+from mlflow.entities import Dataset as MlflowDataset, Experiment, Run
 from mlflow.entities.model_registry import ModelVersion, RegisteredModel
 from mlflow.store.entities import PagedList
 from pydantic.fields import Field
@@ -29,6 +31,7 @@ from datahub.ingestion.api.source import (
     SourceReport,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.data_platforms import KNOWN_VALID_PLATFORM_NAMES
 from datahub.ingestion.source.common.subtypes import MLAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -42,6 +45,7 @@ from datahub.metadata.schema_classes import (
     AuditStampClass,
     ContainerClass,
     DataPlatformInstanceClass,
+    DataProcessInstanceInputClass,
     DataProcessInstanceOutputClass,
     DataProcessInstancePropertiesClass,
     DataProcessInstanceRunEventClass,
@@ -60,16 +64,15 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
     TagPropertiesClass,
     TimeStampClass,
+    UpstreamClass,
+    UpstreamLineageClass,
     VersionPropertiesClass,
     VersionTagClass,
     _Aspect,
 )
-from datahub.metadata.urns import (
-    DataPlatformUrn,
-    MlModelUrn,
-    VersionSetUrn,
-)
+from datahub.metadata.urns import DataPlatformUrn, DatasetUrn, MlModelUrn, VersionSetUrn
 from datahub.sdk.container import Container
+from datahub.sdk.dataset import Dataset

 T = TypeVar("T")

@@ -105,6 +108,20 @@ class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
             " If neither is set, external URLs are not generated."
         ),
     )
+    materialize_dataset_inputs: Optional[bool] = Field(
+        default=False,
+        description="Whether to materialize dataset inputs for each run",
+    )
+    source_mapping_to_platform: Optional[dict] = Field(
+        default=None, description="Mapping of source type to datahub platform"
+    )
+
+    username: Optional[str] = Field(
+        default=None, description="Username for MLflow authentication"
+    )
+    password: Optional[str] = Field(
+        default=None, description="Password for MLflow authentication"
+    )


 @dataclass
@@ -152,7 +169,17 @@ class MLflowSource(StatefulIngestionSourceBase):
         self.ctx = ctx
         self.config = config
         self.report = StaleEntityRemovalSourceReport()
-        self.client = MlflowClient(
+        self.client = self._configure_client()
+
+    def _configure_client(self) -> MlflowClient:
+        if bool(self.config.username) != bool(self.config.password):
+            raise ValueError("Both username and password must be set together")
+
+        if self.config.username and self.config.password:
+            os.environ["MLFLOW_TRACKING_USERNAME"] = self.config.username
+            os.environ["MLFLOW_TRACKING_PASSWORD"] = self.config.password
+
+        return MlflowClient(
             tracking_uri=self.config.tracking_uri,
             registry_uri=self.config.registry_uri,
         )
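The credential handling above can be summarized outside the source. The following sketch uses a placeholder tracking URI and credentials (not values from the diff) and mirrors the new validation plus MLflow's standard basic-auth environment variables:

# Illustrative sketch only: mirrors the checks in _configure_client() above.
import os
from typing import Optional

from mlflow import MlflowClient


def build_client(
    tracking_uri: str, username: Optional[str] = None, password: Optional[str] = None
) -> MlflowClient:
    # Same rule as the new _configure_client(): credentials must come as a pair.
    if bool(username) != bool(password):
        raise ValueError("Both username and password must be set together")
    if username and password:
        # MLflow's tracking client picks up basic-auth credentials from these env vars.
        os.environ["MLFLOW_TRACKING_USERNAME"] = username
        os.environ["MLFLOW_TRACKING_PASSWORD"] = password
    return MlflowClient(tracking_uri=tracking_uri)


client = build_client("http://localhost:5000", "mlflow-user", "mlflow-password")

Routing credentials through the environment variables lets MlflowClient pick them up without changing its constructor call.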
@@ -213,6 +240,7 @@ class MLflowSource(StatefulIngestionSourceBase):
         if runs:
             for run in runs:
                 yield from self._get_run_workunits(experiment, run)
+                yield from self._get_dataset_input_workunits(run)

     def _get_experiment_custom_properties(self, experiment):
         experiment_custom_props = getattr(experiment, "tags", {}) or {}
@@ -262,6 +290,183 @@ class MLflowSource(StatefulIngestionSourceBase):
                 type="SKIPPED", nativeResultType=self.platform
             )

+    def _get_dataset_schema(
+        self, dataset: MlflowDataset
+    ) -> Optional[List[Tuple[str, str]]]:
+        try:
+            schema_dict = json.loads(dataset.schema)
+        except json.JSONDecodeError:
+            self.report.warning(
+                title="Failed to load dataset schema",
+                message="Schema metadata will be missing due to a JSON parsing error.",
+                context=f"Dataset: {dataset.name}, Schema: {dataset.schema}",
+            )
+            return None
+
+        if "mlflow_colspec" in schema_dict:
+            try:
+                return [
+                    (field["name"], field["type"])
+                    for field in schema_dict["mlflow_colspec"]
+                ]
+            except (KeyError, TypeError):
+                return None
+        # If the schema is not formatted, return None
+        return None
+
+    def _get_external_dataset_urn(self, platform: str, dataset_name: str) -> str:
+        """
+        Get the URN for an external dataset.
+        Args:
+            platform: The platform of the external dataset (e.g., 's3', 'bigquery')
+            dataset: The MLflow dataset
+        Returns:
+            str: The URN of the external dataset
+        """
+        return str(DatasetUrn(platform=platform, name=dataset_name))
+
+    def _get_dataset_input_workunits(self, run: Run) -> Iterable[MetadataWorkUnit]:
+        """
+        Generate workunits for dataset inputs in a run.
+
+        For each dataset input:
+        1. If source type is 'local' or 'code':
+            - Create a local dataset reference
+        2. Otherwise:
+            - If materialization is enabled:
+                - Create a hosted dataset and a dataset reference with upstream
+            - If materialization is not enabled:
+                - Create a dataset reference and add upstream if dataset exists
+        3. Add all dataset references as upstreams for the run
+        """
+        run_urn = DataProcessInstance(
+            id=run.info.run_id,
+            orchestrator=self.platform,
+        ).urn
+
+        dataset_reference_urns = []
+
+        for dataset_input in run.inputs.dataset_inputs:
+            dataset = dataset_input.dataset
+            source_type = dataset.source_type
+            dataset_tags = {k[1]: v[1] for k, v in dataset_input.tags}
+
+            # Prepare dataset properties
+            custom_properties = dataset_tags
+            formatted_schema = self._get_dataset_schema(dataset)
+            if formatted_schema is None:
+                custom_properties["schema"] = dataset.schema
+
+            # Handle local/code datasets
+            if source_type in ("local", "code"):
+                local_dataset = Dataset(
+                    platform=self.platform,
+                    name=dataset.name,
+                    schema=formatted_schema,
+                    custom_properties=custom_properties,
+                )
+                yield from local_dataset.as_workunits()
+                dataset_reference_urns.append(local_dataset.urn)
+                continue
+
+            # Handle hosted datasets
+            formatted_platform = self._get_dataset_platform_from_source_type(
+                source_type
+            )
+
+            # Validate platform if materialization is enabled
+            if self.config.materialize_dataset_inputs:
+                if not formatted_platform:
+                    self.report.failure(
+                        title="Unable to materialize dataset inputs",
+                        message=f"No mapping dataPlatform found for dataset input source type '{source_type}'",
+                        context=f"please add `materialize_dataset_inputs.source_mapping_to_platform` in config "
+                        f"(e.g. '{source_type}': 'snowflake')",
+                    )
+                    continue
+                # Create hosted dataset
+                hosted_dataset = Dataset(
+                    platform=formatted_platform,
+                    name=dataset.name,
+                    schema=formatted_schema,
+                    custom_properties=dataset_tags,
+                )
+                yield from hosted_dataset.as_workunits()
+
+            # Create dataset reference with upstream
+            hosted_dataset_reference = Dataset(
+                platform=self.platform,
+                name=dataset.name,
+                schema=formatted_schema,
+                custom_properties=dataset_tags,
+                upstreams=UpstreamLineageClass(
+                    upstreams=[
+                        UpstreamClass(
+                            self._get_external_dataset_urn(
+                                formatted_platform, dataset.name
+                            ),
+                            type="COPY",
+                        )
+                    ]
+                )
+                if formatted_platform
+                else None,
+            )
+            dataset_reference_urns.append(hosted_dataset_reference.urn)
+            yield from hosted_dataset_reference.as_workunits()
+
+        # Add dataset references as upstreams for the run
+        if dataset_reference_urns:
+            input_edges = [
+                EdgeClass(destinationUrn=str(dataset_ref_urn))
+                for dataset_ref_urn in dataset_reference_urns
+            ]
+            yield MetadataChangeProposalWrapper(
+                entityUrn=str(run_urn),
+                aspect=DataProcessInstanceInputClass(inputs=[], inputEdges=input_edges),
+            ).as_workunit()
+
+    def _get_dataset_platform_from_source_type(self, source_type: str) -> Optional[str]:
+        """
+        Map MLflow source type to DataHub platform.
+
+        Priority:
+        1. User-provided mapping in config
+        2. Internal mapping
+        3. Direct platform match from list of supported platforms
+        """
+        source_type = source_type.lower()
+
+        # User-provided mapping
+        platform = self._get_platform_from_user_mapping(source_type)
+        if platform:
+            return platform
+
+        # Internal mapping
+        if source_type == "gs":
+            return "gcs"
+
+        # Check direct platform match
+        if self._is_valid_platform(source_type):
+            return source_type
+
+        return None
+
+    def _get_platform_from_user_mapping(self, source_type: str) -> Optional[str]:
+        """
+        Get platform from user-provided mapping in config.
+        Returns None if mapping is invalid or platform is not supported.
+        """
+        source_mapping = self.config.source_mapping_to_platform
+        if not source_mapping:
+            return None
+
+        platform = source_mapping.get(source_type)
+        if not platform:
+            return None
+
+        return platform
+
     def _get_run_workunits(
         self, experiment: Experiment, run: Run
     ) -> Iterable[MetadataWorkUnit]:
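The resolution order implemented by _get_dataset_platform_from_source_type and _get_platform_from_user_mapping can be illustrated with a standalone sketch; the platform list and mapping values below are illustrative stand-ins, not the source's own data:

# Standalone illustration of the resolution priority; not the source's own code.
from typing import Dict, Optional

KNOWN_VALID_PLATFORM_NAMES = ["s3", "gcs", "snowflake", "bigquery"]  # abbreviated stand-in


def resolve_platform(source_type: str, user_mapping: Optional[Dict[str, str]] = None) -> Optional[str]:
    source_type = source_type.lower()
    if user_mapping and user_mapping.get(source_type):  # 1. user-provided mapping wins
        return user_mapping[source_type]
    if source_type == "gs":                             # 2. built-in alias
        return "gcs"
    if source_type in KNOWN_VALID_PLATFORM_NAMES:       # 3. direct platform match
        return source_type
    return None


assert resolve_platform("gs") == "gcs"
assert resolve_platform("delta_table", {"delta_table": "databricks"}) == "databricks"
assert resolve_platform("http") is None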
@@ -659,6 +864,10 @@ class MLflowSource(StatefulIngestionSourceBase):
         )
         return wu

+    def _is_valid_platform(self, platform: Optional[str]) -> bool:
+        """Check if platform is registered as a source plugin"""
+        return platform in KNOWN_VALID_PLATFORM_NAMES
+
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "MLflowSource":
         config = MLflowConfig.parse_obj(config_dict)

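Taken together, the new MLflow options could be wired up as in the hypothetical snippet below; the tracking server, credentials, and source-to-platform mapping are placeholders rather than values taken from the diff:

# Hypothetical usage of the new options; all values are placeholders.
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.mlflow import MLflowSource

config_dict = {
    "tracking_uri": "http://localhost:5000",        # placeholder MLflow server
    "username": "mlflow-user",                      # placeholder credentials
    "password": "mlflow-password",
    "materialize_dataset_inputs": True,
    "source_mapping_to_platform": {"delta_table": "databricks"},  # placeholder mapping
}

source = MLflowSource.create(config_dict, PipelineContext(run_id="mlflow-demo"))
# source.get_workunits() would then emit the run, model, and dataset-input
# metadata described by the hunks above.
print(type(source).__name__)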
datahub/ingestion/source/mode.py CHANGED

@@ -33,6 +33,7 @@ from datahub.emitter.mcp_builder import (
     add_dataset_to_container,
     gen_containers,
 )
+from datahub.emitter.request_helper import make_curl_command
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -339,7 +340,8 @@ class ModeSource(StatefulIngestionSourceBase):

         # Test the connection
         try:
-            self._get_request_json(f"{self.config.connect_uri}/api/verify")
+            key_info = self._get_request_json(f"{self.config.connect_uri}/api/verify")
+            logger.debug(f"Auth info: {key_info}")
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Connect",
@@ -1485,12 +1487,17 @@
 
         @r.wraps
         def get_request():
+            curl_command = make_curl_command(self.session, "GET", url, "")
+            logger.debug(f"Issuing request; curl equivalent: {curl_command}")
+
             try:
                 response = self.session.get(
                     url, timeout=self.config.api_options.timeout
                 )
                 if response.status_code == 204:  # No content, don't parse json
                     return {}
+
+                response.raise_for_status()
                 return response.json()
             except HTTPError as http_error:
                 error_response = http_error.response
@@ -1501,6 +1508,9 @@
                     time.sleep(float(sleep_time))
                     raise HTTPError429 from None

+                logger.debug(
+                    f"Error response ({error_response.status_code}): {error_response.text}"
+                )
                 raise http_error

         return get_request()
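The curl-equivalent debug logging added to ModeSource can be reproduced in isolation; the endpoint, auth header, and timeout below are placeholders standing in for the source's configured values:

# Minimal sketch of the debug pattern above; endpoint and token are hypothetical.
import logging

import requests

from datahub.emitter.request_helper import make_curl_command

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

session = requests.Session()
session.headers.update({"Authorization": "Token <redacted>"})  # hypothetical auth header

url = "https://app.mode.com/api/verify"  # assumed Mode endpoint
logger.debug(f"Issuing request; curl equivalent: {make_curl_command(session, 'GET', url, '')}")

response = session.get(url, timeout=40)
response.raise_for_status()
print(response.json())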
datahub/ingestion/source/openapi.py CHANGED

@@ -2,13 +2,14 @@ import logging
 import time
 import warnings
 from abc import ABC
-from typing import Dict, Iterable, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Tuple

 from pydantic import validator
 from pydantic.fields import Field

 from datahub.configuration.common import ConfigModel
 from datahub.emitter.mce_builder import make_tag_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -20,6 +21,7 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.openapi_parser import (
     clean_url,
     compose_url_attr,
@@ -32,14 +34,13 @@ from datahub.ingestion.source.openapi_parser import (
     set_metadata,
     try_guessing,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
     AuditStampClass,
     DatasetPropertiesClass,
     GlobalTagsClass,
     InstitutionalMemoryClass,
     InstitutionalMemoryMetadataClass,
+    SubTypesClass,
     TagAssociationClass,
 )

@@ -222,8 +223,9 @@ class APISource(Source, ABC):

     def init_dataset(
         self, endpoint_k: str, endpoint_dets: dict
-    ) -> Tuple[DatasetSnapshot, str]:
+    ) -> Tuple[str, str, List[MetadataWorkUnit]]:
         config = self.config
+        workunits = []

         dataset_name = endpoint_k[1:].replace("/", ".")

@@ -233,22 +235,27 @@
         else:
             dataset_name = "root"

-        dataset_snapshot = DatasetSnapshot(
-            urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{config.name}.{dataset_name},PROD)",
-            aspects=[],
-        )
+        dataset_urn = f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{config.name}.{dataset_name},PROD)"

-        #
-        dataset_properties = DatasetPropertiesClass(
+        # Create dataset properties aspect
+        properties = DatasetPropertiesClass(
             description=endpoint_dets["description"], customProperties={}
         )
-        dataset_snapshot.aspects.append(dataset_properties)
+        wu = MetadataWorkUnit(
+            id=dataset_name,
+            mcp=MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=properties),
+        )
+        workunits.append(wu)

-        #
+        # Create tags aspect
         tags_str = [make_tag_urn(t) for t in endpoint_dets["tags"]]
         tags_tac = [TagAssociationClass(t) for t in tags_str]
         gtc = GlobalTagsClass(tags_tac)
-        dataset_snapshot.aspects.append(gtc)
+        wu = MetadataWorkUnit(
+            id=f"{dataset_name}-tags",
+            mcp=MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=gtc),
+        )
+        workunits.append(wu)

         # the link will appear in the "documentation"
         link_url = clean_url(config.url + self.url_basepath + endpoint_k)
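The refactor replaces MCE snapshot emission with per-aspect MetadataChangeProposal workunits. A minimal standalone sketch of that pattern, using a made-up platform and dataset name in the URN, looks like this:

# Sketch of the MCP-based workunit pattern the refactor moves to; the URN is hypothetical.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.metadata.schema_classes import DatasetPropertiesClass

dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:OpenApi,demo_api.pets,PROD)"
properties = DatasetPropertiesClass(description="GET /pets endpoint", customProperties={})

wu = MetadataWorkUnit(
    id="demo_api.pets",
    mcp=MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=properties),
)
print(wu.id)

Each aspect (properties, tags, documentation, subtype, schema) becomes its own workunit against the same dataset URN, which is exactly what the rewritten init_dataset now returns to the caller.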
@@ -260,17 +267,25 @@ class APISource(Source, ABC):
             url=link_url, description=link_description, createStamp=creation
         )
         inst_memory = InstitutionalMemoryClass([link_metadata])
-        dataset_snapshot.aspects.append(inst_memory)
+        wu = MetadataWorkUnit(
+            id=f"{dataset_name}-docs",
+            mcp=MetadataChangeProposalWrapper(
+                entityUrn=dataset_urn, aspect=inst_memory
+            ),
+        )
+        workunits.append(wu)

-        return dataset_snapshot, dataset_name
+        # Create subtype aspect
+        sub_types = SubTypesClass(typeNames=[DatasetSubTypes.API_ENDPOINT])
+        wu = MetadataWorkUnit(
+            id=f"{dataset_name}-subtype",
+            mcp=MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=sub_types),
+        )
+        workunits.append(wu)

-    def build_wu(
-        self, dataset_snapshot: DatasetSnapshot, dataset_name: str
-    ) -> ApiWorkUnit:
-        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-        return ApiWorkUnit(id=dataset_name, mce=mce)
+        return dataset_name, dataset_urn, workunits

-    def get_workunits_internal(self) -> Iterable[ApiWorkUnit]:
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         config = self.config

         sw_dict = self.config.get_swagger()
@@ -294,16 +309,24 @@
             if endpoint_k in config.ignore_endpoints:
                 continue

-            dataset_snapshot, dataset_name = self.init_dataset(
+            # Initialize dataset and get common aspects
+            dataset_name, dataset_urn, workunits = self.init_dataset(
                 endpoint_k, endpoint_dets
             )
+            for wu in workunits:
+                yield wu

-            #
+            # Handle schema metadata if available
             if "data" in endpoint_dets.keys():
                 # we are lucky! data is defined in the swagger for this endpoint
                 schema_metadata = set_metadata(dataset_name, endpoint_dets["data"])
-                dataset_snapshot.aspects.append(schema_metadata)
-                yield self.build_wu(dataset_snapshot, dataset_name)
+                wu = MetadataWorkUnit(
+                    id=f"{dataset_name}-schema",
+                    mcp=MetadataChangeProposalWrapper(
+                        entityUrn=dataset_urn, aspect=schema_metadata
+                    ),
+                )
+                yield wu
             elif endpoint_dets["method"] != "get":
                 self.report.report_warning(
                     title="Failed to Extract Endpoint Metadata",
@@ -338,9 +361,13 @@
                         context=f"Endpoint Type: {endpoint_k}, Name: {dataset_name}",
                     )
                     schema_metadata = set_metadata(dataset_name, fields2add)
-                    dataset_snapshot.aspects.append(schema_metadata)
-
-                    yield self.build_wu(dataset_snapshot, dataset_name)
+                    wu = MetadataWorkUnit(
+                        id=f"{dataset_name}-schema",
+                        mcp=MetadataChangeProposalWrapper(
+                            entityUrn=dataset_urn, aspect=schema_metadata
+                        ),
+                    )
+                    yield wu
                 else:
                     self.report_bad_responses(response.status_code, type=endpoint_k)
             else:
@@ -369,9 +396,13 @@
                         context=f"Endpoint Type: {endpoint_k}, Name: {dataset_name}",
                     )
                     schema_metadata = set_metadata(dataset_name, fields2add)
-                    dataset_snapshot.aspects.append(schema_metadata)
-
-                    yield self.build_wu(dataset_snapshot, dataset_name)
+                    wu = MetadataWorkUnit(
+                        id=f"{dataset_name}-schema",
+                        mcp=MetadataChangeProposalWrapper(
+                            entityUrn=dataset_urn, aspect=schema_metadata
+                        ),
+                    )
+                    yield wu
                 else:
                     self.report_bad_responses(response.status_code, type=endpoint_k)
             else:
@@ -400,9 +431,13 @@
                         context=f"Endpoint Type: {endpoint_k}, Name: {dataset_name}",
                     )
                     schema_metadata = set_metadata(dataset_name, fields2add)
-                    dataset_snapshot.aspects.append(schema_metadata)
-
-                    yield self.build_wu(dataset_snapshot, dataset_name)
+                    wu = MetadataWorkUnit(
+                        id=f"{dataset_name}-schema",
+                        mcp=MetadataChangeProposalWrapper(
+                            entityUrn=dataset_urn, aspect=schema_metadata
+                        ),
+                    )
+                    yield wu
                 else:
                     self.report_bad_responses(response.status_code, type=endpoint_k)

datahub/ingestion/source/powerbi/config.py CHANGED

@@ -11,6 +11,9 @@ import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
+from datahub.ingestion.api.incremental_lineage_helper import (
+    IncrementalLineageConfigMixin,
+)
 from datahub.ingestion.source.common.subtypes import BIAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
@@ -19,6 +22,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.global_warning_util import add_global_warning
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer

@@ -183,6 +187,11 @@ class SupportedDataPlatform(Enum):
         datahub_data_platform_name="databricks",
     )

+    MYSQL = DataPlatformPair(
+        powerbi_data_platform_name="MySQL",
+        datahub_data_platform_name="mysql",
+    )
+

 @dataclass
 class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
@@ -275,7 +284,7 @@ class PowerBiProfilingConfig(ConfigModel):


 class PowerBiDashboardSourceConfig(
-    StatefulIngestionConfigBase, DatasetSourceConfigMixin
+    StatefulIngestionConfigBase, DatasetSourceConfigMixin, IncrementalLineageConfigMixin
 ):
     platform_name: str = pydantic.Field(
         default=Constant.PLATFORM_NAME, hidden_from_docs=True
@@ -297,7 +306,15 @@ class PowerBiDashboardSourceConfig(
     # PowerBi workspace identifier
     workspace_id_pattern: AllowDenyPattern = pydantic.Field(
         default=AllowDenyPattern.allow_all(),
-        description="Regex patterns to filter PowerBI workspaces in ingestion."
+        description="Regex patterns to filter PowerBI workspaces in ingestion by ID."
+        " By default all IDs are allowed unless they are filtered by name using 'workspace_name_pattern'."
+        " Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.",
+    )
+    # PowerBi workspace name
+    workspace_name_pattern: AllowDenyPattern = pydantic.Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns to filter PowerBI workspaces in ingestion by name."
+        " By default all names are allowed unless they are filtered by ID using 'workspace_id_pattern'."
         " Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.",
     )

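One plausible reading of how the new workspace_name_pattern composes with workspace_id_pattern (a workspace must be allowed by both) can be sketched directly with AllowDenyPattern; the IDs and names below are hypothetical and the real filtering lives inside the PowerBI source:

# Illustration of AllowDenyPattern semantics for the new name filter; values are made up.
from datahub.configuration.common import AllowDenyPattern

workspace_id_pattern = AllowDenyPattern.allow_all()
workspace_name_pattern = AllowDenyPattern(allow=[".*"], deny=["^Personal workspace.*"])


def workspace_allowed(workspace_id: str, workspace_name: str) -> bool:
    # Both filters must pass for the workspace to be ingested.
    return workspace_id_pattern.allowed(workspace_id) and workspace_name_pattern.allowed(
        workspace_name
    )


print(workspace_allowed("a1b2c3d4-0000-0000-0000-000000000000", "Finance Analytics"))          # True
print(workspace_allowed("a1b2c3d4-0000-0000-0000-000000000000", "Personal workspace of Jane"))  # False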
@@ -373,8 +390,9 @@
     )
     # Enable/Disable extracting dataset schema
     extract_dataset_schema: bool = pydantic.Field(
-        default=False,
-        description="Whether to ingest PBI Dataset Table columns and measures"
+        default=True,
+        description="Whether to ingest PBI Dataset Table columns and measures."
+        " Note: this setting must be `true` for schema extraction and column lineage to be enabled.",
     )
     # Enable/Disable extracting lineage information of PowerBI Dataset
     extract_lineage: bool = pydantic.Field(
@@ -510,6 +528,7 @@ class PowerBiDashboardSourceConfig(
             "native_query_parsing",
             "enable_advance_lineage_sql_construct",
             "extract_lineage",
+            "extract_dataset_schema",
         ]

         if (
@@ -575,3 +594,11 @@ class PowerBiDashboardSourceConfig(
         )

         return values
+
+    @root_validator(skip_on_failure=True)
+    def validate_extract_dataset_schema(cls, values: Dict) -> Dict:
+        if values.get("extract_dataset_schema") is False:
+            add_global_warning(
+                "Please use `extract_dataset_schema: true`, otherwise dataset schema extraction will be skipped."
+            )
+        return values
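The new validator warns rather than failing. A stripped-down sketch of the same pydantic pattern behaves as follows; it is not the real PowerBiDashboardSourceConfig (which has many more required fields) and substitutes a plain warning for add_global_warning:

# Standalone sketch of the validator pattern added above.
import warnings
from typing import Dict

from pydantic import BaseModel, root_validator


class MiniPowerBiConfig(BaseModel):
    extract_dataset_schema: bool = True  # new default mirrors the diff

    @root_validator(skip_on_failure=True)
    def validate_extract_dataset_schema(cls, values: Dict) -> Dict:
        if values.get("extract_dataset_schema") is False:
            # The source calls add_global_warning(); a plain warning stands in here.
            warnings.warn(
                "Please use `extract_dataset_schema: true`, otherwise dataset "
                "schema extraction will be skipped."
            )
        return values


MiniPowerBiConfig(extract_dataset_schema=False)  # emits the warning, still constructs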