acryl-datahub 1.0.0.2rc4__py3-none-any.whl → 1.0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/METADATA +2566 -2514
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/RECORD +159 -149
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +3 -3
- datahub/api/entities/datajob/datajob.py +7 -4
- datahub/api/entities/dataset/dataset.py +9 -11
- datahub/api/entities/forms/forms.py +34 -34
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/operation.py +4 -4
- datahub/cli/check_cli.py +3 -2
- datahub/cli/config_utils.py +2 -2
- datahub/cli/delete_cli.py +6 -5
- datahub/cli/docker_cli.py +2 -2
- datahub/cli/exists_cli.py +2 -1
- datahub/cli/get_cli.py +2 -1
- datahub/cli/iceberg_cli.py +6 -5
- datahub/cli/ingest_cli.py +9 -6
- datahub/cli/migrate.py +4 -3
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +3 -2
- datahub/cli/specific/assertions_cli.py +2 -1
- datahub/cli/specific/datacontract_cli.py +3 -2
- datahub/cli/specific/dataproduct_cli.py +10 -9
- datahub/cli/specific/dataset_cli.py +4 -3
- datahub/cli/specific/forms_cli.py +2 -1
- datahub/cli/specific/group_cli.py +2 -1
- datahub/cli/specific/structuredproperties_cli.py +4 -3
- datahub/cli/specific/user_cli.py +2 -1
- datahub/cli/state_cli.py +2 -1
- datahub/cli/timeline_cli.py +2 -1
- datahub/configuration/common.py +5 -0
- datahub/configuration/source_common.py +1 -1
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/request_helper.py +116 -3
- datahub/emitter/rest_emitter.py +163 -93
- datahub/entrypoints.py +2 -1
- datahub/errors.py +4 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
- datahub/ingestion/api/source.py +2 -5
- datahub/ingestion/api/source_helpers.py +1 -0
- datahub/ingestion/glossary/classification_mixin.py +4 -2
- datahub/ingestion/graph/client.py +33 -8
- datahub/ingestion/graph/config.py +14 -0
- datahub/ingestion/graph/filters.py +1 -1
- datahub/ingestion/graph/links.py +53 -0
- datahub/ingestion/run/pipeline.py +9 -6
- datahub/ingestion/run/pipeline_config.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +5 -6
- datahub/ingestion/source/apply/datahub_apply.py +2 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
- datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
- datahub/ingestion/source/dbt/dbt_common.py +10 -2
- datahub/ingestion/source/dbt/dbt_core.py +82 -42
- datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
- datahub/ingestion/source/feast.py +4 -4
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
- datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
- datahub/ingestion/source/ge_data_profiler.py +27 -1
- datahub/ingestion/source/hex/api.py +1 -20
- datahub/ingestion/source/hex/query_fetcher.py +4 -1
- datahub/ingestion/source/iceberg/iceberg.py +20 -4
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +17 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
- datahub/ingestion/source/looker/looker_source.py +34 -5
- datahub/ingestion/source/looker/lookml_source.py +7 -1
- datahub/ingestion/source/metadata/lineage.py +2 -1
- datahub/ingestion/source/mlflow.py +19 -6
- datahub/ingestion/source/mode.py +74 -28
- datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
- datahub/ingestion/source/powerbi/config.py +13 -1
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/redshift/usage.py +10 -9
- datahub/ingestion/source/sigma/config.py +74 -6
- datahub/ingestion/source/sigma/sigma.py +16 -1
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +4 -52
- datahub/ingestion/source/snowflake/snowflake_config.py +2 -12
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -18
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
- datahub/ingestion/source/snowflake/snowflake_query.py +9 -63
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/clickhouse.py +5 -1
- datahub/ingestion/source/sql/druid.py +7 -2
- datahub/ingestion/source/sql/hive.py +7 -2
- datahub/ingestion/source/sql/hive_metastore.py +5 -5
- datahub/ingestion/source/sql/mssql/source.py +1 -1
- datahub/ingestion/source/sql/oracle.py +6 -2
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +31 -6
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +2 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
- datahub/ingestion/source/vertexai/vertexai.py +316 -4
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
- datahub/integrations/assertion/common.py +3 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +538 -493
- datahub/metadata/_urns/urn_defs.py +1819 -1763
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +17296 -16883
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
- datahub/metadata/schemas/QueryProperties.avsc +4 -2
- datahub/metadata/schemas/SystemMetadata.avsc +86 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/_all_entities.py +4 -0
- datahub/sdk/_shared.py +142 -4
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/dataset.py +2 -2
- datahub/sdk/entity_client.py +8 -0
- datahub/sdk/lineage_client.py +235 -0
- datahub/sdk/main_client.py +6 -3
- datahub/sdk/mlmodel.py +301 -0
- datahub/sdk/mlmodelgroup.py +233 -0
- datahub/secret/datahub_secret_store.py +2 -1
- datahub/specific/dataset.py +12 -0
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
- datahub/sql_parsing/sqlglot_utils.py +18 -14
- datahub/telemetry/telemetry.py +2 -2
- datahub/testing/check_imports.py +1 -1
- datahub/testing/mcp_diff.py +15 -2
- datahub/upgrade/upgrade.py +10 -12
- datahub/utilities/logging_manager.py +8 -1
- datahub/utilities/server_config_util.py +350 -10
- datahub/utilities/sqlalchemy_query_combiner.py +4 -5
- datahub/utilities/urn_encoder.py +1 -1
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/vertexai/vertexai.py (+316 -4):

```diff
@@ -1,5 +1,6 @@
 import dataclasses
 import logging
+from datetime import datetime, timedelta
 from typing import Dict, Iterable, List, Optional, Tuple, TypeVar, Union
 
 from google.api_core.exceptions import GoogleAPICallError
@@ -12,15 +13,22 @@ from google.cloud.aiplatform import (
     AutoMLVideoTrainingJob,
     Endpoint,
     ExperimentRun,
+    PipelineJob,
 )
 from google.cloud.aiplatform.base import VertexAiResourceNoun
 from google.cloud.aiplatform.metadata.execution import Execution
 from google.cloud.aiplatform.metadata.experiment_resources import Experiment
 from google.cloud.aiplatform.models import Model, VersionInfo
 from google.cloud.aiplatform.training_jobs import _TrainingJob
+from google.cloud.aiplatform_v1.types import (
+    PipelineJob as PipelineJobType,
+    PipelineTaskDetail,
+)
 from google.oauth2 import service_account
+from google.protobuf import timestamp_pb2
 
 import datahub.emitter.mce_builder as builder
+from datahub.api.entities.datajob import DataFlow, DataJob
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import (
     ExperimentKey,
@@ -43,6 +51,7 @@ from datahub.ingestion.source.vertexai.vertexai_config import VertexAIConfig
 from datahub.ingestion.source.vertexai.vertexai_result_type_utils import (
     get_execution_result_status,
     get_job_result_status,
+    get_pipeline_task_result_status,
     is_status_for_run_event_class,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.dataprocess import (
@@ -76,7 +85,13 @@ from datahub.metadata.schema_classes import (
     VersionPropertiesClass,
     VersionTagClass,
 )
-from datahub.metadata.urns import
+from datahub.metadata.urns import (
+    DataFlowUrn,
+    DataJobUrn,
+    DataPlatformUrn,
+    MlModelUrn,
+    VersionSetUrn,
+)
 from datahub.utilities.time import datetime_to_ts_millis
 
 T = TypeVar("T")
@@ -100,6 +115,34 @@ class ModelMetadata:
     endpoints: Optional[List[Endpoint]] = None
 
 
+@dataclasses.dataclass
+class PipelineTaskMetadata:
+    name: str
+    urn: DataJobUrn
+    id: Optional[int] = None
+    type: Optional[str] = None
+    state: Optional[PipelineTaskDetail.State] = None
+    start_time: Optional[timestamp_pb2.Timestamp] = None
+    create_time: Optional[timestamp_pb2.Timestamp] = None
+    end_time: Optional[timestamp_pb2.Timestamp] = None
+    upstreams: Optional[List[DataJobUrn]] = None
+    duration: Optional[int] = None
+
+
+@dataclasses.dataclass
+class PipelineMetadata:
+    name: str
+    resource_name: str
+    tasks: List[PipelineTaskMetadata]
+    urn: DataFlowUrn
+    id: Optional[str] = None
+    labels: Optional[Dict[str, str]] = None
+    create_time: Optional[datetime] = None
+    update_time: Optional[datetime] = None
+    duration: Optional[timedelta] = None
+    region: Optional[str] = None
+
+
 @platform_name("Vertex AI", id="vertexai")
 @config_class(VertexAIConfig)
 @support_status(SupportStatus.TESTING)
@@ -150,6 +193,255 @@ class VertexAISource(Source):
         yield from self._get_experiments_workunits()
         # Fetch and Ingest Experiment Runs
         yield from auto_workunit(self._get_experiment_runs_mcps())
+        # Fetch Pipelines and Tasks
+        yield from auto_workunit(self._get_pipelines_mcps())
+
+    def _get_pipelines_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
+        """
+        Fetches pipelines from Vertex AI and generates corresponding mcps.
+        """
+        pipeline_jobs = self.client.PipelineJob.list()
+
+        for pipeline in pipeline_jobs:
+            logger.info(f"fetching pipeline ({pipeline.name})")
+            pipeline_meta = self._get_pipeline_metadata(pipeline)
+            yield from self._get_pipeline_mcps(pipeline_meta)
+            yield from self._gen_pipeline_task_mcps(pipeline_meta)
+
+    def _get_pipeline_tasks_metadata(
+        self, pipeline: PipelineJob, pipeline_urn: DataFlowUrn
+    ) -> List[PipelineTaskMetadata]:
+        tasks: List[PipelineTaskMetadata] = list()
+        task_map: Dict[str, PipelineTaskDetail] = dict()
+        for task in pipeline.task_details:
+            task_map[task.task_name] = task
+
+        resource = pipeline.gca_resource
+        if isinstance(resource, PipelineJobType):
+            for task_name in resource.pipeline_spec["root"]["dag"]["tasks"]:
+                logger.debug(
+                    f"fetching pipeline task ({task_name}) in pipeline ({pipeline.name})"
+                )
+                task_urn = DataJobUrn.create_from_ids(
+                    data_flow_urn=str(pipeline_urn),
+                    job_id=self._make_vertexai_pipeline_task_id(task_name),
+                )
+                task_meta = PipelineTaskMetadata(name=task_name, urn=task_urn)
+                if (
+                    "dependentTasks"
+                    in resource.pipeline_spec["root"]["dag"]["tasks"][task_name]
+                ):
+                    upstream_tasks = resource.pipeline_spec["root"]["dag"]["tasks"][
+                        task_name
+                    ]["dependentTasks"]
+                    upstream_urls = [
+                        DataJobUrn.create_from_ids(
+                            data_flow_urn=str(pipeline_urn),
+                            job_id=self._make_vertexai_pipeline_task_id(upstream_task),
+                        )
+                        for upstream_task in upstream_tasks
+                    ]
+                    task_meta.upstreams = upstream_urls
+
+                task_detail = task_map.get(task_name)
+                if task_detail:
+                    task_meta.id = task_detail.task_id
+                    task_meta.state = task_detail.state
+                    task_meta.start_time = task_detail.start_time
+                    task_meta.create_time = task_detail.create_time
+                    if task_detail.end_time:
+                        task_meta.end_time = task_detail.end_time
+                        task_meta.duration = int(
+                            (
+                                task_meta.end_time.timestamp()
+                                - task_meta.start_time.timestamp()
+                            )
+                            * 1000
+                        )
+
+                tasks.append(task_meta)
+        return tasks
+
+    def _get_pipeline_metadata(self, pipeline: PipelineJob) -> PipelineMetadata:
+        dataflow_urn = DataFlowUrn.create_from_ids(
+            orchestrator=self.platform,
+            env=self.config.env,
+            flow_id=self._make_vertexai_pipeline_id(pipeline.name),
+            platform_instance=self.platform,
+        )
+        tasks = self._get_pipeline_tasks_metadata(
+            pipeline=pipeline, pipeline_urn=dataflow_urn
+        )
+
+        pipeline_meta = PipelineMetadata(
+            name=pipeline.name,
+            resource_name=pipeline.resource_name,
+            urn=dataflow_urn,
+            tasks=tasks,
+        )
+        pipeline_meta.resource_name = pipeline.resource_name
+        pipeline_meta.labels = pipeline.labels
+        pipeline_meta.create_time = pipeline.create_time
+        pipeline_meta.region = pipeline.location
+        if pipeline.update_time:
+            pipeline_meta.update_time = pipeline.update_time
+            pipeline_meta.duration = timedelta(
+                milliseconds=datetime_to_ts_millis(pipeline.update_time)
+                - datetime_to_ts_millis(pipeline.create_time)
+            )
+        return pipeline_meta
+
+    def _gen_pipeline_task_run_mcps(
+        self, task: PipelineTaskMetadata, datajob: DataJob, pipeline: PipelineMetadata
+    ) -> Iterable[MetadataChangeProposalWrapper]:
+        dpi_urn = builder.make_data_process_instance_urn(
+            self._make_vertexai_pipeline_task_run_id(entity_id=task.name)
+        )
+        result_status: Union[str, RunResultTypeClass] = get_pipeline_task_result_status(
+            task.state
+        )
+
+        yield from MetadataChangeProposalWrapper.construct_many(
+            dpi_urn,
+            aspects=[
+                DataProcessInstancePropertiesClass(
+                    name=task.name,
+                    created=AuditStampClass(
+                        time=(
+                            int(task.create_time.timestamp() * 1000)
+                            if task.create_time
+                            else 0
+                        ),
+                        actor="urn:li:corpuser:datahub",
+                    ),
+                    externalUrl=self._make_pipeline_external_url(pipeline.name),
+                    customProperties={},
+                ),
+                SubTypesClass(typeNames=[MLAssetSubTypes.VERTEX_PIPELINE_TASK_RUN]),
+                ContainerClass(container=self._get_project_container().as_urn()),
+                DataPlatformInstanceClass(platform=str(DataPlatformUrn(self.platform))),
+                DataProcessInstanceRelationships(
+                    upstreamInstances=[], parentTemplate=str(datajob.urn)
+                ),
+                (
+                    DataProcessInstanceRunEventClass(
+                        status=DataProcessRunStatusClass.COMPLETE,
+                        timestampMillis=(
+                            int(task.create_time.timestamp() * 1000)
+                            if task.create_time
+                            else 0
+                        ),
+                        result=DataProcessInstanceRunResultClass(
+                            type=result_status,
+                            nativeResultType=self.platform,
+                        ),
+                        durationMillis=task.duration,
+                    )
+                    if is_status_for_run_event_class(result_status) and task.duration
+                    else None
+                ),
+            ],
+        )
+
+    def _gen_pipeline_task_mcps(
+        self, pipeline: PipelineMetadata
+    ) -> Iterable[MetadataChangeProposalWrapper]:
+        dataflow_urn = pipeline.urn
+
+        for task in pipeline.tasks:
+            datajob = DataJob(
+                id=self._make_vertexai_pipeline_task_id(task.name),
+                flow_urn=dataflow_urn,
+                name=task.name,
+                properties={},
+                owners={"urn:li:corpuser:datahub"},
+                upstream_urns=task.upstreams if task.upstreams else [],
+                url=self._make_pipeline_external_url(pipeline.name),
+            )
+            yield from MetadataChangeProposalWrapper.construct_many(
+                entityUrn=str(datajob.urn),
+                aspects=[
+                    ContainerClass(container=self._get_project_container().as_urn()),
+                    SubTypesClass(typeNames=[MLAssetSubTypes.VERTEX_PIPELINE_TASK]),
+                ],
+            )
+            yield from datajob.generate_mcp()
+            yield from self._gen_pipeline_task_run_mcps(task, datajob, pipeline)
+
+    def _format_pipeline_duration(self, td: timedelta) -> str:
+        days = td.days
+        hours, remainder = divmod(td.seconds, 3600)
+        minutes, seconds = divmod(remainder, 60)
+        milliseconds = td.microseconds // 1000
+
+        parts = []
+        if days:
+            parts.append(f"{days}d")
+        if hours:
+            parts.append(f"{hours}h")
+        if minutes:
+            parts.append(f"{minutes}m")
+        if seconds:
+            parts.append(f"{seconds}s")
+        if milliseconds:
+            parts.append(f"{milliseconds}ms")
+        return " ".join(parts) if parts else "0s"
+
+    def _get_pipeline_task_properties(
+        self, task: PipelineTaskMetadata
+    ) -> Dict[str, str]:
+        return {
+            "created_time": (
+                task.create_time.strftime("%Y-%m-%d %H:%M:%S")
+                if task.create_time
+                else ""
+            )
+        }
+
+    def _get_pipeline_properties(self, pipeline: PipelineMetadata) -> Dict[str, str]:
+        return {
+            "resource_name": pipeline.resource_name if pipeline.resource_name else "",
+            "create_time": (
+                pipeline.create_time.isoformat() if pipeline.create_time else ""
+            ),
+            "update_time": (
+                pipeline.update_time.isoformat() if pipeline.update_time else ""
+            ),
+            "duration": (
+                self._format_pipeline_duration(pipeline.duration)
+                if pipeline.duration
+                else ""
+            ),
+            "location": (pipeline.region if pipeline.region else ""),
+            "labels": ",".join([f"{k}:{v}" for k, v in pipeline.labels.items()])
+            if pipeline.labels
+            else "",
+        }
+
+    def _get_pipeline_mcps(
+        self, pipeline: PipelineMetadata
+    ) -> Iterable[MetadataChangeProposalWrapper]:
+        dataflow = DataFlow(
+            orchestrator=self.platform,
+            id=self._make_vertexai_pipeline_id(pipeline.name),
+            env=self.config.env,
+            name=pipeline.name,
+            platform_instance=self.platform,
+            properties=self._get_pipeline_properties(pipeline),
+            owners={"urn:li:corpuser:datahub"},
+            url=self._make_pipeline_external_url(pipeline_name=pipeline.name),
+        )
+
+        yield from dataflow.generate_mcp()
+
+        yield from MetadataChangeProposalWrapper.construct_many(
+            entityUrn=str(dataflow.urn),
+            aspects=[
+                ContainerClass(container=self._get_project_container().as_urn()),
+                SubTypesClass(typeNames=[MLAssetSubTypes.VERTEX_PIPELINE]),
+            ],
+        )
 
     def _get_experiments_workunits(self) -> Iterable[MetadataWorkUnit]:
         # List all experiments
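```

The ID helpers above compose into DataHub URNs in a fixed pattern. A minimal sketch of that composition, using hypothetical project, pipeline, and task names (only the `create_from_ids` calls come from the diff):

```python
# Sketch: how _make_vertexai_pipeline_id / _make_vertexai_pipeline_task_id
# outputs feed into URN construction. All names below are hypothetical.
from datahub.metadata.urns import DataFlowUrn, DataJobUrn

flow_urn = DataFlowUrn.create_from_ids(
    orchestrator="vertexai",
    env="PROD",
    flow_id="my-project.pipeline.my-pipeline",  # _make_vertexai_pipeline_id output
    platform_instance="vertexai",
)
task_urn = DataJobUrn.create_from_ids(
    data_flow_urn=str(flow_urn),
    job_id="my-project.pipeline_task.train-step",  # _make_vertexai_pipeline_task_id output
)
print(flow_urn)
print(task_urn)
```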
datahub/ingestion/source/vertexai/vertexai.py (continued):

```diff
@@ -175,7 +467,7 @@ class VertexAISource(Source):
             parent_container_key=self._get_project_container(),
             container_key=ExperimentKey(
                 platform=self.platform,
-                id=self.
+                id=self._make_vertexai_experiment_id(experiment.name),
             ),
             name=experiment.name,
             sub_types=[MLAssetSubTypes.VERTEX_EXPERIMENT],
@@ -311,7 +603,7 @@ class VertexAISource(Source):
     ) -> Iterable[MetadataChangeProposalWrapper]:
         experiment_key = ExperimentKey(
             platform=self.platform,
-            id=self.
+            id=self._make_vertexai_experiment_id(experiment.name),
         )
         run_urn = self._make_experiment_run_urn(experiment, run)
         created_time, duration = self._get_run_timestamps(run)
@@ -968,7 +1260,7 @@ class VertexAISource(Source):
     ) -> str:
         return f"{self.config.project_id}.job.{entity_id}"
 
-    def
+    def _make_vertexai_experiment_id(self, entity_id: Optional[str]) -> str:
         return f"{self.config.project_id}.experiment.{entity_id}"
 
     def _make_vertexai_experiment_run_name(self, entity_id: Optional[str]) -> str:
@@ -977,6 +1269,15 @@ class VertexAISource(Source):
     def _make_vertexai_run_execution_name(self, entity_id: Optional[str]) -> str:
         return f"{self.config.project_id}.execution.{entity_id}"
 
+    def _make_vertexai_pipeline_id(self, entity_id: Optional[str]) -> str:
+        return f"{self.config.project_id}.pipeline.{entity_id}"
+
+    def _make_vertexai_pipeline_task_id(self, entity_id: Optional[str]) -> str:
+        return f"{self.config.project_id}.pipeline_task.{entity_id}"
+
+    def _make_vertexai_pipeline_task_run_id(self, entity_id: Optional[str]) -> str:
+        return f"{self.config.project_id}.pipeline_task_run.{entity_id}"
+
     def _make_artifact_external_url(
         self, experiment: Experiment, run: ExperimentRun
     ) -> str:
@@ -1053,3 +1354,14 @@ class VertexAISource(Source):
             f"/runs/{experiment.name}-{run.name}/charts?project={self.config.project_id}"
         )
        return external_url
+
+    def _make_pipeline_external_url(self, pipeline_name: str) -> str:
+        """
+        Pipeline Run external URL in Vertex AI
+        https://console.cloud.google.com/vertex-ai/pipelines/locations/us-west2/runs/pipeline-example-more-tasks-3-20250320210739?project=acryl-poc
+        """
+        external_url: str = (
+            f"{self.config.vertexai_url}/pipelines/locations/{self.config.region}/runs/{pipeline_name}"
+            f"?project={self.config.project_id}"
+        )
+        return external_url
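```

For reference, the duration formatting introduced in `_format_pipeline_duration` behaves as shown below. This is a standalone copy of the same logic from the diff, runnable without Vertex AI credentials:

```python
from datetime import timedelta

def format_pipeline_duration(td: timedelta) -> str:
    # Same logic as VertexAISource._format_pipeline_duration above.
    days = td.days
    hours, remainder = divmod(td.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    milliseconds = td.microseconds // 1000

    parts = []
    if days:
        parts.append(f"{days}d")
    if hours:
        parts.append(f"{hours}h")
    if minutes:
        parts.append(f"{minutes}m")
    if seconds:
        parts.append(f"{seconds}s")
    if milliseconds:
        parts.append(f"{milliseconds}ms")
    return " ".join(parts) if parts else "0s"

print(format_pipeline_duration(timedelta(hours=1, minutes=2, seconds=3)))  # 1h 2m 3s
print(format_pipeline_duration(timedelta(0)))                              # 0s
```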
datahub/ingestion/source/vertexai/vertexai_result_type_utils.py (+23 -2):

```diff
@@ -1,9 +1,9 @@
-from typing import Union
+from typing import Optional, Union
 
 from google.cloud.aiplatform.base import VertexAiResourceNoun
 from google.cloud.aiplatform.jobs import _RunnableJob
 from google.cloud.aiplatform.training_jobs import _TrainingJob
-from google.cloud.aiplatform_v1.types import JobState, PipelineState
+from google.cloud.aiplatform_v1.types import JobState, PipelineState, PipelineTaskDetail
 
 from datahub.metadata.schema_classes import RunResultTypeClass
 
@@ -64,5 +64,26 @@ def get_execution_result_status(status: int) -> Union[str, RunResultTypeClass]:
     return status_mapping.get(status, "UNKNOWN")
 
 
+def get_pipeline_task_result_status(
+    status: Optional[PipelineTaskDetail.State],
+) -> Union[str, RunResultTypeClass]:
+    # TODO: DataProcessInstanceRunResultClass fails with status string except for SUCCESS, FAILURE, SKIPPED,
+    # which will be fixed in the future
+    status_mapping = {
+        # PipelineTaskDetail.State.STATE_UNSPECIFIED: "STATE_UNSPECIFIED",
+        # PipelineTaskDetail.State.PENDING: "PENDING",
+        # PipelineTaskDetail.State.RUNNING: "RUNNING",
+        # PipelineTaskDetail.State.CANCEL_PENDING: "CANCEL_PENDING",
+        # PipelineTaskDetail.State.CANCELLING: "CANCELLING",
+        # PipelineTaskDetail.State.NOT_TRIGGERED: "NOT_TRIGGERED",
+        PipelineTaskDetail.State.SUCCEEDED: RunResultTypeClass.SUCCESS,
+        PipelineTaskDetail.State.FAILED: RunResultTypeClass.FAILURE,
+        PipelineTaskDetail.State.SKIPPED: RunResultTypeClass.SKIPPED,
+    }
+    if status is None:
+        return "UNKNOWN"
+    return status_mapping.get(status, "UNKNOWN")
+
+
 def is_status_for_run_event_class(status: Union[str, RunResultTypeClass]) -> bool:
     return status in [RunResultTypeClass.SUCCESS, RunResultTypeClass.FAILURE]
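```

A quick check of the new mapping (assumes `google-cloud-aiplatform` and this package are installed; the expected behavior is taken directly from the function above):

```python
from google.cloud.aiplatform_v1.types import PipelineTaskDetail

from datahub.ingestion.source.vertexai.vertexai_result_type_utils import (
    get_pipeline_task_result_status,
)
from datahub.metadata.schema_classes import RunResultTypeClass

# Terminal states map to run results; everything else (including None) is "UNKNOWN".
assert get_pipeline_task_result_status(PipelineTaskDetail.State.SUCCEEDED) == RunResultTypeClass.SUCCESS
assert get_pipeline_task_result_status(PipelineTaskDetail.State.RUNNING) == "UNKNOWN"
assert get_pipeline_task_result_status(None) == "UNKNOWN"
```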
datahub/integrations/assertion/common.py (+3 -2):

```diff
@@ -3,6 +3,7 @@ from typing import List, Optional, Tuple, TypedDict
 
 from datahub.api.entities.assertion.assertion import BaseEntityAssertion
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProperties
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaMetadata
 from datahub.utilities.urns.urn import Urn
@@ -15,7 +16,7 @@ class ColumnDict(TypedDict):
 
 @lru_cache
 def get_qualified_name_from_datahub(urn: str) -> Optional[str]:
-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.CLI) as graph:
         props: Optional[DatasetProperties] = graph.get_aspect(urn, DatasetProperties)
         if props is not None:
             return props.qualifiedName
@@ -24,7 +25,7 @@ def get_qualified_name_from_datahub(urn: str) -> Optional[str]:
 
 @lru_cache
 def get_schema_from_datahub(urn: str) -> Optional[List[ColumnDict]]:
-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.INGESTION) as graph:
         schema: Optional[SchemaMetadata] = graph.get_aspect(urn, SchemaMetadata)
         if schema is not None:
             return [