acryl-datahub 1.0.0.2rc4__py3-none-any.whl → 1.0.0.3__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

This version of acryl-datahub has been flagged as a potentially problematic release.

Files changed (159)
  1. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/METADATA +2566 -2514
  2. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/RECORD +159 -149
  3. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
  6. datahub/api/entities/datacontract/datacontract.py +35 -3
  7. datahub/api/entities/datajob/dataflow.py +3 -3
  8. datahub/api/entities/datajob/datajob.py +7 -4
  9. datahub/api/entities/dataset/dataset.py +9 -11
  10. datahub/api/entities/forms/forms.py +34 -34
  11. datahub/api/graphql/assertion.py +1 -1
  12. datahub/api/graphql/operation.py +4 -4
  13. datahub/cli/check_cli.py +3 -2
  14. datahub/cli/config_utils.py +2 -2
  15. datahub/cli/delete_cli.py +6 -5
  16. datahub/cli/docker_cli.py +2 -2
  17. datahub/cli/exists_cli.py +2 -1
  18. datahub/cli/get_cli.py +2 -1
  19. datahub/cli/iceberg_cli.py +6 -5
  20. datahub/cli/ingest_cli.py +9 -6
  21. datahub/cli/migrate.py +4 -3
  22. datahub/cli/migration_utils.py +4 -3
  23. datahub/cli/put_cli.py +3 -2
  24. datahub/cli/specific/assertions_cli.py +2 -1
  25. datahub/cli/specific/datacontract_cli.py +3 -2
  26. datahub/cli/specific/dataproduct_cli.py +10 -9
  27. datahub/cli/specific/dataset_cli.py +4 -3
  28. datahub/cli/specific/forms_cli.py +2 -1
  29. datahub/cli/specific/group_cli.py +2 -1
  30. datahub/cli/specific/structuredproperties_cli.py +4 -3
  31. datahub/cli/specific/user_cli.py +2 -1
  32. datahub/cli/state_cli.py +2 -1
  33. datahub/cli/timeline_cli.py +2 -1
  34. datahub/configuration/common.py +5 -0
  35. datahub/configuration/source_common.py +1 -1
  36. datahub/emitter/mcp.py +20 -5
  37. datahub/emitter/request_helper.py +116 -3
  38. datahub/emitter/rest_emitter.py +163 -93
  39. datahub/entrypoints.py +2 -1
  40. datahub/errors.py +4 -0
  41. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
  42. datahub/ingestion/api/source.py +2 -5
  43. datahub/ingestion/api/source_helpers.py +1 -0
  44. datahub/ingestion/glossary/classification_mixin.py +4 -2
  45. datahub/ingestion/graph/client.py +33 -8
  46. datahub/ingestion/graph/config.py +14 -0
  47. datahub/ingestion/graph/filters.py +1 -1
  48. datahub/ingestion/graph/links.py +53 -0
  49. datahub/ingestion/run/pipeline.py +9 -6
  50. datahub/ingestion/run/pipeline_config.py +1 -1
  51. datahub/ingestion/sink/datahub_rest.py +5 -6
  52. datahub/ingestion/source/apply/datahub_apply.py +2 -1
  53. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  54. datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
  55. datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
  56. datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
  57. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
  58. datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
  59. datahub/ingestion/source/common/subtypes.py +3 -0
  60. datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
  61. datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
  62. datahub/ingestion/source/dbt/dbt_common.py +10 -2
  63. datahub/ingestion/source/dbt/dbt_core.py +82 -42
  64. datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
  65. datahub/ingestion/source/feast.py +4 -4
  66. datahub/ingestion/source/fivetran/config.py +1 -1
  67. datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
  68. datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
  69. datahub/ingestion/source/ge_data_profiler.py +27 -1
  70. datahub/ingestion/source/hex/api.py +1 -20
  71. datahub/ingestion/source/hex/query_fetcher.py +4 -1
  72. datahub/ingestion/source/iceberg/iceberg.py +20 -4
  73. datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
  74. datahub/ingestion/source/ldap.py +1 -1
  75. datahub/ingestion/source/looker/looker_common.py +17 -2
  76. datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
  77. datahub/ingestion/source/looker/looker_source.py +34 -5
  78. datahub/ingestion/source/looker/lookml_source.py +7 -1
  79. datahub/ingestion/source/metadata/lineage.py +2 -1
  80. datahub/ingestion/source/mlflow.py +19 -6
  81. datahub/ingestion/source/mode.py +74 -28
  82. datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
  83. datahub/ingestion/source/powerbi/config.py +13 -1
  84. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  85. datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
  86. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
  87. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
  88. datahub/ingestion/source/redshift/usage.py +10 -9
  89. datahub/ingestion/source/sigma/config.py +74 -6
  90. datahub/ingestion/source/sigma/sigma.py +16 -1
  91. datahub/ingestion/source/sigma/sigma_api.py +99 -58
  92. datahub/ingestion/source/slack/slack.py +4 -52
  93. datahub/ingestion/source/snowflake/snowflake_config.py +2 -12
  94. datahub/ingestion/source/snowflake/snowflake_connection.py +24 -18
  95. datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
  96. datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
  97. datahub/ingestion/source/snowflake/snowflake_query.py +9 -63
  98. datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
  99. datahub/ingestion/source/sql/athena.py +2 -1
  100. datahub/ingestion/source/sql/clickhouse.py +5 -1
  101. datahub/ingestion/source/sql/druid.py +7 -2
  102. datahub/ingestion/source/sql/hive.py +7 -2
  103. datahub/ingestion/source/sql/hive_metastore.py +5 -5
  104. datahub/ingestion/source/sql/mssql/source.py +1 -1
  105. datahub/ingestion/source/sql/oracle.py +6 -2
  106. datahub/ingestion/source/sql/sql_config.py +1 -34
  107. datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
  108. datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
  109. datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
  110. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
  111. datahub/ingestion/source/tableau/tableau.py +31 -6
  112. datahub/ingestion/source/tableau/tableau_validation.py +1 -1
  113. datahub/ingestion/source/unity/config.py +2 -1
  114. datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
  115. datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
  116. datahub/ingestion/source/vertexai/vertexai.py +316 -4
  117. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
  118. datahub/integrations/assertion/common.py +3 -2
  119. datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +538 -493
  120. datahub/metadata/_urns/urn_defs.py +1819 -1763
  121. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  122. datahub/metadata/schema.avsc +17296 -16883
  123. datahub/metadata/schema_classes.py +3 -3
  124. datahub/metadata/schemas/DataContractKey.avsc +2 -1
  125. datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
  126. datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
  127. datahub/metadata/schemas/FormInfo.avsc +5 -0
  128. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
  129. datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
  130. datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
  131. datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
  132. datahub/metadata/schemas/QueryProperties.avsc +4 -2
  133. datahub/metadata/schemas/SystemMetadata.avsc +86 -0
  134. datahub/metadata/schemas/__init__.py +3 -3
  135. datahub/sdk/_all_entities.py +4 -0
  136. datahub/sdk/_shared.py +142 -4
  137. datahub/sdk/_utils.py +4 -0
  138. datahub/sdk/dataset.py +2 -2
  139. datahub/sdk/entity_client.py +8 -0
  140. datahub/sdk/lineage_client.py +235 -0
  141. datahub/sdk/main_client.py +6 -3
  142. datahub/sdk/mlmodel.py +301 -0
  143. datahub/sdk/mlmodelgroup.py +233 -0
  144. datahub/secret/datahub_secret_store.py +2 -1
  145. datahub/specific/dataset.py +12 -0
  146. datahub/sql_parsing/fingerprint_utils.py +6 -0
  147. datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
  148. datahub/sql_parsing/sqlglot_utils.py +18 -14
  149. datahub/telemetry/telemetry.py +2 -2
  150. datahub/testing/check_imports.py +1 -1
  151. datahub/testing/mcp_diff.py +15 -2
  152. datahub/upgrade/upgrade.py +10 -12
  153. datahub/utilities/logging_manager.py +8 -1
  154. datahub/utilities/server_config_util.py +350 -10
  155. datahub/utilities/sqlalchemy_query_combiner.py +4 -5
  156. datahub/utilities/urn_encoder.py +1 -1
  157. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/entry_points.txt +0 -0
  158. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/licenses/LICENSE +0 -0
  159. {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/vertexai/vertexai.py
@@ -1,5 +1,6 @@
 import dataclasses
 import logging
+from datetime import datetime, timedelta
 from typing import Dict, Iterable, List, Optional, Tuple, TypeVar, Union

 from google.api_core.exceptions import GoogleAPICallError
@@ -12,15 +13,22 @@ from google.cloud.aiplatform import (
     AutoMLVideoTrainingJob,
     Endpoint,
     ExperimentRun,
+    PipelineJob,
 )
 from google.cloud.aiplatform.base import VertexAiResourceNoun
 from google.cloud.aiplatform.metadata.execution import Execution
 from google.cloud.aiplatform.metadata.experiment_resources import Experiment
 from google.cloud.aiplatform.models import Model, VersionInfo
 from google.cloud.aiplatform.training_jobs import _TrainingJob
+from google.cloud.aiplatform_v1.types import (
+    PipelineJob as PipelineJobType,
+    PipelineTaskDetail,
+)
 from google.oauth2 import service_account
+from google.protobuf import timestamp_pb2

 import datahub.emitter.mce_builder as builder
+from datahub.api.entities.datajob import DataFlow, DataJob
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.emitter.mcp_builder import (
     ExperimentKey,
@@ -43,6 +51,7 @@ from datahub.ingestion.source.vertexai.vertexai_config import VertexAIConfig
 from datahub.ingestion.source.vertexai.vertexai_result_type_utils import (
     get_execution_result_status,
     get_job_result_status,
+    get_pipeline_task_result_status,
     is_status_for_run_event_class,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.dataprocess import (
@@ -76,7 +85,13 @@ from datahub.metadata.schema_classes import (
     VersionPropertiesClass,
     VersionTagClass,
 )
-from datahub.metadata.urns import DataPlatformUrn, MlModelUrn, VersionSetUrn
+from datahub.metadata.urns import (
+    DataFlowUrn,
+    DataJobUrn,
+    DataPlatformUrn,
+    MlModelUrn,
+    VersionSetUrn,
+)
 from datahub.utilities.time import datetime_to_ts_millis

 T = TypeVar("T")
@@ -100,6 +115,34 @@ class ModelMetadata:
     endpoints: Optional[List[Endpoint]] = None


+@dataclasses.dataclass
+class PipelineTaskMetadata:
+    name: str
+    urn: DataJobUrn
+    id: Optional[int] = None
+    type: Optional[str] = None
+    state: Optional[PipelineTaskDetail.State] = None
+    start_time: Optional[timestamp_pb2.Timestamp] = None
+    create_time: Optional[timestamp_pb2.Timestamp] = None
+    end_time: Optional[timestamp_pb2.Timestamp] = None
+    upstreams: Optional[List[DataJobUrn]] = None
+    duration: Optional[int] = None
+
+
+@dataclasses.dataclass
+class PipelineMetadata:
+    name: str
+    resource_name: str
+    tasks: List[PipelineTaskMetadata]
+    urn: DataFlowUrn
+    id: Optional[str] = None
+    labels: Optional[Dict[str, str]] = None
+    create_time: Optional[datetime] = None
+    update_time: Optional[datetime] = None
+    duration: Optional[timedelta] = None
+    region: Optional[str] = None
+
+
 @platform_name("Vertex AI", id="vertexai")
 @config_class(VertexAIConfig)
 @support_status(SupportStatus.TESTING)
@@ -150,6 +193,255 @@ class VertexAISource(Source):
         yield from self._get_experiments_workunits()
         # Fetch and Ingest Experiment Runs
         yield from auto_workunit(self._get_experiment_runs_mcps())
+        # Fetch Pipelines and Tasks
+        yield from auto_workunit(self._get_pipelines_mcps())
+
+    def _get_pipelines_mcps(self) -> Iterable[MetadataChangeProposalWrapper]:
+        """
+        Fetches pipelines from Vertex AI and generates corresponding mcps.
+        """
+
+        pipeline_jobs = self.client.PipelineJob.list()
+
+        for pipeline in pipeline_jobs:
+            logger.info(f"fetching pipeline ({pipeline.name})")
+            pipeline_meta = self._get_pipeline_metadata(pipeline)
+            yield from self._get_pipeline_mcps(pipeline_meta)
+            yield from self._gen_pipeline_task_mcps(pipeline_meta)
+
+    def _get_pipeline_tasks_metadata(
+        self, pipeline: PipelineJob, pipeline_urn: DataFlowUrn
+    ) -> List[PipelineTaskMetadata]:
+        tasks: List[PipelineTaskMetadata] = list()
+        task_map: Dict[str, PipelineTaskDetail] = dict()
+        for task in pipeline.task_details:
+            task_map[task.task_name] = task
+
+        resource = pipeline.gca_resource
+        if isinstance(resource, PipelineJobType):
+            for task_name in resource.pipeline_spec["root"]["dag"]["tasks"]:
+                logger.debug(
+                    f"fetching pipeline task ({task_name}) in pipeline ({pipeline.name})"
+                )
+                task_urn = DataJobUrn.create_from_ids(
+                    data_flow_urn=str(pipeline_urn),
+                    job_id=self._make_vertexai_pipeline_task_id(task_name),
+                )
+                task_meta = PipelineTaskMetadata(name=task_name, urn=task_urn)
+                if (
+                    "dependentTasks"
+                    in resource.pipeline_spec["root"]["dag"]["tasks"][task_name]
+                ):
+                    upstream_tasks = resource.pipeline_spec["root"]["dag"]["tasks"][
+                        task_name
+                    ]["dependentTasks"]
+                    upstream_urls = [
+                        DataJobUrn.create_from_ids(
+                            data_flow_urn=str(pipeline_urn),
+                            job_id=self._make_vertexai_pipeline_task_id(upstream_task),
+                        )
+                        for upstream_task in upstream_tasks
+                    ]
+                    task_meta.upstreams = upstream_urls
+
+                task_detail = task_map.get(task_name)
+                if task_detail:
+                    task_meta.id = task_detail.task_id
+                    task_meta.state = task_detail.state
+                    task_meta.start_time = task_detail.start_time
+                    task_meta.create_time = task_detail.create_time
+                    if task_detail.end_time:
+                        task_meta.end_time = task_detail.end_time
+                        task_meta.duration = int(
+                            (
+                                task_meta.end_time.timestamp()
+                                - task_meta.start_time.timestamp()
+                            )
+                            * 1000
+                        )
+
+                tasks.append(task_meta)
+        return tasks
+
+    def _get_pipeline_metadata(self, pipeline: PipelineJob) -> PipelineMetadata:
+        dataflow_urn = DataFlowUrn.create_from_ids(
+            orchestrator=self.platform,
+            env=self.config.env,
+            flow_id=self._make_vertexai_pipeline_id(pipeline.name),
+            platform_instance=self.platform,
+        )
+        tasks = self._get_pipeline_tasks_metadata(
+            pipeline=pipeline, pipeline_urn=dataflow_urn
+        )
+
+        pipeline_meta = PipelineMetadata(
+            name=pipeline.name,
+            resource_name=pipeline.resource_name,
+            urn=dataflow_urn,
+            tasks=tasks,
+        )
+        pipeline_meta.resource_name = pipeline.resource_name
+        pipeline_meta.labels = pipeline.labels
+        pipeline_meta.create_time = pipeline.create_time
+        pipeline_meta.region = pipeline.location
+        if pipeline.update_time:
+            pipeline_meta.update_time = pipeline.update_time
+            pipeline_meta.duration = timedelta(
+                milliseconds=datetime_to_ts_millis(pipeline.update_time)
+                - datetime_to_ts_millis(pipeline.create_time)
+            )
+        return pipeline_meta
+
+    def _gen_pipeline_task_run_mcps(
+        self, task: PipelineTaskMetadata, datajob: DataJob, pipeline: PipelineMetadata
+    ) -> Iterable[MetadataChangeProposalWrapper]:
+        dpi_urn = builder.make_data_process_instance_urn(
+            self._make_vertexai_pipeline_task_run_id(entity_id=task.name)
+        )
+        result_status: Union[str, RunResultTypeClass] = get_pipeline_task_result_status(
+            task.state
+        )
+
+        yield from MetadataChangeProposalWrapper.construct_many(
+            dpi_urn,
+            aspects=[
+                DataProcessInstancePropertiesClass(
+                    name=task.name,
+                    created=AuditStampClass(
+                        time=(
+                            int(task.create_time.timestamp() * 1000)
+                            if task.create_time
+                            else 0
+                        ),
+                        actor="urn:li:corpuser:datahub",
+                    ),
+                    externalUrl=self._make_pipeline_external_url(pipeline.name),
+                    customProperties={},
+                ),
+                SubTypesClass(typeNames=[MLAssetSubTypes.VERTEX_PIPELINE_TASK_RUN]),
+                ContainerClass(container=self._get_project_container().as_urn()),
+                DataPlatformInstanceClass(platform=str(DataPlatformUrn(self.platform))),
+                DataProcessInstanceRelationships(
+                    upstreamInstances=[], parentTemplate=str(datajob.urn)
+                ),
+                (
+                    DataProcessInstanceRunEventClass(
+                        status=DataProcessRunStatusClass.COMPLETE,
+                        timestampMillis=(
+                            int(task.create_time.timestamp() * 1000)
+                            if task.create_time
+                            else 0
+                        ),
+                        result=DataProcessInstanceRunResultClass(
+                            type=result_status,
+                            nativeResultType=self.platform,
+                        ),
+                        durationMillis=task.duration,
+                    )
+                    if is_status_for_run_event_class(result_status) and task.duration
+                    else None
+                ),
+            ],
+        )
+
+    def _gen_pipeline_task_mcps(
+        self, pipeline: PipelineMetadata
+    ) -> Iterable[MetadataChangeProposalWrapper]:
+        dataflow_urn = pipeline.urn
+
+        for task in pipeline.tasks:
+            datajob = DataJob(
+                id=self._make_vertexai_pipeline_task_id(task.name),
+                flow_urn=dataflow_urn,
+                name=task.name,
+                properties={},
+                owners={"urn:li:corpuser:datahub"},
+                upstream_urns=task.upstreams if task.upstreams else [],
+                url=self._make_pipeline_external_url(pipeline.name),
+            )
+            yield from MetadataChangeProposalWrapper.construct_many(
+                entityUrn=str(datajob.urn),
+                aspects=[
+                    ContainerClass(container=self._get_project_container().as_urn()),
+                    SubTypesClass(typeNames=[MLAssetSubTypes.VERTEX_PIPELINE_TASK]),
+                ],
+            )
+            yield from datajob.generate_mcp()
+            yield from self._gen_pipeline_task_run_mcps(task, datajob, pipeline)
+
+    def _format_pipeline_duration(self, td: timedelta) -> str:
+        days = td.days
+        hours, remainder = divmod(td.seconds, 3600)
+        minutes, seconds = divmod(remainder, 60)
+        milliseconds = td.microseconds // 1000
+
+        parts = []
+        if days:
+            parts.append(f"{days}d")
+        if hours:
+            parts.append(f"{hours}h")
+        if minutes:
+            parts.append(f"{minutes}m")
+        if seconds:
+            parts.append(f"{seconds}s")
+        if milliseconds:
+            parts.append(f"{milliseconds}ms")
+        return " ".join(parts) if parts else "0s"
+
+    def _get_pipeline_task_properties(
+        self, task: PipelineTaskMetadata
+    ) -> Dict[str, str]:
+        return {
+            "created_time": (
+                task.create_time.strftime("%Y-%m-%d %H:%M:%S")
+                if task.create_time
+                else ""
+            )
+        }
+
+    def _get_pipeline_properties(self, pipeline: PipelineMetadata) -> Dict[str, str]:
+        return {
+            "resource_name": pipeline.resource_name if pipeline.resource_name else "",
+            "create_time": (
+                pipeline.create_time.isoformat() if pipeline.create_time else ""
+            ),
+            "update_time": (
+                pipeline.update_time.isoformat() if pipeline.update_time else ""
+            ),
+            "duration": (
+                self._format_pipeline_duration(pipeline.duration)
+                if pipeline.duration
+                else ""
+            ),
+            "location": (pipeline.region if pipeline.region else ""),
+            "labels": ",".join([f"{k}:{v}" for k, v in pipeline.labels.items()])
+            if pipeline.labels
+            else "",
+        }
+
+    def _get_pipeline_mcps(
+        self, pipeline: PipelineMetadata
+    ) -> Iterable[MetadataChangeProposalWrapper]:
+        dataflow = DataFlow(
+            orchestrator=self.platform,
+            id=self._make_vertexai_pipeline_id(pipeline.name),
+            env=self.config.env,
+            name=pipeline.name,
+            platform_instance=self.platform,
+            properties=self._get_pipeline_properties(pipeline),
+            owners={"urn:li:corpuser:datahub"},
+            url=self._make_pipeline_external_url(pipeline_name=pipeline.name),
+        )
+
+        yield from dataflow.generate_mcp()
+
+        yield from MetadataChangeProposalWrapper.construct_many(
+            entityUrn=str(dataflow.urn),
+            aspects=[
+                ContainerClass(container=self._get_project_container().as_urn()),
+                SubTypesClass(typeNames=[MLAssetSubTypes.VERTEX_PIPELINE]),
+            ],
+        )

     def _get_experiments_workunits(self) -> Iterable[MetadataWorkUnit]:
         # List all experiments
@@ -175,7 +467,7 @@ class VertexAISource(Source):
             parent_container_key=self._get_project_container(),
             container_key=ExperimentKey(
                 platform=self.platform,
-                id=self._make_vertexai_experiment_name(experiment.name),
+                id=self._make_vertexai_experiment_id(experiment.name),
             ),
             name=experiment.name,
             sub_types=[MLAssetSubTypes.VERTEX_EXPERIMENT],
@@ -311,7 +603,7 @@
    ) -> Iterable[MetadataChangeProposalWrapper]:
        experiment_key = ExperimentKey(
            platform=self.platform,
-            id=self._make_vertexai_experiment_name(experiment.name),
+            id=self._make_vertexai_experiment_id(experiment.name),
        )
        run_urn = self._make_experiment_run_urn(experiment, run)
        created_time, duration = self._get_run_timestamps(run)
@@ -968,7 +1260,7 @@
     ) -> str:
         return f"{self.config.project_id}.job.{entity_id}"

-    def _make_vertexai_experiment_name(self, entity_id: Optional[str]) -> str:
+    def _make_vertexai_experiment_id(self, entity_id: Optional[str]) -> str:
         return f"{self.config.project_id}.experiment.{entity_id}"

     def _make_vertexai_experiment_run_name(self, entity_id: Optional[str]) -> str:
@@ -977,6 +1269,15 @@
     def _make_vertexai_run_execution_name(self, entity_id: Optional[str]) -> str:
         return f"{self.config.project_id}.execution.{entity_id}"

+    def _make_vertexai_pipeline_id(self, entity_id: Optional[str]) -> str:
+        return f"{self.config.project_id}.pipeline.{entity_id}"
+
+    def _make_vertexai_pipeline_task_id(self, entity_id: Optional[str]) -> str:
+        return f"{self.config.project_id}.pipeline_task.{entity_id}"
+
+    def _make_vertexai_pipeline_task_run_id(self, entity_id: Optional[str]) -> str:
+        return f"{self.config.project_id}.pipeline_task_run.{entity_id}"
+
     def _make_artifact_external_url(
         self, experiment: Experiment, run: ExperimentRun
     ) -> str:
@@ -1053,3 +1354,14 @@
             f"/runs/{experiment.name}-{run.name}/charts?project={self.config.project_id}"
         )
         return external_url
+
+    def _make_pipeline_external_url(self, pipeline_name: str) -> str:
+        """
+        Pipeline Run external URL in Vertex AI
+        https://console.cloud.google.com/vertex-ai/pipelines/locations/us-west2/runs/pipeline-example-more-tasks-3-20250320210739?project=acryl-poc
+        """
+        external_url: str = (
+            f"{self.config.vertexai_url}/pipelines/locations/{self.config.region}/runs/{pipeline_name}"
+            f"?project={self.config.project_id}"
+        )
+        return external_url
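
The bulk of the vertexai.py change above adds pipeline and pipeline-task ingestion to the existing Vertex AI source. A minimal sketch of how the updated source could be exercised programmatically follows; the project, region, and server values are placeholders, and the config keys are assumed to match the source's VertexAIConfig rather than confirmed by this diff.

from datahub.ingestion.run.pipeline import Pipeline

# Hypothetical recipe: "vertexai" is the source id declared in this diff;
# project_id/region values and the GMS address are placeholders.
ingestion_pipeline = Pipeline.create(
    {
        "source": {
            "type": "vertexai",
            "config": {
                "project_id": "my-gcp-project",
                "region": "us-west2",
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
ingestion_pipeline.run()
ingestion_pipeline.raise_from_status()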
datahub/ingestion/source/vertexai/vertexai_result_type_utils.py
@@ -1,9 +1,9 @@
-from typing import Union
+from typing import Optional, Union

 from google.cloud.aiplatform.base import VertexAiResourceNoun
 from google.cloud.aiplatform.jobs import _RunnableJob
 from google.cloud.aiplatform.training_jobs import _TrainingJob
-from google.cloud.aiplatform_v1.types import JobState, PipelineState
+from google.cloud.aiplatform_v1.types import JobState, PipelineState, PipelineTaskDetail

 from datahub.metadata.schema_classes import RunResultTypeClass

@@ -64,5 +64,26 @@ def get_execution_result_status(status: int) -> Union[str, RunResultTypeClass]:
     return status_mapping.get(status, "UNKNOWN")


+def get_pipeline_task_result_status(
+    status: Optional[PipelineTaskDetail.State],
+) -> Union[str, RunResultTypeClass]:
+    # TODO: DataProcessInstanceRunResultClass fails with status string except for SUCCESS, FAILURE, SKIPPED,
+    # which will be fixed in the future
+    status_mapping = {
+        # PipelineTaskDetail.State.STATE_UNSPECIFIED: "STATE_UNSPECIFIED",
+        # PipelineTaskDetail.State.PENDING: "PENDING",
+        # PipelineTaskDetail.State.RUNNING: "RUNNING",
+        # PipelineTaskDetail.State.CANCEL_PENDING: "CANCEL_PENDING",
+        # PipelineTaskDetail.State.CANCELLING: "CANCELLING",
+        # PipelineTaskDetail.State.NOT_TRIGGERED: "NOT_TRIGGERED",
+        PipelineTaskDetail.State.SUCCEEDED: RunResultTypeClass.SUCCESS,
+        PipelineTaskDetail.State.FAILED: RunResultTypeClass.FAILURE,
+        PipelineTaskDetail.State.SKIPPED: RunResultTypeClass.SKIPPED,
+    }
+    if status is None:
+        return "UNKNOWN"
+    return status_mapping.get(status, "UNKNOWN")
+
+
 def is_status_for_run_event_class(status: Union[str, RunResultTypeClass]) -> bool:
     return status in [RunResultTypeClass.SUCCESS, RunResultTypeClass.FAILURE]
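
The new get_pipeline_task_result_status helper is a plain lookup with an "UNKNOWN" fallback, and only SUCCESS/FAILURE results feed a run event. A small sketch of the expected behaviour, assuming the import paths shown in this diff:

from google.cloud.aiplatform_v1.types import PipelineTaskDetail

from datahub.ingestion.source.vertexai.vertexai_result_type_utils import (
    get_pipeline_task_result_status,
    is_status_for_run_event_class,
)
from datahub.metadata.schema_classes import RunResultTypeClass

# Mapped terminal states resolve to RunResultTypeClass constants ...
assert get_pipeline_task_result_status(PipelineTaskDetail.State.SUCCEEDED) == RunResultTypeClass.SUCCESS
assert get_pipeline_task_result_status(PipelineTaskDetail.State.FAILED) == RunResultTypeClass.FAILURE
# ... while unmapped or missing states fall back to the string "UNKNOWN".
assert get_pipeline_task_result_status(PipelineTaskDetail.State.RUNNING) == "UNKNOWN"
assert get_pipeline_task_result_status(None) == "UNKNOWN"
# Only SUCCESS and FAILURE produce a DataProcessInstance run event.
assert is_status_for_run_event_class(RunResultTypeClass.SUCCESS)
assert not is_status_for_run_event_class("UNKNOWN")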
datahub/integrations/assertion/common.py
@@ -3,6 +3,7 @@ from typing import List, Optional, Tuple, TypedDict

 from datahub.api.entities.assertion.assertion import BaseEntityAssertion
 from datahub.ingestion.graph.client import get_default_graph
+from datahub.ingestion.graph.config import ClientMode
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProperties
 from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaMetadata
 from datahub.utilities.urns.urn import Urn
@@ -15,7 +16,7 @@ class ColumnDict(TypedDict):

 @lru_cache
 def get_qualified_name_from_datahub(urn: str) -> Optional[str]:
-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.CLI) as graph:
         props: Optional[DatasetProperties] = graph.get_aspect(urn, DatasetProperties)
         if props is not None:
             return props.qualifiedName
@@ -24,7 +25,7 @@ def get_qualified_name_from_datahub(urn: str) -> Optional[str]:

 @lru_cache
 def get_schema_from_datahub(urn: str) -> Optional[List[ColumnDict]]:
-    with get_default_graph() as graph:
+    with get_default_graph(ClientMode.INGESTION) as graph:
         schema: Optional[SchemaMetadata] = graph.get_aspect(urn, SchemaMetadata)
         if schema is not None:
             return [
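
The change to datahub/integrations/assertion/common.py follows the wider pattern in this release of passing an explicit client mode to get_default_graph (see datahub/ingestion/graph/config.py in the file list above). A minimal sketch of the updated call pattern, taken directly from the hunks in this diff; the dataset URN is a placeholder:

from datahub.ingestion.graph.client import get_default_graph
from datahub.ingestion.graph.config import ClientMode
from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProperties

# Placeholder URN for illustration only.
urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)"

with get_default_graph(ClientMode.CLI) as graph:
    props = graph.get_aspect(urn, DatasetProperties)
    qualified_name = props.qualifiedName if props is not None else None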