zenml_nightly-0.83.1.dev20250707-py3-none-any.whl → zenml_nightly-0.83.1.dev20250708-py3-none-any.whl
This diff shows the changes between two publicly released package versions, as they appear in their respective public registries, and is provided for informational purposes only.
- zenml/VERSION +1 -1
- zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py +296 -276
- zenml/integrations/kubernetes/pod_settings.py +37 -0
- zenml/integrations/kubernetes/serialization_utils.py +2 -1
- zenml/logging/step_logging.py +81 -0
- zenml/models/v2/core/logs.py +14 -1
- zenml/models/v2/core/pipeline_run.py +16 -0
- zenml/orchestrators/step_launcher.py +1 -0
- zenml/pipelines/pipeline_definition.py +1 -0
- zenml/zen_server/routers/runs_endpoints.py +26 -17
- zenml/zen_stores/migrations/versions/85289fea86ff_adding_source_to_logs.py +68 -0
- zenml/zen_stores/schemas/logs_schemas.py +11 -2
- zenml/zen_stores/schemas/pipeline_run_schemas.py +12 -3
- zenml/zen_stores/sql_zen_store.py +81 -21
- {zenml_nightly-0.83.1.dev20250707.dist-info → zenml_nightly-0.83.1.dev20250708.dist-info}/METADATA +1 -1
- {zenml_nightly-0.83.1.dev20250707.dist-info → zenml_nightly-0.83.1.dev20250708.dist-info}/RECORD +19 -18
- {zenml_nightly-0.83.1.dev20250707.dist-info → zenml_nightly-0.83.1.dev20250708.dist-info}/LICENSE +0 -0
- {zenml_nightly-0.83.1.dev20250707.dist-info → zenml_nightly-0.83.1.dev20250708.dist-info}/WHEEL +0 -0
- {zenml_nightly-0.83.1.dev20250707.dist-info → zenml_nightly-0.83.1.dev20250708.dist-info}/entry_points.txt +0 -0
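The most visible change in this nightly is orchestrator-side log capture: the Kubernetes orchestrator entrypoint now wraps its orchestration loop in a logging context created by the new setup_orchestrator_logging helper (added in zenml/logging/step_logging.py), so output produced by the orchestrator pod itself is stored with the pipeline run. A minimal sketch of that pattern, distilled from the entrypoint diff below; only the setup_orchestrator_logging call and the with block come from the diff, while the Client lookups and placeholder IDs are illustrative assumptions:

from zenml.client import Client
from zenml.logging.step_logging import setup_orchestrator_logging

client = Client()
deployment = client.get_deployment("<deployment-id>")   # placeholder ID (assumption)
pipeline_run = client.get_pipeline_run("<run-id>")       # placeholder ID (assumption)

logs_context = setup_orchestrator_logging(
    run_id=str(pipeline_run.id), deployment=deployment
)
with logs_context:
    # Everything the orchestrator process does inside this block has its
    # output captured and stored as run logs, per the entrypoint diff below.
    ...

The entrypoint diff below shows the full usage inside main().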
zenml/VERSION
CHANGED
@@ -1 +1 @@
-0.83.1.dev20250707
+0.83.1.dev20250708
zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py
CHANGED
@@ -39,6 +39,7 @@ from zenml.integrations.kubernetes.orchestrators.manifest_utils import (
     build_pod_manifest,
 )
 from zenml.logger import get_logger
+from zenml.logging.step_logging import setup_orchestrator_logging
 from zenml.orchestrators import publish_utils
 from zenml.orchestrators.dag_runner import NodeStatus, ThreadedDagRunner
 from zenml.orchestrators.step_run_utils import (
@@ -77,44 +78,7 @@ def main() -> None:
     orchestrator_pod_name = socket.gethostname()

     client = Client()
-    active_stack = client.active_stack
-    orchestrator = active_stack.orchestrator
-    assert isinstance(orchestrator, KubernetesOrchestrator)
-    namespace = orchestrator.config.kubernetes_namespace
-
     deployment = client.get_deployment(args.deployment_id)
-    pipeline_settings = cast(
-        KubernetesOrchestratorSettings,
-        orchestrator.get_settings(deployment),
-    )
-
-    step_command = StepEntrypointConfiguration.get_entrypoint_command()
-
-    mount_local_stores = active_stack.orchestrator.config.is_local
-
-    # Get a Kubernetes client from the active Kubernetes orchestrator, but
-    # override the `incluster` setting to `True` since we are running inside
-    # the Kubernetes cluster.
-    kube_client = orchestrator.get_kube_client(incluster=True)
-    core_api = k8s_client.CoreV1Api(kube_client)
-
-    env = get_config_environment_vars()
-    env[ENV_ZENML_KUBERNETES_RUN_ID] = orchestrator_pod_name
-
-    try:
-        owner_references = kube_utils.get_pod_owner_references(
-            core_api=core_api,
-            pod_name=orchestrator_pod_name,
-            namespace=namespace,
-        )
-    except Exception as e:
-        logger.warning(f"Failed to get pod owner references: {str(e)}")
-        owner_references = []
-    else:
-        # Make sure None of the owner references are marked as controllers of
-        # the created pod, which messes with the garbage collection logic.
-        for owner_reference in owner_references:
-            owner_reference.controller = False

     if args.run_id:
         pipeline_run = client.get_pipeline_run(args.run_id)
@@ -124,279 +88,335 @@ def main() -> None:
             orchestrator_run_id=orchestrator_pod_name,
         )

-
+    logs_context = setup_orchestrator_logging(
+        run_id=str(pipeline_run.id), deployment=deployment
+    )

+    with logs_context:
+        active_stack = client.active_stack
+        orchestrator = active_stack.orchestrator
+        assert isinstance(orchestrator, KubernetesOrchestrator)
+        namespace = orchestrator.config.kubernetes_namespace
+
+        pipeline_settings = cast(
+            KubernetesOrchestratorSettings,
+            orchestrator.get_settings(deployment),
         )
-    step_runs = {}

-        """Pre-step run.
+        step_command = StepEntrypointConfiguration.get_entrypoint_command()

-            step_name: Name of the step.
+        mount_local_stores = active_stack.orchestrator.config.is_local

+        # Get a Kubernetes client from the active Kubernetes orchestrator, but
+        # override the `incluster` setting to `True` since we are running inside
+        # the Kubernetes cluster.
+        kube_client = orchestrator.get_kube_client(incluster=True)
+        core_api = k8s_client.CoreV1Api(kube_client)
+
+        env = get_config_environment_vars()
+        env[ENV_ZENML_KUBERNETES_RUN_ID] = orchestrator_pod_name
+
+        try:
+            owner_references = kube_utils.get_pod_owner_references(
+                core_api=core_api,
+                pod_name=orchestrator_pod_name,
+                namespace=namespace,
             )
+        except Exception as e:
+            logger.warning(f"Failed to get pod owner references: {str(e)}")
+            owner_references = []
+        else:
+            # Make sure None of the owner references are marked as controllers of
+            # the created pod, which messes with the garbage collection logic.
+            for owner_reference in owner_references:
+                owner_reference.controller = False
+
+        pre_step_run: Optional[Callable[[str], bool]] = None
+
+        if not pipeline_settings.prevent_orchestrator_pod_caching:
+            step_run_request_factory = StepRunRequestFactory(
+                deployment=deployment,
+                pipeline_run=pipeline_run,
+                stack=active_stack,
+            )
+            step_runs = {}
+
+            def pre_step_run(step_name: str) -> bool:
+                """Pre-step run.
+
+                Args:
+                    step_name: Name of the step.
+
+                Returns:
+                    Whether the step node needs to be run.
+                """
+                step_run_request = step_run_request_factory.create_request(
+                    step_name
                 )
+                try:
+                    step_run_request_factory.populate_request(step_run_request)
+                except Exception as e:
+                    logger.error(
+                        f"Failed to populate step run request for step {step_name}: {e}"
+                    )
+                    return True
+
+                if step_run_request.status == ExecutionStatus.CACHED:
+                    step_run = publish_cached_step_run(
+                        step_run_request, pipeline_run
+                    )
+                    step_runs[step_name] = step_run
+                    logger.info(
+                        "Using cached version of step `%s`.", step_name
+                    )
+                    return False
+
                 return True

-        return True
-
-    step_pod_labels = {
-        "run_id": kube_utils.sanitize_label(str(pipeline_run.id)),
-        "run_name": kube_utils.sanitize_label(str(pipeline_run.name)),
-        "pipeline": kube_utils.sanitize_label(
-            deployment.pipeline_configuration.name
-        ),
-    }
-
-    def run_step_on_kubernetes(step_name: str) -> None:
-        """Run a pipeline step in a separate Kubernetes pod.
-
-        Args:
-            step_name: Name of the step.
-
-        Raises:
-            Exception: If the pod fails to start.
-        """
-        step_config = deployment.step_configurations[step_name].config
-        settings = step_config.settings.get("orchestrator.kubernetes", None)
-        settings = KubernetesOrchestratorSettings.model_validate(
-            settings.model_dump() if settings else {}
-        )
+        step_pod_labels = {
+            "run_id": kube_utils.sanitize_label(str(pipeline_run.id)),
+            "run_name": kube_utils.sanitize_label(str(pipeline_run.name)),
+            "pipeline": kube_utils.sanitize_label(
+                deployment.pipeline_configuration.name
+            ),
+        }

+        def run_step_on_kubernetes(step_name: str) -> None:
+            """Run a pipeline step in a separate Kubernetes pod.
+
+            Args:
+                step_name: Name of the step.
+
+            Raises:
+                Exception: If the pod fails to start.
+            """
+            step_config = deployment.step_configurations[step_name].config
+            settings = step_config.settings.get(
+                "orchestrator.kubernetes", None
             )
+            settings = KubernetesOrchestratorSettings.model_validate(
+                settings.model_dump() if settings else {}
             )
-            pod_name = f"{pod_name_prefix}-{step_name}"
-        else:
-            pod_name = f"{orchestrator_pod_name}-{step_name}"

+            if (
+                settings.pod_name_prefix
+                and not orchestrator_pod_name.startswith(
+                    settings.pod_name_prefix
+                )
+            ):
+                max_length = (
+                    kube_utils.calculate_max_pod_name_length_for_namespace(
+                        namespace=namespace
+                    )
+                )
+                pod_name_prefix = get_orchestrator_run_name(
+                    settings.pod_name_prefix, max_length=max_length
+                )
+                pod_name = f"{pod_name_prefix}-{step_name}"
+            else:
+                pod_name = f"{orchestrator_pod_name}-{step_name}"

-        step_args = StepEntrypointConfiguration.get_entrypoint_arguments(
-            step_name=step_name, deployment_id=deployment.id
-        )
+            pod_name = kube_utils.sanitize_pod_name(
+                pod_name, namespace=namespace
+            )

-            memory="400Mi",
-            pod_settings=settings.pod_settings,
-        )
+            image = KubernetesOrchestrator.get_image(
+                deployment=deployment, step_name=step_name
+            )
+            step_args = StepEntrypointConfiguration.get_entrypoint_arguments(
+                step_name=step_name, deployment_id=deployment.id
+            )

-                        }
-                    },
-                }
+            # We set some default minimum memory resource requests for the step pod
+            # here if the user has not specified any, because the step pod takes up
+            # some memory resources itself and, if not specified, the pod will be
+            # scheduled on any node regardless of available memory and risk
+            # negatively impacting or even crashing the node due to memory pressure.
+            pod_settings = (
+                KubernetesOrchestrator.apply_default_resource_requests(
+                    memory="400Mi",
+                    pod_settings=settings.pod_settings,
+                )
             )

-            labels=step_pod_labels,
-        )
+            if orchestrator.config.pass_zenml_token_as_secret:
+                env.pop("ZENML_STORE_API_TOKEN", None)
+                secret_name = orchestrator.get_token_secret_name(deployment.id)
+                pod_settings.env.append(
+                    {
+                        "name": "ZENML_STORE_API_TOKEN",
+                        "valueFrom": {
+                            "secretKeyRef": {
+                                "name": secret_name,
+                                "key": KUBERNETES_SECRET_TOKEN_KEY_NAME,
+                            }
+                        },
+                    }
+                )

+            # Define Kubernetes pod manifest.
+            pod_manifest = build_pod_manifest(
+                pod_name=pod_name,
+                image_name=image,
+                command=step_command,
+                args=step_args,
+                env=env,
+                privileged=settings.privileged,
+                pod_settings=pod_settings,
+                service_account_name=settings.step_pod_service_account_name
+                or settings.service_account_name,
+                mount_local_stores=mount_local_stores,
+                owner_references=owner_references,
+                termination_grace_period_seconds=settings.pod_stop_grace_period,
+                labels=step_pod_labels,
+            )

-        kube_utils.wait_pod(
-            kube_client_fn=lambda: orchestrator.get_kube_client(
-                incluster=True
-            ),
+            kube_utils.create_and_wait_for_pod_to_start(
+                core_api=core_api,
+                pod_display_name=f"pod for step `{step_name}`",
                 pod_name=pod_name,
+                pod_manifest=pod_manifest,
                 namespace=namespace,
+                startup_max_retries=settings.pod_failure_max_retries,
+                startup_failure_delay=settings.pod_failure_retry_delay,
+                startup_failure_backoff=settings.pod_failure_backoff,
+                startup_timeout=settings.pod_startup_timeout,
             )

+            # Wait for pod to finish.
+            logger.info(f"Waiting for pod of step `{step_name}` to finish...")
+            try:
+                kube_utils.wait_pod(
+                    kube_client_fn=lambda: orchestrator.get_kube_client(
+                        incluster=True
+                    ),
+                    pod_name=pod_name,
+                    namespace=namespace,
+                    exit_condition_lambda=kube_utils.pod_is_done,
+                    stream_logs=True,
+                )

+                logger.info(f"Pod for step `{step_name}` completed.")
+            except Exception:
+                logger.error(f"Pod for step `{step_name}` failed.")

-            node_states: The states of the nodes.
-        """
-        try:
-            # Some steps may have failed because the pods could not be created.
-            # We need to check for this and mark the step run as failed if so.
-            pipeline_failed = False
-            failed_step_names = [
-                step_name
-                for step_name, node_state in node_states.items()
-                if node_state == NodeStatus.FAILED
-            ]
-            step_runs = fetch_step_runs_by_names(
-                step_run_names=failed_step_names, pipeline_run=pipeline_run
-            )
+                raise

-                    continue
-                pipeline_failed = True
-                if step_run := step_runs.get(step_name, None):
-                    # Try to update the step run status, if it exists and is in
-                    # a transient state.
-                    if step_run and step_run.status in {
-                        ExecutionStatus.INITIALIZING,
-                        ExecutionStatus.RUNNING,
-                    }:
-                        publish_utils.publish_failed_step_run(step_run.id)
-
-            # If any steps failed and the pipeline run is still in a transient
-            # state, we need to mark it as failed.
-            if pipeline_failed and pipeline_run.status in {
-                ExecutionStatus.INITIALIZING,
-                ExecutionStatus.RUNNING,
-            }:
-                publish_utils.publish_failed_pipeline_run(pipeline_run.id)
-        except AuthorizationException:
-            # If a step of the pipeline failed or all of them completed
-            # successfully, the pipeline run will be finished and the API token
-            # will be invalidated. We catch this exception and do nothing here,
-            # as the pipeline run status will already have been published.
-            pass
-
-    def check_pipeline_cancellation() -> bool:
-        """Check if the pipeline should continue execution.
-
-        Returns:
-            True if execution should continue, False if it should stop.
-        """
-        try:
-            run = client.get_pipeline_run(
-                name_id_or_prefix=pipeline_run.id,
-                project=pipeline_run.project_id,
-                hydrate=False,  # We only need status, not full hydration
-            )
+        def finalize_run(node_states: Dict[str, NodeStatus]) -> None:
+            """Finalize the run.

+            Args:
+                node_states: The states of the nodes.
+            """
+            try:
+                # Some steps may have failed because the pods could not be created.
+                # We need to check for this and mark the step run as failed if so.
+                pipeline_failed = False
+                failed_step_names = [
+                    step_name
+                    for step_name, node_state in node_states.items()
+                    if node_state == NodeStatus.FAILED
+                ]
+                step_runs = fetch_step_runs_by_names(
+                    step_run_names=failed_step_names, pipeline_run=pipeline_run
                 )
-            return False

+                for step_name, node_state in node_states.items():
+                    if node_state != NodeStatus.FAILED:
+                        continue
+
+                    pipeline_failed = True
+
+                    if step_run := step_runs.get(step_name, None):
+                        # Try to update the step run status, if it exists and is in
+                        # a transient state.
+                        if step_run and step_run.status in {
+                            ExecutionStatus.INITIALIZING,
+                            ExecutionStatus.RUNNING,
+                        }:
+                            publish_utils.publish_failed_step_run(step_run.id)
+
+                # If any steps failed and the pipeline run is still in a transient
+                # state, we need to mark it as failed.
+                if pipeline_failed and pipeline_run.status in {
+                    ExecutionStatus.INITIALIZING,
+                    ExecutionStatus.RUNNING,
+                }:
+                    publish_utils.publish_failed_pipeline_run(pipeline_run.id)
+            except AuthorizationException:
+                # If a step of the pipeline failed or all of them completed
+                # successfully, the pipeline run will be finished and the API token
+                # will be invalidated. We catch this exception and do nothing here,
+                # as the pipeline run status will already have been published.
+                pass
+
+        def check_pipeline_cancellation() -> bool:
+            """Check if the pipeline should continue execution.

+            Returns:
+                True if execution should continue, False if it should stop.
+            """
+            try:
+                run = client.get_pipeline_run(
+                    name_id_or_prefix=pipeline_run.id,
+                    project=pipeline_run.project_id,
+                    hydrate=False,  # We only need status, not full hydration
+                )

+                # If the run is STOPPING or STOPPED, we should stop the execution
+                if run.status in [
+                    ExecutionStatus.STOPPING,
+                    ExecutionStatus.STOPPED,
+                ]:
+                    logger.info(
+                        f"Pipeline run is in {run.status} state, stopping execution"
+                    )
+                    return False

-        dag=pipeline_dag,
-        run_fn=run_step_on_kubernetes,
-        preparation_fn=pre_step_run,
-        finalize_fn=finalize_run,
-        continue_fn=check_pipeline_cancellation,
-        parallel_node_startup_waiting_period=parallel_node_startup_waiting_period,
-        max_parallelism=pipeline_settings.max_parallelism,
-    ).run()
-    logger.info("Orchestration pod completed.")
-    finally:
-        if (
-            orchestrator.config.pass_zenml_token_as_secret
-            and deployment.schedule is None
-        ):
-            secret_name = orchestrator.get_token_secret_name(deployment.id)
-            try:
-                kube_utils.delete_secret(
-                    core_api=core_api,
-                    namespace=namespace,
-                    secret_name=secret_name,
+                return True
+
+            except Exception as e:
+                # If we can't check the status, assume we should continue
+                logger.warning(
+                    f"Failed to check pipeline cancellation status: {e}"
                 )
+                return True
+
+        parallel_node_startup_waiting_period = (
+            orchestrator.config.parallel_step_startup_waiting_period or 0.0
+        )
+
+        pipeline_dag = {
+            step_name: step.spec.upstream_steps
+            for step_name, step in deployment.step_configurations.items()
+        }
+        try:
+            ThreadedDagRunner(
+                dag=pipeline_dag,
+                run_fn=run_step_on_kubernetes,
+                preparation_fn=pre_step_run,
+                finalize_fn=finalize_run,
+                continue_fn=check_pipeline_cancellation,
+                parallel_node_startup_waiting_period=parallel_node_startup_waiting_period,
+                max_parallelism=pipeline_settings.max_parallelism,
+            ).run()
+            logger.info("Orchestration pod completed.")
+        finally:
+            if (
+                orchestrator.config.pass_zenml_token_as_secret
+                and deployment.schedule is None
+            ):
+                secret_name = orchestrator.get_token_secret_name(deployment.id)
+                try:
+                    kube_utils.delete_secret(
+                        core_api=core_api,
+                        namespace=namespace,
+                        secret_name=secret_name,
+                    )
+                except ApiException as e:
+                    logger.error(
+                        f"Error cleaning up secret {secret_name}: {e}"
+                    )


 if __name__ == "__main__":
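For reference, the step-pod startup logic in the entrypoint above reads several new retry and timeout knobs from KubernetesOrchestratorSettings (pod_failure_max_retries, pod_failure_retry_delay, pod_failure_backoff, pod_startup_timeout, pod_stop_grace_period, plus prevent_orchestrator_pod_caching for the orchestrator-side cache check). A hedged sketch of how such settings might be attached to a pipeline; the field names and the "orchestrator.kubernetes" settings key come from the diff above, while the concrete values and the pipeline wiring are illustrative assumptions:

from zenml import pipeline
from zenml.integrations.kubernetes.flavors import KubernetesOrchestratorSettings

# Illustrative values; only the field names are taken from the diff above.
kubernetes_settings = KubernetesOrchestratorSettings(
    pod_name_prefix="nightly-run",
    pod_startup_timeout=600,
    pod_failure_max_retries=3,
    pod_failure_retry_delay=10,
    pod_failure_backoff=1.5,
    pod_stop_grace_period=30,
    prevent_orchestrator_pod_caching=False,
)


@pipeline(settings={"orchestrator.kubernetes": kubernetes_settings})
def my_pipeline() -> None:
    ...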