tangle-cli 0.0.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tangle_cli/__init__.py +19 -0
- tangle_cli/api_cli.py +787 -0
- tangle_cli/api_schema.py +633 -0
- tangle_cli/api_transport.py +461 -0
- tangle_cli/args_container.py +244 -0
- tangle_cli/artifacts.py +293 -0
- tangle_cli/artifacts_cli.py +108 -0
- tangle_cli/cli.py +57 -0
- tangle_cli/cli_helpers.py +116 -0
- tangle_cli/cli_options.py +52 -0
- tangle_cli/client.py +677 -0
- tangle_cli/component_from_func.py +1856 -0
- tangle_cli/component_generator.py +298 -0
- tangle_cli/component_inspector.py +494 -0
- tangle_cli/component_publisher.py +921 -0
- tangle_cli/components_cli.py +269 -0
- tangle_cli/dynamic_discovery_client.py +296 -0
- tangle_cli/generated_model_extensions.py +405 -0
- tangle_cli/generated_runtime.py +43 -0
- tangle_cli/handler.py +96 -0
- tangle_cli/hydration_trust.py +222 -0
- tangle_cli/logger.py +166 -0
- tangle_cli/models.py +407 -0
- tangle_cli/module_bundler.py +662 -0
- tangle_cli/openapi/__init__.py +0 -0
- tangle_cli/openapi/codegen.py +1090 -0
- tangle_cli/openapi/parser.py +77 -0
- tangle_cli/pipeline_dehydrator.py +720 -0
- tangle_cli/pipeline_hydrator.py +1785 -0
- tangle_cli/pipeline_run_annotations.py +41 -0
- tangle_cli/pipeline_run_details.py +203 -0
- tangle_cli/pipeline_run_manager.py +1994 -0
- tangle_cli/pipeline_run_search.py +712 -0
- tangle_cli/pipeline_runner.py +620 -0
- tangle_cli/pipeline_runs_cli.py +584 -0
- tangle_cli/pipelines.py +581 -0
- tangle_cli/pipelines_cli.py +271 -0
- tangle_cli/published_components_cli.py +373 -0
- tangle_cli/py.typed +0 -0
- tangle_cli/quickstart.py +110 -0
- tangle_cli/secrets.py +156 -0
- tangle_cli/secrets_cli.py +269 -0
- tangle_cli/utils.py +942 -0
- tangle_cli/version_manager.py +470 -0
- tangle_cli-0.0.1a1.dist-info/METADATA +561 -0
- tangle_cli-0.0.1a1.dist-info/RECORD +48 -0
- tangle_cli-0.0.1a1.dist-info/WHEEL +4 -0
- tangle_cli-0.0.1a1.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,1994 @@
|
|
|
1
|
+
"""Generic pipeline-run helpers for `tangle sdk pipeline-runs`.
|
|
2
|
+
|
|
3
|
+
This module ports the OSS-safe parts of tangle-deploy's runner/run details
|
|
4
|
+
commands while keeping downstream-specific behavior behind hooks. The default
|
|
5
|
+
implementation uses only the public Tangle API and local files; cloud storage,
|
|
6
|
+
notifications, scheduler, mutex, run-as annotation defaults, and alternate log
|
|
7
|
+
backends are intentionally extension points rather than OSS behavior.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import copy
|
|
13
|
+
import inspect
|
|
14
|
+
import json
|
|
15
|
+
import re
|
|
16
|
+
import time
|
|
17
|
+
import uuid
|
|
18
|
+
from collections.abc import Callable
|
|
19
|
+
from contextlib import AbstractContextManager, nullcontext
|
|
20
|
+
from dataclasses import dataclass, field
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Any, Mapping
|
|
23
|
+
|
|
24
|
+
import yaml
|
|
25
|
+
|
|
26
|
+
from .handler import TangleCliHandler
|
|
27
|
+
from .logger import Logger, get_default_logger
|
|
28
|
+
from .pipeline_dehydrator import DehydrateChoice, PipelineDehydrator
|
|
29
|
+
from .pipeline_hydrator import HydrationError, PipelineHydrator
|
|
30
|
+
from .pipeline_run_details import PipelineRunDetails
|
|
31
|
+
from .pipeline_run_search import PipelineRunSearch
|
|
32
|
+
from .utils import dump_yaml
|
|
33
|
+
|
|
34
|
+
# Status names treated as final. Compared against upper-cased status text by
# the wait-outcome and manager helpers in this module.
_TERMINAL_STATUSES = (
    "FAILED",
    "SYSTEM_ERROR",
    "CANCELLED",
    "CANCELED",
    "SKIPPED",
    "SUCCEEDED",
    "INVALID",
)
# NOTE(review): presumably the in-flight counterparts of the terminal set
# (both spellings of "cancelling" are listed) - confirm against callers.
_ACTIVE_STATUSES = (
    "RUNNING",
    "CANCELLING",
    "CANCELING",
    "PENDING",
    "QUEUED",
)
# Statuses whose non-zero count triggers the opt-in fail-fast early exit.
_FAILURE_EARLY_EXIT_STATUSES = ("FAILED", "SYSTEM_ERROR")
# Metadata keys for execution-state timing bookkeeping.
_EXECUTION_STATE_TIMINGS_METADATA_KEY = "execution_state_timings"
_EXECUTION_STATE_TIMING_MONOTONIC_METADATA_KEY = "_execution_state_timing_monotonic"
# Annotation key carrying the per-submit identifier.
_SUBMISSION_ID_ANNOTATION_KEY = "tangle-cli/submission-id"
# Submit-recovery lookup tuning: number of attempts and delay between them.
_SUBMIT_RECOVERY_LOOKUP_ATTEMPTS = 2
_SUBMIT_RECOVERY_LOOKUP_DELAY_SECONDS = 0.1
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class PipelineRunError(RuntimeError):
    """Base error for pipeline-run operations that cannot complete.

    The more specific pipeline-run errors in this module derive from it, so
    callers can catch this single type.
    """
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class UnsupportedPipelineRunFeatureError(PipelineRunError):
    """Signal a downstream extension point that the OSS defaults do not implement.

    The default hooks raise this for ``gs://`` pipeline paths and for a
    non-empty ``run_as`` value.
    """
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class AmbiguousPipelineRunRecoveryError(PipelineRunError):
    """Signal that submit recovery matched more than one run for a submission id."""
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class PipelineSubmitPayload:
    """Mutable holder for everything assembled ahead of ``pipeline_runs_create``.

    Hooks may adjust the spec, runtime arguments, run name and annotations;
    :meth:`to_body` then yields the single canonical body shape to submit.
    """

    prepared_spec: dict[str, Any]
    pipeline_spec: dict[str, Any]
    run_args: dict[str, Any] | None
    root_task: dict[str, Any]
    annotations: dict[str, str]
    run_name: str | None = None

    def to_body(self) -> dict[str, Any]:
        """Return the submit body built from the current root task and annotations."""

        body: dict[str, Any] = {}
        body["root_task"] = self.root_task
        body["annotations"] = self.annotations
        return body

    def sync_from_body(self, body: Mapping[str, Any]) -> None:
        """Pull derived fields back out of ``body`` after it was normalized in place.

        ``root_task`` and ``annotations`` are only replaced when the body holds
        dict values for them. ``pipeline_spec`` and ``run_name`` are refreshed
        only when ``root_task.componentRef.spec`` is a dict; an empty or
        non-string spec name resets ``run_name`` to ``None``.
        """

        new_root_task = body.get("root_task")
        if isinstance(new_root_task, dict):
            self.root_task = new_root_task

        new_annotations = body.get("annotations")
        if isinstance(new_annotations, dict):
            # Annotation keys and values are always stored as strings.
            self.annotations = {str(name): str(text) for name, text in new_annotations.items()}

        if not isinstance(self.root_task, Mapping):
            return
        reference = self.root_task.get("componentRef")
        if not isinstance(reference, Mapping):
            return
        spec = reference.get("spec")
        if not isinstance(spec, dict):
            return

        self.pipeline_spec = spec
        name = spec.get("name")
        if isinstance(name, str) and name:
            self.run_name = name
        else:
            self.run_name = None
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@dataclass(frozen=True)
class PipelineWaitOutcome:
    """Normalized wait result attached to a run context.

    This is the generic result boundary for wait lifecycle decisions.
    Downstreams can format legacy result dictionaries or notifications from
    this typed outcome without inventing their own metadata flags for success,
    timeout, failure counts, or fail-fast early exit.

    Fields:
        status: Last observed status text, if any.
        timed_out: True when the wait result reports a timeout.
        early_exit: True when the wait stopped early (fail-fast).
        failed_count: Count recorded under the ``FAILED`` status.
        error_count: Count recorded under the ``SYSTEM_ERROR`` status.
        elapsed_seconds: Seconds reported for the wait.
        success_override: Explicit verdict that takes precedence in
            :attr:`success` when it is not ``None``.
    """

    status: str | None = None
    timed_out: bool = False
    early_exit: bool = False
    failed_count: int = 0
    error_count: int = 0
    elapsed_seconds: float = 0.0
    success_override: bool | None = None

    @property
    def success(self) -> bool | None:
        """Return generic success for completed waits, or None for timeout/unknown."""

        if self.success_override is not None:
            return self.success_override
        if self.timed_out:
            return None
        if self.early_exit or self.failed_count > 0 or self.error_count > 0:
            return False
        status = str(self.status or "").upper()
        if status == "SUCCEEDED":
            return True
        if status in _TERMINAL_STATUSES:
            return False
        return None

    @staticmethod
    def _to_int(value: Any) -> int:
        """Convert ``value`` to int, treating falsy or unparsable values as 0."""

        try:
            return int(value or 0)
        except (TypeError, ValueError):
            return 0

    @staticmethod
    def _to_float(value: Any) -> float:
        """Convert ``value`` to float, treating falsy or unparsable values as 0.0."""

        try:
            return float(value or 0.0)
        except (TypeError, ValueError):
            return 0.0

    @staticmethod
    def _count_statuses(status_counts: Mapping[str, Any], *statuses: str) -> int:
        """Sum the counts stored under ``statuses``, skipping unparsable values."""

        total = 0
        for status in statuses:
            try:
                total += int(status_counts.get(status, 0) or 0)
            except (TypeError, ValueError):
                continue
        return total

    @classmethod
    def _success_override_from_counts(
        cls,
        status_counts: Mapping[str, Any],
        *,
        terminal: bool,
        total: int,
    ) -> bool | None:
        """Derive an explicit success verdict from per-status counts.

        Returns ``None`` when the observation is not terminal or nothing was
        counted, ``False`` when any failed/error/cancelled/invalid count is
        positive, ``True`` when the terminal counts add up to ``total``, and
        ``None`` otherwise.
        """

        if not terminal or total <= 0:
            return None
        unsuccessful = cls._count_statuses(
            status_counts,
            "FAILED",
            "SYSTEM_ERROR",
            "CANCELLED",
            "CANCELED",
            "INVALID",
        )
        if unsuccessful > 0:
            return False
        terminal_count = cls._count_statuses(status_counts, *_TERMINAL_STATUSES)
        if terminal_count == total:
            return True
        return None

    @classmethod
    def from_poll_result(
        cls,
        poll: "PipelineWaitPoll",
        result: Mapping[str, Any],
    ) -> "PipelineWaitOutcome":
        """Build an outcome from a wait poll and public wait result.

        ``status`` comes from ``result`` when present, otherwise from the
        poll. Failure and error counts are read from ``poll.status_counts``
        through the same tolerant counting helper used elsewhere in this
        class, so a non-numeric count is ignored instead of raising.
        """

        timed_out = bool(result.get("timed_out"))
        early_exit = bool(result.get("early_exit"))
        success_override = cls._success_override_from_counts(
            poll.status_counts,
            terminal=poll.terminal and not timed_out,
            total=poll.total,
        )
        if early_exit and poll.total == 0:
            # An early exit with nothing counted is reported as a plain
            # failure rather than as a fail-fast early exit.
            early_exit = False
            success_override = False
        return cls(
            status=str(result.get("status")) if result.get("status") is not None else poll.status,
            timed_out=timed_out,
            early_exit=early_exit,
            failed_count=cls._count_statuses(poll.status_counts, "FAILED"),
            error_count=cls._count_statuses(poll.status_counts, "SYSTEM_ERROR"),
            elapsed_seconds=poll.elapsed_seconds,
            success_override=success_override,
        )

    @classmethod
    def from_wait_result(
        cls,
        result: Mapping[str, Any],
        metadata: Mapping[str, Any] | None = None,
    ) -> "PipelineWaitOutcome":
        """Build an outcome from a public wait result and optional metadata.

        Counts, flags and elapsed time are read from ``metadata`` when it is
        non-empty and from ``result`` otherwise; ``status`` always comes from
        ``result``. Explicit ``failed_count`` / ``error_count`` entries win
        over values derived from ``status_counts``. Non-numeric counts or
        elapsed values fall back to zero instead of raising.
        """

        source = metadata or result
        status = str(result.get("status")) if result.get("status") is not None else None
        timed_out = bool(result.get("timed_out") or source.get("timed_out"))
        early_exit = bool(result.get("early_exit") or source.get("early_exit"))
        status_counts = source.get("status_counts")
        status_counts = status_counts if isinstance(status_counts, Mapping) else {}
        total = 0
        for count in status_counts.values():
            try:
                total += int(count or 0)
            except (TypeError, ValueError):
                continue
        terminal = bool(status and (status.upper() == "ENDED" or status.upper() in _TERMINAL_STATUSES))
        success_override = cls._success_override_from_counts(
            status_counts,
            terminal=terminal and not timed_out,
            total=total,
        )
        if early_exit and total == 0:
            # Same normalization as in from_poll_result.
            early_exit = False
            success_override = False
        failed_count = cls._to_int(
            source.get(
                "failed_count",
                result.get("failed_count", cls._count_statuses(status_counts, "FAILED")),
            )
        )
        error_count = cls._to_int(
            source.get(
                "error_count",
                result.get("error_count", cls._count_statuses(status_counts, "SYSTEM_ERROR")),
            )
        )
        return cls(
            status=status,
            timed_out=timed_out,
            early_exit=early_exit,
            failed_count=failed_count,
            error_count=error_count,
            elapsed_seconds=cls._to_float(source.get("elapsed_seconds", 0.0)),
            success_override=success_override,
        )
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
@dataclass
class PipelineRunContext:
    """State carried through one attempt of a pipeline run's lifecycle.

    Gives downstreams a first-class place for mutex ownership,
    graceful-shutdown state, notifications, retries and scheduled-timeout
    bookkeeping, instead of reading transient manager attributes.

    Fields:
        run_id: Id of the submitted run, once an attempt reaches submit.
        run_name: Display/pipeline name taken from the submitted spec.
        root_execution_id: Root execution id returned by the submit API.
        pipeline_path: Source path or URI of the run, when path-backed.
        start_time: Wall-clock start time of the attempt.
        attempt: 1-based attempt number for submit/wait/retry hooks.
        submit_body: Submit body of this attempt after normalization.
        pipeline_spec: Pipeline spec extracted from ``submit_body``.
        response: Submit API response of this attempt, when available.
        wait_outcome: Wait result of this attempt, when wait ran.
        previous_context: Context of the previous attempt. This includes
            attempts that failed during submit before any ``run_id`` existed,
            not only previously submitted runs.
        previous_error: Error of the previous attempt that caused this retry.
        carry_resource_to_retry: Resource/mutex handoff flag set by hooks when
            a resource should stay held for the replacement attempt. The
            current attempt can then skip release and the next attempt can
            look at ``previous_context`` to reuse the resource.
        metadata: Additional hook-specific state.
    """

    # Identity of the submitted run.
    run_id: str | None = None
    run_name: str | None = None
    root_execution_id: str | None = None
    pipeline_path: str | Path | None = None
    # Attempt bookkeeping.
    start_time: float | None = None
    attempt: int = 1
    # Submit request and response of this attempt.
    submit_body: dict[str, Any] | None = None
    pipeline_spec: dict[str, Any] | None = None
    response: dict[str, Any] | None = None
    wait_outcome: PipelineWaitOutcome | None = None
    # Retry chain.
    previous_context: "PipelineRunContext | None" = None
    previous_error: Exception | None = None
    carry_resource_to_retry: bool = False
    # Fresh dict per instance.
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
@dataclass
class PipelineWaitPoll:
    """Snapshot of a single wait-loop observation handed to lifecycle hooks."""

    # Id of the run being polled.
    run_id: str
    # Run data observed for this poll.
    run: dict[str, Any]
    # Status text observed for this poll.
    status: str
    # Per-status counts, keyed by status name.
    status_counts: dict[str, int]
    # Compared with the summed terminal counts when deriving success.
    total: int
    # Whether this observation is considered terminal.
    terminal: bool
    # Graph-state data, when available.
    graph_state: dict[str, Any] | None = None
    elapsed_seconds: float = 0.0
    # Fresh dict per instance.
    execution_state_timings: dict[str, dict[str, Any]] = field(default_factory=dict)
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
@dataclass
class PipelineRunHooks:
    """Default, overridable extension seams for the generic pipeline-run manager.

    Subclasses override individual methods to add provider-specific auth
    wrappers, cloud-object loading, JOB_CONFIG time input, run-as annotations,
    mutex/schedule behavior, graceful shutdown, notifications, hosted logs, or
    from-container runtime defaults, without forking the manager itself.
    Unless stated otherwise, each default below is a no-op or a pass-through.
    """

    logger: Logger = field(default_factory=get_default_logger)
    trusted_python_sources: list[str] = field(default_factory=list)
    allow_all_hydration: bool = False

    def read_pipeline_yaml(self, pipeline_path: str | Path) -> dict[str, Any]:
        """Load a local pipeline YAML file and return its top-level mapping.

        Raises:
            UnsupportedPipelineRunFeatureError: for ``gs://`` paths.
            PipelineRunError: when the document is not a mapping.
        """

        if str(pipeline_path).startswith("gs://"):
            raise UnsupportedPipelineRunFeatureError(
                "gs:// pipeline loading is not supported by the OSS CLI default hooks"
            )
        with Path(pipeline_path).open(encoding="utf-8") as stream:
            document = yaml.safe_load(stream)
        if isinstance(document, dict):
            return document
        raise PipelineRunError("Pipeline YAML must contain a top-level mapping")

    def hydrate_pipeline(
        self,
        pipeline_path: str | Path,
        *,
        resolution_overrides: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """Hydrate ``pipeline_path`` with ``PipelineHydrator`` and return its data.

        The API client is taken from a ``client`` attribute on the hooks
        object, falling back to ``_get_client()`` when such a method exists.

        Raises:
            PipelineRunError: when no client is available, or wrapping a
                ``HydrationError`` raised by the hydrator.
        """

        api_client = getattr(self, "client", None)
        if api_client is None and hasattr(self, "_get_client"):
            api_client = self._get_client()
        if api_client is None:
            raise PipelineRunError("Failed to create TangleApiClient")

        hydrator = PipelineHydrator(
            client=api_client,
            resolution_overrides=resolution_overrides,
            logger=self.logger,
            trusted_python_sources=self.trusted_python_sources,
            allow_all_hydration=self.allow_all_hydration,
        )
        try:
            hydrated = hydrator.hydrate_file(pipeline_path)
            return hydrated.data
        except HydrationError as exc:
            raise PipelineRunError(str(exc)) from exc

    def prepare_pipeline_spec(
        self,
        pipeline_spec: dict[str, Any],
        *,
        pipeline_path: str | Path | None,
        run_args: dict[str, Any] | None,
        hydrate: bool,
    ) -> dict[str, Any]:
        """Seam for validation/hydration/layout/annotation transforms of the spec.

        The default hands back the already-loaded spec untouched. Overrides
        can run schema validation, auto-layout, source annotations, or other
        pre-submit preparation ahead of the generic payload conversion.
        """

        return pipeline_spec

    def prepare_run_arguments(
        self,
        pipeline_spec: dict[str, Any],
        run_args: dict[str, Any] | None,
    ) -> dict[str, Any] | None:
        """Seam for JOB_CONFIG time input / scheduled runtime behavior.

        The default returns ``run_args`` unchanged.
        """

        return run_args

    def transform_run_name(
        self,
        run_name: str,
        *,
        pipeline_spec: dict[str, Any],
        run_args: dict[str, Any] | None,
    ) -> str:
        """Seam for run-name policies applied after template expansion.

        The default returns ``run_name`` unchanged.
        """

        return run_name

    def extra_submit_annotations(
        self,
        *,
        pipeline_spec: dict[str, Any],
        pipeline_path: str | Path | None,
        run_as: str | None = None,
    ) -> dict[str, str]:
        """Seam for source/run-as/git annotations added at submit time.

        The default contributes no annotations and rejects any truthy
        ``run_as`` with ``UnsupportedPipelineRunFeatureError``.
        """

        if not run_as:
            return {}
        raise UnsupportedPipelineRunFeatureError(
            "--run-as is a downstream extension point and has no OSS default behavior"
        )

    def before_submit(self, pipeline_spec: dict[str, Any]) -> None:
        """Legacy spec-only pre-submit hook, kept for existing downstreams."""

    def before_submit_context(self, context: PipelineRunContext) -> None:
        """Context-aware pre-submit hook (e.g. mutex/overlap checks).

        The default forwards to :meth:`before_submit` when the context
        carries a pipeline spec.
        """

        spec = context.pipeline_spec
        if spec is None:
            return
        self.before_submit(spec)

    def after_submit(self, response: Mapping[str, Any]) -> None:
        """Legacy response-only post-submit hook (start notifications)."""

    def after_submit_context(self, context: PipelineRunContext) -> None:
        """Context-aware post-submit hook.

        The default forwards to :meth:`after_submit` when the context
        carries a submit response.
        """

        submit_response = context.response
        if submit_response is None:
            return
        self.after_submit(submit_response)

    def on_submit_error(
        self,
        error: Exception,
        *,
        context: PipelineRunContext,
    ) -> None:
        """Seam for submit-error notifications or cleanup."""

    def around_run(self, context: PipelineRunContext) -> AbstractContextManager[Any]:
        """Return the context manager that owns the run lifecycle (e.g. a mutex).

        The default is a ``nullcontext``.
        """

        return nullcontext()

    def before_run_lifecycle(self, context: PipelineRunContext) -> None:
        """Hook invoked before an attempt enters the lifecycle context."""

    def after_run_lifecycle(
        self,
        context: PipelineRunContext,
        *,
        success: bool,
        error: Exception | None = None,
    ) -> None:
        """Hook invoked once the lifecycle context has exited."""

    def on_fail_fast_before_release(
        self,
        context: PipelineRunContext,
        error: Exception,
    ) -> None:
        """Hook invoked before lifecycle release when fail-fast aborts a run."""

    def before_retry(
        self,
        context: PipelineRunContext,
        error: Exception,
        *,
        next_attempt: int,
    ) -> None:
        """Hook invoked before a failed submit/run attempt is retried."""

    def after_retry_submit(self, context: PipelineRunContext) -> None:
        """Hook invoked after a retry has submitted a new run."""

    def should_cancel_previous_run(
        self,
        context: PipelineRunContext,
        error: Exception,
        *,
        next_attempt: int,
    ) -> bool:
        """Tell whether a retry should cancel the previous run first.

        The default answers ``False``.
        """

        return False

    def before_wait(self, context: PipelineRunContext) -> None:
        """Hook invoked before a run is polled."""

    def after_poll(self, poll: PipelineWaitPoll, context: PipelineRunContext) -> None:
        """Hook invoked after every run/graph-state poll."""

    def should_exit_early(self, poll: PipelineWaitPoll, context: PipelineRunContext) -> bool:
        """Tell whether waiting should stop before terminal state or timeout.

        Fail-fast is opt-in: unless ``context.metadata`` holds a truthy
        ``exit_on_first_failure`` entry the answer is ``False``. With the
        flag set, the answer is ``True`` as soon as the poll reports a
        positive count for any fail-fast status.
        """

        if not context.metadata.get("exit_on_first_failure"):
            return False
        for failure_status in _FAILURE_EARLY_EXIT_STATUSES:
            if int(poll.status_counts.get(failure_status, 0) or 0) > 0:
                return True
        return False

    def on_timeout(self, poll: PipelineWaitPoll, context: PipelineRunContext) -> None:
        """Hook invoked when the wait reaches ``max_wait``."""

    def on_terminal(self, poll: PipelineWaitPoll, context: PipelineRunContext) -> None:
        """Hook invoked when the wait observes a terminal state."""

    def on_early_exit_before_release(
        self,
        poll: PipelineWaitPoll,
        context: PipelineRunContext,
    ) -> None:
        """Hook invoked on fail-fast early exit, before lifecycle release."""

    def after_wait(self, result: Mapping[str, Any]) -> None:
        """Legacy result-only hook for terminal notifications."""

    def wait_outcome(
        self,
        poll: PipelineWaitPoll,
        result: Mapping[str, Any],
        context: PipelineRunContext,
    ) -> PipelineWaitOutcome:
        """Build the typed wait outcome to attach to the run context.

        The default ignores ``context`` and delegates to
        ``PipelineWaitOutcome.from_poll_result``.
        """

        del context
        return PipelineWaitOutcome.from_poll_result(poll, result)

    def after_wait_context(self, result: Mapping[str, Any], context: PipelineRunContext) -> None:
        """Context-aware hook invoked after the wait returns.

        To keep the legacy contract, :meth:`after_wait` is forwarded to only
        for terminal observations (status ``ENDED`` or a terminal status),
        never for timeouts or early exits. Downstreams interested in those
        outcomes should override ``on_timeout``,
        ``on_early_exit_before_release``, or this method.
        """

        if result.get("timed_out") or result.get("early_exit"):
            return
        raw_status = result.get("status")
        normalized = str(raw_status).upper() if raw_status else None
        if normalized == "ENDED" or normalized in _TERMINAL_STATUSES:
            self.after_wait(result)

    def should_enforce_max_wait(self, context: PipelineRunContext) -> bool:
        """Tell whether ``max_wait`` applies; the default answers ``True``.

        Overrides can return ``False`` for downstream-controlled scheduled
        timeout policies.
        """

        return True

    def poll_run_snapshot(
        self,
        manager: "PipelineRunManager",
        run_id: str,
        context: PipelineRunContext,
    ) -> Mapping[str, Any] | None:
        """Optionally supply a run-like snapshot for wait polling.

        Downstreams whose wait API is rooted at an execution id can return a
        synthetic snapshot here so the manager need not call
        ``pipeline_runs_get(run_id)``. The default returns ``None``.
        """

        return None

    def graph_state_execution_id(
        self,
        run: Mapping[str, Any],
        context: PipelineRunContext,
    ) -> str | None:
        """Pick the execution id used for graph-state polling.

        Prefers the run's ``root_execution_id`` and falls back to the one on
        the context; the chosen value is returned as a string, or ``None``
        when neither is set.
        """

        chosen = run.get("root_execution_id") or context.root_execution_id
        if chosen is None:
            return None
        return str(chosen)

    def on_poll_error(self, error: Exception, context: PipelineRunContext) -> float | None:
        """React to a polling error.

        Return a sleep interval to retry, or ``None`` to let the error
        propagate. The default returns ``None``.
        """

        return None

    def fetch_logs(self, client: Any, execution_id: str) -> Any:
        """Fetch logs for an execution; the default uses the Tangle API only.

        Alternate log providers can override this.
        """

        return client.executions_container_log(execution_id)
|
|
580
|
+
|
|
581
|
+
|
|
582
|
+
@dataclass
|
|
583
|
+
class PipelineRunManager(TangleCliHandler):
|
|
584
|
+
client: Any
|
|
585
|
+
hooks: PipelineRunHooks = field(default_factory=PipelineRunHooks)
|
|
586
|
+
logger: Logger = field(default_factory=get_default_logger)
|
|
587
|
+
base_url: str | None = None
|
|
588
|
+
|
|
589
|
+
def __post_init__(self) -> None:
    """Initialise the handler base and share the client with the hooks object.

    The dataclass-generated ``__init__`` does not call the base initialiser,
    so it is invoked explicitly here. The client is then attached to the hooks
    object unless the hooks object is this manager itself.
    """

    TangleCliHandler.__init__(
        self,
        client=self.client,
        logger=self.logger,
        base_url=self.base_url,
    )
    if self.hooks is self:
        return
    self.hooks.client = self.client
|
|
598
|
+
|
|
599
|
+
@staticmethod
|
|
600
|
+
def to_plain(value: Any) -> Any:
|
|
601
|
+
if isinstance(value, Mapping):
|
|
602
|
+
return {key: PipelineRunManager.to_plain(val) for key, val in value.items()}
|
|
603
|
+
if hasattr(value, "to_dict"):
|
|
604
|
+
return value.to_dict()
|
|
605
|
+
if hasattr(value, "model_dump"):
|
|
606
|
+
return value.model_dump(by_alias=True)
|
|
607
|
+
if isinstance(value, list):
|
|
608
|
+
return [PipelineRunManager.to_plain(item) for item in value]
|
|
609
|
+
if hasattr(value, "__dict__"):
|
|
610
|
+
return {
|
|
611
|
+
key: PipelineRunManager.to_plain(val)
|
|
612
|
+
for key, val in vars(value).items()
|
|
613
|
+
if not key.startswith("_")
|
|
614
|
+
}
|
|
615
|
+
return value
|
|
616
|
+
|
|
617
|
+
@staticmethod
|
|
618
|
+
def extract_default_arguments(pipeline_spec: dict[str, Any]) -> dict[str, Any]:
|
|
619
|
+
arguments: dict[str, Any] = {}
|
|
620
|
+
inputs = pipeline_spec.get("inputs", [])
|
|
621
|
+
if isinstance(inputs, list):
|
|
622
|
+
for input_item in inputs:
|
|
623
|
+
if isinstance(input_item, dict) and "name" in input_item and "default" in input_item:
|
|
624
|
+
arguments[input_item["name"]] = input_item["default"]
|
|
625
|
+
return arguments
|
|
626
|
+
|
|
627
|
+
@staticmethod
|
|
628
|
+
def convert_yaml_to_payload(
|
|
629
|
+
pipeline_spec: dict[str, Any],
|
|
630
|
+
run_args: dict[str, Any] | None = None,
|
|
631
|
+
) -> dict[str, Any]:
|
|
632
|
+
payload: dict[str, Any] = {"root_task": {"componentRef": {"spec": pipeline_spec}}}
|
|
633
|
+
arguments = PipelineRunManager.extract_default_arguments(pipeline_spec)
|
|
634
|
+
if run_args:
|
|
635
|
+
arguments.update(run_args)
|
|
636
|
+
|
|
637
|
+
pipeline_inputs = pipeline_spec.get("inputs", [])
|
|
638
|
+
valid_inputs = {inp.get("name") for inp in pipeline_inputs if isinstance(inp, dict) and inp.get("name")}
|
|
639
|
+
if valid_inputs:
|
|
640
|
+
arguments = {key: value for key, value in arguments.items() if key in valid_inputs}
|
|
641
|
+
|
|
642
|
+
missing: list[str] = []
|
|
643
|
+
for input_item in pipeline_inputs if isinstance(pipeline_inputs, list) else []:
|
|
644
|
+
if not isinstance(input_item, dict):
|
|
645
|
+
continue
|
|
646
|
+
name = input_item.get("name")
|
|
647
|
+
if name and "default" not in input_item and not input_item.get("optional", False) and name not in arguments:
|
|
648
|
+
missing.append(name)
|
|
649
|
+
if missing:
|
|
650
|
+
raise PipelineRunError(
|
|
651
|
+
f"Missing {len(missing)} required pipeline input(s): {', '.join(sorted(missing))}"
|
|
652
|
+
)
|
|
653
|
+
|
|
654
|
+
if arguments:
|
|
655
|
+
payload["root_task"]["arguments"] = arguments
|
|
656
|
+
return payload
|
|
657
|
+
|
|
658
|
+
@staticmethod
|
|
659
|
+
def sanitize_submit_payload(value: Any) -> Any:
|
|
660
|
+
"""Return a submit-safe payload with TD-compatible componentRef fixes.
|
|
661
|
+
|
|
662
|
+
The hydrator uses explicit local-only annotations such as
|
|
663
|
+
``_source_dir`` while recursively resolving local files. Those
|
|
664
|
+
provenance keys must not be submitted to the backend. User-supplied
|
|
665
|
+
underscore-prefixed payload keys are otherwise valid and preserved.
|
|
666
|
+
TD also normalizes ``componentRef.text`` into ``componentRef.spec``
|
|
667
|
+
for component-library entries before submit; keep the same behavior
|
|
668
|
+
here.
|
|
669
|
+
"""
|
|
670
|
+
|
|
671
|
+
if isinstance(value, list):
|
|
672
|
+
return [PipelineRunManager.sanitize_submit_payload(item) for item in value]
|
|
673
|
+
if not isinstance(value, dict):
|
|
674
|
+
return value
|
|
675
|
+
|
|
676
|
+
local_only_keys = {"_source_dir", "_recursive_params"}
|
|
677
|
+
cleaned: dict[str, Any] = {}
|
|
678
|
+
for key, item in value.items():
|
|
679
|
+
if str(key) in local_only_keys:
|
|
680
|
+
continue
|
|
681
|
+
cleaned[key] = PipelineRunManager.sanitize_submit_payload(item)
|
|
682
|
+
|
|
683
|
+
component_ref = cleaned.get("componentRef")
|
|
684
|
+
if isinstance(component_ref, dict) and "text" in component_ref and not component_ref.get("spec"):
|
|
685
|
+
text_content = component_ref.pop("text")
|
|
686
|
+
if isinstance(text_content, str):
|
|
687
|
+
try:
|
|
688
|
+
component_ref["spec"] = yaml.safe_load(text_content)
|
|
689
|
+
except yaml.YAMLError as exc:
|
|
690
|
+
component_name = component_ref.get("name", "unknown")
|
|
691
|
+
raise PipelineRunError(
|
|
692
|
+
f"Failed to parse YAML in componentRef {component_name!r}: {exc}"
|
|
693
|
+
) from exc
|
|
694
|
+
else:
|
|
695
|
+
component_ref["spec"] = text_content
|
|
696
|
+
component_ref["spec"] = PipelineRunManager.sanitize_submit_payload(component_ref["spec"])
|
|
697
|
+
|
|
698
|
+
return cleaned
|
|
699
|
+
|
|
700
|
+
@staticmethod
def normalize_submit_body_in_place(body: dict[str, Any]) -> dict[str, Any]:
    """Sanitize ``body`` and write the result back into the same dict.

    Runs :meth:`sanitize_submit_payload` on ``body``, then replaces the
    contents of ``body`` with the sanitized mapping so callers holding a
    reference to the dict observe the normalized payload.

    Returns:
        The same ``body`` object that was passed in.

    Raises:
        PipelineRunError: if sanitizing does not produce a dict.
    """

    normalized = PipelineRunManager.sanitize_submit_payload(body)
    if isinstance(normalized, dict):
        body.clear()
        body.update(normalized)
        return body
    raise PipelineRunError("submit body must be a mapping")
|
|
716
|
+
|
|
717
|
+
@staticmethod
|
|
718
|
+
def is_terminal_status(status: str | None) -> bool:
|
|
719
|
+
return bool(status and status.upper() in _TERMINAL_STATUSES)
|
|
720
|
+
|
|
721
|
+
@staticmethod
|
|
722
|
+
def status_counts_from_run(run: Mapping[str, Any]) -> dict[str, int]:
|
|
723
|
+
stats = run.get("execution_status_stats")
|
|
724
|
+
if not isinstance(stats, Mapping):
|
|
725
|
+
return {}
|
|
726
|
+
result: dict[str, int] = {}
|
|
727
|
+
for key, value in stats.items():
|
|
728
|
+
try:
|
|
729
|
+
result[str(key).upper()] = int(value or 0)
|
|
730
|
+
except (TypeError, ValueError):
|
|
731
|
+
continue
|
|
732
|
+
return result
|
|
733
|
+
|
|
734
|
+
@staticmethod
|
|
735
|
+
def _counts_mapping(value: Any) -> Mapping[str, Any] | None:
|
|
736
|
+
if isinstance(value, Mapping):
|
|
737
|
+
return value
|
|
738
|
+
if value is not None and hasattr(value, "items"):
|
|
739
|
+
return value
|
|
740
|
+
return None
|
|
741
|
+
|
|
742
|
+
@staticmethod
def status_counts_from_graph_state(graph_state: Mapping[str, Any] | Any) -> dict[str, int]:
    """Return aggregate status counts from a graph-state response.

    ``graph_state`` may be a mapping or an object with attributes. The
    first of ``status_totals`` / ``execution_status_stats`` that is
    mapping-like is used directly. Otherwise the per-execution counts in
    ``child_execution_status_stats`` are summed per status.

    Status names are upper-cased and falsy counts become ``0``. Entries
    whose count cannot be converted to ``int`` are skipped, matching
    ``status_counts_from_run`` and
    ``execution_status_counts_from_graph_state``, so that one malformed
    value does not abort the whole aggregation.
    """

    def read_field(field: str) -> Any:
        # Support both dict-shaped and attribute-shaped graph states.
        if isinstance(graph_state, Mapping):
            return graph_state.get(field)
        return getattr(graph_state, field, None)

    for field in ("status_totals", "execution_status_stats"):
        counts = PipelineRunManager._counts_mapping(read_field(field))
        if counts is None:
            continue
        direct: dict[str, int] = {}
        for status, count in counts.items():
            try:
                direct[str(status).upper()] = int(count or 0)
            except (TypeError, ValueError):
                continue
        return direct

    totals: dict[str, int] = {}
    child_counts = PipelineRunManager._counts_mapping(read_field("child_execution_status_stats"))
    if child_counts is None:
        return totals
    for stats in child_counts.values():
        counts = PipelineRunManager._counts_mapping(stats)
        if counts is None:
            continue
        for status, count in counts.items():
            try:
                amount = int(count or 0)
            except (TypeError, ValueError):
                continue
            key = str(status).upper()
            totals[key] = totals.get(key, 0) + amount
    return totals
|
|
767
|
+
|
|
768
|
+
@staticmethod
def execution_status_counts_from_graph_state(graph_state: Mapping[str, Any] | Any) -> dict[str, dict[str, int]]:
    """Map each child execution id to its upper-cased integer status counts.

    Reads ``child_execution_status_stats`` from ``graph_state`` (by key for
    mappings, by attribute otherwise). Children whose stats are not
    mapping-like are omitted; individual counts that cannot be converted to
    ``int`` are skipped.
    """

    if isinstance(graph_state, Mapping):
        raw_children = graph_state.get("child_execution_status_stats")
    else:
        raw_children = getattr(graph_state, "child_execution_status_stats", None)

    per_execution: dict[str, dict[str, int]] = {}
    children = PipelineRunManager._counts_mapping(raw_children)
    if children is None:
        return per_execution

    for child_id, raw_counts in children.items():
        mapping = PipelineRunManager._counts_mapping(raw_counts)
        if mapping is None:
            continue
        tallies: dict[str, int] = {}
        for name, amount in mapping.items():
            try:
                tallies[str(name).upper()] = int(amount or 0)
            except (TypeError, ValueError):
                continue
        per_execution[str(child_id)] = tallies
    return per_execution
|
|
793
|
+
|
|
794
|
+
@staticmethod
def status_from_counts(status_counts: Mapping[str, int]) -> str | None:
    """Pick one representative status from a counts mapping.

    Active statuses are checked before terminal ones, each in the iteration
    order of its constant; the first status with a positive count wins.
    Returns ``None`` when no listed status has a positive count.
    """
    for group in (_ACTIVE_STATUSES, _TERMINAL_STATUSES):
        for candidate in group:
            if int(status_counts.get(candidate, 0) or 0) > 0:
                return candidate
    return None
|
|
803
|
+
|
|
804
|
+
@staticmethod
|
|
805
|
+
def status_from_run(run: Mapping[str, Any]) -> str | None:
|
|
806
|
+
summary = run.get("execution_summary")
|
|
807
|
+
if isinstance(summary, Mapping) and summary.get("has_ended") is True:
|
|
808
|
+
stats = run.get("execution_status_stats")
|
|
809
|
+
if isinstance(stats, Mapping):
|
|
810
|
+
for status in ("FAILED", "SYSTEM_ERROR", "CANCELLED", "CANCELED"):
|
|
811
|
+
if int(stats.get(status, 0) or 0) > 0:
|
|
812
|
+
return status
|
|
813
|
+
if int(stats.get("SUCCEEDED", 0) or 0) > 0:
|
|
814
|
+
return "SUCCEEDED"
|
|
815
|
+
return "ENDED"
|
|
816
|
+
stats = run.get("execution_status_stats")
|
|
817
|
+
if isinstance(stats, Mapping):
|
|
818
|
+
for status in _ACTIVE_STATUSES:
|
|
819
|
+
if int(stats.get(status, 0) or 0) > 0:
|
|
820
|
+
return status
|
|
821
|
+
for status in _TERMINAL_STATUSES:
|
|
822
|
+
if int(stats.get(status, 0) or 0) > 0:
|
|
823
|
+
return status
|
|
824
|
+
return None
|
|
825
|
+
|
|
826
|
+
@staticmethod
|
|
827
|
+
def _accepts_client_keyword(method: Any) -> bool:
|
|
828
|
+
try:
|
|
829
|
+
parameters = inspect.signature(method).parameters
|
|
830
|
+
except (TypeError, ValueError):
|
|
831
|
+
return False
|
|
832
|
+
return "client" in parameters or any(
|
|
833
|
+
parameter.kind is inspect.Parameter.VAR_KEYWORD
|
|
834
|
+
for parameter in parameters.values()
|
|
835
|
+
)
|
|
836
|
+
|
|
837
|
+
def load_pipeline_for_submit(
    self,
    pipeline_path: str | Path,
    *,
    hydrate: bool = True,
    resolution_overrides: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Load the pipeline at ``pipeline_path`` through the configured hooks.

    With ``hydrate=False`` the ``read_pipeline_yaml`` hook is used. With
    ``hydrate=True`` the ``hydrate_pipeline`` hook is called with
    ``resolution_overrides``, plus ``client`` when the hook's signature
    accepts that keyword.
    """
    if not hydrate:
        return self.hooks.read_pipeline_yaml(pipeline_path)

    hydrator = self.hooks.hydrate_pipeline
    call_kwargs: dict[str, Any] = {"resolution_overrides": resolution_overrides}
    # Only hand over a client to hydrators that are able to receive one.
    if self._accepts_client_keyword(hydrator):
        call_kwargs["client"] = self._get_client()
    return hydrator(pipeline_path, **call_kwargs)
|
|
851
|
+
|
|
852
|
+
@staticmethod
def expand_run_name_template(
    template: str,
    pipeline_spec: dict[str, Any],
    run_args: dict[str, Any] | None = None,
) -> str:
    """Substitute ``${arguments.NAME}`` placeholders in ``template``.

    Values come from the pipeline's default arguments, overridden by
    ``run_args``. A placeholder whose argument is missing or ``None`` is
    left in the output unchanged.
    """

    values = PipelineRunManager.extract_default_arguments(pipeline_spec)
    if run_args:
        values.update(run_args)

    def substitute(found: re.Match[str]) -> str:
        resolved = values.get(found.group(1))
        if resolved is None:
            # Unknown argument: keep the original placeholder text.
            return found.group(0)
        return str(resolved)

    placeholder = re.compile(r"\$\{arguments\.([^}]+)\}")
    return placeholder.sub(substitute, template)
|
|
869
|
+
|
|
870
|
+
def apply_run_name_template(
    self,
    pipeline_spec: dict[str, Any],
    run_args: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Return a spec whose ``name`` is expanded from its run-name template.

    The template is read from ``metadata.annotations["run-name-template"]``.
    When it is absent or empty, ``pipeline_spec`` is returned as-is (same
    object). Otherwise a deep copy is made, the template is expanded with
    :meth:`expand_run_name_template`, passed through the
    ``transform_run_name`` hook, and stored as the copy's ``name``.

    ``metadata`` and ``annotations`` that are missing, null, or not
    mappings are treated as "no template" instead of raising.
    """
    metadata = pipeline_spec.get("metadata")
    # ``metadata:`` may be present but null in YAML, so ``.get(..., {})`` is
    # not enough to guarantee a mapping here.
    annotations = metadata.get("annotations") if isinstance(metadata, Mapping) else None
    template = annotations.get("run-name-template") if isinstance(annotations, Mapping) else None
    if not template:
        return pipeline_spec

    # Work on a copy so the caller's spec keeps its original name.
    transformed = copy.deepcopy(pipeline_spec)
    expanded = self.expand_run_name_template(str(template), transformed, run_args)
    transformed["name"] = self.hooks.transform_run_name(
        expanded,
        pipeline_spec=transformed,
        run_args=run_args,
    )
    return transformed
|
|
887
|
+
|
|
888
|
+
def prepare_pipeline_spec_for_submit(
    self,
    pipeline_spec: dict[str, Any],
    *,
    pipeline_path: str | Path | None = None,
    run_args: dict[str, Any] | None = None,
    hydrate: bool = True,
) -> dict[str, Any]:
    """Delegate spec preparation to the ``prepare_pipeline_spec`` hook.

    All arguments are forwarded unchanged and the hook's result is returned.
    """
    hook_kwargs = {
        "pipeline_path": pipeline_path,
        "run_args": run_args,
        "hydrate": hydrate,
    }
    return self.hooks.prepare_pipeline_spec(pipeline_spec, **hook_kwargs)
|
|
902
|
+
|
|
903
|
+
def prepare_submit_payload_from_spec(
    self,
    pipeline_spec: dict[str, Any],
    *,
    run_args: dict[str, Any] | None = None,
    annotations: dict[str, str] | None = None,
    pipeline_path: str | Path | None = None,
    run_as: str | None = None,
    hydrate: bool = True,
) -> PipelineSubmitPayload:
    """Turn a pipeline spec into a :class:`PipelineSubmitPayload`.

    Steps, in order: prepare the spec via hooks, prepare the run arguments,
    expand the run-name template, convert the spec to a payload and
    sanitize it, then collect hook-provided annotations and let the
    caller's ``annotations`` (stringified) override them.

    The payload's ``pipeline_spec`` is the dict found at
    ``root_task.componentRef.spec`` when present, otherwise the prepared
    spec; ``run_name`` is that spec's ``name`` when it is a non-empty
    string.
    """

    spec = self.prepare_pipeline_spec_for_submit(
        pipeline_spec,
        pipeline_path=pipeline_path,
        run_args=run_args,
        hydrate=hydrate,
    )
    effective_args = self.hooks.prepare_run_arguments(spec, run_args)
    spec = self.apply_run_name_template(spec, effective_args)

    # Convert a copy so payload conversion cannot alter the prepared spec.
    raw_payload = self.convert_yaml_to_payload(copy.deepcopy(spec), effective_args)
    root_task = self.sanitize_submit_payload(raw_payload)["root_task"]

    # Prefer the spec embedded in the root task; fall back to the prepared one.
    submit_spec = spec
    ref = root_task.get("componentRef") if isinstance(root_task, Mapping) else None
    if isinstance(ref, Mapping):
        embedded = ref.get("spec")
        if isinstance(embedded, dict):
            submit_spec = embedded

    merged_annotations = self.hooks.extra_submit_annotations(
        pipeline_spec=spec,
        pipeline_path=pipeline_path,
        run_as=run_as,
    )
    if annotations:
        merged_annotations.update({str(key): str(val) for key, val in annotations.items()})

    name = submit_spec.get("name")
    if not (isinstance(name, str) and name):
        name = None
    return PipelineSubmitPayload(
        prepared_spec=spec,
        pipeline_spec=submit_spec,
        run_args=effective_args,
        root_task=root_task,
        annotations=merged_annotations,
        run_name=name,
    )
|
|
954
|
+
|
|
955
|
+
def build_submit_body_from_spec(
    self,
    pipeline_spec: dict[str, Any],
    *,
    run_args: dict[str, Any] | None = None,
    annotations: dict[str, str] | None = None,
    pipeline_path: str | Path | None = None,
    run_as: str | None = None,
    hydrate: bool = True,
) -> dict[str, Any]:
    """Prepare a submit payload from ``pipeline_spec`` and return its body dict."""

    payload = self.prepare_submit_payload_from_spec(
        pipeline_spec,
        run_args=run_args,
        annotations=annotations,
        pipeline_path=pipeline_path,
        run_as=run_as,
        hydrate=hydrate,
    )
    return payload.to_body()
|
|
975
|
+
|
|
976
|
+
def prepare_submit_payload(
    self,
    pipeline_path: str | Path,
    *,
    run_args: dict[str, Any] | None = None,
    annotations: dict[str, str] | None = None,
    hydrate: bool = True,
    run_as: str | None = None,
    resolution_overrides: dict[str, Any] | None = None,
) -> PipelineSubmitPayload:
    """Load the pipeline at ``pipeline_path`` and prepare its submit payload.

    Loading goes through :meth:`load_pipeline_for_submit`; the loaded spec
    is then handed to :meth:`prepare_submit_payload_from_spec`.
    """
    loaded_spec = self.load_pipeline_for_submit(
        pipeline_path,
        hydrate=hydrate,
        resolution_overrides=resolution_overrides,
    )
    payload = self.prepare_submit_payload_from_spec(
        loaded_spec,
        run_args=run_args,
        annotations=annotations,
        pipeline_path=pipeline_path,
        run_as=run_as,
        hydrate=hydrate,
    )
    return payload
|
|
999
|
+
|
|
1000
|
+
def build_submit_body(
    self,
    pipeline_path: str | Path,
    *,
    run_args: dict[str, Any] | None = None,
    annotations: dict[str, str] | None = None,
    hydrate: bool = True,
    run_as: str | None = None,
    resolution_overrides: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Prepare the submit payload for ``pipeline_path`` and return its body dict."""
    payload = self.prepare_submit_payload(
        pipeline_path,
        run_args=run_args,
        annotations=annotations,
        hydrate=hydrate,
        run_as=run_as,
        resolution_overrides=resolution_overrides,
    )
    return payload.to_body()
|
|
1018
|
+
|
|
1019
|
+
@staticmethod
def response_run_context(
    response: Mapping[str, Any],
    *,
    submit_body: dict[str, Any],
    pipeline_path: str | Path | None = None,
    attempt: int = 1,
) -> PipelineRunContext:
    """Build a :class:`PipelineRunContext` describing a submitted run.

    ``run_id`` and ``root_execution_id`` are the stringified ``id`` and
    ``root_execution_id`` of ``response`` (``None`` when absent). The
    pipeline spec and run name are read from
    ``submit_body["root_task"]["componentRef"]["spec"]`` when that value is
    a dict. ``start_time`` is the current wall-clock time.
    """
    embedded = submit_body.get("root_task", {}).get("componentRef", {}).get("spec")
    spec = embedded if isinstance(embedded, dict) else None
    name = spec.get("name") if spec is not None else None
    if not (isinstance(name, str) and name):
        name = None

    raw_run_id = response.get("id")
    raw_root_id = response.get("root_execution_id")
    return PipelineRunContext(
        run_id=None if raw_run_id is None else str(raw_run_id),
        run_name=name,
        root_execution_id=None if raw_root_id is None else str(raw_root_id),
        pipeline_path=pipeline_path,
        start_time=time.time(),
        attempt=attempt,
        submit_body=submit_body,
        pipeline_spec=spec,
        response=dict(response),
    )
|
|
1044
|
+
|
|
1045
|
+
def submit_prepared_body(
    self,
    body: dict[str, Any],
    *,
    pipeline_path: str | Path | None = None,
    attempt: int = 1,
    context: PipelineRunContext | None = None,
    notify_submit_error: bool = True,
) -> dict[str, Any]:
    """Normalize ``body``, submit it, and keep the run context in sync.

    ``body`` is normalized in place. The context (the supplied one, or a
    new one) is filled with the run name, path, attempt, body and spec
    before the ``before_submit_context`` hook runs. After a successful
    create call it is updated from the response and handed to
    ``after_submit_context``.

    If the create call raises, the ``on_submit_error`` hook is invoked
    (unless ``notify_submit_error`` is false) and the exception is
    re-raised. A non-dict response is replaced by an empty dict.
    """
    self.normalize_submit_body_in_place(body)
    embedded = body["root_task"]["componentRef"]["spec"]
    spec = embedded if isinstance(embedded, dict) else None

    ctx = context or PipelineRunContext(
        pipeline_path=pipeline_path,
        start_time=time.time(),
        attempt=attempt,
    )
    name = spec.get("name") if spec is not None else None
    ctx.run_name = name if isinstance(name, str) and name else None
    ctx.pipeline_path = pipeline_path
    ctx.attempt = attempt
    ctx.submit_body = body
    ctx.pipeline_spec = spec
    self.hooks.before_submit_context(ctx)

    client = self._require_client()
    try:
        response = self.to_plain(client.pipeline_runs_create(body=body))
    except Exception as exc:
        if notify_submit_error:
            self.hooks.on_submit_error(exc, context=ctx)
        raise
    if not isinstance(response, dict):
        response = {}

    submitted = self.response_run_context(
        response,
        submit_body=body,
        pipeline_path=pipeline_path,
        attempt=attempt,
    )
    # Copy the response-derived fields onto the context the hooks already saw.
    for field in ("run_id", "run_name", "root_execution_id", "submit_body", "pipeline_spec"):
        setattr(ctx, field, getattr(submitted, field))
    ctx.response = response
    self.hooks.after_submit_context(ctx)
    return response
|
|
1091
|
+
|
|
1092
|
+
def submit_prepared_payload(
    self,
    payload: PipelineSubmitPayload,
    *,
    pipeline_path: str | Path | None = None,
    attempt: int = 1,
    context: PipelineRunContext | None = None,
) -> dict[str, Any]:
    """Submit ``payload`` and sync it with the body that was actually sent.

    The payload is rendered with ``to_body()``, submitted through
    :meth:`submit_prepared_body`, and then updated from that body via
    ``sync_from_body``. The submit response is returned.
    """
    body = payload.to_body()
    submit_kwargs = {
        "pipeline_path": pipeline_path,
        "attempt": attempt,
        "context": context,
    }
    response = self.submit_prepared_body(body, **submit_kwargs)
    payload.sync_from_body(body)
    return response
|
|
1109
|
+
|
|
1110
|
+
def submit_pipeline_spec(
    self,
    pipeline_spec: dict[str, Any],
    *,
    run_args: dict[str, Any] | None = None,
    annotations: dict[str, str] | None = None,
    pipeline_path: str | Path | None = None,
    run_as: str | None = None,
    hydrate: bool = True,
    attempt: int = 1,
) -> dict[str, Any]:
    """Prepare a payload from ``pipeline_spec`` and submit it.

    Returns the response from :meth:`submit_prepared_payload`.
    """
    prepare_kwargs = {
        "run_args": run_args,
        "annotations": annotations,
        "pipeline_path": pipeline_path,
        "run_as": run_as,
        "hydrate": hydrate,
    }
    payload = self.prepare_submit_payload_from_spec(pipeline_spec, **prepare_kwargs)
    return self.submit_prepared_payload(payload, pipeline_path=pipeline_path, attempt=attempt)
|
|
1130
|
+
|
|
1131
|
+
def submit_pipeline(
    self,
    pipeline_path: str | Path,
    *,
    run_args: dict[str, Any] | None = None,
    annotations: dict[str, str] | None = None,
    hydrate: bool = True,
    run_as: str | None = None,
    resolution_overrides: dict[str, Any] | None = None,
    attempt: int = 1,
) -> dict[str, Any]:
    """Load, prepare and submit the pipeline at ``pipeline_path``.

    Returns the response from :meth:`submit_prepared_payload`.
    """
    prepare_kwargs = {
        "run_args": run_args,
        "annotations": annotations,
        "hydrate": hydrate,
        "run_as": run_as,
        "resolution_overrides": resolution_overrides,
    }
    payload = self.prepare_submit_payload(pipeline_path, **prepare_kwargs)
    return self.submit_prepared_payload(payload, pipeline_path=pipeline_path, attempt=attempt)
|
|
1151
|
+
|
|
1152
|
+
def get_run(self, run_id: str, *, include_execution_stats: bool = True) -> dict[str, Any]:
    """Fetch run ``run_id`` from the client and return it via ``to_plain``."""
    raw_run = self.client.pipeline_runs_get(
        run_id,
        include_execution_stats=include_execution_stats,
    )
    return self.to_plain(raw_run)
|
|
1159
|
+
|
|
1160
|
+
def get_run_details(
    self,
    run_id: str,
    *,
    include_annotations: bool = False,
    include_execution_state: bool = False,
    include_implementations: bool = False,
    execution_id: str | None = None,
) -> dict[str, Any]:
    """Return the details output for ``run_id`` from :class:`PipelineRunDetails`.

    The ``include_*`` flags and ``execution_id`` are forwarded unchanged.
    """
    details = PipelineRunDetails(client=self.client)
    return details.get_run_details_output(
        run_id,
        include_implementations=include_implementations,
        include_annotations=include_annotations,
        include_execution_state=include_execution_state,
        execution_id=execution_id,
    )
|
|
1176
|
+
|
|
1177
|
+
def cancel_run(self, run_id: str) -> dict[str, Any]:
    """Cancel run ``run_id`` and return the plain response.

    When the converted response is falsy, a synthetic
    ``{"id": run_id, "cancelled": True}`` dict is returned instead.
    """
    outcome = self.to_plain(self.client.pipeline_runs_cancel(run_id))
    if outcome:
        return outcome
    return {"id": run_id, "cancelled": True}
|
|
1179
|
+
|
|
1180
|
+
def graph_state(self, execution_id: str) -> Mapping[str, Any] | Any:
    """Fetch the graph execution state for ``execution_id`` and return it via ``to_plain``."""
    return self.to_plain(self.client.executions_graph_execution_state(execution_id))
|
|
1183
|
+
|
|
1184
|
+
def graph_state_output(self, run_ids: list[str], *, timeout: float = 30.0) -> dict[str, Any]:
    """Return the graph-state output for ``run_ids`` from :class:`PipelineRunDetails`."""
    details = PipelineRunDetails(client=self.client)
    return details.get_graph_state_output(run_ids, timeout=timeout)
|
|
1186
|
+
|
|
1187
|
+
def logs(self, execution_id: str) -> dict[str, Any]:
    """Fetch logs for ``execution_id`` through the ``fetch_logs`` hook and return them via ``to_plain``."""
    raw_logs = self.hooks.fetch_logs(self.client, execution_id)
    return self.to_plain(raw_logs)
|
|
1189
|
+
|
|
1190
|
+
def search_runs(
    self,
    *,
    filter: str | None = None,
    filter_query: str | None = None,
    page_token: str | None = None,
    include_pipeline_names: bool | None = None,
    include_execution_stats: bool | None = True,
) -> dict[str, Any]:
    """List pipeline runs through the client and return the result via ``to_plain``.

    All keyword arguments are forwarded unchanged to
    ``client.pipeline_runs_list``.
    """
    listing = self.client.pipeline_runs_list(
        page_token=page_token,
        filter=filter,
        filter_query=filter_query,
        include_pipeline_names=include_pipeline_names,
        include_execution_stats=include_execution_stats,
    )
    return self.to_plain(listing)
|
|
1208
|
+
|
|
1209
|
+
def search_pipeline_runs(
    self,
    *,
    name: str | None = None,
    created_by: str | None = None,
    annotations: dict[str, str | None] | None = None,
    start_date: str | None = None,
    end_date: str | None = None,
    local_time: bool = False,
    query: dict[str, Any] | None = None,
    limit: int = 10,
    page_token: str | None = None,
) -> dict[str, Any]:
    """Search pipeline runs via :class:`PipelineRunSearch`.

    The searcher is built with this manager's client and logger; every
    filter argument is forwarded unchanged to its ``search`` method.
    """
    searcher = PipelineRunSearch(client=self.client, logger=self.logger)
    return searcher.search(
        name=name,
        created_by=created_by,
        annotations=annotations,
        start_date=start_date,
        end_date=end_date,
        local_time=local_time,
        query=query,
        limit=limit,
        page_token=page_token,
    )
|
|
1233
|
+
|
|
1234
|
+
def export_run(
    self,
    run_id: str,
    output: str | Path | None = None,
    *,
    dehydrate: bool = False,
) -> dict[str, Any]:
    """Export the pipeline spec of run ``run_id`` as YAML.

    The spec is taken from ``raw["componentRef"]["spec"]`` of the run's
    task spec when that is a dict, otherwise from ``component_spec.data``.
    With ``dehydrate=True`` the spec is passed through
    :class:`PipelineDehydrator` before being dumped.

    Without ``output`` the spec and its YAML text are returned in the
    result dict. With ``output`` the YAML is written to that path (parent
    directories are created). A sibling ``<stem>.config.yaml`` is also
    written when the run has arguments or ``dehydrate`` is set.

    Raises:
        PipelineRunError: if the run has no pipeline spec, the spec is not
            a non-empty dict, or ``dehydrate`` is requested without
            ``output``.
    """
    task_spec = self.client.get_run_pipeline_spec(run_id)
    if task_spec is None:
        raise PipelineRunError(f"No pipeline spec found for run {run_id}")

    raw = getattr(task_spec, "raw", None)
    spec = None
    if isinstance(raw, Mapping):
        # ``componentRef`` may be present but null; only descend into a
        # real mapping so the ``component_spec`` fallback below still runs.
        component_ref = raw.get("componentRef")
        if isinstance(component_ref, Mapping):
            spec = component_ref.get("spec")
    component_spec = getattr(task_spec, "component_spec", None)
    if not isinstance(spec, dict) and component_spec is not None:
        spec = getattr(component_spec, "data", None)
    if not isinstance(spec, dict) or not spec:
        raise PipelineRunError(f"Pipeline spec for run {run_id} is not exportable")

    if dehydrate and output is None:
        raise PipelineRunError("--dehydrate requires --output")
    if dehydrate:
        spec = PipelineDehydrator(
            remembered_choices={"": DehydrateChoice.AUTO},
            output_file=output,
            client=self.client,
            logger=self.logger,
        ).dehydrate(spec)

    content = dump_yaml(spec)
    if output is None:
        return {"run_id": run_id, "pipeline": spec, "yaml": content, "dehydrated": dehydrate}

    output_path = Path(output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(content, encoding="utf-8")

    result = {"run_id": run_id, "output": str(output_path), "dehydrated": dehydrate}
    # Prefer arguments on the task spec object; fall back to the raw mapping.
    arguments = self.to_plain(getattr(task_spec, "arguments", None) or {})
    if not arguments and isinstance(raw, Mapping):
        arguments = self.to_plain(raw.get("arguments") or {})
    if isinstance(arguments, Mapping) and (arguments or dehydrate):
        config_path = output_path.parent / f"{output_path.stem}.config.yaml"
        config_data: dict[str, Any] = {"pipeline_path": output_path.name}
        if dehydrate:
            config_data["hydrate"] = True
        if arguments:
            config_data["args"] = dict(arguments)
        config_path.write_text(dump_yaml(config_data), encoding="utf-8")
        result["config_path"] = str(config_path)
    return result
|
|
1284
|
+
|
|
1285
|
+
def _update_execution_state_timings(
    self,
    context: PipelineRunContext,
    graph_state: Mapping[str, Any] | Any,
) -> dict[str, dict[str, Any]]:
    """Record, per execution, the observed state and how long it has lasted.

    Timing records are stored in ``context.metadata`` under the two
    execution-state timing keys (wall-clock records and monotonic entry
    times). An execution whose state equals the previously recorded one
    keeps its entry times; otherwise both are reset to "now". When the
    graph state yields no per-execution counts, both metadata entries are
    reset to empty dicts.

    Returns:
        A deep copy of the wall-clock timing records.
    """

    per_execution = self.execution_status_counts_from_graph_state(graph_state)
    if not per_execution:
        context.metadata[_EXECUTION_STATE_TIMINGS_METADATA_KEY] = {}
        context.metadata[_EXECUTION_STATE_TIMING_MONOTONIC_METADATA_KEY] = {}
        return {}

    def as_float(candidate: Any, fallback: float) -> float:
        # Stored metadata may hold non-numeric values; fall back to "now".
        try:
            return float(candidate)
        except (TypeError, ValueError):
            return fallback

    stored_records = context.metadata.get(_EXECUTION_STATE_TIMINGS_METADATA_KEY)
    prior_records = stored_records if isinstance(stored_records, Mapping) else {}
    stored_entries = context.metadata.get(_EXECUTION_STATE_TIMING_MONOTONIC_METADATA_KEY)
    prior_entries = stored_entries if isinstance(stored_entries, Mapping) else {}
    now_wall = time.time()
    now_monotonic = time.monotonic()

    timings: dict[str, dict[str, Any]] = {}
    monotonic_entries: dict[str, float] = {}
    for execution_id, status_counts in per_execution.items():
        state = self.status_from_counts(status_counts) or "UNKNOWN"
        record = prior_records.get(execution_id)
        previous = record if isinstance(record, Mapping) else {}

        entered_wall, entered_monotonic = now_wall, now_monotonic
        if previous.get("state") == state:
            # Same state as last poll: carry the original entry times forward.
            entered_wall = as_float(previous.get("state_entered_at", now_wall), now_wall)
            entered_monotonic = as_float(
                prior_entries.get(execution_id, now_monotonic), now_monotonic
            )

        timings[execution_id] = {
            "state": state,
            "state_entered_at": entered_wall,
            "elapsed_seconds": max(0.0, now_monotonic - entered_monotonic),
            "last_observed_at": now_wall,
        }
        monotonic_entries[execution_id] = entered_monotonic

    context.metadata[_EXECUTION_STATE_TIMINGS_METADATA_KEY] = timings
    context.metadata[_EXECUTION_STATE_TIMING_MONOTONIC_METADATA_KEY] = monotonic_entries
    return copy.deepcopy(timings)
|
|
1336
|
+
|
|
1337
|
+
def _poll_run_status(
    self,
    run_id: str,
    *,
    use_graph_state: bool,
    started_at: float,
    context: PipelineRunContext | None = None,
) -> PipelineWaitPoll:
    """Take one status snapshot of run ``run_id``.

    The run record comes from the ``poll_run_snapshot`` hook, or from
    :meth:`get_run` when the hook returns ``None``. With
    ``use_graph_state`` and a root execution id from the
    ``graph_state_execution_id`` hook, the graph state is fetched; its
    non-empty counts replace the run's own counts and the per-execution
    state timings are refreshed.

    ``terminal`` is true for a terminal or ``"ENDED"`` status. When graph
    state is in use and there is at least one counted execution, it is
    instead true only if every counted execution is in a terminal status.
    ``elapsed_seconds`` is measured on the monotonic clock from
    ``started_at``.
    """
    wait_context = context or PipelineRunContext(run_id=run_id, start_time=time.time())

    snapshot = self.hooks.poll_run_snapshot(self, run_id, wait_context)
    if snapshot is None:
        run = self.get_run(run_id, include_execution_stats=True)
    else:
        run = self.to_plain(snapshot)
    if not isinstance(run, dict):
        run = {}

    graph_state: dict[str, Any] | None = None
    timings: dict[str, dict[str, Any]] = {}
    counts = self.status_counts_from_run(run)
    root_id = self.hooks.graph_state_execution_id(run, wait_context) if use_graph_state else None
    if root_id:
        graph_state = self.graph_state(str(root_id))
        # Graph-derived counts win whenever they are non-empty.
        counts = self.status_counts_from_graph_state(graph_state) or counts
        timings = self._update_execution_state_timings(wait_context, graph_state)

    status = self.status_from_counts(counts) or self.status_from_run(run) or "UNKNOWN"
    terminal = self.is_terminal_status(status) or status == "ENDED"
    total = sum(counts.values())
    if total and use_graph_state:
        finished = sum(counts.get(state, 0) for state in _TERMINAL_STATUSES)
        terminal = finished == total

    return PipelineWaitPoll(
        run_id=run_id,
        run=run,
        status=status,
        status_counts=counts,
        total=total,
        terminal=terminal,
        graph_state=graph_state if isinstance(graph_state, dict) else None,
        elapsed_seconds=time.monotonic() - started_at,
        execution_state_timings=timings,
    )
|
|
1380
|
+
|
|
1381
|
+
def wait_for_completion(
    self,
    run_id: str,
    *,
    max_wait: float | None,
    poll_interval: float,
    use_graph_state: bool = False,
    context: PipelineRunContext | None = None,
    allow_zero_poll_interval: bool = False,
    timeout_clock: str = "monotonic",
    exit_on_first_failure: bool = False,
) -> dict[str, Any]:
    """Poll a run until it is terminal, a hook exits early, or the deadline passes.

    Args:
        run_id: Identifier of the run to poll.
        max_wait: Maximum number of seconds to wait. ``None`` disables the
            deadline; the deadline is also disabled when
            ``hooks.should_enforce_max_wait`` returns a falsy value.
        poll_interval: Seconds to sleep between successful polls.
        use_graph_state: Forwarded unchanged to ``_poll_run_status``.
        context: Run context to reuse; a new one is created when omitted.
        allow_zero_poll_interval: Accept ``poll_interval == 0``.
        timeout_clock: ``"monotonic"`` or ``"wall"``; selects the clock used
            for the deadline only (elapsed time always uses the monotonic clock).
        exit_on_first_failure: When true, stored in ``context.metadata`` under
            ``"exit_on_first_failure"`` before polling starts.

    Returns:
        The dict produced by ``_wait_result`` for the last poll.

    Raises:
        PipelineRunError: If an argument is invalid, or a poll raises once the
            deadline has already passed.
        Exception: The original poll error is re-raised when
            ``hooks.on_poll_error`` returns ``None``.
    """
    wait_context = context or PipelineRunContext(run_id=run_id, start_time=time.time())
    if exit_on_first_failure:
        wait_context.metadata["exit_on_first_failure"] = True
    if max_wait is not None and max_wait < 0:
        raise PipelineRunError("--max-wait must be non-negative")
    if poll_interval < 0 or (poll_interval == 0 and not allow_zero_poll_interval):
        raise PipelineRunError("--poll-interval must be positive")
    if timeout_clock not in {"monotonic", "wall"}:
        raise PipelineRunError("timeout_clock must be 'monotonic' or 'wall'")
    enforce_max_wait = max_wait is not None and self.hooks.should_enforce_max_wait(wait_context)
    poll_started_at = time.monotonic()
    deadline_now: Callable[[], float] = time.time if timeout_clock == "wall" else time.monotonic
    deadline_started_at = deadline_now()
    deadline = deadline_started_at + max_wait if enforce_max_wait else None
    self.hooks.before_wait(wait_context)
    last_poll: PipelineWaitPoll | None = None
    while True:
        try:
            poll = self._poll_run_status(
                run_id,
                use_graph_state=use_graph_state,
                started_at=poll_started_at,
                context=wait_context,
            )
        except KeyboardInterrupt:
            raise
        except Exception as exc:
            # A failed poll after the deadline is reported as a timeout.
            if deadline is not None and deadline_now() >= deadline:
                raise PipelineRunError(f"Timed out waiting for run {run_id}") from exc
            # The hook decides whether to retry: ``None`` means give up.
            retry_interval = self.hooks.on_poll_error(exc, wait_context)
            if retry_interval is None:
                raise
            if deadline is not None:
                remaining = deadline - deadline_now()
                if remaining <= 0:
                    raise PipelineRunError(f"Timed out waiting for run {run_id}") from exc
                # Never sleep past the deadline.
                retry_interval = min(retry_interval, remaining)
            time.sleep(max(0.0, retry_interval))
            continue
        last_poll = poll
        self.hooks.after_poll(poll, wait_context)
        if poll.terminal:
            wait_context.metadata["wait_result"] = self._wait_metadata(poll)
            self.hooks.on_terminal(poll, wait_context)
            result = self._wait_result(poll, timed_out=False)
            self._record_wait_outcome(wait_context, poll, result)
            self.hooks.after_wait_context(result, wait_context)
            return result
        if self.hooks.should_exit_early(poll, wait_context):
            wait_context.metadata["wait_result"] = self._wait_metadata(poll, early_exit=True)
            self.hooks.on_early_exit_before_release(poll, wait_context)
            result = self._wait_result(poll, timed_out=False, early_exit=True)
            self._record_wait_outcome(wait_context, poll, result)
            self.hooks.after_wait_context(result, wait_context)
            return result
        if deadline is not None and deadline_now() >= deadline:
            wait_context.metadata["wait_result"] = self._wait_metadata(poll, timed_out=True)
            self.hooks.on_timeout(poll, wait_context)
            result = self._wait_result(poll, timed_out=True)
            self._record_wait_outcome(wait_context, poll, result)
            self.hooks.after_wait_context(result, wait_context)
            return result
        if deadline is None:
            sleep_for = poll_interval
        else:
            # Cap the sleep so the next poll happens no later than the deadline.
            sleep_for = min(poll_interval, max(0.0, deadline - deadline_now()))
        time.sleep(sleep_for)
    if last_poll is None:  # pragma: no cover - defensive, loop always polls first
        raise PipelineRunError(f"No status returned for run {run_id}")
|
|
1462
|
+
|
|
1463
|
+
@staticmethod
def _wait_metadata(
    poll: PipelineWaitPoll,
    *,
    timed_out: bool = False,
    early_exit: bool = False,
) -> dict[str, Any]:
    """Summarize a poll as a plain metadata dict.

    The dict always carries a copy of ``poll.status_counts``, the ``FAILED``
    and ``SYSTEM_ERROR`` counts (missing or falsy counts become ``0``) and
    ``poll.elapsed_seconds``. ``"timed_out"`` / ``"early_exit"`` keys are only
    added, with value ``True``, when the matching flag is set.
    """
    failed_count = int(poll.status_counts.get("FAILED", 0) or 0)
    error_count = int(poll.status_counts.get("SYSTEM_ERROR", 0) or 0)
    metadata: dict[str, Any] = {
        "status_counts": dict(poll.status_counts),
        "failed_count": failed_count,
        "error_count": error_count,
        "elapsed_seconds": poll.elapsed_seconds,
    }
    if timed_out:
        metadata["timed_out"] = True
    if early_exit:
        metadata["early_exit"] = True
    return metadata
|
|
1483
|
+
|
|
1484
|
+
def _record_wait_outcome(
    self,
    context: PipelineRunContext,
    poll: PipelineWaitPoll,
    result: Mapping[str, Any],
) -> None:
    """Store the value returned by ``hooks.wait_outcome`` on ``context.wait_outcome``."""
    context.wait_outcome = self.hooks.wait_outcome(poll, result, context)
|
|
1491
|
+
|
|
1492
|
+
@staticmethod
def _wait_result(
    poll: PipelineWaitPoll,
    *,
    timed_out: bool,
    early_exit: bool = False,
) -> dict[str, Any]:
    """Build the dict returned to callers of ``wait_for_completion``.

    The result always contains ``"run"``, ``"status"`` and ``"timed_out"``.
    When the wait ended early or timed out, the fields from
    ``_wait_metadata`` are merged in as well.
    """
    result: dict[str, Any] = {
        "run": poll.run,
        "status": poll.status,
        "timed_out": timed_out,
    }
    if early_exit or timed_out:
        result.update(PipelineRunManager._wait_metadata(poll, timed_out=timed_out, early_exit=early_exit))
    if early_exit:
        result["early_exit"] = True
    return result
|
|
1509
|
+
|
|
1510
|
+
@staticmethod
def _ensure_submission_id_annotation(body: dict[str, Any]) -> str:
    """Guarantee ``body["annotations"]`` carries a submission id and return it.

    ``body`` is mutated in place. An existing truthy submission id is kept
    (normalized to ``str``); otherwise a new ``uuid4`` hex string is stored.

    Args:
        body: Submit body whose ``"annotations"`` entry is created or updated.

    Returns:
        The submission id now stored in the annotations.
    """
    annotations = body.setdefault("annotations", {})
    if not isinstance(annotations, dict):
        # Keep existing entries when annotations is some other mapping type
        # instead of silently discarding them; anything else is replaced.
        annotations = dict(annotations) if isinstance(annotations, Mapping) else {}
        body["annotations"] = annotations
    submission_id = annotations.get(_SUBMISSION_ID_ANNOTATION_KEY)
    if submission_id:
        annotations[_SUBMISSION_ID_ANNOTATION_KEY] = str(submission_id)
        return str(submission_id)
    submission_id = uuid.uuid4().hex
    annotations[_SUBMISSION_ID_ANNOTATION_KEY] = submission_id
    return submission_id
|
|
1523
|
+
|
|
1524
|
+
@staticmethod
def _submission_id_from_body(body: Mapping[str, Any]) -> str | None:
    """Return the submission id stored in ``body["annotations"]``, if any.

    Returns ``None`` when the annotations entry is not a mapping or the
    submission id is missing or falsy; otherwise the id converted to ``str``.
    """
    annotations = body.get("annotations")
    if not isinstance(annotations, Mapping):
        return None
    submission_id = annotations.get(_SUBMISSION_ID_ANNOTATION_KEY)
    return str(submission_id) if submission_id else None
|
|
1531
|
+
|
|
1532
|
+
def _submitted_runs_for_submission_id(self, submission_id: str) -> list[dict[str, Any]]:
    """List pipeline runs returned for a submission-id filter query.

    The filter is a single ``"and"`` clause built with
    ``PipelineRunSearch.build_value_equals`` and sent as compact JSON.

    Returns:
        A list of plain-dict copies of the entries under ``"pipeline_runs"``;
        an empty list when the response is not a mapping or that entry is
        not a list. Non-mapping entries are dropped.
    """
    query = {
        "and": [
            PipelineRunSearch.build_value_equals(
                key=_SUBMISSION_ID_ANNOTATION_KEY,
                value=submission_id,
            )
        ]
    }
    response = self._require_client().pipeline_runs_list(
        filter_query=json.dumps(query, separators=(",", ":")),
        include_pipeline_names=True,
    )
    plain = self.to_plain(response)
    if not isinstance(plain, Mapping):
        return []
    runs = plain.get("pipeline_runs")
    if not isinstance(runs, list):
        return []
    return [dict(run) for run in runs if isinstance(run, Mapping)]
|
|
1552
|
+
|
|
1553
|
+
def _recover_submitted_run_after_submit_error(
    self,
    *,
    submission_id: str | None,
) -> dict[str, Any] | None:
    """Look for a run that a failed submit call may have created anyway.

    Performs up to ``_SUBMIT_RECOVERY_LOOKUP_ATTEMPTS`` lookups, sleeping
    ``_SUBMIT_RECOVERY_LOOKUP_DELAY_SECONDS`` between lookups that match nothing.

    Args:
        submission_id: Submission id to search for; a falsy value skips the
            lookup entirely.

    Returns:
        The single matching run, or ``None`` when there is no submission id,
        a lookup raises, or no run is found after all attempts.

    Raises:
        AmbiguousPipelineRunRecoveryError: If a lookup matches more than one run.
    """
    if not submission_id:
        return None
    for lookup_attempt in range(1, _SUBMIT_RECOVERY_LOOKUP_ATTEMPTS + 1):
        self.logger.info(
            "Checking whether failed submit already created a pipeline run "
            f"({_SUBMISSION_ID_ANNOTATION_KEY}={submission_id}, "
            f"lookup_attempt={lookup_attempt}/{_SUBMIT_RECOVERY_LOOKUP_ATTEMPTS})"
        )
        try:
            matches = self._submitted_runs_for_submission_id(submission_id)
        except Exception as exc:
            # Best effort: a broken lookup must not block the resubmit path.
            self.logger.warn(
                "Submit recovery lookup failed "
                f"({_SUBMISSION_ID_ANNOTATION_KEY}={submission_id}): {exc}. "
                "Falling back to resubmitting the same frozen body."
            )
            return None
        self.logger.info(
            "Submit recovery lookup matched "
            f"{len(matches)} run(s) for {_SUBMISSION_ID_ANNOTATION_KEY}={submission_id}"
        )
        if len(matches) == 1:
            run = matches[0]
            run_id = run.get("id")
            root_execution_id = run.get("root_execution_id")
            self.logger.info(
                "Recovered existing pipeline run "
                f"run_id={run_id}, root_execution_id={root_execution_id}, "
                f"{_SUBMISSION_ID_ANNOTATION_KEY}={submission_id}; adopting instead of resubmitting."
            )
            return run
        if len(matches) > 1:
            run_ids = [str(run.get("id")) for run in matches if run.get("id") is not None]
            self.logger.warn(
                "Submit recovery lookup was ambiguous "
                f"({_SUBMISSION_ID_ANNOTATION_KEY}={submission_id}, matched_run_ids={run_ids}). "
                "Refusing to submit a duplicate."
            )
            raise AmbiguousPipelineRunRecoveryError(
                "Found multiple pipeline runs for failed submit recovery "
                f"{_SUBMISSION_ID_ANNOTATION_KEY}={submission_id}: {', '.join(run_ids) or matches!r}. "
                "Refusing to submit a duplicate."
            )
        # Nothing matched yet: wait before the next lookup, but not after the last.
        if lookup_attempt < _SUBMIT_RECOVERY_LOOKUP_ATTEMPTS:
            time.sleep(_SUBMIT_RECOVERY_LOOKUP_DELAY_SECONDS)
    self.logger.warn(
        "No existing pipeline run found after submit failure "
        f"({_SUBMISSION_ID_ANNOTATION_KEY}={submission_id}); "
        "resubmitting the same frozen body with preserved inputs."
    )
    return None
|
|
1609
|
+
|
|
1610
|
+
def _adopt_submitted_run(
    self,
    *,
    response: Mapping[str, Any],
    body: dict[str, Any],
    pipeline_path: str | Path | None,
    attempt: int,
    context: PipelineRunContext,
) -> dict[str, Any]:
    """Treat a recovered run as the result of the current submit attempt.

    Copies the run id, run name, root execution id, submit body and pipeline
    spec derived by ``response_run_context`` onto ``context``, marks the
    context with ``metadata["recovered_after_submit_error"] = True`` and
    fires ``hooks.after_submit_context``.

    Returns:
        A plain ``dict`` copy of ``response``.
    """
    response_dict = dict(response)
    submitted_context = self.response_run_context(
        response_dict,
        submit_body=body,
        pipeline_path=pipeline_path,
        attempt=attempt,
    )
    context.run_id = submitted_context.run_id
    context.run_name = submitted_context.run_name
    context.root_execution_id = submitted_context.root_execution_id
    context.submit_body = submitted_context.submit_body
    context.pipeline_spec = submitted_context.pipeline_spec
    context.response = response_dict
    context.metadata["recovered_after_submit_error"] = True
    self.hooks.after_submit_context(context)
    return response_dict
|
|
1635
|
+
|
|
1636
|
+
def _run_body_factory(
    self,
    body_factory: Callable[[int, PipelineRunContext | None, Exception | None], dict[str, Any]],
    *,
    pipeline_path: str | Path | None = None,
    wait: bool = False,
    max_wait: float | None = 600.0,
    poll_interval: float = 10.0,
    use_graph_state: bool = False,
    max_attempts: int = 1,
    allow_zero_poll_interval: bool = False,
    timeout_clock: str = "monotonic",
    exit_on_first_failure: bool = False,
    metadata: dict[str, Any] | None = None,
    metadata_factory: Callable[
        [int, PipelineRunContext | None, Exception | None], dict[str, Any]
    ] | None = None,
) -> dict[str, Any]:
    """Drive submit/wait/retry for already prepared specs or submit bodies.

    Args:
        body_factory: Called as ``(attempt, previous_context, last_error)`` to
            build the submit body. It is skipped when the previous attempt
            failed before a run id was known; the previous body is reused then.
        pipeline_path: Stored on each attempt's context and forwarded to submit.
        wait: When true and a run id is known, wait for completion.
        max_wait, poll_interval, use_graph_state, allow_zero_poll_interval,
            timeout_clock, exit_on_first_failure: Forwarded to
            ``wait_for_completion``.
        max_attempts: Total number of attempts, at least 1.
        metadata: Copied into every attempt's ``context.metadata``.
        metadata_factory: Optional per-attempt metadata merged on top.

    Returns:
        A dict with ``"response"``, ``"context"``, ``"attempts"`` and, when a
        wait happened, ``"wait"``.

    Raises:
        PipelineRunError: If ``max_attempts`` is below 1.
        AmbiguousPipelineRunRecoveryError: Re-raised without retrying.
        Exception: The error of the final attempt is re-raised.
    """

    if max_attempts < 1:
        raise PipelineRunError("max_attempts must be at least 1")
    last_error: Exception | None = None
    previous_context: PipelineRunContext | None = None
    attempts: list[PipelineRunContext] = []
    for attempt in range(1, max_attempts + 1):
        context = PipelineRunContext(
            pipeline_path=pipeline_path,
            start_time=time.time(),
            attempt=attempt,
            previous_context=previous_context,
            previous_error=last_error,
            metadata=dict(metadata or {}),
        )
        lifecycle_started = False
        success = False
        error: Exception | None = None
        retry_requested = False
        reused_after_submit_failure = (
            previous_context is not None
            and previous_context.run_id is None
            and previous_context.submit_body is not None
        )
        if reused_after_submit_failure:
            # The previous attempt failed while submitting, before the API
            # returned a run id. Retry the exact same submit body instead
            # of rebuilding it: body construction can intentionally inject
            # dynamic inputs (for example a scheduler creation timestamp),
            # and changing those inputs on an ambiguous submit timeout can
            # defeat cache reuse or double-run the logical pipeline.
            body = copy.deepcopy(previous_context.submit_body)
            self.logger.info(
                "Retrying submit after submit exception with the same frozen body "
                f"({_SUBMISSION_ID_ANNOTATION_KEY}={self._submission_id_from_body(body)}); "
                "dynamic inputs are preserved."
            )
        else:
            if previous_context is not None:
                self.logger.info(
                    "Retrying after pipeline failure; rebuilding submit body so dynamic run arguments "
                    "can follow hook policy (for example update-vs-fixed time input)."
                )
            body = body_factory(attempt, previous_context, last_error)
            self.normalize_submit_body_in_place(body)
        submission_id = self._ensure_submission_id_annotation(body)
        context.metadata["submission_id"] = submission_id
        if metadata_factory is not None:
            context.metadata.update(metadata_factory(attempt, previous_context, last_error))
        pipeline_spec = body.get("root_task", {}).get("componentRef", {}).get("spec")
        context.submit_body = body
        context.pipeline_spec = pipeline_spec if isinstance(pipeline_spec, dict) else None
        if context.pipeline_spec is not None:
            spec_name = context.pipeline_spec.get("name")
            if isinstance(spec_name, str) and spec_name:
                context.run_name = spec_name
        self.hooks.before_run_lifecycle(context)
        lifecycle_started = True
        attempts.append(context)
        # ``previous_context`` tracks the previous attempt, not only the
        # previous successfully submitted run. Resource-carry hooks need to
        # hand off mutexes/leases even when an attempt fails during submit
        # before a run id is available.
        previous_context = context
        try:
            with self.hooks.around_run(context):
                try:
                    recovered_response = None
                    if reused_after_submit_failure:
                        recovered_response = self._recover_submitted_run_after_submit_error(
                            submission_id=self._submission_id_from_body(body),
                        )
                    if recovered_response is not None:
                        response = self._adopt_submitted_run(
                            response=recovered_response,
                            body=body,
                            pipeline_path=pipeline_path,
                            attempt=attempt,
                            context=context,
                        )
                    else:
                        try:
                            response = self.submit_prepared_body(
                                body,
                                pipeline_path=pipeline_path,
                                attempt=attempt,
                                context=context,
                                notify_submit_error=False,
                            )
                        except Exception as submit_exc:
                            # A run id on the context means the failure came
                            # after submission; recovery does not apply.
                            if context.run_id is not None:
                                raise
                            submission_id_for_recovery = self._submission_id_from_body(body)
                            self.logger.warn(
                                "Submit failed before a run id was returned "
                                f"({_SUBMISSION_ID_ANNOTATION_KEY}={submission_id_for_recovery}): "
                                f"{submit_exc}. Checking whether the run was actually created."
                            )
                            recovered_response = self._recover_submitted_run_after_submit_error(
                                submission_id=submission_id_for_recovery,
                            )
                            if recovered_response is None:
                                self.hooks.on_submit_error(submit_exc, context=context)
                                raise
                            response = self._adopt_submitted_run(
                                response=recovered_response,
                                body=body,
                                pipeline_path=pipeline_path,
                                attempt=attempt,
                                context=context,
                            )
                    # NOTE(review): reconstructed as running after every
                    # successful submit or adoption on a retry; confirm
                    # against the upstream file that the second call was not
                    # meant to be limited to the recovery path.
                    if attempt > 1:
                        self.hooks.after_retry_submit(context)
                    result: dict[str, Any]
                    if wait and context.run_id:
                        wait_result = self.wait_for_completion(
                            context.run_id,
                            max_wait=max_wait,
                            poll_interval=poll_interval,
                            use_graph_state=use_graph_state,
                            context=context,
                            allow_zero_poll_interval=allow_zero_poll_interval,
                            timeout_clock=timeout_clock,
                            exit_on_first_failure=exit_on_first_failure,
                        )
                        result = {"response": response, "wait": wait_result}
                    else:
                        result = {"response": response}
                    result["context"] = context
                    result["attempts"] = attempts
                    success = True
                    return result
                except Exception as exc:
                    # Handled inside ``around_run`` so these hooks fire
                    # before that context manager exits.
                    error = exc
                    last_error = exc
                    if isinstance(exc, AmbiguousPipelineRunRecoveryError):
                        self.hooks.on_fail_fast_before_release(context, exc)
                        raise
                    if (
                        context.run_id
                        and attempt < max_attempts
                        and self.hooks.should_cancel_previous_run(
                            context,
                            exc,
                            next_attempt=attempt + 1,
                        )
                    ):
                        self.cancel_run(context.run_id)
                    if attempt >= max_attempts:
                        self.hooks.on_fail_fast_before_release(context, exc)
                        raise
                    retry_context = context if context.run_id else previous_context or context
                    self.hooks.before_retry(retry_context, exc, next_attempt=attempt + 1)
                    retry_requested = True
        finally:
            if lifecycle_started:
                self.hooks.after_run_lifecycle(context, success=success, error=error)
        if retry_requested:
            continue
    if last_error is not None:  # pragma: no cover - defensive
        raise last_error
    raise PipelineRunError("Pipeline run did not start")  # pragma: no cover
|
|
1819
|
+
|
|
1820
|
+
def run_prepared_body(
    self,
    body: dict[str, Any],
    *,
    pipeline_path: str | Path | None = None,
    wait: bool = False,
    max_wait: float | None = 600.0,
    poll_interval: float = 10.0,
    use_graph_state: bool = False,
    max_attempts: int = 1,
    retry_body_factory: Callable[
        [int, PipelineRunContext | None, Exception | None], dict[str, Any]
    ] | None = None,
    allow_zero_poll_interval: bool = False,
    timeout_clock: str = "monotonic",
    exit_on_first_failure: bool = False,
    metadata: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Submit/wait/retry an already prepared submit body.

    ``retry_body_factory`` lets downstreams refresh retry bodies while still
    keeping hydration/layout/validation outside the generic lifecycle.

    Args:
        body: Prepared submit body. Each attempt works on a deep copy, so the
            caller's dict is not mutated by this method.
        retry_body_factory: Optional builder used for attempts after the
            first, called as ``(attempt, previous_context, error)``.

    All remaining keyword arguments are forwarded to ``_run_body_factory``.

    Returns:
        Whatever ``_run_body_factory`` returns.
    """

    def body_factory(
        attempt: int,
        previous_context: PipelineRunContext | None,
        error: Exception | None,
    ) -> dict[str, Any]:
        if attempt > 1 and retry_body_factory is not None:
            return retry_body_factory(attempt, previous_context, error)
        return copy.deepcopy(body)

    return self._run_body_factory(
        body_factory,
        pipeline_path=pipeline_path,
        wait=wait,
        max_wait=max_wait,
        poll_interval=poll_interval,
        use_graph_state=use_graph_state,
        max_attempts=max_attempts,
        allow_zero_poll_interval=allow_zero_poll_interval,
        timeout_clock=timeout_clock,
        exit_on_first_failure=exit_on_first_failure,
        metadata=metadata,
    )
|
|
1866
|
+
|
|
1867
|
+
def run_pipeline_spec(
    self,
    pipeline_spec: dict[str, Any],
    *,
    run_args: dict[str, Any] | None = None,
    annotations: dict[str, str] | None = None,
    pipeline_path: str | Path | None = None,
    run_as: str | None = None,
    hydrate: bool = True,
    wait: bool = False,
    max_wait: float | None = 600.0,
    poll_interval: float = 10.0,
    use_graph_state: bool = False,
    max_attempts: int = 1,
    allow_zero_poll_interval: bool = False,
    timeout_clock: str = "monotonic",
    exit_on_first_failure: bool = False,
    metadata: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Submit/wait/retry an already hydrated/validated in-memory spec.

    Every attempt that builds a body passes a deep copy of ``pipeline_spec``
    to ``prepare_submit_payload_from_spec`` together with ``run_args``,
    ``annotations``, ``pipeline_path``, ``run_as`` and ``hydrate``. The
    remaining keyword arguments are forwarded to ``_run_body_factory``.

    Returns:
        Whatever ``_run_body_factory`` returns.
    """

    def body_factory(
        _attempt: int,
        _previous_context: PipelineRunContext | None,
        _error: Exception | None,
    ) -> dict[str, Any]:
        return self.prepare_submit_payload_from_spec(
            copy.deepcopy(pipeline_spec),
            run_args=run_args,
            annotations=annotations,
            pipeline_path=pipeline_path,
            run_as=run_as,
            hydrate=hydrate,
        ).to_body()

    return self._run_body_factory(
        body_factory,
        pipeline_path=pipeline_path,
        wait=wait,
        max_wait=max_wait,
        poll_interval=poll_interval,
        use_graph_state=use_graph_state,
        max_attempts=max_attempts,
        allow_zero_poll_interval=allow_zero_poll_interval,
        timeout_clock=timeout_clock,
        exit_on_first_failure=exit_on_first_failure,
        metadata=metadata,
    )
|
|
1915
|
+
|
|
1916
|
+
def run_pipeline(
    self,
    pipeline_path: str | Path,
    *,
    run_args: dict[str, Any] | None = None,
    annotations: dict[str, str] | None = None,
    hydrate: bool = True,
    run_as: str | None = None,
    resolution_overrides: dict[str, Any] | None = None,
    wait: bool = False,
    max_wait: float | None = 600.0,
    poll_interval: float = 10.0,
    use_graph_state: bool = False,
    max_attempts: int = 1,
    allow_zero_poll_interval: bool = False,
    timeout_clock: str = "monotonic",
    exit_on_first_failure: bool = False,
    metadata: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Submit (and optionally wait for) a pipeline with lifecycle hooks.

    Unlike ``run_pipeline_spec``, path-based runs intentionally rebuild the
    submit body on every retry so read/hydrate/resolution hooks are
    re-invoked for each attempt.

    The body is built by ``prepare_submit_payload`` from ``pipeline_path``,
    ``run_args``, ``annotations``, ``hydrate``, ``run_as`` and
    ``resolution_overrides``; the remaining keyword arguments are forwarded
    to ``_run_body_factory``.

    Returns:
        Whatever ``_run_body_factory`` returns.
    """

    def body_factory(
        _attempt: int,
        _previous_context: PipelineRunContext | None,
        _error: Exception | None,
    ) -> dict[str, Any]:
        return self.prepare_submit_payload(
            pipeline_path,
            run_args=run_args,
            annotations=annotations,
            hydrate=hydrate,
            run_as=run_as,
            resolution_overrides=resolution_overrides,
        ).to_body()

    return self._run_body_factory(
        body_factory,
        pipeline_path=pipeline_path,
        wait=wait,
        max_wait=max_wait,
        poll_interval=poll_interval,
        use_graph_state=use_graph_state,
        max_attempts=max_attempts,
        allow_zero_poll_interval=allow_zero_poll_interval,
        timeout_clock=timeout_clock,
        exit_on_first_failure=exit_on_first_failure,
        metadata=metadata,
    )
|
|
1969
|
+
|
|
1970
|
+
|
|
1971
|
+
def parse_key_value_entries(entries: list[str] | None) -> dict[str, str]:
    """Parse ``KEY=VALUE`` strings into a dict.

    Each entry is split on the first ``=`` only, so values may contain ``=``
    and may be empty. Later entries overwrite earlier ones with the same key.

    Args:
        entries: Raw ``KEY=VALUE`` strings; ``None`` is treated as empty.

    Returns:
        Mapping of keys to values.

    Raises:
        PipelineRunError: If an entry has no ``=`` or an empty key.
    """
    parsed: dict[str, str] = {}
    for entry in entries or []:
        key, separator, value = entry.partition("=")
        if not separator or not key:
            raise PipelineRunError("Expected KEY=VALUE")
        parsed[key] = value
    return parsed
|
|
1981
|
+
|
|
1982
|
+
|
|
1983
|
+
def parse_json_or_key_values(
    text: str | Mapping[str, Any] | None,
    entries: list[str] | None = None,
) -> dict[str, Any]:
    """Merge a JSON object (or mapping) with ``KEY=VALUE`` entries.

    Args:
        text: JSON text decoding to an object, an existing mapping, or a
            falsy value to skip this source.
        entries: ``KEY=VALUE`` strings; they are applied last and therefore
            override keys coming from ``text``.

    Returns:
        The merged dict.

    Raises:
        PipelineRunError: If the decoded JSON is not an object, or an entry
            is not a valid ``KEY=VALUE`` pair.
        json.JSONDecodeError: If ``text`` is a string that is not valid JSON.
    """
    result: dict[str, Any] = {}
    if text:
        loaded = dict(text) if isinstance(text, Mapping) else json.loads(text)
        if not isinstance(loaded, dict):
            raise PipelineRunError("JSON value must be an object")
        result.update(loaded)
    result.update(parse_key_value_entries(entries))
    return result
|