tangle-cli 0.0.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. tangle_cli/__init__.py +19 -0
  2. tangle_cli/api_cli.py +787 -0
  3. tangle_cli/api_schema.py +633 -0
  4. tangle_cli/api_transport.py +461 -0
  5. tangle_cli/args_container.py +244 -0
  6. tangle_cli/artifacts.py +293 -0
  7. tangle_cli/artifacts_cli.py +108 -0
  8. tangle_cli/cli.py +57 -0
  9. tangle_cli/cli_helpers.py +116 -0
  10. tangle_cli/cli_options.py +52 -0
  11. tangle_cli/client.py +677 -0
  12. tangle_cli/component_from_func.py +1856 -0
  13. tangle_cli/component_generator.py +298 -0
  14. tangle_cli/component_inspector.py +494 -0
  15. tangle_cli/component_publisher.py +921 -0
  16. tangle_cli/components_cli.py +269 -0
  17. tangle_cli/dynamic_discovery_client.py +296 -0
  18. tangle_cli/generated_model_extensions.py +405 -0
  19. tangle_cli/generated_runtime.py +43 -0
  20. tangle_cli/handler.py +96 -0
  21. tangle_cli/hydration_trust.py +222 -0
  22. tangle_cli/logger.py +166 -0
  23. tangle_cli/models.py +407 -0
  24. tangle_cli/module_bundler.py +662 -0
  25. tangle_cli/openapi/__init__.py +0 -0
  26. tangle_cli/openapi/codegen.py +1090 -0
  27. tangle_cli/openapi/parser.py +77 -0
  28. tangle_cli/pipeline_dehydrator.py +720 -0
  29. tangle_cli/pipeline_hydrator.py +1785 -0
  30. tangle_cli/pipeline_run_annotations.py +41 -0
  31. tangle_cli/pipeline_run_details.py +203 -0
  32. tangle_cli/pipeline_run_manager.py +1994 -0
  33. tangle_cli/pipeline_run_search.py +712 -0
  34. tangle_cli/pipeline_runner.py +620 -0
  35. tangle_cli/pipeline_runs_cli.py +584 -0
  36. tangle_cli/pipelines.py +581 -0
  37. tangle_cli/pipelines_cli.py +271 -0
  38. tangle_cli/published_components_cli.py +373 -0
  39. tangle_cli/py.typed +0 -0
  40. tangle_cli/quickstart.py +110 -0
  41. tangle_cli/secrets.py +156 -0
  42. tangle_cli/secrets_cli.py +269 -0
  43. tangle_cli/utils.py +942 -0
  44. tangle_cli/version_manager.py +470 -0
  45. tangle_cli-0.0.1a1.dist-info/METADATA +561 -0
  46. tangle_cli-0.0.1a1.dist-info/RECORD +48 -0
  47. tangle_cli-0.0.1a1.dist-info/WHEEL +4 -0
  48. tangle_cli-0.0.1a1.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,1994 @@
1
+ """Generic pipeline-run helpers for `tangle sdk pipeline-runs`.
2
+
3
+ This module ports the OSS-safe parts of tangle-deploy's runner/run details
4
+ commands while keeping downstream-specific behavior behind hooks. The default
5
+ implementation uses only the public Tangle API and local files; cloud storage,
6
+ notifications, scheduler, mutex, run-as annotation defaults, and alternate log
7
+ backends are intentionally extension points rather than OSS behavior.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import copy
13
+ import inspect
14
+ import json
15
+ import re
16
+ import time
17
+ import uuid
18
+ from collections.abc import Callable
19
+ from contextlib import AbstractContextManager, nullcontext
20
+ from dataclasses import dataclass, field
21
+ from pathlib import Path
22
+ from typing import Any, Mapping
23
+
24
+ import yaml
25
+
26
+ from .handler import TangleCliHandler
27
+ from .logger import Logger, get_default_logger
28
+ from .pipeline_dehydrator import DehydrateChoice, PipelineDehydrator
29
+ from .pipeline_hydrator import HydrationError, PipelineHydrator
30
+ from .pipeline_run_details import PipelineRunDetails
31
+ from .pipeline_run_search import PipelineRunSearch
32
+ from .utils import dump_yaml
33
+
34
# Statuses treated as final by the helpers in this module. Both the British
# and American spellings of "cancelled" are accepted.
_TERMINAL_STATUSES = (
    "FAILED",
    "SYSTEM_ERROR",
    "CANCELLED",
    "CANCELED",
    "SKIPPED",
    "SUCCEEDED",
    "INVALID",
)

# Statuses treated as still in progress; checked before the terminal ones
# when a single status is derived from a set of counts.
_ACTIVE_STATUSES = (
    "RUNNING",
    "CANCELLING",
    "CANCELING",
    "PENDING",
    "QUEUED",
)

# Statuses that trigger the opt-in fail-fast early exit of the wait loop.
_FAILURE_EARLY_EXIT_STATUSES = (
    "FAILED",
    "SYSTEM_ERROR",
)

# Metadata keys; presumably used for execution-state timing bookkeeping
# elsewhere in this module (usage not visible here).
_EXECUTION_STATE_TIMINGS_METADATA_KEY = "execution_state_timings"
_EXECUTION_STATE_TIMING_MONOTONIC_METADATA_KEY = "_execution_state_timing_monotonic"

# Annotation key carrying the submission id (see
# AmbiguousPipelineRunRecoveryError for the recovery failure mode).
_SUBMISSION_ID_ANNOTATION_KEY = "tangle-cli/submission-id"

# Submit-recovery lookup tuning: number of attempts and delay between them.
_SUBMIT_RECOVERY_LOOKUP_ATTEMPTS = 2
_SUBMIT_RECOVERY_LOOKUP_DELAY_SECONDS = 0.1
42
+
43
+
44
class PipelineRunError(RuntimeError):
    """Base error for pipeline-run operations that cannot be completed."""


class UnsupportedPipelineRunFeatureError(PipelineRunError):
    """Signals a TD (downstream) extension point with no OSS default behavior."""


class AmbiguousPipelineRunRecoveryError(PipelineRunError):
    """Signals that submit recovery matched several runs for one submission id."""
54
+
55
+
56
@dataclass
class PipelineSubmitPayload:
    """Submit payload state assembled before ``pipeline_runs_create`` is called.

    Holding the pieces separately keeps the submit pipeline explicit:
    downstream hooks may adjust the spec, runtime arguments, run name, and
    annotations, while callers still obtain one canonical body shape from
    :meth:`to_body`.
    """

    prepared_spec: dict[str, Any]
    pipeline_spec: dict[str, Any]
    run_args: dict[str, Any] | None
    root_task: dict[str, Any]
    annotations: dict[str, str]
    run_name: str | None = None

    def to_body(self) -> dict[str, Any]:
        """Return the submit body; it references (does not copy) the stored objects."""

        return dict(root_task=self.root_task, annotations=self.annotations)

    def sync_from_body(self, body: Mapping[str, Any]) -> None:
        """Re-derive payload fields from a body that was normalized in place.

        ``root_task`` and ``annotations`` are adopted only when the body holds
        dicts for them. ``pipeline_spec`` and ``run_name`` are refreshed only
        when the (possibly updated) root task carries a dict under
        ``componentRef.spec``.
        """

        new_root_task = body.get("root_task")
        if isinstance(new_root_task, dict):
            self.root_task = new_root_task

        new_annotations = body.get("annotations")
        if isinstance(new_annotations, dict):
            # Annotation keys and values are always stored as strings.
            self.annotations = {str(name): str(text) for name, text in new_annotations.items()}

        if not isinstance(self.root_task, Mapping):
            return
        component_ref = self.root_task.get("componentRef")
        if not isinstance(component_ref, Mapping):
            return
        spec = component_ref.get("spec")
        if not isinstance(spec, dict):
            return

        self.pipeline_spec = spec
        name = spec.get("name")
        if isinstance(name, str) and name:
            self.run_name = name
        else:
            # Missing, empty, or non-string names clear the run name.
            self.run_name = None
90
+
91
+
92
@dataclass(frozen=True)
class PipelineWaitOutcome:
    """Normalized wait result attached to a run context.

    This is the generic OSS result boundary for wait lifecycle decisions.
    Downstreams can format legacy result dictionaries or notifications from
    this typed outcome without inventing their own metadata flags for success,
    timeout, failure counts, or fail-fast early exit.
    """

    status: str | None = None
    timed_out: bool = False
    early_exit: bool = False
    failed_count: int = 0
    error_count: int = 0
    elapsed_seconds: float = 0.0
    success_override: bool | None = None

    @property
    def success(self) -> bool | None:
        """Return generic success for completed waits, or None for timeout/unknown.

        Precedence: an explicit ``success_override`` wins; a timeout is
        unknown (``None``); an early exit or any failed/errored execution is a
        failure; otherwise the status decides (``SUCCEEDED`` is success, any
        other terminal status is failure, anything else is unknown).
        """

        if self.success_override is not None:
            return self.success_override
        if self.timed_out:
            return None
        if self.early_exit or self.failed_count > 0 or self.error_count > 0:
            return False
        status = str(self.status or "").upper()
        if status == "SUCCEEDED":
            return True
        if status in _TERMINAL_STATUSES:
            return False
        return None

    @staticmethod
    def _coerce_count(value: Any) -> int:
        """Return ``value`` as an int; empty or malformed values count as 0."""

        try:
            return int(value or 0)
        except (TypeError, ValueError):
            return 0

    @staticmethod
    def _count_statuses(status_counts: Mapping[str, Any], *statuses: str) -> int:
        """Sum the counts recorded for ``statuses``, ignoring malformed entries."""

        return sum(
            PipelineWaitOutcome._coerce_count(status_counts.get(status, 0))
            for status in statuses
        )

    @classmethod
    def _success_override_from_counts(
        cls,
        status_counts: Mapping[str, Any],
        *,
        terminal: bool,
        total: int,
    ) -> bool | None:
        """Derive a success verdict from status counts, when one is certain.

        Returns ``None`` unless the wait is terminal and at least one
        execution was counted. Any failed/errored/cancelled/invalid execution
        yields ``False``; ``True`` is returned only when every counted
        execution is in a terminal status.
        """

        if not terminal or total <= 0:
            return None
        unsuccessful = cls._count_statuses(
            status_counts,
            "FAILED",
            "SYSTEM_ERROR",
            "CANCELLED",
            "CANCELED",
            "INVALID",
        )
        if unsuccessful > 0:
            return False
        terminal_count = cls._count_statuses(status_counts, *_TERMINAL_STATUSES)
        if terminal_count == total:
            return True
        return None

    @classmethod
    def from_poll_result(
        cls,
        poll: "PipelineWaitPoll",
        result: Mapping[str, Any],
    ) -> "PipelineWaitOutcome":
        """Build an outcome from a wait poll and public wait result.

        The result's ``status`` is used when present, otherwise the poll's.
        Failure and error counts come from the poll's status counts; malformed
        counts are treated as 0, matching how the success override is derived.
        """

        timed_out = bool(result.get("timed_out"))
        early_exit = bool(result.get("early_exit"))
        success_override = cls._success_override_from_counts(
            poll.status_counts,
            terminal=poll.terminal and not timed_out,
            total=poll.total,
        )
        if early_exit and poll.total == 0:
            # An early exit with nothing counted is reported as a plain
            # failure rather than a fail-fast exit.
            early_exit = False
            success_override = False
        raw_status = result.get("status")
        return cls(
            status=str(raw_status) if raw_status is not None else poll.status,
            timed_out=timed_out,
            early_exit=early_exit,
            # Use the tolerant helper: a bare int() here used to raise on the
            # same malformed counts the override computation above accepts.
            failed_count=cls._count_statuses(poll.status_counts, "FAILED"),
            error_count=cls._count_statuses(poll.status_counts, "SYSTEM_ERROR"),
            elapsed_seconds=poll.elapsed_seconds,
            success_override=success_override,
        )

    @classmethod
    def from_wait_result(
        cls,
        result: Mapping[str, Any],
        metadata: Mapping[str, Any] | None = None,
    ) -> "PipelineWaitOutcome":
        """Build an outcome from a public wait result and optional metadata.

        ``metadata`` (when non-empty) is the preferred source for counts,
        flags, and elapsed time; ``result`` is the fallback. ``status`` is
        always taken from ``result``. Malformed count values are treated as 0.
        """

        source = metadata or result
        raw_status = result.get("status")
        status = str(raw_status) if raw_status is not None else None
        timed_out = bool(result.get("timed_out") or source.get("timed_out"))
        early_exit = bool(result.get("early_exit") or source.get("early_exit"))
        status_counts = source.get("status_counts")
        status_counts = status_counts if isinstance(status_counts, Mapping) else {}
        total = sum(cls._coerce_count(count) for count in status_counts.values())
        terminal = bool(status and (status.upper() == "ENDED" or status.upper() in _TERMINAL_STATUSES))
        success_override = cls._success_override_from_counts(
            status_counts,
            terminal=terminal and not timed_out,
            total=total,
        )
        if early_exit and total == 0:
            # Same normalization as from_poll_result: nothing counted means a
            # plain failure, not a fail-fast exit.
            early_exit = False
            success_override = False
        # Explicit counts win over counts derived from status_counts.
        failed_count = cls._coerce_count(
            source.get(
                "failed_count",
                result.get("failed_count", cls._count_statuses(status_counts, "FAILED")),
            )
        )
        error_count = cls._coerce_count(
            source.get(
                "error_count",
                result.get("error_count", cls._count_statuses(status_counts, "SYSTEM_ERROR")),
            )
        )
        return cls(
            status=status,
            timed_out=timed_out,
            early_exit=early_exit,
            failed_count=failed_count,
            error_count=error_count,
            elapsed_seconds=float(source.get("elapsed_seconds", 0.0) or 0.0),
            success_override=success_override,
        )
242
+
243
+
244
@dataclass
class PipelineRunContext:
    """State carried through one attempt of a pipeline run lifecycle.

    Downstreams can rely on this object for mutex ownership, graceful-shutdown
    state, notifications, retries, and scheduled timeout bookkeeping instead
    of scraping transient manager attributes.

    Attributes are documented inline next to each field below.
    """

    # --- identity of the submitted run -------------------------------------
    # Submitted pipeline run id, when an attempt reaches submit.
    run_id: str | None = None
    # Display/pipeline name derived from the submitted spec.
    run_name: str | None = None
    # Root execution id returned by the submit API.
    root_execution_id: str | None = None
    # Source path or URI used for the run, when path-backed.
    pipeline_path: str | Path | None = None

    # --- attempt bookkeeping ------------------------------------------------
    # Wall-clock attempt start time for downstream reporting.
    start_time: float | None = None
    # 1-based attempt number for submit/wait/retry lifecycle hooks.
    attempt: int = 1

    # --- submit inputs and outputs ------------------------------------------
    # Submit body for this attempt after normalization.
    submit_body: dict[str, Any] | None = None
    # Pipeline spec extracted from ``submit_body``.
    pipeline_spec: dict[str, Any] | None = None
    # Submit API response for this attempt, when available.
    response: dict[str, Any] | None = None
    # Generic wait result for this attempt, when wait ran.
    wait_outcome: PipelineWaitOutcome | None = None

    # --- retry linkage ------------------------------------------------------
    # Previous attempt context. This includes attempts that failed during
    # submit before a ``run_id`` existed; it is not only the previous
    # successfully submitted run.
    previous_context: "PipelineRunContext | None" = None
    # Error from the previous attempt that caused this retry.
    previous_error: Exception | None = None
    # Generic resource/mutex handoff flag. Hooks set it directly when a
    # resource should stay held for the replacement attempt: the current
    # attempt's lifecycle context can then skip release, and the next attempt
    # can inspect ``previous_context`` to reuse the carried resource.
    carry_resource_to_retry: bool = False

    # Extra hook-specific state carried through the lifecycle.
    metadata: dict[str, Any] = field(default_factory=dict)
289
+
290
+
291
@dataclass
class PipelineWaitPoll:
    """Snapshot of a single wait-loop observation, handed to lifecycle hooks."""

    # Required observation data.
    run_id: str
    run: dict[str, Any]
    status: str
    status_counts: dict[str, int]
    total: int
    terminal: bool

    # Optional extras; absent unless the caller supplies them.
    graph_state: dict[str, Any] | None = None
    elapsed_seconds: float = 0.0
    execution_state_timings: dict[str, dict[str, Any]] = field(default_factory=dict)
304
+
305
+
306
@dataclass
class PipelineRunHooks:
    """Extension surface for downstream (tangle-deploy) pipeline-run behavior.

    Every method is a seam with a conservative OSS default. Subclasses override
    the ones they need (provider-specific auth wrappers, cloud-object loading,
    JOB_CONFIG time input, run-as annotations, mutex/schedule behavior,
    graceful shutdown, notifications, hosted logs, or from-container runtime
    defaults) instead of forking the generic pipeline-run manager.
    """

    logger: Logger = field(default_factory=get_default_logger)
    trusted_python_sources: list[str] = field(default_factory=list)
    allow_all_hydration: bool = False

    # ------------------------------------------------------------------
    # Loading and preparing the pipeline
    # ------------------------------------------------------------------

    def read_pipeline_yaml(self, pipeline_path: str | Path) -> dict[str, Any]:
        """Load a local pipeline YAML file and return its top-level mapping.

        Raises:
            UnsupportedPipelineRunFeatureError: for ``gs://`` paths, which the
                OSS default hooks do not load.
            PipelineRunError: when the document is not a mapping.
        """

        if str(pipeline_path).startswith("gs://"):
            raise UnsupportedPipelineRunFeatureError(
                "gs:// pipeline loading is not supported by the OSS CLI default hooks"
            )
        with Path(pipeline_path).open(encoding="utf-8") as stream:
            document = yaml.safe_load(stream)
        if isinstance(document, dict):
            return document
        raise PipelineRunError("Pipeline YAML must contain a top-level mapping")

    def hydrate_pipeline(
        self,
        pipeline_path: str | Path,
        *,
        resolution_overrides: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """Hydrate the pipeline file through ``PipelineHydrator`` and return its data.

        The API client is taken from a ``client`` attribute when one is set on
        the hooks object, otherwise from ``_get_client()`` when that method
        exists. Hydration failures are re-raised as ``PipelineRunError``.
        """

        api_client = getattr(self, "client", None)
        if api_client is None and hasattr(self, "_get_client"):
            api_client = self._get_client()
        if api_client is None:
            raise PipelineRunError("Failed to create TangleApiClient")

        hydrator = PipelineHydrator(
            client=api_client,
            resolution_overrides=resolution_overrides,
            logger=self.logger,
            trusted_python_sources=self.trusted_python_sources,
            allow_all_hydration=self.allow_all_hydration,
        )
        try:
            hydrated = hydrator.hydrate_file(pipeline_path)
            return hydrated.data
        except HydrationError as exc:
            raise PipelineRunError(str(exc)) from exc

    def prepare_pipeline_spec(
        self,
        pipeline_spec: dict[str, Any],
        *,
        pipeline_path: str | Path | None,
        run_args: dict[str, Any] | None,
        hydrate: bool,
    ) -> dict[str, Any]:
        """Seam for downstream validation/hydration/layout/annotation transforms.

        The default hands back the already-loaded spec untouched. TD can
        override this to run schema validation, auto-layout, source
        annotations, or any other pre-submit preparation before the generic
        payload conversion runs.
        """

        return pipeline_spec

    def prepare_run_arguments(
        self,
        pipeline_spec: dict[str, Any],
        run_args: dict[str, Any] | None,
    ) -> dict[str, Any] | None:
        """Seam for TD JOB_CONFIG time input / scheduled runtime behavior.

        The default returns ``run_args`` unchanged.
        """

        return run_args

    def transform_run_name(
        self,
        run_name: str,
        *,
        pipeline_spec: dict[str, Any],
        run_args: dict[str, Any] | None,
    ) -> str:
        """Seam for downstream run-name policies after template expansion.

        The default returns ``run_name`` unchanged.
        """

        return run_name

    def extra_submit_annotations(
        self,
        *,
        pipeline_spec: dict[str, Any],
        pipeline_path: str | Path | None,
        run_as: str | None = None,
    ) -> dict[str, str]:
        """Seam for downstream source/run-as/git annotations.

        The default adds no annotations and rejects any truthy ``run_as``
        with ``UnsupportedPipelineRunFeatureError``.
        """

        if run_as:
            raise UnsupportedPipelineRunFeatureError(
                "--run-as is a downstream extension point and has no OSS default behavior"
            )
        return {}

    # ------------------------------------------------------------------
    # Submit lifecycle
    # ------------------------------------------------------------------

    def before_submit(self, pipeline_spec: dict[str, Any]) -> None:
        """Legacy pre-submit hook, kept for existing downstreams; no-op by default."""

    def before_submit_context(self, context: PipelineRunContext) -> None:
        """Context-aware pre-submit hook (e.g. TD mutex/overlap checks).

        The default forwards to the legacy ``before_submit`` when the context
        carries a pipeline spec.
        """

        spec = context.pipeline_spec
        if spec is None:
            return
        self.before_submit(spec)

    def after_submit(self, response: Mapping[str, Any]) -> None:
        """Legacy post-submit hook for downstream start notifications; no-op by default."""

    def after_submit_context(self, context: PipelineRunContext) -> None:
        """Context-aware post-submit hook for downstream start notifications.

        The default forwards to the legacy ``after_submit`` when the context
        carries a submit response.
        """

        response = context.response
        if response is None:
            return
        self.after_submit(response)

    def on_submit_error(
        self,
        error: Exception,
        *,
        context: PipelineRunContext,
    ) -> None:
        """Seam for downstream submit-error notifications/cleanup; no-op by default."""

    # ------------------------------------------------------------------
    # Run lifecycle and retries
    # ------------------------------------------------------------------

    def around_run(self, context: PipelineRunContext) -> AbstractContextManager[Any]:
        """Context-manager seam for mutex/run lifecycle ownership.

        The default is a ``nullcontext`` that does nothing on enter or exit.
        """

        return nullcontext()

    def before_run_lifecycle(self, context: PipelineRunContext) -> None:
        """Called before a run attempt enters the lifecycle context; no-op by default."""

    def after_run_lifecycle(
        self,
        context: PipelineRunContext,
        *,
        success: bool,
        error: Exception | None = None,
    ) -> None:
        """Called after the lifecycle context exits; no-op by default."""

    def on_fail_fast_before_release(
        self,
        context: PipelineRunContext,
        error: Exception,
    ) -> None:
        """Called before lifecycle release when fail-fast aborts a run; no-op by default."""

    def before_retry(
        self,
        context: PipelineRunContext,
        error: Exception,
        *,
        next_attempt: int,
    ) -> None:
        """Called before retrying a failed submit/run attempt; no-op by default."""

    def after_retry_submit(self, context: PipelineRunContext) -> None:
        """Called after a retry successfully submits a new run; no-op by default."""

    def should_cancel_previous_run(
        self,
        context: PipelineRunContext,
        error: Exception,
        *,
        next_attempt: int,
    ) -> bool:
        """Tell the retry logic whether to cancel the previous run first.

        The default never cancels.
        """

        return False

    # ------------------------------------------------------------------
    # Wait loop
    # ------------------------------------------------------------------

    def before_wait(self, context: PipelineRunContext) -> None:
        """Called before polling a run; no-op by default."""

    def after_poll(self, poll: PipelineWaitPoll, context: PipelineRunContext) -> None:
        """Called after each run/graph-state poll; no-op by default."""

    def should_exit_early(self, poll: PipelineWaitPoll, context: PipelineRunContext) -> bool:
        """Decide whether to stop waiting before terminal state or timeout.

        Fail-fast is opt-in: it only applies when the context metadata sets
        ``exit_on_first_failure``. With the flag set, the wait loop returns as
        soon as any execution is counted as failed or errored, before the full
        graph reaches a terminal state.
        """

        if not context.metadata.get("exit_on_first_failure"):
            return False
        for failure_status in _FAILURE_EARLY_EXIT_STATUSES:
            if int(poll.status_counts.get(failure_status, 0) or 0) > 0:
                return True
        return False

    def on_timeout(self, poll: PipelineWaitPoll, context: PipelineRunContext) -> None:
        """Called when wait reaches max_wait; no-op by default."""

    def on_terminal(self, poll: PipelineWaitPoll, context: PipelineRunContext) -> None:
        """Called when wait observes terminal state; no-op by default."""

    def on_early_exit_before_release(
        self,
        poll: PipelineWaitPoll,
        context: PipelineRunContext,
    ) -> None:
        """Called for fail-fast early exit before lifecycle release; no-op by default."""

    def after_wait(self, result: Mapping[str, Any]) -> None:
        """Legacy hook for terminal downstream notifications; no-op by default."""

    def wait_outcome(
        self,
        poll: PipelineWaitPoll,
        result: Mapping[str, Any],
        context: PipelineRunContext,
    ) -> PipelineWaitOutcome:
        """Produce the typed wait outcome that is attached to the run context.

        The default ignores ``context`` and builds the outcome from the poll
        and the public wait result.
        """

        del context
        return PipelineWaitOutcome.from_poll_result(poll, result)

    def after_wait_context(self, result: Mapping[str, Any], context: PipelineRunContext) -> None:
        """Context-aware hook called after wait returns.

        Legacy behavior is preserved: ``after_wait(result)`` fires only for
        terminal observations, never for timeouts or fail-fast/early-exit
        returns. Downstreams that care about those outcomes should override
        ``on_timeout``, ``on_early_exit_before_release``, or this hook.
        """

        if result.get("timed_out") or result.get("early_exit"):
            return
        raw_status = result.get("status")
        if not raw_status:
            # No status means no terminal observation to report.
            return
        status_text = str(raw_status).upper()
        if status_text == "ENDED" or status_text in _TERMINAL_STATUSES:
            self.after_wait(result)

    def should_enforce_max_wait(self, context: PipelineRunContext) -> bool:
        """Return False for downstream-controlled scheduled timeout policies.

        The default always enforces ``max_wait``.
        """

        return True

    # ------------------------------------------------------------------
    # Polling data sources
    # ------------------------------------------------------------------

    def poll_run_snapshot(
        self,
        manager: "PipelineRunManager",
        run_id: str,
        context: PipelineRunContext,
    ) -> Mapping[str, Any] | None:
        """Optionally supply a run-like snapshot for wait polling.

        Downstreams whose wait API is rooted at an execution id can return a
        synthetic run snapshot here so the generic manager does not have to
        call ``pipeline_runs_get(run_id)``. The default supplies nothing.
        """

        return None

    def graph_state_execution_id(
        self,
        run: Mapping[str, Any],
        context: PipelineRunContext,
    ) -> str | None:
        """Pick the execution id used for graph-state polling.

        The run's ``root_execution_id`` wins when truthy; otherwise the
        context's value is used. The id is returned as a string, or ``None``
        when neither source has one.
        """

        execution_id = run.get("root_execution_id") or context.root_execution_id
        if execution_id is None:
            return None
        return str(execution_id)

    def on_poll_error(self, error: Exception, context: PipelineRunContext) -> float | None:
        """Handle a polling error.

        Return a sleep interval to retry, or ``None`` to propagate the error.
        The default propagates.
        """

        return None

    def fetch_logs(self, client: Any, execution_id: str) -> Any:
        """Seam for alternate TD log providers; OSS reads logs from the Tangle API only."""

        return client.executions_container_log(execution_id)
580
+
581
+
582
+ @dataclass
583
+ class PipelineRunManager(TangleCliHandler):
584
+ client: Any
585
+ hooks: PipelineRunHooks = field(default_factory=PipelineRunHooks)
586
+ logger: Logger = field(default_factory=get_default_logger)
587
+ base_url: str | None = None
588
+
589
def __post_init__(self) -> None:
    """Run the ``TangleCliHandler`` initializer and share the client with the hooks.

    The dataclass-generated ``__init__`` does not call the base initializer,
    so it is invoked here explicitly. Unless the manager is acting as its own
    hooks object, the hooks then receive the manager's client as ``client``.
    """

    TangleCliHandler.__init__(self, client=self.client, logger=self.logger, base_url=self.base_url)
    hooks = self.hooks
    if hooks is self:
        return
    hooks.client = self.client
598
+
599
+ @staticmethod
600
+ def to_plain(value: Any) -> Any:
601
+ if isinstance(value, Mapping):
602
+ return {key: PipelineRunManager.to_plain(val) for key, val in value.items()}
603
+ if hasattr(value, "to_dict"):
604
+ return value.to_dict()
605
+ if hasattr(value, "model_dump"):
606
+ return value.model_dump(by_alias=True)
607
+ if isinstance(value, list):
608
+ return [PipelineRunManager.to_plain(item) for item in value]
609
+ if hasattr(value, "__dict__"):
610
+ return {
611
+ key: PipelineRunManager.to_plain(val)
612
+ for key, val in vars(value).items()
613
+ if not key.startswith("_")
614
+ }
615
+ return value
616
+
617
+ @staticmethod
618
+ def extract_default_arguments(pipeline_spec: dict[str, Any]) -> dict[str, Any]:
619
+ arguments: dict[str, Any] = {}
620
+ inputs = pipeline_spec.get("inputs", [])
621
+ if isinstance(inputs, list):
622
+ for input_item in inputs:
623
+ if isinstance(input_item, dict) and "name" in input_item and "default" in input_item:
624
+ arguments[input_item["name"]] = input_item["default"]
625
+ return arguments
626
+
627
+ @staticmethod
628
+ def convert_yaml_to_payload(
629
+ pipeline_spec: dict[str, Any],
630
+ run_args: dict[str, Any] | None = None,
631
+ ) -> dict[str, Any]:
632
+ payload: dict[str, Any] = {"root_task": {"componentRef": {"spec": pipeline_spec}}}
633
+ arguments = PipelineRunManager.extract_default_arguments(pipeline_spec)
634
+ if run_args:
635
+ arguments.update(run_args)
636
+
637
+ pipeline_inputs = pipeline_spec.get("inputs", [])
638
+ valid_inputs = {inp.get("name") for inp in pipeline_inputs if isinstance(inp, dict) and inp.get("name")}
639
+ if valid_inputs:
640
+ arguments = {key: value for key, value in arguments.items() if key in valid_inputs}
641
+
642
+ missing: list[str] = []
643
+ for input_item in pipeline_inputs if isinstance(pipeline_inputs, list) else []:
644
+ if not isinstance(input_item, dict):
645
+ continue
646
+ name = input_item.get("name")
647
+ if name and "default" not in input_item and not input_item.get("optional", False) and name not in arguments:
648
+ missing.append(name)
649
+ if missing:
650
+ raise PipelineRunError(
651
+ f"Missing {len(missing)} required pipeline input(s): {', '.join(sorted(missing))}"
652
+ )
653
+
654
+ if arguments:
655
+ payload["root_task"]["arguments"] = arguments
656
+ return payload
657
+
658
+ @staticmethod
659
+ def sanitize_submit_payload(value: Any) -> Any:
660
+ """Return a submit-safe payload with TD-compatible componentRef fixes.
661
+
662
+ The hydrator uses explicit local-only annotations such as
663
+ ``_source_dir`` while recursively resolving local files. Those
664
+ provenance keys must not be submitted to the backend. User-supplied
665
+ underscore-prefixed payload keys are otherwise valid and preserved.
666
+ TD also normalizes ``componentRef.text`` into ``componentRef.spec``
667
+ for component-library entries before submit; keep the same behavior
668
+ here.
669
+ """
670
+
671
+ if isinstance(value, list):
672
+ return [PipelineRunManager.sanitize_submit_payload(item) for item in value]
673
+ if not isinstance(value, dict):
674
+ return value
675
+
676
+ local_only_keys = {"_source_dir", "_recursive_params"}
677
+ cleaned: dict[str, Any] = {}
678
+ for key, item in value.items():
679
+ if str(key) in local_only_keys:
680
+ continue
681
+ cleaned[key] = PipelineRunManager.sanitize_submit_payload(item)
682
+
683
+ component_ref = cleaned.get("componentRef")
684
+ if isinstance(component_ref, dict) and "text" in component_ref and not component_ref.get("spec"):
685
+ text_content = component_ref.pop("text")
686
+ if isinstance(text_content, str):
687
+ try:
688
+ component_ref["spec"] = yaml.safe_load(text_content)
689
+ except yaml.YAMLError as exc:
690
+ component_name = component_ref.get("name", "unknown")
691
+ raise PipelineRunError(
692
+ f"Failed to parse YAML in componentRef {component_name!r}: {exc}"
693
+ ) from exc
694
+ else:
695
+ component_ref["spec"] = text_content
696
+ component_ref["spec"] = PipelineRunManager.sanitize_submit_payload(component_ref["spec"])
697
+
698
+ return cleaned
699
+
700
+ @staticmethod
701
+ def normalize_submit_body_in_place(body: dict[str, Any]) -> dict[str, Any]:
702
+ """Normalize a submit body in place and return it.
703
+
704
+ This is the mutable counterpart to :meth:`sanitize_submit_payload` for
705
+ callers that already have a body object. It keeps component-ref text
706
+ normalization and submit-only field stripping in the OSS submit layer,
707
+ instead of requiring downstream runners to patch bodies before submit.
708
+ """
709
+
710
+ sanitized = PipelineRunManager.sanitize_submit_payload(body)
711
+ if not isinstance(sanitized, dict):
712
+ raise PipelineRunError("submit body must be a mapping")
713
+ body.clear()
714
+ body.update(sanitized)
715
+ return body
716
+
717
+ @staticmethod
718
+ def is_terminal_status(status: str | None) -> bool:
719
+ return bool(status and status.upper() in _TERMINAL_STATUSES)
720
+
721
+ @staticmethod
722
+ def status_counts_from_run(run: Mapping[str, Any]) -> dict[str, int]:
723
+ stats = run.get("execution_status_stats")
724
+ if not isinstance(stats, Mapping):
725
+ return {}
726
+ result: dict[str, int] = {}
727
+ for key, value in stats.items():
728
+ try:
729
+ result[str(key).upper()] = int(value or 0)
730
+ except (TypeError, ValueError):
731
+ continue
732
+ return result
733
+
734
+ @staticmethod
735
+ def _counts_mapping(value: Any) -> Mapping[str, Any] | None:
736
+ if isinstance(value, Mapping):
737
+ return value
738
+ if value is not None and hasattr(value, "items"):
739
+ return value
740
+ return None
741
+
742
+ @staticmethod
743
+ def status_counts_from_graph_state(graph_state: Mapping[str, Any] | Any) -> dict[str, int]:
744
+ for key in ("status_totals", "execution_status_stats"):
745
+ stats = graph_state.get(key) if isinstance(graph_state, Mapping) else getattr(graph_state, key, None)
746
+ counts = PipelineRunManager._counts_mapping(stats)
747
+ if counts is not None:
748
+ return {
749
+ str(status).upper(): int(count or 0)
750
+ for status, count in counts.items()
751
+ }
752
+ child_stats = (
753
+ graph_state.get("child_execution_status_stats")
754
+ if isinstance(graph_state, Mapping)
755
+ else getattr(graph_state, "child_execution_status_stats", None)
756
+ )
757
+ totals: dict[str, int] = {}
758
+ child_counts = PipelineRunManager._counts_mapping(child_stats)
759
+ if child_counts is not None:
760
+ for stats in child_counts.values():
761
+ counts = PipelineRunManager._counts_mapping(stats)
762
+ if counts is None:
763
+ continue
764
+ for status, count in counts.items():
765
+ totals[str(status).upper()] = totals.get(str(status).upper(), 0) + int(count or 0)
766
+ return totals
767
+
768
@staticmethod
def execution_status_counts_from_graph_state(graph_state: Mapping[str, Any] | Any) -> dict[str, dict[str, int]]:
    """Return per-execution status counts from a graph-state response.

    Reads ``child_execution_status_stats`` (by key for mappings, by attribute
    otherwise) and returns ``{execution_id: {STATUS: count}}``.  Children whose
    stats are not mapping-like are omitted; individual counts that cannot be
    converted to ``int`` are skipped.
    """

    if isinstance(graph_state, Mapping):
        raw_children = graph_state.get("child_execution_status_stats")
    else:
        raw_children = getattr(graph_state, "child_execution_status_stats", None)

    children = PipelineRunManager._counts_mapping(raw_children)
    if children is None:
        return {}

    per_execution: dict[str, dict[str, int]] = {}
    for execution_id, raw_stats in children.items():
        stats = PipelineRunManager._counts_mapping(raw_stats)
        if stats is None:
            continue
        normalized: dict[str, int] = {}
        for status, count in stats.items():
            try:
                normalized[str(status).upper()] = int(count or 0)
            except (TypeError, ValueError):
                continue
        per_execution[str(execution_id)] = normalized
    return per_execution
793
+
794
@staticmethod
def status_from_counts(status_counts: Mapping[str, int]) -> str | None:
    """Pick one representative status from a status-count mapping.

    Active statuses are checked before terminal ones; the first status with a
    positive count wins.  Returns ``None`` when no known status has a count.
    """
    for candidates in (_ACTIVE_STATUSES, _TERMINAL_STATUSES):
        for status in candidates:
            if int(status_counts.get(status, 0) or 0) > 0:
                return status
    return None
803
+
804
+ @staticmethod
805
+ def status_from_run(run: Mapping[str, Any]) -> str | None:
806
+ summary = run.get("execution_summary")
807
+ if isinstance(summary, Mapping) and summary.get("has_ended") is True:
808
+ stats = run.get("execution_status_stats")
809
+ if isinstance(stats, Mapping):
810
+ for status in ("FAILED", "SYSTEM_ERROR", "CANCELLED", "CANCELED"):
811
+ if int(stats.get(status, 0) or 0) > 0:
812
+ return status
813
+ if int(stats.get("SUCCEEDED", 0) or 0) > 0:
814
+ return "SUCCEEDED"
815
+ return "ENDED"
816
+ stats = run.get("execution_status_stats")
817
+ if isinstance(stats, Mapping):
818
+ for status in _ACTIVE_STATUSES:
819
+ if int(stats.get(status, 0) or 0) > 0:
820
+ return status
821
+ for status in _TERMINAL_STATUSES:
822
+ if int(stats.get(status, 0) or 0) > 0:
823
+ return status
824
+ return None
825
+
826
+ @staticmethod
827
+ def _accepts_client_keyword(method: Any) -> bool:
828
+ try:
829
+ parameters = inspect.signature(method).parameters
830
+ except (TypeError, ValueError):
831
+ return False
832
+ return "client" in parameters or any(
833
+ parameter.kind is inspect.Parameter.VAR_KEYWORD
834
+ for parameter in parameters.values()
835
+ )
836
+
837
+ def load_pipeline_for_submit(
838
+ self,
839
+ pipeline_path: str | Path,
840
+ *,
841
+ hydrate: bool = True,
842
+ resolution_overrides: dict[str, Any] | None = None,
843
+ ) -> dict[str, Any]:
844
+ if hydrate:
845
+ hydrate_pipeline = self.hooks.hydrate_pipeline
846
+ hydrate_kwargs: dict[str, Any] = {"resolution_overrides": resolution_overrides}
847
+ if self._accepts_client_keyword(hydrate_pipeline):
848
+ hydrate_kwargs["client"] = self._get_client()
849
+ return hydrate_pipeline(pipeline_path, **hydrate_kwargs)
850
+ return self.hooks.read_pipeline_yaml(pipeline_path)
851
+
852
@staticmethod
def expand_run_name_template(
    template: str,
    pipeline_spec: dict[str, Any],
    run_args: dict[str, Any] | None = None,
) -> str:
    """Expand ``${arguments.NAME}`` placeholders from defaults + run args.

    Run arguments take precedence over the spec's default arguments.  A
    placeholder whose argument is missing or ``None`` is left untouched.
    """

    values = PipelineRunManager.extract_default_arguments(pipeline_spec)
    if run_args:
        values.update(run_args)

    def substitute(match: re.Match[str]) -> str:
        resolved = values.get(match.group(1))
        if resolved is None:
            return match.group(0)
        return str(resolved)

    placeholder = re.compile(r"\$\{arguments\.([^}]+)\}")
    return placeholder.sub(substitute, template)
869
+
870
+ def apply_run_name_template(
871
+ self,
872
+ pipeline_spec: dict[str, Any],
873
+ run_args: dict[str, Any] | None = None,
874
+ ) -> dict[str, Any]:
875
+ annotations = pipeline_spec.get("metadata", {}).get("annotations", {})
876
+ template = annotations.get("run-name-template") if isinstance(annotations, Mapping) else None
877
+ if not template:
878
+ return pipeline_spec
879
+ transformed = copy.deepcopy(pipeline_spec)
880
+ expanded = self.expand_run_name_template(str(template), transformed, run_args)
881
+ transformed["name"] = self.hooks.transform_run_name(
882
+ expanded,
883
+ pipeline_spec=transformed,
884
+ run_args=run_args,
885
+ )
886
+ return transformed
887
+
888
+ def prepare_pipeline_spec_for_submit(
889
+ self,
890
+ pipeline_spec: dict[str, Any],
891
+ *,
892
+ pipeline_path: str | Path | None = None,
893
+ run_args: dict[str, Any] | None = None,
894
+ hydrate: bool = True,
895
+ ) -> dict[str, Any]:
896
+ return self.hooks.prepare_pipeline_spec(
897
+ pipeline_spec,
898
+ pipeline_path=pipeline_path,
899
+ run_args=run_args,
900
+ hydrate=hydrate,
901
+ )
902
+
903
def prepare_submit_payload_from_spec(
    self,
    pipeline_spec: dict[str, Any],
    *,
    run_args: dict[str, Any] | None = None,
    annotations: dict[str, str] | None = None,
    pipeline_path: str | Path | None = None,
    run_as: str | None = None,
    hydrate: bool = True,
) -> PipelineSubmitPayload:
    """Prepare the generic submit payload from a pipeline spec.

    The order here is the submit-body contract shared by OSS and TD:
    prepare the spec, prepare runtime arguments, expand run-name templates,
    convert/sanitize the payload, then merge downstream/default annotations
    before caller-supplied annotations override them.
    """

    spec = self.prepare_pipeline_spec_for_submit(
        pipeline_spec,
        pipeline_path=pipeline_path,
        run_args=run_args,
        hydrate=hydrate,
    )
    final_run_args = self.hooks.prepare_run_arguments(spec, run_args)
    spec = self.apply_run_name_template(spec, final_run_args)

    # Convert a copy so the prepared spec kept on the payload is not touched.
    converted = self.convert_yaml_to_payload(copy.deepcopy(spec), final_run_args)
    payload = self.sanitize_submit_payload(converted)
    root_task = payload["root_task"]

    # Prefer the spec embedded in the root task; fall back to the prepared one.
    submit_spec = spec
    if isinstance(root_task, Mapping):
        component_ref = root_task.get("componentRef")
        if isinstance(component_ref, Mapping):
            embedded = component_ref.get("spec")
            if isinstance(embedded, dict):
                submit_spec = embedded

    merged_annotations = self.hooks.extra_submit_annotations(
        pipeline_spec=spec,
        pipeline_path=pipeline_path,
        run_as=run_as,
    )
    if annotations:
        # Caller-supplied annotations win over hook-provided ones.
        merged_annotations.update({str(k): str(v) for k, v in annotations.items()})

    name = submit_spec.get("name")
    return PipelineSubmitPayload(
        prepared_spec=spec,
        pipeline_spec=submit_spec,
        run_args=final_run_args,
        root_task=root_task,
        annotations=merged_annotations,
        run_name=name if isinstance(name, str) and name else None,
    )
954
+
955
+ def build_submit_body_from_spec(
956
+ self,
957
+ pipeline_spec: dict[str, Any],
958
+ *,
959
+ run_args: dict[str, Any] | None = None,
960
+ annotations: dict[str, str] | None = None,
961
+ pipeline_path: str | Path | None = None,
962
+ run_as: str | None = None,
963
+ hydrate: bool = True,
964
+ ) -> dict[str, Any]:
965
+ """Build a submit body from an already-prepared pipeline spec."""
966
+
967
+ return self.prepare_submit_payload_from_spec(
968
+ pipeline_spec,
969
+ run_args=run_args,
970
+ annotations=annotations,
971
+ pipeline_path=pipeline_path,
972
+ run_as=run_as,
973
+ hydrate=hydrate,
974
+ ).to_body()
975
+
976
def prepare_submit_payload(
    self,
    pipeline_path: str | Path,
    *,
    run_args: dict[str, Any] | None = None,
    annotations: dict[str, str] | None = None,
    hydrate: bool = True,
    run_as: str | None = None,
    resolution_overrides: dict[str, Any] | None = None,
) -> PipelineSubmitPayload:
    """Load the pipeline at *pipeline_path* and prepare its submit payload."""
    loaded_spec = self.load_pipeline_for_submit(
        pipeline_path,
        hydrate=hydrate,
        resolution_overrides=resolution_overrides,
    )
    payload_options: dict[str, Any] = {
        "run_args": run_args,
        "annotations": annotations,
        "pipeline_path": pipeline_path,
        "run_as": run_as,
        "hydrate": hydrate,
    }
    return self.prepare_submit_payload_from_spec(loaded_spec, **payload_options)
999
+
1000
+ def build_submit_body(
1001
+ self,
1002
+ pipeline_path: str | Path,
1003
+ *,
1004
+ run_args: dict[str, Any] | None = None,
1005
+ annotations: dict[str, str] | None = None,
1006
+ hydrate: bool = True,
1007
+ run_as: str | None = None,
1008
+ resolution_overrides: dict[str, Any] | None = None,
1009
+ ) -> dict[str, Any]:
1010
+ return self.prepare_submit_payload(
1011
+ pipeline_path,
1012
+ run_args=run_args,
1013
+ annotations=annotations,
1014
+ hydrate=hydrate,
1015
+ run_as=run_as,
1016
+ resolution_overrides=resolution_overrides,
1017
+ ).to_body()
1018
+
1019
@staticmethod
def response_run_context(
    response: Mapping[str, Any],
    *,
    submit_body: dict[str, Any],
    pipeline_path: str | Path | None = None,
    attempt: int = 1,
) -> PipelineRunContext:
    """Build a ``PipelineRunContext`` from a submit response and its body.

    ``id`` and ``root_execution_id`` are stringified when present.  The run
    name and pipeline spec come from ``root_task.componentRef.spec`` in the
    submit body when that value is a dict.
    """
    embedded_spec = submit_body.get("root_task", {}).get("componentRef", {}).get("spec")
    spec = embedded_spec if isinstance(embedded_spec, dict) else None
    name = spec.get("name") if spec is not None else None
    if not (isinstance(name, str) and name):
        name = None

    def optional_str(key: str) -> str | None:
        value = response.get(key)
        return None if value is None else str(value)

    return PipelineRunContext(
        run_id=optional_str("id"),
        run_name=name,
        root_execution_id=optional_str("root_execution_id"),
        pipeline_path=pipeline_path,
        start_time=time.time(),
        attempt=attempt,
        submit_body=submit_body,
        pipeline_spec=spec,
        response=dict(response),
    )
1044
+
1045
def submit_prepared_body(
    self,
    body: dict[str, Any],
    *,
    pipeline_path: str | Path | None = None,
    attempt: int = 1,
    context: PipelineRunContext | None = None,
    notify_submit_error: bool = True,
) -> dict[str, Any]:
    """Submit an already-built body and keep the run context in sync.

    The body is normalized in place, the context (given or freshly created)
    is filled in and passed to ``before_submit_context``, then the run is
    created through the client.  On failure ``on_submit_error`` is invoked
    (unless disabled) and the exception re-raised.  On success the context is
    updated from the response and handed to ``after_submit_context``.  A
    non-dict response is replaced by an empty dict.
    """
    self.normalize_submit_body_in_place(body)
    embedded_spec = body["root_task"]["componentRef"]["spec"]
    spec = embedded_spec if isinstance(embedded_spec, dict) else None

    ctx = context or PipelineRunContext(
        pipeline_path=pipeline_path,
        start_time=time.time(),
        attempt=attempt,
    )
    name = spec.get("name") if spec is not None else None
    ctx.run_name = name if isinstance(name, str) and name else None
    ctx.pipeline_path = pipeline_path
    ctx.attempt = attempt
    ctx.submit_body = body
    ctx.pipeline_spec = spec
    self.hooks.before_submit_context(ctx)

    client = self._require_client()
    try:
        response = self.to_plain(client.pipeline_runs_create(body=body))
    except Exception as exc:
        if notify_submit_error:
            self.hooks.on_submit_error(exc, context=ctx)
        raise
    if not isinstance(response, dict):
        response = {}

    submitted = self.response_run_context(
        response,
        submit_body=body,
        pipeline_path=pipeline_path,
        attempt=attempt,
    )
    for attr in ("run_id", "run_name", "root_execution_id", "submit_body", "pipeline_spec"):
        setattr(ctx, attr, getattr(submitted, attr))
    ctx.response = response
    self.hooks.after_submit_context(ctx)
    return response
1091
+
1092
def submit_prepared_payload(
    self,
    payload: PipelineSubmitPayload,
    *,
    pipeline_path: str | Path | None = None,
    attempt: int = 1,
    context: PipelineRunContext | None = None,
) -> dict[str, Any]:
    """Submit *payload* and sync it with the body that was actually sent."""
    submit_body = payload.to_body()
    result = self.submit_prepared_body(
        submit_body,
        pipeline_path=pipeline_path,
        attempt=attempt,
        context=context,
    )
    # The body may have been changed in place during submit; mirror it back.
    payload.sync_from_body(submit_body)
    return result
1109
+
1110
+ def submit_pipeline_spec(
1111
+ self,
1112
+ pipeline_spec: dict[str, Any],
1113
+ *,
1114
+ run_args: dict[str, Any] | None = None,
1115
+ annotations: dict[str, str] | None = None,
1116
+ pipeline_path: str | Path | None = None,
1117
+ run_as: str | None = None,
1118
+ hydrate: bool = True,
1119
+ attempt: int = 1,
1120
+ ) -> dict[str, Any]:
1121
+ payload = self.prepare_submit_payload_from_spec(
1122
+ pipeline_spec,
1123
+ run_args=run_args,
1124
+ annotations=annotations,
1125
+ pipeline_path=pipeline_path,
1126
+ run_as=run_as,
1127
+ hydrate=hydrate,
1128
+ )
1129
+ return self.submit_prepared_payload(payload, pipeline_path=pipeline_path, attempt=attempt)
1130
+
1131
+ def submit_pipeline(
1132
+ self,
1133
+ pipeline_path: str | Path,
1134
+ *,
1135
+ run_args: dict[str, Any] | None = None,
1136
+ annotations: dict[str, str] | None = None,
1137
+ hydrate: bool = True,
1138
+ run_as: str | None = None,
1139
+ resolution_overrides: dict[str, Any] | None = None,
1140
+ attempt: int = 1,
1141
+ ) -> dict[str, Any]:
1142
+ payload = self.prepare_submit_payload(
1143
+ pipeline_path,
1144
+ run_args=run_args,
1145
+ annotations=annotations,
1146
+ hydrate=hydrate,
1147
+ run_as=run_as,
1148
+ resolution_overrides=resolution_overrides,
1149
+ )
1150
+ return self.submit_prepared_payload(payload, pipeline_path=pipeline_path, attempt=attempt)
1151
+
1152
def get_run(self, run_id: str, *, include_execution_stats: bool = True) -> dict[str, Any]:
    """Fetch one pipeline run from the client and return it via ``to_plain``."""
    fetched = self.client.pipeline_runs_get(
        run_id,
        include_execution_stats=include_execution_stats,
    )
    return self.to_plain(fetched)
1159
+
1160
def get_run_details(
    self,
    run_id: str,
    *,
    include_annotations: bool = False,
    include_execution_state: bool = False,
    include_implementations: bool = False,
    execution_id: str | None = None,
) -> dict[str, Any]:
    """Return run details by delegating to ``PipelineRunDetails``."""
    details = PipelineRunDetails(client=self.client)
    return details.get_run_details_output(
        run_id,
        include_implementations=include_implementations,
        include_annotations=include_annotations,
        include_execution_state=include_execution_state,
        execution_id=execution_id,
    )
1176
+
1177
def cancel_run(self, run_id: str) -> dict[str, Any]:
    """Cancel a run and return the plain response.

    When the converted response is empty or falsy, a synthetic
    ``{"id": run_id, "cancelled": True}`` acknowledgement is returned instead.
    """
    acknowledgement = self.to_plain(self.client.pipeline_runs_cancel(run_id))
    if acknowledgement:
        return acknowledgement
    return {"id": run_id, "cancelled": True}
1179
+
1180
+ def graph_state(self, execution_id: str) -> Mapping[str, Any] | Any:
1181
+ graph_state = self.client.executions_graph_execution_state(execution_id)
1182
+ return self.to_plain(graph_state)
1183
+
1184
def graph_state_output(self, run_ids: list[str], *, timeout: float = 30.0) -> dict[str, Any]:
    """Return graph-state output for *run_ids* via ``PipelineRunDetails``."""
    details = PipelineRunDetails(client=self.client)
    return details.get_graph_state_output(run_ids, timeout=timeout)
1186
+
1187
def logs(self, execution_id: str) -> dict[str, Any]:
    """Fetch logs for *execution_id* through the ``fetch_logs`` hook."""
    fetched = self.hooks.fetch_logs(self.client, execution_id)
    return self.to_plain(fetched)
1189
+
1190
+ def search_runs(
1191
+ self,
1192
+ *,
1193
+ filter: str | None = None,
1194
+ filter_query: str | None = None,
1195
+ page_token: str | None = None,
1196
+ include_pipeline_names: bool | None = None,
1197
+ include_execution_stats: bool | None = True,
1198
+ ) -> dict[str, Any]:
1199
+ return self.to_plain(
1200
+ self.client.pipeline_runs_list(
1201
+ page_token=page_token,
1202
+ filter=filter,
1203
+ filter_query=filter_query,
1204
+ include_pipeline_names=include_pipeline_names,
1205
+ include_execution_stats=include_execution_stats,
1206
+ )
1207
+ )
1208
+
1209
def search_pipeline_runs(
    self,
    *,
    name: str | None = None,
    created_by: str | None = None,
    annotations: dict[str, str | None] | None = None,
    start_date: str | None = None,
    end_date: str | None = None,
    local_time: bool = False,
    query: dict[str, Any] | None = None,
    limit: int = 10,
    page_token: str | None = None,
) -> dict[str, Any]:
    """Search pipeline runs by delegating to ``PipelineRunSearch.search``."""
    searcher = PipelineRunSearch(client=self.client, logger=self.logger)
    criteria: dict[str, Any] = {
        "name": name,
        "created_by": created_by,
        "annotations": annotations,
        "start_date": start_date,
        "end_date": end_date,
        "local_time": local_time,
        "query": query,
        "limit": limit,
        "page_token": page_token,
    }
    return searcher.search(**criteria)
1233
+
1234
def export_run(
    self,
    run_id: str,
    output: str | Path | None = None,
    *,
    dehydrate: bool = False,
) -> dict[str, Any]:
    """Export the pipeline spec of a run as YAML.

    The spec is taken from ``raw["componentRef"]["spec"]`` of the task spec
    when available, otherwise from ``component_spec.data``.

    Without *output* the YAML is returned inline.  With *output* the YAML is
    written to that path, and a sibling ``<stem>.config.yaml`` is written when
    the run has arguments or ``dehydrate`` is set.

    Raises:
        PipelineRunError: when the run has no spec, the spec is not a
            non-empty dict, or ``dehydrate`` is requested without *output*.
    """
    task_spec = self.client.get_run_pipeline_spec(run_id)
    if task_spec is None:
        raise PipelineRunError(f"No pipeline spec found for run {run_id}")

    raw = getattr(task_spec, "raw", None)
    spec = None
    if isinstance(raw, Mapping):
        # ``componentRef`` may be present but null; treat that like a missing
        # key so the ``component_spec`` fallback below still gets a chance.
        component_ref = raw.get("componentRef")
        if isinstance(component_ref, Mapping):
            spec = component_ref.get("spec")
    component_spec = getattr(task_spec, "component_spec", None)
    if not isinstance(spec, dict) and component_spec is not None:
        spec = getattr(component_spec, "data", None)
    if not isinstance(spec, dict) or not spec:
        raise PipelineRunError(f"Pipeline spec for run {run_id} is not exportable")

    if dehydrate and output is None:
        raise PipelineRunError("--dehydrate requires --output")
    if dehydrate:
        spec = PipelineDehydrator(
            remembered_choices={"": DehydrateChoice.AUTO},
            output_file=output,
            client=self.client,
            logger=self.logger,
        ).dehydrate(spec)

    content = dump_yaml(spec)
    if output is None:
        return {"run_id": run_id, "pipeline": spec, "yaml": content, "dehydrated": dehydrate}

    output_path = Path(output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(content, encoding="utf-8")

    result = {"run_id": run_id, "output": str(output_path), "dehydrated": dehydrate}
    arguments = self.to_plain(getattr(task_spec, "arguments", None) or {})
    if not arguments and isinstance(raw, Mapping):
        arguments = self.to_plain(raw.get("arguments") or {})
    if isinstance(arguments, Mapping) and (arguments or dehydrate):
        config_path = output_path.parent / f"{output_path.stem}.config.yaml"
        config_data: dict[str, Any] = {"pipeline_path": output_path.name}
        if dehydrate:
            # A dehydrated spec needs hydration when it is run again.
            config_data["hydrate"] = True
        if arguments:
            config_data["args"] = dict(arguments)
        config_path.write_text(dump_yaml(config_data), encoding="utf-8")
        result["config_path"] = str(config_path)
    return result
1284
+
1285
def _update_execution_state_timings(
    self,
    context: PipelineRunContext,
    graph_state: Mapping[str, Any] | Any,
) -> dict[str, dict[str, Any]]:
    """Track how long each execution has stayed in its observed state.

    Timing records are kept in ``context.metadata``: one entry with wall-clock
    data per execution, and a parallel entry with the monotonic time at which
    the current state was entered.  When an execution's state is unchanged
    since the previous call its entry times are carried over; otherwise they
    restart at "now".  Returns a deep copy of the wall-clock records.
    """

    per_execution_counts = self.execution_status_counts_from_graph_state(graph_state)
    if not per_execution_counts:
        context.metadata[_EXECUTION_STATE_TIMINGS_METADATA_KEY] = {}
        context.metadata[_EXECUTION_STATE_TIMING_MONOTONIC_METADATA_KEY] = {}
        return {}

    def mapping_or_empty(value: Any) -> Mapping[str, Any]:
        return value if isinstance(value, Mapping) else {}

    def float_or(value: Any, fallback: float) -> float:
        try:
            return float(value)
        except (TypeError, ValueError):
            return fallback

    known_timings = mapping_or_empty(context.metadata.get(_EXECUTION_STATE_TIMINGS_METADATA_KEY))
    known_monotonic = mapping_or_empty(
        context.metadata.get(_EXECUTION_STATE_TIMING_MONOTONIC_METADATA_KEY)
    )
    now_wall = time.time()
    now_monotonic = time.monotonic()
    timings: dict[str, dict[str, Any]] = {}
    entered_monotonic: dict[str, float] = {}

    for execution_id, status_counts in per_execution_counts.items():
        state = self.status_from_counts(status_counts) or "UNKNOWN"
        previous = mapping_or_empty(known_timings.get(execution_id))
        if previous.get("state") == state:
            # Same state as last time: keep the original entry timestamps.
            entered_wall = float_or(previous.get("state_entered_at", now_wall), now_wall)
            entered_mono = float_or(
                known_monotonic.get(execution_id, now_monotonic), now_monotonic
            )
        else:
            entered_wall, entered_mono = now_wall, now_monotonic

        timings[execution_id] = {
            "state": state,
            "state_entered_at": entered_wall,
            "elapsed_seconds": max(0.0, now_monotonic - entered_mono),
            "last_observed_at": now_wall,
        }
        entered_monotonic[execution_id] = entered_mono

    context.metadata[_EXECUTION_STATE_TIMINGS_METADATA_KEY] = timings
    context.metadata[_EXECUTION_STATE_TIMING_MONOTONIC_METADATA_KEY] = entered_monotonic
    return copy.deepcopy(timings)
1336
+
1337
def _poll_run_status(
    self,
    run_id: str,
    *,
    use_graph_state: bool,
    started_at: float,
    context: PipelineRunContext | None = None,
) -> PipelineWaitPoll:
    """Take one status snapshot of a run and summarise it as a poll result.

    The run record comes from the ``poll_run_snapshot`` hook when it returns
    something, otherwise from ``get_run``.  With ``use_graph_state`` the graph
    state of the execution chosen by the ``graph_state_execution_id`` hook is
    fetched; non-empty graph counts replace the run-level counts, and the
    per-execution state timings are refreshed.

    ``started_at`` is a ``time.monotonic()`` reading used for
    ``elapsed_seconds``.
    """
    ctx = context or PipelineRunContext(run_id=run_id, start_time=time.time())

    snapshot = self.hooks.poll_run_snapshot(self, run_id, ctx)
    if snapshot is not None:
        run = self.to_plain(snapshot)
    else:
        run = self.get_run(run_id, include_execution_stats=True)
    if not isinstance(run, dict):
        run = {}

    graph_state: dict[str, Any] | None = None
    timings: dict[str, dict[str, Any]] = {}
    counts = self.status_counts_from_run(run)
    if use_graph_state:
        root_execution_id = self.hooks.graph_state_execution_id(run, ctx)
        if root_execution_id:
            graph_state = self.graph_state(str(root_execution_id))
            graph_counts = self.status_counts_from_graph_state(graph_state)
            if graph_counts:
                counts = graph_counts
            timings = self._update_execution_state_timings(ctx, graph_state)

    status = self.status_from_counts(counts) or self.status_from_run(run) or "UNKNOWN"
    terminal = self.is_terminal_status(status) or status == "ENDED"
    total = sum(counts.values())
    if total and use_graph_state:
        # With graph state, the run is terminal only once every counted
        # execution is in a terminal status.
        finished = sum(counts.get(state, 0) for state in _TERMINAL_STATUSES)
        terminal = finished == total

    return PipelineWaitPoll(
        run_id=run_id,
        run=run,
        status=status,
        status_counts=counts,
        total=total,
        terminal=terminal,
        graph_state=graph_state if isinstance(graph_state, dict) else None,
        elapsed_seconds=time.monotonic() - started_at,
        execution_state_timings=timings,
    )
1380
+
1381
def wait_for_completion(
    self,
    run_id: str,
    *,
    max_wait: float | None,
    poll_interval: float,
    use_graph_state: bool = False,
    context: PipelineRunContext | None = None,
    allow_zero_poll_interval: bool = False,
    timeout_clock: str = "monotonic",
    exit_on_first_failure: bool = False,
) -> dict[str, Any]:
    """Poll a run until it is terminal, exits early, or times out.

    Each iteration polls once, notifies ``after_poll``, then checks in order:
    terminal state, the ``should_exit_early`` hook, and the deadline.  The
    deadline is only enforced when ``max_wait`` is given and the
    ``should_enforce_max_wait`` hook agrees.  Poll errors go to
    ``on_poll_error``: ``None`` re-raises, otherwise the returned interval
    (capped by the remaining time) is slept before retrying.

    Raises:
        PipelineRunError: on invalid arguments, or when the deadline passes
            while polling keeps failing.
    """
    ctx = context or PipelineRunContext(run_id=run_id, start_time=time.time())
    if exit_on_first_failure:
        ctx.metadata["exit_on_first_failure"] = True

    if max_wait is not None and max_wait < 0:
        raise PipelineRunError("--max-wait must be non-negative")
    zero_interval_rejected = poll_interval == 0 and not allow_zero_poll_interval
    if poll_interval < 0 or zero_interval_rejected:
        raise PipelineRunError("--poll-interval must be positive")
    if timeout_clock not in {"monotonic", "wall"}:
        raise PipelineRunError("timeout_clock must be 'monotonic' or 'wall'")

    enforce_max_wait = max_wait is not None and self.hooks.should_enforce_max_wait(ctx)
    poll_started_at = time.monotonic()
    clock = time.time if timeout_clock == "wall" else time.monotonic
    clock_started_at = clock()
    deadline = clock_started_at + max_wait if enforce_max_wait else None
    self.hooks.before_wait(ctx)

    def finish(
        poll: Any,
        hook_name: str,
        *,
        timed_out: bool = False,
        early_exit: bool = False,
    ) -> dict[str, Any]:
        # Shared exit sequence: record metadata, notify the outcome-specific
        # hook, build the result, record the outcome, notify after-wait.
        ctx.metadata["wait_result"] = self._wait_metadata(
            poll, timed_out=timed_out, early_exit=early_exit
        )
        getattr(self.hooks, hook_name)(poll, ctx)
        result = self._wait_result(poll, timed_out=timed_out, early_exit=early_exit)
        self._record_wait_outcome(ctx, poll, result)
        self.hooks.after_wait_context(result, ctx)
        return result

    while True:
        try:
            poll = self._poll_run_status(
                run_id,
                use_graph_state=use_graph_state,
                started_at=poll_started_at,
                context=ctx,
            )
        except KeyboardInterrupt:
            raise
        except Exception as exc:
            if deadline is not None and clock() >= deadline:
                raise PipelineRunError(f"Timed out waiting for run {run_id}") from exc
            retry_interval = self.hooks.on_poll_error(exc, ctx)
            if retry_interval is None:
                raise
            if deadline is not None:
                remaining = deadline - clock()
                if remaining <= 0:
                    raise PipelineRunError(f"Timed out waiting for run {run_id}") from exc
                retry_interval = min(retry_interval, remaining)
            time.sleep(max(0.0, retry_interval))
            continue

        self.hooks.after_poll(poll, ctx)
        if poll.terminal:
            return finish(poll, "on_terminal")
        if self.hooks.should_exit_early(poll, ctx):
            return finish(poll, "on_early_exit_before_release", early_exit=True)
        if deadline is not None and clock() >= deadline:
            return finish(poll, "on_timeout", timed_out=True)

        if deadline is None:
            pause = poll_interval
        else:
            # Never sleep past the deadline.
            pause = min(poll_interval, max(0.0, deadline - clock()))
        time.sleep(pause)
1462
+
1463
+ @staticmethod
1464
+ def _wait_metadata(
1465
+ poll: PipelineWaitPoll,
1466
+ *,
1467
+ timed_out: bool = False,
1468
+ early_exit: bool = False,
1469
+ ) -> dict[str, Any]:
1470
+ failed_count = int(poll.status_counts.get("FAILED", 0) or 0)
1471
+ error_count = int(poll.status_counts.get("SYSTEM_ERROR", 0) or 0)
1472
+ metadata: dict[str, Any] = {
1473
+ "status_counts": dict(poll.status_counts),
1474
+ "failed_count": failed_count,
1475
+ "error_count": error_count,
1476
+ "elapsed_seconds": poll.elapsed_seconds,
1477
+ }
1478
+ if timed_out:
1479
+ metadata["timed_out"] = True
1480
+ if early_exit:
1481
+ metadata["early_exit"] = True
1482
+ return metadata
1483
+
1484
def _record_wait_outcome(
    self,
    context: PipelineRunContext,
    poll: PipelineWaitPoll,
    result: Mapping[str, Any],
) -> None:
    """Store the ``wait_outcome`` hook's verdict on ``context.wait_outcome``."""
    outcome = self.hooks.wait_outcome(poll, result, context)
    context.wait_outcome = outcome
1491
+
1492
+ @staticmethod
1493
+ def _wait_result(
1494
+ poll: PipelineWaitPoll,
1495
+ *,
1496
+ timed_out: bool,
1497
+ early_exit: bool = False,
1498
+ ) -> dict[str, Any]:
1499
+ result: dict[str, Any] = {
1500
+ "run": poll.run,
1501
+ "status": poll.status,
1502
+ "timed_out": timed_out,
1503
+ }
1504
+ if early_exit or timed_out:
1505
+ result.update(PipelineRunManager._wait_metadata(poll, timed_out=timed_out, early_exit=early_exit))
1506
+ if early_exit:
1507
+ result["early_exit"] = True
1508
+ return result
1509
+
1510
@staticmethod
def _ensure_submission_id_annotation(body: dict[str, Any]) -> str:
    """Make sure *body* carries a submission-id annotation and return it.

    An existing truthy id is kept (stored back as a string); otherwise a new
    ``uuid4`` hex id is generated.  A non-dict ``annotations`` value in the
    body is replaced by a fresh dict.
    """
    annotations = body.setdefault("annotations", {})
    if not isinstance(annotations, dict):
        annotations = body["annotations"] = {}

    existing = annotations.get(_SUBMISSION_ID_ANNOTATION_KEY)
    submission_id = str(existing) if existing else uuid.uuid4().hex
    annotations[_SUBMISSION_ID_ANNOTATION_KEY] = submission_id
    return submission_id
1523
+
1524
+ @staticmethod
1525
+ def _submission_id_from_body(body: Mapping[str, Any]) -> str | None:
1526
+ annotations = body.get("annotations")
1527
+ if not isinstance(annotations, Mapping):
1528
+ return None
1529
+ submission_id = annotations.get(_SUBMISSION_ID_ANNOTATION_KEY)
1530
+ return str(submission_id) if submission_id else None
1531
+
1532
def _submitted_runs_for_submission_id(self, submission_id: str) -> list[dict[str, Any]]:
    """List runs whose submission-id annotation equals *submission_id*.

    Returns an empty list when the response is not a mapping or carries no
    ``pipeline_runs`` list; non-mapping entries in that list are dropped.
    """
    equals_clause = PipelineRunSearch.build_value_equals(
        key=_SUBMISSION_ID_ANNOTATION_KEY,
        value=submission_id,
    )
    list_runs = self._require_client().pipeline_runs_list
    response = list_runs(
        filter_query=json.dumps({"and": [equals_clause]}, separators=(",", ":")),
        include_pipeline_names=True,
    )
    plain = self.to_plain(response)
    runs = plain.get("pipeline_runs") if isinstance(plain, Mapping) else None
    if not isinstance(runs, list):
        return []
    return [dict(run) for run in runs if isinstance(run, Mapping)]
1552
+
1553
def _recover_submitted_run_after_submit_error(
    self,
    *,
    submission_id: str | None,
) -> dict[str, Any] | None:
    """Look for a run that a failed submit may already have created.

    Searches by submission id up to ``_SUBMIT_RECOVERY_LOOKUP_ATTEMPTS`` times,
    sleeping between attempts.

    Returns:
        The single matching run, or ``None`` when there is no submission id,
        the lookup itself fails, or no run is found (the caller is then
        expected to resubmit).

    Raises:
        AmbiguousPipelineRunRecoveryError: when more than one run matches.
    """
    if not submission_id:
        return None
    for lookup_attempt in range(1, _SUBMIT_RECOVERY_LOOKUP_ATTEMPTS + 1):
        self.logger.info(
            "Checking whether failed submit already created a pipeline run "
            f"({_SUBMISSION_ID_ANNOTATION_KEY}={submission_id}, "
            f"lookup_attempt={lookup_attempt}/{_SUBMIT_RECOVERY_LOOKUP_ATTEMPTS})"
        )
        try:
            matches = self._submitted_runs_for_submission_id(submission_id)
        except Exception as exc:
            # Best effort: a failed lookup must not block the resubmit path.
            self.logger.warn(
                "Submit recovery lookup failed "
                f"({_SUBMISSION_ID_ANNOTATION_KEY}={submission_id}): {exc}. "
                "Falling back to resubmitting the same frozen body."
            )
            return None
        self.logger.info(
            "Submit recovery lookup matched "
            f"{len(matches)} run(s) for {_SUBMISSION_ID_ANNOTATION_KEY}={submission_id}"
        )
        if len(matches) == 1:
            run = matches[0]
            run_id = run.get("id")
            root_execution_id = run.get("root_execution_id")
            self.logger.info(
                "Recovered existing pipeline run "
                f"run_id={run_id}, root_execution_id={root_execution_id}, "
                f"{_SUBMISSION_ID_ANNOTATION_KEY}={submission_id}; adopting instead of resubmitting."
            )
            return run
        if len(matches) > 1:
            run_ids = [str(run.get("id")) for run in matches if run.get("id") is not None]
            self.logger.warn(
                "Submit recovery lookup was ambiguous "
                f"({_SUBMISSION_ID_ANNOTATION_KEY}={submission_id}, matched_run_ids={run_ids}). "
                "Refusing to submit a duplicate."
            )
            # Show the plain id list; fall back to repr(matches) only when no
            # match carried an id.  (A ``!r`` conversion on the ``or``
            # expression would also quote the joined ids.)
            matched_description = ", ".join(run_ids) or repr(matches)
            raise AmbiguousPipelineRunRecoveryError(
                "Found multiple pipeline runs for failed submit recovery "
                f"{_SUBMISSION_ID_ANNOTATION_KEY}={submission_id}: {matched_description}. "
                "Refusing to submit a duplicate."
            )
        if lookup_attempt < _SUBMIT_RECOVERY_LOOKUP_ATTEMPTS:
            time.sleep(_SUBMIT_RECOVERY_LOOKUP_DELAY_SECONDS)
    self.logger.warn(
        "No existing pipeline run found after submit failure "
        f"({_SUBMISSION_ID_ANNOTATION_KEY}={submission_id}); "
        "resubmitting the same frozen body with preserved inputs."
    )
    return None
1609
+
1610
def _adopt_submitted_run(
    self,
    *,
    response: Mapping[str, Any],
    body: dict[str, Any],
    pipeline_path: str | Path | None,
    attempt: int,
    context: PipelineRunContext,
) -> dict[str, Any]:
    """Adopt a run recovered after a submit error as this attempt's run.

    Copies the identifiers derived from *response* onto *context*, flags the
    context with ``recovered_after_submit_error``, notifies
    ``after_submit_context`` and returns the response as a plain dict.
    """
    adopted_response = dict(response)
    submitted = self.response_run_context(
        adopted_response,
        submit_body=body,
        pipeline_path=pipeline_path,
        attempt=attempt,
    )
    for attr in ("run_id", "run_name", "root_execution_id", "submit_body", "pipeline_spec"):
        setattr(context, attr, getattr(submitted, attr))
    context.response = adopted_response
    context.metadata["recovered_after_submit_error"] = True
    self.hooks.after_submit_context(context)
    return adopted_response
1635
+
1636
# Shared attempt loop behind run_prepared_body, run_pipeline_spec and run_pipeline.
# For each attempt it builds (or reuses) a submit body, submits it, optionally waits
# for completion, and on failure either re-raises or asks the hooks for a retry.
# NOTE(review): this listing has lost its indentation, so block nesting is not
# visible below; confirm nesting against the packaged module before editing.
+ def _run_body_factory(
1637
+ self,
1638
# Invoked as body_factory(attempt, previous_context, last_error) whenever the
# previous attempt's body is not being reused.
+ body_factory: Callable[[int, PipelineRunContext | None, Exception | None], dict[str, Any]],
1639
+ *,
1640
+ pipeline_path: str | Path | None = None,
1641
# wait, max_wait, poll_interval, use_graph_state, allow_zero_poll_interval,
# timeout_clock and exit_on_first_failure are only used for the
# wait_for_completion call further down.
+ wait: bool = False,
1642
+ max_wait: float | None = 600.0,
1643
+ poll_interval: float = 10.0,
1644
+ use_graph_state: bool = False,
1645
+ max_attempts: int = 1,
1646
+ allow_zero_poll_interval: bool = False,
1647
+ timeout_clock: str = "monotonic",
1648
+ exit_on_first_failure: bool = False,
1649
# A fresh shallow copy of this mapping seeds every attempt's context.metadata.
+ metadata: dict[str, Any] | None = None,
1650
# Optional per-attempt metadata; its result is merged into context.metadata
# after the submission id has been recorded.
+ metadata_factory: Callable[
1651
+ [int, PipelineRunContext | None, Exception | None], dict[str, Any]
1652
+ ] | None = None,
1653
+ ) -> dict[str, Any]:
1654
+ """Drive submit/wait/retry for already prepared specs or submit bodies."""
1655
+
1656
+ if max_attempts < 1:
1657
+ raise PipelineRunError("max_attempts must be at least 1")
1658
+ last_error: Exception | None = None
1659
+ previous_context: PipelineRunContext | None = None
1660
# Every attempt's context is collected here and returned under "attempts".
+ attempts: list[PipelineRunContext] = []
1661
+ for attempt in range(1, max_attempts + 1):
1662
+ context = PipelineRunContext(
1663
+ pipeline_path=pipeline_path,
1664
+ start_time=time.time(),
1665
+ attempt=attempt,
1666
+ previous_context=previous_context,
1667
+ previous_error=last_error,
1668
+ metadata=dict(metadata or {}),
1669
+ )
1670
+ lifecycle_started = False
1671
+ success = False
1672
+ error: Exception | None = None
1673
+ retry_requested = False
1674
# True when the previous attempt built a body but never obtained a run id.
+ reused_after_submit_failure = (
1675
+ previous_context is not None
1676
+ and previous_context.run_id is None
1677
+ and previous_context.submit_body is not None
1678
+ )
1679
+ if reused_after_submit_failure:
1680
+ # The previous attempt failed while submitting, before the API
1681
+ # returned a run id. Retry the exact same submit body instead
1682
+ # of rebuilding it: body construction can intentionally inject
1683
+ # dynamic inputs (for example a scheduler creation timestamp),
1684
+ # and changing those inputs on an ambiguous submit timeout can
1685
+ # defeat cache reuse or double-run the logical pipeline.
1686
+ body = copy.deepcopy(previous_context.submit_body)
1687
+ self.logger.info(
1688
+ "Retrying submit after submit exception with the same frozen body "
1689
+ f"({_SUBMISSION_ID_ANNOTATION_KEY}={self._submission_id_from_body(body)}); "
1690
+ "dynamic inputs are preserved."
1691
+ )
1692
+ else:
1693
+ if previous_context is not None:
1694
+ self.logger.info(
1695
+ "Retrying after pipeline failure; rebuilding submit body so dynamic run arguments "
1696
+ "can follow hook policy (for example update-vs-fixed time input)."
1697
+ )
1698
+ body = body_factory(attempt, previous_context, last_error)
1699
+ self.normalize_submit_body_in_place(body)
1700
# The submission id is recorded in metadata here and later read back from the
# body (via _submission_id_from_body) for the recovery lookups.
+ submission_id = self._ensure_submission_id_annotation(body)
1701
+ context.metadata["submission_id"] = submission_id
1702
+ if metadata_factory is not None:
1703
+ context.metadata.update(metadata_factory(attempt, previous_context, last_error))
1704
+ pipeline_spec = body.get("root_task", {}).get("componentRef", {}).get("spec")
1705
+ context.submit_body = body
1706
# Anything other than a dict under root_task.componentRef.spec is treated as absent.
+ context.pipeline_spec = pipeline_spec if isinstance(pipeline_spec, dict) else None
1707
+ if context.pipeline_spec is not None:
1708
+ spec_name = context.pipeline_spec.get("name")
1709
+ if isinstance(spec_name, str) and spec_name:
1710
+ context.run_name = spec_name
1711
+ self.hooks.before_run_lifecycle(context)
1712
+ lifecycle_started = True
1713
+ attempts.append(context)
1714
+ # ``previous_context`` tracks the previous attempt, not only the
1715
+ # previous successfully submitted run. Resource-carry hooks need to
1716
+ # hand off mutexes/leases even when an attempt fails during submit
1717
+ # before a run id is available.
1718
+ previous_context = context
1719
+ try:
1720
+ with self.hooks.around_run(context):
1721
+ try:
1722
+ recovered_response = None
1723
# With a reused body, first check whether the earlier submit actually created a run.
+ if reused_after_submit_failure:
1724
+ recovered_response = self._recover_submitted_run_after_submit_error(
1725
+ submission_id=self._submission_id_from_body(body),
1726
+ )
1727
+ if recovered_response is not None:
1728
+ response = self._adopt_submitted_run(
1729
+ response=recovered_response,
1730
+ body=body,
1731
+ pipeline_path=pipeline_path,
1732
+ attempt=attempt,
1733
+ context=context,
1734
+ )
1735
+ if attempt > 1:
1736
+ self.hooks.after_retry_submit(context)
1737
+ else:
1738
+ try:
1739
+ response = self.submit_prepared_body(
1740
+ body,
1741
+ pipeline_path=pipeline_path,
1742
+ attempt=attempt,
1743
+ context=context,
1744
# presumably suppresses submit_prepared_body's own on_submit_error call; this
# method invokes that hook itself once the recovery lookup finds nothing.
+ notify_submit_error=False,
1745
+ )
1746
+ except Exception as submit_exc:
1747
# A run id is already recorded on the context, so skip the recovery lookup.
+ if context.run_id is not None:
1748
+ raise
1749
+ submission_id_for_recovery = self._submission_id_from_body(body)
1750
+ self.logger.warn(
1751
+ "Submit failed before a run id was returned "
1752
+ f"({_SUBMISSION_ID_ANNOTATION_KEY}={submission_id_for_recovery}): "
1753
+ f"{submit_exc}. Checking whether the run was actually created."
1754
+ )
1755
+ recovered_response = self._recover_submitted_run_after_submit_error(
1756
+ submission_id=submission_id_for_recovery,
1757
+ )
1758
+ if recovered_response is None:
1759
+ self.hooks.on_submit_error(submit_exc, context=context)
1760
+ raise
1761
+ response = self._adopt_submitted_run(
1762
+ response=recovered_response,
1763
+ body=body,
1764
+ pipeline_path=pipeline_path,
1765
+ attempt=attempt,
1766
+ context=context,
1767
+ )
1768
# NOTE(review): indentation is lost here, so it is unclear whether this check
# belongs to the except branch only or follows the whole try/except; confirm.
+ if attempt > 1:
1769
+ self.hooks.after_retry_submit(context)
1770
+ result: dict[str, Any]
1771
# Waiting requires both the caller's request and a known run id.
+ if wait and context.run_id:
1772
+ wait_result = self.wait_for_completion(
1773
+ context.run_id,
1774
+ max_wait=max_wait,
1775
+ poll_interval=poll_interval,
1776
+ use_graph_state=use_graph_state,
1777
+ context=context,
1778
+ allow_zero_poll_interval=allow_zero_poll_interval,
1779
+ timeout_clock=timeout_clock,
1780
+ exit_on_first_failure=exit_on_first_failure,
1781
+ )
1782
+ result = {"response": response, "wait": wait_result}
1783
+ else:
1784
+ result = {"response": response}
1785
+ result["context"] = context
1786
+ result["attempts"] = attempts
1787
+ success = True
1788
+ return result
1789
+ except Exception as exc:
1790
+ error = exc
1791
+ last_error = exc
1792
# Ambiguous recovery is re-raised immediately, whatever attempts remain.
+ if isinstance(exc, AmbiguousPipelineRunRecoveryError):
1793
+ self.hooks.on_fail_fast_before_release(context, exc)
1794
+ raise
1795
# Cancellation is only considered when a run id exists and another attempt will follow.
+ if (
1796
+ context.run_id
1797
+ and attempt < max_attempts
1798
+ and self.hooks.should_cancel_previous_run(
1799
+ context,
1800
+ exc,
1801
+ next_attempt=attempt + 1,
1802
+ )
1803
+ ):
1804
+ self.cancel_run(context.run_id)
1805
# Out of attempts: notify the hooks and propagate the original exception.
+ if attempt >= max_attempts:
1806
+ self.hooks.on_fail_fast_before_release(context, exc)
1807
+ raise
1808
# NOTE(review): previous_context appears to have been rebound to context earlier
# in this attempt, so this expression looks like it always selects context; confirm.
+ retry_context = context if context.run_id else previous_context or context
1809
+ self.hooks.before_retry(retry_context, exc, next_attempt=attempt + 1)
1810
+ retry_requested = True
1811
+ finally:
1812
# success and error still hold their defaults unless the code above reassigned them.
+ if lifecycle_started:
1813
+ self.hooks.after_run_lifecycle(context, success=success, error=error)
1814
+ if retry_requested:
1815
+ continue
1816
+ if last_error is not None: # pragma: no cover - defensive
1817
+ raise last_error
1818
+ raise PipelineRunError("Pipeline run did not start") # pragma: no cover
1819
+
1820
def run_prepared_body(
    self,
    body: dict[str, Any],
    *,
    pipeline_path: str | Path | None = None,
    wait: bool = False,
    max_wait: float | None = 600.0,
    poll_interval: float = 10.0,
    use_graph_state: bool = False,
    max_attempts: int = 1,
    retry_body_factory: Callable[
        [int, PipelineRunContext | None, Exception | None], dict[str, Any]
    ] | None = None,
    allow_zero_poll_interval: bool = False,
    timeout_clock: str = "monotonic",
    exit_on_first_failure: bool = False,
    metadata: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run the submit/wait/retry lifecycle for a ready-made submit body.

    Attempt 1 always works on a deep copy of ``body``. For later attempts,
    ``retry_body_factory`` (when supplied) produces the body instead, which
    lets downstreams refresh retry bodies while hydration, layout and
    validation stay outside the generic lifecycle. All remaining options are
    forwarded to ``_run_body_factory`` and its result is returned as is.
    """

    def _next_body(
        attempt: int,
        previous_context: PipelineRunContext | None,
        error: Exception | None,
    ) -> dict[str, Any]:
        refresh = retry_body_factory is not None and attempt > 1
        if not refresh:
            # Never hand out the caller's own dict; each attempt gets a copy.
            return copy.deepcopy(body)
        return retry_body_factory(attempt, previous_context, error)

    lifecycle_options: dict[str, Any] = {
        "pipeline_path": pipeline_path,
        "wait": wait,
        "max_wait": max_wait,
        "poll_interval": poll_interval,
        "use_graph_state": use_graph_state,
        "max_attempts": max_attempts,
        "allow_zero_poll_interval": allow_zero_poll_interval,
        "timeout_clock": timeout_clock,
        "exit_on_first_failure": exit_on_first_failure,
        "metadata": metadata,
    }
    return self._run_body_factory(_next_body, **lifecycle_options)
1866
+
1867
def run_pipeline_spec(
    self,
    pipeline_spec: dict[str, Any],
    *,
    run_args: dict[str, Any] | None = None,
    annotations: dict[str, str] | None = None,
    pipeline_path: str | Path | None = None,
    run_as: str | None = None,
    hydrate: bool = True,
    wait: bool = False,
    max_wait: float | None = 600.0,
    poll_interval: float = 10.0,
    use_graph_state: bool = False,
    max_attempts: int = 1,
    allow_zero_poll_interval: bool = False,
    timeout_clock: str = "monotonic",
    exit_on_first_failure: bool = False,
    metadata: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Run the submit/wait/retry lifecycle for an in-memory pipeline spec.

    Each time the lifecycle driver asks for a body, a deep copy of
    ``pipeline_spec`` is passed to ``prepare_submit_payload_from_spec`` and
    the resulting payload's ``to_body()`` is used. The lifecycle options are
    forwarded to ``_run_body_factory`` and its result is returned as is.
    """

    def _body_from_spec(
        _attempt: int,
        _previous_context: PipelineRunContext | None,
        _error: Exception | None,
    ) -> dict[str, Any]:
        # Copy first so payload preparation cannot mutate the caller's spec.
        spec_copy = copy.deepcopy(pipeline_spec)
        payload = self.prepare_submit_payload_from_spec(
            spec_copy,
            run_args=run_args,
            annotations=annotations,
            pipeline_path=pipeline_path,
            run_as=run_as,
            hydrate=hydrate,
        )
        return payload.to_body()

    lifecycle_options: dict[str, Any] = {
        "pipeline_path": pipeline_path,
        "wait": wait,
        "max_wait": max_wait,
        "poll_interval": poll_interval,
        "use_graph_state": use_graph_state,
        "max_attempts": max_attempts,
        "allow_zero_poll_interval": allow_zero_poll_interval,
        "timeout_clock": timeout_clock,
        "exit_on_first_failure": exit_on_first_failure,
        "metadata": metadata,
    }
    return self._run_body_factory(_body_from_spec, **lifecycle_options)
1915
+
1916
def run_pipeline(
    self,
    pipeline_path: str | Path,
    *,
    run_args: dict[str, Any] | None = None,
    annotations: dict[str, str] | None = None,
    hydrate: bool = True,
    run_as: str | None = None,
    resolution_overrides: dict[str, Any] | None = None,
    wait: bool = False,
    max_wait: float | None = 600.0,
    poll_interval: float = 10.0,
    use_graph_state: bool = False,
    max_attempts: int = 1,
    allow_zero_poll_interval: bool = False,
    timeout_clock: str = "monotonic",
    exit_on_first_failure: bool = False,
    metadata: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Submit a pipeline by path, optionally waiting, with lifecycle hooks.

    The body factory goes back through ``prepare_submit_payload`` for the
    path on every invocation, so a rebuilt retry body re-runs the
    read/hydrate/resolution hooks for that attempt. The lifecycle options are
    forwarded to ``_run_body_factory`` and its result is returned as is.
    """

    def _body_from_path(
        _attempt: int,
        _previous_context: PipelineRunContext | None,
        _error: Exception | None,
    ) -> dict[str, Any]:
        payload = self.prepare_submit_payload(
            pipeline_path,
            run_args=run_args,
            annotations=annotations,
            hydrate=hydrate,
            run_as=run_as,
            resolution_overrides=resolution_overrides,
        )
        return payload.to_body()

    lifecycle_options: dict[str, Any] = {
        "pipeline_path": pipeline_path,
        "wait": wait,
        "max_wait": max_wait,
        "poll_interval": poll_interval,
        "use_graph_state": use_graph_state,
        "max_attempts": max_attempts,
        "allow_zero_poll_interval": allow_zero_poll_interval,
        "timeout_clock": timeout_clock,
        "exit_on_first_failure": exit_on_first_failure,
        "metadata": metadata,
    }
    return self._run_body_factory(_body_from_path, **lifecycle_options)
1969
+
1970
+
1971
def parse_key_value_entries(entries: list[str] | None) -> dict[str, str]:
    """Turn ``KEY=VALUE`` strings into a dict, splitting on the first ``=``.

    ``None`` or an empty list yields an empty dict. When a key repeats, the
    last entry wins. Raises ``PipelineRunError`` for an entry that has no
    ``=`` or whose key is empty.
    """
    pairs: dict[str, str] = {}
    if not entries:
        return pairs
    for raw in entries:
        name, separator, remainder = raw.partition("=")
        # A missing "=" and an empty key are rejected with the same message.
        if not separator or not name:
            raise PipelineRunError("Expected KEY=VALUE")
        pairs[name] = remainder
    return pairs
1981
+
1982
+
1983
def parse_json_or_key_values(
    text: str | Mapping[str, Any] | None,
    entries: list[str] | None = None,
) -> dict[str, Any]:
    """Merge a JSON object (or mapping) with ``KEY=VALUE`` entries.

    A truthy ``text`` is copied when it is already a mapping and decoded with
    ``json.loads`` otherwise; the decoded value must be a dict or
    ``PipelineRunError`` is raised. ``entries`` are parsed by
    ``parse_key_value_entries`` and applied last, so they override keys that
    came from ``text``.
    """
    merged: dict[str, Any] = {}
    if text:
        if isinstance(text, Mapping):
            decoded = dict(text)
        else:
            decoded = json.loads(text)
        if not isinstance(decoded, dict):
            raise PipelineRunError("JSON value must be an object")
        merged.update(decoded)
    overrides = parse_key_value_entries(entries)
    merged.update(overrides)
    return merged