alpha-engine-lib 0.32.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. alpha_engine_lib/__init__.py +3 -0
  2. alpha_engine_lib/agent_schemas.py +663 -0
  3. alpha_engine_lib/alerts.py +576 -0
  4. alpha_engine_lib/arcticdb.py +340 -0
  5. alpha_engine_lib/collector_results.py +69 -0
  6. alpha_engine_lib/cost.py +665 -0
  7. alpha_engine_lib/dates.py +273 -0
  8. alpha_engine_lib/decision_capture.py +462 -0
  9. alpha_engine_lib/ec2_spot.py +363 -0
  10. alpha_engine_lib/email_sender.py +206 -0
  11. alpha_engine_lib/eval_artifacts.py +361 -0
  12. alpha_engine_lib/logging.py +303 -0
  13. alpha_engine_lib/model_pricing.yaml +73 -0
  14. alpha_engine_lib/pillars.py +756 -0
  15. alpha_engine_lib/pipeline_status/__init__.py +70 -0
  16. alpha_engine_lib/pipeline_status/read.py +541 -0
  17. alpha_engine_lib/pipeline_status/registry.py +368 -0
  18. alpha_engine_lib/pipeline_status/templates.py +120 -0
  19. alpha_engine_lib/preflight.py +444 -0
  20. alpha_engine_lib/rag/__init__.py +39 -0
  21. alpha_engine_lib/rag/db.py +96 -0
  22. alpha_engine_lib/rag/embeddings.py +63 -0
  23. alpha_engine_lib/rag/migrations/0001_content_tsv.sql +39 -0
  24. alpha_engine_lib/rag/rerank.py +377 -0
  25. alpha_engine_lib/rag/retrieval.py +465 -0
  26. alpha_engine_lib/rag/schema.sql +65 -0
  27. alpha_engine_lib/reconcile.py +203 -0
  28. alpha_engine_lib/secrets.py +186 -0
  29. alpha_engine_lib/sources/__init__.py +35 -0
  30. alpha_engine_lib/sources/protocols.py +227 -0
  31. alpha_engine_lib/ssm_log_capture.py +274 -0
  32. alpha_engine_lib/telegram.py +165 -0
  33. alpha_engine_lib/trading_calendar.py +236 -0
  34. alpha_engine_lib/transparency.py +746 -0
  35. alpha_engine_lib/transparency_inventory.yaml +260 -0
  36. alpha_engine_lib/universe.py +83 -0
  37. alpha_engine_lib-0.32.0.dist-info/METADATA +217 -0
  38. alpha_engine_lib-0.32.0.dist-info/RECORD +40 -0
  39. alpha_engine_lib-0.32.0.dist-info/WHEEL +5 -0
  40. alpha_engine_lib-0.32.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,70 @@
1
+ """Pipeline-status projection of the three Alpha Engine Step Functions.
2
+
3
+ Substrate for the pipeline-reporting-revamp arc (ROADMAP L3050, plan doc
4
+ ``alpha-engine-docs/private/pipeline-reporting-revamp-260524.md``). Projects
5
+ ``states:DescribeExecution`` + ``states:GetExecutionHistory`` onto a typed
6
+ :class:`PipelineRun` so the dashboard page 25 (and any future Slack/CLI
7
+ subscriber) renders SF state without rebuilding the projection logic per
8
+ consumer.
9
+
10
+ **Public surface:**
11
+
12
+ - :func:`read_pipeline_state` — projection entry point. Returns a
13
+ :class:`PipelineRun` for the most-recent execution of the given SF ARN.
14
+ - :class:`PipelineRun` / :class:`TaskRow` / :class:`RunStatus` — typed shape.
15
+ - :data:`STATE_TO_ARCHIVE_PAGE` — registry mapping every substantive Task
16
+ state to either an :class:`ArchivePageRef` deep-link OR a non-generic
17
+ :class:`ArtifactReason` string (per ``feedback_no_silent_fails`` — no
18
+ generic "no artifact" placeholders).
19
+ - :func:`format_success_message` / :func:`format_failure_message` — verbatim
20
+ Python parity for the ``States.Format`` templates baked into the SF JSON.
21
+ Lets future non-SF consumers render byte-identical message bodies without
22
+ duplicating the template.
23
+
24
+ **Why this lives in lib (not in alpha-engine-dashboard):** second adoption
25
+ is anticipated — the same projection is the natural backing for a Slack
26
+ subscriber + a CLI ``ae pipeline status`` command. Per the SOTA / institutional
27
+ sub-sub-rule in ``~/Development/CLAUDE.md`` item 9, the lift goes upstream
28
+ on first build, not after the second consumer arrives.
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ from .read import (
34
+ PipelineRun,
35
+ RunStatus,
36
+ SFNAccessDenied,
37
+ SFNNoExecutions,
38
+ SFNThrottled,
39
+ TaskRow,
40
+ TaskStatus,
41
+ read_pipeline_state,
42
+ )
43
+ from .registry import (
44
+ PIPELINE_LABELS,
45
+ STATE_TO_ARCHIVE_PAGE,
46
+ SUBSTANTIVE_RESOURCES,
47
+ WAIT_GROUPING,
48
+ ArchivePageRef,
49
+ ArtifactReason,
50
+ )
51
+ from .templates import format_failure_message, format_success_message
52
+
53
+ __all__ = [
54
+ "ArchivePageRef",
55
+ "ArtifactReason",
56
+ "PIPELINE_LABELS",
57
+ "PipelineRun",
58
+ "RunStatus",
59
+ "SFNAccessDenied",
60
+ "SFNNoExecutions",
61
+ "SFNThrottled",
62
+ "STATE_TO_ARCHIVE_PAGE",
63
+ "SUBSTANTIVE_RESOURCES",
64
+ "TaskRow",
65
+ "TaskStatus",
66
+ "WAIT_GROUPING",
67
+ "format_failure_message",
68
+ "format_success_message",
69
+ "read_pipeline_state",
70
+ ]
@@ -0,0 +1,541 @@
1
+ """SF-state projection — ``read_pipeline_state`` and shapes.
2
+
3
+ Reads the most-recent execution of a Step Function and projects it onto
4
+ the typed :class:`PipelineRun` shape the dashboard page 25 (and any future
5
+ Slack/CLI subscriber) consumes.
6
+
7
+ Three API calls per invocation:
8
+
9
+ 1. ``states:ListExecutions(maxResults=1)`` — find the latest execution arn.
10
+ 2. ``states:DescribeExecution(executionArn=...)`` — top-level status +
11
+ start/stop timestamps + failure cause when applicable.
12
+ 3. ``states:GetExecutionHistory(executionArn=..., maxResults=1000)`` —
13
+ per-state entry/exit events. Substantive-state filter + Wait-grouping
14
+ applied in :func:`_materialize_tasks`.
15
+
16
+ **Exception contract** — every documented error path raises a typed
17
+ subclass of :class:`PipelineStatusError` so the dashboard page can switch
18
+ on the cause and render the appropriate banner state:
19
+
20
+ - :class:`SFNAccessDenied` — IAM missing one of states:Describe / Get / List.
21
+ Page renders a red banner naming the missing action.
22
+ - :class:`SFNThrottled` — SF API rate-limited. Page falls back to the
23
+ ``pipeline_status_cache.json`` last-good cache with a yellow banner.
24
+ - :class:`SFNNoExecutions` — SF has never been executed. Page renders an
25
+ empty section with a "no executions yet" note (NOT an error).
26
+
27
+ **Never silently degrades** per ``feedback_no_silent_fails`` — unknown
28
+ boto3 errors are re-raised as :class:`PipelineStatusError` so the page's
29
+ red banner always names a specific cause.
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import logging
35
+ from dataclasses import dataclass
36
+ from datetime import datetime, timezone
37
+ from enum import Enum
38
+ from typing import TYPE_CHECKING, Any, Optional
39
+
40
+ from pydantic import BaseModel, ConfigDict, Field
41
+
42
+ from .registry import (
43
+ PIPELINE_LABELS,
44
+ SUBSTANTIVE_RESOURCES,
45
+ WAIT_GROUPING,
46
+ ArchivePageRef,
47
+ ArtifactReason,
48
+ lookup_registry,
49
+ )
50
+
51
+ if TYPE_CHECKING: # pragma: no cover — type-only import
52
+ from mypy_boto3_stepfunctions.client import SFNClient
53
+
54
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Failure-cause truncation — kept verbatim with sf-telegram-notifier
# (alpha-engine-data/infrastructure/lambdas/sf-telegram-notifier/index.py
# line 69) so the email + Telegram + page channels render byte-identical
# cause snippets.
_CAUSE_MAX_CHARS = 280

# Bounds the ``getExecutionHistory`` page size. The Saturday SF emits
# ~600-800 history events on a clean run; the page-25 SLA is one round-trip
# per poll, so we want enough headroom not to paginate but not so much we
# load 100k events for a one-state-failed execution. 1000 is what
# states:GetExecutionHistory's MaxResults caps at anyway.
#
# NOTE(review): ``read_pipeline_state`` issues a single GetExecutionHistory
# call with ``reverseOrder=False`` and never follows ``nextToken``, so for an
# execution with more events than this only the OLDEST events are projected
# — the most recent states (including a terminal failure) would be missing.
_HISTORY_PAGE_SIZE = 1000
68
+
69
+
70
+ # ── Status enums ──────────────────────────────────────────────────────────
71
+
72
+
73
class RunStatus(str, Enum):
    """Status of a Step Functions execution.

    The first five members mirror the AWS Step Functions API ``status``
    field verbatim (note ``RUNNING`` is not terminal). ``NOT-RUN`` is a
    lib-internal sentinel meaning "the SF has never been executed".

    NOTE(review): ``read_pipeline_state`` in this module raises
    ``SFNNoExecutions`` for the never-executed case and does not construct
    ``NOT_RUN`` itself; presumably callers synthesize it — confirm.

    NOTE(review): the AWS API also documents a ``PENDING_REDRIVE`` status.
    It has no member here, so ``RunStatus("PENDING_REDRIVE")`` raises
    ``ValueError``, which ``read_pipeline_state`` converts into
    ``PipelineStatusError``.

    ``str`` mixin lets ``RunStatus.SUCCEEDED == "SUCCEEDED"`` compare True,
    which the dashboard's existing component patterns rely on.
    """

    RUNNING = "RUNNING"
    SUCCEEDED = "SUCCEEDED"
    FAILED = "FAILED"
    TIMED_OUT = "TIMED_OUT"
    ABORTED = "ABORTED"
    # Hyphenated value (not "NOT_RUN"): compare against the member, or the
    # literal "NOT-RUN", never the member name.
    NOT_RUN = "NOT-RUN"
91
+
92
+
93
class TaskStatus(str, Enum):
    """Per-state status as projected from the execution history.

    Adds ``SKIPPED`` (state was reached but a Choice branched past it
    without entering) + ``NOT_RUN`` (state exists in the SF JSON but was
    never reached this execution — e.g. ``ReinvokePredictor`` on a clean
    run) to the AWS status vocabulary. The Choice/Pass control flow is
    rolled up upstream (filter in :func:`_materialize_tasks`) so these two
    extras only ever apply to genuinely-substantive states.

    NOTE(review): ``SKIPPED`` and ``NOT_RUN`` are not assigned by
    :func:`_materialize_tasks` in this module — it only emits rows for
    states that appear in the history — so any such rows presumably come
    from a consumer that knows the full state list. Confirm.
    """

    RUNNING = "RUNNING"
    SUCCEEDED = "SUCCEEDED"
    FAILED = "FAILED"
    TIMED_OUT = "TIMED_OUT"
    ABORTED = "ABORTED"
    SKIPPED = "SKIPPED"
    # Hyphenated value, matching ``RunStatus.NOT_RUN``.
    NOT_RUN = "NOT-RUN"
111
+
112
+
113
+ # ── Exception types ───────────────────────────────────────────────────────
114
+
115
+
116
class PipelineStatusError(Exception):
    """Base class for all read-side errors.

    Raised directly (rather than via a subclass) for unrecognized boto3
    error codes and for an unknown execution ``status`` string.
    """
118
+
119
+
120
class SFNAccessDenied(PipelineStatusError):
    """IAM missing one of states:DescribeExecution / GetExecutionHistory /
    ListExecutions. Caller (page 25) renders a red banner naming the action.

    Raised by ``_raise_for_boto_error`` when the boto3 error code is
    ``AccessDeniedException`` or ``AccessDenied``; the message names the
    denied ``states:`` action.
    """
124
+
125
+
126
class SFNThrottled(PipelineStatusError):
    """SF API rate-limited. Caller falls back to the last-good cache.

    Raised by ``_raise_for_boto_error`` for the error codes
    ``ThrottlingException``, ``Throttling`` and ``TooManyRequestsException``.
    """
128
+
129
+
130
class SFNNoExecutions(PipelineStatusError):
    """SF has never been executed. Caller renders 'no executions yet'.

    Distinct from a generic empty response — this is the ``executions: []``
    branch from ``ListExecutions`` and means the SF exists but has zero
    history. Most often surfaced in dev/test environments; production
    SFs all have history.

    NOTE(review): ``_raise_for_boto_error`` also raises this type for the
    boto3 error codes ``StateMachineDoesNotExist`` and
    ``ExecutionDoesNotExist``, i.e. cases where the SF (or execution) does
    NOT exist. Callers that render "no executions yet" for this exception
    will show that note for a wrong/missing ARN as well — confirm intended.
    """
138
+
139
+
140
+ # ── Output shapes (Pydantic v2) ───────────────────────────────────────────
141
+
142
+
143
# ``model_config = ConfigDict(extra="forbid")`` because every field is
# strictly defined; an unknown key indicates a schema drift that should
# fail loud (per feedback_no_silent_fails) rather than silently widen.
# Shared by every output model in this module so the policy stays uniform.
_STRICT_CONFIG: ConfigDict = ConfigDict(extra="forbid", arbitrary_types_allowed=False)
147
+
148
+
149
class TaskRow(BaseModel):
    """One row on the page-25 per-pipeline state table.

    Built by ``_materialize_tasks`` — one instance per registry-known state
    that appears in the execution history (Wait companions are folded into
    their parent state's row).
    """

    model_config = _STRICT_CONFIG

    # Display name of the state (the parent name when a Wait companion was
    # absorbed).
    state_name: str
    status: TaskStatus
    # Timestamp of the first "entered" event seen for this state (UTC).
    start_utc: Optional[datetime] = None
    # Timestamp of the latest "exited" event seen for this state (UTC);
    # None while the state has not exited.
    end_utc: Optional[datetime] = None
    # ``end_utc - start_utc`` in seconds; None unless both are known.
    duration_sec: Optional[float] = None
    # Either an ArchivePageRef (deep-link) OR an ArtifactReason (explicit
    # substrate-only reason). ``None`` here means "state name not in the
    # registry" and is a CI-time bug — the consumer should treat it as a
    # registry-drift signal, not a renderable placeholder.
    # (``_materialize_tasks`` skips states with no registry entry, so rows
    # it produces always carry a non-None value here.)
    archive: Optional[Any] = None  # ArchivePageRef | ArtifactReason | None
    failure_cause: Optional[str] = None  # populated only when status == FAILED
165
+
166
+
167
class PipelineRun(BaseModel):
    """Top-level shape returned by :func:`read_pipeline_state`.

    Combines the execution-level fields taken from ``DescribeExecution``
    with the per-state rows derived from ``GetExecutionHistory``.
    """

    model_config = _STRICT_CONFIG

    state_machine_arn: str
    pretty_label: str  # "Saturday SF" / "Weekday SF" / "EOD SF" — from registry
    execution_arn: Optional[str] = None  # None iff status == NOT_RUN
    execution_name: Optional[str] = None  # human-readable execution id
    status: RunStatus
    # Execution start / stop (UTC); ``end_utc`` is None when the
    # DescribeExecution response carries no ``stopDate``.
    start_utc: Optional[datetime] = None
    end_utc: Optional[datetime] = None
    # ``end_utc - start_utc`` in seconds; None unless both are known.
    duration_sec: Optional[float] = None
    # One row per registry-known state seen in the history, in first-seen order.
    tasks: list[TaskRow] = Field(default_factory=list)
    failing_state: Optional[str] = None  # populated only when status == FAILED
    failure_cause: Optional[str] = None  # populated only when status == FAILED
183
+
184
+
185
+ # ── Helpers ───────────────────────────────────────────────────────────────
186
+
187
+
188
def _label_for_arn(state_machine_arn: str) -> str:
    """Return the display label for a state machine ARN.

    The state machine name is whatever follows the last ``:`` of the ARN.
    It is looked up in ``PIPELINE_LABELS``; an unregistered name is returned
    as-is, and an empty name falls back to ``"Unknown SF"``. Mirrors
    sf-telegram-notifier's ``_label_for_arn`` semantics.
    """
    if not state_machine_arn:
        # Empty/None ARN: there is no name to show, so only the registry (keyed
        # on "") or the generic placeholder can apply.
        return PIPELINE_LABELS.get("", "Unknown SF")

    machine_name = state_machine_arn.rpartition(":")[2]
    fallback = machine_name if machine_name else "Unknown SF"
    return PIPELINE_LABELS.get(machine_name, fallback)
192
+
193
+
194
def _failure_cause_from(describe_resp: dict) -> str:
    """Build the truncated failure snippet from an ``error`` / ``cause`` pair.

    Reads the ``error`` and ``cause`` keys of ``describe_resp`` (missing or
    ``None`` values count as empty), strips surrounding whitespace, and
    joins the non-empty parts as ``"<error>: <cause>"``. A result longer
    than ``_CAUSE_MAX_CHARS`` is cut to ``_CAUSE_MAX_CHARS - 1`` characters
    with ``…`` appended, so the returned string never exceeds the limit.

    Kept in lock-step with sf-telegram-notifier's ``_failure_cause_from``
    so the email body, Telegram message and page cell show the same text.
    """
    parts = [(describe_resp.get(key) or "").strip() for key in ("error", "cause")]
    # Joining only the non-empty parts yields "error: cause", a lone error,
    # a lone cause, or "" — the same four outcomes as an explicit branch.
    message = ": ".join(part for part in parts if part)

    if len(message) <= _CAUSE_MAX_CHARS:
        return message
    return message[: _CAUSE_MAX_CHARS - 1] + "…"
211
+
212
+
213
+ def _parse_ts(value: Any) -> Optional[datetime]:
214
+ """Normalize boto3's datetime values to UTC.
215
+
216
+ boto3 returns ``datetime`` objects with offset-aware ``tzinfo`` (usually
217
+ ``tzutc()``); we coerce to ``timezone.utc`` for round-trip consistency.
218
+ Returns None for falsy input.
219
+ """
220
+ if value is None:
221
+ return None
222
+ if isinstance(value, datetime):
223
+ if value.tzinfo is None:
224
+ return value.replace(tzinfo=timezone.utc)
225
+ return value.astimezone(timezone.utc)
226
+ return None
227
+
228
+
229
+ def _is_substantive_event(event: dict) -> bool:
230
+ """Filter for events that name a substantive state.
231
+
232
+ Used to scan the history for ``TaskStateEntered`` / ``TaskStateExited``
233
+ events whose state corresponds to a Task with a substantive Resource.
234
+ The history's event-type taxonomy is one filter axis; the registry's
235
+ ``SUBSTANTIVE_RESOURCES`` is the second (applied via the state
236
+ definition lookup, which the SF JSON walk handles upstream).
237
+
238
+ The history events don't carry the Resource ARN directly — we rely on
239
+ the registry membership check at materialization time instead.
240
+ """
241
+ return event.get("type", "").startswith(("TaskStateEntered", "TaskStateExited"))
242
+
243
+
244
def _absorb_wait_companion(state_name: str) -> str:
    """Map a Wait-companion state name to the state it is displayed under.

    Looks ``state_name`` up in ``WAIT_GROUPING``; a hit returns the mapped
    parent name, a miss returns ``state_name`` unchanged.

    Per §3.2 of the plan doc, a companion such as ``WaitForDataPhase1`` is
    rolled up into ``DataPhase1`` so operators see one row covering the
    whole phase rather than a near-instant task plus a long wait.
    """
    display_name = WAIT_GROUPING.get(state_name, state_name)
    return display_name
254
+
255
+
256
def _materialize_tasks(history_events: list[dict]) -> list[TaskRow]:
    """Walk the execution history and produce one TaskRow per registry state.

    ``history_events`` must be in chronological order (oldest first).

    Algorithm:

    1. For each ``TaskStateEntered`` / ``TaskStateExited`` event, resolve the
       state name to its display parent (Wait companions are absorbed via
       ``_absorb_wait_companion``) and record timestamps: the FIRST entered
       timestamp is the start, the LATEST exited timestamp is the end.
    2. Track which states are currently open (entered, not yet exited).
       Task outcome events are attributed to the most recently opened state:

       - ``TaskFailed``   → status ``FAILED`` + truncated cause snippet
       - ``TaskTimedOut`` → status ``TIMED_OUT`` + truncated cause snippet
       - ``TaskSucceeded`` → status back to ``SUCCEEDED`` and cause cleared,
         so a task that failed, was retried and then succeeded is not left
         marked as failed.

       Outcome events do not name their state, so with concurrently running
       branches the attribution is a best effort.
    3. Emit a row only for states known to ``lookup_registry``; everything
       else is treated as control flow and dropped.
    4. A state that was entered, never exited, and has no failure recorded
       is reported as ``RUNNING``.

    Returns the rows in the order the states were first seen.
    """
    # Task-level failure event type → (details key on the event, row status).
    failure_events = {
        "TaskFailed": ("taskFailedEventDetails", TaskStatus.FAILED),
        "TaskTimedOut": ("taskTimedOutEventDetails", TaskStatus.TIMED_OUT),
    }
    # State boundary event type → details key carrying the state name.
    boundary_events = {
        "TaskStateEntered": "stateEnteredEventDetails",
        "TaskStateExited": "stateExitedEventDetails",
    }

    # state_name → {"start": dt | None, "end": dt | None,
    #               "status": TaskStatus, "cause": str | None}
    by_state: dict[str, dict[str, Any]] = {}
    # Display names of states entered but not yet exited, oldest first.
    # Tracking this explicitly (instead of scanning for ``end is None``)
    # keeps attribution correct when a state is re-entered or when the
    # failure happens inside a Wait companion whose parent already exited.
    open_states: list[str] = []

    for event in history_events:
        etype = event.get("type", "")

        if etype in failure_events:
            if open_states:
                details_key, failed_status = failure_events[etype]
                rec = by_state[open_states[-1]]
                rec["status"] = failed_status
                # Both details payloads expose ``error`` / ``cause``, so the
                # shared helper gives the same strip + truncation policy as
                # the execution-level snippet and tolerates None values.
                rec["cause"] = _failure_cause_from(event.get(details_key) or {})
            continue

        if etype == "TaskSucceeded":
            if open_states:
                rec = by_state[open_states[-1]]
                rec["status"] = TaskStatus.SUCCEEDED
                rec["cause"] = None
            continue

        details_key = boundary_events.get(etype)
        if details_key is None:
            continue

        state_name = (event.get(details_key) or {}).get("name")
        if not state_name:
            continue

        # Roll Wait companions into their parent.
        parent_name = _absorb_wait_companion(state_name)

        rec = by_state.setdefault(
            parent_name,
            {"start": None, "end": None, "status": TaskStatus.SUCCEEDED, "cause": None},
        )

        ts = _parse_ts(event.get("timestamp"))
        if etype == "TaskStateEntered":
            # Only set start if this is the FIRST entered event for this
            # parent (parent entered first, Wait companion entered after).
            if rec["start"] is None:
                rec["start"] = ts
            open_states.append(parent_name)
        else:
            # The LATEST exited event wins (Wait companion exits last).
            rec["end"] = ts
            # Close the most recent matching open entry, if any.
            for idx in range(len(open_states) - 1, -1, -1):
                if open_states[idx] == parent_name:
                    del open_states[idx]
                    break

    rows: list[TaskRow] = []
    for state_name, rec in by_state.items():
        # Filter: only render states that are in the registry. Anything
        # else is control-flow (Choice / Pass / Succeed) that we don't
        # surface on the page.
        archive_entry = lookup_registry(state_name)
        if archive_entry is None:
            continue

        start = rec["start"]
        end = rec["end"]
        duration: Optional[float] = None
        if start is not None and end is not None:
            duration = (end - start).total_seconds()

        # If the state was entered but never exited (and nothing failed),
        # it is still in flight.
        status: TaskStatus = rec["status"]
        if end is None and start is not None and status == TaskStatus.SUCCEEDED:
            status = TaskStatus.RUNNING

        rows.append(
            TaskRow(
                state_name=state_name,
                status=status,
                start_utc=start,
                end_utc=end,
                duration_sec=duration,
                archive=archive_entry,
                failure_cause=rec["cause"],
            )
        )
    return rows
356
+
357
+
358
+ def _failing_state_from_history(history_events: list[dict]) -> Optional[str]:
359
+ """Identify the state that emitted TaskFailed (or ExecutionFailed) first."""
360
+ for event in history_events:
361
+ etype = event.get("type", "")
362
+ if etype == "TaskFailed":
363
+ # Walk backwards through prior events to find the most-recent
364
+ # TaskStateEntered without a matching TaskStateExited.
365
+ idx = history_events.index(event)
366
+ for prior in reversed(history_events[:idx]):
367
+ if prior.get("type") == "TaskStateEntered":
368
+ name = (prior.get("stateEnteredEventDetails") or {}).get("name")
369
+ return _absorb_wait_companion(name) if name else None
370
+ if etype == "ExecutionFailed":
371
+ cause_details = event.get("executionFailedEventDetails") or {}
372
+ # ExecutionFailed doesn't directly carry state name; walk back.
373
+ idx = history_events.index(event)
374
+ for prior in reversed(history_events[:idx]):
375
+ if prior.get("type") == "TaskStateEntered":
376
+ name = (prior.get("stateEnteredEventDetails") or {}).get("name")
377
+ return _absorb_wait_companion(name) if name else None
378
+ # Fallback: synthesize from the cause if no entered event found.
379
+ return (cause_details.get("error") or None)
380
+ return None
381
+
382
+
383
+ # ── Public entry point ────────────────────────────────────────────────────
384
+
385
+
386
def read_pipeline_state(
    state_machine_arn: str,
    *,
    client: Optional["SFNClient"] = None,
) -> PipelineRun:
    """Project the most-recent execution of ``state_machine_arn`` onto a
    typed :class:`PipelineRun`.

    Calls (in order):

    1. ``states:ListExecutions(stateMachineArn=..., maxResults=1)`` — finds
       the latest execution arn. If the SF has zero executions, raises
       :class:`SFNNoExecutions`.
    2. ``states:DescribeExecution(executionArn=...)`` — top-level status +
       start/stop + failure cause.
    3. ``states:GetExecutionHistory(executionArn=..., maxResults=1000)`` —
       per-state events for the Task row table. Only this single page is
       read; if the response carries a ``nextToken`` (history longer than
       one page) a warning is logged because the newest events, and hence
       the latest task rows, are not part of the projection.

    Parameters
    ----------
    state_machine_arn:
        Full SF ARN, e.g. ``arn:aws:states:us-east-1:711398986525:stateMachine:alpha-engine-saturday-pipeline``.
    client:
        Optional boto3 ``stepfunctions`` client. Tests pass a mock here;
        when omitted, a client is created with ``boto3.client`` on every
        call.

    Returns
    -------
    PipelineRun
        Execution-level fields plus the per-state task rows.
        ``failure_cause`` and ``failing_state`` are populated only when the
        execution status is ``FAILED``. This function never returns a
        ``NOT_RUN`` run — the never-executed case raises instead.

    Raises
    ------
    SFNAccessDenied
        IAM denial on any of the three required actions.
    SFNThrottled
        Rate-limit on any of the three.
    SFNNoExecutions
        SF has zero executions (also used for the "does not exist" error
        codes translated by ``_raise_for_boto_error``).
    PipelineStatusError
        Any other error from the three calls, or an execution status string
        that :class:`RunStatus` does not model.
    """
    if client is None:  # pragma: no cover — production path
        import boto3

        client = boto3.client("stepfunctions")

    label = _label_for_arn(state_machine_arn)

    # 1. ListExecutions
    try:
        list_resp = client.list_executions(
            stateMachineArn=state_machine_arn,
            maxResults=1,
        )
    except Exception as exc:  # noqa: BLE001 — narrow + re-raise
        # Always raises a typed PipelineStatusError; never returns.
        _raise_for_boto_error(exc, "ListExecutions")

    executions = list_resp.get("executions") or []
    if not executions:
        raise SFNNoExecutions(
            f"State machine {state_machine_arn} has no executions yet."
        )

    latest = executions[0]
    execution_arn = latest.get("executionArn")
    execution_name = latest.get("name")

    # 2. DescribeExecution
    try:
        describe_resp = client.describe_execution(executionArn=execution_arn)
    except Exception as exc:  # noqa: BLE001 — narrow + re-raise
        _raise_for_boto_error(exc, "DescribeExecution")

    status_str = describe_resp.get("status", "RUNNING")
    try:
        run_status = RunStatus(status_str)
    except ValueError as exc:
        # Unknown status string from boto3 (forward-compatibility) — fail
        # loud rather than silently mis-render.
        raise PipelineStatusError(
            f"Unknown SF execution status {status_str!r} from boto3 for {execution_arn}"
        ) from exc

    start_utc = _parse_ts(describe_resp.get("startDate"))
    end_utc = _parse_ts(describe_resp.get("stopDate"))
    duration: Optional[float] = None
    if start_utc is not None and end_utc is not None:
        duration = (end_utc - start_utc).total_seconds()

    failure_cause = (
        _failure_cause_from(describe_resp) if run_status == RunStatus.FAILED else None
    )

    # 3. GetExecutionHistory
    try:
        history_resp = client.get_execution_history(
            executionArn=execution_arn,
            maxResults=_HISTORY_PAGE_SIZE,
            reverseOrder=False,
        )
    except Exception as exc:  # noqa: BLE001 — narrow + re-raise
        _raise_for_boto_error(exc, "GetExecutionHistory")

    events = history_resp.get("events") or []
    if history_resp.get("nextToken"):
        # One round-trip per poll is deliberate, but a truncated history
        # must not pass unnoticed: with oldest-first ordering it is the
        # most recent states that are missing from the projection.
        logger.warning(
            "Execution history for %s exceeds one page (%d events read); "
            "task rows reflect only the oldest events.",
            execution_arn,
            len(events),
        )

    tasks = _materialize_tasks(events)
    failing_state = (
        _failing_state_from_history(events) if run_status == RunStatus.FAILED else None
    )

    return PipelineRun(
        state_machine_arn=state_machine_arn,
        pretty_label=label,
        execution_arn=execution_arn,
        execution_name=execution_name,
        status=run_status,
        start_utc=start_utc,
        end_utc=end_utc,
        duration_sec=duration,
        tasks=tasks,
        failing_state=failing_state,
        failure_cause=failure_cause,
    )
511
+
512
+
513
def _raise_for_boto_error(exc: Exception, action: str) -> None:
    """Re-raise ``exc`` as the matching typed :class:`PipelineStatusError`.

    The error code is read from ``exc.response["Error"]["Code"]`` when the
    exception exposes a ``response`` mapping (as boto3's ``ClientError``
    does); any missing level yields an empty code.

    Mapping (first match wins):

    - ``AccessDeniedException`` / ``AccessDenied`` → :class:`SFNAccessDenied`
    - ``ThrottlingException`` / ``Throttling`` / ``TooManyRequestsException``
      → :class:`SFNThrottled`
    - ``StateMachineDoesNotExist`` / ``ExecutionDoesNotExist``
      → :class:`SFNNoExecutions`
    - anything else → :class:`PipelineStatusError`

    ``action`` is the ``states:`` API action name used in the message. This
    function always raises; the original exception is chained as the cause.
    """
    boto_response = getattr(exc, "response", None) or {}
    code = (boto_response.get("Error") or {}).get("Code", "")

    translated: PipelineStatusError
    if code in ("AccessDeniedException", "AccessDenied"):
        translated = SFNAccessDenied(
            f"states:{action} denied — add the action to the dashboard "
            f"EC2 role's inline policy. Boto3 detail: {exc}"
        )
    elif code in ("ThrottlingException", "Throttling", "TooManyRequestsException"):
        translated = SFNThrottled(
            f"states:{action} rate-limited; page falls back to last-good cache."
        )
    elif code in ("StateMachineDoesNotExist", "ExecutionDoesNotExist"):
        translated = SFNNoExecutions(
            f"states:{action} returned {code}: {exc}"
        )
    else:
        # No recognizable code: name the exception type so the message
        # still identifies a specific cause.
        translated = PipelineStatusError(
            f"Unexpected boto3 error on states:{action}: {code or type(exc).__name__}: {exc}"
        )

    raise translated from exc