alpha-engine-lib 0.32.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alpha_engine_lib/__init__.py +3 -0
- alpha_engine_lib/agent_schemas.py +663 -0
- alpha_engine_lib/alerts.py +576 -0
- alpha_engine_lib/arcticdb.py +340 -0
- alpha_engine_lib/collector_results.py +69 -0
- alpha_engine_lib/cost.py +665 -0
- alpha_engine_lib/dates.py +273 -0
- alpha_engine_lib/decision_capture.py +462 -0
- alpha_engine_lib/ec2_spot.py +363 -0
- alpha_engine_lib/email_sender.py +206 -0
- alpha_engine_lib/eval_artifacts.py +361 -0
- alpha_engine_lib/logging.py +303 -0
- alpha_engine_lib/model_pricing.yaml +73 -0
- alpha_engine_lib/pillars.py +756 -0
- alpha_engine_lib/pipeline_status/__init__.py +70 -0
- alpha_engine_lib/pipeline_status/read.py +541 -0
- alpha_engine_lib/pipeline_status/registry.py +368 -0
- alpha_engine_lib/pipeline_status/templates.py +120 -0
- alpha_engine_lib/preflight.py +444 -0
- alpha_engine_lib/rag/__init__.py +39 -0
- alpha_engine_lib/rag/db.py +96 -0
- alpha_engine_lib/rag/embeddings.py +63 -0
- alpha_engine_lib/rag/migrations/0001_content_tsv.sql +39 -0
- alpha_engine_lib/rag/rerank.py +377 -0
- alpha_engine_lib/rag/retrieval.py +465 -0
- alpha_engine_lib/rag/schema.sql +65 -0
- alpha_engine_lib/reconcile.py +203 -0
- alpha_engine_lib/secrets.py +186 -0
- alpha_engine_lib/sources/__init__.py +35 -0
- alpha_engine_lib/sources/protocols.py +227 -0
- alpha_engine_lib/ssm_log_capture.py +274 -0
- alpha_engine_lib/telegram.py +165 -0
- alpha_engine_lib/trading_calendar.py +236 -0
- alpha_engine_lib/transparency.py +746 -0
- alpha_engine_lib/transparency_inventory.yaml +260 -0
- alpha_engine_lib/universe.py +83 -0
- alpha_engine_lib-0.32.0.dist-info/METADATA +217 -0
- alpha_engine_lib-0.32.0.dist-info/RECORD +40 -0
- alpha_engine_lib-0.32.0.dist-info/WHEEL +5 -0
- alpha_engine_lib-0.32.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Pipeline-status projection of the three Alpha Engine Step Functions.
|
|
2
|
+
|
|
3
|
+
Substrate for the pipeline-reporting-revamp arc (ROADMAP L3050, plan doc
|
|
4
|
+
``alpha-engine-docs/private/pipeline-reporting-revamp-260524.md``). Projects
|
|
5
|
+
``states:DescribeExecution`` + ``states:GetExecutionHistory`` onto a typed
|
|
6
|
+
:class:`PipelineRun` so the dashboard page 25 (and any future Slack/CLI
|
|
7
|
+
subscriber) renders SF state without rebuilding the projection logic per
|
|
8
|
+
consumer.
|
|
9
|
+
|
|
10
|
+
**Public surface:**
|
|
11
|
+
|
|
12
|
+
- :func:`read_pipeline_state` — projection entry point. Returns a
|
|
13
|
+
:class:`PipelineRun` for the most-recent execution of the given SF ARN.
|
|
14
|
+
- :class:`PipelineRun` / :class:`TaskRow` / :class:`RunStatus` — typed shape.
|
|
15
|
+
- :data:`STATE_TO_ARCHIVE_PAGE` — registry mapping every substantive Task
|
|
16
|
+
state to either an :class:`ArchivePageRef` deep-link OR a non-generic
|
|
17
|
+
:class:`ArtifactReason` string (per ``feedback_no_silent_fails`` — no
|
|
18
|
+
generic "no artifact" placeholders).
|
|
19
|
+
- :func:`format_success_message` / :func:`format_failure_message` — verbatim
|
|
20
|
+
Python parity for the ``States.Format`` templates baked into the SF JSON.
|
|
21
|
+
Lets future non-SF consumers render byte-identical message bodies without
|
|
22
|
+
duplicating the template.
|
|
23
|
+
|
|
24
|
+
**Why this lives in lib (not in alpha-engine-dashboard):** second adoption
|
|
25
|
+
is anticipated — the same projection is the natural backing for a Slack
|
|
26
|
+
subscriber + a CLI ``ae pipeline status`` command. Per the SOTA / institutional
|
|
27
|
+
sub-sub-rule in ``~/Development/CLAUDE.md`` item 9, the lift goes upstream
|
|
28
|
+
on first build, not after the second consumer arrives.
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
from __future__ import annotations
|
|
32
|
+
|
|
33
|
+
# Re-export the read-side API: the projection entry point, its typed result
# shapes, and the full exception hierarchy it can raise.
# ``PipelineStatusError`` is exported because ``read_pipeline_state`` raises
# the bare base class for unexpected boto3 errors and unknown statuses, so
# callers need it to write a catch-all handler.
from .read import (
    PipelineRun,
    PipelineStatusError,
    RunStatus,
    SFNAccessDenied,
    SFNNoExecutions,
    SFNThrottled,
    TaskRow,
    TaskStatus,
    read_pipeline_state,
)
from .registry import (
    PIPELINE_LABELS,
    STATE_TO_ARCHIVE_PAGE,
    SUBSTANTIVE_RESOURCES,
    WAIT_GROUPING,
    ArchivePageRef,
    ArtifactReason,
)
from .templates import format_failure_message, format_success_message

# Public surface of the package, kept in ASCII sort order.
__all__ = [
    "ArchivePageRef",
    "ArtifactReason",
    "PIPELINE_LABELS",
    "PipelineRun",
    "PipelineStatusError",
    "RunStatus",
    "SFNAccessDenied",
    "SFNNoExecutions",
    "SFNThrottled",
    "STATE_TO_ARCHIVE_PAGE",
    "SUBSTANTIVE_RESOURCES",
    "TaskRow",
    "TaskStatus",
    "WAIT_GROUPING",
    "format_failure_message",
    "format_success_message",
    "read_pipeline_state",
]
|
|
@@ -0,0 +1,541 @@
|
|
|
1
|
+
"""SF-state projection — ``read_pipeline_state`` and shapes.
|
|
2
|
+
|
|
3
|
+
Reads the most-recent execution of a Step Function and projects it onto
|
|
4
|
+
the typed :class:`PipelineRun` shape the dashboard page 25 (and any future
|
|
5
|
+
Slack/CLI subscriber) consumes.
|
|
6
|
+
|
|
7
|
+
Three API calls per invocation:
|
|
8
|
+
|
|
9
|
+
1. ``states:ListExecutions(maxResults=1)`` — find the latest execution arn.
|
|
10
|
+
2. ``states:DescribeExecution(executionArn=...)`` — top-level status +
|
|
11
|
+
start/stop timestamps + failure cause when applicable.
|
|
12
|
+
3. ``states:GetExecutionHistory(executionArn=..., maxResults=1000)`` —
|
|
13
|
+
per-state entry/exit events. Substantive-state filter + Wait-grouping
|
|
14
|
+
applied in :func:`_materialize_tasks`.
|
|
15
|
+
|
|
16
|
+
**Exception contract** — every documented error path raises a typed
|
|
17
|
+
subclass of :class:`PipelineStatusError` so the dashboard page can switch
|
|
18
|
+
on the cause and render the appropriate banner state:
|
|
19
|
+
|
|
20
|
+
- :class:`SFNAccessDenied` — IAM missing one of states:Describe / Get / List.
|
|
21
|
+
Page renders a red banner naming the missing action.
|
|
22
|
+
- :class:`SFNThrottled` — SF API rate-limited. Page falls back to the
|
|
23
|
+
``pipeline_status_cache.json`` last-good cache with a yellow banner.
|
|
24
|
+
- :class:`SFNNoExecutions` — SF has never been executed. Page renders an
|
|
25
|
+
empty section with a "no executions yet" note (NOT an error).
|
|
26
|
+
|
|
27
|
+
**Never silently degrades** per ``feedback_no_silent_fails`` — unknown
|
|
28
|
+
boto3 errors are re-raised as :class:`PipelineStatusError` so the page's
|
|
29
|
+
red banner always names a specific cause.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
import logging
|
|
35
|
+
from dataclasses import dataclass
|
|
36
|
+
from datetime import datetime, timezone
|
|
37
|
+
from enum import Enum
|
|
38
|
+
from typing import TYPE_CHECKING, Any, Optional
|
|
39
|
+
|
|
40
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
41
|
+
|
|
42
|
+
from .registry import (
|
|
43
|
+
PIPELINE_LABELS,
|
|
44
|
+
SUBSTANTIVE_RESOURCES,
|
|
45
|
+
WAIT_GROUPING,
|
|
46
|
+
ArchivePageRef,
|
|
47
|
+
ArtifactReason,
|
|
48
|
+
lookup_registry,
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
if TYPE_CHECKING: # pragma: no cover — type-only import
|
|
52
|
+
from mypy_boto3_stepfunctions.client import SFNClient
|
|
53
|
+
|
|
54
|
+
logger = logging.getLogger(__name__)
|
|
55
|
+
|
|
56
|
+
# Maximum length (in characters, including the trailing ellipsis) of a
# rendered failure-cause snippet. Kept verbatim with sf-telegram-notifier
# (alpha-engine-data/infrastructure/lambdas/sf-telegram-notifier/index.py
# line 69) so the email + Telegram + page channels render byte-identical
# cause snippets.
_CAUSE_MAX_CHARS = 280

# Bounds the ``getExecutionHistory`` page size. The Saturday SF emits
# ~600-800 history events on a clean run; the page-25 SLA is one round-trip
# per poll, so we want enough headroom not to paginate but not so much we
# load 100k events for a one-state-failed execution. 1000 is what
# states:GetExecutionHistory's MaxResults caps at anyway.
# ``read_pipeline_state`` issues a single GetExecutionHistory call with this
# value, so events beyond the first page are not part of the projection.
_HISTORY_PAGE_SIZE = 1000
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# ── Status enums ──────────────────────────────────────────────────────────
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class RunStatus(str, Enum):
    """Execution-level status of a Step Functions run.

    ``RUNNING`` through ``ABORTED`` carry the same string values as the AWS
    Step Functions API ``status`` field; :func:`read_pipeline_state` builds
    the member directly from the ``DescribeExecution`` response.

    ``NOT-RUN`` is a lib-internal sentinel for "the SF has never been
    executed". NOTE(review): the visible :func:`read_pipeline_state` raises
    :class:`SFNNoExecutions` for that case instead of returning a run with
    this status — confirm which consumer (if any) constructs ``NOT_RUN``
    runs.

    ``str`` mixin lets ``RunStatus.SUCCEEDED == "SUCCEEDED"`` compare True,
    which the dashboard's existing component patterns rely on.
    """

    RUNNING = "RUNNING"
    SUCCEEDED = "SUCCEEDED"
    FAILED = "FAILED"
    TIMED_OUT = "TIMED_OUT"
    ABORTED = "ABORTED"
    # Hyphenated value (not an AWS status) — lib-internal sentinel.
    NOT_RUN = "NOT-RUN"
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class TaskStatus(str, Enum):
    """Per-state status as projected from the execution history.

    Adds ``SKIPPED`` (state was reached but a Choice branched past it
    without entering) + ``NOT_RUN`` (state exists in the SF JSON but was
    never reached this execution — e.g. ``ReinvokePredictor`` on a clean
    run) to the AWS status vocabulary. The Choice/Pass control flow is
    rolled up upstream (filter in :func:`_materialize_tasks`) so these two
    extras only ever apply to genuinely-substantive states.

    NOTE(review): the visible :func:`_materialize_tasks` only ever assigns
    ``RUNNING``, ``SUCCEEDED`` and ``FAILED``; the remaining members are
    presumably set by other consumers — verify.
    """

    RUNNING = "RUNNING"
    SUCCEEDED = "SUCCEEDED"
    FAILED = "FAILED"
    TIMED_OUT = "TIMED_OUT"
    ABORTED = "ABORTED"
    SKIPPED = "SKIPPED"
    # Hyphenated value, same spelling as ``RunStatus.NOT_RUN``.
    NOT_RUN = "NOT-RUN"
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# ── Exception types ───────────────────────────────────────────────────────
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class PipelineStatusError(Exception):
    """Base class for all read-side errors.

    Raised directly (not only via subclasses) by :func:`read_pipeline_state`
    for an unrecognised execution status and by
    :func:`_raise_for_boto_error` for error codes it has no typed subclass
    for.
    """
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class SFNAccessDenied(PipelineStatusError):
    """IAM missing one of states:DescribeExecution / GetExecutionHistory /
    ListExecutions. Caller (page 25) renders a red banner naming the action.

    Raised by :func:`_raise_for_boto_error` when the boto3 error code is
    ``AccessDeniedException`` or ``AccessDenied``; the message names the
    denied ``states:`` action.
    """
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class SFNThrottled(PipelineStatusError):
    """SF API rate-limited. Caller falls back to the last-good cache.

    Raised by :func:`_raise_for_boto_error` when the boto3 error code is
    ``ThrottlingException``, ``Throttling`` or ``TooManyRequestsException``.
    """
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class SFNNoExecutions(PipelineStatusError):
    """SF has never been executed. Caller renders 'no executions yet'.

    Distinct from a generic empty response — this is the ``executions: []``
    branch from ``ListExecutions`` and means the SF exists but has zero
    history. Most often surfaced in dev/test environments; production
    SFs all have history.

    Also raised by :func:`_raise_for_boto_error` when the boto3 error code
    is ``StateMachineDoesNotExist`` or ``ExecutionDoesNotExist``, so a
    caller catching this type may be looking at a missing state machine
    rather than an empty one; the exception message carries the code.
    """
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# ── Output shapes (Pydantic v2) ───────────────────────────────────────────
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# Shared pydantic config for every output model in this module.
# ``model_config = ConfigDict(extra="forbid")`` because every field is
# strictly defined; an unknown key indicates a schema drift that should
# fail loud (per feedback_no_silent_fails) rather than silently widen.
_STRICT_CONFIG: ConfigDict = ConfigDict(extra="forbid", arbitrary_types_allowed=False)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class TaskRow(BaseModel):
    """One row on the page-25 per-pipeline state table.

    Built by :func:`_materialize_tasks`, one per registry-known state that
    appears in the execution history (Wait companions are folded into their
    parent state's row). Unknown constructor keys are rejected via
    ``_STRICT_CONFIG``.
    """

    model_config = _STRICT_CONFIG

    # Name of the (parent) state this row represents.
    state_name: str
    status: TaskStatus
    # First entered / latest exited timestamps, normalised to UTC; either
    # may be None when the corresponding event was not seen.
    start_utc: Optional[datetime] = None
    end_utc: Optional[datetime] = None
    # ``end_utc - start_utc`` in seconds; None unless both are known.
    duration_sec: Optional[float] = None
    # Either an ArchivePageRef (deep-link) OR an ArtifactReason (explicit
    # substrate-only reason). ``None`` here means "state name not in the
    # registry" and is a CI-time bug — the consumer should treat it as a
    # registry-drift signal, not a renderable placeholder.
    # Typed as ``Any`` so the model does not validate the registry value.
    archive: Optional[Any] = None  # ArchivePageRef | ArtifactReason | None
    failure_cause: Optional[str] = None  # populated only when status == FAILED
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
class PipelineRun(BaseModel):
    """Top-level shape returned by :func:`read_pipeline_state`.

    Describes the most recent execution of one state machine: the
    execution-level status and timing plus one :class:`TaskRow` per
    registry-known state. Unknown constructor keys are rejected via
    ``_STRICT_CONFIG``.
    """

    model_config = _STRICT_CONFIG

    state_machine_arn: str
    pretty_label: str  # "Saturday SF" / "Weekday SF" / "EOD SF" — from registry
    execution_arn: Optional[str] = None  # None iff status == NOT_RUN
    execution_name: Optional[str] = None  # human-readable execution id
    status: RunStatus
    # Execution start/stop from DescribeExecution, normalised to UTC.
    start_utc: Optional[datetime] = None
    end_utc: Optional[datetime] = None
    # ``end_utc - start_utc`` in seconds; None unless both are known.
    duration_sec: Optional[float] = None
    tasks: list[TaskRow] = Field(default_factory=list)
    failing_state: Optional[str] = None  # populated only when status == FAILED
    failure_cause: Optional[str] = None  # populated only when status == FAILED
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
# ── Helpers ───────────────────────────────────────────────────────────────
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _label_for_arn(state_machine_arn: str) -> str:
    """Map a state machine ARN to its display label.

    The state machine name is taken as the text after the last ``:`` of the
    ARN and looked up in ``PIPELINE_LABELS``. Unregistered names are
    returned as-is; an empty ARN or empty name yields ``"Unknown SF"``
    unless the registry has an entry for the empty name.
    (Mirrors sf-telegram-notifier's ``_label_for_arn`` semantics.)
    """
    if state_machine_arn:
        # rpartition keeps the whole string when the ARN has no colon.
        machine_name = state_machine_arn.rpartition(":")[2]
    else:
        machine_name = ""

    fallback = machine_name if machine_name else "Unknown SF"
    return PIPELINE_LABELS.get(machine_name, fallback)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _failure_cause_from(describe_resp: dict) -> str:
    """Build the display snippet for a failure from ``error`` + ``cause``.

    Reads the ``error`` and ``cause`` keys of ``describe_resp`` (missing or
    falsy values count as empty), strips surrounding whitespace, and joins
    the non-empty parts as ``"<error>: <cause>"``. Snippets longer than
    ``_CAUSE_MAX_CHARS`` are cut to that length with a trailing ``…``.

    Mirrors sf-telegram-notifier's ``_failure_cause_from`` so the email
    body, Telegram message, and page cell render the same snippet.

    Returns the snippet, or ``""`` when neither field is present.
    """
    error_text = (describe_resp.get("error") or "").strip()
    cause_text = (describe_resp.get("cause") or "").strip()

    # Joining only the non-empty parts gives "error: cause", a lone field,
    # or the empty string.
    message = ": ".join(part for part in (error_text, cause_text) if part)

    if len(message) <= _CAUSE_MAX_CHARS:
        return message
    # Reserve one character of the budget for the ellipsis.
    return message[: _CAUSE_MAX_CHARS - 1] + "…"
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _parse_ts(value: Any) -> Optional[datetime]:
    """Coerce a timestamp value to a UTC-aware ``datetime``.

    - Naive datetimes are assumed to already be UTC and get
      ``timezone.utc`` attached without shifting the clock time.
    - Aware datetimes are converted to ``timezone.utc``.
    - ``None`` and any non-``datetime`` value yield ``None``.
    """
    if not isinstance(value, datetime):
        # Covers None as well as strings, numbers, etc.
        return None
    if value.tzinfo is None:
        return value.replace(tzinfo=timezone.utc)
    return value.astimezone(timezone.utc)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _is_substantive_event(event: dict) -> bool:
    """Return True for Task-state entry/exit history events.

    An event qualifies when its ``type`` starts with ``TaskStateEntered``
    or ``TaskStateExited``; a missing ``type`` counts as not matching.
    This checks the event type only — history events don't carry the
    Resource ARN, so whether the state is a substantive one is decided
    by registry membership at materialization time.
    """
    prefixes = ("TaskStateEntered", "TaskStateExited")
    kind = event.get("type", "")
    return any(kind.startswith(prefix) for prefix in prefixes)
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def _absorb_wait_companion(state_name: str) -> str:
    """Resolve a state name to the name of the row it is displayed under.

    Names registered in ``WAIT_GROUPING`` are Wait companions and map to
    their parent state (per §3.2 of the plan doc, ``WaitForDataPhase1`` is
    shown as part of ``DataPhase1`` so operators see one combined duration
    instead of a near-instant parent plus a long wait). Any other name is
    returned unchanged.
    """
    display_name = WAIT_GROUPING.get(state_name, state_name)
    return display_name
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def _materialize_tasks(history_events: list[dict]) -> list[TaskRow]:
    """Walk the execution history and produce one TaskRow per registry state.

    Algorithm:

    1. Walk every event in order. ``TaskStateEntered`` / ``TaskStateExited``
       record the state name + timestamp; ``TaskFailed`` / ``TaskSucceeded``
       update the outcome of the state that is open at that point.
    2. Absorb Wait companions: events of a ``WaitForX`` state are booked
       under ``X`` (see :func:`_absorb_wait_companion`), so the companion's
       *exited* event extends ``X``'s end timestamp and the Wait state never
       becomes its own row.
    3. Filter to states that are in the registry (registry membership IS
       the substantive-state filter at this layer).
    4. Render each surviving state as a TaskRow with status + duration +
       registry entry attached.

    Outcome attribution uses a stack of currently-open (entered, not yet
    exited) states rather than "has no end timestamp", so a failure raised
    while a Wait companion is polling — after the parent itself already
    exited — still lands on the parent's row. A ``TaskSucceeded`` that
    follows a ``TaskFailed`` in the same open state (a successful retry)
    clears the transient failure so the row reflects the final outcome.

    Parameters
    ----------
    history_events:
        ``events`` list of a GetExecutionHistory response, in chronological
        order.

    Returns
    -------
    list[TaskRow]
        Rows in order of each state's first appearance in the history.
    """
    # state_name → {"start": dt | None, "end": dt | None, "status": TaskStatus,
    #               "cause": str | None}
    by_state: dict[str, dict[str, Any]] = {}
    # Parent names of states entered but not yet exited, oldest first.
    open_states: list[str] = []

    def _current_record() -> Optional[dict[str, Any]]:
        # Record of the most recently entered state that is still open.
        return by_state[open_states[-1]] if open_states else None

    for event in history_events:
        etype = event.get("type", "")

        if etype == "TaskFailed":
            # TaskFailed carries cause+error on taskFailedEventDetails. The
            # history is chronological, so the state open at this point is
            # the one that failed.
            failed_rec = _current_record()
            if failed_rec is not None:
                failed_rec["status"] = TaskStatus.FAILED
                # Same formatting/truncation as the execution-level cause,
                # so every channel renders identical snippets.
                failed_rec["cause"] = _failure_cause_from(
                    event.get("taskFailedEventDetails") or {}
                )
            continue

        if etype == "TaskSucceeded":
            # A success after a failure inside the same open state means a
            # retry recovered; report the final outcome, not the attempt.
            retried_rec = _current_record()
            if retried_rec is not None and retried_rec["status"] == TaskStatus.FAILED:
                retried_rec["status"] = TaskStatus.SUCCEEDED
                retried_rec["cause"] = None
            continue

        if etype == "TaskStateEntered":
            details_key = "stateEnteredEventDetails"
        elif etype == "TaskStateExited":
            details_key = "stateExitedEventDetails"
        else:
            continue

        details = event.get(details_key) or {}
        state_name = details.get("name")
        if not state_name:
            continue

        # Roll Wait companions into their parent.
        parent_name = _absorb_wait_companion(state_name)

        rec = by_state.setdefault(
            parent_name,
            {"start": None, "end": None, "status": TaskStatus.SUCCEEDED, "cause": None},
        )

        ts = _parse_ts(event.get("timestamp"))
        if etype == "TaskStateEntered":
            # Only set start if this is the FIRST entered event for this
            # parent (parent entered first, Wait companion entered after).
            if rec["start"] is None:
                rec["start"] = ts
            open_states.append(parent_name)
        else:
            # The LATEST exited event wins (Wait companion exits last).
            rec["end"] = ts
            # Close the most recent matching entry; tolerate an exit whose
            # entry was never seen.
            for pos in range(len(open_states) - 1, -1, -1):
                if open_states[pos] == parent_name:
                    del open_states[pos]
                    break

    rows: list[TaskRow] = []
    for state_name, rec in by_state.items():
        # Filter: only render states that are in the registry. Anything
        # else is control-flow (Choice / Pass / Succeed) that we don't
        # surface on the page.
        archive_entry = lookup_registry(state_name)
        if archive_entry is None:
            continue

        start = rec["start"]
        end = rec["end"]
        duration: Optional[float] = None
        if start is not None and end is not None:
            duration = (end - start).total_seconds()

        # If the state was entered but never exited, status is RUNNING.
        status: TaskStatus = rec["status"]
        if end is None and start is not None and status == TaskStatus.SUCCEEDED:
            status = TaskStatus.RUNNING

        rows.append(
            TaskRow(
                state_name=state_name,
                status=status,
                start_utc=start,
                end_utc=end,
                duration_sec=duration,
                archive=archive_entry,
                failure_cause=rec["cause"],
            )
        )
    return rows
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def _failing_state_from_history(history_events: list[dict]) -> Optional[str]:
    """Name the state behind the first failure event in the history.

    Scans the events in order for the first ``TaskFailed`` or
    ``ExecutionFailed`` and walks backwards from it to the nearest
    preceding ``TaskStateEntered``; that state's name (with Wait companions
    resolved to their parent) is the answer. The walk-back does not check
    whether that state has since exited.

    Positions come from ``enumerate`` rather than ``list.index`` so the
    walk-back starts at the event actually being visited, even when the
    history contains value-equal event dicts.

    NOTE(review): the *first* failure event wins, so on an execution where
    an earlier task failed and was retried or caught, the earlier state is
    the one reported — confirm that is the intended contract.

    Returns
    -------
    Optional[str]
        - the (parent) state name of the preceding ``TaskStateEntered``;
        - ``None`` if that entered event carries no name;
        - for an ``ExecutionFailed`` with no preceding entered event, the
          event's ``error`` string (or ``None`` when absent);
        - ``None`` when the history holds no usable failure event.
    """

    def _last_entered_before(position: int) -> Optional[dict]:
        # Nearest TaskStateEntered event strictly before ``position``.
        for idx in range(position - 1, -1, -1):
            if history_events[idx].get("type") == "TaskStateEntered":
                return history_events[idx]
        return None

    for position, event in enumerate(history_events):
        etype = event.get("type", "")
        if etype not in ("TaskFailed", "ExecutionFailed"):
            continue

        entered = _last_entered_before(position)
        if entered is not None:
            name = (entered.get("stateEnteredEventDetails") or {}).get("name")
            return _absorb_wait_companion(name) if name else None

        if etype == "ExecutionFailed":
            # ExecutionFailed doesn't carry a state name; with no entered
            # event to walk back to, fall back to the error string.
            cause_details = event.get("executionFailedEventDetails") or {}
            return cause_details.get("error") or None
        # A TaskFailed with nothing entered before it: keep scanning.
    return None
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
# ── Public entry point ────────────────────────────────────────────────────
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def read_pipeline_state(
    state_machine_arn: str,
    *,
    client: Optional["SFNClient"] = None,
) -> PipelineRun:
    """Project the most-recent execution of ``state_machine_arn`` onto a
    typed :class:`PipelineRun`.

    Calls (in order):

    1. ``states:ListExecutions(stateMachineArn=..., maxResults=1)`` — finds
       the latest execution arn. If the SF has zero executions, raises
       :class:`SFNNoExecutions`.
    2. ``states:DescribeExecution(executionArn=...)`` — top-level status +
       start/stop + failure cause.
    3. ``states:GetExecutionHistory(executionArn=..., maxResults=1000)`` —
       per-state events for the Task row table. Only this first page is
       read; if the service reports more events (``nextToken`` present) a
       warning is logged, because the task rows and failing state are then
       based on an incomplete history.

    Parameters
    ----------
    state_machine_arn:
        Full SF ARN, e.g. ``arn:aws:states:us-east-1:711398986525:stateMachine:alpha-engine-saturday-pipeline``.
    client:
        Optional boto3 ``stepfunctions`` client. Tests pass a mock here;
        when None, a new client is created via ``boto3.client`` on every
        call.

    Returns
    -------
    PipelineRun
        Status, timing and task rows of the latest execution.
        ``failing_state`` and ``failure_cause`` are filled only when the
        execution status is ``FAILED``. A never-executed SF is reported by
        raising :class:`SFNNoExecutions`, not by a ``NOT_RUN`` result.

    Raises
    ------
    SFNAccessDenied
        IAM denial on any of the three required actions.
    SFNThrottled
        Rate-limit on any of the three.
    SFNNoExecutions
        SF exists but has zero executions ever (also used for the
        ``StateMachineDoesNotExist`` / ``ExecutionDoesNotExist`` codes).
    PipelineStatusError
        Any other unexpected error path, including an execution status
        this library does not know — the caller renders a red banner.
    """
    if client is None:  # pragma: no cover — production path
        import boto3

        client = boto3.client("stepfunctions")

    label = _label_for_arn(state_machine_arn)

    # 1. ListExecutions
    try:
        list_resp = client.list_executions(
            stateMachineArn=state_machine_arn,
            maxResults=1,
        )
    except Exception as exc:  # noqa: BLE001 — narrow + re-raise
        # Always raises a PipelineStatusError (sub)class.
        _raise_for_boto_error(exc, "ListExecutions")

    executions = list_resp.get("executions") or []
    if not executions:
        raise SFNNoExecutions(
            f"State machine {state_machine_arn} has no executions yet."
        )

    latest = executions[0]
    execution_arn = latest.get("executionArn")
    execution_name = latest.get("name")

    # 2. DescribeExecution
    try:
        describe_resp = client.describe_execution(executionArn=execution_arn)
    except Exception as exc:  # noqa: BLE001 — narrow + re-raise
        _raise_for_boto_error(exc, "DescribeExecution")

    status_str = describe_resp.get("status", "RUNNING")
    try:
        run_status = RunStatus(status_str)
    except ValueError as exc:
        # Unknown status string from boto3 (forward-compatibility) — fail
        # loud rather than silently mis-render.
        raise PipelineStatusError(
            f"Unknown SF execution status {status_str!r} from boto3 for {execution_arn}"
        ) from exc

    start_utc = _parse_ts(describe_resp.get("startDate"))
    end_utc = _parse_ts(describe_resp.get("stopDate"))
    duration: Optional[float] = None
    if start_utc is not None and end_utc is not None:
        duration = (end_utc - start_utc).total_seconds()

    failure_cause = (
        _failure_cause_from(describe_resp) if run_status == RunStatus.FAILED else None
    )

    # 3. GetExecutionHistory
    try:
        history_resp = client.get_execution_history(
            executionArn=execution_arn,
            maxResults=_HISTORY_PAGE_SIZE,
            reverseOrder=False,
        )
    except Exception as exc:  # noqa: BLE001 — narrow + re-raise
        _raise_for_boto_error(exc, "GetExecutionHistory")

    # One round-trip per poll is the design budget, so further pages are
    # not fetched — but a truncated history must not go unnoticed: with
    # reverseOrder=False the missing events are the most recent ones.
    if history_resp.get("nextToken"):
        logger.warning(
            "Execution history for %s exceeds %d events; task rows and "
            "failing state are projected from the first page only.",
            execution_arn,
            _HISTORY_PAGE_SIZE,
        )

    events = history_resp.get("events") or []
    tasks = _materialize_tasks(events)
    failing_state = (
        _failing_state_from_history(events) if run_status == RunStatus.FAILED else None
    )

    return PipelineRun(
        state_machine_arn=state_machine_arn,
        pretty_label=label,
        execution_arn=execution_arn,
        execution_name=execution_name,
        status=run_status,
        start_utc=start_utc,
        end_utc=end_utc,
        duration_sec=duration,
        tasks=tasks,
        failing_state=failing_state,
        failure_cause=failure_cause,
    )
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
def _raise_for_boto_error(exc: Exception, action: str) -> None:
    """Translate a boto3 exception into a typed PipelineStatusError.

    Never returns normally — every path raises, chaining ``exc`` as the
    cause.

    Inspects ``exc.response["Error"]["Code"]`` (the ``ClientError`` shape)
    and maps the code to a typed exception. ``response`` is only read when
    it is a dict: some botocore exceptions carry a non-dict ``response``
    attribute, and those must fall through to the generic error below
    instead of failing with an ``AttributeError`` that hides the cause.

    Parameters
    ----------
    exc:
        The exception caught around the boto3 call.
    action:
        Step Functions action name without the ``states:`` prefix (e.g.
        ``"ListExecutions"``); used in the error message.

    Raises
    ------
    SFNAccessDenied
        Code is ``AccessDeniedException`` or ``AccessDenied``.
    SFNThrottled
        Code is ``ThrottlingException``, ``Throttling`` or
        ``TooManyRequestsException``.
    SFNNoExecutions
        Code is ``StateMachineDoesNotExist`` or ``ExecutionDoesNotExist``.
    PipelineStatusError
        Any other code, or an exception without a usable error code; the
        message carries the code (or exception type name) and the boto3
        detail.
    """
    response = getattr(exc, "response", None)
    error_dict = response.get("Error") if isinstance(response, dict) else None
    code = (error_dict.get("Code") or "") if isinstance(error_dict, dict) else ""

    if code in ("AccessDeniedException", "AccessDenied"):
        raise SFNAccessDenied(
            f"states:{action} denied — add the action to the dashboard "
            f"EC2 role's inline policy. Boto3 detail: {exc}"
        ) from exc
    if code in ("ThrottlingException", "Throttling", "TooManyRequestsException"):
        raise SFNThrottled(
            f"states:{action} rate-limited; page falls back to last-good cache."
        ) from exc
    if code in ("StateMachineDoesNotExist", "ExecutionDoesNotExist"):
        # Kept mapped to SFNNoExecutions for caller compatibility even
        # though the resource is missing rather than empty.
        raise SFNNoExecutions(
            f"states:{action} returned {code}: {exc}"
        ) from exc
    raise PipelineStatusError(
        f"Unexpected boto3 error on states:{action}: {code or type(exc).__name__}: {exc}"
    ) from exc
|