async-durable-execution-runner 2.0.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- LICENSE +175 -0
- NOTICE +8 -0
- VERSION.py +5 -0
- async_durable_execution_runner/__about__.py +33 -0
- async_durable_execution_runner/__init__.py +23 -0
- async_durable_execution_runner/checkpoint/__init__.py +1 -0
- async_durable_execution_runner/checkpoint/processor.py +101 -0
- async_durable_execution_runner/checkpoint/processors/__init__.py +1 -0
- async_durable_execution_runner/checkpoint/processors/base.py +199 -0
- async_durable_execution_runner/checkpoint/processors/callback.py +89 -0
- async_durable_execution_runner/checkpoint/processors/context.py +59 -0
- async_durable_execution_runner/checkpoint/processors/execution.py +52 -0
- async_durable_execution_runner/checkpoint/processors/step.py +124 -0
- async_durable_execution_runner/checkpoint/processors/wait.py +95 -0
- async_durable_execution_runner/checkpoint/transformer.py +104 -0
- async_durable_execution_runner/checkpoint/validators/__init__.py +1 -0
- async_durable_execution_runner/checkpoint/validators/checkpoint.py +242 -0
- async_durable_execution_runner/checkpoint/validators/operations/__init__.py +1 -0
- async_durable_execution_runner/checkpoint/validators/operations/callback.py +45 -0
- async_durable_execution_runner/checkpoint/validators/operations/context.py +73 -0
- async_durable_execution_runner/checkpoint/validators/operations/execution.py +47 -0
- async_durable_execution_runner/checkpoint/validators/operations/invoke.py +56 -0
- async_durable_execution_runner/checkpoint/validators/operations/step.py +106 -0
- async_durable_execution_runner/checkpoint/validators/operations/wait.py +54 -0
- async_durable_execution_runner/checkpoint/validators/transitions.py +66 -0
- async_durable_execution_runner/cli.py +498 -0
- async_durable_execution_runner/client.py +50 -0
- async_durable_execution_runner/exceptions.py +288 -0
- async_durable_execution_runner/execution.py +444 -0
- async_durable_execution_runner/executor.py +1234 -0
- async_durable_execution_runner/invoker.py +340 -0
- async_durable_execution_runner/model.py +3296 -0
- async_durable_execution_runner/observer.py +144 -0
- async_durable_execution_runner/py.typed +1 -0
- async_durable_execution_runner/runner.py +1167 -0
- async_durable_execution_runner/scheduler.py +246 -0
- async_durable_execution_runner/stores/__init__.py +1 -0
- async_durable_execution_runner/stores/base.py +147 -0
- async_durable_execution_runner/stores/filesystem.py +79 -0
- async_durable_execution_runner/stores/memory.py +38 -0
- async_durable_execution_runner/stores/sqlite.py +273 -0
- async_durable_execution_runner/token.py +49 -0
- async_durable_execution_runner/web/__init__.py +1 -0
- async_durable_execution_runner/web/errors.py +8 -0
- async_durable_execution_runner/web/handlers.py +813 -0
- async_durable_execution_runner/web/models.py +266 -0
- async_durable_execution_runner/web/routes.py +692 -0
- async_durable_execution_runner/web/serialization.py +235 -0
- async_durable_execution_runner/web/server.py +243 -0
- async_durable_execution_runner-2.0.0a1.dist-info/METADATA +238 -0
- async_durable_execution_runner-2.0.0a1.dist-info/RECORD +55 -0
- async_durable_execution_runner-2.0.0a1.dist-info/WHEEL +4 -0
- async_durable_execution_runner-2.0.0a1.dist-info/entry_points.txt +2 -0
- async_durable_execution_runner-2.0.0a1.dist-info/licenses/LICENSE +175 -0
- async_durable_execution_runner-2.0.0a1.dist-info/licenses/NOTICE +1 -0
|
@@ -0,0 +1,1234 @@
|
|
|
1
|
+
"""Execution life-cycle logic."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import time
|
|
7
|
+
import uuid
|
|
8
|
+
from datetime import UTC, datetime
|
|
9
|
+
from typing import TYPE_CHECKING
|
|
10
|
+
|
|
11
|
+
from async_durable_execution.execution import (
|
|
12
|
+
DurableExecutionInvocationInput,
|
|
13
|
+
DurableExecutionInvocationOutput,
|
|
14
|
+
InvocationStatus,
|
|
15
|
+
)
|
|
16
|
+
from async_durable_execution.lambda_service import (
|
|
17
|
+
CallbackTimeoutType,
|
|
18
|
+
ErrorObject,
|
|
19
|
+
Operation,
|
|
20
|
+
OperationUpdate,
|
|
21
|
+
OperationStatus,
|
|
22
|
+
OperationType,
|
|
23
|
+
CallbackOptions,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
from async_durable_execution_runner.exceptions import (
|
|
27
|
+
ExecutionAlreadyStartedException,
|
|
28
|
+
IllegalStateException,
|
|
29
|
+
InvalidParameterValueException,
|
|
30
|
+
ResourceNotFoundException,
|
|
31
|
+
)
|
|
32
|
+
from async_durable_execution_runner.execution import Execution
|
|
33
|
+
from async_durable_execution_runner.model import (
|
|
34
|
+
CheckpointDurableExecutionResponse,
|
|
35
|
+
CheckpointUpdatedExecutionState,
|
|
36
|
+
EventCreationContext,
|
|
37
|
+
EventType,
|
|
38
|
+
GetDurableExecutionHistoryResponse,
|
|
39
|
+
GetDurableExecutionResponse,
|
|
40
|
+
GetDurableExecutionStateResponse,
|
|
41
|
+
ListDurableExecutionsByFunctionResponse,
|
|
42
|
+
ListDurableExecutionsResponse,
|
|
43
|
+
SendDurableExecutionCallbackFailureResponse,
|
|
44
|
+
SendDurableExecutionCallbackHeartbeatResponse,
|
|
45
|
+
SendDurableExecutionCallbackSuccessResponse,
|
|
46
|
+
StartDurableExecutionInput,
|
|
47
|
+
StartDurableExecutionOutput,
|
|
48
|
+
StopDurableExecutionResponse,
|
|
49
|
+
TERMINAL_STATUSES,
|
|
50
|
+
)
|
|
51
|
+
from async_durable_execution_runner.model import (
|
|
52
|
+
Event as HistoryEvent,
|
|
53
|
+
)
|
|
54
|
+
from async_durable_execution_runner.model import (
|
|
55
|
+
Execution as ExecutionSummary,
|
|
56
|
+
)
|
|
57
|
+
from async_durable_execution_runner.observer import ExecutionObserver
|
|
58
|
+
from async_durable_execution_runner.token import CallbackToken
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
if TYPE_CHECKING:
|
|
62
|
+
from collections.abc import Awaitable, Callable
|
|
63
|
+
from concurrent.futures import Future
|
|
64
|
+
|
|
65
|
+
from async_durable_execution_runner.checkpoint.processor import (
|
|
66
|
+
CheckpointProcessor,
|
|
67
|
+
)
|
|
68
|
+
from async_durable_execution_runner.invoker import Invoker
|
|
69
|
+
from async_durable_execution_runner.scheduler import Event, Scheduler
|
|
70
|
+
from async_durable_execution_runner.stores.base import ExecutionStore
|
|
71
|
+
|
|
72
|
+
logger = logging.getLogger(__name__)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class Executor(ExecutionObserver):
|
|
76
|
+
MAX_CONSECUTIVE_FAILED_ATTEMPTS: int = 5
|
|
77
|
+
RETRY_BACKOFF_SECONDS: int = 5
|
|
78
|
+
|
|
79
|
+
def __init__(
    self,
    store: ExecutionStore,
    scheduler: Scheduler,
    invoker: Invoker,
    checkpoint_processor: CheckpointProcessor,
):
    """Wire the executor to its collaborators and reset its bookkeeping.

    Args:
        store: Execution store; used to save, load, update and query executions.
        scheduler: Scheduler used to create completion events and delayed calls.
        invoker: Invoker kept on the instance for running the function.
        checkpoint_processor: Processor that applies checkpoint updates.
    """
    # Injected collaborators.
    self._checkpoint_processor = checkpoint_processor
    self._invoker = invoker
    self._scheduler = scheduler
    self._store = store

    # Completion events, keyed by durable execution ARN.
    self._completion_events: dict[str, Event] = {}

    # Pending callback timers (presumably keyed by callback id -- confirm
    # against the helpers that populate them).
    self._callback_timeouts: dict[str, Future] = {}
    self._callback_heartbeats: dict[str, Future] = {}

    # Handle of the most recently scheduled execution timeout, if any.
    self._execution_timeout: Future | None = None
|
|
94
|
+
|
|
95
|
+
def start_execution(
    self,
    input: StartDurableExecutionInput,  # noqa: A002
) -> StartDurableExecutionOutput:
    """Create, persist and kick off a new durable execution.

    If the input carries no ``invocation_id`` a random UUID4 is generated.
    The execution is started and saved, a completion event is registered
    under its ARN, an execution timeout is scheduled when
    ``execution_timeout_seconds`` is positive, and the first invocation is
    requested.

    Args:
        input: Start request describing the function and execution settings.

    Returns:
        StartDurableExecutionOutput: Carries the ARN of the new execution.
    """
    if input.invocation_id is None:
        # Rebuild the request with a generated invocation id, copying every
        # other field unchanged.
        start_input = StartDurableExecutionInput(
            account_id=input.account_id,
            function_name=input.function_name,
            function_qualifier=input.function_qualifier,
            execution_name=input.execution_name,
            execution_timeout_seconds=input.execution_timeout_seconds,
            execution_retention_period_days=input.execution_retention_period_days,
            invocation_id=str(uuid.uuid4()),
            trace_fields=input.trace_fields,
            tenant_id=input.tenant_id,
            input=input.input,
            lambda_endpoint=input.lambda_endpoint,
        )
    else:
        start_input = input

    execution = Execution.new(input=start_input)
    execution.start()
    self._store.save(execution)

    arn = execution.durable_execution_arn
    logger.debug("Created execution with ARN: %s", arn)

    completion_event = self._scheduler.create_event()
    self._completion_events[arn] = completion_event

    timeout_seconds = start_input.execution_timeout_seconds
    if timeout_seconds > 0:

        def timeout_handler():
            # Invoked by the scheduler once the execution deadline elapses.
            error = ErrorObject.from_message(
                f"Execution timed out after {timeout_seconds} seconds."
            )
            self.on_timed_out(arn, error)

        self._execution_timeout = self._scheduler.call_later(
            timeout_handler,
            delay=timeout_seconds,
            completion_event=completion_event,
        )

    # Request the initial invocation.
    self._invoke_execution(arn)

    return StartDurableExecutionOutput(execution_arn=arn)
|
|
144
|
+
|
|
145
|
+
def get_execution(self, execution_arn: str) -> Execution:
    """Load an execution from the store by its ARN.

    Args:
        execution_arn: ARN of the execution to load.

    Returns:
        Execution: The stored execution.

    Raises:
        ResourceNotFoundException: If the store raises ``KeyError`` for the ARN.
    """
    try:
        execution = self._store.load(execution_arn)
    except KeyError as e:
        # Translate the store-level miss into the API-level error.
        raise ResourceNotFoundException(f"Execution {execution_arn} not found") from e
    return execution
|
|
162
|
+
|
|
163
|
+
def get_execution_details(self, execution_arn: str) -> GetDurableExecutionResponse:
    """Build the detailed view of an execution for the web API.

    The function ARN is synthesised from a fixed region/account prefix plus
    the function name, and the reported version is always ``"1.0"``.

    Args:
        execution_arn: ARN of the execution to describe.

    Returns:
        GetDurableExecutionResponse: Status, timestamps, input payload and
        the result or error of the execution.

    Raises:
        ResourceNotFoundException: If the execution does not exist.
    """
    execution = self.get_execution(execution_arn)

    # The EXECUTION-type operation carries timestamps and the input payload.
    started_op = execution.get_operation_execution_started()
    status = execution.current_status().value

    # Only a SUCCEEDED outcome exposes a result and only a FAILED outcome
    # exposes an error; any other outcome leaves both as None.
    result = None
    error = None
    outcome = execution.result
    if outcome:
        if outcome.status == InvocationStatus.SUCCEEDED:
            result = outcome.result
        elif outcome.status == InvocationStatus.FAILED:
            error = outcome.error

    details = started_op.execution_details
    input_payload = details.input_payload if details else None

    # A missing start timestamp falls back to "now"; a missing end timestamp
    # is reported as None.
    start_timestamp = started_op.start_timestamp or datetime.now(UTC)
    end_timestamp = started_op.end_timestamp or None

    return GetDurableExecutionResponse(
        durable_execution_arn=execution.durable_execution_arn,
        durable_execution_name=execution.start_input.execution_name,
        function_arn=f"arn:aws:lambda:us-east-1:123456789012:function:{execution.start_input.function_name}",
        status=status,
        start_timestamp=start_timestamp,
        input_payload=input_payload,
        result=result,
        error=error,
        end_timestamp=end_timestamp,
        version="1.0",
    )
|
|
208
|
+
|
|
209
|
+
def list_executions(
    self,
    function_name: str | None = None,
    function_version: str | None = None,  # noqa: ARG002
    execution_name: str | None = None,
    status_filter: str | None = None,
    started_after: str | None = None,
    started_before: str | None = None,
    marker: str | None = None,
    max_items: int | None = None,
    reverse_order: bool = False,  # noqa: FBT001, FBT002
) -> ListDurableExecutionsResponse:
    """List executions, delegating filtering and paging to the store.

    Args:
        function_name: Filter by function name.
        function_version: Accepted but not used by this method.
        execution_name: Filter by execution name.
        status_filter: Filter by status (RUNNING, SUCCEEDED, FAILED).
        started_after: Only executions started after this time.
        started_before: Only executions started before this time.
        marker: Pagination marker; parsed as an integer offset, and a
            non-numeric value is treated as offset 0.
        max_items: Page size; a missing or zero value means 50.
        reverse_order: Return results in reverse chronological order.

    Returns:
        ListDurableExecutionsResponse: Execution summaries plus the next
        marker reported by the store.
    """
    # The marker is an offset encoded as a string.
    try:
        offset: int = int(marker) if marker else 0
    except ValueError:
        offset = 0

    matches, next_marker = self._store.query(
        function_name=function_name,
        execution_name=execution_name,
        status_filter=status_filter,
        started_after=started_after,
        started_before=started_before,
        limit=max_items or 50,
        offset=offset,
        reverse_order=reverse_order,
    )

    # Summarise each stored execution together with its current status.
    summaries: list[ExecutionSummary] = [
        ExecutionSummary.from_execution(found, found.current_status().value)
        for found in matches
    ]

    return ListDurableExecutionsResponse(
        durable_executions=summaries, next_marker=next_marker
    )
|
|
266
|
+
|
|
267
|
+
def list_executions_by_function(
    self,
    function_name: str,
    qualifier: str | None = None,  # noqa: ARG002
    execution_name: str | None = None,
    status_filter: str | None = None,
    started_after: str | None = None,
    started_before: str | None = None,
    marker: str | None = None,
    max_items: int | None = None,
    reverse_order: bool = False,  # noqa: FBT001, FBT002
) -> ListDurableExecutionsByFunctionResponse:
    """List the executions of one function.

    Thin wrapper over ``list_executions`` that re-packages its result.

    Args:
        function_name: Function whose executions are listed.
        qualifier: Accepted but not used by this method.
        execution_name: Filter by execution name.
        status_filter: Filter by status (RUNNING, SUCCEEDED, FAILED).
        started_after: Only executions started after this time.
        started_before: Only executions started before this time.
        marker: Pagination marker.
        max_items: Maximum items to return (default 50).
        reverse_order: Return results in reverse chronological order.

    Returns:
        ListDurableExecutionsByFunctionResponse: Summaries and next marker.
    """
    page = self.list_executions(
        function_name=function_name,
        execution_name=execution_name,
        status_filter=status_filter,
        started_after=started_after,
        started_before=started_before,
        marker=marker,
        max_items=max_items,
        reverse_order=reverse_order,
    )

    return ListDurableExecutionsByFunctionResponse(
        durable_executions=page.durable_executions,
        next_marker=page.next_marker,
    )
|
|
311
|
+
|
|
312
|
+
def stop_execution(
    self, execution_arn: str, error: ErrorObject | None = None
) -> StopDurableExecutionResponse:
    """Stop a running execution.

    Stopping an execution that is already complete is a no-op that reports
    the recorded end timestamp (or the current time when none is recorded).

    Args:
        execution_arn: The execution ARN to stop.
        error: Optional error to record as the stop reason; a default
            "stopped by user request" error is used when omitted.

    Returns:
        StopDurableExecutionResponse: Response containing the stop timestamp.

    Raises:
        ResourceNotFoundException: If execution does not exist.
    """
    execution = self.get_execution(execution_arn)

    if execution.is_complete:
        # Idempotent: return the existing stop timestamp.
        execution_op = execution.get_operation_execution_started()
        stop_timestamp = execution_op.end_timestamp or datetime.now(UTC)
        return StopDurableExecutionResponse(stop_timestamp=stop_timestamp)

    # Use provided error or create a default one.
    stop_error = error or ErrorObject.from_message(
        "Execution stopped by user request"
    )

    # A requested stop is a normal transition, not a failure. This is logged
    # with info() because exception() is only meaningful inside an exception
    # handler; outside one it emits an ERROR record with an empty traceback.
    logger.info("[%s] Stopping execution.", execution_arn)
    execution.complete_stopped(error=stop_error)  # Sets CloseStatus.TERMINATED
    self._store.update(execution)
    self._complete_events(execution_arn=execution_arn)

    return StopDurableExecutionResponse(stop_timestamp=datetime.now(UTC))
|
|
347
|
+
|
|
348
|
+
def get_execution_state(
    self,
    execution_arn: str,
    checkpoint_token: str | None = None,
    marker: str | None = None,
    max_items: int | None = None,
) -> GetDurableExecutionStateResponse:
    """Get one page of an execution's operations.

    Args:
        execution_arn: The execution ARN.
        checkpoint_token: Optional checkpoint token; when given it must be
            one of the execution's used tokens.
        marker: Pagination marker: the index of the first operation to
            return, encoded as a string. Non-numeric or negative markers
            restart from the first operation.
        max_items: Maximum items to return (default 100).

    Returns:
        GetDurableExecutionStateResponse: The page of operations and, when
        more remain, the marker for the next page.

    Raises:
        ResourceNotFoundException: If execution does not exist.
        InvalidParameterValueException: If checkpoint token is invalid.
    """
    execution = self.get_execution(execution_arn)

    # A supplied token must be one this execution has already issued.
    if checkpoint_token and checkpoint_token not in execution.used_tokens:
        msg: str = f"Invalid checkpoint token: {checkpoint_token}"
        raise InvalidParameterValueException(msg)

    operations = execution.get_assertable_operations()

    if max_items is None:
        max_items = 100

    # Simple index-based pagination.
    start_index = 0
    if marker:
        try:
            start_index = int(marker)
        except ValueError:
            start_index = 0
        # A negative index would make the slice below count from the end of
        # the list and return the wrong page, so treat it like any other
        # invalid marker and start from the beginning.
        start_index = max(start_index, 0)

    end_index = start_index + max_items
    paginated_operations = operations[start_index:end_index]

    next_marker = None
    if end_index < len(operations):
        next_marker = str(end_index)

    return GetDurableExecutionStateResponse(
        operations=paginated_operations, next_marker=next_marker
    )
|
|
402
|
+
|
|
403
|
+
def get_execution_history(
    self,
    execution_arn: str,
    include_execution_data: bool = False,  # noqa: FBT001, FBT002
    reverse_order: bool = False,  # noqa: FBT001, FBT002
    marker: str | None = None,
    max_items: int | None = None,
) -> GetDurableExecutionHistoryResponse:
    """Build a page of history events for an execution.

    Events are derived from the recorded invocation completions and from
    each operation (pending / started / terminated), sorted by event
    timestamp, numbered from 1 in that order, and then paginated.

    Args:
        execution_arn: The execution ARN.
        include_execution_data: Forwarded to event creation to control
            whether execution data is included in events.
        reverse_order: Return events newest first.
        marker: Pagination marker (an event id); a non-numeric value
            restarts from the first event of the chosen ordering.
        max_items: Maximum items to return (default 100).

    Returns:
        GetDurableExecutionHistoryResponse: The page of events and, when
        more remain, the marker for the next page.

    Raises:
        ResourceNotFoundException: If execution does not exist.
    """
    execution: Execution = self.get_execution(execution_arn)

    ops: list[Operation] = execution.operations
    # When an operation has several updates, the last one listed wins.
    update_by_op_id: dict[str, OperationUpdate] = {
        update.operation_id: update for update in execution.updates
    }
    arn: str = execution.durable_execution_arn

    def new_context(op: Operation) -> EventCreationContext:
        # Event id 0 is a placeholder; real ids are assigned after sorting.
        return EventCreationContext(
            op,
            0,
            arn,
            execution.start_input,
            execution.result,
            update_by_op_id.get(op.operation_id, None),
            include_execution_data,
        )

    # One InvocationCompleted event per recorded invocation.
    events: list[HistoryEvent] = [
        HistoryEvent.create_invocation_completed(
            event_id=0,  # Placeholder, reassigned below
            event_timestamp=completion.end_timestamp,
            start_timestamp=completion.start_timestamp,
            end_timestamp=completion.end_timestamp,
            request_id=completion.request_id,
        )
        for completion in execution.invocation_completions
    ]

    for op in ops:
        if op.status is OperationStatus.PENDING:
            # Pending operations contribute events only when they are
            # chained invokes that already have a start timestamp.
            is_started_chained_invoke = (
                op.operation_type is OperationType.CHAINED_INVOKE
                and op.start_timestamp is not None
            )
            if not is_started_chained_invoke:
                continue
            events.append(
                HistoryEvent.create_chained_invoke_event_pending(new_context(op))
            )
        if op.start_timestamp is not None:
            events.append(HistoryEvent.create_event_started(new_context(op)))
        if op.end_timestamp is not None and op.status in TERMINAL_STATUSES:
            events.append(HistoryEvent.create_event_terminated(new_context(op)))

    # Chronological order first, then ids 1..N in that order.
    events.sort(key=lambda ev: ev.event_timestamp)
    events = [
        HistoryEvent.from_event_with_id(ev, position)
        for position, ev in enumerate(events, 1)
    ]

    if max_items is None:
        max_items = 100

    if reverse_order:
        events.reverse()

    # Resolve the marker (an event id) to an index in the ordered list.
    start_index: int = 0
    if marker:
        try:
            marker_event_id: int = int(marker)
        except ValueError:
            start_index = 0
        else:
            if reverse_order:
                # Descending ids: resume at the first id below the marker.
                candidates = (
                    i for i, ev in enumerate(events) if ev.event_id < marker_event_id
                )
            else:
                # Ascending ids: resume at the first id at or past the marker.
                candidates = (
                    i for i, ev in enumerate(events) if ev.event_id >= marker_event_id
                )
            # No match means the marker is past the end: return an empty page.
            start_index = next(candidates, len(events))

    end_index: int = start_index + max_items
    page: list[HistoryEvent] = events[start_index:end_index]

    next_marker: str | None = None
    if end_index < len(events):
        if reverse_order:
            # Marker is the id of the last event returned.
            next_marker = str(page[-1].event_id) if page else None
        else:
            # Marker is the id of the first event not returned.
            next_marker = str(events[end_index].event_id)

    return GetDurableExecutionHistoryResponse(events=page, next_marker=next_marker)
|
|
551
|
+
|
|
552
|
+
def checkpoint_execution(
    self,
    execution_arn: str,
    checkpoint_token: str,
    updates: list[OperationUpdate] | None = None,
    client_token: str | None = None,
) -> CheckpointDurableExecutionResponse:
    """Apply a checkpoint to an execution.

    With updates, the work is delegated to the checkpoint processor and its
    token and state are returned. Without updates, a new checkpoint token is
    issued and the execution is written back to the store.

    Args:
        execution_arn: The execution ARN.
        checkpoint_token: Token presented by the caller; must be one of the
            execution's used tokens.
        updates: Operation updates to apply, if any.
        client_token: Client token forwarded to the checkpoint processor.

    Returns:
        CheckpointDurableExecutionResponse: The next checkpoint token and,
        when the processor reports one, the updated execution state.

    Raises:
        ResourceNotFoundException: If execution does not exist.
        InvalidParameterValueException: If checkpoint token is invalid.
    """
    execution = self.get_execution(execution_arn)

    if checkpoint_token not in execution.used_tokens:
        raise InvalidParameterValueException(
            f"Invalid checkpoint token: {checkpoint_token}"
        )

    if not updates:
        # Nothing to apply: issue a new token and save the execution.
        fresh_token = execution.get_new_checkpoint_token()
        self._store.update(execution)
        return CheckpointDurableExecutionResponse(
            checkpoint_token=fresh_token,
            new_execution_state=None,
        )

    output = self._checkpoint_processor.process_checkpoint(
        checkpoint_token=checkpoint_token,
        updates=updates,
        client_token=client_token,
    )

    # Re-wrap the processor's state in the response model, if it reported one.
    processed_state = output.new_execution_state
    updated_state = None
    if processed_state:
        updated_state = CheckpointUpdatedExecutionState(
            operations=processed_state.operations,
            next_marker=processed_state.next_marker,
        )

    return CheckpointDurableExecutionResponse(
        checkpoint_token=output.checkpoint_token,
        new_execution_state=updated_state,
    )
|
|
608
|
+
|
|
609
|
+
def send_callback_success(
    self,
    callback_id: str,
    result: bytes | None = None,
) -> SendDurableExecutionCallbackSuccessResponse:
    """Complete a callback successfully and re-invoke its execution.

    Args:
        callback_id: Callback token string identifying the callback.
        result: Optional result payload for the callback.

    Returns:
        SendDurableExecutionCallbackSuccessResponse: Empty response.

    Raises:
        InvalidParameterValueException: If ``callback_id`` is empty.
        ResourceNotFoundException: If anything fails while processing the
            callback; the original exception is chained as the cause.
    """
    if not callback_id:
        raise InvalidParameterValueException("callback_id is required")

    try:
        token = CallbackToken.from_str(callback_id)
        target_arn = token.execution_arn
        execution = self.get_execution(target_arn)
        execution.complete_callback_success(callback_id, result)
        self._store.update(execution)
        # The callback is resolved: clear its timeouts, then invoke again.
        self._cleanup_callback_timeouts(callback_id)
        self._invoke_execution(target_arn)
        logger.info("Callback success completed for callback_id: %s", callback_id)
    except Exception as e:
        # Every failure is reported to the caller as "not found".
        raise ResourceNotFoundException(
            f"Failed to process callback success: {e}"
        ) from e

    return SendDurableExecutionCallbackSuccessResponse()
|
|
644
|
+
|
|
645
|
+
def send_callback_failure(
    self,
    callback_id: str,
    error: ErrorObject | None = None,
) -> SendDurableExecutionCallbackFailureResponse:
    """Complete a callback with a failure and re-invoke its execution.

    Args:
        callback_id: Callback token string identifying the callback.
        error: Optional failure details; an error built from an empty
            message is used when omitted.

    Returns:
        SendDurableExecutionCallbackFailureResponse: Empty response.

    Raises:
        InvalidParameterValueException: If ``callback_id`` is empty.
        ResourceNotFoundException: If anything fails while processing the
            callback; the original exception is chained as the cause.
    """
    if not callback_id:
        raise InvalidParameterValueException("callback_id is required")

    failure: ErrorObject = error or ErrorObject.from_message("")

    try:
        token: CallbackToken = CallbackToken.from_str(callback_id)
        target_arn = token.execution_arn
        execution: Execution = self.get_execution(target_arn)
        execution.complete_callback_failure(callback_id, failure)
        self._store.update(execution)
        # The callback is resolved: clear its timeouts, then invoke again.
        self._cleanup_callback_timeouts(callback_id)
        self._invoke_execution(target_arn)
        logger.info("Callback failure completed for callback_id: %s", callback_id)
    except Exception as e:
        # Every failure is reported to the caller as "not found".
        raise ResourceNotFoundException(
            f"Failed to process callback failure: {e}"
        ) from e

    return SendDurableExecutionCallbackFailureResponse()
|
|
682
|
+
|
|
683
|
+
def send_callback_heartbeat(
    self, callback_id: str
) -> SendDurableExecutionCallbackHeartbeatResponse:
    """Register a heartbeat for an active callback and restart its heartbeat timer.

    Args:
        callback_id: Serialized callback token naming the callback.

    Returns:
        SendDurableExecutionCallbackHeartbeatResponse: Empty response.

    Raises:
        InvalidParameterValueException: If callback_id is empty.
        ResourceNotFoundException: If the callback is not in STARTED status,
            or if any other step of the lookup/reset fails (the underlying
            exception is chained as the cause).
    """
    if not callback_id:
        missing_id: str = "callback_id is required"
        raise InvalidParameterValueException(missing_id)

    try:
        token: CallbackToken = CallbackToken.from_str(callback_id)
        owner: Execution = self.get_execution(token.execution_arn)

        # Only a callback that is still STARTED may receive heartbeats.
        _, callback_op = owner.find_callback_operation(callback_id)
        inactive = callback_op.status != OperationStatus.STARTED
        if inactive:
            inactive_msg = f"Callback {callback_id} is not active"
            # NOTE: this is caught by the handler below and re-wrapped.
            raise ResourceNotFoundException(inactive_msg)

        # Restart the heartbeat timer (a no-op when none is configured).
        self._reset_callback_heartbeat_timeout(
            callback_id, owner.durable_execution_arn
        )
        logger.info("Callback heartbeat processed for callback_id: %s", callback_id)
    except Exception as exc:
        wrapped_msg = f"Failed to process callback heartbeat: {exc}"
        raise ResourceNotFoundException(wrapped_msg) from exc

    return SendDurableExecutionCallbackHeartbeatResponse()
|
|
722
|
+
|
|
723
|
+
def _validate_invocation_response_and_store(
    self,
    execution_arn: str,
    response: DurableExecutionInvocationOutput,
    execution: Execution,
):
    """Check an invocation response and close the workflow when it is terminal.

    FAILED and SUCCEEDED responses complete the workflow with the response's
    error or result respectively; a PENDING response is only logged.

    Raises:
        InvalidParameterValueException: If the status is missing, a FAILED
            response carries a result, a SUCCEEDED response carries an error,
            or PENDING is returned with no pending operations.
        IllegalStateException: If the execution is already complete, or the
            status is not one of the handled values.
    """
    if execution.is_complete:
        already_done: str = "Execution already completed, ignoring result"
        raise IllegalStateException(already_done)

    status = response.status
    if status is None:
        status_missing: str = "Response status is required"
        raise InvalidParameterValueException(status_missing)

    if status == InvocationStatus.FAILED:
        if response.result is not None:
            result_on_failure: str = "Cannot provide a Result for FAILED status."
            raise InvalidParameterValueException(result_on_failure)
        logger.info("[%s] Execution failed", execution_arn)
        self._complete_workflow(execution_arn, result=None, error=response.error)
        return

    if status == InvocationStatus.SUCCEEDED:
        if response.error is not None:
            error_on_success: str = "Cannot provide an Error for SUCCEEDED status."
            raise InvalidParameterValueException(error_on_success)
        logger.info("[%s] Execution succeeded", execution_arn)
        self._complete_workflow(execution_arn, result=response.result, error=None)
        return

    if status == InvocationStatus.PENDING:
        if not execution.has_pending_operations(execution):
            nothing_pending: str = (
                "Cannot return PENDING status with no pending operations."
            )
            raise InvalidParameterValueException(nothing_pending)
        logger.info("[%s] Execution pending async work", execution_arn)
        return

    unknown_status: str = f"Unexpected invocation status: {status}"
    raise IllegalStateException(unknown_status)
|
|
781
|
+
|
|
782
|
+
def _invoke_handler(self, execution_arn: str) -> Callable[[], Awaitable[None]]:
    """Build the zero-argument coroutine function the scheduler runs for one execution.

    The returned ``invoke`` closes over ``execution_arn``. When run it loads
    the execution, invokes the function, records the invocation, and then
    validates the response. Validation and invocation errors go to
    ``_retry_invocation``; a ResourceNotFoundException fails the workflow.
    """

    async def invoke() -> None:
        execution: Execution = self._store.load(execution_arn)

        # Nothing to do for an execution that has already reached a terminal state.
        if execution.is_complete:
            logger.info(
                "[%s] Execution already completed, ignoring result", execution_arn
            )
            return

        try:
            payload: DurableExecutionInvocationInput = (
                self._invoker.create_invocation_input(execution=execution)
            )

            self._store.save(execution)

            started_at = datetime.now(UTC)
            outcome = self._invoker.invoke(
                execution.start_input.function_name,
                payload,
                execution.start_input.lambda_endpoint,
            )
            finished_at = datetime.now(UTC)

            # Re-read the execution: a checkpoint may have completed it
            # while the invocation was in flight.
            execution = self._store.load(execution_arn)

            # Persist the invocation record before looking at the outcome.
            execution.record_invocation_completion(
                started_at, finished_at, outcome.request_id
            )
            self._store.save(execution)

            if execution.is_complete:
                logger.info(
                    "[%s] Execution completed during invocation, ignoring result",
                    execution_arn,
                )
                return

            # A response was received: validate its status and act on it.
            output = outcome.invocation_output
            try:
                self._validate_invocation_response_and_store(
                    execution_arn, output, execution
                )
            except (InvalidParameterValueException, IllegalStateException) as invalid:
                logger.warning(
                    "[%s] Lambda output validation failure: %s", execution_arn, invalid
                )
                validation_error = ErrorObject.from_exception(invalid)
                self._retry_invocation(execution, validation_error)

        except ResourceNotFoundException:
            logger.warning(
                "[%s] Function No longer exists: %s",
                execution_arn,
                execution.start_input.function_name,
            )
            missing_function = ErrorObject.from_message(
                message=f"Function not found: {execution.start_input.function_name}"
            )
            self._fail_workflow(execution_arn, missing_function)

        except Exception as failure:  # noqa: BLE001
            # Any other invocation error (network, etc.) goes through retry handling.
            logger.warning("[%s] Invocation failed: %s", execution_arn, failure)
            invocation_error = ErrorObject.from_exception(failure)
            self._retry_invocation(execution, invocation_error)

    return invoke
|
|
857
|
+
|
|
858
|
+
def _invoke_execution(self, execution_arn: str, delay: float = 0) -> None:
    """Hand an invocation of the execution to the scheduler, to run after ``delay`` seconds."""
    # Look up the completion event first; it is None for an unknown execution.
    done_event = self._completion_events.get(execution_arn)
    handler = self._invoke_handler(execution_arn)
    self._scheduler.call_later(handler, delay=delay, completion_event=done_event)
|
|
866
|
+
|
|
867
|
+
def _complete_workflow(
    self, execution_arn: str, result: str | None, error: ErrorObject | None
):
    """Close the workflow: fail it when ``error`` is given, otherwise complete it with ``result``.

    Raises:
        IllegalStateException: If the stored execution is already complete.
    """
    current = self._store.load(execution_arn)

    if current.is_complete:
        duplicate_close: str = "Cannot make multiple close workflow decisions."
        raise IllegalStateException(duplicate_close)

    if error is None:
        self.complete_execution(execution_arn, result)
        return

    self.fail_execution(execution_arn, error)
|
|
882
|
+
|
|
883
|
+
def _fail_workflow(self, execution_arn: str, error: ErrorObject):
    """Fail the workflow with ``error`` unless it has already been closed.

    Raises:
        IllegalStateException: If the stored execution is already complete.
    """
    current = self._store.load(execution_arn)

    if current.is_complete:
        duplicate_close: str = "Cannot make multiple close workflow decisions."
        raise IllegalStateException(duplicate_close)

    self.fail_execution(execution_arn, error)
|
|
893
|
+
|
|
894
|
+
def _retry_invocation(self, execution: Execution, error: ErrorObject):
    """Schedule another invocation attempt, or fail the workflow once attempts are used up.

    The workflow is failed with ``error`` when the execution's consecutive
    failed attempt count already exceeds MAX_CONSECUTIVE_FAILED_ATTEMPTS.
    Otherwise the count is incremented and saved, and a new invocation is
    scheduled after RETRY_BACKOFF_SECONDS.
    """
    arn = execution.durable_execution_arn
    exhausted = (
        execution.consecutive_failed_invocation_attempts
        > self.MAX_CONSECUTIVE_FAILED_ATTEMPTS
    )

    if exhausted:
        # No attempts left: close the workflow as failed.
        self._fail_workflow(execution_arn=arn, error=error)
        return

    # Count this failure, persist it, then try again after the backoff delay.
    execution.consecutive_failed_invocation_attempts += 1
    self._store.save(execution)
    self._invoke_execution(execution_arn=arn, delay=self.RETRY_BACKOFF_SECONDS)
|
|
912
|
+
|
|
913
|
+
def _complete_events(self, execution_arn: str):
    """Signal the execution's completion event (if any) and cancel the execution timeout."""
    # complete doesn't actually checkpoint explicitly
    done_event = self._completion_events.get(execution_arn)
    if done_event:
        done_event.set()

    pending_timeout = self._execution_timeout
    if pending_timeout:
        pending_timeout.cancel()
        self._execution_timeout = None
|
|
920
|
+
|
|
921
|
+
def wait_until_complete(
|
|
922
|
+
self, execution_arn: str, timeout: float | None = None
|
|
923
|
+
) -> bool:
|
|
924
|
+
"""Block until execution completion. Don't do this unless you actually want to block.
|
|
925
|
+
|
|
926
|
+
Args
|
|
927
|
+
timeout (int|float|None): Wait for event to set until this timeout.
|
|
928
|
+
|
|
929
|
+
Returns:
|
|
930
|
+
True when set. False if the event timed out without being set.
|
|
931
|
+
"""
|
|
932
|
+
if event := self._completion_events.get(execution_arn):
|
|
933
|
+
return event.wait(timeout)
|
|
934
|
+
|
|
935
|
+
# this really shouldn't happen - implies execution timed out?
|
|
936
|
+
msg: str = "execution does not exist."
|
|
937
|
+
|
|
938
|
+
raise ResourceNotFoundException(msg)
|
|
939
|
+
|
|
940
|
+
def complete_execution(self, execution_arn: str, result: str | None = None) -> None:
    """Close the execution as successful (COMPLETE_WORKFLOW_EXECUTION decision).

    Raises:
        IllegalStateException: If the execution has no result after being
            marked successful and stored.
    """
    logger.debug("[%s] Completing execution with result: %s", execution_arn, result)
    finished: Execution = self._store.load(execution_arn=execution_arn)
    finished.complete_success(result=result)  # Sets CloseStatus.COMPLETED
    self._store.update(finished)

    if finished.result is None:
        missing_result: str = "Execution result is required"
        raise IllegalStateException(missing_result)

    # Signal completion and cancel the execution timeout.
    self._complete_events(execution_arn=execution_arn)
|
|
950
|
+
|
|
951
|
+
def fail_execution(self, execution_arn: str, error: ErrorObject) -> None:
    """Close the execution as failed with ``error`` (FAIL_WORKFLOW_EXECUTION decision).

    Raises:
        IllegalStateException: If the execution has no result after being
            marked failed and stored.
    """
    logger.error("[%s] Completing execution with error: %s", execution_arn, error)
    failed: Execution = self._store.load(execution_arn=execution_arn)
    failed.complete_fail(error=error)  # Sets CloseStatus.FAILED
    self._store.update(failed)

    # set by complete_fail
    if failed.result is None:
        missing_result: str = "Execution result is required"
        raise IllegalStateException(missing_result)

    # Signal completion and cancel the execution timeout.
    self._complete_events(execution_arn=execution_arn)
|
|
962
|
+
|
|
963
|
+
def _on_wait_succeeded(self, execution_arn: str, operation_id: str) -> None:
    """Mark a wait operation as complete and persist the execution.

    Does nothing for an execution that is already complete. Errors raised
    while completing or storing are logged and swallowed.
    """
    current = self._store.load(execution_arn)

    if current.is_complete:
        logger.info(
            "[%s] Execution already completed, ignoring wait succeeded event",
            execution_arn,
        )
        return

    try:
        current.complete_wait(operation_id=operation_id)
        self._store.update(current)
        logger.debug(
            "[%s] Wait succeeded for operation %s", execution_arn, operation_id
        )
    except Exception:
        # Best effort: record the failure without propagating it to the caller.
        logger.exception("[%s] Error processing wait succeeded.", execution_arn)
|
|
982
|
+
|
|
983
|
+
def _on_retry_ready(self, execution_arn: str, operation_id: str) -> None:
    """Mark a step retry as ready once its delay has elapsed and persist the execution.

    Does nothing for an execution that is already complete. Errors raised
    while completing or storing are logged and swallowed.
    """
    current = self._store.load(execution_arn)

    if current.is_complete:
        logger.info(
            "[%s] Execution already completed, ignoring retry", execution_arn
        )
        return

    try:
        current.complete_retry(operation_id=operation_id)
        self._store.update(current)
        logger.debug(
            "[%s] Retry ready for operation %s", execution_arn, operation_id
        )
    except Exception:
        # Best effort: record the failure without propagating it to the caller.
        logger.exception("[%s] Error processing retry ready.", execution_arn)
|
|
1001
|
+
|
|
1002
|
+
# region ExecutionObserver
|
|
1003
|
+
def on_completed(self, execution_arn: str, result: str | None = None) -> None:
|
|
1004
|
+
"""Complete execution successfully. Observer method triggered by notifier."""
|
|
1005
|
+
self.complete_execution(execution_arn, result)
|
|
1006
|
+
|
|
1007
|
+
def on_failed(self, execution_arn: str, error: ErrorObject) -> None:
    """Observer hook fired by the notifier on failure; delegates to ``fail_execution``."""
    self.fail_execution(execution_arn, error)
|
|
1010
|
+
|
|
1011
|
+
def on_timed_out(self, execution_arn: str, error: ErrorObject) -> None:
    """Handle execution timeout (workflow timeout). Observer method triggered by notifier.

    Marks the stored execution as timed out with ``error``, persists it, then
    signals completion and cancels the execution timeout.

    Args:
        execution_arn: Execution that timed out.
        error: Error recorded on the execution as the timeout cause.
    """
    # logger.error, not logger.exception: no exception is being handled here,
    # so logger.exception would append a meaningless "NoneType: None" traceback.
    logger.error("[%s] Execution timed out.", execution_arn)
    execution: Execution = self._store.load(execution_arn=execution_arn)
    execution.complete_timeout(error=error)  # Sets CloseStatus.TIMED_OUT
    self._store.update(execution)
    self._complete_events(execution_arn=execution_arn)
|
|
1018
|
+
|
|
1019
|
+
def on_stopped(self, execution_arn: str, error: ErrorObject) -> None:
    """Observer hook fired by the notifier on stop; delegates to ``fail_execution``."""
    # Not expected to be called directly - stop_execution handles termination.
    self.fail_execution(execution_arn, error)
|
|
1023
|
+
|
|
1024
|
+
def on_wait_timer_scheduled(
    self, execution_arn: str, operation_id: str, delay: float
) -> None:
    """Schedule a wait operation. Observer method triggered by notifier.

    After ``delay`` the scheduler runs a handler that marks the wait
    operation as succeeded and schedules the execution for immediate
    re-invocation.

    Args:
        execution_arn: Execution that owns the wait operation.
        operation_id: Identifier of the wait operation to complete.
        delay: Delay passed to the scheduler before the handler runs.
    """
    # %s rather than %d: delay is a float and %d would truncate it
    # (e.g. 0.5 would be logged as 0).
    logger.debug("[%s] scheduling wait with delay: %s", execution_arn, delay)

    def wait_handler() -> None:
        self._on_wait_succeeded(execution_arn, operation_id)
        self._invoke_execution(execution_arn, delay=0)

    completion_event = self._completion_events.get(execution_arn)
    self._scheduler.call_later(
        wait_handler, delay=delay, completion_event=completion_event
    )
|
|
1038
|
+
|
|
1039
|
+
def on_step_retry_scheduled(
    self, execution_arn: str, operation_id: str, delay: float
) -> None:
    """Schedule a retry of a step. Observer method triggered by notifier.

    After ``delay`` the scheduler runs a handler that marks the retry as
    ready and schedules the execution for immediate re-invocation.

    Args:
        execution_arn: Execution that owns the step.
        operation_id: Identifier of the step operation being retried.
        delay: Delay passed to the scheduler before the handler runs.
    """
    # %s rather than %d: delay is a float and %d would truncate it
    # (e.g. 0.5 would be logged as 0).
    logger.debug(
        "[%s] scheduling retry for %s with delay: %s",
        execution_arn,
        operation_id,
        delay,
    )

    def retry_handler() -> None:
        self._on_retry_ready(execution_arn, operation_id)
        self._invoke_execution(execution_arn, delay=0)

    completion_event = self._completion_events.get(execution_arn)
    self._scheduler.call_later(
        retry_handler, delay=delay, completion_event=completion_event
    )
|
|
1058
|
+
|
|
1059
|
+
def on_callback_created(
    self,
    execution_arn: str,
    operation_id: str,
    callback_options: CallbackOptions | None,
    callback_token: CallbackToken,
) -> None:
    """Observer hook fired by the notifier when a callback is created.

    Logs the creation and schedules whichever callback timeouts the options
    configure, keyed by the token's string form.
    """
    token_str = callback_token.to_str()
    logger.debug(
        "[%s] Callback created for operation %s with callback_id: %s",
        execution_arn,
        operation_id,
        token_str,
    )

    # Timers are only scheduled when the options configure them.
    self._schedule_callback_timeouts(execution_arn, callback_options, token_str)
|
|
1077
|
+
|
|
1078
|
+
# endregion ExecutionObserver
|
|
1079
|
+
|
|
1080
|
+
# region Callback Timeouts
|
|
1081
|
+
def _schedule_callback_timeouts(
    self,
    execution_arn: str,
    callback_options: CallbackOptions | None,
    callback_id: str,
) -> None:
    """Start the overall and heartbeat timers for a callback, where configured.

    A timer is scheduled only when its ``*_seconds`` option is greater than
    zero. The resulting futures are stored by callback id so they can be
    cancelled later. Any error is logged and swallowed.
    """
    try:
        if not callback_options:
            return

        done_event = self._completion_events.get(execution_arn)

        # Overall callback timeout.
        overall_seconds = callback_options.timeout_seconds
        if overall_seconds > 0:

            def timeout_handler():
                self._on_callback_timeout(execution_arn, callback_id)

            self._callback_timeouts[callback_id] = self._scheduler.call_later(
                timeout_handler,
                delay=callback_options.timeout_seconds,
                completion_event=done_event,
            )

        # Heartbeat timeout.
        heartbeat_seconds = callback_options.heartbeat_timeout_seconds
        if heartbeat_seconds > 0:

            def heartbeat_timeout_handler():
                self._on_callback_heartbeat_timeout(execution_arn, callback_id)

            self._callback_heartbeats[callback_id] = self._scheduler.call_later(
                heartbeat_timeout_handler,
                delay=callback_options.heartbeat_timeout_seconds,
                completion_event=done_event,
            )

    except Exception:
        logger.exception(
            "[%s] Error scheduling callback timeouts for %s",
            execution_arn,
            callback_id,
        )
|
|
1126
|
+
|
|
1127
|
+
def _reset_callback_heartbeat_timeout(
    self, callback_id: str, execution_arn: str
) -> None:
    """Cancel the callback's current heartbeat timer and start a fresh one.

    The heartbeat interval is taken from the first START update recorded for
    the callback's operation that carries callback options. No new timer is
    scheduled when no such update exists or its heartbeat timeout is not
    positive. Errors while rescheduling are logged and swallowed.
    """
    # Drop and cancel whatever heartbeat timer is currently registered.
    stale_timer = self._callback_heartbeats.pop(callback_id, None)
    if stale_timer:
        stale_timer.cancel()

    try:
        token = CallbackToken.from_str(callback_id)
        owner = self.get_execution(token.execution_arn)

        # First matching START update wins; None when nothing matches.
        options = next(
            (
                update.callback_options
                for update in owner.updates
                if update.operation_id == token.operation_id
                and update.callback_options
                and update.action.value == "START"
            ),
            None,
        )

        if options and options.heartbeat_timeout_seconds > 0:

            def heartbeat_timeout_handler():
                self._on_callback_heartbeat_timeout(execution_arn, callback_id)

            done_event = self._completion_events.get(execution_arn)

            self._callback_heartbeats[callback_id] = self._scheduler.call_later(
                heartbeat_timeout_handler,
                delay=options.heartbeat_timeout_seconds,
                completion_event=done_event,
            )

    except Exception:
        logger.exception(
            "[%s] Error resetting callback heartbeat timeout for %s",
            execution_arn,
            callback_id,
        )
|
|
1170
|
+
|
|
1171
|
+
def _cleanup_callback_timeouts(self, callback_id: str) -> None:
    """Forget and cancel both timers (overall and heartbeat) registered for a callback."""
    # Overall timeout first, then the heartbeat timeout.
    overall_timer = self._callback_timeouts.pop(callback_id, None)
    if overall_timer:
        overall_timer.cancel()

    heartbeat_timer = self._callback_heartbeats.pop(callback_id, None)
    if heartbeat_timer:
        heartbeat_timer.cancel()
|
|
1180
|
+
|
|
1181
|
+
def _on_callback_timeout(self, execution_arn: str, callback_id: str) -> None:
    """Time out a callback: record the timeout error and re-invoke the execution.

    Does nothing when the owning execution is already complete. Errors are
    logged and swallowed.
    """
    try:
        token = CallbackToken.from_str(callback_id)
        owner = self.get_execution(token.execution_arn)

        if owner.is_complete:
            return

        # Complete the callback with an error describing the overall timeout.
        error = ErrorObject.from_message(
            f"Callback timed out: {CallbackTimeoutType.TIMEOUT.value}"
        )
        owner.complete_callback_timeout(callback_id, error)
        self._store.update(owner)
        logger.warning("[%s] Callback %s timed out", execution_arn, callback_id)
        self._invoke_execution(token.execution_arn)
    except Exception:
        logger.exception(
            "[%s] Error processing callback timeout for %s",
            execution_arn,
            callback_id,
        )
|
|
1204
|
+
|
|
1205
|
+
def _on_callback_heartbeat_timeout(
    self, execution_arn: str, callback_id: str
) -> None:
    """Time out a callback for a missed heartbeat and re-invoke the execution.

    Does nothing when the owning execution is already complete. Errors are
    logged and swallowed.
    """
    try:
        token = CallbackToken.from_str(callback_id)
        owner = self.get_execution(token.execution_arn)

        if owner.is_complete:
            return

        # Complete the callback with an error describing the heartbeat timeout.
        error = ErrorObject.from_message(
            f"Callback heartbeat timed out: {CallbackTimeoutType.HEARTBEAT.value}"
        )
        owner.complete_callback_timeout(callback_id, error)
        self._store.update(owner)
        logger.warning(
            "[%s] Callback %s heartbeat timed out", execution_arn, callback_id
        )
        self._invoke_execution(token.execution_arn)
    except Exception:
        logger.exception(
            "[%s] Error processing callback heartbeat timeout for %s",
            execution_arn,
            callback_id,
        )
|
|
1233
|
+
|
|
1234
|
+
# endregion Callback Timeouts
|