flowstash-runtime 0.9.0__tar.gz → 0.9.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/PKG-INFO +4 -3
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/pyproject.toml +5 -4
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/managed/http_entrypoint.py +90 -47
- flowstash_runtime-0.9.2/src/flowstash/runtime/worker/backends/managed/lease_client.py +267 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/managed/managed_consumer.py +50 -3
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/managed/task_resolver.py +13 -3
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/__init__.py +0 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/ingress/__init__.py +0 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/ingress/app.py +0 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/ingress/router.py +0 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/wiring/__init__.py +0 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/wiring/runtime.py +0 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/__init__.py +0 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/__init__.py +0 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/dramatiq/__init__.py +0 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/dramatiq/bootstrap.py +0 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/dramatiq/consumer_middleware.py +0 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/dramatiq/dramatiq_backend.py +0 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/dramatiq/dramatiq_consumer.py +0 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/dramatiq/entrypoint.py +0 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/managed/README.md +0 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/managed/__init__.py +0 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/managed/drain.py +0 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/managed/feed_consumer.py +0 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/managed/main.py +0 -0
- {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/runner.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: flowstash-runtime
|
|
3
|
-
Version: 0.9.0
|
|
3
|
+
Version: 0.9.2
|
|
4
4
|
Summary: Actual runtime engine and bootstrap layer for the flowstash platform.
|
|
5
5
|
Author: juraj.bezdek@gmail.com
|
|
6
6
|
Author-email: juraj.bezdek@gmail.com
|
|
@@ -12,7 +12,8 @@ Classifier: Programming Language :: Python :: 3.13
|
|
|
12
12
|
Requires-Dist: apscheduler (>=3.11.0)
|
|
13
13
|
Requires-Dist: dramatiq (>=1.16.0)
|
|
14
14
|
Requires-Dist: fastapi (>=0.110.0)
|
|
15
|
-
Requires-Dist: flowstash-clients (>=0.9.
|
|
16
|
-
Requires-Dist: flowstash-lib (>=0.9.
|
|
15
|
+
Requires-Dist: flowstash-clients (>=0.9.2,<0.10.0)
|
|
16
|
+
Requires-Dist: flowstash-lib (>=0.9.2,<0.10.0)
|
|
17
17
|
Requires-Dist: pyyaml (>=6.0.1)
|
|
18
18
|
Requires-Dist: uvicorn (>=0.29.0)
|
|
19
|
+
Requires-Dist: websockets (>=14.0)
|
|
@@ -1,17 +1,18 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "flowstash-runtime"
|
|
3
|
-
version = "0.9.0"
|
|
3
|
+
version = "0.9.2"
|
|
4
4
|
description = "Actual runtime engine and bootstrap layer for the flowstash platform."
|
|
5
5
|
authors = [{name = "juraj.bezdek@gmail.com", email = "juraj.bezdek@gmail.com"}]
|
|
6
6
|
requires-python = ">=3.11"
|
|
7
7
|
dependencies = [
|
|
8
|
-
"flowstash-clients>=0.9.
|
|
9
|
-
"flowstash-lib>=0.9.
|
|
8
|
+
"flowstash-clients>=0.9.2,<0.10.0",
|
|
9
|
+
"flowstash-lib>=0.9.2,<0.10.0",
|
|
10
10
|
"fastapi>=0.110.0",
|
|
11
11
|
"uvicorn>=0.29.0",
|
|
12
12
|
"dramatiq>=1.16.0",
|
|
13
13
|
"apscheduler>=3.11.0",
|
|
14
|
-
"pyyaml>=6.0.1"
|
|
14
|
+
"pyyaml>=6.0.1",
|
|
15
|
+
"websockets>=14.0"
|
|
15
16
|
]
|
|
16
17
|
|
|
17
18
|
[build-system]
|
|
@@ -34,6 +34,12 @@ from .task_resolver import (
|
|
|
34
34
|
resolve_function as _registry_resolve,
|
|
35
35
|
_invoke_task_callable,
|
|
36
36
|
)
|
|
37
|
+
from .lease_client import (
|
|
38
|
+
get_lease_client,
|
|
39
|
+
LeaseBusy,
|
|
40
|
+
SUCCEEDED as LEASE_SUCCEEDED,
|
|
41
|
+
FAILED as LEASE_FAILED,
|
|
42
|
+
)
|
|
37
43
|
from flowstash.pipelines.record_serialization import from_jsonable
|
|
38
44
|
|
|
39
45
|
logger = logging.getLogger(__name__)
|
|
@@ -218,63 +224,91 @@ async def _execute_managed_task(payload: TaskPayload) -> dict:
|
|
|
218
224
|
# Fall back to a fresh id for older payloads that don't carry one.
|
|
219
225
|
execution_run_id = payload.run_id or str(uuid.uuid4())
|
|
220
226
|
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
payload.task_name or payload.task_id, func_ref
|
|
227
|
+
# Idempotency guard: claim an exclusive lease on this run_id from the broker so a
|
|
228
|
+
# Cloud Tasks redelivery (or a concurrent duplicate) never re-executes a run that is
|
|
229
|
+
# already running or just completed. Disabled-safe: get_lease_client() returns None
|
|
230
|
+
# when the broker isn't configured; an unreachable broker yields UNAVAILABLE → we
|
|
231
|
+
# fail closed with a retryable 503 rather than risk a duplicate.
|
|
232
|
+
lease = get_lease_client()
|
|
233
|
+
lease_held = False
|
|
234
|
+
if lease is not None:
|
|
235
|
+
res = await lease.acquire(execution_run_id, entry_point=entry_point)
|
|
236
|
+
if res.duplicate:
|
|
237
|
+
logger.info(
|
|
238
|
+
"run %s already completed — skipping redelivery", execution_run_id
|
|
234
239
|
)
|
|
235
|
-
|
|
236
|
-
|
|
240
|
+
return {"status": "duplicate", "run_id": execution_run_id, "task": func_ref}
|
|
241
|
+
if not res.acquired:
|
|
242
|
+
# BUSY (held elsewhere), RECOVERING (broker restarting), or UNAVAILABLE
|
|
243
|
+
# (broker unreachable) — all map to a retryable 503. Only ACQUIRED runs.
|
|
244
|
+
raise LeaseBusy(execution_run_id)
|
|
245
|
+
lease_held = True
|
|
246
|
+
|
|
247
|
+
lease_status = LEASE_FAILED
|
|
248
|
+
try:
|
|
249
|
+
with integration_context(
|
|
250
|
+
integration=integration,
|
|
251
|
+
integration_pipeline=pipeline,
|
|
252
|
+
run_id=execution_run_id,
|
|
253
|
+
parent_run_id=parent_run_id,
|
|
254
|
+
operation_id=operation_id,
|
|
255
|
+
tags=tags,
|
|
256
|
+
attrs={"args": raw_args},
|
|
257
|
+
record_lifecycle=False,
|
|
258
|
+
) as ctx:
|
|
259
|
+
try:
|
|
260
|
+
func = _resolve_task_callable(
|
|
261
|
+
payload.task_name or payload.task_id, func_ref
|
|
262
|
+
)
|
|
263
|
+
except ValueError as e:
|
|
264
|
+
logger.warning(f"Could not resolve task '{entry_point}': {e}")
|
|
265
|
+
await record_run_started(
|
|
266
|
+
correlation=ctx.corelation,
|
|
267
|
+
entry_point=entry_point,
|
|
268
|
+
attrs={"args": raw_args},
|
|
269
|
+
)
|
|
270
|
+
await record_run_ended(
|
|
271
|
+
status="FAILED",
|
|
272
|
+
correlation=ctx.corelation,
|
|
273
|
+
attrs={"error": str(e), "task_resolution_failed": True},
|
|
274
|
+
)
|
|
275
|
+
raise
|
|
276
|
+
|
|
277
|
+
normalized_args = normalize_arguments(func, args, kwargs)
|
|
237
278
|
await record_run_started(
|
|
238
279
|
correlation=ctx.corelation,
|
|
239
280
|
entry_point=entry_point,
|
|
240
|
-
attrs={"args": raw_args},
|
|
241
|
-
)
|
|
242
|
-
await record_run_ended(
|
|
243
|
-
status="FAILED",
|
|
244
|
-
correlation=ctx.corelation,
|
|
245
|
-
attrs={"error": str(e), "task_resolution_failed": True},
|
|
281
|
+
attrs={"args": normalized_args},
|
|
246
282
|
)
|
|
247
|
-
raise
|
|
248
283
|
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
284
|
+
try:
|
|
285
|
+
await _invoke_task_callable(func, args, kwargs)
|
|
286
|
+
except Exception as e:
|
|
287
|
+
import traceback
|
|
288
|
+
|
|
289
|
+
error = str(e)
|
|
290
|
+
tb = traceback.format_exc()
|
|
291
|
+
logger.error(f"Task execution failed: {func_ref}: {e}", exc_info=True)
|
|
292
|
+
await record_run_ended(
|
|
293
|
+
status="FAILED",
|
|
294
|
+
correlation=ctx.corelation,
|
|
295
|
+
attrs={"error": error, "traceback": tb},
|
|
296
|
+
)
|
|
297
|
+
# Re-raise so the run ends as FAILED only (not ALSO SUCCEEDED) and the handler
|
|
298
|
+
# returns 500 → Cloud Tasks retries the delivery. Without this, a raising task
|
|
299
|
+
# was recorded as both FAILED and SUCCEEDED and the worker returned 200.
|
|
300
|
+
raise
|
|
260
301
|
|
|
261
|
-
error = str(e)
|
|
262
|
-
tb = traceback.format_exc()
|
|
263
|
-
logger.error(f"Task execution failed: {func_ref}: {e}", exc_info=True)
|
|
264
302
|
await record_run_ended(
|
|
265
|
-
status="FAILED",
|
|
303
|
+
status="SUCCEEDED",
|
|
266
304
|
correlation=ctx.corelation,
|
|
267
|
-
attrs={"error": error, "traceback": tb},
|
|
268
305
|
)
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
status="SUCCEEDED",
|
|
276
|
-
correlation=ctx.corelation,
|
|
277
|
-
)
|
|
306
|
+
lease_status = LEASE_SUCCEEDED
|
|
307
|
+
finally:
|
|
308
|
+
if lease_held:
|
|
309
|
+
# SUCCEEDED → completed-set tombstone (skips post-completion redeliveries);
|
|
310
|
+
# FAILED → free the lease so Cloud Tasks can genuinely retry.
|
|
311
|
+
await lease.release(execution_run_id, lease_status)
|
|
278
312
|
|
|
279
313
|
return {
|
|
280
314
|
"status": "ok",
|
|
@@ -317,6 +351,15 @@ async def handle_task(request: Request, payload: TaskPayload):
|
|
|
317
351
|
status_code=status.HTTP_404_NOT_FOUND,
|
|
318
352
|
content={"status": "TASK_NOT_FOUND", "detail": str(e)},
|
|
319
353
|
)
|
|
354
|
+
except LeaseBusy as e:
|
|
355
|
+
# Lease held elsewhere (or broker unreachable). 503 is retryable by
|
|
356
|
+
# Cloud Tasks (unlike a 4xx), so the redelivery becomes the liveness
|
|
357
|
+
# backstop: by the next attempt the holder has finished (→ COMPLETED)
|
|
358
|
+
# or its lease expired (→ this attempt acquires it).
|
|
359
|
+
return JSONResponse(
|
|
360
|
+
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
|
|
361
|
+
content={"status": "LEASE_HELD", "run_id": e.run_id},
|
|
362
|
+
)
|
|
320
363
|
except Exception as e:
|
|
321
364
|
# Task execution failed (already recorded as FAILED on the run). Return 500 so
|
|
322
365
|
# Cloud Tasks retries the delivery; the request scope still releases its slot
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Worker-side client for the lease broker.
|
|
3
|
+
|
|
4
|
+
One process-singleton WebSocket connection multiplexes the acquire/release of
|
|
5
|
+
every task this worker process runs. The connection itself is the liveness
|
|
6
|
+
signal — if it drops, the broker frees this worker's leases after a grace window
|
|
7
|
+
unless we reconnect and re-assert them. We track our own held run_ids locally and
|
|
8
|
+
re-assert the whole set on every (re)connect.
|
|
9
|
+
|
|
10
|
+
Safe rollout: if the broker is not configured (no URL/token), or the optional
|
|
11
|
+
``websockets`` dependency is missing, the client is *disabled* and callers run
|
|
12
|
+
without the guard (no behaviour change). Once enabled, an unreachable broker
|
|
13
|
+
makes ``acquire`` return ``UNAVAILABLE`` so the caller fails closed (HTTP 503 /
|
|
14
|
+
job no-op) rather than risk a duplicate.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import asyncio
|
|
20
|
+
import json
|
|
21
|
+
import logging
|
|
22
|
+
import os
|
|
23
|
+
from dataclasses import dataclass
|
|
24
|
+
from typing import Dict, List, Optional, Set
|
|
25
|
+
|
|
26
|
+
try: # optional dependency — absence simply disables the guard
|
|
27
|
+
import websockets
|
|
28
|
+
except Exception: # pragma: no cover - import guard
|
|
29
|
+
websockets = None # type: ignore
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
# acquire() outcomes
|
|
34
|
+
ACQUIRED = "ACQUIRED"
|
|
35
|
+
BUSY = "BUSY"
|
|
36
|
+
COMPLETED = "COMPLETED"
|
|
37
|
+
UNAVAILABLE = "UNAVAILABLE"
|
|
38
|
+
|
|
39
|
+
# release() statuses
|
|
40
|
+
SUCCEEDED = "SUCCEEDED"
|
|
41
|
+
FAILED = "FAILED"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class LeaseBusy(Exception):
|
|
45
|
+
"""Raised when the run's lease is held elsewhere (or the broker is
|
|
46
|
+
unreachable) — the caller should respond with a retryable 503 so Cloud Tasks
|
|
47
|
+
redelivers later, by which point the holder has finished or its lease expired."""
|
|
48
|
+
|
|
49
|
+
def __init__(self, run_id: str):
|
|
50
|
+
super().__init__(f"lease busy for run_id={run_id}")
|
|
51
|
+
self.run_id = run_id
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
|
|
55
|
+
class AcquireResult:
|
|
56
|
+
outcome: str # ACQUIRED | BUSY | COMPLETED | UNAVAILABLE
|
|
57
|
+
|
|
58
|
+
@property
|
|
59
|
+
def acquired(self) -> bool:
|
|
60
|
+
return self.outcome == ACQUIRED
|
|
61
|
+
|
|
62
|
+
@property
|
|
63
|
+
def duplicate(self) -> bool:
|
|
64
|
+
return self.outcome == COMPLETED
|
|
65
|
+
|
|
66
|
+
@property
|
|
67
|
+
def busy(self) -> bool:
|
|
68
|
+
return self.outcome in (BUSY, UNAVAILABLE)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class LeaseBrokerClient:
|
|
72
|
+
def __init__(
|
|
73
|
+
self,
|
|
74
|
+
url: str,
|
|
75
|
+
token: str,
|
|
76
|
+
*,
|
|
77
|
+
acquire_timeout: float = 10.0,
|
|
78
|
+
reconnect_min: float = 0.5,
|
|
79
|
+
reconnect_max: float = 10.0,
|
|
80
|
+
) -> None:
|
|
81
|
+
self._url = url
|
|
82
|
+
self._token = token
|
|
83
|
+
self._acquire_timeout = acquire_timeout
|
|
84
|
+
self._reconnect_min = reconnect_min
|
|
85
|
+
self._reconnect_max = reconnect_max
|
|
86
|
+
|
|
87
|
+
self._held: Set[str] = set()
|
|
88
|
+
self._waiters: Dict[str, List[asyncio.Future]] = {}
|
|
89
|
+
self._ws = None
|
|
90
|
+
self._connected = asyncio.Event()
|
|
91
|
+
self._send_lock = asyncio.Lock()
|
|
92
|
+
self._run_task: Optional[asyncio.Task] = None
|
|
93
|
+
self._closing = False
|
|
94
|
+
|
|
95
|
+
# ── lifecycle ────────────────────────────────────────────────────────
|
|
96
|
+
|
|
97
|
+
def ensure_started(self) -> None:
|
|
98
|
+
"""Start the background connect loop once, on the current event loop."""
|
|
99
|
+
if self._run_task is None or self._run_task.done():
|
|
100
|
+
self._run_task = asyncio.ensure_future(self._run())
|
|
101
|
+
|
|
102
|
+
async def stop(self) -> None:
|
|
103
|
+
self._closing = True
|
|
104
|
+
if self._run_task is not None:
|
|
105
|
+
self._run_task.cancel()
|
|
106
|
+
ws = self._ws
|
|
107
|
+
if ws is not None:
|
|
108
|
+
try:
|
|
109
|
+
await ws.close()
|
|
110
|
+
except Exception:
|
|
111
|
+
pass
|
|
112
|
+
|
|
113
|
+
async def _run(self) -> None:
|
|
114
|
+
backoff = self._reconnect_min
|
|
115
|
+
headers = {"Authorization": f"Bearer {self._token}"}
|
|
116
|
+
while not self._closing:
|
|
117
|
+
try:
|
|
118
|
+
async with websockets.connect(
|
|
119
|
+
self._url, additional_headers=headers, open_timeout=10
|
|
120
|
+
) as ws:
|
|
121
|
+
self._ws = ws
|
|
122
|
+
backoff = self._reconnect_min
|
|
123
|
+
if self._held:
|
|
124
|
+
# Re-assert everything we still believe we own.
|
|
125
|
+
await self._send({"op": "reassert", "run_ids": list(self._held)})
|
|
126
|
+
self._connected.set()
|
|
127
|
+
async for raw in ws:
|
|
128
|
+
try:
|
|
129
|
+
self._on_message(json.loads(raw))
|
|
130
|
+
except Exception:
|
|
131
|
+
logger.exception("lease client: bad message %r", raw)
|
|
132
|
+
except asyncio.CancelledError:
|
|
133
|
+
raise
|
|
134
|
+
except Exception as e:
|
|
135
|
+
logger.warning("lease broker connection lost: %s", e)
|
|
136
|
+
finally:
|
|
137
|
+
self._connected.clear()
|
|
138
|
+
self._ws = None
|
|
139
|
+
if self._closing:
|
|
140
|
+
break
|
|
141
|
+
await asyncio.sleep(backoff)
|
|
142
|
+
backoff = min(backoff * 2, self._reconnect_max)
|
|
143
|
+
|
|
144
|
+
# ── messaging ──────────────────────────────────────────────────────────
|
|
145
|
+
|
|
146
|
+
def _on_message(self, msg: dict) -> None:
|
|
147
|
+
op = msg.get("op")
|
|
148
|
+
if op in ("granted", "refused"):
|
|
149
|
+
run_id = msg.get("run_id")
|
|
150
|
+
outcome = ACQUIRED if op == "granted" else msg.get("reason", BUSY)
|
|
151
|
+
self._resolve(run_id, outcome)
|
|
152
|
+
elif op == "revoked":
|
|
153
|
+
# Another worker took over (split-brain after a long stall). We can't
|
|
154
|
+
# force-cancel in-flight work; drop ownership and rely on step
|
|
155
|
+
# idempotency. Logged loudly so it's visible.
|
|
156
|
+
run_id = msg.get("run_id")
|
|
157
|
+
self._held.discard(run_id)
|
|
158
|
+
logger.error("lease REVOKED run_id=%s reason=%s", run_id, msg.get("reason"))
|
|
159
|
+
elif op == "ping":
|
|
160
|
+
asyncio.ensure_future(self._send_safe({"op": "pong"}))
|
|
161
|
+
|
|
162
|
+
def _resolve(self, run_id: Optional[str], outcome: str) -> None:
|
|
163
|
+
for fut in self._waiters.pop(run_id, []):
|
|
164
|
+
if not fut.done():
|
|
165
|
+
fut.set_result(outcome)
|
|
166
|
+
|
|
167
|
+
async def _send(self, msg: dict) -> None:
|
|
168
|
+
async with self._send_lock:
|
|
169
|
+
if self._ws is None:
|
|
170
|
+
raise ConnectionError("lease broker not connected")
|
|
171
|
+
await self._ws.send(json.dumps(msg))
|
|
172
|
+
|
|
173
|
+
async def _send_safe(self, msg: dict) -> None:
|
|
174
|
+
try:
|
|
175
|
+
await self._send(msg)
|
|
176
|
+
except Exception:
|
|
177
|
+
pass
|
|
178
|
+
|
|
179
|
+
# ── public API ──────────────────────────────────────────────────────────
|
|
180
|
+
|
|
181
|
+
async def acquire(
|
|
182
|
+
self, run_id: str, entry_point: Optional[str] = None, timeout: Optional[float] = None
|
|
183
|
+
) -> AcquireResult:
|
|
184
|
+
self.ensure_started()
|
|
185
|
+
timeout = timeout or self._acquire_timeout
|
|
186
|
+
try:
|
|
187
|
+
await asyncio.wait_for(self._connected.wait(), timeout=timeout)
|
|
188
|
+
except asyncio.TimeoutError:
|
|
189
|
+
return AcquireResult(UNAVAILABLE)
|
|
190
|
+
|
|
191
|
+
fut: asyncio.Future = asyncio.get_event_loop().create_future()
|
|
192
|
+
self._waiters.setdefault(run_id, []).append(fut)
|
|
193
|
+
try:
|
|
194
|
+
await self._send({"op": "acquire", "run_id": run_id, "entry_point": entry_point})
|
|
195
|
+
except Exception:
|
|
196
|
+
self._discard_waiter(run_id, fut)
|
|
197
|
+
return AcquireResult(UNAVAILABLE)
|
|
198
|
+
|
|
199
|
+
try:
|
|
200
|
+
outcome = await asyncio.wait_for(fut, timeout=timeout)
|
|
201
|
+
except asyncio.TimeoutError:
|
|
202
|
+
self._discard_waiter(run_id, fut)
|
|
203
|
+
return AcquireResult(UNAVAILABLE)
|
|
204
|
+
|
|
205
|
+
if outcome == ACQUIRED:
|
|
206
|
+
self._held.add(run_id)
|
|
207
|
+
return AcquireResult(outcome)
|
|
208
|
+
|
|
209
|
+
async def release(self, run_id: str, status: str = SUCCEEDED) -> None:
|
|
210
|
+
self._held.discard(run_id)
|
|
211
|
+
# Best-effort: if the send fails, the broker frees the lease via the
|
|
212
|
+
# grace window when the connection drops anyway.
|
|
213
|
+
await self._send_safe({"op": "release", "run_id": run_id, "status": status})
|
|
214
|
+
|
|
215
|
+
def _discard_waiter(self, run_id: str, fut: asyncio.Future) -> None:
|
|
216
|
+
waiters = self._waiters.get(run_id)
|
|
217
|
+
if waiters and fut in waiters:
|
|
218
|
+
waiters.remove(fut)
|
|
219
|
+
if not waiters:
|
|
220
|
+
self._waiters.pop(run_id, None)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
# ── singleton wiring ─────────────────────────────────────────────────────────
|
|
224
|
+
|
|
225
|
+
_client: Optional[LeaseBrokerClient] = None
|
|
226
|
+
_resolved = False
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _derive_broker_url() -> Optional[str]:
|
|
230
|
+
explicit = os.getenv("LEASE_BROKER_URL")
|
|
231
|
+
if explicit:
|
|
232
|
+
return explicit
|
|
233
|
+
api = os.getenv("FLOWSTASH_API_URL") or os.getenv("MANAGED_API_URL")
|
|
234
|
+
if not api:
|
|
235
|
+
return None
|
|
236
|
+
api = api.rstrip("/")
|
|
237
|
+
if api.startswith("https://"):
|
|
238
|
+
return "wss://" + api[len("https://") :] + "/ws/leases"
|
|
239
|
+
if api.startswith("http://"):
|
|
240
|
+
return "ws://" + api[len("http://") :] + "/ws/leases"
|
|
241
|
+
return None
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def get_lease_client() -> Optional[LeaseBrokerClient]:
|
|
245
|
+
"""Return the process-singleton lease client, or None when the guard is
|
|
246
|
+
disabled (broker not configured, explicitly turned off, or ``websockets``
|
|
247
|
+
not installed). Callers treat None as 'run without the guard'."""
|
|
248
|
+
global _client, _resolved
|
|
249
|
+
if _resolved:
|
|
250
|
+
return _client
|
|
251
|
+
_resolved = True
|
|
252
|
+
|
|
253
|
+
if os.getenv("LEASE_BROKER_ENABLED", "true").lower() in ("0", "false", "no"):
|
|
254
|
+
logger.info("lease broker disabled via LEASE_BROKER_ENABLED")
|
|
255
|
+
return None
|
|
256
|
+
if websockets is None:
|
|
257
|
+
logger.warning("lease broker disabled: 'websockets' not installed")
|
|
258
|
+
return None
|
|
259
|
+
url = _derive_broker_url()
|
|
260
|
+
token = os.getenv("MANAGED_AUTH_TOKEN", "")
|
|
261
|
+
if not url or not token:
|
|
262
|
+
logger.info("lease broker disabled: URL/token not configured")
|
|
263
|
+
return None
|
|
264
|
+
|
|
265
|
+
_client = LeaseBrokerClient(url, token)
|
|
266
|
+
logger.info("lease broker enabled: %s", url)
|
|
267
|
+
return _client
|
|
@@ -14,6 +14,7 @@ import logging
|
|
|
14
14
|
import os
|
|
15
15
|
import sys
|
|
16
16
|
import traceback
|
|
17
|
+
import uuid
|
|
17
18
|
from typing import Any
|
|
18
19
|
|
|
19
20
|
from collections import defaultdict
|
|
@@ -32,6 +33,14 @@ from flowstash.observability.ingestion import (
|
|
|
32
33
|
from flowstash.queue.consumer import TaskConsumer
|
|
33
34
|
|
|
34
35
|
from .task_resolver import _invoke_task_callable, resolve_function
|
|
36
|
+
from .lease_client import (
|
|
37
|
+
get_lease_client,
|
|
38
|
+
ACQUIRED,
|
|
39
|
+
BUSY,
|
|
40
|
+
COMPLETED,
|
|
41
|
+
SUCCEEDED,
|
|
42
|
+
FAILED,
|
|
43
|
+
)
|
|
35
44
|
|
|
36
45
|
logger = logging.getLogger(__name__)
|
|
37
46
|
|
|
@@ -146,13 +155,16 @@ async def _record_missing_task(task_name: str) -> None:
|
|
|
146
155
|
)
|
|
147
156
|
|
|
148
157
|
|
|
149
|
-
async def _run_job_task(func: Any, task_name: str, args: list, kwargs: dict) -> bool:
|
|
158
|
+
async def _run_job_task(
|
|
159
|
+
func: Any, task_name: str, args: list, kwargs: dict, run_id: str | None = None
|
|
160
|
+
) -> bool:
|
|
150
161
|
"""Execute the resolved task within an integration_context. Returns True on success."""
|
|
151
162
|
normalized_args = normalize_arguments(func, args, kwargs)
|
|
152
163
|
|
|
153
164
|
with integration_context(
|
|
154
165
|
integration="managed-job",
|
|
155
166
|
integration_pipeline=task_name,
|
|
167
|
+
run_id=run_id,
|
|
156
168
|
record_lifecycle=False,
|
|
157
169
|
) as ctx:
|
|
158
170
|
await record_run_started(
|
|
@@ -388,7 +400,11 @@ class ManagedConsumer(TaskConsumer):
|
|
|
388
400
|
sys.exit(1)
|
|
389
401
|
|
|
390
402
|
task_name, args, kwargs = _parse_cli_args(argv[1:])
|
|
391
|
-
|
|
403
|
+
# Stable run_id injected by the platform at schedule time (FLOWSTASH_RUN_ID).
|
|
404
|
+
# It is identical across Cloud Run Job retries of one execution, so the lease
|
|
405
|
+
# guard collapses duplicate executions of the same logical run.
|
|
406
|
+
run_id = os.getenv("FLOWSTASH_RUN_ID") or str(uuid.uuid4())
|
|
407
|
+
logger.info(f"[run-task] Resolving task: {task_name!r} (run_id={run_id})")
|
|
392
408
|
|
|
393
409
|
try:
|
|
394
410
|
func = resolve_function(task_name)
|
|
@@ -398,10 +414,41 @@ class ManagedConsumer(TaskConsumer):
|
|
|
398
414
|
await asyncio.to_thread(AsyncManager.get_instance().flush, 15.0)
|
|
399
415
|
sys.exit(1)
|
|
400
416
|
|
|
417
|
+
# Idempotency guard via the lease broker. Unlike HTTP tasks, a job that
|
|
418
|
+
# exits has no automatic redelivery, so on BUSY/COMPLETED we exit 0 (another
|
|
419
|
+
# execution owns it / it is already done) and on UNAVAILABLE we fail OPEN
|
|
420
|
+
# (run anyway) rather than silently drop the work.
|
|
421
|
+
lease = get_lease_client()
|
|
422
|
+
lease_held = False
|
|
423
|
+
if lease is not None:
|
|
424
|
+
res = await lease.acquire(run_id, entry_point=task_name)
|
|
425
|
+
if res.outcome in (COMPLETED, BUSY):
|
|
426
|
+
logger.info(
|
|
427
|
+
"[run-task] run %s not started (%s) — another execution owns it",
|
|
428
|
+
run_id,
|
|
429
|
+
res.outcome,
|
|
430
|
+
)
|
|
431
|
+
await asyncio.to_thread(AsyncManager.get_instance().flush, 15.0)
|
|
432
|
+
sys.exit(0)
|
|
433
|
+
elif res.outcome == ACQUIRED:
|
|
434
|
+
lease_held = True
|
|
435
|
+
else:
|
|
436
|
+
# UNAVAILABLE (broker unreachable) or RECOVERING (broker restarting).
|
|
437
|
+
# A job has no automatic redelivery, so fail OPEN (run unguarded)
|
|
438
|
+
# rather than silently drop the work; duplicates here are rare.
|
|
439
|
+
logger.warning(
|
|
440
|
+
"[run-task] lease not granted (%s) — running run %s without guard",
|
|
441
|
+
res.outcome,
|
|
442
|
+
run_id,
|
|
443
|
+
)
|
|
444
|
+
|
|
401
445
|
logger.info(
|
|
402
446
|
f"[run-task] Executing task: {task_name!r}, args={args}, kwargs={kwargs}"
|
|
403
447
|
)
|
|
404
|
-
success = await _run_job_task(func, task_name, args, kwargs)
|
|
448
|
+
success = await _run_job_task(func, task_name, args, kwargs, run_id=run_id)
|
|
449
|
+
|
|
450
|
+
if lease_held:
|
|
451
|
+
await lease.release(run_id, SUCCEEDED if success else FAILED)
|
|
405
452
|
|
|
406
453
|
await asyncio.to_thread(AsyncManager.get_instance().flush, 15.0)
|
|
407
454
|
sys.exit(0 if success else 1)
|
|
@@ -4,6 +4,7 @@ Task registry, function resolution, and callable dispatch.
|
|
|
4
4
|
Shared by the managed job consumer.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
+
import asyncio
|
|
7
8
|
import importlib
|
|
8
9
|
import inspect
|
|
9
10
|
import logging
|
|
@@ -48,14 +49,23 @@ def resolve_function(func_ref: str) -> Any:
|
|
|
48
49
|
|
|
49
50
|
|
|
50
51
|
async def _invoke_task_callable(func: Any, args: list, kwargs: dict) -> Any:
|
|
51
|
-
"""Dispatch a task callable regardless of its wrapper type."""
|
|
52
|
+
"""Dispatch a task callable regardless of its wrapper type.
|
|
53
|
+
|
|
54
|
+
Sync callables are run via ``asyncio.to_thread`` rather than inline: a
|
|
55
|
+
multi-minute sync task would otherwise block the single worker event loop and
|
|
56
|
+
starve all other concurrent requests on the instance — the very condition that
|
|
57
|
+
triggers Cloud Tasks' connection resets/retries — and would also freeze the
|
|
58
|
+
lease WebSocket. ``asyncio.to_thread`` copies the current ``contextvars``
|
|
59
|
+
context into the worker thread, so the ``integration_context`` / ``run_id``
|
|
60
|
+
remain visible to the task and its observability.
|
|
61
|
+
"""
|
|
52
62
|
if hasattr(func, "run"):
|
|
53
63
|
return await func.run(*args, **kwargs)
|
|
54
64
|
if hasattr(func, "func"):
|
|
55
65
|
underlying = func.func
|
|
56
66
|
if inspect.iscoroutinefunction(underlying):
|
|
57
67
|
return await underlying(*args, **kwargs)
|
|
58
|
-
return underlying(*args, **kwargs)
|
|
68
|
+
return await asyncio.to_thread(underlying, *args, **kwargs)
|
|
59
69
|
if inspect.iscoroutinefunction(func):
|
|
60
70
|
return await func(*args, **kwargs)
|
|
61
|
-
return func(*args, **kwargs)
|
|
71
|
+
return await asyncio.to_thread(func, *args, **kwargs)
|
|
File without changes
|
{flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/ingress/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/wiring/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|