flowstash-runtime 0.9.0__tar.gz → 0.9.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/PKG-INFO +4 -3
  2. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/pyproject.toml +5 -4
  3. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/managed/http_entrypoint.py +90 -47
  4. flowstash_runtime-0.9.2/src/flowstash/runtime/worker/backends/managed/lease_client.py +267 -0
  5. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/managed/managed_consumer.py +50 -3
  6. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/managed/task_resolver.py +13 -3
  7. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/__init__.py +0 -0
  8. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/ingress/__init__.py +0 -0
  9. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/ingress/app.py +0 -0
  10. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/ingress/router.py +0 -0
  11. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/wiring/__init__.py +0 -0
  12. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/wiring/runtime.py +0 -0
  13. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/__init__.py +0 -0
  14. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/__init__.py +0 -0
  15. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/dramatiq/__init__.py +0 -0
  16. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/dramatiq/bootstrap.py +0 -0
  17. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/dramatiq/consumer_middleware.py +0 -0
  18. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/dramatiq/dramatiq_backend.py +0 -0
  19. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/dramatiq/dramatiq_consumer.py +0 -0
  20. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/dramatiq/entrypoint.py +0 -0
  21. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/managed/README.md +0 -0
  22. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/managed/__init__.py +0 -0
  23. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/managed/drain.py +0 -0
  24. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/managed/feed_consumer.py +0 -0
  25. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/backends/managed/main.py +0 -0
  26. {flowstash_runtime-0.9.0 → flowstash_runtime-0.9.2}/src/flowstash/runtime/worker/runner.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: flowstash-runtime
3
- Version: 0.9.0
3
+ Version: 0.9.2
4
4
  Summary: Actual runtime engine and bootstrap layer for the flowstash platform.
5
5
  Author: juraj.bezdek@gmail.com
6
6
  Author-email: juraj.bezdek@gmail.com
@@ -12,7 +12,8 @@ Classifier: Programming Language :: Python :: 3.13
12
12
  Requires-Dist: apscheduler (>=3.11.0)
13
13
  Requires-Dist: dramatiq (>=1.16.0)
14
14
  Requires-Dist: fastapi (>=0.110.0)
15
- Requires-Dist: flowstash-clients (>=0.9.0,<0.10.0)
16
- Requires-Dist: flowstash-lib (>=0.9.0,<0.10.0)
15
+ Requires-Dist: flowstash-clients (>=0.9.2,<0.10.0)
16
+ Requires-Dist: flowstash-lib (>=0.9.2,<0.10.0)
17
17
  Requires-Dist: pyyaml (>=6.0.1)
18
18
  Requires-Dist: uvicorn (>=0.29.0)
19
+ Requires-Dist: websockets (>=14.0)
@@ -1,17 +1,18 @@
1
1
  [project]
2
2
  name = "flowstash-runtime"
3
- version = "0.9.0"
3
+ version = "0.9.2"
4
4
  description = "Actual runtime engine and bootstrap layer for the flowstash platform."
5
5
  authors = [{name = "juraj.bezdek@gmail.com", email = "juraj.bezdek@gmail.com"}]
6
6
  requires-python = ">=3.11"
7
7
  dependencies = [
8
- "flowstash-clients>=0.9.0,<0.10.0",
9
- "flowstash-lib>=0.9.0,<0.10.0",
8
+ "flowstash-clients>=0.9.2,<0.10.0",
9
+ "flowstash-lib>=0.9.2,<0.10.0",
10
10
  "fastapi>=0.110.0",
11
11
  "uvicorn>=0.29.0",
12
12
  "dramatiq>=1.16.0",
13
13
  "apscheduler>=3.11.0",
14
- "pyyaml>=6.0.1"
14
+ "pyyaml>=6.0.1",
15
+ "websockets>=14.0"
15
16
  ]
16
17
 
17
18
  [build-system]
@@ -34,6 +34,12 @@ from .task_resolver import (
34
34
  resolve_function as _registry_resolve,
35
35
  _invoke_task_callable,
36
36
  )
37
+ from .lease_client import (
38
+ get_lease_client,
39
+ LeaseBusy,
40
+ SUCCEEDED as LEASE_SUCCEEDED,
41
+ FAILED as LEASE_FAILED,
42
+ )
37
43
  from flowstash.pipelines.record_serialization import from_jsonable
38
44
 
39
45
  logger = logging.getLogger(__name__)
@@ -218,63 +224,91 @@ async def _execute_managed_task(payload: TaskPayload) -> dict:
218
224
  # Fall back to a fresh id for older payloads that don't carry one.
219
225
  execution_run_id = payload.run_id or str(uuid.uuid4())
220
226
 
221
- with integration_context(
222
- integration=integration,
223
- integration_pipeline=pipeline,
224
- run_id=execution_run_id,
225
- parent_run_id=parent_run_id,
226
- operation_id=operation_id,
227
- tags=tags,
228
- attrs={"args": raw_args},
229
- record_lifecycle=False,
230
- ) as ctx:
231
- try:
232
- func = _resolve_task_callable(
233
- payload.task_name or payload.task_id, func_ref
227
+ # Idempotency guard: claim an exclusive lease on this run_id from the broker so a
228
+ # Cloud Tasks redelivery (or a concurrent duplicate) never re-executes a run that is
229
+ # already running or just completed. Disabled-safe: get_lease_client() returns None
230
+ # when the broker isn't configured; an unreachable broker yields UNAVAILABLE → we
231
+ # fail closed with a retryable 503 rather than risk a duplicate.
232
+ lease = get_lease_client()
233
+ lease_held = False
234
+ if lease is not None:
235
+ res = await lease.acquire(execution_run_id, entry_point=entry_point)
236
+ if res.duplicate:
237
+ logger.info(
238
+ "run %s already completed — skipping redelivery", execution_run_id
234
239
  )
235
- except ValueError as e:
236
- logger.warning(f"Could not resolve task '{entry_point}': {e}")
240
+ return {"status": "duplicate", "run_id": execution_run_id, "task": func_ref}
241
+ if not res.acquired:
242
+ # BUSY (held elsewhere), RECOVERING (broker restarting), or UNAVAILABLE
243
+ # (broker unreachable) — all map to a retryable 503. Only ACQUIRED runs.
244
+ raise LeaseBusy(execution_run_id)
245
+ lease_held = True
246
+
247
+ lease_status = LEASE_FAILED
248
+ try:
249
+ with integration_context(
250
+ integration=integration,
251
+ integration_pipeline=pipeline,
252
+ run_id=execution_run_id,
253
+ parent_run_id=parent_run_id,
254
+ operation_id=operation_id,
255
+ tags=tags,
256
+ attrs={"args": raw_args},
257
+ record_lifecycle=False,
258
+ ) as ctx:
259
+ try:
260
+ func = _resolve_task_callable(
261
+ payload.task_name or payload.task_id, func_ref
262
+ )
263
+ except ValueError as e:
264
+ logger.warning(f"Could not resolve task '{entry_point}': {e}")
265
+ await record_run_started(
266
+ correlation=ctx.corelation,
267
+ entry_point=entry_point,
268
+ attrs={"args": raw_args},
269
+ )
270
+ await record_run_ended(
271
+ status="FAILED",
272
+ correlation=ctx.corelation,
273
+ attrs={"error": str(e), "task_resolution_failed": True},
274
+ )
275
+ raise
276
+
277
+ normalized_args = normalize_arguments(func, args, kwargs)
237
278
  await record_run_started(
238
279
  correlation=ctx.corelation,
239
280
  entry_point=entry_point,
240
- attrs={"args": raw_args},
241
- )
242
- await record_run_ended(
243
- status="FAILED",
244
- correlation=ctx.corelation,
245
- attrs={"error": str(e), "task_resolution_failed": True},
281
+ attrs={"args": normalized_args},
246
282
  )
247
- raise
248
283
 
249
- normalized_args = normalize_arguments(func, args, kwargs)
250
- await record_run_started(
251
- correlation=ctx.corelation,
252
- entry_point=entry_point,
253
- attrs={"args": normalized_args},
254
- )
255
-
256
- try:
257
- await _invoke_task_callable(func, args, kwargs)
258
- except Exception as e:
259
- import traceback
284
+ try:
285
+ await _invoke_task_callable(func, args, kwargs)
286
+ except Exception as e:
287
+ import traceback
288
+
289
+ error = str(e)
290
+ tb = traceback.format_exc()
291
+ logger.error(f"Task execution failed: {func_ref}: {e}", exc_info=True)
292
+ await record_run_ended(
293
+ status="FAILED",
294
+ correlation=ctx.corelation,
295
+ attrs={"error": error, "traceback": tb},
296
+ )
297
+ # Re-raise so the run ends as FAILED only (not ALSO SUCCEEDED) and the handler
298
+ # returns 500 → Cloud Tasks retries the delivery. Without this, a raising task
299
+ # was recorded as both FAILED and SUCCEEDED and the worker returned 200.
300
+ raise
260
301
 
261
- error = str(e)
262
- tb = traceback.format_exc()
263
- logger.error(f"Task execution failed: {func_ref}: {e}", exc_info=True)
264
302
  await record_run_ended(
265
- status="FAILED",
303
+ status="SUCCEEDED",
266
304
  correlation=ctx.corelation,
267
- attrs={"error": error, "traceback": tb},
268
305
  )
269
- # Re-raise so the run ends as FAILED only (not ALSO SUCCEEDED) and the handler
270
- # returns 500 → Cloud Tasks retries the delivery. Without this, a raising task
271
- # was recorded as both FAILED and SUCCEEDED and the worker returned 200.
272
- raise
273
-
274
- await record_run_ended(
275
- status="SUCCEEDED",
276
- correlation=ctx.corelation,
277
- )
306
+ lease_status = LEASE_SUCCEEDED
307
+ finally:
308
+ if lease_held:
309
+ # SUCCEEDED → completed-set tombstone (skips post-completion redeliveries);
310
+ # FAILED → free the lease so Cloud Tasks can genuinely retry.
311
+ await lease.release(execution_run_id, lease_status)
278
312
 
279
313
  return {
280
314
  "status": "ok",
@@ -317,6 +351,15 @@ async def handle_task(request: Request, payload: TaskPayload):
317
351
  status_code=status.HTTP_404_NOT_FOUND,
318
352
  content={"status": "TASK_NOT_FOUND", "detail": str(e)},
319
353
  )
354
+ except LeaseBusy as e:
355
+ # Lease held elsewhere (or broker unreachable). 503 is retryable by
356
+ # Cloud Tasks (unlike a 4xx), so the redelivery becomes the liveness
357
+ # backstop: by the next attempt the holder has finished (→ COMPLETED)
358
+ # or its lease expired (→ this attempt acquires it).
359
+ return JSONResponse(
360
+ status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
361
+ content={"status": "LEASE_HELD", "run_id": e.run_id},
362
+ )
320
363
  except Exception as e:
321
364
  # Task execution failed (already recorded as FAILED on the run). Return 500 so
322
365
  # Cloud Tasks retries the delivery; the request scope still releases its slot
@@ -0,0 +1,267 @@
1
+ """
2
+ Worker-side client for the lease broker.
3
+
4
+ One process-singleton WebSocket connection multiplexes the acquire/release of
5
+ every task this worker process runs. The connection itself is the liveness
6
+ signal — if it drops, the broker frees this worker's leases after a grace window
7
+ unless we reconnect and re-assert them. We track our own held run_ids locally and
8
+ re-assert the whole set on every (re)connect.
9
+
10
+ Safe rollout: if the broker is not configured (no URL/token), or the optional
11
+ ``websockets`` dependency is missing, the client is *disabled* and callers run
12
+ without the guard (no behaviour change). Once enabled, an unreachable broker
13
+ makes ``acquire`` return ``UNAVAILABLE`` so the caller fails closed (HTTP 503 /
14
+ job no-op) rather than risk a duplicate.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import asyncio
20
+ import json
21
+ import logging
22
+ import os
23
+ from dataclasses import dataclass
24
+ from typing import Dict, List, Optional, Set
25
+
26
+ try: # optional dependency — absence simply disables the guard
27
+ import websockets
28
+ except Exception: # pragma: no cover - import guard
29
+ websockets = None # type: ignore
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+ # acquire() outcomes
34
+ ACQUIRED = "ACQUIRED"
35
+ BUSY = "BUSY"
36
+ COMPLETED = "COMPLETED"
37
+ UNAVAILABLE = "UNAVAILABLE"
38
+
39
+ # release() statuses
40
+ SUCCEEDED = "SUCCEEDED"
41
+ FAILED = "FAILED"
42
+
43
+
44
class LeaseBusy(Exception):
    """Signal that the lease for a run could not be taken right now.

    Raised when another holder owns the run's lease or the broker cannot be
    reached. The HTTP handler is expected to answer with a retryable 503 so
    Cloud Tasks redelivers later, once the holder has finished or its lease
    has expired.

    Attributes:
        run_id: The run whose lease could not be acquired.
    """

    def __init__(self, run_id: str):
        # Keep the run id on the instance so the handler can echo it back.
        self.run_id = run_id
        message = f"lease busy for run_id={run_id}"
        super().__init__(message)
52
+
53
+
54
@dataclass
class AcquireResult:
    """Result of one ``acquire()`` attempt against the lease broker.

    ``outcome`` is ``ACQUIRED`` when the broker granted the lease,
    ``UNAVAILABLE`` when the client could not get an answer, and otherwise the
    refusal reason reported by the broker (``BUSY`` when none is given;
    ``COMPLETED`` marks a run that already finished). The broker's reason is
    passed through verbatim, so values outside this list are possible.
    """

    # ACQUIRED | COMPLETED | UNAVAILABLE | broker refusal reason (BUSY, ...)
    outcome: str

    @property
    def acquired(self) -> bool:
        """True only when the lease was granted and the run may execute."""
        return self.outcome == ACQUIRED

    @property
    def duplicate(self) -> bool:
        """True when the run already completed and must not be re-executed."""
        return self.outcome == COMPLETED

    @property
    def busy(self) -> bool:
        """True for every outcome that is neither a grant nor a completed run.

        Defined by exclusion so that refusal reasons beyond BUSY/UNAVAILABLE
        are still classified as "not now, retry later" instead of falling
        through all three predicates.
        """
        return not (self.acquired or self.duplicate)
69
+
70
+
71
class LeaseBrokerClient:
    """WebSocket client that acquires and releases run leases from the broker.

    A single background task (``_run``) keeps one connection open, reconnects
    with exponential backoff, and re-asserts every locally held run_id after
    each (re)connect. ``acquire`` and ``release`` multiplex over that
    connection; replies are matched to callers by run_id.

    Local bookkeeping:
        _held       run_ids this process currently believes it owns.
        _waiters    run_id -> futures of in-flight ``acquire`` calls.
        _abandoned  run_ids whose ``acquire`` was sent but given up on
                    (timeout / cancellation) before the broker answered.
        _background strong references to fire-and-forget send tasks.
    """

    def __init__(
        self,
        url: str,
        token: str,
        *,
        acquire_timeout: float = 10.0,
        reconnect_min: float = 0.5,
        reconnect_max: float = 10.0,
    ) -> None:
        """Store configuration; no connection is opened until ``ensure_started``.

        Args:
            url: WebSocket URL of the lease broker.
            token: Bearer token sent in the ``Authorization`` header.
            acquire_timeout: Default seconds to wait for the connection and,
                separately, for the broker's reply in ``acquire``.
            reconnect_min: Initial reconnect delay in seconds.
            reconnect_max: Upper bound for the doubled reconnect delay.
        """
        self._url = url
        self._token = token
        self._acquire_timeout = acquire_timeout
        self._reconnect_min = reconnect_min
        self._reconnect_max = reconnect_max

        self._held: Set[str] = set()
        self._waiters: Dict[str, List[asyncio.Future]] = {}
        self._abandoned: Set[str] = set()
        self._background: Set[asyncio.Future] = set()
        self._ws = None
        self._connected = asyncio.Event()
        self._send_lock = asyncio.Lock()
        self._run_task: Optional[asyncio.Task] = None
        self._closing = False

    # ── lifecycle ────────────────────────────────────────────────────────

    def ensure_started(self) -> None:
        """Start the background connect loop once, on the current event loop."""
        if self._run_task is None or self._run_task.done():
            self._run_task = asyncio.ensure_future(self._run())

    async def stop(self) -> None:
        """Stop reconnecting, cancel the connect loop and close the socket."""
        self._closing = True
        if self._run_task is not None:
            self._run_task.cancel()
        ws = self._ws
        if ws is not None:
            try:
                await ws.close()
            except Exception:
                # Shutdown is best-effort; keep the failure visible in debug logs.
                logger.debug("lease client: error while closing socket", exc_info=True)

    async def _run(self) -> None:
        """Connect, pump incoming messages, and reconnect with backoff until closed."""
        backoff = self._reconnect_min
        headers = {"Authorization": f"Bearer {self._token}"}
        while not self._closing:
            try:
                async with websockets.connect(
                    self._url, additional_headers=headers, open_timeout=10
                ) as ws:
                    self._ws = ws
                    backoff = self._reconnect_min
                    if self._held:
                        # Re-assert everything we still believe we own.
                        await self._send({"op": "reassert", "run_ids": list(self._held)})
                    self._connected.set()
                    async for raw in ws:
                        try:
                            self._on_message(json.loads(raw))
                        except Exception:
                            logger.exception("lease client: bad message %r", raw)
            except asyncio.CancelledError:
                raise
            except Exception as e:
                logger.warning("lease broker connection lost: %s", e)
            finally:
                # Only state cleanup lives here. Control flow (break / sleep)
                # is kept out of ``finally`` so a cancellation is neither
                # swallowed nor delayed by the backoff sleep.
                self._connected.clear()
                self._ws = None
                self._fail_pending()
            if self._closing:
                break
            await asyncio.sleep(backoff)
            backoff = min(backoff * 2, self._reconnect_max)

    # ── messaging ──────────────────────────────────────────────────────────

    def _on_message(self, msg: dict) -> None:
        """Dispatch one decoded broker message."""
        op = msg.get("op")
        if op in ("granted", "refused"):
            run_id = msg.get("run_id")
            outcome = ACQUIRED if op == "granted" else msg.get("reason", BUSY)
            if run_id in self._waiters:
                self._resolve(run_id, outcome)
            elif run_id in self._abandoned:
                # Reply to an acquire nobody is waiting for any more. A grant
                # would otherwise stay pinned to this connection with no task
                # ever releasing it, so hand it straight back.
                self._abandoned.discard(run_id)
                if op == "granted":
                    logger.warning(
                        "lease granted after acquire was abandoned; releasing run_id=%s",
                        run_id,
                    )
                    self._spawn(
                        self._send_safe(
                            {"op": "release", "run_id": run_id, "status": FAILED}
                        )
                    )
        elif op == "revoked":
            # Another worker took over (split-brain after a long stall). We can't
            # force-cancel in-flight work; drop ownership and rely on step
            # idempotency. Logged loudly so it's visible.
            run_id = msg.get("run_id")
            self._held.discard(run_id)
            logger.error("lease REVOKED run_id=%s reason=%s", run_id, msg.get("reason"))
        elif op == "ping":
            self._spawn(self._send_safe({"op": "pong"}))

    def _spawn(self, coro) -> None:
        """Run ``coro`` in the background while keeping a strong reference.

        The event loop only holds weak references to tasks, so an otherwise
        unreferenced task may be garbage-collected before it finishes.
        """
        task = asyncio.ensure_future(coro)
        self._background.add(task)
        task.add_done_callback(self._background.discard)

    def _resolve(self, run_id: Optional[str], outcome: str) -> None:
        """Complete every pending waiter registered for ``run_id``."""
        for fut in self._waiters.pop(run_id, []):
            if not fut.done():
                fut.set_result(outcome)

    def _fail_pending(self) -> None:
        """Answer all in-flight acquires with UNAVAILABLE after a disconnect.

        Replies are matched per connection, so a request sent on a dead
        connection can never be answered; failing now spares callers the full
        timeout. Abandoned markers refer to the same dead connection.
        """
        pending, self._waiters = self._waiters, {}
        for futures in pending.values():
            for fut in futures:
                if not fut.done():
                    fut.set_result(UNAVAILABLE)
        self._abandoned.clear()

    def _owned_locally(self, run_id: str) -> bool:
        """True when this process already holds or is acquiring ``run_id``."""
        return run_id in self._held or run_id in self._waiters

    async def _send(self, msg: dict) -> None:
        """Serialize ``msg`` as JSON and send it.

        Raises:
            ConnectionError: when no connection is currently open.
        """
        async with self._send_lock:
            if self._ws is None:
                raise ConnectionError("lease broker not connected")
            await self._ws.send(json.dumps(msg))

    async def _send_safe(self, msg: dict) -> None:
        """Best-effort send: failures are logged at debug level, never raised."""
        try:
            await self._send(msg)
        except Exception:
            logger.debug(
                "lease client: best-effort send failed op=%s", msg.get("op"), exc_info=True
            )

    # ── public API ──────────────────────────────────────────────────────────

    async def acquire(
        self, run_id: str, entry_point: Optional[str] = None, timeout: Optional[float] = None
    ) -> AcquireResult:
        """Ask the broker for the exclusive lease on ``run_id``.

        Args:
            run_id: Identifier of the run to lock.
            entry_point: Optional label forwarded to the broker with the request.
            timeout: Seconds to wait for the connection and, separately, for
                the reply. ``None`` uses the configured default; an explicit
                ``0`` is honoured rather than replaced by the default.

        Returns:
            ``AcquireResult`` carrying ACQUIRED, the broker's refusal reason,
            BUSY when this process already holds or is acquiring the run, or
            UNAVAILABLE when the broker could not be reached or did not answer
            in time.
        """
        self.ensure_started()
        if timeout is None:
            timeout = self._acquire_timeout

        # Replies are matched by run_id only, so a second local request for the
        # same run would be resolved by the first request's grant and both
        # would execute. Refuse locally instead.
        if self._owned_locally(run_id):
            return AcquireResult(BUSY)

        try:
            await asyncio.wait_for(self._connected.wait(), timeout=timeout)
        except asyncio.TimeoutError:
            return AcquireResult(UNAVAILABLE)

        # Re-check: another task may have registered while we were waiting.
        if self._owned_locally(run_id):
            return AcquireResult(BUSY)

        fut: asyncio.Future = asyncio.get_running_loop().create_future()
        # A fresh request supersedes any earlier abandoned one for this run.
        self._abandoned.discard(run_id)
        self._waiters.setdefault(run_id, []).append(fut)
        try:
            try:
                await self._send({"op": "acquire", "run_id": run_id, "entry_point": entry_point})
            except Exception:
                return AcquireResult(UNAVAILABLE)

            try:
                outcome = await asyncio.wait_for(fut, timeout=timeout)
            except asyncio.TimeoutError:
                # The request is still outstanding at the broker; remember it
                # so a late grant is handed back instead of lingering.
                self._abandoned.add(run_id)
                return AcquireResult(UNAVAILABLE)
            except asyncio.CancelledError:
                self._abandoned.add(run_id)
                raise
        finally:
            # Always drop our waiter, including on cancellation, so a stale
            # entry cannot block later acquires of the same run.
            self._discard_waiter(run_id, fut)

        if outcome == ACQUIRED:
            self._held.add(run_id)
        return AcquireResult(outcome)

    async def release(self, run_id: str, status: str = SUCCEEDED) -> None:
        """Give up the lease on ``run_id`` and report the run's final status.

        Args:
            run_id: Run whose lease is being released.
            status: ``SUCCEEDED`` or ``FAILED``; forwarded to the broker.
        """
        self._held.discard(run_id)
        # Best-effort: if the send fails, the broker frees the lease via the
        # grace window when the connection drops anyway.
        await self._send_safe({"op": "release", "run_id": run_id, "status": status})

    def _discard_waiter(self, run_id: str, fut: asyncio.Future) -> None:
        """Remove ``fut`` from the waiters of ``run_id``, pruning empty lists."""
        waiters = self._waiters.get(run_id)
        if waiters and fut in waiters:
            waiters.remove(fut)
            if not waiters:
                self._waiters.pop(run_id, None)
221
+
222
+
223
+ # ── singleton wiring ─────────────────────────────────────────────────────────
224
+
225
+ _client: Optional[LeaseBrokerClient] = None
226
+ _resolved = False
227
+
228
+
229
+ def _derive_broker_url() -> Optional[str]:
230
+ explicit = os.getenv("LEASE_BROKER_URL")
231
+ if explicit:
232
+ return explicit
233
+ api = os.getenv("FLOWSTASH_API_URL") or os.getenv("MANAGED_API_URL")
234
+ if not api:
235
+ return None
236
+ api = api.rstrip("/")
237
+ if api.startswith("https://"):
238
+ return "wss://" + api[len("https://") :] + "/ws/leases"
239
+ if api.startswith("http://"):
240
+ return "ws://" + api[len("http://") :] + "/ws/leases"
241
+ return None
242
+
243
+
244
def get_lease_client() -> Optional[LeaseBrokerClient]:
    """Return the shared lease client for this process, resolving it once.

    The first call decides whether the guard is active and caches that
    decision; later calls return the cached client. ``None`` means the guard
    is off, and callers run without it. The guard is off when
    ``LEASE_BROKER_ENABLED`` is ``0``/``false``/``no``, when ``websockets``
    could not be imported, or when no broker URL or ``MANAGED_AUTH_TOKEN`` is
    configured.
    """
    global _client, _resolved
    if _resolved:
        return _client
    # Mark resolved before deciding so a disabled outcome is cached as well.
    _resolved = True

    built: Optional[LeaseBrokerClient] = None
    switch = os.getenv("LEASE_BROKER_ENABLED", "true").lower()
    if switch in ("0", "false", "no"):
        logger.info("lease broker disabled via LEASE_BROKER_ENABLED")
    elif websockets is None:
        logger.warning("lease broker disabled: 'websockets' not installed")
    else:
        broker_url = _derive_broker_url()
        auth_token = os.getenv("MANAGED_AUTH_TOKEN", "")
        if broker_url and auth_token:
            built = LeaseBrokerClient(broker_url, auth_token)
            # Publish the singleton only on the enabled path.
            _client = built
            logger.info("lease broker enabled: %s", broker_url)
        else:
            logger.info("lease broker disabled: URL/token not configured")
    return built
@@ -14,6 +14,7 @@ import logging
14
14
  import os
15
15
  import sys
16
16
  import traceback
17
+ import uuid
17
18
  from typing import Any
18
19
 
19
20
  from collections import defaultdict
@@ -32,6 +33,14 @@ from flowstash.observability.ingestion import (
32
33
  from flowstash.queue.consumer import TaskConsumer
33
34
 
34
35
  from .task_resolver import _invoke_task_callable, resolve_function
36
+ from .lease_client import (
37
+ get_lease_client,
38
+ ACQUIRED,
39
+ BUSY,
40
+ COMPLETED,
41
+ SUCCEEDED,
42
+ FAILED,
43
+ )
35
44
 
36
45
  logger = logging.getLogger(__name__)
37
46
 
@@ -146,13 +155,16 @@ async def _record_missing_task(task_name: str) -> None:
146
155
  )
147
156
 
148
157
 
149
- async def _run_job_task(func: Any, task_name: str, args: list, kwargs: dict) -> bool:
158
+ async def _run_job_task(
159
+ func: Any, task_name: str, args: list, kwargs: dict, run_id: str | None = None
160
+ ) -> bool:
150
161
  """Execute the resolved task within an integration_context. Returns True on success."""
151
162
  normalized_args = normalize_arguments(func, args, kwargs)
152
163
 
153
164
  with integration_context(
154
165
  integration="managed-job",
155
166
  integration_pipeline=task_name,
167
+ run_id=run_id,
156
168
  record_lifecycle=False,
157
169
  ) as ctx:
158
170
  await record_run_started(
@@ -388,7 +400,11 @@ class ManagedConsumer(TaskConsumer):
388
400
  sys.exit(1)
389
401
 
390
402
  task_name, args, kwargs = _parse_cli_args(argv[1:])
391
- logger.info(f"[run-task] Resolving task: {task_name!r}")
403
+ # Stable run_id injected by the platform at schedule time (FLOWSTASH_RUN_ID).
404
+ # It is identical across Cloud Run Job retries of one execution, so the lease
405
+ # guard collapses duplicate executions of the same logical run.
406
+ run_id = os.getenv("FLOWSTASH_RUN_ID") or str(uuid.uuid4())
407
+ logger.info(f"[run-task] Resolving task: {task_name!r} (run_id={run_id})")
392
408
 
393
409
  try:
394
410
  func = resolve_function(task_name)
@@ -398,10 +414,41 @@ class ManagedConsumer(TaskConsumer):
398
414
  await asyncio.to_thread(AsyncManager.get_instance().flush, 15.0)
399
415
  sys.exit(1)
400
416
 
417
+ # Idempotency guard via the lease broker. Unlike HTTP tasks, a job that
418
+ # exits has no automatic redelivery, so on BUSY/COMPLETED we exit 0 (another
419
+ # execution owns it / it is already done) and on UNAVAILABLE we fail OPEN
420
+ # (run anyway) rather than silently drop the work.
421
+ lease = get_lease_client()
422
+ lease_held = False
423
+ if lease is not None:
424
+ res = await lease.acquire(run_id, entry_point=task_name)
425
+ if res.outcome in (COMPLETED, BUSY):
426
+ logger.info(
427
+ "[run-task] run %s not started (%s) — another execution owns it",
428
+ run_id,
429
+ res.outcome,
430
+ )
431
+ await asyncio.to_thread(AsyncManager.get_instance().flush, 15.0)
432
+ sys.exit(0)
433
+ elif res.outcome == ACQUIRED:
434
+ lease_held = True
435
+ else:
436
+ # UNAVAILABLE (broker unreachable) or RECOVERING (broker restarting).
437
+ # A job has no automatic redelivery, so fail OPEN (run unguarded)
438
+ # rather than silently drop the work; duplicates here are rare.
439
+ logger.warning(
440
+ "[run-task] lease not granted (%s) — running run %s without guard",
441
+ res.outcome,
442
+ run_id,
443
+ )
444
+
401
445
  logger.info(
402
446
  f"[run-task] Executing task: {task_name!r}, args={args}, kwargs={kwargs}"
403
447
  )
404
- success = await _run_job_task(func, task_name, args, kwargs)
448
+ success = await _run_job_task(func, task_name, args, kwargs, run_id=run_id)
449
+
450
+ if lease_held:
451
+ await lease.release(run_id, SUCCEEDED if success else FAILED)
405
452
 
406
453
  await asyncio.to_thread(AsyncManager.get_instance().flush, 15.0)
407
454
  sys.exit(0 if success else 1)
@@ -4,6 +4,7 @@ Task registry, function resolution, and callable dispatch.
4
4
  Shared by the managed job consumer.
5
5
  """
6
6
 
7
+ import asyncio
7
8
  import importlib
8
9
  import inspect
9
10
  import logging
@@ -48,14 +49,23 @@ def resolve_function(func_ref: str) -> Any:
48
49
 
49
50
 
50
51
  async def _invoke_task_callable(func: Any, args: list, kwargs: dict) -> Any:
51
- """Dispatch a task callable regardless of its wrapper type."""
52
+ """Dispatch a task callable regardless of its wrapper type.
53
+
54
+ Sync callables are run via ``asyncio.to_thread`` rather than inline: a
55
+ multi-minute sync task would otherwise block the single worker event loop and
56
+ starve all other concurrent requests on the instance — the very condition that
57
+ triggers Cloud Tasks' connection resets/retries — and would also freeze the
58
+ lease WebSocket. ``asyncio.to_thread`` copies the current ``contextvars``
59
+ context into the worker thread, so the ``integration_context`` / ``run_id``
60
+ remain visible to the task and its observability.
61
+ """
52
62
  if hasattr(func, "run"):
53
63
  return await func.run(*args, **kwargs)
54
64
  if hasattr(func, "func"):
55
65
  underlying = func.func
56
66
  if inspect.iscoroutinefunction(underlying):
57
67
  return await underlying(*args, **kwargs)
58
- return underlying(*args, **kwargs)
68
+ return await asyncio.to_thread(underlying, *args, **kwargs)
59
69
  if inspect.iscoroutinefunction(func):
60
70
  return await func(*args, **kwargs)
61
- return func(*args, **kwargs)
71
+ return await asyncio.to_thread(func, *args, **kwargs)