django-agent-runtime 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- django_agent_runtime/__init__.py +25 -0
- django_agent_runtime/admin.py +155 -0
- django_agent_runtime/api/__init__.py +26 -0
- django_agent_runtime/api/permissions.py +109 -0
- django_agent_runtime/api/serializers.py +114 -0
- django_agent_runtime/api/views.py +472 -0
- django_agent_runtime/apps.py +26 -0
- django_agent_runtime/conf.py +241 -0
- django_agent_runtime/examples/__init__.py +10 -0
- django_agent_runtime/examples/langgraph_adapter.py +164 -0
- django_agent_runtime/examples/langgraph_tools.py +179 -0
- django_agent_runtime/examples/simple_chat.py +69 -0
- django_agent_runtime/examples/tool_agent.py +157 -0
- django_agent_runtime/management/__init__.py +2 -0
- django_agent_runtime/management/commands/__init__.py +2 -0
- django_agent_runtime/management/commands/runagent.py +419 -0
- django_agent_runtime/migrations/0001_initial.py +117 -0
- django_agent_runtime/migrations/0002_persistence_models.py +129 -0
- django_agent_runtime/migrations/0003_persistenceconversation_active_branch_id_and_more.py +212 -0
- django_agent_runtime/migrations/0004_add_anonymous_session_id.py +18 -0
- django_agent_runtime/migrations/__init__.py +2 -0
- django_agent_runtime/models/__init__.py +54 -0
- django_agent_runtime/models/base.py +450 -0
- django_agent_runtime/models/concrete.py +146 -0
- django_agent_runtime/persistence/__init__.py +60 -0
- django_agent_runtime/persistence/helpers.py +148 -0
- django_agent_runtime/persistence/models.py +506 -0
- django_agent_runtime/persistence/stores.py +1191 -0
- django_agent_runtime/runtime/__init__.py +23 -0
- django_agent_runtime/runtime/events/__init__.py +65 -0
- django_agent_runtime/runtime/events/base.py +135 -0
- django_agent_runtime/runtime/events/db.py +129 -0
- django_agent_runtime/runtime/events/redis.py +228 -0
- django_agent_runtime/runtime/events/sync.py +140 -0
- django_agent_runtime/runtime/interfaces.py +475 -0
- django_agent_runtime/runtime/llm/__init__.py +91 -0
- django_agent_runtime/runtime/llm/anthropic.py +249 -0
- django_agent_runtime/runtime/llm/litellm_adapter.py +173 -0
- django_agent_runtime/runtime/llm/openai.py +230 -0
- django_agent_runtime/runtime/queue/__init__.py +75 -0
- django_agent_runtime/runtime/queue/base.py +158 -0
- django_agent_runtime/runtime/queue/postgres.py +248 -0
- django_agent_runtime/runtime/queue/redis_streams.py +336 -0
- django_agent_runtime/runtime/queue/sync.py +277 -0
- django_agent_runtime/runtime/registry.py +186 -0
- django_agent_runtime/runtime/runner.py +540 -0
- django_agent_runtime/runtime/tracing/__init__.py +48 -0
- django_agent_runtime/runtime/tracing/langfuse.py +117 -0
- django_agent_runtime/runtime/tracing/noop.py +36 -0
- django_agent_runtime/urls.py +39 -0
- django_agent_runtime-0.3.6.dist-info/METADATA +723 -0
- django_agent_runtime-0.3.6.dist-info/RECORD +55 -0
- django_agent_runtime-0.3.6.dist-info/WHEEL +5 -0
- django_agent_runtime-0.3.6.dist-info/licenses/LICENSE +22 -0
- django_agent_runtime-0.3.6.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,336 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Redis Streams-backed queue with consumer groups.
|
|
3
|
+
|
|
4
|
+
Higher throughput than Postgres queue, recommended for production.
|
|
5
|
+
Database remains authoritative - Redis is used for distribution only.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
from datetime import datetime, timedelta, timezone
|
|
10
|
+
from typing import Optional
|
|
11
|
+
from uuid import UUID
|
|
12
|
+
|
|
13
|
+
from asgiref.sync import sync_to_async
|
|
14
|
+
from django.db import transaction
|
|
15
|
+
|
|
16
|
+
from django_agent_runtime.models import AgentRun
|
|
17
|
+
from django_agent_runtime.models.base import RunStatus
|
|
18
|
+
from django_agent_runtime.runtime.queue.base import RunQueue, QueuedRun
|
|
19
|
+
|
|
20
|
+
try:
|
|
21
|
+
import redis.asyncio as aioredis
|
|
22
|
+
except ImportError:
|
|
23
|
+
aioredis = None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class RedisStreamsQueue(RunQueue):
    """
    Redis Streams-backed queue implementation.

    Uses consumer groups for distributed processing.
    Database is still the source of truth - Redis handles distribution:
    a stream entry is only a wake-up signal, and every state transition
    (claim, release, retry, cancel) is recorded on the AgentRun row.
    """

    # Defaults; overridable per-instance via __init__.
    STREAM_KEY = "agent_runtime:runs"
    GROUP_NAME = "agent_workers"

    def __init__(
        self,
        redis_url: str,
        lease_ttl_seconds: int = 30,
        stream_key: Optional[str] = None,
        group_name: Optional[str] = None,
    ):
        """
        Args:
            redis_url: Connection URL passed to ``redis.asyncio.from_url``.
            lease_ttl_seconds: How long a claim stays valid before another
                worker may recover the run.
            stream_key: Override for the stream name (defaults to STREAM_KEY).
            group_name: Override for the consumer group (defaults to GROUP_NAME).

        Raises:
            ImportError: If the optional ``redis`` package is not installed.
        """
        if aioredis is None:
            raise ImportError("redis package is required for RedisStreamsQueue")

        self.redis_url = redis_url
        self.lease_ttl_seconds = lease_ttl_seconds
        self.stream_key = stream_key or self.STREAM_KEY
        self.group_name = group_name or self.GROUP_NAME
        self._redis: Optional[aioredis.Redis] = None  # lazily created connection

    async def _get_redis(self) -> "aioredis.Redis":
        """Get or create the Redis connection, ensuring the consumer group exists."""
        if self._redis is None:
            self._redis = aioredis.from_url(self.redis_url)
            # Ensure consumer group exists; mkstream=True also creates the
            # stream itself if it does not exist yet.
            try:
                await self._redis.xgroup_create(
                    self.stream_key, self.group_name, id="0", mkstream=True
                )
            except aioredis.ResponseError as e:
                # BUSYGROUP means the group already exists - that's fine.
                if "BUSYGROUP" not in str(e):
                    raise
        return self._redis

    async def enqueue(self, run_id: UUID, agent_key: str) -> None:
        """
        Add a run to the stream.

        Called when a new run is created (and again on retry/recovery).
        """
        redis = await self._get_redis()
        await redis.xadd(
            self.stream_key,
            {"run_id": str(run_id), "agent_key": agent_key},
        )

    async def claim(
        self,
        worker_id: str,
        agent_keys: Optional[list[str]] = None,
        batch_size: int = 1,
    ) -> list[QueuedRun]:
        """Claim runs from the stream using consumer groups.

        Blocks up to one second waiting for new messages, then attempts
        to take a DB lease for each message read. Every message read is
        acknowledged, whether or not the DB claim succeeded, so the
        stream never redelivers it to this group.

        NOTE(review): when ``agent_keys`` filters a message out it is
        still acknowledged, so no other worker in the group will ever
        see that run - it only recovers via lease expiry. Confirm this
        is intended for heterogeneous worker pools.
        """
        redis = await self._get_redis()
        now = datetime.now(timezone.utc)
        lease_expires = now + timedelta(seconds=self.lease_ttl_seconds)

        # Read from consumer group (">" = only messages never delivered
        # to any consumer of this group).
        messages = await redis.xreadgroup(
            self.group_name,
            worker_id,
            {self.stream_key: ">"},
            count=batch_size,
            block=1000,  # 1 second block
        )

        if not messages:
            return []

        claimed = []
        for stream_name, stream_messages in messages:
            for msg_id, data in stream_messages:
                # Values arrive as bytes (connection is not decode_responses).
                run_id = UUID(data[b"run_id"].decode())
                agent_key = data[b"agent_key"].decode()

                # Filter by agent_keys if specified
                if agent_keys and agent_key not in agent_keys:
                    # Acknowledge but don't process
                    await redis.xack(self.stream_key, self.group_name, msg_id)
                    continue

                # Update database with lease
                run = await self._claim_in_db(run_id, worker_id, lease_expires)
                if run:
                    claimed.append(run)
                    # Acknowledge the message
                    await redis.xack(self.stream_key, self.group_name, msg_id)
                else:
                    # Run not found or already claimed, acknowledge anyway
                    await redis.xack(self.stream_key, self.group_name, msg_id)

        return claimed

    @sync_to_async
    def _claim_in_db(
        self, run_id: UUID, worker_id: str, lease_expires: datetime
    ) -> Optional[QueuedRun]:
        """Claim run in database.

        Returns the QueuedRun on success, or None when the run is
        missing/terminal, row-locked by another worker, or already held
        under an unexpired lease.
        """
        # Local import avoids touching the module's top-level import block.
        from django.db import DatabaseError

        now = datetime.now(timezone.utc)

        with transaction.atomic():
            try:
                run = AgentRun.objects.select_for_update(nowait=True).get(
                    id=run_id,
                    status__in=[RunStatus.QUEUED, RunStatus.RUNNING],
                )
            except AgentRun.DoesNotExist:
                # Run deleted or already in a terminal state.
                return None
            except DatabaseError:
                # nowait=True raises when another worker holds the row
                # lock; treat as "not claimable". (Previously this was a
                # blanket `except Exception`, which also hid real bugs.)
                return None

            # Check if already claimed by another worker
            if run.status == RunStatus.RUNNING and run.lease_expires_at > now:
                return None

            run.status = RunStatus.RUNNING
            run.lease_owner = worker_id
            run.lease_expires_at = lease_expires
            if run.started_at is None:
                run.started_at = now
            run.save()

            return QueuedRun(
                run_id=run.id,
                agent_key=run.agent_key,
                attempt=run.attempt,
                lease_expires_at=lease_expires,
                input=run.input,
                metadata=run.metadata,
            )

    async def extend_lease(self, run_id: UUID, worker_id: str, seconds: int) -> bool:
        """Extend lease in database. Returns True if we still owned it."""

        @sync_to_async
        def _extend():
            now = datetime.now(timezone.utc)
            new_expires = now + timedelta(seconds=seconds)

            # Conditional UPDATE: only matches while this worker still
            # holds the lease on a RUNNING run.
            updated = AgentRun.objects.filter(
                id=run_id,
                lease_owner=worker_id,
                status=RunStatus.RUNNING,
            ).update(lease_expires_at=new_expires)

            return updated > 0

        return await _extend()

    async def release(
        self,
        run_id: UUID,
        worker_id: str,
        success: bool,
        output: Optional[dict] = None,
        error: Optional[dict] = None,
    ) -> None:
        """Release run after completion, recording result and clearing the lease."""

        @sync_to_async
        def _release():
            now = datetime.now(timezone.utc)

            updates = {
                "status": RunStatus.SUCCEEDED if success else RunStatus.FAILED,
                "finished_at": now,
                "lease_owner": "",
                "lease_expires_at": None,
            }

            # NOTE(review): a falsy (e.g. empty-dict) output/error is not
            # persisted - confirm that is intentional.
            if output:
                updates["output"] = output
            if error:
                updates["error"] = error

            # Conditional on lease_owner so a stale worker cannot clobber
            # a run that was reclaimed by someone else.
            AgentRun.objects.filter(
                id=run_id,
                lease_owner=worker_id,
            ).update(**updates)

        await _release()

    async def requeue_for_retry(
        self,
        run_id: UUID,
        worker_id: str,
        error: dict,
        delay_seconds: int = 0,
    ) -> bool:
        """Requeue for retry - re-add to stream.

        Returns True if the run was requeued, False if attempts were
        exhausted (run marked FAILED) or we no longer own the lease.
        ``delay_seconds`` is currently ignored by this backend.
        """

        @sync_to_async
        def _check_and_update():
            with transaction.atomic():
                try:
                    run = AgentRun.objects.select_for_update().get(
                        id=run_id, lease_owner=worker_id
                    )
                except AgentRun.DoesNotExist:
                    return None

                if run.attempt >= run.max_attempts:
                    # Out of attempts: terminal failure, do not requeue.
                    run.status = RunStatus.FAILED
                    run.error = error
                    run.finished_at = datetime.now(timezone.utc)
                    run.lease_owner = ""
                    run.lease_expires_at = None
                    run.save()
                    return None

                run.status = RunStatus.QUEUED
                run.attempt += 1
                run.error = error
                run.lease_owner = ""
                run.lease_expires_at = None
                run.save()
                return run.agent_key

        agent_key = await _check_and_update()
        if agent_key:
            # Re-add to stream
            await self.enqueue(run_id, agent_key)
            return True
        return False

    async def cancel(self, run_id: UUID) -> bool:
        """Mark run for cancellation. Returns True if a live run was marked."""

        @sync_to_async
        def _cancel():
            now = datetime.now(timezone.utc)
            updated = AgentRun.objects.filter(
                id=run_id,
                status__in=[RunStatus.QUEUED, RunStatus.RUNNING],
            ).update(cancel_requested_at=now)
            return updated > 0

        return await _cancel()

    async def is_cancelled(self, run_id: UUID) -> bool:
        """Check if cancellation was requested for the run."""

        @sync_to_async
        def _is_cancelled():
            try:
                run = AgentRun.objects.get(id=run_id)
                return run.cancel_requested_at is not None
            except AgentRun.DoesNotExist:
                return False

        return await _is_cancelled()

    async def recover_expired_leases(self) -> int:
        """Recover runs with expired leases and re-add them to the stream.

        Runs with attempts left go back to QUEUED (attempt incremented)
        and are re-enqueued; exhausted runs are marked TIMED_OUT.
        Returns the number of runs re-enqueued.

        NOTE(review): the per-run update is not row-locked, so two
        recovery loops running concurrently could double-increment or
        double-enqueue - confirm only one recovery loop runs at a time.
        """
        # Ensure the connection and consumer group exist before any
        # re-enqueue below.
        await self._get_redis()

        @sync_to_async
        def _get_expired():
            now = datetime.now(timezone.utc)
            return list(
                AgentRun.objects.filter(
                    status=RunStatus.RUNNING,
                    lease_expires_at__lt=now,
                ).values("id", "agent_key", "attempt", "max_attempts")
            )

        expired = await _get_expired()

        @sync_to_async
        def _update_run(run_data):
            now = datetime.now(timezone.utc)
            run = AgentRun.objects.get(id=run_data["id"])

            if run_data["attempt"] >= run_data["max_attempts"]:
                # No attempts left: terminal timeout.
                run.status = RunStatus.TIMED_OUT
                run.finished_at = now
                run.error = {
                    "type": "LeaseExpired",
                    "message": "Worker lease expired without completion",
                    "retriable": False,
                }
                requeue = False
            else:
                run.status = RunStatus.QUEUED
                run.attempt += 1
                requeue = True

            run.lease_owner = ""
            run.lease_expires_at = None
            run.save()
            return requeue

        count = 0
        for run_data in expired:
            requeue = await _update_run(run_data)
            if requeue:
                await self.enqueue(run_data["id"], run_data["agent_key"])
                count += 1

        return count

    async def close(self) -> None:
        """Close Redis connection."""
        if self._redis:
            await self._redis.close()
            self._redis = None
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Synchronous queue implementations.
|
|
3
|
+
|
|
4
|
+
These are for use in sync contexts like management commands, Celery tasks,
|
|
5
|
+
and traditional Django views.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from datetime import datetime, timedelta, timezone
|
|
10
|
+
from typing import Optional
|
|
11
|
+
from uuid import UUID
|
|
12
|
+
|
|
13
|
+
from django.db import transaction
|
|
14
|
+
from django.db.models import F, Q
|
|
15
|
+
|
|
16
|
+
from django_agent_runtime.models import AgentRun
|
|
17
|
+
from django_agent_runtime.models.base import RunStatus
|
|
18
|
+
from django_agent_runtime.runtime.queue.base import QueuedRun
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SyncRunQueue(ABC):
    """
    Synchronous interface for run queue implementations.

    Blocking counterpart to the async queue API, intended for contexts
    without an event loop: management commands, Celery tasks, and
    traditional Django views.
    """

    @abstractmethod
    def claim(
        self,
        worker_id: str,
        agent_keys: Optional[list[str]] = None,
        batch_size: int = 1,
    ) -> list[QueuedRun]:
        """Atomically take up to ``batch_size`` pending runs for this worker."""

    @abstractmethod
    def extend_lease(self, run_id: UUID, worker_id: str, seconds: int) -> bool:
        """Heartbeat: push the lease expiry forward while work continues."""

    @abstractmethod
    def release(
        self,
        run_id: UUID,
        worker_id: str,
        success: bool,
        output: Optional[dict] = None,
        error: Optional[dict] = None,
    ) -> None:
        """Record the final result of a run and give up its lease."""

    @abstractmethod
    def requeue_for_retry(
        self,
        run_id: UUID,
        worker_id: str,
        error: dict,
        delay_seconds: int = 0,
    ) -> bool:
        """Put a failed run back on the queue for another attempt."""

    @abstractmethod
    def cancel(self, run_id: UUID) -> bool:
        """Request cancellation of a queued or running run."""

    @abstractmethod
    def is_cancelled(self, run_id: UUID) -> bool:
        """Report whether cancellation has been requested for a run."""

    @abstractmethod
    def recover_expired_leases(self) -> int:
        """Reclaim runs whose workers stopped renewing their leases."""

    def close(self) -> None:
        """Close any connections. Subclasses override when they hold resources."""
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class SyncPostgresQueue(SyncRunQueue):
    """
    Synchronous PostgreSQL-backed queue implementation.

    Uses SELECT FOR UPDATE SKIP LOCKED for atomic claiming.
    """

    def __init__(self, lease_ttl_seconds: int = 30):
        """
        Args:
            lease_ttl_seconds: How long a claim stays valid before
                another worker may recover the run.
        """
        self.lease_ttl_seconds = lease_ttl_seconds

    def claim(
        self,
        worker_id: str,
        agent_keys: Optional[list[str]] = None,
        batch_size: int = 1,
    ) -> list[QueuedRun]:
        """Claim runs using SELECT FOR UPDATE SKIP LOCKED.

        Claimable runs are QUEUED, or RUNNING with an expired lease
        (their worker died). Returns at most ``batch_size`` runs.
        """
        now = datetime.now(timezone.utc)
        lease_expires = now + timedelta(seconds=self.lease_ttl_seconds)

        with transaction.atomic():
            # Build query for claimable runs
            query = Q(status=RunStatus.QUEUED) | Q(
                status=RunStatus.RUNNING,
                lease_expires_at__lt=now,  # Expired lease
            )

            queryset = AgentRun.objects.filter(query)

            if agent_keys:
                queryset = queryset.filter(agent_key__in=agent_keys)

            # SELECT FOR UPDATE SKIP LOCKED: concurrent workers skip rows
            # already locked by someone else instead of blocking on them.
            runs = list(queryset.select_for_update(skip_locked=True)[:batch_size])

            claimed = []
            for run in runs:
                # Take the lease.
                run.status = RunStatus.RUNNING
                run.lease_owner = worker_id
                run.lease_expires_at = lease_expires
                if run.started_at is None:
                    run.started_at = now
                run.save(
                    update_fields=[
                        "status",
                        "lease_owner",
                        "lease_expires_at",
                        "started_at",
                    ]
                )

                claimed.append(
                    QueuedRun(
                        run_id=run.id,
                        agent_key=run.agent_key,
                        attempt=run.attempt,
                        lease_expires_at=lease_expires,
                        input=run.input,
                        metadata=run.metadata,
                    )
                )

            return claimed

    def extend_lease(self, run_id: UUID, worker_id: str, seconds: int) -> bool:
        """Extend lease if we still own it. Returns True on success."""
        now = datetime.now(timezone.utc)
        new_expires = now + timedelta(seconds=seconds)

        # Conditional UPDATE: only matches while this worker still holds
        # the lease on a RUNNING run.
        updated = AgentRun.objects.filter(
            id=run_id,
            lease_owner=worker_id,
            status=RunStatus.RUNNING,
        ).update(lease_expires_at=new_expires)

        return updated > 0

    def release(
        self,
        run_id: UUID,
        worker_id: str,
        success: bool,
        output: Optional[dict] = None,
        error: Optional[dict] = None,
    ) -> None:
        """Release run after completion, recording result and clearing the lease."""
        now = datetime.now(timezone.utc)

        updates = {
            "status": RunStatus.SUCCEEDED if success else RunStatus.FAILED,
            "finished_at": now,
            "lease_owner": "",
            "lease_expires_at": None,
        }

        # NOTE(review): a falsy (e.g. empty-dict) output/error is not
        # persisted - confirm that is intentional.
        if output:
            updates["output"] = output
        if error:
            updates["error"] = error

        # Conditional on lease_owner so a stale worker cannot clobber a
        # run that was reclaimed by someone else.
        AgentRun.objects.filter(
            id=run_id,
            lease_owner=worker_id,
        ).update(**updates)

    def requeue_for_retry(
        self,
        run_id: UUID,
        worker_id: str,
        error: dict,
        delay_seconds: int = 0,
    ) -> bool:
        """Requeue for retry if attempts remain.

        Returns True if the run was requeued, False if attempts were
        exhausted (run marked FAILED) or we no longer own the lease.
        ``delay_seconds`` is currently ignored by this backend.
        """
        with transaction.atomic():
            try:
                run = AgentRun.objects.select_for_update().get(
                    id=run_id, lease_owner=worker_id
                )
            except AgentRun.DoesNotExist:
                return False

            if run.attempt >= run.max_attempts:
                # Max attempts reached
                run.status = RunStatus.FAILED
                run.error = error
                run.finished_at = datetime.now(timezone.utc)
                run.lease_owner = ""
                run.lease_expires_at = None
                run.save()
                return False

            # Requeue with incremented attempt. The row is locked above,
            # so a plain increment is race-free; previously this used
            # F("attempt") + 1, which left a CombinedExpression on the
            # in-memory instance after save() and diverged from the
            # other queue backends.
            run.status = RunStatus.QUEUED
            run.attempt += 1
            run.error = error
            run.lease_owner = ""
            run.lease_expires_at = None
            run.save()
            return True

    def cancel(self, run_id: UUID) -> bool:
        """Mark run for cancellation. Returns True if a live run was marked."""
        now = datetime.now(timezone.utc)
        updated = AgentRun.objects.filter(
            id=run_id,
            status__in=[RunStatus.QUEUED, RunStatus.RUNNING],
        ).update(cancel_requested_at=now)
        return updated > 0

    def is_cancelled(self, run_id: UUID) -> bool:
        """Check if cancellation was requested for the run."""
        try:
            run = AgentRun.objects.get(id=run_id)
            return run.cancel_requested_at is not None
        except AgentRun.DoesNotExist:
            return False

    def recover_expired_leases(self) -> int:
        """Recover runs with expired leases.

        Runs with attempts left go back to QUEUED (attempt incremented);
        exhausted runs are marked TIMED_OUT. Returns the number of runs
        processed.

        NOTE(review): the per-run update is not row-locked or wrapped in
        a transaction, so concurrent recovery loops could race - confirm
        only one recovery loop runs at a time.
        """
        now = datetime.now(timezone.utc)

        # Find runs with expired leases
        expired = AgentRun.objects.filter(
            status=RunStatus.RUNNING,
            lease_expires_at__lt=now,
        )

        count = 0
        for run in expired:
            if run.attempt >= run.max_attempts:
                # Mark as timed out
                run.status = RunStatus.TIMED_OUT
                run.finished_at = now
                run.error = {
                    "type": "LeaseExpired",
                    "message": "Worker lease expired without completion",
                    "retriable": False,
                }
            else:
                # Requeue for retry
                run.status = RunStatus.QUEUED
                run.attempt += 1

            run.lease_owner = ""
            run.lease_expires_at = None
            run.save()
            count += 1

        return count
|
|
277
|
+
|