django-agent-runtime 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- django_agent_runtime/__init__.py +25 -0
- django_agent_runtime/admin.py +155 -0
- django_agent_runtime/api/__init__.py +26 -0
- django_agent_runtime/api/permissions.py +109 -0
- django_agent_runtime/api/serializers.py +114 -0
- django_agent_runtime/api/views.py +472 -0
- django_agent_runtime/apps.py +26 -0
- django_agent_runtime/conf.py +241 -0
- django_agent_runtime/examples/__init__.py +10 -0
- django_agent_runtime/examples/langgraph_adapter.py +164 -0
- django_agent_runtime/examples/langgraph_tools.py +179 -0
- django_agent_runtime/examples/simple_chat.py +69 -0
- django_agent_runtime/examples/tool_agent.py +157 -0
- django_agent_runtime/management/__init__.py +2 -0
- django_agent_runtime/management/commands/__init__.py +2 -0
- django_agent_runtime/management/commands/runagent.py +419 -0
- django_agent_runtime/migrations/0001_initial.py +117 -0
- django_agent_runtime/migrations/0002_persistence_models.py +129 -0
- django_agent_runtime/migrations/0003_persistenceconversation_active_branch_id_and_more.py +212 -0
- django_agent_runtime/migrations/0004_add_anonymous_session_id.py +18 -0
- django_agent_runtime/migrations/__init__.py +2 -0
- django_agent_runtime/models/__init__.py +54 -0
- django_agent_runtime/models/base.py +450 -0
- django_agent_runtime/models/concrete.py +146 -0
- django_agent_runtime/persistence/__init__.py +60 -0
- django_agent_runtime/persistence/helpers.py +148 -0
- django_agent_runtime/persistence/models.py +506 -0
- django_agent_runtime/persistence/stores.py +1191 -0
- django_agent_runtime/runtime/__init__.py +23 -0
- django_agent_runtime/runtime/events/__init__.py +65 -0
- django_agent_runtime/runtime/events/base.py +135 -0
- django_agent_runtime/runtime/events/db.py +129 -0
- django_agent_runtime/runtime/events/redis.py +228 -0
- django_agent_runtime/runtime/events/sync.py +140 -0
- django_agent_runtime/runtime/interfaces.py +475 -0
- django_agent_runtime/runtime/llm/__init__.py +91 -0
- django_agent_runtime/runtime/llm/anthropic.py +249 -0
- django_agent_runtime/runtime/llm/litellm_adapter.py +173 -0
- django_agent_runtime/runtime/llm/openai.py +230 -0
- django_agent_runtime/runtime/queue/__init__.py +75 -0
- django_agent_runtime/runtime/queue/base.py +158 -0
- django_agent_runtime/runtime/queue/postgres.py +248 -0
- django_agent_runtime/runtime/queue/redis_streams.py +336 -0
- django_agent_runtime/runtime/queue/sync.py +277 -0
- django_agent_runtime/runtime/registry.py +186 -0
- django_agent_runtime/runtime/runner.py +540 -0
- django_agent_runtime/runtime/tracing/__init__.py +48 -0
- django_agent_runtime/runtime/tracing/langfuse.py +117 -0
- django_agent_runtime/runtime/tracing/noop.py +36 -0
- django_agent_runtime/urls.py +39 -0
- django_agent_runtime-0.3.6.dist-info/METADATA +723 -0
- django_agent_runtime-0.3.6.dist-info/RECORD +55 -0
- django_agent_runtime-0.3.6.dist-info/WHEEL +5 -0
- django_agent_runtime-0.3.6.dist-info/licenses/LICENSE +22 -0
- django_agent_runtime-0.3.6.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Queue adapters for distributing agent runs to workers.
|
|
3
|
+
|
|
4
|
+
Provides:
|
|
5
|
+
- RunQueue: Abstract async interface for queue implementations
|
|
6
|
+
- SyncRunQueue: Abstract sync interface for queue implementations
|
|
7
|
+
- PostgresQueue: Async database-backed queue using SELECT FOR UPDATE SKIP LOCKED
|
|
8
|
+
- SyncPostgresQueue: Sync database-backed queue
|
|
9
|
+
- RedisStreamsQueue: Redis Streams-backed queue with consumer groups
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from typing import Union
|
|
13
|
+
|
|
14
|
+
from django_agent_runtime.runtime.queue.base import RunQueue, QueuedRun
|
|
15
|
+
from django_agent_runtime.runtime.queue.postgres import PostgresQueue
|
|
16
|
+
from django_agent_runtime.runtime.queue.sync import SyncRunQueue, SyncPostgresQueue
|
|
17
|
+
|
|
18
|
+
# Names re-exported as the public API of the queue package.
__all__ = [
    # Async
    "RunQueue",
    "QueuedRun",
    "PostgresQueue",
    # Sync
    "SyncRunQueue",
    "SyncPostgresQueue",
    # Factory functions
    "get_queue",
    "get_sync_queue",
]

# RedisStreamsQueue is exported only when its module can be imported.
try:
    from django_agent_runtime.runtime.queue.redis_streams import RedisStreamsQueue
except ImportError:
    pass  # Redis not installed
else:
    __all__.append("RedisStreamsQueue")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def get_queue(backend: str = "postgres", **kwargs) -> RunQueue:
    """
    Build an async queue for the requested backend.

    Args:
        backend: Name of the backend, either "postgres" or "redis_streams"
        **kwargs: Passed through unchanged to the backend's constructor

    Returns:
        RunQueue instance

    Raises:
        ValueError: If ``backend`` is not one of the known names
    """
    if backend == "postgres":
        return PostgresQueue(**kwargs)

    if backend != "redis_streams":
        raise ValueError(f"Unknown queue backend: {backend}")

    # Imported here rather than relying on the module-level conditional import,
    # so a missing Redis dependency surfaces as ImportError at call time.
    from django_agent_runtime.runtime.queue.redis_streams import RedisStreamsQueue

    return RedisStreamsQueue(**kwargs)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def get_sync_queue(backend: str = "postgres", **kwargs) -> SyncRunQueue:
    """
    Build a synchronous queue for the requested backend.

    Args:
        backend: Name of the backend; "postgres" is the only one accepted
        **kwargs: Passed through unchanged to the backend's constructor

    Returns:
        SyncRunQueue instance

    Raises:
        ValueError: If ``backend`` is anything other than "postgres"
    """
    if backend != "postgres":
        raise ValueError(f"Unknown or unsupported sync queue backend: {backend}")

    return SyncPostgresQueue(**kwargs)
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Abstract base class for queue implementations.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from typing import Optional
|
|
9
|
+
from uuid import UUID
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class QueuedRun:
    """
    Snapshot of a run handed to a worker by a queue's ``claim`` call.

    Carries the run ID together with the metadata needed for execution.
    """

    # Identifier of the claimed run
    run_id: UUID
    # Key of the agent the run belongs to
    agent_key: str
    # Attempt counter recorded on the run at claim time
    attempt: int
    # Moment at which the lease granted by the claim runs out
    lease_expires_at: datetime
    # Input payload of the run
    input: dict
    # Metadata stored alongside the run
    metadata: dict
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class RunQueue(ABC):
    """
    Contract that every async run queue backend has to fulfil.

    A queue is responsible for:
    - Claiming runs with leases
    - Extending leases (heartbeats)
    - Releasing runs (success/failure)
    - Recovering expired leases
    """

    @abstractmethod
    async def claim(
        self,
        worker_id: str,
        agent_keys: Optional[list[str]] = None,
        batch_size: int = 1,
    ) -> list[QueuedRun]:
        """
        Take ownership of runs waiting in the queue.

        Args:
            worker_id: Unique identifier for this worker
            agent_keys: Optional filter for specific agent types
            batch_size: Maximum number of runs to claim

        Returns:
            The runs that were claimed; the list may be empty
        """

    @abstractmethod
    async def extend_lease(self, run_id: UUID, worker_id: str, seconds: int) -> bool:
        """
        Heartbeat: push out the lease held on a run.

        Args:
            run_id: Run whose lease is extended
            worker_id: Must match the current lease owner
            seconds: Seconds to extend the lease

        Returns:
            True if extended, False if lease was lost
        """

    @abstractmethod
    async def release(
        self,
        run_id: UUID,
        worker_id: str,
        success: bool,
        output: Optional[dict] = None,
        error: Optional[dict] = None,
    ) -> None:
        """
        Give a run back to the queue once it has completed.

        Args:
            run_id: Run to release
            worker_id: Must match the current lease owner
            success: Whether the run succeeded
            output: Final output (if success)
            error: Error info (if failure)
        """

    @abstractmethod
    async def requeue_for_retry(
        self,
        run_id: UUID,
        worker_id: str,
        error: dict,
        delay_seconds: int = 0,
    ) -> bool:
        """
        Put a run back in the queue so it can be retried.

        Args:
            run_id: Run to requeue
            worker_id: Must match the current lease owner
            error: Error information
            delay_seconds: Delay before the run becomes available

        Returns:
            True if requeued, False if max attempts reached
        """

    @abstractmethod
    async def cancel(self, run_id: UUID) -> bool:
        """
        Flag a run for cancellation.

        Args:
            run_id: Run to cancel

        Returns:
            True if cancellation was requested
        """

    @abstractmethod
    async def is_cancelled(self, run_id: UUID) -> bool:
        """
        Report whether cancellation has been requested for a run.

        Args:
            run_id: Run to check

        Returns:
            True if cancellation was requested
        """

    @abstractmethod
    async def recover_expired_leases(self) -> int:
        """
        Reclaim runs whose lease has expired.

        Meant to be called periodically to handle worker failures.

        Returns:
            Number of runs recovered
        """

    async def close(self) -> None:
        """Close any connections. Override if needed."""
        return None
|
|
158
|
+
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PostgreSQL-backed queue using SELECT FOR UPDATE SKIP LOCKED.
|
|
3
|
+
|
|
4
|
+
This is the baseline queue that works everywhere without Redis.
|
|
5
|
+
Lower throughput than Redis but simpler to deploy.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from datetime import datetime, timedelta, timezone
|
|
9
|
+
from typing import Optional
|
|
10
|
+
from uuid import UUID
|
|
11
|
+
|
|
12
|
+
from asgiref.sync import sync_to_async
|
|
13
|
+
from django.db import transaction
|
|
14
|
+
from django.db.models import F, Q
|
|
15
|
+
|
|
16
|
+
from django_agent_runtime.models import AgentRun
|
|
17
|
+
from django_agent_runtime.models.base import RunStatus
|
|
18
|
+
from django_agent_runtime.runtime.queue.base import RunQueue, QueuedRun
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class PostgresQueue(RunQueue):
    """
    PostgreSQL-backed queue implementation.

    Uses SELECT FOR UPDATE SKIP LOCKED for atomic claiming and for lease
    recovery. The database remains the source of truth: every operation
    reads and writes ``AgentRun`` rows directly, with the blocking ORM calls
    wrapped in ``sync_to_async``.

    Args:
        lease_ttl_seconds: Lifetime in seconds of the lease granted by
            ``claim``.
    """

    def __init__(self, lease_ttl_seconds: int = 30):
        self.lease_ttl_seconds = lease_ttl_seconds

    async def claim(
        self,
        worker_id: str,
        agent_keys: Optional[list[str]] = None,
        batch_size: int = 1,
    ) -> list[QueuedRun]:
        """
        Claim runs using SELECT FOR UPDATE SKIP LOCKED.

        A run is claimable when it is QUEUED, or RUNNING with a lease that
        has already expired. Each claimed run is marked RUNNING, assigned to
        ``worker_id`` and given a lease of ``lease_ttl_seconds``.

        Args:
            worker_id: Unique identifier for this worker
            agent_keys: Optional filter for specific agent types; a missing
                or empty list applies no filter
            batch_size: Maximum number of runs to claim

        Returns:
            List of claimed runs (empty when nothing is claimable or when
            ``batch_size`` is not positive)
        """
        if batch_size <= 0:
            # Nothing requested; this also keeps a negative bound out of the
            # queryset slice below.
            return []

        @sync_to_async
        def _claim():
            now = datetime.now(timezone.utc)
            lease_expires = now + timedelta(seconds=self.lease_ttl_seconds)

            with transaction.atomic():
                # Claimable: queued runs, or running runs whose lease expired.
                # NOTE(review): an expired run reclaimed here keeps its
                # attempt count, unlike recover_expired_leases -- confirm
                # this is intended.
                query = Q(status=RunStatus.QUEUED) | Q(
                    status=RunStatus.RUNNING,
                    lease_expires_at__lt=now,
                )

                queryset = AgentRun.objects.filter(query)

                if agent_keys:
                    queryset = queryset.filter(agent_key__in=agent_keys)

                # SELECT FOR UPDATE SKIP LOCKED: rows locked by another
                # transaction are passed over instead of waited on.
                runs = list(
                    queryset.select_for_update(skip_locked=True)[:batch_size]
                )

                claimed = []
                for run in runs:
                    # Take the lease
                    run.status = RunStatus.RUNNING
                    run.lease_owner = worker_id
                    run.lease_expires_at = lease_expires
                    if run.started_at is None:
                        # Only the first claim stamps the start time
                        run.started_at = now
                    run.save(
                        update_fields=[
                            "status",
                            "lease_owner",
                            "lease_expires_at",
                            "started_at",
                        ]
                    )

                    claimed.append(
                        QueuedRun(
                            run_id=run.id,
                            agent_key=run.agent_key,
                            attempt=run.attempt,
                            lease_expires_at=lease_expires,
                            input=run.input,
                            metadata=run.metadata,
                        )
                    )

                return claimed

        return await _claim()

    async def extend_lease(self, run_id: UUID, worker_id: str, seconds: int) -> bool:
        """
        Extend lease if we still own it.

        The new expiry is ``seconds`` from now, not from the previous expiry.

        Args:
            run_id: Run to extend
            worker_id: Must match the current lease owner
            seconds: Seconds from now until the lease expires

        Returns:
            True if a RUNNING row owned by ``worker_id`` was updated,
            False otherwise
        """

        @sync_to_async
        def _extend():
            now = datetime.now(timezone.utc)
            new_expires = now + timedelta(seconds=seconds)

            updated = AgentRun.objects.filter(
                id=run_id,
                lease_owner=worker_id,
                status=RunStatus.RUNNING,
            ).update(lease_expires_at=new_expires)

            return updated > 0

        return await _extend()

    async def release(
        self,
        run_id: UUID,
        worker_id: str,
        success: bool,
        output: Optional[dict] = None,
        error: Optional[dict] = None,
    ) -> None:
        """
        Release run after completion.

        Marks the run SUCCEEDED or FAILED, stamps ``finished_at`` and clears
        the lease. Nothing is changed when no row matches both ``run_id``
        and ``worker_id``.

        Args:
            run_id: Run to release
            worker_id: Must match the current lease owner
            success: Whether the run succeeded
            output: Final output; stored only when truthy
            error: Error info; stored only when truthy
        """

        @sync_to_async
        def _release():
            now = datetime.now(timezone.utc)

            updates = {
                "status": RunStatus.SUCCEEDED if success else RunStatus.FAILED,
                "finished_at": now,
                "lease_owner": "",
                "lease_expires_at": None,
            }

            # Falsy payloads (None or an empty dict) leave the stored value
            # untouched.
            if output:
                updates["output"] = output
            if error:
                updates["error"] = error

            AgentRun.objects.filter(
                id=run_id,
                lease_owner=worker_id,
            ).update(**updates)

        await _release()

    async def requeue_for_retry(
        self,
        run_id: UUID,
        worker_id: str,
        error: dict,
        delay_seconds: int = 0,
    ) -> bool:
        """
        Requeue for retry if attempts remain.

        Args:
            run_id: Run to requeue
            worker_id: Must match the current lease owner
            error: Error information stored on the run
            delay_seconds: Accepted for interface compatibility; this
                implementation does not apply it, so the run is claimable
                again immediately

        Returns:
            True if requeued; False if max attempts were reached (the run is
            then marked FAILED) or if no run matches ``run_id`` and
            ``worker_id``
        """

        @sync_to_async
        def _requeue():
            with transaction.atomic():
                try:
                    # Row lock held until the transaction commits
                    run = AgentRun.objects.select_for_update().get(
                        id=run_id, lease_owner=worker_id
                    )
                except AgentRun.DoesNotExist:
                    return False

                if run.attempt >= run.max_attempts:
                    # Max attempts reached
                    run.status = RunStatus.FAILED
                    run.error = error
                    run.finished_at = datetime.now(timezone.utc)
                    run.lease_owner = ""
                    run.lease_expires_at = None
                    run.save()
                    return False

                # Requeue with incremented attempt
                # NOTE: delay_seconds is not used here.
                run.status = RunStatus.QUEUED
                run.attempt = F("attempt") + 1
                run.error = error
                run.lease_owner = ""
                run.lease_expires_at = None
                run.save()
                return True

        return await _requeue()

    async def cancel(self, run_id: UUID) -> bool:
        """
        Mark run for cancellation.

        Only stamps ``cancel_requested_at``; the run's status is not changed
        here.

        Args:
            run_id: Run to cancel

        Returns:
            True if a QUEUED or RUNNING run with this id was updated
        """

        @sync_to_async
        def _cancel():
            now = datetime.now(timezone.utc)
            updated = AgentRun.objects.filter(
                id=run_id,
                status__in=[RunStatus.QUEUED, RunStatus.RUNNING],
            ).update(cancel_requested_at=now)
            return updated > 0

        return await _cancel()

    async def is_cancelled(self, run_id: UUID) -> bool:
        """
        Check if cancellation was requested.

        Args:
            run_id: Run to check

        Returns:
            True if the run exists and has ``cancel_requested_at`` set;
            False otherwise, including when the run does not exist
        """

        @sync_to_async
        def _is_cancelled():
            # Existence query instead of loading the whole row
            return AgentRun.objects.filter(
                id=run_id,
                cancel_requested_at__isnull=False,
            ).exists()

        return await _is_cancelled()

    async def recover_expired_leases(self) -> int:
        """
        Recover runs with expired leases.

        RUNNING runs whose lease has expired are either requeued with an
        incremented attempt or, once ``attempt`` has reached
        ``max_attempts``, marked TIMED_OUT. Rows are locked with
        SELECT FOR UPDATE SKIP LOCKED inside one transaction, so a row that
        another transaction currently holds is left to that transaction.

        Returns:
            Number of runs recovered
        """

        @sync_to_async
        def _recover():
            now = datetime.now(timezone.utc)

            with transaction.atomic():
                # Lock the expired rows before touching them so the
                # read-modify-write below cannot interleave with a
                # concurrent claim or another recovery pass.
                expired = list(
                    AgentRun.objects.select_for_update(skip_locked=True).filter(
                        status=RunStatus.RUNNING,
                        lease_expires_at__lt=now,
                    )
                )

                for run in expired:
                    if run.attempt >= run.max_attempts:
                        # Mark as timed out
                        run.status = RunStatus.TIMED_OUT
                        run.finished_at = now
                        run.error = {
                            "type": "LeaseExpired",
                            "message": "Worker lease expired without completion",
                            "retriable": False,
                        }
                    else:
                        # Requeue for retry
                        run.status = RunStatus.QUEUED
                        run.attempt += 1

                    run.lease_owner = ""
                    run.lease_expires_at = None
                    run.save()

            return len(expired)

        return await _recover()
|