django-agent-runtime 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- django_agent_runtime/__init__.py +25 -0
- django_agent_runtime/admin.py +155 -0
- django_agent_runtime/api/__init__.py +26 -0
- django_agent_runtime/api/permissions.py +109 -0
- django_agent_runtime/api/serializers.py +114 -0
- django_agent_runtime/api/views.py +472 -0
- django_agent_runtime/apps.py +26 -0
- django_agent_runtime/conf.py +241 -0
- django_agent_runtime/examples/__init__.py +10 -0
- django_agent_runtime/examples/langgraph_adapter.py +164 -0
- django_agent_runtime/examples/langgraph_tools.py +179 -0
- django_agent_runtime/examples/simple_chat.py +69 -0
- django_agent_runtime/examples/tool_agent.py +157 -0
- django_agent_runtime/management/__init__.py +2 -0
- django_agent_runtime/management/commands/__init__.py +2 -0
- django_agent_runtime/management/commands/runagent.py +419 -0
- django_agent_runtime/migrations/0001_initial.py +117 -0
- django_agent_runtime/migrations/0002_persistence_models.py +129 -0
- django_agent_runtime/migrations/0003_persistenceconversation_active_branch_id_and_more.py +212 -0
- django_agent_runtime/migrations/0004_add_anonymous_session_id.py +18 -0
- django_agent_runtime/migrations/__init__.py +2 -0
- django_agent_runtime/models/__init__.py +54 -0
- django_agent_runtime/models/base.py +450 -0
- django_agent_runtime/models/concrete.py +146 -0
- django_agent_runtime/persistence/__init__.py +60 -0
- django_agent_runtime/persistence/helpers.py +148 -0
- django_agent_runtime/persistence/models.py +506 -0
- django_agent_runtime/persistence/stores.py +1191 -0
- django_agent_runtime/runtime/__init__.py +23 -0
- django_agent_runtime/runtime/events/__init__.py +65 -0
- django_agent_runtime/runtime/events/base.py +135 -0
- django_agent_runtime/runtime/events/db.py +129 -0
- django_agent_runtime/runtime/events/redis.py +228 -0
- django_agent_runtime/runtime/events/sync.py +140 -0
- django_agent_runtime/runtime/interfaces.py +475 -0
- django_agent_runtime/runtime/llm/__init__.py +91 -0
- django_agent_runtime/runtime/llm/anthropic.py +249 -0
- django_agent_runtime/runtime/llm/litellm_adapter.py +173 -0
- django_agent_runtime/runtime/llm/openai.py +230 -0
- django_agent_runtime/runtime/queue/__init__.py +75 -0
- django_agent_runtime/runtime/queue/base.py +158 -0
- django_agent_runtime/runtime/queue/postgres.py +248 -0
- django_agent_runtime/runtime/queue/redis_streams.py +336 -0
- django_agent_runtime/runtime/queue/sync.py +277 -0
- django_agent_runtime/runtime/registry.py +186 -0
- django_agent_runtime/runtime/runner.py +540 -0
- django_agent_runtime/runtime/tracing/__init__.py +48 -0
- django_agent_runtime/runtime/tracing/langfuse.py +117 -0
- django_agent_runtime/runtime/tracing/noop.py +36 -0
- django_agent_runtime/urls.py +39 -0
- django_agent_runtime-0.3.6.dist-info/METADATA +723 -0
- django_agent_runtime-0.3.6.dist-info/RECORD +55 -0
- django_agent_runtime-0.3.6.dist-info/WHEEL +5 -0
- django_agent_runtime-0.3.6.dist-info/licenses/LICENSE +22 -0
- django_agent_runtime-0.3.6.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,540 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core runner for executing agent runs.
|
|
3
|
+
|
|
4
|
+
Handles:
|
|
5
|
+
- Claiming runs from queue
|
|
6
|
+
- Executing agent runtimes
|
|
7
|
+
- Heartbeats and lease management
|
|
8
|
+
- Retries and error handling
|
|
9
|
+
- Cancellation
|
|
10
|
+
- Event emission
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import asyncio
|
|
14
|
+
import logging
|
|
15
|
+
import traceback
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from datetime import datetime, timezone
|
|
18
|
+
from typing import Optional
|
|
19
|
+
from uuid import UUID
|
|
20
|
+
|
|
21
|
+
from django.conf import settings as django_settings
|
|
22
|
+
|
|
23
|
+
from django_agent_runtime.conf import runtime_settings, get_event_visibility
|
|
24
|
+
from django_agent_runtime.runtime.interfaces import (
|
|
25
|
+
AgentRuntime,
|
|
26
|
+
EventType,
|
|
27
|
+
Message,
|
|
28
|
+
RunContext,
|
|
29
|
+
RunResult,
|
|
30
|
+
ToolRegistry,
|
|
31
|
+
ErrorInfo,
|
|
32
|
+
)
|
|
33
|
+
from django_agent_runtime.runtime.registry import get_runtime
|
|
34
|
+
from django_agent_runtime.runtime.queue.base import RunQueue, QueuedRun
|
|
35
|
+
from django_agent_runtime.runtime.events.base import EventBus, Event
|
|
36
|
+
|
|
37
|
+
logger = logging.getLogger(__name__)
|
|
38
|
+
|
|
39
|
+
# Check DEBUG mode
|
|
40
|
+
DEBUG = getattr(django_settings, 'DEBUG', False)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def debug_print(msg: str):
    """
    Write a runner diagnostic line to stdout when Django DEBUG is enabled.

    Args:
        msg: Text to print after the ``[agent-runner]`` prefix.
    """
    # DEBUG is captured once at import time from the Django settings.
    if not DEBUG:
        return
    print(f"[agent-runner] {msg}", flush=True)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class RunContextImpl:
    """
    Concrete implementation of RunContext.

    Provided to agent runtimes during execution. It carries the run's inputs
    and exposes helpers to emit events, persist/restore checkpoints and poll
    for cancellation.
    """

    run_id: UUID
    conversation_id: Optional[UUID]
    input_messages: list[Message]
    params: dict
    metadata: dict
    tool_registry: ToolRegistry

    # Internal state
    _event_bus: EventBus = field(repr=False)
    _queue: RunQueue = field(repr=False)
    _worker_id: str = field(repr=False)
    # Sequence number that the next emitted event will carry.
    _seq: int = field(default=0, repr=False)
    # Most recently checkpointed (or loaded) state; None until first use.
    _state: Optional[dict] = field(default=None, repr=False)
    # Minimum number of seconds between two cancellation queries.
    _cancel_check_interval: float = field(default=1.0, repr=False)
    _last_cancel_check: float = field(default=0.0, repr=False)
    _is_cancelled: bool = field(default=False, repr=False)
    # Serialises emit() so concurrent callers (e.g. the runner's heartbeat
    # task and the runtime itself) can never publish two events with the
    # same sequence number.
    _emit_lock: asyncio.Lock = field(
        default_factory=asyncio.Lock, repr=False, compare=False
    )

    async def emit(self, event_type: EventType | str, payload: dict) -> None:
        """
        Emit an event to the event bus.

        Args:
            event_type: An ``EventType`` member or a raw event type string.
            payload: Event payload passed through to the bus unchanged.
        """
        event_type_str = event_type.value if isinstance(event_type, EventType) else event_type

        # Get visibility for this event type
        visibility_level, ui_visible = get_event_visibility(event_type_str)

        # Reading the sequence number, publishing and advancing the counter
        # must happen as one unit: publish() awaits, and without the lock a
        # second emit() could reuse the same seq in the meantime.
        async with self._emit_lock:
            seq = self._seq
            event = Event(
                run_id=self.run_id,
                seq=seq,
                event_type=event_type_str,
                payload=payload,
                timestamp=datetime.now(timezone.utc),
                visibility_level=visibility_level,
                ui_visible=ui_visible,
            )

            if DEBUG:
                # Add detail for specific event types. Compare on the string
                # form so events passed as plain strings are matched as well.
                detail = ""
                if event_type_str == EventType.TOOL_CALL.value:
                    tool_name = payload.get("name", "unknown")
                    tool_args = str(payload.get("arguments", {}))[:80]
                    detail = f" -> {tool_name}({tool_args})"
                elif event_type_str == EventType.ASSISTANT_MESSAGE.value:
                    full_content = str(payload.get("content", ""))
                    detail = f" -> {full_content[:80]}{'...' if len(full_content) > 80 else ''}"

                debug_print(
                    f"Emitting event: type={event_type_str}, seq={seq}, visible={ui_visible}{detail}"
                )

            await self._event_bus.publish(event)
            # Only advance after a successful publish, so a failed publish
            # does not leave a gap in the sequence.
            self._seq = seq + 1

    async def emit_user_message(self, content: str) -> None:
        """
        Emit a message that will always be shown to the user.

        This is a convenience method for emitting assistant messages.

        Args:
            content: The message content to display
        """
        await self.emit(EventType.ASSISTANT_MESSAGE, {"content": content})

    async def emit_error(self, error: str, details: Optional[dict] = None) -> None:
        """
        Emit an error that will be shown to the user.

        This is for runtime errors that should be displayed to users,
        distinct from run.failed which is the final failure event.

        Args:
            error: The error message
            details: Optional additional error details
        """
        await self.emit(EventType.ERROR, {
            "message": error,
            "details": details or {},
        })

    async def checkpoint(self, state: dict) -> None:
        """
        Save a state checkpoint and announce it with a checkpoint event.

        Args:
            state: State to persist; it also becomes the cached state
                returned by ``get_state()``.
        """
        from asgiref.sync import sync_to_async
        from django_agent_runtime.models import AgentCheckpoint

        self._state = state

        @sync_to_async
        def _save() -> int:
            # Get next checkpoint seq
            last = AgentCheckpoint.objects.filter(run_id=self.run_id).order_by("-seq").first()
            next_seq = (last.seq + 1) if last else 0

            AgentCheckpoint.objects.create(
                run_id=self.run_id,
                seq=next_seq,
                state=state,
            )
            return next_seq

        checkpoint_seq = await _save()

        # Also emit checkpoint event, carrying the sequence number of the
        # checkpoint row that was just written.
        await self.emit(EventType.STATE_CHECKPOINT, {"seq": checkpoint_seq})

    async def get_state(self) -> Optional[dict]:
        """
        Get the last checkpointed state.

        Returns:
            The cached state if one is held in memory, otherwise the state of
            the highest-seq checkpoint stored for this run, or None when no
            checkpoint exists.
        """
        if self._state is not None:
            return self._state

        from asgiref.sync import sync_to_async
        from django_agent_runtime.models import AgentCheckpoint

        @sync_to_async
        def _get():
            checkpoint = (
                AgentCheckpoint.objects.filter(run_id=self.run_id)
                .order_by("-seq")
                .first()
            )
            return checkpoint.state if checkpoint else None

        self._state = await _get()
        return self._state

    def cancelled(self) -> bool:
        """Return the cached cancellation flag (no I/O is performed)."""
        return self._is_cancelled

    async def check_cancelled(self) -> bool:
        """
        Async check for cancellation (asks the queue).

        Call this periodically in long-running operations. Queries are
        throttled: within ``_cancel_check_interval`` seconds of the previous
        query the cached flag is returned instead.

        Returns:
            True if cancellation has been requested for this run.
        """
        # We are always inside a coroutine here, so the running loop exists.
        now = asyncio.get_running_loop().time()
        if now - self._last_cancel_check < self._cancel_check_interval:
            return self._is_cancelled

        self._last_cancel_check = now

        self._is_cancelled = await self._queue.is_cancelled(self.run_id)
        return self._is_cancelled
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
class AgentRunner:
    """
    Main runner for executing agent runs.

    Manages the lifecycle of runs including:
    - Claiming from queue
    - Executing with timeout
    - Heartbeat management
    - Error handling and retries
    - Cancellation
    """

    def __init__(
        self,
        worker_id: str,
        queue: RunQueue,
        event_bus: EventBus,
        trace_sink: Optional["TraceSink"] = None,
    ):
        """
        Args:
            worker_id: Identifier passed to the queue as the lease owner.
            queue: Queue used to extend leases, release and requeue runs.
            event_bus: Bus that run events are published to.
            trace_sink: Optional sink notified when a run starts and ends.
        """
        self.worker_id = worker_id
        self.queue = queue
        self.event_bus = event_bus
        self.trace_sink = trace_sink
        self.settings = runtime_settings()

        self._running = False
        self._current_runs: dict[UUID, asyncio.Task] = {}

    async def run_once(self, queued_run: QueuedRun) -> None:
        """
        Execute a single run.

        Resolves the runtime, builds the context, runs the agent under a
        timeout with a heartbeat task alongside, and dispatches to the
        success / timeout / cancellation / error handler. Errors raised
        before the runtime starts release the run as a non-retriable failure.

        Args:
            queued_run: The claimed run to execute.
        """
        run_id = queued_run.run_id
        agent_key = queued_run.agent_key

        print(f"[agent-runner] Starting run {run_id} (agent={agent_key}, attempt={queued_run.attempt})", flush=True)

        # Start tracing
        if self.trace_sink:
            self.trace_sink.start_run(run_id, {"agent_key": agent_key})

        # Outcome reported to the trace sink when the run ends. It stays
        # "failed" unless one of the handlers below completes.
        outcome = "failed"

        try:
            # Get the runtime
            debug_print(f"Getting runtime for agent_key={agent_key}")
            runtime = get_runtime(agent_key)
            debug_print(f"Got runtime: {runtime.__class__.__name__}")

            # Build context
            ctx = await self._build_context(queued_run, runtime)
            debug_print(f"Context built: {len(ctx.input_messages)} messages")
            if DEBUG:
                # Diagnostics only: message content is not guaranteed to be a
                # string, so coerce it instead of letting a log line abort
                # the run.
                for i, msg in enumerate(ctx.input_messages):
                    role = msg.get("role", "unknown")
                    content = msg.get("content", "")
                    if not isinstance(content, str):
                        content = "" if content is None else str(content)
                    # Truncate for readability
                    debug_print(f"  [{i}] {role}: {content[:100]}{'...' if len(content) > 100 else ''}")

            # Emit started event
            await ctx.emit(EventType.RUN_STARTED, {
                "agent_key": agent_key,
                "attempt": queued_run.attempt,
            })

            # Start heartbeat task
            heartbeat_task = asyncio.create_task(
                self._heartbeat_loop(run_id, ctx)
            )

            try:
                # Execute with timeout
                debug_print(f"Calling runtime.run() with timeout={self.settings.RUN_TIMEOUT_SECONDS}s")
                result = await asyncio.wait_for(
                    runtime.run(ctx),
                    timeout=self.settings.RUN_TIMEOUT_SECONDS,
                )

                # Check for cancellation
                if ctx.cancelled():
                    await self._handle_cancellation(run_id, ctx)
                    outcome = "cancelled"
                    return

                # Success!
                await self._handle_success(run_id, ctx, result)
                outcome = "succeeded"

            except asyncio.TimeoutError:
                await self._handle_timeout(run_id, ctx)
                outcome = "timed_out"

            except asyncio.CancelledError:
                # NOTE(review): the cancellation is recorded but not
                # re-raised, so the caller sees run_once() return normally.
                await self._handle_cancellation(run_id, ctx)
                outcome = "cancelled"

            except Exception as e:
                print(f"[agent-runner] Runtime error in run {run_id}: {e}", flush=True)
                traceback.print_exc()
                await self._handle_error(
                    run_id, ctx, runtime, e,
                    attempt=queued_run.attempt,
                    max_attempts=self.settings.DEFAULT_MAX_ATTEMPTS,
                )

            finally:
                heartbeat_task.cancel()
                try:
                    await heartbeat_task
                except asyncio.CancelledError:
                    pass
                except Exception as hb_error:
                    # The heartbeat task itself failed. The run has already
                    # been settled by one of the handlers above, so do not let
                    # this reach the outer handler and release the run again.
                    print(f"[agent-runner] Heartbeat task for run {run_id} failed: {hb_error}", flush=True)

        except Exception as e:
            # Error before run started (e.g., runtime not found)
            print(f"[agent-runner] Failed to start run {run_id}: {e}", flush=True)
            traceback.print_exc()
            await self.queue.release(
                run_id,
                self.worker_id,
                success=False,
                error={
                    "type": type(e).__name__,
                    "message": str(e),
                    "stack": traceback.format_exc(),
                    "retriable": False,
                },
            )

        finally:
            if self.trace_sink:
                self.trace_sink.end_run(run_id, outcome)

    async def _build_context(
        self, queued_run: QueuedRun, runtime: AgentRuntime
    ) -> RunContextImpl:
        """
        Build the run context.

        Args:
            queued_run: Source of the run id, input and metadata.
            runtime: The resolved runtime (currently unused here).

        Returns:
            A context whose event sequence continues from the event bus's
            next sequence number for this run.
        """
        input_data = queued_run.input
        messages = input_data.get("messages", [])
        params = input_data.get("params", {})

        # Get conversation_id from metadata; it may arrive as a string.
        conversation_id = queued_run.metadata.get("conversation_id")
        if conversation_id:
            conversation_id = UUID(conversation_id) if isinstance(conversation_id, str) else conversation_id

        # Build tool registry (could be customized per agent)
        tool_registry = ToolRegistry()

        # Get next sequence number
        seq = await self.event_bus.get_next_seq(queued_run.run_id)

        return RunContextImpl(
            run_id=queued_run.run_id,
            conversation_id=conversation_id,
            input_messages=messages,
            params=params,
            metadata=queued_run.metadata,
            tool_registry=tool_registry,
            _event_bus=self.event_bus,
            _queue=self.queue,
            _worker_id=self.worker_id,
            _seq=seq,
        )

    async def _heartbeat_loop(self, run_id: UUID, ctx: RunContextImpl) -> None:
        """
        Send periodic heartbeats to extend lease.

        Each iteration sleeps for the heartbeat interval, extends the lease,
        emits a heartbeat event and refreshes the context's cancellation
        flag. The loop ends when the lease can no longer be extended or the
        task is cancelled.
        """
        while True:
            await asyncio.sleep(self.settings.HEARTBEAT_INTERVAL_SECONDS)

            # Extend lease
            extended = await self.queue.extend_lease(
                run_id,
                self.worker_id,
                self.settings.LEASE_TTL_SECONDS,
            )

            if not extended:
                print(f"[agent-runner] Lost lease on run {run_id}", flush=True)
                break

            # Emit heartbeat event
            await ctx.emit(EventType.RUN_HEARTBEAT, {})

            # Check for cancellation (updates the flag read by ctx.cancelled())
            await ctx.check_cancelled()

    async def _handle_success(
        self, run_id: UUID, ctx: RunContextImpl, result: RunResult
    ) -> None:
        """
        Handle successful run completion.

        Emits the success event, releases the run with its output and then
        invokes the completion hook.
        """
        print(f"[agent-runner] Run {run_id} succeeded", flush=True)

        output = {
            "final_output": result.final_output,
            "final_messages": result.final_messages,
            "usage": result.usage,
            "artifacts": result.artifacts,
        }

        # Emit success event
        await ctx.emit(EventType.RUN_SUCCEEDED, {
            "output": result.final_output,
            "usage": result.usage,
        })

        # Release with success
        await self.queue.release(
            run_id,
            self.worker_id,
            success=True,
            output=output,
        )

        # Call completion hook if configured
        await self._call_completion_hook(run_id, output)

    async def _call_completion_hook(self, run_id: UUID, output: dict) -> None:
        """
        Call the configured completion hook if any.

        Hook failures are reported on stdout and otherwise ignored, so a
        broken hook cannot turn a successful run into a failure.
        """
        from django_agent_runtime.conf import get_hook

        hook = get_hook(self.settings.RUN_COMPLETED_HOOK)
        if not hook:
            return

        try:
            # Run hook in thread pool since it may do sync I/O
            from asgiref.sync import sync_to_async
            await sync_to_async(hook)(str(run_id), output)
        except Exception as e:
            print(f"[agent-runner] Error in completion hook for run {run_id}: {e}", flush=True)

    async def _handle_timeout(self, run_id: UUID, ctx: RunContextImpl) -> None:
        """Handle run timeout: emit the event and release as non-retriable."""
        print(f"[agent-runner] Run {run_id} timed out after {self.settings.RUN_TIMEOUT_SECONDS}s", flush=True)

        await ctx.emit(EventType.RUN_TIMED_OUT, {
            "timeout_seconds": self.settings.RUN_TIMEOUT_SECONDS,
        })

        await self.queue.release(
            run_id,
            self.worker_id,
            success=False,
            error={
                "type": "TimeoutError",
                "message": f"Run exceeded {self.settings.RUN_TIMEOUT_SECONDS}s timeout",
                "retriable": False,
            },
        )

    async def _handle_cancellation(self, run_id: UUID, ctx: RunContextImpl) -> None:
        """Handle run cancellation: emit the event and mark the run cancelled."""
        print(f"[agent-runner] Run {run_id} cancelled", flush=True)

        await ctx.emit(EventType.RUN_CANCELLED, {})

        # Update status directly (not through queue.release)
        from asgiref.sync import sync_to_async
        from django_agent_runtime.models import AgentRun
        from django_agent_runtime.models.base import RunStatus

        @sync_to_async
        def _update():
            AgentRun.objects.filter(id=run_id).update(
                status=RunStatus.CANCELLED,
                finished_at=datetime.now(timezone.utc),
                lease_owner="",
                lease_expires_at=None,
            )

        await _update()

    async def _handle_error(
        self,
        run_id: UUID,
        ctx: RunContextImpl,
        runtime: AgentRuntime,
        error: Exception,
        attempt: int = 1,
        max_attempts: Optional[int] = None,
    ) -> None:
        """
        Handle run error with retry logic.

        The runtime is asked to classify the error; if it returns nothing the
        error is treated as retriable. Retriable errors below the attempt
        limit are requeued with a backoff delay; otherwise (or if requeueing
        is refused) a final failure event is emitted and the run is released
        as failed.

        Args:
            run_id: Run that failed.
            ctx: Context used to emit events.
            runtime: Runtime whose ``on_error`` classifies the error.
            error: The exception raised by the run.
            attempt: Number of the attempt that just failed.
            max_attempts: Attempt limit; defaults to the configured
                ``DEFAULT_MAX_ATTEMPTS``.
        """
        if max_attempts is None:
            max_attempts = self.settings.DEFAULT_MAX_ATTEMPTS

        print(f"[agent-runner] Run {run_id} failed (attempt {attempt}/{max_attempts}): {error}", flush=True)

        # Let runtime classify the error
        error_info = await runtime.on_error(ctx, error)
        if error_info is None:
            error_info = ErrorInfo(
                type=type(error).__name__,
                message=str(error),
                # Format from the exception object itself rather than the
                # ambient "currently handled" exception, so the stack is
                # right no matter where this method is called from.
                stack="".join(
                    traceback.format_exception(type(error), error, error.__traceback__)
                ),
                retriable=True,
            )

        # Build comprehensive error dict for events and storage
        error_dict = {
            "type": error_info.type,
            "message": error_info.message,
            "stack": error_info.stack,
            "retriable": error_info.retriable,
            "details": error_info.details,
        }

        # Check if we should retry
        can_retry = error_info.retriable and attempt < max_attempts

        if can_retry:
            # Try to requeue
            requeued = await self.queue.requeue_for_retry(
                run_id,
                self.worker_id,
                error_dict,
                delay_seconds=self._calculate_backoff(ctx, attempt),
            )

            if requeued:
                print(f"[agent-runner] Run {run_id} requeued for retry (attempt {attempt + 1})", flush=True)
                # Emit an error event so UI knows about the retry
                await ctx.emit(EventType.ERROR, {
                    "message": f"Error occurred, retrying... (attempt {attempt}/{max_attempts})",
                    "error": error_info.message,
                    "error_type": error_info.type,
                    "attempt": attempt,
                    "max_attempts": max_attempts,
                    "retriable": True,
                })
                return

        # Final failure - emit detailed run.failed event
        await ctx.emit(EventType.RUN_FAILED, {
            "error": error_dict["message"],
            "error_type": error_dict["type"],
            "error_details": error_dict,
            "attempt": attempt,
            "max_attempts": max_attempts,
            "retriable": False,  # No more retries
        })

        await self.queue.release(
            run_id,
            self.worker_id,
            success=False,
            error=error_dict,
        )

    def _calculate_backoff(self, ctx: RunContextImpl, attempt: int = 1) -> int:
        """
        Calculate exponential backoff delay.

        Args:
            ctx: Unused; kept so the signature stays stable.
            attempt: Number of the attempt that just failed.

        Returns:
            ``RETRY_BACKOFF_BASE ** attempt`` capped at ``RETRY_BACKOFF_MAX``,
            truncated to an int.
        """
        base = self.settings.RETRY_BACKOFF_BASE
        max_backoff = self.settings.RETRY_BACKOFF_MAX

        delay = min(base ** attempt, max_backoff)
        return int(delay)
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tracing/observability layer for agent runs.
|
|
3
|
+
|
|
4
|
+
Provides:
|
|
5
|
+
- TraceSink: Abstract interface (from interfaces.py)
|
|
6
|
+
- NoopTraceSink: Default no-op implementation
|
|
7
|
+
- LangfuseTraceSink: Langfuse integration (optional)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from django_agent_runtime.runtime.interfaces import TraceSink
|
|
11
|
+
from django_agent_runtime.runtime.tracing.noop import NoopTraceSink
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"TraceSink",
|
|
15
|
+
"NoopTraceSink",
|
|
16
|
+
"get_trace_sink",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def get_trace_sink() -> TraceSink:
    """
    Build the trace sink selected by the runtime settings.

    Returns:
        A LangfuseTraceSink when ``LANGFUSE_ENABLED`` is set and the sink can
        be imported and constructed; otherwise a NoopTraceSink.
    """
    from django_agent_runtime.conf import runtime_settings

    cfg = runtime_settings()

    # Tracing switched off: nothing else to look at.
    if not cfg.LANGFUSE_ENABLED:
        return NoopTraceSink()

    try:
        # Imported lazily so the optional dependency is only touched on demand.
        from django_agent_runtime.runtime.tracing.langfuse import LangfuseTraceSink

        sink = LangfuseTraceSink(
            public_key=cfg.LANGFUSE_PUBLIC_KEY,
            secret_key=cfg.LANGFUSE_SECRET_KEY,
            host=cfg.LANGFUSE_HOST,
        )
    except ImportError:
        import logging

        log = logging.getLogger(__name__)
        log.warning(
            "Langfuse enabled but langfuse package not installed. Using NoopTraceSink."
        )
        return NoopTraceSink()

    return sink
|
|
48
|
+
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Langfuse trace sink implementation.
|
|
3
|
+
|
|
4
|
+
Langfuse is an open-source LLM observability platform.
|
|
5
|
+
This is an OPTIONAL integration - the core runtime doesn't depend on it.
|
|
6
|
+
|
|
7
|
+
See: https://langfuse.com/
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import logging
from typing import Any, Optional
from uuid import UUID

from django_agent_runtime.runtime.interfaces import TraceSink
|
|
15
|
+
|
|
16
|
+
try:
|
|
17
|
+
from langfuse import Langfuse
|
|
18
|
+
except ImportError:
|
|
19
|
+
Langfuse = None
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class LangfuseTraceSink(TraceSink):
    """
    Langfuse trace sink for LLM observability.

    Sends traces to Langfuse for monitoring, debugging, and analytics.
    Failures while talking to Langfuse are logged as warnings and never
    propagate to the caller.
    """

    # Run outcome -> status string written to the trace when the run ends.
    # Outcomes not listed here are recorded as "UNKNOWN".
    _STATUS_MAP = {
        "succeeded": "SUCCESS",
        "failed": "ERROR",
        "cancelled": "CANCELLED",
        "timed_out": "ERROR",
    }

    def __init__(
        self,
        public_key: Optional[str] = None,
        secret_key: Optional[str] = None,
        host: Optional[str] = None,
    ):
        """
        Args:
            public_key: Passed through to the Langfuse client.
            secret_key: Passed through to the Langfuse client.
            host: Passed through to the Langfuse client.

        Raises:
            ImportError: If the ``langfuse`` package is not installed.
        """
        if Langfuse is None:
            raise ImportError("langfuse package is required for LangfuseTraceSink")

        self._client = Langfuse(
            public_key=public_key,
            secret_key=secret_key,
            host=host,
        )
        # Open traces keyed by run id; values are whatever client.trace() returned.
        self._traces: dict[UUID, Any] = {}

    def start_run(self, run_id: UUID, metadata: dict) -> None:
        """
        Start a new trace in Langfuse.

        The trace is named after ``metadata["agent_key"]`` (falling back to
        ``"agent_run"``) and remembered until ``end_run`` is called.
        """
        try:
            trace = self._client.trace(
                id=str(run_id),
                name=metadata.get("agent_key", "agent_run"),
                metadata=metadata,
            )
            self._traces[run_id] = trace
        except Exception as e:
            logger.warning("Failed to start Langfuse trace: %s", e)

    def log_event(self, run_id: UUID, event_type: str, payload: dict) -> None:
        """
        Log an event to the trace.

        Does nothing when no trace is open for ``run_id``.
        """
        trace = self._traces.get(run_id)
        if trace is None:
            return

        try:
            # Map event types to Langfuse concepts
            if event_type == "assistant.message":
                trace.generation(
                    name="assistant_message",
                    output=payload.get("content", ""),
                    metadata=payload,
                )
            elif event_type == "tool.call":
                trace.span(
                    name=f"tool:{payload.get('name', 'unknown')}",
                    input=payload.get("arguments", {}),
                )
            elif event_type == "tool.result":
                # Tool results are logged as part of the span
                pass
            else:
                # Generic event
                trace.event(
                    name=event_type,
                    metadata=payload,
                )
        except Exception as e:
            logger.warning("Failed to log Langfuse event: %s", e)

    def end_run(self, run_id: UUID, outcome: str, metadata: Optional[dict] = None) -> None:
        """
        End the trace.

        Removes the trace for ``run_id`` from the open set and updates it
        with the mapped status. Does nothing when no trace is open.
        """
        trace = self._traces.pop(run_id, None)
        if trace is None:
            return

        try:
            # Update trace with final status
            trace.update(
                status=self._STATUS_MAP.get(outcome, "UNKNOWN"),
                metadata=metadata or {},
            )
        except Exception as e:
            logger.warning("Failed to end Langfuse trace: %s", e)

    def flush(self) -> None:
        """Flush any buffered traces to Langfuse."""
        try:
            self._client.flush()
        except Exception as e:
            logger.warning("Failed to flush Langfuse traces: %s", e)
|
|
117
|
+
|