daita-agents 0.1.0 (daita_agents-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of daita-agents might be problematic.
- daita/__init__.py +208 -0
- daita/agents/__init__.py +33 -0
- daita/agents/base.py +722 -0
- daita/agents/substrate.py +895 -0
- daita/cli/__init__.py +145 -0
- daita/cli/__main__.py +7 -0
- daita/cli/ascii_art.py +44 -0
- daita/cli/core/__init__.py +0 -0
- daita/cli/core/create.py +254 -0
- daita/cli/core/deploy.py +473 -0
- daita/cli/core/deployments.py +309 -0
- daita/cli/core/import_detector.py +219 -0
- daita/cli/core/init.py +382 -0
- daita/cli/core/logs.py +239 -0
- daita/cli/core/managed_deploy.py +709 -0
- daita/cli/core/run.py +648 -0
- daita/cli/core/status.py +421 -0
- daita/cli/core/test.py +239 -0
- daita/cli/core/webhooks.py +172 -0
- daita/cli/main.py +588 -0
- daita/cli/utils.py +541 -0
- daita/config/__init__.py +62 -0
- daita/config/base.py +159 -0
- daita/config/settings.py +184 -0
- daita/core/__init__.py +262 -0
- daita/core/decision_tracing.py +701 -0
- daita/core/exceptions.py +480 -0
- daita/core/focus.py +251 -0
- daita/core/interfaces.py +76 -0
- daita/core/plugin_tracing.py +550 -0
- daita/core/relay.py +695 -0
- daita/core/reliability.py +381 -0
- daita/core/scaling.py +444 -0
- daita/core/tools.py +402 -0
- daita/core/tracing.py +770 -0
- daita/core/workflow.py +1084 -0
- daita/display/__init__.py +1 -0
- daita/display/console.py +160 -0
- daita/execution/__init__.py +58 -0
- daita/execution/client.py +856 -0
- daita/execution/exceptions.py +92 -0
- daita/execution/models.py +317 -0
- daita/llm/__init__.py +60 -0
- daita/llm/anthropic.py +166 -0
- daita/llm/base.py +373 -0
- daita/llm/factory.py +101 -0
- daita/llm/gemini.py +152 -0
- daita/llm/grok.py +114 -0
- daita/llm/mock.py +135 -0
- daita/llm/openai.py +109 -0
- daita/plugins/__init__.py +141 -0
- daita/plugins/base.py +37 -0
- daita/plugins/base_db.py +167 -0
- daita/plugins/elasticsearch.py +844 -0
- daita/plugins/mcp.py +481 -0
- daita/plugins/mongodb.py +510 -0
- daita/plugins/mysql.py +351 -0
- daita/plugins/postgresql.py +331 -0
- daita/plugins/redis_messaging.py +500 -0
- daita/plugins/rest.py +529 -0
- daita/plugins/s3.py +761 -0
- daita/plugins/slack.py +729 -0
- daita/utils/__init__.py +18 -0
- daita_agents-0.1.0.dist-info/METADATA +350 -0
- daita_agents-0.1.0.dist-info/RECORD +69 -0
- daita_agents-0.1.0.dist-info/WHEEL +5 -0
- daita_agents-0.1.0.dist-info/entry_points.txt +2 -0
- daita_agents-0.1.0.dist-info/licenses/LICENSE +56 -0
- daita_agents-0.1.0.dist-info/top_level.txt +1 -0
daita/agents/base.py
ADDED
@@ -0,0 +1,722 @@
"""
Updated BaseAgent with Unified Tracing Integration

This replaces the old BaseAgent to use the new unified tracing system.
All operations are automatically traced without user configuration.

Key Changes:
- Removed old metrics system completely
- Integrated automatic tracing for all operations
- Added decision tracing for retry logic
- Automatic agent lifecycle tracing
- Zero configuration required
"""

import asyncio
import logging
import uuid
import random
import time
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Union

from ..config.base import AgentConfig, AgentType, RetryStrategy, RetryPolicy
from ..core.interfaces import Agent, LLMProvider
from ..core.exceptions import DaitaError, AgentError, LLMError, BackpressureError, TaskTimeoutError
from ..core.tracing import get_trace_manager, TraceType, TraceStatus
from ..core.decision_tracing import record_decision_point, DecisionType
from ..core.reliability import (
    TaskManager, get_global_task_manager, TaskStatus,
    BackpressureController
)

logger = logging.getLogger(__name__)

class BaseAgent(Agent):
    """
    Base implementation for all Daita agents with automatic tracing.

    Every operation is automatically traced and sent to the dashboard.
    Users don't need to configure anything - tracing just works.

    Features:
    - Automatic operation tracing
    - Retry decision tracing with confidence scores
    - Agent lifecycle tracing
    - LLM integration with automatic token tracking
    - Performance monitoring
    - Error tracking and correlation
    """

    def __init__(
        self,
        config: AgentConfig,
        llm_provider: Optional[LLMProvider] = None,
        agent_id: Optional[str] = None,
        name: Optional[str] = None,
        enable_reliability: bool = False,
        max_concurrent_tasks: int = 10,
        max_queue_size: int = 100,
    ):
        self.config = config
        self.llm = llm_provider
        self.name = name or config.name
        self.agent_type = config.type
        self.enable_reliability = enable_reliability

        # Generate unique ID
        if agent_id:
            self.agent_id = agent_id
        elif self.name:
            slug = self.name.lower().replace(' ', '_').replace('-', '_')
            self.agent_id = f"{slug}_{uuid.uuid4().hex[:8]}"
        else:
            self.agent_id = f"{self.__class__.__name__}_{uuid.uuid4().hex[:8]}"

        # Runtime state
        self._running = False
        self._tasks = []

        # Get trace manager for automatic tracing
        self.trace_manager = get_trace_manager()

        # Reliability features (enabled when reliability is configured)
        self.task_manager = get_global_task_manager() if enable_reliability else None
        self.backpressure_controller = None
        if enable_reliability:
            self.backpressure_controller = BackpressureController(
                max_concurrent_tasks=max_concurrent_tasks,
                max_queue_size=max_queue_size,
                agent_id=self.agent_id
            )

        # Set agent ID in LLM provider for automatic LLM tracing
        if self.llm:
            self.llm.set_agent_id(self.agent_id)

        logger.debug(f"Agent {self.name} ({self.agent_id}) initialized with automatic tracing")

    async def start(self) -> None:
        """Start the agent with automatic lifecycle tracing."""
        if self._running:
            return

        # Start decision display if enabled
        if hasattr(self, '_decision_display') and self._decision_display:
            self._decision_display.start()

        # Automatically trace agent lifecycle
        async with self.trace_manager.span(
            operation_name="agent_start",
            trace_type=TraceType.AGENT_LIFECYCLE,
            agent_id=self.agent_id,
            agent_name=self.name,
            agent_type=self.agent_type.value,
            retry_enabled=str(self.config.retry_enabled)
        ):
            self._running = True
            logger.info(f"Agent {self.name} started")

    async def stop(self) -> None:
        """Stop the agent with automatic lifecycle tracing."""
        if not self._running:
            return

        # Stop decision display if enabled
        if hasattr(self, '_decision_display') and self._decision_display:
            self._decision_display.stop()
            # Cleanup decision streaming registration
            try:
                from ..core.decision_tracing import unregister_agent_decision_stream
                unregister_agent_decision_stream(
                    agent_id=self.agent_id,
                    callback=self._decision_display.handle_event
                )
            except Exception as e:
                logger.debug(f"Failed to cleanup decision display: {e}")

        # Automatically trace agent lifecycle
        async with self.trace_manager.span(
            operation_name="agent_stop",
            trace_type=TraceType.AGENT_LIFECYCLE,
            agent_id=self.agent_id,
            agent_name=self.name,
            tasks_completed=str(len(self._tasks))
        ):
            # Cancel running tasks
            for task in self._tasks:
                if not task.done():
                    task.cancel()

            if self._tasks:
                await asyncio.gather(*self._tasks, return_exceptions=True)
                self._tasks.clear()

            self._running = False
            logger.info(f"Agent {self.name} stopped")

    async def process(
        self,
        task: str,
        data: Any = None,
        context: Optional[Dict[str, Any]] = None,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Process a task with optional reliability features and automatic tracing.

        Every call to process() is automatically traced with:
        - Input/output data
        - Performance metrics
        - Error tracking
        - Retry attempts and decisions
        - LLM calls (if any)
        - Task lifecycle (if reliability enabled)

        Args:
            task: Task to execute
            data: Input data
            context: Additional context
            **kwargs: Additional arguments

        Returns:
            Task results with automatic tracing metadata
        """
        # Build full context
        full_context = {
            'agent_id': self.agent_id,
            'agent_name': self.name,
            'agent_type': self.agent_type.value,
            'task': task,
            'retry_enabled': self.config.retry_enabled,
            'reliability_enabled': self.enable_reliability,
            **(context or {}),
            **kwargs
        }

        # Handle reliability features if enabled
        if self.enable_reliability:
            return await self._process_with_reliability(task, data, full_context)
        else:
            return await self._process_without_reliability(task, data, full_context)

    async def _process_with_reliability(
        self,
        task: str,
        data: Any,
        context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Process task with full reliability features."""
        # Check backpressure first
        if self.backpressure_controller and not await self.backpressure_controller.acquire_processing_slot():
            raise BackpressureError(
                "Unable to acquire processing slot",
                agent_id=self.agent_id,
                queue_size=self.backpressure_controller.task_queue.qsize()
            )

        # Create task in task manager
        task_id = None
        if self.task_manager:
            task_id = await self.task_manager.create_task(
                agent_id=self.agent_id,
                task_type=task,
                data=data,
                context=context
            )
            context['task_id'] = task_id
            # Update task status to running
            await self.task_manager.update_status(task_id, TaskStatus.RUNNING)

        try:
            # Automatically trace the entire operation
            async with self.trace_manager.span(
                operation_name=f"agent_process_{task}",
                trace_type=TraceType.AGENT_EXECUTION,
                agent_id=self.agent_id,
                input_data=data,
                agent_name=self.name,
                task=task,
                task_id=task_id,
                retry_enabled=str(self.config.retry_enabled),
                reliability_enabled="true"
            ) as span_id:

                # Execute with or without retry logic
                if self.config.retry_enabled:
                    result = await self._process_with_retry(span_id, task, data, context)
                else:
                    result = await self._process_fail_fast(span_id, task, data, context)

                # Update task status to completed
                if task_id and self.task_manager:
                    await self.task_manager.update_status(task_id, TaskStatus.COMPLETED)

                return result

        except Exception as e:
            # Update task status to failed
            if task_id and self.task_manager:
                await self.task_manager.update_status(task_id, TaskStatus.FAILED, error=str(e))
            raise

        finally:
            # Always release the processing slot
            if self.backpressure_controller:
                self.backpressure_controller.release_processing_slot()

    async def _process_without_reliability(
        self,
        task: str,
        data: Any,
        context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Process task without reliability features (original behavior)."""
        # Automatically trace the entire operation
        async with self.trace_manager.span(
            operation_name=f"agent_process_{task}",
            trace_type=TraceType.AGENT_EXECUTION,
            agent_id=self.agent_id,
            input_data=data,
            agent_name=self.name,
            task=task,
            retry_enabled=str(self.config.retry_enabled),
            reliability_enabled="false"
        ) as span_id:

            # Execute with or without retry logic
            if self.config.retry_enabled:
                result = await self._process_with_retry(span_id, task, data, context)
            else:
                result = await self._process_fail_fast(span_id, task, data, context)

            return result

    async def _process_with_retry(
        self,
        parent_span_id: str,
        task: str,
        data: Any,
        context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Process task with retry logic and automatic retry decision tracing."""
        retry_policy = self.config.retry_policy
        max_attempts = retry_policy.max_retries + 1
        last_exception = None

        for attempt in range(1, max_attempts + 1):
            # Create a child span for each retry attempt
            async with self.trace_manager.span(
                operation_name=f"retry_attempt_{attempt}",
                trace_type=TraceType.AGENT_EXECUTION,
                agent_id=self.agent_id,
                parent_span_id=parent_span_id,
                attempt=str(attempt),
                max_attempts=str(max_attempts),
                is_retry=str(attempt > 1)
            ) as attempt_span_id:

                try:
                    # Add attempt info to context
                    attempt_context = {
                        **context,
                        'attempt_number': attempt,
                        'max_attempts': max_attempts,
                        'is_retry': attempt > 1
                    }

                    # Execute the task
                    result = await self._process_once(task, data, attempt_context, attempt, max_attempts)

                    # Success!
                    if attempt > 1:
                        logger.info(f"Agent {self.name} succeeded on attempt {attempt}")

                    return self._format_success_response(result, attempt_context, attempt, max_attempts)

                except Exception as e:
                    last_exception = e

                    # Should we retry? Use decision tracing to record the retry decision
                    if attempt < max_attempts:
                        should_retry = await self._should_retry_error_with_tracing(
                            e, attempt, max_attempts, attempt_span_id
                        )

                        if should_retry:
                            # Calculate delay and wait
                            delay = self._calculate_retry_delay(attempt - 1, retry_policy)
                            logger.debug(f"Agent {self.name} retrying in {delay:.2f}s")
                            await asyncio.sleep(delay)
                            continue

                    # Don't retry or no more attempts
                    logger.debug(f"Agent {self.name} not retrying: {type(e).__name__}")
                    return self._format_error_response(last_exception, context, attempt, max_attempts)

        # All attempts exhausted
        return self._format_error_response(
            last_exception or Exception("Unknown error"), context, max_attempts, max_attempts
        )

    async def _process_fail_fast(
        self,
        span_id: str,
        task: str,
        data: Any,
        context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Process task in fail-fast mode with error tracing."""
        try:
            result = await self._process_once(task, data, context, attempt=1, max_attempts=1)
            return self._format_success_response(result, context, 1, 1)
        except Exception as e:
            logger.error(f"Error in agent {self.name} (fail-fast mode): {str(e)}")
            return self._format_error_response(e, context, 1, 1)

    async def _process_once(
        self,
        task: str,
        data: Any,
        context: Dict[str, Any],
        attempt: int,
        max_attempts: int
    ) -> Dict[str, Any]:
        """
        Execute the task once without retry logic.

        Subclasses should override this method for their specific behavior.
        The automatic tracing happens at higher levels.
        """
        # Default implementation for base agent
        return {
            'message': f'Agent {self.name} processed task "{task}"',
            'task': task,
            'data': data,
            'agent_id': self.agent_id,
            'agent_name': self.name,
            'attempt': attempt,
            'timestamp': datetime.now(timezone.utc).isoformat()
        }

    async def _should_retry_error_with_tracing(
        self,
        error: Exception,
        attempt: int,
        max_attempts: int,
        span_id: str
    ) -> bool:
        """
        Determine if an error should be retried with decision tracing.

        This traces the retry decision-making process including confidence
        scores and reasoning for better observability.
        """
        # Use decision tracing to record retry logic
        async with record_decision_point("retry_decision", DecisionType.VALIDATION, self.agent_id) as decision:

            # Import here to avoid circular imports
            from ..core.exceptions import (
                TransientError, RetryableError, PermanentError,
                classify_exception
            )

            # Classify the error
            error_class = classify_exception(error)
            error_type = type(error).__name__

            # Decision logic with reasoning
            reasoning = []
            should_retry = False
            confidence = 0.0

            # Check attempt limit
            if attempt >= max_attempts:
                reasoning.append(f"Max attempts reached ({attempt}/{max_attempts})")
                should_retry = False
                confidence = 1.0  # Certain we shouldn't retry

            # Error classification logic
            elif error_class == "transient":
                reasoning.append(f"Transient error detected: {error_type}")
                reasoning.append("Transient errors are typically safe to retry")
                should_retry = True
                confidence = 0.9

            elif error_class == "retryable":
                reasoning.append(f"Retryable error detected: {error_type}")
                reasoning.append("Error may resolve on retry")
                should_retry = True
                confidence = 0.7

            elif error_class == "permanent":
                reasoning.append(f"Permanent error detected: {error_type}")
                reasoning.append("Permanent errors should not be retried")
                should_retry = False
                confidence = 0.95

            else:
                # Unknown error - use heuristics
                reasoning.append(f"Unknown error type: {error_type}")

                if isinstance(error, (ValueError, TypeError, KeyError)):
                    reasoning.append("Logic/data error - likely permanent")
                    should_retry = False
                    confidence = 0.8
                else:
                    reasoning.append("Unknown error - defaulting to retry")
                    should_retry = True
                    confidence = 0.5

            # Record the decision
            decision.set_confidence(confidence)
            for reason in reasoning:
                decision.add_reasoning(reason)

            decision.set_factor("error_type", error_type)
            decision.set_factor("error_class", error_class)
            decision.set_factor("attempt", attempt)
            decision.set_factor("max_attempts", max_attempts)

            # Add alternatives considered
            decision.add_alternative("retry" if not should_retry else "fail")

            logger.debug(f"Retry decision for {error_type}: {should_retry} (confidence: {confidence:.2f})")
            return should_retry

    def _calculate_retry_delay(self, attempt: int, retry_policy) -> float:
        """Calculate retry delay with jitter."""
        if hasattr(retry_policy, 'calculate_delay'):
            # Use the RetryPolicy's built-in delay calculation
            return retry_policy.calculate_delay(attempt)

        # Legacy fallback for old-style retry policies
        if retry_policy.strategy in [RetryStrategy.IMMEDIATE, "immediate"]:
            delay = 0.0
        elif retry_policy.strategy in [RetryStrategy.FIXED, RetryStrategy.FIXED_DELAY, "fixed", "fixed_delay"]:
            delay = getattr(retry_policy, 'base_delay', getattr(retry_policy, 'initial_delay', 1.0))
        else:  # EXPONENTIAL (default)
            base_delay = getattr(retry_policy, 'base_delay', getattr(retry_policy, 'initial_delay', 1.0))
            delay = base_delay * (2 ** attempt)

        # Add small random jitter to prevent thundering herd
        jitter = delay * 0.1 * random.random()
        delay += jitter

        return delay

    def _format_success_response(
        self,
        result: Any,
        context: Dict[str, Any],
        attempt: int,
        max_attempts: int
    ) -> Dict[str, Any]:
        """Format successful response with tracing metadata (flattened for better DX)."""
        # Build response with framework metadata
        response = {
            'status': 'success',
            'agent_id': self.agent_id,
            'agent_name': self.name,
            'context': context,
            'retry_info': {
                'attempt': attempt,
                'max_attempts': max_attempts,
                'retry_enabled': self.config.retry_enabled
            } if self.config.retry_enabled else None,
            'timestamp': datetime.now(timezone.utc).isoformat()
        }

        # Flatten handler result into top level for better DX
        # If result is a dict, merge it; otherwise add as 'result' key
        if isinstance(result, dict):
            # Merge handler result at top level (handler keys won't overwrite framework keys)
            response.update(result)
        else:
            # Non-dict results stored under 'result' key
            response['result'] = result

        return response

    def _format_error_response(
        self,
        error: Exception,
        context: Dict[str, Any],
        attempt: int,
        max_attempts: int
    ) -> Dict[str, Any]:
        """Format error response with tracing metadata."""
        return {
            'status': 'error',
            'error': str(error),
            'error_type': error.__class__.__name__,
            'agent_id': self.agent_id,
            'agent_name': self.name,
            'context': context,
            'result': None,  # Ensure result field exists for relay compatibility
            'retry_info': {
                'attempt': attempt,
                'max_attempts': max_attempts,
                'retry_enabled': self.config.retry_enabled,
                'retry_exhausted': attempt >= max_attempts
            } if self.config.retry_enabled else None,
            'timestamp': datetime.now(timezone.utc).isoformat()
        }

    @property
    def health(self) -> Dict[str, Any]:
        """Get agent health information from unified tracing system."""
        # Get real-time metrics from trace manager
        metrics = self.trace_manager.get_agent_metrics(self.agent_id)

        return {
            'id': self.agent_id,
            'name': self.name,
            'type': self.agent_type.value,
            'running': self._running,
            'metrics': metrics,
            'retry_config': {
                'enabled': self.config.retry_enabled,
                'max_retries': self.config.retry_policy.max_retries if self.config.retry_enabled else None,
                'strategy': self.config.retry_policy.strategy.value if self.config.retry_enabled else None,
            },
            'tracing': {
                'enabled': True,
                'trace_manager_available': self.trace_manager is not None
            }
        }

    @property
    def trace_id(self) -> Optional[str]:
        """Get current trace ID for this agent."""
        return self.trace_manager.trace_context.current_trace_id

    @property
    def current_span_id(self) -> Optional[str]:
        """Get current span ID for this agent."""
        return self.trace_manager.trace_context.current_span_id

    def get_recent_operations(self, limit: int = 10) -> List[Dict[str, Any]]:
        """Get recent operations for this agent from unified tracing."""
        return self.trace_manager.get_recent_operations(agent_id=self.agent_id, limit=limit)

    def get_trace_stats(self) -> Dict[str, Any]:
        """Get comprehensive tracing statistics for this agent."""
        return self.trace_manager.get_agent_metrics(self.agent_id)

    def get_recent_decisions(self, limit: int = 10) -> List[Dict[str, Any]]:
        """Get recent decision traces for this agent."""
        from ..core.decision_tracing import get_recent_decisions
        return get_recent_decisions(agent_id=self.agent_id, limit=limit)

    def get_decision_stats(self) -> Dict[str, Any]:
        """Get decision statistics for this agent."""
        from ..core.decision_tracing import get_decision_stats
        return get_decision_stats(agent_id=self.agent_id)

    # Reliability management methods

    def enable_reliability_features(
        self,
        max_concurrent_tasks: int = 10,
        max_queue_size: int = 100
    ) -> None:
        """
        Enable reliability features for this agent.

        Args:
            max_concurrent_tasks: Maximum concurrent tasks
            max_queue_size: Maximum queue size for backpressure control
        """
        if self.enable_reliability:
            logger.warning(f"Reliability already enabled for agent {self.name}")
            return

        self.enable_reliability = True
        self.task_manager = get_global_task_manager()
        self.backpressure_controller = BackpressureController(
            max_concurrent_tasks=max_concurrent_tasks,
            max_queue_size=max_queue_size,
            agent_id=self.agent_id
        )

        logger.info(f"Enabled reliability features for agent {self.name}")

    def disable_reliability_features(self) -> None:
        """Disable reliability features for this agent."""
        self.enable_reliability = False
        self.task_manager = None
        self.backpressure_controller = None

        logger.info(f"Disabled reliability features for agent {self.name}")

    async def get_task_status(self, task_id: str) -> Optional[Dict[str, Any]]:
        """Get status of a specific task."""
        if not self.task_manager:
            return None
        return await self.task_manager.get_task_status(task_id)

    async def get_agent_tasks(self, status: Optional[TaskStatus] = None) -> List[Dict[str, Any]]:
        """Get all tasks for this agent, optionally filtered by status."""
        if not self.task_manager:
            return []

        tasks = await self.task_manager.get_agent_tasks(self.agent_id, status)
        return [
            {
                "id": task.id,
                "status": task.status.value,
                "progress": task.progress,
                "error": task.error,
                "duration": task.duration(),
                "age": task.age(),
                "retry_count": task.retry_count
            }
            for task in tasks
        ]

    def get_backpressure_stats(self) -> Dict[str, Any]:
        """Get current backpressure statistics."""
        if not self.backpressure_controller:
            return {"enabled": False}

        stats = self.backpressure_controller.get_stats()
        stats["enabled"] = True
        return stats

    async def cancel_task(self, task_id: str) -> bool:
        """Cancel a specific task."""
        if not self.task_manager:
            return False
        return await self.task_manager.cancel_task(task_id)

    # Integration helpers

    def create_child_agent(self, name: str, config_overrides: Optional[Dict[str, Any]] = None) -> "BaseAgent":
        """Create a child agent that inherits tracing context."""
        # Create new config based on current config
        from ..config.base import AgentConfig

        child_config = AgentConfig(
            name=name,
            type=self.config.type,
            enable_retry=self.config.enable_retry,
            retry_policy=self.config.retry_policy,
            **(config_overrides or {})
        )

        # Create child agent
        child = self.__class__(
            config=child_config,
            llm_provider=self.llm,
            name=name
        )

        logger.debug(f"Created child agent {name} from parent {self.name}")
        return child

    def __repr__(self) -> str:
        return f"BaseAgent(name='{self.name}', id='{self.agent_id}', running={self._running})"

    def __str__(self) -> str:
        return f"BaseAgent '{self.name}'"