daita-agents 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of daita-agents has been flagged as possibly problematic; see the registry's advisory page for details.
- daita/__init__.py +208 -0
- daita/agents/__init__.py +33 -0
- daita/agents/base.py +722 -0
- daita/agents/substrate.py +895 -0
- daita/cli/__init__.py +145 -0
- daita/cli/__main__.py +7 -0
- daita/cli/ascii_art.py +44 -0
- daita/cli/core/__init__.py +0 -0
- daita/cli/core/create.py +254 -0
- daita/cli/core/deploy.py +473 -0
- daita/cli/core/deployments.py +309 -0
- daita/cli/core/import_detector.py +219 -0
- daita/cli/core/init.py +382 -0
- daita/cli/core/logs.py +239 -0
- daita/cli/core/managed_deploy.py +709 -0
- daita/cli/core/run.py +648 -0
- daita/cli/core/status.py +421 -0
- daita/cli/core/test.py +239 -0
- daita/cli/core/webhooks.py +172 -0
- daita/cli/main.py +588 -0
- daita/cli/utils.py +541 -0
- daita/config/__init__.py +62 -0
- daita/config/base.py +159 -0
- daita/config/settings.py +184 -0
- daita/core/__init__.py +262 -0
- daita/core/decision_tracing.py +701 -0
- daita/core/exceptions.py +480 -0
- daita/core/focus.py +251 -0
- daita/core/interfaces.py +76 -0
- daita/core/plugin_tracing.py +550 -0
- daita/core/relay.py +695 -0
- daita/core/reliability.py +381 -0
- daita/core/scaling.py +444 -0
- daita/core/tools.py +402 -0
- daita/core/tracing.py +770 -0
- daita/core/workflow.py +1084 -0
- daita/display/__init__.py +1 -0
- daita/display/console.py +160 -0
- daita/execution/__init__.py +58 -0
- daita/execution/client.py +856 -0
- daita/execution/exceptions.py +92 -0
- daita/execution/models.py +317 -0
- daita/llm/__init__.py +60 -0
- daita/llm/anthropic.py +166 -0
- daita/llm/base.py +373 -0
- daita/llm/factory.py +101 -0
- daita/llm/gemini.py +152 -0
- daita/llm/grok.py +114 -0
- daita/llm/mock.py +135 -0
- daita/llm/openai.py +109 -0
- daita/plugins/__init__.py +141 -0
- daita/plugins/base.py +37 -0
- daita/plugins/base_db.py +167 -0
- daita/plugins/elasticsearch.py +844 -0
- daita/plugins/mcp.py +481 -0
- daita/plugins/mongodb.py +510 -0
- daita/plugins/mysql.py +351 -0
- daita/plugins/postgresql.py +331 -0
- daita/plugins/redis_messaging.py +500 -0
- daita/plugins/rest.py +529 -0
- daita/plugins/s3.py +761 -0
- daita/plugins/slack.py +729 -0
- daita/utils/__init__.py +18 -0
- daita_agents-0.1.0.dist-info/METADATA +350 -0
- daita_agents-0.1.0.dist-info/RECORD +69 -0
- daita_agents-0.1.0.dist-info/WHEEL +5 -0
- daita_agents-0.1.0.dist-info/entry_points.txt +2 -0
- daita_agents-0.1.0.dist-info/licenses/LICENSE +56 -0
- daita_agents-0.1.0.dist-info/top_level.txt +1 -0
daita/core/tracing.py
ADDED
|
@@ -0,0 +1,770 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Unified TraceManager for Daita Agents - Fixed MVP Version
|
|
3
|
+
|
|
4
|
+
Streamlined automatic tracing system that captures all agent operations,
|
|
5
|
+
LLM calls, workflow communication, and tool usage. Zero configuration required.
|
|
6
|
+
|
|
7
|
+
FIXED ISSUES:
|
|
8
|
+
- Added missing methods (record_decision, record_llm_call)
|
|
9
|
+
- Improved error handling with proper logging
|
|
10
|
+
- Fixed dependency management for aiohttp
|
|
11
|
+
- Added thread safety for concurrent access
|
|
12
|
+
- Improved context management to prevent leaks
|
|
13
|
+
- Added proper configuration validation
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import asyncio
import json
import logging
import os
import threading
import time
import uuid
from collections import deque
from contextlib import asynccontextmanager
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import Enum
from typing import Dict, Any, Optional, List, Union
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
class TraceType(str, Enum):
    """Types of traces we capture."""
    AGENT_EXECUTION = "agent_execution"                # a full agent task/operation
    LLM_CALL = "llm_call"                              # one model invocation (tokens recorded via record_llm_call)
    WORKFLOW_COMMUNICATION = "workflow_communication"  # agent-to-agent message within a workflow
    AGENT_LIFECYCLE = "agent_lifecycle"                # agent lifecycle events (not emitted in this module)
    DECISION_TRACE = "decision_trace"                  # decision point (see record_decision / decision_span)
    TOOL_EXECUTION = "tool_execution"                  # tool/plugin invocation (see tool_span)
|
|
40
|
+
|
|
41
|
+
class TraceStatus(str, Enum):
    """Status of a trace span."""
    STARTED = "started"    # span opened, not yet ended
    SUCCESS = "success"    # span completed normally
    ERROR = "error"        # span completed with an error
|
|
46
|
+
|
|
47
|
+
@dataclass
class TraceSpan:
    """A single trace span - simplified for MVP.

    Captures one traced operation (agent step, LLM call, tool call, ...)
    with timing, status, input/output previews and free-form metadata.
    """
    span_id: str                    # unique id of this span
    trace_id: str                   # id shared by every span in one trace
    parent_span_id: Optional[str]   # enclosing span, if nested
    agent_id: Optional[str]         # agent that produced this span
    operation_name: str
    trace_type: "TraceType"
    start_time: float               # epoch seconds
    end_time: Optional[float]       # epoch seconds; None while running
    status: "TraceStatus"

    # Core data
    input_data: Any
    output_data: Any
    error_message: Optional[str]

    # Performance
    duration_ms: Optional[float]

    # Metadata - simple dict for flexibility
    metadata: Dict[str, Any]

    # Environment context
    deployment_id: Optional[str]
    environment: str

    def __post_init__(self):
        """Auto-populate deployment context from the environment."""
        if self.deployment_id is None:
            self.deployment_id = os.getenv("DAITA_DEPLOYMENT_ID")
        if not self.environment:
            self.environment = os.getenv("DAITA_ENVIRONMENT", "development")

    @property
    def is_completed(self) -> bool:
        """True once end_time has been set."""
        return self.end_time is not None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization (previews, not raw data)."""
        return {
            "span_id": self.span_id,
            "trace_id": self.trace_id,
            "parent_span_id": self.parent_span_id,
            "agent_id": self.agent_id,
            "operation": self.operation_name,
            "type": self.trace_type.value,
            "start_time": self.start_time,
            "end_time": self.end_time,
            "duration_ms": self.duration_ms,
            "status": self.status.value,
            "input_preview": self._create_preview(self.input_data),
            "output_preview": self._create_preview(self.output_data),
            "error": self.error_message,
            "metadata": self.metadata,
            "environment": self.environment,
            "deployment_id": self.deployment_id
        }

    def _create_preview(self, data: Any, max_length: int = 200) -> str:
        """Create a truncated human-readable preview string for data.

        Never raises; unprintable values degrade to "<TypeName>".
        """
        if data is None:
            return ""
        try:
            if isinstance(data, str):
                preview = data
            elif isinstance(data, dict):
                # IMPROVEMENT: default=str keeps previews informative for
                # dicts containing non-JSON-serializable values instead of
                # falling back to the bare "<dict>" type name.
                preview = json.dumps(data, separators=(',', ':'), default=str)
            else:
                preview = str(data)

            if len(preview) > max_length:
                return preview[:max_length] + "..."
            return preview
        except Exception:
            return f"<{type(data).__name__}>"
|
|
124
|
+
|
|
125
|
+
class TraceContext:
    """Thread-local trace context for automatic correlation.

    Each thread sees its own (trace_id, span_id, agent_id) triple so
    concurrently running agents do not clobber each other's context.
    """

    def __init__(self):
        self._local = threading.local()

    @property
    def current_trace_id(self) -> Optional[str]:
        """Trace id active on the calling thread, or None."""
        return getattr(self._local, 'trace_id', None)

    @property
    def current_span_id(self) -> Optional[str]:
        """Span id active on the calling thread, or None."""
        return getattr(self._local, 'span_id', None)

    @property
    def current_agent_id(self) -> Optional[str]:
        """Agent id active on the calling thread, or None."""
        return getattr(self._local, 'agent_id', None)

    def set_context(self, trace_id: str, span_id: str, agent_id: Optional[str] = None):
        """Install trace/span ids; agent_id is only overwritten when truthy."""
        self._local.trace_id = trace_id
        self._local.span_id = span_id
        if agent_id:
            self._local.agent_id = agent_id

    def clear_context(self):
        """Reset all context slots for the calling thread."""
        self._local.trace_id = None
        self._local.span_id = None
        self._local.agent_id = None

    @asynccontextmanager
    async def span_context(self, trace_id: str, span_id: str, agent_id: Optional[str] = None):
        """Context manager for automatic span context management.

        Saves the current context, installs the given one for the duration
        of the block, then restores the saved context on exit.
        """
        old_trace_id = self.current_trace_id
        old_span_id = self.current_span_id
        old_agent_id = self.current_agent_id

        try:
            self.set_context(trace_id, span_id, agent_id)
            yield
        finally:
            if old_trace_id:
                self.set_context(old_trace_id, old_span_id)
                # BUGFIX: restore agent_id explicitly. set_context() only
                # overwrites agent_id when truthy, so a previously-unset
                # agent would otherwise leak the inner span's agent_id
                # past the end of the inner span.
                self._local.agent_id = old_agent_id
            else:
                self.clear_context()
|
|
169
|
+
|
|
170
|
+
class DashboardReporter:
    """Dashboard reporting with proper dependency management.

    Configuration comes from the environment: DAITA_API_KEY plus the first
    set of DAITA_DASHBOARD_URL / DAITA_DASHBOARD_API /
    DAITA_DASHBOARD_API_OVERRIDE.  Reporting is enabled only when both the
    key and a URL are present.  aiohttp is an optional dependency; when it
    is missing, reporting is skipped (one warning, cached probe).
    """

    def __init__(self):
        self.api_key = os.getenv("DAITA_API_KEY")
        # First configured URL wins; empty string means "not configured".
        self.dashboard_url = (
            os.getenv("DAITA_DASHBOARD_URL")
            or os.getenv("DAITA_DASHBOARD_API")
            or os.getenv("DAITA_DASHBOARD_API_OVERRIDE")
            or ""
        )
        # NOTE: the original also re-disabled when api_key was set but the
        # URL was not; that is already implied by this conjunction, so the
        # redundant check has been removed.
        self.enabled = bool(self.api_key and self.dashboard_url)
        self.reports_sent = 0
        self.reports_failed = 0
        self._aiohttp_available = None  # cached import-probe result

        if self.enabled:
            logger.info(f"Dashboard reporting enabled (URL: {self.dashboard_url})")
        else:
            logger.debug("Dashboard reporting disabled (API key or URL not configured)")

    def _check_aiohttp(self) -> bool:
        """Check if aiohttp is available (cached result)."""
        if self._aiohttp_available is None:
            try:
                import aiohttp
                self._aiohttp_available = True
                logger.debug("aiohttp available for dashboard reporting")
            except ImportError:
                self._aiohttp_available = False
                logger.warning("aiohttp not available - dashboard reporting will be skipped")
        return self._aiohttp_available

    async def report_span(self, span: TraceSpan) -> bool:
        """Report a single span to the dashboard.

        Returns True on success (or when reporting is disabled), False on
        any failure.  Never raises.
        """
        if not self.enabled:
            return True

        if not self._check_aiohttp():
            # Don't log this repeatedly
            return False

        try:
            import aiohttp

            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
                "User-Agent": "daita-agents/1.0.0"
            }

            payload = {
                "spans": [span.to_dict()],
                "environment": span.environment,
                # FIX: timezone-aware UTC timestamp; datetime.utcnow() is
                # deprecated and returns a naive datetime.
                "timestamp": datetime.now(timezone.utc).isoformat()
            }

            timeout = aiohttp.ClientTimeout(total=5)

            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.post(
                    f"{self.dashboard_url}/v1/traces",
                    headers=headers,
                    json=payload
                ) as response:
                    # BUGFIX: accept any 2xx status, not just 200, so
                    # servers replying 201/202/204 are not miscounted
                    # as failures.
                    if 200 <= response.status < 300:
                        self.reports_sent += 1
                        logger.debug(f"Successfully reported span {span.span_id}")
                        return True
                    self.reports_failed += 1
                    logger.warning(f"Dashboard API error: {response.status} - {await response.text()}")
                    return False

        except asyncio.TimeoutError:
            self.reports_failed += 1
            logger.warning("Dashboard reporting timeout")
            return False
        except Exception as e:
            self.reports_failed += 1
            logger.warning(f"Dashboard reporting failed: {e}")
            return False
|
|
251
|
+
|
|
252
|
+
class TraceManager:
    """
    Fixed TraceManager for MVP - automatic tracing with proper error handling.

    Responsibilities:
      * span lifecycle (start_span / end_span / span context manager)
      * automatic trace correlation via a thread-local TraceContext
      * basic aggregate metrics (span counts, LLM tokens, decisions)
      * fire-and-forget dashboard reporting via DashboardReporter
      * streaming decision-event callbacks per agent

    All public methods are thread-safe and swallow their own errors so
    tracing can never crash the traced application.
    """

    def __init__(self):
        self.trace_context = TraceContext()
        self.dashboard_reporter = DashboardReporter()

        # Thread-safe storage
        self._lock = threading.RLock()
        self._active_spans: Dict[str, TraceSpan] = {}
        # Bounded history: oldest completed spans are evicted automatically.
        self._completed_spans: deque = deque(maxlen=500)

        # Basic metrics
        self._metrics = {
            "total_spans": 0,
            "total_llm_calls": 0,
            "total_tokens": 0,
            "total_decisions": 0
        }

        # Streaming decision events support: agent_id -> list of callbacks
        self._decision_stream_callbacks: Dict[str, List[callable]] = {}

        logger.info("TraceManager initialized (Fixed MVP version)")

    def start_span(
        self,
        operation_name: str,
        trace_type: TraceType,
        agent_id: Optional[str] = None,
        parent_span_id: Optional[str] = None,
        **metadata
    ) -> str:
        """Start a new trace span and return its span_id.

        trace_id resolution order: explicit parent span -> thread-local
        context -> fresh trace.  Never raises; on internal failure a
        placeholder id is returned so caller code keeps working.
        """
        try:
            span_id = str(uuid.uuid4())

            # Determine trace_id with context fallback
            if parent_span_id:
                with self._lock:
                    parent_span = self._active_spans.get(parent_span_id)
                trace_id = parent_span.trace_id if parent_span else str(uuid.uuid4())
            elif self.trace_context.current_trace_id:
                trace_id = self.trace_context.current_trace_id
                parent_span_id = self.trace_context.current_span_id
            else:
                trace_id = str(uuid.uuid4())

            # Use agent from context if not provided
            if not agent_id:
                agent_id = self.trace_context.current_agent_id

            # Create span (deployment_id/environment auto-filled by
            # TraceSpan.__post_init__ from the environment)
            span = TraceSpan(
                span_id=span_id,
                trace_id=trace_id,
                parent_span_id=parent_span_id,
                agent_id=agent_id,
                operation_name=operation_name,
                trace_type=trace_type,
                start_time=time.time(),
                end_time=None,
                status=TraceStatus.STARTED,
                input_data=metadata.get('input_data'),
                output_data=None,
                error_message=None,
                duration_ms=None,
                metadata=metadata,
                deployment_id=None,
                environment=""
            )

            with self._lock:
                self._active_spans[span_id] = span
                self._metrics["total_spans"] += 1

            logger.debug(f"Started span {span_id} for '{operation_name}'")
            return span_id

        except Exception as e:
            logger.error(f"Failed to start span: {e}")
            # Return a valid span ID so operations don't break
            return f"error_{uuid.uuid4().hex[:8]}"

    def end_span(
        self,
        span_id: str,
        status: TraceStatus = TraceStatus.SUCCESS,
        output_data: Any = None,
        error_message: Optional[str] = None,
        **metadata
    ) -> None:
        """End a trace span: record timing/result, update metrics, report.

        Safe to call with an unknown or already-completed span_id (no-op).
        """
        try:
            with self._lock:
                if span_id not in self._active_spans:
                    logger.debug(f"Unknown or already completed span: {span_id}")
                    return

                span = self._active_spans.pop(span_id)

                # Update span
                span.end_time = time.time()
                span.duration_ms = (span.end_time - span.start_time) * 1000
                span.status = status
                span.output_data = output_data
                span.error_message = error_message
                span.metadata.update(metadata)

                # Move to completed
                self._completed_spans.append(span)

                # Update metrics
                if span.trace_type == TraceType.LLM_CALL:
                    self._metrics["total_llm_calls"] += 1
                    if "tokens_total" in span.metadata:
                        self._metrics["total_tokens"] += span.metadata.get("tokens_total", 0)
                elif span.trace_type == TraceType.DECISION_TRACE:
                    self._metrics["total_decisions"] += 1

            # Report to dashboard (fire and forget).  BUGFIX: the original
            # called asyncio.create_task() unconditionally, which raises
            # RuntimeError whenever end_span() runs in synchronous code
            # with no running event loop (e.g. via record_operation()).
            # Only schedule when a loop is actually running; also done
            # outside the lock so reporting never blocks other tracing.
            try:
                loop = asyncio.get_running_loop()
            except RuntimeError:
                loop = None
            if loop is not None:
                task = loop.create_task(self.dashboard_reporter.report_span(span))
                # Retrieve any exception so it isn't logged as
                # "Task exception was never retrieved".
                task.add_done_callback(lambda t: t.exception() if not t.cancelled() else None)

            logger.debug(f"Ended span {span_id} ({span.duration_ms:.1f}ms)")

        except Exception as e:
            logger.error(f"Failed to end span {span_id}: {e}")
            # Clean up active span even if there's an error
            with self._lock:
                self._active_spans.pop(span_id, None)

    def record_decision(
        self,
        span_id: str,
        confidence: float = 0.0,
        reasoning: Optional[List[str]] = None,
        alternatives: Optional[List[str]] = None,
        **factors
    ) -> None:
        """Record decision metadata (confidence, reasoning chain,
        alternatives, free-form factors) on an active span."""
        try:
            with self._lock:
                span = self._active_spans.get(span_id)
                if span:
                    span.metadata.update({
                        "confidence_score": confidence,
                        "reasoning_chain": reasoning or [],
                        "alternatives": alternatives or [],
                        "decision_factors": factors
                    })
                    logger.debug(f"Recorded decision for span {span_id} (confidence: {confidence:.2f})")
                else:
                    logger.debug(f"Cannot record decision for unknown span: {span_id}")
        except Exception as e:
            logger.error(f"Failed to record decision for span {span_id}: {e}")

    def record_llm_call(
        self,
        span_id: str,
        model: str,
        prompt_tokens: int = 0,
        completion_tokens: int = 0,
        total_tokens: int = 0,
        **llm_metadata
    ) -> None:
        """Record LLM call metadata (model, token counts) on an active span.

        When total_tokens is 0 it is derived from prompt + completion.
        """
        try:
            with self._lock:
                span = self._active_spans.get(span_id)
                if span:
                    span.metadata.update({
                        "model": model,
                        "tokens_prompt": prompt_tokens,
                        "tokens_completion": completion_tokens,
                        "tokens_total": total_tokens or (prompt_tokens + completion_tokens),
                        **llm_metadata
                    })
                    logger.debug(f"Recorded LLM call for span {span_id} ({total_tokens} tokens)")
                else:
                    logger.debug(f"Cannot record LLM call for unknown span: {span_id}")
        except Exception as e:
            logger.error(f"Failed to record LLM call for span {span_id}: {e}")

    @asynccontextmanager
    async def span(
        self,
        operation_name: str,
        trace_type: TraceType,
        agent_id: Optional[str] = None,
        **metadata
    ):
        """Context manager for automatic span lifecycle.

        Starts a span, installs it as the current trace context for the
        duration of the block, then ends it with SUCCESS, or ERROR if the
        block raised.  Yields the span_id.
        """
        span_id = self.start_span(
            operation_name=operation_name,
            trace_type=trace_type,
            agent_id=agent_id,
            **metadata
        )

        try:
            with self._lock:
                span = self._active_spans.get(span_id)

            if span:
                async with self.trace_context.span_context(span.trace_id, span_id, agent_id):
                    yield span_id
            else:
                # start_span failed and returned a placeholder id; still
                # yield so the caller's block runs (untraced).
                yield span_id

            self.end_span(span_id, TraceStatus.SUCCESS)

        except Exception as e:
            self.end_span(span_id, TraceStatus.ERROR, error_message=str(e))
            raise

    # Convenience methods for specific trace types

    async def decision_span(self, decision_point: str, agent_id: Optional[str] = None, **metadata):
        """Return a span context manager pre-tagged for decision tracing."""
        metadata.update({
            "decision_point": decision_point,
            "trace_subtype": "decision"
        })
        return self.span(f"decision_{decision_point}", TraceType.DECISION_TRACE, agent_id, **metadata)

    async def tool_span(self, tool_name: str, operation: str, agent_id: Optional[str] = None, **metadata):
        """Return a span context manager pre-tagged for tool execution."""
        metadata.update({
            "tool_name": tool_name,
            "tool_operation": operation
        })
        return self.span(f"tool_{tool_name}_{operation}", TraceType.TOOL_EXECUTION, agent_id, **metadata)

    # Query methods

    def get_recent_operations(self, agent_id: Optional[str] = None, limit: int = 10) -> List[Dict[str, Any]]:
        """Return up to `limit` most-recent completed spans as dicts,
        newest first, optionally filtered by agent_id."""
        try:
            with self._lock:
                spans = list(self._completed_spans)

            if agent_id:
                spans = [s for s in spans if s.agent_id == agent_id]

            # Most recent first
            spans = spans[-limit:]
            spans.reverse()

            return [span.to_dict() for span in spans]
        except Exception as e:
            logger.error(f"Error getting recent operations: {e}")
            return []

    def get_global_metrics(self) -> Dict[str, Any]:
        """Return aggregate counters plus active-span and reporter stats."""
        with self._lock:
            return {
                **self._metrics,
                "active_spans": len(self._active_spans),
                "dashboard_reports_sent": self.dashboard_reporter.reports_sent,
                "dashboard_reports_failed": self.dashboard_reporter.reports_failed
            }

    def get_agent_metrics(self, agent_id: str) -> Dict[str, Any]:
        """Return operation counts, success rate and average latency for
        one agent's completed spans."""
        try:
            with self._lock:
                spans = [s for s in self._completed_spans if s.agent_id == agent_id]

            if not spans:
                return {"total_operations": 0, "success_rate": 0}

            total_ops = len(spans)
            successful_ops = len([s for s in spans if s.status == TraceStatus.SUCCESS])

            # Average latency over spans that have a recorded duration
            latencies = [s.duration_ms for s in spans if s.duration_ms]
            avg_latency = sum(latencies) / len(latencies) if latencies else 0

            return {
                "total_operations": total_ops,
                "successful_operations": successful_ops,
                "failed_operations": total_ops - successful_ops,
                "success_rate": successful_ops / total_ops if total_ops > 0 else 0,
                "avg_latency_ms": avg_latency
            }
        except Exception as e:
            logger.error(f"Error getting agent metrics: {e}")
            return {"total_operations": 0, "success_rate": 0}

    def get_workflow_communications(self, workflow_name: Optional[str] = None, limit: int = 20) -> List[Dict[str, Any]]:
        """
        Get workflow communication traces.

        Returns spans that represent workflow communications (agent-to-agent
        messages), newest first, each augmented with from_agent/to_agent/
        channel/message_id/success fields pulled from span metadata.
        """
        try:
            with self._lock:
                comm_spans = [
                    s for s in self._completed_spans
                    if s.trace_type == TraceType.WORKFLOW_COMMUNICATION
                ]

            # Filter by workflow name if provided
            if workflow_name:
                comm_spans = [
                    s for s in comm_spans
                    if s.metadata.get('workflow_name') == workflow_name
                ]

            # Most recent first
            comm_spans = comm_spans[-limit:]
            comm_spans.reverse()

            # Convert to dictionaries with workflow-specific fields
            result = []
            for span in comm_spans:
                comm_dict = span.to_dict()
                comm_dict['from_agent'] = span.metadata.get('from_agent', 'unknown')
                comm_dict['to_agent'] = span.metadata.get('to_agent', 'unknown')
                comm_dict['channel'] = span.metadata.get('channel', 'unknown')
                comm_dict['message_id'] = span.metadata.get('message_id')
                comm_dict['success'] = span.status == TraceStatus.SUCCESS
                result.append(comm_dict)

            return result

        except Exception as e:
            logger.error(f"Error getting workflow communications: {e}")
            return []

    def get_workflow_metrics(self, workflow_name: str) -> Dict[str, Any]:
        """Return message counts and success rate for one workflow's
        communication spans."""
        try:
            with self._lock:
                comm_spans = [
                    s for s in self._completed_spans
                    if s.trace_type == TraceType.WORKFLOW_COMMUNICATION
                    and s.metadata.get('workflow_name') == workflow_name
                ]

            if not comm_spans:
                return {"total_messages": 0, "success_rate": 0}

            total = len(comm_spans)
            successful = len([s for s in comm_spans if s.status == TraceStatus.SUCCESS])

            return {
                "workflow_name": workflow_name,
                "total_messages": total,
                "successful_messages": successful,
                "failed_messages": total - successful,
                "success_rate": successful / total if total > 0 else 0
            }
        except Exception as e:
            logger.error(f"Error getting workflow metrics: {e}")
            return {"total_messages": 0, "success_rate": 0}

    # Streaming decision events support

    def register_decision_stream_callback(self, agent_id: str, callback: callable) -> None:
        """Register a callback for streaming decision events for a specific agent."""
        try:
            with self._lock:
                if agent_id not in self._decision_stream_callbacks:
                    self._decision_stream_callbacks[agent_id] = []
                self._decision_stream_callbacks[agent_id].append(callback)
            logger.debug(f"Registered decision stream callback for agent {agent_id}")
        except Exception as e:
            logger.error(f"Failed to register decision stream callback: {e}")

    def unregister_decision_stream_callback(self, agent_id: str, callback: callable) -> None:
        """Unregister a decision stream callback; empty lists are pruned."""
        try:
            with self._lock:
                if agent_id in self._decision_stream_callbacks:
                    if callback in self._decision_stream_callbacks[agent_id]:
                        self._decision_stream_callbacks[agent_id].remove(callback)
                    if not self._decision_stream_callbacks[agent_id]:
                        del self._decision_stream_callbacks[agent_id]
            logger.debug(f"Unregistered decision stream callback for agent {agent_id}")
        except Exception as e:
            logger.error(f"Failed to unregister decision stream callback: {e}")

    def emit_decision_event(self, agent_id: Optional[str], decision_event: 'DecisionEvent') -> None:
        """Emit a decision event to all registered callbacks for the agent.

        Callback failures are logged, never propagated.
        """
        if not agent_id:
            return

        try:
            # BUGFIX: snapshot the callback list under the lock, then invoke
            # outside it.  The original claimed not to hold the lock during
            # callback execution but actually did, so a slow or re-entrant
            # callback could block all other tracing operations.
            with self._lock:
                callbacks = list(self._decision_stream_callbacks.get(agent_id, []))

            for callback in callbacks:
                try:
                    callback(decision_event)
                except Exception as e:
                    logger.warning(f"Decision stream callback failed for agent {agent_id}: {e}")
        except Exception as e:
            logger.error(f"Failed to emit decision event: {e}")

    def get_streaming_agents(self) -> List[str]:
        """Get list of agents that have streaming callbacks registered."""
        with self._lock:
            return list(self._decision_stream_callbacks.keys())
|
|
665
|
+
|
|
666
|
+
# Global instance with safer initialization
|
|
667
|
+
# Global instance with safer initialization
_global_trace_manager: Optional[TraceManager] = None
_manager_lock = threading.Lock()

def get_trace_manager() -> TraceManager:
    """Get the global trace manager instance with thread safety.

    Uses double-checked locking so the common (already-initialized) path
    avoids taking the lock.  If TraceManager construction fails, a no-op
    stand-in is installed instead so tracing never breaks the caller.
    """
    global _global_trace_manager
    if _global_trace_manager is None:
        with _manager_lock:
            if _global_trace_manager is None:  # Double-check pattern
                try:
                    _global_trace_manager = TraceManager()
                    logger.info("TraceManager successfully initialized")
                except Exception as e:
                    logger.error(f"Failed to initialize TraceManager: {e}")
                    # Create a no-op manager that doesn't break but logs the issue
                    _global_trace_manager = _create_safe_noop_manager()
    return _global_trace_manager
|
|
684
|
+
|
|
685
|
+
def _create_safe_noop_manager():
    """Build a do-nothing TraceManager stand-in that never raises.

    Installed as a last-resort fallback when the real TraceManager cannot
    be constructed: every method returns an inert value so calling code
    keeps working while tracing is effectively disabled.
    """
    logger.warning("Using no-op TraceManager due to initialization failure")

    class SafeNoOpTraceManager:
        def __init__(self):
            # Bare attribute holder mimicking DashboardReporter's
            # read-only surface (enabled / reports_sent / reports_failed).
            reporter_stub = type('obj', (object,), {
                'enabled': False,
                'reports_sent': 0,
                'reports_failed': 0
            })
            self.dashboard_reporter = reporter_stub()

        def start_span(self, *args, **kwargs):
            # Hand back a recognizable placeholder id.
            return f"noop_{uuid.uuid4().hex[:8]}"

        def end_span(self, *args, **kwargs):
            return None

        def record_llm_call(self, *args, **kwargs):
            return None

        def record_decision(self, *args, **kwargs):
            return None

        @asynccontextmanager
        async def span(self, *args, **kwargs):
            yield f"noop_{uuid.uuid4().hex[:8]}"

        async def decision_span(self, *args, **kwargs):
            # Same shape as TraceManager.decision_span: returns the
            # context manager produced by span().
            return self.span(*args, **kwargs)

        async def tool_span(self, *args, **kwargs):
            return self.span(*args, **kwargs)

        def get_recent_operations(self, *args, **kwargs):
            return []

        def get_global_metrics(self):
            return {"total_spans": 0, "total_llm_calls": 0, "total_tokens": 0}

        def get_agent_metrics(self, *args, **kwargs):
            return {"total_operations": 0, "success_rate": 0}

    return SafeNoOpTraceManager()
|
|
729
|
+
|
|
730
|
+
# Legacy compatibility functions (preserved for backward compatibility)
|
|
731
|
+
def record_tokens(agent_id: str, total_tokens: int = 0, prompt_tokens: int = 0, completion_tokens: int = 0):
    """Legacy token recording - now handled automatically by LLM tracing."""
    # Intentionally a no-op: token accounting moved into
    # TraceManager.record_llm_call().
    return None
|
|
734
|
+
|
|
735
|
+
def get_agent_tokens(agent_id: str) -> Dict[str, int]:
    """Legacy token retrieval."""
    # Per-token breakdowns are no longer tracked in the simplified
    # TraceManager; only the request count can be recovered.
    agent_metrics = get_trace_manager().get_agent_metrics(agent_id)
    result = {key: 0 for key in ("total_tokens", "prompt_tokens", "completion_tokens")}
    result["requests"] = agent_metrics.get("total_operations", 0)
    return result
|
|
744
|
+
|
|
745
|
+
def record_operation(agent_id: str, agent_name: str, task: str, input_data: Any,
                     output_data: Any, latency_ms: float, status: str = "success", **kwargs) -> str:
    """Legacy operation recording.

    Opens and immediately closes an AGENT_EXECUTION span so legacy callers
    still appear in the unified trace store.  Returns the span id.
    """
    manager = get_trace_manager()

    new_span_id = manager.start_span(
        operation_name=task,
        trace_type=TraceType.AGENT_EXECUTION,
        agent_id=agent_id,
        input_data=input_data,
        agent_name=agent_name
    )

    # Any status other than "success" is recorded as an error.
    outcome = TraceStatus.SUCCESS if status == "success" else TraceStatus.ERROR
    manager.end_span(
        span_id=new_span_id,
        status=outcome,
        output_data=output_data,
        error_message=kwargs.get("error_message")
    )

    return new_span_id
|
|
767
|
+
|
|
768
|
+
def get_recent_operations(agent_id: Optional[str] = None, limit: int = 50) -> List[Dict[str, Any]]:
    """Legacy function to get recent operations."""
    # Thin delegation to the unified TraceManager store.
    manager = get_trace_manager()
    return manager.get_recent_operations(agent_id, limit)
|