daita-agents 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. daita/__init__.py +216 -0
  2. daita/agents/__init__.py +33 -0
  3. daita/agents/base.py +743 -0
  4. daita/agents/substrate.py +1141 -0
  5. daita/cli/__init__.py +145 -0
  6. daita/cli/__main__.py +7 -0
  7. daita/cli/ascii_art.py +44 -0
  8. daita/cli/core/__init__.py +0 -0
  9. daita/cli/core/create.py +254 -0
  10. daita/cli/core/deploy.py +473 -0
  11. daita/cli/core/deployments.py +309 -0
  12. daita/cli/core/import_detector.py +219 -0
  13. daita/cli/core/init.py +481 -0
  14. daita/cli/core/logs.py +239 -0
  15. daita/cli/core/managed_deploy.py +709 -0
  16. daita/cli/core/run.py +648 -0
  17. daita/cli/core/status.py +421 -0
  18. daita/cli/core/test.py +239 -0
  19. daita/cli/core/webhooks.py +172 -0
  20. daita/cli/main.py +588 -0
  21. daita/cli/utils.py +541 -0
  22. daita/config/__init__.py +62 -0
  23. daita/config/base.py +159 -0
  24. daita/config/settings.py +184 -0
  25. daita/core/__init__.py +262 -0
  26. daita/core/decision_tracing.py +701 -0
  27. daita/core/exceptions.py +480 -0
  28. daita/core/focus.py +251 -0
  29. daita/core/interfaces.py +76 -0
  30. daita/core/plugin_tracing.py +550 -0
  31. daita/core/relay.py +779 -0
  32. daita/core/reliability.py +381 -0
  33. daita/core/scaling.py +459 -0
  34. daita/core/tools.py +554 -0
  35. daita/core/tracing.py +770 -0
  36. daita/core/workflow.py +1144 -0
  37. daita/display/__init__.py +1 -0
  38. daita/display/console.py +160 -0
  39. daita/execution/__init__.py +58 -0
  40. daita/execution/client.py +856 -0
  41. daita/execution/exceptions.py +92 -0
  42. daita/execution/models.py +317 -0
  43. daita/llm/__init__.py +60 -0
  44. daita/llm/anthropic.py +291 -0
  45. daita/llm/base.py +530 -0
  46. daita/llm/factory.py +101 -0
  47. daita/llm/gemini.py +355 -0
  48. daita/llm/grok.py +219 -0
  49. daita/llm/mock.py +172 -0
  50. daita/llm/openai.py +220 -0
  51. daita/plugins/__init__.py +141 -0
  52. daita/plugins/base.py +37 -0
  53. daita/plugins/base_db.py +167 -0
  54. daita/plugins/elasticsearch.py +849 -0
  55. daita/plugins/mcp.py +481 -0
  56. daita/plugins/mongodb.py +520 -0
  57. daita/plugins/mysql.py +362 -0
  58. daita/plugins/postgresql.py +342 -0
  59. daita/plugins/redis_messaging.py +500 -0
  60. daita/plugins/rest.py +537 -0
  61. daita/plugins/s3.py +770 -0
  62. daita/plugins/slack.py +729 -0
  63. daita/utils/__init__.py +18 -0
  64. daita_agents-0.2.0.dist-info/METADATA +409 -0
  65. daita_agents-0.2.0.dist-info/RECORD +69 -0
  66. daita_agents-0.2.0.dist-info/WHEEL +5 -0
  67. daita_agents-0.2.0.dist-info/entry_points.txt +2 -0
  68. daita_agents-0.2.0.dist-info/licenses/LICENSE +56 -0
  69. daita_agents-0.2.0.dist-info/top_level.txt +1 -0
daita/core/tracing.py ADDED
@@ -0,0 +1,770 @@
1
+ """
2
+ Unified TraceManager for Daita Agents - Fixed MVP Version
3
+
4
+ Streamlined automatic tracing system that captures all agent operations,
5
+ LLM calls, workflow communication, and tool usage. Zero configuration required.
6
+
7
+ FIXED ISSUES:
8
+ - Added missing methods (record_decision, record_llm_call)
9
+ - Improved error handling with proper logging
10
+ - Fixed dependency management for aiohttp
11
+ - Added thread safety for concurrent access
12
+ - Improved context management to prevent leaks
13
+ - Added proper configuration validation
14
+ """
15
+
16
+ import asyncio
17
+ import logging
18
+ import time
19
+ import uuid
20
+ import json
21
+ import os
22
+ from datetime import datetime
23
+ from typing import Dict, Any, Optional, List, Union
24
+ from dataclasses import dataclass
25
+ from collections import deque
26
+ from enum import Enum
27
+ from contextlib import asynccontextmanager
28
+ import threading
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
class TraceType(str, Enum):
    """Kinds of operations a trace span can describe.

    Each member is also a ``str``, so it compares equal to its raw value
    (for example ``TraceType.LLM_CALL == "llm_call"``).
    """

    # Work performed by an agent itself
    AGENT_EXECUTION = "agent_execution"
    # A request sent to a language model
    LLM_CALL = "llm_call"
    # A message exchanged between agents inside a workflow
    WORKFLOW_COMMUNICATION = "workflow_communication"
    # Agent lifecycle events
    AGENT_LIFECYCLE = "agent_lifecycle"
    # A recorded decision point
    DECISION_TRACE = "decision_trace"
    # A tool invocation
    TOOL_EXECUTION = "tool_execution"
40
+
41
class TraceStatus(str, Enum):
    """Lifecycle state of a trace span.

    Members are ``str`` instances and compare equal to their raw values.
    """

    # The span has been opened and not yet closed
    STARTED = "started"
    # The span finished without an error
    SUCCESS = "success"
    # The span finished with an error
    ERROR = "error"
46
+
47
@dataclass
class TraceSpan:
    """A single recorded operation (span) inside a trace.

    The span carries correlation ids, timing, a status, the raw input/output
    payloads and a free-form metadata dict. ``to_dict`` produces the
    serializable form, in which the payloads are reduced to short previews.
    """

    # Identity and correlation
    span_id: str
    trace_id: str
    parent_span_id: Optional[str]
    agent_id: Optional[str]
    operation_name: str
    # Quoted so the class does not need the enums at definition time.
    trace_type: "TraceType"
    start_time: float
    end_time: Optional[float]  # None while the span is still open
    status: "TraceStatus"

    # Core data
    input_data: Any
    output_data: Any
    error_message: Optional[str]

    # Performance
    duration_ms: Optional[float]

    # Metadata - simple dict for flexibility
    metadata: Dict[str, Any]

    # Environment context (filled from env vars in __post_init__ when empty)
    deployment_id: Optional[str]
    environment: str

    def __post_init__(self):
        """Fill deployment context from the process environment when unset.

        ``deployment_id`` falls back to ``DAITA_DEPLOYMENT_ID`` when it is
        ``None``; an empty ``environment`` falls back to ``DAITA_ENVIRONMENT``
        and finally to ``"development"``.
        """
        if self.deployment_id is None:
            self.deployment_id = os.getenv("DAITA_DEPLOYMENT_ID")
        if not self.environment:
            self.environment = os.getenv("DAITA_ENVIRONMENT", "development")

    @property
    def is_completed(self) -> bool:
        """Whether the span has an end time recorded."""
        return self.end_time is not None

    def to_dict(self) -> Dict[str, Any]:
        """Return the span as a plain dict for serialization.

        Input and output payloads are not included verbatim; they are
        replaced by truncated previews (see ``_create_preview``).
        """
        return {
            "span_id": self.span_id,
            "trace_id": self.trace_id,
            "parent_span_id": self.parent_span_id,
            "agent_id": self.agent_id,
            "operation": self.operation_name,
            "type": self.trace_type.value,
            "start_time": self.start_time,
            "end_time": self.end_time,
            "duration_ms": self.duration_ms,
            "status": self.status.value,
            "input_preview": self._create_preview(self.input_data),
            "output_preview": self._create_preview(self.output_data),
            "error": self.error_message,
            "metadata": self.metadata,
            "environment": self.environment,
            "deployment_id": self.deployment_id,
        }

    def _create_preview(self, data: Any, max_length: int = 200) -> str:
        """Return a short, human-readable preview of ``data``.

        Args:
            data: Any payload. ``None`` yields an empty string, strings are
                used as-is, dicts are rendered as compact JSON and everything
                else goes through ``str()``.
            max_length: Maximum number of characters kept before ``"..."`` is
                appended.

        Returns:
            The preview text, or ``"<TypeName>"`` if the value cannot even be
            converted with ``str()``.
        """
        if data is None:
            return ""
        try:
            if isinstance(data, str):
                preview = data
            elif isinstance(data, dict):
                try:
                    # default=str keeps the preview informative when the dict
                    # holds values JSON cannot encode (datetimes, objects...).
                    preview = json.dumps(data, separators=(',', ':'), default=str)
                except (TypeError, ValueError):
                    # Unsupported key types or circular references.
                    preview = str(data)
            else:
                preview = str(data)

            if len(preview) > max_length:
                return preview[:max_length] + "..."
            return preview
        except Exception:
            # Last resort: never let a preview break tracing.
            return f"<{type(data).__name__}>"
124
+
125
class TraceContext:
    """Thread-local holder of the "current" trace, span and agent ids.

    New spans use these values to attach themselves to the enclosing trace
    automatically.

    NOTE(review): the state lives in ``threading.local``, so it is shared by
    every asyncio task running on the same thread; confirm whether per-task
    isolation (``contextvars``) is wanted.
    """

    def __init__(self):
        self._local = threading.local()

    @property
    def current_trace_id(self) -> Optional[str]:
        """Trace id active on this thread, or ``None``."""
        return getattr(self._local, 'trace_id', None)

    @property
    def current_span_id(self) -> Optional[str]:
        """Span id active on this thread, or ``None``."""
        return getattr(self._local, 'span_id', None)

    @property
    def current_agent_id(self) -> Optional[str]:
        """Agent id active on this thread, or ``None``."""
        return getattr(self._local, 'agent_id', None)

    def set_context(self, trace_id: str, span_id: str, agent_id: Optional[str] = None):
        """Make ``trace_id``/``span_id`` current on this thread.

        A falsy ``agent_id`` leaves the current agent untouched, so nested
        spans inherit the enclosing agent unless they name their own.
        """
        self._local.trace_id = trace_id
        self._local.span_id = span_id
        if agent_id:
            self._local.agent_id = agent_id

    def clear_context(self):
        """Reset trace, span and agent ids to ``None`` on this thread."""
        self._local.trace_id = None
        self._local.span_id = None
        self._local.agent_id = None

    @asynccontextmanager
    async def span_context(self, trace_id: str, span_id: str, agent_id: Optional[str] = None):
        """Temporarily install a span as the current context.

        On exit the previous context is put back. If there was no previous
        trace, the context is cleared entirely.
        """
        saved_trace_id = self.current_trace_id
        saved_span_id = self.current_span_id
        saved_agent_id = self.current_agent_id

        try:
            self.set_context(trace_id, span_id, agent_id)
            yield
        finally:
            if saved_trace_id:
                # Assign directly instead of going through set_context():
                # set_context() ignores a falsy agent_id, which would leave
                # this span's agent id behind in an outer context that had
                # no agent of its own.
                self._local.trace_id = saved_trace_id
                self._local.span_id = saved_span_id
                self._local.agent_id = saved_agent_id
            else:
                self.clear_context()
169
+
170
class DashboardReporter:
    """Sends completed spans to the dashboard HTTP API.

    Reporting is enabled only when both an API key (``DAITA_API_KEY``) and a
    dashboard URL (``DAITA_DASHBOARD_URL``, ``DAITA_DASHBOARD_API`` or
    ``DAITA_DASHBOARD_API_OVERRIDE``, first non-empty wins) are present in
    the environment. ``aiohttp`` is imported lazily; when it is missing,
    reports are skipped.
    """

    def __init__(self):
        self.api_key = os.getenv("DAITA_API_KEY")
        self.dashboard_url = (
            os.getenv("DAITA_DASHBOARD_URL")
            or os.getenv("DAITA_DASHBOARD_API")
            or os.getenv("DAITA_DASHBOARD_API_OVERRIDE")
            or ""
        )
        # Both pieces of configuration are required.
        self.enabled = bool(self.api_key and self.dashboard_url)
        self.reports_sent = 0
        self.reports_failed = 0
        self._aiohttp_available = None  # None = not probed yet

        if self.enabled:
            logger.info(f"Dashboard reporting enabled (URL: {self.dashboard_url})")
        else:
            logger.debug("Dashboard reporting disabled (API key or URL not configured)")

    def _check_aiohttp(self) -> bool:
        """Return whether ``aiohttp`` can be imported (result is cached)."""
        if self._aiohttp_available is None:
            try:
                import aiohttp  # noqa: F401 - availability probe only
                self._aiohttp_available = True
                logger.debug("aiohttp available for dashboard reporting")
            except ImportError:
                self._aiohttp_available = False
                logger.warning("aiohttp not available - dashboard reporting will be skipped")
        return self._aiohttp_available

    def _traces_endpoint(self) -> str:
        """Build the traces URL, tolerating a trailing slash on the base URL."""
        return f"{self.dashboard_url.rstrip('/')}/v1/traces"

    async def report_span(self, span: "TraceSpan") -> bool:
        """POST a single span to the dashboard.

        Args:
            span: The completed span to report.

        Returns:
            ``True`` when reporting is disabled (nothing to do) or the API
            answered with a 2xx status; ``False`` when ``aiohttp`` is
            missing, the request timed out, or any other failure occurred.
            Failures are logged and counted, never raised.
        """
        if not self.enabled:
            return True

        if not self._check_aiohttp():
            # Already warned once in _check_aiohttp; stay quiet here.
            return False

        try:
            import aiohttp

            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
                # NOTE(review): hard-coded version string; confirm it should
                # track the released package version.
                "User-Agent": "daita-agents/0.1.1"
            }

            payload = {
                "spans": [span.to_dict()],
                "environment": span.environment,
                "timestamp": datetime.utcnow().isoformat()
            }

            timeout = aiohttp.ClientTimeout(total=5)

            async with aiohttp.ClientSession(timeout=timeout) as session:
                async with session.post(
                    self._traces_endpoint(),
                    headers=headers,
                    json=payload
                ) as response:
                    # Any 2xx answer (200, 201, 202, 204...) means accepted.
                    if 200 <= response.status < 300:
                        self.reports_sent += 1
                        logger.debug(f"Successfully reported span {span.span_id}")
                        return True

                    self.reports_failed += 1
                    logger.warning(f"Dashboard API error: {response.status} - {await response.text()}")
                    return False

        except asyncio.TimeoutError:
            self.reports_failed += 1
            logger.warning("Dashboard reporting timeout")
            return False
        except Exception as e:
            self.reports_failed += 1
            logger.warning(f"Dashboard reporting failed: {e}")
            return False
251
+
252
class TraceManager:
    """
    Central in-process registry of trace spans.

    Keeps the currently active spans, a bounded history of completed spans,
    a few global counters, forwards completed spans to the dashboard
    reporter and fans decision events out to per-agent callbacks. Shared
    state is guarded by a re-entrant lock. Tracing failures are logged and
    swallowed so they never break the traced operation.
    """

    def __init__(self):
        self.trace_context = TraceContext()
        self.dashboard_reporter = DashboardReporter()

        # Thread-safe storage
        self._lock = threading.RLock()
        self._active_spans: Dict[str, TraceSpan] = {}
        # Only the 500 most recently completed spans are retained.
        self._completed_spans: deque = deque(maxlen=500)

        # Strong references to in-flight dashboard report tasks. The event
        # loop only holds weak references to tasks, so a fire-and-forget
        # task without a reference may be garbage-collected mid-flight.
        self._report_tasks: set = set()

        # Basic metrics
        self._metrics = {
            "total_spans": 0,
            "total_llm_calls": 0,
            "total_tokens": 0,
            "total_decisions": 0
        }

        # Streaming decision events support: agent_id -> list of callbacks
        self._decision_stream_callbacks: Dict[str, List[callable]] = {}

        logger.info("TraceManager initialized (Fixed MVP version)")

    def start_span(
        self,
        operation_name: str,
        trace_type: TraceType,
        agent_id: Optional[str] = None,
        parent_span_id: Optional[str] = None,
        **metadata
    ) -> str:
        """Open a new span and return its id.

        The trace id is taken from the explicit parent span when it is still
        active, otherwise from the current trace context, otherwise a new
        trace is started. ``metadata`` is stored on the span; its
        ``input_data`` entry (if any) also becomes the span's input payload.

        Returns:
            The new span id. If span creation fails, an ``error_...``
            placeholder id is returned so callers can proceed; ending that
            id later is a harmless no-op.
        """
        try:
            span_id = str(uuid.uuid4())

            # Determine trace_id with context fallback
            if parent_span_id:
                with self._lock:
                    parent_span = self._active_spans.get(parent_span_id)
                trace_id = parent_span.trace_id if parent_span else str(uuid.uuid4())
            elif self.trace_context.current_trace_id:
                trace_id = self.trace_context.current_trace_id
                parent_span_id = self.trace_context.current_span_id
            else:
                trace_id = str(uuid.uuid4())

            # Use agent from context if not provided
            if not agent_id:
                agent_id = self.trace_context.current_agent_id

            span = TraceSpan(
                span_id=span_id,
                trace_id=trace_id,
                parent_span_id=parent_span_id,
                agent_id=agent_id,
                operation_name=operation_name,
                trace_type=trace_type,
                start_time=time.time(),
                end_time=None,
                status=TraceStatus.STARTED,
                input_data=metadata.get('input_data'),
                output_data=None,
                error_message=None,
                duration_ms=None,
                metadata=metadata,
                deployment_id=None,  # resolved by TraceSpan.__post_init__
                environment=""       # resolved by TraceSpan.__post_init__
            )

            with self._lock:
                self._active_spans[span_id] = span
                self._metrics["total_spans"] += 1

            logger.debug(f"Started span {span_id} for '{operation_name}'")
            return span_id

        except Exception as e:
            logger.error(f"Failed to start span: {e}")
            # Return a valid span ID so operations don't break
            return f"error_{uuid.uuid4().hex[:8]}"

    def end_span(
        self,
        span_id: str,
        status: TraceStatus = TraceStatus.SUCCESS,
        output_data: Any = None,
        error_message: Optional[str] = None,
        **metadata
    ) -> None:
        """Close an active span, update metrics and schedule its report.

        Unknown or already-closed span ids are ignored. Extra keyword
        arguments are merged into the span's metadata.
        """
        try:
            with self._lock:
                span = self._active_spans.pop(span_id, None)
                if span is None:
                    logger.debug(f"Unknown or already completed span: {span_id}")
                    return

                # Update span
                span.end_time = time.time()
                span.duration_ms = (span.end_time - span.start_time) * 1000
                span.status = status
                span.output_data = output_data
                span.error_message = error_message
                span.metadata.update(metadata)

                # Move to completed
                self._completed_spans.append(span)

                # Update metrics
                if span.trace_type == TraceType.LLM_CALL:
                    self._metrics["total_llm_calls"] += 1
                    if "tokens_total" in span.metadata:
                        # "or 0" tolerates an explicit None token count.
                        self._metrics["total_tokens"] += span.metadata.get("tokens_total", 0) or 0
                elif span.trace_type == TraceType.DECISION_TRACE:
                    self._metrics["total_decisions"] += 1

            # Report to dashboard (fire and forget)
            self._schedule_report(span)

            logger.debug(f"Ended span {span_id} ({span.duration_ms:.1f}ms)")

        except Exception as e:
            logger.error(f"Failed to end span {span_id}: {e}")
            # Clean up active span even if there's an error
            with self._lock:
                self._active_spans.pop(span_id, None)

    def _schedule_report(self, span: TraceSpan) -> None:
        """Start a background task that reports ``span`` to the dashboard.

        Nothing is scheduled when reporting is disabled or when there is no
        running event loop (for example when a span is ended from plain
        synchronous code); creating a task there would raise and leave an
        un-awaited coroutine behind.
        """
        if not self.dashboard_reporter.enabled:
            return

        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            logger.debug(f"No running event loop; span {span.span_id} not reported to dashboard")
            return

        task = loop.create_task(self.dashboard_reporter.report_span(span))
        self._report_tasks.add(task)
        task.add_done_callback(self._on_report_done)

    def _on_report_done(self, task) -> None:
        """Drop a finished report task and consume its exception, if any."""
        self._report_tasks.discard(task)
        if not task.cancelled():
            # Retrieving the exception prevents the "Task exception was
            # never retrieved" warning; report_span logs failures itself.
            task.exception()

    def record_decision(
        self,
        span_id: str,
        confidence: float = 0.0,
        reasoning: Optional[List[str]] = None,
        alternatives: Optional[List[str]] = None,
        **factors
    ) -> None:
        """Attach decision details to an active span's metadata.

        Stores ``confidence_score``, ``reasoning_chain``, ``alternatives``
        and ``decision_factors``. Does nothing for unknown span ids.
        """
        try:
            with self._lock:
                span = self._active_spans.get(span_id)
                if span:
                    span.metadata.update({
                        "confidence_score": confidence,
                        "reasoning_chain": reasoning or [],
                        "alternatives": alternatives or [],
                        "decision_factors": factors
                    })
                    logger.debug(f"Recorded decision for span {span_id} (confidence: {confidence:.2f})")
                else:
                    logger.debug(f"Cannot record decision for unknown span: {span_id}")
        except Exception as e:
            logger.error(f"Failed to record decision for span {span_id}: {e}")

    def record_llm_call(
        self,
        span_id: str,
        model: str,
        prompt_tokens: int = 0,
        completion_tokens: int = 0,
        total_tokens: int = 0,
        **llm_metadata
    ) -> None:
        """Attach LLM usage details to an active span's metadata.

        When ``total_tokens`` is zero/omitted, the total is derived as
        ``prompt_tokens + completion_tokens``. Does nothing for unknown
        span ids.
        """
        try:
            effective_total = total_tokens or (prompt_tokens + completion_tokens)
            with self._lock:
                span = self._active_spans.get(span_id)
                if span:
                    span.metadata.update({
                        "model": model,
                        "tokens_prompt": prompt_tokens,
                        "tokens_completion": completion_tokens,
                        "tokens_total": effective_total,
                        **llm_metadata
                    })
                    # Log the stored total, not the possibly-zero argument.
                    logger.debug(f"Recorded LLM call for span {span_id} ({effective_total} tokens)")
                else:
                    logger.debug(f"Cannot record LLM call for unknown span: {span_id}")
        except Exception as e:
            logger.error(f"Failed to record LLM call for span {span_id}: {e}")

    @asynccontextmanager
    async def span(
        self,
        operation_name: str,
        trace_type: TraceType,
        agent_id: Optional[str] = None,
        **metadata
    ):
        """Async context manager that opens a span and closes it on exit.

        Yields the span id. While the body runs, the span is installed as
        the current trace context. The span ends with ``SUCCESS`` on normal
        exit and with ``ERROR`` (exception re-raised) otherwise.
        """
        span_id = self.start_span(
            operation_name=operation_name,
            trace_type=trace_type,
            agent_id=agent_id,
            **metadata
        )

        try:
            with self._lock:
                span = self._active_spans.get(span_id)

            if span:
                async with self.trace_context.span_context(span.trace_id, span_id, agent_id):
                    yield span_id
            else:
                # start_span failed and returned a placeholder id.
                yield span_id

        except BaseException as e:
            # BaseException, not Exception: task cancellation
            # (asyncio.CancelledError) must also close the span, otherwise
            # it would stay in _active_spans forever.
            self.end_span(span_id, TraceStatus.ERROR, error_message=str(e) or type(e).__name__)
            raise
        else:
            self.end_span(span_id, TraceStatus.SUCCESS)

    # Convenience methods for specific trace types

    async def decision_span(self, decision_point: str, agent_id: Optional[str] = None, **metadata):
        """Return a ``span`` context manager for a decision point.

        This is a coroutine that returns the context manager, so it has to
        be awaited before entering: ``async with await
        manager.decision_span(...)``.
        """
        metadata.update({
            "decision_point": decision_point,
            "trace_subtype": "decision"
        })
        return self.span(f"decision_{decision_point}", TraceType.DECISION_TRACE, agent_id, **metadata)

    async def tool_span(self, tool_name: str, operation: str, agent_id: Optional[str] = None, **metadata):
        """Return a ``span`` context manager for a tool operation.

        This is a coroutine that returns the context manager, so it has to
        be awaited before entering: ``async with await
        manager.tool_span(...)``.
        """
        metadata.update({
            "tool_name": tool_name,
            "tool_operation": operation
        })
        return self.span(f"tool_{tool_name}_{operation}", TraceType.TOOL_EXECUTION, agent_id, **metadata)

    # Query methods

    def get_recent_operations(self, agent_id: Optional[str] = None, limit: int = 10) -> List[Dict[str, Any]]:
        """Return up to ``limit`` completed spans as dicts, newest first.

        Args:
            agent_id: When given, only spans of this agent are returned.
            limit: Maximum number of entries; zero or negative yields ``[]``.
        """
        try:
            # Guard explicitly: spans[-0:] would return *every* span.
            if limit <= 0:
                return []

            with self._lock:
                spans = list(self._completed_spans)

            if agent_id:
                spans = [s for s in spans if s.agent_id == agent_id]

            # Most recent first
            return [s.to_dict() for s in reversed(spans[-limit:])]
        except Exception as e:
            logger.error(f"Error getting recent operations: {e}")
            return []

    def get_global_metrics(self) -> Dict[str, Any]:
        """Return global counters plus active-span and dashboard counts."""
        with self._lock:
            return {
                **self._metrics,
                "active_spans": len(self._active_spans),
                "dashboard_reports_sent": self.dashboard_reporter.reports_sent,
                "dashboard_reports_failed": self.dashboard_reporter.reports_failed
            }

    def get_agent_metrics(self, agent_id: str) -> Dict[str, Any]:
        """Return success/failure counts and average latency for an agent.

        Only spans still present in the completed-span history are counted.
        """
        try:
            with self._lock:
                spans = [s for s in self._completed_spans if s.agent_id == agent_id]

            if not spans:
                return {"total_operations": 0, "success_rate": 0}

            total_ops = len(spans)
            successful_ops = sum(1 for s in spans if s.status == TraceStatus.SUCCESS)

            # Average latency; a measured 0.0 ms is a valid sample.
            latencies = [s.duration_ms for s in spans if s.duration_ms is not None]
            avg_latency = sum(latencies) / len(latencies) if latencies else 0

            return {
                "total_operations": total_ops,
                "successful_operations": successful_ops,
                "failed_operations": total_ops - successful_ops,
                "success_rate": successful_ops / total_ops if total_ops > 0 else 0,
                "avg_latency_ms": avg_latency
            }
        except Exception as e:
            logger.error(f"Error getting agent metrics: {e}")
            return {"total_operations": 0, "success_rate": 0}

    def get_workflow_communications(self, workflow_name: Optional[str] = None, limit: int = 20) -> List[Dict[str, Any]]:
        """
        Get workflow communication traces, newest first.

        Returns completed ``WORKFLOW_COMMUNICATION`` spans as dicts, each
        extended with ``from_agent``, ``to_agent``, ``channel``,
        ``message_id`` (read from span metadata) and a ``success`` flag.

        Args:
            workflow_name: When given, only spans whose metadata
                ``workflow_name`` matches are returned.
            limit: Maximum number of entries; zero or negative yields ``[]``.
        """
        try:
            # Guard explicitly: comm_spans[-0:] would return everything.
            if limit <= 0:
                return []

            with self._lock:
                comm_spans = [
                    s for s in self._completed_spans
                    if s.trace_type == TraceType.WORKFLOW_COMMUNICATION
                ]

            if workflow_name:
                comm_spans = [
                    s for s in comm_spans
                    if s.metadata.get('workflow_name') == workflow_name
                ]

            result = []
            # Most recent first
            for span in reversed(comm_spans[-limit:]):
                comm_dict = span.to_dict()
                # Add workflow-specific fields from metadata
                comm_dict['from_agent'] = span.metadata.get('from_agent', 'unknown')
                comm_dict['to_agent'] = span.metadata.get('to_agent', 'unknown')
                comm_dict['channel'] = span.metadata.get('channel', 'unknown')
                comm_dict['message_id'] = span.metadata.get('message_id')
                comm_dict['success'] = span.status == TraceStatus.SUCCESS
                result.append(comm_dict)

            return result

        except Exception as e:
            logger.error(f"Error getting workflow communications: {e}")
            return []

    def get_workflow_metrics(self, workflow_name: str) -> Dict[str, Any]:
        """Return message counts and success rate for one workflow."""
        try:
            with self._lock:
                comm_spans = [
                    s for s in self._completed_spans
                    if s.trace_type == TraceType.WORKFLOW_COMMUNICATION
                    and s.metadata.get('workflow_name') == workflow_name
                ]

            if not comm_spans:
                return {"total_messages": 0, "success_rate": 0}

            total = len(comm_spans)
            successful = sum(1 for s in comm_spans if s.status == TraceStatus.SUCCESS)

            return {
                "workflow_name": workflow_name,
                "total_messages": total,
                "successful_messages": successful,
                "failed_messages": total - successful,
                "success_rate": successful / total if total > 0 else 0
            }
        except Exception as e:
            logger.error(f"Error getting workflow metrics: {e}")
            return {"total_messages": 0, "success_rate": 0}

    # Streaming decision events support

    def register_decision_stream_callback(self, agent_id: str, callback: callable) -> None:
        """Register ``callback`` to receive decision events of ``agent_id``."""
        try:
            with self._lock:
                self._decision_stream_callbacks.setdefault(agent_id, []).append(callback)
            logger.debug(f"Registered decision stream callback for agent {agent_id}")
        except Exception as e:
            logger.error(f"Failed to register decision stream callback: {e}")

    def unregister_decision_stream_callback(self, agent_id: str, callback: callable) -> None:
        """Remove a previously registered callback; unknown ones are ignored."""
        try:
            with self._lock:
                if agent_id in self._decision_stream_callbacks:
                    if callback in self._decision_stream_callbacks[agent_id]:
                        self._decision_stream_callbacks[agent_id].remove(callback)
                    # Drop the agent entry once its last callback is gone.
                    if not self._decision_stream_callbacks[agent_id]:
                        del self._decision_stream_callbacks[agent_id]
            logger.debug(f"Unregistered decision stream callback for agent {agent_id}")
        except Exception as e:
            logger.error(f"Failed to unregister decision stream callback: {e}")

    def emit_decision_event(self, agent_id: Optional[str], decision_event: 'DecisionEvent') -> None:
        """Deliver ``decision_event`` to every callback of ``agent_id``.

        A failing callback is logged and does not stop delivery to the
        remaining callbacks. Nothing happens when ``agent_id`` is falsy.
        """
        if not agent_id:
            return

        try:
            with self._lock:
                # Snapshot: callbacks run outside the lock, so iterate a
                # copy that concurrent (un)registration cannot mutate.
                callbacks = list(self._decision_stream_callbacks.get(agent_id, ()))

            for callback in callbacks:
                try:
                    callback(decision_event)
                except Exception as e:
                    logger.warning(f"Decision stream callback failed for agent {agent_id}: {e}")
        except Exception as e:
            logger.error(f"Failed to emit decision event: {e}")

    def get_streaming_agents(self) -> List[str]:
        """Return the agent ids that currently have callbacks registered."""
        with self._lock:
            return list(self._decision_stream_callbacks.keys())
665
+
666
# Process-wide manager instance, built lazily on first request
_global_trace_manager: Optional[TraceManager] = None
_manager_lock = threading.Lock()


def get_trace_manager() -> TraceManager:
    """Return the shared trace manager, creating it on first use.

    Creation happens under a lock and the instance is re-checked after the
    lock is acquired, so concurrent first calls build only one manager. If
    building the real manager fails, a no-op stand-in is installed instead
    and the failure is logged.
    """
    global _global_trace_manager

    # Fast path: already built, no locking needed.
    if _global_trace_manager is not None:
        return _global_trace_manager

    with _manager_lock:
        # Another thread may have finished building while we waited.
        if _global_trace_manager is None:
            try:
                _global_trace_manager = TraceManager()
                logger.info("TraceManager successfully initialized")
            except Exception as e:
                logger.error(f"Failed to initialize TraceManager: {e}")
                # Fall back to a manager that accepts calls but does nothing
                _global_trace_manager = _create_safe_noop_manager()

    return _global_trace_manager
684
+
685
def _create_safe_noop_manager():
    """Build a stand-in trace manager whose methods all do nothing.

    Used when the real manager cannot be initialized. The stand-in accepts
    the same calls as the real manager's span, recording, query and
    decision-stream methods and returns empty or zero results, so code that
    uses tracing keeps working without it.
    """
    logger.warning("Using no-op TraceManager due to initialization failure")

    class SafeNoOpTraceManager:
        def __init__(self):
            # Minimal reporter look-alike exposing the attributes callers read.
            self.dashboard_reporter = type('obj', (object,), {
                'enabled': False,
                'reports_sent': 0,
                'reports_failed': 0
            })()

        # Span lifecycle

        def start_span(self, *args, **kwargs):
            return f"noop_{uuid.uuid4().hex[:8]}"

        def end_span(self, *args, **kwargs):
            pass

        def record_llm_call(self, *args, **kwargs):
            pass

        def record_decision(self, *args, **kwargs):
            pass

        @asynccontextmanager
        async def span(self, *args, **kwargs):
            yield f"noop_{uuid.uuid4().hex[:8]}"

        async def decision_span(self, *args, **kwargs):
            return self.span(*args, **kwargs)

        async def tool_span(self, *args, **kwargs):
            return self.span(*args, **kwargs)

        # Queries: same result shapes as the real manager with no data

        def get_recent_operations(self, *args, **kwargs):
            return []

        def get_global_metrics(self):
            return {
                "total_spans": 0,
                "total_llm_calls": 0,
                "total_tokens": 0,
                "total_decisions": 0,
                "active_spans": 0,
                "dashboard_reports_sent": 0,
                "dashboard_reports_failed": 0
            }

        def get_agent_metrics(self, *args, **kwargs):
            return {"total_operations": 0, "success_rate": 0}

        def get_workflow_communications(self, *args, **kwargs):
            return []

        def get_workflow_metrics(self, *args, **kwargs):
            return {"total_messages": 0, "success_rate": 0}

        # Decision streaming: callbacks are accepted but never invoked

        def register_decision_stream_callback(self, *args, **kwargs):
            pass

        def unregister_decision_stream_callback(self, *args, **kwargs):
            pass

        def emit_decision_event(self, *args, **kwargs):
            pass

        def get_streaming_agents(self):
            return []

    return SafeNoOpTraceManager()
729
+
730
+ # Legacy compatibility functions (preserved for backward compatibility)
731
def record_tokens(agent_id: str, total_tokens: int = 0, prompt_tokens: int = 0, completion_tokens: int = 0):
    """Accept token counts and discard them (legacy no-op).

    Kept so that older callers keep working; token usage is now captured
    automatically by LLM tracing, so every argument is ignored.
    """
    return None
734
+
735
def get_agent_tokens(agent_id: str) -> Dict[str, int]:
    """Return token statistics for an agent in the legacy dict format.

    The token counts are always zero because the simplified tracer does not
    keep per-agent token totals; only ``requests`` carries real data, taken
    from the agent's ``total_operations`` metric.
    """
    agent_metrics = get_trace_manager().get_agent_metrics(agent_id)

    # Legacy format not supported in simplified version: counts stay at 0
    legacy_stats = dict.fromkeys(("total_tokens", "prompt_tokens", "completion_tokens"), 0)
    legacy_stats["requests"] = agent_metrics.get("total_operations", 0)
    return legacy_stats
744
+
745
def record_operation(agent_id: str, agent_name: str, task: str, input_data: Any,
                     output_data: Any, latency_ms: float, status: str = "success", **kwargs) -> str:
    """Record an already-finished operation as a span (legacy API).

    A span is started and immediately ended, so the span's own
    ``duration_ms`` only measures this call. The latency measured by the
    caller is therefore kept in the span metadata under ``latency_ms``
    instead of being dropped.

    Args:
        agent_id: Id of the agent that performed the operation.
        agent_name: Stored in the span metadata as ``agent_name``.
        task: Used as the span's operation name.
        input_data: Input payload of the operation.
        output_data: Output payload of the operation.
        latency_ms: Latency measured by the caller, in milliseconds.
        status: ``"success"`` maps to a successful span; any other value is
            recorded as an error.
        **kwargs: Only ``error_message`` is read.

    Returns:
        The id of the recorded span.
    """
    trace_manager = get_trace_manager()

    span_id = trace_manager.start_span(
        operation_name=task,
        trace_type=TraceType.AGENT_EXECUTION,
        agent_id=agent_id,
        input_data=input_data,
        agent_name=agent_name
    )

    trace_status = TraceStatus.SUCCESS if status == "success" else TraceStatus.ERROR
    trace_manager.end_span(
        span_id=span_id,
        status=trace_status,
        output_data=output_data,
        error_message=kwargs.get("error_message"),
        # Extra keyword arguments of end_span are merged into span metadata.
        latency_ms=latency_ms
    )

    return span_id
767
+
768
def get_recent_operations(agent_id: Optional[str] = None, limit: int = 50) -> List[Dict[str, Any]]:
    """Return recent operations from the shared trace manager (legacy API).

    Thin wrapper that forwards ``agent_id`` and ``limit`` to the manager's
    method of the same name; note the larger default ``limit`` of 50 here.
    """
    manager = get_trace_manager()
    return manager.get_recent_operations(agent_id, limit)