daita-agents 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of daita-agents might be problematic. Click here for more details.

Files changed (69) hide show
  1. daita/__init__.py +208 -0
  2. daita/agents/__init__.py +33 -0
  3. daita/agents/base.py +722 -0
  4. daita/agents/substrate.py +895 -0
  5. daita/cli/__init__.py +145 -0
  6. daita/cli/__main__.py +7 -0
  7. daita/cli/ascii_art.py +44 -0
  8. daita/cli/core/__init__.py +0 -0
  9. daita/cli/core/create.py +254 -0
  10. daita/cli/core/deploy.py +473 -0
  11. daita/cli/core/deployments.py +309 -0
  12. daita/cli/core/import_detector.py +219 -0
  13. daita/cli/core/init.py +382 -0
  14. daita/cli/core/logs.py +239 -0
  15. daita/cli/core/managed_deploy.py +709 -0
  16. daita/cli/core/run.py +648 -0
  17. daita/cli/core/status.py +421 -0
  18. daita/cli/core/test.py +239 -0
  19. daita/cli/core/webhooks.py +172 -0
  20. daita/cli/main.py +588 -0
  21. daita/cli/utils.py +541 -0
  22. daita/config/__init__.py +62 -0
  23. daita/config/base.py +159 -0
  24. daita/config/settings.py +184 -0
  25. daita/core/__init__.py +262 -0
  26. daita/core/decision_tracing.py +701 -0
  27. daita/core/exceptions.py +480 -0
  28. daita/core/focus.py +251 -0
  29. daita/core/interfaces.py +76 -0
  30. daita/core/plugin_tracing.py +550 -0
  31. daita/core/relay.py +695 -0
  32. daita/core/reliability.py +381 -0
  33. daita/core/scaling.py +444 -0
  34. daita/core/tools.py +402 -0
  35. daita/core/tracing.py +770 -0
  36. daita/core/workflow.py +1084 -0
  37. daita/display/__init__.py +1 -0
  38. daita/display/console.py +160 -0
  39. daita/execution/__init__.py +58 -0
  40. daita/execution/client.py +856 -0
  41. daita/execution/exceptions.py +92 -0
  42. daita/execution/models.py +317 -0
  43. daita/llm/__init__.py +60 -0
  44. daita/llm/anthropic.py +166 -0
  45. daita/llm/base.py +373 -0
  46. daita/llm/factory.py +101 -0
  47. daita/llm/gemini.py +152 -0
  48. daita/llm/grok.py +114 -0
  49. daita/llm/mock.py +135 -0
  50. daita/llm/openai.py +109 -0
  51. daita/plugins/__init__.py +141 -0
  52. daita/plugins/base.py +37 -0
  53. daita/plugins/base_db.py +167 -0
  54. daita/plugins/elasticsearch.py +844 -0
  55. daita/plugins/mcp.py +481 -0
  56. daita/plugins/mongodb.py +510 -0
  57. daita/plugins/mysql.py +351 -0
  58. daita/plugins/postgresql.py +331 -0
  59. daita/plugins/redis_messaging.py +500 -0
  60. daita/plugins/rest.py +529 -0
  61. daita/plugins/s3.py +761 -0
  62. daita/plugins/slack.py +729 -0
  63. daita/utils/__init__.py +18 -0
  64. daita_agents-0.1.0.dist-info/METADATA +350 -0
  65. daita_agents-0.1.0.dist-info/RECORD +69 -0
  66. daita_agents-0.1.0.dist-info/WHEEL +5 -0
  67. daita_agents-0.1.0.dist-info/entry_points.txt +2 -0
  68. daita_agents-0.1.0.dist-info/licenses/LICENSE +56 -0
  69. daita_agents-0.1.0.dist-info/top_level.txt +1 -0
daita/core/workflow.py ADDED
@@ -0,0 +1,1084 @@
1
+ """
2
+ Simplified Workflow System for Daita Agents.
3
+
4
+ Provides orchestration of agents as connected systems with automatic tracing.
5
+ All workflow communication is automatically traced through the unified tracing system.
6
+
7
+ Example:
8
+ ```python
9
+ from daita.core.workflow import Workflow
10
+
11
+ # Create agents
12
+ fetcher = sdk.substrate_agent(name="Data Fetcher")
13
+ analyzer = sdk.analysis_agent(name="Analyzer")
14
+
15
+ # Create workflow
16
+ workflow = Workflow("Data Pipeline")
17
+ workflow.add_agent("fetcher", fetcher)
18
+ workflow.add_agent("analyzer", analyzer)
19
+
20
+ # Connect agents via relay channels
21
+ workflow.connect("fetcher", "raw_data", "analyzer")
22
+
23
+ # Start workflow
24
+ await workflow.start()
25
+
26
+ # View recent communication in unified dashboard
27
+ # All workflow communication is automatically traced
28
+ ```
29
+ """
30
+ import asyncio
31
+ import logging
32
+ import time
33
+ from typing import Dict, Any, Optional, List, Tuple, Set
34
+ from dataclasses import dataclass
35
+ from enum import Enum
36
+ from datetime import datetime
37
+
38
+ from ..core.exceptions import DaitaError, WorkflowError, BackpressureError
39
+ from ..core.relay import RelayManager, get_global_relay
40
+ from ..core.tracing import get_trace_manager, TraceType, TraceStatus
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
class WorkflowStatus(str, Enum):
    """Lifecycle states of a workflow.

    Subclasses ``str`` so members compare equal to their plain string
    values (e.g. ``WorkflowStatus.RUNNING == "running"``) and serialize
    cleanly to JSON/logs.
    """
    CREATED = "created"      # constructed, never started
    STARTING = "starting"    # start() in progress
    RUNNING = "running"      # all agents started, subscriptions active
    STOPPING = "stopping"    # stop() in progress
    STOPPED = "stopped"      # cleanly shut down
    ERROR = "error"          # start()/stop() raised; see Workflow.error
52
+
53
@dataclass
class ReliabilityConfig:
    """Feature switches for workflow reliability.

    Every feature defaults to enabled; ``Workflow.configure_reliability``
    builds instances of this and stores them on the workflow.
    """
    acknowledgments: bool = True        # ACK/NACK messages via the relay manager
    task_tracking: bool = True          # enable agents' task lifecycle tracking
    backpressure_control: bool = True   # guard agents with backpressure slots
59
+
60
@dataclass
class Connection:
    """A directed edge in the workflow graph.

    Messages published by ``from_agent`` on ``channel`` are delivered to
    ``to_agent``, which processes them under the given ``task`` name.
    """
    from_agent: str
    channel: str
    to_agent: str
    task: str = "relay_message"

    def __str__(self):
        # Human-readable edge, e.g. "fetcher -> raw_data -> analyzer".
        return " -> ".join((self.from_agent, self.channel, self.to_agent))
70
+
71
+ class Workflow:
72
+ """
73
+ A workflow manages a collection of agents and their connections.
74
+
75
+ All workflow communication is automatically traced through the unified
76
+ tracing system without any configuration required.
77
+ """
78
+
79
    def __init__(
        self,
        name: str,
        project_id: Optional[str] = None,
        relay_manager: Optional[RelayManager] = None
    ):
        """
        Initialize a workflow.

        Args:
            name: Workflow name
            project_id: Optional project ID this workflow belongs to
            relay_manager: Relay manager for agent communication; when omitted
                (or falsy), the process-global relay from get_global_relay()
                is used.
        """
        self.name = name
        self.project_id = project_id
        # NOTE: `or` (not `is None`) — a falsy relay_manager also falls back
        # to the global relay.
        self.relay_manager = relay_manager or get_global_relay()

        # Agent storage: agent_name -> agent_instance
        self.agents: Dict[str, Any] = {}

        # Agent pools: pool_name -> AgentPool instance (for horizontal scaling)
        self.agent_pools: Dict[str, Any] = {}

        # Connections: list of Connection objects (directed edges of the graph)
        self.connections: List[Connection] = []

        # Relay channels used by this workflow (populated by connect())
        self.channels: Set[str] = set()

        # Reliability configuration; None until configure_reliability() runs
        self.reliability_config: Optional[ReliabilityConfig] = None
        self._reliability_enabled = False

        # Workflow state (timestamps are time.time() epoch seconds)
        self.status = WorkflowStatus.CREATED
        self.created_at = time.time()
        self.started_at: Optional[float] = None
        self.stopped_at: Optional[float] = None
        self.error: Optional[str] = None

        # Subscription tracking for cleanup: (channel, callback) pairs
        self._subscriptions: List[Tuple[str, Any]] = []

        # Message deduplication (only for reliable mode)
        self._processed_messages: Set[str] = set()
        self._dedup_cleanup_task: Optional[asyncio.Task] = None
        self._dedup_max_size = 10000  # Prevent unbounded growth

        # Get trace manager for automatic workflow communication tracing
        self.trace_manager = get_trace_manager()

        logger.debug(f"Created workflow '{name}' with automatic tracing")
132
+
133
+ def add_agent(self, name: str, agent: Any) -> "Workflow":
134
+ """
135
+ Add an agent to the workflow.
136
+
137
+ Args:
138
+ name: Agent name for workflow reference
139
+ agent: Agent instance
140
+
141
+ Returns:
142
+ Self for method chaining
143
+ """
144
+ if name in self.agents:
145
+ raise WorkflowError(f"Agent '{name}' already exists in workflow")
146
+
147
+ self.agents[name] = agent
148
+ logger.debug(f"Added agent '{name}' to workflow '{self.name}'")
149
+ return self
150
+
151
+ def add_agent_pool(
152
+ self,
153
+ name: str,
154
+ agent_factory: Any,
155
+ instances: int = 1
156
+ ) -> "Workflow":
157
+ """
158
+ Add an agent pool to the workflow for horizontal scaling.
159
+
160
+ Args:
161
+ name: Pool name for workflow reference
162
+ agent_factory: Factory function to create agent instances
163
+ instances: Number of agent instances in the pool
164
+
165
+ Returns:
166
+ Self for method chaining
167
+
168
+ Example:
169
+ ```python
170
+ def create_processor():
171
+ return sdk.substrate_agent(name="Processor")
172
+
173
+ workflow.add_agent_pool("processors", create_processor, instances=5)
174
+ ```
175
+ """
176
+ if name in self.agent_pools:
177
+ raise WorkflowError(f"Agent pool '{name}' already exists in workflow")
178
+
179
+ if name in self.agents:
180
+ raise WorkflowError(f"Name '{name}' already used by an agent in workflow")
181
+
182
+ # Import AgentPool here to avoid circular imports
183
+ from ..core.scaling import AgentPool
184
+
185
+ # Create agent pool
186
+ pool = AgentPool(
187
+ agent_factory=agent_factory,
188
+ instances=instances,
189
+ pool_name=f"{self.name}_{name}"
190
+ )
191
+
192
+ self.agent_pools[name] = pool
193
+ logger.debug(f"Added agent pool '{name}' with {instances} instances to workflow '{self.name}'")
194
+ return self
195
+
196
+ def remove_agent(self, name: str) -> bool:
197
+ """
198
+ Remove agent and clean up its connections.
199
+
200
+ Args:
201
+ name: Agent name to remove
202
+
203
+ Returns:
204
+ True if agent was removed, False if not found
205
+ """
206
+ if name not in self.agents:
207
+ return False
208
+
209
+ # Remove agent
210
+ del self.agents[name]
211
+
212
+ # Clean up connections involving this agent
213
+ self.connections = [
214
+ c for c in self.connections
215
+ if c.from_agent != name and c.to_agent != name
216
+ ]
217
+
218
+ # Note: Subscriptions will be cleaned up in _cleanup_connections when workflow stops
219
+
220
+ logger.debug(f"Removed agent '{name}' and cleaned up connections")
221
+ return True
222
+
223
+ def connect(self, from_agent: str, channel: str, to_agent: str, task: str = "relay_message") -> "Workflow":
224
+ """
225
+ Connect two agents via a relay channel.
226
+
227
+ Args:
228
+ from_agent: Source agent name
229
+ channel: Relay channel name
230
+ to_agent: Destination agent name
231
+ task: Task to execute on destination agent
232
+
233
+ Returns:
234
+ Self for method chaining
235
+ """
236
+ # Validate agents exist
237
+ if from_agent not in self.agents:
238
+ raise WorkflowError(f"Source agent '{from_agent}' not found")
239
+ if to_agent not in self.agents:
240
+ raise WorkflowError(f"Destination agent '{to_agent}' not found")
241
+
242
+ # Check if connection already exists
243
+ existing = next(
244
+ (c for c in self.connections if c.from_agent == from_agent
245
+ and c.channel == channel and c.to_agent == to_agent),
246
+ None
247
+ )
248
+
249
+ if existing:
250
+ logger.warning(f"Connection already exists: {existing}")
251
+ return self
252
+
253
+ connection = Connection(from_agent, channel, to_agent, task)
254
+ self.connections.append(connection)
255
+ self.channels.add(channel)
256
+
257
+ logger.debug(f"Connected {from_agent} -> {channel} -> {to_agent}")
258
+ return self
259
+
260
+ def configure_reliability(
261
+ self,
262
+ preset: Optional[str] = None,
263
+ acknowledgments: Optional[bool] = None,
264
+ task_tracking: Optional[bool] = None,
265
+ backpressure_control: Optional[bool] = None
266
+ ) -> "Workflow":
267
+ """
268
+ Configure reliability features for this workflow.
269
+
270
+ Args:
271
+ preset: Predefined configuration preset ("basic", "production", "enterprise")
272
+ acknowledgments: Enable message acknowledgments
273
+ task_tracking: Enable task lifecycle tracking
274
+ backpressure_control: Enable backpressure control
275
+
276
+ Returns:
277
+ Self for method chaining
278
+ """
279
+ # Handle presets
280
+ if preset == "basic":
281
+ config = ReliabilityConfig(
282
+ acknowledgments=True,
283
+ task_tracking=True,
284
+ backpressure_control=True
285
+ )
286
+ elif preset == "production":
287
+ config = ReliabilityConfig(
288
+ acknowledgments=True,
289
+ task_tracking=True,
290
+ backpressure_control=True
291
+ )
292
+ elif preset == "enterprise":
293
+ config = ReliabilityConfig(
294
+ acknowledgments=True,
295
+ task_tracking=True,
296
+ backpressure_control=True
297
+ )
298
+ else:
299
+ # Default configuration or use provided values
300
+ config = ReliabilityConfig(
301
+ acknowledgments=acknowledgments if acknowledgments is not None else True,
302
+ task_tracking=task_tracking if task_tracking is not None else True,
303
+ backpressure_control=backpressure_control if backpressure_control is not None else True
304
+ )
305
+
306
+ # Override individual settings if provided
307
+ if acknowledgments is not None:
308
+ config.acknowledgments = acknowledgments
309
+ if task_tracking is not None:
310
+ config.task_tracking = task_tracking
311
+ if backpressure_control is not None:
312
+ config.backpressure_control = backpressure_control
313
+
314
+ self.reliability_config = config
315
+ self._reliability_enabled = True
316
+
317
+ # Enable reliability in relay manager
318
+ self.relay_manager.enable_reliability = True
319
+
320
+ logger.info(f"Configured reliability for workflow '{self.name}': {config}")
321
+ return self
322
+
323
+ def validate_connections(self) -> List[str]:
324
+ """
325
+ Validate all workflow connections.
326
+
327
+ Returns:
328
+ List of validation error messages (empty if all valid)
329
+ """
330
+ errors = []
331
+
332
+ for conn in self.connections:
333
+ # Check from_agent exists
334
+ if conn.from_agent not in self.agents and conn.from_agent not in self.agent_pools:
335
+ errors.append(f"Source '{conn.from_agent}' not found in workflow")
336
+
337
+ # Check to_agent exists
338
+ if conn.to_agent not in self.agents and conn.to_agent not in self.agent_pools:
339
+ errors.append(f"Destination '{conn.to_agent}' not found in workflow")
340
+
341
+ # Check for circular dependencies (self-loops)
342
+ if conn.from_agent == conn.to_agent:
343
+ errors.append(f"Circular dependency: {conn.from_agent} -> {conn.to_agent}")
344
+
345
+ return errors
346
+
347
    async def start(self) -> None:
        """Start the workflow and all agents with automatic tracing.

        Sequence: validate connections -> trace lifecycle event -> ensure the
        relay manager is running -> apply reliability config -> start agents,
        then pools -> subscribe relay callbacks -> (reliable mode) spawn the
        dedup-cache cleanup task. Any failure flips status to ERROR, records
        the message in self.error, traces a "workflow_error" event and
        re-raises.

        Raises:
            WorkflowError: On invalid connections or any agent/pool/relay
                startup failure.
        """
        # Idempotence guard: calling start() twice is a no-op with a warning.
        if self.status in [WorkflowStatus.RUNNING, WorkflowStatus.STARTING]:
            logger.warning(f"Workflow '{self.name}' is already running")
            return

        try:
            self.status = WorkflowStatus.STARTING
            logger.info(f"Starting workflow '{self.name}'")

            # Validate connections before starting
            validation_errors = self.validate_connections()
            if validation_errors:
                raise WorkflowError(f"Invalid connections: {'; '.join(validation_errors)}")

            # Trace workflow lifecycle start
            await self._trace_workflow_event("workflow_started", {
                "workflow_name": self.name,
                "agent_count": len(self.agents),
                "agent_pool_count": len(self.agent_pools),
                "connection_count": len(self.connections)
            })

            # Ensure relay manager is running.
            # NOTE(review): reaches into the private _running flag of the
            # relay manager — confirm RelayManager exposes no public check.
            if not self.relay_manager._running:
                try:
                    await self.relay_manager.start()
                except Exception as e:
                    raise WorkflowError(f"Failed to start relay manager: {str(e)}")

            # Configure agents with reliability features if enabled
            if self._reliability_enabled and self.reliability_config:
                await self._configure_agents_reliability()

            # Start all agents; agents without a start() method are skipped
            # silently, but a failing start() aborts the whole workflow.
            for agent_name, agent in self.agents.items():
                try:
                    if hasattr(agent, 'start'):
                        await agent.start()
                        logger.debug(f"Started agent '{agent_name}'")
                except Exception as e:
                    logger.error(f"Failed to start agent '{agent_name}': {str(e)}")
                    raise WorkflowError(f"Failed to start agent '{agent_name}': {str(e)}")

            # Start all agent pools (pools must always expose start()).
            for pool_name, pool in self.agent_pools.items():
                try:
                    await pool.start()
                    logger.debug(f"Started agent pool '{pool_name}' with {pool.instance_count} instances")
                except Exception as e:
                    logger.error(f"Failed to start agent pool '{pool_name}': {str(e)}")
                    raise WorkflowError(f"Failed to start agent pool '{pool_name}': {str(e)}")

            # Set up relay connections with automatic tracing
            await self._setup_connections()

            # Start dedup cleanup task if reliability enabled
            if self._reliability_enabled:
                self._dedup_cleanup_task = asyncio.create_task(self._cleanup_dedup_cache())

            # Update status
            self.status = WorkflowStatus.RUNNING
            self.started_at = time.time()

            logger.info(f"Workflow '{self.name}' started successfully")

        except Exception as e:
            self.status = WorkflowStatus.ERROR
            self.error = str(e)
            logger.error(f"Failed to start workflow '{self.name}': {str(e)}")

            # Trace workflow error
            await self._trace_workflow_event("workflow_error", {
                "workflow_name": self.name,
                "error": str(e)
            })
            raise
424
+
425
    async def stop(self) -> None:
        """Stop the workflow by stopping all agents and cleaning up connections.

        Sequence: cancel the dedup cleanup task -> unsubscribe relay
        callbacks -> stop agents, then pools (failures here are logged, not
        raised) -> record STOPPED status and timestamp -> trace a
        "workflow_stopped" event. Unexpected failures outside the per-agent
        loops set status to ERROR and re-raise.
        """
        # Idempotence guard: stopping twice is a no-op with a warning.
        if self.status in [WorkflowStatus.STOPPED, WorkflowStatus.STOPPING]:
            logger.warning(f"Workflow '{self.name}' is already stopped")
            return

        try:
            self.status = WorkflowStatus.STOPPING
            logger.info(f"Stopping workflow '{self.name}'")

            # Stop dedup cleanup task and wait for it to acknowledge the
            # cancellation before continuing.
            if self._dedup_cleanup_task:
                self._dedup_cleanup_task.cancel()
                try:
                    await self._dedup_cleanup_task
                except asyncio.CancelledError:
                    pass
                self._dedup_cleanup_task = None

            # Clean up relay subscriptions
            await self._cleanup_connections()

            # Stop all agents — best-effort: one agent's failure must not
            # block the rest from stopping.
            for agent_name, agent in self.agents.items():
                try:
                    if hasattr(agent, 'stop'):
                        await agent.stop()
                        logger.debug(f"Stopped agent '{agent_name}'")
                except Exception as e:
                    logger.warning(f"Error stopping agent '{agent_name}': {str(e)}")

            # Stop all agent pools (same best-effort policy).
            for pool_name, pool in self.agent_pools.items():
                try:
                    await pool.stop()
                    logger.debug(f"Stopped agent pool '{pool_name}'")
                except Exception as e:
                    logger.warning(f"Error stopping agent pool '{pool_name}': {str(e)}")

            self.status = WorkflowStatus.STOPPED
            self.stopped_at = time.time()

            # Trace workflow lifecycle stop; if start() never ran,
            # running_time_seconds degrades to 0 via the fallback.
            await self._trace_workflow_event("workflow_stopped", {
                "workflow_name": self.name,
                "running_time_seconds": self.stopped_at - (self.started_at or self.stopped_at)
            })

            logger.info(f"Workflow '{self.name}' stopped")

        except Exception as e:
            self.status = WorkflowStatus.ERROR
            self.error = str(e)
            logger.error(f"Error stopping workflow '{self.name}': {str(e)}")
            raise
480
+
481
+ async def _setup_connections(self) -> None:
482
+ """Set up relay connections between agents with automatic tracing."""
483
+ for connection in self.connections:
484
+ try:
485
+ # Create callback based on reliability configuration
486
+ if self._reliability_enabled and self.reliability_config:
487
+ callback = self._create_reliable_callback(connection)
488
+ else:
489
+ callback = self._create_traced_callback(connection)
490
+
491
+ # Subscribe to the relay channel
492
+ await self.relay_manager.subscribe(connection.channel, callback)
493
+ self._subscriptions.append((connection.channel, callback))
494
+
495
+ logger.debug(f"Set up connection: {connection} (reliability: {self._reliability_enabled})")
496
+
497
+ except Exception as e:
498
+ logger.error(f"Failed to set up connection {connection}: {str(e)}")
499
+ raise WorkflowError(f"Failed to set up connection {connection}: {str(e)}")
500
+
501
    def _create_traced_callback(self, connection: Connection):
        """Create a callback that automatically traces workflow communication.

        Returns an async closure bound to *connection*. On each relay message
        it records a communication trace, then dispatches the payload to the
        destination: a single agent's process() or an agent pool's
        submit_task(). Errors are traced and logged but never re-raised, so
        the relay loop keeps running (non-reliable mode has no ack/nack).
        """
        async def traced_callback(data: Any):
            """Callback that processes relay data and automatically traces communication."""
            try:
                # Trace first, optimistically marking success; a failure below
                # is traced separately in the except branch.
                await self._trace_workflow_communication(
                    from_agent=connection.from_agent,
                    to_agent=connection.to_agent,
                    channel=connection.channel,
                    data=data,
                    success=True
                )

                # Process the data with the destination agent or agent pool
                if connection.to_agent in self.agents:
                    # Single agent
                    dest_agent = self.agents[connection.to_agent]
                    if hasattr(dest_agent, 'process'):
                        await dest_agent.process(connection.task, data, {
                            'source_agent': connection.from_agent,
                            'channel': connection.channel,
                            'workflow': self.name
                        })
                    else:
                        logger.warning(f"Agent '{connection.to_agent}' has no process method")

                elif connection.to_agent in self.agent_pools:
                    # Agent pool - submit task to pool
                    pool = self.agent_pools[connection.to_agent]
                    await pool.submit_task(connection.task, data, {
                        'source_agent': connection.from_agent,
                        'channel': connection.channel,
                        'workflow': self.name
                    })

                else:
                    # Endpoint disappeared after connect() (e.g. remove_agent).
                    logger.error(f"Destination '{connection.to_agent}' not found in agents or agent pools")

            except Exception as e:
                # Trace communication error
                await self._trace_workflow_communication(
                    from_agent=connection.from_agent,
                    to_agent=connection.to_agent,
                    channel=connection.channel,
                    data=data,
                    success=False,
                    error_message=str(e)
                )
                logger.error(f"Error in workflow communication {connection}: {str(e)}")

        return traced_callback
553
+
554
    async def _configure_agents_reliability(self) -> None:
        """Apply the workflow's reliability config to each agent, best-effort.

        Agents are probed with hasattr(); those lacking the hooks are left
        untouched, and per-agent failures are logged without aborting the
        loop.
        """
        if not self.reliability_config:
            return

        for agent_name, agent in self.agents.items():
            try:
                # Enable reliability features if agent supports it
                if hasattr(agent, 'enable_reliability_features') and self.reliability_config.task_tracking:
                    agent.enable_reliability_features()
                    logger.debug(f"Enabled reliability features for agent '{agent_name}'")

                # Configure backpressure if supported and not already set —
                # an existing controller is never replaced.
                if (hasattr(agent, 'backpressure_controller') and
                    self.reliability_config.backpressure_control and
                    agent.backpressure_controller is None):
                    # Import here to avoid circular imports.
                    from ..core.reliability import BackpressureController
                    # NOTE(review): limits are hard-coded here (10 concurrent
                    # tasks, queue of 100) — consider surfacing them in
                    # ReliabilityConfig.
                    agent.backpressure_controller = BackpressureController(
                        max_concurrent_tasks=10,
                        max_queue_size=100,
                        agent_id=getattr(agent, 'agent_id', agent_name)
                    )
                    logger.debug(f"Configured backpressure control for agent '{agent_name}'")

            except Exception as e:
                logger.warning(f"Failed to configure reliability for agent '{agent_name}': {e}")
580
+
581
    def _create_reliable_callback(self, connection: Connection):
        """Create a callback with reliability features enabled.

        Returns an async closure that, per message: deduplicates by
        message_id, traces the communication, dispatches to the destination
        agent (optionally gated by its backpressure controller), then ACKs on
        success or NACKs on failure. Unlike the plain traced callback, errors
        here are wrapped in WorkflowError and re-raised so the relay layer
        can retry.

        NOTE(review): only self.agents is consulted here — agent-pool
        destinations are not handled in reliable mode; confirm whether that
        is intentional.
        """
        async def reliable_callback(data: Any, message_id: Optional[str] = None):
            """Callback that processes relay data with reliability features."""
            start_time = time.time()

            # Deduplication check (only if message_id provided)
            if message_id:
                if message_id in self._processed_messages:
                    logger.debug(f"Skipping duplicate message {message_id}")
                    # Still ACK it since we processed it before (idempotency)
                    if self.reliability_config and self.reliability_config.acknowledgments:
                        await self.relay_manager.ack_message(message_id)
                    return

                # Mark as processing BEFORE handling; removed again in the
                # except branch so a failed message can be retried.
                self._processed_messages.add(message_id)

            try:
                # Automatically trace the workflow communication
                await self._trace_workflow_communication(
                    from_agent=connection.from_agent,
                    to_agent=connection.to_agent,
                    channel=connection.channel,
                    data=data,
                    success=True,
                    message_id=message_id
                )

                # Process the data with the destination agent
                dest_agent = self.agents[connection.to_agent]
                if hasattr(dest_agent, 'process'):
                    context = {
                        'source_agent': connection.from_agent,
                        'channel': connection.channel,
                        'workflow': self.name,
                        'message_id': message_id,
                        'reliability_enabled': True
                    }

                    # Handle backpressure if enabled
                    if (self.reliability_config and
                        self.reliability_config.backpressure_control and
                        hasattr(dest_agent, 'backpressure_controller') and
                        dest_agent.backpressure_controller):

                        # Check if agent can handle the task
                        if not await dest_agent.backpressure_controller.acquire_processing_slot():
                            raise BackpressureError(
                                f"Agent {connection.to_agent} queue is full",
                                agent_id=getattr(dest_agent, 'agent_id', connection.to_agent)
                            )

                        # Slot is always released, even when process() raises.
                        try:
                            await dest_agent.process(connection.task, data, context)
                        finally:
                            dest_agent.backpressure_controller.release_processing_slot()
                    else:
                        await dest_agent.process(connection.task, data, context)

                    # Acknowledge message if reliability is enabled and message_id provided
                    if message_id and self.reliability_config and self.reliability_config.acknowledgments:
                        await self.relay_manager.ack_message(message_id)

                else:
                    logger.warning(f"Agent '{connection.to_agent}' has no process method")
                    # NACK the message since we can't process it
                    if message_id and self.reliability_config and self.reliability_config.acknowledgments:
                        await self.relay_manager.nack_message(message_id, "Agent has no process method")

            except Exception as e:
                # Remove from processed on error so it can be retried
                if message_id:
                    self._processed_messages.discard(message_id)

                # Trace communication error
                await self._trace_workflow_communication(
                    from_agent=connection.from_agent,
                    to_agent=connection.to_agent,
                    channel=connection.channel,
                    data=data,
                    success=False,
                    error_message=str(e),
                    message_id=message_id
                )

                # NACK the message on error
                if message_id and self.reliability_config and self.reliability_config.acknowledgments:
                    await self.relay_manager.nack_message(message_id, str(e))

                # Enhanced error propagation: wrap with processing context.
                error_context = {
                    'connection': str(connection),
                    'processing_time': time.time() - start_time,
                    'message_id': message_id,
                    'workflow': self.name
                }

                workflow_error = WorkflowError(
                    f"Agent {connection.to_agent} failed processing {connection.task}: {str(e)}",
                    workflow_name=self.name,
                    context=error_context
                )

                logger.error(f"Error in reliable workflow communication {connection}: {str(e)}")

                raise workflow_error

        return reliable_callback
690
+
691
+ async def _cleanup_connections(self) -> None:
692
+ """Clean up relay subscriptions."""
693
+ for channel, callback in self._subscriptions:
694
+ try:
695
+ self.relay_manager.unsubscribe(channel, callback)
696
+ except Exception as e:
697
+ logger.warning(f"Error cleaning up subscription for channel '{channel}': {str(e)}")
698
+
699
+ self._subscriptions.clear()
700
+ logger.debug("Cleaned up relay subscriptions")
701
+
702
+ async def _cleanup_dedup_cache(self) -> None:
703
+ """Periodic cleanup of dedup cache to prevent memory leaks."""
704
+ while self.status == WorkflowStatus.RUNNING:
705
+ try:
706
+ await asyncio.sleep(300) # Every 5 minutes
707
+
708
+ # Simple size-based cleanup
709
+ if len(self._processed_messages) > self._dedup_max_size:
710
+ logger.warning(
711
+ f"Dedup cache exceeded {self._dedup_max_size} entries, clearing"
712
+ )
713
+ self._processed_messages.clear()
714
+ except asyncio.CancelledError:
715
+ break
716
+ except Exception as e:
717
+ logger.error(f"Error in dedup cleanup: {e}")
718
+
719
    async def inject_data(self, agent_name: str, data: Any, task: str = "inject") -> None:
        """
        Inject data into a specific agent to trigger workflow processing.

        Args:
            agent_name: Name of the agent to inject data into
            data: Data to inject
            task: Task to execute

        Raises:
            WorkflowError: If the workflow is not RUNNING or the agent is
                unknown. Processing errors inside the agent are deliberately
                swallowed (logged only) so the workflow keeps running.
        """
        if self.status != WorkflowStatus.RUNNING:
            raise WorkflowError(f"Cannot inject data - workflow is not running (status: {self.status})")

        if agent_name not in self.agents:
            raise WorkflowError(f"Agent '{agent_name}' not found in workflow")

        agent = self.agents[agent_name]

        try:
            # Trace data injection
            await self._trace_workflow_event("data_injected", {
                "workflow_name": self.name,
                "target_agent": agent_name,
                "task": task,
                "data_type": type(data).__name__
            })

            # Process data with the agent; the 'injection' flag lets agents
            # distinguish injected data from relayed data.
            if hasattr(agent, 'process'):
                await agent.process(task, data, {'workflow': self.name, 'injection': True})
                logger.debug(f"Injected data into agent '{agent_name}' in workflow '{self.name}'")
            else:
                logger.warning(f"Agent '{agent_name}' has no process method")

        except Exception as e:
            # Log the error but don't raise it - allow workflow to continue
            logger.error(f"Error injecting data into agent '{agent_name}': {str(e)}")
755
+
756
+ # Tracing methods for workflow events
757
+
758
    async def _trace_workflow_communication(
        self,
        from_agent: str,
        to_agent: str,
        channel: str,
        data: Any,
        success: bool,
        error_message: Optional[str] = None,
        message_id: Optional[str] = None
    ) -> None:
        """Trace workflow communication using the unified tracing system.

        Opens and immediately closes a WORKFLOW_COMMUNICATION span carrying
        the edge metadata and a truncated (200-char) payload preview. Tracing
        failures are logged and swallowed — tracing must never break message
        delivery.
        """
        try:
            # Create a communication span
            span_id = self.trace_manager.start_span(
                operation_name=f"workflow_communication",
                trace_type=TraceType.WORKFLOW_COMMUNICATION,
                input_data={
                    "from_agent": from_agent,
                    "to_agent": to_agent,
                    "channel": channel,
                    # Truncated preview; falsy data (None, "", 0) yields None.
                    "data_preview": str(data)[:200] if data else None
                },
                workflow_name=self.name,
                from_agent=from_agent,
                to_agent=to_agent,
                channel=channel,
                data_type=type(data).__name__,
                message_id=message_id,
                reliability_enabled=str(self._reliability_enabled)
            )

            # End the span with the result
            self.trace_manager.end_span(
                span_id=span_id,
                status=TraceStatus.SUCCESS if success else TraceStatus.ERROR,
                output_data={"communication_processed": success},
                error_message=error_message
            )

        except Exception as e:
            logger.warning(f"Failed to trace workflow communication: {e}")
799
+
800
    async def _trace_workflow_event(self, event_type: str, event_data: Dict[str, Any]) -> None:
        """Trace general workflow events using the unified tracing system.

        Used for lifecycle events ("workflow_started", "workflow_stopped",
        "workflow_error", "data_injected"). Emits a single open/close span;
        tracing failures are logged and swallowed.
        """
        try:
            span_id = self.trace_manager.start_span(
                operation_name=f"workflow_{event_type}",
                trace_type=TraceType.WORKFLOW_COMMUNICATION,
                input_data=event_data,
                metadata={
                    "workflow_name": self.name,
                    "event_type": event_type
                }
            )

            self.trace_manager.end_span(
                span_id=span_id,
                status=TraceStatus.SUCCESS,
                output_data={"event_recorded": True}
            )

        except Exception as e:
            logger.warning(f"Failed to trace workflow event: {e}")
821
+
822
+ # Simplified query methods using unified tracing
823
+
824
def get_recent_communication(self, limit: int = 20) -> List[Dict[str, Any]]:
    """Return recent workflow communication events from the unified tracing system.

    Args:
        limit: Maximum number of communication events to return.

    Returns:
        A list of recent communication events; empty list on any lookup failure.
    """
    try:
        return self.trace_manager.get_recent_operations(limit=limit)
    except Exception as e:
        # Best-effort query: never let tracing failures propagate to callers.
        logger.warning(f"Failed to get recent communication: {e}")
        return []
840
+
841
def get_communication_log(self, count: int = 20) -> List[Dict[str, Any]]:
    """Alias for ``get_recent_communication``.

    Args:
        count: Maximum number of communication events to return.

    Returns:
        A list of recent workflow communication events.
    """
    return self.get_recent_communication(limit=count)
852
+
853
def get_workflow_stats(self) -> Dict[str, Any]:
    """Return workflow-level metrics from the unified tracing system.

    Returns:
        The tracer's metrics dict for this workflow; empty dict on failure.
    """
    try:
        return self.trace_manager.get_workflow_metrics(self.name)
    except Exception as e:
        logger.warning(f"Failed to get workflow stats: {e}")
        return {}
862
+
863
+ # Basic workflow information methods
864
+
865
def get_agent(self, name: str) -> Optional[Any]:
    """Look up a registered agent by name; returns None when absent."""
    return self.agents.get(name)
868
+
869
def list_agents(self) -> List[str]:
    """Return the names of every agent registered in this workflow."""
    return [agent_name for agent_name in self.agents]
872
+
873
def list_connections(self) -> List[str]:
    """Return a human-readable string for each configured connection."""
    return [str(connection) for connection in self.connections]
876
+
877
def get_channel_data(self, channel: str, count: int = 1) -> List[Any]:
    """Return the latest *count* items from a relay channel via the relay manager."""
    return self.relay_manager.get_latest(channel, count)
880
+
881
def get_stats(self) -> Dict[str, Any]:
    """Get comprehensive workflow statistics including reliability metrics.

    Returns:
        Dictionary of workflow identity, lifecycle timestamps, component
        counts, and — when reliability is enabled — the reliability
        configuration plus pending-message/timeout counts from the relay
        manager.
    """
    # Elapsed run time is only meaningful once the workflow has started;
    # a still-running workflow is measured against the current time.
    elapsed = None
    if self.started_at:
        elapsed = (self.stopped_at or time.time()) - self.started_at

    stats: Dict[str, Any] = {
        'name': self.name,
        'project_id': self.project_id,
        'status': self.status.value,
        'agent_count': len(self.agents),
        'connection_count': len(self.connections),
        'channel_count': len(self.channels),
        'created_at': self.created_at,
        'started_at': self.started_at,
        'stopped_at': self.stopped_at,
        'running_time': elapsed,
        'error': self.error,
        'agents': list(self.agents.keys()),
        'channels': list(self.channels),
        'reliability_enabled': self._reliability_enabled,
    }

    # Reliability extras are only reported when the feature is configured.
    if self._reliability_enabled and self.reliability_config:
        stats['reliability_config'] = {
            'acknowledgments': self.reliability_config.acknowledgments,
            'task_tracking': self.reliability_config.task_tracking,
            'backpressure_control': self.reliability_config.backpressure_control,
        }

        relay_stats = self.relay_manager.get_stats()
        if 'pending_messages' in relay_stats:
            stats['pending_messages'] = relay_stats['pending_messages']
            stats['active_timeouts'] = relay_stats['active_timeouts']

    return stats
920
+
921
+ # Reliability management methods
922
+
923
def get_pending_messages(self) -> List[Dict[str, Any]]:
    """Return messages still awaiting acknowledgment; empty when reliability is off."""
    if self._reliability_enabled:
        return self.relay_manager.get_pending_messages()
    return []
928
+
929
async def get_agent_reliability_stats(self, agent_name: str) -> Dict[str, Any]:
    """Collect reliability statistics for a single agent.

    Args:
        agent_name: Name of a registered agent.

    Returns:
        Dict with task counts grouped by status and backpressure stats when
        the agent exposes them; empty dict for unknown agents. Sub-collectors
        are best-effort: their failures are logged, not raised.
    """
    if agent_name not in self.agents:
        return {}

    agent = self.agents[agent_name]
    stats: Dict[str, Any] = {'agent_name': agent_name, 'reliability_enabled': False}

    if hasattr(agent, 'enable_reliability') and agent.enable_reliability:
        stats['reliability_enabled'] = True

        # Task-tracking stats: bucket the agent's tasks by their status field.
        if hasattr(agent, 'get_agent_tasks'):
            try:
                tasks = await agent.get_agent_tasks()
                stats['total_tasks'] = len(tasks)
                stats['tasks_by_status'] = {}
                for task in tasks:
                    bucket = task.get('status', 'unknown')
                    stats['tasks_by_status'][bucket] = stats['tasks_by_status'].get(bucket, 0) + 1
            except Exception as e:
                logger.warning(f"Failed to get task stats for agent {agent_name}: {e}")

        # Backpressure stats, when the agent exposes them.
        if hasattr(agent, 'get_backpressure_stats'):
            try:
                stats['backpressure'] = agent.get_backpressure_stats()
            except Exception as e:
                logger.warning(f"Failed to get backpressure stats for agent {agent_name}: {e}")

    return stats
961
+
962
def is_reliability_enabled(self) -> bool:
    """Whether reliability features are active for this workflow."""
    return self._reliability_enabled
965
+
966
def get_reliability_config(self) -> Optional[ReliabilityConfig]:
    """Return the active reliability configuration, or None when not configured."""
    return self.reliability_config
969
+
970
def get_token_usage(self) -> Optional[Dict[str, Any]]:
    """
    Aggregate token usage from all agents in the workflow.

    Each agent is probed two ways: a ``get_token_usage()`` method on the
    agent itself, or — failing that — a ``get_token_usage()`` on the agent's
    ``llm_provider``. Per-agent failures are logged and skipped so one bad
    agent cannot hide the totals from the rest.

    Returns:
        Dictionary with aggregated token counts, LLM call counts, and the
        names of agents that reported usage — or ``None`` when no agent
        reported any tokens. (The previous annotation claimed a plain dict,
        but the method has always returned None for zero usage; the
        annotation now matches the actual behavior.)
    """
    total_usage: Dict[str, Any] = {
        "total_tokens": 0,
        "prompt_tokens": 0,
        "completion_tokens": 0,
        "llm_calls": 0,
        "models_used": [],  # reserved for future use; currently never populated
        "agents_with_usage": []
    }

    for agent_name, agent in self.agents.items():
        try:
            # Method 1: the agent aggregates its own usage.
            if hasattr(agent, 'get_token_usage'):
                usage = agent.get_token_usage()
                if usage and isinstance(usage, dict):
                    tokens = usage.get("total_tokens", 0)
                    if tokens > 0:
                        total_usage["total_tokens"] += tokens
                        total_usage["prompt_tokens"] += usage.get("prompt_tokens", 0)
                        total_usage["completion_tokens"] += usage.get("completion_tokens", 0)
                        # Agents may report call counts under either key name.
                        total_usage["llm_calls"] += usage.get("total_calls", usage.get("llm_calls", 0))
                        total_usage["agents_with_usage"].append(agent_name)
                        logger.debug(f"Workflow: Agent '{agent_name}' used {tokens} tokens")

            # Method 2: fall back to the agent's LLM provider, if it tracks usage.
            elif hasattr(agent, 'llm_provider') and hasattr(agent.llm_provider, 'get_token_usage'):
                llm_usage = agent.llm_provider.get_token_usage()
                if llm_usage and isinstance(llm_usage, dict):
                    tokens = llm_usage.get("total_tokens", 0)
                    if tokens > 0:
                        total_usage["total_tokens"] += tokens
                        total_usage["prompt_tokens"] += llm_usage.get("prompt_tokens", 0)
                        total_usage["completion_tokens"] += llm_usage.get("completion_tokens", 0)
                        # Provider path doesn't expose a call count; counted as one call.
                        total_usage["llm_calls"] += 1
                        total_usage["agents_with_usage"].append(agent_name)
                        logger.debug(f"Workflow: Agent '{agent_name}' (via llm_provider) used {tokens} tokens")

        except Exception as e:
            # Best-effort aggregation: skip agents whose usage lookup fails.
            logger.warning(f"Failed to get token usage from agent '{agent_name}': {e}")

    if total_usage["total_tokens"] > 0:
        logger.info(f"Workflow '{self.name}' total token usage: {total_usage['total_tokens']} tokens across {len(total_usage['agents_with_usage'])} agents")
        return total_usage
    else:
        logger.debug(f"Workflow '{self.name}' has no token usage")
        return None
1023
+
1024
def health_check(self) -> Dict[str, Any]:
    """
    Comprehensive workflow health check.

    Inspects every agent for a ``process`` method (merging in the agent's own
    ``get_health()`` report when available), and flags excessive subscription
    or pending-message counts as potential leaks.

    Returns:
        Dictionary with overall status, per-agent details, and a list of
        issues found; ``healthy`` is False whenever any issue is present.
    """
    report: Dict[str, Any] = {
        'status': self.status.value,
        'healthy': self.status == WorkflowStatus.RUNNING,
        'agents': {},
        'issues': []
    }

    # Per-agent checks (with safe attribute access).
    for agent_name, agent in self.agents.items():
        agent_info = {
            'name': agent_name,
            'has_process': hasattr(agent, 'process'),
            'running': self.status == WorkflowStatus.RUNNING
        }

        # Merge the agent's own health report when it offers one (optional).
        if hasattr(agent, 'get_health'):
            try:
                agent_info.update(agent.get_health())
            except Exception as e:
                agent_info['health_error'] = str(e)

        report['agents'][agent_name] = agent_info

        if not agent_info['has_process']:
            report['issues'].append(f"Agent '{agent_name}' has no process method")
            report['healthy'] = False

    # Unbounded subscription growth is a memory-leak smell.
    subscription_count = len(self._subscriptions)
    report['subscription_count'] = subscription_count
    if subscription_count > 1000:
        report['issues'].append(f"High subscription count: {subscription_count}")
        report['healthy'] = False

    # With reliability on, a backlog of unacknowledged messages is unhealthy.
    if self._reliability_enabled:
        pending = self.get_pending_messages()
        report['pending_message_count'] = len(pending)
        if len(pending) > 100:
            report['issues'].append(f"High pending message count: {len(pending)}")
            report['healthy'] = False

    return report
1075
+
1076
+ # Context manager support
1077
async def __aenter__(self) -> "Workflow":
    """Enter the async context: start the workflow and yield it."""
    await self.start()
    return self
1081
+
1082
async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
    """Exit the async context: stop the workflow unconditionally."""
    await self.stop()