camel-ai 0.2.66__py3-none-any.whl → 0.2.68__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of camel-ai might be problematic.

Files changed (68)
  1. camel/__init__.py +1 -1
  2. camel/configs/__init__.py +3 -0
  3. camel/configs/qianfan_config.py +85 -0
  4. camel/environments/__init__.py +12 -0
  5. camel/environments/rlcards_env.py +860 -0
  6. camel/interpreters/docker/Dockerfile +2 -5
  7. camel/loaders/firecrawl_reader.py +4 -4
  8. camel/memories/blocks/vectordb_block.py +8 -1
  9. camel/memories/context_creators/score_based.py +123 -19
  10. camel/models/__init__.py +2 -0
  11. camel/models/aiml_model.py +8 -0
  12. camel/models/anthropic_model.py +122 -2
  13. camel/models/aws_bedrock_model.py +8 -0
  14. camel/models/azure_openai_model.py +14 -5
  15. camel/models/base_model.py +4 -0
  16. camel/models/cohere_model.py +9 -2
  17. camel/models/crynux_model.py +8 -0
  18. camel/models/deepseek_model.py +8 -0
  19. camel/models/gemini_model.py +8 -0
  20. camel/models/groq_model.py +8 -0
  21. camel/models/internlm_model.py +8 -0
  22. camel/models/litellm_model.py +5 -0
  23. camel/models/lmstudio_model.py +14 -1
  24. camel/models/mistral_model.py +15 -1
  25. camel/models/model_factory.py +6 -0
  26. camel/models/modelscope_model.py +8 -0
  27. camel/models/moonshot_model.py +8 -0
  28. camel/models/nemotron_model.py +17 -2
  29. camel/models/netmind_model.py +8 -0
  30. camel/models/novita_model.py +8 -0
  31. camel/models/nvidia_model.py +8 -0
  32. camel/models/ollama_model.py +8 -0
  33. camel/models/openai_compatible_model.py +23 -5
  34. camel/models/openai_model.py +21 -4
  35. camel/models/openrouter_model.py +8 -0
  36. camel/models/ppio_model.py +8 -0
  37. camel/models/qianfan_model.py +104 -0
  38. camel/models/qwen_model.py +8 -0
  39. camel/models/reka_model.py +18 -3
  40. camel/models/samba_model.py +17 -3
  41. camel/models/sglang_model.py +20 -5
  42. camel/models/siliconflow_model.py +8 -0
  43. camel/models/stub_model.py +8 -1
  44. camel/models/togetherai_model.py +8 -0
  45. camel/models/vllm_model.py +7 -0
  46. camel/models/volcano_model.py +14 -1
  47. camel/models/watsonx_model.py +4 -1
  48. camel/models/yi_model.py +8 -0
  49. camel/models/zhipuai_model.py +8 -0
  50. camel/societies/workforce/prompts.py +71 -22
  51. camel/societies/workforce/role_playing_worker.py +3 -8
  52. camel/societies/workforce/single_agent_worker.py +37 -9
  53. camel/societies/workforce/task_channel.py +25 -20
  54. camel/societies/workforce/utils.py +104 -14
  55. camel/societies/workforce/worker.py +98 -16
  56. camel/societies/workforce/workforce.py +1289 -101
  57. camel/societies/workforce/workforce_logger.py +613 -0
  58. camel/tasks/task.py +16 -5
  59. camel/toolkits/__init__.py +2 -0
  60. camel/toolkits/code_execution.py +1 -1
  61. camel/toolkits/playwright_mcp_toolkit.py +2 -1
  62. camel/toolkits/pptx_toolkit.py +4 -4
  63. camel/types/enums.py +32 -0
  64. camel/types/unified_model_type.py +5 -0
  65. {camel_ai-0.2.66.dist-info → camel_ai-0.2.68.dist-info}/METADATA +4 -3
  66. {camel_ai-0.2.66.dist-info → camel_ai-0.2.68.dist-info}/RECORD +68 -64
  67. {camel_ai-0.2.66.dist-info → camel_ai-0.2.68.dist-info}/WHEEL +0 -0
  68. {camel_ai-0.2.66.dist-info → camel_ai-0.2.68.dist-info}/licenses/LICENSE +0 -0
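
The largest change in this release is the rewrite of camel/societies/workforce/workforce.py shown below, which adds a synchronous process_task wrapper, pause/resume/snapshot support for human intervention, dependency-aware batch task assignment, optional shared memory, and a WorkforceLogger. As a rough orientation before reading the diff, the following sketch (not taken from the package; it only uses names that appear in the new code, and assumes models and worker agents are configured separately) shows how the new entry point and logging helpers might be called:

from camel.societies.workforce import Workforce
from camel.tasks import Task

# Assumes worker agents are added via add_single_agent_worker(...) first.
workforce = Workforce("Research Team")

task = Task(content="Research AI trends", id="1")
result = workforce.process_task(task)  # synchronous wrapper added in 0.2.68

# Introspection helpers backed by the new WorkforceLogger
print(workforce.get_workforce_status())
print(workforce.get_workforce_log_tree())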
@@ -15,9 +15,11 @@ from __future__ import annotations
 
 import asyncio
 import json
+import time
 import uuid
 from collections import deque
-from typing import Deque, Dict, List, Optional
+from enum import Enum
+from typing import Any, Coroutine, Deque, Dict, List, Optional
 
 from colorama import Fore
 
@@ -41,13 +43,56 @@ from camel.societies.workforce.utils import (
 )
 from camel.societies.workforce.worker import Worker
 from camel.tasks.task import Task, TaskState, validate_task_content
-from camel.toolkits import CodeExecutionToolkit, SearchToolkit, ThinkingToolkit
+from camel.toolkits import (
+    CodeExecutionToolkit,
+    SearchToolkit,
+    TaskPlanningToolkit,
+    ThinkingToolkit,
+)
 from camel.types import ModelPlatformType, ModelType
 from camel.utils import dependencies_required
 
+from .workforce_logger import WorkforceLogger
+
 logger = get_logger(__name__)
 
 
+class WorkforceState(Enum):
+    r"""Workforce execution state for human intervention support."""
+
+    IDLE = "idle"
+    RUNNING = "running"
+    PAUSED = "paused"
+    STOPPED = "stopped"
+
+
+class WorkforceSnapshot:
+    r"""Snapshot of workforce state for resuming execution."""
+
+    def __init__(
+        self,
+        main_task: Optional[Task] = None,
+        pending_tasks: Optional[Deque[Task]] = None,
+        completed_tasks: Optional[List[Task]] = None,
+        task_dependencies: Optional[Dict[str, List[str]]] = None,
+        assignees: Optional[Dict[str, str]] = None,
+        current_task_index: int = 0,
+        description: str = "",
+    ):
+        self.main_task = main_task
+        self.pending_tasks = pending_tasks.copy() if pending_tasks else deque()
+        self.completed_tasks = (
+            completed_tasks.copy() if completed_tasks else []
+        )
+        self.task_dependencies = (
+            task_dependencies.copy() if task_dependencies else {}
+        )
+        self.assignees = assignees.copy() if assignees else {}
+        self.current_task_index = current_task_index
+        self.description = description
+        self.timestamp = time.time()
+
+
 class Workforce(BaseNode):
     r"""A system where multiple worker nodes (agents) cooperate together
     to solve tasks. It can assign tasks to worker nodes and also take
@@ -90,21 +135,35 @@ class Workforce(BaseNode):
             for graceful shutdown when a task fails 3 times. During this
             period, the workforce remains active for debugging.
             Set to 0 for immediate shutdown. (default: :obj:`15.0`)
+        share_memory (bool, optional): Whether to enable shared memory across
+            SingleAgentWorker instances in the workforce. When enabled, all
+            SingleAgentWorker instances, coordinator agent, and task planning
+            agent will share their complete conversation history and
+            function-calling trajectory, providing better context for task
+            handoffs and continuity. Note: Currently only supports
+            SingleAgentWorker instances; RolePlayingWorker and nested
+            Workforce instances do not participate in memory sharing.
+            (default: :obj:`False`)
 
     Example:
-        >>> # Configure with custom model
+        >>> # Configure with custom model and shared memory
+        >>> import asyncio
        >>> model = ModelFactory.create(
        ...     ModelPlatformType.OPENAI, ModelType.GPT_4O
        ... )
        >>> workforce = Workforce(
        ...     "Research Team",
        ...     coordinator_agent_kwargs={"model": model, "token_limit": 4000},
-        ...     task_agent_kwargs={"model": model, "token_limit": 8000}
+        ...     task_agent_kwargs={"model": model, "token_limit": 8000},
+        ...     share_memory=True  # Enable shared memory
        ... )
        >>>
        >>> # Process a task
-        >>> task = Task(content="Research AI trends", id="1")
-        >>> result = workforce.process_task(task)
+        >>> async def main():
+        ...     task = Task(content="Research AI trends", id="1")
+        ...     result = workforce.process_task(task)
+        ...     return result
+        >>> asyncio.run(main())
     """
 
     def __init__(
@@ -115,12 +174,44 @@ class Workforce(BaseNode):
         task_agent_kwargs: Optional[Dict] = None,
         new_worker_agent_kwargs: Optional[Dict] = None,
         graceful_shutdown_timeout: float = 15.0,
+        share_memory: bool = False,
     ) -> None:
         super().__init__(description)
         self._child_listening_tasks: Deque[asyncio.Task] = deque()
         self._children = children or []
         self.new_worker_agent_kwargs = new_worker_agent_kwargs
         self.graceful_shutdown_timeout = graceful_shutdown_timeout
+        self.share_memory = share_memory
+        self.metrics_logger = WorkforceLogger(workforce_id=self.node_id)
+        self._task: Optional[Task] = None
+        self._pending_tasks: Deque[Task] = deque()
+        self._task_dependencies: Dict[str, List[str]] = {}
+        self._assignees: Dict[str, str] = {}
+        self._in_flight_tasks: int = 0
+        # Dictionary to track task start times
+        self._task_start_times: Dict[str, float] = {}
+        # Human intervention support
+        self._state = WorkforceState.IDLE
+        self._pause_event = asyncio.Event()
+        self._pause_event.set()  # Initially not paused
+        self._stop_requested = False
+        self._snapshots: List[WorkforceSnapshot] = []
+        self._completed_tasks: List[Task] = []
+        self._loop: Optional[asyncio.AbstractEventLoop] = None
+        self._main_task_future: Optional[asyncio.Future] = None
+        # Snapshot throttle support
+        self._last_snapshot_time: float = 0.0
+        # Minimum seconds between automatic snapshots
+        self.snapshot_interval: float = 30.0
+        if self.metrics_logger:
+            for child in self._children:
+                worker_type = type(child).__name__
+                role_or_desc = child.description
+                self.metrics_logger.log_worker_created(
+                    worker_id=child.node_id,
+                    worker_type=worker_type,
+                    role=role_or_desc,
+                )
 
         # Warning messages for default model usage
         if coordinator_agent_kwargs is None:
@@ -154,6 +245,13 @@ class Workforce(BaseNode):
                 "available options."
             )
 
+        if self.share_memory:
+            logger.info(
+                "Shared memory enabled. All agents will share their complete "
+                "conversation history and function-calling trajectory for "
+                "better context continuity during task handoffs."
+            )
+
         coord_agent_sys_msg = BaseMessage.make_assistant_message(
             role_name="Workforce Manager",
             content="You are coordinating a group of workers. A worker can be "
@@ -163,21 +261,160 @@ class Workforce(BaseNode):
             "a new worker for a task, etc.",
         )
         self.coordinator_agent = ChatAgent(
-            coord_agent_sys_msg, **(coordinator_agent_kwargs or {})
+            coord_agent_sys_msg,
+            **(coordinator_agent_kwargs or {}),
         )
 
         task_sys_msg = BaseMessage.make_assistant_message(
             role_name="Task Planner",
-            content="You are going to compose and decompose tasks.",
+            content="You are going to compose and decompose tasks. Keep "
+            "tasks that are sequential and require the same type of "
+            "agent together in one agent process. Only decompose tasks "
+            "that can be handled in parallel and require different types "
+            "of agents. This ensures efficient execution by minimizing "
+            "context switching between agents.",
         )
-        self.task_agent = ChatAgent(task_sys_msg, **(task_agent_kwargs or {}))
-
-        # If there is one, will set by the workforce class wrapping this
-        self._task: Optional[Task] = None
-        self._pending_tasks: Deque[Task] = deque()
+        _task_agent_kwargs = dict(task_agent_kwargs or {})
+        extra_tools = TaskPlanningToolkit().get_tools()
+        _task_agent_kwargs["tools"] = [
+            *_task_agent_kwargs.get("tools", []),
+            *extra_tools,
+        ]
+        self.task_agent = ChatAgent(task_sys_msg, **_task_agent_kwargs)
 
     def __repr__(self):
-        return f"Workforce {self.node_id} ({self.description})"
+        return (
+            f"Workforce {self.node_id} ({self.description}) - "
+            f"State: {self._state.value}"
+        )
+
+    def _collect_shared_memory(self) -> Dict[str, List]:
+        r"""Collect memory from all SingleAgentWorker instances for sharing.
+
+        Returns:
+            Dict[str, List]: A dictionary mapping agent types to their memory
+                records. Contains entries for 'coordinator', 'task_agent',
+                and 'workers'.
+        """
+        # TODO: add memory collection for RolePlayingWorker and nested
+        # Workforce instances
+        if not self.share_memory:
+            return {}
+
+        shared_memory: Dict[str, List] = {
+            'coordinator': [],
+            'task_agent': [],
+            'workers': [],
+        }
+
+        try:
+            # Collect coordinator agent memory
+            coord_records = self.coordinator_agent.memory.retrieve()
+            shared_memory['coordinator'] = [
+                record.memory_record.to_dict() for record in coord_records
+            ]
+
+            # Collect task agent memory
+            task_records = self.task_agent.memory.retrieve()
+            shared_memory['task_agent'] = [
+                record.memory_record.to_dict() for record in task_records
+            ]
+
+            # Collect worker memory only from SingleAgentWorker instances
+            for child in self._children:
+                if isinstance(child, SingleAgentWorker):
+                    worker_records = child.worker.memory.retrieve()
+                    worker_memory = [
+                        record.memory_record.to_dict()
+                        for record in worker_records
+                    ]
+                    shared_memory['workers'].extend(worker_memory)
+
+        except Exception as e:
+            logger.warning(f"Error collecting shared memory: {e}")
+
+        return shared_memory
+
+    def _share_memory_with_agents(
+        self, shared_memory: Dict[str, List]
+    ) -> None:
+        r"""Share collected memory with coordinator, task agent, and
+        SingleAgentWorker instances.
+
+        Args:
+            shared_memory (Dict[str, List]): Memory records collected from
+                all agents to be shared.
+        """
+        if not self.share_memory or not shared_memory:
+            return
+
+        try:
+            # Create a consolidated memory from all collected records
+            all_records = []
+            for _memory_type, records in shared_memory.items():
+                all_records.extend(records)
+
+            if not all_records:
+                return
+
+            # Import necessary classes for memory record reconstruction
+            from camel.memories.records import MemoryRecord
+
+            # Create consolidated memory objects from records
+            memory_records: List[MemoryRecord] = []
+            for record_dict in all_records:
+                try:
+                    memory_record = MemoryRecord.from_dict(record_dict)
+                    memory_records.append(memory_record)
+                except Exception as e:
+                    logger.warning(f"Failed to reconstruct memory record: {e}")
+                    continue
+
+            if not memory_records:
+                return
+
+            # Share with coordinator agent
+            for record in memory_records:
+                # Only add records from other agents to avoid duplication
+                if record.agent_id != self.coordinator_agent.agent_id:
+                    self.coordinator_agent.memory.write_record(record)
+
+            # Share with task agent
+            for record in memory_records:
+                if record.agent_id != self.task_agent.agent_id:
+                    self.task_agent.memory.write_record(record)
+
+            # Share with SingleAgentWorker instances only
+            single_agent_workers = [
+                child
+                for child in self._children
+                if isinstance(child, SingleAgentWorker)
+            ]
+
+            for worker in single_agent_workers:
+                for record in memory_records:
+                    if record.agent_id != worker.worker.agent_id:
+                        worker.worker.memory.write_record(record)
+
+            logger.info(
+                f"Shared {len(memory_records)} memory records across "
+                f"{len(single_agent_workers) + 2} agents in workforce "
+                f"{self.node_id}"
+            )
+
+        except Exception as e:
+            logger.warning(f"Error sharing memory with agents: {e}")
+
+    def _sync_shared_memory(self) -> None:
+        r"""Synchronize memory across all agents by collecting and sharing."""
+        if not self.share_memory:
+            return
+
+        try:
+            shared_memory = self._collect_shared_memory()
+            self._share_memory_with_agents(shared_memory)
+        except Exception as e:
+            logger.warning(f"Error synchronizing shared memory: {e}")
 
     def _decompose_task(self, task: Task) -> List[Task]:
         r"""Decompose the task into subtasks. This method will also set the
@@ -199,18 +436,313 @@ class Workforce(BaseNode):
 
         return subtasks
 
+    # Human intervention methods
+    async def _async_pause(self) -> None:
+        r"""Async implementation of pause to run on the event loop."""
+        if self._state == WorkforceState.RUNNING:
+            self._state = WorkforceState.PAUSED
+            self._pause_event.clear()
+            logger.info(f"Workforce {self.node_id} paused.")
+
+    def pause(self) -> None:
+        r"""Pause the workforce execution.
+        If the internal event-loop is already running we schedule the
+        asynchronous pause coroutine onto it. When the loop has not yet
+        been created (e.g. the caller presses the hot-key immediately after
+        workforce start-up) we fall back to a synchronous state change so
+        that no tasks will be scheduled until the loop is ready.
+        """
+
+        if self._loop and not self._loop.is_closed():
+            self._submit_coro_to_loop(self._async_pause())
+        else:
+            # Loop not yet created, just mark state so when loop starts it
+            # will proceed.
+            if self._state == WorkforceState.RUNNING:
+                self._state = WorkforceState.PAUSED
+                self._pause_event.clear()
+                logger.info(
+                    f"Workforce {self.node_id} paused "
+                    f"(event-loop not yet started)."
+                )
+
+    async def _async_resume(self) -> None:
+        r"""Async implementation of resume to run on the event loop."""
+        if self._state == WorkforceState.PAUSED:
+            self._state = WorkforceState.RUNNING
+            self._pause_event.set()
+            logger.info(f"Workforce {self.node_id} resumed.")
+
+            # Re-post ready tasks (if any)
+            if self._pending_tasks:
+                await self._post_ready_tasks()
+
+    def resume(self) -> None:
+        r"""Resume execution after a manual pause."""
+
+        if self._loop and not self._loop.is_closed():
+            self._submit_coro_to_loop(self._async_resume())
+        else:
+            # Loop not running yet, just mark state so when loop starts it
+            # will proceed.
+            if self._state == WorkforceState.PAUSED:
+                self._state = WorkforceState.RUNNING
+                self._pause_event.set()
+                logger.info(
+                    f"Workforce {self.node_id} resumed "
+                    f"(event-loop not yet started)."
+                )
+
+    async def _async_stop_gracefully(self) -> None:
+        r"""Async implementation of stop_gracefully to run on the event
+        loop.
+        """
+        self._stop_requested = True
+        if self._pause_event.is_set() is False:
+            self._pause_event.set()  # Resume if paused to process stop
+        logger.info(f"Workforce {self.node_id} stop requested.")
+
+    def stop_gracefully(self) -> None:
+        r"""Request workforce to finish current in-flight work then halt.
+
+        Works both when the internal event-loop is alive and when it has not
+        yet been started. In the latter case we simply mark the stop flag so
+        that the loop (when it eventually starts) will exit immediately after
+        initialisation.
+        """
+
+        if self._loop and not self._loop.is_closed():
+            self._submit_coro_to_loop(self._async_stop_gracefully())
+        else:
+            # Loop not yet created, set the flag synchronously so later
+            # startup will respect it.
+            self._stop_requested = True
+            # Ensure any pending pause is released so that when the loop does
+            # start it can see the stop request and exit.
+            self._pause_event.set()
+            logger.info(
+                f"Workforce {self.node_id} stop requested "
+                f"(event-loop not yet started)."
+            )
+
+    def save_snapshot(self, description: str = "") -> None:
+        r"""Save current state as a snapshot."""
+        snapshot = WorkforceSnapshot(
+            main_task=self._task,
+            pending_tasks=self._pending_tasks,
+            completed_tasks=self._completed_tasks,
+            task_dependencies=self._task_dependencies,
+            assignees=self._assignees,
+            current_task_index=len(self._completed_tasks),
+            description=description or f"Snapshot at {time.time()}",
+        )
+        self._snapshots.append(snapshot)
+        logger.info(f"Snapshot saved: {description}")
+
+    def list_snapshots(self) -> List[str]:
+        r"""List all available snapshots."""
+        snapshots_info = []
+        for i, snapshot in enumerate(self._snapshots):
+            desc_part = (
+                f" - {snapshot.description}" if snapshot.description else ""
+            )
+            info = (
+                f"Snapshot {i}: {len(snapshot.completed_tasks)} completed, "
+                f"{len(snapshot.pending_tasks)} pending{desc_part}"
+            )
+            snapshots_info.append(info)
+        return snapshots_info
+
+    def get_pending_tasks(self) -> List[Task]:
+        r"""Get current pending tasks for human review."""
+        return list(self._pending_tasks)
+
+    def get_completed_tasks(self) -> List[Task]:
+        r"""Get completed tasks."""
+        return self._completed_tasks.copy()
+
+    def modify_task_content(self, task_id: str, new_content: str) -> bool:
+        r"""Modify the content of a pending task."""
+        # Validate the new content first
+        if not validate_task_content(new_content, task_id):
+            logger.warning(
+                f"Task {task_id} content modification rejected: "
+                f"Invalid content. Content preview: '{new_content[:50]}...'"
+            )
+            return False
+
+        for task in self._pending_tasks:
+            if task.id == task_id:
+                task.content = new_content
+                logger.info(f"Task {task_id} content modified.")
+                return True
+        logger.warning(f"Task {task_id} not found in pending tasks.")
+        return False
+
+    def add_task(
+        self,
+        content: str,
+        task_id: Optional[str] = None,
+        additional_info: Optional[Dict[str, Any]] = None,
+        insert_position: int = -1,
+    ) -> Task:
+        r"""Add a new task to the pending queue."""
+        new_task = Task(
+            content=content,
+            id=task_id or f"human_added_{len(self._pending_tasks)}",
+            additional_info=additional_info,
+        )
+        if insert_position == -1:
+            self._pending_tasks.append(new_task)
+        else:
+            # Convert deque to list, insert, then back to deque
+            tasks_list = list(self._pending_tasks)
+            tasks_list.insert(insert_position, new_task)
+            self._pending_tasks = deque(tasks_list)
+
+        logger.info(f"New task added: {new_task.id}")
+        return new_task
+
+    def remove_task(self, task_id: str) -> bool:
+        r"""Remove a task from the pending queue."""
+        # Convert to list to find and remove
+        tasks_list = list(self._pending_tasks)
+        for i, task in enumerate(tasks_list):
+            if task.id == task_id:
+                tasks_list.pop(i)
+                self._pending_tasks = deque(tasks_list)
+                logger.info(f"Task {task_id} removed.")
+                return True
+        logger.warning(f"Task {task_id} not found in pending tasks.")
+        return False
+
+    def reorder_tasks(self, task_ids: List[str]) -> bool:
+        r"""Reorder pending tasks according to the provided task IDs list."""
+        # Create a mapping of task_id to task
+        tasks_dict = {task.id: task for task in self._pending_tasks}
+
+        # Check if all provided IDs exist
+        if not all(task_id in tasks_dict for task_id in task_ids):
+            logger.warning("Some task IDs not found in pending tasks.")
+            return False
+
+        # Check if we have the same number of tasks
+        if len(task_ids) != len(self._pending_tasks):
+            logger.warning(
+                "Number of task IDs doesn't match pending tasks count."
+            )
+            return False
+
+        # Reorder tasks
+        reordered_tasks = deque([tasks_dict[task_id] for task_id in task_ids])
+        self._pending_tasks = reordered_tasks
+
+        logger.info("Tasks reordered successfully.")
+        return True
+
+    def resume_from_task(self, task_id: str) -> bool:
+        r"""Resume execution from a specific task."""
+        if self._state != WorkforceState.PAUSED:
+            logger.warning(
+                "Workforce must be paused to resume from specific task."
+            )
+            return False
+
+        # Find the task in pending tasks
+        tasks_list = list(self._pending_tasks)
+        target_index = -1
+
+        for i, task in enumerate(tasks_list):
+            if task.id == task_id:
+                target_index = i
+                break
+
+        if target_index == -1:
+            logger.warning(f"Task {task_id} not found in pending tasks.")
+            return False
+
+        # Move completed tasks that come after the target task back to pending
+        tasks_to_move_back = tasks_list[:target_index]
+        remaining_tasks = tasks_list[target_index:]
+
+        # Update pending tasks to start from the target task
+        self._pending_tasks = deque(remaining_tasks)
+
+        # Move previously "completed" tasks that are after target back to
+        # pending and reset their state
+        if tasks_to_move_back:
+            # Reset state for tasks being moved back to pending
+            for task in tasks_to_move_back:
+                # Handle all possible task states
+                if task.state in [TaskState.DONE, TaskState.FAILED]:
+                    task.state = TaskState.OPEN
+                # Clear result to avoid confusion
+                task.result = None
+                # Reset failure count to give task a fresh start
+                task.failure_count = 0
+
+            logger.info(
+                f"Moving {len(tasks_to_move_back)} tasks back to pending "
+                f"state."
+            )
+
+        logger.info(f"Ready to resume from task: {task_id}")
+        return True
+
+    def restore_from_snapshot(self, snapshot_index: int) -> bool:
+        r"""Restore workforce state from a snapshot."""
+        if not (0 <= snapshot_index < len(self._snapshots)):
+            logger.warning(f"Invalid snapshot index: {snapshot_index}")
+            return False
+
+        if self._state == WorkforceState.RUNNING:
+            logger.warning(
+                "Cannot restore snapshot while workforce is running. "
+                "Pause first."
+            )
+            return False
+
+        snapshot = self._snapshots[snapshot_index]
+        self._task = snapshot.main_task
+        self._pending_tasks = snapshot.pending_tasks.copy()
+        self._completed_tasks = snapshot.completed_tasks.copy()
+        self._task_dependencies = snapshot.task_dependencies.copy()
+        self._assignees = snapshot.assignees.copy()
+
+        logger.info(f"Workforce state restored from snapshot {snapshot_index}")
+        return True
+
+    def get_workforce_status(self) -> Dict:
+        r"""Get current workforce status for human review."""
+        return {
+            "state": self._state.value,
+            "pending_tasks_count": len(self._pending_tasks),
+            "completed_tasks_count": len(self._completed_tasks),
+            "snapshots_count": len(self._snapshots),
+            "children_count": len(self._children),
+            "main_task_id": self._task.id if self._task else None,
+        }
+
     @check_if_running(False)
-    def process_task(self, task: Task) -> Task:
-        r"""The main entry point for the workforce to process a task. It will
-        start the workforce and all the child nodes under it, process the
-        task provided and return the updated task.
+    async def process_task_async(
+        self, task: Task, interactive: bool = False
+    ) -> Task:
+        r"""Main entry point to process a task asynchronously.
 
         Args:
             task (Task): The task to be processed.
+            interactive (bool, optional): If True, enables human-intervention
+                workflow (pause/resume/snapshot). Defaults to False, which
+                runs the task in a blocking one-shot manner.
 
         Returns:
             Task: The updated task.
         """
+        # Delegate to intervention pipeline when requested to keep
+        # backward-compat.
+        if interactive:
+            return await self._process_task_with_snapshot(task)
+
         if not validate_task_content(task.content, task.id):
             task.state = TaskState.FAILED
             task.result = "Task failed: Invalid or empty content provided"
@@ -222,33 +754,273 @@ class Workforce(BaseNode):
 
         self.reset()
         self._task = task
+        if self.metrics_logger:
+            self.metrics_logger.log_task_created(
+                task_id=task.id,
+                description=task.content,
+                task_type=task.type,
+                metadata=task.additional_info,
+            )
         task.state = TaskState.FAILED
-        self._pending_tasks.append(task)
         # The agent tend to be overconfident on the whole task, so we
         # decompose the task into subtasks first
         subtasks = self._decompose_task(task)
+        if self.metrics_logger and subtasks:
+            self.metrics_logger.log_task_decomposed(
+                parent_task_id=task.id, subtask_ids=[st.id for st in subtasks]
+            )
+            for subtask in subtasks:
+                self.metrics_logger.log_task_created(
+                    task_id=subtask.id,
+                    description=subtask.content,
+                    parent_task_id=task.id,
+                    task_type=subtask.type,
+                    metadata=subtask.additional_info,
+                )
+        if subtasks:
+            # If decomposition happened, the original task becomes a container.
+            # We only execute its subtasks.
+            self._pending_tasks.extendleft(reversed(subtasks))
+        else:
+            # If no decomposition, execute the original task.
+            self._pending_tasks.append(task)
+
+        self.set_channel(TaskChannel())
+
+        await self.start()
+
+        if subtasks:
+            task.result = "\n\n".join(
+                f"--- Subtask {sub.id} Result ---\n{sub.result}"
+                for sub in task.subtasks
+                if sub.result
+            )
+            if task.subtasks and all(
+                sub.state == TaskState.DONE for sub in task.subtasks
+            ):
+                task.state = TaskState.DONE
+            else:
+                task.state = TaskState.FAILED
+
+        return task
+
+    def process_task(self, task: Task) -> Task:
+        r"""Synchronous wrapper for process_task that handles async operations
+        internally.
+
+        Args:
+            task (Task): The task to be processed.
+
+        Returns:
+            Task: The updated task.
+
+        Example:
+            >>> workforce = Workforce("My Team")
+            >>> task = Task(content="Analyze data", id="1")
+            >>> result = workforce.process_task(task)  # No async/await
+            needed
+            >>> print(result.result)
+        """
+        import asyncio
+        import concurrent.futures
+
+        # Check if we're already in an event loop
+        try:
+            current_loop = asyncio.get_running_loop()
+            # Store the current loop for potential reuse by async tools
+            self._loop = current_loop
+
+            logger.info(
+                "Running in active event loop context. "
+                "Consider using process_task_async() directly for better "
+                "async tool compatibility."
+            )
+
+            # Create a new thread with a fresh event loop
+            def run_in_thread():
+                # Create new event loop for this thread
+                new_loop = asyncio.new_event_loop()
+                asyncio.set_event_loop(new_loop)
+                try:
+                    return new_loop.run_until_complete(
+                        self.process_task_async(task)
+                    )
+                finally:
+                    new_loop.close()
+                    # Restore original loop reference
+                    self._loop = current_loop
+
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future = executor.submit(run_in_thread)
+                return future.result()
+
+        except RuntimeError:
+            # No event loop running, we can create one
+            return asyncio.run(self.process_task_async(task))
+
+    async def _process_task_with_snapshot(self, task: Task) -> Task:
+        r"""Async version of process_task that supports human intervention.
+        This method can be paused, resumed, and allows task modification.
+
+        Args:
+            task (Task): The task to be processed.
+
+        Returns:
+            Task: The updated task.
+        """
+
+        if not validate_task_content(task.content, task.id):
+            task.state = TaskState.FAILED
+            task.result = "Task failed: Invalid or empty content provided"
+            logger.warning(
+                f"Task {task.id} rejected: Invalid or empty content. "
+                f"Content preview: '{task.content[:50]}...'"
+            )
+            return task
+
+        self.reset()
+        self._task = task
+        self._state = WorkforceState.RUNNING
+        task.state = TaskState.OPEN
+        self._pending_tasks.append(task)
+
+        # Decompose the task into subtasks first
+        subtasks = self._decompose_task(task)
         self._pending_tasks.extendleft(reversed(subtasks))
         self.set_channel(TaskChannel())
 
-        asyncio.run(self.start())
+        # Save initial snapshot
+        self.save_snapshot("Initial task decomposition")
+
+        try:
+            await self.start()
+        except Exception as e:
+            logger.error(f"Error in workforce execution: {e}")
+            self._state = WorkforceState.STOPPED
+            raise
+        finally:
+            if self._state != WorkforceState.STOPPED:
+                self._state = WorkforceState.IDLE
 
         return task
 
+    def _process_task_with_intervention(self, task: Task) -> Task:
+        r"""Process task with human intervention support. This creates and
+        manages its own event loop to allow for pausing/resuming functionality.
+
+        Args:
+            task (Task): The task to be processed.
+
+        Returns:
+            Task: The updated task.
+        """
+        # Create new event loop if none exists or if we need a fresh one
+        try:
+            self._loop = asyncio.get_event_loop()
+            if self._loop.is_closed():
+                self._loop = asyncio.new_event_loop()
+                asyncio.set_event_loop(self._loop)
+        except RuntimeError:
+            self._loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(self._loop)
+
+        try:
+            return self._loop.run_until_complete(
+                self._process_task_with_snapshot(task)
+            )
+        finally:
+            # Decide whether to keep or close the loop
+            if self._loop and not self._loop.is_closed():
+                if self._state == WorkforceState.PAUSED:
+                    # Keep alive to support resume()
+                    logger.info(
+                        "Event loop kept alive for potential resume "
+                        "operations."
+                    )
+                else:
+                    # No more tasks; shut everything down cleanly
+                    try:
+                        # Ensure all async generators are finished
+                        self._loop.run_until_complete(
+                            self._loop.shutdown_asyncgens()
+                        )
+                    except RuntimeError:
+                        # Loop already running elsewhere
+                        pass
+                    self._loop.close()
+
+    def continue_from_pause(self) -> Optional[Task]:
+        r"""Continue execution from a paused state. This reuses the
+        existing event loop.
+
+        Returns:
+            Optional[Task]: The completed task if execution finishes, None if
+                still running/paused.
+        """
+        if self._state != WorkforceState.PAUSED:
+            logger.warning("Workforce is not in paused state.")
+            return None
+
+        if self._loop is None or self._loop.is_closed():
+            logger.error("No active event loop available for resuming.")
+            return None
+
+        # Resume execution
+        self.resume()
+
+        try:
+            # Continue the existing async task
+            remaining_task = self._loop.run_until_complete(
+                self._continue_execution()
+            )
+            return remaining_task
+        except Exception as e:
+            logger.error(f"Error continuing execution: {e}")
+            self._state = WorkforceState.STOPPED
+            return None
+
+    async def _continue_execution(self) -> Optional[Task]:
+        r"""Internal method to continue execution after pause."""
+        try:
+            await self._listen_to_channel()
+        except Exception as e:
+            logger.error(f"Error in continued execution: {e}")
+            self._state = WorkforceState.STOPPED
+            raise
+        finally:
+            if self._state != WorkforceState.STOPPED:
+                self._state = WorkforceState.IDLE
+
+        return self._task
+
     @check_if_running(False)
     def add_single_agent_worker(
-        self, description: str, worker: ChatAgent
+        self,
+        description: str,
+        worker: ChatAgent,
+        max_concurrent_tasks: int = 10,
     ) -> Workforce:
         r"""Add a worker node to the workforce that uses a single agent.
 
         Args:
             description (str): Description of the worker node.
             worker (ChatAgent): The agent to be added.
+            max_concurrent_tasks (int): Maximum number of tasks this worker can
+                process concurrently. (default: :obj:`10`)
 
         Returns:
             Workforce: The workforce node itself.
         """
-        worker_node = SingleAgentWorker(description, worker)
+        worker_node = SingleAgentWorker(
+            description, worker, max_concurrent_tasks
+        )
         self._children.append(worker_node)
+        if self.metrics_logger:
+            self.metrics_logger.log_worker_created(
+                worker_id=worker_node.node_id,
+                worker_type='SingleAgentWorker',
+                role=worker_node.description,
+            )
         return self
 
     @check_if_running(False)
@@ -293,6 +1065,12 @@ class Workforce(BaseNode):
             chat_turn_limit=chat_turn_limit,
         )
         self._children.append(worker_node)
+        if self.metrics_logger:
+            self.metrics_logger.log_worker_created(
+                worker_id=worker_node.node_id,
+                worker_type='RolePlayingWorker',
+                role=worker_node.description,
+            )
         return self
 
     @check_if_running(False)
@@ -308,19 +1086,50 @@ class Workforce(BaseNode):
         self._children.append(workforce)
         return self
 
+    async def _async_reset(self) -> None:
+        r"""Async implementation of reset to run on the event loop."""
+        self._pause_event.set()
+
     @check_if_running(False)
     def reset(self) -> None:
         r"""Reset the workforce and all the child nodes under it. Can only
-        be called when the workforce is not running."""
+        be called when the workforce is not running.
+        """
         super().reset()
         self._task = None
         self._pending_tasks.clear()
         self._child_listening_tasks.clear()
+        # Clear dependency tracking
+        self._task_dependencies.clear()
+        self._completed_tasks = []
+        self._assignees.clear()
+        self._in_flight_tasks = 0
         self.coordinator_agent.reset()
         self.task_agent.reset()
+        self._task_start_times.clear()
         for child in self._children:
             child.reset()
 
+        # Reset intervention state
+        self._state = WorkforceState.IDLE
+        self._stop_requested = False
+        # Handle asyncio.Event in a thread-safe way
+        if self._loop and not self._loop.is_closed():
+            # If we have a loop, use it to set the event safely
+            asyncio.run_coroutine_threadsafe(
+                self._async_reset(), self._loop
+            ).result()
+        else:
+            try:
+                self._reset_task = asyncio.create_task(self._async_reset())
+            except RuntimeError:
+                asyncio.run(self._async_reset())
+
+        if hasattr(self, 'logger') and self.metrics_logger is not None:
+            self.metrics_logger.reset_task_data()
+        else:
+            self.metrics_logger = WorkforceLogger(workforce_id=self.node_id)
+
     @check_if_running(False)
     def set_channel(self, channel: TaskChannel) -> None:
         r"""Set the channel for the node and all the child nodes under it."""
@@ -350,21 +1159,36 @@ class Workforce(BaseNode):
 
     def _find_assignee(
         self,
-        task: Task,
-    ) -> str:
-        r"""Assigns a task to a worker node with the best capability.
+        tasks: List[Task],
+    ) -> TaskAssignResult:
+        r"""Assigns multiple tasks to worker nodes with the best capabilities.
 
         Parameters:
-            task (Task): The task to be assigned.
+            tasks (List[Task]): The tasks to be assigned.
 
         Returns:
-            str: ID of the worker node to be assigned.
+            TaskAssignResult: Assignment result containing task assignments
+                with their dependencies.
         """
         self.coordinator_agent.reset()
+
+        # Format tasks information for the prompt
+        tasks_info = ""
+        for task in tasks:
+            tasks_info += f"Task ID: {task.id}\n"
+            tasks_info += f"Content: {task.content}\n"
+            if task.additional_info:
+                tasks_info += f"Additional Info: {task.additional_info}\n"
+            tasks_info += "---\n"
+
         prompt = ASSIGN_TASK_PROMPT.format(
-            content=task.content,
+            tasks_info=tasks_info,
             child_nodes_info=self._get_child_nodes_info(),
-            additional_info=task.additional_info,
+        )
+
+        logger.debug(
+            f"Sending batch assignment request to coordinator "
+            f"for {len(tasks)} tasks."
         )
 
         response = self.coordinator_agent.step(
@@ -372,9 +1196,17 @@ class Workforce(BaseNode):
         )
         result_dict = json.loads(response.msg.content, parse_int=str)
         task_assign_result = TaskAssignResult(**result_dict)
-        return task_assign_result.assignee_id
+        return task_assign_result
 
     async def _post_task(self, task: Task, assignee_id: str) -> None:
+        # Record the start time when a task is posted
+        self._task_start_times[task.id] = time.time()
+
+        if self.metrics_logger:
+            self.metrics_logger.log_task_started(
+                task_id=task.id, worker_id=assignee_id
+            )
+        self._in_flight_tasks += 1
         await self._channel.post_task(task, self.node_id, assignee_id)
 
     async def _post_dependency(self, dependency: Task) -> None:
@@ -410,12 +1242,20 @@ class Workforce(BaseNode):
         new_node = SingleAgentWorker(
             description=new_node_conf.description,
             worker=new_agent,
+            max_concurrent_tasks=10,  # TODO: make this configurable
         )
         new_node.set_channel(self._channel)
 
         print(f"{Fore.CYAN}{new_node} created.{Fore.RESET}")
 
         self._children.append(new_node)
+        if self.metrics_logger:
+            self.metrics_logger.log_worker_created(
+                worker_id=new_node.node_id,
+                worker_type='SingleAgentWorker',
+                role=new_node_conf.role,
+                metadata={'description': new_node_conf.description},
+            )
         self._child_listening_tasks.append(
             asyncio.create_task(new_node.start())
         )
@@ -447,62 +1287,287 @@ class Workforce(BaseNode):
 
     async def _get_returned_task(self) -> Task:
         r"""Get the task that's published by this node and just get returned
-        from the assignee.
+        from the assignee. Includes timeout handling to prevent indefinite
+        waiting.
         """
-        return await self._channel.get_returned_task_by_publisher(self.node_id)
+        try:
+            # Add timeout to prevent indefinite waiting
+            return await asyncio.wait_for(
+                self._channel.get_returned_task_by_publisher(self.node_id),
+                timeout=180.0,  # 3 minute timeout
+            )
+        except asyncio.TimeoutError:
+            logger.warning(
+                f"Timeout waiting for returned task in "
+                f"workforce {self.node_id}. "
+                f"This may indicate an issue with async tool execution. "
+                f"Current pending tasks: {len(self._pending_tasks)}, "
+                f"In-flight tasks: {self._in_flight_tasks}"
+            )
+            raise
 
     async def _post_ready_tasks(self) -> None:
-        r"""Send all the pending tasks that have all the dependencies met to
-        the channel, or directly return if there is none. For now, we will
-        directly send the first task in the pending list because all the tasks
-        are linearly dependent."""
+        r"""Checks for unassigned tasks, assigns them, and then posts any
+        tasks whose dependencies have been met."""
+
+        # Step 1: Identify and assign any new tasks in the pending queue
+        tasks_to_assign = [
+            task
+            for task in self._pending_tasks
+            if task.id not in self._task_dependencies
+        ]
+        if tasks_to_assign:
+            logger.debug(
+                f"Found {len(tasks_to_assign)} new tasks. "
+                f"Requesting assignment..."
+            )
+            batch_result = self._find_assignee(tasks_to_assign)
+            logger.debug(
+                f"Coordinator returned assignments:\n"
+                f"{json.dumps(batch_result.dict(), indent=2)}"
+            )
+            for assignment in batch_result.assignments:
+                self._task_dependencies[assignment.task_id] = (
+                    assignment.dependencies
+                )
+                self._assignees[assignment.task_id] = assignment.assignee_id
+                if self.metrics_logger:
+                    # queue_time_seconds can be derived by logger if task
+                    # creation time is logged
+                    self.metrics_logger.log_task_assigned(
+                        task_id=assignment.task_id,
+                        worker_id=assignment.assignee_id,
+                        dependencies=assignment.dependencies,
+                        queue_time_seconds=None,
+                    )
+
+        # Step 2: Iterate through all pending tasks and post those that are
+        # ready
+        posted_tasks = []
+        for task in self._pending_tasks:
+            # A task must be assigned to be considered for posting
+            if task.id in self._task_dependencies:
+                dependencies = self._task_dependencies[task.id]
+                # Check if all dependencies for this task are in the completed
+                # set
+                if all(
+                    dep_id in {t.id for t in self._completed_tasks}
+                    for dep_id in dependencies
+                ):
+                    assignee_id = self._assignees[task.id]
+                    logger.debug(
+                        f"Posting task {task.id} to assignee {assignee_id}. "
+                        f"Dependencies met."
+                    )
+                    await self._post_task(task, assignee_id)
+                    posted_tasks.append(task)
+
+        # Step 3: Remove the posted tasks from the pending list
+        for task in posted_tasks:
+            try:
+                self._pending_tasks.remove(task)
+            except ValueError:
+                # Task might have been removed by another process, which is
+                # fine
+                pass
 
-        if not self._pending_tasks:
-            return
+    async def _handle_failed_task(self, task: Task) -> bool:
+        task.failure_count += 1
 
-        ready_task = self._pending_tasks[0]
-
-        # If the task has failed previously, just compose and send the task
-        # to the channel as a dependency
-        if ready_task.state == TaskState.FAILED:
-            # TODO: the composing of tasks seems not work very well
-            self.task_agent.reset()
-            ready_task.compose(self.task_agent)
-            # Remove the subtasks from the channel
-            for subtask in ready_task.subtasks:
-                await self._channel.remove_task(subtask.id)
-            # Send the task to the channel as a dependency
-            await self._post_dependency(ready_task)
-            self._pending_tasks.popleft()
-            # Try to send the next task in the pending list
-            await self._post_ready_tasks()
-        else:
-            # Directly post the task to the channel if it's a new one
-            # Find a node to assign the task
-            assignee_id = self._find_assignee(task=ready_task)
-            await self._post_task(ready_task, assignee_id)
+        if self.metrics_logger:
+            worker_id = self._assignees.get(task.id)
+            self.metrics_logger.log_task_failed(
+                task_id=task.id,
+                worker_id=worker_id,
+                error_message=task.result or "Task execution failed",
+                error_type="TaskFailure",
+                metadata={'failure_count': task.failure_count},
+            )
 
-    async def _handle_failed_task(self, task: Task) -> bool:
         if task.failure_count >= 3:
             return True
-        task.failure_count += 1
-        # Remove the failed task from the channel
-        await self._channel.remove_task(task.id)
+
         if task.get_depth() >= 3:
             # Create a new worker node and reassign
             assignee = self._create_worker_node_for_task(task)
+
+            # Sync shared memory after creating new worker to provide context
+            if self.share_memory:
+                logger.info(
+                    f"Syncing shared memory after creating new worker "
+                    f"{assignee.node_id} for failed task {task.id}"
+                )
+                self._sync_shared_memory()
+
             await self._post_task(task, assignee.node_id)
+            action_taken = f"reassigned to new worker {assignee.node_id}"
         else:
             subtasks = self._decompose_task(task)
+            if self.metrics_logger and subtasks:
+                self.metrics_logger.log_task_decomposed(
+                    parent_task_id=task.id,
+                    subtask_ids=[st.id for st in subtasks],
+                )
+                for subtask in subtasks:
+                    self.metrics_logger.log_task_created(
+                        task_id=subtask.id,
+                        description=subtask.content,
+                        parent_task_id=task.id,
+                        task_type=subtask.type,
+                        metadata=subtask.additional_info,
+                    )
             # Insert packets at the head of the queue
             self._pending_tasks.extendleft(reversed(subtasks))
+
+            # Sync shared memory after task decomposition
+            if self.share_memory:
+                logger.info(
+                    f"Syncing shared memory after decomposing failed "
+                    f"task {task.id}"
+                )
+                self._sync_shared_memory()
+
             await self._post_ready_tasks()
+            action_taken = f"decomposed into {len(subtasks)} subtasks"
+        if task.id in self._assignees:
+            await self._channel.archive_task(task.id)
+
+        logger.debug(
+            f"Task {task.id} failed and was {action_taken}. "
+            f"Updating dependency state."
+        )
+        # Mark task as completed for dependency tracking
+        self._completed_tasks.append(task)
+
+        # Post next ready tasks
+
+        # Sync shared memory after task completion to share knowledge
+        if self.share_memory:
+            logger.info(
+                f"Syncing shared memory after task {task.id} completion"
+            )
+            self._sync_shared_memory()
+
+        # Check if any pending tasks are now ready to execute
+        await self._post_ready_tasks()
         return False
 
     async def _handle_completed_task(self, task: Task) -> None:
-        # archive the packet, making it into a dependency
-        self._pending_tasks.popleft()
-        await self._channel.archive_task(task.id)
+        if self.metrics_logger:
+            worker_id = self._assignees.get(task.id, "unknown")
+            processing_time_seconds = None
+            token_usage = None
+
+            # Get processing time from task start time or additional info
+            if task.id in self._task_start_times:
+                processing_time_seconds = (
+                    time.time() - self._task_start_times[task.id]
+                )
+                del self._task_start_times[task.id]  # Prevent memory leaks
+            elif (
+                task.additional_info is not None
+                and 'processing_time_seconds' in task.additional_info
+            ):
+                processing_time_seconds = task.additional_info[
+                    'processing_time_seconds'
+                ]
+
+            # Get token usage from task additional info (preferred - actual
+            # usage)
+            if (
+                task.additional_info is not None
+                and 'token_usage' in task.additional_info
+            ):
+                token_usage = task.additional_info['token_usage']
+            else:
+                # Fallback: Try to get token usage from SingleAgentWorker
+                # memory
+                assignee_node = next(
+                    (
+                        child
+                        for child in self._children
+                        if child.node_id == worker_id
+                    ),
+                    None,
+                )
+                if isinstance(assignee_node, SingleAgentWorker):
+                    try:
+                        _, total_tokens = (
+                            assignee_node.worker.memory.get_context()
+                        )
+                        token_usage = {'total_tokens': total_tokens}
+                    except Exception:
+                        token_usage = None
+
+            # Log the completed task
+            self.metrics_logger.log_task_completed(
+                task_id=task.id,
+                worker_id=worker_id,
+                result_summary=task.result if task.result else "Completed",
+                processing_time_seconds=processing_time_seconds,
+                token_usage=token_usage,
+                metadata={'current_state': task.state.value},
+            )
+
+        # Find and remove the completed task from pending tasks
+        tasks_list = list(self._pending_tasks)
+        found_and_removed = False
+
+        for i, pending_task in enumerate(tasks_list):
+            if pending_task.id == task.id:
+                # Remove this specific task
+                tasks_list.pop(i)
+                self._pending_tasks = deque(tasks_list)
+                found_and_removed = True
+                print(
+                    f"{Fore.GREEN}✅ Task {task.id} completed and removed "
+                    f"from queue.{Fore.RESET}"
+                )
+                break
+
+        if not found_and_removed:
+            # Task was already removed from pending queue (expected case when
+            # it had been popped immediately after posting). No need to
+            # draw user attention with a warning; record at debug level.
+            logger.debug(
+                f"Completed task {task.id} was already removed from pending "
+                "queue."
+            )
+
+        # Archive the task and update dependency tracking
+        if task.id in self._assignees:
+            await self._channel.archive_task(task.id)
+
+        # Ensure it's in completed tasks set
+        self._completed_tasks.append(task)
+
+        # Handle parent task completion logic
+        parent = task.parent
+        if parent and parent.id not in {t.id for t in self._completed_tasks}:
+            all_subtasks_done = all(
+                sub.id in {t.id for t in self._completed_tasks}
+                for sub in parent.subtasks
+            )
+            if all_subtasks_done:
+                # Set the parent task state to done
+                parent.state = TaskState.DONE
+                logger.debug(
+                    f"All subtasks of {parent.id} are done. "
+                    f"Marking parent as complete."
+                )
+                # Treat the parent task as a completed task to unblock
+                # its dependents. Since it was never sent to a worker,
+                # we call this method recursively.
+                await self._handle_completed_task(parent)
+
+        # Sync shared memory after task completion to share knowledge
+        if self.share_memory:
+            logger.info(
+                f"Syncing shared memory after task {task.id} completion"
+            )
+            self._sync_shared_memory()
+
+        # Check if any pending tasks are now ready to execute
         await self._post_ready_tasks()
 
     async def _graceful_shutdown(self, failed_task: Task) -> None:
@@ -521,50 +1586,157 @@ class Workforce(BaseNode):
521
1586
  f"seconds due to failure. You can use this time to inspect the "
522
1587
  f"current state of the workforce."
523
1588
  )
524
-
525
1589
  # Wait for the full timeout period
526
1590
  await asyncio.sleep(self.graceful_shutdown_timeout)
527
1591
 
1592
+ def get_workforce_log_tree(self) -> str:
1593
+ r"""Returns an ASCII tree representation of the task hierarchy and
1594
+ worker status.
1595
+ """
1596
+ if not self.metrics_logger:
1597
+ return "Logger not initialized."
1598
+ return self.metrics_logger.get_ascii_tree_representation()
1599
+
1600
+ def get_workforce_kpis(self) -> Dict[str, Any]:
1601
+ r"""Returns a dictionary of key performance indicators."""
1602
+ if not self.metrics_logger:
1603
+ return {"error": "Logger not initialized."}
1604
+ return self.metrics_logger.get_kpis()
1605
+
1606
+ def dump_workforce_logs(self, file_path: str) -> None:
1607
+ r"""Dumps all collected logs to a JSON file.
1608
+
1609
+ Args:
1610
+ file_path (str): The path to the JSON file.
1611
+ """
1612
+ if not self.metrics_logger:
1613
+ print("Logger not initialized. Cannot dump logs.")
1614
+ return
1615
+ self.metrics_logger.dump_to_json(file_path)
1616
+ # Use logger.info or print, consistent with existing style
1617
+ logger.info(f"Workforce logs dumped to {file_path}")
1618
+
528
1619
  @check_if_running(False)
529
1620
  async def _listen_to_channel(self) -> None:
530
1621
  r"""Continuously listen to the channel, post task to the channel and
531
- track the status of posted tasks.
1622
+ track the status of posted tasks. Now supports pause/resume and
1623
+ graceful stop.
532
1624
  """
533
1625
 
534
1626
  self._running = True
1627
+ self._state = WorkforceState.RUNNING
535
1628
  logger.info(f"Workforce {self.node_id} started.")
536
1629
 
537
1630
  await self._post_ready_tasks()
538
1631
 
539
- while self._task is None or self._pending_tasks:
540
- returned_task = await self._get_returned_task()
541
- if returned_task.state == TaskState.DONE:
542
- await self._handle_completed_task(returned_task)
543
- elif returned_task.state == TaskState.FAILED:
544
- halt = await self._handle_failed_task(returned_task)
545
- if not halt:
546
- continue
547
- print(
548
- f"{Fore.RED}Task {returned_task.id} has failed "
549
- f"for 3 times, halting the workforce.{Fore.RESET}"
550
- )
551
- # Graceful shutdown instead of immediate break
552
- await self._graceful_shutdown(returned_task)
553
- break
554
- elif returned_task.state == TaskState.OPEN:
555
- # TODO: multi-layer workforce
556
- pass
557
- else:
558
- raise ValueError(
559
- f"Task {returned_task.id} has an unexpected state."
560
- )
1632
+ while (
1633
+ self._task is None
1634
+ or self._pending_tasks
1635
+ or self._in_flight_tasks > 0
1636
+ ) and not self._stop_requested:
1637
+ try:
1638
+ # Check for pause request at the beginning of each loop
1639
+ # iteration
1640
+ await self._pause_event.wait()
1641
+
1642
+ # Check for stop request after potential pause
1643
+ if self._stop_requested:
1644
+ logger.info("Stop requested, breaking execution loop.")
1645
+ break
1646
+
1647
+ # Save snapshot before processing next task
1648
+ if self._pending_tasks:
1649
+ current_task = self._pending_tasks[0]
1650
+ # Throttled snapshot
1651
+ if (
1652
+ time.time() - self._last_snapshot_time
1653
+ >= self.snapshot_interval
1654
+ ):
1655
+ self.save_snapshot(
1656
+ f"Before processing task: {current_task.id}"
1657
+ )
1658
+ self._last_snapshot_time = time.time()
1659
+
1660
+ # Get returned task (this may block until a task is returned)
1661
+ returned_task = await self._get_returned_task()
1662
+ self._in_flight_tasks -= 1
1663
+
1664
+ # Check for stop request after getting task
1665
+ if self._stop_requested:
1666
+ logger.info("Stop requested after receiving task.")
1667
+ break
1668
+
1669
+ # Process the returned task based on its state
1670
+ if returned_task.state == TaskState.DONE:
1671
+ print(
1672
+ f"{Fore.CYAN}🎯 Task {returned_task.id} completed "
1673
+ f"successfully.{Fore.RESET}"
1674
+ )
1675
+ await self._handle_completed_task(returned_task)
1676
+ elif returned_task.state == TaskState.FAILED:
1677
+ halt = await self._handle_failed_task(returned_task)
1678
+ if not halt:
1679
+ continue
1680
+ print(
1681
+ f"{Fore.RED}Task {returned_task.id} has failed "
1682
+ f"for 3 times, halting the workforce.{Fore.RESET}"
1683
+ )
1684
+ # Graceful shutdown instead of immediate break
1685
+ await self._graceful_shutdown(returned_task)
1686
+ break
1687
+ elif returned_task.state == TaskState.OPEN:
1688
+ # TODO: multi-layer workforce
1689
+ pass
1690
+ else:
1691
+ raise ValueError(
1692
+ f"Task {returned_task.id} has an unexpected state."
1693
+ )
1694
+
1695
+ except Exception as e:
1696
+ logger.error(f"Error processing task: {e}")
1697
+ if self._stop_requested:
1698
+ break
1699
+ # Continue with next iteration unless stop is requested
1700
+ continue
1701
+
1702
+ # Handle final state
1703
+ if self._stop_requested:
1704
+ self._state = WorkforceState.STOPPED
1705
+ logger.info("Workforce stopped by user request.")
1706
+ elif not self._pending_tasks and self._in_flight_tasks == 0:
1707
+ self._state = WorkforceState.IDLE
1708
+ logger.info("All tasks completed.")
561
1709
 
562
1710
  # shut down the whole workforce tree
563
1711
  self.stop()
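
The loop above gates every iteration on an asyncio.Event plus a stop flag. Below is a self-contained sketch of that pattern; PausableLoop and its method names are illustrative, not the Workforce API.

    import asyncio

    class PausableLoop:
        def __init__(self) -> None:
            self._pause_event = asyncio.Event()
            self._pause_event.set()          # start unpaused
            self._stop_requested = False

        def pause(self) -> None:
            self._pause_event.clear()

        def resume(self) -> None:
            self._pause_event.set()

        def stop(self) -> None:
            self._stop_requested = True
            self._pause_event.set()          # wake the loop so it can exit

        async def run(self, iterations: int) -> None:
            for _ in range(iterations):
                await self._pause_event.wait()   # blocks here while paused
                if self._stop_requested:
                    break
                await asyncio.sleep(0.1)         # stand-in for real work

    async def main() -> None:
        worker = PausableLoop()
        task = asyncio.create_task(worker.run(50))
        await asyncio.sleep(0.25)
        worker.pause()                           # loop parks at wait()
        await asyncio.sleep(0.25)
        worker.resume()                          # loop picks up again
        worker.stop()
        await task

    asyncio.run(main())
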
564
1712
 
1713
+ def _submit_coro_to_loop(self, coro: 'Coroutine') -> None:
1714
+ r"""Thread-safe submission of coroutine to the workforce loop."""
1715
+
1716
+ loop = self._loop
1717
+ if loop is None or loop.is_closed():
1718
+ logger.warning("Cannot submit coroutine - no active event loop")
1719
+ return
1720
+ try:
1721
+ running_loop = asyncio.get_running_loop()
1722
+ except RuntimeError:
1723
+ running_loop = None
1724
+
1725
+ if running_loop is loop:
1726
+ loop.create_task(coro)
1727
+ else:
1728
+ asyncio.run_coroutine_threadsafe(coro, loop)
1729
+
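
A runnable sketch of the same dispatch rule: schedule with create_task when already on the target loop, otherwise hand off with run_coroutine_threadsafe. The submit and job names are illustrative.

    import asyncio
    import threading

    def submit(coro, loop: asyncio.AbstractEventLoop) -> None:
        # Same rule as above: schedule directly when running on the target
        # loop, otherwise hand off to it thread-safely.
        try:
            running = asyncio.get_running_loop()
        except RuntimeError:
            running = None
        if running is loop:
            loop.create_task(coro)
        else:
            asyncio.run_coroutine_threadsafe(coro, loop)

    async def job(tag: str) -> None:
        print(f"job {tag} ran on the loop thread")

    async def main() -> None:
        loop = asyncio.get_running_loop()
        submit(job("same-thread"), loop)          # goes through create_task
        t = threading.Thread(target=submit, args=(job("other-thread"), loop))
        t.start()
        t.join()
        await asyncio.sleep(0.1)                  # let both jobs run

    asyncio.run(main())
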
565
1730
  @check_if_running(False)
566
1731
  async def start(self) -> None:
567
1732
  r"""Start itself and all the child nodes under it."""
1733
+ # Sync shared memory at the start to ensure all agents have context
1734
+ if self.share_memory:
1735
+ logger.info(
1736
+ f"Syncing shared memory at workforce {self.node_id} startup"
1737
+ )
1738
+ self._sync_shared_memory()
1739
+
568
1740
  for child in self._children:
569
1741
  child_listening_task = asyncio.create_task(child.start())
570
1742
  self._child_listening_tasks.append(child_listening_task)
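
The kwargs below come from the clone() call later in this diff (share_memory, graceful_shutdown_timeout); the rest of the constructor signature is an assumption, so treat this as a sketch of enabling the memory sync performed above.

    from camel.societies.workforce import Workforce

    wf = Workforce(
        "Research team",
        share_memory=True,               # sync agent memory at startup and
                                         # after each completed task
        graceful_shutdown_timeout=15.0,  # seconds to pause on failure
    )
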
@@ -576,7 +1748,8 @@ class Workforce(BaseNode):
576
1748
  by its parent node.
577
1749
  """
578
1750
  for child in self._children:
579
- child.stop()
1751
+ if child._running:
1752
+ child.stop()
580
1753
  for child_task in self._child_listening_tasks:
581
1754
  child_task.cancel()
582
1755
  self._running = False
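
start() and stop() above follow a standard asyncio lifecycle: one background task per child, cancelled on shutdown. A self-contained illustration of that pattern with made-up names (not the camel API):

    import asyncio

    async def child_listener(name: str) -> None:
        try:
            while True:
                await asyncio.sleep(0.1)      # stand-in for channel listening
        except asyncio.CancelledError:
            print(f"{name} stopped")
            raise

    async def main() -> None:
        listeners = [
            asyncio.create_task(child_listener(f"worker-{i}")) for i in range(3)
        ]
        await asyncio.sleep(0.3)              # parent coordinates work here
        for task in listeners:
            task.cancel()                     # mirrors stop(): cancel listeners
        await asyncio.gather(*listeners, return_exceptions=True)

    asyncio.run(main())
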
@@ -596,12 +1769,21 @@ class Workforce(BaseNode):
596
1769
  """
597
1770
 
598
1771
  # Create a new instance with the same configuration
1772
+ # Extract the original kwargs from the agents to properly clone them
1773
+ coordinator_kwargs = (
1774
+ getattr(self.coordinator_agent, 'init_kwargs', {}) or {}
1775
+ )
1776
+ task_kwargs = getattr(self.task_agent, 'init_kwargs', {}) or {}
1777
+
599
1778
  new_instance = Workforce(
600
1779
  description=self.description,
601
- coordinator_agent_kwargs={},
602
- task_agent_kwargs={},
603
- new_worker_agent_kwargs=self.new_worker_agent_kwargs,
1780
+ coordinator_agent_kwargs=coordinator_kwargs.copy(),
1781
+ task_agent_kwargs=task_kwargs.copy(),
1782
+ new_worker_agent_kwargs=self.new_worker_agent_kwargs.copy()
1783
+ if self.new_worker_agent_kwargs
1784
+ else None,
604
1785
  graceful_shutdown_timeout=self.graceful_shutdown_timeout,
1786
+ share_memory=self.share_memory,
605
1787
  )
606
1788
 
607
1789
  new_instance.task_agent = self.task_agent.clone(with_memory)
@@ -613,7 +1795,9 @@ class Workforce(BaseNode):
613
1795
  if isinstance(child, SingleAgentWorker):
614
1796
  cloned_worker = child.worker.clone(with_memory)
615
1797
  new_instance.add_single_agent_worker(
616
- child.description, cloned_worker
1798
+ child.description,
1799
+ cloned_worker,
1800
+ child.max_concurrent_tasks,
617
1801
  )
618
1802
  elif isinstance(child, RolePlayingWorker):
619
1803
  new_instance.add_role_playing_worker(
@@ -624,6 +1808,7 @@ class Workforce(BaseNode):
624
1808
  child.user_agent_kwargs,
625
1809
  child.summarize_agent_kwargs,
626
1810
  child.chat_turn_limit,
1811
+ child.max_concurrent_tasks,
627
1812
  )
628
1813
  elif isinstance(child, Workforce):
629
1814
  new_instance.add_workforce(child.clone(with_memory))
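
A hedged usage sketch of cloning, based only on the calls visible in this hunk (clone(with_memory) on the task agent, workers, and nested workforces); the default value of with_memory is an assumption.

    from camel.societies.workforce import Workforce

    wf = Workforce("Research team")
    # Copies the topology and configuration; with_memory=True would also
    # carry each agent's conversation memory into the clone.
    wf_copy = wf.clone(with_memory=False)
    assert wf_copy is not wf
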
@@ -682,7 +1867,9 @@ class Workforce(BaseNode):
682
1867
  workforce_instance = self
683
1868
 
684
1869
  # Define functions first
685
- def process_task(task_content, task_id=None, additional_info=None):
1870
+ async def process_task(
1871
+ task_content, task_id=None, additional_info=None
1872
+ ):
686
1873
  r"""Process a task using the workforce.
687
1874
 
688
1875
  Args:
@@ -704,7 +1891,8 @@ class Workforce(BaseNode):
704
1891
  - message (str): Error message if status is "error"
705
1892
 
706
1893
  Example:
707
- >>> result = process_task("Analyze market trends", "task_001")
1894
+ >>> result = await process_task("Analyze market trends",
1895
+ "task_001")
708
1896
  >>> print(result["status"]) # "success" or "error"
709
1897
  """
710
1898
  task = Task(
@@ -714,7 +1902,7 @@ class Workforce(BaseNode):
714
1902
  )
715
1903
 
716
1904
  try:
717
- result_task = workforce_instance.process_task(task)
1905
+ result_task = await workforce_instance.process_task_async(task)
718
1906
  return {
719
1907
  "status": "success",
720
1908
  "task_id": result_task.id,
@@ -834,9 +2022,9 @@ class Workforce(BaseNode):
834
2022
  >>> for child in children:
835
2023
  ... print(f"{child['type']}: {child['description']}")
836
2024
  """
837
- children_info = []
2025
+ children_info: List[Dict[str, Any]] = []
838
2026
  for child in workforce_instance._children:
839
- child_info = {
2027
+ child_info: Dict[str, Any] = {
840
2028
  "node_id": child.node_id,
841
2029
  "description": child.description,
842
2030
  "type": type(child).__name__,