camel-ai 0.2.66__py3-none-any.whl → 0.2.67__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. camel/__init__.py +1 -1
  2. camel/configs/__init__.py +3 -0
  3. camel/configs/qianfan_config.py +85 -0
  4. camel/models/__init__.py +2 -0
  5. camel/models/aiml_model.py +8 -0
  6. camel/models/anthropic_model.py +8 -0
  7. camel/models/aws_bedrock_model.py +8 -0
  8. camel/models/azure_openai_model.py +14 -5
  9. camel/models/base_model.py +4 -0
  10. camel/models/cohere_model.py +9 -2
  11. camel/models/crynux_model.py +8 -0
  12. camel/models/deepseek_model.py +8 -0
  13. camel/models/gemini_model.py +8 -0
  14. camel/models/groq_model.py +8 -0
  15. camel/models/internlm_model.py +8 -0
  16. camel/models/litellm_model.py +5 -0
  17. camel/models/lmstudio_model.py +14 -1
  18. camel/models/mistral_model.py +15 -1
  19. camel/models/model_factory.py +6 -0
  20. camel/models/modelscope_model.py +8 -0
  21. camel/models/moonshot_model.py +8 -0
  22. camel/models/nemotron_model.py +17 -2
  23. camel/models/netmind_model.py +8 -0
  24. camel/models/novita_model.py +8 -0
  25. camel/models/nvidia_model.py +8 -0
  26. camel/models/ollama_model.py +8 -0
  27. camel/models/openai_compatible_model.py +23 -5
  28. camel/models/openai_model.py +21 -4
  29. camel/models/openrouter_model.py +8 -0
  30. camel/models/ppio_model.py +8 -0
  31. camel/models/qianfan_model.py +104 -0
  32. camel/models/qwen_model.py +8 -0
  33. camel/models/reka_model.py +18 -3
  34. camel/models/samba_model.py +17 -3
  35. camel/models/sglang_model.py +20 -5
  36. camel/models/siliconflow_model.py +8 -0
  37. camel/models/stub_model.py +8 -1
  38. camel/models/togetherai_model.py +8 -0
  39. camel/models/vllm_model.py +7 -0
  40. camel/models/volcano_model.py +14 -1
  41. camel/models/watsonx_model.py +4 -1
  42. camel/models/yi_model.py +8 -0
  43. camel/models/zhipuai_model.py +8 -0
  44. camel/societies/workforce/prompts.py +33 -17
  45. camel/societies/workforce/role_playing_worker.py +3 -8
  46. camel/societies/workforce/single_agent_worker.py +1 -3
  47. camel/societies/workforce/task_channel.py +16 -18
  48. camel/societies/workforce/utils.py +104 -14
  49. camel/societies/workforce/workforce.py +1253 -99
  50. camel/societies/workforce/workforce_logger.py +613 -0
  51. camel/tasks/task.py +16 -5
  52. camel/toolkits/__init__.py +2 -0
  53. camel/toolkits/code_execution.py +1 -1
  54. camel/toolkits/playwright_mcp_toolkit.py +2 -1
  55. camel/toolkits/pptx_toolkit.py +4 -4
  56. camel/types/enums.py +32 -0
  57. camel/types/unified_model_type.py +5 -0
  58. {camel_ai-0.2.66.dist-info → camel_ai-0.2.67.dist-info}/METADATA +3 -3
  59. {camel_ai-0.2.66.dist-info → camel_ai-0.2.67.dist-info}/RECORD +61 -58
  60. {camel_ai-0.2.66.dist-info → camel_ai-0.2.67.dist-info}/WHEEL +0 -0
  61. {camel_ai-0.2.66.dist-info → camel_ai-0.2.67.dist-info}/licenses/LICENSE +0 -0
@@ -15,9 +15,11 @@ from __future__ import annotations
15
15
 
16
16
  import asyncio
17
17
  import json
18
+ import time
18
19
  import uuid
19
20
  from collections import deque
20
- from typing import Deque, Dict, List, Optional
21
+ from enum import Enum
22
+ from typing import Any, Coroutine, Deque, Dict, List, Optional
21
23
 
22
24
  from colorama import Fore
23
25
 
@@ -41,13 +43,56 @@ from camel.societies.workforce.utils import (
41
43
  )
42
44
  from camel.societies.workforce.worker import Worker
43
45
  from camel.tasks.task import Task, TaskState, validate_task_content
44
- from camel.toolkits import CodeExecutionToolkit, SearchToolkit, ThinkingToolkit
46
+ from camel.toolkits import (
47
+ CodeExecutionToolkit,
48
+ SearchToolkit,
49
+ TaskPlanningToolkit,
50
+ ThinkingToolkit,
51
+ )
45
52
  from camel.types import ModelPlatformType, ModelType
46
53
  from camel.utils import dependencies_required
47
54
 
55
+ from .workforce_logger import WorkforceLogger
56
+
48
57
  logger = get_logger(__name__)
49
58
 
50
59
 
60
class WorkforceState(Enum):
    r"""Execution states of a workforce, used by the human-intervention
    controls (pause / resume / stop).

    Each member's ``value`` is the lowercase string reported by the
    workforce status and ``repr`` helpers.
    """

    # State assigned at construction time and again on reset.
    IDLE = "idle"
    # A task is currently being processed.
    RUNNING = "running"
    # Processing was halted by a pause request and can be resumed.
    PAUSED = "paused"
    # Processing ended because execution raised an error.
    STOPPED = "stopped"
67
+
68
+
69
class WorkforceSnapshot:
    r"""Point-in-time copy of the workforce bookkeeping collections, used to
    restore execution later.

    The containers (deque, lists, dicts) are copied so that later changes to
    the live workforce collections do not alter the snapshot. The ``Task``
    objects themselves are shared by reference and are not copied.

    Args:
        main_task (Optional[Task], optional): The top-level task being
            processed. (default: :obj:`None`)
        pending_tasks (Optional[Deque[Task]], optional): Tasks still waiting
            to be executed. (default: :obj:`None`)
        completed_tasks (Optional[List[Task]], optional): Tasks that already
            finished. (default: :obj:`None`)
        task_dependencies (Optional[Dict[str, List[str]]], optional):
            Mapping from a task id to the ids it depends on.
            (default: :obj:`None`)
        assignees (Optional[Dict[str, str]], optional): Mapping from a task
            id to the id of the node it was assigned to.
            (default: :obj:`None`)
        current_task_index (int, optional): Progress marker stored with the
            snapshot. (default: :obj:`0`)
        description (str, optional): Human readable label.
            (default: :obj:`""`)
    """

    def __init__(
        self,
        main_task: Optional[Task] = None,
        pending_tasks: Optional[Deque[Task]] = None,
        completed_tasks: Optional[List[Task]] = None,
        task_dependencies: Optional[Dict[str, List[str]]] = None,
        assignees: Optional[Dict[str, str]] = None,
        current_task_index: int = 0,
        description: str = "",
    ):
        self.main_task = main_task
        self.pending_tasks = pending_tasks.copy() if pending_tasks else deque()
        self.completed_tasks = (
            completed_tasks.copy() if completed_tasks else []
        )
        # Copy the inner dependency lists as well: a plain ``dict.copy()``
        # would keep sharing them with the live workforce, so appending a
        # dependency later would silently rewrite the snapshot.
        self.task_dependencies = (
            {
                task_id: list(depends_on)
                for task_id, depends_on in task_dependencies.items()
            }
            if task_dependencies
            else {}
        )
        self.assignees = assignees.copy() if assignees else {}
        self.current_task_index = current_task_index
        self.description = description
        # Creation time in seconds since the epoch.
        self.timestamp = time.time()
94
+
95
+
51
96
  class Workforce(BaseNode):
52
97
  r"""A system where multiple worker nodes (agents) cooperate together
53
98
  to solve tasks. It can assign tasks to worker nodes and also take
@@ -90,21 +135,35 @@ class Workforce(BaseNode):
90
135
  for graceful shutdown when a task fails 3 times. During this
91
136
  period, the workforce remains active for debugging.
92
137
  Set to 0 for immediate shutdown. (default: :obj:`15.0`)
138
+ share_memory (bool, optional): Whether to enable shared memory across
139
+ SingleAgentWorker instances in the workforce. When enabled, all
140
+ SingleAgentWorker instances, coordinator agent, and task planning
141
+ agent will share their complete conversation history and
142
+ function-calling trajectory, providing better context for task
143
+ handoffs and continuity. Note: Currently only supports
144
+ SingleAgentWorker instances; RolePlayingWorker and nested
145
+ Workforce instances do not participate in memory sharing.
146
+ (default: :obj:`False`)
93
147
 
94
148
  Example:
95
- >>> # Configure with custom model
149
+ >>> # Configure with custom model and shared memory
150
+ >>> import asyncio
96
151
  >>> model = ModelFactory.create(
97
152
  ... ModelPlatformType.OPENAI, ModelType.GPT_4O
98
153
  ... )
99
154
  >>> workforce = Workforce(
100
155
  ... "Research Team",
101
156
  ... coordinator_agent_kwargs={"model": model, "token_limit": 4000},
102
- ... task_agent_kwargs={"model": model, "token_limit": 8000}
157
+ ... task_agent_kwargs={"model": model, "token_limit": 8000},
158
+ ... share_memory=True # Enable shared memory
103
159
  ... )
104
160
  >>>
105
161
  >>> # Process a task
106
- >>> task = Task(content="Research AI trends", id="1")
107
- >>> result = workforce.process_task(task)
162
+ >>> async def main():
163
+ ... task = Task(content="Research AI trends", id="1")
164
+ ... result = workforce.process_task(task)
165
+ ... return result
166
+ >>> asyncio.run(main())
108
167
  """
109
168
 
110
169
  def __init__(
@@ -115,12 +174,44 @@ class Workforce(BaseNode):
115
174
  task_agent_kwargs: Optional[Dict] = None,
116
175
  new_worker_agent_kwargs: Optional[Dict] = None,
117
176
  graceful_shutdown_timeout: float = 15.0,
177
+ share_memory: bool = False,
118
178
  ) -> None:
119
179
  super().__init__(description)
120
180
  self._child_listening_tasks: Deque[asyncio.Task] = deque()
121
181
  self._children = children or []
122
182
  self.new_worker_agent_kwargs = new_worker_agent_kwargs
123
183
  self.graceful_shutdown_timeout = graceful_shutdown_timeout
184
+ self.share_memory = share_memory
185
+ self.metrics_logger = WorkforceLogger(workforce_id=self.node_id)
186
+ self._task: Optional[Task] = None
187
+ self._pending_tasks: Deque[Task] = deque()
188
+ self._task_dependencies: Dict[str, List[str]] = {}
189
+ self._assignees: Dict[str, str] = {}
190
+ self._in_flight_tasks: int = 0
191
+ # Dictionary to track task start times
192
+ self._task_start_times: Dict[str, float] = {}
193
+ # Human intervention support
194
+ self._state = WorkforceState.IDLE
195
+ self._pause_event = asyncio.Event()
196
+ self._pause_event.set() # Initially not paused
197
+ self._stop_requested = False
198
+ self._snapshots: List[WorkforceSnapshot] = []
199
+ self._completed_tasks: List[Task] = []
200
+ self._loop: Optional[asyncio.AbstractEventLoop] = None
201
+ self._main_task_future: Optional[asyncio.Future] = None
202
+ # Snapshot throttle support
203
+ self._last_snapshot_time: float = 0.0
204
+ # Minimum seconds between automatic snapshots
205
+ self.snapshot_interval: float = 30.0
206
+ if self.metrics_logger:
207
+ for child in self._children:
208
+ worker_type = type(child).__name__
209
+ role_or_desc = child.description
210
+ self.metrics_logger.log_worker_created(
211
+ worker_id=child.node_id,
212
+ worker_type=worker_type,
213
+ role=role_or_desc,
214
+ )
124
215
 
125
216
  # Warning messages for default model usage
126
217
  if coordinator_agent_kwargs is None:
@@ -154,6 +245,13 @@ class Workforce(BaseNode):
154
245
  "available options."
155
246
  )
156
247
 
248
+ if self.share_memory:
249
+ logger.info(
250
+ "Shared memory enabled. All agents will share their complete "
251
+ "conversation history and function-calling trajectory for "
252
+ "better context continuity during task handoffs."
253
+ )
254
+
157
255
  coord_agent_sys_msg = BaseMessage.make_assistant_message(
158
256
  role_name="Workforce Manager",
159
257
  content="You are coordinating a group of workers. A worker can be "
@@ -163,21 +261,157 @@ class Workforce(BaseNode):
163
261
  "a new worker for a task, etc.",
164
262
  )
165
263
  self.coordinator_agent = ChatAgent(
166
- coord_agent_sys_msg, **(coordinator_agent_kwargs or {})
264
+ coord_agent_sys_msg,
265
+ **(coordinator_agent_kwargs or {}),
167
266
  )
168
267
 
169
268
  task_sys_msg = BaseMessage.make_assistant_message(
170
269
  role_name="Task Planner",
171
- content="You are going to compose and decompose tasks.",
270
+ content="You are going to compose and decompose tasks. Keep "
271
+ "tasks that are sequential and require the same type of "
272
+ "agent together in one agent process. Only decompose tasks "
273
+ "that can be handled in parallel and require different types "
274
+ "of agents. This ensures efficient execution by minimizing "
275
+ "context switching between agents.",
172
276
  )
173
- self.task_agent = ChatAgent(task_sys_msg, **(task_agent_kwargs or {}))
174
-
175
- # If there is one, will set by the workforce class wrapping this
176
- self._task: Optional[Task] = None
177
- self._pending_tasks: Deque[Task] = deque()
277
+ _kwargs = dict(task_agent_kwargs or {})
278
+ extra_tools = TaskPlanningToolkit().get_tools()
279
+ _kwargs["tools"] = [*_kwargs.get("tools", []), *extra_tools]
280
+ self.task_agent = ChatAgent(task_sys_msg, **_kwargs)
178
281
 
179
282
  def __repr__(self):
180
- return f"Workforce {self.node_id} ({self.description})"
283
+ return (
284
+ f"Workforce {self.node_id} ({self.description}) - "
285
+ f"State: {self._state.value}"
286
+ )
287
+
288
+ def _collect_shared_memory(self) -> Dict[str, List]:
289
+ r"""Collect memory from all SingleAgentWorker instances for sharing.
290
+
291
+ Returns:
292
+ Dict[str, List]: A dictionary mapping agent types to their memory
293
+ records. Contains entries for 'coordinator', 'task_agent',
294
+ and 'workers'.
295
+ """
296
+ # TODO: add memory collection for RolePlayingWorker and nested
297
+ # Workforce instances
298
+ if not self.share_memory:
299
+ return {}
300
+
301
+ shared_memory: Dict[str, List] = {
302
+ 'coordinator': [],
303
+ 'task_agent': [],
304
+ 'workers': [],
305
+ }
306
+
307
+ try:
308
+ # Collect coordinator agent memory
309
+ coord_records = self.coordinator_agent.memory.retrieve()
310
+ shared_memory['coordinator'] = [
311
+ record.memory_record.to_dict() for record in coord_records
312
+ ]
313
+
314
+ # Collect task agent memory
315
+ task_records = self.task_agent.memory.retrieve()
316
+ shared_memory['task_agent'] = [
317
+ record.memory_record.to_dict() for record in task_records
318
+ ]
319
+
320
+ # Collect worker memory only from SingleAgentWorker instances
321
+ for child in self._children:
322
+ if isinstance(child, SingleAgentWorker):
323
+ worker_records = child.worker.memory.retrieve()
324
+ worker_memory = [
325
+ record.memory_record.to_dict()
326
+ for record in worker_records
327
+ ]
328
+ shared_memory['workers'].extend(worker_memory)
329
+
330
+ except Exception as e:
331
+ logger.warning(f"Error collecting shared memory: {e}")
332
+
333
+ return shared_memory
334
+
335
+ def _share_memory_with_agents(
336
+ self, shared_memory: Dict[str, List]
337
+ ) -> None:
338
+ r"""Share collected memory with coordinator, task agent, and
339
+ SingleAgentWorker instances.
340
+
341
+ Args:
342
+ shared_memory (Dict[str, List]): Memory records collected from
343
+ all agents to be shared.
344
+ """
345
+ if not self.share_memory or not shared_memory:
346
+ return
347
+
348
+ try:
349
+ # Create a consolidated memory from all collected records
350
+ all_records = []
351
+ for _memory_type, records in shared_memory.items():
352
+ all_records.extend(records)
353
+
354
+ if not all_records:
355
+ return
356
+
357
+ # Import necessary classes for memory record reconstruction
358
+ from camel.memories.records import MemoryRecord
359
+
360
+ # Create consolidated memory objects from records
361
+ memory_records: List[MemoryRecord] = []
362
+ for record_dict in all_records:
363
+ try:
364
+ memory_record = MemoryRecord.from_dict(record_dict)
365
+ memory_records.append(memory_record)
366
+ except Exception as e:
367
+ logger.warning(f"Failed to reconstruct memory record: {e}")
368
+ continue
369
+
370
+ if not memory_records:
371
+ return
372
+
373
+ # Share with coordinator agent
374
+ for record in memory_records:
375
+ # Only add records from other agents to avoid duplication
376
+ if record.agent_id != self.coordinator_agent.agent_id:
377
+ self.coordinator_agent.memory.write_record(record)
378
+
379
+ # Share with task agent
380
+ for record in memory_records:
381
+ if record.agent_id != self.task_agent.agent_id:
382
+ self.task_agent.memory.write_record(record)
383
+
384
+ # Share with SingleAgentWorker instances only
385
+ single_agent_workers = [
386
+ child
387
+ for child in self._children
388
+ if isinstance(child, SingleAgentWorker)
389
+ ]
390
+
391
+ for worker in single_agent_workers:
392
+ for record in memory_records:
393
+ if record.agent_id != worker.worker.agent_id:
394
+ worker.worker.memory.write_record(record)
395
+
396
+ logger.info(
397
+ f"Shared {len(memory_records)} memory records across "
398
+ f"{len(single_agent_workers) + 2} agents in workforce "
399
+ f"{self.node_id}"
400
+ )
401
+
402
+ except Exception as e:
403
+ logger.warning(f"Error sharing memory with agents: {e}")
404
+
405
+ def _sync_shared_memory(self) -> None:
406
+ r"""Synchronize memory across all agents by collecting and sharing."""
407
+ if not self.share_memory:
408
+ return
409
+
410
+ try:
411
+ shared_memory = self._collect_shared_memory()
412
+ self._share_memory_with_agents(shared_memory)
413
+ except Exception as e:
414
+ logger.warning(f"Error synchronizing shared memory: {e}")
181
415
 
182
416
  def _decompose_task(self, task: Task) -> List[Task]:
183
417
  r"""Decompose the task into subtasks. This method will also set the
@@ -199,18 +433,313 @@ class Workforce(BaseNode):
199
433
 
200
434
  return subtasks
201
435
 
436
+ # Human intervention methods
437
async def _async_pause(self) -> None:
    r"""Pause the workforce; meant to be run on the workforce event loop.

    Only a workforce in the RUNNING state is affected; in any other state
    the call is a no-op.
    """
    if self._state != WorkforceState.RUNNING:
        return

    self._state = WorkforceState.PAUSED
    # The event is set while execution may proceed; clearing it marks the
    # workforce as paused.
    self._pause_event.clear()
    logger.info(f"Workforce {self.node_id} paused.")
443
+
444
def pause(self) -> None:
    r"""Pause the workforce execution.

    When the workforce already owns an open event loop, the asynchronous
    pause coroutine is submitted to that loop. Otherwise (no loop yet, or
    the loop is closed) the state is changed synchronously: a RUNNING
    workforce is marked PAUSED and its pause event is cleared.
    """
    loop_unavailable = not self._loop or self._loop.is_closed()
    if not loop_unavailable:
        self._submit_coro_to_loop(self._async_pause())
        return

    # No usable loop: flip the state directly, but only if running.
    if self._state != WorkforceState.RUNNING:
        return
    self._state = WorkforceState.PAUSED
    self._pause_event.clear()
    logger.info(
        f"Workforce {self.node_id} paused "
        f"(event-loop not yet started)."
    )
465
+
466
async def _async_resume(self) -> None:
    r"""Resume a paused workforce; meant to be run on the workforce event
    loop.

    Only a workforce in the PAUSED state is affected. After resuming, any
    pending tasks are posted again.
    """
    if self._state != WorkforceState.PAUSED:
        return

    self._state = WorkforceState.RUNNING
    # Setting the event lets code waiting on the pause gate continue.
    self._pause_event.set()
    logger.info(f"Workforce {self.node_id} resumed.")

    # Nothing to re-post when the queue is empty.
    if not self._pending_tasks:
        return
    await self._post_ready_tasks()
476
+
477
def resume(self) -> None:
    r"""Resume execution after a manual pause.

    When the workforce owns an open event loop, the asynchronous resume
    coroutine is submitted to it. Otherwise the state is changed
    synchronously: a PAUSED workforce is marked RUNNING and its pause
    event is set.
    """
    loop_unavailable = not self._loop or self._loop.is_closed()
    if not loop_unavailable:
        self._submit_coro_to_loop(self._async_resume())
        return

    # No usable loop: flip the state directly, but only if paused.
    if self._state != WorkforceState.PAUSED:
        return
    self._state = WorkforceState.RUNNING
    self._pause_event.set()
    logger.info(
        f"Workforce {self.node_id} resumed "
        f"(event-loop not yet started)."
    )
492
+
493
+ async def _async_stop_gracefully(self) -> None:
494
+ r"""Async implementation of stop_gracefully to run on the event
495
+ loop.
496
+ """
497
+ self._stop_requested = True
498
+ if self._pause_event.is_set() is False:
499
+ self._pause_event.set() # Resume if paused to process stop
500
+ logger.info(f"Workforce {self.node_id} stop requested.")
501
+
502
def stop_gracefully(self) -> None:
    r"""Request the workforce to stop.

    When the workforce owns an open event loop, the asynchronous stop
    coroutine is submitted to it. Otherwise the stop flag is set
    synchronously and the pause event is released, so that a loop started
    later can observe the request.
    """
    has_open_loop = bool(self._loop) and not self._loop.is_closed()
    if has_open_loop:
        self._submit_coro_to_loop(self._async_stop_gracefully())
        return

    # No usable loop: record the request directly.
    self._stop_requested = True
    # Release any pending pause so the stop request is not blocked by it.
    self._pause_event.set()
    logger.info(
        f"Workforce {self.node_id} stop requested "
        f"(event-loop not yet started)."
    )
524
+
525
def save_snapshot(self, description: str = "") -> None:
    r"""Save the current workforce bookkeeping state as a snapshot.

    The snapshot is appended to the internal snapshot list.

    Args:
        description (str, optional): Label stored with the snapshot. When
            empty, a label containing the current time is generated.
            (default: :obj:`""`)
    """
    # Resolve the label once so the stored snapshot and the log line agree
    # (previously the log printed the raw, possibly empty, argument).
    label = description or f"Snapshot at {time.time()}"
    snapshot = WorkforceSnapshot(
        main_task=self._task,
        pending_tasks=self._pending_tasks,
        completed_tasks=self._completed_tasks,
        task_dependencies=self._task_dependencies,
        assignees=self._assignees,
        # The number of completed tasks serves as the progress marker.
        current_task_index=len(self._completed_tasks),
        description=label,
    )
    self._snapshots.append(snapshot)
    logger.info(f"Snapshot saved: {label}")
538
+
539
def list_snapshots(self) -> List[str]:
    r"""Describe every stored snapshot.

    Returns:
        List[str]: One line per snapshot, in storage order, with its
            index, the number of completed and pending tasks and, when
            present, its description.
    """

    def _describe(position, snap) -> str:
        summary = (
            f"Snapshot {position}: {len(snap.completed_tasks)} completed, "
            f"{len(snap.pending_tasks)} pending"
        )
        # The description suffix is only added when one was stored.
        if snap.description:
            summary += f" - {snap.description}"
        return summary

    return [
        _describe(position, snap)
        for position, snap in enumerate(self._snapshots)
    ]
552
+
553
def get_pending_tasks(self) -> List[Task]:
    r"""Return the pending tasks for human review.

    Returns:
        List[Task]: A new list with the pending tasks in queue order;
            changing the list does not change the queue.
    """
    return [*self._pending_tasks]
556
+
557
def get_completed_tasks(self) -> List[Task]:
    r"""Return the completed tasks.

    Returns:
        List[Task]: A shallow copy of the completed-task list; changing
            the copy does not change the workforce's own list.
    """
    finished = self._completed_tasks
    return finished[:]
560
+
561
def modify_task_content(self, task_id: str, new_content: str) -> bool:
    r"""Replace the content of a pending task.

    Args:
        task_id (str): Id of the pending task to change.
        new_content (str): The new content; it is validated before any
            task is touched.

    Returns:
        bool: True if a pending task with the given id was updated, False
            if the content was rejected or no such task is pending.
    """
    # Reject invalid content before looking for the task.
    if not validate_task_content(new_content, task_id):
        logger.warning(
            f"Task {task_id} content modification rejected: "
            f"Invalid content. Content preview: '{new_content[:50]}...'"
        )
        return False

    # Only the first pending task with a matching id is modified.
    target = next(
        (item for item in self._pending_tasks if item.id == task_id), None
    )
    if target is None:
        logger.warning(f"Task {task_id} not found in pending tasks.")
        return False

    target.content = new_content
    logger.info(f"Task {task_id} content modified.")
    return True
578
+
579
def add_task(
    self,
    content: str,
    task_id: Optional[str] = None,
    additional_info: Optional[Dict[str, Any]] = None,
    insert_position: int = -1,
) -> Task:
    r"""Add a new task to the pending queue.

    Args:
        content (str): Content of the new task.
        task_id (Optional[str], optional): Id for the new task. When not
            given (or empty), an id of the form ``human_added_<n>`` is
            generated that is not used by any pending or completed task.
            (default: :obj:`None`)
        additional_info (Optional[Dict[str, Any]], optional): Extra
            information stored on the task. (default: :obj:`None`)
        insert_position (int, optional): Queue position for the new task.
            ``-1`` appends at the end; any other value is interpreted
            like the index argument of ``list.insert``.
            (default: :obj:`-1`)

    Returns:
        Task: The newly created task.
    """
    if not task_id:
        # The queue length alone is not unique: once tasks leave the queue
        # the same number comes up again. Start from it (keeps the old
        # ids when there is no clash) and advance past ids already in use.
        used_ids = {queued.id for queued in self._pending_tasks}
        used_ids.update(done.id for done in self._completed_tasks)
        suffix = len(self._pending_tasks)
        task_id = f"human_added_{suffix}"
        while task_id in used_ids:
            suffix += 1
            task_id = f"human_added_{suffix}"

    new_task = Task(
        content=content,
        id=task_id,
        additional_info=additional_info,
    )
    if insert_position == -1:
        self._pending_tasks.append(new_task)
    else:
        # deque.insert follows list.insert semantics, so no list round
        # trip is needed and the queue object is modified in place.
        self._pending_tasks.insert(insert_position, new_task)

    logger.info(f"New task added: {new_task.id}")
    return new_task
602
+
603
def remove_task(self, task_id: str) -> bool:
    r"""Remove a task from the pending queue.

    Args:
        task_id (str): Id of the pending task to remove.

    Returns:
        bool: True if a task was removed, False if no pending task has
            the given id.
    """
    remaining = list(self._pending_tasks)
    # Position of the first pending task with a matching id, if any.
    match_at = next(
        (
            position
            for position, item in enumerate(remaining)
            if item.id == task_id
        ),
        None,
    )
    if match_at is None:
        logger.warning(f"Task {task_id} not found in pending tasks.")
        return False

    del remaining[match_at]
    # The queue is replaced by a new deque without the removed task.
    self._pending_tasks = deque(remaining)
    logger.info(f"Task {task_id} removed.")
    return True
615
+
616
def reorder_tasks(self, task_ids: List[str]) -> bool:
    r"""Reorder the pending tasks according to the given list of task ids.

    The list must name every pending task exactly once; otherwise the
    queue is left untouched.

    Args:
        task_ids (List[str]): Ids of all pending tasks in the desired
            order.

    Returns:
        bool: True if the queue was reordered, False if an id is unknown,
            the number of ids does not match the queue, or an id is
            repeated.
    """
    # Map each pending task id to its task for lookup while rebuilding.
    tasks_by_id = {item.id: item for item in self._pending_tasks}

    if any(task_id not in tasks_by_id for task_id in task_ids):
        logger.warning("Some task IDs not found in pending tasks.")
        return False

    if len(task_ids) != len(self._pending_tasks):
        logger.warning(
            "Number of task IDs doesn't match pending tasks count."
        )
        return False

    # Known ids with a matching count can still contain repeats (e.g.
    # ["a", "a"] for pending [a, b]), which would duplicate one task and
    # silently drop another.
    if len(set(task_ids)) != len(task_ids):
        logger.warning("Duplicate task IDs provided for reordering.")
        return False

    self._pending_tasks = deque(tasks_by_id[task_id] for task_id in task_ids)

    logger.info("Tasks reordered successfully.")
    return True
639
+
640
def resume_from_task(self, task_id: str) -> bool:
    r"""Make a specific pending task the next one in the queue.

    The workforce must be paused. The pending queue is cut so that it
    starts at the task with the given id.

    Args:
        task_id (str): Id of the pending task to resume from.

    Returns:
        bool: True if the queue now starts at the requested task, False
            if the workforce is not paused or the task is not pending.
    """
    if self._state != WorkforceState.PAUSED:
        logger.warning(
            "Workforce must be paused to resume from specific task."
        )
        return False

    queue = list(self._pending_tasks)
    # Index of the first pending task with a matching id, -1 if absent.
    target_index = next(
        (
            position
            for position, item in enumerate(queue)
            if item.id == task_id
        ),
        -1,
    )
    if target_index == -1:
        logger.warning(f"Task {task_id} not found in pending tasks.")
        return False

    skipped, kept = queue[:target_index], queue[target_index:]

    # The queue now begins at the target task.
    self._pending_tasks = deque(kept)

    # NOTE(review): the tasks queued before the target are removed from the
    # pending queue here and are not re-added anywhere in this method,
    # although the log message below says they are moved "back to
    # pending" - confirm the intended behaviour.
    if skipped:
        for earlier in skipped:
            # Finished or failed tasks are reopened.
            if earlier.state in (TaskState.DONE, TaskState.FAILED):
                earlier.state = TaskState.OPEN
            # Result and failure counter are cleared for every such task.
            earlier.result = None
            earlier.failure_count = 0

        logger.info(
            f"Moving {len(skipped)} tasks back to pending "
            f"state."
        )

    logger.info(f"Ready to resume from task: {task_id}")
    return True
688
+
689
def restore_from_snapshot(self, snapshot_index: int) -> bool:
    r"""Restore the workforce bookkeeping state from a stored snapshot.

    Args:
        snapshot_index (int): Position of the snapshot in the snapshot
            list; negative values are rejected.

    Returns:
        bool: True if the state was restored, False if the index is out
            of range or the workforce is currently running.
    """
    if snapshot_index < 0 or snapshot_index >= len(self._snapshots):
        logger.warning(f"Invalid snapshot index: {snapshot_index}")
        return False

    if self._state == WorkforceState.RUNNING:
        logger.warning(
            "Cannot restore snapshot while workforce is running. "
            "Pause first."
        )
        return False

    saved = self._snapshots[snapshot_index]
    self._task = saved.main_task
    # Copies are installed so that the stored snapshot stays reusable.
    self._pending_tasks = saved.pending_tasks.copy()
    self._completed_tasks = saved.completed_tasks.copy()
    self._task_dependencies = saved.task_dependencies.copy()
    self._assignees = saved.assignees.copy()

    logger.info(f"Workforce state restored from snapshot {snapshot_index}")
    return True
711
+
712
def get_workforce_status(self) -> Dict:
    r"""Summarize the current workforce status for human review.

    Returns:
        Dict: The state value, the number of pending tasks, completed
            tasks, snapshots and children, and the id of the main task
            (None when no main task is set).
    """
    main_task = self._task
    status: Dict = {"state": self._state.value}
    status["pending_tasks_count"] = len(self._pending_tasks)
    status["completed_tasks_count"] = len(self._completed_tasks)
    status["snapshots_count"] = len(self._snapshots)
    status["children_count"] = len(self._children)
    status["main_task_id"] = main_task.id if main_task else None
    return status
722
+
202
723
  @check_if_running(False)
203
- def process_task(self, task: Task) -> Task:
204
- r"""The main entry point for the workforce to process a task. It will
205
- start the workforce and all the child nodes under it, process the
206
- task provided and return the updated task.
724
+ async def process_task_async(
725
+ self, task: Task, interactive: bool = False
726
+ ) -> Task:
727
+ r"""Main entry point to process a task asynchronously.
207
728
 
208
729
  Args:
209
730
  task (Task): The task to be processed.
731
+ interactive (bool, optional): If True, enables human-intervention
732
+ workflow (pause/resume/snapshot). Defaults to False, which
733
+ runs the task in a blocking one-shot manner.
210
734
 
211
735
  Returns:
212
736
  Task: The updated task.
213
737
  """
738
+ # Delegate to intervention pipeline when requested to keep
739
+ # backward-compat.
740
+ if interactive:
741
+ return await self._process_task_with_snapshot(task)
742
+
214
743
  if not validate_task_content(task.content, task.id):
215
744
  task.state = TaskState.FAILED
216
745
  task.result = "Task failed: Invalid or empty content provided"
@@ -222,18 +751,235 @@ class Workforce(BaseNode):
222
751
 
223
752
  self.reset()
224
753
  self._task = task
754
+ if self.metrics_logger:
755
+ self.metrics_logger.log_task_created(
756
+ task_id=task.id,
757
+ description=task.content,
758
+ task_type=task.type,
759
+ metadata=task.additional_info,
760
+ )
225
761
  task.state = TaskState.FAILED
226
- self._pending_tasks.append(task)
227
762
  # The agent tend to be overconfident on the whole task, so we
228
763
  # decompose the task into subtasks first
229
764
  subtasks = self._decompose_task(task)
765
+ if self.metrics_logger and subtasks:
766
+ self.metrics_logger.log_task_decomposed(
767
+ parent_task_id=task.id, subtask_ids=[st.id for st in subtasks]
768
+ )
769
+ for subtask in subtasks:
770
+ self.metrics_logger.log_task_created(
771
+ task_id=subtask.id,
772
+ description=subtask.content,
773
+ parent_task_id=task.id,
774
+ task_type=subtask.type,
775
+ metadata=subtask.additional_info,
776
+ )
777
+ if subtasks:
778
+ # If decomposition happened, the original task becomes a container.
779
+ # We only execute its subtasks.
780
+ self._pending_tasks.extendleft(reversed(subtasks))
781
+ else:
782
+ # If no decomposition, execute the original task.
783
+ self._pending_tasks.append(task)
784
+
785
+ self.set_channel(TaskChannel())
786
+
787
+ await self.start()
788
+
789
+ if subtasks:
790
+ task.result = "\n\n".join(
791
+ f"--- Subtask {sub.id} Result ---\n{sub.result}"
792
+ for sub in task.subtasks
793
+ if sub.result
794
+ )
795
+ if task.subtasks and all(
796
+ sub.state == TaskState.DONE for sub in task.subtasks
797
+ ):
798
+ task.state = TaskState.DONE
799
+ else:
800
+ task.state = TaskState.FAILED
801
+
802
+ return task
803
+
804
def process_task(self, task: Task) -> Task:
    r"""Synchronous wrapper around :meth:`process_task_async`.

    When no event loop is running in the calling thread, the coroutine is
    run directly. When called from inside a running event loop, it is run
    on a fresh event loop in a helper thread and this call blocks until
    it finishes.

    Args:
        task (Task): The task to be processed.

    Returns:
        Task: The updated task.

    Example:
        >>> workforce = Workforce("My Team")
        >>> task = Task(content="Analyze data", id="1")
        >>> # No async/await needed
        >>> result = workforce.process_task(task)
        >>> print(result.result)
    """
    import asyncio
    import concurrent.futures

    # Keep the try body to the loop probe only. Wrapping the execution as
    # well would treat a RuntimeError raised by the task itself as "no
    # running loop" and call asyncio.run() inside a running loop, which
    # hides the real error.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop running in this thread, we can create one.
        return asyncio.run(self.process_task_async(task))

    def run_in_thread():
        # asyncio.run creates a fresh loop for the helper thread and
        # closes it again when the coroutine finishes.
        return asyncio.run(self.process_task_async(task))

    # A loop is already running here, so run the coroutine on its own
    # loop in a helper thread and wait for the result.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        return executor.submit(run_in_thread).result()
847
+
848
async def _process_task_with_snapshot(self, task: Task) -> Task:
    r"""Process a task with human-intervention support.

    The task is validated, the workforce is reset and marked RUNNING, the
    task is decomposed, an initial snapshot is saved and the workforce is
    started. Afterwards the state is STOPPED if execution raised an
    ``Exception``; otherwise it is set to IDLE unless it is already
    STOPPED.

    Args:
        task (Task): The task to be processed.

    Returns:
        Task: The updated task. A task with invalid content is returned
            in the FAILED state without being processed.

    Raises:
        Exception: Any error raised while the workforce runs is logged
            and re-raised.
    """
    content_ok = validate_task_content(task.content, task.id)
    if not content_ok:
        task.state = TaskState.FAILED
        task.result = "Task failed: Invalid or empty content provided"
        logger.warning(
            f"Task {task.id} rejected: Invalid or empty content. "
            f"Content preview: '{task.content[:50]}...'"
        )
        return task

    self.reset()
    self._task = task
    self._state = WorkforceState.RUNNING
    task.state = TaskState.OPEN
    self._pending_tasks.append(task)

    # Subtasks are queued in front of the original task, keeping their
    # own order.
    pieces = self._decompose_task(task)
    self._pending_tasks.extendleft(reversed(pieces))
    self.set_channel(TaskChannel())

    # Save initial snapshot
    self.save_snapshot("Initial task decomposition")

    try:
        await self.start()
    except Exception as e:
        logger.error(f"Error in workforce execution: {e}")
        self._state = WorkforceState.STOPPED
        raise
    finally:
        # Leave STOPPED untouched; every other outcome ends in IDLE.
        if self._state != WorkforceState.STOPPED:
            self._state = WorkforceState.IDLE

    return task
236
893
 
894
def _process_task_with_intervention(self, task: Task) -> Task:
    r"""Process a task with human-intervention support on an event loop
    owned by this workforce.

    The current event loop is reused when one is available and open;
    otherwise a new loop is created and installed. After processing, the
    loop is kept open if the workforce is PAUSED and closed otherwise.

    Args:
        task (Task): The task to be processed.

    Returns:
        Task: The updated task.
    """

    def _install_fresh_loop() -> None:
        # Create a new loop and make it the current one.
        self._loop = asyncio.new_event_loop()
        asyncio.set_event_loop(self._loop)

    try:
        self._loop = asyncio.get_event_loop()
        if self._loop.is_closed():
            _install_fresh_loop()
    except RuntimeError:
        _install_fresh_loop()

    try:
        return self._loop.run_until_complete(
            self._process_task_with_snapshot(task)
        )
    finally:
        loop_open = bool(self._loop) and not self._loop.is_closed()
        if loop_open and self._state == WorkforceState.PAUSED:
            # Keep alive to support resume()
            logger.info(
                "Event loop kept alive for potential resume "
                "operations."
            )
        elif loop_open:
            # Not paused: shut everything down cleanly.
            try:
                # Ensure all async generators are finished
                self._loop.run_until_complete(
                    self._loop.shutdown_asyncgens()
                )
            except RuntimeError:
                # Loop already running elsewhere
                pass
            self._loop.close()
938
+
939
def continue_from_pause(self) -> Optional[Task]:
    r"""Continue execution from a paused state on the existing event loop.

    Returns:
        Optional[Task]: The value produced by the continued execution, or
            None if the workforce is not paused, has no open event loop,
            or continuing raised an error (in which case the state is set
            to STOPPED).
    """
    if self._state != WorkforceState.PAUSED:
        logger.warning("Workforce is not in paused state.")
        return None

    no_usable_loop = self._loop is None or self._loop.is_closed()
    if no_usable_loop:
        logger.error("No active event loop available for resuming.")
        return None

    # Leave the paused state before driving the loop again.
    self.resume()

    try:
        return self._loop.run_until_complete(self._continue_execution())
    except Exception as e:
        logger.error(f"Error continuing execution: {e}")
        self._state = WorkforceState.STOPPED
        return None
968
+
969
async def _continue_execution(self) -> Optional[Task]:
    r"""Listen to the channel again after a pause.

    Returns:
        Optional[Task]: The main task of the workforce.

    Raises:
        Exception: Any error raised while listening is logged, the state
            is set to STOPPED and the error is re-raised.
    """
    try:
        await self._listen_to_channel()
    except Exception as e:
        logger.error(f"Error in continued execution: {e}")
        self._state = WorkforceState.STOPPED
        raise
    finally:
        # Leave STOPPED untouched; every other outcome ends in IDLE.
        already_stopped = self._state == WorkforceState.STOPPED
        if not already_stopped:
            self._state = WorkforceState.IDLE

    return self._task
982
+
237
983
  @check_if_running(False)
238
984
  def add_single_agent_worker(
239
985
  self, description: str, worker: ChatAgent
@@ -249,6 +995,12 @@ class Workforce(BaseNode):
249
995
  """
250
996
  worker_node = SingleAgentWorker(description, worker)
251
997
  self._children.append(worker_node)
998
+ if self.metrics_logger:
999
+ self.metrics_logger.log_worker_created(
1000
+ worker_id=worker_node.node_id,
1001
+ worker_type='SingleAgentWorker',
1002
+ role=worker_node.description,
1003
+ )
252
1004
  return self
253
1005
 
254
1006
  @check_if_running(False)
@@ -293,6 +1045,12 @@ class Workforce(BaseNode):
293
1045
  chat_turn_limit=chat_turn_limit,
294
1046
  )
295
1047
  self._children.append(worker_node)
1048
+ if self.metrics_logger:
1049
+ self.metrics_logger.log_worker_created(
1050
+ worker_id=worker_node.node_id,
1051
+ worker_type='RolePlayingWorker',
1052
+ role=worker_node.description,
1053
+ )
296
1054
  return self
297
1055
 
298
1056
  @check_if_running(False)
@@ -308,19 +1066,50 @@ class Workforce(BaseNode):
308
1066
  self._children.append(workforce)
309
1067
  return self
310
1068
 
1069
async def _async_reset(self) -> None:
    r"""Coroutine part of :meth:`reset`: sets the pause event so that it
    can be scheduled on the workforce's event loop.
    """
    self._pause_event.set()
1072
+
311
1073
  @check_if_running(False)
312
1074
  def reset(self) -> None:
313
1075
  r"""Reset the workforce and all the child nodes under it. Can only
314
- be called when the workforce is not running."""
1076
+ be called when the workforce is not running.
1077
+ """
315
1078
  super().reset()
316
1079
  self._task = None
317
1080
  self._pending_tasks.clear()
318
1081
  self._child_listening_tasks.clear()
1082
+ # Clear dependency tracking
1083
+ self._task_dependencies.clear()
1084
+ self._completed_tasks = []
1085
+ self._assignees.clear()
1086
+ self._in_flight_tasks = 0
319
1087
  self.coordinator_agent.reset()
320
1088
  self.task_agent.reset()
1089
+ self._task_start_times.clear()
321
1090
  for child in self._children:
322
1091
  child.reset()
323
1092
 
1093
+ # Reset intervention state
1094
+ self._state = WorkforceState.IDLE
1095
+ self._stop_requested = False
1096
+ # Handle asyncio.Event in a thread-safe way
1097
+ if self._loop and not self._loop.is_closed():
1098
+ # If we have a loop, use it to set the event safely
1099
+ asyncio.run_coroutine_threadsafe(
1100
+ self._async_reset(), self._loop
1101
+ ).result()
1102
+ else:
1103
+ try:
1104
+ self._reset_task = asyncio.create_task(self._async_reset())
1105
+ except RuntimeError:
1106
+ asyncio.run(self._async_reset())
1107
+
1108
+ if hasattr(self, 'logger') and self.metrics_logger is not None:
1109
+ self.metrics_logger.reset_task_data()
1110
+ else:
1111
+ self.metrics_logger = WorkforceLogger(workforce_id=self.node_id)
1112
+
324
1113
  @check_if_running(False)
325
1114
  def set_channel(self, channel: TaskChannel) -> None:
326
1115
  r"""Set the channel for the node and all the child nodes under it."""
@@ -350,21 +1139,36 @@ class Workforce(BaseNode):
350
1139
 
351
1140
  def _find_assignee(
352
1141
  self,
353
- task: Task,
354
- ) -> str:
355
- r"""Assigns a task to a worker node with the best capability.
1142
+ tasks: List[Task],
1143
+ ) -> TaskAssignResult:
1144
+ r"""Assigns multiple tasks to worker nodes with the best capabilities.
356
1145
 
357
1146
  Parameters:
358
- task (Task): The task to be assigned.
1147
+ tasks (List[Task]): The tasks to be assigned.
359
1148
 
360
1149
  Returns:
361
- str: ID of the worker node to be assigned.
1150
+ TaskAssignResult: Assignment result containing task assignments
1151
+ with their dependencies.
362
1152
  """
363
1153
  self.coordinator_agent.reset()
1154
+
1155
+ # Format tasks information for the prompt
1156
+ tasks_info = ""
1157
+ for task in tasks:
1158
+ tasks_info += f"Task ID: {task.id}\n"
1159
+ tasks_info += f"Content: {task.content}\n"
1160
+ if task.additional_info:
1161
+ tasks_info += f"Additional Info: {task.additional_info}\n"
1162
+ tasks_info += "---\n"
1163
+
364
1164
  prompt = ASSIGN_TASK_PROMPT.format(
365
- content=task.content,
1165
+ tasks_info=tasks_info,
366
1166
  child_nodes_info=self._get_child_nodes_info(),
367
- additional_info=task.additional_info,
1167
+ )
1168
+
1169
+ logger.debug(
1170
+ f"Sending batch assignment request to coordinator "
1171
+ f"for {len(tasks)} tasks."
368
1172
  )
369
1173
 
370
1174
  response = self.coordinator_agent.step(
@@ -372,9 +1176,17 @@ class Workforce(BaseNode):
372
1176
  )
373
1177
  result_dict = json.loads(response.msg.content, parse_int=str)
374
1178
  task_assign_result = TaskAssignResult(**result_dict)
375
- return task_assign_result.assignee_id
1179
+ return task_assign_result
376
1180
 
377
1181
  async def _post_task(self, task: Task, assignee_id: str) -> None:
1182
+ # Record the start time when a task is posted
1183
+ self._task_start_times[task.id] = time.time()
1184
+
1185
+ if self.metrics_logger:
1186
+ self.metrics_logger.log_task_started(
1187
+ task_id=task.id, worker_id=assignee_id
1188
+ )
1189
+ self._in_flight_tasks += 1
378
1190
  await self._channel.post_task(task, self.node_id, assignee_id)
379
1191
 
380
1192
  async def _post_dependency(self, dependency: Task) -> None:
@@ -416,6 +1228,13 @@ class Workforce(BaseNode):
416
1228
  print(f"{Fore.CYAN}{new_node} created.{Fore.RESET}")
417
1229
 
418
1230
  self._children.append(new_node)
1231
+ if self.metrics_logger:
1232
+ self.metrics_logger.log_worker_created(
1233
+ worker_id=new_node.node_id,
1234
+ worker_type='SingleAgentWorker',
1235
+ role=new_node_conf.role,
1236
+ metadata={'description': new_node_conf.description},
1237
+ )
419
1238
  self._child_listening_tasks.append(
420
1239
  asyncio.create_task(new_node.start())
421
1240
  )
@@ -447,62 +1266,277 @@ class Workforce(BaseNode):
447
1266
 
448
1267
  async def _get_returned_task(self) -> Task:
449
1268
  r"""Get the task that's published by this node and just get returned
450
- from the assignee.
1269
+ from the assignee. Includes timeout handling to prevent indefinite
1270
+ waiting.
451
1271
  """
452
- return await self._channel.get_returned_task_by_publisher(self.node_id)
1272
+ try:
1273
+ # Add timeout to prevent indefinite waiting
1274
+ return await asyncio.wait_for(
1275
+ self._channel.get_returned_task_by_publisher(self.node_id),
1276
+ timeout=300.0, # 5 minute timeout
1277
+ )
1278
+ except asyncio.TimeoutError:
1279
+ logger.warning(
1280
+ f"Timeout waiting for returned task in "
1281
+ f"workforce {self.node_id}"
1282
+ )
1283
+ raise ValueError("Timeout waiting for task to be returned")
453
1284
 
454
1285
  async def _post_ready_tasks(self) -> None:
455
- r"""Send all the pending tasks that have all the dependencies met to
456
- the channel, or directly return if there is none. For now, we will
457
- directly send the first task in the pending list because all the tasks
458
- are linearly dependent."""
1286
+ r"""Checks for unassigned tasks, assigns them, and then posts any
1287
+ tasks whose dependencies have been met."""
1288
+
1289
+ # Step 1: Identify and assign any new tasks in the pending queue
1290
+ tasks_to_assign = [
1291
+ task
1292
+ for task in self._pending_tasks
1293
+ if task.id not in self._task_dependencies
1294
+ ]
1295
+ if tasks_to_assign:
1296
+ logger.debug(
1297
+ f"Found {len(tasks_to_assign)} new tasks. "
1298
+ f"Requesting assignment..."
1299
+ )
1300
+ batch_result = self._find_assignee(tasks_to_assign)
1301
+ logger.debug(
1302
+ f"Coordinator returned assignments:\n"
1303
+ f"{json.dumps(batch_result.dict(), indent=2)}"
1304
+ )
1305
+ for assignment in batch_result.assignments:
1306
+ self._task_dependencies[assignment.task_id] = (
1307
+ assignment.dependencies
1308
+ )
1309
+ self._assignees[assignment.task_id] = assignment.assignee_id
1310
+ if self.metrics_logger:
1311
+ # queue_time_seconds can be derived by logger if task
1312
+ # creation time is logged
1313
+ self.metrics_logger.log_task_assigned(
1314
+ task_id=assignment.task_id,
1315
+ worker_id=assignment.assignee_id,
1316
+ dependencies=assignment.dependencies,
1317
+ queue_time_seconds=None,
1318
+ )
1319
+
1320
+ # Step 2: Iterate through all pending tasks and post those that are
1321
+ # ready
1322
+ posted_tasks = []
1323
+ for task in self._pending_tasks:
1324
+ # A task must be assigned to be considered for posting
1325
+ if task.id in self._task_dependencies:
1326
+ dependencies = self._task_dependencies[task.id]
1327
+ # Check if all dependencies for this task are in the completed
1328
+ # set
1329
+ if all(
1330
+ dep_id in {t.id for t in self._completed_tasks}
1331
+ for dep_id in dependencies
1332
+ ):
1333
+ assignee_id = self._assignees[task.id]
1334
+ logger.debug(
1335
+ f"Posting task {task.id} to assignee {assignee_id}. "
1336
+ f"Dependencies met."
1337
+ )
1338
+ await self._post_task(task, assignee_id)
1339
+ posted_tasks.append(task)
1340
+
1341
+ # Step 3: Remove the posted tasks from the pending list
1342
+ for task in posted_tasks:
1343
+ try:
1344
+ self._pending_tasks.remove(task)
1345
+ except ValueError:
1346
+ # Task might have been removed by another process, which is
1347
+ # fine
1348
+ pass
459
1349
 
460
- if not self._pending_tasks:
461
- return
1350
async def _handle_failed_task(self, task: Task) -> bool:
    r"""Handle a task that was returned in the failed state.

    The failure count is incremented and the failure is reported to the
    metrics logger. Once the count reaches 3 nothing else is done. Otherwise
    a task with depth of at least 3 is reposted to a newly created worker,
    while a shallower task is decomposed into subtasks that are placed at
    the head of the pending queue. The failed task is then recorded as
    completed for dependency tracking and ready tasks are posted.

    Args:
        task (Task): The failed task.

    Returns:
        bool: True if the task has now failed 3 times or more (the caller
            uses this as the halt signal), False otherwise.
    """
    task.failure_count += 1

    metrics = self.metrics_logger
    if metrics:
        metrics.log_task_failed(
            task_id=task.id,
            worker_id=self._assignees.get(task.id),
            error_message=task.result or "Task execution failed",
            error_type="TaskFailure",
            metadata={'failure_count': task.failure_count},
        )

    if task.failure_count >= 3:
        return True

    if task.get_depth() >= 3:
        # Create a new worker node and reassign
        assignee = self._create_worker_node_for_task(task)
        new_worker_id = assignee.node_id

        # Sync shared memory after creating new worker to provide context
        if self.share_memory:
            logger.info(
                f"Syncing shared memory after creating new worker "
                f"{assignee.node_id} for failed task {task.id}"
            )
            self._sync_shared_memory()

        await self._post_task(task, new_worker_id)
        action_taken = f"reassigned to new worker {assignee.node_id}"
    else:
        subtasks = self._decompose_task(task)
        if self.metrics_logger and subtasks:
            self.metrics_logger.log_task_decomposed(
                parent_task_id=task.id,
                subtask_ids=[st.id for st in subtasks],
            )
            for subtask in subtasks:
                self.metrics_logger.log_task_created(
                    task_id=subtask.id,
                    description=subtask.content,
                    parent_task_id=task.id,
                    task_type=subtask.type,
                    metadata=subtask.additional_info,
                )
        # Insert packets at the head of the queue
        self._pending_tasks.extendleft(reversed(subtasks))

        # Sync shared memory after task decomposition
        if self.share_memory:
            logger.info(
                f"Syncing shared memory after decomposing failed "
                f"task {task.id}"
            )
            self._sync_shared_memory()

        await self._post_ready_tasks()
        action_taken = f"decomposed into {len(subtasks)} subtasks"

    # NOTE(review): this runs for both branches, so a task that was just
    # reposted to a new worker is archived as well - confirm intended.
    if task.id in self._assignees:
        await self._channel.archive_task(task.id)

    logger.debug(
        f"Task {task.id} failed and was {action_taken}. "
        f"Updating dependency state."
    )
    # Mark task as completed for dependency tracking
    # NOTE(review): dependents become eligible for posting from here on,
    # even though the reassigned task / its subtasks have not finished.
    self._completed_tasks.append(task)

    # Sync shared memory after task completion to share knowledge
    if self.share_memory:
        logger.info(
            f"Syncing shared memory after task {task.id} completion"
        )
        self._sync_shared_memory()

    # Check if any pending tasks are now ready to execute
    await self._post_ready_tasks()
    return False
501
1430
 
502
1431
  async def _handle_completed_task(self, task: Task) -> None:
503
- # archive the packet, making it into a dependency
504
- self._pending_tasks.popleft()
505
- await self._channel.archive_task(task.id)
1432
+ if self.metrics_logger:
1433
+ worker_id = self._assignees.get(task.id, "unknown")
1434
+ processing_time_seconds = None
1435
+ token_usage = None
1436
+
1437
+ # Get processing time from task start time or additional info
1438
+ if task.id in self._task_start_times:
1439
+ processing_time_seconds = (
1440
+ time.time() - self._task_start_times[task.id]
1441
+ )
1442
+ del self._task_start_times[task.id] # Prevent memory leaks
1443
+ elif (
1444
+ task.additional_info is not None
1445
+ and 'processing_time_seconds' in task.additional_info
1446
+ ):
1447
+ processing_time_seconds = task.additional_info[
1448
+ 'processing_time_seconds'
1449
+ ]
1450
+
1451
+ # Get token usage from task additional info
1452
+ if (
1453
+ task.additional_info is not None
1454
+ and 'token_usage' in task.additional_info
1455
+ ):
1456
+ token_usage = task.additional_info['token_usage']
1457
+
1458
+ # Try to get token usage from SingleAgentWorker memory if available
1459
+ assignee_node = next(
1460
+ (
1461
+ child
1462
+ for child in self._children
1463
+ if child.node_id == worker_id
1464
+ ),
1465
+ None,
1466
+ )
1467
+ if isinstance(assignee_node, SingleAgentWorker):
1468
+ _, total_tokens = assignee_node.worker.memory.get_context()
1469
+ token_usage = {'total_tokens': total_tokens}
1470
+
1471
+ # Log the completed task
1472
+ self.metrics_logger.log_task_completed(
1473
+ task_id=task.id,
1474
+ worker_id=worker_id,
1475
+ result_summary=task.result if task.result else "Completed",
1476
+ processing_time_seconds=processing_time_seconds,
1477
+ token_usage=token_usage,
1478
+ metadata={'current_state': task.state.value},
1479
+ )
1480
+
1481
+ # Find and remove the completed task from pending tasks
1482
+ tasks_list = list(self._pending_tasks)
1483
+ found_and_removed = False
1484
+
1485
+ for i, pending_task in enumerate(tasks_list):
1486
+ if pending_task.id == task.id:
1487
+ # Remove this specific task
1488
+ tasks_list.pop(i)
1489
+ self._pending_tasks = deque(tasks_list)
1490
+ found_and_removed = True
1491
+ print(
1492
+ f"{Fore.GREEN}✅ Task {task.id} completed and removed "
1493
+ f"from queue.{Fore.RESET}"
1494
+ )
1495
+ break
1496
+
1497
+ if not found_and_removed:
1498
+ # Task was already removed from pending queue (expected case when
1499
+ # it had been popped immediately after posting). No need to
1500
+ # draw user attention with a warning; record at debug level.
1501
+ logger.debug(
1502
+ f"Completed task {task.id} was already removed from pending "
1503
+ "queue."
1504
+ )
1505
+
1506
+ # Archive the task and update dependency tracking
1507
+ if task.id in self._assignees:
1508
+ await self._channel.archive_task(task.id)
1509
+
1510
+ # Ensure it's in completed tasks set
1511
+ self._completed_tasks.append(task)
1512
+
1513
+ # Handle parent task completion logic
1514
+ parent = task.parent
1515
+ if parent and parent.id not in {t.id for t in self._completed_tasks}:
1516
+ all_subtasks_done = all(
1517
+ sub.id in {t.id for t in self._completed_tasks}
1518
+ for sub in parent.subtasks
1519
+ )
1520
+ if all_subtasks_done:
1521
+ # Set the parent task state to done
1522
+ parent.state = TaskState.DONE
1523
+ logger.debug(
1524
+ f"All subtasks of {parent.id} are done. "
1525
+ f"Marking parent as complete."
1526
+ )
1527
+ # Treat the parent task as a completed task to unblock
1528
+ # its dependents. Since it was never sent to a worker,
1529
+ # we call this method recursively.
1530
+ await self._handle_completed_task(parent)
1531
+
1532
+ # Sync shared memory after task completion to share knowledge
1533
+ if self.share_memory:
1534
+ logger.info(
1535
+ f"Syncing shared memory after task {task.id} completion"
1536
+ )
1537
+ self._sync_shared_memory()
1538
+
1539
+ # Check if any pending tasks are now ready to execute
506
1540
  await self._post_ready_tasks()
507
1541
 
508
1542
  async def _graceful_shutdown(self, failed_task: Task) -> None:
@@ -521,50 +1555,157 @@ class Workforce(BaseNode):
521
1555
  f"seconds due to failure. You can use this time to inspect the "
522
1556
  f"current state of the workforce."
523
1557
  )
524
-
525
1558
  # Wait for the full timeout period
526
1559
  await asyncio.sleep(self.graceful_shutdown_timeout)
527
1560
 
1561
def get_workforce_log_tree(self) -> str:
    r"""Return the metrics logger's ASCII tree representation.

    Returns:
        str: The tree produced by the metrics logger, or the message
            "Logger not initialized." when no metrics logger is set.
    """
    metrics = self.metrics_logger
    if metrics:
        return metrics.get_ascii_tree_representation()
    return "Logger not initialized."
1568
+
1569
def get_workforce_kpis(self) -> Dict[str, Any]:
    r"""Return the key performance indicators collected by the metrics
    logger.

    Returns:
        Dict[str, Any]: The KPIs reported by the metrics logger, or a
            dictionary with a single "error" entry when no metrics logger
            is set.
    """
    metrics = self.metrics_logger
    if metrics:
        return metrics.get_kpis()
    return {"error": "Logger not initialized."}
1574
+
1575
def dump_workforce_logs(self, file_path: str) -> None:
    r"""Write all collected logs to a JSON file through the metrics
    logger.

    When no metrics logger is set, a message is printed and nothing is
    written.

    Args:
        file_path (str): The path to the JSON file.
    """
    metrics = self.metrics_logger
    if not metrics:
        print("Logger not initialized. Cannot dump logs.")
        return

    metrics.dump_to_json(file_path)
    # Use logger.info or print, consistent with existing style
    logger.info(f"Workforce logs dumped to {file_path}")
1587
+
528
1588
  @check_if_running(False)
529
1589
  async def _listen_to_channel(self) -> None:
530
1590
  r"""Continuously listen to the channel, post task to the channel and
531
- track the status of posted tasks.
1591
+ track the status of posted tasks. Now supports pause/resume and
1592
+ graceful stop.
532
1593
  """
533
1594
 
534
1595
  self._running = True
1596
+ self._state = WorkforceState.RUNNING
535
1597
  logger.info(f"Workforce {self.node_id} started.")
536
1598
 
537
1599
  await self._post_ready_tasks()
538
1600
 
539
- while self._task is None or self._pending_tasks:
540
- returned_task = await self._get_returned_task()
541
- if returned_task.state == TaskState.DONE:
542
- await self._handle_completed_task(returned_task)
543
- elif returned_task.state == TaskState.FAILED:
544
- halt = await self._handle_failed_task(returned_task)
545
- if not halt:
546
- continue
547
- print(
548
- f"{Fore.RED}Task {returned_task.id} has failed "
549
- f"for 3 times, halting the workforce.{Fore.RESET}"
550
- )
551
- # Graceful shutdown instead of immediate break
552
- await self._graceful_shutdown(returned_task)
553
- break
554
- elif returned_task.state == TaskState.OPEN:
555
- # TODO: multi-layer workforce
556
- pass
557
- else:
558
- raise ValueError(
559
- f"Task {returned_task.id} has an unexpected state."
560
- )
1601
+ while (
1602
+ self._task is None
1603
+ or self._pending_tasks
1604
+ or self._in_flight_tasks > 0
1605
+ ) and not self._stop_requested:
1606
+ try:
1607
+ # Check for pause request at the beginning of each loop
1608
+ # iteration
1609
+ await self._pause_event.wait()
1610
+
1611
+ # Check for stop request after potential pause
1612
+ if self._stop_requested:
1613
+ logger.info("Stop requested, breaking execution loop.")
1614
+ break
1615
+
1616
+ # Save snapshot before processing next task
1617
+ if self._pending_tasks:
1618
+ current_task = self._pending_tasks[0]
1619
+ # Throttled snapshot
1620
+ if (
1621
+ time.time() - self._last_snapshot_time
1622
+ >= self.snapshot_interval
1623
+ ):
1624
+ self.save_snapshot(
1625
+ f"Before processing task: {current_task.id}"
1626
+ )
1627
+ self._last_snapshot_time = time.time()
1628
+
1629
+ # Get returned task (this may block until a task is returned)
1630
+ returned_task = await self._get_returned_task()
1631
+ self._in_flight_tasks -= 1
1632
+
1633
+ # Check for stop request after getting task
1634
+ if self._stop_requested:
1635
+ logger.info("Stop requested after receiving task.")
1636
+ break
1637
+
1638
+ # Process the returned task based on its state
1639
+ if returned_task.state == TaskState.DONE:
1640
+ print(
1641
+ f"{Fore.CYAN}🎯 Task {returned_task.id} completed "
1642
+ f"successfully.{Fore.RESET}"
1643
+ )
1644
+ await self._handle_completed_task(returned_task)
1645
+ elif returned_task.state == TaskState.FAILED:
1646
+ halt = await self._handle_failed_task(returned_task)
1647
+ if not halt:
1648
+ continue
1649
+ print(
1650
+ f"{Fore.RED}Task {returned_task.id} has failed "
1651
+ f"for 3 times, halting the workforce.{Fore.RESET}"
1652
+ )
1653
+ # Graceful shutdown instead of immediate break
1654
+ await self._graceful_shutdown(returned_task)
1655
+ break
1656
+ elif returned_task.state == TaskState.OPEN:
1657
+ # TODO: multi-layer workforce
1658
+ pass
1659
+ else:
1660
+ raise ValueError(
1661
+ f"Task {returned_task.id} has an unexpected state."
1662
+ )
1663
+
1664
+ except Exception as e:
1665
+ logger.error(f"Error processing task: {e}")
1666
+ if self._stop_requested:
1667
+ break
1668
+ # Continue with next iteration unless stop is requested
1669
+ continue
1670
+
1671
+ # Handle final state
1672
+ if self._stop_requested:
1673
+ self._state = WorkforceState.STOPPED
1674
+ logger.info("Workforce stopped by user request.")
1675
+ elif not self._pending_tasks and self._in_flight_tasks == 0:
1676
+ self._state = WorkforceState.IDLE
1677
+ logger.info("All tasks completed.")
561
1678
 
562
1679
  # shut down the whole workforce tree
563
1680
  self.stop()
564
1681
 
1682
def _submit_coro_to_loop(self, coro: 'Coroutine') -> None:
    r"""Thread-safe submission of coroutine to the workforce loop.

    When called from the workforce loop itself the coroutine is scheduled
    as a task on it; otherwise it is handed over with
    `asyncio.run_coroutine_threadsafe`. If the workforce has no open event
    loop, a warning is logged and the coroutine is closed without running.

    Args:
        coro (Coroutine): The coroutine object to run on the workforce
            loop.
    """

    loop = self._loop
    if loop is None or loop.is_closed():
        logger.warning("Cannot submit coroutine - no active event loop")
        # Close the coroutine explicitly so that dropping it does not emit
        # a "coroutine ... was never awaited" RuntimeWarning.
        coro.close()
        return

    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in the calling thread.
        running_loop = None

    if running_loop is loop:
        loop.create_task(coro)
    else:
        asyncio.run_coroutine_threadsafe(coro, loop)
1698
+
565
1699
  @check_if_running(False)
566
1700
  async def start(self) -> None:
567
1701
  r"""Start itself and all the child nodes under it."""
1702
+ # Sync shared memory at the start to ensure all agents have context
1703
+ if self.share_memory:
1704
+ logger.info(
1705
+ f"Syncing shared memory at workforce {self.node_id} startup"
1706
+ )
1707
+ self._sync_shared_memory()
1708
+
568
1709
  for child in self._children:
569
1710
  child_listening_task = asyncio.create_task(child.start())
570
1711
  self._child_listening_tasks.append(child_listening_task)
@@ -576,7 +1717,8 @@ class Workforce(BaseNode):
576
1717
  by its parent node.
577
1718
  """
578
1719
  for child in self._children:
579
- child.stop()
1720
+ if child._running:
1721
+ child.stop()
580
1722
  for child_task in self._child_listening_tasks:
581
1723
  child_task.cancel()
582
1724
  self._running = False
@@ -596,12 +1738,21 @@ class Workforce(BaseNode):
596
1738
  """
597
1739
 
598
1740
  # Create a new instance with the same configuration
1741
+ # Extract the original kwargs from the agents to properly clone them
1742
+ coordinator_kwargs = (
1743
+ getattr(self.coordinator_agent, 'init_kwargs', {}) or {}
1744
+ )
1745
+ task_kwargs = getattr(self.task_agent, 'init_kwargs', {}) or {}
1746
+
599
1747
  new_instance = Workforce(
600
1748
  description=self.description,
601
- coordinator_agent_kwargs={},
602
- task_agent_kwargs={},
603
- new_worker_agent_kwargs=self.new_worker_agent_kwargs,
1749
+ coordinator_agent_kwargs=coordinator_kwargs.copy(),
1750
+ task_agent_kwargs=task_kwargs.copy(),
1751
+ new_worker_agent_kwargs=self.new_worker_agent_kwargs.copy()
1752
+ if self.new_worker_agent_kwargs
1753
+ else None,
604
1754
  graceful_shutdown_timeout=self.graceful_shutdown_timeout,
1755
+ share_memory=self.share_memory,
605
1756
  )
606
1757
 
607
1758
  new_instance.task_agent = self.task_agent.clone(with_memory)
@@ -620,10 +1771,10 @@ class Workforce(BaseNode):
620
1771
  child.description,
621
1772
  child.assistant_role_name,
622
1773
  child.user_role_name,
1774
+ child.chat_turn_limit,
623
1775
  child.assistant_agent_kwargs,
624
1776
  child.user_agent_kwargs,
625
1777
  child.summarize_agent_kwargs,
626
- child.chat_turn_limit,
627
1778
  )
628
1779
  elif isinstance(child, Workforce):
629
1780
  new_instance.add_workforce(child.clone(with_memory))
@@ -682,7 +1833,9 @@ class Workforce(BaseNode):
682
1833
  workforce_instance = self
683
1834
 
684
1835
  # Define functions first
685
- def process_task(task_content, task_id=None, additional_info=None):
1836
+ async def process_task(
1837
+ task_content, task_id=None, additional_info=None
1838
+ ):
686
1839
  r"""Process a task using the workforce.
687
1840
 
688
1841
  Args:
@@ -704,7 +1857,8 @@ class Workforce(BaseNode):
704
1857
  - message (str): Error message if status is "error"
705
1858
 
706
1859
  Example:
707
- >>> result = process_task("Analyze market trends", "task_001")
1860
+ >>> result = await process_task("Analyze market trends",
1861
+ "task_001")
708
1862
  >>> print(result["status"]) # "success" or "error"
709
1863
  """
710
1864
  task = Task(
@@ -714,7 +1868,7 @@ class Workforce(BaseNode):
714
1868
  )
715
1869
 
716
1870
  try:
717
- result_task = workforce_instance.process_task(task)
1871
+ result_task = await workforce_instance.process_task(task)
718
1872
  return {
719
1873
  "status": "success",
720
1874
  "task_id": result_task.id,
@@ -834,9 +1988,9 @@ class Workforce(BaseNode):
834
1988
  >>> for child in children:
835
1989
  ... print(f"{child['type']}: {child['description']}")
836
1990
  """
837
- children_info = []
1991
+ children_info: List[Dict[str, Any]] = []
838
1992
  for child in workforce_instance._children:
839
- child_info = {
1993
+ child_info: Dict[str, Any] = {
840
1994
  "node_id": child.node_id,
841
1995
  "description": child.description,
842
1996
  "type": type(child).__name__,