camel-ai 0.2.73a4__py3-none-any.whl → 0.2.80a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. camel/__init__.py +1 -1
  2. camel/agents/_utils.py +38 -0
  3. camel/agents/chat_agent.py +2217 -519
  4. camel/agents/mcp_agent.py +30 -27
  5. camel/configs/__init__.py +15 -0
  6. camel/configs/aihubmix_config.py +88 -0
  7. camel/configs/amd_config.py +70 -0
  8. camel/configs/cometapi_config.py +104 -0
  9. camel/configs/minimax_config.py +93 -0
  10. camel/configs/nebius_config.py +103 -0
  11. camel/data_collectors/alpaca_collector.py +15 -6
  12. camel/datasets/base_generator.py +39 -10
  13. camel/environments/single_step.py +28 -3
  14. camel/environments/tic_tac_toe.py +1 -1
  15. camel/interpreters/__init__.py +2 -0
  16. camel/interpreters/docker/Dockerfile +3 -12
  17. camel/interpreters/e2b_interpreter.py +34 -1
  18. camel/interpreters/microsandbox_interpreter.py +395 -0
  19. camel/loaders/__init__.py +11 -2
  20. camel/loaders/chunkr_reader.py +9 -0
  21. camel/memories/agent_memories.py +48 -4
  22. camel/memories/base.py +26 -0
  23. camel/memories/blocks/chat_history_block.py +122 -4
  24. camel/memories/context_creators/score_based.py +25 -384
  25. camel/memories/records.py +88 -8
  26. camel/messages/base.py +153 -34
  27. camel/models/__init__.py +10 -0
  28. camel/models/aihubmix_model.py +83 -0
  29. camel/models/aiml_model.py +1 -16
  30. camel/models/amd_model.py +101 -0
  31. camel/models/anthropic_model.py +6 -19
  32. camel/models/aws_bedrock_model.py +2 -33
  33. camel/models/azure_openai_model.py +114 -89
  34. camel/models/base_audio_model.py +3 -1
  35. camel/models/base_model.py +32 -14
  36. camel/models/cohere_model.py +1 -16
  37. camel/models/cometapi_model.py +83 -0
  38. camel/models/crynux_model.py +1 -16
  39. camel/models/deepseek_model.py +1 -16
  40. camel/models/fish_audio_model.py +6 -0
  41. camel/models/gemini_model.py +36 -18
  42. camel/models/groq_model.py +1 -17
  43. camel/models/internlm_model.py +1 -16
  44. camel/models/litellm_model.py +1 -16
  45. camel/models/lmstudio_model.py +1 -17
  46. camel/models/minimax_model.py +83 -0
  47. camel/models/mistral_model.py +1 -16
  48. camel/models/model_factory.py +27 -1
  49. camel/models/modelscope_model.py +1 -16
  50. camel/models/moonshot_model.py +105 -24
  51. camel/models/nebius_model.py +83 -0
  52. camel/models/nemotron_model.py +0 -5
  53. camel/models/netmind_model.py +1 -16
  54. camel/models/novita_model.py +1 -16
  55. camel/models/nvidia_model.py +1 -16
  56. camel/models/ollama_model.py +4 -19
  57. camel/models/openai_compatible_model.py +62 -41
  58. camel/models/openai_model.py +62 -57
  59. camel/models/openrouter_model.py +1 -17
  60. camel/models/ppio_model.py +1 -16
  61. camel/models/qianfan_model.py +1 -16
  62. camel/models/qwen_model.py +1 -16
  63. camel/models/reka_model.py +1 -16
  64. camel/models/samba_model.py +34 -47
  65. camel/models/sglang_model.py +64 -31
  66. camel/models/siliconflow_model.py +1 -16
  67. camel/models/stub_model.py +0 -4
  68. camel/models/togetherai_model.py +1 -16
  69. camel/models/vllm_model.py +1 -16
  70. camel/models/volcano_model.py +0 -17
  71. camel/models/watsonx_model.py +1 -16
  72. camel/models/yi_model.py +1 -16
  73. camel/models/zhipuai_model.py +60 -16
  74. camel/parsers/__init__.py +18 -0
  75. camel/parsers/mcp_tool_call_parser.py +176 -0
  76. camel/retrievers/auto_retriever.py +1 -0
  77. camel/runtimes/daytona_runtime.py +11 -12
  78. camel/societies/__init__.py +2 -0
  79. camel/societies/workforce/__init__.py +2 -0
  80. camel/societies/workforce/events.py +122 -0
  81. camel/societies/workforce/prompts.py +146 -66
  82. camel/societies/workforce/role_playing_worker.py +15 -11
  83. camel/societies/workforce/single_agent_worker.py +302 -65
  84. camel/societies/workforce/structured_output_handler.py +30 -18
  85. camel/societies/workforce/task_channel.py +163 -27
  86. camel/societies/workforce/utils.py +107 -13
  87. camel/societies/workforce/workflow_memory_manager.py +772 -0
  88. camel/societies/workforce/workforce.py +1949 -579
  89. camel/societies/workforce/workforce_callback.py +74 -0
  90. camel/societies/workforce/workforce_logger.py +168 -145
  91. camel/societies/workforce/workforce_metrics.py +33 -0
  92. camel/storages/key_value_storages/json.py +15 -2
  93. camel/storages/key_value_storages/mem0_cloud.py +48 -47
  94. camel/storages/object_storages/google_cloud.py +1 -1
  95. camel/storages/vectordb_storages/oceanbase.py +13 -13
  96. camel/storages/vectordb_storages/qdrant.py +3 -3
  97. camel/storages/vectordb_storages/tidb.py +8 -6
  98. camel/tasks/task.py +4 -3
  99. camel/toolkits/__init__.py +20 -7
  100. camel/toolkits/aci_toolkit.py +45 -0
  101. camel/toolkits/base.py +6 -4
  102. camel/toolkits/code_execution.py +28 -1
  103. camel/toolkits/context_summarizer_toolkit.py +684 -0
  104. camel/toolkits/dappier_toolkit.py +5 -1
  105. camel/toolkits/dingtalk.py +1135 -0
  106. camel/toolkits/edgeone_pages_mcp_toolkit.py +11 -31
  107. camel/toolkits/excel_toolkit.py +1 -1
  108. camel/toolkits/{file_write_toolkit.py → file_toolkit.py} +430 -36
  109. camel/toolkits/function_tool.py +13 -3
  110. camel/toolkits/github_toolkit.py +104 -17
  111. camel/toolkits/gmail_toolkit.py +1839 -0
  112. camel/toolkits/google_calendar_toolkit.py +38 -4
  113. camel/toolkits/google_drive_mcp_toolkit.py +12 -31
  114. camel/toolkits/hybrid_browser_toolkit/config_loader.py +15 -0
  115. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit.py +77 -8
  116. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit_ts.py +884 -88
  117. camel/toolkits/hybrid_browser_toolkit/installer.py +203 -0
  118. camel/toolkits/hybrid_browser_toolkit/ts/package-lock.json +5 -612
  119. camel/toolkits/hybrid_browser_toolkit/ts/package.json +0 -1
  120. camel/toolkits/hybrid_browser_toolkit/ts/src/browser-session.ts +959 -89
  121. camel/toolkits/hybrid_browser_toolkit/ts/src/config-loader.ts +9 -2
  122. camel/toolkits/hybrid_browser_toolkit/ts/src/hybrid-browser-toolkit.ts +281 -213
  123. camel/toolkits/hybrid_browser_toolkit/ts/src/parent-child-filter.ts +226 -0
  124. camel/toolkits/hybrid_browser_toolkit/ts/src/snapshot-parser.ts +219 -0
  125. camel/toolkits/hybrid_browser_toolkit/ts/src/som-screenshot-injected.ts +543 -0
  126. camel/toolkits/hybrid_browser_toolkit/ts/src/types.ts +23 -3
  127. camel/toolkits/hybrid_browser_toolkit/ts/websocket-server.js +72 -7
  128. camel/toolkits/hybrid_browser_toolkit/ws_wrapper.py +582 -132
  129. camel/toolkits/hybrid_browser_toolkit_py/actions.py +158 -0
  130. camel/toolkits/hybrid_browser_toolkit_py/browser_session.py +55 -8
  131. camel/toolkits/hybrid_browser_toolkit_py/config_loader.py +43 -0
  132. camel/toolkits/hybrid_browser_toolkit_py/hybrid_browser_toolkit.py +321 -8
  133. camel/toolkits/hybrid_browser_toolkit_py/snapshot.py +10 -4
  134. camel/toolkits/hybrid_browser_toolkit_py/unified_analyzer.js +45 -4
  135. camel/toolkits/{openai_image_toolkit.py → image_generation_toolkit.py} +151 -53
  136. camel/toolkits/klavis_toolkit.py +5 -1
  137. camel/toolkits/markitdown_toolkit.py +27 -1
  138. camel/toolkits/math_toolkit.py +64 -10
  139. camel/toolkits/mcp_toolkit.py +366 -71
  140. camel/toolkits/memory_toolkit.py +5 -1
  141. camel/toolkits/message_integration.py +18 -13
  142. camel/toolkits/minimax_mcp_toolkit.py +195 -0
  143. camel/toolkits/note_taking_toolkit.py +19 -10
  144. camel/toolkits/notion_mcp_toolkit.py +16 -26
  145. camel/toolkits/openbb_toolkit.py +5 -1
  146. camel/toolkits/origene_mcp_toolkit.py +8 -49
  147. camel/toolkits/playwright_mcp_toolkit.py +12 -31
  148. camel/toolkits/resend_toolkit.py +168 -0
  149. camel/toolkits/search_toolkit.py +264 -91
  150. camel/toolkits/slack_toolkit.py +64 -10
  151. camel/toolkits/terminal_toolkit/__init__.py +18 -0
  152. camel/toolkits/terminal_toolkit/terminal_toolkit.py +957 -0
  153. camel/toolkits/terminal_toolkit/utils.py +532 -0
  154. camel/toolkits/vertex_ai_veo_toolkit.py +590 -0
  155. camel/toolkits/video_analysis_toolkit.py +17 -11
  156. camel/toolkits/wechat_official_toolkit.py +483 -0
  157. camel/toolkits/zapier_toolkit.py +5 -1
  158. camel/types/__init__.py +2 -2
  159. camel/types/enums.py +274 -7
  160. camel/types/openai_types.py +2 -2
  161. camel/types/unified_model_type.py +15 -0
  162. camel/utils/commons.py +36 -5
  163. camel/utils/constants.py +3 -0
  164. camel/utils/context_utils.py +1003 -0
  165. camel/utils/mcp.py +138 -4
  166. camel/utils/token_counting.py +43 -20
  167. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/METADATA +223 -83
  168. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/RECORD +170 -141
  169. camel/loaders/pandas_reader.py +0 -368
  170. camel/toolkits/openai_agent_toolkit.py +0 -135
  171. camel/toolkits/terminal_toolkit.py +0 -1550
  172. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/WHEEL +0 -0
  173. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/licenses/LICENSE +0 -0
@@ -16,12 +16,15 @@ from __future__ import annotations
 import asyncio
 import concurrent.futures
 import json
+import os
 import time
 import uuid
 from collections import deque
 from enum import Enum
 from typing import (
+    TYPE_CHECKING,
     Any,
+    Callable,
     Coroutine,
     Deque,
     Dict,
@@ -31,8 +34,15 @@ from typing import (
     Set,
     Tuple,
     Union,
+    cast,
 )
 
+from .workforce_callback import WorkforceCallback
+from .workforce_metrics import WorkforceMetrics
+
+if TYPE_CHECKING:
+    from camel.utils.context_utils import ContextUtility
+
 from colorama import Fore
 
 from camel.agents import ChatAgent
@@ -43,19 +53,23 @@ from camel.societies.workforce.base import BaseNode
 from camel.societies.workforce.prompts import (
     ASSIGN_TASK_PROMPT,
     CREATE_NODE_PROMPT,
-    FAILURE_ANALYSIS_PROMPT,
+    FAILURE_ANALYSIS_RESPONSE_FORMAT,
+    QUALITY_EVALUATION_RESPONSE_FORMAT,
+    TASK_AGENT_SYSTEM_MESSAGE,
+    TASK_ANALYSIS_PROMPT,
     TASK_DECOMPOSE_PROMPT,
 )
 from camel.societies.workforce.role_playing_worker import RolePlayingWorker
-from camel.societies.workforce.single_agent_worker import SingleAgentWorker
+from camel.societies.workforce.single_agent_worker import (
+    SingleAgentWorker,
+)
 from camel.societies.workforce.structured_output_handler import (
     StructuredOutputHandler,
 )
 from camel.societies.workforce.task_channel import TaskChannel
 from camel.societies.workforce.utils import (
-    FailureContext,
-    RecoveryDecision,
     RecoveryStrategy,
+    TaskAnalysisResult,
     TaskAssignment,
     TaskAssignResult,
     WorkerConf,
@@ -70,21 +84,39 @@ from camel.tasks.task import (
 )
 from camel.toolkits import (
     CodeExecutionToolkit,
+    FunctionTool,
     SearchToolkit,
-    TaskPlanningToolkit,
     ThinkingToolkit,
 )
 from camel.types import ModelPlatformType, ModelType
 from camel.utils import dependencies_required
 
+from .events import (
+    AllTasksCompletedEvent,
+    TaskAssignedEvent,
+    TaskCompletedEvent,
+    TaskCreatedEvent,
+    TaskDecomposedEvent,
+    TaskFailedEvent,
+    TaskStartedEvent,
+    WorkerCreatedEvent,
+)
 from .workforce_logger import WorkforceLogger
 
-logger = get_logger(__name__)
+if os.environ.get("TRACEROOT_ENABLED", "False").lower() == "true":
+    try:
+        import traceroot  # type: ignore[import]
+
+        logger = traceroot.get_logger('camel')
+    except ImportError:
+        logger = get_logger(__name__)
+else:
+    logger = get_logger(__name__)
 
 # Constants for configuration values
 MAX_TASK_RETRIES = 3
 MAX_PENDING_TASKS_LIMIT = 20
-TASK_TIMEOUT_SECONDS = 180.0
+TASK_TIMEOUT_SECONDS = 600.0
 DEFAULT_WORKER_POOL_SIZE = 10
 
 
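Note: logger selection now happens at import time. A minimal sketch of opting into the optional traceroot logger; the environment variable and fallback behavior come from the hunk above, while the top-level import path is an assumption:

```python
# Must be set before camel.societies.workforce.workforce is first imported;
# if the optional `traceroot` package is missing, the module falls back to
# camel's get_logger, so the toggle is safe to set unconditionally.
import os

os.environ["TRACEROOT_ENABLED"] = "true"

from camel.societies.workforce import Workforce  # noqa: E402
```

The default task timeout also rises here from 180 s to 600 s (TASK_TIMEOUT_SECONDS), which pairs with the new per-instance `task_timeout_seconds` parameter introduced later in this diff.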
@@ -151,9 +183,9 @@ class Workforce(BaseNode):
         task_agent (Optional[ChatAgent], optional): A custom task planning
             agent instance for task decomposition and composition. If
             provided, the workforce will create a new agent using this agent's
-            model configuration but with the required system message and tools
-            (TaskPlanningToolkit). If None, a default agent will be created
-            using DEFAULT model settings. (default: :obj:`None`)
+            model configuration but with the required system message. If None,
+            a default agent will be created using DEFAULT model settings.
+            (default: :obj:`None`)
         new_worker_agent (Optional[ChatAgent], optional): A template agent for
             workers created dynamically at runtime when existing workers cannot
             handle failed tasks. If None, workers will be created with default
@@ -163,6 +195,11 @@ class Workforce(BaseNode):
             for graceful shutdown when a task fails 3 times. During this
             period, the workforce remains active for debugging.
             Set to 0 for immediate shutdown. (default: :obj:`15.0`)
+        task_timeout_seconds (Optional[float], optional): The timeout in
+            seconds for waiting for tasks to be returned by workers. If None,
+            uses the global TASK_TIMEOUT_SECONDS value (600.0 seconds).
+            Increase this value for tasks that require more processing time.
+            (default: :obj:`None`)
         share_memory (bool, optional): Whether to enable shared memory across
             SingleAgentWorker instances in the workforce. When enabled, all
             SingleAgentWorker instances, coordinator agent, and task planning
@@ -180,6 +217,17 @@ class Workforce(BaseNode):
             support native structured output. When disabled, the workforce
             uses the native response_format parameter.
             (default: :obj:`True`)
+        callbacks (Optional[List[WorkforceCallback]], optional): A list of
+            callback handlers to observe and record workforce lifecycle events
+            and metrics (e.g., task creation/assignment/start/completion/
+            failure, worker creation/deletion, all-tasks-completed). All
+            items must be instances of :class:`WorkforceCallback`, otherwise
+            a :class:`ValueError` is raised. If none of the provided
+            callbacks implement :class:`WorkforceMetrics`, a built-in
+            :class:`WorkforceLogger` (implements both callback and metrics)
+            is added automatically. If at least one provided callback
+            implements :class:`WorkforceMetrics`, no default logger is added.
+            (default: :obj:`None`)
 
     Example:
         >>> import asyncio
@@ -231,6 +279,8 @@ class Workforce(BaseNode):
         graceful_shutdown_timeout: float = 15.0,
         share_memory: bool = False,
         use_structured_output_handler: bool = True,
+        task_timeout_seconds: Optional[float] = None,
+        callbacks: Optional[List[WorkforceCallback]] = None,
     ) -> None:
         super().__init__(description)
         self._child_listening_tasks: Deque[
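The two new constructor parameters can be exercised as below; a minimal sketch, assuming default model settings are available via environment configuration:

```python
from camel.societies.workforce import Workforce

# task_timeout_seconds overrides the module-level 600 s default for this
# instance only; callbacks is validated to contain WorkforceCallback
# instances and raises ValueError otherwise.
workforce = Workforce(
    "Research team",
    task_timeout_seconds=1200.0,  # allow long-running worker tasks
)
```

Because no callback implementing WorkforceMetrics is supplied here, a built-in WorkforceLogger is attached automatically (see `_initialize_callbacks` later in this diff).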
@@ -241,9 +291,11 @@ class Workforce(BaseNode):
         self.graceful_shutdown_timeout = graceful_shutdown_timeout
         self.share_memory = share_memory
         self.use_structured_output_handler = use_structured_output_handler
+        self.task_timeout_seconds = (
+            task_timeout_seconds or TASK_TIMEOUT_SECONDS
+        )
         if self.use_structured_output_handler:
             self.structured_handler = StructuredOutputHandler()
-        self.metrics_logger = WorkforceLogger(workforce_id=self.node_id)
         self._task: Optional[Task] = None
         self._pending_tasks: Deque[Task] = deque()
         self._task_dependencies: Dict[str, List[str]] = {}
@@ -256,6 +308,7 @@ class Workforce(BaseNode):
         self._pause_event = asyncio.Event()
         self._pause_event.set()  # Initially not paused
         self._stop_requested = False
+        self._skip_requested = False
         self._snapshots: List[WorkforceSnapshot] = []
         self._completed_tasks: List[Task] = []
         self._loop: Optional[asyncio.AbstractEventLoop] = None
@@ -265,15 +318,9 @@ class Workforce(BaseNode):
         self._last_snapshot_time: float = 0.0
         # Minimum seconds between automatic snapshots
         self.snapshot_interval: float = 30.0
-        if self.metrics_logger:
-            for child in self._children:
-                worker_type = type(child).__name__
-                role_or_desc = child.description
-                self.metrics_logger.log_worker_created(
-                    worker_id=child.node_id,
-                    worker_type=worker_type,
-                    role=role_or_desc,
-                )
+        # Shared memory UUID tracking to prevent re-sharing duplicates
+        self._shared_memory_uuids: Set[str] = set()
+        self._initialize_callbacks(callbacks)
 
         # Set up coordinator agent with default system message
         coord_agent_sys_msg = BaseMessage.make_assistant_message(
@@ -302,8 +349,7 @@ class Workforce(BaseNode):
         if coordinator_agent.system_message is not None:
             user_sys_msg_content = coordinator_agent.system_message.content
             combined_content = (
-                f"{user_sys_msg_content}\n\n"
-                f"{coord_agent_sys_msg.content}"
+                f"{user_sys_msg_content}\n\n{coord_agent_sys_msg.content}"
             )
             combined_sys_msg = BaseMessage.make_assistant_message(
                 role_name=coordinator_agent.system_message.role_name,
@@ -327,10 +373,7 @@ class Workforce(BaseNode):
                     None,
                 ),
                 output_language=coordinator_agent.output_language,
-                tools=[
-                    tool.func
-                    for tool in coordinator_agent._internal_tools.values()
-                ],
+                tools=list(coordinator_agent._internal_tools.values()),
                 external_tools=[
                     schema
                     for schema in coordinator_agent._external_tool_schemas.values()  # noqa: E501
@@ -340,28 +383,20 @@ class Workforce(BaseNode):
                 stop_event=coordinator_agent.stop_event,
             )
 
-        # Set up task agent with default system message and required tools
+        # Set up task agent with default system message
         task_sys_msg = BaseMessage.make_assistant_message(
             role_name="Task Planner",
-            content="You are going to compose and decompose tasks. Keep "
-            "tasks that are sequential and require the same type of "
-            "agent together in one agent process. Only decompose tasks "
-            "that can be handled in parallel and require different types "
-            "of agents. This ensures efficient execution by minimizing "
-            "context switching between agents.",
+            content=TASK_AGENT_SYSTEM_MESSAGE,
         )
-        task_planning_tools = TaskPlanningToolkit().get_tools()
 
         if task_agent is None:
             logger.warning(
                 "No task_agent provided. Using default ChatAgent "
                 "settings (ModelPlatformType.DEFAULT, ModelType.DEFAULT) "
-                "with default system message and TaskPlanningToolkit."
+                "with default system message."
             )
-            task_tools = TaskPlanningToolkit().get_tools()
             self.task_agent = ChatAgent(
                 task_sys_msg,
-                tools=task_tools,  # type: ignore[arg-type]
             )
         else:
             logger.info(
@@ -373,8 +408,7 @@ class Workforce(BaseNode):
             if task_agent.system_message is not None:
                 user_task_sys_msg_content = task_agent.system_message.content
                 combined_task_content = (
-                    f"{user_task_sys_msg_content}\n\n"
-                    f"{task_sys_msg.content}"
+                    f"{user_task_sys_msg_content}\n\n{task_sys_msg.content}"
                 )
                 combined_task_sys_msg = BaseMessage.make_assistant_message(
                     role_name=task_agent.system_message.role_name,
@@ -385,9 +419,10 @@ class Workforce(BaseNode):
 
             # Since ChatAgent constructor uses a dictionary with
             # function names as keys, we don't need to manually deduplicate.
-            combined_tools = [
-                tool.func for tool in task_agent._internal_tools.values()
-            ] + [tool.func for tool in task_planning_tools]
+            combined_tools: List[Union[FunctionTool, Callable]] = cast(
+                List[Union[FunctionTool, Callable]],
+                list(task_agent._internal_tools.values()),
+            )
 
             # Create a new agent with the provided agent's configuration
             # but with the combined system message and tools
@@ -434,10 +469,85 @@ class Workforce(BaseNode):
                 "better context continuity during task handoffs."
             )
 
+        # Shared context utility for workflow management (created lazily)
+        self._shared_context_utility: Optional["ContextUtility"] = None
+
     # ------------------------------------------------------------------
     # Helper for propagating pause control to externally supplied agents
     # ------------------------------------------------------------------
 
+    def _initialize_callbacks(
+        self, callbacks: Optional[List[WorkforceCallback]]
+    ) -> None:
+        r"""Validate, register, and prime workforce callbacks."""
+        self._callbacks: List[WorkforceCallback] = []
+
+        if callbacks:
+            for cb in callbacks:
+                if isinstance(cb, WorkforceCallback):
+                    self._callbacks.append(cb)
+                else:
+                    raise ValueError(
+                        "All callbacks must be instances of WorkforceCallback"
+                    )
+
+        has_metrics_callback = any(
+            isinstance(cb, WorkforceMetrics) for cb in self._callbacks
+        )
+
+        if not has_metrics_callback:
+            self._callbacks.append(WorkforceLogger(workforce_id=self.node_id))
+        else:
+            logger.info(
+                "WorkforceMetrics implementation detected. Skipping default "
+                "WorkforceLogger addition."
+            )
+
+        for child in self._children:
+            self._notify_worker_created(child)
+
+    def _notify_worker_created(
+        self,
+        worker_node: BaseNode,
+        *,
+        worker_type: Optional[str] = None,
+        role: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        r"""Emit a worker-created event to all registered callbacks."""
+        event = WorkerCreatedEvent(
+            worker_id=worker_node.node_id,
+            worker_type=worker_type or type(worker_node).__name__,
+            role=role or worker_node.description,
+            metadata=metadata,
+        )
+        for cb in self._callbacks:
+            cb.log_worker_created(event)
+
+    def _get_or_create_shared_context_utility(
+        self,
+        session_id: Optional[str] = None,
+    ) -> "ContextUtility":
+        r"""Get or create the shared context utility for workflow management.
+
+        This method creates the context utility only when needed, avoiding
+        unnecessary session folder creation during initialization.
+
+        Args:
+            session_id (Optional[str]): Custom session ID to use. If None,
+                auto-generates a timestamped session ID. (default: :obj:`None`)
+
+        Returns:
+            ContextUtility: The shared context utility instance.
+        """
+        if self._shared_context_utility is None:
+            from camel.utils.context_utils import ContextUtility
+
+            self._shared_context_utility = ContextUtility.get_workforce_shared(
+                session_id=session_id
+            )
+        return self._shared_context_utility
+
     def _validate_agent_compatibility(
         self, agent: ChatAgent, agent_context: str = "agent"
     ) -> None:
@@ -474,6 +584,9 @@ class Workforce(BaseNode):
                 "the Workforce."
             )
 
+    # ------------------------------------------------------------------
+    # Helper for propagating pause control to externally supplied agents
+    # ------------------------------------------------------------------
     def _attach_pause_event_to_agent(self, agent: ChatAgent) -> None:
         r"""Ensure the given ChatAgent shares this workforce's pause_event.
 
@@ -599,14 +712,29 @@ class Workforce(BaseNode):
             )
             return
 
-        # Share with coordinator agent
+        # Filter out already-shared records to prevent re-sharing
+        # This prevents exponential growth of duplicate records
+        new_records = []
         for record in memory_records:
+            record_uuid = str(record.uuid)
+            if record_uuid not in self._shared_memory_uuids:
+                new_records.append(record)
+                self._shared_memory_uuids.add(record_uuid)
+
+        if not new_records:
+            logger.debug(
+                "No new records to share (all were already shared)"
+            )
+            return
+
+        # Share with coordinator agent
+        for record in new_records:
             # Only add records from other agents to avoid duplication
             if record.agent_id != self.coordinator_agent.agent_id:
                 self.coordinator_agent.memory.write_record(record)
 
         # Share with task agent
-        for record in memory_records:
+        for record in new_records:
             if record.agent_id != self.task_agent.agent_id:
                 self.task_agent.memory.write_record(record)
 
@@ -618,12 +746,12 @@ class Workforce(BaseNode):
         ]
 
         for worker in single_agent_workers:
-            for record in memory_records:
+            for record in new_records:
                 if record.agent_id != worker.worker.agent_id:
                     worker.worker.memory.write_record(record)
 
         logger.info(
-            f"Shared {len(memory_records)} memory records across "
+            f"Shared {len(new_records)} new memory records across "
             f"{len(single_agent_workers) + 2} agents in workforce "
             f"{self.node_id}"
         )
@@ -730,10 +858,12 @@ class Workforce(BaseNode):
             Union[List[Task], Generator[List[Task], None, None]]:
                 The subtasks or generator of subtasks.
         """
-        decompose_prompt = TASK_DECOMPOSE_PROMPT.format(
-            content=task.content,
-            child_nodes_info=self._get_child_nodes_info(),
-            additional_info=task.additional_info,
+        decompose_prompt = str(
+            TASK_DECOMPOSE_PROMPT.format(
+                content=task.content,
+                child_nodes_info=self._get_child_nodes_info(),
+                additional_info=task.additional_info,
+            )
         )
         self.task_agent.reset()
         result = task.decompose(self.task_agent, decompose_prompt)
@@ -761,76 +891,126 @@ class Workforce(BaseNode):
         self._update_dependencies_for_decomposition(task, subtasks)
         return subtasks
 
-    def _analyze_failure(
-        self, task: Task, error_message: str
-    ) -> RecoveryDecision:
-        r"""Analyze a task failure and decide on the best recovery strategy.
+    def _analyze_task(
+        self,
+        task: Task,
+        *,
+        for_failure: bool,
+        error_message: Optional[str] = None,
+    ) -> TaskAnalysisResult:
+        r"""Unified task analysis for both failures and quality evaluation.
+
+        This method consolidates the logic for analyzing task failures and
+        evaluating task quality, using the unified TASK_ANALYSIS_PROMPT.
 
         Args:
-            task (Task): The failed task
-            error_message (str): The error message from the failure
+            task (Task): The task to analyze
+            for_failure (bool): True for failure analysis, False for quality
+                evaluation
+            error_message (Optional[str]): Error message, required when
+                for_failure=True
 
         Returns:
-            RecoveryDecision: The decided recovery strategy with reasoning
+            TaskAnalysisResult: Unified analysis result with recovery strategy
+                and optional quality metrics
+
+        Raises:
+            ValueError: If for_failure=True but error_message is None
         """
-        # First, do a quick smart analysis based on error patterns
-        error_msg_lower = error_message.lower()
-        if any(
-            keyword in error_msg_lower
-            for keyword in [
-                'connection',
-                'network',
-                'server disconnected',
-                'timeout',
-                'apiconnectionerror',
+        # Validate required parameters
+        if for_failure and error_message is None:
+            raise ValueError("error_message is required when for_failure=True")
+
+        # Determine task result and issue-specific analysis based on context
+        if for_failure:
+            task_result = "N/A (task failed)"
+            issue_type = "Task Failure"
+            issue_analysis = f"**Error Message:** {error_message}"
+            response_format = FAILURE_ANALYSIS_RESPONSE_FORMAT
+            result_schema = TaskAnalysisResult
+            fallback_values: Dict[str, Any] = {
+                "reasoning": "Defaulting to retry due to parsing error",
+                "recovery_strategy": RecoveryStrategy.RETRY,
+                "modified_task_content": None,
+                "issues": [error_message] if error_message else [],
+            }
+            examples: List[Dict[str, Any]] = [
+                {
+                    "reasoning": "Temporary network error, worth retrying",
+                    "recovery_strategy": "retry",
+                    "modified_task_content": None,
+                    "issues": ["Network timeout"],
+                }
             ]
-        ):
-            return RecoveryDecision(
-                strategy=RecoveryStrategy.RETRY,
-                reasoning="Network/connection error detected, retrying task",
-                modified_task_content=None,
+        else:
+            # Quality evaluation
+            task_result = task.result or "No result available"
+            issue_type = "Quality Evaluation"
+            issue_analysis = (
+                "Provide a quality score (0-100) and list any specific "
+                "issues found."
             )
+            response_format = QUALITY_EVALUATION_RESPONSE_FORMAT
+            result_schema = TaskAnalysisResult
+            fallback_values = {
+                "reasoning": (
+                    "Defaulting to acceptable quality due to parsing error"
+                ),
+                "issues": [],
+                "recovery_strategy": None,
+                "modified_task_content": None,
+                "quality_score": 80,
+            }
+            examples = [
+                {
+                    "reasoning": (
+                        "Excellent implementation with comprehensive tests"
+                    ),
+                    "issues": [],
+                    "recovery_strategy": None,
+                    "modified_task_content": None,
+                    "quality_score": 98,
+                },
+                {
+                    "reasoning": (
+                        "Implementation incomplete with missing features"
+                    ),
+                    "issues": [
+                        "Incomplete implementation",
+                        "Missing error handling",
+                    ],
+                    "recovery_strategy": "replan",
+                    "modified_task_content": (
+                        "Previous attempt was incomplete. "
+                        "Please implement with: 1) Full feature "
+                        "coverage, 2) Proper error handling"
+                    ),
+                    "quality_score": 45,
+                },
+            ]
 
-        # Create failure context
-        failure_context = FailureContext(
-            task_id=task.id,
-            task_content=task.content,
-            failure_count=task.failure_count,
-            error_message=error_message,
-            worker_id=task.assigned_worker_id,
-            task_depth=task.get_depth(),
-            additional_info=str(task.additional_info)
-            if task.additional_info
-            else None,
-        )
-
-        # Format the analysis prompt
-        analysis_prompt = FAILURE_ANALYSIS_PROMPT.format(
-            task_id=failure_context.task_id,
-            task_content=failure_context.task_content,
-            failure_count=failure_context.failure_count,
-            error_message=failure_context.error_message,
-            worker_id=failure_context.worker_id or "unknown",
-            task_depth=failure_context.task_depth,
-            additional_info=failure_context.additional_info or "None",
+        # Format the unified analysis prompt
+        analysis_prompt = str(
+            TASK_ANALYSIS_PROMPT.format(
+                task_id=task.id,
+                task_content=task.content,
+                task_result=task_result,
+                failure_count=task.failure_count,
+                task_depth=task.get_depth(),
+                assigned_worker=task.assigned_worker_id or "unknown",
+                issue_type=issue_type,
+                issue_specific_analysis=issue_analysis,
+                response_format=response_format,
+            )
         )
 
         try:
-            # Check if we should use structured handler
             if self.use_structured_output_handler:
-                # Use structured handler
                 enhanced_prompt = (
                     self.structured_handler.generate_structured_prompt(
                         base_prompt=analysis_prompt,
-                        schema=RecoveryDecision,
-                        examples=[
-                            {
-                                "strategy": "RETRY",
-                                "reasoning": "Temporary network error, "
-                                "worth retrying",
-                                "modified_task_content": None,
-                            }
-                        ],
+                        schema=result_schema,
+                        examples=examples,
                     )
                 )
 
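For reference, the shape of `TaskAnalysisResult` can be inferred from its usages in this diff (`TaskAnalysisResult(**fallback_values)` plus reads of `recovery_strategy`, `modified_task_content`, and `is_quality_evaluation` in `_apply_recovery_strategy` below). The sketch is that inference only, not the shipped definition in `camel/societies/workforce/utils.py`; field types and defaults are assumptions:

```python
from typing import List, Optional

from pydantic import BaseModel

from camel.societies.workforce.utils import RecoveryStrategy


class TaskAnalysisResultSketch(BaseModel):
    # Inferred from the fallback dicts and attribute reads in this diff;
    # not the released class.
    reasoning: str
    issues: List[str] = []
    recovery_strategy: Optional[RecoveryStrategy] = None
    modified_task_content: Optional[str] = None
    quality_score: Optional[int] = None  # set only by quality evaluations

    @property
    def is_quality_evaluation(self) -> bool:
        # Quality evaluations carry a score; failure analyses do not.
        return self.quality_score is not None
```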
@@ -839,43 +1019,224 @@ class Workforce(BaseNode):
 
                 result = self.structured_handler.parse_structured_response(
                     response.msg.content if response.msg else "",
-                    schema=RecoveryDecision,
-                    fallback_values={
-                        "strategy": RecoveryStrategy.RETRY,
-                        "reasoning": "Defaulting to retry due to parsing "
-                        "issues",
-                        "modified_task_content": None,
-                    },
+                    schema=result_schema,
+                    fallback_values=fallback_values,
                 )
-                # Ensure we return a RecoveryDecision instance
-                if isinstance(result, RecoveryDecision):
+
+                if isinstance(result, TaskAnalysisResult):
                     return result
                 elif isinstance(result, dict):
-                    return RecoveryDecision(**result)
+                    return result_schema(**result)
                 else:
-                    return RecoveryDecision(
-                        strategy=RecoveryStrategy.RETRY,
-                        reasoning="Failed to parse recovery decision",
-                        modified_task_content=None,
-                    )
+                    # Fallback based on context
+                    return TaskAnalysisResult(**fallback_values)
             else:
-                # Use existing native structured output code
                 self.task_agent.reset()
                 response = self.task_agent.step(
-                    analysis_prompt, response_format=RecoveryDecision
+                    analysis_prompt, response_format=result_schema
                 )
                 return response.msg.parsed
 
         except Exception as e:
             logger.warning(
-                f"Error during failure analysis: {e}, defaulting to RETRY"
+                f"Error during task analysis "
+                f"({'failure' if for_failure else 'quality'}): {e}, "
+                f"using fallback"
             )
-            return RecoveryDecision(
-                strategy=RecoveryStrategy.RETRY,
-                reasoning=f"Analysis failed due to error: {e!s}, "
-                f"defaulting to retry",
-                modified_task_content=None,
+            return TaskAnalysisResult(**fallback_values)
+
+    async def _apply_recovery_strategy(
+        self,
+        task: Task,
+        recovery_decision: TaskAnalysisResult,
+    ) -> bool:
+        r"""Apply the recovery strategy from a task analysis result.
+
+        This method centralizes the recovery logic for both execution failures
+        and quality-based failures.
+
+        Args:
+            task (Task): The task that needs recovery
+            recovery_decision (TaskAnalysisResult): The analysis result with
+                recovery strategy
+
+        Returns:
+            bool: True if workforce should halt (e.g., decompose needs
+                different handling), False otherwise
+        """
+        strategy = (
+            recovery_decision.recovery_strategy or RecoveryStrategy.RETRY
+        )
+        action_taken = ""
+
+        try:
+            if strategy == RecoveryStrategy.RETRY:
+                # Simply retry the task by reposting it to the same worker
+                # Check both _assignees dict and task.assigned_worker_id
+                assignee_id = (
+                    self._assignees.get(task.id) or task.assigned_worker_id
+                )
+
+                if assignee_id:
+                    # Retry with the same worker - no coordinator call needed
+                    await self._post_task(task, assignee_id)
+                    action_taken = f"retried with same worker {assignee_id}"
+                    logger.info(
+                        f"Task {task.id} retrying with same worker "
+                        f"{assignee_id} (no coordinator call)"
+                    )
+                else:
+                    # No previous assignment exists - find a new assignee
+                    logger.info(
+                        f"Task {task.id} has no previous assignee, "
+                        f"calling coordinator"
+                    )
+                    batch_result = await self._find_assignee([task])
+                    assignment = batch_result.assignments[0]
+                    self._assignees[task.id] = assignment.assignee_id
+                    await self._post_task(task, assignment.assignee_id)
+                    action_taken = (
+                        f"retried with new worker {assignment.assignee_id}"
+                    )
+
+            elif strategy == RecoveryStrategy.REPLAN:
+                # Modify the task content and retry
+                if recovery_decision.modified_task_content:
+                    task.content = recovery_decision.modified_task_content
+                    logger.info(f"Task {task.id} content modified for replan")
+
+                # Repost the modified task
+                if task.id in self._assignees:
+                    assignee_id = self._assignees[task.id]
+                    await self._post_task(task, assignee_id)
+                    action_taken = (
+                        f"replanned and retried with worker {assignee_id}"
+                    )
+                else:
+                    # Find a new assignee for the replanned task
+                    batch_result = await self._find_assignee([task])
+                    assignment = batch_result.assignments[0]
+                    self._assignees[task.id] = assignment.assignee_id
+                    await self._post_task(task, assignment.assignee_id)
+                    action_taken = (
+                        f"replanned and assigned to "
+                        f"worker {assignment.assignee_id}"
+                    )
+
+            elif strategy == RecoveryStrategy.REASSIGN:
+                # Reassign to a different worker
+                old_worker = task.assigned_worker_id
+                logger.info(
+                    f"Task {task.id} will be reassigned from worker "
+                    f"{old_worker}"
+                )
+
+                # Find a different worker
+                batch_result = await self._find_assignee([task])
+                assignment = batch_result.assignments[0]
+                new_worker = assignment.assignee_id
+
+                # If same worker, force find another
+                if new_worker == old_worker and len(self._children) > 1:
+                    logger.info("Same worker selected, finding alternative")
+                    # Try to find different worker by adding note to
+                    # task content
+                    task.content = (
+                        f"{task.content}\n\n"
+                        f"Note: Previous worker {old_worker} had quality "
+                        f"issues. Needs different approach."
+                    )
+                    batch_result = await self._find_assignee([task])
+                    assignment = batch_result.assignments[0]
+                    new_worker = assignment.assignee_id
+
+                self._assignees[task.id] = new_worker
+                await self._post_task(task, new_worker)
+                action_taken = f"reassigned from {old_worker} to {new_worker}"
+                logger.info(
+                    f"Task {task.id} reassigned from {old_worker} to "
+                    f"{new_worker}"
+                )
+
+            elif strategy == RecoveryStrategy.DECOMPOSE:
+                # Decompose the task into subtasks
+                reason = (
+                    "failure"
+                    if not recovery_decision.is_quality_evaluation
+                    else "quality issues"
+                )
+                logger.info(
+                    f"Task {task.id} will be decomposed due to {reason}"
+                )
+                subtasks_result = self._decompose_task(task)
+
+                # Handle both streaming and non-streaming results
+                if isinstance(subtasks_result, Generator):
+                    subtasks = []
+                    for new_tasks in subtasks_result:
+                        subtasks.extend(new_tasks)
+                else:
+                    subtasks = subtasks_result
+
+                if subtasks:
+                    task_decomposed_event = TaskDecomposedEvent(
+                        parent_task_id=task.id,
+                        subtask_ids=[st.id for st in subtasks],
+                    )
+                    for cb in self._callbacks:
+                        cb.log_task_decomposed(task_decomposed_event)
+                    for subtask in subtasks:
+                        task_created_event = TaskCreatedEvent(
+                            task_id=subtask.id,
+                            description=subtask.content,
+                            parent_task_id=task.id,
+                            task_type=subtask.type,
+                            metadata=subtask.additional_info,
+                        )
+                        for cb in self._callbacks:
+                            cb.log_task_created(task_created_event)
+
+                    # Insert subtasks at the head of the queue
+                    self._pending_tasks.extendleft(reversed(subtasks))
+                    await self._post_ready_tasks()
+                    action_taken = f"decomposed into {len(subtasks)} subtasks"
+
+                    logger.info(
+                        f"Task {task.id} decomposed into {len(subtasks)} subtasks"
+                    )
+
+                    # Sync shared memory after task decomposition
+                    if self.share_memory:
+                        logger.info(
+                            f"Syncing shared memory after task {task.id} "
+                            f"decomposition"
+                        )
+                        self._sync_shared_memory()
+
+                # For decompose, we return early with special handling
+                return True
+
+            elif strategy == RecoveryStrategy.CREATE_WORKER:
+                assignee = await self._create_worker_node_for_task(task)
+                await self._post_task(task, assignee.node_id)
+                action_taken = (
+                    f"created new worker {assignee.node_id} and assigned "
+                    f"task {task.id} to it"
+                )
+
+        except Exception as e:
+            logger.error(
+                f"Recovery strategy {strategy} failed for task {task.id}: {e}",
+                exc_info=True,
             )
+            raise
+
+        logger.debug(
+            f"Task {task.id} recovery: {action_taken}. "
+            f"Strategy: {strategy.value}"
+        )
+
+        return False
 
     # Human intervention methods
     async def _async_pause(self) -> None:
@@ -966,6 +1327,39 @@ class Workforce(BaseNode):
             f"(event-loop not yet started)."
         )
 
+    async def _async_skip_gracefully(self) -> None:
+        r"""Async implementation of skip_gracefully to run on the event
+        loop.
+        """
+        self._skip_requested = True
+        if self._pause_event.is_set() is False:
+            self._pause_event.set()  # Resume if paused to process skip
+        logger.info(f"Workforce {self.node_id} skip requested.")
+
+    def skip_gracefully(self) -> None:
+        r"""Request workforce to skip current pending tasks and move to next
+        main task from the queue. If no main tasks exist, acts like
+        stop_gracefully.
+
+        This method clears the current pending subtasks and moves to the next
+        main task in the queue if available. Works both when the internal
+        event-loop is alive and when it has not yet been started.
+        """
+
+        if self._loop and not self._loop.is_closed():
+            self._submit_coro_to_loop(self._async_skip_gracefully())
+        else:
+            # Loop not yet created, set the flag synchronously so later
+            # startup will respect it.
+            self._skip_requested = True
+            # Ensure any pending pause is released so that when the loop does
+            # start it can see the skip request and exit.
+            self._pause_event.set()
+            logger.info(
+                f"Workforce {self.node_id} skip requested "
+                f"(event-loop not yet started)."
+            )
+
     def save_snapshot(self, description: str = "") -> None:
         r"""Save current state as a snapshot."""
         snapshot = WorkforceSnapshot(
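A sketch of the new intervention control, assuming default model credentials are configured and using the `add_main_task` helper introduced in the next hunk (`stop_gracefully` is referenced by the docstring above and pre-exists this diff):

```python
from camel.societies.workforce import Workforce

workforce = Workforce("Demo")

# Queue two main tasks, then decide mid-run that the first is no longer
# needed: skip_gracefully() drops its pending subtasks and advances to the
# next main task; with nothing left it behaves like stop_gracefully().
# Safe to call before or after the internal event loop starts.
workforce.add_main_task("Draft the quarterly report")
workforce.add_main_task("Summarize competitor announcements")
workforce.skip_gracefully()
```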
@@ -1020,67 +1414,193 @@ class Workforce(BaseNode):
         logger.warning(f"Task {task_id} not found in pending tasks.")
         return False
 
+    def get_main_task_queue(self) -> List[Task]:
+        r"""Get current main task queue for human review.
+        Returns:
+            List[Task]: List of main tasks waiting to be decomposed
+                and executed.
+        """
+        # Return tasks from pending queue that need decomposition
+        return [
+            t
+            for t in self._pending_tasks
+            if t.additional_info
+            and t.additional_info.get('_needs_decomposition')
+        ]
+
     def add_task(
         self,
         content: str,
         task_id: Optional[str] = None,
         additional_info: Optional[Dict[str, Any]] = None,
+        as_subtask: bool = False,
         insert_position: int = -1,
     ) -> Task:
-        r"""Add a new task to the pending queue."""
-        new_task = Task(
-            content=content,
-            id=task_id or f"human_added_{len(self._pending_tasks)}",
-            additional_info=additional_info,
-        )
-        if insert_position == -1:
-            self._pending_tasks.append(new_task)
-        else:
-            # Convert deque to list, insert, then back to deque
-            tasks_list = list(self._pending_tasks)
-            tasks_list.insert(insert_position, new_task)
-            self._pending_tasks = deque(tasks_list)
+        r"""Add a new task to the workforce.
 
-        logger.info(f"New task added: {new_task.id}")
-        return new_task
+        By default, this method adds a main task that will be decomposed into
+        subtasks. Set `as_subtask=True` to add a task directly to the pending
+        subtask queue without decomposition.
 
-    def remove_task(self, task_id: str) -> bool:
-        r"""Remove a task from the pending queue."""
-        # Convert to list to find and remove
-        tasks_list = list(self._pending_tasks)
-        for i, task in enumerate(tasks_list):
-            if task.id == task_id:
-                tasks_list.pop(i)
-                self._pending_tasks = deque(tasks_list)
-                logger.info(f"Task {task_id} removed.")
-                return True
-        logger.warning(f"Task {task_id} not found in pending tasks.")
-        return False
+        Args:
+            content (str): The content of the task.
+            task_id (Optional[str], optional): Optional ID for the task.
+                If not provided, a unique ID will be generated.
+            additional_info (Optional[Dict[str, Any]], optional): Optional
+                additional metadata for the task.
+            as_subtask (bool, optional): If True, adds the task directly to
+                the pending subtask queue. If False, adds as a main task that
+                will be decomposed. Defaults to False.
+            insert_position (int, optional): Position to insert the task in
+                the pending queue. Only applies when as_subtask=True.
+                Defaults to -1 (append to end).
 
-    def reorder_tasks(self, task_ids: List[str]) -> bool:
-        r"""Reorder pending tasks according to the provided task IDs list."""
-        # Create a mapping of task_id to task
-        tasks_dict = {task.id: task for task in self._pending_tasks}
+        Returns:
+            Task: The created task object.
+        """
+        if as_subtask:
+            new_task = Task(
+                content=content,
+                id=task_id or f"human_added_{len(self._pending_tasks)}",
+                additional_info=additional_info,
+            )
 
-        # Check if all provided IDs exist
-        invalid_ids = [
-            task_id for task_id in task_ids if task_id not in tasks_dict
-        ]
-        if invalid_ids:
-            logger.warning(
-                f"Task IDs not found in pending tasks: {invalid_ids}"
+            # Add directly to current pending subtasks
+            if insert_position == -1:
+                self._pending_tasks.append(new_task)
+            else:
+                # Convert deque to list, insert, then back to deque
+                tasks_list = list(self._pending_tasks)
+                tasks_list.insert(insert_position, new_task)
+                self._pending_tasks = deque(tasks_list)
+
+            logger.info(f"New subtask added to pending queue: {new_task.id}")
+            return new_task
+        else:
+            # Add as main task that needs decomposition
+            # Use additional_info to mark this task needs decomposition
+            # Make a copy to avoid modifying user's dict
+            info = additional_info.copy() if additional_info else {}
+            info['_needs_decomposition'] = True
+
+            task_count = sum(
+                1
+                for t in self._pending_tasks
+                if t.additional_info
+                and t.additional_info.get('_needs_decomposition')
             )
-            return False
 
-        # Check if we have the same number of tasks
-        if len(task_ids) != len(self._pending_tasks):
-            logger.warning(
-                "Number of task IDs doesn't match pending tasks count."
+            new_task = Task(
+                content=content,
+                id=task_id or f"main_task_{task_count}",
+                additional_info=info,
             )
-            return False
 
-        # Reorder tasks
-        reordered_tasks = deque([tasks_dict[task_id] for task_id in task_ids])
+            self._pending_tasks.append(new_task)
+            logger.info(f"New main task added to pending queue: {new_task.id}")
+            return new_task
+
+    def add_main_task(
+        self,
+        content: str,
+        task_id: Optional[str] = None,
+        additional_info: Optional[Dict[str, Any]] = None,
+    ) -> Task:
+        r"""Add a new main task that will be decomposed into subtasks.
+
+        This is an alias for :meth:`add_task` with `as_subtask=False`.
+
+        Args:
+            content (str): The content of the main task.
+            task_id (Optional[str], optional): Optional ID for the task.
+            additional_info (Optional[Dict[str, Any]], optional): Optional
+                additional metadata.
+
+        Returns:
+            Task: The created main task object.
+        """
+        return self.add_task(
+            content=content,
+            task_id=task_id,
+            additional_info=additional_info,
+            as_subtask=False,
+        )
+
+    def add_subtask(
+        self,
+        content: str,
+        task_id: Optional[str] = None,
+        additional_info: Optional[Dict[str, Any]] = None,
+        insert_position: int = -1,
+    ) -> Task:
+        r"""Add a new subtask to the current pending queue.
+
+        This is an alias for :meth:`add_task` with `as_subtask=True`.
+
+        Args:
+            content (str): The content of the subtask.
+            task_id (Optional[str], optional): Optional ID for the task.
+            additional_info (Optional[Dict[str, Any]], optional): Optional
+                additional metadata.
+            insert_position (int, optional): Position to insert the task.
+                Defaults to -1 (append to end).
+
+        Returns:
+            Task: The created subtask object.
+        """
+        return self.add_task(
+            content=content,
+            task_id=task_id,
+            additional_info=additional_info,
+            as_subtask=True,
+            insert_position=insert_position,
+        )
+
+    def remove_task(self, task_id: str) -> bool:
+        r"""Remove a task from the pending queue or main task queue.
+
+        Args:
+            task_id (str): The ID of the task to remove.
+
+        Returns:
+            bool: True if task was found and removed, False otherwise.
+        """
+        # Check main task queue first
+        pending_tasks_list = list(self._pending_tasks)
+        for i, task in enumerate(pending_tasks_list):
+            if task.id == task_id:
+                pending_tasks_list.pop(i)
+                self._pending_tasks = deque(pending_tasks_list)
+                logger.info(f"Task {task_id} removed from pending queue.")
+                return True
+
+        logger.warning(f"Task {task_id} not found in any task queue.")
+        return False
+
+    def reorder_tasks(self, task_ids: List[str]) -> bool:
+        r"""Reorder pending tasks according to the provided task IDs list."""
+        # Create a mapping of task_id to task
+        tasks_dict = {task.id: task for task in self._pending_tasks}
+
+        # Check if all provided IDs exist
+        invalid_ids = [
+            task_id for task_id in task_ids if task_id not in tasks_dict
+        ]
+        if invalid_ids:
+            logger.warning(
+                f"Task IDs not found in pending tasks: {invalid_ids}"
+            )
+            return False
+
+        # Check if we have the same number of tasks
+        if len(task_ids) != len(self._pending_tasks):
+            logger.warning(
+                "Number of task IDs doesn't match pending tasks count."
+            )
+            return False
+
+        # Reorder tasks
+        reordered_tasks = deque([tasks_dict[task_id] for task_id in task_ids])
         self._pending_tasks = reordered_tasks
 
         logger.info("Tasks reordered successfully.")
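Usage sketch for the split task-entry API (names and signatures as defined above; model credentials for the default agents are assumed):

```python
from camel.societies.workforce import Workforce

workforce = Workforce("Demo")

# Main tasks are tagged with `_needs_decomposition` and expanded later by
# the task agent; subtasks bypass decomposition and can be positioned.
main = workforce.add_main_task("Build a CLI todo app")
urgent = workforce.add_subtask(
    "Write unit tests for the storage layer",
    insert_position=0,  # ahead of already-queued subtasks
)
print([t.id for t in workforce.get_main_task_queue()])  # ['main_task_0']
```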
@@ -1169,26 +1689,21 @@ class Workforce(BaseNode):
             "main_task_id": self._task.id if self._task else None,
         }
 
-    @check_if_running(False)
-    async def process_task_async(
-        self, task: Task, interactive: bool = False
-    ) -> Task:
-        r"""Main entry point to process a task asynchronously.
+    async def handle_decompose_append_task(
+        self, task: Task, reset: bool = True
+    ) -> List[Task]:
+        r"""Handle task decomposition and validation with
+        workforce environment functions. Then append to
+        pending tasks if decomposition happened.
 
         Args:
             task (Task): The task to be processed.
-            interactive (bool, optional): If True, enables human-intervention
-                workflow (pause/resume/snapshot). Defaults to False, which
-                runs the task in a blocking one-shot manner.
+            reset (Bool): Should trigger workforce reset (Workforce must not
+                be running). Default: True
 
         Returns:
-            Task: The updated task.
+            List[Task]: The decomposed subtasks or the original task.
         """
-        # Delegate to intervention pipeline when requested to keep
-        # backward-compat.
-        if interactive:
-            return await self._process_task_with_snapshot(task)
-
         if not validate_task_content(task.content, task.id):
             task.state = TaskState.FAILED
             task.result = "Task failed: Invalid or empty content provided"
@@ -1196,18 +1711,25 @@ class Workforce(BaseNode):
                 f"Task {task.id} rejected: Invalid or empty content. "
                 f"Content preview: '{task.content}'"
             )
-            return task
+            return [task]
 
-        self.reset()
+        if reset and self._state != WorkforceState.RUNNING:
+            self.reset()
+            logger.info("Workforce reset before handling task.")
+
+        # Focus on the new task
         self._task = task
-        if self.metrics_logger:
-            self.metrics_logger.log_task_created(
-                task_id=task.id,
-                description=task.content,
-                task_type=task.type,
-                metadata=task.additional_info,
-            )
         task.state = TaskState.FAILED
+
+        task_created_event = TaskCreatedEvent(
+            task_id=task.id,
+            description=task.content,
+            task_type=task.type,
+            metadata=task.additional_info,
+        )
+        for cb in self._callbacks:
+            cb.log_task_created(task_created_event)
+
         # The agent tend to be overconfident on the whole task, so we
         # decompose the task into subtasks first
         subtasks_result = self._decompose_task(task)
@@ -1221,26 +1743,57 @@ class Workforce(BaseNode):
         else:
             # This is a regular list (non-streaming mode)
             subtasks = subtasks_result
-        if self.metrics_logger and subtasks:
-            self.metrics_logger.log_task_decomposed(
-                parent_task_id=task.id, subtask_ids=[st.id for st in subtasks]
+        if subtasks:
+            task_decomposed_event = TaskDecomposedEvent(
+                parent_task_id=task.id,
+                subtask_ids=[st.id for st in subtasks],
             )
+            for cb in self._callbacks:
+                cb.log_task_decomposed(task_decomposed_event)
             for subtask in subtasks:
-                self.metrics_logger.log_task_created(
+                task_created_event = TaskCreatedEvent(
                     task_id=subtask.id,
                     description=subtask.content,
                     parent_task_id=task.id,
                     task_type=subtask.type,
                     metadata=subtask.additional_info,
                 )
+                for cb in self._callbacks:
+                    cb.log_task_created(task_created_event)
+
         if subtasks:
-            # If decomposition happened, the original task becomes a container.
-            # We only execute its subtasks.
+            # _pending_tasks will contain both undecomposed
+            # and decomposed tasks, so we use additional_info
+            # to mark the tasks that need decomposition instead
            self._pending_tasks.extendleft(reversed(subtasks))
         else:
             # If no decomposition, execute the original task.
             self._pending_tasks.append(task)
 
+        return subtasks
+
+    @check_if_running(False)
+    async def process_task_async(
+        self, task: Task, interactive: bool = False
+    ) -> Task:
+        r"""Main entry point to process a task asynchronously.
+
+        Args:
+            task (Task): The task to be processed.
+            interactive (bool, optional): If True, enables human-intervention
+                workflow (pause/resume/snapshot). Defaults to False, which
+                runs the task in a blocking one-shot manner.
+
+        Returns:
+            Task: The updated task.
+        """
+        # Delegate to intervention pipeline when requested to keep
+        # backward-compat.
+        if interactive:
+            return await self._process_task_with_snapshot(task)
+
+        subtasks = await self.handle_decompose_append_task(task)
+
         self.set_channel(TaskChannel())
 
         await self.start()
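The refactor lets decomposition be driven separately from execution. A minimal sketch, assuming default model credentials are configured:

```python
import asyncio

from camel.societies.workforce import Workforce
from camel.tasks.task import Task


async def main() -> None:
    workforce = Workforce("Demo")
    task = Task(content="Summarize the attached papers", id="t-0")
    # Decompose and queue without starting execution;
    # process_task_async() now performs exactly this step before
    # opening the task channel and calling start().
    subtasks = await workforce.handle_decompose_append_task(task)
    print(f"{len(subtasks)} subtasks queued")


asyncio.run(main())
```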
@@ -1322,39 +1875,8 @@ class Workforce(BaseNode):
             Task: The updated task.
         """
 
-        if not validate_task_content(task.content, task.id):
-            task.state = TaskState.FAILED
-            task.result = "Task failed: Invalid or empty content provided"
-            logger.warning(
-                f"Task {task.id} rejected: Invalid or empty content. "
-                f"Content preview: '{task.content}'"
-            )
-            return task
-
-        self.reset()
-        self._task = task
-        self._state = WorkforceState.RUNNING
-        task.state = TaskState.FAILED  # TODO: Add logic for OPEN
-
-        # Decompose the task into subtasks first
-        subtasks_result = self._decompose_task(task)
+        await self.handle_decompose_append_task(task)
 
-        # Handle both streaming and non-streaming results
-        if isinstance(subtasks_result, Generator):
-            # This is a generator (streaming mode)
-            subtasks = []
-            for new_tasks in subtasks_result:
-                subtasks.extend(new_tasks)
-        else:
-            # This is a regular list (non-streaming mode)
-            subtasks = subtasks_result
-        if subtasks:
-            # If decomposition happened, the original task becomes a container.
-            # We only execute its subtasks.
-            self._pending_tasks.extendleft(reversed(subtasks))
-        else:
-            # If no decomposition, execute the original task.
-            self._pending_tasks.append(task)
 
         self.set_channel(TaskChannel())
         # Save initial snapshot
@@ -1493,6 +2015,9 @@ class Workforce(BaseNode):
                     start_coroutine, self._loop
                 )
                 self._child_listening_tasks.append(child_task)
+            else:
+                # Close the coroutine to prevent RuntimeWarning
+                start_coroutine.close()
         else:
             # Close the coroutine to prevent RuntimeWarning
             start_coroutine.close()
@@ -1502,6 +2027,7 @@ class Workforce(BaseNode):
         description: str,
         worker: ChatAgent,
         pool_max_size: int = DEFAULT_WORKER_POOL_SIZE,
+        enable_workflow_memory: bool = False,
     ) -> Workforce:
         r"""Add a worker node to the workforce that uses a single agent.
         Can be called when workforce is paused to dynamically add workers.
@@ -1511,6 +2037,9 @@ class Workforce(BaseNode):
             worker (ChatAgent): The agent to be added.
             pool_max_size (int): Maximum size of the agent pool.
                 (default: :obj:`10`)
+            enable_workflow_memory (bool): Whether to enable workflow memory
+                accumulation. Set to True if you plan to call
+                save_workflow_memories(). (default: :obj:`False`)

         Returns:
             Workforce: The workforce node itself.
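A hedged sketch of opting a worker into workflow memory; only `add_single_agent_worker` and the `enable_workflow_memory` flag come from this diff, and the ChatAgent construction is illustrative:

    from camel.agents import ChatAgent
    from camel.societies.workforce import Workforce

    workforce = Workforce("My Team")
    workforce.add_single_agent_worker(
        description="data_analyst",
        worker=ChatAgent("You are a data analyst."),
        # Accumulate workflow context so that save_workflow_memories()
        # has something to persist at the end of the run.
        enable_workflow_memory=True,
    )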
@@ -1537,6 +2066,8 @@ class Workforce(BaseNode):
             worker=worker,
             pool_max_size=pool_max_size,
             use_structured_output_handler=self.use_structured_output_handler,
+            context_utility=None,  # Will be set during save/load operations
+            enable_workflow_memory=enable_workflow_memory,
         )
         self._children.append(worker_node)

@@ -1547,12 +2078,10 @@ class Workforce(BaseNode):
         # If workforce is paused, start the worker's listening task
         self._start_child_node_when_paused(worker_node.start())

-        if self.metrics_logger:
-            self.metrics_logger.log_worker_created(
-                worker_id=worker_node.node_id,
-                worker_type='SingleAgentWorker',
-                role=worker_node.description,
-            )
+        self._notify_worker_created(
+            worker_node,
+            worker_type='SingleAgentWorker',
+        )
         return self

     def add_role_playing_worker(
@@ -1626,12 +2155,10 @@ class Workforce(BaseNode):
         # If workforce is paused, start the worker's listening task
         self._start_child_node_when_paused(worker_node.start())

-        if self.metrics_logger:
-            self.metrics_logger.log_worker_created(
-                worker_id=worker_node.node_id,
-                worker_type='RolePlayingWorker',
-                role=worker_node.description,
-            )
+        self._notify_worker_created(
+            worker_node,
+            worker_type='RolePlayingWorker',
+        )
         return self

     def add_workforce(self, workforce: Workforce) -> Workforce:
@@ -1692,6 +2219,7 @@ class Workforce(BaseNode):
         # Reset intervention state
         self._state = WorkforceState.IDLE
         self._stop_requested = False
+        self._skip_requested = False
         # Handle asyncio.Event in a thread-safe way
         if self._loop and not self._loop.is_closed():
             # If we have a loop, use it to set the event safely
@@ -1707,118 +2235,520 @@ class Workforce(BaseNode):
             # No active loop, directly set the event
             self._pause_event.set()

-        if hasattr(self, 'metrics_logger') and self.metrics_logger is not None:
-            self.metrics_logger.reset_task_data()
-        else:
-            self.metrics_logger = WorkforceLogger(workforce_id=self.node_id)
-
-    @check_if_running(False)
-    def set_channel(self, channel: TaskChannel) -> None:
-        r"""Set the channel for the node and all the child nodes under it."""
-        self._channel = channel
-        for child in self._children:
-            child.set_channel(channel)
+        for cb in self._callbacks:
+            if isinstance(cb, WorkforceMetrics):
+                cb.reset_task_data()

-    def _get_child_nodes_info(self) -> str:
-        r"""Get the information of all the child nodes under this node."""
-        return "".join(
-            f"<{child.node_id}>:<{child.description}>:<{self._get_node_info(child)}>\n"
-            for child in self._children
-        )
-
-    def _get_node_info(self, node) -> str:
-        r"""Get descriptive information for a specific node type."""
-        if isinstance(node, Workforce):
-            return "A Workforce node"
-        elif isinstance(node, SingleAgentWorker):
-            return self._get_single_agent_info(node)
-        elif isinstance(node, RolePlayingWorker):
-            return "A Role playing node"
-        else:
-            return "Unknown node"
+    def save_workflow_memories(
+        self,
+        session_id: Optional[str] = None,
+    ) -> Dict[str, str]:
+        r"""Save workflow memories for all SingleAgentWorker instances in
+        the workforce.
+
+        .. deprecated:: 0.2.80
+            This synchronous method processes workers sequentially, which
+            can be slow for multiple agents. Use
+            :meth:`save_workflow_memories_async` instead for parallel
+            processing and significantly better performance.
+
+        This method iterates through all child workers and triggers
+        workflow saving for SingleAgentWorker instances using their
+        save_workflow_memories() method. Other worker types are skipped.

-    def _get_single_agent_info(self, worker: 'SingleAgentWorker') -> str:
-        r"""Get formatted information for a SingleAgentWorker node."""
-        toolkit_tools = self._group_tools_by_toolkit(worker.worker.tool_dict)
+        Args:
+            session_id (Optional[str]): Custom session ID to use for saving
+                workflows. If None, auto-generates a timestamped session ID.
+                Useful for organizing workflows by project or context.
+                (default: :obj:`None`)

-        if not toolkit_tools:
-            return "no tools available"
+        Returns:
+            Dict[str, str]: Dictionary mapping worker node IDs to save
+                results. Values are either file paths (success) or error
+                messages (failure).

-        toolkit_info = []
-        for toolkit_name, tools in sorted(toolkit_tools.items()):
-            tools_str = ', '.join(sorted(tools))
-            toolkit_info.append(f"{toolkit_name}({tools_str})")
+        Example:
+            >>> workforce = Workforce("My Team")
+            >>> # ... add workers and process tasks ...
+            >>> # save with auto-generated session id
+            >>> results = workforce.save_workflow_memories()
+            >>> print(results)
+            {'worker_123': '/path/to/developer_agent_workflow.md',
+             'worker_456': 'error: No conversation context available'}
+            >>> # save with custom project id
+            >>> results = workforce.save_workflow_memories(
+            ...     session_id="project_123"
+            ... )
+
+        Note:
+            For better performance with multiple workers, use the async
+            version::
+
+                results = await workforce.save_workflow_memories_async()
+
+        See Also:
+            :meth:`save_workflow_memories_async`: Async version with
+                parallel processing for significantly better performance.
+        """
+        import warnings
+
+        warnings.warn(
+            "save_workflow_memories() is slow for multiple workers. "
+            "Consider using save_workflow_memories_async() for parallel "
+            "processing and ~4x faster performance.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        results = {}

-        return " | ".join(toolkit_info)
+        # Get or create shared context utility for this save operation
+        shared_context_utility = self._get_or_create_shared_context_utility(
+            session_id=session_id
+        )

-    def _group_tools_by_toolkit(self, tool_dict: dict) -> dict[str, list[str]]:
-        r"""Group tools by their parent toolkit class names."""
-        toolkit_tools: dict[str, list[str]] = {}
+        for child in self._children:
+            if isinstance(child, SingleAgentWorker):
+                try:
+                    # Set the shared context utility for this operation
+                    child._shared_context_utility = shared_context_utility
+                    child.worker.set_context_utility(shared_context_utility)
+
+                    result = child.save_workflow_memories()
+                    if result.get("status") == "success":
+                        results[child.node_id] = result.get(
+                            "file_path", "unknown_path"
+                        )
+                    else:
+                        # Error: check if there's a separate message field,
+                        # otherwise use the status itself
+                        error_msg = result.get(
+                            "message", result.get("status", "Unknown error")
+                        )
+                        results[child.node_id] = f"error: {error_msg}"

-        for tool_name, tool in tool_dict.items():
-            if hasattr(tool.func, '__self__'):
-                toolkit_name = tool.func.__self__.__class__.__name__
+                except Exception as e:
+                    results[child.node_id] = f"error: {e!s}"
             else:
-                toolkit_name = "Standalone"
-
-            if toolkit_name not in toolkit_tools:
-                toolkit_tools[toolkit_name] = []
-            toolkit_tools[toolkit_name].append(tool_name)
+                # Skip non-SingleAgentWorker types
+                results[child.node_id] = (
+                    f"skipped: {type(child).__name__} not supported"
+                )

-        return toolkit_tools
+        logger.info(f"Workflow save completed for {len(results)} workers")
+        return results

-    def _get_valid_worker_ids(self) -> set:
-        r"""Get all valid worker IDs from child nodes.
+    async def save_workflow_memories_async(
+        self,
+        session_id: Optional[str] = None,
+    ) -> Dict[str, str]:
+        r"""Asynchronously save workflow memories for all SingleAgentWorker
+        instances in the workforce.

-        Returns:
-            set: Set of valid worker IDs that can be assigned tasks.
-        """
-        valid_worker_ids = {child.node_id for child in self._children}
-        return valid_worker_ids
+        This is the async version of save_workflow_memories() that
+        parallelizes LLM summarization calls across all workers using
+        asyncio.gather(), significantly reducing total save time.

-    def _call_coordinator_for_assignment(
-        self, tasks: List[Task], invalid_ids: Optional[List[str]] = None
-    ) -> TaskAssignResult:
-        r"""Call coordinator agent to assign tasks with optional validation
-        feedback in the case of invalid worker IDs.
+        This method iterates through all child workers and triggers
+        workflow saving for SingleAgentWorker instances using their
+        save_workflow_memories_async() method in parallel. Other worker
+        types are skipped.

         Args:
-            tasks (List[Task]): Tasks to assign.
-            invalid_ids (List[str], optional): Invalid worker IDs from previous
-                attempt (if any).
+            session_id (Optional[str]): Custom session ID to use for saving
+                workflows. If None, auto-generates a timestamped session ID.
+                Useful for organizing workflows by project or context.
+                (default: :obj:`None`)

         Returns:
-            TaskAssignResult: Assignment result from coordinator.
+            Dict[str, str]: Dictionary mapping worker node IDs to save
+                results. Values are either file paths (success) or error
+                messages (failure).
+
+        Example:
+            >>> workforce = Workforce("My Team")
+            >>> # ... add workers and process tasks ...
+            >>> # save with parallel summarization (faster)
+            >>> results = await workforce.save_workflow_memories_async()
+            >>> print(results)
+            {'worker_123': '/path/to/developer_agent_workflow.md',
+             'worker_456': '/path/to/search_agent_workflow.md',
+             'worker_789': '/path/to/document_agent_workflow.md'}
         """
-        # format tasks information for the prompt
-        tasks_info = ""
-        for task in tasks:
-            tasks_info += f"Task ID: {task.id}\n"
-            tasks_info += f"Content: {task.content}\n"
-            if task.additional_info:
-                tasks_info += f"Additional Info: {task.additional_info}\n"
-            tasks_info += "---\n"
+        import asyncio

-        prompt = str(
-            ASSIGN_TASK_PROMPT.format(
-                tasks_info=tasks_info,
-                child_nodes_info=self._get_child_nodes_info(),
-            )
+        results = {}
+
+        # Get or create shared context utility for this save operation
+        shared_context_utility = self._get_or_create_shared_context_utility(
+            session_id=session_id
         )

-        # add feedback if this is a retry
-        if invalid_ids:
-            valid_worker_ids = list(self._get_valid_worker_ids())
-            feedback = (
-                f"VALIDATION ERROR: The following worker IDs are invalid: "
-                f"{invalid_ids}. "
-                f"VALID WORKER IDS: {valid_worker_ids}. "
-                f"Please reassign ONLY the above tasks using these valid IDs."
-            )
-            prompt = prompt + f"\n\n{feedback}"
+        # Prepare tasks for parallel execution
+        async def save_single_worker(
+            child: BaseNode,
+        ) -> tuple[str, str]:
+            """Save workflow for a single worker, then return (node_id,
+            result)."""
+            if isinstance(child, SingleAgentWorker):
+                try:
+                    # Set the shared context utility for this operation
+                    child._shared_context_utility = shared_context_utility
+                    child.worker.set_context_utility(shared_context_utility)
+
+                    result = await child.save_workflow_memories_async()
+                    if result.get("status") == "success":
+                        return (
+                            child.node_id,
+                            result.get("file_path", "unknown_path"),
+                        )
+                    else:
+                        # Error: check if there's a separate message field,
+                        # otherwise use the status itself
+                        error_msg = result.get(
+                            "message", result.get("status", "Unknown error")
+                        )
+                        return (child.node_id, f"error: {error_msg}")

-        # Check if we should use structured handler
-        if self.use_structured_output_handler:
+                except Exception as e:
+                    return (child.node_id, f"error: {e!s}")
+            else:
+                # Skip non-SingleAgentWorker types
+                return (
+                    child.node_id,
+                    f"skipped: {type(child).__name__} not supported",
+                )
+
+        # Create tasks for all workers
+        tasks = [save_single_worker(child) for child in self._children]
+
+        # Execute all tasks in parallel using asyncio.gather()
+        parallel_results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        # Process results
+        for result in parallel_results:
+            if isinstance(result, Exception):
+                # Handle any unexpected exceptions
+                logger.error(
+                    f"Unexpected error during workflow save: {result}"
+                )
+                results["unknown"] = f"error: {result!s}"
+            elif isinstance(result, tuple) and len(result) == 2:
+                # Successfully got (node_id, save_result) tuple
+                node_id, save_result = result
+                results[node_id] = save_result
+            else:
+                # Unexpected result format
+                logger.error(f"Unexpected result format: {result}")
+                results["unknown"] = "error: unexpected result format"
+
+        logger.info(
+            f"Workflow save completed for {len(results)} workers "
+            f"(parallelized)"
+        )
+        return results
+
+    def load_workflow_memories(
+        self,
+        session_id: Optional[str] = None,
+        worker_max_workflows: int = 3,
+        coordinator_max_workflows: int = 5,
+        task_agent_max_workflows: int = 3,
+    ) -> Dict[str, bool]:
+        r"""Load workflow memories for all SingleAgentWorker instances in
+        the workforce.
+
+        This method iterates through all child workers and loads relevant
+        workflow files for SingleAgentWorker instances using their
+        load_workflow_memories() method. Workers match files based on their
+        description names.
+
+        Args:
+            session_id (Optional[str]): Specific workforce session ID to
+                load from. If None, searches across all sessions.
+                (default: :obj:`None`)
+            worker_max_workflows (int): Maximum number of workflow files to
+                load per worker agent. (default: :obj:`3`)
+            coordinator_max_workflows (int): Maximum number of workflow
+                files to load for the coordinator agent. (default: :obj:`5`)
+            task_agent_max_workflows (int): Maximum number of workflow files
+                to load for the task planning agent. (default: :obj:`3`)
+
+        Returns:
+            Dict[str, bool]: Dictionary mapping worker node IDs to load
+                success status. True indicates successful loading, False
+                indicates failure.
+
+        Example:
+            >>> workforce = Workforce("My Team")
+            >>> workforce.add_single_agent_worker(
+            ...     "data_analyst", analyst_agent
+            ... )
+            >>> success_status = workforce.load_workflow_memories(
+            ...     worker_max_workflows=5,
+            ...     coordinator_max_workflows=10,
+            ...     task_agent_max_workflows=5
+            ... )
+            >>> print(success_status)
+            {'worker_123': True}  # Successfully loaded workflows for
+                                  # data_analyst
+        """
+        results = {}
+
+        # For loading, we don't create a new session - instead we search
+        # existing ones. Each worker will search independently across all
+        # existing sessions.
+
+        # First, load workflows for SingleAgentWorker instances
+        for child in self._children:
+            if isinstance(child, SingleAgentWorker):
+                try:
+                    # For loading, don't set shared context utility;
+                    # let each worker search across existing sessions
+                    success = child.load_workflow_memories(
+                        max_workflows=worker_max_workflows,
+                        session_id=session_id,
+                    )
+                    results[child.node_id] = success
+
+                except Exception as e:
+                    logger.error(
+                        f"Failed to load workflow for {child.node_id}: {e!s}"
+                    )
+                    results[child.node_id] = False
+            else:
+                # Skip non-SingleAgentWorker types
+                results[child.node_id] = False
+
+        # Load aggregated workflow summaries for coordinator and task agents
+        self._load_management_agent_workflows(
+            coordinator_max_workflows, task_agent_max_workflows, session_id
+        )
+
+        logger.info(f"Workflow load completed for {len(results)} workers")
+        return results
+
+    def _load_management_agent_workflows(
+        self,
+        coordinator_max_workflows: int,
+        task_agent_max_workflows: int,
+        session_id: Optional[str] = None,
+    ) -> None:
+        r"""Load workflow summaries for coordinator and task planning agents.
+
+        This method loads aggregated workflow summaries to help:
+        - Coordinator agent: understand task assignment patterns and worker
+          capabilities
+        - Task agent: understand task decomposition patterns and successful
+          strategies
+
+        Args:
+            coordinator_max_workflows (int): Maximum number of workflow
+                files to load for the coordinator agent.
+            task_agent_max_workflows (int): Maximum number of workflow files
+                to load for the task planning agent.
+            session_id (Optional[str]): Specific session ID to load from.
+                If None, searches across all sessions.
+        """
+        try:
+            import glob
+            import os
+            from pathlib import Path
+
+            from camel.utils.context_utils import ContextUtility
+
+            # For loading management workflows, search across all sessions
+            camel_workdir = os.environ.get("CAMEL_WORKDIR")
+            if camel_workdir:
+                base_dir = os.path.join(camel_workdir, "workforce_workflows")
+            else:
+                base_dir = "workforce_workflows"
+
+            # Search for workflow files in specified or all session
+            # directories
+            if session_id:
+                search_path = str(
+                    Path(base_dir) / session_id / "*_workflow*.md"
+                )
+            else:
+                search_path = str(Path(base_dir) / "*" / "*_workflow*.md")
+            workflow_files = glob.glob(search_path)
+
+            if not workflow_files:
+                logger.info(
+                    "No workflow files found for management agent context"
+                )
+                return
+
+            # Sort by modification time (most recent first)
+            workflow_files.sort(
+                key=lambda x: os.path.getmtime(x), reverse=True
+            )
+
+            # Load workflows for coordinator agent
+            coordinator_loaded = 0
+            for file_path in workflow_files[:coordinator_max_workflows]:
+                try:
+                    filename = os.path.basename(file_path).replace('.md', '')
+                    session_dir = os.path.dirname(file_path)
+                    session_id = os.path.basename(session_dir)
+
+                    # Use shared context utility with specific session
+                    temp_utility = ContextUtility.get_workforce_shared(
+                        session_id
+                    )
+
+                    status = temp_utility.load_markdown_context_to_memory(
+                        self.coordinator_agent, filename
+                    )
+                    if "Context appended" in status:
+                        coordinator_loaded += 1
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to load coordinator workflow {file_path}: {e}"
+                    )
+
+            # Load workflows for task agent
+            task_agent_loaded = 0
+            for file_path in workflow_files[:task_agent_max_workflows]:
+                try:
+                    filename = os.path.basename(file_path).replace('.md', '')
+                    session_dir = os.path.dirname(file_path)
+                    session_id = os.path.basename(session_dir)
+
+                    # Use shared context utility with specific session
+                    temp_utility = ContextUtility.get_workforce_shared(
+                        session_id
+                    )
+
+                    status = temp_utility.load_markdown_context_to_memory(
+                        self.task_agent, filename
+                    )
+                    if "Context appended" in status:
+                        task_agent_loaded += 1
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to load task agent workflow {file_path}: {e}"
+                    )
+
+            logger.info(
+                f"Loaded {coordinator_loaded} workflows for coordinator, "
+                f"{task_agent_loaded} workflows for task agent"
+            )
+
+        except Exception as e:
+            logger.error(f"Error loading management agent workflows: {e}")
+
+    @check_if_running(False)
+    def set_channel(self, channel: TaskChannel) -> None:
+        r"""Set the channel for the node and all the child nodes under it."""
+        self._channel = channel
+        for child in self._children:
+            child.set_channel(channel)
+
+    def _get_child_nodes_info(self) -> str:
+        r"""Get the information of all the child nodes under this node."""
+        return "".join(
+            f"<{child.node_id}>:<{child.description}>:<{self._get_node_info(child)}>\n"
+            for child in self._children
+        )
+
+    def _get_node_info(self, node) -> str:
+        r"""Get descriptive information for a specific node type."""
+        if isinstance(node, Workforce):
+            return "A Workforce node"
+        elif isinstance(node, SingleAgentWorker):
+            return self._get_single_agent_toolkit_info(node)
+        elif isinstance(node, RolePlayingWorker):
+            return "A Role playing node"
+        else:
+            return "Unknown node"
+
+    def _get_single_agent_toolkit_info(
+        self, worker: 'SingleAgentWorker'
+    ) -> str:
+        r"""Get formatted information for a SingleAgentWorker node."""
+        toolkit_tools = self._group_tools_by_toolkit(worker.worker.tool_dict)
+
+        if not toolkit_tools:
+            return ""
+
+        toolkit_info = []
+        for toolkit_name, tools in sorted(toolkit_tools.items()):
+            tools_str = ', '.join(sorted(tools))
+            toolkit_info.append(f"{toolkit_name}({tools_str})")
+
+        return ", ".join(toolkit_info)
+
+    def _group_tools_by_toolkit(self, tool_dict: dict) -> dict[str, list[str]]:
+        r"""Group tools by their parent toolkit class names."""
+        toolkit_tools: dict[str, list[str]] = {}
+
+        for tool_name, tool in tool_dict.items():
+            if hasattr(tool.func, '__self__'):
+                toolkit_name = tool.func.__self__.__class__.__name__
+            else:
+                toolkit_name = "Standalone"
+
+            if toolkit_name not in toolkit_tools:
+                toolkit_tools[toolkit_name] = []
+            toolkit_tools[toolkit_name].append(tool_name)
+
+        return toolkit_tools
+
+    def _get_valid_worker_ids(self) -> set:
+        r"""Get all valid worker IDs from child nodes.
+
+        Returns:
+            set: Set of valid worker IDs that can be assigned tasks.
+        """
+        valid_worker_ids = {child.node_id for child in self._children}
+        return valid_worker_ids
+
+    def _call_coordinator_for_assignment(
+        self, tasks: List[Task], invalid_ids: Optional[List[str]] = None
+    ) -> TaskAssignResult:
+        r"""Call coordinator agent to assign tasks with optional validation
+        feedback in the case of invalid worker IDs.
+
+        Args:
+            tasks (List[Task]): Tasks to assign.
+            invalid_ids (List[str], optional): Invalid worker IDs from
+                previous attempt (if any).
+
+        Returns:
+            TaskAssignResult: Assignment result from coordinator.
+        """
+        # format tasks information for the prompt
+        tasks_info = ""
+        for task in tasks:
+            tasks_info += f"Task ID: {task.id}\n"
+            tasks_info += f"Content: {task.content}\n"
+            if task.additional_info:
+                tasks_info += f"Additional Info: {task.additional_info}\n"
+            tasks_info += "---\n"
+
+        prompt = str(
+            ASSIGN_TASK_PROMPT.format(
+                tasks_info=tasks_info,
+                child_nodes_info=self._get_child_nodes_info(),
+            )
+        )
+
+        # add feedback if this is a retry
+        if invalid_ids:
+            valid_worker_ids = list(self._get_valid_worker_ids())
+            feedback = (
+                f"VALIDATION ERROR: The following worker IDs are invalid: "
+                f"{invalid_ids}. "
+                f"VALID WORKER IDS: {valid_worker_ids}. "
+                f"Please reassign ONLY the above tasks using these valid IDs."
+            )
+            prompt = prompt + f"\n\n{feedback}"
+
+        # Check if we should use structured handler
+        if self.use_structured_output_handler:
             # Use structured handler for prompt-based extraction
             enhanced_prompt = (
                 self.structured_handler.generate_structured_prompt(
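Taken together, the new persistence surface can be exercised as follows; a sketch assuming the workers were created with enable_workflow_memory=True:

    async def persist_and_restore(workforce):
        # Parallel save: worker summaries run concurrently via
        # asyncio.gather(), sharing one session directory per call.
        results = await workforce.save_workflow_memories_async(
            session_id="project_123"
        )
        print(results)  # node_id -> file path, or "error: ..."

        # Later, e.g. in a fresh process: reload matching workflows for
        # workers, coordinator, and task agent.
        status = workforce.load_workflow_memories(
            session_id="project_123",
            worker_max_workflows=3,
        )
        print(status)  # node_id -> True/False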
@@ -2057,8 +2987,40 @@ class Workforce(BaseNode):
             TaskAssignResult: Assignment result containing task assignments
             with their dependencies.
         """
+        # Wait for workers to be ready before assignment with exponential
+        # backoff
+        worker_readiness_timeout = 2.0  # Maximum wait time in seconds
+        worker_readiness_check_interval = 0.05  # Initial check interval
+        start_time = time.time()
+        check_interval = worker_readiness_check_interval
+        backoff_multiplier = 1.5  # Exponential backoff factor
+        max_interval = 0.5  # Cap the maximum interval
+
+        while (time.time() - start_time) < worker_readiness_timeout:
+            valid_worker_ids = self._get_valid_worker_ids()
+            if len(valid_worker_ids) > 0:
+                elapsed = time.time() - start_time
+                logger.debug(
+                    f"Workers ready after {elapsed:.3f}s: "
+                    f"{len(valid_worker_ids)} workers available"
+                )
+                break
+
+            await asyncio.sleep(check_interval)
+            # Exponential backoff with cap
+            check_interval = min(
+                check_interval * backoff_multiplier, max_interval
+            )
+        else:
+            # Timeout reached, log warning but continue
+            logger.warning(
+                f"Worker readiness timeout after "
+                f"{worker_readiness_timeout}s, "
+                f"proceeding with {len(self._children)} children"
+            )
+            valid_worker_ids = self._get_valid_worker_ids()
+
         self.coordinator_agent.reset()
-        valid_worker_ids = self._get_valid_worker_ids()

         logger.debug(
             f"Sending batch assignment request to coordinator "
@@ -2092,7 +3054,24 @@ class Workforce(BaseNode):
                 invalid_assignments, tasks, valid_worker_ids
             )
         )
-        all_assignments = valid_assignments + retry_and_fallback_assignments
+
+        # Combine assignments with deduplication, prioritizing retry results
+        assignment_map = {a.task_id: a for a in valid_assignments}
+        assignment_map.update(
+            {a.task_id: a for a in retry_and_fallback_assignments}
+        )
+        all_assignments = list(assignment_map.values())
+
+        # Log any overwrites for debugging
+        valid_task_ids = {a.task_id for a in valid_assignments}
+        retry_task_ids = {a.task_id for a in retry_and_fallback_assignments}
+        overlap_task_ids = valid_task_ids & retry_task_ids
+
+        if overlap_task_ids:
+            logger.warning(
+                f"Retry assignments overrode {len(overlap_task_ids)} "
+                f"valid assignments for tasks: {sorted(overlap_task_ids)}"
+            )

         # Update Task.dependencies for all final assignments
         self._update_task_dependencies_from_assignments(all_assignments, tasks)
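The merge relies on dict update semantics: entries from the retry pass overwrite same-key entries from the first pass. A minimal sketch with a hypothetical stand-in for the assignment type:

    from dataclasses import dataclass

    @dataclass
    class Assignment:  # illustrative stand-in for the real assignment model
        task_id: str
        assignee_id: str

    valid = [Assignment("t1", "w1"), Assignment("t2", "w2")]
    retry = [Assignment("t2", "w9")]  # reassigned after validation failure

    merged = {a.task_id: a for a in valid}
    merged.update({a.task_id: a for a in retry})  # retry wins for "t2"
    print([(a.task_id, a.assignee_id) for a in merged.values()])
    # [('t1', 'w1'), ('t2', 'w9')]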
@@ -2105,10 +3084,11 @@ class Workforce(BaseNode):

         task.assigned_worker_id = assignee_id

-        if self.metrics_logger:
-            self.metrics_logger.log_task_started(
-                task_id=task.id, worker_id=assignee_id
-            )
+        task_started_event = TaskStartedEvent(
+            task_id=task.id, worker_id=assignee_id
+        )
+        for cb in self._callbacks:
+            cb.log_task_started(task_started_event)

         try:
             await self._channel.post_task(task, self.node_id, assignee_id)
@@ -2140,10 +3120,12 @@ class Workforce(BaseNode):
         Returns:
             Worker: The created worker node.
         """
-        prompt = CREATE_NODE_PROMPT.format(
-            content=task.content,
-            child_nodes_info=self._get_child_nodes_info(),
-            additional_info=task.additional_info,
+        prompt = str(
+            CREATE_NODE_PROMPT.format(
+                content=task.content,
+                child_nodes_info=self._get_child_nodes_info(),
+                additional_info=task.additional_info,
+            )
         )
         # Check if we should use structured handler
         if self.use_structured_output_handler:
@@ -2170,8 +3152,7 @@ class Workforce(BaseNode):
                 "worker creation"
             )
             new_node_conf = WorkerConf(
-                description=f"Fallback worker for task: "
-                f"{task.content}",
+                description=f"Fallback worker for task: {task.content}",
                 role="General Assistant",
                 sys_msg="You are a general assistant that can help "
                 "with various tasks.",
@@ -2181,7 +3162,7 @@ class Workforce(BaseNode):
                 response.msg.content,
                 schema=WorkerConf,
                 fallback_values={
-                    "description": f"Worker for task: " f"{task.content}",
+                    "description": f"Worker for task: {task.content}",
                     "role": "Task Specialist",
                     "sys_msg": f"You are a specialist for: {task.content}",
                 },
@@ -2209,8 +3190,7 @@ class Workforce(BaseNode):
             )
             # Create a fallback worker configuration
             new_node_conf = WorkerConf(
-                description=f"Fallback worker for "
-                f"task: {task.content}",
+                description=f"Fallback worker for task: {task.content}",
                 role="General Assistant",
                 sys_msg="You are a general assistant that can help "
                 "with various tasks.",
@@ -2254,13 +3234,13 @@ class Workforce(BaseNode):
         print(f"{Fore.CYAN}{new_node} created.{Fore.RESET}")

         self._children.append(new_node)
-        if self.metrics_logger:
-            self.metrics_logger.log_worker_created(
-                worker_id=new_node.node_id,
-                worker_type='SingleAgentWorker',
-                role=new_node_conf.role,
-                metadata={'description': new_node_conf.description},
-            )
+
+        self._notify_worker_created(
+            new_node,
+            worker_type='SingleAgentWorker',
+            role=new_node_conf.role,
+            metadata={'description': new_node_conf.description},
+        )
         self._child_listening_tasks.append(
             asyncio.create_task(new_node.start())
         )
@@ -2304,13 +3284,27 @@ class Workforce(BaseNode):
         r"""Get the task that's published by this node and just get returned
         from the assignee. Includes timeout handling to prevent indefinite
         waiting.
+
+        Raises:
+            asyncio.TimeoutError: If waiting for task exceeds timeout
         """
         try:
             # Add timeout to prevent indefinite waiting
             return await asyncio.wait_for(
                 self._channel.get_returned_task_by_publisher(self.node_id),
-                timeout=TASK_TIMEOUT_SECONDS,
+                timeout=self.task_timeout_seconds,
             )
+        except asyncio.TimeoutError:
+            # Re-raise timeout errors to be handled by caller
+            # This prevents hanging when tasks are stuck
+            logger.warning(
+                f"Timeout waiting for task return in workforce "
+                f"{self.node_id}. "
+                f"Timeout: {self.task_timeout_seconds}s, "
+                f"Pending tasks: {len(self._pending_tasks)}, "
+                f"In-flight tasks: {self._in_flight_tasks}"
+            )
+            raise
         except Exception as e:
             error_msg = (
                 f"Error getting returned task {e} in "
@@ -2329,7 +3323,15 @@ class Workforce(BaseNode):
         tasks_to_assign = [
             task
             for task in self._pending_tasks
-            if task.id not in self._task_dependencies
+            if (
+                task.id not in self._task_dependencies
+                and (
+                    task.additional_info is None
+                    or not task.additional_info.get(
+                        "_needs_decomposition", False
+                    )
+                )
+            )
         ]
         if tasks_to_assign:
             logger.debug(
@@ -2339,22 +3341,24 @@ class Workforce(BaseNode):
             batch_result = await self._find_assignee(tasks_to_assign)
             logger.debug(
                 f"Coordinator returned assignments:\n"
-                f"{json.dumps(batch_result.dict(), indent=2)}"
+                f"{json.dumps(batch_result.model_dump(), indent=2)}"
             )
             for assignment in batch_result.assignments:
                 self._task_dependencies[assignment.task_id] = (
                     assignment.dependencies
                 )
                 self._assignees[assignment.task_id] = assignment.assignee_id
-                if self.metrics_logger:
+
+                task_assigned_event = TaskAssignedEvent(
+                    task_id=assignment.task_id,
+                    worker_id=assignment.assignee_id,
+                    dependencies=assignment.dependencies,
+                    queue_time_seconds=None,
+                )
+                for cb in self._callbacks:
                     # queue_time_seconds can be derived by logger if task
                     # creation time is logged
-                    self.metrics_logger.log_task_assigned(
-                        task_id=assignment.task_id,
-                        worker_id=assignment.assignee_id,
-                        dependencies=assignment.dependencies,
-                        queue_time_seconds=None,
-                    )
+                    cb.log_task_assigned(task_assigned_event)

         # Step 2: Iterate through all pending tasks and post those that are
         # ready
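Since events now fan out to every registered callback, an observer only needs the hook methods it uses. A duck-typed sketch (the callback base class and its registration API are not shown in this hunk):

    class AssignmentPrinter:
        def log_task_assigned(self, event):
            # Fields mirror the TaskAssignedEvent constructed above.
            print(
                f"task {event.task_id} -> {event.worker_id}, "
                f"deps={event.dependencies}"
            )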
@@ -2365,21 +3369,139 @@ class Workforce(BaseNode):
         for task in self._pending_tasks:
             # A task must be assigned to be considered for posting
             if task.id in self._task_dependencies:
+                # Skip if task has already been posted to prevent duplicates
+                try:
+                    task_from_channel = await self._channel.get_task_by_id(
+                        task.id
+                    )
+                    # Check if task is already assigned to a worker
+                    if (
+                        task_from_channel
+                        and task_from_channel.assigned_worker_id
+                    ):
+                        logger.debug(
+                            f"Task {task.id} already assigned to "
+                            f"{task_from_channel.assigned_worker_id}, "
+                            f"skipping to prevent duplicate"
+                        )
+                        continue
+                except Exception as e:
+                    logger.info(
+                        f"Task {task.id} not found in channel. "
+                        f"Assigning task: {e}"
+                    )
                 dependencies = self._task_dependencies[task.id]
-                # Check if all dependencies for this task are in the completed
-                # set and their state is DONE
-                if all(
-                    dep_id in completed_tasks_info
-                    and completed_tasks_info[dep_id] == TaskState.DONE
-                    for dep_id in dependencies
-                ):
-                    assignee_id = self._assignees[task.id]
-                    logger.debug(
-                        f"Posting task {task.id} to assignee {assignee_id}. "
-                        f"Dependencies met."
+
+                # Check if all dependencies are in completed state
+                all_deps_completed = all(
+                    dep_id in completed_tasks_info for dep_id in dependencies
+                )
+
+                # Only proceed with dependency checks if all deps are completed
+                if all_deps_completed:
+                    # Check if all dependencies succeeded (state is DONE)
+                    all_deps_done = all(
+                        completed_tasks_info[dep_id] == TaskState.DONE
+                        for dep_id in dependencies
                     )
-                    await self._post_task(task, assignee_id)
-                    posted_tasks.append(task)
+
+                    # Check if any dependency failed
+                    any_dep_failed = any(
+                        completed_tasks_info[dep_id] == TaskState.FAILED
+                        for dep_id in dependencies
+                    )
+
+                    if all_deps_done:
+                        # All dependencies completed successfully - post the
+                        # task
+                        assignee_id = self._assignees[task.id]
+                        logger.debug(
+                            f"Posting task {task.id} to "
+                            f"assignee {assignee_id}. "
+                            f"Dependencies met."
+                        )
+                        await self._post_task(task, assignee_id)
+                        posted_tasks.append(task)
+                    elif any_dep_failed:
+                        # Check if any failed dependencies can still be retried
+                        failed_deps = [
+                            dep_id
+                            for dep_id in dependencies
+                            if completed_tasks_info[dep_id] == TaskState.FAILED
+                        ]
+
+                        # Check if any failed dependency is still retryable
+                        failed_tasks_with_retry_potential = []
+                        permanently_failed_deps = []
+
+                        for dep_id in failed_deps:
+                            # Find the failed dependency task
+                            failed_task = next(
+                                (
+                                    t
+                                    for t in self._completed_tasks
+                                    if t.id == dep_id
+                                ),
+                                None,
+                            )
+                            if (
+                                failed_task
+                                and failed_task.failure_count
+                                < MAX_TASK_RETRIES
+                            ):
+                                failed_tasks_with_retry_potential.append(
+                                    dep_id
+                                )
+                            else:
+                                permanently_failed_deps.append(dep_id)
+
+                        # Only fail the task if ALL dependencies are
+                        # permanently failed
+                        if (
+                            permanently_failed_deps
+                            and not failed_tasks_with_retry_potential
+                        ):
+                            logger.error(
+                                f"Task {task.id} cannot proceed: dependencies "
+                                f"{permanently_failed_deps} have "
+                                f"permanently failed. "
+                                f"Marking task as failed."
+                            )
+                            task.state = TaskState.FAILED
+                            task.result = (
+                                f"Task failed due to permanently "
+                                f"failed dependencies: "
+                                f"{permanently_failed_deps}"
+                            )
+
+                            # Log the failure to metrics
+                            task_failed_event = TaskFailedEvent(
+                                task_id=task.id,
+                                worker_id=task.assigned_worker_id or "unknown",
+                                error_message=task.result,
+                                metadata={
+                                    'failure_reason': 'dependency_failure',
+                                    'failed_dependencies': (
+                                        permanently_failed_deps
+                                    ),
+                                },
+                            )
+                            for cb in self._callbacks:
+                                cb.log_task_failed(task_failed_event)
+
+                            self._completed_tasks.append(task)
+                            self._cleanup_task_tracking(task.id)
+                            posted_tasks.append(task)  # Remove from pending
+                        else:
+                            # Some dependencies may still be retried, keep
+                            # task pending
+                            logger.debug(
+                                f"Task {task.id} waiting: dependencies "
+                                f"{failed_tasks_with_retry_potential} "
+                                f"failed but may be retried "
+                                f"(attempt < {MAX_TASK_RETRIES})"
+                            )
+                # else: Not all dependencies completed yet, skip this task

         # Step 3: Remove the posted tasks from the pending list
         for task in posted_tasks:
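The gating above reduces to a three-way decision per task; a minimal sketch of the same rule with illustrative state strings:

    from enum import Enum

    class Gate(Enum):
        POST = "post"  # all dependencies DONE -> dispatch
        FAIL = "fail"  # all failed dependencies out of retries -> fail
        WAIT = "wait"  # otherwise keep the task pending

    def gate(dep_states: dict, retries_left: dict) -> Gate:
        if any(s not in ("DONE", "FAILED") for s in dep_states.values()):
            return Gate.WAIT
        if all(s == "DONE" for s in dep_states.values()):
            return Gate.POST
        failed = [d for d, s in dep_states.items() if s == "FAILED"]
        # Fail only when every failed dependency has no retries left.
        if all(retries_left.get(d, 0) == 0 for d in failed):
            return Gate.FAIL
        return Gate.WAIT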
@@ -2391,53 +3513,59 @@ class Workforce(BaseNode):
             pass

     async def _handle_failed_task(self, task: Task) -> bool:
+        r"""Handle a task that failed during execution.
+
+        Args:
+            task (Task): The failed task
+
+        Returns:
+            bool: True if workforce should halt, False otherwise
+        """
         task.failure_count += 1

         # Determine detailed failure information
-        # Use the actual error/result stored in task.result
         failure_reason = task.result or "Unknown error"
-
-        # Add context about the worker and task
         worker_id = task.assigned_worker_id or "unknown"
-        worker_info = f" (assigned to worker: {worker_id})"
-
-        detailed_error = f"{failure_reason}{worker_info}"
+        detailed_error = f"{failure_reason} (assigned to worker: {worker_id})"

         logger.error(
             f"Task {task.id} failed (attempt "
-            f"{task.failure_count}/3): {detailed_error}"
+            f"{task.failure_count}/{MAX_TASK_RETRIES}): {detailed_error}"
         )

-        if self.metrics_logger:
-            self.metrics_logger.log_task_failed(
-                task_id=task.id,
-                worker_id=worker_id,
-                error_message=detailed_error,
-                metadata={
-                    'failure_count': task.failure_count,
-                    'task_content': task.content,
-                    'result_length': len(task.result) if task.result else 0,
-                },
-            )
+        print(
+            f"{Fore.RED}❌ Task {task.id} failed "
+            f"(attempt {task.failure_count}/{MAX_TASK_RETRIES}): "
+            f"{failure_reason}{Fore.RESET}"
+        )
+
+        task_failed_event = TaskFailedEvent(
+            task_id=task.id,
+            worker_id=worker_id,
+            error_message=detailed_error,
+            metadata={
+                'failure_count': task.failure_count,
+                'task_content': task.content,
+                'result_length': len(task.result) if task.result else 0,
+            },
+        )
+        for cb in self._callbacks:
+            cb.log_task_failed(task_failed_event)

-        # Check for immediate halt conditions - return immediately if we
-        # should halt
+        # Check for immediate halt conditions
         if task.failure_count >= MAX_TASK_RETRIES:
             logger.error(
                 f"Task {task.id} has exceeded maximum retry attempts "
-                f"({MAX_TASK_RETRIES}). Final failure "
-                f"reason: {detailed_error}. "
+                f"({MAX_TASK_RETRIES}). Final failure reason: "
+                f"{detailed_error}. "
                 f"Task content: '{task.content}'"
             )
             self._cleanup_task_tracking(task.id)
-            # Mark task as completed for dependency tracking before halting
             self._completed_tasks.append(task)
             if task.id in self._assignees:
                 await self._channel.archive_task(task.id)
             return True

-        # If too many tasks are failing rapidly, also halt to prevent infinite
-        # loops
         if len(self._pending_tasks) > MAX_PENDING_TASKS_LIMIT:
             logger.error(
                 f"Too many pending tasks ({len(self._pending_tasks)} > "
@@ -2445,18 +3573,24 @@ class Workforce(BaseNode):
                 f"explosion. Last failed task: {task.id}"
             )
             self._cleanup_task_tracking(task.id)
-            # Mark task as completed for dependency tracking before halting
             self._completed_tasks.append(task)
             if task.id in self._assignees:
                 await self._channel.archive_task(task.id)
             return True

         # Use intelligent failure analysis to decide recovery strategy
-        recovery_decision = self._analyze_failure(task, detailed_error)
+        recovery_decision = self._analyze_task(
+            task, for_failure=True, error_message=detailed_error
+        )

+        strategy_str = (
+            recovery_decision.recovery_strategy.value
+            if recovery_decision.recovery_strategy
+            else "none"
+        )
         logger.info(
             f"Task {task.id} failure "
-            f"analysis: {recovery_decision.strategy.value} - "
+            f"analysis: {strategy_str} - "
             f"{recovery_decision.reasoning}"
         )
@@ -2465,105 +3599,23 @@ class Workforce(BaseNode):
             await self._channel.archive_task(task.id)
         self._cleanup_task_tracking(task.id)

+        # Apply recovery strategy
         try:
-            if recovery_decision.strategy == RecoveryStrategy.RETRY:
-                # Simply retry the task by reposting it
-                if task.id in self._assignees:
-                    assignee_id = self._assignees[task.id]
-                    await self._post_task(task, assignee_id)
-                    action_taken = f"retried with same worker {assignee_id}"
-                else:
-                    # Find a new assignee and retry
-                    batch_result = await self._find_assignee([task])
-                    assignment = batch_result.assignments[0]
-                    self._assignees[task.id] = assignment.assignee_id
-                    await self._post_task(task, assignment.assignee_id)
-                    action_taken = (
-                        f"retried with new worker {assignment.assignee_id}"
-                    )
-
-            elif recovery_decision.strategy == RecoveryStrategy.REPLAN:
-                # Modify the task content and retry
-                if recovery_decision.modified_task_content:
-                    task.content = recovery_decision.modified_task_content
-                    logger.info(f"Task {task.id} content modified for replan")
-
-                # Repost the modified task
-                if task.id in self._assignees:
-                    assignee_id = self._assignees[task.id]
-                    await self._post_task(task, assignee_id)
-                    action_taken = (
-                        f"replanned and retried with worker {assignee_id}"
-                    )
-                else:
-                    # Find a new assignee for the replanned task
-                    batch_result = await self._find_assignee([task])
-                    assignment = batch_result.assignments[0]
-                    self._assignees[task.id] = assignment.assignee_id
-                    await self._post_task(task, assignment.assignee_id)
-                    action_taken = (
-                        f"replanned and assigned to "
-                        f"worker {assignment.assignee_id}"
-                    )
-
-            elif recovery_decision.strategy == RecoveryStrategy.DECOMPOSE:
-                # Decompose the task into subtasks
-                subtasks_result = self._decompose_task(task)
-
-                # Handle both streaming and non-streaming results
-                if isinstance(subtasks_result, Generator):
-                    # This is a generator (streaming mode)
-                    subtasks = []
-                    for new_tasks in subtasks_result:
-                        subtasks.extend(new_tasks)
-                else:
-                    # This is a regular list (non-streaming mode)
-                    subtasks = subtasks_result
-                if self.metrics_logger and subtasks:
-                    self.metrics_logger.log_task_decomposed(
-                        parent_task_id=task.id,
-                        subtask_ids=[st.id for st in subtasks],
-                    )
-                    for subtask in subtasks:
-                        self.metrics_logger.log_task_created(
-                            task_id=subtask.id,
-                            description=subtask.content,
-                            parent_task_id=task.id,
-                            task_type=subtask.type,
-                            metadata=subtask.additional_info,
-                        )
-                # Insert packets at the head of the queue
-                self._pending_tasks.extendleft(reversed(subtasks))
-
-                await self._post_ready_tasks()
-                action_taken = f"decomposed into {len(subtasks)} subtasks"
-
-                logger.debug(
-                    f"Task {task.id} failed and was {action_taken}. "
-                    f"Dependencies updated for subtasks."
-                )
-
-                # Sync shared memory after task decomposition
-                if self.share_memory:
-                    logger.info(
-                        f"Syncing shared memory after "
-                        f"task {task.id} decomposition"
-                    )
-                    self._sync_shared_memory()
+            is_decompose = await self._apply_recovery_strategy(
+                task, recovery_decision
+            )

-                # Check if any pending tasks are now ready to execute
-                await self._post_ready_tasks()
+            # For decompose, we handle it specially
+            if is_decompose:
+                # Task was decomposed, add to completed tasks
+                self._completed_tasks.append(task)
                 return False

-            elif recovery_decision.strategy == RecoveryStrategy.CREATE_WORKER:
-                assignee = await self._create_worker_node_for_task(task)
-                await self._post_task(task, assignee.node_id)
-                action_taken = (
-                    f"created new worker {assignee.node_id} and assigned "
-                    f"task {task.id} to it"
-                )
         except Exception as e:
-            logger.error(f"Recovery strategy failed for task {task.id}: {e}")
+            logger.error(
+                f"Recovery strategy failed for task {task.id}: {e}",
+                exc_info=True,
+            )
             # If max retries reached, halt the workforce
             if task.failure_count >= MAX_TASK_RETRIES:
                 self._completed_tasks.append(task)
@@ -2571,18 +3623,17 @@ class Workforce(BaseNode):
             self._completed_tasks.append(task)
             return False

+        # Task is being retried - don't add to completed tasks
+        # It will be added when it actually completes or permanently fails
         logger.debug(
-            f"Task {task.id} failed and was {action_taken}. "
-            f"Updating dependency state."
+            f"Task {task.id} is being retried (strategy: "
+            f"{recovery_decision.recovery_strategy}). "
+            f"Not adding to completed tasks until final outcome."
         )
-        # Mark task as completed for dependency tracking
-        self._completed_tasks.append(task)

-        # Sync shared memory after task completion to share knowledge
+        # Sync shared memory after task recovery
        if self.share_memory:
-            logger.info(
-                f"Syncing shared memory after task {task.id} completion"
-            )
+            logger.info(f"Syncing shared memory after task {task.id} recovery")
            self._sync_shared_memory()

         # Check if any pending tasks are now ready to execute
@@ -2590,61 +3641,60 @@ class Workforce(BaseNode):
         return False

     async def _handle_completed_task(self, task: Task) -> None:
-        if self.metrics_logger:
-            worker_id = task.assigned_worker_id or "unknown"
-            processing_time_seconds = None
-            token_usage = None
-
-            # Get processing time from task start time or additional info
-            if task.id in self._task_start_times:
-                processing_time_seconds = (
-                    time.time() - self._task_start_times[task.id]
-                )
-                self._cleanup_task_tracking(task.id)
-            elif (
-                task.additional_info is not None
-                and 'processing_time_seconds' in task.additional_info
-            ):
-                processing_time_seconds = task.additional_info[
-                    'processing_time_seconds'
-                ]
+        worker_id = task.assigned_worker_id or "unknown"
+        processing_time_seconds = None
+        token_usage = None

-            # Get token usage from task additional info (preferred - actual
-            # usage)
-            if (
-                task.additional_info is not None
-                and 'token_usage' in task.additional_info
-            ):
-                token_usage = task.additional_info['token_usage']
-            else:
-                # Fallback: Try to get token usage from SingleAgentWorker
-                # memory
-                assignee_node = next(
-                    (
-                        child
-                        for child in self._children
-                        if child.node_id == worker_id
-                    ),
-                    None,
-                )
-                if isinstance(assignee_node, SingleAgentWorker):
-                    try:
-                        _, total_tokens = (
-                            assignee_node.worker.memory.get_context()
-                        )
-                        token_usage = {'total_tokens': total_tokens}
-                    except Exception:
-                        token_usage = None
+        # Get processing time from task start time or additional info
+        if task.id in self._task_start_times:
+            processing_time_seconds = (
+                time.time() - self._task_start_times[task.id]
+            )
+            self._cleanup_task_tracking(task.id)
+        elif (
+            task.additional_info is not None
+            and 'processing_time_seconds' in task.additional_info
+        ):
+            processing_time_seconds = task.additional_info[
+                'processing_time_seconds'
+            ]

-            # Log the completed task
-            self.metrics_logger.log_task_completed(
-                task_id=task.id,
-                worker_id=worker_id,
-                result_summary=task.result if task.result else "Completed",
-                processing_time_seconds=processing_time_seconds,
-                token_usage=token_usage,
-                metadata={'current_state': task.state.value},
+        # Get token usage from task additional info (preferred - actual
+        # usage)
+        if (
+            task.additional_info is not None
+            and 'token_usage' in task.additional_info
+        ):
+            token_usage = task.additional_info['token_usage']
+        else:
+            # Fallback: Try to get token usage from SingleAgentWorker
+            # memory
+            assignee_node = next(
+                (
+                    child
+                    for child in self._children
+                    if child.node_id == worker_id
+                ),
+                None,
             )
+            if isinstance(assignee_node, SingleAgentWorker):
+                try:
+                    _, total_tokens = assignee_node.worker.memory.get_context()
+                    token_usage = {'total_tokens': total_tokens}
+                except Exception:
+                    token_usage = None
+
+        # Log the completed task
+        task_completed_event = TaskCompletedEvent(
+            task_id=task.id,
+            worker_id=worker_id,
+            result_summary=task.result if task.result else "Completed",
+            processing_time_seconds=processing_time_seconds,
+            token_usage=token_usage,
+            metadata={'current_state': task.state.value},
+        )
+        for cb in self._callbacks:
+            cb.log_task_completed(task_completed_event)

         # Find and remove the completed task from pending tasks
         tasks_list = list(self._pending_tasks)
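For consumers of these events, a hedged sketch of a callback that aggregates throughput from completions (event objects are assumed to expose their constructor fields as attributes):

    class ThroughputRecorder:
        def __init__(self):
            self.rows = []

        def log_task_completed(self, event):
            # Both the timing and token usage may be None when the
            # workforce could not measure them; guard before aggregating.
            tokens = (event.token_usage or {}).get("total_tokens")
            self.rows.append(
                (event.task_id, event.processing_time_seconds, tokens)
            )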
@@ -2764,15 +3814,23 @@ class Workforce(BaseNode):
         r"""Returns an ASCII tree representation of the task hierarchy and
         worker status.
         """
-        if not self.metrics_logger:
-            return "Logger not initialized."
-        return self.metrics_logger.get_ascii_tree_representation()
+        metrics_cb: List[WorkforceMetrics] = [
+            cb for cb in self._callbacks if isinstance(cb, WorkforceMetrics)
+        ]
+        if len(metrics_cb) == 0:
+            return "Metrics Callback not initialized."
+        else:
+            return metrics_cb[0].get_ascii_tree_representation()

     def get_workforce_kpis(self) -> Dict[str, Any]:
         r"""Returns a dictionary of key performance indicators."""
-        if not self.metrics_logger:
-            return {"error": "Logger not initialized."}
-        return self.metrics_logger.get_kpis()
+        metrics_cb: List[WorkforceMetrics] = [
+            cb for cb in self._callbacks if isinstance(cb, WorkforceMetrics)
+        ]
+        if len(metrics_cb) == 0:
+            return {"error": "Metrics Callback not initialized."}
+        else:
+            return metrics_cb[0].get_kpis()

     def dump_workforce_logs(self, file_path: str) -> None:
         r"""Dumps all collected logs to a JSON file.
@@ -2780,13 +3838,133 @@ class Workforce(BaseNode):
2780
3838
  Args:
2781
3839
  file_path (str): The path to the JSON file.
2782
3840
  """
2783
- if not self.metrics_logger:
3841
+ metrics_cb: List[WorkforceMetrics] = [
3842
+ cb for cb in self._callbacks if isinstance(cb, WorkforceMetrics)
3843
+ ]
3844
+ if len(metrics_cb) == 0:
2784
3845
  print("Logger not initialized. Cannot dump logs.")
2785
3846
  return
2786
- self.metrics_logger.dump_to_json(file_path)
3847
+ metrics_cb[0].dump_to_json(file_path)
2787
3848
  # Use logger.info or print, consistent with existing style
2788
3849
  logger.info(f"Workforce logs dumped to {file_path}")
2789
3850
 
+    async def _handle_skip_task(self) -> bool:
+        r"""Handle skip request by marking pending and in-flight tasks
+        as completed.
+
+        Returns:
+            bool: True if workforce should stop (no independent tasks),
+                False to continue.
+        """
+        logger.info("Skip requested, processing skip logic.")
+
+        # Mark all pending tasks as completed instead of just clearing
+        pending_tasks_to_complete = list(self._pending_tasks)
+        if pending_tasks_to_complete:
+            logger.info(
+                f"Marking {len(pending_tasks_to_complete)} pending tasks "
+                f"as completed."
+            )
+            for task in pending_tasks_to_complete:
+                # Skip tasks that still need decomposition
+                if task.additional_info and task.additional_info.get(
+                    '_needs_decomposition', False
+                ):
+                    continue
+                # Set task state to DONE and add a completion message
+                task.state = TaskState.DONE
+                task.result = "Task marked as completed due to skip request"
+
+                # Reuse the existing completed-task handler
+                await self._handle_completed_task(task)
+
+        # Handle in-flight tasks if they exist
+        if self._in_flight_tasks > 0:
+            logger.info(
+                f"Found {self._in_flight_tasks} in-flight tasks. "
+                f"Retrieving and completing them."
+            )
+            try:
+                # Get all in-flight tasks for this publisher from the channel
+                in_flight_tasks = await self._channel.get_in_flight_tasks(
+                    self.node_id
+                )
+                logger.info(
+                    f"Retrieved {len(in_flight_tasks)} in-flight "
+                    f"tasks from channel."
+                )
+
+                for task in in_flight_tasks:
+                    # Set task state to DONE and add a completion message
+                    task.state = TaskState.DONE
+                    task.result = (
+                        "Task marked as completed due to skip request"
+                    )
+
+                    # Remove the task from the channel to avoid hanging
+                    await self._channel.remove_task(task.id)
+
+                    # Decrement in-flight counter
+                    self._decrement_in_flight_tasks(
+                        task.id, "skip request - removed from channel"
+                    )
+
+                    # Handle as completed task to update dependencies
+                    await self._handle_completed_task(task)
+
+                    logger.info(
+                        f"Completed in-flight task {task.id} due "
+                        f"to skip request."
+                    )
+
+            except Exception as e:
+                logger.error(
+                    f"Error handling in-flight tasks during skip: {e}",
+                    exc_info=True,
+                )
+                # Reset in-flight counter to prevent hanging
+                self._in_flight_tasks = 0
+
+        # Check if there are any main pending tasks after filtering
+        if self._pending_tasks:
+            # Check if the first pending task needs decomposition
+            next_task = self._pending_tasks[0]
+            if next_task.additional_info and next_task.additional_info.get(
+                '_needs_decomposition'
+            ):
+                logger.info(
+                    f"Decomposing main task {next_task.id} after skip request."
+                )
+                try:
+                    # Remove the decomposition flag to avoid re-decomposition
+                    next_task.additional_info['_needs_decomposition'] = False
+
+                    # Decompose the task and append subtasks to _pending_tasks
+                    await self.handle_decompose_append_task(
+                        next_task, reset=False
+                    )
+
+                    # Mark the main task as completed and remove from pending
+                    await self._handle_completed_task(next_task)
+                    logger.info(
+                        f"Main task {next_task.id} decomposed after "
+                        f"skip request"
+                    )
+                except Exception as e:
+                    logger.error(
+                        f"Error decomposing main task {next_task.id} "
+                        f"after skip: {e}",
+                        exc_info=True,
+                    )
+
+            logger.info("Pending tasks available after skip, continuing.")
+            await self._post_ready_tasks()
+            return False  # Continue processing
+        else:
+            # No pending tasks remain, so treat the skip as a stop
+            logger.info("No pending tasks available, acting like stop.")
+            return True  # Stop processing
+
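
A minimal test-style sketch of the method's two outcomes; `pytest-asyncio` and the `workforce` fixture are assumptions, while the flag and method names come from the diff above:

```python
import pytest

@pytest.mark.asyncio
async def test_skip_with_nothing_left_behaves_like_stop(workforce):
    # With no pending or in-flight tasks, a skip degenerates to a stop.
    workforce._pending_tasks.clear()
    workforce._in_flight_tasks = 0
    assert await workforce._handle_skip_task() is True
```
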
     @check_if_running(False)
     async def _listen_to_channel(self) -> None:
         r"""Continuously listen to the channel, post task to the channel and
@@ -2815,6 +3993,75 @@ class Workforce(BaseNode):
                 logger.info("Stop requested, breaking execution loop.")
                 break
 
+            # Check for skip request after potential pause
+            if self._skip_requested:
+                should_stop = await self._handle_skip_task()
+                if should_stop:
+                    self._stop_requested = True
+                    break
+
+                # Reset skip flag
+                self._skip_requested = False
+                continue
+
+            # Check if we should decompose a main task
+            # Only decompose when no tasks are in flight and pending queue
+            # is empty
+            if not self._pending_tasks and self._in_flight_tasks == 0:
+                # All tasks completed, will exit loop
+                break
+
+            # Check if the first pending task needs decomposition
+            # This happens when add_task(as_subtask=False) was called
+            if self._pending_tasks and self._in_flight_tasks == 0:
+                next_task = self._pending_tasks[0]
+                if (
+                    next_task.additional_info
+                    and next_task.additional_info.get(
+                        '_needs_decomposition'
+                    )
+                ):
+                    logger.info(f"Decomposing main task: {next_task.id}")
+                    try:
+                        # Remove the decomposition flag to avoid
+                        # re-decomposition
+                        next_task.additional_info[
+                            '_needs_decomposition'
+                        ] = False
+
+                        # Decompose the task and append subtasks to
+                        # _pending_tasks
+                        await self.handle_decompose_append_task(
+                            next_task, reset=False
+                        )
+
+                        # Mark the main task as completed (decomposition
+                        # successful) and remove it from pending tasks
+                        await self._handle_completed_task(next_task)
+                        logger.info(
+                            f"Main task {next_task.id} decomposed and "
+                            f"ready for processing"
+                        )
+                    except Exception as e:
+                        logger.error(
+                            f"Error decomposing main task {next_task.id}: "
+                            f"{e}",
+                            exc_info=True,
+                        )
+                        # Revert back to the queue for retry later if
+                        # decomposition failed
+                        if not self._pending_tasks:
+                            self._pending_tasks.appendleft(next_task)
+                        else:
+                            logger.warning(
+                                "Pending tasks exist after decomposition "
+                                "error."
+                            )
+
+                    # Immediately assign and post the transferred tasks
+                    await self._post_ready_tasks()
+                    continue
+
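
The `_needs_decomposition` marker consulted here travels in `task.additional_info`, as the comment above notes for `add_task(as_subtask=False)`. A plausible sketch of how such a flag could be set when a main task is enqueued (the body is inferred from this loop, not the actual `add_task` implementation):

```python
def add_task(self, task, as_subtask: bool = True) -> None:
    # Hypothetical: flag a main task so the listener loop decomposes it
    # once nothing else is pending or in flight.
    if not as_subtask:
        if task.additional_info is None:
            task.additional_info = {}
        task.additional_info['_needs_decomposition'] = True
    self._pending_tasks.append(task)
```
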
 
             # Save snapshot before processing next task
             if self._pending_tasks:
@@ -2829,9 +4076,24 @@ class Workforce(BaseNode):
                 self._last_snapshot_time = time.time()
 
             # Get returned task
-            returned_task = await self._get_returned_task()
+            try:
+                returned_task = await self._get_returned_task()
+            except asyncio.TimeoutError:
+                # Handle timeout - check if we have tasks stuck in flight
+                if self._in_flight_tasks > 0:
+                    logger.warning(
+                        f"Timeout waiting for {self._in_flight_tasks} "
+                        f"in-flight tasks. Breaking to prevent hanging."
+                    )
+                    # Break the loop to prevent indefinite hanging
+                    # The finally block will handle cleanup
+                    break
+                else:
+                    # No tasks in flight, safe to continue
+                    await self._post_ready_tasks()
+                    continue
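
The new `except asyncio.TimeoutError` branch implies that `_get_returned_task` now bounds its wait. One plausible shape, assuming the limit comes from the `task_timeout_seconds` option propagated through `clone()` at the end of this diff (the channel method name is an assumption):

```python
import asyncio

async def _get_returned_task(self):
    # Sketch: wrap the channel read in a timeout so a lost worker
    # cannot stall the listener loop indefinitely.
    return await asyncio.wait_for(
        self._channel.get_returned_task_by_publisher(self.node_id),
        timeout=self.task_timeout_seconds,
    )
```
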
 
-            # If no task was returned, continue
+            # If no task was returned (other errors), continue
             if returned_task is None:
                 logger.debug(
                     f"No task returned in workforce {self.node_id}. "
@@ -2872,6 +4134,20 @@ class Workforce(BaseNode):
                         )
                         if not halt:
                             continue
+
+                        # Do not halt if there are main tasks in the queue
+                        if len(self.get_main_task_queue()) > 0:
+                            print(
+                                f"{Fore.RED}Task {returned_task.id} has "
+                                f"failed for {MAX_TASK_RETRIES} times "
+                                f"after insufficient results, skipping "
+                                f"that task. Final error: "
+                                f"{returned_task.result or 'Unknown error'}"
+                                f"{Fore.RESET}"
+                            )
+                            self._skip_requested = True
+                            continue
+
                         print(
                             f"{Fore.RED}Task {returned_task.id} has "
                             f"failed for {MAX_TASK_RETRIES} times after "
@@ -2890,16 +4166,106 @@ class Workforce(BaseNode):
                         )
                         continue
                     else:
-                        print(
-                            f"{Fore.CYAN}🎯 Task {returned_task.id} completed "
-                            f"successfully.{Fore.RESET}"
+                        quality_eval = self._analyze_task(
+                            returned_task, for_failure=False
                         )
-                        await self._handle_completed_task(returned_task)
+
+                        if not quality_eval.quality_sufficient:
+                            logger.info(
+                                f"Task {returned_task.id} quality check: "
+                                f"score={quality_eval.quality_score}, "
+                                f"issues={quality_eval.issues}, "
+                                f"strategy={quality_eval.recovery_strategy}"
+                            )
+
+                            # Check retry limit before attempting recovery
+                            if returned_task.failure_count >= 2:
+                                print(
+                                    f"{Fore.YELLOW}Task {returned_task.id} "
+                                    f"completed with low quality score: "
+                                    f"{quality_eval.quality_score} "
+                                    f"(retry limit reached){Fore.RESET}"
+                                )
+                                await self._handle_completed_task(
+                                    returned_task
+                                )
+                                continue
+
+                            # Print visual feedback for quality-failed tasks
+                            # with recovery strategy
+                            recovery_action = (
+                                quality_eval.recovery_strategy.value
+                                if quality_eval.recovery_strategy
+                                else ""
+                            )
+                            print(
+                                f"{Fore.YELLOW}⚠️ Task {returned_task.id} "
+                                f"failed quality check (score: "
+                                f"{quality_eval.quality_score}). "
+                                f"Issues: {', '.join(quality_eval.issues)}. "
+                                f"Recovery: {recovery_action}{Fore.RESET}"
+                            )
+
+                            # Mark as failed for recovery
+                            returned_task.failure_count += 1
+                            returned_task.state = TaskState.FAILED
+                            returned_task.result = (
+                                f"Quality insufficient (score: "
+                                f"{quality_eval.quality_score}). "
+                                f"Issues: {', '.join(quality_eval.issues)}"
+                            )
+
+                            # Clean up tracking before attempting recovery
+                            if returned_task.id in self._assignees:
+                                await self._channel.archive_task(
+                                    returned_task.id
+                                )
+                                self._cleanup_task_tracking(returned_task.id)
+
+                            # Apply LLM-recommended recovery strategy
+                            try:
+                                is_decompose = (
+                                    await self._apply_recovery_strategy(
+                                        returned_task, quality_eval
+                                    )
+                                )
+
+                                # For decompose, cleanup happens in the method
+                                if is_decompose:
+                                    continue
+
+                            except Exception as e:
+                                logger.error(
+                                    f"Error handling quality-failed task "
+                                    f"{returned_task.id}: {e}",
+                                    exc_info=True,
+                                )
+                                continue
+                        else:
+                            print(
+                                f"{Fore.CYAN}Task {returned_task.id} "
+                                f"completed successfully (quality score: "
+                                f"{quality_eval.quality_score}).{Fore.RESET}"
+                            )
+                            await self._handle_completed_task(returned_task)
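
The evaluation object consumed in this branch exposes four attributes: `quality_sufficient`, `quality_score`, `issues`, and `recovery_strategy` (whose `.value` is printed). A minimal stand-in with that shape, as a sketch only, since the real return type of `_analyze_task` is not shown in this diff:

```python
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional

class RecoveryStrategy(Enum):
    # Assumed variants; only .value is read in the loop above.
    RETRY = "retry"
    DECOMPOSE = "decompose"

@dataclass
class QualityEvaluation:
    quality_sufficient: bool
    quality_score: float  # numeric score; the exact type is not shown
    issues: List[str] = field(default_factory=list)
    recovery_strategy: Optional[RecoveryStrategy] = None
```
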
                 elif returned_task.state == TaskState.FAILED:
                     try:
                         halt = await self._handle_failed_task(returned_task)
                         if not halt:
                             continue
+
+                        # Do not halt if there are main tasks in the queue
+                        if len(self.get_main_task_queue()) > 0:
+                            print(
+                                f"{Fore.RED}Task {returned_task.id} has "
+                                f"failed for {MAX_TASK_RETRIES} times, "
+                                f"skipping that task. Final error: "
+                                f"{returned_task.result or 'Unknown error'}"
+                                f"{Fore.RESET}"
+                            )
+                            self._skip_requested = True
+                            continue
+
                         print(
                             f"{Fore.RED}Task {returned_task.id} has failed "
                             f"for {MAX_TASK_RETRIES} times, halting "
@@ -2952,6 +4318,9 @@ class Workforce(BaseNode):
                 elif not self._pending_tasks and self._in_flight_tasks == 0:
                     self._state = WorkforceState.IDLE
                     logger.info("All tasks completed.")
+                    all_tasks_completed_event = AllTasksCompletedEvent()
+                    for cb in self._callbacks:
+                        cb.log_all_tasks_completed(all_tasks_completed_event)
 
                     # shut down the whole workforce tree
                     self.stop()
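
With `AllTasksCompletedEvent`, a callback can now observe both per-task completion and the end of the whole run. A sketch pairing the two hooks seen in this diff (the class itself is illustrative):

```python
class RunSummaryCallback:
    """Hypothetical callback combining both hooks from this diff."""

    def __init__(self) -> None:
        self.completed: list = []

    def log_task_completed(self, event) -> None:
        self.completed.append(event.task_id)

    def log_all_tasks_completed(self, event) -> None:
        print(f"Run finished: {len(self.completed)} tasks completed.")
```
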
@@ -3064,6 +4433,7 @@ class Workforce(BaseNode):
             graceful_shutdown_timeout=self.graceful_shutdown_timeout,
             share_memory=self.share_memory,
             use_structured_output_handler=self.use_structured_output_handler,
+            task_timeout_seconds=self.task_timeout_seconds,
         )
 
         for child in self._children: