camel-ai 0.2.67__py3-none-any.whl → 0.2.80a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. camel/__init__.py +1 -1
  2. camel/agents/_types.py +6 -2
  3. camel/agents/_utils.py +38 -0
  4. camel/agents/chat_agent.py +4014 -410
  5. camel/agents/mcp_agent.py +30 -27
  6. camel/agents/repo_agent.py +2 -1
  7. camel/benchmarks/browsecomp.py +6 -6
  8. camel/configs/__init__.py +15 -0
  9. camel/configs/aihubmix_config.py +88 -0
  10. camel/configs/amd_config.py +70 -0
  11. camel/configs/cometapi_config.py +104 -0
  12. camel/configs/minimax_config.py +93 -0
  13. camel/configs/nebius_config.py +103 -0
  14. camel/configs/vllm_config.py +2 -0
  15. camel/data_collectors/alpaca_collector.py +15 -6
  16. camel/datagen/self_improving_cot.py +1 -1
  17. camel/datasets/base_generator.py +39 -10
  18. camel/environments/__init__.py +12 -0
  19. camel/environments/rlcards_env.py +860 -0
  20. camel/environments/single_step.py +28 -3
  21. camel/environments/tic_tac_toe.py +1 -1
  22. camel/interpreters/__init__.py +2 -0
  23. camel/interpreters/docker/Dockerfile +4 -16
  24. camel/interpreters/docker_interpreter.py +3 -2
  25. camel/interpreters/e2b_interpreter.py +34 -1
  26. camel/interpreters/internal_python_interpreter.py +51 -2
  27. camel/interpreters/microsandbox_interpreter.py +395 -0
  28. camel/loaders/__init__.py +11 -2
  29. camel/loaders/base_loader.py +85 -0
  30. camel/loaders/chunkr_reader.py +9 -0
  31. camel/loaders/firecrawl_reader.py +4 -4
  32. camel/logger.py +1 -1
  33. camel/memories/agent_memories.py +84 -1
  34. camel/memories/base.py +34 -0
  35. camel/memories/blocks/chat_history_block.py +122 -4
  36. camel/memories/blocks/vectordb_block.py +8 -1
  37. camel/memories/context_creators/score_based.py +29 -237
  38. camel/memories/records.py +88 -8
  39. camel/messages/base.py +166 -40
  40. camel/messages/func_message.py +32 -5
  41. camel/models/__init__.py +10 -0
  42. camel/models/aihubmix_model.py +83 -0
  43. camel/models/aiml_model.py +1 -16
  44. camel/models/amd_model.py +101 -0
  45. camel/models/anthropic_model.py +117 -18
  46. camel/models/aws_bedrock_model.py +2 -33
  47. camel/models/azure_openai_model.py +205 -91
  48. camel/models/base_audio_model.py +3 -1
  49. camel/models/base_model.py +189 -24
  50. camel/models/cohere_model.py +5 -17
  51. camel/models/cometapi_model.py +83 -0
  52. camel/models/crynux_model.py +1 -16
  53. camel/models/deepseek_model.py +6 -16
  54. camel/models/fish_audio_model.py +6 -0
  55. camel/models/gemini_model.py +71 -20
  56. camel/models/groq_model.py +1 -17
  57. camel/models/internlm_model.py +1 -16
  58. camel/models/litellm_model.py +49 -32
  59. camel/models/lmstudio_model.py +1 -17
  60. camel/models/minimax_model.py +83 -0
  61. camel/models/mistral_model.py +1 -16
  62. camel/models/model_factory.py +27 -1
  63. camel/models/model_manager.py +24 -6
  64. camel/models/modelscope_model.py +1 -16
  65. camel/models/moonshot_model.py +185 -19
  66. camel/models/nebius_model.py +83 -0
  67. camel/models/nemotron_model.py +0 -5
  68. camel/models/netmind_model.py +1 -16
  69. camel/models/novita_model.py +1 -16
  70. camel/models/nvidia_model.py +1 -16
  71. camel/models/ollama_model.py +4 -19
  72. camel/models/openai_compatible_model.py +171 -46
  73. camel/models/openai_model.py +205 -77
  74. camel/models/openrouter_model.py +1 -17
  75. camel/models/ppio_model.py +1 -16
  76. camel/models/qianfan_model.py +1 -16
  77. camel/models/qwen_model.py +1 -16
  78. camel/models/reka_model.py +1 -16
  79. camel/models/samba_model.py +34 -47
  80. camel/models/sglang_model.py +64 -31
  81. camel/models/siliconflow_model.py +1 -16
  82. camel/models/stub_model.py +0 -4
  83. camel/models/togetherai_model.py +1 -16
  84. camel/models/vllm_model.py +1 -16
  85. camel/models/volcano_model.py +0 -17
  86. camel/models/watsonx_model.py +1 -16
  87. camel/models/yi_model.py +1 -16
  88. camel/models/zhipuai_model.py +60 -16
  89. camel/parsers/__init__.py +18 -0
  90. camel/parsers/mcp_tool_call_parser.py +176 -0
  91. camel/retrievers/auto_retriever.py +1 -0
  92. camel/runtimes/configs.py +11 -11
  93. camel/runtimes/daytona_runtime.py +15 -16
  94. camel/runtimes/docker_runtime.py +6 -6
  95. camel/runtimes/remote_http_runtime.py +5 -5
  96. camel/services/agent_openapi_server.py +380 -0
  97. camel/societies/__init__.py +2 -0
  98. camel/societies/role_playing.py +26 -28
  99. camel/societies/workforce/__init__.py +2 -0
  100. camel/societies/workforce/events.py +122 -0
  101. camel/societies/workforce/prompts.py +249 -38
  102. camel/societies/workforce/role_playing_worker.py +82 -20
  103. camel/societies/workforce/single_agent_worker.py +634 -34
  104. camel/societies/workforce/structured_output_handler.py +512 -0
  105. camel/societies/workforce/task_channel.py +169 -23
  106. camel/societies/workforce/utils.py +176 -9
  107. camel/societies/workforce/worker.py +77 -23
  108. camel/societies/workforce/workflow_memory_manager.py +772 -0
  109. camel/societies/workforce/workforce.py +3168 -478
  110. camel/societies/workforce/workforce_callback.py +74 -0
  111. camel/societies/workforce/workforce_logger.py +203 -175
  112. camel/societies/workforce/workforce_metrics.py +33 -0
  113. camel/storages/__init__.py +4 -0
  114. camel/storages/key_value_storages/json.py +15 -2
  115. camel/storages/key_value_storages/mem0_cloud.py +48 -47
  116. camel/storages/object_storages/google_cloud.py +1 -1
  117. camel/storages/vectordb_storages/__init__.py +6 -0
  118. camel/storages/vectordb_storages/chroma.py +731 -0
  119. camel/storages/vectordb_storages/oceanbase.py +13 -13
  120. camel/storages/vectordb_storages/pgvector.py +349 -0
  121. camel/storages/vectordb_storages/qdrant.py +3 -3
  122. camel/storages/vectordb_storages/surreal.py +365 -0
  123. camel/storages/vectordb_storages/tidb.py +8 -6
  124. camel/tasks/task.py +244 -27
  125. camel/toolkits/__init__.py +46 -8
  126. camel/toolkits/aci_toolkit.py +64 -19
  127. camel/toolkits/arxiv_toolkit.py +6 -6
  128. camel/toolkits/base.py +63 -5
  129. camel/toolkits/code_execution.py +28 -1
  130. camel/toolkits/context_summarizer_toolkit.py +684 -0
  131. camel/toolkits/craw4ai_toolkit.py +93 -0
  132. camel/toolkits/dappier_toolkit.py +10 -6
  133. camel/toolkits/dingtalk.py +1135 -0
  134. camel/toolkits/edgeone_pages_mcp_toolkit.py +49 -0
  135. camel/toolkits/excel_toolkit.py +901 -67
  136. camel/toolkits/file_toolkit.py +1402 -0
  137. camel/toolkits/function_tool.py +30 -6
  138. camel/toolkits/github_toolkit.py +107 -20
  139. camel/toolkits/gmail_toolkit.py +1839 -0
  140. camel/toolkits/google_calendar_toolkit.py +38 -4
  141. camel/toolkits/google_drive_mcp_toolkit.py +54 -0
  142. camel/toolkits/human_toolkit.py +34 -10
  143. camel/toolkits/hybrid_browser_toolkit/__init__.py +18 -0
  144. camel/toolkits/hybrid_browser_toolkit/config_loader.py +185 -0
  145. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit.py +246 -0
  146. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit_ts.py +1973 -0
  147. camel/toolkits/hybrid_browser_toolkit/installer.py +203 -0
  148. camel/toolkits/hybrid_browser_toolkit/ts/package-lock.json +3749 -0
  149. camel/toolkits/hybrid_browser_toolkit/ts/package.json +32 -0
  150. camel/toolkits/hybrid_browser_toolkit/ts/src/browser-scripts.js +125 -0
  151. camel/toolkits/hybrid_browser_toolkit/ts/src/browser-session.ts +1815 -0
  152. camel/toolkits/hybrid_browser_toolkit/ts/src/config-loader.ts +233 -0
  153. camel/toolkits/hybrid_browser_toolkit/ts/src/hybrid-browser-toolkit.ts +590 -0
  154. camel/toolkits/hybrid_browser_toolkit/ts/src/index.ts +7 -0
  155. camel/toolkits/hybrid_browser_toolkit/ts/src/parent-child-filter.ts +226 -0
  156. camel/toolkits/hybrid_browser_toolkit/ts/src/snapshot-parser.ts +219 -0
  157. camel/toolkits/hybrid_browser_toolkit/ts/src/som-screenshot-injected.ts +543 -0
  158. camel/toolkits/hybrid_browser_toolkit/ts/src/types.ts +130 -0
  159. camel/toolkits/hybrid_browser_toolkit/ts/tsconfig.json +26 -0
  160. camel/toolkits/hybrid_browser_toolkit/ts/websocket-server.js +319 -0
  161. camel/toolkits/hybrid_browser_toolkit/ws_wrapper.py +1032 -0
  162. camel/toolkits/hybrid_browser_toolkit_py/__init__.py +17 -0
  163. camel/toolkits/hybrid_browser_toolkit_py/actions.py +575 -0
  164. camel/toolkits/hybrid_browser_toolkit_py/agent.py +311 -0
  165. camel/toolkits/hybrid_browser_toolkit_py/browser_session.py +787 -0
  166. camel/toolkits/hybrid_browser_toolkit_py/config_loader.py +490 -0
  167. camel/toolkits/hybrid_browser_toolkit_py/hybrid_browser_toolkit.py +2390 -0
  168. camel/toolkits/hybrid_browser_toolkit_py/snapshot.py +233 -0
  169. camel/toolkits/hybrid_browser_toolkit_py/stealth_script.js +0 -0
  170. camel/toolkits/hybrid_browser_toolkit_py/unified_analyzer.js +1043 -0
  171. camel/toolkits/image_generation_toolkit.py +390 -0
  172. camel/toolkits/jina_reranker_toolkit.py +3 -4
  173. camel/toolkits/klavis_toolkit.py +5 -1
  174. camel/toolkits/markitdown_toolkit.py +104 -0
  175. camel/toolkits/math_toolkit.py +64 -10
  176. camel/toolkits/mcp_toolkit.py +370 -45
  177. camel/toolkits/memory_toolkit.py +5 -1
  178. camel/toolkits/message_agent_toolkit.py +608 -0
  179. camel/toolkits/message_integration.py +724 -0
  180. camel/toolkits/minimax_mcp_toolkit.py +195 -0
  181. camel/toolkits/note_taking_toolkit.py +277 -0
  182. camel/toolkits/notion_mcp_toolkit.py +224 -0
  183. camel/toolkits/openbb_toolkit.py +5 -1
  184. camel/toolkits/origene_mcp_toolkit.py +56 -0
  185. camel/toolkits/playwright_mcp_toolkit.py +12 -31
  186. camel/toolkits/pptx_toolkit.py +25 -12
  187. camel/toolkits/resend_toolkit.py +168 -0
  188. camel/toolkits/screenshot_toolkit.py +213 -0
  189. camel/toolkits/search_toolkit.py +437 -142
  190. camel/toolkits/slack_toolkit.py +104 -50
  191. camel/toolkits/sympy_toolkit.py +1 -1
  192. camel/toolkits/task_planning_toolkit.py +3 -3
  193. camel/toolkits/terminal_toolkit/__init__.py +18 -0
  194. camel/toolkits/terminal_toolkit/terminal_toolkit.py +957 -0
  195. camel/toolkits/terminal_toolkit/utils.py +532 -0
  196. camel/toolkits/thinking_toolkit.py +1 -1
  197. camel/toolkits/vertex_ai_veo_toolkit.py +590 -0
  198. camel/toolkits/video_analysis_toolkit.py +106 -26
  199. camel/toolkits/video_download_toolkit.py +17 -14
  200. camel/toolkits/web_deploy_toolkit.py +1219 -0
  201. camel/toolkits/wechat_official_toolkit.py +483 -0
  202. camel/toolkits/zapier_toolkit.py +5 -1
  203. camel/types/__init__.py +2 -2
  204. camel/types/agents/tool_calling_record.py +4 -1
  205. camel/types/enums.py +316 -40
  206. camel/types/openai_types.py +2 -2
  207. camel/types/unified_model_type.py +31 -4
  208. camel/utils/commons.py +36 -5
  209. camel/utils/constants.py +3 -0
  210. camel/utils/context_utils.py +1003 -0
  211. camel/utils/mcp.py +138 -4
  212. camel/utils/mcp_client.py +45 -1
  213. camel/utils/message_summarizer.py +148 -0
  214. camel/utils/token_counting.py +43 -20
  215. camel/utils/tool_result.py +44 -0
  216. {camel_ai-0.2.67.dist-info → camel_ai-0.2.80a2.dist-info}/METADATA +296 -85
  217. {camel_ai-0.2.67.dist-info → camel_ai-0.2.80a2.dist-info}/RECORD +219 -146
  218. camel/loaders/pandas_reader.py +0 -368
  219. camel/toolkits/dalle_toolkit.py +0 -175
  220. camel/toolkits/file_write_toolkit.py +0 -444
  221. camel/toolkits/openai_agent_toolkit.py +0 -135
  222. camel/toolkits/terminal_toolkit.py +0 -1037
  223. {camel_ai-0.2.67.dist-info → camel_ai-0.2.80a2.dist-info}/WHEEL +0 -0
  224. {camel_ai-0.2.67.dist-info → camel_ai-0.2.80a2.dist-info}/licenses/LICENSE +0 -0
camel/societies/workforce/workforce.py
@@ -14,12 +14,34 @@
 from __future__ import annotations
 
 import asyncio
+import concurrent.futures
 import json
+import os
 import time
 import uuid
 from collections import deque
 from enum import Enum
-from typing import Any, Coroutine, Deque, Dict, List, Optional
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Coroutine,
+    Deque,
+    Dict,
+    Generator,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    Union,
+    cast,
+)
+
+from .workforce_callback import WorkforceCallback
+from .workforce_metrics import WorkforceMetrics
+
+if TYPE_CHECKING:
+    from camel.utils.context_utils import ContextUtility
 
 from colorama import Fore
 
@@ -31,30 +53,71 @@ from camel.societies.workforce.base import BaseNode
 from camel.societies.workforce.prompts import (
     ASSIGN_TASK_PROMPT,
     CREATE_NODE_PROMPT,
-    WF_TASK_DECOMPOSE_PROMPT,
+    FAILURE_ANALYSIS_RESPONSE_FORMAT,
+    QUALITY_EVALUATION_RESPONSE_FORMAT,
+    TASK_AGENT_SYSTEM_MESSAGE,
+    TASK_ANALYSIS_PROMPT,
+    TASK_DECOMPOSE_PROMPT,
 )
 from camel.societies.workforce.role_playing_worker import RolePlayingWorker
-from camel.societies.workforce.single_agent_worker import SingleAgentWorker
+from camel.societies.workforce.single_agent_worker import (
+    SingleAgentWorker,
+)
+from camel.societies.workforce.structured_output_handler import (
+    StructuredOutputHandler,
+)
 from camel.societies.workforce.task_channel import TaskChannel
 from camel.societies.workforce.utils import (
+    RecoveryStrategy,
+    TaskAnalysisResult,
+    TaskAssignment,
     TaskAssignResult,
     WorkerConf,
     check_if_running,
 )
 from camel.societies.workforce.worker import Worker
-from camel.tasks.task import Task, TaskState, validate_task_content
+from camel.tasks.task import (
+    Task,
+    TaskState,
+    is_task_result_insufficient,
+    validate_task_content,
+)
 from camel.toolkits import (
     CodeExecutionToolkit,
+    FunctionTool,
     SearchToolkit,
-    TaskPlanningToolkit,
     ThinkingToolkit,
 )
 from camel.types import ModelPlatformType, ModelType
 from camel.utils import dependencies_required
 
+from .events import (
+    AllTasksCompletedEvent,
+    TaskAssignedEvent,
+    TaskCompletedEvent,
+    TaskCreatedEvent,
+    TaskDecomposedEvent,
+    TaskFailedEvent,
+    TaskStartedEvent,
+    WorkerCreatedEvent,
+)
 from .workforce_logger import WorkforceLogger
 
-logger = get_logger(__name__)
+if os.environ.get("TRACEROOT_ENABLED", "False").lower() == "true":
+    try:
+        import traceroot  # type: ignore[import]
+
+        logger = traceroot.get_logger('camel')
+    except ImportError:
+        logger = get_logger(__name__)
+else:
+    logger = get_logger(__name__)
+
+# Constants for configuration values
+MAX_TASK_RETRIES = 3
+MAX_PENDING_TASKS_LIMIT = 20
+TASK_TIMEOUT_SECONDS = 600.0
+DEFAULT_WORKER_POOL_SIZE = 10
 
 
 class WorkforceState(Enum):
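The logger wiring added above is resolved once, at import time, from the TRACEROOT_ENABLED environment variable. A minimal sketch of opting in, assuming the optional traceroot package is installed (if it is not, the except branch falls back to camel's standard logger):

import os

# Must be set before the workforce module is first imported, because the
# logger is selected at module import time, not per call.
os.environ["TRACEROOT_ENABLED"] = "true"

from camel.societies.workforce import Workforce  # logger now routed via traceroot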
@@ -110,31 +173,33 @@ class Workforce(BaseNode):
         children (Optional[List[BaseNode]], optional): List of child nodes
             under this node. Each child node can be a worker node or
             another workforce node. (default: :obj:`None`)
-        coordinator_agent_kwargs (Optional[Dict], optional): Keyword
-            arguments passed directly to the coordinator :obj:`ChatAgent`
-            constructor. The coordinator manages task assignment and failure
-            handling strategies. See :obj:`ChatAgent` documentation
-            for all available parameters.
-            (default: :obj:`None` - uses ModelPlatformType.DEFAULT,
-            ModelType.DEFAULT)
-        task_agent_kwargs (Optional[Dict], optional): Keyword arguments
-            passed directly to the task planning :obj:`ChatAgent` constructor.
-            The task agent handles task decomposition into subtasks and result
-            composition. See :obj:`ChatAgent` documentation for all
-            available parameters.
-            (default: :obj:`None` - uses ModelPlatformType.DEFAULT,
-            ModelType.DEFAULT)
-        new_worker_agent_kwargs (Optional[Dict], optional): Default keyword
-            arguments passed to :obj:`ChatAgent` constructor for workers
-            created dynamically at runtime when existing workers cannot handle
-            failed tasks. See :obj:`ChatAgent` documentation for all
-            available parameters.
-            (default: :obj:`None` - creates workers with SearchToolkit,
-            CodeExecutionToolkit, and ThinkingToolkit)
+        coordinator_agent (Optional[ChatAgent], optional): A custom coordinator
+            agent instance for task assignment and worker creation. If
+            provided, the workforce will create a new agent using this agent's
+            model configuration but with the required system message and
+            functionality.
+            If None, a default agent will be created using DEFAULT model
+            settings. (default: :obj:`None`)
+        task_agent (Optional[ChatAgent], optional): A custom task planning
+            agent instance for task decomposition and composition. If
+            provided, the workforce will create a new agent using this agent's
+            model configuration but with the required system message. If None,
+            a default agent will be created using DEFAULT model settings.
+            (default: :obj:`None`)
+        new_worker_agent (Optional[ChatAgent], optional): A template agent for
+            workers created dynamically at runtime when existing workers cannot
+            handle failed tasks. If None, workers will be created with default
+            settings including SearchToolkit, CodeExecutionToolkit, and
+            ThinkingToolkit. (default: :obj:`None`)
         graceful_shutdown_timeout (float, optional): The timeout in seconds
             for graceful shutdown when a task fails 3 times. During this
             period, the workforce remains active for debugging.
             Set to 0 for immediate shutdown. (default: :obj:`15.0`)
+        task_timeout_seconds (Optional[float], optional): The timeout in
+            seconds for waiting for tasks to be returned by workers. If None,
+            uses the global TASK_TIMEOUT_SECONDS value (600.0 seconds).
+            Increase this value for tasks that require more processing time.
+            (default: :obj:`None`)
         share_memory (bool, optional): Whether to enable shared memory across
             SingleAgentWorker instances in the workforce. When enabled, all
             SingleAgentWorker instances, coordinator agent, and task planning
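For callers migrating from 0.2.67, the removed *_kwargs parameters documented above map onto pre-built ChatAgent instances. A sketch of the equivalent call, assuming a model object created via ModelFactory as in the class docstring example (the token_limit values are illustrative):

from camel.agents import ChatAgent

# 0.2.67: coordinator_agent_kwargs={"model": model, "token_limit": 4000},
#         task_agent_kwargs={"model": model, "token_limit": 8000}
# 0.2.80: construct the agents yourself and hand them over.
workforce = Workforce(
    "Research Team",
    coordinator_agent=ChatAgent(model=model, token_limit=4000),
    task_agent=ChatAgent(model=model, token_limit=8000),
)

Per the Note in the docstring below, the workforce rebuilds each supplied agent with the required system message appended, so these instances act as configuration templates rather than being used verbatim.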
@@ -144,45 +209,93 @@ class Workforce(BaseNode):
             SingleAgentWorker instances; RolePlayingWorker and nested
             Workforce instances do not participate in memory sharing.
             (default: :obj:`False`)
+        use_structured_output_handler (bool, optional): Whether to use the
+            structured output handler instead of native structured output.
+            When enabled, the workforce will use prompts with structured
+            output instructions and regex extraction to parse responses.
+            This ensures compatibility with agents that don't reliably
+            support native structured output. When disabled, the workforce
+            uses the native response_format parameter.
+            (default: :obj:`True`)
+        callbacks (Optional[List[WorkforceCallback]], optional): A list of
+            callback handlers to observe and record workforce lifecycle events
+            and metrics (e.g., task creation/assignment/start/completion/
+            failure, worker creation/deletion, all-tasks-completed). All
+            items must be instances of :class:`WorkforceCallback`, otherwise
+            a :class:`ValueError` is raised. If none of the provided
+            callbacks implement :class:`WorkforceMetrics`, a built-in
+            :class:`WorkforceLogger` (implements both callback and metrics)
+            is added automatically. If at least one provided callback
+            implements :class:`WorkforceMetrics`, no default logger is added.
+            (default: :obj:`None`)
 
     Example:
-        >>> # Configure with custom model and shared memory
         >>> import asyncio
+        >>> from camel.agents import ChatAgent
+        >>> from camel.models import ModelFactory
+        >>> from camel.types import ModelPlatformType, ModelType
+        >>> from camel.tasks import Task
+        >>>
+        >>> # Simple workforce with default agents
+        >>> workforce = Workforce("Research Team")
+        >>>
+        >>> # Workforce with custom model configuration
         >>> model = ModelFactory.create(
-        ...     ModelPlatformType.OPENAI, ModelType.GPT_4O
+        ...     ModelPlatformType.OPENAI, model_type=ModelType.GPT_4O
         ... )
+        >>> coordinator_agent = ChatAgent(model=model)
+        >>> task_agent = ChatAgent(model=model)
+        >>>
         >>> workforce = Workforce(
         ...     "Research Team",
-        ...     coordinator_agent_kwargs={"model": model, "token_limit": 4000},
-        ...     task_agent_kwargs={"model": model, "token_limit": 8000},
-        ...     share_memory=True  # Enable shared memory
+        ...     coordinator_agent=coordinator_agent,
+        ...     task_agent=task_agent,
         ... )
         >>>
         >>> # Process a task
         >>> async def main():
         ...     task = Task(content="Research AI trends", id="1")
-        ...     result = workforce.process_task(task)
+        ...     result = await workforce.process_task_async(task)
         ...     return result
-        >>> asyncio.run(main())
+        >>>
+        >>> result_task = asyncio.run(main())
+
+    Note:
+        When custom coordinator_agent or task_agent are provided, the workforce
+        will preserve the user's system message and append the required
+        workforce coordination or task planning instructions to it. This
+        ensures both the user's intent is preserved and proper workforce
+        functionality is maintained. All other agent configurations (model,
+        memory, tools, etc.) will also be preserved.
     """
 
     def __init__(
         self,
         description: str,
         children: Optional[List[BaseNode]] = None,
-        coordinator_agent_kwargs: Optional[Dict] = None,
-        task_agent_kwargs: Optional[Dict] = None,
-        new_worker_agent_kwargs: Optional[Dict] = None,
+        coordinator_agent: Optional[ChatAgent] = None,
+        task_agent: Optional[ChatAgent] = None,
+        new_worker_agent: Optional[ChatAgent] = None,
         graceful_shutdown_timeout: float = 15.0,
         share_memory: bool = False,
+        use_structured_output_handler: bool = True,
+        task_timeout_seconds: Optional[float] = None,
+        callbacks: Optional[List[WorkforceCallback]] = None,
     ) -> None:
         super().__init__(description)
-        self._child_listening_tasks: Deque[asyncio.Task] = deque()
+        self._child_listening_tasks: Deque[
+            Union[asyncio.Task, concurrent.futures.Future]
+        ] = deque()
         self._children = children or []
-        self.new_worker_agent_kwargs = new_worker_agent_kwargs
+        self.new_worker_agent = new_worker_agent
         self.graceful_shutdown_timeout = graceful_shutdown_timeout
         self.share_memory = share_memory
-        self.metrics_logger = WorkforceLogger(workforce_id=self.node_id)
+        self.use_structured_output_handler = use_structured_output_handler
+        self.task_timeout_seconds = (
+            task_timeout_seconds or TASK_TIMEOUT_SECONDS
+        )
+        if self.use_structured_output_handler:
+            self.structured_handler = StructuredOutputHandler()
         self._task: Optional[Task] = None
         self._pending_tasks: Deque[Task] = deque()
         self._task_dependencies: Dict[str, List[str]] = {}
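The callbacks parameter documented above accepts WorkforceCallback implementations. A minimal sketch of a custom callback; the two hook names are ones this diff actually invokes on callbacks (log_worker_created, log_task_created), while any further abstract hooks WorkforceCallback may declare would need stubs as well:

from camel.societies.workforce.events import TaskCreatedEvent, WorkerCreatedEvent
from camel.societies.workforce.workforce_callback import WorkforceCallback

class PrintingCallback(WorkforceCallback):
    # Hook signatures assumed from the cb.log_*(event) call sites in this diff.
    def log_worker_created(self, event: WorkerCreatedEvent) -> None:
        print(f"worker created: {event.worker_id} ({event.role})")

    def log_task_created(self, event: TaskCreatedEvent) -> None:
        print(f"task created: {event.task_id}")

Since PrintingCallback does not implement WorkforceMetrics, passing callbacks=[PrintingCallback()] still gets the built-in WorkforceLogger added alongside it, as the docstring above describes.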
@@ -195,54 +308,158 @@
         self._pause_event = asyncio.Event()
         self._pause_event.set()  # Initially not paused
         self._stop_requested = False
+        self._skip_requested = False
         self._snapshots: List[WorkforceSnapshot] = []
         self._completed_tasks: List[Task] = []
         self._loop: Optional[asyncio.AbstractEventLoop] = None
         self._main_task_future: Optional[asyncio.Future] = None
+        self._cleanup_task: Optional[asyncio.Task] = None
         # Snapshot throttle support
         self._last_snapshot_time: float = 0.0
         # Minimum seconds between automatic snapshots
         self.snapshot_interval: float = 30.0
-        if self.metrics_logger:
-            for child in self._children:
-                worker_type = type(child).__name__
-                role_or_desc = child.description
-                self.metrics_logger.log_worker_created(
-                    worker_id=child.node_id,
-                    worker_type=worker_type,
-                    role=role_or_desc,
-                )
+        # Shared memory UUID tracking to prevent re-sharing duplicates
+        self._shared_memory_uuids: Set[str] = set()
+        self._initialize_callbacks(callbacks)
 
-        # Warning messages for default model usage
-        if coordinator_agent_kwargs is None:
+        # Set up coordinator agent with default system message
+        coord_agent_sys_msg = BaseMessage.make_assistant_message(
+            role_name="Workforce Manager",
+            content="You are coordinating a group of workers. A worker "
+            "can be a group of agents or a single agent. Each worker is "
+            "created to solve a specific kind of task. Your job "
+            "includes assigning tasks to a existing worker, creating "
+            "a new worker for a task, etc.",
+        )
+
+        if coordinator_agent is None:
             logger.warning(
-                "No coordinator_agent_kwargs provided. Using default "
+                "No coordinator_agent provided. Using default "
                 "ChatAgent settings (ModelPlatformType.DEFAULT, "
-                "ModelType.DEFAULT). To customize the coordinator agent "
-                "that assigns tasks and handles failures, pass a dictionary "
-                "with ChatAgent parameters, e.g.: {'model': your_model, "
-                "'tools': your_tools, 'token_limit': 8000}. See ChatAgent "
-                "documentation for all available options."
+                "ModelType.DEFAULT) with default system message."
             )
-        if task_agent_kwargs is None:
-            logger.warning(
-                "No task_agent_kwargs provided. Using default ChatAgent "
-                "settings (ModelPlatformType.DEFAULT, ModelType.DEFAULT). "
-                "To customize the task planning agent that "
-                "decomposes/composes tasks, pass a dictionary with "
-                "ChatAgent parameters, e.g.: {'model': your_model, "
-                "'token_limit': 16000}. See ChatAgent documentation for "
-                "all available options."
-            )
-        if new_worker_agent_kwargs is None:
+            self.coordinator_agent = ChatAgent(coord_agent_sys_msg)
+        else:
+            logger.info(
+                "Custom coordinator_agent provided. Preserving user's "
+                "system message and appending workforce coordination "
+                "instructions to ensure proper functionality."
+            )
+
+            if coordinator_agent.system_message is not None:
+                user_sys_msg_content = coordinator_agent.system_message.content
+                combined_content = (
+                    f"{user_sys_msg_content}\n\n{coord_agent_sys_msg.content}"
+                )
+                combined_sys_msg = BaseMessage.make_assistant_message(
+                    role_name=coordinator_agent.system_message.role_name,
+                    content=combined_content,
+                )
+            else:
+                combined_sys_msg = coord_agent_sys_msg
+
+            # Create a new agent with the provided agent's configuration
+            # but with the combined system message
+            self.coordinator_agent = ChatAgent(
+                system_message=combined_sys_msg,
+                model=coordinator_agent.model_backend,
+                memory=coordinator_agent.memory,
+                message_window_size=getattr(
+                    coordinator_agent.memory, "window_size", None
+                ),
+                token_limit=getattr(
+                    coordinator_agent.memory.get_context_creator(),
+                    "token_limit",
+                    None,
+                ),
+                output_language=coordinator_agent.output_language,
+                tools=list(coordinator_agent._internal_tools.values()),
+                external_tools=[
+                    schema
+                    for schema in coordinator_agent._external_tool_schemas.values()  # noqa: E501
+                ],
+                response_terminators=coordinator_agent.response_terminators,
+                max_iteration=coordinator_agent.max_iteration,
+                stop_event=coordinator_agent.stop_event,
+            )
+
+        # Set up task agent with default system message
+        task_sys_msg = BaseMessage.make_assistant_message(
+            role_name="Task Planner",
+            content=TASK_AGENT_SYSTEM_MESSAGE,
+        )
+
+        if task_agent is None:
             logger.warning(
-                "No new_worker_agent_kwargs provided. Workers created at "
-                "runtime will use default ChatAgent settings with "
-                "SearchToolkit, CodeExecutionToolkit, and ThinkingToolkit. "
-                "To customize runtime worker creation, pass a dictionary "
-                "with ChatAgent parameters, e.g.: {'model': your_model, "
-                "'tools': your_tools}. See ChatAgent documentation for all "
-                "available options."
+                "No task_agent provided. Using default ChatAgent "
+                "settings (ModelPlatformType.DEFAULT, ModelType.DEFAULT) "
+                "with default system message."
+            )
+            self.task_agent = ChatAgent(
+                task_sys_msg,
+            )
+        else:
+            logger.info(
+                "Custom task_agent provided. Preserving user's "
+                "system message and appending task planning "
+                "instructions to ensure proper functionality."
+            )
+
+            if task_agent.system_message is not None:
+                user_task_sys_msg_content = task_agent.system_message.content
+                combined_task_content = (
+                    f"{user_task_sys_msg_content}\n\n{task_sys_msg.content}"
+                )
+                combined_task_sys_msg = BaseMessage.make_assistant_message(
+                    role_name=task_agent.system_message.role_name,
+                    content=combined_task_content,
+                )
+            else:
+                combined_task_sys_msg = task_sys_msg
+
+            # Since ChatAgent constructor uses a dictionary with
+            # function names as keys, we don't need to manually deduplicate.
+            combined_tools: List[Union[FunctionTool, Callable]] = cast(
+                List[Union[FunctionTool, Callable]],
+                list(task_agent._internal_tools.values()),
+            )
+
+            # Create a new agent with the provided agent's configuration
+            # but with the combined system message and tools
+            self.task_agent = ChatAgent(
+                system_message=combined_task_sys_msg,
+                model=task_agent.model_backend,
+                memory=task_agent.memory,
+                message_window_size=getattr(
+                    task_agent.memory, "window_size", None
+                ),
+                token_limit=getattr(
+                    task_agent.memory.get_context_creator(),
+                    "token_limit",
+                    None,
+                ),
+                output_language=task_agent.output_language,
+                tools=combined_tools,
+                external_tools=[
+                    schema
+                    for schema in task_agent._external_tool_schemas.values()
+                ],
+                response_terminators=task_agent.response_terminators,
+                max_iteration=task_agent.max_iteration,
+                stop_event=task_agent.stop_event,
+            )
+
+        if new_worker_agent is None:
+            logger.info(
+                "No new_worker_agent provided. Workers created at runtime "
+                "will use default ChatAgent settings with SearchToolkit, "
+                "CodeExecutionToolkit, and ThinkingToolkit. To customize "
+                "runtime worker creation, pass a ChatAgent instance."
+            )
+        else:
+            # Validate new_worker_agent if provided
+            self._validate_agent_compatibility(
+                new_worker_agent, "new_worker_agent"
             )
 
         if self.share_memory:
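When a custom coordinator_agent arrives with its own system message, the branch above does not discard it; the two messages are concatenated. Roughly, as an illustration of the f-string at work:

user_msg = "You are a cautious research lead."  # the user's own system message
workforce_msg = "You are coordinating a group of workers. ..."  # required instructions
combined = f"{user_msg}\n\n{workforce_msg}"  # user intent first, coordination duties second

The same pattern is applied to task_agent in this hunk, so user-supplied intent survives in both rebuilt agents.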
@@ -252,32 +469,153 @@
                 "better context continuity during task handoffs."
             )
 
-        coord_agent_sys_msg = BaseMessage.make_assistant_message(
-            role_name="Workforce Manager",
-            content="You are coordinating a group of workers. A worker can be "
-            "a group of agents or a single agent. Each worker is "
-            "created to solve a specific kind of task. Your job "
-            "includes assigning tasks to a existing worker, creating "
-            "a new worker for a task, etc.",
+        # Shared context utility for workflow management (created lazily)
+        self._shared_context_utility: Optional["ContextUtility"] = None
+
+    # ------------------------------------------------------------------
+    # Helper for propagating pause control to externally supplied agents
+    # ------------------------------------------------------------------
+
+    def _initialize_callbacks(
+        self, callbacks: Optional[List[WorkforceCallback]]
+    ) -> None:
+        r"""Validate, register, and prime workforce callbacks."""
+        self._callbacks: List[WorkforceCallback] = []
+
+        if callbacks:
+            for cb in callbacks:
+                if isinstance(cb, WorkforceCallback):
+                    self._callbacks.append(cb)
+                else:
+                    raise ValueError(
+                        "All callbacks must be instances of WorkforceCallback"
+                    )
+
+        has_metrics_callback = any(
+            isinstance(cb, WorkforceMetrics) for cb in self._callbacks
         )
-        self.coordinator_agent = ChatAgent(
-            coord_agent_sys_msg,
-            **(coordinator_agent_kwargs or {}),
+
+        if not has_metrics_callback:
+            self._callbacks.append(WorkforceLogger(workforce_id=self.node_id))
+        else:
+            logger.info(
+                "WorkforceMetrics implementation detected. Skipping default "
+                "WorkforceLogger addition."
+            )
+
+        for child in self._children:
+            self._notify_worker_created(child)
+
+    def _notify_worker_created(
+        self,
+        worker_node: BaseNode,
+        *,
+        worker_type: Optional[str] = None,
+        role: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        r"""Emit a worker-created event to all registered callbacks."""
+        event = WorkerCreatedEvent(
+            worker_id=worker_node.node_id,
+            worker_type=worker_type or type(worker_node).__name__,
+            role=role or worker_node.description,
+            metadata=metadata,
         )
+        for cb in self._callbacks:
+            cb.log_worker_created(event)
 
-        task_sys_msg = BaseMessage.make_assistant_message(
-            role_name="Task Planner",
-            content="You are going to compose and decompose tasks. Keep "
-            "tasks that are sequential and require the same type of "
-            "agent together in one agent process. Only decompose tasks "
-            "that can be handled in parallel and require different types "
-            "of agents. This ensures efficient execution by minimizing "
-            "context switching between agents.",
+    def _get_or_create_shared_context_utility(
+        self,
+        session_id: Optional[str] = None,
+    ) -> "ContextUtility":
+        r"""Get or create the shared context utility for workflow management.
+
+        This method creates the context utility only when needed, avoiding
+        unnecessary session folder creation during initialization.
+
+        Args:
+            session_id (Optional[str]): Custom session ID to use. If None,
+                auto-generates a timestamped session ID. (default: :obj:`None`)
+
+        Returns:
+            ContextUtility: The shared context utility instance.
+        """
+        if self._shared_context_utility is None:
+            from camel.utils.context_utils import ContextUtility
+
+            self._shared_context_utility = ContextUtility.get_workforce_shared(
+                session_id=session_id
+            )
+        return self._shared_context_utility
+
+    def _validate_agent_compatibility(
+        self, agent: ChatAgent, agent_context: str = "agent"
+    ) -> None:
+        r"""Validate that agent configuration is compatible with workforce
+        settings.
+
+        Args:
+            agent (ChatAgent): The agent to validate.
+            agent_context (str): Context description for error messages.
+
+        Raises:
+            ValueError: If agent has tools and stream mode enabled but
+                use_structured_output_handler is False.
+        """
+        agent_has_tools = (
+            bool(agent.tool_dict) if hasattr(agent, 'tool_dict') else False
+        )
+        agent_stream_mode = (
+            getattr(agent.model_backend, 'stream', False)
+            if hasattr(agent, 'model_backend')
+            else False
         )
-        _kwargs = dict(task_agent_kwargs or {})
-        extra_tools = TaskPlanningToolkit().get_tools()
-        _kwargs["tools"] = [*_kwargs.get("tools", []), *extra_tools]
-        self.task_agent = ChatAgent(task_sys_msg, **_kwargs)
+
+        if (
+            agent_has_tools
+            and agent_stream_mode
+            and not self.use_structured_output_handler
+        ):
+            raise ValueError(
+                f"{agent_context} has tools and stream mode enabled, but "
+                "use_structured_output_handler is False. Native structured "
+                "output doesn't work with tool calls in stream mode. "
+                "Please set use_structured_output_handler=True when creating "
+                "the Workforce."
+            )
+
+    # ------------------------------------------------------------------
+    # Helper for propagating pause control to externally supplied agents
+    # ------------------------------------------------------------------
+    def _attach_pause_event_to_agent(self, agent: ChatAgent) -> None:
+        r"""Ensure the given ChatAgent shares this workforce's pause_event.
+
+        If the agent already has a different pause_event we overwrite it and
+        emit a debug log (it is unlikely an agent needs multiple independent
+        pause controls once managed by this workforce)."""
+        try:
+            existing_pause_event = getattr(agent, "pause_event", None)
+            if existing_pause_event is not self._pause_event:
+                if existing_pause_event is not None:
+                    logger.debug(
+                        f"Overriding pause_event for agent {agent.agent_id} "
+                        f"(had different pause_event: "
+                        f"{id(existing_pause_event)} "
+                        f"-> {id(self._pause_event)})"
+                    )
+                agent.pause_event = self._pause_event
+        except AttributeError:
+            # Should not happen, but guard against unexpected objects
+            logger.warning(
+                f"Cannot attach pause_event to object {type(agent)} - "
+                f"missing pause_event attribute"
+            )
+
+    def _ensure_pause_event_in_kwargs(self, kwargs: Optional[Dict]) -> Dict:
+        r"""Insert pause_event into kwargs dict for ChatAgent construction."""
+        new_kwargs = dict(kwargs) if kwargs else {}
+        new_kwargs.setdefault("pause_event", self._pause_event)
+        return new_kwargs
 
     def __repr__(self):
         return (
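_validate_agent_compatibility above rejects exactly one configuration: a streaming model backend combined with registered tools while native structured output is selected. A sketch of the combination that raises, where streaming_model and some_tools stand in for any stream-enabled backend and tool list (hypothetical placeholders):

workforce = Workforce(
    "Research Team",
    new_worker_agent=ChatAgent(model=streaming_model, tools=some_tools),
    use_structured_output_handler=False,  # -> ValueError raised in __init__
)

Leaving use_structured_output_handler at its default of True avoids the error, at the cost of prompt-plus-regex parsing instead of the native response_format path.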
@@ -368,16 +706,35 @@ class Workforce(BaseNode):
                 continue
 
         if not memory_records:
+            logger.warning(
+                "No valid memory records could be reconstructed "
+                "for sharing"
+            )
            return
 
-        # Share with coordinator agent
+        # Filter out already-shared records to prevent re-sharing
+        # This prevents exponential growth of duplicate records
+        new_records = []
         for record in memory_records:
+            record_uuid = str(record.uuid)
+            if record_uuid not in self._shared_memory_uuids:
+                new_records.append(record)
+                self._shared_memory_uuids.add(record_uuid)
+
+        if not new_records:
+            logger.debug(
+                "No new records to share (all were already shared)"
+            )
+            return
+
+        # Share with coordinator agent
+        for record in new_records:
             # Only add records from other agents to avoid duplication
             if record.agent_id != self.coordinator_agent.agent_id:
                 self.coordinator_agent.memory.write_record(record)
 
         # Share with task agent
-        for record in memory_records:
+        for record in new_records:
             if record.agent_id != self.task_agent.agent_id:
                 self.task_agent.memory.write_record(record)
 
@@ -389,12 +746,12 @@
         ]
 
         for worker in single_agent_workers:
-            for record in memory_records:
+            for record in new_records:
                 if record.agent_id != worker.worker.agent_id:
                     worker.worker.memory.write_record(record)
 
         logger.info(
-            f"Shared {len(memory_records)} memory records across "
+            f"Shared {len(new_records)} new memory records across "
             f"{len(single_agent_workers) + 2} agents in workforce "
             f"{self.node_id}"
         )
@@ -413,25 +770,473 @@
         except Exception as e:
             logger.warning(f"Error synchronizing shared memory: {e}")
 
-    def _decompose_task(self, task: Task) -> List[Task]:
+    def _update_dependencies_for_decomposition(
+        self, original_task: Task, subtasks: List[Task]
+    ) -> None:
+        r"""Update dependency tracking when a task is decomposed into subtasks.
+        Tasks that depended on the original task should now depend on all
+        subtasks. The last subtask inherits the original task's dependencies.
+        """
+        if not subtasks:
+            return
+
+        original_task_id = original_task.id
+        subtask_ids = [subtask.id for subtask in subtasks]
+
+        # Find tasks that depend on the original task
+        dependent_task_ids = [
+            task_id
+            for task_id, deps in self._task_dependencies.items()
+            if original_task_id in deps
+        ]
+
+        # Update dependent tasks to depend on all subtasks
+        for task_id in dependent_task_ids:
+            dependencies = self._task_dependencies[task_id]
+            dependencies.remove(original_task_id)
+            dependencies.extend(subtask_ids)
+
+        # The last subtask inherits original task's dependencies (if any)
+        if original_task_id in self._task_dependencies:
+            original_dependencies = self._task_dependencies[original_task_id]
+            if original_dependencies:
+                # Set dependencies for the last subtask to maintain execution
+                # order
+                self._task_dependencies[subtask_ids[-1]] = (
+                    original_dependencies.copy()
+                )
+            # Remove original task dependencies as it's now decomposed
+            del self._task_dependencies[original_task_id]
+
+    def _increment_in_flight_tasks(self, task_id: str) -> None:
+        r"""Safely increment the in-flight tasks counter with logging."""
+        self._in_flight_tasks += 1
+        logger.debug(
+            f"Incremented in-flight tasks for {task_id}. "
+            f"Count: {self._in_flight_tasks}"
+        )
+
+    def _decrement_in_flight_tasks(
+        self, task_id: str, context: str = ""
+    ) -> None:
+        r"""Safely decrement the in-flight tasks counter with safety checks."""
+        if self._in_flight_tasks > 0:
+            self._in_flight_tasks -= 1
+            logger.debug(
+                f"Decremented in-flight tasks for {task_id} ({context}). "
+                f"Count: {self._in_flight_tasks}"
+            )
+        else:
+            logger.debug(
+                f"Attempted to decrement in-flight tasks for {task_id} "
+                f"({context}) but counter is already 0. "
+                f"Counter: {self._in_flight_tasks}"
+            )
+
+    def _cleanup_task_tracking(self, task_id: str) -> None:
+        r"""Clean up tracking data for a task to prevent memory leaks.
+
+        Args:
+            task_id (str): The ID of the task to clean up.
+        """
+        if task_id in self._task_start_times:
+            del self._task_start_times[task_id]
+
+        if task_id in self._task_dependencies:
+            del self._task_dependencies[task_id]
+
+        if task_id in self._assignees:
+            del self._assignees[task_id]
+
+    def _decompose_task(
+        self, task: Task
+    ) -> Union[List[Task], Generator[List[Task], None, None]]:
         r"""Decompose the task into subtasks. This method will also set the
         relationship between the task and its subtasks.
 
         Returns:
-            List[Task]: The subtasks.
+            Union[List[Task], Generator[List[Task], None, None]]:
+                The subtasks or generator of subtasks.
         """
-        decompose_prompt = WF_TASK_DECOMPOSE_PROMPT.format(
-            content=task.content,
-            child_nodes_info=self._get_child_nodes_info(),
-            additional_info=task.additional_info,
+        decompose_prompt = str(
+            TASK_DECOMPOSE_PROMPT.format(
+                content=task.content,
+                child_nodes_info=self._get_child_nodes_info(),
+                additional_info=task.additional_info,
+            )
         )
         self.task_agent.reset()
-        subtasks = task.decompose(self.task_agent, decompose_prompt)
-        task.subtasks = subtasks
-        for subtask in subtasks:
-            subtask.parent = task
+        result = task.decompose(self.task_agent, decompose_prompt)
+
+        # Handle both streaming and non-streaming results
+        if isinstance(result, Generator):
+            # This is a generator (streaming mode)
+            def streaming_with_dependencies():
+                all_subtasks = []
+                for new_tasks in result:
+                    all_subtasks.extend(new_tasks)
+                    # Update dependency tracking for each batch of new tasks
+                    if new_tasks:
+                        self._update_dependencies_for_decomposition(
+                            task, all_subtasks
+                        )
+                    yield new_tasks
 
-        return subtasks
+            return streaming_with_dependencies()
+        else:
+            # This is a regular list (non-streaming mode)
+            subtasks = result
+            # Update dependency tracking for decomposed task
+            if subtasks:
+                self._update_dependencies_for_decomposition(task, subtasks)
+            return subtasks
+
+    def _analyze_task(
+        self,
+        task: Task,
+        *,
+        for_failure: bool,
+        error_message: Optional[str] = None,
+    ) -> TaskAnalysisResult:
+        r"""Unified task analysis for both failures and quality evaluation.
+
+        This method consolidates the logic for analyzing task failures and
+        evaluating task quality, using the unified TASK_ANALYSIS_PROMPT.
+
+        Args:
+            task (Task): The task to analyze
+            for_failure (bool): True for failure analysis, False for quality
+                evaluation
+            error_message (Optional[str]): Error message, required when
+                for_failure=True
+
+        Returns:
+            TaskAnalysisResult: Unified analysis result with recovery strategy
+                and optional quality metrics
+
+        Raises:
+            ValueError: If for_failure=True but error_message is None
+        """
+        # Validate required parameters
+        if for_failure and error_message is None:
+            raise ValueError("error_message is required when for_failure=True")
+
+        # Determine task result and issue-specific analysis based on context
+        if for_failure:
+            task_result = "N/A (task failed)"
+            issue_type = "Task Failure"
+            issue_analysis = f"**Error Message:** {error_message}"
+            response_format = FAILURE_ANALYSIS_RESPONSE_FORMAT
+            result_schema = TaskAnalysisResult
+            fallback_values: Dict[str, Any] = {
+                "reasoning": "Defaulting to retry due to parsing error",
+                "recovery_strategy": RecoveryStrategy.RETRY,
+                "modified_task_content": None,
+                "issues": [error_message] if error_message else [],
+            }
+            examples: List[Dict[str, Any]] = [
+                {
+                    "reasoning": "Temporary network error, worth retrying",
+                    "recovery_strategy": "retry",
+                    "modified_task_content": None,
+                    "issues": ["Network timeout"],
+                }
+            ]
+        else:
+            # Quality evaluation
+            task_result = task.result or "No result available"
+            issue_type = "Quality Evaluation"
+            issue_analysis = (
+                "Provide a quality score (0-100) and list any specific "
+                "issues found."
+            )
+            response_format = QUALITY_EVALUATION_RESPONSE_FORMAT
+            result_schema = TaskAnalysisResult
+            fallback_values = {
+                "reasoning": (
+                    "Defaulting to acceptable quality due to parsing error"
+                ),
+                "issues": [],
+                "recovery_strategy": None,
+                "modified_task_content": None,
+                "quality_score": 80,
+            }
+            examples = [
+                {
+                    "reasoning": (
+                        "Excellent implementation with comprehensive tests"
+                    ),
+                    "issues": [],
+                    "recovery_strategy": None,
+                    "modified_task_content": None,
+                    "quality_score": 98,
+                },
+                {
+                    "reasoning": (
+                        "Implementation incomplete with missing features"
+                    ),
+                    "issues": [
+                        "Incomplete implementation",
+                        "Missing error handling",
+                    ],
+                    "recovery_strategy": "replan",
+                    "modified_task_content": (
+                        "Previous attempt was incomplete. "
+                        "Please implement with: 1) Full feature "
+                        "coverage, 2) Proper error handling"
+                    ),
+                    "quality_score": 45,
+                },
+            ]
+
+        # Format the unified analysis prompt
+        analysis_prompt = str(
+            TASK_ANALYSIS_PROMPT.format(
+                task_id=task.id,
+                task_content=task.content,
+                task_result=task_result,
+                failure_count=task.failure_count,
+                task_depth=task.get_depth(),
+                assigned_worker=task.assigned_worker_id or "unknown",
+                issue_type=issue_type,
+                issue_specific_analysis=issue_analysis,
+                response_format=response_format,
+            )
+        )
+
+        try:
+            if self.use_structured_output_handler:
+                enhanced_prompt = (
+                    self.structured_handler.generate_structured_prompt(
+                        base_prompt=analysis_prompt,
+                        schema=result_schema,
+                        examples=examples,
+                    )
+                )
+
+                self.task_agent.reset()
+                response = self.task_agent.step(enhanced_prompt)
+
+                result = self.structured_handler.parse_structured_response(
+                    response.msg.content if response.msg else "",
+                    schema=result_schema,
+                    fallback_values=fallback_values,
+                )
+
+                if isinstance(result, TaskAnalysisResult):
+                    return result
+                elif isinstance(result, dict):
+                    return result_schema(**result)
+                else:
+                    # Fallback based on context
+                    return TaskAnalysisResult(**fallback_values)
+            else:
+                self.task_agent.reset()
+                response = self.task_agent.step(
+                    analysis_prompt, response_format=result_schema
+                )
+                return response.msg.parsed
+
+        except Exception as e:
+            logger.warning(
+                f"Error during task analysis "
+                f"({'failure' if for_failure else 'quality'}): {e}, "
+                f"using fallback"
+            )
+            return TaskAnalysisResult(**fallback_values)
+
+    async def _apply_recovery_strategy(
+        self,
+        task: Task,
+        recovery_decision: TaskAnalysisResult,
+    ) -> bool:
+        r"""Apply the recovery strategy from a task analysis result.
+
+        This method centralizes the recovery logic for both execution failures
+        and quality-based failures.
+
+        Args:
+            task (Task): The task that needs recovery
+            recovery_decision (TaskAnalysisResult): The analysis result with
+                recovery strategy
+
+        Returns:
+            bool: True if workforce should halt (e.g., decompose needs
+                different handling), False otherwise
+        """
+        strategy = (
+            recovery_decision.recovery_strategy or RecoveryStrategy.RETRY
+        )
+        action_taken = ""
+
+        try:
+            if strategy == RecoveryStrategy.RETRY:
+                # Simply retry the task by reposting it to the same worker
+                # Check both _assignees dict and task.assigned_worker_id
+                assignee_id = (
+                    self._assignees.get(task.id) or task.assigned_worker_id
+                )
+
+                if assignee_id:
+                    # Retry with the same worker - no coordinator call needed
+                    await self._post_task(task, assignee_id)
+                    action_taken = f"retried with same worker {assignee_id}"
+                    logger.info(
+                        f"Task {task.id} retrying with same worker "
+                        f"{assignee_id} (no coordinator call)"
+                    )
+                else:
+                    # No previous assignment exists - find a new assignee
+                    logger.info(
+                        f"Task {task.id} has no previous assignee, "
+                        f"calling coordinator"
+                    )
+                    batch_result = await self._find_assignee([task])
+                    assignment = batch_result.assignments[0]
+                    self._assignees[task.id] = assignment.assignee_id
+                    await self._post_task(task, assignment.assignee_id)
+                    action_taken = (
+                        f"retried with new worker {assignment.assignee_id}"
+                    )
+
+            elif strategy == RecoveryStrategy.REPLAN:
+                # Modify the task content and retry
+                if recovery_decision.modified_task_content:
+                    task.content = recovery_decision.modified_task_content
+                    logger.info(f"Task {task.id} content modified for replan")
+
+                # Repost the modified task
+                if task.id in self._assignees:
+                    assignee_id = self._assignees[task.id]
+                    await self._post_task(task, assignee_id)
+                    action_taken = (
+                        f"replanned and retried with worker {assignee_id}"
+                    )
+                else:
+                    # Find a new assignee for the replanned task
+                    batch_result = await self._find_assignee([task])
+                    assignment = batch_result.assignments[0]
+                    self._assignees[task.id] = assignment.assignee_id
+                    await self._post_task(task, assignment.assignee_id)
+                    action_taken = (
+                        f"replanned and assigned to "
+                        f"worker {assignment.assignee_id}"
+                    )
+
+            elif strategy == RecoveryStrategy.REASSIGN:
+                # Reassign to a different worker
+                old_worker = task.assigned_worker_id
+                logger.info(
+                    f"Task {task.id} will be reassigned from worker "
+                    f"{old_worker}"
+                )
+
+                # Find a different worker
+                batch_result = await self._find_assignee([task])
+                assignment = batch_result.assignments[0]
+                new_worker = assignment.assignee_id
+
+                # If same worker, force find another
+                if new_worker == old_worker and len(self._children) > 1:
+                    logger.info("Same worker selected, finding alternative")
+                    # Try to find different worker by adding note to
+                    # task content
+                    task.content = (
+                        f"{task.content}\n\n"
+                        f"Note: Previous worker {old_worker} had quality "
+                        f"issues. Needs different approach."
+                    )
+                    batch_result = await self._find_assignee([task])
+                    assignment = batch_result.assignments[0]
+                    new_worker = assignment.assignee_id
+
+                self._assignees[task.id] = new_worker
+                await self._post_task(task, new_worker)
+                action_taken = f"reassigned from {old_worker} to {new_worker}"
+                logger.info(
+                    f"Task {task.id} reassigned from {old_worker} to "
+                    f"{new_worker}"
+                )
+
+            elif strategy == RecoveryStrategy.DECOMPOSE:
+                # Decompose the task into subtasks
+                reason = (
+                    "failure"
+                    if not recovery_decision.is_quality_evaluation
+                    else "quality issues"
+                )
+                logger.info(
+                    f"Task {task.id} will be decomposed due to {reason}"
+                )
+                subtasks_result = self._decompose_task(task)
+
+                # Handle both streaming and non-streaming results
+                if isinstance(subtasks_result, Generator):
+                    subtasks = []
+                    for new_tasks in subtasks_result:
+                        subtasks.extend(new_tasks)
+                else:
+                    subtasks = subtasks_result
+
+                if subtasks:
+                    task_decomposed_event = TaskDecomposedEvent(
+                        parent_task_id=task.id,
+                        subtask_ids=[st.id for st in subtasks],
+                    )
+                    for cb in self._callbacks:
+                        cb.log_task_decomposed(task_decomposed_event)
+                    for subtask in subtasks:
+                        task_created_event = TaskCreatedEvent(
+                            task_id=subtask.id,
+                            description=subtask.content,
+                            parent_task_id=task.id,
+                            task_type=subtask.type,
+                            metadata=subtask.additional_info,
+                        )
+                        for cb in self._callbacks:
+                            cb.log_task_created(task_created_event)
+
+                # Insert subtasks at the head of the queue
+                self._pending_tasks.extendleft(reversed(subtasks))
+                await self._post_ready_tasks()
+                action_taken = f"decomposed into {len(subtasks)} subtasks"
+
+                logger.info(
+                    f"Task {task.id} decomposed into {len(subtasks)} subtasks"
+                )
+
+                # Sync shared memory after task decomposition
+                if self.share_memory:
+                    logger.info(
+                        f"Syncing shared memory after task {task.id} "
+                        f"decomposition"
+                    )
+                    self._sync_shared_memory()
+
+                # For decompose, we return early with special handling
+                return True
+
+            elif strategy == RecoveryStrategy.CREATE_WORKER:
+                assignee = await self._create_worker_node_for_task(task)
+                await self._post_task(task, assignee.node_id)
+                action_taken = (
+                    f"created new worker {assignee.node_id} and assigned "
+                    f"task {task.id} to it"
+                )
+
+        except Exception as e:
+            logger.error(
+                f"Recovery strategy {strategy} failed for task {task.id}: {e}",
+                exc_info=True,
+            )
+            raise
+
+        logger.debug(
+            f"Task {task.id} recovery: {action_taken}. "
+            f"Strategy: {strategy.value}"
+        )
+
+        return False
 
     # Human intervention methods
     async def _async_pause(self) -> None:
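The recovery dispatch above covers five RecoveryStrategy members: RETRY, REPLAN, REASSIGN, DECOMPOSE, and CREATE_WORKER, imported from camel.societies.workforce.utils. The enum's actual definition is not part of this diff; a plausible shape, inferred from the "retry"/"replan" string literals in the example payloads and the strategy.value logging above (the remaining string values are assumptions):

from enum import Enum

class RecoveryStrategy(str, Enum):
    RETRY = "retry"                  # repost to the same worker when possible
    REPLAN = "replan"                # rewrite task content, then repost
    REASSIGN = "reassign"            # move the task to a different worker
    DECOMPOSE = "decompose"          # split into subtasks and re-queue them
    CREATE_WORKER = "create_worker"  # spin up a new worker for the task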
@@ -522,12 +1327,45 @@ class Workforce(BaseNode):
522
1327
  f"(event-loop not yet started)."
523
1328
  )
524
1329
 
525
- def save_snapshot(self, description: str = "") -> None:
526
- r"""Save current state as a snapshot."""
527
- snapshot = WorkforceSnapshot(
528
- main_task=self._task,
529
- pending_tasks=self._pending_tasks,
530
- completed_tasks=self._completed_tasks,
1330
+ async def _async_skip_gracefully(self) -> None:
1331
+ r"""Async implementation of skip_gracefully to run on the event
1332
+ loop.
1333
+ """
1334
+ self._skip_requested = True
1335
+ if self._pause_event.is_set() is False:
1336
+ self._pause_event.set() # Resume if paused to process skip
1337
+ logger.info(f"Workforce {self.node_id} skip requested.")
1338
+
1339
+ def skip_gracefully(self) -> None:
1340
+ r"""Request workforce to skip current pending tasks and move to next
1341
+ main task from the queue. If no main tasks exist, acts like
1342
+ stop_gracefully.
1343
+
1344
+ This method clears the current pending subtasks and moves to the next
1345
+ main task in the queue if available. Works both when the internal
1346
+ event-loop is alive and when it has not yet been started.
1347
+ """
1348
+
1349
+ if self._loop and not self._loop.is_closed():
1350
+ self._submit_coro_to_loop(self._async_skip_gracefully())
1351
+ else:
1352
+ # Loop not yet created, set the flag synchronously so later
1353
+ # startup will respect it.
1354
+ self._skip_requested = True
1355
+ # Ensure any pending pause is released so that when the loop does
1356
+ # start it can see the skip request and exit.
1357
+ self._pause_event.set()
1358
+ logger.info(
1359
+ f"Workforce {self.node_id} skip requested "
1360
+ f"(event-loop not yet started)."
1361
+ )
1362
+
1363
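A minimal usage sketch for the new skip control (assumes a `Workforce` instance `wf` that is already processing queued main tasks; the deadline value is illustrative):

    import asyncio

    async def supervise(wf, deadline_s: float = 300.0) -> None:
        # Let the current main task run for at most `deadline_s` seconds,
        # then drop its remaining subtasks and advance to the next main
        # task; with an empty queue this behaves like stop_gracefully().
        await asyncio.sleep(deadline_s)
        wf.skip_gracefully()  # safe whether or not the loop has started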
+ def save_snapshot(self, description: str = "") -> None:
+ r"""Save current state as a snapshot."""
+ snapshot = WorkforceSnapshot(
+ main_task=self._task,
+ pending_tasks=self._pending_tasks,
+ completed_tasks=self._completed_tasks,
  task_dependencies=self._task_dependencies,
  assignees=self._assignees,
  current_task_index=len(self._completed_tasks),
@@ -564,7 +1402,7 @@ class Workforce(BaseNode):
  if not validate_task_content(new_content, task_id):
  logger.warning(
  f"Task {task_id} content modification rejected: "
- f"Invalid content. Content preview: '{new_content[:50]}...'"
+ f"Invalid content. Content preview: '{new_content}'"
  )
  return False
 
@@ -576,41 +1414,167 @@ class Workforce(BaseNode):
  logger.warning(f"Task {task_id} not found in pending tasks.")
  return False
 
+ def get_main_task_queue(self) -> List[Task]:
+ r"""Get the current main task queue for human review.
+
+ Returns:
+ List[Task]: List of main tasks waiting to be decomposed
+ and executed.
+ """
+ # Return tasks from the pending queue that need decomposition
+ return [
+ t
+ for t in self._pending_tasks
+ if t.additional_info
+ and t.additional_info.get('_needs_decomposition')
+ ]
+
  def add_task(
  self,
  content: str,
  task_id: Optional[str] = None,
  additional_info: Optional[Dict[str, Any]] = None,
+ as_subtask: bool = False,
  insert_position: int = -1,
  ) -> Task:
- r"""Add a new task to the pending queue."""
- new_task = Task(
+ r"""Add a new task to the workforce.
+
+ By default, this method adds a main task that will be decomposed into
+ subtasks. Set `as_subtask=True` to add a task directly to the pending
+ subtask queue without decomposition.
+
+ Args:
+ content (str): The content of the task.
+ task_id (Optional[str], optional): Optional ID for the task.
+ If not provided, a unique ID will be generated.
+ additional_info (Optional[Dict[str, Any]], optional): Optional
+ additional metadata for the task.
+ as_subtask (bool, optional): If True, adds the task directly to
+ the pending subtask queue. If False, adds it as a main task
+ that will be decomposed. Defaults to False.
+ insert_position (int, optional): Position to insert the task in
+ the pending queue. Only applies when as_subtask=True.
+ Defaults to -1 (append to end).
+
+ Returns:
+ Task: The created task object.
+ """
+ if as_subtask:
+ new_task = Task(
+ content=content,
+ id=task_id or f"human_added_{len(self._pending_tasks)}",
+ additional_info=additional_info,
+ )
+
+ # Add directly to current pending subtasks
+ if insert_position == -1:
+ self._pending_tasks.append(new_task)
+ else:
+ # Convert deque to list, insert, then back to deque
+ tasks_list = list(self._pending_tasks)
+ tasks_list.insert(insert_position, new_task)
+ self._pending_tasks = deque(tasks_list)
+
+ logger.info(f"New subtask added to pending queue: {new_task.id}")
+ return new_task
+ else:
+ # Add as a main task that needs decomposition.
+ # Use additional_info to mark that this task needs decomposition;
+ # make a copy to avoid modifying the user's dict.
+ info = additional_info.copy() if additional_info else {}
+ info['_needs_decomposition'] = True
+
+ task_count = sum(
+ 1
+ for t in self._pending_tasks
+ if t.additional_info
+ and t.additional_info.get('_needs_decomposition')
+ )
+
+ new_task = Task(
+ content=content,
+ id=task_id or f"main_task_{task_count}",
+ additional_info=info,
+ )
+
+ self._pending_tasks.append(new_task)
+ logger.info(f"New main task added to pending queue: {new_task.id}")
+ return new_task
+
+ def add_main_task(
+ self,
+ content: str,
+ task_id: Optional[str] = None,
+ additional_info: Optional[Dict[str, Any]] = None,
+ ) -> Task:
+ r"""Add a new main task that will be decomposed into subtasks.
+
+ This is an alias for :meth:`add_task` with `as_subtask=False`.
+
+ Args:
+ content (str): The content of the main task.
+ task_id (Optional[str], optional): Optional ID for the task.
+ additional_info (Optional[Dict[str, Any]], optional): Optional
+ additional metadata.
+
+ Returns:
+ Task: The created main task object.
+ """
+ return self.add_task(
  content=content,
- id=task_id or f"human_added_{len(self._pending_tasks)}",
+ task_id=task_id,
  additional_info=additional_info,
+ as_subtask=False,
  )
- if insert_position == -1:
- self._pending_tasks.append(new_task)
- else:
- # Convert deque to list, insert, then back to deque
- tasks_list = list(self._pending_tasks)
- tasks_list.insert(insert_position, new_task)
- self._pending_tasks = deque(tasks_list)
 
- logger.info(f"New task added: {new_task.id}")
- return new_task
+ def add_subtask(
+ self,
+ content: str,
+ task_id: Optional[str] = None,
+ additional_info: Optional[Dict[str, Any]] = None,
+ insert_position: int = -1,
+ ) -> Task:
+ r"""Add a new subtask to the current pending queue.
+
+ This is an alias for :meth:`add_task` with `as_subtask=True`.
+
+ Args:
+ content (str): The content of the subtask.
+ task_id (Optional[str], optional): Optional ID for the task.
+ additional_info (Optional[Dict[str, Any]], optional): Optional
+ additional metadata.
+ insert_position (int, optional): Position to insert the task.
+ Defaults to -1 (append to end).
+
+ Returns:
+ Task: The created subtask object.
+ """
+ return self.add_task(
+ content=content,
+ task_id=task_id,
+ additional_info=additional_info,
+ as_subtask=True,
+ insert_position=insert_position,
+ )
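A usage sketch for the queueing API above (assumes a constructed `Workforce` instance `wf`; the task contents are illustrative, and the IDs shown are the defaults generated by `add_task`):

    # Main tasks are decomposed before execution; subtasks bypass
    # decomposition and enter the pending queue directly.
    main = wf.add_main_task(content="Write a market report on EV batteries")
    sub = wf.add_subtask(content="Collect 2024 sales figures",
                         insert_position=0)

    # Main tasks are tagged via additional_info['_needs_decomposition'],
    # which is exactly what get_main_task_queue() filters on.
    assert main.additional_info['_needs_decomposition'] is True
    print([t.id for t in wf.get_main_task_queue()])  # e.g. ['main_task_0']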
 
  def remove_task(self, task_id: str) -> bool:
- r"""Remove a task from the pending queue."""
- # Convert to list to find and remove
- tasks_list = list(self._pending_tasks)
- for i, task in enumerate(tasks_list):
+ r"""Remove a task from the pending queue or main task queue.
+
+ Args:
+ task_id (str): The ID of the task to remove.
+
+ Returns:
+ bool: True if the task was found and removed, False otherwise.
+ """
+ # Check main task queue first
+ pending_tasks_list = list(self._pending_tasks)
+ for i, task in enumerate(pending_tasks_list):
  if task.id == task_id:
- tasks_list.pop(i)
- self._pending_tasks = deque(tasks_list)
- logger.info(f"Task {task_id} removed.")
+ pending_tasks_list.pop(i)
+ self._pending_tasks = deque(pending_tasks_list)
+ logger.info(f"Task {task_id} removed from pending queue.")
  return True
- logger.warning(f"Task {task_id} not found in pending tasks.")
+
+ logger.warning(f"Task {task_id} not found in any task queue.")
  return False
 
  def reorder_tasks(self, task_ids: List[str]) -> bool:
@@ -619,8 +1583,13 @@ class Workforce(BaseNode):
  tasks_dict = {task.id: task for task in self._pending_tasks}
 
  # Check if all provided IDs exist
- if not all(task_id in tasks_dict for task_id in task_ids):
- logger.warning("Some task IDs not found in pending tasks.")
+ invalid_ids = [
+ task_id for task_id in task_ids if task_id not in tasks_dict
+ ]
+ if invalid_ids:
+ logger.warning(
+ f"Task IDs not found in pending tasks: {invalid_ids}"
+ )
  return False
 
  # Check if we have the same number of tasks
@@ -671,8 +1640,8 @@ class Workforce(BaseNode):
  # Reset state for tasks being moved back to pending
  for task in tasks_to_move_back:
  # Handle all possible task states
- if task.state in [TaskState.DONE, TaskState.FAILED]:
- task.state = TaskState.OPEN
+ if task.state in [TaskState.DONE, TaskState.OPEN]:
+ task.state = TaskState.FAILED # TODO: Add logic for OPEN
  # Clear result to avoid confusion
  task.result = None
  # Reset failure count to give task a fresh start
@@ -720,68 +1689,111 @@ class Workforce(BaseNode):
  "main_task_id": self._task.id if self._task else None,
  }
 
- @check_if_running(False)
- async def process_task_async(
- self, task: Task, interactive: bool = False
- ) -> Task:
- r"""Main entry point to process a task asynchronously.
+ async def handle_decompose_append_task(
+ self, task: Task, reset: bool = True
+ ) -> List[Task]:
+ r"""Handle task validation and decomposition with the workforce
+ environment functions, then append the result to the pending
+ tasks queue if decomposition happened.
 
  Args:
  task (Task): The task to be processed.
- interactive (bool, optional): If True, enables human-intervention
- workflow (pause/resume/snapshot). Defaults to False, which
- runs the task in a blocking one-shot manner.
+ reset (bool, optional): Whether to reset the workforce before
+ handling the task; the workforce must not be running.
+ Defaults to True.
 
  Returns:
- Task: The updated task.
+ List[Task]: The decomposed subtasks or the original task.
  """
- # Delegate to intervention pipeline when requested to keep
- # backward-compat.
- if interactive:
- return await self._process_task_with_snapshot(task)
-
  if not validate_task_content(task.content, task.id):
  task.state = TaskState.FAILED
  task.result = "Task failed: Invalid or empty content provided"
  logger.warning(
  f"Task {task.id} rejected: Invalid or empty content. "
- f"Content preview: '{task.content[:50]}...'"
+ f"Content preview: '{task.content}'"
  )
- return task
+ return [task]
+
+ if reset and self._state != WorkforceState.RUNNING:
+ self.reset()
+ logger.info("Workforce reset before handling task.")
 
- self.reset()
+ # Focus on the new task
  self._task = task
- if self.metrics_logger:
- self.metrics_logger.log_task_created(
- task_id=task.id,
- description=task.content,
- task_type=task.type,
- metadata=task.additional_info,
- )
  task.state = TaskState.FAILED
+
+ task_created_event = TaskCreatedEvent(
+ task_id=task.id,
+ description=task.content,
+ task_type=task.type,
+ metadata=task.additional_info,
+ )
+ for cb in self._callbacks:
+ cb.log_task_created(task_created_event)
+
  # The agent tends to be overconfident on the whole task, so we
  # decompose the task into subtasks first
- subtasks = self._decompose_task(task)
- if self.metrics_logger and subtasks:
- self.metrics_logger.log_task_decomposed(
- parent_task_id=task.id, subtask_ids=[st.id for st in subtasks]
+ subtasks_result = self._decompose_task(task)
+
+ # Handle both streaming and non-streaming results
+ if isinstance(subtasks_result, Generator):
+ # This is a generator (streaming mode)
+ subtasks = []
+ for new_tasks in subtasks_result:
+ subtasks.extend(new_tasks)
+ else:
+ # This is a regular list (non-streaming mode)
+ subtasks = subtasks_result
+ if subtasks:
+ task_decomposed_event = TaskDecomposedEvent(
+ parent_task_id=task.id,
+ subtask_ids=[st.id for st in subtasks],
  )
+ for cb in self._callbacks:
+ cb.log_task_decomposed(task_decomposed_event)
  for subtask in subtasks:
- self.metrics_logger.log_task_created(
+ task_created_event = TaskCreatedEvent(
  task_id=subtask.id,
  description=subtask.content,
  parent_task_id=task.id,
  task_type=subtask.type,
  metadata=subtask.additional_info,
  )
+ for cb in self._callbacks:
+ cb.log_task_created(task_created_event)
+
  if subtasks:
- # If decomposition happened, the original task becomes a container.
- # We only execute its subtasks.
+ # _pending_tasks will contain both undecomposed
+ # and decomposed tasks, so we use additional_info
+ # to mark the tasks that need decomposition instead
  self._pending_tasks.extendleft(reversed(subtasks))
  else:
  # If no decomposition, execute the original task.
  self._pending_tasks.append(task)
 
+ return subtasks
+
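A sketch of calling the new helper directly (hedged: what gets printed depends on the task agent's decomposition; `wf` is an idle `Workforce` inside an async context):

    from camel.tasks import Task

    task = Task(content="Summarize the quarterly results", id="q3")
    subtasks = await wf.handle_decompose_append_task(task, reset=True)
    # Invalid content -> the original task comes back FAILED in a
    # one-element list; otherwise the decomposed subtasks are returned
    # and have already been pushed onto the front of the pending queue.
    print([t.id for t in subtasks])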
+ @check_if_running(False)
+ async def process_task_async(
+ self, task: Task, interactive: bool = False
+ ) -> Task:
+ r"""Main entry point to process a task asynchronously.
+
+ Args:
+ task (Task): The task to be processed.
+ interactive (bool, optional): If True, enables human-intervention
+ workflow (pause/resume/snapshot). Defaults to False, which
+ runs the task in a blocking one-shot manner.
+
+ Returns:
+ Task: The updated task.
+ """
+ # Delegate to intervention pipeline when requested to keep
+ # backward-compat.
+ if interactive:
+ return await self._process_task_with_snapshot(task)
+
+ subtasks = await self.handle_decompose_append_task(task)
+
  self.set_channel(TaskChannel())
 
  await self.start()
@@ -818,14 +1830,19 @@ class Workforce(BaseNode):
  needed
  >>> print(result.result)
  """
- import asyncio
- import concurrent.futures
-
  # Check if we're already in an event loop
  try:
- asyncio.get_running_loop()
+ current_loop = asyncio.get_running_loop()
+ # Store the current loop for potential reuse by async tools
+ self._loop = current_loop
+
+ logger.info(
+ "Running in active event loop context. "
+ "Consider using process_task_async() directly for better "
+ "async tool compatibility."
+ )
 
- # If we're in an event loop, we need to run in a thread
+ # Create a new thread with a fresh event loop
  def run_in_thread():
  # Create new event loop for this thread
  new_loop = asyncio.new_event_loop()
@@ -836,6 +1853,8 @@ class Workforce(BaseNode):
  )
  finally:
  new_loop.close()
+ # Restore original loop reference
+ self._loop = current_loop
 
  with concurrent.futures.ThreadPoolExecutor() as executor:
  future = executor.submit(run_in_thread)
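The hunk above swaps the bare `asyncio.get_running_loop()` probe for a loop-aware fallback that blocks on a worker thread. A self-contained sketch of the same run-in-a-thread pattern (illustrative `run_sync` name, not the package API):

    import asyncio
    import concurrent.futures

    def run_sync(coro):
        # If no loop is running, asyncio.run() suffices. Inside a
        # running loop, block on a worker thread that owns a fresh
        # event loop instead of nesting loops.
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            return asyncio.run(coro)
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, coro).result()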
@@ -856,24 +1875,8 @@ class Workforce(BaseNode):
856
1875
  Task: The updated task.
857
1876
  """
858
1877
 
859
- if not validate_task_content(task.content, task.id):
860
- task.state = TaskState.FAILED
861
- task.result = "Task failed: Invalid or empty content provided"
862
- logger.warning(
863
- f"Task {task.id} rejected: Invalid or empty content. "
864
- f"Content preview: '{task.content[:50]}...'"
865
- )
866
- return task
867
-
868
- self.reset()
869
- self._task = task
870
- self._state = WorkforceState.RUNNING
871
- task.state = TaskState.OPEN
872
- self._pending_tasks.append(task)
1878
+ await self.handle_decompose_append_task(task)
873
1879
 
874
- # Decompose the task into subtasks first
875
- subtasks = self._decompose_task(task)
876
- self._pending_tasks.extendleft(reversed(subtasks))
877
1880
  self.set_channel(TaskChannel())
878
1881
 
879
1882
  # Save initial snapshot
@@ -980,30 +1983,107 @@ class Workforce(BaseNode):
980
1983
 
981
1984
  return self._task
982
1985
 
983
- @check_if_running(False)
1986
+ def _start_child_node_when_paused(
1987
+ self, start_coroutine: Coroutine
1988
+ ) -> None:
1989
+ r"""Helper to start a child node when workforce is paused.
1990
+
1991
+ Args:
1992
+ start_coroutine: The coroutine to start (e.g., worker_node.start())
1993
+ """
1994
+ if self._state == WorkforceState.PAUSED and hasattr(
1995
+ self, '_child_listening_tasks'
1996
+ ):
1997
+ if self._loop and not self._loop.is_closed():
1998
+ # Use thread-safe coroutine execution for dynamic addition
1999
+ child_task: Union[asyncio.Task, concurrent.futures.Future]
2000
+ try:
2001
+ # Check if we're in the same thread as the loop
2002
+ current_loop = asyncio.get_running_loop()
2003
+ if current_loop is self._loop:
2004
+ # Same loop context - use create_task
2005
+ child_task = self._loop.create_task(start_coroutine)
2006
+ else:
2007
+ # Different loop context - use thread-safe approach
2008
+ child_task = asyncio.run_coroutine_threadsafe(
2009
+ start_coroutine, self._loop
2010
+ )
2011
+ except RuntimeError:
2012
+ # No running loop in current thread - use thread-safe
2013
+ # approach
2014
+ child_task = asyncio.run_coroutine_threadsafe(
2015
+ start_coroutine, self._loop
2016
+ )
2017
+ self._child_listening_tasks.append(child_task)
2018
+ else:
2019
+ # Close the coroutine to prevent RuntimeWarning
2020
+ start_coroutine.close()
2021
+ else:
2022
+ # Close the coroutine to prevent RuntimeWarning
2023
+ start_coroutine.close()
2024
+
984
2025
  def add_single_agent_worker(
985
- self, description: str, worker: ChatAgent
2026
+ self,
2027
+ description: str,
2028
+ worker: ChatAgent,
2029
+ pool_max_size: int = DEFAULT_WORKER_POOL_SIZE,
2030
+ enable_workflow_memory: bool = False,
986
2031
  ) -> Workforce:
987
2032
  r"""Add a worker node to the workforce that uses a single agent.
2033
+ Can be called when workforce is paused to dynamically add workers.
988
2034
 
989
2035
  Args:
990
2036
  description (str): Description of the worker node.
991
2037
  worker (ChatAgent): The agent to be added.
2038
+ pool_max_size (int): Maximum size of the agent pool.
2039
+ (default: :obj:`10`)
2040
+ enable_workflow_memory (bool): Whether to enable workflow memory
2041
+ accumulation. Set to True if you plan to call
2042
+ save_workflow_memories(). (default: :obj:`False`)
992
2043
 
993
2044
  Returns:
994
2045
  Workforce: The workforce node itself.
2046
+
2047
+ Raises:
2048
+ RuntimeError: If called while workforce is running (not paused).
2049
+ ValueError: If worker has tools and stream mode enabled but
2050
+ use_structured_output_handler is False.
995
2051
  """
996
- worker_node = SingleAgentWorker(description, worker)
997
- self._children.append(worker_node)
998
- if self.metrics_logger:
999
- self.metrics_logger.log_worker_created(
1000
- worker_id=worker_node.node_id,
1001
- worker_type='SingleAgentWorker',
1002
- role=worker_node.description,
2052
+ if self._state == WorkforceState.RUNNING:
2053
+ raise RuntimeError(
2054
+ "Cannot add workers while workforce is running. "
2055
+ "Pause the workforce first."
1003
2056
  )
2057
+
2058
+ # Validate worker agent compatibility
2059
+ self._validate_agent_compatibility(worker, "Worker agent")
2060
+
2061
+ # Ensure the worker agent shares this workforce's pause control
2062
+ self._attach_pause_event_to_agent(worker)
2063
+
2064
+ worker_node = SingleAgentWorker(
2065
+ description=description,
2066
+ worker=worker,
2067
+ pool_max_size=pool_max_size,
2068
+ use_structured_output_handler=self.use_structured_output_handler,
2069
+ context_utility=None, # Will be set during save/load operations
2070
+ enable_workflow_memory=enable_workflow_memory,
2071
+ )
2072
+ self._children.append(worker_node)
2073
+
2074
+ # If we have a channel set up, set it for the new worker
2075
+ if hasattr(self, '_channel') and self._channel is not None:
2076
+ worker_node.set_channel(self._channel)
2077
+
2078
+ # If workforce is paused, start the worker's listening task
2079
+ self._start_child_node_when_paused(worker_node.start())
2080
+
2081
+ self._notify_worker_created(
2082
+ worker_node,
2083
+ worker_type='SingleAgentWorker',
2084
+ )
1004
2085
  return self
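A sketch of the pause-then-add flow this method now supports (assumes `wf` is a `Workforce` mid-run with `pause`/`resume` controls as used elsewhere in this diff; the agent construction is illustrative):

    from camel.agents import ChatAgent

    wf.pause()  # adding workers while RUNNING raises RuntimeError
    wf.add_single_agent_worker(
        description="Data analyst for ad-hoc CSV questions",
        worker=ChatAgent("You are a careful data analyst."),
        pool_max_size=4,
    )
    wf.resume()  # the new worker's listener was started while paused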
1005
2086
 
1006
- @check_if_running(False)
1007
2087
  def add_role_playing_worker(
1008
2088
  self,
1009
2089
  description: str,
@@ -1015,6 +2095,7 @@ class Workforce(BaseNode):
1015
2095
  chat_turn_limit: int = 3,
1016
2096
  ) -> Workforce:
1017
2097
  r"""Add a worker node to the workforce that uses `RolePlaying` system.
2098
+ Can be called when workforce is paused to dynamically add workers.
1018
2099
 
1019
2100
  Args:
1020
2101
  description (str): Description of the node.
@@ -1034,7 +2115,27 @@ class Workforce(BaseNode):
1034
2115
 
1035
2116
  Returns:
1036
2117
  Workforce: The workforce node itself.
2118
+
2119
+ Raises:
2120
+ RuntimeError: If called while workforce is running (not paused).
1037
2121
  """
2122
+ if self._state == WorkforceState.RUNNING:
2123
+ raise RuntimeError(
2124
+ "Cannot add workers while workforce is running. "
2125
+ "Pause the workforce first."
2126
+ )
2127
+ # Ensure provided kwargs carry pause_event so that internally created
2128
+ # ChatAgents (assistant/user/summarizer) inherit it.
2129
+ assistant_agent_kwargs = self._ensure_pause_event_in_kwargs(
2130
+ assistant_agent_kwargs
2131
+ )
2132
+ user_agent_kwargs = self._ensure_pause_event_in_kwargs(
2133
+ user_agent_kwargs
2134
+ )
2135
+ summarize_agent_kwargs = self._ensure_pause_event_in_kwargs(
2136
+ summarize_agent_kwargs
2137
+ )
2138
+
1038
2139
  worker_node = RolePlayingWorker(
1039
2140
  description=description,
1040
2141
  assistant_role_name=assistant_role_name,
@@ -1043,27 +2144,52 @@ class Workforce(BaseNode):
1043
2144
  user_agent_kwargs=user_agent_kwargs,
1044
2145
  summarize_agent_kwargs=summarize_agent_kwargs,
1045
2146
  chat_turn_limit=chat_turn_limit,
2147
+ use_structured_output_handler=self.use_structured_output_handler,
1046
2148
  )
1047
2149
  self._children.append(worker_node)
1048
- if self.metrics_logger:
1049
- self.metrics_logger.log_worker_created(
1050
- worker_id=worker_node.node_id,
1051
- worker_type='RolePlayingWorker',
1052
- role=worker_node.description,
1053
- )
2150
+
2151
+ # If we have a channel set up, set it for the new worker
2152
+ if hasattr(self, '_channel') and self._channel is not None:
2153
+ worker_node.set_channel(self._channel)
2154
+
2155
+ # If workforce is paused, start the worker's listening task
2156
+ self._start_child_node_when_paused(worker_node.start())
2157
+
2158
+ self._notify_worker_created(
2159
+ worker_node,
2160
+ worker_type='RolePlayingWorker',
2161
+ )
1054
2162
  return self
1055
2163
 
1056
- @check_if_running(False)
1057
2164
  def add_workforce(self, workforce: Workforce) -> Workforce:
1058
2165
  r"""Add a workforce node to the workforce.
2166
+ Can be called when workforce is paused to dynamically add workers.
1059
2167
 
1060
2168
  Args:
1061
2169
  workforce (Workforce): The workforce node to be added.
1062
2170
 
1063
2171
  Returns:
1064
2172
  Workforce: The workforce node itself.
2173
+
2174
+ Raises:
2175
+ RuntimeError: If called while workforce is running (not paused).
1065
2176
  """
2177
+ if self._state == WorkforceState.RUNNING:
2178
+ raise RuntimeError(
2179
+ "Cannot add workers while workforce is running. "
2180
+ "Pause the workforce first."
2181
+ )
2182
+ # Align child workforce's pause_event with this one for unified
2183
+ # control of worker agents only.
2184
+ workforce._pause_event = self._pause_event
1066
2185
  self._children.append(workforce)
2186
+
2187
+ # If we have a channel set up, set it for the new workforce
2188
+ if hasattr(self, '_channel') and self._channel is not None:
2189
+ workforce.set_channel(self._channel)
2190
+
2191
+ # If workforce is paused, start the child workforce's listening task
2192
+ self._start_child_node_when_paused(workforce.start())
1067
2193
  return self
1068
2194
 
1069
2195
  async def _async_reset(self) -> None:
@@ -1093,22 +2219,426 @@ class Workforce(BaseNode):
1093
2219
  # Reset intervention state
1094
2220
  self._state = WorkforceState.IDLE
1095
2221
  self._stop_requested = False
2222
+ self._skip_requested = False
1096
2223
  # Handle asyncio.Event in a thread-safe way
1097
2224
  if self._loop and not self._loop.is_closed():
1098
2225
  # If we have a loop, use it to set the event safely
1099
- asyncio.run_coroutine_threadsafe(
1100
- self._async_reset(), self._loop
1101
- ).result()
1102
- else:
1103
2226
  try:
1104
- self._reset_task = asyncio.create_task(self._async_reset())
1105
- except RuntimeError:
1106
- asyncio.run(self._async_reset())
1107
-
1108
- if hasattr(self, 'logger') and self.metrics_logger is not None:
1109
- self.metrics_logger.reset_task_data()
2227
+ asyncio.run_coroutine_threadsafe(
2228
+ self._async_reset(), self._loop
2229
+ ).result()
2230
+ except RuntimeError as e:
2231
+ logger.warning(f"Failed to reset via existing loop: {e}")
2232
+ # Fallback to direct event manipulation
2233
+ self._pause_event.set()
1110
2234
  else:
1111
- self.metrics_logger = WorkforceLogger(workforce_id=self.node_id)
2235
+ # No active loop, directly set the event
2236
+ self._pause_event.set()
2237
+
2238
+ for cb in self._callbacks:
2239
+ if isinstance(cb, WorkforceMetrics):
2240
+ cb.reset_task_data()
2241
+
2242
+ def save_workflow_memories(
2243
+ self,
2244
+ session_id: Optional[str] = None,
2245
+ ) -> Dict[str, str]:
2246
+ r"""Save workflow memories for all SingleAgentWorker instances in the
2247
+ workforce.
2248
+
2249
+ .. deprecated:: 0.2.80
2250
+ This synchronous method processes workers sequentially, which can
2251
+ be slow for multiple agents. Use
2252
+ :meth:`save_workflow_memories_async`
2253
+ instead for parallel processing and significantly better
2254
+ performance.
2255
+
2256
+ This method iterates through all child workers and triggers workflow
2257
+ saving for SingleAgentWorker instances using their
2258
+ save_workflow_memories()
2259
+ method.
2260
+ Other worker types are skipped.
2261
+
2262
+ Args:
2263
+ session_id (Optional[str]): Custom session ID to use for saving
2264
+ workflows. If None, auto-generates a timestamped session ID.
2265
+ Useful for organizing workflows by project or context.
2266
+ (default: :obj:`None`)
2267
+
2268
+ Returns:
2269
+ Dict[str, str]: Dictionary mapping worker node IDs to save results.
2270
+ Values are either file paths (success) or error messages
2271
+ (failure).
2272
+
2273
+ Example:
2274
+ >>> workforce = Workforce("My Team")
2275
+ >>> # ... add workers and process tasks ...
2276
+ >>> # save with auto-generated session id
2277
+ >>> results = workforce.save_workflow_memories()
2278
+ >>> print(results)
2279
+ {'worker_123': '/path/to/developer_agent_workflow.md',
2280
+ 'worker_456': 'error: No conversation context available'}
2281
+ >>> # save with custom project id
2282
+ >>> results = workforce.save_workflow_memories(
2283
+ ... session_id="project_123"
2284
+ ... )
2285
+
2286
+ Note:
2287
+ For better performance with multiple workers, use the async
2288
+ version::
2289
+
2290
+ results = await workforce.save_workflow_memories_async()
2291
+
2292
+ See Also:
2293
+ :meth:`save_workflow_memories_async`: Async version with parallel
2294
+ processing for significantly better performance.
2295
+ """
2296
+ import warnings
2297
+
2298
+ warnings.warn(
2299
+ "save_workflow_memories() is slow for multiple workers. "
2300
+ "Consider using save_workflow_memories_async() for parallel "
2301
+ "processing and ~4x faster performance.",
2302
+ DeprecationWarning,
2303
+ stacklevel=2,
2304
+ )
2305
+ results = {}
2306
+
2307
+ # Get or create shared context utility for this save operation
2308
+ shared_context_utility = self._get_or_create_shared_context_utility(
2309
+ session_id=session_id
2310
+ )
2311
+
2312
+ for child in self._children:
2313
+ if isinstance(child, SingleAgentWorker):
2314
+ try:
2315
+ # Set the shared context utility for this operation
2316
+ child._shared_context_utility = shared_context_utility
2317
+ child.worker.set_context_utility(shared_context_utility)
2318
+
2319
+ result = child.save_workflow_memories()
2320
+ if result.get("status") == "success":
2321
+ results[child.node_id] = result.get(
2322
+ "file_path", "unknown_path"
2323
+ )
2324
+ else:
2325
+ # Error: check if there's a separate message field,
2326
+ # otherwise use the status itself
2327
+ error_msg = result.get(
2328
+ "message", result.get("status", "Unknown error")
2329
+ )
2330
+ results[child.node_id] = f"error: {error_msg}"
2331
+
2332
+ except Exception as e:
2333
+ results[child.node_id] = f"error: {e!s}"
2334
+ else:
2335
+ # Skip non-SingleAgentWorker types
2336
+ results[child.node_id] = (
2337
+ f"skipped: {type(child).__name__} not supported"
2338
+ )
2339
+
2340
+ logger.info(f"Workflow save completed for {len(results)} workers")
2341
+ return results
2342
+
2343
+ async def save_workflow_memories_async(
2344
+ self,
2345
+ session_id: Optional[str] = None,
2346
+ ) -> Dict[str, str]:
2347
+ r"""Asynchronously save workflow memories for all SingleAgentWorker
2348
+ instances in the workforce.
2349
+
2350
+ This is the async version of save_workflow_memories() that parallelizes
2351
+ LLM summarization calls across all workers using asyncio.gather(),
2352
+ significantly reducing total save time.
2353
+
2354
+ This method iterates through all child workers and triggers workflow
2355
+ saving for SingleAgentWorker instances using their
2356
+ save_workflow_memories_async() method in parallel.
2357
+ Other worker types are skipped.
2358
+
2359
+ Args:
2360
+ session_id (Optional[str]): Custom session ID to use for saving
2361
+ workflows. If None, auto-generates a timestamped session ID.
2362
+ Useful for organizing workflows by project or context.
2363
+ (default: :obj:`None`)
2364
+
2365
+ Returns:
2366
+ Dict[str, str]: Dictionary mapping worker node IDs to save results.
2367
+ Values are either file paths (success) or error messages
2368
+ (failure).
2369
+
2370
+ Example:
2371
+ >>> workforce = Workforce("My Team")
2372
+ >>> # ... add workers and process tasks ...
2373
+ >>> # save with parallel summarization (faster)
2374
+ >>> results = await workforce.save_workflow_memories_async()
2375
+ >>> print(results)
2376
+ {'worker_123': '/path/to/developer_agent_workflow.md',
2377
+ 'worker_456': '/path/to/search_agent_workflow.md',
2378
+ 'worker_789': '/path/to/document_agent_workflow.md'}
2379
+ """
2380
+ import asyncio
2381
+
2382
+ results = {}
2383
+
2384
+ # Get or create shared context utility for this save operation
2385
+ shared_context_utility = self._get_or_create_shared_context_utility(
2386
+ session_id=session_id
2387
+ )
2388
+
2389
+ # Prepare tasks for parallel execution
2390
+ async def save_single_worker(
2391
+ child: BaseNode,
2392
+ ) -> tuple[str, str]:
2393
+ """Save workflow for a single worker, then return (node_id,
2394
+ result)."""
2395
+ if isinstance(child, SingleAgentWorker):
2396
+ try:
2397
+ # Set the shared context utility for this operation
2398
+ child._shared_context_utility = shared_context_utility
2399
+ child.worker.set_context_utility(shared_context_utility)
2400
+
2401
+ result = await child.save_workflow_memories_async()
2402
+ if result.get("status") == "success":
2403
+ return (
2404
+ child.node_id,
2405
+ result.get("file_path", "unknown_path"),
2406
+ )
2407
+ else:
2408
+ # Error: check if there's a separate message field,
2409
+ # otherwise use the status itself
2410
+ error_msg = result.get(
2411
+ "message", result.get("status", "Unknown error")
2412
+ )
2413
+ return (child.node_id, f"error: {error_msg}")
2414
+
2415
+ except Exception as e:
2416
+ return (child.node_id, f"error: {e!s}")
2417
+ else:
2418
+ # Skip non-SingleAgentWorker types
2419
+ return (
2420
+ child.node_id,
2421
+ f"skipped: {type(child).__name__} not supported",
2422
+ )
2423
+
2424
+ # Create tasks for all workers
2425
+ tasks = [save_single_worker(child) for child in self._children]
2426
+
2427
+ # Execute all tasks in parallel using asyncio.gather()
2428
+ parallel_results = await asyncio.gather(*tasks, return_exceptions=True)
2429
+
2430
+ # Process results
2431
+ for result in parallel_results:
2432
+ if isinstance(result, Exception):
2433
+ # Handle any unexpected exceptions
2434
+ logger.error(
2435
+ f"Unexpected error during workflow save: {result}"
2436
+ )
2437
+ results["unknown"] = f"error: {result!s}"
2438
+ elif isinstance(result, tuple) and len(result) == 2:
2439
+ # Successfully got (node_id, save_result) tuple
2440
+ node_id, save_result = result
2441
+ results[node_id] = save_result
2442
+ else:
2443
+ # Unexpected result format
2444
+ logger.error(f"Unexpected result format: {result}")
2445
+ results["unknown"] = "error: unexpected result format"
2446
+
2447
+ logger.info(
2448
+ f"Workflow save completed for {len(results)} workers "
2449
+ f"(parallelized)"
2450
+ )
2451
+ return results
2452
+
2453
+ def load_workflow_memories(
2454
+ self,
2455
+ session_id: Optional[str] = None,
2456
+ worker_max_workflows: int = 3,
2457
+ coordinator_max_workflows: int = 5,
2458
+ task_agent_max_workflows: int = 3,
2459
+ ) -> Dict[str, bool]:
2460
+ r"""Load workflow memories for all SingleAgentWorker instances in the
2461
+ workforce.
2462
+
2463
+ This method iterates through all child workers and loads relevant
2464
+ workflow files for SingleAgentWorker instances using their
2465
+ load_workflow_memories()
2466
+ method. Workers match files based on their description names.
2467
+
2468
+ Args:
2469
+ session_id (Optional[str]): Specific workforce session ID to load
2470
+ from. If None, searches across all sessions.
2471
+ (default: :obj:`None`)
2472
+ worker_max_workflows (int): Maximum number of workflow files to
2473
+ load per worker agent. (default: :obj:`3`)
2474
+ coordinator_max_workflows (int): Maximum number of workflow files
2475
+ to load for the coordinator agent. (default: :obj:`5`)
2476
+ task_agent_max_workflows (int): Maximum number of workflow files
2477
+ to load for the task planning agent. (default: :obj:`3`)
2478
+
2479
+ Returns:
2480
+ Dict[str, bool]: Dictionary mapping worker node IDs to load
2481
+ success status.
2482
+ True indicates successful loading, False indicates failure.
2483
+
2484
+ Example:
2485
+ >>> workforce = Workforce("My Team")
2486
+ >>> workforce.add_single_agent_worker(
2487
+ ... "data_analyst", analyst_agent
2488
+ ... )
2489
+ >>> success_status = workforce.load_workflow_memories(
2490
+ ... worker_max_workflows=5,
2491
+ ... coordinator_max_workflows=10,
2492
+ ... task_agent_max_workflows=5
2493
+ ... )
2494
+ >>> print(success_status)
2495
+ {'worker_123': True} # Successfully loaded workflows for
2496
+ # data_analyst
2497
+ """
2498
+ results = {}
2499
+
2500
+ # For loading, we don't create a new session - instead we search
2501
+ # existing ones
2502
+ # Each worker will search independently across all existing sessions
2503
+
2504
+ # First, load workflows for SingleAgentWorker instances
2505
+ for child in self._children:
2506
+ if isinstance(child, SingleAgentWorker):
2507
+ try:
2508
+ # For loading, don't set shared context utility
2509
+ # Let each worker search across existing sessions
2510
+ success = child.load_workflow_memories(
2511
+ max_workflows=worker_max_workflows,
2512
+ session_id=session_id,
2513
+ )
2514
+ results[child.node_id] = success
2515
+
2516
+ except Exception as e:
2517
+ logger.error(
2518
+ f"Failed to load workflow for {child.node_id}: {e!s}"
2519
+ )
2520
+ results[child.node_id] = False
2521
+ else:
2522
+ # Skip non-SingleAgentWorker types
2523
+ results[child.node_id] = False
2524
+
2525
+ # Load aggregated workflow summaries for coordinator and task agents
2526
+ self._load_management_agent_workflows(
2527
+ coordinator_max_workflows, task_agent_max_workflows, session_id
2528
+ )
2529
+
2530
+ logger.info(f"Workflow load completed for {len(results)} workers")
2531
+ return results
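A round-trip sketch tying the save/load pair together (assumes workers were added with `enable_workflow_memory=True`, and that `new_wf` is a fresh workforce with matching worker descriptions; the session name is illustrative):

    # Persist per-worker workflow summaries in parallel...
    results = await wf.save_workflow_memories_async(session_id="sprint_42")

    # ...then pre-load them before the next run.
    statuses = new_wf.load_workflow_memories(
        session_id="sprint_42",
        worker_max_workflows=3,
        coordinator_max_workflows=5,
    )
    print(statuses)  # e.g. {'worker_123': True, ...}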
2532
+
2533
+ def _load_management_agent_workflows(
2534
+ self,
2535
+ coordinator_max_workflows: int,
2536
+ task_agent_max_workflows: int,
2537
+ session_id: Optional[str] = None,
2538
+ ) -> None:
2539
+ r"""Load workflow summaries for coordinator and task planning agents.
2540
+
2541
+ This method loads aggregated workflow summaries to help:
2542
+ - Coordinator agent: understand task assignment patterns and worker
2543
+ capabilities
2544
+ - Task agent: understand task decomposition patterns and
2545
+ successful strategies
2546
+
2547
+ Args:
2548
+ coordinator_max_workflows (int): Maximum number of workflow files
2549
+ to load for the coordinator agent.
2550
+ task_agent_max_workflows (int): Maximum number of workflow files
2551
+ to load for the task planning agent.
2552
+ session_id (Optional[str]): Specific session ID to load from.
2553
+ If None, searches across all sessions.
2554
+ """
2555
+ try:
2556
+ import glob
2557
+ import os
2558
+ from pathlib import Path
2559
+
2560
+ from camel.utils.context_utils import ContextUtility
2561
+
2562
+ # For loading management workflows, search across all sessions
2563
+ camel_workdir = os.environ.get("CAMEL_WORKDIR")
2564
+ if camel_workdir:
2565
+ base_dir = os.path.join(camel_workdir, "workforce_workflows")
2566
+ else:
2567
+ base_dir = "workforce_workflows"
2568
+
2569
+ # Search for workflow files in specified or all session directories
2570
+ if session_id:
2571
+ search_path = str(
2572
+ Path(base_dir) / session_id / "*_workflow*.md"
2573
+ )
2574
+ else:
2575
+ search_path = str(Path(base_dir) / "*" / "*_workflow*.md")
2576
+ workflow_files = glob.glob(search_path)
2577
+
2578
+ if not workflow_files:
2579
+ logger.info(
2580
+ "No workflow files found for management agent context"
2581
+ )
2582
+ return
2583
+
2584
+ # Sort by modification time (most recent first)
2585
+ workflow_files.sort(
2586
+ key=lambda x: os.path.getmtime(x), reverse=True
2587
+ )
2588
+
2589
+ # Load workflows for coordinator agent
2590
+ coordinator_loaded = 0
2591
+ for file_path in workflow_files[:coordinator_max_workflows]:
2592
+ try:
2593
+ filename = os.path.basename(file_path).replace('.md', '')
2594
+ session_dir = os.path.dirname(file_path)
2595
+ session_id = os.path.basename(session_dir)
2596
+
2597
+ # Use shared context utility with specific session
2598
+ temp_utility = ContextUtility.get_workforce_shared(
2599
+ session_id
2600
+ )
2601
+
2602
+ status = temp_utility.load_markdown_context_to_memory(
2603
+ self.coordinator_agent, filename
2604
+ )
2605
+ if "Context appended" in status:
2606
+ coordinator_loaded += 1
2607
+ except Exception as e:
2608
+ logger.warning(
2609
+ f"Failed to load coordinator workflow {file_path}: {e}"
2610
+ )
2611
+
2612
+ # Load workflows for task agent
2613
+ task_agent_loaded = 0
2614
+ for file_path in workflow_files[:task_agent_max_workflows]:
2615
+ try:
2616
+ filename = os.path.basename(file_path).replace('.md', '')
2617
+ session_dir = os.path.dirname(file_path)
2618
+ session_id = os.path.basename(session_dir)
2619
+
2620
+ # Use shared context utility with specific session
2621
+ temp_utility = ContextUtility.get_workforce_shared(
2622
+ session_id
2623
+ )
2624
+
2625
+ status = temp_utility.load_markdown_context_to_memory(
2626
+ self.task_agent, filename
2627
+ )
2628
+ if "Context appended" in status:
2629
+ task_agent_loaded += 1
2630
+ except Exception as e:
2631
+ logger.warning(
2632
+ f"Failed to load task agent workflow {file_path}: {e}"
2633
+ )
2634
+
2635
+ logger.info(
2636
+ f"Loaded {coordinator_loaded} workflows for coordinator, "
2637
+ f"{task_agent_loaded} workflows for task agent"
2638
+ )
2639
+
2640
+ except Exception as e:
2641
+ logger.error(f"Error loading management agent workflows: {e}")
1112
2642
 
1113
2643
  @check_if_running(False)
1114
2644
  def set_channel(self, channel: TaskChannel) -> None:
@@ -1119,25 +2649,332 @@ class Workforce(BaseNode):
1119
2649
 
1120
2650
  def _get_child_nodes_info(self) -> str:
1121
2651
  r"""Get the information of all the child nodes under this node."""
1122
- info = ""
1123
- for child in self._children:
1124
- if isinstance(child, Workforce):
1125
- additional_info = "A Workforce node"
1126
- elif isinstance(child, SingleAgentWorker):
1127
- additional_info = "tools: " + (
1128
- ", ".join(child.worker.tool_dict.keys())
2652
+ return "".join(
2653
+ f"<{child.node_id}>:<{child.description}>:<{self._get_node_info(child)}>\n"
2654
+ for child in self._children
2655
+ )
2656
+
2657
+ def _get_node_info(self, node) -> str:
2658
+ r"""Get descriptive information for a specific node type."""
2659
+ if isinstance(node, Workforce):
2660
+ return "A Workforce node"
2661
+ elif isinstance(node, SingleAgentWorker):
2662
+ return self._get_single_agent_toolkit_info(node)
2663
+ elif isinstance(node, RolePlayingWorker):
2664
+ return "A Role playing node"
2665
+ else:
2666
+ return "Unknown node"
2667
+
2668
+ def _get_single_agent_toolkit_info(
2669
+ self, worker: 'SingleAgentWorker'
2670
+ ) -> str:
2671
+ r"""Get formatted information for a SingleAgentWorker node."""
2672
+ toolkit_tools = self._group_tools_by_toolkit(worker.worker.tool_dict)
2673
+
2674
+ if not toolkit_tools:
2675
+ return ""
2676
+
2677
+ toolkit_info = []
2678
+ for toolkit_name, tools in sorted(toolkit_tools.items()):
2679
+ tools_str = ', '.join(sorted(tools))
2680
+ toolkit_info.append(f"{toolkit_name}({tools_str})")
2681
+
2682
+ return ", ".join(toolkit_info)
2683
+
2684
+ def _group_tools_by_toolkit(self, tool_dict: dict) -> dict[str, list[str]]:
2685
+ r"""Group tools by their parent toolkit class names."""
2686
+ toolkit_tools: dict[str, list[str]] = {}
2687
+
2688
+ for tool_name, tool in tool_dict.items():
2689
+ if hasattr(tool.func, '__self__'):
2690
+ toolkit_name = tool.func.__self__.__class__.__name__
2691
+ else:
2692
+ toolkit_name = "Standalone"
2693
+
2694
+ if toolkit_name not in toolkit_tools:
2695
+ toolkit_tools[toolkit_name] = []
2696
+ toolkit_tools[toolkit_name].append(tool_name)
2697
+
2698
+ return toolkit_tools
2699
+
2700
+ def _get_valid_worker_ids(self) -> set:
2701
+ r"""Get all valid worker IDs from child nodes.
2702
+
2703
+ Returns:
2704
+ set: Set of valid worker IDs that can be assigned tasks.
2705
+ """
2706
+ valid_worker_ids = {child.node_id for child in self._children}
2707
+ return valid_worker_ids
2708
+
2709
+ def _call_coordinator_for_assignment(
2710
+ self, tasks: List[Task], invalid_ids: Optional[List[str]] = None
2711
+ ) -> TaskAssignResult:
2712
+ r"""Call coordinator agent to assign tasks with optional validation
2713
+ feedback in the case of invalid worker IDs.
2714
+
2715
+ Args:
2716
+ tasks (List[Task]): Tasks to assign.
2717
+ invalid_ids (List[str], optional): Invalid worker IDs from previous
2718
+ attempt (if any).
2719
+
2720
+ Returns:
2721
+ TaskAssignResult: Assignment result from coordinator.
2722
+ """
2723
+ # format tasks information for the prompt
2724
+ tasks_info = ""
2725
+ for task in tasks:
2726
+ tasks_info += f"Task ID: {task.id}\n"
2727
+ tasks_info += f"Content: {task.content}\n"
2728
+ if task.additional_info:
2729
+ tasks_info += f"Additional Info: {task.additional_info}\n"
2730
+ tasks_info += "---\n"
2731
+
2732
+ prompt = str(
2733
+ ASSIGN_TASK_PROMPT.format(
2734
+ tasks_info=tasks_info,
2735
+ child_nodes_info=self._get_child_nodes_info(),
2736
+ )
2737
+ )
2738
+
2739
+ # add feedback if this is a retry
2740
+ if invalid_ids:
2741
+ valid_worker_ids = list(self._get_valid_worker_ids())
2742
+ feedback = (
2743
+ f"VALIDATION ERROR: The following worker IDs are invalid: "
2744
+ f"{invalid_ids}. "
2745
+ f"VALID WORKER IDS: {valid_worker_ids}. "
2746
+ f"Please reassign ONLY the above tasks using these valid IDs."
2747
+ )
2748
+ prompt = prompt + f"\n\n{feedback}"
2749
+
2750
+ # Check if we should use structured handler
2751
+ if self.use_structured_output_handler:
2752
+ # Use structured handler for prompt-based extraction
2753
+ enhanced_prompt = (
2754
+ self.structured_handler.generate_structured_prompt(
2755
+ base_prompt=prompt,
2756
+ schema=TaskAssignResult,
2757
+ examples=[
2758
+ {
2759
+ "assignments": [
2760
+ {
2761
+ "task_id": "task_1",
2762
+ "assignee_id": "worker_123",
2763
+ "dependencies": [],
2764
+ }
2765
+ ]
2766
+ }
2767
+ ],
1129
2768
  )
1130
- elif isinstance(child, RolePlayingWorker):
1131
- additional_info = "A Role playing node"
2769
+ )
2770
+
2771
+ # Get response without structured format
2772
+ response = self.coordinator_agent.step(enhanced_prompt)
2773
+
2774
+ if response.msg is None or response.msg.content is None:
2775
+ logger.error(
2776
+ "Coordinator agent returned empty response for "
2777
+ "task assignment"
2778
+ )
2779
+ return TaskAssignResult(assignments=[])
2780
+
2781
+ # Parse with structured handler
2782
+ result = self.structured_handler.parse_structured_response(
2783
+ response.msg.content,
2784
+ schema=TaskAssignResult,
2785
+ fallback_values={"assignments": []},
2786
+ )
2787
+ # Ensure we return a TaskAssignResult instance
2788
+ if isinstance(result, TaskAssignResult):
2789
+ return result
2790
+ elif isinstance(result, dict):
2791
+ return TaskAssignResult(**result)
2792
+ else:
2793
+ return TaskAssignResult(assignments=[])
2794
+ else:
2795
+ # Use existing native structured output code
2796
+ response = self.coordinator_agent.step(
2797
+ prompt, response_format=TaskAssignResult
2798
+ )
2799
+
2800
+ if response.msg is None or response.msg.content is None:
2801
+ logger.error(
2802
+ "Coordinator agent returned empty response for "
2803
+ "task assignment"
2804
+ )
2805
+ return TaskAssignResult(assignments=[])
2806
+
2807
+ try:
2808
+ result_dict = json.loads(response.msg.content, parse_int=str)
2809
+ return TaskAssignResult(**result_dict)
2810
+ except json.JSONDecodeError as e:
2811
+ logger.error(
2812
+ f"JSON parsing error in task assignment: Invalid response "
2813
+ f"format - {e}. Response content: "
2814
+ f"{response.msg.content}"
2815
+ )
2816
+ return TaskAssignResult(assignments=[])
2817
+
2818
+ def _validate_assignments(
2819
+ self, assignments: List[TaskAssignment], valid_ids: Set[str]
2820
+ ) -> Tuple[List[TaskAssignment], List[TaskAssignment]]:
2821
+ r"""Validate task assignments against valid worker IDs.
2822
+
2823
+ Args:
2824
+ assignments (List[TaskAssignment]): Assignments to validate.
2825
+ valid_ids (Set[str]): Set of valid worker IDs.
2826
+
2827
+ Returns:
2828
+ Tuple[List[TaskAssignment], List[TaskAssignment]]:
2829
+ (valid_assignments, invalid_assignments)
2830
+ """
2831
+ valid_assignments: List[TaskAssignment] = []
2832
+ invalid_assignments: List[TaskAssignment] = []
2833
+
2834
+ for assignment in assignments:
2835
+ if assignment.assignee_id in valid_ids:
2836
+ valid_assignments.append(assignment)
2837
+ else:
2838
+ invalid_assignments.append(assignment)
2839
+
2840
+ return valid_assignments, invalid_assignments
2841
+
2842
+ async def _handle_task_assignment_fallbacks(
2843
+ self, tasks: List[Task]
2844
+ ) -> List:
2845
+ r"""Create new workers for unassigned tasks as fallback.
2846
+
2847
+ Args:
2848
+ tasks (List[Task]): Tasks that need new workers.
2849
+
2850
+ Returns:
2851
+ List[TaskAssignment]: Assignments for newly created workers.
2852
+ """
2853
+ fallback_assignments = []
2854
+
2855
+ for task in tasks:
2856
+ logger.info(f"Creating new worker for unassigned task {task.id}")
2857
+ new_worker = await self._create_worker_node_for_task(task)
2858
+
2859
+ assignment = TaskAssignment(
2860
+ task_id=task.id,
2861
+ assignee_id=new_worker.node_id,
2862
+ dependencies=[],
2863
+ )
2864
+ fallback_assignments.append(assignment)
2865
+
2866
+ return fallback_assignments
2867
+
2868
+ async def _handle_assignment_retry_and_fallback(
2869
+ self,
2870
+ invalid_assignments: List[TaskAssignment],
2871
+ tasks: List[Task],
2872
+ valid_worker_ids: Set[str],
2873
+ ) -> List[TaskAssignment]:
2874
+ r"""Called if Coordinator agent fails to assign tasks to valid worker
2875
+ IDs. Handles retry assignment and fallback worker creation for invalid
2876
+ assignments.
2877
+
2878
+ Args:
2879
+ invalid_assignments (List[TaskAssignment]): Invalid assignments to
2880
+ retry.
2881
+ tasks (List[Task]): Original tasks list for task lookup.
2882
+ valid_worker_ids (set): Set of valid worker IDs.
2883
+
2884
+ Returns:
2885
+ List[TaskAssignment]: Final assignments for the invalid tasks.
2886
+ """
2887
+ invalid_ids = [a.assignee_id for a in invalid_assignments]
2888
+ invalid_tasks = [
2889
+ task
2890
+ for task in tasks
2891
+ if any(a.task_id == task.id for a in invalid_assignments)
2892
+ ]
2893
+
2894
+ # handle cases where coordinator returned no assignments at all
2895
+ if not invalid_assignments:
2896
+ invalid_tasks = tasks # all tasks need assignment
2897
+ logger.warning(
2898
+ f"Coordinator returned no assignments. "
2899
+ f"Retrying assignment for all {len(invalid_tasks)} tasks."
2900
+ )
2901
+ else:
2902
+ logger.warning(
2903
+ f"Invalid worker IDs detected: {invalid_ids}. "
2904
+ f"Retrying assignment for {len(invalid_tasks)} tasks."
2905
+ )
2906
+
2907
+ # retry assignment with feedback
2908
+ retry_result = self._call_coordinator_for_assignment(
2909
+ invalid_tasks, invalid_ids
2910
+ )
2911
+ final_assignments = []
2912
+
2913
+ if retry_result.assignments:
2914
+ retry_valid, retry_invalid = self._validate_assignments(
2915
+ retry_result.assignments, valid_worker_ids
2916
+ )
2917
+ final_assignments.extend(retry_valid)
2918
+
2919
+ # collect tasks that are still unassigned for fallback
2920
+ if retry_invalid:
2921
+ unassigned_tasks = [
2922
+ task
2923
+ for task in invalid_tasks
2924
+ if any(a.task_id == task.id for a in retry_invalid)
2925
+ ]
1132
2926
  else:
1133
- additional_info = "Unknown node"
1134
- info += (
1135
- f"<{child.node_id}>:<{child.description}>:<"
1136
- f"{additional_info}>\n"
2927
+ unassigned_tasks = []
2928
+ else:
2929
+ # retry failed completely, all invalid tasks need fallback
2930
+ logger.warning("Retry assignment failed")
2931
+ unassigned_tasks = invalid_tasks
2932
+
2933
+ # handle fallback for any remaining unassigned tasks
2934
+ if unassigned_tasks:
2935
+ logger.warning(
2936
+ f"Creating fallback workers for {len(unassigned_tasks)} "
2937
+ f"unassigned tasks"
2938
+ )
2939
+ fallback_assignments = (
2940
+ await self._handle_task_assignment_fallbacks(unassigned_tasks)
1137
2941
  )
1138
- return info
2942
+ final_assignments.extend(fallback_assignments)
2943
+
2944
+ return final_assignments
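The retry/fallback logic slots into a three-stage pipeline: coordinator assignment, validation, then retry-with-feedback plus worker creation. A condensed sketch of that control flow (method names match this diff; the standalone `assign` framing is illustrative):

    async def assign(wf, tasks):
        ids = wf._get_valid_worker_ids()
        result = wf._call_coordinator_for_assignment(tasks)       # stage 1
        ok, bad = wf._validate_assignments(result.assignments, ids)
        if bad or len(ok) < len(tasks):                           # stages 2-3
            ok += await wf._handle_assignment_retry_and_fallback(
                bad, tasks, ids
            )
        return ok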
1139
2945
 
1140
- def _find_assignee(
2946
+ def _update_task_dependencies_from_assignments(
2947
+ self, assignments: List[TaskAssignment], tasks: List[Task]
2948
+ ) -> None:
2949
+ r"""Update Task.dependencies with actual Task objects based on
2950
+ assignments.
2951
+
2952
+ Args:
2953
+ assignments (List[TaskAssignment]): The task assignments
2954
+ containing dependency IDs.
2955
+ tasks (List[Task]): The tasks that were assigned.
2956
+ """
2957
+ # Create a lookup map for all available tasks
2958
+ all_tasks = {}
2959
+ for task_list in [self._completed_tasks, self._pending_tasks, tasks]:
2960
+ for task in task_list:
2961
+ all_tasks[task.id] = task
2962
+
2963
+ # Update dependencies for each assigned task
2964
+ for assignment in assignments:
2965
+ if not assignment.dependencies:
2966
+ continue
2967
+
2968
+ matching_tasks = [t for t in tasks if t.id == assignment.task_id]
2969
+ if matching_tasks:
2970
+ task = matching_tasks[0]
2971
+ task.dependencies = [
2972
+ all_tasks[dep_id]
2973
+ for dep_id in assignment.dependencies
2974
+ if dep_id in all_tasks
2975
+ ]
2976
+
2977
+ async def _find_assignee(
1141
2978
  self,
1142
2979
  tasks: List[Task],
1143
2980
  ) -> TaskAssignResult:
@@ -1150,49 +2987,129 @@ class Workforce(BaseNode):
1150
2987
  TaskAssignResult: Assignment result containing task assignments
1151
2988
  with their dependencies.
1152
2989
  """
2990
+ # Wait for workers to be ready before assignment with exponential
2991
+ # backoff
2992
+ worker_readiness_timeout = 2.0 # Maximum wait time in seconds
2993
+ worker_readiness_check_interval = 0.05 # Initial check interval
2994
+ start_time = time.time()
2995
+ check_interval = worker_readiness_check_interval
2996
+ backoff_multiplier = 1.5 # Exponential backoff factor
2997
+ max_interval = 0.5 # Cap the maximum interval
2998
+
2999
+ while (time.time() - start_time) < worker_readiness_timeout:
3000
+ valid_worker_ids = self._get_valid_worker_ids()
3001
+ if len(valid_worker_ids) > 0:
3002
+ elapsed = time.time() - start_time
3003
+ logger.debug(
3004
+ f"Workers ready after {elapsed:.3f}s: "
3005
+ f"{len(valid_worker_ids)} workers available"
3006
+ )
3007
+ break
3008
+
3009
+ await asyncio.sleep(check_interval)
3010
+ # Exponential backoff with cap
3011
+ check_interval = min(
3012
+ check_interval * backoff_multiplier, max_interval
3013
+ )
3014
+ else:
3015
+ # Timeout reached, log warning but continue
3016
+ logger.warning(
3017
+ f"Worker readiness timeout after "
3018
+ f"{worker_readiness_timeout}s, "
3019
+ f"proceeding with {len(self._children)} children"
3020
+ )
3021
+ valid_worker_ids = self._get_valid_worker_ids()
3022
+
1153
3023
  self.coordinator_agent.reset()
1154
3024
 
1155
- # Format tasks information for the prompt
1156
- tasks_info = ""
1157
- for task in tasks:
1158
- tasks_info += f"Task ID: {task.id}\n"
1159
- tasks_info += f"Content: {task.content}\n"
1160
- if task.additional_info:
1161
- tasks_info += f"Additional Info: {task.additional_info}\n"
1162
- tasks_info += "---\n"
3025
+ logger.debug(
3026
+ f"Sending batch assignment request to coordinator "
3027
+ f"for {len(tasks)} tasks."
3028
+ )
3029
+
3030
+ assignment_result = self._call_coordinator_for_assignment(tasks)
3031
+
3032
+ # validate assignments
3033
+ valid_assignments, invalid_assignments = self._validate_assignments(
3034
+ assignment_result.assignments, valid_worker_ids
3035
+ )
3036
+
3037
+ # check if we have assignments for all tasks
3038
+ assigned_task_ids = {
3039
+ a.task_id for a in valid_assignments + invalid_assignments
3040
+ }
3041
+ unassigned_tasks = [t for t in tasks if t.id not in assigned_task_ids]
3042
+
3043
+ # if all assignments are valid and all tasks are assigned, return early
3044
+ if not invalid_assignments and not unassigned_tasks:
3045
+ self._update_task_dependencies_from_assignments(
3046
+ valid_assignments, tasks
3047
+ )
3048
+ return TaskAssignResult(assignments=valid_assignments)
1163
3049
 
1164
- prompt = ASSIGN_TASK_PROMPT.format(
1165
- tasks_info=tasks_info,
1166
- child_nodes_info=self._get_child_nodes_info(),
3050
+ # handle retry and fallback for invalid assignments and unassigned
3051
+ # tasks
3052
+ retry_and_fallback_assignments = (
3053
+ await self._handle_assignment_retry_and_fallback(
3054
+ invalid_assignments, tasks, valid_worker_ids
3055
+ )
1167
3056
  )
1168
3057
 
1169
- logger.debug(
1170
- f"Sending batch assignment request to coordinator "
1171
- f"for {len(tasks)} tasks."
3058
+ # Combine assignments with deduplication, prioritizing retry results
3059
+ assignment_map = {a.task_id: a for a in valid_assignments}
3060
+ assignment_map.update(
3061
+ {a.task_id: a for a in retry_and_fallback_assignments}
1172
3062
  )
3063
+ all_assignments = list(assignment_map.values())
1173
3064
 
1174
- response = self.coordinator_agent.step(
1175
- prompt, response_format=TaskAssignResult
1176
- )
1177
- result_dict = json.loads(response.msg.content, parse_int=str)
1178
- task_assign_result = TaskAssignResult(**result_dict)
1179
- return task_assign_result
3065
+ # Log any overwrites for debugging
3066
+ valid_task_ids = {a.task_id for a in valid_assignments}
3067
+ retry_task_ids = {a.task_id for a in retry_and_fallback_assignments}
3068
+ overlap_task_ids = valid_task_ids & retry_task_ids
3069
+
3070
+ if overlap_task_ids:
3071
+ logger.warning(
3072
+ f"Retry assignments overrode {len(overlap_task_ids)} "
3073
+ f"valid assignments for tasks: {sorted(overlap_task_ids)}"
3074
+ )
3075
+
3076
+ # Update Task.dependencies for all final assignments
3077
+ self._update_task_dependencies_from_assignments(all_assignments, tasks)
3078
+
3079
+ return TaskAssignResult(assignments=all_assignments)
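The merge step above relies on a dict comprehension plus `update()`, so retry results win whenever both passes produced an assignment for the same task id. A small sketch of that last-writer-wins merge — `Assignment` here is a hypothetical stand-in for the package's assignment type:

```python
from dataclasses import dataclass

@dataclass
class Assignment:  # illustrative stand-in, not the real class
    task_id: str
    assignee_id: str

valid = [Assignment("t1", "w1"), Assignment("t2", "w2")]
retried = [Assignment("t2", "w3")]  # retry produced a new assignee for t2

merged = {a.task_id: a for a in valid}
merged.update({a.task_id: a for a in retried})  # retry wins on overlap

assert [a.assignee_id for a in merged.values()] == ["w1", "w3"]
```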
1180
3080
 
1181
3081
  async def _post_task(self, task: Task, assignee_id: str) -> None:
1182
3082
  # Record the start time when a task is posted
1183
3083
  self._task_start_times[task.id] = time.time()
1184
3084
 
1185
- if self.metrics_logger:
1186
- self.metrics_logger.log_task_started(
1187
- task_id=task.id, worker_id=assignee_id
3085
+ task.assigned_worker_id = assignee_id
3086
+
3087
+ task_started_event = TaskStartedEvent(
3088
+ task_id=task.id, worker_id=assignee_id
3089
+ )
3090
+ for cb in self._callbacks:
3091
+ cb.log_task_started(task_started_event)
3092
+
3093
+ try:
3094
+ await self._channel.post_task(task, self.node_id, assignee_id)
3095
+ self._increment_in_flight_tasks(task.id)
3096
+ logger.debug(
3097
+ f"Posted task {task.id} to {assignee_id}. "
3098
+ f"In-flight tasks: {self._in_flight_tasks}"
3099
+ )
3100
+ except Exception as e:
3101
+ logger.error(
3102
+ f"Failed to post task {task.id} to {assignee_id}: {e}"
3103
+ )
3104
+ print(
3105
+ f"{Fore.RED}Failed to post task {task.id} to {assignee_id}: "
3106
+ f"{e}{Fore.RESET}"
1188
3107
  )
1189
- self._in_flight_tasks += 1
1190
- await self._channel.post_task(task, self.node_id, assignee_id)
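As the replacement of the `metrics_logger` calls shows, lifecycle reporting now builds an event object and fans it out to every registered callback. A minimal sketch of that observer pattern, with a hypothetical `PrintCallback` implementation:

```python
from dataclasses import dataclass
from typing import List

@dataclass
class TaskStartedEvent:
    task_id: str
    worker_id: str

class PrintCallback:  # illustrative callback, not the real interface
    def log_task_started(self, event: TaskStartedEvent) -> None:
        print(f"started {event.task_id} on {event.worker_id}")

callbacks: List[PrintCallback] = [PrintCallback()]
for cb in callbacks:
    cb.log_task_started(TaskStartedEvent("t1", "w1"))
```

Because the event is constructed once and shared, adding a new sink (metrics, tracing, UI) only requires registering another callback.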
1191
3108
 
1192
3109
  async def _post_dependency(self, dependency: Task) -> None:
1193
3110
  await self._channel.post_dependency(dependency, self.node_id)
1194
3111
 
1195
- def _create_worker_node_for_task(self, task: Task) -> Worker:
3112
+ async def _create_worker_node_for_task(self, task: Task) -> Worker:
1196
3113
  r"""Creates a new worker node for a given task and add it to the
1197
3114
  children list of this node. This is one of the actions that
1198
3115
  the coordinator can take when a task has failed.
@@ -1203,84 +3120,200 @@ class Workforce(BaseNode):
1203
3120
  Returns:
1204
3121
  Worker: The created worker node.
1205
3122
  """
1206
- prompt = CREATE_NODE_PROMPT.format(
1207
- content=task.content,
1208
- child_nodes_info=self._get_child_nodes_info(),
1209
- additional_info=task.additional_info,
1210
- )
1211
- response = self.coordinator_agent.step(
1212
- prompt, response_format=WorkerConf
3123
+ prompt = str(
3124
+ CREATE_NODE_PROMPT.format(
3125
+ content=task.content,
3126
+ child_nodes_info=self._get_child_nodes_info(),
3127
+ additional_info=task.additional_info,
3128
+ )
1213
3129
  )
1214
- result_dict = json.loads(response.msg.content)
1215
- new_node_conf = WorkerConf(**result_dict)
3130
+ # Check if we should use structured handler
3131
+ if self.use_structured_output_handler:
3132
+ # Use structured handler
3133
+ enhanced_prompt = (
3134
+ self.structured_handler.generate_structured_prompt(
3135
+ base_prompt=prompt,
3136
+ schema=WorkerConf,
3137
+ examples=[
3138
+ {
3139
+ "description": "Data analysis specialist",
3140
+ "role": "Data Analyst",
3141
+ "sys_msg": "You are an expert data analyst.",
3142
+ }
3143
+ ],
3144
+ )
3145
+ )
3146
+
3147
+ response = self.coordinator_agent.step(enhanced_prompt)
3148
+
3149
+ if response.msg is None or response.msg.content is None:
3150
+ logger.error(
3151
+ "Coordinator agent returned empty response for "
3152
+ "worker creation"
3153
+ )
3154
+ new_node_conf = WorkerConf(
3155
+ description=f"Fallback worker for task: {task.content}",
3156
+ role="General Assistant",
3157
+ sys_msg="You are a general assistant that can help "
3158
+ "with various tasks.",
3159
+ )
3160
+ else:
3161
+ result = self.structured_handler.parse_structured_response(
3162
+ response.msg.content,
3163
+ schema=WorkerConf,
3164
+ fallback_values={
3165
+ "description": f"Worker for task: {task.content}",
3166
+ "role": "Task Specialist",
3167
+ "sys_msg": f"You are a specialist for: {task.content}",
3168
+ },
3169
+ )
3170
+ # Ensure we have a WorkerConf instance
3171
+ if isinstance(result, WorkerConf):
3172
+ new_node_conf = result
3173
+ elif isinstance(result, dict):
3174
+ new_node_conf = WorkerConf(**result)
3175
+ else:
3176
+ new_node_conf = WorkerConf(
3177
+ description=f"Worker for task: {task.content}",
3178
+ role="Task Specialist",
3179
+ sys_msg=f"You are a specialist for: {task.content}",
3180
+ )
3181
+ else:
3182
+ # Use existing native structured output code
3183
+ response = self.coordinator_agent.step(
3184
+ prompt, response_format=WorkerConf
3185
+ )
3186
+ if response.msg is None or response.msg.content is None:
3187
+ logger.error(
3188
+ "Coordinator agent returned empty response for "
3189
+ "worker creation"
3190
+ )
3191
+ # Create a fallback worker configuration
3192
+ new_node_conf = WorkerConf(
3193
+ description=f"Fallback worker for task: {task.content}",
3194
+ role="General Assistant",
3195
+ sys_msg="You are a general assistant that can help "
3196
+ "with various tasks.",
3197
+ )
3198
+ else:
3199
+ try:
3200
+ result_dict = json.loads(response.msg.content)
3201
+ new_node_conf = WorkerConf(**result_dict)
3202
+ except json.JSONDecodeError as e:
3203
+ logger.error(
3204
+ f"JSON parsing error in worker creation: Invalid "
3205
+ f"response format - {e}. Response content: "
3206
+ f"{response.msg.content}"
3207
+ )
3208
+ raise RuntimeError(
3209
+ f"Failed to create worker for task {task.id}: "
3210
+ f"Coordinator agent returned malformed JSON response. "
3211
+ ) from e
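Both branches above reduce to the same shape: parse the coordinator's structured output, and fall back to a safe default configuration when the content is malformed. A condensed sketch of that pattern, assuming a simple `Conf` stand-in for `WorkerConf`:

```python
import json
from dataclasses import dataclass

@dataclass
class Conf:  # illustrative stand-in for WorkerConf
    description: str
    role: str
    sys_msg: str

def parse_conf(content: str, fallback: dict) -> Conf:
    # Try strict JSON first; on any parse or validation error return
    # the fallback configuration instead of crashing worker creation.
    try:
        return Conf(**json.loads(content))
    except (json.JSONDecodeError, TypeError, ValueError):
        return Conf(**fallback)

conf = parse_conf(
    "not json",
    {"description": "fallback", "role": "General Assistant",
     "sys_msg": "You are a general assistant."},
)
assert conf.role == "General Assistant"
```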
1216
3212
 
1217
- new_agent = self._create_new_agent(
3213
+ new_agent = await self._create_new_agent(
1218
3214
  new_node_conf.role,
1219
3215
  new_node_conf.sys_msg,
1220
3216
  )
1221
3217
 
3218
+ # Validate the new agent compatibility before creating worker
3219
+ try:
3220
+ self._validate_agent_compatibility(
3221
+ new_agent, f"Agent for task {task.id}"
3222
+ )
3223
+ except ValueError as e:
3224
+ raise ValueError(f"Cannot create worker for task {task.id}: {e!s}")
3225
+
1222
3226
  new_node = SingleAgentWorker(
1223
3227
  description=new_node_conf.description,
1224
3228
  worker=new_agent,
3229
+ pool_max_size=DEFAULT_WORKER_POOL_SIZE,
3230
+ use_structured_output_handler=self.use_structured_output_handler,
1225
3231
  )
1226
3232
  new_node.set_channel(self._channel)
1227
3233
 
1228
3234
  print(f"{Fore.CYAN}{new_node} created.{Fore.RESET}")
1229
3235
 
1230
3236
  self._children.append(new_node)
1231
- if self.metrics_logger:
1232
- self.metrics_logger.log_worker_created(
1233
- worker_id=new_node.node_id,
1234
- worker_type='SingleAgentWorker',
1235
- role=new_node_conf.role,
1236
- metadata={'description': new_node_conf.description},
1237
- )
3237
+
3238
+ self._notify_worker_created(
3239
+ new_node,
3240
+ worker_type='SingleAgentWorker',
3241
+ role=new_node_conf.role,
3242
+ metadata={'description': new_node_conf.description},
3243
+ )
1238
3244
  self._child_listening_tasks.append(
1239
3245
  asyncio.create_task(new_node.start())
1240
3246
  )
1241
3247
  return new_node
1242
3248
 
1243
- def _create_new_agent(self, role: str, sys_msg: str) -> ChatAgent:
3249
+ async def _create_new_agent(self, role: str, sys_msg: str) -> ChatAgent:
1244
3250
  worker_sys_msg = BaseMessage.make_assistant_message(
1245
3251
  role_name=role,
1246
3252
  content=sys_msg,
1247
3253
  )
1248
3254
 
1249
- if self.new_worker_agent_kwargs is not None:
1250
- return ChatAgent(worker_sys_msg, **self.new_worker_agent_kwargs)
1251
-
1252
- # Default tools for a new agent
1253
- function_list = [
1254
- SearchToolkit().search_duckduckgo,
1255
- *CodeExecutionToolkit().get_tools(),
1256
- *ThinkingToolkit().get_tools(),
1257
- ]
3255
+ if self.new_worker_agent is not None:
3256
+ # Clone the template agent to create an independent instance
3257
+ cloned_agent = self.new_worker_agent.clone(with_memory=False)
3258
+ # Update the system message for the specific role
3259
+ cloned_agent._system_message = worker_sys_msg
3260
+ cloned_agent.init_messages() # Initialize with new system message
3261
+ return cloned_agent
3262
+ else:
3263
+ # Default tools for a new agent
3264
+ function_list = [
3265
+ SearchToolkit().search_duckduckgo,
3266
+ *CodeExecutionToolkit().get_tools(),
3267
+ *ThinkingToolkit().get_tools(),
3268
+ ]
1258
3269
 
1259
- model = ModelFactory.create(
1260
- model_platform=ModelPlatformType.DEFAULT,
1261
- model_type=ModelType.DEFAULT,
1262
- model_config_dict={"temperature": 0},
1263
- )
3270
+ model = ModelFactory.create(
3271
+ model_platform=ModelPlatformType.DEFAULT,
3272
+ model_type=ModelType.DEFAULT,
3273
+ model_config_dict={"temperature": 0},
3274
+ )
1264
3275
 
1265
- return ChatAgent(worker_sys_msg, model=model, tools=function_list) # type: ignore[arg-type]
3276
+ return ChatAgent(
3277
+ system_message=worker_sys_msg,
3278
+ model=model,
3279
+ tools=function_list, # type: ignore[arg-type]
3280
+ pause_event=self._pause_event,
3281
+ )
1266
3282
 
1267
- async def _get_returned_task(self) -> Task:
3283
+ async def _get_returned_task(self) -> Optional[Task]:
1268
3284
  r"""Get the task that's published by this node and just get returned
1269
3285
  from the assignee. Includes timeout handling to prevent indefinite
1270
3286
  waiting.
3287
+
3288
+ Raises:
3289
+ asyncio.TimeoutError: If waiting for the task exceeds the timeout.
1271
3290
  """
1272
3291
  try:
1273
3292
  # Add timeout to prevent indefinite waiting
1274
3293
  return await asyncio.wait_for(
1275
3294
  self._channel.get_returned_task_by_publisher(self.node_id),
1276
- timeout=300.0, # 5 minute timeout
3295
+ timeout=self.task_timeout_seconds,
1277
3296
  )
1278
3297
  except asyncio.TimeoutError:
3298
+ # Re-raise timeout errors to be handled by caller
3299
+ # This prevents hanging when tasks are stuck
1279
3300
  logger.warning(
1280
- f"Timeout waiting for returned task in "
1281
- f"workforce {self.node_id}"
3301
+ f"Timeout waiting for task return in workforce "
3302
+ f"{self.node_id}. "
3303
+ f"Timeout: {self.task_timeout_seconds}s, "
3304
+ f"Pending tasks: {len(self._pending_tasks)}, "
3305
+ f"In-flight tasks: {self._in_flight_tasks}"
3306
+ )
3307
+ raise
3308
+ except Exception as e:
3309
+ error_msg = (
3310
+ f"Error getting returned task {e} in "
3311
+ f"workforce {self.node_id}. "
3312
+ f"Current pending tasks: {len(self._pending_tasks)}, "
3313
+ f"In-flight tasks: {self._in_flight_tasks}"
1282
3314
  )
1283
- raise ValueError("Timeout waiting for task to be returned")
3315
+ logger.error(error_msg, exc_info=True)
3316
+ return None
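The retrieval method now distinguishes timeouts, which are re-raised so the listen loop can decide whether to halt, from other errors, which are logged and reported as `None`. The control flow in isolation, under hypothetical names:

```python
import asyncio
from typing import Awaitable, Callable, Optional, TypeVar

T = TypeVar("T")

async def get_or_none(
    fetch: Callable[[], Awaitable[T]], timeout: float
) -> Optional[T]:
    try:
        return await asyncio.wait_for(fetch(), timeout=timeout)
    except asyncio.TimeoutError:
        raise  # caller handles stuck in-flight tasks
    except Exception:
        return None  # non-timeout failures degrade to "no task"
```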
1284
3317
 
1285
3318
  async def _post_ready_tasks(self) -> None:
1286
3319
  r"""Checks for unassigned tasks, assigns them, and then posts any
@@ -1290,53 +3323,185 @@ class Workforce(BaseNode):
1290
3323
  tasks_to_assign = [
1291
3324
  task
1292
3325
  for task in self._pending_tasks
1293
- if task.id not in self._task_dependencies
3326
+ if (
3327
+ task.id not in self._task_dependencies
3328
+ and (
3329
+ task.additional_info is None
3330
+ or not task.additional_info.get(
3331
+ "_needs_decomposition", False
3332
+ )
3333
+ )
3334
+ )
1294
3335
  ]
1295
3336
  if tasks_to_assign:
1296
3337
  logger.debug(
1297
3338
  f"Found {len(tasks_to_assign)} new tasks. "
1298
3339
  f"Requesting assignment..."
1299
3340
  )
1300
- batch_result = self._find_assignee(tasks_to_assign)
3341
+ batch_result = await self._find_assignee(tasks_to_assign)
1301
3342
  logger.debug(
1302
3343
  f"Coordinator returned assignments:\n"
1303
- f"{json.dumps(batch_result.dict(), indent=2)}"
3344
+ f"{json.dumps(batch_result.model_dump(), indent=2)}"
1304
3345
  )
1305
3346
  for assignment in batch_result.assignments:
1306
3347
  self._task_dependencies[assignment.task_id] = (
1307
3348
  assignment.dependencies
1308
3349
  )
1309
3350
  self._assignees[assignment.task_id] = assignment.assignee_id
1310
- if self.metrics_logger:
3351
+
3352
+ task_assigned_event = TaskAssignedEvent(
3353
+ task_id=assignment.task_id,
3354
+ worker_id=assignment.assignee_id,
3355
+ dependencies=assignment.dependencies,
3356
+ queue_time_seconds=None,
3357
+ )
3358
+ for cb in self._callbacks:
1311
3359
  # queue_time_seconds can be derived by logger if task
1312
3360
  # creation time is logged
1313
- self.metrics_logger.log_task_assigned(
1314
- task_id=assignment.task_id,
1315
- worker_id=assignment.assignee_id,
1316
- dependencies=assignment.dependencies,
1317
- queue_time_seconds=None,
1318
- )
3361
+ cb.log_task_assigned(task_assigned_event)
1319
3362
 
1320
3363
  # Step 2: Iterate through all pending tasks and post those that are
1321
3364
  # ready
1322
3365
  posted_tasks = []
3366
+ # Pre-compute completed task IDs and their states for O(1) lookups
3367
+ completed_tasks_info = {t.id: t.state for t in self._completed_tasks}
3368
+
1323
3369
  for task in self._pending_tasks:
1324
3370
  # A task must be assigned to be considered for posting
1325
3371
  if task.id in self._task_dependencies:
3372
+ # Skip if task has already been posted to prevent duplicates
3373
+ try:
3374
+ task_from_channel = await self._channel.get_task_by_id(
3375
+ task.id
3376
+ )
3377
+ # Check if task is already assigned to a worker
3378
+ if (
3379
+ task_from_channel
3380
+ and task_from_channel.assigned_worker_id
3381
+ ):
3382
+ logger.debug(
3383
+ f"Task {task.id} already assigned to "
3384
+ f"{task_from_channel.assigned_worker_id}, "
3385
+ f"skipping to prevent duplicate"
3386
+ )
3387
+ continue
3388
+ except Exception as e:
3389
+ logger.info(
3390
+ f"Task {task.id} non existent in channel. "
3391
+ f"Assigning task: {e}"
3392
+ )
1326
3393
  dependencies = self._task_dependencies[task.id]
1327
- # Check if all dependencies for this task are in the completed
1328
- # set
1329
- if all(
1330
- dep_id in {t.id for t in self._completed_tasks}
1331
- for dep_id in dependencies
1332
- ):
1333
- assignee_id = self._assignees[task.id]
1334
- logger.debug(
1335
- f"Posting task {task.id} to assignee {assignee_id}. "
1336
- f"Dependencies met."
3394
+
3395
+ # Check if all dependencies are in completed state
3396
+ all_deps_completed = all(
3397
+ dep_id in completed_tasks_info for dep_id in dependencies
3398
+ )
3399
+
3400
+ # Only proceed with dependency checks if all deps are completed
3401
+ if all_deps_completed:
3402
+ # Check if all dependencies succeeded (state is DONE)
3403
+ all_deps_done = all(
3404
+ completed_tasks_info[dep_id] == TaskState.DONE
3405
+ for dep_id in dependencies
1337
3406
  )
1338
- await self._post_task(task, assignee_id)
1339
- posted_tasks.append(task)
3407
+
3408
+ # Check if any dependency failed
3409
+ any_dep_failed = any(
3410
+ completed_tasks_info[dep_id] == TaskState.FAILED
3411
+ for dep_id in dependencies
3412
+ )
3413
+
3414
+ if all_deps_done:
3415
+ # All dependencies completed successfully - post the
3416
+ # task
3417
+ assignee_id = self._assignees[task.id]
3418
+ logger.debug(
3419
+ f"Posting task {task.id} to "
3420
+ f"assignee {assignee_id}. "
3421
+ f"Dependencies met."
3422
+ )
3423
+ await self._post_task(task, assignee_id)
3424
+ posted_tasks.append(task)
3425
+ elif any_dep_failed:
3426
+ # Check if any failed dependencies can still be retried
3427
+ failed_deps = [
3428
+ dep_id
3429
+ for dep_id in dependencies
3430
+ if completed_tasks_info[dep_id] == TaskState.FAILED
3431
+ ]
3432
+
3433
+ # Check if any failed dependency is still retryable
3434
+ failed_tasks_with_retry_potential = []
3435
+ permanently_failed_deps = []
3436
+
3437
+ for dep_id in failed_deps:
3438
+ # Find the failed dependency task
3439
+ failed_task = next(
3440
+ (
3441
+ t
3442
+ for t in self._completed_tasks
3443
+ if t.id == dep_id
3444
+ ),
3445
+ None,
3446
+ )
3447
+ if (
3448
+ failed_task
3449
+ and failed_task.failure_count
3450
+ < MAX_TASK_RETRIES
3451
+ ):
3452
+ failed_tasks_with_retry_potential.append(
3453
+ dep_id
3454
+ )
3455
+ else:
3456
+ permanently_failed_deps.append(dep_id)
3457
+
3458
+ # Only fail the task if ALL dependencies are
3459
+ # permanently failed
3460
+ if (
3461
+ permanently_failed_deps
3462
+ and not failed_tasks_with_retry_potential
3463
+ ):
3464
+ logger.error(
3465
+ f"Task {task.id} cannot proceed: dependencies "
3466
+ f"{permanently_failed_deps} have "
3467
+ f"permanently failed. "
3468
+ f"Marking task as failed."
3469
+ )
3470
+ task.state = TaskState.FAILED
3471
+ task.result = (
3472
+ f"Task failed due to permanently "
3473
+ f"failed dependencies: "
3474
+ f"{permanently_failed_deps}"
3475
+ )
3476
+
3477
+ # Log the failure to metrics
3478
+ task_failed_event = TaskFailedEvent(
3479
+ task_id=task.id,
3480
+ worker_id=task.assigned_worker_id or "unknown",
3481
+ error_message=task.result,
3482
+ metadata={
3483
+ 'failure_reason': 'dependency_failure',
3484
+ 'failed_dependencies': (
3485
+ permanently_failed_deps
3486
+ ),
3487
+ },
3488
+ )
3489
+ for cb in self._callbacks:
3490
+ cb.log_task_failed(task_failed_event)
3491
+
3492
+ self._completed_tasks.append(task)
3493
+ self._cleanup_task_tracking(task.id)
3494
+ posted_tasks.append(task) # Remove from pending
3495
+ else:
3496
+ # Some dependencies may still be retried, keep
3497
+ # task pending
3498
+ logger.debug(
3499
+ f"Task {task.id} waiting: dependencies "
3500
+ f"{failed_tasks_with_retry_potential} "
3501
+ f"failed but may be retried "
3502
+ f"(attempt < {MAX_TASK_RETRIES})"
3503
+ )
3504
+ # else: Not all dependencies completed yet, skip this task
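Pre-computing `completed_tasks_info` turns each dependency check above into an O(1) dict lookup instead of a scan over `self._completed_tasks`. The three gating predicates in isolation, using plain string states for brevity:

```python
completed = {"t1": "DONE", "t2": "FAILED"}  # task_id -> final state
deps = ["t1", "t2"]

all_completed = all(d in completed for d in deps)
all_done = all_completed and all(completed[d] == "DONE" for d in deps)
any_failed = all_completed and any(completed[d] == "FAILED" for d in deps)

assert (all_completed, all_done, any_failed) == (True, False, True)
```

Only when every dependency has reached a terminal state does the task either get posted (all `DONE`) or enter the failed-dependency handling (some `FAILED`); otherwise it simply stays pending.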
1340
3505
 
1341
3506
  # Step 3: Remove the posted tasks from the pending list
1342
3507
  for task in posted_tasks:
@@ -1348,80 +3513,127 @@ class Workforce(BaseNode):
1348
3513
  pass
1349
3514
 
1350
3515
  async def _handle_failed_task(self, task: Task) -> bool:
3516
+ r"""Handle a task that failed during execution.
3517
+
3518
+ Args:
3519
+ task (Task): The failed task.
3520
+
3521
+ Returns:
3522
+ bool: True if the workforce should halt, False otherwise.
3523
+ """
1351
3524
  task.failure_count += 1
1352
3525
 
1353
- if self.metrics_logger:
1354
- worker_id = self._assignees.get(task.id)
1355
- self.metrics_logger.log_task_failed(
1356
- task_id=task.id,
1357
- worker_id=worker_id,
1358
- error_message=task.result or "Task execution failed",
1359
- error_type="TaskFailure",
1360
- metadata={'failure_count': task.failure_count},
1361
- )
3526
+ # Determine detailed failure information
3527
+ failure_reason = task.result or "Unknown error"
3528
+ worker_id = task.assigned_worker_id or "unknown"
3529
+ detailed_error = f"{failure_reason} (assigned to worker: {worker_id})"
1362
3530
 
1363
- if task.failure_count >= 3:
1364
- return True
3531
+ logger.error(
3532
+ f"Task {task.id} failed (attempt "
3533
+ f"{task.failure_count}/{MAX_TASK_RETRIES}): {detailed_error}"
3534
+ )
1365
3535
 
1366
- if task.get_depth() >= 3:
1367
- # Create a new worker node and reassign
1368
- assignee = self._create_worker_node_for_task(task)
3536
+ print(
3537
+ f"{Fore.RED}❌ Task {task.id} failed "
3538
+ f"(attempt {task.failure_count}/{MAX_TASK_RETRIES}): "
3539
+ f"{failure_reason}{Fore.RESET}"
3540
+ )
1369
3541
 
1370
- # Sync shared memory after creating new worker to provide context
1371
- if self.share_memory:
1372
- logger.info(
1373
- f"Syncing shared memory after creating new worker "
1374
- f"{assignee.node_id} for failed task {task.id}"
1375
- )
1376
- self._sync_shared_memory()
3542
+ task_failed_event = TaskFailedEvent(
3543
+ task_id=task.id,
3544
+ worker_id=worker_id,
3545
+ error_message=detailed_error,
3546
+ metadata={
3547
+ 'failure_count': task.failure_count,
3548
+ 'task_content': task.content,
3549
+ 'result_length': len(task.result) if task.result else 0,
3550
+ },
3551
+ )
3552
+ for cb in self._callbacks:
3553
+ cb.log_task_failed(task_failed_event)
3554
+
3555
+ # Check for immediate halt conditions
3556
+ if task.failure_count >= MAX_TASK_RETRIES:
3557
+ logger.error(
3558
+ f"Task {task.id} has exceeded maximum retry attempts "
3559
+ f"({MAX_TASK_RETRIES}). Final failure reason: "
3560
+ f"{detailed_error}. "
3561
+ f"Task content: '{task.content}'"
3562
+ )
3563
+ self._cleanup_task_tracking(task.id)
3564
+ self._completed_tasks.append(task)
3565
+ if task.id in self._assignees:
3566
+ await self._channel.archive_task(task.id)
3567
+ return True
1377
3568
 
1378
- await self._post_task(task, assignee.node_id)
1379
- action_taken = f"reassigned to new worker {assignee.node_id}"
1380
- else:
1381
- subtasks = self._decompose_task(task)
1382
- if self.metrics_logger and subtasks:
1383
- self.metrics_logger.log_task_decomposed(
1384
- parent_task_id=task.id,
1385
- subtask_ids=[st.id for st in subtasks],
1386
- )
1387
- for subtask in subtasks:
1388
- self.metrics_logger.log_task_created(
1389
- task_id=subtask.id,
1390
- description=subtask.content,
1391
- parent_task_id=task.id,
1392
- task_type=subtask.type,
1393
- metadata=subtask.additional_info,
1394
- )
1395
- # Insert packets at the head of the queue
1396
- self._pending_tasks.extendleft(reversed(subtasks))
3569
+ if len(self._pending_tasks) > MAX_PENDING_TASKS_LIMIT:
3570
+ logger.error(
3571
+ f"Too many pending tasks ({len(self._pending_tasks)} > "
3572
+ f"{MAX_PENDING_TASKS_LIMIT}). Halting to prevent task "
3573
+ f"explosion. Last failed task: {task.id}"
3574
+ )
3575
+ self._cleanup_task_tracking(task.id)
3576
+ self._completed_tasks.append(task)
3577
+ if task.id in self._assignees:
3578
+ await self._channel.archive_task(task.id)
3579
+ return True
1397
3580
 
1398
- # Sync shared memory after task decomposition
1399
- if self.share_memory:
1400
- logger.info(
1401
- f"Syncing shared memory after decomposing failed "
1402
- f"task {task.id}"
1403
- )
1404
- self._sync_shared_memory()
3581
+ # Use intelligent failure analysis to decide recovery strategy
3582
+ recovery_decision = self._analyze_task(
3583
+ task, for_failure=True, error_message=detailed_error
3584
+ )
1405
3585
 
1406
- await self._post_ready_tasks()
1407
- action_taken = f"decomposed into {len(subtasks)} subtasks"
3586
+ strategy_str = (
3587
+ recovery_decision.recovery_strategy.value
3588
+ if recovery_decision.recovery_strategy
3589
+ else "none"
3590
+ )
3591
+ logger.info(
3592
+ f"Task {task.id} failure "
3593
+ f"analysis: {strategy_str} - "
3594
+ f"{recovery_decision.reasoning}"
3595
+ )
3596
+
3597
+ # Clean up tracking before attempting recovery
1408
3598
  if task.id in self._assignees:
1409
3599
  await self._channel.archive_task(task.id)
3600
+ self._cleanup_task_tracking(task.id)
3601
+
3602
+ # Apply recovery strategy
3603
+ try:
3604
+ is_decompose = await self._apply_recovery_strategy(
3605
+ task, recovery_decision
3606
+ )
1410
3607
 
3608
+ # For decompose, we handle it specially
3609
+ if is_decompose:
3610
+ # Task was decomposed, add to completed tasks
3611
+ self._completed_tasks.append(task)
3612
+ return False
3613
+
3614
+ except Exception as e:
3615
+ logger.error(
3616
+ f"Recovery strategy failed for task {task.id}: {e}",
3617
+ exc_info=True,
3618
+ )
3619
+ # If max retries reached, halt the workforce
3620
+ if task.failure_count >= MAX_TASK_RETRIES:
3621
+ self._completed_tasks.append(task)
3622
+ return True
3623
+ self._completed_tasks.append(task)
3624
+ return False
3625
+
3626
+ # Task is being retried - don't add to completed tasks
3627
+ # It will be added when it actually completes or permanently fails
1411
3628
  logger.debug(
1412
- f"Task {task.id} failed and was {action_taken}. "
1413
- f"Updating dependency state."
3629
+ f"Task {task.id} is being retried (strategy: "
3630
+ f"{recovery_decision.recovery_strategy}). "
3631
+ f"Not adding to completed tasks until final outcome."
1414
3632
  )
1415
- # Mark task as completed for dependency tracking
1416
- self._completed_tasks.append(task)
1417
3633
 
1418
- # Post next ready tasks
1419
-
1420
- # Sync shared memory after task completion to share knowledge
3634
+ # Sync shared memory after task recovery
1421
3635
  if self.share_memory:
1422
- logger.info(
1423
- f"Syncing shared memory after task {task.id} completion"
1424
- )
3636
+ logger.info(f"Syncing shared memory after task {task.id} recovery")
1425
3637
  self._sync_shared_memory()
1426
3638
 
1427
3639
  # Check if any pending tasks are now ready to execute
@@ -1429,33 +3641,34 @@ class Workforce(BaseNode):
1429
3641
  return False
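Before any recovery is attempted, two guards in the method above can halt the run outright: retry exhaustion and a runaway pending queue. Reduced to a single predicate — constant names as used in the hunks, values assumed for illustration:

```python
MAX_TASK_RETRIES = 3            # assumed values; the module defines
MAX_PENDING_TASKS_LIMIT = 100   # the real constants

def should_halt(failure_count: int, pending_count: int) -> bool:
    # Halt on retry exhaustion or on task explosion, mirroring the two
    # early-return guards in _handle_failed_task.
    return (
        failure_count >= MAX_TASK_RETRIES
        or pending_count > MAX_PENDING_TASKS_LIMIT
    )
```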
1430
3642
 
1431
3643
  async def _handle_completed_task(self, task: Task) -> None:
1432
- if self.metrics_logger:
1433
- worker_id = self._assignees.get(task.id, "unknown")
1434
- processing_time_seconds = None
1435
- token_usage = None
1436
-
1437
- # Get processing time from task start time or additional info
1438
- if task.id in self._task_start_times:
1439
- processing_time_seconds = (
1440
- time.time() - self._task_start_times[task.id]
1441
- )
1442
- del self._task_start_times[task.id] # Prevent memory leaks
1443
- elif (
1444
- task.additional_info is not None
1445
- and 'processing_time_seconds' in task.additional_info
1446
- ):
1447
- processing_time_seconds = task.additional_info[
1448
- 'processing_time_seconds'
1449
- ]
1450
-
1451
- # Get token usage from task additional info
1452
- if (
1453
- task.additional_info is not None
1454
- and 'token_usage' in task.additional_info
1455
- ):
1456
- token_usage = task.additional_info['token_usage']
3644
+ worker_id = task.assigned_worker_id or "unknown"
3645
+ processing_time_seconds = None
3646
+ token_usage = None
3647
+
3648
+ # Get processing time from task start time or additional info
3649
+ if task.id in self._task_start_times:
3650
+ processing_time_seconds = (
3651
+ time.time() - self._task_start_times[task.id]
3652
+ )
3653
+ self._cleanup_task_tracking(task.id)
3654
+ elif (
3655
+ task.additional_info is not None
3656
+ and 'processing_time_seconds' in task.additional_info
3657
+ ):
3658
+ processing_time_seconds = task.additional_info[
3659
+ 'processing_time_seconds'
3660
+ ]
1457
3661
 
1458
- # Try to get token usage from SingleAgentWorker memory if available
3662
+ # Get token usage from task additional info (preferred - actual
3663
+ # usage)
3664
+ if (
3665
+ task.additional_info is not None
3666
+ and 'token_usage' in task.additional_info
3667
+ ):
3668
+ token_usage = task.additional_info['token_usage']
3669
+ else:
3670
+ # Fallback: Try to get token usage from SingleAgentWorker
3671
+ # memory
1459
3672
  assignee_node = next(
1460
3673
  (
1461
3674
  child
@@ -1465,18 +3678,23 @@ class Workforce(BaseNode):
1465
3678
  None,
1466
3679
  )
1467
3680
  if isinstance(assignee_node, SingleAgentWorker):
1468
- _, total_tokens = assignee_node.worker.memory.get_context()
1469
- token_usage = {'total_tokens': total_tokens}
1470
-
1471
- # Log the completed task
1472
- self.metrics_logger.log_task_completed(
1473
- task_id=task.id,
1474
- worker_id=worker_id,
1475
- result_summary=task.result if task.result else "Completed",
1476
- processing_time_seconds=processing_time_seconds,
1477
- token_usage=token_usage,
1478
- metadata={'current_state': task.state.value},
1479
- )
3681
+ try:
3682
+ _, total_tokens = assignee_node.worker.memory.get_context()
3683
+ token_usage = {'total_tokens': total_tokens}
3684
+ except Exception:
3685
+ token_usage = None
3686
+
3687
+ # Log the completed task
3688
+ task_completed_event = TaskCompletedEvent(
3689
+ task_id=task.id,
3690
+ worker_id=worker_id,
3691
+ result_summary=task.result if task.result else "Completed",
3692
+ processing_time_seconds=processing_time_seconds,
3693
+ token_usage=token_usage,
3694
+ metadata={'current_state': task.state.value},
3695
+ )
3696
+ for cb in self._callbacks:
3697
+ cb.log_task_completed(task_completed_event)
1480
3698
 
1481
3699
  # Find and remove the completed task from pending tasks
1482
3700
  tasks_list = list(self._pending_tasks)
@@ -1495,31 +3713,65 @@ class Workforce(BaseNode):
1495
3713
  break
1496
3714
 
1497
3715
  if not found_and_removed:
1498
- # Task was already removed from pending queue (expected case when
1499
- # it had been popped immediately after posting). No need to
1500
- # draw user attention with a warning; record at debug level.
3716
+ # Task was already removed from pending queue (common case when
3717
+ # it was posted and removed immediately).
1501
3718
  logger.debug(
1502
3719
  f"Completed task {task.id} was already removed from pending "
1503
- "queue."
3720
+ "queue (normal for posted tasks)."
1504
3721
  )
1505
3722
 
1506
3723
  # Archive the task and update dependency tracking
1507
3724
  if task.id in self._assignees:
1508
3725
  await self._channel.archive_task(task.id)
1509
3726
 
1510
- # Ensure it's in completed tasks set
1511
- self._completed_tasks.append(task)
3727
+ # Ensure it's in completed tasks set by updating if it exists or
3728
+ # appending if it's new.
3729
+ task_found_in_completed = False
3730
+ for i, t in enumerate(self._completed_tasks):
3731
+ if t.id == task.id:
3732
+ self._completed_tasks[i] = task
3733
+ task_found_in_completed = True
3734
+ break
3735
+ if not task_found_in_completed:
3736
+ self._completed_tasks.append(task)
1512
3737
 
1513
3738
  # Handle parent task completion logic
1514
3739
  parent = task.parent
1515
- if parent and parent.id not in {t.id for t in self._completed_tasks}:
3740
+ if parent:
3741
+ # Check if all subtasks are completed and successful
1516
3742
  all_subtasks_done = all(
1517
- sub.id in {t.id for t in self._completed_tasks}
3743
+ any(
3744
+ t.id == sub.id and t.state == TaskState.DONE
3745
+ for t in self._completed_tasks
3746
+ )
1518
3747
  for sub in parent.subtasks
1519
3748
  )
1520
3749
  if all_subtasks_done:
1521
- # Set the parent task state to done
3750
+ # Collect results from successful subtasks only
3751
+ successful_results = []
3752
+ for sub in parent.subtasks:
3753
+ completed_subtask = next(
3754
+ (
3755
+ t
3756
+ for t in self._completed_tasks
3757
+ if t.id == sub.id and t.state == TaskState.DONE
3758
+ ),
3759
+ None,
3760
+ )
3761
+ if completed_subtask and completed_subtask.result:
3762
+ successful_results.append(
3763
+ f"--- Subtask {sub.id} Result ---\n"
3764
+ f"{completed_subtask.result}"
3765
+ )
3766
+
3767
+ # Set parent task state and result
1522
3768
  parent.state = TaskState.DONE
3769
+ parent.result = (
3770
+ "\n\n".join(successful_results)
3771
+ if successful_results
3772
+ else "All subtasks completed"
3773
+ )
3774
+
1523
3775
  logger.debug(
1524
3776
  f"All subtasks of {parent.id} are done. "
1525
3777
  f"Marking parent as complete."
@@ -1562,15 +3814,23 @@ class Workforce(BaseNode):
1562
3814
  r"""Returns an ASCII tree representation of the task hierarchy and
1563
3815
  worker status.
1564
3816
  """
1565
- if not self.metrics_logger:
1566
- return "Logger not initialized."
1567
- return self.metrics_logger.get_ascii_tree_representation()
3817
+ metrics_cb: List[WorkforceMetrics] = [
3818
+ cb for cb in self._callbacks if isinstance(cb, WorkforceMetrics)
3819
+ ]
3820
+ if len(metrics_cb) == 0:
3821
+ return "Metrics Callback not initialized."
3822
+ else:
3823
+ return metrics_cb[0].get_ascii_tree_representation()
1568
3824
 
1569
3825
  def get_workforce_kpis(self) -> Dict[str, Any]:
1570
3826
  r"""Returns a dictionary of key performance indicators."""
1571
- if not self.metrics_logger:
1572
- return {"error": "Logger not initialized."}
1573
- return self.metrics_logger.get_kpis()
3827
+ metrics_cb: List[WorkforceMetrics] = [
3828
+ cb for cb in self._callbacks if isinstance(cb, WorkforceMetrics)
3829
+ ]
3830
+ if len(metrics_cb) == 0:
3831
+ return {"error": "Metrics Callback not initialized."}
3832
+ else:
3833
+ return metrics_cb[0].get_kpis()
1574
3834
 
1575
3835
  def dump_workforce_logs(self, file_path: str) -> None:
1576
3836
  r"""Dumps all collected logs to a JSON file.
@@ -1578,13 +3838,133 @@ class Workforce(BaseNode):
1578
3838
  Args:
1579
3839
  file_path (str): The path to the JSON file.
1580
3840
  """
1581
- if not self.metrics_logger:
3841
+ metrics_cb: List[WorkforceMetrics] = [
3842
+ cb for cb in self._callbacks if isinstance(cb, WorkforceMetrics)
3843
+ ]
3844
+ if len(metrics_cb) == 0:
1582
3845
  print("Logger not initialized. Cannot dump logs.")
1583
3846
  return
1584
- self.metrics_logger.dump_to_json(file_path)
3847
+ metrics_cb[0].dump_to_json(file_path)
1585
3848
  # Use logger.info or print, consistent with existing style
1586
3849
  logger.info(f"Workforce logs dumped to {file_path}")
1587
3850
 
3851
+ async def _handle_skip_task(self) -> bool:
3852
+ r"""Handle skip request by marking pending and in-flight tasks
3853
+ as completed.
3854
+
3855
+ Returns:
3856
+ bool: True if the workforce should stop (no pending tasks remain),
3857
+ False to continue.
3858
+ """
3859
+ logger.info("Skip requested, processing skip logic.")
3860
+
3861
+ # Mark all pending tasks as completed instead of just clearing
3862
+ pending_tasks_to_complete = list(self._pending_tasks)
3863
+ if pending_tasks_to_complete:
3864
+ logger.info(
3865
+ f"Marking {len(pending_tasks_to_complete)} pending tasks "
3866
+ f"as completed."
3867
+ )
3868
+ for task in pending_tasks_to_complete:
3869
+ # Don't remove tasks that need decomposition
3870
+ if task.additional_info and task.additional_info.get(
3871
+ '_needs_decomposition', False
3872
+ ):
3873
+ continue
3874
+ # Set task state to DONE and add a completion message
3875
+ task.state = TaskState.DONE
3876
+ task.result = "Task marked as completed due to skip request"
3877
+
3878
+ # Use the existing handle completed task function
3879
+ await self._handle_completed_task(task)
3880
+
3881
+ # Handle in-flight tasks if they exist
3882
+ if self._in_flight_tasks > 0:
3883
+ logger.info(
3884
+ f"Found {self._in_flight_tasks} in-flight tasks. "
3885
+ f"Retrieving and completing them."
3886
+ )
3887
+ try:
3888
+ # Get all in-flight tasks for this publisher from the channel
3889
+ in_flight_tasks = await self._channel.get_in_flight_tasks(
3890
+ self.node_id
3891
+ )
3892
+ logger.info(
3893
+ f"Retrieved {len(in_flight_tasks)} in-flight "
3894
+ f"tasks from channel."
3895
+ )
3896
+
3897
+ for task in in_flight_tasks:
3898
+ # Set task state to DONE and add a completion message
3899
+ task.state = TaskState.DONE
3900
+ task.result = (
3901
+ "Task marked as completed due to skip request"
3902
+ )
3903
+
3904
+ # Remove the task from the channel to avoid hanging
3905
+ await self._channel.remove_task(task.id)
3906
+
3907
+ # Decrement in-flight counter
3908
+ self._decrement_in_flight_tasks(
3909
+ task.id, "skip request - removed from channel"
3910
+ )
3911
+
3912
+ # Handle as completed task to update dependencies
3913
+ await self._handle_completed_task(task)
3914
+
3915
+ logger.info(
3916
+ f"Completed in-flight task {task.id} due "
3917
+ f"to skip request."
3918
+ )
3919
+
3920
+ except Exception as e:
3921
+ logger.error(
3922
+ f"Error handling in-flight tasks during skip: {e}",
3923
+ exc_info=True,
3924
+ )
3925
+ # Reset in-flight counter to prevent hanging
3926
+ self._in_flight_tasks = 0
3927
+
3928
+ # Check if there are any main pending tasks after filtering
3929
+ if self._pending_tasks:
3930
+ # Check if the first pending task needs decomposition
3931
+ next_task = self._pending_tasks[0]
3932
+ if next_task.additional_info and next_task.additional_info.get(
3933
+ '_needs_decomposition'
3934
+ ):
3935
+ logger.info(
3936
+ f"Decomposing main task {next_task.id} after skip request."
3937
+ )
3938
+ try:
3939
+ # Remove the decomposition flag to avoid re-decomposition
3940
+ next_task.additional_info['_needs_decomposition'] = False
3941
+
3942
+ # Decompose the task and append subtasks to _pending_tasks
3943
+ await self.handle_decompose_append_task(
3944
+ next_task, reset=False
3945
+ )
3946
+
3947
+ # Mark the main task as completed and remove from pending
3948
+ await self._handle_completed_task(next_task)
3949
+ logger.info(
3950
+ f"Main task {next_task.id} decomposed after "
3951
+ f"skip request"
3952
+ )
3953
+ except Exception as e:
3954
+ logger.error(
3955
+ f"Error decomposing main task {next_task.id} "
3956
+ f"after skip: {e}",
3957
+ exc_info=True,
3958
+ )
3959
+
3960
+ logger.info("Pending tasks available after skip, continuing.")
3961
+ await self._post_ready_tasks()
3962
+ return False # Continue processing
3963
+ else:
3964
+ # No pending tasks available, so treat the skip as a stop
3965
+ logger.info("No pending tasks available; treating skip as stop.")
3966
+ return True # Stop processing
3967
+
1588
3968
  @check_if_running(False)
1589
3969
  async def _listen_to_channel(self) -> None:
1590
3970
  r"""Continuously listen to the channel, post task to the channel and
@@ -1613,6 +3993,75 @@ class Workforce(BaseNode):
1613
3993
  logger.info("Stop requested, breaking execution loop.")
1614
3994
  break
1615
3995
 
3996
+ # Check for skip request after potential pause
3997
+ if self._skip_requested:
3998
+ should_stop = await self._handle_skip_task()
3999
+ if should_stop:
4000
+ self._stop_requested = True
4001
+ break
4002
+
4003
+ # Reset skip flag
4004
+ self._skip_requested = False
4005
+ continue
4006
+
4007
+ # Check if we should decompose a main task
4008
+ # Only decompose when no tasks are in flight and pending queue
4009
+ # is empty
4010
+ if not self._pending_tasks and self._in_flight_tasks == 0:
4011
+ # All tasks completed, will exit loop
4012
+ break
4013
+
4014
+ # Check if the first pending task needs decomposition
4015
+ # This happens when add_task(as_subtask=False) was called
4016
+ if self._pending_tasks and self._in_flight_tasks == 0:
4017
+ next_task = self._pending_tasks[0]
4018
+ if (
4019
+ next_task.additional_info
4020
+ and next_task.additional_info.get(
4021
+ '_needs_decomposition'
4022
+ )
4023
+ ):
4024
+ logger.info(f"Decomposing main task: {next_task.id}")
4025
+ try:
4026
+ # Remove the decomposition flag to avoid
4027
+ # re-decomposition
4028
+ next_task.additional_info[
4029
+ '_needs_decomposition'
4030
+ ] = False
4031
+
4032
+ # Decompose the task and append subtasks to
4033
+ # _pending_tasks
4034
+ await self.handle_decompose_append_task(
4035
+ next_task, reset=False
4036
+ )
4037
+
4038
+ # Mark the main task as completed (decomposition
4039
+ # successful) and Remove it from pending tasks
4040
+ await self._handle_completed_task(next_task)
4041
+ logger.info(
4042
+ f"Main task {next_task.id} decomposed and "
4043
+ f"ready for processing"
4044
+ )
4045
+ except Exception as e:
4046
+ logger.error(
4047
+ f"Error decomposing main task {next_task.id}: "
4048
+ f"{e}",
4049
+ exc_info=True,
4050
+ )
4051
+ # Revert back to the queue for retry later if
4052
+ # decomposition failed
4053
+ if not self._pending_tasks:
4054
+ self._pending_tasks.appendleft(next_task)
4055
+ else:
4056
+ logger.warning(
4057
+ "Pending tasks exist after decomposition "
4058
+ "error."
4059
+ )
4060
+
4061
+ # Immediately assign and post the transferred tasks
4062
+ await self._post_ready_tasks()
4063
+ continue
4064
+
1616
4065
  # Save snapshot before processing next task
1617
4066
  if self._pending_tasks:
1618
4067
  current_task = self._pending_tasks[0]
@@ -1626,9 +4075,37 @@ class Workforce(BaseNode):
1626
4075
  )
1627
4076
  self._last_snapshot_time = time.time()
1628
4077
 
1629
- # Get returned task (this may block until a task is returned)
1630
- returned_task = await self._get_returned_task()
1631
- self._in_flight_tasks -= 1
4078
+ # Get returned task
4079
+ try:
4080
+ returned_task = await self._get_returned_task()
4081
+ except asyncio.TimeoutError:
4082
+ # Handle timeout - check if we have tasks stuck in flight
4083
+ if self._in_flight_tasks > 0:
4084
+ logger.warning(
4085
+ f"Timeout waiting for {self._in_flight_tasks} "
4086
+ f"in-flight tasks. Breaking to prevent hanging."
4087
+ )
4088
+ # Break the loop to prevent indefinite hanging
4089
+ # The finally block will handle cleanup
4090
+ break
4091
+ else:
4092
+ # No tasks in flight, safe to continue
4093
+ await self._post_ready_tasks()
4094
+ continue
4095
+
4096
+ # If no task was returned (other errors), continue
4097
+ if returned_task is None:
4098
+ logger.debug(
4099
+ f"No task returned in workforce {self.node_id}. "
4100
+ f"Pending: {len(self._pending_tasks)}, "
4101
+ f"In-flight: {self._in_flight_tasks}"
4102
+ )
4103
+ await self._post_ready_tasks()
4104
+ continue
4105
+
4106
+ self._decrement_in_flight_tasks(
4107
+ returned_task.id, "task returned successfully"
4108
+ )
1632
4109
 
1633
4110
  # Check for stop request after getting task
1634
4111
  if self._stop_requested:
@@ -1637,24 +4114,178 @@ class Workforce(BaseNode):
1637
4114
 
1638
4115
  # Process the returned task based on its state
1639
4116
  if returned_task.state == TaskState.DONE:
1640
- print(
1641
- f"{Fore.CYAN}🎯 Task {returned_task.id} completed "
1642
- f"successfully.{Fore.RESET}"
1643
- )
1644
- await self._handle_completed_task(returned_task)
4117
+ # Check if the "completed" task actually failed to provide
4118
+ # useful results
4119
+ if is_task_result_insufficient(returned_task):
4120
+ result_preview = (
4121
+ returned_task.result
4122
+ if returned_task.result
4123
+ else "No result"
4124
+ )
4125
+ logger.warning(
4126
+ f"Task {returned_task.id} marked as DONE but "
4127
+ f"result is insufficient. "
4128
+ f"Treating as failed. Result: '{result_preview}'"
4129
+ )
4130
+ returned_task.state = TaskState.FAILED
4131
+ try:
4132
+ halt = await self._handle_failed_task(
4133
+ returned_task
4134
+ )
4135
+ if not halt:
4136
+ continue
4137
+
4138
+ # Do not halt if we have main tasks in queue
4139
+ if len(self.get_main_task_queue()) > 0:
4140
+ print(
4141
+ f"{Fore.RED}Task {returned_task.id} has "
4142
+ f"failed for {MAX_TASK_RETRIES} times "
4143
+ f"after insufficient results, skipping "
4144
+ f"that task. Final error: "
4145
+ f"{returned_task.result or 'Unknown err'}"
4146
+ f"{Fore.RESET}"
4147
+ )
4148
+ self._skip_requested = True
4149
+ continue
4150
+
4151
+ print(
4152
+ f"{Fore.RED}Task {returned_task.id} has "
4153
+ f"failed for {MAX_TASK_RETRIES} times after "
4154
+ f"insufficient results, halting the "
4155
+ f"workforce. Final error: "
4156
+ f"{returned_task.result or 'Unknown error'}"
4157
+ f"{Fore.RESET}"
4158
+ )
4159
+ await self._graceful_shutdown(returned_task)
4160
+ break
4161
+ except Exception as e:
4162
+ logger.error(
4163
+ f"Error handling insufficient task result "
4164
+ f"{returned_task.id}: {e}",
4165
+ exc_info=True,
4166
+ )
4167
+ continue
4168
+ else:
4169
+ quality_eval = self._analyze_task(
4170
+ returned_task, for_failure=False
4171
+ )
4172
+
4173
+ if not quality_eval.quality_sufficient:
4174
+ logger.info(
4175
+ f"Task {returned_task.id} quality check: "
4176
+ f"score={quality_eval.quality_score}, "
4177
+ f"issues={quality_eval.issues}, "
4178
+ f"strategy={quality_eval.recovery_strategy}"
4179
+ )
4180
+
4181
+ # Check retry limit before attempting recovery
4182
+ if returned_task.failure_count >= 2:
4183
+ print(
4184
+ f"{Fore.YELLOW}Task {returned_task.id} "
4185
+ f"completed with low quality score: "
4186
+ f"{quality_eval.quality_score} "
4187
+ f"(retry limit reached){Fore.RESET}"
4188
+ )
4189
+ await self._handle_completed_task(
4190
+ returned_task
4191
+ )
4192
+ continue
4193
+
4194
+ # Print visual feedback for quality-failed tasks
4195
+ # with recovery strategy
4196
+ recovery_action = (
4197
+ quality_eval.recovery_strategy.value
4198
+ if quality_eval.recovery_strategy
4199
+ else ""
4200
+ )
4201
+ print(
4202
+ f"{Fore.YELLOW}⚠️ Task {returned_task.id} "
4203
+ f"failed quality check (score: "
4204
+ f"{quality_eval.quality_score}). "
4205
+ f"Issues: {', '.join(quality_eval.issues)}. "
4206
+ f"Recovery: {recovery_action}{Fore.RESET}"
4207
+ )
4208
+
4209
+ # Mark as failed for recovery
4210
+ returned_task.failure_count += 1
4211
+ returned_task.state = TaskState.FAILED
4212
+ returned_task.result = (
4213
+ f"Quality insufficient (score: "
4214
+ f"{quality_eval.quality_score}). "
4215
+ f"Issues: {', '.join(quality_eval.issues)}"
4216
+ )
4217
+
4218
+ # Clean up tracking before attempting recovery
4219
+ if returned_task.id in self._assignees:
4220
+ await self._channel.archive_task(
4221
+ returned_task.id
4222
+ )
4223
+ self._cleanup_task_tracking(returned_task.id)
4224
+
4225
+ # Apply LLM-recommended recovery strategy
4226
+ try:
4227
+ is_decompose = (
4228
+ await self._apply_recovery_strategy(
4229
+ returned_task, quality_eval
4230
+ )
4231
+ )
4232
+
4233
+ # For decompose, cleanup happens in the method
4234
+ if is_decompose:
4235
+ continue
4236
+
4237
+ except Exception as e:
4238
+ logger.error(
4239
+ f"Error handling quality-failed task "
4240
+ f"{returned_task.id}: {e}",
4241
+ exc_info=True,
4242
+ )
4243
+ continue
4244
+ else:
4245
+ print(
4246
+ f"{Fore.CYAN}Task {returned_task.id} "
4247
+ f"completed successfully (quality score: "
4248
+ f"{quality_eval.quality_score}).{Fore.RESET}"
4249
+ )
4250
+ await self._handle_completed_task(returned_task)
1645
4251
  elif returned_task.state == TaskState.FAILED:
1646
- halt = await self._handle_failed_task(returned_task)
1647
- if not halt:
4252
+ try:
4253
+ halt = await self._handle_failed_task(returned_task)
4254
+ if not halt:
4255
+ continue
4256
+
4257
+ # Do not halt if we have main tasks in queue
4258
+ if len(self.get_main_task_queue()) > 0:
4259
+ print(
4260
+ f"{Fore.RED}Task {returned_task.id} has "
4261
+ f"failed for {MAX_TASK_RETRIES} times, "
4262
+ f"skipping that task. Final error: "
4263
+ f"{returned_task.result or 'Unknown error'}"
4264
+ f"{Fore.RESET}"
4265
+ )
4266
+ self._skip_requested = True
4267
+ continue
4268
+
4269
+ print(
4270
+ f"{Fore.RED}Task {returned_task.id} has failed "
4271
+ f"for {MAX_TASK_RETRIES} times, halting "
4272
+ f"the workforce. Final error: "
4273
+ f"{returned_task.result or 'Unknown error'}"
4274
+ f"{Fore.RESET}"
4275
+ )
4276
+ # Graceful shutdown instead of immediate break
4277
+ await self._graceful_shutdown(returned_task)
4278
+ break
4279
+ except Exception as e:
4280
+ logger.error(
4281
+ f"Error handling failed task "
4282
+ f"{returned_task.id}: {e}",
4283
+ exc_info=True,
4284
+ )
4285
+ # Continue to prevent hanging
1648
4286
  continue
1649
- print(
1650
- f"{Fore.RED}Task {returned_task.id} has failed "
1651
- f"for 3 times, halting the workforce.{Fore.RESET}"
1652
- )
1653
- # Graceful shutdown instead of immediate break
1654
- await self._graceful_shutdown(returned_task)
1655
- break
1656
4287
  elif returned_task.state == TaskState.OPEN:
1657
- # TODO: multi-layer workforce
4288
+ # TODO: Add logic for OPEN
1658
4289
  pass
1659
4290
  else:
1660
4291
  raise ValueError(
@@ -1662,7 +4293,19 @@ class Workforce(BaseNode):
1662
4293
  )
1663
4294
 
1664
4295
  except Exception as e:
1665
- logger.error(f"Error processing task: {e}")
4296
+ # Decrement in-flight counter to prevent hanging
4297
+ self._decrement_in_flight_tasks(
4298
+ "unknown", "exception in task processing loop"
4299
+ )
4300
+
4301
+ logger.error(
4302
+ f"Error processing task in workforce {self.node_id}: {e}"
4303
+ f"Workforce state - Pending tasks: "
4304
+ f"{len(self._pending_tasks)}, "
4305
+ f"In-flight tasks: {self._in_flight_tasks}, "
4306
+ f"Completed tasks: {len(self._completed_tasks)}"
4307
+ )
4308
+
1666
4309
  if self._stop_requested:
1667
4310
  break
1668
4311
  # Continue with next iteration unless stop is requested
@@ -1675,6 +4318,9 @@ class Workforce(BaseNode):
1675
4318
  elif not self._pending_tasks and self._in_flight_tasks == 0:
1676
4319
  self._state = WorkforceState.IDLE
1677
4320
  logger.info("All tasks completed.")
4321
+ all_tasks_completed_event = AllTasksCompletedEvent()
4322
+ for cb in self._callbacks:
4323
+ cb.log_all_tasks_completed(all_tasks_completed_event)
1678
4324
 
1679
4325
  # shut down the whole workforce tree
1680
4326
  self.stop()
@@ -1716,11 +4362,50 @@ class Workforce(BaseNode):
1716
4362
  r"""Stop all the child nodes under it. The node itself will be stopped
1717
4363
  by its parent node.
1718
4364
  """
4365
+ # Stop all child nodes first
1719
4366
  for child in self._children:
1720
4367
  if child._running:
1721
4368
  child.stop()
1722
- for child_task in self._child_listening_tasks:
1723
- child_task.cancel()
4369
+
4370
+ # Cancel child listening tasks
4371
+ if self._child_listening_tasks:
4372
+ try:
4373
+ loop = asyncio.get_running_loop()
4374
+ if loop and not loop.is_closed():
4375
+ # Create graceful cleanup task
4376
+ async def cleanup():
4377
+ await asyncio.sleep(0.1) # Brief grace period
4378
+ for task in self._child_listening_tasks:
4379
+ if not task.done():
4380
+ task.cancel()
4381
+
4382
+ # Handle both asyncio.Task and concurrent.futures.
4383
+ # Future
4384
+ awaitables = []
4385
+ for task in self._child_listening_tasks:
4386
+ if isinstance(task, concurrent.futures.Future):
4387
+ # Convert Future to awaitable
4388
+ awaitables.append(asyncio.wrap_future(task))
4389
+ else:
4390
+ # Already an asyncio.Task
4391
+ awaitables.append(task)
4392
+
4393
+ await asyncio.gather(
4394
+ *awaitables,
4395
+ return_exceptions=True,
4396
+ )
4397
+
4398
+ self._cleanup_task = loop.create_task(cleanup())
4399
+ else:
4400
+ # No active loop, cancel immediately
4401
+ for task in self._child_listening_tasks:
4402
+ task.cancel()
4403
+ except Exception as e:
4404
+ # Fallback: cancel immediately
4405
+ logger.debug(f"Exception during task cleanup: {e}")
4406
+ for task in self._child_listening_tasks:
4407
+ task.cancel()
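Shutdown must await a mixed collection of `asyncio.Task` objects and `concurrent.futures.Future` objects; the latter are converted with `asyncio.wrap_future` so `asyncio.gather` accepts them. A self-contained sketch of that cleanup, under an assumed name:

```python
import asyncio
import concurrent.futures

async def drain(tasks):
    # Cancel whatever is still running, then await everything,
    # converting thread-pool futures into awaitables first.
    for t in tasks:
        if not t.done():
            t.cancel()
    awaitables = [
        asyncio.wrap_future(t)
        if isinstance(t, concurrent.futures.Future) else t
        for t in tasks
    ]
    return await asyncio.gather(*awaitables, return_exceptions=True)
```

`return_exceptions=True` keeps one cancelled or failed child from aborting the cleanup of the rest.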
4408
+
1724
4409
  self._running = False
1725
4410
 
1726
4411
  def clone(self, with_memory: bool = False) -> 'Workforce':
@@ -1738,43 +4423,36 @@ class Workforce(BaseNode):
1738
4423
  """
1739
4424
 
1740
4425
  # Create a new instance with the same configuration
1741
- # Extract the original kwargs from the agents to properly clone them
1742
- coordinator_kwargs = (
1743
- getattr(self.coordinator_agent, 'init_kwargs', {}) or {}
1744
- )
1745
- task_kwargs = getattr(self.task_agent, 'init_kwargs', {}) or {}
1746
-
1747
4426
  new_instance = Workforce(
1748
4427
  description=self.description,
1749
- coordinator_agent_kwargs=coordinator_kwargs.copy(),
1750
- task_agent_kwargs=task_kwargs.copy(),
1751
- new_worker_agent_kwargs=self.new_worker_agent_kwargs.copy()
1752
- if self.new_worker_agent_kwargs
4428
+ coordinator_agent=self.coordinator_agent.clone(with_memory),
4429
+ task_agent=self.task_agent.clone(with_memory),
4430
+ new_worker_agent=self.new_worker_agent.clone(with_memory)
4431
+ if self.new_worker_agent
1753
4432
  else None,
1754
4433
  graceful_shutdown_timeout=self.graceful_shutdown_timeout,
1755
4434
  share_memory=self.share_memory,
1756
- )
1757
-
1758
- new_instance.task_agent = self.task_agent.clone(with_memory)
1759
- new_instance.coordinator_agent = self.coordinator_agent.clone(
1760
- with_memory
4435
+ use_structured_output_handler=self.use_structured_output_handler,
4436
+ task_timeout_seconds=self.task_timeout_seconds,
1761
4437
  )
1762
4438
 
1763
4439
  for child in self._children:
1764
4440
  if isinstance(child, SingleAgentWorker):
1765
4441
  cloned_worker = child.worker.clone(with_memory)
1766
4442
  new_instance.add_single_agent_worker(
1767
- child.description, cloned_worker
4443
+ child.description,
4444
+ cloned_worker,
4445
+ pool_max_size=10,
1768
4446
  )
1769
4447
  elif isinstance(child, RolePlayingWorker):
1770
4448
  new_instance.add_role_playing_worker(
1771
4449
  child.description,
1772
4450
  child.assistant_role_name,
1773
4451
  child.user_role_name,
1774
- child.chat_turn_limit,
1775
4452
  child.assistant_agent_kwargs,
1776
4453
  child.user_agent_kwargs,
1777
4454
  child.summarize_agent_kwargs,
4455
+ child.chat_turn_limit,
1778
4456
  )
1779
4457
  elif isinstance(child, Workforce):
1780
4458
  new_instance.add_workforce(child.clone(with_memory))
@@ -1868,7 +4546,7 @@ class Workforce(BaseNode):
1868
4546
  )
1869
4547
 
1870
4548
  try:
1871
- result_task = await workforce_instance.process_task(task)
4549
+ result_task = await workforce_instance.process_task_async(task)
1872
4550
  return {
1873
4551
  "status": "success",
1874
4552
  "task_id": result_task.id,
@@ -2064,6 +4742,18 @@ class Workforce(BaseNode):
2064
4742
  )
2065
4743
 
2066
4744
  agent = ChatAgent(sys_msg, **(agent_kwargs or {}))
4745
+
4746
+ # Validate agent compatibility
4747
+ try:
4748
+ workforce_instance._validate_agent_compatibility(
4749
+ agent, "Worker agent"
4750
+ )
4751
+ except ValueError as e:
4752
+ return {
4753
+ "status": "error",
4754
+ "message": str(e),
4755
+ }
4756
+
2067
4757
  workforce_instance.add_single_agent_worker(description, agent)
2068
4758
 
2069
4759
  return {