camel-ai 0.2.69a7__py3-none-any.whl → 0.2.71a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of camel-ai might be problematic. Click here for more details.

@@ -43,7 +43,12 @@ from camel.societies.workforce.utils import (
43
43
  check_if_running,
44
44
  )
45
45
  from camel.societies.workforce.worker import Worker
46
- from camel.tasks.task import Task, TaskState, validate_task_content
46
+ from camel.tasks.task import (
47
+ Task,
48
+ TaskState,
49
+ is_task_result_insufficient,
50
+ validate_task_content,
51
+ )
47
52
  from camel.toolkits import (
48
53
  CodeExecutionToolkit,
49
54
  SearchToolkit,
@@ -57,6 +62,12 @@ from .workforce_logger import WorkforceLogger
57
62
 
58
63
  logger = get_logger(__name__)
59
64
 
65
+ # Constants for configuration values
66
+ MAX_TASK_RETRIES = 3
67
+ MAX_PENDING_TASKS_LIMIT = 20
68
+ TASK_TIMEOUT_SECONDS = 180.0
69
+ DEFAULT_WORKER_POOL_SIZE = 10
70
+
60
71
 
61
72
  class WorkforceState(Enum):
62
73
  r"""Workforce execution state for human intervention support."""
@@ -111,27 +122,24 @@ class Workforce(BaseNode):
111
122
  children (Optional[List[BaseNode]], optional): List of child nodes
112
123
  under this node. Each child node can be a worker node or
113
124
  another workforce node. (default: :obj:`None`)
114
- coordinator_agent_kwargs (Optional[Dict], optional): Keyword
115
- arguments passed directly to the coordinator :obj:`ChatAgent`
116
- constructor. The coordinator manages task assignment and failure
117
- handling strategies. See :obj:`ChatAgent` documentation
118
- for all available parameters.
119
- (default: :obj:`None` - uses ModelPlatformType.DEFAULT,
120
- ModelType.DEFAULT)
121
- task_agent_kwargs (Optional[Dict], optional): Keyword arguments
122
- passed directly to the task planning :obj:`ChatAgent` constructor.
123
- The task agent handles task decomposition into subtasks and result
124
- composition. See :obj:`ChatAgent` documentation for all
125
- available parameters.
126
- (default: :obj:`None` - uses ModelPlatformType.DEFAULT,
127
- ModelType.DEFAULT)
128
- new_worker_agent_kwargs (Optional[Dict], optional): Default keyword
129
- arguments passed to :obj:`ChatAgent` constructor for workers
130
- created dynamically at runtime when existing workers cannot handle
131
- failed tasks. See :obj:`ChatAgent` documentation for all
132
- available parameters.
133
- (default: :obj:`None` - creates workers with SearchToolkit,
134
- CodeExecutionToolkit, and ThinkingToolkit)
125
+ coordinator_agent (Optional[ChatAgent], optional): A custom coordinator
126
+ agent instance for task assignment and worker creation. If
127
+ provided, the workforce will create a new agent using this agent's
128
+ model configuration but with the required system message and
129
+ functionality.
130
+ If None, a default agent will be created using DEFAULT model
131
+ settings. (default: :obj:`None`)
132
+ task_agent (Optional[ChatAgent], optional): A custom task planning
133
+ agent instance for task decomposition and composition. If
134
+ provided, the workforce will create a new agent using this agent's
135
+ model configuration but with the required system message and tools
136
+ (TaskPlanningToolkit). If None, a default agent will be created
137
+ using DEFAULT model settings. (default: :obj:`None`)
138
+ new_worker_agent (Optional[ChatAgent], optional): A template agent for
139
+ workers created dynamically at runtime when existing workers cannot
140
+ handle failed tasks. If None, workers will be created with default
141
+ settings including SearchToolkit, CodeExecutionToolkit, and
142
+ ThinkingToolkit. (default: :obj:`None`)
135
143
  graceful_shutdown_timeout (float, optional): The timeout in seconds
136
144
  for graceful shutdown when a task fails 3 times. During this
137
145
  period, the workforce remains active for debugging.
@@ -147,40 +155,59 @@ class Workforce(BaseNode):
147
155
  (default: :obj:`False`)
148
156
 
149
157
  Example:
150
- >>> # Configure with custom model and shared memory
151
158
  >>> import asyncio
159
+ >>> from camel.agents import ChatAgent
160
+ >>> from camel.models import ModelFactory
161
+ >>> from camel.types import ModelPlatformType, ModelType
162
+ >>> from camel.tasks import Task
163
+ >>>
164
+ >>> # Simple workforce with default agents
165
+ >>> workforce = Workforce("Research Team")
166
+ >>>
167
+ >>> # Workforce with custom model configuration
152
168
  >>> model = ModelFactory.create(
153
- ... ModelPlatformType.OPENAI, ModelType.GPT_4O
169
+ ... ModelPlatformType.OPENAI, model_type=ModelType.GPT_4O
154
170
  ... )
171
+ >>> coordinator_agent = ChatAgent(model=model)
172
+ >>> task_agent = ChatAgent(model=model)
173
+ >>>
155
174
  >>> workforce = Workforce(
156
175
  ... "Research Team",
157
- ... coordinator_agent_kwargs={"model": model, "token_limit": 4000},
158
- ... task_agent_kwargs={"model": model, "token_limit": 8000},
159
- ... share_memory=True # Enable shared memory
176
+ ... coordinator_agent=coordinator_agent,
177
+ ... task_agent=task_agent,
160
178
  ... )
161
179
  >>>
162
180
  >>> # Process a task
163
181
  >>> async def main():
164
182
  ... task = Task(content="Research AI trends", id="1")
165
- ... result = workforce.process_task(task)
183
+ ... result = await workforce.process_task_async(task)
166
184
  ... return result
167
- >>> asyncio.run(main())
185
+ >>>
186
+ >>> result_task = asyncio.run(main())
187
+
188
+ Note:
189
+ When custom coordinator_agent or task_agent are provided, the workforce
190
+ will preserve the user's system message and append the required
191
+ workforce coordination or task planning instructions to it. This
192
+ ensures both the user's intent is preserved and proper workforce
193
+ functionality is maintained. All other agent configurations (model,
194
+ memory, tools, etc.) will also be preserved.
168
195
  """
169
196
 
170
197
  def __init__(
171
198
  self,
172
199
  description: str,
173
200
  children: Optional[List[BaseNode]] = None,
174
- coordinator_agent_kwargs: Optional[Dict] = None,
175
- task_agent_kwargs: Optional[Dict] = None,
176
- new_worker_agent_kwargs: Optional[Dict] = None,
201
+ coordinator_agent: Optional[ChatAgent] = None,
202
+ task_agent: Optional[ChatAgent] = None,
203
+ new_worker_agent: Optional[ChatAgent] = None, # TODO: use MCP Agent
177
204
  graceful_shutdown_timeout: float = 15.0,
178
205
  share_memory: bool = False,
179
206
  ) -> None:
180
207
  super().__init__(description)
181
208
  self._child_listening_tasks: Deque[asyncio.Task] = deque()
182
209
  self._children = children or []
183
- self.new_worker_agent_kwargs = new_worker_agent_kwargs
210
+ self.new_worker_agent = new_worker_agent
184
211
  self.graceful_shutdown_timeout = graceful_shutdown_timeout
185
212
  self.share_memory = share_memory
186
213
  self.metrics_logger = WorkforceLogger(workforce_id=self.node_id)
@@ -200,6 +227,7 @@ class Workforce(BaseNode):
200
227
  self._completed_tasks: List[Task] = []
201
228
  self._loop: Optional[asyncio.AbstractEventLoop] = None
202
229
  self._main_task_future: Optional[asyncio.Future] = None
230
+ self._cleanup_task: Optional[asyncio.Task] = None
203
231
  # Snapshot throttle support
204
232
  self._last_snapshot_time: float = 0.0
205
233
  # Minimum seconds between automatic snapshots
@@ -214,58 +242,72 @@ class Workforce(BaseNode):
214
242
  role=role_or_desc,
215
243
  )
216
244
 
217
- # Warning messages for default model usage
218
- if coordinator_agent_kwargs is None:
219
- logger.warning(
220
- "No coordinator_agent_kwargs provided. Using default "
221
- "ChatAgent settings (ModelPlatformType.DEFAULT, "
222
- "ModelType.DEFAULT). To customize the coordinator agent "
223
- "that assigns tasks and handles failures, pass a dictionary "
224
- "with ChatAgent parameters, e.g.: {'model': your_model, "
225
- "'tools': your_tools, 'token_limit': 8000}. See ChatAgent "
226
- "documentation for all available options."
227
- )
228
- if task_agent_kwargs is None:
229
- logger.warning(
230
- "No task_agent_kwargs provided. Using default ChatAgent "
231
- "settings (ModelPlatformType.DEFAULT, ModelType.DEFAULT). "
232
- "To customize the task planning agent that "
233
- "decomposes/composes tasks, pass a dictionary with "
234
- "ChatAgent parameters, e.g.: {'model': your_model, "
235
- "'token_limit': 16000}. See ChatAgent documentation for "
236
- "all available options."
237
- )
238
- if new_worker_agent_kwargs is None:
239
- logger.warning(
240
- "No new_worker_agent_kwargs provided. Workers created at "
241
- "runtime will use default ChatAgent settings with "
242
- "SearchToolkit, CodeExecutionToolkit, and ThinkingToolkit. "
243
- "To customize runtime worker creation, pass a dictionary "
244
- "with ChatAgent parameters, e.g.: {'model': your_model, "
245
- "'tools': your_tools}. See ChatAgent documentation for all "
246
- "available options."
247
- )
248
-
249
- if self.share_memory:
250
- logger.info(
251
- "Shared memory enabled. All agents will share their complete "
252
- "conversation history and function-calling trajectory for "
253
- "better context continuity during task handoffs."
254
- )
255
-
245
+ # Set up coordinator agent with default system message
256
246
  coord_agent_sys_msg = BaseMessage.make_assistant_message(
257
247
  role_name="Workforce Manager",
258
- content="You are coordinating a group of workers. A worker can be "
259
- "a group of agents or a single agent. Each worker is "
248
+ content="You are coordinating a group of workers. A worker "
249
+ "can be a group of agents or a single agent. Each worker is "
260
250
  "created to solve a specific kind of task. Your job "
261
251
  "includes assigning tasks to a existing worker, creating "
262
252
  "a new worker for a task, etc.",
263
253
  )
264
- self.coordinator_agent = ChatAgent(
265
- coord_agent_sys_msg,
266
- **(coordinator_agent_kwargs or {}),
267
- )
268
254
 
255
+ if coordinator_agent is None:
256
+ logger.warning(
257
+ "No coordinator_agent provided. Using default "
258
+ "ChatAgent settings (ModelPlatformType.DEFAULT, "
259
+ "ModelType.DEFAULT) with default system message."
260
+ )
261
+ self.coordinator_agent = ChatAgent(coord_agent_sys_msg)
262
+ else:
263
+ logger.info(
264
+ "Custom coordinator_agent provided. Preserving user's "
265
+ "system message and appending workforce coordination "
266
+ "instructions to ensure proper functionality."
267
+ )
268
+
269
+ if coordinator_agent.system_message is not None:
270
+ user_sys_msg_content = coordinator_agent.system_message.content
271
+ combined_content = (
272
+ f"{user_sys_msg_content}\n\n"
273
+ f"{coord_agent_sys_msg.content}"
274
+ )
275
+ combined_sys_msg = BaseMessage.make_assistant_message(
276
+ role_name=coordinator_agent.system_message.role_name,
277
+ content=combined_content,
278
+ )
279
+ else:
280
+ combined_sys_msg = coord_agent_sys_msg
281
+
282
+ # Create a new agent with the provided agent's configuration
283
+ # but with the combined system message
284
+ self.coordinator_agent = ChatAgent(
285
+ system_message=combined_sys_msg,
286
+ model=coordinator_agent.model_backend,
287
+ memory=coordinator_agent.memory,
288
+ message_window_size=getattr(
289
+ coordinator_agent.memory, "window_size", None
290
+ ),
291
+ token_limit=getattr(
292
+ coordinator_agent.memory.get_context_creator(),
293
+ "token_limit",
294
+ None,
295
+ ),
296
+ output_language=coordinator_agent.output_language,
297
+ tools=[
298
+ tool.func
299
+ for tool in coordinator_agent._internal_tools.values()
300
+ ],
301
+ external_tools=[
302
+ schema
303
+ for schema in coordinator_agent._external_tool_schemas.values() # noqa: E501
304
+ ],
305
+ response_terminators=coordinator_agent.response_terminators,
306
+ max_iteration=coordinator_agent.max_iteration,
307
+ stop_event=coordinator_agent.stop_event,
308
+ )
309
+
310
+ # Set up task agent with default system message and required tools
269
311
  task_sys_msg = BaseMessage.make_assistant_message(
270
312
  role_name="Task Planner",
271
313
  content="You are going to compose and decompose tasks. Keep "
@@ -275,13 +317,83 @@ class Workforce(BaseNode):
275
317
  "of agents. This ensures efficient execution by minimizing "
276
318
  "context switching between agents.",
277
319
  )
278
- _task_agent_kwargs = dict(task_agent_kwargs or {})
279
- extra_tools = TaskPlanningToolkit().get_tools()
280
- _task_agent_kwargs["tools"] = [
281
- *_task_agent_kwargs.get("tools", []),
282
- *extra_tools,
283
- ]
284
- self.task_agent = ChatAgent(task_sys_msg, **_task_agent_kwargs)
320
+ task_planning_tools = TaskPlanningToolkit().get_tools()
321
+
322
+ if task_agent is None:
323
+ logger.warning(
324
+ "No task_agent provided. Using default ChatAgent "
325
+ "settings (ModelPlatformType.DEFAULT, ModelType.DEFAULT) "
326
+ "with default system message and TaskPlanningToolkit."
327
+ )
328
+ self.task_agent = ChatAgent(
329
+ task_sys_msg,
330
+ tools=TaskPlanningToolkit().get_tools(), # type: ignore[arg-type]
331
+ )
332
+ else:
333
+ logger.info(
334
+ "Custom task_agent provided. Preserving user's "
335
+ "system message and appending task planning "
336
+ "instructions to ensure proper functionality."
337
+ )
338
+
339
+ if task_agent.system_message is not None:
340
+ user_task_sys_msg_content = task_agent.system_message.content
341
+ combined_task_content = (
342
+ f"{user_task_sys_msg_content}\n\n"
343
+ f"{task_sys_msg.content}"
344
+ )
345
+ combined_task_sys_msg = BaseMessage.make_assistant_message(
346
+ role_name=task_agent.system_message.role_name,
347
+ content=combined_task_content,
348
+ )
349
+ else:
350
+ combined_task_sys_msg = task_sys_msg
351
+
352
+ # Since ChatAgent constructor uses a dictionary with
353
+ # function names as keys, we don't need to manually deduplicate.
354
+ combined_tools = [
355
+ tool.func for tool in task_agent._internal_tools.values()
356
+ ] + [tool.func for tool in task_planning_tools]
357
+
358
+ # Create a new agent with the provided agent's configuration
359
+ # but with the combined system message and tools
360
+ self.task_agent = ChatAgent(
361
+ system_message=combined_task_sys_msg,
362
+ model=task_agent.model_backend,
363
+ memory=task_agent.memory,
364
+ message_window_size=getattr(
365
+ task_agent.memory, "window_size", None
366
+ ),
367
+ token_limit=getattr(
368
+ task_agent.memory.get_context_creator(),
369
+ "token_limit",
370
+ None,
371
+ ),
372
+ output_language=task_agent.output_language,
373
+ tools=combined_tools,
374
+ external_tools=[
375
+ schema
376
+ for schema in task_agent._external_tool_schemas.values()
377
+ ],
378
+ response_terminators=task_agent.response_terminators,
379
+ max_iteration=task_agent.max_iteration,
380
+ stop_event=task_agent.stop_event,
381
+ )
382
+
383
+ if new_worker_agent is None:
384
+ logger.info(
385
+ "No new_worker_agent provided. Workers created at runtime "
386
+ "will use default ChatAgent settings with SearchToolkit, "
387
+ "CodeExecutionToolkit, and ThinkingToolkit. To customize "
388
+ "runtime worker creation, pass a ChatAgent instance."
389
+ )
390
+
391
+ if self.share_memory:
392
+ logger.info(
393
+ "Shared memory enabled. All agents will share their complete "
394
+ "conversation history and function-calling trajectory for "
395
+ "better context continuity during task handoffs."
396
+ )
285
397
 
286
398
  def __repr__(self):
287
399
  return (
@@ -417,6 +529,15 @@ class Workforce(BaseNode):
417
529
  except Exception as e:
418
530
  logger.warning(f"Error synchronizing shared memory: {e}")
419
531
 
532
+ def _cleanup_task_tracking(self, task_id: str) -> None:
533
+ r"""Clean up tracking data for a task to prevent memory leaks.
534
+
535
+ Args:
536
+ task_id (str): The ID of the task to clean up.
537
+ """
538
+ if task_id in self._task_start_times:
539
+ del self._task_start_times[task_id]
540
+
420
541
  def _decompose_task(self, task: Task) -> List[Task]:
421
542
  r"""Decompose the task into subtasks. This method will also set the
422
543
  relationship between the task and its subtasks.
@@ -1004,7 +1125,7 @@ class Workforce(BaseNode):
1004
1125
  self,
1005
1126
  description: str,
1006
1127
  worker: ChatAgent,
1007
- pool_max_size: int = 10,
1128
+ pool_max_size: int = DEFAULT_WORKER_POOL_SIZE,
1008
1129
  ) -> Workforce:
1009
1130
  r"""Add a worker node to the workforce that uses a single agent.
1010
1131
 
@@ -1133,7 +1254,7 @@ class Workforce(BaseNode):
1133
1254
  except RuntimeError:
1134
1255
  asyncio.run(self._async_reset())
1135
1256
 
1136
- if hasattr(self, 'logger') and self.metrics_logger is not None:
1257
+ if hasattr(self, 'metrics_logger') and self.metrics_logger is not None:
1137
1258
  self.metrics_logger.reset_task_data()
1138
1259
  else:
1139
1260
  self.metrics_logger = WorkforceLogger(workforce_id=self.node_id)
@@ -1225,8 +1346,16 @@ class Workforce(BaseNode):
1225
1346
  )
1226
1347
  return TaskAssignResult(assignments=[])
1227
1348
 
1228
- result_dict = json.loads(response.msg.content, parse_int=str)
1229
- return TaskAssignResult(**result_dict)
1349
+ try:
1350
+ result_dict = json.loads(response.msg.content, parse_int=str)
1351
+ return TaskAssignResult(**result_dict)
1352
+ except json.JSONDecodeError as e:
1353
+ logger.error(
1354
+ f"JSON parsing error in task assignment: Invalid response "
1355
+ f"format - {e}. Response content: "
1356
+ f"{response.msg.content[:50]}..."
1357
+ )
1358
+ return TaskAssignResult(assignments=[])
1230
1359
 
1231
1360
  def _validate_assignments(
1232
1361
  self, assignments: List[TaskAssignment], valid_ids: Set[str]
@@ -1408,12 +1537,26 @@ class Workforce(BaseNode):
1408
1537
  # Record the start time when a task is posted
1409
1538
  self._task_start_times[task.id] = time.time()
1410
1539
 
1540
+ task.assigned_worker_id = assignee_id
1541
+
1411
1542
  if self.metrics_logger:
1412
1543
  self.metrics_logger.log_task_started(
1413
1544
  task_id=task.id, worker_id=assignee_id
1414
1545
  )
1415
- self._in_flight_tasks += 1
1416
- await self._channel.post_task(task, self.node_id, assignee_id)
1546
+
1547
+ try:
1548
+ self._in_flight_tasks += 1
1549
+ await self._channel.post_task(task, self.node_id, assignee_id)
1550
+ logger.debug(
1551
+ f"Posted task {task.id} to {assignee_id}. "
1552
+ f"In-flight tasks: {self._in_flight_tasks}"
1553
+ )
1554
+ except Exception as e:
1555
+ # Decrement counter if posting failed
1556
+ self._in_flight_tasks -= 1
1557
+ logger.error(
1558
+ f"Failed to post task {task.id} to {assignee_id}: {e}"
1559
+ )
1417
1560
 
1418
1561
  async def _post_dependency(self, dependency: Task) -> None:
1419
1562
  await self._channel.post_dependency(dependency, self.node_id)
@@ -1450,8 +1593,19 @@ class Workforce(BaseNode):
1450
1593
  "with various tasks.",
1451
1594
  )
1452
1595
  else:
1453
- result_dict = json.loads(response.msg.content)
1454
- new_node_conf = WorkerConf(**result_dict)
1596
+ try:
1597
+ result_dict = json.loads(response.msg.content)
1598
+ new_node_conf = WorkerConf(**result_dict)
1599
+ except json.JSONDecodeError as e:
1600
+ logger.error(
1601
+ f"JSON parsing error in worker creation: Invalid response "
1602
+ f"format - {e}. Response content: "
1603
+ f"{response.msg.content[:100]}..."
1604
+ )
1605
+ raise RuntimeError(
1606
+ f"Failed to create worker for task {task.id}: "
1607
+ f"Coordinator agent returned malformed JSON response. "
1608
+ )
1455
1609
 
1456
1610
  new_agent = self._create_new_agent(
1457
1611
  new_node_conf.role,
@@ -1461,7 +1615,7 @@ class Workforce(BaseNode):
1461
1615
  new_node = SingleAgentWorker(
1462
1616
  description=new_node_conf.description,
1463
1617
  worker=new_agent,
1464
- pool_max_size=10, # TODO: make this configurable
1618
+ pool_max_size=DEFAULT_WORKER_POOL_SIZE,
1465
1619
  )
1466
1620
  new_node.set_channel(self._channel)
1467
1621
 
@@ -1486,25 +1640,25 @@ class Workforce(BaseNode):
1486
1640
  content=sys_msg,
1487
1641
  )
1488
1642
 
1489
- if self.new_worker_agent_kwargs is not None:
1490
- return ChatAgent(worker_sys_msg, **self.new_worker_agent_kwargs)
1491
-
1492
- # Default tools for a new agent
1493
- function_list = [
1494
- SearchToolkit().search_duckduckgo,
1495
- *CodeExecutionToolkit().get_tools(),
1496
- *ThinkingToolkit().get_tools(),
1497
- ]
1643
+ if self.new_worker_agent is not None:
1644
+ return self.new_worker_agent
1645
+ else:
1646
+ # Default tools for a new agent
1647
+ function_list = [
1648
+ SearchToolkit().search_duckduckgo,
1649
+ *CodeExecutionToolkit().get_tools(),
1650
+ *ThinkingToolkit().get_tools(),
1651
+ ]
1498
1652
 
1499
- model = ModelFactory.create(
1500
- model_platform=ModelPlatformType.DEFAULT,
1501
- model_type=ModelType.DEFAULT,
1502
- model_config_dict={"temperature": 0},
1503
- )
1653
+ model = ModelFactory.create(
1654
+ model_platform=ModelPlatformType.DEFAULT,
1655
+ model_type=ModelType.DEFAULT,
1656
+ model_config_dict={"temperature": 0},
1657
+ )
1504
1658
 
1505
- return ChatAgent(worker_sys_msg, model=model, tools=function_list) # type: ignore[arg-type]
1659
+ return ChatAgent(worker_sys_msg, model=model, tools=function_list) # type: ignore[arg-type]
1506
1660
 
1507
- async def _get_returned_task(self) -> Task:
1661
+ async def _get_returned_task(self) -> Optional[Task]:
1508
1662
  r"""Get the task that's published by this node and just get returned
1509
1663
  from the assignee. Includes timeout handling to prevent indefinite
1510
1664
  waiting.
@@ -1513,17 +1667,28 @@ class Workforce(BaseNode):
1513
1667
  # Add timeout to prevent indefinite waiting
1514
1668
  return await asyncio.wait_for(
1515
1669
  self._channel.get_returned_task_by_publisher(self.node_id),
1516
- timeout=180.0, # 3 minute timeout
1670
+ timeout=TASK_TIMEOUT_SECONDS,
1517
1671
  )
1518
- except asyncio.TimeoutError:
1519
- logger.warning(
1520
- f"Timeout waiting for returned task in "
1672
+ except Exception as e:
1673
+ # Decrement in-flight counter to prevent hanging
1674
+ if self._in_flight_tasks > 0:
1675
+ self._in_flight_tasks -= 1
1676
+
1677
+ error_msg = (
1678
+ f"Error getting returned task {e} in "
1521
1679
  f"workforce {self.node_id}. "
1522
- f"This may indicate an issue with async tool execution. "
1523
1680
  f"Current pending tasks: {len(self._pending_tasks)}, "
1524
1681
  f"In-flight tasks: {self._in_flight_tasks}"
1525
1682
  )
1526
- raise
1683
+ logger.warning(error_msg)
1684
+
1685
+ if self._pending_tasks and self._assignees:
1686
+ for task in self._pending_tasks:
1687
+ if task.id in self._assignees:
1688
+ # Mark this real task as failed
1689
+ task.set_state(TaskState.FAILED)
1690
+ return task
1691
+ return None
1527
1692
 
1528
1693
  async def _post_ready_tasks(self) -> None:
1529
1694
  r"""Checks for unassigned tasks, assigns them, and then posts any
@@ -1563,6 +1728,9 @@ class Workforce(BaseNode):
1563
1728
  # Step 2: Iterate through all pending tasks and post those that are
1564
1729
  # ready
1565
1730
  posted_tasks = []
1731
+ # Pre-compute completed task IDs set for O(1) lookups
1732
+ completed_task_ids = {t.id for t in self._completed_tasks}
1733
+
1566
1734
  for task in self._pending_tasks:
1567
1735
  # A task must be assigned to be considered for posting
1568
1736
  if task.id in self._task_dependencies:
@@ -1570,8 +1738,7 @@ class Workforce(BaseNode):
1570
1738
  # Check if all dependencies for this task are in the completed
1571
1739
  # set
1572
1740
  if all(
1573
- dep_id in {t.id for t in self._completed_tasks}
1574
- for dep_id in dependencies
1741
+ dep_id in completed_task_ids for dep_id in dependencies
1575
1742
  ):
1576
1743
  assignee_id = self._assignees[task.id]
1577
1744
  logger.debug(
@@ -1593,17 +1760,67 @@ class Workforce(BaseNode):
1593
1760
  async def _handle_failed_task(self, task: Task) -> bool:
1594
1761
  task.failure_count += 1
1595
1762
 
1763
+ # Determine detailed failure information
1764
+ if is_task_result_insufficient(task):
1765
+ failure_reason = "Worker returned unhelpful "
1766
+ f"response: {task.result[:100] if task.result else ''}..."
1767
+ else:
1768
+ failure_reason = "Task marked as failed despite "
1769
+ f"having result: {(task.result or '')[:100]}..."
1770
+
1771
+ # Add context about the worker and task
1772
+ worker_id = task.assigned_worker_id or "unknown"
1773
+ worker_info = f" (assigned to worker: {worker_id})"
1774
+
1775
+ detailed_error = f"{failure_reason}{worker_info}"
1776
+
1777
+ logger.error(
1778
+ f"Task {task.id} failed (attempt "
1779
+ f"{task.failure_count}/3): {detailed_error}"
1780
+ )
1781
+
1596
1782
  if self.metrics_logger:
1597
- worker_id = self._assignees.get(task.id)
1598
1783
  self.metrics_logger.log_task_failed(
1599
1784
  task_id=task.id,
1600
1785
  worker_id=worker_id,
1601
- error_message=task.result or "Task execution failed",
1786
+ error_message=detailed_error,
1602
1787
  error_type="TaskFailure",
1603
- metadata={'failure_count': task.failure_count},
1788
+ metadata={
1789
+ 'failure_count': task.failure_count,
1790
+ 'task_content': task.content,
1791
+ 'result_length': len(task.result) if task.result else 0,
1792
+ },
1604
1793
  )
1605
1794
 
1606
- if task.failure_count > 3:
1795
+ # Check for immediate halt conditions - return immediately if we
1796
+ # should halt
1797
+ if task.failure_count >= MAX_TASK_RETRIES:
1798
+ logger.error(
1799
+ f"Task {task.id} has exceeded maximum retry attempts "
1800
+ f"({MAX_TASK_RETRIES}). Final failure "
1801
+ f"reason: {detailed_error}. "
1802
+ f"Task content: '{task.content[:100]}...'"
1803
+ )
1804
+ self._cleanup_task_tracking(task.id)
1805
+ # Mark task as completed for dependency tracking before halting
1806
+ self._completed_tasks.append(task)
1807
+ if task.id in self._assignees:
1808
+ await self._channel.archive_task(task.id)
1809
+ return True
1810
+
1811
+ # If too many tasks are failing rapidly, also halt to prevent infinite
1812
+ # loops
1813
+ if len(self._pending_tasks) > MAX_PENDING_TASKS_LIMIT:
1814
+ logger.error(
1815
+ f"Too many pending tasks ({len(self._pending_tasks)} > "
1816
+ f"{MAX_PENDING_TASKS_LIMIT}). Halting to prevent task "
1817
+ f"explosion. Last failed task: {task.id}"
1818
+ )
1819
+ self._cleanup_task_tracking(task.id)
1820
+ # Mark task as completed for dependency tracking before halting
1821
+ self._completed_tasks.append(task)
1822
+ if task.id in self._assignees:
1823
+ await self._channel.archive_task(task.id)
1607
1824
  return True
1608
1825
 
1609
1826
  if task.get_depth() > 3:
@@ -1658,8 +1875,6 @@ class Workforce(BaseNode):
1658
1875
  # Mark task as completed for dependency tracking
1659
1876
  self._completed_tasks.append(task)
1660
1877
 
1661
- # Post next ready tasks
1662
-
1663
1878
  # Sync shared memory after task completion to share knowledge
1664
1879
  if self.share_memory:
1665
1880
  logger.info(
@@ -1673,7 +1888,7 @@ class Workforce(BaseNode):
1673
1888
 
1674
1889
  async def _handle_completed_task(self, task: Task) -> None:
1675
1890
  if self.metrics_logger:
1676
- worker_id = self._assignees.get(task.id, "unknown")
1891
+ worker_id = task.assigned_worker_id or "unknown"
1677
1892
  processing_time_seconds = None
1678
1893
  token_usage = None
1679
1894
 
@@ -1682,7 +1897,7 @@ class Workforce(BaseNode):
1682
1897
  processing_time_seconds = (
1683
1898
  time.time() - self._task_start_times[task.id]
1684
1899
  )
1685
- del self._task_start_times[task.id] # Prevent memory leaks
1900
+ self._cleanup_task_tracking(task.id)
1686
1901
  elif (
1687
1902
  task.additional_info is not None
1688
1903
  and 'processing_time_seconds' in task.additional_info
@@ -1876,8 +2091,19 @@ class Workforce(BaseNode):
1876
2091
  )
1877
2092
  self._last_snapshot_time = time.time()
1878
2093
 
1879
- # Get returned task (this may block until a task is returned)
2094
+ # Get returned task
1880
2095
  returned_task = await self._get_returned_task()
2096
+
2097
+ # If no task was returned, continue
2098
+ if returned_task is None:
2099
+ logger.debug(
2100
+ f"No task returned in workforce {self.node_id}. "
2101
+ f"Pending: {len(self._pending_tasks)}, "
2102
+ f"In-flight: {self._in_flight_tasks}"
2103
+ )
2104
+ await self._post_ready_tasks()
2105
+ continue
2106
+
1881
2107
  self._in_flight_tasks -= 1
1882
2108
 
1883
2109
  # Check for stop request after getting task
@@ -1887,22 +2113,72 @@ class Workforce(BaseNode):
1887
2113
 
1888
2114
  # Process the returned task based on its state
1889
2115
  if returned_task.state == TaskState.DONE:
1890
- print(
1891
- f"{Fore.CYAN}🎯 Task {returned_task.id} completed "
1892
- f"successfully.{Fore.RESET}"
1893
- )
1894
- await self._handle_completed_task(returned_task)
2116
+ # Check if the "completed" task actually failed to provide
2117
+ # useful results
2118
+ if is_task_result_insufficient(returned_task):
2119
+ result_preview = (
2120
+ returned_task.result[:100] + "..."
2121
+ if returned_task.result
2122
+ else "No result"
2123
+ )
2124
+ logger.warning(
2125
+ f"Task {returned_task.id} marked as DONE but "
2126
+ f"result is insufficient. "
2127
+ f"Treating as failed. Result: '{result_preview}'"
2128
+ )
2129
+ returned_task.state = TaskState.FAILED
2130
+ try:
2131
+ halt = await self._handle_failed_task(
2132
+ returned_task
2133
+ )
2134
+ if not halt:
2135
+ continue
2136
+ print(
2137
+ f"{Fore.RED}Task {returned_task.id} has "
2138
+ f"failed for {MAX_TASK_RETRIES} times after "
2139
+ f"insufficient results, halting the "
2140
+ f"workforce. Final error: "
2141
+ f"{returned_task.result or 'Unknown error'}"
2142
+ f"{Fore.RESET}"
2143
+ )
2144
+ await self._graceful_shutdown(returned_task)
2145
+ break
2146
+ except Exception as e:
2147
+ logger.error(
2148
+ f"Error handling insufficient task result "
2149
+ f"{returned_task.id}: {e}",
2150
+ exc_info=True,
2151
+ )
2152
+ continue
2153
+ else:
2154
+ print(
2155
+ f"{Fore.CYAN}🎯 Task {returned_task.id} completed "
2156
+ f"successfully.{Fore.RESET}"
2157
+ )
2158
+ await self._handle_completed_task(returned_task)
1895
2159
  elif returned_task.state == TaskState.FAILED:
1896
- halt = await self._handle_failed_task(returned_task)
1897
- if not halt:
2160
+ try:
2161
+ halt = await self._handle_failed_task(returned_task)
2162
+ if not halt:
2163
+ continue
2164
+ print(
2165
+ f"{Fore.RED}Task {returned_task.id} has failed "
2166
+ f"for {MAX_TASK_RETRIES} times, halting "
2167
+ f"the workforce. Final error: "
2168
+ f"{returned_task.result or 'Unknown error'}"
2169
+ f"{Fore.RESET}"
2170
+ )
2171
+ # Graceful shutdown instead of immediate break
2172
+ await self._graceful_shutdown(returned_task)
2173
+ break
2174
+ except Exception as e:
2175
+ logger.error(
2176
+ f"Error handling failed task "
2177
+ f"{returned_task.id}: {e}",
2178
+ exc_info=True,
2179
+ )
2180
+ # Continue to prevent hanging
1898
2181
  continue
1899
- print(
1900
- f"{Fore.RED}Task {returned_task.id} has failed "
1901
- f"for 3 times, halting the workforce.{Fore.RESET}"
1902
- )
1903
- # Graceful shutdown instead of immediate break
1904
- await self._graceful_shutdown(returned_task)
1905
- break
1906
2182
  elif returned_task.state == TaskState.OPEN:
1907
2183
  # TODO: Add logic for OPEN
1908
2184
  pass
@@ -1912,7 +2188,18 @@ class Workforce(BaseNode):
1912
2188
  )
1913
2189
 
1914
2190
  except Exception as e:
1915
- logger.error(f"Error processing task: {e}")
2191
+ # Decrement in-flight counter to prevent hanging
2192
+ if self._in_flight_tasks > 0:
2193
+ self._in_flight_tasks -= 1
2194
+
2195
+ logger.error(
2196
+ f"Error processing task in workforce {self.node_id}: {e}"
2197
+ f"Workforce state - Pending tasks: "
2198
+ f"{len(self._pending_tasks)}, "
2199
+ f"In-flight tasks: {self._in_flight_tasks}, "
2200
+ f"Completed tasks: {len(self._completed_tasks)}"
2201
+ )
2202
+
1916
2203
  if self._stop_requested:
1917
2204
  break
1918
2205
  # Continue with next iteration unless stop is requested
@@ -1966,11 +2253,38 @@ class Workforce(BaseNode):
1966
2253
  r"""Stop all the child nodes under it. The node itself will be stopped
1967
2254
  by its parent node.
1968
2255
  """
2256
+ # Stop all child nodes first
1969
2257
  for child in self._children:
1970
2258
  if child._running:
1971
2259
  child.stop()
1972
- for child_task in self._child_listening_tasks:
1973
- child_task.cancel()
2260
+
2261
+ # Cancel child listening tasks
2262
+ if self._child_listening_tasks:
2263
+ try:
2264
+ loop = asyncio.get_running_loop()
2265
+ if loop and not loop.is_closed():
2266
+ # Create graceful cleanup task
2267
+ async def cleanup():
2268
+ await asyncio.sleep(0.1) # Brief grace period
2269
+ for task in self._child_listening_tasks:
2270
+ if not task.done():
2271
+ task.cancel()
2272
+ await asyncio.gather(
2273
+ *self._child_listening_tasks,
2274
+ return_exceptions=True,
2275
+ )
2276
+
2277
+ self._cleanup_task = loop.create_task(cleanup())
2278
+ else:
2279
+ # No active loop, cancel immediately
2280
+ for task in self._child_listening_tasks:
2281
+ task.cancel()
2282
+ except (RuntimeError, Exception) as e:
2283
+ # Fallback: cancel immediately
2284
+ logger.debug(f"Exception during task cleanup: {e}")
2285
+ for task in self._child_listening_tasks:
2286
+ task.cancel()
2287
+
1974
2288
  self._running = False
1975
2289
 
1976
2290
  def clone(self, with_memory: bool = False) -> 'Workforce':
@@ -1988,28 +2302,17 @@ class Workforce(BaseNode):
1988
2302
  """
1989
2303
 
1990
2304
  # Create a new instance with the same configuration
1991
- # Extract the original kwargs from the agents to properly clone them
1992
- coordinator_kwargs = (
1993
- getattr(self.coordinator_agent, 'init_kwargs', {}) or {}
1994
- )
1995
- task_kwargs = getattr(self.task_agent, 'init_kwargs', {}) or {}
1996
-
1997
2305
  new_instance = Workforce(
1998
2306
  description=self.description,
1999
- coordinator_agent_kwargs=coordinator_kwargs.copy(),
2000
- task_agent_kwargs=task_kwargs.copy(),
2001
- new_worker_agent_kwargs=self.new_worker_agent_kwargs.copy()
2002
- if self.new_worker_agent_kwargs
2307
+ coordinator_agent=self.coordinator_agent.clone(with_memory),
2308
+ task_agent=self.task_agent.clone(with_memory),
2309
+ new_worker_agent=self.new_worker_agent.clone(with_memory)
2310
+ if self.new_worker_agent
2003
2311
  else None,
2004
2312
  graceful_shutdown_timeout=self.graceful_shutdown_timeout,
2005
2313
  share_memory=self.share_memory,
2006
2314
  )
2007
2315
 
2008
- new_instance.task_agent = self.task_agent.clone(with_memory)
2009
- new_instance.coordinator_agent = self.coordinator_agent.clone(
2010
- with_memory
2011
- )
2012
-
2013
2316
  for child in self._children:
2014
2317
  if isinstance(child, SingleAgentWorker):
2015
2318
  cloned_worker = child.worker.clone(with_memory)