camel-ai 0.2.71a3__py3-none-any.whl → 0.2.71a5__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of camel-ai might be problematic. Click here for more details.

Files changed (39)
  1. camel/__init__.py +1 -1
  2. camel/agents/chat_agent.py +1482 -134
  3. camel/agents/repo_agent.py +2 -1
  4. camel/benchmarks/browsecomp.py +6 -6
  5. camel/interpreters/docker_interpreter.py +3 -2
  6. camel/loaders/base_loader.py +85 -0
  7. camel/logger.py +1 -1
  8. camel/messages/base.py +12 -1
  9. camel/models/azure_openai_model.py +96 -7
  10. camel/models/base_model.py +68 -10
  11. camel/models/deepseek_model.py +5 -0
  12. camel/models/gemini_model.py +5 -0
  13. camel/models/litellm_model.py +48 -16
  14. camel/models/model_manager.py +24 -6
  15. camel/models/openai_compatible_model.py +109 -5
  16. camel/models/openai_model.py +117 -8
  17. camel/societies/workforce/prompts.py +68 -5
  18. camel/societies/workforce/role_playing_worker.py +1 -0
  19. camel/societies/workforce/single_agent_worker.py +1 -0
  20. camel/societies/workforce/utils.py +67 -2
  21. camel/societies/workforce/workforce.py +412 -67
  22. camel/societies/workforce/workforce_logger.py +0 -8
  23. camel/tasks/task.py +2 -0
  24. camel/toolkits/__init__.py +7 -2
  25. camel/toolkits/craw4ai_toolkit.py +2 -2
  26. camel/toolkits/file_write_toolkit.py +526 -121
  27. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit.py +9 -3
  28. camel/toolkits/hybrid_browser_toolkit/unified_analyzer.js +31 -8
  29. camel/toolkits/message_agent_toolkit.py +608 -0
  30. camel/toolkits/note_taking_toolkit.py +90 -0
  31. camel/toolkits/openai_image_toolkit.py +292 -0
  32. camel/toolkits/slack_toolkit.py +4 -4
  33. camel/toolkits/terminal_toolkit.py +223 -73
  34. camel/utils/mcp_client.py +37 -1
  35. {camel_ai-0.2.71a3.dist-info → camel_ai-0.2.71a5.dist-info}/METADATA +48 -7
  36. {camel_ai-0.2.71a3.dist-info → camel_ai-0.2.71a5.dist-info}/RECORD +38 -35
  37. camel/toolkits/dalle_toolkit.py +0 -175
  38. {camel_ai-0.2.71a3.dist-info → camel_ai-0.2.71a5.dist-info}/WHEEL +0 -0
  39. {camel_ai-0.2.71a3.dist-info → camel_ai-0.2.71a5.dist-info}/licenses/LICENSE +0 -0
@@ -14,12 +14,23 @@
14
14
  from __future__ import annotations
15
15
 
16
16
  import asyncio
17
+ import concurrent.futures
17
18
  import json
18
19
  import time
19
20
  import uuid
20
21
  from collections import deque
21
22
  from enum import Enum
22
- from typing import Any, Coroutine, Deque, Dict, List, Optional, Set, Tuple
23
+ from typing import (
24
+ Any,
25
+ Coroutine,
26
+ Deque,
27
+ Dict,
28
+ List,
29
+ Optional,
30
+ Set,
31
+ Tuple,
32
+ Union,
33
+ )
23
34
 
24
35
  from colorama import Fore
25
36
 
@@ -31,12 +42,16 @@ from camel.societies.workforce.base import BaseNode
31
42
  from camel.societies.workforce.prompts import (
32
43
  ASSIGN_TASK_PROMPT,
33
44
  CREATE_NODE_PROMPT,
45
+ FAILURE_ANALYSIS_PROMPT,
34
46
  WF_TASK_DECOMPOSE_PROMPT,
35
47
  )
36
48
  from camel.societies.workforce.role_playing_worker import RolePlayingWorker
37
49
  from camel.societies.workforce.single_agent_worker import SingleAgentWorker
38
50
  from camel.societies.workforce.task_channel import TaskChannel
39
51
  from camel.societies.workforce.utils import (
52
+ FailureContext,
53
+ RecoveryDecision,
54
+ RecoveryStrategy,
40
55
  TaskAssignment,
41
56
  TaskAssignResult,
42
57
  WorkerConf,
@@ -200,12 +215,14 @@ class Workforce(BaseNode):
200
215
  children: Optional[List[BaseNode]] = None,
201
216
  coordinator_agent: Optional[ChatAgent] = None,
202
217
  task_agent: Optional[ChatAgent] = None,
203
- new_worker_agent: Optional[ChatAgent] = None, # TODO: use MCP Agent
218
+ new_worker_agent: Optional[ChatAgent] = None,
204
219
  graceful_shutdown_timeout: float = 15.0,
205
220
  share_memory: bool = False,
206
221
  ) -> None:
207
222
  super().__init__(description)
208
- self._child_listening_tasks: Deque[asyncio.Task] = deque()
223
+ self._child_listening_tasks: Deque[
224
+ Union[asyncio.Task, concurrent.futures.Future]
225
+ ] = deque()
209
226
  self._children = children or []
210
227
  self.new_worker_agent = new_worker_agent
211
228
  self.graceful_shutdown_timeout = graceful_shutdown_timeout
@@ -325,9 +342,10 @@ class Workforce(BaseNode):
325
342
  "settings (ModelPlatformType.DEFAULT, ModelType.DEFAULT) "
326
343
  "with default system message and TaskPlanningToolkit."
327
344
  )
345
+ task_tools = TaskPlanningToolkit().get_tools()
328
346
  self.task_agent = ChatAgent(
329
347
  task_sys_msg,
330
- tools=TaskPlanningToolkit().get_tools(), # type: ignore[arg-type]
348
+ tools=task_tools, # type: ignore[arg-type]
331
349
  )
332
350
  else:
333
351
  logger.info(
@@ -563,6 +581,69 @@ class Workforce(BaseNode):
563
581
  except Exception as e:
564
582
  logger.warning(f"Error synchronizing shared memory: {e}")
565
583
 
584
+ def _update_dependencies_for_decomposition(
585
+ self, original_task: Task, subtasks: List[Task]
586
+ ) -> None:
587
+ r"""Update dependency tracking when a task is decomposed into subtasks.
588
+ Tasks that depended on the original task should now depend on all
589
+ subtasks. The last subtask inherits the original task's dependencies.
590
+ """
591
+ if not subtasks:
592
+ return
593
+
594
+ original_task_id = original_task.id
595
+ subtask_ids = [subtask.id for subtask in subtasks]
596
+
597
+ # Find tasks that depend on the original task
598
+ dependent_task_ids = [
599
+ task_id
600
+ for task_id, deps in self._task_dependencies.items()
601
+ if original_task_id in deps
602
+ ]
603
+
604
+ # Update dependent tasks to depend on all subtasks
605
+ for task_id in dependent_task_ids:
606
+ dependencies = self._task_dependencies[task_id]
607
+ dependencies.remove(original_task_id)
608
+ dependencies.extend(subtask_ids)
609
+
610
+ # The last subtask inherits original task's dependencies (if any)
611
+ if original_task_id in self._task_dependencies:
612
+ original_dependencies = self._task_dependencies[original_task_id]
613
+ if original_dependencies:
614
+ # Set dependencies for the last subtask to maintain execution
615
+ # order
616
+ self._task_dependencies[subtask_ids[-1]] = (
617
+ original_dependencies.copy()
618
+ )
619
+ # Remove original task dependencies as it's now decomposed
620
+ del self._task_dependencies[original_task_id]
621
+
622
+ def _increment_in_flight_tasks(self, task_id: str) -> None:
623
+ r"""Safely increment the in-flight tasks counter with logging."""
624
+ self._in_flight_tasks += 1
625
+ logger.debug(
626
+ f"Incremented in-flight tasks for {task_id}. "
627
+ f"Count: {self._in_flight_tasks}"
628
+ )
629
+
630
+ def _decrement_in_flight_tasks(
631
+ self, task_id: str, context: str = ""
632
+ ) -> None:
633
+ r"""Safely decrement the in-flight tasks counter with safety checks."""
634
+ if self._in_flight_tasks > 0:
635
+ self._in_flight_tasks -= 1
636
+ logger.debug(
637
+ f"Decremented in-flight tasks for {task_id} ({context}). "
638
+ f"Count: {self._in_flight_tasks}"
639
+ )
640
+ else:
641
+ logger.debug(
642
+ f"Attempted to decrement in-flight tasks for {task_id} "
643
+ f"({context}) but counter is already 0. "
644
+ f"Counter: {self._in_flight_tasks}"
645
+ )
646
+
566
647
  def _cleanup_task_tracking(self, task_id: str) -> None:
567
648
  r"""Clean up tracking data for a task to prevent memory leaks.
568
649
 
@@ -586,12 +667,86 @@ class Workforce(BaseNode):
586
667
  )
587
668
  self.task_agent.reset()
588
669
  subtasks = task.decompose(self.task_agent, decompose_prompt)
589
- task.subtasks = subtasks
590
- for subtask in subtasks:
591
- subtask.parent = task
670
+
671
+ # Update dependency tracking for decomposed task
672
+ if subtasks:
673
+ self._update_dependencies_for_decomposition(task, subtasks)
592
674
 
593
675
  return subtasks
594
676
 
677
+ def _analyze_failure(
678
+ self, task: Task, error_message: str
679
+ ) -> RecoveryDecision:
680
+ r"""Analyze a task failure and decide on the best recovery strategy.
681
+
682
+ Args:
683
+ task (Task): The failed task
684
+ error_message (str): The error message from the failure
685
+
686
+ Returns:
687
+ RecoveryDecision: The decided recovery strategy with reasoning
688
+ """
689
+ # First, do a quick smart analysis based on error patterns
690
+ error_msg_lower = error_message.lower()
691
+ if any(
692
+ keyword in error_msg_lower
693
+ for keyword in [
694
+ 'connection',
695
+ 'network',
696
+ 'server disconnected',
697
+ 'timeout',
698
+ 'apiconnectionerror',
699
+ ]
700
+ ):
701
+ return RecoveryDecision(
702
+ strategy=RecoveryStrategy.RETRY,
703
+ reasoning="Network/connection error detected, retrying task",
704
+ modified_task_content=None,
705
+ )
706
+
707
+ # Create failure context
708
+ failure_context = FailureContext(
709
+ task_id=task.id,
710
+ task_content=task.content,
711
+ failure_count=task.failure_count,
712
+ error_message=error_message,
713
+ worker_id=task.assigned_worker_id,
714
+ task_depth=task.get_depth(),
715
+ additional_info=str(task.additional_info)
716
+ if task.additional_info
717
+ else None,
718
+ )
719
+
720
+ # Format the analysis prompt
721
+ analysis_prompt = FAILURE_ANALYSIS_PROMPT.format(
722
+ task_id=failure_context.task_id,
723
+ task_content=failure_context.task_content,
724
+ failure_count=failure_context.failure_count,
725
+ error_message=failure_context.error_message,
726
+ worker_id=failure_context.worker_id or "unknown",
727
+ task_depth=failure_context.task_depth,
728
+ additional_info=failure_context.additional_info or "None",
729
+ )
730
+
731
+ try:
732
+ # Get decision from task agent
733
+ self.task_agent.reset()
734
+ response = self.task_agent.step(
735
+ analysis_prompt, response_format=RecoveryDecision
736
+ )
737
+ return response.msg.parsed
738
+
739
+ except Exception as e:
740
+ logger.warning(
741
+ f"Error during failure analysis: {e}, defaulting to RETRY"
742
+ )
743
+ return RecoveryDecision(
744
+ strategy=RecoveryStrategy.RETRY,
745
+ reasoning=f"Analysis failed due to error: {e!s}, "
746
+ f"defaulting to retry",
747
+ modified_task_content=None,
748
+ )
749
+
595
750
  # Human intervention methods
596
751
  async def _async_pause(self) -> None:
597
752
  r"""Async implementation of pause to run on the event loop."""
@@ -977,9 +1132,6 @@ class Workforce(BaseNode):
977
1132
  needed
978
1133
  >>> print(result.result)
979
1134
  """
980
- import asyncio
981
- import concurrent.futures
982
-
983
1135
  # Check if we're already in an event loop
984
1136
  try:
985
1137
  current_loop = asyncio.get_running_loop()
@@ -1154,7 +1306,39 @@ class Workforce(BaseNode):
1154
1306
 
1155
1307
  return self._task
1156
1308
 
1157
- @check_if_running(False)
1309
+ def _start_child_node_when_paused(
1310
+ self, start_coroutine: Coroutine
1311
+ ) -> None:
1312
+ r"""Helper to start a child node when workforce is paused.
1313
+
1314
+ Args:
1315
+ start_coroutine: The coroutine to start (e.g., worker_node.start())
1316
+ """
1317
+ if self._state == WorkforceState.PAUSED and hasattr(
1318
+ self, '_child_listening_tasks'
1319
+ ):
1320
+ if self._loop and not self._loop.is_closed():
1321
+ # Use thread-safe coroutine execution for dynamic addition
1322
+ child_task: Union[asyncio.Task, concurrent.futures.Future]
1323
+ try:
1324
+ # Check if we're in the same thread as the loop
1325
+ current_loop = asyncio.get_running_loop()
1326
+ if current_loop is self._loop:
1327
+ # Same loop context - use create_task
1328
+ child_task = self._loop.create_task(start_coroutine)
1329
+ else:
1330
+ # Different loop context - use thread-safe approach
1331
+ child_task = asyncio.run_coroutine_threadsafe(
1332
+ start_coroutine, self._loop
1333
+ )
1334
+ except RuntimeError:
1335
+ # No running loop in current thread - use thread-safe
1336
+ # approach
1337
+ child_task = asyncio.run_coroutine_threadsafe(
1338
+ start_coroutine, self._loop
1339
+ )
1340
+ self._child_listening_tasks.append(child_task)
1341
+
1158
1342
  def add_single_agent_worker(
1159
1343
  self,
1160
1344
  description: str,
@@ -1162,6 +1346,7 @@ class Workforce(BaseNode):
1162
1346
  pool_max_size: int = DEFAULT_WORKER_POOL_SIZE,
1163
1347
  ) -> Workforce:
1164
1348
  r"""Add a worker node to the workforce that uses a single agent.
1349
+ Can be called when workforce is paused to dynamically add workers.
1165
1350
 
1166
1351
  Args:
1167
1352
  description (str): Description of the worker node.
@@ -1171,7 +1356,15 @@ class Workforce(BaseNode):
1171
1356
 
1172
1357
  Returns:
1173
1358
  Workforce: The workforce node itself.
1359
+
1360
+ Raises:
1361
+ RuntimeError: If called while workforce is running (not paused).
1174
1362
  """
1363
+ if self._state == WorkforceState.RUNNING:
1364
+ raise RuntimeError(
1365
+ "Cannot add workers while workforce is running. "
1366
+ "Pause the workforce first."
1367
+ )
1175
1368
  # Ensure the worker agent shares this workforce's pause control
1176
1369
  self._attach_pause_event_to_agent(worker)
1177
1370
 
@@ -1181,6 +1374,14 @@ class Workforce(BaseNode):
1181
1374
  pool_max_size=pool_max_size,
1182
1375
  )
1183
1376
  self._children.append(worker_node)
1377
+
1378
+ # If we have a channel set up, set it for the new worker
1379
+ if hasattr(self, '_channel') and self._channel is not None:
1380
+ worker_node.set_channel(self._channel)
1381
+
1382
+ # If workforce is paused, start the worker's listening task
1383
+ self._start_child_node_when_paused(worker_node.start())
1384
+
1184
1385
  if self.metrics_logger:
1185
1386
  self.metrics_logger.log_worker_created(
1186
1387
  worker_id=worker_node.node_id,
@@ -1189,7 +1390,6 @@ class Workforce(BaseNode):
1189
1390
  )
1190
1391
  return self
1191
1392
 
1192
- @check_if_running(False)
1193
1393
  def add_role_playing_worker(
1194
1394
  self,
1195
1395
  description: str,
@@ -1201,6 +1401,7 @@ class Workforce(BaseNode):
1201
1401
  chat_turn_limit: int = 3,
1202
1402
  ) -> Workforce:
1203
1403
  r"""Add a worker node to the workforce that uses `RolePlaying` system.
1404
+ Can be called when workforce is paused to dynamically add workers.
1204
1405
 
1205
1406
  Args:
1206
1407
  description (str): Description of the node.
@@ -1220,7 +1421,15 @@ class Workforce(BaseNode):
1220
1421
 
1221
1422
  Returns:
1222
1423
  Workforce: The workforce node itself.
1424
+
1425
+ Raises:
1426
+ RuntimeError: If called while workforce is running (not paused).
1223
1427
  """
1428
+ if self._state == WorkforceState.RUNNING:
1429
+ raise RuntimeError(
1430
+ "Cannot add workers while workforce is running. "
1431
+ "Pause the workforce first."
1432
+ )
1224
1433
  # Ensure provided kwargs carry pause_event so that internally created
1225
1434
  # ChatAgents (assistant/user/summarizer) inherit it.
1226
1435
  assistant_agent_kwargs = self._ensure_pause_event_in_kwargs(
@@ -1243,6 +1452,14 @@ class Workforce(BaseNode):
1243
1452
  chat_turn_limit=chat_turn_limit,
1244
1453
  )
1245
1454
  self._children.append(worker_node)
1455
+
1456
+ # If we have a channel set up, set it for the new worker
1457
+ if hasattr(self, '_channel') and self._channel is not None:
1458
+ worker_node.set_channel(self._channel)
1459
+
1460
+ # If workforce is paused, start the worker's listening task
1461
+ self._start_child_node_when_paused(worker_node.start())
1462
+
1246
1463
  if self.metrics_logger:
1247
1464
  self.metrics_logger.log_worker_created(
1248
1465
  worker_id=worker_node.node_id,
@@ -1251,20 +1468,35 @@ class Workforce(BaseNode):
1251
1468
  )
1252
1469
  return self
1253
1470
 
1254
- @check_if_running(False)
1255
1471
  def add_workforce(self, workforce: Workforce) -> Workforce:
1256
1472
  r"""Add a workforce node to the workforce.
1473
+ Can be called when workforce is paused to dynamically add workers.
1257
1474
 
1258
1475
  Args:
1259
1476
  workforce (Workforce): The workforce node to be added.
1260
1477
 
1261
1478
  Returns:
1262
1479
  Workforce: The workforce node itself.
1480
+
1481
+ Raises:
1482
+ RuntimeError: If called while workforce is running (not paused).
1263
1483
  """
1484
+ if self._state == WorkforceState.RUNNING:
1485
+ raise RuntimeError(
1486
+ "Cannot add workers while workforce is running. "
1487
+ "Pause the workforce first."
1488
+ )
1264
1489
  # Align child workforce's pause_event with this one for unified
1265
1490
  # control of worker agents only.
1266
1491
  workforce._pause_event = self._pause_event
1267
1492
  self._children.append(workforce)
1493
+
1494
+ # If we have a channel set up, set it for the new workforce
1495
+ if hasattr(self, '_channel') and self._channel is not None:
1496
+ workforce.set_channel(self._channel)
1497
+
1498
+ # If workforce is paused, start the child workforce's listening task
1499
+ self._start_child_node_when_paused(workforce.start())
1268
1500
  return self
1269
1501
 
1270
1502
  async def _async_reset(self) -> None:
@@ -1436,7 +1668,9 @@ class Workforce(BaseNode):
1436
1668
 
1437
1669
  return valid_assignments, invalid_assignments
1438
1670
 
1439
- def _handle_task_assignment_fallbacks(self, tasks: List[Task]) -> List:
1671
+ async def _handle_task_assignment_fallbacks(
1672
+ self, tasks: List[Task]
1673
+ ) -> List:
1440
1674
  r"""Create new workers for unassigned tasks as fallback.
1441
1675
 
1442
1676
  Args:
@@ -1449,7 +1683,7 @@ class Workforce(BaseNode):
1449
1683
 
1450
1684
  for task in tasks:
1451
1685
  logger.info(f"Creating new worker for unassigned task {task.id}")
1452
- new_worker = self._create_worker_node_for_task(task)
1686
+ new_worker = await self._create_worker_node_for_task(task)
1453
1687
 
1454
1688
  assignment = TaskAssignment(
1455
1689
  task_id=task.id,
@@ -1460,7 +1694,7 @@ class Workforce(BaseNode):
1460
1694
 
1461
1695
  return fallback_assignments
1462
1696
 
1463
- def _handle_assignment_retry_and_fallback(
1697
+ async def _handle_assignment_retry_and_fallback(
1464
1698
  self,
1465
1699
  invalid_assignments: List[TaskAssignment],
1466
1700
  tasks: List[Task],
@@ -1531,14 +1765,14 @@ class Workforce(BaseNode):
1531
1765
  f"Creating fallback workers for {len(unassigned_tasks)} "
1532
1766
  f"unassigned tasks"
1533
1767
  )
1534
- fallback_assignments = self._handle_task_assignment_fallbacks(
1535
- unassigned_tasks
1768
+ fallback_assignments = (
1769
+ await self._handle_task_assignment_fallbacks(unassigned_tasks)
1536
1770
  )
1537
1771
  final_assignments.extend(fallback_assignments)
1538
1772
 
1539
1773
  return final_assignments
1540
1774
 
1541
- def _find_assignee(
1775
+ async def _find_assignee(
1542
1776
  self,
1543
1777
  tasks: List[Task],
1544
1778
  ) -> TaskAssignResult:
@@ -1580,7 +1814,7 @@ class Workforce(BaseNode):
1580
1814
  # invalid assignments and unassigned tasks
1581
1815
  all_problem_assignments = invalid_assignments
1582
1816
  retry_and_fallback_assignments = (
1583
- self._handle_assignment_retry_and_fallback(
1817
+ await self._handle_assignment_retry_and_fallback(
1584
1818
  all_problem_assignments, tasks, valid_worker_ids
1585
1819
  )
1586
1820
  )
@@ -1600,15 +1834,13 @@ class Workforce(BaseNode):
1600
1834
  )
1601
1835
 
1602
1836
  try:
1603
- self._in_flight_tasks += 1
1604
1837
  await self._channel.post_task(task, self.node_id, assignee_id)
1838
+ self._increment_in_flight_tasks(task.id)
1605
1839
  logger.debug(
1606
1840
  f"Posted task {task.id} to {assignee_id}. "
1607
1841
  f"In-flight tasks: {self._in_flight_tasks}"
1608
1842
  )
1609
1843
  except Exception as e:
1610
- # Decrement counter if posting failed
1611
- self._in_flight_tasks -= 1
1612
1844
  logger.error(
1613
1845
  f"Failed to post task {task.id} to {assignee_id}: {e}"
1614
1846
  )
@@ -1616,7 +1848,7 @@ class Workforce(BaseNode):
1616
1848
  async def _post_dependency(self, dependency: Task) -> None:
1617
1849
  await self._channel.post_dependency(dependency, self.node_id)
1618
1850
 
1619
- def _create_worker_node_for_task(self, task: Task) -> Worker:
1851
+ async def _create_worker_node_for_task(self, task: Task) -> Worker:
1620
1852
  r"""Creates a new worker node for a given task and add it to the
1621
1853
  children list of this node. This is one of the actions that
1622
1854
  the coordinator can take when a task has failed.
@@ -1662,7 +1894,7 @@ class Workforce(BaseNode):
1662
1894
  f"Coordinator agent returned malformed JSON response. "
1663
1895
  )
1664
1896
 
1665
- new_agent = self._create_new_agent(
1897
+ new_agent = await self._create_new_agent(
1666
1898
  new_node_conf.role,
1667
1899
  new_node_conf.sys_msg,
1668
1900
  )
@@ -1689,14 +1921,19 @@ class Workforce(BaseNode):
1689
1921
  )
1690
1922
  return new_node
1691
1923
 
1692
- def _create_new_agent(self, role: str, sys_msg: str) -> ChatAgent:
1924
+ async def _create_new_agent(self, role: str, sys_msg: str) -> ChatAgent:
1693
1925
  worker_sys_msg = BaseMessage.make_assistant_message(
1694
1926
  role_name=role,
1695
1927
  content=sys_msg,
1696
1928
  )
1697
1929
 
1698
1930
  if self.new_worker_agent is not None:
1699
- return self.new_worker_agent
1931
+ # Clone the template agent to create an independent instance
1932
+ cloned_agent = self.new_worker_agent.clone(with_memory=False)
1933
+ # Update the system message for the specific role
1934
+ cloned_agent._system_message = worker_sys_msg
1935
+ cloned_agent.init_messages() # Initialize with new system message
1936
+ return cloned_agent
1700
1937
  else:
1701
1938
  # Default tools for a new agent
1702
1939
  function_list = [
@@ -1712,7 +1949,7 @@ class Workforce(BaseNode):
1712
1949
  )
1713
1950
 
1714
1951
  return ChatAgent(
1715
- worker_sys_msg,
1952
+ system_message=worker_sys_msg,
1716
1953
  model=model,
1717
1954
  tools=function_list, # type: ignore[arg-type]
1718
1955
  pause_event=self._pause_event,
@@ -1730,10 +1967,6 @@ class Workforce(BaseNode):
1730
1967
  timeout=TASK_TIMEOUT_SECONDS,
1731
1968
  )
1732
1969
  except Exception as e:
1733
- # Decrement in-flight counter to prevent hanging
1734
- if self._in_flight_tasks > 0:
1735
- self._in_flight_tasks -= 1
1736
-
1737
1970
  error_msg = (
1738
1971
  f"Error getting returned task {e} in "
1739
1972
  f"workforce {self.node_id}. "
@@ -1745,8 +1978,11 @@ class Workforce(BaseNode):
1745
1978
  if self._pending_tasks and self._assignees:
1746
1979
  for task in self._pending_tasks:
1747
1980
  if task.id in self._assignees:
1748
- # Mark this real task as failed
1981
+ # Mark task as failed and decrement counter
1749
1982
  task.set_state(TaskState.FAILED)
1983
+ self._decrement_in_flight_tasks(
1984
+ task.id, "timeout/error in _get_returned_task"
1985
+ )
1750
1986
  return task
1751
1987
  return None
1752
1988
 
@@ -1765,7 +2001,7 @@ class Workforce(BaseNode):
1765
2001
  f"Found {len(tasks_to_assign)} new tasks. "
1766
2002
  f"Requesting assignment..."
1767
2003
  )
1768
- batch_result = self._find_assignee(tasks_to_assign)
2004
+ batch_result = await self._find_assignee(tasks_to_assign)
1769
2005
  logger.debug(
1770
2006
  f"Coordinator returned assignments:\n"
1771
2007
  f"{json.dumps(batch_result.dict(), indent=2)}"
@@ -1788,17 +2024,19 @@ class Workforce(BaseNode):
1788
2024
  # Step 2: Iterate through all pending tasks and post those that are
1789
2025
  # ready
1790
2026
  posted_tasks = []
1791
- # Pre-compute completed task IDs set for O(1) lookups
1792
- completed_task_ids = {t.id for t in self._completed_tasks}
2027
+ # Pre-compute completed task IDs and their states for O(1) lookups
2028
+ completed_tasks_info = {t.id: t.state for t in self._completed_tasks}
1793
2029
 
1794
2030
  for task in self._pending_tasks:
1795
2031
  # A task must be assigned to be considered for posting
1796
2032
  if task.id in self._task_dependencies:
1797
2033
  dependencies = self._task_dependencies[task.id]
1798
2034
  # Check if all dependencies for this task are in the completed
1799
- # set
2035
+ # set and their state is DONE
1800
2036
  if all(
1801
- dep_id in completed_task_ids for dep_id in dependencies
2037
+ dep_id in completed_tasks_info
2038
+ and completed_tasks_info[dep_id] == TaskState.DONE
2039
+ for dep_id in dependencies
1802
2040
  ):
1803
2041
  assignee_id = self._assignees[task.id]
1804
2042
  logger.debug(
@@ -1844,7 +2082,6 @@ class Workforce(BaseNode):
1844
2082
  task_id=task.id,
1845
2083
  worker_id=worker_id,
1846
2084
  error_message=detailed_error,
1847
- error_type="TaskFailure",
1848
2085
  metadata={
1849
2086
  'failure_count': task.failure_count,
1850
2087
  'task_content': task.content,
@@ -1883,21 +2120,57 @@ class Workforce(BaseNode):
1883
2120
  await self._channel.archive_task(task.id)
1884
2121
  return True
1885
2122
 
1886
- if task.get_depth() > 3:
1887
- # Create a new worker node and reassign
1888
- assignee = self._create_worker_node_for_task(task)
2123
+ # Use intelligent failure analysis to decide recovery strategy
2124
+ recovery_decision = self._analyze_failure(task, detailed_error)
1889
2125
 
1890
- # Sync shared memory after creating new worker to provide context
1891
- if self.share_memory:
1892
- logger.info(
1893
- f"Syncing shared memory after creating new worker "
1894
- f"{assignee.node_id} for failed task {task.id}"
2126
+ logger.info(
2127
+ f"Task {task.id} failure "
2128
+ f"analysis: {recovery_decision.strategy.value} - "
2129
+ f"{recovery_decision.reasoning}"
2130
+ )
2131
+
2132
+ if recovery_decision.strategy == RecoveryStrategy.RETRY:
2133
+ # Simply retry the task by reposting it
2134
+ if task.id in self._assignees:
2135
+ assignee_id = self._assignees[task.id]
2136
+ await self._post_task(task, assignee_id)
2137
+ action_taken = f"retried with same worker {assignee_id}"
2138
+ else:
2139
+ # Find a new assignee and retry
2140
+ batch_result = await self._find_assignee([task])
2141
+ assignment = batch_result.assignments[0]
2142
+ self._assignees[task.id] = assignment.assignee_id
2143
+ await self._post_task(task, assignment.assignee_id)
2144
+ action_taken = (
2145
+ f"retried with new worker {assignment.assignee_id}"
1895
2146
  )
1896
- self._sync_shared_memory()
1897
2147
 
1898
- await self._post_task(task, assignee.node_id)
1899
- action_taken = f"reassigned to new worker {assignee.node_id}"
1900
- else:
2148
+ elif recovery_decision.strategy == RecoveryStrategy.REPLAN:
2149
+ # Modify the task content and retry
2150
+ if recovery_decision.modified_task_content:
2151
+ task.content = recovery_decision.modified_task_content
2152
+ logger.info(f"Task {task.id} content modified for replan")
2153
+
2154
+ # Repost the modified task
2155
+ if task.id in self._assignees:
2156
+ assignee_id = self._assignees[task.id]
2157
+ await self._post_task(task, assignee_id)
2158
+ action_taken = (
2159
+ f"replanned and retried with worker {assignee_id}"
2160
+ )
2161
+ else:
2162
+ # Find a new assignee for the replanned task
2163
+ batch_result = await self._find_assignee([task])
2164
+ assignment = batch_result.assignments[0]
2165
+ self._assignees[task.id] = assignment.assignee_id
2166
+ await self._post_task(task, assignment.assignee_id)
2167
+ action_taken = (
2168
+ f"replanned and assigned to "
2169
+ f"worker {assignment.assignee_id}"
2170
+ )
2171
+
2172
+ elif recovery_decision.strategy == RecoveryStrategy.DECOMPOSE:
2173
+ # Decompose the task into subtasks
1901
2174
  subtasks = self._decompose_task(task)
1902
2175
  if self.metrics_logger and subtasks:
1903
2176
  self.metrics_logger.log_task_decomposed(
@@ -1915,19 +2188,42 @@ class Workforce(BaseNode):
1915
2188
  # Insert packets at the head of the queue
1916
2189
  self._pending_tasks.extendleft(reversed(subtasks))
1917
2190
 
2191
+ await self._post_ready_tasks()
2192
+ action_taken = f"decomposed into {len(subtasks)} subtasks"
2193
+
2194
+ # Handle task completion differently for decomposed tasks
2195
+ if task.id in self._assignees:
2196
+ await self._channel.archive_task(task.id)
2197
+
2198
+ self._cleanup_task_tracking(task.id)
2199
+ logger.debug(
2200
+ f"Task {task.id} failed and was {action_taken}. "
2201
+ f"Dependencies updated for subtasks."
2202
+ )
2203
+
1918
2204
  # Sync shared memory after task decomposition
1919
2205
  if self.share_memory:
1920
2206
  logger.info(
1921
- f"Syncing shared memory after decomposing failed "
1922
- f"task {task.id}"
2207
+ f"Syncing shared memory after task {task.id} decomposition"
1923
2208
  )
1924
2209
  self._sync_shared_memory()
1925
2210
 
2211
+ # Check if any pending tasks are now ready to execute
1926
2212
  await self._post_ready_tasks()
1927
- action_taken = f"decomposed into {len(subtasks)} subtasks"
2213
+ return False
2214
+
2215
+ elif recovery_decision.strategy == RecoveryStrategy.CREATE_WORKER:
2216
+ assignee = await self._create_worker_node_for_task(task)
2217
+ await self._post_task(task, assignee.node_id)
2218
+ action_taken = (
2219
+ f"created new worker {assignee.node_id} and assigned "
2220
+ f"task {task.id} to it"
2221
+ )
2222
+
1928
2223
  if task.id in self._assignees:
1929
2224
  await self._channel.archive_task(task.id)
1930
2225
 
2226
+ self._cleanup_task_tracking(task.id)
1931
2227
  logger.debug(
1932
2228
  f"Task {task.id} failed and was {action_taken}. "
1933
2229
  f"Updating dependency state."
@@ -2020,31 +2316,65 @@ class Workforce(BaseNode):
2020
2316
  break
2021
2317
 
2022
2318
  if not found_and_removed:
2023
- # Task was already removed from pending queue (expected case when
2024
- # it had been popped immediately after posting). No need to
2025
- # draw user attention with a warning; record at debug level.
2319
+ # Task was already removed from pending queue (common case when
2320
+ # it was posted and removed immediately).
2026
2321
  logger.debug(
2027
2322
  f"Completed task {task.id} was already removed from pending "
2028
- "queue."
2323
+ "queue (normal for posted tasks)."
2029
2324
  )
2030
2325
 
2031
2326
  # Archive the task and update dependency tracking
2032
2327
  if task.id in self._assignees:
2033
2328
  await self._channel.archive_task(task.id)
2034
2329
 
2035
- # Ensure it's in completed tasks set
2036
- self._completed_tasks.append(task)
2330
+ # Ensure it's in completed tasks set by updating if it exists or
2331
+ # appending if it's new.
2332
+ task_found_in_completed = False
2333
+ for i, t in enumerate(self._completed_tasks):
2334
+ if t.id == task.id:
2335
+ self._completed_tasks[i] = task
2336
+ task_found_in_completed = True
2337
+ break
2338
+ if not task_found_in_completed:
2339
+ self._completed_tasks.append(task)
2037
2340
 
2038
2341
  # Handle parent task completion logic
2039
2342
  parent = task.parent
2040
- if parent and parent.id not in {t.id for t in self._completed_tasks}:
2343
+ if parent:
2344
+ # Check if all subtasks are completed and successful
2041
2345
  all_subtasks_done = all(
2042
- sub.id in {t.id for t in self._completed_tasks}
2346
+ any(
2347
+ t.id == sub.id and t.state == TaskState.DONE
2348
+ for t in self._completed_tasks
2349
+ )
2043
2350
  for sub in parent.subtasks
2044
2351
  )
2045
2352
  if all_subtasks_done:
2046
- # Set the parent task state to done
2353
+ # Collect results from successful subtasks only
2354
+ successful_results = []
2355
+ for sub in parent.subtasks:
2356
+ completed_subtask = next(
2357
+ (
2358
+ t
2359
+ for t in self._completed_tasks
2360
+ if t.id == sub.id and t.state == TaskState.DONE
2361
+ ),
2362
+ None,
2363
+ )
2364
+ if completed_subtask and completed_subtask.result:
2365
+ successful_results.append(
2366
+ f"--- Subtask {sub.id} Result ---\n"
2367
+ f"{completed_subtask.result}"
2368
+ )
2369
+
2370
+ # Set parent task state and result
2047
2371
  parent.state = TaskState.DONE
2372
+ parent.result = (
2373
+ "\n\n".join(successful_results)
2374
+ if successful_results
2375
+ else "All subtasks completed"
2376
+ )
2377
+
2048
2378
  logger.debug(
2049
2379
  f"All subtasks of {parent.id} are done. "
2050
2380
  f"Marking parent as complete."
@@ -2164,7 +2494,9 @@ class Workforce(BaseNode):
2164
2494
  await self._post_ready_tasks()
2165
2495
  continue
2166
2496
 
2167
- self._in_flight_tasks -= 1
2497
+ self._decrement_in_flight_tasks(
2498
+ returned_task.id, "task returned successfully"
2499
+ )
2168
2500
 
2169
2501
  # Check for stop request after getting task
2170
2502
  if self._stop_requested:
@@ -2249,8 +2581,9 @@ class Workforce(BaseNode):
2249
2581
 
2250
2582
  except Exception as e:
2251
2583
  # Decrement in-flight counter to prevent hanging
2252
- if self._in_flight_tasks > 0:
2253
- self._in_flight_tasks -= 1
2584
+ self._decrement_in_flight_tasks(
2585
+ "unknown", "exception in task processing loop"
2586
+ )
2254
2587
 
2255
2588
  logger.error(
2256
2589
  f"Error processing task in workforce {self.node_id}: {e}"
@@ -2329,8 +2662,20 @@ class Workforce(BaseNode):
2329
2662
  for task in self._child_listening_tasks:
2330
2663
  if not task.done():
2331
2664
  task.cancel()
2665
+
2666
+ # Handle both asyncio.Task and concurrent.futures.
2667
+ # Future
2668
+ awaitables = []
2669
+ for task in self._child_listening_tasks:
2670
+ if isinstance(task, concurrent.futures.Future):
2671
+ # Convert Future to awaitable
2672
+ awaitables.append(asyncio.wrap_future(task))
2673
+ else:
2674
+ # Already an asyncio.Task
2675
+ awaitables.append(task)
2676
+
2332
2677
  await asyncio.gather(
2333
- *self._child_listening_tasks,
2678
+ *awaitables,
2334
2679
  return_exceptions=True,
2335
2680
  )
2336
2681