camel-ai 0.2.75a6__py3-none-any.whl → 0.2.76__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of camel-ai might be problematic.

Files changed (97)
  1. camel/__init__.py +1 -1
  2. camel/agents/chat_agent.py +1001 -205
  3. camel/agents/mcp_agent.py +30 -27
  4. camel/configs/__init__.py +6 -0
  5. camel/configs/amd_config.py +70 -0
  6. camel/configs/cometapi_config.py +104 -0
  7. camel/data_collectors/alpaca_collector.py +15 -6
  8. camel/environments/tic_tac_toe.py +1 -1
  9. camel/interpreters/__init__.py +2 -0
  10. camel/interpreters/docker/Dockerfile +3 -12
  11. camel/interpreters/microsandbox_interpreter.py +395 -0
  12. camel/loaders/__init__.py +11 -2
  13. camel/loaders/chunkr_reader.py +9 -0
  14. camel/memories/__init__.py +2 -1
  15. camel/memories/agent_memories.py +3 -1
  16. camel/memories/blocks/chat_history_block.py +21 -3
  17. camel/memories/records.py +88 -8
  18. camel/messages/base.py +127 -34
  19. camel/models/__init__.py +4 -0
  20. camel/models/amd_model.py +101 -0
  21. camel/models/azure_openai_model.py +0 -6
  22. camel/models/base_model.py +30 -0
  23. camel/models/cometapi_model.py +83 -0
  24. camel/models/model_factory.py +4 -0
  25. camel/models/openai_compatible_model.py +0 -6
  26. camel/models/openai_model.py +0 -6
  27. camel/models/zhipuai_model.py +61 -2
  28. camel/parsers/__init__.py +18 -0
  29. camel/parsers/mcp_tool_call_parser.py +176 -0
  30. camel/retrievers/auto_retriever.py +1 -0
  31. camel/runtimes/daytona_runtime.py +11 -12
  32. camel/societies/workforce/prompts.py +131 -50
  33. camel/societies/workforce/single_agent_worker.py +434 -49
  34. camel/societies/workforce/structured_output_handler.py +30 -18
  35. camel/societies/workforce/task_channel.py +43 -0
  36. camel/societies/workforce/utils.py +105 -12
  37. camel/societies/workforce/workforce.py +1322 -311
  38. camel/societies/workforce/workforce_logger.py +24 -5
  39. camel/storages/key_value_storages/json.py +15 -2
  40. camel/storages/object_storages/google_cloud.py +1 -1
  41. camel/storages/vectordb_storages/oceanbase.py +10 -11
  42. camel/storages/vectordb_storages/tidb.py +8 -6
  43. camel/tasks/task.py +4 -3
  44. camel/toolkits/__init__.py +18 -5
  45. camel/toolkits/aci_toolkit.py +45 -0
  46. camel/toolkits/code_execution.py +28 -1
  47. camel/toolkits/context_summarizer_toolkit.py +684 -0
  48. camel/toolkits/dingtalk.py +1135 -0
  49. camel/toolkits/edgeone_pages_mcp_toolkit.py +11 -31
  50. camel/toolkits/{file_write_toolkit.py → file_toolkit.py} +194 -34
  51. camel/toolkits/function_tool.py +6 -1
  52. camel/toolkits/google_drive_mcp_toolkit.py +12 -31
  53. camel/toolkits/hybrid_browser_toolkit/config_loader.py +12 -0
  54. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit.py +79 -2
  55. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit_ts.py +95 -59
  56. camel/toolkits/hybrid_browser_toolkit/installer.py +203 -0
  57. camel/toolkits/hybrid_browser_toolkit/ts/package-lock.json +5 -612
  58. camel/toolkits/hybrid_browser_toolkit/ts/package.json +0 -1
  59. camel/toolkits/hybrid_browser_toolkit/ts/src/browser-session.ts +619 -95
  60. camel/toolkits/hybrid_browser_toolkit/ts/src/config-loader.ts +7 -2
  61. camel/toolkits/hybrid_browser_toolkit/ts/src/hybrid-browser-toolkit.ts +115 -219
  62. camel/toolkits/hybrid_browser_toolkit/ts/src/parent-child-filter.ts +226 -0
  63. camel/toolkits/hybrid_browser_toolkit/ts/src/snapshot-parser.ts +219 -0
  64. camel/toolkits/hybrid_browser_toolkit/ts/src/som-screenshot-injected.ts +543 -0
  65. camel/toolkits/hybrid_browser_toolkit/ts/src/types.ts +1 -0
  66. camel/toolkits/hybrid_browser_toolkit/ts/websocket-server.js +39 -6
  67. camel/toolkits/hybrid_browser_toolkit/ws_wrapper.py +405 -131
  68. camel/toolkits/hybrid_browser_toolkit_py/hybrid_browser_toolkit.py +9 -5
  69. camel/toolkits/{openai_image_toolkit.py → image_generation_toolkit.py} +98 -31
  70. camel/toolkits/markitdown_toolkit.py +27 -1
  71. camel/toolkits/mcp_toolkit.py +348 -348
  72. camel/toolkits/message_integration.py +3 -0
  73. camel/toolkits/minimax_mcp_toolkit.py +195 -0
  74. camel/toolkits/note_taking_toolkit.py +18 -8
  75. camel/toolkits/notion_mcp_toolkit.py +16 -26
  76. camel/toolkits/origene_mcp_toolkit.py +8 -49
  77. camel/toolkits/playwright_mcp_toolkit.py +12 -31
  78. camel/toolkits/resend_toolkit.py +168 -0
  79. camel/toolkits/slack_toolkit.py +50 -1
  80. camel/toolkits/terminal_toolkit/__init__.py +18 -0
  81. camel/toolkits/terminal_toolkit/terminal_toolkit.py +924 -0
  82. camel/toolkits/terminal_toolkit/utils.py +532 -0
  83. camel/toolkits/vertex_ai_veo_toolkit.py +590 -0
  84. camel/toolkits/video_analysis_toolkit.py +17 -11
  85. camel/toolkits/wechat_official_toolkit.py +483 -0
  86. camel/types/enums.py +124 -1
  87. camel/types/unified_model_type.py +5 -0
  88. camel/utils/commons.py +17 -0
  89. camel/utils/context_utils.py +804 -0
  90. camel/utils/mcp.py +136 -2
  91. camel/utils/token_counting.py +25 -17
  92. {camel_ai-0.2.75a6.dist-info → camel_ai-0.2.76.dist-info}/METADATA +158 -59
  93. {camel_ai-0.2.75a6.dist-info → camel_ai-0.2.76.dist-info}/RECORD +95 -76
  94. camel/loaders/pandas_reader.py +0 -368
  95. camel/toolkits/terminal_toolkit.py +0 -1788
  96. {camel_ai-0.2.75a6.dist-info → camel_ai-0.2.76.dist-info}/WHEEL +0 -0
  97. {camel_ai-0.2.75a6.dist-info → camel_ai-0.2.76.dist-info}/licenses/LICENSE +0 -0
@@ -16,12 +16,15 @@ from __future__ import annotations
 import asyncio
 import concurrent.futures
 import json
+import os
 import time
 import uuid
 from collections import deque
 from enum import Enum
 from typing import (
+    TYPE_CHECKING,
     Any,
+    Callable,
     Coroutine,
     Deque,
     Dict,
@@ -31,8 +34,12 @@ from typing import (
     Set,
     Tuple,
     Union,
+    cast,
 )
 
+if TYPE_CHECKING:
+    from camel.utils.context_utils import ContextUtility
+
 from colorama import Fore
 
 from camel.agents import ChatAgent
@@ -43,19 +50,23 @@ from camel.societies.workforce.base import BaseNode
 from camel.societies.workforce.prompts import (
     ASSIGN_TASK_PROMPT,
     CREATE_NODE_PROMPT,
-    FAILURE_ANALYSIS_PROMPT,
+    FAILURE_ANALYSIS_RESPONSE_FORMAT,
+    QUALITY_EVALUATION_RESPONSE_FORMAT,
+    TASK_AGENT_SYSTEM_MESSAGE,
+    TASK_ANALYSIS_PROMPT,
     TASK_DECOMPOSE_PROMPT,
 )
 from camel.societies.workforce.role_playing_worker import RolePlayingWorker
-from camel.societies.workforce.single_agent_worker import SingleAgentWorker
+from camel.societies.workforce.single_agent_worker import (
+    SingleAgentWorker,
+)
 from camel.societies.workforce.structured_output_handler import (
     StructuredOutputHandler,
 )
 from camel.societies.workforce.task_channel import TaskChannel
 from camel.societies.workforce.utils import (
-    FailureContext,
-    RecoveryDecision,
     RecoveryStrategy,
+    TaskAnalysisResult,
     TaskAssignment,
     TaskAssignResult,
     WorkerConf,
@@ -70,6 +81,7 @@ from camel.tasks.task import (
 )
 from camel.toolkits import (
     CodeExecutionToolkit,
+    FunctionTool,
     SearchToolkit,
     TaskPlanningToolkit,
     ThinkingToolkit,
@@ -79,7 +91,15 @@ from camel.utils import dependencies_required
 
 from .workforce_logger import WorkforceLogger
 
-logger = get_logger(__name__)
+if os.environ.get("TRACEROOT_ENABLED", "False").lower() == "true":
+    try:
+        import traceroot  # type: ignore[import]
+
+        logger = traceroot.get_logger('camel')
+    except ImportError:
+        logger = get_logger(__name__)
+else:
+    logger = get_logger(__name__)
 
 # Constants for configuration values
 MAX_TASK_RETRIES = 3
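
The new logger block opts into the optional traceroot integration only when the TRACEROOT_ENABLED environment variable is set, and falls back to the standard logger if the package is missing. A minimal standalone sketch of the same opt-in-with-fallback pattern — only `traceroot.get_logger('camel')` is taken from the hunk above; the helper name is illustrative and `logging.getLogger` stands in for camel's own `get_logger`:

    import logging
    import os


    def resolve_logger(name):
        # Use the optional third-party logger only when explicitly enabled;
        # an ImportError silently falls back to the stdlib logger.
        if os.environ.get("TRACEROOT_ENABLED", "False").lower() == "true":
            try:
                import traceroot

                return traceroot.get_logger('camel')
            except ImportError:
                pass
        return logging.getLogger(name)


    logger = resolve_logger(__name__)
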
@@ -265,6 +285,7 @@ class Workforce(BaseNode):
         self._pause_event = asyncio.Event()
         self._pause_event.set()  # Initially not paused
         self._stop_requested = False
+        self._skip_requested = False
         self._snapshots: List[WorkforceSnapshot] = []
         self._completed_tasks: List[Task] = []
         self._loop: Optional[asyncio.AbstractEventLoop] = None
@@ -311,8 +332,7 @@ class Workforce(BaseNode):
         if coordinator_agent.system_message is not None:
             user_sys_msg_content = coordinator_agent.system_message.content
             combined_content = (
-                f"{user_sys_msg_content}\n\n"
-                f"{coord_agent_sys_msg.content}"
+                f"{user_sys_msg_content}\n\n{coord_agent_sys_msg.content}"
             )
             combined_sys_msg = BaseMessage.make_assistant_message(
                 role_name=coordinator_agent.system_message.role_name,
@@ -336,10 +356,7 @@ class Workforce(BaseNode):
                 None,
             ),
             output_language=coordinator_agent.output_language,
-            tools=[
-                tool.func
-                for tool in coordinator_agent._internal_tools.values()
-            ],
+            tools=list(coordinator_agent._internal_tools.values()),
             external_tools=[
                 schema
                 for schema in coordinator_agent._external_tool_schemas.values()  # noqa: E501
@@ -352,7 +369,7 @@ class Workforce(BaseNode):
         # Set up task agent with default system message and required tools
         task_sys_msg = BaseMessage.make_assistant_message(
             role_name="Task Planner",
-            content="You are going to handle tasks.",
+            content=TASK_AGENT_SYSTEM_MESSAGE,
         )
         task_planning_tools = TaskPlanningToolkit().get_tools()
 
@@ -377,8 +394,7 @@ class Workforce(BaseNode):
         if task_agent.system_message is not None:
             user_task_sys_msg_content = task_agent.system_message.content
             combined_task_content = (
-                f"{user_task_sys_msg_content}\n\n"
-                f"{task_sys_msg.content}"
+                f"{user_task_sys_msg_content}\n\n{task_sys_msg.content}"
             )
             combined_task_sys_msg = BaseMessage.make_assistant_message(
                 role_name=task_agent.system_message.role_name,
@@ -389,9 +405,11 @@ class Workforce(BaseNode):
 
         # Since ChatAgent constructor uses a dictionary with
         # function names as keys, we don't need to manually deduplicate.
-        combined_tools = [
-            tool.func for tool in task_agent._internal_tools.values()
-        ] + [tool.func for tool in task_planning_tools]
+        combined_tools: List[Union[FunctionTool, Callable]] = cast(
+            List[Union[FunctionTool, Callable]],
+            list(task_agent._internal_tools.values())
+            + task_planning_tools,
+        )
 
         # Create a new agent with the provided agent's configuration
         # but with the combined system message and tools
@@ -438,10 +456,30 @@ class Workforce(BaseNode):
                 "better context continuity during task handoffs."
             )
 
+        # Shared context utility for workflow management (created lazily)
+        self._shared_context_utility: Optional["ContextUtility"] = None
+
     # ------------------------------------------------------------------
     # Helper for propagating pause control to externally supplied agents
     # ------------------------------------------------------------------
 
+    def _get_or_create_shared_context_utility(self) -> "ContextUtility":
+        r"""Get or create the shared context utility for workflow management.
+
+        This method creates the context utility only when needed, avoiding
+        unnecessary session folder creation during initialization.
+
+        Returns:
+            ContextUtility: The shared context utility instance.
+        """
+        if self._shared_context_utility is None:
+            from camel.utils.context_utils import ContextUtility
+
+            self._shared_context_utility = (
+                ContextUtility.get_workforce_shared()
+            )
+        return self._shared_context_utility
+
     def _validate_agent_compatibility(
         self, agent: ChatAgent, agent_context: str = "agent"
     ) -> None:
@@ -478,6 +516,9 @@ class Workforce(BaseNode):
                 "the Workforce."
             )
 
+    # ------------------------------------------------------------------
+    # Helper for propagating pause control to externally supplied agents
+    # ------------------------------------------------------------------
     def _attach_pause_event_to_agent(self, agent: ChatAgent) -> None:
         r"""Ensure the given ChatAgent shares this workforce's pause_event.
 
@@ -765,76 +806,124 @@ class Workforce(BaseNode):
         self._update_dependencies_for_decomposition(task, subtasks)
         return subtasks
 
-    def _analyze_failure(
-        self, task: Task, error_message: str
-    ) -> RecoveryDecision:
-        r"""Analyze a task failure and decide on the best recovery strategy.
+    def _analyze_task(
+        self,
+        task: Task,
+        *,
+        for_failure: bool,
+        error_message: Optional[str] = None,
+    ) -> TaskAnalysisResult:
+        r"""Unified task analysis for both failures and quality evaluation.
+
+        This method consolidates the logic for analyzing task failures and
+        evaluating task quality, using the unified TASK_ANALYSIS_PROMPT.
 
         Args:
-            task (Task): The failed task
-            error_message (str): The error message from the failure
+            task (Task): The task to analyze
+            for_failure (bool): True for failure analysis, False for quality
+                evaluation
+            error_message (Optional[str]): Error message, required when
+                for_failure=True
 
         Returns:
-            RecoveryDecision: The decided recovery strategy with reasoning
+            TaskAnalysisResult: Unified analysis result with recovery strategy
+                and optional quality metrics
+
+        Raises:
+            ValueError: If for_failure=True but error_message is None
         """
-        # First, do a quick smart analysis based on error patterns
-        error_msg_lower = error_message.lower()
-        if any(
-            keyword in error_msg_lower
-            for keyword in [
-                'connection',
-                'network',
-                'server disconnected',
-                'timeout',
-                'apiconnectionerror',
+        # Validate required parameters
+        if for_failure and error_message is None:
+            raise ValueError("error_message is required when for_failure=True")
+
+        # Determine task result and issue-specific analysis based on context
+        if for_failure:
+            task_result = "N/A (task failed)"
+            issue_type = "Task Failure"
+            issue_analysis = f"**Error Message:** {error_message}"
+            response_format = FAILURE_ANALYSIS_RESPONSE_FORMAT
+            result_schema = TaskAnalysisResult
+            fallback_values: Dict[str, Any] = {
+                "reasoning": "Defaulting to retry due to parsing error",
+                "recovery_strategy": RecoveryStrategy.RETRY,
+                "modified_task_content": None,
+                "issues": [error_message] if error_message else [],
+            }
+            examples: List[Dict[str, Any]] = [
+                {
+                    "reasoning": "Temporary network error, worth retrying",
+                    "recovery_strategy": "retry",
+                    "modified_task_content": None,
+                    "issues": ["Network timeout"],
+                }
             ]
-        ):
-            return RecoveryDecision(
-                strategy=RecoveryStrategy.RETRY,
-                reasoning="Network/connection error detected, retrying task",
-                modified_task_content=None,
+        else:
+            # Quality evaluation
+            task_result = task.result or "No result available"
+            issue_type = "Quality Evaluation"
+            issue_analysis = (
+                "Provide a quality score (0-100) and list any specific "
+                "issues found."
             )
+            response_format = QUALITY_EVALUATION_RESPONSE_FORMAT
+            result_schema = TaskAnalysisResult
+            fallback_values = {
+                "reasoning": (
+                    "Defaulting to acceptable quality due to parsing error"
+                ),
+                "issues": [],
+                "recovery_strategy": None,
+                "modified_task_content": None,
+                "quality_score": 80,
+            }
+            examples = [
+                {
+                    "reasoning": (
+                        "Excellent implementation with comprehensive tests"
+                    ),
+                    "issues": [],
+                    "recovery_strategy": None,
+                    "modified_task_content": None,
+                    "quality_score": 98,
+                },
+                {
+                    "reasoning": (
+                        "Implementation incomplete with missing features"
+                    ),
+                    "issues": [
+                        "Incomplete implementation",
+                        "Missing error handling",
+                    ],
+                    "recovery_strategy": "replan",
+                    "modified_task_content": (
+                        "Previous attempt was incomplete. "
+                        "Please implement with: 1) Full feature "
+                        "coverage, 2) Proper error handling"
+                    ),
+                    "quality_score": 45,
+                },
+            ]
 
-        # Create failure context
-        failure_context = FailureContext(
+        # Format the unified analysis prompt
+        analysis_prompt = TASK_ANALYSIS_PROMPT.format(
             task_id=task.id,
             task_content=task.content,
+            task_result=task_result,
             failure_count=task.failure_count,
-            error_message=error_message,
-            worker_id=task.assigned_worker_id,
             task_depth=task.get_depth(),
-            additional_info=str(task.additional_info)
-            if task.additional_info
-            else None,
-        )
-
-        # Format the analysis prompt
-        analysis_prompt = FAILURE_ANALYSIS_PROMPT.format(
-            task_id=failure_context.task_id,
-            task_content=failure_context.task_content,
-            failure_count=failure_context.failure_count,
-            error_message=failure_context.error_message,
-            worker_id=failure_context.worker_id or "unknown",
-            task_depth=failure_context.task_depth,
-            additional_info=failure_context.additional_info or "None",
+            assigned_worker=task.assigned_worker_id or "unknown",
+            issue_type=issue_type,
+            issue_specific_analysis=issue_analysis,
+            response_format=response_format,
         )
 
         try:
-            # Check if we should use structured handler
             if self.use_structured_output_handler:
-                # Use structured handler
                 enhanced_prompt = (
                     self.structured_handler.generate_structured_prompt(
                         base_prompt=analysis_prompt,
-                        schema=RecoveryDecision,
-                        examples=[
-                            {
-                                "strategy": "RETRY",
-                                "reasoning": "Temporary network error, "
-                                "worth retrying",
-                                "modified_task_content": None,
-                            }
-                        ],
+                        schema=result_schema,
+                        examples=examples,
                     )
                 )
 
@@ -843,43 +932,220 @@ class Workforce(BaseNode):
 
                 result = self.structured_handler.parse_structured_response(
                     response.msg.content if response.msg else "",
-                    schema=RecoveryDecision,
-                    fallback_values={
-                        "strategy": RecoveryStrategy.RETRY,
-                        "reasoning": "Defaulting to retry due to parsing "
-                        "issues",
-                        "modified_task_content": None,
-                    },
+                    schema=result_schema,
+                    fallback_values=fallback_values,
                 )
-                # Ensure we return a RecoveryDecision instance
-                if isinstance(result, RecoveryDecision):
+
+                if isinstance(result, TaskAnalysisResult):
                     return result
                 elif isinstance(result, dict):
-                    return RecoveryDecision(**result)
+                    return result_schema(**result)
                 else:
-                    return RecoveryDecision(
-                        strategy=RecoveryStrategy.RETRY,
-                        reasoning="Failed to parse recovery decision",
-                        modified_task_content=None,
-                    )
+                    # Fallback based on context
+                    return TaskAnalysisResult(**fallback_values)
             else:
-                # Use existing native structured output code
                 self.task_agent.reset()
                 response = self.task_agent.step(
-                    analysis_prompt, response_format=RecoveryDecision
+                    analysis_prompt, response_format=result_schema
                 )
                 return response.msg.parsed
 
         except Exception as e:
             logger.warning(
-                f"Error during failure analysis: {e}, defaulting to RETRY"
+                f"Error during task analysis "
+                f"({'failure' if for_failure else 'quality'}): {e}, "
+                f"using fallback"
             )
-            return RecoveryDecision(
-                strategy=RecoveryStrategy.RETRY,
-                reasoning=f"Analysis failed due to error: {e!s}, "
-                f"defaulting to retry",
-                modified_task_content=None,
-            )
+            return TaskAnalysisResult(**fallback_values)
+
+    async def _apply_recovery_strategy(
+        self,
+        task: Task,
+        recovery_decision: TaskAnalysisResult,
+    ) -> bool:
+        r"""Apply the recovery strategy from a task analysis result.
+
+        This method centralizes the recovery logic for both execution failures
+        and quality-based failures.
+
+        Args:
+            task (Task): The task that needs recovery
+            recovery_decision (TaskAnalysisResult): The analysis result with
+                recovery strategy
+
+        Returns:
+            bool: True if workforce should halt (e.g., decompose needs
+                different handling), False otherwise
+        """
+        strategy = (
+            recovery_decision.recovery_strategy or RecoveryStrategy.RETRY
+        )
+        action_taken = ""
+
+        try:
+            if strategy == RecoveryStrategy.RETRY:
+                # Simply retry the task by reposting it to the same worker
+                # Check both _assignees dict and task.assigned_worker_id
+                assignee_id = (
+                    self._assignees.get(task.id) or task.assigned_worker_id
+                )
+
+                if assignee_id:
+                    # Retry with the same worker - no coordinator call needed
+                    await self._post_task(task, assignee_id)
+                    action_taken = f"retried with same worker {assignee_id}"
+                    logger.info(
+                        f"Task {task.id} retrying with same worker "
+                        f"{assignee_id} (no coordinator call)"
+                    )
+                else:
+                    # No previous assignment exists - find a new assignee
+                    logger.info(
+                        f"Task {task.id} has no previous assignee, "
+                        f"calling coordinator"
+                    )
+                    batch_result = await self._find_assignee([task])
+                    assignment = batch_result.assignments[0]
+                    self._assignees[task.id] = assignment.assignee_id
+                    await self._post_task(task, assignment.assignee_id)
+                    action_taken = (
+                        f"retried with new worker {assignment.assignee_id}"
+                    )
+
+            elif strategy == RecoveryStrategy.REPLAN:
+                # Modify the task content and retry
+                if recovery_decision.modified_task_content:
+                    task.content = recovery_decision.modified_task_content
+                    logger.info(f"Task {task.id} content modified for replan")
+
+                # Repost the modified task
+                if task.id in self._assignees:
+                    assignee_id = self._assignees[task.id]
+                    await self._post_task(task, assignee_id)
+                    action_taken = (
+                        f"replanned and retried with worker {assignee_id}"
+                    )
+                else:
+                    # Find a new assignee for the replanned task
+                    batch_result = await self._find_assignee([task])
+                    assignment = batch_result.assignments[0]
+                    self._assignees[task.id] = assignment.assignee_id
+                    await self._post_task(task, assignment.assignee_id)
+                    action_taken = (
+                        f"replanned and assigned to "
+                        f"worker {assignment.assignee_id}"
+                    )
+
+            elif strategy == RecoveryStrategy.REASSIGN:
+                # Reassign to a different worker
+                old_worker = task.assigned_worker_id
+                logger.info(
+                    f"Task {task.id} will be reassigned from worker "
+                    f"{old_worker}"
+                )
+
+                # Find a different worker
+                batch_result = await self._find_assignee([task])
+                assignment = batch_result.assignments[0]
+                new_worker = assignment.assignee_id
+
+                # If same worker, force find another
+                if new_worker == old_worker and len(self._children) > 1:
+                    logger.info("Same worker selected, finding alternative")
+                    # Try to find different worker by adding note to
+                    # task content
+                    task.content = (
+                        f"{task.content}\n\n"
+                        f"Note: Previous worker {old_worker} had quality "
+                        f"issues. Needs different approach."
+                    )
+                    batch_result = await self._find_assignee([task])
+                    assignment = batch_result.assignments[0]
+                    new_worker = assignment.assignee_id
+
+                self._assignees[task.id] = new_worker
+                await self._post_task(task, new_worker)
+                action_taken = f"reassigned from {old_worker} to {new_worker}"
+                logger.info(
+                    f"Task {task.id} reassigned from {old_worker} to "
+                    f"{new_worker}"
+                )
+
+            elif strategy == RecoveryStrategy.DECOMPOSE:
+                # Decompose the task into subtasks
+                reason = (
+                    "failure"
+                    if not recovery_decision.is_quality_evaluation
+                    else "quality issues"
+                )
+                logger.info(
+                    f"Task {task.id} will be decomposed due to {reason}"
+                )
+                subtasks_result = self._decompose_task(task)
+
+                # Handle both streaming and non-streaming results
+                if isinstance(subtasks_result, Generator):
+                    subtasks = []
+                    for new_tasks in subtasks_result:
+                        subtasks.extend(new_tasks)
+                else:
+                    subtasks = subtasks_result
+
+                if self.metrics_logger and subtasks:
+                    self.metrics_logger.log_task_decomposed(
+                        parent_task_id=task.id,
+                        subtask_ids=[st.id for st in subtasks],
+                    )
+                    for subtask in subtasks:
+                        self.metrics_logger.log_task_created(
+                            task_id=subtask.id,
+                            description=subtask.content,
+                            parent_task_id=task.id,
+                            task_type=subtask.type,
+                            metadata=subtask.additional_info,
+                        )
+
+                # Insert subtasks at the head of the queue
+                self._pending_tasks.extendleft(reversed(subtasks))
+                await self._post_ready_tasks()
+                action_taken = f"decomposed into {len(subtasks)} subtasks"
+
+                logger.info(
+                    f"Task {task.id} decomposed into {len(subtasks)} subtasks"
+                )
+
+                # Sync shared memory after task decomposition
+                if self.share_memory:
+                    logger.info(
+                        f"Syncing shared memory after task {task.id} "
+                        f"decomposition"
+                    )
+                    self._sync_shared_memory()
+
+                # For decompose, we return early with special handling
+                return True
+
+            elif strategy == RecoveryStrategy.CREATE_WORKER:
+                assignee = await self._create_worker_node_for_task(task)
+                await self._post_task(task, assignee.node_id)
+                action_taken = (
+                    f"created new worker {assignee.node_id} and assigned "
+                    f"task {task.id} to it"
+                )
+
+        except Exception as e:
+            logger.error(
+                f"Recovery strategy {strategy} failed for task {task.id}: {e}",
+                exc_info=True,
+            )
+            raise
+
+        logger.debug(
+            f"Task {task.id} recovery: {action_taken}. "
+            f"Strategy: {strategy.value}"
+        )
+
+        return False
 
     # Human intervention methods
     async def _async_pause(self) -> None:
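
Together, `_analyze_task` and `_apply_recovery_strategy` split the old `_analyze_failure` flow into classify-then-act. A sketch of how the two plausibly compose inside a failure handler — both method names come from this diff, but the wrapper itself is hypothetical:

    # Hypothetical glue code, assumed to run as a Workforce method.
    async def _recover_from_failure(self, task, error_message):
        # Step 1: have the task agent classify the failure.
        decision = self._analyze_task(
            task, for_failure=True, error_message=error_message
        )
        # Step 2: act on it; True signals the task was decomposed and
        # needs no further reposting by the caller.
        return await self._apply_recovery_strategy(task, decision)
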
@@ -970,6 +1236,39 @@ class Workforce(BaseNode):
                 f"(event-loop not yet started)."
             )
 
+    async def _async_skip_gracefully(self) -> None:
+        r"""Async implementation of skip_gracefully to run on the event
+        loop.
+        """
+        self._skip_requested = True
+        if self._pause_event.is_set() is False:
+            self._pause_event.set()  # Resume if paused to process skip
+        logger.info(f"Workforce {self.node_id} skip requested.")
+
+    def skip_gracefully(self) -> None:
+        r"""Request workforce to skip current pending tasks and move to next
+        main task from the queue. If no main tasks exist, acts like
+        stop_gracefully.
+
+        This method clears the current pending subtasks and moves to the next
+        main task in the queue if available. Works both when the internal
+        event-loop is alive and when it has not yet been started.
+        """
+
+        if self._loop and not self._loop.is_closed():
+            self._submit_coro_to_loop(self._async_skip_gracefully())
+        else:
+            # Loop not yet created, set the flag synchronously so later
+            # startup will respect it.
+            self._skip_requested = True
+            # Ensure any pending pause is released so that when the loop does
+            # start it can see the skip request and exit.
+            self._pause_event.set()
+            logger.info(
+                f"Workforce {self.node_id} skip requested "
+                f"(event-loop not yet started)."
+            )
+
     def save_snapshot(self, description: str = "") -> None:
         r"""Save current state as a snapshot."""
         snapshot = WorkforceSnapshot(
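
`skip_gracefully` complements the existing `stop_gracefully`: instead of winding the workforce down, it abandons the current pending subtasks and advances to the next queued main task. A usage sketch, assuming `wf` is a `Workforce` that is already processing:

    # Skip the remaining subtasks of the current main task; with no
    # further main tasks queued this behaves like stop_gracefully().
    wf.skip_gracefully()

    # A full stop is still available when the run should end entirely.
    wf.stop_gracefully()
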
@@ -1029,36 +1328,148 @@ class Workforce(BaseNode):
         content: str,
         task_id: Optional[str] = None,
         additional_info: Optional[Dict[str, Any]] = None,
+        as_subtask: bool = False,
         insert_position: int = -1,
     ) -> Task:
-        r"""Add a new task to the pending queue."""
-        new_task = Task(
+        r"""Add a new task to the workforce.
+
+        By default, this method adds a main task that will be decomposed into
+        subtasks. Set `as_subtask=True` to add a task directly to the pending
+        subtask queue without decomposition.
+
+        Args:
+            content (str): The content of the task.
+            task_id (Optional[str], optional): Optional ID for the task.
+                If not provided, a unique ID will be generated.
+            additional_info (Optional[Dict[str, Any]], optional): Optional
+                additional metadata for the task.
+            as_subtask (bool, optional): If True, adds the task directly to
+                the pending subtask queue. If False, adds as a main task that
+                will be decomposed. Defaults to False.
+            insert_position (int, optional): Position to insert the task in
+                the pending queue. Only applies when as_subtask=True.
+                Defaults to -1 (append to end).
+
+        Returns:
+            Task: The created task object.
+        """
+        if as_subtask:
+            new_task = Task(
+                content=content,
+                id=task_id or f"human_added_{len(self._pending_tasks)}",
+                additional_info=additional_info,
+            )
+
+            # Add directly to current pending subtasks
+            if insert_position == -1:
+                self._pending_tasks.append(new_task)
+            else:
+                # Convert deque to list, insert, then back to deque
+                tasks_list = list(self._pending_tasks)
+                tasks_list.insert(insert_position, new_task)
+                self._pending_tasks = deque(tasks_list)
+
+            logger.info(f"New subtask added to pending queue: {new_task.id}")
+            return new_task
+        else:
+            # Add as main task that needs decomposition
+            # Use additional_info to mark this task needs decomposition
+            # Make a copy to avoid modifying user's dict
+            info = additional_info.copy() if additional_info else {}
+            info['_needs_decomposition'] = True
+
+            task_count = sum(
+                1
+                for t in self._pending_tasks
+                if t.additional_info
+                and t.additional_info.get('_needs_decomposition')
+            )
+
+            new_task = Task(
+                content=content,
+                id=task_id or f"main_task_{task_count}",
+                additional_info=info,
+            )
+
+            self._pending_tasks.append(new_task)
+            logger.info(f"New main task added to pending queue: {new_task.id}")
+            return new_task
+
+    def add_main_task(
+        self,
+        content: str,
+        task_id: Optional[str] = None,
+        additional_info: Optional[Dict[str, Any]] = None,
+    ) -> Task:
+        r"""Add a new main task that will be decomposed into subtasks.
+
+        This is an alias for :meth:`add_task` with `as_subtask=False`.
+
+        Args:
+            content (str): The content of the main task.
+            task_id (Optional[str], optional): Optional ID for the task.
+            additional_info (Optional[Dict[str, Any]], optional): Optional
+                additional metadata.
+
+        Returns:
+            Task: The created main task object.
+        """
+        return self.add_task(
             content=content,
-            id=task_id or f"human_added_{len(self._pending_tasks)}",
+            task_id=task_id,
             additional_info=additional_info,
+            as_subtask=False,
         )
-        if insert_position == -1:
-            self._pending_tasks.append(new_task)
-        else:
-            # Convert deque to list, insert, then back to deque
-            tasks_list = list(self._pending_tasks)
-            tasks_list.insert(insert_position, new_task)
-            self._pending_tasks = deque(tasks_list)
 
-        logger.info(f"New task added: {new_task.id}")
-        return new_task
+    def add_subtask(
+        self,
+        content: str,
+        task_id: Optional[str] = None,
+        additional_info: Optional[Dict[str, Any]] = None,
+        insert_position: int = -1,
+    ) -> Task:
+        r"""Add a new subtask to the current pending queue.
+
+        This is an alias for :meth:`add_task` with `as_subtask=True`.
+
+        Args:
+            content (str): The content of the subtask.
+            task_id (Optional[str], optional): Optional ID for the task.
+            additional_info (Optional[Dict[str, Any]], optional): Optional
+                additional metadata.
+            insert_position (int, optional): Position to insert the task.
+                Defaults to -1 (append to end).
+
+        Returns:
+            Task: The created subtask object.
+        """
+        return self.add_task(
+            content=content,
+            task_id=task_id,
+            additional_info=additional_info,
+            as_subtask=True,
+            insert_position=insert_position,
+        )
 
     def remove_task(self, task_id: str) -> bool:
-        r"""Remove a task from the pending queue."""
-        # Convert to list to find and remove
-        tasks_list = list(self._pending_tasks)
-        for i, task in enumerate(tasks_list):
+        r"""Remove a task from the pending queue or main task queue.
+
+        Args:
+            task_id (str): The ID of the task to remove.
+
+        Returns:
+            bool: True if task was found and removed, False otherwise.
+        """
+        # Check main task queue first
+        pending_tasks_list = list(self._pending_tasks)
+        for i, task in enumerate(pending_tasks_list):
             if task.id == task_id:
-                tasks_list.pop(i)
-                self._pending_tasks = deque(tasks_list)
-                logger.info(f"Task {task_id} removed.")
+                pending_tasks_list.pop(i)
+                self._pending_tasks = deque(pending_tasks_list)
+                logger.info(f"Task {task_id} removed from pending queue.")
                 return True
-        logger.warning(f"Task {task_id} not found in pending tasks.")
+
+        logger.warning(f"Task {task_id} not found in any task queue.")
         return False
 
     def reorder_tasks(self, task_ids: List[str]) -> bool:
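
With the reworked `add_task`, main tasks are tagged with `_needs_decomposition` and decomposed before execution, while subtasks enter the pending queue directly; `add_main_task` and `add_subtask` are thin aliases over the same method. A sketch of the intended call pattern — the `Workforce` import path follows camel's public API, and the task texts are placeholders:

    from camel.societies.workforce import Workforce

    wf = Workforce("My Team")

    # Queued as a main task: decomposed into subtasks before running.
    report = wf.add_main_task("Compile the quarterly research report")

    # Pushed straight into the pending subtask queue, at the front.
    wf.add_subtask("Re-run the failing data export", insert_position=0)
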
@@ -1173,26 +1584,21 @@ class Workforce(BaseNode):
             "main_task_id": self._task.id if self._task else None,
         }
 
-    @check_if_running(False)
-    async def process_task_async(
-        self, task: Task, interactive: bool = False
-    ) -> Task:
-        r"""Main entry point to process a task asynchronously.
+    async def handle_decompose_append_task(
+        self, task: Task, reset: bool = True
+    ) -> List[Task]:
+        r"""Handle task decomposition and validation with
+        workforce environment functions. Then append to
+        pending tasks if decomposition happened.
 
         Args:
             task (Task): The task to be processed.
-            interactive (bool, optional): If True, enables human-intervention
-                workflow (pause/resume/snapshot). Defaults to False, which
-                runs the task in a blocking one-shot manner.
+            reset (Bool): Should trigger workforce reset (Workforce must not
+                be running). Default: True
 
         Returns:
-            Task: The updated task.
+            List[Task]: The decomposed subtasks or the original task.
         """
-        # Delegate to intervention pipeline when requested to keep
-        # backward-compat.
-        if interactive:
-            return await self._process_task_with_snapshot(task)
-
         if not validate_task_content(task.content, task.id):
             task.state = TaskState.FAILED
             task.result = "Task failed: Invalid or empty content provided"
@@ -1200,10 +1606,16 @@ class Workforce(BaseNode):
                 f"Task {task.id} rejected: Invalid or empty content. "
                 f"Content preview: '{task.content}'"
             )
-            return task
+            return [task]
 
-        self.reset()
+        if reset and self._state != WorkforceState.RUNNING:
+            self.reset()
+            logger.info("Workforce reset before handling task.")
+
+        # Focus on the new task
         self._task = task
+        task.state = TaskState.FAILED
+
         if self.metrics_logger:
             self.metrics_logger.log_task_created(
                 task_id=task.id,
@@ -1211,7 +1623,6 @@ class Workforce(BaseNode):
                 task_type=task.type,
                 metadata=task.additional_info,
             )
-        task.state = TaskState.FAILED
         # The agent tend to be overconfident on the whole task, so we
         # decompose the task into subtasks first
         subtasks_result = self._decompose_task(task)
@@ -1237,20 +1648,46 @@ class Workforce(BaseNode):
                     task_type=subtask.type,
                     metadata=subtask.additional_info,
                 )
+
         if subtasks:
-            # If decomposition happened, the original task becomes a container.
-            # We only execute its subtasks.
+            # _pending_tasks will contain both undecomposed
+            # and decomposed tasks, so we use additional_info
+            # to mark the tasks that need decomposition instead
             self._pending_tasks.extendleft(reversed(subtasks))
         else:
             # If no decomposition, execute the original task.
             self._pending_tasks.append(task)
 
-        self.set_channel(TaskChannel())
-
-        await self.start()
+        return subtasks
 
-        if subtasks:
-            task.result = "\n\n".join(
+    @check_if_running(False)
+    async def process_task_async(
+        self, task: Task, interactive: bool = False
+    ) -> Task:
+        r"""Main entry point to process a task asynchronously.
+
+        Args:
+            task (Task): The task to be processed.
+            interactive (bool, optional): If True, enables human-intervention
+                workflow (pause/resume/snapshot). Defaults to False, which
+                runs the task in a blocking one-shot manner.
+
+        Returns:
+            Task: The updated task.
+        """
+        # Delegate to intervention pipeline when requested to keep
+        # backward-compat.
+        if interactive:
+            return await self._process_task_with_snapshot(task)
+
+        subtasks = await self.handle_decompose_append_task(task)
+
+        self.set_channel(TaskChannel())
+
+        await self.start()
+
+        if subtasks:
+            task.result = "\n\n".join(
                 f"--- Subtask {sub.id} Result ---\n{sub.result}"
                 for sub in task.subtasks
                 if sub.result
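
`process_task_async` is now a thin wrapper: validation, optional reset, and decomposition live in `handle_decompose_append_task`, leaving the wrapper to wire up the channel, start the run, and aggregate subtask results. A minimal end-to-end sketch — worker setup is elided, and the `Task` import path follows camel's public API:

    import asyncio

    from camel.societies.workforce import Workforce
    from camel.tasks import Task


    async def main():
        wf = Workforce("My Team")
        # ... add workers to wf here ...
        task = Task(content="Summarize the attached dataset", id="demo_0")
        done = await wf.process_task_async(task)
        # After decomposition, the result concatenates subtask results.
        print(done.result)


    asyncio.run(main())
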
@@ -1326,39 +1763,8 @@ class Workforce(BaseNode):
             Task: The updated task.
         """
 
-        if not validate_task_content(task.content, task.id):
-            task.state = TaskState.FAILED
-            task.result = "Task failed: Invalid or empty content provided"
-            logger.warning(
-                f"Task {task.id} rejected: Invalid or empty content. "
-                f"Content preview: '{task.content}'"
-            )
-            return task
+        await self.handle_decompose_append_task(task)
 
-        self.reset()
-        self._task = task
-        self._state = WorkforceState.RUNNING
-        task.state = TaskState.FAILED  # TODO: Add logic for OPEN
-
-        # Decompose the task into subtasks first
-        subtasks_result = self._decompose_task(task)
-
-        # Handle both streaming and non-streaming results
-        if isinstance(subtasks_result, Generator):
-            # This is a generator (streaming mode)
-            subtasks = []
-            for new_tasks in subtasks_result:
-                subtasks.extend(new_tasks)
-        else:
-            # This is a regular list (non-streaming mode)
-            subtasks = subtasks_result
-        if subtasks:
-            # If decomposition happened, the original task becomes a container.
-            # We only execute its subtasks.
-            self._pending_tasks.extendleft(reversed(subtasks))
-        else:
-            # If no decomposition, execute the original task.
-            self._pending_tasks.append(task)
         self.set_channel(TaskChannel())
 
         # Save initial snapshot
@@ -1509,6 +1915,7 @@ class Workforce(BaseNode):
         description: str,
         worker: ChatAgent,
         pool_max_size: int = DEFAULT_WORKER_POOL_SIZE,
+        enable_workflow_memory: bool = False,
     ) -> Workforce:
         r"""Add a worker node to the workforce that uses a single agent.
         Can be called when workforce is paused to dynamically add workers.
@@ -1518,6 +1925,9 @@ class Workforce(BaseNode):
             worker (ChatAgent): The agent to be added.
             pool_max_size (int): Maximum size of the agent pool.
                 (default: :obj:`10`)
+            enable_workflow_memory (bool): Whether to enable workflow memory
+                accumulation. Set to True if you plan to call
+                save_workflow_memories(). (default: :obj:`False`)
 
         Returns:
             Workforce: The workforce node itself.
@@ -1544,6 +1954,8 @@ class Workforce(BaseNode):
             worker=worker,
             pool_max_size=pool_max_size,
             use_structured_output_handler=self.use_structured_output_handler,
+            context_utility=None,  # Will be set during save/load operations
+            enable_workflow_memory=enable_workflow_memory,
         )
         self._children.append(worker_node)
 
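Workflow memory is opt-in per worker: pass `enable_workflow_memory=True` at registration time if `save_workflow_memories()` will be called later. A sketch, assuming `analyst_agent` is a pre-built `ChatAgent`:

    wf = Workforce("My Team")
    wf.add_single_agent_worker(
        description="data_analyst",
        worker=analyst_agent,
        enable_workflow_memory=True,  # required for later save calls
    )
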
@@ -1699,6 +2111,7 @@ class Workforce(BaseNode):
         # Reset intervention state
         self._state = WorkforceState.IDLE
         self._stop_requested = False
+        self._skip_requested = False
         # Handle asyncio.Event in a thread-safe way
         if self._loop and not self._loop.is_closed():
             # If we have a loop, use it to set the event safely
@@ -1719,6 +2132,237 @@ class Workforce(BaseNode):
         else:
             self.metrics_logger = WorkforceLogger(workforce_id=self.node_id)
 
+    def save_workflow_memories(self) -> Dict[str, str]:
+        r"""Save workflow memories for all SingleAgentWorker instances in the
+        workforce.
+
+        This method iterates through all child workers and triggers workflow
+        saving for SingleAgentWorker instances using their
+        save_workflow_memories() method.
+        Other worker types are skipped.
+
+        Returns:
+            Dict[str, str]: Dictionary mapping worker node IDs to save results.
+                Values are either file paths (success) or error messages
+                (failure).
+
+        Example:
+            >>> workforce = Workforce("My Team")
+            >>> # ... add workers and process tasks ...
+            >>> results = workforce.save_workflows()
+            >>> print(results)
+            {'worker_123': '/path/to/data_analyst_workflow_20250122.md',
+             'worker_456': 'error: No conversation context available'}
+        """
+        results = {}
+
+        # Get or create shared context utility for this save operation
+        shared_context_utility = self._get_or_create_shared_context_utility()
+
+        for child in self._children:
+            if isinstance(child, SingleAgentWorker):
+                try:
+                    # Set the shared context utility for this operation
+                    child._shared_context_utility = shared_context_utility
+                    child.worker.set_context_utility(shared_context_utility)
+
+                    result = child.save_workflow_memories()
+                    if result.get("status") == "success":
+                        results[child.node_id] = result.get(
+                            "file_path", "unknown_path"
+                        )
+                    else:
+                        # Error: check if there's a separate message field,
+                        # otherwise use the status itself
+                        error_msg = result.get(
+                            "message", result.get("status", "Unknown error")
+                        )
+                        results[child.node_id] = f"error: {error_msg}"
+
+                except Exception as e:
+                    results[child.node_id] = f"error: {e!s}"
+            else:
+                # Skip non-SingleAgentWorker types
+                results[child.node_id] = (
+                    f"skipped: {type(child).__name__} not supported"
+                )
+
+        logger.info(f"Workflow save completed for {len(results)} workers")
+        return results
+
+    def load_workflow_memories(
+        self,
+        max_files_to_load: int = 3,
+        session_id: Optional[str] = None,
+    ) -> Dict[str, bool]:
+        r"""Load workflow memories for all SingleAgentWorker instances in the
+        workforce.
+
+        This method iterates through all child workers and loads relevant
+        workflow files for SingleAgentWorker instances using their
+        load_workflow_memories() method.
+        Workers match files based on their description names.
+
+        Args:
+            max_files_to_load (int): Maximum number of workflow files to load
+                per worker. (default: :obj:`3`)
+            session_id (Optional[str]): Specific workforce session ID to load
+                from. If None, searches across all sessions.
+                (default: :obj:`None`)
+
+        Returns:
+            Dict[str, bool]: Dictionary mapping worker node IDs to load
+                success status.
+                True indicates successful loading, False indicates failure.
+
+        Example:
+            >>> workforce = Workforce("My Team")
+            >>> workforce.add_single_agent_worker(
+            ...     "data_analyst", analyst_agent
+            ... )
+            >>> success_status = workforce.load_workflows()
+            >>> print(success_status)
+            {'worker_123': True}  # Successfully loaded workflows for
+                                  # data_analyst
+        """
+        results = {}
+
+        # For loading, we don't create a new session - instead we search
+        # existing ones
+        # Each worker will search independently across all existing sessions
+
+        # First, load workflows for SingleAgentWorker instances
+        for child in self._children:
+            if isinstance(child, SingleAgentWorker):
+                try:
+                    # For loading, don't set shared context utility
+                    # Let each worker search across existing sessions
+                    success = child.load_workflow_memories(
+                        max_files_to_load=max_files_to_load,
+                        session_id=session_id,
+                    )
+                    results[child.node_id] = success
+
+                except Exception as e:
+                    logger.error(
+                        f"Failed to load workflow for {child.node_id}: {e!s}"
+                    )
+                    results[child.node_id] = False
+            else:
+                # Skip non-SingleAgentWorker types
+                results[child.node_id] = False
+
+        # Load aggregated workflow summaries for coordinator and task agents
+        self._load_management_agent_workflows(max_files_to_load, session_id)
+
+        logger.info(f"Workflow load completed for {len(results)} workers")
+        return results
+
+    def _load_management_agent_workflows(
+        self, max_files_to_load: int, session_id: Optional[str] = None
+    ) -> None:
+        r"""Load workflow summaries for coordinator and task planning agents.
+
+        This method loads aggregated workflow summaries to help:
+        - Coordinator agent: understand task assignment patterns and worker
+          capabilities
+        - Task agent: understand task decomposition patterns and
+          successful strategies
+
+        Args:
+            max_files_to_load (int): Maximum number of workflow files to load.
+            session_id (Optional[str]): Specific session ID to load from.
+                If None, searches across all sessions.
+        """
+        try:
+            import glob
+            import os
+            from pathlib import Path
+
+            from camel.utils.context_utils import ContextUtility
+
+            # For loading management workflows, search across all sessions
+            camel_workdir = os.environ.get("CAMEL_WORKDIR")
+            if camel_workdir:
+                base_dir = os.path.join(camel_workdir, "workforce_workflows")
+            else:
+                base_dir = "workforce_workflows"
+
+            # Search for workflow files in specified or all session directories
+            if session_id:
+                search_path = str(
+                    Path(base_dir) / session_id / "*_workflow*.md"
+                )
+            else:
+                search_path = str(Path(base_dir) / "*" / "*_workflow*.md")
+            workflow_files = glob.glob(search_path)
+
+            if not workflow_files:
+                logger.info(
+                    "No workflow files found for management agent context"
+                )
+                return
+
+            # Sort by modification time (most recent first)
+            workflow_files.sort(
+                key=lambda x: os.path.getmtime(x), reverse=True
+            )
+
+            # Load workflows for coordinator agent (up to 5 most recent)
+            coordinator_loaded = 0
+            for file_path in workflow_files[:max_files_to_load]:
+                try:
+                    filename = os.path.basename(file_path).replace('.md', '')
+                    session_dir = os.path.dirname(file_path)
+                    session_id = os.path.basename(session_dir)
+
+                    # Use shared context utility with specific session
+                    temp_utility = ContextUtility.get_workforce_shared(
+                        session_id
+                    )
+
+                    status = temp_utility.load_markdown_context_to_memory(
+                        self.coordinator_agent, filename
+                    )
+                    if "Context appended" in status:
+                        coordinator_loaded += 1
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to load coordinator workflow {file_path}: {e}"
+                    )
+
+            # Load workflows for task agent (up to 3 most recent)
+            task_agent_loaded = 0
+            for file_path in workflow_files[:max_files_to_load]:
+                try:
+                    filename = os.path.basename(file_path).replace('.md', '')
+                    session_dir = os.path.dirname(file_path)
+                    session_id = os.path.basename(session_dir)
+
+                    # Use shared context utility with specific session
+                    temp_utility = ContextUtility.get_workforce_shared(
+                        session_id
+                    )
+
+                    status = temp_utility.load_markdown_context_to_memory(
+                        self.task_agent, filename
+                    )
+                    if "Context appended" in status:
+                        task_agent_loaded += 1
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to load task agent workflow {file_path}: {e}"
+                    )
+
+            logger.info(
+                f"Loaded {coordinator_loaded} workflows for coordinator, "
+                f"{task_agent_loaded} workflows for task agent"
+            )
+
+        except Exception as e:
+            logger.error(f"Error loading management agent workflows: {e}")
+
     @check_if_running(False)
     def set_channel(self, channel: TaskChannel) -> None:
         r"""Set the channel for the node and all the child nodes under it."""
@@ -2066,8 +2710,40 @@ class Workforce(BaseNode):
             TaskAssignResult: Assignment result containing task assignments
                 with their dependencies.
         """
+        # Wait for workers to be ready before assignment with exponential
+        # backoff
+        worker_readiness_timeout = 2.0  # Maximum wait time in seconds
+        worker_readiness_check_interval = 0.05  # Initial check interval
+        start_time = time.time()
+        check_interval = worker_readiness_check_interval
+        backoff_multiplier = 1.5  # Exponential backoff factor
+        max_interval = 0.5  # Cap the maximum interval
+
+        while (time.time() - start_time) < worker_readiness_timeout:
+            valid_worker_ids = self._get_valid_worker_ids()
+            if len(valid_worker_ids) > 0:
+                elapsed = time.time() - start_time
+                logger.debug(
+                    f"Workers ready after {elapsed:.3f}s: "
+                    f"{len(valid_worker_ids)} workers available"
+                )
+                break
+
+            await asyncio.sleep(check_interval)
+            # Exponential backoff with cap
+            check_interval = min(
+                check_interval * backoff_multiplier, max_interval
+            )
+        else:
+            # Timeout reached, log warning but continue
+            logger.warning(
+                f"Worker readiness timeout after "
+                f"{worker_readiness_timeout}s, "
+                f"proceeding with {len(self._children)} children"
+            )
+            valid_worker_ids = self._get_valid_worker_ids()
+
         self.coordinator_agent.reset()
-        valid_worker_ids = self._get_valid_worker_ids()
 
         logger.debug(
             f"Sending batch assignment request to coordinator "
@@ -2101,7 +2777,24 @@ class Workforce(BaseNode):
                 invalid_assignments, tasks, valid_worker_ids
             )
         )
-        all_assignments = valid_assignments + retry_and_fallback_assignments
+
+        # Combine assignments with deduplication, prioritizing retry results
+        assignment_map = {a.task_id: a for a in valid_assignments}
+        assignment_map.update(
+            {a.task_id: a for a in retry_and_fallback_assignments}
+        )
+        all_assignments = list(assignment_map.values())
+
+        # Log any overwrites for debugging
+        valid_task_ids = {a.task_id for a in valid_assignments}
+        retry_task_ids = {a.task_id for a in retry_and_fallback_assignments}
+        overlap_task_ids = valid_task_ids & retry_task_ids
+
+        if overlap_task_ids:
+            logger.warning(
+                f"Retry assignments overrode {len(overlap_task_ids)} "
+                f"valid assignments for tasks: {sorted(overlap_task_ids)}"
+            )
 
         # Update Task.dependencies for all final assignments
         self._update_task_dependencies_from_assignments(all_assignments, tasks)
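
The deduplication leans on dict semantics: keying by `task_id` and then `update()`-ing with the retry results gives last-write-wins, so a retry assignment replaces an earlier valid one for the same task. The idiom in miniature, with plain strings standing in for `TaskAssignment` objects:

    valid = {"t1": "worker_a", "t2": "worker_b"}
    retried = {"t2": "worker_c"}

    merged = dict(valid)
    merged.update(retried)  # retry results take priority on overlap
    assert merged == {"t1": "worker_a", "t2": "worker_c"}
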
@@ -2179,8 +2872,7 @@ class Workforce(BaseNode):
                     "worker creation"
                 )
                 new_node_conf = WorkerConf(
-                    description=f"Fallback worker for task: "
-                    f"{task.content}",
+                    description=f"Fallback worker for task: {task.content}",
                     role="General Assistant",
                     sys_msg="You are a general assistant that can help "
                     "with various tasks.",
@@ -2190,7 +2882,7 @@ class Workforce(BaseNode):
                 response.msg.content,
                 schema=WorkerConf,
                 fallback_values={
-                    "description": f"Worker for task: " f"{task.content}",
+                    "description": f"Worker for task: {task.content}",
                     "role": "Task Specialist",
                     "sys_msg": f"You are a specialist for: {task.content}",
                 },
@@ -2218,8 +2910,7 @@ class Workforce(BaseNode):
                 )
                 # Create a fallback worker configuration
                 new_node_conf = WorkerConf(
-                    description=f"Fallback worker for "
-                    f"task: {task.content}",
+                    description=f"Fallback worker for task: {task.content}",
                     role="General Assistant",
                     sys_msg="You are a general assistant that can help "
                     "with various tasks.",
@@ -2352,7 +3043,15 @@ class Workforce(BaseNode):
         tasks_to_assign = [
             task
             for task in self._pending_tasks
-            if task.id not in self._task_dependencies
+            if (
+                task.id not in self._task_dependencies
+                and (
+                    task.additional_info is None
+                    or not task.additional_info.get(
+                        "_needs_decomposition", False
+                    )
+                )
+            )
         ]
         if tasks_to_assign:
             logger.debug(
@@ -2388,21 +3087,141 @@ class Workforce(BaseNode):
2388
3087
  for task in self._pending_tasks:
2389
3088
  # A task must be assigned to be considered for posting
2390
3089
  if task.id in self._task_dependencies:
3090
+ # Skip if task has already been posted to prevent duplicates
3091
+ try:
3092
+ task_from_channel = await self._channel.get_task_by_id(
3093
+ task.id
3094
+ )
3095
+ # Check if task is already assigned to a worker
3096
+ if (
3097
+ task_from_channel
3098
+ and task_from_channel.assigned_worker_id
3099
+ ):
3100
+ logger.debug(
3101
+ f"Task {task.id} already assigned to "
3102
+ f"{task_from_channel.assigned_worker_id}, "
3103
+ f"skipping to prevent duplicate"
3104
+ )
3105
+ continue
3106
+ except Exception as e:
3107
+ logger.info(
3108
+ f"Task {task.id} non existent in channel. "
3109
+ f"Assigning task: {e}"
3110
+ )
  dependencies = self._task_dependencies[task.id]
- # Check if all dependencies for this task are in the completed
- # set and their state is DONE
- if all(
- dep_id in completed_tasks_info
- and completed_tasks_info[dep_id] == TaskState.DONE
- for dep_id in dependencies
- ):
- assignee_id = self._assignees[task.id]
- logger.debug(
- f"Posting task {task.id} to assignee {assignee_id}. "
- f"Dependencies met."
+
+ # Check if all dependencies are in completed state
+ all_deps_completed = all(
+ dep_id in completed_tasks_info for dep_id in dependencies
+ )
+
+ # Only proceed with dependency checks if all deps are completed
+ if all_deps_completed:
+ # Check if all dependencies succeeded (state is DONE)
+ all_deps_done = all(
+ completed_tasks_info[dep_id] == TaskState.DONE
+ for dep_id in dependencies
+ )
+
+ # Check if any dependency failed
+ any_dep_failed = any(
+ completed_tasks_info[dep_id] == TaskState.FAILED
+ for dep_id in dependencies
  )
- await self._post_task(task, assignee_id)
- posted_tasks.append(task)
+
+ if all_deps_done:
+ # All dependencies completed successfully - post the
+ # task
+ assignee_id = self._assignees[task.id]
+ logger.debug(
+ f"Posting task {task.id} to "
+ f"assignee {assignee_id}. "
+ f"Dependencies met."
+ )
+ await self._post_task(task, assignee_id)
+ posted_tasks.append(task)
+ elif any_dep_failed:
+ # Check if any failed dependencies can still be retried
+ failed_deps = [
+ dep_id
+ for dep_id in dependencies
+ if completed_tasks_info[dep_id] == TaskState.FAILED
+ ]
+
+ # Check if any failed dependency is still retryable
+ failed_tasks_with_retry_potential = []
+ permanently_failed_deps = []
+
+ for dep_id in failed_deps:
+ # Find the failed dependency task
+ failed_task = next(
+ (
+ t
+ for t in self._completed_tasks
+ if t.id == dep_id
+ ),
+ None,
+ )
+ if (
+ failed_task
+ and failed_task.failure_count
+ < MAX_TASK_RETRIES
+ ):
+ failed_tasks_with_retry_potential.append(
+ dep_id
+ )
+ else:
+ permanently_failed_deps.append(dep_id)
+
+ # Only fail the task if ALL dependencies are
+ # permanently failed
+ if (
+ permanently_failed_deps
+ and not failed_tasks_with_retry_potential
+ ):
+ logger.error(
+ f"Task {task.id} cannot proceed: dependencies "
+ f"{permanently_failed_deps} have "
+ f"permanently failed. "
+ f"Marking task as failed."
+ )
+ task.state = TaskState.FAILED
+ task.result = (
+ f"Task failed due to permanently "
+ f"failed dependencies: "
+ f"{permanently_failed_deps}"
+ )
+
+ # Log the failure to metrics
+ if self.metrics_logger:
+ self.metrics_logger.log_task_failed(
+ task_id=task.id,
+ worker_id=task.assigned_worker_id
+ or "unknown",
+ error_message=task.result,
+ metadata={
+ 'failure_reason': (
+ 'dependency_failure'
+ ),
+ 'failed_dependencies': (
+ permanently_failed_deps
+ ),
+ },
+ )
+
+ self._completed_tasks.append(task)
+ self._cleanup_task_tracking(task.id)
+ posted_tasks.append(task)  # Remove from pending
+ else:
+ # Some dependencies may still be retried, keep
+ # task pending
+ logger.debug(
+ f"Task {task.id} waiting: dependencies "
+ f"{failed_tasks_with_retry_potential} "
+ f"failed but may be retried "
+ f"(attempt < {MAX_TASK_RETRIES})"
+ )
+ # else: Not all dependencies completed yet, skip this task
 
  # Step 3: Remove the posted tasks from the pending list
  for task in posted_tasks:
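Read as a whole, the posting logic above implements a three-way gate per pending task: post when every dependency finished in DONE, fail only when at least one dependency is FAILED and none of the failed ones is still below the retry cap, and otherwise keep waiting. A condensed, self-contained sketch of that decision (TaskState and MAX_TASK_RETRIES mirror the names used in the hunk; the gate function and dict shapes are illustrative, not the library's API):

    from enum import Enum

    MAX_TASK_RETRIES = 3

    class TaskState(Enum):
        DONE = "done"
        FAILED = "failed"

    def gate(deps, completed, failure_counts):
        """Return 'post', 'fail', or 'wait' for a task with these deps."""
        if not all(d in completed for d in deps):
            return "wait"  # some dependency has not finished at all
        if all(completed[d] is TaskState.DONE for d in deps):
            return "post"  # every dependency succeeded
        failed = [d for d in deps if completed[d] is TaskState.FAILED]
        if not failed:
            return "wait"  # finished, but in neither DONE nor FAILED state
        # A failed dependency below the retry cap may still be retried
        retryable = [d for d in failed if failure_counts[d] < MAX_TASK_RETRIES]
        return "wait" if retryable else "fail"

    completed = {"a": TaskState.DONE, "b": TaskState.FAILED}
    print(gate(["a", "b"], completed, {"a": 0, "b": 3}))  # -> fail
    print(gate(["a", "b"], completed, {"a": 0, "b": 1}))  # -> wait
    print(gate(["a"], completed, {"a": 0}))               # -> post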
@@ -2414,21 +3233,30 @@ class Workforce(BaseNode):
  pass
 
  async def _handle_failed_task(self, task: Task) -> bool:
+ r"""Handle a task that failed during execution.
+
+ Args:
+ task (Task): The failed task.
+
+ Returns:
+ bool: True if the workforce should halt, False otherwise.
+ """
  task.failure_count += 1
 
  # Determine detailed failure information
- # Use the actual error/result stored in task.result
  failure_reason = task.result or "Unknown error"
-
- # Add context about the worker and task
  worker_id = task.assigned_worker_id or "unknown"
- worker_info = f" (assigned to worker: {worker_id})"
-
- detailed_error = f"{failure_reason}{worker_info}"
+ detailed_error = f"{failure_reason} (assigned to worker: {worker_id})"
 
  logger.error(
  f"Task {task.id} failed (attempt "
- f"{task.failure_count}/3): {detailed_error}"
+ f"{task.failure_count}/{MAX_TASK_RETRIES}): {detailed_error}"
+ )
+
+ print(
+ f"{Fore.RED}❌ Task {task.id} failed "
+ f"(attempt {task.failure_count}/{MAX_TASK_RETRIES}): "
+ f"{failure_reason}{Fore.RESET}"
  )
 
  if self.metrics_logger:
@@ -2443,24 +3271,20 @@ class Workforce(BaseNode):
  },
  )
 
- # Check for immediate halt conditions - return immediately if we
- # should halt
+ # Check for immediate halt conditions
  if task.failure_count >= MAX_TASK_RETRIES:
  logger.error(
  f"Task {task.id} has exceeded maximum retry attempts "
- f"({MAX_TASK_RETRIES}). Final failure "
- f"reason: {detailed_error}. "
+ f"({MAX_TASK_RETRIES}). Final failure reason: "
+ f"{detailed_error}. "
  f"Task content: '{task.content}'"
  )
  self._cleanup_task_tracking(task.id)
- # Mark task as completed for dependency tracking before halting
  self._completed_tasks.append(task)
  if task.id in self._assignees:
  await self._channel.archive_task(task.id)
  return True
 
- # If too many tasks are failing rapidly, also halt to prevent infinite
- # loops
  if len(self._pending_tasks) > MAX_PENDING_TASKS_LIMIT:
  logger.error(
  f"Too many pending tasks ({len(self._pending_tasks)} > "
@@ -2468,18 +3292,24 @@ class Workforce(BaseNode):
  f"explosion. Last failed task: {task.id}"
  )
  self._cleanup_task_tracking(task.id)
- # Mark task as completed for dependency tracking before halting
  self._completed_tasks.append(task)
  if task.id in self._assignees:
  await self._channel.archive_task(task.id)
  return True
 
  # Use intelligent failure analysis to decide recovery strategy
- recovery_decision = self._analyze_failure(task, detailed_error)
+ recovery_decision = self._analyze_task(
+ task, for_failure=True, error_message=detailed_error
+ )
 
+ strategy_str = (
+ recovery_decision.recovery_strategy.value
+ if recovery_decision.recovery_strategy
+ else "none"
+ )
  logger.info(
  f"Task {task.id} failure "
- f"analysis: {recovery_decision.strategy.value} - "
+ f"analysis: {strategy_str} - "
  f"{recovery_decision.reasoning}"
  )
 
@@ -2488,105 +3318,23 @@ class Workforce(BaseNode):
  await self._channel.archive_task(task.id)
  self._cleanup_task_tracking(task.id)
 
+ # Apply recovery strategy
  try:
- if recovery_decision.strategy == RecoveryStrategy.RETRY:
- # Simply retry the task by reposting it
- if task.id in self._assignees:
- assignee_id = self._assignees[task.id]
- await self._post_task(task, assignee_id)
- action_taken = f"retried with same worker {assignee_id}"
- else:
- # Find a new assignee and retry
- batch_result = await self._find_assignee([task])
- assignment = batch_result.assignments[0]
- self._assignees[task.id] = assignment.assignee_id
- await self._post_task(task, assignment.assignee_id)
- action_taken = (
- f"retried with new worker {assignment.assignee_id}"
- )
-
- elif recovery_decision.strategy == RecoveryStrategy.REPLAN:
- # Modify the task content and retry
- if recovery_decision.modified_task_content:
- task.content = recovery_decision.modified_task_content
- logger.info(f"Task {task.id} content modified for replan")
-
- # Repost the modified task
- if task.id in self._assignees:
- assignee_id = self._assignees[task.id]
- await self._post_task(task, assignee_id)
- action_taken = (
- f"replanned and retried with worker {assignee_id}"
- )
- else:
- # Find a new assignee for the replanned task
- batch_result = await self._find_assignee([task])
- assignment = batch_result.assignments[0]
- self._assignees[task.id] = assignment.assignee_id
- await self._post_task(task, assignment.assignee_id)
- action_taken = (
- f"replanned and assigned to "
- f"worker {assignment.assignee_id}"
- )
-
- elif recovery_decision.strategy == RecoveryStrategy.DECOMPOSE:
- # Decompose the task into subtasks
- subtasks_result = self._decompose_task(task)
-
- # Handle both streaming and non-streaming results
- if isinstance(subtasks_result, Generator):
- # This is a generator (streaming mode)
- subtasks = []
- for new_tasks in subtasks_result:
- subtasks.extend(new_tasks)
- else:
- # This is a regular list (non-streaming mode)
- subtasks = subtasks_result
- if self.metrics_logger and subtasks:
- self.metrics_logger.log_task_decomposed(
- parent_task_id=task.id,
- subtask_ids=[st.id for st in subtasks],
- )
- for subtask in subtasks:
- self.metrics_logger.log_task_created(
- task_id=subtask.id,
- description=subtask.content,
- parent_task_id=task.id,
- task_type=subtask.type,
- metadata=subtask.additional_info,
- )
- # Insert packets at the head of the queue
- self._pending_tasks.extendleft(reversed(subtasks))
-
- await self._post_ready_tasks()
- action_taken = f"decomposed into {len(subtasks)} subtasks"
-
- logger.debug(
- f"Task {task.id} failed and was {action_taken}. "
- f"Dependencies updated for subtasks."
- )
-
- # Sync shared memory after task decomposition
- if self.share_memory:
- logger.info(
- f"Syncing shared memory after "
- f"task {task.id} decomposition"
- )
- self._sync_shared_memory()
+ is_decompose = await self._apply_recovery_strategy(
+ task, recovery_decision
+ )
 
- # Check if any pending tasks are now ready to execute
- await self._post_ready_tasks()
+ # For decompose, we handle it specially
+ if is_decompose:
+ # Task was decomposed, add to completed tasks
+ self._completed_tasks.append(task)
  return False
 
- elif recovery_decision.strategy == RecoveryStrategy.CREATE_WORKER:
- assignee = await self._create_worker_node_for_task(task)
- await self._post_task(task, assignee.node_id)
- action_taken = (
- f"created new worker {assignee.node_id} and assigned "
- f"task {task.id} to it"
- )
  except Exception as e:
- logger.error(f"Recovery strategy failed for task {task.id}: {e}")
+ logger.error(
+ f"Recovery strategy failed for task {task.id}: {e}",
+ exc_info=True,
+ )
  # If max retries reached, halt the workforce
  if task.failure_count >= MAX_TASK_RETRIES:
  self._completed_tasks.append(task)
@@ -2594,18 +3342,17 @@ class Workforce(BaseNode):
  self._completed_tasks.append(task)
  return False
 
+ # Task is being retried - don't add to completed tasks
+ # It will be added when it actually completes or permanently fails
  logger.debug(
- f"Task {task.id} failed and was {action_taken}. "
- f"Updating dependency state."
+ f"Task {task.id} is being retried (strategy: "
+ f"{recovery_decision.recovery_strategy}). "
+ f"Not adding to completed tasks until final outcome."
  )
- # Mark task as completed for dependency tracking
- self._completed_tasks.append(task)
 
- # Sync shared memory after task completion to share knowledge
+ # Sync shared memory after task recovery
  if self.share_memory:
- logger.info(
- f"Syncing shared memory after task {task.id} completion"
- )
+ logger.info(f"Syncing shared memory after task {task.id} recovery")
  self._sync_shared_memory()
 
  # Check if any pending tasks are now ready to execute
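The long removed block in this hunk is where the four recovery branches used to live; the new code folds them into a single `_apply_recovery_strategy` call whose boolean return flags the decompose case. The method's actual body is not part of this diff, so the following is only a reconstruction of its shape from the removed branches, with stand-in names:

    # Illustrative sketch only; every name except the RecoveryStrategy
    # members is a hypothetical stand-in for the real Workforce internals.
    import asyncio
    from enum import Enum

    class RecoveryStrategy(Enum):
        RETRY = "retry"
        REPLAN = "replan"
        DECOMPOSE = "decompose"
        CREATE_WORKER = "create_worker"

    async def apply_recovery(task, strategy, repost, decompose):
        """Return True when the task was decomposed (handled specially)."""
        if strategy is RecoveryStrategy.DECOMPOSE:
            for subtask in decompose(task):
                await repost(subtask)  # queue each subtask for execution
            return True
        # RETRY, REPLAN, and CREATE_WORKER all end in a repost: REPLAN
        # rewrites the task content first, and CREATE_WORKER provisions
        # a dedicated worker before reposting.
        await repost(task)
        return False

    async def main():
        posted = []

        async def repost(t):
            posted.append(t)

        decomposed = await apply_recovery(
            "t1",
            RecoveryStrategy.DECOMPOSE,
            repost,
            decompose=lambda t: [f"{t}.1", f"{t}.2"],
        )
        print(decomposed, posted)  # -> True ['t1.1', 't1.2']

    asyncio.run(main())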
@@ -2810,6 +3557,124 @@ class Workforce(BaseNode):
  # Use logger.info or print, consistent with existing style
  logger.info(f"Workforce logs dumped to {file_path}")
 
+ async def _handle_skip_task(self) -> bool:
+ r"""Handle a skip request by marking pending and in-flight tasks
+ as completed.
+
+ Returns:
+ bool: True if the workforce should stop (no pending tasks
+ remain), False to continue.
+ """
+ logger.info("Skip requested; completing pending and in-flight tasks.")
+
+ # Mark all pending tasks as completed instead of just clearing them
+ pending_tasks_to_complete = list(self._pending_tasks)
+ if pending_tasks_to_complete:
+ logger.info(
+ f"Marking {len(pending_tasks_to_complete)} pending tasks "
+ f"as completed."
+ )
+ for task in pending_tasks_to_complete:
+ # Don't complete tasks that still need decomposition
+ if task.additional_info and task.additional_info.get(
+ '_needs_decomposition', False
+ ):
+ continue
+ # Set task state to DONE and add a completion message
+ task.state = TaskState.DONE
+ task.result = "Task marked as completed due to skip request"
+
+ # Use the existing completed-task handler
+ await self._handle_completed_task(task)
+
+ # Handle in-flight tasks if they exist
+ if self._in_flight_tasks > 0:
+ logger.info(
+ f"Found {self._in_flight_tasks} in-flight tasks. "
+ f"Retrieving and completing them."
+ )
+ try:
+ # Get all in-flight tasks for this publisher from the channel
+ in_flight_tasks = await self._channel.get_in_flight_tasks(
+ self.node_id
+ )
+ logger.info(
+ f"Retrieved {len(in_flight_tasks)} in-flight "
+ f"tasks from channel."
+ )
+
+ for task in in_flight_tasks:
+ # Set task state to DONE and add a completion message
+ task.state = TaskState.DONE
+ task.result = (
+ "Task marked as completed due to skip request"
+ )
+
+ # Remove the task from the channel to avoid hanging
+ await self._channel.remove_task(task.id)
+
+ # Decrement in-flight counter
+ self._decrement_in_flight_tasks(
+ task.id, "skip request - removed from channel"
+ )
+
+ # Handle as completed task to update dependencies
+ await self._handle_completed_task(task)
+
+ logger.info(
+ f"Completed in-flight task {task.id} due "
+ f"to skip request."
+ )
+
+ except Exception as e:
+ logger.error(
+ f"Error handling in-flight tasks during skip: {e}",
+ exc_info=True,
+ )
+ # Reset in-flight counter to prevent hanging
+ self._in_flight_tasks = 0
+
+ # Check if there are any pending tasks (including those needing
+ # decomposition)
+ if self._pending_tasks:
+ # Check if the first pending task needs decomposition
+ next_task = self._pending_tasks[0]
+ if next_task.additional_info and next_task.additional_info.get(
+ '_needs_decomposition'
+ ):
+ logger.info(
+ f"Decomposing main task {next_task.id} after skip request."
+ )
+ try:
+ # Remove the decomposition flag to avoid re-decomposition
+ next_task.additional_info['_needs_decomposition'] = False
+
+ # Decompose the task and append subtasks to _pending_tasks
+ await self.handle_decompose_append_task(
+ next_task, reset=False
+ )
+
+ # Mark the main task as completed and remove from pending
+ await self._handle_completed_task(next_task)
+ logger.info(
+ f"Main task {next_task.id} decomposed after "
+ f"skip request"
+ )
+ except Exception as e:
+ logger.error(
+ f"Error decomposing main task {next_task.id} "
+ f"after skip: {e}",
+ exc_info=True,
+ )
+
+ logger.info("Pending tasks available after skip, continuing.")
+ await self._post_ready_tasks()
+ return False  # Continue processing
+ else:
+ # No pending tasks remain, so treat the skip as a stop
+ logger.info("No pending tasks available; stopping.")
+ return True  # Stop processing
+
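In effect, a skip completes everything it safely can: each pending task without the decomposition flag and every in-flight task is forced to DONE, and the method signals a stop only when nothing pending remains. A tiny sketch of that contract (plain dicts stand in for camel Task objects; this is not the method itself):

    def handle_skip(pending):
        completed = []
        for task in list(pending):
            if task.get("_needs_decomposition"):
                continue  # left pending; decomposed on the next loop pass
            task["state"] = "DONE"
            task["result"] = "Task marked as completed due to skip request"
            completed.append(task)
            pending.remove(task)
        should_stop = not pending  # stop only when nothing is left to run
        return completed, should_stop

    pending = [{"id": "t1"}, {"id": "t2", "_needs_decomposition": True}]
    done, stop = handle_skip(pending)
    print(len(done), stop)  # -> 1 False (t2 still awaits decomposition)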
  @check_if_running(False)
  async def _listen_to_channel(self) -> None:
  r"""Continuously listen to the channel, post task to the channel and
@@ -2838,6 +3703,75 @@ class Workforce(BaseNode):
  logger.info("Stop requested, breaking execution loop.")
  break
 
+ # Check for skip request after potential pause
+ if self._skip_requested:
+ should_stop = await self._handle_skip_task()
+ if should_stop:
+ self._stop_requested = True
+ break
+
+ # Reset skip flag
+ self._skip_requested = False
+ continue
+
+ # Check if we should decompose a main task
+ # Only decompose when no tasks are in flight and pending queue
+ # is empty
+ if not self._pending_tasks and self._in_flight_tasks == 0:
+ # All tasks completed, will exit loop
+ break
+
+ # Check if the first pending task needs decomposition
+ # This happens when add_task(as_subtask=False) was called
+ if self._pending_tasks and self._in_flight_tasks == 0:
+ next_task = self._pending_tasks[0]
+ if (
+ next_task.additional_info
+ and next_task.additional_info.get(
+ '_needs_decomposition'
+ )
+ ):
+ logger.info(f"Decomposing main task: {next_task.id}")
+ try:
+ # Remove the decomposition flag to avoid
+ # re-decomposition
+ next_task.additional_info[
+ '_needs_decomposition'
+ ] = False
+
+ # Decompose the task and append subtasks to
+ # _pending_tasks
+ await self.handle_decompose_append_task(
+ next_task, reset=False
+ )
+
+ # Mark the main task as completed (decomposition
+ # successful) and remove it from pending tasks
+ await self._handle_completed_task(next_task)
+ logger.info(
+ f"Main task {next_task.id} decomposed and "
+ f"ready for processing"
+ )
+ except Exception as e:
+ logger.error(
+ f"Error decomposing main task {next_task.id}: "
+ f"{e}",
+ exc_info=True,
+ )
+ # Return the task to the queue for a later retry
+ # if decomposition failed
+ if not self._pending_tasks:
+ self._pending_tasks.appendleft(next_task)
+ else:
+ logger.warning(
+ "Pending tasks exist after decomposition "
+ "error."
+ )
+
+ # Immediately assign and post the newly queued subtasks
+ await self._post_ready_tasks()
+ continue
+
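Both this loop and the skip handler rely on the same `_needs_decomposition` lifecycle: a top-level task enters the queue flagged, the loop clears the flag before decomposing (so a failure cannot re-trigger it), and the parent is completed once its subtasks are queued. A minimal sketch of that lifecycle (the flag name comes from the diff; everything else stands in for the real Task and queue types):

    from collections import deque

    def decompose(task):
        # Stand-in for handle_decompose_append_task(next_task, reset=False)
        return [f"{task['id']}.{i}" for i in (1, 2)]

    pending = deque(
        [{"id": "main", "additional_info": {"_needs_decomposition": True}}]
    )

    next_task = pending[0]
    if next_task["additional_info"].get("_needs_decomposition"):
        # Clear the flag first so a failure cannot re-trigger decomposition
        next_task["additional_info"]["_needs_decomposition"] = False
        subtasks = decompose(next_task)
        pending.popleft()  # the parent is complete once it has been split
        pending.extend({"id": s, "additional_info": {}} for s in subtasks)

    print([t["id"] for t in pending])  # -> ['main.1', 'main.2']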
  # Save snapshot before processing next task
  if self._pending_tasks:
  current_task = self._pending_tasks[0]
@@ -2928,11 +3862,88 @@ class Workforce(BaseNode):
  )
  continue
  else:
- print(
- f"{Fore.CYAN}🎯 Task {returned_task.id} completed "
- f"successfully.{Fore.RESET}"
+ quality_eval = self._analyze_task(
+ returned_task, for_failure=False
  )
- await self._handle_completed_task(returned_task)
+
+ if not quality_eval.quality_sufficient:
+ logger.info(
+ f"Task {returned_task.id} quality check: "
+ f"score={quality_eval.quality_score}, "
+ f"issues={quality_eval.issues}, "
+ f"strategy={quality_eval.recovery_strategy}"
+ )
+
+ # Check retry limit before attempting recovery
+ if returned_task.failure_count >= 2:
+ print(
+ f"{Fore.YELLOW}Task {returned_task.id} "
+ f"completed with low quality score: "
+ f"{quality_eval.quality_score} "
+ f"(retry limit reached){Fore.RESET}"
+ )
+ await self._handle_completed_task(
+ returned_task
+ )
+ continue
+
+ # Print visual feedback for quality-failed tasks
+ # with recovery strategy
+ recovery_action = (
+ quality_eval.recovery_strategy.value
+ if quality_eval.recovery_strategy
+ else ""
+ )
+ print(
+ f"{Fore.YELLOW}⚠️ Task {returned_task.id} "
+ f"failed quality check (score: "
+ f"{quality_eval.quality_score}). "
+ f"Issues: {', '.join(quality_eval.issues)}. "
+ f"Recovery: {recovery_action}{Fore.RESET}"
+ )
+
+ # Mark as failed for recovery
+ returned_task.failure_count += 1
+ returned_task.state = TaskState.FAILED
+ returned_task.result = (
+ f"Quality insufficient (score: "
+ f"{quality_eval.quality_score}). "
+ f"Issues: {', '.join(quality_eval.issues)}"
+ )
+
+ # Clean up tracking before attempting recovery
+ if returned_task.id in self._assignees:
+ await self._channel.archive_task(
+ returned_task.id
+ )
+ self._cleanup_task_tracking(returned_task.id)
+
+ # Apply LLM-recommended recovery strategy
+ try:
+ is_decompose = (
+ await self._apply_recovery_strategy(
+ returned_task, quality_eval
+ )
+ )
+
+ # For decompose, cleanup happens in the method
+ if is_decompose:
+ continue
+
+ except Exception as e:
+ logger.error(
+ f"Error handling quality-failed task "
+ f"{returned_task.id}: {e}",
+ exc_info=True,
+ )
+ continue
+ else:
+ print(
+ f"{Fore.CYAN}Task {returned_task.id} "
+ f"completed successfully (quality score: "
+ f"{quality_eval.quality_score}).{Fore.RESET}"
+ )
+ await self._handle_completed_task(returned_task)
  elif returned_task.state == TaskState.FAILED:
  try:
  halt = await self._handle_failed_task(returned_task)