camel-ai 0.2.76a14__py3-none-any.whl → 0.2.78__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of camel-ai might be problematic. Click here for more details.
- camel/__init__.py +1 -1
- camel/agents/chat_agent.py +278 -154
- camel/data_collectors/alpaca_collector.py +15 -6
- camel/societies/workforce/prompts.py +131 -50
- camel/societies/workforce/single_agent_worker.py +390 -11
- camel/societies/workforce/structured_output_handler.py +30 -18
- camel/societies/workforce/utils.py +105 -12
- camel/societies/workforce/workforce.py +818 -224
- camel/societies/workforce/workforce_logger.py +24 -5
- camel/toolkits/context_summarizer_toolkit.py +2 -2
- camel/toolkits/excel_toolkit.py +1 -1
- camel/toolkits/file_toolkit.py +3 -2
- camel/toolkits/terminal_toolkit/utils.py +106 -154
- camel/types/enums.py +4 -4
- camel/utils/context_utils.py +379 -22
- {camel_ai-0.2.76a14.dist-info → camel_ai-0.2.78.dist-info}/METADATA +10 -1
- {camel_ai-0.2.76a14.dist-info → camel_ai-0.2.78.dist-info}/RECORD +19 -19
- {camel_ai-0.2.76a14.dist-info → camel_ai-0.2.78.dist-info}/WHEEL +0 -0
- {camel_ai-0.2.76a14.dist-info → camel_ai-0.2.78.dist-info}/licenses/LICENSE +0 -0
|
@@ -22,6 +22,7 @@ import uuid
|
|
|
22
22
|
from collections import deque
|
|
23
23
|
from enum import Enum
|
|
24
24
|
from typing import (
|
|
25
|
+
TYPE_CHECKING,
|
|
25
26
|
Any,
|
|
26
27
|
Callable,
|
|
27
28
|
Coroutine,
|
|
@@ -36,6 +37,9 @@ from typing import (
|
|
|
36
37
|
cast,
|
|
37
38
|
)
|
|
38
39
|
|
|
40
|
+
if TYPE_CHECKING:
|
|
41
|
+
from camel.utils.context_utils import ContextUtility
|
|
42
|
+
|
|
39
43
|
from colorama import Fore
|
|
40
44
|
|
|
41
45
|
from camel.agents import ChatAgent
|
|
@@ -46,19 +50,23 @@ from camel.societies.workforce.base import BaseNode
|
|
|
46
50
|
from camel.societies.workforce.prompts import (
|
|
47
51
|
ASSIGN_TASK_PROMPT,
|
|
48
52
|
CREATE_NODE_PROMPT,
|
|
49
|
-
|
|
53
|
+
FAILURE_ANALYSIS_RESPONSE_FORMAT,
|
|
54
|
+
QUALITY_EVALUATION_RESPONSE_FORMAT,
|
|
55
|
+
TASK_AGENT_SYSTEM_MESSAGE,
|
|
56
|
+
TASK_ANALYSIS_PROMPT,
|
|
50
57
|
TASK_DECOMPOSE_PROMPT,
|
|
51
58
|
)
|
|
52
59
|
from camel.societies.workforce.role_playing_worker import RolePlayingWorker
|
|
53
|
-
from camel.societies.workforce.single_agent_worker import
|
|
60
|
+
from camel.societies.workforce.single_agent_worker import (
|
|
61
|
+
SingleAgentWorker,
|
|
62
|
+
)
|
|
54
63
|
from camel.societies.workforce.structured_output_handler import (
|
|
55
64
|
StructuredOutputHandler,
|
|
56
65
|
)
|
|
57
66
|
from camel.societies.workforce.task_channel import TaskChannel
|
|
58
67
|
from camel.societies.workforce.utils import (
|
|
59
|
-
FailureContext,
|
|
60
|
-
RecoveryDecision,
|
|
61
68
|
RecoveryStrategy,
|
|
69
|
+
TaskAnalysisResult,
|
|
62
70
|
TaskAssignment,
|
|
63
71
|
TaskAssignResult,
|
|
64
72
|
WorkerConf,
|
|
@@ -324,8 +332,7 @@ class Workforce(BaseNode):
|
|
|
324
332
|
if coordinator_agent.system_message is not None:
|
|
325
333
|
user_sys_msg_content = coordinator_agent.system_message.content
|
|
326
334
|
combined_content = (
|
|
327
|
-
f"{user_sys_msg_content}\n\n"
|
|
328
|
-
f"{coord_agent_sys_msg.content}"
|
|
335
|
+
f"{user_sys_msg_content}\n\n{coord_agent_sys_msg.content}"
|
|
329
336
|
)
|
|
330
337
|
combined_sys_msg = BaseMessage.make_assistant_message(
|
|
331
338
|
role_name=coordinator_agent.system_message.role_name,
|
|
@@ -362,7 +369,7 @@ class Workforce(BaseNode):
|
|
|
362
369
|
# Set up task agent with default system message and required tools
|
|
363
370
|
task_sys_msg = BaseMessage.make_assistant_message(
|
|
364
371
|
role_name="Task Planner",
|
|
365
|
-
content=
|
|
372
|
+
content=TASK_AGENT_SYSTEM_MESSAGE,
|
|
366
373
|
)
|
|
367
374
|
task_planning_tools = TaskPlanningToolkit().get_tools()
|
|
368
375
|
|
|
@@ -387,8 +394,7 @@ class Workforce(BaseNode):
|
|
|
387
394
|
if task_agent.system_message is not None:
|
|
388
395
|
user_task_sys_msg_content = task_agent.system_message.content
|
|
389
396
|
combined_task_content = (
|
|
390
|
-
f"{user_task_sys_msg_content}\n\n"
|
|
391
|
-
f"{task_sys_msg.content}"
|
|
397
|
+
f"{user_task_sys_msg_content}\n\n{task_sys_msg.content}"
|
|
392
398
|
)
|
|
393
399
|
combined_task_sys_msg = BaseMessage.make_assistant_message(
|
|
394
400
|
role_name=task_agent.system_message.role_name,
|
|
@@ -450,6 +456,30 @@ class Workforce(BaseNode):
|
|
|
450
456
|
"better context continuity during task handoffs."
|
|
451
457
|
)
|
|
452
458
|
|
|
459
|
+
# Shared context utility for workflow management (created lazily)
|
|
460
|
+
self._shared_context_utility: Optional["ContextUtility"] = None
|
|
461
|
+
|
|
462
|
+
# ------------------------------------------------------------------
|
|
463
|
+
# Helper for lazily creating the shared workflow context utility
|
|
464
|
+
# ------------------------------------------------------------------
|
|
465
|
+
|
|
466
|
+
def _get_or_create_shared_context_utility(self) -> "ContextUtility":
|
|
467
|
+
r"""Get or create the shared context utility for workflow management.
|
|
468
|
+
|
|
469
|
+
This method creates the context utility only when needed, avoiding
|
|
470
|
+
unnecessary session folder creation during initialization.
|
|
471
|
+
|
|
472
|
+
Returns:
|
|
473
|
+
ContextUtility: The shared context utility instance.
|
|
474
|
+
"""
|
|
475
|
+
if self._shared_context_utility is None:
|
|
476
|
+
from camel.utils.context_utils import ContextUtility
|
|
477
|
+
|
|
478
|
+
self._shared_context_utility = (
|
|
479
|
+
ContextUtility.get_workforce_shared()
|
|
480
|
+
)
|
|
481
|
+
return self._shared_context_utility
|
|
482
|
+
|
|
453
483
|
def _validate_agent_compatibility(
|
|
454
484
|
self, agent: ChatAgent, agent_context: str = "agent"
|
|
455
485
|
) -> None:
|
|
@@ -776,76 +806,124 @@ class Workforce(BaseNode):
|
|
|
776
806
|
self._update_dependencies_for_decomposition(task, subtasks)
|
|
777
807
|
return subtasks
|
|
778
808
|
|
|
779
|
-
def
|
|
780
|
-
self,
|
|
781
|
-
|
|
782
|
-
|
|
809
|
+
def _analyze_task(
|
|
810
|
+
self,
|
|
811
|
+
task: Task,
|
|
812
|
+
*,
|
|
813
|
+
for_failure: bool,
|
|
814
|
+
error_message: Optional[str] = None,
|
|
815
|
+
) -> TaskAnalysisResult:
|
|
816
|
+
r"""Unified task analysis for both failures and quality evaluation.
|
|
817
|
+
|
|
818
|
+
This method consolidates the logic for analyzing task failures and
|
|
819
|
+
evaluating task quality, using the unified TASK_ANALYSIS_PROMPT.
|
|
783
820
|
|
|
784
821
|
Args:
|
|
785
|
-
task (Task): The
|
|
786
|
-
|
|
822
|
+
task (Task): The task to analyze
|
|
823
|
+
for_failure (bool): True for failure analysis, False for quality
|
|
824
|
+
evaluation
|
|
825
|
+
error_message (Optional[str]): Error message, required when
|
|
826
|
+
for_failure=True
|
|
787
827
|
|
|
788
828
|
Returns:
|
|
789
|
-
|
|
829
|
+
TaskAnalysisResult: Unified analysis result with recovery strategy
|
|
830
|
+
and optional quality metrics
|
|
831
|
+
|
|
832
|
+
Raises:
|
|
833
|
+
ValueError: If for_failure=True but error_message is None
|
|
790
834
|
"""
|
|
791
|
-
#
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
835
|
+
# Validate required parameters
|
|
836
|
+
if for_failure and error_message is None:
|
|
837
|
+
raise ValueError("error_message is required when for_failure=True")
|
|
838
|
+
|
|
839
|
+
# Determine task result and issue-specific analysis based on context
|
|
840
|
+
if for_failure:
|
|
841
|
+
task_result = "N/A (task failed)"
|
|
842
|
+
issue_type = "Task Failure"
|
|
843
|
+
issue_analysis = f"**Error Message:** {error_message}"
|
|
844
|
+
response_format = FAILURE_ANALYSIS_RESPONSE_FORMAT
|
|
845
|
+
result_schema = TaskAnalysisResult
|
|
846
|
+
fallback_values: Dict[str, Any] = {
|
|
847
|
+
"reasoning": "Defaulting to retry due to parsing error",
|
|
848
|
+
"recovery_strategy": RecoveryStrategy.RETRY,
|
|
849
|
+
"modified_task_content": None,
|
|
850
|
+
"issues": [error_message] if error_message else [],
|
|
851
|
+
}
|
|
852
|
+
examples: List[Dict[str, Any]] = [
|
|
853
|
+
{
|
|
854
|
+
"reasoning": "Temporary network error, worth retrying",
|
|
855
|
+
"recovery_strategy": "retry",
|
|
856
|
+
"modified_task_content": None,
|
|
857
|
+
"issues": ["Network timeout"],
|
|
858
|
+
}
|
|
801
859
|
]
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
860
|
+
else:
|
|
861
|
+
# Quality evaluation
|
|
862
|
+
task_result = task.result or "No result available"
|
|
863
|
+
issue_type = "Quality Evaluation"
|
|
864
|
+
issue_analysis = (
|
|
865
|
+
"Provide a quality score (0-100) and list any specific "
|
|
866
|
+
"issues found."
|
|
807
867
|
)
|
|
868
|
+
response_format = QUALITY_EVALUATION_RESPONSE_FORMAT
|
|
869
|
+
result_schema = TaskAnalysisResult
|
|
870
|
+
fallback_values = {
|
|
871
|
+
"reasoning": (
|
|
872
|
+
"Defaulting to acceptable quality due to parsing error"
|
|
873
|
+
),
|
|
874
|
+
"issues": [],
|
|
875
|
+
"recovery_strategy": None,
|
|
876
|
+
"modified_task_content": None,
|
|
877
|
+
"quality_score": 80,
|
|
878
|
+
}
|
|
879
|
+
examples = [
|
|
880
|
+
{
|
|
881
|
+
"reasoning": (
|
|
882
|
+
"Excellent implementation with comprehensive tests"
|
|
883
|
+
),
|
|
884
|
+
"issues": [],
|
|
885
|
+
"recovery_strategy": None,
|
|
886
|
+
"modified_task_content": None,
|
|
887
|
+
"quality_score": 98,
|
|
888
|
+
},
|
|
889
|
+
{
|
|
890
|
+
"reasoning": (
|
|
891
|
+
"Implementation incomplete with missing features"
|
|
892
|
+
),
|
|
893
|
+
"issues": [
|
|
894
|
+
"Incomplete implementation",
|
|
895
|
+
"Missing error handling",
|
|
896
|
+
],
|
|
897
|
+
"recovery_strategy": "replan",
|
|
898
|
+
"modified_task_content": (
|
|
899
|
+
"Previous attempt was incomplete. "
|
|
900
|
+
"Please implement with: 1) Full feature "
|
|
901
|
+
"coverage, 2) Proper error handling"
|
|
902
|
+
),
|
|
903
|
+
"quality_score": 45,
|
|
904
|
+
},
|
|
905
|
+
]
|
|
808
906
|
|
|
809
|
-
#
|
|
810
|
-
|
|
907
|
+
# Format the unified analysis prompt
|
|
908
|
+
analysis_prompt = TASK_ANALYSIS_PROMPT.format(
|
|
811
909
|
task_id=task.id,
|
|
812
910
|
task_content=task.content,
|
|
911
|
+
task_result=task_result,
|
|
813
912
|
failure_count=task.failure_count,
|
|
814
|
-
error_message=error_message,
|
|
815
|
-
worker_id=task.assigned_worker_id,
|
|
816
913
|
task_depth=task.get_depth(),
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
# Format the analysis prompt
|
|
823
|
-
analysis_prompt = FAILURE_ANALYSIS_PROMPT.format(
|
|
824
|
-
task_id=failure_context.task_id,
|
|
825
|
-
task_content=failure_context.task_content,
|
|
826
|
-
failure_count=failure_context.failure_count,
|
|
827
|
-
error_message=failure_context.error_message,
|
|
828
|
-
worker_id=failure_context.worker_id or "unknown",
|
|
829
|
-
task_depth=failure_context.task_depth,
|
|
830
|
-
additional_info=failure_context.additional_info or "None",
|
|
914
|
+
assigned_worker=task.assigned_worker_id or "unknown",
|
|
915
|
+
issue_type=issue_type,
|
|
916
|
+
issue_specific_analysis=issue_analysis,
|
|
917
|
+
response_format=response_format,
|
|
831
918
|
)
|
|
832
919
|
|
|
833
920
|
try:
|
|
834
|
-
# Check if we should use structured handler
|
|
835
921
|
if self.use_structured_output_handler:
|
|
836
|
-
# Use structured handler
|
|
837
922
|
enhanced_prompt = (
|
|
838
923
|
self.structured_handler.generate_structured_prompt(
|
|
839
924
|
base_prompt=analysis_prompt,
|
|
840
|
-
schema=
|
|
841
|
-
examples=
|
|
842
|
-
{
|
|
843
|
-
"strategy": "RETRY",
|
|
844
|
-
"reasoning": "Temporary network error, "
|
|
845
|
-
"worth retrying",
|
|
846
|
-
"modified_task_content": None,
|
|
847
|
-
}
|
|
848
|
-
],
|
|
925
|
+
schema=result_schema,
|
|
926
|
+
examples=examples,
|
|
849
927
|
)
|
|
850
928
|
)
|
|
851
929
|
|
|
@@ -854,43 +932,220 @@ class Workforce(BaseNode):
|
|
|
854
932
|
|
|
855
933
|
result = self.structured_handler.parse_structured_response(
|
|
856
934
|
response.msg.content if response.msg else "",
|
|
857
|
-
schema=
|
|
858
|
-
fallback_values=
|
|
859
|
-
"strategy": RecoveryStrategy.RETRY,
|
|
860
|
-
"reasoning": "Defaulting to retry due to parsing "
|
|
861
|
-
"issues",
|
|
862
|
-
"modified_task_content": None,
|
|
863
|
-
},
|
|
935
|
+
schema=result_schema,
|
|
936
|
+
fallback_values=fallback_values,
|
|
864
937
|
)
|
|
865
|
-
|
|
866
|
-
if isinstance(result,
|
|
938
|
+
|
|
939
|
+
if isinstance(result, TaskAnalysisResult):
|
|
867
940
|
return result
|
|
868
941
|
elif isinstance(result, dict):
|
|
869
|
-
return
|
|
942
|
+
return result_schema(**result)
|
|
870
943
|
else:
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
reasoning="Failed to parse recovery decision",
|
|
874
|
-
modified_task_content=None,
|
|
875
|
-
)
|
|
944
|
+
# Fallback based on context
|
|
945
|
+
return TaskAnalysisResult(**fallback_values)
|
|
876
946
|
else:
|
|
877
|
-
# Use existing native structured output code
|
|
878
947
|
self.task_agent.reset()
|
|
879
948
|
response = self.task_agent.step(
|
|
880
|
-
analysis_prompt, response_format=
|
|
949
|
+
analysis_prompt, response_format=result_schema
|
|
881
950
|
)
|
|
882
951
|
return response.msg.parsed
|
|
883
952
|
|
|
884
953
|
except Exception as e:
|
|
885
954
|
logger.warning(
|
|
886
|
-
f"Error during
|
|
955
|
+
f"Error during task analysis "
|
|
956
|
+
f"({'failure' if for_failure else 'quality'}): {e}, "
|
|
957
|
+
f"using fallback"
|
|
887
958
|
)
|
|
888
|
-
return
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
959
|
+
return TaskAnalysisResult(**fallback_values)
|
|
960
|
+
|
|
961
|
+
async def _apply_recovery_strategy(
|
|
962
|
+
self,
|
|
963
|
+
task: Task,
|
|
964
|
+
recovery_decision: TaskAnalysisResult,
|
|
965
|
+
) -> bool:
|
|
966
|
+
r"""Apply the recovery strategy from a task analysis result.
|
|
967
|
+
|
|
968
|
+
This method centralizes the recovery logic for both execution failures
|
|
969
|
+
and quality-based failures.
|
|
970
|
+
|
|
971
|
+
Args:
|
|
972
|
+
task (Task): The task that needs recovery
|
|
973
|
+
recovery_decision (TaskAnalysisResult): The analysis result with
|
|
974
|
+
recovery strategy
|
|
975
|
+
|
|
976
|
+
Returns:
|
|
977
|
+
bool: True if workforce should halt (e.g., decompose needs
|
|
978
|
+
different handling), False otherwise
|
|
979
|
+
"""
|
|
980
|
+
strategy = (
|
|
981
|
+
recovery_decision.recovery_strategy or RecoveryStrategy.RETRY
|
|
982
|
+
)
|
|
983
|
+
action_taken = ""
|
|
984
|
+
|
|
985
|
+
try:
|
|
986
|
+
if strategy == RecoveryStrategy.RETRY:
|
|
987
|
+
# Simply retry the task by reposting it to the same worker
|
|
988
|
+
# Check both _assignees dict and task.assigned_worker_id
|
|
989
|
+
assignee_id = (
|
|
990
|
+
self._assignees.get(task.id) or task.assigned_worker_id
|
|
991
|
+
)
|
|
992
|
+
|
|
993
|
+
if assignee_id:
|
|
994
|
+
# Retry with the same worker - no coordinator call needed
|
|
995
|
+
await self._post_task(task, assignee_id)
|
|
996
|
+
action_taken = f"retried with same worker {assignee_id}"
|
|
997
|
+
logger.info(
|
|
998
|
+
f"Task {task.id} retrying with same worker "
|
|
999
|
+
f"{assignee_id} (no coordinator call)"
|
|
1000
|
+
)
|
|
1001
|
+
else:
|
|
1002
|
+
# No previous assignment exists - find a new assignee
|
|
1003
|
+
logger.info(
|
|
1004
|
+
f"Task {task.id} has no previous assignee, "
|
|
1005
|
+
f"calling coordinator"
|
|
1006
|
+
)
|
|
1007
|
+
batch_result = await self._find_assignee([task])
|
|
1008
|
+
assignment = batch_result.assignments[0]
|
|
1009
|
+
self._assignees[task.id] = assignment.assignee_id
|
|
1010
|
+
await self._post_task(task, assignment.assignee_id)
|
|
1011
|
+
action_taken = (
|
|
1012
|
+
f"retried with new worker {assignment.assignee_id}"
|
|
1013
|
+
)
|
|
1014
|
+
|
|
1015
|
+
elif strategy == RecoveryStrategy.REPLAN:
|
|
1016
|
+
# Modify the task content and retry
|
|
1017
|
+
if recovery_decision.modified_task_content:
|
|
1018
|
+
task.content = recovery_decision.modified_task_content
|
|
1019
|
+
logger.info(f"Task {task.id} content modified for replan")
|
|
1020
|
+
|
|
1021
|
+
# Repost the modified task
|
|
1022
|
+
if task.id in self._assignees:
|
|
1023
|
+
assignee_id = self._assignees[task.id]
|
|
1024
|
+
await self._post_task(task, assignee_id)
|
|
1025
|
+
action_taken = (
|
|
1026
|
+
f"replanned and retried with worker {assignee_id}"
|
|
1027
|
+
)
|
|
1028
|
+
else:
|
|
1029
|
+
# Find a new assignee for the replanned task
|
|
1030
|
+
batch_result = await self._find_assignee([task])
|
|
1031
|
+
assignment = batch_result.assignments[0]
|
|
1032
|
+
self._assignees[task.id] = assignment.assignee_id
|
|
1033
|
+
await self._post_task(task, assignment.assignee_id)
|
|
1034
|
+
action_taken = (
|
|
1035
|
+
f"replanned and assigned to "
|
|
1036
|
+
f"worker {assignment.assignee_id}"
|
|
1037
|
+
)
|
|
1038
|
+
|
|
1039
|
+
elif strategy == RecoveryStrategy.REASSIGN:
|
|
1040
|
+
# Reassign to a different worker
|
|
1041
|
+
old_worker = task.assigned_worker_id
|
|
1042
|
+
logger.info(
|
|
1043
|
+
f"Task {task.id} will be reassigned from worker "
|
|
1044
|
+
f"{old_worker}"
|
|
1045
|
+
)
|
|
1046
|
+
|
|
1047
|
+
# Find a different worker
|
|
1048
|
+
batch_result = await self._find_assignee([task])
|
|
1049
|
+
assignment = batch_result.assignments[0]
|
|
1050
|
+
new_worker = assignment.assignee_id
|
|
1051
|
+
|
|
1052
|
+
# If same worker, force find another
|
|
1053
|
+
if new_worker == old_worker and len(self._children) > 1:
|
|
1054
|
+
logger.info("Same worker selected, finding alternative")
|
|
1055
|
+
# Try to find different worker by adding note to
|
|
1056
|
+
# task content
|
|
1057
|
+
task.content = (
|
|
1058
|
+
f"{task.content}\n\n"
|
|
1059
|
+
f"Note: Previous worker {old_worker} had quality "
|
|
1060
|
+
f"issues. Needs different approach."
|
|
1061
|
+
)
|
|
1062
|
+
batch_result = await self._find_assignee([task])
|
|
1063
|
+
assignment = batch_result.assignments[0]
|
|
1064
|
+
new_worker = assignment.assignee_id
|
|
1065
|
+
|
|
1066
|
+
self._assignees[task.id] = new_worker
|
|
1067
|
+
await self._post_task(task, new_worker)
|
|
1068
|
+
action_taken = f"reassigned from {old_worker} to {new_worker}"
|
|
1069
|
+
logger.info(
|
|
1070
|
+
f"Task {task.id} reassigned from {old_worker} to "
|
|
1071
|
+
f"{new_worker}"
|
|
1072
|
+
)
|
|
1073
|
+
|
|
1074
|
+
elif strategy == RecoveryStrategy.DECOMPOSE:
|
|
1075
|
+
# Decompose the task into subtasks
|
|
1076
|
+
reason = (
|
|
1077
|
+
"failure"
|
|
1078
|
+
if not recovery_decision.is_quality_evaluation
|
|
1079
|
+
else "quality issues"
|
|
1080
|
+
)
|
|
1081
|
+
logger.info(
|
|
1082
|
+
f"Task {task.id} will be decomposed due to {reason}"
|
|
1083
|
+
)
|
|
1084
|
+
subtasks_result = self._decompose_task(task)
|
|
1085
|
+
|
|
1086
|
+
# Handle both streaming and non-streaming results
|
|
1087
|
+
if isinstance(subtasks_result, Generator):
|
|
1088
|
+
subtasks = []
|
|
1089
|
+
for new_tasks in subtasks_result:
|
|
1090
|
+
subtasks.extend(new_tasks)
|
|
1091
|
+
else:
|
|
1092
|
+
subtasks = subtasks_result
|
|
1093
|
+
|
|
1094
|
+
if self.metrics_logger and subtasks:
|
|
1095
|
+
self.metrics_logger.log_task_decomposed(
|
|
1096
|
+
parent_task_id=task.id,
|
|
1097
|
+
subtask_ids=[st.id for st in subtasks],
|
|
1098
|
+
)
|
|
1099
|
+
for subtask in subtasks:
|
|
1100
|
+
self.metrics_logger.log_task_created(
|
|
1101
|
+
task_id=subtask.id,
|
|
1102
|
+
description=subtask.content,
|
|
1103
|
+
parent_task_id=task.id,
|
|
1104
|
+
task_type=subtask.type,
|
|
1105
|
+
metadata=subtask.additional_info,
|
|
1106
|
+
)
|
|
1107
|
+
|
|
1108
|
+
# Insert subtasks at the head of the queue
|
|
1109
|
+
self._pending_tasks.extendleft(reversed(subtasks))
|
|
1110
|
+
await self._post_ready_tasks()
|
|
1111
|
+
action_taken = f"decomposed into {len(subtasks)} subtasks"
|
|
1112
|
+
|
|
1113
|
+
logger.info(
|
|
1114
|
+
f"Task {task.id} decomposed into {len(subtasks)} subtasks"
|
|
1115
|
+
)
|
|
1116
|
+
|
|
1117
|
+
# Sync shared memory after task decomposition
|
|
1118
|
+
if self.share_memory:
|
|
1119
|
+
logger.info(
|
|
1120
|
+
f"Syncing shared memory after task {task.id} "
|
|
1121
|
+
f"decomposition"
|
|
1122
|
+
)
|
|
1123
|
+
self._sync_shared_memory()
|
|
1124
|
+
|
|
1125
|
+
# For decompose, we return early with special handling
|
|
1126
|
+
return True
|
|
1127
|
+
|
|
1128
|
+
elif strategy == RecoveryStrategy.CREATE_WORKER:
|
|
1129
|
+
assignee = await self._create_worker_node_for_task(task)
|
|
1130
|
+
await self._post_task(task, assignee.node_id)
|
|
1131
|
+
action_taken = (
|
|
1132
|
+
f"created new worker {assignee.node_id} and assigned "
|
|
1133
|
+
f"task {task.id} to it"
|
|
1134
|
+
)
|
|
1135
|
+
|
|
1136
|
+
except Exception as e:
|
|
1137
|
+
logger.error(
|
|
1138
|
+
f"Recovery strategy {strategy} failed for task {task.id}: {e}",
|
|
1139
|
+
exc_info=True,
|
|
893
1140
|
)
|
|
1141
|
+
raise
|
|
1142
|
+
|
|
1143
|
+
logger.debug(
|
|
1144
|
+
f"Task {task.id} recovery: {action_taken}. "
|
|
1145
|
+
f"Strategy: {strategy.value}"
|
|
1146
|
+
)
|
|
1147
|
+
|
|
1148
|
+
return False
|
|
894
1149
|
|
|
895
1150
|
# Human intervention methods
|
|
896
1151
|
async def _async_pause(self) -> None:
|
|
@@ -1660,6 +1915,7 @@ class Workforce(BaseNode):
|
|
|
1660
1915
|
description: str,
|
|
1661
1916
|
worker: ChatAgent,
|
|
1662
1917
|
pool_max_size: int = DEFAULT_WORKER_POOL_SIZE,
|
|
1918
|
+
enable_workflow_memory: bool = False,
|
|
1663
1919
|
) -> Workforce:
|
|
1664
1920
|
r"""Add a worker node to the workforce that uses a single agent.
|
|
1665
1921
|
Can be called when workforce is paused to dynamically add workers.
|
|
@@ -1669,6 +1925,9 @@ class Workforce(BaseNode):
|
|
|
1669
1925
|
worker (ChatAgent): The agent to be added.
|
|
1670
1926
|
pool_max_size (int): Maximum size of the agent pool.
|
|
1671
1927
|
(default: :obj:`10`)
|
|
1928
|
+
enable_workflow_memory (bool): Whether to enable workflow memory
|
|
1929
|
+
accumulation. Set to True if you plan to call
|
|
1930
|
+
save_workflow_memories(). (default: :obj:`False`)
|
|
1672
1931
|
|
|
1673
1932
|
Returns:
|
|
1674
1933
|
Workforce: The workforce node itself.
|
|
@@ -1695,6 +1954,8 @@ class Workforce(BaseNode):
|
|
|
1695
1954
|
worker=worker,
|
|
1696
1955
|
pool_max_size=pool_max_size,
|
|
1697
1956
|
use_structured_output_handler=self.use_structured_output_handler,
|
|
1957
|
+
context_utility=None, # Will be set during save/load operations
|
|
1958
|
+
enable_workflow_memory=enable_workflow_memory,
|
|
1698
1959
|
)
|
|
1699
1960
|
self._children.append(worker_node)
|
|
1700
1961
|
|
|
@@ -1871,6 +2132,237 @@ class Workforce(BaseNode):
|
|
|
1871
2132
|
else:
|
|
1872
2133
|
self.metrics_logger = WorkforceLogger(workforce_id=self.node_id)
|
|
1873
2134
|
|
|
2135
|
+
def save_workflow_memories(self) -> Dict[str, str]:
|
|
2136
|
+
r"""Save workflow memories for all SingleAgentWorker instances in the
|
|
2137
|
+
workforce.
|
|
2138
|
+
|
|
2139
|
+
This method iterates through all child workers and triggers workflow
|
|
2140
|
+
saving for SingleAgentWorker instances using their
|
|
2141
|
+
save_workflow_memories()
|
|
2142
|
+
method.
|
|
2143
|
+
Other worker types are skipped.
|
|
2144
|
+
|
|
2145
|
+
Returns:
|
|
2146
|
+
Dict[str, str]: Dictionary mapping worker node IDs to save results.
|
|
2147
|
+
Values are either file paths (success) or error messages
|
|
2148
|
+
(failure), or a "skipped: ..." note for unsupported worker types.
|
|
2149
|
+
|
|
2150
|
+
Example:
|
|
2151
|
+
>>> workforce = Workforce("My Team")
|
|
2152
|
+
>>> # ... add workers and process tasks ...
|
|
2153
|
+
>>> results = workforce.save_workflow_memories()
|
|
2154
|
+
>>> print(results)
|
|
2155
|
+
{'worker_123': '/path/to/data_analyst_workflow_20250122.md',
|
|
2156
|
+
'worker_456': 'error: No conversation context available'}
|
|
2157
|
+
"""
|
|
2158
|
+
results = {}
|
|
2159
|
+
|
|
2160
|
+
# Get or create shared context utility for this save operation
|
|
2161
|
+
shared_context_utility = self._get_or_create_shared_context_utility()
|
|
2162
|
+
|
|
2163
|
+
for child in self._children:
|
|
2164
|
+
if isinstance(child, SingleAgentWorker):
|
|
2165
|
+
try:
|
|
2166
|
+
# Set the shared context utility for this operation
|
|
2167
|
+
child._shared_context_utility = shared_context_utility
|
|
2168
|
+
child.worker.set_context_utility(shared_context_utility)
|
|
2169
|
+
|
|
2170
|
+
result = child.save_workflow_memories()
|
|
2171
|
+
if result.get("status") == "success":
|
|
2172
|
+
results[child.node_id] = result.get(
|
|
2173
|
+
"file_path", "unknown_path"
|
|
2174
|
+
)
|
|
2175
|
+
else:
|
|
2176
|
+
# Error: check if there's a separate message field,
|
|
2177
|
+
# otherwise use the status itself
|
|
2178
|
+
error_msg = result.get(
|
|
2179
|
+
"message", result.get("status", "Unknown error")
|
|
2180
|
+
)
|
|
2181
|
+
results[child.node_id] = f"error: {error_msg}"
|
|
2182
|
+
|
|
2183
|
+
except Exception as e:
|
|
2184
|
+
results[child.node_id] = f"error: {e!s}"
|
|
2185
|
+
else:
|
|
2186
|
+
# Skip non-SingleAgentWorker types
|
|
2187
|
+
results[child.node_id] = (
|
|
2188
|
+
f"skipped: {type(child).__name__} not supported"
|
|
2189
|
+
)
|
|
2190
|
+
|
|
2191
|
+
logger.info(f"Workflow save completed for {len(results)} workers")
|
|
2192
|
+
return results
|
|
2193
|
+
|
|
2194
|
+
def load_workflow_memories(
|
|
2195
|
+
self,
|
|
2196
|
+
max_files_to_load: int = 3,
|
|
2197
|
+
session_id: Optional[str] = None,
|
|
2198
|
+
) -> Dict[str, bool]:
|
|
2199
|
+
r"""Load workflow memories for all SingleAgentWorker instances in the
|
|
2200
|
+
workforce.
|
|
2201
|
+
|
|
2202
|
+
This method iterates through all child workers and loads relevant
|
|
2203
|
+
workflow files for SingleAgentWorker instances using their
|
|
2204
|
+
load_workflow_memories()
|
|
2205
|
+
method. Workers match files based on their description names.
|
|
2206
|
+
|
|
2207
|
+
Args:
|
|
2208
|
+
max_files_to_load (int): Maximum number of workflow files to load
|
|
2209
|
+
per worker. (default: :obj:`3`)
|
|
2210
|
+
session_id (Optional[str]): Specific workforce session ID to load
|
|
2211
|
+
from. If None, searches across all sessions.
|
|
2212
|
+
(default: :obj:`None`)
|
|
2213
|
+
|
|
2214
|
+
Returns:
|
|
2215
|
+
Dict[str, bool]: Dictionary mapping worker node IDs to load
|
|
2216
|
+
success status.
|
|
2217
|
+
True indicates successful loading, False indicates failure or an
unsupported (non-SingleAgentWorker) worker type.
|
|
2218
|
+
|
|
2219
|
+
Example:
|
|
2220
|
+
>>> workforce = Workforce("My Team")
|
|
2221
|
+
>>> workforce.add_single_agent_worker(
|
|
2222
|
+
... "data_analyst", analyst_agent
|
|
2223
|
+
... )
|
|
2224
|
+
>>> success_status = workforce.load_workflow_memories()
|
|
2225
|
+
>>> print(success_status)
|
|
2226
|
+
{'worker_123': True} # Successfully loaded workflows for
|
|
2227
|
+
# data_analyst
|
|
2228
|
+
"""
|
|
2229
|
+
results = {}
|
|
2230
|
+
|
|
2231
|
+
# For loading, we don't create a new session - instead we search
|
|
2232
|
+
# existing ones
|
|
2233
|
+
# Each worker will search independently across all existing sessions
|
|
2234
|
+
|
|
2235
|
+
# First, load workflows for SingleAgentWorker instances
|
|
2236
|
+
for child in self._children:
|
|
2237
|
+
if isinstance(child, SingleAgentWorker):
|
|
2238
|
+
try:
|
|
2239
|
+
# For loading, don't set shared context utility
|
|
2240
|
+
# Let each worker search across existing sessions
|
|
2241
|
+
success = child.load_workflow_memories(
|
|
2242
|
+
max_files_to_load=max_files_to_load,
|
|
2243
|
+
session_id=session_id,
|
|
2244
|
+
)
|
|
2245
|
+
results[child.node_id] = success
|
|
2246
|
+
|
|
2247
|
+
except Exception as e:
|
|
2248
|
+
logger.error(
|
|
2249
|
+
f"Failed to load workflow for {child.node_id}: {e!s}"
|
|
2250
|
+
)
|
|
2251
|
+
results[child.node_id] = False
|
|
2252
|
+
else:
|
|
2253
|
+
# Skip non-SingleAgentWorker types
|
|
2254
|
+
results[child.node_id] = False
|
|
2255
|
+
|
|
2256
|
+
# Load aggregated workflow summaries for coordinator and task agents
|
|
2257
|
+
self._load_management_agent_workflows(max_files_to_load, session_id)
|
|
2258
|
+
|
|
2259
|
+
logger.info(f"Workflow load completed for {len(results)} workers")
|
|
2260
|
+
return results
|
|
2261
|
+
|
|
2262
|
+
def _load_management_agent_workflows(
|
|
2263
|
+
self, max_files_to_load: int, session_id: Optional[str] = None
|
|
2264
|
+
) -> None:
|
|
2265
|
+
r"""Load workflow summaries for coordinator and task planning agents.
|
|
2266
|
+
|
|
2267
|
+
This method loads aggregated workflow summaries to help:
|
|
2268
|
+
- Coordinator agent: understand task assignment patterns and worker
|
|
2269
|
+
capabilities
|
|
2270
|
+
- Task agent: understand task decomposition patterns and
|
|
2271
|
+
successful strategies
|
|
2272
|
+
|
|
2273
|
+
Args:
|
|
2274
|
+
max_files_to_load (int): Maximum number of workflow files to load.
|
|
2275
|
+
session_id (Optional[str]): Specific session ID to load from.
|
|
2276
|
+
If None, searches across all sessions.
|
|
2277
|
+
"""
|
|
2278
|
+
try:
|
|
2279
|
+
import glob
|
|
2280
|
+
import os
|
|
2281
|
+
from pathlib import Path
|
|
2282
|
+
|
|
2283
|
+
from camel.utils.context_utils import ContextUtility
|
|
2284
|
+
|
|
2285
|
+
# For loading management workflows, search across all sessions
|
|
2286
|
+
camel_workdir = os.environ.get("CAMEL_WORKDIR")
|
|
2287
|
+
if camel_workdir:
|
|
2288
|
+
base_dir = os.path.join(camel_workdir, "workforce_workflows")
|
|
2289
|
+
else:
|
|
2290
|
+
base_dir = "workforce_workflows"
|
|
2291
|
+
|
|
2292
|
+
# Search for workflow files in specified or all session directories
|
|
2293
|
+
if session_id:
|
|
2294
|
+
search_path = str(
|
|
2295
|
+
Path(base_dir) / session_id / "*_workflow*.md"
|
|
2296
|
+
)
|
|
2297
|
+
else:
|
|
2298
|
+
search_path = str(Path(base_dir) / "*" / "*_workflow*.md")
|
|
2299
|
+
workflow_files = glob.glob(search_path)
|
|
2300
|
+
|
|
2301
|
+
if not workflow_files:
|
|
2302
|
+
logger.info(
|
|
2303
|
+
"No workflow files found for management agent context"
|
|
2304
|
+
)
|
|
2305
|
+
return
|
|
2306
|
+
|
|
2307
|
+
# Sort by modification time (most recent first)
|
|
2308
|
+
workflow_files.sort(
|
|
2309
|
+
key=lambda x: os.path.getmtime(x), reverse=True
|
|
2310
|
+
)
|
|
2311
|
+
|
|
2312
|
+
# Load workflows for coordinator agent (up to max_files_to_load most recent)
|
|
2313
|
+
coordinator_loaded = 0
|
|
2314
|
+
for file_path in workflow_files[:max_files_to_load]:
|
|
2315
|
+
try:
|
|
2316
|
+
filename = os.path.basename(file_path).replace('.md', '')
|
|
2317
|
+
session_dir = os.path.dirname(file_path)
|
|
2318
|
+
session_id = os.path.basename(session_dir)
|
|
2319
|
+
|
|
2320
|
+
# Use shared context utility with specific session
|
|
2321
|
+
temp_utility = ContextUtility.get_workforce_shared(
|
|
2322
|
+
session_id
|
|
2323
|
+
)
|
|
2324
|
+
|
|
2325
|
+
status = temp_utility.load_markdown_context_to_memory(
|
|
2326
|
+
self.coordinator_agent, filename
|
|
2327
|
+
)
|
|
2328
|
+
if "Context appended" in status:
|
|
2329
|
+
coordinator_loaded += 1
|
|
2330
|
+
except Exception as e:
|
|
2331
|
+
logger.warning(
|
|
2332
|
+
f"Failed to load coordinator workflow {file_path}: {e}"
|
|
2333
|
+
)
|
|
2334
|
+
|
|
2335
|
+
# Load workflows for task agent (up to max_files_to_load most recent)
|
|
2336
|
+
task_agent_loaded = 0
|
|
2337
|
+
for file_path in workflow_files[:max_files_to_load]:
|
|
2338
|
+
try:
|
|
2339
|
+
filename = os.path.basename(file_path).replace('.md', '')
|
|
2340
|
+
session_dir = os.path.dirname(file_path)
|
|
2341
|
+
session_id = os.path.basename(session_dir)
|
|
2342
|
+
|
|
2343
|
+
# Use shared context utility with specific session
|
|
2344
|
+
temp_utility = ContextUtility.get_workforce_shared(
|
|
2345
|
+
session_id
|
|
2346
|
+
)
|
|
2347
|
+
|
|
2348
|
+
status = temp_utility.load_markdown_context_to_memory(
|
|
2349
|
+
self.task_agent, filename
|
|
2350
|
+
)
|
|
2351
|
+
if "Context appended" in status:
|
|
2352
|
+
task_agent_loaded += 1
|
|
2353
|
+
except Exception as e:
|
|
2354
|
+
logger.warning(
|
|
2355
|
+
f"Failed to load task agent workflow {file_path}: {e}"
|
|
2356
|
+
)
|
|
2357
|
+
|
|
2358
|
+
logger.info(
|
|
2359
|
+
f"Loaded {coordinator_loaded} workflows for coordinator, "
|
|
2360
|
+
f"{task_agent_loaded} workflows for task agent"
|
|
2361
|
+
)
|
|
2362
|
+
|
|
2363
|
+
except Exception as e:
|
|
2364
|
+
logger.error(f"Error loading management agent workflows: {e}")
|
|
2365
|
+
|
|
1874
2366
|
@check_if_running(False)
|
|
1875
2367
|
def set_channel(self, channel: TaskChannel) -> None:
|
|
1876
2368
|
r"""Set the channel for the node and all the child nodes under it."""
|
|
@@ -2380,8 +2872,7 @@ class Workforce(BaseNode):
|
|
|
2380
2872
|
"worker creation"
|
|
2381
2873
|
)
|
|
2382
2874
|
new_node_conf = WorkerConf(
|
|
2383
|
-
description=f"Fallback worker for task: "
|
|
2384
|
-
f"{task.content}",
|
|
2875
|
+
description=f"Fallback worker for task: {task.content}",
|
|
2385
2876
|
role="General Assistant",
|
|
2386
2877
|
sys_msg="You are a general assistant that can help "
|
|
2387
2878
|
"with various tasks.",
|
|
@@ -2391,7 +2882,7 @@ class Workforce(BaseNode):
|
|
|
2391
2882
|
response.msg.content,
|
|
2392
2883
|
schema=WorkerConf,
|
|
2393
2884
|
fallback_values={
|
|
2394
|
-
"description": f"Worker for task:
|
|
2885
|
+
"description": f"Worker for task: {task.content}",
|
|
2395
2886
|
"role": "Task Specialist",
|
|
2396
2887
|
"sys_msg": f"You are a specialist for: {task.content}",
|
|
2397
2888
|
},
|
|
@@ -2419,8 +2910,7 @@ class Workforce(BaseNode):
|
|
|
2419
2910
|
)
|
|
2420
2911
|
# Create a fallback worker configuration
|
|
2421
2912
|
new_node_conf = WorkerConf(
|
|
2422
|
-
description=f"Fallback worker for "
|
|
2423
|
-
f"task: {task.content}",
|
|
2913
|
+
description=f"Fallback worker for task: {task.content}",
|
|
2424
2914
|
role="General Assistant",
|
|
2425
2915
|
sys_msg="You are a general assistant that can help "
|
|
2426
2916
|
"with various tasks.",
|
|
@@ -2619,20 +3109,119 @@ class Workforce(BaseNode):
|
|
|
2619
3109
|
f"Assigning task: {e}"
|
|
2620
3110
|
)
|
|
2621
3111
|
dependencies = self._task_dependencies[task.id]
|
|
2622
|
-
|
|
2623
|
-
#
|
|
2624
|
-
|
|
2625
|
-
dep_id in completed_tasks_info
|
|
2626
|
-
|
|
2627
|
-
|
|
2628
|
-
|
|
2629
|
-
|
|
2630
|
-
|
|
2631
|
-
|
|
2632
|
-
|
|
3112
|
+
|
|
3113
|
+
# Check if all dependencies are in completed state
|
|
3114
|
+
all_deps_completed = all(
|
|
3115
|
+
dep_id in completed_tasks_info for dep_id in dependencies
|
|
3116
|
+
)
|
|
3117
|
+
|
|
3118
|
+
# Only proceed with dependency checks if all deps are completed
|
|
3119
|
+
if all_deps_completed:
|
|
3120
|
+
# Check if all dependencies succeeded (state is DONE)
|
|
3121
|
+
all_deps_done = all(
|
|
3122
|
+
completed_tasks_info[dep_id] == TaskState.DONE
|
|
3123
|
+
for dep_id in dependencies
|
|
2633
3124
|
)
|
|
2634
|
-
|
|
2635
|
-
|
|
3125
|
+
|
|
3126
|
+
# Check if any dependency failed
|
|
3127
|
+
any_dep_failed = any(
|
|
3128
|
+
completed_tasks_info[dep_id] == TaskState.FAILED
|
|
3129
|
+
for dep_id in dependencies
|
|
3130
|
+
)
|
|
3131
|
+
|
|
3132
|
+
if all_deps_done:
|
|
3133
|
+
# All dependencies completed successfully - post the
|
|
3134
|
+
# task
|
|
3135
|
+
assignee_id = self._assignees[task.id]
|
|
3136
|
+
logger.debug(
|
|
3137
|
+
f"Posting task {task.id} to "
|
|
3138
|
+
f"assignee {assignee_id}. "
|
|
3139
|
+
f"Dependencies met."
|
|
3140
|
+
)
|
|
3141
|
+
await self._post_task(task, assignee_id)
|
|
3142
|
+
posted_tasks.append(task)
|
|
3143
|
+
elif any_dep_failed:
|
|
3144
|
+
# Check if any failed dependencies can still be retried
|
|
3145
|
+
failed_deps = [
|
|
3146
|
+
dep_id
|
|
3147
|
+
for dep_id in dependencies
|
|
3148
|
+
if completed_tasks_info[dep_id] == TaskState.FAILED
|
|
3149
|
+
]
|
|
3150
|
+
|
|
3151
|
+
# Check if any failed dependency is still retryable
|
|
3152
|
+
failed_tasks_with_retry_potential = []
|
|
3153
|
+
permanently_failed_deps = []
|
|
3154
|
+
|
|
3155
|
+
for dep_id in failed_deps:
|
|
3156
|
+
# Find the failed dependency task
|
|
3157
|
+
failed_task = next(
|
|
3158
|
+
(
|
|
3159
|
+
t
|
|
3160
|
+
for t in self._completed_tasks
|
|
3161
|
+
if t.id == dep_id
|
|
3162
|
+
),
|
|
3163
|
+
None,
|
|
3164
|
+
)
|
|
3165
|
+
if (
|
|
3166
|
+
failed_task
|
|
3167
|
+
and failed_task.failure_count
|
|
3168
|
+
< MAX_TASK_RETRIES
|
|
3169
|
+
):
|
|
3170
|
+
failed_tasks_with_retry_potential.append(
|
|
3171
|
+
dep_id
|
|
3172
|
+
)
|
|
3173
|
+
else:
|
|
3174
|
+
permanently_failed_deps.append(dep_id)
|
|
3175
|
+
|
|
3176
|
+
# Only fail the task if ALL dependencies are
|
|
3177
|
+
# permanently failed
|
|
3178
|
+
if (
|
|
3179
|
+
permanently_failed_deps
|
|
3180
|
+
and not failed_tasks_with_retry_potential
|
|
3181
|
+
):
|
|
3182
|
+
logger.error(
|
|
3183
|
+
f"Task {task.id} cannot proceed: dependencies "
|
|
3184
|
+
f"{permanently_failed_deps} have "
|
|
3185
|
+
f"permanently failed. "
|
|
3186
|
+
f"Marking task as failed."
|
|
3187
|
+
)
|
|
3188
|
+
task.state = TaskState.FAILED
|
|
3189
|
+
task.result = (
|
|
3190
|
+
f"Task failed due to permanently "
|
|
3191
|
+
f"failed dependencies: "
|
|
3192
|
+
f"{permanently_failed_deps}"
|
|
3193
|
+
)
|
|
3194
|
+
|
|
3195
|
+
# Log the failure to metrics
|
|
3196
|
+
if self.metrics_logger:
|
|
3197
|
+
self.metrics_logger.log_task_failed(
|
|
3198
|
+
task_id=task.id,
|
|
3199
|
+
worker_id=task.assigned_worker_id
|
|
3200
|
+
or "unknown",
|
|
3201
|
+
error_message=task.result,
|
|
3202
|
+
metadata={
|
|
3203
|
+
'failure_reason': (
|
|
3204
|
+
'dependency_failure'
|
|
3205
|
+
),
|
|
3206
|
+
'failed_dependencies': (
|
|
3207
|
+
permanently_failed_deps
|
|
3208
|
+
),
|
|
3209
|
+
},
|
|
3210
|
+
)
|
|
3211
|
+
|
|
3212
|
+
self._completed_tasks.append(task)
|
|
3213
|
+
self._cleanup_task_tracking(task.id)
|
|
3214
|
+
posted_tasks.append(task) # Remove from pending
|
|
3215
|
+
else:
|
|
3216
|
+
# Some dependencies may still be retried, keep
|
|
3217
|
+
# task pending
|
|
3218
|
+
logger.debug(
|
|
3219
|
+
f"Task {task.id} waiting: dependencies "
|
|
3220
|
+
f"{failed_tasks_with_retry_potential} "
|
|
3221
|
+
f"failed but may be retried "
|
|
3222
|
+
f"(attempt < {MAX_TASK_RETRIES})"
|
|
3223
|
+
)
|
|
3224
|
+
# else: Not all dependencies completed yet, skip this task
|
|
2636
3225
|
|
|
2637
3226
|
# Step 3: Remove the posted tasks from the pending list
|
|
2638
3227
|
for task in posted_tasks:
|
|
@@ -2644,21 +3233,30 @@ class Workforce(BaseNode):
|
|
|
2644
3233
|
pass
|
|
2645
3234
|
|
|
2646
3235
|
async def _handle_failed_task(self, task: Task) -> bool:
|
|
3236
|
+
r"""Handle a task that failed during execution.
|
|
3237
|
+
|
|
3238
|
+
Args:
|
|
3239
|
+
task (Task): The failed task
|
|
3240
|
+
|
|
3241
|
+
Returns:
|
|
3242
|
+
bool: True if workforce should halt, False otherwise
|
|
3243
|
+
"""
|
|
2647
3244
|
task.failure_count += 1
|
|
2648
3245
|
|
|
2649
3246
|
# Determine detailed failure information
|
|
2650
|
-
# Use the actual error/result stored in task.result
|
|
2651
3247
|
failure_reason = task.result or "Unknown error"
|
|
2652
|
-
|
|
2653
|
-
# Add context about the worker and task
|
|
2654
3248
|
worker_id = task.assigned_worker_id or "unknown"
|
|
2655
|
-
|
|
2656
|
-
|
|
2657
|
-
detailed_error = f"{failure_reason}{worker_info}"
|
|
3249
|
+
detailed_error = f"{failure_reason} (assigned to worker: {worker_id})"
|
|
2658
3250
|
|
|
2659
3251
|
logger.error(
|
|
2660
3252
|
f"Task {task.id} failed (attempt "
|
|
2661
|
-
f"{task.failure_count}/
|
|
3253
|
+
f"{task.failure_count}/{MAX_TASK_RETRIES}): {detailed_error}"
|
|
3254
|
+
)
|
|
3255
|
+
|
|
3256
|
+
print(
|
|
3257
|
+
f"{Fore.RED}❌ Task {task.id} failed "
|
|
3258
|
+
f"(attempt {task.failure_count}/{MAX_TASK_RETRIES}): "
|
|
3259
|
+
f"{failure_reason}{Fore.RESET}"
|
|
2662
3260
|
)
|
|
2663
3261
|
|
|
2664
3262
|
if self.metrics_logger:
|
|
@@ -2673,24 +3271,20 @@ class Workforce(BaseNode):
|
|
|
2673
3271
|
},
|
|
2674
3272
|
)
|
|
2675
3273
|
|
|
2676
|
-
# Check for immediate halt conditions
|
|
2677
|
-
# should halt
|
|
3274
|
+
# Check for immediate halt conditions
|
|
2678
3275
|
if task.failure_count >= MAX_TASK_RETRIES:
|
|
2679
3276
|
logger.error(
|
|
2680
3277
|
f"Task {task.id} has exceeded maximum retry attempts "
|
|
2681
|
-
f"({MAX_TASK_RETRIES}). Final failure "
|
|
2682
|
-
f"
|
|
3278
|
+
f"({MAX_TASK_RETRIES}). Final failure reason: "
|
|
3279
|
+
f"{detailed_error}. "
|
|
2683
3280
|
f"Task content: '{task.content}'"
|
|
2684
3281
|
)
|
|
2685
3282
|
self._cleanup_task_tracking(task.id)
|
|
2686
|
-
# Mark task as completed for dependency tracking before halting
|
|
2687
3283
|
self._completed_tasks.append(task)
|
|
2688
3284
|
if task.id in self._assignees:
|
|
2689
3285
|
await self._channel.archive_task(task.id)
|
|
2690
3286
|
return True
|
|
2691
3287
|
|
|
2692
|
-
# If too many tasks are failing rapidly, also halt to prevent infinite
|
|
2693
|
-
# loops
|
|
2694
3288
|
if len(self._pending_tasks) > MAX_PENDING_TASKS_LIMIT:
|
|
2695
3289
|
logger.error(
|
|
2696
3290
|
f"Too many pending tasks ({len(self._pending_tasks)} > "
|
|
@@ -2698,18 +3292,24 @@ class Workforce(BaseNode):
|
|
|
2698
3292
|
f"explosion. Last failed task: {task.id}"
|
|
2699
3293
|
)
|
|
2700
3294
|
self._cleanup_task_tracking(task.id)
|
|
2701
|
-
# Mark task as completed for dependency tracking before halting
|
|
2702
3295
|
self._completed_tasks.append(task)
|
|
2703
3296
|
if task.id in self._assignees:
|
|
2704
3297
|
await self._channel.archive_task(task.id)
|
|
2705
3298
|
return True
|
|
2706
3299
|
|
|
2707
3300
|
# Use intelligent failure analysis to decide recovery strategy
|
|
2708
|
-
recovery_decision = self.
|
|
3301
|
+
recovery_decision = self._analyze_task(
|
|
3302
|
+
task, for_failure=True, error_message=detailed_error
|
|
3303
|
+
)
|
|
2709
3304
|
|
|
3305
|
+
strategy_str = (
|
|
3306
|
+
recovery_decision.recovery_strategy.value
|
|
3307
|
+
if recovery_decision.recovery_strategy
|
|
3308
|
+
else "none"
|
|
3309
|
+
)
|
|
2710
3310
|
logger.info(
|
|
2711
3311
|
f"Task {task.id} failure "
|
|
2712
|
-
f"analysis: {
|
|
3312
|
+
f"analysis: {strategy_str} - "
|
|
2713
3313
|
f"{recovery_decision.reasoning}"
|
|
2714
3314
|
)
|
|
2715
3315
|
|
|
@@ -2718,105 +3318,23 @@ class Workforce(BaseNode):
|
|
|
2718
3318
|
await self._channel.archive_task(task.id)
|
|
2719
3319
|
self._cleanup_task_tracking(task.id)
|
|
2720
3320
|
|
|
3321
|
+
# Apply recovery strategy
|
|
2721
3322
|
try:
|
|
2722
|
-
|
|
2723
|
-
|
|
2724
|
-
|
|
2725
|
-
assignee_id = self._assignees[task.id]
|
|
2726
|
-
await self._post_task(task, assignee_id)
|
|
2727
|
-
action_taken = f"retried with same worker {assignee_id}"
|
|
2728
|
-
else:
|
|
2729
|
-
# Find a new assignee and retry
|
|
2730
|
-
batch_result = await self._find_assignee([task])
|
|
2731
|
-
assignment = batch_result.assignments[0]
|
|
2732
|
-
self._assignees[task.id] = assignment.assignee_id
|
|
2733
|
-
await self._post_task(task, assignment.assignee_id)
|
|
2734
|
-
action_taken = (
|
|
2735
|
-
f"retried with new worker {assignment.assignee_id}"
|
|
2736
|
-
)
|
|
2737
|
-
|
|
2738
|
-
elif recovery_decision.strategy == RecoveryStrategy.REPLAN:
|
|
2739
|
-
# Modify the task content and retry
|
|
2740
|
-
if recovery_decision.modified_task_content:
|
|
2741
|
-
task.content = recovery_decision.modified_task_content
|
|
2742
|
-
logger.info(f"Task {task.id} content modified for replan")
|
|
2743
|
-
|
|
2744
|
-
# Repost the modified task
|
|
2745
|
-
if task.id in self._assignees:
|
|
2746
|
-
assignee_id = self._assignees[task.id]
|
|
2747
|
-
await self._post_task(task, assignee_id)
|
|
2748
|
-
action_taken = (
|
|
2749
|
-
f"replanned and retried with worker {assignee_id}"
|
|
2750
|
-
)
|
|
2751
|
-
else:
|
|
2752
|
-
# Find a new assignee for the replanned task
|
|
2753
|
-
batch_result = await self._find_assignee([task])
|
|
2754
|
-
assignment = batch_result.assignments[0]
|
|
2755
|
-
self._assignees[task.id] = assignment.assignee_id
|
|
2756
|
-
await self._post_task(task, assignment.assignee_id)
|
|
2757
|
-
action_taken = (
|
|
2758
|
-
f"replanned and assigned to "
|
|
2759
|
-
f"worker {assignment.assignee_id}"
|
|
2760
|
-
)
|
|
2761
|
-
|
|
2762
|
-
elif recovery_decision.strategy == RecoveryStrategy.DECOMPOSE:
|
|
2763
|
-
# Decompose the task into subtasks
|
|
2764
|
-
subtasks_result = self._decompose_task(task)
|
|
2765
|
-
|
|
2766
|
-
# Handle both streaming and non-streaming results
|
|
2767
|
-
if isinstance(subtasks_result, Generator):
|
|
2768
|
-
# This is a generator (streaming mode)
|
|
2769
|
-
subtasks = []
|
|
2770
|
-
for new_tasks in subtasks_result:
|
|
2771
|
-
subtasks.extend(new_tasks)
|
|
2772
|
-
else:
|
|
2773
|
-
# This is a regular list (non-streaming mode)
|
|
2774
|
-
subtasks = subtasks_result
|
|
2775
|
-
if self.metrics_logger and subtasks:
|
|
2776
|
-
self.metrics_logger.log_task_decomposed(
|
|
2777
|
-
parent_task_id=task.id,
|
|
2778
|
-
subtask_ids=[st.id for st in subtasks],
|
|
2779
|
-
)
|
|
2780
|
-
for subtask in subtasks:
|
|
2781
|
-
self.metrics_logger.log_task_created(
|
|
2782
|
-
task_id=subtask.id,
|
|
2783
|
-
description=subtask.content,
|
|
2784
|
-
parent_task_id=task.id,
|
|
2785
|
-
task_type=subtask.type,
|
|
2786
|
-
metadata=subtask.additional_info,
|
|
2787
|
-
)
|
|
2788
|
-
# Insert packets at the head of the queue
|
|
2789
|
-
self._pending_tasks.extendleft(reversed(subtasks))
|
|
2790
|
-
|
|
2791
|
-
await self._post_ready_tasks()
|
|
2792
|
-
action_taken = f"decomposed into {len(subtasks)} subtasks"
|
|
2793
|
-
|
|
2794
|
-
logger.debug(
|
|
2795
|
-
f"Task {task.id} failed and was {action_taken}. "
|
|
2796
|
-
f"Dependencies updated for subtasks."
|
|
2797
|
-
)
|
|
2798
|
-
|
|
2799
|
-
# Sync shared memory after task decomposition
|
|
2800
|
-
if self.share_memory:
|
|
2801
|
-
logger.info(
|
|
2802
|
-
f"Syncing shared memory after "
|
|
2803
|
-
f"task {task.id} decomposition"
|
|
2804
|
-
)
|
|
2805
|
-
self._sync_shared_memory()
|
|
3323
|
+
is_decompose = await self._apply_recovery_strategy(
|
|
3324
|
+
task, recovery_decision
|
|
3325
|
+
)
|
|
2806
3326
|
|
|
2807
|
-
|
|
2808
|
-
|
|
3327
|
+
# For decompose, we handle it specially
|
|
3328
|
+
if is_decompose:
|
|
3329
|
+
# Task was decomposed, add to completed tasks
|
|
3330
|
+
self._completed_tasks.append(task)
|
|
2809
3331
|
return False
|
|
2810
3332
|
|
|
2811
|
-
elif recovery_decision.strategy == RecoveryStrategy.CREATE_WORKER:
|
|
2812
|
-
assignee = await self._create_worker_node_for_task(task)
|
|
2813
|
-
await self._post_task(task, assignee.node_id)
|
|
2814
|
-
action_taken = (
|
|
2815
|
-
f"created new worker {assignee.node_id} and assigned "
|
|
2816
|
-
f"task {task.id} to it"
|
|
2817
|
-
)
|
|
2818
3333
|
except Exception as e:
|
|
2819
|
-
logger.error(
|
|
3334
|
+
logger.error(
|
|
3335
|
+
f"Recovery strategy failed for task {task.id}: {e}",
|
|
3336
|
+
exc_info=True,
|
|
3337
|
+
)
|
|
2820
3338
|
# If max retries reached, halt the workforce
|
|
2821
3339
|
if task.failure_count >= MAX_TASK_RETRIES:
|
|
2822
3340
|
self._completed_tasks.append(task)
|
|
@@ -2824,18 +3342,17 @@ class Workforce(BaseNode):
|
|
|
2824
3342
|
self._completed_tasks.append(task)
|
|
2825
3343
|
return False
|
|
2826
3344
|
|
|
3345
|
+
# Task is being retried - don't add to completed tasks
|
|
3346
|
+
# It will be added when it actually completes or permanently fails
|
|
2827
3347
|
logger.debug(
|
|
2828
|
-
f"Task {task.id}
|
|
2829
|
-
f"
|
|
3348
|
+
f"Task {task.id} is being retried (strategy: "
|
|
3349
|
+
f"{recovery_decision.recovery_strategy}). "
|
|
3350
|
+
f"Not adding to completed tasks until final outcome."
|
|
2830
3351
|
)
|
|
2831
|
-
# Mark task as completed for dependency tracking
|
|
2832
|
-
self._completed_tasks.append(task)
|
|
2833
3352
|
|
|
2834
|
-
# Sync shared memory after task
|
|
3353
|
+
# Sync shared memory after task recovery
|
|
2835
3354
|
if self.share_memory:
|
|
2836
|
-
logger.info(
|
|
2837
|
-
f"Syncing shared memory after task {task.id} completion"
|
|
2838
|
-
)
|
|
3355
|
+
logger.info(f"Syncing shared memory after task {task.id} recovery")
|
|
2839
3356
|
self._sync_shared_memory()
|
|
2840
3357
|
|
|
2841
3358
|
# Check if any pending tasks are now ready to execute
|
|
@@ -3345,11 +3862,88 @@ class Workforce(BaseNode):
|
|
|
3345
3862
|
)
|
|
3346
3863
|
continue
|
|
3347
3864
|
else:
|
|
3348
|
-
|
|
3349
|
-
|
|
3350
|
-
f"successfully.{Fore.RESET}"
|
|
3865
|
+
quality_eval = self._analyze_task(
|
|
3866
|
+
returned_task, for_failure=False
|
|
3351
3867
|
)
|
|
3352
|
-
|
|
3868
|
+
|
|
3869
|
+
if not quality_eval.quality_sufficient:
|
|
3870
|
+
logger.info(
|
|
3871
|
+
f"Task {returned_task.id} quality check: "
|
|
3872
|
+
f"score={quality_eval.quality_score}, "
|
|
3873
|
+
f"issues={quality_eval.issues}, "
|
|
3874
|
+
f"strategy={quality_eval.recovery_strategy}"
|
|
3875
|
+
)
|
|
3876
|
+
|
|
3877
|
+
# Check retry limit before attempting recovery
|
|
3878
|
+
if returned_task.failure_count >= 2:
|
|
3879
|
+
print(
|
|
3880
|
+
f"{Fore.YELLOW}Task {returned_task.id} "
|
|
3881
|
+
f"completed with low quality score: "
|
|
3882
|
+
f"{quality_eval.quality_score} "
|
|
3883
|
+
f"(retry limit reached){Fore.RESET}"
|
|
3884
|
+
)
|
|
3885
|
+
await self._handle_completed_task(
|
|
3886
|
+
returned_task
|
|
3887
|
+
)
|
|
3888
|
+
continue
|
|
3889
|
+
|
|
3890
|
+
# Print visual feedback for quality-failed tasks
|
|
3891
|
+
# with recovery strategy
|
|
3892
|
+
recovery_action = (
|
|
3893
|
+
quality_eval.recovery_strategy.value
|
|
3894
|
+
if quality_eval.recovery_strategy
|
|
3895
|
+
else ""
|
|
3896
|
+
)
|
|
3897
|
+
print(
|
|
3898
|
+
f"{Fore.YELLOW}⚠️ Task {returned_task.id} "
|
|
3899
|
+
f"failed quality check (score: "
|
|
3900
|
+
f"{quality_eval.quality_score}). "
|
|
3901
|
+
f"Issues: {', '.join(quality_eval.issues)}. "
|
|
3902
|
+
f"Recovery: {recovery_action}{Fore.RESET}"
|
|
3903
|
+
)
|
|
3904
|
+
|
|
3905
|
+
# Mark as failed for recovery
|
|
3906
|
+
returned_task.failure_count += 1
|
|
3907
|
+
returned_task.state = TaskState.FAILED
|
|
3908
|
+
returned_task.result = (
|
|
3909
|
+
f"Quality insufficient (score: "
|
|
3910
|
+
f"{quality_eval.quality_score}). "
|
|
3911
|
+
f"Issues: {', '.join(quality_eval.issues)}"
|
|
3912
|
+
)
|
|
3913
|
+
|
|
3914
|
+
# Clean up tracking before attempting recovery
|
|
3915
|
+
if returned_task.id in self._assignees:
|
|
3916
|
+
await self._channel.archive_task(
|
|
3917
|
+
returned_task.id
|
|
3918
|
+
)
|
|
3919
|
+
self._cleanup_task_tracking(returned_task.id)
|
|
3920
|
+
|
|
3921
|
+
# Apply LLM-recommended recovery strategy
|
|
3922
|
+
try:
|
|
3923
|
+
is_decompose = (
|
|
3924
|
+
await self._apply_recovery_strategy(
|
|
3925
|
+
returned_task, quality_eval
|
|
3926
|
+
)
|
|
3927
|
+
)
|
|
3928
|
+
|
|
3929
|
+
# For decompose, cleanup happens in the method
|
|
3930
|
+
if is_decompose:
|
|
3931
|
+
continue
|
|
3932
|
+
|
|
3933
|
+
except Exception as e:
|
|
3934
|
+
logger.error(
|
|
3935
|
+
f"Error handling quality-failed task "
|
|
3936
|
+
f"{returned_task.id}: {e}",
|
|
3937
|
+
exc_info=True,
|
|
3938
|
+
)
|
|
3939
|
+
continue
|
|
3940
|
+
else:
|
|
3941
|
+
print(
|
|
3942
|
+
f"{Fore.CYAN}Task {returned_task.id} "
|
|
3943
|
+
f"completed successfully (quality score: "
|
|
3944
|
+
f"{quality_eval.quality_score}).{Fore.RESET}"
|
|
3945
|
+
)
|
|
3946
|
+
await self._handle_completed_task(returned_task)
|
|
3353
3947
|
elif returned_task.state == TaskState.FAILED:
|
|
3354
3948
|
try:
|
|
3355
3949
|
halt = await self._handle_failed_task(returned_task)
|