camel-ai 0.2.75a5__py3-none-any.whl → 0.2.76__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- camel/__init__.py +1 -1
- camel/agents/chat_agent.py +1148 -298
- camel/agents/mcp_agent.py +30 -27
- camel/configs/__init__.py +9 -0
- camel/configs/amd_config.py +70 -0
- camel/configs/cometapi_config.py +104 -0
- camel/configs/nebius_config.py +103 -0
- camel/data_collectors/alpaca_collector.py +15 -6
- camel/environments/tic_tac_toe.py +1 -1
- camel/interpreters/__init__.py +2 -0
- camel/interpreters/docker/Dockerfile +3 -12
- camel/interpreters/microsandbox_interpreter.py +395 -0
- camel/loaders/__init__.py +11 -2
- camel/loaders/chunkr_reader.py +9 -0
- camel/memories/__init__.py +2 -1
- camel/memories/agent_memories.py +3 -1
- camel/memories/blocks/chat_history_block.py +21 -3
- camel/memories/records.py +88 -8
- camel/messages/base.py +127 -34
- camel/models/__init__.py +6 -0
- camel/models/amd_model.py +101 -0
- camel/models/azure_openai_model.py +0 -6
- camel/models/base_model.py +30 -0
- camel/models/cometapi_model.py +83 -0
- camel/models/model_factory.py +6 -0
- camel/models/nebius_model.py +83 -0
- camel/models/ollama_model.py +3 -3
- camel/models/openai_compatible_model.py +0 -6
- camel/models/openai_model.py +0 -6
- camel/models/zhipuai_model.py +61 -2
- camel/parsers/__init__.py +18 -0
- camel/parsers/mcp_tool_call_parser.py +176 -0
- camel/retrievers/auto_retriever.py +1 -0
- camel/runtimes/daytona_runtime.py +11 -12
- camel/societies/workforce/prompts.py +131 -50
- camel/societies/workforce/single_agent_worker.py +434 -49
- camel/societies/workforce/structured_output_handler.py +30 -18
- camel/societies/workforce/task_channel.py +163 -27
- camel/societies/workforce/utils.py +105 -12
- camel/societies/workforce/workforce.py +1357 -314
- camel/societies/workforce/workforce_logger.py +24 -5
- camel/storages/key_value_storages/json.py +15 -2
- camel/storages/object_storages/google_cloud.py +1 -1
- camel/storages/vectordb_storages/oceanbase.py +10 -11
- camel/storages/vectordb_storages/tidb.py +8 -6
- camel/tasks/task.py +4 -3
- camel/toolkits/__init__.py +18 -5
- camel/toolkits/aci_toolkit.py +45 -0
- camel/toolkits/code_execution.py +28 -1
- camel/toolkits/context_summarizer_toolkit.py +684 -0
- camel/toolkits/dingtalk.py +1135 -0
- camel/toolkits/edgeone_pages_mcp_toolkit.py +11 -31
- camel/toolkits/{file_write_toolkit.py → file_toolkit.py} +194 -34
- camel/toolkits/function_tool.py +6 -1
- camel/toolkits/github_toolkit.py +104 -17
- camel/toolkits/google_drive_mcp_toolkit.py +12 -31
- camel/toolkits/hybrid_browser_toolkit/config_loader.py +12 -0
- camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit.py +79 -2
- camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit_ts.py +95 -59
- camel/toolkits/hybrid_browser_toolkit/installer.py +203 -0
- camel/toolkits/hybrid_browser_toolkit/ts/package-lock.json +5 -612
- camel/toolkits/hybrid_browser_toolkit/ts/package.json +0 -1
- camel/toolkits/hybrid_browser_toolkit/ts/src/browser-session.ts +619 -95
- camel/toolkits/hybrid_browser_toolkit/ts/src/config-loader.ts +7 -2
- camel/toolkits/hybrid_browser_toolkit/ts/src/hybrid-browser-toolkit.ts +115 -219
- camel/toolkits/hybrid_browser_toolkit/ts/src/parent-child-filter.ts +226 -0
- camel/toolkits/hybrid_browser_toolkit/ts/src/snapshot-parser.ts +219 -0
- camel/toolkits/hybrid_browser_toolkit/ts/src/som-screenshot-injected.ts +543 -0
- camel/toolkits/hybrid_browser_toolkit/ts/src/types.ts +1 -0
- camel/toolkits/hybrid_browser_toolkit/ts/websocket-server.js +39 -6
- camel/toolkits/hybrid_browser_toolkit/ws_wrapper.py +412 -133
- camel/toolkits/hybrid_browser_toolkit_py/hybrid_browser_toolkit.py +9 -5
- camel/toolkits/{openai_image_toolkit.py → image_generation_toolkit.py} +98 -31
- camel/toolkits/markitdown_toolkit.py +27 -1
- camel/toolkits/math_toolkit.py +64 -10
- camel/toolkits/mcp_toolkit.py +348 -348
- camel/toolkits/message_integration.py +3 -0
- camel/toolkits/minimax_mcp_toolkit.py +195 -0
- camel/toolkits/note_taking_toolkit.py +18 -8
- camel/toolkits/notion_mcp_toolkit.py +16 -26
- camel/toolkits/origene_mcp_toolkit.py +8 -49
- camel/toolkits/playwright_mcp_toolkit.py +12 -31
- camel/toolkits/resend_toolkit.py +168 -0
- camel/toolkits/search_toolkit.py +13 -2
- camel/toolkits/slack_toolkit.py +50 -1
- camel/toolkits/terminal_toolkit/__init__.py +18 -0
- camel/toolkits/terminal_toolkit/terminal_toolkit.py +924 -0
- camel/toolkits/terminal_toolkit/utils.py +532 -0
- camel/toolkits/vertex_ai_veo_toolkit.py +590 -0
- camel/toolkits/video_analysis_toolkit.py +17 -11
- camel/toolkits/wechat_official_toolkit.py +483 -0
- camel/types/enums.py +155 -1
- camel/types/unified_model_type.py +10 -0
- camel/utils/commons.py +17 -0
- camel/utils/context_utils.py +804 -0
- camel/utils/mcp.py +136 -2
- camel/utils/token_counting.py +25 -17
- {camel_ai-0.2.75a5.dist-info → camel_ai-0.2.76.dist-info}/METADATA +158 -67
- {camel_ai-0.2.75a5.dist-info → camel_ai-0.2.76.dist-info}/RECORD +101 -80
- camel/loaders/pandas_reader.py +0 -368
- camel/toolkits/terminal_toolkit.py +0 -1788
- {camel_ai-0.2.75a5.dist-info → camel_ai-0.2.76.dist-info}/WHEEL +0 -0
- {camel_ai-0.2.75a5.dist-info → camel_ai-0.2.76.dist-info}/licenses/LICENSE +0 -0
camel/societies/workforce/workforce.py

@@ -16,12 +16,15 @@ from __future__ import annotations
 import asyncio
 import concurrent.futures
 import json
+import os
 import time
 import uuid
 from collections import deque
 from enum import Enum
 from typing import (
+    TYPE_CHECKING,
     Any,
+    Callable,
     Coroutine,
     Deque,
     Dict,
@@ -31,8 +34,12 @@ from typing import (
     Set,
     Tuple,
     Union,
+    cast,
 )
 
+if TYPE_CHECKING:
+    from camel.utils.context_utils import ContextUtility
+
 from colorama import Fore
 
 from camel.agents import ChatAgent
@@ -43,19 +50,23 @@ from camel.societies.workforce.base import BaseNode
 from camel.societies.workforce.prompts import (
     ASSIGN_TASK_PROMPT,
     CREATE_NODE_PROMPT,
-
+    FAILURE_ANALYSIS_RESPONSE_FORMAT,
+    QUALITY_EVALUATION_RESPONSE_FORMAT,
+    TASK_AGENT_SYSTEM_MESSAGE,
+    TASK_ANALYSIS_PROMPT,
     TASK_DECOMPOSE_PROMPT,
 )
 from camel.societies.workforce.role_playing_worker import RolePlayingWorker
-from camel.societies.workforce.single_agent_worker import
+from camel.societies.workforce.single_agent_worker import (
+    SingleAgentWorker,
+)
 from camel.societies.workforce.structured_output_handler import (
     StructuredOutputHandler,
 )
 from camel.societies.workforce.task_channel import TaskChannel
 from camel.societies.workforce.utils import (
-    FailureContext,
-    RecoveryDecision,
     RecoveryStrategy,
+    TaskAnalysisResult,
     TaskAssignment,
     TaskAssignResult,
     WorkerConf,
@@ -70,6 +81,7 @@ from camel.tasks.task import (
 )
 from camel.toolkits import (
     CodeExecutionToolkit,
+    FunctionTool,
     SearchToolkit,
     TaskPlanningToolkit,
     ThinkingToolkit,
@@ -79,12 +91,20 @@ from camel.utils import dependencies_required
 
 from .workforce_logger import WorkforceLogger
 
-
+if os.environ.get("TRACEROOT_ENABLED", "False").lower() == "true":
+    try:
+        import traceroot  # type: ignore[import]
+
+        logger = traceroot.get_logger('camel')
+    except ImportError:
+        logger = get_logger(__name__)
+else:
+    logger = get_logger(__name__)
 
 # Constants for configuration values
 MAX_TASK_RETRIES = 3
 MAX_PENDING_TASKS_LIMIT = 20
-TASK_TIMEOUT_SECONDS =
+TASK_TIMEOUT_SECONDS = 600.0
 DEFAULT_WORKER_POOL_SIZE = 10
 
 
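The hunk above swaps the module-level logger for a `traceroot`-backed one only when the `TRACEROOT_ENABLED` environment variable is set, falling back to CAMEL's standard logger otherwise. A minimal opt-in sketch (assumes the optional `traceroot` package is installed; the flag must be set before the module is imported, since it is read once at import time):

```python
import os

# Opt in to traceroot-backed logging before importing the workforce module.
os.environ["TRACEROOT_ENABLED"] = "true"

from camel.societies.workforce import Workforce  # logger now comes from traceroot if available
```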
@@ -265,6 +285,7 @@ class Workforce(BaseNode):
         self._pause_event = asyncio.Event()
         self._pause_event.set()  # Initially not paused
         self._stop_requested = False
+        self._skip_requested = False
         self._snapshots: List[WorkforceSnapshot] = []
         self._completed_tasks: List[Task] = []
         self._loop: Optional[asyncio.AbstractEventLoop] = None
@@ -311,8 +332,7 @@ class Workforce(BaseNode):
         if coordinator_agent.system_message is not None:
             user_sys_msg_content = coordinator_agent.system_message.content
             combined_content = (
-                f"{user_sys_msg_content}\n\n"
-                f"{coord_agent_sys_msg.content}"
+                f"{user_sys_msg_content}\n\n{coord_agent_sys_msg.content}"
             )
             combined_sys_msg = BaseMessage.make_assistant_message(
                 role_name=coordinator_agent.system_message.role_name,
@@ -336,10 +356,7 @@ class Workforce(BaseNode):
                 None,
             ),
             output_language=coordinator_agent.output_language,
-            tools=[
-                tool.func
-                for tool in coordinator_agent._internal_tools.values()
-            ],
+            tools=list(coordinator_agent._internal_tools.values()),
             external_tools=[
                 schema
                 for schema in coordinator_agent._external_tool_schemas.values()  # noqa: E501
@@ -352,7 +369,7 @@ class Workforce(BaseNode):
         # Set up task agent with default system message and required tools
         task_sys_msg = BaseMessage.make_assistant_message(
             role_name="Task Planner",
-            content=
+            content=TASK_AGENT_SYSTEM_MESSAGE,
         )
         task_planning_tools = TaskPlanningToolkit().get_tools()
 
@@ -377,8 +394,7 @@ class Workforce(BaseNode):
         if task_agent.system_message is not None:
             user_task_sys_msg_content = task_agent.system_message.content
             combined_task_content = (
-                f"{user_task_sys_msg_content}\n\n"
-                f"{task_sys_msg.content}"
+                f"{user_task_sys_msg_content}\n\n{task_sys_msg.content}"
             )
             combined_task_sys_msg = BaseMessage.make_assistant_message(
                 role_name=task_agent.system_message.role_name,
@@ -389,9 +405,11 @@ class Workforce(BaseNode):
 
         # Since ChatAgent constructor uses a dictionary with
         # function names as keys, we don't need to manually deduplicate.
-        combined_tools =
-
-
+        combined_tools: List[Union[FunctionTool, Callable]] = cast(
+            List[Union[FunctionTool, Callable]],
+            list(task_agent._internal_tools.values())
+            + task_planning_tools,
+        )
 
         # Create a new agent with the provided agent's configuration
         # but with the combined system message and tools
@@ -438,10 +456,30 @@ class Workforce(BaseNode):
             "better context continuity during task handoffs."
         )
 
+        # Shared context utility for workflow management (created lazily)
+        self._shared_context_utility: Optional["ContextUtility"] = None
+
     # ------------------------------------------------------------------
     # Helper for propagating pause control to externally supplied agents
     # ------------------------------------------------------------------
 
+    def _get_or_create_shared_context_utility(self) -> "ContextUtility":
+        r"""Get or create the shared context utility for workflow management.
+
+        This method creates the context utility only when needed, avoiding
+        unnecessary session folder creation during initialization.
+
+        Returns:
+            ContextUtility: The shared context utility instance.
+        """
+        if self._shared_context_utility is None:
+            from camel.utils.context_utils import ContextUtility
+
+            self._shared_context_utility = (
+                ContextUtility.get_workforce_shared()
+            )
+        return self._shared_context_utility
+
     def _validate_agent_compatibility(
         self, agent: ChatAgent, agent_context: str = "agent"
     ) -> None:
@@ -478,6 +516,9 @@ class Workforce(BaseNode):
                 "the Workforce."
             )
 
+    # ------------------------------------------------------------------
+    # Helper for propagating pause control to externally supplied agents
+    # ------------------------------------------------------------------
     def _attach_pause_event_to_agent(self, agent: ChatAgent) -> None:
         r"""Ensure the given ChatAgent shares this workforce's pause_event.
 
@@ -765,76 +806,124 @@ class Workforce(BaseNode):
         self._update_dependencies_for_decomposition(task, subtasks)
         return subtasks
 
-    def
-        self,
-
-
+    def _analyze_task(
+        self,
+        task: Task,
+        *,
+        for_failure: bool,
+        error_message: Optional[str] = None,
+    ) -> TaskAnalysisResult:
+        r"""Unified task analysis for both failures and quality evaluation.
+
+        This method consolidates the logic for analyzing task failures and
+        evaluating task quality, using the unified TASK_ANALYSIS_PROMPT.
 
         Args:
-            task (Task): The
-
+            task (Task): The task to analyze
+            for_failure (bool): True for failure analysis, False for quality
+                evaluation
+            error_message (Optional[str]): Error message, required when
+                for_failure=True
 
         Returns:
-
+            TaskAnalysisResult: Unified analysis result with recovery strategy
+                and optional quality metrics
+
+        Raises:
+            ValueError: If for_failure=True but error_message is None
         """
-        #
-
-
-
-
-
-
-
-
-
+        # Validate required parameters
+        if for_failure and error_message is None:
+            raise ValueError("error_message is required when for_failure=True")
+
+        # Determine task result and issue-specific analysis based on context
+        if for_failure:
+            task_result = "N/A (task failed)"
+            issue_type = "Task Failure"
+            issue_analysis = f"**Error Message:** {error_message}"
+            response_format = FAILURE_ANALYSIS_RESPONSE_FORMAT
+            result_schema = TaskAnalysisResult
+            fallback_values: Dict[str, Any] = {
+                "reasoning": "Defaulting to retry due to parsing error",
+                "recovery_strategy": RecoveryStrategy.RETRY,
+                "modified_task_content": None,
+                "issues": [error_message] if error_message else [],
+            }
+            examples: List[Dict[str, Any]] = [
+                {
+                    "reasoning": "Temporary network error, worth retrying",
+                    "recovery_strategy": "retry",
+                    "modified_task_content": None,
+                    "issues": ["Network timeout"],
+                }
            ]
-
-
-
-
-
+        else:
+            # Quality evaluation
+            task_result = task.result or "No result available"
+            issue_type = "Quality Evaluation"
+            issue_analysis = (
+                "Provide a quality score (0-100) and list any specific "
+                "issues found."
            )
+            response_format = QUALITY_EVALUATION_RESPONSE_FORMAT
+            result_schema = TaskAnalysisResult
+            fallback_values = {
+                "reasoning": (
+                    "Defaulting to acceptable quality due to parsing error"
+                ),
+                "issues": [],
+                "recovery_strategy": None,
+                "modified_task_content": None,
+                "quality_score": 80,
+            }
+            examples = [
+                {
+                    "reasoning": (
+                        "Excellent implementation with comprehensive tests"
+                    ),
+                    "issues": [],
+                    "recovery_strategy": None,
+                    "modified_task_content": None,
+                    "quality_score": 98,
+                },
+                {
+                    "reasoning": (
+                        "Implementation incomplete with missing features"
+                    ),
+                    "issues": [
+                        "Incomplete implementation",
+                        "Missing error handling",
+                    ],
+                    "recovery_strategy": "replan",
+                    "modified_task_content": (
+                        "Previous attempt was incomplete. "
+                        "Please implement with: 1) Full feature "
+                        "coverage, 2) Proper error handling"
+                    ),
+                    "quality_score": 45,
+                },
+            ]
 
-        #
-
+        # Format the unified analysis prompt
+        analysis_prompt = TASK_ANALYSIS_PROMPT.format(
             task_id=task.id,
             task_content=task.content,
+            task_result=task_result,
             failure_count=task.failure_count,
-            error_message=error_message,
-            worker_id=task.assigned_worker_id,
             task_depth=task.get_depth(),
-
-
-
-
-
-        # Format the analysis prompt
-        analysis_prompt = FAILURE_ANALYSIS_PROMPT.format(
-            task_id=failure_context.task_id,
-            task_content=failure_context.task_content,
-            failure_count=failure_context.failure_count,
-            error_message=failure_context.error_message,
-            worker_id=failure_context.worker_id or "unknown",
-            task_depth=failure_context.task_depth,
-            additional_info=failure_context.additional_info or "None",
+            assigned_worker=task.assigned_worker_id or "unknown",
+            issue_type=issue_type,
+            issue_specific_analysis=issue_analysis,
+            response_format=response_format,
         )
 
         try:
-            # Check if we should use structured handler
             if self.use_structured_output_handler:
-                # Use structured handler
                 enhanced_prompt = (
                     self.structured_handler.generate_structured_prompt(
                         base_prompt=analysis_prompt,
-                        schema=
-                        examples=
-                            {
-                                "strategy": "RETRY",
-                                "reasoning": "Temporary network error, "
-                                "worth retrying",
-                                "modified_task_content": None,
-                            }
-                        ],
+                        schema=result_schema,
+                        examples=examples,
                     )
                 )
 
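Both branches above populate a single `TaskAnalysisResult`. Based only on the fields this diff reads and writes (`reasoning`, `recovery_strategy`, `modified_task_content`, `issues`, `quality_score`, `is_quality_evaluation`), the schema plausibly looks like the sketch below; the actual model ships in `camel/societies/workforce/utils.py` and may differ:

```python
from typing import List, Optional

from pydantic import BaseModel

from camel.societies.workforce.utils import RecoveryStrategy


class TaskAnalysisResultSketch(BaseModel):
    # Hypothetical reconstruction inferred from how _analyze_task and
    # _apply_recovery_strategy use the result; not the shipped definition.
    reasoning: str
    recovery_strategy: Optional[RecoveryStrategy] = None
    modified_task_content: Optional[str] = None
    issues: List[str] = []
    quality_score: Optional[int] = None
    is_quality_evaluation: bool = False
```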
@@ -843,43 +932,220 @@ class Workforce(BaseNode):
 
                 result = self.structured_handler.parse_structured_response(
                     response.msg.content if response.msg else "",
-                    schema=
-                    fallback_values=
-                        "strategy": RecoveryStrategy.RETRY,
-                        "reasoning": "Defaulting to retry due to parsing "
-                        "issues",
-                        "modified_task_content": None,
-                    },
+                    schema=result_schema,
+                    fallback_values=fallback_values,
                 )
-
-                if isinstance(result,
+
+                if isinstance(result, TaskAnalysisResult):
                     return result
                 elif isinstance(result, dict):
-                    return
+                    return result_schema(**result)
                 else:
-
-
-                        reasoning="Failed to parse recovery decision",
-                        modified_task_content=None,
-                    )
+                    # Fallback based on context
+                    return TaskAnalysisResult(**fallback_values)
             else:
-                # Use existing native structured output code
                 self.task_agent.reset()
                 response = self.task_agent.step(
-                    analysis_prompt, response_format=
+                    analysis_prompt, response_format=result_schema
                 )
                 return response.msg.parsed
 
         except Exception as e:
             logger.warning(
-                f"Error during
+                f"Error during task analysis "
+                f"({'failure' if for_failure else 'quality'}): {e}, "
+                f"using fallback"
             )
-            return
-
-
-
-
+            return TaskAnalysisResult(**fallback_values)
+
+    async def _apply_recovery_strategy(
+        self,
+        task: Task,
+        recovery_decision: TaskAnalysisResult,
+    ) -> bool:
+        r"""Apply the recovery strategy from a task analysis result.
+
+        This method centralizes the recovery logic for both execution failures
+        and quality-based failures.
+
+        Args:
+            task (Task): The task that needs recovery
+            recovery_decision (TaskAnalysisResult): The analysis result with
+                recovery strategy
+
+        Returns:
+            bool: True if workforce should halt (e.g., decompose needs
+                different handling), False otherwise
+        """
+        strategy = (
+            recovery_decision.recovery_strategy or RecoveryStrategy.RETRY
+        )
+        action_taken = ""
+
+        try:
+            if strategy == RecoveryStrategy.RETRY:
+                # Simply retry the task by reposting it to the same worker
+                # Check both _assignees dict and task.assigned_worker_id
+                assignee_id = (
+                    self._assignees.get(task.id) or task.assigned_worker_id
+                )
+
+                if assignee_id:
+                    # Retry with the same worker - no coordinator call needed
+                    await self._post_task(task, assignee_id)
+                    action_taken = f"retried with same worker {assignee_id}"
+                    logger.info(
+                        f"Task {task.id} retrying with same worker "
+                        f"{assignee_id} (no coordinator call)"
+                    )
+                else:
+                    # No previous assignment exists - find a new assignee
+                    logger.info(
+                        f"Task {task.id} has no previous assignee, "
+                        f"calling coordinator"
+                    )
+                    batch_result = await self._find_assignee([task])
+                    assignment = batch_result.assignments[0]
+                    self._assignees[task.id] = assignment.assignee_id
+                    await self._post_task(task, assignment.assignee_id)
+                    action_taken = (
+                        f"retried with new worker {assignment.assignee_id}"
+                    )
+
+            elif strategy == RecoveryStrategy.REPLAN:
+                # Modify the task content and retry
+                if recovery_decision.modified_task_content:
+                    task.content = recovery_decision.modified_task_content
+                    logger.info(f"Task {task.id} content modified for replan")
+
+                # Repost the modified task
+                if task.id in self._assignees:
+                    assignee_id = self._assignees[task.id]
+                    await self._post_task(task, assignee_id)
+                    action_taken = (
+                        f"replanned and retried with worker {assignee_id}"
+                    )
+                else:
+                    # Find a new assignee for the replanned task
+                    batch_result = await self._find_assignee([task])
+                    assignment = batch_result.assignments[0]
+                    self._assignees[task.id] = assignment.assignee_id
+                    await self._post_task(task, assignment.assignee_id)
+                    action_taken = (
+                        f"replanned and assigned to "
+                        f"worker {assignment.assignee_id}"
+                    )
+
+            elif strategy == RecoveryStrategy.REASSIGN:
+                # Reassign to a different worker
+                old_worker = task.assigned_worker_id
+                logger.info(
+                    f"Task {task.id} will be reassigned from worker "
+                    f"{old_worker}"
+                )
+
+                # Find a different worker
+                batch_result = await self._find_assignee([task])
+                assignment = batch_result.assignments[0]
+                new_worker = assignment.assignee_id
+
+                # If same worker, force find another
+                if new_worker == old_worker and len(self._children) > 1:
+                    logger.info("Same worker selected, finding alternative")
+                    # Try to find different worker by adding note to
+                    # task content
+                    task.content = (
+                        f"{task.content}\n\n"
+                        f"Note: Previous worker {old_worker} had quality "
+                        f"issues. Needs different approach."
+                    )
+                    batch_result = await self._find_assignee([task])
+                    assignment = batch_result.assignments[0]
+                    new_worker = assignment.assignee_id
+
+                self._assignees[task.id] = new_worker
+                await self._post_task(task, new_worker)
+                action_taken = f"reassigned from {old_worker} to {new_worker}"
+                logger.info(
+                    f"Task {task.id} reassigned from {old_worker} to "
+                    f"{new_worker}"
+                )
+
+            elif strategy == RecoveryStrategy.DECOMPOSE:
+                # Decompose the task into subtasks
+                reason = (
+                    "failure"
+                    if not recovery_decision.is_quality_evaluation
+                    else "quality issues"
+                )
+                logger.info(
+                    f"Task {task.id} will be decomposed due to {reason}"
+                )
+                subtasks_result = self._decompose_task(task)
+
+                # Handle both streaming and non-streaming results
+                if isinstance(subtasks_result, Generator):
+                    subtasks = []
+                    for new_tasks in subtasks_result:
+                        subtasks.extend(new_tasks)
+                else:
+                    subtasks = subtasks_result
+
+                if self.metrics_logger and subtasks:
+                    self.metrics_logger.log_task_decomposed(
+                        parent_task_id=task.id,
+                        subtask_ids=[st.id for st in subtasks],
+                    )
+                    for subtask in subtasks:
+                        self.metrics_logger.log_task_created(
+                            task_id=subtask.id,
+                            description=subtask.content,
+                            parent_task_id=task.id,
+                            task_type=subtask.type,
+                            metadata=subtask.additional_info,
+                        )
+
+                # Insert subtasks at the head of the queue
+                self._pending_tasks.extendleft(reversed(subtasks))
+                await self._post_ready_tasks()
+                action_taken = f"decomposed into {len(subtasks)} subtasks"
+
+                logger.info(
+                    f"Task {task.id} decomposed into {len(subtasks)} subtasks"
+                )
+
+                # Sync shared memory after task decomposition
+                if self.share_memory:
+                    logger.info(
+                        f"Syncing shared memory after task {task.id} "
+                        f"decomposition"
+                    )
+                    self._sync_shared_memory()
+
+                # For decompose, we return early with special handling
+                return True
+
+            elif strategy == RecoveryStrategy.CREATE_WORKER:
+                assignee = await self._create_worker_node_for_task(task)
+                await self._post_task(task, assignee.node_id)
+                action_taken = (
+                    f"created new worker {assignee.node_id} and assigned "
+                    f"task {task.id} to it"
+                )
+
+        except Exception as e:
+            logger.error(
+                f"Recovery strategy {strategy} failed for task {task.id}: {e}",
+                exc_info=True,
            )
+            raise
+
+        logger.debug(
+            f"Task {task.id} recovery: {action_taken}. "
+            f"Strategy: {strategy.value}"
+        )
+
+        return False
 
     # Human intervention methods
     async def _async_pause(self) -> None:
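The call site that connects `_analyze_task` to `_apply_recovery_strategy` is not part of this hunk, so the following is only an illustrative sketch of how the two new helpers plausibly compose when a task fails; the method name and surrounding attributes are assumptions:

```python
# Hypothetical wiring inside Workforce; names outside this diff are illustrative.
async def _on_task_failed(self, task, error_message: str) -> None:
    decision = self._analyze_task(
        task, for_failure=True, error_message=error_message
    )
    halted = await self._apply_recovery_strategy(task, decision)
    if halted:
        # DECOMPOSE already re-queued subtasks and asks the caller to stop here.
        return
```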
@@ -970,6 +1236,39 @@ class Workforce(BaseNode):
                 f"(event-loop not yet started)."
             )
 
+    async def _async_skip_gracefully(self) -> None:
+        r"""Async implementation of skip_gracefully to run on the event
+        loop.
+        """
+        self._skip_requested = True
+        if self._pause_event.is_set() is False:
+            self._pause_event.set()  # Resume if paused to process skip
+        logger.info(f"Workforce {self.node_id} skip requested.")
+
+    def skip_gracefully(self) -> None:
+        r"""Request workforce to skip current pending tasks and move to next
+        main task from the queue. If no main tasks exist, acts like
+        stop_gracefully.
+
+        This method clears the current pending subtasks and moves to the next
+        main task in the queue if available. Works both when the internal
+        event-loop is alive and when it has not yet been started.
+        """
+
+        if self._loop and not self._loop.is_closed():
+            self._submit_coro_to_loop(self._async_skip_gracefully())
+        else:
+            # Loop not yet created, set the flag synchronously so later
+            # startup will respect it.
+            self._skip_requested = True
+            # Ensure any pending pause is released so that when the loop does
+            # start it can see the skip request and exit.
+            self._pause_event.set()
+            logger.info(
+                f"Workforce {self.node_id} skip requested "
+                f"(event-loop not yet started)."
+            )
+
     def save_snapshot(self, description: str = "") -> None:
         r"""Save current state as a snapshot."""
         snapshot = WorkforceSnapshot(
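A short usage sketch of the new skip control introduced above, assuming a `workforce` instance that is already processing work:

```python
# Drop the current pending subtasks and move on to the next queued main task;
# with no main task queued, this behaves like stop_gracefully().
workforce.skip_gracefully()
```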
@@ -1029,36 +1328,148 @@ class Workforce(BaseNode):
         content: str,
         task_id: Optional[str] = None,
         additional_info: Optional[Dict[str, Any]] = None,
+        as_subtask: bool = False,
         insert_position: int = -1,
     ) -> Task:
-        r"""Add a new task to the
-
+        r"""Add a new task to the workforce.
+
+        By default, this method adds a main task that will be decomposed into
+        subtasks. Set `as_subtask=True` to add a task directly to the pending
+        subtask queue without decomposition.
+
+        Args:
+            content (str): The content of the task.
+            task_id (Optional[str], optional): Optional ID for the task.
+                If not provided, a unique ID will be generated.
+            additional_info (Optional[Dict[str, Any]], optional): Optional
+                additional metadata for the task.
+            as_subtask (bool, optional): If True, adds the task directly to
+                the pending subtask queue. If False, adds as a main task that
+                will be decomposed. Defaults to False.
+            insert_position (int, optional): Position to insert the task in
+                the pending queue. Only applies when as_subtask=True.
+                Defaults to -1 (append to end).
+
+        Returns:
+            Task: The created task object.
+        """
+        if as_subtask:
+            new_task = Task(
+                content=content,
+                id=task_id or f"human_added_{len(self._pending_tasks)}",
+                additional_info=additional_info,
+            )
+
+            # Add directly to current pending subtasks
+            if insert_position == -1:
+                self._pending_tasks.append(new_task)
+            else:
+                # Convert deque to list, insert, then back to deque
+                tasks_list = list(self._pending_tasks)
+                tasks_list.insert(insert_position, new_task)
+                self._pending_tasks = deque(tasks_list)
+
+            logger.info(f"New subtask added to pending queue: {new_task.id}")
+            return new_task
+        else:
+            # Add as main task that needs decomposition
+            # Use additional_info to mark this task needs decomposition
+            # Make a copy to avoid modifying user's dict
+            info = additional_info.copy() if additional_info else {}
+            info['_needs_decomposition'] = True
+
+            task_count = sum(
+                1
+                for t in self._pending_tasks
+                if t.additional_info
+                and t.additional_info.get('_needs_decomposition')
+            )
+
+            new_task = Task(
+                content=content,
+                id=task_id or f"main_task_{task_count}",
+                additional_info=info,
+            )
+
+            self._pending_tasks.append(new_task)
+            logger.info(f"New main task added to pending queue: {new_task.id}")
+            return new_task
+
+    def add_main_task(
+        self,
+        content: str,
+        task_id: Optional[str] = None,
+        additional_info: Optional[Dict[str, Any]] = None,
+    ) -> Task:
+        r"""Add a new main task that will be decomposed into subtasks.
+
+        This is an alias for :meth:`add_task` with `as_subtask=False`.
+
+        Args:
+            content (str): The content of the main task.
+            task_id (Optional[str], optional): Optional ID for the task.
+            additional_info (Optional[Dict[str, Any]], optional): Optional
+                additional metadata.
+
+        Returns:
+            Task: The created main task object.
+        """
+        return self.add_task(
             content=content,
-
+            task_id=task_id,
             additional_info=additional_info,
+            as_subtask=False,
         )
-        if insert_position == -1:
-            self._pending_tasks.append(new_task)
-        else:
-            # Convert deque to list, insert, then back to deque
-            tasks_list = list(self._pending_tasks)
-            tasks_list.insert(insert_position, new_task)
-            self._pending_tasks = deque(tasks_list)
 
-
-
+    def add_subtask(
+        self,
+        content: str,
+        task_id: Optional[str] = None,
+        additional_info: Optional[Dict[str, Any]] = None,
+        insert_position: int = -1,
+    ) -> Task:
+        r"""Add a new subtask to the current pending queue.
+
+        This is an alias for :meth:`add_task` with `as_subtask=True`.
+
+        Args:
+            content (str): The content of the subtask.
+            task_id (Optional[str], optional): Optional ID for the task.
+            additional_info (Optional[Dict[str, Any]], optional): Optional
+                additional metadata.
+            insert_position (int, optional): Position to insert the task.
+                Defaults to -1 (append to end).
+
+        Returns:
+            Task: The created subtask object.
+        """
+        return self.add_task(
+            content=content,
+            task_id=task_id,
+            additional_info=additional_info,
+            as_subtask=True,
+            insert_position=insert_position,
+        )
 
     def remove_task(self, task_id: str) -> bool:
-        r"""Remove a task from the pending queue.
-
-
-
+        r"""Remove a task from the pending queue or main task queue.
+
+        Args:
+            task_id (str): The ID of the task to remove.
+
+        Returns:
+            bool: True if task was found and removed, False otherwise.
+        """
+        # Check main task queue first
+        pending_tasks_list = list(self._pending_tasks)
+        for i, task in enumerate(pending_tasks_list):
             if task.id == task_id:
-
-                self._pending_tasks = deque(
-                logger.info(f"Task {task_id} removed.")
+                pending_tasks_list.pop(i)
+                self._pending_tasks = deque(pending_tasks_list)
+                logger.info(f"Task {task_id} removed from pending queue.")
                 return True
-
+
+        logger.warning(f"Task {task_id} not found in any task queue.")
         return False
 
     def reorder_tasks(self, task_ids: List[str]) -> bool:
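Usage sketch for the task-queueing helpers introduced above, assuming a constructed `workforce`:

```python
# Queue a main task; it is decomposed into subtasks once processing starts.
main = workforce.add_main_task("Produce a quarterly sales summary")

# Push an extra subtask straight to the front of the pending queue.
workforce.add_subtask(
    "Cross-check revenue figures against the raw CSV export",
    insert_position=0,
)

# Remove a queued task by ID if it is no longer needed.
workforce.remove_task(main.id)
```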
@@ -1173,26 +1584,21 @@ class Workforce(BaseNode):
             "main_task_id": self._task.id if self._task else None,
         }
 
-
-
-
-
-
+    async def handle_decompose_append_task(
+        self, task: Task, reset: bool = True
+    ) -> List[Task]:
+        r"""Handle task decomposition and validation with
+        workforce environment functions. Then append to
+        pending tasks if decomposition happened.
 
         Args:
             task (Task): The task to be processed.
-
-
-                runs the task in a blocking one-shot manner.
+            reset (Bool): Should trigger workforce reset (Workforce must not
+                be running). Default: True
 
         Returns:
-            Task: The
+            List[Task]: The decomposed subtasks or the original task.
         """
-        # Delegate to intervention pipeline when requested to keep
-        # backward-compat.
-        if interactive:
-            return await self._process_task_with_snapshot(task)
-
         if not validate_task_content(task.content, task.id):
             task.state = TaskState.FAILED
             task.result = "Task failed: Invalid or empty content provided"
@@ -1200,10 +1606,16 @@ class Workforce(BaseNode):
                 f"Task {task.id} rejected: Invalid or empty content. "
                 f"Content preview: '{task.content}'"
             )
-            return task
+            return [task]
 
-        self.
+        if reset and self._state != WorkforceState.RUNNING:
+            self.reset()
+            logger.info("Workforce reset before handling task.")
+
+        # Focus on the new task
         self._task = task
+        task.state = TaskState.FAILED
+
         if self.metrics_logger:
             self.metrics_logger.log_task_created(
                 task_id=task.id,
@@ -1211,7 +1623,6 @@ class Workforce(BaseNode):
                 task_type=task.type,
                 metadata=task.additional_info,
             )
-        task.state = TaskState.FAILED
         # The agent tend to be overconfident on the whole task, so we
         # decompose the task into subtasks first
         subtasks_result = self._decompose_task(task)
@@ -1237,20 +1648,46 @@ class Workforce(BaseNode):
                     task_type=subtask.type,
                     metadata=subtask.additional_info,
                 )
+
         if subtasks:
-            #
-            #
+            # _pending_tasks will contain both undecomposed
+            # and decomposed tasks, so we use additional_info
+            # to mark the tasks that need decomposition instead
             self._pending_tasks.extendleft(reversed(subtasks))
         else:
             # If no decomposition, execute the original task.
             self._pending_tasks.append(task)
 
-
-
-        await self.start()
+        return subtasks
 
-
-
+    @check_if_running(False)
+    async def process_task_async(
+        self, task: Task, interactive: bool = False
+    ) -> Task:
+        r"""Main entry point to process a task asynchronously.
+
+        Args:
+            task (Task): The task to be processed.
+            interactive (bool, optional): If True, enables human-intervention
+                workflow (pause/resume/snapshot). Defaults to False, which
+                runs the task in a blocking one-shot manner.
+
+        Returns:
+            Task: The updated task.
+        """
+        # Delegate to intervention pipeline when requested to keep
+        # backward-compat.
+        if interactive:
+            return await self._process_task_with_snapshot(task)
+
+        subtasks = await self.handle_decompose_append_task(task)
+
+        self.set_channel(TaskChannel())
+
+        await self.start()
+
+        if subtasks:
+            task.result = "\n\n".join(
                 f"--- Subtask {sub.id} Result ---\n{sub.result}"
                 for sub in task.subtasks
                 if sub.result
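`process_task_async` is now a thin async entry point built on `handle_decompose_append_task`. A minimal call sketch, assuming a configured `workforce` and the `Task` constructor used elsewhere in this diff:

```python
import asyncio

from camel.tasks.task import Task


async def run() -> None:
    task = Task(content="Write a short market analysis", id="demo_0")
    finished = await workforce.process_task_async(task)
    print(finished.result)


asyncio.run(run())
```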
@@ -1326,39 +1763,8 @@ class Workforce(BaseNode):
             Task: The updated task.
         """
 
-
-        task.state = TaskState.FAILED
-        task.result = "Task failed: Invalid or empty content provided"
-        logger.warning(
-            f"Task {task.id} rejected: Invalid or empty content. "
-            f"Content preview: '{task.content}'"
-        )
-        return task
-
-        self.reset()
-        self._task = task
-        self._state = WorkforceState.RUNNING
-        task.state = TaskState.FAILED  # TODO: Add logic for OPEN
-
-        # Decompose the task into subtasks first
-        subtasks_result = self._decompose_task(task)
+        await self.handle_decompose_append_task(task)
 
-        # Handle both streaming and non-streaming results
-        if isinstance(subtasks_result, Generator):
-            # This is a generator (streaming mode)
-            subtasks = []
-            for new_tasks in subtasks_result:
-                subtasks.extend(new_tasks)
-        else:
-            # This is a regular list (non-streaming mode)
-            subtasks = subtasks_result
-        if subtasks:
-            # If decomposition happened, the original task becomes a container.
-            # We only execute its subtasks.
-            self._pending_tasks.extendleft(reversed(subtasks))
-        else:
-            # If no decomposition, execute the original task.
-            self._pending_tasks.append(task)
         self.set_channel(TaskChannel())
 
         # Save initial snapshot
@@ -1497,6 +1903,9 @@ class Workforce(BaseNode):
                     start_coroutine, self._loop
                 )
                 self._child_listening_tasks.append(child_task)
+            else:
+                # Close the coroutine to prevent RuntimeWarning
+                start_coroutine.close()
         else:
             # Close the coroutine to prevent RuntimeWarning
             start_coroutine.close()
@@ -1506,6 +1915,7 @@ class Workforce(BaseNode):
         description: str,
         worker: ChatAgent,
         pool_max_size: int = DEFAULT_WORKER_POOL_SIZE,
+        enable_workflow_memory: bool = False,
     ) -> Workforce:
         r"""Add a worker node to the workforce that uses a single agent.
         Can be called when workforce is paused to dynamically add workers.
@@ -1515,6 +1925,9 @@ class Workforce(BaseNode):
             worker (ChatAgent): The agent to be added.
             pool_max_size (int): Maximum size of the agent pool.
                 (default: :obj:`10`)
+            enable_workflow_memory (bool): Whether to enable workflow memory
+                accumulation. Set to True if you plan to call
+                save_workflow_memories(). (default: :obj:`False`)
 
         Returns:
             Workforce: The workforce node itself.
@@ -1541,6 +1954,8 @@ class Workforce(BaseNode):
             worker=worker,
             pool_max_size=pool_max_size,
             use_structured_output_handler=self.use_structured_output_handler,
+            context_utility=None,  # Will be set during save/load operations
+            enable_workflow_memory=enable_workflow_memory,
         )
         self._children.append(worker_node)
 
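Workers opt in to workflow-memory accumulation at registration time via the new flag; a minimal sketch (the `analyst_agent` object is assumed):

```python
workforce.add_single_agent_worker(
    "data_analyst: analyzes CSV files and drafts summaries",
    worker=analyst_agent,
    enable_workflow_memory=True,  # required for save_workflow_memories() later
)
```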
@@ -1696,6 +2111,7 @@ class Workforce(BaseNode):
         # Reset intervention state
         self._state = WorkforceState.IDLE
         self._stop_requested = False
+        self._skip_requested = False
         # Handle asyncio.Event in a thread-safe way
         if self._loop and not self._loop.is_closed():
             # If we have a loop, use it to set the event safely
@@ -1716,6 +2132,237 @@ class Workforce(BaseNode):
         else:
             self.metrics_logger = WorkforceLogger(workforce_id=self.node_id)
 
+    def save_workflow_memories(self) -> Dict[str, str]:
+        r"""Save workflow memories for all SingleAgentWorker instances in the
+        workforce.
+
+        This method iterates through all child workers and triggers workflow
+        saving for SingleAgentWorker instances using their
+        save_workflow_memories()
+        method.
+        Other worker types are skipped.
+
+        Returns:
+            Dict[str, str]: Dictionary mapping worker node IDs to save results.
+                Values are either file paths (success) or error messages
+                (failure).
+
+        Example:
+            >>> workforce = Workforce("My Team")
+            >>> # ... add workers and process tasks ...
+            >>> results = workforce.save_workflows()
+            >>> print(results)
+            {'worker_123': '/path/to/data_analyst_workflow_20250122.md',
+             'worker_456': 'error: No conversation context available'}
+        """
+        results = {}
+
+        # Get or create shared context utility for this save operation
+        shared_context_utility = self._get_or_create_shared_context_utility()
+
+        for child in self._children:
+            if isinstance(child, SingleAgentWorker):
+                try:
+                    # Set the shared context utility for this operation
+                    child._shared_context_utility = shared_context_utility
+                    child.worker.set_context_utility(shared_context_utility)
+
+                    result = child.save_workflow_memories()
+                    if result.get("status") == "success":
+                        results[child.node_id] = result.get(
+                            "file_path", "unknown_path"
+                        )
+                    else:
+                        # Error: check if there's a separate message field,
+                        # otherwise use the status itself
+                        error_msg = result.get(
+                            "message", result.get("status", "Unknown error")
+                        )
+                        results[child.node_id] = f"error: {error_msg}"
+
+                except Exception as e:
+                    results[child.node_id] = f"error: {e!s}"
+            else:
+                # Skip non-SingleAgentWorker types
+                results[child.node_id] = (
+                    f"skipped: {type(child).__name__} not supported"
+                )
+
+        logger.info(f"Workflow save completed for {len(results)} workers")
+        return results
+
+    def load_workflow_memories(
+        self,
+        max_files_to_load: int = 3,
+        session_id: Optional[str] = None,
+    ) -> Dict[str, bool]:
+        r"""Load workflow memories for all SingleAgentWorker instances in the
+        workforce.
+
+        This method iterates through all child workers and loads relevant
+        workflow files for SingleAgentWorker instances using their
+        load_workflow_memories()
+        method. Workers match files based on their description names.
+
+        Args:
+            max_files_to_load (int): Maximum number of workflow files to load
+                per worker. (default: :obj:`3`)
+            session_id (Optional[str]): Specific workforce session ID to load
+                from. If None, searches across all sessions.
+                (default: :obj:`None`)
+
+        Returns:
+            Dict[str, bool]: Dictionary mapping worker node IDs to load
+                success status.
+                True indicates successful loading, False indicates failure.
+
+        Example:
+            >>> workforce = Workforce("My Team")
+            >>> workforce.add_single_agent_worker(
+            ...     "data_analyst", analyst_agent
+            ... )
+            >>> success_status = workforce.load_workflows()
+            >>> print(success_status)
+            {'worker_123': True}  # Successfully loaded workflows for
+                                  # data_analyst
+        """
+        results = {}
+
+        # For loading, we don't create a new session - instead we search
+        # existing ones
+        # Each worker will search independently across all existing sessions
+
+        # First, load workflows for SingleAgentWorker instances
+        for child in self._children:
+            if isinstance(child, SingleAgentWorker):
+                try:
+                    # For loading, don't set shared context utility
+                    # Let each worker search across existing sessions
+                    success = child.load_workflow_memories(
+                        max_files_to_load=max_files_to_load,
+                        session_id=session_id,
+                    )
+                    results[child.node_id] = success
+
+                except Exception as e:
+                    logger.error(
+                        f"Failed to load workflow for {child.node_id}: {e!s}"
+                    )
+                    results[child.node_id] = False
+            else:
+                # Skip non-SingleAgentWorker types
+                results[child.node_id] = False
+
+        # Load aggregated workflow summaries for coordinator and task agents
+        self._load_management_agent_workflows(max_files_to_load, session_id)
+
+        logger.info(f"Workflow load completed for {len(results)} workers")
+        return results
+
+    def _load_management_agent_workflows(
+        self, max_files_to_load: int, session_id: Optional[str] = None
+    ) -> None:
+        r"""Load workflow summaries for coordinator and task planning agents.
+
+        This method loads aggregated workflow summaries to help:
+        - Coordinator agent: understand task assignment patterns and worker
+          capabilities
+        - Task agent: understand task decomposition patterns and
+          successful strategies
+
+        Args:
+            max_files_to_load (int): Maximum number of workflow files to load.
+            session_id (Optional[str]): Specific session ID to load from.
+                If None, searches across all sessions.
+        """
+        try:
+            import glob
+            import os
+            from pathlib import Path
+
+            from camel.utils.context_utils import ContextUtility
+
+            # For loading management workflows, search across all sessions
+            camel_workdir = os.environ.get("CAMEL_WORKDIR")
+            if camel_workdir:
+                base_dir = os.path.join(camel_workdir, "workforce_workflows")
+            else:
+                base_dir = "workforce_workflows"
+
+            # Search for workflow files in specified or all session directories
+            if session_id:
+                search_path = str(
+                    Path(base_dir) / session_id / "*_workflow*.md"
+                )
+            else:
+                search_path = str(Path(base_dir) / "*" / "*_workflow*.md")
+            workflow_files = glob.glob(search_path)
+
+            if not workflow_files:
+                logger.info(
+                    "No workflow files found for management agent context"
+                )
+                return
+
+            # Sort by modification time (most recent first)
+            workflow_files.sort(
+                key=lambda x: os.path.getmtime(x), reverse=True
+            )
+
+            # Load workflows for coordinator agent (up to 5 most recent)
+            coordinator_loaded = 0
+            for file_path in workflow_files[:max_files_to_load]:
+                try:
+                    filename = os.path.basename(file_path).replace('.md', '')
+                    session_dir = os.path.dirname(file_path)
+                    session_id = os.path.basename(session_dir)
+
+                    # Use shared context utility with specific session
+                    temp_utility = ContextUtility.get_workforce_shared(
+                        session_id
+                    )
+
+                    status = temp_utility.load_markdown_context_to_memory(
+                        self.coordinator_agent, filename
+                    )
+                    if "Context appended" in status:
+                        coordinator_loaded += 1
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to load coordinator workflow {file_path}: {e}"
+                    )
+
+            # Load workflows for task agent (up to 3 most recent)
+            task_agent_loaded = 0
+            for file_path in workflow_files[:max_files_to_load]:
+                try:
+                    filename = os.path.basename(file_path).replace('.md', '')
+                    session_dir = os.path.dirname(file_path)
+                    session_id = os.path.basename(session_dir)
+
+                    # Use shared context utility with specific session
+                    temp_utility = ContextUtility.get_workforce_shared(
+                        session_id
+                    )
+
+                    status = temp_utility.load_markdown_context_to_memory(
+                        self.task_agent, filename
+                    )
+                    if "Context appended" in status:
+                        task_agent_loaded += 1
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to load task agent workflow {file_path}: {e}"
+                    )
+
+            logger.info(
+                f"Loaded {coordinator_loaded} workflows for coordinator, "
+                f"{task_agent_loaded} workflows for task agent"
+            )
+
+        except Exception as e:
+            logger.error(f"Error loading management agent workflows: {e}")
+
     @check_if_running(False)
     def set_channel(self, channel: TaskChannel) -> None:
         r"""Set the channel for the node and all the child nodes under it."""
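End-to-end sketch of the new workflow-memory API added above: save per-worker summaries after a run, then restore them (plus the coordinator and task-agent context) in a later session. A configured `workforce` is assumed:

```python
# After processing tasks, persist per-worker workflow summaries.
save_results = workforce.save_workflow_memories()
print(save_results)   # {worker_id: markdown_path_or_"error: ...", ...}

# In a new session, reload up to three summaries per worker, plus the
# aggregated summaries for the coordinator and task-planning agents.
load_status = workforce.load_workflow_memories(max_files_to_load=3)
print(load_status)    # {worker_id: True/False, ...}
```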
@@ -2063,8 +2710,40 @@ class Workforce(BaseNode):
             TaskAssignResult: Assignment result containing task assignments
                 with their dependencies.
         """
+        # Wait for workers to be ready before assignment with exponential
+        # backoff
+        worker_readiness_timeout = 2.0  # Maximum wait time in seconds
+        worker_readiness_check_interval = 0.05  # Initial check interval
+        start_time = time.time()
+        check_interval = worker_readiness_check_interval
+        backoff_multiplier = 1.5  # Exponential backoff factor
+        max_interval = 0.5  # Cap the maximum interval
+
+        while (time.time() - start_time) < worker_readiness_timeout:
+            valid_worker_ids = self._get_valid_worker_ids()
+            if len(valid_worker_ids) > 0:
+                elapsed = time.time() - start_time
+                logger.debug(
+                    f"Workers ready after {elapsed:.3f}s: "
+                    f"{len(valid_worker_ids)} workers available"
+                )
+                break
+
+            await asyncio.sleep(check_interval)
+            # Exponential backoff with cap
+            check_interval = min(
+                check_interval * backoff_multiplier, max_interval
+            )
+        else:
+            # Timeout reached, log warning but continue
+            logger.warning(
+                f"Worker readiness timeout after "
+                f"{worker_readiness_timeout}s, "
+                f"proceeding with {len(self._children)} children"
+            )
+            valid_worker_ids = self._get_valid_worker_ids()
+
         self.coordinator_agent.reset()
-        valid_worker_ids = self._get_valid_worker_ids()

         logger.debug(
             f"Sending batch assignment request to coordinator "
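Note: the worker-readiness wait added above polls `_get_valid_worker_ids()` with a geometrically growing sleep, capped at 0.5s, and falls through after 2s. A minimal stand-alone sketch of the same backoff pattern (the predicate here is a placeholder, not the workforce API):

    import asyncio
    import time

    async def wait_until(predicate, timeout=2.0, initial=0.05,
                         factor=1.5, cap=0.5):
        # Poll until `predicate()` is true or `timeout` elapses, sleeping
        # initial, initial*factor, ... seconds, never longer than `cap`.
        start, interval = time.time(), initial
        while time.time() - start < timeout:
            if predicate():
                return True
            await asyncio.sleep(interval)
            interval = min(interval * factor, cap)
        return False

    # asyncio.run(wait_until(lambda: True))  -> True on the first check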
@@ -2098,7 +2777,24 @@ class Workforce(BaseNode):
                     invalid_assignments, tasks, valid_worker_ids
                 )
             )
-
+
+        # Combine assignments with deduplication, prioritizing retry results
+        assignment_map = {a.task_id: a for a in valid_assignments}
+        assignment_map.update(
+            {a.task_id: a for a in retry_and_fallback_assignments}
+        )
+        all_assignments = list(assignment_map.values())
+
+        # Log any overwrites for debugging
+        valid_task_ids = {a.task_id for a in valid_assignments}
+        retry_task_ids = {a.task_id for a in retry_and_fallback_assignments}
+        overlap_task_ids = valid_task_ids & retry_task_ids
+
+        if overlap_task_ids:
+            logger.warning(
+                f"Retry assignments overrode {len(overlap_task_ids)} "
+                f"valid assignments for tasks: {sorted(overlap_task_ids)}"
+            )

         # Update Task.dependencies for all final assignments
         self._update_task_dependencies_from_assignments(all_assignments, tasks)
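Note: merging the two assignment lists through a dict keyed by `task_id` means a retry/fallback assignment silently replaces an earlier valid one for the same task; the overlap set is only logged. A toy illustration with plain tuples (the real assignment objects are not shown in this hunk):

    valid = [("t1", "worker_a"), ("t2", "worker_b")]
    retry = [("t2", "worker_c")]  # produced by the retry/fallback path

    merged = {task_id: worker for task_id, worker in valid}
    merged.update({task_id: worker for task_id, worker in retry})

    assert merged == {"t1": "worker_a", "t2": "worker_c"}  # retry wins
    overlap = {t for t, _ in valid} & {t for t, _ in retry}
    print(f"retry overrode {len(overlap)} assignment(s): {sorted(overlap)}")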
@@ -2176,8 +2872,7 @@ class Workforce(BaseNode):
                 "worker creation"
             )
             new_node_conf = WorkerConf(
-                description=f"Fallback worker for task: "
-                f"{task.content}",
+                description=f"Fallback worker for task: {task.content}",
                 role="General Assistant",
                 sys_msg="You are a general assistant that can help "
                 "with various tasks.",
@@ -2187,7 +2882,7 @@ class Workforce(BaseNode):
                 response.msg.content,
                 schema=WorkerConf,
                 fallback_values={
-                    "description": f"Worker for task:
+                    "description": f"Worker for task: {task.content}",
                     "role": "Task Specialist",
                     "sys_msg": f"You are a specialist for: {task.content}",
                 },
@@ -2215,8 +2910,7 @@ class Workforce(BaseNode):
             )
             # Create a fallback worker configuration
             new_node_conf = WorkerConf(
-                description=f"Fallback worker for "
-                f"task: {task.content}",
+                description=f"Fallback worker for task: {task.content}",
                 role="General Assistant",
                 sys_msg="You are a general assistant that can help "
                 "with various tasks.",
@@ -2310,6 +3004,9 @@ class Workforce(BaseNode):
         r"""Get the task that's published by this node and just get returned
         from the assignee. Includes timeout handling to prevent indefinite
         waiting.
+
+        Raises:
+            asyncio.TimeoutError: If waiting for task exceeds timeout
         """
         try:
             # Add timeout to prevent indefinite waiting
@@ -2317,6 +3014,17 @@ class Workforce(BaseNode):
                 self._channel.get_returned_task_by_publisher(self.node_id),
                 timeout=self.task_timeout_seconds,
             )
+        except asyncio.TimeoutError:
+            # Re-raise timeout errors to be handled by caller
+            # This prevents hanging when tasks are stuck
+            logger.warning(
+                f"Timeout waiting for task return in workforce "
+                f"{self.node_id}. "
+                f"Timeout: {self.task_timeout_seconds}s, "
+                f"Pending tasks: {len(self._pending_tasks)}, "
+                f"In-flight tasks: {self._in_flight_tasks}"
+            )
+            raise
         except Exception as e:
             error_msg = (
                 f"Error getting returned task {e} in "
@@ -2335,7 +3043,15 @@ class Workforce(BaseNode):
         tasks_to_assign = [
             task
             for task in self._pending_tasks
-            if
+            if (
+                task.id not in self._task_dependencies
+                and (
+                    task.additional_info is None
+                    or not task.additional_info.get(
+                        "_needs_decomposition", False
+                    )
+                )
+            )
         ]
         if tasks_to_assign:
             logger.debug(
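Note: `_get_returned_task` now re-raises `asyncio.TimeoutError` after logging, so the listening loop further below can decide whether to break or keep polling. A self-contained sketch of that wait-log-reraise pattern, with a dummy coroutine standing in for the channel call:

    import asyncio

    async def slow_channel_read():
        await asyncio.sleep(10)  # stand-in for a stuck task return
        return "task"

    async def get_with_timeout(timeout=0.1):
        try:
            return await asyncio.wait_for(slow_channel_read(), timeout=timeout)
        except asyncio.TimeoutError:
            print("timed out waiting for a returned task")
            raise  # let the caller choose break vs. continue

    # asyncio.run(get_with_timeout())  -> raises asyncio.TimeoutError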
@@ -2371,21 +3087,141 @@ class Workforce(BaseNode):
         for task in self._pending_tasks:
             # A task must be assigned to be considered for posting
             if task.id in self._task_dependencies:
+                # Skip if task has already been posted to prevent duplicates
+                try:
+                    task_from_channel = await self._channel.get_task_by_id(
+                        task.id
+                    )
+                    # Check if task is already assigned to a worker
+                    if (
+                        task_from_channel
+                        and task_from_channel.assigned_worker_id
+                    ):
+                        logger.debug(
+                            f"Task {task.id} already assigned to "
+                            f"{task_from_channel.assigned_worker_id}, "
+                            f"skipping to prevent duplicate"
+                        )
+                        continue
+                except Exception as e:
+                    logger.info(
+                        f"Task {task.id} non existent in channel. "
+                        f"Assigning task: {e}"
+                    )
                 dependencies = self._task_dependencies[task.id]
-
-                #
-
-                    dep_id in completed_tasks_info
-
-
-
-
-
-
-
+
+                # Check if all dependencies are in completed state
+                all_deps_completed = all(
+                    dep_id in completed_tasks_info for dep_id in dependencies
+                )
+
+                # Only proceed with dependency checks if all deps are completed
+                if all_deps_completed:
+                    # Check if all dependencies succeeded (state is DONE)
+                    all_deps_done = all(
+                        completed_tasks_info[dep_id] == TaskState.DONE
+                        for dep_id in dependencies
                     )
-
-
+
+                    # Check if any dependency failed
+                    any_dep_failed = any(
+                        completed_tasks_info[dep_id] == TaskState.FAILED
+                        for dep_id in dependencies
+                    )
+
+                    if all_deps_done:
+                        # All dependencies completed successfully - post the
+                        # task
+                        assignee_id = self._assignees[task.id]
+                        logger.debug(
+                            f"Posting task {task.id} to "
+                            f"assignee {assignee_id}. "
+                            f"Dependencies met."
+                        )
+                        await self._post_task(task, assignee_id)
+                        posted_tasks.append(task)
+                    elif any_dep_failed:
+                        # Check if any failed dependencies can still be retried
+                        failed_deps = [
+                            dep_id
+                            for dep_id in dependencies
+                            if completed_tasks_info[dep_id] == TaskState.FAILED
+                        ]
+
+                        # Check if any failed dependency is still retryable
+                        failed_tasks_with_retry_potential = []
+                        permanently_failed_deps = []
+
+                        for dep_id in failed_deps:
+                            # Find the failed dependency task
+                            failed_task = next(
+                                (
+                                    t
+                                    for t in self._completed_tasks
+                                    if t.id == dep_id
+                                ),
+                                None,
+                            )
+                            if (
+                                failed_task
+                                and failed_task.failure_count
+                                < MAX_TASK_RETRIES
+                            ):
+                                failed_tasks_with_retry_potential.append(
+                                    dep_id
+                                )
+                            else:
+                                permanently_failed_deps.append(dep_id)
+
+                        # Only fail the task if ALL dependencies are
+                        # permanently failed
+                        if (
+                            permanently_failed_deps
+                            and not failed_tasks_with_retry_potential
+                        ):
+                            logger.error(
+                                f"Task {task.id} cannot proceed: dependencies "
+                                f"{permanently_failed_deps} have "
+                                f"permanently failed. "
+                                f"Marking task as failed."
+                            )
+                            task.state = TaskState.FAILED
+                            task.result = (
+                                f"Task failed due to permanently "
+                                f"failed dependencies: "
+                                f"{permanently_failed_deps}"
+                            )
+
+                            # Log the failure to metrics
+                            if self.metrics_logger:
+                                self.metrics_logger.log_task_failed(
+                                    task_id=task.id,
+                                    worker_id=task.assigned_worker_id
+                                    or "unknown",
+                                    error_message=task.result,
+                                    metadata={
+                                        'failure_reason': (
+                                            'dependency_failure'
+                                        ),
+                                        'failed_dependencies': (
+                                            permanently_failed_deps
+                                        ),
+                                    },
+                                )
+
+                            self._completed_tasks.append(task)
+                            self._cleanup_task_tracking(task.id)
+                            posted_tasks.append(task)  # Remove from pending
+                        else:
+                            # Some dependencies may still be retried, keep
+                            # task pending
+                            logger.debug(
+                                f"Task {task.id} waiting: dependencies "
+                                f"{failed_tasks_with_retry_potential} "
+                                f"failed but may be retried "
+                                f"(attempt < {MAX_TASK_RETRIES})"
+                            )
+                # else: Not all dependencies completed yet, skip this task

         # Step 3: Remove the posted tasks from the pending list
         for task in posted_tasks:
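Note: the posting logic above releases a task only when every dependency has finished, then distinguishes three outcomes: all dependencies DONE (post), some FAILED but still retryable (keep waiting), or all failed permanently (fail the task). A condensed sketch of that three-way decision over plain dicts (state names and the retry limit are placeholders):

    from enum import Enum

    class State(Enum):
        DONE = "done"
        FAILED = "failed"

    MAX_RETRIES = 3

    def decide(deps, finished, failure_counts):
        # deps: dependency ids; finished: id -> State for completed deps.
        if not all(d in finished for d in deps):
            return "wait"  # something is still running
        if all(finished[d] is State.DONE for d in deps):
            return "post"  # every dependency succeeded
        failed = [d for d in deps if finished[d] is State.FAILED]
        if all(failure_counts.get(d, 0) >= MAX_RETRIES for d in failed):
            return "fail"  # nothing left to retry
        return "wait"      # a failed dependency may still be retried

    print(decide(["a", "b"], {"a": State.DONE, "b": State.FAILED}, {"b": 1}))
    # -> "wait", because "b" still has retries left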
@@ -2397,21 +3233,30 @@ class Workforce(BaseNode):
                 pass

     async def _handle_failed_task(self, task: Task) -> bool:
+        r"""Handle a task that failed during execution.
+
+        Args:
+            task (Task): The failed task
+
+        Returns:
+            bool: True if workforce should halt, False otherwise
+        """
         task.failure_count += 1

         # Determine detailed failure information
-        # Use the actual error/result stored in task.result
         failure_reason = task.result or "Unknown error"
-
-        # Add context about the worker and task
         worker_id = task.assigned_worker_id or "unknown"
-
-
-        detailed_error = f"{failure_reason}{worker_info}"
+        detailed_error = f"{failure_reason} (assigned to worker: {worker_id})"

         logger.error(
             f"Task {task.id} failed (attempt "
-            f"{task.failure_count}/
+            f"{task.failure_count}/{MAX_TASK_RETRIES}): {detailed_error}"
+        )
+
+        print(
+            f"{Fore.RED}❌ Task {task.id} failed "
+            f"(attempt {task.failure_count}/{MAX_TASK_RETRIES}): "
+            f"{failure_reason}{Fore.RESET}"
         )

         if self.metrics_logger:
@@ -2426,24 +3271,20 @@ class Workforce(BaseNode):
                 },
             )

-        # Check for immediate halt conditions
-        # should halt
+        # Check for immediate halt conditions
        if task.failure_count >= MAX_TASK_RETRIES:
             logger.error(
                 f"Task {task.id} has exceeded maximum retry attempts "
-                f"({MAX_TASK_RETRIES}). Final failure "
-                f"
+                f"({MAX_TASK_RETRIES}). Final failure reason: "
+                f"{detailed_error}. "
                 f"Task content: '{task.content}'"
             )
             self._cleanup_task_tracking(task.id)
-            # Mark task as completed for dependency tracking before halting
             self._completed_tasks.append(task)
             if task.id in self._assignees:
                 await self._channel.archive_task(task.id)
             return True

-        # If too many tasks are failing rapidly, also halt to prevent infinite
-        # loops
         if len(self._pending_tasks) > MAX_PENDING_TASKS_LIMIT:
             logger.error(
                 f"Too many pending tasks ({len(self._pending_tasks)} > "
@@ -2451,18 +3292,24 @@ class Workforce(BaseNode):
                 f"explosion. Last failed task: {task.id}"
             )
             self._cleanup_task_tracking(task.id)
-            # Mark task as completed for dependency tracking before halting
             self._completed_tasks.append(task)
             if task.id in self._assignees:
                 await self._channel.archive_task(task.id)
             return True

         # Use intelligent failure analysis to decide recovery strategy
-        recovery_decision = self.
+        recovery_decision = self._analyze_task(
+            task, for_failure=True, error_message=detailed_error
+        )

+        strategy_str = (
+            recovery_decision.recovery_strategy.value
+            if recovery_decision.recovery_strategy
+            else "none"
+        )
         logger.info(
             f"Task {task.id} failure "
-            f"analysis: {
+            f"analysis: {strategy_str} - "
             f"{recovery_decision.reasoning}"
         )

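Note: the reworked `_handle_failed_task` bumps a per-task failure counter, builds a detailed error string, and halts either when a task exhausts MAX_TASK_RETRIES or when failures have flooded the pending queue past MAX_PENDING_TASKS_LIMIT. A stripped-down sketch of just that halt decision (the constant values here are placeholders):

    MAX_TASK_RETRIES = 3
    MAX_PENDING_TASKS_LIMIT = 100

    def should_halt(failure_count: int, pending_count: int) -> bool:
        # Halt on a task that keeps failing, or when failures have
        # inflated the pending queue beyond the safety limit.
        return (
            failure_count >= MAX_TASK_RETRIES
            or pending_count > MAX_PENDING_TASKS_LIMIT
        )

    assert should_halt(3, 5) and should_halt(1, 500) and not should_halt(1, 5)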
@@ -2471,105 +3318,23 @@ class Workforce(BaseNode):
             await self._channel.archive_task(task.id)
         self._cleanup_task_tracking(task.id)

+        # Apply recovery strategy
         try:
-
-
-
-                    assignee_id = self._assignees[task.id]
-                    await self._post_task(task, assignee_id)
-                    action_taken = f"retried with same worker {assignee_id}"
-                else:
-                    # Find a new assignee and retry
-                    batch_result = await self._find_assignee([task])
-                    assignment = batch_result.assignments[0]
-                    self._assignees[task.id] = assignment.assignee_id
-                    await self._post_task(task, assignment.assignee_id)
-                    action_taken = (
-                        f"retried with new worker {assignment.assignee_id}"
-                    )
-
-            elif recovery_decision.strategy == RecoveryStrategy.REPLAN:
-                # Modify the task content and retry
-                if recovery_decision.modified_task_content:
-                    task.content = recovery_decision.modified_task_content
-                    logger.info(f"Task {task.id} content modified for replan")
-
-                # Repost the modified task
-                if task.id in self._assignees:
-                    assignee_id = self._assignees[task.id]
-                    await self._post_task(task, assignee_id)
-                    action_taken = (
-                        f"replanned and retried with worker {assignee_id}"
-                    )
-                else:
-                    # Find a new assignee for the replanned task
-                    batch_result = await self._find_assignee([task])
-                    assignment = batch_result.assignments[0]
-                    self._assignees[task.id] = assignment.assignee_id
-                    await self._post_task(task, assignment.assignee_id)
-                    action_taken = (
-                        f"replanned and assigned to "
-                        f"worker {assignment.assignee_id}"
-                    )
-
-            elif recovery_decision.strategy == RecoveryStrategy.DECOMPOSE:
-                # Decompose the task into subtasks
-                subtasks_result = self._decompose_task(task)
-
-                # Handle both streaming and non-streaming results
-                if isinstance(subtasks_result, Generator):
-                    # This is a generator (streaming mode)
-                    subtasks = []
-                    for new_tasks in subtasks_result:
-                        subtasks.extend(new_tasks)
-                else:
-                    # This is a regular list (non-streaming mode)
-                    subtasks = subtasks_result
-                if self.metrics_logger and subtasks:
-                    self.metrics_logger.log_task_decomposed(
-                        parent_task_id=task.id,
-                        subtask_ids=[st.id for st in subtasks],
-                    )
-                    for subtask in subtasks:
-                        self.metrics_logger.log_task_created(
-                            task_id=subtask.id,
-                            description=subtask.content,
-                            parent_task_id=task.id,
-                            task_type=subtask.type,
-                            metadata=subtask.additional_info,
-                        )
-                # Insert packets at the head of the queue
-                self._pending_tasks.extendleft(reversed(subtasks))
-
-                await self._post_ready_tasks()
-                action_taken = f"decomposed into {len(subtasks)} subtasks"
-
-                logger.debug(
-                    f"Task {task.id} failed and was {action_taken}. "
-                    f"Dependencies updated for subtasks."
-                )
-
-                # Sync shared memory after task decomposition
-                if self.share_memory:
-                    logger.info(
-                        f"Syncing shared memory after "
-                        f"task {task.id} decomposition"
-                    )
-                    self._sync_shared_memory()
+            is_decompose = await self._apply_recovery_strategy(
+                task, recovery_decision
+            )

-
-
+            # For decompose, we handle it specially
+            if is_decompose:
+                # Task was decomposed, add to completed tasks
+                self._completed_tasks.append(task)
                 return False

-            elif recovery_decision.strategy == RecoveryStrategy.CREATE_WORKER:
-                assignee = await self._create_worker_node_for_task(task)
-                await self._post_task(task, assignee.node_id)
-                action_taken = (
-                    f"created new worker {assignee.node_id} and assigned "
-                    f"task {task.id} to it"
-                )
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Recovery strategy failed for task {task.id}: {e}",
+                exc_info=True,
+            )
             # If max retries reached, halt the workforce
             if task.failure_count >= MAX_TASK_RETRIES:
                 self._completed_tasks.append(task)
@@ -2577,18 +3342,17 @@ class Workforce(BaseNode):
                 self._completed_tasks.append(task)
             return False

+        # Task is being retried - don't add to completed tasks
+        # It will be added when it actually completes or permanently fails
         logger.debug(
-            f"Task {task.id}
-            f"
+            f"Task {task.id} is being retried (strategy: "
+            f"{recovery_decision.recovery_strategy}). "
+            f"Not adding to completed tasks until final outcome."
         )
-        # Mark task as completed for dependency tracking
-        self._completed_tasks.append(task)

-        # Sync shared memory after task
+        # Sync shared memory after task recovery
         if self.share_memory:
-            logger.info(
-                f"Syncing shared memory after task {task.id} completion"
-            )
+            logger.info(f"Syncing shared memory after task {task.id} recovery")
             self._sync_shared_memory()

         # Check if any pending tasks are now ready to execute
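Note: the long inline RETRY/REPLAN/DECOMPOSE/CREATE_WORKER branches are replaced by a single `_apply_recovery_strategy(...)` call whose body is not part of this hunk; only its boolean "was decomposed" result is visible here. A hypothetical sketch of what such a dispatcher could look like (strategy names mirror the removed branches; this is not the package's implementation):

    from enum import Enum

    class Strategy(Enum):
        RETRY = "retry"
        REPLAN = "replan"
        DECOMPOSE = "decompose"
        CREATE_WORKER = "create_worker"

    async def apply_recovery(task_id: str, strategy: Strategy) -> bool:
        # Return True only for decomposition, matching the `is_decompose`
        # flag the caller checks in the diff above.
        if strategy is Strategy.RETRY:
            print(f"repost {task_id} to its current or a new worker")
        elif strategy is Strategy.REPLAN:
            print(f"rewrite {task_id} content, then repost")
        elif strategy is Strategy.CREATE_WORKER:
            print(f"spawn a fallback worker for {task_id}")
        elif strategy is Strategy.DECOMPOSE:
            print(f"split {task_id} into subtasks")
            return True
        return False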
@@ -2793,6 +3557,124 @@ class Workforce(BaseNode):
         # Use logger.info or print, consistent with existing style
         logger.info(f"Workforce logs dumped to {file_path}")

+    async def _handle_skip_task(self) -> bool:
+        r"""Handle skip request by marking pending and in-flight tasks
+        as completed.
+
+        Returns:
+            bool: True if workforce should stop (no independent tasks),
+                False to continue.
+        """
+        logger.info("Skip requested, processing skip logic.")
+
+        # Mark all pending tasks as completed instead of just clearing
+        pending_tasks_to_complete = list(self._pending_tasks)
+        if pending_tasks_to_complete:
+            logger.info(
+                f"Marking {len(pending_tasks_to_complete)} pending tasks "
+                f"as completed."
+            )
+            for task in pending_tasks_to_complete:
+                # Don't remove tasks that need decomposition
+                if task.additional_info and task.additional_info.get(
+                    '_needs_decomposition', False
+                ):
+                    continue
+                # Set task state to DONE and add a completion message
+                task.state = TaskState.DONE
+                task.result = "Task marked as completed due to skip request"
+
+                # Use the existing handle completed task function
+                await self._handle_completed_task(task)
+
+        # Handle in-flight tasks if they exist
+        if self._in_flight_tasks > 0:
+            logger.info(
+                f"Found {self._in_flight_tasks} in-flight tasks. "
+                f"Retrieving and completing them."
+            )
+            try:
+                # Get all in-flight tasks for this publisher from the channel
+                in_flight_tasks = await self._channel.get_in_flight_tasks(
+                    self.node_id
+                )
+                logger.info(
+                    f"Retrieved {len(in_flight_tasks)} in-flight "
+                    f"tasks from channel."
+                )
+
+                for task in in_flight_tasks:
+                    # Set task state to DONE and add a completion message
+                    task.state = TaskState.DONE
+                    task.result = (
+                        "Task marked as completed due to skip request"
+                    )
+
+                    # Remove the task from the channel to avoid hanging
+                    await self._channel.remove_task(task.id)
+
+                    # Decrement in-flight counter
+                    self._decrement_in_flight_tasks(
+                        task.id, "skip request - removed from channel"
+                    )
+
+                    # Handle as completed task to update dependencies
+                    await self._handle_completed_task(task)
+
+                    logger.info(
+                        f"Completed in-flight task {task.id} due "
+                        f"to skip request."
+                    )
+
+            except Exception as e:
+                logger.error(
+                    f"Error handling in-flight tasks during skip: {e}",
+                    exc_info=True,
+                )
+                # Reset in-flight counter to prevent hanging
+                self._in_flight_tasks = 0
+
+        # Check if there are any pending tasks (including those needing
+        # decomposition)
+        if self._pending_tasks:
+            # Check if the first pending task needs decomposition
+            next_task = self._pending_tasks[0]
+            if next_task.additional_info and next_task.additional_info.get(
+                '_needs_decomposition'
+            ):
+                logger.info(
+                    f"Decomposing main task {next_task.id} after skip request."
+                )
+                try:
+                    # Remove the decomposition flag to avoid re-decomposition
+                    next_task.additional_info['_needs_decomposition'] = False
+
+                    # Decompose the task and append subtasks to _pending_tasks
+                    await self.handle_decompose_append_task(
+                        next_task, reset=False
+                    )
+
+                    # Mark the main task as completed and remove from pending
+                    await self._handle_completed_task(next_task)
+                    logger.info(
+                        f"Main task {next_task.id} decomposed after "
+                        f"skip request"
+                    )
+                except Exception as e:
+                    logger.error(
+                        f"Error decomposing main task {next_task.id} "
+                        f"after skip: {e}",
+                        exc_info=True,
+                    )
+
+            logger.info("Pending tasks available after skip, continuing.")
+            await self._post_ready_tasks()
+            return False  # Continue processing
+        else:
+            # No pending tasks available, act like stop
+            logger.info("No pending tasks available, acting like stop.")
+            return True  # Stop processing
+
     @check_if_running(False)
     async def _listen_to_channel(self) -> None:
         r"""Continuously listen to the channel, post task to the channel and
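Note: `_handle_skip_task` drains the queue by force-completing every pending task except ones still flagged for decomposition, then clears in-flight work through the channel. A small sketch of the force-complete pass over a deque (`Task` here is a stand-in dataclass, not the CAMEL task class):

    from collections import deque
    from dataclasses import dataclass, field

    @dataclass
    class Task:
        id: str
        state: str = "open"
        result: str = ""
        info: dict = field(default_factory=dict)

    def skip_pending(pending):
        completed = 0
        for task in list(pending):
            if task.info.get("_needs_decomposition"):
                continue  # leave un-decomposed main tasks alone
            task.state = "done"
            task.result = "Task marked as completed due to skip request"
            completed += 1
        return completed

    queue = deque([Task("t1"), Task("t2", info={"_needs_decomposition": True})])
    print(skip_pending(queue))  # -> 1, only "t1" is force-completed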
@@ -2821,6 +3703,75 @@ class Workforce(BaseNode):
                 logger.info("Stop requested, breaking execution loop.")
                 break

+            # Check for skip request after potential pause
+            if self._skip_requested:
+                should_stop = await self._handle_skip_task()
+                if should_stop:
+                    self._stop_requested = True
+                    break
+
+                # Reset skip flag
+                self._skip_requested = False
+                continue
+
+            # Check if we should decompose a main task
+            # Only decompose when no tasks are in flight and pending queue
+            # is empty
+            if not self._pending_tasks and self._in_flight_tasks == 0:
+                # All tasks completed, will exit loop
+                break
+
+            # Check if the first pending task needs decomposition
+            # This happens when add_task(as_subtask=False) was called
+            if self._pending_tasks and self._in_flight_tasks == 0:
+                next_task = self._pending_tasks[0]
+                if (
+                    next_task.additional_info
+                    and next_task.additional_info.get(
+                        '_needs_decomposition'
+                    )
+                ):
+                    logger.info(f"Decomposing main task: {next_task.id}")
+                    try:
+                        # Remove the decomposition flag to avoid
+                        # re-decomposition
+                        next_task.additional_info[
+                            '_needs_decomposition'
+                        ] = False
+
+                        # Decompose the task and append subtasks to
+                        # _pending_tasks
+                        await self.handle_decompose_append_task(
+                            next_task, reset=False
+                        )
+
+                        # Mark the main task as completed (decomposition
+                        # successful) and Remove it from pending tasks
+                        await self._handle_completed_task(next_task)
+                        logger.info(
+                            f"Main task {next_task.id} decomposed and "
+                            f"ready for processing"
+                        )
+                    except Exception as e:
+                        logger.error(
+                            f"Error decomposing main task {next_task.id}: "
+                            f"{e}",
+                            exc_info=True,
+                        )
+                        # Revert back to the queue for retry later if
+                        # decomposition failed
+                        if not self._pending_tasks:
+                            self._pending_tasks.appendleft(next_task)
+                        else:
+                            logger.warning(
+                                "Pending tasks exist after decomposition "
+                                "error."
+                            )
+
+                    # Immediately assign and post the transferred tasks
+                    await self._post_ready_tasks()
+                    continue
+
             # Save snapshot before processing next task
             if self._pending_tasks:
                 current_task = self._pending_tasks[0]
@@ -2835,9 +3786,24 @@ class Workforce(BaseNode):
                 self._last_snapshot_time = time.time()

             # Get returned task
-
+            try:
+                returned_task = await self._get_returned_task()
+            except asyncio.TimeoutError:
+                # Handle timeout - check if we have tasks stuck in flight
+                if self._in_flight_tasks > 0:
+                    logger.warning(
+                        f"Timeout waiting for {self._in_flight_tasks} "
+                        f"in-flight tasks. Breaking to prevent hanging."
+                    )
+                    # Break the loop to prevent indefinite hanging
+                    # The finally block will handle cleanup
+                    break
+                else:
+                    # No tasks in flight, safe to continue
+                    await self._post_ready_tasks()
+                    continue

-            # If no task was returned, continue
+            # If no task was returned (other errors), continue
             if returned_task is None:
                 logger.debug(
                     f"No task returned in workforce {self.node_id}. "
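Note: on a timeout the listening loop now breaks only when work is actually stuck in flight; with nothing in flight it reposts ready tasks and keeps polling. A compact sketch of that break-vs-continue choice (the fetch and repost callables are stand-ins for the channel calls):

    import asyncio

    async def poll_once(in_flight, fetch, repost, timeout=0.05):
        # Returns the fetched task, or "break"/"continue" on timeout,
        # mimicking the control flow added to the loop above.
        try:
            return await asyncio.wait_for(fetch(), timeout=timeout)
        except asyncio.TimeoutError:
            if in_flight > 0:
                return "break"      # stuck work: leave the loop
            await repost()
            return "continue"       # nothing in flight: keep listening

    async def _demo():
        async def never_returns():
            await asyncio.sleep(1)
        async def repost():
            pass
        print(await poll_once(0, never_returns, repost))  # -> "continue"

    # asyncio.run(_demo())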
@@ -2896,11 +3862,88 @@ class Workforce(BaseNode):
                     )
                     continue
                 else:
-
-
-                        f"successfully.{Fore.RESET}"
+                    quality_eval = self._analyze_task(
+                        returned_task, for_failure=False
                     )
-
+
+                    if not quality_eval.quality_sufficient:
+                        logger.info(
+                            f"Task {returned_task.id} quality check: "
+                            f"score={quality_eval.quality_score}, "
+                            f"issues={quality_eval.issues}, "
+                            f"strategy={quality_eval.recovery_strategy}"
+                        )
+
+                        # Check retry limit before attempting recovery
+                        if returned_task.failure_count >= 2:
+                            print(
+                                f"{Fore.YELLOW}Task {returned_task.id} "
+                                f"completed with low quality score: "
+                                f"{quality_eval.quality_score} "
+                                f"(retry limit reached){Fore.RESET}"
+                            )
+                            await self._handle_completed_task(
+                                returned_task
+                            )
+                            continue
+
+                        # Print visual feedback for quality-failed tasks
+                        # with recovery strategy
+                        recovery_action = (
+                            quality_eval.recovery_strategy.value
+                            if quality_eval.recovery_strategy
+                            else ""
+                        )
+                        print(
+                            f"{Fore.YELLOW}⚠️ Task {returned_task.id} "
+                            f"failed quality check (score: "
+                            f"{quality_eval.quality_score}). "
+                            f"Issues: {', '.join(quality_eval.issues)}. "
+                            f"Recovery: {recovery_action}{Fore.RESET}"
+                        )
+
+                        # Mark as failed for recovery
+                        returned_task.failure_count += 1
+                        returned_task.state = TaskState.FAILED
+                        returned_task.result = (
+                            f"Quality insufficient (score: "
+                            f"{quality_eval.quality_score}). "
+                            f"Issues: {', '.join(quality_eval.issues)}"
+                        )
+
+                        # Clean up tracking before attempting recovery
+                        if returned_task.id in self._assignees:
+                            await self._channel.archive_task(
+                                returned_task.id
+                            )
+                        self._cleanup_task_tracking(returned_task.id)
+
+                        # Apply LLM-recommended recovery strategy
+                        try:
+                            is_decompose = (
+                                await self._apply_recovery_strategy(
+                                    returned_task, quality_eval
+                                )
+                            )
+
+                            # For decompose, cleanup happens in the method
+                            if is_decompose:
+                                continue
+
+                        except Exception as e:
+                            logger.error(
+                                f"Error handling quality-failed task "
+                                f"{returned_task.id}: {e}",
+                                exc_info=True,
+                            )
+                            continue
+                    else:
+                        print(
+                            f"{Fore.CYAN}Task {returned_task.id} "
+                            f"completed successfully (quality score: "
+                            f"{quality_eval.quality_score}).{Fore.RESET}"
+                        )
+                        await self._handle_completed_task(returned_task)
                 elif returned_task.state == TaskState.FAILED:
                     try:
                         halt = await self._handle_failed_task(returned_task)