camel-ai 0.2.73a4__py3-none-any.whl → 0.2.80a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. camel/__init__.py +1 -1
  2. camel/agents/_utils.py +38 -0
  3. camel/agents/chat_agent.py +2217 -519
  4. camel/agents/mcp_agent.py +30 -27
  5. camel/configs/__init__.py +15 -0
  6. camel/configs/aihubmix_config.py +88 -0
  7. camel/configs/amd_config.py +70 -0
  8. camel/configs/cometapi_config.py +104 -0
  9. camel/configs/minimax_config.py +93 -0
  10. camel/configs/nebius_config.py +103 -0
  11. camel/data_collectors/alpaca_collector.py +15 -6
  12. camel/datasets/base_generator.py +39 -10
  13. camel/environments/single_step.py +28 -3
  14. camel/environments/tic_tac_toe.py +1 -1
  15. camel/interpreters/__init__.py +2 -0
  16. camel/interpreters/docker/Dockerfile +3 -12
  17. camel/interpreters/e2b_interpreter.py +34 -1
  18. camel/interpreters/microsandbox_interpreter.py +395 -0
  19. camel/loaders/__init__.py +11 -2
  20. camel/loaders/chunkr_reader.py +9 -0
  21. camel/memories/agent_memories.py +48 -4
  22. camel/memories/base.py +26 -0
  23. camel/memories/blocks/chat_history_block.py +122 -4
  24. camel/memories/context_creators/score_based.py +25 -384
  25. camel/memories/records.py +88 -8
  26. camel/messages/base.py +153 -34
  27. camel/models/__init__.py +10 -0
  28. camel/models/aihubmix_model.py +83 -0
  29. camel/models/aiml_model.py +1 -16
  30. camel/models/amd_model.py +101 -0
  31. camel/models/anthropic_model.py +6 -19
  32. camel/models/aws_bedrock_model.py +2 -33
  33. camel/models/azure_openai_model.py +114 -89
  34. camel/models/base_audio_model.py +3 -1
  35. camel/models/base_model.py +32 -14
  36. camel/models/cohere_model.py +1 -16
  37. camel/models/cometapi_model.py +83 -0
  38. camel/models/crynux_model.py +1 -16
  39. camel/models/deepseek_model.py +1 -16
  40. camel/models/fish_audio_model.py +6 -0
  41. camel/models/gemini_model.py +36 -18
  42. camel/models/groq_model.py +1 -17
  43. camel/models/internlm_model.py +1 -16
  44. camel/models/litellm_model.py +1 -16
  45. camel/models/lmstudio_model.py +1 -17
  46. camel/models/minimax_model.py +83 -0
  47. camel/models/mistral_model.py +1 -16
  48. camel/models/model_factory.py +27 -1
  49. camel/models/modelscope_model.py +1 -16
  50. camel/models/moonshot_model.py +105 -24
  51. camel/models/nebius_model.py +83 -0
  52. camel/models/nemotron_model.py +0 -5
  53. camel/models/netmind_model.py +1 -16
  54. camel/models/novita_model.py +1 -16
  55. camel/models/nvidia_model.py +1 -16
  56. camel/models/ollama_model.py +4 -19
  57. camel/models/openai_compatible_model.py +62 -41
  58. camel/models/openai_model.py +62 -57
  59. camel/models/openrouter_model.py +1 -17
  60. camel/models/ppio_model.py +1 -16
  61. camel/models/qianfan_model.py +1 -16
  62. camel/models/qwen_model.py +1 -16
  63. camel/models/reka_model.py +1 -16
  64. camel/models/samba_model.py +34 -47
  65. camel/models/sglang_model.py +64 -31
  66. camel/models/siliconflow_model.py +1 -16
  67. camel/models/stub_model.py +0 -4
  68. camel/models/togetherai_model.py +1 -16
  69. camel/models/vllm_model.py +1 -16
  70. camel/models/volcano_model.py +0 -17
  71. camel/models/watsonx_model.py +1 -16
  72. camel/models/yi_model.py +1 -16
  73. camel/models/zhipuai_model.py +60 -16
  74. camel/parsers/__init__.py +18 -0
  75. camel/parsers/mcp_tool_call_parser.py +176 -0
  76. camel/retrievers/auto_retriever.py +1 -0
  77. camel/runtimes/daytona_runtime.py +11 -12
  78. camel/societies/__init__.py +2 -0
  79. camel/societies/workforce/__init__.py +2 -0
  80. camel/societies/workforce/events.py +122 -0
  81. camel/societies/workforce/prompts.py +146 -66
  82. camel/societies/workforce/role_playing_worker.py +15 -11
  83. camel/societies/workforce/single_agent_worker.py +302 -65
  84. camel/societies/workforce/structured_output_handler.py +30 -18
  85. camel/societies/workforce/task_channel.py +163 -27
  86. camel/societies/workforce/utils.py +107 -13
  87. camel/societies/workforce/workflow_memory_manager.py +772 -0
  88. camel/societies/workforce/workforce.py +1949 -579
  89. camel/societies/workforce/workforce_callback.py +74 -0
  90. camel/societies/workforce/workforce_logger.py +168 -145
  91. camel/societies/workforce/workforce_metrics.py +33 -0
  92. camel/storages/key_value_storages/json.py +15 -2
  93. camel/storages/key_value_storages/mem0_cloud.py +48 -47
  94. camel/storages/object_storages/google_cloud.py +1 -1
  95. camel/storages/vectordb_storages/oceanbase.py +13 -13
  96. camel/storages/vectordb_storages/qdrant.py +3 -3
  97. camel/storages/vectordb_storages/tidb.py +8 -6
  98. camel/tasks/task.py +4 -3
  99. camel/toolkits/__init__.py +20 -7
  100. camel/toolkits/aci_toolkit.py +45 -0
  101. camel/toolkits/base.py +6 -4
  102. camel/toolkits/code_execution.py +28 -1
  103. camel/toolkits/context_summarizer_toolkit.py +684 -0
  104. camel/toolkits/dappier_toolkit.py +5 -1
  105. camel/toolkits/dingtalk.py +1135 -0
  106. camel/toolkits/edgeone_pages_mcp_toolkit.py +11 -31
  107. camel/toolkits/excel_toolkit.py +1 -1
  108. camel/toolkits/{file_write_toolkit.py → file_toolkit.py} +430 -36
  109. camel/toolkits/function_tool.py +13 -3
  110. camel/toolkits/github_toolkit.py +104 -17
  111. camel/toolkits/gmail_toolkit.py +1839 -0
  112. camel/toolkits/google_calendar_toolkit.py +38 -4
  113. camel/toolkits/google_drive_mcp_toolkit.py +12 -31
  114. camel/toolkits/hybrid_browser_toolkit/config_loader.py +15 -0
  115. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit.py +77 -8
  116. camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit_ts.py +884 -88
  117. camel/toolkits/hybrid_browser_toolkit/installer.py +203 -0
  118. camel/toolkits/hybrid_browser_toolkit/ts/package-lock.json +5 -612
  119. camel/toolkits/hybrid_browser_toolkit/ts/package.json +0 -1
  120. camel/toolkits/hybrid_browser_toolkit/ts/src/browser-session.ts +959 -89
  121. camel/toolkits/hybrid_browser_toolkit/ts/src/config-loader.ts +9 -2
  122. camel/toolkits/hybrid_browser_toolkit/ts/src/hybrid-browser-toolkit.ts +281 -213
  123. camel/toolkits/hybrid_browser_toolkit/ts/src/parent-child-filter.ts +226 -0
  124. camel/toolkits/hybrid_browser_toolkit/ts/src/snapshot-parser.ts +219 -0
  125. camel/toolkits/hybrid_browser_toolkit/ts/src/som-screenshot-injected.ts +543 -0
  126. camel/toolkits/hybrid_browser_toolkit/ts/src/types.ts +23 -3
  127. camel/toolkits/hybrid_browser_toolkit/ts/websocket-server.js +72 -7
  128. camel/toolkits/hybrid_browser_toolkit/ws_wrapper.py +582 -132
  129. camel/toolkits/hybrid_browser_toolkit_py/actions.py +158 -0
  130. camel/toolkits/hybrid_browser_toolkit_py/browser_session.py +55 -8
  131. camel/toolkits/hybrid_browser_toolkit_py/config_loader.py +43 -0
  132. camel/toolkits/hybrid_browser_toolkit_py/hybrid_browser_toolkit.py +321 -8
  133. camel/toolkits/hybrid_browser_toolkit_py/snapshot.py +10 -4
  134. camel/toolkits/hybrid_browser_toolkit_py/unified_analyzer.js +45 -4
  135. camel/toolkits/{openai_image_toolkit.py → image_generation_toolkit.py} +151 -53
  136. camel/toolkits/klavis_toolkit.py +5 -1
  137. camel/toolkits/markitdown_toolkit.py +27 -1
  138. camel/toolkits/math_toolkit.py +64 -10
  139. camel/toolkits/mcp_toolkit.py +366 -71
  140. camel/toolkits/memory_toolkit.py +5 -1
  141. camel/toolkits/message_integration.py +18 -13
  142. camel/toolkits/minimax_mcp_toolkit.py +195 -0
  143. camel/toolkits/note_taking_toolkit.py +19 -10
  144. camel/toolkits/notion_mcp_toolkit.py +16 -26
  145. camel/toolkits/openbb_toolkit.py +5 -1
  146. camel/toolkits/origene_mcp_toolkit.py +8 -49
  147. camel/toolkits/playwright_mcp_toolkit.py +12 -31
  148. camel/toolkits/resend_toolkit.py +168 -0
  149. camel/toolkits/search_toolkit.py +264 -91
  150. camel/toolkits/slack_toolkit.py +64 -10
  151. camel/toolkits/terminal_toolkit/__init__.py +18 -0
  152. camel/toolkits/terminal_toolkit/terminal_toolkit.py +957 -0
  153. camel/toolkits/terminal_toolkit/utils.py +532 -0
  154. camel/toolkits/vertex_ai_veo_toolkit.py +590 -0
  155. camel/toolkits/video_analysis_toolkit.py +17 -11
  156. camel/toolkits/wechat_official_toolkit.py +483 -0
  157. camel/toolkits/zapier_toolkit.py +5 -1
  158. camel/types/__init__.py +2 -2
  159. camel/types/enums.py +274 -7
  160. camel/types/openai_types.py +2 -2
  161. camel/types/unified_model_type.py +15 -0
  162. camel/utils/commons.py +36 -5
  163. camel/utils/constants.py +3 -0
  164. camel/utils/context_utils.py +1003 -0
  165. camel/utils/mcp.py +138 -4
  166. camel/utils/token_counting.py +43 -20
  167. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/METADATA +223 -83
  168. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/RECORD +170 -141
  169. camel/loaders/pandas_reader.py +0 -368
  170. camel/toolkits/openai_agent_toolkit.py +0 -135
  171. camel/toolkits/terminal_toolkit.py +0 -1550
  172. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/WHEEL +0 -0
  173. {camel_ai-0.2.73a4.dist-info → camel_ai-0.2.80a2.dist-info}/licenses/LICENSE +0 -0
@@ -16,12 +16,15 @@ from __future__ import annotations
 import asyncio
 import concurrent.futures
 import json
+import os
 import time
 import uuid
 from collections import deque
 from enum import Enum
 from typing import (
+    TYPE_CHECKING,
     Any,
+    Callable,
     Coroutine,
     Deque,
     Dict,
@@ -31,8 +34,15 @@ from typing import (
     Set,
     Tuple,
     Union,
+    cast,
 )
 
+from .workforce_callback import WorkforceCallback
+from .workforce_metrics import WorkforceMetrics
+
+if TYPE_CHECKING:
+    from camel.utils.context_utils import ContextUtility
+
 from colorama import Fore
 
 from camel.agents import ChatAgent
@@ -43,19 +53,23 @@ from camel.societies.workforce.base import BaseNode
 from camel.societies.workforce.prompts import (
     ASSIGN_TASK_PROMPT,
     CREATE_NODE_PROMPT,
-    FAILURE_ANALYSIS_PROMPT,
+    FAILURE_ANALYSIS_RESPONSE_FORMAT,
+    QUALITY_EVALUATION_RESPONSE_FORMAT,
+    TASK_AGENT_SYSTEM_MESSAGE,
+    TASK_ANALYSIS_PROMPT,
     TASK_DECOMPOSE_PROMPT,
 )
 from camel.societies.workforce.role_playing_worker import RolePlayingWorker
-from camel.societies.workforce.single_agent_worker import SingleAgentWorker
+from camel.societies.workforce.single_agent_worker import (
+    SingleAgentWorker,
+)
 from camel.societies.workforce.structured_output_handler import (
     StructuredOutputHandler,
 )
 from camel.societies.workforce.task_channel import TaskChannel
 from camel.societies.workforce.utils import (
-    FailureContext,
-    RecoveryDecision,
     RecoveryStrategy,
+    TaskAnalysisResult,
     TaskAssignment,
     TaskAssignResult,
     WorkerConf,
@@ -70,21 +84,39 @@ from camel.tasks.task import (
 )
 from camel.toolkits import (
     CodeExecutionToolkit,
+    FunctionTool,
     SearchToolkit,
-    TaskPlanningToolkit,
     ThinkingToolkit,
 )
 from camel.types import ModelPlatformType, ModelType
 from camel.utils import dependencies_required
 
+from .events import (
+    AllTasksCompletedEvent,
+    TaskAssignedEvent,
+    TaskCompletedEvent,
+    TaskCreatedEvent,
+    TaskDecomposedEvent,
+    TaskFailedEvent,
+    TaskStartedEvent,
+    WorkerCreatedEvent,
+)
 from .workforce_logger import WorkforceLogger
 
-logger = get_logger(__name__)
+if os.environ.get("TRACEROOT_ENABLED", "False").lower() == "true":
+    try:
+        import traceroot  # type: ignore[import]
+
+        logger = traceroot.get_logger('camel')
+    except ImportError:
+        logger = get_logger(__name__)
+else:
+    logger = get_logger(__name__)
 
 # Constants for configuration values
 MAX_TASK_RETRIES = 3
 MAX_PENDING_TASKS_LIMIT = 20
-TASK_TIMEOUT_SECONDS = 180.0
+TASK_TIMEOUT_SECONDS = 600.0
 DEFAULT_WORKER_POOL_SIZE = 10
 
 
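Note: logger selection now happens at import time. A minimal sketch of opting into the optional traceroot logger; the environment variable and fallback behavior come from the hunk above, while the top-level import path is an assumption:

```python
# Must be set before camel.societies.workforce.workforce is first imported;
# if the optional `traceroot` package is missing, the module falls back to
# camel's get_logger, so the toggle is safe to set unconditionally.
import os

os.environ["TRACEROOT_ENABLED"] = "true"

from camel.societies.workforce import Workforce  # noqa: E402
```

The default task timeout also rises here from 180 s to 600 s (TASK_TIMEOUT_SECONDS), which pairs with the new per-instance `task_timeout_seconds` parameter introduced later in this diff.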
@@ -151,9 +183,9 @@ class Workforce(BaseNode):
         task_agent (Optional[ChatAgent], optional): A custom task planning
             agent instance for task decomposition and composition. If
             provided, the workforce will create a new agent using this agent's
-            model configuration but with the required system message and tools
-            (TaskPlanningToolkit). If None, a default agent will be created
-            using DEFAULT model settings. (default: :obj:`None`)
+            model configuration but with the required system message. If None,
+            a default agent will be created using DEFAULT model settings.
+            (default: :obj:`None`)
         new_worker_agent (Optional[ChatAgent], optional): A template agent for
             workers created dynamically at runtime when existing workers cannot
             handle failed tasks. If None, workers will be created with default
@@ -163,6 +195,11 @@ class Workforce(BaseNode):
             for graceful shutdown when a task fails 3 times. During this
             period, the workforce remains active for debugging.
             Set to 0 for immediate shutdown. (default: :obj:`15.0`)
+        task_timeout_seconds (Optional[float], optional): The timeout in
+            seconds for waiting for tasks to be returned by workers. If None,
+            uses the global TASK_TIMEOUT_SECONDS value (600.0 seconds).
+            Increase this value for tasks that require more processing time.
+            (default: :obj:`None`)
         share_memory (bool, optional): Whether to enable shared memory across
             SingleAgentWorker instances in the workforce. When enabled, all
             SingleAgentWorker instances, coordinator agent, and task planning
@@ -180,6 +217,17 @@ class Workforce(BaseNode):
             support native structured output. When disabled, the workforce
             uses the native response_format parameter.
             (default: :obj:`True`)
+        callbacks (Optional[List[WorkforceCallback]], optional): A list of
+            callback handlers to observe and record workforce lifecycle events
+            and metrics (e.g., task creation/assignment/start/completion/
+            failure, worker creation/deletion, all-tasks-completed). All
+            items must be instances of :class:`WorkforceCallback`, otherwise
+            a :class:`ValueError` is raised. If none of the provided
+            callbacks implement :class:`WorkforceMetrics`, a built-in
+            :class:`WorkforceLogger` (implements both callback and metrics)
+            is added automatically. If at least one provided callback
+            implements :class:`WorkforceMetrics`, no default logger is added.
+            (default: :obj:`None`)
 
     Example:
         >>> import asyncio
@@ -231,6 +279,8 @@ class Workforce(BaseNode):
         graceful_shutdown_timeout: float = 15.0,
         share_memory: bool = False,
         use_structured_output_handler: bool = True,
+        task_timeout_seconds: Optional[float] = None,
+        callbacks: Optional[List[WorkforceCallback]] = None,
     ) -> None:
         super().__init__(description)
         self._child_listening_tasks: Deque[
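The two new constructor parameters can be exercised as below; a minimal sketch, assuming default model settings are available via environment configuration:

```python
from camel.societies.workforce import Workforce

# task_timeout_seconds overrides the module-level 600 s default for this
# instance only; callbacks is validated to contain WorkforceCallback
# instances and raises ValueError otherwise.
workforce = Workforce(
    "Research team",
    task_timeout_seconds=1200.0,  # allow long-running worker tasks
)
```

Because no callback implementing WorkforceMetrics is supplied here, a built-in WorkforceLogger is attached automatically (see `_initialize_callbacks` later in this diff).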
@@ -241,9 +291,11 @@ class Workforce(BaseNode):
         self.graceful_shutdown_timeout = graceful_shutdown_timeout
         self.share_memory = share_memory
         self.use_structured_output_handler = use_structured_output_handler
+        self.task_timeout_seconds = (
+            task_timeout_seconds or TASK_TIMEOUT_SECONDS
+        )
         if self.use_structured_output_handler:
             self.structured_handler = StructuredOutputHandler()
-        self.metrics_logger = WorkforceLogger(workforce_id=self.node_id)
         self._task: Optional[Task] = None
         self._pending_tasks: Deque[Task] = deque()
         self._task_dependencies: Dict[str, List[str]] = {}
@@ -256,6 +308,7 @@ class Workforce(BaseNode):
         self._pause_event = asyncio.Event()
         self._pause_event.set()  # Initially not paused
         self._stop_requested = False
+        self._skip_requested = False
         self._snapshots: List[WorkforceSnapshot] = []
         self._completed_tasks: List[Task] = []
         self._loop: Optional[asyncio.AbstractEventLoop] = None
@@ -265,15 +318,9 @@ class Workforce(BaseNode):
         self._last_snapshot_time: float = 0.0
         # Minimum seconds between automatic snapshots
         self.snapshot_interval: float = 30.0
-        if self.metrics_logger:
-            for child in self._children:
-                worker_type = type(child).__name__
-                role_or_desc = child.description
-                self.metrics_logger.log_worker_created(
-                    worker_id=child.node_id,
-                    worker_type=worker_type,
-                    role=role_or_desc,
-                )
+        # Shared memory UUID tracking to prevent re-sharing duplicates
+        self._shared_memory_uuids: Set[str] = set()
+        self._initialize_callbacks(callbacks)
 
         # Set up coordinator agent with default system message
         coord_agent_sys_msg = BaseMessage.make_assistant_message(
@@ -302,8 +349,7 @@ class Workforce(BaseNode):
         if coordinator_agent.system_message is not None:
             user_sys_msg_content = coordinator_agent.system_message.content
             combined_content = (
-                f"{user_sys_msg_content}\n\n"
-                f"{coord_agent_sys_msg.content}"
+                f"{user_sys_msg_content}\n\n{coord_agent_sys_msg.content}"
             )
             combined_sys_msg = BaseMessage.make_assistant_message(
                 role_name=coordinator_agent.system_message.role_name,
@@ -327,10 +373,7 @@ class Workforce(BaseNode):
                     None,
                 ),
                 output_language=coordinator_agent.output_language,
-                tools=[
-                    tool.func
-                    for tool in coordinator_agent._internal_tools.values()
-                ],
+                tools=list(coordinator_agent._internal_tools.values()),
                 external_tools=[
                     schema
                     for schema in coordinator_agent._external_tool_schemas.values()  # noqa: E501
@@ -340,28 +383,20 @@ class Workforce(BaseNode):
                 stop_event=coordinator_agent.stop_event,
             )
 
-        # Set up task agent with default system message and required tools
+        # Set up task agent with default system message
         task_sys_msg = BaseMessage.make_assistant_message(
             role_name="Task Planner",
-            content="You are going to compose and decompose tasks. Keep "
-            "tasks that are sequential and require the same type of "
-            "agent together in one agent process. Only decompose tasks "
-            "that can be handled in parallel and require different types "
-            "of agents. This ensures efficient execution by minimizing "
-            "context switching between agents.",
+            content=TASK_AGENT_SYSTEM_MESSAGE,
         )
-        task_planning_tools = TaskPlanningToolkit().get_tools()
 
         if task_agent is None:
             logger.warning(
                 "No task_agent provided. Using default ChatAgent "
                 "settings (ModelPlatformType.DEFAULT, ModelType.DEFAULT) "
-                "with default system message and TaskPlanningToolkit."
+                "with default system message."
             )
-            task_tools = TaskPlanningToolkit().get_tools()
             self.task_agent = ChatAgent(
                 task_sys_msg,
-                tools=task_tools,  # type: ignore[arg-type]
             )
         else:
             logger.info(
@@ -373,8 +408,7 @@ class Workforce(BaseNode):
             if task_agent.system_message is not None:
                 user_task_sys_msg_content = task_agent.system_message.content
                 combined_task_content = (
-                    f"{user_task_sys_msg_content}\n\n"
-                    f"{task_sys_msg.content}"
+                    f"{user_task_sys_msg_content}\n\n{task_sys_msg.content}"
                 )
                 combined_task_sys_msg = BaseMessage.make_assistant_message(
                     role_name=task_agent.system_message.role_name,
@@ -385,9 +419,10 @@ class Workforce(BaseNode):
 
             # Since ChatAgent constructor uses a dictionary with
             # function names as keys, we don't need to manually deduplicate.
-            combined_tools = [
-                tool.func for tool in task_agent._internal_tools.values()
-            ] + [tool.func for tool in task_planning_tools]
+            combined_tools: List[Union[FunctionTool, Callable]] = cast(
+                List[Union[FunctionTool, Callable]],
+                list(task_agent._internal_tools.values()),
+            )
 
             # Create a new agent with the provided agent's configuration
             # but with the combined system message and tools
@@ -434,10 +469,85 @@ class Workforce(BaseNode):
                 "better context continuity during task handoffs."
             )
 
+        # Shared context utility for workflow management (created lazily)
+        self._shared_context_utility: Optional["ContextUtility"] = None
+
     # ------------------------------------------------------------------
     # Helper for propagating pause control to externally supplied agents
     # ------------------------------------------------------------------
 
+    def _initialize_callbacks(
+        self, callbacks: Optional[List[WorkforceCallback]]
+    ) -> None:
+        r"""Validate, register, and prime workforce callbacks."""
+        self._callbacks: List[WorkforceCallback] = []
+
+        if callbacks:
+            for cb in callbacks:
+                if isinstance(cb, WorkforceCallback):
+                    self._callbacks.append(cb)
+                else:
+                    raise ValueError(
+                        "All callbacks must be instances of WorkforceCallback"
+                    )
+
+        has_metrics_callback = any(
+            isinstance(cb, WorkforceMetrics) for cb in self._callbacks
+        )
+
+        if not has_metrics_callback:
+            self._callbacks.append(WorkforceLogger(workforce_id=self.node_id))
+        else:
+            logger.info(
+                "WorkforceMetrics implementation detected. Skipping default "
+                "WorkforceLogger addition."
+            )
+
+        for child in self._children:
+            self._notify_worker_created(child)
+
+    def _notify_worker_created(
+        self,
+        worker_node: BaseNode,
+        *,
+        worker_type: Optional[str] = None,
+        role: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        r"""Emit a worker-created event to all registered callbacks."""
+        event = WorkerCreatedEvent(
+            worker_id=worker_node.node_id,
+            worker_type=worker_type or type(worker_node).__name__,
+            role=role or worker_node.description,
+            metadata=metadata,
+        )
+        for cb in self._callbacks:
+            cb.log_worker_created(event)
+
+    def _get_or_create_shared_context_utility(
+        self,
+        session_id: Optional[str] = None,
+    ) -> "ContextUtility":
+        r"""Get or create the shared context utility for workflow management.
+
+        This method creates the context utility only when needed, avoiding
+        unnecessary session folder creation during initialization.
+
+        Args:
+            session_id (Optional[str]): Custom session ID to use. If None,
+                auto-generates a timestamped session ID. (default: :obj:`None`)
+
+        Returns:
+            ContextUtility: The shared context utility instance.
+        """
+        if self._shared_context_utility is None:
+            from camel.utils.context_utils import ContextUtility
+
+            self._shared_context_utility = ContextUtility.get_workforce_shared(
+                session_id=session_id
+            )
+        return self._shared_context_utility
+
     def _validate_agent_compatibility(
         self, agent: ChatAgent, agent_context: str = "agent"
     ) -> None:
@@ -474,6 +584,9 @@ class Workforce(BaseNode):
                 "the Workforce."
             )
 
+    # ------------------------------------------------------------------
+    # Helper for propagating pause control to externally supplied agents
+    # ------------------------------------------------------------------
     def _attach_pause_event_to_agent(self, agent: ChatAgent) -> None:
         r"""Ensure the given ChatAgent shares this workforce's pause_event.
 
@@ -599,14 +712,29 @@ class Workforce(BaseNode):
             )
             return
 
-        # Share with coordinator agent
+        # Filter out already-shared records to prevent re-sharing
+        # This prevents exponential growth of duplicate records
+        new_records = []
         for record in memory_records:
+            record_uuid = str(record.uuid)
+            if record_uuid not in self._shared_memory_uuids:
+                new_records.append(record)
+                self._shared_memory_uuids.add(record_uuid)
+
+        if not new_records:
+            logger.debug(
+                "No new records to share (all were already shared)"
+            )
+            return
+
+        # Share with coordinator agent
+        for record in new_records:
             # Only add records from other agents to avoid duplication
             if record.agent_id != self.coordinator_agent.agent_id:
                 self.coordinator_agent.memory.write_record(record)
 
         # Share with task agent
-        for record in memory_records:
+        for record in new_records:
             if record.agent_id != self.task_agent.agent_id:
                 self.task_agent.memory.write_record(record)
 
@@ -618,12 +746,12 @@ class Workforce(BaseNode):
         ]
 
         for worker in single_agent_workers:
-            for record in memory_records:
+            for record in new_records:
                 if record.agent_id != worker.worker.agent_id:
                     worker.worker.memory.write_record(record)
 
         logger.info(
-            f"Shared {len(memory_records)} memory records across "
+            f"Shared {len(new_records)} new memory records across "
             f"{len(single_agent_workers) + 2} agents in workforce "
             f"{self.node_id}"
         )
@@ -730,10 +858,12 @@ class Workforce(BaseNode):
             Union[List[Task], Generator[List[Task], None, None]]:
                 The subtasks or generator of subtasks.
         """
-        decompose_prompt = TASK_DECOMPOSE_PROMPT.format(
-            content=task.content,
-            child_nodes_info=self._get_child_nodes_info(),
-            additional_info=task.additional_info,
+        decompose_prompt = str(
+            TASK_DECOMPOSE_PROMPT.format(
+                content=task.content,
+                child_nodes_info=self._get_child_nodes_info(),
+                additional_info=task.additional_info,
+            )
         )
         self.task_agent.reset()
         result = task.decompose(self.task_agent, decompose_prompt)
@@ -761,76 +891,126 @@ class Workforce(BaseNode):
         self._update_dependencies_for_decomposition(task, subtasks)
         return subtasks
 
-    def _analyze_failure(
-        self, task: Task, error_message: str
-    ) -> RecoveryDecision:
-        r"""Analyze a task failure and decide on the best recovery strategy.
+    def _analyze_task(
+        self,
+        task: Task,
+        *,
+        for_failure: bool,
+        error_message: Optional[str] = None,
+    ) -> TaskAnalysisResult:
+        r"""Unified task analysis for both failures and quality evaluation.
+
+        This method consolidates the logic for analyzing task failures and
+        evaluating task quality, using the unified TASK_ANALYSIS_PROMPT.
 
         Args:
-            task (Task): The failed task
-            error_message (str): The error message from the failure
+            task (Task): The task to analyze
+            for_failure (bool): True for failure analysis, False for quality
+                evaluation
+            error_message (Optional[str]): Error message, required when
+                for_failure=True
 
         Returns:
-            RecoveryDecision: The decided recovery strategy with reasoning
+            TaskAnalysisResult: Unified analysis result with recovery strategy
+                and optional quality metrics
+
+        Raises:
+            ValueError: If for_failure=True but error_message is None
         """
-        # First, do a quick smart analysis based on error patterns
-        error_msg_lower = error_message.lower()
-        if any(
-            keyword in error_msg_lower
-            for keyword in [
-                'connection',
-                'network',
-                'server disconnected',
-                'timeout',
-                'apiconnectionerror',
+        # Validate required parameters
+        if for_failure and error_message is None:
+            raise ValueError("error_message is required when for_failure=True")
+
+        # Determine task result and issue-specific analysis based on context
+        if for_failure:
+            task_result = "N/A (task failed)"
+            issue_type = "Task Failure"
+            issue_analysis = f"**Error Message:** {error_message}"
+            response_format = FAILURE_ANALYSIS_RESPONSE_FORMAT
+            result_schema = TaskAnalysisResult
+            fallback_values: Dict[str, Any] = {
+                "reasoning": "Defaulting to retry due to parsing error",
+                "recovery_strategy": RecoveryStrategy.RETRY,
+                "modified_task_content": None,
+                "issues": [error_message] if error_message else [],
+            }
+            examples: List[Dict[str, Any]] = [
+                {
+                    "reasoning": "Temporary network error, worth retrying",
+                    "recovery_strategy": "retry",
+                    "modified_task_content": None,
+                    "issues": ["Network timeout"],
+                }
             ]
-        ):
-            return RecoveryDecision(
-                strategy=RecoveryStrategy.RETRY,
-                reasoning="Network/connection error detected, retrying task",
-                modified_task_content=None,
+        else:
+            # Quality evaluation
+            task_result = task.result or "No result available"
+            issue_type = "Quality Evaluation"
+            issue_analysis = (
+                "Provide a quality score (0-100) and list any specific "
+                "issues found."
             )
+            response_format = QUALITY_EVALUATION_RESPONSE_FORMAT
+            result_schema = TaskAnalysisResult
+            fallback_values = {
+                "reasoning": (
+                    "Defaulting to acceptable quality due to parsing error"
+                ),
+                "issues": [],
+                "recovery_strategy": None,
+                "modified_task_content": None,
+                "quality_score": 80,
+            }
+            examples = [
+                {
+                    "reasoning": (
+                        "Excellent implementation with comprehensive tests"
+                    ),
+                    "issues": [],
+                    "recovery_strategy": None,
+                    "modified_task_content": None,
+                    "quality_score": 98,
+                },
+                {
+                    "reasoning": (
+                        "Implementation incomplete with missing features"
+                    ),
+                    "issues": [
+                        "Incomplete implementation",
+                        "Missing error handling",
+                    ],
+                    "recovery_strategy": "replan",
+                    "modified_task_content": (
+                        "Previous attempt was incomplete. "
+                        "Please implement with: 1) Full feature "
+                        "coverage, 2) Proper error handling"
+                    ),
+                    "quality_score": 45,
+                },
+            ]
 
-        # Create failure context
-        failure_context = FailureContext(
-            task_id=task.id,
-            task_content=task.content,
-            failure_count=task.failure_count,
-            error_message=error_message,
-            worker_id=task.assigned_worker_id,
-            task_depth=task.get_depth(),
-            additional_info=str(task.additional_info)
-            if task.additional_info
-            else None,
-        )
-
-        # Format the analysis prompt
-        analysis_prompt = FAILURE_ANALYSIS_PROMPT.format(
-            task_id=failure_context.task_id,
-            task_content=failure_context.task_content,
-            failure_count=failure_context.failure_count,
-            error_message=failure_context.error_message,
-            worker_id=failure_context.worker_id or "unknown",
-            task_depth=failure_context.task_depth,
-            additional_info=failure_context.additional_info or "None",
+        # Format the unified analysis prompt
+        analysis_prompt = str(
+            TASK_ANALYSIS_PROMPT.format(
+                task_id=task.id,
+                task_content=task.content,
+                task_result=task_result,
+                failure_count=task.failure_count,
+                task_depth=task.get_depth(),
+                assigned_worker=task.assigned_worker_id or "unknown",
+                issue_type=issue_type,
+                issue_specific_analysis=issue_analysis,
+                response_format=response_format,
+            )
         )
 
         try:
-            # Check if we should use structured handler
             if self.use_structured_output_handler:
-                # Use structured handler
                 enhanced_prompt = (
                     self.structured_handler.generate_structured_prompt(
                         base_prompt=analysis_prompt,
-                        schema=RecoveryDecision,
-                        examples=[
-                            {
-                                "strategy": "RETRY",
-                                "reasoning": "Temporary network error, "
-                                "worth retrying",
-                                "modified_task_content": None,
-                            }
-                        ],
+                        schema=result_schema,
+                        examples=examples,
                     )
                 )
 
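For reference, the shape of `TaskAnalysisResult` can be inferred from its usages in this diff (`TaskAnalysisResult(**fallback_values)` plus reads of `recovery_strategy`, `modified_task_content`, and `is_quality_evaluation` in `_apply_recovery_strategy` below). The sketch is that inference only, not the shipped definition in `camel/societies/workforce/utils.py`; field types and defaults are assumptions:

```python
from typing import List, Optional

from pydantic import BaseModel

from camel.societies.workforce.utils import RecoveryStrategy


class TaskAnalysisResultSketch(BaseModel):
    # Inferred from the fallback dicts and attribute reads in this diff;
    # not the released class.
    reasoning: str
    issues: List[str] = []
    recovery_strategy: Optional[RecoveryStrategy] = None
    modified_task_content: Optional[str] = None
    quality_score: Optional[int] = None  # set only by quality evaluations

    @property
    def is_quality_evaluation(self) -> bool:
        # Quality evaluations carry a score; failure analyses do not.
        return self.quality_score is not None
```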
@@ -839,43 +1019,224 @@ class Workforce(BaseNode):
 
                 result = self.structured_handler.parse_structured_response(
                     response.msg.content if response.msg else "",
-                    schema=RecoveryDecision,
-                    fallback_values={
-                        "strategy": RecoveryStrategy.RETRY,
-                        "reasoning": "Defaulting to retry due to parsing "
-                        "issues",
-                        "modified_task_content": None,
-                    },
+                    schema=result_schema,
+                    fallback_values=fallback_values,
                 )
-                # Ensure we return a RecoveryDecision instance
-                if isinstance(result, RecoveryDecision):
+
+                if isinstance(result, TaskAnalysisResult):
                     return result
                 elif isinstance(result, dict):
-                    return RecoveryDecision(**result)
+                    return result_schema(**result)
                 else:
-                    return RecoveryDecision(
-                        strategy=RecoveryStrategy.RETRY,
-                        reasoning="Failed to parse recovery decision",
-                        modified_task_content=None,
-                    )
+                    # Fallback based on context
+                    return TaskAnalysisResult(**fallback_values)
             else:
-                # Use existing native structured output code
                 self.task_agent.reset()
                 response = self.task_agent.step(
-                    analysis_prompt, response_format=RecoveryDecision
+                    analysis_prompt, response_format=result_schema
                 )
                 return response.msg.parsed
 
         except Exception as e:
             logger.warning(
-                f"Error during failure analysis: {e}, defaulting to RETRY"
+                f"Error during task analysis "
+                f"({'failure' if for_failure else 'quality'}): {e}, "
+                f"using fallback"
             )
-            return RecoveryDecision(
-                strategy=RecoveryStrategy.RETRY,
-                reasoning=f"Analysis failed due to error: {e!s}, "
-                f"defaulting to retry",
-                modified_task_content=None,
+            return TaskAnalysisResult(**fallback_values)
+
+    async def _apply_recovery_strategy(
+        self,
+        task: Task,
+        recovery_decision: TaskAnalysisResult,
+    ) -> bool:
+        r"""Apply the recovery strategy from a task analysis result.
+
+        This method centralizes the recovery logic for both execution failures
+        and quality-based failures.
+
+        Args:
+            task (Task): The task that needs recovery
+            recovery_decision (TaskAnalysisResult): The analysis result with
+                recovery strategy
+
+        Returns:
+            bool: True if workforce should halt (e.g., decompose needs
+                different handling), False otherwise
+        """
+        strategy = (
+            recovery_decision.recovery_strategy or RecoveryStrategy.RETRY
+        )
+        action_taken = ""
+
+        try:
+            if strategy == RecoveryStrategy.RETRY:
+                # Simply retry the task by reposting it to the same worker
+                # Check both _assignees dict and task.assigned_worker_id
+                assignee_id = (
+                    self._assignees.get(task.id) or task.assigned_worker_id
+                )
+
+                if assignee_id:
+                    # Retry with the same worker - no coordinator call needed
+                    await self._post_task(task, assignee_id)
+                    action_taken = f"retried with same worker {assignee_id}"
+                    logger.info(
+                        f"Task {task.id} retrying with same worker "
+                        f"{assignee_id} (no coordinator call)"
+                    )
+                else:
+                    # No previous assignment exists - find a new assignee
+                    logger.info(
+                        f"Task {task.id} has no previous assignee, "
+                        f"calling coordinator"
+                    )
+                    batch_result = await self._find_assignee([task])
+                    assignment = batch_result.assignments[0]
+                    self._assignees[task.id] = assignment.assignee_id
+                    await self._post_task(task, assignment.assignee_id)
+                    action_taken = (
+                        f"retried with new worker {assignment.assignee_id}"
+                    )
+
+            elif strategy == RecoveryStrategy.REPLAN:
+                # Modify the task content and retry
+                if recovery_decision.modified_task_content:
+                    task.content = recovery_decision.modified_task_content
+                    logger.info(f"Task {task.id} content modified for replan")
+
+                # Repost the modified task
+                if task.id in self._assignees:
+                    assignee_id = self._assignees[task.id]
+                    await self._post_task(task, assignee_id)
+                    action_taken = (
+                        f"replanned and retried with worker {assignee_id}"
+                    )
+                else:
+                    # Find a new assignee for the replanned task
+                    batch_result = await self._find_assignee([task])
+                    assignment = batch_result.assignments[0]
+                    self._assignees[task.id] = assignment.assignee_id
+                    await self._post_task(task, assignment.assignee_id)
+                    action_taken = (
+                        f"replanned and assigned to "
+                        f"worker {assignment.assignee_id}"
+                    )
+
+            elif strategy == RecoveryStrategy.REASSIGN:
+                # Reassign to a different worker
+                old_worker = task.assigned_worker_id
+                logger.info(
+                    f"Task {task.id} will be reassigned from worker "
+                    f"{old_worker}"
+                )
+
+                # Find a different worker
+                batch_result = await self._find_assignee([task])
+                assignment = batch_result.assignments[0]
+                new_worker = assignment.assignee_id
+
+                # If same worker, force find another
+                if new_worker == old_worker and len(self._children) > 1:
+                    logger.info("Same worker selected, finding alternative")
+                    # Try to find different worker by adding note to
+                    # task content
+                    task.content = (
+                        f"{task.content}\n\n"
+                        f"Note: Previous worker {old_worker} had quality "
+                        f"issues. Needs different approach."
+                    )
+                    batch_result = await self._find_assignee([task])
+                    assignment = batch_result.assignments[0]
+                    new_worker = assignment.assignee_id
+
+                self._assignees[task.id] = new_worker
+                await self._post_task(task, new_worker)
+                action_taken = f"reassigned from {old_worker} to {new_worker}"
+                logger.info(
+                    f"Task {task.id} reassigned from {old_worker} to "
+                    f"{new_worker}"
+                )
+
+            elif strategy == RecoveryStrategy.DECOMPOSE:
+                # Decompose the task into subtasks
+                reason = (
+                    "failure"
+                    if not recovery_decision.is_quality_evaluation
+                    else "quality issues"
+                )
+                logger.info(
+                    f"Task {task.id} will be decomposed due to {reason}"
+                )
+                subtasks_result = self._decompose_task(task)
+
+                # Handle both streaming and non-streaming results
+                if isinstance(subtasks_result, Generator):
+                    subtasks = []
+                    for new_tasks in subtasks_result:
+                        subtasks.extend(new_tasks)
+                else:
+                    subtasks = subtasks_result
+
+                if subtasks:
+                    task_decomposed_event = TaskDecomposedEvent(
+                        parent_task_id=task.id,
+                        subtask_ids=[st.id for st in subtasks],
+                    )
+                    for cb in self._callbacks:
+                        cb.log_task_decomposed(task_decomposed_event)
+                    for subtask in subtasks:
+                        task_created_event = TaskCreatedEvent(
+                            task_id=subtask.id,
+                            description=subtask.content,
+                            parent_task_id=task.id,
+                            task_type=subtask.type,
+                            metadata=subtask.additional_info,
+                        )
+                        for cb in self._callbacks:
+                            cb.log_task_created(task_created_event)
+
+                    # Insert subtasks at the head of the queue
+                    self._pending_tasks.extendleft(reversed(subtasks))
+                    await self._post_ready_tasks()
+                    action_taken = f"decomposed into {len(subtasks)} subtasks"
+
+                    logger.info(
+                        f"Task {task.id} decomposed into {len(subtasks)} subtasks"
+                    )
+
+                    # Sync shared memory after task decomposition
+                    if self.share_memory:
+                        logger.info(
+                            f"Syncing shared memory after task {task.id} "
+                            f"decomposition"
+                        )
+                        self._sync_shared_memory()
+
+                # For decompose, we return early with special handling
+                return True
+
+            elif strategy == RecoveryStrategy.CREATE_WORKER:
+                assignee = await self._create_worker_node_for_task(task)
+                await self._post_task(task, assignee.node_id)
+                action_taken = (
+                    f"created new worker {assignee.node_id} and assigned "
+                    f"task {task.id} to it"
+                )
+
+        except Exception as e:
+            logger.error(
+                f"Recovery strategy {strategy} failed for task {task.id}: {e}",
+                exc_info=True,
             )
+            raise
+
+        logger.debug(
+            f"Task {task.id} recovery: {action_taken}. "
+            f"Strategy: {strategy.value}"
+        )
+
+        return False
 
     # Human intervention methods
     async def _async_pause(self) -> None:
@@ -966,6 +1327,39 @@ class Workforce(BaseNode):
             f"(event-loop not yet started)."
         )
 
+    async def _async_skip_gracefully(self) -> None:
+        r"""Async implementation of skip_gracefully to run on the event
+        loop.
+        """
+        self._skip_requested = True
+        if self._pause_event.is_set() is False:
+            self._pause_event.set()  # Resume if paused to process skip
+        logger.info(f"Workforce {self.node_id} skip requested.")
+
+    def skip_gracefully(self) -> None:
+        r"""Request workforce to skip current pending tasks and move to next
+        main task from the queue. If no main tasks exist, acts like
+        stop_gracefully.
+
+        This method clears the current pending subtasks and moves to the next
+        main task in the queue if available. Works both when the internal
+        event-loop is alive and when it has not yet been started.
+        """
+
+        if self._loop and not self._loop.is_closed():
+            self._submit_coro_to_loop(self._async_skip_gracefully())
+        else:
+            # Loop not yet created, set the flag synchronously so later
+            # startup will respect it.
+            self._skip_requested = True
+            # Ensure any pending pause is released so that when the loop does
+            # start it can see the skip request and exit.
+            self._pause_event.set()
+            logger.info(
+                f"Workforce {self.node_id} skip requested "
+                f"(event-loop not yet started)."
+            )
+
     def save_snapshot(self, description: str = "") -> None:
         r"""Save current state as a snapshot."""
         snapshot = WorkforceSnapshot(
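A sketch of the new intervention control, assuming default model credentials are configured and using the `add_main_task` helper introduced in the next hunk (`stop_gracefully` is referenced by the docstring above and pre-exists this diff):

```python
from camel.societies.workforce import Workforce

workforce = Workforce("Demo")

# Queue two main tasks, then decide mid-run that the first is no longer
# needed: skip_gracefully() drops its pending subtasks and advances to the
# next main task; with nothing left it behaves like stop_gracefully().
# Safe to call before or after the internal event loop starts.
workforce.add_main_task("Draft the quarterly report")
workforce.add_main_task("Summarize competitor announcements")
workforce.skip_gracefully()
```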
@@ -1020,67 +1414,193 @@ class Workforce(BaseNode):
         logger.warning(f"Task {task_id} not found in pending tasks.")
         return False
 
+    def get_main_task_queue(self) -> List[Task]:
+        r"""Get current main task queue for human review.
+        Returns:
+            List[Task]: List of main tasks waiting to be decomposed
+                and executed.
+        """
+        # Return tasks from pending queue that need decomposition
+        return [
+            t
+            for t in self._pending_tasks
+            if t.additional_info
+            and t.additional_info.get('_needs_decomposition')
+        ]
+
     def add_task(
         self,
         content: str,
         task_id: Optional[str] = None,
         additional_info: Optional[Dict[str, Any]] = None,
+        as_subtask: bool = False,
         insert_position: int = -1,
     ) -> Task:
-        r"""Add a new task to the pending queue."""
-        new_task = Task(
-            content=content,
-            id=task_id or f"human_added_{len(self._pending_tasks)}",
-            additional_info=additional_info,
-        )
-        if insert_position == -1:
-            self._pending_tasks.append(new_task)
-        else:
-            # Convert deque to list, insert, then back to deque
-            tasks_list = list(self._pending_tasks)
-            tasks_list.insert(insert_position, new_task)
-            self._pending_tasks = deque(tasks_list)
+        r"""Add a new task to the workforce.
 
-        logger.info(f"New task added: {new_task.id}")
-        return new_task
+        By default, this method adds a main task that will be decomposed into
+        subtasks. Set `as_subtask=True` to add a task directly to the pending
+        subtask queue without decomposition.
 
-    def remove_task(self, task_id: str) -> bool:
-        r"""Remove a task from the pending queue."""
-        # Convert to list to find and remove
-        tasks_list = list(self._pending_tasks)
-        for i, task in enumerate(tasks_list):
-            if task.id == task_id:
-                tasks_list.pop(i)
-                self._pending_tasks = deque(tasks_list)
-                logger.info(f"Task {task_id} removed.")
-                return True
-        logger.warning(f"Task {task_id} not found in pending tasks.")
-        return False
+        Args:
+            content (str): The content of the task.
+            task_id (Optional[str], optional): Optional ID for the task.
+                If not provided, a unique ID will be generated.
+            additional_info (Optional[Dict[str, Any]], optional): Optional
+                additional metadata for the task.
+            as_subtask (bool, optional): If True, adds the task directly to
+                the pending subtask queue. If False, adds as a main task that
+                will be decomposed. Defaults to False.
+            insert_position (int, optional): Position to insert the task in
+                the pending queue. Only applies when as_subtask=True.
+                Defaults to -1 (append to end).
 
-    def reorder_tasks(self, task_ids: List[str]) -> bool:
-        r"""Reorder pending tasks according to the provided task IDs list."""
-        # Create a mapping of task_id to task
-        tasks_dict = {task.id: task for task in self._pending_tasks}
+        Returns:
+            Task: The created task object.
+        """
+        if as_subtask:
+            new_task = Task(
+                content=content,
+                id=task_id or f"human_added_{len(self._pending_tasks)}",
+                additional_info=additional_info,
+            )
 
-        # Check if all provided IDs exist
-        invalid_ids = [
-            task_id for task_id in task_ids if task_id not in tasks_dict
-        ]
-        if invalid_ids:
-            logger.warning(
-                f"Task IDs not found in pending tasks: {invalid_ids}"
+            # Add directly to current pending subtasks
+            if insert_position == -1:
+                self._pending_tasks.append(new_task)
+            else:
+                # Convert deque to list, insert, then back to deque
+                tasks_list = list(self._pending_tasks)
+                tasks_list.insert(insert_position, new_task)
+                self._pending_tasks = deque(tasks_list)
+
+            logger.info(f"New subtask added to pending queue: {new_task.id}")
+            return new_task
+        else:
+            # Add as main task that needs decomposition
+            # Use additional_info to mark this task needs decomposition
+            # Make a copy to avoid modifying user's dict
+            info = additional_info.copy() if additional_info else {}
+            info['_needs_decomposition'] = True
+
+            task_count = sum(
+                1
+                for t in self._pending_tasks
+                if t.additional_info
+                and t.additional_info.get('_needs_decomposition')
             )
-            return False
 
-        # Check if we have the same number of tasks
-        if len(task_ids) != len(self._pending_tasks):
-            logger.warning(
-                "Number of task IDs doesn't match pending tasks count."
+            new_task = Task(
+                content=content,
+                id=task_id or f"main_task_{task_count}",
+                additional_info=info,
             )
-            return False
 
-        # Reorder tasks
-        reordered_tasks = deque([tasks_dict[task_id] for task_id in task_ids])
+            self._pending_tasks.append(new_task)
+            logger.info(f"New main task added to pending queue: {new_task.id}")
+            return new_task
+
+    def add_main_task(
+        self,
+        content: str,
+        task_id: Optional[str] = None,
+        additional_info: Optional[Dict[str, Any]] = None,
+    ) -> Task:
+        r"""Add a new main task that will be decomposed into subtasks.
+
+        This is an alias for :meth:`add_task` with `as_subtask=False`.
+
+        Args:
+            content (str): The content of the main task.
+            task_id (Optional[str], optional): Optional ID for the task.
+            additional_info (Optional[Dict[str, Any]], optional): Optional
+                additional metadata.
+
+        Returns:
+            Task: The created main task object.
+        """
+        return self.add_task(
+            content=content,
+            task_id=task_id,
+            additional_info=additional_info,
+            as_subtask=False,
+        )
+
+    def add_subtask(
+        self,
+        content: str,
+        task_id: Optional[str] = None,
+        additional_info: Optional[Dict[str, Any]] = None,
+        insert_position: int = -1,
+    ) -> Task:
+        r"""Add a new subtask to the current pending queue.
+
+        This is an alias for :meth:`add_task` with `as_subtask=True`.
+
+        Args:
+            content (str): The content of the subtask.
+            task_id (Optional[str], optional): Optional ID for the task.
+            additional_info (Optional[Dict[str, Any]], optional): Optional
+                additional metadata.
+            insert_position (int, optional): Position to insert the task.
+                Defaults to -1 (append to end).
+
+        Returns:
+            Task: The created subtask object.
+        """
+        return self.add_task(
+            content=content,
+            task_id=task_id,
+            additional_info=additional_info,
+            as_subtask=True,
+            insert_position=insert_position,
+        )
+
+    def remove_task(self, task_id: str) -> bool:
+        r"""Remove a task from the pending queue or main task queue.
+
+        Args:
+            task_id (str): The ID of the task to remove.
+
+        Returns:
+            bool: True if task was found and removed, False otherwise.
+        """
+        # Check main task queue first
+        pending_tasks_list = list(self._pending_tasks)
+        for i, task in enumerate(pending_tasks_list):
+            if task.id == task_id:
+                pending_tasks_list.pop(i)
+                self._pending_tasks = deque(pending_tasks_list)
+                logger.info(f"Task {task_id} removed from pending queue.")
+                return True
+
+        logger.warning(f"Task {task_id} not found in any task queue.")
+        return False
+
+    def reorder_tasks(self, task_ids: List[str]) -> bool:
+        r"""Reorder pending tasks according to the provided task IDs list."""
+        # Create a mapping of task_id to task
+        tasks_dict = {task.id: task for task in self._pending_tasks}
+
+        # Check if all provided IDs exist
+        invalid_ids = [
+            task_id for task_id in task_ids if task_id not in tasks_dict
+        ]
+        if invalid_ids:
+            logger.warning(
+                f"Task IDs not found in pending tasks: {invalid_ids}"
+            )
+            return False
+
+        # Check if we have the same number of tasks
+        if len(task_ids) != len(self._pending_tasks):
+            logger.warning(
+                "Number of task IDs doesn't match pending tasks count."
+            )
+            return False
+
+        # Reorder tasks
+        reordered_tasks = deque([tasks_dict[task_id] for task_id in task_ids])
         self._pending_tasks = reordered_tasks
 
         logger.info("Tasks reordered successfully.")
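Usage sketch for the split task-entry API (names and signatures as defined above; model credentials for the default agents are assumed):

```python
from camel.societies.workforce import Workforce

workforce = Workforce("Demo")

# Main tasks are tagged with `_needs_decomposition` and expanded later by
# the task agent; subtasks bypass decomposition and can be positioned.
main = workforce.add_main_task("Build a CLI todo app")
urgent = workforce.add_subtask(
    "Write unit tests for the storage layer",
    insert_position=0,  # ahead of already-queued subtasks
)
print([t.id for t in workforce.get_main_task_queue()])  # ['main_task_0']
```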
@@ -1169,26 +1689,21 @@ class Workforce(BaseNode):
             "main_task_id": self._task.id if self._task else None,
         }
 
-    @check_if_running(False)
-    async def process_task_async(
-        self, task: Task, interactive: bool = False
-    ) -> Task:
-        r"""Main entry point to process a task asynchronously.
+    async def handle_decompose_append_task(
+        self, task: Task, reset: bool = True
+    ) -> List[Task]:
+        r"""Handle task decomposition and validation with
+        workforce environment functions. Then append to
+        pending tasks if decomposition happened.
 
         Args:
             task (Task): The task to be processed.
-            interactive (bool, optional): If True, enables human-intervention
-                workflow (pause/resume/snapshot). Defaults to False, which
-                runs the task in a blocking one-shot manner.
+            reset (Bool): Should trigger workforce reset (Workforce must not
+                be running). Default: True
 
         Returns:
-            Task: The updated task.
+            List[Task]: The decomposed subtasks or the original task.
         """
-        # Delegate to intervention pipeline when requested to keep
-        # backward-compat.
-        if interactive:
-            return await self._process_task_with_snapshot(task)
-
         if not validate_task_content(task.content, task.id):
             task.state = TaskState.FAILED
             task.result = "Task failed: Invalid or empty content provided"
@@ -1196,18 +1711,25 @@ class Workforce(BaseNode):
                 f"Task {task.id} rejected: Invalid or empty content. "
                 f"Content preview: '{task.content}'"
             )
-            return task
+            return [task]
 
-        self.reset()
+        if reset and self._state != WorkforceState.RUNNING:
+            self.reset()
+            logger.info("Workforce reset before handling task.")
+
+        # Focus on the new task
         self._task = task
-        if self.metrics_logger:
-            self.metrics_logger.log_task_created(
-                task_id=task.id,
-                description=task.content,
-                task_type=task.type,
-                metadata=task.additional_info,
-            )
         task.state = TaskState.FAILED
+
+        task_created_event = TaskCreatedEvent(
+            task_id=task.id,
+            description=task.content,
+            task_type=task.type,
+            metadata=task.additional_info,
+        )
+        for cb in self._callbacks:
+            cb.log_task_created(task_created_event)
+
         # The agent tend to be overconfident on the whole task, so we
         # decompose the task into subtasks first
         subtasks_result = self._decompose_task(task)
@@ -1221,26 +1743,57 @@ class Workforce(BaseNode):
         else:
             # This is a regular list (non-streaming mode)
             subtasks = subtasks_result
-        if self.metrics_logger and subtasks:
-            self.metrics_logger.log_task_decomposed(
-                parent_task_id=task.id, subtask_ids=[st.id for st in subtasks]
+        if subtasks:
+            task_decomposed_event = TaskDecomposedEvent(
+                parent_task_id=task.id,
+                subtask_ids=[st.id for st in subtasks],
             )
+            for cb in self._callbacks:
+                cb.log_task_decomposed(task_decomposed_event)
             for subtask in subtasks:
-                self.metrics_logger.log_task_created(
+                task_created_event = TaskCreatedEvent(
                     task_id=subtask.id,
                     description=subtask.content,
                     parent_task_id=task.id,
                     task_type=subtask.type,
                     metadata=subtask.additional_info,
                 )
+                for cb in self._callbacks:
+                    cb.log_task_created(task_created_event)
+
         if subtasks:
-            # If decomposition happened, the original task becomes a container.
-            # We only execute its subtasks.
+            # _pending_tasks will contain both undecomposed
+            # and decomposed tasks, so we use additional_info
+            # to mark the tasks that need decomposition instead
            self._pending_tasks.extendleft(reversed(subtasks))
         else:
             # If no decomposition, execute the original task.
             self._pending_tasks.append(task)
 
+        return subtasks
+
+    @check_if_running(False)
+    async def process_task_async(
+        self, task: Task, interactive: bool = False
+    ) -> Task:
+        r"""Main entry point to process a task asynchronously.
+
+        Args:
+            task (Task): The task to be processed.
+            interactive (bool, optional): If True, enables human-intervention
+                workflow (pause/resume/snapshot). Defaults to False, which
+                runs the task in a blocking one-shot manner.
+
+        Returns:
+            Task: The updated task.
+        """
+        # Delegate to intervention pipeline when requested to keep
+        # backward-compat.
+        if interactive:
+            return await self._process_task_with_snapshot(task)
+
+        subtasks = await self.handle_decompose_append_task(task)
+
         self.set_channel(TaskChannel())
 
         await self.start()
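The refactor lets decomposition be driven separately from execution. A minimal sketch, assuming default model credentials are configured:

```python
import asyncio

from camel.societies.workforce import Workforce
from camel.tasks.task import Task


async def main() -> None:
    workforce = Workforce("Demo")
    task = Task(content="Summarize the attached papers", id="t-0")
    # Decompose and queue without starting execution;
    # process_task_async() now performs exactly this step before
    # opening the task channel and calling start().
    subtasks = await workforce.handle_decompose_append_task(task)
    print(f"{len(subtasks)} subtasks queued")


asyncio.run(main())
```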
@@ -1322,39 +1875,8 @@ class Workforce(BaseNode):
             Task: The updated task.
         """
 
-        if not validate_task_content(task.content, task.id):
-            task.state = TaskState.FAILED
-            task.result = "Task failed: Invalid or empty content provided"
-            logger.warning(
-                f"Task {task.id} rejected: Invalid or empty content. "
-                f"Content preview: '{task.content}'"
-            )
-            return task
-
-        self.reset()
-        self._task = task
-        self._state = WorkforceState.RUNNING
-        task.state = TaskState.FAILED  # TODO: Add logic for OPEN
-
-        # Decompose the task into subtasks first
-        subtasks_result = self._decompose_task(task)
+        await self.handle_decompose_append_task(task)
 
-        # Handle both streaming and non-streaming results
-        if isinstance(subtasks_result, Generator):
-            # This is a generator (streaming mode)
-            subtasks = []
-            for new_tasks in subtasks_result:
-                subtasks.extend(new_tasks)
-        else:
-            # This is a regular list (non-streaming mode)
-            subtasks = subtasks_result
-        if subtasks:
-            # If decomposition happened, the original task becomes a container.
-            # We only execute its subtasks.
-            self._pending_tasks.extendleft(reversed(subtasks))
-        else:
-            # If no decomposition, execute the original task.
-            self._pending_tasks.append(task)
 
         self.set_channel(TaskChannel())
         # Save initial snapshot
@@ -1493,6 +2015,9 @@ class Workforce(BaseNode):
                     start_coroutine, self._loop
                 )
                 self._child_listening_tasks.append(child_task)
+            else:
+                # Close the coroutine to prevent RuntimeWarning
+                start_coroutine.close()
         else:
             # Close the coroutine to prevent RuntimeWarning
             start_coroutine.close()
@@ -1502,6 +2027,7 @@ class Workforce(BaseNode):
         description: str,
         worker: ChatAgent,
         pool_max_size: int = DEFAULT_WORKER_POOL_SIZE,
+        enable_workflow_memory: bool = False,
     ) -> Workforce:
         r"""Add a worker node to the workforce that uses a single agent.
         Can be called when workforce is paused to dynamically add workers.
@@ -1511,6 +2037,9 @@ class Workforce(BaseNode):
             worker (ChatAgent): The agent to be added.
             pool_max_size (int): Maximum size of the agent pool.
                 (default: :obj:`10`)
+            enable_workflow_memory (bool): Whether to enable workflow memory
+                accumulation. Set to True if you plan to call
+                save_workflow_memories(). (default: :obj:`False`)

         Returns:
             Workforce: The workforce node itself.
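A hedged sketch of opting a worker into workflow memory; only `add_single_agent_worker` and the `enable_workflow_memory` flag come from this diff, and the ChatAgent construction is illustrative:

    from camel.agents import ChatAgent
    from camel.societies.workforce import Workforce

    workforce = Workforce("My Team")
    workforce.add_single_agent_worker(
        description="data_analyst",
        worker=ChatAgent("You are a data analyst."),
        # Accumulate workflow context so that save_workflow_memories()
        # has something to persist at the end of the run.
        enable_workflow_memory=True,
    )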
@@ -1537,6 +2066,8 @@ class Workforce(BaseNode):
             worker=worker,
             pool_max_size=pool_max_size,
             use_structured_output_handler=self.use_structured_output_handler,
+            context_utility=None,  # Will be set during save/load operations
+            enable_workflow_memory=enable_workflow_memory,
         )
         self._children.append(worker_node)

@@ -1547,12 +2078,10 @@ class Workforce(BaseNode):
         # If workforce is paused, start the worker's listening task
         self._start_child_node_when_paused(worker_node.start())

-        if self.metrics_logger:
-            self.metrics_logger.log_worker_created(
-                worker_id=worker_node.node_id,
-                worker_type='SingleAgentWorker',
-                role=worker_node.description,
-            )
+        self._notify_worker_created(
+            worker_node,
+            worker_type='SingleAgentWorker',
+        )
         return self

     def add_role_playing_worker(
@@ -1626,12 +2155,10 @@ class Workforce(BaseNode):
         # If workforce is paused, start the worker's listening task
         self._start_child_node_when_paused(worker_node.start())

-        if self.metrics_logger:
-            self.metrics_logger.log_worker_created(
-                worker_id=worker_node.node_id,
-                worker_type='RolePlayingWorker',
-                role=worker_node.description,
-            )
+        self._notify_worker_created(
+            worker_node,
+            worker_type='RolePlayingWorker',
+        )
         return self

     def add_workforce(self, workforce: Workforce) -> Workforce:
@@ -1692,6 +2219,7 @@ class Workforce(BaseNode):
         # Reset intervention state
         self._state = WorkforceState.IDLE
         self._stop_requested = False
+        self._skip_requested = False
         # Handle asyncio.Event in a thread-safe way
         if self._loop and not self._loop.is_closed():
             # If we have a loop, use it to set the event safely
@@ -1707,118 +2235,520 @@ class Workforce(BaseNode):
             # No active loop, directly set the event
             self._pause_event.set()

-        if hasattr(self, 'metrics_logger') and self.metrics_logger is not None:
-            self.metrics_logger.reset_task_data()
-        else:
-            self.metrics_logger = WorkforceLogger(workforce_id=self.node_id)
-
-    @check_if_running(False)
-    def set_channel(self, channel: TaskChannel) -> None:
-        r"""Set the channel for the node and all the child nodes under it."""
-        self._channel = channel
-        for child in self._children:
-            child.set_channel(channel)
+        for cb in self._callbacks:
+            if isinstance(cb, WorkforceMetrics):
+                cb.reset_task_data()

-    def _get_child_nodes_info(self) -> str:
-        r"""Get the information of all the child nodes under this node."""
-        return "".join(
-            f"<{child.node_id}>:<{child.description}>:<{self._get_node_info(child)}>\n"
-            for child in self._children
-        )
-
-    def _get_node_info(self, node) -> str:
-        r"""Get descriptive information for a specific node type."""
-        if isinstance(node, Workforce):
-            return "A Workforce node"
-        elif isinstance(node, SingleAgentWorker):
-            return self._get_single_agent_info(node)
-        elif isinstance(node, RolePlayingWorker):
-            return "A Role playing node"
-        else:
-            return "Unknown node"
+    def save_workflow_memories(
+        self,
+        session_id: Optional[str] = None,
+    ) -> Dict[str, str]:
+        r"""Save workflow memories for all SingleAgentWorker instances in
+        the workforce.
+
+        .. deprecated:: 0.2.80
+            This synchronous method processes workers sequentially, which
+            can be slow for multiple agents. Use
+            :meth:`save_workflow_memories_async` instead for parallel
+            processing and significantly better performance.
+
+        This method iterates through all child workers and triggers
+        workflow saving for SingleAgentWorker instances using their
+        save_workflow_memories() method. Other worker types are skipped.

-    def _get_single_agent_info(self, worker: 'SingleAgentWorker') -> str:
-        r"""Get formatted information for a SingleAgentWorker node."""
-        toolkit_tools = self._group_tools_by_toolkit(worker.worker.tool_dict)
+        Args:
+            session_id (Optional[str]): Custom session ID to use for saving
+                workflows. If None, auto-generates a timestamped session ID.
+                Useful for organizing workflows by project or context.
+                (default: :obj:`None`)

-        if not toolkit_tools:
-            return "no tools available"
+        Returns:
+            Dict[str, str]: Dictionary mapping worker node IDs to save
+                results. Values are either file paths (success) or error
+                messages (failure).

-        toolkit_info = []
-        for toolkit_name, tools in sorted(toolkit_tools.items()):
-            tools_str = ', '.join(sorted(tools))
-            toolkit_info.append(f"{toolkit_name}({tools_str})")
+        Example:
+            >>> workforce = Workforce("My Team")
+            >>> # ... add workers and process tasks ...
+            >>> # save with auto-generated session id
+            >>> results = workforce.save_workflow_memories()
+            >>> print(results)
+            {'worker_123': '/path/to/developer_agent_workflow.md',
+             'worker_456': 'error: No conversation context available'}
+            >>> # save with custom project id
+            >>> results = workforce.save_workflow_memories(
+            ...     session_id="project_123"
+            ... )
+
+        Note:
+            For better performance with multiple workers, use the async
+            version::
+
+                results = await workforce.save_workflow_memories_async()
+
+        See Also:
+            :meth:`save_workflow_memories_async`: Async version with
+                parallel processing for significantly better performance.
+        """
+        import warnings
+
+        warnings.warn(
+            "save_workflow_memories() is slow for multiple workers. "
+            "Consider using save_workflow_memories_async() for parallel "
+            "processing and ~4x faster performance.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        results = {}

-        return " | ".join(toolkit_info)
+        # Get or create shared context utility for this save operation
+        shared_context_utility = self._get_or_create_shared_context_utility(
+            session_id=session_id
+        )

-    def _group_tools_by_toolkit(self, tool_dict: dict) -> dict[str, list[str]]:
-        r"""Group tools by their parent toolkit class names."""
-        toolkit_tools: dict[str, list[str]] = {}
+        for child in self._children:
+            if isinstance(child, SingleAgentWorker):
+                try:
+                    # Set the shared context utility for this operation
+                    child._shared_context_utility = shared_context_utility
+                    child.worker.set_context_utility(shared_context_utility)
+
+                    result = child.save_workflow_memories()
+                    if result.get("status") == "success":
+                        results[child.node_id] = result.get(
+                            "file_path", "unknown_path"
+                        )
+                    else:
+                        # Error: check if there's a separate message field,
+                        # otherwise use the status itself
+                        error_msg = result.get(
+                            "message", result.get("status", "Unknown error")
+                        )
+                        results[child.node_id] = f"error: {error_msg}"

-        for tool_name, tool in tool_dict.items():
-            if hasattr(tool.func, '__self__'):
-                toolkit_name = tool.func.__self__.__class__.__name__
+                except Exception as e:
+                    results[child.node_id] = f"error: {e!s}"
             else:
-                toolkit_name = "Standalone"
-
-            if toolkit_name not in toolkit_tools:
-                toolkit_tools[toolkit_name] = []
-            toolkit_tools[toolkit_name].append(tool_name)
+                # Skip non-SingleAgentWorker types
+                results[child.node_id] = (
+                    f"skipped: {type(child).__name__} not supported"
+                )

-        return toolkit_tools
+        logger.info(f"Workflow save completed for {len(results)} workers")
+        return results

-    def _get_valid_worker_ids(self) -> set:
-        r"""Get all valid worker IDs from child nodes.
+    async def save_workflow_memories_async(
+        self,
+        session_id: Optional[str] = None,
+    ) -> Dict[str, str]:
+        r"""Asynchronously save workflow memories for all SingleAgentWorker
+        instances in the workforce.

-        Returns:
-            set: Set of valid worker IDs that can be assigned tasks.
-        """
-        valid_worker_ids = {child.node_id for child in self._children}
-        return valid_worker_ids
+        This is the async version of save_workflow_memories() that
+        parallelizes LLM summarization calls across all workers using
+        asyncio.gather(), significantly reducing total save time.

-    def _call_coordinator_for_assignment(
-        self, tasks: List[Task], invalid_ids: Optional[List[str]] = None
-    ) -> TaskAssignResult:
-        r"""Call coordinator agent to assign tasks with optional validation
-        feedback in the case of invalid worker IDs.
+        This method iterates through all child workers and triggers
+        workflow saving for SingleAgentWorker instances using their
+        save_workflow_memories_async() method in parallel. Other worker
+        types are skipped.

         Args:
-            tasks (List[Task]): Tasks to assign.
-            invalid_ids (List[str], optional): Invalid worker IDs from previous
-                attempt (if any).
+            session_id (Optional[str]): Custom session ID to use for saving
+                workflows. If None, auto-generates a timestamped session ID.
+                Useful for organizing workflows by project or context.
+                (default: :obj:`None`)

         Returns:
-            TaskAssignResult: Assignment result from coordinator.
+            Dict[str, str]: Dictionary mapping worker node IDs to save
+                results. Values are either file paths (success) or error
+                messages (failure).
+
+        Example:
+            >>> workforce = Workforce("My Team")
+            >>> # ... add workers and process tasks ...
+            >>> # save with parallel summarization (faster)
+            >>> results = await workforce.save_workflow_memories_async()
+            >>> print(results)
+            {'worker_123': '/path/to/developer_agent_workflow.md',
+             'worker_456': '/path/to/search_agent_workflow.md',
+             'worker_789': '/path/to/document_agent_workflow.md'}
         """
-        # format tasks information for the prompt
-        tasks_info = ""
-        for task in tasks:
-            tasks_info += f"Task ID: {task.id}\n"
-            tasks_info += f"Content: {task.content}\n"
-            if task.additional_info:
-                tasks_info += f"Additional Info: {task.additional_info}\n"
-            tasks_info += "---\n"
+        import asyncio

-        prompt = str(
-            ASSIGN_TASK_PROMPT.format(
-                tasks_info=tasks_info,
-                child_nodes_info=self._get_child_nodes_info(),
-            )
+        results = {}
+
+        # Get or create shared context utility for this save operation
+        shared_context_utility = self._get_or_create_shared_context_utility(
+            session_id=session_id
         )

-        # add feedback if this is a retry
-        if invalid_ids:
-            valid_worker_ids = list(self._get_valid_worker_ids())
-            feedback = (
-                f"VALIDATION ERROR: The following worker IDs are invalid: "
-                f"{invalid_ids}. "
-                f"VALID WORKER IDS: {valid_worker_ids}. "
-                f"Please reassign ONLY the above tasks using these valid IDs."
-            )
-            prompt = prompt + f"\n\n{feedback}"
+        # Prepare tasks for parallel execution
+        async def save_single_worker(
+            child: BaseNode,
+        ) -> tuple[str, str]:
+            """Save workflow for a single worker, then return (node_id,
+            result)."""
+            if isinstance(child, SingleAgentWorker):
+                try:
+                    # Set the shared context utility for this operation
+                    child._shared_context_utility = shared_context_utility
+                    child.worker.set_context_utility(shared_context_utility)
+
+                    result = await child.save_workflow_memories_async()
+                    if result.get("status") == "success":
+                        return (
+                            child.node_id,
+                            result.get("file_path", "unknown_path"),
+                        )
+                    else:
+                        # Error: check if there's a separate message field,
+                        # otherwise use the status itself
+                        error_msg = result.get(
+                            "message", result.get("status", "Unknown error")
+                        )
+                        return (child.node_id, f"error: {error_msg}")

-        # Check if we should use structured handler
-        if self.use_structured_output_handler:
+                except Exception as e:
+                    return (child.node_id, f"error: {e!s}")
+            else:
+                # Skip non-SingleAgentWorker types
+                return (
+                    child.node_id,
+                    f"skipped: {type(child).__name__} not supported",
+                )
+
+        # Create tasks for all workers
+        tasks = [save_single_worker(child) for child in self._children]
+
+        # Execute all tasks in parallel using asyncio.gather()
+        parallel_results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        # Process results
+        for result in parallel_results:
+            if isinstance(result, Exception):
+                # Handle any unexpected exceptions
+                logger.error(
+                    f"Unexpected error during workflow save: {result}"
+                )
+                results["unknown"] = f"error: {result!s}"
+            elif isinstance(result, tuple) and len(result) == 2:
+                # Successfully got (node_id, save_result) tuple
+                node_id, save_result = result
+                results[node_id] = save_result
+            else:
+                # Unexpected result format
+                logger.error(f"Unexpected result format: {result}")
+                results["unknown"] = "error: unexpected result format"
+
+        logger.info(
+            f"Workflow save completed for {len(results)} workers "
+            f"(parallelized)"
+        )
+        return results
+
+    def load_workflow_memories(
+        self,
+        session_id: Optional[str] = None,
+        worker_max_workflows: int = 3,
+        coordinator_max_workflows: int = 5,
+        task_agent_max_workflows: int = 3,
+    ) -> Dict[str, bool]:
+        r"""Load workflow memories for all SingleAgentWorker instances in
+        the workforce.
+
+        This method iterates through all child workers and loads relevant
+        workflow files for SingleAgentWorker instances using their
+        load_workflow_memories() method. Workers match files based on their
+        description names.
+
+        Args:
+            session_id (Optional[str]): Specific workforce session ID to
+                load from. If None, searches across all sessions.
+                (default: :obj:`None`)
+            worker_max_workflows (int): Maximum number of workflow files to
+                load per worker agent. (default: :obj:`3`)
+            coordinator_max_workflows (int): Maximum number of workflow
+                files to load for the coordinator agent. (default: :obj:`5`)
+            task_agent_max_workflows (int): Maximum number of workflow files
+                to load for the task planning agent. (default: :obj:`3`)
+
+        Returns:
+            Dict[str, bool]: Dictionary mapping worker node IDs to load
+                success status. True indicates successful loading, False
+                indicates failure.
+
+        Example:
+            >>> workforce = Workforce("My Team")
+            >>> workforce.add_single_agent_worker(
+            ...     "data_analyst", analyst_agent
+            ... )
+            >>> success_status = workforce.load_workflow_memories(
+            ...     worker_max_workflows=5,
+            ...     coordinator_max_workflows=10,
+            ...     task_agent_max_workflows=5
+            ... )
+            >>> print(success_status)
+            {'worker_123': True}  # Successfully loaded workflows for
+                                  # data_analyst
+        """
+        results = {}
+
+        # For loading, we don't create a new session - instead we search
+        # existing ones. Each worker will search independently across all
+        # existing sessions.
+
+        # First, load workflows for SingleAgentWorker instances
+        for child in self._children:
+            if isinstance(child, SingleAgentWorker):
+                try:
+                    # For loading, don't set shared context utility;
+                    # let each worker search across existing sessions
+                    success = child.load_workflow_memories(
+                        max_workflows=worker_max_workflows,
+                        session_id=session_id,
+                    )
+                    results[child.node_id] = success
+
+                except Exception as e:
+                    logger.error(
+                        f"Failed to load workflow for {child.node_id}: {e!s}"
+                    )
+                    results[child.node_id] = False
+            else:
+                # Skip non-SingleAgentWorker types
+                results[child.node_id] = False
+
+        # Load aggregated workflow summaries for coordinator and task agents
+        self._load_management_agent_workflows(
+            coordinator_max_workflows, task_agent_max_workflows, session_id
+        )
+
+        logger.info(f"Workflow load completed for {len(results)} workers")
+        return results
+
+    def _load_management_agent_workflows(
+        self,
+        coordinator_max_workflows: int,
+        task_agent_max_workflows: int,
+        session_id: Optional[str] = None,
+    ) -> None:
+        r"""Load workflow summaries for coordinator and task planning agents.
+
+        This method loads aggregated workflow summaries to help:
+        - Coordinator agent: understand task assignment patterns and worker
+          capabilities
+        - Task agent: understand task decomposition patterns and successful
+          strategies
+
+        Args:
+            coordinator_max_workflows (int): Maximum number of workflow
+                files to load for the coordinator agent.
+            task_agent_max_workflows (int): Maximum number of workflow files
+                to load for the task planning agent.
+            session_id (Optional[str]): Specific session ID to load from.
+                If None, searches across all sessions.
+        """
+        try:
+            import glob
+            import os
+            from pathlib import Path
+
+            from camel.utils.context_utils import ContextUtility
+
+            # For loading management workflows, search across all sessions
+            camel_workdir = os.environ.get("CAMEL_WORKDIR")
+            if camel_workdir:
+                base_dir = os.path.join(camel_workdir, "workforce_workflows")
+            else:
+                base_dir = "workforce_workflows"
+
+            # Search for workflow files in specified or all session
+            # directories
+            if session_id:
+                search_path = str(
+                    Path(base_dir) / session_id / "*_workflow*.md"
+                )
+            else:
+                search_path = str(Path(base_dir) / "*" / "*_workflow*.md")
+            workflow_files = glob.glob(search_path)
+
+            if not workflow_files:
+                logger.info(
+                    "No workflow files found for management agent context"
+                )
+                return
+
+            # Sort by modification time (most recent first)
+            workflow_files.sort(
+                key=lambda x: os.path.getmtime(x), reverse=True
+            )
+
+            # Load workflows for coordinator agent
+            coordinator_loaded = 0
+            for file_path in workflow_files[:coordinator_max_workflows]:
+                try:
+                    filename = os.path.basename(file_path).replace('.md', '')
+                    session_dir = os.path.dirname(file_path)
+                    session_id = os.path.basename(session_dir)
+
+                    # Use shared context utility with specific session
+                    temp_utility = ContextUtility.get_workforce_shared(
+                        session_id
+                    )
+
+                    status = temp_utility.load_markdown_context_to_memory(
+                        self.coordinator_agent, filename
+                    )
+                    if "Context appended" in status:
+                        coordinator_loaded += 1
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to load coordinator workflow {file_path}: {e}"
+                    )
+
+            # Load workflows for task agent
+            task_agent_loaded = 0
+            for file_path in workflow_files[:task_agent_max_workflows]:
+                try:
+                    filename = os.path.basename(file_path).replace('.md', '')
+                    session_dir = os.path.dirname(file_path)
+                    session_id = os.path.basename(session_dir)
+
+                    # Use shared context utility with specific session
+                    temp_utility = ContextUtility.get_workforce_shared(
+                        session_id
+                    )
+
+                    status = temp_utility.load_markdown_context_to_memory(
+                        self.task_agent, filename
+                    )
+                    if "Context appended" in status:
+                        task_agent_loaded += 1
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to load task agent workflow {file_path}: {e}"
+                    )
+
+            logger.info(
+                f"Loaded {coordinator_loaded} workflows for coordinator, "
+                f"{task_agent_loaded} workflows for task agent"
+            )
+
+        except Exception as e:
+            logger.error(f"Error loading management agent workflows: {e}")
+
+    @check_if_running(False)
+    def set_channel(self, channel: TaskChannel) -> None:
+        r"""Set the channel for the node and all the child nodes under it."""
+        self._channel = channel
+        for child in self._children:
+            child.set_channel(channel)
+
+    def _get_child_nodes_info(self) -> str:
+        r"""Get the information of all the child nodes under this node."""
+        return "".join(
+            f"<{child.node_id}>:<{child.description}>:<{self._get_node_info(child)}>\n"
+            for child in self._children
+        )
+
+    def _get_node_info(self, node) -> str:
+        r"""Get descriptive information for a specific node type."""
+        if isinstance(node, Workforce):
+            return "A Workforce node"
+        elif isinstance(node, SingleAgentWorker):
+            return self._get_single_agent_toolkit_info(node)
+        elif isinstance(node, RolePlayingWorker):
+            return "A Role playing node"
+        else:
+            return "Unknown node"
+
+    def _get_single_agent_toolkit_info(
+        self, worker: 'SingleAgentWorker'
+    ) -> str:
+        r"""Get formatted information for a SingleAgentWorker node."""
+        toolkit_tools = self._group_tools_by_toolkit(worker.worker.tool_dict)
+
+        if not toolkit_tools:
+            return ""
+
+        toolkit_info = []
+        for toolkit_name, tools in sorted(toolkit_tools.items()):
+            tools_str = ', '.join(sorted(tools))
+            toolkit_info.append(f"{toolkit_name}({tools_str})")
+
+        return ", ".join(toolkit_info)
+
+    def _group_tools_by_toolkit(self, tool_dict: dict) -> dict[str, list[str]]:
+        r"""Group tools by their parent toolkit class names."""
+        toolkit_tools: dict[str, list[str]] = {}
+
+        for tool_name, tool in tool_dict.items():
+            if hasattr(tool.func, '__self__'):
+                toolkit_name = tool.func.__self__.__class__.__name__
+            else:
+                toolkit_name = "Standalone"
+
+            if toolkit_name not in toolkit_tools:
+                toolkit_tools[toolkit_name] = []
+            toolkit_tools[toolkit_name].append(tool_name)
+
+        return toolkit_tools
+
+    def _get_valid_worker_ids(self) -> set:
+        r"""Get all valid worker IDs from child nodes.
+
+        Returns:
+            set: Set of valid worker IDs that can be assigned tasks.
+        """
+        valid_worker_ids = {child.node_id for child in self._children}
+        return valid_worker_ids
+
+    def _call_coordinator_for_assignment(
+        self, tasks: List[Task], invalid_ids: Optional[List[str]] = None
+    ) -> TaskAssignResult:
+        r"""Call coordinator agent to assign tasks with optional validation
+        feedback in the case of invalid worker IDs.
+
+        Args:
+            tasks (List[Task]): Tasks to assign.
+            invalid_ids (List[str], optional): Invalid worker IDs from
+                previous attempt (if any).
+
+        Returns:
+            TaskAssignResult: Assignment result from coordinator.
+        """
+        # format tasks information for the prompt
+        tasks_info = ""
+        for task in tasks:
+            tasks_info += f"Task ID: {task.id}\n"
+            tasks_info += f"Content: {task.content}\n"
+            if task.additional_info:
+                tasks_info += f"Additional Info: {task.additional_info}\n"
+            tasks_info += "---\n"
+
+        prompt = str(
+            ASSIGN_TASK_PROMPT.format(
+                tasks_info=tasks_info,
+                child_nodes_info=self._get_child_nodes_info(),
+            )
+        )
+
+        # add feedback if this is a retry
+        if invalid_ids:
+            valid_worker_ids = list(self._get_valid_worker_ids())
+            feedback = (
+                f"VALIDATION ERROR: The following worker IDs are invalid: "
+                f"{invalid_ids}. "
+                f"VALID WORKER IDS: {valid_worker_ids}. "
+                f"Please reassign ONLY the above tasks using these valid IDs."
+            )
+            prompt = prompt + f"\n\n{feedback}"
+
+        # Check if we should use structured handler
+        if self.use_structured_output_handler:
             # Use structured handler for prompt-based extraction
             enhanced_prompt = (
                 self.structured_handler.generate_structured_prompt(
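Taken together, the new persistence surface can be exercised as follows; a sketch assuming the workers were created with enable_workflow_memory=True:

    async def persist_and_restore(workforce):
        # Parallel save: worker summaries run concurrently via
        # asyncio.gather(), sharing one session directory per call.
        results = await workforce.save_workflow_memories_async(
            session_id="project_123"
        )
        print(results)  # node_id -> file path, or "error: ..."

        # Later, e.g. in a fresh process: reload matching workflows for
        # workers, coordinator, and task agent.
        status = workforce.load_workflow_memories(
            session_id="project_123",
            worker_max_workflows=3,
        )
        print(status)  # node_id -> True/False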
@@ -2057,8 +2987,40 @@ class Workforce(BaseNode):
             TaskAssignResult: Assignment result containing task assignments
             with their dependencies.
         """
+        # Wait for workers to be ready before assignment with exponential
+        # backoff
+        worker_readiness_timeout = 2.0  # Maximum wait time in seconds
+        worker_readiness_check_interval = 0.05  # Initial check interval
+        start_time = time.time()
+        check_interval = worker_readiness_check_interval
+        backoff_multiplier = 1.5  # Exponential backoff factor
+        max_interval = 0.5  # Cap the maximum interval
+
+        while (time.time() - start_time) < worker_readiness_timeout:
+            valid_worker_ids = self._get_valid_worker_ids()
+            if len(valid_worker_ids) > 0:
+                elapsed = time.time() - start_time
+                logger.debug(
+                    f"Workers ready after {elapsed:.3f}s: "
+                    f"{len(valid_worker_ids)} workers available"
+                )
+                break
+
+            await asyncio.sleep(check_interval)
+            # Exponential backoff with cap
+            check_interval = min(
+                check_interval * backoff_multiplier, max_interval
+            )
+        else:
+            # Timeout reached, log warning but continue
+            logger.warning(
+                f"Worker readiness timeout after "
+                f"{worker_readiness_timeout}s, "
+                f"proceeding with {len(self._children)} children"
+            )
+            valid_worker_ids = self._get_valid_worker_ids()
+
         self.coordinator_agent.reset()
-        valid_worker_ids = self._get_valid_worker_ids()

         logger.debug(
             f"Sending batch assignment request to coordinator "
@@ -2092,7 +3054,24 @@ class Workforce(BaseNode):
                 invalid_assignments, tasks, valid_worker_ids
             )
         )
-        all_assignments = valid_assignments + retry_and_fallback_assignments
+
+        # Combine assignments with deduplication, prioritizing retry results
+        assignment_map = {a.task_id: a for a in valid_assignments}
+        assignment_map.update(
+            {a.task_id: a for a in retry_and_fallback_assignments}
+        )
+        all_assignments = list(assignment_map.values())
+
+        # Log any overwrites for debugging
+        valid_task_ids = {a.task_id for a in valid_assignments}
+        retry_task_ids = {a.task_id for a in retry_and_fallback_assignments}
+        overlap_task_ids = valid_task_ids & retry_task_ids
+
+        if overlap_task_ids:
+            logger.warning(
+                f"Retry assignments overrode {len(overlap_task_ids)} "
+                f"valid assignments for tasks: {sorted(overlap_task_ids)}"
+            )

         # Update Task.dependencies for all final assignments
         self._update_task_dependencies_from_assignments(all_assignments, tasks)
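The merge relies on dict update semantics: entries from the retry pass overwrite same-key entries from the first pass. A minimal sketch with a hypothetical stand-in for the assignment type:

    from dataclasses import dataclass

    @dataclass
    class Assignment:  # illustrative stand-in for the real assignment model
        task_id: str
        assignee_id: str

    valid = [Assignment("t1", "w1"), Assignment("t2", "w2")]
    retry = [Assignment("t2", "w9")]  # reassigned after validation failure

    merged = {a.task_id: a for a in valid}
    merged.update({a.task_id: a for a in retry})  # retry wins for "t2"
    print([(a.task_id, a.assignee_id) for a in merged.values()])
    # [('t1', 'w1'), ('t2', 'w9')]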
@@ -2105,10 +3084,11 @@ class Workforce(BaseNode):

         task.assigned_worker_id = assignee_id

-        if self.metrics_logger:
-            self.metrics_logger.log_task_started(
-                task_id=task.id, worker_id=assignee_id
-            )
+        task_started_event = TaskStartedEvent(
+            task_id=task.id, worker_id=assignee_id
+        )
+        for cb in self._callbacks:
+            cb.log_task_started(task_started_event)

         try:
             await self._channel.post_task(task, self.node_id, assignee_id)
@@ -2140,10 +3120,12 @@ class Workforce(BaseNode):
         Returns:
             Worker: The created worker node.
         """
-        prompt = CREATE_NODE_PROMPT.format(
-            content=task.content,
-            child_nodes_info=self._get_child_nodes_info(),
-            additional_info=task.additional_info,
+        prompt = str(
+            CREATE_NODE_PROMPT.format(
+                content=task.content,
+                child_nodes_info=self._get_child_nodes_info(),
+                additional_info=task.additional_info,
+            )
         )
         # Check if we should use structured handler
         if self.use_structured_output_handler:
@@ -2170,8 +3152,7 @@ class Workforce(BaseNode):
                 "worker creation"
             )
             new_node_conf = WorkerConf(
-                description=f"Fallback worker for task: "
-                f"{task.content}",
+                description=f"Fallback worker for task: {task.content}",
                 role="General Assistant",
                 sys_msg="You are a general assistant that can help "
                 "with various tasks.",
@@ -2181,7 +3162,7 @@ class Workforce(BaseNode):
                 response.msg.content,
                 schema=WorkerConf,
                 fallback_values={
-                    "description": f"Worker for task: " f"{task.content}",
+                    "description": f"Worker for task: {task.content}",
                     "role": "Task Specialist",
                     "sys_msg": f"You are a specialist for: {task.content}",
                 },
@@ -2209,8 +3190,7 @@ class Workforce(BaseNode):
             )
             # Create a fallback worker configuration
             new_node_conf = WorkerConf(
-                description=f"Fallback worker for "
-                f"task: {task.content}",
+                description=f"Fallback worker for task: {task.content}",
                 role="General Assistant",
                 sys_msg="You are a general assistant that can help "
                 "with various tasks.",
@@ -2254,13 +3234,13 @@ class Workforce(BaseNode):
         print(f"{Fore.CYAN}{new_node} created.{Fore.RESET}")

         self._children.append(new_node)
-        if self.metrics_logger:
-            self.metrics_logger.log_worker_created(
-                worker_id=new_node.node_id,
-                worker_type='SingleAgentWorker',
-                role=new_node_conf.role,
-                metadata={'description': new_node_conf.description},
-            )
+
+        self._notify_worker_created(
+            new_node,
+            worker_type='SingleAgentWorker',
+            role=new_node_conf.role,
+            metadata={'description': new_node_conf.description},
+        )
         self._child_listening_tasks.append(
             asyncio.create_task(new_node.start())
         )
@@ -2304,13 +3284,27 @@ class Workforce(BaseNode):
         r"""Get the task that's published by this node and just get returned
         from the assignee. Includes timeout handling to prevent indefinite
         waiting.
+
+        Raises:
+            asyncio.TimeoutError: If waiting for task exceeds timeout
         """
         try:
             # Add timeout to prevent indefinite waiting
             return await asyncio.wait_for(
                 self._channel.get_returned_task_by_publisher(self.node_id),
-                timeout=TASK_TIMEOUT_SECONDS,
+                timeout=self.task_timeout_seconds,
             )
+        except asyncio.TimeoutError:
+            # Re-raise timeout errors to be handled by caller
+            # This prevents hanging when tasks are stuck
+            logger.warning(
+                f"Timeout waiting for task return in workforce "
+                f"{self.node_id}. "
+                f"Timeout: {self.task_timeout_seconds}s, "
+                f"Pending tasks: {len(self._pending_tasks)}, "
+                f"In-flight tasks: {self._in_flight_tasks}"
+            )
+            raise
         except Exception as e:
             error_msg = (
                 f"Error getting returned task {e} in "
@@ -2329,7 +3323,15 @@ class Workforce(BaseNode):
         tasks_to_assign = [
             task
             for task in self._pending_tasks
-            if task.id not in self._task_dependencies
+            if (
+                task.id not in self._task_dependencies
+                and (
+                    task.additional_info is None
+                    or not task.additional_info.get(
+                        "_needs_decomposition", False
+                    )
+                )
+            )
         ]
         if tasks_to_assign:
             logger.debug(
@@ -2339,22 +3341,24 @@ class Workforce(BaseNode):
             batch_result = await self._find_assignee(tasks_to_assign)
             logger.debug(
                 f"Coordinator returned assignments:\n"
-                f"{json.dumps(batch_result.dict(), indent=2)}"
+                f"{json.dumps(batch_result.model_dump(), indent=2)}"
             )
             for assignment in batch_result.assignments:
                 self._task_dependencies[assignment.task_id] = (
                     assignment.dependencies
                 )
                 self._assignees[assignment.task_id] = assignment.assignee_id
-                if self.metrics_logger:
+
+                task_assigned_event = TaskAssignedEvent(
+                    task_id=assignment.task_id,
+                    worker_id=assignment.assignee_id,
+                    dependencies=assignment.dependencies,
+                    queue_time_seconds=None,
+                )
+                for cb in self._callbacks:
                     # queue_time_seconds can be derived by logger if task
                     # creation time is logged
-                    self.metrics_logger.log_task_assigned(
-                        task_id=assignment.task_id,
-                        worker_id=assignment.assignee_id,
-                        dependencies=assignment.dependencies,
-                        queue_time_seconds=None,
-                    )
+                    cb.log_task_assigned(task_assigned_event)

         # Step 2: Iterate through all pending tasks and post those that are
         # ready
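Since events now fan out to every registered callback, an observer only needs the hook methods it uses. A duck-typed sketch (the callback base class and its registration API are not shown in this hunk):

    class AssignmentPrinter:
        def log_task_assigned(self, event):
            # Fields mirror the TaskAssignedEvent constructed above.
            print(
                f"task {event.task_id} -> {event.worker_id}, "
                f"deps={event.dependencies}"
            )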
@@ -2365,21 +3369,139 @@ class Workforce(BaseNode):
         for task in self._pending_tasks:
             # A task must be assigned to be considered for posting
             if task.id in self._task_dependencies:
+                # Skip if task has already been posted to prevent duplicates
+                try:
+                    task_from_channel = await self._channel.get_task_by_id(
+                        task.id
+                    )
+                    # Check if task is already assigned to a worker
+                    if (
+                        task_from_channel
+                        and task_from_channel.assigned_worker_id
+                    ):
+                        logger.debug(
+                            f"Task {task.id} already assigned to "
+                            f"{task_from_channel.assigned_worker_id}, "
+                            f"skipping to prevent duplicate"
+                        )
+                        continue
+                except Exception as e:
+                    logger.info(
+                        f"Task {task.id} not found in channel. "
+                        f"Assigning task: {e}"
+                    )
                 dependencies = self._task_dependencies[task.id]
-                # Check if all dependencies for this task are in the completed
-                # set and their state is DONE
-                if all(
-                    dep_id in completed_tasks_info
-                    and completed_tasks_info[dep_id] == TaskState.DONE
-                    for dep_id in dependencies
-                ):
-                    assignee_id = self._assignees[task.id]
-                    logger.debug(
-                        f"Posting task {task.id} to assignee {assignee_id}. "
-                        f"Dependencies met."
+
+                # Check if all dependencies are in completed state
+                all_deps_completed = all(
+                    dep_id in completed_tasks_info for dep_id in dependencies
+                )
+
+                # Only proceed with dependency checks if all deps are completed
+                if all_deps_completed:
+                    # Check if all dependencies succeeded (state is DONE)
+                    all_deps_done = all(
+                        completed_tasks_info[dep_id] == TaskState.DONE
+                        for dep_id in dependencies
                     )
-                    await self._post_task(task, assignee_id)
-                    posted_tasks.append(task)
+
+                    # Check if any dependency failed
+                    any_dep_failed = any(
+                        completed_tasks_info[dep_id] == TaskState.FAILED
+                        for dep_id in dependencies
+                    )
+
+                    if all_deps_done:
+                        # All dependencies completed successfully - post the
+                        # task
+                        assignee_id = self._assignees[task.id]
+                        logger.debug(
+                            f"Posting task {task.id} to "
+                            f"assignee {assignee_id}. "
+                            f"Dependencies met."
+                        )
+                        await self._post_task(task, assignee_id)
+                        posted_tasks.append(task)
+                    elif any_dep_failed:
+                        # Check if any failed dependencies can still be retried
+                        failed_deps = [
+                            dep_id
+                            for dep_id in dependencies
+                            if completed_tasks_info[dep_id] == TaskState.FAILED
+                        ]
+
+                        # Check if any failed dependency is still retryable
+                        failed_tasks_with_retry_potential = []
+                        permanently_failed_deps = []
+
+                        for dep_id in failed_deps:
+                            # Find the failed dependency task
+                            failed_task = next(
+                                (
+                                    t
+                                    for t in self._completed_tasks
+                                    if t.id == dep_id
+                                ),
+                                None,
+                            )
+                            if (
+                                failed_task
+                                and failed_task.failure_count
+                                < MAX_TASK_RETRIES
+                            ):
+                                failed_tasks_with_retry_potential.append(
+                                    dep_id
+                                )
+                            else:
+                                permanently_failed_deps.append(dep_id)
+
+                        # Only fail the task if ALL dependencies are
+                        # permanently failed
+                        if (
+                            permanently_failed_deps
+                            and not failed_tasks_with_retry_potential
+                        ):
+                            logger.error(
+                                f"Task {task.id} cannot proceed: dependencies "
+                                f"{permanently_failed_deps} have "
+                                f"permanently failed. "
+                                f"Marking task as failed."
+                            )
+                            task.state = TaskState.FAILED
+                            task.result = (
+                                f"Task failed due to permanently "
+                                f"failed dependencies: "
+                                f"{permanently_failed_deps}"
+                            )
+
+                            # Log the failure to metrics
+                            task_failed_event = TaskFailedEvent(
+                                task_id=task.id,
+                                worker_id=task.assigned_worker_id or "unknown",
+                                error_message=task.result,
+                                metadata={
+                                    'failure_reason': 'dependency_failure',
+                                    'failed_dependencies': (
+                                        permanently_failed_deps
+                                    ),
+                                },
+                            )
+                            for cb in self._callbacks:
+                                cb.log_task_failed(task_failed_event)
+
+                            self._completed_tasks.append(task)
+                            self._cleanup_task_tracking(task.id)
+                            posted_tasks.append(task)  # Remove from pending
+                        else:
+                            # Some dependencies may still be retried, keep
+                            # task pending
+                            logger.debug(
+                                f"Task {task.id} waiting: dependencies "
+                                f"{failed_tasks_with_retry_potential} "
+                                f"failed but may be retried "
+                                f"(attempt < {MAX_TASK_RETRIES})"
+                            )
+                # else: Not all dependencies completed yet, skip this task

         # Step 3: Remove the posted tasks from the pending list
         for task in posted_tasks:
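The gating above reduces to a three-way decision per task; a minimal sketch of the same rule with illustrative state strings:

    from enum import Enum

    class Gate(Enum):
        POST = "post"  # all dependencies DONE -> dispatch
        FAIL = "fail"  # all failed dependencies out of retries -> fail
        WAIT = "wait"  # otherwise keep the task pending

    def gate(dep_states: dict, retries_left: dict) -> Gate:
        if any(s not in ("DONE", "FAILED") for s in dep_states.values()):
            return Gate.WAIT
        if all(s == "DONE" for s in dep_states.values()):
            return Gate.POST
        failed = [d for d, s in dep_states.items() if s == "FAILED"]
        # Fail only when every failed dependency has no retries left.
        if all(retries_left.get(d, 0) == 0 for d in failed):
            return Gate.FAIL
        return Gate.WAIT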
@@ -2391,53 +3513,59 @@ class Workforce(BaseNode):
             pass

     async def _handle_failed_task(self, task: Task) -> bool:
+        r"""Handle a task that failed during execution.
+
+        Args:
+            task (Task): The failed task
+
+        Returns:
+            bool: True if workforce should halt, False otherwise
+        """
         task.failure_count += 1

         # Determine detailed failure information
-        # Use the actual error/result stored in task.result
         failure_reason = task.result or "Unknown error"
-
-        # Add context about the worker and task
         worker_id = task.assigned_worker_id or "unknown"
-        worker_info = f" (assigned to worker: {worker_id})"
-
-        detailed_error = f"{failure_reason}{worker_info}"
+        detailed_error = f"{failure_reason} (assigned to worker: {worker_id})"

         logger.error(
             f"Task {task.id} failed (attempt "
-            f"{task.failure_count}/3): {detailed_error}"
+            f"{task.failure_count}/{MAX_TASK_RETRIES}): {detailed_error}"
         )

-        if self.metrics_logger:
-            self.metrics_logger.log_task_failed(
-                task_id=task.id,
-                worker_id=worker_id,
-                error_message=detailed_error,
-                metadata={
-                    'failure_count': task.failure_count,
-                    'task_content': task.content,
-                    'result_length': len(task.result) if task.result else 0,
-                },
-            )
+        print(
+            f"{Fore.RED}❌ Task {task.id} failed "
+            f"(attempt {task.failure_count}/{MAX_TASK_RETRIES}): "
+            f"{failure_reason}{Fore.RESET}"
+        )
+
+        task_failed_event = TaskFailedEvent(
+            task_id=task.id,
+            worker_id=worker_id,
+            error_message=detailed_error,
+            metadata={
+                'failure_count': task.failure_count,
+                'task_content': task.content,
+                'result_length': len(task.result) if task.result else 0,
+            },
+        )
+        for cb in self._callbacks:
+            cb.log_task_failed(task_failed_event)

-        # Check for immediate halt conditions - return immediately if we
-        # should halt
+        # Check for immediate halt conditions
         if task.failure_count >= MAX_TASK_RETRIES:
             logger.error(
                 f"Task {task.id} has exceeded maximum retry attempts "
-                f"({MAX_TASK_RETRIES}). Final failure "
-                f"reason: {detailed_error}. "
+                f"({MAX_TASK_RETRIES}). Final failure reason: "
+                f"{detailed_error}. "
                 f"Task content: '{task.content}'"
             )
             self._cleanup_task_tracking(task.id)
-            # Mark task as completed for dependency tracking before halting
             self._completed_tasks.append(task)
             if task.id in self._assignees:
                 await self._channel.archive_task(task.id)
             return True

-        # If too many tasks are failing rapidly, also halt to prevent infinite
-        # loops
         if len(self._pending_tasks) > MAX_PENDING_TASKS_LIMIT:
             logger.error(
                 f"Too many pending tasks ({len(self._pending_tasks)} > "
@@ -2445,18 +3573,24 @@ class Workforce(BaseNode):
                 f"explosion. Last failed task: {task.id}"
             )
             self._cleanup_task_tracking(task.id)
-            # Mark task as completed for dependency tracking before halting
             self._completed_tasks.append(task)
             if task.id in self._assignees:
                 await self._channel.archive_task(task.id)
             return True

         # Use intelligent failure analysis to decide recovery strategy
-        recovery_decision = self._analyze_failure(task, detailed_error)
+        recovery_decision = self._analyze_task(
+            task, for_failure=True, error_message=detailed_error
+        )

+        strategy_str = (
+            recovery_decision.recovery_strategy.value
+            if recovery_decision.recovery_strategy
+            else "none"
+        )
         logger.info(
             f"Task {task.id} failure "
-            f"analysis: {recovery_decision.strategy.value} - "
+            f"analysis: {strategy_str} - "
             f"{recovery_decision.reasoning}"
         )
@@ -2465,105 +3599,23 @@ class Workforce(BaseNode):
             await self._channel.archive_task(task.id)
         self._cleanup_task_tracking(task.id)

+        # Apply recovery strategy
         try:
-            if recovery_decision.strategy == RecoveryStrategy.RETRY:
-                # Simply retry the task by reposting it
-                if task.id in self._assignees:
-                    assignee_id = self._assignees[task.id]
-                    await self._post_task(task, assignee_id)
-                    action_taken = f"retried with same worker {assignee_id}"
-                else:
-                    # Find a new assignee and retry
-                    batch_result = await self._find_assignee([task])
-                    assignment = batch_result.assignments[0]
-                    self._assignees[task.id] = assignment.assignee_id
-                    await self._post_task(task, assignment.assignee_id)
-                    action_taken = (
-                        f"retried with new worker {assignment.assignee_id}"
-                    )
-
-            elif recovery_decision.strategy == RecoveryStrategy.REPLAN:
-                # Modify the task content and retry
-                if recovery_decision.modified_task_content:
-                    task.content = recovery_decision.modified_task_content
-                    logger.info(f"Task {task.id} content modified for replan")
-
-                # Repost the modified task
-                if task.id in self._assignees:
-                    assignee_id = self._assignees[task.id]
-                    await self._post_task(task, assignee_id)
-                    action_taken = (
-                        f"replanned and retried with worker {assignee_id}"
-                    )
-                else:
-                    # Find a new assignee for the replanned task
-                    batch_result = await self._find_assignee([task])
-                    assignment = batch_result.assignments[0]
-                    self._assignees[task.id] = assignment.assignee_id
-                    await self._post_task(task, assignment.assignee_id)
-                    action_taken = (
-                        f"replanned and assigned to "
-                        f"worker {assignment.assignee_id}"
-                    )
-
-            elif recovery_decision.strategy == RecoveryStrategy.DECOMPOSE:
-                # Decompose the task into subtasks
-                subtasks_result = self._decompose_task(task)
-
-                # Handle both streaming and non-streaming results
-                if isinstance(subtasks_result, Generator):
-                    # This is a generator (streaming mode)
-                    subtasks = []
-                    for new_tasks in subtasks_result:
-                        subtasks.extend(new_tasks)
-                else:
-                    # This is a regular list (non-streaming mode)
-                    subtasks = subtasks_result
-                if self.metrics_logger and subtasks:
-                    self.metrics_logger.log_task_decomposed(
-                        parent_task_id=task.id,
-                        subtask_ids=[st.id for st in subtasks],
-                    )
-                    for subtask in subtasks:
-                        self.metrics_logger.log_task_created(
-                            task_id=subtask.id,
-                            description=subtask.content,
-                            parent_task_id=task.id,
-                            task_type=subtask.type,
-                            metadata=subtask.additional_info,
-                        )
-                # Insert packets at the head of the queue
-                self._pending_tasks.extendleft(reversed(subtasks))
-
-                await self._post_ready_tasks()
-                action_taken = f"decomposed into {len(subtasks)} subtasks"
-
-                logger.debug(
-                    f"Task {task.id} failed and was {action_taken}. "
-                    f"Dependencies updated for subtasks."
-                )
-
-                # Sync shared memory after task decomposition
-                if self.share_memory:
-                    logger.info(
-                        f"Syncing shared memory after "
-                        f"task {task.id} decomposition"
-                    )
-                    self._sync_shared_memory()
+            is_decompose = await self._apply_recovery_strategy(
+                task, recovery_decision
+            )

-                # Check if any pending tasks are now ready to execute
-                await self._post_ready_tasks()
+            # For decompose, we handle it specially
+            if is_decompose:
+                # Task was decomposed, add to completed tasks
+                self._completed_tasks.append(task)
                 return False

-            elif recovery_decision.strategy == RecoveryStrategy.CREATE_WORKER:
-                assignee = await self._create_worker_node_for_task(task)
-                await self._post_task(task, assignee.node_id)
-                action_taken = (
-                    f"created new worker {assignee.node_id} and assigned "
-                    f"task {task.id} to it"
-                )
         except Exception as e:
-            logger.error(f"Recovery strategy failed for task {task.id}: {e}")
+            logger.error(
+                f"Recovery strategy failed for task {task.id}: {e}",
+                exc_info=True,
+            )
             # If max retries reached, halt the workforce
             if task.failure_count >= MAX_TASK_RETRIES:
                 self._completed_tasks.append(task)
@@ -2571,18 +3623,17 @@ class Workforce(BaseNode):
             self._completed_tasks.append(task)
             return False

+        # Task is being retried - don't add to completed tasks
+        # It will be added when it actually completes or permanently fails
         logger.debug(
-            f"Task {task.id} failed and was {action_taken}. "
-            f"Updating dependency state."
+            f"Task {task.id} is being retried (strategy: "
+            f"{recovery_decision.recovery_strategy}). "
+            f"Not adding to completed tasks until final outcome."
         )
-        # Mark task as completed for dependency tracking
-        self._completed_tasks.append(task)

-        # Sync shared memory after task completion to share knowledge
+        # Sync shared memory after task recovery
        if self.share_memory:
-            logger.info(
-                f"Syncing shared memory after task {task.id} completion"
-            )
+            logger.info(f"Syncing shared memory after task {task.id} recovery")
            self._sync_shared_memory()

         # Check if any pending tasks are now ready to execute
@@ -2590,61 +3641,60 @@ class Workforce(BaseNode):
         return False

     async def _handle_completed_task(self, task: Task) -> None:
-        if self.metrics_logger:
-            worker_id = task.assigned_worker_id or "unknown"
-            processing_time_seconds = None
-            token_usage = None
-
-            # Get processing time from task start time or additional info
-            if task.id in self._task_start_times:
-                processing_time_seconds = (
-                    time.time() - self._task_start_times[task.id]
-                )
-                self._cleanup_task_tracking(task.id)
-            elif (
-                task.additional_info is not None
-                and 'processing_time_seconds' in task.additional_info
-            ):
-                processing_time_seconds = task.additional_info[
-                    'processing_time_seconds'
-                ]
+        worker_id = task.assigned_worker_id or "unknown"
+        processing_time_seconds = None
+        token_usage = None

-            # Get token usage from task additional info (preferred - actual
-            # usage)
-            if (
-                task.additional_info is not None
-                and 'token_usage' in task.additional_info
-            ):
-                token_usage = task.additional_info['token_usage']
-            else:
-                # Fallback: Try to get token usage from SingleAgentWorker
-                # memory
-                assignee_node = next(
-                    (
-                        child
-                        for child in self._children
-                        if child.node_id == worker_id
-                    ),
-                    None,
-                )
-                if isinstance(assignee_node, SingleAgentWorker):
-                    try:
-                        _, total_tokens = (
-                            assignee_node.worker.memory.get_context()
-                        )
-                        token_usage = {'total_tokens': total_tokens}
-                    except Exception:
-                        token_usage = None
+        # Get processing time from task start time or additional info
+        if task.id in self._task_start_times:
+            processing_time_seconds = (
+                time.time() - self._task_start_times[task.id]
+            )
+            self._cleanup_task_tracking(task.id)
+        elif (
+            task.additional_info is not None
+            and 'processing_time_seconds' in task.additional_info
+        ):
+            processing_time_seconds = task.additional_info[
+                'processing_time_seconds'
+            ]

-            # Log the completed task
-            self.metrics_logger.log_task_completed(
-                task_id=task.id,
-                worker_id=worker_id,
-                result_summary=task.result if task.result else "Completed",
-                processing_time_seconds=processing_time_seconds,
-                token_usage=token_usage,
-                metadata={'current_state': task.state.value},
+        # Get token usage from task additional info (preferred - actual
+        # usage)
+        if (
+            task.additional_info is not None
+            and 'token_usage' in task.additional_info
+        ):
+            token_usage = task.additional_info['token_usage']
+        else:
+            # Fallback: Try to get token usage from SingleAgentWorker
+            # memory
+            assignee_node = next(
+                (
+                    child
+                    for child in self._children
+                    if child.node_id == worker_id
+                ),
+                None,
             )
+            if isinstance(assignee_node, SingleAgentWorker):
+                try:
+                    _, total_tokens = assignee_node.worker.memory.get_context()
+                    token_usage = {'total_tokens': total_tokens}
+                except Exception:
+                    token_usage = None
+
+        # Log the completed task
+        task_completed_event = TaskCompletedEvent(
+            task_id=task.id,
+            worker_id=worker_id,
+            result_summary=task.result if task.result else "Completed",
+            processing_time_seconds=processing_time_seconds,
+            token_usage=token_usage,
+            metadata={'current_state': task.state.value},
+        )
+        for cb in self._callbacks:
+            cb.log_task_completed(task_completed_event)

         # Find and remove the completed task from pending tasks
         tasks_list = list(self._pending_tasks)
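For consumers of these events, a hedged sketch of a callback that aggregates throughput from completions (event objects are assumed to expose their constructor fields as attributes):

    class ThroughputRecorder:
        def __init__(self):
            self.rows = []

        def log_task_completed(self, event):
            # Both the timing and token usage may be None when the
            # workforce could not measure them; guard before aggregating.
            tokens = (event.token_usage or {}).get("total_tokens")
            self.rows.append(
                (event.task_id, event.processing_time_seconds, tokens)
            )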
@@ -2764,15 +3814,23 @@ class Workforce(BaseNode):
         r"""Returns an ASCII tree representation of the task hierarchy and
         worker status.
         """
-        if not self.metrics_logger:
-            return "Logger not initialized."
-        return self.metrics_logger.get_ascii_tree_representation()
+        metrics_cb: List[WorkforceMetrics] = [
+            cb for cb in self._callbacks if isinstance(cb, WorkforceMetrics)
+        ]
+        if len(metrics_cb) == 0:
+            return "Metrics Callback not initialized."
+        else:
+            return metrics_cb[0].get_ascii_tree_representation()

     def get_workforce_kpis(self) -> Dict[str, Any]:
         r"""Returns a dictionary of key performance indicators."""
-        if not self.metrics_logger:
-            return {"error": "Logger not initialized."}
-        return self.metrics_logger.get_kpis()
+        metrics_cb: List[WorkforceMetrics] = [
+            cb for cb in self._callbacks if isinstance(cb, WorkforceMetrics)
+        ]
+        if len(metrics_cb) == 0:
+            return {"error": "Metrics Callback not initialized."}
+        else:
+            return metrics_cb[0].get_kpis()

     def dump_workforce_logs(self, file_path: str) -> None:
         r"""Dumps all collected logs to a JSON file.
@@ -2780,13 +3838,133 @@ class Workforce(BaseNode):
2780
3838
  Args:
2781
3839
  file_path (str): The path to the JSON file.
2782
3840
  """
2783
- if not self.metrics_logger:
3841
+ metrics_cb: List[WorkforceMetrics] = [
3842
+ cb for cb in self._callbacks if isinstance(cb, WorkforceMetrics)
3843
+ ]
3844
+ if len(metrics_cb) == 0:
2784
3845
  print("Logger not initialized. Cannot dump logs.")
2785
3846
  return
2786
- self.metrics_logger.dump_to_json(file_path)
3847
+ metrics_cb[0].dump_to_json(file_path)
2787
3848
  # Use logger.info or print, consistent with existing style
2788
3849
  logger.info(f"Workforce logs dumped to {file_path}")
2789
3850
 
+    async def _handle_skip_task(self) -> bool:
+        r"""Handle skip request by marking pending and in-flight tasks
+        as completed.
+
+        Returns:
+            bool: True if workforce should stop (no independent tasks),
+                False to continue.
+        """
+        logger.info("Skip requested, processing skip logic.")
+
+        # Mark all pending tasks as completed instead of just clearing
+        pending_tasks_to_complete = list(self._pending_tasks)
+        if pending_tasks_to_complete:
+            logger.info(
+                f"Marking {len(pending_tasks_to_complete)} pending tasks "
+                f"as completed."
+            )
+            for task in pending_tasks_to_complete:
+                # Skip tasks that still need decomposition
+                if task.additional_info and task.additional_info.get(
+                    '_needs_decomposition', False
+                ):
+                    continue
+                # Set task state to DONE and add a completion message
+                task.state = TaskState.DONE
+                task.result = "Task marked as completed due to skip request"
+
+                # Reuse the existing completed-task handler
+                await self._handle_completed_task(task)
+
+        # Handle in-flight tasks if they exist
+        if self._in_flight_tasks > 0:
+            logger.info(
+                f"Found {self._in_flight_tasks} in-flight tasks. "
+                f"Retrieving and completing them."
+            )
+            try:
+                # Get all in-flight tasks for this publisher from the channel
+                in_flight_tasks = await self._channel.get_in_flight_tasks(
+                    self.node_id
+                )
+                logger.info(
+                    f"Retrieved {len(in_flight_tasks)} in-flight "
+                    f"tasks from channel."
+                )
+
+                for task in in_flight_tasks:
+                    # Set task state to DONE and add a completion message
+                    task.state = TaskState.DONE
+                    task.result = (
+                        "Task marked as completed due to skip request"
+                    )
+
+                    # Remove the task from the channel to avoid hanging
+                    await self._channel.remove_task(task.id)
+
+                    # Decrement in-flight counter
+                    self._decrement_in_flight_tasks(
+                        task.id, "skip request - removed from channel"
+                    )
+
+                    # Handle as completed task to update dependencies
+                    await self._handle_completed_task(task)
+
+                    logger.info(
+                        f"Completed in-flight task {task.id} due "
+                        f"to skip request."
+                    )
+
+            except Exception as e:
+                logger.error(
+                    f"Error handling in-flight tasks during skip: {e}",
+                    exc_info=True,
+                )
+                # Reset in-flight counter to prevent hanging
+                self._in_flight_tasks = 0
+
+        # Check if there are any main pending tasks after filtering
+        if self._pending_tasks:
+            # Check if the first pending task needs decomposition
+            next_task = self._pending_tasks[0]
+            if next_task.additional_info and next_task.additional_info.get(
+                '_needs_decomposition'
+            ):
+                logger.info(
+                    f"Decomposing main task {next_task.id} after skip request."
+                )
+                try:
+                    # Remove the decomposition flag to avoid re-decomposition
+                    next_task.additional_info['_needs_decomposition'] = False
+
+                    # Decompose the task and append subtasks to _pending_tasks
+                    await self.handle_decompose_append_task(
+                        next_task, reset=False
+                    )
+
+                    # Mark the main task as completed and remove from pending
+                    await self._handle_completed_task(next_task)
+                    logger.info(
+                        f"Main task {next_task.id} decomposed after "
+                        f"skip request"
+                    )
+                except Exception as e:
+                    logger.error(
+                        f"Error decomposing main task {next_task.id} "
+                        f"after skip: {e}",
+                        exc_info=True,
+                    )
+
+            logger.info("Pending tasks available after skip, continuing.")
+            await self._post_ready_tasks()
+            return False  # Continue processing
+        else:
+            # No pending tasks remain, so treat the skip as a stop
+            logger.info("No pending tasks available, acting like stop.")
+            return True  # Stop processing
+
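
A minimal test-style sketch of the method's two outcomes; `pytest-asyncio` and the `workforce` fixture are assumptions, while the flag and method names come from the diff above:

```python
import pytest

@pytest.mark.asyncio
async def test_skip_with_nothing_left_behaves_like_stop(workforce):
    # With no pending or in-flight tasks, a skip degenerates to a stop.
    workforce._pending_tasks.clear()
    workforce._in_flight_tasks = 0
    assert await workforce._handle_skip_task() is True
```
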
     @check_if_running(False)
     async def _listen_to_channel(self) -> None:
         r"""Continuously listen to the channel, post task to the channel and
@@ -2815,6 +3993,75 @@ class Workforce(BaseNode):
                 logger.info("Stop requested, breaking execution loop.")
                 break
 
+            # Check for skip request after potential pause
+            if self._skip_requested:
+                should_stop = await self._handle_skip_task()
+                if should_stop:
+                    self._stop_requested = True
+                    break
+
+                # Reset skip flag
+                self._skip_requested = False
+                continue
+
+            # Check if we should decompose a main task
+            # Only decompose when no tasks are in flight and pending queue
+            # is empty
+            if not self._pending_tasks and self._in_flight_tasks == 0:
+                # All tasks completed, will exit loop
+                break
+
+            # Check if the first pending task needs decomposition
+            # This happens when add_task(as_subtask=False) was called
+            if self._pending_tasks and self._in_flight_tasks == 0:
+                next_task = self._pending_tasks[0]
+                if (
+                    next_task.additional_info
+                    and next_task.additional_info.get(
+                        '_needs_decomposition'
+                    )
+                ):
+                    logger.info(f"Decomposing main task: {next_task.id}")
+                    try:
+                        # Remove the decomposition flag to avoid
+                        # re-decomposition
+                        next_task.additional_info[
+                            '_needs_decomposition'
+                        ] = False
+
+                        # Decompose the task and append subtasks to
+                        # _pending_tasks
+                        await self.handle_decompose_append_task(
+                            next_task, reset=False
+                        )
+
+                        # Mark the main task as completed (decomposition
+                        # successful) and remove it from pending tasks
+                        await self._handle_completed_task(next_task)
+                        logger.info(
+                            f"Main task {next_task.id} decomposed and "
+                            f"ready for processing"
+                        )
+                    except Exception as e:
+                        logger.error(
+                            f"Error decomposing main task {next_task.id}: "
+                            f"{e}",
+                            exc_info=True,
+                        )
+                        # Revert back to the queue for retry later if
+                        # decomposition failed
+                        if not self._pending_tasks:
+                            self._pending_tasks.appendleft(next_task)
+                        else:
+                            logger.warning(
+                                "Pending tasks exist after decomposition "
+                                "error."
+                            )
+
+                    # Immediately assign and post the transferred tasks
+                    await self._post_ready_tasks()
+                    continue
+
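
The `_needs_decomposition` marker consulted here travels in `task.additional_info`, as the comment above notes for `add_task(as_subtask=False)`. A plausible sketch of how such a flag could be set when a main task is enqueued (the body is inferred from this loop, not the actual `add_task` implementation):

```python
def add_task(self, task, as_subtask: bool = True) -> None:
    # Hypothetical: flag a main task so the listener loop decomposes it
    # once nothing else is pending or in flight.
    if not as_subtask:
        if task.additional_info is None:
            task.additional_info = {}
        task.additional_info['_needs_decomposition'] = True
    self._pending_tasks.append(task)
```
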
 
             # Save snapshot before processing next task
             if self._pending_tasks:
@@ -2829,9 +4076,24 @@ class Workforce(BaseNode):
                 self._last_snapshot_time = time.time()
 
             # Get returned task
-            returned_task = await self._get_returned_task()
+            try:
+                returned_task = await self._get_returned_task()
+            except asyncio.TimeoutError:
+                # Handle timeout - check if we have tasks stuck in flight
+                if self._in_flight_tasks > 0:
+                    logger.warning(
+                        f"Timeout waiting for {self._in_flight_tasks} "
+                        f"in-flight tasks. Breaking to prevent hanging."
+                    )
+                    # Break the loop to prevent indefinite hanging
+                    # The finally block will handle cleanup
+                    break
+                else:
+                    # No tasks in flight, safe to continue
+                    await self._post_ready_tasks()
+                    continue
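
The new `except asyncio.TimeoutError` branch implies that `_get_returned_task` now bounds its wait. One plausible shape, assuming the limit comes from the `task_timeout_seconds` option propagated through `clone()` at the end of this diff (the channel method name is an assumption):

```python
import asyncio

async def _get_returned_task(self):
    # Sketch: wrap the channel read in a timeout so a lost worker
    # cannot stall the listener loop indefinitely.
    return await asyncio.wait_for(
        self._channel.get_returned_task_by_publisher(self.node_id),
        timeout=self.task_timeout_seconds,
    )
```
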
 
-            # If no task was returned, continue
+            # If no task was returned (other errors), continue
             if returned_task is None:
                 logger.debug(
                     f"No task returned in workforce {self.node_id}. "
@@ -2872,6 +4134,20 @@ class Workforce(BaseNode):
                         )
                         if not halt:
                             continue
+
+                        # Do not halt if there are main tasks in the queue
+                        if len(self.get_main_task_queue()) > 0:
+                            print(
+                                f"{Fore.RED}Task {returned_task.id} has "
+                                f"failed for {MAX_TASK_RETRIES} times "
+                                f"after insufficient results, skipping "
+                                f"that task. Final error: "
+                                f"{returned_task.result or 'Unknown error'}"
+                                f"{Fore.RESET}"
+                            )
+                            self._skip_requested = True
+                            continue
+
                         print(
                             f"{Fore.RED}Task {returned_task.id} has "
                             f"failed for {MAX_TASK_RETRIES} times after "
@@ -2890,16 +4166,106 @@ class Workforce(BaseNode):
                         )
                         continue
                     else:
-                        print(
-                            f"{Fore.CYAN}🎯 Task {returned_task.id} completed "
-                            f"successfully.{Fore.RESET}"
+                        quality_eval = self._analyze_task(
+                            returned_task, for_failure=False
                         )
-                        await self._handle_completed_task(returned_task)
+
+                        if not quality_eval.quality_sufficient:
+                            logger.info(
+                                f"Task {returned_task.id} quality check: "
+                                f"score={quality_eval.quality_score}, "
+                                f"issues={quality_eval.issues}, "
+                                f"strategy={quality_eval.recovery_strategy}"
+                            )
+
+                            # Check retry limit before attempting recovery
+                            if returned_task.failure_count >= 2:
+                                print(
+                                    f"{Fore.YELLOW}Task {returned_task.id} "
+                                    f"completed with low quality score: "
+                                    f"{quality_eval.quality_score} "
+                                    f"(retry limit reached){Fore.RESET}"
+                                )
+                                await self._handle_completed_task(
+                                    returned_task
+                                )
+                                continue
+
+                            # Print visual feedback for quality-failed tasks
+                            # with recovery strategy
+                            recovery_action = (
+                                quality_eval.recovery_strategy.value
+                                if quality_eval.recovery_strategy
+                                else ""
+                            )
+                            print(
+                                f"{Fore.YELLOW}⚠️ Task {returned_task.id} "
+                                f"failed quality check (score: "
+                                f"{quality_eval.quality_score}). "
+                                f"Issues: {', '.join(quality_eval.issues)}. "
+                                f"Recovery: {recovery_action}{Fore.RESET}"
+                            )
+
+                            # Mark as failed for recovery
+                            returned_task.failure_count += 1
+                            returned_task.state = TaskState.FAILED
+                            returned_task.result = (
+                                f"Quality insufficient (score: "
+                                f"{quality_eval.quality_score}). "
+                                f"Issues: {', '.join(quality_eval.issues)}"
+                            )
+
+                            # Clean up tracking before attempting recovery
+                            if returned_task.id in self._assignees:
+                                await self._channel.archive_task(
+                                    returned_task.id
+                                )
+                                self._cleanup_task_tracking(returned_task.id)
+
+                            # Apply LLM-recommended recovery strategy
+                            try:
+                                is_decompose = (
+                                    await self._apply_recovery_strategy(
+                                        returned_task, quality_eval
+                                    )
+                                )
+
+                                # For decompose, cleanup happens in the method
+                                if is_decompose:
+                                    continue
+
+                            except Exception as e:
+                                logger.error(
+                                    f"Error handling quality-failed task "
+                                    f"{returned_task.id}: {e}",
+                                    exc_info=True,
+                                )
+                                continue
+                        else:
+                            print(
+                                f"{Fore.CYAN}Task {returned_task.id} "
+                                f"completed successfully (quality score: "
+                                f"{quality_eval.quality_score}).{Fore.RESET}"
+                            )
+                            await self._handle_completed_task(returned_task)
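
The evaluation object consumed in this branch exposes four attributes: `quality_sufficient`, `quality_score`, `issues`, and `recovery_strategy` (whose `.value` is printed). A minimal stand-in with that shape, as a sketch only, since the real return type of `_analyze_task` is not shown in this diff:

```python
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional

class RecoveryStrategy(Enum):
    # Assumed variants; only .value is read in the loop above.
    RETRY = "retry"
    DECOMPOSE = "decompose"

@dataclass
class QualityEvaluation:
    quality_sufficient: bool
    quality_score: float  # numeric score; the exact type is not shown
    issues: List[str] = field(default_factory=list)
    recovery_strategy: Optional[RecoveryStrategy] = None
```
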
                 elif returned_task.state == TaskState.FAILED:
                     try:
                         halt = await self._handle_failed_task(returned_task)
                         if not halt:
                             continue
+
+                        # Do not halt if there are main tasks in the queue
+                        if len(self.get_main_task_queue()) > 0:
+                            print(
+                                f"{Fore.RED}Task {returned_task.id} has "
+                                f"failed for {MAX_TASK_RETRIES} times, "
+                                f"skipping that task. Final error: "
+                                f"{returned_task.result or 'Unknown error'}"
+                                f"{Fore.RESET}"
+                            )
+                            self._skip_requested = True
+                            continue
+
                         print(
                             f"{Fore.RED}Task {returned_task.id} has failed "
                             f"for {MAX_TASK_RETRIES} times, halting "
@@ -2952,6 +4318,9 @@ class Workforce(BaseNode):
                 elif not self._pending_tasks and self._in_flight_tasks == 0:
                     self._state = WorkforceState.IDLE
                     logger.info("All tasks completed.")
+                    all_tasks_completed_event = AllTasksCompletedEvent()
+                    for cb in self._callbacks:
+                        cb.log_all_tasks_completed(all_tasks_completed_event)
 
                     # shut down the whole workforce tree
                     self.stop()
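
With `AllTasksCompletedEvent`, a callback can now observe both per-task completion and the end of the whole run. A sketch pairing the two hooks seen in this diff (the class itself is illustrative):

```python
class RunSummaryCallback:
    """Hypothetical callback combining both hooks from this diff."""

    def __init__(self) -> None:
        self.completed: list = []

    def log_task_completed(self, event) -> None:
        self.completed.append(event.task_id)

    def log_all_tasks_completed(self, event) -> None:
        print(f"Run finished: {len(self.completed)} tasks completed.")
```
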
@@ -3064,6 +4433,7 @@ class Workforce(BaseNode):
             graceful_shutdown_timeout=self.graceful_shutdown_timeout,
             share_memory=self.share_memory,
             use_structured_output_handler=self.use_structured_output_handler,
+            task_timeout_seconds=self.task_timeout_seconds,
         )
 
         for child in self._children: