camel-ai 0.2.71a3__py3-none-any.whl → 0.2.71a5__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- camel/__init__.py +1 -1
- camel/agents/chat_agent.py +1482 -134
- camel/agents/repo_agent.py +2 -1
- camel/benchmarks/browsecomp.py +6 -6
- camel/interpreters/docker_interpreter.py +3 -2
- camel/loaders/base_loader.py +85 -0
- camel/logger.py +1 -1
- camel/messages/base.py +12 -1
- camel/models/azure_openai_model.py +96 -7
- camel/models/base_model.py +68 -10
- camel/models/deepseek_model.py +5 -0
- camel/models/gemini_model.py +5 -0
- camel/models/litellm_model.py +48 -16
- camel/models/model_manager.py +24 -6
- camel/models/openai_compatible_model.py +109 -5
- camel/models/openai_model.py +117 -8
- camel/societies/workforce/prompts.py +68 -5
- camel/societies/workforce/role_playing_worker.py +1 -0
- camel/societies/workforce/single_agent_worker.py +1 -0
- camel/societies/workforce/utils.py +67 -2
- camel/societies/workforce/workforce.py +412 -67
- camel/societies/workforce/workforce_logger.py +0 -8
- camel/tasks/task.py +2 -0
- camel/toolkits/__init__.py +7 -2
- camel/toolkits/craw4ai_toolkit.py +2 -2
- camel/toolkits/file_write_toolkit.py +526 -121
- camel/toolkits/hybrid_browser_toolkit/hybrid_browser_toolkit.py +9 -3
- camel/toolkits/hybrid_browser_toolkit/unified_analyzer.js +31 -8
- camel/toolkits/message_agent_toolkit.py +608 -0
- camel/toolkits/note_taking_toolkit.py +90 -0
- camel/toolkits/openai_image_toolkit.py +292 -0
- camel/toolkits/slack_toolkit.py +4 -4
- camel/toolkits/terminal_toolkit.py +223 -73
- camel/utils/mcp_client.py +37 -1
- {camel_ai-0.2.71a3.dist-info → camel_ai-0.2.71a5.dist-info}/METADATA +48 -7
- {camel_ai-0.2.71a3.dist-info → camel_ai-0.2.71a5.dist-info}/RECORD +38 -35
- camel/toolkits/dalle_toolkit.py +0 -175
- {camel_ai-0.2.71a3.dist-info → camel_ai-0.2.71a5.dist-info}/WHEEL +0 -0
- {camel_ai-0.2.71a3.dist-info → camel_ai-0.2.71a5.dist-info}/licenses/LICENSE +0 -0
camel/societies/workforce/workforce.py
(Some removed lines below are truncated as captured from the registry's diff view.)

@@ -14,12 +14,23 @@
 from __future__ import annotations

 import asyncio
+import concurrent.futures
 import json
 import time
 import uuid
 from collections import deque
 from enum import Enum
-from typing import
+from typing import (
+    Any,
+    Coroutine,
+    Deque,
+    Dict,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    Union,
+)

 from colorama import Fore

@@ -31,12 +42,16 @@ from camel.societies.workforce.base import BaseNode
 from camel.societies.workforce.prompts import (
     ASSIGN_TASK_PROMPT,
     CREATE_NODE_PROMPT,
+    FAILURE_ANALYSIS_PROMPT,
     WF_TASK_DECOMPOSE_PROMPT,
 )
 from camel.societies.workforce.role_playing_worker import RolePlayingWorker
 from camel.societies.workforce.single_agent_worker import SingleAgentWorker
 from camel.societies.workforce.task_channel import TaskChannel
 from camel.societies.workforce.utils import (
+    FailureContext,
+    RecoveryDecision,
+    RecoveryStrategy,
     TaskAssignment,
     TaskAssignResult,
     WorkerConf,
@@ -200,12 +215,14 @@ class Workforce(BaseNode):
         children: Optional[List[BaseNode]] = None,
         coordinator_agent: Optional[ChatAgent] = None,
         task_agent: Optional[ChatAgent] = None,
-        new_worker_agent: Optional[ChatAgent] = None,
+        new_worker_agent: Optional[ChatAgent] = None,
         graceful_shutdown_timeout: float = 15.0,
         share_memory: bool = False,
     ) -> None:
         super().__init__(description)
-        self._child_listening_tasks: Deque[
+        self._child_listening_tasks: Deque[
+            Union[asyncio.Task, concurrent.futures.Future]
+        ] = deque()
         self._children = children or []
         self.new_worker_agent = new_worker_agent
         self.graceful_shutdown_timeout = graceful_shutdown_timeout
@@ -325,9 +342,10 @@ class Workforce(BaseNode):
                 "settings (ModelPlatformType.DEFAULT, ModelType.DEFAULT) "
                 "with default system message and TaskPlanningToolkit."
             )
+            task_tools = TaskPlanningToolkit().get_tools()
             self.task_agent = ChatAgent(
                 task_sys_msg,
-                tools=
+                tools=task_tools,  # type: ignore[arg-type]
             )
         else:
             logger.info(
@@ -563,6 +581,69 @@ class Workforce(BaseNode):
         except Exception as e:
             logger.warning(f"Error synchronizing shared memory: {e}")

+    def _update_dependencies_for_decomposition(
+        self, original_task: Task, subtasks: List[Task]
+    ) -> None:
+        r"""Update dependency tracking when a task is decomposed into subtasks.
+        Tasks that depended on the original task should now depend on all
+        subtasks. The last subtask inherits the original task's dependencies.
+        """
+        if not subtasks:
+            return
+
+        original_task_id = original_task.id
+        subtask_ids = [subtask.id for subtask in subtasks]
+
+        # Find tasks that depend on the original task
+        dependent_task_ids = [
+            task_id
+            for task_id, deps in self._task_dependencies.items()
+            if original_task_id in deps
+        ]
+
+        # Update dependent tasks to depend on all subtasks
+        for task_id in dependent_task_ids:
+            dependencies = self._task_dependencies[task_id]
+            dependencies.remove(original_task_id)
+            dependencies.extend(subtask_ids)
+
+        # The last subtask inherits original task's dependencies (if any)
+        if original_task_id in self._task_dependencies:
+            original_dependencies = self._task_dependencies[original_task_id]
+            if original_dependencies:
+                # Set dependencies for the last subtask to maintain execution
+                # order
+                self._task_dependencies[subtask_ids[-1]] = (
+                    original_dependencies.copy()
+                )
+            # Remove original task dependencies as it's now decomposed
+            del self._task_dependencies[original_task_id]
+
+    def _increment_in_flight_tasks(self, task_id: str) -> None:
+        r"""Safely increment the in-flight tasks counter with logging."""
+        self._in_flight_tasks += 1
+        logger.debug(
+            f"Incremented in-flight tasks for {task_id}. "
+            f"Count: {self._in_flight_tasks}"
+        )
+
+    def _decrement_in_flight_tasks(
+        self, task_id: str, context: str = ""
+    ) -> None:
+        r"""Safely decrement the in-flight tasks counter with safety checks."""
+        if self._in_flight_tasks > 0:
+            self._in_flight_tasks -= 1
+            logger.debug(
+                f"Decremented in-flight tasks for {task_id} ({context}). "
+                f"Count: {self._in_flight_tasks}"
+            )
+        else:
+            logger.debug(
+                f"Attempted to decrement in-flight tasks for {task_id} "
+                f"({context}) but counter is already 0. "
+                f"Counter: {self._in_flight_tasks}"
+            )
+
     def _cleanup_task_tracking(self, task_id: str) -> None:
         r"""Clean up tracking data for a task to prevent memory leaks.

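The dependency rewiring done by `_update_dependencies_for_decomposition` is easiest to see on a concrete example. A minimal sketch, with a plain dict standing in for the internal `_task_dependencies` mapping (the task IDs are illustrative):

# Sketch: how the helper rewires the graph when "A" splits into "A.1"/"A.2".
task_dependencies = {"A": ["X"], "B": ["A"]}
subtask_ids = ["A.1", "A.2"]

# 1) Anything that depended on "A" now depends on every subtask.
deps = task_dependencies["B"]
deps.remove("A")
deps.extend(subtask_ids)

# 2) The last subtask inherits "A"'s own dependencies ...
task_dependencies["A.2"] = task_dependencies["A"].copy()

# 3) ... and "A" itself leaves the graph.
del task_dependencies["A"]

assert task_dependencies == {"B": ["A.1", "A.2"], "A.2": ["X"]}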
@@ -586,12 +667,86 @@ class Workforce(BaseNode):
         )
         self.task_agent.reset()
         subtasks = task.decompose(self.task_agent, decompose_prompt)
-
-        for
-
+
+        # Update dependency tracking for decomposed task
+        if subtasks:
+            self._update_dependencies_for_decomposition(task, subtasks)

         return subtasks

+    def _analyze_failure(
+        self, task: Task, error_message: str
+    ) -> RecoveryDecision:
+        r"""Analyze a task failure and decide on the best recovery strategy.
+
+        Args:
+            task (Task): The failed task
+            error_message (str): The error message from the failure
+
+        Returns:
+            RecoveryDecision: The decided recovery strategy with reasoning
+        """
+        # First, do a quick smart analysis based on error patterns
+        error_msg_lower = error_message.lower()
+        if any(
+            keyword in error_msg_lower
+            for keyword in [
+                'connection',
+                'network',
+                'server disconnected',
+                'timeout',
+                'apiconnectionerror',
+            ]
+        ):
+            return RecoveryDecision(
+                strategy=RecoveryStrategy.RETRY,
+                reasoning="Network/connection error detected, retrying task",
+                modified_task_content=None,
+            )
+
+        # Create failure context
+        failure_context = FailureContext(
+            task_id=task.id,
+            task_content=task.content,
+            failure_count=task.failure_count,
+            error_message=error_message,
+            worker_id=task.assigned_worker_id,
+            task_depth=task.get_depth(),
+            additional_info=str(task.additional_info)
+            if task.additional_info
+            else None,
+        )
+
+        # Format the analysis prompt
+        analysis_prompt = FAILURE_ANALYSIS_PROMPT.format(
+            task_id=failure_context.task_id,
+            task_content=failure_context.task_content,
+            failure_count=failure_context.failure_count,
+            error_message=failure_context.error_message,
+            worker_id=failure_context.worker_id or "unknown",
+            task_depth=failure_context.task_depth,
+            additional_info=failure_context.additional_info or "None",
+        )
+
+        try:
+            # Get decision from task agent
+            self.task_agent.reset()
+            response = self.task_agent.step(
+                analysis_prompt, response_format=RecoveryDecision
+            )
+            return response.msg.parsed
+
+        except Exception as e:
+            logger.warning(
+                f"Error during failure analysis: {e}, defaulting to RETRY"
+            )
+            return RecoveryDecision(
+                strategy=RecoveryStrategy.RETRY,
+                reasoning=f"Analysis failed due to error: {e!s}, "
+                f"defaulting to retry",
+                modified_task_content=None,
+            )
+
     # Human intervention methods
     async def _async_pause(self) -> None:
         r"""Async implementation of pause to run on the event loop."""
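`_analyze_failure` leans on three names imported from camel/societies/workforce/utils.py (which gains 67 added lines in this diff) whose definitions never appear in these hunks. The field sets below are inferred from the call sites above and from `response_format=RecoveryDecision`, which implies a Pydantic model; the real definitions in utils.py may differ, and the enum values in particular are assumptions:

# Sketch of the shapes implied by _analyze_failure; inferred, not copied.
from enum import Enum
from typing import Optional

from pydantic import BaseModel


class RecoveryStrategy(str, Enum):
    RETRY = "retry"
    REPLAN = "replan"
    DECOMPOSE = "decompose"
    CREATE_WORKER = "create_worker"


class FailureContext(BaseModel):
    task_id: str
    task_content: str
    failure_count: int
    error_message: str
    worker_id: Optional[str] = None
    task_depth: int = 0
    additional_info: Optional[str] = None


class RecoveryDecision(BaseModel):
    strategy: RecoveryStrategy
    reasoning: str
    modified_task_content: Optional[str] = None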
@@ -977,9 +1132,6 @@ class Workforce(BaseNode):
                 needed
             >>> print(result.result)
         """
-        import asyncio
-        import concurrent.futures
-
         # Check if we're already in an event loop
         try:
             current_loop = asyncio.get_running_loop()
@@ -1154,7 +1306,39 @@ class Workforce(BaseNode):

         return self._task

-
+    def _start_child_node_when_paused(
+        self, start_coroutine: Coroutine
+    ) -> None:
+        r"""Helper to start a child node when workforce is paused.
+
+        Args:
+            start_coroutine: The coroutine to start (e.g., worker_node.start())
+        """
+        if self._state == WorkforceState.PAUSED and hasattr(
+            self, '_child_listening_tasks'
+        ):
+            if self._loop and not self._loop.is_closed():
+                # Use thread-safe coroutine execution for dynamic addition
+                child_task: Union[asyncio.Task, concurrent.futures.Future]
+                try:
+                    # Check if we're in the same thread as the loop
+                    current_loop = asyncio.get_running_loop()
+                    if current_loop is self._loop:
+                        # Same loop context - use create_task
+                        child_task = self._loop.create_task(start_coroutine)
+                    else:
+                        # Different loop context - use thread-safe approach
+                        child_task = asyncio.run_coroutine_threadsafe(
+                            start_coroutine, self._loop
+                        )
+                except RuntimeError:
+                    # No running loop in current thread - use thread-safe
+                    # approach
+                    child_task = asyncio.run_coroutine_threadsafe(
+                        start_coroutine, self._loop
+                    )
+                self._child_listening_tasks.append(child_task)
+
     def add_single_agent_worker(
         self,
         description: str,
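The branching in `_start_child_node_when_paused` follows the standard asyncio rule: `loop.create_task` may only be called from the loop's own thread, while `asyncio.run_coroutine_threadsafe` works from any other thread and hands back a `concurrent.futures.Future` instead of a `Task`, which is why the deque above is typed as a union. A self-contained sketch of that rule (the helper name `schedule` is illustrative):

import asyncio
import concurrent.futures
from typing import Coroutine, Union


def schedule(
    coro: Coroutine, loop: asyncio.AbstractEventLoop
) -> Union[asyncio.Task, concurrent.futures.Future]:
    # Schedule `coro` on `loop` regardless of which thread we are in.
    try:
        running = asyncio.get_running_loop()
    except RuntimeError:
        running = None  # no event loop in this thread
    if running is loop:
        # Same thread as the target loop: a plain Task is fine.
        return loop.create_task(coro)
    # Different (or no) loop in this thread: hand over thread-safely.
    return asyncio.run_coroutine_threadsafe(coro, loop)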
@@ -1162,6 +1346,7 @@ class Workforce(BaseNode):
         pool_max_size: int = DEFAULT_WORKER_POOL_SIZE,
     ) -> Workforce:
         r"""Add a worker node to the workforce that uses a single agent.
+        Can be called when workforce is paused to dynamically add workers.

         Args:
             description (str): Description of the worker node.
@@ -1171,7 +1356,15 @@ class Workforce(BaseNode):

         Returns:
             Workforce: The workforce node itself.
+
+        Raises:
+            RuntimeError: If called while workforce is running (not paused).
         """
+        if self._state == WorkforceState.RUNNING:
+            raise RuntimeError(
+                "Cannot add workers while workforce is running. "
+                "Pause the workforce first."
+            )
         # Ensure the worker agent shares this workforce's pause control
         self._attach_pause_event_to_agent(worker)

@@ -1181,6 +1374,14 @@ class Workforce(BaseNode):
             pool_max_size=pool_max_size,
         )
         self._children.append(worker_node)
+
+        # If we have a channel set up, set it for the new worker
+        if hasattr(self, '_channel') and self._channel is not None:
+            worker_node.set_channel(self._channel)
+
+        # If workforce is paused, start the worker's listening task
+        self._start_child_node_when_paused(worker_node.start())
+
         if self.metrics_logger:
             self.metrics_logger.log_worker_created(
                 worker_id=worker_node.node_id,
@@ -1189,7 +1390,6 @@ class Workforce(BaseNode):
             )
         return self

-    @check_if_running(False)
     def add_role_playing_worker(
         self,
         description: str,
@@ -1201,6 +1401,7 @@ class Workforce(BaseNode):
         chat_turn_limit: int = 3,
     ) -> Workforce:
         r"""Add a worker node to the workforce that uses `RolePlaying` system.
+        Can be called when workforce is paused to dynamically add workers.

         Args:
             description (str): Description of the node.
@@ -1220,7 +1421,15 @@ class Workforce(BaseNode):

         Returns:
             Workforce: The workforce node itself.
+
+        Raises:
+            RuntimeError: If called while workforce is running (not paused).
         """
+        if self._state == WorkforceState.RUNNING:
+            raise RuntimeError(
+                "Cannot add workers while workforce is running. "
+                "Pause the workforce first."
+            )
         # Ensure provided kwargs carry pause_event so that internally created
         # ChatAgents (assistant/user/summarizer) inherit it.
         assistant_agent_kwargs = self._ensure_pause_event_in_kwargs(
@@ -1243,6 +1452,14 @@ class Workforce(BaseNode):
             chat_turn_limit=chat_turn_limit,
         )
         self._children.append(worker_node)
+
+        # If we have a channel set up, set it for the new worker
+        if hasattr(self, '_channel') and self._channel is not None:
+            worker_node.set_channel(self._channel)
+
+        # If workforce is paused, start the worker's listening task
+        self._start_child_node_when_paused(worker_node.start())
+
         if self.metrics_logger:
             self.metrics_logger.log_worker_created(
                 worker_id=worker_node.node_id,
@@ -1251,20 +1468,35 @@ class Workforce(BaseNode):
             )
         return self

-    @check_if_running(False)
     def add_workforce(self, workforce: Workforce) -> Workforce:
         r"""Add a workforce node to the workforce.
+        Can be called when workforce is paused to dynamically add workers.

         Args:
             workforce (Workforce): The workforce node to be added.

         Returns:
             Workforce: The workforce node itself.
+
+        Raises:
+            RuntimeError: If called while workforce is running (not paused).
         """
+        if self._state == WorkforceState.RUNNING:
+            raise RuntimeError(
+                "Cannot add workers while workforce is running. "
+                "Pause the workforce first."
+            )
         # Align child workforce's pause_event with this one for unified
         # control of worker agents only.
         workforce._pause_event = self._pause_event
         self._children.append(workforce)
+
+        # If we have a channel set up, set it for the new workforce
+        if hasattr(self, '_channel') and self._channel is not None:
+            workforce.set_channel(self._channel)
+
+        # If workforce is paused, start the child workforce's listening task
+        self._start_child_node_when_paused(workforce.start())
         return self

     async def _async_reset(self) -> None:
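With the `@check_if_running(False)` decorators removed and explicit `RuntimeError` guards in place, the intended calling pattern for all three `add_*` methods appears to be pause, modify, resume. A hypothetical flow (agent construction elided; `pause()`/`resume()` are assumed public counterparts of the `_async_pause` intervention hook seen earlier):

# Hypothetical pause-modify-resume flow; pause()/resume() and
# research_agent (a pre-built ChatAgent) are assumptions.
workforce = Workforce("demo workforce")
workforce.pause()  # adding workers while RUNNING now raises RuntimeError

workforce.add_single_agent_worker(
    description="A researcher that searches the web",
    worker=research_agent,
)

workforce.resume()  # the new worker's listener was started while paused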
@@ -1436,7 +1668,9 @@ class Workforce(BaseNode):

         return valid_assignments, invalid_assignments

-    def _handle_task_assignment_fallbacks(
+    async def _handle_task_assignment_fallbacks(
+        self, tasks: List[Task]
+    ) -> List:
         r"""Create new workers for unassigned tasks as fallback.

         Args:
@@ -1449,7 +1683,7 @@ class Workforce(BaseNode):

         for task in tasks:
             logger.info(f"Creating new worker for unassigned task {task.id}")
-            new_worker = self._create_worker_node_for_task(task)
+            new_worker = await self._create_worker_node_for_task(task)

             assignment = TaskAssignment(
                 task_id=task.id,
@@ -1460,7 +1694,7 @@ class Workforce(BaseNode):

         return fallback_assignments

-    def _handle_assignment_retry_and_fallback(
+    async def _handle_assignment_retry_and_fallback(
         self,
         invalid_assignments: List[TaskAssignment],
         tasks: List[Task],
@@ -1531,14 +1765,14 @@ class Workforce(BaseNode):
                 f"Creating fallback workers for {len(unassigned_tasks)} "
                 f"unassigned tasks"
            )
-            fallback_assignments =
-                unassigned_tasks
+            fallback_assignments = (
+                await self._handle_task_assignment_fallbacks(unassigned_tasks)
             )
             final_assignments.extend(fallback_assignments)

         return final_assignments

-    def _find_assignee(
+    async def _find_assignee(
         self,
         tasks: List[Task],
     ) -> TaskAssignResult:
@@ -1580,7 +1814,7 @@ class Workforce(BaseNode):
         # invalid assignments and unassigned tasks
         all_problem_assignments = invalid_assignments
         retry_and_fallback_assignments = (
-            self._handle_assignment_retry_and_fallback(
+            await self._handle_assignment_retry_and_fallback(
                 all_problem_assignments, tasks, valid_worker_ids
             )
         )
@@ -1600,15 +1834,13 @@ class Workforce(BaseNode):
         )

         try:
-            self._in_flight_tasks += 1
             await self._channel.post_task(task, self.node_id, assignee_id)
+            self._increment_in_flight_tasks(task.id)
             logger.debug(
                 f"Posted task {task.id} to {assignee_id}. "
                 f"In-flight tasks: {self._in_flight_tasks}"
             )
         except Exception as e:
-            # Decrement counter if posting failed
-            self._in_flight_tasks -= 1
             logger.error(
                 f"Failed to post task {task.id} to {assignee_id}: {e}"
             )
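Note the ordering change in the posting path: the in-flight counter is now incremented only after `post_task` succeeds, so the except branch no longer needs a compensating decrement. The invariant in miniature (standalone sketch, names illustrative):

# Increment only after the side effect succeeds, so an exception can
# never leave the counter too high.
in_flight = 0

def post(ok: bool) -> None:
    global in_flight
    if not ok:
        raise RuntimeError("post failed")
    in_flight += 1  # reached only on success

try:
    post(ok=False)
except RuntimeError:
    pass
assert in_flight == 0  # no compensating decrement needed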
@@ -1616,7 +1848,7 @@ class Workforce(BaseNode):
     async def _post_dependency(self, dependency: Task) -> None:
         await self._channel.post_dependency(dependency, self.node_id)

-    def _create_worker_node_for_task(self, task: Task) -> Worker:
+    async def _create_worker_node_for_task(self, task: Task) -> Worker:
         r"""Creates a new worker node for a given task and add it to the
         children list of this node. This is one of the actions that
         the coordinator can take when a task has failed.
@@ -1662,7 +1894,7 @@ class Workforce(BaseNode):
             f"Coordinator agent returned malformed JSON response. "
         )

-        new_agent = self._create_new_agent(
+        new_agent = await self._create_new_agent(
             new_node_conf.role,
             new_node_conf.sys_msg,
         )
@@ -1689,14 +1921,19 @@ class Workforce(BaseNode):
         )
         return new_node

-    def _create_new_agent(self, role: str, sys_msg: str) -> ChatAgent:
+    async def _create_new_agent(self, role: str, sys_msg: str) -> ChatAgent:
         worker_sys_msg = BaseMessage.make_assistant_message(
             role_name=role,
             content=sys_msg,
         )

         if self.new_worker_agent is not None:
-
+            # Clone the template agent to create an independent instance
+            cloned_agent = self.new_worker_agent.clone(with_memory=False)
+            # Update the system message for the specific role
+            cloned_agent._system_message = worker_sys_msg
+            cloned_agent.init_messages()  # Initialize with new system message
+            return cloned_agent
         else:
             # Default tools for a new agent
             function_list = [
@@ -1712,7 +1949,7 @@ class Workforce(BaseNode):
             )

             return ChatAgent(
-                worker_sys_msg,
+                system_message=worker_sys_msg,
                 model=model,
                 tools=function_list,  # type: ignore[arg-type]
                 pause_event=self._pause_event,
@@ -1730,10 +1967,6 @@ class Workforce(BaseNode):
                 timeout=TASK_TIMEOUT_SECONDS,
             )
         except Exception as e:
-            # Decrement in-flight counter to prevent hanging
-            if self._in_flight_tasks > 0:
-                self._in_flight_tasks -= 1
-
             error_msg = (
                 f"Error getting returned task {e} in "
                 f"workforce {self.node_id}. "
@@ -1745,8 +1978,11 @@ class Workforce(BaseNode):
             if self._pending_tasks and self._assignees:
                 for task in self._pending_tasks:
                     if task.id in self._assignees:
-                        # Mark
+                        # Mark task as failed and decrement counter
                         task.set_state(TaskState.FAILED)
+                        self._decrement_in_flight_tasks(
+                            task.id, "timeout/error in _get_returned_task"
+                        )
                         return task
             return None

@@ -1765,7 +2001,7 @@ class Workforce(BaseNode):
             f"Found {len(tasks_to_assign)} new tasks. "
             f"Requesting assignment..."
         )
-        batch_result = self._find_assignee(tasks_to_assign)
+        batch_result = await self._find_assignee(tasks_to_assign)
         logger.debug(
             f"Coordinator returned assignments:\n"
             f"{json.dumps(batch_result.dict(), indent=2)}"
@@ -1788,17 +2024,19 @@ class Workforce(BaseNode):
         # Step 2: Iterate through all pending tasks and post those that are
         # ready
         posted_tasks = []
-        # Pre-compute completed task IDs
-
+        # Pre-compute completed task IDs and their states for O(1) lookups
+        completed_tasks_info = {t.id: t.state for t in self._completed_tasks}

         for task in self._pending_tasks:
             # A task must be assigned to be considered for posting
             if task.id in self._task_dependencies:
                 dependencies = self._task_dependencies[task.id]
                 # Check if all dependencies for this task are in the completed
-                # set
+                # set and their state is DONE
                 if all(
-                    dep_id in
+                    dep_id in completed_tasks_info
+                    and completed_tasks_info[dep_id] == TaskState.DONE
+                    for dep_id in dependencies
                 ):
                     assignee_id = self._assignees[task.id]
                     logger.debug(
@@ -1844,7 +2082,6 @@ class Workforce(BaseNode):
                     task_id=task.id,
                     worker_id=worker_id,
                     error_message=detailed_error,
-                    error_type="TaskFailure",
                     metadata={
                         'failure_count': task.failure_count,
                         'task_content': task.content,
@@ -1883,21 +2120,57 @@ class Workforce(BaseNode):
             await self._channel.archive_task(task.id)
             return True

-
-
-        assignee = self._create_worker_node_for_task(task)
+        # Use intelligent failure analysis to decide recovery strategy
+        recovery_decision = self._analyze_failure(task, detailed_error)

-
-
-
-
-
+        logger.info(
+            f"Task {task.id} failure "
+            f"analysis: {recovery_decision.strategy.value} - "
+            f"{recovery_decision.reasoning}"
+        )
+
+        if recovery_decision.strategy == RecoveryStrategy.RETRY:
+            # Simply retry the task by reposting it
+            if task.id in self._assignees:
+                assignee_id = self._assignees[task.id]
+                await self._post_task(task, assignee_id)
+                action_taken = f"retried with same worker {assignee_id}"
+            else:
+                # Find a new assignee and retry
+                batch_result = await self._find_assignee([task])
+                assignment = batch_result.assignments[0]
+                self._assignees[task.id] = assignment.assignee_id
+                await self._post_task(task, assignment.assignee_id)
+                action_taken = (
+                    f"retried with new worker {assignment.assignee_id}"
                 )
-            self._sync_shared_memory()

-
-
-
+        elif recovery_decision.strategy == RecoveryStrategy.REPLAN:
+            # Modify the task content and retry
+            if recovery_decision.modified_task_content:
+                task.content = recovery_decision.modified_task_content
+                logger.info(f"Task {task.id} content modified for replan")
+
+            # Repost the modified task
+            if task.id in self._assignees:
+                assignee_id = self._assignees[task.id]
+                await self._post_task(task, assignee_id)
+                action_taken = (
+                    f"replanned and retried with worker {assignee_id}"
+                )
+            else:
+                # Find a new assignee for the replanned task
+                batch_result = await self._find_assignee([task])
+                assignment = batch_result.assignments[0]
+                self._assignees[task.id] = assignment.assignee_id
+                await self._post_task(task, assignment.assignee_id)
+                action_taken = (
+                    f"replanned and assigned to "
+                    f"worker {assignment.assignee_id}"
+                )
+
+        elif recovery_decision.strategy == RecoveryStrategy.DECOMPOSE:
+            # Decompose the task into subtasks
             subtasks = self._decompose_task(task)
             if self.metrics_logger and subtasks:
                 self.metrics_logger.log_task_decomposed(
@@ -1915,19 +2188,42 @@ class Workforce(BaseNode):
             # Insert packets at the head of the queue
             self._pending_tasks.extendleft(reversed(subtasks))

+            await self._post_ready_tasks()
+            action_taken = f"decomposed into {len(subtasks)} subtasks"
+
+            # Handle task completion differently for decomposed tasks
+            if task.id in self._assignees:
+                await self._channel.archive_task(task.id)
+
+            self._cleanup_task_tracking(task.id)
+            logger.debug(
+                f"Task {task.id} failed and was {action_taken}. "
+                f"Dependencies updated for subtasks."
+            )
+
             # Sync shared memory after task decomposition
             if self.share_memory:
                 logger.info(
-                    f"Syncing shared memory after
-                    f"task {task.id}"
+                    f"Syncing shared memory after task {task.id} decomposition"
                 )
                 self._sync_shared_memory()

+            # Check if any pending tasks are now ready to execute
             await self._post_ready_tasks()
-
+            return False
+
+        elif recovery_decision.strategy == RecoveryStrategy.CREATE_WORKER:
+            assignee = await self._create_worker_node_for_task(task)
+            await self._post_task(task, assignee.node_id)
+            action_taken = (
+                f"created new worker {assignee.node_id} and assigned "
+                f"task {task.id} to it"
+            )
+
         if task.id in self._assignees:
             await self._channel.archive_task(task.id)

+        self._cleanup_task_tracking(task.id)
         logger.debug(
             f"Task {task.id} failed and was {action_taken}. "
             f"Updating dependency state."
@@ -2020,31 +2316,65 @@ class Workforce(BaseNode):
                 break

         if not found_and_removed:
-            # Task was already removed from pending queue (
-            # it
-            # draw user attention with a warning; record at debug level.
+            # Task was already removed from pending queue (common case when
+            # it was posted and removed immediately).
             logger.debug(
                 f"Completed task {task.id} was already removed from pending "
-                "queue."
+                "queue (normal for posted tasks)."
             )

         # Archive the task and update dependency tracking
         if task.id in self._assignees:
             await self._channel.archive_task(task.id)

-        # Ensure it's in completed tasks set
-
+        # Ensure it's in completed tasks set by updating if it exists or
+        # appending if it's new.
+        task_found_in_completed = False
+        for i, t in enumerate(self._completed_tasks):
+            if t.id == task.id:
+                self._completed_tasks[i] = task
+                task_found_in_completed = True
+                break
+        if not task_found_in_completed:
+            self._completed_tasks.append(task)

         # Handle parent task completion logic
         parent = task.parent
-        if parent
+        if parent:
+            # Check if all subtasks are completed and successful
             all_subtasks_done = all(
-
+                any(
+                    t.id == sub.id and t.state == TaskState.DONE
+                    for t in self._completed_tasks
+                )
                 for sub in parent.subtasks
             )
             if all_subtasks_done:
-                #
+                # Collect results from successful subtasks only
+                successful_results = []
+                for sub in parent.subtasks:
+                    completed_subtask = next(
+                        (
+                            t
+                            for t in self._completed_tasks
+                            if t.id == sub.id and t.state == TaskState.DONE
+                        ),
+                        None,
+                    )
+                    if completed_subtask and completed_subtask.result:
+                        successful_results.append(
+                            f"--- Subtask {sub.id} Result ---\n"
+                            f"{completed_subtask.result}"
+                        )
+
+                # Set parent task state and result
                 parent.state = TaskState.DONE
+                parent.result = (
+                    "\n\n".join(successful_results)
+                    if successful_results
+                    else "All subtasks completed"
+                )
+
                 logger.debug(
                     f"All subtasks of {parent.id} are done. "
                     f"Marking parent as complete."
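The parent's aggregated result is a plain delimited join, so whatever consumes `parent.result` sees one labeled block per successful subtask. In miniature:

# Sketch: the parent-result string produced by the aggregation above.
successful_results = [
    "--- Subtask task.0 Result ---\n42",
    "--- Subtask task.1 Result ---\nok",
]
parent_result = (
    "\n\n".join(successful_results)
    if successful_results
    else "All subtasks completed"
)
print(parent_result)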
@@ -2164,7 +2494,9 @@ class Workforce(BaseNode):
                 await self._post_ready_tasks()
                 continue

-            self.
+            self._decrement_in_flight_tasks(
+                returned_task.id, "task returned successfully"
+            )

             # Check for stop request after getting task
             if self._stop_requested:
@@ -2249,8 +2581,9 @@ class Workforce(BaseNode):

         except Exception as e:
             # Decrement in-flight counter to prevent hanging
-
-
+            self._decrement_in_flight_tasks(
+                "unknown", "exception in task processing loop"
+            )

             logger.error(
                 f"Error processing task in workforce {self.node_id}: {e}"
@@ -2329,8 +2662,20 @@ class Workforce(BaseNode):
             for task in self._child_listening_tasks:
                 if not task.done():
                     task.cancel()
+
+            # Handle both asyncio.Task and concurrent.futures.
+            # Future
+            awaitables = []
+            for task in self._child_listening_tasks:
+                if isinstance(task, concurrent.futures.Future):
+                    # Convert Future to awaitable
+                    awaitables.append(asyncio.wrap_future(task))
+                else:
+                    # Already an asyncio.Task
+                    awaitables.append(task)
+
             await asyncio.gather(
-                *
+                *awaitables,
                 return_exceptions=True,
             )

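`asyncio.gather` accepts awaitables only, and a raw `concurrent.futures.Future` (as returned by `run_coroutine_threadsafe`) is not awaitable; `asyncio.wrap_future` bridges it onto the running loop. A runnable demonstration of the mixed-type shutdown pattern used above (here the cross-thread future is created from the loop's own thread purely for brevity):

import asyncio
import concurrent.futures


async def work(tag: str) -> str:
    await asyncio.sleep(0.01)
    return tag


async def main() -> None:
    loop = asyncio.get_running_loop()
    # One native task plus one cross-thread future, mirroring the mixed
    # deque of child listening tasks above.
    native = asyncio.create_task(work("task"))
    cross = asyncio.run_coroutine_threadsafe(work("future"), loop)

    awaitables = [
        asyncio.wrap_future(t)
        if isinstance(t, concurrent.futures.Future)
        else t
        for t in (native, cross)
    ]
    print(await asyncio.gather(*awaitables, return_exceptions=True))


asyncio.run(main())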