camel-ai 0.2.66__py3-none-any.whl → 0.2.67__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- camel/__init__.py +1 -1
- camel/configs/__init__.py +3 -0
- camel/configs/qianfan_config.py +85 -0
- camel/models/__init__.py +2 -0
- camel/models/aiml_model.py +8 -0
- camel/models/anthropic_model.py +8 -0
- camel/models/aws_bedrock_model.py +8 -0
- camel/models/azure_openai_model.py +14 -5
- camel/models/base_model.py +4 -0
- camel/models/cohere_model.py +9 -2
- camel/models/crynux_model.py +8 -0
- camel/models/deepseek_model.py +8 -0
- camel/models/gemini_model.py +8 -0
- camel/models/groq_model.py +8 -0
- camel/models/internlm_model.py +8 -0
- camel/models/litellm_model.py +5 -0
- camel/models/lmstudio_model.py +14 -1
- camel/models/mistral_model.py +15 -1
- camel/models/model_factory.py +6 -0
- camel/models/modelscope_model.py +8 -0
- camel/models/moonshot_model.py +8 -0
- camel/models/nemotron_model.py +17 -2
- camel/models/netmind_model.py +8 -0
- camel/models/novita_model.py +8 -0
- camel/models/nvidia_model.py +8 -0
- camel/models/ollama_model.py +8 -0
- camel/models/openai_compatible_model.py +23 -5
- camel/models/openai_model.py +21 -4
- camel/models/openrouter_model.py +8 -0
- camel/models/ppio_model.py +8 -0
- camel/models/qianfan_model.py +104 -0
- camel/models/qwen_model.py +8 -0
- camel/models/reka_model.py +18 -3
- camel/models/samba_model.py +17 -3
- camel/models/sglang_model.py +20 -5
- camel/models/siliconflow_model.py +8 -0
- camel/models/stub_model.py +8 -1
- camel/models/togetherai_model.py +8 -0
- camel/models/vllm_model.py +7 -0
- camel/models/volcano_model.py +14 -1
- camel/models/watsonx_model.py +4 -1
- camel/models/yi_model.py +8 -0
- camel/models/zhipuai_model.py +8 -0
- camel/societies/workforce/prompts.py +33 -17
- camel/societies/workforce/role_playing_worker.py +3 -8
- camel/societies/workforce/single_agent_worker.py +1 -3
- camel/societies/workforce/task_channel.py +16 -18
- camel/societies/workforce/utils.py +104 -14
- camel/societies/workforce/workforce.py +1253 -99
- camel/societies/workforce/workforce_logger.py +613 -0
- camel/tasks/task.py +16 -5
- camel/toolkits/__init__.py +2 -0
- camel/toolkits/code_execution.py +1 -1
- camel/toolkits/playwright_mcp_toolkit.py +2 -1
- camel/toolkits/pptx_toolkit.py +4 -4
- camel/types/enums.py +32 -0
- camel/types/unified_model_type.py +5 -0
- {camel_ai-0.2.66.dist-info → camel_ai-0.2.67.dist-info}/METADATA +3 -3
- {camel_ai-0.2.66.dist-info → camel_ai-0.2.67.dist-info}/RECORD +61 -58
- {camel_ai-0.2.66.dist-info → camel_ai-0.2.67.dist-info}/WHEEL +0 -0
- {camel_ai-0.2.66.dist-info → camel_ai-0.2.67.dist-info}/licenses/LICENSE +0 -0
|
@@ -15,9 +15,11 @@ from __future__ import annotations
|
|
|
15
15
|
|
|
16
16
|
import asyncio
|
|
17
17
|
import json
|
|
18
|
+
import time
|
|
18
19
|
import uuid
|
|
19
20
|
from collections import deque
|
|
20
|
-
from
|
|
21
|
+
from enum import Enum
|
|
22
|
+
from typing import Any, Coroutine, Deque, Dict, List, Optional
|
|
21
23
|
|
|
22
24
|
from colorama import Fore
|
|
23
25
|
|
|
@@ -41,13 +43,56 @@ from camel.societies.workforce.utils import (
|
|
|
41
43
|
)
|
|
42
44
|
from camel.societies.workforce.worker import Worker
|
|
43
45
|
from camel.tasks.task import Task, TaskState, validate_task_content
|
|
44
|
-
from camel.toolkits import
|
|
46
|
+
from camel.toolkits import (
|
|
47
|
+
CodeExecutionToolkit,
|
|
48
|
+
SearchToolkit,
|
|
49
|
+
TaskPlanningToolkit,
|
|
50
|
+
ThinkingToolkit,
|
|
51
|
+
)
|
|
45
52
|
from camel.types import ModelPlatformType, ModelType
|
|
46
53
|
from camel.utils import dependencies_required
|
|
47
54
|
|
|
55
|
+
from .workforce_logger import WorkforceLogger
|
|
56
|
+
|
|
48
57
|
logger = get_logger(__name__)
|
|
49
58
|
|
|
50
59
|
|
|
60
|
+
class WorkforceState(Enum):
    r"""Execution states a workforce can be in.

    The values are the lowercase state names; they are what
    ``Workforce.__repr__`` and ``get_workforce_status`` expose through
    ``.value``.
    """

    # No task is being processed.
    IDLE = "idle"
    # Tasks are being processed.
    RUNNING = "running"
    # Processing was suspended through the pause API.
    PAUSED = "paused"
    # Processing has been halted.
    STOPPED = "stopped"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class WorkforceSnapshot:
    r"""Snapshot of workforce state for resuming execution.

    Every container passed in is copied so that later changes to the live
    workforce containers do not alter the snapshot. The per-task dependency
    lists are copied as well. The :obj:`Task` objects themselves are shared
    with the caller, not cloned.

    Args:
        main_task (Optional[Task], optional): The top-level task being
            processed. (default: :obj:`None`)
        pending_tasks (Optional[Deque[Task]], optional): Tasks still queued.
            (default: :obj:`None`)
        completed_tasks (Optional[List[Task]], optional): Tasks already
            finished. (default: :obj:`None`)
        task_dependencies (Optional[Dict[str, List[str]]], optional): Mapping
            of task id to the ids it depends on. (default: :obj:`None`)
        assignees (Optional[Dict[str, str]], optional): Mapping of task id to
            assignee id. (default: :obj:`None`)
        current_task_index (int, optional): Progress marker stored verbatim.
            (default: :obj:`0`)
        description (str, optional): Free-form label for the snapshot.
            (default: :obj:`""`)
    """

    def __init__(
        self,
        main_task: Optional[Task] = None,
        pending_tasks: Optional[Deque[Task]] = None,
        completed_tasks: Optional[List[Task]] = None,
        task_dependencies: Optional[Dict[str, List[str]]] = None,
        assignees: Optional[Dict[str, str]] = None,
        current_task_index: int = 0,
        description: str = "",
    ):
        self.main_task = main_task
        self.pending_tasks = pending_tasks.copy() if pending_tasks else deque()
        self.completed_tasks = (
            completed_tasks.copy() if completed_tasks else []
        )
        # dict.copy() alone would leave the inner lists shared with the live
        # workforce, so each dependency list is copied too.
        self.task_dependencies = (
            {
                task_id: list(depends_on)
                for task_id, depends_on in task_dependencies.items()
            }
            if task_dependencies
            else {}
        )
        self.assignees = assignees.copy() if assignees else {}
        self.current_task_index = current_task_index
        self.description = description
        # Creation time of the snapshot (seconds since the epoch).
        self.timestamp = time.time()
|
|
94
|
+
|
|
95
|
+
|
|
51
96
|
class Workforce(BaseNode):
|
|
52
97
|
r"""A system where multiple worker nodes (agents) cooperate together
|
|
53
98
|
to solve tasks. It can assign tasks to worker nodes and also take
|
|
@@ -90,21 +135,35 @@ class Workforce(BaseNode):
|
|
|
90
135
|
for graceful shutdown when a task fails 3 times. During this
|
|
91
136
|
period, the workforce remains active for debugging.
|
|
92
137
|
Set to 0 for immediate shutdown. (default: :obj:`15.0`)
|
|
138
|
+
share_memory (bool, optional): Whether to enable shared memory across
|
|
139
|
+
SingleAgentWorker instances in the workforce. When enabled, all
|
|
140
|
+
SingleAgentWorker instances, coordinator agent, and task planning
|
|
141
|
+
agent will share their complete conversation history and
|
|
142
|
+
function-calling trajectory, providing better context for task
|
|
143
|
+
handoffs and continuity. Note: Currently only supports
|
|
144
|
+
SingleAgentWorker instances; RolePlayingWorker and nested
|
|
145
|
+
Workforce instances do not participate in memory sharing.
|
|
146
|
+
(default: :obj:`False`)
|
|
93
147
|
|
|
94
148
|
Example:
|
|
95
|
-
>>> # Configure with custom model
|
|
149
|
+
>>> # Configure with custom model and shared memory
|
|
150
|
+
>>> import asyncio
|
|
96
151
|
>>> model = ModelFactory.create(
|
|
97
152
|
... ModelPlatformType.OPENAI, ModelType.GPT_4O
|
|
98
153
|
... )
|
|
99
154
|
>>> workforce = Workforce(
|
|
100
155
|
... "Research Team",
|
|
101
156
|
... coordinator_agent_kwargs={"model": model, "token_limit": 4000},
|
|
102
|
-
... task_agent_kwargs={"model": model, "token_limit": 8000}
|
|
157
|
+
... task_agent_kwargs={"model": model, "token_limit": 8000},
|
|
158
|
+
... share_memory=True # Enable shared memory
|
|
103
159
|
... )
|
|
104
160
|
>>>
|
|
105
161
|
>>> # Process a task
|
|
106
|
-
>>>
|
|
107
|
-
|
|
162
|
+
>>> async def main():
|
|
163
|
+
... task = Task(content="Research AI trends", id="1")
|
|
164
|
+
... result = await workforce.process_task_async(task)
|
|
165
|
+
... return result
|
|
166
|
+
>>> asyncio.run(main())
|
|
108
167
|
"""
|
|
109
168
|
|
|
110
169
|
def __init__(
|
|
@@ -115,12 +174,44 @@ class Workforce(BaseNode):
|
|
|
115
174
|
task_agent_kwargs: Optional[Dict] = None,
|
|
116
175
|
new_worker_agent_kwargs: Optional[Dict] = None,
|
|
117
176
|
graceful_shutdown_timeout: float = 15.0,
|
|
177
|
+
share_memory: bool = False,
|
|
118
178
|
) -> None:
|
|
119
179
|
super().__init__(description)
|
|
120
180
|
self._child_listening_tasks: Deque[asyncio.Task] = deque()
|
|
121
181
|
self._children = children or []
|
|
122
182
|
self.new_worker_agent_kwargs = new_worker_agent_kwargs
|
|
123
183
|
self.graceful_shutdown_timeout = graceful_shutdown_timeout
|
|
184
|
+
self.share_memory = share_memory
|
|
185
|
+
self.metrics_logger = WorkforceLogger(workforce_id=self.node_id)
|
|
186
|
+
self._task: Optional[Task] = None
|
|
187
|
+
self._pending_tasks: Deque[Task] = deque()
|
|
188
|
+
self._task_dependencies: Dict[str, List[str]] = {}
|
|
189
|
+
self._assignees: Dict[str, str] = {}
|
|
190
|
+
self._in_flight_tasks: int = 0
|
|
191
|
+
# Dictionary to track task start times
|
|
192
|
+
self._task_start_times: Dict[str, float] = {}
|
|
193
|
+
# Human intervention support
|
|
194
|
+
self._state = WorkforceState.IDLE
|
|
195
|
+
self._pause_event = asyncio.Event()
|
|
196
|
+
self._pause_event.set() # Initially not paused
|
|
197
|
+
self._stop_requested = False
|
|
198
|
+
self._snapshots: List[WorkforceSnapshot] = []
|
|
199
|
+
self._completed_tasks: List[Task] = []
|
|
200
|
+
self._loop: Optional[asyncio.AbstractEventLoop] = None
|
|
201
|
+
self._main_task_future: Optional[asyncio.Future] = None
|
|
202
|
+
# Snapshot throttle support
|
|
203
|
+
self._last_snapshot_time: float = 0.0
|
|
204
|
+
# Minimum seconds between automatic snapshots
|
|
205
|
+
self.snapshot_interval: float = 30.0
|
|
206
|
+
if self.metrics_logger:
|
|
207
|
+
for child in self._children:
|
|
208
|
+
worker_type = type(child).__name__
|
|
209
|
+
role_or_desc = child.description
|
|
210
|
+
self.metrics_logger.log_worker_created(
|
|
211
|
+
worker_id=child.node_id,
|
|
212
|
+
worker_type=worker_type,
|
|
213
|
+
role=role_or_desc,
|
|
214
|
+
)
|
|
124
215
|
|
|
125
216
|
# Warning messages for default model usage
|
|
126
217
|
if coordinator_agent_kwargs is None:
|
|
@@ -154,6 +245,13 @@ class Workforce(BaseNode):
|
|
|
154
245
|
"available options."
|
|
155
246
|
)
|
|
156
247
|
|
|
248
|
+
if self.share_memory:
|
|
249
|
+
logger.info(
|
|
250
|
+
"Shared memory enabled. All agents will share their complete "
|
|
251
|
+
"conversation history and function-calling trajectory for "
|
|
252
|
+
"better context continuity during task handoffs."
|
|
253
|
+
)
|
|
254
|
+
|
|
157
255
|
coord_agent_sys_msg = BaseMessage.make_assistant_message(
|
|
158
256
|
role_name="Workforce Manager",
|
|
159
257
|
content="You are coordinating a group of workers. A worker can be "
|
|
@@ -163,21 +261,157 @@ class Workforce(BaseNode):
|
|
|
163
261
|
"a new worker for a task, etc.",
|
|
164
262
|
)
|
|
165
263
|
self.coordinator_agent = ChatAgent(
|
|
166
|
-
coord_agent_sys_msg,
|
|
264
|
+
coord_agent_sys_msg,
|
|
265
|
+
**(coordinator_agent_kwargs or {}),
|
|
167
266
|
)
|
|
168
267
|
|
|
169
268
|
task_sys_msg = BaseMessage.make_assistant_message(
|
|
170
269
|
role_name="Task Planner",
|
|
171
|
-
content="You are going to compose and decompose tasks."
|
|
270
|
+
content="You are going to compose and decompose tasks. Keep "
|
|
271
|
+
"tasks that are sequential and require the same type of "
|
|
272
|
+
"agent together in one agent process. Only decompose tasks "
|
|
273
|
+
"that can be handled in parallel and require different types "
|
|
274
|
+
"of agents. This ensures efficient execution by minimizing "
|
|
275
|
+
"context switching between agents.",
|
|
172
276
|
)
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
self.
|
|
177
|
-
self._pending_tasks: Deque[Task] = deque()
|
|
277
|
+
_kwargs = dict(task_agent_kwargs or {})
|
|
278
|
+
extra_tools = TaskPlanningToolkit().get_tools()
|
|
279
|
+
_kwargs["tools"] = [*_kwargs.get("tools", []), *extra_tools]
|
|
280
|
+
self.task_agent = ChatAgent(task_sys_msg, **_kwargs)
|
|
178
281
|
|
|
179
282
|
def __repr__(self):
|
|
180
|
-
return
|
|
283
|
+
return (
|
|
284
|
+
f"Workforce {self.node_id} ({self.description}) - "
|
|
285
|
+
f"State: {self._state.value}"
|
|
286
|
+
)
|
|
287
|
+
|
|
288
|
+
def _collect_shared_memory(self) -> Dict[str, List]:
|
|
289
|
+
r"""Collect memory from all SingleAgentWorker instances for sharing.
|
|
290
|
+
|
|
291
|
+
Returns:
|
|
292
|
+
Dict[str, List]: A dictionary mapping agent types to their memory
|
|
293
|
+
records. Contains entries for 'coordinator', 'task_agent',
|
|
294
|
+
and 'workers'.
|
|
295
|
+
"""
|
|
296
|
+
# TODO: add memory collection for RolePlayingWorker and nested
|
|
297
|
+
# Workforce instances
|
|
298
|
+
if not self.share_memory:
|
|
299
|
+
return {}
|
|
300
|
+
|
|
301
|
+
shared_memory: Dict[str, List] = {
|
|
302
|
+
'coordinator': [],
|
|
303
|
+
'task_agent': [],
|
|
304
|
+
'workers': [],
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
try:
|
|
308
|
+
# Collect coordinator agent memory
|
|
309
|
+
coord_records = self.coordinator_agent.memory.retrieve()
|
|
310
|
+
shared_memory['coordinator'] = [
|
|
311
|
+
record.memory_record.to_dict() for record in coord_records
|
|
312
|
+
]
|
|
313
|
+
|
|
314
|
+
# Collect task agent memory
|
|
315
|
+
task_records = self.task_agent.memory.retrieve()
|
|
316
|
+
shared_memory['task_agent'] = [
|
|
317
|
+
record.memory_record.to_dict() for record in task_records
|
|
318
|
+
]
|
|
319
|
+
|
|
320
|
+
# Collect worker memory only from SingleAgentWorker instances
|
|
321
|
+
for child in self._children:
|
|
322
|
+
if isinstance(child, SingleAgentWorker):
|
|
323
|
+
worker_records = child.worker.memory.retrieve()
|
|
324
|
+
worker_memory = [
|
|
325
|
+
record.memory_record.to_dict()
|
|
326
|
+
for record in worker_records
|
|
327
|
+
]
|
|
328
|
+
shared_memory['workers'].extend(worker_memory)
|
|
329
|
+
|
|
330
|
+
except Exception as e:
|
|
331
|
+
logger.warning(f"Error collecting shared memory: {e}")
|
|
332
|
+
|
|
333
|
+
return shared_memory
|
|
334
|
+
|
|
335
|
+
def _share_memory_with_agents(
|
|
336
|
+
self, shared_memory: Dict[str, List]
|
|
337
|
+
) -> None:
|
|
338
|
+
r"""Share collected memory with coordinator, task agent, and
|
|
339
|
+
SingleAgentWorker instances.
|
|
340
|
+
|
|
341
|
+
Args:
|
|
342
|
+
shared_memory (Dict[str, List]): Memory records collected from
|
|
343
|
+
all agents to be shared.
|
|
344
|
+
"""
|
|
345
|
+
if not self.share_memory or not shared_memory:
|
|
346
|
+
return
|
|
347
|
+
|
|
348
|
+
try:
|
|
349
|
+
# Create a consolidated memory from all collected records
|
|
350
|
+
all_records = []
|
|
351
|
+
for _memory_type, records in shared_memory.items():
|
|
352
|
+
all_records.extend(records)
|
|
353
|
+
|
|
354
|
+
if not all_records:
|
|
355
|
+
return
|
|
356
|
+
|
|
357
|
+
# Import necessary classes for memory record reconstruction
|
|
358
|
+
from camel.memories.records import MemoryRecord
|
|
359
|
+
|
|
360
|
+
# Create consolidated memory objects from records
|
|
361
|
+
memory_records: List[MemoryRecord] = []
|
|
362
|
+
for record_dict in all_records:
|
|
363
|
+
try:
|
|
364
|
+
memory_record = MemoryRecord.from_dict(record_dict)
|
|
365
|
+
memory_records.append(memory_record)
|
|
366
|
+
except Exception as e:
|
|
367
|
+
logger.warning(f"Failed to reconstruct memory record: {e}")
|
|
368
|
+
continue
|
|
369
|
+
|
|
370
|
+
if not memory_records:
|
|
371
|
+
return
|
|
372
|
+
|
|
373
|
+
# Share with coordinator agent
|
|
374
|
+
for record in memory_records:
|
|
375
|
+
# Only add records from other agents to avoid duplication
|
|
376
|
+
if record.agent_id != self.coordinator_agent.agent_id:
|
|
377
|
+
self.coordinator_agent.memory.write_record(record)
|
|
378
|
+
|
|
379
|
+
# Share with task agent
|
|
380
|
+
for record in memory_records:
|
|
381
|
+
if record.agent_id != self.task_agent.agent_id:
|
|
382
|
+
self.task_agent.memory.write_record(record)
|
|
383
|
+
|
|
384
|
+
# Share with SingleAgentWorker instances only
|
|
385
|
+
single_agent_workers = [
|
|
386
|
+
child
|
|
387
|
+
for child in self._children
|
|
388
|
+
if isinstance(child, SingleAgentWorker)
|
|
389
|
+
]
|
|
390
|
+
|
|
391
|
+
for worker in single_agent_workers:
|
|
392
|
+
for record in memory_records:
|
|
393
|
+
if record.agent_id != worker.worker.agent_id:
|
|
394
|
+
worker.worker.memory.write_record(record)
|
|
395
|
+
|
|
396
|
+
logger.info(
|
|
397
|
+
f"Shared {len(memory_records)} memory records across "
|
|
398
|
+
f"{len(single_agent_workers) + 2} agents in workforce "
|
|
399
|
+
f"{self.node_id}"
|
|
400
|
+
)
|
|
401
|
+
|
|
402
|
+
except Exception as e:
|
|
403
|
+
logger.warning(f"Error sharing memory with agents: {e}")
|
|
404
|
+
|
|
405
|
+
def _sync_shared_memory(self) -> None:
|
|
406
|
+
r"""Synchronize memory across all agents by collecting and sharing."""
|
|
407
|
+
if not self.share_memory:
|
|
408
|
+
return
|
|
409
|
+
|
|
410
|
+
try:
|
|
411
|
+
shared_memory = self._collect_shared_memory()
|
|
412
|
+
self._share_memory_with_agents(shared_memory)
|
|
413
|
+
except Exception as e:
|
|
414
|
+
logger.warning(f"Error synchronizing shared memory: {e}")
|
|
181
415
|
|
|
182
416
|
def _decompose_task(self, task: Task) -> List[Task]:
|
|
183
417
|
r"""Decompose the task into subtasks. This method will also set the
|
|
@@ -199,18 +433,313 @@ class Workforce(BaseNode):
|
|
|
199
433
|
|
|
200
434
|
return subtasks
|
|
201
435
|
|
|
436
|
+
# Human intervention methods
|
|
437
|
+
async def _async_pause(self) -> None:
|
|
438
|
+
r"""Async implementation of pause to run on the event loop."""
|
|
439
|
+
if self._state == WorkforceState.RUNNING:
|
|
440
|
+
self._state = WorkforceState.PAUSED
|
|
441
|
+
self._pause_event.clear()
|
|
442
|
+
logger.info(f"Workforce {self.node_id} paused.")
|
|
443
|
+
|
|
444
|
+
def pause(self) -> None:
|
|
445
|
+
r"""Pause the workforce execution.
|
|
446
|
+
If the internal event-loop is already running we schedule the
|
|
447
|
+
asynchronous pause coroutine onto it. When the loop has not yet
|
|
448
|
+
been created (e.g. the caller presses the hot-key immediately after
|
|
449
|
+
workforce start-up) we fall back to a synchronous state change so
|
|
450
|
+
that no tasks will be scheduled until the loop is ready.
|
|
451
|
+
"""
|
|
452
|
+
|
|
453
|
+
if self._loop and not self._loop.is_closed():
|
|
454
|
+
self._submit_coro_to_loop(self._async_pause())
|
|
455
|
+
else:
|
|
456
|
+
# Loop not yet created, just mark state so when loop starts it
|
|
457
|
+
# will proceed.
|
|
458
|
+
if self._state == WorkforceState.RUNNING:
|
|
459
|
+
self._state = WorkforceState.PAUSED
|
|
460
|
+
self._pause_event.clear()
|
|
461
|
+
logger.info(
|
|
462
|
+
f"Workforce {self.node_id} paused "
|
|
463
|
+
f"(event-loop not yet started)."
|
|
464
|
+
)
|
|
465
|
+
|
|
466
|
+
async def _async_resume(self) -> None:
|
|
467
|
+
r"""Async implementation of resume to run on the event loop."""
|
|
468
|
+
if self._state == WorkforceState.PAUSED:
|
|
469
|
+
self._state = WorkforceState.RUNNING
|
|
470
|
+
self._pause_event.set()
|
|
471
|
+
logger.info(f"Workforce {self.node_id} resumed.")
|
|
472
|
+
|
|
473
|
+
# Re-post ready tasks (if any)
|
|
474
|
+
if self._pending_tasks:
|
|
475
|
+
await self._post_ready_tasks()
|
|
476
|
+
|
|
477
|
+
def resume(self) -> None:
|
|
478
|
+
r"""Resume execution after a manual pause."""
|
|
479
|
+
|
|
480
|
+
if self._loop and not self._loop.is_closed():
|
|
481
|
+
self._submit_coro_to_loop(self._async_resume())
|
|
482
|
+
else:
|
|
483
|
+
# Loop not running yet, just mark state so when loop starts it
|
|
484
|
+
# will proceed.
|
|
485
|
+
if self._state == WorkforceState.PAUSED:
|
|
486
|
+
self._state = WorkforceState.RUNNING
|
|
487
|
+
self._pause_event.set()
|
|
488
|
+
logger.info(
|
|
489
|
+
f"Workforce {self.node_id} resumed "
|
|
490
|
+
f"(event-loop not yet started)."
|
|
491
|
+
)
|
|
492
|
+
|
|
493
|
+
async def _async_stop_gracefully(self) -> None:
|
|
494
|
+
r"""Async implementation of stop_gracefully to run on the event
|
|
495
|
+
loop.
|
|
496
|
+
"""
|
|
497
|
+
self._stop_requested = True
|
|
498
|
+
if self._pause_event.is_set() is False:
|
|
499
|
+
self._pause_event.set() # Resume if paused to process stop
|
|
500
|
+
logger.info(f"Workforce {self.node_id} stop requested.")
|
|
501
|
+
|
|
502
|
+
def stop_gracefully(self) -> None:
    r"""Ask the workforce to stop.

    If an event loop is attached and still open, the asynchronous stop
    coroutine is submitted to it. Otherwise the stop flag is set directly
    and the pause event is released, so that a loop started later sees the
    request instead of waiting on a pause.
    """
    loop = self._loop
    if not loop or loop.is_closed():
        # No usable loop: record the request synchronously.
        self._stop_requested = True
        # Release any pending pause so the stop request can be observed.
        self._pause_event.set()
        logger.info(
            f"Workforce {self.node_id} stop requested "
            f"(event-loop not yet started)."
        )
        return

    self._submit_coro_to_loop(self._async_stop_gracefully())
|
|
524
|
+
|
|
525
|
+
def save_snapshot(self, description: str = "") -> None:
    r"""Save the current scheduling state as a snapshot.

    The snapshot is appended to the internal snapshot list.

    Args:
        description (str, optional): Label stored with the snapshot. When
            empty, a label of the form ``"Snapshot at <time.time()>"`` is
            generated. (default: :obj:`""`)
    """
    # Resolve the label once so the stored snapshot and the log line agree;
    # logging the raw argument printed an empty label for default calls.
    label = description or f"Snapshot at {time.time()}"
    snapshot = WorkforceSnapshot(
        main_task=self._task,
        pending_tasks=self._pending_tasks,
        completed_tasks=self._completed_tasks,
        task_dependencies=self._task_dependencies,
        assignees=self._assignees,
        current_task_index=len(self._completed_tasks),
        description=label,
    )
    self._snapshots.append(snapshot)
    logger.info(f"Snapshot saved: {label}")
|
|
538
|
+
|
|
539
|
+
def list_snapshots(self) -> List[str]:
    r"""Describe every stored snapshot in one line each.

    Returns:
        List[str]: One string per snapshot, in storage order, of the form
            ``"Snapshot <i>: <n> completed, <m> pending"``, followed by
            ``" - <description>"`` when the snapshot has a non-empty
            description.
    """

    def _summarize(index, snap) -> str:
        suffix = f" - {snap.description}" if snap.description else ""
        return (
            f"Snapshot {index}: {len(snap.completed_tasks)} completed, "
            f"{len(snap.pending_tasks)} pending{suffix}"
        )

    return [
        _summarize(index, snap)
        for index, snap in enumerate(self._snapshots)
    ]
|
|
552
|
+
|
|
553
|
+
def get_pending_tasks(self) -> List[Task]:
    r"""Return the queued tasks as a new list, in queue order.

    Returns:
        List[Task]: A fresh list; changing it does not change the queue.
    """
    return [*self._pending_tasks]
|
|
556
|
+
|
|
557
|
+
def get_completed_tasks(self) -> List[Task]:
    r"""Return a shallow copy of the completed-task list.

    Returns:
        List[Task]: A new list holding the same task objects.
    """
    finished = self._completed_tasks
    return finished.copy()
|
|
560
|
+
|
|
561
|
+
def modify_task_content(self, task_id: str, new_content: str) -> bool:
    r"""Replace the content of a task that is still in the pending queue.

    Args:
        task_id (str): Id of the pending task to change.
        new_content (str): Replacement content; it must pass
            ``validate_task_content``.

    Returns:
        bool: True when the task was found and updated. False when the
            content is rejected or no pending task has that id.
    """
    # Reject invalid content before touching the queue.
    if not validate_task_content(new_content, task_id):
        logger.warning(
            f"Task {task_id} content modification rejected: "
            f"Invalid content. Content preview: '{new_content[:50]}...'"
        )
        return False

    # First pending task with a matching id, if any.
    target = next(
        (queued for queued in self._pending_tasks if queued.id == task_id),
        None,
    )
    if target is None:
        logger.warning(f"Task {task_id} not found in pending tasks.")
        return False

    target.content = new_content
    logger.info(f"Task {task_id} content modified.")
    return True
|
|
578
|
+
|
|
579
|
+
def add_task(
    self,
    content: str,
    task_id: Optional[str] = None,
    additional_info: Optional[Dict[str, Any]] = None,
    insert_position: int = -1,
) -> Task:
    r"""Add a new task to the pending queue.

    Args:
        content (str): Content of the new task.
        task_id (Optional[str], optional): Id for the new task. When not
            given (or empty), an id of the form ``human_added_<n>`` is
            generated that is not used by any pending or completed task.
            (default: :obj:`None`)
        additional_info (Optional[Dict[str, Any]], optional): Extra data
            passed to the task unchanged. (default: :obj:`None`)
        insert_position (int, optional): Queue index to insert at. The
            value ``-1`` means "append at the end"; any other value follows
            ``insert`` index semantics. (default: :obj:`-1`)

    Returns:
        Task: The newly created task.
    """
    if not task_id:
        # Using the queue length alone repeats an id as soon as the queue
        # shrinks (a task is removed or completes), so probe for a free one.
        taken = {queued.id for queued in self._pending_tasks}
        taken.update(finished.id for finished in self._completed_tasks)
        suffix = len(self._pending_tasks)
        task_id = f"human_added_{suffix}"
        while task_id in taken:
            suffix += 1
            task_id = f"human_added_{suffix}"

    new_task = Task(
        content=content,
        id=task_id,
        additional_info=additional_info,
    )
    if insert_position == -1:
        self._pending_tasks.append(new_task)
    else:
        # deque supports positional insert directly; no list round-trip.
        self._pending_tasks.insert(insert_position, new_task)

    logger.info(f"New task added: {new_task.id}")
    return new_task
|
|
602
|
+
|
|
603
|
+
def remove_task(self, task_id: str) -> bool:
    r"""Drop the first pending task whose id matches.

    Args:
        task_id (str): Id of the task to remove.

    Returns:
        bool: True when a task was removed, False when no pending task has
            that id.
    """
    queue = list(self._pending_tasks)
    hit = next(
        (pos for pos, queued in enumerate(queue) if queued.id == task_id),
        None,
    )
    if hit is None:
        logger.warning(f"Task {task_id} not found in pending tasks.")
        return False

    del queue[hit]
    # The queue is rebuilt as a new deque rather than edited in place.
    self._pending_tasks = deque(queue)
    logger.info(f"Task {task_id} removed.")
    return True
|
|
615
|
+
|
|
616
|
+
def reorder_tasks(self, task_ids: List[str]) -> bool:
    r"""Reorder pending tasks according to the provided task IDs list.

    Args:
        task_ids (List[str]): The ids of all pending tasks, each exactly
            once, in the desired new order.

    Returns:
        bool: True when the queue was reordered. False (queue unchanged)
            when an id is unknown, the number of ids differs from the
            number of pending tasks, or an id is repeated.
    """
    by_id = {task.id: task for task in self._pending_tasks}

    # Check if all provided IDs exist
    if not all(task_id in by_id for task_id in task_ids):
        logger.warning("Some task IDs not found in pending tasks.")
        return False

    # Check if we have the same number of tasks
    if len(task_ids) != len(self._pending_tasks):
        logger.warning(
            "Number of task IDs doesn't match pending tasks count."
        )
        return False

    # A repeated id passes both checks above, yet would silently drop one
    # pending task and queue another twice.
    if len(set(task_ids)) != len(task_ids):
        logger.warning("Duplicate task IDs provided for reordering.")
        return False

    self._pending_tasks = deque(by_id[task_id] for task_id in task_ids)

    logger.info("Tasks reordered successfully.")
    return True
|
|
639
|
+
|
|
640
|
+
def resume_from_task(self, task_id: str) -> bool:
|
|
641
|
+
r"""Resume execution from a specific task."""
|
|
642
|
+
if self._state != WorkforceState.PAUSED:
|
|
643
|
+
logger.warning(
|
|
644
|
+
"Workforce must be paused to resume from specific task."
|
|
645
|
+
)
|
|
646
|
+
return False
|
|
647
|
+
|
|
648
|
+
# Find the task in pending tasks
|
|
649
|
+
tasks_list = list(self._pending_tasks)
|
|
650
|
+
target_index = -1
|
|
651
|
+
|
|
652
|
+
for i, task in enumerate(tasks_list):
|
|
653
|
+
if task.id == task_id:
|
|
654
|
+
target_index = i
|
|
655
|
+
break
|
|
656
|
+
|
|
657
|
+
if target_index == -1:
|
|
658
|
+
logger.warning(f"Task {task_id} not found in pending tasks.")
|
|
659
|
+
return False
|
|
660
|
+
|
|
661
|
+
# Move completed tasks that come after the target task back to pending
|
|
662
|
+
tasks_to_move_back = tasks_list[:target_index]
|
|
663
|
+
remaining_tasks = tasks_list[target_index:]
|
|
664
|
+
|
|
665
|
+
# Update pending tasks to start from the target task
|
|
666
|
+
self._pending_tasks = deque(remaining_tasks)
|
|
667
|
+
|
|
668
|
+
# Move previously "completed" tasks that are after target back to
|
|
669
|
+
# pending and reset their state
|
|
670
|
+
if tasks_to_move_back:
|
|
671
|
+
# Reset state for tasks being moved back to pending
|
|
672
|
+
for task in tasks_to_move_back:
|
|
673
|
+
# Handle all possible task states
|
|
674
|
+
if task.state in [TaskState.DONE, TaskState.FAILED]:
|
|
675
|
+
task.state = TaskState.OPEN
|
|
676
|
+
# Clear result to avoid confusion
|
|
677
|
+
task.result = None
|
|
678
|
+
# Reset failure count to give task a fresh start
|
|
679
|
+
task.failure_count = 0
|
|
680
|
+
|
|
681
|
+
logger.info(
|
|
682
|
+
f"Moving {len(tasks_to_move_back)} tasks back to pending "
|
|
683
|
+
f"state."
|
|
684
|
+
)
|
|
685
|
+
|
|
686
|
+
logger.info(f"Ready to resume from task: {task_id}")
|
|
687
|
+
return True
|
|
688
|
+
|
|
689
|
+
def restore_from_snapshot(self, snapshot_index: int) -> bool:
    r"""Restore workforce state from a snapshot.

    The main task, pending queue, completed list, dependency map and
    assignee map are replaced by copies of the snapshot's containers, so
    the stored snapshot can be restored again later.

    Args:
        snapshot_index (int): Zero-based index into the stored snapshots.

    Returns:
        bool: True when the state was restored. False when the index is
            out of range or the workforce is currently running.
    """
    if not (0 <= snapshot_index < len(self._snapshots)):
        logger.warning(f"Invalid snapshot index: {snapshot_index}")
        return False

    if self._state == WorkforceState.RUNNING:
        logger.warning(
            "Cannot restore snapshot while workforce is running. "
            "Pause first."
        )
        return False

    snapshot = self._snapshots[snapshot_index]
    self._task = snapshot.main_task
    self._pending_tasks = snapshot.pending_tasks.copy()
    self._completed_tasks = snapshot.completed_tasks.copy()
    # Copy the inner lists too: a plain dict.copy() would let later edits
    # of the live dependency lists leak back into the stored snapshot.
    self._task_dependencies = {
        task_id: list(depends_on)
        for task_id, depends_on in snapshot.task_dependencies.items()
    }
    self._assignees = snapshot.assignees.copy()

    logger.info(f"Workforce state restored from snapshot {snapshot_index}")
    return True
|
|
711
|
+
|
|
712
|
+
def get_workforce_status(self) -> Dict:
    r"""Summarize the workforce state as a dictionary.

    Returns:
        Dict: The keys are ``state`` (the state's string value),
            ``pending_tasks_count``, ``completed_tasks_count``,
            ``snapshots_count``, ``children_count`` and ``main_task_id``
            (None when no main task is set).
    """
    main_task = self._task
    main_task_id = main_task.id if main_task else None

    status: Dict = {"state": self._state.value}
    status["pending_tasks_count"] = len(self._pending_tasks)
    status["completed_tasks_count"] = len(self._completed_tasks)
    status["snapshots_count"] = len(self._snapshots)
    status["children_count"] = len(self._children)
    status["main_task_id"] = main_task_id
    return status
|
|
722
|
+
|
|
202
723
|
@check_if_running(False)
|
|
203
|
-
def
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
724
|
+
async def process_task_async(
|
|
725
|
+
self, task: Task, interactive: bool = False
|
|
726
|
+
) -> Task:
|
|
727
|
+
r"""Main entry point to process a task asynchronously.
|
|
207
728
|
|
|
208
729
|
Args:
|
|
209
730
|
task (Task): The task to be processed.
|
|
731
|
+
interactive (bool, optional): If True, enables human-intervention
|
|
732
|
+
workflow (pause/resume/snapshot). Defaults to False, which
|
|
733
|
+
runs the task in a blocking one-shot manner.
|
|
210
734
|
|
|
211
735
|
Returns:
|
|
212
736
|
Task: The updated task.
|
|
213
737
|
"""
|
|
738
|
+
# Delegate to intervention pipeline when requested to keep
|
|
739
|
+
# backward-compat.
|
|
740
|
+
if interactive:
|
|
741
|
+
return await self._process_task_with_snapshot(task)
|
|
742
|
+
|
|
214
743
|
if not validate_task_content(task.content, task.id):
|
|
215
744
|
task.state = TaskState.FAILED
|
|
216
745
|
task.result = "Task failed: Invalid or empty content provided"
|
|
@@ -222,18 +751,235 @@ class Workforce(BaseNode):
|
|
|
222
751
|
|
|
223
752
|
self.reset()
|
|
224
753
|
self._task = task
|
|
754
|
+
if self.metrics_logger:
|
|
755
|
+
self.metrics_logger.log_task_created(
|
|
756
|
+
task_id=task.id,
|
|
757
|
+
description=task.content,
|
|
758
|
+
task_type=task.type,
|
|
759
|
+
metadata=task.additional_info,
|
|
760
|
+
)
|
|
225
761
|
task.state = TaskState.FAILED
|
|
226
|
-
self._pending_tasks.append(task)
|
|
227
762
|
# The agent tend to be overconfident on the whole task, so we
|
|
228
763
|
# decompose the task into subtasks first
|
|
229
764
|
subtasks = self._decompose_task(task)
|
|
765
|
+
if self.metrics_logger and subtasks:
|
|
766
|
+
self.metrics_logger.log_task_decomposed(
|
|
767
|
+
parent_task_id=task.id, subtask_ids=[st.id for st in subtasks]
|
|
768
|
+
)
|
|
769
|
+
for subtask in subtasks:
|
|
770
|
+
self.metrics_logger.log_task_created(
|
|
771
|
+
task_id=subtask.id,
|
|
772
|
+
description=subtask.content,
|
|
773
|
+
parent_task_id=task.id,
|
|
774
|
+
task_type=subtask.type,
|
|
775
|
+
metadata=subtask.additional_info,
|
|
776
|
+
)
|
|
777
|
+
if subtasks:
|
|
778
|
+
# If decomposition happened, the original task becomes a container.
|
|
779
|
+
# We only execute its subtasks.
|
|
780
|
+
self._pending_tasks.extendleft(reversed(subtasks))
|
|
781
|
+
else:
|
|
782
|
+
# If no decomposition, execute the original task.
|
|
783
|
+
self._pending_tasks.append(task)
|
|
784
|
+
|
|
785
|
+
self.set_channel(TaskChannel())
|
|
786
|
+
|
|
787
|
+
await self.start()
|
|
788
|
+
|
|
789
|
+
if subtasks:
|
|
790
|
+
task.result = "\n\n".join(
|
|
791
|
+
f"--- Subtask {sub.id} Result ---\n{sub.result}"
|
|
792
|
+
for sub in task.subtasks
|
|
793
|
+
if sub.result
|
|
794
|
+
)
|
|
795
|
+
if task.subtasks and all(
|
|
796
|
+
sub.state == TaskState.DONE for sub in task.subtasks
|
|
797
|
+
):
|
|
798
|
+
task.state = TaskState.DONE
|
|
799
|
+
else:
|
|
800
|
+
task.state = TaskState.FAILED
|
|
801
|
+
|
|
802
|
+
return task
|
|
803
|
+
|
|
804
|
+
def process_task(self, task: Task) -> Task:
|
|
805
|
+
r"""Synchronous wrapper for process_task that handles async operations
|
|
806
|
+
internally.
|
|
807
|
+
|
|
808
|
+
Args:
|
|
809
|
+
task (Task): The task to be processed.
|
|
810
|
+
|
|
811
|
+
Returns:
|
|
812
|
+
Task: The updated task.
|
|
813
|
+
|
|
814
|
+
Example:
|
|
815
|
+
>>> workforce = Workforce("My Team")
|
|
816
|
+
>>> task = Task(content="Analyze data", id="1")
|
|
817
|
+
>>> result = workforce.process_task(task) # No async/await
|
|
818
|
+
needed
|
|
819
|
+
>>> print(result.result)
|
|
820
|
+
"""
|
|
821
|
+
import asyncio
|
|
822
|
+
import concurrent.futures
|
|
823
|
+
|
|
824
|
+
# Check if we're already in an event loop
|
|
825
|
+
try:
|
|
826
|
+
asyncio.get_running_loop()
|
|
827
|
+
|
|
828
|
+
# If we're in an event loop, we need to run in a thread
|
|
829
|
+
def run_in_thread():
|
|
830
|
+
# Create new event loop for this thread
|
|
831
|
+
new_loop = asyncio.new_event_loop()
|
|
832
|
+
asyncio.set_event_loop(new_loop)
|
|
833
|
+
try:
|
|
834
|
+
return new_loop.run_until_complete(
|
|
835
|
+
self.process_task_async(task)
|
|
836
|
+
)
|
|
837
|
+
finally:
|
|
838
|
+
new_loop.close()
|
|
839
|
+
|
|
840
|
+
with concurrent.futures.ThreadPoolExecutor() as executor:
|
|
841
|
+
future = executor.submit(run_in_thread)
|
|
842
|
+
return future.result()
|
|
843
|
+
|
|
844
|
+
except RuntimeError:
|
|
845
|
+
# No event loop running, we can create one
|
|
846
|
+
return asyncio.run(self.process_task_async(task))
|
|
847
|
+
|
|
848
|
+
async def _process_task_with_snapshot(self, task: Task) -> Task:
|
|
849
|
+
r"""Async version of process_task that supports human intervention.
|
|
850
|
+
This method can be paused, resumed, and allows task modification.
|
|
851
|
+
|
|
852
|
+
Args:
|
|
853
|
+
task (Task): The task to be processed.
|
|
854
|
+
|
|
855
|
+
Returns:
|
|
856
|
+
Task: The updated task.
|
|
857
|
+
"""
|
|
858
|
+
|
|
859
|
+
if not validate_task_content(task.content, task.id):
|
|
860
|
+
task.state = TaskState.FAILED
|
|
861
|
+
task.result = "Task failed: Invalid or empty content provided"
|
|
862
|
+
logger.warning(
|
|
863
|
+
f"Task {task.id} rejected: Invalid or empty content. "
|
|
864
|
+
f"Content preview: '{task.content[:50]}...'"
|
|
865
|
+
)
|
|
866
|
+
return task
|
|
867
|
+
|
|
868
|
+
self.reset()
|
|
869
|
+
self._task = task
|
|
870
|
+
self._state = WorkforceState.RUNNING
|
|
871
|
+
task.state = TaskState.OPEN
|
|
872
|
+
self._pending_tasks.append(task)
|
|
873
|
+
|
|
874
|
+
# Decompose the task into subtasks first
|
|
875
|
+
subtasks = self._decompose_task(task)
|
|
230
876
|
self._pending_tasks.extendleft(reversed(subtasks))
|
|
231
877
|
self.set_channel(TaskChannel())
|
|
232
878
|
|
|
233
|
-
|
|
879
|
+
# Save initial snapshot
|
|
880
|
+
self.save_snapshot("Initial task decomposition")
|
|
881
|
+
|
|
882
|
+
try:
|
|
883
|
+
await self.start()
|
|
884
|
+
except Exception as e:
|
|
885
|
+
logger.error(f"Error in workforce execution: {e}")
|
|
886
|
+
self._state = WorkforceState.STOPPED
|
|
887
|
+
raise
|
|
888
|
+
finally:
|
|
889
|
+
if self._state != WorkforceState.STOPPED:
|
|
890
|
+
self._state = WorkforceState.IDLE
|
|
234
891
|
|
|
235
892
|
return task
|
|
236
893
|
|
|
894
|
+
def _process_task_with_intervention(self, task: Task) -> Task:
|
|
895
|
+
r"""Process task with human intervention support. This creates and
|
|
896
|
+
manages its own event loop to allow for pausing/resuming functionality.
|
|
897
|
+
|
|
898
|
+
Args:
|
|
899
|
+
task (Task): The task to be processed.
|
|
900
|
+
|
|
901
|
+
Returns:
|
|
902
|
+
Task: The updated task.
|
|
903
|
+
"""
|
|
904
|
+
# Create new event loop if none exists or if we need a fresh one
|
|
905
|
+
try:
|
|
906
|
+
self._loop = asyncio.get_event_loop()
|
|
907
|
+
if self._loop.is_closed():
|
|
908
|
+
self._loop = asyncio.new_event_loop()
|
|
909
|
+
asyncio.set_event_loop(self._loop)
|
|
910
|
+
except RuntimeError:
|
|
911
|
+
self._loop = asyncio.new_event_loop()
|
|
912
|
+
asyncio.set_event_loop(self._loop)
|
|
913
|
+
|
|
914
|
+
try:
|
|
915
|
+
return self._loop.run_until_complete(
|
|
916
|
+
self._process_task_with_snapshot(task)
|
|
917
|
+
)
|
|
918
|
+
finally:
|
|
919
|
+
# Decide whether to keep or close the loop
|
|
920
|
+
if self._loop and not self._loop.is_closed():
|
|
921
|
+
if self._state == WorkforceState.PAUSED:
|
|
922
|
+
# Keep alive to support resume()
|
|
923
|
+
logger.info(
|
|
924
|
+
"Event loop kept alive for potential resume "
|
|
925
|
+
"operations."
|
|
926
|
+
)
|
|
927
|
+
else:
|
|
928
|
+
# No more tasks; shut everything down cleanly
|
|
929
|
+
try:
|
|
930
|
+
# Ensure all async generators are finished
|
|
931
|
+
self._loop.run_until_complete(
|
|
932
|
+
self._loop.shutdown_asyncgens()
|
|
933
|
+
)
|
|
934
|
+
except RuntimeError:
|
|
935
|
+
# Loop already running elsewhere
|
|
936
|
+
pass
|
|
937
|
+
self._loop.close()
|
|
938
|
+
|
|
939
|
+
def continue_from_pause(self) -> Optional[Task]:
|
|
940
|
+
r"""Continue execution from a paused state. This reuses the
|
|
941
|
+
existing event loop.
|
|
942
|
+
|
|
943
|
+
Returns:
|
|
944
|
+
Optional[Task]: The completed task if execution finishes, None if
|
|
945
|
+
still running/paused.
|
|
946
|
+
"""
|
|
947
|
+
if self._state != WorkforceState.PAUSED:
|
|
948
|
+
logger.warning("Workforce is not in paused state.")
|
|
949
|
+
return None
|
|
950
|
+
|
|
951
|
+
if self._loop is None or self._loop.is_closed():
|
|
952
|
+
logger.error("No active event loop available for resuming.")
|
|
953
|
+
return None
|
|
954
|
+
|
|
955
|
+
# Resume execution
|
|
956
|
+
self.resume()
|
|
957
|
+
|
|
958
|
+
try:
|
|
959
|
+
# Continue the existing async task
|
|
960
|
+
remaining_task = self._loop.run_until_complete(
|
|
961
|
+
self._continue_execution()
|
|
962
|
+
)
|
|
963
|
+
return remaining_task
|
|
964
|
+
except Exception as e:
|
|
965
|
+
logger.error(f"Error continuing execution: {e}")
|
|
966
|
+
self._state = WorkforceState.STOPPED
|
|
967
|
+
return None
|
|
968
|
+
|
|
969
|
+
async def _continue_execution(self) -> Optional[Task]:
|
|
970
|
+
r"""Internal method to continue execution after pause."""
|
|
971
|
+
try:
|
|
972
|
+
await self._listen_to_channel()
|
|
973
|
+
except Exception as e:
|
|
974
|
+
logger.error(f"Error in continued execution: {e}")
|
|
975
|
+
self._state = WorkforceState.STOPPED
|
|
976
|
+
raise
|
|
977
|
+
finally:
|
|
978
|
+
if self._state != WorkforceState.STOPPED:
|
|
979
|
+
self._state = WorkforceState.IDLE
|
|
980
|
+
|
|
981
|
+
return self._task
|
|
982
|
+
|
|
237
983
|
@check_if_running(False)
|
|
238
984
|
def add_single_agent_worker(
|
|
239
985
|
self, description: str, worker: ChatAgent
|
|
@@ -249,6 +995,12 @@ class Workforce(BaseNode):
|
|
|
249
995
|
"""
|
|
250
996
|
worker_node = SingleAgentWorker(description, worker)
|
|
251
997
|
self._children.append(worker_node)
|
|
998
|
+
if self.metrics_logger:
|
|
999
|
+
self.metrics_logger.log_worker_created(
|
|
1000
|
+
worker_id=worker_node.node_id,
|
|
1001
|
+
worker_type='SingleAgentWorker',
|
|
1002
|
+
role=worker_node.description,
|
|
1003
|
+
)
|
|
252
1004
|
return self
|
|
253
1005
|
|
|
254
1006
|
@check_if_running(False)
|
|
@@ -293,6 +1045,12 @@ class Workforce(BaseNode):
|
|
|
293
1045
|
chat_turn_limit=chat_turn_limit,
|
|
294
1046
|
)
|
|
295
1047
|
self._children.append(worker_node)
|
|
1048
|
+
if self.metrics_logger:
|
|
1049
|
+
self.metrics_logger.log_worker_created(
|
|
1050
|
+
worker_id=worker_node.node_id,
|
|
1051
|
+
worker_type='RolePlayingWorker',
|
|
1052
|
+
role=worker_node.description,
|
|
1053
|
+
)
|
|
296
1054
|
return self
|
|
297
1055
|
|
|
298
1056
|
@check_if_running(False)
|
|
@@ -308,19 +1066,50 @@ class Workforce(BaseNode):
|
|
|
308
1066
|
self._children.append(workforce)
|
|
309
1067
|
return self
|
|
310
1068
|
|
|
1069
|
+
async def _async_reset(self) -> None:
|
|
1070
|
+
r"""Async implementation of reset to run on the event loop."""
|
|
1071
|
+
self._pause_event.set()
|
|
1072
|
+
|
|
311
1073
|
@check_if_running(False)
|
|
312
1074
|
def reset(self) -> None:
|
|
313
1075
|
r"""Reset the workforce and all the child nodes under it. Can only
|
|
314
|
-
be called when the workforce is not running.
|
|
1076
|
+
be called when the workforce is not running.
|
|
1077
|
+
"""
|
|
315
1078
|
super().reset()
|
|
316
1079
|
self._task = None
|
|
317
1080
|
self._pending_tasks.clear()
|
|
318
1081
|
self._child_listening_tasks.clear()
|
|
1082
|
+
# Clear dependency tracking
|
|
1083
|
+
self._task_dependencies.clear()
|
|
1084
|
+
self._completed_tasks = []
|
|
1085
|
+
self._assignees.clear()
|
|
1086
|
+
self._in_flight_tasks = 0
|
|
319
1087
|
self.coordinator_agent.reset()
|
|
320
1088
|
self.task_agent.reset()
|
|
1089
|
+
self._task_start_times.clear()
|
|
321
1090
|
for child in self._children:
|
|
322
1091
|
child.reset()
|
|
323
1092
|
|
|
1093
|
+
# Reset intervention state
|
|
1094
|
+
self._state = WorkforceState.IDLE
|
|
1095
|
+
self._stop_requested = False
|
|
1096
|
+
# Handle asyncio.Event in a thread-safe way
|
|
1097
|
+
if self._loop and not self._loop.is_closed():
|
|
1098
|
+
# If we have a loop, use it to set the event safely
|
|
1099
|
+
asyncio.run_coroutine_threadsafe(
|
|
1100
|
+
self._async_reset(), self._loop
|
|
1101
|
+
).result()
|
|
1102
|
+
else:
|
|
1103
|
+
try:
|
|
1104
|
+
self._reset_task = asyncio.create_task(self._async_reset())
|
|
1105
|
+
except RuntimeError:
|
|
1106
|
+
asyncio.run(self._async_reset())
|
|
1107
|
+
|
|
1108
|
+
if hasattr(self, 'logger') and self.metrics_logger is not None:
|
|
1109
|
+
self.metrics_logger.reset_task_data()
|
|
1110
|
+
else:
|
|
1111
|
+
self.metrics_logger = WorkforceLogger(workforce_id=self.node_id)
|
|
1112
|
+
|
|
324
1113
|
@check_if_running(False)
|
|
325
1114
|
def set_channel(self, channel: TaskChannel) -> None:
|
|
326
1115
|
r"""Set the channel for the node and all the child nodes under it."""
|
|
@@ -350,21 +1139,36 @@ class Workforce(BaseNode):
|
|
|
350
1139
|
|
|
351
1140
|
def _find_assignee(
|
|
352
1141
|
self,
|
|
353
|
-
|
|
354
|
-
) ->
|
|
355
|
-
r"""Assigns
|
|
1142
|
+
tasks: List[Task],
|
|
1143
|
+
) -> TaskAssignResult:
|
|
1144
|
+
r"""Assigns multiple tasks to worker nodes with the best capabilities.
|
|
356
1145
|
|
|
357
1146
|
Parameters:
|
|
358
|
-
|
|
1147
|
+
tasks (List[Task]): The tasks to be assigned.
|
|
359
1148
|
|
|
360
1149
|
Returns:
|
|
361
|
-
|
|
1150
|
+
TaskAssignResult: Assignment result containing task assignments
|
|
1151
|
+
with their dependencies.
|
|
362
1152
|
"""
|
|
363
1153
|
self.coordinator_agent.reset()
|
|
1154
|
+
|
|
1155
|
+
# Format tasks information for the prompt
|
|
1156
|
+
tasks_info = ""
|
|
1157
|
+
for task in tasks:
|
|
1158
|
+
tasks_info += f"Task ID: {task.id}\n"
|
|
1159
|
+
tasks_info += f"Content: {task.content}\n"
|
|
1160
|
+
if task.additional_info:
|
|
1161
|
+
tasks_info += f"Additional Info: {task.additional_info}\n"
|
|
1162
|
+
tasks_info += "---\n"
|
|
1163
|
+
|
|
364
1164
|
prompt = ASSIGN_TASK_PROMPT.format(
|
|
365
|
-
|
|
1165
|
+
tasks_info=tasks_info,
|
|
366
1166
|
child_nodes_info=self._get_child_nodes_info(),
|
|
367
|
-
|
|
1167
|
+
)
|
|
1168
|
+
|
|
1169
|
+
logger.debug(
|
|
1170
|
+
f"Sending batch assignment request to coordinator "
|
|
1171
|
+
f"for {len(tasks)} tasks."
|
|
368
1172
|
)
|
|
369
1173
|
|
|
370
1174
|
response = self.coordinator_agent.step(
|
|
@@ -372,9 +1176,17 @@ class Workforce(BaseNode):
|
|
|
372
1176
|
)
|
|
373
1177
|
result_dict = json.loads(response.msg.content, parse_int=str)
|
|
374
1178
|
task_assign_result = TaskAssignResult(**result_dict)
|
|
375
|
-
return task_assign_result
|
|
1179
|
+
return task_assign_result
|
|
376
1180
|
|
|
377
1181
|
async def _post_task(self, task: Task, assignee_id: str) -> None:
|
|
1182
|
+
# Record the start time when a task is posted
|
|
1183
|
+
self._task_start_times[task.id] = time.time()
|
|
1184
|
+
|
|
1185
|
+
if self.metrics_logger:
|
|
1186
|
+
self.metrics_logger.log_task_started(
|
|
1187
|
+
task_id=task.id, worker_id=assignee_id
|
|
1188
|
+
)
|
|
1189
|
+
self._in_flight_tasks += 1
|
|
378
1190
|
await self._channel.post_task(task, self.node_id, assignee_id)
|
|
379
1191
|
|
|
380
1192
|
async def _post_dependency(self, dependency: Task) -> None:
|
|
@@ -416,6 +1228,13 @@ class Workforce(BaseNode):
|
|
|
416
1228
|
print(f"{Fore.CYAN}{new_node} created.{Fore.RESET}")
|
|
417
1229
|
|
|
418
1230
|
self._children.append(new_node)
|
|
1231
|
+
if self.metrics_logger:
|
|
1232
|
+
self.metrics_logger.log_worker_created(
|
|
1233
|
+
worker_id=new_node.node_id,
|
|
1234
|
+
worker_type='SingleAgentWorker',
|
|
1235
|
+
role=new_node_conf.role,
|
|
1236
|
+
metadata={'description': new_node_conf.description},
|
|
1237
|
+
)
|
|
419
1238
|
self._child_listening_tasks.append(
|
|
420
1239
|
asyncio.create_task(new_node.start())
|
|
421
1240
|
)
|
|
@@ -447,62 +1266,277 @@ class Workforce(BaseNode):
|
|
|
447
1266
|
|
|
448
1267
|
async def _get_returned_task(self) -> Task:
|
|
449
1268
|
r"""Get the task that's published by this node and just get returned
|
|
450
|
-
from the assignee.
|
|
1269
|
+
from the assignee. Includes timeout handling to prevent indefinite
|
|
1270
|
+
waiting.
|
|
451
1271
|
"""
|
|
452
|
-
|
|
1272
|
+
try:
|
|
1273
|
+
# Add timeout to prevent indefinite waiting
|
|
1274
|
+
return await asyncio.wait_for(
|
|
1275
|
+
self._channel.get_returned_task_by_publisher(self.node_id),
|
|
1276
|
+
timeout=300.0, # 5 minute timeout
|
|
1277
|
+
)
|
|
1278
|
+
except asyncio.TimeoutError:
|
|
1279
|
+
logger.warning(
|
|
1280
|
+
f"Timeout waiting for returned task in "
|
|
1281
|
+
f"workforce {self.node_id}"
|
|
1282
|
+
)
|
|
1283
|
+
raise ValueError("Timeout waiting for task to be returned")
|
|
453
1284
|
|
|
454
1285
|
async def _post_ready_tasks(self) -> None:
|
|
455
|
-
r"""
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
1286
|
+
r"""Checks for unassigned tasks, assigns them, and then posts any
|
|
1287
|
+
tasks whose dependencies have been met."""
|
|
1288
|
+
|
|
1289
|
+
# Step 1: Identify and assign any new tasks in the pending queue
|
|
1290
|
+
tasks_to_assign = [
|
|
1291
|
+
task
|
|
1292
|
+
for task in self._pending_tasks
|
|
1293
|
+
if task.id not in self._task_dependencies
|
|
1294
|
+
]
|
|
1295
|
+
if tasks_to_assign:
|
|
1296
|
+
logger.debug(
|
|
1297
|
+
f"Found {len(tasks_to_assign)} new tasks. "
|
|
1298
|
+
f"Requesting assignment..."
|
|
1299
|
+
)
|
|
1300
|
+
batch_result = self._find_assignee(tasks_to_assign)
|
|
1301
|
+
logger.debug(
|
|
1302
|
+
f"Coordinator returned assignments:\n"
|
|
1303
|
+
f"{json.dumps(batch_result.dict(), indent=2)}"
|
|
1304
|
+
)
|
|
1305
|
+
for assignment in batch_result.assignments:
|
|
1306
|
+
self._task_dependencies[assignment.task_id] = (
|
|
1307
|
+
assignment.dependencies
|
|
1308
|
+
)
|
|
1309
|
+
self._assignees[assignment.task_id] = assignment.assignee_id
|
|
1310
|
+
if self.metrics_logger:
|
|
1311
|
+
# queue_time_seconds can be derived by logger if task
|
|
1312
|
+
# creation time is logged
|
|
1313
|
+
self.metrics_logger.log_task_assigned(
|
|
1314
|
+
task_id=assignment.task_id,
|
|
1315
|
+
worker_id=assignment.assignee_id,
|
|
1316
|
+
dependencies=assignment.dependencies,
|
|
1317
|
+
queue_time_seconds=None,
|
|
1318
|
+
)
|
|
1319
|
+
|
|
1320
|
+
# Step 2: Iterate through all pending tasks and post those that are
|
|
1321
|
+
# ready
|
|
1322
|
+
posted_tasks = []
|
|
1323
|
+
for task in self._pending_tasks:
|
|
1324
|
+
# A task must be assigned to be considered for posting
|
|
1325
|
+
if task.id in self._task_dependencies:
|
|
1326
|
+
dependencies = self._task_dependencies[task.id]
|
|
1327
|
+
# Check if all dependencies for this task are in the completed
|
|
1328
|
+
# set
|
|
1329
|
+
if all(
|
|
1330
|
+
dep_id in {t.id for t in self._completed_tasks}
|
|
1331
|
+
for dep_id in dependencies
|
|
1332
|
+
):
|
|
1333
|
+
assignee_id = self._assignees[task.id]
|
|
1334
|
+
logger.debug(
|
|
1335
|
+
f"Posting task {task.id} to assignee {assignee_id}. "
|
|
1336
|
+
f"Dependencies met."
|
|
1337
|
+
)
|
|
1338
|
+
await self._post_task(task, assignee_id)
|
|
1339
|
+
posted_tasks.append(task)
|
|
1340
|
+
|
|
1341
|
+
# Step 3: Remove the posted tasks from the pending list
|
|
1342
|
+
for task in posted_tasks:
|
|
1343
|
+
try:
|
|
1344
|
+
self._pending_tasks.remove(task)
|
|
1345
|
+
except ValueError:
|
|
1346
|
+
# Task might have been removed by another process, which is
|
|
1347
|
+
# fine
|
|
1348
|
+
pass
|
|
459
1349
|
|
|
460
|
-
|
|
461
|
-
|
|
1350
|
+
async def _handle_failed_task(self, task: Task) -> bool:
|
|
1351
|
+
task.failure_count += 1
|
|
462
1352
|
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
for subtask in ready_task.subtasks:
|
|
473
|
-
await self._channel.remove_task(subtask.id)
|
|
474
|
-
# Send the task to the channel as a dependency
|
|
475
|
-
await self._post_dependency(ready_task)
|
|
476
|
-
self._pending_tasks.popleft()
|
|
477
|
-
# Try to send the next task in the pending list
|
|
478
|
-
await self._post_ready_tasks()
|
|
479
|
-
else:
|
|
480
|
-
# Directly post the task to the channel if it's a new one
|
|
481
|
-
# Find a node to assign the task
|
|
482
|
-
assignee_id = self._find_assignee(task=ready_task)
|
|
483
|
-
await self._post_task(ready_task, assignee_id)
|
|
1353
|
+
if self.metrics_logger:
|
|
1354
|
+
worker_id = self._assignees.get(task.id)
|
|
1355
|
+
self.metrics_logger.log_task_failed(
|
|
1356
|
+
task_id=task.id,
|
|
1357
|
+
worker_id=worker_id,
|
|
1358
|
+
error_message=task.result or "Task execution failed",
|
|
1359
|
+
error_type="TaskFailure",
|
|
1360
|
+
metadata={'failure_count': task.failure_count},
|
|
1361
|
+
)
|
|
484
1362
|
|
|
485
|
-
async def _handle_failed_task(self, task: Task) -> bool:
|
|
486
1363
|
if task.failure_count >= 3:
|
|
487
1364
|
return True
|
|
488
|
-
|
|
489
|
-
# Remove the failed task from the channel
|
|
490
|
-
await self._channel.remove_task(task.id)
|
|
1365
|
+
|
|
491
1366
|
if task.get_depth() >= 3:
|
|
492
1367
|
# Create a new worker node and reassign
|
|
493
1368
|
assignee = self._create_worker_node_for_task(task)
|
|
1369
|
+
|
|
1370
|
+
# Sync shared memory after creating new worker to provide context
|
|
1371
|
+
if self.share_memory:
|
|
1372
|
+
logger.info(
|
|
1373
|
+
f"Syncing shared memory after creating new worker "
|
|
1374
|
+
f"{assignee.node_id} for failed task {task.id}"
|
|
1375
|
+
)
|
|
1376
|
+
self._sync_shared_memory()
|
|
1377
|
+
|
|
494
1378
|
await self._post_task(task, assignee.node_id)
|
|
1379
|
+
action_taken = f"reassigned to new worker {assignee.node_id}"
|
|
495
1380
|
else:
|
|
496
1381
|
subtasks = self._decompose_task(task)
|
|
1382
|
+
if self.metrics_logger and subtasks:
|
|
1383
|
+
self.metrics_logger.log_task_decomposed(
|
|
1384
|
+
parent_task_id=task.id,
|
|
1385
|
+
subtask_ids=[st.id for st in subtasks],
|
|
1386
|
+
)
|
|
1387
|
+
for subtask in subtasks:
|
|
1388
|
+
self.metrics_logger.log_task_created(
|
|
1389
|
+
task_id=subtask.id,
|
|
1390
|
+
description=subtask.content,
|
|
1391
|
+
parent_task_id=task.id,
|
|
1392
|
+
task_type=subtask.type,
|
|
1393
|
+
metadata=subtask.additional_info,
|
|
1394
|
+
)
|
|
497
1395
|
# Insert packets at the head of the queue
|
|
498
1396
|
self._pending_tasks.extendleft(reversed(subtasks))
|
|
1397
|
+
|
|
1398
|
+
# Sync shared memory after task decomposition
|
|
1399
|
+
if self.share_memory:
|
|
1400
|
+
logger.info(
|
|
1401
|
+
f"Syncing shared memory after decomposing failed "
|
|
1402
|
+
f"task {task.id}"
|
|
1403
|
+
)
|
|
1404
|
+
self._sync_shared_memory()
|
|
1405
|
+
|
|
499
1406
|
await self._post_ready_tasks()
|
|
1407
|
+
action_taken = f"decomposed into {len(subtasks)} subtasks"
|
|
1408
|
+
if task.id in self._assignees:
|
|
1409
|
+
await self._channel.archive_task(task.id)
|
|
1410
|
+
|
|
1411
|
+
logger.debug(
|
|
1412
|
+
f"Task {task.id} failed and was {action_taken}. "
|
|
1413
|
+
f"Updating dependency state."
|
|
1414
|
+
)
|
|
1415
|
+
# Mark task as completed for dependency tracking
|
|
1416
|
+
self._completed_tasks.append(task)
|
|
1417
|
+
|
|
1418
|
+
# Post next ready tasks
|
|
1419
|
+
|
|
1420
|
+
# Sync shared memory after task completion to share knowledge
|
|
1421
|
+
if self.share_memory:
|
|
1422
|
+
logger.info(
|
|
1423
|
+
f"Syncing shared memory after task {task.id} completion"
|
|
1424
|
+
)
|
|
1425
|
+
self._sync_shared_memory()
|
|
1426
|
+
|
|
1427
|
+
# Check if any pending tasks are now ready to execute
|
|
1428
|
+
await self._post_ready_tasks()
|
|
500
1429
|
return False
|
|
501
1430
|
|
|
502
1431
|
async def _handle_completed_task(self, task: Task) -> None:
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
1432
|
+
if self.metrics_logger:
|
|
1433
|
+
worker_id = self._assignees.get(task.id, "unknown")
|
|
1434
|
+
processing_time_seconds = None
|
|
1435
|
+
token_usage = None
|
|
1436
|
+
|
|
1437
|
+
# Get processing time from task start time or additional info
|
|
1438
|
+
if task.id in self._task_start_times:
|
|
1439
|
+
processing_time_seconds = (
|
|
1440
|
+
time.time() - self._task_start_times[task.id]
|
|
1441
|
+
)
|
|
1442
|
+
del self._task_start_times[task.id] # Prevent memory leaks
|
|
1443
|
+
elif (
|
|
1444
|
+
task.additional_info is not None
|
|
1445
|
+
and 'processing_time_seconds' in task.additional_info
|
|
1446
|
+
):
|
|
1447
|
+
processing_time_seconds = task.additional_info[
|
|
1448
|
+
'processing_time_seconds'
|
|
1449
|
+
]
|
|
1450
|
+
|
|
1451
|
+
# Get token usage from task additional info
|
|
1452
|
+
if (
|
|
1453
|
+
task.additional_info is not None
|
|
1454
|
+
and 'token_usage' in task.additional_info
|
|
1455
|
+
):
|
|
1456
|
+
token_usage = task.additional_info['token_usage']
|
|
1457
|
+
|
|
1458
|
+
# Try to get token usage from SingleAgentWorker memory if available
|
|
1459
|
+
assignee_node = next(
|
|
1460
|
+
(
|
|
1461
|
+
child
|
|
1462
|
+
for child in self._children
|
|
1463
|
+
if child.node_id == worker_id
|
|
1464
|
+
),
|
|
1465
|
+
None,
|
|
1466
|
+
)
|
|
1467
|
+
if isinstance(assignee_node, SingleAgentWorker):
|
|
1468
|
+
_, total_tokens = assignee_node.worker.memory.get_context()
|
|
1469
|
+
token_usage = {'total_tokens': total_tokens}
|
|
1470
|
+
|
|
1471
|
+
# Log the completed task
|
|
1472
|
+
self.metrics_logger.log_task_completed(
|
|
1473
|
+
task_id=task.id,
|
|
1474
|
+
worker_id=worker_id,
|
|
1475
|
+
result_summary=task.result if task.result else "Completed",
|
|
1476
|
+
processing_time_seconds=processing_time_seconds,
|
|
1477
|
+
token_usage=token_usage,
|
|
1478
|
+
metadata={'current_state': task.state.value},
|
|
1479
|
+
)
|
|
1480
|
+
|
|
1481
|
+
# Find and remove the completed task from pending tasks
|
|
1482
|
+
tasks_list = list(self._pending_tasks)
|
|
1483
|
+
found_and_removed = False
|
|
1484
|
+
|
|
1485
|
+
for i, pending_task in enumerate(tasks_list):
|
|
1486
|
+
if pending_task.id == task.id:
|
|
1487
|
+
# Remove this specific task
|
|
1488
|
+
tasks_list.pop(i)
|
|
1489
|
+
self._pending_tasks = deque(tasks_list)
|
|
1490
|
+
found_and_removed = True
|
|
1491
|
+
print(
|
|
1492
|
+
f"{Fore.GREEN}✅ Task {task.id} completed and removed "
|
|
1493
|
+
f"from queue.{Fore.RESET}"
|
|
1494
|
+
)
|
|
1495
|
+
break
|
|
1496
|
+
|
|
1497
|
+
if not found_and_removed:
|
|
1498
|
+
# Task was already removed from pending queue (expected case when
|
|
1499
|
+
# it had been popped immediately after posting). No need to
|
|
1500
|
+
# draw user attention with a warning; record at debug level.
|
|
1501
|
+
logger.debug(
|
|
1502
|
+
f"Completed task {task.id} was already removed from pending "
|
|
1503
|
+
"queue."
|
|
1504
|
+
)
|
|
1505
|
+
|
|
1506
|
+
# Archive the task and update dependency tracking
|
|
1507
|
+
if task.id in self._assignees:
|
|
1508
|
+
await self._channel.archive_task(task.id)
|
|
1509
|
+
|
|
1510
|
+
# Ensure it's in completed tasks set
|
|
1511
|
+
self._completed_tasks.append(task)
|
|
1512
|
+
|
|
1513
|
+
# Handle parent task completion logic
|
|
1514
|
+
parent = task.parent
|
|
1515
|
+
if parent and parent.id not in {t.id for t in self._completed_tasks}:
|
|
1516
|
+
all_subtasks_done = all(
|
|
1517
|
+
sub.id in {t.id for t in self._completed_tasks}
|
|
1518
|
+
for sub in parent.subtasks
|
|
1519
|
+
)
|
|
1520
|
+
if all_subtasks_done:
|
|
1521
|
+
# Set the parent task state to done
|
|
1522
|
+
parent.state = TaskState.DONE
|
|
1523
|
+
logger.debug(
|
|
1524
|
+
f"All subtasks of {parent.id} are done. "
|
|
1525
|
+
f"Marking parent as complete."
|
|
1526
|
+
)
|
|
1527
|
+
# Treat the parent task as a completed task to unblock
|
|
1528
|
+
# its dependents. Since it was never sent to a worker,
|
|
1529
|
+
# we call this method recursively.
|
|
1530
|
+
await self._handle_completed_task(parent)
|
|
1531
|
+
|
|
1532
|
+
# Sync shared memory after task completion to share knowledge
|
|
1533
|
+
if self.share_memory:
|
|
1534
|
+
logger.info(
|
|
1535
|
+
f"Syncing shared memory after task {task.id} completion"
|
|
1536
|
+
)
|
|
1537
|
+
self._sync_shared_memory()
|
|
1538
|
+
|
|
1539
|
+
# Check if any pending tasks are now ready to execute
|
|
506
1540
|
await self._post_ready_tasks()
|
|
507
1541
|
|
|
508
1542
|
async def _graceful_shutdown(self, failed_task: Task) -> None:
|
|
@@ -521,50 +1555,157 @@ class Workforce(BaseNode):
|
|
|
521
1555
|
f"seconds due to failure. You can use this time to inspect the "
|
|
522
1556
|
f"current state of the workforce."
|
|
523
1557
|
)
|
|
524
|
-
|
|
525
1558
|
# Wait for the full timeout period
|
|
526
1559
|
await asyncio.sleep(self.graceful_shutdown_timeout)
|
|
527
1560
|
|
|
1561
|
+
def get_workforce_log_tree(self) -> str:
|
|
1562
|
+
r"""Returns an ASCII tree representation of the task hierarchy and
|
|
1563
|
+
worker status.
|
|
1564
|
+
"""
|
|
1565
|
+
if not self.metrics_logger:
|
|
1566
|
+
return "Logger not initialized."
|
|
1567
|
+
return self.metrics_logger.get_ascii_tree_representation()
|
|
1568
|
+
|
|
1569
|
+
def get_workforce_kpis(self) -> Dict[str, Any]:
|
|
1570
|
+
r"""Returns a dictionary of key performance indicators."""
|
|
1571
|
+
if not self.metrics_logger:
|
|
1572
|
+
return {"error": "Logger not initialized."}
|
|
1573
|
+
return self.metrics_logger.get_kpis()
|
|
1574
|
+
|
|
1575
|
+
def dump_workforce_logs(self, file_path: str) -> None:
|
|
1576
|
+
r"""Dumps all collected logs to a JSON file.
|
|
1577
|
+
|
|
1578
|
+
Args:
|
|
1579
|
+
file_path (str): The path to the JSON file.
|
|
1580
|
+
"""
|
|
1581
|
+
if not self.metrics_logger:
|
|
1582
|
+
print("Logger not initialized. Cannot dump logs.")
|
|
1583
|
+
return
|
|
1584
|
+
self.metrics_logger.dump_to_json(file_path)
|
|
1585
|
+
# Use logger.info or print, consistent with existing style
|
|
1586
|
+
logger.info(f"Workforce logs dumped to {file_path}")
|
|
1587
|
+
|
|
528
1588
|
@check_if_running(False)
|
|
529
1589
|
async def _listen_to_channel(self) -> None:
|
|
530
1590
|
r"""Continuously listen to the channel, post task to the channel and
|
|
531
|
-
track the status of posted tasks.
|
|
1591
|
+
track the status of posted tasks. Now supports pause/resume and
|
|
1592
|
+
graceful stop.
|
|
532
1593
|
"""
|
|
533
1594
|
|
|
534
1595
|
self._running = True
|
|
1596
|
+
self._state = WorkforceState.RUNNING
|
|
535
1597
|
logger.info(f"Workforce {self.node_id} started.")
|
|
536
1598
|
|
|
537
1599
|
await self._post_ready_tasks()
|
|
538
1600
|
|
|
539
|
-
while
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
1601
|
+
while (
|
|
1602
|
+
self._task is None
|
|
1603
|
+
or self._pending_tasks
|
|
1604
|
+
or self._in_flight_tasks > 0
|
|
1605
|
+
) and not self._stop_requested:
|
|
1606
|
+
try:
|
|
1607
|
+
# Check for pause request at the beginning of each loop
|
|
1608
|
+
# iteration
|
|
1609
|
+
await self._pause_event.wait()
|
|
1610
|
+
|
|
1611
|
+
# Check for stop request after potential pause
|
|
1612
|
+
if self._stop_requested:
|
|
1613
|
+
logger.info("Stop requested, breaking execution loop.")
|
|
1614
|
+
break
|
|
1615
|
+
|
|
1616
|
+
# Save snapshot before processing next task
|
|
1617
|
+
if self._pending_tasks:
|
|
1618
|
+
current_task = self._pending_tasks[0]
|
|
1619
|
+
# Throttled snapshot
|
|
1620
|
+
if (
|
|
1621
|
+
time.time() - self._last_snapshot_time
|
|
1622
|
+
>= self.snapshot_interval
|
|
1623
|
+
):
|
|
1624
|
+
self.save_snapshot(
|
|
1625
|
+
f"Before processing task: {current_task.id}"
|
|
1626
|
+
)
|
|
1627
|
+
self._last_snapshot_time = time.time()
|
|
1628
|
+
|
|
1629
|
+
# Get returned task (this may block until a task is returned)
|
|
1630
|
+
returned_task = await self._get_returned_task()
|
|
1631
|
+
self._in_flight_tasks -= 1
|
|
1632
|
+
|
|
1633
|
+
# Check for stop request after getting task
|
|
1634
|
+
if self._stop_requested:
|
|
1635
|
+
logger.info("Stop requested after receiving task.")
|
|
1636
|
+
break
|
|
1637
|
+
|
|
1638
|
+
# Process the returned task based on its state
|
|
1639
|
+
if returned_task.state == TaskState.DONE:
|
|
1640
|
+
print(
|
|
1641
|
+
f"{Fore.CYAN}🎯 Task {returned_task.id} completed "
|
|
1642
|
+
f"successfully.{Fore.RESET}"
|
|
1643
|
+
)
|
|
1644
|
+
await self._handle_completed_task(returned_task)
|
|
1645
|
+
elif returned_task.state == TaskState.FAILED:
|
|
1646
|
+
halt = await self._handle_failed_task(returned_task)
|
|
1647
|
+
if not halt:
|
|
1648
|
+
continue
|
|
1649
|
+
print(
|
|
1650
|
+
f"{Fore.RED}Task {returned_task.id} has failed "
|
|
1651
|
+
f"for 3 times, halting the workforce.{Fore.RESET}"
|
|
1652
|
+
)
|
|
1653
|
+
# Graceful shutdown instead of immediate break
|
|
1654
|
+
await self._graceful_shutdown(returned_task)
|
|
1655
|
+
break
|
|
1656
|
+
elif returned_task.state == TaskState.OPEN:
|
|
1657
|
+
# TODO: multi-layer workforce
|
|
1658
|
+
pass
|
|
1659
|
+
else:
|
|
1660
|
+
raise ValueError(
|
|
1661
|
+
f"Task {returned_task.id} has an unexpected state."
|
|
1662
|
+
)
|
|
1663
|
+
|
|
1664
|
+
except Exception as e:
|
|
1665
|
+
logger.error(f"Error processing task: {e}")
|
|
1666
|
+
if self._stop_requested:
|
|
1667
|
+
break
|
|
1668
|
+
# Continue with next iteration unless stop is requested
|
|
1669
|
+
continue
|
|
1670
|
+
|
|
1671
|
+
# Handle final state
|
|
1672
|
+
if self._stop_requested:
|
|
1673
|
+
self._state = WorkforceState.STOPPED
|
|
1674
|
+
logger.info("Workforce stopped by user request.")
|
|
1675
|
+
elif not self._pending_tasks and self._in_flight_tasks == 0:
|
|
1676
|
+
self._state = WorkforceState.IDLE
|
|
1677
|
+
logger.info("All tasks completed.")
|
|
561
1678
|
|
|
562
1679
|
# shut down the whole workforce tree
|
|
563
1680
|
self.stop()
|
|
564
1681
|
|
|
1682
|
+
def _submit_coro_to_loop(self, coro: 'Coroutine') -> None:
|
|
1683
|
+
r"""Thread-safe submission of coroutine to the workforce loop."""
|
|
1684
|
+
|
|
1685
|
+
loop = self._loop
|
|
1686
|
+
if loop is None or loop.is_closed():
|
|
1687
|
+
logger.warning("Cannot submit coroutine - no active event loop")
|
|
1688
|
+
return
|
|
1689
|
+
try:
|
|
1690
|
+
running_loop = asyncio.get_running_loop()
|
|
1691
|
+
except RuntimeError:
|
|
1692
|
+
running_loop = None
|
|
1693
|
+
|
|
1694
|
+
if running_loop is loop:
|
|
1695
|
+
loop.create_task(coro)
|
|
1696
|
+
else:
|
|
1697
|
+
asyncio.run_coroutine_threadsafe(coro, loop)
|
|
1698
|
+
|
|
565
1699
|
@check_if_running(False)
|
|
566
1700
|
async def start(self) -> None:
|
|
567
1701
|
r"""Start itself and all the child nodes under it."""
|
|
1702
|
+
# Sync shared memory at the start to ensure all agents have context
|
|
1703
|
+
if self.share_memory:
|
|
1704
|
+
logger.info(
|
|
1705
|
+
f"Syncing shared memory at workforce {self.node_id} startup"
|
|
1706
|
+
)
|
|
1707
|
+
self._sync_shared_memory()
|
|
1708
|
+
|
|
568
1709
|
for child in self._children:
|
|
569
1710
|
child_listening_task = asyncio.create_task(child.start())
|
|
570
1711
|
self._child_listening_tasks.append(child_listening_task)
|
|
@@ -576,7 +1717,8 @@ class Workforce(BaseNode):
|
|
|
576
1717
|
by its parent node.
|
|
577
1718
|
"""
|
|
578
1719
|
for child in self._children:
|
|
579
|
-
child.
|
|
1720
|
+
if child._running:
|
|
1721
|
+
child.stop()
|
|
580
1722
|
for child_task in self._child_listening_tasks:
|
|
581
1723
|
child_task.cancel()
|
|
582
1724
|
self._running = False
|
|
@@ -596,12 +1738,21 @@ class Workforce(BaseNode):
|
|
|
596
1738
|
"""
|
|
597
1739
|
|
|
598
1740
|
# Create a new instance with the same configuration
|
|
1741
|
+
# Extract the original kwargs from the agents to properly clone them
|
|
1742
|
+
coordinator_kwargs = (
|
|
1743
|
+
getattr(self.coordinator_agent, 'init_kwargs', {}) or {}
|
|
1744
|
+
)
|
|
1745
|
+
task_kwargs = getattr(self.task_agent, 'init_kwargs', {}) or {}
|
|
1746
|
+
|
|
599
1747
|
new_instance = Workforce(
|
|
600
1748
|
description=self.description,
|
|
601
|
-
coordinator_agent_kwargs=
|
|
602
|
-
task_agent_kwargs=
|
|
603
|
-
new_worker_agent_kwargs=self.new_worker_agent_kwargs
|
|
1749
|
+
coordinator_agent_kwargs=coordinator_kwargs.copy(),
|
|
1750
|
+
task_agent_kwargs=task_kwargs.copy(),
|
|
1751
|
+
new_worker_agent_kwargs=self.new_worker_agent_kwargs.copy()
|
|
1752
|
+
if self.new_worker_agent_kwargs
|
|
1753
|
+
else None,
|
|
604
1754
|
graceful_shutdown_timeout=self.graceful_shutdown_timeout,
|
|
1755
|
+
share_memory=self.share_memory,
|
|
605
1756
|
)
|
|
606
1757
|
|
|
607
1758
|
new_instance.task_agent = self.task_agent.clone(with_memory)
|
|
@@ -620,10 +1771,10 @@ class Workforce(BaseNode):
|
|
|
620
1771
|
child.description,
|
|
621
1772
|
child.assistant_role_name,
|
|
622
1773
|
child.user_role_name,
|
|
1774
|
+
child.chat_turn_limit,
|
|
623
1775
|
child.assistant_agent_kwargs,
|
|
624
1776
|
child.user_agent_kwargs,
|
|
625
1777
|
child.summarize_agent_kwargs,
|
|
626
|
-
child.chat_turn_limit,
|
|
627
1778
|
)
|
|
628
1779
|
elif isinstance(child, Workforce):
|
|
629
1780
|
new_instance.add_workforce(child.clone(with_memory))
|
|
@@ -682,7 +1833,9 @@ class Workforce(BaseNode):
|
|
|
682
1833
|
workforce_instance = self
|
|
683
1834
|
|
|
684
1835
|
# Define functions first
|
|
685
|
-
def process_task(
|
|
1836
|
+
async def process_task(
|
|
1837
|
+
task_content, task_id=None, additional_info=None
|
|
1838
|
+
):
|
|
686
1839
|
r"""Process a task using the workforce.
|
|
687
1840
|
|
|
688
1841
|
Args:
|
|
@@ -704,7 +1857,8 @@ class Workforce(BaseNode):
|
|
|
704
1857
|
- message (str): Error message if status is "error"
|
|
705
1858
|
|
|
706
1859
|
Example:
|
|
707
|
-
>>> result = process_task("Analyze market trends",
|
|
1860
|
+
>>> result = await process_task("Analyze market trends",
|
|
1861
|
+
"task_001")
|
|
708
1862
|
>>> print(result["status"]) # "success" or "error"
|
|
709
1863
|
"""
|
|
710
1864
|
task = Task(
|
|
@@ -714,7 +1868,7 @@ class Workforce(BaseNode):
|
|
|
714
1868
|
)
|
|
715
1869
|
|
|
716
1870
|
try:
|
|
717
|
-
result_task = workforce_instance.process_task(task)
|
|
1871
|
+
result_task = await workforce_instance.process_task(task)
|
|
718
1872
|
return {
|
|
719
1873
|
"status": "success",
|
|
720
1874
|
"task_id": result_task.id,
|
|
@@ -834,9 +1988,9 @@ class Workforce(BaseNode):
|
|
|
834
1988
|
>>> for child in children:
|
|
835
1989
|
... print(f"{child['type']}: {child['description']}")
|
|
836
1990
|
"""
|
|
837
|
-
children_info = []
|
|
1991
|
+
children_info: List[Dict[str, Any]] = []
|
|
838
1992
|
for child in workforce_instance._children:
|
|
839
|
-
child_info = {
|
|
1993
|
+
child_info: Dict[str, Any] = {
|
|
840
1994
|
"node_id": child.node_id,
|
|
841
1995
|
"description": child.description,
|
|
842
1996
|
"type": type(child).__name__,
|