fleet-python 0.2.66b2__py3-none-any.whl → 0.2.105__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/export_tasks.py +16 -5
- examples/export_tasks_filtered.py +245 -0
- examples/fetch_tasks.py +230 -0
- examples/import_tasks.py +140 -8
- examples/iterate_verifiers.py +725 -0
- fleet/__init__.py +128 -5
- fleet/_async/__init__.py +27 -3
- fleet/_async/base.py +24 -9
- fleet/_async/client.py +938 -41
- fleet/_async/env/client.py +60 -3
- fleet/_async/instance/client.py +52 -7
- fleet/_async/models.py +15 -0
- fleet/_async/resources/api.py +200 -0
- fleet/_async/resources/sqlite.py +1801 -46
- fleet/_async/tasks.py +122 -25
- fleet/_async/verifiers/bundler.py +22 -21
- fleet/_async/verifiers/verifier.py +25 -19
- fleet/agent/__init__.py +32 -0
- fleet/agent/gemini_cua/Dockerfile +45 -0
- fleet/agent/gemini_cua/__init__.py +10 -0
- fleet/agent/gemini_cua/agent.py +759 -0
- fleet/agent/gemini_cua/mcp/main.py +108 -0
- fleet/agent/gemini_cua/mcp_server/__init__.py +5 -0
- fleet/agent/gemini_cua/mcp_server/main.py +105 -0
- fleet/agent/gemini_cua/mcp_server/tools.py +178 -0
- fleet/agent/gemini_cua/requirements.txt +5 -0
- fleet/agent/gemini_cua/start.sh +30 -0
- fleet/agent/orchestrator.py +854 -0
- fleet/agent/types.py +49 -0
- fleet/agent/utils.py +34 -0
- fleet/base.py +34 -9
- fleet/cli.py +1061 -0
- fleet/client.py +1060 -48
- fleet/config.py +1 -1
- fleet/env/__init__.py +16 -0
- fleet/env/client.py +60 -3
- fleet/eval/__init__.py +15 -0
- fleet/eval/uploader.py +231 -0
- fleet/exceptions.py +8 -0
- fleet/instance/client.py +53 -8
- fleet/instance/models.py +1 -0
- fleet/models.py +303 -0
- fleet/proxy/__init__.py +25 -0
- fleet/proxy/proxy.py +453 -0
- fleet/proxy/whitelist.py +244 -0
- fleet/resources/api.py +200 -0
- fleet/resources/sqlite.py +1845 -46
- fleet/tasks.py +113 -20
- fleet/utils/__init__.py +7 -0
- fleet/utils/http_logging.py +178 -0
- fleet/utils/logging.py +13 -0
- fleet/utils/playwright.py +440 -0
- fleet/verifiers/bundler.py +22 -21
- fleet/verifiers/db.py +985 -1
- fleet/verifiers/decorator.py +1 -1
- fleet/verifiers/verifier.py +25 -19
- {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/METADATA +28 -1
- fleet_python-0.2.105.dist-info/RECORD +115 -0
- {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/WHEEL +1 -1
- fleet_python-0.2.105.dist-info/entry_points.txt +2 -0
- tests/test_app_method.py +85 -0
- tests/test_expect_exactly.py +4148 -0
- tests/test_expect_only.py +2593 -0
- tests/test_instance_dispatch.py +607 -0
- tests/test_sqlite_resource_dual_mode.py +263 -0
- tests/test_sqlite_shared_memory_behavior.py +117 -0
- fleet_python-0.2.66b2.dist-info/RECORD +0 -81
- tests/test_verifier_security.py +0 -427
- {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/licenses/LICENSE +0 -0
- {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,854 @@
"""Agent Orchestrator - Coordinates running agents on Fleet tasks.

Architecture:
1. Load tasks from Fleet API
2. For each task (parallel up to max_concurrent):
   a. Create Fleet environment (cloud)
   b. Start Docker container with CUA server (Playwright + browser)
   c. Run agent on HOST, connecting to container's MCP server
   d. Collect results and run verification
   e. Clean up

Usage:
    results, job_id = await run_agent(
        project_key="my-project",
        agent="gemini_cua",
        api_keys={"GEMINI_API_KEY": "xxx"},
    )
"""

import asyncio
import atexit
import json
import logging
import os
import signal
import sys
import time
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple

import fleet
from .utils import get_agent_path
from .types import AgentConfig, AgentResult, TaskResult

logger = logging.getLogger(__name__)

# Global tracking of running containers for cleanup on exit
_running_containers: Set[str] = set()
_cleanup_registered = False


def _cleanup_all_containers():
    """Kill all tracked running containers. Called on exit/signal."""
    import subprocess

    if not _running_containers:
        return

    containers = list(_running_containers)
    logger.debug(f"Cleaning up {len(containers)} container(s)...")

    for container_id in containers:
        try:
            # Use docker kill for immediate termination
            subprocess.run(
                ["docker", "kill", container_id],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                timeout=5,
            )
            _running_containers.discard(container_id)
        except Exception as e:
            logger.debug(f"Failed to kill container {container_id[:12]}: {e}")


def _register_cleanup():
    """Register cleanup handlers (only once)."""
    global _cleanup_registered
    if _cleanup_registered:
        return
    _cleanup_registered = True

    # Register atexit handler
    atexit.register(_cleanup_all_containers)

    # Register signal handlers for graceful shutdown
    def signal_handler(signum, frame):
        _cleanup_all_containers()
        # Re-raise to allow normal signal handling
        signal.default_int_handler(signum, frame)

    try:
        signal.signal(signal.SIGINT, signal_handler)
        signal.signal(signal.SIGTERM, signal_handler)
    except (ValueError, OSError):
        # Can't set signal handlers in some contexts (e.g., non-main thread)
        pass


def _cleanup_orphaned_containers(image_prefix: str = "fleet-cua-"):
    """Kill any orphaned containers from previous runs.

    This handles cases where a previous run was force-killed (SIGKILL)
    and containers were left running, which could cause port conflicts.
    """
    import subprocess

    try:
        # List running containers with their images, then filter by image
        # name prefix ourselves (docker's ancestor filter needs an exact
        # image name, not a prefix).
        result = subprocess.run(
            ["docker", "ps", "--format", "{{.ID}}\t{{.Image}}"],
            capture_output=True,
            text=True,
            timeout=10,
        )

        if result.returncode != 0 or not result.stdout.strip():
            return

        orphaned = []
        for line in result.stdout.strip().split("\n"):
            if not line:
                continue
            parts = line.split("\t")
            if len(parts) >= 2:
                container_id, image = parts[0], parts[1]
                if image.startswith(image_prefix):
                    orphaned.append(container_id)

        if orphaned:
            logger.info(
                f"Cleaning up {len(orphaned)} orphaned container(s) from previous run..."
            )
            for container_id in orphaned:
                try:
                    subprocess.run(
                        ["docker", "kill", container_id],
                        stdout=subprocess.DEVNULL,
                        stderr=subprocess.DEVNULL,
                        timeout=5,
                    )
                except Exception:
                    pass
    except Exception as e:
        logger.debug(f"Failed to check for orphaned containers: {e}")


class AgentOrchestrator:
    """Orchestrates running agents on Fleet tasks."""

    def __init__(self, config: AgentConfig):
        self.config = config
        self._port_counter = config.port_range_start
        self._vnc_port_counter = config.vnc_port_start
        self._port_lock = asyncio.Lock()
        self._docker_image: Optional[str] = None
        # Track available ports (recycled when tasks complete)
        self._available_ports: List[Tuple[int, int]] = []
        # Register global cleanup handlers
        _register_cleanup()
        # Stats tracking
        self._stats = {"started": 0, "completed": 0, "failed": 0, "errors": {}}

    def _track_error(self, category: str, message: str):
        """Track an error for summary statistics."""
        if category not in self._stats["errors"]:
            self._stats["errors"][category] = []
        # Keep up to 5 examples per category
        if len(self._stats["errors"][category]) < 5:
            self._stats["errors"][category].append(message[:200])

    def _print_stats(self):
        """Print summary statistics."""
        from rich.console import Console
        from rich.table import Table

        console = Console()

        total = self._stats["started"]
        completed = self._stats["completed"]
        failed = self._stats["failed"]

        console.print()
        console.print("[bold]Run Summary:[/bold]")
        console.print(f"  Started: {total}")
        console.print(
            f"  Completed: [green]{completed}[/green] ({100 * completed / total:.1f}%)"
            if total > 0
            else "  Completed: 0"
        )
        console.print(
            f"  Failed: [red]{failed}[/red] ({100 * failed / total:.1f}%)"
            if total > 0
            else "  Failed: 0"
        )

        if self._stats["errors"]:
            console.print()
            console.print("[bold]Error Breakdown:[/bold]")
            table = Table(show_header=True, header_style="bold")
            table.add_column("Category")
            table.add_column("Count")
            table.add_column("Example")

            for category, examples in sorted(
                self._stats["errors"].items(), key=lambda x: -len(x[1])
            ):
                table.add_row(
                    category,
                    str(len(examples)),
                    examples[0][:80] + "..." if len(examples[0]) > 80 else examples[0],
                )

            console.print(table)

    async def _get_next_ports(self) -> Tuple[int, int]:
        """Get next available MCP port and VNC port."""
        async with self._port_lock:
            # Reuse recycled ports first
            if self._available_ports:
                return self._available_ports.pop()
            # Otherwise allocate new ones
            port = self._port_counter
            vnc_port = self._vnc_port_counter
            self._port_counter += 1
            self._vnc_port_counter += 1
            return port, vnc_port

    async def _release_ports(self, port: int, vnc_port: int):
        """Return ports to the pool for reuse."""
        async with self._port_lock:
            self._available_ports.append((port, vnc_port))

    async def run(self) -> Tuple[List[TaskResult], str]:
        """Run agents on all tasks. Returns (results, job_id)."""
        from fleet._async import load_tasks
        from rich.console import Console
        from rich.live import Live
        from rich.panel import Panel
        from rich.spinner import Spinner

        console = Console()

        # Create job via Fleet API (name generated server-side)
        self._job_id = await fleet.job_async()
        console.print(Panel(
            f"[bold]Live agent traces[/bold]\n\n  https://www.fleetai.com/dashboard/jobs/{self._job_id}",
            border_style="cyan",
        ))
        console.print()

        # Create log directory: ~/.fleet/logs/{job_id}/
        self._log_dir = Path.home() / ".fleet" / "logs" / self._job_id
        self._log_dir.mkdir(parents=True, exist_ok=True)

        # Load tasks with spinner
        with Live(
            Spinner("dots", text=f"Loading tasks from {self.config.project_key}..."),
            console=console,
            transient=True,
        ):
            if self.config.task_keys:
                tasks = await load_tasks(keys=self.config.task_keys)
            elif self.config.project_key:
                tasks = await load_tasks(project_key=self.config.project_key)
            else:
                raise ValueError("Either project_key or task_keys required")

        console.print(f"Loaded {len(tasks)} tasks")

        # Build Docker image
        agent_path = get_agent_path(self.config.agent)
        await self._build_docker_image(agent_path)

        # Run tasks with concurrency limit and progress
        from rich.progress import (
            Progress,
            SpinnerColumn,
            TextColumn,
            BarColumn,
            TaskProgressColumn,
        )

        semaphore = asyncio.Semaphore(self.config.max_concurrent)
        results = [None] * len(tasks)
        completed_count = 0
        passed_count = 0
        total_count = len(tasks)

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            console=console,
        ) as progress:
            task_progress = progress.add_task(
                f"[cyan]Running ({completed_count}/{total_count}) | {passed_count} passed[/cyan]",
                total=len(tasks),
            )

            async def run_with_semaphore(idx, task):
                nonlocal completed_count, passed_count
                async with semaphore:
                    result = await self._run_task(task)
                    completed_count += 1
                    if result.verification_success:
                        passed_count += 1
                    progress.update(
                        task_progress,
                        advance=1,
                        description=f"[cyan]Running ({completed_count}/{total_count}) | {passed_count} passed[/cyan]",
                    )
                    return idx, result

            completed = await asyncio.gather(
                *[run_with_semaphore(i, t) for i, t in enumerate(tasks)],
                return_exceptions=True,
            )

        # Convert to ordered list
        for item in completed:
            if isinstance(item, Exception):
                # Shouldn't happen (per-task errors are caught in _run_task);
                # the missing slot is filled with an error result below.
                continue
            idx, result = item
            results[idx] = result

        # Fill any gaps with error results
        final = []
        for i, r in enumerate(results):
            if r is None:
                final.append(
                    TaskResult(
                        task_key=tasks[i].key,
                        task_prompt=tasks[i].prompt,
                        error="Task failed unexpectedly",
                    )
                )
            else:
                final.append(r)

        # Show logs location
        if hasattr(self, "_log_dir") and self._log_dir.exists():
            session_logs = list(self._log_dir.glob("*.jsonl"))
            console.print(f"Logs: {self._log_dir}/ ({len(session_logs)} sessions)")

        # Print summary statistics
        self._print_stats()

        return final, self._job_id

    async def _build_docker_image(self, agent_path: Path):
        """Build Docker image for CUA server."""
        from rich.console import Console
        from rich.live import Live
        from rich.spinner import Spinner

        console = Console()
        dockerfile = agent_path / "Dockerfile"
        if not dockerfile.exists():
            raise FileNotFoundError(f"Dockerfile not found in {agent_path}")

        image_name = f"fleet-cua-{agent_path.name}"

        # Clean up any orphaned containers from previous runs (prevents port conflicts)
        _cleanup_orphaned_containers(image_name)

        # Build context is the agent directory (all files are self-contained)
        with Live(
            Spinner("dots", text=f"Building Docker image {image_name}..."),
            console=console,
            transient=True,
        ):
            proc = await asyncio.create_subprocess_exec(
                "docker",
                "build",
                "-t",
                image_name,
                str(agent_path),  # Build context is agent directory
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
            stdout, stderr = await proc.communicate()

        if proc.returncode != 0:
            console.print("[red]✗[/red] Docker build failed")
            console.print(stderr.decode())
            raise RuntimeError(f"Docker build failed: {stderr.decode()}")

        self._docker_image = image_name
        console.print(f"Docker image ready: {image_name}")

    async def _run_task(self, task) -> TaskResult:
        """Run agent on a single task."""
        from fleet.env import make_async

        start = time.time()
        task_key = task.key
        task_prompt = task.prompt
        short_key = task_key[:20]

        self._stats["started"] += 1
        logger.debug(f"[{short_key}] Starting (total started: {self._stats['started']})")

        env = None
        container_id = None
        port = None
        vnc_port = None
        current_phase = "init"

        try:
            # 1. Create Fleet environment
            current_phase = "create_env"
            logger.debug(f"[{short_key}] Creating env...")
            env = await make_async(
                env_key=task.env_key,
                data_key=task.data_key,
                env_variables=task.env_variables,
                ttl_seconds=self.config.timeout_seconds + 300,
            )
            env_url = env.urls.root
            logger.debug(f"[{short_key}] Env: {env_url}")

            await asyncio.sleep(3)  # Wait for env to be ready

            # 2. Start Docker container with CUA server
            current_phase = "start_container"
            port, vnc_port = await self._get_next_ports()
            logger.debug(f"[{short_key}] Starting container on port {port}...")
            container_id = await self._start_container(
                port=port,
                vnc_port=vnc_port,
                env_url=env_url,
                task_prompt=task_prompt,
                task_key=task_key,
            )
            logger.debug(f"[{short_key}] Container: {container_id[:12]}")

            # Always show instance URL
            print(f"[{short_key}] Instance: {env_url}")
            if self.config.headful:
                print(f"[{short_key}] Browser: http://localhost:{vnc_port}/vnc.html")

            # Wait for server to be ready
            current_phase = "wait_for_server"
            logger.debug(f"[{short_key}] Waiting for CUA server...")
            await self._wait_for_server(port)
            logger.debug(f"[{short_key}] CUA server ready")

            # 3. Run agent
            current_phase = "run_agent"
            logger.debug(f"[{short_key}] Running agent...")
            agent_result = await self._run_agent(
                port=port,
                task_prompt=task_prompt,
                task_key=task_key,
                instance_id=env.instance_id,
            )
            logger.debug(
                f"[{short_key}] Agent done: completed={agent_result.completed}"
            )
            if agent_result.error and agent_result.error != "Max steps reached":
                print(f"[{short_key}] Agent error: {agent_result.error[:200]}")

            # 4. Run verification
            current_phase = "verification"
            verification_success = None
            verification_score = None
            verifier_execution_id = None

            if agent_result.completed and task.verifier:
                logger.info(f"[{short_key}] Running verification...")
                try:
                    v = await task.verify_detailed_async(
                        env=env,
                        final_answer=agent_result.final_answer,
                    )
                    verification_success = v.success
                    verifier_execution_id = v.execution_id
                    # Score is in v.result (the verifier function's return value)
                    verification_score = (
                        v.result if isinstance(v.result, (int, float)) else None
                    )
                    logger.info(f"[{short_key}] Verification: {verification_success}")
                    if verification_success:
                        self._stats["completed"] += 1
                    else:
                        self._stats["failed"] += 1
                        print(f"[{short_key}] Verification FAILED: score={verification_score}")
                except Exception as e:
                    logger.error(f"[{short_key}] Verification error: {e}")
                    self._stats["failed"] += 1
                    self._track_error("verification_error", str(e))
            elif not agent_result.completed:
                self._stats["failed"] += 1
                error_msg = agent_result.error or "unknown"
                self._track_error("agent_not_completed", error_msg)
                print(f"[{short_key}] Agent did not complete: {error_msg}")

            # 5. Complete/fail session (session was created by agent, we just complete it)
            session_id = getattr(agent_result, "session_id", None)
            if session_id:
                try:
                    # Create session object to complete it
                    session = fleet.session_async(session_id=session_id)
                    if verification_success:
                        await session.complete(
                            verifier_execution_id=verifier_execution_id
                        )
                    else:
                        await session.fail(verifier_execution_id=verifier_execution_id)
                    logger.info(
                        f"[{task_key}] Session: https://fleetai.com/dashboard/sessions/{session_id}"
                    )
                except Exception as e:
                    logger.error(f"[{task_key}] Session complete error: {e}")

            return TaskResult(
                task_key=task_key,
                task_prompt=task_prompt,
                agent_result=agent_result,
                verification_success=verification_success,
                verification_score=verification_score,
                execution_time_ms=int((time.time() - start) * 1000),
            )

        except Exception as e:
            import traceback

            error_type = type(e).__name__
            error_msg = str(e)
            tb = traceback.format_exc()

            # Categorize the error by phase and exception type
            error_category = f"{current_phase}:{error_type}"
            self._track_error(error_category, error_msg)
            self._stats["failed"] += 1

            # Always print errors for visibility
            print(f"[{short_key}] EXCEPTION in {current_phase}: {error_type}: {error_msg[:200]}")
            logger.error(f"[{short_key}] Traceback:\n{tb}")

            return TaskResult(
                task_key=task_key,
                task_prompt=task_prompt,
                error=f"[{current_phase}] {error_type}: {error_msg}",
                execution_time_ms=int((time.time() - start) * 1000),
            )

        finally:
            # Cleanup
            if container_id:
                await self._stop_container(container_id)
            if port is not None and vnc_port is not None:
                await self._release_ports(port, vnc_port)
            if env:
                try:
                    await env.close()
                except Exception:
                    pass

    async def _start_container(
        self,
        port: int,
        vnc_port: int,
        env_url: str,
        task_prompt: str,
        task_key: str,
    ) -> str:
        """Start Docker container with CUA server."""
        headless = "false" if self.config.headful else "true"

        cmd = [
            "docker",
            "run",
            "-d",
            "--rm",
            "-p",
            f"{port}:8765",
            "-e",
            f"FLEET_ENV_URL={env_url}",
            "-e",
            f"FLEET_TASK_PROMPT={task_prompt}",
            "-e",
            f"FLEET_TASK_KEY={task_key}",
            "-e",
            f"SCREEN_WIDTH={self.config.screen_width}",
            "-e",
            f"SCREEN_HEIGHT={self.config.screen_height}",
            "-e",
            f"HEADLESS={headless}",
        ]

        # Add noVNC port mapping if headful
        if self.config.headful:
            cmd.extend(["-p", f"{vnc_port}:6080"])

        cmd.append(self._docker_image)

        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        stdout, stderr = await proc.communicate()

        if proc.returncode != 0:
            stderr_str = stderr.decode()
            # Check for port conflict
            if (
                "port is already allocated" in stderr_str
                or "address already in use" in stderr_str.lower()
            ):
                raise RuntimeError(
                    f"Port conflict on {port} or {vnc_port}. Try again or check for orphaned containers with: docker ps"
                )
            raise RuntimeError(f"Container start failed: {stderr_str}")

        container_id = stdout.decode().strip()

        # Track container globally for cleanup on exit
        _running_containers.add(container_id)

        return container_id

    async def _stop_container(self, container_id: str):
        """Stop Docker container and capture logs."""
        # Get logs before stopping
        log_proc = await asyncio.create_subprocess_exec(
            "docker",
            "logs",
            "--tail",
            "50",
            container_id,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.STDOUT,
        )
        logs, _ = await log_proc.communicate()
        if logs:
            logger.debug(f"Container {container_id[:12]} logs:\n{logs.decode()}")

        proc = await asyncio.create_subprocess_exec(
            "docker",
            "stop",
            container_id,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.DEVNULL,
        )
        await proc.wait()

        # Remove from global tracking
        _running_containers.discard(container_id)

    async def _wait_for_server(self, port: int, timeout: int = 60):
        """Wait for CUA server to be ready."""
        import aiohttp

        url = f"http://localhost:{port}/health"
        start = time.time()

        while time.time() - start < timeout:
            try:
                async with aiohttp.ClientSession() as session:
                    async with session.get(
                        url, timeout=aiohttp.ClientTimeout(total=2)
                    ) as resp:
                        if resp.status == 200:
                            return
            except Exception:
                pass
            await asyncio.sleep(1)

        raise TimeoutError(f"CUA server not ready after {timeout}s")

    async def _run_agent(
        self,
        port: int,
        task_prompt: str,
        task_key: str,
        instance_id: Optional[str] = None,
    ) -> AgentResult:
        """Run agent process."""
        agent_path = get_agent_path(self.config.agent)
        agent_script = agent_path / "agent.py"

        # Set up environment
        env = os.environ.copy()

        # Session log file: ~/.fleet/logs/{job_id}/{task_key}.jsonl
        session_log_file = self._log_dir / f"{task_key}.jsonl"

        env.update(
            {
                "PYTHONUNBUFFERED": "1",  # Ensure real-time output
                "FLEET_MCP_URL": f"http://localhost:{port}",
                "FLEET_SESSION_LOG": str(
                    session_log_file
                ),  # Unified session log (MCP + HTTP)
                "FLEET_JOB_ID": self._job_id,
                "FLEET_TASK_PROMPT": task_prompt,
                "FLEET_TASK_KEY": task_key,
                "FLEET_INSTANCE_ID": instance_id or "",
                "FLEET_MODEL": self.config.model,
                "FLEET_MAX_STEPS": str(self.config.max_steps),
                "FLEET_SCREEN_WIDTH": str(self.config.screen_width),
                "FLEET_SCREEN_HEIGHT": str(self.config.screen_height),
                "FLEET_VERBOSE": "true" if self.config.verbose else "false",
            }
        )
        env.update(self.config.api_keys)

        proc = await asyncio.create_subprocess_exec(
            sys.executable,
            str(agent_script),
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            env=env,
        )

        short_key = task_key[:20]
        stdout_lines = []
        stderr_lines = []

        async def read_stdout():
            while True:
                line = await proc.stdout.readline()
                if not line:
                    break
                line_str = line.decode().rstrip()
                stdout_lines.append(line_str)
                # Show step updates in real-time
                if line_str.startswith("STEP:") or line_str.startswith("Step "):
                    print(f"[{short_key}] {line_str}")
                elif self.config.verbose:
                    logger.info(f"[{short_key}] {line_str}")

        async def read_stderr():
            while True:
                line = await proc.stderr.readline()
                if not line:
                    break
                line_str = line.decode().rstrip()
                stderr_lines.append(line_str)
                if self.config.verbose:
                    logger.warning(f"[{short_key}] stderr: {line_str}")

        try:
            await asyncio.wait_for(
                asyncio.gather(read_stdout(), read_stderr(), proc.wait()),
                timeout=self.config.timeout_seconds,
            )
        except asyncio.TimeoutError:
            proc.kill()
            await proc.wait()
            return AgentResult(
                task_key=task_key,
                completed=False,
                error="Agent timeout",
            )

        # Parse result from stdout/stderr
        stdout_str = "\n".join(stdout_lines)
        stderr_str = "\n".join(stderr_lines)

        # Show full output in verbose mode
        if self.config.verbose:
            logger.info(f"Agent stdout:\n{stdout_str}")
            if stderr_str:
                logger.info(f"Agent stderr:\n{stderr_str}")
        else:
            logger.debug(f"Agent stdout: {stdout_str[:500]}")
            if stderr_str:
                logger.debug(f"Agent stderr: {stderr_str[:500]}")

        # Always show stderr if agent crashed (non-zero exit or has stderr)
        if proc.returncode != 0 or stderr_str:
            if stderr_str:
                print(f"[{short_key}] Agent stderr: {stderr_str[:500]}")

        # The agent emits its result as a JSON line; the last parseable one wins
        result_json = None
        for line in stdout_str.split("\n"):
            line = line.strip()
            if line.startswith("{"):
                try:
                    result_json = json.loads(line)
                except json.JSONDecodeError:
                    continue

        if result_json:
            return AgentResult(
                task_key=result_json.get("task_key", task_key),
                final_answer=result_json.get("final_answer"),
                completed=result_json.get("completed", False),
                error=result_json.get("error"),
                steps_taken=result_json.get("steps_taken", 0),
                execution_time_ms=result_json.get("execution_time_ms", 0),
                transcript=result_json.get("transcript", []),
                session_id=result_json.get("session_id"),
            )

        # Include stderr in error message
        error_msg = f"Agent failed. stdout: {stdout_str[:300]}"
        if stderr_str:
            error_msg += f" | stderr: {stderr_str[:300]}"

        return AgentResult(
            task_key=task_key,
            completed=False,
            error=error_msg,
        )


async def run_agent(
    project_key: Optional[str] = None,
    task_keys: Optional[List[str]] = None,
    agent: str = "gemini_cua",
    model: str = "gemini-2.5-pro",
    max_concurrent: int = 4,
    max_steps: int = 200,
    timeout_seconds: int = 600,
    api_keys: Optional[Dict[str, str]] = None,
    headful: bool = False,
    verbose: bool = False,
) -> Tuple[List[TaskResult], str]:
    """Run agent on Fleet tasks.

    Args:
        project_key: Fleet project to run on
        task_keys: Specific tasks (alternative to project_key)
        agent: Agent implementation (default: gemini_cua)
        model: Model to use
        max_concurrent: Max parallel tasks
        max_steps: Max agent steps per task
        timeout_seconds: Timeout per task
        api_keys: API keys (e.g., {"GEMINI_API_KEY": "xxx"})
        headful: Show browser via noVNC
        verbose: Enable verbose agent logging

    Returns:
        Tuple of (List of TaskResult, job_id)
    """
    config = AgentConfig(
        project_key=project_key,
        task_keys=task_keys,
        agent=agent,
        headful=headful,
        verbose=verbose,
        model=model,
        max_concurrent=max_concurrent,
        max_steps=max_steps,
        timeout_seconds=timeout_seconds,
        api_keys=api_keys or {},
    )

    orchestrator = AgentOrchestrator(config)
    return await orchestrator.run()
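
For readers wiring this into a script, here is a minimal driver sketch built on the run_agent coroutine above. The fleet.agent.orchestrator import path follows this wheel's layout; the GEMINI_API_KEY lookup and the summary printing are illustrative assumptions, not part of the package.

import asyncio
import os

from fleet.agent.orchestrator import run_agent


async def main():
    # run_agent returns a (results, job_id) tuple, per its docstring above.
    results, job_id = await run_agent(
        project_key="my-project",
        agent="gemini_cua",
        api_keys={"GEMINI_API_KEY": os.environ["GEMINI_API_KEY"]},  # assumed env var
        max_concurrent=2,
    )
    # Job URL format taken from the Panel printed in AgentOrchestrator.run()
    print(f"Traces: https://www.fleetai.com/dashboard/jobs/{job_id}")
    for r in results:
        # verification_success is None for tasks that errored before verification;
        # treat anything but True as a failure here.
        status = "PASS" if r.verification_success else "FAIL"
        print(f"{status}  {r.task_key}  {r.error or ''}")


if __name__ == "__main__":
    asyncio.run(main())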