fleet-python 0.2.66b2__py3-none-any.whl → 0.2.105__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. examples/export_tasks.py +16 -5
  2. examples/export_tasks_filtered.py +245 -0
  3. examples/fetch_tasks.py +230 -0
  4. examples/import_tasks.py +140 -8
  5. examples/iterate_verifiers.py +725 -0
  6. fleet/__init__.py +128 -5
  7. fleet/_async/__init__.py +27 -3
  8. fleet/_async/base.py +24 -9
  9. fleet/_async/client.py +938 -41
  10. fleet/_async/env/client.py +60 -3
  11. fleet/_async/instance/client.py +52 -7
  12. fleet/_async/models.py +15 -0
  13. fleet/_async/resources/api.py +200 -0
  14. fleet/_async/resources/sqlite.py +1801 -46
  15. fleet/_async/tasks.py +122 -25
  16. fleet/_async/verifiers/bundler.py +22 -21
  17. fleet/_async/verifiers/verifier.py +25 -19
  18. fleet/agent/__init__.py +32 -0
  19. fleet/agent/gemini_cua/Dockerfile +45 -0
  20. fleet/agent/gemini_cua/__init__.py +10 -0
  21. fleet/agent/gemini_cua/agent.py +759 -0
  22. fleet/agent/gemini_cua/mcp/main.py +108 -0
  23. fleet/agent/gemini_cua/mcp_server/__init__.py +5 -0
  24. fleet/agent/gemini_cua/mcp_server/main.py +105 -0
  25. fleet/agent/gemini_cua/mcp_server/tools.py +178 -0
  26. fleet/agent/gemini_cua/requirements.txt +5 -0
  27. fleet/agent/gemini_cua/start.sh +30 -0
  28. fleet/agent/orchestrator.py +854 -0
  29. fleet/agent/types.py +49 -0
  30. fleet/agent/utils.py +34 -0
  31. fleet/base.py +34 -9
  32. fleet/cli.py +1061 -0
  33. fleet/client.py +1060 -48
  34. fleet/config.py +1 -1
  35. fleet/env/__init__.py +16 -0
  36. fleet/env/client.py +60 -3
  37. fleet/eval/__init__.py +15 -0
  38. fleet/eval/uploader.py +231 -0
  39. fleet/exceptions.py +8 -0
  40. fleet/instance/client.py +53 -8
  41. fleet/instance/models.py +1 -0
  42. fleet/models.py +303 -0
  43. fleet/proxy/__init__.py +25 -0
  44. fleet/proxy/proxy.py +453 -0
  45. fleet/proxy/whitelist.py +244 -0
  46. fleet/resources/api.py +200 -0
  47. fleet/resources/sqlite.py +1845 -46
  48. fleet/tasks.py +113 -20
  49. fleet/utils/__init__.py +7 -0
  50. fleet/utils/http_logging.py +178 -0
  51. fleet/utils/logging.py +13 -0
  52. fleet/utils/playwright.py +440 -0
  53. fleet/verifiers/bundler.py +22 -21
  54. fleet/verifiers/db.py +985 -1
  55. fleet/verifiers/decorator.py +1 -1
  56. fleet/verifiers/verifier.py +25 -19
  57. {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/METADATA +28 -1
  58. fleet_python-0.2.105.dist-info/RECORD +115 -0
  59. {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/WHEEL +1 -1
  60. fleet_python-0.2.105.dist-info/entry_points.txt +2 -0
  61. tests/test_app_method.py +85 -0
  62. tests/test_expect_exactly.py +4148 -0
  63. tests/test_expect_only.py +2593 -0
  64. tests/test_instance_dispatch.py +607 -0
  65. tests/test_sqlite_resource_dual_mode.py +263 -0
  66. tests/test_sqlite_shared_memory_behavior.py +117 -0
  67. fleet_python-0.2.66b2.dist-info/RECORD +0 -81
  68. tests/test_verifier_security.py +0 -427
  69. {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/licenses/LICENSE +0 -0
  70. {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/top_level.txt +0 -0
fleet/agent/orchestrator.py
@@ -0,0 +1,854 @@
+"""Agent Orchestrator - Coordinates running agents on Fleet tasks.
+
+Architecture:
+1. Load tasks from Fleet API
+2. For each task (parallel up to max_concurrent):
+   a. Create Fleet environment (cloud)
+   b. Start Docker container with CUA server (Playwright + browser)
+   c. Run agent on HOST, connecting to container's MCP server
+   d. Collect results and run verification
+   e. Clean up
+
+Usage:
+    results, job_id = await run_agent(
+        project_key="my-project",
+        agent="gemini_cua",
+        api_keys={"GEMINI_API_KEY": "xxx"},
+    )
+"""
+
+import asyncio
+import atexit
+import json
+import logging
+import os
+import signal
+import sys
+import time
+from pathlib import Path
+from typing import Dict, List, Optional, Set, Tuple
+
+import fleet
+from .utils import get_agent_path
+from .types import AgentConfig, AgentResult, TaskResult
+
+logger = logging.getLogger(__name__)
+
+# Global tracking of running containers for cleanup on exit
+_running_containers: Set[str] = set()
+_cleanup_registered = False
+
+
+def _cleanup_all_containers():
+    """Kill all tracked running containers. Called on exit/signal."""
+    import subprocess
+
+    if not _running_containers:
+        return
+
+    containers = list(_running_containers)
+    logger.debug(f"Cleaning up {len(containers)} container(s)...")
+
+    for container_id in containers:
+        try:
+            # Use docker kill for immediate termination
+            subprocess.run(
+                ["docker", "kill", container_id],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+                timeout=5,
+            )
+            _running_containers.discard(container_id)
+        except Exception as e:
+            logger.debug(f"Failed to kill container {container_id[:12]}: {e}")
+
+
+def _register_cleanup():
+    """Register cleanup handlers (only once)."""
+    global _cleanup_registered
+    if _cleanup_registered:
+        return
+    _cleanup_registered = True
+
+    # Register atexit handler
+    atexit.register(_cleanup_all_containers)
+
+    # Register signal handlers for graceful shutdown
+    def signal_handler(signum, frame):
+        _cleanup_all_containers()
+        # Re-raise to allow normal signal handling
+        signal.default_int_handler(signum, frame)
+
+    try:
+        signal.signal(signal.SIGINT, signal_handler)
+        signal.signal(signal.SIGTERM, signal_handler)
+    except (ValueError, OSError):
+        # Can't set signal handlers in some contexts (e.g., non-main thread)
+        pass
+
+
+def _cleanup_orphaned_containers(image_prefix: str = "fleet-cua-"):
+    """Kill any orphaned containers from previous runs.
+
+    This handles cases where a previous run was force-killed (SIGKILL)
+    and containers were left running, which could cause port conflicts.
+    """
+    import subprocess
+
+    try:
+        # List all running containers (ID + image) and filter by image name
+        # prefix; `docker ps --filter ancestor=...` only matches exact image
+        # names, so it cannot match a prefix like "fleet-cua-".
+        result = subprocess.run(
+            ["docker", "ps", "--format", "{{.ID}}\t{{.Image}}"],
+            capture_output=True,
+            text=True,
+            timeout=10,
+        )
+
+        if result.returncode != 0 or not result.stdout.strip():
+            return
+
+        orphaned = []
+        for line in result.stdout.strip().split("\n"):
+            if not line:
+                continue
+            parts = line.split("\t")
+            if len(parts) >= 2:
+                container_id, image = parts[0], parts[1]
+                if image.startswith(image_prefix):
+                    orphaned.append(container_id)
+
+        if orphaned:
+            logger.info(
+                f"Cleaning up {len(orphaned)} orphaned container(s) from previous run..."
+            )
+            for container_id in orphaned:
+                try:
+                    subprocess.run(
+                        ["docker", "kill", container_id],
+                        stdout=subprocess.DEVNULL,
+                        stderr=subprocess.DEVNULL,
+                        timeout=5,
+                    )
+                except Exception:
+                    pass
+    except Exception as e:
+        logger.debug(f"Failed to check for orphaned containers: {e}")
+
+
+class AgentOrchestrator:
+    """Orchestrates running agents on Fleet tasks."""
+
+    def __init__(self, config: AgentConfig):
+        self.config = config
+        self._port_counter = config.port_range_start
+        self._vnc_port_counter = config.vnc_port_start
+        self._port_lock = asyncio.Lock()
+        self._docker_image: Optional[str] = None
+        # Track available ports (recycled when tasks complete)
+        self._available_ports: List[Tuple[int, int]] = []
+        # Register global cleanup handlers
+        _register_cleanup()
+        # Stats tracking
+        self._stats = {"started": 0, "completed": 0, "failed": 0, "errors": {}}
+
+    def _track_error(self, category: str, message: str):
+        """Track an error for summary statistics."""
+        if category not in self._stats["errors"]:
+            self._stats["errors"][category] = []
+        # Keep up to 5 examples per category
+        if len(self._stats["errors"][category]) < 5:
+            self._stats["errors"][category].append(message[:200])
+
+    def _print_stats(self):
+        """Print summary statistics."""
+        from rich.console import Console
+        from rich.table import Table
+
+        console = Console()
+
+        total = self._stats["started"]
+        completed = self._stats["completed"]
+        failed = self._stats["failed"]
+
+        console.print()
+        console.print("[bold]Run Summary:[/bold]")
+        console.print(f"  Started: {total}")
+        console.print(f"  Completed: [green]{completed}[/green] ({100*completed/total:.1f}%)" if total > 0 else "  Completed: 0")
+        console.print(f"  Failed: [red]{failed}[/red] ({100*failed/total:.1f}%)" if total > 0 else "  Failed: 0")
+
+        if self._stats["errors"]:
+            console.print()
+            console.print("[bold]Error Breakdown:[/bold]")
+            table = Table(show_header=True, header_style="bold")
+            table.add_column("Category")
+            table.add_column("Count")
+            table.add_column("Example")
+
+            for category, examples in sorted(self._stats["errors"].items(), key=lambda x: -len(x[1])):
+                table.add_row(
+                    category,
+                    str(len(examples)),
+                    examples[0][:80] + "..." if len(examples[0]) > 80 else examples[0],
+                )
+
+            console.print(table)
+
+    async def _get_next_ports(self) -> Tuple[int, int]:
+        """Get next available MCP port and VNC port."""
+        async with self._port_lock:
+            # Reuse recycled ports first
+            if self._available_ports:
+                return self._available_ports.pop()
+            # Otherwise allocate new ones
+            port = self._port_counter
+            vnc_port = self._vnc_port_counter
+            self._port_counter += 1
+            self._vnc_port_counter += 1
+            return port, vnc_port
+
+    async def _release_ports(self, port: int, vnc_port: int):
+        """Return ports to the pool for reuse."""
+        async with self._port_lock:
+            self._available_ports.append((port, vnc_port))
+
+    async def run(self) -> Tuple[List[TaskResult], str]:
+        """Run agents on all tasks and return (results, job_id)."""
+        from fleet._async import load_tasks
+        from rich.console import Console
+        from rich.live import Live
+        from rich.panel import Panel
+        from rich.spinner import Spinner
+
+        console = Console()
+
+        # Create job via Fleet API (name generated server-side)
+        self._job_id = await fleet.job_async()
+        console.print(Panel(
+            f"[bold]Live agent traces[/bold]\n\n  https://www.fleetai.com/dashboard/jobs/{self._job_id}",
+            border_style="cyan",
+        ))
+        console.print()
+
+        # Create log directory: ~/.fleet/logs/{job_id}/
+        self._log_dir = Path.home() / ".fleet" / "logs" / self._job_id
+        self._log_dir.mkdir(parents=True, exist_ok=True)
+
+        # Load tasks with spinner
+        with Live(
+            Spinner("dots", text=f"Loading tasks from {self.config.project_key}..."),
+            console=console,
+            transient=True,
+        ):
+            if self.config.task_keys:
+                tasks = await load_tasks(keys=self.config.task_keys)
+            elif self.config.project_key:
+                tasks = await load_tasks(project_key=self.config.project_key)
+            else:
+                raise ValueError("Either project_key or task_keys required")
+
+        console.print(f"Loaded {len(tasks)} tasks")
+
+        # Build Docker image
+        agent_path = get_agent_path(self.config.agent)
+        await self._build_docker_image(agent_path)
+
+        # Run tasks with concurrency limit and progress
+        from rich.progress import (
+            Progress,
+            SpinnerColumn,
+            TextColumn,
+            BarColumn,
+            TaskProgressColumn,
+        )
+
+        semaphore = asyncio.Semaphore(self.config.max_concurrent)
+        results = [None] * len(tasks)
+        completed_count = 0
+        passed_count = 0
+        total_count = len(tasks)
+
+        with Progress(
+            SpinnerColumn(),
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            console=console,
+        ) as progress:
+            task_progress = progress.add_task(
+                f"[cyan]Running ({completed_count}/{total_count}) | {passed_count} passed[/cyan]",
+                total=len(tasks),
+            )
+
+            async def run_with_semaphore(idx, task):
+                nonlocal completed_count, passed_count
+                async with semaphore:
+                    result = await self._run_task(task)
+                    completed_count += 1
+                    if result.verification_success:
+                        passed_count += 1
+                    progress.update(
+                        task_progress,
+                        advance=1,
+                        description=f"[cyan]Running ({completed_count}/{total_count}) | {passed_count} passed[/cyan]",
+                    )
+                    return idx, result
+
+            completed = await asyncio.gather(
+                *[run_with_semaphore(i, t) for i, t in enumerate(tasks)],
+                return_exceptions=True,
+            )
+
+        # Convert to ordered list
+        for item in completed:
+            if isinstance(item, Exception):
+                # Find which task this was - shouldn't happen but handle it
+                continue
+            idx, result = item
+            results[idx] = result
+
+        # Fill any gaps with error results
+        final = []
+        for i, r in enumerate(results):
+            if r is None:
+                final.append(
+                    TaskResult(
+                        task_key=tasks[i].key,
+                        task_prompt=tasks[i].prompt,
+                        error="Task failed unexpectedly",
+                    )
+                )
+            else:
+                final.append(r)
+
+        # Show logs location
+        if hasattr(self, "_log_dir") and self._log_dir.exists():
+            session_logs = list(self._log_dir.glob("*.jsonl"))
+            console.print(f"Logs: {self._log_dir}/ ({len(session_logs)} sessions)")
+
+        # Print summary statistics
+        self._print_stats()
+
+        return final, self._job_id
+
+    async def _build_docker_image(self, agent_path: Path):
+        """Build Docker image for CUA server."""
+        from rich.console import Console
+        from rich.live import Live
+        from rich.spinner import Spinner
+
+        console = Console()
+        dockerfile = agent_path / "Dockerfile"
+        if not dockerfile.exists():
+            raise FileNotFoundError(f"Dockerfile not found in {agent_path}")
+
+        image_name = f"fleet-cua-{agent_path.name}"
+
+        # Clean up any orphaned containers from previous runs (prevents port conflicts)
+        _cleanup_orphaned_containers(image_name)
+
+        # Build context is the agent directory (all files are self-contained)
+        with Live(
+            Spinner("dots", text=f"Building Docker image {image_name}..."),
+            console=console,
+            transient=True,
+        ):
+            proc = await asyncio.create_subprocess_exec(
+                "docker",
+                "build",
+                "-t",
+                image_name,
+                str(agent_path),  # Build context is agent directory
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+            )
+            stdout, stderr = await proc.communicate()
+
+        if proc.returncode != 0:
+            console.print("[red]✗[/red] Docker build failed")
+            console.print(stderr.decode())
+            raise RuntimeError(f"Docker build failed: {stderr.decode()}")
+
+        self._docker_image = image_name
+        console.print(f"Docker image ready: {image_name}")
+
+    async def _run_task(self, task) -> TaskResult:
+        """Run agent on a single task."""
+        from fleet.env import make_async
+
+        start = time.time()
+        task_key = task.key
+        task_prompt = task.prompt
+        short_key = task_key[:20]
+
+        self._stats["started"] += 1
+        logger.debug(f"[{short_key}] Starting (total started: {self._stats['started']})")
+
+        env = None
+        container_id = None
+        port = None
+        vnc_port = None
+        current_phase = "init"
+
+        try:
+            # 1. Create Fleet environment
+            current_phase = "create_env"
+            logger.debug(f"[{short_key}] Creating env...")
+            env = await make_async(
+                env_key=task.env_key,
+                data_key=task.data_key,
+                env_variables=task.env_variables,
+                ttl_seconds=self.config.timeout_seconds + 300,
+            )
+            env_url = env.urls.root
+            logger.debug(f"[{short_key}] Env: {env_url}")
+
+            await asyncio.sleep(3)  # Wait for env to be ready
+
+            # 2. Start Docker container with CUA server
+            current_phase = "start_container"
+            port, vnc_port = await self._get_next_ports()
+            logger.debug(f"[{short_key}] Starting container on port {port}...")
+            container_id = await self._start_container(
+                port=port,
+                vnc_port=vnc_port,
+                env_url=env_url,
+                task_prompt=task_prompt,
+                task_key=task_key,
+            )
+            logger.debug(f"[{short_key}] Container: {container_id[:12]}")
+
+            # Always show instance URL
+            print(f"[{short_key}] Instance: {env_url}")
+            if self.config.headful:
+                print(f"[{short_key}] Browser: http://localhost:{vnc_port}/vnc.html")
+
+            # Wait for server to be ready
+            current_phase = "wait_for_server"
+            logger.debug(f"[{short_key}] Waiting for CUA server...")
+            await self._wait_for_server(port)
+            logger.debug(f"[{short_key}] CUA server ready")
+
+            # 3. Run agent
+            current_phase = "run_agent"
+            logger.debug(f"[{short_key}] Running agent...")
+            agent_result = await self._run_agent(
+                port=port,
+                task_prompt=task_prompt,
+                task_key=task_key,
+                instance_id=env.instance_id,
+            )
+            logger.debug(
+                f"[{short_key}] Agent done: completed={agent_result.completed}"
+            )
+            if agent_result.error and agent_result.error != "Max steps reached":
+                print(f"[{short_key}] Agent error: {agent_result.error[:200]}")
+
+            # 4. Run verification
+            current_phase = "verification"
+            verification_success = None
+            verification_score = None
+            verifier_execution_id = None
+
+            if agent_result.completed and task.verifier:
+                logger.info(f"[{short_key}] Running verification...")
+                try:
+                    v = await task.verify_detailed_async(
+                        env=env,
+                        final_answer=agent_result.final_answer,
+                    )
+                    verification_success = v.success
+                    verifier_execution_id = v.execution_id
+                    # Score is in v.result (the verifier function's return value)
+                    verification_score = (
+                        v.result if isinstance(v.result, (int, float)) else None
+                    )
+                    logger.info(f"[{short_key}] Verification: {verification_success}")
+                    if verification_success:
+                        self._stats["completed"] += 1
+                    else:
+                        self._stats["failed"] += 1
+                        print(f"[{short_key}] Verification FAILED: score={verification_score}")
+                except Exception as e:
+                    logger.error(f"[{short_key}] Verification error: {e}")
+                    self._stats["failed"] += 1
+                    self._track_error("verification_error", str(e))
+            elif not agent_result.completed:
+                self._stats["failed"] += 1
+                error_msg = agent_result.error or "unknown"
+                self._track_error("agent_not_completed", error_msg)
+                print(f"[{short_key}] Agent did not complete: {error_msg}")
+
+            # 5. Complete/fail session (session was created by agent, we just complete it)
+            session_id = getattr(agent_result, "session_id", None)
+            if session_id:
+                try:
+                    # Create session object to complete it
+                    session = fleet.session_async(session_id=session_id)
+                    if verification_success:
+                        await session.complete(
+                            verifier_execution_id=verifier_execution_id
+                        )
+                    else:
+                        await session.fail(verifier_execution_id=verifier_execution_id)
+                    logger.info(
+                        f"[{task_key}] Session: https://fleetai.com/dashboard/sessions/{session_id}"
+                    )
+                except Exception as e:
+                    logger.error(f"[{task_key}] Session complete error: {e}")
+
+            return TaskResult(
+                task_key=task_key,
+                task_prompt=task_prompt,
+                agent_result=agent_result,
+                verification_success=verification_success,
+                verification_score=verification_score,
+                execution_time_ms=int((time.time() - start) * 1000),
+            )
+
+        except Exception as e:
+            import traceback
+
+            error_type = type(e).__name__
+            error_msg = str(e)
+            tb = traceback.format_exc()
+
+            # Categorize the error
+            error_category = f"{current_phase}:{error_type}"
+            self._track_error(error_category, error_msg)
+            self._stats["failed"] += 1
+
+            # Always print errors for visibility
+            print(f"[{short_key}] EXCEPTION in {current_phase}: {error_type}: {error_msg[:200]}")
+            logger.error(f"[{short_key}] Traceback:\n{tb}")
+
+            return TaskResult(
+                task_key=task_key,
+                task_prompt=task_prompt,
+                error=f"[{current_phase}] {error_type}: {error_msg}",
+                execution_time_ms=int((time.time() - start) * 1000),
+            )
+
+        finally:
+            # Cleanup
+            if container_id:
+                await self._stop_container(container_id)
+            if port and vnc_port:
+                await self._release_ports(port, vnc_port)
+            if env:
+                try:
+                    await env.close()
+                except Exception:
+                    pass
+
+    async def _start_container(
+        self,
+        port: int,
+        vnc_port: int,
+        env_url: str,
+        task_prompt: str,
+        task_key: str,
+    ) -> str:
+        """Start Docker container with CUA server."""
+        headless = "false" if self.config.headful else "true"
+
+        cmd = [
+            "docker",
+            "run",
+            "-d",
+            "--rm",
+            "-p",
+            f"{port}:8765",
+            "-e",
+            f"FLEET_ENV_URL={env_url}",
+            "-e",
+            f"FLEET_TASK_PROMPT={task_prompt}",
+            "-e",
+            f"FLEET_TASK_KEY={task_key}",
+            "-e",
+            f"SCREEN_WIDTH={self.config.screen_width}",
+            "-e",
+            f"SCREEN_HEIGHT={self.config.screen_height}",
+            "-e",
+            f"HEADLESS={headless}",
+        ]
+
+        # Add noVNC port mapping if headful
+        if self.config.headful:
+            cmd.extend(["-p", f"{vnc_port}:6080"])
+
+        cmd.append(self._docker_image)
+
+        proc = await asyncio.create_subprocess_exec(
+            *cmd,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+        stdout, stderr = await proc.communicate()
+
+        if proc.returncode != 0:
+            stderr_str = stderr.decode()
+            # Check for port conflict
+            if (
+                "port is already allocated" in stderr_str
+                or "address already in use" in stderr_str.lower()
+            ):
+                raise RuntimeError(
+                    f"Port conflict on {port} or {vnc_port}. Try again or check for orphaned containers with: docker ps"
+                )
+            raise RuntimeError(f"Container start failed: {stderr_str}")
+
+        container_id = stdout.decode().strip()
+
+        # Track container globally for cleanup on exit
+        _running_containers.add(container_id)
+
+        return container_id
+
+    async def _stop_container(self, container_id: str):
+        """Stop Docker container and capture logs."""
+        # Get logs before stopping
+        log_proc = await asyncio.create_subprocess_exec(
+            "docker",
+            "logs",
+            "--tail",
+            "50",
+            container_id,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.STDOUT,
+        )
+        logs, _ = await log_proc.communicate()
+        if logs:
+            logger.debug(f"Container {container_id[:12]} logs:\n{logs.decode()}")
+
+        proc = await asyncio.create_subprocess_exec(
+            "docker",
+            "stop",
+            container_id,
+            stdout=asyncio.subprocess.DEVNULL,
+            stderr=asyncio.subprocess.DEVNULL,
+        )
+        await proc.wait()
+
+        # Remove from global tracking
+        _running_containers.discard(container_id)
+
+    async def _wait_for_server(self, port: int, timeout: int = 60):
+        """Wait for CUA server to be ready."""
+        import aiohttp
+
+        url = f"http://localhost:{port}/health"
+        start = time.time()
+
+        while time.time() - start < timeout:
+            try:
+                async with aiohttp.ClientSession() as session:
+                    async with session.get(url, timeout=2) as resp:
+                        if resp.status == 200:
+                            return
+            except Exception:
+                pass
+            await asyncio.sleep(1)
+
+        raise TimeoutError(f"CUA server not ready after {timeout}s")
+
+    async def _run_agent(
+        self,
+        port: int,
+        task_prompt: str,
+        task_key: str,
+        instance_id: Optional[str] = None,
+    ) -> AgentResult:
+        """Run agent process."""
+        agent_path = get_agent_path(self.config.agent)
+        agent_script = agent_path / "agent.py"
+
+        # Set up environment
+        env = os.environ.copy()
+
+        # Session log file: ~/.fleet/logs/{job_id}/{task_key}.jsonl
+        session_log_file = self._log_dir / f"{task_key}.jsonl"
+
+        env.update(
+            {
+                "PYTHONUNBUFFERED": "1",  # Ensure real-time output
+                "FLEET_MCP_URL": f"http://localhost:{port}",
+                "FLEET_SESSION_LOG": str(session_log_file),  # Unified session log (MCP + HTTP)
+                "FLEET_JOB_ID": self._job_id,
+                "FLEET_TASK_PROMPT": task_prompt,
+                "FLEET_TASK_KEY": task_key,
+                "FLEET_INSTANCE_ID": instance_id or "",
+                "FLEET_MODEL": self.config.model,
+                "FLEET_MAX_STEPS": str(self.config.max_steps),
+                "FLEET_SCREEN_WIDTH": str(self.config.screen_width),
+                "FLEET_SCREEN_HEIGHT": str(self.config.screen_height),
+                "FLEET_VERBOSE": "true" if self.config.verbose else "false",
+            }
+        )
+        env.update(self.config.api_keys)
+
+        proc = await asyncio.create_subprocess_exec(
+            sys.executable,
+            str(agent_script),
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+            env=env,
+        )
+
+        short_key = task_key[:20]
+        stdout_lines = []
+        stderr_lines = []
+
+        async def read_stdout():
+            while True:
+                line = await proc.stdout.readline()
+                if not line:
+                    break
+                line_str = line.decode().rstrip()
+                stdout_lines.append(line_str)
+                # Show step updates in real-time
+                if line_str.startswith("STEP:") or line_str.startswith("Step "):
+                    print(f"[{short_key}] {line_str}")
+                elif self.config.verbose:
+                    logger.info(f"[{short_key}] {line_str}")
+
+        async def read_stderr():
+            while True:
+                line = await proc.stderr.readline()
+                if not line:
+                    break
+                line_str = line.decode().rstrip()
+                stderr_lines.append(line_str)
+                if self.config.verbose:
+                    logger.warning(f"[{short_key}] stderr: {line_str}")
+
+        try:
+            await asyncio.wait_for(
+                asyncio.gather(read_stdout(), read_stderr(), proc.wait()),
+                timeout=self.config.timeout_seconds,
+            )
+        except asyncio.TimeoutError:
+            proc.kill()
+            await proc.wait()
+            return AgentResult(
+                task_key=task_key,
+                completed=False,
+                error="Agent timeout",
+            )
+
+        # Parse result from stdout/stderr
+        stdout_str = "\n".join(stdout_lines)
+        stderr_str = "\n".join(stderr_lines)
+
+        # Show full output in verbose mode
+        if self.config.verbose:
+            logger.info(f"Agent stdout:\n{stdout_str}")
+            if stderr_str:
+                logger.info(f"Agent stderr:\n{stderr_str}")
+        else:
+            logger.debug(f"Agent stdout: {stdout_str[:500]}")
+            if stderr_str:
+                logger.debug(f"Agent stderr: {stderr_str[:500]}")
+
+        # Always show stderr if agent crashed (non-zero exit or has stderr)
+        if proc.returncode != 0 or stderr_str:
+            if stderr_str:
+                print(f"[{short_key}] Agent stderr: {stderr_str[:500]}")
+
+        result_json = None
+        for line in stdout_str.split("\n"):
+            line = line.strip()
+            if line.startswith("{"):
+                try:
+                    result_json = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+
+        if result_json:
+            return AgentResult(
+                task_key=result_json.get("task_key", task_key),
+                final_answer=result_json.get("final_answer"),
+                completed=result_json.get("completed", False),
+                error=result_json.get("error"),
+                steps_taken=result_json.get("steps_taken", 0),
+                execution_time_ms=result_json.get("execution_time_ms", 0),
+                transcript=result_json.get("transcript", []),
+                session_id=result_json.get("session_id"),
+            )
+
+        # Include stderr in error message
+        error_msg = f"Agent failed. stdout: {stdout_str[:300]}"
+        if stderr_str:
+            error_msg += f" | stderr: {stderr_str[:300]}"
+
+        return AgentResult(
+            task_key=task_key,
+            completed=False,
+            error=error_msg,
+        )
+
+
+async def run_agent(
+    project_key: Optional[str] = None,
+    task_keys: Optional[List[str]] = None,
+    agent: str = "gemini_cua",
+    model: str = "gemini-2.5-pro",
+    max_concurrent: int = 4,
+    max_steps: int = 200,
+    timeout_seconds: int = 600,
+    api_keys: Optional[Dict[str, str]] = None,
+    headful: bool = False,
+    verbose: bool = False,
+) -> Tuple[List[TaskResult], str]:
+    """Run agent on Fleet tasks.
+
+    Args:
+        project_key: Fleet project to run on
+        task_keys: Specific tasks (alternative to project_key)
+        agent: Agent implementation (default: gemini_cua)
+        model: Model to use
+        max_concurrent: Max parallel tasks
+        max_steps: Max agent steps per task
+        timeout_seconds: Timeout per task
+        api_keys: API keys (e.g., {"GEMINI_API_KEY": "xxx"})
+        headful: Show browser via noVNC
+        verbose: Enable verbose agent logging
+
+    Returns:
+        Tuple of (List of TaskResult, job_id)
+    """
+    config = AgentConfig(
+        project_key=project_key,
+        task_keys=task_keys,
+        agent=agent,
+        headful=headful,
+        verbose=verbose,
+        model=model,
+        max_concurrent=max_concurrent,
+        max_steps=max_steps,
+        timeout_seconds=timeout_seconds,
+        api_keys=api_keys or {},
+    )
+
+    orchestrator = AgentOrchestrator(config)
+    return await orchestrator.run()
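
For reference, a minimal driver for the new orchestrator might look like the sketch below. It is based only on the run_agent signature and TaskResult fields shown in the diff above; the import path assumes the module location fleet/agent/orchestrator.py from the file list, and "my-project" and the GEMINI_API_KEY lookup are placeholders.

import asyncio
import os

from fleet.agent.orchestrator import run_agent


async def main() -> None:
    # "my-project" is a placeholder project key; GEMINI_API_KEY must be set
    # in the environment before running.
    results, job_id = await run_agent(
        project_key="my-project",
        agent="gemini_cua",
        api_keys={"GEMINI_API_KEY": os.environ["GEMINI_API_KEY"]},
        max_concurrent=2,
    )
    # run_agent returns (List[TaskResult], job_id); each TaskResult carries
    # verification_success, task_key, and error, per the diff above.
    passed = sum(1 for r in results if r.verification_success)
    print(f"Job {job_id}: {passed}/{len(results)} tasks passed")
    for r in results:
        if r.error:
            print(f"  {r.task_key}: {r.error}")


if __name__ == "__main__":
    asyncio.run(main())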