fleet-python 0.2.66b2__py3-none-any.whl → 0.2.105__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. examples/export_tasks.py +16 -5
  2. examples/export_tasks_filtered.py +245 -0
  3. examples/fetch_tasks.py +230 -0
  4. examples/import_tasks.py +140 -8
  5. examples/iterate_verifiers.py +725 -0
  6. fleet/__init__.py +128 -5
  7. fleet/_async/__init__.py +27 -3
  8. fleet/_async/base.py +24 -9
  9. fleet/_async/client.py +938 -41
  10. fleet/_async/env/client.py +60 -3
  11. fleet/_async/instance/client.py +52 -7
  12. fleet/_async/models.py +15 -0
  13. fleet/_async/resources/api.py +200 -0
  14. fleet/_async/resources/sqlite.py +1801 -46
  15. fleet/_async/tasks.py +122 -25
  16. fleet/_async/verifiers/bundler.py +22 -21
  17. fleet/_async/verifiers/verifier.py +25 -19
  18. fleet/agent/__init__.py +32 -0
  19. fleet/agent/gemini_cua/Dockerfile +45 -0
  20. fleet/agent/gemini_cua/__init__.py +10 -0
  21. fleet/agent/gemini_cua/agent.py +759 -0
  22. fleet/agent/gemini_cua/mcp/main.py +108 -0
  23. fleet/agent/gemini_cua/mcp_server/__init__.py +5 -0
  24. fleet/agent/gemini_cua/mcp_server/main.py +105 -0
  25. fleet/agent/gemini_cua/mcp_server/tools.py +178 -0
  26. fleet/agent/gemini_cua/requirements.txt +5 -0
  27. fleet/agent/gemini_cua/start.sh +30 -0
  28. fleet/agent/orchestrator.py +854 -0
  29. fleet/agent/types.py +49 -0
  30. fleet/agent/utils.py +34 -0
  31. fleet/base.py +34 -9
  32. fleet/cli.py +1061 -0
  33. fleet/client.py +1060 -48
  34. fleet/config.py +1 -1
  35. fleet/env/__init__.py +16 -0
  36. fleet/env/client.py +60 -3
  37. fleet/eval/__init__.py +15 -0
  38. fleet/eval/uploader.py +231 -0
  39. fleet/exceptions.py +8 -0
  40. fleet/instance/client.py +53 -8
  41. fleet/instance/models.py +1 -0
  42. fleet/models.py +303 -0
  43. fleet/proxy/__init__.py +25 -0
  44. fleet/proxy/proxy.py +453 -0
  45. fleet/proxy/whitelist.py +244 -0
  46. fleet/resources/api.py +200 -0
  47. fleet/resources/sqlite.py +1845 -46
  48. fleet/tasks.py +113 -20
  49. fleet/utils/__init__.py +7 -0
  50. fleet/utils/http_logging.py +178 -0
  51. fleet/utils/logging.py +13 -0
  52. fleet/utils/playwright.py +440 -0
  53. fleet/verifiers/bundler.py +22 -21
  54. fleet/verifiers/db.py +985 -1
  55. fleet/verifiers/decorator.py +1 -1
  56. fleet/verifiers/verifier.py +25 -19
  57. {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/METADATA +28 -1
  58. fleet_python-0.2.105.dist-info/RECORD +115 -0
  59. {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/WHEEL +1 -1
  60. fleet_python-0.2.105.dist-info/entry_points.txt +2 -0
  61. tests/test_app_method.py +85 -0
  62. tests/test_expect_exactly.py +4148 -0
  63. tests/test_expect_only.py +2593 -0
  64. tests/test_instance_dispatch.py +607 -0
  65. tests/test_sqlite_resource_dual_mode.py +263 -0
  66. tests/test_sqlite_shared_memory_behavior.py +117 -0
  67. fleet_python-0.2.66b2.dist-info/RECORD +0 -81
  68. tests/test_verifier_security.py +0 -427
  69. {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/licenses/LICENSE +0 -0
  70. {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/top_level.txt +0 -0
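
The headline additions in this release are a `flt` command-line interface (the new fleet/cli.py plus the added entry_points.txt), a local agent runner under fleet/agent/, a request proxy under fleet/proxy/, and a large expansion of the SQLite verifier resources. The CLI shown below is a thin wrapper over the SDK client; as rough orientation, the job-creation path it exercises looks like the sketch that follows. This is inferred from the cli.py code in this diff, not from documentation; the import path, argument names, and response fields are taken from that code and may differ in other versions.

    # Sketch of the SDK calls wrapped by the new `flt jobs create` / `flt eval run`
    # commands. Inferred from fleet/cli.py below; not an official usage example.
    import os

    from fleet.client import Fleet  # cli.py does `from .client import Fleet`

    client = Fleet(
        api_key=os.environ["FLEET_API_KEY"],  # the CLI also requires this variable
        base_url=os.getenv("FLEET_BASE_URL", "https://us-west-1.fleetai.com"),
    )

    result = client.create_job(
        models=["google/gemini-2.5-pro"],   # repeatable --model flag in the CLI
        project_key="my-project",           # exactly one of env_key / project_key / task_keys
        pass_k=1,
        max_duration_minutes=60,
        max_concurrent_per_model=30,
    )
    print(result.job_id, result.status)     # fields the CLI prints after creation
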
fleet/cli.py ADDED
@@ -0,0 +1,1061 @@
+ """Fleet CLI - Command line interface for Fleet SDK."""
+
+ import json
+ import os
+ import signal
+ import sys
+ import threading
+ import time
+ from typing import List, Optional
+
+ # Load .env file if present (before other imports that might need env vars)
+ try:
+     from dotenv import load_dotenv
+     load_dotenv()
+ except ImportError:
+     pass # python-dotenv not installed, skip
+
+ try:
+     import typer
+     from rich.console import Console
+     from rich.live import Live
+     from rich.panel import Panel
+     from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TaskProgressColumn, TimeRemainingColumn
+     from rich.table import Table
+ except ImportError:
+     print(
+         "Error: CLI dependencies not installed.\n"
+         "Install with: pip install 'fleet-python[cli]'",
+         file=sys.stderr,
+     )
+     sys.exit(1)
+
+ from .client import Fleet
+ from .models import JobCreateRequest
+
+
+ app = typer.Typer(
+     name="flt",
+     help="Fleet CLI - Interact with Fleet jobs and sessions",
+     no_args_is_help=True,
+ )
+ jobs_app = typer.Typer(help="Manage jobs", no_args_is_help=True)
+ sessions_app = typer.Typer(help="Manage sessions", no_args_is_help=True)
+ eval_app = typer.Typer(help="Run evaluations", no_args_is_help=True)
+ projects_app = typer.Typer(help="Manage projects", no_args_is_help=True)
+
+ app.add_typer(jobs_app, name="jobs")
+ app.add_typer(sessions_app, name="sessions")
+ app.add_typer(eval_app, name="eval")
+ app.add_typer(projects_app, name="projects")
+
+ console = Console()
+
+
+ CLI_DEFAULT_BASE_URL = "https://us-west-1.fleetai.com"
+
+
+ def colorize_score(score: float) -> str:
+     """Color a score from red (0.0) to yellow (0.5) to green (1.0)."""
+     if score >= 0.7:
+         return f"[green]{score:.2f}[/green]"
+     elif score >= 0.4:
+         return f"[yellow]{score:.2f}[/yellow]"
+     else:
+         return f"[red]{score:.2f}[/red]"
+
+
+ def format_status(status: Optional[str]) -> str:
+     """Format job status with color and clean text."""
+     if not status:
+         return "[dim]-[/dim]"
+
+     status_map = {
+         "completed": "[green]✓ completed[/green]",
+         "in_progress": "[yellow]● running[/yellow]",
+         "pending": "[dim]○ pending[/dim]",
+         "load_tasks": "[blue]↻ loading[/blue]",
+         "failed": "[red]✗ failed[/red]",
+         "cancelled": "[dim]✗ cancelled[/dim]",
+     }
+     return status_map.get(status, f"[dim]{status}[/dim]")
+
+
+ def get_client() -> Fleet:
+     """Get a Fleet client using environment variables."""
+     api_key = os.getenv("FLEET_API_KEY")
+     if not api_key:
+         console.print(
+             "[red]Error:[/red] FLEET_API_KEY environment variable not set",
+             style="bold",
+         )
+         raise typer.Exit(1)
+     base_url = os.getenv("FLEET_BASE_URL", CLI_DEFAULT_BASE_URL)
+     return Fleet(api_key=api_key, base_url=base_url)
+
+
+ def _run_oversight(job_id: str, model: str = "anthropic/claude-sonnet-4"):
+     """Run oversight summarization on a completed job."""
+     import httpx
+
+     api_key = os.getenv("FLEET_API_KEY")
+     if not api_key:
+         console.print("[yellow]Warning:[/yellow] FLEET_API_KEY not set, skipping oversight")
+         return
+
+     base_url = os.getenv("FLEET_BASE_URL", CLI_DEFAULT_BASE_URL)
+     oversight_url = f"{base_url}/v1/summarize/job"
+
+     console.print()
+     console.print("[bold]Running Oversight Analysis...[/bold]")
+
+     try:
+         with httpx.Client(timeout=300) as client:
+             response = client.post(
+                 oversight_url,
+                 headers={
+                     "accept": "application/json",
+                     "Authorization": f"Bearer {api_key}",
+                     "Content-Type": "application/json",
+                 },
+                 json={
+                     "job_id": job_id,
+                     "model": model,
+                     "max_context_tokens": 180000,
+                     "force_new_summary": False,
+                     "max_concurrent": 20,
+                 },
+             )
+
+             if response.status_code == 200:
+                 result = response.json()
+                 console.print(f"[green]✓[/green] Oversight analysis started")
+                 if "summary_id" in result:
+                     console.print(f" Summary ID: [cyan]{result['summary_id']}[/cyan]")
+                 # Show link to dashboard
+                 console.print(f" View: [cyan]https://fleetai.com/dashboard/jobs/{job_id}[/cyan]")
+             else:
+                 console.print(f"[yellow]Warning:[/yellow] Oversight API returned {response.status_code}")
+                 console.print(f" {response.text[:200]}")
+     except Exception as e:
+         console.print(f"[yellow]Warning:[/yellow] Oversight request failed: {e}")
+
+
+ # Jobs commands
+
+
+ @jobs_app.command("list")
+ def list_jobs(
+     team_id: Optional[str] = typer.Option(None, "--team-id", help="Filter by team ID (admin only)"),
+     output_json: bool = typer.Option(False, "--json", help="Output as JSON"),
+ ):
+     """List all jobs."""
+     client = get_client()
+     jobs = client.list_jobs(team_id=team_id)
+
+     if output_json:
+         console.print(json.dumps([j.model_dump() for j in jobs], indent=2, default=str))
+         return
+
+     if not jobs:
+         console.print("No jobs found.")
+         return
+
+     table = Table(title="Jobs")
+     table.add_column("ID", style="cyan")
+     table.add_column("Name", style="green")
+     table.add_column("Status", style="yellow")
+     table.add_column("Created At", style="dim")
+
+     for job in jobs:
+         table.add_row(
+             job.id,
+             job.name or "-",
+             format_status(job.status),
+             job.created_at or "-",
+         )
+
+     console.print(table)
+
+     # Show tips with a real job ID from the results
+     first_job_id = jobs[0].id
+     console.print()
+     console.print("[dim]Tips:[/dim]")
+     console.print(f"[dim] Job details: flt jobs get {first_job_id}[/dim]")
+     console.print(f"[dim] Job sessions: flt jobs sessions {first_job_id}[/dim]")
+     console.print(f"[dim] Session transcript: flt sessions transcript <session-id>[/dim]")
+
+
+ @jobs_app.command("create")
+ def create_job(
+     model: List[str] = typer.Option(..., "--model", "-m", help="Model in 'provider/model' format (repeatable)"),
+     env_key: Optional[str] = typer.Option(None, "--env-key", "-e", help="Environment key"),
+     project_key: Optional[str] = typer.Option(None, "--project-key", "-p", help="Project key"),
+     task_keys: Optional[List[str]] = typer.Option(None, "--task-key", "-t", help="Task key (repeatable)"),
+     name: Optional[str] = typer.Option(None, "--name", "-n", help="Job name. Supports placeholders: {id} (UUID), {sid} (short UUID), {i} (auto-increment, must be suffix)"),
+     pass_k: int = typer.Option(1, "--pass-k", help="Number of passes"),
+     max_steps: Optional[int] = typer.Option(None, "--max-steps", help="Maximum agent steps"),
+     max_duration: int = typer.Option(60, "--max-duration", help="Timeout in minutes"),
+     max_concurrent: int = typer.Option(30, "--max-concurrent", help="Max concurrent per model"),
+     mode: Optional[str] = typer.Option(None, "--mode", help="Mode: 'tool-use' or 'computer-use'"),
+     system_prompt: Optional[str] = typer.Option(None, "--system-prompt", help="Custom system prompt"),
+     model_prompt: Optional[List[str]] = typer.Option(None, "--model-prompt", help="Per-model prompt in 'provider/model=prompt' format (repeatable)"),
+     byok: Optional[List[str]] = typer.Option(None, "--byok", help="Bring Your Own Key in 'provider=key' format (repeatable)"),
+     byok_ttl: Optional[int] = typer.Option(None, "--byok-ttl", help="TTL for BYOK keys in minutes"),
+     harness: Optional[str] = typer.Option(None, "--harness", help="Harness identifier"),
+     output_json: bool = typer.Option(False, "--json", help="Output as JSON"),
+ ):
+     """Create a new job.
+
+     Requires --model (repeatable) and exactly one of --env-key, --project-key, or --task-key.
+     """
+     # Validate mutual exclusivity
+     sources = [env_key, project_key, task_keys]
+     specified = sum(1 for s in sources if s)
+     if specified != 1:
+         console.print(
+             "[red]Error:[/red] Exactly one of --env-key, --project-key, or --task-key must be specified",
+             style="bold",
+         )
+         raise typer.Exit(1)
+
+     # Parse model prompts
+     model_prompts = None
+     if model_prompt:
+         model_prompts = {}
+         for mp in model_prompt:
+             if "=" not in mp:
+                 console.print(
+                     f"[red]Error:[/red] Invalid --model-prompt format: {mp}. Expected 'provider/model=prompt'",
+                     style="bold",
+                 )
+                 raise typer.Exit(1)
+             key, value = mp.split("=", 1)
+             model_prompts[key] = value
+
+     # Parse BYOK keys
+     byok_keys = None
+     if byok:
+         byok_keys = {}
+         for b in byok:
+             if "=" not in b:
+                 console.print(
+                     f"[red]Error:[/red] Invalid --byok format: {b}. Expected 'provider=key'",
+                     style="bold",
+                 )
+                 raise typer.Exit(1)
+             provider, key = b.split("=", 1)
+             byok_keys[provider] = key
+
+     client = get_client()
+
+     try:
+         result = client.create_job(
+             models=model,
+             name=name,
+             pass_k=pass_k,
+             env_key=env_key,
+             project_key=project_key,
+             task_keys=task_keys,
+             max_steps=max_steps,
+             max_duration_minutes=max_duration,
+             max_concurrent_per_model=max_concurrent,
+             mode=mode,
+             system_prompt=system_prompt,
+             model_prompts=model_prompts,
+             byok_keys=byok_keys,
+             byok_ttl_minutes=byok_ttl,
+             harness=harness,
+         )
+     except Exception as e:
+         console.print(f"[red]Error creating job:[/red] {e}")
+         raise typer.Exit(1)
+
+     if output_json:
+         console.print(json.dumps(result.model_dump(), indent=2, default=str))
+         return
+
+     console.print(f"[green]Job created successfully![/green]")
+     console.print(f" Job ID: [cyan]{result.job_id}[/cyan]")
+     if result.workflow_job_id:
+         console.print(f" Workflow ID: {result.workflow_job_id}")
+     console.print(f" Status: {format_status(result.status)}")
+     if result.name:
+         console.print(f" Name: {result.name}")
+
+
+ @jobs_app.command("get")
+ def get_job(
+     job_id: str = typer.Argument(..., help="Job ID"),
+     team_id: Optional[str] = typer.Option(None, "--team-id", help="Team ID (admin only)"),
+     output_json: bool = typer.Option(False, "--json", help="Output as JSON"),
+ ):
+     """Get details for a specific job."""
+     client = get_client()
+     job = client.get_job(job_id, team_id=team_id)
+
+     if output_json:
+         console.print(json.dumps(job.model_dump(), indent=2, default=str))
+         return
+
+     console.print(f"[bold]Job Details[/bold]")
+     console.print(f" ID: [cyan]{job.id}[/cyan]")
+     console.print(f" Name: {job.name or '-'}")
+     console.print(f" Status: {format_status(job.status)}")
+     console.print(f" Created At: {job.created_at or '-'}")
+
+     # Show tips
+     console.print()
+     console.print("[dim]Tips:[/dim]")
+     console.print(f"[dim] Job sessions: flt jobs sessions {job.id}[/dim]")
+     console.print(f"[dim] Session transcript: flt sessions transcript <session-id>[/dim]")
+
+
+ @jobs_app.command("sessions")
+ def list_job_sessions(
+     job_id: str = typer.Argument(..., help="Job ID"),
+     output_json: bool = typer.Option(False, "--json", help="Output as JSON"),
+ ):
+     """List all sessions for a job, grouped by task."""
+     client = get_client()
+     result = client.list_job_sessions(job_id)
+
+     if output_json:
+         console.print(json.dumps(result.model_dump(), indent=2, default=str))
+         return
+
+     console.print(f"[bold]Sessions for Job:[/bold] [cyan]{result.job_id}[/cyan]")
+     console.print(f"Total Sessions: {result.total_sessions}\n")
+
+     first_session_id = None
+     for task_group in result.tasks:
+         task_name = task_group.task.key if task_group.task else task_group.task_id or "Unknown"
+         pass_rate_pct = task_group.pass_rate * 100
+
+         console.print(f"[bold green]Task:[/bold green] {task_name}")
+         console.print(f" Pass Rate: {task_group.passed_sessions}/{task_group.total_sessions} ({pass_rate_pct:.1f}%)")
+         if task_group.average_score is not None:
+             console.print(f" Average Score: {task_group.average_score:.2f}")
+
+         table = Table(show_header=True)
+         table.add_column("Session ID", style="cyan")
+         table.add_column("Model", style="blue")
+         table.add_column("Status", style="yellow")
+         table.add_column("Steps")
+         table.add_column("Result")
+
+         for session in task_group.sessions:
+             if first_session_id is None:
+                 first_session_id = session.session_id
+             result_str = "-"
+             if session.verifier_execution:
+                 if session.verifier_execution.success:
+                     result_str = "[green]PASS[/green]"
+                     if session.verifier_execution.score is not None:
+                         score_colored = colorize_score(session.verifier_execution.score)
+                         result_str += f" ({score_colored})"
+                 else:
+                     result_str = "[red]FAIL[/red]"
+
+             table.add_row(
+                 session.session_id,
+                 session.model,
+                 format_status(session.status),
+                 str(session.step_count),
+                 result_str,
+             )
+
+         console.print(table)
+         console.print()
+
+     # Show tips with a real session ID
+     if first_session_id:
+         console.print("[dim]Tips:[/dim]")
+         console.print(f"[dim] Session transcript: flt sessions transcript {first_session_id}[/dim]")
+
+
+ @jobs_app.command("oversight")
+ def run_job_oversight(
+     job_id: str = typer.Argument(..., help="Job ID to analyze"),
+     model: str = typer.Option("anthropic/claude-sonnet-4", "--model", "-m", help="Model for oversight analysis"),
+ ):
+     """Run AI oversight analysis on a job."""
+     _run_oversight(job_id, model)
+
+
+ # Sessions commands
+
+
+ @sessions_app.command("transcript")
+ def get_session_transcript(
+     session_id: str = typer.Argument(..., help="Session ID"),
+     output_json: bool = typer.Option(False, "--json", help="Output as JSON"),
+ ):
+     """Get the transcript for a session."""
+     client = get_client()
+     result = client.get_session_transcript(session_id)
+
+     if output_json:
+         console.print(json.dumps(result.model_dump(), indent=2, default=str))
+         return
+
+     # Header
+     console.print(f"[bold]Session Transcript[/bold]")
+     if result.instance:
+         console.print(f" Status: {format_status(result.instance.status)}")
+     console.print()
+
+     # Task info
+     if result.task:
+         console.print(f"[bold]Task:[/bold] {result.task.key}")
+         console.print(f" Environment: {result.task.env_id}")
+         if result.task.version:
+             console.print(f" Version: {result.task.version}")
+         console.print()
+         console.print(f"[bold]Prompt:[/bold]")
+         console.print(f" {result.task.prompt}")
+         console.print()
+
+     # Verifier result
+     if result.verifier_execution:
+         status = "[green]PASS[/green]" if result.verifier_execution.success else "[red]FAIL[/red]"
+         console.print(f"[bold]Verifier Result:[/bold] {status}")
+         if result.verifier_execution.score is not None:
+             score_colored = colorize_score(result.verifier_execution.score)
+             console.print(f" Score: {score_colored}")
+         console.print(f" Execution Time: {result.verifier_execution.execution_time_ms}ms")
+         console.print()
+
+     # Transcript
+     console.print(f"[bold]Transcript:[/bold] ({len(result.transcript)} messages)")
+     console.print("-" * 60)
+
+     for msg in result.transcript:
+         role_colors = {
+             "user": "green",
+             "assistant": "blue",
+             "tool": "yellow",
+             "system": "magenta",
+         }
+         color = role_colors.get(msg.role, "white")
+         console.print(f"[bold {color}]{msg.role.upper()}:[/bold {color}]")
+
+         # Handle content
+         if isinstance(msg.content, str):
+             # Truncate long content
+             content = msg.content
+             if len(content) > 500:
+                 content = content[:500] + "..."
+             console.print(f" {content}")
+         elif isinstance(msg.content, list):
+             # Multimodal content
+             for item in msg.content:
+                 if isinstance(item, dict):
+                     if item.get("type") == "text":
+                         text = item.get("text", "")
+                         if len(text) > 500:
+                             text = text[:500] + "..."
+                         console.print(f" {text}")
+                     elif item.get("type") == "image_url":
+                         console.print(f" [dim][Image][/dim]")
+                     elif item.get("type") == "tool_use":
+                         console.print(f" [dim]Tool: {item.get('name', 'unknown')}[/dim]")
+                     elif item.get("type") == "tool_result":
+                         console.print(f" [dim]Tool Result[/dim]")
+                 else:
+                     console.print(f" {item}")
+         else:
+             console.print(f" {msg.content}")
+
+         # Tool calls
+         if msg.tool_calls:
+             for tc in msg.tool_calls:
+                 if isinstance(tc, dict):
+                     name = tc.get("function", {}).get("name", tc.get("name", "unknown"))
+                     console.print(f" [dim]-> Tool call: {name}[/dim]")
+
+         console.print()
+
+
+ # Projects commands
+
+
+ @projects_app.command("list")
+ def list_projects(
+     output_json: bool = typer.Option(False, "--json", help="Output as JSON"),
+ ):
+     """List all active projects."""
+     client = get_client()
+
+     # Call the projects endpoint directly since there's no SDK method yet
+     response = client.client.request("GET", "/v1/tasks/projects")
+     data = response.json()
+
+     if output_json:
+         console.print(json.dumps(data, indent=2, default=str))
+         return
+
+     projects = data.get("projects", [])
+
+     if not projects:
+         console.print("No projects found.")
+         return
+
+     table = Table(title="Projects")
+     table.add_column("Project Key", style="cyan", no_wrap=True)
+     table.add_column("Modality", style="blue")
+     table.add_column("Created At", style="dim")
+
+     for project in projects:
+         modality = project.get("task_modality") or "-"
+         # Clean up modality display
+         if modality == "tool_use":
+             modality = "tool-use"
+         elif modality == "computer_use":
+             modality = "computer-use"
+
+         table.add_row(
+             project.get("project_key", "-"),
+             modality,
+             project.get("created_at", "-"),
+         )
+
+     console.print(table)
+
+     # Show tips
+     if projects:
+         first_project = projects[0].get("project_key", "my-project")
+         console.print()
+         console.print("[dim]Tips:[/dim]")
+         console.print(f"[dim] Run eval: flt eval run -p {first_project} -m openai/gpt-4o-mini[/dim]")
+
+
+ # Eval commands
+
+
+ def _run_local_agent(
+     project_key: Optional[str],
+     task_keys: Optional[List[str]],
+     model: str,
+     agent: str,
+     max_steps: int,
+     max_duration: int,
+     max_concurrent: int,
+     byok: Optional[List[str]],
+     output_json: bool,
+     verbose: bool = False,
+     headful: bool = False,
+     oversight: bool = False,
+     oversight_model: str = "anthropic/claude-sonnet-4",
+ ):
+     """Run agent locally with Docker-based browser control."""
+     import asyncio
+     import logging
+
+     if verbose:
+         logging.basicConfig(level=logging.DEBUG, format='%(name)s: %(message)s')
+
+     # Parse API keys
+     api_keys = {}
+     if os.getenv("GEMINI_API_KEY"):
+         api_keys["GEMINI_API_KEY"] = os.getenv("GEMINI_API_KEY")
+     if os.getenv("OPENAI_API_KEY"):
+         api_keys["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
+     if os.getenv("ANTHROPIC_API_KEY"):
+         api_keys["ANTHROPIC_API_KEY"] = os.getenv("ANTHROPIC_API_KEY")
+
+     # Parse BYOK and add to api_keys
+     if byok:
+         provider_to_env = {"google": "GEMINI_API_KEY", "openai": "OPENAI_API_KEY", "anthropic": "ANTHROPIC_API_KEY"}
+         for b in byok:
+             if "=" not in b:
+                 console.print(f"[red]Error:[/red] Invalid --byok format: {b}")
+                 raise typer.Exit(1)
+             provider, key = b.split("=", 1)
+             api_keys[provider_to_env.get(provider.lower(), f"{provider.upper()}_API_KEY")] = key
+
+     # Check for required API key based on agent
+     if "gemini" in agent.lower() and "GEMINI_API_KEY" not in api_keys:
+         console.print("[red]Error:[/red] GEMINI_API_KEY required for gemini_cua agent")
+         console.print()
+         console.print("Set it via environment:")
+         console.print(" [cyan]export GEMINI_API_KEY=your-key[/cyan]")
+         console.print()
+         console.print("Or pass via --byok:")
+         console.print(" [cyan]flt eval run ... --byok google=your-key[/cyan]")
+         raise typer.Exit(1)
+
+     if verbose:
+         console.print(f"[dim]API keys configured: {list(api_keys.keys())}[/dim]")
+
+     # Display config (matching remote format)
+     suite_name = project_key if project_key else (', '.join(task_keys) if task_keys else "all tasks")
+     console.print()
+     console.print("[green bold]Eval started[/green bold] [dim](local)[/dim]")
+     console.print()
+     console.print(f" [bold]Suite[/bold] {suite_name}")
+     console.print(f" [bold]Models[/bold] {model}")
+     console.print(f" [bold]Agent[/bold] {agent}")
+     console.print(f" [bold]Max Steps[/bold] {max_steps}")
+     console.print(f" [bold]Concurrent[/bold] {max_concurrent}")
+     if headful:
+         console.print(f" [bold]Headful[/bold] [green]Yes[/green] (browser visible via noVNC)")
+     console.print()
+
+     async def run():
+         from fleet.agent import run_agent
+         return await run_agent(
+             project_key=project_key,
+             task_keys=task_keys,
+             agent=agent,
+             model=model,
+             max_concurrent=max_concurrent,
+             max_steps=max_steps,
+             timeout_seconds=max_duration * 60,
+             api_keys=api_keys,
+             headful=headful,
+             verbose=verbose,
+         )
+
+     console.print("[dim]Starting...[/dim]")
+     console.print()
+
+     job_id = None
+     try:
+         results, job_id = asyncio.run(run())
+     except KeyboardInterrupt:
+         console.print()
+         console.print("[yellow]Cancelled.[/yellow]")
+         raise typer.Exit(1)
+     except Exception as e:
+         console.print(f"[red]Error:[/red] {e}")
+         raise typer.Exit(1)
+
+     # Display results
+     if output_json:
+         output = []
+         for r in results:
+             output.append({
+                 "task_key": r.task_key,
+                 "task_prompt": r.task_prompt,
+                 "completed": r.agent_result.completed if r.agent_result else False,
+                 "final_answer": r.agent_result.final_answer if r.agent_result else None,
+                 "verification_success": r.verification_success,
+                 "verification_score": r.verification_score,
+                 "error": r.error or (r.agent_result.error if r.agent_result else None),
+                 "steps_taken": r.agent_result.steps_taken if r.agent_result else 0,
+                 "execution_time_ms": r.execution_time_ms,
+             })
+         console.print(json.dumps(output, indent=2))
+         return
+
+     # Show dashboard link panel (matching remote format)
+     console.print()
+     if job_id:
+         console.print(Panel(
+             f"[bold]Live agent traces[/bold]\n\n https://www.fleetai.com/dashboard/jobs/{job_id}",
+             border_style="cyan",
+         ))
+         console.print()
+         console.print("[dim]Tips:[/dim]")
+         console.print(f"[dim] Job details: flt jobs get {job_id}[/dim]")
+         console.print(f"[dim] Job sessions: flt jobs sessions {job_id}[/dim]")
+         console.print(f"[dim] Session transcript: flt sessions transcript <session-id>[/dim]")
+
+     # Summary
+     console.print()
+     console.print("[bold]Results[/bold]")
+     console.print("-" * 60)
+
+     errors = 0
+     scores = []
+     completed = 0
+
+     for r in results:
+         if r.error:
+             status = "[red]ERROR[/red]"
+             errors += 1
+         elif r.verification_score is not None:
+             scores.append(r.verification_score)
+             completed += 1
+             # Color based on score
+             if r.verification_score >= 0.7:
+                 status = f"[green]{r.verification_score:.2f}[/green]"
+             elif r.verification_score >= 0.4:
+                 status = f"[yellow]{r.verification_score:.2f}[/yellow]"
+             else:
+                 status = f"[red]{r.verification_score:.2f}[/red]"
+         elif r.verification_success is True:
+             status = "[green]PASS[/green]"
+             completed += 1
+         elif r.verification_success is False:
+             status = "[red]FAIL[/red]"
+             completed += 1
+         elif r.agent_result and r.agent_result.completed:
+             status = "[yellow]DONE[/yellow]"
+             completed += 1
+         else:
+             status = "[red]INCOMPLETE[/red]"
+
+         key = r.task_key[:40] + "..." if len(r.task_key) > 40 else r.task_key
+         console.print(f" {status} {key}")
+
+         if r.error:
+             # Show first 100 chars of error
+             err = r.error.replace('\n', ' ')[:100]
+             console.print(f" [dim]{err}[/dim]")
+
+     console.print("-" * 60)
+
+     total = len(results)
+     if total > 0:
+         console.print(f"[bold]Completed:[/bold] {completed}/{total}")
+     if scores:
+         avg_score = sum(scores) / len(scores)
+         score_color = "green" if avg_score >= 0.7 else "yellow" if avg_score >= 0.4 else "red"
+         console.print(f"[bold]Avg. Score:[/bold] [{score_color}]{avg_score:.2f}[/{score_color}]")
+     if errors:
+         console.print(f"[bold]Errors:[/bold] [red]{errors}[/red]")
+
+     # Run oversight if requested
+     if oversight and job_id:
+         _run_oversight(job_id, oversight_model)
+
+
+ def _listen_for_detach_key(stop_event: threading.Event):
+     """Listen for Ctrl+B in a background thread to signal detachment."""
+     try:
+         # Platform-specific keyboard input handling
+         if sys.platform == 'win32':
+             import msvcrt
+             while not stop_event.is_set():
+                 if msvcrt.kbhit():
+                     ch = msvcrt.getch()
+                     if ch == b'\x02': # Ctrl+B
+                         stop_event.set()
+                         break
+                 time.sleep(0.1)
+         else:
+             # Unix-like systems
+             import select
+             import tty
+             import termios
+
+             old_settings = termios.tcgetattr(sys.stdin)
+             try:
+                 tty.setcbreak(sys.stdin.fileno())
+                 while not stop_event.is_set():
+                     # Check if input is available with timeout
+                     if select.select([sys.stdin], [], [], 0.1)[0]:
+                         ch = sys.stdin.read(1)
+                         if ch == '\x02': # Ctrl+B
+                             stop_event.set()
+                             break
+             finally:
+                 termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
+     except Exception:
+         # If we can't set up keyboard listening, just exit gracefully
+         pass
+
+
+ @eval_app.command("run")
+ def eval_run(
+     project_key: Optional[str] = typer.Option(None, "--project", "-p", help="Project key to evaluate"),
+     task_keys: Optional[List[str]] = typer.Option(None, "--task", "-t", help="Specific task key(s) to run (repeatable)"),
+     model: List[str] = typer.Option(..., "--model", "-m", help="Model (e.g., google/gemini-2.5-pro)"),
+     name: Optional[str] = typer.Option(None, "--name", "-n", help="Job name"),
+     pass_k: int = typer.Option(1, "--pass-k", "-k", help="Number of passes per task"),
+     max_steps: Optional[int] = typer.Option(None, "--max-steps", help="Maximum agent steps"),
+     max_duration: int = typer.Option(60, "--max-duration", help="Timeout in minutes"),
+     max_concurrent: int = typer.Option(30, "--max-concurrent", help="Max concurrent per model"),
+     byok: Optional[List[str]] = typer.Option(None, "--byok", help="Bring Your Own Key: 'provider=key'"),
+     no_watch: bool = typer.Option(False, "--no-watch", help="Don't watch progress"),
+     output_json: bool = typer.Option(False, "--json", help="Output as JSON"),
+     # Local execution
+     local: Optional[str] = typer.Option(None, "--local", "-l", help="Run locally. Use 'gemini_cua' for built-in or path for custom agent"),
+     headful: bool = typer.Option(False, "--headful", help="Show browser via noVNC (local mode)"),
+     verbose: bool = typer.Option(False, "--verbose", "-v", help="Show debug output"),
+     # Oversight
+     oversight: bool = typer.Option(False, "--oversight", help="Run AI oversight analysis on job completion"),
+     oversight_model: str = typer.Option("anthropic/claude-sonnet-4", "--oversight-model", help="Model for oversight analysis"),
+ ):
+     """
+     Run an evaluation on a project or specific tasks.
+
+     \b
+     Examples:
+         # Cloud execution (default)
+         flt eval run -p my-project -m google/gemini-2.5-pro
+
+         # Run specific task(s)
+         flt eval run -t task_abc123 -m google/gemini-2.5-pro --local gemini_cua
+
+         # Local with built-in agent
+         flt eval run -p my-project -m google/gemini-2.5-pro --local gemini_cua
+
+         # Local with headful mode (watch the browser)
+         flt eval run -p my-project -m google/gemini-2.5-pro --local gemini_cua --headful
+
+         # Local with custom agent
+         flt eval run -p my-project -m google/gemini-2.5-pro --local ./my-agent
+     """
+     # Validate: need either project or task keys
+     if not project_key and not task_keys:
+         console.print("[red]Error:[/red] Either --project (-p) or --task (-t) must be specified")
+         raise typer.Exit(1)
+
+     # Local mode
+     if local is not None:
+         _run_local_agent(
+             project_key=project_key,
+             task_keys=task_keys,
+             model=model[0] if model else "gemini-2.5-pro",
+             agent=local if local else "gemini_cua",
+             max_steps=max_steps or 200,
+             max_duration=max_duration,
+             max_concurrent=max_concurrent,
+             byok=byok,
+             output_json=output_json,
+             verbose=verbose,
+             headful=headful,
+             oversight=oversight,
+             oversight_model=oversight_model,
+         )
+         return
+
+     client = get_client()
+
+     # Parse BYOK keys
+     byok_keys = None
+     if byok:
+         byok_keys = {}
+         for b in byok:
+             if "=" not in b:
+                 console.print(
+                     f"[red]Error:[/red] Invalid --byok format: {b}. Expected 'provider=key'",
+                     style="bold",
+                 )
+                 raise typer.Exit(1)
+             provider, key = b.split("=", 1)
+             byok_keys[provider] = key
+
+     try:
+         result = client.create_job(
+             models=model,
+             name=name,
+             pass_k=pass_k,
+             project_key=project_key if project_key else None,
+             task_keys=task_keys if task_keys else None,
+             max_steps=max_steps,
+             max_duration_minutes=max_duration,
+             max_concurrent_per_model=max_concurrent,
+             byok_keys=byok_keys,
+         )
+     except Exception as e:
+         error_str = str(e)
+
+         # Check if it's a model not found error and format nicely
+         if "not found" in error_str.lower() and "available models" in error_str.lower():
+             console.print(f"[red]Error:[/red] Invalid model specified")
+             console.print()
+             # Extract and display available models
+             if "Available models:" in error_str:
+                 try:
+                     models_part = error_str.split("Available models:")[1].strip()
+                     # Parse the list string
+                     import ast
+                     available = ast.literal_eval(models_part)
+                     console.print("[bold]Available models:[/bold]")
+                     for m in sorted(available):
+                         console.print(f" [cyan]{m}[/cyan]")
+                 except:
+                     console.print(f"[dim]{error_str}[/dim]")
+         else:
+             console.print(f"[red]Error creating job:[/red] {e}")
+         raise typer.Exit(1)
+
+     job_id = result.job_id
+
+     if output_json:
+         console.print(json.dumps(result.model_dump(), indent=2, default=str))
+         return
+
+     # Display summary
+     suite_name = project_key if project_key else "all tasks"
+     job_name = name or result.name # Use provided name or server-generated name
+     console.print()
+     console.print("[green bold]Eval started[/green bold]")
+     console.print()
+     if job_name:
+         console.print(f" [bold]Name[/bold] {job_name}")
+     console.print(f" [bold]Suite[/bold] {suite_name}")
+     console.print(f" [bold]Models[/bold] {', '.join(model)}")
+     console.print(f" [bold]Passes[/bold] {pass_k}")
+     console.print(f" [bold]Job ID[/bold] [cyan]{job_id}[/cyan]")
+     console.print()
+
+     # Show dashboard link
+     console.print(Panel(
+         f"[bold]Live agent traces[/bold]\n\n https://www.fleetai.com/dashboard/jobs/{job_id}",
+         border_style="cyan",
+     ))
+     console.print()
+
+     # Show tips
+     console.print("[dim]Tips:[/dim]")
+     console.print(f"[dim] Job details: flt jobs get {job_id}[/dim]")
+     console.print(f"[dim] Job sessions: flt jobs sessions {job_id}[/dim]")
+     console.print(f"[dim] Session transcript: flt sessions transcript <session-id>[/dim]")
+     console.print()
+
+     if no_watch:
+         return
+
+     # Watch progress
+     console.print("[dim]Watching progress... (Press Ctrl+B to detach, job continues running)[/dim]")
+     console.print()
+
+     # Terminal statuses for sessions
+     TERMINAL_SESSION_STATUSES = {"completed", "timed_out", "errored", "failed"}
+     TERMINAL_JOB_STATUSES = {"completed", "errored", "failed", "cancelled"}
+
+     detached = False
+     detach_event = threading.Event()
+
+     # Start keyboard listener thread
+     listener_thread = threading.Thread(target=_listen_for_detach_key, args=(detach_event,), daemon=True)
+     listener_thread.start()
+
+     try:
+         with Progress(
+             SpinnerColumn(),
+             TextColumn("[progress.description]{task.description}"),
+             BarColumn(),
+             TaskProgressColumn(),
+             TimeRemainingColumn(),
+             console=console,
+         ) as progress:
+             task = progress.add_task("[cyan]Starting eval...", total=None)
+
+             while True:
+                 # Poll sessions for progress
+                 try:
+                     sessions_response = client.list_job_sessions(job_id)
+                     total = sessions_response.total_sessions
+
+                     # Count sessions in terminal state
+                     completed = sum(
+                         1 for tg in sessions_response.tasks
+                         for s in tg.sessions
+                         if s.status in TERMINAL_SESSION_STATUSES
+                     )
+
+                     # Count passed sessions and collect scores
+                     passed = 0
+                     scores = []
+                     for tg in sessions_response.tasks:
+                         for s in tg.sessions:
+                             if s.verifier_execution:
+                                 if s.verifier_execution.success:
+                                     passed += 1
+                                 if s.verifier_execution.score is not None:
+                                     scores.append(s.verifier_execution.score)
+
+                     # Calculate average score
+                     avg_score = sum(scores) / len(scores) if scores else None
+
+                     if total > 0:
+                         # Build description with score if available
+                         if avg_score is not None:
+                             desc = f"[cyan]Running ({completed}/{total}) | {passed} passed | avg: {avg_score:.2f}[/cyan]"
+                         else:
+                             desc = f"[cyan]Running ({completed}/{total}) | {passed} passed[/cyan]"
+
+                         progress.update(
+                             task,
+                             completed=completed,
+                             total=total,
+                             description=desc
+                         )
+
+                         # Check if all sessions are done
+                         if completed >= total:
+                             break
+                 except:
+                     # Sessions endpoint might not be ready yet
+                     pass
+
+                 # Also check job status as fallback
+                 try:
+                     job = client.get_job(job_id)
+                     if job.status in TERMINAL_JOB_STATUSES:
+                         break
+                 except:
+                     pass
+
+                 # Check if user pressed Ctrl+B to detach
+                 if detach_event.is_set():
+                     detached = True
+                     break
+
+                 time.sleep(3) # Poll every 3 seconds
+
+         # Show final status
+         console.print()
+         try:
+             job = client.get_job(job_id)
+             console.print(f"[bold]Final Status:[/bold] {format_status(job.status)}")
+
+             # Show summary stats
+             sessions_response = client.list_job_sessions(job_id)
+             total_passed = sum(tg.passed_sessions for tg in sessions_response.tasks)
+             total_sessions = sessions_response.total_sessions
+
+             if total_sessions > 0:
+                 pass_rate = (total_passed / total_sessions) * 100
+
+                 # Color the pass rate
+                 if pass_rate >= 70:
+                     rate_color = "green"
+                 elif pass_rate >= 40:
+                     rate_color = "yellow"
+                 else:
+                     rate_color = "red"
+
+                 console.print(f"[bold]Pass Rate:[/bold] [{rate_color}]{total_passed}/{total_sessions} ({pass_rate:.1f}%)[/{rate_color}]")
+
+                 # Show per-task breakdown if multiple tasks
+                 if len(sessions_response.tasks) > 1:
+                     console.print()
+                     console.print("[bold]Per-task results:[/bold]")
+                     for tg in sessions_response.tasks:
+                         task_name = tg.task.key if tg.task else tg.task_id or "Unknown"
+                         task_rate = tg.pass_rate * 100
+                         console.print(f" {task_name}: {tg.passed_sessions}/{tg.total_sessions} ({task_rate:.0f}%)")
+         except:
+             pass
+
+         # Run oversight if requested and job completed (not detached)
+         if oversight and not detached:
+             _run_oversight(job_id, oversight_model)
+
+     finally:
+         # Signal the keyboard listener thread to stop
+         detach_event.set()
+
+     # Show detached message if user pressed Ctrl+B
+     if detached:
+         console.print()
+         console.print("[yellow]Detached. Eval continues running in background.[/yellow]")
+         console.print(f"[dim]Check status: flt jobs get {job_id}[/dim]")
+         if oversight:
+             console.print(f"[dim]Run oversight manually: flt jobs oversight {job_id}[/dim]")
+
+
+ def main():
+     """Entry point for the CLI."""
+     app()
+
+
+ if __name__ == "__main__":
+     main()
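
For orientation, the watch loop in eval_run above boils down to polling two SDK calls until every session reaches a terminal status. The stripped-down sketch below continues from the client and result objects in the earlier example; the field names (total_sessions, tasks, sessions, verifier_execution) are assumptions read off the cli.py code above.

    # Poll a job the way `flt eval run` does, minus the Rich progress bar.
    # Based on the loop in fleet/cli.py above; field names are taken from that code.
    import time

    TERMINAL = {"completed", "timed_out", "errored", "failed"}

    while True:
        sessions = client.list_job_sessions(result.job_id)
        done = sum(
            1
            for group in sessions.tasks
            for s in group.sessions
            if s.status in TERMINAL
        )
        if sessions.total_sessions and done >= sessions.total_sessions:
            break
        time.sleep(3)

    for group in sessions.tasks:
        for s in group.sessions:
            verdict = "PASS" if s.verifier_execution and s.verifier_execution.success else "FAIL"
            print(s.session_id, s.model, verdict)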