fleet-python 0.2.66b2__py3-none-any.whl → 0.2.105__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/export_tasks.py +16 -5
- examples/export_tasks_filtered.py +245 -0
- examples/fetch_tasks.py +230 -0
- examples/import_tasks.py +140 -8
- examples/iterate_verifiers.py +725 -0
- fleet/__init__.py +128 -5
- fleet/_async/__init__.py +27 -3
- fleet/_async/base.py +24 -9
- fleet/_async/client.py +938 -41
- fleet/_async/env/client.py +60 -3
- fleet/_async/instance/client.py +52 -7
- fleet/_async/models.py +15 -0
- fleet/_async/resources/api.py +200 -0
- fleet/_async/resources/sqlite.py +1801 -46
- fleet/_async/tasks.py +122 -25
- fleet/_async/verifiers/bundler.py +22 -21
- fleet/_async/verifiers/verifier.py +25 -19
- fleet/agent/__init__.py +32 -0
- fleet/agent/gemini_cua/Dockerfile +45 -0
- fleet/agent/gemini_cua/__init__.py +10 -0
- fleet/agent/gemini_cua/agent.py +759 -0
- fleet/agent/gemini_cua/mcp/main.py +108 -0
- fleet/agent/gemini_cua/mcp_server/__init__.py +5 -0
- fleet/agent/gemini_cua/mcp_server/main.py +105 -0
- fleet/agent/gemini_cua/mcp_server/tools.py +178 -0
- fleet/agent/gemini_cua/requirements.txt +5 -0
- fleet/agent/gemini_cua/start.sh +30 -0
- fleet/agent/orchestrator.py +854 -0
- fleet/agent/types.py +49 -0
- fleet/agent/utils.py +34 -0
- fleet/base.py +34 -9
- fleet/cli.py +1061 -0
- fleet/client.py +1060 -48
- fleet/config.py +1 -1
- fleet/env/__init__.py +16 -0
- fleet/env/client.py +60 -3
- fleet/eval/__init__.py +15 -0
- fleet/eval/uploader.py +231 -0
- fleet/exceptions.py +8 -0
- fleet/instance/client.py +53 -8
- fleet/instance/models.py +1 -0
- fleet/models.py +303 -0
- fleet/proxy/__init__.py +25 -0
- fleet/proxy/proxy.py +453 -0
- fleet/proxy/whitelist.py +244 -0
- fleet/resources/api.py +200 -0
- fleet/resources/sqlite.py +1845 -46
- fleet/tasks.py +113 -20
- fleet/utils/__init__.py +7 -0
- fleet/utils/http_logging.py +178 -0
- fleet/utils/logging.py +13 -0
- fleet/utils/playwright.py +440 -0
- fleet/verifiers/bundler.py +22 -21
- fleet/verifiers/db.py +985 -1
- fleet/verifiers/decorator.py +1 -1
- fleet/verifiers/verifier.py +25 -19
- {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/METADATA +28 -1
- fleet_python-0.2.105.dist-info/RECORD +115 -0
- {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/WHEEL +1 -1
- fleet_python-0.2.105.dist-info/entry_points.txt +2 -0
- tests/test_app_method.py +85 -0
- tests/test_expect_exactly.py +4148 -0
- tests/test_expect_only.py +2593 -0
- tests/test_instance_dispatch.py +607 -0
- tests/test_sqlite_resource_dual_mode.py +263 -0
- tests/test_sqlite_shared_memory_behavior.py +117 -0
- fleet_python-0.2.66b2.dist-info/RECORD +0 -81
- tests/test_verifier_security.py +0 -427
- {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/licenses/LICENSE +0 -0
- {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/top_level.txt +0 -0
fleet/cli.py
ADDED
|
@@ -0,0 +1,1061 @@
|
|
|
1
|
+
"""Fleet CLI - Command line interface for Fleet SDK."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import signal
|
|
6
|
+
import sys
|
|
7
|
+
import threading
|
|
8
|
+
import time
|
|
9
|
+
from typing import List, Optional
|
|
10
|
+
|
|
11
|
+
# Load .env file if present (before other imports that might need env vars)
|
|
12
|
+
try:
|
|
13
|
+
from dotenv import load_dotenv
|
|
14
|
+
load_dotenv()
|
|
15
|
+
except ImportError:
|
|
16
|
+
pass # python-dotenv not installed, skip
|
|
17
|
+
|
|
18
|
+
try:
|
|
19
|
+
import typer
|
|
20
|
+
from rich.console import Console
|
|
21
|
+
from rich.live import Live
|
|
22
|
+
from rich.panel import Panel
|
|
23
|
+
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TaskProgressColumn, TimeRemainingColumn
|
|
24
|
+
from rich.table import Table
|
|
25
|
+
except ImportError:
|
|
26
|
+
print(
|
|
27
|
+
"Error: CLI dependencies not installed.\n"
|
|
28
|
+
"Install with: pip install 'fleet-python[cli]'",
|
|
29
|
+
file=sys.stderr,
|
|
30
|
+
)
|
|
31
|
+
sys.exit(1)
|
|
32
|
+
|
|
33
|
+
from .client import Fleet
|
|
34
|
+
from .models import JobCreateRequest
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Top-level Typer application. `no_args_is_help=True` makes a bare `flt`
# invocation print usage instead of erroring out.
app = typer.Typer(
    name="flt",
    help="Fleet CLI - Interact with Fleet jobs and sessions",
    no_args_is_help=True,
)
# One sub-application per command group (`flt jobs ...`, `flt sessions ...`, etc.).
jobs_app = typer.Typer(help="Manage jobs", no_args_is_help=True)
sessions_app = typer.Typer(help="Manage sessions", no_args_is_help=True)
eval_app = typer.Typer(help="Run evaluations", no_args_is_help=True)
projects_app = typer.Typer(help="Manage projects", no_args_is_help=True)

# Mount the sub-apps under their command names.
app.add_typer(jobs_app, name="jobs")
app.add_typer(sessions_app, name="sessions")
app.add_typer(eval_app, name="eval")
app.add_typer(projects_app, name="projects")

# Shared Rich console used by every command for styled output.
console = Console()


# Fallback API endpoint when FLEET_BASE_URL is not set in the environment.
CLI_DEFAULT_BASE_URL = "https://us-west-1.fleetai.com"
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def colorize_score(score: float) -> str:
    """Render *score* as Rich markup, colored by threshold.

    Green for scores >= 0.7, yellow for >= 0.4, red otherwise.
    """
    if score >= 0.7:
        color = "green"
    elif score >= 0.4:
        color = "yellow"
    else:
        color = "red"
    return f"[{color}]{score:.2f}[/{color}]"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def format_status(status: Optional[str]) -> str:
    """Format a job/session status as colored Rich markup.

    Known statuses get an icon plus a color; unknown or missing values
    are rendered dimmed.
    """
    if not status:
        return "[dim]-[/dim]"

    known = {
        "completed": "[green]✓ completed[/green]",
        "in_progress": "[yellow]● running[/yellow]",
        "pending": "[dim]○ pending[/dim]",
        "load_tasks": "[blue]↻ loading[/blue]",
        "failed": "[red]✗ failed[/red]",
        "cancelled": "[dim]✗ cancelled[/dim]",
    }
    try:
        return known[status]
    except KeyError:
        # Unrecognized status: show it verbatim but de-emphasized.
        return f"[dim]{status}[/dim]"
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def get_client() -> Fleet:
    """Build a Fleet client from FLEET_API_KEY / FLEET_BASE_URL env vars.

    Exits the CLI with status 1 when FLEET_API_KEY is absent.
    """
    key = os.getenv("FLEET_API_KEY")
    if not key:
        console.print(
            "[red]Error:[/red] FLEET_API_KEY environment variable not set",
            style="bold",
        )
        raise typer.Exit(1)
    return Fleet(
        api_key=key,
        base_url=os.getenv("FLEET_BASE_URL", CLI_DEFAULT_BASE_URL),
    )
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _run_oversight(job_id: str, model: str = "anthropic/claude-sonnet-4") -> None:
    """Run oversight summarization on a completed job.

    Posts to the `/v1/summarize/job` endpoint and prints the outcome.
    Best-effort: missing API key or request failures print a warning
    instead of raising, so callers never fail because of oversight.
    """
    # Imported lazily so httpx is only required when oversight is used.
    import httpx

    api_key = os.getenv("FLEET_API_KEY")
    if not api_key:
        console.print("[yellow]Warning:[/yellow] FLEET_API_KEY not set, skipping oversight")
        return

    base_url = os.getenv("FLEET_BASE_URL", CLI_DEFAULT_BASE_URL)
    oversight_url = f"{base_url}/v1/summarize/job"

    console.print()
    console.print("[bold]Running Oversight Analysis...[/bold]")

    try:
        # Long timeout: summarization of a whole job can take minutes.
        with httpx.Client(timeout=300) as client:
            response = client.post(
                oversight_url,
                headers={
                    "accept": "application/json",
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "job_id": job_id,
                    "model": model,
                    "max_context_tokens": 180000,
                    "force_new_summary": False,
                    "max_concurrent": 20,
                },
            )

            if response.status_code == 200:
                result = response.json()
                console.print(f"[green]✓[/green] Oversight analysis started")
                if "summary_id" in result:
                    console.print(f" Summary ID: [cyan]{result['summary_id']}[/cyan]")
                # Show link to dashboard
                console.print(f" View: [cyan]https://fleetai.com/dashboard/jobs/{job_id}[/cyan]")
            else:
                console.print(f"[yellow]Warning:[/yellow] Oversight API returned {response.status_code}")
                # Truncate the body so a large error page doesn't flood the terminal.
                console.print(f" {response.text[:200]}")
    except Exception as e:
        # Deliberately broad: oversight is optional and must never crash the CLI.
        console.print(f"[yellow]Warning:[/yellow] Oversight request failed: {e}")
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# Jobs commands
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
@jobs_app.command("list")
def list_jobs(
    team_id: Optional[str] = typer.Option(None, "--team-id", help="Filter by team ID (admin only)"),
    output_json: bool = typer.Option(False, "--json", help="Output as JSON"),
):
    """List all jobs."""
    fleet = get_client()
    jobs = fleet.list_jobs(team_id=team_id)

    if output_json:
        payload = [j.model_dump() for j in jobs]
        console.print(json.dumps(payload, indent=2, default=str))
        return

    if not jobs:
        console.print("No jobs found.")
        return

    table = Table(title="Jobs")
    # Column spec: (header, rich style).
    for header, col_style in (
        ("ID", "cyan"),
        ("Name", "green"),
        ("Status", "yellow"),
        ("Created At", "dim"),
    ):
        table.add_column(header, style=col_style)

    for job in jobs:
        table.add_row(
            job.id,
            job.name or "-",
            format_status(job.status),
            job.created_at or "-",
        )

    console.print(table)

    # Suggest follow-up commands using a real job ID from the listing.
    sample_id = jobs[0].id
    console.print()
    console.print("[dim]Tips:[/dim]")
    console.print(f"[dim] Job details: flt jobs get {sample_id}[/dim]")
    console.print(f"[dim] Job sessions: flt jobs sessions {sample_id}[/dim]")
    console.print(f"[dim] Session transcript: flt sessions transcript <session-id>[/dim]")
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
@jobs_app.command("create")
def create_job(
    model: List[str] = typer.Option(..., "--model", "-m", help="Model in 'provider/model' format (repeatable)"),
    env_key: Optional[str] = typer.Option(None, "--env-key", "-e", help="Environment key"),
    project_key: Optional[str] = typer.Option(None, "--project-key", "-p", help="Project key"),
    task_keys: Optional[List[str]] = typer.Option(None, "--task-key", "-t", help="Task key (repeatable)"),
    name: Optional[str] = typer.Option(None, "--name", "-n", help="Job name. Supports placeholders: {id} (UUID), {sid} (short UUID), {i} (auto-increment, must be suffix)"),
    pass_k: int = typer.Option(1, "--pass-k", help="Number of passes"),
    max_steps: Optional[int] = typer.Option(None, "--max-steps", help="Maximum agent steps"),
    max_duration: int = typer.Option(60, "--max-duration", help="Timeout in minutes"),
    max_concurrent: int = typer.Option(30, "--max-concurrent", help="Max concurrent per model"),
    mode: Optional[str] = typer.Option(None, "--mode", help="Mode: 'tool-use' or 'computer-use'"),
    system_prompt: Optional[str] = typer.Option(None, "--system-prompt", help="Custom system prompt"),
    model_prompt: Optional[List[str]] = typer.Option(None, "--model-prompt", help="Per-model prompt in 'provider/model=prompt' format (repeatable)"),
    byok: Optional[List[str]] = typer.Option(None, "--byok", help="Bring Your Own Key in 'provider=key' format (repeatable)"),
    byok_ttl: Optional[int] = typer.Option(None, "--byok-ttl", help="TTL for BYOK keys in minutes"),
    harness: Optional[str] = typer.Option(None, "--harness", help="Harness identifier"),
    output_json: bool = typer.Option(False, "--json", help="Output as JSON"),
):
    """Create a new job.

    Requires --model (repeatable) and exactly one of --env-key, --project-key, or --task-key.
    """
    # Validate mutual exclusivity: tasks may come from an env, a project,
    # or an explicit key list, but never more than one source at once.
    sources = [env_key, project_key, task_keys]
    specified = sum(1 for s in sources if s)
    if specified != 1:
        console.print(
            "[red]Error:[/red] Exactly one of --env-key, --project-key, or --task-key must be specified",
            style="bold",
        )
        raise typer.Exit(1)

    # Parse repeated --model-prompt flags ('provider/model=prompt') into a dict.
    # split("=", 1) keeps any '=' characters inside the prompt text intact.
    model_prompts = None
    if model_prompt:
        model_prompts = {}
        for mp in model_prompt:
            if "=" not in mp:
                console.print(
                    f"[red]Error:[/red] Invalid --model-prompt format: {mp}. Expected 'provider/model=prompt'",
                    style="bold",
                )
                raise typer.Exit(1)
            key, value = mp.split("=", 1)
            model_prompts[key] = value

    # Parse repeated --byok flags ('provider=key') into a provider -> key dict.
    byok_keys = None
    if byok:
        byok_keys = {}
        for b in byok:
            if "=" not in b:
                console.print(
                    f"[red]Error:[/red] Invalid --byok format: {b}. Expected 'provider=key'",
                    style="bold",
                )
                raise typer.Exit(1)
            provider, key = b.split("=", 1)
            byok_keys[provider] = key

    client = get_client()

    try:
        result = client.create_job(
            models=model,
            name=name,
            pass_k=pass_k,
            env_key=env_key,
            project_key=project_key,
            task_keys=task_keys,
            max_steps=max_steps,
            max_duration_minutes=max_duration,
            max_concurrent_per_model=max_concurrent,
            mode=mode,
            system_prompt=system_prompt,
            model_prompts=model_prompts,
            byok_keys=byok_keys,
            byok_ttl_minutes=byok_ttl,
            harness=harness,
        )
    except Exception as e:
        # Surface the server/SDK error message and exit non-zero.
        console.print(f"[red]Error creating job:[/red] {e}")
        raise typer.Exit(1)

    if output_json:
        console.print(json.dumps(result.model_dump(), indent=2, default=str))
        return

    # Human-readable confirmation with the identifiers needed for follow-up commands.
    console.print(f"[green]Job created successfully![/green]")
    console.print(f" Job ID: [cyan]{result.job_id}[/cyan]")
    if result.workflow_job_id:
        console.print(f" Workflow ID: {result.workflow_job_id}")
    console.print(f" Status: {format_status(result.status)}")
    if result.name:
        console.print(f" Name: {result.name}")
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
@jobs_app.command("get")
def get_job(
    job_id: str = typer.Argument(..., help="Job ID"),
    team_id: Optional[str] = typer.Option(None, "--team-id", help="Team ID (admin only)"),
    output_json: bool = typer.Option(False, "--json", help="Output as JSON"),
):
    """Get details for a specific job."""
    fleet = get_client()
    job = fleet.get_job(job_id, team_id=team_id)

    if output_json:
        console.print(json.dumps(job.model_dump(), indent=2, default=str))
        return

    console.print("[bold]Job Details[/bold]")
    details = [
        f" ID: [cyan]{job.id}[/cyan]",
        f" Name: {job.name or '-'}",
        f" Status: {format_status(job.status)}",
        f" Created At: {job.created_at or '-'}",
    ]
    for line in details:
        console.print(line)

    # Point the user at the natural next commands.
    console.print()
    console.print("[dim]Tips:[/dim]")
    console.print(f"[dim] Job sessions: flt jobs sessions {job.id}[/dim]")
    console.print(f"[dim] Session transcript: flt sessions transcript <session-id>[/dim]")
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
@jobs_app.command("sessions")
def list_job_sessions(
    job_id: str = typer.Argument(..., help="Job ID"),
    output_json: bool = typer.Option(False, "--json", help="Output as JSON"),
):
    """List all sessions for a job, grouped by task.

    Prints one table per task with per-session status and verifier
    outcome, plus aggregate pass rate and average score for the task.
    """
    client = get_client()
    result = client.list_job_sessions(job_id)

    if output_json:
        console.print(json.dumps(result.model_dump(), indent=2, default=str))
        return

    console.print(f"[bold]Sessions for Job:[/bold] [cyan]{result.job_id}[/cyan]")
    console.print(f"Total Sessions: {result.total_sessions}\n")

    # Remember the first session we see so the tips below can show a real ID.
    first_session_id = None
    for task_group in result.tasks:
        # Prefer the task's key; fall back to its raw ID, then "Unknown".
        task_name = task_group.task.key if task_group.task else task_group.task_id or "Unknown"
        pass_rate_pct = task_group.pass_rate * 100

        console.print(f"[bold green]Task:[/bold green] {task_name}")
        console.print(f" Pass Rate: {task_group.passed_sessions}/{task_group.total_sessions} ({pass_rate_pct:.1f}%)")
        if task_group.average_score is not None:
            console.print(f" Average Score: {task_group.average_score:.2f}")

        table = Table(show_header=True)
        table.add_column("Session ID", style="cyan")
        table.add_column("Model", style="blue")
        table.add_column("Status", style="yellow")
        table.add_column("Steps")
        table.add_column("Result")

        for session in task_group.sessions:
            if first_session_id is None:
                first_session_id = session.session_id
            # Verifier outcome: PASS (optionally with a colored score) / FAIL,
            # or "-" when the verifier has not run for this session.
            result_str = "-"
            if session.verifier_execution:
                if session.verifier_execution.success:
                    result_str = "[green]PASS[/green]"
                    if session.verifier_execution.score is not None:
                        score_colored = colorize_score(session.verifier_execution.score)
                        result_str += f" ({score_colored})"
                else:
                    result_str = "[red]FAIL[/red]"

            table.add_row(
                session.session_id,
                session.model,
                format_status(session.status),
                str(session.step_count),
                result_str,
            )

        console.print(table)
        console.print()

    # Show tips with a real session ID
    if first_session_id:
        console.print("[dim]Tips:[/dim]")
        console.print(f"[dim] Session transcript: flt sessions transcript {first_session_id}[/dim]")
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
@jobs_app.command("oversight")
def run_job_oversight(
    job_id: str = typer.Argument(..., help="Job ID to analyze"),
    model: str = typer.Option("anthropic/claude-sonnet-4", "--model", "-m", help="Model for oversight analysis"),
):
    """Run AI oversight analysis on a job."""
    # Thin CLI wrapper; the shared helper does the actual API call.
    _run_oversight(job_id, model=model)
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
# Sessions commands
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
@sessions_app.command("transcript")
def get_session_transcript(
    session_id: str = typer.Argument(..., help="Session ID"),
    output_json: bool = typer.Option(False, "--json", help="Output as JSON"),
):
    """Get the transcript for a session.

    Prints session/instance status, the task prompt, the verifier
    result, then every transcript message with role-colored headers.
    Long message bodies are truncated to 500 characters for readability.
    """
    client = get_client()
    result = client.get_session_transcript(session_id)

    if output_json:
        console.print(json.dumps(result.model_dump(), indent=2, default=str))
        return

    def _clip(text: str, limit: int = 500) -> str:
        # Single place for the truncation rule (was duplicated inline).
        return text[:limit] + "..." if len(text) > limit else text

    # Header
    console.print(f"[bold]Session Transcript[/bold]")
    if result.instance:
        console.print(f" Status: {format_status(result.instance.status)}")
    console.print()

    # Task info
    if result.task:
        console.print(f"[bold]Task:[/bold] {result.task.key}")
        console.print(f" Environment: {result.task.env_id}")
        if result.task.version:
            console.print(f" Version: {result.task.version}")
        console.print()
        console.print(f"[bold]Prompt:[/bold]")
        console.print(f" {result.task.prompt}")
        console.print()

    # Verifier result
    if result.verifier_execution:
        status = "[green]PASS[/green]" if result.verifier_execution.success else "[red]FAIL[/red]"
        console.print(f"[bold]Verifier Result:[/bold] {status}")
        if result.verifier_execution.score is not None:
            score_colored = colorize_score(result.verifier_execution.score)
            console.print(f" Score: {score_colored}")
        console.print(f" Execution Time: {result.verifier_execution.execution_time_ms}ms")
        console.print()

    # Transcript
    console.print(f"[bold]Transcript:[/bold] ({len(result.transcript)} messages)")
    console.print("-" * 60)

    # Hoisted out of the loop: this mapping is loop-invariant (was rebuilt
    # once per message in the original).
    role_colors = {
        "user": "green",
        "assistant": "blue",
        "tool": "yellow",
        "system": "magenta",
    }

    for msg in result.transcript:
        color = role_colors.get(msg.role, "white")
        console.print(f"[bold {color}]{msg.role.upper()}:[/bold {color}]")

        # Handle content: plain string, multimodal list, or anything else.
        if isinstance(msg.content, str):
            console.print(f" {_clip(msg.content)}")
        elif isinstance(msg.content, list):
            # Multimodal content: text chunks, images, tool use/results.
            for item in msg.content:
                if isinstance(item, dict):
                    if item.get("type") == "text":
                        console.print(f" {_clip(item.get('text', ''))}")
                    elif item.get("type") == "image_url":
                        console.print(f" [dim][Image][/dim]")
                    elif item.get("type") == "tool_use":
                        console.print(f" [dim]Tool: {item.get('name', 'unknown')}[/dim]")
                    elif item.get("type") == "tool_result":
                        console.print(f" [dim]Tool Result[/dim]")
                else:
                    console.print(f" {item}")
        else:
            console.print(f" {msg.content}")

        # Tool calls: support both OpenAI-style nested "function" dicts
        # and flat {"name": ...} dicts.
        if msg.tool_calls:
            for tc in msg.tool_calls:
                if isinstance(tc, dict):
                    name = tc.get("function", {}).get("name", tc.get("name", "unknown"))
                    console.print(f" [dim]-> Tool call: {name}[/dim]")

        console.print()
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
# Projects commands
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
@projects_app.command("list")
def list_projects(
    output_json: bool = typer.Option(False, "--json", help="Output as JSON"),
):
    """List all active projects."""
    fleet = get_client()

    # Call the projects endpoint directly since there's no SDK method yet
    response = fleet.client.request("GET", "/v1/tasks/projects")
    data = response.json()

    if output_json:
        console.print(json.dumps(data, indent=2, default=str))
        return

    projects = data.get("projects", [])
    if not projects:
        console.print("No projects found.")
        return

    table = Table(title="Projects")
    table.add_column("Project Key", style="cyan", no_wrap=True)
    table.add_column("Modality", style="blue")
    table.add_column("Created At", style="dim")

    # API reports snake_case modality names; display them hyphenated.
    pretty_modality = {"tool_use": "tool-use", "computer_use": "computer-use"}
    for entry in projects:
        raw = entry.get("task_modality") or "-"
        table.add_row(
            entry.get("project_key", "-"),
            pretty_modality.get(raw, raw),
            entry.get("created_at", "-"),
        )

    console.print(table)

    # Suggest a runnable follow-up command using a real project key.
    sample_key = projects[0].get("project_key", "my-project")
    console.print()
    console.print("[dim]Tips:[/dim]")
    console.print(f"[dim] Run eval: flt eval run -p {sample_key} -m openai/gpt-4o-mini[/dim]")
|
|
531
|
+
|
|
532
|
+
|
|
533
|
+
# Eval commands
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
def _run_local_agent(
    project_key: Optional[str],
    task_keys: Optional[List[str]],
    model: str,
    agent: str,
    max_steps: int,
    max_duration: int,
    max_concurrent: int,
    byok: Optional[List[str]],
    output_json: bool,
    verbose: bool = False,
    headful: bool = False,
    oversight: bool = False,
    oversight_model: str = "anthropic/claude-sonnet-4",
):
    """Run agent locally with Docker-based browser control.

    Collects provider API keys from the environment and --byok flags,
    runs the agent via fleet.agent.run_agent under asyncio, then prints
    either a JSON dump or a colored per-task summary. Optionally kicks
    off oversight analysis at the end.
    """
    # Imported lazily so CLI startup stays fast when this path is unused.
    import asyncio
    import logging

    if verbose:
        logging.basicConfig(level=logging.DEBUG, format='%(name)s: %(message)s')

    # Collect provider API keys from the environment (only the ones set).
    api_keys = {}
    if os.getenv("GEMINI_API_KEY"):
        api_keys["GEMINI_API_KEY"] = os.getenv("GEMINI_API_KEY")
    if os.getenv("OPENAI_API_KEY"):
        api_keys["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
    if os.getenv("ANTHROPIC_API_KEY"):
        api_keys["ANTHROPIC_API_KEY"] = os.getenv("ANTHROPIC_API_KEY")

    # --byok flags ('provider=key') override/extend the env-derived keys.
    # Unknown providers map to a synthesized <PROVIDER>_API_KEY variable name.
    if byok:
        provider_to_env = {"google": "GEMINI_API_KEY", "openai": "OPENAI_API_KEY", "anthropic": "ANTHROPIC_API_KEY"}
        for b in byok:
            if "=" not in b:
                console.print(f"[red]Error:[/red] Invalid --byok format: {b}")
                raise typer.Exit(1)
            provider, key = b.split("=", 1)
            api_keys[provider_to_env.get(provider.lower(), f"{provider.upper()}_API_KEY")] = key

    # Fail fast when the chosen agent needs a key we don't have.
    if "gemini" in agent.lower() and "GEMINI_API_KEY" not in api_keys:
        console.print("[red]Error:[/red] GEMINI_API_KEY required for gemini_cua agent")
        console.print()
        console.print("Set it via environment:")
        console.print(" [cyan]export GEMINI_API_KEY=your-key[/cyan]")
        console.print()
        console.print("Or pass via --byok:")
        console.print(" [cyan]flt eval run ... --byok google=your-key[/cyan]")
        raise typer.Exit(1)

    if verbose:
        # Only key *names* are printed; never the secret values.
        console.print(f"[dim]API keys configured: {list(api_keys.keys())}[/dim]")

    # Display config (matching remote format)
    suite_name = project_key if project_key else (', '.join(task_keys) if task_keys else "all tasks")
    console.print()
    console.print("[green bold]Eval started[/green bold] [dim](local)[/dim]")
    console.print()
    console.print(f" [bold]Suite[/bold] {suite_name}")
    console.print(f" [bold]Models[/bold] {model}")
    console.print(f" [bold]Agent[/bold] {agent}")
    console.print(f" [bold]Max Steps[/bold] {max_steps}")
    console.print(f" [bold]Concurrent[/bold] {max_concurrent}")
    if headful:
        console.print(f" [bold]Headful[/bold] [green]Yes[/green] (browser visible via noVNC)")
    console.print()

    async def run():
        # Imported here to avoid pulling agent dependencies at module import.
        from fleet.agent import run_agent
        return await run_agent(
            project_key=project_key,
            task_keys=task_keys,
            agent=agent,
            model=model,
            max_concurrent=max_concurrent,
            max_steps=max_steps,
            timeout_seconds=max_duration * 60,
            api_keys=api_keys,
            headful=headful,
            verbose=verbose,
        )

    console.print("[dim]Starting...[/dim]")
    console.print()

    job_id = None
    try:
        # run_agent returns (per-task results, job id) — TODO confirm against fleet.agent.
        results, job_id = asyncio.run(run())
    except KeyboardInterrupt:
        console.print()
        console.print("[yellow]Cancelled.[/yellow]")
        raise typer.Exit(1)
    except Exception as e:
        console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(1)

    # Display results
    if output_json:
        output = []
        for r in results:
            output.append({
                "task_key": r.task_key,
                "task_prompt": r.task_prompt,
                "completed": r.agent_result.completed if r.agent_result else False,
                "final_answer": r.agent_result.final_answer if r.agent_result else None,
                "verification_success": r.verification_success,
                "verification_score": r.verification_score,
                "error": r.error or (r.agent_result.error if r.agent_result else None),
                "steps_taken": r.agent_result.steps_taken if r.agent_result else 0,
                "execution_time_ms": r.execution_time_ms,
            })
        console.print(json.dumps(output, indent=2))
        return

    # Show dashboard link panel (matching remote format)
    console.print()
    if job_id:
        console.print(Panel(
            f"[bold]Live agent traces[/bold]\n\n https://www.fleetai.com/dashboard/jobs/{job_id}",
            border_style="cyan",
        ))
        console.print()
        console.print("[dim]Tips:[/dim]")
        console.print(f"[dim] Job details: flt jobs get {job_id}[/dim]")
        console.print(f"[dim] Job sessions: flt jobs sessions {job_id}[/dim]")
        console.print(f"[dim] Session transcript: flt sessions transcript <session-id>[/dim]")

    # Summary
    console.print()
    console.print("[bold]Results[/bold]")
    console.print("-" * 60)

    errors = 0
    scores = []
    completed = 0

    # Per-task status precedence: error > numeric score > bool pass/fail >
    # agent-reported completion > incomplete.
    for r in results:
        if r.error:
            status = "[red]ERROR[/red]"
            errors += 1
        elif r.verification_score is not None:
            scores.append(r.verification_score)
            completed += 1
            # Color based on score
            if r.verification_score >= 0.7:
                status = f"[green]{r.verification_score:.2f}[/green]"
            elif r.verification_score >= 0.4:
                status = f"[yellow]{r.verification_score:.2f}[/yellow]"
            else:
                status = f"[red]{r.verification_score:.2f}[/red]"
        elif r.verification_success is True:
            status = "[green]PASS[/green]"
            completed += 1
        elif r.verification_success is False:
            status = "[red]FAIL[/red]"
            completed += 1
        elif r.agent_result and r.agent_result.completed:
            status = "[yellow]DONE[/yellow]"
            completed += 1
        else:
            status = "[red]INCOMPLETE[/red]"

        # Truncate long task keys so the summary stays aligned.
        key = r.task_key[:40] + "..." if len(r.task_key) > 40 else r.task_key
        console.print(f" {status} {key}")

        if r.error:
            # Show first 100 chars of error
            err = r.error.replace('\n', ' ')[:100]
            console.print(f" [dim]{err}[/dim]")

    console.print("-" * 60)

    total = len(results)
    if total > 0:
        console.print(f"[bold]Completed:[/bold] {completed}/{total}")
        if scores:
            avg_score = sum(scores) / len(scores)
            score_color = "green" if avg_score >= 0.7 else "yellow" if avg_score >= 0.4 else "red"
            console.print(f"[bold]Avg. Score:[/bold] [{score_color}]{avg_score:.2f}[/{score_color}]")
        if errors:
            console.print(f"[bold]Errors:[/bold] [red]{errors}[/red]")

    # Run oversight if requested
    if oversight and job_id:
        _run_oversight(job_id, oversight_model)
|
|
723
|
+
|
|
724
|
+
|
|
725
|
+
def _listen_for_detach_key(stop_event: threading.Event):
|
|
726
|
+
"""Listen for Ctrl+B in a background thread to signal detachment."""
|
|
727
|
+
try:
|
|
728
|
+
# Platform-specific keyboard input handling
|
|
729
|
+
if sys.platform == 'win32':
|
|
730
|
+
import msvcrt
|
|
731
|
+
while not stop_event.is_set():
|
|
732
|
+
if msvcrt.kbhit():
|
|
733
|
+
ch = msvcrt.getch()
|
|
734
|
+
if ch == b'\x02': # Ctrl+B
|
|
735
|
+
stop_event.set()
|
|
736
|
+
break
|
|
737
|
+
time.sleep(0.1)
|
|
738
|
+
else:
|
|
739
|
+
# Unix-like systems
|
|
740
|
+
import select
|
|
741
|
+
import tty
|
|
742
|
+
import termios
|
|
743
|
+
|
|
744
|
+
old_settings = termios.tcgetattr(sys.stdin)
|
|
745
|
+
try:
|
|
746
|
+
tty.setcbreak(sys.stdin.fileno())
|
|
747
|
+
while not stop_event.is_set():
|
|
748
|
+
# Check if input is available with timeout
|
|
749
|
+
if select.select([sys.stdin], [], [], 0.1)[0]:
|
|
750
|
+
ch = sys.stdin.read(1)
|
|
751
|
+
if ch == '\x02': # Ctrl+B
|
|
752
|
+
stop_event.set()
|
|
753
|
+
break
|
|
754
|
+
finally:
|
|
755
|
+
termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)
|
|
756
|
+
except Exception:
|
|
757
|
+
# If we can't set up keyboard listening, just exit gracefully
|
|
758
|
+
pass
|
|
759
|
+
|
|
760
|
+
|
|
761
|
+
@eval_app.command("run")
def eval_run(
    project_key: Optional[str] = typer.Option(None, "--project", "-p", help="Project key to evaluate"),
    task_keys: Optional[List[str]] = typer.Option(None, "--task", "-t", help="Specific task key(s) to run (repeatable)"),
    model: List[str] = typer.Option(..., "--model", "-m", help="Model (e.g., google/gemini-2.5-pro)"),
    name: Optional[str] = typer.Option(None, "--name", "-n", help="Job name"),
    pass_k: int = typer.Option(1, "--pass-k", "-k", help="Number of passes per task"),
    max_steps: Optional[int] = typer.Option(None, "--max-steps", help="Maximum agent steps"),
    max_duration: int = typer.Option(60, "--max-duration", help="Timeout in minutes"),
    max_concurrent: int = typer.Option(30, "--max-concurrent", help="Max concurrent per model"),
    byok: Optional[List[str]] = typer.Option(None, "--byok", help="Bring Your Own Key: 'provider=key'"),
    no_watch: bool = typer.Option(False, "--no-watch", help="Don't watch progress"),
    output_json: bool = typer.Option(False, "--json", help="Output as JSON"),
    # Local execution
    local: Optional[str] = typer.Option(None, "--local", "-l", help="Run locally. Use 'gemini_cua' for built-in or path for custom agent"),
    headful: bool = typer.Option(False, "--headful", help="Show browser via noVNC (local mode)"),
    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show debug output"),
    # Oversight
    oversight: bool = typer.Option(False, "--oversight", help="Run AI oversight analysis on job completion"),
    oversight_model: str = typer.Option("anthropic/claude-sonnet-4", "--oversight-model", help="Model for oversight analysis"),
):
    """
    Run an evaluation on a project or specific tasks.

    \b
    Examples:
        # Cloud execution (default)
        flt eval run -p my-project -m google/gemini-2.5-pro

        # Run specific task(s)
        flt eval run -t task_abc123 -m google/gemini-2.5-pro --local gemini_cua

        # Local with built-in agent
        flt eval run -p my-project -m google/gemini-2.5-pro --local gemini_cua

        # Local with headful mode (watch the browser)
        flt eval run -p my-project -m google/gemini-2.5-pro --local gemini_cua --headful

        # Local with custom agent
        flt eval run -p my-project -m google/gemini-2.5-pro --local ./my-agent
    """
    # Validate: need either project or task keys
    if not project_key and not task_keys:
        console.print("[red]Error:[/red] Either --project (-p) or --task (-t) must be specified")
        raise typer.Exit(1)

    # Local mode: delegate the entire run to the local agent orchestrator.
    if local is not None:
        _run_local_agent(
            project_key=project_key,
            task_keys=task_keys,
            model=model[0] if model else "gemini-2.5-pro",
            agent=local if local else "gemini_cua",
            max_steps=max_steps or 200,
            max_duration=max_duration,
            max_concurrent=max_concurrent,
            byok=byok,
            output_json=output_json,
            verbose=verbose,
            headful=headful,
            oversight=oversight,
            oversight_model=oversight_model,
        )
        return

    client = get_client()

    # Parse BYOK keys ('provider=key' pairs) into a dict.
    byok_keys = None
    if byok:
        byok_keys = {}
        for b in byok:
            if "=" not in b:
                console.print(
                    f"[red]Error:[/red] Invalid --byok format: {b}. Expected 'provider=key'",
                    style="bold",
                )
                raise typer.Exit(1)
            provider, key = b.split("=", 1)
            byok_keys[provider] = key

    try:
        result = client.create_job(
            models=model,
            name=name,
            pass_k=pass_k,
            project_key=project_key if project_key else None,
            task_keys=task_keys if task_keys else None,
            max_steps=max_steps,
            max_duration_minutes=max_duration,
            max_concurrent_per_model=max_concurrent,
            byok_keys=byok_keys,
        )
    except Exception as e:
        error_str = str(e)

        # Check if it's a model not found error and format nicely
        if "not found" in error_str.lower() and "available models" in error_str.lower():
            console.print("[red]Error:[/red] Invalid model specified")
            console.print()
            # Extract and display available models
            if "Available models:" in error_str:
                try:
                    models_part = error_str.split("Available models:")[1].strip()
                    # Parse the list string (server sends a Python-literal list)
                    import ast
                    available = ast.literal_eval(models_part)
                    console.print("[bold]Available models:[/bold]")
                    for m in sorted(available):
                        console.print(f" [cyan]{m}[/cyan]")
                except Exception:
                    # Couldn't parse the server's list — show the raw error.
                    console.print(f"[dim]{error_str}[/dim]")
        else:
            console.print(f"[red]Error creating job:[/red] {e}")
        raise typer.Exit(1)

    job_id = result.job_id

    if output_json:
        console.print(json.dumps(result.model_dump(), indent=2, default=str))
        return

    # Display summary
    suite_name = project_key if project_key else "all tasks"
    job_name = name or result.name  # Use provided name or server-generated name
    console.print()
    console.print("[green bold]Eval started[/green bold]")
    console.print()
    if job_name:
        console.print(f" [bold]Name[/bold] {job_name}")
    console.print(f" [bold]Suite[/bold] {suite_name}")
    console.print(f" [bold]Models[/bold] {', '.join(model)}")
    console.print(f" [bold]Passes[/bold] {pass_k}")
    console.print(f" [bold]Job ID[/bold] [cyan]{job_id}[/cyan]")
    console.print()

    # Show dashboard link
    console.print(Panel(
        f"[bold]Live agent traces[/bold]\n\n https://www.fleetai.com/dashboard/jobs/{job_id}",
        border_style="cyan",
    ))
    console.print()

    # Show tips
    console.print("[dim]Tips:[/dim]")
    console.print(f"[dim] Job details: flt jobs get {job_id}[/dim]")
    console.print(f"[dim] Job sessions: flt jobs sessions {job_id}[/dim]")
    console.print(f"[dim] Session transcript: flt sessions transcript <session-id>[/dim]")
    console.print()

    if no_watch:
        return

    # Watch progress
    console.print("[dim]Watching progress... (Press Ctrl+B to detach, job continues running)[/dim]")
    console.print()

    # Terminal statuses for sessions
    TERMINAL_SESSION_STATUSES = {"completed", "timed_out", "errored", "failed"}
    TERMINAL_JOB_STATUSES = {"completed", "errored", "failed", "cancelled"}

    detached = False
    detach_event = threading.Event()

    # Start keyboard listener thread (daemon so it never blocks exit)
    listener_thread = threading.Thread(target=_listen_for_detach_key, args=(detach_event,), daemon=True)
    listener_thread.start()

    try:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            TimeRemainingColumn(),
            console=console,
        ) as progress:
            task = progress.add_task("[cyan]Starting eval...", total=None)

            while True:
                # Poll sessions for progress
                try:
                    sessions_response = client.list_job_sessions(job_id)
                    total = sessions_response.total_sessions

                    # Count sessions in terminal state
                    completed = sum(
                        1 for tg in sessions_response.tasks
                        for s in tg.sessions
                        if s.status in TERMINAL_SESSION_STATUSES
                    )

                    # Count passed sessions and collect scores
                    passed = 0
                    scores = []
                    for tg in sessions_response.tasks:
                        for s in tg.sessions:
                            if s.verifier_execution:
                                if s.verifier_execution.success:
                                    passed += 1
                                if s.verifier_execution.score is not None:
                                    scores.append(s.verifier_execution.score)

                    # Calculate average score
                    avg_score = sum(scores) / len(scores) if scores else None

                    if total > 0:
                        # Build description with score if available
                        if avg_score is not None:
                            desc = f"[cyan]Running ({completed}/{total}) | {passed} passed | avg: {avg_score:.2f}[/cyan]"
                        else:
                            desc = f"[cyan]Running ({completed}/{total}) | {passed} passed[/cyan]"

                        progress.update(
                            task,
                            completed=completed,
                            total=total,
                            description=desc
                        )

                        # Check if all sessions are done
                        if completed >= total:
                            break
                # NOTE: narrowed from a bare `except:` so Ctrl+C / SystemExit
                # are no longer swallowed by the polling loop.
                except Exception:
                    # Sessions endpoint might not be ready yet
                    pass

                # Also check job status as fallback
                try:
                    job = client.get_job(job_id)
                    if job.status in TERMINAL_JOB_STATUSES:
                        break
                except Exception:
                    pass

                # Check if user pressed Ctrl+B to detach
                if detach_event.is_set():
                    detached = True
                    break

                time.sleep(3)  # Poll every 3 seconds

        # Show final status
        console.print()
        try:
            job = client.get_job(job_id)
            console.print(f"[bold]Final Status:[/bold] {format_status(job.status)}")

            # Show summary stats
            sessions_response = client.list_job_sessions(job_id)
            total_passed = sum(tg.passed_sessions for tg in sessions_response.tasks)
            total_sessions = sessions_response.total_sessions

            if total_sessions > 0:
                pass_rate = (total_passed / total_sessions) * 100

                # Color the pass rate
                if pass_rate >= 70:
                    rate_color = "green"
                elif pass_rate >= 40:
                    rate_color = "yellow"
                else:
                    rate_color = "red"

                console.print(f"[bold]Pass Rate:[/bold] [{rate_color}]{total_passed}/{total_sessions} ({pass_rate:.1f}%)[/{rate_color}]")

                # Show per-task breakdown if multiple tasks
                if len(sessions_response.tasks) > 1:
                    console.print()
                    console.print("[bold]Per-task results:[/bold]")
                    for tg in sessions_response.tasks:
                        task_name = tg.task.key if tg.task else tg.task_id or "Unknown"
                        task_rate = tg.pass_rate * 100
                        console.print(f" {task_name}: {tg.passed_sessions}/{tg.total_sessions} ({task_rate:.0f}%)")
        except Exception:
            # Final-status display is best-effort; the job itself is unaffected.
            pass

        # Run oversight if requested and job completed (not detached)
        if oversight and not detached:
            _run_oversight(job_id, oversight_model)

    finally:
        # Signal the keyboard listener thread to stop
        detach_event.set()

        # Show detached message if user pressed Ctrl+B
        if detached:
            console.print()
            console.print("[yellow]Detached. Eval continues running in background.[/yellow]")
            console.print(f"[dim]Check status: flt jobs get {job_id}[/dim]")
            if oversight:
                console.print(f"[dim]Run oversight manually: flt jobs oversight {job_id}[/dim]")
|
|
1053
|
+
|
|
1054
|
+
|
|
1055
|
+
def main():
    """Entry point for the CLI.

    Dispatches straight to the Typer application object ``app``.
    """
    app()


# Support direct execution (e.g. `python -m` or running the file itself).
if __name__ == "__main__":
    main()
|