veris-cli 2.1.2__tar.gz → 2.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {veris_cli-2.1.2 → veris_cli-2.2.0}/PKG-INFO +72 -11
- {veris_cli-2.1.2 → veris_cli-2.2.0}/README.md +71 -10
- {veris_cli-2.1.2 → veris_cli-2.2.0}/pyproject.toml +1 -1
- {veris_cli-2.1.2 → veris_cli-2.2.0}/src/veris_cli/api.py +102 -14
- {veris_cli-2.1.2 → veris_cli-2.2.0}/src/veris_cli/cli.py +211 -5
- {veris_cli-2.1.2 → veris_cli-2.2.0}/src/veris_cli/output.py +95 -2
- {veris_cli-2.1.2 → veris_cli-2.2.0}/src/veris_cli/prompts.py +13 -0
- {veris_cli-2.1.2 → veris_cli-2.2.0}/src/veris_cli/scripts/docker_build.sh +6 -0
- {veris_cli-2.1.2 → veris_cli-2.2.0}/src/veris_cli/templates.py +1 -1
- {veris_cli-2.1.2 → veris_cli-2.2.0}/.gitignore +0 -0
- {veris_cli-2.1.2 → veris_cli-2.2.0}/src/veris_cli/__init__.py +0 -0
- {veris_cli-2.1.2 → veris_cli-2.2.0}/src/veris_cli/config.py +0 -0
- {veris_cli-2.1.2 → veris_cli-2.2.0}/src/veris_cli/scripts/__init__.py +0 -0
- {veris_cli-2.1.2 → veris_cli-2.2.0}/src/veris_cli/scripts/docker_push.sh +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: veris-cli
|
|
3
|
-
Version: 2.1.2
|
|
3
|
+
Version: 2.2.0
|
|
4
4
|
Summary: CLI to connect local agents to the Veris backend
|
|
5
5
|
Project-URL: Homepage, https://github.com/veris-ai/veris-cli
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/veris-ai/veris-cli/issues
|
|
@@ -125,18 +125,28 @@ This will:
|
|
|
125
125
|
|
|
126
126
|
**Note:** On macOS, this uses `docker buildx` for multi-platform builds targeting `linux/amd64` (GKE platform).
|
|
127
127
|
|
|
128
|
-
### 6. List Available Scenarios
|
|
128
|
+
### 6. Generate Scenarios (Optional)
|
|
129
|
+
|
|
130
|
+
You can write scenarios by hand (see [Local Development](#local-development--testing)) or generate them automatically:
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
# Generate 5 scenarios + graders using Claude Code
|
|
134
|
+
veris scenarios generate --num 5
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
This launches a K8s job that explores your agent's source code and produces test scenarios and graders. Poll status with:
|
|
129
138
|
|
|
130
139
|
```bash
|
|
131
140
|
veris scenarios list
|
|
132
141
|
```
|
|
133
142
|
|
|
134
|
-
|
|
143
|
+
### 7. List Available Scenarios
|
|
144
|
+
|
|
135
145
|
```bash
|
|
136
|
-
veris scenarios list
|
|
146
|
+
veris scenarios list
|
|
137
147
|
```
|
|
138
148
|
|
|
139
|
-
### 7. Create and Run a Simulation
|
|
149
|
+
### 8. Create and Run a Simulation
|
|
140
150
|
|
|
141
151
|
```bash
|
|
142
152
|
# Interactive mode (prompts for scenario and environment)
|
|
@@ -146,7 +156,7 @@ veris run create
|
|
|
146
156
|
veris run create --scenario-set-id scenset_abc123 --env-id env_xyz789
|
|
147
157
|
```
|
|
148
158
|
|
|
149
|
-
### 8. Monitor Your Run
|
|
159
|
+
### 9. Monitor Your Run
|
|
150
160
|
|
|
151
161
|
```bash
|
|
152
162
|
# Check status
|
|
@@ -162,7 +172,23 @@ veris run logs run_abc123
|
|
|
162
172
|
veris run logs run_abc123 --follow
|
|
163
173
|
```
|
|
164
174
|
|
|
165
|
-
### 9. Cancel a Run (if needed)
|
|
175
|
+
### 10. Evaluate Results (Optional)
|
|
176
|
+
|
|
177
|
+
Once a run completes and graders are available:
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
# List available graders
|
|
181
|
+
veris eval list
|
|
182
|
+
|
|
183
|
+
# Trigger evaluation (interactive prompts for run and grader)
|
|
184
|
+
veris evaluation-runs create
|
|
185
|
+
|
|
186
|
+
# Check evaluation status
|
|
187
|
+
veris evaluation-runs list --run-id run_abc123
|
|
188
|
+
veris evaluation-runs status evalrun_abc123 --run-id run_abc123
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### 11. Cancel a Run (if needed)
|
|
166
192
|
|
|
167
193
|
```bash
|
|
168
194
|
veris run cancel run_abc123
|
|
@@ -208,8 +234,31 @@ veris env list [--status ready]
|
|
|
208
234
|
### Scenarios
|
|
209
235
|
|
|
210
236
|
```bash
|
|
211
|
-
# List
|
|
212
|
-
veris scenarios list [--
|
|
237
|
+
# List scenario sets
|
|
238
|
+
veris scenarios list [--env-id <id>]
|
|
239
|
+
|
|
240
|
+
# Generate scenarios + graders via K8s job
|
|
241
|
+
veris scenarios generate [--env-id <id>] [--num 5] [--image-tag latest]
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
### Eval (Graders)
|
|
245
|
+
|
|
246
|
+
```bash
|
|
247
|
+
# List graders for an environment
|
|
248
|
+
veris eval list [--env-id <id>]
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
### Evaluation Runs
|
|
252
|
+
|
|
253
|
+
```bash
|
|
254
|
+
# Trigger grading on a completed run
|
|
255
|
+
veris evaluation-runs create [--run-id <id>] [--grader-id <id>]
|
|
256
|
+
|
|
257
|
+
# List evaluation runs for a run
|
|
258
|
+
veris evaluation-runs list --run-id <id>
|
|
259
|
+
|
|
260
|
+
# Get evaluation run status and results
|
|
261
|
+
veris evaluation-runs status <eval-run-id> --run-id <id> [--watch]
|
|
213
262
|
```
|
|
214
263
|
|
|
215
264
|
### Runs
|
|
@@ -347,13 +396,25 @@ Each scenario runs in an isolated container with:
|
|
|
347
396
|
└─────────────────────────────────────────────────────────────┘
|
|
348
397
|
↓
|
|
349
398
|
┌─────────────────────────────────────────────────────────────┐
|
|
350
|
-
│ 3. Run Simulations │
|
|
399
|
+
│ 3. Generate Scenarios (optional) │
|
|
400
|
+
│ veris scenarios generate → Claude Code explores agent │
|
|
401
|
+
│ → produces scenarios + graders │
|
|
402
|
+
└─────────────────────────────────────────────────────────────┘
|
|
403
|
+
↓
|
|
404
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
405
|
+
│ 4. Run Simulations │
|
|
351
406
|
│ veris run create → Veris spawns your agent in K8s │
|
|
352
407
|
│ → Runs scenarios against it │
|
|
353
408
|
└─────────────────────────────────────────────────────────────┘
|
|
354
409
|
↓
|
|
355
410
|
┌─────────────────────────────────────────────────────────────┐
|
|
356
|
-
│ 4. Monitor & Analyze │
|
|
411
|
+
│ 5. Evaluate Results (optional) │
|
|
412
|
+
│ veris evaluation-runs create → grades simulation traces │
|
|
413
|
+
│ veris evaluation-runs status → view grading results │
|
|
414
|
+
└─────────────────────────────────────────────────────────────┘
|
|
415
|
+
↓
|
|
416
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
417
|
+
│ 6. Monitor & Analyze │
|
|
357
418
|
│ veris run status → check progress │
|
|
358
419
|
│ veris run logs → view events │
|
|
359
420
|
└─────────────────────────────────────────────────────────────┘
|
|
@@ -104,18 +104,28 @@ This will:
|
|
|
104
104
|
|
|
105
105
|
**Note:** On macOS, this uses `docker buildx` for multi-platform builds targeting `linux/amd64` (GKE platform).
|
|
106
106
|
|
|
107
|
-
### 6. List Available Scenarios
|
|
107
|
+
### 6. Generate Scenarios (Optional)
|
|
108
|
+
|
|
109
|
+
You can write scenarios by hand (see [Local Development](#local-development--testing)) or generate them automatically:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
# Generate 5 scenarios + graders using Claude Code
|
|
113
|
+
veris scenarios generate --num 5
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
This launches a K8s job that explores your agent's source code and produces test scenarios and graders. Poll status with:
|
|
108
117
|
|
|
109
118
|
```bash
|
|
110
119
|
veris scenarios list
|
|
111
120
|
```
|
|
112
121
|
|
|
113
|
-
|
|
122
|
+
### 7. List Available Scenarios
|
|
123
|
+
|
|
114
124
|
```bash
|
|
115
|
-
veris scenarios list
|
|
125
|
+
veris scenarios list
|
|
116
126
|
```
|
|
117
127
|
|
|
118
|
-
### 7. Create and Run a Simulation
|
|
128
|
+
### 8. Create and Run a Simulation
|
|
119
129
|
|
|
120
130
|
```bash
|
|
121
131
|
# Interactive mode (prompts for scenario and environment)
|
|
@@ -125,7 +135,7 @@ veris run create
|
|
|
125
135
|
veris run create --scenario-set-id scenset_abc123 --env-id env_xyz789
|
|
126
136
|
```
|
|
127
137
|
|
|
128
|
-
### 8. Monitor Your Run
|
|
138
|
+
### 9. Monitor Your Run
|
|
129
139
|
|
|
130
140
|
```bash
|
|
131
141
|
# Check status
|
|
@@ -141,7 +151,23 @@ veris run logs run_abc123
|
|
|
141
151
|
veris run logs run_abc123 --follow
|
|
142
152
|
```
|
|
143
153
|
|
|
144
|
-
### 9. Cancel a Run (if needed)
|
|
154
|
+
### 10. Evaluate Results (Optional)
|
|
155
|
+
|
|
156
|
+
Once a run completes and graders are available:
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
# List available graders
|
|
160
|
+
veris eval list
|
|
161
|
+
|
|
162
|
+
# Trigger evaluation (interactive prompts for run and grader)
|
|
163
|
+
veris evaluation-runs create
|
|
164
|
+
|
|
165
|
+
# Check evaluation status
|
|
166
|
+
veris evaluation-runs list --run-id run_abc123
|
|
167
|
+
veris evaluation-runs status evalrun_abc123 --run-id run_abc123
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
### 11. Cancel a Run (if needed)
|
|
145
171
|
|
|
146
172
|
```bash
|
|
147
173
|
veris run cancel run_abc123
|
|
@@ -187,8 +213,31 @@ veris env list [--status ready]
|
|
|
187
213
|
### Scenarios
|
|
188
214
|
|
|
189
215
|
```bash
|
|
190
|
-
# List
|
|
191
|
-
veris scenarios list [--
|
|
216
|
+
# List scenario sets
|
|
217
|
+
veris scenarios list [--env-id <id>]
|
|
218
|
+
|
|
219
|
+
# Generate scenarios + graders via K8s job
|
|
220
|
+
veris scenarios generate [--env-id <id>] [--num 5] [--image-tag latest]
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
### Eval (Graders)
|
|
224
|
+
|
|
225
|
+
```bash
|
|
226
|
+
# List graders for an environment
|
|
227
|
+
veris eval list [--env-id <id>]
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### Evaluation Runs
|
|
231
|
+
|
|
232
|
+
```bash
|
|
233
|
+
# Trigger grading on a completed run
|
|
234
|
+
veris evaluation-runs create [--run-id <id>] [--grader-id <id>]
|
|
235
|
+
|
|
236
|
+
# List evaluation runs for a run
|
|
237
|
+
veris evaluation-runs list --run-id <id>
|
|
238
|
+
|
|
239
|
+
# Get evaluation run status and results
|
|
240
|
+
veris evaluation-runs status <eval-run-id> --run-id <id> [--watch]
|
|
192
241
|
```
|
|
193
242
|
|
|
194
243
|
### Runs
|
|
@@ -326,13 +375,25 @@ Each scenario runs in an isolated container with:
|
|
|
326
375
|
└─────────────────────────────────────────────────────────────┘
|
|
327
376
|
↓
|
|
328
377
|
┌─────────────────────────────────────────────────────────────┐
|
|
329
|
-
│ 3. Run Simulations │
|
|
378
|
+
│ 3. Generate Scenarios (optional) │
|
|
379
|
+
│ veris scenarios generate → Claude Code explores agent │
|
|
380
|
+
│ → produces scenarios + graders │
|
|
381
|
+
└─────────────────────────────────────────────────────────────┘
|
|
382
|
+
↓
|
|
383
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
384
|
+
│ 4. Run Simulations │
|
|
330
385
|
│ veris run create → Veris spawns your agent in K8s │
|
|
331
386
|
│ → Runs scenarios against it │
|
|
332
387
|
└─────────────────────────────────────────────────────────────┘
|
|
333
388
|
↓
|
|
334
389
|
┌─────────────────────────────────────────────────────────────┐
|
|
335
|
-
│ 4. Monitor & Analyze │
|
|
390
|
+
│ 5. Evaluate Results (optional) │
|
|
391
|
+
│ veris evaluation-runs create → grades simulation traces │
|
|
392
|
+
│ veris evaluation-runs status → view grading results │
|
|
393
|
+
└─────────────────────────────────────────────────────────────┘
|
|
394
|
+
↓
|
|
395
|
+
┌─────────────────────────────────────────────────────────────┐
|
|
396
|
+
│ 6. Monitor & Analyze │
|
|
336
397
|
│ veris run status → check progress │
|
|
337
398
|
│ veris run logs → view events │
|
|
338
399
|
└─────────────────────────────────────────────────────────────┘
|
|
@@ -7,6 +7,28 @@ import httpx
|
|
|
7
7
|
from veris_cli.config import Config
|
|
8
8
|
|
|
9
9
|
|
|
10
|
+
class APIError(Exception):
|
|
11
|
+
"""Raised when the backend returns an error response with details."""
|
|
12
|
+
|
|
13
|
+
def __init__(self, status_code: int, detail: str, url: str):
|
|
14
|
+
self.status_code = status_code
|
|
15
|
+
self.detail = detail
|
|
16
|
+
self.url = url
|
|
17
|
+
super().__init__(f"[{status_code}] {detail} ({url})")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _raise_for_status(response: httpx.Response) -> None:
|
|
21
|
+
"""Like response.raise_for_status() but includes the response body."""
|
|
22
|
+
if response.is_success:
|
|
23
|
+
return
|
|
24
|
+
try:
|
|
25
|
+
body = response.json()
|
|
26
|
+
detail = body.get("detail", response.text)
|
|
27
|
+
except Exception:
|
|
28
|
+
detail = response.text or response.reason_phrase
|
|
29
|
+
raise APIError(response.status_code, detail, str(response.url))
|
|
30
|
+
|
|
31
|
+
|
|
10
32
|
class VerisAPI:
|
|
11
33
|
"""Simple HTTP client for Veris backend API."""
|
|
12
34
|
|
|
@@ -30,7 +52,7 @@ class VerisAPI:
|
|
|
30
52
|
"/v1/environments",
|
|
31
53
|
json={"name": name, "description": description},
|
|
32
54
|
)
|
|
33
|
-
response.raise_for_status()
|
|
55
|
+
_raise_for_status(response)
|
|
34
56
|
return response.json()
|
|
35
57
|
|
|
36
58
|
def create_environment_tag(self, environment_id: str, tag: str = "latest") -> dict[str, Any]:
|
|
@@ -40,7 +62,7 @@ class VerisAPI:
|
|
|
40
62
|
f"/v1/environments/{environment_id}/tags",
|
|
41
63
|
json={"tag": tag},
|
|
42
64
|
)
|
|
43
|
-
response.raise_for_status()
|
|
65
|
+
_raise_for_status(response)
|
|
44
66
|
return response.json()
|
|
45
67
|
|
|
46
68
|
def list_environments(
|
|
@@ -52,33 +74,33 @@ class VerisAPI:
|
|
|
52
74
|
params["status"] = status
|
|
53
75
|
with httpx.Client(base_url=self.base_url, headers=self._headers()) as client:
|
|
54
76
|
response = client.get("/v1/environments", params=params)
|
|
55
|
-
response.raise_for_status()
|
|
77
|
+
_raise_for_status(response)
|
|
56
78
|
return response.json()
|
|
57
79
|
|
|
58
80
|
def delete_environment(self, env_id: str) -> None:
|
|
59
81
|
"""Delete an environment."""
|
|
60
82
|
with httpx.Client(base_url=self.base_url, headers=self._headers()) as client:
|
|
61
83
|
response = client.delete(f"/v1/environments/{env_id}")
|
|
62
|
-
response.raise_for_status()
|
|
84
|
+
_raise_for_status(response)
|
|
63
85
|
|
|
64
86
|
# Scenario Sets
|
|
65
87
|
def list_scenario_sets(
|
|
66
|
-
self,
|
|
88
|
+
self, environment_id: Optional[str] = None, limit: int = 100, skip: int = 0
|
|
67
89
|
) -> list[dict[str, Any]]:
|
|
68
90
|
"""List scenario sets."""
|
|
69
91
|
params = {"limit": limit, "skip": skip}
|
|
70
|
-
if
|
|
71
|
-
params["
|
|
92
|
+
if environment_id:
|
|
93
|
+
params["environment_id"] = environment_id
|
|
72
94
|
with httpx.Client(base_url=self.base_url, headers=self._headers()) as client:
|
|
73
95
|
response = client.get("/v1/scenario-sets", params=params)
|
|
74
|
-
response.raise_for_status()
|
|
96
|
+
_raise_for_status(response)
|
|
75
97
|
return response.json()
|
|
76
98
|
|
|
77
99
|
def get_scenario_set(self, set_id: str) -> dict[str, Any]:
|
|
78
100
|
"""Get scenario set details."""
|
|
79
101
|
with httpx.Client(base_url=self.base_url, headers=self._headers()) as client:
|
|
80
102
|
response = client.get(f"/v1/scenario-sets/{set_id}")
|
|
81
|
-
response.raise_for_status()
|
|
103
|
+
_raise_for_status(response)
|
|
82
104
|
return response.json()
|
|
83
105
|
|
|
84
106
|
# Runs
|
|
@@ -100,7 +122,7 @@ class VerisAPI:
|
|
|
100
122
|
"config": config or {},
|
|
101
123
|
},
|
|
102
124
|
)
|
|
103
|
-
response.raise_for_status()
|
|
125
|
+
_raise_for_status(response)
|
|
104
126
|
return response.json()
|
|
105
127
|
|
|
106
128
|
def list_runs(
|
|
@@ -112,26 +134,92 @@ class VerisAPI:
|
|
|
112
134
|
params["status"] = status
|
|
113
135
|
with httpx.Client(base_url=self.base_url, headers=self._headers()) as client:
|
|
114
136
|
response = client.get("/v1/runs", params=params)
|
|
115
|
-
response.raise_for_status()
|
|
137
|
+
_raise_for_status(response)
|
|
116
138
|
return response.json()
|
|
117
139
|
|
|
118
140
|
def get_run(self, run_id: str) -> dict[str, Any]:
|
|
119
141
|
"""Get run details."""
|
|
120
142
|
with httpx.Client(base_url=self.base_url, headers=self._headers()) as client:
|
|
121
143
|
response = client.get(f"/v1/runs/{run_id}")
|
|
122
|
-
response.raise_for_status()
|
|
144
|
+
_raise_for_status(response)
|
|
123
145
|
return response.json()
|
|
124
146
|
|
|
125
147
|
def cancel_run(self, run_id: str) -> None:
|
|
126
148
|
"""Cancel a run."""
|
|
127
149
|
with httpx.Client(base_url=self.base_url, headers=self._headers()) as client:
|
|
128
150
|
response = client.delete(f"/v1/runs/{run_id}")
|
|
129
|
-
response.raise_for_status()
|
|
151
|
+
_raise_for_status(response)
|
|
130
152
|
|
|
131
153
|
def get_run_events(self, run_id: str, limit: int = 100, offset: int = 0) -> dict[str, Any]:
|
|
132
154
|
"""Get run events/logs."""
|
|
133
155
|
params = {"limit": limit, "offset": offset}
|
|
134
156
|
with httpx.Client(base_url=self.base_url, headers=self._headers()) as client:
|
|
135
157
|
response = client.get(f"/v1/runs/{run_id}/events", params=params)
|
|
136
|
-
response.raise_for_status()
|
|
158
|
+
_raise_for_status(response)
|
|
159
|
+
return response.json()
|
|
160
|
+
|
|
161
|
+
# Scenario Generation
|
|
162
|
+
def generate_scenario_set(
|
|
163
|
+
self,
|
|
164
|
+
environment_id: str,
|
|
165
|
+
num_scenarios: int = 5,
|
|
166
|
+
image_tag: Optional[str] = None,
|
|
167
|
+
) -> dict[str, Any]:
|
|
168
|
+
"""Trigger async scenario + grader generation via K8s job."""
|
|
169
|
+
payload: dict[str, Any] = {
|
|
170
|
+
"environment_id": environment_id,
|
|
171
|
+
"num_scenarios": num_scenarios,
|
|
172
|
+
}
|
|
173
|
+
if image_tag:
|
|
174
|
+
payload["image_tag"] = image_tag
|
|
175
|
+
with httpx.Client(base_url=self.base_url, headers=self._headers(), timeout=30) as client:
|
|
176
|
+
response = client.post("/v1/scenario-sets/generate", json=payload)
|
|
177
|
+
_raise_for_status(response)
|
|
178
|
+
return response.json()
|
|
179
|
+
|
|
180
|
+
# Graders
|
|
181
|
+
def list_graders(
|
|
182
|
+
self,
|
|
183
|
+
environment_id: str,
|
|
184
|
+
scenario_set_id: Optional[str] = None,
|
|
185
|
+
limit: int = 20,
|
|
186
|
+
offset: int = 0,
|
|
187
|
+
) -> dict[str, Any]:
|
|
188
|
+
"""List graders for an environment."""
|
|
189
|
+
params: dict[str, Any] = {
|
|
190
|
+
"environment_id": environment_id,
|
|
191
|
+
"limit": limit,
|
|
192
|
+
"offset": offset,
|
|
193
|
+
}
|
|
194
|
+
if scenario_set_id:
|
|
195
|
+
params["scenario_set_id"] = scenario_set_id
|
|
196
|
+
with httpx.Client(base_url=self.base_url, headers=self._headers()) as client:
|
|
197
|
+
response = client.get("/v1/graders", params=params)
|
|
198
|
+
_raise_for_status(response)
|
|
199
|
+
return response.json()
|
|
200
|
+
|
|
201
|
+
# Evaluations
|
|
202
|
+
def trigger_evaluation(self, run_id: str, grader_id: str) -> dict[str, Any]:
|
|
203
|
+
"""Trigger grading on a completed run."""
|
|
204
|
+
with httpx.Client(base_url=self.base_url, headers=self._headers(), timeout=30) as client:
|
|
205
|
+
response = client.post(
|
|
206
|
+
f"/v1/runs/{run_id}/evaluate",
|
|
207
|
+
params={"grader_id": grader_id},
|
|
208
|
+
)
|
|
209
|
+
_raise_for_status(response)
|
|
210
|
+
return response.json()
|
|
211
|
+
|
|
212
|
+
def list_evaluation_runs(self, run_id: str, limit: int = 20, offset: int = 0) -> dict[str, Any]:
|
|
213
|
+
"""List evaluation runs for a given run."""
|
|
214
|
+
params = {"limit": limit, "offset": offset}
|
|
215
|
+
with httpx.Client(base_url=self.base_url, headers=self._headers()) as client:
|
|
216
|
+
response = client.get(f"/v1/runs/{run_id}/evaluation-runs", params=params)
|
|
217
|
+
_raise_for_status(response)
|
|
218
|
+
return response.json()
|
|
219
|
+
|
|
220
|
+
def get_evaluation_run(self, run_id: str, eval_run_id: str) -> dict[str, Any]:
|
|
221
|
+
"""Get evaluation run details including per-simulation results."""
|
|
222
|
+
with httpx.Client(base_url=self.base_url, headers=self._headers()) as client:
|
|
223
|
+
response = client.get(f"/v1/runs/{run_id}/evaluation-runs/{eval_run_id}")
|
|
224
|
+
_raise_for_status(response)
|
|
137
225
|
return response.json()
|
|
@@ -13,6 +13,7 @@ from pathlib import Path
|
|
|
13
13
|
from urllib.parse import parse_qs, urlparse
|
|
14
14
|
|
|
15
15
|
import click
|
|
16
|
+
import httpx
|
|
16
17
|
|
|
17
18
|
from veris_cli import output, prompts, templates
|
|
18
19
|
from veris_cli.api import VerisAPI
|
|
@@ -224,6 +225,11 @@ def init(name: str):
|
|
|
224
225
|
except ValueError as e:
|
|
225
226
|
output.print_error(str(e))
|
|
226
227
|
output.print_info("You can create the environment later with 'veris env push'")
|
|
228
|
+
except httpx.ConnectError:
|
|
229
|
+
output.print_error(f"Could not connect to backend at {Config().get_backend_url()}")
|
|
230
|
+
output.print_info(
|
|
231
|
+
"Is the backend running? You can create the environment later with 'veris env push'"
|
|
232
|
+
)
|
|
227
233
|
except Exception as e:
|
|
228
234
|
output.print_error(f"Failed to create environment: {e}")
|
|
229
235
|
output.print_info("You can create the environment later with 'veris env push'")
|
|
@@ -275,7 +281,7 @@ def env_build(tag: str, no_cache: bool):
|
|
|
275
281
|
|
|
276
282
|
username = push_creds.get("username", "_token")
|
|
277
283
|
password = push_creds.get("password", "")
|
|
278
|
-
registry = push_creds.get("registry", "
|
|
284
|
+
registry = push_creds.get("registry", "us-docker.pkg.dev")
|
|
279
285
|
|
|
280
286
|
output.print_success(f"Tag created: {tag}")
|
|
281
287
|
output.print_info(f"Building image: {image_uri}\n")
|
|
@@ -346,7 +352,7 @@ def env_push(tag: str, no_cache: bool):
|
|
|
346
352
|
|
|
347
353
|
username = push_creds.get("username", "_token")
|
|
348
354
|
password = push_creds.get("password", "")
|
|
349
|
-
registry = push_creds.get("registry", "
|
|
355
|
+
registry = push_creds.get("registry", "us-docker.pkg.dev")
|
|
350
356
|
|
|
351
357
|
output.print_success(f"Tag created: {tag}")
|
|
352
358
|
output.print_info(f"Building and pushing image: {image_uri}\n")
|
|
@@ -408,12 +414,12 @@ def scenarios():
|
|
|
408
414
|
|
|
409
415
|
|
|
410
416
|
@scenarios.command(name="list")
|
|
411
|
-
@click.option("--
|
|
412
|
-
def scenarios_list(visibility: str):
|
|
417
|
+
@click.option("--env-id", default=None, help="Filter by environment ID")
|
|
418
|
+
def scenarios_list(env_id: str):
|
|
413
419
|
"""List scenarios"""
|
|
414
420
|
try:
|
|
415
421
|
api = VerisAPI()
|
|
416
|
-
result = api.list_scenario_sets(
|
|
422
|
+
result = api.list_scenario_sets(environment_id=env_id)
|
|
417
423
|
output.print_scenario_sets_table(result)
|
|
418
424
|
except ValueError as e:
|
|
419
425
|
output.print_error(str(e))
|
|
@@ -423,6 +429,51 @@ def scenarios_list(visibility: str):
|
|
|
423
429
|
sys.exit(1)
|
|
424
430
|
|
|
425
431
|
|
|
432
|
+
@scenarios.command(name="generate")
|
|
433
|
+
@click.option("--env-id", default=None, help="Environment ID")
|
|
434
|
+
@click.option("--num", default=5, help="Number of scenarios to generate (default: 5)")
|
|
435
|
+
@click.option("--image-tag", default=None, help="Image tag to use (default: latest)")
|
|
436
|
+
def scenarios_generate(env_id: str, num: int, image_tag: str):
|
|
437
|
+
"""Generate scenarios + grader via K8s job.
|
|
438
|
+
|
|
439
|
+
Launches an async job that explores your agent code, generates test
|
|
440
|
+
scenarios and a grader definition. Poll with 'veris scenarios list'
|
|
441
|
+
to check when generation is complete.
|
|
442
|
+
"""
|
|
443
|
+
try:
|
|
444
|
+
api = VerisAPI()
|
|
445
|
+
|
|
446
|
+
if not env_id:
|
|
447
|
+
project_config = ProjectConfig()
|
|
448
|
+
env_id = project_config.get_environment_id()
|
|
449
|
+
|
|
450
|
+
if not env_id:
|
|
451
|
+
result = api.list_environments(status="ready")
|
|
452
|
+
env_id = prompts.select_environment(result.get("environments", []))
|
|
453
|
+
if not env_id:
|
|
454
|
+
output.print_error("No environment selected")
|
|
455
|
+
sys.exit(1)
|
|
456
|
+
|
|
457
|
+
output.print_info(f"Generating {num} scenario(s) for environment {env_id}...")
|
|
458
|
+
result = api.generate_scenario_set(
|
|
459
|
+
environment_id=env_id,
|
|
460
|
+
num_scenarios=num,
|
|
461
|
+
image_tag=image_tag,
|
|
462
|
+
)
|
|
463
|
+
|
|
464
|
+
set_id = result.get("id", "")
|
|
465
|
+
output.print_success(f"Scenario generation started: {set_id}")
|
|
466
|
+
output.print_info("Status: generating")
|
|
467
|
+
output.print_info("Poll with 'veris scenarios list' to check when generation is complete")
|
|
468
|
+
|
|
469
|
+
except ValueError as e:
|
|
470
|
+
output.print_error(str(e))
|
|
471
|
+
sys.exit(1)
|
|
472
|
+
except Exception as e:
|
|
473
|
+
output.print_error(f"Failed to generate scenarios: {e}")
|
|
474
|
+
sys.exit(1)
|
|
475
|
+
|
|
476
|
+
|
|
426
477
|
# Run commands
|
|
427
478
|
@cli.group()
|
|
428
479
|
def run():
|
|
@@ -558,6 +609,161 @@ def run_cancel(run_id: str):
|
|
|
558
609
|
sys.exit(1)
|
|
559
610
|
|
|
560
611
|
|
|
612
|
+
# Evaluation-run commands
|
|
613
|
+
@cli.group(name="evaluation-runs")
|
|
614
|
+
def eval_group():
|
|
615
|
+
"""Evaluation run commands"""
|
|
616
|
+
pass
|
|
617
|
+
|
|
618
|
+
|
|
619
|
+
@eval_group.command(name="create")
|
|
620
|
+
@click.option("--run-id", default=None, help="Run ID to evaluate")
|
|
621
|
+
@click.option("--grader-id", default=None, help="Grader ID to use")
|
|
622
|
+
def eval_create(run_id: str, grader_id: str):
|
|
623
|
+
"""Trigger grading on a completed run.
|
|
624
|
+
|
|
625
|
+
Launches an async K8s grading job that evaluates every simulation
|
|
626
|
+
in the run against the specified grader. Poll with 'veris eval list'
|
|
627
|
+
to check progress.
|
|
628
|
+
"""
|
|
629
|
+
try:
|
|
630
|
+
api = VerisAPI()
|
|
631
|
+
|
|
632
|
+
if not run_id:
|
|
633
|
+
result = api.list_runs(status="completed")
|
|
634
|
+
runs = result.get("runs", [])
|
|
635
|
+
if not runs:
|
|
636
|
+
output.print_error("No completed runs found")
|
|
637
|
+
sys.exit(1)
|
|
638
|
+
choices = [{"id": r.get("id", ""), "title": r.get("id", "")} for r in runs]
|
|
639
|
+
run_id = prompts.select_from_list("Select a completed run:", choices)
|
|
640
|
+
if not run_id:
|
|
641
|
+
output.print_error("No run selected")
|
|
642
|
+
sys.exit(1)
|
|
643
|
+
|
|
644
|
+
if not grader_id:
|
|
645
|
+
run_data = api.get_run(run_id)
|
|
646
|
+
env_id = run_data.get("environment_id")
|
|
647
|
+
if not env_id:
|
|
648
|
+
output.print_error("Could not determine environment from run")
|
|
649
|
+
sys.exit(1)
|
|
650
|
+
graders_result = api.list_graders(environment_id=env_id)
|
|
651
|
+
graders = graders_result.get("graders", [])
|
|
652
|
+
if not graders:
|
|
653
|
+
output.print_error(
|
|
654
|
+
f"No graders found for environment {env_id}. "
|
|
655
|
+
"Generate scenarios first with 'veris scenarios generate'."
|
|
656
|
+
)
|
|
657
|
+
sys.exit(1)
|
|
658
|
+
choices = [
|
|
659
|
+
{
|
|
660
|
+
"id": g.get("id", ""),
|
|
661
|
+
"title": f"{g.get('id', '')} (tags: {g.get('tags', [])})",
|
|
662
|
+
}
|
|
663
|
+
for g in graders
|
|
664
|
+
]
|
|
665
|
+
grader_id = prompts.select_from_list("Select a grader:", choices)
|
|
666
|
+
if not grader_id:
|
|
667
|
+
output.print_error("No grader selected")
|
|
668
|
+
sys.exit(1)
|
|
669
|
+
|
|
670
|
+
output.print_info(f"Triggering evaluation on run {run_id} with grader {grader_id}...")
|
|
671
|
+
result = api.trigger_evaluation(run_id=run_id, grader_id=grader_id)
|
|
672
|
+
|
|
673
|
+
eval_run_id = result.get("evaluation_run_id", "")
|
|
674
|
+
output.print_success(f"Evaluation started: {eval_run_id}")
|
|
675
|
+
output.print_info(f"Check progress with 'veris evaluation-runs list --run-id {run_id}'")
|
|
676
|
+
output.print_info(
|
|
677
|
+
f"View results with 'veris evaluation-runs status --run-id {run_id} {eval_run_id}'"
|
|
678
|
+
)
|
|
679
|
+
|
|
680
|
+
except ValueError as e:
|
|
681
|
+
output.print_error(str(e))
|
|
682
|
+
sys.exit(1)
|
|
683
|
+
except Exception as e:
|
|
684
|
+
output.print_error(f"Failed to create evaluation: {e}")
|
|
685
|
+
sys.exit(1)
|
|
686
|
+
|
|
687
|
+
|
|
688
|
+
@eval_group.command(name="list")
|
|
689
|
+
@click.option("--run-id", required=True, help="Run ID to list evaluations for")
|
|
690
|
+
def eval_list(run_id: str):
|
|
691
|
+
"""List evaluation runs for a given run."""
|
|
692
|
+
try:
|
|
693
|
+
api = VerisAPI()
|
|
694
|
+
result = api.list_evaluation_runs(run_id=run_id)
|
|
695
|
+
output.print_evaluation_runs_table(result.get("evaluation_runs", []))
|
|
696
|
+
except ValueError as e:
|
|
697
|
+
output.print_error(str(e))
|
|
698
|
+
sys.exit(1)
|
|
699
|
+
except Exception as e:
|
|
700
|
+
output.print_error(f"Failed to list evaluations: {e}")
|
|
701
|
+
sys.exit(1)
|
|
702
|
+
|
|
703
|
+
|
|
704
|
+
@eval_group.command(name="status")
|
|
705
|
+
@click.argument("eval_run_id")
|
|
706
|
+
@click.option("--run-id", required=True, help="Parent run ID")
|
|
707
|
+
@click.option("--watch", is_flag=True, help="Poll every 5 seconds until complete")
|
|
708
|
+
def eval_status(eval_run_id: str, run_id: str, watch: bool):
|
|
709
|
+
"""Get evaluation run status and results."""
|
|
710
|
+
try:
|
|
711
|
+
api = VerisAPI()
|
|
712
|
+
|
|
713
|
+
if watch:
|
|
714
|
+
while True:
|
|
715
|
+
data = api.get_evaluation_run(run_id=run_id, eval_run_id=eval_run_id)
|
|
716
|
+
output.print_evaluation_run_details(data)
|
|
717
|
+
|
|
718
|
+
status = data.get("status", "")
|
|
719
|
+
if status in ["completed", "failed"]:
|
|
720
|
+
break
|
|
721
|
+
|
|
722
|
+
time.sleep(5)
|
|
723
|
+
else:
|
|
724
|
+
data = api.get_evaluation_run(run_id=run_id, eval_run_id=eval_run_id)
|
|
725
|
+
output.print_evaluation_run_details(data)
|
|
726
|
+
|
|
727
|
+
except ValueError as e:
|
|
728
|
+
output.print_error(str(e))
|
|
729
|
+
sys.exit(1)
|
|
730
|
+
except Exception as e:
|
|
731
|
+
output.print_error(f"Failed to get evaluation status: {e}")
|
|
732
|
+
sys.exit(1)
|
|
733
|
+
|
|
734
|
+
|
|
735
|
+
# Eval commands (graders)
|
|
736
|
+
@cli.group(name="eval")
|
|
737
|
+
def eval_graders():
|
|
738
|
+
"""Eval commands (graders)"""
|
|
739
|
+
pass
|
|
740
|
+
|
|
741
|
+
|
|
742
|
+
@eval_graders.command(name="list")
|
|
743
|
+
@click.option("--env-id", default=None, help="Environment ID (uses project config if omitted)")
|
|
744
|
+
def graders_list(env_id: str):
|
|
745
|
+
"""List graders for an environment."""
|
|
746
|
+
try:
|
|
747
|
+
api = VerisAPI()
|
|
748
|
+
|
|
749
|
+
if not env_id:
|
|
750
|
+
project_config = ProjectConfig()
|
|
751
|
+
env_id = project_config.get_environment_id()
|
|
752
|
+
|
|
753
|
+
if not env_id:
|
|
754
|
+
output.print_error("No environment ID. Use --env-id or run 'veris init' first.")
|
|
755
|
+
sys.exit(1)
|
|
756
|
+
|
|
757
|
+
result = api.list_graders(environment_id=env_id)
|
|
758
|
+
output.print_graders_table(result.get("graders", []))
|
|
759
|
+
except ValueError as e:
|
|
760
|
+
output.print_error(str(e))
|
|
761
|
+
sys.exit(1)
|
|
762
|
+
except Exception as e:
|
|
763
|
+
output.print_error(f"Failed to list graders: {e}")
|
|
764
|
+
sys.exit(1)
|
|
765
|
+
|
|
766
|
+
|
|
561
767
|
def _load_dotenv(path: Path) -> dict[str, str]:
|
|
562
768
|
"""Parse and return environment variables from a .env file."""
|
|
563
769
|
env = {}
|
|
@@ -50,7 +50,7 @@ def print_scenario_sets_table(scenario_sets: list[dict[str, Any]]) -> None:
|
|
|
50
50
|
table.add_column("ID", style="cyan")
|
|
51
51
|
table.add_column("Title", style="green")
|
|
52
52
|
table.add_column("Scenarios", style="blue")
|
|
53
|
-
table.add_column("
|
|
53
|
+
table.add_column("Environment", style="yellow")
|
|
54
54
|
table.add_column("Description", style="white")
|
|
55
55
|
|
|
56
56
|
for ss in scenario_sets:
|
|
@@ -59,7 +59,7 @@ def print_scenario_sets_table(scenario_sets: list[dict[str, Any]]) -> None:
|
|
|
59
59
|
ss.get("id", ""),
|
|
60
60
|
ss.get("title", ""),
|
|
61
61
|
str(ss.get("scenario_count", 0)),
|
|
62
|
-
ss.get("
|
|
62
|
+
ss.get("environment_id") or "—",
|
|
63
63
|
desc[:50] + "..." if len(desc) > 50 else desc,
|
|
64
64
|
)
|
|
65
65
|
|
|
@@ -119,3 +119,96 @@ def print_run_events(events: list[dict[str, Any]]) -> None:
|
|
|
119
119
|
console.print(
|
|
120
120
|
f"[dim]{timestamp}[/dim] [{level_color}]{service}:{event_type}[/{level_color}] {data}"
|
|
121
121
|
)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def print_evaluation_runs_table(eval_runs: list[dict[str, Any]]) -> None:
    """Render evaluation runs as a table on the console.

    When ``eval_runs`` is empty a short "not found" message is printed
    instead of a table. Each row shows the run ID, grader ID, status, the
    total / completed / failed evaluation counters and the creation
    timestamp. Missing keys fall back to an empty string for text fields
    and to 0 for the counters.
    """
    if not eval_runs:
        console.print("No evaluation runs found.")
        return

    # (header, style) pairs in display order.
    column_specs = (
        ("ID", "cyan"),
        ("Grader ID", "green"),
        ("Status", "yellow"),
        ("Total", "blue"),
        ("Completed", "green"),
        ("Failed", "red"),
        ("Created", "magenta"),
    )

    table = Table(title="Evaluation Runs")
    for header, color in column_specs:
        table.add_column(header, style=color)

    for run in eval_runs:
        run_id = run.get("id", "")
        grader_id = run.get("grader_id", "")
        status = run.get("status", "")
        # Counters are stringified before being handed to the table.
        total = str(run.get("total_evaluations", 0))
        completed = str(run.get("completed_evaluations", 0))
        failed = str(run.get("failed_evaluations", 0))
        created = run.get("created_at", "")
        table.add_row(run_id, grader_id, status, total, completed, failed, created)

    console.print(table)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def print_evaluation_run_details(data: dict[str, Any]) -> None:
    """Print detailed evaluation run info.

    Prints a header block (ID, run, grader, status, progress counters and
    creation time) read from ``data``, followed by a table of the entries
    under ``data["evaluations"]`` when that list is non-empty.

    Args:
        data: Evaluation-run payload. Missing keys fall back to an empty
            string (text fields) or 0 (counters).
    """
    # Imported once per call rather than once per evaluation row.
    import json

    from rich.markup import escape

    console.print("\n[bold]Evaluation Run[/bold]")
    console.print(f"ID: [cyan]{data.get('id', '')}[/cyan]")
    console.print(f"Run: [blue]{data.get('run_id', '')}[/blue]")
    console.print(f"Grader: [green]{data.get('grader_id', '')}[/green]")
    console.print(f"Status: [yellow]{data.get('status', '')}[/yellow]")
    console.print(
        f"Progress: {data.get('completed_evaluations', 0)}"
        f"/{data.get('total_evaluations', 0)} completed"
        f", {data.get('failed_evaluations', 0)} failed"
    )
    console.print(f"Created: [magenta]{data.get('created_at', '')}[/magenta]")

    evaluations = data.get("evaluations", [])
    if not evaluations:
        return

    console.print(f"\n[bold]Evaluations ({len(evaluations)})[/bold]")
    table = Table()
    table.add_column("Simulation ID", style="cyan")
    table.add_column("Status", style="yellow")
    table.add_column("Result", style="white", max_width=60)

    for ev in evaluations:
        result_str = ""
        result = ev.get("result")
        if result:
            # Truncate first, then escape: the JSON text is arbitrary grader
            # output, and string cells are parsed as console markup, so an
            # unescaped "[true, false]" or "[/x]" would be dropped or raise.
            result_str = escape(json.dumps(result, indent=None)[:60])
        table.add_row(
            ev.get("simulation_id", ""),
            ev.get("status", ""),
            result_str,
        )

    console.print(table)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def print_graders_table(graders: list[dict[str, Any]]) -> None:
    """Render graders as a table on the console.

    Prints a short "not found" message when ``graders`` is empty. Otherwise
    each row shows the grader ID, its environment ID, its scenario set ID
    (or "global" when that value is missing or falsy), its tags joined with
    ", ", and the creation timestamp.
    """
    if not graders:
        console.print("No graders found.")
        return

    table = Table(title="Graders")
    for header, color in (
        ("ID", "cyan"),
        ("Environment", "blue"),
        ("Scenario Set", "green"),
        ("Tags", "yellow"),
        ("Created", "magenta"),
    ):
        table.add_column(header, style=color)

    for grader in graders:
        # A missing or null "tags" value is treated as no tags; joining an
        # empty list yields the empty string.
        tag_list = grader.get("tags") or []
        grader_id = grader.get("id", "")
        environment = grader.get("environment_id", "")
        scenario_set = grader.get("scenario_set_id") or "global"
        tag_text = ", ".join(tag_list)
        created = grader.get("created_at", "")
        table.add_row(grader_id, environment, scenario_set, tag_text, created)

    console.print(table)
|
|
@@ -79,3 +79,16 @@ def prompt_environment_name() -> Optional[str]:
|
|
|
79
79
|
).ask()
|
|
80
80
|
|
|
81
81
|
return answer
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def select_from_list(prompt: str, items: list[dict[str, Any]]) -> Optional[str]:
    """Generic interactive selection from a list of {id, title} dicts.

    Args:
        prompt: Question text shown above the choices.
        items: Dicts read via their ``"id"`` and ``"title"`` keys.

    Returns:
        Whatever the interactive select prompt's ``ask()`` yields (the
        ``"id"`` of the chosen item), or ``None`` when ``items`` is empty
        and no prompt is shown.
    """
    if not items:
        return None

    choices = [
        questionary.Choice(
            # Fall back to the id when the title is missing, null or empty,
            # so a choice never shows up with a blank or "None" label.
            title=item.get("title") or item.get("id", ""),
            value=item.get("id", ""),
        )
        for item in items
    ]

    return questionary.select(prompt, choices=choices).ask()
|
|
@@ -27,6 +27,12 @@ TEMP_DOCKER_CONFIG=$(mktemp -d)
|
|
|
27
27
|
export DOCKER_CONFIG="$TEMP_DOCKER_CONFIG"
|
|
28
28
|
trap "rm -rf $TEMP_DOCKER_CONFIG" EXIT
|
|
29
29
|
|
|
30
|
+
# Preserve CLI plugins (buildx, etc.) from the default config directory
|
|
31
|
+
DEFAULT_DOCKER_CONFIG="${HOME}/.docker"
|
|
32
|
+
if [ -d "$DEFAULT_DOCKER_CONFIG/cli-plugins" ]; then
|
|
33
|
+
ln -s "$DEFAULT_DOCKER_CONFIG/cli-plugins" "$TEMP_DOCKER_CONFIG/cli-plugins"
|
|
34
|
+
fi
|
|
35
|
+
|
|
30
36
|
# Login to registry first so we can pull the base image
|
|
31
37
|
echo "Authenticating with Docker registry..."
|
|
32
38
|
echo " Registry: $REGISTRY"
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Static file templates for veris init."""
|
|
2
2
|
|
|
3
3
|
DOCKERFILE_SANDBOX = """# Extends veris-gvisor base with your agent code
|
|
4
|
-
FROM
|
|
4
|
+
FROM us-docker.pkg.dev/veris-ai-dev/veris-sandbox-dev/veris-gvisor:latest
|
|
5
5
|
|
|
6
6
|
# Copy agent code and dependencies
|
|
7
7
|
# NOTE: Build context is project root, so paths are relative to project root
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|