applied-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- applied_cli/__init__.py +2 -0
- applied_cli/auth_store.py +263 -0
- applied_cli/commands/__init__.py +2 -0
- applied_cli/commands/_hints.py +11 -0
- applied_cli/commands/_normalize.py +79 -0
- applied_cli/commands/_parsers.py +58 -0
- applied_cli/commands/_ui.py +33 -0
- applied_cli/commands/agent.py +1231 -0
- applied_cli/commands/auth.py +739 -0
- applied_cli/commands/chat.py +379 -0
- applied_cli/commands/coverage.py +348 -0
- applied_cli/commands/discover.py +1006 -0
- applied_cli/commands/fix.py +1204 -0
- applied_cli/commands/insights.py +614 -0
- applied_cli/commands/intents.py +447 -0
- applied_cli/commands/rate.py +508 -0
- applied_cli/commands/responses.py +604 -0
- applied_cli/commands/shop.py +1757 -0
- applied_cli/commands/simulate.py +330 -0
- applied_cli/commands/spec.py +238 -0
- applied_cli/config.py +50 -0
- applied_cli/error_reporting.py +38 -0
- applied_cli/http.py +1614 -0
- applied_cli/main.py +90 -0
- applied_cli/mcp_server.py +738 -0
- applied_cli/presets/demo.yaml +170 -0
- applied_cli/runtime.py +53 -0
- applied_cli/shop_spec.py +398 -0
- applied_cli/spec_workflow.py +432 -0
- applied_cli-0.1.0.dist-info/METADATA +176 -0
- applied_cli-0.1.0.dist-info/RECORD +34 -0
- applied_cli-0.1.0.dist-info/WHEEL +5 -0
- applied_cli-0.1.0.dist-info/entry_points.txt +3 -0
- applied_cli-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1204 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Benchmark fix workflow commands for AI agents.
|
|
3
|
+
|
|
4
|
+
This module provides commands optimized for AI agents to fix failing test scenarios:
|
|
5
|
+
1. `fix context` - Returns all context needed to understand and fix failing scenarios
|
|
6
|
+
2. `fix test` - Replays a scenario's input message to validate a fix
|
|
7
|
+
3. `fix batch` - Batch test multiple scenarios with retries and parallelism
|
|
8
|
+
4. `fix status` - Track progress between source and target benchmarks
5. `fix summary` - Get a quick summary of a benchmark's pass/fail status
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import random
|
|
13
|
+
import time
|
|
14
|
+
import uuid as uuid_module
|
|
15
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from typing import Any, Optional
|
|
18
|
+
|
|
19
|
+
import httpx
|
|
20
|
+
import typer
|
|
21
|
+
|
|
22
|
+
from applied_cli.commands._parsers import validate_uuid
|
|
23
|
+
from applied_cli.commands.responses import RESPONSE_SEMANTICS
|
|
24
|
+
from applied_cli.error_reporting import render_api_error
|
|
25
|
+
from applied_cli.http import (
|
|
26
|
+
APIError,
|
|
27
|
+
create_conversation_benchmark,
|
|
28
|
+
create_conversation_scenario,
|
|
29
|
+
get_agent,
|
|
30
|
+
get_conversation,
|
|
31
|
+
get_conversation_benchmark,
|
|
32
|
+
get_conversation_scenario,
|
|
33
|
+
list_conversation_scenarios,
|
|
34
|
+
list_responses,
|
|
35
|
+
patch_conversation_scenario,
|
|
36
|
+
)
|
|
37
|
+
from applied_cli.runtime import resolve_runtime
|
|
38
|
+
|
|
39
|
+
# Typer sub-application that groups the benchmark "fix" commands defined in
# this module. The help text doubles as a quick-start: it lists an example
# invocation for the context, test, batch and status commands.
app = typer.Typer(
    help=(
        "Fix failing benchmark scenarios. Optimized for AI agent workflows.\n\n"
        "Get all context needed to fix failures:\n"
        " applied-cli test fix context --benchmark-id <uuid>\n\n"
        "Test a fix by replaying a scenario:\n"
        " applied-cli test fix test --scenario-id <uuid> --benchmark-id <uuid>\n\n"
        "Batch test all failing scenarios:\n"
        " applied-cli test fix batch --source <uuid> --target <uuid>\n\n"
        "Track progress between benchmarks:\n"
        " applied-cli test fix status --source <uuid> --target <uuid>"
    )
)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# Name pools used to synthesize a throwaway contact when the caller supplies none.
_TEST_FIRST_NAMES = ["Alex", "Jordan", "Taylor", "Morgan", "Casey", "Riley", "Quinn", "Avery"]
_TEST_LAST_NAMES = ["Smith", "Johnson", "Williams", "Brown", "Davis", "Miller", "Wilson", "Moore"]


def _generate_test_contact() -> tuple[str, str]:
    """Build a random ``(full name, email)`` pair for a test contact.

    The email has the form ``<first>.<last>.<3 digits>@test.example.com`` with
    the name parts lower-cased.
    """
    # The order of the random draws (first name, last name, number) is kept
    # fixed so a seeded generator yields a stable contact.
    given = random.choice(_TEST_FIRST_NAMES)
    family = random.choice(_TEST_LAST_NAMES)
    suffix = random.randint(100, 999)
    local_part = ".".join((given.lower(), family.lower(), str(suffix)))
    return f"{given} {family}", f"{local_part}@test.example.com"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass
class TestResult:
    """Outcome of replaying one scenario against the agent."""

    # Id of the scenario that was replayed.
    original_scenario_id: str
    # True when the replay completed and a new scenario was recorded.
    success: bool
    # Id of the scenario created in the target benchmark, when one was created.
    new_scenario_id: Optional[str] = None
    # Id of the test conversation used for the replay, when one was created.
    conversation_id: Optional[str] = None
    # Resolution value read back from the conversation (e.g. "escalated").
    resolution: Optional[str] = None
    # Human-readable failure reason; None on success.
    error: Optional[str] = None
    # Zero-based index of the attempt that produced this result.
    retries_used: int = 0
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _extract_user_messages(conversation: dict[str, Any]) -> list[dict[str, str]]:
    """Return the user-authored messages of a conversation, ready for replay.

    Each returned item has a ``content`` key (taken from the message's
    ``content``, falling back to ``text``, then to an empty string) and a
    ``role`` key fixed to ``"user"``. Messages with any other role are dropped.
    """
    return [
        {
            "content": entry.get("content") or entry.get("text") or "",
            "role": "user",
        }
        for entry in (conversation.get("messages") or [])
        if entry.get("role") == "user"
    ]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _get_failing_scenarios(
    *,
    base_url: str,
    shop_id: str,
    api_token: str,
    benchmark_id: str,
    limit: int = 200,
) -> list[dict[str, Any]]:
    """List a benchmark's scenarios and keep only those whose ``pass_status`` is ``"fail"``.

    At most ``limit`` scenarios are requested, newest first (``-created_at``),
    so failures beyond that window are not returned.
    """
    listed = list_conversation_scenarios(
        base_url=base_url,
        shop_id=shop_id,
        api_token=api_token,
        benchmark_id=benchmark_id,
        limit=limit,
        ordering="-created_at",
    )
    failing: list[dict[str, Any]] = []
    for candidate in listed:
        if candidate.get("pass_status") == "fail":
            failing.append(candidate)
    return failing
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _summarize_scenario(scenario: dict[str, Any]) -> dict[str, Any]:
    """Reduce a full scenario payload to the fields an AI agent needs.

    The summary carries the scenario id, name, pass status, feedback (empty
    string when absent), the label / sublabel names of the input conversation,
    the replayable user messages, and the run count (0 when absent).
    """
    conversation = scenario.get("input_conversation") or {}
    replayable = _extract_user_messages(conversation)
    label_info = conversation.get("label") or {}
    sublabel_info = conversation.get("sublabel") or {}

    summary: dict[str, Any] = {"scenario_id": scenario.get("id")}
    summary["name"] = scenario.get("name")
    summary["pass_status"] = scenario.get("pass_status")
    summary["feedback"] = scenario.get("feedback") or ""
    summary["label"] = label_info.get("name")
    summary["sublabel"] = sublabel_info.get("name")
    summary["user_messages"] = replayable
    summary["run_count"] = scenario.get("run_count", 0)
    return summary
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _summarize_response(response: dict[str, Any]) -> dict[str, Any]:
    """Project a response rule onto the handful of fields shown in fix context.

    ``guardrail`` falls back to an empty string and ``active`` defaults to
    True when the key is missing; the other fields are copied as-is.
    """
    summary: dict[str, Any] = {"response_id": response.get("id")}
    # These fields are passed through unchanged under the same key.
    for field in ("type", "question", "answer"):
        summary[field] = response.get(field)
    summary["guardrail"] = response.get("guardrail") or ""
    summary["active"] = response.get("active", True)
    return summary
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _create_test_conversation(
    client: httpx.Client,
    *,
    base_url: str,
    shop_id: str,
    api_token: str,
    agent_id: str,
    channel: str,
    contact_name: str | None = None,
    contact_email: str | None = None,
) -> str:
    """Create a test conversation for replay and return its id.

    Args:
        client: HTTP client used to issue the POST request.
        base_url: API base URL; ``/v1/c/`` is appended to it.
        shop_id: Shop UUID sent in the ``X-Shop-Id`` header.
        api_token: Bearer token sent in the ``Authorization`` header.
        agent_id: Agent the conversation is created for.
        channel: Channel name placed in the conversation context; ``"email"``
            additionally sets the payload ``type`` to ``"email"``.
        contact_name: Optional full name, split into first / last name.
        contact_email: Optional email added to the context and the contact.

    Returns:
        The id of the created conversation, as a string.

    Raises:
        APIError: If the server answers with status >= 400, or if the
            response body is not a JSON object carrying an ``id``.
    """
    # Split on any run of whitespace so "Alex  Smith" does not produce a last
    # name with a leading space; a blank name yields two empty strings.
    name_parts = contact_name.strip().split(None, 1) if contact_name else []
    first_name = name_parts[0] if name_parts else ""
    last_name = name_parts[1] if len(name_parts) > 1 else ""

    # Built separately so the email can be attached without indexing into an
    # ``object``-typed mapping.
    contact: dict[str, object] = {
        "contextFields": {
            "channel": channel,
        },
    }
    context: dict[str, object] = {
        "channel": channel,
        "firstName": first_name,
        "lastName": last_name,
        "contact": contact,
    }
    if contact_email:
        context["email"] = contact_email
        contact["email"] = contact_email

    payload: dict[str, object] = {
        "agent_id": agent_id,
        "is_test": True,
        "metadata": {
            "isTest": True,
            "source": "applied-cli-fix",
            "context": context,
        },
    }
    if channel == "email":
        payload["type"] = "email"

    headers = {
        "Authorization": f"Bearer {api_token}",
        "X-Shop-Id": shop_id,
        "Content-Type": "application/json",
    }

    response = client.post(
        f"{base_url}/v1/c/",
        json=payload,
        headers=headers,
        timeout=10.0,
    )

    if response.status_code >= 400:
        raise APIError(
            f"Failed to create test conversation ({response.status_code})",
            status_code=response.status_code,
            code="CONVERSATION_CREATE_FAILED",
        )

    try:
        data = response.json()
    except ValueError as exc:
        raise APIError(
            "Test conversation response was not valid JSON",
            status_code=response.status_code,
            code="CONVERSATION_CREATE_FAILED",
        ) from exc

    # Without this guard a missing id would be returned as the literal string
    # "None" and silently used as a conversation id by later requests.
    conversation_id = data.get("id") if isinstance(data, dict) else None
    if conversation_id is None or conversation_id == "":
        raise APIError(
            "Test conversation response did not include an id",
            status_code=response.status_code,
            code="CONVERSATION_CREATE_FAILED",
        )
    return str(conversation_id)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _send_message_and_get_response(
    client: httpx.Client,
    *,
    base_url: str,
    shop_id: str,
    api_token: str,
    agent_id: str,
    conversation_id: str,
    message: str,
    timeout: float = 60.0,
) -> str:
    """Send one user message to the agent and return the generated reply text.

    Args:
        client: HTTP client used to issue the POST request.
        base_url: API base URL; ``/v1/agents/<agent_id>/complete/`` is appended.
        shop_id: Shop UUID sent in the ``X-Shop-Id`` header.
        api_token: Bearer token sent in the ``Authorization`` header.
        agent_id: Agent asked to complete the conversation.
        conversation_id: Conversation the message belongs to.
        message: User message text to send.
        timeout: Request timeout in seconds.

    Returns:
        The concatenation of every string ``content`` field found in the
        response body; an empty string when none is found.

    Raises:
        APIError: If the server answers with status >= 400.
    """
    headers = {
        "Authorization": f"Bearer {api_token}",
        "X-Shop-Id": shop_id,
        "Content-Type": "application/json",
    }

    transcript = [
        {
            "id": str(uuid_module.uuid4()),
            "role": "user",
            "content": message,
            "text": message,
            "format": "TEXT",
            "entity": {"type": "user"},
        }
    ]

    payload = {
        "conversation_id": conversation_id,
        "context": "EVALUATE",
        "transcript": transcript,
        "metadata": {
            "source": "applied-cli-fix",
            "isTest": True,
        },
        "draft": False,
    }

    # The whole body is read in one blocking request (no incremental
    # streaming on the client side) and parsed afterwards.
    response = client.post(
        f"{base_url}/v1/agents/{agent_id}/complete/",
        headers=headers,
        json=payload,
        timeout=timeout,
    )

    if response.status_code >= 400:
        raise APIError(
            f"Completion failed ({response.status_code})",
            status_code=response.status_code,
            code="COMPLETION_FAILED",
        )

    # The body is treated as newline-delimited JSON: every line that parses to
    # an object with a string "content" contributes a fragment of the reply.
    fragments: list[str] = []
    for raw_line in response.text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            continue
        # A line can be valid JSON without being an object (e.g. a number or
        # a list); such lines carry no content and must not crash the parse.
        if not isinstance(data, dict):
            continue
        content = data.get("content")
        if isinstance(content, str):
            fragments.append(content)

    return "".join(fragments)
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def _get_conversation_resolution(
    *,
    base_url: str,
    shop_id: str,
    api_token: str,
    conversation_id: str,
) -> str | None:
    """Look up a conversation and return its ``resolution`` field.

    This is a best-effort lookup: any exception raised while fetching or
    reading the conversation is swallowed and reported as ``None``, so callers
    cannot distinguish "no resolution" from "lookup failed".
    """
    try:
        conversation = get_conversation(
            base_url=base_url,
            shop_id=shop_id,
            api_token=api_token,
            conversation_id=conversation_id,
        )
        outcome = conversation.get("resolution")
    except Exception:
        outcome = None
    return outcome
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def _run_single_test(
    *,
    base_url: str,
    shop_id: str,
    api_token: str,
    original_scenario: dict[str, Any],
    target_benchmark_id: str,
    contact_name: str | None = None,
    contact_email: str | None = None,
    auto_pass: bool = False,
    timeout: float = 90.0,
    retries: int = 3,
    expect_escalation: bool | None = None,
    quiet: bool = False,
) -> TestResult:
    """Replay a scenario's first user message against its agent, with retries.

    Each attempt creates a fresh test conversation, sends the message, waits
    briefly, reads the conversation resolution, optionally validates the
    escalation expectation, and finally records a new scenario in the target
    benchmark.

    Args:
        base_url: API base URL.
        shop_id: Shop UUID.
        api_token: API token.
        original_scenario: Full scenario payload to replay; its ``agent`` and
            ``input_conversation`` entries are read.
        target_benchmark_id: Benchmark that receives the new scenario.
        contact_name: Contact name for the test conversation; generated when
            missing.
        contact_email: Contact email for the test conversation; generated when
            missing.
        auto_pass: When true, mark the new scenario as ``pass``.
        timeout: Timeout in seconds for the agent completion request.
        retries: Number of additional attempts after a timeout or network
            error. Negative values are treated as 0.
        expect_escalation: ``True`` requires the resolution to be
            ``"escalated"``, ``False`` requires it not to be, ``None`` skips
            the check.
        quiet: Suppress retry notices.

    Returns:
        A TestResult. Failures never raise; the reason is in ``error``.
    """
    scenario_id = original_scenario.get("id", "")
    agent_data = original_scenario.get("agent") or {}
    agent_id = agent_data.get("id")

    if not agent_id:
        return TestResult(
            original_scenario_id=scenario_id,
            success=False,
            error="Scenario has no associated agent",
        )

    modality = (agent_data.get("modality") or "chat").lower()
    channel = "email" if modality == "email" else "chat"

    input_conv = original_scenario.get("input_conversation") or {}
    user_messages = _extract_user_messages(input_conv)

    if not user_messages:
        return TestResult(
            original_scenario_id=scenario_id,
            success=False,
            error="Scenario has no user messages to replay",
        )

    # Only the first user message is replayed.
    test_message = user_messages[0]["content"]

    # Fill in whichever contact detail the caller left out.
    if not contact_name or not contact_email:
        gen_name, gen_email = _generate_test_contact()
        contact_name = contact_name or gen_name
        contact_email = contact_email or gen_email

    # A negative retry count would otherwise skip the loop entirely and yield
    # a failure result with no error message.
    max_retries = max(retries, 0)
    last_error: str | None = None
    retries_used = 0

    for attempt in range(max_retries + 1):
        retries_used = attempt
        try:
            # The client is only needed for the two raw HTTP calls; release it
            # before sleeping and calling the higher-level API helpers.
            with httpx.Client() as client:
                conversation_id = _create_test_conversation(
                    client,
                    base_url=base_url,
                    shop_id=shop_id,
                    api_token=api_token,
                    agent_id=agent_id,
                    channel=channel,
                    contact_name=contact_name,
                    contact_email=contact_email,
                )

                _send_message_and_get_response(
                    client,
                    base_url=base_url,
                    shop_id=shop_id,
                    api_token=api_token,
                    agent_id=agent_id,
                    conversation_id=conversation_id,
                    message=test_message,
                    timeout=timeout,
                )

            # Brief delay for processing
            time.sleep(2)

            resolution = _get_conversation_resolution(
                base_url=base_url,
                shop_id=shop_id,
                api_token=api_token,
                conversation_id=conversation_id,
            )

            # Validate expectations if set
            if expect_escalation is not None:
                is_escalated = resolution == "escalated"
                mismatch: str | None = None
                if expect_escalation and not is_escalated:
                    mismatch = "Expected escalation but agent responded"
                elif not expect_escalation and is_escalated:
                    mismatch = "Expected response but agent escalated"
                if mismatch is not None:
                    return TestResult(
                        original_scenario_id=scenario_id,
                        success=False,
                        conversation_id=conversation_id,
                        resolution=resolution,
                        error=mismatch,
                        retries_used=retries_used,
                    )

            # Create new scenario in target benchmark
            new_scenario_name = f"[Fix Test] {original_scenario.get('name', 'Unnamed')}"
            new_scenario = create_conversation_scenario(
                base_url=base_url,
                shop_id=shop_id,
                api_token=api_token,
                agent_id=agent_id,
                benchmark_id=target_benchmark_id,
                name=new_scenario_name,
                input_conversation_id=conversation_id,
            )
            new_scenario_id = new_scenario.get("id")

            # Auto-mark as pass if requested
            if auto_pass and new_scenario_id:
                patch_conversation_scenario(
                    base_url=base_url,
                    shop_id=shop_id,
                    api_token=api_token,
                    scenario_id=new_scenario_id,
                    payload={"pass_status": "pass"},
                )

            return TestResult(
                original_scenario_id=scenario_id,
                success=True,
                new_scenario_id=new_scenario_id,
                conversation_id=conversation_id,
                resolution=resolution,
                retries_used=retries_used,
            )

        except httpx.HTTPError as exc:
            # httpx.TimeoutException derives from httpx.HTTPError, so one
            # handler covers both retryable cases.
            timed_out = isinstance(exc, httpx.TimeoutException)
            if timed_out:
                last_error = f"Timeout after {timeout}s"
            else:
                last_error = f"Network error: {exc}"
            if attempt >= max_retries:
                break
            if not quiet:
                reason = "timeout" if timed_out else "error"
                typer.echo(f" Retry {attempt + 1}/{max_retries} after {reason}...")
            time.sleep(2 ** attempt)  # Exponential backoff

        except APIError as exc:
            # Don't retry API errors (likely not transient)
            last_error = f"API error: {exc}"
            break

        except Exception as exc:
            last_error = f"Unexpected error: {exc}"
            break

    return TestResult(
        original_scenario_id=scenario_id,
        success=False,
        error=last_error,
        retries_used=retries_used,
    )
|
|
488
|
+
|
|
489
|
+
|
|
490
|
+
@app.command(
    "context",
    help=(
        "Get all context needed to fix failing scenarios in a benchmark.\n\n"
        "Returns:\n"
        "- All failing scenarios with feedback and user messages\n"
        "- Agent responses (knowledge base)\n"
        "- Agent guardrails\n"
        "- Response type semantics\n"
        "- Instructions for making fixes\n\n"
        "Example: applied-cli test fix context --benchmark-id <uuid>"
    ),
)
def fix_context(
    benchmark_id: str = typer.Option(
        ..., "--benchmark-id", "--benchmark", "--id", help="Benchmark UUID."
    ),
    include_passing: bool = typer.Option(
        False, "--include-passing", help="Include passing scenarios for reference."
    ),
    output_json: bool = typer.Option(False, "--json", help="Emit JSON output."),
    base_url: Optional[str] = typer.Option(None, help="Applied base URL."),
    shop_id: Optional[str] = typer.Option(None, help="Target shop UUID."),
    api_token: Optional[str] = typer.Option(None, help="Applied API token."),
) -> None:
    """Gather and print the context needed to fix a benchmark's failing scenarios.

    Fetches the benchmark, its agent, up to 500 scenarios (with full details
    for each failing one) and up to 500 active responses, then prints either a
    JSON document or a human-readable digest. Exits with code 1 when an API
    call fails.
    """
    validate_uuid(benchmark_id, field_name="benchmark-id")

    try:
        resolved_base_url, resolved_shop_id, resolved_token = resolve_runtime(
            base_url=base_url, shop_id=shop_id, api_token=api_token
        )
        # Connection settings shared by every API helper called below.
        runtime = {
            "base_url": resolved_base_url,
            "shop_id": resolved_shop_id,
            "api_token": resolved_token,
        }

        benchmark = get_conversation_benchmark(**runtime, benchmark_id=benchmark_id)

        agent_id = (benchmark.get("agent") or {}).get("id")
        if not agent_id:
            raise typer.BadParameter("Benchmark has no associated agent.")

        # The benchmark only embeds a reference; fetch the agent itself to get
        # its guardrail.
        agent = get_agent(**runtime, agent_id=agent_id)

        all_scenarios = list_conversation_scenarios(
            **runtime,
            benchmark_id=benchmark_id,
            limit=500,
            ordering="-created_at",
        )

        failing_scenarios = []
        passing_scenarios = []
        for listed in all_scenarios:
            status = listed.get("pass_status")
            if status == "fail":
                # Fetch the full scenario so the input conversation is available.
                detail = get_conversation_scenario(
                    **runtime, scenario_id=listed.get("id")
                )
                failing_scenarios.append(_summarize_scenario(detail))
                continue
            if include_passing and status == "pass":
                passing_scenarios.append(
                    {
                        "scenario_id": listed.get("id"),
                        "name": listed.get("name"),
                        "pass_status": "pass",
                    }
                )

        # Agent responses (knowledge base)
        responses = list_responses(
            **runtime,
            agent_id=agent_id,
            active=True,
            limit=500,
        )

    except APIError as exc:
        typer.echo(render_api_error(exc, action="get fix context"), err=True)
        raise typer.Exit(code=1) from exc

    benchmark_name = benchmark.get("name")
    agent_name = agent.get("name")
    agent_modality = agent.get("modality")
    agent_guardrail = agent.get("guardrail") or ""

    failing_count = len(failing_scenarios)
    passing_count = sum(1 for s in all_scenarios if s.get("pass_status") == "pass")
    unrated_count = sum(1 for s in all_scenarios if s.get("pass_status") is None)

    context: dict[str, Any] = {
        "benchmark": {
            "id": benchmark_id,
            "name": benchmark_name,
            "scenario_count": benchmark.get("scenario_count"),
        },
        "agent": {
            "id": agent_id,
            "name": agent_name,
            "modality": agent_modality,
            "guardrail": agent_guardrail,
        },
        "summary": {
            "total_scenarios": len(all_scenarios),
            "failing_count": failing_count,
            "passing_count": passing_count,
            "unrated_count": unrated_count,
        },
        "failing_scenarios": failing_scenarios,
        "responses": [_summarize_response(item) for item in responses],
        "response_type_semantics": RESPONSE_SEMANTICS,
        "instructions": {
            "overview": (
                "To fix failing scenarios, analyze each failure's feedback and user messages, "
                "then update responses or agent guardrails accordingly."
            ),
            "response_types": {
                "escalation": "Use for messages that should be escalated to humans. Set question to trigger criteria.",
                "exact": "Use for verbatim templated responses. Answer is returned exactly as written.",
                "qa": "Use for knowledge-grounded answers. Answer is used as context for generation.",
                "context": "Use for background knowledge that informs responses broadly.",
            },
            "commands": {
                "add_knowledge": "applied-cli knowledge upsert --agent-id <agent_id> --type <type> --question '<question>' --answer '<answer>' --yes",
                "update_knowledge": "applied-cli knowledge update --response-id <response_id> --answer '<new_answer>' --yes",
                "update_guardrail": "applied-cli agent update --agent-id <agent_id> --guardrail '<guardrail_text>' --yes",
                "test_fix": "applied-cli test fix test --scenario-id <scenario_id> --benchmark-id <new_benchmark_id>",
                "create_benchmark": "applied-cli benchmarks create --agent-id <agent_id> --name 'Fix Validation'",
            },
        },
    }

    if include_passing:
        context["passing_scenarios"] = passing_scenarios

    if output_json:
        typer.echo(json.dumps(context, indent=2, default=str))
        return

    # Human-readable digest.
    typer.echo(f"Benchmark: {benchmark_name} ({benchmark_id})")
    typer.echo(f"Agent: {agent_name} ({agent_id})")
    typer.echo(f"Modality: {agent_modality}")
    typer.echo("")
    typer.echo(
        f"Summary: {failing_count} failing, {passing_count} passing, {unrated_count} unrated"
    )
    typer.echo("")

    if failing_scenarios:
        typer.echo("=== FAILING SCENARIOS ===")
        for entry in failing_scenarios:
            typer.echo(f"\n--- {entry['name']} ---")
            typer.echo(f"ID: {entry['scenario_id']}")
            typer.echo(f"Label: {entry['label']} / {entry['sublabel']}")
            typer.echo(f"Feedback: {entry['feedback'] or '(none)'}")
            typer.echo("User messages:")
            for message in entry["user_messages"]:
                text = message["content"]
                # Long messages are cut to 200 characters for display.
                if len(text) > 200:
                    text = text[:200] + "..."
                typer.echo(f" - {text}")

    typer.echo("\n=== AGENT RESPONSES ===")
    typer.echo(f"Total: {len(responses)} active responses")
    for item in responses[:10]:  # Show first 10
        typer.echo(f" [{item.get('type')}] {str(item.get('question') or '')[:60]}")
    if len(responses) > 10:
        typer.echo(f" ... and {len(responses) - 10} more")

    typer.echo("\n=== AGENT GUARDRAIL ===")
    if not agent_guardrail:
        typer.echo("(no guardrail set)")
    elif len(agent_guardrail) > 500:
        typer.echo(agent_guardrail[:500] + "...")
    else:
        typer.echo(agent_guardrail)

    typer.echo("\nUse --json for full machine-readable output.")
|
|
671
|
+
|
|
672
|
+
|
|
673
|
+
@app.command(
    "test",
    help=(
        "Test a fix by replaying a scenario's input message.\n\n"
        "This command:\n"
        "1. Gets the original scenario's input message\n"
        "2. Sends it to the agent to get a new response\n"
        "3. Creates a new scenario in the target benchmark\n"
        "4. Returns the scenario ID for rating\n\n"
        "Example: applied-cli test fix test --scenario-id <uuid> --benchmark-id <uuid>"
    ),
)
def fix_test(
    scenario_id: str = typer.Option(
        ..., "--scenario-id", "--scenario", help="Original scenario UUID to replay."
    ),
    benchmark_id: str = typer.Option(
        ..., "--benchmark-id", "--benchmark", help="Target benchmark UUID for the new scenario."
    ),
    contact_name: Optional[str] = typer.Option(
        None, "--contact-name", "--name", help="Contact name for the test. Auto-generated if not provided."
    ),
    contact_email: Optional[str] = typer.Option(
        None, "--contact-email", "--email", help="Contact email for the test. Auto-generated if not provided."
    ),
    auto_pass: bool = typer.Option(
        False, "--auto-pass", help="Automatically mark the new scenario as pass."
    ),
    expect_escalation: Optional[bool] = typer.Option(
        None, "--expect-escalation/--expect-response",
        help="Validate escalation behavior. --expect-escalation fails if agent responds, --expect-response fails if agent escalates."
    ),
    timeout: int = typer.Option(
        90, "--timeout", "-t", help="Timeout in seconds for agent response."
    ),
    retries: int = typer.Option(
        2, "--retry", "-r", help="Number of retries on timeout/network errors."
    ),
    feedback: Optional[str] = typer.Option(
        None, "--feedback", help="Feedback to add to the new scenario."
    ),
    quiet: bool = typer.Option(
        False, "--quiet", "-q", help="Minimal output (only show result)."
    ),
    output_json: bool = typer.Option(False, "--json", help="Emit JSON output."),
    base_url: Optional[str] = typer.Option(None, help="Applied base URL."),
    shop_id: Optional[str] = typer.Option(None, help="Target shop UUID."),
    api_token: Optional[str] = typer.Option(None, help="Applied API token."),
) -> None:
    """Replay one scenario against its agent and record the result.

    Fetches the original scenario, runs the replay (with retries), optionally
    attaches feedback to the newly created scenario, and prints the outcome as
    JSON, a one-line quiet result, or a human-readable report.

    Exit status is 1 when the original scenario cannot be fetched or when the
    replay fails, regardless of the output format. With ``--json`` stdout
    carries only the JSON document.
    """
    validate_uuid(scenario_id, field_name="scenario-id")
    validate_uuid(benchmark_id, field_name="benchmark-id")

    # Progress lines are written to stdout; emitting them alongside --json
    # would make the output unparseable, so JSON mode implies no chatter.
    show_progress = not quiet and not output_json

    try:
        resolved_base_url, resolved_shop_id, resolved_token = resolve_runtime(
            base_url=base_url, shop_id=shop_id, api_token=api_token
        )

        # Get the original scenario
        original_scenario = get_conversation_scenario(
            base_url=resolved_base_url,
            shop_id=resolved_shop_id,
            api_token=resolved_token,
            scenario_id=scenario_id,
        )

    except APIError as exc:
        typer.echo(render_api_error(exc, action="get original scenario"), err=True)
        raise typer.Exit(code=1) from exc

    if show_progress:
        input_conv = original_scenario.get("input_conversation") or {}
        user_messages = _extract_user_messages(input_conv)
        test_message = user_messages[0]["content"] if user_messages else ""
        # Only add the ellipsis when the preview was actually truncated.
        preview = test_message[:100] + ("..." if len(test_message) > 100 else "")
        typer.echo(f"Replaying scenario: {original_scenario.get('name')}")
        typer.echo(f"Message: {preview}")
        typer.echo("")
        typer.echo("Sending message to agent...")

    # Run the test with retry logic
    test_result = _run_single_test(
        base_url=resolved_base_url,
        shop_id=resolved_shop_id,
        api_token=resolved_token,
        original_scenario=original_scenario,
        target_benchmark_id=benchmark_id,
        contact_name=contact_name,
        contact_email=contact_email,
        auto_pass=auto_pass,
        timeout=float(timeout),
        retries=retries,
        expect_escalation=expect_escalation,
        # Retry notices also go to stdout, so silence them in JSON mode too.
        quiet=not show_progress,
    )

    # Handle feedback update if provided
    if test_result.success and feedback and test_result.new_scenario_id:
        try:
            patch_conversation_scenario(
                base_url=resolved_base_url,
                shop_id=resolved_shop_id,
                api_token=resolved_token,
                scenario_id=test_result.new_scenario_id,
                payload={"feedback": feedback},
            )
        except APIError as exc:
            # Non-critical: the scenario exists, only the feedback is missing.
            # Report it on stderr instead of dropping it silently.
            typer.echo(
                f"Warning: could not attach feedback to scenario "
                f"{test_result.new_scenario_id}: {exc}",
                err=True,
            )

    result = {
        "result": "success" if test_result.success else "failed",
        "original_scenario_id": scenario_id,
        "new_scenario_id": test_result.new_scenario_id,
        "benchmark_id": benchmark_id,
        "conversation_id": test_result.conversation_id,
        "resolution": test_result.resolution,
        "pass_status": "pass" if auto_pass and test_result.success else None,
        "retries_used": test_result.retries_used,
        "error": test_result.error,
    }

    if output_json:
        typer.echo(json.dumps(result, indent=2, default=str))
        # A failed replay must be visible in the exit status in JSON mode as
        # well, matching the text-mode behavior.
        if not test_result.success:
            raise typer.Exit(code=1)
        return

    if test_result.success:
        if quiet:
            typer.echo(f"✓ {test_result.new_scenario_id}")
        else:
            typer.echo(f"\nCreated new scenario: {test_result.new_scenario_id}")
            typer.echo(f"In benchmark: {benchmark_id}")
            typer.echo(f"Resolution: {test_result.resolution or 'answered'}")
            if auto_pass:
                typer.echo("Status: pass (auto-marked)")
            else:
                typer.echo("Status: unrated")
                typer.echo(f"\nTo rate: applied-cli scenarios update --scenario-id {test_result.new_scenario_id} --pass-status pass")
        return

    if quiet:
        typer.echo(f"✗ {scenario_id}: {test_result.error}")
    else:
        typer.echo(f"\nTest failed: {test_result.error}", err=True)
        if test_result.retries_used > 0:
            typer.echo(f"Retries used: {test_result.retries_used}", err=True)
    raise typer.Exit(code=1)
|
|
814
|
+
|
|
815
|
+
|
|
816
|
+
@app.command(
    "summary",
    help=(
        "Get a quick summary of a benchmark's pass/fail status.\n\n"
        "Example: applied-cli test fix summary --benchmark-id <uuid>"
    ),
)
def fix_summary(
    benchmark_id: str = typer.Option(
        ..., "--benchmark-id", "--benchmark", "--id", help="Benchmark UUID."
    ),
    output_json: bool = typer.Option(False, "--json", help="Emit JSON output."),
    base_url: Optional[str] = typer.Option(None, help="Applied base URL."),
    shop_id: Optional[str] = typer.Option(None, help="Target shop UUID."),
    api_token: Optional[str] = typer.Option(None, help="Applied API token."),
) -> None:
    """Report pass / fail / unrated counts for a single benchmark.

    Fetches the benchmark record and up to 500 of its scenarios, buckets the
    scenarios by their ``pass_status`` field and prints either a JSON document
    (``--json``) or a short human-readable report that also lists the ids of
    the failing scenarios.

    Raises:
        typer.Exit: with code 1 when an ``APIError`` is raised while resolving
            the runtime or fetching data.
    """
    validate_uuid(benchmark_id, field_name="benchmark-id")

    try:
        runtime_url, runtime_shop, runtime_token = resolve_runtime(
            base_url=base_url, shop_id=shop_id, api_token=api_token
        )
        connection = {
            "base_url": runtime_url,
            "shop_id": runtime_shop,
            "api_token": runtime_token,
        }
        benchmark = get_conversation_benchmark(
            benchmark_id=benchmark_id, **connection
        )
        scenarios = list_conversation_scenarios(
            benchmark_id=benchmark_id, limit=500, **connection
        )
    except APIError as exc:
        typer.echo(render_api_error(exc, action="get benchmark summary"), err=True)
        raise typer.Exit(code=1) from exc

    # One pass over the scenarios; statuses other than "pass", "fail" or None
    # count towards the total only.
    pass_total = 0
    unrated_total = 0
    failing_ids = []
    for entry in scenarios:
        verdict = entry.get("pass_status")
        if verdict == "pass":
            pass_total += 1
        elif verdict == "fail":
            failing_ids.append(entry.get("id"))
        elif verdict is None:
            unrated_total += 1

    if scenarios:
        pass_rate = f"{pass_total / len(scenarios) * 100:.1f}%"
    else:
        pass_rate = "N/A"  # avoid dividing by zero on an empty benchmark

    summary = {
        "benchmark_id": benchmark_id,
        "benchmark_name": benchmark.get("name"),
        "total": len(scenarios),
        "passing": pass_total,
        "failing": len(failing_ids),
        "unrated": unrated_total,
        "pass_rate": pass_rate,
        "failing_scenario_ids": failing_ids,
    }

    if output_json:
        typer.echo(json.dumps(summary, indent=2, default=str))
        return

    typer.echo(f"Benchmark: {summary['benchmark_name']}")
    typer.echo(f"Total: {summary['total']} scenarios")
    typer.echo(f"Passing: {summary['passing']} ({summary['pass_rate']})")
    typer.echo(f"Failing: {summary['failing']}")
    typer.echo(f"Unrated: {summary['unrated']}")
    if failing_ids:
        typer.echo("\nFailing scenario IDs:")
        for failing_id in failing_ids:
            typer.echo(f" - {failing_id}")
|
|
885
|
+
|
|
886
|
+
|
|
887
|
+
@app.command(
    "batch",
    help=(
        "Batch test all failing scenarios from a source benchmark.\n\n"
        "This command:\n"
        "1. Fetches all failing scenarios from the source benchmark\n"
        "2. Tests each one with retries and parallelism\n"
        "3. Creates new scenarios in the target benchmark\n"
        "4. Reports overall progress\n\n"
        "Example: applied-cli test fix batch --source <uuid> --target <uuid> --auto-pass"
    ),
)
def fix_batch(
    source_benchmark_id: str = typer.Option(
        ..., "--source", "--source-benchmark", help="Source benchmark UUID with failing scenarios."
    ),
    target_benchmark_id: str = typer.Option(
        ..., "--target", "--target-benchmark", help="Target benchmark UUID for new scenarios."
    ),
    pass_status_filter: str = typer.Option(
        "fail", "--pass-status", help="Filter scenarios by pass status: fail, pass, unrated, or all."
    ),
    auto_pass: bool = typer.Option(
        False, "--auto-pass", help="Automatically mark successful tests as pass."
    ),
    timeout: int = typer.Option(
        90, "--timeout", "-t", help="Timeout in seconds per test."
    ),
    retries: int = typer.Option(
        2, "--retry", "-r", help="Number of retries per test."
    ),
    parallel: int = typer.Option(
        1, "--parallel", "-p", help="Number of parallel tests (1-10)."
    ),
    limit: int = typer.Option(
        0, "--limit", "-l", help="Max scenarios to test (0 = all)."
    ),
    continue_on_error: bool = typer.Option(
        True, "--continue-on-error/--stop-on-error",
        help="Continue testing even if some scenarios fail."
    ),
    output_json: bool = typer.Option(False, "--json", help="Emit JSON output."),
    base_url: Optional[str] = typer.Option(None, help="Applied base URL."),
    shop_id: Optional[str] = typer.Option(None, help="Target shop UUID."),
    api_token: Optional[str] = typer.Option(None, help="Applied API token."),
) -> None:
    """Re-run a filtered set of source-benchmark scenarios against a target benchmark.

    Workflow:
      1. List up to 500 scenarios of the source benchmark and keep those whose
         ``pass_status`` matches ``--pass-status`` ("unrated" matches ``None``,
         "all" keeps everything), optionally truncated to ``--limit``.
      2. Fetch the full record of every selected scenario.
      3. Run each one through ``_run_single_test``, sequentially or on a
         thread pool of ``--parallel`` workers (clamped to 1-10).
      4. Print a results banner and, with ``--json``, a JSON summary.

    With ``--json`` all human-readable progress is written to stderr so that
    stdout carries nothing but the JSON document.

    With ``--stop-on-error`` the run stops after the first failure: the
    sequential loop breaks, and in parallel mode every not-yet-started test is
    cancelled (tests already running are allowed to finish and are reported).

    Raises:
        typer.Exit: code 1 when an ``APIError`` occurs while fetching
            scenarios, or when at least one test failed under
            ``--stop-on-error``; code 0 when no scenario matches the filter.
    """
    validate_uuid(source_benchmark_id, field_name="source-benchmark")
    validate_uuid(target_benchmark_id, field_name="target-benchmark")

    # Clamp parallel workers to the documented 1-10 range.
    parallel = max(1, min(10, parallel))

    def report(message: str = "", nl: bool = True) -> None:
        # Progress text must not pollute stdout when the caller asked for
        # JSON, otherwise the output cannot be parsed.
        typer.echo(message, nl=nl, err=output_json)

    def emit_json(collected, succeeded, failed) -> None:
        payload = {
            "source_benchmark_id": source_benchmark_id,
            "target_benchmark_id": target_benchmark_id,
            "total_tested": len(collected),
            "success": succeeded,
            "failed": failed,
            "results": [
                {
                    "original_id": r.original_scenario_id,
                    "new_id": r.new_scenario_id,
                    "success": r.success,
                    "resolution": r.resolution,
                    "error": r.error,
                }
                for r in collected
            ],
        }
        typer.echo(json.dumps(payload, indent=2, default=str))

    def display_name(scenario: dict[str, Any]) -> str:
        # ``name`` may be absent *or* explicitly null; both fall back to
        # "Unnamed" instead of crashing on ``None[:40]``.
        return str(scenario.get("name") or "Unnamed")[:40]

    try:
        resolved_base_url, resolved_shop_id, resolved_token = resolve_runtime(
            base_url=base_url, shop_id=shop_id, api_token=api_token
        )

        # Get source scenarios (a single request capped at 500 entries).
        all_scenarios = list_conversation_scenarios(
            base_url=resolved_base_url,
            shop_id=resolved_shop_id,
            api_token=resolved_token,
            benchmark_id=source_benchmark_id,
            limit=500,
        )

        # Filter by pass status
        if pass_status_filter == "all":
            scenarios_to_test = all_scenarios
        else:
            filter_value = None if pass_status_filter == "unrated" else pass_status_filter
            scenarios_to_test = [
                s for s in all_scenarios if s.get("pass_status") == filter_value
            ]

        # Apply limit
        if limit > 0:
            scenarios_to_test = scenarios_to_test[:limit]

        if not scenarios_to_test:
            report(f"No scenarios found with pass_status={pass_status_filter}")
            if output_json:
                # Keep the JSON contract even when there is nothing to test.
                emit_json([], 0, 0)
            raise typer.Exit(code=0)

        # Fetch full scenario details for each
        full_scenarios = [
            get_conversation_scenario(
                base_url=resolved_base_url,
                shop_id=resolved_shop_id,
                api_token=resolved_token,
                scenario_id=scenario.get("id"),
            )
            for scenario in scenarios_to_test
        ]

    except APIError as exc:
        typer.echo(render_api_error(exc, action="get scenarios"), err=True)
        raise typer.Exit(code=1) from exc

    total = len(full_scenarios)
    report(f"Testing {total} scenarios...")
    report(f" Source: {source_benchmark_id}")
    report(f" Target: {target_benchmark_id}")
    report(f" Parallel: {parallel}, Retries: {retries}, Timeout: {timeout}s")
    report("")

    results: list[TestResult] = []
    success_count = 0
    fail_count = 0

    def run_test(scenario: dict[str, Any]) -> TestResult:
        return _run_single_test(
            base_url=resolved_base_url,
            shop_id=resolved_shop_id,
            api_token=resolved_token,
            original_scenario=scenario,
            target_benchmark_id=target_benchmark_id,
            auto_pass=auto_pass,
            timeout=float(timeout),
            retries=retries,
            quiet=True,
        )

    if parallel == 1:
        # Sequential execution with progress
        for i, scenario in enumerate(full_scenarios):
            report(f"[{i+1}/{total}] {display_name(scenario)}...", nl=False)

            result = run_test(scenario)
            results.append(result)

            if result.success:
                success_count += 1
                report(f" ✓ {result.resolution or 'answered'}")
            else:
                fail_count += 1
                report(f" ✗ {result.error}")
                if not continue_on_error:
                    break
    else:
        # Parallel execution
        with ThreadPoolExecutor(max_workers=parallel) as executor:
            future_to_scenario = {
                executor.submit(run_test, s): s for s in full_scenarios
            }

            completed = 0
            for future in as_completed(future_to_scenario):
                if future.cancelled():
                    # Dropped by --stop-on-error before it ever started.
                    continue

                completed += 1
                scenario = future_to_scenario[future]
                name = display_name(scenario)

                try:
                    result = future.result()
                except Exception as exc:
                    # Worker boundary: record the crash as a failed test
                    # rather than aborting the whole batch.
                    failed = True
                    results.append(TestResult(
                        original_scenario_id=scenario.get("id", ""),
                        success=False,
                        error=str(exc),
                    ))
                    report(f"[{completed}/{total}] ✗ {name}: {exc}")
                else:
                    results.append(result)
                    failed = not result.success
                    if failed:
                        report(f"[{completed}/{total}] ✗ {name}: {result.error}")
                    else:
                        report(f"[{completed}/{total}] ✓ {name}")

                if failed:
                    fail_count += 1
                    if not continue_on_error:
                        # cancel() only affects futures that have not started;
                        # running ones finish and are still reported above.
                        for pending in future_to_scenario:
                            pending.cancel()
                else:
                    success_count += 1

    report("")
    report("=== Results ===")
    report(f"Tested: {len(results)}")
    report(f"Success: {success_count}")
    report(f"Failed: {fail_count}")

    if output_json:
        emit_json(results, success_count, fail_count)

    if fail_count > 0 and not continue_on_error:
        raise typer.Exit(code=1)
|
|
1081
|
+
|
|
1082
|
+
|
|
1083
|
+
@app.command(
    "status",
    help=(
        "Track fix progress between source and target benchmarks.\n\n"
        "Shows which scenarios from the source have been tested in the target.\n\n"
        "Example: applied-cli test fix status --source <uuid> --target <uuid>"
    ),
)
def fix_status(
    source_benchmark_id: str = typer.Option(
        ..., "--source", "--source-benchmark", help="Source benchmark UUID with original scenarios."
    ),
    target_benchmark_id: str = typer.Option(
        ..., "--target", "--target-benchmark", help="Target benchmark UUID with test results."
    ),
    output_json: bool = typer.Option(False, "--json", help="Emit JSON output."),
    base_url: Optional[str] = typer.Option(None, help="Applied base URL."),
    shop_id: Optional[str] = typer.Option(None, help="Target shop UUID."),
    api_token: Optional[str] = typer.Option(None, help="Applied API token."),
) -> None:
    """Compare a source benchmark with a target benchmark and report progress.

    Both benchmark records and up to 500 scenarios of each are fetched. The
    scenarios are bucketed by ``pass_status`` and progress is expressed as the
    number of target scenarios relative to the number of failing source
    scenarios. Output is a JSON document with ``--json``, otherwise a short
    text report.

    Raises:
        typer.Exit: with code 1 when an ``APIError`` is raised while resolving
            the runtime or fetching data.
    """
    validate_uuid(source_benchmark_id, field_name="source-benchmark")
    validate_uuid(target_benchmark_id, field_name="target-benchmark")

    try:
        runtime_url, runtime_shop, runtime_token = resolve_runtime(
            base_url=base_url, shop_id=shop_id, api_token=api_token
        )
        connection = {
            "base_url": runtime_url,
            "shop_id": runtime_shop,
            "api_token": runtime_token,
        }

        # Get both benchmarks
        source_benchmark = get_conversation_benchmark(
            benchmark_id=source_benchmark_id, **connection
        )
        target_benchmark = get_conversation_benchmark(
            benchmark_id=target_benchmark_id, **connection
        )

        # Get scenarios from both
        source_scenarios = list_conversation_scenarios(
            benchmark_id=source_benchmark_id, limit=500, **connection
        )
        target_scenarios = list_conversation_scenarios(
            benchmark_id=target_benchmark_id, limit=500, **connection
        )
    except APIError as exc:
        typer.echo(render_api_error(exc, action="get benchmark status"), err=True)
        raise typer.Exit(code=1) from exc

    def tally(scenarios):
        """Return (passing, failing, unrated) counts for a scenario list."""
        n_pass = n_fail = n_unrated = 0
        for item in scenarios:
            verdict = item.get("pass_status")
            if verdict == "pass":
                n_pass += 1
            elif verdict == "fail":
                n_fail += 1
            elif verdict is None:
                n_unrated += 1
        return n_pass, n_fail, n_unrated

    source_pass, source_fail, source_unrated = tally(source_scenarios)
    target_pass, target_fail, target_unrated = tally(target_scenarios)

    # Every scenario present in the target counts as "tested".
    tested = len(target_scenarios)

    # Guard both ratios against an empty denominator.
    if source_fail > 0:
        progress_pct = tested / source_fail * 100
    else:
        progress_pct = 0
    if tested > 0:
        pass_rate = target_pass / tested * 100
    else:
        pass_rate = 0

    source_info = {
        "benchmark_id": source_benchmark_id,
        "name": source_benchmark.get("name"),
        "total": len(source_scenarios),
        "failing": source_fail,
        "passing": source_pass,
        "unrated": source_unrated,
    }
    target_info = {
        "benchmark_id": target_benchmark_id,
        "name": target_benchmark.get("name"),
        "total": tested,
        "passing": target_pass,
        "failing": target_fail,
        "unrated": target_unrated,
    }
    progress_info = {
        "tested": tested,
        "of_source_failing": source_fail,
        "progress_pct": round(progress_pct, 1),
        "pass_rate": round(pass_rate, 1),
        "remaining": max(0, source_fail - tested),
    }
    status = {
        "source": source_info,
        "target": target_info,
        "progress": progress_info,
    }

    if output_json:
        typer.echo(json.dumps(status, indent=2, default=str))
        return

    report_lines = [
        "=== Fix Progress ===",
        "",
        f"Source: {source_info['name']}",
        f" Total: {source_info['total']}, Failing: {source_info['failing']}",
        "",
        f"Target: {target_info['name']}",
        f" Tested: {target_info['total']}, Passing: {target_info['passing']}",
        "",
        f"Progress: {progress_info['tested']}/{progress_info['of_source_failing']} ({progress_info['progress_pct']}%)",
        f"Pass Rate: {progress_info['pass_rate']}%",
        f"Remaining: {progress_info['remaining']} scenarios",
    ]
    for line in report_lines:
        typer.echo(line)
|