applied-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1204 @@
1
+ """
2
+ Benchmark fix workflow commands for AI agents.
3
+
4
+ This module provides commands optimized for AI agents to fix failing test scenarios:
5
+ 1. `fix context` - Returns all context needed to understand and fix failing scenarios
6
+ 2. `fix test` - Replays a scenario's input message to validate a fix
7
+ 3. `fix batch` - Batch test multiple scenarios with retries and parallelism
8
+ 4. `fix status` - Track progress between source and target benchmarks
9
+ """
10
+
11
+ import json
12
+ import random
13
+ import time
14
+ import uuid as uuid_module
15
+ from concurrent.futures import ThreadPoolExecutor, as_completed
16
+ from dataclasses import dataclass
17
+ from typing import Any, Optional
18
+
19
+ import httpx
20
+ import typer
21
+
22
+ from applied_cli.commands._parsers import validate_uuid
23
+ from applied_cli.commands.responses import RESPONSE_SEMANTICS
24
+ from applied_cli.error_reporting import render_api_error
25
+ from applied_cli.http import (
26
+ APIError,
27
+ create_conversation_benchmark,
28
+ create_conversation_scenario,
29
+ get_agent,
30
+ get_conversation,
31
+ get_conversation_benchmark,
32
+ get_conversation_scenario,
33
+ list_conversation_scenarios,
34
+ list_responses,
35
+ patch_conversation_scenario,
36
+ )
37
+ from applied_cli.runtime import resolve_runtime
38
+
39
# Typer sub-application exposing the "fix" workflow commands.
# The help text is a list of paragraphs separated by blank lines: a headline
# followed by one "description + example invocation" paragraph per command.
app = typer.Typer(
    help="\n\n".join(
        (
            "Fix failing benchmark scenarios. Optimized for AI agent workflows.",
            "Get all context needed to fix failures:\n"
            " applied-cli test fix context --benchmark-id <uuid>",
            "Test a fix by replaying a scenario:\n"
            " applied-cli test fix test --scenario-id <uuid> --benchmark-id <uuid>",
            "Batch test all failing scenarios:\n"
            " applied-cli test fix batch --source <uuid> --target <uuid>",
            "Track progress between benchmarks:\n"
            " applied-cli test fix status --source <uuid> --target <uuid>",
        )
    )
)
52
+
53
+
54
# Name pools used when a test contact has to be invented on the fly.
_TEST_FIRST_NAMES = [
    "Alex",
    "Jordan",
    "Taylor",
    "Morgan",
    "Casey",
    "Riley",
    "Quinn",
    "Avery",
]
_TEST_LAST_NAMES = [
    "Smith",
    "Johnson",
    "Williams",
    "Brown",
    "Davis",
    "Miller",
    "Wilson",
    "Moore",
]


def _generate_test_contact() -> tuple[str, str]:
    """Invent a throwaway contact for a test conversation.

    Returns:
        A ``(name, email)`` pair. The name is "<first> <last>" drawn from the
        module-level name pools; the email is
        "<first>.<last>.<3-digit number>@test.example.com" in lower case.
    """
    given = random.choice(_TEST_FIRST_NAMES)
    family = random.choice(_TEST_LAST_NAMES)
    # Numeric suffix (100-999) to make address collisions less likely.
    suffix = random.randint(100, 999)
    local_part = ".".join((given.lower(), family.lower(), str(suffix)))
    return f"{given} {family}", f"{local_part}@test.example.com"
66
+
67
+
68
@dataclass
class TestResult:
    """Outcome of replaying one scenario.

    Only ``original_scenario_id`` and ``success`` are mandatory; every other
    field defaults to "not available" (``None``) or zero.
    """

    # ID of the scenario whose input message was replayed.
    original_scenario_id: str
    # True when the replay completed and the new scenario was created.
    success: bool
    # ID of the scenario created in the target benchmark, if any.
    new_scenario_id: Optional[str] = None
    # ID of the test conversation created for the replay, if any.
    conversation_id: Optional[str] = None
    # Resolution value read back from the conversation, if any.
    resolution: Optional[str] = None
    # Human-readable failure description; None on success.
    error: Optional[str] = None
    # Zero-based index of the last attempt that was made.
    retries_used: int = 0
79
+
80
+
81
def _extract_user_messages(conversation: dict[str, Any]) -> list[dict[str, str]]:
    """Collect the user-authored messages of a conversation, in order.

    Args:
        conversation: Conversation payload; its ``"messages"`` entry may be
            missing or ``None``.

    Returns:
        One ``{"content": ..., "role": "user"}`` dict per message whose role
        is ``"user"``. The content falls back from ``"content"`` to ``"text"``
        to an empty string.
    """
    return [
        {
            "content": entry.get("content") or entry.get("text") or "",
            "role": "user",
        }
        for entry in (conversation.get("messages") or [])
        if entry.get("role") == "user"
    ]
92
+
93
+
94
def _get_failing_scenarios(
    *,
    base_url: str,
    shop_id: str,
    api_token: str,
    benchmark_id: str,
    limit: int = 200,
) -> list[dict[str, Any]]:
    """Return the failing scenarios of a benchmark.

    Scenarios are listed with ``ordering="-created_at"`` and capped at
    ``limit``; the ``pass_status == "fail"`` filter is applied afterwards to
    whatever that listing returned.
    """
    listed = list_conversation_scenarios(
        base_url=base_url,
        shop_id=shop_id,
        api_token=api_token,
        benchmark_id=benchmark_id,
        limit=limit,
        ordering="-created_at",
    )
    failing: list[dict[str, Any]] = []
    for candidate in listed:
        if candidate.get("pass_status") == "fail":
            failing.append(candidate)
    return failing
112
+
113
+
114
def _summarize_scenario(scenario: dict[str, Any]) -> dict[str, Any]:
    """Reduce a full scenario payload to the fields an AI agent needs.

    The result carries the scenario id, name, pass status, feedback (empty
    string when absent), label/sublabel names taken from the input
    conversation, the replayable user messages and the run count (0 when
    absent).
    """
    conversation = scenario.get("input_conversation") or {}
    label_info = conversation.get("label") or {}
    sublabel_info = conversation.get("sublabel") or {}

    summary: dict[str, Any] = {
        "scenario_id": scenario.get("id"),
        "name": scenario.get("name"),
        "pass_status": scenario.get("pass_status"),
        "feedback": scenario.get("feedback") or "",
        "label": label_info.get("name"),
        "sublabel": sublabel_info.get("name"),
        "user_messages": _extract_user_messages(conversation),
        "run_count": scenario.get("run_count", 0),
    }
    return summary
129
+
130
+
131
def _summarize_response(response: dict[str, Any]) -> dict[str, Any]:
    """Reduce a response rule payload to its essential fields.

    ``guardrail`` is normalised to an empty string when missing or falsy and
    ``active`` defaults to ``True`` when the key is absent.
    """
    # (output key, source key) pairs that are copied through unchanged.
    passthrough = (
        ("response_id", "id"),
        ("type", "type"),
        ("question", "question"),
        ("answer", "answer"),
    )
    summary: dict[str, Any] = {
        out_key: response.get(src_key) for out_key, src_key in passthrough
    }
    summary["guardrail"] = response.get("guardrail") or ""
    summary["active"] = response.get("active", True)
    return summary
141
+
142
+
143
def _create_test_conversation(
    client: httpx.Client,
    *,
    base_url: str,
    shop_id: str,
    api_token: str,
    agent_id: str,
    channel: str,
    contact_name: str | None = None,
    contact_email: str | None = None,
) -> str:
    """Create a test conversation that a scenario can be replayed into.

    Args:
        client: HTTP client used for the request.
        base_url: API base URL; the request goes to ``{base_url}/v1/c/``.
        shop_id: Sent as the ``X-Shop-Id`` header.
        api_token: Sent as a bearer token.
        agent_id: Agent the conversation is created for.
        channel: Channel name stored in the conversation context; the value
            ``"email"`` additionally sets the payload ``type`` to ``"email"``.
        contact_name: Optional "First Last" name; split on the first space.
        contact_email: Optional contact email added to the context.

    Returns:
        The ID of the created conversation, as a string.

    Raises:
        APIError: If the server answers with status >= 400, the body is not
            JSON, or the body carries no ``id``.
    """
    # Split "First Last" into its two parts; anything after the first space
    # is the last name. Stripping the remainder keeps "Alex  Smith" from
    # producing a last name with a leading space.
    first_name = ""
    last_name = ""
    if contact_name:
        first_name, _, remainder = contact_name.strip().partition(" ")
        last_name = remainder.strip()

    contact: dict[str, object] = {"contextFields": {"channel": channel}}
    context: dict[str, object] = {
        "channel": channel,
        "firstName": first_name,
        "lastName": last_name,
        "contact": contact,
    }
    if contact_email:
        # The email is recorded both at the top level and on the contact.
        context["email"] = contact_email
        contact["email"] = contact_email

    payload: dict[str, object] = {
        "agent_id": agent_id,
        "is_test": True,
        "metadata": {
            "isTest": True,
            "source": "applied-cli-fix",
            "context": context,
        },
    }
    if channel == "email":
        payload["type"] = "email"

    headers = {
        "Authorization": f"Bearer {api_token}",
        "X-Shop-Id": shop_id,
        "Content-Type": "application/json",
    }

    response = client.post(
        f"{base_url}/v1/c/",
        json=payload,
        headers=headers,
        timeout=10.0,
    )

    if response.status_code >= 400:
        raise APIError(
            f"Failed to create test conversation ({response.status_code})",
            status_code=response.status_code,
            code="CONVERSATION_CREATE_FAILED",
        )

    try:
        data = response.json()
    except ValueError as exc:
        raise APIError(
            "Failed to create test conversation (response was not valid JSON)",
            status_code=response.status_code,
            code="CONVERSATION_CREATE_FAILED",
        ) from exc

    conversation_id = data.get("id") if isinstance(data, dict) else None
    if conversation_id is None:
        # Without this check the caller would receive the literal string
        # "None" and go on to use it as a conversation ID.
        raise APIError(
            "Failed to create test conversation (response contained no id)",
            status_code=response.status_code,
            code="CONVERSATION_CREATE_FAILED",
        )
    return str(conversation_id)
211
+
212
+
213
def _send_message_and_get_response(
    client: httpx.Client,
    *,
    base_url: str,
    shop_id: str,
    api_token: str,
    agent_id: str,
    conversation_id: str,
    message: str,
    timeout: float = 60.0,
) -> str:
    """Send one user message to the agent and return the generated text.

    Args:
        client: HTTP client used for the request.
        base_url: API base URL; the request goes to
            ``{base_url}/v1/agents/{agent_id}/complete/``.
        shop_id: Sent as the ``X-Shop-Id`` header.
        api_token: Sent as a bearer token.
        agent_id: Agent that should answer.
        conversation_id: Conversation the message belongs to.
        message: User message text.
        timeout: Request timeout in seconds.

    Returns:
        The concatenation of every string ``content`` field found in the
        response body, which is read as one JSON object per line. Lines that
        are blank, not valid JSON, or not JSON objects are ignored.

    Raises:
        APIError: If the server answers with status >= 400.
    """
    headers = {
        "Authorization": f"Bearer {api_token}",
        "X-Shop-Id": shop_id,
        "Content-Type": "application/json",
    }

    transcript = [
        {
            "id": str(uuid_module.uuid4()),
            "role": "user",
            "content": message,
            "text": message,
            "format": "TEXT",
            "entity": {"type": "user"},
        }
    ]

    payload = {
        "conversation_id": conversation_id,
        "context": "EVALUATE",
        "transcript": transcript,
        "metadata": {
            "source": "applied-cli-fix",
            "isTest": True,
        },
        "draft": False,
    }

    # A plain blocking POST: the whole body is read before it is parsed.
    response = client.post(
        f"{base_url}/v1/agents/{agent_id}/complete/",
        headers=headers,
        json=payload,
        timeout=timeout,
    )

    if response.status_code >= 400:
        raise APIError(
            f"Completion failed ({response.status_code})",
            status_code=response.status_code,
            code="COMPLETION_FAILED",
        )

    # The body is treated as newline-delimited JSON; each object may carry a
    # piece of the generated text in "content". Split on "\n" only, so that
    # other Unicode line separators inside a JSON string do not cut a line.
    fragments: list[str] = []
    for raw_line in response.text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(data, dict):
            # Valid JSON that is not an object (number, string, list) has no
            # "content" key; calling .get on it would raise AttributeError.
            continue
        content = data.get("content")
        if isinstance(content, str):
            fragments.append(content)

    return "".join(fragments)
283
+
284
+
285
def _get_conversation_resolution(
    *,
    base_url: str,
    shop_id: str,
    api_token: str,
    conversation_id: str,
) -> str | None:
    """Look up a conversation's ``resolution`` field on a best-effort basis.

    Returns:
        The value of ``"resolution"`` on the fetched conversation, or ``None``
        when the key is absent or when fetching/reading fails for any reason.
    """
    try:
        return get_conversation(
            base_url=base_url,
            shop_id=shop_id,
            api_token=api_token,
            conversation_id=conversation_id,
        ).get("resolution")
    except Exception:
        # Deliberately broad: a missing resolution must not fail the replay.
        return None
303
+
304
+
305
def _run_single_test(
    *,
    base_url: str,
    shop_id: str,
    api_token: str,
    original_scenario: dict[str, Any],
    target_benchmark_id: str,
    contact_name: str | None = None,
    contact_email: str | None = None,
    auto_pass: bool = False,
    timeout: float = 90.0,
    retries: int = 3,
    expect_escalation: bool | None = None,
    quiet: bool = False,
) -> TestResult:
    """Replay a scenario's first user message and record it in a benchmark.

    Each attempt creates a fresh test conversation, sends the message to the
    scenario's agent, reads back the conversation resolution, optionally
    checks it against ``expect_escalation`` and finally creates a
    "[Fix Test] ..." scenario in the target benchmark.

    Args:
        base_url: API base URL.
        shop_id: Shop identifier passed to every API call.
        api_token: API token passed to every API call.
        original_scenario: Full scenario payload; must contain an ``agent``
            with an ``id`` and an ``input_conversation`` with user messages.
        target_benchmark_id: Benchmark that receives the new scenario.
        contact_name: Contact name for the test conversation; generated when
            missing.
        contact_email: Contact email for the test conversation; generated
            when missing.
        auto_pass: Mark the new scenario as ``pass`` after creating it.
        timeout: Timeout in seconds for the completion request.
        retries: Extra attempts allowed after a timeout or network error.
            Negative values are treated as 0, so one attempt is always made.
        expect_escalation: ``True`` fails unless the resolution is
            ``"escalated"``, ``False`` fails if it is, ``None`` skips the check.
        quiet: Suppress the retry progress lines.

    Returns:
        A ``TestResult``. This function does not raise for API, network or
        unexpected errors; they are reported through ``TestResult.error``.
        Failure results include the conversation/scenario IDs created by the
        last attempt, when there are any.
    """
    scenario_id = original_scenario.get("id", "")
    agent_data = original_scenario.get("agent") or {}
    agent_id = agent_data.get("id")

    if not agent_id:
        return TestResult(
            original_scenario_id=scenario_id,
            success=False,
            error="Scenario has no associated agent",
        )

    modality = (agent_data.get("modality") or "chat").lower()
    channel = "email" if modality == "email" else "chat"

    input_conv = original_scenario.get("input_conversation") or {}
    user_messages = _extract_user_messages(input_conv)

    if not user_messages:
        return TestResult(
            original_scenario_id=scenario_id,
            success=False,
            error="Scenario has no user messages to replay",
        )

    # Only the first user message is replayed.
    test_message = user_messages[0]["content"]

    # Fill in whichever half of the contact was not supplied.
    if not contact_name or not contact_email:
        gen_name, gen_email = _generate_test_contact()
        contact_name = contact_name or gen_name
        contact_email = contact_email or gen_email

    # A negative retry count would otherwise skip the loop entirely and
    # produce a failure with no error message.
    max_retries = max(0, retries)

    last_error: str | None = None
    retries_used = 0
    conversation_id: str | None = None
    new_scenario_id: str | None = None

    for attempt in range(max_retries + 1):
        retries_used = attempt
        # Every attempt starts from a brand-new conversation.
        conversation_id = None
        new_scenario_id = None
        try:
            with httpx.Client() as client:
                conversation_id = _create_test_conversation(
                    client,
                    base_url=base_url,
                    shop_id=shop_id,
                    api_token=api_token,
                    agent_id=agent_id,
                    channel=channel,
                    contact_name=contact_name,
                    contact_email=contact_email,
                )

                # The generated text itself is not needed here; the outcome
                # is read from the conversation afterwards.
                _send_message_and_get_response(
                    client,
                    base_url=base_url,
                    shop_id=shop_id,
                    api_token=api_token,
                    agent_id=agent_id,
                    conversation_id=conversation_id,
                    message=test_message,
                    timeout=timeout,
                )

            # Brief delay for processing before reading the resolution.
            time.sleep(2)

            resolution = _get_conversation_resolution(
                base_url=base_url,
                shop_id=shop_id,
                api_token=api_token,
                conversation_id=conversation_id,
            )

            if expect_escalation is not None:
                is_escalated = resolution == "escalated"
                if expect_escalation != is_escalated:
                    mismatch = (
                        "Expected escalation but agent responded"
                        if expect_escalation
                        else "Expected response but agent escalated"
                    )
                    return TestResult(
                        original_scenario_id=scenario_id,
                        success=False,
                        conversation_id=conversation_id,
                        resolution=resolution,
                        error=mismatch,
                        retries_used=retries_used,
                    )

            new_scenario_name = f"[Fix Test] {original_scenario.get('name', 'Unnamed')}"
            new_scenario = create_conversation_scenario(
                base_url=base_url,
                shop_id=shop_id,
                api_token=api_token,
                agent_id=agent_id,
                benchmark_id=target_benchmark_id,
                name=new_scenario_name,
                input_conversation_id=conversation_id,
            )
            new_scenario_id = new_scenario.get("id")

            if auto_pass and new_scenario_id:
                patch_conversation_scenario(
                    base_url=base_url,
                    shop_id=shop_id,
                    api_token=api_token,
                    scenario_id=new_scenario_id,
                    payload={"pass_status": "pass"},
                )

            return TestResult(
                original_scenario_id=scenario_id,
                success=True,
                new_scenario_id=new_scenario_id,
                conversation_id=conversation_id,
                resolution=resolution,
                retries_used=retries_used,
            )

        except httpx.HTTPError as exc:
            # httpx.TimeoutException is a subclass of httpx.HTTPError, so a
            # single handler covers both retryable failure kinds.
            timed_out = isinstance(exc, httpx.TimeoutException)
            last_error = (
                f"Timeout after {timeout}s" if timed_out else f"Network error: {exc}"
            )
            if attempt >= max_retries:
                break
            if not quiet:
                reason = "timeout" if timed_out else "error"
                typer.echo(f" Retry {attempt + 1}/{max_retries} after {reason}...")
            time.sleep(2 ** attempt)  # Exponential backoff

        except APIError as exc:
            # Don't retry API errors (likely not transient).
            last_error = f"API error: {exc}"
            break

        except Exception as exc:
            last_error = f"Unexpected error: {exc}"
            break

    return TestResult(
        original_scenario_id=scenario_id,
        success=False,
        new_scenario_id=new_scenario_id,
        conversation_id=conversation_id,
        error=last_error,
        retries_used=retries_used,
    )
488
+
489
+
490
@app.command(
    "context",
    help=(
        "Get all context needed to fix failing scenarios in a benchmark.\n\n"
        "Returns:\n"
        "- All failing scenarios with feedback and user messages\n"
        "- Agent responses (knowledge base)\n"
        "- Agent guardrails\n"
        "- Response type semantics\n"
        "- Instructions for making fixes\n\n"
        "Example: applied-cli test fix context --benchmark-id <uuid>"
    ),
)
def fix_context(
    benchmark_id: str = typer.Option(
        ..., "--benchmark-id", "--benchmark", "--id", help="Benchmark UUID."
    ),
    include_passing: bool = typer.Option(
        False, "--include-passing", help="Include passing scenarios for reference."
    ),
    output_json: bool = typer.Option(False, "--json", help="Emit JSON output."),
    base_url: Optional[str] = typer.Option(None, help="Applied base URL."),
    shop_id: Optional[str] = typer.Option(None, help="Target shop UUID."),
    api_token: Optional[str] = typer.Option(None, help="Applied API token."),
) -> None:
    # Gathers, for one benchmark: its failing scenarios (with full details),
    # the agent and its guardrail, the agent's active responses, the response
    # type semantics and a fixed set of instructions. Prints the bundle as
    # JSON (--json) or as a condensed human-readable report.
    validate_uuid(benchmark_id, field_name="benchmark-id")

    try:
        resolved_base_url, resolved_shop_id, resolved_token = resolve_runtime(
            base_url=base_url, shop_id=shop_id, api_token=api_token
        )
        # Connection arguments shared by every API call below.
        runtime = {
            "base_url": resolved_base_url,
            "shop_id": resolved_shop_id,
            "api_token": resolved_token,
        }

        benchmark = get_conversation_benchmark(benchmark_id=benchmark_id, **runtime)

        agent_id = (benchmark.get("agent") or {}).get("id")
        if not agent_id:
            raise typer.BadParameter("Benchmark has no associated agent.")

        # Full agent record, needed for the guardrail text.
        agent = get_agent(agent_id=agent_id, **runtime)

        all_scenarios = list_conversation_scenarios(
            benchmark_id=benchmark_id,
            limit=500,
            ordering="-created_at",
            **runtime,
        )

        failing_scenarios = []
        passing_scenarios = []
        for listed in all_scenarios:
            status = listed.get("pass_status")
            if status == "fail":
                # The listing is re-fetched per scenario so the summary can
                # include the input conversation.
                detailed = get_conversation_scenario(
                    scenario_id=listed.get("id"), **runtime
                )
                failing_scenarios.append(_summarize_scenario(detailed))
                continue
            if include_passing and status == "pass":
                passing_scenarios.append(
                    {
                        "scenario_id": listed.get("id"),
                        "name": listed.get("name"),
                        "pass_status": "pass",
                    }
                )

        # Active responses only (the agent's knowledge base).
        responses = list_responses(
            agent_id=agent_id, active=True, limit=500, **runtime
        )

    except APIError as exc:
        typer.echo(render_api_error(exc, action="get fix context"), err=True)
        raise typer.Exit(code=1) from exc

    passing_total = sum(1 for s in all_scenarios if s.get("pass_status") == "pass")
    unrated_total = sum(1 for s in all_scenarios if s.get("pass_status") is None)

    instructions = {
        "overview": (
            "To fix failing scenarios, analyze each failure's feedback and user messages, "
            "then update responses or agent guardrails accordingly."
        ),
        "response_types": {
            "escalation": "Use for messages that should be escalated to humans. Set question to trigger criteria.",
            "exact": "Use for verbatim templated responses. Answer is returned exactly as written.",
            "qa": "Use for knowledge-grounded answers. Answer is used as context for generation.",
            "context": "Use for background knowledge that informs responses broadly.",
        },
        "commands": {
            "add_knowledge": "applied-cli knowledge upsert --agent-id <agent_id> --type <type> --question '<question>' --answer '<answer>' --yes",
            "update_knowledge": "applied-cli knowledge update --response-id <response_id> --answer '<new_answer>' --yes",
            "update_guardrail": "applied-cli agent update --agent-id <agent_id> --guardrail '<guardrail_text>' --yes",
            "test_fix": "applied-cli test fix test --scenario-id <scenario_id> --benchmark-id <new_benchmark_id>",
            "create_benchmark": "applied-cli benchmarks create --agent-id <agent_id> --name 'Fix Validation'",
        },
    }

    context: dict[str, Any] = {
        "benchmark": {
            "id": benchmark_id,
            "name": benchmark.get("name"),
            "scenario_count": benchmark.get("scenario_count"),
        },
        "agent": {
            "id": agent_id,
            "name": agent.get("name"),
            "modality": agent.get("modality"),
            "guardrail": agent.get("guardrail") or "",
        },
        "summary": {
            "total_scenarios": len(all_scenarios),
            "failing_count": len(failing_scenarios),
            "passing_count": passing_total,
            "unrated_count": unrated_total,
        },
        "failing_scenarios": failing_scenarios,
        "responses": [_summarize_response(item) for item in responses],
        "response_type_semantics": RESPONSE_SEMANTICS,
        "instructions": instructions,
    }

    if include_passing:
        context["passing_scenarios"] = passing_scenarios

    if output_json:
        typer.echo(json.dumps(context, indent=2, default=str))
        return

    # ---- human-readable report -------------------------------------------
    agent_info = context["agent"]
    counts = context["summary"]

    typer.echo(f"Benchmark: {context['benchmark']['name']} ({benchmark_id})")
    typer.echo(f"Agent: {agent_info['name']} ({agent_id})")
    typer.echo(f"Modality: {agent_info['modality']}")
    typer.echo("")
    typer.echo(
        f"Summary: {counts['failing_count']} failing, "
        f"{counts['passing_count']} passing, "
        f"{counts['unrated_count']} unrated"
    )
    typer.echo("")

    if failing_scenarios:
        typer.echo("=== FAILING SCENARIOS ===")
        for failed in failing_scenarios:
            typer.echo(f"\n--- {failed['name']} ---")
            typer.echo(f"ID: {failed['scenario_id']}")
            typer.echo(f"Label: {failed['label']} / {failed['sublabel']}")
            typer.echo(f"Feedback: {failed['feedback'] or '(none)'}")
            typer.echo("User messages:")
            for user_msg in failed["user_messages"]:
                text = user_msg["content"]
                # Long messages are cut to 200 characters in the report.
                if len(text) > 200:
                    text = text[:200] + "..."
                typer.echo(f" - {text}")

    typer.echo("\n=== AGENT RESPONSES ===")
    typer.echo(f"Total: {len(responses)} active responses")
    shown = 10  # Only the first few responses are listed.
    for rule in responses[:shown]:
        typer.echo(f" [{rule.get('type')}] {str(rule.get('question') or '')[:60]}")
    if len(responses) > shown:
        typer.echo(f" ... and {len(responses) - 10} more")

    typer.echo("\n=== AGENT GUARDRAIL ===")
    guardrail = agent_info["guardrail"]
    if not guardrail:
        typer.echo("(no guardrail set)")
    elif len(guardrail) > 500:
        typer.echo(guardrail[:500] + "...")
    else:
        typer.echo(guardrail)

    typer.echo("\nUse --json for full machine-readable output.")
671
+
672
+
673
@app.command(
    "test",
    help=(
        "Test a fix by replaying a scenario's input message.\n\n"
        "This command:\n"
        "1. Gets the original scenario's input message\n"
        "2. Sends it to the agent to get a new response\n"
        "3. Creates a new scenario in the target benchmark\n"
        "4. Returns the scenario ID for rating\n\n"
        "Example: applied-cli test fix test --scenario-id <uuid> --benchmark-id <uuid>"
    ),
)
def fix_test(
    scenario_id: str = typer.Option(
        ..., "--scenario-id", "--scenario", help="Original scenario UUID to replay."
    ),
    benchmark_id: str = typer.Option(
        ..., "--benchmark-id", "--benchmark", help="Target benchmark UUID for the new scenario."
    ),
    contact_name: Optional[str] = typer.Option(
        None, "--contact-name", "--name", help="Contact name for the test. Auto-generated if not provided."
    ),
    contact_email: Optional[str] = typer.Option(
        None, "--contact-email", "--email", help="Contact email for the test. Auto-generated if not provided."
    ),
    auto_pass: bool = typer.Option(
        False, "--auto-pass", help="Automatically mark the new scenario as pass."
    ),
    expect_escalation: Optional[bool] = typer.Option(
        None, "--expect-escalation/--expect-response",
        help="Validate escalation behavior. --expect-escalation fails if agent responds, --expect-response fails if agent escalates."
    ),
    timeout: int = typer.Option(
        90, "--timeout", "-t", help="Timeout in seconds for agent response."
    ),
    retries: int = typer.Option(
        2, "--retry", "-r", help="Number of retries on timeout/network errors."
    ),
    feedback: Optional[str] = typer.Option(
        None, "--feedback", help="Feedback to add to the new scenario."
    ),
    quiet: bool = typer.Option(
        False, "--quiet", "-q", help="Minimal output (only show result)."
    ),
    output_json: bool = typer.Option(False, "--json", help="Emit JSON output."),
    base_url: Optional[str] = typer.Option(None, help="Applied base URL."),
    shop_id: Optional[str] = typer.Option(None, help="Target shop UUID."),
    api_token: Optional[str] = typer.Option(None, help="Applied API token."),
) -> None:
    # Replays one scenario against its agent and records the outcome as a new
    # scenario in the target benchmark. Output is either a JSON document
    # (--json), a one-line result (--quiet) or a short report. In the
    # non-JSON modes a failed test exits with code 1.
    validate_uuid(scenario_id, field_name="scenario-id")
    validate_uuid(benchmark_id, field_name="benchmark-id")

    # Progress lines are written to stdout, so they have to be silenced in
    # --json mode as well; otherwise the output is not parseable JSON.
    show_progress = not (quiet or output_json)

    try:
        resolved_base_url, resolved_shop_id, resolved_token = resolve_runtime(
            base_url=base_url, shop_id=shop_id, api_token=api_token
        )

        original_scenario = get_conversation_scenario(
            base_url=resolved_base_url,
            shop_id=resolved_shop_id,
            api_token=resolved_token,
            scenario_id=scenario_id,
        )

    except APIError as exc:
        typer.echo(render_api_error(exc, action="get original scenario"), err=True)
        raise typer.Exit(code=1) from exc

    if show_progress:
        input_conv = original_scenario.get("input_conversation") or {}
        user_messages = _extract_user_messages(input_conv)
        test_message = user_messages[0]["content"] if user_messages else ""
        # Only mark the preview as truncated when it actually was.
        preview = test_message[:100] + ("..." if len(test_message) > 100 else "")
        typer.echo(f"Replaying scenario: {original_scenario.get('name')}")
        typer.echo(f"Message: {preview}")
        typer.echo("")
        typer.echo("Sending message to agent...")

    # The retry notices printed by the runner also go to stdout, hence the
    # same suppression rule as above.
    test_result = _run_single_test(
        base_url=resolved_base_url,
        shop_id=resolved_shop_id,
        api_token=resolved_token,
        original_scenario=original_scenario,
        target_benchmark_id=benchmark_id,
        contact_name=contact_name,
        contact_email=contact_email,
        auto_pass=auto_pass,
        timeout=float(timeout),
        retries=retries,
        expect_escalation=expect_escalation,
        quiet=not show_progress,
    )

    # Attaching feedback is best-effort: a failure must not fail the test,
    # but it is reported on stderr instead of being dropped silently.
    if test_result.success and feedback and test_result.new_scenario_id:
        try:
            patch_conversation_scenario(
                base_url=resolved_base_url,
                shop_id=resolved_shop_id,
                api_token=resolved_token,
                scenario_id=test_result.new_scenario_id,
                payload={"feedback": feedback},
            )
        except APIError as exc:
            if not quiet:
                typer.echo(
                    render_api_error(exc, action="add feedback to new scenario"),
                    err=True,
                )

    result = {
        "result": "success" if test_result.success else "failed",
        "original_scenario_id": scenario_id,
        "new_scenario_id": test_result.new_scenario_id,
        "benchmark_id": benchmark_id,
        "conversation_id": test_result.conversation_id,
        "resolution": test_result.resolution,
        "pass_status": "pass" if auto_pass and test_result.success else None,
        "retries_used": test_result.retries_used,
        "error": test_result.error,
    }

    if output_json:
        # NOTE(review): in JSON mode a failed test is reported through the
        # "result"/"error" fields and the process still exits with 0.
        typer.echo(json.dumps(result, indent=2, default=str))
    elif test_result.success:
        if quiet:
            typer.echo(f"✓ {test_result.new_scenario_id}")
        else:
            typer.echo(f"\nCreated new scenario: {test_result.new_scenario_id}")
            typer.echo(f"In benchmark: {benchmark_id}")
            typer.echo(f"Resolution: {test_result.resolution or 'answered'}")
            if auto_pass:
                typer.echo("Status: pass (auto-marked)")
            else:
                typer.echo("Status: unrated")
                typer.echo(f"\nTo rate: applied-cli scenarios update --scenario-id {test_result.new_scenario_id} --pass-status pass")
    else:
        if quiet:
            typer.echo(f"✗ {scenario_id}: {test_result.error}")
        else:
            typer.echo(f"\nTest failed: {test_result.error}", err=True)
            if test_result.retries_used > 0:
                typer.echo(f"Retries used: {test_result.retries_used}", err=True)
        raise typer.Exit(code=1)
814
+
815
+
816
@app.command(
    "summary",
    help=(
        "Get a quick summary of a benchmark's pass/fail status.\n\n"
        "Example: applied-cli test fix summary --benchmark-id <uuid>"
    ),
)
def fix_summary(
    benchmark_id: str = typer.Option(
        ..., "--benchmark-id", "--benchmark", "--id", help="Benchmark UUID."
    ),
    output_json: bool = typer.Option(False, "--json", help="Emit JSON output."),
    base_url: Optional[str] = typer.Option(None, help="Applied base URL."),
    shop_id: Optional[str] = typer.Option(None, help="Target shop UUID."),
    api_token: Optional[str] = typer.Option(None, help="Applied API token."),
) -> None:
    # Counts the passing, failing and unrated scenarios of one benchmark
    # (based on a listing of up to 500 scenarios) and prints the totals, the
    # pass rate and the IDs of the failing scenarios.
    validate_uuid(benchmark_id, field_name="benchmark-id")

    try:
        resolved_base_url, resolved_shop_id, resolved_token = resolve_runtime(
            base_url=base_url, shop_id=shop_id, api_token=api_token
        )
        runtime = {
            "base_url": resolved_base_url,
            "shop_id": resolved_shop_id,
            "api_token": resolved_token,
        }

        benchmark = get_conversation_benchmark(benchmark_id=benchmark_id, **runtime)
        all_scenarios = list_conversation_scenarios(
            benchmark_id=benchmark_id, limit=500, **runtime
        )

    except APIError as exc:
        typer.echo(render_api_error(exc, action="get benchmark summary"), err=True)
        raise typer.Exit(code=1) from exc

    # Partition by rating; scenarios with any other status count only
    # towards the total.
    passing = []
    failing = []
    unrated = []
    for entry in all_scenarios:
        status = entry.get("pass_status")
        if status == "pass":
            passing.append(entry)
        if status == "fail":
            failing.append(entry)
        if status is None:
            unrated.append(entry)

    if all_scenarios:
        pass_rate = f"{len(passing) / len(all_scenarios) * 100:.1f}%"
    else:
        pass_rate = "N/A"

    summary = {
        "benchmark_id": benchmark_id,
        "benchmark_name": benchmark.get("name"),
        "total": len(all_scenarios),
        "passing": len(passing),
        "failing": len(failing),
        "unrated": len(unrated),
        "pass_rate": pass_rate,
        "failing_scenario_ids": [entry.get("id") for entry in failing],
    }

    if output_json:
        typer.echo(json.dumps(summary, indent=2, default=str))
        return

    for line in (
        f"Benchmark: {summary['benchmark_name']}",
        f"Total: {summary['total']} scenarios",
        f"Passing: {summary['passing']} ({summary['pass_rate']})",
        f"Failing: {summary['failing']}",
        f"Unrated: {summary['unrated']}",
    ):
        typer.echo(line)

    if failing:
        typer.echo("\nFailing scenario IDs:")
        for failing_id in summary["failing_scenario_ids"]:
            typer.echo(f" - {failing_id}")
885
+
886
+
887
@app.command(
    "batch",
    help=(
        "Batch test all failing scenarios from a source benchmark.\n\n"
        "This command:\n"
        "1. Fetches all failing scenarios from the source benchmark\n"
        "2. Tests each one with retries and parallelism\n"
        "3. Creates new scenarios in the target benchmark\n"
        "4. Reports overall progress\n\n"
        "Example: applied-cli test fix batch --source <uuid> --target <uuid> --auto-pass"
    ),
)
def fix_batch(
    source_benchmark_id: str = typer.Option(
        ..., "--source", "--source-benchmark", help="Source benchmark UUID with failing scenarios."
    ),
    target_benchmark_id: str = typer.Option(
        ..., "--target", "--target-benchmark", help="Target benchmark UUID for new scenarios."
    ),
    pass_status_filter: str = typer.Option(
        "fail", "--pass-status", help="Filter scenarios by pass status: fail, pass, unrated, or all."
    ),
    auto_pass: bool = typer.Option(
        False, "--auto-pass", help="Automatically mark successful tests as pass."
    ),
    timeout: int = typer.Option(
        90, "--timeout", "-t", help="Timeout in seconds per test."
    ),
    retries: int = typer.Option(
        2, "--retry", "-r", help="Number of retries per test."
    ),
    parallel: int = typer.Option(
        1, "--parallel", "-p", help="Number of parallel tests (1-10)."
    ),
    limit: int = typer.Option(
        0, "--limit", "-l", help="Max scenarios to test (0 = all)."
    ),
    continue_on_error: bool = typer.Option(
        True, "--continue-on-error/--stop-on-error",
        help="Continue testing even if some scenarios fail."
    ),
    output_json: bool = typer.Option(False, "--json", help="Emit JSON output."),
    base_url: Optional[str] = typer.Option(None, help="Applied base URL."),
    shop_id: Optional[str] = typer.Option(None, help="Target shop UUID."),
    api_token: Optional[str] = typer.Option(None, help="Applied API token."),
) -> None:
    """Replay a filtered set of source-benchmark scenarios against a target benchmark.

    Scenarios are listed from the source benchmark, filtered by
    ``pass_status_filter``, truncated to ``limit`` (when positive), fetched
    in full one by one, and then handed to ``_run_single_test`` either
    sequentially (``parallel == 1``) or through a thread pool.

    Output:
        Human-readable progress goes to stdout, or to stderr when ``--json``
        is set so that stdout carries only the JSON summary document.

    Raises:
        typer.BadParameter: ``--pass-status`` is not one of
            ``fail``, ``pass``, ``unrated`` or ``all``.
        typer.Exit: code 0 when no scenario matches the filter; code 1 when
            the scenario lookup fails with ``APIError``, or when any test
            failed and ``--stop-on-error`` was requested.
    """
    validate_uuid(source_benchmark_id, field_name="source-benchmark")
    validate_uuid(target_benchmark_id, field_name="target-benchmark")

    # Reject unknown filter values up front; otherwise a typo such as
    # "failed" would silently match nothing and report "No scenarios found".
    pass_status_filter = pass_status_filter.strip().lower()
    if pass_status_filter not in {"fail", "pass", "unrated", "all"}:
        raise typer.BadParameter(
            "must be one of: fail, pass, unrated, all",
            param_hint="--pass-status",
        )

    # Clamp parallel workers
    parallel = max(1, min(10, parallel))

    def progress(message: str = "", nl: bool = True) -> None:
        # In JSON mode stdout must contain nothing but the JSON document,
        # so human-readable progress is diverted to stderr.
        typer.echo(message, nl=nl, err=output_json)

    def label(scenario: dict[str, Any]) -> str:
        # ``name`` may be present but None; fall back before slicing.
        return str(scenario.get("name") or "Unnamed")[:40]

    def build_summary(
        batch_results: list[Any], succeeded: int, failed: int
    ) -> dict[str, Any]:
        return {
            "source_benchmark_id": source_benchmark_id,
            "target_benchmark_id": target_benchmark_id,
            "total_tested": len(batch_results),
            "success": succeeded,
            "failed": failed,
            "results": [
                {
                    "original_id": r.original_scenario_id,
                    "new_id": r.new_scenario_id,
                    "success": r.success,
                    "resolution": r.resolution,
                    "error": r.error,
                }
                for r in batch_results
            ],
        }

    try:
        resolved_base_url, resolved_shop_id, resolved_token = resolve_runtime(
            base_url=base_url, shop_id=shop_id, api_token=api_token
        )

        # Get source scenarios
        # NOTE(review): the listing is requested with limit=500; presumably
        # benchmarks with more scenarios are truncated here - confirm.
        all_scenarios = list_conversation_scenarios(
            base_url=resolved_base_url,
            shop_id=resolved_shop_id,
            api_token=resolved_token,
            benchmark_id=source_benchmark_id,
            limit=500,
        )

        # Filter by pass status ("unrated" is matched as pass_status None)
        if pass_status_filter == "all":
            scenarios_to_test = all_scenarios
        else:
            wanted = None if pass_status_filter == "unrated" else pass_status_filter
            scenarios_to_test = [
                s for s in all_scenarios if s.get("pass_status") == wanted
            ]

        # Apply limit
        if limit > 0:
            scenarios_to_test = scenarios_to_test[:limit]

        # Fetch full scenario details for each
        full_scenarios = [
            get_conversation_scenario(
                base_url=resolved_base_url,
                shop_id=resolved_shop_id,
                api_token=resolved_token,
                scenario_id=scenario.get("id"),
            )
            for scenario in scenarios_to_test
        ]

    except APIError as exc:
        typer.echo(render_api_error(exc, action="get scenarios"), err=True)
        raise typer.Exit(code=1) from exc

    if not full_scenarios:
        progress(f"No scenarios found with pass_status={pass_status_filter}")
        if output_json:
            # Still emit a well-formed (empty) document for JSON consumers.
            typer.echo(json.dumps(build_summary([], 0, 0), indent=2, default=str))
        raise typer.Exit(code=0)

    total = len(full_scenarios)
    progress(f"Testing {total} scenarios...")
    progress(f" Source: {source_benchmark_id}")
    progress(f" Target: {target_benchmark_id}")
    progress(f" Parallel: {parallel}, Retries: {retries}, Timeout: {timeout}s")
    progress("")

    results: list[TestResult] = []
    success_count = 0
    fail_count = 0

    def run_test(scenario: dict[str, Any]) -> TestResult:
        # Convert unexpected exceptions into a failed result so that one bad
        # scenario cannot abort the batch, in sequential and parallel mode alike.
        try:
            return _run_single_test(
                base_url=resolved_base_url,
                shop_id=resolved_shop_id,
                api_token=resolved_token,
                original_scenario=scenario,
                target_benchmark_id=target_benchmark_id,
                auto_pass=auto_pass,
                timeout=float(timeout),
                retries=retries,
                quiet=True,
            )
        except Exception as exc:
            return TestResult(
                original_scenario_id=scenario.get("id", ""),
                success=False,
                error=str(exc),
            )

    if parallel == 1:
        # Sequential execution with progress
        for index, scenario in enumerate(full_scenarios, start=1):
            progress(f"[{index}/{total}] {label(scenario)}...", nl=False)

            result = run_test(scenario)
            results.append(result)

            if result.success:
                success_count += 1
                progress(f" ✓ {result.resolution or 'answered'}")
            else:
                fail_count += 1
                progress(f" ✗ {result.error}")
                if not continue_on_error:
                    break
    else:
        # Parallel execution
        with ThreadPoolExecutor(max_workers=parallel) as executor:
            future_to_scenario = {
                executor.submit(run_test, s): s for s in full_scenarios
            }

            completed = 0
            for future in as_completed(future_to_scenario):
                # Futures cancelled by --stop-on-error never ran; skip them.
                if future.cancelled():
                    continue

                completed += 1
                name = label(future_to_scenario[future])
                result = future.result()
                results.append(result)

                if result.success:
                    success_count += 1
                    progress(f"[{completed}/{total}] ✓ {name}")
                else:
                    fail_count += 1
                    progress(f"[{completed}/{total}] ✗ {name}: {result.error}")
                    if not continue_on_error:
                        # Honour --stop-on-error: drop tests that have not
                        # started. cancel() is a no-op for running/finished
                        # futures, so in-flight tests are still reported.
                        for pending in future_to_scenario:
                            pending.cancel()

    progress("")
    progress("=== Results ===")
    progress(f"Tested: {len(results)}")
    progress(f"Success: {success_count}")
    progress(f"Failed: {fail_count}")

    if output_json:
        summary = build_summary(results, success_count, fail_count)
        typer.echo(json.dumps(summary, indent=2, default=str))

    if fail_count > 0 and not continue_on_error:
        raise typer.Exit(code=1)
1081
+
1082
+
1083
@app.command(
    "status",
    help=(
        "Track fix progress between source and target benchmarks.\n\n"
        "Shows which scenarios from the source have been tested in the target.\n\n"
        "Example: applied-cli test fix status --source <uuid> --target <uuid>"
    ),
)
def fix_status(
    source_benchmark_id: str = typer.Option(
        ..., "--source", "--source-benchmark", help="Source benchmark UUID with original scenarios."
    ),
    target_benchmark_id: str = typer.Option(
        ..., "--target", "--target-benchmark", help="Target benchmark UUID with test results."
    ),
    output_json: bool = typer.Option(False, "--json", help="Emit JSON output."),
    base_url: Optional[str] = typer.Option(None, help="Applied base URL."),
    shop_id: Optional[str] = typer.Option(None, help="Target shop UUID."),
    api_token: Optional[str] = typer.Option(None, help="Applied API token."),
) -> None:
    """Summarise fix progress by comparing scenario counts of two benchmarks.

    Both benchmarks and their scenario lists are fetched, scenarios are
    tallied by ``pass_status`` (``"pass"``, ``"fail"`` or ``None`` for
    unrated), and a progress report is printed as text or, with ``--json``,
    as a JSON document.

    Progress is count-based: the number of scenarios in the target is
    compared with the number of failing scenarios in the source; individual
    scenarios are not matched to each other.

    Raises:
        typer.Exit: code 1 when any lookup fails with ``APIError``.
    """
    validate_uuid(source_benchmark_id, field_name="source-benchmark")
    validate_uuid(target_benchmark_id, field_name="target-benchmark")

    try:
        resolved_base_url, resolved_shop_id, resolved_token = resolve_runtime(
            base_url=base_url, shop_id=shop_id, api_token=api_token
        )

        # Get both benchmarks
        source_benchmark = get_conversation_benchmark(
            base_url=resolved_base_url,
            shop_id=resolved_shop_id,
            api_token=resolved_token,
            benchmark_id=source_benchmark_id,
        )

        target_benchmark = get_conversation_benchmark(
            base_url=resolved_base_url,
            shop_id=resolved_shop_id,
            api_token=resolved_token,
            benchmark_id=target_benchmark_id,
        )

        # Get scenarios from both
        # NOTE(review): both listings are requested with limit=500;
        # presumably larger benchmarks are undercounted - confirm.
        source_scenarios = list_conversation_scenarios(
            base_url=resolved_base_url,
            shop_id=resolved_shop_id,
            api_token=resolved_token,
            benchmark_id=source_benchmark_id,
            limit=500,
        )

        target_scenarios = list_conversation_scenarios(
            base_url=resolved_base_url,
            shop_id=resolved_shop_id,
            api_token=resolved_token,
            benchmark_id=target_benchmark_id,
            limit=500,
        )

    except APIError as exc:
        typer.echo(render_api_error(exc, action="get benchmark status"), err=True)
        raise typer.Exit(code=1) from exc

    def count_status(scenarios: list[dict[str, Any]], wanted: Optional[str]) -> int:
        # ``wanted=None`` counts unrated scenarios (pass_status is None).
        return sum(1 for s in scenarios if s.get("pass_status") == wanted)

    # Calculate progress
    source_fail_count = count_status(source_scenarios, "fail")
    target_tested = len(target_scenarios)
    target_pass_count = count_status(target_scenarios, "pass")

    # Cap at 100%: the target can hold more scenarios than the source has
    # failures (e.g. after re-runs), and ``remaining`` below is clamped the
    # same way. The raw counts stay available in ``tested``/``of_source_failing``.
    progress_pct = (
        min(100.0, target_tested / source_fail_count * 100)
        if source_fail_count > 0
        else 0
    )
    pass_rate = (target_pass_count / target_tested * 100) if target_tested > 0 else 0

    status = {
        "source": {
            "benchmark_id": source_benchmark_id,
            "name": source_benchmark.get("name"),
            "total": len(source_scenarios),
            "failing": source_fail_count,
            "passing": count_status(source_scenarios, "pass"),
            "unrated": count_status(source_scenarios, None),
        },
        "target": {
            "benchmark_id": target_benchmark_id,
            "name": target_benchmark.get("name"),
            "total": target_tested,
            "passing": target_pass_count,
            "failing": count_status(target_scenarios, "fail"),
            "unrated": count_status(target_scenarios, None),
        },
        "progress": {
            "tested": target_tested,
            "of_source_failing": source_fail_count,
            "progress_pct": round(progress_pct, 1),
            "pass_rate": round(pass_rate, 1),
            "remaining": max(0, source_fail_count - target_tested),
        },
    }

    if output_json:
        typer.echo(json.dumps(status, indent=2, default=str))
        return

    source = status["source"]
    target = status["target"]
    progress = status["progress"]

    typer.echo("=== Fix Progress ===")
    typer.echo("")
    typer.echo(f"Source: {source['name']}")
    typer.echo(f" Total: {source['total']}, Failing: {source['failing']}")
    typer.echo("")
    typer.echo(f"Target: {target['name']}")
    typer.echo(f" Tested: {target['total']}, Passing: {target['passing']}")
    typer.echo("")
    typer.echo(
        f"Progress: {progress['tested']}/{progress['of_source_failing']} "
        f"({progress['progress_pct']}%)"
    )
    typer.echo(f"Pass Rate: {progress['pass_rate']}%")
    typer.echo(f"Remaining: {progress['remaining']} scenarios")