applied-cli 0.6.6__tar.gz → 0.6.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. {applied_cli-0.6.6 → applied_cli-0.6.8}/PKG-INFO +57 -1
  2. {applied_cli-0.6.6 → applied_cli-0.6.8}/README.md +56 -0
  3. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/cli.py +13 -2
  4. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/tools.py +75 -33
  5. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/scenarios.py +23 -2
  6. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli.egg-info/PKG-INFO +57 -1
  7. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli.egg-info/SOURCES.txt +1 -0
  8. {applied_cli-0.6.6 → applied_cli-0.6.8}/pyproject.toml +1 -1
  9. applied_cli-0.6.8/tests/test_benchmark_list_with_results.py +104 -0
  10. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/__init__.py +0 -0
  11. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/agent_scoped_flows.py +0 -0
  12. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/auth.py +0 -0
  13. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/client.py +0 -0
  14. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/conversation_lookup.py +0 -0
  15. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/conversations.py +0 -0
  16. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/credentials.py +0 -0
  17. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/flow_helpers.py +0 -0
  18. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/formatters.py +0 -0
  19. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/mcp.py +0 -0
  20. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/recovery.py +0 -0
  21. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/toolkit.py +0 -0
  22. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/__init__.py +0 -0
  23. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/agents.py +0 -0
  24. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/articles.py +0 -0
  25. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/catalog.py +0 -0
  26. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/connectors.py +0 -0
  27. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/content.py +0 -0
  28. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/conversations.py +0 -0
  29. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/domains.py +0 -0
  30. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/flows.py +0 -0
  31. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/knowledge.py +0 -0
  32. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/manifest.py +0 -0
  33. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/products.py +0 -0
  34. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/taxonomy.py +0 -0
  35. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/tickets.py +0 -0
  36. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli.egg-info/dependency_links.txt +0 -0
  37. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli.egg-info/entry_points.txt +0 -0
  38. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli.egg-info/requires.txt +0 -0
  39. {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli.egg-info/top_level.txt +0 -0
  40. {applied_cli-0.6.6 → applied_cli-0.6.8}/setup.cfg +0 -0
  41. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_agent_scoped_flows.py +0 -0
  42. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_audit_tools.py +0 -0
  43. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_auth_context.py +0 -0
  44. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_benchmark_clone.py +0 -0
  45. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_benchmark_delete_guardrail.py +0 -0
  46. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_benchmark_results.py +0 -0
  47. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_benchmark_scenario_tools.py +0 -0
  48. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_cli.py +0 -0
  49. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_cli_v2.py +0 -0
  50. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_client.py +0 -0
  51. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_client_v2.py +0 -0
  52. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_conversation_tools.py +0 -0
  53. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_flow_tools.py +0 -0
  54. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_knowledge_content_tools.py +0 -0
  55. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_recovery.py +0 -0
  56. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_scenario_bulk_cancel.py +0 -0
  57. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_scenario_bulk_run_contact.py +0 -0
  58. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_scenario_bulk_run_wait.py +0 -0
  59. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_toolkit_contract.py +0 -0
  60. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_agents.py +0 -0
  61. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_articles.py +0 -0
  62. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_catalog_and_mcp.py +0 -0
  63. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_connectors.py +0 -0
  64. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_content.py +0 -0
  65. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_conversations.py +0 -0
  66. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_flows.py +0 -0
  67. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_knowledge.py +0 -0
  68. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_products.py +0 -0
  69. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_scenarios.py +0 -0
  70. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_taxonomy.py +0 -0
  71. {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_tickets.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: applied-cli
3
- Version: 0.6.6
3
+ Version: 0.6.8
4
4
  Summary: CLI and shared client library for Applied Labs AI support agents
5
5
  Author: Applied Labs
6
6
  License-Expression: MIT
@@ -80,6 +80,57 @@ applied metrics --metric-name conversation.resolve --start 2026-04-01 --end 2026
80
80
  object. `analytics` returns grouped rows and currently supports `--metrics count`.
81
81
  Raw analytics SQL is not available through the public CLI surface.
82
82
 
83
+ ## Benchmarks & Scenarios
84
+
85
+ A **benchmark** is a named regression suite; a **scenario** is one test conversation
86
+ (built from a real `input_conversation_id`) that can belong to one or more benchmarks.
87
+ The typical loop is: build a suite → run it → review the pass rate → fix → re-run.
88
+
89
+ ```bash
90
+ # Inspect benchmarks and their scenarios
91
+ applied benchmarks --agent-id <agent_id> --format json
92
+ applied benchmark <benchmark_id> --format json
93
+ applied scenarios --benchmark-id <benchmark_id> --format json
94
+
95
+ # Build a suite
96
+ applied benchmark-create --agent-id <agent_id> --name "Cancel Regression"
97
+ applied scenario-create --input-conversation-id <conversation_id> --name "<name>" \
98
+ --benchmark-id <benchmark_id>
99
+
100
+ # Port a suite to another agent (e.g. email -> chat). Cross-agent recreates the
101
+ # scenarios under the destination agent; same-agent just tags them in.
102
+ # Dry-run by default; add --apply to write.
103
+ applied benchmark-clone <source_benchmark_id> --dest-benchmark-name "Chat Regression" \
104
+ --target-agent-id <chat_agent_id> --apply
105
+
106
+ # Run a benchmark and wait for results in one command.
107
+ # --contact-email runs as a contact that has an email, fixing
108
+ # "Email is not present in the conversation" on test conversations.
109
+ applied scenario-bulk-run --benchmark-id <benchmark_id> \
110
+ --contact-email test@example.com --wait
111
+ applied scenario-bulk-status <job_id> --include-runs --format json
112
+
113
+ # Kill a stuck bulk run (deletes its queued/running runs; finished runs preserved)
114
+ applied scenario-bulk-cancel <job_id> --apply
115
+
116
+ # Review pass/fail health (pass_status reflects the latest run per scenario)
117
+ applied benchmark-results <benchmark_id> --format json
118
+
119
+ # Rate scenarios as you evaluate
120
+ applied scenario-update <scenario_id> --pass-status pass --feedback "<note>"
121
+
122
+ # Safe delete — refuses to wipe scenarios unless you opt in
123
+ applied benchmark-delete <benchmark_id> --detach-scenarios # preserve scenarios
124
+ applied benchmark-delete <benchmark_id> --force # cascade delete
125
+
126
+ # Recover deleted benchmark/scenario rows from a local PITR export
127
+ applied scenario-recover-catalog --recovery-dir <dir> --apply
128
+ ```
129
+
130
+ Deleting a benchmark cascades and permanently deletes its scenarios and runs, so
131
+ `benchmark-delete` refuses a non-empty benchmark unless you pass `--detach-scenarios`
132
+ (unlink the scenarios first so they survive under their agent) or `--force`.
133
+
83
134
  ## Library Usage
84
135
 
85
136
  ```python
@@ -113,6 +164,11 @@ conversations = await tools.conversation_query(
113
164
  | `analytics_report` | Read standard dashboard/report analytics views |
114
165
  | `analytics_query` | Aggregate supported conversation dimensions with count |
115
166
  | `metrics_query` | Roll up named metric events |
167
+ | `benchmark_clone` | Copy all scenarios from one benchmark into another |
168
+ | `benchmark_delete` | Delete a benchmark (guards against wiping scenarios) |
169
+ | `benchmark_results` | Pass/fail/unrated tally and pass rate for a benchmark |
170
+ | `scenario_bulk_run` | Run scenarios (contact override + wait-to-completion) |
171
+ | `scenario_bulk_cancel` | Cancel a stuck bulk run's queued/running scenario runs |
116
172
 
117
173
  ## Examples
118
174
 
@@ -54,6 +54,57 @@ applied metrics --metric-name conversation.resolve --start 2026-04-01 --end 2026
54
54
  object. `analytics` returns grouped rows and currently supports `--metrics count`.
55
55
  Raw analytics SQL is not available through the public CLI surface.
56
56
 
57
+ ## Benchmarks & Scenarios
58
+
59
+ A **benchmark** is a named regression suite; a **scenario** is one test conversation
60
+ (built from a real `input_conversation_id`) that can belong to one or more benchmarks.
61
+ The typical loop is: build a suite → run it → review the pass rate → fix → re-run.
62
+
63
+ ```bash
64
+ # Inspect benchmarks and their scenarios
65
+ applied benchmarks --agent-id <agent_id> --format json
66
+ applied benchmark <benchmark_id> --format json
67
+ applied scenarios --benchmark-id <benchmark_id> --format json
68
+
69
+ # Build a suite
70
+ applied benchmark-create --agent-id <agent_id> --name "Cancel Regression"
71
+ applied scenario-create --input-conversation-id <conversation_id> --name "<name>" \
72
+ --benchmark-id <benchmark_id>
73
+
74
+ # Port a suite to another agent (e.g. email -> chat). Cross-agent recreates the
75
+ # scenarios under the destination agent; same-agent just tags them in.
76
+ # Dry-run by default; add --apply to write.
77
+ applied benchmark-clone <source_benchmark_id> --dest-benchmark-name "Chat Regression" \
78
+ --target-agent-id <chat_agent_id> --apply
79
+
80
+ # Run a benchmark and wait for results in one command.
81
+ # --contact-email runs as a contact that has an email, fixing
82
+ # "Email is not present in the conversation" on test conversations.
83
+ applied scenario-bulk-run --benchmark-id <benchmark_id> \
84
+ --contact-email test@example.com --wait
85
+ applied scenario-bulk-status <job_id> --include-runs --format json
86
+
87
+ # Kill a stuck bulk run (deletes its queued/running runs; finished runs preserved)
88
+ applied scenario-bulk-cancel <job_id> --apply
89
+
90
+ # Review pass/fail health (pass_status reflects the latest run per scenario)
91
+ applied benchmark-results <benchmark_id> --format json
92
+
93
+ # Rate scenarios as you evaluate
94
+ applied scenario-update <scenario_id> --pass-status pass --feedback "<note>"
95
+
96
+ # Safe delete — refuses to wipe scenarios unless you opt in
97
+ applied benchmark-delete <benchmark_id> --detach-scenarios # preserve scenarios
98
+ applied benchmark-delete <benchmark_id> --force # cascade delete
99
+
100
+ # Recover deleted benchmark/scenario rows from a local PITR export
101
+ applied scenario-recover-catalog --recovery-dir <dir> --apply
102
+ ```
103
+
104
+ Deleting a benchmark cascades and permanently deletes its scenarios and runs, so
105
+ `benchmark-delete` refuses a non-empty benchmark unless you pass `--detach-scenarios`
106
+ (unlink the scenarios first so they survive under their agent) or `--force`.
107
+
57
108
  ## Library Usage
58
109
 
59
110
  ```python
@@ -87,6 +138,11 @@ conversations = await tools.conversation_query(
87
138
  | `analytics_report` | Read standard dashboard/report analytics views |
88
139
  | `analytics_query` | Aggregate supported conversation dimensions with count |
89
140
  | `metrics_query` | Roll up named metric events |
141
+ | `benchmark_clone` | Copy all scenarios from one benchmark into another |
142
+ | `benchmark_delete` | Delete a benchmark (guards against wiping scenarios) |
143
+ | `benchmark_results` | Pass/fail/unrated tally and pass rate for a benchmark |
144
+ | `scenario_bulk_run` | Run scenarios (contact override + wait-to-completion) |
145
+ | `scenario_bulk_cancel` | Cancel a stuck bulk run's queued/running scenario runs |
90
146
 
91
147
  ## Examples
92
148
 
@@ -1540,15 +1540,26 @@ def send_message_cmd(
1540
1540
  @app.command()
1541
1541
  def benchmarks(
1542
1542
  agent_id: str = typer.Option(None, "--agent-id", help="Filter by agent ID"),
1543
+ with_results: bool = typer.Option(
1544
+ False,
1545
+ "--with-results",
1546
+ help="Include each benchmark's pass/fail/unrated tally and pass rate "
1547
+ "(one scenario fetch per benchmark) — a go/no-go portfolio view",
1548
+ ),
1543
1549
  shop_id: str = typer.Option(None, "--shop-id", help="Override shop ID"),
1544
1550
  format: str = typer.Option(
1545
1551
  "csv", "--format", "-f", help="Output format: csv or json"
1546
1552
  ),
1547
1553
  ) -> None:
1548
- """List benchmarks."""
1554
+ """List benchmarks (optionally with per-benchmark pass rates via --with-results)."""
1549
1555
  client = get_client(shop_id=shop_id)
1550
1556
  result = asyncio.run(
1551
- tools.benchmark_list(client, agent_id=agent_id, output_format=format)
1557
+ tools.benchmark_list(
1558
+ client,
1559
+ agent_id=agent_id,
1560
+ output_format=format,
1561
+ with_results=with_results,
1562
+ )
1552
1563
  )
1553
1564
  typer.echo(result)
1554
1565
 
@@ -5306,6 +5306,7 @@ async def benchmark_list(
5306
5306
  client: AppliedClient,
5307
5307
  agent_id: str | None = None,
5308
5308
  output_format: str = "csv",
5309
+ with_results: bool = False,
5309
5310
  ) -> str:
5310
5311
  """
5311
5312
  List conversation benchmarks.
@@ -5314,26 +5315,45 @@ async def benchmark_list(
5314
5315
  client: Authenticated AppliedClient
5315
5316
  agent_id: Optional - filter by agent UUID
5316
5317
  output_format: 'csv' or 'json'
5318
+ with_results: Also compute each benchmark's pass/fail/unrated tally and
5319
+ pass rate (one extra scenario fetch per benchmark) — a go/no-go
5320
+ portfolio view across all benchmarks
5317
5321
 
5318
5322
  Returns:
5319
- List of benchmarks with id, name, agent, scenario count
5323
+ List of benchmarks with id, name, agent, scenario count (and pass-rate
5324
+ columns when with_results is set)
5320
5325
  """
5321
5326
  benchmarks = await client.list_benchmarks(agent_id=agent_id)
5322
- mapped = [
5323
- {
5327
+ mapped = []
5328
+ for b in benchmarks:
5329
+ row = {
5324
5330
  "id": b.get("id"),
5325
5331
  "name": b.get("name"),
5326
5332
  "agent_name": b.get("agent", {}).get("name", ""),
5327
5333
  "scenario_count": b.get("scenario_count", 0),
5328
5334
  "description": str(b.get("description", ""))[:80],
5329
5335
  }
5330
- for b in benchmarks
5331
- ]
5336
+ if with_results:
5337
+ scenarios = await client.list_scenarios(
5338
+ benchmark_id=b.get("id"), fetch_all=True
5339
+ )
5340
+ tally = _pass_status_tally(scenarios)
5341
+ pass_rate = tally["pass_rate"]
5342
+ row["passed"] = tally["passed"]
5343
+ row["failed"] = tally["failed"]
5344
+ row["unrated"] = tally["unrated"]
5345
+ row["pass_rate"] = (
5346
+ f"{pass_rate * 100:.1f}%" if pass_rate is not None else "n/a"
5347
+ )
5348
+ mapped.append(row)
5349
+
5350
+ columns = ["id", "name", "agent_name", "scenario_count"]
5351
+ if with_results:
5352
+ columns += ["passed", "failed", "unrated", "pass_rate"]
5353
+ columns.append("description")
5332
5354
 
5333
5355
  if output_format == "csv":
5334
- return to_csv(
5335
- mapped, ["id", "name", "agent_name", "scenario_count", "description"]
5336
- )
5356
+ return to_csv(mapped, columns)
5337
5357
  return to_json(mapped)
5338
5358
 
5339
5359
 
@@ -5710,6 +5730,40 @@ async def benchmark_clone(
5710
5730
  return "\n".join(lines)
5711
5731
 
5712
5732
 
5733
+ def _pass_status_tally(scenarios: list[dict]) -> dict[str, Any]:
5734
+ """Tally scenarios by pass_status and compute the pass rate among rated.
5735
+
5736
+ Scenario pass_status from the API is the *effective* value (the latest run's
5737
+ pass_status when present, else the scenario's own), so this reflects the most
5738
+ recent run per scenario.
5739
+ """
5740
+ tally = {"pass": 0, "fail": 0, "unrated": 0}
5741
+ failing: list[dict[str, Any]] = []
5742
+ unrated: list[dict[str, Any]] = []
5743
+ for scenario in scenarios:
5744
+ status = str(scenario.get("pass_status") or "unrated").lower()
5745
+ if status not in tally:
5746
+ status = "unrated"
5747
+ tally[status] += 1
5748
+ entry = {"id": scenario.get("id"), "name": scenario.get("name")}
5749
+ if status == "fail":
5750
+ failing.append(entry)
5751
+ elif status == "unrated":
5752
+ unrated.append(entry)
5753
+
5754
+ rated = tally["pass"] + tally["fail"]
5755
+ return {
5756
+ "total": len(scenarios),
5757
+ "passed": tally["pass"],
5758
+ "failed": tally["fail"],
5759
+ "unrated": tally["unrated"],
5760
+ "rated": rated,
5761
+ "pass_rate": round(tally["pass"] / rated, 4) if rated else None,
5762
+ "failing_scenarios": failing,
5763
+ "unrated_scenarios": unrated,
5764
+ }
5765
+
5766
+
5713
5767
  async def benchmark_results(
5714
5768
  client: AppliedClient,
5715
5769
  benchmark_id: str,
@@ -5739,30 +5793,18 @@ async def benchmark_results(
5739
5793
  except AppliedAPIError as e:
5740
5794
  return _format_error(e)
5741
5795
 
5742
- tally = {"pass": 0, "fail": 0, "unrated": 0}
5743
- failing: list[dict[str, Any]] = []
5744
- unrated: list[dict[str, Any]] = []
5745
- for scenario in scenarios:
5746
- status = str(scenario.get("pass_status") or "unrated").lower()
5747
- if status not in tally:
5748
- status = "unrated"
5749
- tally[status] += 1
5750
- entry = {"id": scenario.get("id"), "name": scenario.get("name")}
5751
- if status == "fail":
5752
- failing.append(entry)
5753
- elif status == "unrated":
5754
- unrated.append(entry)
5755
-
5756
- rated = tally["pass"] + tally["fail"]
5757
- pass_rate = round(tally["pass"] / rated, 4) if rated else None
5796
+ t = _pass_status_tally(scenarios)
5797
+ failing = t["failing_scenarios"]
5798
+ unrated = t["unrated_scenarios"]
5799
+ pass_rate = t["pass_rate"]
5758
5800
  summary = {
5759
5801
  "benchmark_id": benchmark_id,
5760
5802
  "benchmark_name": benchmark.get("name"),
5761
- "total_scenarios": len(scenarios),
5762
- "passed": tally["pass"],
5763
- "failed": tally["fail"],
5764
- "unrated": tally["unrated"],
5765
- "rated": rated,
5803
+ "total_scenarios": t["total"],
5804
+ "passed": t["passed"],
5805
+ "failed": t["failed"],
5806
+ "unrated": t["unrated"],
5807
+ "rated": t["rated"],
5766
5808
  "pass_rate": pass_rate,
5767
5809
  "failing_scenarios": failing,
5768
5810
  "unrated_scenarios": unrated,
@@ -5772,16 +5814,16 @@ async def benchmark_results(
5772
5814
  return to_json(summary)
5773
5815
 
5774
5816
  pass_rate_str = (
5775
- f"{pass_rate * 100:.1f}% ({tally['pass']}/{rated} rated)"
5817
+ f"{pass_rate * 100:.1f}% ({t['passed']}/{t['rated']} rated)"
5776
5818
  if pass_rate is not None
5777
5819
  else "n/a (no rated scenarios yet)"
5778
5820
  )
5779
5821
  lines = [
5780
5822
  f"# Benchmark Results: {benchmark.get('name')} ({benchmark_id})",
5781
5823
  f"total_scenarios: {summary['total_scenarios']}",
5782
- f"passed: {tally['pass']}",
5783
- f"failed: {tally['fail']}",
5784
- f"unrated: {tally['unrated']}",
5824
+ f"passed: {t['passed']}",
5825
+ f"failed: {t['failed']}",
5826
+ f"unrated: {t['unrated']}",
5785
5827
  f"pass_rate: {pass_rate_str}",
5786
5828
  ]
5787
5829
  if failing:
@@ -43,6 +43,7 @@ class ScenariosBulkCancelInput(StrictInput):
43
43
  class BenchmarksListInput(StrictInput):
44
44
  agent_id: str | None = None
45
45
  limit: int = 50
46
+ with_results: bool = False
46
47
 
47
48
 
48
49
  class BenchmarksGetInput(StrictInput):
@@ -395,10 +396,26 @@ async def benchmarks_list_handler(
395
396
  agent_id=params.agent_id,
396
397
  limit=params.limit,
397
398
  )
399
+ payload = []
400
+ for benchmark in benchmarks:
401
+ row = _project_benchmark_payload(benchmark)
402
+ if params.with_results:
403
+ from applied_cli.tools import _pass_status_tally
404
+
405
+ scenarios = await client.list_scenarios(
406
+ benchmark_id=benchmark.get("id"), fetch_all=True
407
+ )
408
+ tally = _pass_status_tally(scenarios)
409
+ row["results"] = {
410
+ "passed": tally["passed"],
411
+ "failed": tally["failed"],
412
+ "unrated": tally["unrated"],
413
+ "pass_rate": tally["pass_rate"],
414
+ }
415
+ payload.append(row)
398
416
  except AppliedAPIError as exc:
399
417
  return _api_error_result(exc)
400
418
 
401
- payload = [_project_benchmark_payload(benchmark) for benchmark in benchmarks]
402
419
  return ToolResult(
403
420
  data=payload,
404
421
  summary=_count_summary(len(payload), "benchmark"),
@@ -991,7 +1008,11 @@ def scenario_specs() -> list[ToolSpec]:
991
1008
  ToolSpec(
992
1009
  name="benchmarks_list",
993
1010
  namespace="benchmarks",
994
- description="List conversation benchmarks as structured rows.",
1011
+ description=(
1012
+ "List conversation benchmarks as structured rows. Set "
1013
+ "with_results=true for each benchmark's pass/fail/unrated tally "
1014
+ "and pass rate (a go/no-go portfolio view)."
1015
+ ),
995
1016
  input_model=BenchmarksListInput,
996
1017
  output_model=None,
997
1018
  handler=benchmarks_list_handler,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: applied-cli
3
- Version: 0.6.6
3
+ Version: 0.6.8
4
4
  Summary: CLI and shared client library for Applied Labs AI support agents
5
5
  Author: Applied Labs
6
6
  License-Expression: MIT
@@ -80,6 +80,57 @@ applied metrics --metric-name conversation.resolve --start 2026-04-01 --end 2026
80
80
  object. `analytics` returns grouped rows and currently supports `--metrics count`.
81
81
  Raw analytics SQL is not available through the public CLI surface.
82
82
 
83
+ ## Benchmarks & Scenarios
84
+
85
+ A **benchmark** is a named regression suite; a **scenario** is one test conversation
86
+ (built from a real `input_conversation_id`) that can belong to one or more benchmarks.
87
+ The typical loop is: build a suite → run it → review the pass rate → fix → re-run.
88
+
89
+ ```bash
90
+ # Inspect benchmarks and their scenarios
91
+ applied benchmarks --agent-id <agent_id> --format json
92
+ applied benchmark <benchmark_id> --format json
93
+ applied scenarios --benchmark-id <benchmark_id> --format json
94
+
95
+ # Build a suite
96
+ applied benchmark-create --agent-id <agent_id> --name "Cancel Regression"
97
+ applied scenario-create --input-conversation-id <conversation_id> --name "<name>" \
98
+ --benchmark-id <benchmark_id>
99
+
100
+ # Port a suite to another agent (e.g. email -> chat). Cross-agent recreates the
101
+ # scenarios under the destination agent; same-agent just tags them in.
102
+ # Dry-run by default; add --apply to write.
103
+ applied benchmark-clone <source_benchmark_id> --dest-benchmark-name "Chat Regression" \
104
+ --target-agent-id <chat_agent_id> --apply
105
+
106
+ # Run a benchmark and wait for results in one command.
107
+ # --contact-email runs as a contact that has an email, fixing
108
+ # "Email is not present in the conversation" on test conversations.
109
+ applied scenario-bulk-run --benchmark-id <benchmark_id> \
110
+ --contact-email test@example.com --wait
111
+ applied scenario-bulk-status <job_id> --include-runs --format json
112
+
113
+ # Kill a stuck bulk run (deletes its queued/running runs; finished runs preserved)
114
+ applied scenario-bulk-cancel <job_id> --apply
115
+
116
+ # Review pass/fail health (pass_status reflects the latest run per scenario)
117
+ applied benchmark-results <benchmark_id> --format json
118
+
119
+ # Rate scenarios as you evaluate
120
+ applied scenario-update <scenario_id> --pass-status pass --feedback "<note>"
121
+
122
+ # Safe delete — refuses to wipe scenarios unless you opt in
123
+ applied benchmark-delete <benchmark_id> --detach-scenarios # preserve scenarios
124
+ applied benchmark-delete <benchmark_id> --force # cascade delete
125
+
126
+ # Recover deleted benchmark/scenario rows from a local PITR export
127
+ applied scenario-recover-catalog --recovery-dir <dir> --apply
128
+ ```
129
+
130
+ Deleting a benchmark cascades and permanently deletes its scenarios and runs, so
131
+ `benchmark-delete` refuses a non-empty benchmark unless you pass `--detach-scenarios`
132
+ (unlink the scenarios first so they survive under their agent) or `--force`.
133
+
83
134
  ## Library Usage
84
135
 
85
136
  ```python
@@ -113,6 +164,11 @@ conversations = await tools.conversation_query(
113
164
  | `analytics_report` | Read standard dashboard/report analytics views |
114
165
  | `analytics_query` | Aggregate supported conversation dimensions with count |
115
166
  | `metrics_query` | Roll up named metric events |
167
+ | `benchmark_clone` | Copy all scenarios from one benchmark into another |
168
+ | `benchmark_delete` | Delete a benchmark (guards against wiping scenarios) |
169
+ | `benchmark_results` | Pass/fail/unrated tally and pass rate for a benchmark |
170
+ | `scenario_bulk_run` | Run scenarios (contact override + wait-to-completion) |
171
+ | `scenario_bulk_cancel` | Cancel a stuck bulk run's queued/running scenario runs |
116
172
 
117
173
  ## Examples
118
174
 
@@ -40,6 +40,7 @@ tests/test_audit_tools.py
40
40
  tests/test_auth_context.py
41
41
  tests/test_benchmark_clone.py
42
42
  tests/test_benchmark_delete_guardrail.py
43
+ tests/test_benchmark_list_with_results.py
43
44
  tests/test_benchmark_results.py
44
45
  tests/test_benchmark_scenario_tools.py
45
46
  tests/test_cli.py
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "applied-cli"
3
- version = "0.6.6"
3
+ version = "0.6.8"
4
4
  description = "CLI and shared client library for Applied Labs AI support agents"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.11"
@@ -0,0 +1,104 @@
1
+ import json
2
+
3
+ import pytest
4
+
5
+ from applied_cli import tools
6
+
7
+ BENCHMARKS = [
8
+ {"id": "b1", "name": "Cancel", "agent": {"name": "August"}, "scenario_count": 3},
9
+ {"id": "b2", "name": "Refund", "agent": {"name": "August"}, "scenario_count": 1},
10
+ ]
11
+
12
+ SCENARIOS_BY_BENCHMARK = {
13
+ "b1": [
14
+ {"id": "s1", "name": "a", "pass_status": "pass"},
15
+ {"id": "s2", "name": "b", "pass_status": "fail"},
16
+ {"id": "s3", "name": "c", "pass_status": "unrated"},
17
+ ],
18
+ "b2": [{"id": "s4", "name": "d", "pass_status": "pass"}],
19
+ }
20
+
21
+
22
+ class FakeListClient:
23
+ def __init__(self):
24
+ self.list_scenarios_calls = 0
25
+
26
+ async def list_benchmarks(self, agent_id=None, limit=50):
27
+ return list(BENCHMARKS)
28
+
29
+ async def list_scenarios(self, benchmark_id=None, fetch_all=True, **kwargs):
30
+ self.list_scenarios_calls += 1
31
+ return list(SCENARIOS_BY_BENCHMARK.get(benchmark_id, []))
32
+
33
+
34
+ @pytest.mark.asyncio
35
+ async def test_list_without_results_does_not_fetch_scenarios():
36
+ client = FakeListClient()
37
+ out = await tools.benchmark_list(client, output_format="json")
38
+ rows = json.loads(out)
39
+ assert client.list_scenarios_calls == 0
40
+ assert "pass_rate" not in rows[0]
41
+
42
+
43
+ @pytest.mark.asyncio
44
+ async def test_list_with_results_adds_pass_rate_per_benchmark():
45
+ client = FakeListClient()
46
+ out = await tools.benchmark_list(
47
+ client, output_format="json", with_results=True
48
+ )
49
+ rows = {r["id"]: r for r in json.loads(out)}
50
+ # One scenario fetch per benchmark.
51
+ assert client.list_scenarios_calls == 2
52
+ # b1: 1 pass / 2 rated = 50%
53
+ assert rows["b1"]["passed"] == 1
54
+ assert rows["b1"]["failed"] == 1
55
+ assert rows["b1"]["unrated"] == 1
56
+ assert rows["b1"]["pass_rate"] == "50.0%"
57
+ # b2: 1 pass / 1 rated = 100%
58
+ assert rows["b2"]["pass_rate"] == "100.0%"
59
+
60
+
61
+ @pytest.mark.asyncio
62
+ async def test_list_with_results_csv_has_columns():
63
+ client = FakeListClient()
64
+ out = await tools.benchmark_list(client, output_format="csv", with_results=True)
65
+ header = out.splitlines()[0]
66
+ for col in ("passed", "failed", "unrated", "pass_rate"):
67
+ assert col in header
68
+
69
+
70
+ def test_pass_status_tally_pure():
71
+ tally = tools._pass_status_tally(
72
+ [
73
+ {"id": "1", "pass_status": "pass"},
74
+ {"id": "2", "pass_status": "PASS"}, # case-insensitive
75
+ {"id": "3", "pass_status": "fail"},
76
+ {"id": "4"}, # missing -> unrated
77
+ ]
78
+ )
79
+ assert tally["passed"] == 2
80
+ assert tally["failed"] == 1
81
+ assert tally["unrated"] == 1
82
+ assert tally["rated"] == 3
83
+ assert tally["pass_rate"] == round(2 / 3, 4)
84
+
85
+
86
+ def test_pass_status_tally_no_rated():
87
+ tally = tools._pass_status_tally([{"id": "1", "pass_status": "unrated"}])
88
+ assert tally["pass_rate"] is None
89
+
90
+
91
+ @pytest.mark.asyncio
92
+ async def test_v2_benchmarks_list_with_results():
93
+ from applied_cli.v2.scenarios import (
94
+ BenchmarksListInput,
95
+ benchmarks_list_handler,
96
+ )
97
+
98
+ client = FakeListClient()
99
+ result = await benchmarks_list_handler(
100
+ client, BenchmarksListInput(with_results=True)
101
+ )
102
+ by_id = {r["id"]: r for r in result.data}
103
+ assert by_id["b1"]["results"]["passed"] == 1
104
+ assert by_id["b2"]["results"]["pass_rate"] == 1.0
File without changes