applied-cli 0.6.5__tar.gz → 0.6.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. {applied_cli-0.6.5 → applied_cli-0.6.7}/PKG-INFO +57 -1
  2. {applied_cli-0.6.5 → applied_cli-0.6.7}/README.md +56 -0
  3. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/cli.py +16 -0
  4. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/tools.py +87 -0
  5. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/domains.py +1 -0
  6. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/scenarios.py +55 -0
  7. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli.egg-info/PKG-INFO +57 -1
  8. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli.egg-info/SOURCES.txt +1 -0
  9. {applied_cli-0.6.5 → applied_cli-0.6.7}/pyproject.toml +1 -1
  10. applied_cli-0.6.7/tests/test_benchmark_results.py +78 -0
  11. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/__init__.py +0 -0
  12. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/agent_scoped_flows.py +0 -0
  13. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/auth.py +0 -0
  14. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/client.py +0 -0
  15. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/conversation_lookup.py +0 -0
  16. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/conversations.py +0 -0
  17. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/credentials.py +0 -0
  18. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/flow_helpers.py +0 -0
  19. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/formatters.py +0 -0
  20. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/mcp.py +0 -0
  21. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/recovery.py +0 -0
  22. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/toolkit.py +0 -0
  23. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/__init__.py +0 -0
  24. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/agents.py +0 -0
  25. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/articles.py +0 -0
  26. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/catalog.py +0 -0
  27. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/connectors.py +0 -0
  28. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/content.py +0 -0
  29. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/conversations.py +0 -0
  30. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/flows.py +0 -0
  31. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/knowledge.py +0 -0
  32. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/manifest.py +0 -0
  33. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/products.py +0 -0
  34. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/taxonomy.py +0 -0
  35. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/tickets.py +0 -0
  36. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli.egg-info/dependency_links.txt +0 -0
  37. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli.egg-info/entry_points.txt +0 -0
  38. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli.egg-info/requires.txt +0 -0
  39. {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli.egg-info/top_level.txt +0 -0
  40. {applied_cli-0.6.5 → applied_cli-0.6.7}/setup.cfg +0 -0
  41. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_agent_scoped_flows.py +0 -0
  42. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_audit_tools.py +0 -0
  43. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_auth_context.py +0 -0
  44. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_benchmark_clone.py +0 -0
  45. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_benchmark_delete_guardrail.py +0 -0
  46. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_benchmark_scenario_tools.py +0 -0
  47. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_cli.py +0 -0
  48. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_cli_v2.py +0 -0
  49. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_client.py +0 -0
  50. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_client_v2.py +0 -0
  51. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_conversation_tools.py +0 -0
  52. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_flow_tools.py +0 -0
  53. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_knowledge_content_tools.py +0 -0
  54. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_recovery.py +0 -0
  55. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_scenario_bulk_cancel.py +0 -0
  56. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_scenario_bulk_run_contact.py +0 -0
  57. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_scenario_bulk_run_wait.py +0 -0
  58. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_toolkit_contract.py +0 -0
  59. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_agents.py +0 -0
  60. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_articles.py +0 -0
  61. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_catalog_and_mcp.py +0 -0
  62. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_connectors.py +0 -0
  63. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_content.py +0 -0
  64. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_conversations.py +0 -0
  65. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_flows.py +0 -0
  66. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_knowledge.py +0 -0
  67. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_products.py +0 -0
  68. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_scenarios.py +0 -0
  69. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_taxonomy.py +0 -0
  70. {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_tickets.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: applied-cli
3
- Version: 0.6.5
3
+ Version: 0.6.7
4
4
  Summary: CLI and shared client library for Applied Labs AI support agents
5
5
  Author: Applied Labs
6
6
  License-Expression: MIT
@@ -80,6 +80,57 @@ applied metrics --metric-name conversation.resolve --start 2026-04-01 --end 2026
80
80
  object. `analytics` returns grouped rows and currently supports `--metrics count`.
81
81
  Raw analytics SQL is not available through the public CLI surface.
82
82
 
83
+ ## Benchmarks & Scenarios
84
+
85
+ A **benchmark** is a named regression suite; a **scenario** is one test conversation
86
+ (built from a real `input_conversation_id`) that can belong to one or more benchmarks.
87
+ The typical loop is: build a suite → run it → review the pass rate → fix → re-run.
88
+
89
+ ```bash
90
+ # Inspect benchmarks and their scenarios
91
+ applied benchmarks --agent-id <agent_id> --format json
92
+ applied benchmark <benchmark_id> --format json
93
+ applied scenarios --benchmark-id <benchmark_id> --format json
94
+
95
+ # Build a suite
96
+ applied benchmark-create --agent-id <agent_id> --name "Cancel Regression"
97
+ applied scenario-create --input-conversation-id <conversation_id> --name "<name>" \
98
+ --benchmark-id <benchmark_id>
99
+
100
+ # Port a suite to another agent (e.g. email -> chat). Cross-agent recreates the
101
+ # scenarios under the destination agent; same-agent just tags them in.
102
+ # Dry-run by default; add --apply to write.
103
+ applied benchmark-clone <source_benchmark_id> --dest-benchmark-name "Chat Regression" \
104
+ --target-agent-id <chat_agent_id> --apply
105
+
106
+ # Run a benchmark and wait for results in one command.
107
+ # --contact-email runs as a contact that has an email, fixing
108
+ # "Email is not present in the conversation" on test conversations.
109
+ applied scenario-bulk-run --benchmark-id <benchmark_id> \
110
+ --contact-email test@example.com --wait
111
+ applied scenario-bulk-status <job_id> --include-runs --format json
112
+
113
+ # Kill a stuck bulk run (deletes its queued/running runs; finished runs preserved)
114
+ applied scenario-bulk-cancel <job_id> --apply
115
+
116
+ # Review pass/fail health (pass_status reflects the latest run per scenario)
117
+ applied benchmark-results <benchmark_id> --format json
118
+
119
+ # Rate scenarios as you evaluate
120
+ applied scenario-update <scenario_id> --pass-status pass --feedback "<note>"
121
+
122
+ # Safe delete — refuses to wipe scenarios unless you opt in
123
+ applied benchmark-delete <benchmark_id> --detach-scenarios # preserve scenarios
124
+ applied benchmark-delete <benchmark_id> --force # cascade delete
125
+
126
+ # Recover deleted benchmark/scenario rows from a local PITR export
127
+ applied scenario-recover-catalog --recovery-dir <dir> --apply
128
+ ```
129
+
130
+ Deleting a benchmark cascades and permanently deletes its scenarios and runs, so
131
+ `benchmark-delete` refuses a non-empty benchmark unless you pass `--detach-scenarios`
132
+ (unlink the scenarios first so they survive under their agent) or `--force`.
133
+
83
134
  ## Library Usage
84
135
 
85
136
  ```python
@@ -113,6 +164,11 @@ conversations = await tools.conversation_query(
113
164
  | `analytics_report` | Read standard dashboard/report analytics views |
114
165
  | `analytics_query` | Aggregate supported conversation dimensions with count |
115
166
  | `metrics_query` | Roll up named metric events |
167
+ | `benchmark_clone` | Copy all scenarios from one benchmark into another |
168
+ | `benchmark_delete` | Delete a benchmark (guards against wiping scenarios) |
169
+ | `benchmark_results` | Pass/fail/unrated tally and pass rate for a benchmark |
170
+ | `scenario_bulk_run` | Run scenarios (contact override + wait-to-completion) |
171
+ | `scenario_bulk_cancel` | Cancel a stuck bulk run's queued/running scenario runs |
116
172
 
117
173
  ## Examples
118
174
 
@@ -54,6 +54,57 @@ applied metrics --metric-name conversation.resolve --start 2026-04-01 --end 2026
54
54
  object. `analytics` returns grouped rows and currently supports `--metrics count`.
55
55
  Raw analytics SQL is not available through the public CLI surface.
56
56
 
57
+ ## Benchmarks & Scenarios
58
+
59
+ A **benchmark** is a named regression suite; a **scenario** is one test conversation
60
+ (built from a real `input_conversation_id`) that can belong to one or more benchmarks.
61
+ The typical loop is: build a suite → run it → review the pass rate → fix → re-run.
62
+
63
+ ```bash
64
+ # Inspect benchmarks and their scenarios
65
+ applied benchmarks --agent-id <agent_id> --format json
66
+ applied benchmark <benchmark_id> --format json
67
+ applied scenarios --benchmark-id <benchmark_id> --format json
68
+
69
+ # Build a suite
70
+ applied benchmark-create --agent-id <agent_id> --name "Cancel Regression"
71
+ applied scenario-create --input-conversation-id <conversation_id> --name "<name>" \
72
+ --benchmark-id <benchmark_id>
73
+
74
+ # Port a suite to another agent (e.g. email -> chat). Cross-agent recreates the
75
+ # scenarios under the destination agent; same-agent just tags them in.
76
+ # Dry-run by default; add --apply to write.
77
+ applied benchmark-clone <source_benchmark_id> --dest-benchmark-name "Chat Regression" \
78
+ --target-agent-id <chat_agent_id> --apply
79
+
80
+ # Run a benchmark and wait for results in one command.
81
+ # --contact-email runs as a contact that has an email, fixing
82
+ # "Email is not present in the conversation" on test conversations.
83
+ applied scenario-bulk-run --benchmark-id <benchmark_id> \
84
+ --contact-email test@example.com --wait
85
+ applied scenario-bulk-status <job_id> --include-runs --format json
86
+
87
+ # Kill a stuck bulk run (deletes its queued/running runs; finished runs preserved)
88
+ applied scenario-bulk-cancel <job_id> --apply
89
+
90
+ # Review pass/fail health (pass_status reflects the latest run per scenario)
91
+ applied benchmark-results <benchmark_id> --format json
92
+
93
+ # Rate scenarios as you evaluate
94
+ applied scenario-update <scenario_id> --pass-status pass --feedback "<note>"
95
+
96
+ # Safe delete — refuses to wipe scenarios unless you opt in
97
+ applied benchmark-delete <benchmark_id> --detach-scenarios # preserve scenarios
98
+ applied benchmark-delete <benchmark_id> --force # cascade delete
99
+
100
+ # Recover deleted benchmark/scenario rows from a local PITR export
101
+ applied scenario-recover-catalog --recovery-dir <dir> --apply
102
+ ```
103
+
104
+ Deleting a benchmark cascades and permanently deletes its scenarios and runs, so
105
+ `benchmark-delete` refuses a non-empty benchmark unless you pass `--detach-scenarios`
106
+ (unlink the scenarios first so they survive under their agent) or `--force`.
107
+
57
108
  ## Library Usage
58
109
 
59
110
  ```python
@@ -87,6 +138,11 @@ conversations = await tools.conversation_query(
87
138
  | `analytics_report` | Read standard dashboard/report analytics views |
88
139
  | `analytics_query` | Aggregate supported conversation dimensions with count |
89
140
  | `metrics_query` | Roll up named metric events |
141
+ | `benchmark_clone` | Copy all scenarios from one benchmark into another |
142
+ | `benchmark_delete` | Delete a benchmark (guards against wiping scenarios) |
143
+ | `benchmark_results` | Pass/fail/unrated tally and pass rate for a benchmark |
144
+ | `scenario_bulk_run` | Run scenarios (contact override + wait-to-completion) |
145
+ | `scenario_bulk_cancel` | Cancel a stuck bulk run's queued/running scenario runs |
90
146
 
91
147
  ## Examples
92
148
 
@@ -1672,6 +1672,22 @@ def benchmark_delete(
1672
1672
  typer.echo(result)
1673
1673
 
1674
1674
 
1675
@app.command("benchmark-results")
def benchmark_results(
    id: str = typer.Argument(..., help="Benchmark ID"),
    shop_id: str = typer.Option(None, "--shop-id", help="Override shop ID"),
    format: str = typer.Option(
        "text", "--format", "-f", help="Output format: text or json"
    ),
) -> None:
    """Summarize a benchmark's pass/fail/unrated health and pass rate."""
    # `id` and `format` shadow builtins, but they are this command's public
    # parameter names, so they are kept as-is.
    # Build the coroutine first, then drive it to completion synchronously;
    # the shared tool returns an already formatted string to print.
    report = tools.benchmark_results(
        get_client(shop_id=shop_id),
        benchmark_id=id,
        output_format=format,
    )
    typer.echo(asyncio.run(report))
1689
+
1690
+
1675
1691
  @app.command()
1676
1692
  def scenarios(
1677
1693
  benchmark_id: str = typer.Option(
@@ -5710,6 +5710,93 @@ async def benchmark_clone(
5710
5710
  return "\n".join(lines)
5711
5711
 
5712
5712
 
5713
def _benchmark_results_section(
    heading: str,
    entries: list[dict[str, Any]],
    limit: int = 50,
) -> list[str]:
    """Render one capped bullet list of scenarios for the text report."""
    section = [heading]
    section.extend(f" - {s['name']} ({s['id']})" for s in entries[:limit])
    if len(entries) > limit:
        section.append(f" ... and {len(entries) - limit} more")
    return section


async def benchmark_results(
    client: AppliedClient,
    benchmark_id: str,
    *,
    output_format: str = "text",
) -> str:
    """
    Summarize a benchmark's pass/fail health.

    Tallies the pass_status across the benchmark's scenarios (pass / fail /
    unrated), computes the pass rate among rated scenarios, and lists the failing
    and still-unrated scenarios so you know what to fix or evaluate next.

    A missing, empty, or unrecognized pass_status counts as unrated. Statuses
    are compared case-insensitively with surrounding whitespace ignored.

    Args:
        client: Authenticated AppliedClient
        benchmark_id: The benchmark UUID
        output_format: 'text' (default) or 'json'

    Returns:
        Pass-rate summary with failing and unrated scenario lists, or the
        formatted error when the API call raises AppliedAPIError.
    """
    try:
        benchmark = await client.get_benchmark(benchmark_id)
        scenarios = await client.list_scenarios(
            benchmark_id=benchmark_id, fetch_all=True
        )
    except AppliedAPIError as e:
        return _format_error(e)

    # An empty/None payload means "nothing to report"; normalise it here so the
    # attribute access, iteration, and len() below cannot crash.
    benchmark_name = (benchmark or {}).get("name")
    scenarios = list(scenarios or [])

    tally = {"pass": 0, "fail": 0, "unrated": 0}
    failing: list[dict[str, Any]] = []
    unrated: list[dict[str, Any]] = []
    for scenario in scenarios:
        status = str(scenario.get("pass_status") or "unrated").strip().lower()
        if status not in tally:
            # Unknown values are folded into "unrated" rather than dropped so
            # the three buckets always add up to total_scenarios.
            status = "unrated"
        tally[status] += 1
        entry = {"id": scenario.get("id"), "name": scenario.get("name")}
        if status == "fail":
            failing.append(entry)
        elif status == "unrated":
            unrated.append(entry)

    # The pass rate is computed over rated scenarios only; it is None (not 0)
    # when nothing has been rated yet.
    rated = tally["pass"] + tally["fail"]
    pass_rate = round(tally["pass"] / rated, 4) if rated else None
    summary = {
        "benchmark_id": benchmark_id,
        "benchmark_name": benchmark_name,
        "total_scenarios": len(scenarios),
        "passed": tally["pass"],
        "failed": tally["fail"],
        "unrated": tally["unrated"],
        "rated": rated,
        "pass_rate": pass_rate,
        "failing_scenarios": failing,
        "unrated_scenarios": unrated,
    }

    if output_format == "json":
        return to_json(summary)

    pass_rate_str = (
        f"{pass_rate * 100:.1f}% ({tally['pass']}/{rated} rated)"
        if pass_rate is not None
        else "n/a (no rated scenarios yet)"
    )
    lines = [
        f"# Benchmark Results: {benchmark_name} ({benchmark_id})",
        f"total_scenarios: {summary['total_scenarios']}",
        f"passed: {tally['pass']}",
        f"failed: {tally['fail']}",
        f"unrated: {tally['unrated']}",
        f"pass_rate: {pass_rate_str}",
    ]
    if failing:
        lines.extend(
            _benchmark_results_section(f"\n# Failing ({len(failing)})", failing)
        )
    if unrated:
        lines.extend(
            _benchmark_results_section(
                f"\n# Unrated ({len(unrated)}) — evaluate these next", unrated
            )
        )
    return "\n".join(lines)
5798
+
5799
+
5713
5800
  # -----------------------------------------------------------------------------
5714
5801
  # Scenarios
5715
5802
  # -----------------------------------------------------------------------------
@@ -43,6 +43,7 @@ DOMAIN_TOOL_RENAMES: dict[str, dict[str, str]] = {
43
43
  "benchmark_create": "benchmarks_create",
44
44
  "benchmark_delete": "benchmarks_delete",
45
45
  "benchmark_clone": "benchmarks_clone",
46
+ "benchmark_results": "benchmarks_results",
46
47
  },
47
48
  "connectors": {
48
49
  "connector_types": "connectors_types_list",
@@ -71,6 +71,10 @@ class BenchmarksCloneInput(StrictInput):
71
71
  apply: bool = False
72
72
 
73
73
 
74
# Input schema for the `benchmarks_results` tool: the handler reads only
# `benchmark_id` and forwards it to the legacy `benchmark_results` summary.
class BenchmarksResultsInput(StrictInput):
    benchmark_id: str  # ID of the benchmark whose scenarios are tallied
76
+
77
+
74
78
  class ScenariosListInput(StrictInput):
75
79
  benchmark_id: str | None = None
76
80
  agent_id: str | None = None
@@ -503,6 +507,44 @@ async def benchmarks_clone_handler(
503
507
  )
504
508
 
505
509
 
510
async def benchmarks_results_handler(
    client: AppliedClient,
    params: BenchmarksResultsInput,
) -> ToolResult[Any]:
    """Run the legacy ``benchmark_results`` tool and wrap it as a ToolResult.

    The legacy tool is called with ``output_format="json"`` and its string
    result is parsed. Anything that does not parse to a JSON object is passed
    through as ``{"message": raw}`` so callers still see the text.

    Args:
        client: Client forwarded unchanged to the legacy tool.
        params: Validated input carrying the ``benchmark_id``.

    Returns:
        A ToolResult whose ``data`` is the parsed summary, with a one-line
        ``summary`` and follow-up ``next_actions`` when scenarios are unrated
        or failing.
    """
    # Imported lazily, as in the original handler.
    from applied_cli import tools as legacy_tools

    raw = await legacy_tools.benchmark_results(
        client, benchmark_id=params.benchmark_id, output_format="json"
    )
    try:
        data = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        data = None
    # json.loads also succeeds for bare strings, numbers, and arrays; those
    # have no .get(), so route every non-object payload to the fallback.
    # NOTE(review): the legacy tool returns _format_error(e) on API errors; if
    # that helper ever emits a JSON object it would be summarized as an empty
    # benchmark here — confirm its output format.
    if not isinstance(data, dict):
        return ToolResult(data={"message": raw}, summary=str(raw))

    pass_rate = data.get("pass_rate")
    rate_str = (
        f"{pass_rate * 100:.1f}%" if pass_rate is not None else "n/a (no rated yet)"
    )
    next_actions: list[str] = []
    if data.get("unrated"):
        next_actions.append(
            "Rate the unrated scenarios with scenarios_update (pass_status)."
        )
    if data.get("failed"):
        next_actions.append(
            "Inspect failing scenarios with scenarios_get / conversations_debug_bundle."
        )
    return ToolResult(
        data=data,
        summary=(
            f"{data.get('benchmark_name') or params.benchmark_id}: pass rate "
            f"{rate_str} — {data.get('passed', 0)} passed, "
            f"{data.get('failed', 0)} failed, {data.get('unrated', 0)} unrated."
        ),
        next_actions=next_actions,
    )
546
+
547
+
506
548
  async def benchmarks_delete_handler(
507
549
  client: AppliedClient,
508
550
  params: BenchmarksDeleteInput,
@@ -1006,6 +1048,19 @@ def scenario_specs() -> list[ToolSpec]:
1006
1048
  read_write_mode="write",
1007
1049
  tags=["benchmark_clone", "native"],
1008
1050
  ),
1051
+ ToolSpec(
1052
+ name="benchmarks_results",
1053
+ namespace="benchmarks",
1054
+ description=(
1055
+ "Summarize a benchmark's pass/fail/unrated health and pass rate, "
1056
+ "with the failing and unrated scenario lists."
1057
+ ),
1058
+ input_model=BenchmarksResultsInput,
1059
+ output_model=None,
1060
+ handler=benchmarks_results_handler,
1061
+ read_write_mode="read",
1062
+ tags=["benchmark_results", "native"],
1063
+ ),
1009
1064
  ToolSpec(
1010
1065
  name="scenarios_list",
1011
1066
  namespace="scenarios",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: applied-cli
3
- Version: 0.6.5
3
+ Version: 0.6.7
4
4
  Summary: CLI and shared client library for Applied Labs AI support agents
5
5
  Author: Applied Labs
6
6
  License-Expression: MIT
@@ -80,6 +80,57 @@ applied metrics --metric-name conversation.resolve --start 2026-04-01 --end 2026
80
80
  object. `analytics` returns grouped rows and currently supports `--metrics count`.
81
81
  Raw analytics SQL is not available through the public CLI surface.
82
82
 
83
+ ## Benchmarks & Scenarios
84
+
85
+ A **benchmark** is a named regression suite; a **scenario** is one test conversation
86
+ (built from a real `input_conversation_id`) that can belong to one or more benchmarks.
87
+ The typical loop is: build a suite → run it → review the pass rate → fix → re-run.
88
+
89
+ ```bash
90
+ # Inspect benchmarks and their scenarios
91
+ applied benchmarks --agent-id <agent_id> --format json
92
+ applied benchmark <benchmark_id> --format json
93
+ applied scenarios --benchmark-id <benchmark_id> --format json
94
+
95
+ # Build a suite
96
+ applied benchmark-create --agent-id <agent_id> --name "Cancel Regression"
97
+ applied scenario-create --input-conversation-id <conversation_id> --name "<name>" \
98
+ --benchmark-id <benchmark_id>
99
+
100
+ # Port a suite to another agent (e.g. email -> chat). Cross-agent recreates the
101
+ # scenarios under the destination agent; same-agent just tags them in.
102
+ # Dry-run by default; add --apply to write.
103
+ applied benchmark-clone <source_benchmark_id> --dest-benchmark-name "Chat Regression" \
104
+ --target-agent-id <chat_agent_id> --apply
105
+
106
+ # Run a benchmark and wait for results in one command.
107
+ # --contact-email runs as a contact that has an email, fixing
108
+ # "Email is not present in the conversation" on test conversations.
109
+ applied scenario-bulk-run --benchmark-id <benchmark_id> \
110
+ --contact-email test@example.com --wait
111
+ applied scenario-bulk-status <job_id> --include-runs --format json
112
+
113
+ # Kill a stuck bulk run (deletes its queued/running runs; finished runs preserved)
114
+ applied scenario-bulk-cancel <job_id> --apply
115
+
116
+ # Review pass/fail health (pass_status reflects the latest run per scenario)
117
+ applied benchmark-results <benchmark_id> --format json
118
+
119
+ # Rate scenarios as you evaluate
120
+ applied scenario-update <scenario_id> --pass-status pass --feedback "<note>"
121
+
122
+ # Safe delete — refuses to wipe scenarios unless you opt in
123
+ applied benchmark-delete <benchmark_id> --detach-scenarios # preserve scenarios
124
+ applied benchmark-delete <benchmark_id> --force # cascade delete
125
+
126
+ # Recover deleted benchmark/scenario rows from a local PITR export
127
+ applied scenario-recover-catalog --recovery-dir <dir> --apply
128
+ ```
129
+
130
+ Deleting a benchmark cascades and permanently deletes its scenarios and runs, so
131
+ `benchmark-delete` refuses a non-empty benchmark unless you pass `--detach-scenarios`
132
+ (unlink the scenarios first so they survive under their agent) or `--force`.
133
+
83
134
  ## Library Usage
84
135
 
85
136
  ```python
@@ -113,6 +164,11 @@ conversations = await tools.conversation_query(
113
164
  | `analytics_report` | Read standard dashboard/report analytics views |
114
165
  | `analytics_query` | Aggregate supported conversation dimensions with count |
115
166
  | `metrics_query` | Roll up named metric events |
167
+ | `benchmark_clone` | Copy all scenarios from one benchmark into another |
168
+ | `benchmark_delete` | Delete a benchmark (guards against wiping scenarios) |
169
+ | `benchmark_results` | Pass/fail/unrated tally and pass rate for a benchmark |
170
+ | `scenario_bulk_run` | Run scenarios (contact override + wait-to-completion) |
171
+ | `scenario_bulk_cancel` | Cancel a stuck bulk run's queued/running scenario runs |
116
172
 
117
173
  ## Examples
118
174
 
@@ -40,6 +40,7 @@ tests/test_audit_tools.py
40
40
  tests/test_auth_context.py
41
41
  tests/test_benchmark_clone.py
42
42
  tests/test_benchmark_delete_guardrail.py
43
+ tests/test_benchmark_results.py
43
44
  tests/test_benchmark_scenario_tools.py
44
45
  tests/test_cli.py
45
46
  tests/test_cli_v2.py
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "applied-cli"
3
- version = "0.6.5"
3
+ version = "0.6.7"
4
4
  description = "CLI and shared client library for Applied Labs AI support agents"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.11"
@@ -0,0 +1,78 @@
1
+ import json
2
+
3
+ import pytest
4
+
5
+ from applied_cli import tools
6
+
7
# Benchmark payload the fake client returns for every benchmark ID.
BENCHMARK = {"id": "bench-1", "name": "Cancel Regression"}

# Fixture mix: two passing, one failing, one explicitly unrated, and one with
# no pass_status key at all (expected to be counted as unrated).
SCENARIOS = [
    {"id": "s1", "name": "Cancel order", "pass_status": "pass"},
    {"id": "s2", "name": "Refund flow", "pass_status": "pass"},
    {"id": "s3", "name": "Pause subscription", "pass_status": "fail"},
    {"id": "s4", "name": "Address change", "pass_status": "unrated"},
    {"id": "s5", "name": "No status field"},  # missing -> unrated
]
16
+
17
+
18
class FakeResultsClient:
    """In-memory stand-in for the two client calls benchmark_results makes."""

    def __init__(self, scenarios=None):
        # Resolve the default at call time instead of binding the module-level
        # mutable list as a default argument. An explicit empty list is kept
        # as-is, so only None falls back to SCENARIOS.
        self._scenarios = SCENARIOS if scenarios is None else scenarios

    async def get_benchmark(self, benchmark_id):
        """Return the shared BENCHMARK fixture regardless of the ID."""
        return BENCHMARK

    async def list_scenarios(self, benchmark_id=None, fetch_all=True, **kwargs):
        """Return a fresh list copy of the configured scenarios; filters are ignored."""
        return list(self._scenarios)
27
+
28
+
29
@pytest.mark.asyncio
async def test_results_tally_and_pass_rate():
    """JSON output tallies every bucket and rates passes over rated scenarios only."""
    raw = await tools.benchmark_results(
        FakeResultsClient(), "bench-1", output_format="json"
    )
    summary = json.loads(raw)

    expected_counts = {
        "total_scenarios": 5,
        "passed": 2,
        "failed": 1,
        "unrated": 2,
        "rated": 3,
    }
    assert {key: summary[key] for key in expected_counts} == expected_counts
    # 2 passed / 3 rated
    assert summary["pass_rate"] == round(2 / 3, 4)

    failing_ids = [row["id"] for row in summary["failing_scenarios"]]
    unrated_ids = {row["id"] for row in summary["unrated_scenarios"]}
    assert failing_ids == ["s3"]
    assert unrated_ids == {"s4", "s5"}
44
+
45
+
46
@pytest.mark.asyncio
async def test_results_no_rated_scenarios_pass_rate_none():
    """With only unrated scenarios the text report shows the n/a pass rate."""
    only_unrated = [{"id": "s1", "name": "A", "pass_status": "unrated"}]
    report = await tools.benchmark_results(
        FakeResultsClient(scenarios=only_unrated),
        "bench-1",
        output_format="text",
    )
    assert "n/a (no rated scenarios yet)" in report
53
+
54
+
55
@pytest.mark.asyncio
async def test_results_text_lists_failing_and_unrated():
    """Default (text) output has counted Failing and Unrated sections."""
    report = await tools.benchmark_results(FakeResultsClient(), "bench-1")
    for fragment in ("# Failing (1)", "Pause subscription", "# Unrated (2)"):
        assert fragment in report
62
+
63
+
64
@pytest.mark.asyncio
async def test_v2_benchmarks_results_handler_summary():
    """The v2 handler exposes the parsed tally, a summary line, and follow-ups."""
    from applied_cli.v2 import scenarios as v2_scenarios

    params = v2_scenarios.BenchmarksResultsInput(benchmark_id="bench-1")
    outcome = await v2_scenarios.benchmarks_results_handler(
        FakeResultsClient(), params
    )

    assert outcome.data["passed"] == 2
    assert "pass rate" in outcome.summary
    # Has unrated + failing → both follow-up actions surfaced.
    assert len(outcome.next_actions) == 2
File without changes