applied-cli 0.6.5__tar.gz → 0.6.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {applied_cli-0.6.5 → applied_cli-0.6.7}/PKG-INFO +57 -1
- {applied_cli-0.6.5 → applied_cli-0.6.7}/README.md +56 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/cli.py +16 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/tools.py +87 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/domains.py +1 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/scenarios.py +55 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli.egg-info/PKG-INFO +57 -1
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli.egg-info/SOURCES.txt +1 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/pyproject.toml +1 -1
- applied_cli-0.6.7/tests/test_benchmark_results.py +78 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/__init__.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/agent_scoped_flows.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/auth.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/client.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/conversation_lookup.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/conversations.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/credentials.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/flow_helpers.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/formatters.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/mcp.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/recovery.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/toolkit.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/__init__.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/agents.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/articles.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/catalog.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/connectors.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/content.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/conversations.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/flows.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/knowledge.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/manifest.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/products.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/taxonomy.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli/v2/tickets.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli.egg-info/dependency_links.txt +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli.egg-info/entry_points.txt +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli.egg-info/requires.txt +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/applied_cli.egg-info/top_level.txt +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/setup.cfg +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_agent_scoped_flows.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_audit_tools.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_auth_context.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_benchmark_clone.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_benchmark_delete_guardrail.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_benchmark_scenario_tools.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_cli.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_cli_v2.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_client.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_client_v2.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_conversation_tools.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_flow_tools.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_knowledge_content_tools.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_recovery.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_scenario_bulk_cancel.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_scenario_bulk_run_contact.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_scenario_bulk_run_wait.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_toolkit_contract.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_agents.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_articles.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_catalog_and_mcp.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_connectors.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_content.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_conversations.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_flows.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_knowledge.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_products.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_scenarios.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_taxonomy.py +0 -0
- {applied_cli-0.6.5 → applied_cli-0.6.7}/tests/test_v2_tickets.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: applied-cli
|
|
3
|
-
Version: 0.6.5
|
|
3
|
+
Version: 0.6.7
|
|
4
4
|
Summary: CLI and shared client library for Applied Labs AI support agents
|
|
5
5
|
Author: Applied Labs
|
|
6
6
|
License-Expression: MIT
|
|
@@ -80,6 +80,57 @@ applied metrics --metric-name conversation.resolve --start 2026-04-01 --end 2026
|
|
|
80
80
|
object. `analytics` returns grouped rows and currently supports `--metrics count`.
|
|
81
81
|
Raw analytics SQL is not available through the public CLI surface.
|
|
82
82
|
|
|
83
|
+
## Benchmarks & Scenarios
|
|
84
|
+
|
|
85
|
+
A **benchmark** is a named regression suite; a **scenario** is one test conversation
|
|
86
|
+
(built from a real `input_conversation_id`) that can belong to one or more benchmarks.
|
|
87
|
+
The typical loop is: build a suite → run it → review the pass rate → fix → re-run.
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
# Inspect benchmarks and their scenarios
|
|
91
|
+
applied benchmarks --agent-id <agent_id> --format json
|
|
92
|
+
applied benchmark <benchmark_id> --format json
|
|
93
|
+
applied scenarios --benchmark-id <benchmark_id> --format json
|
|
94
|
+
|
|
95
|
+
# Build a suite
|
|
96
|
+
applied benchmark-create --agent-id <agent_id> --name "Cancel Regression"
|
|
97
|
+
applied scenario-create --input-conversation-id <conversation_id> --name "<name>" \
|
|
98
|
+
--benchmark-id <benchmark_id>
|
|
99
|
+
|
|
100
|
+
# Port a suite to another agent (e.g. email -> chat). Cross-agent recreates the
|
|
101
|
+
# scenarios under the destination agent; same-agent just tags them in.
|
|
102
|
+
# Dry-run by default; add --apply to write.
|
|
103
|
+
applied benchmark-clone <source_benchmark_id> --dest-benchmark-name "Chat Regression" \
|
|
104
|
+
--target-agent-id <chat_agent_id> --apply
|
|
105
|
+
|
|
106
|
+
# Run a benchmark and wait for results in one command.
|
|
107
|
+
# --contact-email runs as a contact that has an email, fixing
|
|
108
|
+
# "Email is not present in the conversation" on test conversations.
|
|
109
|
+
applied scenario-bulk-run --benchmark-id <benchmark_id> \
|
|
110
|
+
--contact-email test@example.com --wait
|
|
111
|
+
applied scenario-bulk-status <job_id> --include-runs --format json
|
|
112
|
+
|
|
113
|
+
# Kill a stuck bulk run (deletes its queued/running runs; finished runs preserved)
|
|
114
|
+
applied scenario-bulk-cancel <job_id> --apply
|
|
115
|
+
|
|
116
|
+
# Review pass/fail health (pass_status reflects the latest run per scenario)
|
|
117
|
+
applied benchmark-results <benchmark_id> --format json
|
|
118
|
+
|
|
119
|
+
# Rate scenarios as you evaluate
|
|
120
|
+
applied scenario-update <scenario_id> --pass-status pass --feedback "<note>"
|
|
121
|
+
|
|
122
|
+
# Safe delete — refuses to wipe scenarios unless you opt in
|
|
123
|
+
applied benchmark-delete <benchmark_id> --detach-scenarios # preserve scenarios
|
|
124
|
+
applied benchmark-delete <benchmark_id> --force # cascade delete
|
|
125
|
+
|
|
126
|
+
# Recover deleted benchmark/scenario rows from a local PITR export
|
|
127
|
+
applied scenario-recover-catalog --recovery-dir <dir> --apply
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Deleting a benchmark cascades and permanently deletes its scenarios and runs, so
|
|
131
|
+
`benchmark-delete` refuses a non-empty benchmark unless you pass `--detach-scenarios`
|
|
132
|
+
(unlink the scenarios first so they survive under their agent) or `--force`.
|
|
133
|
+
|
|
83
134
|
## Library Usage
|
|
84
135
|
|
|
85
136
|
```python
|
|
@@ -113,6 +164,11 @@ conversations = await tools.conversation_query(
|
|
|
113
164
|
| `analytics_report` | Read standard dashboard/report analytics views |
|
|
114
165
|
| `analytics_query` | Aggregate supported conversation dimensions with count |
|
|
115
166
|
| `metrics_query` | Roll up named metric events |
|
|
167
|
+
| `benchmark_clone` | Copy all scenarios from one benchmark into another |
|
|
168
|
+
| `benchmark_delete` | Delete a benchmark (guards against wiping scenarios) |
|
|
169
|
+
| `benchmark_results` | Pass/fail/unrated tally and pass rate for a benchmark |
|
|
170
|
+
| `scenario_bulk_run` | Run scenarios (contact override + wait-to-completion) |
|
|
171
|
+
| `scenario_bulk_cancel` | Cancel a stuck bulk run's queued/running scenario runs |
|
|
116
172
|
|
|
117
173
|
## Examples
|
|
118
174
|
|
|
@@ -54,6 +54,57 @@ applied metrics --metric-name conversation.resolve --start 2026-04-01 --end 2026
|
|
|
54
54
|
object. `analytics` returns grouped rows and currently supports `--metrics count`.
|
|
55
55
|
Raw analytics SQL is not available through the public CLI surface.
|
|
56
56
|
|
|
57
|
+
## Benchmarks & Scenarios
|
|
58
|
+
|
|
59
|
+
A **benchmark** is a named regression suite; a **scenario** is one test conversation
|
|
60
|
+
(built from a real `input_conversation_id`) that can belong to one or more benchmarks.
|
|
61
|
+
The typical loop is: build a suite → run it → review the pass rate → fix → re-run.
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# Inspect benchmarks and their scenarios
|
|
65
|
+
applied benchmarks --agent-id <agent_id> --format json
|
|
66
|
+
applied benchmark <benchmark_id> --format json
|
|
67
|
+
applied scenarios --benchmark-id <benchmark_id> --format json
|
|
68
|
+
|
|
69
|
+
# Build a suite
|
|
70
|
+
applied benchmark-create --agent-id <agent_id> --name "Cancel Regression"
|
|
71
|
+
applied scenario-create --input-conversation-id <conversation_id> --name "<name>" \
|
|
72
|
+
--benchmark-id <benchmark_id>
|
|
73
|
+
|
|
74
|
+
# Port a suite to another agent (e.g. email -> chat). Cross-agent recreates the
|
|
75
|
+
# scenarios under the destination agent; same-agent just tags them in.
|
|
76
|
+
# Dry-run by default; add --apply to write.
|
|
77
|
+
applied benchmark-clone <source_benchmark_id> --dest-benchmark-name "Chat Regression" \
|
|
78
|
+
--target-agent-id <chat_agent_id> --apply
|
|
79
|
+
|
|
80
|
+
# Run a benchmark and wait for results in one command.
|
|
81
|
+
# --contact-email runs as a contact that has an email, fixing
|
|
82
|
+
# "Email is not present in the conversation" on test conversations.
|
|
83
|
+
applied scenario-bulk-run --benchmark-id <benchmark_id> \
|
|
84
|
+
--contact-email test@example.com --wait
|
|
85
|
+
applied scenario-bulk-status <job_id> --include-runs --format json
|
|
86
|
+
|
|
87
|
+
# Kill a stuck bulk run (deletes its queued/running runs; finished runs preserved)
|
|
88
|
+
applied scenario-bulk-cancel <job_id> --apply
|
|
89
|
+
|
|
90
|
+
# Review pass/fail health (pass_status reflects the latest run per scenario)
|
|
91
|
+
applied benchmark-results <benchmark_id> --format json
|
|
92
|
+
|
|
93
|
+
# Rate scenarios as you evaluate
|
|
94
|
+
applied scenario-update <scenario_id> --pass-status pass --feedback "<note>"
|
|
95
|
+
|
|
96
|
+
# Safe delete — refuses to wipe scenarios unless you opt in
|
|
97
|
+
applied benchmark-delete <benchmark_id> --detach-scenarios # preserve scenarios
|
|
98
|
+
applied benchmark-delete <benchmark_id> --force # cascade delete
|
|
99
|
+
|
|
100
|
+
# Recover deleted benchmark/scenario rows from a local PITR export
|
|
101
|
+
applied scenario-recover-catalog --recovery-dir <dir> --apply
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Deleting a benchmark cascades and permanently deletes its scenarios and runs, so
|
|
105
|
+
`benchmark-delete` refuses a non-empty benchmark unless you pass `--detach-scenarios`
|
|
106
|
+
(unlink the scenarios first so they survive under their agent) or `--force`.
|
|
107
|
+
|
|
57
108
|
## Library Usage
|
|
58
109
|
|
|
59
110
|
```python
|
|
@@ -87,6 +138,11 @@ conversations = await tools.conversation_query(
|
|
|
87
138
|
| `analytics_report` | Read standard dashboard/report analytics views |
|
|
88
139
|
| `analytics_query` | Aggregate supported conversation dimensions with count |
|
|
89
140
|
| `metrics_query` | Roll up named metric events |
|
|
141
|
+
| `benchmark_clone` | Copy all scenarios from one benchmark into another |
|
|
142
|
+
| `benchmark_delete` | Delete a benchmark (guards against wiping scenarios) |
|
|
143
|
+
| `benchmark_results` | Pass/fail/unrated tally and pass rate for a benchmark |
|
|
144
|
+
| `scenario_bulk_run` | Run scenarios (contact override + wait-to-completion) |
|
|
145
|
+
| `scenario_bulk_cancel` | Cancel a stuck bulk run's queued/running scenario runs |
|
|
90
146
|
|
|
91
147
|
## Examples
|
|
92
148
|
|
|
@@ -1672,6 +1672,22 @@ def benchmark_delete(
|
|
|
1672
1672
|
typer.echo(result)
|
|
1673
1673
|
|
|
1674
1674
|
|
|
1675
|
+
@app.command("benchmark-results")
|
|
1676
|
+
def benchmark_results(
|
|
1677
|
+
id: str = typer.Argument(..., help="Benchmark ID"),
|
|
1678
|
+
shop_id: str = typer.Option(None, "--shop-id", help="Override shop ID"),
|
|
1679
|
+
format: str = typer.Option(
|
|
1680
|
+
"text", "--format", "-f", help="Output format: text or json"
|
|
1681
|
+
),
|
|
1682
|
+
) -> None:
|
|
1683
|
+
"""Summarize a benchmark's pass/fail/unrated health and pass rate."""
|
|
1684
|
+
client = get_client(shop_id=shop_id)
|
|
1685
|
+
result = asyncio.run(
|
|
1686
|
+
tools.benchmark_results(client, benchmark_id=id, output_format=format)
|
|
1687
|
+
)
|
|
1688
|
+
typer.echo(result)
|
|
1689
|
+
|
|
1690
|
+
|
|
1675
1691
|
@app.command()
|
|
1676
1692
|
def scenarios(
|
|
1677
1693
|
benchmark_id: str = typer.Option(
|
|
@@ -5710,6 +5710,93 @@ async def benchmark_clone(
|
|
|
5710
5710
|
return "\n".join(lines)
|
|
5711
5711
|
|
|
5712
5712
|
|
|
5713
|
+
async def benchmark_results(
|
|
5714
|
+
client: AppliedClient,
|
|
5715
|
+
benchmark_id: str,
|
|
5716
|
+
*,
|
|
5717
|
+
output_format: str = "text",
|
|
5718
|
+
) -> str:
|
|
5719
|
+
"""
|
|
5720
|
+
Summarize a benchmark's pass/fail health.
|
|
5721
|
+
|
|
5722
|
+
Tallies the pass_status across the benchmark's scenarios (pass / fail /
|
|
5723
|
+
unrated), computes the pass rate among rated scenarios, and lists the failing
|
|
5724
|
+
and still-unrated scenarios so you know what to fix or evaluate next.
|
|
5725
|
+
|
|
5726
|
+
Args:
|
|
5727
|
+
client: Authenticated AppliedClient
|
|
5728
|
+
benchmark_id: The benchmark UUID
|
|
5729
|
+
output_format: 'text' (default) or 'json'
|
|
5730
|
+
|
|
5731
|
+
Returns:
|
|
5732
|
+
Pass-rate summary with failing and unrated scenario lists.
|
|
5733
|
+
"""
|
|
5734
|
+
try:
|
|
5735
|
+
benchmark = await client.get_benchmark(benchmark_id)
|
|
5736
|
+
scenarios = await client.list_scenarios(
|
|
5737
|
+
benchmark_id=benchmark_id, fetch_all=True
|
|
5738
|
+
)
|
|
5739
|
+
except AppliedAPIError as e:
|
|
5740
|
+
return _format_error(e)
|
|
5741
|
+
|
|
5742
|
+
tally = {"pass": 0, "fail": 0, "unrated": 0}
|
|
5743
|
+
failing: list[dict[str, Any]] = []
|
|
5744
|
+
unrated: list[dict[str, Any]] = []
|
|
5745
|
+
for scenario in scenarios:
|
|
5746
|
+
status = str(scenario.get("pass_status") or "unrated").lower()
|
|
5747
|
+
if status not in tally:
|
|
5748
|
+
status = "unrated"
|
|
5749
|
+
tally[status] += 1
|
|
5750
|
+
entry = {"id": scenario.get("id"), "name": scenario.get("name")}
|
|
5751
|
+
if status == "fail":
|
|
5752
|
+
failing.append(entry)
|
|
5753
|
+
elif status == "unrated":
|
|
5754
|
+
unrated.append(entry)
|
|
5755
|
+
|
|
5756
|
+
rated = tally["pass"] + tally["fail"]
|
|
5757
|
+
pass_rate = round(tally["pass"] / rated, 4) if rated else None
|
|
5758
|
+
summary = {
|
|
5759
|
+
"benchmark_id": benchmark_id,
|
|
5760
|
+
"benchmark_name": benchmark.get("name"),
|
|
5761
|
+
"total_scenarios": len(scenarios),
|
|
5762
|
+
"passed": tally["pass"],
|
|
5763
|
+
"failed": tally["fail"],
|
|
5764
|
+
"unrated": tally["unrated"],
|
|
5765
|
+
"rated": rated,
|
|
5766
|
+
"pass_rate": pass_rate,
|
|
5767
|
+
"failing_scenarios": failing,
|
|
5768
|
+
"unrated_scenarios": unrated,
|
|
5769
|
+
}
|
|
5770
|
+
|
|
5771
|
+
if output_format == "json":
|
|
5772
|
+
return to_json(summary)
|
|
5773
|
+
|
|
5774
|
+
pass_rate_str = (
|
|
5775
|
+
f"{pass_rate * 100:.1f}% ({tally['pass']}/{rated} rated)"
|
|
5776
|
+
if pass_rate is not None
|
|
5777
|
+
else "n/a (no rated scenarios yet)"
|
|
5778
|
+
)
|
|
5779
|
+
lines = [
|
|
5780
|
+
f"# Benchmark Results: {benchmark.get('name')} ({benchmark_id})",
|
|
5781
|
+
f"total_scenarios: {summary['total_scenarios']}",
|
|
5782
|
+
f"passed: {tally['pass']}",
|
|
5783
|
+
f"failed: {tally['fail']}",
|
|
5784
|
+
f"unrated: {tally['unrated']}",
|
|
5785
|
+
f"pass_rate: {pass_rate_str}",
|
|
5786
|
+
]
|
|
5787
|
+
if failing:
|
|
5788
|
+
lines.append(f"\n# Failing ({len(failing)})")
|
|
5789
|
+
lines.extend(f" - {s['name']} ({s['id']})" for s in failing[:50])
|
|
5790
|
+
if len(failing) > 50:
|
|
5791
|
+
lines.append(f" ... and {len(failing) - 50} more")
|
|
5792
|
+
if unrated:
|
|
5793
|
+
lines.append(f"\n# Unrated ({len(unrated)}) — evaluate these next")
|
|
5794
|
+
lines.extend(f" - {s['name']} ({s['id']})" for s in unrated[:50])
|
|
5795
|
+
if len(unrated) > 50:
|
|
5796
|
+
lines.append(f" ... and {len(unrated) - 50} more")
|
|
5797
|
+
return "\n".join(lines)
|
|
5798
|
+
|
|
5799
|
+
|
|
5713
5800
|
# -----------------------------------------------------------------------------
|
|
5714
5801
|
# Scenarios
|
|
5715
5802
|
# -----------------------------------------------------------------------------
|
|
@@ -43,6 +43,7 @@ DOMAIN_TOOL_RENAMES: dict[str, dict[str, str]] = {
|
|
|
43
43
|
"benchmark_create": "benchmarks_create",
|
|
44
44
|
"benchmark_delete": "benchmarks_delete",
|
|
45
45
|
"benchmark_clone": "benchmarks_clone",
|
|
46
|
+
"benchmark_results": "benchmarks_results",
|
|
46
47
|
},
|
|
47
48
|
"connectors": {
|
|
48
49
|
"connector_types": "connectors_types_list",
|
|
@@ -71,6 +71,10 @@ class BenchmarksCloneInput(StrictInput):
|
|
|
71
71
|
apply: bool = False
|
|
72
72
|
|
|
73
73
|
|
|
74
|
+
class BenchmarksResultsInput(StrictInput):
|
|
75
|
+
benchmark_id: str
|
|
76
|
+
|
|
77
|
+
|
|
74
78
|
class ScenariosListInput(StrictInput):
|
|
75
79
|
benchmark_id: str | None = None
|
|
76
80
|
agent_id: str | None = None
|
|
@@ -503,6 +507,44 @@ async def benchmarks_clone_handler(
|
|
|
503
507
|
)
|
|
504
508
|
|
|
505
509
|
|
|
510
|
+
async def benchmarks_results_handler(
|
|
511
|
+
client: AppliedClient,
|
|
512
|
+
params: BenchmarksResultsInput,
|
|
513
|
+
) -> ToolResult[Any]:
|
|
514
|
+
from applied_cli import tools as legacy_tools
|
|
515
|
+
|
|
516
|
+
raw = await legacy_tools.benchmark_results(
|
|
517
|
+
client, benchmark_id=params.benchmark_id, output_format="json"
|
|
518
|
+
)
|
|
519
|
+
try:
|
|
520
|
+
data = json.loads(raw)
|
|
521
|
+
except (json.JSONDecodeError, TypeError):
|
|
522
|
+
return ToolResult(data={"message": raw}, summary=str(raw))
|
|
523
|
+
|
|
524
|
+
pass_rate = data.get("pass_rate")
|
|
525
|
+
rate_str = (
|
|
526
|
+
f"{pass_rate * 100:.1f}%" if pass_rate is not None else "n/a (no rated yet)"
|
|
527
|
+
)
|
|
528
|
+
next_actions = []
|
|
529
|
+
if data.get("unrated"):
|
|
530
|
+
next_actions.append(
|
|
531
|
+
"Rate the unrated scenarios with scenarios_update (pass_status)."
|
|
532
|
+
)
|
|
533
|
+
if data.get("failed"):
|
|
534
|
+
next_actions.append(
|
|
535
|
+
"Inspect failing scenarios with scenarios_get / conversations_debug_bundle."
|
|
536
|
+
)
|
|
537
|
+
return ToolResult(
|
|
538
|
+
data=data,
|
|
539
|
+
summary=(
|
|
540
|
+
f"{data.get('benchmark_name') or params.benchmark_id}: pass rate "
|
|
541
|
+
f"{rate_str} — {data.get('passed', 0)} passed, "
|
|
542
|
+
f"{data.get('failed', 0)} failed, {data.get('unrated', 0)} unrated."
|
|
543
|
+
),
|
|
544
|
+
next_actions=next_actions,
|
|
545
|
+
)
|
|
546
|
+
|
|
547
|
+
|
|
506
548
|
async def benchmarks_delete_handler(
|
|
507
549
|
client: AppliedClient,
|
|
508
550
|
params: BenchmarksDeleteInput,
|
|
@@ -1006,6 +1048,19 @@ def scenario_specs() -> list[ToolSpec]:
|
|
|
1006
1048
|
read_write_mode="write",
|
|
1007
1049
|
tags=["benchmark_clone", "native"],
|
|
1008
1050
|
),
|
|
1051
|
+
ToolSpec(
|
|
1052
|
+
name="benchmarks_results",
|
|
1053
|
+
namespace="benchmarks",
|
|
1054
|
+
description=(
|
|
1055
|
+
"Summarize a benchmark's pass/fail/unrated health and pass rate, "
|
|
1056
|
+
"with the failing and unrated scenario lists."
|
|
1057
|
+
),
|
|
1058
|
+
input_model=BenchmarksResultsInput,
|
|
1059
|
+
output_model=None,
|
|
1060
|
+
handler=benchmarks_results_handler,
|
|
1061
|
+
read_write_mode="read",
|
|
1062
|
+
tags=["benchmark_results", "native"],
|
|
1063
|
+
),
|
|
1009
1064
|
ToolSpec(
|
|
1010
1065
|
name="scenarios_list",
|
|
1011
1066
|
namespace="scenarios",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: applied-cli
|
|
3
|
-
Version: 0.6.5
|
|
3
|
+
Version: 0.6.7
|
|
4
4
|
Summary: CLI and shared client library for Applied Labs AI support agents
|
|
5
5
|
Author: Applied Labs
|
|
6
6
|
License-Expression: MIT
|
|
@@ -80,6 +80,57 @@ applied metrics --metric-name conversation.resolve --start 2026-04-01 --end 2026
|
|
|
80
80
|
object. `analytics` returns grouped rows and currently supports `--metrics count`.
|
|
81
81
|
Raw analytics SQL is not available through the public CLI surface.
|
|
82
82
|
|
|
83
|
+
## Benchmarks & Scenarios
|
|
84
|
+
|
|
85
|
+
A **benchmark** is a named regression suite; a **scenario** is one test conversation
|
|
86
|
+
(built from a real `input_conversation_id`) that can belong to one or more benchmarks.
|
|
87
|
+
The typical loop is: build a suite → run it → review the pass rate → fix → re-run.
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
# Inspect benchmarks and their scenarios
|
|
91
|
+
applied benchmarks --agent-id <agent_id> --format json
|
|
92
|
+
applied benchmark <benchmark_id> --format json
|
|
93
|
+
applied scenarios --benchmark-id <benchmark_id> --format json
|
|
94
|
+
|
|
95
|
+
# Build a suite
|
|
96
|
+
applied benchmark-create --agent-id <agent_id> --name "Cancel Regression"
|
|
97
|
+
applied scenario-create --input-conversation-id <conversation_id> --name "<name>" \
|
|
98
|
+
--benchmark-id <benchmark_id>
|
|
99
|
+
|
|
100
|
+
# Port a suite to another agent (e.g. email -> chat). Cross-agent recreates the
|
|
101
|
+
# scenarios under the destination agent; same-agent just tags them in.
|
|
102
|
+
# Dry-run by default; add --apply to write.
|
|
103
|
+
applied benchmark-clone <source_benchmark_id> --dest-benchmark-name "Chat Regression" \
|
|
104
|
+
--target-agent-id <chat_agent_id> --apply
|
|
105
|
+
|
|
106
|
+
# Run a benchmark and wait for results in one command.
|
|
107
|
+
# --contact-email runs as a contact that has an email, fixing
|
|
108
|
+
# "Email is not present in the conversation" on test conversations.
|
|
109
|
+
applied scenario-bulk-run --benchmark-id <benchmark_id> \
|
|
110
|
+
--contact-email test@example.com --wait
|
|
111
|
+
applied scenario-bulk-status <job_id> --include-runs --format json
|
|
112
|
+
|
|
113
|
+
# Kill a stuck bulk run (deletes its queued/running runs; finished runs preserved)
|
|
114
|
+
applied scenario-bulk-cancel <job_id> --apply
|
|
115
|
+
|
|
116
|
+
# Review pass/fail health (pass_status reflects the latest run per scenario)
|
|
117
|
+
applied benchmark-results <benchmark_id> --format json
|
|
118
|
+
|
|
119
|
+
# Rate scenarios as you evaluate
|
|
120
|
+
applied scenario-update <scenario_id> --pass-status pass --feedback "<note>"
|
|
121
|
+
|
|
122
|
+
# Safe delete — refuses to wipe scenarios unless you opt in
|
|
123
|
+
applied benchmark-delete <benchmark_id> --detach-scenarios # preserve scenarios
|
|
124
|
+
applied benchmark-delete <benchmark_id> --force # cascade delete
|
|
125
|
+
|
|
126
|
+
# Recover deleted benchmark/scenario rows from a local PITR export
|
|
127
|
+
applied scenario-recover-catalog --recovery-dir <dir> --apply
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Deleting a benchmark cascades and permanently deletes its scenarios and runs, so
|
|
131
|
+
`benchmark-delete` refuses a non-empty benchmark unless you pass `--detach-scenarios`
|
|
132
|
+
(unlink the scenarios first so they survive under their agent) or `--force`.
|
|
133
|
+
|
|
83
134
|
## Library Usage
|
|
84
135
|
|
|
85
136
|
```python
|
|
@@ -113,6 +164,11 @@ conversations = await tools.conversation_query(
|
|
|
113
164
|
| `analytics_report` | Read standard dashboard/report analytics views |
|
|
114
165
|
| `analytics_query` | Aggregate supported conversation dimensions with count |
|
|
115
166
|
| `metrics_query` | Roll up named metric events |
|
|
167
|
+
| `benchmark_clone` | Copy all scenarios from one benchmark into another |
|
|
168
|
+
| `benchmark_delete` | Delete a benchmark (guards against wiping scenarios) |
|
|
169
|
+
| `benchmark_results` | Pass/fail/unrated tally and pass rate for a benchmark |
|
|
170
|
+
| `scenario_bulk_run` | Run scenarios (contact override + wait-to-completion) |
|
|
171
|
+
| `scenario_bulk_cancel` | Cancel a stuck bulk run's queued/running scenario runs |
|
|
116
172
|
|
|
117
173
|
## Examples
|
|
118
174
|
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from applied_cli import tools
|
|
6
|
+
|
|
7
|
+
BENCHMARK = {"id": "bench-1", "name": "Cancel Regression"}
|
|
8
|
+
|
|
9
|
+
SCENARIOS = [
|
|
10
|
+
{"id": "s1", "name": "Cancel order", "pass_status": "pass"},
|
|
11
|
+
{"id": "s2", "name": "Refund flow", "pass_status": "pass"},
|
|
12
|
+
{"id": "s3", "name": "Pause subscription", "pass_status": "fail"},
|
|
13
|
+
{"id": "s4", "name": "Address change", "pass_status": "unrated"},
|
|
14
|
+
{"id": "s5", "name": "No status field"}, # missing -> unrated
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FakeResultsClient:
|
|
19
|
+
def __init__(self, scenarios=SCENARIOS):
|
|
20
|
+
self._scenarios = scenarios
|
|
21
|
+
|
|
22
|
+
async def get_benchmark(self, benchmark_id):
|
|
23
|
+
return BENCHMARK
|
|
24
|
+
|
|
25
|
+
async def list_scenarios(self, benchmark_id=None, fetch_all=True, **kwargs):
|
|
26
|
+
return list(self._scenarios)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@pytest.mark.asyncio
|
|
30
|
+
async def test_results_tally_and_pass_rate():
|
|
31
|
+
client = FakeResultsClient()
|
|
32
|
+
data = json.loads(
|
|
33
|
+
await tools.benchmark_results(client, "bench-1", output_format="json")
|
|
34
|
+
)
|
|
35
|
+
assert data["total_scenarios"] == 5
|
|
36
|
+
assert data["passed"] == 2
|
|
37
|
+
assert data["failed"] == 1
|
|
38
|
+
assert data["unrated"] == 2
|
|
39
|
+
assert data["rated"] == 3
|
|
40
|
+
# 2 passed / 3 rated
|
|
41
|
+
assert data["pass_rate"] == round(2 / 3, 4)
|
|
42
|
+
assert [s["id"] for s in data["failing_scenarios"]] == ["s3"]
|
|
43
|
+
assert {s["id"] for s in data["unrated_scenarios"]} == {"s4", "s5"}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@pytest.mark.asyncio
|
|
47
|
+
async def test_results_no_rated_scenarios_pass_rate_none():
|
|
48
|
+
client = FakeResultsClient(
|
|
49
|
+
scenarios=[{"id": "s1", "name": "A", "pass_status": "unrated"}]
|
|
50
|
+
)
|
|
51
|
+
text = await tools.benchmark_results(client, "bench-1", output_format="text")
|
|
52
|
+
assert "n/a (no rated scenarios yet)" in text
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@pytest.mark.asyncio
|
|
56
|
+
async def test_results_text_lists_failing_and_unrated():
|
|
57
|
+
client = FakeResultsClient()
|
|
58
|
+
text = await tools.benchmark_results(client, "bench-1")
|
|
59
|
+
assert "# Failing (1)" in text
|
|
60
|
+
assert "Pause subscription" in text
|
|
61
|
+
assert "# Unrated (2)" in text
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@pytest.mark.asyncio
|
|
65
|
+
async def test_v2_benchmarks_results_handler_summary():
|
|
66
|
+
from applied_cli.v2.scenarios import (
|
|
67
|
+
BenchmarksResultsInput,
|
|
68
|
+
benchmarks_results_handler,
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
client = FakeResultsClient()
|
|
72
|
+
result = await benchmarks_results_handler(
|
|
73
|
+
client, BenchmarksResultsInput(benchmark_id="bench-1")
|
|
74
|
+
)
|
|
75
|
+
assert result.data["passed"] == 2
|
|
76
|
+
assert "pass rate" in result.summary
|
|
77
|
+
# Has unrated + failing → both follow-up actions surfaced.
|
|
78
|
+
assert len(result.next_actions) == 2
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|