applied-cli 0.6.6__tar.gz → 0.6.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {applied_cli-0.6.6 → applied_cli-0.6.8}/PKG-INFO +57 -1
- {applied_cli-0.6.6 → applied_cli-0.6.8}/README.md +56 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/cli.py +13 -2
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/tools.py +75 -33
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/scenarios.py +23 -2
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli.egg-info/PKG-INFO +57 -1
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli.egg-info/SOURCES.txt +1 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/pyproject.toml +1 -1
- applied_cli-0.6.8/tests/test_benchmark_list_with_results.py +104 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/__init__.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/agent_scoped_flows.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/auth.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/client.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/conversation_lookup.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/conversations.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/credentials.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/flow_helpers.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/formatters.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/mcp.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/recovery.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/toolkit.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/__init__.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/agents.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/articles.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/catalog.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/connectors.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/content.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/conversations.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/domains.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/flows.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/knowledge.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/manifest.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/products.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/taxonomy.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli/v2/tickets.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli.egg-info/dependency_links.txt +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli.egg-info/entry_points.txt +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli.egg-info/requires.txt +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/applied_cli.egg-info/top_level.txt +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/setup.cfg +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_agent_scoped_flows.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_audit_tools.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_auth_context.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_benchmark_clone.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_benchmark_delete_guardrail.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_benchmark_results.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_benchmark_scenario_tools.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_cli.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_cli_v2.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_client.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_client_v2.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_conversation_tools.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_flow_tools.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_knowledge_content_tools.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_recovery.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_scenario_bulk_cancel.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_scenario_bulk_run_contact.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_scenario_bulk_run_wait.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_toolkit_contract.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_agents.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_articles.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_catalog_and_mcp.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_connectors.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_content.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_conversations.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_flows.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_knowledge.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_products.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_scenarios.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_taxonomy.py +0 -0
- {applied_cli-0.6.6 → applied_cli-0.6.8}/tests/test_v2_tickets.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: applied-cli
|
|
3
|
-
Version: 0.6.6
|
|
3
|
+
Version: 0.6.8
|
|
4
4
|
Summary: CLI and shared client library for Applied Labs AI support agents
|
|
5
5
|
Author: Applied Labs
|
|
6
6
|
License-Expression: MIT
|
|
@@ -80,6 +80,57 @@ applied metrics --metric-name conversation.resolve --start 2026-04-01 --end 2026
|
|
|
80
80
|
object. `analytics` returns grouped rows and currently supports `--metrics count`.
|
|
81
81
|
Raw analytics SQL is not available through the public CLI surface.
|
|
82
82
|
|
|
83
|
+
## Benchmarks & Scenarios
|
|
84
|
+
|
|
85
|
+
A **benchmark** is a named regression suite; a **scenario** is one test conversation
|
|
86
|
+
(built from a real `input_conversation_id`) that can belong to one or more benchmarks.
|
|
87
|
+
The typical loop is: build a suite → run it → review the pass rate → fix → re-run.
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
# Inspect benchmarks and their scenarios
|
|
91
|
+
applied benchmarks --agent-id <agent_id> --format json
|
|
92
|
+
applied benchmark <benchmark_id> --format json
|
|
93
|
+
applied scenarios --benchmark-id <benchmark_id> --format json
|
|
94
|
+
|
|
95
|
+
# Build a suite
|
|
96
|
+
applied benchmark-create --agent-id <agent_id> --name "Cancel Regression"
|
|
97
|
+
applied scenario-create --input-conversation-id <conversation_id> --name "<name>" \
|
|
98
|
+
--benchmark-id <benchmark_id>
|
|
99
|
+
|
|
100
|
+
# Port a suite to another agent (e.g. email -> chat). Cross-agent recreates the
|
|
101
|
+
# scenarios under the destination agent; same-agent just tags them in.
|
|
102
|
+
# Dry-run by default; add --apply to write.
|
|
103
|
+
applied benchmark-clone <source_benchmark_id> --dest-benchmark-name "Chat Regression" \
|
|
104
|
+
--target-agent-id <chat_agent_id> --apply
|
|
105
|
+
|
|
106
|
+
# Run a benchmark and wait for results in one command.
|
|
107
|
+
# --contact-email runs as a contact that has an email, fixing
|
|
108
|
+
# "Email is not present in the conversation" on test conversations.
|
|
109
|
+
applied scenario-bulk-run --benchmark-id <benchmark_id> \
|
|
110
|
+
--contact-email test@example.com --wait
|
|
111
|
+
applied scenario-bulk-status <job_id> --include-runs --format json
|
|
112
|
+
|
|
113
|
+
# Kill a stuck bulk run (deletes its queued/running runs; finished runs preserved)
|
|
114
|
+
applied scenario-bulk-cancel <job_id> --apply
|
|
115
|
+
|
|
116
|
+
# Review pass/fail health (pass_status reflects the latest run per scenario)
|
|
117
|
+
applied benchmark-results <benchmark_id> --format json
|
|
118
|
+
|
|
119
|
+
# Rate scenarios as you evaluate
|
|
120
|
+
applied scenario-update <scenario_id> --pass-status pass --feedback "<note>"
|
|
121
|
+
|
|
122
|
+
# Safe delete — refuses to wipe scenarios unless you opt in
|
|
123
|
+
applied benchmark-delete <benchmark_id> --detach-scenarios # preserve scenarios
|
|
124
|
+
applied benchmark-delete <benchmark_id> --force # cascade delete
|
|
125
|
+
|
|
126
|
+
# Recover deleted benchmark/scenario rows from a local PITR export
|
|
127
|
+
applied scenario-recover-catalog --recovery-dir <dir> --apply
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Deleting a benchmark cascades and permanently deletes its scenarios and runs, so
|
|
131
|
+
`benchmark-delete` refuses a non-empty benchmark unless you pass `--detach-scenarios`
|
|
132
|
+
(unlink the scenarios first so they survive under their agent) or `--force`.
|
|
133
|
+
|
|
83
134
|
## Library Usage
|
|
84
135
|
|
|
85
136
|
```python
|
|
@@ -113,6 +164,11 @@ conversations = await tools.conversation_query(
|
|
|
113
164
|
| `analytics_report` | Read standard dashboard/report analytics views |
|
|
114
165
|
| `analytics_query` | Aggregate supported conversation dimensions with count |
|
|
115
166
|
| `metrics_query` | Roll up named metric events |
|
|
167
|
+
| `benchmark_clone` | Copy all scenarios from one benchmark into another |
|
|
168
|
+
| `benchmark_delete` | Delete a benchmark (guards against wiping scenarios) |
|
|
169
|
+
| `benchmark_results` | Pass/fail/unrated tally and pass rate for a benchmark |
|
|
170
|
+
| `scenario_bulk_run` | Run scenarios (contact override + wait-to-completion) |
|
|
171
|
+
| `scenario_bulk_cancel` | Cancel a stuck bulk run's queued/running scenario runs |
|
|
116
172
|
|
|
117
173
|
## Examples
|
|
118
174
|
|
|
@@ -54,6 +54,57 @@ applied metrics --metric-name conversation.resolve --start 2026-04-01 --end 2026
|
|
|
54
54
|
object. `analytics` returns grouped rows and currently supports `--metrics count`.
|
|
55
55
|
Raw analytics SQL is not available through the public CLI surface.
|
|
56
56
|
|
|
57
|
+
## Benchmarks & Scenarios
|
|
58
|
+
|
|
59
|
+
A **benchmark** is a named regression suite; a **scenario** is one test conversation
|
|
60
|
+
(built from a real `input_conversation_id`) that can belong to one or more benchmarks.
|
|
61
|
+
The typical loop is: build a suite → run it → review the pass rate → fix → re-run.
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# Inspect benchmarks and their scenarios
|
|
65
|
+
applied benchmarks --agent-id <agent_id> --format json
|
|
66
|
+
applied benchmark <benchmark_id> --format json
|
|
67
|
+
applied scenarios --benchmark-id <benchmark_id> --format json
|
|
68
|
+
|
|
69
|
+
# Build a suite
|
|
70
|
+
applied benchmark-create --agent-id <agent_id> --name "Cancel Regression"
|
|
71
|
+
applied scenario-create --input-conversation-id <conversation_id> --name "<name>" \
|
|
72
|
+
--benchmark-id <benchmark_id>
|
|
73
|
+
|
|
74
|
+
# Port a suite to another agent (e.g. email -> chat). Cross-agent recreates the
|
|
75
|
+
# scenarios under the destination agent; same-agent just tags them in.
|
|
76
|
+
# Dry-run by default; add --apply to write.
|
|
77
|
+
applied benchmark-clone <source_benchmark_id> --dest-benchmark-name "Chat Regression" \
|
|
78
|
+
--target-agent-id <chat_agent_id> --apply
|
|
79
|
+
|
|
80
|
+
# Run a benchmark and wait for results in one command.
|
|
81
|
+
# --contact-email runs as a contact that has an email, fixing
|
|
82
|
+
# "Email is not present in the conversation" on test conversations.
|
|
83
|
+
applied scenario-bulk-run --benchmark-id <benchmark_id> \
|
|
84
|
+
--contact-email test@example.com --wait
|
|
85
|
+
applied scenario-bulk-status <job_id> --include-runs --format json
|
|
86
|
+
|
|
87
|
+
# Kill a stuck bulk run (deletes its queued/running runs; finished runs preserved)
|
|
88
|
+
applied scenario-bulk-cancel <job_id> --apply
|
|
89
|
+
|
|
90
|
+
# Review pass/fail health (pass_status reflects the latest run per scenario)
|
|
91
|
+
applied benchmark-results <benchmark_id> --format json
|
|
92
|
+
|
|
93
|
+
# Rate scenarios as you evaluate
|
|
94
|
+
applied scenario-update <scenario_id> --pass-status pass --feedback "<note>"
|
|
95
|
+
|
|
96
|
+
# Safe delete — refuses to wipe scenarios unless you opt in
|
|
97
|
+
applied benchmark-delete <benchmark_id> --detach-scenarios # preserve scenarios
|
|
98
|
+
applied benchmark-delete <benchmark_id> --force # cascade delete
|
|
99
|
+
|
|
100
|
+
# Recover deleted benchmark/scenario rows from a local PITR export
|
|
101
|
+
applied scenario-recover-catalog --recovery-dir <dir> --apply
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Deleting a benchmark cascades and permanently deletes its scenarios and runs, so
|
|
105
|
+
`benchmark-delete` refuses a non-empty benchmark unless you pass `--detach-scenarios`
|
|
106
|
+
(unlink the scenarios first so they survive under their agent) or `--force`.
|
|
107
|
+
|
|
57
108
|
## Library Usage
|
|
58
109
|
|
|
59
110
|
```python
|
|
@@ -87,6 +138,11 @@ conversations = await tools.conversation_query(
|
|
|
87
138
|
| `analytics_report` | Read standard dashboard/report analytics views |
|
|
88
139
|
| `analytics_query` | Aggregate supported conversation dimensions with count |
|
|
89
140
|
| `metrics_query` | Roll up named metric events |
|
|
141
|
+
| `benchmark_clone` | Copy all scenarios from one benchmark into another |
|
|
142
|
+
| `benchmark_delete` | Delete a benchmark (guards against wiping scenarios) |
|
|
143
|
+
| `benchmark_results` | Pass/fail/unrated tally and pass rate for a benchmark |
|
|
144
|
+
| `scenario_bulk_run` | Run scenarios (contact override + wait-to-completion) |
|
|
145
|
+
| `scenario_bulk_cancel` | Cancel a stuck bulk run's queued/running scenario runs |
|
|
90
146
|
|
|
91
147
|
## Examples
|
|
92
148
|
|
|
@@ -1540,15 +1540,26 @@ def send_message_cmd(
|
|
|
1540
1540
|
@app.command()
|
|
1541
1541
|
def benchmarks(
|
|
1542
1542
|
agent_id: str = typer.Option(None, "--agent-id", help="Filter by agent ID"),
|
|
1543
|
+
with_results: bool = typer.Option(
|
|
1544
|
+
False,
|
|
1545
|
+
"--with-results",
|
|
1546
|
+
help="Include each benchmark's pass/fail/unrated tally and pass rate "
|
|
1547
|
+
"(one scenario fetch per benchmark) — a go/no-go portfolio view",
|
|
1548
|
+
),
|
|
1543
1549
|
shop_id: str = typer.Option(None, "--shop-id", help="Override shop ID"),
|
|
1544
1550
|
format: str = typer.Option(
|
|
1545
1551
|
"csv", "--format", "-f", help="Output format: csv or json"
|
|
1546
1552
|
),
|
|
1547
1553
|
) -> None:
|
|
1548
|
-
"""List benchmarks."""
|
|
1554
|
+
"""List benchmarks (optionally with per-benchmark pass rates via --with-results)."""
|
|
1549
1555
|
client = get_client(shop_id=shop_id)
|
|
1550
1556
|
result = asyncio.run(
|
|
1551
|
-
tools.benchmark_list(client, agent_id=agent_id, output_format=format)
|
|
1557
|
+
tools.benchmark_list(
|
|
1558
|
+
client,
|
|
1559
|
+
agent_id=agent_id,
|
|
1560
|
+
output_format=format,
|
|
1561
|
+
with_results=with_results,
|
|
1562
|
+
)
|
|
1552
1563
|
)
|
|
1553
1564
|
typer.echo(result)
|
|
1554
1565
|
|
|
@@ -5306,6 +5306,7 @@ async def benchmark_list(
|
|
|
5306
5306
|
client: AppliedClient,
|
|
5307
5307
|
agent_id: str | None = None,
|
|
5308
5308
|
output_format: str = "csv",
|
|
5309
|
+
with_results: bool = False,
|
|
5309
5310
|
) -> str:
|
|
5310
5311
|
"""
|
|
5311
5312
|
List conversation benchmarks.
|
|
@@ -5314,26 +5315,45 @@ async def benchmark_list(
|
|
|
5314
5315
|
client: Authenticated AppliedClient
|
|
5315
5316
|
agent_id: Optional - filter by agent UUID
|
|
5316
5317
|
output_format: 'csv' or 'json'
|
|
5318
|
+
with_results: Also compute each benchmark's pass/fail/unrated tally and
|
|
5319
|
+
pass rate (one extra scenario fetch per benchmark) — a go/no-go
|
|
5320
|
+
portfolio view across all benchmarks
|
|
5317
5321
|
|
|
5318
5322
|
Returns:
|
|
5319
|
-
List of benchmarks with id, name, agent, scenario count
|
|
5323
|
+
List of benchmarks with id, name, agent, scenario count (and pass-rate
|
|
5324
|
+
columns when with_results is set)
|
|
5320
5325
|
"""
|
|
5321
5326
|
benchmarks = await client.list_benchmarks(agent_id=agent_id)
|
|
5322
|
-
mapped = [
|
|
5323
|
-
|
|
5327
|
+
mapped = []
|
|
5328
|
+
for b in benchmarks:
|
|
5329
|
+
row = {
|
|
5324
5330
|
"id": b.get("id"),
|
|
5325
5331
|
"name": b.get("name"),
|
|
5326
5332
|
"agent_name": b.get("agent", {}).get("name", ""),
|
|
5327
5333
|
"scenario_count": b.get("scenario_count", 0),
|
|
5328
5334
|
"description": str(b.get("description", ""))[:80],
|
|
5329
5335
|
}
|
|
5330
|
-
|
|
5331
|
-
|
|
5336
|
+
if with_results:
|
|
5337
|
+
scenarios = await client.list_scenarios(
|
|
5338
|
+
benchmark_id=b.get("id"), fetch_all=True
|
|
5339
|
+
)
|
|
5340
|
+
tally = _pass_status_tally(scenarios)
|
|
5341
|
+
pass_rate = tally["pass_rate"]
|
|
5342
|
+
row["passed"] = tally["passed"]
|
|
5343
|
+
row["failed"] = tally["failed"]
|
|
5344
|
+
row["unrated"] = tally["unrated"]
|
|
5345
|
+
row["pass_rate"] = (
|
|
5346
|
+
f"{pass_rate * 100:.1f}%" if pass_rate is not None else "n/a"
|
|
5347
|
+
)
|
|
5348
|
+
mapped.append(row)
|
|
5349
|
+
|
|
5350
|
+
columns = ["id", "name", "agent_name", "scenario_count"]
|
|
5351
|
+
if with_results:
|
|
5352
|
+
columns += ["passed", "failed", "unrated", "pass_rate"]
|
|
5353
|
+
columns.append("description")
|
|
5332
5354
|
|
|
5333
5355
|
if output_format == "csv":
|
|
5334
|
-
return to_csv(
|
|
5335
|
-
mapped, ["id", "name", "agent_name", "scenario_count", "description"]
|
|
5336
|
-
)
|
|
5356
|
+
return to_csv(mapped, columns)
|
|
5337
5357
|
return to_json(mapped)
|
|
5338
5358
|
|
|
5339
5359
|
|
|
@@ -5710,6 +5730,40 @@ async def benchmark_clone(
|
|
|
5710
5730
|
return "\n".join(lines)
|
|
5711
5731
|
|
|
5712
5732
|
|
|
5733
|
+
def _pass_status_tally(scenarios: list[dict]) -> dict[str, Any]:
|
|
5734
|
+
"""Tally scenarios by pass_status and compute the pass rate among rated.
|
|
5735
|
+
|
|
5736
|
+
Scenario pass_status from the API is the *effective* value (the latest run's
|
|
5737
|
+
pass_status when present, else the scenario's own), so this reflects the most
|
|
5738
|
+
recent run per scenario.
|
|
5739
|
+
"""
|
|
5740
|
+
tally = {"pass": 0, "fail": 0, "unrated": 0}
|
|
5741
|
+
failing: list[dict[str, Any]] = []
|
|
5742
|
+
unrated: list[dict[str, Any]] = []
|
|
5743
|
+
for scenario in scenarios:
|
|
5744
|
+
status = str(scenario.get("pass_status") or "unrated").lower()
|
|
5745
|
+
if status not in tally:
|
|
5746
|
+
status = "unrated"
|
|
5747
|
+
tally[status] += 1
|
|
5748
|
+
entry = {"id": scenario.get("id"), "name": scenario.get("name")}
|
|
5749
|
+
if status == "fail":
|
|
5750
|
+
failing.append(entry)
|
|
5751
|
+
elif status == "unrated":
|
|
5752
|
+
unrated.append(entry)
|
|
5753
|
+
|
|
5754
|
+
rated = tally["pass"] + tally["fail"]
|
|
5755
|
+
return {
|
|
5756
|
+
"total": len(scenarios),
|
|
5757
|
+
"passed": tally["pass"],
|
|
5758
|
+
"failed": tally["fail"],
|
|
5759
|
+
"unrated": tally["unrated"],
|
|
5760
|
+
"rated": rated,
|
|
5761
|
+
"pass_rate": round(tally["pass"] / rated, 4) if rated else None,
|
|
5762
|
+
"failing_scenarios": failing,
|
|
5763
|
+
"unrated_scenarios": unrated,
|
|
5764
|
+
}
|
|
5765
|
+
|
|
5766
|
+
|
|
5713
5767
|
async def benchmark_results(
|
|
5714
5768
|
client: AppliedClient,
|
|
5715
5769
|
benchmark_id: str,
|
|
@@ -5739,30 +5793,18 @@ async def benchmark_results(
|
|
|
5739
5793
|
except AppliedAPIError as e:
|
|
5740
5794
|
return _format_error(e)
|
|
5741
5795
|
|
|
5742
|
-
|
|
5743
|
-
failing
|
|
5744
|
-
unrated
|
|
5745
|
-
|
|
5746
|
-
status = str(scenario.get("pass_status") or "unrated").lower()
|
|
5747
|
-
if status not in tally:
|
|
5748
|
-
status = "unrated"
|
|
5749
|
-
tally[status] += 1
|
|
5750
|
-
entry = {"id": scenario.get("id"), "name": scenario.get("name")}
|
|
5751
|
-
if status == "fail":
|
|
5752
|
-
failing.append(entry)
|
|
5753
|
-
elif status == "unrated":
|
|
5754
|
-
unrated.append(entry)
|
|
5755
|
-
|
|
5756
|
-
rated = tally["pass"] + tally["fail"]
|
|
5757
|
-
pass_rate = round(tally["pass"] / rated, 4) if rated else None
|
|
5796
|
+
t = _pass_status_tally(scenarios)
|
|
5797
|
+
failing = t["failing_scenarios"]
|
|
5798
|
+
unrated = t["unrated_scenarios"]
|
|
5799
|
+
pass_rate = t["pass_rate"]
|
|
5758
5800
|
summary = {
|
|
5759
5801
|
"benchmark_id": benchmark_id,
|
|
5760
5802
|
"benchmark_name": benchmark.get("name"),
|
|
5761
|
-
"total_scenarios": len(scenarios),
|
|
5762
|
-
"passed": tally["pass"],
|
|
5763
|
-
"failed": tally["fail"],
|
|
5764
|
-
"unrated": tally["unrated"],
|
|
5765
|
-
"rated": rated,
|
|
5803
|
+
"total_scenarios": t["total"],
|
|
5804
|
+
"passed": t["passed"],
|
|
5805
|
+
"failed": t["failed"],
|
|
5806
|
+
"unrated": t["unrated"],
|
|
5807
|
+
"rated": t["rated"],
|
|
5766
5808
|
"pass_rate": pass_rate,
|
|
5767
5809
|
"failing_scenarios": failing,
|
|
5768
5810
|
"unrated_scenarios": unrated,
|
|
@@ -5772,16 +5814,16 @@ async def benchmark_results(
|
|
|
5772
5814
|
return to_json(summary)
|
|
5773
5815
|
|
|
5774
5816
|
pass_rate_str = (
|
|
5775
|
-
f"{pass_rate * 100:.1f}% ({tally['pass']}/{rated} rated)"
|
|
5817
|
+
f"{pass_rate * 100:.1f}% ({t['passed']}/{t['rated']} rated)"
|
|
5776
5818
|
if pass_rate is not None
|
|
5777
5819
|
else "n/a (no rated scenarios yet)"
|
|
5778
5820
|
)
|
|
5779
5821
|
lines = [
|
|
5780
5822
|
f"# Benchmark Results: {benchmark.get('name')} ({benchmark_id})",
|
|
5781
5823
|
f"total_scenarios: {summary['total_scenarios']}",
|
|
5782
|
-
f"passed: {
|
|
5783
|
-
f"failed: {
|
|
5784
|
-
f"unrated: {
|
|
5824
|
+
f"passed: {t['passed']}",
|
|
5825
|
+
f"failed: {t['failed']}",
|
|
5826
|
+
f"unrated: {t['unrated']}",
|
|
5785
5827
|
f"pass_rate: {pass_rate_str}",
|
|
5786
5828
|
]
|
|
5787
5829
|
if failing:
|
|
@@ -43,6 +43,7 @@ class ScenariosBulkCancelInput(StrictInput):
|
|
|
43
43
|
class BenchmarksListInput(StrictInput):
|
|
44
44
|
agent_id: str | None = None
|
|
45
45
|
limit: int = 50
|
|
46
|
+
with_results: bool = False
|
|
46
47
|
|
|
47
48
|
|
|
48
49
|
class BenchmarksGetInput(StrictInput):
|
|
@@ -395,10 +396,26 @@ async def benchmarks_list_handler(
|
|
|
395
396
|
agent_id=params.agent_id,
|
|
396
397
|
limit=params.limit,
|
|
397
398
|
)
|
|
399
|
+
payload = []
|
|
400
|
+
for benchmark in benchmarks:
|
|
401
|
+
row = _project_benchmark_payload(benchmark)
|
|
402
|
+
if params.with_results:
|
|
403
|
+
from applied_cli.tools import _pass_status_tally
|
|
404
|
+
|
|
405
|
+
scenarios = await client.list_scenarios(
|
|
406
|
+
benchmark_id=benchmark.get("id"), fetch_all=True
|
|
407
|
+
)
|
|
408
|
+
tally = _pass_status_tally(scenarios)
|
|
409
|
+
row["results"] = {
|
|
410
|
+
"passed": tally["passed"],
|
|
411
|
+
"failed": tally["failed"],
|
|
412
|
+
"unrated": tally["unrated"],
|
|
413
|
+
"pass_rate": tally["pass_rate"],
|
|
414
|
+
}
|
|
415
|
+
payload.append(row)
|
|
398
416
|
except AppliedAPIError as exc:
|
|
399
417
|
return _api_error_result(exc)
|
|
400
418
|
|
|
401
|
-
payload = [_project_benchmark_payload(benchmark) for benchmark in benchmarks]
|
|
402
419
|
return ToolResult(
|
|
403
420
|
data=payload,
|
|
404
421
|
summary=_count_summary(len(payload), "benchmark"),
|
|
@@ -991,7 +1008,11 @@ def scenario_specs() -> list[ToolSpec]:
|
|
|
991
1008
|
ToolSpec(
|
|
992
1009
|
name="benchmarks_list",
|
|
993
1010
|
namespace="benchmarks",
|
|
994
|
-
description=
|
|
1011
|
+
description=(
|
|
1012
|
+
"List conversation benchmarks as structured rows. Set "
|
|
1013
|
+
"with_results=true for each benchmark's pass/fail/unrated tally "
|
|
1014
|
+
"and pass rate (a go/no-go portfolio view)."
|
|
1015
|
+
),
|
|
995
1016
|
input_model=BenchmarksListInput,
|
|
996
1017
|
output_model=None,
|
|
997
1018
|
handler=benchmarks_list_handler,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: applied-cli
|
|
3
|
-
Version: 0.6.6
|
|
3
|
+
Version: 0.6.8
|
|
4
4
|
Summary: CLI and shared client library for Applied Labs AI support agents
|
|
5
5
|
Author: Applied Labs
|
|
6
6
|
License-Expression: MIT
|
|
@@ -80,6 +80,57 @@ applied metrics --metric-name conversation.resolve --start 2026-04-01 --end 2026
|
|
|
80
80
|
object. `analytics` returns grouped rows and currently supports `--metrics count`.
|
|
81
81
|
Raw analytics SQL is not available through the public CLI surface.
|
|
82
82
|
|
|
83
|
+
## Benchmarks & Scenarios
|
|
84
|
+
|
|
85
|
+
A **benchmark** is a named regression suite; a **scenario** is one test conversation
|
|
86
|
+
(built from a real `input_conversation_id`) that can belong to one or more benchmarks.
|
|
87
|
+
The typical loop is: build a suite → run it → review the pass rate → fix → re-run.
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
# Inspect benchmarks and their scenarios
|
|
91
|
+
applied benchmarks --agent-id <agent_id> --format json
|
|
92
|
+
applied benchmark <benchmark_id> --format json
|
|
93
|
+
applied scenarios --benchmark-id <benchmark_id> --format json
|
|
94
|
+
|
|
95
|
+
# Build a suite
|
|
96
|
+
applied benchmark-create --agent-id <agent_id> --name "Cancel Regression"
|
|
97
|
+
applied scenario-create --input-conversation-id <conversation_id> --name "<name>" \
|
|
98
|
+
--benchmark-id <benchmark_id>
|
|
99
|
+
|
|
100
|
+
# Port a suite to another agent (e.g. email -> chat). Cross-agent recreates the
|
|
101
|
+
# scenarios under the destination agent; same-agent just tags them in.
|
|
102
|
+
# Dry-run by default; add --apply to write.
|
|
103
|
+
applied benchmark-clone <source_benchmark_id> --dest-benchmark-name "Chat Regression" \
|
|
104
|
+
--target-agent-id <chat_agent_id> --apply
|
|
105
|
+
|
|
106
|
+
# Run a benchmark and wait for results in one command.
|
|
107
|
+
# --contact-email runs as a contact that has an email, fixing
|
|
108
|
+
# "Email is not present in the conversation" on test conversations.
|
|
109
|
+
applied scenario-bulk-run --benchmark-id <benchmark_id> \
|
|
110
|
+
--contact-email test@example.com --wait
|
|
111
|
+
applied scenario-bulk-status <job_id> --include-runs --format json
|
|
112
|
+
|
|
113
|
+
# Kill a stuck bulk run (deletes its queued/running runs; finished runs preserved)
|
|
114
|
+
applied scenario-bulk-cancel <job_id> --apply
|
|
115
|
+
|
|
116
|
+
# Review pass/fail health (pass_status reflects the latest run per scenario)
|
|
117
|
+
applied benchmark-results <benchmark_id> --format json
|
|
118
|
+
|
|
119
|
+
# Rate scenarios as you evaluate
|
|
120
|
+
applied scenario-update <scenario_id> --pass-status pass --feedback "<note>"
|
|
121
|
+
|
|
122
|
+
# Safe delete — refuses to wipe scenarios unless you opt in
|
|
123
|
+
applied benchmark-delete <benchmark_id> --detach-scenarios # preserve scenarios
|
|
124
|
+
applied benchmark-delete <benchmark_id> --force # cascade delete
|
|
125
|
+
|
|
126
|
+
# Recover deleted benchmark/scenario rows from a local PITR export
|
|
127
|
+
applied scenario-recover-catalog --recovery-dir <dir> --apply
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Deleting a benchmark cascades and permanently deletes its scenarios and runs, so
|
|
131
|
+
`benchmark-delete` refuses a non-empty benchmark unless you pass `--detach-scenarios`
|
|
132
|
+
(unlink the scenarios first so they survive under their agent) or `--force`.
|
|
133
|
+
|
|
83
134
|
## Library Usage
|
|
84
135
|
|
|
85
136
|
```python
|
|
@@ -113,6 +164,11 @@ conversations = await tools.conversation_query(
|
|
|
113
164
|
| `analytics_report` | Read standard dashboard/report analytics views |
|
|
114
165
|
| `analytics_query` | Aggregate supported conversation dimensions with count |
|
|
115
166
|
| `metrics_query` | Roll up named metric events |
|
|
167
|
+
| `benchmark_clone` | Copy all scenarios from one benchmark into another |
|
|
168
|
+
| `benchmark_delete` | Delete a benchmark (guards against wiping scenarios) |
|
|
169
|
+
| `benchmark_results` | Pass/fail/unrated tally and pass rate for a benchmark |
|
|
170
|
+
| `scenario_bulk_run` | Run scenarios (contact override + wait-to-completion) |
|
|
171
|
+
| `scenario_bulk_cancel` | Cancel a stuck bulk run's queued/running scenario runs |
|
|
116
172
|
|
|
117
173
|
## Examples
|
|
118
174
|
|
|
@@ -40,6 +40,7 @@ tests/test_audit_tools.py
|
|
|
40
40
|
tests/test_auth_context.py
|
|
41
41
|
tests/test_benchmark_clone.py
|
|
42
42
|
tests/test_benchmark_delete_guardrail.py
|
|
43
|
+
tests/test_benchmark_list_with_results.py
|
|
43
44
|
tests/test_benchmark_results.py
|
|
44
45
|
tests/test_benchmark_scenario_tools.py
|
|
45
46
|
tests/test_cli.py
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from applied_cli import tools
|
|
6
|
+
|
|
7
|
+
BENCHMARKS = [
|
|
8
|
+
{"id": "b1", "name": "Cancel", "agent": {"name": "August"}, "scenario_count": 3},
|
|
9
|
+
{"id": "b2", "name": "Refund", "agent": {"name": "August"}, "scenario_count": 1},
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
SCENARIOS_BY_BENCHMARK = {
|
|
13
|
+
"b1": [
|
|
14
|
+
{"id": "s1", "name": "a", "pass_status": "pass"},
|
|
15
|
+
{"id": "s2", "name": "b", "pass_status": "fail"},
|
|
16
|
+
{"id": "s3", "name": "c", "pass_status": "unrated"},
|
|
17
|
+
],
|
|
18
|
+
"b2": [{"id": "s4", "name": "d", "pass_status": "pass"}],
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class FakeListClient:
|
|
23
|
+
def __init__(self):
|
|
24
|
+
self.list_scenarios_calls = 0
|
|
25
|
+
|
|
26
|
+
async def list_benchmarks(self, agent_id=None, limit=50):
|
|
27
|
+
return list(BENCHMARKS)
|
|
28
|
+
|
|
29
|
+
async def list_scenarios(self, benchmark_id=None, fetch_all=True, **kwargs):
|
|
30
|
+
self.list_scenarios_calls += 1
|
|
31
|
+
return list(SCENARIOS_BY_BENCHMARK.get(benchmark_id, []))
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@pytest.mark.asyncio
|
|
35
|
+
async def test_list_without_results_does_not_fetch_scenarios():
|
|
36
|
+
client = FakeListClient()
|
|
37
|
+
out = await tools.benchmark_list(client, output_format="json")
|
|
38
|
+
rows = json.loads(out)
|
|
39
|
+
assert client.list_scenarios_calls == 0
|
|
40
|
+
assert "pass_rate" not in rows[0]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@pytest.mark.asyncio
|
|
44
|
+
async def test_list_with_results_adds_pass_rate_per_benchmark():
|
|
45
|
+
client = FakeListClient()
|
|
46
|
+
out = await tools.benchmark_list(
|
|
47
|
+
client, output_format="json", with_results=True
|
|
48
|
+
)
|
|
49
|
+
rows = {r["id"]: r for r in json.loads(out)}
|
|
50
|
+
# One scenario fetch per benchmark.
|
|
51
|
+
assert client.list_scenarios_calls == 2
|
|
52
|
+
# b1: 1 pass / 2 rated = 50%
|
|
53
|
+
assert rows["b1"]["passed"] == 1
|
|
54
|
+
assert rows["b1"]["failed"] == 1
|
|
55
|
+
assert rows["b1"]["unrated"] == 1
|
|
56
|
+
assert rows["b1"]["pass_rate"] == "50.0%"
|
|
57
|
+
# b2: 1 pass / 1 rated = 100%
|
|
58
|
+
assert rows["b2"]["pass_rate"] == "100.0%"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@pytest.mark.asyncio
|
|
62
|
+
async def test_list_with_results_csv_has_columns():
|
|
63
|
+
client = FakeListClient()
|
|
64
|
+
out = await tools.benchmark_list(client, output_format="csv", with_results=True)
|
|
65
|
+
header = out.splitlines()[0]
|
|
66
|
+
for col in ("passed", "failed", "unrated", "pass_rate"):
|
|
67
|
+
assert col in header
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def test_pass_status_tally_pure():
|
|
71
|
+
tally = tools._pass_status_tally(
|
|
72
|
+
[
|
|
73
|
+
{"id": "1", "pass_status": "pass"},
|
|
74
|
+
{"id": "2", "pass_status": "PASS"}, # case-insensitive
|
|
75
|
+
{"id": "3", "pass_status": "fail"},
|
|
76
|
+
{"id": "4"}, # missing -> unrated
|
|
77
|
+
]
|
|
78
|
+
)
|
|
79
|
+
assert tally["passed"] == 2
|
|
80
|
+
assert tally["failed"] == 1
|
|
81
|
+
assert tally["unrated"] == 1
|
|
82
|
+
assert tally["rated"] == 3
|
|
83
|
+
assert tally["pass_rate"] == round(2 / 3, 4)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def test_pass_status_tally_no_rated():
|
|
87
|
+
tally = tools._pass_status_tally([{"id": "1", "pass_status": "unrated"}])
|
|
88
|
+
assert tally["pass_rate"] is None
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@pytest.mark.asyncio
|
|
92
|
+
async def test_v2_benchmarks_list_with_results():
|
|
93
|
+
from applied_cli.v2.scenarios import (
|
|
94
|
+
BenchmarksListInput,
|
|
95
|
+
benchmarks_list_handler,
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
client = FakeListClient()
|
|
99
|
+
result = await benchmarks_list_handler(
|
|
100
|
+
client, BenchmarksListInput(with_results=True)
|
|
101
|
+
)
|
|
102
|
+
by_id = {r["id"]: r for r in result.data}
|
|
103
|
+
assert by_id["b1"]["results"]["passed"] == 1
|
|
104
|
+
assert by_id["b2"]["results"]["pass_rate"] == 1.0
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|