applied-cli 0.6.4__tar.gz → 0.6.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {applied_cli-0.6.4 → applied_cli-0.6.6}/PKG-INFO +1 -1
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/cli.py +28 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/tools.py +176 -6
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/domains.py +1 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/scenarios.py +102 -1
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli.egg-info/PKG-INFO +1 -1
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli.egg-info/SOURCES.txt +2 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/pyproject.toml +1 -1
- applied_cli-0.6.6/tests/test_benchmark_results.py +78 -0
- applied_cli-0.6.6/tests/test_scenario_bulk_run_wait.py +107 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/README.md +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/__init__.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/agent_scoped_flows.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/auth.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/client.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/conversation_lookup.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/conversations.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/credentials.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/flow_helpers.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/formatters.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/mcp.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/recovery.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/toolkit.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/__init__.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/agents.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/articles.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/catalog.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/connectors.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/content.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/conversations.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/flows.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/knowledge.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/manifest.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/products.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/taxonomy.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/tickets.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli.egg-info/dependency_links.txt +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli.egg-info/entry_points.txt +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli.egg-info/requires.txt +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli.egg-info/top_level.txt +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/setup.cfg +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_agent_scoped_flows.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_audit_tools.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_auth_context.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_benchmark_clone.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_benchmark_delete_guardrail.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_benchmark_scenario_tools.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_cli.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_cli_v2.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_client.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_client_v2.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_conversation_tools.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_flow_tools.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_knowledge_content_tools.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_recovery.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_scenario_bulk_cancel.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_scenario_bulk_run_contact.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_toolkit_contract.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_agents.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_articles.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_catalog_and_mcp.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_connectors.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_content.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_conversations.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_flows.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_knowledge.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_products.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_scenarios.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_taxonomy.py +0 -0
- {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_tickets.py +0 -0
|
@@ -1672,6 +1672,22 @@ def benchmark_delete(
|
|
|
1672
1672
|
typer.echo(result)
|
|
1673
1673
|
|
|
1674
1674
|
|
|
1675
|
+
@app.command("benchmark-results")
def benchmark_results(
    id: str = typer.Argument(..., help="Benchmark ID"),
    shop_id: str = typer.Option(None, "--shop-id", help="Override shop ID"),
    format: str = typer.Option(
        "text", "--format", "-f", help="Output format: text or json"
    ),
) -> None:
    """Summarize a benchmark's pass/fail/unrated health and pass rate."""
    # NOTE: the docstring above doubles as the command's --help text, and the
    # parameter names `id` / `format` are part of the CLI surface, so both are
    # left untouched even though they shadow builtins.
    summary_coro = tools.benchmark_results(
        get_client(shop_id=shop_id),
        benchmark_id=id,
        output_format=format,
    )
    # The tool is async; drive it to completion and print whatever it returns
    # (a rendered report, or an error message string).
    typer.echo(asyncio.run(summary_coro))
|
|
1689
|
+
|
|
1690
|
+
|
|
1675
1691
|
@app.command()
|
|
1676
1692
|
def scenarios(
|
|
1677
1693
|
benchmark_id: str = typer.Option(
|
|
@@ -1928,6 +1944,15 @@ def scenario_bulk_run(
|
|
|
1928
1944
|
anonymous: bool = typer.Option(
|
|
1929
1945
|
False, "--anonymous", help="Run with an anonymous contact"
|
|
1930
1946
|
),
|
|
1947
|
+
wait: bool = typer.Option(
|
|
1948
|
+
False, "--wait", help="Poll until all runs finish, then print final status"
|
|
1949
|
+
),
|
|
1950
|
+
wait_timeout: float = typer.Option(
|
|
1951
|
+
300.0, "--wait-timeout", help="Max seconds to wait with --wait (default 300)"
|
|
1952
|
+
),
|
|
1953
|
+
poll_interval: float = typer.Option(
|
|
1954
|
+
3.0, "--poll-interval", help="Seconds between status polls with --wait"
|
|
1955
|
+
),
|
|
1931
1956
|
shop_id: str = typer.Option(None, "--shop-id", help="Override shop ID"),
|
|
1932
1957
|
format: str = typer.Option(
|
|
1933
1958
|
"text", "--format", "-f", help="Output format: text or json"
|
|
@@ -1944,6 +1969,9 @@ def scenario_bulk_run(
|
|
|
1944
1969
|
contact_email=contact_email,
|
|
1945
1970
|
contact_id=contact_id,
|
|
1946
1971
|
anonymous=anonymous,
|
|
1972
|
+
wait=wait,
|
|
1973
|
+
wait_timeout=wait_timeout,
|
|
1974
|
+
poll_interval=poll_interval,
|
|
1947
1975
|
output_format=format,
|
|
1948
1976
|
)
|
|
1949
1977
|
)
|
|
@@ -8,6 +8,7 @@ import asyncio
|
|
|
8
8
|
import difflib
|
|
9
9
|
import json
|
|
10
10
|
import re
|
|
11
|
+
import time
|
|
11
12
|
from contextlib import suppress
|
|
12
13
|
from html.parser import HTMLParser
|
|
13
14
|
from typing import Any
|
|
@@ -5709,6 +5710,93 @@ async def benchmark_clone(
|
|
|
5709
5710
|
return "\n".join(lines)
|
|
5710
5711
|
|
|
5711
5712
|
|
|
5713
|
+
async def benchmark_results(
    client: AppliedClient,
    benchmark_id: str,
    *,
    output_format: str = "text",
) -> str:
    """
    Summarize a benchmark's pass/fail health.

    Buckets every scenario of the benchmark by its ``pass_status`` (pass, fail,
    or unrated; a missing or unrecognized status counts as unrated), derives
    the pass rate over the rated scenarios only, and reports which scenarios
    are failing or still unrated.

    Args:
        client: Authenticated AppliedClient
        benchmark_id: The benchmark UUID
        output_format: 'text' (default) or 'json'

    Returns:
        The summary as a JSON document when output_format is 'json', otherwise
        a text report. On an API error the formatted error message is returned
        instead.
    """
    try:
        benchmark = await client.get_benchmark(benchmark_id)
        scenarios = await client.list_scenarios(
            benchmark_id=benchmark_id, fetch_all=True
        )
    except AppliedAPIError as e:
        return _format_error(e)

    # One bucket per recognized status; anything else lands in "unrated".
    buckets: dict[str, list[dict[str, Any]]] = {"pass": [], "fail": [], "unrated": []}
    for scenario in scenarios:
        status = str(scenario.get("pass_status") or "unrated").lower()
        bucket = buckets.get(status, buckets["unrated"])
        bucket.append({"id": scenario.get("id"), "name": scenario.get("name")})

    failing = buckets["fail"]
    unrated = buckets["unrated"]
    passed_count = len(buckets["pass"])
    rated = passed_count + len(failing)
    # The pass rate is undefined until at least one scenario has been rated.
    pass_rate = round(passed_count / rated, 4) if rated else None
    benchmark_name = benchmark.get("name")

    summary = {
        "benchmark_id": benchmark_id,
        "benchmark_name": benchmark_name,
        "total_scenarios": len(scenarios),
        "passed": passed_count,
        "failed": len(failing),
        "unrated": len(unrated),
        "rated": rated,
        "pass_rate": pass_rate,
        "failing_scenarios": failing,
        "unrated_scenarios": unrated,
    }
    if output_format == "json":
        return to_json(summary)

    if pass_rate is None:
        pass_rate_str = "n/a (no rated scenarios yet)"
    else:
        pass_rate_str = f"{pass_rate * 100:.1f}% ({passed_count}/{rated} rated)"

    lines = [
        f"# Benchmark Results: {benchmark_name} ({benchmark_id})",
        f"total_scenarios: {summary['total_scenarios']}",
        f"passed: {passed_count}",
        f"failed: {len(failing)}",
        f"unrated: {len(unrated)}",
        f"pass_rate: {pass_rate_str}",
    ]

    # Both detail sections share one layout: a heading, at most 50 entries,
    # then a single line saying how many entries were left out.
    sections = (
        (f"\n# Failing ({len(failing)})", failing),
        (f"\n# Unrated ({len(unrated)}) — evaluate these next", unrated),
    )
    for heading, group in sections:
        if not group:
            continue
        lines.append(heading)
        lines.extend(f" - {s['name']} ({s['id']})" for s in group[:50])
        hidden = len(group) - 50
        if hidden > 0:
            lines.append(f" ... and {hidden} more")
    return "\n".join(lines)
|
|
5798
|
+
|
|
5799
|
+
|
|
5712
5800
|
# -----------------------------------------------------------------------------
|
|
5713
5801
|
# Scenarios
|
|
5714
5802
|
# -----------------------------------------------------------------------------
|
|
@@ -6082,6 +6170,41 @@ async def scenario_run_delete(
|
|
|
6082
6170
|
return f"Scenario run {run_id} deleted successfully."
|
|
6083
6171
|
|
|
6084
6172
|
|
|
6173
|
+
def _bulk_status_counts(counts: dict | None) -> dict[str, int]:
|
|
6174
|
+
"""Normalize bulk-status counts to lowercase keys with int values."""
|
|
6175
|
+
lowered: dict[str, int] = {}
|
|
6176
|
+
for key, value in (counts or {}).items():
|
|
6177
|
+
lowered[str(key).lower()] = int(value or 0)
|
|
6178
|
+
return lowered
|
|
6179
|
+
|
|
6180
|
+
|
|
6181
|
+
def _bulk_pending_count(counts: dict | None) -> int:
    """Return how many runs are still in flight, i.e. queued plus running.

    Status names are matched case-insensitively; absent statuses count as 0.
    """
    by_status = _bulk_status_counts(counts)
    return sum(by_status.get(state, 0) for state in ("queued", "running"))
|
|
6185
|
+
|
|
6186
|
+
|
|
6187
|
+
async def _await_bulk_run(
    client: AppliedClient,
    job_id: str,
    *,
    timeout: float,
    poll_interval: float,
) -> tuple[dict, bool]:
    """Poll a bulk run until no runs are queued/running or the timeout elapses.

    The status is always fetched at least once, so a timeout of 0 still
    returns a real status payload.

    Args:
        client: Client exposing ``get_scenario_bulk_run_status(job_id)``.
        job_id: Identifier of the bulk job to watch.
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to pause between status requests.

    Returns (latest_status_payload, timed_out).

    Errors raised by the client are not caught here; they propagate to the
    caller.
    """
    # Use a monotonic clock so wall-clock adjustments cannot stretch or cut
    # the wait.
    deadline = time.monotonic() + timeout
    status = await client.get_scenario_bulk_run_status(job_id)
    while _bulk_pending_count(status.get("counts")) > 0:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return status, True
        # Never sleep past the deadline: with a poll interval longer than the
        # time left, sleeping the full interval would overshoot the timeout.
        # The capped sleep is followed by one last status check.
        await asyncio.sleep(min(poll_interval, remaining))
        status = await client.get_scenario_bulk_run_status(job_id)
    return status, False
|
|
6206
|
+
|
|
6207
|
+
|
|
6085
6208
|
async def _resolve_contact_override(
|
|
6086
6209
|
client: AppliedClient,
|
|
6087
6210
|
*,
|
|
@@ -6123,6 +6246,9 @@ async def scenario_bulk_run(
|
|
|
6123
6246
|
contact_id: str | None = None,
|
|
6124
6247
|
contact_email: str | None = None,
|
|
6125
6248
|
anonymous: bool = False,
|
|
6249
|
+
wait: bool = False,
|
|
6250
|
+
wait_timeout: float = 300.0,
|
|
6251
|
+
poll_interval: float = 3.0,
|
|
6126
6252
|
output_format: str = "text",
|
|
6127
6253
|
) -> str:
|
|
6128
6254
|
"""
|
|
@@ -6134,6 +6260,10 @@ async def scenario_bulk_run(
|
|
|
6134
6260
|
to run the scenarios as a contact that has an email, so the test conversation
|
|
6135
6261
|
carries it.
|
|
6136
6262
|
|
|
6263
|
+
With wait=True, this polls until every run finishes (or the timeout elapses)
|
|
6264
|
+
and returns the final status, so you can run a benchmark and read results in
|
|
6265
|
+
one call instead of polling scenario_bulk_status yourself.
|
|
6266
|
+
|
|
6137
6267
|
Args:
|
|
6138
6268
|
client: Authenticated AppliedClient
|
|
6139
6269
|
scenario_ids: List of scenario UUIDs to run
|
|
@@ -6144,9 +6274,12 @@ async def scenario_bulk_run(
|
|
|
6144
6274
|
contact_id: Run scenarios as this existing contact (gives test convos its email)
|
|
6145
6275
|
contact_email: Resolve/create a contact with this email and run as them
|
|
6146
6276
|
anonymous: Run with an anonymous contact (mode='anonymous')
|
|
6277
|
+
wait: Poll until all runs finish (or wait_timeout elapses)
|
|
6278
|
+
wait_timeout: Max seconds to wait when wait=True (default 300)
|
|
6279
|
+
poll_interval: Seconds between status polls when wait=True (default 3)
|
|
6147
6280
|
|
|
6148
6281
|
Returns:
|
|
6149
|
-
Summary of runs created
|
|
6282
|
+
Summary of runs created (plus the final status when wait=True)
|
|
6150
6283
|
"""
|
|
6151
6284
|
resolved_scenario_ids = list(scenario_ids or [])
|
|
6152
6285
|
if not resolved_scenario_ids:
|
|
@@ -6196,6 +6329,26 @@ async def scenario_bulk_run(
|
|
|
6196
6329
|
"contact_override": result.get("contact_override"),
|
|
6197
6330
|
}
|
|
6198
6331
|
|
|
6332
|
+
job_id = payload.get("job_id")
|
|
6333
|
+
final_status: dict | None = None
|
|
6334
|
+
timed_out = False
|
|
6335
|
+
if wait and job_id:
|
|
6336
|
+
try:
|
|
6337
|
+
final_status, timed_out = await _await_bulk_run(
|
|
6338
|
+
client,
|
|
6339
|
+
str(job_id),
|
|
6340
|
+
timeout=wait_timeout,
|
|
6341
|
+
poll_interval=poll_interval,
|
|
6342
|
+
)
|
|
6343
|
+
except AppliedAPIError as e:
|
|
6344
|
+
return _format_error(e)
|
|
6345
|
+
counts = _bulk_status_counts(final_status.get("counts"))
|
|
6346
|
+
payload["final_counts"] = counts
|
|
6347
|
+
payload["timed_out"] = timed_out
|
|
6348
|
+
payload["duration_seconds"] = final_status.get("duration_seconds")
|
|
6349
|
+
payload["completed_at"] = final_status.get("completed_at")
|
|
6350
|
+
payload["failed"] = final_status.get("failed") or []
|
|
6351
|
+
|
|
6199
6352
|
if output_format == "json":
|
|
6200
6353
|
return to_json(payload)
|
|
6201
6354
|
|
|
@@ -6212,6 +6365,23 @@ async def scenario_bulk_run(
|
|
|
6212
6365
|
output += f"scenario_run_ids: {preview_ids}\n"
|
|
6213
6366
|
if len(run_ids) > 10:
|
|
6214
6367
|
output += f"more_runs: {len(run_ids) - 10}\n"
|
|
6368
|
+
|
|
6369
|
+
if final_status is not None:
|
|
6370
|
+
counts = payload["final_counts"]
|
|
6371
|
+
output += "\n# Final Status\n"
|
|
6372
|
+
output += "timed_out: " + ("true (still pending)" if timed_out else "false") + "\n"
|
|
6373
|
+
output += f"completed: {counts.get('completed', 0)}\n"
|
|
6374
|
+
output += f"failed: {counts.get('failed', 0)}\n"
|
|
6375
|
+
pending = counts.get("queued", 0) + counts.get("running", 0)
|
|
6376
|
+
output += f"still_pending: {pending}\n"
|
|
6377
|
+
if payload.get("duration_seconds") is not None:
|
|
6378
|
+
output += f"duration_seconds: {payload['duration_seconds']}\n"
|
|
6379
|
+
failed_runs = payload.get("failed") or []
|
|
6380
|
+
if failed_runs:
|
|
6381
|
+
output += f"\n# Failed Runs ({len(failed_runs)})\n"
|
|
6382
|
+
output += to_json(failed_runs)
|
|
6383
|
+
return output
|
|
6384
|
+
|
|
6215
6385
|
output += "\nTip: use scenario_bulk_status(job_id, include_runs=True) or scenario_run_list(bulk_job_id=job_id) to get per-run details with scenario mappings."
|
|
6216
6386
|
return output
|
|
6217
6387
|
|
|
@@ -6245,14 +6415,14 @@ async def scenario_bulk_status(
|
|
|
6245
6415
|
payload.pop("runs", None)
|
|
6246
6416
|
return to_json(payload)
|
|
6247
6417
|
|
|
6248
|
-
counts = result.get("counts")
|
|
6418
|
+
counts = _bulk_status_counts(result.get("counts"))
|
|
6249
6419
|
output = "# Bulk Run Status\n"
|
|
6250
6420
|
output += f"job_id: {result.get('job_id')}\n"
|
|
6251
6421
|
output += f"total: {result.get('total')}\n"
|
|
6252
|
-
output += f"queued: {counts.get('
|
|
6253
|
-
output += f"running: {counts.get('
|
|
6254
|
-
output += f"completed: {counts.get('
|
|
6255
|
-
output += f"failed: {counts.get('
|
|
6422
|
+
output += f"queued: {counts.get('queued', 0)}\n"
|
|
6423
|
+
output += f"running: {counts.get('running', 0)}\n"
|
|
6424
|
+
output += f"completed: {counts.get('completed', 0)}\n"
|
|
6425
|
+
output += f"failed: {counts.get('failed', 0)}\n"
|
|
6256
6426
|
output += f"created_at: {result.get('created_at')}\n"
|
|
6257
6427
|
output += f"updated_at: {result.get('updated_at')}\n"
|
|
6258
6428
|
if result.get("completed_at"):
|
|
@@ -43,6 +43,7 @@ DOMAIN_TOOL_RENAMES: dict[str, dict[str, str]] = {
|
|
|
43
43
|
"benchmark_create": "benchmarks_create",
|
|
44
44
|
"benchmark_delete": "benchmarks_delete",
|
|
45
45
|
"benchmark_clone": "benchmarks_clone",
|
|
46
|
+
"benchmark_results": "benchmarks_results",
|
|
46
47
|
},
|
|
47
48
|
"connectors": {
|
|
48
49
|
"connector_types": "connectors_types_list",
|
|
@@ -30,6 +30,9 @@ class ScenariosBulkRunInput(StrictInput):
|
|
|
30
30
|
contact_id: str | None = None
|
|
31
31
|
contact_email: str | None = None
|
|
32
32
|
anonymous: bool = False
|
|
33
|
+
wait: bool = False
|
|
34
|
+
wait_timeout: float = 300.0
|
|
35
|
+
poll_interval: float = 3.0
|
|
33
36
|
|
|
34
37
|
|
|
35
38
|
class ScenariosBulkCancelInput(StrictInput):
|
|
@@ -68,6 +71,10 @@ class BenchmarksCloneInput(StrictInput):
|
|
|
68
71
|
apply: bool = False
|
|
69
72
|
|
|
70
73
|
|
|
74
|
+
# Input parameters for the benchmarks_results tool.
class BenchmarksResultsInput(StrictInput):
    # Identifier of the benchmark whose scenarios should be summarized.
    benchmark_id: str
|
|
76
|
+
|
|
77
|
+
|
|
71
78
|
class ScenariosListInput(StrictInput):
|
|
72
79
|
benchmark_id: str | None = None
|
|
73
80
|
agent_id: str | None = None
|
|
@@ -500,6 +507,44 @@ async def benchmarks_clone_handler(
|
|
|
500
507
|
)
|
|
501
508
|
|
|
502
509
|
|
|
510
|
+
async def benchmarks_results_handler(
    client: AppliedClient,
    params: BenchmarksResultsInput,
) -> ToolResult[Any]:
    """Wrap the legacy benchmark_results tool in a structured ToolResult.

    The legacy tool is asked for JSON output. If what it returns cannot be
    parsed as JSON, the raw value is passed through as the result's message
    and summary. Otherwise the parsed summary becomes the result data, with a
    one-line pass-rate summary and follow-up hints for unrated and failed
    scenarios.
    """
    # Imported lazily, as in the original handler.
    from applied_cli import tools as legacy_tools

    raw = await legacy_tools.benchmark_results(
        client, benchmark_id=params.benchmark_id, output_format="json"
    )
    try:
        data = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        return ToolResult(data={"message": raw}, summary=str(raw))

    pass_rate = data.get("pass_rate")
    if pass_rate is None:
        rate_str = "n/a (no rated yet)"
    else:
        rate_str = f"{pass_rate * 100:.1f}%"

    # A hint is suggested only when its counter is non-zero.
    follow_ups = (
        ("unrated", "Rate the unrated scenarios with scenarios_update (pass_status)."),
        (
            "failed",
            "Inspect failing scenarios with scenarios_get / conversations_debug_bundle.",
        ),
    )
    next_actions = [hint for counter, hint in follow_ups if data.get(counter)]

    label = data.get("benchmark_name") or params.benchmark_id
    summary = (
        f"{label}: pass rate "
        f"{rate_str} — {data.get('passed', 0)} passed, "
        f"{data.get('failed', 0)} failed, {data.get('unrated', 0)} unrated."
    )
    return ToolResult(data=data, summary=summary, next_actions=next_actions)
|
|
546
|
+
|
|
547
|
+
|
|
503
548
|
async def benchmarks_delete_handler(
|
|
504
549
|
client: AppliedClient,
|
|
505
550
|
params: BenchmarksDeleteInput,
|
|
@@ -828,6 +873,48 @@ async def scenarios_bulk_run_handler(
|
|
|
828
873
|
"duplicated_scenarios": result.get("duplicated_scenarios"),
|
|
829
874
|
"contact_override": result.get("contact_override"),
|
|
830
875
|
}
|
|
876
|
+
|
|
877
|
+
job_id = payload.get("job_id")
|
|
878
|
+
if params.wait and job_id:
|
|
879
|
+
from applied_cli.tools import _await_bulk_run, _bulk_status_counts
|
|
880
|
+
|
|
881
|
+
try:
|
|
882
|
+
final_status, timed_out = await _await_bulk_run(
|
|
883
|
+
client,
|
|
884
|
+
str(job_id),
|
|
885
|
+
timeout=params.wait_timeout,
|
|
886
|
+
poll_interval=params.poll_interval,
|
|
887
|
+
)
|
|
888
|
+
except AppliedAPIError as exc:
|
|
889
|
+
return _api_error_result(exc)
|
|
890
|
+
counts = _bulk_status_counts(final_status.get("counts"))
|
|
891
|
+
payload["final_counts"] = counts
|
|
892
|
+
payload["timed_out"] = timed_out
|
|
893
|
+
payload["duration_seconds"] = final_status.get("duration_seconds")
|
|
894
|
+
payload["failed"] = final_status.get("failed") or []
|
|
895
|
+
pending = counts.get("queued", 0) + counts.get("running", 0)
|
|
896
|
+
summary = (
|
|
897
|
+
f"Bulk job {job_id} "
|
|
898
|
+
+ ("timed out with " if timed_out else "finished: ")
|
|
899
|
+
+ f"{counts.get('completed', 0)} completed, "
|
|
900
|
+
+ f"{counts.get('failed', 0)} failed"
|
|
901
|
+
+ (f", {pending} still pending" if pending else "")
|
|
902
|
+
+ "."
|
|
903
|
+
)
|
|
904
|
+
warnings = []
|
|
905
|
+
if counts.get("failed"):
|
|
906
|
+
warnings.append(f"{counts['failed']} run(s) failed.")
|
|
907
|
+
if timed_out:
|
|
908
|
+
warnings.append("Timed out before all runs finished.")
|
|
909
|
+
return ToolResult(
|
|
910
|
+
data=payload,
|
|
911
|
+
summary=summary,
|
|
912
|
+
warnings=warnings,
|
|
913
|
+
next_actions=[
|
|
914
|
+
"Use scenarios_bulk_status with include_runs=true to inspect runs.",
|
|
915
|
+
],
|
|
916
|
+
)
|
|
917
|
+
|
|
831
918
|
queued = payload.get("queued") or 0
|
|
832
919
|
return ToolResult(
|
|
833
920
|
data=payload,
|
|
@@ -961,6 +1048,19 @@ def scenario_specs() -> list[ToolSpec]:
|
|
|
961
1048
|
read_write_mode="write",
|
|
962
1049
|
tags=["benchmark_clone", "native"],
|
|
963
1050
|
),
|
|
1051
|
+
ToolSpec(
|
|
1052
|
+
name="benchmarks_results",
|
|
1053
|
+
namespace="benchmarks",
|
|
1054
|
+
description=(
|
|
1055
|
+
"Summarize a benchmark's pass/fail/unrated health and pass rate, "
|
|
1056
|
+
"with the failing and unrated scenario lists."
|
|
1057
|
+
),
|
|
1058
|
+
input_model=BenchmarksResultsInput,
|
|
1059
|
+
output_model=None,
|
|
1060
|
+
handler=benchmarks_results_handler,
|
|
1061
|
+
read_write_mode="read",
|
|
1062
|
+
tags=["benchmark_results", "native"],
|
|
1063
|
+
),
|
|
964
1064
|
ToolSpec(
|
|
965
1065
|
name="scenarios_list",
|
|
966
1066
|
namespace="scenarios",
|
|
@@ -1064,7 +1164,8 @@ def scenario_specs() -> list[ToolSpec]:
|
|
|
1064
1164
|
"Run selected scenarios or every scenario in a benchmark and "
|
|
1065
1165
|
"return the queued job metadata. Pass contact_email or contact_id "
|
|
1066
1166
|
"to run as a contact with an email (fixes 'Email is not present' "
|
|
1067
|
-
"failures on test conversations)."
|
|
1167
|
+
"failures on test conversations). Pass wait=true to block until "
|
|
1168
|
+
"all runs finish and return the final status in one call."
|
|
1068
1169
|
),
|
|
1069
1170
|
input_model=ScenariosBulkRunInput,
|
|
1070
1171
|
output_model=None,
|
|
@@ -40,6 +40,7 @@ tests/test_audit_tools.py
|
|
|
40
40
|
tests/test_auth_context.py
|
|
41
41
|
tests/test_benchmark_clone.py
|
|
42
42
|
tests/test_benchmark_delete_guardrail.py
|
|
43
|
+
tests/test_benchmark_results.py
|
|
43
44
|
tests/test_benchmark_scenario_tools.py
|
|
44
45
|
tests/test_cli.py
|
|
45
46
|
tests/test_cli_v2.py
|
|
@@ -51,6 +52,7 @@ tests/test_knowledge_content_tools.py
|
|
|
51
52
|
tests/test_recovery.py
|
|
52
53
|
tests/test_scenario_bulk_cancel.py
|
|
53
54
|
tests/test_scenario_bulk_run_contact.py
|
|
55
|
+
tests/test_scenario_bulk_run_wait.py
|
|
54
56
|
tests/test_toolkit_contract.py
|
|
55
57
|
tests/test_v2_agents.py
|
|
56
58
|
tests/test_v2_articles.py
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from applied_cli import tools
|
|
6
|
+
|
|
7
|
+
# Benchmark record handed back by FakeResultsClient.get_benchmark().
BENCHMARK = {"id": "bench-1", "name": "Cancel Regression"}

# Default scenario fixture: two passing, one failing, one explicitly unrated,
# and one without a pass_status key at all.
SCENARIOS = [
    {"id": "s1", "name": "Cancel order", "pass_status": "pass"},
    {"id": "s2", "name": "Refund flow", "pass_status": "pass"},
    {"id": "s3", "name": "Pause subscription", "pass_status": "fail"},
    {"id": "s4", "name": "Address change", "pass_status": "unrated"},
    {"id": "s5", "name": "No status field"},  # missing -> unrated
]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FakeResultsClient:
    """In-memory stand-in for the API client used by the benchmark_results tests.

    Args:
        scenarios: Scenario dicts served by list_scenarios(). When omitted (or
            None), the module-level SCENARIOS fixture is used.
    """

    def __init__(self, scenarios=None):
        # Resolve the default at call time and keep a private copy, so the
        # shared SCENARIOS list (or a caller's list) is never aliased by this
        # instance and cannot leak mutations between tests.
        self._scenarios = list(SCENARIOS if scenarios is None else scenarios)

    async def get_benchmark(self, benchmark_id):
        """Return the fixed BENCHMARK record regardless of the requested id."""
        return BENCHMARK

    async def list_scenarios(self, benchmark_id=None, fetch_all=True, **kwargs):
        """Return a fresh list of the configured scenarios; arguments are ignored."""
        return list(self._scenarios)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@pytest.mark.asyncio
async def test_results_tally_and_pass_rate():
    """JSON output tallies each status and derives the pass rate from rated scenarios."""
    raw = await tools.benchmark_results(
        FakeResultsClient(), "bench-1", output_format="json"
    )
    data = json.loads(raw)

    expected_counts = {
        "total_scenarios": 5,
        "passed": 2,
        "failed": 1,
        "unrated": 2,
        "rated": 3,
    }
    assert {field: data[field] for field in expected_counts} == expected_counts
    # 2 passed / 3 rated
    assert data["pass_rate"] == round(2 / 3, 4)

    failing_ids = [s["id"] for s in data["failing_scenarios"]]
    unrated_ids = {s["id"] for s in data["unrated_scenarios"]}
    assert failing_ids == ["s3"]
    assert unrated_ids == {"s4", "s5"}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@pytest.mark.asyncio
async def test_results_no_rated_scenarios_pass_rate_none():
    """With only unrated scenarios, the text report shows the n/a pass-rate placeholder."""
    only_unrated = [{"id": "s1", "name": "A", "pass_status": "unrated"}]
    client = FakeResultsClient(scenarios=only_unrated)

    report = await tools.benchmark_results(client, "bench-1", output_format="text")

    assert "n/a (no rated scenarios yet)" in report
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@pytest.mark.asyncio
async def test_results_text_lists_failing_and_unrated():
    """The default (text) report has the Failing and Unrated sections with their counts."""
    report = await tools.benchmark_results(FakeResultsClient(), "bench-1")

    for fragment in ("# Failing (1)", "Pause subscription", "# Unrated (2)"):
        assert fragment in report
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@pytest.mark.asyncio
async def test_v2_benchmarks_results_handler_summary():
    """The v2 handler exposes the parsed summary plus both follow-up hints."""
    from applied_cli.v2.scenarios import (
        BenchmarksResultsInput,
        benchmarks_results_handler,
    )

    params = BenchmarksResultsInput(benchmark_id="bench-1")
    result = await benchmarks_results_handler(FakeResultsClient(), params)

    assert result.data["passed"] == 2
    assert "pass rate" in result.summary
    # Has unrated + failing → both follow-up actions surfaced.
    assert len(result.next_actions) == 2
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from applied_cli import tools
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class FakeWaitClient:
    """Bulk client whose status transitions to done after N polls.

    Each status request returns the next payload of ``status_sequence``; once
    the sequence is exhausted the last payload is repeated. ``status_calls``
    counts how many status requests were made.
    """

    def __init__(self, status_sequence):
        self._status_sequence = list(status_sequence)
        self._cursor = 0
        self.status_calls = 0

    async def list_scenarios(self, benchmark_id=None, limit=500, **kwargs):
        """Return two fixed scenarios; arguments are ignored."""
        return [{"id": "s1"}, {"id": "s2"}]

    async def bulk_run_scenarios(
        self, scenario_ids=None, target_agent_id=None, contact_override=None
    ):
        """Pretend to queue one run per requested scenario under job 'job-1'."""
        requested = len(scenario_ids or [])
        return {
            "job_id": "job-1",
            "total": requested,
            "queued": requested,
            "scenario_run_ids": ["r1", "r2"],
        }

    async def get_scenario_bulk_run_status(self, job_id):
        """Return the next scripted status payload, sticking on the last one."""
        self.status_calls += 1
        position = min(self._cursor, len(self._status_sequence) - 1)
        self._cursor += 1
        return self._status_sequence[position]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@pytest.mark.asyncio
async def test_wait_polls_until_no_pending(monkeypatch):
    """wait=True keeps polling until nothing is queued or running, then reports the final status."""

    async def _skip_sleep(_seconds):
        return None

    # Avoid real sleeping between polls.
    monkeypatch.setattr(tools.asyncio, "sleep", _skip_sleep)

    all_queued = {"counts": {"queued": 2, "running": 0, "completed": 0, "failed": 0}}
    one_running = {"counts": {"queued": 0, "running": 1, "completed": 1, "failed": 0}}
    all_done = {
        "counts": {"queued": 0, "running": 0, "completed": 2, "failed": 0},
        "duration_seconds": 12.5,
        "completed_at": "2026-06-05T10:00:00Z",
        "failed": [],
    }
    client = FakeWaitClient(status_sequence=[all_queued, one_running, all_done])

    raw = await tools.scenario_bulk_run(
        client, benchmark_id="bench-1", wait=True, output_format="json"
    )
    data = json.loads(raw)

    assert data["timed_out"] is False
    assert data["final_counts"]["completed"] == 2
    assert data["duration_seconds"] == 12.5
    assert client.status_calls == 3
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@pytest.mark.asyncio
async def test_wait_times_out_when_runs_stay_pending(monkeypatch):
    """A job that never leaves the queue is reported as timed out with its last counts."""

    async def _skip_sleep(_seconds):
        return None

    monkeypatch.setattr(tools.asyncio, "sleep", _skip_sleep)

    # Always pending → must hit the timeout path.
    stuck = {"counts": {"queued": 2, "running": 0, "completed": 0, "failed": 0}}
    client = FakeWaitClient(status_sequence=[stuck])

    raw = await tools.scenario_bulk_run(
        client,
        benchmark_id="bench-1",
        wait=True,
        wait_timeout=0.0,  # immediate timeout after first poll
        output_format="json",
    )
    data = json.loads(raw)

    assert data["timed_out"] is True
    assert data["final_counts"]["queued"] == 2
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@pytest.mark.asyncio
async def test_no_wait_returns_started_summary():
    """Without wait, the call returns right after starting the job and never polls status."""
    client = FakeWaitClient(status_sequence=[{"counts": {}}])

    raw = await tools.scenario_bulk_run(
        client, benchmark_id="bench-1", output_format="json"
    )
    data = json.loads(raw)

    assert "final_counts" not in data
    assert client.status_calls == 0  # no polling when wait is False
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def test_bulk_status_counts_normalizes_case_and_types():
    """Count helpers lowercase the status keys, coerce values to int, and sum queued + running."""
    normalized = tools._bulk_status_counts({"QUEUED": 2, "Running": "1"})
    assert normalized == {"queued": 2, "running": 1}

    pending = tools._bulk_pending_count({"QUEUED": 3, "RUNNING": 4, "COMPLETED": 9})
    assert pending == 7
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|