applied-cli 0.6.4__tar.gz → 0.6.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. {applied_cli-0.6.4 → applied_cli-0.6.6}/PKG-INFO +1 -1
  2. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/cli.py +28 -0
  3. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/tools.py +176 -6
  4. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/domains.py +1 -0
  5. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/scenarios.py +102 -1
  6. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli.egg-info/PKG-INFO +1 -1
  7. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli.egg-info/SOURCES.txt +2 -0
  8. {applied_cli-0.6.4 → applied_cli-0.6.6}/pyproject.toml +1 -1
  9. applied_cli-0.6.6/tests/test_benchmark_results.py +78 -0
  10. applied_cli-0.6.6/tests/test_scenario_bulk_run_wait.py +107 -0
  11. {applied_cli-0.6.4 → applied_cli-0.6.6}/README.md +0 -0
  12. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/__init__.py +0 -0
  13. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/agent_scoped_flows.py +0 -0
  14. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/auth.py +0 -0
  15. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/client.py +0 -0
  16. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/conversation_lookup.py +0 -0
  17. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/conversations.py +0 -0
  18. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/credentials.py +0 -0
  19. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/flow_helpers.py +0 -0
  20. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/formatters.py +0 -0
  21. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/mcp.py +0 -0
  22. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/recovery.py +0 -0
  23. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/toolkit.py +0 -0
  24. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/__init__.py +0 -0
  25. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/agents.py +0 -0
  26. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/articles.py +0 -0
  27. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/catalog.py +0 -0
  28. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/connectors.py +0 -0
  29. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/content.py +0 -0
  30. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/conversations.py +0 -0
  31. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/flows.py +0 -0
  32. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/knowledge.py +0 -0
  33. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/manifest.py +0 -0
  34. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/products.py +0 -0
  35. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/taxonomy.py +0 -0
  36. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli/v2/tickets.py +0 -0
  37. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli.egg-info/dependency_links.txt +0 -0
  38. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli.egg-info/entry_points.txt +0 -0
  39. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli.egg-info/requires.txt +0 -0
  40. {applied_cli-0.6.4 → applied_cli-0.6.6}/applied_cli.egg-info/top_level.txt +0 -0
  41. {applied_cli-0.6.4 → applied_cli-0.6.6}/setup.cfg +0 -0
  42. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_agent_scoped_flows.py +0 -0
  43. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_audit_tools.py +0 -0
  44. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_auth_context.py +0 -0
  45. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_benchmark_clone.py +0 -0
  46. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_benchmark_delete_guardrail.py +0 -0
  47. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_benchmark_scenario_tools.py +0 -0
  48. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_cli.py +0 -0
  49. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_cli_v2.py +0 -0
  50. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_client.py +0 -0
  51. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_client_v2.py +0 -0
  52. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_conversation_tools.py +0 -0
  53. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_flow_tools.py +0 -0
  54. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_knowledge_content_tools.py +0 -0
  55. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_recovery.py +0 -0
  56. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_scenario_bulk_cancel.py +0 -0
  57. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_scenario_bulk_run_contact.py +0 -0
  58. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_toolkit_contract.py +0 -0
  59. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_agents.py +0 -0
  60. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_articles.py +0 -0
  61. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_catalog_and_mcp.py +0 -0
  62. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_connectors.py +0 -0
  63. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_content.py +0 -0
  64. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_conversations.py +0 -0
  65. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_flows.py +0 -0
  66. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_knowledge.py +0 -0
  67. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_products.py +0 -0
  68. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_scenarios.py +0 -0
  69. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_taxonomy.py +0 -0
  70. {applied_cli-0.6.4 → applied_cli-0.6.6}/tests/test_v2_tickets.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: applied-cli
3
- Version: 0.6.4
3
+ Version: 0.6.6
4
4
  Summary: CLI and shared client library for Applied Labs AI support agents
5
5
  Author: Applied Labs
6
6
  License-Expression: MIT
@@ -1672,6 +1672,22 @@ def benchmark_delete(
1672
1672
  typer.echo(result)
1673
1673
 
1674
1674
 
1675
@app.command("benchmark-results")
def benchmark_results(
    id: str = typer.Argument(..., help="Benchmark ID"),
    shop_id: str = typer.Option(None, "--shop-id", help="Override shop ID"),
    format: str = typer.Option(
        "text", "--format", "-f", help="Output format: text or json"
    ),
) -> None:
    """Summarize a benchmark's pass/fail/unrated health and pass rate."""
    # `id` and `format` shadow builtins, but they are this command's public
    # parameter names, so they are kept as-is.
    summary_coro = tools.benchmark_results(
        get_client(shop_id=shop_id),
        benchmark_id=id,
        output_format=format,
    )
    # The tool is async; drive it to completion and print whatever it returns
    # (either the rendered summary or a formatted error string).
    typer.echo(asyncio.run(summary_coro))
1689
+
1690
+
1675
1691
  @app.command()
1676
1692
  def scenarios(
1677
1693
  benchmark_id: str = typer.Option(
@@ -1928,6 +1944,15 @@ def scenario_bulk_run(
1928
1944
  anonymous: bool = typer.Option(
1929
1945
  False, "--anonymous", help="Run with an anonymous contact"
1930
1946
  ),
1947
+ wait: bool = typer.Option(
1948
+ False, "--wait", help="Poll until all runs finish, then print final status"
1949
+ ),
1950
+ wait_timeout: float = typer.Option(
1951
+ 300.0, "--wait-timeout", help="Max seconds to wait with --wait (default 300)"
1952
+ ),
1953
+ poll_interval: float = typer.Option(
1954
+ 3.0, "--poll-interval", help="Seconds between status polls with --wait"
1955
+ ),
1931
1956
  shop_id: str = typer.Option(None, "--shop-id", help="Override shop ID"),
1932
1957
  format: str = typer.Option(
1933
1958
  "text", "--format", "-f", help="Output format: text or json"
@@ -1944,6 +1969,9 @@ def scenario_bulk_run(
1944
1969
  contact_email=contact_email,
1945
1970
  contact_id=contact_id,
1946
1971
  anonymous=anonymous,
1972
+ wait=wait,
1973
+ wait_timeout=wait_timeout,
1974
+ poll_interval=poll_interval,
1947
1975
  output_format=format,
1948
1976
  )
1949
1977
  )
@@ -8,6 +8,7 @@ import asyncio
8
8
  import difflib
9
9
  import json
10
10
  import re
11
+ import time
11
12
  from contextlib import suppress
12
13
  from html.parser import HTMLParser
13
14
  from typing import Any
@@ -5709,6 +5710,93 @@ async def benchmark_clone(
5709
5710
  return "\n".join(lines)
5710
5711
 
5711
5712
 
5713
def _render_scenario_section(
    heading: str,
    entries: list[dict[str, Any]],
    *,
    limit: int = 50,
) -> list[str]:
    """Render one headed scenario list for the text report, capped at ``limit`` rows."""
    rendered = [f"\n# {heading}"]
    rendered.extend(f" - {s['name']} ({s['id']})" for s in entries[:limit])
    hidden = len(entries) - limit
    if hidden > 0:
        rendered.append(f" ... and {hidden} more")
    return rendered


async def benchmark_results(
    client: AppliedClient,
    benchmark_id: str,
    *,
    output_format: str = "text",
) -> str:
    """
    Summarize a benchmark's pass/fail health.

    Tallies the pass_status across the benchmark's scenarios (pass / fail /
    unrated), computes the pass rate among rated scenarios, and lists the failing
    and still-unrated scenarios so you know what to fix or evaluate next.
    A missing, empty, or unrecognized pass_status is counted as unrated.

    Args:
        client: Authenticated AppliedClient
        benchmark_id: The benchmark UUID
        output_format: 'text' (default) or 'json'

    Returns:
        Pass-rate summary with failing and unrated scenario lists, or the
        formatted error string when the API call fails.
    """
    try:
        benchmark = await client.get_benchmark(benchmark_id)
        scenarios = await client.list_scenarios(
            benchmark_id=benchmark_id, fetch_all=True
        )
    except AppliedAPIError as e:
        return _format_error(e)

    # Tolerate an empty/None payload from the client instead of crashing on
    # `.get` / `len` below.
    benchmark = benchmark or {}
    scenarios = list(scenarios or [])
    benchmark_name = benchmark.get("name")

    tally = {"pass": 0, "fail": 0, "unrated": 0}
    failing: list[dict[str, Any]] = []
    unrated: list[dict[str, Any]] = []
    for scenario in scenarios:
        # Normalize case and stray whitespace so " PASS " still counts as a pass.
        status = str(scenario.get("pass_status") or "unrated").strip().lower()
        if status not in tally:
            status = "unrated"
        tally[status] += 1
        entry = {"id": scenario.get("id"), "name": scenario.get("name")}
        if status == "fail":
            failing.append(entry)
        elif status == "unrated":
            unrated.append(entry)

    rated = tally["pass"] + tally["fail"]
    # The pass rate is only defined once at least one scenario has been rated.
    pass_rate = round(tally["pass"] / rated, 4) if rated else None
    summary = {
        "benchmark_id": benchmark_id,
        "benchmark_name": benchmark_name,
        "total_scenarios": len(scenarios),
        "passed": tally["pass"],
        "failed": tally["fail"],
        "unrated": tally["unrated"],
        "rated": rated,
        "pass_rate": pass_rate,
        "failing_scenarios": failing,
        "unrated_scenarios": unrated,
    }

    if output_format == "json":
        return to_json(summary)

    pass_rate_str = (
        f"{pass_rate * 100:.1f}% ({tally['pass']}/{rated} rated)"
        if pass_rate is not None
        else "n/a (no rated scenarios yet)"
    )
    lines = [
        f"# Benchmark Results: {benchmark_name} ({benchmark_id})",
        f"total_scenarios: {summary['total_scenarios']}",
        f"passed: {tally['pass']}",
        f"failed: {tally['fail']}",
        f"unrated: {tally['unrated']}",
        f"pass_rate: {pass_rate_str}",
    ]
    if failing:
        lines.extend(_render_scenario_section(f"Failing ({len(failing)})", failing))
    if unrated:
        lines.extend(
            _render_scenario_section(
                f"Unrated ({len(unrated)}) — evaluate these next", unrated
            )
        )
    return "\n".join(lines)
5798
+
5799
+
5712
5800
  # -----------------------------------------------------------------------------
5713
5801
  # Scenarios
5714
5802
  # -----------------------------------------------------------------------------
@@ -6082,6 +6170,41 @@ async def scenario_run_delete(
6082
6170
  return f"Scenario run {run_id} deleted successfully."
6083
6171
 
6084
6172
 
6173
+ def _bulk_status_counts(counts: dict | None) -> dict[str, int]:
6174
+ """Normalize bulk-status counts to lowercase keys with int values."""
6175
+ lowered: dict[str, int] = {}
6176
+ for key, value in (counts or {}).items():
6177
+ lowered[str(key).lower()] = int(value or 0)
6178
+ return lowered
6179
+
6180
+
6181
def _bulk_pending_count(counts: dict | None) -> int:
    """Return how many runs are still queued or running.

    The counts are first normalized with ``_bulk_status_counts``, so status
    keys are matched case-insensitively; absent statuses count as 0.
    """
    by_status = _bulk_status_counts(counts)
    return sum(by_status.get(state, 0) for state in ("queued", "running"))
6185
+
6186
+
6187
async def _await_bulk_run(
    client: AppliedClient,
    job_id: str,
    *,
    timeout: float,
    poll_interval: float,
) -> tuple[dict, bool]:
    """Poll a bulk run until no runs are queued/running or the timeout elapses.

    The status is always fetched at least once, even with ``timeout=0``. Each
    sleep is capped at the time remaining, so the total wait does not overshoot
    ``timeout`` by a full ``poll_interval``; a final poll still happens at the
    deadline before giving up.

    Args:
        client: Client exposing ``get_scenario_bulk_run_status(job_id)``.
        job_id: Identifier of the bulk job to poll.
        timeout: Maximum number of seconds to wait.
        poll_interval: Seconds to sleep between polls.

    Returns:
        (latest_status_payload, timed_out).
    """
    deadline = time.monotonic() + timeout
    status = await client.get_scenario_bulk_run_status(job_id)
    while _bulk_pending_count(status.get("counts")) > 0:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return status, True
        # Never sleep past the deadline; a negative interval degrades to 0.
        await asyncio.sleep(min(max(poll_interval, 0.0), remaining))
        status = await client.get_scenario_bulk_run_status(job_id)
    return status, False
6206
+
6207
+
6085
6208
  async def _resolve_contact_override(
6086
6209
  client: AppliedClient,
6087
6210
  *,
@@ -6123,6 +6246,9 @@ async def scenario_bulk_run(
6123
6246
  contact_id: str | None = None,
6124
6247
  contact_email: str | None = None,
6125
6248
  anonymous: bool = False,
6249
+ wait: bool = False,
6250
+ wait_timeout: float = 300.0,
6251
+ poll_interval: float = 3.0,
6126
6252
  output_format: str = "text",
6127
6253
  ) -> str:
6128
6254
  """
@@ -6134,6 +6260,10 @@ async def scenario_bulk_run(
6134
6260
  to run the scenarios as a contact that has an email, so the test conversation
6135
6261
  carries it.
6136
6262
 
6263
+ With wait=True, this polls until every run finishes (or the timeout elapses)
6264
+ and returns the final status, so you can run a benchmark and read results in
6265
+ one call instead of polling scenario_bulk_status yourself.
6266
+
6137
6267
  Args:
6138
6268
  client: Authenticated AppliedClient
6139
6269
  scenario_ids: List of scenario UUIDs to run
@@ -6144,9 +6274,12 @@ async def scenario_bulk_run(
6144
6274
  contact_id: Run scenarios as this existing contact (gives test convos its email)
6145
6275
  contact_email: Resolve/create a contact with this email and run as them
6146
6276
  anonymous: Run with an anonymous contact (mode='anonymous')
6277
+ wait: Poll until all runs finish (or wait_timeout elapses)
6278
+ wait_timeout: Max seconds to wait when wait=True (default 300)
6279
+ poll_interval: Seconds between status polls when wait=True (default 3)
6147
6280
 
6148
6281
  Returns:
6149
- Summary of runs created
6282
+ Summary of runs created (plus the final status when wait=True)
6150
6283
  """
6151
6284
  resolved_scenario_ids = list(scenario_ids or [])
6152
6285
  if not resolved_scenario_ids:
@@ -6196,6 +6329,26 @@ async def scenario_bulk_run(
6196
6329
  "contact_override": result.get("contact_override"),
6197
6330
  }
6198
6331
 
6332
+ job_id = payload.get("job_id")
6333
+ final_status: dict | None = None
6334
+ timed_out = False
6335
+ if wait and job_id:
6336
+ try:
6337
+ final_status, timed_out = await _await_bulk_run(
6338
+ client,
6339
+ str(job_id),
6340
+ timeout=wait_timeout,
6341
+ poll_interval=poll_interval,
6342
+ )
6343
+ except AppliedAPIError as e:
6344
+ return _format_error(e)
6345
+ counts = _bulk_status_counts(final_status.get("counts"))
6346
+ payload["final_counts"] = counts
6347
+ payload["timed_out"] = timed_out
6348
+ payload["duration_seconds"] = final_status.get("duration_seconds")
6349
+ payload["completed_at"] = final_status.get("completed_at")
6350
+ payload["failed"] = final_status.get("failed") or []
6351
+
6199
6352
  if output_format == "json":
6200
6353
  return to_json(payload)
6201
6354
 
@@ -6212,6 +6365,23 @@ async def scenario_bulk_run(
6212
6365
  output += f"scenario_run_ids: {preview_ids}\n"
6213
6366
  if len(run_ids) > 10:
6214
6367
  output += f"more_runs: {len(run_ids) - 10}\n"
6368
+
6369
+ if final_status is not None:
6370
+ counts = payload["final_counts"]
6371
+ output += "\n# Final Status\n"
6372
+ output += "timed_out: " + ("true (still pending)" if timed_out else "false") + "\n"
6373
+ output += f"completed: {counts.get('completed', 0)}\n"
6374
+ output += f"failed: {counts.get('failed', 0)}\n"
6375
+ pending = counts.get("queued", 0) + counts.get("running", 0)
6376
+ output += f"still_pending: {pending}\n"
6377
+ if payload.get("duration_seconds") is not None:
6378
+ output += f"duration_seconds: {payload['duration_seconds']}\n"
6379
+ failed_runs = payload.get("failed") or []
6380
+ if failed_runs:
6381
+ output += f"\n# Failed Runs ({len(failed_runs)})\n"
6382
+ output += to_json(failed_runs)
6383
+ return output
6384
+
6215
6385
  output += "\nTip: use scenario_bulk_status(job_id, include_runs=True) or scenario_run_list(bulk_job_id=job_id) to get per-run details with scenario mappings."
6216
6386
  return output
6217
6387
 
@@ -6245,14 +6415,14 @@ async def scenario_bulk_status(
6245
6415
  payload.pop("runs", None)
6246
6416
  return to_json(payload)
6247
6417
 
6248
- counts = result.get("counts") or {}
6418
+ counts = _bulk_status_counts(result.get("counts"))
6249
6419
  output = "# Bulk Run Status\n"
6250
6420
  output += f"job_id: {result.get('job_id')}\n"
6251
6421
  output += f"total: {result.get('total')}\n"
6252
- output += f"queued: {counts.get('QUEUED', 0)}\n"
6253
- output += f"running: {counts.get('RUNNING', 0)}\n"
6254
- output += f"completed: {counts.get('COMPLETED', 0)}\n"
6255
- output += f"failed: {counts.get('FAILED', 0)}\n"
6422
+ output += f"queued: {counts.get('queued', 0)}\n"
6423
+ output += f"running: {counts.get('running', 0)}\n"
6424
+ output += f"completed: {counts.get('completed', 0)}\n"
6425
+ output += f"failed: {counts.get('failed', 0)}\n"
6256
6426
  output += f"created_at: {result.get('created_at')}\n"
6257
6427
  output += f"updated_at: {result.get('updated_at')}\n"
6258
6428
  if result.get("completed_at"):
@@ -43,6 +43,7 @@ DOMAIN_TOOL_RENAMES: dict[str, dict[str, str]] = {
43
43
  "benchmark_create": "benchmarks_create",
44
44
  "benchmark_delete": "benchmarks_delete",
45
45
  "benchmark_clone": "benchmarks_clone",
46
+ "benchmark_results": "benchmarks_results",
46
47
  },
47
48
  "connectors": {
48
49
  "connector_types": "connectors_types_list",
@@ -30,6 +30,9 @@ class ScenariosBulkRunInput(StrictInput):
30
30
  contact_id: str | None = None
31
31
  contact_email: str | None = None
32
32
  anonymous: bool = False
33
+ wait: bool = False
34
+ wait_timeout: float = 300.0
35
+ poll_interval: float = 3.0
33
36
 
34
37
 
35
38
  class ScenariosBulkCancelInput(StrictInput):
@@ -68,6 +71,10 @@ class BenchmarksCloneInput(StrictInput):
68
71
  apply: bool = False
69
72
 
70
73
 
74
class BenchmarksResultsInput(StrictInput):
    # Input for the `benchmarks_results` tool.
    # ID of the benchmark to summarize; the handler forwards it unchanged as
    # `benchmark_id` to the legacy `benchmark_results` tool.
    benchmark_id: str
76
+
77
+
71
78
  class ScenariosListInput(StrictInput):
72
79
  benchmark_id: str | None = None
73
80
  agent_id: str | None = None
@@ -500,6 +507,44 @@ async def benchmarks_clone_handler(
500
507
  )
501
508
 
502
509
 
510
async def benchmarks_results_handler(
    client: AppliedClient,
    params: BenchmarksResultsInput,
) -> ToolResult[Any]:
    """Wrap the legacy ``benchmark_results`` tool in a structured ToolResult.

    The legacy tool is called with ``output_format="json"``. If its output is
    not parseable JSON, the raw text is returned as the message and summary.
    Otherwise the parsed summary becomes ``data``, with a one-line pass-rate
    summary and follow-up suggestions for unrated and failed scenarios.
    """
    from applied_cli import tools as legacy_tools

    raw = await legacy_tools.benchmark_results(
        client, benchmark_id=params.benchmark_id, output_format="json"
    )
    try:
        report = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        # Non-JSON output is passed through verbatim.
        return ToolResult(data={"message": raw}, summary=str(raw))

    rate = report.get("pass_rate")
    if rate is None:
        rate_label = "n/a (no rated yet)"
    else:
        rate_label = f"{rate * 100:.1f}%"

    # Each hint is only suggested when the matching counter is non-zero.
    follow_ups = [
        hint
        for applies, hint in (
            (
                report.get("unrated"),
                "Rate the unrated scenarios with scenarios_update (pass_status).",
            ),
            (
                report.get("failed"),
                "Inspect failing scenarios with scenarios_get / conversations_debug_bundle.",
            ),
        )
        if applies
    ]

    title = report.get("benchmark_name") or params.benchmark_id
    summary = (
        f"{title}: pass rate "
        f"{rate_label} — {report.get('passed', 0)} passed, "
        f"{report.get('failed', 0)} failed, {report.get('unrated', 0)} unrated."
    )
    return ToolResult(data=report, summary=summary, next_actions=follow_ups)
546
+
547
+
503
548
  async def benchmarks_delete_handler(
504
549
  client: AppliedClient,
505
550
  params: BenchmarksDeleteInput,
@@ -828,6 +873,48 @@ async def scenarios_bulk_run_handler(
828
873
  "duplicated_scenarios": result.get("duplicated_scenarios"),
829
874
  "contact_override": result.get("contact_override"),
830
875
  }
876
+
877
+ job_id = payload.get("job_id")
878
+ if params.wait and job_id:
879
+ from applied_cli.tools import _await_bulk_run, _bulk_status_counts
880
+
881
+ try:
882
+ final_status, timed_out = await _await_bulk_run(
883
+ client,
884
+ str(job_id),
885
+ timeout=params.wait_timeout,
886
+ poll_interval=params.poll_interval,
887
+ )
888
+ except AppliedAPIError as exc:
889
+ return _api_error_result(exc)
890
+ counts = _bulk_status_counts(final_status.get("counts"))
891
+ payload["final_counts"] = counts
892
+ payload["timed_out"] = timed_out
893
+ payload["duration_seconds"] = final_status.get("duration_seconds")
894
+ payload["failed"] = final_status.get("failed") or []
895
+ pending = counts.get("queued", 0) + counts.get("running", 0)
896
+ summary = (
897
+ f"Bulk job {job_id} "
898
+ + ("timed out with " if timed_out else "finished: ")
899
+ + f"{counts.get('completed', 0)} completed, "
900
+ + f"{counts.get('failed', 0)} failed"
901
+ + (f", {pending} still pending" if pending else "")
902
+ + "."
903
+ )
904
+ warnings = []
905
+ if counts.get("failed"):
906
+ warnings.append(f"{counts['failed']} run(s) failed.")
907
+ if timed_out:
908
+ warnings.append("Timed out before all runs finished.")
909
+ return ToolResult(
910
+ data=payload,
911
+ summary=summary,
912
+ warnings=warnings,
913
+ next_actions=[
914
+ "Use scenarios_bulk_status with include_runs=true to inspect runs.",
915
+ ],
916
+ )
917
+
831
918
  queued = payload.get("queued") or 0
832
919
  return ToolResult(
833
920
  data=payload,
@@ -961,6 +1048,19 @@ def scenario_specs() -> list[ToolSpec]:
961
1048
  read_write_mode="write",
962
1049
  tags=["benchmark_clone", "native"],
963
1050
  ),
1051
+ ToolSpec(
1052
+ name="benchmarks_results",
1053
+ namespace="benchmarks",
1054
+ description=(
1055
+ "Summarize a benchmark's pass/fail/unrated health and pass rate, "
1056
+ "with the failing and unrated scenario lists."
1057
+ ),
1058
+ input_model=BenchmarksResultsInput,
1059
+ output_model=None,
1060
+ handler=benchmarks_results_handler,
1061
+ read_write_mode="read",
1062
+ tags=["benchmark_results", "native"],
1063
+ ),
964
1064
  ToolSpec(
965
1065
  name="scenarios_list",
966
1066
  namespace="scenarios",
@@ -1064,7 +1164,8 @@ def scenario_specs() -> list[ToolSpec]:
1064
1164
  "Run selected scenarios or every scenario in a benchmark and "
1065
1165
  "return the queued job metadata. Pass contact_email or contact_id "
1066
1166
  "to run as a contact with an email (fixes 'Email is not present' "
1067
- "failures on test conversations)."
1167
+ "failures on test conversations). Pass wait=true to block until "
1168
+ "all runs finish and return the final status in one call."
1068
1169
  ),
1069
1170
  input_model=ScenariosBulkRunInput,
1070
1171
  output_model=None,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: applied-cli
3
- Version: 0.6.4
3
+ Version: 0.6.6
4
4
  Summary: CLI and shared client library for Applied Labs AI support agents
5
5
  Author: Applied Labs
6
6
  License-Expression: MIT
@@ -40,6 +40,7 @@ tests/test_audit_tools.py
40
40
  tests/test_auth_context.py
41
41
  tests/test_benchmark_clone.py
42
42
  tests/test_benchmark_delete_guardrail.py
43
+ tests/test_benchmark_results.py
43
44
  tests/test_benchmark_scenario_tools.py
44
45
  tests/test_cli.py
45
46
  tests/test_cli_v2.py
@@ -51,6 +52,7 @@ tests/test_knowledge_content_tools.py
51
52
  tests/test_recovery.py
52
53
  tests/test_scenario_bulk_cancel.py
53
54
  tests/test_scenario_bulk_run_contact.py
55
+ tests/test_scenario_bulk_run_wait.py
54
56
  tests/test_toolkit_contract.py
55
57
  tests/test_v2_agents.py
56
58
  tests/test_v2_articles.py
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "applied-cli"
3
- version = "0.6.4"
3
+ version = "0.6.6"
4
4
  description = "CLI and shared client library for Applied Labs AI support agents"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.11"
@@ -0,0 +1,78 @@
1
+ import json
2
+
3
+ import pytest
4
+
5
+ from applied_cli import tools
6
+
7
+ BENCHMARK = {"id": "bench-1", "name": "Cancel Regression"}
8
+
9
+ SCENARIOS = [
10
+ {"id": "s1", "name": "Cancel order", "pass_status": "pass"},
11
+ {"id": "s2", "name": "Refund flow", "pass_status": "pass"},
12
+ {"id": "s3", "name": "Pause subscription", "pass_status": "fail"},
13
+ {"id": "s4", "name": "Address change", "pass_status": "unrated"},
14
+ {"id": "s5", "name": "No status field"}, # missing -> unrated
15
+ ]
16
+
17
+
18
class FakeResultsClient:
    """In-memory stand-in for the API client used by ``benchmark_results``."""

    def __init__(self, scenarios=SCENARIOS):
        # Stored by reference; callers only ever receive copies.
        self._scenarios = scenarios

    async def get_benchmark(self, benchmark_id):
        # Every ID resolves to the same canned benchmark.
        return BENCHMARK

    async def list_scenarios(self, benchmark_id=None, fetch_all=True, **kwargs):
        # Hand back a fresh list so a test cannot mutate the fixture.
        return [*self._scenarios]
27
+
28
+
29
@pytest.mark.asyncio
async def test_results_tally_and_pass_rate():
    """JSON output reports the tallies, pass rate and per-status scenario lists."""
    raw = await tools.benchmark_results(
        FakeResultsClient(), "bench-1", output_format="json"
    )
    data = json.loads(raw)

    expected_counts = {
        "total_scenarios": 5,
        "passed": 2,
        "failed": 1,
        "unrated": 2,
        "rated": 3,
    }
    assert {field: data[field] for field in expected_counts} == expected_counts
    # 2 passed / 3 rated
    assert data["pass_rate"] == round(2 / 3, 4)

    failing_ids = [s["id"] for s in data["failing_scenarios"]]
    unrated_ids = {s["id"] for s in data["unrated_scenarios"]}
    assert failing_ids == ["s3"]
    assert unrated_ids == {"s4", "s5"}
44
+
45
+
46
@pytest.mark.asyncio
async def test_results_no_rated_scenarios_pass_rate_none():
    """With nothing rated, the text report shows the n/a pass-rate placeholder."""
    only_unrated = [{"id": "s1", "name": "A", "pass_status": "unrated"}]
    client = FakeResultsClient(scenarios=only_unrated)

    text = await tools.benchmark_results(client, "bench-1", output_format="text")

    assert "n/a (no rated scenarios yet)" in text
53
+
54
+
55
@pytest.mark.asyncio
async def test_results_text_lists_failing_and_unrated():
    """The default (text) report contains both section headings and the failing name."""
    text = await tools.benchmark_results(FakeResultsClient(), "bench-1")

    for expected_fragment in ("# Failing (1)", "Pause subscription", "# Unrated (2)"):
        assert expected_fragment in text
62
+
63
+
64
@pytest.mark.asyncio
async def test_v2_benchmarks_results_handler_summary():
    """The v2 handler returns parsed data, a pass-rate summary and both follow-ups."""
    from applied_cli.v2 import scenarios as v2_scenarios

    client = FakeResultsClient()
    params = v2_scenarios.BenchmarksResultsInput(benchmark_id="bench-1")

    result = await v2_scenarios.benchmarks_results_handler(client, params)

    assert result.data["passed"] == 2
    assert "pass rate" in result.summary
    # Has unrated + failing → both follow-up actions surfaced.
    assert len(result.next_actions) == 2
@@ -0,0 +1,107 @@
1
+ import json
2
+
3
+ import pytest
4
+
5
+ from applied_cli import tools
6
+
7
+
8
class FakeWaitClient:
    """Bulk client whose status transitions to done after N polls."""

    def __init__(self, status_sequence):
        # Snapshot the scripted payloads; one is served per status poll.
        self._status_sequence = list(status_sequence)
        self._poll = 0
        self.status_calls = 0

    async def list_scenarios(self, benchmark_id=None, limit=500, **kwargs):
        # Every benchmark is reported as holding the same two scenarios.
        return [{"id": "s1"}, {"id": "s2"}]

    async def bulk_run_scenarios(
        self, scenario_ids=None, target_agent_id=None, contact_override=None
    ):
        # All requested scenarios are reported as queued under a fixed job.
        requested = len(scenario_ids or [])
        return {
            "job_id": "job-1",
            "total": requested,
            "queued": requested,
            "scenario_run_ids": ["r1", "r2"],
        }

    async def get_scenario_bulk_run_status(self, job_id):
        self.status_calls += 1
        # Walk the script one step per poll, then keep repeating the last entry.
        last = len(self._status_sequence) - 1
        position = self._poll if self._poll < last else last
        self._poll += 1
        return self._status_sequence[position]
34
+
35
+
36
@pytest.mark.asyncio
async def test_wait_polls_until_no_pending(monkeypatch):
    """wait=True keeps polling until nothing is queued or running."""

    # Avoid real sleeping between polls.
    async def _instant(_delay):
        return None

    monkeypatch.setattr(tools.asyncio, "sleep", _instant)

    all_queued = {"counts": {"queued": 2, "running": 0, "completed": 0, "failed": 0}}
    half_done = {"counts": {"queued": 0, "running": 1, "completed": 1, "failed": 0}}
    finished = {
        "counts": {"queued": 0, "running": 0, "completed": 2, "failed": 0},
        "duration_seconds": 12.5,
        "completed_at": "2026-06-05T10:00:00Z",
        "failed": [],
    }
    client = FakeWaitClient(status_sequence=[all_queued, half_done, finished])

    raw = await tools.scenario_bulk_run(
        client, benchmark_id="bench-1", wait=True, output_format="json"
    )
    data = json.loads(raw)

    assert data["timed_out"] is False
    assert data["final_counts"]["completed"] == 2
    assert data["duration_seconds"] == 12.5
    assert client.status_calls == 3
64
+
65
+
66
@pytest.mark.asyncio
async def test_wait_times_out_when_runs_stay_pending(monkeypatch):
    """A job that never leaves the queue is reported as timed out."""

    async def _instant(_delay):
        return None

    monkeypatch.setattr(tools.asyncio, "sleep", _instant)

    # Always pending → must hit the timeout path.
    stuck = {"counts": {"queued": 2, "running": 0, "completed": 0, "failed": 0}}
    client = FakeWaitClient(status_sequence=[stuck])

    wait_options = {
        "wait": True,
        "wait_timeout": 0.0,  # immediate timeout after first poll
        "output_format": "json",
    }
    raw = await tools.scenario_bulk_run(client, benchmark_id="bench-1", **wait_options)
    data = json.loads(raw)

    assert data["timed_out"] is True
    assert data["final_counts"]["queued"] == 2
89
+
90
+
91
@pytest.mark.asyncio
async def test_no_wait_returns_started_summary():
    """Without wait, the job summary is returned and status is never polled."""
    client = FakeWaitClient(status_sequence=[{"counts": {}}])

    raw = await tools.scenario_bulk_run(
        client, benchmark_id="bench-1", output_format="json"
    )
    data = json.loads(raw)

    assert "final_counts" not in data
    assert client.status_calls == 0  # no polling when wait is False
100
+
101
+
102
def test_bulk_status_counts_normalizes_case_and_types():
    """Status keys are lowercased, counts coerced to int, and pending = queued + running."""
    normalized = tools._bulk_status_counts({"QUEUED": 2, "Running": "1"})
    assert normalized == {"queued": 2, "running": 1}

    pending = tools._bulk_pending_count({"QUEUED": 3, "RUNNING": 4, "COMPLETED": 9})
    assert pending == 7
File without changes
File without changes