applied-cli 0.6.5__tar.gz → 0.6.6__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. {applied_cli-0.6.5 → applied_cli-0.6.6}/PKG-INFO +1 -1
  2. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/cli.py +16 -0
  3. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/tools.py +87 -0
  4. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/domains.py +1 -0
  5. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/scenarios.py +55 -0
  6. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli.egg-info/PKG-INFO +1 -1
  7. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli.egg-info/SOURCES.txt +1 -0
  8. {applied_cli-0.6.5 → applied_cli-0.6.6}/pyproject.toml +1 -1
  9. applied_cli-0.6.6/tests/test_benchmark_results.py +78 -0
  10. {applied_cli-0.6.5 → applied_cli-0.6.6}/README.md +0 -0
  11. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/__init__.py +0 -0
  12. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/agent_scoped_flows.py +0 -0
  13. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/auth.py +0 -0
  14. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/client.py +0 -0
  15. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/conversation_lookup.py +0 -0
  16. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/conversations.py +0 -0
  17. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/credentials.py +0 -0
  18. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/flow_helpers.py +0 -0
  19. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/formatters.py +0 -0
  20. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/mcp.py +0 -0
  21. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/recovery.py +0 -0
  22. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/toolkit.py +0 -0
  23. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/__init__.py +0 -0
  24. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/agents.py +0 -0
  25. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/articles.py +0 -0
  26. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/catalog.py +0 -0
  27. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/connectors.py +0 -0
  28. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/content.py +0 -0
  29. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/conversations.py +0 -0
  30. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/flows.py +0 -0
  31. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/knowledge.py +0 -0
  32. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/manifest.py +0 -0
  33. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/products.py +0 -0
  34. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/taxonomy.py +0 -0
  35. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/tickets.py +0 -0
  36. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli.egg-info/dependency_links.txt +0 -0
  37. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli.egg-info/entry_points.txt +0 -0
  38. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli.egg-info/requires.txt +0 -0
  39. {applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli.egg-info/top_level.txt +0 -0
  40. {applied_cli-0.6.5 → applied_cli-0.6.6}/setup.cfg +0 -0
  41. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_agent_scoped_flows.py +0 -0
  42. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_audit_tools.py +0 -0
  43. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_auth_context.py +0 -0
  44. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_benchmark_clone.py +0 -0
  45. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_benchmark_delete_guardrail.py +0 -0
  46. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_benchmark_scenario_tools.py +0 -0
  47. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_cli.py +0 -0
  48. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_cli_v2.py +0 -0
  49. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_client.py +0 -0
  50. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_client_v2.py +0 -0
  51. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_conversation_tools.py +0 -0
  52. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_flow_tools.py +0 -0
  53. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_knowledge_content_tools.py +0 -0
  54. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_recovery.py +0 -0
  55. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_scenario_bulk_cancel.py +0 -0
  56. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_scenario_bulk_run_contact.py +0 -0
  57. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_scenario_bulk_run_wait.py +0 -0
  58. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_toolkit_contract.py +0 -0
  59. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_agents.py +0 -0
  60. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_articles.py +0 -0
  61. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_catalog_and_mcp.py +0 -0
  62. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_connectors.py +0 -0
  63. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_content.py +0 -0
  64. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_conversations.py +0 -0
  65. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_flows.py +0 -0
  66. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_knowledge.py +0 -0
  67. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_products.py +0 -0
  68. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_scenarios.py +0 -0
  69. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_taxonomy.py +0 -0
  70. {applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_tickets.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: applied-cli
3
- Version: 0.6.5
3
+ Version: 0.6.6
4
4
  Summary: CLI and shared client library for Applied Labs AI support agents
5
5
  Author: Applied Labs
6
6
  License-Expression: MIT
@@ -1672,6 +1672,22 @@ def benchmark_delete(
1672
1672
  typer.echo(result)
1673
1673
 
1674
1674
 
1675
+ @app.command("benchmark-results")
1676
+ def benchmark_results(
1677
+ id: str = typer.Argument(..., help="Benchmark ID"),
1678
+ shop_id: str = typer.Option(None, "--shop-id", help="Override shop ID"),
1679
+ format: str = typer.Option(
1680
+ "text", "--format", "-f", help="Output format: text or json"
1681
+ ),
1682
+ ) -> None:
1683
+ """Summarize a benchmark's pass/fail/unrated health and pass rate."""
1684
+ client = get_client(shop_id=shop_id)
1685
+ result = asyncio.run(
1686
+ tools.benchmark_results(client, benchmark_id=id, output_format=format)
1687
+ )
1688
+ typer.echo(result)
1689
+
1690
+
1675
1691
  @app.command()
1676
1692
  def scenarios(
1677
1693
  benchmark_id: str = typer.Option(
@@ -5710,6 +5710,93 @@ async def benchmark_clone(
5710
5710
  return "\n".join(lines)
5711
5711
 
5712
5712
 
5713
+ async def benchmark_results(
5714
+ client: AppliedClient,
5715
+ benchmark_id: str,
5716
+ *,
5717
+ output_format: str = "text",
5718
+ ) -> str:
5719
+ """
5720
+ Summarize a benchmark's pass/fail health.
5721
+
5722
+ Tallies the pass_status across the benchmark's scenarios (pass / fail /
5723
+ unrated), computes the pass rate among rated scenarios, and lists the failing
5724
+ and still-unrated scenarios so you know what to fix or evaluate next.
5725
+
5726
+ Args:
5727
+ client: Authenticated AppliedClient
5728
+ benchmark_id: The benchmark UUID
5729
+ output_format: 'text' (default) or 'json'
5730
+
5731
+ Returns:
5732
+ Pass-rate summary with failing and unrated scenario lists.
5733
+ """
5734
+ try:
5735
+ benchmark = await client.get_benchmark(benchmark_id)
5736
+ scenarios = await client.list_scenarios(
5737
+ benchmark_id=benchmark_id, fetch_all=True
5738
+ )
5739
+ except AppliedAPIError as e:
5740
+ return _format_error(e)
5741
+
5742
+ tally = {"pass": 0, "fail": 0, "unrated": 0}
5743
+ failing: list[dict[str, Any]] = []
5744
+ unrated: list[dict[str, Any]] = []
5745
+ for scenario in scenarios:
5746
+ status = str(scenario.get("pass_status") or "unrated").lower()
5747
+ if status not in tally:
5748
+ status = "unrated"
5749
+ tally[status] += 1
5750
+ entry = {"id": scenario.get("id"), "name": scenario.get("name")}
5751
+ if status == "fail":
5752
+ failing.append(entry)
5753
+ elif status == "unrated":
5754
+ unrated.append(entry)
5755
+
5756
+ rated = tally["pass"] + tally["fail"]
5757
+ pass_rate = round(tally["pass"] / rated, 4) if rated else None
5758
+ summary = {
5759
+ "benchmark_id": benchmark_id,
5760
+ "benchmark_name": benchmark.get("name"),
5761
+ "total_scenarios": len(scenarios),
5762
+ "passed": tally["pass"],
5763
+ "failed": tally["fail"],
5764
+ "unrated": tally["unrated"],
5765
+ "rated": rated,
5766
+ "pass_rate": pass_rate,
5767
+ "failing_scenarios": failing,
5768
+ "unrated_scenarios": unrated,
5769
+ }
5770
+
5771
+ if output_format == "json":
5772
+ return to_json(summary)
5773
+
5774
+ pass_rate_str = (
5775
+ f"{pass_rate * 100:.1f}% ({tally['pass']}/{rated} rated)"
5776
+ if pass_rate is not None
5777
+ else "n/a (no rated scenarios yet)"
5778
+ )
5779
+ lines = [
5780
+ f"# Benchmark Results: {benchmark.get('name')} ({benchmark_id})",
5781
+ f"total_scenarios: {summary['total_scenarios']}",
5782
+ f"passed: {tally['pass']}",
5783
+ f"failed: {tally['fail']}",
5784
+ f"unrated: {tally['unrated']}",
5785
+ f"pass_rate: {pass_rate_str}",
5786
+ ]
5787
+ if failing:
5788
+ lines.append(f"\n# Failing ({len(failing)})")
5789
+ lines.extend(f" - {s['name']} ({s['id']})" for s in failing[:50])
5790
+ if len(failing) > 50:
5791
+ lines.append(f" ... and {len(failing) - 50} more")
5792
+ if unrated:
5793
+ lines.append(f"\n# Unrated ({len(unrated)}) — evaluate these next")
5794
+ lines.extend(f" - {s['name']} ({s['id']})" for s in unrated[:50])
5795
+ if len(unrated) > 50:
5796
+ lines.append(f" ... and {len(unrated) - 50} more")
5797
+ return "\n".join(lines)
5798
+
5799
+
5713
5800
  # -----------------------------------------------------------------------------
5714
5801
  # Scenarios
5715
5802
  # -----------------------------------------------------------------------------
@@ -43,6 +43,7 @@ DOMAIN_TOOL_RENAMES: dict[str, dict[str, str]] = {
43
43
  "benchmark_create": "benchmarks_create",
44
44
  "benchmark_delete": "benchmarks_delete",
45
45
  "benchmark_clone": "benchmarks_clone",
46
+ "benchmark_results": "benchmarks_results",
46
47
  },
47
48
  "connectors": {
48
49
  "connector_types": "connectors_types_list",
@@ -71,6 +71,10 @@ class BenchmarksCloneInput(StrictInput):
71
71
  apply: bool = False
72
72
 
73
73
 
74
+ class BenchmarksResultsInput(StrictInput):
75
+ benchmark_id: str
76
+
77
+
74
78
  class ScenariosListInput(StrictInput):
75
79
  benchmark_id: str | None = None
76
80
  agent_id: str | None = None
@@ -503,6 +507,44 @@ async def benchmarks_clone_handler(
503
507
  )
504
508
 
505
509
 
510
+ async def benchmarks_results_handler(
511
+ client: AppliedClient,
512
+ params: BenchmarksResultsInput,
513
+ ) -> ToolResult[Any]:
514
+ from applied_cli import tools as legacy_tools
515
+
516
+ raw = await legacy_tools.benchmark_results(
517
+ client, benchmark_id=params.benchmark_id, output_format="json"
518
+ )
519
+ try:
520
+ data = json.loads(raw)
521
+ except (json.JSONDecodeError, TypeError):
522
+ return ToolResult(data={"message": raw}, summary=str(raw))
523
+
524
+ pass_rate = data.get("pass_rate")
525
+ rate_str = (
526
+ f"{pass_rate * 100:.1f}%" if pass_rate is not None else "n/a (no rated yet)"
527
+ )
528
+ next_actions = []
529
+ if data.get("unrated"):
530
+ next_actions.append(
531
+ "Rate the unrated scenarios with scenarios_update (pass_status)."
532
+ )
533
+ if data.get("failed"):
534
+ next_actions.append(
535
+ "Inspect failing scenarios with scenarios_get / conversations_debug_bundle."
536
+ )
537
+ return ToolResult(
538
+ data=data,
539
+ summary=(
540
+ f"{data.get('benchmark_name') or params.benchmark_id}: pass rate "
541
+ f"{rate_str} — {data.get('passed', 0)} passed, "
542
+ f"{data.get('failed', 0)} failed, {data.get('unrated', 0)} unrated."
543
+ ),
544
+ next_actions=next_actions,
545
+ )
546
+
547
+
506
548
  async def benchmarks_delete_handler(
507
549
  client: AppliedClient,
508
550
  params: BenchmarksDeleteInput,
@@ -1006,6 +1048,19 @@ def scenario_specs() -> list[ToolSpec]:
1006
1048
  read_write_mode="write",
1007
1049
  tags=["benchmark_clone", "native"],
1008
1050
  ),
1051
+ ToolSpec(
1052
+ name="benchmarks_results",
1053
+ namespace="benchmarks",
1054
+ description=(
1055
+ "Summarize a benchmark's pass/fail/unrated health and pass rate, "
1056
+ "with the failing and unrated scenario lists."
1057
+ ),
1058
+ input_model=BenchmarksResultsInput,
1059
+ output_model=None,
1060
+ handler=benchmarks_results_handler,
1061
+ read_write_mode="read",
1062
+ tags=["benchmark_results", "native"],
1063
+ ),
1009
1064
  ToolSpec(
1010
1065
  name="scenarios_list",
1011
1066
  namespace="scenarios",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: applied-cli
3
- Version: 0.6.5
3
+ Version: 0.6.6
4
4
  Summary: CLI and shared client library for Applied Labs AI support agents
5
5
  Author: Applied Labs
6
6
  License-Expression: MIT
@@ -40,6 +40,7 @@ tests/test_audit_tools.py
40
40
  tests/test_auth_context.py
41
41
  tests/test_benchmark_clone.py
42
42
  tests/test_benchmark_delete_guardrail.py
43
+ tests/test_benchmark_results.py
43
44
  tests/test_benchmark_scenario_tools.py
44
45
  tests/test_cli.py
45
46
  tests/test_cli_v2.py
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "applied-cli"
3
- version = "0.6.5"
3
+ version = "0.6.6"
4
4
  description = "CLI and shared client library for Applied Labs AI support agents"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.11"
@@ -0,0 +1,78 @@
1
+ import json
2
+
3
+ import pytest
4
+
5
+ from applied_cli import tools
6
+
7
+ BENCHMARK = {"id": "bench-1", "name": "Cancel Regression"}
8
+
9
+ SCENARIOS = [
10
+ {"id": "s1", "name": "Cancel order", "pass_status": "pass"},
11
+ {"id": "s2", "name": "Refund flow", "pass_status": "pass"},
12
+ {"id": "s3", "name": "Pause subscription", "pass_status": "fail"},
13
+ {"id": "s4", "name": "Address change", "pass_status": "unrated"},
14
+ {"id": "s5", "name": "No status field"}, # missing -> unrated
15
+ ]
16
+
17
+
18
+ class FakeResultsClient:
19
+ def __init__(self, scenarios=SCENARIOS):
20
+ self._scenarios = scenarios
21
+
22
+ async def get_benchmark(self, benchmark_id):
23
+ return BENCHMARK
24
+
25
+ async def list_scenarios(self, benchmark_id=None, fetch_all=True, **kwargs):
26
+ return list(self._scenarios)
27
+
28
+
29
+ @pytest.mark.asyncio
30
+ async def test_results_tally_and_pass_rate():
31
+ client = FakeResultsClient()
32
+ data = json.loads(
33
+ await tools.benchmark_results(client, "bench-1", output_format="json")
34
+ )
35
+ assert data["total_scenarios"] == 5
36
+ assert data["passed"] == 2
37
+ assert data["failed"] == 1
38
+ assert data["unrated"] == 2
39
+ assert data["rated"] == 3
40
+ # 2 passed / 3 rated
41
+ assert data["pass_rate"] == round(2 / 3, 4)
42
+ assert [s["id"] for s in data["failing_scenarios"]] == ["s3"]
43
+ assert {s["id"] for s in data["unrated_scenarios"]} == {"s4", "s5"}
44
+
45
+
46
+ @pytest.mark.asyncio
47
+ async def test_results_no_rated_scenarios_pass_rate_none():
48
+ client = FakeResultsClient(
49
+ scenarios=[{"id": "s1", "name": "A", "pass_status": "unrated"}]
50
+ )
51
+ text = await tools.benchmark_results(client, "bench-1", output_format="text")
52
+ assert "n/a (no rated scenarios yet)" in text
53
+
54
+
55
+ @pytest.mark.asyncio
56
+ async def test_results_text_lists_failing_and_unrated():
57
+ client = FakeResultsClient()
58
+ text = await tools.benchmark_results(client, "bench-1")
59
+ assert "# Failing (1)" in text
60
+ assert "Pause subscription" in text
61
+ assert "# Unrated (2)" in text
62
+
63
+
64
+ @pytest.mark.asyncio
65
+ async def test_v2_benchmarks_results_handler_summary():
66
+ from applied_cli.v2.scenarios import (
67
+ BenchmarksResultsInput,
68
+ benchmarks_results_handler,
69
+ )
70
+
71
+ client = FakeResultsClient()
72
+ result = await benchmarks_results_handler(
73
+ client, BenchmarksResultsInput(benchmark_id="bench-1")
74
+ )
75
+ assert result.data["passed"] == 2
76
+ assert "pass rate" in result.summary
77
+ # Has unrated + failing → both follow-up actions surfaced.
78
+ assert len(result.next_actions) == 2
File without changes
File without changes