PyPI - applied-cli - Versions diffs - 0.6.7__tar.gz → 0.6.8__tar.gz - Mend

applied-cli 0.6.7__tar.gz → 0.6.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (71) hide show

{applied_cli-0.6.7 → applied_cli-0.6.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: applied-cli
-Version: 0.6.7
+Version: 0.6.8
 Summary: CLI and shared client library for Applied Labs AI support agents
 Author: Applied Labs
 License-Expression: MIT

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/cli.py RENAMED Viewed

@@ -1540,15 +1540,26 @@ def send_message_cmd(
 @app.command()
 def benchmarks(
     agent_id: str = typer.Option(None, "--agent-id", help="Filter by agent ID"),
+    with_results: bool = typer.Option(
+        False,
+        "--with-results",
+        help="Include each benchmark's pass/fail/unrated tally and pass rate "
+        "(one scenario fetch per benchmark) — a go/no-go portfolio view",
+    ),
     shop_id: str = typer.Option(None, "--shop-id", help="Override shop ID"),
     format: str = typer.Option(
         "csv", "--format", "-f", help="Output format: csv or json"
     ),
 ) -> None:
-    """List benchmarks."""
+    """List benchmarks (optionally with per-benchmark pass rates via --with-results)."""
     client = get_client(shop_id=shop_id)
     result = asyncio.run(
-        tools.benchmark_list(client, agent_id=agent_id, output_format=format)
+        tools.benchmark_list(
+            client,
+            agent_id=agent_id,
+            output_format=format,
+            with_results=with_results,
+        )
     )
     typer.echo(result)

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/tools.py RENAMED Viewed

@@ -5306,6 +5306,7 @@ async def benchmark_list(
     client: AppliedClient,
     agent_id: str | None = None,
     output_format: str = "csv",
+    with_results: bool = False,
 ) -> str:
     """
     List conversation benchmarks.
@@ -5314,26 +5315,45 @@ async def benchmark_list(
         client: Authenticated AppliedClient
         agent_id: Optional - filter by agent UUID
         output_format: 'csv' or 'json'
+        with_results: Also compute each benchmark's pass/fail/unrated tally and
+            pass rate (one extra scenario fetch per benchmark) — a go/no-go
+            portfolio view across all benchmarks
     Returns:
-        List of benchmarks with id, name, agent, scenario count
+        List of benchmarks with id, name, agent, scenario count (and pass-rate
+        columns when with_results is set)
     """
     benchmarks = await client.list_benchmarks(agent_id=agent_id)
-    mapped = [
-        {
+    mapped = []
+    for b in benchmarks:
+        row = {
             "id": b.get("id"),
             "name": b.get("name"),
             "agent_name": b.get("agent", {}).get("name", ""),
             "scenario_count": b.get("scenario_count", 0),
             "description": str(b.get("description", ""))[:80],
         }
-        for b in benchmarks
-    ]
+        if with_results:
+            scenarios = await client.list_scenarios(
+                benchmark_id=b.get("id"), fetch_all=True
+            )
+            tally = _pass_status_tally(scenarios)
+            pass_rate = tally["pass_rate"]
+            row["passed"] = tally["passed"]
+            row["failed"] = tally["failed"]
+            row["unrated"] = tally["unrated"]
+            row["pass_rate"] = (
+                f"{pass_rate * 100:.1f}%" if pass_rate is not None else "n/a"
+            )
+        mapped.append(row)
+    columns = ["id", "name", "agent_name", "scenario_count"]
+    if with_results:
+        columns += ["passed", "failed", "unrated", "pass_rate"]
+    columns.append("description")
     if output_format == "csv":
-        return to_csv(
-            mapped, ["id", "name", "agent_name", "scenario_count", "description"]
-        )
+        return to_csv(mapped, columns)
     return to_json(mapped)
@@ -5710,6 +5730,40 @@ async def benchmark_clone(
     return "\n".join(lines)
+def _pass_status_tally(scenarios: list[dict]) -> dict[str, Any]:
+    """Tally scenarios by pass_status and compute the pass rate among rated.
+    Scenario pass_status from the API is the *effective* value (the latest run's
+    pass_status when present, else the scenario's own), so this reflects the most
+    recent run per scenario.
+    """
+    tally = {"pass": 0, "fail": 0, "unrated": 0}
+    failing: list[dict[str, Any]] = []
+    unrated: list[dict[str, Any]] = []
+    for scenario in scenarios:
+        status = str(scenario.get("pass_status") or "unrated").lower()
+        if status not in tally:
+            status = "unrated"
+        tally[status] += 1
+        entry = {"id": scenario.get("id"), "name": scenario.get("name")}
+        if status == "fail":
+            failing.append(entry)
+        elif status == "unrated":
+            unrated.append(entry)
+    rated = tally["pass"] + tally["fail"]
+    return {
+        "total": len(scenarios),
+        "passed": tally["pass"],
+        "failed": tally["fail"],
+        "unrated": tally["unrated"],
+        "rated": rated,
+        "pass_rate": round(tally["pass"] / rated, 4) if rated else None,
+        "failing_scenarios": failing,
+        "unrated_scenarios": unrated,
+    }
 async def benchmark_results(
     client: AppliedClient,
     benchmark_id: str,
@@ -5739,30 +5793,18 @@ async def benchmark_results(
     except AppliedAPIError as e:
         return _format_error(e)
-    tally = {"pass": 0, "fail": 0, "unrated": 0}
-    failing: list[dict[str, Any]] = []
-    unrated: list[dict[str, Any]] = []
-    for scenario in scenarios:
-        status = str(scenario.get("pass_status") or "unrated").lower()
-        if status not in tally:
-            status = "unrated"
-        tally[status] += 1
-        entry = {"id": scenario.get("id"), "name": scenario.get("name")}
-        if status == "fail":
-            failing.append(entry)
-        elif status == "unrated":
-            unrated.append(entry)
-    rated = tally["pass"] + tally["fail"]
-    pass_rate = round(tally["pass"] / rated, 4) if rated else None
+    t = _pass_status_tally(scenarios)
+    failing = t["failing_scenarios"]
+    unrated = t["unrated_scenarios"]
+    pass_rate = t["pass_rate"]
     summary = {
         "benchmark_id": benchmark_id,
         "benchmark_name": benchmark.get("name"),
-        "total_scenarios": len(scenarios),
-        "passed": tally["pass"],
-        "failed": tally["fail"],
-        "unrated": tally["unrated"],
-        "rated": rated,
+        "total_scenarios": t["total"],
+        "passed": t["passed"],
+        "failed": t["failed"],
+        "unrated": t["unrated"],
+        "rated": t["rated"],
         "pass_rate": pass_rate,
         "failing_scenarios": failing,
         "unrated_scenarios": unrated,
@@ -5772,16 +5814,16 @@ async def benchmark_results(
         return to_json(summary)
     pass_rate_str = (
-        f"{pass_rate * 100:.1f}% ({tally['pass']}/{rated} rated)"
+        f"{pass_rate * 100:.1f}% ({t['passed']}/{t['rated']} rated)"
         if pass_rate is not None
         else "n/a (no rated scenarios yet)"
     )
     lines = [
         f"# Benchmark Results: {benchmark.get('name')} ({benchmark_id})",
         f"total_scenarios: {summary['total_scenarios']}",
-        f"passed: {tally['pass']}",
-        f"failed: {tally['fail']}",
-        f"unrated: {tally['unrated']}",
+        f"passed: {t['passed']}",
+        f"failed: {t['failed']}",
+        f"unrated: {t['unrated']}",
         f"pass_rate: {pass_rate_str}",
     ]
     if failing:

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/v2/scenarios.py RENAMED Viewed

@@ -43,6 +43,7 @@ class ScenariosBulkCancelInput(StrictInput):
 class BenchmarksListInput(StrictInput):
     agent_id: str | None = None
     limit: int = 50
+    with_results: bool = False
 class BenchmarksGetInput(StrictInput):
@@ -395,10 +396,26 @@ async def benchmarks_list_handler(
             agent_id=params.agent_id,
             limit=params.limit,
         )
+        payload = []
+        for benchmark in benchmarks:
+            row = _project_benchmark_payload(benchmark)
+            if params.with_results:
+                from applied_cli.tools import _pass_status_tally
+                scenarios = await client.list_scenarios(
+                    benchmark_id=benchmark.get("id"), fetch_all=True
+                )
+                tally = _pass_status_tally(scenarios)
+                row["results"] = {
+                    "passed": tally["passed"],
+                    "failed": tally["failed"],
+                    "unrated": tally["unrated"],
+                    "pass_rate": tally["pass_rate"],
+                }
+            payload.append(row)
     except AppliedAPIError as exc:
         return _api_error_result(exc)
-    payload = [_project_benchmark_payload(benchmark) for benchmark in benchmarks]
     return ToolResult(
         data=payload,
         summary=_count_summary(len(payload), "benchmark"),
@@ -991,7 +1008,11 @@ def scenario_specs() -> list[ToolSpec]:
         ToolSpec(
             name="benchmarks_list",
             namespace="benchmarks",
-            description="List conversation benchmarks as structured rows.",
+            description=(
+                "List conversation benchmarks as structured rows. Set "
+                "with_results=true for each benchmark's pass/fail/unrated tally "
+                "and pass rate (a go/no-go portfolio view)."
+            ),
             input_model=BenchmarksListInput,
             output_model=None,
             handler=benchmarks_list_handler,

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: applied-cli
-Version: 0.6.7
+Version: 0.6.8
 Summary: CLI and shared client library for Applied Labs AI support agents
 Author: Applied Labs
 License-Expression: MIT

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli.egg-info/SOURCES.txt RENAMED Viewed

@@ -40,6 +40,7 @@ tests/test_audit_tools.py
 tests/test_auth_context.py
 tests/test_benchmark_clone.py
 tests/test_benchmark_delete_guardrail.py
+tests/test_benchmark_list_with_results.py
 tests/test_benchmark_results.py
 tests/test_benchmark_scenario_tools.py
 tests/test_cli.py

{applied_cli-0.6.7 → applied_cli-0.6.8}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "applied-cli"
-version = "0.6.7"
+version = "0.6.8"
 description = "CLI and shared client library for Applied Labs AI support agents"
 readme = "README.md"
 requires-python = ">=3.11"

applied_cli-0.6.8/tests/test_benchmark_list_with_results.py ADDED Viewed

@@ -0,0 +1,104 @@
+import json
+import pytest
+from applied_cli import tools
+BENCHMARKS = [
+    {"id": "b1", "name": "Cancel", "agent": {"name": "August"}, "scenario_count": 3},
+    {"id": "b2", "name": "Refund", "agent": {"name": "August"}, "scenario_count": 1},
+]
+SCENARIOS_BY_BENCHMARK = {
+    "b1": [
+        {"id": "s1", "name": "a", "pass_status": "pass"},
+        {"id": "s2", "name": "b", "pass_status": "fail"},
+        {"id": "s3", "name": "c", "pass_status": "unrated"},
+    ],
+    "b2": [{"id": "s4", "name": "d", "pass_status": "pass"}],
+}
+class FakeListClient:
+    def __init__(self):
+        self.list_scenarios_calls = 0
+    async def list_benchmarks(self, agent_id=None, limit=50):
+        return list(BENCHMARKS)
+    async def list_scenarios(self, benchmark_id=None, fetch_all=True, **kwargs):
+        self.list_scenarios_calls += 1
+        return list(SCENARIOS_BY_BENCHMARK.get(benchmark_id, []))
+@pytest.mark.asyncio
+async def test_list_without_results_does_not_fetch_scenarios():
+    client = FakeListClient()
+    out = await tools.benchmark_list(client, output_format="json")
+    rows = json.loads(out)
+    assert client.list_scenarios_calls == 0
+    assert "pass_rate" not in rows[0]
+@pytest.mark.asyncio
+async def test_list_with_results_adds_pass_rate_per_benchmark():
+    client = FakeListClient()
+    out = await tools.benchmark_list(
+        client, output_format="json", with_results=True
+    )
+    rows = {r["id"]: r for r in json.loads(out)}
+    # One scenario fetch per benchmark.
+    assert client.list_scenarios_calls == 2
+    # b1: 1 pass / 2 rated = 50%
+    assert rows["b1"]["passed"] == 1
+    assert rows["b1"]["failed"] == 1
+    assert rows["b1"]["unrated"] == 1
+    assert rows["b1"]["pass_rate"] == "50.0%"
+    # b2: 1 pass / 1 rated = 100%
+    assert rows["b2"]["pass_rate"] == "100.0%"
+@pytest.mark.asyncio
+async def test_list_with_results_csv_has_columns():
+    client = FakeListClient()
+    out = await tools.benchmark_list(client, output_format="csv", with_results=True)
+    header = out.splitlines()[0]
+    for col in ("passed", "failed", "unrated", "pass_rate"):
+        assert col in header
+def test_pass_status_tally_pure():
+    tally = tools._pass_status_tally(
+        [
+            {"id": "1", "pass_status": "pass"},
+            {"id": "2", "pass_status": "PASS"},  # case-insensitive
+            {"id": "3", "pass_status": "fail"},
+            {"id": "4"},  # missing -> unrated
+        ]
+    )
+    assert tally["passed"] == 2
+    assert tally["failed"] == 1
+    assert tally["unrated"] == 1
+    assert tally["rated"] == 3
+    assert tally["pass_rate"] == round(2 / 3, 4)
+def test_pass_status_tally_no_rated():
+    tally = tools._pass_status_tally([{"id": "1", "pass_status": "unrated"}])
+    assert tally["pass_rate"] is None
+@pytest.mark.asyncio
+async def test_v2_benchmarks_list_with_results():
+    from applied_cli.v2.scenarios import (
+        BenchmarksListInput,
+        benchmarks_list_handler,
+    )
+    client = FakeListClient()
+    result = await benchmarks_list_handler(
+        client, BenchmarksListInput(with_results=True)
+    )
+    by_id = {r["id"]: r for r in result.data}
+    assert by_id["b1"]["results"]["passed"] == 1
+    assert by_id["b2"]["results"]["pass_rate"] == 1.0

{applied_cli-0.6.7 → applied_cli-0.6.8}/README.md RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/__init__.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/agent_scoped_flows.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/auth.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/client.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/conversation_lookup.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/conversations.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/credentials.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/flow_helpers.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/formatters.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/mcp.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/recovery.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/toolkit.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/v2/__init__.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/v2/agents.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/v2/articles.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/v2/catalog.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/v2/connectors.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/v2/content.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/v2/conversations.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/v2/domains.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/v2/flows.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/v2/knowledge.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/v2/manifest.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/v2/products.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/v2/taxonomy.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli/v2/tickets.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli.egg-info/entry_points.txt RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli.egg-info/requires.txt RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/applied_cli.egg-info/top_level.txt RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/setup.cfg RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_agent_scoped_flows.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_audit_tools.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_auth_context.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_benchmark_clone.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_benchmark_delete_guardrail.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_benchmark_results.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_benchmark_scenario_tools.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_cli.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_cli_v2.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_client.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_client_v2.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_conversation_tools.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_flow_tools.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_knowledge_content_tools.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_recovery.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_scenario_bulk_cancel.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_scenario_bulk_run_contact.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_scenario_bulk_run_wait.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_toolkit_contract.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_v2_agents.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_v2_articles.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_v2_catalog_and_mcp.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_v2_connectors.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_v2_content.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_v2_conversations.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_v2_flows.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_v2_knowledge.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_v2_products.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_v2_scenarios.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_v2_taxonomy.py RENAMED Viewed

File without changes

{applied_cli-0.6.7 → applied_cli-0.6.8}/tests/test_v2_tickets.py RENAMED Viewed

File without changes

applied-cli 0.6.7__tar.gz → 0.6.8__tar.gz

applied-cli 0.6.7__tar.gz → 0.6.8__tar.gz