PyPI - applied-cli - Versions diffs - 0.6.5__tar.gz → 0.6.6__tar.gz - Mend

applied-cli 0.6.5__tar.gz → 0.6.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (70) hide show

{applied_cli-0.6.5 → applied_cli-0.6.6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: applied-cli
-Version: 0.6.5
+Version: 0.6.6
 Summary: CLI and shared client library for Applied Labs AI support agents
 Author: Applied Labs
 License-Expression: MIT

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/cli.py RENAMED Viewed

@@ -1672,6 +1672,22 @@ def benchmark_delete(
     typer.echo(result)
+@app.command("benchmark-results")
+def benchmark_results(
+    id: str = typer.Argument(..., help="Benchmark ID"),
+    shop_id: str = typer.Option(None, "--shop-id", help="Override shop ID"),
+    format: str = typer.Option(
+        "text", "--format", "-f", help="Output format: text or json"
+    ),
+) -> None:
+    """Summarize a benchmark's pass/fail/unrated health and pass rate."""
+    client = get_client(shop_id=shop_id)
+    result = asyncio.run(
+        tools.benchmark_results(client, benchmark_id=id, output_format=format)
+    )
+    typer.echo(result)
 @app.command()
 def scenarios(
     benchmark_id: str = typer.Option(

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/tools.py RENAMED Viewed

@@ -5710,6 +5710,93 @@ async def benchmark_clone(
     return "\n".join(lines)
+async def benchmark_results(
+    client: AppliedClient,
+    benchmark_id: str,
+    *,
+    output_format: str = "text",
+) -> str:
+    """
+    Summarize a benchmark's pass/fail health.
+    Tallies the pass_status across the benchmark's scenarios (pass / fail /
+    unrated), computes the pass rate among rated scenarios, and lists the failing
+    and still-unrated scenarios so you know what to fix or evaluate next.
+    Args:
+        client: Authenticated AppliedClient
+        benchmark_id: The benchmark UUID
+        output_format: 'text' (default) or 'json'
+    Returns:
+        Pass-rate summary with failing and unrated scenario lists.
+    """
+    try:
+        benchmark = await client.get_benchmark(benchmark_id)
+        scenarios = await client.list_scenarios(
+            benchmark_id=benchmark_id, fetch_all=True
+        )
+    except AppliedAPIError as e:
+        return _format_error(e)
+    tally = {"pass": 0, "fail": 0, "unrated": 0}
+    failing: list[dict[str, Any]] = []
+    unrated: list[dict[str, Any]] = []
+    for scenario in scenarios:
+        status = str(scenario.get("pass_status") or "unrated").lower()
+        if status not in tally:
+            status = "unrated"
+        tally[status] += 1
+        entry = {"id": scenario.get("id"), "name": scenario.get("name")}
+        if status == "fail":
+            failing.append(entry)
+        elif status == "unrated":
+            unrated.append(entry)
+    rated = tally["pass"] + tally["fail"]
+    pass_rate = round(tally["pass"] / rated, 4) if rated else None
+    summary = {
+        "benchmark_id": benchmark_id,
+        "benchmark_name": benchmark.get("name"),
+        "total_scenarios": len(scenarios),
+        "passed": tally["pass"],
+        "failed": tally["fail"],
+        "unrated": tally["unrated"],
+        "rated": rated,
+        "pass_rate": pass_rate,
+        "failing_scenarios": failing,
+        "unrated_scenarios": unrated,
+    }
+    if output_format == "json":
+        return to_json(summary)
+    pass_rate_str = (
+        f"{pass_rate * 100:.1f}% ({tally['pass']}/{rated} rated)"
+        if pass_rate is not None
+        else "n/a (no rated scenarios yet)"
+    )
+    lines = [
+        f"# Benchmark Results: {benchmark.get('name')} ({benchmark_id})",
+        f"total_scenarios: {summary['total_scenarios']}",
+        f"passed: {tally['pass']}",
+        f"failed: {tally['fail']}",
+        f"unrated: {tally['unrated']}",
+        f"pass_rate: {pass_rate_str}",
+    ]
+    if failing:
+        lines.append(f"\n# Failing ({len(failing)})")
+        lines.extend(f"  - {s['name']} ({s['id']})" for s in failing[:50])
+        if len(failing) > 50:
+            lines.append(f"  ... and {len(failing) - 50} more")
+    if unrated:
+        lines.append(f"\n# Unrated ({len(unrated)}) — evaluate these next")
+        lines.extend(f"  - {s['name']} ({s['id']})" for s in unrated[:50])
+        if len(unrated) > 50:
+            lines.append(f"  ... and {len(unrated) - 50} more")
+    return "\n".join(lines)
 # -----------------------------------------------------------------------------
 # Scenarios
 # -----------------------------------------------------------------------------

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/domains.py RENAMED Viewed

@@ -43,6 +43,7 @@ DOMAIN_TOOL_RENAMES: dict[str, dict[str, str]] = {
         "benchmark_create": "benchmarks_create",
         "benchmark_delete": "benchmarks_delete",
         "benchmark_clone": "benchmarks_clone",
+        "benchmark_results": "benchmarks_results",
     },
     "connectors": {
         "connector_types": "connectors_types_list",

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/scenarios.py RENAMED Viewed

@@ -71,6 +71,10 @@ class BenchmarksCloneInput(StrictInput):
     apply: bool = False
+class BenchmarksResultsInput(StrictInput):
+    benchmark_id: str
 class ScenariosListInput(StrictInput):
     benchmark_id: str | None = None
     agent_id: str | None = None
@@ -503,6 +507,44 @@ async def benchmarks_clone_handler(
     )
+async def benchmarks_results_handler(
+    client: AppliedClient,
+    params: BenchmarksResultsInput,
+) -> ToolResult[Any]:
+    from applied_cli import tools as legacy_tools
+    raw = await legacy_tools.benchmark_results(
+        client, benchmark_id=params.benchmark_id, output_format="json"
+    )
+    try:
+        data = json.loads(raw)
+    except (json.JSONDecodeError, TypeError):
+        return ToolResult(data={"message": raw}, summary=str(raw))
+    pass_rate = data.get("pass_rate")
+    rate_str = (
+        f"{pass_rate * 100:.1f}%" if pass_rate is not None else "n/a (no rated yet)"
+    )
+    next_actions = []
+    if data.get("unrated"):
+        next_actions.append(
+            "Rate the unrated scenarios with scenarios_update (pass_status)."
+        )
+    if data.get("failed"):
+        next_actions.append(
+            "Inspect failing scenarios with scenarios_get / conversations_debug_bundle."
+        )
+    return ToolResult(
+        data=data,
+        summary=(
+            f"{data.get('benchmark_name') or params.benchmark_id}: pass rate "
+            f"{rate_str} — {data.get('passed', 0)} passed, "
+            f"{data.get('failed', 0)} failed, {data.get('unrated', 0)} unrated."
+        ),
+        next_actions=next_actions,
+    )
 async def benchmarks_delete_handler(
     client: AppliedClient,
     params: BenchmarksDeleteInput,
@@ -1006,6 +1048,19 @@ def scenario_specs() -> list[ToolSpec]:
             read_write_mode="write",
             tags=["benchmark_clone", "native"],
         ),
+        ToolSpec(
+            name="benchmarks_results",
+            namespace="benchmarks",
+            description=(
+                "Summarize a benchmark's pass/fail/unrated health and pass rate, "
+                "with the failing and unrated scenario lists."
+            ),
+            input_model=BenchmarksResultsInput,
+            output_model=None,
+            handler=benchmarks_results_handler,
+            read_write_mode="read",
+            tags=["benchmark_results", "native"],
+        ),
         ToolSpec(
             name="scenarios_list",
             namespace="scenarios",

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: applied-cli
-Version: 0.6.5
+Version: 0.6.6
 Summary: CLI and shared client library for Applied Labs AI support agents
 Author: Applied Labs
 License-Expression: MIT

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli.egg-info/SOURCES.txt RENAMED Viewed

@@ -40,6 +40,7 @@ tests/test_audit_tools.py
 tests/test_auth_context.py
 tests/test_benchmark_clone.py
 tests/test_benchmark_delete_guardrail.py
+tests/test_benchmark_results.py
 tests/test_benchmark_scenario_tools.py
 tests/test_cli.py
 tests/test_cli_v2.py

{applied_cli-0.6.5 → applied_cli-0.6.6}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "applied-cli"
-version = "0.6.5"
+version = "0.6.6"
 description = "CLI and shared client library for Applied Labs AI support agents"
 readme = "README.md"
 requires-python = ">=3.11"

applied_cli-0.6.6/tests/test_benchmark_results.py ADDED Viewed

@@ -0,0 +1,78 @@
+import json
+import pytest
+from applied_cli import tools
+BENCHMARK = {"id": "bench-1", "name": "Cancel Regression"}
+SCENARIOS = [
+    {"id": "s1", "name": "Cancel order", "pass_status": "pass"},
+    {"id": "s2", "name": "Refund flow", "pass_status": "pass"},
+    {"id": "s3", "name": "Pause subscription", "pass_status": "fail"},
+    {"id": "s4", "name": "Address change", "pass_status": "unrated"},
+    {"id": "s5", "name": "No status field"},  # missing -> unrated
+]
+class FakeResultsClient:
+    def __init__(self, scenarios=SCENARIOS):
+        self._scenarios = scenarios
+    async def get_benchmark(self, benchmark_id):
+        return BENCHMARK
+    async def list_scenarios(self, benchmark_id=None, fetch_all=True, **kwargs):
+        return list(self._scenarios)
+@pytest.mark.asyncio
+async def test_results_tally_and_pass_rate():
+    client = FakeResultsClient()
+    data = json.loads(
+        await tools.benchmark_results(client, "bench-1", output_format="json")
+    )
+    assert data["total_scenarios"] == 5
+    assert data["passed"] == 2
+    assert data["failed"] == 1
+    assert data["unrated"] == 2
+    assert data["rated"] == 3
+    # 2 passed / 3 rated
+    assert data["pass_rate"] == round(2 / 3, 4)
+    assert [s["id"] for s in data["failing_scenarios"]] == ["s3"]
+    assert {s["id"] for s in data["unrated_scenarios"]} == {"s4", "s5"}
+@pytest.mark.asyncio
+async def test_results_no_rated_scenarios_pass_rate_none():
+    client = FakeResultsClient(
+        scenarios=[{"id": "s1", "name": "A", "pass_status": "unrated"}]
+    )
+    text = await tools.benchmark_results(client, "bench-1", output_format="text")
+    assert "n/a (no rated scenarios yet)" in text
+@pytest.mark.asyncio
+async def test_results_text_lists_failing_and_unrated():
+    client = FakeResultsClient()
+    text = await tools.benchmark_results(client, "bench-1")
+    assert "# Failing (1)" in text
+    assert "Pause subscription" in text
+    assert "# Unrated (2)" in text
+@pytest.mark.asyncio
+async def test_v2_benchmarks_results_handler_summary():
+    from applied_cli.v2.scenarios import (
+        BenchmarksResultsInput,
+        benchmarks_results_handler,
+    )
+    client = FakeResultsClient()
+    result = await benchmarks_results_handler(
+        client, BenchmarksResultsInput(benchmark_id="bench-1")
+    )
+    assert result.data["passed"] == 2
+    assert "pass rate" in result.summary
+    # Has unrated + failing → both follow-up actions surfaced.
+    assert len(result.next_actions) == 2

{applied_cli-0.6.5 → applied_cli-0.6.6}/README.md RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/__init__.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/agent_scoped_flows.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/auth.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/client.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/conversation_lookup.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/conversations.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/credentials.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/flow_helpers.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/formatters.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/mcp.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/recovery.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/toolkit.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/__init__.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/agents.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/articles.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/catalog.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/connectors.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/content.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/conversations.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/flows.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/knowledge.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/manifest.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/products.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/taxonomy.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli/v2/tickets.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli.egg-info/entry_points.txt RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli.egg-info/requires.txt RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/applied_cli.egg-info/top_level.txt RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/setup.cfg RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_agent_scoped_flows.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_audit_tools.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_auth_context.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_benchmark_clone.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_benchmark_delete_guardrail.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_benchmark_scenario_tools.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_cli.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_cli_v2.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_client.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_client_v2.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_conversation_tools.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_flow_tools.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_knowledge_content_tools.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_recovery.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_scenario_bulk_cancel.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_scenario_bulk_run_contact.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_scenario_bulk_run_wait.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_toolkit_contract.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_agents.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_articles.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_catalog_and_mcp.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_connectors.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_content.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_conversations.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_flows.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_knowledge.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_products.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_scenarios.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_taxonomy.py RENAMED Viewed

File without changes

{applied_cli-0.6.5 → applied_cli-0.6.6}/tests/test_v2_tickets.py RENAMED Viewed

File without changes

applied-cli 0.6.5__tar.gz → 0.6.6__tar.gz

applied-cli 0.6.5__tar.gz → 0.6.6__tar.gz