applied-cli 0.6.7__tar.gz → 0.6.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. {applied_cli-0.6.7 → applied_cli-0.6.9}/PKG-INFO +1 -1
  2. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/cli.py +53 -2
  3. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/tools.py +166 -33
  4. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/v2/domains.py +1 -0
  5. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/v2/scenarios.py +86 -2
  6. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli.egg-info/PKG-INFO +1 -1
  7. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli.egg-info/SOURCES.txt +2 -0
  8. {applied_cli-0.6.7 → applied_cli-0.6.9}/pyproject.toml +1 -1
  9. applied_cli-0.6.9/tests/test_benchmark_list_with_results.py +104 -0
  10. applied_cli-0.6.9/tests/test_scenario_create_bulk.py +112 -0
  11. {applied_cli-0.6.7 → applied_cli-0.6.9}/README.md +0 -0
  12. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/__init__.py +0 -0
  13. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/agent_scoped_flows.py +0 -0
  14. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/auth.py +0 -0
  15. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/client.py +0 -0
  16. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/conversation_lookup.py +0 -0
  17. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/conversations.py +0 -0
  18. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/credentials.py +0 -0
  19. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/flow_helpers.py +0 -0
  20. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/formatters.py +0 -0
  21. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/mcp.py +0 -0
  22. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/recovery.py +0 -0
  23. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/toolkit.py +0 -0
  24. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/v2/__init__.py +0 -0
  25. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/v2/agents.py +0 -0
  26. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/v2/articles.py +0 -0
  27. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/v2/catalog.py +0 -0
  28. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/v2/connectors.py +0 -0
  29. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/v2/content.py +0 -0
  30. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/v2/conversations.py +0 -0
  31. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/v2/flows.py +0 -0
  32. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/v2/knowledge.py +0 -0
  33. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/v2/manifest.py +0 -0
  34. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/v2/products.py +0 -0
  35. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/v2/taxonomy.py +0 -0
  36. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli/v2/tickets.py +0 -0
  37. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli.egg-info/dependency_links.txt +0 -0
  38. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli.egg-info/entry_points.txt +0 -0
  39. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli.egg-info/requires.txt +0 -0
  40. {applied_cli-0.6.7 → applied_cli-0.6.9}/applied_cli.egg-info/top_level.txt +0 -0
  41. {applied_cli-0.6.7 → applied_cli-0.6.9}/setup.cfg +0 -0
  42. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_agent_scoped_flows.py +0 -0
  43. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_audit_tools.py +0 -0
  44. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_auth_context.py +0 -0
  45. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_benchmark_clone.py +0 -0
  46. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_benchmark_delete_guardrail.py +0 -0
  47. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_benchmark_results.py +0 -0
  48. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_benchmark_scenario_tools.py +0 -0
  49. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_cli.py +0 -0
  50. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_cli_v2.py +0 -0
  51. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_client.py +0 -0
  52. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_client_v2.py +0 -0
  53. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_conversation_tools.py +0 -0
  54. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_flow_tools.py +0 -0
  55. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_knowledge_content_tools.py +0 -0
  56. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_recovery.py +0 -0
  57. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_scenario_bulk_cancel.py +0 -0
  58. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_scenario_bulk_run_contact.py +0 -0
  59. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_scenario_bulk_run_wait.py +0 -0
  60. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_toolkit_contract.py +0 -0
  61. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_v2_agents.py +0 -0
  62. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_v2_articles.py +0 -0
  63. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_v2_catalog_and_mcp.py +0 -0
  64. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_v2_connectors.py +0 -0
  65. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_v2_content.py +0 -0
  66. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_v2_conversations.py +0 -0
  67. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_v2_flows.py +0 -0
  68. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_v2_knowledge.py +0 -0
  69. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_v2_products.py +0 -0
  70. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_v2_scenarios.py +0 -0
  71. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_v2_taxonomy.py +0 -0
  72. {applied_cli-0.6.7 → applied_cli-0.6.9}/tests/test_v2_tickets.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: applied-cli
3
- Version: 0.6.7
3
+ Version: 0.6.9
4
4
  Summary: CLI and shared client library for Applied Labs AI support agents
5
5
  Author: Applied Labs
6
6
  License-Expression: MIT
@@ -1540,15 +1540,26 @@ def send_message_cmd(
1540
1540
  @app.command()
1541
1541
  def benchmarks(
1542
1542
  agent_id: str = typer.Option(None, "--agent-id", help="Filter by agent ID"),
1543
+ with_results: bool = typer.Option(
1544
+ False,
1545
+ "--with-results",
1546
+ help="Include each benchmark's pass/fail/unrated tally and pass rate "
1547
+ "(one scenario fetch per benchmark) — a go/no-go portfolio view",
1548
+ ),
1543
1549
  shop_id: str = typer.Option(None, "--shop-id", help="Override shop ID"),
1544
1550
  format: str = typer.Option(
1545
1551
  "csv", "--format", "-f", help="Output format: csv or json"
1546
1552
  ),
1547
1553
  ) -> None:
1548
- """List benchmarks."""
1554
+ """List benchmarks (optionally with per-benchmark pass rates via --with-results)."""
1549
1555
  client = get_client(shop_id=shop_id)
1550
1556
  result = asyncio.run(
1551
- tools.benchmark_list(client, agent_id=agent_id, output_format=format)
1557
+ tools.benchmark_list(
1558
+ client,
1559
+ agent_id=agent_id,
1560
+ output_format=format,
1561
+ with_results=with_results,
1562
+ )
1552
1563
  )
1553
1564
  typer.echo(result)
1554
1565
 
@@ -1774,6 +1785,46 @@ def scenario_create(
1774
1785
  typer.echo(result)
1775
1786
 
1776
1787
 
1788
+ @app.command("scenario-create-bulk")
1789
+ def scenario_create_bulk(
1790
+ conversation_ids: str = typer.Option(
1791
+ ..., "--conversation-ids", help="Comma-separated source conversation IDs"
1792
+ ),
1793
+ benchmark_id: str = typer.Option(
1794
+ None, "--benchmark-id", help="Attach the scenarios to an existing benchmark"
1795
+ ),
1796
+ benchmark_name: str = typer.Option(
1797
+ None, "--benchmark-name", help="Create or reuse a benchmark by name"
1798
+ ),
1799
+ agent_id: str = typer.Option(
1800
+ None, "--agent-id", help="Required when --benchmark-name is used"
1801
+ ),
1802
+ name_prefix: str = typer.Option(
1803
+ None,
1804
+ "--name-prefix",
1805
+ help="Name scenarios '<prefix> N' instead of deriving from conversation titles",
1806
+ ),
1807
+ shop_id: str = typer.Option(None, "--shop-id", help="Override shop ID"),
1808
+ format: str = typer.Option(
1809
+ "text", "--format", "-f", help="Output format: text or json"
1810
+ ),
1811
+ ) -> None:
1812
+ """Create scenarios from several conversations at once (build a suite fast)."""
1813
+ client = get_client(shop_id=shop_id)
1814
+ result = asyncio.run(
1815
+ tools.scenario_create_bulk(
1816
+ client,
1817
+ conversation_ids=_parse_csv_option(conversation_ids),
1818
+ benchmark_id=benchmark_id,
1819
+ benchmark_name=benchmark_name,
1820
+ agent_id=agent_id,
1821
+ name_prefix=name_prefix,
1822
+ output_format=format,
1823
+ )
1824
+ )
1825
+ typer.echo(result)
1826
+
1827
+
1777
1828
  @app.command("scenario-update")
1778
1829
  def scenario_update_cmd(
1779
1830
  id: str = typer.Argument(..., help="Scenario ID"),
@@ -5306,6 +5306,7 @@ async def benchmark_list(
5306
5306
  client: AppliedClient,
5307
5307
  agent_id: str | None = None,
5308
5308
  output_format: str = "csv",
5309
+ with_results: bool = False,
5309
5310
  ) -> str:
5310
5311
  """
5311
5312
  List conversation benchmarks.
@@ -5314,26 +5315,45 @@ async def benchmark_list(
5314
5315
  client: Authenticated AppliedClient
5315
5316
  agent_id: Optional - filter by agent UUID
5316
5317
  output_format: 'csv' or 'json'
5318
+ with_results: Also compute each benchmark's pass/fail/unrated tally and
5319
+ pass rate (one extra scenario fetch per benchmark) — a go/no-go
5320
+ portfolio view across all benchmarks
5317
5321
 
5318
5322
  Returns:
5319
- List of benchmarks with id, name, agent, scenario count
5323
+ List of benchmarks with id, name, agent, scenario count (and pass-rate
5324
+ columns when with_results is set)
5320
5325
  """
5321
5326
  benchmarks = await client.list_benchmarks(agent_id=agent_id)
5322
- mapped = [
5323
- {
5327
+ mapped = []
5328
+ for b in benchmarks:
5329
+ row = {
5324
5330
  "id": b.get("id"),
5325
5331
  "name": b.get("name"),
5326
5332
  "agent_name": b.get("agent", {}).get("name", ""),
5327
5333
  "scenario_count": b.get("scenario_count", 0),
5328
5334
  "description": str(b.get("description", ""))[:80],
5329
5335
  }
5330
- for b in benchmarks
5331
- ]
5336
+ if with_results:
5337
+ scenarios = await client.list_scenarios(
5338
+ benchmark_id=b.get("id"), fetch_all=True
5339
+ )
5340
+ tally = _pass_status_tally(scenarios)
5341
+ pass_rate = tally["pass_rate"]
5342
+ row["passed"] = tally["passed"]
5343
+ row["failed"] = tally["failed"]
5344
+ row["unrated"] = tally["unrated"]
5345
+ row["pass_rate"] = (
5346
+ f"{pass_rate * 100:.1f}%" if pass_rate is not None else "n/a"
5347
+ )
5348
+ mapped.append(row)
5349
+
5350
+ columns = ["id", "name", "agent_name", "scenario_count"]
5351
+ if with_results:
5352
+ columns += ["passed", "failed", "unrated", "pass_rate"]
5353
+ columns.append("description")
5332
5354
 
5333
5355
  if output_format == "csv":
5334
- return to_csv(
5335
- mapped, ["id", "name", "agent_name", "scenario_count", "description"]
5336
- )
5356
+ return to_csv(mapped, columns)
5337
5357
  return to_json(mapped)
5338
5358
 
5339
5359
 
@@ -5710,6 +5730,40 @@ async def benchmark_clone(
5710
5730
  return "\n".join(lines)
5711
5731
 
5712
5732
 
5733
+ def _pass_status_tally(scenarios: list[dict]) -> dict[str, Any]:
5734
+ """Tally scenarios by pass_status and compute the pass rate among rated.
5735
+
5736
+ Scenario pass_status from the API is the *effective* value (the latest run's
5737
+ pass_status when present, else the scenario's own), so this reflects the most
5738
+ recent run per scenario.
5739
+ """
5740
+ tally = {"pass": 0, "fail": 0, "unrated": 0}
5741
+ failing: list[dict[str, Any]] = []
5742
+ unrated: list[dict[str, Any]] = []
5743
+ for scenario in scenarios:
5744
+ status = str(scenario.get("pass_status") or "unrated").lower()
5745
+ if status not in tally:
5746
+ status = "unrated"
5747
+ tally[status] += 1
5748
+ entry = {"id": scenario.get("id"), "name": scenario.get("name")}
5749
+ if status == "fail":
5750
+ failing.append(entry)
5751
+ elif status == "unrated":
5752
+ unrated.append(entry)
5753
+
5754
+ rated = tally["pass"] + tally["fail"]
5755
+ return {
5756
+ "total": len(scenarios),
5757
+ "passed": tally["pass"],
5758
+ "failed": tally["fail"],
5759
+ "unrated": tally["unrated"],
5760
+ "rated": rated,
5761
+ "pass_rate": round(tally["pass"] / rated, 4) if rated else None,
5762
+ "failing_scenarios": failing,
5763
+ "unrated_scenarios": unrated,
5764
+ }
5765
+
5766
+
5713
5767
  async def benchmark_results(
5714
5768
  client: AppliedClient,
5715
5769
  benchmark_id: str,
@@ -5739,30 +5793,18 @@ async def benchmark_results(
5739
5793
  except AppliedAPIError as e:
5740
5794
  return _format_error(e)
5741
5795
 
5742
- tally = {"pass": 0, "fail": 0, "unrated": 0}
5743
- failing: list[dict[str, Any]] = []
5744
- unrated: list[dict[str, Any]] = []
5745
- for scenario in scenarios:
5746
- status = str(scenario.get("pass_status") or "unrated").lower()
5747
- if status not in tally:
5748
- status = "unrated"
5749
- tally[status] += 1
5750
- entry = {"id": scenario.get("id"), "name": scenario.get("name")}
5751
- if status == "fail":
5752
- failing.append(entry)
5753
- elif status == "unrated":
5754
- unrated.append(entry)
5755
-
5756
- rated = tally["pass"] + tally["fail"]
5757
- pass_rate = round(tally["pass"] / rated, 4) if rated else None
5796
+ t = _pass_status_tally(scenarios)
5797
+ failing = t["failing_scenarios"]
5798
+ unrated = t["unrated_scenarios"]
5799
+ pass_rate = t["pass_rate"]
5758
5800
  summary = {
5759
5801
  "benchmark_id": benchmark_id,
5760
5802
  "benchmark_name": benchmark.get("name"),
5761
- "total_scenarios": len(scenarios),
5762
- "passed": tally["pass"],
5763
- "failed": tally["fail"],
5764
- "unrated": tally["unrated"],
5765
- "rated": rated,
5803
+ "total_scenarios": t["total"],
5804
+ "passed": t["passed"],
5805
+ "failed": t["failed"],
5806
+ "unrated": t["unrated"],
5807
+ "rated": t["rated"],
5766
5808
  "pass_rate": pass_rate,
5767
5809
  "failing_scenarios": failing,
5768
5810
  "unrated_scenarios": unrated,
@@ -5772,16 +5814,16 @@ async def benchmark_results(
5772
5814
  return to_json(summary)
5773
5815
 
5774
5816
  pass_rate_str = (
5775
- f"{pass_rate * 100:.1f}% ({tally['pass']}/{rated} rated)"
5817
+ f"{pass_rate * 100:.1f}% ({t['passed']}/{t['rated']} rated)"
5776
5818
  if pass_rate is not None
5777
5819
  else "n/a (no rated scenarios yet)"
5778
5820
  )
5779
5821
  lines = [
5780
5822
  f"# Benchmark Results: {benchmark.get('name')} ({benchmark_id})",
5781
5823
  f"total_scenarios: {summary['total_scenarios']}",
5782
- f"passed: {tally['pass']}",
5783
- f"failed: {tally['fail']}",
5784
- f"unrated: {tally['unrated']}",
5824
+ f"passed: {t['passed']}",
5825
+ f"failed: {t['failed']}",
5826
+ f"unrated: {t['unrated']}",
5785
5827
  f"pass_rate: {pass_rate_str}",
5786
5828
  ]
5787
5829
  if failing:
@@ -5942,6 +5984,97 @@ async def scenario_create(
5942
5984
  return result
5943
5985
 
5944
5986
 
5987
+ async def scenario_create_bulk(
5988
+ client: AppliedClient,
5989
+ conversation_ids: list[str],
5990
+ *,
5991
+ benchmark_id: str | None = None,
5992
+ benchmark_name: str | None = None,
5993
+ agent_id: str | None = None,
5994
+ name_prefix: str | None = None,
5995
+ output_format: str = "text",
5996
+ ) -> str:
5997
+ """
5998
+ Create scenarios from several conversations at once and attach them to a
5999
+ benchmark — the fast way to build a regression suite from real conversations.
6000
+
6001
+ Each scenario's name is derived from its source conversation's title; pass
6002
+ name_prefix to instead name them "<prefix> 1", "<prefix> 2", … (skips the
6003
+ per-conversation title lookup). Names are de-duplicated server-side.
6004
+
6005
+ Args:
6006
+ client: Authenticated AppliedClient
6007
+ conversation_ids: Source conversation UUIDs (one scenario each)
6008
+ benchmark_id: Attach the scenarios to this existing benchmark
6009
+ benchmark_name: Create or reuse a benchmark by name (requires agent_id)
6010
+ agent_id: Required when benchmark_name is used
6011
+ name_prefix: Name scenarios "<prefix> N" instead of using conversation titles
6012
+ output_format: 'text' (default) or 'json'
6013
+
6014
+ Returns:
6015
+ Summary of created scenarios and any per-conversation errors.
6016
+ """
6017
+ if not conversation_ids:
6018
+ return _format_argument_error("Pass at least one conversation id.")
6019
+
6020
+ created: list[dict[str, Any]] = []
6021
+ errors: list[dict[str, Any]] = []
6022
+ for index, conversation_id in enumerate(conversation_ids):
6023
+ if name_prefix:
6024
+ name = f"{name_prefix} {index + 1}"
6025
+ else:
6026
+ try:
6027
+ conversation = await client.get_conversation(conversation_id)
6028
+ name = conversation.get("title") or f"Scenario {str(conversation_id)[:8]}"
6029
+ except AppliedAPIError:
6030
+ name = f"Scenario {str(conversation_id)[:8]}"
6031
+
6032
+ try:
6033
+ scenario = await client.create_scenario(
6034
+ input_conversation_id=conversation_id,
6035
+ name=name,
6036
+ benchmark_id=benchmark_id,
6037
+ benchmark_name=benchmark_name,
6038
+ agent_id=agent_id,
6039
+ )
6040
+ created.append(
6041
+ {
6042
+ "id": scenario.get("id"),
6043
+ "name": scenario.get("name"),
6044
+ "conversation_id": conversation_id,
6045
+ }
6046
+ )
6047
+ except AppliedAPIError as e:
6048
+ errors.append({"conversation_id": conversation_id, "error": str(e)})
6049
+
6050
+ summary = {
6051
+ "requested": len(conversation_ids),
6052
+ "created": len(created),
6053
+ "failed": len(errors),
6054
+ "scenarios": created,
6055
+ "errors": errors,
6056
+ }
6057
+
6058
+ if output_format == "json":
6059
+ return to_json(summary)
6060
+
6061
+ lines = [
6062
+ "# Bulk Scenario Create",
6063
+ f"requested: {summary['requested']}",
6064
+ f"created: {summary['created']}",
6065
+ f"failed: {summary['failed']}",
6066
+ ]
6067
+ if created:
6068
+ lines.append("\n# Created")
6069
+ lines.extend(f" - {s['name']} ({s['id']})" for s in created[:50])
6070
+ if len(created) > 50:
6071
+ lines.append(f" ... and {len(created) - 50} more")
6072
+ if errors:
6073
+ lines.append(f"\n# Errors ({len(errors)})")
6074
+ lines.extend(f" - {e['conversation_id']}: {e['error']}" for e in errors[:20])
6075
+ return "\n".join(lines)
6076
+
6077
+
5945
6078
  async def scenario_update(
5946
6079
  client: AppliedClient,
5947
6080
  scenario_id: str,
@@ -112,6 +112,7 @@ DOMAIN_TOOL_RENAMES: dict[str, dict[str, str]] = {
112
112
  "scenario_list": "scenarios_list",
113
113
  "scenario_get": "scenarios_get",
114
114
  "scenario_create": "scenarios_create",
115
+ "scenario_create_bulk": "scenarios_create_bulk",
115
116
  "scenario_update": "scenarios_update",
116
117
  "scenario_delete": "scenarios_delete",
117
118
  "scenario_run_list": "scenarios_runs_list",
@@ -43,6 +43,7 @@ class ScenariosBulkCancelInput(StrictInput):
43
43
  class BenchmarksListInput(StrictInput):
44
44
  agent_id: str | None = None
45
45
  limit: int = 50
46
+ with_results: bool = False
46
47
 
47
48
 
48
49
  class BenchmarksGetInput(StrictInput):
@@ -97,6 +98,14 @@ class ScenariosCreateInput(StrictInput):
97
98
  agent_id: str | None = None
98
99
 
99
100
 
101
+ class ScenariosCreateBulkInput(StrictInput):
102
+ conversation_ids: list[str]
103
+ benchmark_id: str | None = None
104
+ benchmark_name: str | None = None
105
+ agent_id: str | None = None
106
+ name_prefix: str | None = None
107
+
108
+
100
109
  class ScenariosUpdateInput(StrictInput):
101
110
  scenario_id: str
102
111
  name: str | None = None
@@ -395,10 +404,26 @@ async def benchmarks_list_handler(
395
404
  agent_id=params.agent_id,
396
405
  limit=params.limit,
397
406
  )
407
+ payload = []
408
+ for benchmark in benchmarks:
409
+ row = _project_benchmark_payload(benchmark)
410
+ if params.with_results:
411
+ from applied_cli.tools import _pass_status_tally
412
+
413
+ scenarios = await client.list_scenarios(
414
+ benchmark_id=benchmark.get("id"), fetch_all=True
415
+ )
416
+ tally = _pass_status_tally(scenarios)
417
+ row["results"] = {
418
+ "passed": tally["passed"],
419
+ "failed": tally["failed"],
420
+ "unrated": tally["unrated"],
421
+ "pass_rate": tally["pass_rate"],
422
+ }
423
+ payload.append(row)
398
424
  except AppliedAPIError as exc:
399
425
  return _api_error_result(exc)
400
426
 
401
- payload = [_project_benchmark_payload(benchmark) for benchmark in benchmarks]
402
427
  return ToolResult(
403
428
  data=payload,
404
429
  summary=_count_summary(len(payload), "benchmark"),
@@ -634,6 +659,46 @@ async def scenarios_create_handler(
634
659
  )
635
660
 
636
661
 
662
+ async def scenarios_create_bulk_handler(
663
+ client: AppliedClient,
664
+ params: ScenariosCreateBulkInput,
665
+ ) -> ToolResult[Any]:
666
+ from applied_cli import tools as legacy_tools
667
+
668
+ raw = await legacy_tools.scenario_create_bulk(
669
+ client,
670
+ conversation_ids=params.conversation_ids,
671
+ benchmark_id=params.benchmark_id,
672
+ benchmark_name=params.benchmark_name,
673
+ agent_id=params.agent_id,
674
+ name_prefix=params.name_prefix,
675
+ output_format="json",
676
+ )
677
+ try:
678
+ data = json.loads(raw)
679
+ except (json.JSONDecodeError, TypeError):
680
+ return ToolResult(data={"message": raw}, summary=str(raw))
681
+
682
+ next_actions = []
683
+ if data.get("created"):
684
+ next_actions.append(
685
+ "Use scenarios_bulk_run with the benchmark_id to run the new scenarios."
686
+ )
687
+ return ToolResult(
688
+ data=data,
689
+ summary=(
690
+ f"Created {data.get('created', 0)}/{data.get('requested', 0)} "
691
+ f"scenarios ({data.get('failed', 0)} failed)."
692
+ ),
693
+ warnings=(
694
+ [f"{data['failed']} conversation(s) failed to convert."]
695
+ if data.get("failed")
696
+ else []
697
+ ),
698
+ next_actions=next_actions,
699
+ )
700
+
701
+
637
702
  async def scenarios_get_handler(
638
703
  client: AppliedClient,
639
704
  params: ScenariosGetInput,
@@ -991,7 +1056,11 @@ def scenario_specs() -> list[ToolSpec]:
991
1056
  ToolSpec(
992
1057
  name="benchmarks_list",
993
1058
  namespace="benchmarks",
994
- description="List conversation benchmarks as structured rows.",
1059
+ description=(
1060
+ "List conversation benchmarks as structured rows. Set "
1061
+ "with_results=true for each benchmark's pass/fail/unrated tally "
1062
+ "and pass rate (a go/no-go portfolio view)."
1063
+ ),
995
1064
  input_model=BenchmarksListInput,
996
1065
  output_model=None,
997
1066
  handler=benchmarks_list_handler,
@@ -1095,6 +1164,21 @@ def scenario_specs() -> list[ToolSpec]:
1095
1164
  read_write_mode="write",
1096
1165
  tags=["scenario_create", "native"],
1097
1166
  ),
1167
+ ToolSpec(
1168
+ name="scenarios_create_bulk",
1169
+ namespace="scenarios",
1170
+ description=(
1171
+ "Create scenarios from several conversations at once and attach "
1172
+ "them to a benchmark — build a regression suite from real "
1173
+ "conversations. Names derive from each conversation's title "
1174
+ "unless name_prefix is given."
1175
+ ),
1176
+ input_model=ScenariosCreateBulkInput,
1177
+ output_model=None,
1178
+ handler=scenarios_create_bulk_handler,
1179
+ read_write_mode="write",
1180
+ tags=["scenario_create_bulk", "native"],
1181
+ ),
1098
1182
  ToolSpec(
1099
1183
  name="scenarios_update",
1100
1184
  namespace="scenarios",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: applied-cli
3
- Version: 0.6.7
3
+ Version: 0.6.9
4
4
  Summary: CLI and shared client library for Applied Labs AI support agents
5
5
  Author: Applied Labs
6
6
  License-Expression: MIT
@@ -40,6 +40,7 @@ tests/test_audit_tools.py
40
40
  tests/test_auth_context.py
41
41
  tests/test_benchmark_clone.py
42
42
  tests/test_benchmark_delete_guardrail.py
43
+ tests/test_benchmark_list_with_results.py
43
44
  tests/test_benchmark_results.py
44
45
  tests/test_benchmark_scenario_tools.py
45
46
  tests/test_cli.py
@@ -53,6 +54,7 @@ tests/test_recovery.py
53
54
  tests/test_scenario_bulk_cancel.py
54
55
  tests/test_scenario_bulk_run_contact.py
55
56
  tests/test_scenario_bulk_run_wait.py
57
+ tests/test_scenario_create_bulk.py
56
58
  tests/test_toolkit_contract.py
57
59
  tests/test_v2_agents.py
58
60
  tests/test_v2_articles.py
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "applied-cli"
3
- version = "0.6.7"
3
+ version = "0.6.9"
4
4
  description = "CLI and shared client library for Applied Labs AI support agents"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.11"
@@ -0,0 +1,104 @@
1
+ import json
2
+
3
+ import pytest
4
+
5
+ from applied_cli import tools
6
+
7
+ BENCHMARKS = [
8
+ {"id": "b1", "name": "Cancel", "agent": {"name": "August"}, "scenario_count": 3},
9
+ {"id": "b2", "name": "Refund", "agent": {"name": "August"}, "scenario_count": 1},
10
+ ]
11
+
12
+ SCENARIOS_BY_BENCHMARK = {
13
+ "b1": [
14
+ {"id": "s1", "name": "a", "pass_status": "pass"},
15
+ {"id": "s2", "name": "b", "pass_status": "fail"},
16
+ {"id": "s3", "name": "c", "pass_status": "unrated"},
17
+ ],
18
+ "b2": [{"id": "s4", "name": "d", "pass_status": "pass"}],
19
+ }
20
+
21
+
22
+ class FakeListClient:
23
+ def __init__(self):
24
+ self.list_scenarios_calls = 0
25
+
26
+ async def list_benchmarks(self, agent_id=None, limit=50):
27
+ return list(BENCHMARKS)
28
+
29
+ async def list_scenarios(self, benchmark_id=None, fetch_all=True, **kwargs):
30
+ self.list_scenarios_calls += 1
31
+ return list(SCENARIOS_BY_BENCHMARK.get(benchmark_id, []))
32
+
33
+
34
+ @pytest.mark.asyncio
35
+ async def test_list_without_results_does_not_fetch_scenarios():
36
+ client = FakeListClient()
37
+ out = await tools.benchmark_list(client, output_format="json")
38
+ rows = json.loads(out)
39
+ assert client.list_scenarios_calls == 0
40
+ assert "pass_rate" not in rows[0]
41
+
42
+
43
+ @pytest.mark.asyncio
44
+ async def test_list_with_results_adds_pass_rate_per_benchmark():
45
+ client = FakeListClient()
46
+ out = await tools.benchmark_list(
47
+ client, output_format="json", with_results=True
48
+ )
49
+ rows = {r["id"]: r for r in json.loads(out)}
50
+ # One scenario fetch per benchmark.
51
+ assert client.list_scenarios_calls == 2
52
+ # b1: 1 pass / 2 rated = 50%
53
+ assert rows["b1"]["passed"] == 1
54
+ assert rows["b1"]["failed"] == 1
55
+ assert rows["b1"]["unrated"] == 1
56
+ assert rows["b1"]["pass_rate"] == "50.0%"
57
+ # b2: 1 pass / 1 rated = 100%
58
+ assert rows["b2"]["pass_rate"] == "100.0%"
59
+
60
+
61
+ @pytest.mark.asyncio
62
+ async def test_list_with_results_csv_has_columns():
63
+ client = FakeListClient()
64
+ out = await tools.benchmark_list(client, output_format="csv", with_results=True)
65
+ header = out.splitlines()[0]
66
+ for col in ("passed", "failed", "unrated", "pass_rate"):
67
+ assert col in header
68
+
69
+
70
+ def test_pass_status_tally_pure():
71
+ tally = tools._pass_status_tally(
72
+ [
73
+ {"id": "1", "pass_status": "pass"},
74
+ {"id": "2", "pass_status": "PASS"}, # case-insensitive
75
+ {"id": "3", "pass_status": "fail"},
76
+ {"id": "4"}, # missing -> unrated
77
+ ]
78
+ )
79
+ assert tally["passed"] == 2
80
+ assert tally["failed"] == 1
81
+ assert tally["unrated"] == 1
82
+ assert tally["rated"] == 3
83
+ assert tally["pass_rate"] == round(2 / 3, 4)
84
+
85
+
86
+ def test_pass_status_tally_no_rated():
87
+ tally = tools._pass_status_tally([{"id": "1", "pass_status": "unrated"}])
88
+ assert tally["pass_rate"] is None
89
+
90
+
91
+ @pytest.mark.asyncio
92
+ async def test_v2_benchmarks_list_with_results():
93
+ from applied_cli.v2.scenarios import (
94
+ BenchmarksListInput,
95
+ benchmarks_list_handler,
96
+ )
97
+
98
+ client = FakeListClient()
99
+ result = await benchmarks_list_handler(
100
+ client, BenchmarksListInput(with_results=True)
101
+ )
102
+ by_id = {r["id"]: r for r in result.data}
103
+ assert by_id["b1"]["results"]["passed"] == 1
104
+ assert by_id["b2"]["results"]["pass_rate"] == 1.0
@@ -0,0 +1,112 @@
1
+ import json
2
+
3
+ import pytest
4
+
5
+ from applied_cli import tools
6
+ from applied_cli.client import AppliedAPIError
7
+
8
+
9
+ class FakeBulkCreateClient:
10
+ def __init__(self, titles=None, fail_ids=None):
11
+ self._titles = titles or {}
12
+ self._fail_ids = set(fail_ids or [])
13
+ self.get_conversation_calls = []
14
+ self.created = []
15
+
16
+ async def get_conversation(self, conversation_id, *, shop_id=None):
17
+ self.get_conversation_calls.append(conversation_id)
18
+ return {"id": conversation_id, "title": self._titles.get(conversation_id)}
19
+
20
+ async def create_scenario(
21
+ self,
22
+ input_conversation_id,
23
+ name,
24
+ benchmark_id=None,
25
+ benchmark_name=None,
26
+ agent_id=None,
27
+ ):
28
+ if input_conversation_id in self._fail_ids:
29
+ raise AppliedAPIError("boom", status_code=400)
30
+ rec = {
31
+ "id": f"scn-{len(self.created) + 1}",
32
+ "name": name,
33
+ "input_conversation_id": input_conversation_id,
34
+ "benchmark_id": benchmark_id,
35
+ }
36
+ self.created.append(rec)
37
+ return rec
38
+
39
+
40
+ @pytest.mark.asyncio
41
+ async def test_bulk_create_names_from_conversation_titles():
42
+ client = FakeBulkCreateClient(
43
+ titles={"c1": "Cancel order BP123", "c2": "Refund request"}
44
+ )
45
+ out = await tools.scenario_create_bulk(
46
+ client, ["c1", "c2"], benchmark_id="b1", output_format="json"
47
+ )
48
+ data = json.loads(out)
49
+ assert data["created"] == 2
50
+ assert [s["name"] for s in client.created] == ["Cancel order BP123", "Refund request"]
51
+ assert all(s["benchmark_id"] == "b1" for s in client.created)
52
+ assert client.get_conversation_calls == ["c1", "c2"]
53
+
54
+
55
+ @pytest.mark.asyncio
56
+ async def test_name_prefix_skips_title_lookup():
57
+ client = FakeBulkCreateClient()
58
+ await tools.scenario_create_bulk(
59
+ client, ["c1", "c2", "c3"], name_prefix="DG Cancel", output_format="json"
60
+ )
61
+ assert [s["name"] for s in client.created] == [
62
+ "DG Cancel 1",
63
+ "DG Cancel 2",
64
+ "DG Cancel 3",
65
+ ]
66
+ # No per-conversation fetches when a prefix is supplied.
67
+ assert client.get_conversation_calls == []
68
+
69
+
70
+ @pytest.mark.asyncio
71
+ async def test_missing_title_falls_back_to_short_id():
72
+ client = FakeBulkCreateClient(titles={"abcdef12-0000": None})
73
+ await tools.scenario_create_bulk(
74
+ client, ["abcdef12-0000"], output_format="json"
75
+ )
76
+ assert client.created[0]["name"] == "Scenario abcdef12"
77
+
78
+
79
+ @pytest.mark.asyncio
80
+ async def test_partial_failures_are_reported_not_fatal():
81
+ client = FakeBulkCreateClient(
82
+ titles={"c1": "A", "c2": "B", "c3": "C"}, fail_ids=["c2"]
83
+ )
84
+ out = await tools.scenario_create_bulk(
85
+ client, ["c1", "c2", "c3"], output_format="json"
86
+ )
87
+ data = json.loads(out)
88
+ assert data["created"] == 2
89
+ assert data["failed"] == 1
90
+ assert data["errors"][0]["conversation_id"] == "c2"
91
+
92
+
93
+ @pytest.mark.asyncio
94
+ async def test_empty_list_is_an_argument_error():
95
+ client = FakeBulkCreateClient()
96
+ out = await tools.scenario_create_bulk(client, [], output_format="text")
97
+ assert "at least one conversation id" in out.lower()
98
+
99
+
100
+ @pytest.mark.asyncio
101
+ async def test_v2_scenarios_create_bulk_handler():
102
+ from applied_cli.v2.scenarios import (
103
+ ScenariosCreateBulkInput,
104
+ scenarios_create_bulk_handler,
105
+ )
106
+
107
+ client = FakeBulkCreateClient(titles={"c1": "A"})
108
+ result = await scenarios_create_bulk_handler(
109
+ client, ScenariosCreateBulkInput(conversation_ids=["c1"], benchmark_id="b1")
110
+ )
111
+ assert result.data["created"] == 1
112
+ assert "scenarios" in " ".join(result.next_actions).lower()
File without changes
File without changes