applied-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,508 @@
1
+ import uuid
2
+ from datetime import datetime
3
+ from typing import Any, Optional
4
+
5
+ import typer
6
+
7
+ from applied_cli.commands._hints import suggest_value
8
+ from applied_cli.commands._ui import confirm_or_exit, emit_success, show_target
9
+ from applied_cli.error_reporting import render_api_error
10
+ from applied_cli.http import (
11
+ APIError,
12
+ create_conversation_benchmark,
13
+ create_conversation_scenario,
14
+ create_scenario_run,
15
+ get_conversation,
16
+ list_conversation_benchmarks,
17
+ list_conversation_messages,
18
+ list_conversation_references,
19
+ list_conversation_scenarios,
20
+ list_scenario_runs,
21
+ patch_conversation_scenario,
22
+ patch_scenario_run,
23
+ )
24
+ from applied_cli.runtime import resolve_runtime
25
+
26
# Typer application object for this module; the ``conversation`` command
# defined in this file is registered on it via ``@app.command``.
app = typer.Typer(help="Rate conversations and persist results in Test Coverage.")

# Default value of the ``benchmark_name`` option of the ``conversation`` command.
DEFAULT_BENCHMARK_NAME = "CLI Self-Rated Conversations"
29
+
30
+
31
+ def _parse_dt(value: str) -> datetime:
32
+ normalized = value.replace("Z", "+00:00")
33
+ return datetime.fromisoformat(normalized)
34
+
35
+
36
+ def _safe_float(value: Any) -> Optional[float]:
37
+ if value is None:
38
+ return None
39
+ try:
40
+ return float(value)
41
+ except Exception:
42
+ return None
43
+
44
+
45
+ def _auto_rate(
46
+ *,
47
+ messages: list[dict[str, Any]],
48
+ references: list[dict[str, Any]],
49
+ ) -> dict[str, Any]:
50
+ user_messages = [m for m in messages if m.get("role") == "user"]
51
+ assistant_messages = [m for m in messages if m.get("role") == "assistant"]
52
+
53
+ pass_status = "pass"
54
+ csat_score = 4.0
55
+ feedback_parts: list[str] = []
56
+
57
+ if not assistant_messages:
58
+ pass_status = "fail"
59
+ csat_score = 1.0
60
+ feedback_parts.append("No assistant message was generated.")
61
+ else:
62
+ latest_assistant = assistant_messages[-1]
63
+ latest_text = str(
64
+ latest_assistant.get("text") or latest_assistant.get("content") or ""
65
+ ).strip()
66
+ if not latest_text:
67
+ pass_status = "fail"
68
+ csat_score = min(csat_score, 2.0)
69
+ feedback_parts.append("Latest assistant response is empty.")
70
+ if len(assistant_messages) < len(user_messages):
71
+ pass_status = "fail"
72
+ csat_score = min(csat_score, 2.0)
73
+ feedback_parts.append(
74
+ "Assistant replies are fewer than user turns in transcript."
75
+ )
76
+
77
+ assistant_message_ids = {str(m.get("id")) for m in assistant_messages}
78
+ assistant_refs = [
79
+ ref for ref in references if str(ref.get("message_id")) in assistant_message_ids
80
+ ]
81
+ relevance_values = [
82
+ score
83
+ for score in (_safe_float(r.get("relevance_score")) for r in assistant_refs)
84
+ if score is not None
85
+ ]
86
+ average_relevance = (
87
+ sum(relevance_values) / len(relevance_values) if relevance_values else None
88
+ )
89
+
90
+ if assistant_messages and not assistant_refs:
91
+ csat_score = min(csat_score, 3.0)
92
+ feedback_parts.append("No message references were found for assistant outputs.")
93
+ elif average_relevance is not None and average_relevance < 0.35:
94
+ pass_status = "fail"
95
+ csat_score = min(csat_score, 2.0)
96
+ feedback_parts.append(
97
+ f"Low reference relevance average ({average_relevance:.2f}) indicates weak grounding."
98
+ )
99
+ elif average_relevance is not None:
100
+ feedback_parts.append(
101
+ f"References found with average relevance {average_relevance:.2f}."
102
+ )
103
+
104
+ if not feedback_parts:
105
+ feedback_parts.append("Response appears grounded and complete.")
106
+
107
+ reasons: list[str] = []
108
+ for ref in assistant_refs:
109
+ reason = str(ref.get("reason") or "").strip()
110
+ if reason:
111
+ reasons.append(reason)
112
+ if reasons:
113
+ preview = "; ".join(reasons[:3])
114
+ feedback_parts.append(f"Top reference reasons: {preview}")
115
+
116
+ return {
117
+ "pass_status": pass_status,
118
+ "csat_score": float(max(1.0, min(5.0, round(csat_score, 1)))),
119
+ "feedback": " ".join(feedback_parts),
120
+ "reference_score": round(average_relevance, 3)
121
+ if average_relevance is not None
122
+ else None,
123
+ "reference_notes": f"assistant_refs={len(assistant_refs)} total_refs={len(references)}",
124
+ }
125
+
126
+
127
+ def _extract_agent_id(conversation: dict[str, Any]) -> Optional[str]:
128
+ nested_agent = conversation.get("agent")
129
+ if isinstance(nested_agent, dict) and nested_agent.get("id"):
130
+ return str(nested_agent["id"])
131
+ flat_agent_id = conversation.get("agent_id")
132
+ if flat_agent_id:
133
+ return str(flat_agent_id)
134
+ return None
135
+
136
+
137
+ def _require_id(value: Any, *, label: str) -> str:
138
+ if not value:
139
+ raise APIError(
140
+ f"Expected {label} in API response.",
141
+ code="MISSING_RESPONSE_FIELD",
142
+ hint="Server returned an unexpected payload shape; inspect the raw API response.",
143
+ retryable=False,
144
+ )
145
+ return str(value)
146
+
147
+
148
def _find_or_create_benchmark(
    *,
    base_url: str,
    shop_id: str,
    api_token: str,
    agent_id: str,
    benchmark_name: str,
) -> Optional[dict[str, Any]]:
    """Return the agent's benchmark named ``benchmark_name``, creating it if needed.

    Names are compared after trimming surrounding whitespace and case-folding
    on BOTH sides, so a requested name with stray whitespace or different
    casing still matches an existing benchmark instead of producing a
    duplicate.

    Args:
        base_url: API base URL forwarded to the HTTP helpers.
        shop_id: Shop identifier forwarded to the HTTP helpers.
        api_token: API token forwarded to the HTTP helpers.
        agent_id: Agent whose benchmarks are listed / created.
        benchmark_name: Name of the benchmark to look up or create.

    Returns:
        The matching or newly created benchmark payload, or ``None`` when
        creation fails with ``APIError``.

    Raises:
        APIError: If listing the existing benchmarks fails.
    """
    wanted = benchmark_name.strip().casefold()
    benchmarks = list_conversation_benchmarks(
        base_url=base_url,
        shop_id=shop_id,
        api_token=api_token,
        agent_id=agent_id,
    )
    for benchmark in benchmarks:
        # ``or ""`` keeps an explicit null name from being read as "None".
        existing = str(benchmark.get("name") or "").strip().casefold()
        if existing == wanted:
            return benchmark
    try:
        return create_conversation_benchmark(
            base_url=base_url,
            shop_id=shop_id,
            api_token=api_token,
            agent_id=agent_id,
            name=benchmark_name,
            description="Scenario collection created by applied-cli rating workflow.",
        )
    except APIError:
        # Best-effort: the caller treats None as "proceed without a benchmark".
        return None
176
+
177
+
178
def _find_or_create_scenario(
    *,
    base_url: str,
    shop_id: str,
    api_token: str,
    agent_id: str,
    benchmark_id: Optional[str],
    conversation_id: str,
) -> dict[str, Any]:
    """Return the ``CLI Rated <conversation_id>`` scenario, creating it if absent.

    When a scenario with that exact name exists and ``benchmark_id`` is given
    but not yet among the scenario's ``benchmarks``, the scenario is patched
    so that its benchmark ids include ``benchmark_id``.

    Args:
        base_url: API base URL forwarded to the HTTP helpers.
        shop_id: Shop identifier forwarded to the HTTP helpers.
        api_token: API token forwarded to the HTTP helpers.
        agent_id: Agent the scenario belongs to.
        benchmark_id: Benchmark to link, or ``None`` to skip linking.
        conversation_id: Conversation used as the scenario input.

    Returns:
        The existing, patched, or newly created scenario payload.
    """
    scenario_name = f"CLI Rated {conversation_id}"
    candidates = list_conversation_scenarios(
        base_url=base_url,
        shop_id=shop_id,
        api_token=api_token,
        agent_id=agent_id,
        name=scenario_name,
    )
    # The first candidate whose trimmed name matches exactly wins.
    match = next(
        (
            candidate
            for candidate in candidates
            if str(candidate.get("name", "")).strip() == scenario_name
        ),
        None,
    )
    if match is None:
        return create_conversation_scenario(
            base_url=base_url,
            shop_id=shop_id,
            api_token=api_token,
            agent_id=agent_id,
            benchmark_id=benchmark_id,
            name=scenario_name,
            input_conversation_id=conversation_id,
        )
    if not benchmark_id:
        return match

    linked = match.get("benchmarks")
    linked_ids = [
        str(entry["id"])
        for entry in (linked if isinstance(linked, list) else [])
        if isinstance(entry, dict) and entry.get("id")
    ]
    if benchmark_id in linked_ids:
        return match
    # Send the already linked ids together with the new one.
    return patch_conversation_scenario(
        base_url=base_url,
        shop_id=shop_id,
        api_token=api_token,
        scenario_id=_require_id(match.get("id"), label="scenario id"),
        payload={"benchmark_ids": linked_ids + [benchmark_id]},
    )
224
+
225
+
226
def _find_or_create_run(
    *,
    base_url: str,
    shop_id: str,
    api_token: str,
    scenario_id: str,
    conversation_id: str,
) -> dict[str, Any]:
    """Return the scenario's latest run, creating one when none exists.

    Args:
        base_url: API base URL forwarded to the HTTP helpers.
        shop_id: Shop identifier forwarded to the HTTP helpers.
        api_token: API token forwarded to the HTTP helpers.
        scenario_id: Scenario whose runs are listed / created.
        conversation_id: Conversation recorded as the output of a new run.

    Returns:
        The first run returned by the ``latest_only`` listing, or the newly
        created run payload when the listing is empty.
    """
    shared = {
        "base_url": base_url,
        "shop_id": shop_id,
        "api_token": api_token,
        "scenario_id": scenario_id,
    }
    existing_runs = list_scenario_runs(**shared, latest_only=True)
    if not existing_runs:
        return create_scenario_run(**shared, output_conversation_id=conversation_id)
    return existing_runs[0]
250
+
251
+
252
+ def _validate_manual_values(
253
+ *,
254
+ pass_status: Optional[str],
255
+ csat_score: Optional[float],
256
+ ) -> None:
257
+ if pass_status is not None and pass_status not in {"pass", "fail"}:
258
+ suggestion = suggest_value(pass_status, ["pass", "fail"])
259
+ hint = f" Did you mean '{suggestion}'?" if suggestion else ""
260
+ raise typer.BadParameter(f"pass-status must be one of: pass, fail.{hint}")
261
+ if csat_score is not None and (csat_score < 1.0 or csat_score > 5.0):
262
+ raise typer.BadParameter("csat-score must be between 1 and 5")
263
+
264
+
265
@app.command(
    "conversation",
    help=(
        "Rate a conversation and persist in Test Coverage. Example: applied-cli test scenarios rate "
        "--conversation-id <uuid> --agent-id <uuid> --auto --yes"
    ),
)
def conversation(
    conversation_id: str = typer.Option(
        ..., "--conversation-id", "--conversation", "--id", help="Conversation UUID to rate."
    ),
    agent_id: Optional[str] = typer.Option(
        None,
        "--agent-id",
        "--agent",
        help="Target agent UUID override (defaults to conversation agent).",
    ),
    benchmark_name: str = typer.Option(
        DEFAULT_BENCHMARK_NAME,
        help="Benchmark collection name used for persisted scenarios.",
    ),
    auto: bool = typer.Option(
        True,
        "--auto/--manual",
        help="Auto-compute rating or provide manual score values.",
    ),
    include_references: bool = typer.Option(
        True,
        "--include-references/--no-include-references",
        help="Include MessageReference attribution context in rating.",
    ),
    pass_status: Optional[str] = typer.Option(
        None, "--pass-status", help="Manual pass/fail result (required for --manual)."
    ),
    csat_score: Optional[float] = typer.Option(
        None, "--csat-score", help="Manual CSAT score between 1 and 5."
    ),
    feedback: Optional[str] = typer.Option(None, "--feedback", help="Manual feedback notes."),
    reference_score: Optional[float] = typer.Option(
        None, help="Optional reference quality score."
    ),
    reference_notes: Optional[str] = typer.Option(
        None, help="Optional notes about reference quality."
    ),
    base_url: Optional[str] = typer.Option(None, help="Applied base URL."),
    shop_id: Optional[str] = typer.Option(None, help="Target shop UUID."),
    api_token: Optional[str] = typer.Option(None, help="Applied API token."),
    dry_run: bool = typer.Option(False, help="Show rating and payloads without persisting."),
    yes: bool = typer.Option(
        False, "--yes", "-y", help="Skip pre-execution confirmation prompt."
    ),
) -> None:
    """Rate one conversation and persist the result as a scenario run.

    Steps: validate the options, resolve runtime credentials, fetch the
    conversation with its messages (and references unless disabled), compute
    the automatic rating, apply any explicitly supplied values on top of it,
    display the outcome and, unless this is a dry run, find or create the
    benchmark, scenario and run and patch the rating onto the run and the
    scenario.

    Raises:
        typer.BadParameter: On invalid option values, including a pass
            status entered at the interactive prompt.
        typer.Exit: With code 1 on API failures or when no agent id can be
            determined; with code 0 after a dry run.
    """
    try:
        uuid.UUID(conversation_id)
    except ValueError as exc:
        raise typer.BadParameter(
            "conversation-id must be a valid UUID."
        ) from exc
    if agent_id:
        try:
            uuid.UUID(agent_id)
        except ValueError as exc:
            raise typer.BadParameter("agent-id must be a valid UUID.") from exc

    _validate_manual_values(pass_status=pass_status, csat_score=csat_score)
    if not auto and yes and not pass_status:
        raise typer.BadParameter(
            "manual mode with --yes requires --pass-status to avoid interactive prompts."
        )
    try:
        resolved_base_url, resolved_shop_id, resolved_token = resolve_runtime(
            base_url=base_url,
            shop_id=shop_id,
            api_token=api_token,
        )
    except APIError as exc:
        typer.echo(render_api_error(exc, action="resolve runtime for rating"), err=True)
        raise typer.Exit(code=1) from exc

    try:
        conversation_data = get_conversation(
            base_url=resolved_base_url,
            shop_id=resolved_shop_id,
            api_token=resolved_token,
            conversation_id=conversation_id,
        )
        messages = list_conversation_messages(
            base_url=resolved_base_url,
            shop_id=resolved_shop_id,
            api_token=resolved_token,
            conversation_id=conversation_id,
        )
        references: list[dict[str, Any]] = []
        if include_references:
            references = list_conversation_references(
                base_url=resolved_base_url,
                shop_id=resolved_shop_id,
                api_token=resolved_token,
                conversation_id=conversation_id,
            )
    except APIError as exc:
        typer.echo(render_api_error(exc, action="read conversation for rating"), err=True)
        raise typer.Exit(code=1) from exc

    # Order messages chronologically; entries without created_at sort first.
    # sorted() (not list.sort) so a failure leaves the original list intact.
    try:
        messages = sorted(
            messages,
            key=lambda item: _parse_dt(str(item.get("created_at") or "1970-01-01T00:00:00")),
        )
    except (TypeError, ValueError):
        # ValueError: unparsable timestamp; TypeError: naive and aware
        # datetimes cannot be compared. Degrade gracefully instead of crashing.
        typer.echo(
            "Warning: could not order messages by created_at; using API order.",
            err=True,
        )

    resolved_agent_id = agent_id or _extract_agent_id(conversation_data)
    if not resolved_agent_id:
        typer.echo(
            "Could not determine agent_id from conversation. Provide --agent-id.",
            err=True,
        )
        raise typer.Exit(code=1)

    # NOTE(review): with --no-include-references the reference list is empty,
    # so the automatic rating still reports that no references were found.
    computed = _auto_rate(messages=messages, references=references)
    if not auto and not pass_status:
        pass_status = typer.prompt("Pass status (pass/fail)").strip().lower()
        # The prompted value has not been through validation yet.
        _validate_manual_values(pass_status=pass_status, csat_score=csat_score)

    # Explicitly supplied values always win over the computed rating, in both
    # auto and manual mode; options left unset keep the computed value.
    overrides: dict[str, Any] = {
        "pass_status": pass_status,
        "csat_score": None if csat_score is None else float(csat_score),
        "feedback": feedback,
        "reference_score": None if reference_score is None else float(reference_score),
        "reference_notes": reference_notes,
    }
    computed.update({key: value for key, value in overrides.items() if value is not None})

    show_target(
        {
            "base_url": resolved_base_url,
            "shop_id": resolved_shop_id,
            "conversation_id": conversation_id,
            "agent_id": resolved_agent_id,
            "benchmark_name": benchmark_name,
            "mode": "auto" if auto else "manual",
            "include_references": include_references,
            "dry_run": dry_run,
        }
    )

    typer.echo("Computed rating:")
    typer.echo(f"- pass_status: {computed['pass_status']}")
    typer.echo(f"- csat_score: {computed['csat_score']}")
    typer.echo(f"- feedback: {computed['feedback']}")
    typer.echo(f"- reference_score: {computed.get('reference_score')}")
    typer.echo(f"- reference_notes: {computed.get('reference_notes')}")

    # A dry run writes nothing, so asking for permission to persist would be
    # misleading; exit before the confirmation prompt.
    if dry_run:
        typer.echo("Dry run complete. No records were written.")
        raise typer.Exit(code=0)

    confirm_or_exit(yes=yes, prompt="Continue and persist rating to Test Coverage?")

    try:
        benchmark = _find_or_create_benchmark(
            base_url=resolved_base_url,
            shop_id=resolved_shop_id,
            api_token=resolved_token,
            agent_id=resolved_agent_id,
            benchmark_name=benchmark_name,
        )
        benchmark_id = None
        if benchmark is not None:
            benchmark_id = _require_id(benchmark.get("id"), label="benchmark id")
        else:
            typer.echo(
                "Warning: could not create benchmark with current credentials. "
                "Proceeding without benchmark linkage."
            )

        scenario = _find_or_create_scenario(
            base_url=resolved_base_url,
            shop_id=resolved_shop_id,
            api_token=resolved_token,
            agent_id=resolved_agent_id,
            benchmark_id=benchmark_id,
            conversation_id=conversation_id,
        )
        scenario_id = _require_id(scenario.get("id"), label="scenario id")

        run = _find_or_create_run(
            base_url=resolved_base_url,
            shop_id=resolved_shop_id,
            api_token=resolved_token,
            scenario_id=scenario_id,
            conversation_id=conversation_id,
        )
        run_id = _require_id(run.get("id"), label="run id")

        # The run carries the full rating; the scenario only the summary fields.
        run_payload = {
            "pass_status": computed["pass_status"],
            "csat_score": computed["csat_score"],
            "feedback": computed["feedback"],
            "reference_score": computed.get("reference_score"),
            "reference_notes": computed.get("reference_notes"),
        }
        scenario_payload = {
            "pass_status": computed["pass_status"],
            "csat_score": computed["csat_score"],
            "feedback": computed["feedback"],
        }

        updated_run = patch_scenario_run(
            base_url=resolved_base_url,
            shop_id=resolved_shop_id,
            api_token=resolved_token,
            run_id=run_id,
            payload=run_payload,
        )
        patch_conversation_scenario(
            base_url=resolved_base_url,
            shop_id=resolved_shop_id,
            api_token=resolved_token,
            scenario_id=scenario_id,
            payload=scenario_payload,
        )
    except APIError as exc:
        typer.echo(render_api_error(exc, action="persist rating"), err=True)
        raise typer.Exit(code=1) from exc

    emit_success(
        output_json=False,
        payload={},
        fields={
            "benchmark_id": benchmark_id or "(none)",
            "scenario_id": scenario_id,
            "run_id": run_id,
            "evaluated_at": updated_run.get("evaluated_at"),
        },
    )