@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.5.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/README.md +135 -100
  2. package/package.json +7 -4
  3. package/schema/CHANGELOG.md +7 -0
  4. package/schema/v1/eval-document.schema.json +143 -11
  5. package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
  6. package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
  7. package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
  8. package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
  9. package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
  10. package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
  11. package/schema/v1/examples/valid/multi-turn-output.json +59 -0
  12. package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
  13. package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
  14. package/schema/version.json +2 -2
  15. package/src/clients/cli/agent_selector.py +74 -0
  16. package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
  17. package/src/clients/cli/api_clients/A2A/a2a_client.py +475 -0
  18. package/src/clients/cli/api_clients/__init__.py +3 -0
  19. package/src/clients/cli/api_clients/base_agent_client.py +77 -0
  20. package/src/clients/cli/cli_args.py +136 -0
  21. package/src/clients/cli/cli_logging/cli_logger.py +33 -0
  22. package/src/clients/cli/cli_logging/console_diagnostics.py +56 -2
  23. package/src/clients/cli/cli_logging/logging_utils.py +0 -1
  24. package/src/clients/cli/common.py +64 -0
  25. package/src/clients/cli/env_validator.py +73 -0
  26. package/src/clients/cli/evaluation_runner.py +653 -0
  27. package/src/clients/cli/evaluator_resolver.py +9 -6
  28. package/src/clients/cli/generate_report.py +272 -129
  29. package/src/clients/cli/main.py +157 -1174
  30. package/src/clients/cli/parallel_executor.py +57 -0
  31. package/src/clients/cli/prompt_loader.py +148 -0
  32. package/src/clients/cli/readme.md +9 -53
  33. package/src/clients/cli/requirements.txt +1 -1
  34. package/src/clients/cli/response_extractor.py +4 -603
  35. package/src/clients/cli/result_writer.py +488 -0
  36. package/src/clients/cli/retry_policy.py +52 -0
  37. package/src/clients/cli/samples/multiturn_example.json +35 -0
  38. package/src/clients/cli/throttle_gate.py +82 -0
  39. package/src/clients/node-js/bin/runevals.js +82 -20
  40. package/src/clients/node-js/config/default.js +12 -11
  41. package/src/clients/node-js/lib/agent-id.js +12 -0
  42. package/src/clients/node-js/lib/env-loader.js +14 -20
  43. package/src/clients/node-js/lib/eula-manager.js +78 -0
  44. package/src/clients/node-js/lib/progress.js +13 -11
@@ -0,0 +1,488 @@
1
+ """Output formatting, score conversion, and result writing."""
2
+
3
+ import csv
4
+ import json
5
+ import os
6
+ import sys
7
+ import webbrowser
8
+ from datetime import datetime, timezone
9
+ from typing import Any, Dict, List, Optional
10
+
11
+ from cli_logging.cli_logger import emit_structured_log
12
+ from cli_logging.logging_utils import Operation
13
+ from common import (
14
+ DEFAULT_PASS_THRESHOLD,
15
+ RELEVANCE,
16
+ COHERENCE,
17
+ GROUNDEDNESS,
18
+ SIMILARITY,
19
+ TOOL_CALL_ACCURACY,
20
+ CITATIONS,
21
+ EXACT_MATCH,
22
+ PARTIAL_MATCH,
23
+ METRIC_IDS,
24
+ STATUS_PASS,
25
+ STATUS_FAIL,
26
+ STATUS_ERROR,
27
+ STATUS_PARTIAL,
28
+ STATUS_UNKNOWN,
29
+ pascal_case_to_title,
30
+ RunConfig,
31
+ )
32
+ from generate_report import generate_html_report, calculate_aggregate_statistics
33
+ from schema_handler import SchemaVersionManager
34
+
35
+
36
def write_results_to_html(results: List[Dict], output_file: str,
                          agent_name: Optional[str] = None, agent_id: Optional[str] = None,
                          cli_version: Optional[str] = None):
    """Render the results as an HTML report and save it to ``output_file``.

    The markup is produced by ``generate_html_report``; the parent directory
    of ``output_file`` is created when it does not exist yet. Any failure is
    logged and terminates the process with exit code 1.

    Args:
        results: Internal evaluation result dicts to render.
        output_file: Destination path of the HTML report.
        agent_name: Optional agent name forwarded to the report generator.
        agent_id: Optional agent id forwarded to the report generator.
        cli_version: Optional CLI version forwarded to the report generator.
    """
    try:
        report_markup = generate_html_report(
            results,
            agent_name=agent_name,
            agent_id=agent_id,
            cli_version=cli_version,
        )
        target_dir = os.path.dirname(os.path.abspath(output_file))
        os.makedirs(target_dir, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as report_file:
            report_file.write(report_markup)
        emit_structured_log("info", f"HTML report saved to {output_file}", operation=Operation.WRITE_OUTPUT)
    except Exception as exc:
        emit_structured_log("error", f"Error writing to HTML file: {exc}", operation=Operation.WRITE_OUTPUT)
        sys.exit(1)
50
+
51
+
52
def write_results_to_console(results, agent_name: Optional[str] = None,
                             agent_id: Optional[str] = None,
                             cli_version: Optional[str] = None):
    """Print evaluation results to stdout using ANSI colors.

    Output order: an optional metadata line, aggregate statistics (only when
    more than one item was counted), then each individual result. Items whose
    ``type`` is ``"multi_turn"`` are printed turn by turn followed by a thread
    summary; every other item is printed as a single prompt.

    Args:
        results: Internal evaluation result dicts.
        agent_name: Optional agent name shown in the metadata line.
        agent_id: Optional agent id shown in the metadata line.
        cli_version: Optional CLI version shown in the metadata line.
    """
    # ANSI color codes
    BOLD = '\033[1m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    CYAN = '\033[96m'
    MAGENTA = '\033[95m'
    ORANGE = '\033[38;5;208m'
    RED = '\033[91m'
    RESET = '\033[0m'

    def _print_evaluated_item(response: str, expected_response: str,
                              evaluators_ran: List[str], item_results: Dict[str, Any],
                              error: Optional[str] = None) -> None:
        """Print the body of a single evaluated item (single-turn prompt or multi-turn turn).

        The item header (Prompt X / Turn X) is printed by the caller; this helper
        prints evaluators, response, expected response, error, and metrics.
        """
        # Each section is printed only when it has content.
        if evaluators_ran:
            print(f"{BOLD}{CYAN}Evaluators:{RESET} {', '.join(evaluators_ran)}")
        if response:
            print(f"{BOLD}{CYAN}Response:{RESET} {response}")
        if expected_response:
            print(f"{BOLD}{YELLOW}Expected Response:{RESET} {expected_response}")
        if error:
            print(f"{BOLD}{RED}Error:{RESET} {error}")

        for eval_name, v in item_results.items():
            # Entries without a value are not printed.
            if v is None:
                continue
            display_name = pascal_case_to_title(eval_name)
            # Relevance and Coherence get their own colors; all other metrics share blue.
            if eval_name == RELEVANCE:
                color = MAGENTA
            elif eval_name == COHERENCE:
                color = ORANGE
            else:
                color = BLUE
            print(f"{BOLD}{color}{display_name}:{RESET} {json.dumps(v, indent=4)}")

    # Show metadata
    metadata_parts = []
    if agent_name:
        metadata_parts.append(f"Agent Name: {agent_name}")
    if agent_id:
        metadata_parts.append(f"Agent ID: {agent_id}")
    if cli_version:
        metadata_parts.append(f"CLI Version: {cli_version}")
    if metadata_parts:
        print(f"{BOLD}{CYAN}{' | '.join(metadata_parts)}{RESET}")
        print()

    aggregates = calculate_aggregate_statistics(results)
    if aggregates:
        # The item count is read from the first metric's stats, falling back to len(results).
        total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
        # The aggregate section is skipped when only one item was counted.
        if total_items > 1:
            print(f"{BOLD}{BLUE}Aggregate Statistics ({total_items} prompts):{RESET}")
            print(f"{BLUE}{'=' * 60}{RESET}")

            for metric_name, stats in aggregates.items():
                # Pass rate coloring: >= 80 green, >= 60 yellow, otherwise red.
                pass_color = GREEN if stats['pass_rate'] >= 80 else YELLOW if stats['pass_rate'] >= 60 else RED
                prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
                total_prompts = stats.get('total_prompts', total_items)
                print(f"{BOLD}{CYAN}{metric_name}:{RESET} ({prompts_evaluated}/{total_prompts} prompts)")
                print(f"  Pass Rate: {pass_color}{stats['pass_rate']:.1f}%{RESET} ({stats['pass_count']}/{stats['total_evaluated']} passed)")
                print(f"  Avg Score: {MAGENTA}{stats['avg_score']:.2f}{RESET}")
                if stats.get('threshold') is not None:
                    print(f"  Threshold: {YELLOW}{stats['threshold']}{RESET}")
                print()

            print(f"{BLUE}{'=' * 60}{RESET}")
            print()

    print(f"{BOLD}{BLUE}Individual Results:{RESET}")
    print(f"{BLUE}{'=' * 50}{RESET}")
    for i, result in enumerate(results, 1):
        if result.get("type") == "multi_turn":
            thread_name = result.get("name", "Unnamed Thread")
            summary = result.get("summary", {})
            status = summary.get("overall_status", STATUS_UNKNOWN)
            # Thread color: pass is green, partial is yellow, anything else is red.
            status_color = GREEN if status == STATUS_PASS else YELLOW if status == STATUS_PARTIAL else RED

            print(f"{BOLD}{MAGENTA}Thread {i}: {thread_name}{RESET}")
            for t_idx, turn in enumerate(result.get("turns", []), 1):
                turn_status = turn.get("status", STATUS_UNKNOWN)
                # Turn color: pass is green, fail/error is red, any other status is yellow.
                turn_color = GREEN if turn_status == STATUS_PASS else RED if turn_status in (STATUS_FAIL, STATUS_ERROR) else YELLOW
                print(f"{BOLD}{turn_color}Turn {t_idx}:{RESET} [{turn_status}] {turn.get('prompt', '')}")
                _print_evaluated_item(
                    response=turn.get("response", ""),
                    expected_response=turn.get("expected_response", ""),
                    evaluators_ran=turn.get("evaluators_ran", []),
                    item_results=turn.get("results", {}),
                    error=turn.get("error"),
                )
                print()
            print(f"{BOLD}{MAGENTA}Thread {i} Summary:{RESET}")
            print(f"  Status: {status_color}{status.upper()}{RESET}")
            print(f"  Turns passed: {status_color}{summary.get('turns_passed', 0)}/{summary.get('turns_total', 0)}{RESET}")
            print(f"{BLUE}{'-' * 30}{RESET}")
        else:
            # Single-turn items read their error text from 'errorDetails' (turns use 'error').
            print(f"{BOLD}{GREEN}Prompt {i}:{RESET} {result['prompt']}")
            _print_evaluated_item(
                response=result.get('response', ''),
                expected_response=result.get('expected_response', ''),
                evaluators_ran=result.get('evaluators_ran', []),
                item_results=result.get('results', {}),
                error=result.get('errorDetails'),
            )
            print(f"{BLUE}{'-' * 30}{RESET}")
165
+
166
+
167
def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
    """Extract an EvalScore object from a decorated metric dict.

    Maps the internal decorated-metric format to the schema EvalScore shape:
    ``{score, result, threshold}`` plus an optional ``reason``.

    Args:
        data: Decorated metric dict for a single evaluator.
        metric_id: Key under which the numeric score is stored in ``data``.

    Returns:
        The EvalScore dict, or ``None`` when ``data`` holds no numeric value
        under ``metric_id``.
    """
    score_val = data.get(metric_id)
    if not isinstance(score_val, (int, float)):
        return None

    # ``dict.get`` only applies its default when the key is absent, so an
    # explicit ``"threshold": None`` must be mapped to the default by hand;
    # otherwise the comparison below raises TypeError and ``None`` would be
    # emitted as the threshold.
    threshold = data.get("threshold")
    if threshold is None:
        threshold = DEFAULT_PASS_THRESHOLD

    # Trust a pass/fail verdict already present; derive it from the score otherwise.
    result = data.get("result")
    if result not in (STATUS_PASS, STATUS_FAIL):
        result = STATUS_PASS if score_val >= threshold else STATUS_FAIL

    eval_score: Dict[str, Any] = {
        "score": score_val,
        "result": result,
        "threshold": threshold,
    }
    # A metric-specific reason takes precedence over a generic one.
    reason = data.get(f"{metric_id}_reason") or data.get("reason")
    if reason:
        eval_score["reason"] = reason
    return eval_score
192
+
193
+
194
def _convert_scores_to_schema(results_dict: Dict[str, Any]) -> Dict[str, Any]:
    """Translate raw evaluator results into schema-shaped score objects.

    Each value in ``results_dict`` is either a decorated metric dict or
    ``None``; ``None`` entries are left out of the returned mapping.
    """
    schema_scores: Dict[str, Any] = {}

    # Metrics that share the generic {score, result, threshold} shape.
    numeric_metrics = (
        (RELEVANCE, "relevance"),
        (COHERENCE, "coherence"),
        (GROUNDEDNESS, "groundedness"),
        (SIMILARITY, "similarity"),
        (TOOL_CALL_ACCURACY, "toolCallAccuracy"),
    )
    for evaluator_name, output_key in numeric_metrics:
        raw = results_dict.get(evaluator_name)
        if raw is None:
            continue
        converted = extract_eval_score(raw, METRIC_IDS[evaluator_name])
        if converted:
            schema_scores[output_key] = converted

    # Citations report a count instead of a score.
    citations = results_dict.get(CITATIONS)
    if citations is not None:
        citation_count = citations.get("citations", 0)
        citation_threshold = citations.get("threshold", 1)
        citation_result = citations.get("result")
        if citation_result not in (STATUS_PASS, STATUS_FAIL):
            if citation_count >= citation_threshold:
                citation_result = STATUS_PASS
            else:
                citation_result = STATUS_FAIL
        citation_entry: Dict[str, Any] = {
            "count": citation_count,
            "result": citation_result,
            "threshold": citation_threshold,
        }
        if "citation_format" in citations:
            citation_entry["format"] = citations["citation_format"]
        schema_scores["citations"] = citation_entry

    # Exact match is reported as a boolean.
    exact = results_dict.get(EXACT_MATCH)
    if exact is not None:
        matched = exact.get("exact_match", 0.0) == 1.0
        derived_result = STATUS_PASS if matched else STATUS_FAIL
        schema_scores["exactMatch"] = {
            "match": matched,
            "result": exact.get("result", derived_result),
            "reason": exact.get("exact_match_reason", ""),
        }

    partial = results_dict.get(PARTIAL_MATCH)
    if partial is not None:
        schema_scores["partialMatch"] = {
            "score": partial.get("partial_match", 0.0),
            "result": partial.get("result", STATUS_FAIL),
            "threshold": partial.get("threshold", 0.5),
            "reason": partial.get("partial_match_reason", ""),
        }

    return schema_scores
250
+
251
+
252
def convert_result_to_eval_item(result: Dict) -> Dict:
    """Build a schema-shaped EvalItem from an internal single-turn result.

    ``prompt``, ``response`` and ``expected_response`` must be present in
    ``result`` (a missing key raises ``KeyError``). ``evaluators`` and
    ``evaluators_mode`` are copied when present, and ``scores`` is added only
    when at least one score could be converted.
    """
    item: Dict[str, Any] = {
        required: result[required]
        for required in ("prompt", "response", "expected_response")
    }

    for optional in ("evaluators", "evaluators_mode"):
        if optional in result:
            item[optional] = result[optional]

    converted_scores = _convert_scores_to_schema(result.get("results", {}))
    if converted_scores:
        item["scores"] = converted_scores

    return item
270
+
271
+
272
def convert_thread_result_to_output(thread_result: Dict) -> Dict:
    """Build the output representation of a multi-turn thread result.

    Every turn keeps its prompt (empty string when absent), any of the
    pass-through keys it carries, and converted ``scores`` when there are any.
    Thread-level ``name``, ``description``, ``conversation_id`` and
    ``summary`` are included only when they hold a truthy value.
    """
    passthrough_keys = (
        "expected_response",
        "response",
        "status",
        "error",
        "evaluators",
        "evaluators_mode",
    )

    converted_turns = []
    for turn in thread_result.get("turns", []):
        entry: Dict[str, Any] = {"prompt": turn.get("prompt", "")}
        entry.update({key: turn[key] for key in passthrough_keys if key in turn})

        turn_scores = _convert_scores_to_schema(turn.get("results", {}))
        if turn_scores:
            entry["scores"] = turn_scores

        converted_turns.append(entry)

    output: Dict[str, Any] = {
        key: thread_result[key]
        for key in ("name", "description", "conversation_id")
        if thread_result.get(key)
    }
    output["turns"] = converted_turns
    if thread_result.get("summary"):
        output["summary"] = thread_result["summary"]

    return output
308
+
309
+
310
def convert_result_to_output_item(result: Dict) -> Dict:
    """Convert an internal result to its output form, dispatching on ``type``.

    Results typed ``"multi_turn"`` go through the thread converter; all other
    results are converted as single-turn EvalItems.
    """
    is_thread = result.get("type") == "multi_turn"
    converter = convert_thread_result_to_output if is_thread else convert_result_to_eval_item
    return converter(result)
315
+
316
+
317
def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None,
                          default_evaluators: Optional[Dict[str, Any]] = None,
                          agent_name: Optional[str] = None,
                          cli_version: Optional[str] = None):
    """Save the results as an eval-document JSON file.

    Document layout (eval-document.schema.json):
        {schemaVersion, metadata, default_evaluators?, items: [EvalItem]}

    The schema version falls back to "1.0.0" when it cannot be read from
    ``SchemaVersionManager``. Any other failure is logged and terminates the
    process with exit code 1.
    """
    try:
        try:
            schema_version = SchemaVersionManager().get_current_version()
        except Exception:
            schema_version = "1.0.0"

        converted_items = [convert_result_to_output_item(entry) for entry in results]

        run_metadata: Dict[str, Any] = {
            "evaluatedAt": datetime.now(timezone.utc).isoformat(),
        }
        # Optional metadata fields are written only when they carry a value.
        optional_metadata = (
            ("agentId", agent_id),
            ("agentName", agent_name),
            ("cliVersion", cli_version),
        )
        for field_name, field_value in optional_metadata:
            if field_value:
                run_metadata[field_name] = field_value

        document: Dict[str, Any] = {
            "schemaVersion": schema_version,
            "metadata": run_metadata,
        }
        if default_evaluators is not None:
            document["default_evaluators"] = default_evaluators
        document["items"] = converted_items

        target_dir = os.path.dirname(os.path.abspath(output_file))
        os.makedirs(target_dir, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as json_file:
            json.dump(document, json_file, indent=2, ensure_ascii=False)
        emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
    except Exception as exc:
        emit_structured_log("error", f"Error writing to JSON file: {exc}", operation=Operation.WRITE_OUTPUT)
        sys.exit(1)
361
+
362
+
363
def _results_to_csv_json(results_dict: Dict) -> str:
    """Return the evaluator results as a JSON string for a single CSV cell.

    Entries whose value is ``None`` are dropped. An empty string is returned
    when ``results_dict`` is empty/falsy or when nothing remains after the
    ``None`` entries are removed.
    """
    if not results_dict:
        return ""

    present = {}
    for evaluator_name, payload in results_dict.items():
        if payload is not None:
            present[evaluator_name] = payload

    if not present:
        return ""
    return json.dumps(present)
373
+
374
+
375
def write_results_to_csv(results: List[Dict], output_file: str,
                         agent_name: Optional[str] = None, agent_id: Optional[str] = None,
                         cli_version: Optional[str] = None):
    """Write results to a CSV file.

    File layout (each part is written only when it applies):
      * a ``#`` metadata comment line (agent name / id / CLI version),
      * an aggregate-statistics table when more than one item was counted,
      * a single-turn table with one row per result,
      * a multi-turn table with one row per turn plus a summary row per thread.

    Nothing is written to the file when ``results`` is empty. Any failure is
    logged and terminates the process with exit code 1.

    Args:
        results: Internal evaluation result dicts.
        output_file: Destination path; its parent directory is created if needed.
        agent_name: Optional agent name for the metadata line.
        agent_id: Optional agent id for the metadata line.
        cli_version: Optional CLI version for the metadata line.
    """
    # Keys of a single-turn result that never become their own CSV column.
    exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode', '_enhanced_response', 'results'}

    try:
        os.makedirs(os.path.dirname(os.path.abspath(output_file)), exist_ok=True)
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            if results:
                metadata_parts = []
                if agent_name:
                    metadata_parts.append(f"Agent Name: {agent_name}")
                if agent_id:
                    metadata_parts.append(f"Agent ID: {agent_id}")
                if cli_version:
                    metadata_parts.append(f"CLI Version: {cli_version}")
                if metadata_parts:
                    f.write(f"# {' | '.join(metadata_parts)}\n")

                aggregates = calculate_aggregate_statistics(results)
                if aggregates:
                    total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
                    if total_items > 1:
                        f.write("# AGGREGATE STATISTICS\n")
                        # csv.writer quotes metric names that contain commas or
                        # quotes instead of silently corrupting the row; the
                        # "\n" terminator keeps this section's line endings
                        # as they were when the rows were hand-formatted.
                        stats_writer = csv.writer(f, lineterminator="\n")
                        stats_writer.writerow([
                            "Metric", "Prompts Evaluated", "Total Prompts", "Pass Rate (%)",
                            "Passed", "Failed", "Avg Score", "Threshold",
                        ])
                        for metric_name, stats in aggregates.items():
                            # A missing and an explicit-None threshold are both shown as N/A.
                            threshold = stats.get('threshold')
                            stats_writer.writerow([
                                metric_name,
                                stats.get('prompts_evaluated', stats['total_evaluated']),
                                stats.get('total_prompts', total_items),
                                f"{stats['pass_rate']:.1f}",
                                stats['pass_count'],
                                stats['fail_count'],
                                f"{stats['avg_score']:.2f}",
                                'N/A' if threshold is None else threshold,
                            ])
                        f.write("\n# INDIVIDUAL RESULTS\n")

                single_turn_rows = []
                multi_turn_rows = []
                for result in results:
                    if result.get("type") == "multi_turn":
                        thread_name = result.get("name", "")
                        for turn_number, turn in enumerate(result.get("turns", []), 1):
                            multi_turn_rows.append({
                                "thread_name": thread_name,
                                "turn_index": turn_number,
                                "prompt": turn.get("prompt", ""),
                                "response": turn.get("response", ""),
                                "expected_response": turn.get("expected_response", ""),
                                "status": turn.get("status", ""),
                                "error": turn.get("error", ""),
                                "scores": _results_to_csv_json(turn.get("results", {})),
                            })
                        # One extra row per thread carries the overall verdict;
                        # its "error" column is left to DictWriter's empty default.
                        summary = result.get("summary", {})
                        multi_turn_rows.append({
                            "thread_name": thread_name,
                            "turn_index": "summary",
                            "prompt": "",
                            "response": "",
                            "expected_response": "",
                            "status": summary.get("overall_status", ""),
                            "scores": f"{summary.get('turns_passed', 0)}/{summary.get('turns_total', 0)} turns passed",
                        })
                    else:
                        row = {k: v for k, v in result.items() if k not in exclude_keys}
                        if "results" in result:
                            row["scores"] = _results_to_csv_json(result["results"])
                        single_turn_rows.append(row)

                if single_turn_rows:
                    if multi_turn_rows:
                        f.write("# SINGLE-TURN RESULTS\n")
                    # Union of all row keys in first-seen order.
                    fieldnames = list(dict.fromkeys(k for row in single_turn_rows for k in row))
                    writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
                    writer.writeheader()
                    writer.writerows(single_turn_rows)

                if multi_turn_rows:
                    if single_turn_rows:
                        f.write("\n")
                    f.write("# MULTI-TURN RESULTS\n")
                    fieldnames = ["thread_name", "turn_index", "prompt", "response", "expected_response", "status", "error", "scores"]
                    writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
                    writer.writeheader()
                    writer.writerows(multi_turn_rows)
        emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
    except Exception as e:
        emit_structured_log("error", f"Error writing to CSV file: {e}", operation=Operation.WRITE_OUTPUT)
        sys.exit(1)
463
+
464
+
465
def output_results(results: List[Dict], config: RunConfig, default_evaluators: Optional[Dict[str, Any]] = None,
                   agent_name: Optional[str] = None, cli_version: Optional[str] = None):
    """Emit the results in the format selected by ``config.output``.

    Without an output path the results are printed to the console. Otherwise
    the (case-insensitive) file extension selects the writer: ``.csv`` and
    ``.html`` have dedicated writers, and ``.json`` as well as any other
    extension produce the JSON document. An HTML report is additionally
    opened in the default web browser after it has been written.

    Args:
        results: Internal evaluation result dicts.
        config: Run configuration; ``output`` and ``m365_agent_id`` are read.
        default_evaluators: Default evaluator config, included in JSON output only.
        agent_name: Optional agent name passed on to the writer.
        cli_version: Optional CLI version passed on to the writer.
    """
    from pathlib import Path

    metadata_kwargs = dict(
        agent_name=agent_name,
        agent_id=config.m365_agent_id,
        cli_version=cli_version,
    )

    if not config.output:
        write_results_to_console(results, **metadata_kwargs)
        return

    output_lower = config.output.lower()
    if output_lower.endswith('.csv'):
        write_results_to_csv(results, config.output, **metadata_kwargs)
    elif output_lower.endswith('.html'):
        write_results_to_html(results, config.output, **metadata_kwargs)
        # Path.as_uri() yields a well-formed file URI (three slashes, forward
        # slashes on Windows, percent-encoded spaces and special characters),
        # which plain "file://" + path concatenation does not.
        report_uri = Path(os.path.abspath(config.output)).as_uri()
        webbrowser.open(report_uri)
    else:
        # ".json" and every unrecognized extension are written as JSON.
        write_results_to_json(results, config.output, default_evaluators=default_evaluators,
                              **metadata_kwargs)
@@ -0,0 +1,52 @@
1
+ """Retry utilities for transient HTTP failures in evaluation flows."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import datetime, timezone
6
+ from email.utils import parsedate_to_datetime
7
+ from typing import Optional
8
+
9
# HTTP status codes that is_retryable_status() reports as retryable.
RETRYABLE_HTTP_STATUS_CODES = {429, 503, 504}
# Upper bound, in seconds, for a computed backoff delay.
MAX_BACKOFF_SECONDS = 60


def is_retryable_status(status_code: Optional[int]) -> bool:
    """Tell whether ``status_code`` is one of the retryable HTTP status codes.

    ``None`` is never retryable; any other value is converted with ``int()``
    before the membership test.
    """
    return status_code is not None and int(status_code) in RETRYABLE_HTTP_STATUS_CODES
18
+
19
+
20
def get_backoff_seconds(attempt: int) -> int:
    """Compute the exponential backoff delay for a 1-based retry attempt.

    The delay is ``2 ** attempt`` seconds (2, 4, 8 for attempts 1..3) and
    never exceeds ``MAX_BACKOFF_SECONDS``.

    Raises:
        ValueError: If ``attempt`` is smaller than 1.
    """
    if attempt < 1:
        raise ValueError("attempt must be >= 1")
    uncapped_delay = 2 ** attempt
    return min(uncapped_delay, MAX_BACKOFF_SECONDS)
28
+
29
+
30
def get_retry_after_seconds(retry_after_header: Optional[str]) -> Optional[int]:
    """Parse a Retry-After header value (delay-seconds or HTTP-date per RFC 7231).

    Args:
        retry_after_header: Raw header value, or ``None`` when the header is absent.

    Returns:
        The number of whole seconds to wait (never negative), or ``None`` when
        the header is missing, blank, or cannot be parsed in either format.
    """
    if retry_after_header is None:
        return None

    value = retry_after_header.strip()
    if not value:
        return None

    # Try delay-seconds (integer) first
    try:
        return max(0, int(value))
    except ValueError:
        pass

    # Try HTTP-date format (RFC 7231 §7.1.3)
    try:
        retry_date = parsedate_to_datetime(value)
    except (ValueError, TypeError):
        return None

    # A "-0000" zone makes parsedate_to_datetime return a naive datetime that
    # represents UTC. Subtracting it from an aware "now" would raise TypeError
    # and discard a valid header, so attach UTC explicitly.
    if retry_date.tzinfo is None:
        retry_date = retry_date.replace(tzinfo=timezone.utc)

    now = datetime.now(timezone.utc)
    delta = int((retry_date - now).total_seconds())
    # Dates in the past mean "retry immediately".
    return max(0, delta)
@@ -0,0 +1,35 @@
1
+ {
2
+ "schemaVersion": "1.2.0",
3
+ "default_evaluators": {
4
+ "Relevance": {},
5
+ "Coherence": {}
6
+ },
7
+ "items": [
8
+ {
9
+ "prompt": "What is Microsoft Graph?",
10
+ "expected_response": "Microsoft Graph is a gateway to data and intelligence in Microsoft 365."
11
+ },
12
+ {
13
+ "name": "Travel planning conversation",
14
+ "description": "Multi-turn thread testing context retention across turns",
15
+ "turns": [
16
+ {
17
+ "prompt": "I'm planning a trip to Seattle next week.",
18
+ "expected_response": "I can help you plan your Seattle trip."
19
+ },
20
+ {
21
+ "prompt": "What's the weather going to be like?",
22
+ "expected_response": "Seattle weather is typically mild with possible rain."
23
+ },
24
+ {
25
+ "prompt": "Should I bring a rain jacket?",
26
+ "expected_response": "Yes, it's always a good idea to bring rain gear to Seattle.",
27
+ "evaluators": {
28
+ "Groundedness": { "threshold": 4 }
29
+ },
30
+ "evaluators_mode": "extend"
31
+ }
32
+ ]
33
+ }
34
+ ]
35
+ }
@@ -0,0 +1,82 @@
1
+ """Per-API throttle gate support for transient HTTP 429 handling."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import threading
6
+ import time
7
+ from dataclasses import dataclass
8
+ from typing import Optional
9
+
10
+
11
@dataclass
class GateState:
    """Point-in-time view of a throttle gate, meant for diagnostics and tests."""

    # Name of the API the gate belongs to.
    api_name: str
    # Epoch timestamp (time.time() scale) at which the block window ends.
    blocked_until_epoch: float
    # Whether the block window was still open when the snapshot was taken.
    is_blocked: bool
    # Retry-after value recorded by the gate, or None when none is recorded.
    last_retry_after_seconds: Optional[int]
19
+
20
+
21
class ThrottleGate:
    """Lock-guarded gate for one API that makes callers wait out a block window."""

    # Longest cumulative wait wait_if_blocked() tolerates before raising TimeoutError.
    MAX_GATE_WAIT_SECONDS = 300.0

    def __init__(self, api_name: str) -> None:
        self.api_name = api_name
        self._lock = threading.Lock()
        self._blocked_until_epoch = 0.0
        self._last_retry_after_seconds: Optional[int] = None

    def apply_retry_after(self, retry_after_seconds: int) -> float:
        """Register a retry-after duration, never shortening the current window.

        Negative durations are treated as zero. Returns the blocked-until
        epoch that is in effect after the call.
        """
        retry_after_seconds = int(retry_after_seconds)
        if retry_after_seconds < 0:
            retry_after_seconds = 0
        proposed_deadline = time.time() + retry_after_seconds

        with self._lock:
            # Only ever extend the window; an earlier deadline is ignored.
            if self._blocked_until_epoch < proposed_deadline:
                self._blocked_until_epoch = proposed_deadline
            self._last_retry_after_seconds = retry_after_seconds
            return self._blocked_until_epoch

    def wait_if_blocked(self) -> float:
        """Block the caller until the gate is open and return the seconds slept.

        The deadline is read again after every sleep, so a window extended by
        a concurrent ``apply_retry_after`` call is honored as well.
        Raises ``TimeoutError`` once the accumulated wait would exceed
        ``MAX_GATE_WAIT_SECONDS``.
        """
        slept_so_far = 0.0
        while True:
            with self._lock:
                remaining = self._blocked_until_epoch - time.time()
            if remaining <= 0:
                return slept_so_far
            if slept_so_far + remaining > self.MAX_GATE_WAIT_SECONDS:
                raise TimeoutError(
                    f"ThrottleGate '{self.api_name}' exceeded maximum wait of "
                    f"{self.MAX_GATE_WAIT_SECONDS}s (slept {slept_so_far:.1f}s so far)."
                )
            # Sleep without holding the lock so other threads can update the gate.
            time.sleep(remaining)
            slept_so_far += remaining

    def clear(self) -> None:
        """Put the gate back into its initial, unblocked state."""
        with self._lock:
            self._last_retry_after_seconds = None
            self._blocked_until_epoch = 0.0

    def state(self) -> GateState:
        """Return a ``GateState`` snapshot of the gate taken under the lock."""
        with self._lock:
            deadline = self._blocked_until_epoch
            recorded_retry_after = self._last_retry_after_seconds
            snapshot_time = time.time()
        return GateState(
            api_name=self.api_name,
            blocked_until_epoch=deadline,
            is_blocked=deadline > snapshot_time,
            last_retry_after_seconds=recorded_retry_after,
        )