@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.5.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +135 -100
- package/package.json +7 -4
- package/schema/CHANGELOG.md +7 -0
- package/schema/v1/eval-document.schema.json +143 -11
- package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
- package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
- package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
- package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
- package/schema/v1/examples/valid/multi-turn-output.json +59 -0
- package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
- package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
- package/schema/version.json +2 -2
- package/src/clients/cli/agent_selector.py +74 -0
- package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
- package/src/clients/cli/api_clients/A2A/a2a_client.py +475 -0
- package/src/clients/cli/api_clients/__init__.py +3 -0
- package/src/clients/cli/api_clients/base_agent_client.py +77 -0
- package/src/clients/cli/cli_args.py +136 -0
- package/src/clients/cli/cli_logging/cli_logger.py +33 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +56 -2
- package/src/clients/cli/cli_logging/logging_utils.py +0 -1
- package/src/clients/cli/common.py +64 -0
- package/src/clients/cli/env_validator.py +73 -0
- package/src/clients/cli/evaluation_runner.py +653 -0
- package/src/clients/cli/evaluator_resolver.py +9 -6
- package/src/clients/cli/generate_report.py +272 -129
- package/src/clients/cli/main.py +157 -1174
- package/src/clients/cli/parallel_executor.py +57 -0
- package/src/clients/cli/prompt_loader.py +148 -0
- package/src/clients/cli/readme.md +9 -53
- package/src/clients/cli/requirements.txt +1 -1
- package/src/clients/cli/response_extractor.py +4 -603
- package/src/clients/cli/result_writer.py +488 -0
- package/src/clients/cli/retry_policy.py +52 -0
- package/src/clients/cli/samples/multiturn_example.json +35 -0
- package/src/clients/cli/throttle_gate.py +82 -0
- package/src/clients/node-js/bin/runevals.js +82 -20
- package/src/clients/node-js/config/default.js +12 -11
- package/src/clients/node-js/lib/agent-id.js +12 -0
- package/src/clients/node-js/lib/env-loader.js +14 -20
- package/src/clients/node-js/lib/eula-manager.js +78 -0
- package/src/clients/node-js/lib/progress.js +13 -11
|
@@ -0,0 +1,488 @@
|
|
|
1
|
+
"""Output formatting, score conversion, and result writing."""
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
import webbrowser
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
from typing import Any, Dict, List, Optional
|
|
10
|
+
|
|
11
|
+
from cli_logging.cli_logger import emit_structured_log
|
|
12
|
+
from cli_logging.logging_utils import Operation
|
|
13
|
+
from common import (
|
|
14
|
+
DEFAULT_PASS_THRESHOLD,
|
|
15
|
+
RELEVANCE,
|
|
16
|
+
COHERENCE,
|
|
17
|
+
GROUNDEDNESS,
|
|
18
|
+
SIMILARITY,
|
|
19
|
+
TOOL_CALL_ACCURACY,
|
|
20
|
+
CITATIONS,
|
|
21
|
+
EXACT_MATCH,
|
|
22
|
+
PARTIAL_MATCH,
|
|
23
|
+
METRIC_IDS,
|
|
24
|
+
STATUS_PASS,
|
|
25
|
+
STATUS_FAIL,
|
|
26
|
+
STATUS_ERROR,
|
|
27
|
+
STATUS_PARTIAL,
|
|
28
|
+
STATUS_UNKNOWN,
|
|
29
|
+
pascal_case_to_title,
|
|
30
|
+
RunConfig,
|
|
31
|
+
)
|
|
32
|
+
from generate_report import generate_html_report, calculate_aggregate_statistics
|
|
33
|
+
from schema_handler import SchemaVersionManager
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def write_results_to_html(results: List[Dict], output_file: str,
                          agent_name: Optional[str] = None, agent_id: Optional[str] = None,
                          cli_version: Optional[str] = None):
    """Render *results* as an HTML report and save it to *output_file*.

    The markup is produced by ``generate_html_report``; the parent directory
    of *output_file* is created when it does not exist yet.

    Args:
        results: Internal evaluation result dicts to render.
        output_file: Destination path of the HTML report.
        agent_name: Optional agent name forwarded to the report generator.
        agent_id: Optional agent ID forwarded to the report generator.
        cli_version: Optional CLI version forwarded to the report generator.

    Any exception is logged as an error and terminates the process with
    exit status 1.
    """
    try:
        report_markup = generate_html_report(
            results,
            agent_name=agent_name,
            agent_id=agent_id,
            cli_version=cli_version,
        )
        target_dir = os.path.dirname(os.path.abspath(output_file))
        os.makedirs(target_dir, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as report_file:
            report_file.write(report_markup)
        emit_structured_log("info", f"HTML report saved to {output_file}", operation=Operation.WRITE_OUTPUT)
    except Exception as exc:
        emit_structured_log("error", f"Error writing to HTML file: {exc}", operation=Operation.WRITE_OUTPUT)
        sys.exit(1)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def write_results_to_console(results, agent_name: Optional[str] = None,
                             agent_id: Optional[str] = None,
                             cli_version: Optional[str] = None):
    """Print run metadata, aggregate statistics, and per-item results to stdout.

    Args:
        results: Sequence of internal result dicts. Entries whose ``"type"`` is
            ``"multi_turn"`` are printed as a thread with one section per turn
            followed by a thread summary; every other entry is printed as a
            single prompt.
        agent_name: Optional agent name shown in the metadata line.
        agent_id: Optional agent ID shown in the metadata line.
        cli_version: Optional CLI version shown in the metadata line.

    Output is colorized with raw ANSI escape sequences.
    """
    # ANSI color codes
    BOLD = '\033[1m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    CYAN = '\033[96m'
    MAGENTA = '\033[95m'
    ORANGE = '\033[38;5;208m'
    RED = '\033[91m'
    RESET = '\033[0m'

    def _print_evaluated_item(response: str, expected_response: str,
                              evaluators_ran: List[str], item_results: Dict[str, Any],
                              error: Optional[str] = None) -> None:
        """Print the body of a single evaluated item (single-turn prompt or multi-turn turn).

        The item header (Prompt X / Turn X) is printed by the caller; this helper
        prints evaluators, response, expected response, error, and metrics.
        Falsy fields are skipped entirely, as are metrics whose value is None.
        """
        if evaluators_ran:
            print(f"{BOLD}{CYAN}Evaluators:{RESET} {', '.join(evaluators_ran)}")
        if response:
            print(f"{BOLD}{CYAN}Response:{RESET} {response}")
        if expected_response:
            print(f"{BOLD}{YELLOW}Expected Response:{RESET} {expected_response}")
        if error:
            print(f"{BOLD}{RED}Error:{RESET} {error}")

        for eval_name, v in item_results.items():
            if v is None:
                continue
            display_name = pascal_case_to_title(eval_name)
            # Relevance and Coherence get dedicated colors; all other metrics share blue.
            if eval_name == RELEVANCE:
                color = MAGENTA
            elif eval_name == COHERENCE:
                color = ORANGE
            else:
                color = BLUE
            print(f"{BOLD}{color}{display_name}:{RESET} {json.dumps(v, indent=4)}")

    # Show metadata
    metadata_parts = []
    if agent_name:
        metadata_parts.append(f"Agent Name: {agent_name}")
    if agent_id:
        metadata_parts.append(f"Agent ID: {agent_id}")
    if cli_version:
        metadata_parts.append(f"CLI Version: {cli_version}")
    if metadata_parts:
        print(f"{BOLD}{CYAN}{' | '.join(metadata_parts)}{RESET}")
        print()

    aggregates = calculate_aggregate_statistics(results)
    if aggregates:
        # The item count is read from the first metric's stats, falling back to
        # len(results) -- presumably 'total_prompts' is the same for every metric;
        # verify against calculate_aggregate_statistics.
        total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
        # The aggregate section is only printed when there is more than one item.
        if total_items > 1:
            print(f"{BOLD}{BLUE}Aggregate Statistics ({total_items} prompts):{RESET}")
            print(f"{BLUE}{'=' * 60}{RESET}")

            for metric_name, stats in aggregates.items():
                # Pass-rate color bands: green >= 80, yellow >= 60, red otherwise.
                pass_color = GREEN if stats['pass_rate'] >= 80 else YELLOW if stats['pass_rate'] >= 60 else RED
                prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
                total_prompts = stats.get('total_prompts', total_items)
                print(f"{BOLD}{CYAN}{metric_name}:{RESET} ({prompts_evaluated}/{total_prompts} prompts)")
                print(f"  Pass Rate: {pass_color}{stats['pass_rate']:.1f}%{RESET} ({stats['pass_count']}/{stats['total_evaluated']} passed)")
                print(f"  Avg Score: {MAGENTA}{stats['avg_score']:.2f}{RESET}")
                if stats.get('threshold') is not None:
                    print(f"  Threshold: {YELLOW}{stats['threshold']}{RESET}")
                print()

            print(f"{BLUE}{'=' * 60}{RESET}")
            print()

    print(f"{BOLD}{BLUE}Individual Results:{RESET}")
    print(f"{BLUE}{'=' * 50}{RESET}")
    for i, result in enumerate(results, 1):
        if result.get("type") == "multi_turn":
            thread_name = result.get("name", "Unnamed Thread")
            summary = result.get("summary", {})
            status = summary.get("overall_status", STATUS_UNKNOWN)
            # Thread color: green for pass, yellow for partial, red for anything else.
            status_color = GREEN if status == STATUS_PASS else YELLOW if status == STATUS_PARTIAL else RED

            print(f"{BOLD}{MAGENTA}Thread {i}: {thread_name}{RESET}")
            for t_idx, turn in enumerate(result.get("turns", []), 1):
                turn_status = turn.get("status", STATUS_UNKNOWN)
                # Turn color: green for pass, red for fail/error, yellow for any other status.
                turn_color = GREEN if turn_status == STATUS_PASS else RED if turn_status in (STATUS_FAIL, STATUS_ERROR) else YELLOW
                print(f"{BOLD}{turn_color}Turn {t_idx}:{RESET} [{turn_status}] {turn.get('prompt', '')}")
                _print_evaluated_item(
                    response=turn.get("response", ""),
                    expected_response=turn.get("expected_response", ""),
                    evaluators_ran=turn.get("evaluators_ran", []),
                    item_results=turn.get("results", {}),
                    error=turn.get("error"),
                )
                print()
            print(f"{BOLD}{MAGENTA}Thread {i} Summary:{RESET}")
            print(f"  Status: {status_color}{status.upper()}{RESET}")
            print(f"  Turns passed: {status_color}{summary.get('turns_passed', 0)}/{summary.get('turns_total', 0)}{RESET}")
            print(f"{BLUE}{'-' * 30}{RESET}")
        else:
            # Single-turn results index 'prompt' directly, so a result without
            # that key raises KeyError here.
            print(f"{BOLD}{GREEN}Prompt {i}:{RESET} {result['prompt']}")
            _print_evaluated_item(
                response=result.get('response', ''),
                expected_response=result.get('expected_response', ''),
                evaluators_ran=result.get('evaluators_ran', []),
                item_results=result.get('results', {}),
                # Single-turn results carry their error under 'errorDetails',
                # whereas multi-turn turns use 'error'.
                error=result.get('errorDetails'),
            )
            print(f"{BLUE}{'-' * 30}{RESET}")
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
    """Build a schema ``EvalScore`` dict from a decorated metric dict.

    Args:
        data: Decorated metric dict; the numeric score is stored under
            *metric_id*.
        metric_id: Key under which the numeric score is stored in *data*.

    Returns:
        A dict with ``score``, ``result`` and ``threshold`` (plus ``reason``
        when one is available), or ``None`` when *data* holds no int/float
        value under *metric_id*.
    """
    raw_score = data[metric_id] if metric_id in data else None
    if not isinstance(raw_score, (int, float)):
        return None

    threshold = data.get("threshold", DEFAULT_PASS_THRESHOLD)

    # Trust an explicit pass/fail verdict; otherwise derive it from the threshold.
    verdict = data.get("result")
    if verdict not in (STATUS_PASS, STATUS_FAIL):
        if raw_score >= threshold:
            verdict = STATUS_PASS
        else:
            verdict = STATUS_FAIL

    converted: Dict[str, Any] = {
        "score": raw_score,
        "result": verdict,
        "threshold": threshold,
    }

    # A metric-specific reason takes precedence over the generic one.
    explanation = data.get(f"{metric_id}_reason") or data.get("reason")
    if explanation:
        converted["reason"] = explanation
    return converted
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _convert_scores_to_schema(results_dict: Dict[str, Any]) -> Dict[str, Any]:
    """Translate raw evaluator results into schema-shaped score objects.

    Entries in *results_dict* are dicts, or ``None`` for evaluators that were
    skipped or crashed; ``None`` entries produce no output key.

    Returns:
        A dict keyed by schema score name (``relevance``, ``coherence``,
        ``groundedness``, ``similarity``, ``toolCallAccuracy``, ``citations``,
        ``exactMatch``, ``partialMatch``) holding only the scores that could
        be converted.
    """
    schema_scores: Dict[str, Any] = {}

    # Threshold-graded metrics all share the EvalScore shape.
    graded_metrics = (
        (RELEVANCE, "relevance"),
        (COHERENCE, "coherence"),
        (GROUNDEDNESS, "groundedness"),
        (SIMILARITY, "similarity"),
        (TOOL_CALL_ACCURACY, "toolCallAccuracy"),
    )
    for evaluator_name, output_key in graded_metrics:
        raw = results_dict.get(evaluator_name)
        if raw is not None:
            converted = extract_eval_score(raw, METRIC_IDS[evaluator_name])
            if converted:
                schema_scores[output_key] = converted

    # Citations report a count instead of a score.
    citations_raw = results_dict.get(CITATIONS)
    if citations_raw is not None:
        citation_count = citations_raw.get("citations", 0)
        citation_threshold = citations_raw.get("threshold", 1)
        citation_verdict = citations_raw.get("result")
        if citation_verdict not in (STATUS_PASS, STATUS_FAIL):
            if citation_count >= citation_threshold:
                citation_verdict = STATUS_PASS
            else:
                citation_verdict = STATUS_FAIL
        citation_entry: Dict[str, Any] = {
            "count": citation_count,
            "result": citation_verdict,
            "threshold": citation_threshold,
        }
        if "citation_format" in citations_raw:
            citation_entry["format"] = citations_raw["citation_format"]
        schema_scores["citations"] = citation_entry

    # Exact match is a boolean: only a raw value of exactly 1.0 counts as a match.
    exact_raw = results_dict.get(EXACT_MATCH)
    if exact_raw is not None:
        matched = exact_raw.get("exact_match", 0.0) == 1.0
        fallback_verdict = STATUS_PASS if matched else STATUS_FAIL
        schema_scores["exactMatch"] = {
            "match": matched,
            "result": exact_raw.get("result", fallback_verdict),
            "reason": exact_raw.get("exact_match_reason", ""),
        }

    partial_raw = results_dict.get(PARTIAL_MATCH)
    if partial_raw is not None:
        schema_scores["partialMatch"] = {
            "score": partial_raw.get("partial_match", 0.0),
            "result": partial_raw.get("result", STATUS_FAIL),
            "threshold": partial_raw.get("threshold", 0.5),
            "reason": partial_raw.get("partial_match_reason", ""),
        }

    return schema_scores
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def convert_result_to_eval_item(result: Dict) -> Dict:
    """Turn an internal single-turn result dict into a schema EvalItem dict.

    ``prompt``, ``response`` and ``expected_response`` are required in
    *result* (a missing key raises ``KeyError``). ``evaluators`` and
    ``evaluators_mode`` are copied when present, and ``scores`` is added only
    when at least one score could be converted.
    """
    eval_item: Dict[str, Any] = {
        required_key: result[required_key]
        for required_key in ("prompt", "response", "expected_response")
    }

    for optional_key in ("evaluators", "evaluators_mode"):
        if optional_key in result:
            eval_item[optional_key] = result[optional_key]

    converted_scores = _convert_scores_to_schema(result.get("results", {}))
    if converted_scores:
        eval_item["scores"] = converted_scores

    return eval_item
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def convert_thread_result_to_output(thread_result: Dict) -> Dict:
    """Turn an internal multi-turn thread result into its output representation.

    Each turn keeps its ``prompt`` (empty string when absent), any of
    ``expected_response``, ``response``, ``status``, ``error``, ``evaluators``
    and ``evaluators_mode`` that it carries, and a ``scores`` object when at
    least one score could be converted. Thread-level ``name``,
    ``description``, ``conversation_id`` and ``summary`` are included only
    when they are truthy.
    """
    passthrough_turn_keys = (
        "expected_response",
        "response",
        "status",
        "error",
        "evaluators",
        "evaluators_mode",
    )

    converted_turns = []
    for source_turn in thread_result.get("turns", []):
        converted: Dict[str, Any] = {"prompt": source_turn.get("prompt", "")}
        for key in passthrough_turn_keys:
            if key in source_turn:
                converted[key] = source_turn[key]

        turn_scores = _convert_scores_to_schema(source_turn.get("results", {}))
        if turn_scores:
            converted["scores"] = turn_scores

        converted_turns.append(converted)

    thread_output: Dict[str, Any] = {}
    for key in ("name", "description", "conversation_id"):
        if thread_result.get(key):
            thread_output[key] = thread_result[key]
    thread_output["turns"] = converted_turns
    if thread_result.get("summary"):
        thread_output["summary"] = thread_result["summary"]

    return thread_output
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
def convert_result_to_output_item(result: Dict) -> Dict:
    """Convert an internal result dict to an output item, dispatching on its type.

    Results whose ``"type"`` is ``"multi_turn"`` go through the thread
    converter; everything else is treated as a single-turn eval item.
    """
    is_thread = result.get("type") == "multi_turn"
    converter = convert_thread_result_to_output if is_thread else convert_result_to_eval_item
    return converter(result)
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None,
                          default_evaluators: Optional[Dict[str, Any]] = None,
                          agent_name: Optional[str] = None,
                          cli_version: Optional[str] = None):
    """Save *results* as an eval-document JSON file.

    The document has the shape
        {schemaVersion, metadata, default_evaluators?, items: [...]}
    where ``metadata`` always carries a UTC ``evaluatedAt`` timestamp and,
    when provided, ``agentId``, ``agentName`` and ``cliVersion``.

    Args:
        results: Internal result dicts; each is converted to an output item.
        output_file: Destination path; its parent directory is created if needed.
        agent_id: Optional agent ID recorded in the metadata.
        default_evaluators: Optional evaluator defaults, written when not None.
        agent_name: Optional agent name recorded in the metadata.
        cli_version: Optional CLI version recorded in the metadata.

    Any exception is logged as an error and terminates the process with
    exit status 1.
    """
    try:
        # Best effort: fall back to "1.0.0" when the schema version cannot be determined.
        try:
            schema_version = SchemaVersionManager().get_current_version()
        except Exception:
            schema_version = "1.0.0"

        converted_items = []
        for entry in results:
            converted_items.append(convert_result_to_output_item(entry))

        run_metadata: Dict[str, Any] = {"evaluatedAt": datetime.now(timezone.utc).isoformat()}
        optional_metadata = (
            ("agentId", agent_id),
            ("agentName", agent_name),
            ("cliVersion", cli_version),
        )
        for meta_key, meta_value in optional_metadata:
            if meta_value:
                run_metadata[meta_key] = meta_value

        # Insertion order defines the key order of the written document.
        document: Dict[str, Any] = {"schemaVersion": schema_version, "metadata": run_metadata}
        if default_evaluators is not None:
            document["default_evaluators"] = default_evaluators
        document["items"] = converted_items

        destination_dir = os.path.dirname(os.path.abspath(output_file))
        os.makedirs(destination_dir, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as handle:
            json.dump(document, handle, indent=2, ensure_ascii=False)
        emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
    except Exception as exc:
        emit_structured_log("error", f"Error writing to JSON file: {exc}", operation=Operation.WRITE_OUTPUT)
        sys.exit(1)
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def _results_to_csv_json(results_dict: Dict) -> str:
|
|
364
|
+
"""Serialize evaluator results dict to a CSV-safe JSON string.
|
|
365
|
+
|
|
366
|
+
Skips None (crashed/skipped evaluators). Results are dicts produced
|
|
367
|
+
by _decorate_metric.
|
|
368
|
+
"""
|
|
369
|
+
if not results_dict:
|
|
370
|
+
return ""
|
|
371
|
+
non_null = {k: v for k, v in results_dict.items() if v is not None}
|
|
372
|
+
return json.dumps(non_null) if non_null else ""
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def write_results_to_csv(results: List[Dict], output_file: str,
                         agent_name: Optional[str] = None, agent_id: Optional[str] = None,
                         cli_version: Optional[str] = None):
    """Write results to CSV file.

    Layout (sections are only written when they apply):
      * a ``# ...`` comment line with agent name / agent ID / CLI version,
      * an ``# AGGREGATE STATISTICS`` table when more than one item was evaluated,
      * a single-turn table whose columns are the union of the row keys,
      * a ``# MULTI-TURN RESULTS`` table with one row per turn plus one
        ``summary`` row per thread.

    Args:
        results: Internal result dicts (single-turn and/or ``multi_turn``).
        output_file: Destination path; its parent directory is created if needed.
        agent_name: Optional agent name for the metadata comment line.
        agent_id: Optional agent ID for the metadata comment line.
        cli_version: Optional CLI version for the metadata comment line.

    Any exception is logged as an error and terminates the process with
    exit status 1.
    """
    try:
        os.makedirs(os.path.dirname(os.path.abspath(output_file)), exist_ok=True)
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            if results:
                metadata_parts = []
                if agent_name:
                    metadata_parts.append(f"Agent Name: {agent_name}")
                if agent_id:
                    metadata_parts.append(f"Agent ID: {agent_id}")
                if cli_version:
                    metadata_parts.append(f"CLI Version: {cli_version}")
                if metadata_parts:
                    f.write(f"# {' | '.join(metadata_parts)}\n")

                aggregates = calculate_aggregate_statistics(results)
                if aggregates:
                    total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
                    if total_items > 1:
                        f.write("# AGGREGATE STATISTICS\n")
                        # Use csv.writer so metric names containing commas, quotes
                        # or newlines are quoted instead of corrupting the table.
                        # lineterminator="\n" keeps the line endings this section
                        # has always used.
                        aggregate_writer = csv.writer(f, lineterminator="\n")
                        aggregate_writer.writerow([
                            "Metric", "Prompts Evaluated", "Total Prompts", "Pass Rate (%)",
                            "Passed", "Failed", "Avg Score", "Threshold",
                        ])
                        for metric_name, stats in aggregates.items():
                            # A threshold that is missing OR explicitly None is
                            # reported as N/A (never as the text "None").
                            threshold = stats.get('threshold')
                            threshold_str = 'N/A' if threshold is None else str(threshold)
                            prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
                            total_prompts = stats.get('total_prompts', total_items)
                            aggregate_writer.writerow([
                                metric_name,
                                prompts_evaluated,
                                total_prompts,
                                f"{stats['pass_rate']:.1f}",
                                stats['pass_count'],
                                stats['fail_count'],
                                f"{stats['avg_score']:.2f}",
                                threshold_str,
                            ])
                        f.write("\n# INDIVIDUAL RESULTS\n")

                single_turn_rows = []
                multi_turn_rows = []
                for result in results:
                    if result.get("type") == "multi_turn":
                        thread_name = result.get("name", "")
                        for turn_idx, turn in enumerate(result.get("turns", [])):
                            multi_turn_rows.append({
                                "thread_name": thread_name,
                                "turn_index": turn_idx + 1,
                                "prompt": turn.get("prompt", ""),
                                "response": turn.get("response", ""),
                                "expected_response": turn.get("expected_response", ""),
                                "status": turn.get("status", ""),
                                "error": turn.get("error", ""),
                                "scores": _results_to_csv_json(turn.get("results", {})),
                            })
                        # One trailing summary row per thread; its "error" column
                        # is left to DictWriter's empty default.
                        summary = result.get("summary", {})
                        multi_turn_rows.append({
                            "thread_name": thread_name,
                            "turn_index": "summary",
                            "prompt": "",
                            "response": "",
                            "expected_response": "",
                            "status": summary.get("overall_status", ""),
                            "scores": f"{summary.get('turns_passed', 0)}/{summary.get('turns_total', 0)} turns passed",
                        })
                    else:
                        # Evaluator configuration and raw results are not written as
                        # columns; the results are collapsed into one JSON "scores" cell.
                        exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode', '_enhanced_response', 'results'}
                        row = {k: v for k, v in result.items() if k not in exclude_keys}
                        if "results" in result:
                            row["scores"] = _results_to_csv_json(result["results"])
                        single_turn_rows.append(row)

                if single_turn_rows:
                    if multi_turn_rows:
                        f.write("# SINGLE-TURN RESULTS\n")
                    # Columns: keys of the first row, then any additional keys in
                    # first-seen order, so rows with differing keys all fit.
                    fieldnames = list(single_turn_rows[0].keys())
                    for row in single_turn_rows:
                        for k in row:
                            if k not in fieldnames:
                                fieldnames.append(k)
                    writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
                    writer.writeheader()
                    writer.writerows(single_turn_rows)

                if multi_turn_rows:
                    if single_turn_rows:
                        f.write("\n")
                    f.write("# MULTI-TURN RESULTS\n")
                    fieldnames = ["thread_name", "turn_index", "prompt", "response", "expected_response", "status", "error", "scores"]
                    writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
                    writer.writeheader()
                    writer.writerows(multi_turn_rows)
        emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
    except Exception as e:
        emit_structured_log("error", f"Error writing to CSV file: {e}", operation=Operation.WRITE_OUTPUT)
        sys.exit(1)
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
def output_results(results: List[Dict], config: RunConfig, default_evaluators: Optional[Dict[str, Any]] = None,
                   agent_name: Optional[str] = None, cli_version: Optional[str] = None):
    """Output results based on specified format.

    The format is chosen from the (case-insensitive) extension of
    ``config.output``: ``.csv`` writes CSV, ``.html`` writes an HTML report
    and opens it in the default browser, and ``.json`` or any other extension
    writes the JSON eval document. When ``config.output`` is empty the
    results are printed to the console.

    Args:
        results: Internal result dicts to output.
        config: Run configuration; ``output`` and ``m365_agent_id`` are read.
        default_evaluators: Evaluator defaults, forwarded to the JSON writer only.
        agent_name: Optional agent name forwarded to every writer.
        cli_version: Optional CLI version forwarded to every writer.
    """
    # Function-scope import: only the HTML branch needs pathlib.
    from pathlib import Path

    metadata_kwargs = dict(
        agent_name=agent_name,
        agent_id=config.m365_agent_id,
        cli_version=cli_version,
    )
    if not config.output:
        write_results_to_console(results, **metadata_kwargs)
        return

    output_lower = config.output.lower()
    if output_lower.endswith('.csv'):
        write_results_to_csv(results, config.output, **metadata_kwargs)
    elif output_lower.endswith('.html'):
        write_results_to_html(results, config.output, **metadata_kwargs)
        abs_path = os.path.abspath(config.output)
        # Build a real file URI: 'file://' + path is malformed on Windows
        # (file://C:\...) and leaves spaces and other special characters
        # unescaped. Path.as_uri() handles drive letters and percent-encoding.
        webbrowser.open(Path(abs_path).as_uri())
    else:
        # '.json' and every unrecognized extension produce the JSON document.
        write_results_to_json(results, config.output, default_evaluators=default_evaluators,
                              **metadata_kwargs)
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Retry utilities for transient HTTP failures in evaluation flows."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
from email.utils import parsedate_to_datetime
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
RETRYABLE_HTTP_STATUS_CODES = {429, 503, 504}
|
|
10
|
+
MAX_BACKOFF_SECONDS = 60
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def is_retryable_status(status_code: Optional[int]) -> bool:
    """Tell whether *status_code* is one of the transient, retryable HTTP codes.

    ``None`` is never retryable; any other value is converted with ``int()``
    and looked up in ``RETRYABLE_HTTP_STATUS_CODES``.
    """
    return status_code is not None and int(status_code) in RETRYABLE_HTTP_STATUS_CODES
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def get_backoff_seconds(attempt: int) -> int:
    """Compute the exponential backoff delay for a 1-based retry attempt.

    The delay is ``2 ** attempt`` seconds (2, 4, 8 for attempts 1..3), never
    exceeding ``MAX_BACKOFF_SECONDS``.

    Raises:
        ValueError: If *attempt* is less than 1.
    """
    if attempt < 1:
        raise ValueError("attempt must be >= 1")

    uncapped_delay = 2 ** attempt
    return min(uncapped_delay, MAX_BACKOFF_SECONDS)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def get_retry_after_seconds(retry_after_header: Optional[str]) -> Optional[int]:
    """Parse Retry-After header value (delay-seconds or HTTP-date per RFC 7231).

    Args:
        retry_after_header: Raw header value, or ``None`` when the header is absent.

    Returns:
        The number of seconds to wait (never negative; a date in the past
        yields 0), or ``None`` when the header is absent, blank, or cannot be
        parsed as either an integer or a date.
    """
    if retry_after_header is None:
        return None

    value = retry_after_header.strip()
    if not value:
        return None

    # Try delay-seconds (integer) first
    try:
        return max(0, int(value))
    except ValueError:
        pass

    # Try HTTP-date format (RFC 7231 §7.1.3)
    try:
        retry_date = parsedate_to_datetime(value)
    except (ValueError, TypeError):
        # Older Python versions raise TypeError for unparseable dates,
        # newer ones raise ValueError.
        return None

    # parsedate_to_datetime returns a *naive* datetime for a "-0000" zone.
    # Such a value is UTC by definition; without this, subtracting it from the
    # aware "now" raises TypeError and a valid header would be discarded.
    if retry_date.tzinfo is None:
        retry_date = retry_date.replace(tzinfo=timezone.utc)

    now = datetime.now(timezone.utc)
    delta = int((retry_date - now).total_seconds())
    return max(0, delta)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"schemaVersion": "1.2.0",
|
|
3
|
+
"default_evaluators": {
|
|
4
|
+
"Relevance": {},
|
|
5
|
+
"Coherence": {}
|
|
6
|
+
},
|
|
7
|
+
"items": [
|
|
8
|
+
{
|
|
9
|
+
"prompt": "What is Microsoft Graph?",
|
|
10
|
+
"expected_response": "Microsoft Graph is a gateway to data and intelligence in Microsoft 365."
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"name": "Travel planning conversation",
|
|
14
|
+
"description": "Multi-turn thread testing context retention across turns",
|
|
15
|
+
"turns": [
|
|
16
|
+
{
|
|
17
|
+
"prompt": "I'm planning a trip to Seattle next week.",
|
|
18
|
+
"expected_response": "I can help you plan your Seattle trip."
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"prompt": "What's the weather going to be like?",
|
|
22
|
+
"expected_response": "Seattle weather is typically mild with possible rain."
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"prompt": "Should I bring a rain jacket?",
|
|
26
|
+
"expected_response": "Yes, it's always a good idea to bring rain gear to Seattle.",
|
|
27
|
+
"evaluators": {
|
|
28
|
+
"Groundedness": { "threshold": 4 }
|
|
29
|
+
},
|
|
30
|
+
"evaluators_mode": "extend"
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""Per-API throttle gate support for transient HTTP 429 handling."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import threading
|
|
6
|
+
import time
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class GateState:
    """Snapshot state for diagnostics and tests.

    Instances are produced by ``ThrottleGate.state()`` and describe the gate
    at the moment the snapshot was taken.
    """

    # Name of the API the gate was created for.
    api_name: str
    # time.time()-based timestamp until which the gate is blocked (0.0 when
    # the gate has never been blocked or has been cleared).
    blocked_until_epoch: float
    # True when blocked_until_epoch was still in the future at snapshot time.
    is_blocked: bool
    # Retry-after value (seconds) from the most recent call that extended the
    # block window; None when there is none or the gate has been cleared.
    last_retry_after_seconds: Optional[int]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ThrottleGate:
    """Lock-protected per-API gate that makes callers wait out a block window.

    ``apply_retry_after`` opens or extends the window, ``wait_if_blocked``
    sleeps until it has elapsed, ``clear`` resets it, and ``state`` returns a
    snapshot for diagnostics.
    """

    # Upper bound on the total time a single wait_if_blocked() call may sleep.
    MAX_GATE_WAIT_SECONDS = 300.0

    def __init__(self, api_name: str) -> None:
        self.api_name = api_name
        self._lock = threading.Lock()
        self._blocked_until_epoch = 0.0
        self._last_retry_after_seconds: Optional[int] = None

    def apply_retry_after(self, retry_after_seconds: int) -> float:
        """Register a retry-after duration, keeping the latest block deadline.

        Negative durations are treated as zero. The window is only ever
        extended: a deadline earlier than the current one is ignored.

        Returns:
            The effective blocked-until timestamp after this call.
        """
        seconds = int(retry_after_seconds)
        if seconds < 0:
            seconds = 0
        proposed_deadline = time.time() + seconds

        with self._lock:
            if proposed_deadline > self._blocked_until_epoch:
                self._last_retry_after_seconds = seconds
                self._blocked_until_epoch = proposed_deadline
            return self._blocked_until_epoch

    def wait_if_blocked(self) -> float:
        """Block the caller until the gate is open and return the seconds slept.

        The deadline is read again after every sleep, so a concurrent
        ``apply_retry_after`` that extends the window is honoured.

        Raises:
            TimeoutError: If the accumulated wait would exceed
                ``MAX_GATE_WAIT_SECONDS``.
        """
        slept_so_far = 0.0
        while True:
            # Hold the lock only to read the deadline, never while sleeping.
            with self._lock:
                remaining = self._blocked_until_epoch - time.time()

            if remaining <= 0:
                return slept_so_far

            if slept_so_far + remaining > self.MAX_GATE_WAIT_SECONDS:
                raise TimeoutError(
                    f"ThrottleGate '{self.api_name}' exceeded maximum wait of "
                    f"{self.MAX_GATE_WAIT_SECONDS}s (slept {slept_so_far:.1f}s so far)."
                )

            time.sleep(remaining)
            slept_so_far += remaining

    def clear(self) -> None:
        """Put the gate back into its initial, unblocked state."""
        with self._lock:
            self._last_retry_after_seconds = None
            self._blocked_until_epoch = 0.0

    def state(self) -> GateState:
        """Return a snapshot of the gate taken under the lock."""
        with self._lock:
            deadline = self._blocked_until_epoch
            snapshot_time = time.time()
            return GateState(
                api_name=self.api_name,
                blocked_until_epoch=deadline,
                is_blocked=deadline > snapshot_time,
                last_retry_after_seconds=self._last_retry_after_seconds,
            )
|