@microsoft/m365-copilot-eval 1.4.0-preview.1 → 1.6.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -4
- package/package.json +4 -3
- package/schema/CHANGELOG.md +14 -0
- package/schema/v1/eval-document.schema.json +3 -3
- package/schema/version.json +1 -1
- package/src/clients/cli/agent_selector.py +74 -0
- package/src/clients/cli/api_clients/A2A/a2a_client.py +96 -30
- package/src/clients/cli/api_clients/base_agent_client.py +0 -1
- package/src/clients/cli/auth/auth_handler.py +21 -1
- package/src/clients/cli/cli_args.py +136 -0
- package/src/clients/cli/cli_logging/cli_logger.py +33 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +3 -1
- package/src/clients/cli/common.py +53 -0
- package/src/clients/cli/env_validator.py +73 -0
- package/src/clients/cli/evaluation_runner.py +653 -0
- package/src/clients/cli/evaluator_resolver.py +9 -6
- package/src/clients/cli/main.py +130 -1676
- package/src/clients/cli/prompt_loader.py +148 -0
- package/src/clients/cli/readme.md +9 -53
- package/src/clients/cli/response_extractor.py +4 -601
- package/src/clients/cli/result_writer.py +488 -0
- package/src/clients/node-js/bin/runevals.js +34 -13
- package/src/clients/node-js/config/default.js +8 -11
- package/src/clients/node-js/lib/env-loader.js +3 -4
- package/src/clients/node-js/lib/python-runtime.js +137 -65
- package/src/clients/node-js/lib/venv-manager.js +3 -2
- package/src/clients/node-js/lib/version-check.js +268 -0
- package/src/clients/cli/api_clients/REST/__init__.py +0 -3
- package/src/clients/cli/api_clients/REST/sydney_client.py +0 -204
package/src/clients/cli/result_writer.py (new file)
@@ -0,0 +1,488 @@
+"""Output formatting, score conversion, and result writing."""
+
+import csv
+import json
+import os
+import sys
+import webbrowser
+from datetime import datetime, timezone
+from typing import Any, Dict, List, Optional
+
+from cli_logging.cli_logger import emit_structured_log
+from cli_logging.logging_utils import Operation
+from common import (
+    DEFAULT_PASS_THRESHOLD,
+    RELEVANCE,
+    COHERENCE,
+    GROUNDEDNESS,
+    SIMILARITY,
+    TOOL_CALL_ACCURACY,
+    CITATIONS,
+    EXACT_MATCH,
+    PARTIAL_MATCH,
+    METRIC_IDS,
+    STATUS_PASS,
+    STATUS_FAIL,
+    STATUS_ERROR,
+    STATUS_PARTIAL,
+    STATUS_UNKNOWN,
+    pascal_case_to_title,
+    RunConfig,
+)
+from generate_report import generate_html_report, calculate_aggregate_statistics
+from schema_handler import SchemaVersionManager
+
+
+def write_results_to_html(results: List[Dict], output_file: str,
+                          agent_name: Optional[str] = None, agent_id: Optional[str] = None,
+                          cli_version: Optional[str] = None):
+    """Write results to HTML file using generate_html_report from generate_report.py."""
+    try:
+        html = generate_html_report(results, agent_name=agent_name, agent_id=agent_id,
+                                    cli_version=cli_version)
+        os.makedirs(os.path.dirname(os.path.abspath(output_file)), exist_ok=True)
+        with open(output_file, 'w', encoding='utf-8') as f:
+            f.write(html)
+        emit_structured_log("info", f"HTML report saved to {output_file}", operation=Operation.WRITE_OUTPUT)
+    except Exception as e:
+        emit_structured_log("error", f"Error writing to HTML file: {e}", operation=Operation.WRITE_OUTPUT)
+        sys.exit(1)
+
+
+def write_results_to_console(results, agent_name: Optional[str] = None,
+                             agent_id: Optional[str] = None,
+                             cli_version: Optional[str] = None):
+    """Write the response to console."""
+    # ANSI color codes
+    BOLD = '\033[1m'
+    BLUE = '\033[94m'
+    GREEN = '\033[92m'
+    YELLOW = '\033[93m'
+    CYAN = '\033[96m'
+    MAGENTA = '\033[95m'
+    ORANGE = '\033[38;5;208m'
+    RED = '\033[91m'
+    RESET = '\033[0m'
+
+    def _print_evaluated_item(response: str, expected_response: str,
+                              evaluators_ran: List[str], item_results: Dict[str, Any],
+                              error: Optional[str] = None) -> None:
+        """Print the body of a single evaluated item (single-turn prompt or multi-turn turn).
+
+        The item header (Prompt X / Turn X) is printed by the caller; this helper
+        prints evaluators, response, expected response, error, and metrics.
+        """
+        if evaluators_ran:
+            print(f"{BOLD}{CYAN}Evaluators:{RESET} {', '.join(evaluators_ran)}")
+        if response:
+            print(f"{BOLD}{CYAN}Response:{RESET} {response}")
+        if expected_response:
+            print(f"{BOLD}{YELLOW}Expected Response:{RESET} {expected_response}")
+        if error:
+            print(f"{BOLD}{RED}Error:{RESET} {error}")
+
+        for eval_name, v in item_results.items():
+            if v is None:
+                continue
+            display_name = pascal_case_to_title(eval_name)
+            if eval_name == RELEVANCE:
+                color = MAGENTA
+            elif eval_name == COHERENCE:
+                color = ORANGE
+            else:
+                color = BLUE
+            print(f"{BOLD}{color}{display_name}:{RESET} {json.dumps(v, indent=4)}")
+
+    # Show metadata
+    metadata_parts = []
+    if agent_name:
+        metadata_parts.append(f"Agent Name: {agent_name}")
+    if agent_id:
+        metadata_parts.append(f"Agent ID: {agent_id}")
+    if cli_version:
+        metadata_parts.append(f"CLI Version: {cli_version}")
+    if metadata_parts:
+        print(f"{BOLD}{CYAN}{' | '.join(metadata_parts)}{RESET}")
+        print()
+
+    aggregates = calculate_aggregate_statistics(results)
+    if aggregates:
+        total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
+        if total_items > 1:
+            print(f"{BOLD}{BLUE}Aggregate Statistics ({total_items} prompts):{RESET}")
+            print(f"{BLUE}{'=' * 60}{RESET}")
+
+            for metric_name, stats in aggregates.items():
+                pass_color = GREEN if stats['pass_rate'] >= 80 else YELLOW if stats['pass_rate'] >= 60 else RED
+                prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
+                total_prompts = stats.get('total_prompts', total_items)
+                print(f"{BOLD}{CYAN}{metric_name}:{RESET} ({prompts_evaluated}/{total_prompts} prompts)")
+                print(f" Pass Rate: {pass_color}{stats['pass_rate']:.1f}%{RESET} ({stats['pass_count']}/{stats['total_evaluated']} passed)")
+                print(f" Avg Score: {MAGENTA}{stats['avg_score']:.2f}{RESET}")
+                if stats.get('threshold') is not None:
+                    print(f" Threshold: {YELLOW}{stats['threshold']}{RESET}")
+                print()
+
+            print(f"{BLUE}{'=' * 60}{RESET}")
+            print()
+
+    print(f"{BOLD}{BLUE}Individual Results:{RESET}")
+    print(f"{BLUE}{'=' * 50}{RESET}")
+    for i, result in enumerate(results, 1):
+        if result.get("type") == "multi_turn":
+            thread_name = result.get("name", "Unnamed Thread")
+            summary = result.get("summary", {})
+            status = summary.get("overall_status", STATUS_UNKNOWN)
+            status_color = GREEN if status == STATUS_PASS else YELLOW if status == STATUS_PARTIAL else RED
+
+            print(f"{BOLD}{MAGENTA}Thread {i}: {thread_name}{RESET}")
+            for t_idx, turn in enumerate(result.get("turns", []), 1):
+                turn_status = turn.get("status", STATUS_UNKNOWN)
+                turn_color = GREEN if turn_status == STATUS_PASS else RED if turn_status in (STATUS_FAIL, STATUS_ERROR) else YELLOW
+                print(f"{BOLD}{turn_color}Turn {t_idx}:{RESET} [{turn_status}] {turn.get('prompt', '')}")
+                _print_evaluated_item(
+                    response=turn.get("response", ""),
+                    expected_response=turn.get("expected_response", ""),
+                    evaluators_ran=turn.get("evaluators_ran", []),
+                    item_results=turn.get("results", {}),
+                    error=turn.get("error"),
+                )
+                print()
+            print(f"{BOLD}{MAGENTA}Thread {i} Summary:{RESET}")
+            print(f" Status: {status_color}{status.upper()}{RESET}")
+            print(f" Turns passed: {status_color}{summary.get('turns_passed', 0)}/{summary.get('turns_total', 0)}{RESET}")
+            print(f"{BLUE}{'-' * 30}{RESET}")
+        else:
+            print(f"{BOLD}{GREEN}Prompt {i}:{RESET} {result['prompt']}")
+            _print_evaluated_item(
+                response=result.get('response', ''),
+                expected_response=result.get('expected_response', ''),
+                evaluators_ran=result.get('evaluators_ran', []),
+                item_results=result.get('results', {}),
+                error=result.get('errorDetails'),
+            )
+            print(f"{BLUE}{'-' * 30}{RESET}")
+
+
+def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
+    """Extract an EvalScore object from a decorated metric dict.
+
+    Maps internal decorated-metric format to schema EvalScore:
+    {score, result, threshold} (required) + reason, evaluator (optional).
+    """
+    score_val = None
+    if metric_id in data and isinstance(data[metric_id], (int, float)):
+        score_val = data[metric_id]
+    if score_val is None:
+        return None
+
+    result = data.get("result")
+    if result not in (STATUS_PASS, STATUS_FAIL):
+        result = STATUS_PASS if score_val >= data.get("threshold", DEFAULT_PASS_THRESHOLD) else STATUS_FAIL
+
+    eval_score: Dict[str, Any] = {
+        "score": score_val,
+        "result": result,
+        "threshold": data.get("threshold", DEFAULT_PASS_THRESHOLD),
+    }
+    reason = data.get(f"{metric_id}_reason") or data.get("reason")
+    if reason:
+        eval_score["reason"] = reason
+    return eval_score
+
+
+def _convert_scores_to_schema(results_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """Convert raw evaluator results to schema-compliant score objects.
+
+    Evaluator results in results_dict are dicts (from _decorate_metric) or
+    None when skipped/crashed. None values are omitted from output.
+    """
+    scores: Dict[str, Any] = {}
+
+    for eval_key, schema_key in [
+        (RELEVANCE, "relevance"),
+        (COHERENCE, "coherence"),
+        (GROUNDEDNESS, "groundedness"),
+        (SIMILARITY, "similarity"),
+        (TOOL_CALL_ACCURACY, "toolCallAccuracy"),
+    ]:
+        data = results_dict.get(eval_key)
+        if data is None:
+            continue
+        eval_score = extract_eval_score(data, METRIC_IDS[eval_key])
+        if eval_score:
+            scores[schema_key] = eval_score
+
+    data = results_dict.get(CITATIONS)
+    if data is not None:
+        count = data.get("citations", 0)
+        cit_result = data.get("result")
+        if cit_result not in (STATUS_PASS, STATUS_FAIL):
+            cit_result = STATUS_PASS if count >= data.get("threshold", 1) else STATUS_FAIL
+        citation_score: Dict[str, Any] = {
+            "count": count,
+            "result": cit_result,
+            "threshold": data.get("threshold", 1),
+        }
+        if "citation_format" in data:
+            citation_score["format"] = data["citation_format"]
+        scores["citations"] = citation_score
+
+    data = results_dict.get(EXACT_MATCH)
+    if data is not None:
+        is_match = data.get("exact_match", 0.0) == 1.0
+        scores["exactMatch"] = {
+            "match": is_match,
+            "result": data.get("result", STATUS_PASS if is_match else STATUS_FAIL),
+            "reason": data.get("exact_match_reason", ""),
+        }
+
+    data = results_dict.get(PARTIAL_MATCH)
+    if data is not None:
+        scores["partialMatch"] = {
+            "score": data.get("partial_match", 0.0),
+            "result": data.get("result", STATUS_FAIL),
+            "threshold": data.get("threshold", 0.5),
+            "reason": data.get("partial_match_reason", ""),
+        }
+
+    return scores
+
+
+def convert_result_to_eval_item(result: Dict) -> Dict:
+    """Convert an internal evaluation result dict to a schema-compliant EvalItem."""
+    item: Dict[str, Any] = {
+        "prompt": result["prompt"],
+        "response": result["response"],
+        "expected_response": result["expected_response"],
+    }
+
+    if "evaluators" in result:
+        item["evaluators"] = result["evaluators"]
+    if "evaluators_mode" in result:
+        item["evaluators_mode"] = result["evaluators_mode"]
+
+    scores = _convert_scores_to_schema(result.get("results", {}))
+    if scores:
+        item["scores"] = scores
+
+    return item
+
+
+def convert_thread_result_to_output(thread_result: Dict) -> Dict:
+    """Convert a multi-turn thread result to the output format."""
+    output_turns = []
+    for turn in thread_result.get("turns", []):
+        output_turn: Dict[str, Any] = {"prompt": turn.get("prompt", "")}
+        if "expected_response" in turn:
+            output_turn["expected_response"] = turn["expected_response"]
+        if "response" in turn:
+            output_turn["response"] = turn["response"]
+        if "status" in turn:
+            output_turn["status"] = turn["status"]
+        if "error" in turn:
+            output_turn["error"] = turn["error"]
+        if "evaluators" in turn:
+            output_turn["evaluators"] = turn["evaluators"]
+        if "evaluators_mode" in turn:
+            output_turn["evaluators_mode"] = turn["evaluators_mode"]
+
+        scores = _convert_scores_to_schema(turn.get("results", {}))
+        if scores:
+            output_turn["scores"] = scores
+
+        output_turns.append(output_turn)
+
+    output: Dict[str, Any] = {}
+    if thread_result.get("name"):
+        output["name"] = thread_result["name"]
+    if thread_result.get("description"):
+        output["description"] = thread_result["description"]
+    if thread_result.get("conversation_id"):
+        output["conversation_id"] = thread_result["conversation_id"]
+    output["turns"] = output_turns
+    if thread_result.get("summary"):
+        output["summary"] = thread_result["summary"]
+
+    return output
+
+
+def convert_result_to_output_item(result: Dict) -> Dict:
+    """Convert an internal result dict to an output item. Routes by type."""
+    if result.get("type") == "multi_turn":
+        return convert_thread_result_to_output(result)
+    return convert_result_to_eval_item(result)
+
+
+def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None,
+                          default_evaluators: Optional[Dict[str, Any]] = None,
+                          agent_name: Optional[str] = None,
+                          cli_version: Optional[str] = None):
+    """Write results to a schema-compliant eval document JSON file.
+
+    Output follows the eval-document.schema.json format:
+    {schemaVersion, metadata, default_evaluators?, items: [EvalItem]}
+    """
+    try:
+        try:
+            current_version = SchemaVersionManager().get_current_version()
+        except Exception:
+            current_version = "1.0.0"
+
+        items = [convert_result_to_output_item(r) for r in results]
+
+        metadata: Dict[str, Any] = {
+            "evaluatedAt": datetime.now(timezone.utc).isoformat(),
+        }
+        if agent_id:
+            metadata["agentId"] = agent_id
+        if agent_name:
+            metadata["agentName"] = agent_name
+        if cli_version:
+            metadata["cliVersion"] = cli_version
+
+        output_data: Dict[str, Any] = {
+            "schemaVersion": current_version,
+            "metadata": metadata,
+        }
+
+        if default_evaluators is not None:
+            output_data["default_evaluators"] = default_evaluators
+
+        output_data["items"] = items
+
+        os.makedirs(os.path.dirname(os.path.abspath(output_file)), exist_ok=True)
+        with open(output_file, 'w', encoding='utf-8') as f:
+            json.dump(output_data, f, indent=2, ensure_ascii=False)
+        emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
+    except Exception as e:
+        emit_structured_log("error", f"Error writing to JSON file: {e}", operation=Operation.WRITE_OUTPUT)
+        sys.exit(1)
+
+
+def _results_to_csv_json(results_dict: Dict) -> str:
+    """Serialize evaluator results dict to a CSV-safe JSON string.
+
+    Skips None (crashed/skipped evaluators). Results are dicts produced
+    by _decorate_metric.
+    """
+    if not results_dict:
+        return ""
+    non_null = {k: v for k, v in results_dict.items() if v is not None}
+    return json.dumps(non_null) if non_null else ""
+
+
+def write_results_to_csv(results: List[Dict], output_file: str,
+                         agent_name: Optional[str] = None, agent_id: Optional[str] = None,
+                         cli_version: Optional[str] = None):
+    """Write results to CSV file."""
+    try:
+        os.makedirs(os.path.dirname(os.path.abspath(output_file)), exist_ok=True)
+        with open(output_file, 'w', newline='', encoding='utf-8') as f:
+            if results:
+                metadata_parts = []
+                if agent_name:
+                    metadata_parts.append(f"Agent Name: {agent_name}")
+                if agent_id:
+                    metadata_parts.append(f"Agent ID: {agent_id}")
+                if cli_version:
+                    metadata_parts.append(f"CLI Version: {cli_version}")
+                if metadata_parts:
+                    f.write(f"# {' | '.join(metadata_parts)}\n")
+
+                aggregates = calculate_aggregate_statistics(results)
+                if aggregates:
+                    total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
+                    if total_items > 1:
+                        f.write("# AGGREGATE STATISTICS\n")
+                        f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
+                        for metric_name, stats in aggregates.items():
+                            threshold_str = str(stats.get('threshold', 'N/A'))
+                            prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
+                            total_prompts = stats.get('total_prompts', total_items)
+                            f.write(f"{metric_name},{prompts_evaluated},{total_prompts},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
+                        f.write("\n# INDIVIDUAL RESULTS\n")
+
+            single_turn_rows = []
+            multi_turn_rows = []
+            for result in results:
+                if result.get("type") == "multi_turn":
+                    thread_name = result.get("name", "")
+                    for turn_idx, turn in enumerate(result.get("turns", [])):
+                        multi_turn_rows.append({
+                            "thread_name": thread_name,
+                            "turn_index": turn_idx + 1,
+                            "prompt": turn.get("prompt", ""),
+                            "response": turn.get("response", ""),
+                            "expected_response": turn.get("expected_response", ""),
+                            "status": turn.get("status", ""),
+                            "error": turn.get("error", ""),
+                            "scores": _results_to_csv_json(turn.get("results", {})),
+                        })
+                    summary = result.get("summary", {})
+                    multi_turn_rows.append({
+                        "thread_name": thread_name,
+                        "turn_index": "summary",
+                        "prompt": "",
+                        "response": "",
+                        "expected_response": "",
+                        "status": summary.get("overall_status", ""),
+                        "scores": f"{summary.get('turns_passed', 0)}/{summary.get('turns_total', 0)} turns passed",
+                    })
+                else:
+                    exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode', '_enhanced_response', 'results'}
+                    row = {k: v for k, v in result.items() if k not in exclude_keys}
+                    if "results" in result:
+                        row["scores"] = _results_to_csv_json(result["results"])
+                    single_turn_rows.append(row)
+
+            if single_turn_rows:
+                if multi_turn_rows:
+                    f.write("# SINGLE-TURN RESULTS\n")
+                fieldnames = list(single_turn_rows[0].keys())
+                for row in single_turn_rows:
+                    for k in row:
+                        if k not in fieldnames:
+                            fieldnames.append(k)
+                writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
+                writer.writeheader()
+                writer.writerows(single_turn_rows)
+
+            if multi_turn_rows:
+                if single_turn_rows:
+                    f.write("\n")
+                f.write("# MULTI-TURN RESULTS\n")
+                fieldnames = ["thread_name", "turn_index", "prompt", "response", "expected_response", "status", "error", "scores"]
+                writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
+                writer.writeheader()
+                writer.writerows(multi_turn_rows)
+        emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
+    except Exception as e:
+        emit_structured_log("error", f"Error writing to CSV file: {e}", operation=Operation.WRITE_OUTPUT)
+        sys.exit(1)
+
+
+def output_results(results: List[Dict], config: RunConfig, default_evaluators: Optional[Dict[str, Any]] = None,
+                   agent_name: Optional[str] = None, cli_version: Optional[str] = None):
+    """Output results based on specified format."""
+    metadata_kwargs = dict(
+        agent_name=agent_name,
+        agent_id=config.m365_agent_id,
+        cli_version=cli_version,
+    )
+    if config.output:
+        output_lower = config.output.lower()
+        if output_lower.endswith('.json'):
+            write_results_to_json(results, config.output, default_evaluators=default_evaluators,
+                                  **metadata_kwargs)
+        elif output_lower.endswith('.csv'):
+            write_results_to_csv(results, config.output, **metadata_kwargs)
+        elif output_lower.endswith('.html'):
+            write_results_to_html(results, config.output, **metadata_kwargs)
+            abs_path = os.path.abspath(config.output)
+            webbrowser.open(f'file://{abs_path}')
+        else:
+            write_results_to_json(results, config.output, default_evaluators=default_evaluators,
+                                  **metadata_kwargs)
+    else:
+        write_results_to_console(results, **metadata_kwargs)
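The new result_writer.py above is driven through output_results(), which routes by the output file extension (.json, .csv, .html; anything else falls back to JSON, and no output path means the colorized console report). As a quick orientation, here is a minimal usage sketch; it is not part of the package diff, and the import path plus the shape of the hand-written result dict are assumptions inferred from convert_result_to_eval_item() and write_results_to_json() shown above (the real CLI presumably assembles these dicts in evaluation_runner.py).

```python
# Hypothetical sketch (not from the package): drive the new JSON writer directly.
# The import path and the minimal result-dict shape are assumptions based on the
# converters shown in the diff above.
from result_writer import write_results_to_json

results = [{
    "prompt": "What is the refund policy?",
    "response": "Refunds are processed within 5 business days.",
    "expected_response": "Refunds take about a week.",
    # Per-evaluator result dicts keyed by the metric constants from common.py;
    # None values mean the evaluator was skipped or crashed and are omitted.
    "results": {},
}]

# Produces an eval-document.schema.json-style file:
# {schemaVersion, metadata: {evaluatedAt, agentId, agentName, cliVersion}, items: [...]}
write_results_to_json(
    results,
    "out/eval-results.json",
    agent_id="my-agent-id",
    agent_name="Contoso Agent",
    cli_version="1.6.0-preview.1",
)
```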
package/src/clients/node-js/bin/runevals.js
@@ -9,8 +9,13 @@ import { ensureVenv, executePythonCli } from '../lib/venv-manager.js';
 import { getCacheStats, clearCache, formatBytes } from '../lib/cache-utils.js';
 import { checkPackageExpiry } from '../lib/expiry-check.js';
 import { recordAcceptance, checkAcceptance } from '../lib/eula-manager.js';
-import { ProgressReporter } from '../lib/progress.js';
+import { ProgressReporter, isInteractiveTerminal } from '../lib/progress.js';
 import { _loadEnvFile as loadEnvFile, _loadUserEnvOverride } from '../lib/env-loader.js';
+import {
+  _handlePythonVersionMismatch,
+  _buildInitializationFailureLines,
+  _promptForContinueWithMismatch,
+} from '../lib/version-check.js';
 import { normalizeAgentId } from '../lib/agent-id.js';
 
 // Check package expiry (exits if expired, warns if close to expiry)
@@ -38,10 +43,9 @@ const REQUIREMENTS_FILE = path.join(PYTHON_CLI_DIR, 'requirements.txt');
  */
 async function setDefaultEnvironmentConstants() {
   const config = (await import('../config/default.js')).default;
-  process.env.
-  process.env.
-  process.env.
-  process.env.X_SCENARIO_HEADER = config.copilotApi.scenarioHeader;
+  process.env.WORK_IQ_A2A_ENDPOINT = config.workIq.a2aEndpoint;
+  process.env.WORK_IQ_A2A_CLIENT_ID = config.workIq.a2aClientId;
+  process.env.WORK_IQ_A2A_SCOPES = config.workIq.a2aScopes;
 }
 
 /**
@@ -131,10 +135,26 @@ async function initializePythonEnvironment(verbose = false, quiet = false) {
 
   try {
     // Step 1: Ensure Python runtime is available (handles download + extract phases)
-    await ensurePythonRuntime(verbose, onProgress);
+    const runtime = await ensurePythonRuntime(verbose, onProgress);
+
+    // Step 2: Handle version mismatch from PYTHON_PATH fallback.
+    // The decision tree (EOL block, interactive prompt, non-interactive
+    // auto-reject) lives in _handlePythonVersionMismatch so it is
+    // unit-testable without spawning the CLI; we only own the readline
+    // wiring and the actual process.exit here.
+    const mismatch = await _handlePythonVersionMismatch({
+      runtime,
+      isInteractive: isInteractiveTerminal(),
+      promptForContinue: _promptForContinueWithMismatch,
+      warn: (msg) => console.warn(msg),
+      error: (msg) => console.error(msg),
+    });
+    if (mismatch.shouldExit) {
+      process.exit(mismatch.exitCode ?? 1);
+    }
 
-    // Step
-    await ensureVenv(REQUIREMENTS_FILE, verbose, onProgress);
+    // Step 3: Ensure venv with dependencies is set up (handles venv + deps phases)
+    await ensureVenv(REQUIREMENTS_FILE, verbose, onProgress, runtime.pythonPath);
 
     // Show completion summary
     reporter.complete();
@@ -145,11 +165,12 @@ async function initializePythonEnvironment(verbose = false, quiet = false) {
       console.error('\nFull error:', error);
     }
 
-
-
-
-
-
+    for (const line of _buildInitializationFailureLines({
+      error,
+      platform: process.platform,
+    })) {
+      console.error(line);
+    }
 
     process.exit(1);
   }
package/src/clients/node-js/config/default.js
@@ -2,25 +2,22 @@
  * Build-time injected default values
  * DO NOT EDIT - This file is auto-generated during build.
  *
- * Generated: 2026-
+ * Generated: 2026-05-07T22:53:22.056Z
  *
  * @copyright Microsoft Corporation. All rights reserved.
  * @license MIT
  */
 
 export default {
-
-  /**
-
+  workIq: {
+    /** Work IQ A2A Endpoint */
+    a2aEndpoint: "https://graph.microsoft.com/rp/workiq",
 
-  /**
-
+    /** Work IQ A2A Client ID */
+    a2aClientId: "ba081686-5d24-4bc6-a0d6-d034ecffed87",
 
-  /**
-
-
-    /** Scenario Header for Copilot API */
-    scenarioHeader: "agenticevaluation"
+    /** Work IQ A2A OAuth Scopes */
+    a2aScopes: "Sites.Read.All Mail.Read People.Read.All OnlineMeetingTranscript.Read.All Chat.Read ChannelMessage.Read.All ExternalItem.Read.All"
   },
   eula: {
     /** EULA version string for acceptance tracking */
package/src/clients/node-js/lib/env-loader.js
@@ -9,10 +9,9 @@ import path from 'path';
 
 // Keys that cannot be overridden from .env files (baked in via default.js config)
 const PROTECTED_KEYS = [
-  '
-  '
-  '
-  'X_SCENARIO_HEADER',
+  'WORK_IQ_A2A_ENDPOINT',
+  'WORK_IQ_A2A_CLIENT_ID',
+  'WORK_IQ_A2A_SCOPES',
 ];
 
 // Aliases resolved into M365_AGENT_ID (first match wins)