@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.5.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +135 -100
- package/package.json +7 -4
- package/schema/CHANGELOG.md +7 -0
- package/schema/v1/eval-document.schema.json +143 -11
- package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
- package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
- package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
- package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
- package/schema/v1/examples/valid/multi-turn-output.json +59 -0
- package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
- package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
- package/schema/version.json +2 -2
- package/src/clients/cli/agent_selector.py +74 -0
- package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
- package/src/clients/cli/api_clients/A2A/a2a_client.py +475 -0
- package/src/clients/cli/api_clients/__init__.py +3 -0
- package/src/clients/cli/api_clients/base_agent_client.py +77 -0
- package/src/clients/cli/cli_args.py +136 -0
- package/src/clients/cli/cli_logging/cli_logger.py +33 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +56 -2
- package/src/clients/cli/cli_logging/logging_utils.py +0 -1
- package/src/clients/cli/common.py +64 -0
- package/src/clients/cli/env_validator.py +73 -0
- package/src/clients/cli/evaluation_runner.py +653 -0
- package/src/clients/cli/evaluator_resolver.py +9 -6
- package/src/clients/cli/generate_report.py +272 -129
- package/src/clients/cli/main.py +157 -1174
- package/src/clients/cli/parallel_executor.py +57 -0
- package/src/clients/cli/prompt_loader.py +148 -0
- package/src/clients/cli/readme.md +9 -53
- package/src/clients/cli/requirements.txt +1 -1
- package/src/clients/cli/response_extractor.py +4 -603
- package/src/clients/cli/result_writer.py +488 -0
- package/src/clients/cli/retry_policy.py +52 -0
- package/src/clients/cli/samples/multiturn_example.json +35 -0
- package/src/clients/cli/throttle_gate.py +82 -0
- package/src/clients/node-js/bin/runevals.js +82 -20
- package/src/clients/node-js/config/default.js +12 -11
- package/src/clients/node-js/lib/agent-id.js +12 -0
- package/src/clients/node-js/lib/env-loader.js +14 -20
- package/src/clients/node-js/lib/eula-manager.js +78 -0
- package/src/clients/node-js/lib/progress.js +13 -11
package/src/clients/cli/main.py
CHANGED
@@ -1,1097 +1,57 @@
-
+"""M365 Copilot Agent Evaluation CLI — thin orchestrator.
+
+Delegates to focused modules:
+    cli_args – argument parsing & version-check bypass
+    env_validator – environment validation & URL security
+    prompt_loader – dataset loading & agent selection
+    evaluation_runner – pipeline, evaluator dispatch, retry
+    result_writer – console / JSON / CSV / HTML output
+"""
+
 import os
-import argparse
 import sys
-import
-
-import
-import webbrowser
-import urllib.request
-import urllib.error
-import urllib.parse
-import questionary
-from enum import Enum
-from typing import List, Dict, Tuple, Optional, Any
-from azure.ai.evaluation import (
-    AzureOpenAIModelConfiguration,
-    RelevanceEvaluator,
-    CoherenceEvaluator,
-    GroundednessEvaluator,
-    ToolCallAccuracyEvaluator
-)
+import traceback
+
+from azure.ai.evaluation import AzureOpenAIModelConfiguration
 from dotenv import load_dotenv
+
+from api_clients.A2A import A2AClient
 from auth.auth_handler import AuthHandler
-from
-from custom_evaluators.ExactMatchEvaluator import ExactMatchEvaluator
-from custom_evaluators.PartialMatchEvaluator import PartialMatchEvaluator
-from generate_report import generate_html_report, calculate_aggregate_statistics
-from response_extractor import extract_enhanced_responses, get_response_text_for_evaluation
-from schema_handler import DocumentUpgrader, SchemaVersionManager
-from common import (
-    RELEVANCE,
-    COHERENCE,
-    GROUNDEDNESS,
-    TOOL_CALL_ACCURACY,
-    CITATIONS,
-    EXACT_MATCH,
-    PARTIAL_MATCH,
-    REQUIRES_AZURE_OPENAI,
-    REQUIRES_TOOL_DEFINITIONS,
-    METRIC_IDS,
-    pascal_case_to_title,
-)
-from evaluator_resolver import (
-    EVALUATOR_REGISTRY,
-    validate_evaluator_names,
-    check_prerequisites,
-    resolve_default_evaluators,
-    resolve_evaluators_for_prompt,
-    get_evaluator_threshold,
-)
+from evaluator_resolver import resolve_default_evaluators
 from version_check import check_min_version, get_cli_version
-from datetime import datetime, timezone
-from pathlib import Path
-import tzlocal

-from cli_logging.
-
-
-
-
-    'substrate.office.com'
-]
-
-class CallPath(Enum):
-    """ Enum to indicate which call path to use. """
-    ACCESS_TOKEN = "access_token"
-    COPILOT_AUTH = "copilot_auth"
-
-
-# Flags that should bypass remote min-version enforcement.
-# --help is not needed here because argparse exits before runtime checks.
-VERSION_CHECK_BYPASS_FLAGS = (
-    "signout",
+from cli_logging.cli_logger import (
+    CLI_LOGGER,
+    DIAGNOSTIC_RECORDS,
+    configure_cli_logging,
+    emit_structured_log,
 )
+from cli_logging.logging_utils import Operation, resolve_log_level

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    context = {
-        "request-id": None,
-        "conversation-id": None,
-        "message-id": None,
-        "operation": operation,
-    }
-    entry = format_structured_log_entry(
-        level=level,
-        message=message,
-        logger_name=CLI_LOGGER_NAME,
-        run_context=context,
-    )
-    DIAGNOSTIC_RECORDS.append(entry)
-
-    try:
-        CLI_LOGGER.log(LOG_LEVEL_MAP.get(level, logging.INFO), render_diagnostic(entry))
-    except Exception:
-        pass
-
-
-def should_bypass_min_version_check(args: argparse.Namespace) -> bool:
-    """Return True if the current invocation should skip min-version checks."""
-    return any(getattr(args, flag, False) for flag in VERSION_CHECK_BYPASS_FLAGS)
-
-def write_results_to_html(results: List[Dict], output_file: str,
-                          agent_name: Optional[str] = None, agent_id: Optional[str] = None,
-                          cli_version: Optional[str] = None):
-    """Write results to HTML file using generate_html_report from generate_report.py."""
-    try:
-        html = generate_html_report(results, agent_name=agent_name, agent_id=agent_id,
-                                    cli_version=cli_version)
-        with open(output_file, 'w', encoding='utf-8') as f:
-            f.write(html)
-        emit_structured_log("info", f"HTML report saved to {output_file}", operation=Operation.WRITE_OUTPUT)
-    except Exception as e:
-        emit_structured_log("error", f"Error writing to HTML file: {e}", operation=Operation.WRITE_OUTPUT)
-        sys.exit(1)
-
-def get_default_prompts_and_responses():
-    """Get a list of prompts and responses."""
-    prompts = [
-        "What is Microsoft Graph?"
-    ]
-    expected_responses = [
-        "Microsoft Graph is a gateway to data and intelligence in Microsoft 365."
-    ]
-    return prompts, expected_responses
-
-def load_prompts_from_file(file_path: str) -> Tuple[List[Dict], Optional[Dict]]:
-    """Load prompts and expected responses from a JSON file.
-
-    Supports three formats:
-    1. Eval document: {"schemaVersion": "1.0.0", "items": [{"prompt": "..."}]}
-    2. Array format: [{"prompt": "...", "expected_response": "..."}]
-    3. Dict format: {"prompts": [...], "expected_responses": [...]}
-
-    For eval documents (format 1) and array format (format 2), schema validation
-    and auto-upgrade are applied via DocumentUpgrader.
-
-    Returns:
-        Tuple of (eval_items, default_evaluators). Items are dicts with prompt,
-        expected_response, and optional evaluators/evaluators_mode fields.
-    """
-    try:
-        with open(file_path, 'r', encoding='utf-8') as f:
-            data = json.load(f)
-
-        # Detect if this is an eval document (has "items" key) or could be upgraded
-        is_eval_document = (
-            isinstance(data, dict) and "items" in data
-        ) or isinstance(data, list)
-
-        # Run schema validation and auto-upgrade for eval documents
-        if is_eval_document:
-            try:
-                upgrader = DocumentUpgrader()
-            except Exception as e:
-                # Schema infrastructure not available (missing files, etc.) — skip
-                emit_structured_log("warning", f"Unable to initialize document upgrader: {e}", operation=Operation.LOAD_PROMPTS)
-                upgrader = None
-
-            if upgrader is not None:
-                result = upgrader.upgrade(Path(file_path))
-
-                if result.error:
-                    emit_structured_log("error", f"Schema validation error: {result.error}", operation=Operation.LOAD_PROMPTS)
-                    sys.exit(1)
-
-                if result.upgraded and result.message:
-                    emit_structured_log("info", result.message, operation=Operation.LOAD_PROMPTS)
-
-                # Use the parsed document from the upgrade result
-                if result.document is not None:
-                    data = result.document
-
-        if isinstance(data, list):
-            # Format: [{"prompt": "...", "expected_response": "..."}, ...]
-            return data, None
-        elif isinstance(data, dict):
-            if "items" in data:
-                # Eval document format: {"schemaVersion": "...", "items": [...]}
-                return data["items"], data.get("default_evaluators")
-            else:
-                # Format: {"prompts": [...], "expected_responses": [...]}
-                prompts = data.get("prompts", [])
-                expected_responses = data.get("expected_responses", [])
-                eval_items = [
-                    {"prompt": p, "expected_response": e}
-                    for p, e in zip(prompts, expected_responses)
-                ]
-                return eval_items, None
-        else:
-            raise ValueError("Invalid file format")
-    except SystemExit:
-        raise
-    except Exception as e:
-        emit_structured_log("error", f"Error loading prompts from file: {e}", operation=Operation.LOAD_PROMPTS)
-        sys.exit(1)
-
-def get_interactive_prompts() -> Tuple[List[str], List[str]]:
-    """Get prompts and expected responses interactively."""
-    prompts = []
-    expected_responses = []
-
-    print("Interactive mode: Enter your prompts and expected responses.")
-    print("Press Enter with empty prompt to finish.")
-
-    while True:
-        prompt = input(f"\nPrompt {len(prompts) + 1}: ").strip()
-        if not prompt:
-            break
-
-        expected = input(f"Expected response {len(expected_responses) + 1}: ").strip()
-
-        prompts.append(prompt)
-        expected_responses.append(expected)
-
-    if not prompts:
-        print("No prompts entered. Exiting.")
-        sys.exit(1)
-
-    return prompts, expected_responses
-
-def run_evaluations(args, responses: List[Dict[str, Any]], eval_items: List[Dict],
-                    default_evaluators: Dict[str, Any]) -> list:
-    """Run evaluations against the responses using per-prompt evaluator resolution.
-
-    Args:
-        args: CLI arguments.
-        responses: List of enhanced response dicts (one per prompt, aligned with eval_items by index).
-        eval_items: List of item dicts (prompt, expected_response, evaluators, evaluators_mode).
-        default_evaluators: Resolved default evaluators (from resolve_default_evaluators).
-    """
-    if len(responses) != len(eval_items):
-        raise ValueError(
-            f"Mismatch between number of responses ({len(responses)}) and evaluation items ({len(eval_items)})."
-        )
-
-    model_config = AzureOpenAIModelConfiguration(
-        azure_endpoint=os.environ.get("AZURE_AI_OPENAI_ENDPOINT"),
-        api_key=os.environ.get("AZURE_AI_API_KEY"),
-        api_version=os.environ.get("AZURE_AI_API_VERSION"),
-        azure_deployment=os.environ.get("AZURE_AI_MODEL_NAME"),
-    )
-
-    # Build available context for prerequisite checks
-    has_azure_openai = bool(
-        os.environ.get("AZURE_AI_OPENAI_ENDPOINT")
-        and os.environ.get("AZURE_AI_API_KEY")
-    )
-
-    DEFAULT_PASS_THRESHOLD = 3
-
-    def decorate_metric(metric_id: str, data, threshold: Optional[int] = None):
-        """Augment raw evaluator output with standardized threshold + pass/fail result."""
-        pass_threshold = threshold if threshold is not None else DEFAULT_PASS_THRESHOLD
-        payload = {}
-        if isinstance(data, dict):
-            payload.update(data)
-        else:
-            payload['raw'] = data
-
-        score_val = None
-        if isinstance(data, dict):
-            if metric_id in data:
-                score_val = data[metric_id]
-        if isinstance(score_val, (int, float)):
-            payload['threshold'] = pass_threshold
-            payload['result'] = 'pass' if score_val >= pass_threshold else 'fail'
-        else:
-            payload['threshold'] = pass_threshold
-            payload.setdefault('result', 'unknown')
-        return json.dumps(payload, indent=4)
-
-    # Validate all evaluator names upfront (across defaults and all items)
-    all_evaluator_maps = [default_evaluators]
-    for eval_item in eval_items:
-        if "evaluators" in eval_item:
-            all_evaluator_maps.append(eval_item["evaluators"])
-    for emap in all_evaluator_maps:
-        validate_evaluator_names(emap)
-
-    evaluation_results = []
-    for enhanced_response, eval_item in zip(responses, eval_items):
-        actual_response_text = get_response_text_for_evaluation(enhanced_response)
-        prompt = eval_item.get("prompt", "")
-        expected_response = eval_item.get("expected_response", "")
-        prompt_evaluators = eval_item.get("evaluators")
-        evaluators_mode = eval_item.get("evaluators_mode", "extend")
-
-        # Resolve evaluators for this prompt
-        resolved = resolve_evaluators_for_prompt(
-            prompt_evaluators, evaluators_mode, prompt, default_evaluators,
-        )
-
-        # Build runtime context for prerequisite checks
-        has_tool_defs = bool(
-            args.m365_agent_id and enhanced_response.get("tool_definitions")
-        )
-        available_context = {
-            REQUIRES_AZURE_OPENAI: has_azure_openai,
-            REQUIRES_TOOL_DEFINITIONS: has_tool_defs,
-        }
-
-        results_dict: Dict[str, Optional[str]] = {}
-        evaluators_ran: List[str] = []
-
-        for eval_name, eval_options in resolved.items():
-            # Check prerequisites
-            can_run, warn_msg = check_prerequisites(eval_name, available_context)
-            if not can_run:
-                if warn_msg:
-                    emit_structured_log("warning", f"Evaluator '{eval_name}' prerequisite check failed: {warn_msg}", operation=Operation.EVALUATE)
-                results_dict[eval_name] = None
-                continue
-
-            evaluators_ran.append(eval_name)
-            threshold = get_evaluator_threshold(eval_name, eval_options)
-
-            if eval_name == RELEVANCE:
-                raw_score = RelevanceEvaluator(model_config=model_config)(query=prompt, response=actual_response_text)
-                results_dict[RELEVANCE] = decorate_metric(METRIC_IDS[RELEVANCE], raw_score, threshold)
-            elif eval_name == COHERENCE:
-                raw_score = CoherenceEvaluator(model_config=model_config)(query=prompt, response=actual_response_text)
-                results_dict[COHERENCE] = decorate_metric(METRIC_IDS[COHERENCE], raw_score, threshold)
-            elif eval_name == GROUNDEDNESS:
-                raw_score = GroundednessEvaluator(model_config=model_config)(response=actual_response_text, context=expected_response)
-                results_dict[GROUNDEDNESS] = decorate_metric(METRIC_IDS[GROUNDEDNESS], raw_score, threshold)
-            elif eval_name == TOOL_CALL_ACCURACY:
-                raw_score = ToolCallAccuracyEvaluator(model_config)(
-                    query=prompt,
-                    response=enhanced_response.get("response", actual_response_text),
-                    tool_definitions=enhanced_response["tool_definitions"],
-                )
-                results_dict[TOOL_CALL_ACCURACY] = decorate_metric(METRIC_IDS[TOOL_CALL_ACCURACY], raw_score, threshold)
-            elif eval_name == CITATIONS:
-                fmt_str = eval_options.get("citation_format", "oai_unicode")
-                fmt_map = {
-                    "oai_unicode": CitationFormat.OAI_UNICODE,
-                    "bracket": CitationFormat.LEGACY_BRACKET,
-                    "mixed": CitationFormat.AUTO,
-                }
-                raw_score = CitationsEvaluator(citation_format=fmt_map.get(fmt_str, CitationFormat.OAI_UNICODE))(response=actual_response_text)
-                results_dict[CITATIONS] = decorate_metric(METRIC_IDS[CITATIONS], raw_score, threshold)
-            elif eval_name == EXACT_MATCH:
-                # ExactMatch is binary (match/no-match) — it includes its own result
-                # field, so we skip decorate_metric which assumes a numeric score.
-                case_sensitive = eval_options.get("case_sensitive", False)
-                raw_score = ExactMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response_text, expected_answer=expected_response)
-                results_dict[EXACT_MATCH] = json.dumps(raw_score, indent=4)
-            elif eval_name == PARTIAL_MATCH:
-                case_sensitive = eval_options.get("case_sensitive", False)
-                raw_score = PartialMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response_text, expected_answer=expected_response)
-                results_dict[PARTIAL_MATCH] = decorate_metric(METRIC_IDS[PARTIAL_MATCH], raw_score, threshold)
-
-        evaluation_result = {
-            "prompt": prompt,
-            "response": actual_response_text,
-            "expected_response": expected_response,
-            "evaluators_ran": evaluators_ran,
-            "results": results_dict,
-        }
-
-        # Preserve evaluator config metadata for output
-        if "evaluators" in eval_item:
-            evaluation_result["evaluators"] = eval_item["evaluators"]
-        if "evaluators_mode" in eval_item:
-            evaluation_result["evaluators_mode"] = eval_item["evaluators_mode"]
-
-        if getattr(args, "effective_log_level", "info") == "debug":
-            emit_structured_log(
-                "debug",
-                f"Evaluation completed for prompt='{evaluation_result['prompt']}'. "
-                f"Evaluators: {', '.join(evaluators_ran)}. "
-                f"Scores: {evaluation_result['results']}",
-                operation=Operation.EVALUATE,
-            )
-
-        evaluation_results.append(evaluation_result)
-
-    return evaluation_results
-
-def write_results_to_console(results, agent_name: Optional[str] = None,
-                             agent_id: Optional[str] = None,
-                             cli_version: Optional[str] = None):
-    """Write the response to console."""
-    # ANSI color codes
-    BOLD = '\033[1m'
-    BLUE = '\033[94m'
-    GREEN = '\033[92m'
-    YELLOW = '\033[93m'
-    CYAN = '\033[96m'
-    MAGENTA = '\033[95m'
-    ORANGE = '\033[38;5;208m'
-    RED = '\033[91m'
-    RESET = '\033[0m'
-
-    # Show metadata
-    metadata_parts = []
-    if agent_name:
-        metadata_parts.append(f"Agent Name: {agent_name}")
-    if agent_id:
-        metadata_parts.append(f"Agent ID: {agent_id}")
-    if cli_version:
-        metadata_parts.append(f"CLI Version: {cli_version}")
-    if metadata_parts:
-        print(f"{BOLD}{CYAN}{' | '.join(metadata_parts)}{RESET}")
-        print()
-
-    # Show aggregate statistics if multiple results
-    if len(results) > 1:
-        aggregates = calculate_aggregate_statistics(results)
-        if aggregates:
-            print(f"{BOLD}{BLUE}Aggregate Statistics ({len(results)} prompts):{RESET}")
-            print(f"{BLUE}{'=' * 60}{RESET}")
-
-            for metric_name, stats in aggregates.items():
-                pass_color = GREEN if stats['pass_rate'] >= 80 else YELLOW if stats['pass_rate'] >= 60 else RED
-                prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
-                total_prompts = stats.get('total_prompts', len(results))
-                print(f"{BOLD}{CYAN}{metric_name}:{RESET} ({prompts_evaluated}/{total_prompts} prompts)")
-                print(f"  Pass Rate: {pass_color}{stats['pass_rate']:.1f}%{RESET} ({stats['pass_count']}/{stats['total_evaluated']} passed)")
-                print(f"  Avg Score: {MAGENTA}{stats['avg_score']:.2f}{RESET}")
-                if stats.get('threshold') is not None:
-                    print(f"  Threshold: {YELLOW}{stats['threshold']}{RESET}")
-                print()
-
-            print(f"{BLUE}{'=' * 60}{RESET}")
-            print()
-
-    print(f"{BOLD}{BLUE}Individual Results:{RESET}")
-    print(f"{BLUE}{'=' * 50}{RESET}")
-    for i, result in enumerate(results, 1):
-        print(f"{BOLD}{GREEN}Prompt {i}:{RESET} {result['prompt']}")
-
-        # Show which evaluators ran for this prompt
-        evaluators_ran = result.get('evaluators_ran', [])
-        if evaluators_ran:
-            print(f"{BOLD}{CYAN}Evaluators:{RESET} {', '.join(evaluators_ran)}")
-
-        print(f"{BOLD}{CYAN}Response:{RESET} {result['response']}")
-        print(f"{BOLD}{YELLOW}Expected Response:{RESET} {result['expected_response']}")
-
-        # Print metric scores from results
-        metrics = result.get('results', {})
-        for eval_name, v in metrics.items():
-            if v is None:
-                continue  # Skip null/N/A scores from skipped evaluators
-            display_name = pascal_case_to_title(eval_name)
-            if eval_name == RELEVANCE:
-                color = MAGENTA
-            elif eval_name == COHERENCE:
-                color = ORANGE
-            else:
-                color = BLUE
-            print(f"{BOLD}{color}{display_name}:{RESET} {v}")
-        print(f"{BLUE}{'-' * 30}{RESET}")
-
-def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
-    """Extract an EvalScore object from a decorated metric dict.
-
-    Maps internal decorated-metric format to schema EvalScore:
-    {score, result, threshold} (required) + reason, evaluator (optional).
-    """
-    DEFAULT_THRESHOLD = 3  # fallback; decorate_metric should always set this
-
-    score_val = None
-    if metric_id in data and isinstance(data[metric_id], (int, float)):
-        score_val = data[metric_id]
-    if score_val is None:
-        return None
-
-    result = data.get("result")
-    if result not in ("pass", "fail"):
-        result = "pass" if score_val >= data.get("threshold", DEFAULT_THRESHOLD) else "fail"
-
-    eval_score: Dict[str, Any] = {
-        "score": score_val,
-        "result": result,
-        "threshold": data.get("threshold", DEFAULT_THRESHOLD),
-    }
-    reason = data.get(f"{metric_id}_reason") or data.get("reason")
-    if reason:
-        eval_score["reason"] = reason
-    return eval_score
-
-
-def convert_result_to_eval_item(result: Dict) -> Dict:
-    """Convert an internal evaluation result dict to a schema-compliant EvalItem.
-
-    Internal format (from run_evaluations):
-        {prompt, response, expected_response, results: {Relevance: "JSON", ...},
-         evaluators_ran: [...], evaluators: {...}, evaluators_mode: "..."}
-    Schema EvalItem format:
-        {prompt, response, expected_response, scores: {relevance: EvalScore, ...},
-         evaluators: {...}, evaluators_mode: "..."}
-    """
-    item: Dict[str, Any] = {
-        "prompt": result["prompt"],
-        "response": result["response"],
-        "expected_response": result["expected_response"],
-    }
-
-    # Preserve evaluator config in output
-    if "evaluators" in result:
-        item["evaluators"] = result["evaluators"]
-    if "evaluators_mode" in result:
-        item["evaluators_mode"] = result["evaluators_mode"]
-
-    scores: Dict[str, Any] = {}
-    results_dict = result.get("results", {})
-
-    # EvalScore metrics (all share the same schema shape: {score, result, threshold})
-    for eval_key, schema_key in [
-        (RELEVANCE, "relevance"),
-        (COHERENCE, "coherence"),
-        (GROUNDEDNESS, "groundedness"),
-        (TOOL_CALL_ACCURACY, "toolCallAccuracy"),
-    ]:
-        raw = results_dict.get(eval_key)
-        if not raw:
-            continue
-        data = json.loads(raw) if isinstance(raw, str) else raw
-        eval_score = extract_eval_score(data, METRIC_IDS[eval_key])
-        if eval_score:
-            scores[schema_key] = eval_score
-
-    # Citations → CitationScore
-    raw_citations = results_dict.get(CITATIONS)
-    if raw_citations:
-        data = json.loads(raw_citations) if isinstance(raw_citations, str) else raw_citations
-        count = data.get("citations", 0)
-        cit_result = data.get("result")
-        if cit_result not in ("pass", "fail"):
-            cit_result = "pass" if count >= data.get("threshold", 1) else "fail"
-
-        citation_score: Dict[str, Any] = {
-            "count": count,
-            "result": cit_result,
-            "threshold": data.get("threshold", 1),
-        }
-        if "citation_format" in data:
-            citation_score["format"] = data["citation_format"]
-        scores["citations"] = citation_score
-
-    # ExactMatch → ExactMatchScore
-    raw_exact = results_dict.get(EXACT_MATCH)
-    if raw_exact:
-        data = json.loads(raw_exact) if isinstance(raw_exact, str) else raw_exact
-        is_match = data.get("exact_match", 0.0) == 1.0
-        scores["exactMatch"] = {
-            "match": is_match,
-            "result": data.get("result", "pass" if is_match else "fail"),
-            "reason": data.get("exact_match_reason", ""),
-        }
-
-    # PartialMatch → PartialMatchScore
-    raw_partial = results_dict.get(PARTIAL_MATCH)
-    if raw_partial:
-        data = json.loads(raw_partial) if isinstance(raw_partial, str) else raw_partial
-        scores["partialMatch"] = {
-            "score": data.get("partial_match", 0.0),
-            "result": data.get("result", "fail"),
-            "threshold": data.get("threshold", 0.5),
-            "reason": data.get("partial_match_reason", ""),
-        }
-
-    if scores:
-        item["scores"] = scores
-
-    return item
-
-
-def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None,
-                          default_evaluators: Optional[Dict[str, Any]] = None,
-                          agent_name: Optional[str] = None,
-                          cli_version: Optional[str] = None):
-    """Write results to a schema-compliant eval document JSON file.
-
-    Output follows the eval-document.schema.json format:
-    {schemaVersion, metadata, default_evaluators?, items: [EvalItem]}
-    """
-    try:
-        try:
-            current_version = SchemaVersionManager().get_current_version()
-        except Exception:
-            current_version = "1.0.0"
-
-        items = [convert_result_to_eval_item(r) for r in results]
-
-        metadata: Dict[str, Any] = {
-            "evaluatedAt": datetime.now(timezone.utc).isoformat(),
-        }
-        if agent_id:
-            metadata["agentId"] = agent_id
-        if agent_name:
-            metadata["agentName"] = agent_name
-        if cli_version:
-            metadata["cliVersion"] = cli_version
-
-        output_data: Dict[str, Any] = {
-            "schemaVersion": current_version,
-            "metadata": metadata,
-        }
-
-        if default_evaluators is not None:
-            output_data["default_evaluators"] = default_evaluators
-
-        output_data["items"] = items
-
-        with open(output_file, 'w', encoding='utf-8') as f:
-            json.dump(output_data, f, indent=2, ensure_ascii=False)
-        emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
-    except Exception as e:
-        emit_structured_log("error", f"Error writing to JSON file: {e}", operation=Operation.WRITE_OUTPUT)
-        sys.exit(1)
-
-def write_results_to_csv(results: List[Dict], output_file: str,
-                         agent_name: Optional[str] = None, agent_id: Optional[str] = None,
-                         cli_version: Optional[str] = None):
-    """Write results to CSV file."""
-    try:
-        with open(output_file, 'w', newline='', encoding='utf-8') as f:
-            if results:
-                # Write metadata header
-                metadata_parts = []
-                if agent_name:
-                    metadata_parts.append(f"Agent Name: {agent_name}")
-                if agent_id:
-                    metadata_parts.append(f"Agent ID: {agent_id}")
-                if cli_version:
-                    metadata_parts.append(f"CLI Version: {cli_version}")
-                if metadata_parts:
-                    f.write(f"# {' | '.join(metadata_parts)}\n")
-
-                # Write aggregate statistics first if multiple results
-                if len(results) > 1:
-                    aggregates = calculate_aggregate_statistics(results)
-                    if aggregates:
-                        f.write("# AGGREGATE STATISTICS\n")
-                        f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
-                        for metric_name, stats in aggregates.items():
-                            threshold_str = str(stats.get('threshold', 'N/A'))
-                            prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
-                            total_prompts = stats.get('total_prompts', len(results))
-                            f.write(f"{metric_name},{prompts_evaluated},{total_prompts},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
-                        f.write("\n# INDIVIDUAL RESULTS\n")
-
-                # Write individual results (exclude internal fields)
-                exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode'}
-                fieldnames = [k for k in results[0].keys() if k not in exclude_keys]
-                writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
-                writer.writeheader()
-                writer.writerows(results)
-        emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
-    except Exception as e:
-        emit_structured_log("error", f"Error writing to CSV file: {e}", operation=Operation.WRITE_OUTPUT)
-        sys.exit(1)
-
-def parse_arguments():
-    """Parse command line arguments."""
-    parser = argparse.ArgumentParser(
-        description="M365 Copilot Agent Evaluation CLI",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-Examples:
-  # Run with default prompts
-  python main.py
-
-  # Run with custom prompts
-  python main.py --prompts "What is Microsoft Graph?" --expected "Microsoft Graph is a gateway..."
-
-  # Run with prompts from file
-  python main.py --prompts-file prompts.json
-
-  # Interactive mode
-  python main.py --interactive
-
-  # Save results to JSON
-  python main.py --output results.json
-
-  # Save results to CSV
-  python main.py --output results.csv
-
-  # Save results to HTML and open in browser
-  python main.py --output report.html
-
-  # Debug-level diagnostics
-  python main.py --log-level debug
-
-  # Sign out and clear cached authentication tokens
-  python main.py --signout
-        """
-    )
-
-    # Input options (mutually exclusive)
-    input_group = parser.add_mutually_exclusive_group()
-    input_group.add_argument(
-        '--prompts',
-        nargs='+',
-        help='List of prompts to evaluate'
-    )
-    input_group.add_argument(
-        '--prompts-file',
-        type=str,
-        help='JSON file containing prompts and expected responses'
-    )
-    input_group.add_argument(
-        '--interactive',
-        action='store_true',
-        help='Interactive mode to enter prompts'
-    )
-
-    # Expected responses (only used with --prompts)
-    parser.add_argument(
-        '--expected',
-        nargs='+',
-        help='List of expected responses (must match number of prompts)'
-    )
-
-    # Agent ID (--m365-agent-id is primary, --agent-id kept for backward compatibility)
-    parser.add_argument(
-        '--m365-agent-id', '--agent-id',
-        type=str,
-        default=os.environ.get("M365_AGENT_ID") or os.environ.get("AGENT_ID"),
-        help='Agent ID (default from M365_AGENT_ID environment variable)'
-    )
-
-    # Output options
-    parser.add_argument(
-        '--output',
-        type=str,
-        help='Output file path. Format is determined by file extension: .json, .csv, .html. If not provided, results are printed to console.'
-    )
-
-    # Behavior options
-    parser.add_argument(
-        '--log-level',
-        nargs='?',
-        const='info',
-        action='append',
-        help='Set log verbosity: debug, info, warning, error. Bare --log-level resolves to info.'
-    )
-
-    parser.add_argument(
-        '--signout',
-        action='store_true',
-        help='Sign out and clear cached authentication tokens'
-    )
-
-    return parser.parse_args()
-
-def validate_environment() -> CallPath:
-    """Validate required environment variables."""
-    required_env_vars = [
-        "AZURE_AI_OPENAI_ENDPOINT",
-        "AZURE_AI_API_KEY",
-        "AZURE_AI_API_VERSION",
-        "AZURE_AI_MODEL_NAME",
-        # Chat API specific
-        "COPILOT_API_ENDPOINT",
-        "X_SCENARIO_HEADER"
-    ]
-
-    if os.environ.get("COPILOT_API_ACCESS_TOKEN"):
-        call_path = CallPath.ACCESS_TOKEN
-        required_env_vars.append("COPILOT_API_ACCESS_TOKEN")
-    else:
-        call_path = CallPath.COPILOT_AUTH
-        required_env_vars.extend([
-            "M365_EVAL_CLIENT_ID",
-            "TENANT_ID"
-        ])
-
-    missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
-    if missing_vars:
-        emit_structured_log(
-            "error",
-            "Missing required environment variables: "
-            f"{', '.join(missing_vars)}. Please ensure your .env file contains "
-            "all required Azure configuration.",
-            operation=Operation.VALIDATE_ENV,
-        )
-        sys.exit(1)
-    return call_path
-
-def validate_endpoint_url(url: str, allowed_domains: List[str]) -> bool:
-    """Validate URL against security requirements."""
-    try:
-        parsed = urllib.parse.urlparse(url)
-
-        # Check for dangerous schemes
-        if parsed.scheme in ['javascript', 'data']:
-            raise ValueError(f"Dangerous URL scheme detected: {parsed.scheme}")
-
-        # Check for HTTPS requirement
-        if parsed.scheme != 'https':
-            raise ValueError(f"Only HTTPS URLs are allowed, got: {parsed.scheme}")
-
-        # Check if domain is in allowed list
-        if parsed.netloc not in allowed_domains:
-            raise ValueError(f"Domain not in allowed list: {parsed.netloc}")
-
-        # Reject fragment URLs
-        if parsed.fragment:
-            raise ValueError("Fragment URLs are not allowed")
-
-        return True
-
-    except ValueError:
-        # Re-raise ValueError exceptions
-        raise
-    except Exception as e:
-        # Convert other parsing errors to ValueError
-        raise ValueError(f"Invalid URL format: {url}") from e
-
-def get_prompt_datasets(args) -> Tuple[List[Dict], Optional[Dict]]:
-    """Get prompts and expected responses based on command line arguments.
-
-    Returns:
-        Tuple of (eval_items, default_evaluators).
-    """
-    if args.prompts:
-        if args.expected and len(args.prompts) != len(args.expected):
-            emit_structured_log(
-                "error",
-                "Number of prompts must match number of expected responses. "
-                "Update --expected values to match the prompt count.",
-            )
-            sys.exit(1)
-        expected_responses = args.expected or [""] * len(args.prompts)
-        eval_items = [
-            {"prompt": p, "expected_response": e}
-            for p, e in zip(args.prompts, expected_responses)
-        ]
-        return eval_items, None
-    elif args.prompts_file:
-        return load_prompts_from_file(args.prompts_file)
-    elif args.interactive:
-        prompts, expected_responses = get_interactive_prompts()
-        eval_items = [
-            {"prompt": p, "expected_response": e}
-            for p, e in zip(prompts, expected_responses)
-        ]
-        return eval_items, None
-    else:
-        prompts, expected_responses = get_default_prompts_and_responses()
-        eval_items = [
-            {"prompt": p, "expected_response": e}
-            for p, e in zip(prompts, expected_responses)
-        ]
-        return eval_items, None
-
-def fetch_available_agents(copilot_api_endpoint: str, access_token: str, user_oid: str) -> List[Dict[str, Any]]:
-    """
-    Fetch available agents for the user from the Copilot API.
-
-    Args:
-        access_token: Bearer token for API authentication
-        user_oid: User object ID for agent filtering
-
-    Returns:
-        List of agent dictionaries.
-    """
-    request_headers = {
-        "Content-Type": "application/json",
-        "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
-        "Authorization": f"Bearer {access_token}"
-    }
-
-    try:
-        # Build the query parameter with participant info
-        request_data = json.dumps({"participant": {"id": user_oid}})
-        query_param = urllib.parse.quote(request_data)
-
-        # Try to fetch agents from /GetGptList endpoint
-        req = urllib.request.Request(
-            f"{copilot_api_endpoint}/GetGptList?request={query_param}",
-            headers=request_headers,
-            method="GET"
-        )
-        with urllib.request.urlopen(req, timeout=120) as resp:
-            data = json.loads(resp.read().decode("utf-8"))
-            agents = data.get("gptList", [])
-            return agents
-    except urllib.error.HTTPError as e:
-        # If endpoint doesn't exist or returns error, return empty list
-        emit_structured_log("warning", f"Unable to fetch agents list (HTTP {e.code}).", operation=Operation.FETCH_AGENTS)
-        return []
-    except Exception as e:
-        emit_structured_log("warning", f"Error fetching agents: {e}", operation=Operation.FETCH_AGENTS)
-        return []
-
-def select_agent_interactively(agents: List[Dict[str, Any]]) -> Tuple[Optional[str], Optional[str]]:
-    """
-    Display an interactive agent selector using questionary.
-
-    Args:
-        agents: List of agent dictionaries.
-
-    Returns:
-        Tuple of (agent_id, agent_name) or (None, None) if cancelled/skipped
-    """
-    if not agents:
-        return None, None
-
-    # Build id→name lookup and choices
-    id_to_name: Dict[str, str] = {}
-    choices = []
-    sorted_agents = sorted(agents, key=lambda x: (not x.get('isOwner', False), x.get('name', '')))
-    for agent in sorted_agents:
-        agent_id = agent.get("gptId", "Unknown")
-        agent_name = agent.get("name", "Unknown")
-        agent_description = agent.get("description", "Unknown")
-        agent_is_owner = agent.get('isOwner')
-        id_to_name[agent_id] = agent_name
-
-        # Format the display text
-        display_text = f"{agent_name} ({agent_id}, IsOwner: {agent_is_owner}) - {agent_description}"
-
-        choices.append(questionary.Choice(title=display_text, value=agent_id))
-
-    # Display the selection prompt
-    selected_agent = questionary.select(
-        "Select an agent to evaluate:",
-        choices=choices,
-        use_shortcuts=True,
-        use_arrow_keys=True
-    ).ask()
-
-    return selected_agent, id_to_name.get(selected_agent) if selected_agent else None
-
-@functools.lru_cache(maxsize=1)
-def _get_iana_timezone_name() -> str:
-    """Get the IANA timezone name from the system using tzlocal.
-
-    Tries get_localzone_name() first; falls back to str(get_localzone()) when the
-    former raises (e.g. no zone configured on some Unix systems). Result is cached
-    after the first call so tzlocal is only invoked once per session.
-    """
-    try:
-        return tzlocal.get_localzone_name()
-    except Exception:
-        return str(tzlocal.get_localzone())
-
-
-@functools.lru_cache(maxsize=1)
-def _get_location_info() -> Dict[str, Any]:
-    """Return a locationInfo dict containing the local UTC offset and IANA timezone name.
-
-    Result is cached after the first call so the computation runs only once per session.
-    """
-    now = datetime.now().astimezone()
-    utc_offset = now.utcoffset()
-    offset_hours = int(utc_offset.total_seconds() // 3600) if utc_offset is not None else 0
-    return {
-        "timeZoneOffset": offset_hours,
-        "timeZone": _get_iana_timezone_name(),
-    }
-
-
-def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:
-    message = {
-        "message": {
-            "text": prompt,
-            "author": "user",
-            "messageType": "chat",
-            "timestamp": datetime.now(timezone.utc).isoformat(),
-            "locationInfo": _get_location_info(),
-            "from": {
-                "id": user_oid,
-            }
-        },
-        "verbosity": "verbose",  # To enable detailed telemetry in response (to extract tool usage, etc.)
-    }
-
-    if agent_id:
-        message["gpts"] = [
-            {
-                "id": agent_id.strip(),
-                "source": "MOS3"
-            }
-        ]
-        message["optionsSets"] = [
-            "disable_action_confirmation"  # Disable 3P action confirmation prompts for agents while scraping
-        ]
-
-    return json.dumps(message).encode("utf-8")
-
-def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str, access_token: str, user_oid: str, args) -> List[Dict[str, Any]]:
-    """ Send prompts to the chat API and return enhanced responses. """
-
-    request_headers = {
-        "Content-Type": "application/json",
-        "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
-        "Authorization": f"Bearer {access_token}"
-    }
-    raw_responses: List[Tuple[str, str]] = []
-    for i, prompt in enumerate(prompts, 1):
-        if getattr(args, "effective_log_level", "info") in ("info", "debug"):
-            emit_structured_log("info", f"Processing prompt {i}/{len(prompts)}.", operation=Operation.SEND_PROMPT)
-
-        # Build the payload
-        payload = build_chat_payload(prompt, user_oid, args.m365_agent_id)
-        if getattr(args, "effective_log_level", "info") == "debug":
-            emit_structured_log("debug", f"[Sydney] Sending payload: {payload.decode('utf-8')}", operation=Operation.SEND_PROMPT)
-
-        # Send the request to /chat
-        req = urllib.request.Request(f"{copilot_api_endpoint}/chat", data=payload, headers=request_headers, method="POST")
-        try:
-            with urllib.request.urlopen(req, timeout=120) as resp:
-                raw = resp.read().decode("utf-8", errors="replace")
-        except urllib.error.HTTPError as e:
-            error_body = None
-            try:
-                error_body = e.read().decode("utf-8", errors="replace")
-            except Exception:
-                pass
-            msg = f"Chat API request failed (HTTP {e.code} {e.reason})."
-            if error_body:
-                msg += f" Body: {error_body[:500]}"
-            raise RuntimeError(msg) from e
-        except urllib.error.URLError as e:
-            raise RuntimeError(f"Chat API connection error: {getattr(e, 'reason', str(e))}") from e
-
-        if getattr(args, "effective_log_level", "info") == "debug":
-            emit_structured_log("debug", f"[Sydney] Raw response: {raw}", operation=Operation.SEND_PROMPT)
-
-        # Store raw response for enhancement
-        raw_responses.append((prompt, raw.strip()))
-
-    # Extract enhanced responses using the new extractor
-    enhanced_responses = extract_enhanced_responses(raw_responses, log_level=getattr(args, "effective_log_level", "info"))
-
-    if getattr(args, "effective_log_level", "info") == "debug":
-        for idx, enhanced in enumerate(enhanced_responses, 1):
-            metadata = enhanced.get("metadata", {})
-            context = {
-                "request-id": metadata.get("request_id"),
-                "conversation-id": metadata.get("conversation_id"),
-                "message-id": metadata.get("message_id"),
-                "operation": Operation.SEND_PROMPT,
-            }
-            entry = format_structured_log_entry(
-                level="debug",
-                message=f"Response IDs for prompt {idx}/{len(enhanced_responses)}.",
-                logger_name=CLI_LOGGER_NAME,
-                run_context=context,
-            )
-            DIAGNOSTIC_RECORDS.append(entry)
-            CLI_LOGGER.log(logging.DEBUG, render_diagnostic(entry))
+from cli_args import parse_arguments, should_bypass_min_version_check
+from env_validator import (
+    ALLOWED_ENDPOINTS,
+    validate_endpoint_url,
+    validate_environment,
+)
+from common import (
+    ENV_AZURE_AI_OPENAI_ENDPOINT,
+    ENV_AZURE_AI_API_KEY,
+    ENV_AZURE_AI_API_VERSION,
+    ENV_AZURE_AI_MODEL_NAME,
+    ENV_TENANT_ID,
+    ENV_WORK_IQ_A2A_ENDPOINT,
+    ENV_WORK_IQ_A2A_CLIENT_ID,
+    ENV_WORK_IQ_A2A_SCOPES,
+    RunConfig,
+)
+from prompt_loader import get_prompt_datasets
+from agent_selector import select_agent_interactively
+from evaluation_runner import PipelineConfig, run_pipeline
+from result_writer import output_results

-
+from dataclasses import replace

-def output_results(results: List[Dict], args, default_evaluators: Optional[Dict[str, Any]] = None,
-                   agent_name: Optional[str] = None, cli_version: Optional[str] = None):
-    """Output results based on specified format."""
-    metadata_kwargs = dict(
-        agent_name=agent_name,
-        agent_id=getattr(args, 'm365_agent_id', None),
-        cli_version=cli_version,
-    )
-    if args.output:
-        output_lower = args.output.lower()
-        if output_lower.endswith('.json'):
-            write_results_to_json(results, args.output, default_evaluators=default_evaluators,
-                                  **metadata_kwargs)
-        elif output_lower.endswith('.csv'):
-            write_results_to_csv(results, args.output, **metadata_kwargs)
-        elif output_lower.endswith('.html'):
-            write_results_to_html(results, args.output, **metadata_kwargs)
-            abs_path = os.path.abspath(args.output)
-            webbrowser.open(f'file://{abs_path}')
-        else:
-            write_results_to_json(results, args.output, default_evaluators=default_evaluators,
-                                  **metadata_kwargs)
-    else:
-        write_results_to_console(results, **metadata_kwargs)

 def main():
     """Main function to orchestrate the evaluation process."""
@@ -1107,136 +67,159 @@ def main():
     )
         sys.exit(2)

-
-
-
+    config = replace(
+        RunConfig.from_namespace(args),
+        effective_log_level=effective_log_level,
+    )
+    configure_cli_logging(config.effective_log_level)
+    emit_structured_log("info", f"Log level set to '{config.effective_log_level}'.", operation=Operation.SETUP)

     # Check minimum version before proceeding
-    quiet_for_version = effective_log_level in ("warning", "error")
+    quiet_for_version = config.effective_log_level in ("warning", "error")
     cli_version = get_cli_version(quiet=quiet_for_version)
-    if not should_bypass_min_version_check(
+    if not should_bypass_min_version_check(config) and not check_min_version(cli_version, quiet=quiet_for_version):
         sys.exit(1)

     # Validate environment variables required for evaluation
-
-    copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
-    validate_endpoint_url(copilot_api_endpoint, ALLOWED_ENDPOINTS)
-
-    user_oid = ""
-
-    if call_path == CallPath.ACCESS_TOKEN:
-        access_token = os.environ["COPILOT_API_ACCESS_TOKEN"]
-    else:
-        scopes_str = os.environ.get(
-            "COPILOT_SCOPES", "https://substrate.office.com/sydney/.default"
-        )
-
-        auth_handler = AuthHandler(
-            client_id=os.environ["M365_EVAL_CLIENT_ID"],
-            tenant_id=os.environ["TENANT_ID"],
-            scopes_str=scopes_str
-        )
+    validate_environment()

-
-
-    try:
-        auth_handler.clear_cache()
-    except Exception as e:
-        emit_structured_log("error", f"Error during signout: {e}", operation=Operation.AUTHENTICATE)
-        sys.exit(1)
-    sys.exit(0)
+    a2a_endpoint = os.environ[ENV_WORK_IQ_A2A_ENDPOINT]
+    validate_endpoint_url(a2a_endpoint, ALLOWED_ENDPOINTS)

-
-
-
-
-
-
-
-
-
+    a2a_scopes_str = os.environ.get(ENV_WORK_IQ_A2A_SCOPES, "")
+    a2a_auth_handler = AuthHandler(
+        client_id=os.environ[ENV_WORK_IQ_A2A_CLIENT_ID],
+        tenant_id=os.environ[ENV_TENANT_ID],
+        scopes_str=a2a_scopes_str,
+    )
+    if config.signout:
+        try:
+            a2a_auth_handler.clear_cache()
+        except Exception as e:
            emit_structured_log(
-    "
+                "error",
+                f"Error during signout: {e}",
                operation=Operation.AUTHENTICATE,
            )
-
-
-
-    except Exception as e:
-        emit_structured_log("error", f"Error during authentication: {e}", operation=Operation.AUTHENTICATE)
-        if effective_log_level == "debug":
-            import traceback
-            traceback.print_exc()
-        sys.exit(1)
+            sys.exit(1)
+        sys.exit(0)

-
-
-
+    try:
+        a2a_auth_result = a2a_auth_handler.acquire_token_interactive() or {}
+        a2a_access_token = a2a_auth_result.get("access_token") or ""
+        if not a2a_access_token:
+            raise RuntimeError("Failed to acquire A2A access token")
+    except Exception as e:
+        emit_structured_log(
+            "error",
+            f"Error during A2A authentication: {e}",
+            operation=Operation.AUTHENTICATE,
+        )
+        if config.effective_log_level == "debug":
+            traceback.print_exc()
+        sys.exit(1)
+    try:
+        agent_client = A2AClient(
+            a2a_endpoint=a2a_endpoint,
+            access_token=a2a_access_token,
+            logger=CLI_LOGGER,
+            diagnostic_records=DIAGNOSTIC_RECORDS,
+        )
+    except Exception as e:
+        emit_structured_log(
+            "error",
+            f"Failed to initialize A2A client: {e}",
+            operation=Operation.SETUP,
+        )
+        sys.exit(1)

     # 1. Load evaluation datasets
-    eval_items, file_default_evaluators = get_prompt_datasets(
+    eval_items, file_default_evaluators = get_prompt_datasets(config)
     default_evaluators = resolve_default_evaluators(file_default_evaluators)
-    prompts = [eval_item.get("prompt", "") for eval_item in eval_items]

-    if effective_log_level in ("info", "debug"):
-
+    if config.effective_log_level in ("info", "debug"):
+        multi_turn_count = sum(1 for item in eval_items if "turns" in item)
+        single_turn_count = len(eval_items) - multi_turn_count
+        emit_structured_log(
+            "info",
+            f"Running evaluation on {len(eval_items)} item(s) "
+            f"({single_turn_count} single-turn, {multi_turn_count} multi-turn).",
+            operation=Operation.SETUP,
+        )

     agent_name = None
     try:
-        #
-
-
+        # 2. Agent selection - when no agent ID is provided, discover agents
+        # via the active client (A2A) and prompt interactively.
+        if not config.m365_agent_id:
+            if config.effective_log_level in ("info", "debug"):
                emit_structured_log("info", "No agent ID provided. Fetching available agents.", operation=Operation.FETCH_AGENTS)
-
-            available_agents = fetch_available_agents(
+
+            available_agents = agent_client.fetch_available_agents()
            if not available_agents:
                emit_structured_log(
-
-
-
-
+                    "error",
+                    "No agents are available for interactive selection."
+                    " Re-run with --m365-agent-id or set M365_AGENT_ID.",
+                    operation=Operation.FETCH_AGENTS,
                )
                sys.exit(1)

            selected_agent_id, agent_name = select_agent_interactively(available_agents)
            if selected_agent_id:
-
-                if effective_log_level in ("info", "debug"):
-                    emit_structured_log("info", f"Selected agent: {
+                config = replace(config, m365_agent_id=selected_agent_id)
+                if config.effective_log_level in ("info", "debug"):
+                    emit_structured_log("info", f"Selected agent: {config.m365_agent_id}", operation=Operation.FETCH_AGENTS)
            else:
                emit_structured_log(
-
-
-
+                    "error",
+                    "No agent selected. Re-run with --m365-agent-id or set M365_AGENT_ID.",
+                    operation=Operation.FETCH_AGENTS,
                )
                sys.exit(1)
-
-        # 4. Send prompts to chat API
-        responses = send_prompt_to_agent_in_sydney(prompts, copilot_api_endpoint, access_token, user_oid, args)
     except Exception as e:
-        emit_structured_log("error", f"Error
-        if effective_log_level == "debug":
-            import traceback
+        emit_structured_log("error", f"Error during agent discovery: {e}", operation=Operation.FETCH_AGENTS)
+        if config.effective_log_level == "debug":
            traceback.print_exc()
        sys.exit(1)

+    # Pre-resolve agent endpoint (A2A agent card lookup)
+    if config.m365_agent_id:
+        agent_client.resolve_agent(config.m365_agent_id)
+
+    # 3. Build pipeline config and run evaluation pipeline
+    model_config = AzureOpenAIModelConfiguration(
+        azure_endpoint=os.environ.get(ENV_AZURE_AI_OPENAI_ENDPOINT),
+        api_key=os.environ.get(ENV_AZURE_AI_API_KEY),
+        api_version=os.environ.get(ENV_AZURE_AI_API_VERSION),
+        azure_deployment=os.environ.get(ENV_AZURE_AI_MODEL_NAME),
+    )
+    has_azure_openai = bool(
+        os.environ.get(ENV_AZURE_AI_OPENAI_ENDPOINT)
+        and os.environ.get(ENV_AZURE_AI_API_KEY)
+    )
+
+    pipeline = PipelineConfig(
+        agent_client=agent_client,
+        model_config=model_config,
+        has_azure_openai=has_azure_openai,
+        default_evaluators=default_evaluators,
+    )
+
+    results = run_pipeline(pipeline, eval_items, config)
+
+    # 4. Output results
+    output_results(
+        results, config, default_evaluators=default_evaluators,
+        agent_name=agent_name, cli_version=str(cli_version) if cli_version else None)

-
-    if effective_log_level in ("info", "debug"):
-        emit_structured_log("info", "Running evaluations.", operation=Operation.EVALUATE)
-    results = run_evaluations(args, responses, eval_items, default_evaluators)
-
-    # 6. Output results
-    output_results(results, args, default_evaluators=default_evaluators,
-                   agent_name=agent_name, cli_version=str(cli_version) if cli_version else None)
-
-    if effective_log_level in ("info", "debug"):
+    if config.effective_log_level in ("info", "debug"):
        emit_structured_log(
            "info",
-            f"Evaluation completed successfully. Processed {len(
+            f"Evaluation completed successfully. Processed {len(eval_items)} item(s).",
            operation=Operation.EVALUATE,
        )

 # Call the main function when script is run directly
-if __name__ == "__main__":
+if __name__ == "__main__":  # pragma: no cover
     main()