@microsoft/m365-copilot-eval 1.4.0-preview.1 → 1.6.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -4
- package/package.json +4 -3
- package/schema/CHANGELOG.md +14 -0
- package/schema/v1/eval-document.schema.json +3 -3
- package/schema/version.json +1 -1
- package/src/clients/cli/agent_selector.py +74 -0
- package/src/clients/cli/api_clients/A2A/a2a_client.py +96 -30
- package/src/clients/cli/api_clients/base_agent_client.py +0 -1
- package/src/clients/cli/auth/auth_handler.py +21 -1
- package/src/clients/cli/cli_args.py +136 -0
- package/src/clients/cli/cli_logging/cli_logger.py +33 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +3 -1
- package/src/clients/cli/common.py +53 -0
- package/src/clients/cli/env_validator.py +73 -0
- package/src/clients/cli/evaluation_runner.py +653 -0
- package/src/clients/cli/evaluator_resolver.py +9 -6
- package/src/clients/cli/main.py +130 -1676
- package/src/clients/cli/prompt_loader.py +148 -0
- package/src/clients/cli/readme.md +9 -53
- package/src/clients/cli/response_extractor.py +4 -601
- package/src/clients/cli/result_writer.py +488 -0
- package/src/clients/node-js/bin/runevals.js +34 -13
- package/src/clients/node-js/config/default.js +8 -11
- package/src/clients/node-js/lib/env-loader.js +3 -4
- package/src/clients/node-js/lib/python-runtime.js +137 -65
- package/src/clients/node-js/lib/venv-manager.js +3 -2
- package/src/clients/node-js/lib/version-check.js +268 -0
- package/src/clients/cli/api_clients/REST/__init__.py +0 -3
- package/src/clients/cli/api_clients/REST/sydney_client.py +0 -204
package/src/clients/cli/main.py
CHANGED
@@ -1,1546 +1,58 @@
|
|
|
1
|
-
|
|
1
|
+
"""M365 Copilot Agent Evaluation CLI — thin orchestrator.
|
|
2
|
+
|
|
3
|
+
Delegates to focused modules:
|
|
4
|
+
cli_args – argument parsing & version-check bypass
|
|
5
|
+
env_validator – environment validation & URL security
|
|
6
|
+
prompt_loader – dataset loading & agent selection
|
|
7
|
+
evaluation_runner – pipeline, evaluator dispatch, retry
|
|
8
|
+
result_writer – console / JSON / CSV / HTML output
|
|
9
|
+
"""
|
|
10
|
+
|
|
2
11
|
import os
|
|
3
|
-
import argparse
|
|
4
12
|
import sys
|
|
5
|
-
import
|
|
6
|
-
import logging
|
|
7
|
-
import time
|
|
8
|
-
import webbrowser
|
|
9
|
-
import urllib.parse
|
|
10
|
-
import questionary
|
|
11
|
-
from dataclasses import dataclass, field
|
|
12
|
-
from enum import Enum
|
|
13
|
-
from typing import List, Dict, Tuple, Optional, Any
|
|
14
|
-
|
|
15
|
-
from api_clients.A2A import A2AClient
|
|
16
|
-
from api_clients.REST import SydneyClient
|
|
17
|
-
from api_clients.base_agent_client import BaseAgentClient
|
|
13
|
+
import traceback
|
|
18
14
|
|
|
19
|
-
from azure.ai.evaluation import
|
|
20
|
-
AzureOpenAIModelConfiguration,
|
|
21
|
-
RelevanceEvaluator,
|
|
22
|
-
CoherenceEvaluator,
|
|
23
|
-
GroundednessEvaluator,
|
|
24
|
-
ToolCallAccuracyEvaluator
|
|
25
|
-
)
|
|
15
|
+
from azure.ai.evaluation import AzureOpenAIModelConfiguration
|
|
26
16
|
from dotenv import load_dotenv
|
|
27
|
-
from auth.auth_handler import AuthHandler
|
|
28
|
-
from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat
|
|
29
|
-
from custom_evaluators.ExactMatchEvaluator import ExactMatchEvaluator
|
|
30
|
-
from custom_evaluators.PartialMatchEvaluator import PartialMatchEvaluator
|
|
31
|
-
from generate_report import generate_html_report, calculate_aggregate_statistics
|
|
32
|
-
from response_extractor import get_response_text_for_evaluation
|
|
33
|
-
from schema_handler import DocumentUpgrader, SchemaVersionManager
|
|
34
|
-
from common import (
|
|
35
|
-
RELEVANCE,
|
|
36
|
-
COHERENCE,
|
|
37
|
-
GROUNDEDNESS,
|
|
38
|
-
TOOL_CALL_ACCURACY,
|
|
39
|
-
CITATIONS,
|
|
40
|
-
EXACT_MATCH,
|
|
41
|
-
PARTIAL_MATCH,
|
|
42
|
-
REQUIRES_AZURE_OPENAI,
|
|
43
|
-
REQUIRES_TOOL_DEFINITIONS,
|
|
44
|
-
METRIC_IDS,
|
|
45
|
-
STATUS_PASS,
|
|
46
|
-
STATUS_FAIL,
|
|
47
|
-
STATUS_ERROR,
|
|
48
|
-
STATUS_PARTIAL,
|
|
49
|
-
STATUS_UNKNOWN,
|
|
50
|
-
pascal_case_to_title,
|
|
51
|
-
)
|
|
52
|
-
from evaluator_resolver import (
|
|
53
|
-
validate_evaluator_names,
|
|
54
|
-
check_prerequisites,
|
|
55
|
-
resolve_default_evaluators,
|
|
56
|
-
resolve_evaluators_for_prompt,
|
|
57
|
-
get_evaluator_threshold,
|
|
58
|
-
)
|
|
59
|
-
from version_check import check_min_version, get_cli_version
|
|
60
|
-
from datetime import datetime, timezone
|
|
61
|
-
from pathlib import Path
|
|
62
17
|
|
|
18
|
+
from api_clients.A2A import A2AClient
|
|
19
|
+
from auth.auth_handler import AuthHandler, make_token_refresh_fn
|
|
20
|
+
from evaluator_resolver import resolve_default_evaluators
|
|
21
|
+
from version_check import check_min_version, get_cli_version
|
|
63
22
|
|
|
64
|
-
from cli_logging.
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
is_retryable_status,
|
|
70
|
-
get_backoff_seconds,
|
|
71
|
-
get_retry_after_seconds,
|
|
23
|
+
from cli_logging.cli_logger import (
|
|
24
|
+
CLI_LOGGER,
|
|
25
|
+
DIAGNOSTIC_RECORDS,
|
|
26
|
+
configure_cli_logging,
|
|
27
|
+
emit_structured_log,
|
|
72
28
|
)
|
|
29
|
+
from cli_logging.logging_utils import Operation, resolve_log_level
|
|
73
30
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
MAX_CONCURRENCY = 5
|
|
81
|
-
MAX_ATTEMPTS = 4 # Initial attempt + 3 retries
|
|
82
|
-
MAX_TURNS_PER_THREAD = 20
|
|
83
|
-
LONG_THREAD_WARNING_THRESHOLD = 10
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
@dataclass
|
|
87
|
-
class PipelineConfig:
|
|
88
|
-
"""Runtime configuration for the evaluation pipeline."""
|
|
89
|
-
agent_client: BaseAgentClient
|
|
90
|
-
model_config: AzureOpenAIModelConfiguration
|
|
91
|
-
has_azure_openai: bool
|
|
92
|
-
default_evaluators: Dict[str, Any]
|
|
93
|
-
chat_gate: ThrottleGate = field(default_factory=lambda: ThrottleGate("chat_api"))
|
|
94
|
-
is_retryable_status: Any = field(default=is_retryable_status)
|
|
95
|
-
get_backoff_seconds: Any = field(default=get_backoff_seconds)
|
|
96
|
-
get_retry_after_seconds: Any = field(default=get_retry_after_seconds)
|
|
97
|
-
|
|
98
|
-
class CallPath(Enum):
|
|
99
|
-
""" Enum to indicate which call path to use. """
|
|
100
|
-
ACCESS_TOKEN = "access_token"
|
|
101
|
-
COPILOT_AUTH = "copilot_auth"
|
|
102
|
-
A2A = "a2a"
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
class ItemType(Enum):
|
|
106
|
-
SINGLE_TURN = "single_turn"
|
|
107
|
-
MULTI_TURN = "multi_turn"
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
def detect_item_type(item: dict) -> ItemType:
|
|
111
|
-
"""Determine if an evaluation item is single-turn or multi-turn.
|
|
112
|
-
|
|
113
|
-
Returns ItemType.SINGLE_TURN if item has 'prompt' without 'turns',
|
|
114
|
-
ItemType.MULTI_TURN if item has 'turns' array.
|
|
115
|
-
|
|
116
|
-
Raises ValueError for invalid items (both, neither, or invalid turns).
|
|
117
|
-
"""
|
|
118
|
-
has_turns = "turns" in item
|
|
119
|
-
has_prompt = "prompt" in item
|
|
120
|
-
|
|
121
|
-
if has_turns and has_prompt:
|
|
122
|
-
raise ValueError(
|
|
123
|
-
"Invalid evaluation item: cannot have both 'turns' and 'prompt'. "
|
|
124
|
-
"Use 'turns' for multi-turn threads or 'prompt' for single-turn."
|
|
125
|
-
)
|
|
126
|
-
|
|
127
|
-
if has_turns and not isinstance(item["turns"], list):
|
|
128
|
-
raise ValueError("Invalid evaluation item: 'turns' must be a list")
|
|
129
|
-
|
|
130
|
-
if has_turns:
|
|
131
|
-
if len(item["turns"]) == 0:
|
|
132
|
-
raise ValueError("Invalid multi-turn thread: 'turns' array cannot be empty")
|
|
133
|
-
return ItemType.MULTI_TURN
|
|
134
|
-
|
|
135
|
-
if has_prompt:
|
|
136
|
-
return ItemType.SINGLE_TURN
|
|
137
|
-
|
|
138
|
-
raise ValueError(
|
|
139
|
-
"Invalid evaluation item: must have either 'turns' array (multi-turn) "
|
|
140
|
-
"or 'prompt' field (single-turn)"
|
|
141
|
-
)
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
# Flags that should bypass remote min-version enforcement.
|
|
145
|
-
# --help is not needed here because argparse exits before runtime checks.
|
|
146
|
-
VERSION_CHECK_BYPASS_FLAGS = (
|
|
147
|
-
"signout",
|
|
31
|
+
from cli_args import parse_arguments, should_bypass_min_version_check
|
|
32
|
+
from env_validator import (
|
|
33
|
+
ALLOWED_ENDPOINTS,
|
|
34
|
+
validate_endpoint_url,
|
|
35
|
+
validate_environment,
|
|
148
36
|
)
|
|
37
|
+
from common import (
|
|
38
|
+
ENV_AZURE_AI_OPENAI_ENDPOINT,
|
|
39
|
+
ENV_AZURE_AI_API_KEY,
|
|
40
|
+
ENV_AZURE_AI_API_VERSION,
|
|
41
|
+
ENV_AZURE_AI_MODEL_NAME,
|
|
42
|
+
ENV_TENANT_ID,
|
|
43
|
+
ENV_WORK_IQ_A2A_ENDPOINT,
|
|
44
|
+
ENV_WORK_IQ_A2A_CLIENT_ID,
|
|
45
|
+
ENV_WORK_IQ_A2A_SCOPES,
|
|
46
|
+
RunConfig,
|
|
47
|
+
)
|
|
48
|
+
from prompt_loader import get_prompt_datasets
|
|
49
|
+
from agent_selector import select_agent_interactively
|
|
50
|
+
from evaluation_runner import PipelineConfig, run_pipeline
|
|
51
|
+
from result_writer import output_results
|
|
149
52
|
|
|
150
|
-
|
|
151
|
-
CLI_LOGGER = logging.getLogger(CLI_LOGGER_NAME)
|
|
152
|
-
DIAGNOSTIC_RECORDS: List[Dict[str, Any]] = []
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
def configure_cli_logging(effective_log_level: str) -> None:
|
|
156
|
-
if not CLI_LOGGER.handlers:
|
|
157
|
-
handler = logging.StreamHandler(sys.stdout)
|
|
158
|
-
handler.setFormatter(logging.Formatter("%(message)s"))
|
|
159
|
-
CLI_LOGGER.addHandler(handler)
|
|
160
|
-
CLI_LOGGER.propagate = False
|
|
161
|
-
CLI_LOGGER.setLevel(LOG_LEVEL_MAP[effective_log_level])
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
def emit_structured_log(level: str, message: str, operation: str = Operation.EVALUATE) -> None:
|
|
165
|
-
_emit_structured_log(
|
|
166
|
-
level, message, operation,
|
|
167
|
-
logger=CLI_LOGGER,
|
|
168
|
-
diagnostic_records=DIAGNOSTIC_RECORDS,
|
|
169
|
-
)
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
def should_bypass_min_version_check(args: argparse.Namespace) -> bool:
|
|
173
|
-
"""Return True if the current invocation should skip min-version checks."""
|
|
174
|
-
return any(getattr(args, flag, False) for flag in VERSION_CHECK_BYPASS_FLAGS)
|
|
175
|
-
|
|
176
|
-
def write_results_to_html(results: List[Dict], output_file: str,
|
|
177
|
-
agent_name: Optional[str] = None, agent_id: Optional[str] = None,
|
|
178
|
-
cli_version: Optional[str] = None):
|
|
179
|
-
"""Write results to HTML file using generate_html_report from generate_report.py."""
|
|
180
|
-
try:
|
|
181
|
-
html = generate_html_report(results, agent_name=agent_name, agent_id=agent_id,
|
|
182
|
-
cli_version=cli_version)
|
|
183
|
-
with open(output_file, 'w', encoding='utf-8') as f:
|
|
184
|
-
f.write(html)
|
|
185
|
-
emit_structured_log("info", f"HTML report saved to {output_file}", operation=Operation.WRITE_OUTPUT)
|
|
186
|
-
except Exception as e:
|
|
187
|
-
emit_structured_log("error", f"Error writing to HTML file: {e}", operation=Operation.WRITE_OUTPUT)
|
|
188
|
-
sys.exit(1)
|
|
189
|
-
|
|
190
|
-
def get_default_prompts_and_responses():
|
|
191
|
-
"""Get a list of prompts and responses."""
|
|
192
|
-
prompts = [
|
|
193
|
-
"What is Microsoft Graph?"
|
|
194
|
-
]
|
|
195
|
-
expected_responses = [
|
|
196
|
-
"Microsoft Graph is a gateway to data and intelligence in Microsoft 365."
|
|
197
|
-
]
|
|
198
|
-
return prompts, expected_responses
|
|
199
|
-
|
|
200
|
-
def load_prompts_from_file(file_path: str) -> Tuple[List[Dict], Optional[Dict]]:
|
|
201
|
-
"""Load prompts and expected responses from a JSON file.
|
|
202
|
-
|
|
203
|
-
Supports three formats:
|
|
204
|
-
1. Eval document: {"schemaVersion": "1.0.0", "items": [{"prompt": "..."}]}
|
|
205
|
-
2. Array format: [{"prompt": "...", "expected_response": "..."}]
|
|
206
|
-
3. Dict format: {"prompts": [...], "expected_responses": [...]}
|
|
207
|
-
|
|
208
|
-
For eval documents (format 1) and array format (format 2), schema validation
|
|
209
|
-
and auto-upgrade are applied via DocumentUpgrader.
|
|
210
|
-
|
|
211
|
-
Returns:
|
|
212
|
-
Tuple of (eval_items, default_evaluators). Items are dicts with prompt,
|
|
213
|
-
expected_response, and optional evaluators/evaluators_mode fields.
|
|
214
|
-
"""
|
|
215
|
-
try:
|
|
216
|
-
with open(file_path, 'r', encoding='utf-8') as f:
|
|
217
|
-
data = json.load(f)
|
|
218
|
-
|
|
219
|
-
# Detect if this is an eval document (has "items" key) or could be upgraded
|
|
220
|
-
is_eval_document = (
|
|
221
|
-
isinstance(data, dict) and "items" in data
|
|
222
|
-
) or isinstance(data, list)
|
|
223
|
-
|
|
224
|
-
# Run schema validation and auto-upgrade for eval documents
|
|
225
|
-
if is_eval_document:
|
|
226
|
-
try:
|
|
227
|
-
upgrader = DocumentUpgrader()
|
|
228
|
-
except Exception as e:
|
|
229
|
-
# Schema infrastructure not available (missing files, etc.) — skip
|
|
230
|
-
emit_structured_log("warning", f"Unable to initialize document upgrader: {e}", operation=Operation.LOAD_PROMPTS)
|
|
231
|
-
upgrader = None
|
|
232
|
-
|
|
233
|
-
if upgrader is not None:
|
|
234
|
-
result = upgrader.upgrade(Path(file_path))
|
|
235
|
-
|
|
236
|
-
if result.error:
|
|
237
|
-
emit_structured_log("error", f"Schema validation error: {result.error}", operation=Operation.LOAD_PROMPTS)
|
|
238
|
-
sys.exit(1)
|
|
239
|
-
|
|
240
|
-
if result.upgraded and result.message:
|
|
241
|
-
emit_structured_log("info", result.message, operation=Operation.LOAD_PROMPTS)
|
|
242
|
-
|
|
243
|
-
# Use the parsed document from the upgrade result
|
|
244
|
-
if result.document is not None:
|
|
245
|
-
data = result.document
|
|
246
|
-
|
|
247
|
-
if isinstance(data, list):
|
|
248
|
-
# Format: [{"prompt": "...", "expected_response": "..."}, ...]
|
|
249
|
-
return data, None
|
|
250
|
-
elif isinstance(data, dict):
|
|
251
|
-
if "items" in data:
|
|
252
|
-
# Eval document format: {"schemaVersion": "...", "items": [...]}
|
|
253
|
-
return data["items"], data.get("default_evaluators")
|
|
254
|
-
else:
|
|
255
|
-
# Format: {"prompts": [...], "expected_responses": [...]}
|
|
256
|
-
prompts = data.get("prompts", [])
|
|
257
|
-
expected_responses = data.get("expected_responses", [])
|
|
258
|
-
eval_items = [
|
|
259
|
-
{"prompt": p, "expected_response": e}
|
|
260
|
-
for p, e in zip(prompts, expected_responses)
|
|
261
|
-
]
|
|
262
|
-
return eval_items, None
|
|
263
|
-
else:
|
|
264
|
-
raise ValueError("Invalid file format")
|
|
265
|
-
except SystemExit:
|
|
266
|
-
raise
|
|
267
|
-
except Exception as e:
|
|
268
|
-
emit_structured_log("error", f"Error loading prompts from file: {e}", operation=Operation.LOAD_PROMPTS)
|
|
269
|
-
sys.exit(1)
|
|
270
|
-
|
|
271
|
-
def get_interactive_prompts() -> Tuple[List[str], List[str]]:
|
|
272
|
-
"""Get prompts and expected responses interactively."""
|
|
273
|
-
prompts = []
|
|
274
|
-
expected_responses = []
|
|
275
|
-
|
|
276
|
-
print("Interactive mode: Enter your prompts and expected responses.")
|
|
277
|
-
print("Press Enter with empty prompt to finish.")
|
|
278
|
-
|
|
279
|
-
while True:
|
|
280
|
-
prompt = input(f"\nPrompt {len(prompts) + 1}: ").strip()
|
|
281
|
-
if not prompt:
|
|
282
|
-
break
|
|
283
|
-
|
|
284
|
-
expected = input(f"Expected response {len(expected_responses) + 1}: ").strip()
|
|
285
|
-
|
|
286
|
-
prompts.append(prompt)
|
|
287
|
-
expected_responses.append(expected)
|
|
288
|
-
|
|
289
|
-
if not prompts:
|
|
290
|
-
print("No prompts entered. Exiting.")
|
|
291
|
-
sys.exit(1)
|
|
292
|
-
|
|
293
|
-
return prompts, expected_responses
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
_DEFAULT_PASS_THRESHOLD = 3
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
def _decorate_metric(metric_id: str, data, threshold: Optional[int] = None) -> Dict[str, Any]:
|
|
300
|
-
"""Augment raw evaluator output with standardized threshold + pass/fail result."""
|
|
301
|
-
pass_threshold = threshold if threshold is not None else _DEFAULT_PASS_THRESHOLD
|
|
302
|
-
payload = {}
|
|
303
|
-
if isinstance(data, dict):
|
|
304
|
-
payload.update(data)
|
|
305
|
-
else:
|
|
306
|
-
payload['raw'] = data
|
|
307
|
-
|
|
308
|
-
score_val = None
|
|
309
|
-
if isinstance(data, dict):
|
|
310
|
-
if metric_id in data:
|
|
311
|
-
score_val = data[metric_id]
|
|
312
|
-
if isinstance(score_val, (int, float)):
|
|
313
|
-
payload['threshold'] = pass_threshold
|
|
314
|
-
payload['result'] = STATUS_PASS if score_val >= pass_threshold else STATUS_FAIL
|
|
315
|
-
else:
|
|
316
|
-
payload['threshold'] = pass_threshold
|
|
317
|
-
payload.setdefault('result', STATUS_UNKNOWN)
|
|
318
|
-
return payload
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
def _run_evaluators_for_item(
|
|
322
|
-
prompt: str,
|
|
323
|
-
actual_response: str,
|
|
324
|
-
expected_response: str,
|
|
325
|
-
enhanced_response: Dict[str, Any],
|
|
326
|
-
resolved_evaluators: Dict[str, Any],
|
|
327
|
-
model_config: AzureOpenAIModelConfiguration,
|
|
328
|
-
has_azure_openai: bool,
|
|
329
|
-
args,
|
|
330
|
-
) -> Tuple[Dict[str, Optional[str]], List[str]]:
|
|
331
|
-
"""Run resolved evaluators against a single item/turn.
|
|
332
|
-
|
|
333
|
-
Returns (results_dict, evaluators_ran).
|
|
334
|
-
"""
|
|
335
|
-
has_tool_defs = bool(
|
|
336
|
-
args.m365_agent_id and enhanced_response.get("tool_definitions")
|
|
337
|
-
)
|
|
338
|
-
available_context = {
|
|
339
|
-
REQUIRES_AZURE_OPENAI: has_azure_openai,
|
|
340
|
-
REQUIRES_TOOL_DEFINITIONS: has_tool_defs,
|
|
341
|
-
}
|
|
342
|
-
|
|
343
|
-
results_dict: Dict[str, Optional[str]] = {}
|
|
344
|
-
evaluators_ran: List[str] = []
|
|
345
|
-
|
|
346
|
-
for eval_name, eval_options in resolved_evaluators.items():
|
|
347
|
-
can_run, warn_msg = check_prerequisites(eval_name, available_context)
|
|
348
|
-
if not can_run:
|
|
349
|
-
if warn_msg:
|
|
350
|
-
emit_structured_log(
|
|
351
|
-
"warning",
|
|
352
|
-
f"Evaluator '{eval_name}' prerequisite check failed: {warn_msg}",
|
|
353
|
-
operation=Operation.EVALUATE,
|
|
354
|
-
)
|
|
355
|
-
results_dict[eval_name] = None
|
|
356
|
-
continue
|
|
357
|
-
|
|
358
|
-
threshold = get_evaluator_threshold(eval_name, eval_options)
|
|
359
|
-
|
|
360
|
-
try:
|
|
361
|
-
if eval_name == RELEVANCE:
|
|
362
|
-
raw_score = RelevanceEvaluator(model_config=model_config)(query=prompt, response=actual_response)
|
|
363
|
-
results_dict[RELEVANCE] = _decorate_metric(METRIC_IDS[RELEVANCE], raw_score, threshold)
|
|
364
|
-
elif eval_name == COHERENCE:
|
|
365
|
-
raw_score = CoherenceEvaluator(model_config=model_config)(query=prompt, response=actual_response)
|
|
366
|
-
results_dict[COHERENCE] = _decorate_metric(METRIC_IDS[COHERENCE], raw_score, threshold)
|
|
367
|
-
elif eval_name == GROUNDEDNESS:
|
|
368
|
-
raw_score = GroundednessEvaluator(model_config=model_config)(response=actual_response, context=expected_response)
|
|
369
|
-
results_dict[GROUNDEDNESS] = _decorate_metric(METRIC_IDS[GROUNDEDNESS], raw_score, threshold)
|
|
370
|
-
elif eval_name == TOOL_CALL_ACCURACY:
|
|
371
|
-
raw_score = ToolCallAccuracyEvaluator(model_config)(
|
|
372
|
-
query=prompt,
|
|
373
|
-
response=enhanced_response.get("response", actual_response),
|
|
374
|
-
tool_definitions=enhanced_response.get("tool_definitions", []),
|
|
375
|
-
)
|
|
376
|
-
results_dict[TOOL_CALL_ACCURACY] = _decorate_metric(METRIC_IDS[TOOL_CALL_ACCURACY], raw_score, threshold)
|
|
377
|
-
elif eval_name == CITATIONS:
|
|
378
|
-
fmt_str = eval_options.get("citation_format", "oai_unicode")
|
|
379
|
-
fmt_map = {
|
|
380
|
-
"oai_unicode": CitationFormat.OAI_UNICODE,
|
|
381
|
-
"bracket": CitationFormat.LEGACY_BRACKET,
|
|
382
|
-
"mixed": CitationFormat.AUTO,
|
|
383
|
-
}
|
|
384
|
-
raw_score = CitationsEvaluator(citation_format=fmt_map.get(fmt_str, CitationFormat.OAI_UNICODE))(response=actual_response)
|
|
385
|
-
results_dict[CITATIONS] = _decorate_metric(METRIC_IDS[CITATIONS], raw_score, threshold)
|
|
386
|
-
elif eval_name == EXACT_MATCH:
|
|
387
|
-
case_sensitive = eval_options.get("case_sensitive", False)
|
|
388
|
-
raw_score = ExactMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response, expected_answer=expected_response)
|
|
389
|
-
# ExactMatch is binary — the evaluator already sets 'result'
|
|
390
|
-
# so _decorate_metric (which computes result from score vs threshold) is not needed.
|
|
391
|
-
results_dict[EXACT_MATCH] = raw_score
|
|
392
|
-
elif eval_name == PARTIAL_MATCH:
|
|
393
|
-
case_sensitive = eval_options.get("case_sensitive", False)
|
|
394
|
-
raw_score = PartialMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response, expected_answer=expected_response)
|
|
395
|
-
results_dict[PARTIAL_MATCH] = _decorate_metric(METRIC_IDS[PARTIAL_MATCH], raw_score, threshold)
|
|
396
|
-
|
|
397
|
-
evaluators_ran.append(eval_name)
|
|
398
|
-
except Exception as e:
|
|
399
|
-
emit_structured_log(
|
|
400
|
-
"error",
|
|
401
|
-
f"Evaluator '{eval_name}' crashed and will be omitted from results: {e}",
|
|
402
|
-
operation=Operation.EVALUATE,
|
|
403
|
-
)
|
|
404
|
-
results_dict[eval_name] = None
|
|
405
|
-
|
|
406
|
-
return results_dict, evaluators_ran
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
def _evaluate_single_response(
|
|
410
|
-
enhanced_response: Dict[str, Any],
|
|
411
|
-
eval_item: Dict,
|
|
412
|
-
args,
|
|
413
|
-
model_config: AzureOpenAIModelConfiguration,
|
|
414
|
-
has_azure_openai: bool,
|
|
415
|
-
default_evaluators: Dict[str, Any],
|
|
416
|
-
) -> Dict[str, Any]:
|
|
417
|
-
"""Run all evaluators for a single prompt/response pair and return the result dict."""
|
|
418
|
-
actual_response_text = get_response_text_for_evaluation(enhanced_response)
|
|
419
|
-
prompt = eval_item.get("prompt", "")
|
|
420
|
-
expected_response = eval_item.get("expected_response", "")
|
|
421
|
-
|
|
422
|
-
resolved = resolve_evaluators_for_prompt(
|
|
423
|
-
eval_item.get("evaluators"), eval_item.get("evaluators_mode", "extend"),
|
|
424
|
-
prompt, default_evaluators,
|
|
425
|
-
)
|
|
426
|
-
|
|
427
|
-
results_dict, evaluators_ran = _run_evaluators_for_item(
|
|
428
|
-
prompt, actual_response_text, expected_response, enhanced_response,
|
|
429
|
-
resolved, model_config, has_azure_openai, args,
|
|
430
|
-
)
|
|
431
|
-
|
|
432
|
-
evaluation_result = {
|
|
433
|
-
"prompt": prompt,
|
|
434
|
-
"response": enhanced_response.get(
|
|
435
|
-
"display_response_text", actual_response_text
|
|
436
|
-
),
|
|
437
|
-
"expected_response": expected_response,
|
|
438
|
-
"evaluators_ran": evaluators_ran,
|
|
439
|
-
"results": results_dict,
|
|
440
|
-
}
|
|
441
|
-
|
|
442
|
-
if "evaluators" in eval_item:
|
|
443
|
-
evaluation_result["evaluators"] = eval_item["evaluators"]
|
|
444
|
-
if "evaluators_mode" in eval_item:
|
|
445
|
-
evaluation_result["evaluators_mode"] = eval_item["evaluators_mode"]
|
|
446
|
-
|
|
447
|
-
if getattr(args, "effective_log_level", "info") == "debug":
|
|
448
|
-
emit_structured_log(
|
|
449
|
-
"debug",
|
|
450
|
-
f"Evaluation completed for prompt='{evaluation_result['prompt']}'. "
|
|
451
|
-
f"Evaluators: {', '.join(evaluators_ran)}. "
|
|
452
|
-
f"Scores: {evaluation_result['results']}",
|
|
453
|
-
operation=Operation.EVALUATE,
|
|
454
|
-
)
|
|
455
|
-
|
|
456
|
-
return evaluation_result
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
def _check_all_passed(results_dict: Dict[str, Optional[Dict[str, Any]]]) -> bool:
|
|
460
|
-
"""Check if all evaluator results passed. Skipped evaluators (None) are ignored."""
|
|
461
|
-
for result_data in results_dict.values():
|
|
462
|
-
if result_data is None:
|
|
463
|
-
continue
|
|
464
|
-
if result_data.get("result") == STATUS_FAIL:
|
|
465
|
-
return False
|
|
466
|
-
return True
|
|
467
|
-
|
|
468
|
-
def _evaluate_multi_turn_responses(
|
|
469
|
-
turns: List[Dict],
|
|
470
|
-
args,
|
|
471
|
-
default_evaluators: Dict[str, Any],
|
|
472
|
-
model_config: AzureOpenAIModelConfiguration,
|
|
473
|
-
has_azure_openai: bool,
|
|
474
|
-
) -> Tuple[List[Dict], Dict]:
|
|
475
|
-
"""Run per-turn evaluations and build evaluated turn results with summary.
|
|
476
|
-
|
|
477
|
-
Returns:
|
|
478
|
-
Tuple of (evaluated_turns, summary). Each evaluated turn contains
|
|
479
|
-
prompt, response, expected_response, status, evaluators_ran, results,
|
|
480
|
-
and optionally error. Does not mutate the input turns.
|
|
481
|
-
"""
|
|
482
|
-
evaluated_turns: List[Dict] = []
|
|
483
|
-
turns_passed = 0
|
|
484
|
-
turns_failed = 0
|
|
485
|
-
|
|
486
|
-
for i, turn in enumerate(turns):
|
|
487
|
-
evaluated_turn: Dict[str, Any] = {
|
|
488
|
-
"prompt": turn.get("prompt", ""),
|
|
489
|
-
}
|
|
490
|
-
if "expected_response" in turn:
|
|
491
|
-
evaluated_turn["expected_response"] = turn["expected_response"]
|
|
492
|
-
if "response" in turn:
|
|
493
|
-
evaluated_turn["response"] = turn["response"]
|
|
494
|
-
if "evaluators" in turn:
|
|
495
|
-
evaluated_turn["evaluators"] = turn["evaluators"]
|
|
496
|
-
if "evaluators_mode" in turn:
|
|
497
|
-
evaluated_turn["evaluators_mode"] = turn["evaluators_mode"]
|
|
498
|
-
|
|
499
|
-
if turn.get("status") == STATUS_ERROR:
|
|
500
|
-
evaluated_turn["status"] = STATUS_ERROR
|
|
501
|
-
evaluated_turn["error"] = turn.get("error", "")
|
|
502
|
-
turns_failed += 1
|
|
503
|
-
evaluated_turns.append(evaluated_turn)
|
|
504
|
-
continue
|
|
505
|
-
|
|
506
|
-
enhanced_response = turn.get("_enhanced_response", {})
|
|
507
|
-
actual_response = get_response_text_for_evaluation(enhanced_response)
|
|
508
|
-
|
|
509
|
-
resolved = resolve_evaluators_for_prompt(
|
|
510
|
-
turn.get("evaluators"), turn.get("evaluators_mode", "extend"),
|
|
511
|
-
turn.get("prompt", ""), default_evaluators,
|
|
512
|
-
)
|
|
513
|
-
|
|
514
|
-
results_dict, evaluators_ran = _run_evaluators_for_item(
|
|
515
|
-
turn.get("prompt", ""), actual_response, turn.get("expected_response", ""),
|
|
516
|
-
enhanced_response, resolved, model_config, has_azure_openai, args,
|
|
517
|
-
)
|
|
518
|
-
|
|
519
|
-
all_passed = _check_all_passed(results_dict)
|
|
520
|
-
|
|
521
|
-
evaluated_turn["results"] = results_dict
|
|
522
|
-
evaluated_turn["evaluators_ran"] = evaluators_ran
|
|
523
|
-
evaluated_turn["status"] = STATUS_PASS if all_passed else STATUS_FAIL
|
|
524
|
-
|
|
525
|
-
if getattr(args, "effective_log_level", "info") == "debug":
|
|
526
|
-
emit_structured_log(
|
|
527
|
-
"debug",
|
|
528
|
-
f"Evaluation completed for turn {i + 1} prompt='{turn.get('prompt', '')}'. "
|
|
529
|
-
f"Evaluators: {', '.join(evaluators_ran)}. "
|
|
530
|
-
f"Scores: {results_dict}",
|
|
531
|
-
operation=Operation.EVALUATE,
|
|
532
|
-
)
|
|
533
|
-
|
|
534
|
-
if all_passed:
|
|
535
|
-
turns_passed += 1
|
|
536
|
-
else:
|
|
537
|
-
turns_failed += 1
|
|
538
|
-
|
|
539
|
-
evaluated_turns.append(evaluated_turn)
|
|
540
|
-
|
|
541
|
-
turns_total = len(turns)
|
|
542
|
-
if turns_passed == turns_total:
|
|
543
|
-
overall_status = STATUS_PASS
|
|
544
|
-
elif turns_failed == turns_total:
|
|
545
|
-
overall_status = STATUS_FAIL
|
|
546
|
-
else:
|
|
547
|
-
overall_status = STATUS_PARTIAL
|
|
548
|
-
|
|
549
|
-
summary = {
|
|
550
|
-
"turns_total": turns_total,
|
|
551
|
-
"turns_passed": turns_passed,
|
|
552
|
-
"turns_failed": turns_failed,
|
|
553
|
-
"overall_status": overall_status,
|
|
554
|
-
}
|
|
555
|
-
|
|
556
|
-
return evaluated_turns, summary
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
def get_effective_worker_count(prompt_count: int, args) -> int:
|
|
560
|
-
"""Compute safe worker count for prompt processing."""
|
|
561
|
-
if prompt_count <= 0:
|
|
562
|
-
return 1
|
|
563
|
-
|
|
564
|
-
requested = getattr(args, "concurrency", 5)
|
|
565
|
-
try:
|
|
566
|
-
requested_int = int(requested)
|
|
567
|
-
except (TypeError, ValueError):
|
|
568
|
-
requested_int = 5
|
|
569
|
-
|
|
570
|
-
bounded = max(1, min(requested_int, MAX_CONCURRENCY))
|
|
571
|
-
return min(bounded, prompt_count)
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
def run_pipeline(
|
|
575
|
-
pipeline: PipelineConfig,
|
|
576
|
-
eval_items: List[Dict],
|
|
577
|
-
args,
|
|
578
|
-
) -> List[Dict[str, Any]]:
|
|
579
|
-
"""Run the full evaluation pipeline: send prompts and evaluate responses in parallel.
|
|
580
|
-
|
|
581
|
-
Each worker processes one prompt end-to-end: send → evaluate.
|
|
582
|
-
Results are returned in original prompt order (FR-006).
|
|
583
|
-
"""
|
|
584
|
-
# Validate all evaluator names upfront before dispatching workers
|
|
585
|
-
all_evaluator_maps = [pipeline.default_evaluators]
|
|
586
|
-
for eval_item in eval_items:
|
|
587
|
-
if "evaluators" in eval_item:
|
|
588
|
-
all_evaluator_maps.append(eval_item["evaluators"])
|
|
589
|
-
for turn in eval_item.get("turns", []):
|
|
590
|
-
if "evaluators" in turn:
|
|
591
|
-
all_evaluator_maps.append(turn["evaluators"])
|
|
592
|
-
for emap in all_evaluator_maps:
|
|
593
|
-
validate_evaluator_names(emap)
|
|
594
|
-
|
|
595
|
-
# Validate all items upfront and classify types before dispatching workers
|
|
596
|
-
item_types: List[ItemType] = []
|
|
597
|
-
for idx, eval_item in enumerate(eval_items):
|
|
598
|
-
try:
|
|
599
|
-
item_type = detect_item_type(eval_item)
|
|
600
|
-
except ValueError as e:
|
|
601
|
-
raise ValueError(f"Invalid evaluation item at index {idx}: {e}") from e
|
|
602
|
-
if item_type == ItemType.MULTI_TURN:
|
|
603
|
-
turn_count = len(eval_item["turns"])
|
|
604
|
-
if turn_count > MAX_TURNS_PER_THREAD:
|
|
605
|
-
raise ValueError(
|
|
606
|
-
f"Invalid evaluation item at index {idx}: 'turns' array has "
|
|
607
|
-
f"{turn_count} items (max {MAX_TURNS_PER_THREAD})"
|
|
608
|
-
)
|
|
609
|
-
item_types.append(item_type)
|
|
610
|
-
|
|
611
|
-
total = len(eval_items)
|
|
612
|
-
worker_count = get_effective_worker_count(total, args)
|
|
613
|
-
|
|
614
|
-
multi_turn_count = sum(1 for t in item_types if t == ItemType.MULTI_TURN)
|
|
615
|
-
single_turn_count = total - multi_turn_count
|
|
616
|
-
|
|
617
|
-
emit_structured_log(
|
|
618
|
-
"info",
|
|
619
|
-
f"Running pipeline with {worker_count} worker(s) for {total} item(s) "
|
|
620
|
-
f"({single_turn_count} single-turn, {multi_turn_count} multi-turn).",
|
|
621
|
-
operation=Operation.EVALUATE,
|
|
622
|
-
)
|
|
623
|
-
|
|
624
|
-
def _process_item(eval_item: Dict, index: int) -> Dict[str, Any]:
|
|
625
|
-
if item_types[index] == ItemType.MULTI_TURN:
|
|
626
|
-
return _process_multi_turn(eval_item, index)
|
|
627
|
-
return _process_single_turn(eval_item, index)
|
|
628
|
-
|
|
629
|
-
def _process_single_turn(eval_item: Dict, index: int) -> Dict[str, Any]:
|
|
630
|
-
prompt = eval_item.get("prompt", "")
|
|
631
|
-
emit_structured_log(
|
|
632
|
-
"info",
|
|
633
|
-
f"Processing item {index + 1}/{total} (single-turn).",
|
|
634
|
-
operation=Operation.SEND_PROMPT,
|
|
635
|
-
)
|
|
636
|
-
|
|
637
|
-
# Phase A: Send prompt to agent (with retry + throttle gate)
|
|
638
|
-
response = None
|
|
639
|
-
for attempt in range(1, MAX_ATTEMPTS + 1):
|
|
640
|
-
pipeline.chat_gate.wait_if_blocked()
|
|
641
|
-
try:
|
|
642
|
-
response, _ = pipeline.agent_client.send_prompt(prompt, agent_id=args.m365_agent_id)
|
|
643
|
-
break
|
|
644
|
-
except Exception as exc:
|
|
645
|
-
cause = exc.__cause__
|
|
646
|
-
status = int(getattr(cause, "code", 0) or 0) or None if cause else None
|
|
647
|
-
retry_after = get_retry_after_seconds(
|
|
648
|
-
cause.headers.get("Retry-After") if cause and getattr(cause, "headers", None) else None
|
|
649
|
-
)
|
|
650
|
-
|
|
651
|
-
if retry_after is not None and pipeline.is_retryable_status(status):
|
|
652
|
-
pipeline.chat_gate.apply_retry_after(retry_after)
|
|
653
|
-
|
|
654
|
-
if not pipeline.is_retryable_status(status) or attempt >= MAX_ATTEMPTS:
|
|
655
|
-
emit_structured_log(
|
|
656
|
-
"error",
|
|
657
|
-
f"Item {index + 1}/{total} failed after {attempt} attempt(s): {exc}",
|
|
658
|
-
operation=Operation.SEND_PROMPT,
|
|
659
|
-
)
|
|
660
|
-
return {
|
|
661
|
-
"prompt": prompt,
|
|
662
|
-
"response": "",
|
|
663
|
-
"expected_response": eval_item.get("expected_response", ""),
|
|
664
|
-
"evaluators_ran": [],
|
|
665
|
-
"results": {},
|
|
666
|
-
"status": STATUS_ERROR,
|
|
667
|
-
"errorDetails": str(exc),
|
|
668
|
-
}
|
|
669
|
-
|
|
670
|
-
delay = retry_after if retry_after is not None else pipeline.get_backoff_seconds(attempt)
|
|
671
|
-
time.sleep(delay)
|
|
672
|
-
|
|
673
|
-
# Phase B: Evaluate response
|
|
674
|
-
return _evaluate_single_response(
|
|
675
|
-
response, eval_item, args,
|
|
676
|
-
pipeline.model_config, pipeline.has_azure_openai,
|
|
677
|
-
pipeline.default_evaluators,
|
|
678
|
-
)
|
|
679
|
-
|
|
680
|
-
def _process_multi_turn(eval_item: Dict, index: int) -> Dict[str, Any]:
|
|
681
|
-
turns = eval_item["turns"]
|
|
682
|
-
thread_name = eval_item.get("name", "Unnamed thread")
|
|
683
|
-
emit_structured_log(
|
|
684
|
-
"info",
|
|
685
|
-
f"Processing item {index + 1}/{total} (multi-turn: '{thread_name}').",
|
|
686
|
-
operation=Operation.SEND_PROMPT,
|
|
687
|
-
)
|
|
688
|
-
|
|
689
|
-
if len(turns) > LONG_THREAD_WARNING_THRESHOLD:
|
|
690
|
-
emit_structured_log(
|
|
691
|
-
"warning",
|
|
692
|
-
f"Thread '{thread_name}' has {len(turns)} turns (>{LONG_THREAD_WARNING_THRESHOLD}). This may take a while.",
|
|
693
|
-
operation=Operation.SEND_PROMPT,
|
|
694
|
-
)
|
|
695
|
-
|
|
696
|
-
# Phase A: Send each turn with throttle gate + 429-only retry
|
|
697
|
-
# Multi-turn only retries on 429 (server confirmed it didn't process
|
|
698
|
-
# the request). Other transient errors (503, 504) are ambiguous about
|
|
699
|
-
# whether the server processed the turn, risking duplicate turns in
|
|
700
|
-
# the conversation if retried.
|
|
701
|
-
conversation_context = None
|
|
702
|
-
conversation_id = None
|
|
703
|
-
enriched_turns: List[Dict[str, Any]] = []
|
|
704
|
-
failed = False
|
|
705
|
-
|
|
706
|
-
for i, turn in enumerate(turns):
|
|
707
|
-
prompt = turn["prompt"]
|
|
708
|
-
emit_structured_log(
|
|
709
|
-
"debug",
|
|
710
|
-
f"Sending turn {i + 1}/{len(turns)} of '{thread_name}'.",
|
|
711
|
-
operation=Operation.SEND_PROMPT,
|
|
712
|
-
)
|
|
713
|
-
|
|
714
|
-
response = None
|
|
715
|
-
for attempt in range(1, MAX_ATTEMPTS + 1):
|
|
716
|
-
pipeline.chat_gate.wait_if_blocked()
|
|
717
|
-
try:
|
|
718
|
-
response, conversation_context = pipeline.agent_client.send_prompt(
|
|
719
|
-
prompt, agent_id=args.m365_agent_id,
|
|
720
|
-
conversation_context=conversation_context,
|
|
721
|
-
)
|
|
722
|
-
break
|
|
723
|
-
except Exception as exc:
|
|
724
|
-
cause = exc.__cause__
|
|
725
|
-
status = int(getattr(cause, "code", 0) or 0) or None if cause else None
|
|
726
|
-
retry_after = get_retry_after_seconds(
|
|
727
|
-
cause.headers.get("Retry-After") if cause and getattr(cause, "headers", None) else None
|
|
728
|
-
)
|
|
729
|
-
|
|
730
|
-
# Only retry on 429 — server confirmed it didn't process the request
|
|
731
|
-
if status == 429 and attempt < MAX_ATTEMPTS:
|
|
732
|
-
if retry_after is not None:
|
|
733
|
-
pipeline.chat_gate.apply_retry_after(retry_after)
|
|
734
|
-
delay = retry_after if retry_after is not None else pipeline.get_backoff_seconds(attempt)
|
|
735
|
-
time.sleep(delay)
|
|
736
|
-
continue
|
|
737
|
-
|
|
738
|
-
# All other errors: stop the thread
|
|
739
|
-
emit_structured_log(
|
|
740
|
-
"error",
|
|
741
|
-
f"Turn {i + 1}/{len(turns)} of '{thread_name}' failed after {attempt} attempt(s): {exc}",
|
|
742
|
-
operation=Operation.SEND_PROMPT,
|
|
743
|
-
)
|
|
744
|
-
failed = True
|
|
745
|
-
break
|
|
746
|
-
|
|
747
|
-
if failed:
|
|
748
|
-
# Mark this turn and all remaining turns as error
|
|
749
|
-
enriched_turns.append({
|
|
750
|
-
**turn,
|
|
751
|
-
"response": "",
|
|
752
|
-
"status": STATUS_ERROR,
|
|
753
|
-
"error": "Failed to get response from agent",
|
|
754
|
-
})
|
|
755
|
-
for j in range(i + 1, len(turns)):
|
|
756
|
-
enriched_turns.append({
|
|
757
|
-
**turns[j],
|
|
758
|
-
"response": "",
|
|
759
|
-
"status": STATUS_ERROR,
|
|
760
|
-
"error": "Skipped: preceding turn failed",
|
|
761
|
-
})
|
|
762
|
-
break
|
|
763
|
-
|
|
764
|
-
# Enrich turn with response
|
|
765
|
-
response_text = get_response_text_for_evaluation(response)
|
|
766
|
-
enriched_turns.append({
|
|
767
|
-
**turn,
|
|
768
|
-
"response": response.get("display_response_text", response_text),
|
|
769
|
-
"_enhanced_response": response,
|
|
770
|
-
})
|
|
771
|
-
|
|
772
|
-
# Capture conversation_id from first response
|
|
773
|
-
if conversation_id is None:
|
|
774
|
-
conversation_id = response.get("metadata", {}).get("conversation_id")
|
|
775
|
-
|
|
776
|
-
# Phase B: Run per-turn evaluations
|
|
777
|
-
evaluated_turns, summary = _evaluate_multi_turn_responses(
|
|
778
|
-
enriched_turns, args, pipeline.default_evaluators,
|
|
779
|
-
model_config=pipeline.model_config,
|
|
780
|
-
has_azure_openai=pipeline.has_azure_openai,
|
|
781
|
-
)
|
|
782
|
-
|
|
783
|
-
return {
|
|
784
|
-
"type": "multi_turn",
|
|
785
|
-
"name": eval_item.get("name", ""),
|
|
786
|
-
"description": eval_item.get("description", ""),
|
|
787
|
-
"conversation_id": conversation_id or "",
|
|
788
|
-
"turns": evaluated_turns,
|
|
789
|
-
"summary": summary,
|
|
790
|
-
}
|
|
791
|
-
|
|
792
|
-
execution_results = execute_in_parallel(
|
|
793
|
-
eval_items, _process_item, max_workers=worker_count,
|
|
794
|
-
)
|
|
795
|
-
|
|
796
|
-
# Unwrap WorkerResult objects into plain dicts, with error fallback
|
|
797
|
-
ordered_results: List[Dict[str, Any]] = []
|
|
798
|
-
for wr in execution_results:
|
|
799
|
-
if wr.error:
|
|
800
|
-
idx = wr.index
|
|
801
|
-
item = eval_items[idx]
|
|
802
|
-
if item_types[idx] == ItemType.MULTI_TURN:
|
|
803
|
-
ordered_results.append({
|
|
804
|
-
"type": "multi_turn",
|
|
805
|
-
"name": item.get("name", ""),
|
|
806
|
-
"turns": [
|
|
807
|
-
{**t, "status": STATUS_ERROR, "error": str(wr.error), "response": "", "results": {}}
|
|
808
|
-
for t in item.get("turns", [])
|
|
809
|
-
],
|
|
810
|
-
"summary": {
|
|
811
|
-
"turns_total": len(item.get("turns", [])),
|
|
812
|
-
"turns_passed": 0,
|
|
813
|
-
"turns_failed": len(item.get("turns", [])),
|
|
814
|
-
"overall_status": STATUS_FAIL,
|
|
815
|
-
},
|
|
816
|
-
"error": str(wr.error),
|
|
817
|
-
})
|
|
818
|
-
else:
|
|
819
|
-
ordered_results.append({
|
|
820
|
-
"prompt": item.get("prompt", ""),
|
|
821
|
-
"response": "",
|
|
822
|
-
"expected_response": item.get("expected_response", ""),
|
|
823
|
-
"evaluators_ran": [],
|
|
824
|
-
"results": {},
|
|
825
|
-
"status": STATUS_ERROR,
|
|
826
|
-
"errorDetails": str(wr.error),
|
|
827
|
-
})
|
|
828
|
-
else:
|
|
829
|
-
ordered_results.append(wr.value)
|
|
830
|
-
|
|
831
|
-
return ordered_results
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
def write_results_to_console(results, agent_name: Optional[str] = None,
|
|
836
|
-
agent_id: Optional[str] = None,
|
|
837
|
-
cli_version: Optional[str] = None):
|
|
838
|
-
"""Write the response to console."""
|
|
839
|
-
# ANSI color codes
|
|
840
|
-
BOLD = '\033[1m'
|
|
841
|
-
BLUE = '\033[94m'
|
|
842
|
-
GREEN = '\033[92m'
|
|
843
|
-
YELLOW = '\033[93m'
|
|
844
|
-
CYAN = '\033[96m'
|
|
845
|
-
MAGENTA = '\033[95m'
|
|
846
|
-
ORANGE = '\033[38;5;208m'
|
|
847
|
-
RED = '\033[91m'
|
|
848
|
-
RESET = '\033[0m'
|
|
849
|
-
|
|
850
|
-
def _print_evaluated_item(response: str, expected_response: str,
|
|
851
|
-
evaluators_ran: List[str], item_results: Dict[str, Any],
|
|
852
|
-
error: Optional[str] = None) -> None:
|
|
853
|
-
"""Print the body of a single evaluated item (single-turn prompt or multi-turn turn).
|
|
854
|
-
|
|
855
|
-
The item header (Prompt X / Turn X) is printed by the caller; this helper
|
|
856
|
-
prints evaluators, response, expected response, error, and metrics.
|
|
857
|
-
"""
|
|
858
|
-
if evaluators_ran:
|
|
859
|
-
print(f"{BOLD}{CYAN}Evaluators:{RESET} {', '.join(evaluators_ran)}")
|
|
860
|
-
if response:
|
|
861
|
-
print(f"{BOLD}{CYAN}Response:{RESET} {response}")
|
|
862
|
-
if expected_response:
|
|
863
|
-
print(f"{BOLD}{YELLOW}Expected Response:{RESET} {expected_response}")
|
|
864
|
-
if error:
|
|
865
|
-
print(f"{BOLD}{RED}Error:{RESET} {error}")
|
|
866
|
-
|
|
867
|
-
for eval_name, v in item_results.items():
|
|
868
|
-
if v is None:
|
|
869
|
-
continue
|
|
870
|
-
display_name = pascal_case_to_title(eval_name)
|
|
871
|
-
if eval_name == RELEVANCE:
|
|
872
|
-
color = MAGENTA
|
|
873
|
-
elif eval_name == COHERENCE:
|
|
874
|
-
color = ORANGE
|
|
875
|
-
else:
|
|
876
|
-
color = BLUE
|
|
877
|
-
print(f"{BOLD}{color}{display_name}:{RESET} {json.dumps(v, indent=4)}")
|
|
878
|
-
|
|
879
|
-
# Show metadata
|
|
880
|
-
metadata_parts = []
|
|
881
|
-
if agent_name:
|
|
882
|
-
metadata_parts.append(f"Agent Name: {agent_name}")
|
|
883
|
-
if agent_id:
|
|
884
|
-
metadata_parts.append(f"Agent ID: {agent_id}")
|
|
885
|
-
if cli_version:
|
|
886
|
-
metadata_parts.append(f"CLI Version: {cli_version}")
|
|
887
|
-
if metadata_parts:
|
|
888
|
-
print(f"{BOLD}{CYAN}{' | '.join(metadata_parts)}{RESET}")
|
|
889
|
-
print()
|
|
890
|
-
|
|
891
|
-
aggregates = calculate_aggregate_statistics(results)
|
|
892
|
-
if aggregates:
|
|
893
|
-
total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
|
|
894
|
-
if total_items > 1:
|
|
895
|
-
print(f"{BOLD}{BLUE}Aggregate Statistics ({total_items} prompts):{RESET}")
|
|
896
|
-
print(f"{BLUE}{'=' * 60}{RESET}")
|
|
897
|
-
|
|
898
|
-
for metric_name, stats in aggregates.items():
|
|
899
|
-
pass_color = GREEN if stats['pass_rate'] >= 80 else YELLOW if stats['pass_rate'] >= 60 else RED
|
|
900
|
-
prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
|
|
901
|
-
total_prompts = stats.get('total_prompts', total_items)
|
|
902
|
-
print(f"{BOLD}{CYAN}{metric_name}:{RESET} ({prompts_evaluated}/{total_prompts} prompts)")
|
|
903
|
-
print(f" Pass Rate: {pass_color}{stats['pass_rate']:.1f}%{RESET} ({stats['pass_count']}/{stats['total_evaluated']} passed)")
|
|
904
|
-
print(f" Avg Score: {MAGENTA}{stats['avg_score']:.2f}{RESET}")
|
|
905
|
-
if stats.get('threshold') is not None:
|
|
906
|
-
print(f" Threshold: {YELLOW}{stats['threshold']}{RESET}")
|
|
907
|
-
print()
|
|
908
|
-
|
|
909
|
-
print(f"{BLUE}{'=' * 60}{RESET}")
|
|
910
|
-
print()
|
|
911
|
-
|
|
912
|
-
print(f"{BOLD}{BLUE}Individual Results:{RESET}")
|
|
913
|
-
print(f"{BLUE}{'=' * 50}{RESET}")
|
|
914
|
-
for i, result in enumerate(results, 1):
|
|
915
|
-
if result.get("type") == "multi_turn":
|
|
916
|
-
thread_name = result.get("name", "Unnamed Thread")
|
|
917
|
-
summary = result.get("summary", {})
|
|
918
|
-
status = summary.get("overall_status", STATUS_UNKNOWN)
|
|
919
|
-
status_color = GREEN if status == STATUS_PASS else YELLOW if status == STATUS_PARTIAL else RED
|
|
920
|
-
|
|
921
|
-
print(f"{BOLD}{MAGENTA}Thread {i}: {thread_name}{RESET}")
|
|
922
|
-
for t_idx, turn in enumerate(result.get("turns", []), 1):
|
|
923
|
-
turn_status = turn.get("status", STATUS_UNKNOWN)
|
|
924
|
-
turn_color = GREEN if turn_status == STATUS_PASS else RED if turn_status in (STATUS_FAIL, STATUS_ERROR) else YELLOW
|
|
925
|
-
print(f"{BOLD}{turn_color}Turn {t_idx}:{RESET} [{turn_status}] {turn.get('prompt', '')}")
|
|
926
|
-
_print_evaluated_item(
|
|
927
|
-
response=turn.get("response", ""),
|
|
928
|
-
expected_response=turn.get("expected_response", ""),
|
|
929
|
-
evaluators_ran=turn.get("evaluators_ran", []),
|
|
930
|
-
item_results=turn.get("results", {}),
|
|
931
|
-
error=turn.get("error"),
|
|
932
|
-
)
|
|
933
|
-
print()
|
|
934
|
-
print(f"{BOLD}{MAGENTA}Thread {i} Summary:{RESET}")
|
|
935
|
-
print(f" Status: {status_color}{status.upper()}{RESET}")
|
|
936
|
-
print(f" Turns passed: {status_color}{summary.get('turns_passed', 0)}/{summary.get('turns_total', 0)}{RESET}")
|
|
937
|
-
print(f"{BLUE}{'-' * 30}{RESET}")
|
|
938
|
-
else:
|
|
939
|
-
print(f"{BOLD}{GREEN}Prompt {i}:{RESET} {result['prompt']}")
|
|
940
|
-
_print_evaluated_item(
|
|
941
|
-
response=result.get('response', ''),
|
|
942
|
-
expected_response=result.get('expected_response', ''),
|
|
943
|
-
evaluators_ran=result.get('evaluators_ran', []),
|
|
944
|
-
item_results=result.get('results', {}),
|
|
945
|
-
error=result.get('errorDetails'),
|
|
946
|
-
)
|
|
947
|
-
print(f"{BLUE}{'-' * 30}{RESET}")
|
|
948
|
-
|
|
949
|
-
def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
|
|
950
|
-
"""Extract an EvalScore object from a decorated metric dict.
|
|
951
|
-
|
|
952
|
-
Maps internal decorated-metric format to schema EvalScore:
|
|
953
|
-
{score, result, threshold} (required) + reason, evaluator (optional).
|
|
954
|
-
"""
|
|
955
|
-
DEFAULT_THRESHOLD = 3 # fallback; decorate_metric should always set this
|
|
956
|
-
|
|
957
|
-
score_val = None
|
|
958
|
-
if metric_id in data and isinstance(data[metric_id], (int, float)):
|
|
959
|
-
score_val = data[metric_id]
|
|
960
|
-
if score_val is None:
|
|
961
|
-
return None
|
|
962
|
-
|
|
963
|
-
result = data.get("result")
|
|
964
|
-
if result not in (STATUS_PASS, STATUS_FAIL):
|
|
965
|
-
result = STATUS_PASS if score_val >= data.get("threshold", DEFAULT_THRESHOLD) else STATUS_FAIL
|
|
966
|
-
|
|
967
|
-
eval_score: Dict[str, Any] = {
|
|
968
|
-
"score": score_val,
|
|
969
|
-
"result": result,
|
|
970
|
-
"threshold": data.get("threshold", DEFAULT_THRESHOLD),
|
|
971
|
-
}
|
|
972
|
-
reason = data.get(f"{metric_id}_reason") or data.get("reason")
|
|
973
|
-
if reason:
|
|
974
|
-
eval_score["reason"] = reason
|
|
975
|
-
return eval_score
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
def _convert_scores_to_schema(results_dict: Dict[str, Any]) -> Dict[str, Any]:
|
|
979
|
-
"""Convert raw evaluator results to schema-compliant score objects.
|
|
980
|
-
|
|
981
|
-
Evaluator results in results_dict are dicts (from _decorate_metric) or
|
|
982
|
-
None when skipped/crashed. None values are omitted from output.
|
|
983
|
-
"""
|
|
984
|
-
scores: Dict[str, Any] = {}
|
|
985
|
-
|
|
986
|
-
for eval_key, schema_key in [
|
|
987
|
-
(RELEVANCE, "relevance"),
|
|
988
|
-
(COHERENCE, "coherence"),
|
|
989
|
-
(GROUNDEDNESS, "groundedness"),
|
|
990
|
-
(TOOL_CALL_ACCURACY, "toolCallAccuracy"),
|
|
991
|
-
]:
|
|
992
|
-
data = results_dict.get(eval_key)
|
|
993
|
-
if data is None:
|
|
994
|
-
continue
|
|
995
|
-
eval_score = extract_eval_score(data, METRIC_IDS[eval_key])
|
|
996
|
-
if eval_score:
|
|
997
|
-
scores[schema_key] = eval_score
|
|
998
|
-
|
|
999
|
-
data = results_dict.get(CITATIONS)
|
|
1000
|
-
if data is not None:
|
|
1001
|
-
count = data.get("citations", 0)
|
|
1002
|
-
cit_result = data.get("result")
|
|
1003
|
-
if cit_result not in (STATUS_PASS, STATUS_FAIL):
|
|
1004
|
-
cit_result = STATUS_PASS if count >= data.get("threshold", 1) else STATUS_FAIL
|
|
1005
|
-
citation_score: Dict[str, Any] = {
|
|
1006
|
-
"count": count,
|
|
1007
|
-
"result": cit_result,
|
|
1008
|
-
"threshold": data.get("threshold", 1),
|
|
1009
|
-
}
|
|
1010
|
-
if "citation_format" in data:
|
|
1011
|
-
citation_score["format"] = data["citation_format"]
|
|
1012
|
-
scores["citations"] = citation_score
|
|
1013
|
-
|
|
1014
|
-
data = results_dict.get(EXACT_MATCH)
|
|
1015
|
-
if data is not None:
|
|
1016
|
-
is_match = data.get("exact_match", 0.0) == 1.0
|
|
1017
|
-
scores["exactMatch"] = {
|
|
1018
|
-
"match": is_match,
|
|
1019
|
-
"result": data.get("result", STATUS_PASS if is_match else STATUS_FAIL),
|
|
1020
|
-
"reason": data.get("exact_match_reason", ""),
|
|
1021
|
-
}
|
|
1022
|
-
|
|
1023
|
-
data = results_dict.get(PARTIAL_MATCH)
|
|
1024
|
-
if data is not None:
|
|
1025
|
-
scores["partialMatch"] = {
|
|
1026
|
-
"score": data.get("partial_match", 0.0),
|
|
1027
|
-
"result": data.get("result", STATUS_FAIL),
|
|
1028
|
-
"threshold": data.get("threshold", 0.5),
|
|
1029
|
-
"reason": data.get("partial_match_reason", ""),
|
|
1030
|
-
}
|
|
1031
|
-
|
|
1032
|
-
return scores
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
def convert_result_to_eval_item(result: Dict) -> Dict:
|
|
1036
|
-
"""Convert an internal evaluation result dict to a schema-compliant EvalItem."""
|
|
1037
|
-
item: Dict[str, Any] = {
|
|
1038
|
-
"prompt": result["prompt"],
|
|
1039
|
-
"response": result["response"],
|
|
1040
|
-
"expected_response": result["expected_response"],
|
|
1041
|
-
}
|
|
1042
|
-
|
|
1043
|
-
if "evaluators" in result:
|
|
1044
|
-
item["evaluators"] = result["evaluators"]
|
|
1045
|
-
if "evaluators_mode" in result:
|
|
1046
|
-
item["evaluators_mode"] = result["evaluators_mode"]
|
|
1047
|
-
|
|
1048
|
-
scores = _convert_scores_to_schema(result.get("results", {}))
|
|
1049
|
-
if scores:
|
|
1050
|
-
item["scores"] = scores
|
|
1051
|
-
|
|
1052
|
-
return item
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
def convert_thread_result_to_output(thread_result: Dict) -> Dict:
|
|
1056
|
-
"""Convert a multi-turn thread result to the output format."""
|
|
1057
|
-
output_turns = []
|
|
1058
|
-
for turn in thread_result.get("turns", []):
|
|
1059
|
-
output_turn: Dict[str, Any] = {"prompt": turn.get("prompt", "")}
|
|
1060
|
-
if "expected_response" in turn:
|
|
1061
|
-
output_turn["expected_response"] = turn["expected_response"]
|
|
1062
|
-
if "response" in turn:
|
|
1063
|
-
output_turn["response"] = turn["response"]
|
|
1064
|
-
if "status" in turn:
|
|
1065
|
-
output_turn["status"] = turn["status"]
|
|
1066
|
-
if "error" in turn:
|
|
1067
|
-
output_turn["error"] = turn["error"]
|
|
1068
|
-
if "evaluators" in turn:
|
|
1069
|
-
output_turn["evaluators"] = turn["evaluators"]
|
|
1070
|
-
if "evaluators_mode" in turn:
|
|
1071
|
-
output_turn["evaluators_mode"] = turn["evaluators_mode"]
|
|
1072
|
-
|
|
1073
|
-
scores = _convert_scores_to_schema(turn.get("results", {}))
|
|
1074
|
-
if scores:
|
|
1075
|
-
output_turn["scores"] = scores
|
|
1076
|
-
|
|
1077
|
-
output_turns.append(output_turn)
|
|
1078
|
-
|
|
1079
|
-
output: Dict[str, Any] = {}
|
|
1080
|
-
if thread_result.get("name"):
|
|
1081
|
-
output["name"] = thread_result["name"]
|
|
1082
|
-
if thread_result.get("description"):
|
|
1083
|
-
output["description"] = thread_result["description"]
|
|
1084
|
-
if thread_result.get("conversation_id"):
|
|
1085
|
-
output["conversation_id"] = thread_result["conversation_id"]
|
|
1086
|
-
output["turns"] = output_turns
|
|
1087
|
-
if thread_result.get("summary"):
|
|
1088
|
-
output["summary"] = thread_result["summary"]
|
|
1089
|
-
|
|
1090
|
-
return output
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
def convert_result_to_output_item(result: Dict) -> Dict:
|
|
1094
|
-
"""Convert an internal result dict to an output item. Routes by type."""
|
|
1095
|
-
if result.get("type") == "multi_turn":
|
|
1096
|
-
return convert_thread_result_to_output(result)
|
|
1097
|
-
return convert_result_to_eval_item(result)
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None,
|
|
1101
|
-
default_evaluators: Optional[Dict[str, Any]] = None,
|
|
1102
|
-
agent_name: Optional[str] = None,
|
|
1103
|
-
cli_version: Optional[str] = None):
|
|
1104
|
-
"""Write results to a schema-compliant eval document JSON file.
|
|
1105
|
-
|
|
1106
|
-
Output follows the eval-document.schema.json format:
|
|
1107
|
-
{schemaVersion, metadata, default_evaluators?, items: [EvalItem]}
|
|
1108
|
-
"""
|
|
1109
|
-
try:
|
|
1110
|
-
try:
|
|
1111
|
-
current_version = SchemaVersionManager().get_current_version()
|
|
1112
|
-
except Exception:
|
|
1113
|
-
current_version = "1.0.0"
|
|
1114
|
-
|
|
1115
|
-
items = [convert_result_to_output_item(r) for r in results]
|
|
1116
|
-
|
|
1117
|
-
metadata: Dict[str, Any] = {
|
|
1118
|
-
"evaluatedAt": datetime.now(timezone.utc).isoformat(),
|
|
1119
|
-
}
|
|
1120
|
-
if agent_id:
|
|
1121
|
-
metadata["agentId"] = agent_id
|
|
1122
|
-
if agent_name:
|
|
1123
|
-
metadata["agentName"] = agent_name
|
|
1124
|
-
if cli_version:
|
|
1125
|
-
metadata["cliVersion"] = cli_version
|
|
1126
|
-
|
|
1127
|
-
output_data: Dict[str, Any] = {
|
|
1128
|
-
"schemaVersion": current_version,
|
|
1129
|
-
"metadata": metadata,
|
|
1130
|
-
}
|
|
1131
|
-
|
|
1132
|
-
if default_evaluators is not None:
|
|
1133
|
-
output_data["default_evaluators"] = default_evaluators
|
|
1134
|
-
|
|
1135
|
-
output_data["items"] = items
|
|
1136
|
-
|
|
1137
|
-
with open(output_file, 'w', encoding='utf-8') as f:
|
|
1138
|
-
json.dump(output_data, f, indent=2, ensure_ascii=False)
|
|
1139
|
-
emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
|
|
1140
|
-
except Exception as e:
|
|
1141
|
-
emit_structured_log("error", f"Error writing to JSON file: {e}", operation=Operation.WRITE_OUTPUT)
|
|
1142
|
-
sys.exit(1)
|
|
1143
|
-
|
|
1144
|
-
def _results_to_csv_json(results_dict: Dict) -> str:
|
|
1145
|
-
"""Serialize evaluator results dict to a CSV-safe JSON string.
|
|
1146
|
-
|
|
1147
|
-
Skips None (crashed/skipped evaluators). Results are dicts produced
|
|
1148
|
-
by _decorate_metric.
|
|
1149
|
-
"""
|
|
1150
|
-
if not results_dict:
|
|
1151
|
-
return ""
|
|
1152
|
-
non_null = {k: v for k, v in results_dict.items() if v is not None}
|
|
1153
|
-
-    return json.dumps(non_null) if non_null else ""
-
-
-def write_results_to_csv(results: List[Dict], output_file: str,
-                         agent_name: Optional[str] = None, agent_id: Optional[str] = None,
-                         cli_version: Optional[str] = None):
-    """Write results to CSV file."""
-    try:
-        with open(output_file, 'w', newline='', encoding='utf-8') as f:
-            if results:
-                metadata_parts = []
-                if agent_name:
-                    metadata_parts.append(f"Agent Name: {agent_name}")
-                if agent_id:
-                    metadata_parts.append(f"Agent ID: {agent_id}")
-                if cli_version:
-                    metadata_parts.append(f"CLI Version: {cli_version}")
-                if metadata_parts:
-                    f.write(f"# {' | '.join(metadata_parts)}\n")
-
-                aggregates = calculate_aggregate_statistics(results)
-                if aggregates:
-                    total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
-                    if total_items > 1:
-                        f.write("# AGGREGATE STATISTICS\n")
-                        f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
-                        for metric_name, stats in aggregates.items():
-                            threshold_str = str(stats.get('threshold', 'N/A'))
-                            prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
-                            total_prompts = stats.get('total_prompts', total_items)
-                            f.write(f"{metric_name},{prompts_evaluated},{total_prompts},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
-                        f.write("\n# INDIVIDUAL RESULTS\n")
-
-            single_turn_rows = []
-            multi_turn_rows = []
-            for result in results:
-                if result.get("type") == "multi_turn":
-                    thread_name = result.get("name", "")
-                    for turn_idx, turn in enumerate(result.get("turns", [])):
-                        multi_turn_rows.append({
-                            "thread_name": thread_name,
-                            "turn_index": turn_idx + 1,
-                            "prompt": turn.get("prompt", ""),
-                            "response": turn.get("response", ""),
-                            "expected_response": turn.get("expected_response", ""),
-                            "status": turn.get("status", ""),
-                            "error": turn.get("error", ""),
-                            "scores": _results_to_csv_json(turn.get("results", {})),
-                        })
-                    summary = result.get("summary", {})
-                    multi_turn_rows.append({
-                        "thread_name": thread_name,
-                        "turn_index": "summary",
-                        "prompt": "",
-                        "response": "",
-                        "expected_response": "",
-                        "status": summary.get("overall_status", ""),
-                        "scores": f"{summary.get('turns_passed', 0)}/{summary.get('turns_total', 0)} turns passed",
-                    })
-                else:
-                    exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode', '_enhanced_response', 'results'}
-                    row = {k: v for k, v in result.items() if k not in exclude_keys}
-                    if "results" in result:
-                        row["scores"] = _results_to_csv_json(result["results"])
-                    single_turn_rows.append(row)
-
-            if single_turn_rows:
-                if multi_turn_rows:
-                    f.write("# SINGLE-TURN RESULTS\n")
-                fieldnames = list(single_turn_rows[0].keys())
-                for row in single_turn_rows:
-                    for k in row:
-                        if k not in fieldnames:
-                            fieldnames.append(k)
-                writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
-                writer.writeheader()
-                writer.writerows(single_turn_rows)
-
-            if multi_turn_rows:
-                if single_turn_rows:
-                    f.write("\n")
-                f.write("# MULTI-TURN RESULTS\n")
-                fieldnames = ["thread_name", "turn_index", "prompt", "response", "expected_response", "status", "error", "scores"]
-                writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
-                writer.writeheader()
-                writer.writerows(multi_turn_rows)
-        emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
-    except Exception as e:
-        emit_structured_log("error", f"Error writing to CSV file: {e}", operation=Operation.WRITE_OUTPUT)
-        sys.exit(1)
-
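The removed writer's signature is small enough to read off directly; a hypothetical call, with every value below purely illustrative, looks like:

write_results_to_csv(
    results=[{"prompt": "What is Microsoft Graph?",   # item shape is illustrative
              "status": "pass",
              "results": {"relevance": 4.0}}],
    output_file="results.csv",
    agent_name="Example Agent",                       # illustrative metadata
    agent_id="example.declarativeAgent",              # illustrative metadata
    cli_version="1.6.0-preview.1",
)

Metadata becomes a leading `#` comment row, the aggregate block is emitted only when more than one prompt was evaluated, and single-turn and multi-turn rows go into separate CSV sections.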
-def normalize_agent_id(agent_id):
-    """Append '.declarativeAgent' if agent_id has no '.', else return unchanged.
-
-    Returns the input unchanged when it is None/empty or already contains a dot.
-    """
-    if not agent_id:
-        return agent_id
-    return agent_id if '.' in agent_id else f"{agent_id}.declarativeAgent"
-
-
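The removed normalize_agent_id is self-contained, so its contract can be pinned down exactly; each assertion below follows directly from its body (the agent IDs are made up):

assert normalize_agent_id("contoso-hr") == "contoso-hr.declarativeAgent"
assert normalize_agent_id("contoso-hr.declarativeAgent") == "contoso-hr.declarativeAgent"
assert normalize_agent_id("") == ""        # falsy input passes through unchanged
assert normalize_agent_id(None) is None    # including None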
-def parse_arguments():
-    """Parse command line arguments."""
-    parser = argparse.ArgumentParser(
-        description="M365 Copilot Agent Evaluation CLI",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-Examples:
-  # Run with default prompts
-  python main.py
-
-  # Run with custom prompts
-  python main.py --prompts "What is Microsoft Graph?" --expected "Microsoft Graph is a gateway..."
-
-  # Run with prompts from file
-  python main.py --prompts-file prompts.json
-
-  # Interactive mode
-  python main.py --interactive
-
-  # Save results to JSON
-  python main.py --output results.json
-
-  # Save results to CSV
-  python main.py --output results.csv
-
-  # Save results to HTML and open in browser
-  python main.py --output report.html
-
-  # Debug-level diagnostics
-  python main.py --log-level debug
-
-  # Sign out and clear cached authentication tokens
-  python main.py --signout
-        """
-    )
-
-    # Input options (mutually exclusive)
-    input_group = parser.add_mutually_exclusive_group()
-    input_group.add_argument(
-        '--prompts',
-        nargs='+',
-        help='List of prompts to evaluate'
-    )
-    input_group.add_argument(
-        '--prompts-file',
-        type=str,
-        help='JSON file containing prompts and expected responses'
-    )
-    input_group.add_argument(
-        '--interactive',
-        action='store_true',
-        help='Interactive mode to enter prompts'
-    )
-
-    # Expected responses (only used with --prompts)
-    parser.add_argument(
-        '--expected',
-        nargs='+',
-        help='List of expected responses (must match number of prompts)'
-    )
-
-    # Agent ID (--m365-agent-id is primary, --agent-id kept for backward compatibility)
-    parser.add_argument(
-        '--m365-agent-id', '--agent-id',
-        type=str,
-        default=os.environ.get("M365_AGENT_ID") or os.environ.get("AGENT_ID"),
-        help='Agent ID (default from M365_AGENT_ID environment variable)'
-    )
-
-    # Output options
-    parser.add_argument(
-        '--output',
-        type=str,
-        help='Output file path. Format is determined by file extension: .json, .csv, .html. If not provided, results are printed to console.'
-    )
-
-    # Behavior options
-    parser.add_argument(
-        '--log-level',
-        nargs='?',
-        const='info',
-        action='append',
-        help='Set log verbosity: debug, info, warning, error. Bare --log-level resolves to info.'
-    )
-
-    parser.add_argument(
-        '--signout',
-        action='store_true',
-        help='Sign out and clear cached authentication tokens'
-    )
-
-    parser.add_argument(
-        '--concurrency',
-        type=int,
-        default=5,
-        help='Number of parallel workers for prompt processing (1-5, default: 5)'
-    )
-
-    args = parser.parse_args()
-
-    args.m365_agent_id = normalize_agent_id(args.m365_agent_id)
-
-    if args.concurrency < 1:
-        parser.error('--concurrency must be an integer >= 1.')
-    if args.concurrency > MAX_CONCURRENCY:
-        emit_structured_log(
-            "warning",
-            f"--concurrency {args.concurrency} exceeds max {MAX_CONCURRENCY}; clamping to {MAX_CONCURRENCY}.",
-            operation=Operation.SETUP,
-        )
-        args.concurrency = MAX_CONCURRENCY
-
-    return args
-
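One subtlety in the removed parser deserves a callout: --log-level combines nargs='?', const='info', and action='append', so the parsed attribute is a list (or None), not a plain string. A standalone sketch of that argparse behavior:

import argparse

p = argparse.ArgumentParser()
p.add_argument('--log-level', nargs='?', const='info', action='append')

print(p.parse_args([]).log_level)                        # None (flag absent)
print(p.parse_args(['--log-level']).log_level)           # ['info'] (bare flag uses const)
print(p.parse_args(['--log-level', 'debug']).log_level)  # ['debug']

Downstream code presumably reduces this list to the single effective level that main() later logs.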
-def validate_environment() -> CallPath:
-    """Validate required environment variables."""
-    required_env_vars = [
-        "AZURE_AI_OPENAI_ENDPOINT",
-        "AZURE_AI_API_KEY",
-        "AZURE_AI_API_VERSION",
-        "AZURE_AI_MODEL_NAME",
-    ]
-
-    if os.environ.get("COPILOT_API_ACCESS_TOKEN"):
-        call_path = CallPath.ACCESS_TOKEN
-        required_env_vars.extend([
-            "COPILOT_API_ACCESS_TOKEN",
-            "COPILOT_API_ENDPOINT",
-            "X_SCENARIO_HEADER",
-        ])
-    elif os.environ.get("WORK_IQ_A2A_ENDPOINT"):
-        call_path = CallPath.A2A
-        required_env_vars.extend([
-            "WORK_IQ_A2A_ENDPOINT",
-            "WORK_IQ_A2A_CLIENT_ID",
-            "TENANT_ID",
-        ])
-    else:
-        call_path = CallPath.COPILOT_AUTH
-        required_env_vars.extend([
-            "COPILOT_API_ENDPOINT",
-            "X_SCENARIO_HEADER",
-            "M365_EVAL_CLIENT_ID",
-            "TENANT_ID",
-        ])
-
-    missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
-    if missing_vars:
-        emit_structured_log(
-            "error",
-            "Missing required environment variables: "
-            f"{', '.join(missing_vars)}. Please ensure your .env file contains "
-            "all required Azure configuration.",
-            operation=Operation.VALIDATE_ENV,
-        )
-        sys.exit(1)
-    return call_path
-
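Read the other way around, the minimal .env for the A2A path consists of exactly the names checked above; every value in this sketch is a placeholder:

AZURE_AI_OPENAI_ENDPOINT=https://example.openai.azure.com
AZURE_AI_API_KEY=<api-key>
AZURE_AI_API_VERSION=<api-version>
AZURE_AI_MODEL_NAME=<deployment-name>
WORK_IQ_A2A_ENDPOINT=https://example-a2a-endpoint
WORK_IQ_A2A_CLIENT_ID=<entra-app-client-id>
TENANT_ID=<tenant-guid>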
-def validate_endpoint_url(url: str, allowed_domains: List[str]) -> bool:
-    """Validate URL against security requirements."""
-    try:
-        parsed = urllib.parse.urlparse(url)
-
-        # Check for dangerous schemes
-        if parsed.scheme in ['javascript', 'data']:
-            raise ValueError(f"Dangerous URL scheme detected: {parsed.scheme}")
-
-        # Check for HTTPS requirement
-        if parsed.scheme != 'https':
-            raise ValueError(f"Only HTTPS URLs are allowed, got: {parsed.scheme}")
-
-        # Check if domain is in allowed list
-        if parsed.netloc not in allowed_domains:
-            raise ValueError(f"Domain not in allowed list: {parsed.netloc}")
-
-        # Reject fragment URLs
-        if parsed.fragment:
-            raise ValueError("Fragment URLs are not allowed")
-
-        return True
-
-    except ValueError:
-        # Re-raise ValueError exceptions
-        raise
-    except Exception as e:
-        # Convert other parsing errors to ValueError
-        raise ValueError(f"Invalid URL format: {url}") from e
-
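The checks above compose into a simple accept/reject contract; a self-contained illustration (the allow-list and URLs are made up):

allowed = ["copilot.example.com"]

assert validate_endpoint_url("https://copilot.example.com/api", allowed) is True

for bad_url in ("http://copilot.example.com/api",        # not HTTPS
                "https://attacker.example.net/api",       # host not allow-listed
                "https://copilot.example.com/api#frag"):  # fragment present
    try:
        validate_endpoint_url(bad_url, allowed)
    except ValueError as exc:
        print(f"rejected {bad_url}: {exc}")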
-def get_prompt_datasets(args) -> Tuple[List[Dict], Optional[Dict]]:
-    """Get prompts and expected responses based on command line arguments.
-
-    Returns:
-        Tuple of (eval_items, default_evaluators).
-    """
-    if args.prompts:
-        if args.expected and len(args.prompts) != len(args.expected):
-            emit_structured_log(
-                "error",
-                "Number of prompts must match number of expected responses. "
-                "Update --expected values to match the prompt count.",
-            )
-            sys.exit(1)
-        expected_responses = args.expected or [""] * len(args.prompts)
-        eval_items = [
-            {"prompt": p, "expected_response": e}
-            for p, e in zip(args.prompts, expected_responses)
-        ]
-        return eval_items, None
-    elif args.prompts_file:
-        return load_prompts_from_file(args.prompts_file)
-    elif args.interactive:
-        prompts, expected_responses = get_interactive_prompts()
-        eval_items = [
-            {"prompt": p, "expected_response": e}
-            for p, e in zip(prompts, expected_responses)
-        ]
-        return eval_items, None
-    else:
-        prompts, expected_responses = get_default_prompts_and_responses()
-        eval_items = [
-            {"prompt": p, "expected_response": e}
-            for p, e in zip(prompts, expected_responses)
-        ]
-        return eval_items, None
-
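Three of the four branches build single-turn items of one fixed shape; only load_prompts_from_file can also return file-level default_evaluators and multi-turn items. A sketch of the item shapes, with illustrative strings:

# Single-turn item, as built by the --prompts / --interactive / default branches:
item = {"prompt": "What is Microsoft Graph?",
        "expected_response": "Microsoft Graph is a gateway..."}

# Multi-turn items (from --prompts-file only) carry a "turns" list instead,
# which is what the later `"turns" in item` check in main() keys off.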
-def select_agent_interactively(agents: List[Dict[str, Any]]) -> Tuple[Optional[str], Optional[str]]:
-    """
-    Display an interactive agent selector using questionary.
-
-    Args:
-        agents: List of agent dictionaries.
-
-    Returns:
-        Tuple of (agent_id, agent_name) or (None, None) if cancelled/skipped
-    """
-    if not agents:
-        return None, None
-
-    # Build id→name lookup and choices
-    id_to_name: Dict[str, str] = {}
-    choices = []
-    sorted_agents = sorted(agents, key=lambda x: (not x.get('isOwner', False), x.get('name', '')))
-    for agent in sorted_agents:
-        agent_id = agent.get("gptId", "Unknown")
-        agent_name = agent.get("name", "Unknown")
-        agent_description = agent.get("description", "Unknown")
-        agent_is_owner = agent.get('isOwner')
-        id_to_name[agent_id] = agent_name
-
-        # Format the display text
-        display_text = f"{agent_name} ({agent_id}, IsOwner: {agent_is_owner}) - {agent_description}"
-
-        choices.append(questionary.Choice(title=display_text, value=agent_id))
-
-    # Display the selection prompt
-    selected_agent = questionary.select(
-        "Select an agent to evaluate:",
-        choices=choices,
-        use_shortcuts=True,
-        use_arrow_keys=True
-    ).ask()
-
-    return selected_agent, id_to_name.get(selected_agent) if selected_agent else None
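The .get() calls above imply the agent-record shape this selector consumes; a sketch with invented values:

agents = [
    {"gptId": "hr-helper", "name": "HR Helper",
     "description": "Answers HR policy questions", "isOwner": True},
    {"gptId": "it-bot", "name": "IT Bot",
     "description": "IT support triage", "isOwner": False},
]
agent_id, agent_name = select_agent_interactively(agents)  # opens the questionary prompt
# Owned agents sort first, then alphabetically by name; cancelling yields (None, None).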
+from dataclasses import replace


-def output_results(results: List[Dict], args, default_evaluators: Optional[Dict[str, Any]] = None,
-                   agent_name: Optional[str] = None, cli_version: Optional[str] = None):
-    """Output results based on specified format."""
-    metadata_kwargs = dict(
-        agent_name=agent_name,
-        agent_id=getattr(args, 'm365_agent_id', None),
-        cli_version=cli_version,
-    )
-    if args.output:
-        output_lower = args.output.lower()
-        if output_lower.endswith('.json'):
-            write_results_to_json(results, args.output, default_evaluators=default_evaluators,
-                                  **metadata_kwargs)
-        elif output_lower.endswith('.csv'):
-            write_results_to_csv(results, args.output, **metadata_kwargs)
-        elif output_lower.endswith('.html'):
-            write_results_to_html(results, args.output, **metadata_kwargs)
-            abs_path = os.path.abspath(args.output)
-            webbrowser.open(f'file://{abs_path}')
-        else:
-            write_results_to_json(results, args.output, default_evaluators=default_evaluators,
-                                  **metadata_kwargs)
-    else:
-        write_results_to_console(results, **metadata_kwargs)
-
 def main():
     """Main function to orchestrate the evaluation process."""
     load_dotenv()

@@ -1555,136 +67,78 @@ def main():
         )
         sys.exit(2)

-
-
-
+    config = replace(
+        RunConfig.from_namespace(args),
+        effective_log_level=effective_log_level,
+    )
+    configure_cli_logging(config.effective_log_level)
+    emit_structured_log("info", f"Log level set to '{config.effective_log_level}'.", operation=Operation.SETUP)

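The rewritten main() threads an immutable run configuration through the run and derives updated copies with dataclasses.replace instead of mutating state. In miniature, with a toy stand-in for RunConfig (whose real definition is not part of this diff):

from dataclasses import dataclass, replace
from typing import Optional

@dataclass(frozen=True)
class ToyConfig:  # stand-in for RunConfig
    effective_log_level: str = "warning"
    m365_agent_id: Optional[str] = None

base = ToyConfig()
updated = replace(base, effective_log_level="info")  # new instance; base is untouched
print(base.effective_log_level, updated.effective_log_level)  # warning info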
     # Check minimum version before proceeding
-    quiet_for_version = effective_log_level in ("warning", "error")
+    quiet_for_version = config.effective_log_level in ("warning", "error")
     cli_version = get_cli_version(quiet=quiet_for_version)
-    if not should_bypass_min_version_check(
+    if not should_bypass_min_version_check(config) and not check_min_version(cli_version, quiet=quiet_for_version):
         sys.exit(1)

     # Validate environment variables required for evaluation
-
-
-    user_oid = ""
+    validate_environment()

-
-
-    access_token = os.environ["COPILOT_API_ACCESS_TOKEN"]
-    user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)
-    copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
-    validate_endpoint_url(copilot_api_endpoint, ALLOWED_ENDPOINTS)
-    agent_client = SydneyClient(
-        copilot_api_endpoint=copilot_api_endpoint,
-        access_token=access_token,
-        user_oid=user_oid,
-        logger=CLI_LOGGER,
-        diagnostic_records=DIAGNOSTIC_RECORDS,
-    )
+    a2a_endpoint = os.environ[ENV_WORK_IQ_A2A_ENDPOINT]
+    validate_endpoint_url(a2a_endpoint, ALLOWED_ENDPOINTS)

-
+    a2a_scopes_str = os.environ.get(ENV_WORK_IQ_A2A_SCOPES, "")
+    a2a_auth_handler = AuthHandler(
+        client_id=os.environ[ENV_WORK_IQ_A2A_CLIENT_ID],
+        tenant_id=os.environ[ENV_TENANT_ID],
+        scopes_str=a2a_scopes_str,
+    )
+    if config.signout:
+        try:
+            a2a_auth_handler.clear_cache()
+        except Exception as e:
             emit_structured_log(
-                "
-                "
-                operation=Operation.
-            )
-    a2a_endpoint = os.environ["WORK_IQ_A2A_ENDPOINT"]
-    validate_endpoint_url(a2a_endpoint, ALLOWED_ENDPOINTS)
-
-    a2a_scopes_str = os.environ.get("WORK_IQ_A2A_SCOPES", "")
-    a2a_auth_handler = AuthHandler(
-        client_id=os.environ["WORK_IQ_A2A_CLIENT_ID"],
-        tenant_id=os.environ["TENANT_ID"],
-        scopes_str=a2a_scopes_str,
-    )
-    try:
-        a2a_auth_result = a2a_auth_handler.acquire_token_interactive() or {}
-        a2a_access_token = a2a_auth_result.get("access_token") or ""
-        if not a2a_access_token:
-            raise RuntimeError("Failed to acquire A2A access token")
-    except Exception as e:
-        emit_structured_log(
-            "error",
-            f"Error during A2A authentication: {e}",
-            operation=Operation.AUTHENTICATE,
-        )
-        if effective_log_level == "debug":
-            import traceback
-            traceback.print_exc()
-        sys.exit(1)
-    try:
-        agent_client = A2AClient(
-            a2a_endpoint=a2a_endpoint,
-            access_token=a2a_access_token,
-            logger=CLI_LOGGER,
-            diagnostic_records=DIAGNOSTIC_RECORDS,
-        )
-    except Exception as e:
-        emit_structured_log(
-            "error",
-            f"Failed to initialize A2A client: {e}",
-            operation=Operation.SETUP,
-        )
-        sys.exit(1)
-
-    case CallPath.COPILOT_AUTH:
-        copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
-        validate_endpoint_url(copilot_api_endpoint, ALLOWED_ENDPOINTS)
-        auth_handler = AuthHandler(
-            client_id=os.environ["M365_EVAL_CLIENT_ID"],
-            tenant_id=os.environ["TENANT_ID"],
-            scopes_str=os.environ.get("COPILOT_SCOPES", ""),
+                "error",
+                f"Error during signout: {e}",
+                operation=Operation.AUTHENTICATE,
             )
+            sys.exit(1)
+        sys.exit(0)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if effective_log_level == "debug":
-            import traceback
-            traceback.print_exc()
-        sys.exit(1)
-
-        agent_client = SydneyClient(
-            copilot_api_endpoint=copilot_api_endpoint,
-            access_token=access_token,
-            user_oid=user_oid,
-            logger=CLI_LOGGER,
-            diagnostic_records=DIAGNOSTIC_RECORDS,
-        )
+    try:
+        a2a_auth_result = a2a_auth_handler.acquire_token_interactive() or {}
+        a2a_access_token = a2a_auth_result.get("access_token") or ""
+        if not a2a_access_token:
+            raise RuntimeError("Failed to acquire A2A access token")
+    except Exception as e:
+        emit_structured_log(
+            "error",
+            f"Error during A2A authentication: {e}",
+            operation=Operation.AUTHENTICATE,
+        )
+        if config.effective_log_level == "debug":
+            traceback.print_exc()
+        sys.exit(1)
+    try:
+        agent_client = A2AClient(
+            a2a_endpoint=a2a_endpoint,
+            access_token=a2a_access_token,
+            token_refresh_fn=make_token_refresh_fn(a2a_auth_handler),
+            logger=CLI_LOGGER,
+            diagnostic_records=DIAGNOSTIC_RECORDS,
+        )
+    except Exception as e:
+        emit_structured_log(
+            "error",
+            f"Failed to initialize A2A client: {e}",
+            operation=Operation.SETUP,
+        )
+        sys.exit(1)

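make_token_refresh_fn is the one new name here whose body this diff does not show. A plausible sketch, assuming it simply re-runs the acquisition path used above, would be:

def make_token_refresh_fn(auth_handler):
    """Hypothetical sketch: return a zero-argument callable yielding a fresh token."""
    def _refresh() -> str:
        result = auth_handler.acquire_token_interactive() or {}
        return result.get("access_token") or ""
    return _refresh

Whatever its real shape, the apparent point of the new token_refresh_fn argument is to let A2AClient recover from token expiry mid-run instead of failing the evaluation.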
     # 1. Load evaluation datasets
-    eval_items, file_default_evaluators = get_prompt_datasets(
+    eval_items, file_default_evaluators = get_prompt_datasets(config)
     default_evaluators = resolve_default_evaluators(file_default_evaluators)

-    if effective_log_level in ("info", "debug"):
+    if config.effective_log_level in ("info", "debug"):
         multi_turn_count = sum(1 for item in eval_items if "turns" in item)
         single_turn_count = len(eval_items) - multi_turn_count
         emit_structured_log(
@@ -1697,54 +151,53 @@ def main():
     agent_name = None
     try:
         # 2. Agent selection - when no agent ID is provided, discover agents
-        # via the active client (A2A
-        if not
-            if effective_log_level in ("info", "debug"):
+        # via the active client (A2A) and prompt interactively.
+        if not config.m365_agent_id:
+            if config.effective_log_level in ("info", "debug"):
                 emit_structured_log("info", "No agent ID provided. Fetching available agents.", operation=Operation.FETCH_AGENTS)

            available_agents = agent_client.fetch_available_agents()
            if not available_agents:
                emit_structured_log(
-
-
-
-
+                    "error",
+                    "No agents are available for interactive selection."
+                    " Re-run with --m365-agent-id or set M365_AGENT_ID.",
+                    operation=Operation.FETCH_AGENTS,
                )
                sys.exit(1)

            selected_agent_id, agent_name = select_agent_interactively(available_agents)
            if selected_agent_id:
-
-                if effective_log_level in ("info", "debug"):
-                    emit_structured_log("info", f"Selected agent: {
+                config = replace(config, m365_agent_id=selected_agent_id)
+                if config.effective_log_level in ("info", "debug"):
+                    emit_structured_log("info", f"Selected agent: {config.m365_agent_id}", operation=Operation.FETCH_AGENTS)
            else:
                emit_structured_log(
-
-
-
+                    "error",
+                    "No agent selected. Re-run with --m365-agent-id or set M365_AGENT_ID.",
+                    operation=Operation.FETCH_AGENTS,
                )
                sys.exit(1)
    except Exception as e:
        emit_structured_log("error", f"Error during agent discovery: {e}", operation=Operation.FETCH_AGENTS)
-        if effective_log_level == "debug":
-            import traceback
+        if config.effective_log_level == "debug":
            traceback.print_exc()
        sys.exit(1)

-    # Pre-resolve agent endpoint (A2A agent card lookup
-    if
-        agent_client.resolve_agent(
+    # Pre-resolve agent endpoint (A2A agent card lookup)
+    if config.m365_agent_id:
+        agent_client.resolve_agent(config.m365_agent_id)

     # 3. Build pipeline config and run evaluation pipeline
     model_config = AzureOpenAIModelConfiguration(
-        azure_endpoint=os.environ.get(
-        api_key=os.environ.get(
-        api_version=os.environ.get(
-        azure_deployment=os.environ.get(
+        azure_endpoint=os.environ.get(ENV_AZURE_AI_OPENAI_ENDPOINT),
+        api_key=os.environ.get(ENV_AZURE_AI_API_KEY),
+        api_version=os.environ.get(ENV_AZURE_AI_API_VERSION),
+        azure_deployment=os.environ.get(ENV_AZURE_AI_MODEL_NAME),
     )
     has_azure_openai = bool(
-        os.environ.get(
-        and os.environ.get(
+        os.environ.get(ENV_AZURE_AI_OPENAI_ENDPOINT)
+        and os.environ.get(ENV_AZURE_AI_API_KEY)
     )

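The bare environment-variable strings of the old code are replaced by ENV_* constants. Their values follow from the names the old code read; where they are defined is not visible in this hunk (a shared constants module is a plausible home), so treat these definitions as inferred:

# Inferred from the old string literals; the module location is an assumption.
ENV_AZURE_AI_OPENAI_ENDPOINT = "AZURE_AI_OPENAI_ENDPOINT"
ENV_AZURE_AI_API_KEY = "AZURE_AI_API_KEY"
ENV_AZURE_AI_API_VERSION = "AZURE_AI_API_VERSION"
ENV_AZURE_AI_MODEL_NAME = "AZURE_AI_MODEL_NAME"
ENV_WORK_IQ_A2A_ENDPOINT = "WORK_IQ_A2A_ENDPOINT"
ENV_WORK_IQ_A2A_CLIENT_ID = "WORK_IQ_A2A_CLIENT_ID"
ENV_WORK_IQ_A2A_SCOPES = "WORK_IQ_A2A_SCOPES"
ENV_TENANT_ID = "TENANT_ID"

Centralizing the names removes the risk of a typo silently disabling a required-variable check.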
     pipeline = PipelineConfig(
@@ -1754,13 +207,14 @@ def main():
         default_evaluators=default_evaluators,
     )

-    results = run_pipeline(pipeline, eval_items,
-
+    results = run_pipeline(pipeline, eval_items, config)
+
     # 4. Output results
-    output_results(
-
+    output_results(
+        results, config, default_evaluators=default_evaluators,
+        agent_name=agent_name, cli_version=str(cli_version) if cli_version else None)

-    if effective_log_level in ("info", "debug"):
+    if config.effective_log_level in ("info", "debug"):
         emit_structured_log(
             "info",
             f"Evaluation completed successfully. Processed {len(eval_items)} item(s).",
@@ -1768,5 +222,5 @@ def main():
         )

 # Call the main function when script is run directly
-if __name__ == "__main__":
+if __name__ == "__main__":  # pragma: no cover
     main()