@microsoft/m365-copilot-eval 1.2.1-preview.1 → 1.4.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -101
- package/package.json +7 -4
- package/schema/CHANGELOG.md +8 -0
- package/schema/v1/eval-document.schema.json +256 -8
- package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
- package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
- package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
- package/schema/v1/examples/valid/comprehensive.json +27 -2
- package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
- package/schema/v1/examples/valid/multi-turn-output.json +59 -0
- package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
- package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
- package/schema/version.json +2 -2
- package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
- package/src/clients/cli/api_clients/A2A/a2a_client.py +456 -0
- package/src/clients/cli/api_clients/REST/__init__.py +3 -0
- package/src/clients/cli/api_clients/REST/sydney_client.py +204 -0
- package/src/clients/cli/api_clients/__init__.py +3 -0
- package/src/clients/cli/api_clients/base_agent_client.py +78 -0
- package/src/clients/cli/cli_logging/__init__.py +0 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +107 -0
- package/src/clients/cli/cli_logging/logging_utils.py +144 -0
- package/src/clients/cli/common.py +62 -0
- package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +3 -3
- package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +11 -11
- package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +1 -11
- package/src/clients/cli/evaluator_resolver.py +150 -0
- package/src/clients/cli/generate_report.py +347 -184
- package/src/clients/cli/main.py +1288 -481
- package/src/clients/cli/parallel_executor.py +57 -0
- package/src/clients/cli/readme.md +14 -7
- package/src/clients/cli/requirements.txt +1 -1
- package/src/clients/cli/response_extractor.py +30 -14
- package/src/clients/cli/retry_policy.py +52 -0
- package/src/clients/cli/samples/multiturn_example.json +35 -0
- package/src/clients/cli/throttle_gate.py +82 -0
- package/src/clients/node-js/bin/runevals.js +134 -41
- package/src/clients/node-js/config/default.js +5 -1
- package/src/clients/node-js/lib/agent-id.js +12 -0
- package/src/clients/node-js/lib/env-loader.js +11 -16
- package/src/clients/node-js/lib/eula-manager.js +78 -0
- package/src/clients/node-js/lib/progress.js +13 -11
package/src/clients/cli/main.py
CHANGED
@@ -3,14 +3,19 @@ import os
 import argparse
 import sys
 import csv
-import
+import logging
+import time
 import webbrowser
-import urllib.request
-import urllib.error
 import urllib.parse
 import questionary
+from dataclasses import dataclass, field
 from enum import Enum
 from typing import List, Dict, Tuple, Optional, Any
+
+from api_clients.A2A import A2AClient
+from api_clients.REST import SydneyClient
+from api_clients.base_agent_client import BaseAgentClient
+
 from azure.ai.evaluation import (
     AzureOpenAIModelConfiguration,
     RelevanceEvaluator,
@@ -21,25 +26,119 @@ from azure.ai.evaluation import (
 from dotenv import load_dotenv
 from auth.auth_handler import AuthHandler
 from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat
-
-
+from custom_evaluators.ExactMatchEvaluator import ExactMatchEvaluator
+from custom_evaluators.PartialMatchEvaluator import PartialMatchEvaluator
 from generate_report import generate_html_report, calculate_aggregate_statistics
-from response_extractor import
+from response_extractor import get_response_text_for_evaluation
 from schema_handler import DocumentUpgrader, SchemaVersionManager
+from common import (
+    RELEVANCE,
+    COHERENCE,
+    GROUNDEDNESS,
+    TOOL_CALL_ACCURACY,
+    CITATIONS,
+    EXACT_MATCH,
+    PARTIAL_MATCH,
+    REQUIRES_AZURE_OPENAI,
+    REQUIRES_TOOL_DEFINITIONS,
+    METRIC_IDS,
+    STATUS_PASS,
+    STATUS_FAIL,
+    STATUS_ERROR,
+    STATUS_PARTIAL,
+    STATUS_UNKNOWN,
+    pascal_case_to_title,
+)
+from evaluator_resolver import (
+    validate_evaluator_names,
+    check_prerequisites,
+    resolve_default_evaluators,
+    resolve_evaluators_for_prompt,
+    get_evaluator_threshold,
+)
 from version_check import check_min_version, get_cli_version
 from datetime import datetime, timezone
 from pathlib import Path
-
+
+
+from cli_logging.console_diagnostics import emit_structured_log as _emit_structured_log
+from cli_logging.logging_utils import LOG_LEVEL_MAP, Operation, resolve_log_level
+from parallel_executor import execute_in_parallel
+from throttle_gate import ThrottleGate
+from retry_policy import (
+    is_retryable_status,
+    get_backoff_seconds,
+    get_retry_after_seconds,
+)
 
 # Allowed endpoints for URL validation
 ALLOWED_ENDPOINTS = [
-    'substrate.office.com'
+    'substrate.office.com',
+    'graph.microsoft.com',
 ]
 
+MAX_CONCURRENCY = 5
+MAX_ATTEMPTS = 4  # Initial attempt + 3 retries
+MAX_TURNS_PER_THREAD = 20
+LONG_THREAD_WARNING_THRESHOLD = 10
+
+
+@dataclass
+class PipelineConfig:
+    """Runtime configuration for the evaluation pipeline."""
+    agent_client: BaseAgentClient
+    model_config: AzureOpenAIModelConfiguration
+    has_azure_openai: bool
+    default_evaluators: Dict[str, Any]
+    chat_gate: ThrottleGate = field(default_factory=lambda: ThrottleGate("chat_api"))
+    is_retryable_status: Any = field(default=is_retryable_status)
+    get_backoff_seconds: Any = field(default=get_backoff_seconds)
+    get_retry_after_seconds: Any = field(default=get_retry_after_seconds)
+
 class CallPath(Enum):
     """ Enum to indicate which call path to use. """
     ACCESS_TOKEN = "access_token"
     COPILOT_AUTH = "copilot_auth"
+    A2A = "a2a"
+
+
+class ItemType(Enum):
+    SINGLE_TURN = "single_turn"
+    MULTI_TURN = "multi_turn"
+
+
+def detect_item_type(item: dict) -> ItemType:
+    """Determine if an evaluation item is single-turn or multi-turn.
+
+    Returns ItemType.SINGLE_TURN if item has 'prompt' without 'turns',
+    ItemType.MULTI_TURN if item has 'turns' array.
+
+    Raises ValueError for invalid items (both, neither, or invalid turns).
+    """
+    has_turns = "turns" in item
+    has_prompt = "prompt" in item
+
+    if has_turns and has_prompt:
+        raise ValueError(
+            "Invalid evaluation item: cannot have both 'turns' and 'prompt'. "
+            "Use 'turns' for multi-turn threads or 'prompt' for single-turn."
+        )
+
+    if has_turns and not isinstance(item["turns"], list):
+        raise ValueError("Invalid evaluation item: 'turns' must be a list")
+
+    if has_turns:
+        if len(item["turns"]) == 0:
+            raise ValueError("Invalid multi-turn thread: 'turns' array cannot be empty")
+        return ItemType.MULTI_TURN
+
+    if has_prompt:
+        return ItemType.SINGLE_TURN
+
+    raise ValueError(
+        "Invalid evaluation item: must have either 'turns' array (multi-turn) "
+        "or 'prompt' field (single-turn)"
+    )
 
 
 # Flags that should bypass remote min-version enforcement.
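
Annotation: the `detect_item_type` contract above classifies items strictly by shape. A minimal sketch of how it behaves, assuming the function and `ItemType` are imported from the CLI's main module (the example items themselves are hypothetical):

# Sketch — illustrative items; detect_item_type/ItemType are the definitions above.
single_turn = {"prompt": "What is our PTO policy?", "expected_response": "30 days."}
multi_turn = {
    "name": "PTO follow-up",
    "turns": [{"prompt": "What is our PTO policy?"}, {"prompt": "And for contractors?"}],
}

assert detect_item_type(single_turn) is ItemType.SINGLE_TURN
assert detect_item_type(multi_turn) is ItemType.MULTI_TURN
# Invalid shapes raise ValueError:
#   {"prompt": "x", "turns": [{"prompt": "y"}]}  -> has both 'turns' and 'prompt'
#   {"turns": []}                                -> empty 'turns'
#   {}                                           -> neither field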
@@ -48,20 +147,44 @@ VERSION_CHECK_BYPASS_FLAGS = (
     "signout",
 )
 
+CLI_LOGGER_NAME = "m365.eval.cli"
+CLI_LOGGER = logging.getLogger(CLI_LOGGER_NAME)
+DIAGNOSTIC_RECORDS: List[Dict[str, Any]] = []
+
+
+def configure_cli_logging(effective_log_level: str) -> None:
+    if not CLI_LOGGER.handlers:
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setFormatter(logging.Formatter("%(message)s"))
+        CLI_LOGGER.addHandler(handler)
+        CLI_LOGGER.propagate = False
+    CLI_LOGGER.setLevel(LOG_LEVEL_MAP[effective_log_level])
+
+
+def emit_structured_log(level: str, message: str, operation: str = Operation.EVALUATE) -> None:
+    _emit_structured_log(
+        level, message, operation,
+        logger=CLI_LOGGER,
+        diagnostic_records=DIAGNOSTIC_RECORDS,
+    )
+
 
 def should_bypass_min_version_check(args: argparse.Namespace) -> bool:
     """Return True if the current invocation should skip min-version checks."""
     return any(getattr(args, flag, False) for flag in VERSION_CHECK_BYPASS_FLAGS)
 
-def write_results_to_html(results: List[Dict], output_file: str
+def write_results_to_html(results: List[Dict], output_file: str,
+                          agent_name: Optional[str] = None, agent_id: Optional[str] = None,
+                          cli_version: Optional[str] = None):
     """Write results to HTML file using generate_html_report from generate_report.py."""
     try:
-        html = generate_html_report(results
+        html = generate_html_report(results, agent_name=agent_name, agent_id=agent_id,
+                                    cli_version=cli_version)
         with open(output_file, 'w', encoding='utf-8') as f:
             f.write(html)
-
+        emit_structured_log("info", f"HTML report saved to {output_file}", operation=Operation.WRITE_OUTPUT)
     except Exception as e:
-
+        emit_structured_log("error", f"Error writing to HTML file: {e}", operation=Operation.WRITE_OUTPUT)
         sys.exit(1)
 
 def get_default_prompts_and_responses():
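
Annotation: the bare print diagnostics in this file now route through the module-level logger. A minimal usage sketch, assuming the new `cli_logging` package is importable as in the hunk above:

# Sketch — level names come from LOG_LEVEL_MAP in cli_logging.logging_utils.
configure_cli_logging("debug")
emit_structured_log("info", "starting evaluation run", operation=Operation.EVALUATE)
# The wrapper also hands DIAGNOSTIC_RECORDS to console_diagnostics, which
# appears to accumulate the structured records for later inspection.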
@@ -74,7 +197,7 @@ def get_default_prompts_and_responses():
     ]
     return prompts, expected_responses
 
-def load_prompts_from_file(file_path: str) -> Tuple[List[
+def load_prompts_from_file(file_path: str) -> Tuple[List[Dict], Optional[Dict]]:
     """Load prompts and expected responses from a JSON file.
 
     Supports three formats:
@@ -84,6 +207,10 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
 
     For eval documents (format 1) and array format (format 2), schema validation
     and auto-upgrade are applied via DocumentUpgrader.
+
+    Returns:
+        Tuple of (eval_items, default_evaluators). Items are dicts with prompt,
+        expected_response, and optional evaluators/evaluators_mode fields.
     """
     try:
         with open(file_path, 'r', encoding='utf-8') as f:
@@ -100,18 +227,18 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
             upgrader = DocumentUpgrader()
         except Exception as e:
             # Schema infrastructure not available (missing files, etc.) — skip
-
+            emit_structured_log("warning", f"Unable to initialize document upgrader: {e}", operation=Operation.LOAD_PROMPTS)
             upgrader = None
 
         if upgrader is not None:
             result = upgrader.upgrade(Path(file_path))
 
             if result.error:
-
+                emit_structured_log("error", f"Schema validation error: {result.error}", operation=Operation.LOAD_PROMPTS)
                 sys.exit(1)
 
             if result.upgraded and result.message:
-
+                emit_structured_log("info", result.message, operation=Operation.LOAD_PROMPTS)
 
             # Use the parsed document from the upgrade result
             if result.document is not None:
@@ -119,26 +246,26 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
 
         if isinstance(data, list):
             # Format: [{"prompt": "...", "expected_response": "..."}, ...]
-
-            expected_responses = [item.get("expected_response", "") for item in data]
+            return data, None
         elif isinstance(data, dict):
             if "items" in data:
                 # Eval document format: {"schemaVersion": "...", "items": [...]}
-
-                prompts = [item.get("prompt", "") for item in items]
-                expected_responses = [item.get("expected_response", "") for item in items]
+                return data["items"], data.get("default_evaluators")
             else:
                 # Format: {"prompts": [...], "expected_responses": [...]}
                 prompts = data.get("prompts", [])
                 expected_responses = data.get("expected_responses", [])
+                eval_items = [
+                    {"prompt": p, "expected_response": e}
+                    for p, e in zip(prompts, expected_responses)
+                ]
+                return eval_items, None
         else:
             raise ValueError("Invalid file format")
-
-        return prompts, expected_responses
     except SystemExit:
         raise
     except Exception as e:
-
+        emit_structured_log("error", f"Error loading prompts from file: {e}", operation=Operation.LOAD_PROMPTS)
         sys.exit(1)
 
 def get_interactive_prompts() -> Tuple[List[str], List[str]]:
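
Annotation: the rewritten `load_prompts_from_file` normalizes all three accepted input shapes to `(eval_items, default_evaluators)`. A sketch of the three shapes, with hypothetical values:

# Format 1 — eval document (may carry document-level default_evaluators):
doc = {
    "schemaVersion": "1.0.0",
    "default_evaluators": {"relevance": {}},  # evaluator name is illustrative
    "items": [{"prompt": "Q1", "expected_response": "A1"}],
}
# -> (doc["items"], doc["default_evaluators"])

# Format 2 — bare array of items:
arr = [{"prompt": "Q1", "expected_response": "A1"}]
# -> (arr, None)

# Format 3 — legacy parallel lists, zipped into item dicts by the loader:
legacy = {"prompts": ["Q1"], "expected_responses": ["A1"]}
# -> ([{"prompt": "Q1", "expected_response": "A1"}], None)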
@@ -165,116 +292,549 @@ def get_interactive_prompts() -> Tuple[List[str], List[str]]:
 
     return prompts, expected_responses
 
-
-
-
-
-
-
-
+
+_DEFAULT_PASS_THRESHOLD = 3
+
+
+def _decorate_metric(metric_id: str, data, threshold: Optional[int] = None) -> Dict[str, Any]:
+    """Augment raw evaluator output with standardized threshold + pass/fail result."""
+    pass_threshold = threshold if threshold is not None else _DEFAULT_PASS_THRESHOLD
+    payload = {}
+    if isinstance(data, dict):
+        payload.update(data)
+    else:
+        payload['raw'] = data
+
+    score_val = None
+    if isinstance(data, dict):
+        if metric_id in data:
+            score_val = data[metric_id]
+    if isinstance(score_val, (int, float)):
+        payload['threshold'] = pass_threshold
+        payload['result'] = STATUS_PASS if score_val >= pass_threshold else STATUS_FAIL
+    else:
+        payload['threshold'] = pass_threshold
+        payload.setdefault('result', STATUS_UNKNOWN)
+    return payload
+
+
+def _run_evaluators_for_item(
+    prompt: str,
+    actual_response: str,
+    expected_response: str,
+    enhanced_response: Dict[str, Any],
+    resolved_evaluators: Dict[str, Any],
+    model_config: AzureOpenAIModelConfiguration,
+    has_azure_openai: bool,
+    args,
+) -> Tuple[Dict[str, Optional[str]], List[str]]:
+    """Run resolved evaluators against a single item/turn.
+
+    Returns (results_dict, evaluators_ran).
+    """
+    has_tool_defs = bool(
+        args.m365_agent_id and enhanced_response.get("tool_definitions")
     )
-
-
-
-
-    groundedness_evaluator = GroundednessEvaluator(model_config=model_config)  # Evaluates the response for for factuality and groundedness against provided context. Range is 1 - 5.
-    #concisenessnonllm_evaluator = ConcisenessNonLLMEvaluator()  # Evaluates the response for conciseness. Range is 1 - 5.
-    #pii_evaluator = PIIEvaluator(model_config=model_config)  # Evaluates the response for presence of PII. Range
-    # Parse citation format from args
-    citation_format = CitationFormat.OAI_UNICODE if args.citation_format == 'oai_unicode' else CitationFormat.LEGACY_BRACKET
-    citations_evaluator = CitationsEvaluator(citation_format=citation_format)  # Evaluates citations present in the response using regex pattern matching
-
-    tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config)  # Evaluate tool call accuracy if tool definitions are present in response
-
+    available_context = {
+        REQUIRES_AZURE_OPENAI: has_azure_openai,
+        REQUIRES_TOOL_DEFINITIONS: has_tool_defs,
+    }
 
-
+    results_dict: Dict[str, Optional[str]] = {}
+    evaluators_ran: List[str] = []
+
+    for eval_name, eval_options in resolved_evaluators.items():
+        can_run, warn_msg = check_prerequisites(eval_name, available_context)
+        if not can_run:
+            if warn_msg:
+                emit_structured_log(
+                    "warning",
+                    f"Evaluator '{eval_name}' prerequisite check failed: {warn_msg}",
+                    operation=Operation.EVALUATE,
+                )
+            results_dict[eval_name] = None
+            continue
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        threshold = get_evaluator_threshold(eval_name, eval_options)
+
+        try:
+            if eval_name == RELEVANCE:
+                raw_score = RelevanceEvaluator(model_config=model_config)(query=prompt, response=actual_response)
+                results_dict[RELEVANCE] = _decorate_metric(METRIC_IDS[RELEVANCE], raw_score, threshold)
+            elif eval_name == COHERENCE:
+                raw_score = CoherenceEvaluator(model_config=model_config)(query=prompt, response=actual_response)
+                results_dict[COHERENCE] = _decorate_metric(METRIC_IDS[COHERENCE], raw_score, threshold)
+            elif eval_name == GROUNDEDNESS:
+                raw_score = GroundednessEvaluator(model_config=model_config)(response=actual_response, context=expected_response)
+                results_dict[GROUNDEDNESS] = _decorate_metric(METRIC_IDS[GROUNDEDNESS], raw_score, threshold)
+            elif eval_name == TOOL_CALL_ACCURACY:
+                raw_score = ToolCallAccuracyEvaluator(model_config)(
+                    query=prompt,
+                    response=enhanced_response.get("response", actual_response),
+                    tool_definitions=enhanced_response.get("tool_definitions", []),
+                )
+                results_dict[TOOL_CALL_ACCURACY] = _decorate_metric(METRIC_IDS[TOOL_CALL_ACCURACY], raw_score, threshold)
+            elif eval_name == CITATIONS:
+                fmt_str = eval_options.get("citation_format", "oai_unicode")
+                fmt_map = {
+                    "oai_unicode": CitationFormat.OAI_UNICODE,
+                    "bracket": CitationFormat.LEGACY_BRACKET,
+                    "mixed": CitationFormat.AUTO,
+                }
+                raw_score = CitationsEvaluator(citation_format=fmt_map.get(fmt_str, CitationFormat.OAI_UNICODE))(response=actual_response)
+                results_dict[CITATIONS] = _decorate_metric(METRIC_IDS[CITATIONS], raw_score, threshold)
+            elif eval_name == EXACT_MATCH:
+                case_sensitive = eval_options.get("case_sensitive", False)
+                raw_score = ExactMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response, expected_answer=expected_response)
+                # ExactMatch is binary — the evaluator already sets 'result'
+                # so _decorate_metric (which computes result from score vs threshold) is not needed.
+                results_dict[EXACT_MATCH] = raw_score
+            elif eval_name == PARTIAL_MATCH:
+                case_sensitive = eval_options.get("case_sensitive", False)
+                raw_score = PartialMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response, expected_answer=expected_response)
+                results_dict[PARTIAL_MATCH] = _decorate_metric(METRIC_IDS[PARTIAL_MATCH], raw_score, threshold)
+
+            evaluators_ran.append(eval_name)
+        except Exception as e:
+            emit_structured_log(
+                "error",
+                f"Evaluator '{eval_name}' crashed and will be omitted from results: {e}",
+                operation=Operation.EVALUATE,
+            )
+            results_dict[eval_name] = None
+
+    return results_dict, evaluators_ran
+
+
+def _evaluate_single_response(
+    enhanced_response: Dict[str, Any],
+    eval_item: Dict,
+    args,
+    model_config: AzureOpenAIModelConfiguration,
+    has_azure_openai: bool,
+    default_evaluators: Dict[str, Any],
+) -> Dict[str, Any]:
+    """Run all evaluators for a single prompt/response pair and return the result dict."""
+    actual_response_text = get_response_text_for_evaluation(enhanced_response)
+    prompt = eval_item.get("prompt", "")
+    expected_response = eval_item.get("expected_response", "")
+
+    resolved = resolve_evaluators_for_prompt(
+        eval_item.get("evaluators"), eval_item.get("evaluators_mode", "extend"),
+        prompt, default_evaluators,
+    )
+
+    results_dict, evaluators_ran = _run_evaluators_for_item(
+        prompt, actual_response_text, expected_response, enhanced_response,
+        resolved, model_config, has_azure_openai, args,
+    )
+
+    evaluation_result = {
+        "prompt": prompt,
+        "response": enhanced_response.get(
+            "display_response_text", actual_response_text
+        ),
+        "expected_response": expected_response,
+        "evaluators_ran": evaluators_ran,
+        "results": results_dict,
+    }
+
+    if "evaluators" in eval_item:
+        evaluation_result["evaluators"] = eval_item["evaluators"]
+    if "evaluators_mode" in eval_item:
+        evaluation_result["evaluators_mode"] = eval_item["evaluators_mode"]
+
+    if getattr(args, "effective_log_level", "info") == "debug":
+        emit_structured_log(
+            "debug",
+            f"Evaluation completed for prompt='{evaluation_result['prompt']}'. "
+            f"Evaluators: {', '.join(evaluators_ran)}. "
+            f"Scores: {evaluation_result['results']}",
+            operation=Operation.EVALUATE,
+        )
+
+    return evaluation_result
+
+
+def _check_all_passed(results_dict: Dict[str, Optional[Dict[str, Any]]]) -> bool:
+    """Check if all evaluator results passed. Skipped evaluators (None) are ignored."""
+    for result_data in results_dict.values():
+        if result_data is None:
+            continue
+        if result_data.get("result") == STATUS_FAIL:
+            return False
+    return True
+
+def _evaluate_multi_turn_responses(
+    turns: List[Dict],
+    args,
+    default_evaluators: Dict[str, Any],
+    model_config: AzureOpenAIModelConfiguration,
+    has_azure_openai: bool,
+) -> Tuple[List[Dict], Dict]:
+    """Run per-turn evaluations and build evaluated turn results with summary.
+
+    Returns:
+        Tuple of (evaluated_turns, summary). Each evaluated turn contains
+        prompt, response, expected_response, status, evaluators_ran, results,
+        and optionally error. Does not mutate the input turns.
+    """
+    evaluated_turns: List[Dict] = []
+    turns_passed = 0
+    turns_failed = 0
+
+    for i, turn in enumerate(turns):
+        evaluated_turn: Dict[str, Any] = {
+            "prompt": turn.get("prompt", ""),
+        }
+        if "expected_response" in turn:
+            evaluated_turn["expected_response"] = turn["expected_response"]
+        if "response" in turn:
+            evaluated_turn["response"] = turn["response"]
+        if "evaluators" in turn:
+            evaluated_turn["evaluators"] = turn["evaluators"]
+        if "evaluators_mode" in turn:
+            evaluated_turn["evaluators_mode"] = turn["evaluators_mode"]
+
+        if turn.get("status") == STATUS_ERROR:
+            evaluated_turn["status"] = STATUS_ERROR
+            evaluated_turn["error"] = turn.get("error", "")
+            turns_failed += 1
+            evaluated_turns.append(evaluated_turn)
+            continue
+
+        enhanced_response = turn.get("_enhanced_response", {})
+        actual_response = get_response_text_for_evaluation(enhanced_response)
+
+        resolved = resolve_evaluators_for_prompt(
+            turn.get("evaluators"), turn.get("evaluators_mode", "extend"),
+            turn.get("prompt", ""), default_evaluators,
         )
-
-
-
+
+        results_dict, evaluators_ran = _run_evaluators_for_item(
+            turn.get("prompt", ""), actual_response, turn.get("expected_response", ""),
+            enhanced_response, resolved, model_config, has_azure_openai, args,
         )
 
-
-
-
+        all_passed = _check_all_passed(results_dict)
+
+        evaluated_turn["results"] = results_dict
+        evaluated_turn["evaluators_ran"] = evaluators_ran
+        evaluated_turn["status"] = STATUS_PASS if all_passed else STATUS_FAIL
+
+        if getattr(args, "effective_log_level", "info") == "debug":
+            emit_structured_log(
+                "debug",
+                f"Evaluation completed for turn {i + 1} prompt='{turn.get('prompt', '')}'. "
+                f"Evaluators: {', '.join(evaluators_ran)}. "
+                f"Scores: {results_dict}",
+                operation=Operation.EVALUATE,
+            )
+
+        if all_passed:
+            turns_passed += 1
+        else:
+            turns_failed += 1
+
+        evaluated_turns.append(evaluated_turn)
+
+    turns_total = len(turns)
+    if turns_passed == turns_total:
+        overall_status = STATUS_PASS
+    elif turns_failed == turns_total:
+        overall_status = STATUS_FAIL
+    else:
+        overall_status = STATUS_PARTIAL
+
+    summary = {
+        "turns_total": turns_total,
+        "turns_passed": turns_passed,
+        "turns_failed": turns_failed,
+        "overall_status": overall_status,
+    }
+
+    return evaluated_turns, summary
+
+
+def get_effective_worker_count(prompt_count: int, args) -> int:
+    """Compute safe worker count for prompt processing."""
+    if prompt_count <= 0:
+        return 1
+
+    requested = getattr(args, "concurrency", 5)
+    try:
+        requested_int = int(requested)
+    except (TypeError, ValueError):
+        requested_int = 5
+
+    bounded = max(1, min(requested_int, MAX_CONCURRENCY))
+    return min(bounded, prompt_count)
+
+
+def run_pipeline(
+    pipeline: PipelineConfig,
+    eval_items: List[Dict],
+    args,
+) -> List[Dict[str, Any]]:
+    """Run the full evaluation pipeline: send prompts and evaluate responses in parallel.
+
+    Each worker processes one prompt end-to-end: send → evaluate.
+    Results are returned in original prompt order (FR-006).
+    """
+    # Validate all evaluator names upfront before dispatching workers
+    all_evaluator_maps = [pipeline.default_evaluators]
+    for eval_item in eval_items:
+        if "evaluators" in eval_item:
+            all_evaluator_maps.append(eval_item["evaluators"])
+        for turn in eval_item.get("turns", []):
+            if "evaluators" in turn:
+                all_evaluator_maps.append(turn["evaluators"])
+    for emap in all_evaluator_maps:
+        validate_evaluator_names(emap)
+
+    # Validate all items upfront and classify types before dispatching workers
+    item_types: List[ItemType] = []
+    for idx, eval_item in enumerate(eval_items):
+        try:
+            item_type = detect_item_type(eval_item)
+        except ValueError as e:
+            raise ValueError(f"Invalid evaluation item at index {idx}: {e}") from e
+        if item_type == ItemType.MULTI_TURN:
+            turn_count = len(eval_item["turns"])
+            if turn_count > MAX_TURNS_PER_THREAD:
+                raise ValueError(
+                    f"Invalid evaluation item at index {idx}: 'turns' array has "
+                    f"{turn_count} items (max {MAX_TURNS_PER_THREAD})"
+                )
+        item_types.append(item_type)
+
+    total = len(eval_items)
+    worker_count = get_effective_worker_count(total, args)
+
+    multi_turn_count = sum(1 for t in item_types if t == ItemType.MULTI_TURN)
+    single_turn_count = total - multi_turn_count
+
+    emit_structured_log(
+        "info",
+        f"Running pipeline with {worker_count} worker(s) for {total} item(s) "
+        f"({single_turn_count} single-turn, {multi_turn_count} multi-turn).",
+        operation=Operation.EVALUATE,
+    )
+
+    def _process_item(eval_item: Dict, index: int) -> Dict[str, Any]:
+        if item_types[index] == ItemType.MULTI_TURN:
+            return _process_multi_turn(eval_item, index)
+        return _process_single_turn(eval_item, index)
+
+    def _process_single_turn(eval_item: Dict, index: int) -> Dict[str, Any]:
+        prompt = eval_item.get("prompt", "")
+        emit_structured_log(
+            "info",
+            f"Processing item {index + 1}/{total} (single-turn).",
+            operation=Operation.SEND_PROMPT,
         )
 
-        #
-
+        # Phase A: Send prompt to agent (with retry + throttle gate)
+        response = None
+        for attempt in range(1, MAX_ATTEMPTS + 1):
+            pipeline.chat_gate.wait_if_blocked()
+            try:
+                response, _ = pipeline.agent_client.send_prompt(prompt, agent_id=args.m365_agent_id)
+                break
+            except Exception as exc:
+                cause = exc.__cause__
+                status = int(getattr(cause, "code", 0) or 0) or None if cause else None
+                retry_after = get_retry_after_seconds(
+                    cause.headers.get("Retry-After") if cause and getattr(cause, "headers", None) else None
+                )
+
+                if retry_after is not None and pipeline.is_retryable_status(status):
+                    pipeline.chat_gate.apply_retry_after(retry_after)
+
+                if not pipeline.is_retryable_status(status) or attempt >= MAX_ATTEMPTS:
+                    emit_structured_log(
+                        "error",
+                        f"Item {index + 1}/{total} failed after {attempt} attempt(s): {exc}",
+                        operation=Operation.SEND_PROMPT,
+                    )
+                    return {
+                        "prompt": prompt,
+                        "response": "",
+                        "expected_response": eval_item.get("expected_response", ""),
+                        "evaluators_ran": [],
+                        "results": {},
+                        "status": STATUS_ERROR,
+                        "errorDetails": str(exc),
+                    }
+
+                delay = retry_after if retry_after is not None else pipeline.get_backoff_seconds(attempt)
+                time.sleep(delay)
+
+        # Phase B: Evaluate response
+        return _evaluate_single_response(
+            response, eval_item, args,
+            pipeline.model_config, pipeline.has_azure_openai,
+            pipeline.default_evaluators,
+        )
 
-
-
+    def _process_multi_turn(eval_item: Dict, index: int) -> Dict[str, Any]:
+        turns = eval_item["turns"]
+        thread_name = eval_item.get("name", "Unnamed thread")
+        emit_structured_log(
+            "info",
+            f"Processing item {index + 1}/{total} (multi-turn: '{thread_name}').",
+            operation=Operation.SEND_PROMPT,
         )
 
-
-
-
-
-
-            tool_definitions=enhanced_response["tool_definitions"]
+        if len(turns) > LONG_THREAD_WARNING_THRESHOLD:
+            emit_structured_log(
+                "warning",
+                f"Thread '{thread_name}' has {len(turns)} turns (>{LONG_THREAD_WARNING_THRESHOLD}). This may take a while.",
+                operation=Operation.SEND_PROMPT,
            )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Phase A: Send each turn with throttle gate + 429-only retry
+        # Multi-turn only retries on 429 (server confirmed it didn't process
+        # the request). Other transient errors (503, 504) are ambiguous about
+        # whether the server processed the turn, risking duplicate turns in
+        # the conversation if retried.
+        conversation_context = None
+        conversation_id = None
+        enriched_turns: List[Dict[str, Any]] = []
+        failed = False
+
+        for i, turn in enumerate(turns):
+            prompt = turn["prompt"]
+            emit_structured_log(
+                "debug",
+                f"Sending turn {i + 1}/{len(turns)} of '{thread_name}'.",
+                operation=Operation.SEND_PROMPT,
+            )
+
+            response = None
+            for attempt in range(1, MAX_ATTEMPTS + 1):
+                pipeline.chat_gate.wait_if_blocked()
+                try:
+                    response, conversation_context = pipeline.agent_client.send_prompt(
+                        prompt, agent_id=args.m365_agent_id,
+                        conversation_context=conversation_context,
+                    )
+                    break
+                except Exception as exc:
+                    cause = exc.__cause__
+                    status = int(getattr(cause, "code", 0) or 0) or None if cause else None
+                    retry_after = get_retry_after_seconds(
+                        cause.headers.get("Retry-After") if cause and getattr(cause, "headers", None) else None
+                    )
+
+                    # Only retry on 429 — server confirmed it didn't process the request
+                    if status == 429 and attempt < MAX_ATTEMPTS:
+                        if retry_after is not None:
+                            pipeline.chat_gate.apply_retry_after(retry_after)
+                        delay = retry_after if retry_after is not None else pipeline.get_backoff_seconds(attempt)
+                        time.sleep(delay)
+                        continue
+
+                    # All other errors: stop the thread
+                    emit_structured_log(
+                        "error",
+                        f"Turn {i + 1}/{len(turns)} of '{thread_name}' failed after {attempt} attempt(s): {exc}",
+                        operation=Operation.SEND_PROMPT,
+                    )
+                    failed = True
+                    break
+
+            if failed:
+                # Mark this turn and all remaining turns as error
+                enriched_turns.append({
+                    **turn,
+                    "response": "",
+                    "status": STATUS_ERROR,
+                    "error": "Failed to get response from agent",
+                })
+                for j in range(i + 1, len(turns)):
+                    enriched_turns.append({
+                        **turns[j],
+                        "response": "",
+                        "status": STATUS_ERROR,
+                        "error": "Skipped: preceding turn failed",
+                    })
+                break
+
+            # Enrich turn with response
+            response_text = get_response_text_for_evaluation(response)
+            enriched_turns.append({
+                **turn,
+                "response": response.get("display_response_text", response_text),
+                "_enhanced_response": response,
+            })
+
+            # Capture conversation_id from first response
+            if conversation_id is None:
+                conversation_id = response.get("metadata", {}).get("conversation_id")
+
+        # Phase B: Run per-turn evaluations
+        evaluated_turns, summary = _evaluate_multi_turn_responses(
+            enriched_turns, args, pipeline.default_evaluators,
+            model_config=pipeline.model_config,
+            has_azure_openai=pipeline.has_azure_openai,
+        )
+
+        return {
+            "type": "multi_turn",
+            "name": eval_item.get("name", ""),
+            "description": eval_item.get("description", ""),
+            "conversation_id": conversation_id or "",
+            "turns": evaluated_turns,
+            "summary": summary,
         }
 
-
-
-
-
+    execution_results = execute_in_parallel(
+        eval_items, _process_item, max_workers=worker_count,
+    )
+
+    # Unwrap WorkerResult objects into plain dicts, with error fallback
+    ordered_results: List[Dict[str, Any]] = []
+    for wr in execution_results:
+        if wr.error:
+            idx = wr.index
+            item = eval_items[idx]
+            if item_types[idx] == ItemType.MULTI_TURN:
+                ordered_results.append({
+                    "type": "multi_turn",
+                    "name": item.get("name", ""),
+                    "turns": [
+                        {**t, "status": STATUS_ERROR, "error": str(wr.error), "response": "", "results": {}}
+                        for t in item.get("turns", [])
+                    ],
+                    "summary": {
+                        "turns_total": len(item.get("turns", [])),
+                        "turns_passed": 0,
+                        "turns_failed": len(item.get("turns", [])),
+                        "overall_status": STATUS_FAIL,
+                    },
+                    "error": str(wr.error),
+                })
+            else:
+                ordered_results.append({
+                    "prompt": item.get("prompt", ""),
+                    "response": "",
+                    "expected_response": item.get("expected_response", ""),
+                    "evaluators_ran": [],
+                    "results": {},
+                    "status": STATUS_ERROR,
+                    "errorDetails": str(wr.error),
+                })
+        else:
+            ordered_results.append(wr.value)
+
+    return ordered_results
 
-    evaluation_results.append(evaluation_result)
-
-    return evaluation_results
 
-
+
+def write_results_to_console(results, agent_name: Optional[str] = None,
+                             agent_id: Optional[str] = None,
+                             cli_version: Optional[str] = None):
     """Write the response to console."""
     # ANSI color codes
     BOLD = '\033[1m'
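
Annotation: the two send paths above deliberately retry differently. A condensed sketch of that decision, using the constants and `retry_policy` helpers imported earlier (the helper below is illustrative, not part of the package):

def should_retry(status: Optional[int], attempt: int, multi_turn: bool) -> bool:
    # Illustrative condensation of the retry rules in run_pipeline.
    if attempt >= MAX_ATTEMPTS:
        return False
    if multi_turn:
        # Only 429 guarantees the server did not process the turn, so a
        # replay cannot duplicate a turn in the conversation.
        return status == 429
    # Single-turn items may retry on any status retry_policy deems transient.
    return is_retryable_status(status)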
@@ -286,48 +846,105 @@ def write_results_to_console(results):
     ORANGE = '\033[38;5;208m'
     RED = '\033[91m'
     RESET = '\033[0m'
-
-
-
-
-
-
+
+    def _print_evaluated_item(response: str, expected_response: str,
+                              evaluators_ran: List[str], item_results: Dict[str, Any],
+                              error: Optional[str] = None) -> None:
+        """Print the body of a single evaluated item (single-turn prompt or multi-turn turn).
+
+        The item header (Prompt X / Turn X) is printed by the caller; this helper
+        prints evaluators, response, expected response, error, and metrics.
+        """
+        if evaluators_ran:
+            print(f"{BOLD}{CYAN}Evaluators:{RESET} {', '.join(evaluators_ran)}")
+        if response:
+            print(f"{BOLD}{CYAN}Response:{RESET} {response}")
+        if expected_response:
+            print(f"{BOLD}{YELLOW}Expected Response:{RESET} {expected_response}")
+        if error:
+            print(f"{BOLD}{RED}Error:{RESET} {error}")
+
+        for eval_name, v in item_results.items():
+            if v is None:
+                continue
+            display_name = pascal_case_to_title(eval_name)
+            if eval_name == RELEVANCE:
+                color = MAGENTA
+            elif eval_name == COHERENCE:
+                color = ORANGE
+            else:
+                color = BLUE
+            print(f"{BOLD}{color}{display_name}:{RESET} {json.dumps(v, indent=4)}")
+
+    # Show metadata
+    metadata_parts = []
+    if agent_name:
+        metadata_parts.append(f"Agent Name: {agent_name}")
+    if agent_id:
+        metadata_parts.append(f"Agent ID: {agent_id}")
+    if cli_version:
+        metadata_parts.append(f"CLI Version: {cli_version}")
+    if metadata_parts:
+        print(f"{BOLD}{CYAN}{' | '.join(metadata_parts)}{RESET}")
+        print()
+
+    aggregates = calculate_aggregate_statistics(results)
+    if aggregates:
+        total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
+        if total_items > 1:
+            print(f"{BOLD}{BLUE}Aggregate Statistics ({total_items} prompts):{RESET}")
             print(f"{BLUE}{'=' * 60}{RESET}")
-
+
             for metric_name, stats in aggregates.items():
                 pass_color = GREEN if stats['pass_rate'] >= 80 else YELLOW if stats['pass_rate'] >= 60 else RED
-
+                prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
+                total_prompts = stats.get('total_prompts', total_items)
+                print(f"{BOLD}{CYAN}{metric_name}:{RESET} ({prompts_evaluated}/{total_prompts} prompts)")
                 print(f"  Pass Rate: {pass_color}{stats['pass_rate']:.1f}%{RESET} ({stats['pass_count']}/{stats['total_evaluated']} passed)")
                 print(f"  Avg Score: {MAGENTA}{stats['avg_score']:.2f}{RESET}")
                 if stats.get('threshold') is not None:
                     print(f"  Threshold: {YELLOW}{stats['threshold']}{RESET}")
                 print()
-
+
             print(f"{BLUE}{'=' * 60}{RESET}")
             print()
-
-    print(f"{BOLD}{BLUE}
+
+    print(f"{BOLD}{BLUE}Individual Results:{RESET}")
     print(f"{BLUE}{'=' * 50}{RESET}")
     for i, result in enumerate(results, 1):
-
-
-
-
-
-
-
-        for
-
-        if
-
-
-
-
-
-
-
-
-
+        if result.get("type") == "multi_turn":
+            thread_name = result.get("name", "Unnamed Thread")
+            summary = result.get("summary", {})
+            status = summary.get("overall_status", STATUS_UNKNOWN)
+            status_color = GREEN if status == STATUS_PASS else YELLOW if status == STATUS_PARTIAL else RED
+
+            print(f"{BOLD}{MAGENTA}Thread {i}: {thread_name}{RESET}")
+            for t_idx, turn in enumerate(result.get("turns", []), 1):
+                turn_status = turn.get("status", STATUS_UNKNOWN)
+                turn_color = GREEN if turn_status == STATUS_PASS else RED if turn_status in (STATUS_FAIL, STATUS_ERROR) else YELLOW
+                print(f"{BOLD}{turn_color}Turn {t_idx}:{RESET} [{turn_status}] {turn.get('prompt', '')}")
+                _print_evaluated_item(
+                    response=turn.get("response", ""),
+                    expected_response=turn.get("expected_response", ""),
+                    evaluators_ran=turn.get("evaluators_ran", []),
+                    item_results=turn.get("results", {}),
+                    error=turn.get("error"),
+                )
+                print()
+            print(f"{BOLD}{MAGENTA}Thread {i} Summary:{RESET}")
+            print(f"  Status: {status_color}{status.upper()}{RESET}")
+            print(f"  Turns passed: {status_color}{summary.get('turns_passed', 0)}/{summary.get('turns_total', 0)}{RESET}")
+            print(f"{BLUE}{'-' * 30}{RESET}")
+        else:
+            print(f"{BOLD}{GREEN}Prompt {i}:{RESET} {result['prompt']}")
+            _print_evaluated_item(
+                response=result.get('response', ''),
+                expected_response=result.get('expected_response', ''),
+                evaluators_ran=result.get('evaluators_ran', []),
+                item_results=result.get('results', {}),
+                error=result.get('errorDetails'),
+            )
+            print(f"{BLUE}{'-' * 30}{RESET}")
 
 def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
     """Extract an EvalScore object from a decorated metric dict.
@@ -338,16 +955,14 @@ def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
     DEFAULT_THRESHOLD = 3  # fallback; decorate_metric should always set this
 
     score_val = None
-
-
-            score_val = data[k]
-            break
+    if metric_id in data and isinstance(data[metric_id], (int, float)):
+        score_val = data[metric_id]
     if score_val is None:
         return None
 
     result = data.get("result")
-    if result not in (
-        result =
+    if result not in (STATUS_PASS, STATUS_FAIL):
+        result = STATUS_PASS if score_val >= data.get("threshold", DEFAULT_THRESHOLD) else STATUS_FAIL
 
     eval_score: Dict[str, Any] = {
         "score": score_val,
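
Annotation: `_decorate_metric` and `extract_eval_score` share the same score-vs-threshold rule; a worked example (the metric id "relevance" is illustrative — real ids come from METRIC_IDS):

decorated = _decorate_metric("relevance", {"relevance": 4.0}, threshold=3)
# -> {"relevance": 4.0, "threshold": 3, "result": STATUS_PASS}   (4.0 >= 3)

decorated = _decorate_metric("relevance", {"relevance": 2.0}, threshold=3)
# -> {"relevance": 2.0, "threshold": 3, "result": STATUS_FAIL}   (2.0 < 3)

# extract_eval_score applies the same rule when 'result' is missing,
# falling back to DEFAULT_THRESHOLD = 3 if no threshold was recorded.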
@@ -360,48 +975,33 @@ def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
     return eval_score
 
 
-def
-    """Convert
+def _convert_scores_to_schema(results_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """Convert raw evaluator results to schema-compliant score objects.
 
-
-
-    Schema EvalItem format:
-    {prompt, response, expected_response, scores: {relevance: EvalScore, ...}}
+    Evaluator results in results_dict are dicts (from _decorate_metric) or
+    None when skipped/crashed. None values are omitted from output.
     """
-    item: Dict[str, Any] = {
-        "prompt": result["prompt"],
-        "response": result["response"],
-        "expected_response": result["expected_response"],
-    }
-
     scores: Dict[str, Any] = {}
-
-
-
-
-
-    (
-        ("coherence_score", "coherence", "coherence"),
-        ("groundedness_score", "groundedness", "groundedness"),
-        ("tool_call_accuracy_score", "tool_call_accuracy", "toolCallAccuracy"),
+
+    for eval_key, schema_key in [
+        (RELEVANCE, "relevance"),
+        (COHERENCE, "coherence"),
+        (GROUNDEDNESS, "groundedness"),
+        (TOOL_CALL_ACCURACY, "toolCallAccuracy"),
     ]:
-
-        if
+        data = results_dict.get(eval_key)
+        if data is None:
             continue
-
-        eval_score = extract_eval_score(data, metric_id)
+        eval_score = extract_eval_score(data, METRIC_IDS[eval_key])
         if eval_score:
             scores[schema_key] = eval_score
 
-
-
-
-        data = json.loads(raw_citations) if isinstance(raw_citations, str) else raw_citations
-        count = data.get("score", 0)
+    data = results_dict.get(CITATIONS)
+    if data is not None:
+        count = data.get("citations", 0)
         cit_result = data.get("result")
-        if cit_result not in (
-            cit_result =
-
+        if cit_result not in (STATUS_PASS, STATUS_FAIL):
+            cit_result = STATUS_PASS if count >= data.get("threshold", 1) else STATUS_FAIL
         citation_score: Dict[str, Any] = {
             "count": count,
             "result": cit_result,
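
Annotation: assembled, `_convert_scores_to_schema` yields a per-metric `scores` mapping like the sketch below (values and reasons are hypothetical; `STATUS_PASS`/`STATUS_FAIL` are stubbed here since their string values live in common.py, not in this diff):

STATUS_PASS, STATUS_FAIL = "pass", "fail"  # assumed values; real ones in common.py
scores = {
    "relevance": {"score": 4.0, "result": STATUS_PASS},  # EvalScore from extract_eval_score
    "citations": {"count": 2, "result": STATUS_PASS},
    "exactMatch": {"match": False, "result": STATUS_FAIL, "reason": "strings differ"},
    "partialMatch": {"score": 0.6, "result": STATUS_PASS, "threshold": 0.5,
                     "reason": "partial overlap"},
}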
@@ -411,17 +1011,100 @@ def convert_result_to_eval_item(result: Dict) -> Dict:
             citation_score["format"] = data["citation_format"]
         scores["citations"] = citation_score
 
+    data = results_dict.get(EXACT_MATCH)
+    if data is not None:
+        is_match = data.get("exact_match", 0.0) == 1.0
+        scores["exactMatch"] = {
+            "match": is_match,
+            "result": data.get("result", STATUS_PASS if is_match else STATUS_FAIL),
+            "reason": data.get("exact_match_reason", ""),
+        }
+
+    data = results_dict.get(PARTIAL_MATCH)
+    if data is not None:
+        scores["partialMatch"] = {
+            "score": data.get("partial_match", 0.0),
+            "result": data.get("result", STATUS_FAIL),
+            "threshold": data.get("threshold", 0.5),
+            "reason": data.get("partial_match_reason", ""),
+        }
+
+    return scores
+
+
+def convert_result_to_eval_item(result: Dict) -> Dict:
+    """Convert an internal evaluation result dict to a schema-compliant EvalItem."""
+    item: Dict[str, Any] = {
+        "prompt": result["prompt"],
+        "response": result["response"],
+        "expected_response": result["expected_response"],
+    }
+
+    if "evaluators" in result:
+        item["evaluators"] = result["evaluators"]
+    if "evaluators_mode" in result:
+        item["evaluators_mode"] = result["evaluators_mode"]
+
+    scores = _convert_scores_to_schema(result.get("results", {}))
     if scores:
         item["scores"] = scores
 
     return item
 
 
-def
+def convert_thread_result_to_output(thread_result: Dict) -> Dict:
+    """Convert a multi-turn thread result to the output format."""
+    output_turns = []
+    for turn in thread_result.get("turns", []):
+        output_turn: Dict[str, Any] = {"prompt": turn.get("prompt", "")}
+        if "expected_response" in turn:
+            output_turn["expected_response"] = turn["expected_response"]
+        if "response" in turn:
+            output_turn["response"] = turn["response"]
+        if "status" in turn:
+            output_turn["status"] = turn["status"]
+        if "error" in turn:
+            output_turn["error"] = turn["error"]
+        if "evaluators" in turn:
+            output_turn["evaluators"] = turn["evaluators"]
+        if "evaluators_mode" in turn:
+            output_turn["evaluators_mode"] = turn["evaluators_mode"]
+
+        scores = _convert_scores_to_schema(turn.get("results", {}))
+        if scores:
+            output_turn["scores"] = scores
+
+        output_turns.append(output_turn)
+
+    output: Dict[str, Any] = {}
+    if thread_result.get("name"):
+        output["name"] = thread_result["name"]
+    if thread_result.get("description"):
+        output["description"] = thread_result["description"]
+    if thread_result.get("conversation_id"):
+        output["conversation_id"] = thread_result["conversation_id"]
+    output["turns"] = output_turns
+    if thread_result.get("summary"):
+        output["summary"] = thread_result["summary"]
+
+    return output
+
+
+def convert_result_to_output_item(result: Dict) -> Dict:
+    """Convert an internal result dict to an output item. Routes by type."""
+    if result.get("type") == "multi_turn":
+        return convert_thread_result_to_output(result)
+    return convert_result_to_eval_item(result)
+
+
+def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None,
+                          default_evaluators: Optional[Dict[str, Any]] = None,
+                          agent_name: Optional[str] = None,
+                          cli_version: Optional[str] = None):
     """Write results to a schema-compliant eval document JSON file.
 
     Output follows the eval-document.schema.json format:
-    {schemaVersion, metadata, items: [EvalItem]}
+    {schemaVersion, metadata, default_evaluators?, items: [EvalItem]}
     """
     try:
         try:
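
Annotation: put together, `write_results_to_json` emits an eval document shaped like this sketch (all values hypothetical; note that `default_evaluators` is inserted before `items`, matching the following hunk):

output_data = {
    "schemaVersion": "1.0.0",
    "metadata": {
        "evaluatedAt": "2026-01-01T00:00:00+00:00",
        "agentId": "example-agent-id",
        "agentName": "Example Agent",
        "cliVersion": "1.4.0-preview.1",
    },
    "default_evaluators": {"relevance": {}},  # present only when provided
    "items": [],  # EvalItems and multi-turn thread outputs, in input order
}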
@@ -429,52 +1112,145 @@ def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optio
|
|
|
429
1112
|
except Exception:
|
|
430
1113
|
current_version = "1.0.0"
|
|
431
1114
|
|
|
432
|
-
items = [
|
|
1115
|
+
items = [convert_result_to_output_item(r) for r in results]
|
|
433
1116
|
|
|
434
1117
|
metadata: Dict[str, Any] = {
|
|
435
1118
|
"evaluatedAt": datetime.now(timezone.utc).isoformat(),
|
|
436
1119
|
}
|
|
437
1120
|
if agent_id:
|
|
438
1121
|
metadata["agentId"] = agent_id
|
|
1122
|
+
if agent_name:
|
|
1123
|
+
metadata["agentName"] = agent_name
|
|
1124
|
+
if cli_version:
|
|
1125
|
+
metadata["cliVersion"] = cli_version
|
|
439
1126
|
|
|
440
1127
|
output_data: Dict[str, Any] = {
|
|
441
1128
|
"schemaVersion": current_version,
|
|
442
1129
|
"metadata": metadata,
|
|
443
|
-
"items": items,
|
|
444
1130
|
}
|
|
445
1131
|
|
|
1132
|
+
if default_evaluators is not None:
|
|
1133
|
+
output_data["default_evaluators"] = default_evaluators
|
|
1134
|
+
|
|
1135
|
+
output_data["items"] = items
|
|
1136
|
+
|
|
446
1137
|
with open(output_file, 'w', encoding='utf-8') as f:
|
|
447
1138
|
json.dump(output_data, f, indent=2, ensure_ascii=False)
|
|
448
|
-
|
|
1139
|
+
emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
|
|
449
1140
|
except Exception as e:
|
|
450
|
-
|
|
1141
|
+
emit_structured_log("error", f"Error writing to JSON file: {e}", operation=Operation.WRITE_OUTPUT)
|
|
451
1142
|
sys.exit(1)
|
|
452
1143
|
|
|
-def
+def _results_to_csv_json(results_dict: Dict) -> str:
+    """Serialize evaluator results dict to a CSV-safe JSON string.
+
+    Skips None (crashed/skipped evaluators). Results are dicts produced
+    by _decorate_metric.
+    """
+    if not results_dict:
+        return ""
+    non_null = {k: v for k, v in results_dict.items() if v is not None}
+    return json.dumps(non_null) if non_null else ""
+
+
+def write_results_to_csv(results: List[Dict], output_file: str,
+                         agent_name: Optional[str] = None, agent_id: Optional[str] = None,
+                         cli_version: Optional[str] = None):
     """Write results to CSV file."""
     try:
         with open(output_file, 'w', newline='', encoding='utf-8') as f:
             if results:
-
-                if
-
-
+                metadata_parts = []
+                if agent_name:
+                    metadata_parts.append(f"Agent Name: {agent_name}")
+                if agent_id:
+                    metadata_parts.append(f"Agent ID: {agent_id}")
+                if cli_version:
+                    metadata_parts.append(f"CLI Version: {cli_version}")
+                if metadata_parts:
+                    f.write(f"# {' | '.join(metadata_parts)}\n")
+
+                aggregates = calculate_aggregate_statistics(results)
+                if aggregates:
+                    total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
+                    if total_items > 1:
                         f.write("# AGGREGATE STATISTICS\n")
-                        f.write("Metric,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
+                        f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
                         for metric_name, stats in aggregates.items():
                             threshold_str = str(stats.get('threshold', 'N/A'))
-
+                            prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
+                            total_prompts = stats.get('total_prompts', total_items)
+                            f.write(f"{metric_name},{prompts_evaluated},{total_prompts},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
                         f.write("\n# INDIVIDUAL RESULTS\n")
-
-
-
-
-
-
+
+                single_turn_rows = []
+                multi_turn_rows = []
+                for result in results:
+                    if result.get("type") == "multi_turn":
+                        thread_name = result.get("name", "")
+                        for turn_idx, turn in enumerate(result.get("turns", [])):
+                            multi_turn_rows.append({
+                                "thread_name": thread_name,
+                                "turn_index": turn_idx + 1,
+                                "prompt": turn.get("prompt", ""),
+                                "response": turn.get("response", ""),
+                                "expected_response": turn.get("expected_response", ""),
+                                "status": turn.get("status", ""),
+                                "error": turn.get("error", ""),
+                                "scores": _results_to_csv_json(turn.get("results", {})),
+                            })
+                        summary = result.get("summary", {})
+                        multi_turn_rows.append({
+                            "thread_name": thread_name,
+                            "turn_index": "summary",
+                            "prompt": "",
+                            "response": "",
+                            "expected_response": "",
+                            "status": summary.get("overall_status", ""),
+                            "scores": f"{summary.get('turns_passed', 0)}/{summary.get('turns_total', 0)} turns passed",
+                        })
+                    else:
+                        exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode', '_enhanced_response', 'results'}
+                        row = {k: v for k, v in result.items() if k not in exclude_keys}
+                        if "results" in result:
+                            row["scores"] = _results_to_csv_json(result["results"])
+                        single_turn_rows.append(row)
+
+                if single_turn_rows:
+                    if multi_turn_rows:
+                        f.write("# SINGLE-TURN RESULTS\n")
+                    fieldnames = list(single_turn_rows[0].keys())
+                    for row in single_turn_rows:
+                        for k in row:
+                            if k not in fieldnames:
+                                fieldnames.append(k)
+                    writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
+                    writer.writeheader()
+                    writer.writerows(single_turn_rows)
+
+                if multi_turn_rows:
+                    if single_turn_rows:
+                        f.write("\n")
+                    f.write("# MULTI-TURN RESULTS\n")
+                    fieldnames = ["thread_name", "turn_index", "prompt", "response", "expected_response", "status", "error", "scores"]
+                    writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
+                    writer.writeheader()
+                    writer.writerows(multi_turn_rows)
+        emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
     except Exception as e:
-
+        emit_structured_log("error", f"Error writing to CSV file: {e}", operation=Operation.WRITE_OUTPUT)
         sys.exit(1)

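The CSV writer leans on the None-skipping serializer above. A standalone restatement of that behavior, with made-up evaluator names:

    import json

    def to_csv_json(results_dict):
        # None marks an evaluator that crashed or was skipped; drop those entries.
        if not results_dict:
            return ""
        non_null = {k: v for k, v in results_dict.items() if v is not None}
        return json.dumps(non_null) if non_null else ""

    print(to_csv_json({"ExactMatch": {"score": 1.0}, "Citations": None}))
    # -> {"ExactMatch": {"score": 1.0}}
    print(repr(to_csv_json({"Citations": None})))
    # -> ''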
+def normalize_agent_id(agent_id):
+    """Append '.declarativeAgent' if agent_id has no '.', else return unchanged.
+
+    Returns the input unchanged when it is None/empty or already contains a dot.
+    """
+    if not agent_id:
+        return agent_id
+    return agent_id if '.' in agent_id else f"{agent_id}.declarativeAgent"
+
+
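Expected behavior of the new helper, with hypothetical agent IDs (the body is restated verbatim so the snippet runs standalone):

    def normalize_agent_id(agent_id):
        if not agent_id:
            return agent_id
        return agent_id if '.' in agent_id else f"{agent_id}.declarativeAgent"

    assert normalize_agent_id("contosoHelper") == "contosoHelper.declarativeAgent"
    assert normalize_agent_id("contosoHelper.declarativeAgent") == "contosoHelper.declarativeAgent"
    assert normalize_agent_id(None) is None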
 def parse_arguments():
     """Parse command line arguments."""
     parser = argparse.ArgumentParser(
@@ -503,8 +1279,8 @@ Examples:
     # Save results to HTML and open in browser
     python main.py --output report.html

-    #
-    python main.py --
+    # Debug-level diagnostics
+    python main.py --log-level debug

     # Sign out and clear cached authentication tokens
     python main.py --signout
@@ -553,28 +1329,41 @@ Examples:

     # Behavior options
     parser.add_argument(
-        '--
-
-
-
-
-        '--quiet',
-        action='store_true',
-        help='Suppress non-essential output'
-    )
-    parser.add_argument(
-        '--citation-format',
-        choices=['oai_unicode', 'legacy_bracket'],
-        default='oai_unicode',
-        help='Citation format to detect. "oai_unicode" for new OAI format (default), "legacy_bracket" for old [^i^] format'
+        '--log-level',
+        nargs='?',
+        const='info',
+        action='append',
+        help='Set log verbosity: debug, info, warning, error. Bare --log-level resolves to info.'
     )
+
     parser.add_argument(
         '--signout',
         action='store_true',
         help='Sign out and clear cached authentication tokens'
     )
+
+    parser.add_argument(
+        '--concurrency',
+        type=int,
+        default=5,
+        help='Number of parallel workers for prompt processing (1-5, default: 5)'
+    )

-
+    args = parser.parse_args()
+
+    args.m365_agent_id = normalize_agent_id(args.m365_agent_id)
+
+    if args.concurrency < 1:
+        parser.error('--concurrency must be an integer >= 1.')
+    if args.concurrency > MAX_CONCURRENCY:
+        emit_structured_log(
+            "warning",
+            f"--concurrency {args.concurrency} exceeds max {MAX_CONCURRENCY}; clamping to {MAX_CONCURRENCY}.",
+            operation=Operation.SETUP,
+        )
+        args.concurrency = MAX_CONCURRENCY
+
+    return args

 def validate_environment() -> CallPath:
     """Validate required environment variables."""
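How the unusual --log-level spec behaves: nargs='?' plus const lets the bare flag mean info, while action='append' collects repeats into a list. resolve_log_level lives in the new cli_logging helpers, so only a simplified stand-in is sketched here:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--log-level', nargs='?', const='info', action='append')

    print(parser.parse_args(['--log-level']).log_level)           # ['info'] (bare flag)
    print(parser.parse_args(['--log-level', 'debug']).log_level)  # ['debug']
    print(parser.parse_args([]).log_level)                        # None (flag absent)

    def resolve_last(values, valid=('debug', 'info', 'warning', 'error')):
        # Simplified stand-in: last occurrence wins, default is 'info'.
        if not values:
            return 'info'
        level = values[-1].lower()
        return level if level in valid else None

    print(resolve_last(['info', 'debug']))  # debug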
@@ -583,25 +1372,40 @@ def validate_environment() -> CallPath:
         "AZURE_AI_API_KEY",
         "AZURE_AI_API_VERSION",
         "AZURE_AI_MODEL_NAME",
-        # Chat API specific
-        "COPILOT_API_ENDPOINT",
-        "X_SCENARIO_HEADER"
     ]

     if os.environ.get("COPILOT_API_ACCESS_TOKEN"):
         call_path = CallPath.ACCESS_TOKEN
-        required_env_vars.
+        required_env_vars.extend([
+            "COPILOT_API_ACCESS_TOKEN",
+            "COPILOT_API_ENDPOINT",
+            "X_SCENARIO_HEADER",
+        ])
+    elif os.environ.get("WORK_IQ_A2A_ENDPOINT"):
+        call_path = CallPath.A2A
+        required_env_vars.extend([
+            "WORK_IQ_A2A_ENDPOINT",
+            "WORK_IQ_A2A_CLIENT_ID",
+            "TENANT_ID",
+        ])
     else:
         call_path = CallPath.COPILOT_AUTH
         required_env_vars.extend([
+            "COPILOT_API_ENDPOINT",
+            "X_SCENARIO_HEADER",
             "M365_EVAL_CLIENT_ID",
-            "TENANT_ID"
+            "TENANT_ID",
         ])

     missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
     if missing_vars:
-
-
+        emit_structured_log(
+            "error",
+            "Missing required environment variables: "
+            f"{', '.join(missing_vars)}. Please ensure your .env file contains "
+            "all required Azure configuration.",
+            operation=Operation.VALIDATE_ENV,
+        )
         sys.exit(1)
     return call_path

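This branch order decides which client main() builds. A minimal mirror of the precedence (plain strings stand in for the CallPath enum):

    def pick_call_path(env):
        # An explicit access token wins, then the experimental A2A endpoint,
        # otherwise fall back to interactive Copilot authentication.
        if env.get("COPILOT_API_ACCESS_TOKEN"):
            return "ACCESS_TOKEN"
        if env.get("WORK_IQ_A2A_ENDPOINT"):
            return "A2A"
        return "COPILOT_AUTH"

    print(pick_call_path({"COPILOT_API_ACCESS_TOKEN": "eyJ..."}))            # ACCESS_TOKEN
    print(pick_call_path({"WORK_IQ_A2A_ENDPOINT": "https://example.test"}))  # A2A
    print(pick_call_path({}))                                                # COPILOT_AUTH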
@@ -635,78 +1439,58 @@ def validate_endpoint_url(url: str, allowed_domains: List[str]) -> bool:
     # Convert other parsing errors to ValueError
     raise ValueError(f"Invalid URL format: {url}") from e

-def get_prompt_datasets(args) -> Tuple[List[
-    """Get prompts and expected responses based on command line arguments.
+def get_prompt_datasets(args) -> Tuple[List[Dict], Optional[Dict]]:
+    """Get prompts and expected responses based on command line arguments.
+
+    Returns:
+        Tuple of (eval_items, default_evaluators).
+    """
     if args.prompts:
         if args.expected and len(args.prompts) != len(args.expected):
-
+            emit_structured_log(
+                "error",
+                "Number of prompts must match number of expected responses. "
+                "Update --expected values to match the prompt count.",
+            )
             sys.exit(1)
-
-
+        expected_responses = args.expected or [""] * len(args.prompts)
+        eval_items = [
+            {"prompt": p, "expected_response": e}
+            for p, e in zip(args.prompts, expected_responses)
+        ]
+        return eval_items, None
     elif args.prompts_file:
-
+        return load_prompts_from_file(args.prompts_file)
     elif args.interactive:
         prompts, expected_responses = get_interactive_prompts()
+        eval_items = [
+            {"prompt": p, "expected_response": e}
+            for p, e in zip(prompts, expected_responses)
+        ]
+        return eval_items, None
     else:
-        # Use default prompts
         prompts, expected_responses = get_default_prompts_and_responses()
-
-
+        eval_items = [
+            {"prompt": p, "expected_response": e}
+            for p, e in zip(prompts, expected_responses)
+        ]
+        return eval_items, None

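Whatever the source, every returned item now shares one dict shape; hypothetical prompts below (only a prompts file can also supply a non-None default_evaluators):

    eval_items = [
        {"prompt": "What is our leave policy?", "expected_response": "See the HR portal."},
        {"prompt": "Summarize yesterday's standup.", "expected_response": ""},
    ]
    default_evaluators = None
    assert all({"prompt", "expected_response"} <= item.keys() for item in eval_items)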
-def
-    """
-    Fetch available agents for the user from the Copilot API.
-
-    Args:
-        access_token: Bearer token for API authentication
-        user_oid: User object ID for agent filtering
-
-    Returns:
-        List of agent dictionaries.
-    """
-    request_headers = {
-        "Content-Type": "application/json",
-        "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
-        "Authorization": f"Bearer {access_token}"
-    }
-
-    try:
-        # Build the query parameter with participant info
-        request_data = json.dumps({"participant": {"id": user_oid}})
-        query_param = urllib.parse.quote(request_data)
-
-        # Try to fetch agents from /GetGptList endpoint
-        req = urllib.request.Request(
-            f"{copilot_api_endpoint}/GetGptList?request={query_param}",
-            headers=request_headers,
-            method="GET"
-        )
-        with urllib.request.urlopen(req, timeout=120) as resp:
-            data = json.loads(resp.read().decode("utf-8"))
-            agents = data.get("gptList", [])
-            return agents
-    except urllib.error.HTTPError as e:
-        # If endpoint doesn't exist or returns error, return empty list
-        print(f"Warning: Unable to fetch agents list (HTTP {e.code}).")
-        return []
-    except Exception as e:
-        print(f"Warning: Error fetching agents: {e}")
-        return []
-
-def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
+def select_agent_interactively(agents: List[Dict[str, Any]]) -> Tuple[Optional[str], Optional[str]]:
     """
     Display an interactive agent selector using questionary.
-
+
     Args:
         agents: List of agent dictionaries.
-
+
     Returns:
-
+        Tuple of (agent_id, agent_name) or (None, None) if cancelled/skipped
     """
     if not agents:
-        return None
-
-    #
+        return None, None
+
+    # Build id→name lookup and choices
+    id_to_name: Dict[str, str] = {}
     choices = []
     sorted_agents = sorted(agents, key=lambda x: (not x.get('isOwner', False), x.get('name', '')))
     for agent in sorted_agents:
@@ -714,12 +1498,13 @@ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
         agent_name = agent.get("name", "Unknown")
         agent_description = agent.get("description", "Unknown")
         agent_is_owner = agent.get('isOwner')
-
+        id_to_name[agent_id] = agent_name
+
         # Format the display text
         display_text = f"{agent_name} ({agent_id}, IsOwner: {agent_is_owner}) - {agent_description}"
-
+
         choices.append(questionary.Choice(title=display_text, value=agent_id))
-
+
     # Display the selection prompt
     selected_agent = questionary.select(
         "Select an agent to evaluate:",
@@ -727,238 +1512,260 @@ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
         use_shortcuts=True,
         use_arrow_keys=True
     ).ask()
-
-    return selected_agent
-
-@functools.lru_cache(maxsize=1)
-def _get_iana_timezone_name() -> str:
-    """Get the IANA timezone name from the system using tzlocal.
-
-    Tries get_localzone_name() first; falls back to str(get_localzone()) when the
-    former raises (e.g. no zone configured on some Unix systems). Result is cached
-    after the first call so tzlocal is only invoked once per session.
-    """
-    try:
-        return tzlocal.get_localzone_name()
-    except Exception:
-        return str(tzlocal.get_localzone())
-
-
-@functools.lru_cache(maxsize=1)
-def _get_location_info() -> Dict[str, Any]:
-    """Return a locationInfo dict containing the local UTC offset and IANA timezone name.

-
-    """
-    now = datetime.now().astimezone()
-    utc_offset = now.utcoffset()
-    offset_hours = int(utc_offset.total_seconds() // 3600) if utc_offset is not None else 0
-    return {
-        "timeZoneOffset": offset_hours,
-        "timeZone": _get_iana_timezone_name(),
-    }
+    return selected_agent, id_to_name.get(selected_agent) if selected_agent else None


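Callers now unpack a pair instead of a bare ID. A small sketch of the id-to-name bookkeeping with made-up agents (the questionary prompt itself is skipped):

    agents = [
        {"id": "helper.declarativeAgent", "name": "Helper", "isOwner": True},
        {"id": "triage.declarativeAgent", "name": "Triage Bot", "isOwner": False},
    ]
    id_to_name = {a["id"]: a.get("name", "Unknown") for a in agents}
    selected = "triage.declarativeAgent"  # stand-in for questionary's .ask() result
    result = (selected, id_to_name.get(selected)) if selected else (None, None)
    print(result)  # ('triage.declarativeAgent', 'Triage Bot')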
-def
-
-        "message": {
-            "text": prompt,
-            "author": "user",
-            "messageType": "chat",
-            "timestamp": datetime.now(timezone.utc).isoformat(),
-            "locationInfo": _get_location_info(),
-            "from": {
-                "id": user_oid,
-            }
-        },
-        "verbosity": "verbose", # To enable detailed telemetry in response (to extract tool usage, etc.)
-    }
-
-    if agent_id:
-        message["gpts"] = [
-            {
-                "id": agent_id.strip(),
-                "source": "MOS3"
-            }
-        ]
-        message["optionsSets"] = [
-            "disable_action_confirmation" # Disable 3P action confirmation prompts for agents while scraping
-        ]
-
-    return json.dumps(message).encode("utf-8")
-
-def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str, access_token: str, user_oid: str, args) -> Dict[str, Dict[str, any]]:
-    """ Send prompts to the chat API and return enhanced responses. """
-
-    request_headers = {
-        "Content-Type": "application/json",
-        "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
-        "Authorization": f"Bearer {access_token}"
-    }
-    raw_responses: Dict[str, str] = {}
-    for i, prompt in enumerate(prompts, 1):
-        if not args.quiet:
-            print(f"Processing prompt {i}/{len(prompts)}...")
-
-        # Build the payload
-        payload = build_chat_payload(prompt, user_oid, args.m365_agent_id)
-        if args.verbose:
-            print(f"[Sydney] Sending payload: {payload.decode('utf-8')}")
-
-        # Send the request to /chat
-        req = urllib.request.Request(f"{copilot_api_endpoint}/chat", data=payload, headers=request_headers, method="POST")
-        try:
-            with urllib.request.urlopen(req, timeout=120) as resp:
-                raw = resp.read().decode("utf-8", errors="replace")
-        except urllib.error.HTTPError as e:
-            error_body = None
-            try:
-                error_body = e.read().decode("utf-8", errors="replace")
-            except Exception:
-                pass
-            msg = f"Chat API request failed (HTTP {e.code} {e.reason})."
-            if error_body:
-                msg += f" Body: {error_body[:500]}"
-            raise RuntimeError(msg) from e
-        except urllib.error.URLError as e:
-            raise RuntimeError(f"Chat API connection error: {getattr(e, 'reason', str(e))}") from e
-
-        if args.verbose:
-            print(f"[Sydney] Raw response: {raw}")
-
-        # Store raw response for enhancement
-        raw_responses[prompt] = raw.strip()
-
-    # Extract enhanced responses using the new extractor
-    enhanced_responses = extract_enhanced_responses(raw_responses)
-    return enhanced_responses
-
-def output_results(results: List[Dict], args):
+def output_results(results: List[Dict], args, default_evaluators: Optional[Dict[str, Any]] = None,
+                   agent_name: Optional[str] = None, cli_version: Optional[str] = None):
     """Output results based on specified format."""
+    metadata_kwargs = dict(
+        agent_name=agent_name,
+        agent_id=getattr(args, 'm365_agent_id', None),
+        cli_version=cli_version,
+    )
     if args.output:
         output_lower = args.output.lower()
         if output_lower.endswith('.json'):
-            write_results_to_json(results, args.output,
+            write_results_to_json(results, args.output, default_evaluators=default_evaluators,
+                                  **metadata_kwargs)
         elif output_lower.endswith('.csv'):
-            write_results_to_csv(results, args.output)
+            write_results_to_csv(results, args.output, **metadata_kwargs)
         elif output_lower.endswith('.html'):
-            write_results_to_html(results, args.output)
+            write_results_to_html(results, args.output, **metadata_kwargs)
             abs_path = os.path.abspath(args.output)
             webbrowser.open(f'file://{abs_path}')
         else:
-            write_results_to_json(results, args.output,
+            write_results_to_json(results, args.output, default_evaluators=default_evaluators,
+                                  **metadata_kwargs)
     else:
-        write_results_to_console(results)
+        write_results_to_console(results, **metadata_kwargs)

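A compact restatement of the suffix routing above (a sketch only; the real writers are the module's write_results_to_* functions):

    from pathlib import Path

    def pick_writer(output):
        # Case-insensitive suffix match; anything unrecognized falls back to JSON.
        suffix = Path(output.lower()).suffix
        return {".json": "json", ".csv": "csv", ".html": "html"}.get(suffix, "json")

    assert pick_writer("report.HTML") == "html"
    assert pick_writer("results.csv") == "csv"
    assert pick_writer("notes.txt") == "json"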
 def main():
     """Main function to orchestrate the evaluation process."""
     load_dotenv()
     args = parse_arguments()

+    effective_log_level, error_message = resolve_log_level(args.log_level)
+    if error_message:
+        print(error_message)
+        print(
+            "Next step: rerun with --log-level {debug|info|warning|error}. "
+            "For support, share the console diagnostics output from this run."
+        )
+        sys.exit(2)
+
+    args.effective_log_level = effective_log_level
+    configure_cli_logging(effective_log_level)
+    emit_structured_log("info", f"Log level set to '{effective_log_level}'.", operation=Operation.SETUP)
+
     # Check minimum version before proceeding
-
-
+    quiet_for_version = effective_log_level in ("warning", "error")
+    cli_version = get_cli_version(quiet=quiet_for_version)
+    if not should_bypass_min_version_check(args) and not check_min_version(cli_version, quiet=quiet_for_version):
         sys.exit(1)

     # Validate environment variables required for evaluation
     call_path = validate_environment()
-    copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
-    validate_endpoint_url(copilot_api_endpoint, ALLOWED_ENDPOINTS)

     user_oid = ""

-
-
-
-
-
-
-
-
-
-
-
-
-
-    # Signout user
-    if args.signout:
-        try:
-            auth_handler.clear_cache()
-        except Exception as e:
-            print(f"Error during signout: {e}")
-            sys.exit(1)
-        sys.exit(0)
-
-    # Authenticate before loading prompts
-    try:
-        auth_result = auth_handler.acquire_token_interactive() or {}
-        access_token = auth_result.get("access_token") or ""
-        if not access_token:
-            raise RuntimeError("Failed to acquire access token from authentication result")
-
-        id_token_claims = auth_result.get("id_token_claims")
-        if not isinstance(id_token_claims, dict):
-            print("id_token_claims is missing or invalid in authentication result")
-        else:
-            user_oid = id_token_claims.get("oid") or ""
-
-    except Exception as e:
-        print(f"\033[91mError during authentication: {e}\033[0m")
-        if args.verbose:
-            import traceback
-            traceback.print_exc()
-        sys.exit(1)
+    match call_path:
+        case CallPath.ACCESS_TOKEN:
+            access_token = os.environ["COPILOT_API_ACCESS_TOKEN"]
+            user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)
+            copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
+            validate_endpoint_url(copilot_api_endpoint, ALLOWED_ENDPOINTS)
+            agent_client = SydneyClient(
+                copilot_api_endpoint=copilot_api_endpoint,
+                access_token=access_token,
+                user_oid=user_oid,
+                logger=CLI_LOGGER,
+                diagnostic_records=DIAGNOSTIC_RECORDS,
+            )

-
-
-
+        case CallPath.A2A:
+            emit_structured_log(
+                "warning",
+                "The A2A endpoint is experimental and may change without notice.",
+                operation=Operation.SETUP,
+            )
+            a2a_endpoint = os.environ["WORK_IQ_A2A_ENDPOINT"]
+            validate_endpoint_url(a2a_endpoint, ALLOWED_ENDPOINTS)
+
+            a2a_scopes_str = os.environ.get("WORK_IQ_A2A_SCOPES", "")
+            a2a_auth_handler = AuthHandler(
+                client_id=os.environ["WORK_IQ_A2A_CLIENT_ID"],
+                tenant_id=os.environ["TENANT_ID"],
+                scopes_str=a2a_scopes_str,
+            )
+            try:
+                a2a_auth_result = a2a_auth_handler.acquire_token_interactive() or {}
+                a2a_access_token = a2a_auth_result.get("access_token") or ""
+                if not a2a_access_token:
+                    raise RuntimeError("Failed to acquire A2A access token")
+            except Exception as e:
+                emit_structured_log(
+                    "error",
+                    f"Error during A2A authentication: {e}",
+                    operation=Operation.AUTHENTICATE,
+                )
+                if effective_log_level == "debug":
+                    import traceback
+                    traceback.print_exc()
+                sys.exit(1)
+            try:
+                agent_client = A2AClient(
+                    a2a_endpoint=a2a_endpoint,
+                    access_token=a2a_access_token,
+                    logger=CLI_LOGGER,
+                    diagnostic_records=DIAGNOSTIC_RECORDS,
+                )
+            except Exception as e:
+                emit_structured_log(
+                    "error",
+                    f"Failed to initialize A2A client: {e}",
+                    operation=Operation.SETUP,
+                )
+                sys.exit(1)
+
+        case CallPath.COPILOT_AUTH:
+            copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
+            validate_endpoint_url(copilot_api_endpoint, ALLOWED_ENDPOINTS)
+            auth_handler = AuthHandler(
+                client_id=os.environ["M365_EVAL_CLIENT_ID"],
+                tenant_id=os.environ["TENANT_ID"],
+                scopes_str=os.environ.get("COPILOT_SCOPES", ""),
+            )
+
+            # Signout user
+            if args.signout:
+                try:
+                    auth_handler.clear_cache()
+                except Exception as e:
+                    emit_structured_log("error", f"Error during signout: {e}", operation=Operation.AUTHENTICATE)
+                    sys.exit(1)
+                sys.exit(0)

-
-
+            try:
+                auth_result = auth_handler.acquire_token_interactive() or {}
+                access_token = auth_result.get("access_token") or ""
+                if not access_token:
+                    raise RuntimeError("Failed to acquire access token from authentication result")
+
+                id_token_claims = auth_result.get("id_token_claims")
+                if not isinstance(id_token_claims, dict):
+                    emit_structured_log(
+                        "warning", "id_token_claims is missing or invalid in authentication result",
+                        operation=Operation.AUTHENTICATE,
+                    )
+                else:
+                    user_oid = id_token_claims.get("oid") or ""
+
+                if not user_oid:
+                    user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)

-
-
+            except Exception as e:
+                emit_structured_log("error", f"Error during authentication: {e}", operation=Operation.AUTHENTICATE)
+                if effective_log_level == "debug":
+                    import traceback
+                    traceback.print_exc()
+                sys.exit(1)
+
+            agent_client = SydneyClient(
+                copilot_api_endpoint=copilot_api_endpoint,
+                access_token=access_token,
+                user_oid=user_oid,
+                logger=CLI_LOGGER,
+                diagnostic_records=DIAGNOSTIC_RECORDS,
+            )

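When only an access token is supplied, the user OID is read from the token itself. extract_user_oid_from_access_token belongs to the package's AuthHandler; the sketch below only illustrates the general technique such a helper presumably relies on (decoding the unverified JWT payload and reading the oid claim), using a locally built token:

    import base64
    import json

    def oid_from_jwt(token):
        # Read the 'oid' claim from a JWT payload without verifying the signature.
        payload = token.split(".")[1]
        payload += "=" * (-len(payload) % 4)  # restore stripped base64 padding
        return json.loads(base64.urlsafe_b64decode(payload)).get("oid", "")

    demo = ".".join([
        base64.urlsafe_b64encode(b'{"alg":"none"}').decode().rstrip("="),
        base64.urlsafe_b64encode(b'{"oid":"00000000-0000-0000-0000-000000000000"}').decode().rstrip("="),
        "",
    ])
    print(oid_from_jwt(demo))  # 00000000-0000-0000-0000-000000000000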
+    # 1. Load evaluation datasets
+    eval_items, file_default_evaluators = get_prompt_datasets(args)
+    default_evaluators = resolve_default_evaluators(file_default_evaluators)
+
+    if effective_log_level in ("info", "debug"):
+        multi_turn_count = sum(1 for item in eval_items if "turns" in item)
+        single_turn_count = len(eval_items) - multi_turn_count
+        emit_structured_log(
+            "info",
+            f"Running evaluation on {len(eval_items)} item(s) "
+            f"({single_turn_count} single-turn, {multi_turn_count} multi-turn).",
+            operation=Operation.SETUP,
+        )
+
+    agent_name = None
     try:
-        #
+        # 2. Agent selection - when no agent ID is provided, discover agents
+        # via the active client (A2A or REST) and prompt interactively.
         if not args.m365_agent_id:
-            if
-
-
-            available_agents = fetch_available_agents(copilot_api_endpoint, access_token, user_oid)
-            if not available_agents:
-                print("No agents are available for interactive selection. Please re-run with --m365-agent-id or set the M365_AGENT_ID environment variable.")
-                sys.exit(1)
-
-            if available_agents:
-                selected_agent_id = select_agent_interactively(available_agents)
-                if selected_agent_id:
-                    args.m365_agent_id = selected_agent_id
-                    if not args.quiet:
-                        print(f"Selected agent: {args.m365_agent_id}")
-                else:
-                    print("No agent selected. Please re-run with --m365-agent-id or set the M365_AGENT_ID environment variable.")
-                    sys.exit(1)
+            if effective_log_level in ("info", "debug"):
+                emit_structured_log("info", "No agent ID provided. Fetching available agents.", operation=Operation.FETCH_AGENTS)

-
-
+            available_agents = agent_client.fetch_available_agents()
+            if not available_agents:
+                emit_structured_log(
+                    "error",
+                    "No agents are available for interactive selection. Re-run with "
+                    "--m365-agent-id or set M365_AGENT_ID.",
+                    operation=Operation.FETCH_AGENTS,
+                )
+                sys.exit(1)
+
+            selected_agent_id, agent_name = select_agent_interactively(available_agents)
+            if selected_agent_id:
+                args.m365_agent_id = selected_agent_id
+                if effective_log_level in ("info", "debug"):
+                    emit_structured_log("info", f"Selected agent: {args.m365_agent_id}", operation=Operation.FETCH_AGENTS)
+            else:
+                emit_structured_log(
+                    "error",
+                    "No agent selected. Re-run with --m365-agent-id or set M365_AGENT_ID.",
+                    operation=Operation.FETCH_AGENTS,
+                )
+                sys.exit(1)
     except Exception as e:
-
-        if
+        emit_structured_log("error", f"Error during agent discovery: {e}", operation=Operation.FETCH_AGENTS)
+        if effective_log_level == "debug":
             import traceback
             traceback.print_exc()
         sys.exit(1)
+
+    # Pre-resolve agent endpoint (A2A agent card lookup; no-op for REST)
+    if args.m365_agent_id:
+        agent_client.resolve_agent(args.m365_agent_id)
+
+    # 3. Build pipeline config and run evaluation pipeline
+    model_config = AzureOpenAIModelConfiguration(
+        azure_endpoint=os.environ.get("AZURE_AI_OPENAI_ENDPOINT"),
+        api_key=os.environ.get("AZURE_AI_API_KEY"),
+        api_version=os.environ.get("AZURE_AI_API_VERSION"),
+        azure_deployment=os.environ.get("AZURE_AI_MODEL_NAME"),
+    )
+    has_azure_openai = bool(
+        os.environ.get("AZURE_AI_OPENAI_ENDPOINT")
+        and os.environ.get("AZURE_AI_API_KEY")
+    )
+
+    pipeline = PipelineConfig(
+        agent_client=agent_client,
+        model_config=model_config,
+        has_azure_openai=has_azure_openai,
+        default_evaluators=default_evaluators,
+    )
+
+    results = run_pipeline(pipeline, eval_items, args)

-    #
-
-
-    results = run_evaluations(args, responses, expected_responses)
-
-    # 6. Output results
-    output_results(results, args)
+    # 4. Output results
+    output_results(results, args, default_evaluators=default_evaluators,
+                   agent_name=agent_name, cli_version=str(cli_version) if cli_version else None)

-    if
-
+    if effective_log_level in ("info", "debug"):
+        emit_structured_log(
+            "info",
+            f"Evaluation completed successfully. Processed {len(eval_items)} item(s).",
+            operation=Operation.EVALUATE,
+        )

 # Call the main function when script is run directly
 if __name__ == "__main__":
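The single/multi-turn split in main() keys off a "turns" list. A hedged sketch of the two item shapes (field values invented; the package's schema examples are the authoritative format):

    eval_items = [
        {"prompt": "What is Contoso's travel policy?", "expected_response": ""},
        {"name": "follow-up-thread", "turns": [
            {"prompt": "Who owns project Falcon?"},
            {"prompt": "Email me their last status update."},
        ]},
    ]
    multi_turn = sum(1 for item in eval_items if "turns" in item)
    print(f"{len(eval_items) - multi_turn} single-turn, {multi_turn} multi-turn")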