@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.4.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +129 -97
- package/package.json +7 -4
- package/schema/v1/eval-document.schema.json +140 -8
- package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
- package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
- package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
- package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
- package/schema/v1/examples/valid/multi-turn-output.json +59 -0
- package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
- package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
- package/schema/version.json +2 -2
- package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
- package/src/clients/cli/api_clients/A2A/a2a_client.py +456 -0
- package/src/clients/cli/api_clients/REST/__init__.py +3 -0
- package/src/clients/cli/api_clients/REST/sydney_client.py +204 -0
- package/src/clients/cli/api_clients/__init__.py +3 -0
- package/src/clients/cli/api_clients/base_agent_client.py +78 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +54 -2
- package/src/clients/cli/cli_logging/logging_utils.py +0 -1
- package/src/clients/cli/common.py +11 -0
- package/src/clients/cli/generate_report.py +272 -129
- package/src/clients/cli/main.py +1006 -476
- package/src/clients/cli/parallel_executor.py +57 -0
- package/src/clients/cli/requirements.txt +1 -1
- package/src/clients/cli/response_extractor.py +12 -14
- package/src/clients/cli/retry_policy.py +52 -0
- package/src/clients/cli/samples/multiturn_example.json +35 -0
- package/src/clients/cli/throttle_gate.py +82 -0
- package/src/clients/node-js/bin/runevals.js +79 -16
- package/src/clients/node-js/config/default.js +5 -1
- package/src/clients/node-js/lib/agent-id.js +12 -0
- package/src/clients/node-js/lib/env-loader.js +11 -16
- package/src/clients/node-js/lib/eula-manager.js +78 -0
- package/src/clients/node-js/lib/progress.js +13 -11
package/src/clients/cli/main.py
CHANGED
|
@@ -3,15 +3,19 @@ import os
|
|
|
3
3
|
import argparse
|
|
4
4
|
import sys
|
|
5
5
|
import csv
|
|
6
|
-
import functools
|
|
7
6
|
import logging
|
|
7
|
+
import time
|
|
8
8
|
import webbrowser
|
|
9
|
-
import urllib.request
|
|
10
|
-
import urllib.error
|
|
11
9
|
import urllib.parse
|
|
12
10
|
import questionary
|
|
11
|
+
from dataclasses import dataclass, field
|
|
13
12
|
from enum import Enum
|
|
14
13
|
from typing import List, Dict, Tuple, Optional, Any
|
|
14
|
+
|
|
15
|
+
from api_clients.A2A import A2AClient
|
|
16
|
+
from api_clients.REST import SydneyClient
|
|
17
|
+
from api_clients.base_agent_client import BaseAgentClient
|
|
18
|
+
|
|
15
19
|
from azure.ai.evaluation import (
|
|
16
20
|
AzureOpenAIModelConfiguration,
|
|
17
21
|
RelevanceEvaluator,
|
|
@@ -25,7 +29,7 @@ from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFor
|
|
|
25
29
|
from custom_evaluators.ExactMatchEvaluator import ExactMatchEvaluator
|
|
26
30
|
from custom_evaluators.PartialMatchEvaluator import PartialMatchEvaluator
|
|
27
31
|
from generate_report import generate_html_report, calculate_aggregate_statistics
|
|
28
|
-
from response_extractor import
|
|
32
|
+
from response_extractor import get_response_text_for_evaluation
|
|
29
33
|
from schema_handler import DocumentUpgrader, SchemaVersionManager
|
|
30
34
|
from common import (
|
|
31
35
|
RELEVANCE,
|
|
@@ -38,10 +42,14 @@ from common import (
|
|
|
38
42
|
REQUIRES_AZURE_OPENAI,
|
|
39
43
|
REQUIRES_TOOL_DEFINITIONS,
|
|
40
44
|
METRIC_IDS,
|
|
45
|
+
STATUS_PASS,
|
|
46
|
+
STATUS_FAIL,
|
|
47
|
+
STATUS_ERROR,
|
|
48
|
+
STATUS_PARTIAL,
|
|
49
|
+
STATUS_UNKNOWN,
|
|
41
50
|
pascal_case_to_title,
|
|
42
51
|
)
|
|
43
52
|
from evaluator_resolver import (
|
|
44
|
-
EVALUATOR_REGISTRY,
|
|
45
53
|
validate_evaluator_names,
|
|
46
54
|
check_prerequisites,
|
|
47
55
|
resolve_default_evaluators,
|
|
@@ -51,20 +59,86 @@ from evaluator_resolver import (
|
|
|
51
59
|
from version_check import check_min_version, get_cli_version
|
|
52
60
|
from datetime import datetime, timezone
|
|
53
61
|
from pathlib import Path
|
|
54
|
-
import tzlocal
|
|
55
62
|
|
|
56
|
-
|
|
57
|
-
from cli_logging.
|
|
63
|
+
|
|
64
|
+
from cli_logging.console_diagnostics import emit_structured_log as _emit_structured_log
|
|
65
|
+
from cli_logging.logging_utils import LOG_LEVEL_MAP, Operation, resolve_log_level
|
|
66
|
+
from parallel_executor import execute_in_parallel
|
|
67
|
+
from throttle_gate import ThrottleGate
|
|
68
|
+
from retry_policy import (
|
|
69
|
+
is_retryable_status,
|
|
70
|
+
get_backoff_seconds,
|
|
71
|
+
get_retry_after_seconds,
|
|
72
|
+
)
|
|
58
73
|
|
|
59
74
|
# Allowed endpoints for URL validation
|
|
60
75
|
ALLOWED_ENDPOINTS = [
|
|
61
|
-
'substrate.office.com'
|
|
76
|
+
'substrate.office.com',
|
|
77
|
+
'graph.microsoft.com',
|
|
62
78
|
]
|
|
63
79
|
|
|
80
|
+
# Hard ceiling on the number of parallel workers; requested concurrency is
# clamped to this value when the worker count is computed.
MAX_CONCURRENCY = 5
# Total send attempts per prompt/turn: the initial attempt plus 3 retries.
MAX_ATTEMPTS = 4
# Multi-turn items whose 'turns' array is longer than this are rejected
# before any worker is dispatched.
MAX_TURNS_PER_THREAD = 20
# Threads with more turns than this still run, but a warning is logged first.
LONG_THREAD_WARNING_THRESHOLD = 10
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@dataclass
class PipelineConfig:
    """Bundle of runtime dependencies consumed by the evaluation pipeline.

    The first four fields are required. The remaining fields carry defaults
    (a fresh throttle gate and the module-level retry helpers) and can be
    overridden when constructing the config.
    """

    # Client whose send_prompt() is used to deliver prompts to the agent.
    agent_client: BaseAgentClient
    # Model configuration handed to the evaluation helpers.
    model_config: AzureOpenAIModelConfiguration
    # Passed to the evaluators' prerequisite check as the Azure OpenAI flag.
    has_azure_openai: bool
    # Evaluator map applied when an item/turn does not override it.
    default_evaluators: Dict[str, Any]
    # Gate consulted before every send attempt; each config instance gets
    # its own ThrottleGate named "chat_api" unless one is supplied.
    chat_gate: ThrottleGate = field(default_factory=lambda: ThrottleGate("chat_api"))
    # Retry-policy callables; defaults are the functions imported from
    # retry_policy.
    is_retryable_status: Any = field(default=is_retryable_status)
    get_backoff_seconds: Any = field(default=get_backoff_seconds)
    # NOTE(review): the visible retry loops call the module-level
    # get_retry_after_seconds directly rather than this field — confirm
    # whether that is intended.
    get_retry_after_seconds: Any = field(default=get_retry_after_seconds)
|
|
97
|
+
|
|
64
98
|
class CallPath(Enum):
    """Identifies which call path to use."""

    # Each member's value is its lower-case string identifier.
    ACCESS_TOKEN = "access_token"
    COPILOT_AUTH = "copilot_auth"
    A2A = "a2a"
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class ItemType(Enum):
    """Shape of an evaluation item: one prompt, or a thread of turns."""

    # Item carries a 'prompt' field and no 'turns'.
    SINGLE_TURN = "single_turn"
    # Item carries a non-empty 'turns' list.
    MULTI_TURN = "multi_turn"
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def detect_item_type(item: dict) -> ItemType:
    """Classify an evaluation item as single-turn or multi-turn.

    Args:
        item: Evaluation item; expected to carry either a 'prompt' field
            or a 'turns' list, never both.

    Returns:
        ItemType.MULTI_TURN when the item has a non-empty 'turns' list,
        ItemType.SINGLE_TURN when it has 'prompt' and no 'turns'.

    Raises:
        ValueError: If the item has both keys, neither key, a 'turns'
            value that is not a list, or an empty 'turns' list.
    """
    if "turns" not in item:
        # No thread present: the item is single-turn only if it has a prompt.
        if "prompt" in item:
            return ItemType.SINGLE_TURN
        raise ValueError(
            "Invalid evaluation item: must have either 'turns' array (multi-turn) "
            "or 'prompt' field (single-turn)"
        )

    # From here on 'turns' is present; a sibling 'prompt' is ambiguous.
    if "prompt" in item:
        raise ValueError(
            "Invalid evaluation item: cannot have both 'turns' and 'prompt'. "
            "Use 'turns' for multi-turn threads or 'prompt' for single-turn."
        )

    thread = item["turns"]
    if not isinstance(thread, list):
        raise ValueError("Invalid evaluation item: 'turns' must be a list")
    if not thread:
        raise ValueError("Invalid multi-turn thread: 'turns' array cannot be empty")
    return ItemType.MULTI_TURN
|
|
68
142
|
|
|
69
143
|
|
|
70
144
|
# Flags that should bypass remote min-version enforcement.
|
|
@@ -78,40 +152,21 @@ CLI_LOGGER = logging.getLogger(CLI_LOGGER_NAME)
|
|
|
78
152
|
DIAGNOSTIC_RECORDS: List[Dict[str, Any]] = []
|
|
79
153
|
|
|
80
154
|
|
|
81
|
-
def _ensure_logger_handler() -> None:
|
|
82
|
-
if CLI_LOGGER.handlers:
|
|
83
|
-
return
|
|
84
|
-
handler = logging.StreamHandler(sys.stdout)
|
|
85
|
-
handler.setFormatter(logging.Formatter("%(message)s"))
|
|
86
|
-
CLI_LOGGER.addHandler(handler)
|
|
87
|
-
CLI_LOGGER.propagate = False
|
|
88
|
-
|
|
89
|
-
|
|
90
155
|
def configure_cli_logging(effective_log_level: str) -> None:
    """Attach the CLI stdout handler once and apply the requested log level.

    Args:
        effective_log_level: Key into LOG_LEVEL_MAP; an unknown key raises
            KeyError.
    """
    # Install the handler only on first use so repeated calls do not
    # duplicate output.
    if not CLI_LOGGER.handlers:
        message_only = logging.Formatter("%(message)s")
        stdout_handler = logging.StreamHandler(sys.stdout)
        stdout_handler.setFormatter(message_only)
        CLI_LOGGER.addHandler(stdout_handler)
        CLI_LOGGER.propagate = False
    CLI_LOGGER.setLevel(LOG_LEVEL_MAP[effective_log_level])
|
|
93
162
|
|
|
94
163
|
|
|
95
164
|
def emit_structured_log(level: str, message: str, operation: str = Operation.EVALUATE) -> None:
    """Emit a structured log entry through the shared console-diagnostics helper.

    Thin wrapper that binds this module's logger and diagnostic-record list
    so callers only supply the level, message and operation.

    Args:
        level: Log level name forwarded to the helper.
        message: Text of the log entry.
        operation: Operation tag for the entry; defaults to
            Operation.EVALUATE.
    """
    _emit_structured_log(
        level,
        message,
        operation,
        logger=CLI_LOGGER,
        diagnostic_records=DIAGNOSTIC_RECORDS,
    )
|
|
109
|
-
DIAGNOSTIC_RECORDS.append(entry)
|
|
110
|
-
|
|
111
|
-
try:
|
|
112
|
-
CLI_LOGGER.log(LOG_LEVEL_MAP.get(level, logging.INFO), render_diagnostic(entry))
|
|
113
|
-
except Exception:
|
|
114
|
-
pass
|
|
115
170
|
|
|
116
171
|
|
|
117
172
|
def should_bypass_min_version_check(args: argparse.Namespace) -> bool:
|
|
@@ -237,118 +292,88 @@ def get_interactive_prompts() -> Tuple[List[str], List[str]]:
|
|
|
237
292
|
|
|
238
293
|
return prompts, expected_responses
|
|
239
294
|
|
|
240
|
-
def run_evaluations(args, responses: List[Dict[str, Any]], eval_items: List[Dict],
|
|
241
|
-
default_evaluators: Dict[str, Any]) -> list:
|
|
242
|
-
"""Run evaluations against the responses using per-prompt evaluator resolution.
|
|
243
295
|
|
|
244
|
-
|
|
245
|
-
args: CLI arguments.
|
|
246
|
-
responses: List of enhanced response dicts (one per prompt, aligned with eval_items by index).
|
|
247
|
-
eval_items: List of item dicts (prompt, expected_response, evaluators, evaluators_mode).
|
|
248
|
-
default_evaluators: Resolved default evaluators (from resolve_default_evaluators).
|
|
249
|
-
"""
|
|
250
|
-
if len(responses) != len(eval_items):
|
|
251
|
-
raise ValueError(
|
|
252
|
-
f"Mismatch between number of responses ({len(responses)}) and evaluation items ({len(eval_items)})."
|
|
253
|
-
)
|
|
254
|
-
|
|
255
|
-
model_config = AzureOpenAIModelConfiguration(
|
|
256
|
-
azure_endpoint=os.environ.get("AZURE_AI_OPENAI_ENDPOINT"),
|
|
257
|
-
api_key=os.environ.get("AZURE_AI_API_KEY"),
|
|
258
|
-
api_version=os.environ.get("AZURE_AI_API_VERSION"),
|
|
259
|
-
azure_deployment=os.environ.get("AZURE_AI_MODEL_NAME"),
|
|
260
|
-
)
|
|
261
|
-
|
|
262
|
-
# Build available context for prerequisite checks
|
|
263
|
-
has_azure_openai = bool(
|
|
264
|
-
os.environ.get("AZURE_AI_OPENAI_ENDPOINT")
|
|
265
|
-
and os.environ.get("AZURE_AI_API_KEY")
|
|
266
|
-
)
|
|
296
|
+
_DEFAULT_PASS_THRESHOLD = 3
|
|
267
297
|
|
|
268
|
-
DEFAULT_PASS_THRESHOLD = 3
|
|
269
298
|
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
score_val = None
|
|
280
|
-
if isinstance(data, dict):
|
|
281
|
-
if metric_id in data:
|
|
282
|
-
score_val = data[metric_id]
|
|
283
|
-
if isinstance(score_val, (int, float)):
|
|
284
|
-
payload['threshold'] = pass_threshold
|
|
285
|
-
payload['result'] = 'pass' if score_val >= pass_threshold else 'fail'
|
|
286
|
-
else:
|
|
287
|
-
payload['threshold'] = pass_threshold
|
|
288
|
-
payload.setdefault('result', 'unknown')
|
|
289
|
-
return json.dumps(payload, indent=4)
|
|
290
|
-
|
|
291
|
-
# Validate all evaluator names upfront (across defaults and all items)
|
|
292
|
-
all_evaluator_maps = [default_evaluators]
|
|
293
|
-
for eval_item in eval_items:
|
|
294
|
-
if "evaluators" in eval_item:
|
|
295
|
-
all_evaluator_maps.append(eval_item["evaluators"])
|
|
296
|
-
for emap in all_evaluator_maps:
|
|
297
|
-
validate_evaluator_names(emap)
|
|
298
|
-
|
|
299
|
-
evaluation_results = []
|
|
300
|
-
for enhanced_response, eval_item in zip(responses, eval_items):
|
|
301
|
-
actual_response_text = get_response_text_for_evaluation(enhanced_response)
|
|
302
|
-
prompt = eval_item.get("prompt", "")
|
|
303
|
-
expected_response = eval_item.get("expected_response", "")
|
|
304
|
-
prompt_evaluators = eval_item.get("evaluators")
|
|
305
|
-
evaluators_mode = eval_item.get("evaluators_mode", "extend")
|
|
306
|
-
|
|
307
|
-
# Resolve evaluators for this prompt
|
|
308
|
-
resolved = resolve_evaluators_for_prompt(
|
|
309
|
-
prompt_evaluators, evaluators_mode, prompt, default_evaluators,
|
|
310
|
-
)
|
|
299
|
+
def _decorate_metric(metric_id: str, data, threshold: Optional[int] = None) -> Dict[str, Any]:
    """Wrap raw evaluator output with a threshold and a pass/fail verdict.

    Args:
        metric_id: Key under which the numeric score is looked up in data.
        data: Raw evaluator output. A dict is copied into the payload;
            anything else is stored under the 'raw' key.
        threshold: Minimum passing score; falls back to
            _DEFAULT_PASS_THRESHOLD when None.

    Returns:
        A new dict holding the original fields plus 'threshold' and
        'result'. When no numeric score is found, an existing 'result' is
        kept, otherwise it is set to STATUS_UNKNOWN.
    """
    pass_threshold = _DEFAULT_PASS_THRESHOLD if threshold is None else threshold

    if isinstance(data, dict):
        payload = dict(data)
        score = data.get(metric_id)
    else:
        payload = {'raw': data}
        score = None

    # The threshold is always recorded, overriding any value in data.
    payload['threshold'] = pass_threshold
    if isinstance(score, (int, float)):
        payload['result'] = STATUS_PASS if score >= pass_threshold else STATUS_FAIL
    else:
        payload.setdefault('result', STATUS_UNKNOWN)
    return payload
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def _run_evaluators_for_item(
|
|
322
|
+
prompt: str,
|
|
323
|
+
actual_response: str,
|
|
324
|
+
expected_response: str,
|
|
325
|
+
enhanced_response: Dict[str, Any],
|
|
326
|
+
resolved_evaluators: Dict[str, Any],
|
|
327
|
+
model_config: AzureOpenAIModelConfiguration,
|
|
328
|
+
has_azure_openai: bool,
|
|
329
|
+
args,
|
|
330
|
+
) -> Tuple[Dict[str, Optional[str]], List[str]]:
|
|
331
|
+
"""Run resolved evaluators against a single item/turn.
|
|
332
|
+
|
|
333
|
+
Returns (results_dict, evaluators_ran).
|
|
334
|
+
"""
|
|
335
|
+
has_tool_defs = bool(
|
|
336
|
+
args.m365_agent_id and enhanced_response.get("tool_definitions")
|
|
337
|
+
)
|
|
338
|
+
available_context = {
|
|
339
|
+
REQUIRES_AZURE_OPENAI: has_azure_openai,
|
|
340
|
+
REQUIRES_TOOL_DEFINITIONS: has_tool_defs,
|
|
341
|
+
}
|
|
320
342
|
|
|
321
|
-
|
|
322
|
-
|
|
343
|
+
results_dict: Dict[str, Optional[str]] = {}
|
|
344
|
+
evaluators_ran: List[str] = []
|
|
323
345
|
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
if
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
346
|
+
for eval_name, eval_options in resolved_evaluators.items():
|
|
347
|
+
can_run, warn_msg = check_prerequisites(eval_name, available_context)
|
|
348
|
+
if not can_run:
|
|
349
|
+
if warn_msg:
|
|
350
|
+
emit_structured_log(
|
|
351
|
+
"warning",
|
|
352
|
+
f"Evaluator '{eval_name}' prerequisite check failed: {warn_msg}",
|
|
353
|
+
operation=Operation.EVALUATE,
|
|
354
|
+
)
|
|
355
|
+
results_dict[eval_name] = None
|
|
356
|
+
continue
|
|
332
357
|
|
|
333
|
-
|
|
334
|
-
threshold = get_evaluator_threshold(eval_name, eval_options)
|
|
358
|
+
threshold = get_evaluator_threshold(eval_name, eval_options)
|
|
335
359
|
|
|
360
|
+
try:
|
|
336
361
|
if eval_name == RELEVANCE:
|
|
337
|
-
raw_score = RelevanceEvaluator(model_config=model_config)(query=prompt, response=
|
|
338
|
-
results_dict[RELEVANCE] =
|
|
362
|
+
raw_score = RelevanceEvaluator(model_config=model_config)(query=prompt, response=actual_response)
|
|
363
|
+
results_dict[RELEVANCE] = _decorate_metric(METRIC_IDS[RELEVANCE], raw_score, threshold)
|
|
339
364
|
elif eval_name == COHERENCE:
|
|
340
|
-
raw_score = CoherenceEvaluator(model_config=model_config)(query=prompt, response=
|
|
341
|
-
results_dict[COHERENCE] =
|
|
365
|
+
raw_score = CoherenceEvaluator(model_config=model_config)(query=prompt, response=actual_response)
|
|
366
|
+
results_dict[COHERENCE] = _decorate_metric(METRIC_IDS[COHERENCE], raw_score, threshold)
|
|
342
367
|
elif eval_name == GROUNDEDNESS:
|
|
343
|
-
raw_score = GroundednessEvaluator(model_config=model_config)(response=
|
|
344
|
-
results_dict[GROUNDEDNESS] =
|
|
368
|
+
raw_score = GroundednessEvaluator(model_config=model_config)(response=actual_response, context=expected_response)
|
|
369
|
+
results_dict[GROUNDEDNESS] = _decorate_metric(METRIC_IDS[GROUNDEDNESS], raw_score, threshold)
|
|
345
370
|
elif eval_name == TOOL_CALL_ACCURACY:
|
|
346
371
|
raw_score = ToolCallAccuracyEvaluator(model_config)(
|
|
347
372
|
query=prompt,
|
|
348
|
-
response=enhanced_response.get("response",
|
|
349
|
-
tool_definitions=enhanced_response
|
|
373
|
+
response=enhanced_response.get("response", actual_response),
|
|
374
|
+
tool_definitions=enhanced_response.get("tool_definitions", []),
|
|
350
375
|
)
|
|
351
|
-
results_dict[TOOL_CALL_ACCURACY] =
|
|
376
|
+
results_dict[TOOL_CALL_ACCURACY] = _decorate_metric(METRIC_IDS[TOOL_CALL_ACCURACY], raw_score, threshold)
|
|
352
377
|
elif eval_name == CITATIONS:
|
|
353
378
|
fmt_str = eval_options.get("citation_format", "oai_unicode")
|
|
354
379
|
fmt_map = {
|
|
@@ -356,45 +381,456 @@ def run_evaluations(args, responses: List[Dict[str, Any]], eval_items: List[Dict
|
|
|
356
381
|
"bracket": CitationFormat.LEGACY_BRACKET,
|
|
357
382
|
"mixed": CitationFormat.AUTO,
|
|
358
383
|
}
|
|
359
|
-
raw_score = CitationsEvaluator(citation_format=fmt_map.get(fmt_str, CitationFormat.OAI_UNICODE))(response=
|
|
360
|
-
results_dict[CITATIONS] =
|
|
384
|
+
raw_score = CitationsEvaluator(citation_format=fmt_map.get(fmt_str, CitationFormat.OAI_UNICODE))(response=actual_response)
|
|
385
|
+
results_dict[CITATIONS] = _decorate_metric(METRIC_IDS[CITATIONS], raw_score, threshold)
|
|
361
386
|
elif eval_name == EXACT_MATCH:
|
|
362
|
-
# ExactMatch is binary (match/no-match) — it includes its own result
|
|
363
|
-
# field, so we skip decorate_metric which assumes a numeric score.
|
|
364
387
|
case_sensitive = eval_options.get("case_sensitive", False)
|
|
365
|
-
raw_score = ExactMatchEvaluator(case_sensitive=case_sensitive)(response=
|
|
366
|
-
|
|
388
|
+
raw_score = ExactMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response, expected_answer=expected_response)
|
|
389
|
+
# ExactMatch is binary — the evaluator already sets 'result'
|
|
390
|
+
# so _decorate_metric (which computes result from score vs threshold) is not needed.
|
|
391
|
+
results_dict[EXACT_MATCH] = raw_score
|
|
367
392
|
elif eval_name == PARTIAL_MATCH:
|
|
368
393
|
case_sensitive = eval_options.get("case_sensitive", False)
|
|
369
|
-
raw_score = PartialMatchEvaluator(case_sensitive=case_sensitive)(response=
|
|
370
|
-
results_dict[PARTIAL_MATCH] =
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
394
|
+
raw_score = PartialMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response, expected_answer=expected_response)
|
|
395
|
+
results_dict[PARTIAL_MATCH] = _decorate_metric(METRIC_IDS[PARTIAL_MATCH], raw_score, threshold)
|
|
396
|
+
|
|
397
|
+
evaluators_ran.append(eval_name)
|
|
398
|
+
except Exception as e:
|
|
399
|
+
emit_structured_log(
|
|
400
|
+
"error",
|
|
401
|
+
f"Evaluator '{eval_name}' crashed and will be omitted from results: {e}",
|
|
402
|
+
operation=Operation.EVALUATE,
|
|
403
|
+
)
|
|
404
|
+
results_dict[eval_name] = None
|
|
405
|
+
|
|
406
|
+
return results_dict, evaluators_ran
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def _evaluate_single_response(
    enhanced_response: Dict[str, Any],
    eval_item: Dict,
    args,
    model_config: AzureOpenAIModelConfiguration,
    has_azure_openai: bool,
    default_evaluators: Dict[str, Any],
) -> Dict[str, Any]:
    """Evaluate one prompt/response pair and assemble its result record.

    Args:
        enhanced_response: Response dict returned by the agent client.
        eval_item: Item dict; 'prompt', 'expected_response', 'evaluators'
            and 'evaluators_mode' are read from it.
        args: CLI arguments, forwarded to the evaluator runner and checked
            for 'effective_log_level'.
        model_config: Model configuration forwarded to the evaluators.
        has_azure_openai: Forwarded to the evaluator prerequisite check.
        default_evaluators: Evaluator map used when the item has no override.

    Returns:
        Dict with 'prompt', 'response', 'expected_response',
        'evaluators_ran' and 'results', plus the item's 'evaluators' and
        'evaluators_mode' when the item defines them.
    """
    prompt = eval_item.get("prompt", "")
    expected_response = eval_item.get("expected_response", "")
    actual_response_text = get_response_text_for_evaluation(enhanced_response)

    resolved = resolve_evaluators_for_prompt(
        eval_item.get("evaluators"),
        eval_item.get("evaluators_mode", "extend"),
        prompt,
        default_evaluators,
    )
    results_dict, evaluators_ran = _run_evaluators_for_item(
        prompt,
        actual_response_text,
        expected_response,
        enhanced_response,
        resolved,
        model_config,
        has_azure_openai,
        args,
    )

    # Prefer the display-oriented text when the client provided one.
    shown_response = enhanced_response.get("display_response_text", actual_response_text)
    evaluation_result = {
        "prompt": prompt,
        "response": shown_response,
        "expected_response": expected_response,
        "evaluators_ran": evaluators_ran,
        "results": results_dict,
    }

    # Echo per-item evaluator overrides back into the result, if present.
    for passthrough_key in ("evaluators", "evaluators_mode"):
        if passthrough_key in eval_item:
            evaluation_result[passthrough_key] = eval_item[passthrough_key]

    debug_enabled = getattr(args, "effective_log_level", "info") == "debug"
    if debug_enabled:
        emit_structured_log(
            "debug",
            f"Evaluation completed for prompt='{prompt}'. "
            f"Evaluators: {', '.join(evaluators_ran)}. "
            f"Scores: {results_dict}",
            operation=Operation.EVALUATE,
        )

    return evaluation_result
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
def _check_all_passed(results_dict: Dict[str, Optional[Dict[str, Any]]]) -> bool:
|
|
460
|
+
"""Check if all evaluator results passed. Skipped evaluators (None) are ignored."""
|
|
461
|
+
for result_data in results_dict.values():
|
|
462
|
+
if result_data is None:
|
|
463
|
+
continue
|
|
464
|
+
if result_data.get("result") == STATUS_FAIL:
|
|
465
|
+
return False
|
|
466
|
+
return True
|
|
467
|
+
|
|
468
|
+
def _evaluate_multi_turn_responses(
    turns: List[Dict],
    args,
    default_evaluators: Dict[str, Any],
    model_config: AzureOpenAIModelConfiguration,
    has_azure_openai: bool,
) -> Tuple[List[Dict], Dict]:
    """Evaluate every turn of a thread and summarise the outcome.

    Args:
        turns: Turn dicts. A turn whose 'status' equals STATUS_ERROR is not
            evaluated; otherwise its '_enhanced_response' is scored.
        args: CLI arguments, forwarded to the evaluator runner and checked
            for 'effective_log_level'.
        default_evaluators: Evaluator map used when a turn has no override.
        model_config: Model configuration forwarded to the evaluators.
        has_azure_openai: Forwarded to the evaluator prerequisite check.

    Returns:
        Tuple of (evaluated_turns, summary). Each evaluated turn is a new
        dict (the input turns are not modified) holding the prompt, any
        passthrough fields, a status, and either results or an error.
        The summary reports total/passed/failed counts and an overall
        status of pass, fail or partial.
    """
    passthrough_keys = ("expected_response", "response", "evaluators", "evaluators_mode")
    debug_enabled = getattr(args, "effective_log_level", "info") == "debug"

    evaluated_turns: List[Dict] = []
    turns_passed = 0

    for turn_number, turn in enumerate(turns, start=1):
        prompt = turn.get("prompt", "")
        evaluated_turn: Dict[str, Any] = {"prompt": prompt}
        for key in passthrough_keys:
            if key in turn:
                evaluated_turn[key] = turn[key]

        # A turn that already failed upstream is reported, not evaluated.
        if turn.get("status") == STATUS_ERROR:
            evaluated_turn["status"] = STATUS_ERROR
            evaluated_turn["error"] = turn.get("error", "")
            evaluated_turns.append(evaluated_turn)
            continue

        enhanced_response = turn.get("_enhanced_response", {})
        actual_response = get_response_text_for_evaluation(enhanced_response)

        resolved = resolve_evaluators_for_prompt(
            turn.get("evaluators"),
            turn.get("evaluators_mode", "extend"),
            prompt,
            default_evaluators,
        )
        results_dict, evaluators_ran = _run_evaluators_for_item(
            prompt,
            actual_response,
            turn.get("expected_response", ""),
            enhanced_response,
            resolved,
            model_config,
            has_azure_openai,
            args,
        )

        all_passed = _check_all_passed(results_dict)
        evaluated_turn["results"] = results_dict
        evaluated_turn["evaluators_ran"] = evaluators_ran
        evaluated_turn["status"] = STATUS_PASS if all_passed else STATUS_FAIL

        if debug_enabled:
            emit_structured_log(
                "debug",
                f"Evaluation completed for turn {turn_number} prompt='{prompt}'. "
                f"Evaluators: {', '.join(evaluators_ran)}. "
                f"Scores: {results_dict}",
                operation=Operation.EVALUATE,
            )

        if all_passed:
            turns_passed += 1
        evaluated_turns.append(evaluated_turn)

    # Every turn is either passed or failed (errors count as failed).
    turns_total = len(turns)
    turns_failed = turns_total - turns_passed

    if turns_passed == turns_total:
        overall_status = STATUS_PASS
    elif turns_failed == turns_total:
        overall_status = STATUS_FAIL
    else:
        overall_status = STATUS_PARTIAL

    summary = {
        "turns_total": turns_total,
        "turns_passed": turns_passed,
        "turns_failed": turns_failed,
        "overall_status": overall_status,
    }
    return evaluated_turns, summary
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
def get_effective_worker_count(prompt_count: int, args) -> int:
    """Return how many workers to use for the given number of prompts.

    Args:
        prompt_count: Number of items to process.
        args: Object whose optional 'concurrency' attribute holds the
            requested worker count; a missing or non-integer value is
            treated as 5.

    Returns:
        1 when there is nothing to process; otherwise the requested count
        clamped to the range 1..MAX_CONCURRENCY and never above
        prompt_count.
    """
    if prompt_count <= 0:
        return 1

    fallback = 5
    raw_value = getattr(args, "concurrency", fallback)
    try:
        wanted = int(raw_value)
    except (TypeError, ValueError):
        wanted = fallback

    # Clamp to the global ceiling, then make sure at least one worker runs.
    capped = min(wanted, MAX_CONCURRENCY)
    if capped < 1:
        capped = 1
    return min(capped, prompt_count)
|
|
572
|
+
|
|
573
|
+
|
|
574
|
+
def run_pipeline(
|
|
575
|
+
pipeline: PipelineConfig,
|
|
576
|
+
eval_items: List[Dict],
|
|
577
|
+
args,
|
|
578
|
+
) -> List[Dict[str, Any]]:
|
|
579
|
+
"""Run the full evaluation pipeline: send prompts and evaluate responses in parallel.
|
|
580
|
+
|
|
581
|
+
Each worker processes one prompt end-to-end: send → evaluate.
|
|
582
|
+
Results are returned in original prompt order (FR-006).
|
|
583
|
+
"""
|
|
584
|
+
# Validate all evaluator names upfront before dispatching workers
|
|
585
|
+
all_evaluator_maps = [pipeline.default_evaluators]
|
|
586
|
+
for eval_item in eval_items:
|
|
587
|
+
if "evaluators" in eval_item:
|
|
588
|
+
all_evaluator_maps.append(eval_item["evaluators"])
|
|
589
|
+
for turn in eval_item.get("turns", []):
|
|
590
|
+
if "evaluators" in turn:
|
|
591
|
+
all_evaluator_maps.append(turn["evaluators"])
|
|
592
|
+
for emap in all_evaluator_maps:
|
|
593
|
+
validate_evaluator_names(emap)
|
|
594
|
+
|
|
595
|
+
# Validate all items upfront and classify types before dispatching workers
|
|
596
|
+
item_types: List[ItemType] = []
|
|
597
|
+
for idx, eval_item in enumerate(eval_items):
|
|
598
|
+
try:
|
|
599
|
+
item_type = detect_item_type(eval_item)
|
|
600
|
+
except ValueError as e:
|
|
601
|
+
raise ValueError(f"Invalid evaluation item at index {idx}: {e}") from e
|
|
602
|
+
if item_type == ItemType.MULTI_TURN:
|
|
603
|
+
turn_count = len(eval_item["turns"])
|
|
604
|
+
if turn_count > MAX_TURNS_PER_THREAD:
|
|
605
|
+
raise ValueError(
|
|
606
|
+
f"Invalid evaluation item at index {idx}: 'turns' array has "
|
|
607
|
+
f"{turn_count} items (max {MAX_TURNS_PER_THREAD})"
|
|
608
|
+
)
|
|
609
|
+
item_types.append(item_type)
|
|
610
|
+
|
|
611
|
+
total = len(eval_items)
|
|
612
|
+
worker_count = get_effective_worker_count(total, args)
|
|
613
|
+
|
|
614
|
+
multi_turn_count = sum(1 for t in item_types if t == ItemType.MULTI_TURN)
|
|
615
|
+
single_turn_count = total - multi_turn_count
|
|
616
|
+
|
|
617
|
+
emit_structured_log(
|
|
618
|
+
"info",
|
|
619
|
+
f"Running pipeline with {worker_count} worker(s) for {total} item(s) "
|
|
620
|
+
f"({single_turn_count} single-turn, {multi_turn_count} multi-turn).",
|
|
621
|
+
operation=Operation.EVALUATE,
|
|
622
|
+
)
|
|
623
|
+
|
|
624
|
+
def _process_item(eval_item: Dict, index: int) -> Dict[str, Any]:
|
|
625
|
+
if item_types[index] == ItemType.MULTI_TURN:
|
|
626
|
+
return _process_multi_turn(eval_item, index)
|
|
627
|
+
return _process_single_turn(eval_item, index)
|
|
628
|
+
|
|
629
|
+
def _process_single_turn(eval_item: Dict, index: int) -> Dict[str, Any]:
|
|
630
|
+
prompt = eval_item.get("prompt", "")
|
|
631
|
+
emit_structured_log(
|
|
632
|
+
"info",
|
|
633
|
+
f"Processing item {index + 1}/{total} (single-turn).",
|
|
634
|
+
operation=Operation.SEND_PROMPT,
|
|
635
|
+
)
|
|
636
|
+
|
|
637
|
+
# Phase A: Send prompt to agent (with retry + throttle gate)
|
|
638
|
+
response = None
|
|
639
|
+
for attempt in range(1, MAX_ATTEMPTS + 1):
|
|
640
|
+
pipeline.chat_gate.wait_if_blocked()
|
|
641
|
+
try:
|
|
642
|
+
response, _ = pipeline.agent_client.send_prompt(prompt, agent_id=args.m365_agent_id)
|
|
643
|
+
break
|
|
644
|
+
except Exception as exc:
|
|
645
|
+
cause = exc.__cause__
|
|
646
|
+
status = int(getattr(cause, "code", 0) or 0) or None if cause else None
|
|
647
|
+
retry_after = get_retry_after_seconds(
|
|
648
|
+
cause.headers.get("Retry-After") if cause and getattr(cause, "headers", None) else None
|
|
649
|
+
)
|
|
650
|
+
|
|
651
|
+
if retry_after is not None and pipeline.is_retryable_status(status):
|
|
652
|
+
pipeline.chat_gate.apply_retry_after(retry_after)
|
|
653
|
+
|
|
654
|
+
if not pipeline.is_retryable_status(status) or attempt >= MAX_ATTEMPTS:
|
|
655
|
+
emit_structured_log(
|
|
656
|
+
"error",
|
|
657
|
+
f"Item {index + 1}/{total} failed after {attempt} attempt(s): {exc}",
|
|
658
|
+
operation=Operation.SEND_PROMPT,
|
|
659
|
+
)
|
|
660
|
+
return {
|
|
661
|
+
"prompt": prompt,
|
|
662
|
+
"response": "",
|
|
663
|
+
"expected_response": eval_item.get("expected_response", ""),
|
|
664
|
+
"evaluators_ran": [],
|
|
665
|
+
"results": {},
|
|
666
|
+
"status": STATUS_ERROR,
|
|
667
|
+
"errorDetails": str(exc),
|
|
668
|
+
}
|
|
669
|
+
|
|
670
|
+
delay = retry_after if retry_after is not None else pipeline.get_backoff_seconds(attempt)
|
|
671
|
+
time.sleep(delay)
|
|
672
|
+
|
|
673
|
+
# Phase B: Evaluate response
|
|
674
|
+
return _evaluate_single_response(
|
|
675
|
+
response, eval_item, args,
|
|
676
|
+
pipeline.model_config, pipeline.has_azure_openai,
|
|
677
|
+
pipeline.default_evaluators,
|
|
678
|
+
)
|
|
679
|
+
|
|
680
|
+
def _process_multi_turn(eval_item: Dict, index: int) -> Dict[str, Any]:
    """Run one multi-turn thread: send every turn in order, then evaluate them.

    Each turn's prompt is sent through ``pipeline.agent_client.send_prompt``;
    the ``conversation_context`` returned by one call is passed into the next
    so the turns share a conversation. A turn is retried only when the
    failure's status is 429; any other failure marks that turn and every
    remaining turn as ``STATUS_ERROR`` and stops sending. The collected turns
    are then handed to ``_evaluate_multi_turn_responses``.

    NOTE(review): ``total``, ``pipeline`` and ``args`` are not parameters;
    they are presumably captured from the enclosing function's scope, which
    is not visible here -- confirm.

    Args:
        eval_item: Thread definition. Must contain ``"turns"`` (a sequence of
            dicts each holding a ``"prompt"``); ``"name"`` and
            ``"description"`` are optional.
        index: Zero-based position of the item, used only in log messages.

    Returns:
        A dict with ``type`` (always ``"multi_turn"``), ``name``,
        ``description``, ``conversation_id`` (empty string when none was
        captured), ``turns`` and ``summary`` (the last two as returned by
        ``_evaluate_multi_turn_responses``).
    """
    turns = eval_item["turns"]
    thread_name = eval_item.get("name", "Unnamed thread")
    emit_structured_log(
        "info",
        f"Processing item {index + 1}/{total} (multi-turn: '{thread_name}').",
        operation=Operation.SEND_PROMPT,
    )

    # Long threads are still processed; the warning is informational only.
    if len(turns) > LONG_THREAD_WARNING_THRESHOLD:
        emit_structured_log(
            "warning",
            f"Thread '{thread_name}' has {len(turns)} turns (>{LONG_THREAD_WARNING_THRESHOLD}). This may take a while.",
            operation=Operation.SEND_PROMPT,
        )

    # Phase A: Send each turn with throttle gate + 429-only retry
    # Multi-turn only retries on 429 (server confirmed it didn't process
    # the request). Other transient errors (503, 504) are ambiguous about
    # whether the server processed the turn, risking duplicate turns in
    # the conversation if retried.
    conversation_context = None
    conversation_id = None
    enriched_turns: List[Dict[str, Any]] = []
    failed = False

    for i, turn in enumerate(turns):
        prompt = turn["prompt"]
        emit_structured_log(
            "debug",
            f"Sending turn {i + 1}/{len(turns)} of '{thread_name}'.",
            operation=Operation.SEND_PROMPT,
        )

        response = None
        for attempt in range(1, MAX_ATTEMPTS + 1):
            # Wait here if the chat gate is currently blocked (a Retry-After
            # applied below sets it).
            pipeline.chat_gate.wait_if_blocked()
            try:
                response, conversation_context = pipeline.agent_client.send_prompt(
                    prompt, agent_id=args.m365_agent_id,
                    conversation_context=conversation_context,
                )
                break
            except Exception as exc:
                # Status and headers are read from the chained cause, not from
                # exc itself. NOTE(review): presumably the client raises its
                # own error "from" the underlying HTTP error -- confirm.
                cause = exc.__cause__
                # Yields None when there is no cause or its code is missing/zero.
                status = int(getattr(cause, "code", 0) or 0) or None if cause else None
                retry_after = get_retry_after_seconds(
                    cause.headers.get("Retry-After") if cause and getattr(cause, "headers", None) else None
                )

                # Only retry on 429 — server confirmed it didn't process the request
                if status == 429 and attempt < MAX_ATTEMPTS:
                    if retry_after is not None:
                        pipeline.chat_gate.apply_retry_after(retry_after)
                    # Prefer the server-provided delay; otherwise use the
                    # pipeline's backoff for this attempt number.
                    delay = retry_after if retry_after is not None else pipeline.get_backoff_seconds(attempt)
                    time.sleep(delay)
                    continue

                # All other errors: stop the thread
                emit_structured_log(
                    "error",
                    f"Turn {i + 1}/{len(turns)} of '{thread_name}' failed after {attempt} attempt(s): {exc}",
                    operation=Operation.SEND_PROMPT,
                )
                failed = True
                break

        if failed:
            # Mark this turn and all remaining turns as error
            enriched_turns.append({
                **turn,
                "response": "",
                "status": STATUS_ERROR,
                "error": "Failed to get response from agent",
            })
            for j in range(i + 1, len(turns)):
                enriched_turns.append({
                    **turns[j],
                    "response": "",
                    "status": STATUS_ERROR,
                    "error": "Skipped: preceding turn failed",
                })
            break

        # Enrich turn with response
        response_text = get_response_text_for_evaluation(response)
        enriched_turns.append({
            **turn,
            # Prefer the response's display text when present; otherwise use
            # the text extracted for evaluation.
            "response": response.get("display_response_text", response_text),
            "_enhanced_response": response,
        })

        # Capture conversation_id from first response
        # (re-checked on later turns for as long as it is still None).
        if conversation_id is None:
            conversation_id = response.get("metadata", {}).get("conversation_id")

    # Phase B: Run per-turn evaluations
    evaluated_turns, summary = _evaluate_multi_turn_responses(
        enriched_turns, args, pipeline.default_evaluators,
        model_config=pipeline.model_config,
        has_azure_openai=pipeline.has_azure_openai,
    )

    return {
        "type": "multi_turn",
        "name": eval_item.get("name", ""),
        "description": eval_item.get("description", ""),
        "conversation_id": conversation_id or "",
        "turns": evaluated_turns,
        "summary": summary,
    }
|
|
791
|
+
|
|
792
|
+
execution_results = execute_in_parallel(
|
|
793
|
+
eval_items, _process_item, max_workers=worker_count,
|
|
794
|
+
)
|
|
795
|
+
|
|
796
|
+
# Unwrap WorkerResult objects into plain dicts, with error fallback
|
|
797
|
+
ordered_results: List[Dict[str, Any]] = []
|
|
798
|
+
for wr in execution_results:
|
|
799
|
+
if wr.error:
|
|
800
|
+
idx = wr.index
|
|
801
|
+
item = eval_items[idx]
|
|
802
|
+
if item_types[idx] == ItemType.MULTI_TURN:
|
|
803
|
+
ordered_results.append({
|
|
804
|
+
"type": "multi_turn",
|
|
805
|
+
"name": item.get("name", ""),
|
|
806
|
+
"turns": [
|
|
807
|
+
{**t, "status": STATUS_ERROR, "error": str(wr.error), "response": "", "results": {}}
|
|
808
|
+
for t in item.get("turns", [])
|
|
809
|
+
],
|
|
810
|
+
"summary": {
|
|
811
|
+
"turns_total": len(item.get("turns", [])),
|
|
812
|
+
"turns_passed": 0,
|
|
813
|
+
"turns_failed": len(item.get("turns", [])),
|
|
814
|
+
"overall_status": STATUS_FAIL,
|
|
815
|
+
},
|
|
816
|
+
"error": str(wr.error),
|
|
817
|
+
})
|
|
818
|
+
else:
|
|
819
|
+
ordered_results.append({
|
|
820
|
+
"prompt": item.get("prompt", ""),
|
|
821
|
+
"response": "",
|
|
822
|
+
"expected_response": item.get("expected_response", ""),
|
|
823
|
+
"evaluators_ran": [],
|
|
824
|
+
"results": {},
|
|
825
|
+
"status": STATUS_ERROR,
|
|
826
|
+
"errorDetails": str(wr.error),
|
|
827
|
+
})
|
|
828
|
+
else:
|
|
829
|
+
ordered_results.append(wr.value)
|
|
830
|
+
|
|
831
|
+
return ordered_results
|
|
832
|
+
|
|
396
833
|
|
|
397
|
-
return evaluation_results
|
|
398
834
|
|
|
399
835
|
def write_results_to_console(results, agent_name: Optional[str] = None,
|
|
400
836
|
agent_id: Optional[str] = None,
|
|
@@ -411,6 +847,35 @@ def write_results_to_console(results, agent_name: Optional[str] = None,
|
|
|
411
847
|
RED = '\033[91m'
|
|
412
848
|
RESET = '\033[0m'
|
|
413
849
|
|
|
850
|
+
def _print_evaluated_item(response: str, expected_response: str,
                          evaluators_ran: List[str], item_results: Dict[str, Any],
                          error: Optional[str] = None) -> None:
    """Print the detail lines for one evaluated prompt or turn.

    The caller prints the "Prompt N" / "Turn N" header. This helper prints,
    in order and only when non-empty: the evaluator list, the response, the
    expected response and the error, followed by one JSON-formatted line per
    evaluator result whose value is not None.
    """
    if evaluators_ran:
        joined_names = ', '.join(evaluators_ran)
        print(f"{BOLD}{CYAN}Evaluators:{RESET} {joined_names}")

    # (colour, label, text) for the optional free-text sections.
    text_sections = (
        (CYAN, "Response", response),
        (YELLOW, "Expected Response", expected_response),
        (RED, "Error", error),
    )
    for tint, label, text in text_sections:
        if text:
            print(f"{BOLD}{tint}{label}:{RESET} {text}")

    for metric_key, metric_value in item_results.items():
        # None marks an evaluator that produced no result; nothing to show.
        if metric_value is not None:
            tint = (
                MAGENTA if metric_key == RELEVANCE
                else ORANGE if metric_key == COHERENCE
                else BLUE
            )
            title = pascal_case_to_title(metric_key)
            rendered = json.dumps(metric_value, indent=4)
            print(f"{BOLD}{tint}{title}:{RESET} {rendered}")
|
|
878
|
+
|
|
414
879
|
# Show metadata
|
|
415
880
|
metadata_parts = []
|
|
416
881
|
if agent_name:
|
|
@@ -423,17 +888,17 @@ def write_results_to_console(results, agent_name: Optional[str] = None,
|
|
|
423
888
|
print(f"{BOLD}{CYAN}{' | '.join(metadata_parts)}{RESET}")
|
|
424
889
|
print()
|
|
425
890
|
|
|
426
|
-
|
|
427
|
-
if
|
|
428
|
-
|
|
429
|
-
if
|
|
430
|
-
print(f"{BOLD}{BLUE}Aggregate Statistics ({
|
|
891
|
+
aggregates = calculate_aggregate_statistics(results)
|
|
892
|
+
if aggregates:
|
|
893
|
+
total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
|
|
894
|
+
if total_items > 1:
|
|
895
|
+
print(f"{BOLD}{BLUE}Aggregate Statistics ({total_items} prompts):{RESET}")
|
|
431
896
|
print(f"{BLUE}{'=' * 60}{RESET}")
|
|
432
897
|
|
|
433
898
|
for metric_name, stats in aggregates.items():
|
|
434
899
|
pass_color = GREEN if stats['pass_rate'] >= 80 else YELLOW if stats['pass_rate'] >= 60 else RED
|
|
435
900
|
prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
|
|
436
|
-
total_prompts = stats.get('total_prompts',
|
|
901
|
+
total_prompts = stats.get('total_prompts', total_items)
|
|
437
902
|
print(f"{BOLD}{CYAN}{metric_name}:{RESET} ({prompts_evaluated}/{total_prompts} prompts)")
|
|
438
903
|
print(f" Pass Rate: {pass_color}{stats['pass_rate']:.1f}%{RESET} ({stats['pass_count']}/{stats['total_evaluated']} passed)")
|
|
439
904
|
print(f" Avg Score: {MAGENTA}{stats['avg_score']:.2f}{RESET}")
|
|
@@ -447,30 +912,39 @@ def write_results_to_console(results, agent_name: Optional[str] = None,
|
|
|
447
912
|
print(f"{BOLD}{BLUE}Individual Results:{RESET}")
|
|
448
913
|
print(f"{BLUE}{'=' * 50}{RESET}")
|
|
449
914
|
for i, result in enumerate(results, 1):
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
print(f"{
|
|
473
|
-
|
|
915
|
+
if result.get("type") == "multi_turn":
|
|
916
|
+
thread_name = result.get("name", "Unnamed Thread")
|
|
917
|
+
summary = result.get("summary", {})
|
|
918
|
+
status = summary.get("overall_status", STATUS_UNKNOWN)
|
|
919
|
+
status_color = GREEN if status == STATUS_PASS else YELLOW if status == STATUS_PARTIAL else RED
|
|
920
|
+
|
|
921
|
+
print(f"{BOLD}{MAGENTA}Thread {i}: {thread_name}{RESET}")
|
|
922
|
+
for t_idx, turn in enumerate(result.get("turns", []), 1):
|
|
923
|
+
turn_status = turn.get("status", STATUS_UNKNOWN)
|
|
924
|
+
turn_color = GREEN if turn_status == STATUS_PASS else RED if turn_status in (STATUS_FAIL, STATUS_ERROR) else YELLOW
|
|
925
|
+
print(f"{BOLD}{turn_color}Turn {t_idx}:{RESET} [{turn_status}] {turn.get('prompt', '')}")
|
|
926
|
+
_print_evaluated_item(
|
|
927
|
+
response=turn.get("response", ""),
|
|
928
|
+
expected_response=turn.get("expected_response", ""),
|
|
929
|
+
evaluators_ran=turn.get("evaluators_ran", []),
|
|
930
|
+
item_results=turn.get("results", {}),
|
|
931
|
+
error=turn.get("error"),
|
|
932
|
+
)
|
|
933
|
+
print()
|
|
934
|
+
print(f"{BOLD}{MAGENTA}Thread {i} Summary:{RESET}")
|
|
935
|
+
print(f" Status: {status_color}{status.upper()}{RESET}")
|
|
936
|
+
print(f" Turns passed: {status_color}{summary.get('turns_passed', 0)}/{summary.get('turns_total', 0)}{RESET}")
|
|
937
|
+
print(f"{BLUE}{'-' * 30}{RESET}")
|
|
938
|
+
else:
|
|
939
|
+
print(f"{BOLD}{GREEN}Prompt {i}:{RESET} {result['prompt']}")
|
|
940
|
+
_print_evaluated_item(
|
|
941
|
+
response=result.get('response', ''),
|
|
942
|
+
expected_response=result.get('expected_response', ''),
|
|
943
|
+
evaluators_ran=result.get('evaluators_ran', []),
|
|
944
|
+
item_results=result.get('results', {}),
|
|
945
|
+
error=result.get('errorDetails'),
|
|
946
|
+
)
|
|
947
|
+
print(f"{BLUE}{'-' * 30}{RESET}")
|
|
474
948
|
|
|
475
949
|
def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
|
|
476
950
|
"""Extract an EvalScore object from a decorated metric dict.
|
|
@@ -487,8 +961,8 @@ def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
|
|
|
487
961
|
return None
|
|
488
962
|
|
|
489
963
|
result = data.get("result")
|
|
490
|
-
if result not in (
|
|
491
|
-
result =
|
|
964
|
+
if result not in (STATUS_PASS, STATUS_FAIL):
|
|
965
|
+
result = STATUS_PASS if score_val >= data.get("threshold", DEFAULT_THRESHOLD) else STATUS_FAIL
|
|
492
966
|
|
|
493
967
|
eval_score: Dict[str, Any] = {
|
|
494
968
|
"score": score_val,
|
|
@@ -501,55 +975,33 @@ def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
|
|
|
501
975
|
return eval_score
|
|
502
976
|
|
|
503
977
|
|
|
504
|
-
def
|
|
505
|
-
"""Convert
|
|
506
|
-
|
|
507
|
-
Internal format (from run_evaluations):
|
|
508
|
-
{prompt, response, expected_response, results: {Relevance: "JSON", ...},
|
|
509
|
-
evaluators_ran: [...], evaluators: {...}, evaluators_mode: "..."}
|
|
510
|
-
Schema EvalItem format:
|
|
511
|
-
{prompt, response, expected_response, scores: {relevance: EvalScore, ...},
|
|
512
|
-
evaluators: {...}, evaluators_mode: "..."}
|
|
513
|
-
"""
|
|
514
|
-
item: Dict[str, Any] = {
|
|
515
|
-
"prompt": result["prompt"],
|
|
516
|
-
"response": result["response"],
|
|
517
|
-
"expected_response": result["expected_response"],
|
|
518
|
-
}
|
|
519
|
-
|
|
520
|
-
# Preserve evaluator config in output
|
|
521
|
-
if "evaluators" in result:
|
|
522
|
-
item["evaluators"] = result["evaluators"]
|
|
523
|
-
if "evaluators_mode" in result:
|
|
524
|
-
item["evaluators_mode"] = result["evaluators_mode"]
|
|
978
|
+
def _convert_scores_to_schema(results_dict: Dict[str, Any]) -> Dict[str, Any]:
|
|
979
|
+
"""Convert raw evaluator results to schema-compliant score objects.
|
|
525
980
|
|
|
981
|
+
Evaluator results in results_dict are dicts (from _decorate_metric) or
|
|
982
|
+
None when skipped/crashed. None values are omitted from output.
|
|
983
|
+
"""
|
|
526
984
|
scores: Dict[str, Any] = {}
|
|
527
|
-
results_dict = result.get("results", {})
|
|
528
985
|
|
|
529
|
-
# EvalScore metrics (all share the same schema shape: {score, result, threshold})
|
|
530
986
|
for eval_key, schema_key in [
|
|
531
987
|
(RELEVANCE, "relevance"),
|
|
532
988
|
(COHERENCE, "coherence"),
|
|
533
989
|
(GROUNDEDNESS, "groundedness"),
|
|
534
990
|
(TOOL_CALL_ACCURACY, "toolCallAccuracy"),
|
|
535
991
|
]:
|
|
536
|
-
|
|
537
|
-
if
|
|
992
|
+
data = results_dict.get(eval_key)
|
|
993
|
+
if data is None:
|
|
538
994
|
continue
|
|
539
|
-
data = json.loads(raw) if isinstance(raw, str) else raw
|
|
540
995
|
eval_score = extract_eval_score(data, METRIC_IDS[eval_key])
|
|
541
996
|
if eval_score:
|
|
542
997
|
scores[schema_key] = eval_score
|
|
543
998
|
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
if raw_citations:
|
|
547
|
-
data = json.loads(raw_citations) if isinstance(raw_citations, str) else raw_citations
|
|
999
|
+
data = results_dict.get(CITATIONS)
|
|
1000
|
+
if data is not None:
|
|
548
1001
|
count = data.get("citations", 0)
|
|
549
1002
|
cit_result = data.get("result")
|
|
550
|
-
if cit_result not in (
|
|
551
|
-
cit_result =
|
|
552
|
-
|
|
1003
|
+
if cit_result not in (STATUS_PASS, STATUS_FAIL):
|
|
1004
|
+
cit_result = STATUS_PASS if count >= data.get("threshold", 1) else STATUS_FAIL
|
|
553
1005
|
citation_score: Dict[str, Any] = {
|
|
554
1006
|
"count": count,
|
|
555
1007
|
"result": cit_result,
|
|
@@ -559,34 +1011,92 @@ def convert_result_to_eval_item(result: Dict) -> Dict:
|
|
|
559
1011
|
citation_score["format"] = data["citation_format"]
|
|
560
1012
|
scores["citations"] = citation_score
|
|
561
1013
|
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
if raw_exact:
|
|
565
|
-
data = json.loads(raw_exact) if isinstance(raw_exact, str) else raw_exact
|
|
1014
|
+
data = results_dict.get(EXACT_MATCH)
|
|
1015
|
+
if data is not None:
|
|
566
1016
|
is_match = data.get("exact_match", 0.0) == 1.0
|
|
567
1017
|
scores["exactMatch"] = {
|
|
568
1018
|
"match": is_match,
|
|
569
|
-
"result": data.get("result",
|
|
1019
|
+
"result": data.get("result", STATUS_PASS if is_match else STATUS_FAIL),
|
|
570
1020
|
"reason": data.get("exact_match_reason", ""),
|
|
571
1021
|
}
|
|
572
1022
|
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
if raw_partial:
|
|
576
|
-
data = json.loads(raw_partial) if isinstance(raw_partial, str) else raw_partial
|
|
1023
|
+
data = results_dict.get(PARTIAL_MATCH)
|
|
1024
|
+
if data is not None:
|
|
577
1025
|
scores["partialMatch"] = {
|
|
578
1026
|
"score": data.get("partial_match", 0.0),
|
|
579
|
-
"result": data.get("result",
|
|
1027
|
+
"result": data.get("result", STATUS_FAIL),
|
|
580
1028
|
"threshold": data.get("threshold", 0.5),
|
|
581
1029
|
"reason": data.get("partial_match_reason", ""),
|
|
582
1030
|
}
|
|
583
1031
|
|
|
1032
|
+
return scores
|
|
1033
|
+
|
|
1034
|
+
|
|
1035
|
+
def convert_result_to_eval_item(result: Dict) -> Dict:
    """Build the output EvalItem dict for a single-turn result.

    ``prompt``, ``response`` and ``expected_response`` are required keys of
    ``result`` (a missing one raises ``KeyError``). ``evaluators`` and
    ``evaluators_mode`` are copied only when present, and ``scores`` is added
    only when the converted scores are non-empty.
    """
    eval_item: Dict[str, Any] = {
        required: result[required]
        for required in ("prompt", "response", "expected_response")
    }

    # Carry the evaluator configuration through to the output when supplied.
    for optional in ("evaluators", "evaluators_mode"):
        if optional in result:
            eval_item[optional] = result[optional]

    converted = _convert_scores_to_schema(result.get("results", {}))
    if converted:
        eval_item["scores"] = converted

    return eval_item
|
|
588
1053
|
|
|
589
1054
|
|
|
1055
|
+
def convert_thread_result_to_output(thread_result: Dict) -> Dict:
    """Build the output dict for a multi-turn thread result.

    Every turn keeps its ``prompt`` (empty string when absent) plus whichever
    of the optional per-turn fields it carries; ``scores`` is added when the
    turn's raw results convert to a non-empty mapping. At thread level,
    ``name``, ``description``, ``conversation_id`` and ``summary`` are emitted
    only when truthy, while ``turns`` is always present.
    """
    # Optional per-turn fields, in the order they appear in the output.
    passthrough = (
        "expected_response", "response", "status",
        "error", "evaluators", "evaluators_mode",
    )

    def _render_turn(raw_turn: Dict[str, Any]) -> Dict[str, Any]:
        rendered: Dict[str, Any] = {"prompt": raw_turn.get("prompt", "")}
        rendered.update({key: raw_turn[key] for key in passthrough if key in raw_turn})
        turn_scores = _convert_scores_to_schema(raw_turn.get("results", {}))
        if turn_scores:
            rendered["scores"] = turn_scores
        return rendered

    rendered_turns = [_render_turn(t) for t in thread_result.get("turns", [])]

    document: Dict[str, Any] = {}
    for header_key in ("name", "description", "conversation_id"):
        header_value = thread_result.get(header_key)
        if header_value:
            document[header_key] = header_value

    document["turns"] = rendered_turns

    summary = thread_result.get("summary")
    if summary:
        document["summary"] = summary

    return document
|
|
1091
|
+
|
|
1092
|
+
|
|
1093
|
+
def convert_result_to_output_item(result: Dict) -> Dict:
    """Convert an internal result dict to its output form.

    Results whose ``type`` is ``"multi_turn"`` go through the thread
    converter; everything else is treated as a single-turn eval item.
    """
    is_thread = result.get("type") == "multi_turn"
    converter = convert_thread_result_to_output if is_thread else convert_result_to_eval_item
    return converter(result)
|
|
1098
|
+
|
|
1099
|
+
|
|
590
1100
|
def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None,
|
|
591
1101
|
default_evaluators: Optional[Dict[str, Any]] = None,
|
|
592
1102
|
agent_name: Optional[str] = None,
|
|
@@ -602,7 +1112,7 @@ def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optio
|
|
|
602
1112
|
except Exception:
|
|
603
1113
|
current_version = "1.0.0"
|
|
604
1114
|
|
|
605
|
-
items = [
|
|
1115
|
+
items = [convert_result_to_output_item(r) for r in results]
|
|
606
1116
|
|
|
607
1117
|
metadata: Dict[str, Any] = {
|
|
608
1118
|
"evaluatedAt": datetime.now(timezone.utc).isoformat(),
|
|
@@ -631,6 +1141,18 @@ def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optio
|
|
|
631
1141
|
emit_structured_log("error", f"Error writing to JSON file: {e}", operation=Operation.WRITE_OUTPUT)
|
|
632
1142
|
sys.exit(1)
|
|
633
1143
|
|
|
1144
|
+
def _results_to_csv_json(results_dict: Dict) -> str:
|
|
1145
|
+
"""Serialize evaluator results dict to a CSV-safe JSON string.
|
|
1146
|
+
|
|
1147
|
+
Skips None (crashed/skipped evaluators). Results are dicts produced
|
|
1148
|
+
by _decorate_metric.
|
|
1149
|
+
"""
|
|
1150
|
+
if not results_dict:
|
|
1151
|
+
return ""
|
|
1152
|
+
non_null = {k: v for k, v in results_dict.items() if v is not None}
|
|
1153
|
+
return json.dumps(non_null) if non_null else ""
|
|
1154
|
+
|
|
1155
|
+
|
|
634
1156
|
def write_results_to_csv(results: List[Dict], output_file: str,
|
|
635
1157
|
agent_name: Optional[str] = None, agent_id: Optional[str] = None,
|
|
636
1158
|
cli_version: Optional[str] = None):
|
|
@@ -638,7 +1160,6 @@ def write_results_to_csv(results: List[Dict], output_file: str,
|
|
|
638
1160
|
try:
|
|
639
1161
|
with open(output_file, 'w', newline='', encoding='utf-8') as f:
|
|
640
1162
|
if results:
|
|
641
|
-
# Write metadata header
|
|
642
1163
|
metadata_parts = []
|
|
643
1164
|
if agent_name:
|
|
644
1165
|
metadata_parts.append(f"Agent Name: {agent_name}")
|
|
@@ -649,30 +1170,87 @@ def write_results_to_csv(results: List[Dict], output_file: str,
|
|
|
649
1170
|
if metadata_parts:
|
|
650
1171
|
f.write(f"# {' | '.join(metadata_parts)}\n")
|
|
651
1172
|
|
|
652
|
-
|
|
653
|
-
if
|
|
654
|
-
|
|
655
|
-
if
|
|
1173
|
+
aggregates = calculate_aggregate_statistics(results)
|
|
1174
|
+
if aggregates:
|
|
1175
|
+
total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
|
|
1176
|
+
if total_items > 1:
|
|
656
1177
|
f.write("# AGGREGATE STATISTICS\n")
|
|
657
1178
|
f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
|
|
658
1179
|
for metric_name, stats in aggregates.items():
|
|
659
1180
|
threshold_str = str(stats.get('threshold', 'N/A'))
|
|
660
1181
|
prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
|
|
661
|
-
total_prompts = stats.get('total_prompts',
|
|
1182
|
+
total_prompts = stats.get('total_prompts', total_items)
|
|
662
1183
|
f.write(f"{metric_name},{prompts_evaluated},{total_prompts},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
|
|
663
1184
|
f.write("\n# INDIVIDUAL RESULTS\n")
|
|
664
1185
|
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
1186
|
+
single_turn_rows = []
|
|
1187
|
+
multi_turn_rows = []
|
|
1188
|
+
for result in results:
|
|
1189
|
+
if result.get("type") == "multi_turn":
|
|
1190
|
+
thread_name = result.get("name", "")
|
|
1191
|
+
for turn_idx, turn in enumerate(result.get("turns", [])):
|
|
1192
|
+
multi_turn_rows.append({
|
|
1193
|
+
"thread_name": thread_name,
|
|
1194
|
+
"turn_index": turn_idx + 1,
|
|
1195
|
+
"prompt": turn.get("prompt", ""),
|
|
1196
|
+
"response": turn.get("response", ""),
|
|
1197
|
+
"expected_response": turn.get("expected_response", ""),
|
|
1198
|
+
"status": turn.get("status", ""),
|
|
1199
|
+
"error": turn.get("error", ""),
|
|
1200
|
+
"scores": _results_to_csv_json(turn.get("results", {})),
|
|
1201
|
+
})
|
|
1202
|
+
summary = result.get("summary", {})
|
|
1203
|
+
multi_turn_rows.append({
|
|
1204
|
+
"thread_name": thread_name,
|
|
1205
|
+
"turn_index": "summary",
|
|
1206
|
+
"prompt": "",
|
|
1207
|
+
"response": "",
|
|
1208
|
+
"expected_response": "",
|
|
1209
|
+
"status": summary.get("overall_status", ""),
|
|
1210
|
+
"scores": f"{summary.get('turns_passed', 0)}/{summary.get('turns_total', 0)} turns passed",
|
|
1211
|
+
})
|
|
1212
|
+
else:
|
|
1213
|
+
exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode', '_enhanced_response', 'results'}
|
|
1214
|
+
row = {k: v for k, v in result.items() if k not in exclude_keys}
|
|
1215
|
+
if "results" in result:
|
|
1216
|
+
row["scores"] = _results_to_csv_json(result["results"])
|
|
1217
|
+
single_turn_rows.append(row)
|
|
1218
|
+
|
|
1219
|
+
if single_turn_rows:
|
|
1220
|
+
if multi_turn_rows:
|
|
1221
|
+
f.write("# SINGLE-TURN RESULTS\n")
|
|
1222
|
+
fieldnames = list(single_turn_rows[0].keys())
|
|
1223
|
+
for row in single_turn_rows:
|
|
1224
|
+
for k in row:
|
|
1225
|
+
if k not in fieldnames:
|
|
1226
|
+
fieldnames.append(k)
|
|
1227
|
+
writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
|
|
1228
|
+
writer.writeheader()
|
|
1229
|
+
writer.writerows(single_turn_rows)
|
|
1230
|
+
|
|
1231
|
+
if multi_turn_rows:
|
|
1232
|
+
if single_turn_rows:
|
|
1233
|
+
f.write("\n")
|
|
1234
|
+
f.write("# MULTI-TURN RESULTS\n")
|
|
1235
|
+
fieldnames = ["thread_name", "turn_index", "prompt", "response", "expected_response", "status", "error", "scores"]
|
|
1236
|
+
writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
|
|
1237
|
+
writer.writeheader()
|
|
1238
|
+
writer.writerows(multi_turn_rows)
|
|
671
1239
|
emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
|
|
672
1240
|
except Exception as e:
|
|
673
1241
|
emit_structured_log("error", f"Error writing to CSV file: {e}", operation=Operation.WRITE_OUTPUT)
|
|
674
1242
|
sys.exit(1)
|
|
675
1243
|
|
|
1244
|
+
def normalize_agent_id(agent_id):
    """Return agent_id with '.declarativeAgent' appended when it has no dot.

    A None/empty value, or one that already contains a '.', is returned
    unchanged.
    """
    if agent_id and '.' not in agent_id:
        return f"{agent_id}.declarativeAgent"
    return agent_id
|
|
1252
|
+
|
|
1253
|
+
|
|
676
1254
|
def parse_arguments():
|
|
677
1255
|
"""Parse command line arguments."""
|
|
678
1256
|
parser = argparse.ArgumentParser(
|
|
@@ -763,8 +1341,29 @@ Examples:
|
|
|
763
1341
|
action='store_true',
|
|
764
1342
|
help='Sign out and clear cached authentication tokens'
|
|
765
1343
|
)
|
|
1344
|
+
|
|
1345
|
+
parser.add_argument(
|
|
1346
|
+
'--concurrency',
|
|
1347
|
+
type=int,
|
|
1348
|
+
default=5,
|
|
1349
|
+
help='Number of parallel workers for prompt processing (1-5, default: 5)'
|
|
1350
|
+
)
|
|
766
1351
|
|
|
767
|
-
|
|
1352
|
+
args = parser.parse_args()
|
|
1353
|
+
|
|
1354
|
+
args.m365_agent_id = normalize_agent_id(args.m365_agent_id)
|
|
1355
|
+
|
|
1356
|
+
if args.concurrency < 1:
|
|
1357
|
+
parser.error('--concurrency must be an integer >= 1.')
|
|
1358
|
+
if args.concurrency > MAX_CONCURRENCY:
|
|
1359
|
+
emit_structured_log(
|
|
1360
|
+
"warning",
|
|
1361
|
+
f"--concurrency {args.concurrency} exceeds max {MAX_CONCURRENCY}; clamping to {MAX_CONCURRENCY}.",
|
|
1362
|
+
operation=Operation.SETUP,
|
|
1363
|
+
)
|
|
1364
|
+
args.concurrency = MAX_CONCURRENCY
|
|
1365
|
+
|
|
1366
|
+
return args
|
|
768
1367
|
|
|
769
1368
|
def validate_environment() -> CallPath:
|
|
770
1369
|
"""Validate required environment variables."""
|
|
@@ -773,19 +1372,29 @@ def validate_environment() -> CallPath:
|
|
|
773
1372
|
"AZURE_AI_API_KEY",
|
|
774
1373
|
"AZURE_AI_API_VERSION",
|
|
775
1374
|
"AZURE_AI_MODEL_NAME",
|
|
776
|
-
# Chat API specific
|
|
777
|
-
"COPILOT_API_ENDPOINT",
|
|
778
|
-
"X_SCENARIO_HEADER"
|
|
779
1375
|
]
|
|
780
1376
|
|
|
781
1377
|
if os.environ.get("COPILOT_API_ACCESS_TOKEN"):
|
|
782
1378
|
call_path = CallPath.ACCESS_TOKEN
|
|
783
|
-
required_env_vars.
|
|
1379
|
+
required_env_vars.extend([
|
|
1380
|
+
"COPILOT_API_ACCESS_TOKEN",
|
|
1381
|
+
"COPILOT_API_ENDPOINT",
|
|
1382
|
+
"X_SCENARIO_HEADER",
|
|
1383
|
+
])
|
|
1384
|
+
elif os.environ.get("WORK_IQ_A2A_ENDPOINT"):
|
|
1385
|
+
call_path = CallPath.A2A
|
|
1386
|
+
required_env_vars.extend([
|
|
1387
|
+
"WORK_IQ_A2A_ENDPOINT",
|
|
1388
|
+
"WORK_IQ_A2A_CLIENT_ID",
|
|
1389
|
+
"TENANT_ID",
|
|
1390
|
+
])
|
|
784
1391
|
else:
|
|
785
1392
|
call_path = CallPath.COPILOT_AUTH
|
|
786
1393
|
required_env_vars.extend([
|
|
1394
|
+
"COPILOT_API_ENDPOINT",
|
|
1395
|
+
"X_SCENARIO_HEADER",
|
|
787
1396
|
"M365_EVAL_CLIENT_ID",
|
|
788
|
-
"TENANT_ID"
|
|
1397
|
+
"TENANT_ID",
|
|
789
1398
|
])
|
|
790
1399
|
|
|
791
1400
|
missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
|
|
@@ -867,46 +1476,6 @@ def get_prompt_datasets(args) -> Tuple[List[Dict], Optional[Dict]]:
|
|
|
867
1476
|
]
|
|
868
1477
|
return eval_items, None
|
|
869
1478
|
|
|
870
|
-
def fetch_available_agents(copilot_api_endpoint: str, access_token: str, user_oid: str) -> List[Dict[str, Any]]:
|
|
871
|
-
"""
|
|
872
|
-
Fetch available agents for the user from the Copilot API.
|
|
873
|
-
|
|
874
|
-
Args:
|
|
875
|
-
access_token: Bearer token for API authentication
|
|
876
|
-
user_oid: User object ID for agent filtering
|
|
877
|
-
|
|
878
|
-
Returns:
|
|
879
|
-
List of agent dictionaries.
|
|
880
|
-
"""
|
|
881
|
-
request_headers = {
|
|
882
|
-
"Content-Type": "application/json",
|
|
883
|
-
"X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
|
|
884
|
-
"Authorization": f"Bearer {access_token}"
|
|
885
|
-
}
|
|
886
|
-
|
|
887
|
-
try:
|
|
888
|
-
# Build the query parameter with participant info
|
|
889
|
-
request_data = json.dumps({"participant": {"id": user_oid}})
|
|
890
|
-
query_param = urllib.parse.quote(request_data)
|
|
891
|
-
|
|
892
|
-
# Try to fetch agents from /GetGptList endpoint
|
|
893
|
-
req = urllib.request.Request(
|
|
894
|
-
f"{copilot_api_endpoint}/GetGptList?request={query_param}",
|
|
895
|
-
headers=request_headers,
|
|
896
|
-
method="GET"
|
|
897
|
-
)
|
|
898
|
-
with urllib.request.urlopen(req, timeout=120) as resp:
|
|
899
|
-
data = json.loads(resp.read().decode("utf-8"))
|
|
900
|
-
agents = data.get("gptList", [])
|
|
901
|
-
return agents
|
|
902
|
-
except urllib.error.HTTPError as e:
|
|
903
|
-
# If endpoint doesn't exist or returns error, return empty list
|
|
904
|
-
emit_structured_log("warning", f"Unable to fetch agents list (HTTP {e.code}).", operation=Operation.FETCH_AGENTS)
|
|
905
|
-
return []
|
|
906
|
-
except Exception as e:
|
|
907
|
-
emit_structured_log("warning", f"Error fetching agents: {e}", operation=Operation.FETCH_AGENTS)
|
|
908
|
-
return []
|
|
909
|
-
|
|
910
1479
|
def select_agent_interactively(agents: List[Dict[str, Any]]) -> Tuple[Optional[str], Optional[str]]:
|
|
911
1480
|
"""
|
|
912
1481
|
Display an interactive agent selector using questionary.
|
|
@@ -946,127 +1515,6 @@ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Tuple[Optional[s
|
|
|
946
1515
|
|
|
947
1516
|
return selected_agent, id_to_name.get(selected_agent) if selected_agent else None
|
|
948
1517
|
|
|
949
|
-
@functools.lru_cache(maxsize=1)
|
|
950
|
-
def _get_iana_timezone_name() -> str:
|
|
951
|
-
"""Get the IANA timezone name from the system using tzlocal.
|
|
952
|
-
|
|
953
|
-
Tries get_localzone_name() first; falls back to str(get_localzone()) when the
|
|
954
|
-
former raises (e.g. no zone configured on some Unix systems). Result is cached
|
|
955
|
-
after the first call so tzlocal is only invoked once per session.
|
|
956
|
-
"""
|
|
957
|
-
try:
|
|
958
|
-
return tzlocal.get_localzone_name()
|
|
959
|
-
except Exception:
|
|
960
|
-
return str(tzlocal.get_localzone())
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
@functools.lru_cache(maxsize=1)
|
|
964
|
-
def _get_location_info() -> Dict[str, Any]:
|
|
965
|
-
"""Return a locationInfo dict containing the local UTC offset and IANA timezone name.
|
|
966
|
-
|
|
967
|
-
Result is cached after the first call so the computation runs only once per session.
|
|
968
|
-
"""
|
|
969
|
-
now = datetime.now().astimezone()
|
|
970
|
-
utc_offset = now.utcoffset()
|
|
971
|
-
offset_hours = int(utc_offset.total_seconds() // 3600) if utc_offset is not None else 0
|
|
972
|
-
return {
|
|
973
|
-
"timeZoneOffset": offset_hours,
|
|
974
|
-
"timeZone": _get_iana_timezone_name(),
|
|
975
|
-
}
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:
|
|
979
|
-
message = {
|
|
980
|
-
"message": {
|
|
981
|
-
"text": prompt,
|
|
982
|
-
"author": "user",
|
|
983
|
-
"messageType": "chat",
|
|
984
|
-
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
985
|
-
"locationInfo": _get_location_info(),
|
|
986
|
-
"from": {
|
|
987
|
-
"id": user_oid,
|
|
988
|
-
}
|
|
989
|
-
},
|
|
990
|
-
"verbosity": "verbose", # To enable detailed telemetry in response (to extract tool usage, etc.)
|
|
991
|
-
}
|
|
992
|
-
|
|
993
|
-
if agent_id:
|
|
994
|
-
message["gpts"] = [
|
|
995
|
-
{
|
|
996
|
-
"id": agent_id.strip(),
|
|
997
|
-
"source": "MOS3"
|
|
998
|
-
}
|
|
999
|
-
]
|
|
1000
|
-
message["optionsSets"] = [
|
|
1001
|
-
"disable_action_confirmation" # Disable 3P action confirmation prompts for agents while scraping
|
|
1002
|
-
]
|
|
1003
|
-
|
|
1004
|
-
return json.dumps(message).encode("utf-8")
|
|
1005
|
-
|
|
1006
|
-
def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str, access_token: str, user_oid: str, args) -> List[Dict[str, Any]]:
|
|
1007
|
-
""" Send prompts to the chat API and return enhanced responses. """
|
|
1008
|
-
|
|
1009
|
-
request_headers = {
|
|
1010
|
-
"Content-Type": "application/json",
|
|
1011
|
-
"X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
|
|
1012
|
-
"Authorization": f"Bearer {access_token}"
|
|
1013
|
-
}
|
|
1014
|
-
raw_responses: List[Tuple[str, str]] = []
|
|
1015
|
-
for i, prompt in enumerate(prompts, 1):
|
|
1016
|
-
if getattr(args, "effective_log_level", "info") in ("info", "debug"):
|
|
1017
|
-
emit_structured_log("info", f"Processing prompt {i}/{len(prompts)}.", operation=Operation.SEND_PROMPT)
|
|
1018
|
-
|
|
1019
|
-
# Build the payload
|
|
1020
|
-
payload = build_chat_payload(prompt, user_oid, args.m365_agent_id)
|
|
1021
|
-
if getattr(args, "effective_log_level", "info") == "debug":
|
|
1022
|
-
emit_structured_log("debug", f"[Sydney] Sending payload: {payload.decode('utf-8')}", operation=Operation.SEND_PROMPT)
|
|
1023
|
-
|
|
1024
|
-
# Send the request to /chat
|
|
1025
|
-
req = urllib.request.Request(f"{copilot_api_endpoint}/chat", data=payload, headers=request_headers, method="POST")
|
|
1026
|
-
try:
|
|
1027
|
-
with urllib.request.urlopen(req, timeout=120) as resp:
|
|
1028
|
-
raw = resp.read().decode("utf-8", errors="replace")
|
|
1029
|
-
except urllib.error.HTTPError as e:
|
|
1030
|
-
error_body = None
|
|
1031
|
-
try:
|
|
1032
|
-
error_body = e.read().decode("utf-8", errors="replace")
|
|
1033
|
-
except Exception:
|
|
1034
|
-
pass
|
|
1035
|
-
msg = f"Chat API request failed (HTTP {e.code} {e.reason})."
|
|
1036
|
-
if error_body:
|
|
1037
|
-
msg += f" Body: {error_body[:500]}"
|
|
1038
|
-
raise RuntimeError(msg) from e
|
|
1039
|
-
except urllib.error.URLError as e:
|
|
1040
|
-
raise RuntimeError(f"Chat API connection error: {getattr(e, 'reason', str(e))}") from e
|
|
1041
|
-
|
|
1042
|
-
if getattr(args, "effective_log_level", "info") == "debug":
|
|
1043
|
-
emit_structured_log("debug", f"[Sydney] Raw response: {raw}", operation=Operation.SEND_PROMPT)
|
|
1044
|
-
|
|
1045
|
-
# Store raw response for enhancement
|
|
1046
|
-
raw_responses.append((prompt, raw.strip()))
|
|
1047
|
-
|
|
1048
|
-
# Extract enhanced responses using the new extractor
|
|
1049
|
-
enhanced_responses = extract_enhanced_responses(raw_responses, log_level=getattr(args, "effective_log_level", "info"))
|
|
1050
|
-
|
|
1051
|
-
if getattr(args, "effective_log_level", "info") == "debug":
|
|
1052
|
-
for idx, enhanced in enumerate(enhanced_responses, 1):
|
|
1053
|
-
metadata = enhanced.get("metadata", {})
|
|
1054
|
-
context = {
|
|
1055
|
-
"request-id": metadata.get("request_id"),
|
|
1056
|
-
"conversation-id": metadata.get("conversation_id"),
|
|
1057
|
-
"message-id": metadata.get("message_id"),
|
|
1058
|
-
"operation": Operation.SEND_PROMPT,
|
|
1059
|
-
}
|
|
1060
|
-
entry = format_structured_log_entry(
|
|
1061
|
-
level="debug",
|
|
1062
|
-
message=f"Response IDs for prompt {idx}/{len(enhanced_responses)}.",
|
|
1063
|
-
logger_name=CLI_LOGGER_NAME,
|
|
1064
|
-
run_context=context,
|
|
1065
|
-
)
|
|
1066
|
-
DIAGNOSTIC_RECORDS.append(entry)
|
|
1067
|
-
CLI_LOGGER.log(logging.DEBUG, render_diagnostic(entry))
|
|
1068
|
-
|
|
1069
|
-
return enhanced_responses
|
|
1070
1518
|
|
|
1071
1519
|
def output_results(results: List[Dict], args, default_evaluators: Optional[Dict[str, Any]] = None,
|
|
1072
1520
|
agent_name: Optional[str] = None, cli_version: Optional[str] = None):
|
|
@@ -1119,76 +1567,142 @@ def main():
|
|
|
1119
1567
|
|
|
1120
1568
|
# Validate environment variables required for evaluation
|
|
1121
1569
|
call_path = validate_environment()
|
|
1122
|
-
copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
|
|
1123
|
-
validate_endpoint_url(copilot_api_endpoint, ALLOWED_ENDPOINTS)
|
|
1124
1570
|
|
|
1125
1571
|
user_oid = ""
|
|
1126
1572
|
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
try:
|
|
1143
|
-
auth_handler.clear_cache()
|
|
1144
|
-
except Exception as e:
|
|
1145
|
-
emit_structured_log("error", f"Error during signout: {e}", operation=Operation.AUTHENTICATE)
|
|
1146
|
-
sys.exit(1)
|
|
1147
|
-
sys.exit(0)
|
|
1148
|
-
|
|
1149
|
-
# Authenticate before loading prompts
|
|
1150
|
-
try:
|
|
1151
|
-
auth_result = auth_handler.acquire_token_interactive() or {}
|
|
1152
|
-
access_token = auth_result.get("access_token") or ""
|
|
1153
|
-
if not access_token:
|
|
1154
|
-
raise RuntimeError("Failed to acquire access token from authentication result")
|
|
1155
|
-
|
|
1156
|
-
id_token_claims = auth_result.get("id_token_claims")
|
|
1157
|
-
if not isinstance(id_token_claims, dict):
|
|
1573
|
+
match call_path:
|
|
1574
|
+
case CallPath.ACCESS_TOKEN:
|
|
1575
|
+
access_token = os.environ["COPILOT_API_ACCESS_TOKEN"]
|
|
1576
|
+
user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)
|
|
1577
|
+
copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
|
|
1578
|
+
validate_endpoint_url(copilot_api_endpoint, ALLOWED_ENDPOINTS)
|
|
1579
|
+
agent_client = SydneyClient(
|
|
1580
|
+
copilot_api_endpoint=copilot_api_endpoint,
|
|
1581
|
+
access_token=access_token,
|
|
1582
|
+
user_oid=user_oid,
|
|
1583
|
+
logger=CLI_LOGGER,
|
|
1584
|
+
diagnostic_records=DIAGNOSTIC_RECORDS,
|
|
1585
|
+
)
|
|
1586
|
+
|
|
1587
|
+
case CallPath.A2A:
|
|
1158
1588
|
emit_structured_log(
|
|
1159
|
-
"warning",
|
|
1160
|
-
|
|
1589
|
+
"warning",
|
|
1590
|
+
"The A2A endpoint is experimental and may change without notice.",
|
|
1591
|
+
operation=Operation.SETUP,
|
|
1161
1592
|
)
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1593
|
+
a2a_endpoint = os.environ["WORK_IQ_A2A_ENDPOINT"]
|
|
1594
|
+
validate_endpoint_url(a2a_endpoint, ALLOWED_ENDPOINTS)
|
|
1595
|
+
|
|
1596
|
+
a2a_scopes_str = os.environ.get("WORK_IQ_A2A_SCOPES", "")
|
|
1597
|
+
a2a_auth_handler = AuthHandler(
|
|
1598
|
+
client_id=os.environ["WORK_IQ_A2A_CLIENT_ID"],
|
|
1599
|
+
tenant_id=os.environ["TENANT_ID"],
|
|
1600
|
+
scopes_str=a2a_scopes_str,
|
|
1601
|
+
)
|
|
1602
|
+
try:
|
|
1603
|
+
a2a_auth_result = a2a_auth_handler.acquire_token_interactive() or {}
|
|
1604
|
+
a2a_access_token = a2a_auth_result.get("access_token") or ""
|
|
1605
|
+
if not a2a_access_token:
|
|
1606
|
+
raise RuntimeError("Failed to acquire A2A access token")
|
|
1607
|
+
except Exception as e:
|
|
1608
|
+
emit_structured_log(
|
|
1609
|
+
"error",
|
|
1610
|
+
f"Error during A2A authentication: {e}",
|
|
1611
|
+
operation=Operation.AUTHENTICATE,
|
|
1612
|
+
)
|
|
1613
|
+
if effective_log_level == "debug":
|
|
1614
|
+
import traceback
|
|
1615
|
+
traceback.print_exc()
|
|
1616
|
+
sys.exit(1)
|
|
1617
|
+
try:
|
|
1618
|
+
agent_client = A2AClient(
|
|
1619
|
+
a2a_endpoint=a2a_endpoint,
|
|
1620
|
+
access_token=a2a_access_token,
|
|
1621
|
+
logger=CLI_LOGGER,
|
|
1622
|
+
diagnostic_records=DIAGNOSTIC_RECORDS,
|
|
1623
|
+
)
|
|
1624
|
+
except Exception as e:
|
|
1625
|
+
emit_structured_log(
|
|
1626
|
+
"error",
|
|
1627
|
+
f"Failed to initialize A2A client: {e}",
|
|
1628
|
+
operation=Operation.SETUP,
|
|
1629
|
+
)
|
|
1630
|
+
sys.exit(1)
|
|
1171
1631
|
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1632
|
+
case CallPath.COPILOT_AUTH:
|
|
1633
|
+
copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
|
|
1634
|
+
validate_endpoint_url(copilot_api_endpoint, ALLOWED_ENDPOINTS)
|
|
1635
|
+
auth_handler = AuthHandler(
|
|
1636
|
+
client_id=os.environ["M365_EVAL_CLIENT_ID"],
|
|
1637
|
+
tenant_id=os.environ["TENANT_ID"],
|
|
1638
|
+
scopes_str=os.environ.get("COPILOT_SCOPES", ""),
|
|
1639
|
+
)
|
|
1640
|
+
|
|
1641
|
+
# Signout user
|
|
1642
|
+
if args.signout:
|
|
1643
|
+
try:
|
|
1644
|
+
auth_handler.clear_cache()
|
|
1645
|
+
except Exception as e:
|
|
1646
|
+
emit_structured_log("error", f"Error during signout: {e}", operation=Operation.AUTHENTICATE)
|
|
1647
|
+
sys.exit(1)
|
|
1648
|
+
sys.exit(0)
|
|
1649
|
+
|
|
1650
|
+
try:
|
|
1651
|
+
auth_result = auth_handler.acquire_token_interactive() or {}
|
|
1652
|
+
access_token = auth_result.get("access_token") or ""
|
|
1653
|
+
if not access_token:
|
|
1654
|
+
raise RuntimeError("Failed to acquire access token from authentication result")
|
|
1655
|
+
|
|
1656
|
+
id_token_claims = auth_result.get("id_token_claims")
|
|
1657
|
+
if not isinstance(id_token_claims, dict):
|
|
1658
|
+
emit_structured_log(
|
|
1659
|
+
"warning", "id_token_claims is missing or invalid in authentication result",
|
|
1660
|
+
operation=Operation.AUTHENTICATE,
|
|
1661
|
+
)
|
|
1662
|
+
else:
|
|
1663
|
+
user_oid = id_token_claims.get("oid") or ""
|
|
1664
|
+
|
|
1665
|
+
if not user_oid:
|
|
1666
|
+
user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)
|
|
1667
|
+
|
|
1668
|
+
except Exception as e:
|
|
1669
|
+
emit_structured_log("error", f"Error during authentication: {e}", operation=Operation.AUTHENTICATE)
|
|
1670
|
+
if effective_log_level == "debug":
|
|
1671
|
+
import traceback
|
|
1672
|
+
traceback.print_exc()
|
|
1673
|
+
sys.exit(1)
|
|
1674
|
+
|
|
1675
|
+
agent_client = SydneyClient(
|
|
1676
|
+
copilot_api_endpoint=copilot_api_endpoint,
|
|
1677
|
+
access_token=access_token,
|
|
1678
|
+
user_oid=user_oid,
|
|
1679
|
+
logger=CLI_LOGGER,
|
|
1680
|
+
diagnostic_records=DIAGNOSTIC_RECORDS,
|
|
1681
|
+
)
|
|
1175
1682
|
|
|
1176
1683
|
# 1. Load evaluation datasets
|
|
1177
1684
|
eval_items, file_default_evaluators = get_prompt_datasets(args)
|
|
1178
1685
|
default_evaluators = resolve_default_evaluators(file_default_evaluators)
|
|
1179
|
-
prompts = [eval_item.get("prompt", "") for eval_item in eval_items]
|
|
1180
1686
|
|
|
1181
1687
|
if effective_log_level in ("info", "debug"):
|
|
1182
|
-
|
|
1688
|
+
multi_turn_count = sum(1 for item in eval_items if "turns" in item)
|
|
1689
|
+
single_turn_count = len(eval_items) - multi_turn_count
|
|
1690
|
+
emit_structured_log(
|
|
1691
|
+
"info",
|
|
1692
|
+
f"Running evaluation on {len(eval_items)} item(s) "
|
|
1693
|
+
f"({single_turn_count} single-turn, {multi_turn_count} multi-turn).",
|
|
1694
|
+
operation=Operation.SETUP,
|
|
1695
|
+
)
|
|
1183
1696
|
|
|
1184
1697
|
agent_name = None
|
|
1185
1698
|
try:
|
|
1186
|
-
#
|
|
1699
|
+
# 2. Agent selection - when no agent ID is provided, discover agents
|
|
1700
|
+
# via the active client (A2A or REST) and prompt interactively.
|
|
1187
1701
|
if not args.m365_agent_id:
|
|
1188
1702
|
if effective_log_level in ("info", "debug"):
|
|
1189
1703
|
emit_structured_log("info", "No agent ID provided. Fetching available agents.", operation=Operation.FETCH_AGENTS)
|
|
1190
|
-
|
|
1191
|
-
available_agents = fetch_available_agents(
|
|
1704
|
+
|
|
1705
|
+
available_agents = agent_client.fetch_available_agents()
|
|
1192
1706
|
if not available_agents:
|
|
1193
1707
|
emit_structured_log(
|
|
1194
1708
|
"error",
|
|
@@ -1210,30 +1724,46 @@ def main():
|
|
|
1210
1724
|
operation=Operation.FETCH_AGENTS,
|
|
1211
1725
|
)
|
|
1212
1726
|
sys.exit(1)
|
|
1213
|
-
|
|
1214
|
-
# 4. Send prompts to chat API
|
|
1215
|
-
responses = send_prompt_to_agent_in_sydney(prompts, copilot_api_endpoint, access_token, user_oid, args)
|
|
1216
1727
|
except Exception as e:
|
|
1217
|
-
emit_structured_log("error", f"Error
|
|
1728
|
+
emit_structured_log("error", f"Error during agent discovery: {e}", operation=Operation.FETCH_AGENTS)
|
|
1218
1729
|
if effective_log_level == "debug":
|
|
1219
1730
|
import traceback
|
|
1220
1731
|
traceback.print_exc()
|
|
1221
1732
|
sys.exit(1)
|
|
1222
1733
|
|
|
1734
|
+
# Pre-resolve agent endpoint (A2A agent card lookup; no-op for REST)
|
|
1735
|
+
if args.m365_agent_id:
|
|
1736
|
+
agent_client.resolve_agent(args.m365_agent_id)
|
|
1737
|
+
|
|
1738
|
+
# 3. Build pipeline config and run evaluation pipeline
|
|
1739
|
+
model_config = AzureOpenAIModelConfiguration(
|
|
1740
|
+
azure_endpoint=os.environ.get("AZURE_AI_OPENAI_ENDPOINT"),
|
|
1741
|
+
api_key=os.environ.get("AZURE_AI_API_KEY"),
|
|
1742
|
+
api_version=os.environ.get("AZURE_AI_API_VERSION"),
|
|
1743
|
+
azure_deployment=os.environ.get("AZURE_AI_MODEL_NAME"),
|
|
1744
|
+
)
|
|
1745
|
+
has_azure_openai = bool(
|
|
1746
|
+
os.environ.get("AZURE_AI_OPENAI_ENDPOINT")
|
|
1747
|
+
and os.environ.get("AZURE_AI_API_KEY")
|
|
1748
|
+
)
|
|
1749
|
+
|
|
1750
|
+
pipeline = PipelineConfig(
|
|
1751
|
+
agent_client=agent_client,
|
|
1752
|
+
model_config=model_config,
|
|
1753
|
+
has_azure_openai=has_azure_openai,
|
|
1754
|
+
default_evaluators=default_evaluators,
|
|
1755
|
+
)
|
|
1756
|
+
|
|
1757
|
+
results = run_pipeline(pipeline, eval_items, args)
|
|
1223
1758
|
|
|
1224
|
-
#
|
|
1225
|
-
if effective_log_level in ("info", "debug"):
|
|
1226
|
-
emit_structured_log("info", "Running evaluations.", operation=Operation.EVALUATE)
|
|
1227
|
-
results = run_evaluations(args, responses, eval_items, default_evaluators)
|
|
1228
|
-
|
|
1229
|
-
# 6. Output results
|
|
1759
|
+
# 4. Output results
|
|
1230
1760
|
output_results(results, args, default_evaluators=default_evaluators,
|
|
1231
1761
|
agent_name=agent_name, cli_version=str(cli_version) if cli_version else None)
|
|
1232
1762
|
|
|
1233
1763
|
if effective_log_level in ("info", "debug"):
|
|
1234
1764
|
emit_structured_log(
|
|
1235
1765
|
"info",
|
|
1236
|
-
f"Evaluation completed successfully. Processed {len(
|
|
1766
|
+
f"Evaluation completed successfully. Processed {len(eval_items)} item(s).",
|
|
1237
1767
|
operation=Operation.EVALUATE,
|
|
1238
1768
|
)
|
|
1239
1769
|
|