@microsoft/m365-copilot-eval 1.2.0-preview.1 → 1.3.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -4
- package/package.json +3 -2
- package/schema/CHANGELOG.md +8 -0
- package/schema/v1/eval-document.schema.json +117 -1
- package/schema/v1/examples/valid/comprehensive.json +27 -2
- package/schema/version.json +2 -2
- package/src/clients/cli/cli_logging/__init__.py +0 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +55 -0
- package/src/clients/cli/cli_logging/logging_utils.py +145 -0
- package/src/clients/cli/common.py +51 -0
- package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +3 -3
- package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +11 -11
- package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +1 -11
- package/src/clients/cli/evaluator_resolver.py +150 -0
- package/src/clients/cli/generate_report.py +130 -110
- package/src/clients/cli/main.py +545 -236
- package/src/clients/cli/readme.md +14 -7
- package/src/clients/cli/requirements.txt +1 -0
- package/src/clients/cli/response_extractor.py +32 -14
- package/src/clients/node-js/bin/runevals.js +58 -28
- package/src/clients/node-js/config/default.js +1 -1
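The headline change in this release is per-prompt evaluator selection: an eval document can now carry a top-level `default_evaluators` map, and each item can override it with its own `evaluators` and `evaluators_mode` fields, resolved by the new `evaluator_resolver` module shown in the main.py diff below. As a rough illustration only, the sketch below builds such a document in Python; the evaluator names, option keys, and `schemaVersion` value are assumptions inferred from this diff, not copied from the published schema in `package/schema/v1/eval-document.schema.json`.

```python
# Hypothetical sketch of an eval document using the new per-prompt evaluator
# fields; field names are inferred from the main.py diff below, not from the
# published schema, so treat every value here as illustrative.
import json

eval_document = {
    "schemaVersion": "1.1",                 # assumption: see schema/version.json for the real value
    "default_evaluators": {                 # applied to every item unless overridden
        "Relevance": {},
        "Coherence": {},
        "Groundedness": {},
    },
    "items": [
        {
            "prompt": "What is the capital of France?",
            "expected_response": "Paris",
            "evaluators_mode": "extend",    # "extend" is the default mode in run_evaluations
            "evaluators": {
                "ExactMatch": {"case_sensitive": False},
            },
        },
    ],
}

with open("prompts.json", "w", encoding="utf-8") as f:
    json.dump(eval_document, f, indent=2)

# The CLI would then consume it via load_prompts_from_file, e.g.:
#   python main.py --prompts-file prompts.json
```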
package/src/clients/cli/main.py
CHANGED
@@ -3,6 +3,8 @@ import os
 import argparse
 import sys
 import csv
+import functools
+import logging
 import webbrowser
 import urllib.request
 import urllib.error
@@ -20,14 +22,39 @@ from azure.ai.evaluation (
 from dotenv import load_dotenv
 from auth.auth_handler import AuthHandler
 from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat
-
-
+from custom_evaluators.ExactMatchEvaluator import ExactMatchEvaluator
+from custom_evaluators.PartialMatchEvaluator import PartialMatchEvaluator
 from generate_report import generate_html_report, calculate_aggregate_statistics
 from response_extractor import extract_enhanced_responses, get_response_text_for_evaluation
 from schema_handler import DocumentUpgrader, SchemaVersionManager
+from common import (
+    RELEVANCE,
+    COHERENCE,
+    GROUNDEDNESS,
+    TOOL_CALL_ACCURACY,
+    CITATIONS,
+    EXACT_MATCH,
+    PARTIAL_MATCH,
+    REQUIRES_AZURE_OPENAI,
+    REQUIRES_TOOL_DEFINITIONS,
+    METRIC_IDS,
+    pascal_case_to_title,
+)
+from evaluator_resolver import (
+    EVALUATOR_REGISTRY,
+    validate_evaluator_names,
+    check_prerequisites,
+    resolve_default_evaluators,
+    resolve_evaluators_for_prompt,
+    get_evaluator_threshold,
+)
 from version_check import check_min_version, get_cli_version
 from datetime import datetime, timezone
 from pathlib import Path
+import tzlocal
+
+from cli_logging.console_diagnostics import render_diagnostic, serialize_diagnostic_record
+from cli_logging.logging_utils import LOG_LEVEL_MAP, LogLevel, Operation, format_structured_log_entry, resolve_log_level
 
 # Allowed endpoints for URL validation
 ALLOWED_ENDPOINTS = [
@@ -46,20 +73,63 @@ VERSION_CHECK_BYPASS_FLAGS = (
     "signout",
 )
 
+CLI_LOGGER_NAME = "m365.eval.cli"
+CLI_LOGGER = logging.getLogger(CLI_LOGGER_NAME)
+DIAGNOSTIC_RECORDS: List[Dict[str, Any]] = []
+
+
+def _ensure_logger_handler() -> None:
+    if CLI_LOGGER.handlers:
+        return
+    handler = logging.StreamHandler(sys.stdout)
+    handler.setFormatter(logging.Formatter("%(message)s"))
+    CLI_LOGGER.addHandler(handler)
+    CLI_LOGGER.propagate = False
+
+
+def configure_cli_logging(effective_log_level: str) -> None:
+    _ensure_logger_handler()
+    CLI_LOGGER.setLevel(LOG_LEVEL_MAP[effective_log_level])
+
+
+def emit_structured_log(level: str, message: str, operation: str = Operation.EVALUATE) -> None:
+    _ensure_logger_handler()
+    context = {
+        "request-id": None,
+        "conversation-id": None,
+        "message-id": None,
+        "operation": operation,
+    }
+    entry = format_structured_log_entry(
+        level=level,
+        message=message,
+        logger_name=CLI_LOGGER_NAME,
+        run_context=context,
+    )
+    DIAGNOSTIC_RECORDS.append(entry)
+
+    try:
+        CLI_LOGGER.log(LOG_LEVEL_MAP.get(level, logging.INFO), render_diagnostic(entry))
+    except Exception:
+        pass
+
 
 def should_bypass_min_version_check(args: argparse.Namespace) -> bool:
     """Return True if the current invocation should skip min-version checks."""
     return any(getattr(args, flag, False) for flag in VERSION_CHECK_BYPASS_FLAGS)
 
-def write_results_to_html(results: List[Dict], output_file: str
+def write_results_to_html(results: List[Dict], output_file: str,
+                          agent_name: Optional[str] = None, agent_id: Optional[str] = None,
+                          cli_version: Optional[str] = None):
     """Write results to HTML file using generate_html_report from generate_report.py."""
     try:
-        html = generate_html_report(results
+        html = generate_html_report(results, agent_name=agent_name, agent_id=agent_id,
+                                    cli_version=cli_version)
         with open(output_file, 'w', encoding='utf-8') as f:
             f.write(html)
-
+        emit_structured_log("info", f"HTML report saved to {output_file}", operation=Operation.WRITE_OUTPUT)
     except Exception as e:
-
+        emit_structured_log("error", f"Error writing to HTML file: {e}", operation=Operation.WRITE_OUTPUT)
         sys.exit(1)
 
 def get_default_prompts_and_responses():
@@ -72,7 +142,7 @@ def get_default_prompts_and_responses():
     ]
     return prompts, expected_responses
 
-def load_prompts_from_file(file_path: str) -> Tuple[List[
+def load_prompts_from_file(file_path: str) -> Tuple[List[Dict], Optional[Dict]]:
     """Load prompts and expected responses from a JSON file.
 
     Supports three formats:
@@ -82,6 +152,10 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
 
     For eval documents (format 1) and array format (format 2), schema validation
     and auto-upgrade are applied via DocumentUpgrader.
+
+    Returns:
+        Tuple of (eval_items, default_evaluators). Items are dicts with prompt,
+        expected_response, and optional evaluators/evaluators_mode fields.
     """
     try:
         with open(file_path, 'r', encoding='utf-8') as f:
@@ -98,18 +172,18 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
             upgrader = DocumentUpgrader()
         except Exception as e:
             # Schema infrastructure not available (missing files, etc.) — skip
-
+            emit_structured_log("warning", f"Unable to initialize document upgrader: {e}", operation=Operation.LOAD_PROMPTS)
             upgrader = None
 
         if upgrader is not None:
             result = upgrader.upgrade(Path(file_path))
 
             if result.error:
-
+                emit_structured_log("error", f"Schema validation error: {result.error}", operation=Operation.LOAD_PROMPTS)
                 sys.exit(1)
 
             if result.upgraded and result.message:
-
+                emit_structured_log("info", result.message, operation=Operation.LOAD_PROMPTS)
 
             # Use the parsed document from the upgrade result
             if result.document is not None:
@@ -117,26 +191,26 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
 
         if isinstance(data, list):
             # Format: [{"prompt": "...", "expected_response": "..."}, ...]
-
-            expected_responses = [item.get("expected_response", "") for item in data]
+            return data, None
         elif isinstance(data, dict):
             if "items" in data:
                 # Eval document format: {"schemaVersion": "...", "items": [...]}
-
-                prompts = [item.get("prompt", "") for item in items]
-                expected_responses = [item.get("expected_response", "") for item in items]
+                return data["items"], data.get("default_evaluators")
             else:
                 # Format: {"prompts": [...], "expected_responses": [...]}
                 prompts = data.get("prompts", [])
                 expected_responses = data.get("expected_responses", [])
+                eval_items = [
+                    {"prompt": p, "expected_response": e}
+                    for p, e in zip(prompts, expected_responses)
+                ]
+                return eval_items, None
         else:
             raise ValueError("Invalid file format")
-
-        return prompts, expected_responses
     except SystemExit:
         raise
     except Exception as e:
-
+        emit_structured_log("error", f"Error loading prompts from file: {e}", operation=Operation.LOAD_PROMPTS)
         sys.exit(1)
 
 def get_interactive_prompts() -> Tuple[List[str], List[str]]:
@@ -163,116 +237,168 @@ def get_interactive_prompts() -> Tuple[List[str], List[str]]:
 
     return prompts, expected_responses
 
-def run_evaluations(args, responses:
-
+def run_evaluations(args, responses: List[Dict[str, Any]], eval_items: List[Dict],
+                    default_evaluators: Dict[str, Any]) -> list:
+    """Run evaluations against the responses using per-prompt evaluator resolution.
+
+    Args:
+        args: CLI arguments.
+        responses: List of enhanced response dicts (one per prompt, aligned with eval_items by index).
+        eval_items: List of item dicts (prompt, expected_response, evaluators, evaluators_mode).
+        default_evaluators: Resolved default evaluators (from resolve_default_evaluators).
+    """
+    if len(responses) != len(eval_items):
+        raise ValueError(
+            f"Mismatch between number of responses ({len(responses)}) and evaluation items ({len(eval_items)})."
+        )
+
     model_config = AzureOpenAIModelConfiguration(
         azure_endpoint=os.environ.get("AZURE_AI_OPENAI_ENDPOINT"),
         api_key=os.environ.get("AZURE_AI_API_KEY"),
         api_version=os.environ.get("AZURE_AI_API_VERSION"),
         azure_deployment=os.environ.get("AZURE_AI_MODEL_NAME"),
     )
-
-    # Initialize evaluators
-    relevance_evaluator = RelevanceEvaluator(model_config=model_config) # Evaluate relevance for a given response. Range is 1 - 5.
-    coherence_evaluator = CoherenceEvaluator(model_config=model_config) # Measures the coherence (human-like quality) of the response. Range is 1 - 5.
-    groundedness_evaluator = GroundednessEvaluator(model_config=model_config) # Evaluates the response for for factuality and groundedness against provided context. Range is 1 - 5.
-    #concisenessnonllm_evaluator = ConcisenessNonLLMEvaluator() # Evaluates the response for conciseness. Range is 1 - 5.
-    #pii_evaluator = PIIEvaluator(model_config=model_config) # Evaluates the response for presence of PII. Range
-    # Parse citation format from args
-    citation_format = CitationFormat.OAI_UNICODE if args.citation_format == 'oai_unicode' else CitationFormat.LEGACY_BRACKET
-    citations_evaluator = CitationsEvaluator(citation_format=citation_format) # Evaluates citations present in the response using regex pattern matching
-
-    tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config) # Evaluate tool call accuracy if tool definitions are present in response
-
 
-
+    # Build available context for prerequisite checks
+    has_azure_openai = bool(
+        os.environ.get("AZURE_AI_OPENAI_ENDPOINT")
+        and os.environ.get("AZURE_AI_API_KEY")
+    )
+
+    DEFAULT_PASS_THRESHOLD = 3
 
-    def decorate_metric(metric_id: str, data):
+    def decorate_metric(metric_id: str, data, threshold: Optional[int] = None):
         """Augment raw evaluator output with standardized threshold + pass/fail result."""
+        pass_threshold = threshold if threshold is not None else DEFAULT_PASS_THRESHOLD
         payload = {}
-        # Preserve original structure if dict
        if isinstance(data, dict):
             payload.update(data)
         else:
             payload['raw'] = data
 
-        # Try to extract a numeric score
         score_val = None
         if isinstance(data, dict):
-
-
-                score_val = data[k]
-                break
+            if metric_id in data:
+                score_val = data[metric_id]
         if isinstance(score_val, (int, float)):
-            payload['threshold'] =
-            payload['result'] = 'pass' if score_val >=
+            payload['threshold'] = pass_threshold
+            payload['result'] = 'pass' if score_val >= pass_threshold else 'fail'
         else:
-
-            payload['threshold'] = PASS_THRESHOLD
+            payload['threshold'] = pass_threshold
             payload.setdefault('result', 'unknown')
         return json.dumps(payload, indent=4)
 
+    # Validate all evaluator names upfront (across defaults and all items)
+    all_evaluator_maps = [default_evaluators]
+    for eval_item in eval_items:
+        if "evaluators" in eval_item:
+            all_evaluator_maps.append(eval_item["evaluators"])
+    for emap in all_evaluator_maps:
+        validate_evaluator_names(emap)
+
     evaluation_results = []
-    for
-        # Extract text response for evaluation (backward compatibility)
-        enhanced_response = responses[prompt]
+    for enhanced_response, eval_item in zip(responses, eval_items):
         actual_response_text = get_response_text_for_evaluation(enhanced_response)
-
-
-
-
-
-
-
-
-            response=actual_response_text
-        )
-
-        groundedness_score = groundedness_evaluator(
-            response=actual_response_text,
-            context=expected_response
+        prompt = eval_item.get("prompt", "")
+        expected_response = eval_item.get("expected_response", "")
+        prompt_evaluators = eval_item.get("evaluators")
+        evaluators_mode = eval_item.get("evaluators_mode", "extend")
+
+        # Resolve evaluators for this prompt
+        resolved = resolve_evaluators_for_prompt(
+            prompt_evaluators, evaluators_mode, prompt, default_evaluators,
         )
 
-        #
-
-
-        citations_score = citations_evaluator(
-            response=actual_response_text
+        # Build runtime context for prerequisite checks
+        has_tool_defs = bool(
+            args.m365_agent_id and enhanced_response.get("tool_definitions")
         )
+        available_context = {
+            REQUIRES_AZURE_OPENAI: has_azure_openai,
+            REQUIRES_TOOL_DEFINITIONS: has_tool_defs,
+        }
 
-
-
-
-
-
-
-
+        results_dict: Dict[str, Optional[str]] = {}
+        evaluators_ran: List[str] = []
+
+        for eval_name, eval_options in resolved.items():
+            # Check prerequisites
+            can_run, warn_msg = check_prerequisites(eval_name, available_context)
+            if not can_run:
+                if warn_msg:
+                    emit_structured_log("warning", f"Evaluator '{eval_name}' prerequisite check failed: {warn_msg}", operation=Operation.EVALUATE)
+                results_dict[eval_name] = None
+                continue
+
+            evaluators_ran.append(eval_name)
+            threshold = get_evaluator_threshold(eval_name, eval_options)
+
+            if eval_name == RELEVANCE:
+                raw_score = RelevanceEvaluator(model_config=model_config)(query=prompt, response=actual_response_text)
+                results_dict[RELEVANCE] = decorate_metric(METRIC_IDS[RELEVANCE], raw_score, threshold)
+            elif eval_name == COHERENCE:
+                raw_score = CoherenceEvaluator(model_config=model_config)(query=prompt, response=actual_response_text)
+                results_dict[COHERENCE] = decorate_metric(METRIC_IDS[COHERENCE], raw_score, threshold)
+            elif eval_name == GROUNDEDNESS:
+                raw_score = GroundednessEvaluator(model_config=model_config)(response=actual_response_text, context=expected_response)
+                results_dict[GROUNDEDNESS] = decorate_metric(METRIC_IDS[GROUNDEDNESS], raw_score, threshold)
+            elif eval_name == TOOL_CALL_ACCURACY:
+                raw_score = ToolCallAccuracyEvaluator(model_config)(
+                    query=prompt,
+                    response=enhanced_response.get("response", actual_response_text),
+                    tool_definitions=enhanced_response["tool_definitions"],
+                )
+                results_dict[TOOL_CALL_ACCURACY] = decorate_metric(METRIC_IDS[TOOL_CALL_ACCURACY], raw_score, threshold)
+            elif eval_name == CITATIONS:
+                fmt_str = eval_options.get("citation_format", "oai_unicode")
+                fmt_map = {
+                    "oai_unicode": CitationFormat.OAI_UNICODE,
+                    "bracket": CitationFormat.LEGACY_BRACKET,
+                    "mixed": CitationFormat.AUTO,
+                }
+                raw_score = CitationsEvaluator(citation_format=fmt_map.get(fmt_str, CitationFormat.OAI_UNICODE))(response=actual_response_text)
+                results_dict[CITATIONS] = decorate_metric(METRIC_IDS[CITATIONS], raw_score, threshold)
+            elif eval_name == EXACT_MATCH:
+                # ExactMatch is binary (match/no-match) — it includes its own result
+                # field, so we skip decorate_metric which assumes a numeric score.
+                case_sensitive = eval_options.get("case_sensitive", False)
+                raw_score = ExactMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response_text, expected_answer=expected_response)
+                results_dict[EXACT_MATCH] = json.dumps(raw_score, indent=4)
+            elif eval_name == PARTIAL_MATCH:
+                case_sensitive = eval_options.get("case_sensitive", False)
+                raw_score = PartialMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response_text, expected_answer=expected_response)
+                results_dict[PARTIAL_MATCH] = decorate_metric(METRIC_IDS[PARTIAL_MATCH], raw_score, threshold)
 
         evaluation_result = {
             "prompt": prompt,
-            "response": actual_response_text,
+            "response": actual_response_text,
             "expected_response": expected_response,
-            "
-
-            "coherence_score": decorate_metric("coherence", coherence_score),
-            "groundedness_score": decorate_metric("groundedness", groundedness_score),
-            #"concisenessnonllm_score": decorate_metric("concisenessnonllm", concisenessNonLLM_score),
-            #"pii_score": decorate_metric("pii", PII_score),
-            "citations_score": json.dumps(citations_score, indent=4),
-            "tool_call_accuracy_score": json.dumps(tool_call_accuracy, indent=4) if tool_call_accuracy else None
-        }
+            "evaluators_ran": evaluators_ran,
+            "results": results_dict,
         }
 
-
-
-
-
+        # Preserve evaluator config metadata for output
+        if "evaluators" in eval_item:
+            evaluation_result["evaluators"] = eval_item["evaluators"]
+        if "evaluators_mode" in eval_item:
+            evaluation_result["evaluators_mode"] = eval_item["evaluators_mode"]
+
+        if getattr(args, "effective_log_level", "info") == "debug":
+            emit_structured_log(
+                "debug",
+                f"Evaluation completed for prompt='{evaluation_result['prompt']}'. "
+                f"Evaluators: {', '.join(evaluators_ran)}. "
+                f"Scores: {evaluation_result['results']}",
+                operation=Operation.EVALUATE,
+            )
 
         evaluation_results.append(evaluation_result)
-
+
     return evaluation_results
 
-def write_results_to_console(results
+def write_results_to_console(results, agent_name: Optional[str] = None,
+                             agent_id: Optional[str] = None,
+                             cli_version: Optional[str] = None):
     """Write the response to console."""
     # ANSI color codes
     BOLD = '\033[1m'
@@ -284,47 +410,66 @@ def write_results_to_console(results):
     ORANGE = '\033[38;5;208m'
     RED = '\033[91m'
     RESET = '\033[0m'
-
+
+    # Show metadata
+    metadata_parts = []
+    if agent_name:
+        metadata_parts.append(f"Agent Name: {agent_name}")
+    if agent_id:
+        metadata_parts.append(f"Agent ID: {agent_id}")
+    if cli_version:
+        metadata_parts.append(f"CLI Version: {cli_version}")
+    if metadata_parts:
+        print(f"{BOLD}{CYAN}{' | '.join(metadata_parts)}{RESET}")
+        print()
+
     # Show aggregate statistics if multiple results
     if len(results) > 1:
         aggregates = calculate_aggregate_statistics(results)
         if aggregates:
-            print(f"{BOLD}{BLUE}
+            print(f"{BOLD}{BLUE}Aggregate Statistics ({len(results)} prompts):{RESET}")
             print(f"{BLUE}{'=' * 60}{RESET}")
-
+
             for metric_name, stats in aggregates.items():
                 pass_color = GREEN if stats['pass_rate'] >= 80 else YELLOW if stats['pass_rate'] >= 60 else RED
-
+                prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
+                total_prompts = stats.get('total_prompts', len(results))
+                print(f"{BOLD}{CYAN}{metric_name}:{RESET} ({prompts_evaluated}/{total_prompts} prompts)")
                 print(f"  Pass Rate: {pass_color}{stats['pass_rate']:.1f}%{RESET} ({stats['pass_count']}/{stats['total_evaluated']} passed)")
                 print(f"  Avg Score: {MAGENTA}{stats['avg_score']:.2f}{RESET}")
                 if stats.get('threshold') is not None:
                     print(f"  Threshold: {YELLOW}{stats['threshold']}{RESET}")
                 print()
-
+
             print(f"{BLUE}{'=' * 60}{RESET}")
             print()
-
-    print(f"{BOLD}{BLUE}
+
+    print(f"{BOLD}{BLUE}Individual Results:{RESET}")
     print(f"{BLUE}{'=' * 50}{RESET}")
     for i, result in enumerate(results, 1):
         print(f"{BOLD}{GREEN}Prompt {i}:{RESET} {result['prompt']}")
+
+        # Show which evaluators ran for this prompt
+        evaluators_ran = result.get('evaluators_ran', [])
+        if evaluators_ran:
+            print(f"{BOLD}{CYAN}Evaluators:{RESET} {', '.join(evaluators_ran)}")
+
         print(f"{BOLD}{CYAN}Response:{RESET} {result['response']}")
         print(f"{BOLD}{YELLOW}Expected Response:{RESET} {result['expected_response']}")
 
-        # Print metric scores
-        metrics = result.get('results'
-
-
-
-
-
-
-
-
-
-
-            print(f"{BOLD}{color}{name}:{RESET} {v}")
+        # Print metric scores from results
+        metrics = result.get('results', {})
+        for eval_name, v in metrics.items():
+            if v is None:
+                continue  # Skip null/N/A scores from skipped evaluators
+            display_name = pascal_case_to_title(eval_name)
+            if eval_name == RELEVANCE:
+                color = MAGENTA
+            elif eval_name == COHERENCE:
+                color = ORANGE
+            else:
+                color = BLUE
+            print(f"{BOLD}{color}{display_name}:{RESET} {v}")
         print(f"{BLUE}{'-' * 30}{RESET}")
 
 def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
@@ -336,10 +481,8 @@ def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
     DEFAULT_THRESHOLD = 3 # fallback; decorate_metric should always set this
 
     score_val = None
-
-
-            score_val = data[k]
-            break
+    if metric_id in data and isinstance(data[metric_id], (int, float)):
+        score_val = data[metric_id]
     if score_val is None:
         return None
 
@@ -362,9 +505,11 @@ def convert_result_to_eval_item(result: Dict) -> Dict:
     """Convert an internal evaluation result dict to a schema-compliant EvalItem.
 
     Internal format (from run_evaluations):
-        {prompt, response, expected_response, results: {
+        {prompt, response, expected_response, results: {Relevance: "JSON", ...},
+         evaluators_ran: [...], evaluators: {...}, evaluators_mode: "..."}
     Schema EvalItem format:
-        {prompt, response, expected_response, scores: {relevance: EvalScore, ...}
+        {prompt, response, expected_response, scores: {relevance: EvalScore, ...},
+         evaluators: {...}, evaluators_mode: "..."}
     """
     item: Dict[str, Any] = {
         "prompt": result["prompt"],
@@ -372,30 +517,35 @@ def convert_result_to_eval_item(result: Dict) -> Dict:
         "expected_response": result["expected_response"],
     }
 
+    # Preserve evaluator config in output
+    if "evaluators" in result:
+        item["evaluators"] = result["evaluators"]
+    if "evaluators_mode" in result:
+        item["evaluators_mode"] = result["evaluators_mode"]
+
     scores: Dict[str, Any] = {}
     results_dict = result.get("results", {})
 
     # EvalScore metrics (all share the same schema shape: {score, result, threshold})
-
-
-        (
-        (
-        (
-        ("tool_call_accuracy_score", "tool_call_accuracy", "toolCallAccuracy"),
+    for eval_key, schema_key in [
+        (RELEVANCE, "relevance"),
+        (COHERENCE, "coherence"),
+        (GROUNDEDNESS, "groundedness"),
+        (TOOL_CALL_ACCURACY, "toolCallAccuracy"),
     ]:
-        raw = results_dict.get(
+        raw = results_dict.get(eval_key)
         if not raw:
             continue
         data = json.loads(raw) if isinstance(raw, str) else raw
-        eval_score = extract_eval_score(data,
+        eval_score = extract_eval_score(data, METRIC_IDS[eval_key])
         if eval_score:
             scores[schema_key] = eval_score
 
-    # Citations → CitationScore
-    raw_citations = results_dict.get(
+    # Citations → CitationScore
+    raw_citations = results_dict.get(CITATIONS)
     if raw_citations:
         data = json.loads(raw_citations) if isinstance(raw_citations, str) else raw_citations
-        count = data.get("
+        count = data.get("citations", 0)
         cit_result = data.get("result")
         if cit_result not in ("pass", "fail"):
             cit_result = "pass" if count >= data.get("threshold", 1) else "fail"
@@ -409,17 +559,42 @@ def convert_result_to_eval_item(result: Dict) -> Dict:
             citation_score["format"] = data["citation_format"]
         scores["citations"] = citation_score
 
+    # ExactMatch → ExactMatchScore
+    raw_exact = results_dict.get(EXACT_MATCH)
+    if raw_exact:
+        data = json.loads(raw_exact) if isinstance(raw_exact, str) else raw_exact
+        is_match = data.get("exact_match", 0.0) == 1.0
+        scores["exactMatch"] = {
+            "match": is_match,
+            "result": data.get("result", "pass" if is_match else "fail"),
+            "reason": data.get("exact_match_reason", ""),
+        }
+
+    # PartialMatch → PartialMatchScore
+    raw_partial = results_dict.get(PARTIAL_MATCH)
+    if raw_partial:
+        data = json.loads(raw_partial) if isinstance(raw_partial, str) else raw_partial
+        scores["partialMatch"] = {
+            "score": data.get("partial_match", 0.0),
+            "result": data.get("result", "fail"),
+            "threshold": data.get("threshold", 0.5),
+            "reason": data.get("partial_match_reason", ""),
+        }
+
     if scores:
         item["scores"] = scores
 
     return item
 
 
-def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None
+def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None,
+                          default_evaluators: Optional[Dict[str, Any]] = None,
+                          agent_name: Optional[str] = None,
+                          cli_version: Optional[str] = None):
     """Write results to a schema-compliant eval document JSON file.
 
     Output follows the eval-document.schema.json format:
-        {schemaVersion, metadata, items: [EvalItem]}
+        {schemaVersion, metadata, default_evaluators?, items: [EvalItem]}
     """
     try:
         try:
@@ -434,43 +609,68 @@ def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optio
         }
         if agent_id:
             metadata["agentId"] = agent_id
+        if agent_name:
+            metadata["agentName"] = agent_name
+        if cli_version:
+            metadata["cliVersion"] = cli_version
 
         output_data: Dict[str, Any] = {
             "schemaVersion": current_version,
             "metadata": metadata,
-            "items": items,
         }
 
+        if default_evaluators is not None:
+            output_data["default_evaluators"] = default_evaluators
+
+        output_data["items"] = items
+
         with open(output_file, 'w', encoding='utf-8') as f:
             json.dump(output_data, f, indent=2, ensure_ascii=False)
-
+        emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
     except Exception as e:
-
+        emit_structured_log("error", f"Error writing to JSON file: {e}", operation=Operation.WRITE_OUTPUT)
         sys.exit(1)
 
-def write_results_to_csv(results: List[Dict], output_file: str
+def write_results_to_csv(results: List[Dict], output_file: str,
+                         agent_name: Optional[str] = None, agent_id: Optional[str] = None,
+                         cli_version: Optional[str] = None):
     """Write results to CSV file."""
     try:
         with open(output_file, 'w', newline='', encoding='utf-8') as f:
             if results:
+                # Write metadata header
+                metadata_parts = []
+                if agent_name:
+                    metadata_parts.append(f"Agent Name: {agent_name}")
+                if agent_id:
+                    metadata_parts.append(f"Agent ID: {agent_id}")
+                if cli_version:
+                    metadata_parts.append(f"CLI Version: {cli_version}")
+                if metadata_parts:
+                    f.write(f"# {' | '.join(metadata_parts)}\n")
+
                 # Write aggregate statistics first if multiple results
                 if len(results) > 1:
                     aggregates = calculate_aggregate_statistics(results)
                     if aggregates:
                         f.write("# AGGREGATE STATISTICS\n")
-                        f.write("Metric,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
+                        f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
                         for metric_name, stats in aggregates.items():
                             threshold_str = str(stats.get('threshold', 'N/A'))
-
+                            prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
+                            total_prompts = stats.get('total_prompts', len(results))
+                            f.write(f"{metric_name},{prompts_evaluated},{total_prompts},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
                         f.write("\n# INDIVIDUAL RESULTS\n")
-
-                # Write individual results
-
+
+                # Write individual results (exclude internal fields)
+                exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode'}
+                fieldnames = [k for k in results[0].keys() if k not in exclude_keys]
+                writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
                 writer.writeheader()
                 writer.writerows(results)
-
+        emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
     except Exception as e:
-
+        emit_structured_log("error", f"Error writing to CSV file: {e}", operation=Operation.WRITE_OUTPUT)
         sys.exit(1)
 
 def parse_arguments():
@@ -501,8 +701,8 @@ Examples:
   # Save results to HTML and open in browser
   python main.py --output report.html
 
-  #
-  python main.py --
+  # Debug-level diagnostics
+  python main.py --log-level debug
 
   # Sign out and clear cached authentication tokens
   python main.py --signout
@@ -551,21 +751,13 @@ Examples:
 
     # Behavior options
     parser.add_argument(
-        '--
-
-
-
-
-        '--quiet',
-        action='store_true',
-        help='Suppress non-essential output'
-    )
-    parser.add_argument(
-        '--citation-format',
-        choices=['oai_unicode', 'legacy_bracket'],
-        default='oai_unicode',
-        help='Citation format to detect. "oai_unicode" for new OAI format (default), "legacy_bracket" for old [^i^] format'
+        '--log-level',
+        nargs='?',
+        const='info',
+        action='append',
+        help='Set log verbosity: debug, info, warning, error. Bare --log-level resolves to info.'
     )
+
     parser.add_argument(
         '--signout',
         action='store_true',
@@ -598,8 +790,13 @@ def validate_environment() -> CallPath:
 
     missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
     if missing_vars:
-
-
+        emit_structured_log(
+            "error",
+            "Missing required environment variables: "
+            f"{', '.join(missing_vars)}. Please ensure your .env file contains "
+            "all required Azure configuration.",
+            operation=Operation.VALIDATE_ENV,
+        )
         sys.exit(1)
     return call_path
 
@@ -633,23 +830,42 @@ def validate_endpoint_url(url: str, allowed_domains: List[str]) -> bool:
         # Convert other parsing errors to ValueError
         raise ValueError(f"Invalid URL format: {url}") from e
 
-def get_prompt_datasets(args) -> Tuple[List[
-    """Get prompts and expected responses based on command line arguments.
+def get_prompt_datasets(args) -> Tuple[List[Dict], Optional[Dict]]:
+    """Get prompts and expected responses based on command line arguments.
+
+    Returns:
+        Tuple of (eval_items, default_evaluators).
+    """
     if args.prompts:
         if args.expected and len(args.prompts) != len(args.expected):
-
+            emit_structured_log(
+                "error",
+                "Number of prompts must match number of expected responses. "
+                "Update --expected values to match the prompt count.",
+            )
             sys.exit(1)
-
-
+        expected_responses = args.expected or [""] * len(args.prompts)
+        eval_items = [
+            {"prompt": p, "expected_response": e}
+            for p, e in zip(args.prompts, expected_responses)
+        ]
+        return eval_items, None
     elif args.prompts_file:
-
+        return load_prompts_from_file(args.prompts_file)
     elif args.interactive:
         prompts, expected_responses = get_interactive_prompts()
+        eval_items = [
+            {"prompt": p, "expected_response": e}
+            for p, e in zip(prompts, expected_responses)
+        ]
+        return eval_items, None
     else:
-        # Use default prompts
         prompts, expected_responses = get_default_prompts_and_responses()
-
-
+        eval_items = [
+            {"prompt": p, "expected_response": e}
+            for p, e in zip(prompts, expected_responses)
+        ]
+        return eval_items, None
 
 def fetch_available_agents(copilot_api_endpoint: str, access_token: str, user_oid: str) -> List[Dict[str, Any]]:
     """
@@ -685,26 +901,27 @@ def fetch_available_agents(copilot_api_endpoint: str, access_token: str, user_oi
         return agents
     except urllib.error.HTTPError as e:
         # If endpoint doesn't exist or returns error, return empty list
-
+        emit_structured_log("warning", f"Unable to fetch agents list (HTTP {e.code}).", operation=Operation.FETCH_AGENTS)
         return []
     except Exception as e:
-
+        emit_structured_log("warning", f"Error fetching agents: {e}", operation=Operation.FETCH_AGENTS)
         return []
 
-def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
+def select_agent_interactively(agents: List[Dict[str, Any]]) -> Tuple[Optional[str], Optional[str]]:
     """
     Display an interactive agent selector using questionary.
-
+
     Args:
         agents: List of agent dictionaries.
-
+
     Returns:
-
+        Tuple of (agent_id, agent_name) or (None, None) if cancelled/skipped
     """
     if not agents:
-        return None
-
-    #
+        return None, None
+
+    # Build id→name lookup and choices
+    id_to_name: Dict[str, str] = {}
     choices = []
     sorted_agents = sorted(agents, key=lambda x: (not x.get('isOwner', False), x.get('name', '')))
     for agent in sorted_agents:
@@ -712,12 +929,13 @@ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
         agent_name = agent.get("name", "Unknown")
         agent_description = agent.get("description", "Unknown")
         agent_is_owner = agent.get('isOwner')
-
+        id_to_name[agent_id] = agent_name
+
         # Format the display text
         display_text = f"{agent_name} ({agent_id}, IsOwner: {agent_is_owner}) - {agent_description}"
-
+
         choices.append(questionary.Choice(title=display_text, value=agent_id))
-
+
     # Display the selection prompt
     selected_agent = questionary.select(
         "Select an agent to evaluate:",
@@ -725,8 +943,37 @@ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
         use_shortcuts=True,
         use_arrow_keys=True
     ).ask()
-
-    return selected_agent
+
+    return selected_agent, id_to_name.get(selected_agent) if selected_agent else None
+
+@functools.lru_cache(maxsize=1)
+def _get_iana_timezone_name() -> str:
+    """Get the IANA timezone name from the system using tzlocal.
+
+    Tries get_localzone_name() first; falls back to str(get_localzone()) when the
+    former raises (e.g. no zone configured on some Unix systems). Result is cached
+    after the first call so tzlocal is only invoked once per session.
+    """
+    try:
+        return tzlocal.get_localzone_name()
+    except Exception:
+        return str(tzlocal.get_localzone())
+
+
+@functools.lru_cache(maxsize=1)
+def _get_location_info() -> Dict[str, Any]:
+    """Return a locationInfo dict containing the local UTC offset and IANA timezone name.
+
+    Result is cached after the first call so the computation runs only once per session.
+    """
+    now = datetime.now().astimezone()
+    utc_offset = now.utcoffset()
+    offset_hours = int(utc_offset.total_seconds() // 3600) if utc_offset is not None else 0
+    return {
+        "timeZoneOffset": offset_hours,
+        "timeZone": _get_iana_timezone_name(),
+    }
+
 
 def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:
     message = {
@@ -735,6 +982,7 @@ def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:
         "author": "user",
         "messageType": "chat",
         "timestamp": datetime.now(timezone.utc).isoformat(),
+        "locationInfo": _get_location_info(),
         "from": {
             "id": user_oid,
         }
@@ -755,7 +1003,7 @@ def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:
 
     return json.dumps(message).encode("utf-8")
 
-def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str, access_token: str, user_oid: str, args) ->
+def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str, access_token: str, user_oid: str, args) -> List[Dict[str, Any]]:
     """ Send prompts to the chat API and return enhanced responses. """
 
     request_headers = {
@@ -763,15 +1011,15 @@ def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str
         "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
         "Authorization": f"Bearer {access_token}"
     }
-    raw_responses:
+    raw_responses: List[Tuple[str, str]] = []
     for i, prompt in enumerate(prompts, 1):
-        if
-
+        if getattr(args, "effective_log_level", "info") in ("info", "debug"):
+            emit_structured_log("info", f"Processing prompt {i}/{len(prompts)}.", operation=Operation.SEND_PROMPT)
 
         # Build the payload
         payload = build_chat_payload(prompt, user_oid, args.m365_agent_id)
-        if args
-
+        if getattr(args, "effective_log_level", "info") == "debug":
+            emit_structured_log("debug", f"[Sydney] Sending payload: {payload.decode('utf-8')}", operation=Operation.SEND_PROMPT)
 
         # Send the request to /chat
         req = urllib.request.Request(f"{copilot_api_endpoint}/chat", data=payload, headers=request_headers, method="POST")
@@ -790,42 +1038,83 @@ def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str
             raise RuntimeError(msg) from e
         except urllib.error.URLError as e:
             raise RuntimeError(f"Chat API connection error: {getattr(e, 'reason', str(e))}") from e
-
-        if args
-
-
+
+        if getattr(args, "effective_log_level", "info") == "debug":
+            emit_structured_log("debug", f"[Sydney] Raw response: {raw}", operation=Operation.SEND_PROMPT)
+
         # Store raw response for enhancement
-        raw_responses
-
+        raw_responses.append((prompt, raw.strip()))
+
     # Extract enhanced responses using the new extractor
-    enhanced_responses = extract_enhanced_responses(raw_responses)
+    enhanced_responses = extract_enhanced_responses(raw_responses, log_level=getattr(args, "effective_log_level", "info"))
+
+    if getattr(args, "effective_log_level", "info") == "debug":
+        for idx, enhanced in enumerate(enhanced_responses, 1):
+            metadata = enhanced.get("metadata", {})
+            context = {
+                "request-id": metadata.get("request_id"),
+                "conversation-id": metadata.get("conversation_id"),
+                "message-id": metadata.get("message_id"),
+                "operation": Operation.SEND_PROMPT,
+            }
+            entry = format_structured_log_entry(
+                level="debug",
+                message=f"Response IDs for prompt {idx}/{len(enhanced_responses)}.",
+                logger_name=CLI_LOGGER_NAME,
+                run_context=context,
+            )
+            DIAGNOSTIC_RECORDS.append(entry)
+            CLI_LOGGER.log(logging.DEBUG, render_diagnostic(entry))
+
     return enhanced_responses
 
-def output_results(results: List[Dict], args
+def output_results(results: List[Dict], args, default_evaluators: Optional[Dict[str, Any]] = None,
+                   agent_name: Optional[str] = None, cli_version: Optional[str] = None):
     """Output results based on specified format."""
+    metadata_kwargs = dict(
+        agent_name=agent_name,
+        agent_id=getattr(args, 'm365_agent_id', None),
+        cli_version=cli_version,
+    )
     if args.output:
         output_lower = args.output.lower()
         if output_lower.endswith('.json'):
-            write_results_to_json(results, args.output,
+            write_results_to_json(results, args.output, default_evaluators=default_evaluators,
+                                  **metadata_kwargs)
         elif output_lower.endswith('.csv'):
-            write_results_to_csv(results, args.output)
+            write_results_to_csv(results, args.output, **metadata_kwargs)
         elif output_lower.endswith('.html'):
-            write_results_to_html(results, args.output)
+            write_results_to_html(results, args.output, **metadata_kwargs)
             abs_path = os.path.abspath(args.output)
             webbrowser.open(f'file://{abs_path}')
         else:
-            write_results_to_json(results, args.output,
+            write_results_to_json(results, args.output, default_evaluators=default_evaluators,
+                                  **metadata_kwargs)
     else:
-        write_results_to_console(results)
+        write_results_to_console(results, **metadata_kwargs)
 
 def main():
     """Main function to orchestrate the evaluation process."""
     load_dotenv()
     args = parse_arguments()
 
+    effective_log_level, error_message = resolve_log_level(args.log_level)
+    if error_message:
+        print(error_message)
+        print(
+            "Next step: rerun with --log-level {debug|info|warning|error}. "
+            "For support, share the console diagnostics output from this run."
+        )
+        sys.exit(2)
+
+    args.effective_log_level = effective_log_level
+    configure_cli_logging(effective_log_level)
+    emit_structured_log("info", f"Log level set to '{effective_log_level}'.", operation=Operation.SETUP)
+
     # Check minimum version before proceeding
-
-
+    quiet_for_version = effective_log_level in ("warning", "error")
+    cli_version = get_cli_version(quiet=quiet_for_version)
+    if not should_bypass_min_version_check(args) and not check_min_version(cli_version, quiet=quiet_for_version):
         sys.exit(1)
 
     # Validate environment variables required for evaluation
@@ -853,7 +1142,7 @@ def main():
         try:
             auth_handler.clear_cache()
         except Exception as e:
-
+            emit_structured_log("error", f"Error during signout: {e}", operation=Operation.AUTHENTICATE)
             sys.exit(1)
         sys.exit(0)
 
@@ -866,67 +1155,87 @@ def main():
 
         id_token_claims = auth_result.get("id_token_claims")
         if not isinstance(id_token_claims, dict):
-
+            emit_structured_log(
+                "warning", "id_token_claims is missing or invalid in authentication result",
+                operation=Operation.AUTHENTICATE,
+            )
         else:
             user_oid = id_token_claims.get("oid") or ""
 
     except Exception as e:
-
-
-
-
-
+        emit_structured_log("error", f"Error during authentication: {e}", operation=Operation.AUTHENTICATE)
+        if effective_log_level == "debug":
+            import traceback
+            traceback.print_exc()
+        sys.exit(1)
 
     if not user_oid and access_token:
         # Fallback: extract from access token.
         user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)
 
-    # 1. Load evaluation datasets
-
+    # 1. Load evaluation datasets
+    eval_items, file_default_evaluators = get_prompt_datasets(args)
+    default_evaluators = resolve_default_evaluators(file_default_evaluators)
+    prompts = [eval_item.get("prompt", "") for eval_item in eval_items]
 
-    if
-
+    if effective_log_level in ("info", "debug"):
+        emit_structured_log("info", f"Running evaluation on {len(prompts)} prompt(s).", operation=Operation.SETUP)
 
+    agent_name = None
     try:
         # 3. Agent selection - if no agent ID provided, prompt user to select
         if not args.m365_agent_id:
-            if
-
+            if effective_log_level in ("info", "debug"):
+                emit_structured_log("info", "No agent ID provided. Fetching available agents.", operation=Operation.FETCH_AGENTS)
 
             available_agents = fetch_available_agents(copilot_api_endpoint, access_token, user_oid)
             if not available_agents:
-
-
-
-
-
-
-
-
-
-
-
-
+                emit_structured_log(
+                    "error",
+                    "No agents are available for interactive selection. Re-run with "
+                    "--m365-agent-id or set M365_AGENT_ID.",
+                    operation=Operation.FETCH_AGENTS,
+                )
+                sys.exit(1)
+
+            selected_agent_id, agent_name = select_agent_interactively(available_agents)
+            if selected_agent_id:
+                args.m365_agent_id = selected_agent_id
+                if effective_log_level in ("info", "debug"):
+                    emit_structured_log("info", f"Selected agent: {args.m365_agent_id}", operation=Operation.FETCH_AGENTS)
+            else:
+                emit_structured_log(
+                    "error",
+                    "No agent selected. Re-run with --m365-agent-id or set M365_AGENT_ID.",
+                    operation=Operation.FETCH_AGENTS,
+                )
+                sys.exit(1)
 
         # 4. Send prompts to chat API
         responses = send_prompt_to_agent_in_sydney(prompts, copilot_api_endpoint, access_token, user_oid, args)
     except Exception as e:
-
-        if
+        emit_structured_log("error", f"Error sending prompts to chat API: {e}", operation=Operation.SEND_PROMPT)
+        if effective_log_level == "debug":
            import traceback
            traceback.print_exc()
        sys.exit(1)
+
 
     # 5. Run evaluations
-    if
-
-    results = run_evaluations(args, responses,
+    if effective_log_level in ("info", "debug"):
+        emit_structured_log("info", "Running evaluations.", operation=Operation.EVALUATE)
+    results = run_evaluations(args, responses, eval_items, default_evaluators)
 
     # 6. Output results
-    output_results(results, args
+    output_results(results, args, default_evaluators=default_evaluators,
+                   agent_name=agent_name, cli_version=str(cli_version) if cli_version else None)
 
-    if
-
+    if effective_log_level in ("info", "debug"):
+        emit_structured_log(
+            "info",
+            f"Evaluation completed successfully. Processed {len(prompts)} prompt(s).",
+            operation=Operation.EVALUATE,
+        )
 
 # Call the main function when script is run directly
 if __name__ == "__main__":