@microsoft/m365-copilot-eval 1.2.1-preview.1 → 1.3.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -4
- package/package.json +2 -2
- package/schema/CHANGELOG.md +8 -0
- package/schema/v1/eval-document.schema.json +117 -1
- package/schema/v1/examples/valid/comprehensive.json +27 -2
- package/schema/version.json +2 -2
- package/src/clients/cli/cli_logging/__init__.py +0 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +55 -0
- package/src/clients/cli/cli_logging/logging_utils.py +145 -0
- package/src/clients/cli/common.py +51 -0
- package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +3 -3
- package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +11 -11
- package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +1 -11
- package/src/clients/cli/evaluator_resolver.py +150 -0
- package/src/clients/cli/generate_report.py +130 -110
- package/src/clients/cli/main.py +513 -236
- package/src/clients/cli/readme.md +14 -7
- package/src/clients/cli/response_extractor.py +32 -14
- package/src/clients/node-js/bin/runevals.js +58 -28
- package/src/clients/node-js/config/default.js +1 -1
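The schema bump and the CLI changes below add per-item evaluator selection: an optional top-level `default_evaluators` map plus per-item `evaluators` and `evaluators_mode` fields. As a rough, unofficial sketch of the document shape the updated `load_prompts_from_file` appears to accept — evaluator names and most option keys here are guesses inferred from the constants used in the diff (`case_sensitive` and `citation_format` are the only option keys it actually reads); the authoritative definition is `package/schema/v1/eval-document.schema.json`:

```python
# Hypothetical eval document for the updated CLI. Only "schemaVersion",
# "default_evaluators", "items", "prompt", "expected_response", "evaluators",
# and "evaluators_mode" are confirmed by this diff; evaluator names and the
# remaining option keys are illustrative assumptions.
import json

eval_document = {
    "schemaVersion": "1.x",  # exact version string not shown in this diff
    "default_evaluators": {
        "Relevance": {},  # evaluator names assumed PascalCase (cf. pascal_case_to_title)
        "Citations": {"citation_format": "oai_unicode"},
    },
    "items": [
        {
            "prompt": "What is our PTO policy?",
            "expected_response": "Employees accrue 20 days per year.",
            "evaluators": {"ExactMatch": {"case_sensitive": False}},
            "evaluators_mode": "extend",  # per-item default observed in run_evaluations
        }
    ],
}

with open("prompts.json", "w", encoding="utf-8") as f:
    json.dump(eval_document, f, indent=2)
```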
package/src/clients/cli/main.py
CHANGED
@@ -4,6 +4,7 @@ import argparse
 import sys
 import csv
 import functools
+import logging
 import webbrowser
 import urllib.request
 import urllib.error
@@ -21,16 +22,40 @@ from azure.ai.evaluation import (
 from dotenv import load_dotenv
 from auth.auth_handler import AuthHandler
 from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat
-
-
+from custom_evaluators.ExactMatchEvaluator import ExactMatchEvaluator
+from custom_evaluators.PartialMatchEvaluator import PartialMatchEvaluator
 from generate_report import generate_html_report, calculate_aggregate_statistics
 from response_extractor import extract_enhanced_responses, get_response_text_for_evaluation
 from schema_handler import DocumentUpgrader, SchemaVersionManager
+from common import (
+    RELEVANCE,
+    COHERENCE,
+    GROUNDEDNESS,
+    TOOL_CALL_ACCURACY,
+    CITATIONS,
+    EXACT_MATCH,
+    PARTIAL_MATCH,
+    REQUIRES_AZURE_OPENAI,
+    REQUIRES_TOOL_DEFINITIONS,
+    METRIC_IDS,
+    pascal_case_to_title,
+)
+from evaluator_resolver import (
+    EVALUATOR_REGISTRY,
+    validate_evaluator_names,
+    check_prerequisites,
+    resolve_default_evaluators,
+    resolve_evaluators_for_prompt,
+    get_evaluator_threshold,
+)
 from version_check import check_min_version, get_cli_version
 from datetime import datetime, timezone
 from pathlib import Path
 import tzlocal
 
+from cli_logging.console_diagnostics import render_diagnostic, serialize_diagnostic_record
+from cli_logging.logging_utils import LOG_LEVEL_MAP, LogLevel, Operation, format_structured_log_entry, resolve_log_level
+
 # Allowed endpoints for URL validation
 ALLOWED_ENDPOINTS = [
     'substrate.office.com'
@@ -48,20 +73,63 @@ VERSION_CHECK_BYPASS_FLAGS = (
     "signout",
 )
 
+CLI_LOGGER_NAME = "m365.eval.cli"
+CLI_LOGGER = logging.getLogger(CLI_LOGGER_NAME)
+DIAGNOSTIC_RECORDS: List[Dict[str, Any]] = []
+
+
+def _ensure_logger_handler() -> None:
+    if CLI_LOGGER.handlers:
+        return
+    handler = logging.StreamHandler(sys.stdout)
+    handler.setFormatter(logging.Formatter("%(message)s"))
+    CLI_LOGGER.addHandler(handler)
+    CLI_LOGGER.propagate = False
+
+
+def configure_cli_logging(effective_log_level: str) -> None:
+    _ensure_logger_handler()
+    CLI_LOGGER.setLevel(LOG_LEVEL_MAP[effective_log_level])
+
+
+def emit_structured_log(level: str, message: str, operation: str = Operation.EVALUATE) -> None:
+    _ensure_logger_handler()
+    context = {
+        "request-id": None,
+        "conversation-id": None,
+        "message-id": None,
+        "operation": operation,
+    }
+    entry = format_structured_log_entry(
+        level=level,
+        message=message,
+        logger_name=CLI_LOGGER_NAME,
+        run_context=context,
+    )
+    DIAGNOSTIC_RECORDS.append(entry)
+
+    try:
+        CLI_LOGGER.log(LOG_LEVEL_MAP.get(level, logging.INFO), render_diagnostic(entry))
+    except Exception:
+        pass
+
 
 def should_bypass_min_version_check(args: argparse.Namespace) -> bool:
     """Return True if the current invocation should skip min-version checks."""
     return any(getattr(args, flag, False) for flag in VERSION_CHECK_BYPASS_FLAGS)
 
-def write_results_to_html(results: List[Dict], output_file: str
+def write_results_to_html(results: List[Dict], output_file: str,
+                          agent_name: Optional[str] = None, agent_id: Optional[str] = None,
+                          cli_version: Optional[str] = None):
     """Write results to HTML file using generate_html_report from generate_report.py."""
     try:
-        html = generate_html_report(results
+        html = generate_html_report(results, agent_name=agent_name, agent_id=agent_id,
+                                    cli_version=cli_version)
         with open(output_file, 'w', encoding='utf-8') as f:
             f.write(html)
-
+        emit_structured_log("info", f"HTML report saved to {output_file}", operation=Operation.WRITE_OUTPUT)
     except Exception as e:
-
+        emit_structured_log("error", f"Error writing to HTML file: {e}", operation=Operation.WRITE_OUTPUT)
         sys.exit(1)
 
 def get_default_prompts_and_responses():
@@ -74,7 +142,7 @@ def get_default_prompts_and_responses():
     ]
     return prompts, expected_responses
 
-def load_prompts_from_file(file_path: str) -> Tuple[List[
+def load_prompts_from_file(file_path: str) -> Tuple[List[Dict], Optional[Dict]]:
     """Load prompts and expected responses from a JSON file.
 
     Supports three formats:
@@ -84,6 +152,10 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
 
     For eval documents (format 1) and array format (format 2), schema validation
     and auto-upgrade are applied via DocumentUpgrader.
+
+    Returns:
+        Tuple of (eval_items, default_evaluators). Items are dicts with prompt,
+        expected_response, and optional evaluators/evaluators_mode fields.
     """
     try:
         with open(file_path, 'r', encoding='utf-8') as f:
@@ -100,18 +172,18 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
             upgrader = DocumentUpgrader()
         except Exception as e:
             # Schema infrastructure not available (missing files, etc.) — skip
-
+            emit_structured_log("warning", f"Unable to initialize document upgrader: {e}", operation=Operation.LOAD_PROMPTS)
             upgrader = None
 
         if upgrader is not None:
             result = upgrader.upgrade(Path(file_path))
 
             if result.error:
-
+                emit_structured_log("error", f"Schema validation error: {result.error}", operation=Operation.LOAD_PROMPTS)
                 sys.exit(1)
 
             if result.upgraded and result.message:
-
+                emit_structured_log("info", result.message, operation=Operation.LOAD_PROMPTS)
 
             # Use the parsed document from the upgrade result
             if result.document is not None:
@@ -119,26 +191,26 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
 
         if isinstance(data, list):
             # Format: [{"prompt": "...", "expected_response": "..."}, ...]
-
-            expected_responses = [item.get("expected_response", "") for item in data]
+            return data, None
         elif isinstance(data, dict):
             if "items" in data:
                 # Eval document format: {"schemaVersion": "...", "items": [...]}
-
-                prompts = [item.get("prompt", "") for item in items]
-                expected_responses = [item.get("expected_response", "") for item in items]
+                return data["items"], data.get("default_evaluators")
             else:
                 # Format: {"prompts": [...], "expected_responses": [...]}
                 prompts = data.get("prompts", [])
                 expected_responses = data.get("expected_responses", [])
+                eval_items = [
+                    {"prompt": p, "expected_response": e}
+                    for p, e in zip(prompts, expected_responses)
+                ]
+                return eval_items, None
         else:
             raise ValueError("Invalid file format")
-
-        return prompts, expected_responses
     except SystemExit:
         raise
     except Exception as e:
-
+        emit_structured_log("error", f"Error loading prompts from file: {e}", operation=Operation.LOAD_PROMPTS)
         sys.exit(1)
 
 def get_interactive_prompts() -> Tuple[List[str], List[str]]:
@@ -165,116 +237,168 @@ def get_interactive_prompts() -> Tuple[List[str], List[str]]:
 
     return prompts, expected_responses
 
-def run_evaluations(args, responses:
-
+def run_evaluations(args, responses: List[Dict[str, Any]], eval_items: List[Dict],
+                    default_evaluators: Dict[str, Any]) -> list:
+    """Run evaluations against the responses using per-prompt evaluator resolution.
+
+    Args:
+        args: CLI arguments.
+        responses: List of enhanced response dicts (one per prompt, aligned with eval_items by index).
+        eval_items: List of item dicts (prompt, expected_response, evaluators, evaluators_mode).
+        default_evaluators: Resolved default evaluators (from resolve_default_evaluators).
+    """
+    if len(responses) != len(eval_items):
+        raise ValueError(
+            f"Mismatch between number of responses ({len(responses)}) and evaluation items ({len(eval_items)})."
+        )
+
     model_config = AzureOpenAIModelConfiguration(
         azure_endpoint=os.environ.get("AZURE_AI_OPENAI_ENDPOINT"),
         api_key=os.environ.get("AZURE_AI_API_KEY"),
         api_version=os.environ.get("AZURE_AI_API_VERSION"),
         azure_deployment=os.environ.get("AZURE_AI_MODEL_NAME"),
     )
-
-    # Initialize evaluators
-    relevance_evaluator = RelevanceEvaluator(model_config=model_config) # Evaluate relevance for a given response. Range is 1 - 5.
-    coherence_evaluator = CoherenceEvaluator(model_config=model_config) # Measures the coherence (human-like quality) of the response. Range is 1 - 5.
-    groundedness_evaluator = GroundednessEvaluator(model_config=model_config) # Evaluates the response for for factuality and groundedness against provided context. Range is 1 - 5.
-    #concisenessnonllm_evaluator = ConcisenessNonLLMEvaluator() # Evaluates the response for conciseness. Range is 1 - 5.
-    #pii_evaluator = PIIEvaluator(model_config=model_config) # Evaluates the response for presence of PII. Range
-    # Parse citation format from args
-    citation_format = CitationFormat.OAI_UNICODE if args.citation_format == 'oai_unicode' else CitationFormat.LEGACY_BRACKET
-    citations_evaluator = CitationsEvaluator(citation_format=citation_format) # Evaluates citations present in the response using regex pattern matching
-
-    tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config) # Evaluate tool call accuracy if tool definitions are present in response
-
 
-
+    # Build available context for prerequisite checks
+    has_azure_openai = bool(
+        os.environ.get("AZURE_AI_OPENAI_ENDPOINT")
+        and os.environ.get("AZURE_AI_API_KEY")
+    )
 
-
+    DEFAULT_PASS_THRESHOLD = 3
+
+    def decorate_metric(metric_id: str, data, threshold: Optional[int] = None):
         """Augment raw evaluator output with standardized threshold + pass/fail result."""
+        pass_threshold = threshold if threshold is not None else DEFAULT_PASS_THRESHOLD
         payload = {}
-        # Preserve original structure if dict
         if isinstance(data, dict):
             payload.update(data)
         else:
             payload['raw'] = data
 
-        # Try to extract a numeric score
         score_val = None
         if isinstance(data, dict):
-
-
-                score_val = data[k]
-                break
+            if metric_id in data:
+                score_val = data[metric_id]
         if isinstance(score_val, (int, float)):
-            payload['threshold'] =
-            payload['result'] = 'pass' if score_val >=
+            payload['threshold'] = pass_threshold
+            payload['result'] = 'pass' if score_val >= pass_threshold else 'fail'
         else:
-
-            payload['threshold'] = PASS_THRESHOLD
+            payload['threshold'] = pass_threshold
             payload.setdefault('result', 'unknown')
         return json.dumps(payload, indent=4)
 
+    # Validate all evaluator names upfront (across defaults and all items)
+    all_evaluator_maps = [default_evaluators]
+    for eval_item in eval_items:
+        if "evaluators" in eval_item:
+            all_evaluator_maps.append(eval_item["evaluators"])
+    for emap in all_evaluator_maps:
+        validate_evaluator_names(emap)
+
     evaluation_results = []
-    for
-        # Extract text response for evaluation (backward compatibility)
-        enhanced_response = responses[prompt]
+    for enhanced_response, eval_item in zip(responses, eval_items):
         actual_response_text = get_response_text_for_evaluation(enhanced_response)
-
-
-
-
-
+        prompt = eval_item.get("prompt", "")
+        expected_response = eval_item.get("expected_response", "")
+        prompt_evaluators = eval_item.get("evaluators")
+        evaluators_mode = eval_item.get("evaluators_mode", "extend")
+
+        # Resolve evaluators for this prompt
+        resolved = resolve_evaluators_for_prompt(
+            prompt_evaluators, evaluators_mode, prompt, default_evaluators,
         )
-        coherence_score = coherence_evaluator(
-            query=prompt,
-            response=actual_response_text
-        )
-
-        groundedness_score = groundedness_evaluator(
-            response=actual_response_text,
-            context=expected_response
-        )
-
-        #PII_score = pii_evaluator(response=actual_response_text)
-        #concisenessNonLLM_score = concisenessnonllm_evaluator(response=actual_response_text)
 
-
-
+        # Build runtime context for prerequisite checks
+        has_tool_defs = bool(
+            args.m365_agent_id and enhanced_response.get("tool_definitions")
         )
+        available_context = {
+            REQUIRES_AZURE_OPENAI: has_azure_openai,
+            REQUIRES_TOOL_DEFINITIONS: has_tool_defs,
+        }
 
-
-
-
-
-
-
-
+        results_dict: Dict[str, Optional[str]] = {}
+        evaluators_ran: List[str] = []
+
+        for eval_name, eval_options in resolved.items():
+            # Check prerequisites
+            can_run, warn_msg = check_prerequisites(eval_name, available_context)
+            if not can_run:
+                if warn_msg:
+                    emit_structured_log("warning", f"Evaluator '{eval_name}' prerequisite check failed: {warn_msg}", operation=Operation.EVALUATE)
+                results_dict[eval_name] = None
+                continue
+
+            evaluators_ran.append(eval_name)
+            threshold = get_evaluator_threshold(eval_name, eval_options)
+
+            if eval_name == RELEVANCE:
+                raw_score = RelevanceEvaluator(model_config=model_config)(query=prompt, response=actual_response_text)
+                results_dict[RELEVANCE] = decorate_metric(METRIC_IDS[RELEVANCE], raw_score, threshold)
+            elif eval_name == COHERENCE:
+                raw_score = CoherenceEvaluator(model_config=model_config)(query=prompt, response=actual_response_text)
+                results_dict[COHERENCE] = decorate_metric(METRIC_IDS[COHERENCE], raw_score, threshold)
+            elif eval_name == GROUNDEDNESS:
+                raw_score = GroundednessEvaluator(model_config=model_config)(response=actual_response_text, context=expected_response)
+                results_dict[GROUNDEDNESS] = decorate_metric(METRIC_IDS[GROUNDEDNESS], raw_score, threshold)
+            elif eval_name == TOOL_CALL_ACCURACY:
+                raw_score = ToolCallAccuracyEvaluator(model_config)(
+                    query=prompt,
+                    response=enhanced_response.get("response", actual_response_text),
+                    tool_definitions=enhanced_response["tool_definitions"],
+                )
+                results_dict[TOOL_CALL_ACCURACY] = decorate_metric(METRIC_IDS[TOOL_CALL_ACCURACY], raw_score, threshold)
+            elif eval_name == CITATIONS:
+                fmt_str = eval_options.get("citation_format", "oai_unicode")
+                fmt_map = {
+                    "oai_unicode": CitationFormat.OAI_UNICODE,
+                    "bracket": CitationFormat.LEGACY_BRACKET,
+                    "mixed": CitationFormat.AUTO,
+                }
+                raw_score = CitationsEvaluator(citation_format=fmt_map.get(fmt_str, CitationFormat.OAI_UNICODE))(response=actual_response_text)
+                results_dict[CITATIONS] = decorate_metric(METRIC_IDS[CITATIONS], raw_score, threshold)
+            elif eval_name == EXACT_MATCH:
+                # ExactMatch is binary (match/no-match) — it includes its own result
+                # field, so we skip decorate_metric which assumes a numeric score.
+                case_sensitive = eval_options.get("case_sensitive", False)
+                raw_score = ExactMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response_text, expected_answer=expected_response)
+                results_dict[EXACT_MATCH] = json.dumps(raw_score, indent=4)
+            elif eval_name == PARTIAL_MATCH:
+                case_sensitive = eval_options.get("case_sensitive", False)
+                raw_score = PartialMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response_text, expected_answer=expected_response)
+                results_dict[PARTIAL_MATCH] = decorate_metric(METRIC_IDS[PARTIAL_MATCH], raw_score, threshold)
 
         evaluation_result = {
             "prompt": prompt,
-            "response": actual_response_text,
+            "response": actual_response_text,
             "expected_response": expected_response,
-            "
-
-            "coherence_score": decorate_metric("coherence", coherence_score),
-            "groundedness_score": decorate_metric("groundedness", groundedness_score),
-            #"concisenessnonllm_score": decorate_metric("concisenessnonllm", concisenessNonLLM_score),
-            #"pii_score": decorate_metric("pii", PII_score),
-            "citations_score": json.dumps(citations_score, indent=4),
-            "tool_call_accuracy_score": json.dumps(tool_call_accuracy, indent=4) if tool_call_accuracy else None
-        }
+            "evaluators_ran": evaluators_ran,
+            "results": results_dict,
         }
 
-
-
-
-
+        # Preserve evaluator config metadata for output
+        if "evaluators" in eval_item:
+            evaluation_result["evaluators"] = eval_item["evaluators"]
+        if "evaluators_mode" in eval_item:
+            evaluation_result["evaluators_mode"] = eval_item["evaluators_mode"]
+
+        if getattr(args, "effective_log_level", "info") == "debug":
+            emit_structured_log(
+                "debug",
+                f"Evaluation completed for prompt='{evaluation_result['prompt']}'. "
+                f"Evaluators: {', '.join(evaluators_ran)}. "
+                f"Scores: {evaluation_result['results']}",
+                operation=Operation.EVALUATE,
+            )
 
         evaluation_results.append(evaluation_result)
-
+
     return evaluation_results
 
-def write_results_to_console(results
+def write_results_to_console(results, agent_name: Optional[str] = None,
+                             agent_id: Optional[str] = None,
+                             cli_version: Optional[str] = None):
     """Write the response to console."""
     # ANSI color codes
     BOLD = '\033[1m'
@@ -286,47 +410,66 @@ def write_results_to_console(results):
     ORANGE = '\033[38;5;208m'
     RED = '\033[91m'
     RESET = '\033[0m'
-
+
+    # Show metadata
+    metadata_parts = []
+    if agent_name:
+        metadata_parts.append(f"Agent Name: {agent_name}")
+    if agent_id:
+        metadata_parts.append(f"Agent ID: {agent_id}")
+    if cli_version:
+        metadata_parts.append(f"CLI Version: {cli_version}")
+    if metadata_parts:
+        print(f"{BOLD}{CYAN}{' | '.join(metadata_parts)}{RESET}")
+        print()
+
     # Show aggregate statistics if multiple results
     if len(results) > 1:
         aggregates = calculate_aggregate_statistics(results)
         if aggregates:
-            print(f"{BOLD}{BLUE}
+            print(f"{BOLD}{BLUE}Aggregate Statistics ({len(results)} prompts):{RESET}")
             print(f"{BLUE}{'=' * 60}{RESET}")
-
+
             for metric_name, stats in aggregates.items():
                 pass_color = GREEN if stats['pass_rate'] >= 80 else YELLOW if stats['pass_rate'] >= 60 else RED
-
+                prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
+                total_prompts = stats.get('total_prompts', len(results))
+                print(f"{BOLD}{CYAN}{metric_name}:{RESET} ({prompts_evaluated}/{total_prompts} prompts)")
                print(f" Pass Rate: {pass_color}{stats['pass_rate']:.1f}%{RESET} ({stats['pass_count']}/{stats['total_evaluated']} passed)")
                print(f" Avg Score: {MAGENTA}{stats['avg_score']:.2f}{RESET}")
                if stats.get('threshold') is not None:
                    print(f" Threshold: {YELLOW}{stats['threshold']}{RESET}")
                print()
-
+
             print(f"{BLUE}{'=' * 60}{RESET}")
             print()
-
-    print(f"{BOLD}{BLUE}
+
+    print(f"{BOLD}{BLUE}Individual Results:{RESET}")
     print(f"{BLUE}{'=' * 50}{RESET}")
     for i, result in enumerate(results, 1):
         print(f"{BOLD}{GREEN}Prompt {i}:{RESET} {result['prompt']}")
+
+        # Show which evaluators ran for this prompt
+        evaluators_ran = result.get('evaluators_ran', [])
+        if evaluators_ran:
+            print(f"{BOLD}{CYAN}Evaluators:{RESET} {', '.join(evaluators_ran)}")
+
         print(f"{BOLD}{CYAN}Response:{RESET} {result['response']}")
         print(f"{BOLD}{YELLOW}Expected Response:{RESET} {result['expected_response']}")
 
-        # Print metric scores
-        metrics = result.get('results'
-
-
-
-
-
-
-
-
-
-
-            print(f"{BOLD}{color}{name}:{RESET} {v}")
+        # Print metric scores from results
+        metrics = result.get('results', {})
+        for eval_name, v in metrics.items():
+            if v is None:
+                continue  # Skip null/N/A scores from skipped evaluators
+            display_name = pascal_case_to_title(eval_name)
+            if eval_name == RELEVANCE:
+                color = MAGENTA
+            elif eval_name == COHERENCE:
+                color = ORANGE
+            else:
+                color = BLUE
+            print(f"{BOLD}{color}{display_name}:{RESET} {v}")
         print(f"{BLUE}{'-' * 30}{RESET}")
 
 def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
@@ -338,10 +481,8 @@ def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
     DEFAULT_THRESHOLD = 3 # fallback; decorate_metric should always set this
 
     score_val = None
-
-
-            score_val = data[k]
-            break
+    if metric_id in data and isinstance(data[metric_id], (int, float)):
+        score_val = data[metric_id]
     if score_val is None:
         return None
 
@@ -364,9 +505,11 @@ def convert_result_to_eval_item(result: Dict) -> Dict:
     """Convert an internal evaluation result dict to a schema-compliant EvalItem.
 
     Internal format (from run_evaluations):
-        {prompt, response, expected_response, results: {
+        {prompt, response, expected_response, results: {Relevance: "JSON", ...},
+        evaluators_ran: [...], evaluators: {...}, evaluators_mode: "..."}
     Schema EvalItem format:
-        {prompt, response, expected_response, scores: {relevance: EvalScore, ...}
+        {prompt, response, expected_response, scores: {relevance: EvalScore, ...},
+        evaluators: {...}, evaluators_mode: "..."}
     """
     item: Dict[str, Any] = {
         "prompt": result["prompt"],
@@ -374,30 +517,35 @@ def convert_result_to_eval_item(result: Dict) -> Dict:
         "expected_response": result["expected_response"],
     }
 
+    # Preserve evaluator config in output
+    if "evaluators" in result:
+        item["evaluators"] = result["evaluators"]
+    if "evaluators_mode" in result:
+        item["evaluators_mode"] = result["evaluators_mode"]
+
     scores: Dict[str, Any] = {}
     results_dict = result.get("results", {})
 
     # EvalScore metrics (all share the same schema shape: {score, result, threshold})
-
-
-        (
-        (
-        (
-        ("tool_call_accuracy_score", "tool_call_accuracy", "toolCallAccuracy"),
+    for eval_key, schema_key in [
+        (RELEVANCE, "relevance"),
+        (COHERENCE, "coherence"),
+        (GROUNDEDNESS, "groundedness"),
+        (TOOL_CALL_ACCURACY, "toolCallAccuracy"),
     ]:
-        raw = results_dict.get(
+        raw = results_dict.get(eval_key)
         if not raw:
             continue
         data = json.loads(raw) if isinstance(raw, str) else raw
-        eval_score = extract_eval_score(data,
+        eval_score = extract_eval_score(data, METRIC_IDS[eval_key])
         if eval_score:
            scores[schema_key] = eval_score
 
-    # Citations → CitationScore
-    raw_citations = results_dict.get(
+    # Citations → CitationScore
+    raw_citations = results_dict.get(CITATIONS)
     if raw_citations:
         data = json.loads(raw_citations) if isinstance(raw_citations, str) else raw_citations
-        count = data.get("
+        count = data.get("citations", 0)
         cit_result = data.get("result")
         if cit_result not in ("pass", "fail"):
             cit_result = "pass" if count >= data.get("threshold", 1) else "fail"
@@ -411,17 +559,42 @@ def convert_result_to_eval_item(result: Dict) -> Dict:
             citation_score["format"] = data["citation_format"]
         scores["citations"] = citation_score
 
+    # ExactMatch → ExactMatchScore
+    raw_exact = results_dict.get(EXACT_MATCH)
+    if raw_exact:
+        data = json.loads(raw_exact) if isinstance(raw_exact, str) else raw_exact
+        is_match = data.get("exact_match", 0.0) == 1.0
+        scores["exactMatch"] = {
+            "match": is_match,
+            "result": data.get("result", "pass" if is_match else "fail"),
+            "reason": data.get("exact_match_reason", ""),
+        }
+
+    # PartialMatch → PartialMatchScore
+    raw_partial = results_dict.get(PARTIAL_MATCH)
+    if raw_partial:
+        data = json.loads(raw_partial) if isinstance(raw_partial, str) else raw_partial
+        scores["partialMatch"] = {
+            "score": data.get("partial_match", 0.0),
+            "result": data.get("result", "fail"),
+            "threshold": data.get("threshold", 0.5),
+            "reason": data.get("partial_match_reason", ""),
+        }
+
     if scores:
         item["scores"] = scores
 
     return item
 
 
-def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None
+def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None,
+                          default_evaluators: Optional[Dict[str, Any]] = None,
+                          agent_name: Optional[str] = None,
+                          cli_version: Optional[str] = None):
     """Write results to a schema-compliant eval document JSON file.
 
     Output follows the eval-document.schema.json format:
-        {schemaVersion, metadata, items: [EvalItem]}
+        {schemaVersion, metadata, default_evaluators?, items: [EvalItem]}
     """
     try:
         try:
@@ -436,43 +609,68 @@ def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optio
         }
         if agent_id:
             metadata["agentId"] = agent_id
+        if agent_name:
+            metadata["agentName"] = agent_name
+        if cli_version:
+            metadata["cliVersion"] = cli_version
 
         output_data: Dict[str, Any] = {
             "schemaVersion": current_version,
             "metadata": metadata,
-            "items": items,
         }
 
+        if default_evaluators is not None:
+            output_data["default_evaluators"] = default_evaluators
+
+        output_data["items"] = items
+
         with open(output_file, 'w', encoding='utf-8') as f:
             json.dump(output_data, f, indent=2, ensure_ascii=False)
-
+        emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
     except Exception as e:
-
+        emit_structured_log("error", f"Error writing to JSON file: {e}", operation=Operation.WRITE_OUTPUT)
         sys.exit(1)
 
-def write_results_to_csv(results: List[Dict], output_file: str
+def write_results_to_csv(results: List[Dict], output_file: str,
+                         agent_name: Optional[str] = None, agent_id: Optional[str] = None,
+                         cli_version: Optional[str] = None):
     """Write results to CSV file."""
     try:
         with open(output_file, 'w', newline='', encoding='utf-8') as f:
             if results:
+                # Write metadata header
+                metadata_parts = []
+                if agent_name:
+                    metadata_parts.append(f"Agent Name: {agent_name}")
+                if agent_id:
+                    metadata_parts.append(f"Agent ID: {agent_id}")
+                if cli_version:
+                    metadata_parts.append(f"CLI Version: {cli_version}")
+                if metadata_parts:
+                    f.write(f"# {' | '.join(metadata_parts)}\n")
+
                # Write aggregate statistics first if multiple results
                if len(results) > 1:
                    aggregates = calculate_aggregate_statistics(results)
                    if aggregates:
                        f.write("# AGGREGATE STATISTICS\n")
-                        f.write("Metric,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
+                        f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
                        for metric_name, stats in aggregates.items():
                            threshold_str = str(stats.get('threshold', 'N/A'))
-
+                            prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
+                            total_prompts = stats.get('total_prompts', len(results))
+                            f.write(f"{metric_name},{prompts_evaluated},{total_prompts},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
                        f.write("\n# INDIVIDUAL RESULTS\n")
-
-                # Write individual results
-
+
+                # Write individual results (exclude internal fields)
+                exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode'}
+                fieldnames = [k for k in results[0].keys() if k not in exclude_keys]
+                writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
                writer.writeheader()
                writer.writerows(results)
-
+        emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
     except Exception as e:
-
+        emit_structured_log("error", f"Error writing to CSV file: {e}", operation=Operation.WRITE_OUTPUT)
         sys.exit(1)
 
 def parse_arguments():
@@ -503,8 +701,8 @@ Examples:
   # Save results to HTML and open in browser
   python main.py --output report.html
 
-  #
-  python main.py --
+  # Debug-level diagnostics
+  python main.py --log-level debug
 
   # Sign out and clear cached authentication tokens
   python main.py --signout
@@ -553,21 +751,13 @@ Examples:
 
     # Behavior options
     parser.add_argument(
-        '--
-
-
-
-
-        '--quiet',
-        action='store_true',
-        help='Suppress non-essential output'
-    )
-    parser.add_argument(
-        '--citation-format',
-        choices=['oai_unicode', 'legacy_bracket'],
-        default='oai_unicode',
-        help='Citation format to detect. "oai_unicode" for new OAI format (default), "legacy_bracket" for old [^i^] format'
+        '--log-level',
+        nargs='?',
+        const='info',
+        action='append',
+        help='Set log verbosity: debug, info, warning, error. Bare --log-level resolves to info.'
     )
+
     parser.add_argument(
         '--signout',
         action='store_true',
@@ -600,8 +790,13 @@ def validate_environment() -> CallPath:
 
     missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
     if missing_vars:
-
-
+        emit_structured_log(
+            "error",
+            "Missing required environment variables: "
+            f"{', '.join(missing_vars)}. Please ensure your .env file contains "
+            "all required Azure configuration.",
+            operation=Operation.VALIDATE_ENV,
+        )
         sys.exit(1)
     return call_path
 
@@ -635,23 +830,42 @@ def validate_endpoint_url(url: str, allowed_domains: List[str]) -> bool:
         # Convert other parsing errors to ValueError
         raise ValueError(f"Invalid URL format: {url}") from e
 
-def get_prompt_datasets(args) -> Tuple[List[
-    """Get prompts and expected responses based on command line arguments.
+def get_prompt_datasets(args) -> Tuple[List[Dict], Optional[Dict]]:
+    """Get prompts and expected responses based on command line arguments.
+
+    Returns:
+        Tuple of (eval_items, default_evaluators).
+    """
     if args.prompts:
         if args.expected and len(args.prompts) != len(args.expected):
-
+            emit_structured_log(
+                "error",
+                "Number of prompts must match number of expected responses. "
+                "Update --expected values to match the prompt count.",
+            )
             sys.exit(1)
-
-
+        expected_responses = args.expected or [""] * len(args.prompts)
+        eval_items = [
+            {"prompt": p, "expected_response": e}
+            for p, e in zip(args.prompts, expected_responses)
+        ]
+        return eval_items, None
     elif args.prompts_file:
-
+        return load_prompts_from_file(args.prompts_file)
     elif args.interactive:
         prompts, expected_responses = get_interactive_prompts()
+        eval_items = [
+            {"prompt": p, "expected_response": e}
+            for p, e in zip(prompts, expected_responses)
+        ]
+        return eval_items, None
     else:
-        # Use default prompts
         prompts, expected_responses = get_default_prompts_and_responses()
-
-
+        eval_items = [
+            {"prompt": p, "expected_response": e}
+            for p, e in zip(prompts, expected_responses)
+        ]
+        return eval_items, None
 
 def fetch_available_agents(copilot_api_endpoint: str, access_token: str, user_oid: str) -> List[Dict[str, Any]]:
     """
@@ -687,26 +901,27 @@ def fetch_available_agents(copilot_api_endpoint: str, access_token: str, user_oi
         return agents
     except urllib.error.HTTPError as e:
         # If endpoint doesn't exist or returns error, return empty list
-
+        emit_structured_log("warning", f"Unable to fetch agents list (HTTP {e.code}).", operation=Operation.FETCH_AGENTS)
         return []
     except Exception as e:
-
+        emit_structured_log("warning", f"Error fetching agents: {e}", operation=Operation.FETCH_AGENTS)
         return []
 
-def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
+def select_agent_interactively(agents: List[Dict[str, Any]]) -> Tuple[Optional[str], Optional[str]]:
     """
     Display an interactive agent selector using questionary.
-
+
     Args:
         agents: List of agent dictionaries.
-
+
     Returns:
-
+        Tuple of (agent_id, agent_name) or (None, None) if cancelled/skipped
     """
     if not agents:
-        return None
-
-    #
+        return None, None
+
+    # Build id→name lookup and choices
+    id_to_name: Dict[str, str] = {}
     choices = []
     sorted_agents = sorted(agents, key=lambda x: (not x.get('isOwner', False), x.get('name', '')))
     for agent in sorted_agents:
@@ -714,12 +929,13 @@ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
         agent_name = agent.get("name", "Unknown")
         agent_description = agent.get("description", "Unknown")
         agent_is_owner = agent.get('isOwner')
-
+        id_to_name[agent_id] = agent_name
+
         # Format the display text
         display_text = f"{agent_name} ({agent_id}, IsOwner: {agent_is_owner}) - {agent_description}"
-
+
         choices.append(questionary.Choice(title=display_text, value=agent_id))
-
+
     # Display the selection prompt
     selected_agent = questionary.select(
         "Select an agent to evaluate:",
@@ -727,8 +943,8 @@ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
         use_shortcuts=True,
         use_arrow_keys=True
     ).ask()
-
-    return selected_agent
+
+    return selected_agent, id_to_name.get(selected_agent) if selected_agent else None
 
 @functools.lru_cache(maxsize=1)
 def _get_iana_timezone_name() -> str:
@@ -787,7 +1003,7 @@ def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:
 
     return json.dumps(message).encode("utf-8")
 
-def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str, access_token: str, user_oid: str, args) ->
+def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str, access_token: str, user_oid: str, args) -> List[Dict[str, Any]]:
     """ Send prompts to the chat API and return enhanced responses. """
 
     request_headers = {
@@ -795,15 +1011,15 @@ def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str
         "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
         "Authorization": f"Bearer {access_token}"
     }
-    raw_responses:
+    raw_responses: List[Tuple[str, str]] = []
     for i, prompt in enumerate(prompts, 1):
-        if
-
+        if getattr(args, "effective_log_level", "info") in ("info", "debug"):
+            emit_structured_log("info", f"Processing prompt {i}/{len(prompts)}.", operation=Operation.SEND_PROMPT)
 
         # Build the payload
         payload = build_chat_payload(prompt, user_oid, args.m365_agent_id)
-        if args
-
+        if getattr(args, "effective_log_level", "info") == "debug":
+            emit_structured_log("debug", f"[Sydney] Sending payload: {payload.decode('utf-8')}", operation=Operation.SEND_PROMPT)
 
         # Send the request to /chat
         req = urllib.request.Request(f"{copilot_api_endpoint}/chat", data=payload, headers=request_headers, method="POST")
@@ -822,42 +1038,83 @@ def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str
             raise RuntimeError(msg) from e
         except urllib.error.URLError as e:
             raise RuntimeError(f"Chat API connection error: {getattr(e, 'reason', str(e))}") from e
-
-        if args
-
-
+
+        if getattr(args, "effective_log_level", "info") == "debug":
+            emit_structured_log("debug", f"[Sydney] Raw response: {raw}", operation=Operation.SEND_PROMPT)
+
         # Store raw response for enhancement
-        raw_responses
-
+        raw_responses.append((prompt, raw.strip()))
+
     # Extract enhanced responses using the new extractor
-    enhanced_responses = extract_enhanced_responses(raw_responses)
+    enhanced_responses = extract_enhanced_responses(raw_responses, log_level=getattr(args, "effective_log_level", "info"))
+
+    if getattr(args, "effective_log_level", "info") == "debug":
+        for idx, enhanced in enumerate(enhanced_responses, 1):
+            metadata = enhanced.get("metadata", {})
+            context = {
+                "request-id": metadata.get("request_id"),
+                "conversation-id": metadata.get("conversation_id"),
+                "message-id": metadata.get("message_id"),
+                "operation": Operation.SEND_PROMPT,
+            }
+            entry = format_structured_log_entry(
+                level="debug",
+                message=f"Response IDs for prompt {idx}/{len(enhanced_responses)}.",
+                logger_name=CLI_LOGGER_NAME,
+                run_context=context,
+            )
+            DIAGNOSTIC_RECORDS.append(entry)
+            CLI_LOGGER.log(logging.DEBUG, render_diagnostic(entry))
+
     return enhanced_responses
 
-def output_results(results: List[Dict], args
+def output_results(results: List[Dict], args, default_evaluators: Optional[Dict[str, Any]] = None,
+                   agent_name: Optional[str] = None, cli_version: Optional[str] = None):
     """Output results based on specified format."""
+    metadata_kwargs = dict(
+        agent_name=agent_name,
+        agent_id=getattr(args, 'm365_agent_id', None),
+        cli_version=cli_version,
+    )
     if args.output:
         output_lower = args.output.lower()
         if output_lower.endswith('.json'):
-            write_results_to_json(results, args.output,
+            write_results_to_json(results, args.output, default_evaluators=default_evaluators,
+                                  **metadata_kwargs)
         elif output_lower.endswith('.csv'):
-            write_results_to_csv(results, args.output)
+            write_results_to_csv(results, args.output, **metadata_kwargs)
         elif output_lower.endswith('.html'):
-            write_results_to_html(results, args.output)
+            write_results_to_html(results, args.output, **metadata_kwargs)
             abs_path = os.path.abspath(args.output)
             webbrowser.open(f'file://{abs_path}')
         else:
-            write_results_to_json(results, args.output,
+            write_results_to_json(results, args.output, default_evaluators=default_evaluators,
+                                  **metadata_kwargs)
     else:
-        write_results_to_console(results)
+        write_results_to_console(results, **metadata_kwargs)
 
 def main():
     """Main function to orchestrate the evaluation process."""
     load_dotenv()
     args = parse_arguments()
 
+    effective_log_level, error_message = resolve_log_level(args.log_level)
+    if error_message:
+        print(error_message)
+        print(
+            "Next step: rerun with --log-level {debug|info|warning|error}. "
+            "For support, share the console diagnostics output from this run."
+        )
+        sys.exit(2)
+
+    args.effective_log_level = effective_log_level
+    configure_cli_logging(effective_log_level)
+    emit_structured_log("info", f"Log level set to '{effective_log_level}'.", operation=Operation.SETUP)
+
     # Check minimum version before proceeding
-
-
+    quiet_for_version = effective_log_level in ("warning", "error")
+    cli_version = get_cli_version(quiet=quiet_for_version)
+    if not should_bypass_min_version_check(args) and not check_min_version(cli_version, quiet=quiet_for_version):
        sys.exit(1)
 
    # Validate environment variables required for evaluation
@@ -885,7 +1142,7 @@ def main():
        try:
            auth_handler.clear_cache()
        except Exception as e:
-
+            emit_structured_log("error", f"Error during signout: {e}", operation=Operation.AUTHENTICATE)
            sys.exit(1)
        sys.exit(0)
 
@@ -898,67 +1155,87 @@ def main():
 
         id_token_claims = auth_result.get("id_token_claims")
         if not isinstance(id_token_claims, dict):
-
+            emit_structured_log(
+                "warning", "id_token_claims is missing or invalid in authentication result",
+                operation=Operation.AUTHENTICATE,
+            )
         else:
             user_oid = id_token_claims.get("oid") or ""
 
     except Exception as e:
-
-
-
-
-
+        emit_structured_log("error", f"Error during authentication: {e}", operation=Operation.AUTHENTICATE)
+        if effective_log_level == "debug":
+            import traceback
+            traceback.print_exc()
+        sys.exit(1)
 
     if not user_oid and access_token:
         # Fallback: extract from access token.
         user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)
 
-    # 1. Load evaluation datasets
-
+    # 1. Load evaluation datasets
+    eval_items, file_default_evaluators = get_prompt_datasets(args)
+    default_evaluators = resolve_default_evaluators(file_default_evaluators)
+    prompts = [eval_item.get("prompt", "") for eval_item in eval_items]
 
-    if
-
+    if effective_log_level in ("info", "debug"):
+        emit_structured_log("info", f"Running evaluation on {len(prompts)} prompt(s).", operation=Operation.SETUP)
 
+    agent_name = None
     try:
         # 3. Agent selection - if no agent ID provided, prompt user to select
         if not args.m365_agent_id:
-            if
-
+            if effective_log_level in ("info", "debug"):
+                emit_structured_log("info", "No agent ID provided. Fetching available agents.", operation=Operation.FETCH_AGENTS)
 
             available_agents = fetch_available_agents(copilot_api_endpoint, access_token, user_oid)
             if not available_agents:
-
-
-
-
-
-
-
-
-
-
-
-
+                emit_structured_log(
+                    "error",
+                    "No agents are available for interactive selection. Re-run with "
+                    "--m365-agent-id or set M365_AGENT_ID.",
+                    operation=Operation.FETCH_AGENTS,
+                )
+                sys.exit(1)
+
+            selected_agent_id, agent_name = select_agent_interactively(available_agents)
+            if selected_agent_id:
+                args.m365_agent_id = selected_agent_id
+                if effective_log_level in ("info", "debug"):
+                    emit_structured_log("info", f"Selected agent: {args.m365_agent_id}", operation=Operation.FETCH_AGENTS)
+            else:
+                emit_structured_log(
+                    "error",
+                    "No agent selected. Re-run with --m365-agent-id or set M365_AGENT_ID.",
+                    operation=Operation.FETCH_AGENTS,
+                )
+                sys.exit(1)
 
         # 4. Send prompts to chat API
         responses = send_prompt_to_agent_in_sydney(prompts, copilot_api_endpoint, access_token, user_oid, args)
     except Exception as e:
-
-        if
+        emit_structured_log("error", f"Error sending prompts to chat API: {e}", operation=Operation.SEND_PROMPT)
+        if effective_log_level == "debug":
            import traceback
            traceback.print_exc()
        sys.exit(1)
+
 
     # 5. Run evaluations
-    if
-
-    results = run_evaluations(args, responses,
+    if effective_log_level in ("info", "debug"):
+        emit_structured_log("info", "Running evaluations.", operation=Operation.EVALUATE)
+    results = run_evaluations(args, responses, eval_items, default_evaluators)
 
     # 6. Output results
-    output_results(results, args
+    output_results(results, args, default_evaluators=default_evaluators,
+                   agent_name=agent_name, cli_version=str(cli_version) if cli_version else None)
 
-    if
-
+    if effective_log_level in ("info", "debug"):
+        emit_structured_log(
+            "info",
+            f"Evaluation completed successfully. Processed {len(prompts)} prompt(s).",
+            operation=Operation.EVALUATE,
+        )
 
 # Call the main function when script is run directly
 if __name__ == "__main__":