@microsoft/m365-copilot-eval 1.2.0-preview.1 → 1.3.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,8 @@ import os
  import argparse
  import sys
  import csv
+ import functools
+ import logging
  import webbrowser
  import urllib.request
  import urllib.error
@@ -20,14 +22,39 @@ from azure.ai.evaluation import (
  from dotenv import load_dotenv
  from auth.auth_handler import AuthHandler
  from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat
- #from custom_evaluators.ConcisenessNonLLMEvaluator import ConcisenessNonLLMEvaluator
- #from custom_evaluators.PII.PII import PIIEvaluator
+ from custom_evaluators.ExactMatchEvaluator import ExactMatchEvaluator
+ from custom_evaluators.PartialMatchEvaluator import PartialMatchEvaluator
  from generate_report import generate_html_report, calculate_aggregate_statistics
  from response_extractor import extract_enhanced_responses, get_response_text_for_evaluation
  from schema_handler import DocumentUpgrader, SchemaVersionManager
+ from common import (
+ RELEVANCE,
+ COHERENCE,
+ GROUNDEDNESS,
+ TOOL_CALL_ACCURACY,
+ CITATIONS,
+ EXACT_MATCH,
+ PARTIAL_MATCH,
+ REQUIRES_AZURE_OPENAI,
+ REQUIRES_TOOL_DEFINITIONS,
+ METRIC_IDS,
+ pascal_case_to_title,
+ )
+ from evaluator_resolver import (
+ EVALUATOR_REGISTRY,
+ validate_evaluator_names,
+ check_prerequisites,
+ resolve_default_evaluators,
+ resolve_evaluators_for_prompt,
+ get_evaluator_threshold,
+ )
  from version_check import check_min_version, get_cli_version
  from datetime import datetime, timezone
  from pathlib import Path
+ import tzlocal
+
+ from cli_logging.console_diagnostics import render_diagnostic, serialize_diagnostic_record
+ from cli_logging.logging_utils import LOG_LEVEL_MAP, LogLevel, Operation, format_structured_log_entry, resolve_log_level

  # Allowed endpoints for URL validation
  ALLOWED_ENDPOINTS = [
@@ -46,20 +73,63 @@ VERSION_CHECK_BYPASS_FLAGS = (
  "signout",
  )

+ CLI_LOGGER_NAME = "m365.eval.cli"
+ CLI_LOGGER = logging.getLogger(CLI_LOGGER_NAME)
+ DIAGNOSTIC_RECORDS: List[Dict[str, Any]] = []
+
+
+ def _ensure_logger_handler() -> None:
+ if CLI_LOGGER.handlers:
+ return
+ handler = logging.StreamHandler(sys.stdout)
+ handler.setFormatter(logging.Formatter("%(message)s"))
+ CLI_LOGGER.addHandler(handler)
+ CLI_LOGGER.propagate = False
+
+
+ def configure_cli_logging(effective_log_level: str) -> None:
+ _ensure_logger_handler()
+ CLI_LOGGER.setLevel(LOG_LEVEL_MAP[effective_log_level])
+
+
+ def emit_structured_log(level: str, message: str, operation: str = Operation.EVALUATE) -> None:
+ _ensure_logger_handler()
+ context = {
+ "request-id": None,
+ "conversation-id": None,
+ "message-id": None,
+ "operation": operation,
+ }
+ entry = format_structured_log_entry(
+ level=level,
+ message=message,
+ logger_name=CLI_LOGGER_NAME,
+ run_context=context,
+ )
+ DIAGNOSTIC_RECORDS.append(entry)
+
+ try:
+ CLI_LOGGER.log(LOG_LEVEL_MAP.get(level, logging.INFO), render_diagnostic(entry))
+ except Exception:
+ pass
+

  def should_bypass_min_version_check(args: argparse.Namespace) -> bool:
  """Return True if the current invocation should skip min-version checks."""
  return any(getattr(args, flag, False) for flag in VERSION_CHECK_BYPASS_FLAGS)

- def write_results_to_html(results: List[Dict], output_file: str):
+ def write_results_to_html(results: List[Dict], output_file: str,
+ agent_name: Optional[str] = None, agent_id: Optional[str] = None,
+ cli_version: Optional[str] = None):
  """Write results to HTML file using generate_html_report from generate_report.py."""
  try:
- html = generate_html_report(results)
+ html = generate_html_report(results, agent_name=agent_name, agent_id=agent_id,
+ cli_version=cli_version)
  with open(output_file, 'w', encoding='utf-8') as f:
  f.write(html)
- print(f"HTML report saved to {output_file}")
+ emit_structured_log("info", f"HTML report saved to {output_file}", operation=Operation.WRITE_OUTPUT)
  except Exception as e:
- print(f"Error writing to HTML file: {e}")
+ emit_structured_log("error", f"Error writing to HTML file: {e}", operation=Operation.WRITE_OUTPUT)
  sys.exit(1)

  def get_default_prompts_and_responses():
@@ -72,7 +142,7 @@ def get_default_prompts_and_responses():
  ]
  return prompts, expected_responses

- def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
+ def load_prompts_from_file(file_path: str) -> Tuple[List[Dict], Optional[Dict]]:
  """Load prompts and expected responses from a JSON file.

  Supports three formats:
@@ -82,6 +152,10 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:

  For eval documents (format 1) and array format (format 2), schema validation
  and auto-upgrade are applied via DocumentUpgrader.
+
+ Returns:
+ Tuple of (eval_items, default_evaluators). Items are dicts with prompt,
+ expected_response, and optional evaluators/evaluators_mode fields.
  """
  try:
  with open(file_path, 'r', encoding='utf-8') as f:
@@ -98,18 +172,18 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
  upgrader = DocumentUpgrader()
  except Exception as e:
  # Schema infrastructure not available (missing files, etc.) — skip
- print(f"Warning: Unable to initialize document upgrader: {e}")
+ emit_structured_log("warning", f"Unable to initialize document upgrader: {e}", operation=Operation.LOAD_PROMPTS)
  upgrader = None

  if upgrader is not None:
  result = upgrader.upgrade(Path(file_path))

  if result.error:
- print(f"Schema validation error: {result.error}")
+ emit_structured_log("error", f"Schema validation error: {result.error}", operation=Operation.LOAD_PROMPTS)
  sys.exit(1)

  if result.upgraded and result.message:
- print(result.message)
+ emit_structured_log("info", result.message, operation=Operation.LOAD_PROMPTS)

  # Use the parsed document from the upgrade result
  if result.document is not None:
@@ -117,26 +191,26 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:

  if isinstance(data, list):
  # Format: [{"prompt": "...", "expected_response": "..."}, ...]
- prompts = [item.get("prompt", "") for item in data]
- expected_responses = [item.get("expected_response", "") for item in data]
+ return data, None
  elif isinstance(data, dict):
  if "items" in data:
  # Eval document format: {"schemaVersion": "...", "items": [...]}
- items = data["items"]
- prompts = [item.get("prompt", "") for item in items]
- expected_responses = [item.get("expected_response", "") for item in items]
+ return data["items"], data.get("default_evaluators")
  else:
  # Format: {"prompts": [...], "expected_responses": [...]}
  prompts = data.get("prompts", [])
  expected_responses = data.get("expected_responses", [])
+ eval_items = [
+ {"prompt": p, "expected_response": e}
+ for p, e in zip(prompts, expected_responses)
+ ]
+ return eval_items, None
  else:
  raise ValueError("Invalid file format")
-
- return prompts, expected_responses
  except SystemExit:
  raise
  except Exception as e:
- print(f"Error loading prompts from file: {e}")
+ emit_structured_log("error", f"Error loading prompts from file: {e}", operation=Operation.LOAD_PROMPTS)
  sys.exit(1)

  def get_interactive_prompts() -> Tuple[List[str], List[str]]:
@@ -163,116 +237,168 @@ def get_interactive_prompts() -> Tuple[List[str], List[str]]:

  return prompts, expected_responses

- def run_evaluations(args, responses: dict, expected_responses: list) -> list:
- """Run evaluations against the responses."""
+ def run_evaluations(args, responses: List[Dict[str, Any]], eval_items: List[Dict],
+ default_evaluators: Dict[str, Any]) -> list:
+ """Run evaluations against the responses using per-prompt evaluator resolution.
+
+ Args:
+ args: CLI arguments.
+ responses: List of enhanced response dicts (one per prompt, aligned with eval_items by index).
+ eval_items: List of item dicts (prompt, expected_response, evaluators, evaluators_mode).
+ default_evaluators: Resolved default evaluators (from resolve_default_evaluators).
+ """
+ if len(responses) != len(eval_items):
+ raise ValueError(
+ f"Mismatch between number of responses ({len(responses)}) and evaluation items ({len(eval_items)})."
+ )
+
  model_config = AzureOpenAIModelConfiguration(
  azure_endpoint=os.environ.get("AZURE_AI_OPENAI_ENDPOINT"),
  api_key=os.environ.get("AZURE_AI_API_KEY"),
  api_version=os.environ.get("AZURE_AI_API_VERSION"),
  azure_deployment=os.environ.get("AZURE_AI_MODEL_NAME"),
  )
-
- # Initialize evaluators
- relevance_evaluator = RelevanceEvaluator(model_config=model_config) # Evaluate relevance for a given response. Range is 1 - 5.
- coherence_evaluator = CoherenceEvaluator(model_config=model_config) # Measures the coherence (human-like quality) of the response. Range is 1 - 5.
- groundedness_evaluator = GroundednessEvaluator(model_config=model_config) # Evaluates the response for for factuality and groundedness against provided context. Range is 1 - 5.
- #concisenessnonllm_evaluator = ConcisenessNonLLMEvaluator() # Evaluates the response for conciseness. Range is 1 - 5.
- #pii_evaluator = PIIEvaluator(model_config=model_config) # Evaluates the response for presence of PII. Range
- # Parse citation format from args
- citation_format = CitationFormat.OAI_UNICODE if args.citation_format == 'oai_unicode' else CitationFormat.LEGACY_BRACKET
- citations_evaluator = CitationsEvaluator(citation_format=citation_format) # Evaluates citations present in the response using regex pattern matching
-
- tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config) # Evaluate tool call accuracy if tool definitions are present in response
-

- PASS_THRESHOLD = 3 # All evaluators must meet or exceed this value (out of 5) to pass
+ # Build available context for prerequisite checks
+ has_azure_openai = bool(
+ os.environ.get("AZURE_AI_OPENAI_ENDPOINT")
+ and os.environ.get("AZURE_AI_API_KEY")
+ )
+
+ DEFAULT_PASS_THRESHOLD = 3

- def decorate_metric(metric_id: str, data):
+ def decorate_metric(metric_id: str, data, threshold: Optional[int] = None):
  """Augment raw evaluator output with standardized threshold + pass/fail result."""
+ pass_threshold = threshold if threshold is not None else DEFAULT_PASS_THRESHOLD
  payload = {}
- # Preserve original structure if dict
  if isinstance(data, dict):
  payload.update(data)
  else:
  payload['raw'] = data

- # Try to extract a numeric score
  score_val = None
  if isinstance(data, dict):
- for k in (metric_id, f"{metric_id}_score", 'score', 'value'):
- if k in data:
- score_val = data[k]
- break
+ if metric_id in data:
+ score_val = data[metric_id]
  if isinstance(score_val, (int, float)):
- payload['threshold'] = PASS_THRESHOLD
- payload['result'] = 'pass' if score_val >= PASS_THRESHOLD else 'fail'
+ payload['threshold'] = pass_threshold
+ payload['result'] = 'pass' if score_val >= pass_threshold else 'fail'
  else:
- # If we cannot determine score, mark unknown (no pass/fail)
- payload['threshold'] = PASS_THRESHOLD
+ payload['threshold'] = pass_threshold
  payload.setdefault('result', 'unknown')
  return json.dumps(payload, indent=4)

+ # Validate all evaluator names upfront (across defaults and all items)
+ all_evaluator_maps = [default_evaluators]
+ for eval_item in eval_items:
+ if "evaluators" in eval_item:
+ all_evaluator_maps.append(eval_item["evaluators"])
+ for emap in all_evaluator_maps:
+ validate_evaluator_names(emap)
+
  evaluation_results = []
- for prompt, expected_response in zip(responses.keys(), expected_responses):
- # Extract text response for evaluation (backward compatibility)
- enhanced_response = responses[prompt]
+ for enhanced_response, eval_item in zip(responses, eval_items):
  actual_response_text = get_response_text_for_evaluation(enhanced_response)
-
- # Run evaluations using text response
- relevance_score = relevance_evaluator(
- query=prompt,
- response=actual_response_text
- )
- coherence_score = coherence_evaluator(
- query=prompt,
- response=actual_response_text
- )
-
- groundedness_score = groundedness_evaluator(
- response=actual_response_text,
- context=expected_response
+ prompt = eval_item.get("prompt", "")
+ expected_response = eval_item.get("expected_response", "")
+ prompt_evaluators = eval_item.get("evaluators")
+ evaluators_mode = eval_item.get("evaluators_mode", "extend")
+
+ # Resolve evaluators for this prompt
+ resolved = resolve_evaluators_for_prompt(
+ prompt_evaluators, evaluators_mode, prompt, default_evaluators,
  )

- #PII_score = pii_evaluator(response=actual_response_text)
- #concisenessNonLLM_score = concisenessnonllm_evaluator(response=actual_response_text)
-
- citations_score = citations_evaluator(
- response=actual_response_text
+ # Build runtime context for prerequisite checks
+ has_tool_defs = bool(
+ args.m365_agent_id and enhanced_response.get("tool_definitions")
  )
+ available_context = {
+ REQUIRES_AZURE_OPENAI: has_azure_openai,
+ REQUIRES_TOOL_DEFINITIONS: has_tool_defs,
+ }

- tool_call_accuracy = None
- if args.m365_agent_id and enhanced_response.get("tool_definitions"):
- tool_call_accuracy = tool_call_accuracy_evaluator(
- query=prompt,
- response=enhanced_response.get("response", actual_response_text),
- tool_definitions=enhanced_response["tool_definitions"]
- )
+ results_dict: Dict[str, Optional[str]] = {}
+ evaluators_ran: List[str] = []
+
+ for eval_name, eval_options in resolved.items():
+ # Check prerequisites
+ can_run, warn_msg = check_prerequisites(eval_name, available_context)
+ if not can_run:
+ if warn_msg:
+ emit_structured_log("warning", f"Evaluator '{eval_name}' prerequisite check failed: {warn_msg}", operation=Operation.EVALUATE)
+ results_dict[eval_name] = None
+ continue
+
+ evaluators_ran.append(eval_name)
+ threshold = get_evaluator_threshold(eval_name, eval_options)
+
+ if eval_name == RELEVANCE:
+ raw_score = RelevanceEvaluator(model_config=model_config)(query=prompt, response=actual_response_text)
+ results_dict[RELEVANCE] = decorate_metric(METRIC_IDS[RELEVANCE], raw_score, threshold)
+ elif eval_name == COHERENCE:
+ raw_score = CoherenceEvaluator(model_config=model_config)(query=prompt, response=actual_response_text)
+ results_dict[COHERENCE] = decorate_metric(METRIC_IDS[COHERENCE], raw_score, threshold)
+ elif eval_name == GROUNDEDNESS:
+ raw_score = GroundednessEvaluator(model_config=model_config)(response=actual_response_text, context=expected_response)
+ results_dict[GROUNDEDNESS] = decorate_metric(METRIC_IDS[GROUNDEDNESS], raw_score, threshold)
+ elif eval_name == TOOL_CALL_ACCURACY:
+ raw_score = ToolCallAccuracyEvaluator(model_config)(
+ query=prompt,
+ response=enhanced_response.get("response", actual_response_text),
+ tool_definitions=enhanced_response["tool_definitions"],
+ )
+ results_dict[TOOL_CALL_ACCURACY] = decorate_metric(METRIC_IDS[TOOL_CALL_ACCURACY], raw_score, threshold)
+ elif eval_name == CITATIONS:
+ fmt_str = eval_options.get("citation_format", "oai_unicode")
+ fmt_map = {
+ "oai_unicode": CitationFormat.OAI_UNICODE,
+ "bracket": CitationFormat.LEGACY_BRACKET,
+ "mixed": CitationFormat.AUTO,
+ }
+ raw_score = CitationsEvaluator(citation_format=fmt_map.get(fmt_str, CitationFormat.OAI_UNICODE))(response=actual_response_text)
+ results_dict[CITATIONS] = decorate_metric(METRIC_IDS[CITATIONS], raw_score, threshold)
+ elif eval_name == EXACT_MATCH:
+ # ExactMatch is binary (match/no-match) — it includes its own result
+ # field, so we skip decorate_metric which assumes a numeric score.
+ case_sensitive = eval_options.get("case_sensitive", False)
+ raw_score = ExactMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response_text, expected_answer=expected_response)
+ results_dict[EXACT_MATCH] = json.dumps(raw_score, indent=4)
+ elif eval_name == PARTIAL_MATCH:
+ case_sensitive = eval_options.get("case_sensitive", False)
+ raw_score = PartialMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response_text, expected_answer=expected_response)
+ results_dict[PARTIAL_MATCH] = decorate_metric(METRIC_IDS[PARTIAL_MATCH], raw_score, threshold)

  evaluation_result = {
  "prompt": prompt,
- "response": actual_response_text, # Keep simple text for backward compatibility
+ "response": actual_response_text,
  "expected_response": expected_response,
- "results": {
- "relevance_score": decorate_metric("relevance", relevance_score),
- "coherence_score": decorate_metric("coherence", coherence_score),
- "groundedness_score": decorate_metric("groundedness", groundedness_score),
- #"concisenessnonllm_score": decorate_metric("concisenessnonllm", concisenessNonLLM_score),
- #"pii_score": decorate_metric("pii", PII_score),
- "citations_score": json.dumps(citations_score, indent=4),
- "tool_call_accuracy_score": json.dumps(tool_call_accuracy, indent=4) if tool_call_accuracy else None
- }
+ "evaluators_ran": evaluators_ran,
+ "results": results_dict,
  }

- if args.verbose:
- print(f".................................. Evaluation for prompt: {evaluation_result['prompt']} ..................................")
- print(f"Scores: {evaluation_result['results']}")
- print("...........................................................................................................................")
+ # Preserve evaluator config metadata for output
+ if "evaluators" in eval_item:
+ evaluation_result["evaluators"] = eval_item["evaluators"]
+ if "evaluators_mode" in eval_item:
+ evaluation_result["evaluators_mode"] = eval_item["evaluators_mode"]
+
+ if getattr(args, "effective_log_level", "info") == "debug":
+ emit_structured_log(
+ "debug",
+ f"Evaluation completed for prompt='{evaluation_result['prompt']}'. "
+ f"Evaluators: {', '.join(evaluators_ran)}. "
+ f"Scores: {evaluation_result['results']}",
+ operation=Operation.EVALUATE,
+ )

  evaluation_results.append(evaluation_result)
-
+
  return evaluation_results

- def write_results_to_console(results):
+ def write_results_to_console(results, agent_name: Optional[str] = None,
+ agent_id: Optional[str] = None,
+ cli_version: Optional[str] = None):
  """Write the response to console."""
  # ANSI color codes
  BOLD = '\033[1m'
@@ -284,47 +410,66 @@ def write_results_to_console(results):
  ORANGE = '\033[38;5;208m'
  RED = '\033[91m'
  RESET = '\033[0m'
-
+
+ # Show metadata
+ metadata_parts = []
+ if agent_name:
+ metadata_parts.append(f"Agent Name: {agent_name}")
+ if agent_id:
+ metadata_parts.append(f"Agent ID: {agent_id}")
+ if cli_version:
+ metadata_parts.append(f"CLI Version: {cli_version}")
+ if metadata_parts:
+ print(f"{BOLD}{CYAN}{' | '.join(metadata_parts)}{RESET}")
+ print()
+
  # Show aggregate statistics if multiple results
  if len(results) > 1:
  aggregates = calculate_aggregate_statistics(results)
  if aggregates:
- print(f"{BOLD}{BLUE}📊 Aggregate Statistics ({len(results)} prompts):{RESET}")
+ print(f"{BOLD}{BLUE}Aggregate Statistics ({len(results)} prompts):{RESET}")
  print(f"{BLUE}{'=' * 60}{RESET}")
-
+
  for metric_name, stats in aggregates.items():
  pass_color = GREEN if stats['pass_rate'] >= 80 else YELLOW if stats['pass_rate'] >= 60 else RED
- print(f"{BOLD}{CYAN}{metric_name}:{RESET}")
+ prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
+ total_prompts = stats.get('total_prompts', len(results))
+ print(f"{BOLD}{CYAN}{metric_name}:{RESET} ({prompts_evaluated}/{total_prompts} prompts)")
  print(f" Pass Rate: {pass_color}{stats['pass_rate']:.1f}%{RESET} ({stats['pass_count']}/{stats['total_evaluated']} passed)")
  print(f" Avg Score: {MAGENTA}{stats['avg_score']:.2f}{RESET}")
  if stats.get('threshold') is not None:
  print(f" Threshold: {YELLOW}{stats['threshold']}{RESET}")
  print()
-
+
  print(f"{BLUE}{'=' * 60}{RESET}")
  print()
-
- print(f"{BOLD}{BLUE}📝 Individual Results:{RESET}")
+
+ print(f"{BOLD}{BLUE}Individual Results:{RESET}")
  print(f"{BLUE}{'=' * 50}{RESET}")
  for i, result in enumerate(results, 1):
  print(f"{BOLD}{GREEN}Prompt {i}:{RESET} {result['prompt']}")
+
+ # Show which evaluators ran for this prompt
+ evaluators_ran = result.get('evaluators_ran', [])
+ if evaluators_ran:
+ print(f"{BOLD}{CYAN}Evaluators:{RESET} {', '.join(evaluators_ran)}")
+
  print(f"{BOLD}{CYAN}Response:{RESET} {result['response']}")
  print(f"{BOLD}{YELLOW}Expected Response:{RESET} {result['expected_response']}")

- # Print metric scores generically from nested results (fallback to flat keys for back-compat)
- metrics = result.get('results') or {k: v for k, v in result.items() if isinstance(k, str) and k.endswith('_score')}
- if metrics:
- for k, v in metrics.items():
- name = k.replace('_', ' ')
- if 'relevance' in k:
- color = MAGENTA
- elif 'coherence' in k:
- color = ORANGE
- elif 'fluency' in k:
- color = GREEN
- else:
- color = BLUE
- print(f"{BOLD}{color}{name}:{RESET} {v}")
+ # Print metric scores from results
+ metrics = result.get('results', {})
+ for eval_name, v in metrics.items():
+ if v is None:
+ continue # Skip null/N/A scores from skipped evaluators
+ display_name = pascal_case_to_title(eval_name)
+ if eval_name == RELEVANCE:
+ color = MAGENTA
+ elif eval_name == COHERENCE:
+ color = ORANGE
+ else:
+ color = BLUE
+ print(f"{BOLD}{color}{display_name}:{RESET} {v}")
  print(f"{BLUE}{'-' * 30}{RESET}")

  def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
@@ -336,10 +481,8 @@ def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
  DEFAULT_THRESHOLD = 3 # fallback; decorate_metric should always set this

  score_val = None
- for k in (metric_id, f"{metric_id}_score", "score", "value"):
- if k in data and isinstance(data[k], (int, float)):
- score_val = data[k]
- break
+ if metric_id in data and isinstance(data[metric_id], (int, float)):
+ score_val = data[metric_id]
  if score_val is None:
  return None

@@ -362,9 +505,11 @@ def convert_result_to_eval_item(result: Dict) -> Dict:
  """Convert an internal evaluation result dict to a schema-compliant EvalItem.

  Internal format (from run_evaluations):
- {prompt, response, expected_response, results: {relevance_score: "JSON", ...}}
+ {prompt, response, expected_response, results: {Relevance: "JSON", ...},
+ evaluators_ran: [...], evaluators: {...}, evaluators_mode: "..."}
  Schema EvalItem format:
- {prompt, response, expected_response, scores: {relevance: EvalScore, ...}}
+ {prompt, response, expected_response, scores: {relevance: EvalScore, ...},
+ evaluators: {...}, evaluators_mode: "..."}
  """
  item: Dict[str, Any] = {
  "prompt": result["prompt"],
@@ -372,30 +517,35 @@ def convert_result_to_eval_item(result: Dict) -> Dict:
  "expected_response": result["expected_response"],
  }

+ # Preserve evaluator config in output
+ if "evaluators" in result:
+ item["evaluators"] = result["evaluators"]
+ if "evaluators_mode" in result:
+ item["evaluators_mode"] = result["evaluators_mode"]
+
  scores: Dict[str, Any] = {}
  results_dict = result.get("results", {})

  # EvalScore metrics (all share the same schema shape: {score, result, threshold})
- # Tuple: (internal results key, metric ID for score lookup, schema output key)
- for internal_key, metric_id, schema_key in [
- ("relevance_score", "relevance", "relevance"),
- ("coherence_score", "coherence", "coherence"),
- ("groundedness_score", "groundedness", "groundedness"),
- ("tool_call_accuracy_score", "tool_call_accuracy", "toolCallAccuracy"),
+ for eval_key, schema_key in [
+ (RELEVANCE, "relevance"),
+ (COHERENCE, "coherence"),
+ (GROUNDEDNESS, "groundedness"),
+ (TOOL_CALL_ACCURACY, "toolCallAccuracy"),
  ]:
- raw = results_dict.get(internal_key)
+ raw = results_dict.get(eval_key)
  if not raw:
  continue
  data = json.loads(raw) if isinstance(raw, str) else raw
- eval_score = extract_eval_score(data, metric_id)
+ eval_score = extract_eval_score(data, METRIC_IDS[eval_key])
  if eval_score:
  scores[schema_key] = eval_score

- # Citations → CitationScore (different schema shape: {count, result, threshold} + format)
- raw_citations = results_dict.get("citations_score")
+ # Citations → CitationScore
+ raw_citations = results_dict.get(CITATIONS)
  if raw_citations:
  data = json.loads(raw_citations) if isinstance(raw_citations, str) else raw_citations
- count = data.get("score", 0)
+ count = data.get("citations", 0)
  cit_result = data.get("result")
  if cit_result not in ("pass", "fail"):
  cit_result = "pass" if count >= data.get("threshold", 1) else "fail"
@@ -409,17 +559,42 @@ def convert_result_to_eval_item(result: Dict) -> Dict:
  citation_score["format"] = data["citation_format"]
  scores["citations"] = citation_score

+ # ExactMatch → ExactMatchScore
+ raw_exact = results_dict.get(EXACT_MATCH)
+ if raw_exact:
+ data = json.loads(raw_exact) if isinstance(raw_exact, str) else raw_exact
+ is_match = data.get("exact_match", 0.0) == 1.0
+ scores["exactMatch"] = {
+ "match": is_match,
+ "result": data.get("result", "pass" if is_match else "fail"),
+ "reason": data.get("exact_match_reason", ""),
+ }
+
+ # PartialMatch → PartialMatchScore
+ raw_partial = results_dict.get(PARTIAL_MATCH)
+ if raw_partial:
+ data = json.loads(raw_partial) if isinstance(raw_partial, str) else raw_partial
+ scores["partialMatch"] = {
+ "score": data.get("partial_match", 0.0),
+ "result": data.get("result", "fail"),
+ "threshold": data.get("threshold", 0.5),
+ "reason": data.get("partial_match_reason", ""),
+ }
+
  if scores:
  item["scores"] = scores

  return item


- def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None):
+ def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None,
+ default_evaluators: Optional[Dict[str, Any]] = None,
+ agent_name: Optional[str] = None,
+ cli_version: Optional[str] = None):
  """Write results to a schema-compliant eval document JSON file.

  Output follows the eval-document.schema.json format:
- {schemaVersion, metadata, items: [EvalItem]}
+ {schemaVersion, metadata, default_evaluators?, items: [EvalItem]}
  """
  try:
  try:
@@ -434,43 +609,68 @@ def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optio
  }
  if agent_id:
  metadata["agentId"] = agent_id
+ if agent_name:
+ metadata["agentName"] = agent_name
+ if cli_version:
+ metadata["cliVersion"] = cli_version

  output_data: Dict[str, Any] = {
  "schemaVersion": current_version,
  "metadata": metadata,
- "items": items,
  }

+ if default_evaluators is not None:
+ output_data["default_evaluators"] = default_evaluators
+
+ output_data["items"] = items
+
  with open(output_file, 'w', encoding='utf-8') as f:
  json.dump(output_data, f, indent=2, ensure_ascii=False)
- print(f"Results saved to {output_file}")
+ emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
  except Exception as e:
- print(f"Error writing to JSON file: {e}")
+ emit_structured_log("error", f"Error writing to JSON file: {e}", operation=Operation.WRITE_OUTPUT)
  sys.exit(1)

- def write_results_to_csv(results: List[Dict], output_file: str):
+ def write_results_to_csv(results: List[Dict], output_file: str,
+ agent_name: Optional[str] = None, agent_id: Optional[str] = None,
+ cli_version: Optional[str] = None):
  """Write results to CSV file."""
  try:
  with open(output_file, 'w', newline='', encoding='utf-8') as f:
  if results:
+ # Write metadata header
+ metadata_parts = []
+ if agent_name:
+ metadata_parts.append(f"Agent Name: {agent_name}")
+ if agent_id:
+ metadata_parts.append(f"Agent ID: {agent_id}")
+ if cli_version:
+ metadata_parts.append(f"CLI Version: {cli_version}")
+ if metadata_parts:
+ f.write(f"# {' | '.join(metadata_parts)}\n")
+
  # Write aggregate statistics first if multiple results
  if len(results) > 1:
  aggregates = calculate_aggregate_statistics(results)
  if aggregates:
  f.write("# AGGREGATE STATISTICS\n")
- f.write("Metric,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
+ f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
  for metric_name, stats in aggregates.items():
  threshold_str = str(stats.get('threshold', 'N/A'))
- f.write(f"{metric_name},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
+ prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
+ total_prompts = stats.get('total_prompts', len(results))
+ f.write(f"{metric_name},{prompts_evaluated},{total_prompts},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
  f.write("\n# INDIVIDUAL RESULTS\n")
-
- # Write individual results
- writer = csv.DictWriter(f, fieldnames=results[0].keys())
+
+ # Write individual results (exclude internal fields)
+ exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode'}
+ fieldnames = [k for k in results[0].keys() if k not in exclude_keys]
+ writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
  writer.writeheader()
  writer.writerows(results)
- print(f"Results saved to {output_file}")
+ emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
  except Exception as e:
- print(f"Error writing to CSV file: {e}")
+ emit_structured_log("error", f"Error writing to CSV file: {e}", operation=Operation.WRITE_OUTPUT)
  sys.exit(1)

  def parse_arguments():
@@ -501,8 +701,8 @@ Examples:
  # Save results to HTML and open in browser
  python main.py --output report.html

- # Verbose output
- python main.py --verbose
+ # Debug-level diagnostics
+ python main.py --log-level debug

  # Sign out and clear cached authentication tokens
  python main.py --signout
@@ -551,21 +751,13 @@ Examples:

  # Behavior options
  parser.add_argument(
- '--verbose',
- action='store_true',
- help='Enable verbose output'
- )
- parser.add_argument(
- '--quiet',
- action='store_true',
- help='Suppress non-essential output'
- )
- parser.add_argument(
- '--citation-format',
- choices=['oai_unicode', 'legacy_bracket'],
- default='oai_unicode',
- help='Citation format to detect. "oai_unicode" for new OAI format (default), "legacy_bracket" for old [^i^] format'
+ '--log-level',
+ nargs='?',
+ const='info',
+ action='append',
+ help='Set log verbosity: debug, info, warning, error. Bare --log-level resolves to info.'
  )
+
  parser.add_argument(
  '--signout',
  action='store_true',
@@ -598,8 +790,13 @@ def validate_environment() -> CallPath:

  missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
  if missing_vars:
- print(f"Error: Missing required environment variables: {', '.join(missing_vars)}")
- print("Please ensure your .env file contains all required Azure configuration.")
+ emit_structured_log(
+ "error",
+ "Missing required environment variables: "
+ f"{', '.join(missing_vars)}. Please ensure your .env file contains "
+ "all required Azure configuration.",
+ operation=Operation.VALIDATE_ENV,
+ )
  sys.exit(1)
  return call_path

@@ -633,23 +830,42 @@ def validate_endpoint_url(url: str, allowed_domains: List[str]) -> bool:
  # Convert other parsing errors to ValueError
  raise ValueError(f"Invalid URL format: {url}") from e

- def get_prompt_datasets(args) -> Tuple[List[str], List[str]]:
- """Get prompts and expected responses based on command line arguments."""
+ def get_prompt_datasets(args) -> Tuple[List[Dict], Optional[Dict]]:
+ """Get prompts and expected responses based on command line arguments.
+
+ Returns:
+ Tuple of (eval_items, default_evaluators).
+ """
  if args.prompts:
  if args.expected and len(args.prompts) != len(args.expected):
- print("Error: Number of prompts must match number of expected responses")
+ emit_structured_log(
+ "error",
+ "Number of prompts must match number of expected responses. "
+ "Update --expected values to match the prompt count.",
+ )
  sys.exit(1)
- prompts = args.prompts
- expected_responses = args.expected or [""] * len(prompts)
+ expected_responses = args.expected or [""] * len(args.prompts)
+ eval_items = [
+ {"prompt": p, "expected_response": e}
+ for p, e in zip(args.prompts, expected_responses)
+ ]
+ return eval_items, None
  elif args.prompts_file:
- prompts, expected_responses = load_prompts_from_file(args.prompts_file)
+ return load_prompts_from_file(args.prompts_file)
  elif args.interactive:
  prompts, expected_responses = get_interactive_prompts()
+ eval_items = [
+ {"prompt": p, "expected_response": e}
+ for p, e in zip(prompts, expected_responses)
+ ]
+ return eval_items, None
  else:
- # Use default prompts
  prompts, expected_responses = get_default_prompts_and_responses()
-
- return prompts, expected_responses
+ eval_items = [
+ {"prompt": p, "expected_response": e}
+ for p, e in zip(prompts, expected_responses)
+ ]
+ return eval_items, None

  def fetch_available_agents(copilot_api_endpoint: str, access_token: str, user_oid: str) -> List[Dict[str, Any]]:
  """
@@ -685,26 +901,27 @@ def fetch_available_agents(copilot_api_endpoint: str, access_token: str, user_oi
  return agents
  except urllib.error.HTTPError as e:
  # If endpoint doesn't exist or returns error, return empty list
- print(f"Warning: Unable to fetch agents list (HTTP {e.code}).")
+ emit_structured_log("warning", f"Unable to fetch agents list (HTTP {e.code}).", operation=Operation.FETCH_AGENTS)
  return []
  except Exception as e:
- print(f"Warning: Error fetching agents: {e}")
+ emit_structured_log("warning", f"Error fetching agents: {e}", operation=Operation.FETCH_AGENTS)
  return []

- def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
+ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Tuple[Optional[str], Optional[str]]:
  """
  Display an interactive agent selector using questionary.
-
+
  Args:
  agents: List of agent dictionaries.
-
+
  Returns:
- Selected agent ID or None if cancelled/skipped
+ Tuple of (agent_id, agent_name) or (None, None) if cancelled/skipped
  """
  if not agents:
- return None
-
- # Create choices for questionary
+ return None, None
+
+ # Build id→name lookup and choices
+ id_to_name: Dict[str, str] = {}
  choices = []
  sorted_agents = sorted(agents, key=lambda x: (not x.get('isOwner', False), x.get('name', '')))
  for agent in sorted_agents:
@@ -712,12 +929,13 @@ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
  agent_name = agent.get("name", "Unknown")
  agent_description = agent.get("description", "Unknown")
  agent_is_owner = agent.get('isOwner')
-
+ id_to_name[agent_id] = agent_name
+
  # Format the display text
  display_text = f"{agent_name} ({agent_id}, IsOwner: {agent_is_owner}) - {agent_description}"
-
+
  choices.append(questionary.Choice(title=display_text, value=agent_id))
-
+
  # Display the selection prompt
  selected_agent = questionary.select(
  "Select an agent to evaluate:",
@@ -725,8 +943,37 @@ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
  use_shortcuts=True,
  use_arrow_keys=True
  ).ask()
-
- return selected_agent
+
+ return selected_agent, id_to_name.get(selected_agent) if selected_agent else None
+
+ @functools.lru_cache(maxsize=1)
+ def _get_iana_timezone_name() -> str:
+ """Get the IANA timezone name from the system using tzlocal.
+
+ Tries get_localzone_name() first; falls back to str(get_localzone()) when the
+ former raises (e.g. no zone configured on some Unix systems). Result is cached
+ after the first call so tzlocal is only invoked once per session.
+ """
+ try:
+ return tzlocal.get_localzone_name()
+ except Exception:
+ return str(tzlocal.get_localzone())
+
+
+ @functools.lru_cache(maxsize=1)
+ def _get_location_info() -> Dict[str, Any]:
+ """Return a locationInfo dict containing the local UTC offset and IANA timezone name.
+
+ Result is cached after the first call so the computation runs only once per session.
+ """
+ now = datetime.now().astimezone()
+ utc_offset = now.utcoffset()
+ offset_hours = int(utc_offset.total_seconds() // 3600) if utc_offset is not None else 0
+ return {
+ "timeZoneOffset": offset_hours,
+ "timeZone": _get_iana_timezone_name(),
+ }
+

  def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:
  message = {
@@ -735,6 +982,7 @@ def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:
  "author": "user",
  "messageType": "chat",
  "timestamp": datetime.now(timezone.utc).isoformat(),
+ "locationInfo": _get_location_info(),
  "from": {
  "id": user_oid,
  }
@@ -755,7 +1003,7 @@ def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:

  return json.dumps(message).encode("utf-8")

- def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str, access_token: str, user_oid: str, args) -> Dict[str, Dict[str, any]]:
+ def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str, access_token: str, user_oid: str, args) -> List[Dict[str, Any]]:
  """ Send prompts to the chat API and return enhanced responses. """

  request_headers = {
@@ -763,15 +1011,15 @@ def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str
  "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
  "Authorization": f"Bearer {access_token}"
  }
- raw_responses: Dict[str, str] = {}
+ raw_responses: List[Tuple[str, str]] = []
  for i, prompt in enumerate(prompts, 1):
- if not args.quiet:
- print(f"Processing prompt {i}/{len(prompts)}...")
+ if getattr(args, "effective_log_level", "info") in ("info", "debug"):
+ emit_structured_log("info", f"Processing prompt {i}/{len(prompts)}.", operation=Operation.SEND_PROMPT)

  # Build the payload
  payload = build_chat_payload(prompt, user_oid, args.m365_agent_id)
- if args.verbose:
- print(f"[Sydney] Sending payload: {payload.decode('utf-8')}")
+ if getattr(args, "effective_log_level", "info") == "debug":
+ emit_structured_log("debug", f"[Sydney] Sending payload: {payload.decode('utf-8')}", operation=Operation.SEND_PROMPT)

  # Send the request to /chat
  req = urllib.request.Request(f"{copilot_api_endpoint}/chat", data=payload, headers=request_headers, method="POST")
@@ -790,42 +1038,83 @@ def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str
  raise RuntimeError(msg) from e
  except urllib.error.URLError as e:
  raise RuntimeError(f"Chat API connection error: {getattr(e, 'reason', str(e))}") from e
-
- if args.verbose:
- print(f"[Sydney] Raw response: {raw}")
-
+
+ if getattr(args, "effective_log_level", "info") == "debug":
+ emit_structured_log("debug", f"[Sydney] Raw response: {raw}", operation=Operation.SEND_PROMPT)
+
  # Store raw response for enhancement
- raw_responses[prompt] = raw.strip()
-
+ raw_responses.append((prompt, raw.strip()))
+
  # Extract enhanced responses using the new extractor
- enhanced_responses = extract_enhanced_responses(raw_responses)
+ enhanced_responses = extract_enhanced_responses(raw_responses, log_level=getattr(args, "effective_log_level", "info"))
+
+ if getattr(args, "effective_log_level", "info") == "debug":
+ for idx, enhanced in enumerate(enhanced_responses, 1):
+ metadata = enhanced.get("metadata", {})
+ context = {
+ "request-id": metadata.get("request_id"),
+ "conversation-id": metadata.get("conversation_id"),
+ "message-id": metadata.get("message_id"),
+ "operation": Operation.SEND_PROMPT,
+ }
+ entry = format_structured_log_entry(
+ level="debug",
+ message=f"Response IDs for prompt {idx}/{len(enhanced_responses)}.",
+ logger_name=CLI_LOGGER_NAME,
+ run_context=context,
+ )
+ DIAGNOSTIC_RECORDS.append(entry)
+ CLI_LOGGER.log(logging.DEBUG, render_diagnostic(entry))
+
  return enhanced_responses

- def output_results(results: List[Dict], args):
+ def output_results(results: List[Dict], args, default_evaluators: Optional[Dict[str, Any]] = None,
+ agent_name: Optional[str] = None, cli_version: Optional[str] = None):
  """Output results based on specified format."""
+ metadata_kwargs = dict(
+ agent_name=agent_name,
+ agent_id=getattr(args, 'm365_agent_id', None),
+ cli_version=cli_version,
+ )
  if args.output:
  output_lower = args.output.lower()
  if output_lower.endswith('.json'):
- write_results_to_json(results, args.output, agent_id=getattr(args, 'm365_agent_id', None))
+ write_results_to_json(results, args.output, default_evaluators=default_evaluators,
+ **metadata_kwargs)
  elif output_lower.endswith('.csv'):
- write_results_to_csv(results, args.output)
+ write_results_to_csv(results, args.output, **metadata_kwargs)
  elif output_lower.endswith('.html'):
- write_results_to_html(results, args.output)
+ write_results_to_html(results, args.output, **metadata_kwargs)
  abs_path = os.path.abspath(args.output)
  webbrowser.open(f'file://{abs_path}')
  else:
- write_results_to_json(results, args.output, agent_id=getattr(args, 'm365_agent_id', None))
+ write_results_to_json(results, args.output, default_evaluators=default_evaluators,
+ **metadata_kwargs)
  else:
- write_results_to_console(results)
+ write_results_to_console(results, **metadata_kwargs)

  def main():
  """Main function to orchestrate the evaluation process."""
  load_dotenv()
  args = parse_arguments()

+ effective_log_level, error_message = resolve_log_level(args.log_level)
+ if error_message:
+ print(error_message)
+ print(
+ "Next step: rerun with --log-level {debug|info|warning|error}. "
+ "For support, share the console diagnostics output from this run."
+ )
+ sys.exit(2)
+
+ args.effective_log_level = effective_log_level
+ configure_cli_logging(effective_log_level)
+ emit_structured_log("info", f"Log level set to '{effective_log_level}'.", operation=Operation.SETUP)
+
  # Check minimum version before proceeding
- cli_version = get_cli_version(quiet=args.quiet)
- if not should_bypass_min_version_check(args) and not check_min_version(cli_version, quiet=args.quiet):
+ quiet_for_version = effective_log_level in ("warning", "error")
+ cli_version = get_cli_version(quiet=quiet_for_version)
+ if not should_bypass_min_version_check(args) and not check_min_version(cli_version, quiet=quiet_for_version):
  sys.exit(1)

  # Validate environment variables required for evaluation
@@ -853,7 +1142,7 @@ def main():
  try:
  auth_handler.clear_cache()
  except Exception as e:
- print(f"Error during signout: {e}")
+ emit_structured_log("error", f"Error during signout: {e}", operation=Operation.AUTHENTICATE)
  sys.exit(1)
  sys.exit(0)

@@ -866,67 +1155,87 @@ def main():

  id_token_claims = auth_result.get("id_token_claims")
  if not isinstance(id_token_claims, dict):
- print("id_token_claims is missing or invalid in authentication result")
+ emit_structured_log(
+ "warning", "id_token_claims is missing or invalid in authentication result",
+ operation=Operation.AUTHENTICATE,
+ )
  else:
  user_oid = id_token_claims.get("oid") or ""

  except Exception as e:
- print(f"\033[91mError during authentication: {e}\033[0m")
- if args.verbose:
- import traceback
- traceback.print_exc()
- sys.exit(1)
+ emit_structured_log("error", f"Error during authentication: {e}", operation=Operation.AUTHENTICATE)
+ if effective_log_level == "debug":
+ import traceback
+ traceback.print_exc()
+ sys.exit(1)

  if not user_oid and access_token:
  # Fallback: extract from access token.
  user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)

- # 1. Load evaluation datasets (prompts and expected_responses)
- prompts, expected_responses = get_prompt_datasets(args)
+ # 1. Load evaluation datasets
+ eval_items, file_default_evaluators = get_prompt_datasets(args)
+ default_evaluators = resolve_default_evaluators(file_default_evaluators)
+ prompts = [eval_item.get("prompt", "") for eval_item in eval_items]

- if not args.quiet:
- print(f"Running evaluation on {len(prompts)} prompt(s)...")
+ if effective_log_level in ("info", "debug"):
+ emit_structured_log("info", f"Running evaluation on {len(prompts)} prompt(s).", operation=Operation.SETUP)

+ agent_name = None
  try:
  # 3. Agent selection - if no agent ID provided, prompt user to select
  if not args.m365_agent_id:
- if not args.quiet:
- print("No agent ID provided. Fetching available agents...")
+ if effective_log_level in ("info", "debug"):
+ emit_structured_log("info", "No agent ID provided. Fetching available agents.", operation=Operation.FETCH_AGENTS)

  available_agents = fetch_available_agents(copilot_api_endpoint, access_token, user_oid)
  if not available_agents:
- print("No agents are available for interactive selection. Please re-run with --m365-agent-id or set the M365_AGENT_ID environment variable.")
- sys.exit(1)
-
- if available_agents:
- selected_agent_id = select_agent_interactively(available_agents)
- if selected_agent_id:
- args.m365_agent_id = selected_agent_id
- if not args.quiet:
- print(f"Selected agent: {args.m365_agent_id}")
- else:
- print("No agent selected. Please re-run with --m365-agent-id or set the M365_AGENT_ID environment variable.")
- sys.exit(1)
+ emit_structured_log(
+ "error",
+ "No agents are available for interactive selection. Re-run with "
+ "--m365-agent-id or set M365_AGENT_ID.",
+ operation=Operation.FETCH_AGENTS,
+ )
+ sys.exit(1)
+
+ selected_agent_id, agent_name = select_agent_interactively(available_agents)
+ if selected_agent_id:
+ args.m365_agent_id = selected_agent_id
+ if effective_log_level in ("info", "debug"):
+ emit_structured_log("info", f"Selected agent: {args.m365_agent_id}", operation=Operation.FETCH_AGENTS)
+ else:
+ emit_structured_log(
+ "error",
+ "No agent selected. Re-run with --m365-agent-id or set M365_AGENT_ID.",
+ operation=Operation.FETCH_AGENTS,
+ )
+ sys.exit(1)

  # 4. Send prompts to chat API
  responses = send_prompt_to_agent_in_sydney(prompts, copilot_api_endpoint, access_token, user_oid, args)
  except Exception as e:
- print(f"\033[91mError sending prompts to chat API: {e}\033[0m")
- if args.verbose:
+ emit_structured_log("error", f"Error sending prompts to chat API: {e}", operation=Operation.SEND_PROMPT)
+ if effective_log_level == "debug":
  import traceback
  traceback.print_exc()
  sys.exit(1)
+

  # 5. Run evaluations
- if not args.quiet:
- print("Running evaluations...")
- results = run_evaluations(args, responses, expected_responses)
+ if effective_log_level in ("info", "debug"):
+ emit_structured_log("info", "Running evaluations.", operation=Operation.EVALUATE)
+ results = run_evaluations(args, responses, eval_items, default_evaluators)

  # 6. Output results
- output_results(results, args)
+ output_results(results, args, default_evaluators=default_evaluators,
+ agent_name=agent_name, cli_version=str(cli_version) if cli_version else None)

- if not args.quiet:
- print(f"\nEvaluation completed successfully! Processed {len(prompts)} prompt(s).")
+ if effective_log_level in ("info", "debug"):
+ emit_structured_log(
+ "info",
+ f"Evaluation completed successfully. Processed {len(prompts)} prompt(s).",
+ operation=Operation.EVALUATE,
+ )

  # Call the main function when script is run directly
  if __name__ == "__main__":