@microsoft/m365-copilot-eval 1.2.1-preview.1 → 1.3.0-preview.1

This diff shows the published contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
@@ -4,6 +4,7 @@ import argparse
 import sys
 import csv
 import functools
+import logging
 import webbrowser
 import urllib.request
 import urllib.error
@@ -21,16 +22,40 @@ from azure.ai.evaluation import (
 from dotenv import load_dotenv
 from auth.auth_handler import AuthHandler
 from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat
-#from custom_evaluators.ConcisenessNonLLMEvaluator import ConcisenessNonLLMEvaluator
-#from custom_evaluators.PII.PII import PIIEvaluator
+from custom_evaluators.ExactMatchEvaluator import ExactMatchEvaluator
+from custom_evaluators.PartialMatchEvaluator import PartialMatchEvaluator
 from generate_report import generate_html_report, calculate_aggregate_statistics
 from response_extractor import extract_enhanced_responses, get_response_text_for_evaluation
 from schema_handler import DocumentUpgrader, SchemaVersionManager
+from common import (
+    RELEVANCE,
+    COHERENCE,
+    GROUNDEDNESS,
+    TOOL_CALL_ACCURACY,
+    CITATIONS,
+    EXACT_MATCH,
+    PARTIAL_MATCH,
+    REQUIRES_AZURE_OPENAI,
+    REQUIRES_TOOL_DEFINITIONS,
+    METRIC_IDS,
+    pascal_case_to_title,
+)
+from evaluator_resolver import (
+    EVALUATOR_REGISTRY,
+    validate_evaluator_names,
+    check_prerequisites,
+    resolve_default_evaluators,
+    resolve_evaluators_for_prompt,
+    get_evaluator_threshold,
+)
 from version_check import check_min_version, get_cli_version
 from datetime import datetime, timezone
 from pathlib import Path
 import tzlocal

+from cli_logging.console_diagnostics import render_diagnostic, serialize_diagnostic_record
+from cli_logging.logging_utils import LOG_LEVEL_MAP, LogLevel, Operation, format_structured_log_entry, resolve_log_level
+
 # Allowed endpoints for URL validation
 ALLOWED_ENDPOINTS = [
     'substrate.office.com'
@@ -48,20 +73,63 @@ VERSION_CHECK_BYPASS_FLAGS = (
     "signout",
 )

+CLI_LOGGER_NAME = "m365.eval.cli"
+CLI_LOGGER = logging.getLogger(CLI_LOGGER_NAME)
+DIAGNOSTIC_RECORDS: List[Dict[str, Any]] = []
+
+
+def _ensure_logger_handler() -> None:
+    if CLI_LOGGER.handlers:
+        return
+    handler = logging.StreamHandler(sys.stdout)
+    handler.setFormatter(logging.Formatter("%(message)s"))
+    CLI_LOGGER.addHandler(handler)
+    CLI_LOGGER.propagate = False
+
+
+def configure_cli_logging(effective_log_level: str) -> None:
+    _ensure_logger_handler()
+    CLI_LOGGER.setLevel(LOG_LEVEL_MAP[effective_log_level])
+
+
+def emit_structured_log(level: str, message: str, operation: str = Operation.EVALUATE) -> None:
+    _ensure_logger_handler()
+    context = {
+        "request-id": None,
+        "conversation-id": None,
+        "message-id": None,
+        "operation": operation,
+    }
+    entry = format_structured_log_entry(
+        level=level,
+        message=message,
+        logger_name=CLI_LOGGER_NAME,
+        run_context=context,
+    )
+    DIAGNOSTIC_RECORDS.append(entry)
+
+    try:
+        CLI_LOGGER.log(LOG_LEVEL_MAP.get(level, logging.INFO), render_diagnostic(entry))
+    except Exception:
+        pass
+

 def should_bypass_min_version_check(args: argparse.Namespace) -> bool:
     """Return True if the current invocation should skip min-version checks."""
     return any(getattr(args, flag, False) for flag in VERSION_CHECK_BYPASS_FLAGS)

-def write_results_to_html(results: List[Dict], output_file: str):
+def write_results_to_html(results: List[Dict], output_file: str,
+                          agent_name: Optional[str] = None, agent_id: Optional[str] = None,
+                          cli_version: Optional[str] = None):
     """Write results to HTML file using generate_html_report from generate_report.py."""
     try:
-        html = generate_html_report(results)
+        html = generate_html_report(results, agent_name=agent_name, agent_id=agent_id,
+                                    cli_version=cli_version)
         with open(output_file, 'w', encoding='utf-8') as f:
             f.write(html)
-        print(f"HTML report saved to {output_file}")
+        emit_structured_log("info", f"HTML report saved to {output_file}", operation=Operation.WRITE_OUTPUT)
     except Exception as e:
-        print(f"Error writing to HTML file: {e}")
+        emit_structured_log("error", f"Error writing to HTML file: {e}", operation=Operation.WRITE_OUTPUT)
         sys.exit(1)

 def get_default_prompts_and_responses():
@@ -74,7 +142,7 @@ def get_default_prompts_and_responses():
     ]
     return prompts, expected_responses

-def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
+def load_prompts_from_file(file_path: str) -> Tuple[List[Dict], Optional[Dict]]:
     """Load prompts and expected responses from a JSON file.

     Supports three formats:
@@ -84,6 +152,10 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:

     For eval documents (format 1) and array format (format 2), schema validation
     and auto-upgrade are applied via DocumentUpgrader.
+
+    Returns:
+        Tuple of (eval_items, default_evaluators). Items are dicts with prompt,
+        expected_response, and optional evaluators/evaluators_mode fields.
     """
     try:
         with open(file_path, 'r', encoding='utf-8') as f:
@@ -100,18 +172,18 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
             upgrader = DocumentUpgrader()
         except Exception as e:
             # Schema infrastructure not available (missing files, etc.) — skip
-            print(f"Warning: Unable to initialize document upgrader: {e}")
+            emit_structured_log("warning", f"Unable to initialize document upgrader: {e}", operation=Operation.LOAD_PROMPTS)
             upgrader = None

         if upgrader is not None:
             result = upgrader.upgrade(Path(file_path))

             if result.error:
-                print(f"Schema validation error: {result.error}")
+                emit_structured_log("error", f"Schema validation error: {result.error}", operation=Operation.LOAD_PROMPTS)
                 sys.exit(1)

             if result.upgraded and result.message:
-                print(result.message)
+                emit_structured_log("info", result.message, operation=Operation.LOAD_PROMPTS)

             # Use the parsed document from the upgrade result
             if result.document is not None:
@@ -119,26 +191,26 @@ def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:

         if isinstance(data, list):
             # Format: [{"prompt": "...", "expected_response": "..."}, ...]
-            prompts = [item.get("prompt", "") for item in data]
-            expected_responses = [item.get("expected_response", "") for item in data]
+            return data, None
         elif isinstance(data, dict):
             if "items" in data:
                 # Eval document format: {"schemaVersion": "...", "items": [...]}
-                items = data["items"]
-                prompts = [item.get("prompt", "") for item in items]
-                expected_responses = [item.get("expected_response", "") for item in items]
+                return data["items"], data.get("default_evaluators")
             else:
                 # Format: {"prompts": [...], "expected_responses": [...]}
                 prompts = data.get("prompts", [])
                 expected_responses = data.get("expected_responses", [])
+                eval_items = [
+                    {"prompt": p, "expected_response": e}
+                    for p, e in zip(prompts, expected_responses)
+                ]
+                return eval_items, None
         else:
             raise ValueError("Invalid file format")
-
-        return prompts, expected_responses
     except SystemExit:
         raise
     except Exception as e:
-        print(f"Error loading prompts from file: {e}")
+        emit_structured_log("error", f"Error loading prompts from file: {e}", operation=Operation.LOAD_PROMPTS)
         sys.exit(1)

 def get_interactive_prompts() -> Tuple[List[str], List[str]]:
@@ -165,116 +237,168 @@ def get_interactive_prompts() -> Tuple[List[str], List[str]]:

     return prompts, expected_responses

-def run_evaluations(args, responses: dict, expected_responses: list) -> list:
-    """Run evaluations against the responses."""
+def run_evaluations(args, responses: List[Dict[str, Any]], eval_items: List[Dict],
+                    default_evaluators: Dict[str, Any]) -> list:
+    """Run evaluations against the responses using per-prompt evaluator resolution.
+
+    Args:
+        args: CLI arguments.
+        responses: List of enhanced response dicts (one per prompt, aligned with eval_items by index).
+        eval_items: List of item dicts (prompt, expected_response, evaluators, evaluators_mode).
+        default_evaluators: Resolved default evaluators (from resolve_default_evaluators).
+    """
+    if len(responses) != len(eval_items):
+        raise ValueError(
+            f"Mismatch between number of responses ({len(responses)}) and evaluation items ({len(eval_items)})."
+        )
+
     model_config = AzureOpenAIModelConfiguration(
         azure_endpoint=os.environ.get("AZURE_AI_OPENAI_ENDPOINT"),
         api_key=os.environ.get("AZURE_AI_API_KEY"),
         api_version=os.environ.get("AZURE_AI_API_VERSION"),
         azure_deployment=os.environ.get("AZURE_AI_MODEL_NAME"),
     )
-
-    # Initialize evaluators
-    relevance_evaluator = RelevanceEvaluator(model_config=model_config)  # Evaluate relevance for a given response. Range is 1 - 5.
-    coherence_evaluator = CoherenceEvaluator(model_config=model_config)  # Measures the coherence (human-like quality) of the response. Range is 1 - 5.
-    groundedness_evaluator = GroundednessEvaluator(model_config=model_config)  # Evaluates the response for for factuality and groundedness against provided context. Range is 1 - 5.
-    #concisenessnonllm_evaluator = ConcisenessNonLLMEvaluator()  # Evaluates the response for conciseness. Range is 1 - 5.
-    #pii_evaluator = PIIEvaluator(model_config=model_config)  # Evaluates the response for presence of PII. Range
-    # Parse citation format from args
-    citation_format = CitationFormat.OAI_UNICODE if args.citation_format == 'oai_unicode' else CitationFormat.LEGACY_BRACKET
-    citations_evaluator = CitationsEvaluator(citation_format=citation_format)  # Evaluates citations present in the response using regex pattern matching
-
-    tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config)  # Evaluate tool call accuracy if tool definitions are present in response
-

-    PASS_THRESHOLD = 3  # All evaluators must meet or exceed this value (out of 5) to pass
+    # Build available context for prerequisite checks
+    has_azure_openai = bool(
+        os.environ.get("AZURE_AI_OPENAI_ENDPOINT")
+        and os.environ.get("AZURE_AI_API_KEY")
+    )

-    def decorate_metric(metric_id: str, data):
+    DEFAULT_PASS_THRESHOLD = 3
+
+    def decorate_metric(metric_id: str, data, threshold: Optional[int] = None):
         """Augment raw evaluator output with standardized threshold + pass/fail result."""
+        pass_threshold = threshold if threshold is not None else DEFAULT_PASS_THRESHOLD
         payload = {}
-        # Preserve original structure if dict
         if isinstance(data, dict):
             payload.update(data)
         else:
             payload['raw'] = data

-        # Try to extract a numeric score
         score_val = None
         if isinstance(data, dict):
-            for k in (metric_id, f"{metric_id}_score", 'score', 'value'):
-                if k in data:
-                    score_val = data[k]
-                    break
+            if metric_id in data:
+                score_val = data[metric_id]
         if isinstance(score_val, (int, float)):
-            payload['threshold'] = PASS_THRESHOLD
-            payload['result'] = 'pass' if score_val >= PASS_THRESHOLD else 'fail'
+            payload['threshold'] = pass_threshold
+            payload['result'] = 'pass' if score_val >= pass_threshold else 'fail'
         else:
-            # If we cannot determine score, mark unknown (no pass/fail)
-            payload['threshold'] = PASS_THRESHOLD
+            payload['threshold'] = pass_threshold
            payload.setdefault('result', 'unknown')
         return json.dumps(payload, indent=4)

+    # Validate all evaluator names upfront (across defaults and all items)
+    all_evaluator_maps = [default_evaluators]
+    for eval_item in eval_items:
+        if "evaluators" in eval_item:
+            all_evaluator_maps.append(eval_item["evaluators"])
+    for emap in all_evaluator_maps:
+        validate_evaluator_names(emap)
+
     evaluation_results = []
-    for prompt, expected_response in zip(responses.keys(), expected_responses):
-        # Extract text response for evaluation (backward compatibility)
-        enhanced_response = responses[prompt]
+    for enhanced_response, eval_item in zip(responses, eval_items):
         actual_response_text = get_response_text_for_evaluation(enhanced_response)
-
-        # Run evaluations using text response
-        relevance_score = relevance_evaluator(
-            query=prompt,
-            response=actual_response_text
+        prompt = eval_item.get("prompt", "")
+        expected_response = eval_item.get("expected_response", "")
+        prompt_evaluators = eval_item.get("evaluators")
+        evaluators_mode = eval_item.get("evaluators_mode", "extend")
+
+        # Resolve evaluators for this prompt
+        resolved = resolve_evaluators_for_prompt(
+            prompt_evaluators, evaluators_mode, prompt, default_evaluators,
         )
-        coherence_score = coherence_evaluator(
-            query=prompt,
-            response=actual_response_text
-        )
-
-        groundedness_score = groundedness_evaluator(
-            response=actual_response_text,
-            context=expected_response
-        )
-
-        #PII_score = pii_evaluator(response=actual_response_text)
-        #concisenessNonLLM_score = concisenessnonllm_evaluator(response=actual_response_text)

-        citations_score = citations_evaluator(
-            response=actual_response_text
+        # Build runtime context for prerequisite checks
+        has_tool_defs = bool(
+            args.m365_agent_id and enhanced_response.get("tool_definitions")
         )
+        available_context = {
+            REQUIRES_AZURE_OPENAI: has_azure_openai,
+            REQUIRES_TOOL_DEFINITIONS: has_tool_defs,
+        }

-        tool_call_accuracy = None
-        if args.m365_agent_id and enhanced_response.get("tool_definitions"):
-            tool_call_accuracy = tool_call_accuracy_evaluator(
-                query=prompt,
-                response=enhanced_response.get("response", actual_response_text),
-                tool_definitions=enhanced_response["tool_definitions"]
-            )
+        results_dict: Dict[str, Optional[str]] = {}
+        evaluators_ran: List[str] = []
+
+        for eval_name, eval_options in resolved.items():
+            # Check prerequisites
+            can_run, warn_msg = check_prerequisites(eval_name, available_context)
+            if not can_run:
+                if warn_msg:
+                    emit_structured_log("warning", f"Evaluator '{eval_name}' prerequisite check failed: {warn_msg}", operation=Operation.EVALUATE)
+                results_dict[eval_name] = None
+                continue
+
+            evaluators_ran.append(eval_name)
+            threshold = get_evaluator_threshold(eval_name, eval_options)
+
+            if eval_name == RELEVANCE:
+                raw_score = RelevanceEvaluator(model_config=model_config)(query=prompt, response=actual_response_text)
+                results_dict[RELEVANCE] = decorate_metric(METRIC_IDS[RELEVANCE], raw_score, threshold)
+            elif eval_name == COHERENCE:
+                raw_score = CoherenceEvaluator(model_config=model_config)(query=prompt, response=actual_response_text)
+                results_dict[COHERENCE] = decorate_metric(METRIC_IDS[COHERENCE], raw_score, threshold)
+            elif eval_name == GROUNDEDNESS:
+                raw_score = GroundednessEvaluator(model_config=model_config)(response=actual_response_text, context=expected_response)
+                results_dict[GROUNDEDNESS] = decorate_metric(METRIC_IDS[GROUNDEDNESS], raw_score, threshold)
+            elif eval_name == TOOL_CALL_ACCURACY:
+                raw_score = ToolCallAccuracyEvaluator(model_config)(
+                    query=prompt,
+                    response=enhanced_response.get("response", actual_response_text),
+                    tool_definitions=enhanced_response["tool_definitions"],
+                )
+                results_dict[TOOL_CALL_ACCURACY] = decorate_metric(METRIC_IDS[TOOL_CALL_ACCURACY], raw_score, threshold)
+            elif eval_name == CITATIONS:
+                fmt_str = eval_options.get("citation_format", "oai_unicode")
+                fmt_map = {
+                    "oai_unicode": CitationFormat.OAI_UNICODE,
+                    "bracket": CitationFormat.LEGACY_BRACKET,
+                    "mixed": CitationFormat.AUTO,
+                }
+                raw_score = CitationsEvaluator(citation_format=fmt_map.get(fmt_str, CitationFormat.OAI_UNICODE))(response=actual_response_text)
+                results_dict[CITATIONS] = decorate_metric(METRIC_IDS[CITATIONS], raw_score, threshold)
+            elif eval_name == EXACT_MATCH:
+                # ExactMatch is binary (match/no-match) — it includes its own result
+                # field, so we skip decorate_metric which assumes a numeric score.
+                case_sensitive = eval_options.get("case_sensitive", False)
+                raw_score = ExactMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response_text, expected_answer=expected_response)
+                results_dict[EXACT_MATCH] = json.dumps(raw_score, indent=4)
+            elif eval_name == PARTIAL_MATCH:
+                case_sensitive = eval_options.get("case_sensitive", False)
+                raw_score = PartialMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response_text, expected_answer=expected_response)
+                results_dict[PARTIAL_MATCH] = decorate_metric(METRIC_IDS[PARTIAL_MATCH], raw_score, threshold)

         evaluation_result = {
             "prompt": prompt,
-            "response": actual_response_text,  # Keep simple text for backward compatibility
+            "response": actual_response_text,
             "expected_response": expected_response,
-            "results": {
-                "relevance_score": decorate_metric("relevance", relevance_score),
-                "coherence_score": decorate_metric("coherence", coherence_score),
-                "groundedness_score": decorate_metric("groundedness", groundedness_score),
-                #"concisenessnonllm_score": decorate_metric("concisenessnonllm", concisenessNonLLM_score),
-                #"pii_score": decorate_metric("pii", PII_score),
-                "citations_score": json.dumps(citations_score, indent=4),
-                "tool_call_accuracy_score": json.dumps(tool_call_accuracy, indent=4) if tool_call_accuracy else None
-            }
+            "evaluators_ran": evaluators_ran,
+            "results": results_dict,
         }

-        if args.verbose:
-            print(f".................................. Evaluation for prompt: {evaluation_result['prompt']} ..................................")
-            print(f"Scores: {evaluation_result['results']}")
-            print("...........................................................................................................................")
+        # Preserve evaluator config metadata for output
+        if "evaluators" in eval_item:
+            evaluation_result["evaluators"] = eval_item["evaluators"]
+        if "evaluators_mode" in eval_item:
+            evaluation_result["evaluators_mode"] = eval_item["evaluators_mode"]
+
+        if getattr(args, "effective_log_level", "info") == "debug":
+            emit_structured_log(
+                "debug",
+                f"Evaluation completed for prompt='{evaluation_result['prompt']}'. "
+                f"Evaluators: {', '.join(evaluators_ran)}. "
+                f"Scores: {evaluation_result['results']}",
+                operation=Operation.EVALUATE,
+            )

         evaluation_results.append(evaluation_result)
-
+
     return evaluation_results

-def write_results_to_console(results):
+def write_results_to_console(results, agent_name: Optional[str] = None,
+                             agent_id: Optional[str] = None,
+                             cli_version: Optional[str] = None):
     """Write the response to console."""
     # ANSI color codes
     BOLD = '\033[1m'
@@ -286,47 +410,66 @@ def write_results_to_console(results):
     ORANGE = '\033[38;5;208m'
     RED = '\033[91m'
     RESET = '\033[0m'
-
+
+    # Show metadata
+    metadata_parts = []
+    if agent_name:
+        metadata_parts.append(f"Agent Name: {agent_name}")
+    if agent_id:
+        metadata_parts.append(f"Agent ID: {agent_id}")
+    if cli_version:
+        metadata_parts.append(f"CLI Version: {cli_version}")
+    if metadata_parts:
+        print(f"{BOLD}{CYAN}{' | '.join(metadata_parts)}{RESET}")
+        print()
+
     # Show aggregate statistics if multiple results
     if len(results) > 1:
         aggregates = calculate_aggregate_statistics(results)
         if aggregates:
-            print(f"{BOLD}{BLUE}📊 Aggregate Statistics ({len(results)} prompts):{RESET}")
+            print(f"{BOLD}{BLUE}Aggregate Statistics ({len(results)} prompts):{RESET}")
             print(f"{BLUE}{'=' * 60}{RESET}")
-
+
             for metric_name, stats in aggregates.items():
                 pass_color = GREEN if stats['pass_rate'] >= 80 else YELLOW if stats['pass_rate'] >= 60 else RED
-                print(f"{BOLD}{CYAN}{metric_name}:{RESET}")
+                prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
+                total_prompts = stats.get('total_prompts', len(results))
+                print(f"{BOLD}{CYAN}{metric_name}:{RESET} ({prompts_evaluated}/{total_prompts} prompts)")
                 print(f" Pass Rate: {pass_color}{stats['pass_rate']:.1f}%{RESET} ({stats['pass_count']}/{stats['total_evaluated']} passed)")
                 print(f" Avg Score: {MAGENTA}{stats['avg_score']:.2f}{RESET}")
                 if stats.get('threshold') is not None:
                     print(f" Threshold: {YELLOW}{stats['threshold']}{RESET}")
                 print()
-
+
             print(f"{BLUE}{'=' * 60}{RESET}")
             print()
-
-    print(f"{BOLD}{BLUE}📝 Individual Results:{RESET}")
+
+    print(f"{BOLD}{BLUE}Individual Results:{RESET}")
     print(f"{BLUE}{'=' * 50}{RESET}")
     for i, result in enumerate(results, 1):
         print(f"{BOLD}{GREEN}Prompt {i}:{RESET} {result['prompt']}")
+
+        # Show which evaluators ran for this prompt
+        evaluators_ran = result.get('evaluators_ran', [])
+        if evaluators_ran:
+            print(f"{BOLD}{CYAN}Evaluators:{RESET} {', '.join(evaluators_ran)}")
+
         print(f"{BOLD}{CYAN}Response:{RESET} {result['response']}")
         print(f"{BOLD}{YELLOW}Expected Response:{RESET} {result['expected_response']}")

-        # Print metric scores generically from nested results (fallback to flat keys for back-compat)
-        metrics = result.get('results') or {k: v for k, v in result.items() if isinstance(k, str) and k.endswith('_score')}
-        if metrics:
-            for k, v in metrics.items():
-                name = k.replace('_', ' ')
-                if 'relevance' in k:
-                    color = MAGENTA
-                elif 'coherence' in k:
-                    color = ORANGE
-                elif 'fluency' in k:
-                    color = GREEN
-                else:
-                    color = BLUE
-                print(f"{BOLD}{color}{name}:{RESET} {v}")
+        # Print metric scores from results
+        metrics = result.get('results', {})
+        for eval_name, v in metrics.items():
+            if v is None:
+                continue  # Skip null/N/A scores from skipped evaluators
+            display_name = pascal_case_to_title(eval_name)
+            if eval_name == RELEVANCE:
+                color = MAGENTA
+            elif eval_name == COHERENCE:
+                color = ORANGE
+            else:
+                color = BLUE
+            print(f"{BOLD}{color}{display_name}:{RESET} {v}")
         print(f"{BLUE}{'-' * 30}{RESET}")

 def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
@@ -338,10 +481,8 @@ def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
     DEFAULT_THRESHOLD = 3  # fallback; decorate_metric should always set this

     score_val = None
-    for k in (metric_id, f"{metric_id}_score", "score", "value"):
-        if k in data and isinstance(data[k], (int, float)):
-            score_val = data[k]
-            break
+    if metric_id in data and isinstance(data[metric_id], (int, float)):
+        score_val = data[metric_id]
     if score_val is None:
         return None

@@ -364,9 +505,11 @@ def convert_result_to_eval_item(result: Dict) -> Dict:
     """Convert an internal evaluation result dict to a schema-compliant EvalItem.

     Internal format (from run_evaluations):
-        {prompt, response, expected_response, results: {relevance_score: "JSON", ...}}
+        {prompt, response, expected_response, results: {Relevance: "JSON", ...},
+         evaluators_ran: [...], evaluators: {...}, evaluators_mode: "..."}
     Schema EvalItem format:
-        {prompt, response, expected_response, scores: {relevance: EvalScore, ...}}
+        {prompt, response, expected_response, scores: {relevance: EvalScore, ...},
+         evaluators: {...}, evaluators_mode: "..."}
     """
     item: Dict[str, Any] = {
         "prompt": result["prompt"],
@@ -374,30 +517,35 @@ def convert_result_to_eval_item(result: Dict) -> Dict:
         "expected_response": result["expected_response"],
     }

+    # Preserve evaluator config in output
+    if "evaluators" in result:
+        item["evaluators"] = result["evaluators"]
+    if "evaluators_mode" in result:
+        item["evaluators_mode"] = result["evaluators_mode"]
+
     scores: Dict[str, Any] = {}
     results_dict = result.get("results", {})

     # EvalScore metrics (all share the same schema shape: {score, result, threshold})
-    # Tuple: (internal results key, metric ID for score lookup, schema output key)
-    for internal_key, metric_id, schema_key in [
-        ("relevance_score", "relevance", "relevance"),
-        ("coherence_score", "coherence", "coherence"),
-        ("groundedness_score", "groundedness", "groundedness"),
-        ("tool_call_accuracy_score", "tool_call_accuracy", "toolCallAccuracy"),
+    for eval_key, schema_key in [
+        (RELEVANCE, "relevance"),
+        (COHERENCE, "coherence"),
+        (GROUNDEDNESS, "groundedness"),
+        (TOOL_CALL_ACCURACY, "toolCallAccuracy"),
     ]:
-        raw = results_dict.get(internal_key)
+        raw = results_dict.get(eval_key)
         if not raw:
             continue
         data = json.loads(raw) if isinstance(raw, str) else raw
-        eval_score = extract_eval_score(data, metric_id)
+        eval_score = extract_eval_score(data, METRIC_IDS[eval_key])
         if eval_score:
             scores[schema_key] = eval_score

-    # Citations → CitationScore (different schema shape: {count, result, threshold} + format)
-    raw_citations = results_dict.get("citations_score")
+    # Citations → CitationScore
+    raw_citations = results_dict.get(CITATIONS)
     if raw_citations:
         data = json.loads(raw_citations) if isinstance(raw_citations, str) else raw_citations
-        count = data.get("score", 0)
+        count = data.get("citations", 0)
         cit_result = data.get("result")
         if cit_result not in ("pass", "fail"):
             cit_result = "pass" if count >= data.get("threshold", 1) else "fail"
@@ -411,17 +559,42 @@ def convert_result_to_eval_item(result: Dict) -> Dict:
             citation_score["format"] = data["citation_format"]
         scores["citations"] = citation_score

+    # ExactMatch → ExactMatchScore
+    raw_exact = results_dict.get(EXACT_MATCH)
+    if raw_exact:
+        data = json.loads(raw_exact) if isinstance(raw_exact, str) else raw_exact
+        is_match = data.get("exact_match", 0.0) == 1.0
+        scores["exactMatch"] = {
+            "match": is_match,
+            "result": data.get("result", "pass" if is_match else "fail"),
+            "reason": data.get("exact_match_reason", ""),
+        }
+
+    # PartialMatch → PartialMatchScore
+    raw_partial = results_dict.get(PARTIAL_MATCH)
+    if raw_partial:
+        data = json.loads(raw_partial) if isinstance(raw_partial, str) else raw_partial
+        scores["partialMatch"] = {
+            "score": data.get("partial_match", 0.0),
+            "result": data.get("result", "fail"),
+            "threshold": data.get("threshold", 0.5),
+            "reason": data.get("partial_match_reason", ""),
+        }
+
     if scores:
         item["scores"] = scores

     return item


-def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None):
+def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None,
+                          default_evaluators: Optional[Dict[str, Any]] = None,
+                          agent_name: Optional[str] = None,
+                          cli_version: Optional[str] = None):
     """Write results to a schema-compliant eval document JSON file.

     Output follows the eval-document.schema.json format:
-        {schemaVersion, metadata, items: [EvalItem]}
+        {schemaVersion, metadata, default_evaluators?, items: [EvalItem]}
     """
     try:
         try:
@@ -436,43 +609,68 @@ def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optio
         }
         if agent_id:
             metadata["agentId"] = agent_id
+        if agent_name:
+            metadata["agentName"] = agent_name
+        if cli_version:
+            metadata["cliVersion"] = cli_version

         output_data: Dict[str, Any] = {
             "schemaVersion": current_version,
             "metadata": metadata,
-            "items": items,
         }

+        if default_evaluators is not None:
+            output_data["default_evaluators"] = default_evaluators
+
+        output_data["items"] = items
+
         with open(output_file, 'w', encoding='utf-8') as f:
             json.dump(output_data, f, indent=2, ensure_ascii=False)
-        print(f"Results saved to {output_file}")
+        emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
     except Exception as e:
-        print(f"Error writing to JSON file: {e}")
+        emit_structured_log("error", f"Error writing to JSON file: {e}", operation=Operation.WRITE_OUTPUT)
         sys.exit(1)

-def write_results_to_csv(results: List[Dict], output_file: str):
+def write_results_to_csv(results: List[Dict], output_file: str,
+                         agent_name: Optional[str] = None, agent_id: Optional[str] = None,
+                         cli_version: Optional[str] = None):
     """Write results to CSV file."""
     try:
         with open(output_file, 'w', newline='', encoding='utf-8') as f:
             if results:
+                # Write metadata header
+                metadata_parts = []
+                if agent_name:
+                    metadata_parts.append(f"Agent Name: {agent_name}")
+                if agent_id:
+                    metadata_parts.append(f"Agent ID: {agent_id}")
+                if cli_version:
+                    metadata_parts.append(f"CLI Version: {cli_version}")
+                if metadata_parts:
+                    f.write(f"# {' | '.join(metadata_parts)}\n")
+
                 # Write aggregate statistics first if multiple results
                 if len(results) > 1:
                     aggregates = calculate_aggregate_statistics(results)
                     if aggregates:
                         f.write("# AGGREGATE STATISTICS\n")
-                        f.write("Metric,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
+                        f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
                         for metric_name, stats in aggregates.items():
                             threshold_str = str(stats.get('threshold', 'N/A'))
-                            f.write(f"{metric_name},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
+                            prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
+                            total_prompts = stats.get('total_prompts', len(results))
+                            f.write(f"{metric_name},{prompts_evaluated},{total_prompts},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
                         f.write("\n# INDIVIDUAL RESULTS\n")
-
-                # Write individual results
-                writer = csv.DictWriter(f, fieldnames=results[0].keys())
+
+                # Write individual results (exclude internal fields)
+                exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode'}
+                fieldnames = [k for k in results[0].keys() if k not in exclude_keys]
+                writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
                 writer.writeheader()
                 writer.writerows(results)
-        print(f"Results saved to {output_file}")
+        emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
     except Exception as e:
-        print(f"Error writing to CSV file: {e}")
+        emit_structured_log("error", f"Error writing to CSV file: {e}", operation=Operation.WRITE_OUTPUT)
         sys.exit(1)

 def parse_arguments():
@@ -503,8 +701,8 @@ Examples:
   # Save results to HTML and open in browser
   python main.py --output report.html

-  # Verbose output
-  python main.py --verbose
+  # Debug-level diagnostics
+  python main.py --log-level debug

   # Sign out and clear cached authentication tokens
   python main.py --signout
@@ -553,21 +751,13 @@ Examples:

     # Behavior options
     parser.add_argument(
-        '--verbose',
-        action='store_true',
-        help='Enable verbose output'
-    )
-    parser.add_argument(
-        '--quiet',
-        action='store_true',
-        help='Suppress non-essential output'
-    )
-    parser.add_argument(
-        '--citation-format',
-        choices=['oai_unicode', 'legacy_bracket'],
-        default='oai_unicode',
-        help='Citation format to detect. "oai_unicode" for new OAI format (default), "legacy_bracket" for old [^i^] format'
+        '--log-level',
+        nargs='?',
+        const='info',
+        action='append',
+        help='Set log verbosity: debug, info, warning, error. Bare --log-level resolves to info.'
     )
+
     parser.add_argument(
         '--signout',
         action='store_true',
@@ -600,8 +790,13 @@ def validate_environment() -> CallPath:

     missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
     if missing_vars:
-        print(f"Error: Missing required environment variables: {', '.join(missing_vars)}")
-        print("Please ensure your .env file contains all required Azure configuration.")
+        emit_structured_log(
+            "error",
+            "Missing required environment variables: "
+            f"{', '.join(missing_vars)}. Please ensure your .env file contains "
+            "all required Azure configuration.",
+            operation=Operation.VALIDATE_ENV,
+        )
         sys.exit(1)
     return call_path

@@ -635,23 +830,42 @@ def validate_endpoint_url(url: str, allowed_domains: List[str]) -> bool:
         # Convert other parsing errors to ValueError
         raise ValueError(f"Invalid URL format: {url}") from e

-def get_prompt_datasets(args) -> Tuple[List[str], List[str]]:
-    """Get prompts and expected responses based on command line arguments."""
+def get_prompt_datasets(args) -> Tuple[List[Dict], Optional[Dict]]:
+    """Get prompts and expected responses based on command line arguments.
+
+    Returns:
+        Tuple of (eval_items, default_evaluators).
+    """
     if args.prompts:
         if args.expected and len(args.prompts) != len(args.expected):
-            print("Error: Number of prompts must match number of expected responses")
+            emit_structured_log(
+                "error",
+                "Number of prompts must match number of expected responses. "
+                "Update --expected values to match the prompt count.",
+            )
             sys.exit(1)
-        prompts = args.prompts
-        expected_responses = args.expected or [""] * len(prompts)
+        expected_responses = args.expected or [""] * len(args.prompts)
+        eval_items = [
+            {"prompt": p, "expected_response": e}
+            for p, e in zip(args.prompts, expected_responses)
+        ]
+        return eval_items, None
     elif args.prompts_file:
-        prompts, expected_responses = load_prompts_from_file(args.prompts_file)
+        return load_prompts_from_file(args.prompts_file)
     elif args.interactive:
         prompts, expected_responses = get_interactive_prompts()
+        eval_items = [
+            {"prompt": p, "expected_response": e}
+            for p, e in zip(prompts, expected_responses)
+        ]
+        return eval_items, None
     else:
-        # Use default prompts
         prompts, expected_responses = get_default_prompts_and_responses()
-
-    return prompts, expected_responses
+        eval_items = [
+            {"prompt": p, "expected_response": e}
+            for p, e in zip(prompts, expected_responses)
+        ]
+        return eval_items, None

 def fetch_available_agents(copilot_api_endpoint: str, access_token: str, user_oid: str) -> List[Dict[str, Any]]:
     """
@@ -687,26 +901,27 @@ def fetch_available_agents(copilot_api_endpoint: str, access_token: str, user_oi
         return agents
     except urllib.error.HTTPError as e:
         # If endpoint doesn't exist or returns error, return empty list
-        print(f"Warning: Unable to fetch agents list (HTTP {e.code}).")
+        emit_structured_log("warning", f"Unable to fetch agents list (HTTP {e.code}).", operation=Operation.FETCH_AGENTS)
         return []
     except Exception as e:
-        print(f"Warning: Error fetching agents: {e}")
+        emit_structured_log("warning", f"Error fetching agents: {e}", operation=Operation.FETCH_AGENTS)
         return []

-def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
+def select_agent_interactively(agents: List[Dict[str, Any]]) -> Tuple[Optional[str], Optional[str]]:
     """
     Display an interactive agent selector using questionary.
-
+
     Args:
         agents: List of agent dictionaries.
-
+
     Returns:
-        Selected agent ID or None if cancelled/skipped
+        Tuple of (agent_id, agent_name) or (None, None) if cancelled/skipped
     """
     if not agents:
-        return None
-
-    # Create choices for questionary
+        return None, None
+
+    # Build id→name lookup and choices
+    id_to_name: Dict[str, str] = {}
     choices = []
     sorted_agents = sorted(agents, key=lambda x: (not x.get('isOwner', False), x.get('name', '')))
     for agent in sorted_agents:
@@ -714,12 +929,13 @@ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
         agent_name = agent.get("name", "Unknown")
         agent_description = agent.get("description", "Unknown")
         agent_is_owner = agent.get('isOwner')
-
+        id_to_name[agent_id] = agent_name
+
         # Format the display text
         display_text = f"{agent_name} ({agent_id}, IsOwner: {agent_is_owner}) - {agent_description}"
-
+
         choices.append(questionary.Choice(title=display_text, value=agent_id))
-
+
     # Display the selection prompt
     selected_agent = questionary.select(
         "Select an agent to evaluate:",
@@ -727,8 +943,8 @@ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
         use_shortcuts=True,
         use_arrow_keys=True
     ).ask()
-
-    return selected_agent
+
+    return selected_agent, id_to_name.get(selected_agent) if selected_agent else None

 @functools.lru_cache(maxsize=1)
 def _get_iana_timezone_name() -> str:
@@ -787,7 +1003,7 @@ def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:

     return json.dumps(message).encode("utf-8")

-def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str, access_token: str, user_oid: str, args) -> Dict[str, Dict[str, any]]:
+def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str, access_token: str, user_oid: str, args) -> List[Dict[str, Any]]:
     """ Send prompts to the chat API and return enhanced responses. """

     request_headers = {
@@ -795,15 +1011,15 @@ def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str
         "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
         "Authorization": f"Bearer {access_token}"
     }
-    raw_responses: Dict[str, str] = {}
+    raw_responses: List[Tuple[str, str]] = []
     for i, prompt in enumerate(prompts, 1):
-        if not args.quiet:
-            print(f"Processing prompt {i}/{len(prompts)}...")
+        if getattr(args, "effective_log_level", "info") in ("info", "debug"):
+            emit_structured_log("info", f"Processing prompt {i}/{len(prompts)}.", operation=Operation.SEND_PROMPT)

         # Build the payload
         payload = build_chat_payload(prompt, user_oid, args.m365_agent_id)
-        if args.verbose:
-            print(f"[Sydney] Sending payload: {payload.decode('utf-8')}")
+        if getattr(args, "effective_log_level", "info") == "debug":
+            emit_structured_log("debug", f"[Sydney] Sending payload: {payload.decode('utf-8')}", operation=Operation.SEND_PROMPT)

         # Send the request to /chat
         req = urllib.request.Request(f"{copilot_api_endpoint}/chat", data=payload, headers=request_headers, method="POST")
@@ -822,42 +1038,83 @@ def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str
             raise RuntimeError(msg) from e
         except urllib.error.URLError as e:
             raise RuntimeError(f"Chat API connection error: {getattr(e, 'reason', str(e))}") from e
-
-        if args.verbose:
-            print(f"[Sydney] Raw response: {raw}")
-
+
+        if getattr(args, "effective_log_level", "info") == "debug":
+            emit_structured_log("debug", f"[Sydney] Raw response: {raw}", operation=Operation.SEND_PROMPT)
+
         # Store raw response for enhancement
-        raw_responses[prompt] = raw.strip()
-
+        raw_responses.append((prompt, raw.strip()))
+
     # Extract enhanced responses using the new extractor
-    enhanced_responses = extract_enhanced_responses(raw_responses)
+    enhanced_responses = extract_enhanced_responses(raw_responses, log_level=getattr(args, "effective_log_level", "info"))
+
+    if getattr(args, "effective_log_level", "info") == "debug":
+        for idx, enhanced in enumerate(enhanced_responses, 1):
+            metadata = enhanced.get("metadata", {})
+            context = {
+                "request-id": metadata.get("request_id"),
+                "conversation-id": metadata.get("conversation_id"),
+                "message-id": metadata.get("message_id"),
+                "operation": Operation.SEND_PROMPT,
+            }
+            entry = format_structured_log_entry(
+                level="debug",
+                message=f"Response IDs for prompt {idx}/{len(enhanced_responses)}.",
+                logger_name=CLI_LOGGER_NAME,
+                run_context=context,
+            )
+            DIAGNOSTIC_RECORDS.append(entry)
+            CLI_LOGGER.log(logging.DEBUG, render_diagnostic(entry))
+
     return enhanced_responses

-def output_results(results: List[Dict], args):
+def output_results(results: List[Dict], args, default_evaluators: Optional[Dict[str, Any]] = None,
+                   agent_name: Optional[str] = None, cli_version: Optional[str] = None):
     """Output results based on specified format."""
+    metadata_kwargs = dict(
+        agent_name=agent_name,
+        agent_id=getattr(args, 'm365_agent_id', None),
+        cli_version=cli_version,
+    )
     if args.output:
         output_lower = args.output.lower()
         if output_lower.endswith('.json'):
-            write_results_to_json(results, args.output, agent_id=getattr(args, 'm365_agent_id', None))
+            write_results_to_json(results, args.output, default_evaluators=default_evaluators,
+                                  **metadata_kwargs)
         elif output_lower.endswith('.csv'):
-            write_results_to_csv(results, args.output)
+            write_results_to_csv(results, args.output, **metadata_kwargs)
         elif output_lower.endswith('.html'):
-            write_results_to_html(results, args.output)
+            write_results_to_html(results, args.output, **metadata_kwargs)
             abs_path = os.path.abspath(args.output)
             webbrowser.open(f'file://{abs_path}')
         else:
-            write_results_to_json(results, args.output, agent_id=getattr(args, 'm365_agent_id', None))
+            write_results_to_json(results, args.output, default_evaluators=default_evaluators,
+                                  **metadata_kwargs)
     else:
-        write_results_to_console(results)
+        write_results_to_console(results, **metadata_kwargs)

 def main():
     """Main function to orchestrate the evaluation process."""
     load_dotenv()
     args = parse_arguments()

+    effective_log_level, error_message = resolve_log_level(args.log_level)
+    if error_message:
+        print(error_message)
+        print(
+            "Next step: rerun with --log-level {debug|info|warning|error}. "
+            "For support, share the console diagnostics output from this run."
+        )
+        sys.exit(2)
+
+    args.effective_log_level = effective_log_level
+    configure_cli_logging(effective_log_level)
+    emit_structured_log("info", f"Log level set to '{effective_log_level}'.", operation=Operation.SETUP)
+
     # Check minimum version before proceeding
-    cli_version = get_cli_version(quiet=args.quiet)
-    if not should_bypass_min_version_check(args) and not check_min_version(cli_version, quiet=args.quiet):
+    quiet_for_version = effective_log_level in ("warning", "error")
+    cli_version = get_cli_version(quiet=quiet_for_version)
+    if not should_bypass_min_version_check(args) and not check_min_version(cli_version, quiet=quiet_for_version):
         sys.exit(1)

     # Validate environment variables required for evaluation
@@ -885,7 +1142,7 @@ def main():
         try:
             auth_handler.clear_cache()
         except Exception as e:
-            print(f"Error during signout: {e}")
+            emit_structured_log("error", f"Error during signout: {e}", operation=Operation.AUTHENTICATE)
            sys.exit(1)
         sys.exit(0)

@@ -898,67 +1155,87 @@

         id_token_claims = auth_result.get("id_token_claims")
         if not isinstance(id_token_claims, dict):
-            print("id_token_claims is missing or invalid in authentication result")
+            emit_structured_log(
+                "warning", "id_token_claims is missing or invalid in authentication result",
+                operation=Operation.AUTHENTICATE,
+            )
         else:
             user_oid = id_token_claims.get("oid") or ""

     except Exception as e:
-        print(f"\033[91mError during authentication: {e}\033[0m")
-        if args.verbose:
-            import traceback
-            traceback.print_exc()
-        sys.exit(1)
+        emit_structured_log("error", f"Error during authentication: {e}", operation=Operation.AUTHENTICATE)
+        if effective_log_level == "debug":
+            import traceback
+            traceback.print_exc()
+        sys.exit(1)

     if not user_oid and access_token:
         # Fallback: extract from access token.
         user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)

-    # 1. Load evaluation datasets (prompts and expected_responses)
-    prompts, expected_responses = get_prompt_datasets(args)
+    # 1. Load evaluation datasets
+    eval_items, file_default_evaluators = get_prompt_datasets(args)
+    default_evaluators = resolve_default_evaluators(file_default_evaluators)
+    prompts = [eval_item.get("prompt", "") for eval_item in eval_items]

-    if not args.quiet:
-        print(f"Running evaluation on {len(prompts)} prompt(s)...")
+    if effective_log_level in ("info", "debug"):
+        emit_structured_log("info", f"Running evaluation on {len(prompts)} prompt(s).", operation=Operation.SETUP)

+    agent_name = None
     try:
         # 3. Agent selection - if no agent ID provided, prompt user to select
         if not args.m365_agent_id:
-            if not args.quiet:
-                print("No agent ID provided. Fetching available agents...")
+            if effective_log_level in ("info", "debug"):
+                emit_structured_log("info", "No agent ID provided. Fetching available agents.", operation=Operation.FETCH_AGENTS)

             available_agents = fetch_available_agents(copilot_api_endpoint, access_token, user_oid)
             if not available_agents:
-                print("No agents are available for interactive selection. Please re-run with --m365-agent-id or set the M365_AGENT_ID environment variable.")
-                sys.exit(1)
-
-            if available_agents:
-                selected_agent_id = select_agent_interactively(available_agents)
-                if selected_agent_id:
-                    args.m365_agent_id = selected_agent_id
-                    if not args.quiet:
-                        print(f"Selected agent: {args.m365_agent_id}")
-                else:
-                    print("No agent selected. Please re-run with --m365-agent-id or set the M365_AGENT_ID environment variable.")
-                    sys.exit(1)
+                emit_structured_log(
+                    "error",
+                    "No agents are available for interactive selection. Re-run with "
+                    "--m365-agent-id or set M365_AGENT_ID.",
+                    operation=Operation.FETCH_AGENTS,
+                )
+                sys.exit(1)
+
+            selected_agent_id, agent_name = select_agent_interactively(available_agents)
+            if selected_agent_id:
+                args.m365_agent_id = selected_agent_id
+                if effective_log_level in ("info", "debug"):
+                    emit_structured_log("info", f"Selected agent: {args.m365_agent_id}", operation=Operation.FETCH_AGENTS)
+            else:
+                emit_structured_log(
+                    "error",
+                    "No agent selected. Re-run with --m365-agent-id or set M365_AGENT_ID.",
+                    operation=Operation.FETCH_AGENTS,
+                )
+                sys.exit(1)

         # 4. Send prompts to chat API
         responses = send_prompt_to_agent_in_sydney(prompts, copilot_api_endpoint, access_token, user_oid, args)
     except Exception as e:
-        print(f"\033[91mError sending prompts to chat API: {e}\033[0m")
-        if args.verbose:
+        emit_structured_log("error", f"Error sending prompts to chat API: {e}", operation=Operation.SEND_PROMPT)
+        if effective_log_level == "debug":
             import traceback
             traceback.print_exc()
         sys.exit(1)
+

     # 5. Run evaluations
-    if not args.quiet:
-        print("Running evaluations...")
-    results = run_evaluations(args, responses, expected_responses)
+    if effective_log_level in ("info", "debug"):
+        emit_structured_log("info", "Running evaluations.", operation=Operation.EVALUATE)
+    results = run_evaluations(args, responses, eval_items, default_evaluators)

     # 6. Output results
-    output_results(results, args)
+    output_results(results, args, default_evaluators=default_evaluators,
+                   agent_name=agent_name, cli_version=str(cli_version) if cli_version else None)

-    if not args.quiet:
-        print(f"\nEvaluation completed successfully! Processed {len(prompts)} prompt(s).")
+    if effective_log_level in ("info", "debug"):
+        emit_structured_log(
+            "info",
+            f"Evaluation completed successfully. Processed {len(prompts)} prompt(s).",
+            operation=Operation.EVALUATE,
+        )

 # Call the main function when script is run directly
 if __name__ == "__main__":
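
The largest behavioral change in this release is that run_evaluations now resolves evaluators per prompt instead of always running the fixed relevance/coherence/groundedness/citations set. The sketch below is illustrative only: it assumes the package's common and evaluator_resolver modules are importable from a script alongside them, and the prompt text, expected response, and evaluator options are made-up values rather than anything shipped in the package. The helper names it calls (resolve_default_evaluators, resolve_evaluators_for_prompt, EXACT_MATCH) come from the diff above.

# Illustrative sketch of the per-prompt evaluator resolution added in 1.3.0-preview.1.
from common import EXACT_MATCH  # evaluator-name constant used by the CLI
from evaluator_resolver import (
    resolve_default_evaluators,
    resolve_evaluators_for_prompt,
)

# Hypothetical eval item, in the shape load_prompts_from_file now returns.
eval_item = {
    "prompt": "What is the team's on-call rotation?",                 # made-up value
    "expected_response": "The rotation is weekly, starting Monday.",  # made-up value
    "evaluators": {EXACT_MATCH: {"case_sensitive": False}},
    "evaluators_mode": "extend",  # "extend" is the default run_evaluations assumes
}

# None means "no file-level default_evaluators block"; the resolver supplies the defaults.
default_evaluators = resolve_default_evaluators(None)

# Merge this item's evaluators with the defaults, as run_evaluations does per prompt.
resolved = resolve_evaluators_for_prompt(
    eval_item.get("evaluators"),
    eval_item.get("evaluators_mode", "extend"),
    eval_item.get("prompt", ""),
    default_evaluators,
)
# `resolved` maps evaluator names to their option dicts; run_evaluations checks
# prerequisites (Azure OpenAI config, tool definitions) and dispatches on each name.

On the CLI side, the old --verbose/--quiet/--citation-format flags are replaced by --log-level (a bare --log-level resolves to info, and --log-level debug surfaces the payload and response diagnostics), while citation format now travels as a per-evaluator option rather than a global flag.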