@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.5.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/README.md +135 -100
  2. package/package.json +7 -4
  3. package/schema/CHANGELOG.md +7 -0
  4. package/schema/v1/eval-document.schema.json +143 -11
  5. package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
  6. package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
  7. package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
  8. package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
  9. package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
  10. package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
  11. package/schema/v1/examples/valid/multi-turn-output.json +59 -0
  12. package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
  13. package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
  14. package/schema/version.json +2 -2
  15. package/src/clients/cli/agent_selector.py +74 -0
  16. package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
  17. package/src/clients/cli/api_clients/A2A/a2a_client.py +475 -0
  18. package/src/clients/cli/api_clients/__init__.py +3 -0
  19. package/src/clients/cli/api_clients/base_agent_client.py +77 -0
  20. package/src/clients/cli/cli_args.py +136 -0
  21. package/src/clients/cli/cli_logging/cli_logger.py +33 -0
  22. package/src/clients/cli/cli_logging/console_diagnostics.py +56 -2
  23. package/src/clients/cli/cli_logging/logging_utils.py +0 -1
  24. package/src/clients/cli/common.py +64 -0
  25. package/src/clients/cli/env_validator.py +73 -0
  26. package/src/clients/cli/evaluation_runner.py +653 -0
  27. package/src/clients/cli/evaluator_resolver.py +9 -6
  28. package/src/clients/cli/generate_report.py +272 -129
  29. package/src/clients/cli/main.py +157 -1174
  30. package/src/clients/cli/parallel_executor.py +57 -0
  31. package/src/clients/cli/prompt_loader.py +148 -0
  32. package/src/clients/cli/readme.md +9 -53
  33. package/src/clients/cli/requirements.txt +1 -1
  34. package/src/clients/cli/response_extractor.py +4 -603
  35. package/src/clients/cli/result_writer.py +488 -0
  36. package/src/clients/cli/retry_policy.py +52 -0
  37. package/src/clients/cli/samples/multiturn_example.json +35 -0
  38. package/src/clients/cli/throttle_gate.py +82 -0
  39. package/src/clients/node-js/bin/runevals.js +82 -20
  40. package/src/clients/node-js/config/default.js +12 -11
  41. package/src/clients/node-js/lib/agent-id.js +12 -0
  42. package/src/clients/node-js/lib/env-loader.js +14 -20
  43. package/src/clients/node-js/lib/eula-manager.js +78 -0
  44. package/src/clients/node-js/lib/progress.js +13 -11
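
The new schema examples listed above (items 5–13) add multi-turn support to the eval document format, and the rewritten main.py further down distinguishes single-turn from multi-turn items by the presence of a "turns" key. The sketch below illustrates the shape such a document might take, written as a Python literal for brevity. Field names are inferred from the example filenames and from the code visible in this diff, not from the schema itself, so treat schema/v1/eval-document.schema.json as the source of truth.

    # Hypothetical eval document mixing single- and multi-turn items
    # (cf. the valid example mixed-single-and-multi-turn.json). The exact
    # schemaVersion shipped in 1.5.0-preview.1 is not shown in this diff.
    eval_document = {
        "schemaVersion": "1.0.0",
        "items": [
            # Single-turn item: unchanged prompt/expected_response pair.
            {
                "prompt": "What is Microsoft Graph?",
                "expected_response": "Microsoft Graph is a gateway to data and intelligence in Microsoft 365.",
            },
            # Multi-turn item: "turns" replaces "prompt" (the invalid example
            # multi-turn-has-both-prompt-and-turns.json implies the two are
            # mutually exclusive), and "turns" must be non-empty
            # (multi-turn-empty-turns.json).
            {
                "turns": [
                    {"prompt": "Summarize my unread email."},
                    {"prompt": "Draft a short reply to the most recent one."},
                ],
            },
        ],
    }
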
@@ -1,1097 +1,57 @@
1
- import json
1
+ """M365 Copilot Agent Evaluation CLI — thin orchestrator.
2
+
3
+ Delegates to focused modules:
4
+ cli_args – argument parsing & version-check bypass
5
+ env_validator – environment validation & URL security
6
+ prompt_loader – dataset loading & agent selection
7
+ evaluation_runner – pipeline, evaluator dispatch, retry
8
+ result_writer – console / JSON / CSV / HTML output
9
+ """
10
+
2
11
  import os
3
- import argparse
4
12
  import sys
5
- import csv
6
- import functools
7
- import logging
8
- import webbrowser
9
- import urllib.request
10
- import urllib.error
11
- import urllib.parse
12
- import questionary
13
- from enum import Enum
14
- from typing import List, Dict, Tuple, Optional, Any
15
- from azure.ai.evaluation import (
16
- AzureOpenAIModelConfiguration,
17
- RelevanceEvaluator,
18
- CoherenceEvaluator,
19
- GroundednessEvaluator,
20
- ToolCallAccuracyEvaluator
21
- )
13
+ import traceback
14
+
15
+ from azure.ai.evaluation import AzureOpenAIModelConfiguration
22
16
  from dotenv import load_dotenv
17
+
18
+ from api_clients.A2A import A2AClient
23
19
  from auth.auth_handler import AuthHandler
24
- from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat
25
- from custom_evaluators.ExactMatchEvaluator import ExactMatchEvaluator
26
- from custom_evaluators.PartialMatchEvaluator import PartialMatchEvaluator
27
- from generate_report import generate_html_report, calculate_aggregate_statistics
28
- from response_extractor import extract_enhanced_responses, get_response_text_for_evaluation
29
- from schema_handler import DocumentUpgrader, SchemaVersionManager
30
- from common import (
31
- RELEVANCE,
32
- COHERENCE,
33
- GROUNDEDNESS,
34
- TOOL_CALL_ACCURACY,
35
- CITATIONS,
36
- EXACT_MATCH,
37
- PARTIAL_MATCH,
38
- REQUIRES_AZURE_OPENAI,
39
- REQUIRES_TOOL_DEFINITIONS,
40
- METRIC_IDS,
41
- pascal_case_to_title,
42
- )
43
- from evaluator_resolver import (
44
- EVALUATOR_REGISTRY,
45
- validate_evaluator_names,
46
- check_prerequisites,
47
- resolve_default_evaluators,
48
- resolve_evaluators_for_prompt,
49
- get_evaluator_threshold,
50
- )
20
+ from evaluator_resolver import resolve_default_evaluators
51
21
  from version_check import check_min_version, get_cli_version
52
- from datetime import datetime, timezone
53
- from pathlib import Path
54
- import tzlocal
55
22
 
56
- from cli_logging.console_diagnostics import render_diagnostic, serialize_diagnostic_record
57
- from cli_logging.logging_utils import LOG_LEVEL_MAP, LogLevel, Operation, format_structured_log_entry, resolve_log_level
58
-
59
- # Allowed endpoints for URL validation
60
- ALLOWED_ENDPOINTS = [
61
- 'substrate.office.com'
62
- ]
63
-
64
- class CallPath(Enum):
65
- """ Enum to indicate which call path to use. """
66
- ACCESS_TOKEN = "access_token"
67
- COPILOT_AUTH = "copilot_auth"
68
-
69
-
70
- # Flags that should bypass remote min-version enforcement.
71
- # --help is not needed here because argparse exits before runtime checks.
72
- VERSION_CHECK_BYPASS_FLAGS = (
73
- "signout",
23
+ from cli_logging.cli_logger import (
24
+ CLI_LOGGER,
25
+ DIAGNOSTIC_RECORDS,
26
+ configure_cli_logging,
27
+ emit_structured_log,
74
28
  )
29
+ from cli_logging.logging_utils import Operation, resolve_log_level
75
30
 
76
- CLI_LOGGER_NAME = "m365.eval.cli"
77
- CLI_LOGGER = logging.getLogger(CLI_LOGGER_NAME)
78
- DIAGNOSTIC_RECORDS: List[Dict[str, Any]] = []
79
-
80
-
81
- def _ensure_logger_handler() -> None:
82
- if CLI_LOGGER.handlers:
83
- return
84
- handler = logging.StreamHandler(sys.stdout)
85
- handler.setFormatter(logging.Formatter("%(message)s"))
86
- CLI_LOGGER.addHandler(handler)
87
- CLI_LOGGER.propagate = False
88
-
89
-
90
- def configure_cli_logging(effective_log_level: str) -> None:
91
- _ensure_logger_handler()
92
- CLI_LOGGER.setLevel(LOG_LEVEL_MAP[effective_log_level])
93
-
94
-
95
- def emit_structured_log(level: str, message: str, operation: str = Operation.EVALUATE) -> None:
96
- _ensure_logger_handler()
97
- context = {
98
- "request-id": None,
99
- "conversation-id": None,
100
- "message-id": None,
101
- "operation": operation,
102
- }
103
- entry = format_structured_log_entry(
104
- level=level,
105
- message=message,
106
- logger_name=CLI_LOGGER_NAME,
107
- run_context=context,
108
- )
109
- DIAGNOSTIC_RECORDS.append(entry)
110
-
111
- try:
112
- CLI_LOGGER.log(LOG_LEVEL_MAP.get(level, logging.INFO), render_diagnostic(entry))
113
- except Exception:
114
- pass
115
-
116
-
117
- def should_bypass_min_version_check(args: argparse.Namespace) -> bool:
118
- """Return True if the current invocation should skip min-version checks."""
119
- return any(getattr(args, flag, False) for flag in VERSION_CHECK_BYPASS_FLAGS)
120
-
121
- def write_results_to_html(results: List[Dict], output_file: str,
122
- agent_name: Optional[str] = None, agent_id: Optional[str] = None,
123
- cli_version: Optional[str] = None):
124
- """Write results to HTML file using generate_html_report from generate_report.py."""
125
- try:
126
- html = generate_html_report(results, agent_name=agent_name, agent_id=agent_id,
127
- cli_version=cli_version)
128
- with open(output_file, 'w', encoding='utf-8') as f:
129
- f.write(html)
130
- emit_structured_log("info", f"HTML report saved to {output_file}", operation=Operation.WRITE_OUTPUT)
131
- except Exception as e:
132
- emit_structured_log("error", f"Error writing to HTML file: {e}", operation=Operation.WRITE_OUTPUT)
133
- sys.exit(1)
134
-
135
- def get_default_prompts_and_responses():
136
- """Get a list of prompts and responses."""
137
- prompts = [
138
- "What is Microsoft Graph?"
139
- ]
140
- expected_responses = [
141
- "Microsoft Graph is a gateway to data and intelligence in Microsoft 365."
142
- ]
143
- return prompts, expected_responses
144
-
145
- def load_prompts_from_file(file_path: str) -> Tuple[List[Dict], Optional[Dict]]:
146
- """Load prompts and expected responses from a JSON file.
147
-
148
- Supports three formats:
149
- 1. Eval document: {"schemaVersion": "1.0.0", "items": [{"prompt": "..."}]}
150
- 2. Array format: [{"prompt": "...", "expected_response": "..."}]
151
- 3. Dict format: {"prompts": [...], "expected_responses": [...]}
152
-
153
- For eval documents (format 1) and array format (format 2), schema validation
154
- and auto-upgrade are applied via DocumentUpgrader.
155
-
156
- Returns:
157
- Tuple of (eval_items, default_evaluators). Items are dicts with prompt,
158
- expected_response, and optional evaluators/evaluators_mode fields.
159
- """
160
- try:
161
- with open(file_path, 'r', encoding='utf-8') as f:
162
- data = json.load(f)
163
-
164
- # Detect if this is an eval document (has "items" key) or could be upgraded
165
- is_eval_document = (
166
- isinstance(data, dict) and "items" in data
167
- ) or isinstance(data, list)
168
-
169
- # Run schema validation and auto-upgrade for eval documents
170
- if is_eval_document:
171
- try:
172
- upgrader = DocumentUpgrader()
173
- except Exception as e:
174
- # Schema infrastructure not available (missing files, etc.) — skip
175
- emit_structured_log("warning", f"Unable to initialize document upgrader: {e}", operation=Operation.LOAD_PROMPTS)
176
- upgrader = None
177
-
178
- if upgrader is not None:
179
- result = upgrader.upgrade(Path(file_path))
180
-
181
- if result.error:
182
- emit_structured_log("error", f"Schema validation error: {result.error}", operation=Operation.LOAD_PROMPTS)
183
- sys.exit(1)
184
-
185
- if result.upgraded and result.message:
186
- emit_structured_log("info", result.message, operation=Operation.LOAD_PROMPTS)
187
-
188
- # Use the parsed document from the upgrade result
189
- if result.document is not None:
190
- data = result.document
191
-
192
- if isinstance(data, list):
193
- # Format: [{"prompt": "...", "expected_response": "..."}, ...]
194
- return data, None
195
- elif isinstance(data, dict):
196
- if "items" in data:
197
- # Eval document format: {"schemaVersion": "...", "items": [...]}
198
- return data["items"], data.get("default_evaluators")
199
- else:
200
- # Format: {"prompts": [...], "expected_responses": [...]}
201
- prompts = data.get("prompts", [])
202
- expected_responses = data.get("expected_responses", [])
203
- eval_items = [
204
- {"prompt": p, "expected_response": e}
205
- for p, e in zip(prompts, expected_responses)
206
- ]
207
- return eval_items, None
208
- else:
209
- raise ValueError("Invalid file format")
210
- except SystemExit:
211
- raise
212
- except Exception as e:
213
- emit_structured_log("error", f"Error loading prompts from file: {e}", operation=Operation.LOAD_PROMPTS)
214
- sys.exit(1)
215
-
216
- def get_interactive_prompts() -> Tuple[List[str], List[str]]:
217
- """Get prompts and expected responses interactively."""
218
- prompts = []
219
- expected_responses = []
220
-
221
- print("Interactive mode: Enter your prompts and expected responses.")
222
- print("Press Enter with empty prompt to finish.")
223
-
224
- while True:
225
- prompt = input(f"\nPrompt {len(prompts) + 1}: ").strip()
226
- if not prompt:
227
- break
228
-
229
- expected = input(f"Expected response {len(expected_responses) + 1}: ").strip()
230
-
231
- prompts.append(prompt)
232
- expected_responses.append(expected)
233
-
234
- if not prompts:
235
- print("No prompts entered. Exiting.")
236
- sys.exit(1)
237
-
238
- return prompts, expected_responses
239
-
240
- def run_evaluations(args, responses: List[Dict[str, Any]], eval_items: List[Dict],
241
- default_evaluators: Dict[str, Any]) -> list:
242
- """Run evaluations against the responses using per-prompt evaluator resolution.
243
-
244
- Args:
245
- args: CLI arguments.
246
- responses: List of enhanced response dicts (one per prompt, aligned with eval_items by index).
247
- eval_items: List of item dicts (prompt, expected_response, evaluators, evaluators_mode).
248
- default_evaluators: Resolved default evaluators (from resolve_default_evaluators).
249
- """
250
- if len(responses) != len(eval_items):
251
- raise ValueError(
252
- f"Mismatch between number of responses ({len(responses)}) and evaluation items ({len(eval_items)})."
253
- )
254
-
255
- model_config = AzureOpenAIModelConfiguration(
256
- azure_endpoint=os.environ.get("AZURE_AI_OPENAI_ENDPOINT"),
257
- api_key=os.environ.get("AZURE_AI_API_KEY"),
258
- api_version=os.environ.get("AZURE_AI_API_VERSION"),
259
- azure_deployment=os.environ.get("AZURE_AI_MODEL_NAME"),
260
- )
261
-
262
- # Build available context for prerequisite checks
263
- has_azure_openai = bool(
264
- os.environ.get("AZURE_AI_OPENAI_ENDPOINT")
265
- and os.environ.get("AZURE_AI_API_KEY")
266
- )
267
-
268
- DEFAULT_PASS_THRESHOLD = 3
269
-
270
- def decorate_metric(metric_id: str, data, threshold: Optional[int] = None):
271
- """Augment raw evaluator output with standardized threshold + pass/fail result."""
272
- pass_threshold = threshold if threshold is not None else DEFAULT_PASS_THRESHOLD
273
- payload = {}
274
- if isinstance(data, dict):
275
- payload.update(data)
276
- else:
277
- payload['raw'] = data
278
-
279
- score_val = None
280
- if isinstance(data, dict):
281
- if metric_id in data:
282
- score_val = data[metric_id]
283
- if isinstance(score_val, (int, float)):
284
- payload['threshold'] = pass_threshold
285
- payload['result'] = 'pass' if score_val >= pass_threshold else 'fail'
286
- else:
287
- payload['threshold'] = pass_threshold
288
- payload.setdefault('result', 'unknown')
289
- return json.dumps(payload, indent=4)
290
-
291
- # Validate all evaluator names upfront (across defaults and all items)
292
- all_evaluator_maps = [default_evaluators]
293
- for eval_item in eval_items:
294
- if "evaluators" in eval_item:
295
- all_evaluator_maps.append(eval_item["evaluators"])
296
- for emap in all_evaluator_maps:
297
- validate_evaluator_names(emap)
298
-
299
- evaluation_results = []
300
- for enhanced_response, eval_item in zip(responses, eval_items):
301
- actual_response_text = get_response_text_for_evaluation(enhanced_response)
302
- prompt = eval_item.get("prompt", "")
303
- expected_response = eval_item.get("expected_response", "")
304
- prompt_evaluators = eval_item.get("evaluators")
305
- evaluators_mode = eval_item.get("evaluators_mode", "extend")
306
-
307
- # Resolve evaluators for this prompt
308
- resolved = resolve_evaluators_for_prompt(
309
- prompt_evaluators, evaluators_mode, prompt, default_evaluators,
310
- )
311
-
312
- # Build runtime context for prerequisite checks
313
- has_tool_defs = bool(
314
- args.m365_agent_id and enhanced_response.get("tool_definitions")
315
- )
316
- available_context = {
317
- REQUIRES_AZURE_OPENAI: has_azure_openai,
318
- REQUIRES_TOOL_DEFINITIONS: has_tool_defs,
319
- }
320
-
321
- results_dict: Dict[str, Optional[str]] = {}
322
- evaluators_ran: List[str] = []
323
-
324
- for eval_name, eval_options in resolved.items():
325
- # Check prerequisites
326
- can_run, warn_msg = check_prerequisites(eval_name, available_context)
327
- if not can_run:
328
- if warn_msg:
329
- emit_structured_log("warning", f"Evaluator '{eval_name}' prerequisite check failed: {warn_msg}", operation=Operation.EVALUATE)
330
- results_dict[eval_name] = None
331
- continue
332
-
333
- evaluators_ran.append(eval_name)
334
- threshold = get_evaluator_threshold(eval_name, eval_options)
335
-
336
- if eval_name == RELEVANCE:
337
- raw_score = RelevanceEvaluator(model_config=model_config)(query=prompt, response=actual_response_text)
338
- results_dict[RELEVANCE] = decorate_metric(METRIC_IDS[RELEVANCE], raw_score, threshold)
339
- elif eval_name == COHERENCE:
340
- raw_score = CoherenceEvaluator(model_config=model_config)(query=prompt, response=actual_response_text)
341
- results_dict[COHERENCE] = decorate_metric(METRIC_IDS[COHERENCE], raw_score, threshold)
342
- elif eval_name == GROUNDEDNESS:
343
- raw_score = GroundednessEvaluator(model_config=model_config)(response=actual_response_text, context=expected_response)
344
- results_dict[GROUNDEDNESS] = decorate_metric(METRIC_IDS[GROUNDEDNESS], raw_score, threshold)
345
- elif eval_name == TOOL_CALL_ACCURACY:
346
- raw_score = ToolCallAccuracyEvaluator(model_config)(
347
- query=prompt,
348
- response=enhanced_response.get("response", actual_response_text),
349
- tool_definitions=enhanced_response["tool_definitions"],
350
- )
351
- results_dict[TOOL_CALL_ACCURACY] = decorate_metric(METRIC_IDS[TOOL_CALL_ACCURACY], raw_score, threshold)
352
- elif eval_name == CITATIONS:
353
- fmt_str = eval_options.get("citation_format", "oai_unicode")
354
- fmt_map = {
355
- "oai_unicode": CitationFormat.OAI_UNICODE,
356
- "bracket": CitationFormat.LEGACY_BRACKET,
357
- "mixed": CitationFormat.AUTO,
358
- }
359
- raw_score = CitationsEvaluator(citation_format=fmt_map.get(fmt_str, CitationFormat.OAI_UNICODE))(response=actual_response_text)
360
- results_dict[CITATIONS] = decorate_metric(METRIC_IDS[CITATIONS], raw_score, threshold)
361
- elif eval_name == EXACT_MATCH:
362
- # ExactMatch is binary (match/no-match) — it includes its own result
363
- # field, so we skip decorate_metric which assumes a numeric score.
364
- case_sensitive = eval_options.get("case_sensitive", False)
365
- raw_score = ExactMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response_text, expected_answer=expected_response)
366
- results_dict[EXACT_MATCH] = json.dumps(raw_score, indent=4)
367
- elif eval_name == PARTIAL_MATCH:
368
- case_sensitive = eval_options.get("case_sensitive", False)
369
- raw_score = PartialMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response_text, expected_answer=expected_response)
370
- results_dict[PARTIAL_MATCH] = decorate_metric(METRIC_IDS[PARTIAL_MATCH], raw_score, threshold)
371
-
372
- evaluation_result = {
373
- "prompt": prompt,
374
- "response": actual_response_text,
375
- "expected_response": expected_response,
376
- "evaluators_ran": evaluators_ran,
377
- "results": results_dict,
378
- }
379
-
380
- # Preserve evaluator config metadata for output
381
- if "evaluators" in eval_item:
382
- evaluation_result["evaluators"] = eval_item["evaluators"]
383
- if "evaluators_mode" in eval_item:
384
- evaluation_result["evaluators_mode"] = eval_item["evaluators_mode"]
385
-
386
- if getattr(args, "effective_log_level", "info") == "debug":
387
- emit_structured_log(
388
- "debug",
389
- f"Evaluation completed for prompt='{evaluation_result['prompt']}'. "
390
- f"Evaluators: {', '.join(evaluators_ran)}. "
391
- f"Scores: {evaluation_result['results']}",
392
- operation=Operation.EVALUATE,
393
- )
394
-
395
- evaluation_results.append(evaluation_result)
396
-
397
- return evaluation_results
398
-
399
- def write_results_to_console(results, agent_name: Optional[str] = None,
400
- agent_id: Optional[str] = None,
401
- cli_version: Optional[str] = None):
402
- """Write the response to console."""
403
- # ANSI color codes
404
- BOLD = '\033[1m'
405
- BLUE = '\033[94m'
406
- GREEN = '\033[92m'
407
- YELLOW = '\033[93m'
408
- CYAN = '\033[96m'
409
- MAGENTA = '\033[95m'
410
- ORANGE = '\033[38;5;208m'
411
- RED = '\033[91m'
412
- RESET = '\033[0m'
413
-
414
- # Show metadata
415
- metadata_parts = []
416
- if agent_name:
417
- metadata_parts.append(f"Agent Name: {agent_name}")
418
- if agent_id:
419
- metadata_parts.append(f"Agent ID: {agent_id}")
420
- if cli_version:
421
- metadata_parts.append(f"CLI Version: {cli_version}")
422
- if metadata_parts:
423
- print(f"{BOLD}{CYAN}{' | '.join(metadata_parts)}{RESET}")
424
- print()
425
-
426
- # Show aggregate statistics if multiple results
427
- if len(results) > 1:
428
- aggregates = calculate_aggregate_statistics(results)
429
- if aggregates:
430
- print(f"{BOLD}{BLUE}Aggregate Statistics ({len(results)} prompts):{RESET}")
431
- print(f"{BLUE}{'=' * 60}{RESET}")
432
-
433
- for metric_name, stats in aggregates.items():
434
- pass_color = GREEN if stats['pass_rate'] >= 80 else YELLOW if stats['pass_rate'] >= 60 else RED
435
- prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
436
- total_prompts = stats.get('total_prompts', len(results))
437
- print(f"{BOLD}{CYAN}{metric_name}:{RESET} ({prompts_evaluated}/{total_prompts} prompts)")
438
- print(f" Pass Rate: {pass_color}{stats['pass_rate']:.1f}%{RESET} ({stats['pass_count']}/{stats['total_evaluated']} passed)")
439
- print(f" Avg Score: {MAGENTA}{stats['avg_score']:.2f}{RESET}")
440
- if stats.get('threshold') is not None:
441
- print(f" Threshold: {YELLOW}{stats['threshold']}{RESET}")
442
- print()
443
-
444
- print(f"{BLUE}{'=' * 60}{RESET}")
445
- print()
446
-
447
- print(f"{BOLD}{BLUE}Individual Results:{RESET}")
448
- print(f"{BLUE}{'=' * 50}{RESET}")
449
- for i, result in enumerate(results, 1):
450
- print(f"{BOLD}{GREEN}Prompt {i}:{RESET} {result['prompt']}")
451
-
452
- # Show which evaluators ran for this prompt
453
- evaluators_ran = result.get('evaluators_ran', [])
454
- if evaluators_ran:
455
- print(f"{BOLD}{CYAN}Evaluators:{RESET} {', '.join(evaluators_ran)}")
456
-
457
- print(f"{BOLD}{CYAN}Response:{RESET} {result['response']}")
458
- print(f"{BOLD}{YELLOW}Expected Response:{RESET} {result['expected_response']}")
459
-
460
- # Print metric scores from results
461
- metrics = result.get('results', {})
462
- for eval_name, v in metrics.items():
463
- if v is None:
464
- continue # Skip null/N/A scores from skipped evaluators
465
- display_name = pascal_case_to_title(eval_name)
466
- if eval_name == RELEVANCE:
467
- color = MAGENTA
468
- elif eval_name == COHERENCE:
469
- color = ORANGE
470
- else:
471
- color = BLUE
472
- print(f"{BOLD}{color}{display_name}:{RESET} {v}")
473
- print(f"{BLUE}{'-' * 30}{RESET}")
474
-
475
- def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
476
- """Extract an EvalScore object from a decorated metric dict.
477
-
478
- Maps internal decorated-metric format to schema EvalScore:
479
- {score, result, threshold} (required) + reason, evaluator (optional).
480
- """
481
- DEFAULT_THRESHOLD = 3 # fallback; decorate_metric should always set this
482
-
483
- score_val = None
484
- if metric_id in data and isinstance(data[metric_id], (int, float)):
485
- score_val = data[metric_id]
486
- if score_val is None:
487
- return None
488
-
489
- result = data.get("result")
490
- if result not in ("pass", "fail"):
491
- result = "pass" if score_val >= data.get("threshold", DEFAULT_THRESHOLD) else "fail"
492
-
493
- eval_score: Dict[str, Any] = {
494
- "score": score_val,
495
- "result": result,
496
- "threshold": data.get("threshold", DEFAULT_THRESHOLD),
497
- }
498
- reason = data.get(f"{metric_id}_reason") or data.get("reason")
499
- if reason:
500
- eval_score["reason"] = reason
501
- return eval_score
502
-
503
-
504
- def convert_result_to_eval_item(result: Dict) -> Dict:
505
- """Convert an internal evaluation result dict to a schema-compliant EvalItem.
506
-
507
- Internal format (from run_evaluations):
508
- {prompt, response, expected_response, results: {Relevance: "JSON", ...},
509
- evaluators_ran: [...], evaluators: {...}, evaluators_mode: "..."}
510
- Schema EvalItem format:
511
- {prompt, response, expected_response, scores: {relevance: EvalScore, ...},
512
- evaluators: {...}, evaluators_mode: "..."}
513
- """
514
- item: Dict[str, Any] = {
515
- "prompt": result["prompt"],
516
- "response": result["response"],
517
- "expected_response": result["expected_response"],
518
- }
519
-
520
- # Preserve evaluator config in output
521
- if "evaluators" in result:
522
- item["evaluators"] = result["evaluators"]
523
- if "evaluators_mode" in result:
524
- item["evaluators_mode"] = result["evaluators_mode"]
525
-
526
- scores: Dict[str, Any] = {}
527
- results_dict = result.get("results", {})
528
-
529
- # EvalScore metrics (all share the same schema shape: {score, result, threshold})
530
- for eval_key, schema_key in [
531
- (RELEVANCE, "relevance"),
532
- (COHERENCE, "coherence"),
533
- (GROUNDEDNESS, "groundedness"),
534
- (TOOL_CALL_ACCURACY, "toolCallAccuracy"),
535
- ]:
536
- raw = results_dict.get(eval_key)
537
- if not raw:
538
- continue
539
- data = json.loads(raw) if isinstance(raw, str) else raw
540
- eval_score = extract_eval_score(data, METRIC_IDS[eval_key])
541
- if eval_score:
542
- scores[schema_key] = eval_score
543
-
544
- # Citations → CitationScore
545
- raw_citations = results_dict.get(CITATIONS)
546
- if raw_citations:
547
- data = json.loads(raw_citations) if isinstance(raw_citations, str) else raw_citations
548
- count = data.get("citations", 0)
549
- cit_result = data.get("result")
550
- if cit_result not in ("pass", "fail"):
551
- cit_result = "pass" if count >= data.get("threshold", 1) else "fail"
552
-
553
- citation_score: Dict[str, Any] = {
554
- "count": count,
555
- "result": cit_result,
556
- "threshold": data.get("threshold", 1),
557
- }
558
- if "citation_format" in data:
559
- citation_score["format"] = data["citation_format"]
560
- scores["citations"] = citation_score
561
-
562
- # ExactMatch → ExactMatchScore
563
- raw_exact = results_dict.get(EXACT_MATCH)
564
- if raw_exact:
565
- data = json.loads(raw_exact) if isinstance(raw_exact, str) else raw_exact
566
- is_match = data.get("exact_match", 0.0) == 1.0
567
- scores["exactMatch"] = {
568
- "match": is_match,
569
- "result": data.get("result", "pass" if is_match else "fail"),
570
- "reason": data.get("exact_match_reason", ""),
571
- }
572
-
573
- # PartialMatch → PartialMatchScore
574
- raw_partial = results_dict.get(PARTIAL_MATCH)
575
- if raw_partial:
576
- data = json.loads(raw_partial) if isinstance(raw_partial, str) else raw_partial
577
- scores["partialMatch"] = {
578
- "score": data.get("partial_match", 0.0),
579
- "result": data.get("result", "fail"),
580
- "threshold": data.get("threshold", 0.5),
581
- "reason": data.get("partial_match_reason", ""),
582
- }
583
-
584
- if scores:
585
- item["scores"] = scores
586
-
587
- return item
588
-
589
-
590
- def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None,
591
- default_evaluators: Optional[Dict[str, Any]] = None,
592
- agent_name: Optional[str] = None,
593
- cli_version: Optional[str] = None):
594
- """Write results to a schema-compliant eval document JSON file.
595
-
596
- Output follows the eval-document.schema.json format:
597
- {schemaVersion, metadata, default_evaluators?, items: [EvalItem]}
598
- """
599
- try:
600
- try:
601
- current_version = SchemaVersionManager().get_current_version()
602
- except Exception:
603
- current_version = "1.0.0"
604
-
605
- items = [convert_result_to_eval_item(r) for r in results]
606
-
607
- metadata: Dict[str, Any] = {
608
- "evaluatedAt": datetime.now(timezone.utc).isoformat(),
609
- }
610
- if agent_id:
611
- metadata["agentId"] = agent_id
612
- if agent_name:
613
- metadata["agentName"] = agent_name
614
- if cli_version:
615
- metadata["cliVersion"] = cli_version
616
-
617
- output_data: Dict[str, Any] = {
618
- "schemaVersion": current_version,
619
- "metadata": metadata,
620
- }
621
-
622
- if default_evaluators is not None:
623
- output_data["default_evaluators"] = default_evaluators
624
-
625
- output_data["items"] = items
626
-
627
- with open(output_file, 'w', encoding='utf-8') as f:
628
- json.dump(output_data, f, indent=2, ensure_ascii=False)
629
- emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
630
- except Exception as e:
631
- emit_structured_log("error", f"Error writing to JSON file: {e}", operation=Operation.WRITE_OUTPUT)
632
- sys.exit(1)
633
-
634
- def write_results_to_csv(results: List[Dict], output_file: str,
635
- agent_name: Optional[str] = None, agent_id: Optional[str] = None,
636
- cli_version: Optional[str] = None):
637
- """Write results to CSV file."""
638
- try:
639
- with open(output_file, 'w', newline='', encoding='utf-8') as f:
640
- if results:
641
- # Write metadata header
642
- metadata_parts = []
643
- if agent_name:
644
- metadata_parts.append(f"Agent Name: {agent_name}")
645
- if agent_id:
646
- metadata_parts.append(f"Agent ID: {agent_id}")
647
- if cli_version:
648
- metadata_parts.append(f"CLI Version: {cli_version}")
649
- if metadata_parts:
650
- f.write(f"# {' | '.join(metadata_parts)}\n")
651
-
652
- # Write aggregate statistics first if multiple results
653
- if len(results) > 1:
654
- aggregates = calculate_aggregate_statistics(results)
655
- if aggregates:
656
- f.write("# AGGREGATE STATISTICS\n")
657
- f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
658
- for metric_name, stats in aggregates.items():
659
- threshold_str = str(stats.get('threshold', 'N/A'))
660
- prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
661
- total_prompts = stats.get('total_prompts', len(results))
662
- f.write(f"{metric_name},{prompts_evaluated},{total_prompts},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
663
- f.write("\n# INDIVIDUAL RESULTS\n")
664
-
665
- # Write individual results (exclude internal fields)
666
- exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode'}
667
- fieldnames = [k for k in results[0].keys() if k not in exclude_keys]
668
- writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
669
- writer.writeheader()
670
- writer.writerows(results)
671
- emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
672
- except Exception as e:
673
- emit_structured_log("error", f"Error writing to CSV file: {e}", operation=Operation.WRITE_OUTPUT)
674
- sys.exit(1)
675
-
676
- def parse_arguments():
677
- """Parse command line arguments."""
678
- parser = argparse.ArgumentParser(
679
- description="M365 Copilot Agent Evaluation CLI",
680
- formatter_class=argparse.RawDescriptionHelpFormatter,
681
- epilog="""
682
- Examples:
683
- # Run with default prompts
684
- python main.py
685
-
686
- # Run with custom prompts
687
- python main.py --prompts "What is Microsoft Graph?" --expected "Microsoft Graph is a gateway..."
688
-
689
- # Run with prompts from file
690
- python main.py --prompts-file prompts.json
691
-
692
- # Interactive mode
693
- python main.py --interactive
694
-
695
- # Save results to JSON
696
- python main.py --output results.json
697
-
698
- # Save results to CSV
699
- python main.py --output results.csv
700
-
701
- # Save results to HTML and open in browser
702
- python main.py --output report.html
703
-
704
- # Debug-level diagnostics
705
- python main.py --log-level debug
706
-
707
- # Sign out and clear cached authentication tokens
708
- python main.py --signout
709
- """
710
- )
711
-
712
- # Input options (mutually exclusive)
713
- input_group = parser.add_mutually_exclusive_group()
714
- input_group.add_argument(
715
- '--prompts',
716
- nargs='+',
717
- help='List of prompts to evaluate'
718
- )
719
- input_group.add_argument(
720
- '--prompts-file',
721
- type=str,
722
- help='JSON file containing prompts and expected responses'
723
- )
724
- input_group.add_argument(
725
- '--interactive',
726
- action='store_true',
727
- help='Interactive mode to enter prompts'
728
- )
729
-
730
- # Expected responses (only used with --prompts)
731
- parser.add_argument(
732
- '--expected',
733
- nargs='+',
734
- help='List of expected responses (must match number of prompts)'
735
- )
736
-
737
- # Agent ID (--m365-agent-id is primary, --agent-id kept for backward compatibility)
738
- parser.add_argument(
739
- '--m365-agent-id', '--agent-id',
740
- type=str,
741
- default=os.environ.get("M365_AGENT_ID") or os.environ.get("AGENT_ID"),
742
- help='Agent ID (default from M365_AGENT_ID environment variable)'
743
- )
744
-
745
- # Output options
746
- parser.add_argument(
747
- '--output',
748
- type=str,
749
- help='Output file path. Format is determined by file extension: .json, .csv, .html. If not provided, results are printed to console.'
750
- )
751
-
752
- # Behavior options
753
- parser.add_argument(
754
- '--log-level',
755
- nargs='?',
756
- const='info',
757
- action='append',
758
- help='Set log verbosity: debug, info, warning, error. Bare --log-level resolves to info.'
759
- )
760
-
761
- parser.add_argument(
762
- '--signout',
763
- action='store_true',
764
- help='Sign out and clear cached authentication tokens'
765
- )
766
-
767
- return parser.parse_args()
768
-
769
- def validate_environment() -> CallPath:
770
- """Validate required environment variables."""
771
- required_env_vars = [
772
- "AZURE_AI_OPENAI_ENDPOINT",
773
- "AZURE_AI_API_KEY",
774
- "AZURE_AI_API_VERSION",
775
- "AZURE_AI_MODEL_NAME",
776
- # Chat API specific
777
- "COPILOT_API_ENDPOINT",
778
- "X_SCENARIO_HEADER"
779
- ]
780
-
781
- if os.environ.get("COPILOT_API_ACCESS_TOKEN"):
782
- call_path = CallPath.ACCESS_TOKEN
783
- required_env_vars.append("COPILOT_API_ACCESS_TOKEN")
784
- else:
785
- call_path = CallPath.COPILOT_AUTH
786
- required_env_vars.extend([
787
- "M365_EVAL_CLIENT_ID",
788
- "TENANT_ID"
789
- ])
790
-
791
- missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
792
- if missing_vars:
793
- emit_structured_log(
794
- "error",
795
- "Missing required environment variables: "
796
- f"{', '.join(missing_vars)}. Please ensure your .env file contains "
797
- "all required Azure configuration.",
798
- operation=Operation.VALIDATE_ENV,
799
- )
800
- sys.exit(1)
801
- return call_path
802
-
803
- def validate_endpoint_url(url: str, allowed_domains: List[str]) -> bool:
804
- """Validate URL against security requirements."""
805
- try:
806
- parsed = urllib.parse.urlparse(url)
807
-
808
- # Check for dangerous schemes
809
- if parsed.scheme in ['javascript', 'data']:
810
- raise ValueError(f"Dangerous URL scheme detected: {parsed.scheme}")
811
-
812
- # Check for HTTPS requirement
813
- if parsed.scheme != 'https':
814
- raise ValueError(f"Only HTTPS URLs are allowed, got: {parsed.scheme}")
815
-
816
- # Check if domain is in allowed list
817
- if parsed.netloc not in allowed_domains:
818
- raise ValueError(f"Domain not in allowed list: {parsed.netloc}")
819
-
820
- # Reject fragment URLs
821
- if parsed.fragment:
822
- raise ValueError("Fragment URLs are not allowed")
823
-
824
- return True
825
-
826
- except ValueError:
827
- # Re-raise ValueError exceptions
828
- raise
829
- except Exception as e:
830
- # Convert other parsing errors to ValueError
831
- raise ValueError(f"Invalid URL format: {url}") from e
832
-
833
- def get_prompt_datasets(args) -> Tuple[List[Dict], Optional[Dict]]:
834
- """Get prompts and expected responses based on command line arguments.
835
-
836
- Returns:
837
- Tuple of (eval_items, default_evaluators).
838
- """
839
- if args.prompts:
840
- if args.expected and len(args.prompts) != len(args.expected):
841
- emit_structured_log(
842
- "error",
843
- "Number of prompts must match number of expected responses. "
844
- "Update --expected values to match the prompt count.",
845
- )
846
- sys.exit(1)
847
- expected_responses = args.expected or [""] * len(args.prompts)
848
- eval_items = [
849
- {"prompt": p, "expected_response": e}
850
- for p, e in zip(args.prompts, expected_responses)
851
- ]
852
- return eval_items, None
853
- elif args.prompts_file:
854
- return load_prompts_from_file(args.prompts_file)
855
- elif args.interactive:
856
- prompts, expected_responses = get_interactive_prompts()
857
- eval_items = [
858
- {"prompt": p, "expected_response": e}
859
- for p, e in zip(prompts, expected_responses)
860
- ]
861
- return eval_items, None
862
- else:
863
- prompts, expected_responses = get_default_prompts_and_responses()
864
- eval_items = [
865
- {"prompt": p, "expected_response": e}
866
- for p, e in zip(prompts, expected_responses)
867
- ]
868
- return eval_items, None
869
-
870
- def fetch_available_agents(copilot_api_endpoint: str, access_token: str, user_oid: str) -> List[Dict[str, Any]]:
871
- """
872
- Fetch available agents for the user from the Copilot API.
873
-
874
- Args:
875
- access_token: Bearer token for API authentication
876
- user_oid: User object ID for agent filtering
877
-
878
- Returns:
879
- List of agent dictionaries.
880
- """
881
- request_headers = {
882
- "Content-Type": "application/json",
883
- "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
884
- "Authorization": f"Bearer {access_token}"
885
- }
886
-
887
- try:
888
- # Build the query parameter with participant info
889
- request_data = json.dumps({"participant": {"id": user_oid}})
890
- query_param = urllib.parse.quote(request_data)
891
-
892
- # Try to fetch agents from /GetGptList endpoint
893
- req = urllib.request.Request(
894
- f"{copilot_api_endpoint}/GetGptList?request={query_param}",
895
- headers=request_headers,
896
- method="GET"
897
- )
898
- with urllib.request.urlopen(req, timeout=120) as resp:
899
- data = json.loads(resp.read().decode("utf-8"))
900
- agents = data.get("gptList", [])
901
- return agents
902
- except urllib.error.HTTPError as e:
903
- # If endpoint doesn't exist or returns error, return empty list
904
- emit_structured_log("warning", f"Unable to fetch agents list (HTTP {e.code}).", operation=Operation.FETCH_AGENTS)
905
- return []
906
- except Exception as e:
907
- emit_structured_log("warning", f"Error fetching agents: {e}", operation=Operation.FETCH_AGENTS)
908
- return []
909
-
910
- def select_agent_interactively(agents: List[Dict[str, Any]]) -> Tuple[Optional[str], Optional[str]]:
911
- """
912
- Display an interactive agent selector using questionary.
913
-
914
- Args:
915
- agents: List of agent dictionaries.
916
-
917
- Returns:
918
- Tuple of (agent_id, agent_name) or (None, None) if cancelled/skipped
919
- """
920
- if not agents:
921
- return None, None
922
-
923
- # Build id→name lookup and choices
924
- id_to_name: Dict[str, str] = {}
925
- choices = []
926
- sorted_agents = sorted(agents, key=lambda x: (not x.get('isOwner', False), x.get('name', '')))
927
- for agent in sorted_agents:
928
- agent_id = agent.get("gptId", "Unknown")
929
- agent_name = agent.get("name", "Unknown")
930
- agent_description = agent.get("description", "Unknown")
931
- agent_is_owner = agent.get('isOwner')
932
- id_to_name[agent_id] = agent_name
933
-
934
- # Format the display text
935
- display_text = f"{agent_name} ({agent_id}, IsOwner: {agent_is_owner}) - {agent_description}"
936
-
937
- choices.append(questionary.Choice(title=display_text, value=agent_id))
938
-
939
- # Display the selection prompt
940
- selected_agent = questionary.select(
941
- "Select an agent to evaluate:",
942
- choices=choices,
943
- use_shortcuts=True,
944
- use_arrow_keys=True
945
- ).ask()
946
-
947
- return selected_agent, id_to_name.get(selected_agent) if selected_agent else None
948
-
949
- @functools.lru_cache(maxsize=1)
950
- def _get_iana_timezone_name() -> str:
951
- """Get the IANA timezone name from the system using tzlocal.
952
-
953
- Tries get_localzone_name() first; falls back to str(get_localzone()) when the
954
- former raises (e.g. no zone configured on some Unix systems). Result is cached
955
- after the first call so tzlocal is only invoked once per session.
956
- """
957
- try:
958
- return tzlocal.get_localzone_name()
959
- except Exception:
960
- return str(tzlocal.get_localzone())
961
-
962
-
963
- @functools.lru_cache(maxsize=1)
964
- def _get_location_info() -> Dict[str, Any]:
965
- """Return a locationInfo dict containing the local UTC offset and IANA timezone name.
966
-
967
- Result is cached after the first call so the computation runs only once per session.
968
- """
969
- now = datetime.now().astimezone()
970
- utc_offset = now.utcoffset()
971
- offset_hours = int(utc_offset.total_seconds() // 3600) if utc_offset is not None else 0
972
- return {
973
- "timeZoneOffset": offset_hours,
974
- "timeZone": _get_iana_timezone_name(),
975
- }
976
-
977
-
978
- def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:
979
- message = {
980
- "message": {
981
- "text": prompt,
982
- "author": "user",
983
- "messageType": "chat",
984
- "timestamp": datetime.now(timezone.utc).isoformat(),
985
- "locationInfo": _get_location_info(),
986
- "from": {
987
- "id": user_oid,
988
- }
989
- },
990
- "verbosity": "verbose", # To enable detailed telemetry in response (to extract tool usage, etc.)
991
- }
992
-
993
- if agent_id:
994
- message["gpts"] = [
995
- {
996
- "id": agent_id.strip(),
997
- "source": "MOS3"
998
- }
999
- ]
1000
- message["optionsSets"] = [
1001
- "disable_action_confirmation" # Disable 3P action confirmation prompts for agents while scraping
1002
- ]
1003
-
1004
- return json.dumps(message).encode("utf-8")
1005
-
1006
- def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str, access_token: str, user_oid: str, args) -> List[Dict[str, Any]]:
1007
- """ Send prompts to the chat API and return enhanced responses. """
1008
-
1009
- request_headers = {
1010
- "Content-Type": "application/json",
1011
- "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
1012
- "Authorization": f"Bearer {access_token}"
1013
- }
1014
- raw_responses: List[Tuple[str, str]] = []
1015
- for i, prompt in enumerate(prompts, 1):
1016
- if getattr(args, "effective_log_level", "info") in ("info", "debug"):
1017
- emit_structured_log("info", f"Processing prompt {i}/{len(prompts)}.", operation=Operation.SEND_PROMPT)
1018
-
1019
- # Build the payload
1020
- payload = build_chat_payload(prompt, user_oid, args.m365_agent_id)
1021
- if getattr(args, "effective_log_level", "info") == "debug":
1022
- emit_structured_log("debug", f"[Sydney] Sending payload: {payload.decode('utf-8')}", operation=Operation.SEND_PROMPT)
1023
-
1024
- # Send the request to /chat
1025
- req = urllib.request.Request(f"{copilot_api_endpoint}/chat", data=payload, headers=request_headers, method="POST")
1026
- try:
1027
- with urllib.request.urlopen(req, timeout=120) as resp:
1028
- raw = resp.read().decode("utf-8", errors="replace")
1029
- except urllib.error.HTTPError as e:
1030
- error_body = None
1031
- try:
1032
- error_body = e.read().decode("utf-8", errors="replace")
1033
- except Exception:
1034
- pass
1035
- msg = f"Chat API request failed (HTTP {e.code} {e.reason})."
1036
- if error_body:
1037
- msg += f" Body: {error_body[:500]}"
1038
- raise RuntimeError(msg) from e
1039
- except urllib.error.URLError as e:
1040
- raise RuntimeError(f"Chat API connection error: {getattr(e, 'reason', str(e))}") from e
1041
-
1042
- if getattr(args, "effective_log_level", "info") == "debug":
1043
- emit_structured_log("debug", f"[Sydney] Raw response: {raw}", operation=Operation.SEND_PROMPT)
1044
-
1045
- # Store raw response for enhancement
1046
- raw_responses.append((prompt, raw.strip()))
1047
-
1048
- # Extract enhanced responses using the new extractor
1049
- enhanced_responses = extract_enhanced_responses(raw_responses, log_level=getattr(args, "effective_log_level", "info"))
1050
-
1051
- if getattr(args, "effective_log_level", "info") == "debug":
1052
- for idx, enhanced in enumerate(enhanced_responses, 1):
1053
- metadata = enhanced.get("metadata", {})
1054
- context = {
1055
- "request-id": metadata.get("request_id"),
1056
- "conversation-id": metadata.get("conversation_id"),
1057
- "message-id": metadata.get("message_id"),
1058
- "operation": Operation.SEND_PROMPT,
1059
- }
1060
- entry = format_structured_log_entry(
1061
- level="debug",
1062
- message=f"Response IDs for prompt {idx}/{len(enhanced_responses)}.",
1063
- logger_name=CLI_LOGGER_NAME,
1064
- run_context=context,
1065
- )
1066
- DIAGNOSTIC_RECORDS.append(entry)
1067
- CLI_LOGGER.log(logging.DEBUG, render_diagnostic(entry))
31
+ from cli_args import parse_arguments, should_bypass_min_version_check
32
+ from env_validator import (
33
+ ALLOWED_ENDPOINTS,
34
+ validate_endpoint_url,
35
+ validate_environment,
36
+ )
37
+ from common import (
38
+ ENV_AZURE_AI_OPENAI_ENDPOINT,
39
+ ENV_AZURE_AI_API_KEY,
40
+ ENV_AZURE_AI_API_VERSION,
41
+ ENV_AZURE_AI_MODEL_NAME,
42
+ ENV_TENANT_ID,
43
+ ENV_WORK_IQ_A2A_ENDPOINT,
44
+ ENV_WORK_IQ_A2A_CLIENT_ID,
45
+ ENV_WORK_IQ_A2A_SCOPES,
46
+ RunConfig,
47
+ )
48
+ from prompt_loader import get_prompt_datasets
49
+ from agent_selector import select_agent_interactively
50
+ from evaluation_runner import PipelineConfig, run_pipeline
51
+ from result_writer import output_results
1068
52
 
1069
- return enhanced_responses
53
+ from dataclasses import replace
1070
54
 
1071
- def output_results(results: List[Dict], args, default_evaluators: Optional[Dict[str, Any]] = None,
1072
- agent_name: Optional[str] = None, cli_version: Optional[str] = None):
1073
- """Output results based on specified format."""
1074
- metadata_kwargs = dict(
1075
- agent_name=agent_name,
1076
- agent_id=getattr(args, 'm365_agent_id', None),
1077
- cli_version=cli_version,
1078
- )
1079
- if args.output:
1080
- output_lower = args.output.lower()
1081
- if output_lower.endswith('.json'):
1082
- write_results_to_json(results, args.output, default_evaluators=default_evaluators,
1083
- **metadata_kwargs)
1084
- elif output_lower.endswith('.csv'):
1085
- write_results_to_csv(results, args.output, **metadata_kwargs)
1086
- elif output_lower.endswith('.html'):
1087
- write_results_to_html(results, args.output, **metadata_kwargs)
1088
- abs_path = os.path.abspath(args.output)
1089
- webbrowser.open(f'file://{abs_path}')
1090
- else:
1091
- write_results_to_json(results, args.output, default_evaluators=default_evaluators,
1092
- **metadata_kwargs)
1093
- else:
1094
- write_results_to_console(results, **metadata_kwargs)
1095
55
 
1096
56
  def main():
1097
57
  """Main function to orchestrate the evaluation process."""
@@ -1107,136 +67,159 @@ def main():
1107
67
  )
1108
68
  sys.exit(2)
1109
69
 
1110
- args.effective_log_level = effective_log_level
1111
- configure_cli_logging(effective_log_level)
1112
- emit_structured_log("info", f"Log level set to '{effective_log_level}'.", operation=Operation.SETUP)
70
+ config = replace(
71
+ RunConfig.from_namespace(args),
72
+ effective_log_level=effective_log_level,
73
+ )
74
+ configure_cli_logging(config.effective_log_level)
75
+ emit_structured_log("info", f"Log level set to '{config.effective_log_level}'.", operation=Operation.SETUP)
1113
76
 
1114
77
  # Check minimum version before proceeding
1115
- quiet_for_version = effective_log_level in ("warning", "error")
78
+ quiet_for_version = config.effective_log_level in ("warning", "error")
1116
79
  cli_version = get_cli_version(quiet=quiet_for_version)
1117
- if not should_bypass_min_version_check(args) and not check_min_version(cli_version, quiet=quiet_for_version):
80
+ if not should_bypass_min_version_check(config) and not check_min_version(cli_version, quiet=quiet_for_version):
1118
81
  sys.exit(1)
1119
82
 
1120
83
  # Validate environment variables required for evaluation
1121
- call_path = validate_environment()
1122
- copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
1123
- validate_endpoint_url(copilot_api_endpoint, ALLOWED_ENDPOINTS)
1124
-
1125
- user_oid = ""
1126
-
1127
- if call_path == CallPath.ACCESS_TOKEN:
1128
- access_token = os.environ["COPILOT_API_ACCESS_TOKEN"]
1129
- else:
1130
- scopes_str = os.environ.get(
1131
- "COPILOT_SCOPES", "https://substrate.office.com/sydney/.default"
1132
- )
1133
-
1134
- auth_handler = AuthHandler(
1135
- client_id=os.environ["M365_EVAL_CLIENT_ID"],
1136
- tenant_id=os.environ["TENANT_ID"],
1137
- scopes_str=scopes_str
1138
- )
84
+ validate_environment()
1139
85
 
1140
- # Signout user
1141
- if args.signout:
1142
- try:
1143
- auth_handler.clear_cache()
1144
- except Exception as e:
1145
- emit_structured_log("error", f"Error during signout: {e}", operation=Operation.AUTHENTICATE)
1146
- sys.exit(1)
1147
- sys.exit(0)
86
+ a2a_endpoint = os.environ[ENV_WORK_IQ_A2A_ENDPOINT]
87
+ validate_endpoint_url(a2a_endpoint, ALLOWED_ENDPOINTS)
1148
88
 
1149
- # Authenticate before loading prompts
1150
- try:
1151
- auth_result = auth_handler.acquire_token_interactive() or {}
1152
- access_token = auth_result.get("access_token") or ""
1153
- if not access_token:
1154
- raise RuntimeError("Failed to acquire access token from authentication result")
1155
-
1156
- id_token_claims = auth_result.get("id_token_claims")
1157
- if not isinstance(id_token_claims, dict):
89
+ a2a_scopes_str = os.environ.get(ENV_WORK_IQ_A2A_SCOPES, "")
90
+ a2a_auth_handler = AuthHandler(
91
+ client_id=os.environ[ENV_WORK_IQ_A2A_CLIENT_ID],
92
+ tenant_id=os.environ[ENV_TENANT_ID],
93
+ scopes_str=a2a_scopes_str,
94
+ )
95
+ if config.signout:
96
+ try:
97
+ a2a_auth_handler.clear_cache()
98
+ except Exception as e:
1158
99
  emit_structured_log(
1159
- "warning", "id_token_claims is missing or invalid in authentication result",
100
+ "error",
101
+ f"Error during signout: {e}",
1160
102
  operation=Operation.AUTHENTICATE,
1161
103
  )
1162
- else:
1163
- user_oid = id_token_claims.get("oid") or ""
1164
-
1165
- except Exception as e:
1166
- emit_structured_log("error", f"Error during authentication: {e}", operation=Operation.AUTHENTICATE)
1167
- if effective_log_level == "debug":
1168
- import traceback
1169
- traceback.print_exc()
1170
- sys.exit(1)
104
+ sys.exit(1)
105
+ sys.exit(0)
1171
106
 
1172
- if not user_oid and access_token:
1173
- # Fallback: extract from access token.
1174
- user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)
107
+ try:
108
+ a2a_auth_result = a2a_auth_handler.acquire_token_interactive() or {}
109
+ a2a_access_token = a2a_auth_result.get("access_token") or ""
110
+ if not a2a_access_token:
111
+ raise RuntimeError("Failed to acquire A2A access token")
112
+ except Exception as e:
113
+ emit_structured_log(
114
+ "error",
115
+ f"Error during A2A authentication: {e}",
116
+ operation=Operation.AUTHENTICATE,
117
+ )
118
+ if config.effective_log_level == "debug":
119
+ traceback.print_exc()
120
+ sys.exit(1)
121
+ try:
122
+ agent_client = A2AClient(
123
+ a2a_endpoint=a2a_endpoint,
124
+ access_token=a2a_access_token,
125
+ logger=CLI_LOGGER,
126
+ diagnostic_records=DIAGNOSTIC_RECORDS,
127
+ )
128
+ except Exception as e:
129
+ emit_structured_log(
130
+ "error",
131
+ f"Failed to initialize A2A client: {e}",
132
+ operation=Operation.SETUP,
133
+ )
134
+ sys.exit(1)
1175
135
 
1176
136
  # 1. Load evaluation datasets
1177
- eval_items, file_default_evaluators = get_prompt_datasets(args)
137
+ eval_items, file_default_evaluators = get_prompt_datasets(config)
1178
138
  default_evaluators = resolve_default_evaluators(file_default_evaluators)
1179
- prompts = [eval_item.get("prompt", "") for eval_item in eval_items]
1180
139
 
1181
- if effective_log_level in ("info", "debug"):
1182
- emit_structured_log("info", f"Running evaluation on {len(prompts)} prompt(s).", operation=Operation.SETUP)
140
+ if config.effective_log_level in ("info", "debug"):
141
+ multi_turn_count = sum(1 for item in eval_items if "turns" in item)
142
+ single_turn_count = len(eval_items) - multi_turn_count
143
+ emit_structured_log(
144
+ "info",
145
+ f"Running evaluation on {len(eval_items)} item(s) "
146
+ f"({single_turn_count} single-turn, {multi_turn_count} multi-turn).",
147
+ operation=Operation.SETUP,
148
+ )
1183
149
 
1184
150
  agent_name = None
1185
151
  try:
1186
- # 3. Agent selection - if no agent ID provided, prompt user to select
1187
- if not args.m365_agent_id:
1188
- if effective_log_level in ("info", "debug"):
152
+ # 2. Agent selection - when no agent ID is provided, discover agents
153
+ # via the active client (A2A) and prompt interactively.
154
+ if not config.m365_agent_id:
155
+ if config.effective_log_level in ("info", "debug"):
1189
156
  emit_structured_log("info", "No agent ID provided. Fetching available agents.", operation=Operation.FETCH_AGENTS)
1190
-
1191
- available_agents = fetch_available_agents(copilot_api_endpoint, access_token, user_oid)
157
+
158
+ available_agents = agent_client.fetch_available_agents()
1192
159
  if not available_agents:
1193
160
  emit_structured_log(
1194
- "error",
1195
- "No agents are available for interactive selection. Re-run with "
1196
- "--m365-agent-id or set M365_AGENT_ID.",
1197
- operation=Operation.FETCH_AGENTS,
161
+ "error",
162
+ "No agents are available for interactive selection."
163
+ " Re-run with --m365-agent-id or set M365_AGENT_ID.",
164
+ operation=Operation.FETCH_AGENTS,
1198
165
  )
1199
166
  sys.exit(1)
1200
167
 
1201
168
  selected_agent_id, agent_name = select_agent_interactively(available_agents)
1202
169
  if selected_agent_id:
1203
- args.m365_agent_id = selected_agent_id
1204
- if effective_log_level in ("info", "debug"):
1205
- emit_structured_log("info", f"Selected agent: {args.m365_agent_id}", operation=Operation.FETCH_AGENTS)
170
+ config = replace(config, m365_agent_id=selected_agent_id)
171
+ if config.effective_log_level in ("info", "debug"):
172
+ emit_structured_log("info", f"Selected agent: {config.m365_agent_id}", operation=Operation.FETCH_AGENTS)
1206
173
  else:
1207
174
  emit_structured_log(
1208
- "error",
1209
- "No agent selected. Re-run with --m365-agent-id or set M365_AGENT_ID.",
1210
- operation=Operation.FETCH_AGENTS,
175
+ "error",
176
+ "No agent selected. Re-run with --m365-agent-id or set M365_AGENT_ID.",
177
+ operation=Operation.FETCH_AGENTS,
1211
178
  )
1212
179
  sys.exit(1)
1213
-
1214
- # 4. Send prompts to chat API
1215
- responses = send_prompt_to_agent_in_sydney(prompts, copilot_api_endpoint, access_token, user_oid, args)
1216
180
  except Exception as e:
1217
- emit_structured_log("error", f"Error sending prompts to chat API: {e}", operation=Operation.SEND_PROMPT)
1218
- if effective_log_level == "debug":
1219
- import traceback
181
+ emit_structured_log("error", f"Error during agent discovery: {e}", operation=Operation.FETCH_AGENTS)
182
+ if config.effective_log_level == "debug":
1220
183
  traceback.print_exc()
1221
184
  sys.exit(1)
1222
185
 
186
+ # Pre-resolve agent endpoint (A2A agent card lookup)
187
+ if config.m365_agent_id:
188
+ agent_client.resolve_agent(config.m365_agent_id)
189
+
190
+ # 3. Build pipeline config and run evaluation pipeline
191
+ model_config = AzureOpenAIModelConfiguration(
192
+ azure_endpoint=os.environ.get(ENV_AZURE_AI_OPENAI_ENDPOINT),
193
+ api_key=os.environ.get(ENV_AZURE_AI_API_KEY),
194
+ api_version=os.environ.get(ENV_AZURE_AI_API_VERSION),
195
+ azure_deployment=os.environ.get(ENV_AZURE_AI_MODEL_NAME),
196
+ )
197
+ has_azure_openai = bool(
198
+ os.environ.get(ENV_AZURE_AI_OPENAI_ENDPOINT)
199
+ and os.environ.get(ENV_AZURE_AI_API_KEY)
200
+ )
201
+
202
+ pipeline = PipelineConfig(
203
+ agent_client=agent_client,
204
+ model_config=model_config,
205
+ has_azure_openai=has_azure_openai,
206
+ default_evaluators=default_evaluators,
207
+ )
208
+
209
+ results = run_pipeline(pipeline, eval_items, config)
210
+
211
+ # 4. Output results
212
+ output_results(
213
+ results, config, default_evaluators=default_evaluators,
214
+ agent_name=agent_name, cli_version=str(cli_version) if cli_version else None)
1223
215
 
1224
- # 5. Run evaluations
1225
- if effective_log_level in ("info", "debug"):
1226
- emit_structured_log("info", "Running evaluations.", operation=Operation.EVALUATE)
1227
- results = run_evaluations(args, responses, eval_items, default_evaluators)
1228
-
1229
- # 6. Output results
1230
- output_results(results, args, default_evaluators=default_evaluators,
1231
- agent_name=agent_name, cli_version=str(cli_version) if cli_version else None)
1232
-
1233
- if effective_log_level in ("info", "debug"):
216
+ if config.effective_log_level in ("info", "debug"):
1234
217
  emit_structured_log(
1235
218
  "info",
1236
- f"Evaluation completed successfully. Processed {len(prompts)} prompt(s).",
219
+ f"Evaluation completed successfully. Processed {len(eval_items)} item(s).",
1237
220
  operation=Operation.EVALUATE,
1238
221
  )
1239
222
 
1240
223
  # Call the main function when script is run directly
1241
- if __name__ == "__main__":
224
+ if __name__ == "__main__": # pragma: no cover
1242
225
  main()
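
For reference, the JSON output path that previously lived in write_results_to_json and convert_result_to_eval_item (removed above) now belongs to the new result_writer.py, whose contents are not shown in this diff. Based on that removed code, an output eval document has roughly the shape sketched below; whether the 1.5.0-preview.1 writer preserves it exactly, and how multi-turn items are represented (cf. the new valid example multi-turn-output.json), is an assumption.

    # Sketch of a schema-compliant output document as produced by the removed
    # write_results_to_json (values are illustrative only).
    output_document = {
        "schemaVersion": "1.0.0",
        "metadata": {
            "evaluatedAt": "2025-01-01T00:00:00+00:00",  # UTC timestamp
            "agentId": "<agent id>",
            "agentName": "<agent name>",
            "cliVersion": "<cli version>",
        },
        # "default_evaluators": {...},  # included only when defaults were resolved
        "items": [
            {
                "prompt": "What is Microsoft Graph?",
                "response": "<agent response text>",
                "expected_response": "Microsoft Graph is a gateway to data and intelligence in Microsoft 365.",
                "scores": {
                    # EvalScore shape from the removed extract_eval_score:
                    # {score, result, threshold} plus an optional reason.
                    "relevance": {"score": 4, "result": "pass", "threshold": 3},
                    "coherence": {"score": 4, "result": "pass", "threshold": 3},
                },
            },
        ],
    }
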