@microsoft/m365-copilot-eval 1.4.0-preview.1 → 1.5.0-preview.1

This diff compares publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
@@ -1,1546 +1,58 @@
1
- import json
1
+ """M365 Copilot Agent Evaluation CLI — thin orchestrator.
2
+
3
+ Delegates to focused modules:
4
+ cli_args – argument parsing & version-check bypass
5
+ env_validator – environment validation & URL security
6
+ prompt_loader – dataset loading & agent selection
7
+ evaluation_runner – pipeline, evaluator dispatch, retry
8
+ result_writer – console / JSON / CSV / HTML output
9
+ """
10
+
2
11
  import os
3
- import argparse
4
12
  import sys
5
- import csv
6
- import logging
7
- import time
8
- import webbrowser
9
- import urllib.parse
10
- import questionary
11
- from dataclasses import dataclass, field
12
- from enum import Enum
13
- from typing import List, Dict, Tuple, Optional, Any
14
-
15
- from api_clients.A2A import A2AClient
16
- from api_clients.REST import SydneyClient
17
- from api_clients.base_agent_client import BaseAgentClient
13
+ import traceback
18
14
 
19
- from azure.ai.evaluation import (
20
- AzureOpenAIModelConfiguration,
21
- RelevanceEvaluator,
22
- CoherenceEvaluator,
23
- GroundednessEvaluator,
24
- ToolCallAccuracyEvaluator
25
- )
15
+ from azure.ai.evaluation import AzureOpenAIModelConfiguration
26
16
  from dotenv import load_dotenv
17
+
18
+ from api_clients.A2A import A2AClient
27
19
  from auth.auth_handler import AuthHandler
28
- from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat
29
- from custom_evaluators.ExactMatchEvaluator import ExactMatchEvaluator
30
- from custom_evaluators.PartialMatchEvaluator import PartialMatchEvaluator
31
- from generate_report import generate_html_report, calculate_aggregate_statistics
32
- from response_extractor import get_response_text_for_evaluation
33
- from schema_handler import DocumentUpgrader, SchemaVersionManager
34
- from common import (
35
- RELEVANCE,
36
- COHERENCE,
37
- GROUNDEDNESS,
38
- TOOL_CALL_ACCURACY,
39
- CITATIONS,
40
- EXACT_MATCH,
41
- PARTIAL_MATCH,
42
- REQUIRES_AZURE_OPENAI,
43
- REQUIRES_TOOL_DEFINITIONS,
44
- METRIC_IDS,
45
- STATUS_PASS,
46
- STATUS_FAIL,
47
- STATUS_ERROR,
48
- STATUS_PARTIAL,
49
- STATUS_UNKNOWN,
50
- pascal_case_to_title,
51
- )
52
- from evaluator_resolver import (
53
- validate_evaluator_names,
54
- check_prerequisites,
55
- resolve_default_evaluators,
56
- resolve_evaluators_for_prompt,
57
- get_evaluator_threshold,
58
- )
20
+ from evaluator_resolver import resolve_default_evaluators
59
21
  from version_check import check_min_version, get_cli_version
60
- from datetime import datetime, timezone
61
- from pathlib import Path
62
-
63
22
 
64
- from cli_logging.console_diagnostics import emit_structured_log as _emit_structured_log
65
- from cli_logging.logging_utils import LOG_LEVEL_MAP, Operation, resolve_log_level
66
- from parallel_executor import execute_in_parallel
67
- from throttle_gate import ThrottleGate
68
- from retry_policy import (
69
- is_retryable_status,
70
- get_backoff_seconds,
71
- get_retry_after_seconds,
23
+ from cli_logging.cli_logger import (
24
+ CLI_LOGGER,
25
+ DIAGNOSTIC_RECORDS,
26
+ configure_cli_logging,
27
+ emit_structured_log,
72
28
  )
29
+ from cli_logging.logging_utils import Operation, resolve_log_level
73
30
 
74
- # Allowed endpoints for URL validation
75
- ALLOWED_ENDPOINTS = [
76
- 'substrate.office.com',
77
- 'graph.microsoft.com',
78
- ]
79
-
80
- MAX_CONCURRENCY = 5
81
- MAX_ATTEMPTS = 4 # Initial attempt + 3 retries
82
- MAX_TURNS_PER_THREAD = 20
83
- LONG_THREAD_WARNING_THRESHOLD = 10
84
-
85
-
86
- @dataclass
87
- class PipelineConfig:
88
- """Runtime configuration for the evaluation pipeline."""
89
- agent_client: BaseAgentClient
90
- model_config: AzureOpenAIModelConfiguration
91
- has_azure_openai: bool
92
- default_evaluators: Dict[str, Any]
93
- chat_gate: ThrottleGate = field(default_factory=lambda: ThrottleGate("chat_api"))
94
- is_retryable_status: Any = field(default=is_retryable_status)
95
- get_backoff_seconds: Any = field(default=get_backoff_seconds)
96
- get_retry_after_seconds: Any = field(default=get_retry_after_seconds)
97
-
98
- class CallPath(Enum):
99
- """ Enum to indicate which call path to use. """
100
- ACCESS_TOKEN = "access_token"
101
- COPILOT_AUTH = "copilot_auth"
102
- A2A = "a2a"
103
-
104
-
105
- class ItemType(Enum):
106
- SINGLE_TURN = "single_turn"
107
- MULTI_TURN = "multi_turn"
108
-
109
-
110
- def detect_item_type(item: dict) -> ItemType:
111
- """Determine if an evaluation item is single-turn or multi-turn.
112
-
113
- Returns ItemType.SINGLE_TURN if item has 'prompt' without 'turns',
114
- ItemType.MULTI_TURN if item has 'turns' array.
115
-
116
- Raises ValueError for invalid items (both, neither, or invalid turns).
117
- """
118
- has_turns = "turns" in item
119
- has_prompt = "prompt" in item
120
-
121
- if has_turns and has_prompt:
122
- raise ValueError(
123
- "Invalid evaluation item: cannot have both 'turns' and 'prompt'. "
124
- "Use 'turns' for multi-turn threads or 'prompt' for single-turn."
125
- )
126
-
127
- if has_turns and not isinstance(item["turns"], list):
128
- raise ValueError("Invalid evaluation item: 'turns' must be a list")
129
-
130
- if has_turns:
131
- if len(item["turns"]) == 0:
132
- raise ValueError("Invalid multi-turn thread: 'turns' array cannot be empty")
133
- return ItemType.MULTI_TURN
134
-
135
- if has_prompt:
136
- return ItemType.SINGLE_TURN
137
-
138
- raise ValueError(
139
- "Invalid evaluation item: must have either 'turns' array (multi-turn) "
140
- "or 'prompt' field (single-turn)"
141
- )
142
-
143
-
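For reference, a minimal sketch of how these classification rules behave, assuming detect_item_type and ItemType from the code above are in scope (the example items are illustrative):

# Single-turn: has 'prompt' and no 'turns'
single = {"prompt": "What is Microsoft Graph?", "expected_response": "A gateway to Microsoft 365 data."}
# Multi-turn: has a non-empty 'turns' list
multi = {"name": "Follow-ups", "turns": [{"prompt": "Q1"}, {"prompt": "Q2"}]}

assert detect_item_type(single) is ItemType.SINGLE_TURN
assert detect_item_type(multi) is ItemType.MULTI_TURN
# Items with both 'prompt' and 'turns', a non-list 'turns', an empty 'turns' list,
# or neither field raise ValueError.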
144
- # Flags that should bypass remote min-version enforcement.
145
- # --help is not needed here because argparse exits before runtime checks.
146
- VERSION_CHECK_BYPASS_FLAGS = (
147
- "signout",
31
+ from cli_args import parse_arguments, should_bypass_min_version_check
32
+ from env_validator import (
33
+ ALLOWED_ENDPOINTS,
34
+ validate_endpoint_url,
35
+ validate_environment,
148
36
  )
37
+ from common import (
38
+ ENV_AZURE_AI_OPENAI_ENDPOINT,
39
+ ENV_AZURE_AI_API_KEY,
40
+ ENV_AZURE_AI_API_VERSION,
41
+ ENV_AZURE_AI_MODEL_NAME,
42
+ ENV_TENANT_ID,
43
+ ENV_WORK_IQ_A2A_ENDPOINT,
44
+ ENV_WORK_IQ_A2A_CLIENT_ID,
45
+ ENV_WORK_IQ_A2A_SCOPES,
46
+ RunConfig,
47
+ )
48
+ from prompt_loader import get_prompt_datasets
49
+ from agent_selector import select_agent_interactively
50
+ from evaluation_runner import PipelineConfig, run_pipeline
51
+ from result_writer import output_results
149
52
 
150
- CLI_LOGGER_NAME = "m365.eval.cli"
151
- CLI_LOGGER = logging.getLogger(CLI_LOGGER_NAME)
152
- DIAGNOSTIC_RECORDS: List[Dict[str, Any]] = []
153
-
154
-
155
- def configure_cli_logging(effective_log_level: str) -> None:
156
- if not CLI_LOGGER.handlers:
157
- handler = logging.StreamHandler(sys.stdout)
158
- handler.setFormatter(logging.Formatter("%(message)s"))
159
- CLI_LOGGER.addHandler(handler)
160
- CLI_LOGGER.propagate = False
161
- CLI_LOGGER.setLevel(LOG_LEVEL_MAP[effective_log_level])
162
-
163
-
164
- def emit_structured_log(level: str, message: str, operation: str = Operation.EVALUATE) -> None:
165
- _emit_structured_log(
166
- level, message, operation,
167
- logger=CLI_LOGGER,
168
- diagnostic_records=DIAGNOSTIC_RECORDS,
169
- )
170
-
171
-
172
- def should_bypass_min_version_check(args: argparse.Namespace) -> bool:
173
- """Return True if the current invocation should skip min-version checks."""
174
- return any(getattr(args, flag, False) for flag in VERSION_CHECK_BYPASS_FLAGS)
175
-
176
- def write_results_to_html(results: List[Dict], output_file: str,
177
- agent_name: Optional[str] = None, agent_id: Optional[str] = None,
178
- cli_version: Optional[str] = None):
179
- """Write results to HTML file using generate_html_report from generate_report.py."""
180
- try:
181
- html = generate_html_report(results, agent_name=agent_name, agent_id=agent_id,
182
- cli_version=cli_version)
183
- with open(output_file, 'w', encoding='utf-8') as f:
184
- f.write(html)
185
- emit_structured_log("info", f"HTML report saved to {output_file}", operation=Operation.WRITE_OUTPUT)
186
- except Exception as e:
187
- emit_structured_log("error", f"Error writing to HTML file: {e}", operation=Operation.WRITE_OUTPUT)
188
- sys.exit(1)
189
-
190
- def get_default_prompts_and_responses():
191
- """Get a list of prompts and responses."""
192
- prompts = [
193
- "What is Microsoft Graph?"
194
- ]
195
- expected_responses = [
196
- "Microsoft Graph is a gateway to data and intelligence in Microsoft 365."
197
- ]
198
- return prompts, expected_responses
199
-
200
- def load_prompts_from_file(file_path: str) -> Tuple[List[Dict], Optional[Dict]]:
201
- """Load prompts and expected responses from a JSON file.
202
-
203
- Supports three formats:
204
- 1. Eval document: {"schemaVersion": "1.0.0", "items": [{"prompt": "..."}]}
205
- 2. Array format: [{"prompt": "...", "expected_response": "..."}]
206
- 3. Dict format: {"prompts": [...], "expected_responses": [...]}
207
-
208
- For eval documents (format 1) and array format (format 2), schema validation
209
- and auto-upgrade are applied via DocumentUpgrader.
210
-
211
- Returns:
212
- Tuple of (eval_items, default_evaluators). Items are dicts with prompt,
213
- expected_response, and optional evaluators/evaluators_mode fields.
214
- """
215
- try:
216
- with open(file_path, 'r', encoding='utf-8') as f:
217
- data = json.load(f)
218
-
219
- # Detect if this is an eval document (has "items" key) or could be upgraded
220
- is_eval_document = (
221
- isinstance(data, dict) and "items" in data
222
- ) or isinstance(data, list)
223
-
224
- # Run schema validation and auto-upgrade for eval documents
225
- if is_eval_document:
226
- try:
227
- upgrader = DocumentUpgrader()
228
- except Exception as e:
229
- # Schema infrastructure not available (missing files, etc.) — skip
230
- emit_structured_log("warning", f"Unable to initialize document upgrader: {e}", operation=Operation.LOAD_PROMPTS)
231
- upgrader = None
232
-
233
- if upgrader is not None:
234
- result = upgrader.upgrade(Path(file_path))
235
-
236
- if result.error:
237
- emit_structured_log("error", f"Schema validation error: {result.error}", operation=Operation.LOAD_PROMPTS)
238
- sys.exit(1)
239
-
240
- if result.upgraded and result.message:
241
- emit_structured_log("info", result.message, operation=Operation.LOAD_PROMPTS)
242
-
243
- # Use the parsed document from the upgrade result
244
- if result.document is not None:
245
- data = result.document
246
-
247
- if isinstance(data, list):
248
- # Format: [{"prompt": "...", "expected_response": "..."}, ...]
249
- return data, None
250
- elif isinstance(data, dict):
251
- if "items" in data:
252
- # Eval document format: {"schemaVersion": "...", "items": [...]}
253
- return data["items"], data.get("default_evaluators")
254
- else:
255
- # Format: {"prompts": [...], "expected_responses": [...]}
256
- prompts = data.get("prompts", [])
257
- expected_responses = data.get("expected_responses", [])
258
- eval_items = [
259
- {"prompt": p, "expected_response": e}
260
- for p, e in zip(prompts, expected_responses)
261
- ]
262
- return eval_items, None
263
- else:
264
- raise ValueError("Invalid file format")
265
- except SystemExit:
266
- raise
267
- except Exception as e:
268
- emit_structured_log("error", f"Error loading prompts from file: {e}", operation=Operation.LOAD_PROMPTS)
269
- sys.exit(1)
270
-
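The three accepted input shapes, sketched as Python literals (values are illustrative, not taken from the package):

# 1. Eval document: schema-validated and auto-upgraded via DocumentUpgrader
eval_document = {
    "schemaVersion": "1.0.0",
    "items": [{"prompt": "What is Microsoft Graph?", "expected_response": "A gateway to Microsoft 365 data."}],
}
# 2. Array format: also routed through schema validation/upgrade
array_format = [{"prompt": "What is Microsoft Graph?", "expected_response": "A gateway to Microsoft 365 data."}]
# 3. Dict format: prompts and expected_responses are zipped positionally
dict_format = {"prompts": ["What is Microsoft Graph?"], "expected_responses": ["A gateway to Microsoft 365 data."]}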
271
- def get_interactive_prompts() -> Tuple[List[str], List[str]]:
272
- """Get prompts and expected responses interactively."""
273
- prompts = []
274
- expected_responses = []
275
-
276
- print("Interactive mode: Enter your prompts and expected responses.")
277
- print("Press Enter with empty prompt to finish.")
278
-
279
- while True:
280
- prompt = input(f"\nPrompt {len(prompts) + 1}: ").strip()
281
- if not prompt:
282
- break
283
-
284
- expected = input(f"Expected response {len(expected_responses) + 1}: ").strip()
285
-
286
- prompts.append(prompt)
287
- expected_responses.append(expected)
288
-
289
- if not prompts:
290
- print("No prompts entered. Exiting.")
291
- sys.exit(1)
292
-
293
- return prompts, expected_responses
294
-
295
-
296
- _DEFAULT_PASS_THRESHOLD = 3
297
-
298
-
299
- def _decorate_metric(metric_id: str, data, threshold: Optional[int] = None) -> Dict[str, Any]:
300
- """Augment raw evaluator output with standardized threshold + pass/fail result."""
301
- pass_threshold = threshold if threshold is not None else _DEFAULT_PASS_THRESHOLD
302
- payload = {}
303
- if isinstance(data, dict):
304
- payload.update(data)
305
- else:
306
- payload['raw'] = data
307
-
308
- score_val = None
309
- if isinstance(data, dict):
310
- if metric_id in data:
311
- score_val = data[metric_id]
312
- if isinstance(score_val, (int, float)):
313
- payload['threshold'] = pass_threshold
314
- payload['result'] = STATUS_PASS if score_val >= pass_threshold else STATUS_FAIL
315
- else:
316
- payload['threshold'] = pass_threshold
317
- payload.setdefault('result', STATUS_UNKNOWN)
318
- return payload
319
-
320
-
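A worked example of the decoration step (the metric-id key and reason text are illustrative; STATUS_PASS, STATUS_FAIL, and STATUS_UNKNOWN are the constants imported from common above):

raw = {"relevance": 4, "relevance_reason": "Directly answers the question."}
decorated = _decorate_metric("relevance", raw)  # default threshold is 3
# decorated now also carries {"threshold": 3, "result": STATUS_PASS} because 4 >= 3
failing = _decorate_metric("relevance", {"relevance": 2}, threshold=3)
# failing["result"] == STATUS_FAIL because 2 < 3
non_numeric = _decorate_metric("relevance", "n/a")
# non-dict input is wrapped: {"raw": "n/a", "threshold": 3, "result": STATUS_UNKNOWN}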
321
- def _run_evaluators_for_item(
322
- prompt: str,
323
- actual_response: str,
324
- expected_response: str,
325
- enhanced_response: Dict[str, Any],
326
- resolved_evaluators: Dict[str, Any],
327
- model_config: AzureOpenAIModelConfiguration,
328
- has_azure_openai: bool,
329
- args,
330
- ) -> Tuple[Dict[str, Optional[str]], List[str]]:
331
- """Run resolved evaluators against a single item/turn.
332
-
333
- Returns (results_dict, evaluators_ran).
334
- """
335
- has_tool_defs = bool(
336
- args.m365_agent_id and enhanced_response.get("tool_definitions")
337
- )
338
- available_context = {
339
- REQUIRES_AZURE_OPENAI: has_azure_openai,
340
- REQUIRES_TOOL_DEFINITIONS: has_tool_defs,
341
- }
342
-
343
- results_dict: Dict[str, Optional[str]] = {}
344
- evaluators_ran: List[str] = []
345
-
346
- for eval_name, eval_options in resolved_evaluators.items():
347
- can_run, warn_msg = check_prerequisites(eval_name, available_context)
348
- if not can_run:
349
- if warn_msg:
350
- emit_structured_log(
351
- "warning",
352
- f"Evaluator '{eval_name}' prerequisite check failed: {warn_msg}",
353
- operation=Operation.EVALUATE,
354
- )
355
- results_dict[eval_name] = None
356
- continue
357
-
358
- threshold = get_evaluator_threshold(eval_name, eval_options)
359
-
360
- try:
361
- if eval_name == RELEVANCE:
362
- raw_score = RelevanceEvaluator(model_config=model_config)(query=prompt, response=actual_response)
363
- results_dict[RELEVANCE] = _decorate_metric(METRIC_IDS[RELEVANCE], raw_score, threshold)
364
- elif eval_name == COHERENCE:
365
- raw_score = CoherenceEvaluator(model_config=model_config)(query=prompt, response=actual_response)
366
- results_dict[COHERENCE] = _decorate_metric(METRIC_IDS[COHERENCE], raw_score, threshold)
367
- elif eval_name == GROUNDEDNESS:
368
- raw_score = GroundednessEvaluator(model_config=model_config)(response=actual_response, context=expected_response)
369
- results_dict[GROUNDEDNESS] = _decorate_metric(METRIC_IDS[GROUNDEDNESS], raw_score, threshold)
370
- elif eval_name == TOOL_CALL_ACCURACY:
371
- raw_score = ToolCallAccuracyEvaluator(model_config)(
372
- query=prompt,
373
- response=enhanced_response.get("response", actual_response),
374
- tool_definitions=enhanced_response.get("tool_definitions", []),
375
- )
376
- results_dict[TOOL_CALL_ACCURACY] = _decorate_metric(METRIC_IDS[TOOL_CALL_ACCURACY], raw_score, threshold)
377
- elif eval_name == CITATIONS:
378
- fmt_str = eval_options.get("citation_format", "oai_unicode")
379
- fmt_map = {
380
- "oai_unicode": CitationFormat.OAI_UNICODE,
381
- "bracket": CitationFormat.LEGACY_BRACKET,
382
- "mixed": CitationFormat.AUTO,
383
- }
384
- raw_score = CitationsEvaluator(citation_format=fmt_map.get(fmt_str, CitationFormat.OAI_UNICODE))(response=actual_response)
385
- results_dict[CITATIONS] = _decorate_metric(METRIC_IDS[CITATIONS], raw_score, threshold)
386
- elif eval_name == EXACT_MATCH:
387
- case_sensitive = eval_options.get("case_sensitive", False)
388
- raw_score = ExactMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response, expected_answer=expected_response)
389
- # ExactMatch is binary — the evaluator already sets 'result'
390
- # so _decorate_metric (which computes result from score vs threshold) is not needed.
391
- results_dict[EXACT_MATCH] = raw_score
392
- elif eval_name == PARTIAL_MATCH:
393
- case_sensitive = eval_options.get("case_sensitive", False)
394
- raw_score = PartialMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response, expected_answer=expected_response)
395
- results_dict[PARTIAL_MATCH] = _decorate_metric(METRIC_IDS[PARTIAL_MATCH], raw_score, threshold)
396
-
397
- evaluators_ran.append(eval_name)
398
- except Exception as e:
399
- emit_structured_log(
400
- "error",
401
- f"Evaluator '{eval_name}' crashed and will be omitted from results: {e}",
402
- operation=Operation.EVALUATE,
403
- )
404
- results_dict[eval_name] = None
405
-
406
- return results_dict, evaluators_ran
407
-
408
-
409
- def _evaluate_single_response(
410
- enhanced_response: Dict[str, Any],
411
- eval_item: Dict,
412
- args,
413
- model_config: AzureOpenAIModelConfiguration,
414
- has_azure_openai: bool,
415
- default_evaluators: Dict[str, Any],
416
- ) -> Dict[str, Any]:
417
- """Run all evaluators for a single prompt/response pair and return the result dict."""
418
- actual_response_text = get_response_text_for_evaluation(enhanced_response)
419
- prompt = eval_item.get("prompt", "")
420
- expected_response = eval_item.get("expected_response", "")
421
-
422
- resolved = resolve_evaluators_for_prompt(
423
- eval_item.get("evaluators"), eval_item.get("evaluators_mode", "extend"),
424
- prompt, default_evaluators,
425
- )
426
-
427
- results_dict, evaluators_ran = _run_evaluators_for_item(
428
- prompt, actual_response_text, expected_response, enhanced_response,
429
- resolved, model_config, has_azure_openai, args,
430
- )
431
-
432
- evaluation_result = {
433
- "prompt": prompt,
434
- "response": enhanced_response.get(
435
- "display_response_text", actual_response_text
436
- ),
437
- "expected_response": expected_response,
438
- "evaluators_ran": evaluators_ran,
439
- "results": results_dict,
440
- }
441
-
442
- if "evaluators" in eval_item:
443
- evaluation_result["evaluators"] = eval_item["evaluators"]
444
- if "evaluators_mode" in eval_item:
445
- evaluation_result["evaluators_mode"] = eval_item["evaluators_mode"]
446
-
447
- if getattr(args, "effective_log_level", "info") == "debug":
448
- emit_structured_log(
449
- "debug",
450
- f"Evaluation completed for prompt='{evaluation_result['prompt']}'. "
451
- f"Evaluators: {', '.join(evaluators_ran)}. "
452
- f"Scores: {evaluation_result['results']}",
453
- operation=Operation.EVALUATE,
454
- )
455
-
456
- return evaluation_result
457
-
458
-
459
- def _check_all_passed(results_dict: Dict[str, Optional[Dict[str, Any]]]) -> bool:
460
- """Check if all evaluator results passed. Skipped evaluators (None) are ignored."""
461
- for result_data in results_dict.values():
462
- if result_data is None:
463
- continue
464
- if result_data.get("result") == STATUS_FAIL:
465
- return False
466
- return True
467
-
468
- def _evaluate_multi_turn_responses(
469
- turns: List[Dict],
470
- args,
471
- default_evaluators: Dict[str, Any],
472
- model_config: AzureOpenAIModelConfiguration,
473
- has_azure_openai: bool,
474
- ) -> Tuple[List[Dict], Dict]:
475
- """Run per-turn evaluations and build evaluated turn results with summary.
476
-
477
- Returns:
478
- Tuple of (evaluated_turns, summary). Each evaluated turn contains
479
- prompt, response, expected_response, status, evaluators_ran, results,
480
- and optionally error. Does not mutate the input turns.
481
- """
482
- evaluated_turns: List[Dict] = []
483
- turns_passed = 0
484
- turns_failed = 0
485
-
486
- for i, turn in enumerate(turns):
487
- evaluated_turn: Dict[str, Any] = {
488
- "prompt": turn.get("prompt", ""),
489
- }
490
- if "expected_response" in turn:
491
- evaluated_turn["expected_response"] = turn["expected_response"]
492
- if "response" in turn:
493
- evaluated_turn["response"] = turn["response"]
494
- if "evaluators" in turn:
495
- evaluated_turn["evaluators"] = turn["evaluators"]
496
- if "evaluators_mode" in turn:
497
- evaluated_turn["evaluators_mode"] = turn["evaluators_mode"]
498
-
499
- if turn.get("status") == STATUS_ERROR:
500
- evaluated_turn["status"] = STATUS_ERROR
501
- evaluated_turn["error"] = turn.get("error", "")
502
- turns_failed += 1
503
- evaluated_turns.append(evaluated_turn)
504
- continue
505
-
506
- enhanced_response = turn.get("_enhanced_response", {})
507
- actual_response = get_response_text_for_evaluation(enhanced_response)
508
-
509
- resolved = resolve_evaluators_for_prompt(
510
- turn.get("evaluators"), turn.get("evaluators_mode", "extend"),
511
- turn.get("prompt", ""), default_evaluators,
512
- )
513
-
514
- results_dict, evaluators_ran = _run_evaluators_for_item(
515
- turn.get("prompt", ""), actual_response, turn.get("expected_response", ""),
516
- enhanced_response, resolved, model_config, has_azure_openai, args,
517
- )
518
-
519
- all_passed = _check_all_passed(results_dict)
520
-
521
- evaluated_turn["results"] = results_dict
522
- evaluated_turn["evaluators_ran"] = evaluators_ran
523
- evaluated_turn["status"] = STATUS_PASS if all_passed else STATUS_FAIL
524
-
525
- if getattr(args, "effective_log_level", "info") == "debug":
526
- emit_structured_log(
527
- "debug",
528
- f"Evaluation completed for turn {i + 1} prompt='{turn.get('prompt', '')}'. "
529
- f"Evaluators: {', '.join(evaluators_ran)}. "
530
- f"Scores: {results_dict}",
531
- operation=Operation.EVALUATE,
532
- )
533
-
534
- if all_passed:
535
- turns_passed += 1
536
- else:
537
- turns_failed += 1
538
-
539
- evaluated_turns.append(evaluated_turn)
540
-
541
- turns_total = len(turns)
542
- if turns_passed == turns_total:
543
- overall_status = STATUS_PASS
544
- elif turns_failed == turns_total:
545
- overall_status = STATUS_FAIL
546
- else:
547
- overall_status = STATUS_PARTIAL
548
-
549
- summary = {
550
- "turns_total": turns_total,
551
- "turns_passed": turns_passed,
552
- "turns_failed": turns_failed,
553
- "overall_status": overall_status,
554
- }
555
-
556
- return evaluated_turns, summary
557
-
558
-
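A small illustration of the pass/fail bookkeeping above (evaluator keys are illustrative):

# Skipped evaluators (None) are ignored; only an explicit fail flips the turn.
turn_results = {"relevance": {"result": STATUS_PASS}, "tool_call_accuracy": None}
_check_all_passed(turn_results)   # -> True
# Thread summary: all turns pass -> pass, all fail -> fail, anything else -> partial,
# e.g. 2 of 3 turns passed -> {"turns_total": 3, "turns_passed": 2,
#                              "turns_failed": 1, "overall_status": STATUS_PARTIAL}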
559
- def get_effective_worker_count(prompt_count: int, args) -> int:
560
- """Compute safe worker count for prompt processing."""
561
- if prompt_count <= 0:
562
- return 1
563
-
564
- requested = getattr(args, "concurrency", 5)
565
- try:
566
- requested_int = int(requested)
567
- except (TypeError, ValueError):
568
- requested_int = 5
569
-
570
- bounded = max(1, min(requested_int, MAX_CONCURRENCY))
571
- return min(bounded, prompt_count)
572
-
573
-
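Worked examples of the clamping above (Namespace stands in for parsed CLI args; MAX_CONCURRENCY is 5):

from argparse import Namespace
get_effective_worker_count(12, Namespace(concurrency=8))    # -> 5 (clamped to MAX_CONCURRENCY)
get_effective_worker_count(3, Namespace(concurrency=5))     # -> 3 (never more workers than prompts)
get_effective_worker_count(0, Namespace(concurrency=5))     # -> 1
get_effective_worker_count(10, Namespace(concurrency="x"))  # -> 5 (non-integer request falls back to 5)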
574
- def run_pipeline(
575
- pipeline: PipelineConfig,
576
- eval_items: List[Dict],
577
- args,
578
- ) -> List[Dict[str, Any]]:
579
- """Run the full evaluation pipeline: send prompts and evaluate responses in parallel.
580
-
581
- Each worker processes one prompt end-to-end: send → evaluate.
582
- Results are returned in original prompt order (FR-006).
583
- """
584
- # Validate all evaluator names upfront before dispatching workers
585
- all_evaluator_maps = [pipeline.default_evaluators]
586
- for eval_item in eval_items:
587
- if "evaluators" in eval_item:
588
- all_evaluator_maps.append(eval_item["evaluators"])
589
- for turn in eval_item.get("turns", []):
590
- if "evaluators" in turn:
591
- all_evaluator_maps.append(turn["evaluators"])
592
- for emap in all_evaluator_maps:
593
- validate_evaluator_names(emap)
594
-
595
- # Validate all items upfront and classify types before dispatching workers
596
- item_types: List[ItemType] = []
597
- for idx, eval_item in enumerate(eval_items):
598
- try:
599
- item_type = detect_item_type(eval_item)
600
- except ValueError as e:
601
- raise ValueError(f"Invalid evaluation item at index {idx}: {e}") from e
602
- if item_type == ItemType.MULTI_TURN:
603
- turn_count = len(eval_item["turns"])
604
- if turn_count > MAX_TURNS_PER_THREAD:
605
- raise ValueError(
606
- f"Invalid evaluation item at index {idx}: 'turns' array has "
607
- f"{turn_count} items (max {MAX_TURNS_PER_THREAD})"
608
- )
609
- item_types.append(item_type)
610
-
611
- total = len(eval_items)
612
- worker_count = get_effective_worker_count(total, args)
613
-
614
- multi_turn_count = sum(1 for t in item_types if t == ItemType.MULTI_TURN)
615
- single_turn_count = total - multi_turn_count
616
-
617
- emit_structured_log(
618
- "info",
619
- f"Running pipeline with {worker_count} worker(s) for {total} item(s) "
620
- f"({single_turn_count} single-turn, {multi_turn_count} multi-turn).",
621
- operation=Operation.EVALUATE,
622
- )
623
-
624
- def _process_item(eval_item: Dict, index: int) -> Dict[str, Any]:
625
- if item_types[index] == ItemType.MULTI_TURN:
626
- return _process_multi_turn(eval_item, index)
627
- return _process_single_turn(eval_item, index)
628
-
629
- def _process_single_turn(eval_item: Dict, index: int) -> Dict[str, Any]:
630
- prompt = eval_item.get("prompt", "")
631
- emit_structured_log(
632
- "info",
633
- f"Processing item {index + 1}/{total} (single-turn).",
634
- operation=Operation.SEND_PROMPT,
635
- )
636
-
637
- # Phase A: Send prompt to agent (with retry + throttle gate)
638
- response = None
639
- for attempt in range(1, MAX_ATTEMPTS + 1):
640
- pipeline.chat_gate.wait_if_blocked()
641
- try:
642
- response, _ = pipeline.agent_client.send_prompt(prompt, agent_id=args.m365_agent_id)
643
- break
644
- except Exception as exc:
645
- cause = exc.__cause__
646
- status = int(getattr(cause, "code", 0) or 0) or None if cause else None
647
- retry_after = get_retry_after_seconds(
648
- cause.headers.get("Retry-After") if cause and getattr(cause, "headers", None) else None
649
- )
650
-
651
- if retry_after is not None and pipeline.is_retryable_status(status):
652
- pipeline.chat_gate.apply_retry_after(retry_after)
653
-
654
- if not pipeline.is_retryable_status(status) or attempt >= MAX_ATTEMPTS:
655
- emit_structured_log(
656
- "error",
657
- f"Item {index + 1}/{total} failed after {attempt} attempt(s): {exc}",
658
- operation=Operation.SEND_PROMPT,
659
- )
660
- return {
661
- "prompt": prompt,
662
- "response": "",
663
- "expected_response": eval_item.get("expected_response", ""),
664
- "evaluators_ran": [],
665
- "results": {},
666
- "status": STATUS_ERROR,
667
- "errorDetails": str(exc),
668
- }
669
-
670
- delay = retry_after if retry_after is not None else pipeline.get_backoff_seconds(attempt)
671
- time.sleep(delay)
672
-
673
- # Phase B: Evaluate response
674
- return _evaluate_single_response(
675
- response, eval_item, args,
676
- pipeline.model_config, pipeline.has_azure_openai,
677
- pipeline.default_evaluators,
678
- )
679
-
680
- def _process_multi_turn(eval_item: Dict, index: int) -> Dict[str, Any]:
681
- turns = eval_item["turns"]
682
- thread_name = eval_item.get("name", "Unnamed thread")
683
- emit_structured_log(
684
- "info",
685
- f"Processing item {index + 1}/{total} (multi-turn: '{thread_name}').",
686
- operation=Operation.SEND_PROMPT,
687
- )
688
-
689
- if len(turns) > LONG_THREAD_WARNING_THRESHOLD:
690
- emit_structured_log(
691
- "warning",
692
- f"Thread '{thread_name}' has {len(turns)} turns (>{LONG_THREAD_WARNING_THRESHOLD}). This may take a while.",
693
- operation=Operation.SEND_PROMPT,
694
- )
695
-
696
- # Phase A: Send each turn with throttle gate + 429-only retry
697
- # Multi-turn only retries on 429 (server confirmed it didn't process
698
- # the request). Other transient errors (503, 504) are ambiguous about
699
- # whether the server processed the turn, risking duplicate turns in
700
- # the conversation if retried.
701
- conversation_context = None
702
- conversation_id = None
703
- enriched_turns: List[Dict[str, Any]] = []
704
- failed = False
705
-
706
- for i, turn in enumerate(turns):
707
- prompt = turn["prompt"]
708
- emit_structured_log(
709
- "debug",
710
- f"Sending turn {i + 1}/{len(turns)} of '{thread_name}'.",
711
- operation=Operation.SEND_PROMPT,
712
- )
713
-
714
- response = None
715
- for attempt in range(1, MAX_ATTEMPTS + 1):
716
- pipeline.chat_gate.wait_if_blocked()
717
- try:
718
- response, conversation_context = pipeline.agent_client.send_prompt(
719
- prompt, agent_id=args.m365_agent_id,
720
- conversation_context=conversation_context,
721
- )
722
- break
723
- except Exception as exc:
724
- cause = exc.__cause__
725
- status = int(getattr(cause, "code", 0) or 0) or None if cause else None
726
- retry_after = get_retry_after_seconds(
727
- cause.headers.get("Retry-After") if cause and getattr(cause, "headers", None) else None
728
- )
729
-
730
- # Only retry on 429 — server confirmed it didn't process the request
731
- if status == 429 and attempt < MAX_ATTEMPTS:
732
- if retry_after is not None:
733
- pipeline.chat_gate.apply_retry_after(retry_after)
734
- delay = retry_after if retry_after is not None else pipeline.get_backoff_seconds(attempt)
735
- time.sleep(delay)
736
- continue
737
-
738
- # All other errors: stop the thread
739
- emit_structured_log(
740
- "error",
741
- f"Turn {i + 1}/{len(turns)} of '{thread_name}' failed after {attempt} attempt(s): {exc}",
742
- operation=Operation.SEND_PROMPT,
743
- )
744
- failed = True
745
- break
746
-
747
- if failed:
748
- # Mark this turn and all remaining turns as error
749
- enriched_turns.append({
750
- **turn,
751
- "response": "",
752
- "status": STATUS_ERROR,
753
- "error": "Failed to get response from agent",
754
- })
755
- for j in range(i + 1, len(turns)):
756
- enriched_turns.append({
757
- **turns[j],
758
- "response": "",
759
- "status": STATUS_ERROR,
760
- "error": "Skipped: preceding turn failed",
761
- })
762
- break
763
-
764
- # Enrich turn with response
765
- response_text = get_response_text_for_evaluation(response)
766
- enriched_turns.append({
767
- **turn,
768
- "response": response.get("display_response_text", response_text),
769
- "_enhanced_response": response,
770
- })
771
-
772
- # Capture conversation_id from first response
773
- if conversation_id is None:
774
- conversation_id = response.get("metadata", {}).get("conversation_id")
775
-
776
- # Phase B: Run per-turn evaluations
777
- evaluated_turns, summary = _evaluate_multi_turn_responses(
778
- enriched_turns, args, pipeline.default_evaluators,
779
- model_config=pipeline.model_config,
780
- has_azure_openai=pipeline.has_azure_openai,
781
- )
782
-
783
- return {
784
- "type": "multi_turn",
785
- "name": eval_item.get("name", ""),
786
- "description": eval_item.get("description", ""),
787
- "conversation_id": conversation_id or "",
788
- "turns": evaluated_turns,
789
- "summary": summary,
790
- }
791
-
792
- execution_results = execute_in_parallel(
793
- eval_items, _process_item, max_workers=worker_count,
794
- )
795
-
796
- # Unwrap WorkerResult objects into plain dicts, with error fallback
797
- ordered_results: List[Dict[str, Any]] = []
798
- for wr in execution_results:
799
- if wr.error:
800
- idx = wr.index
801
- item = eval_items[idx]
802
- if item_types[idx] == ItemType.MULTI_TURN:
803
- ordered_results.append({
804
- "type": "multi_turn",
805
- "name": item.get("name", ""),
806
- "turns": [
807
- {**t, "status": STATUS_ERROR, "error": str(wr.error), "response": "", "results": {}}
808
- for t in item.get("turns", [])
809
- ],
810
- "summary": {
811
- "turns_total": len(item.get("turns", [])),
812
- "turns_passed": 0,
813
- "turns_failed": len(item.get("turns", [])),
814
- "overall_status": STATUS_FAIL,
815
- },
816
- "error": str(wr.error),
817
- })
818
- else:
819
- ordered_results.append({
820
- "prompt": item.get("prompt", ""),
821
- "response": "",
822
- "expected_response": item.get("expected_response", ""),
823
- "evaluators_ran": [],
824
- "results": {},
825
- "status": STATUS_ERROR,
826
- "errorDetails": str(wr.error),
827
- })
828
- else:
829
- ordered_results.append(wr.value)
830
-
831
- return ordered_results
832
-
833
-
834
-
835
- def write_results_to_console(results, agent_name: Optional[str] = None,
836
- agent_id: Optional[str] = None,
837
- cli_version: Optional[str] = None):
838
- """Write the response to console."""
839
- # ANSI color codes
840
- BOLD = '\033[1m'
841
- BLUE = '\033[94m'
842
- GREEN = '\033[92m'
843
- YELLOW = '\033[93m'
844
- CYAN = '\033[96m'
845
- MAGENTA = '\033[95m'
846
- ORANGE = '\033[38;5;208m'
847
- RED = '\033[91m'
848
- RESET = '\033[0m'
849
-
850
- def _print_evaluated_item(response: str, expected_response: str,
851
- evaluators_ran: List[str], item_results: Dict[str, Any],
852
- error: Optional[str] = None) -> None:
853
- """Print the body of a single evaluated item (single-turn prompt or multi-turn turn).
854
-
855
- The item header (Prompt X / Turn X) is printed by the caller; this helper
856
- prints evaluators, response, expected response, error, and metrics.
857
- """
858
- if evaluators_ran:
859
- print(f"{BOLD}{CYAN}Evaluators:{RESET} {', '.join(evaluators_ran)}")
860
- if response:
861
- print(f"{BOLD}{CYAN}Response:{RESET} {response}")
862
- if expected_response:
863
- print(f"{BOLD}{YELLOW}Expected Response:{RESET} {expected_response}")
864
- if error:
865
- print(f"{BOLD}{RED}Error:{RESET} {error}")
866
-
867
- for eval_name, v in item_results.items():
868
- if v is None:
869
- continue
870
- display_name = pascal_case_to_title(eval_name)
871
- if eval_name == RELEVANCE:
872
- color = MAGENTA
873
- elif eval_name == COHERENCE:
874
- color = ORANGE
875
- else:
876
- color = BLUE
877
- print(f"{BOLD}{color}{display_name}:{RESET} {json.dumps(v, indent=4)}")
878
-
879
- # Show metadata
880
- metadata_parts = []
881
- if agent_name:
882
- metadata_parts.append(f"Agent Name: {agent_name}")
883
- if agent_id:
884
- metadata_parts.append(f"Agent ID: {agent_id}")
885
- if cli_version:
886
- metadata_parts.append(f"CLI Version: {cli_version}")
887
- if metadata_parts:
888
- print(f"{BOLD}{CYAN}{' | '.join(metadata_parts)}{RESET}")
889
- print()
890
-
891
- aggregates = calculate_aggregate_statistics(results)
892
- if aggregates:
893
- total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
894
- if total_items > 1:
895
- print(f"{BOLD}{BLUE}Aggregate Statistics ({total_items} prompts):{RESET}")
896
- print(f"{BLUE}{'=' * 60}{RESET}")
897
-
898
- for metric_name, stats in aggregates.items():
899
- pass_color = GREEN if stats['pass_rate'] >= 80 else YELLOW if stats['pass_rate'] >= 60 else RED
900
- prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
901
- total_prompts = stats.get('total_prompts', total_items)
902
- print(f"{BOLD}{CYAN}{metric_name}:{RESET} ({prompts_evaluated}/{total_prompts} prompts)")
903
- print(f" Pass Rate: {pass_color}{stats['pass_rate']:.1f}%{RESET} ({stats['pass_count']}/{stats['total_evaluated']} passed)")
904
- print(f" Avg Score: {MAGENTA}{stats['avg_score']:.2f}{RESET}")
905
- if stats.get('threshold') is not None:
906
- print(f" Threshold: {YELLOW}{stats['threshold']}{RESET}")
907
- print()
908
-
909
- print(f"{BLUE}{'=' * 60}{RESET}")
910
- print()
911
-
912
- print(f"{BOLD}{BLUE}Individual Results:{RESET}")
913
- print(f"{BLUE}{'=' * 50}{RESET}")
914
- for i, result in enumerate(results, 1):
915
- if result.get("type") == "multi_turn":
916
- thread_name = result.get("name", "Unnamed Thread")
917
- summary = result.get("summary", {})
918
- status = summary.get("overall_status", STATUS_UNKNOWN)
919
- status_color = GREEN if status == STATUS_PASS else YELLOW if status == STATUS_PARTIAL else RED
920
-
921
- print(f"{BOLD}{MAGENTA}Thread {i}: {thread_name}{RESET}")
922
- for t_idx, turn in enumerate(result.get("turns", []), 1):
923
- turn_status = turn.get("status", STATUS_UNKNOWN)
924
- turn_color = GREEN if turn_status == STATUS_PASS else RED if turn_status in (STATUS_FAIL, STATUS_ERROR) else YELLOW
925
- print(f"{BOLD}{turn_color}Turn {t_idx}:{RESET} [{turn_status}] {turn.get('prompt', '')}")
926
- _print_evaluated_item(
927
- response=turn.get("response", ""),
928
- expected_response=turn.get("expected_response", ""),
929
- evaluators_ran=turn.get("evaluators_ran", []),
930
- item_results=turn.get("results", {}),
931
- error=turn.get("error"),
932
- )
933
- print()
934
- print(f"{BOLD}{MAGENTA}Thread {i} Summary:{RESET}")
935
- print(f" Status: {status_color}{status.upper()}{RESET}")
936
- print(f" Turns passed: {status_color}{summary.get('turns_passed', 0)}/{summary.get('turns_total', 0)}{RESET}")
937
- print(f"{BLUE}{'-' * 30}{RESET}")
938
- else:
939
- print(f"{BOLD}{GREEN}Prompt {i}:{RESET} {result['prompt']}")
940
- _print_evaluated_item(
941
- response=result.get('response', ''),
942
- expected_response=result.get('expected_response', ''),
943
- evaluators_ran=result.get('evaluators_ran', []),
944
- item_results=result.get('results', {}),
945
- error=result.get('errorDetails'),
946
- )
947
- print(f"{BLUE}{'-' * 30}{RESET}")
948
-
949
- def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
950
- """Extract an EvalScore object from a decorated metric dict.
951
-
952
- Maps internal decorated-metric format to schema EvalScore:
953
- {score, result, threshold} (required) + reason, evaluator (optional).
954
- """
955
- DEFAULT_THRESHOLD = 3 # fallback; decorate_metric should always set this
956
-
957
- score_val = None
958
- if metric_id in data and isinstance(data[metric_id], (int, float)):
959
- score_val = data[metric_id]
960
- if score_val is None:
961
- return None
962
-
963
- result = data.get("result")
964
- if result not in (STATUS_PASS, STATUS_FAIL):
965
- result = STATUS_PASS if score_val >= data.get("threshold", DEFAULT_THRESHOLD) else STATUS_FAIL
966
-
967
- eval_score: Dict[str, Any] = {
968
- "score": score_val,
969
- "result": result,
970
- "threshold": data.get("threshold", DEFAULT_THRESHOLD),
971
- }
972
- reason = data.get(f"{metric_id}_reason") or data.get("reason")
973
- if reason:
974
- eval_score["reason"] = reason
975
- return eval_score
976
-
977
-
978
- def _convert_scores_to_schema(results_dict: Dict[str, Any]) -> Dict[str, Any]:
979
- """Convert raw evaluator results to schema-compliant score objects.
980
-
981
- Evaluator results in results_dict are dicts (from _decorate_metric) or
982
- None when skipped/crashed. None values are omitted from output.
983
- """
984
- scores: Dict[str, Any] = {}
985
-
986
- for eval_key, schema_key in [
987
- (RELEVANCE, "relevance"),
988
- (COHERENCE, "coherence"),
989
- (GROUNDEDNESS, "groundedness"),
990
- (TOOL_CALL_ACCURACY, "toolCallAccuracy"),
991
- ]:
992
- data = results_dict.get(eval_key)
993
- if data is None:
994
- continue
995
- eval_score = extract_eval_score(data, METRIC_IDS[eval_key])
996
- if eval_score:
997
- scores[schema_key] = eval_score
998
-
999
- data = results_dict.get(CITATIONS)
1000
- if data is not None:
1001
- count = data.get("citations", 0)
1002
- cit_result = data.get("result")
1003
- if cit_result not in (STATUS_PASS, STATUS_FAIL):
1004
- cit_result = STATUS_PASS if count >= data.get("threshold", 1) else STATUS_FAIL
1005
- citation_score: Dict[str, Any] = {
1006
- "count": count,
1007
- "result": cit_result,
1008
- "threshold": data.get("threshold", 1),
1009
- }
1010
- if "citation_format" in data:
1011
- citation_score["format"] = data["citation_format"]
1012
- scores["citations"] = citation_score
1013
-
1014
- data = results_dict.get(EXACT_MATCH)
1015
- if data is not None:
1016
- is_match = data.get("exact_match", 0.0) == 1.0
1017
- scores["exactMatch"] = {
1018
- "match": is_match,
1019
- "result": data.get("result", STATUS_PASS if is_match else STATUS_FAIL),
1020
- "reason": data.get("exact_match_reason", ""),
1021
- }
1022
-
1023
- data = results_dict.get(PARTIAL_MATCH)
1024
- if data is not None:
1025
- scores["partialMatch"] = {
1026
- "score": data.get("partial_match", 0.0),
1027
- "result": data.get("result", STATUS_FAIL),
1028
- "threshold": data.get("threshold", 0.5),
1029
- "reason": data.get("partial_match_reason", ""),
1030
- }
1031
-
1032
- return scores
1033
-
1034
-
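A sketch of the mapping from a decorated result to the schema EvalScore shape (the metric-id and reason keys are illustrative):

decorated = {"relevance": 4, "relevance_reason": "On topic.", "threshold": 3, "result": STATUS_PASS}
extract_eval_score(decorated, "relevance")
# -> {"score": 4, "result": STATUS_PASS, "threshold": 3, "reason": "On topic."}
# _convert_scores_to_schema then files this under the schema key "relevance";
# evaluators that were skipped or crashed (None) are simply omitted from the output.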
1035
- def convert_result_to_eval_item(result: Dict) -> Dict:
1036
- """Convert an internal evaluation result dict to a schema-compliant EvalItem."""
1037
- item: Dict[str, Any] = {
1038
- "prompt": result["prompt"],
1039
- "response": result["response"],
1040
- "expected_response": result["expected_response"],
1041
- }
1042
-
1043
- if "evaluators" in result:
1044
- item["evaluators"] = result["evaluators"]
1045
- if "evaluators_mode" in result:
1046
- item["evaluators_mode"] = result["evaluators_mode"]
1047
-
1048
- scores = _convert_scores_to_schema(result.get("results", {}))
1049
- if scores:
1050
- item["scores"] = scores
1051
-
1052
- return item
1053
-
1054
-
1055
- def convert_thread_result_to_output(thread_result: Dict) -> Dict:
1056
- """Convert a multi-turn thread result to the output format."""
1057
- output_turns = []
1058
- for turn in thread_result.get("turns", []):
1059
- output_turn: Dict[str, Any] = {"prompt": turn.get("prompt", "")}
1060
- if "expected_response" in turn:
1061
- output_turn["expected_response"] = turn["expected_response"]
1062
- if "response" in turn:
1063
- output_turn["response"] = turn["response"]
1064
- if "status" in turn:
1065
- output_turn["status"] = turn["status"]
1066
- if "error" in turn:
1067
- output_turn["error"] = turn["error"]
1068
- if "evaluators" in turn:
1069
- output_turn["evaluators"] = turn["evaluators"]
1070
- if "evaluators_mode" in turn:
1071
- output_turn["evaluators_mode"] = turn["evaluators_mode"]
1072
-
1073
- scores = _convert_scores_to_schema(turn.get("results", {}))
1074
- if scores:
1075
- output_turn["scores"] = scores
1076
-
1077
- output_turns.append(output_turn)
1078
-
1079
- output: Dict[str, Any] = {}
1080
- if thread_result.get("name"):
1081
- output["name"] = thread_result["name"]
1082
- if thread_result.get("description"):
1083
- output["description"] = thread_result["description"]
1084
- if thread_result.get("conversation_id"):
1085
- output["conversation_id"] = thread_result["conversation_id"]
1086
- output["turns"] = output_turns
1087
- if thread_result.get("summary"):
1088
- output["summary"] = thread_result["summary"]
1089
-
1090
- return output
1091
-
1092
-
1093
- def convert_result_to_output_item(result: Dict) -> Dict:
1094
- """Convert an internal result dict to an output item. Routes by type."""
1095
- if result.get("type") == "multi_turn":
1096
- return convert_thread_result_to_output(result)
1097
- return convert_result_to_eval_item(result)
1098
-
1099
-
1100
- def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None,
1101
- default_evaluators: Optional[Dict[str, Any]] = None,
1102
- agent_name: Optional[str] = None,
1103
- cli_version: Optional[str] = None):
1104
- """Write results to a schema-compliant eval document JSON file.
1105
-
1106
- Output follows the eval-document.schema.json format:
1107
- {schemaVersion, metadata, default_evaluators?, items: [EvalItem]}
1108
- """
1109
- try:
1110
- try:
1111
- current_version = SchemaVersionManager().get_current_version()
1112
- except Exception:
1113
- current_version = "1.0.0"
1114
-
1115
- items = [convert_result_to_output_item(r) for r in results]
1116
-
1117
- metadata: Dict[str, Any] = {
1118
- "evaluatedAt": datetime.now(timezone.utc).isoformat(),
1119
- }
1120
- if agent_id:
1121
- metadata["agentId"] = agent_id
1122
- if agent_name:
1123
- metadata["agentName"] = agent_name
1124
- if cli_version:
1125
- metadata["cliVersion"] = cli_version
1126
-
1127
- output_data: Dict[str, Any] = {
1128
- "schemaVersion": current_version,
1129
- "metadata": metadata,
1130
- }
1131
-
1132
- if default_evaluators is not None:
1133
- output_data["default_evaluators"] = default_evaluators
1134
-
1135
- output_data["items"] = items
1136
-
1137
- with open(output_file, 'w', encoding='utf-8') as f:
1138
- json.dump(output_data, f, indent=2, ensure_ascii=False)
1139
- emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
1140
- except Exception as e:
1141
- emit_structured_log("error", f"Error writing to JSON file: {e}", operation=Operation.WRITE_OUTPUT)
1142
- sys.exit(1)
1143
-
1144
- def _results_to_csv_json(results_dict: Dict) -> str:
1145
- """Serialize evaluator results dict to a CSV-safe JSON string.
1146
-
1147
- Skips None (crashed/skipped evaluators). Results are dicts produced
1148
- by _decorate_metric.
1149
- """
1150
- if not results_dict:
1151
- return ""
1152
- non_null = {k: v for k, v in results_dict.items() if v is not None}
1153
- return json.dumps(non_null) if non_null else ""
1154
-
1155
-
1156
- def write_results_to_csv(results: List[Dict], output_file: str,
1157
- agent_name: Optional[str] = None, agent_id: Optional[str] = None,
1158
- cli_version: Optional[str] = None):
1159
- """Write results to CSV file."""
1160
- try:
1161
- with open(output_file, 'w', newline='', encoding='utf-8') as f:
1162
- if results:
1163
- metadata_parts = []
1164
- if agent_name:
1165
- metadata_parts.append(f"Agent Name: {agent_name}")
1166
- if agent_id:
1167
- metadata_parts.append(f"Agent ID: {agent_id}")
1168
- if cli_version:
1169
- metadata_parts.append(f"CLI Version: {cli_version}")
1170
- if metadata_parts:
1171
- f.write(f"# {' | '.join(metadata_parts)}\n")
1172
-
1173
- aggregates = calculate_aggregate_statistics(results)
1174
- if aggregates:
1175
- total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
1176
- if total_items > 1:
1177
- f.write("# AGGREGATE STATISTICS\n")
1178
- f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
1179
- for metric_name, stats in aggregates.items():
1180
- threshold_str = str(stats.get('threshold', 'N/A'))
1181
- prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
1182
- total_prompts = stats.get('total_prompts', total_items)
1183
- f.write(f"{metric_name},{prompts_evaluated},{total_prompts},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
1184
- f.write("\n# INDIVIDUAL RESULTS\n")
1185
-
1186
- single_turn_rows = []
1187
- multi_turn_rows = []
1188
- for result in results:
1189
- if result.get("type") == "multi_turn":
1190
- thread_name = result.get("name", "")
1191
- for turn_idx, turn in enumerate(result.get("turns", [])):
1192
- multi_turn_rows.append({
1193
- "thread_name": thread_name,
1194
- "turn_index": turn_idx + 1,
1195
- "prompt": turn.get("prompt", ""),
1196
- "response": turn.get("response", ""),
1197
- "expected_response": turn.get("expected_response", ""),
1198
- "status": turn.get("status", ""),
1199
- "error": turn.get("error", ""),
1200
- "scores": _results_to_csv_json(turn.get("results", {})),
1201
- })
1202
- summary = result.get("summary", {})
1203
- multi_turn_rows.append({
1204
- "thread_name": thread_name,
1205
- "turn_index": "summary",
1206
- "prompt": "",
1207
- "response": "",
1208
- "expected_response": "",
1209
- "status": summary.get("overall_status", ""),
1210
- "scores": f"{summary.get('turns_passed', 0)}/{summary.get('turns_total', 0)} turns passed",
1211
- })
1212
- else:
1213
- exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode', '_enhanced_response', 'results'}
1214
- row = {k: v for k, v in result.items() if k not in exclude_keys}
1215
- if "results" in result:
1216
- row["scores"] = _results_to_csv_json(result["results"])
1217
- single_turn_rows.append(row)
1218
-
1219
- if single_turn_rows:
1220
- if multi_turn_rows:
1221
- f.write("# SINGLE-TURN RESULTS\n")
1222
- fieldnames = list(single_turn_rows[0].keys())
1223
- for row in single_turn_rows:
1224
- for k in row:
1225
- if k not in fieldnames:
1226
- fieldnames.append(k)
1227
- writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
1228
- writer.writeheader()
1229
- writer.writerows(single_turn_rows)
1230
-
1231
- if multi_turn_rows:
1232
- if single_turn_rows:
1233
- f.write("\n")
1234
- f.write("# MULTI-TURN RESULTS\n")
1235
- fieldnames = ["thread_name", "turn_index", "prompt", "response", "expected_response", "status", "error", "scores"]
1236
- writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
1237
- writer.writeheader()
1238
- writer.writerows(multi_turn_rows)
1239
- emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
1240
- except Exception as e:
1241
- emit_structured_log("error", f"Error writing to CSV file: {e}", operation=Operation.WRITE_OUTPUT)
1242
- sys.exit(1)
1243
-
1244
- def normalize_agent_id(agent_id):
1245
- """Append '.declarativeAgent' if agent_id has no '.', else return unchanged.
1246
-
1247
- Returns the input unchanged when it is None/empty or already contains a dot.
1248
- """
1249
- if not agent_id:
1250
- return agent_id
1251
- return agent_id if '.' in agent_id else f"{agent_id}.declarativeAgent"
1252
-
1253
-
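Examples of the normalization (the agent ids are made up):

normalize_agent_id("abc123")                   # -> "abc123.declarativeAgent"
normalize_agent_id("abc123.declarativeAgent")  # -> unchanged (already contains a '.')
normalize_agent_id(None)                       # -> None (passed through)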
1254
- def parse_arguments():
1255
- """Parse command line arguments."""
1256
- parser = argparse.ArgumentParser(
1257
- description="M365 Copilot Agent Evaluation CLI",
1258
- formatter_class=argparse.RawDescriptionHelpFormatter,
1259
- epilog="""
1260
- Examples:
1261
- # Run with default prompts
1262
- python main.py
1263
-
1264
- # Run with custom prompts
1265
- python main.py --prompts "What is Microsoft Graph?" --expected "Microsoft Graph is a gateway..."
1266
-
1267
- # Run with prompts from file
1268
- python main.py --prompts-file prompts.json
1269
-
1270
- # Interactive mode
1271
- python main.py --interactive
1272
-
1273
- # Save results to JSON
1274
- python main.py --output results.json
1275
-
1276
- # Save results to CSV
1277
- python main.py --output results.csv
1278
-
1279
- # Save results to HTML and open in browser
1280
- python main.py --output report.html
1281
-
1282
- # Debug-level diagnostics
1283
- python main.py --log-level debug
1284
-
1285
- # Sign out and clear cached authentication tokens
1286
- python main.py --signout
1287
- """
1288
- )
1289
-
1290
- # Input options (mutually exclusive)
1291
- input_group = parser.add_mutually_exclusive_group()
1292
- input_group.add_argument(
1293
- '--prompts',
1294
- nargs='+',
1295
- help='List of prompts to evaluate'
1296
- )
1297
- input_group.add_argument(
1298
- '--prompts-file',
1299
- type=str,
1300
- help='JSON file containing prompts and expected responses'
1301
- )
1302
- input_group.add_argument(
1303
- '--interactive',
1304
- action='store_true',
1305
- help='Interactive mode to enter prompts'
1306
- )
1307
-
1308
- # Expected responses (only used with --prompts)
1309
- parser.add_argument(
1310
- '--expected',
1311
- nargs='+',
1312
- help='List of expected responses (must match number of prompts)'
1313
- )
1314
-
1315
- # Agent ID (--m365-agent-id is primary, --agent-id kept for backward compatibility)
1316
- parser.add_argument(
1317
- '--m365-agent-id', '--agent-id',
1318
- type=str,
1319
- default=os.environ.get("M365_AGENT_ID") or os.environ.get("AGENT_ID"),
1320
- help='Agent ID (default from M365_AGENT_ID environment variable)'
1321
- )
1322
-
1323
- # Output options
1324
- parser.add_argument(
1325
- '--output',
1326
- type=str,
1327
- help='Output file path. Format is determined by file extension: .json, .csv, .html. If not provided, results are printed to console.'
1328
- )
1329
-
1330
- # Behavior options
1331
- parser.add_argument(
1332
- '--log-level',
1333
- nargs='?',
1334
- const='info',
1335
- action='append',
1336
- help='Set log verbosity: debug, info, warning, error. Bare --log-level resolves to info.'
1337
- )
1338
-
1339
- parser.add_argument(
1340
- '--signout',
1341
- action='store_true',
1342
- help='Sign out and clear cached authentication tokens'
1343
- )
1344
-
1345
- parser.add_argument(
1346
- '--concurrency',
1347
- type=int,
1348
- default=5,
1349
- help='Number of parallel workers for prompt processing (1-5, default: 5)'
1350
- )
1351
-
1352
- args = parser.parse_args()
1353
-
1354
- args.m365_agent_id = normalize_agent_id(args.m365_agent_id)
1355
-
1356
- if args.concurrency < 1:
1357
- parser.error('--concurrency must be an integer >= 1.')
1358
- if args.concurrency > MAX_CONCURRENCY:
1359
- emit_structured_log(
1360
- "warning",
1361
- f"--concurrency {args.concurrency} exceeds max {MAX_CONCURRENCY}; clamping to {MAX_CONCURRENCY}.",
1362
- operation=Operation.SETUP,
1363
- )
1364
- args.concurrency = MAX_CONCURRENCY
1365
-
1366
- return args
1367
-
1368
- def validate_environment() -> CallPath:
1369
- """Validate required environment variables."""
1370
- required_env_vars = [
1371
- "AZURE_AI_OPENAI_ENDPOINT",
1372
- "AZURE_AI_API_KEY",
1373
- "AZURE_AI_API_VERSION",
1374
- "AZURE_AI_MODEL_NAME",
1375
- ]
1376
-
1377
- if os.environ.get("COPILOT_API_ACCESS_TOKEN"):
1378
- call_path = CallPath.ACCESS_TOKEN
1379
- required_env_vars.extend([
1380
- "COPILOT_API_ACCESS_TOKEN",
1381
- "COPILOT_API_ENDPOINT",
1382
- "X_SCENARIO_HEADER",
1383
- ])
1384
- elif os.environ.get("WORK_IQ_A2A_ENDPOINT"):
1385
- call_path = CallPath.A2A
1386
- required_env_vars.extend([
1387
- "WORK_IQ_A2A_ENDPOINT",
1388
- "WORK_IQ_A2A_CLIENT_ID",
1389
- "TENANT_ID",
1390
- ])
1391
- else:
1392
- call_path = CallPath.COPILOT_AUTH
1393
- required_env_vars.extend([
1394
- "COPILOT_API_ENDPOINT",
1395
- "X_SCENARIO_HEADER",
1396
- "M365_EVAL_CLIENT_ID",
1397
- "TENANT_ID",
1398
- ])
1399
-
1400
- missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
1401
- if missing_vars:
1402
- emit_structured_log(
1403
- "error",
1404
- "Missing required environment variables: "
1405
- f"{', '.join(missing_vars)}. Please ensure your .env file contains "
1406
- "all required Azure configuration.",
1407
- operation=Operation.VALIDATE_ENV,
1408
- )
1409
- sys.exit(1)
1410
- return call_path
1411
-
1412
- def validate_endpoint_url(url: str, allowed_domains: List[str]) -> bool:
1413
- """Validate URL against security requirements."""
1414
- try:
1415
- parsed = urllib.parse.urlparse(url)
1416
-
1417
- # Check for dangerous schemes
1418
- if parsed.scheme in ['javascript', 'data']:
1419
- raise ValueError(f"Dangerous URL scheme detected: {parsed.scheme}")
1420
-
1421
- # Check for HTTPS requirement
1422
- if parsed.scheme != 'https':
1423
- raise ValueError(f"Only HTTPS URLs are allowed, got: {parsed.scheme}")
1424
-
1425
- # Check if domain is in allowed list
1426
- if parsed.netloc not in allowed_domains:
1427
- raise ValueError(f"Domain not in allowed list: {parsed.netloc}")
1428
-
1429
- # Reject fragment URLs
1430
- if parsed.fragment:
1431
- raise ValueError("Fragment URLs are not allowed")
1432
-
1433
- return True
1434
-
1435
- except ValueError:
1436
- # Re-raise ValueError exceptions
1437
- raise
1438
- except Exception as e:
1439
- # Convert other parsing errors to ValueError
1440
- raise ValueError(f"Invalid URL format: {url}") from e
1441
-
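How the URL check behaves against the allow-list defined earlier (the example URLs are illustrative):

validate_endpoint_url("https://substrate.office.com/api", ALLOWED_ENDPOINTS)       # -> True
validate_endpoint_url("http://substrate.office.com/api", ALLOWED_ENDPOINTS)        # ValueError: only HTTPS allowed
validate_endpoint_url("https://evil.example.com/", ALLOWED_ENDPOINTS)              # ValueError: domain not in allowed list
validate_endpoint_url("https://graph.microsoft.com/v1.0#frag", ALLOWED_ENDPOINTS)  # ValueError: fragment URLs rejected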
1442
- def get_prompt_datasets(args) -> Tuple[List[Dict], Optional[Dict]]:
1443
- """Get prompts and expected responses based on command line arguments.
1444
-
1445
- Returns:
1446
- Tuple of (eval_items, default_evaluators).
1447
- """
1448
- if args.prompts:
1449
- if args.expected and len(args.prompts) != len(args.expected):
1450
- emit_structured_log(
1451
- "error",
1452
- "Number of prompts must match number of expected responses. "
1453
- "Update --expected values to match the prompt count.",
1454
- )
1455
- sys.exit(1)
1456
- expected_responses = args.expected or [""] * len(args.prompts)
1457
- eval_items = [
1458
- {"prompt": p, "expected_response": e}
1459
- for p, e in zip(args.prompts, expected_responses)
1460
- ]
1461
- return eval_items, None
1462
- elif args.prompts_file:
1463
- return load_prompts_from_file(args.prompts_file)
1464
- elif args.interactive:
1465
- prompts, expected_responses = get_interactive_prompts()
1466
- eval_items = [
1467
- {"prompt": p, "expected_response": e}
1468
- for p, e in zip(prompts, expected_responses)
1469
- ]
1470
- return eval_items, None
1471
- else:
1472
- prompts, expected_responses = get_default_prompts_and_responses()
1473
- eval_items = [
1474
- {"prompt": p, "expected_response": e}
1475
- for p, e in zip(prompts, expected_responses)
1476
- ]
1477
- return eval_items, None
1478
-
1479
- def select_agent_interactively(agents: List[Dict[str, Any]]) -> Tuple[Optional[str], Optional[str]]:
1480
- """
1481
- Display an interactive agent selector using questionary.
1482
-
1483
- Args:
1484
- agents: List of agent dictionaries.
1485
-
1486
- Returns:
1487
- Tuple of (agent_id, agent_name) or (None, None) if cancelled/skipped
1488
- """
1489
- if not agents:
1490
- return None, None
1491
-
1492
- # Build id→name lookup and choices
1493
- id_to_name: Dict[str, str] = {}
1494
- choices = []
1495
- sorted_agents = sorted(agents, key=lambda x: (not x.get('isOwner', False), x.get('name', '')))
1496
- for agent in sorted_agents:
1497
- agent_id = agent.get("gptId", "Unknown")
1498
- agent_name = agent.get("name", "Unknown")
1499
- agent_description = agent.get("description", "Unknown")
1500
- agent_is_owner = agent.get('isOwner')
1501
- id_to_name[agent_id] = agent_name
1502
-
1503
- # Format the display text
1504
- display_text = f"{agent_name} ({agent_id}, IsOwner: {agent_is_owner}) - {agent_description}"
1505
-
1506
- choices.append(questionary.Choice(title=display_text, value=agent_id))
1507
-
1508
- # Display the selection prompt
1509
- selected_agent = questionary.select(
1510
- "Select an agent to evaluate:",
1511
- choices=choices,
1512
- use_shortcuts=True,
1513
- use_arrow_keys=True
1514
- ).ask()
1515
-
1516
- return selected_agent, id_to_name.get(selected_agent) if selected_agent else None
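The owner-first ordering and questionary choice formatting used by the removed selector can be exercised on their own; the agent dictionaries below are made up.

    # Made-up agent records; owners sort first, then alphabetically by name.
    import questionary

    agents = [
        {"gptId": "g-2", "name": "Beta", "description": "Team agent", "isOwner": False},
        {"gptId": "g-1", "name": "Alpha", "description": "My agent", "isOwner": True},
    ]
    agents.sort(key=lambda a: (not a.get("isOwner", False), a.get("name", "")))
    choices = [
        questionary.Choice(
            title=f"{a['name']} ({a['gptId']}, IsOwner: {a.get('isOwner')}) - {a['description']}",
            value=a["gptId"],
        )
        for a in agents
    ]
    selected = questionary.select("Select an agent to evaluate:", choices=choices).ask()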
53
+ from dataclasses import replace
1517
54
 
1518
55
 
1519
- def output_results(results: List[Dict], args, default_evaluators: Optional[Dict[str, Any]] = None,
1520
- agent_name: Optional[str] = None, cli_version: Optional[str] = None):
1521
- """Output results based on specified format."""
1522
- metadata_kwargs = dict(
1523
- agent_name=agent_name,
1524
- agent_id=getattr(args, 'm365_agent_id', None),
1525
- cli_version=cli_version,
1526
- )
1527
- if args.output:
1528
- output_lower = args.output.lower()
1529
- if output_lower.endswith('.json'):
1530
- write_results_to_json(results, args.output, default_evaluators=default_evaluators,
1531
- **metadata_kwargs)
1532
- elif output_lower.endswith('.csv'):
1533
- write_results_to_csv(results, args.output, **metadata_kwargs)
1534
- elif output_lower.endswith('.html'):
1535
- write_results_to_html(results, args.output, **metadata_kwargs)
1536
- abs_path = os.path.abspath(args.output)
1537
- webbrowser.open(f'file://{abs_path}')
1538
- else:
1539
- write_results_to_json(results, args.output, default_evaluators=default_evaluators,
1540
- **metadata_kwargs)
1541
- else:
1542
- write_results_to_console(results, **metadata_kwargs)
1543
-
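The extension-based dispatch in output_results reduces to a small decision table; the helper below is a hypothetical stand-in for the real result_writer functions.

    # Hypothetical helper mirroring the dispatch: .csv/.html go to their writers,
    # .json and unknown extensions fall back to JSON, and no --output means console.
    def pick_writer(output_path):
        if not output_path:
            return "console"
        lower = output_path.lower()
        if lower.endswith(".csv"):
            return "csv"
        if lower.endswith(".html"):
            return "html"   # the CLI also opens the HTML report in a browser
        return "json"

    assert pick_writer("run.html") == "html"
    assert pick_writer(None) == "console"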
1544
56
  def main():
1545
57
  """Main function to orchestrate the evaluation process."""
1546
58
  load_dotenv()
@@ -1555,136 +67,77 @@ def main():
1555
67
  )
1556
68
  sys.exit(2)
1557
69
 
1558
- args.effective_log_level = effective_log_level
1559
- configure_cli_logging(effective_log_level)
1560
- emit_structured_log("info", f"Log level set to '{effective_log_level}'.", operation=Operation.SETUP)
70
+ config = replace(
71
+ RunConfig.from_namespace(args),
72
+ effective_log_level=effective_log_level,
73
+ )
74
+ configure_cli_logging(config.effective_log_level)
75
+ emit_structured_log("info", f"Log level set to '{config.effective_log_level}'.", operation=Operation.SETUP)
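The new flow parses arguments into an immutable config object once and then derives updated copies with dataclasses.replace; RunConfig below is a hypothetical stand-in for the one defined in cli_args.

    # Hypothetical RunConfig illustrating the frozen-dataclass + replace() pattern.
    from dataclasses import dataclass, replace
    from typing import Optional

    @dataclass(frozen=True)
    class RunConfig:
        m365_agent_id: Optional[str] = None
        effective_log_level: str = "info"

    config = RunConfig()
    config = replace(config, effective_log_level="debug")    # adjust log level
    config = replace(config, m365_agent_id="agent-123")      # record selected agent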
1561
76
 
1562
77
  # Check minimum version before proceeding
1563
- quiet_for_version = effective_log_level in ("warning", "error")
78
+ quiet_for_version = config.effective_log_level in ("warning", "error")
1564
79
  cli_version = get_cli_version(quiet=quiet_for_version)
1565
- if not should_bypass_min_version_check(args) and not check_min_version(cli_version, quiet=quiet_for_version):
80
+ if not should_bypass_min_version_check(config) and not check_min_version(cli_version, quiet=quiet_for_version):
1566
81
  sys.exit(1)
1567
82
 
1568
83
  # Validate environment variables required for evaluation
1569
- call_path = validate_environment()
1570
-
1571
- user_oid = ""
84
+ validate_environment()
1572
85
 
1573
- match call_path:
1574
- case CallPath.ACCESS_TOKEN:
1575
- access_token = os.environ["COPILOT_API_ACCESS_TOKEN"]
1576
- user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)
1577
- copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
1578
- validate_endpoint_url(copilot_api_endpoint, ALLOWED_ENDPOINTS)
1579
- agent_client = SydneyClient(
1580
- copilot_api_endpoint=copilot_api_endpoint,
1581
- access_token=access_token,
1582
- user_oid=user_oid,
1583
- logger=CLI_LOGGER,
1584
- diagnostic_records=DIAGNOSTIC_RECORDS,
1585
- )
86
+ a2a_endpoint = os.environ[ENV_WORK_IQ_A2A_ENDPOINT]
87
+ validate_endpoint_url(a2a_endpoint, ALLOWED_ENDPOINTS)
1586
88
 
1587
- case CallPath.A2A:
89
+ a2a_scopes_str = os.environ.get(ENV_WORK_IQ_A2A_SCOPES, "")
90
+ a2a_auth_handler = AuthHandler(
91
+ client_id=os.environ[ENV_WORK_IQ_A2A_CLIENT_ID],
92
+ tenant_id=os.environ[ENV_TENANT_ID],
93
+ scopes_str=a2a_scopes_str,
94
+ )
95
+ if config.signout:
96
+ try:
97
+ a2a_auth_handler.clear_cache()
98
+ except Exception as e:
1588
99
  emit_structured_log(
1589
- "warning",
1590
- "The A2A endpoint is experimental and may change without notice.",
1591
- operation=Operation.SETUP,
1592
- )
1593
- a2a_endpoint = os.environ["WORK_IQ_A2A_ENDPOINT"]
1594
- validate_endpoint_url(a2a_endpoint, ALLOWED_ENDPOINTS)
1595
-
1596
- a2a_scopes_str = os.environ.get("WORK_IQ_A2A_SCOPES", "")
1597
- a2a_auth_handler = AuthHandler(
1598
- client_id=os.environ["WORK_IQ_A2A_CLIENT_ID"],
1599
- tenant_id=os.environ["TENANT_ID"],
1600
- scopes_str=a2a_scopes_str,
1601
- )
1602
- try:
1603
- a2a_auth_result = a2a_auth_handler.acquire_token_interactive() or {}
1604
- a2a_access_token = a2a_auth_result.get("access_token") or ""
1605
- if not a2a_access_token:
1606
- raise RuntimeError("Failed to acquire A2A access token")
1607
- except Exception as e:
1608
- emit_structured_log(
1609
- "error",
1610
- f"Error during A2A authentication: {e}",
1611
- operation=Operation.AUTHENTICATE,
1612
- )
1613
- if effective_log_level == "debug":
1614
- import traceback
1615
- traceback.print_exc()
1616
- sys.exit(1)
1617
- try:
1618
- agent_client = A2AClient(
1619
- a2a_endpoint=a2a_endpoint,
1620
- access_token=a2a_access_token,
1621
- logger=CLI_LOGGER,
1622
- diagnostic_records=DIAGNOSTIC_RECORDS,
1623
- )
1624
- except Exception as e:
1625
- emit_structured_log(
1626
- "error",
1627
- f"Failed to initialize A2A client: {e}",
1628
- operation=Operation.SETUP,
1629
- )
1630
- sys.exit(1)
1631
-
1632
- case CallPath.COPILOT_AUTH:
1633
- copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
1634
- validate_endpoint_url(copilot_api_endpoint, ALLOWED_ENDPOINTS)
1635
- auth_handler = AuthHandler(
1636
- client_id=os.environ["M365_EVAL_CLIENT_ID"],
1637
- tenant_id=os.environ["TENANT_ID"],
1638
- scopes_str=os.environ.get("COPILOT_SCOPES", ""),
100
+ "error",
101
+ f"Error during signout: {e}",
102
+ operation=Operation.AUTHENTICATE,
1639
103
  )
104
+ sys.exit(1)
105
+ sys.exit(0)
1640
106
 
1641
- # Signout user
1642
- if args.signout:
1643
- try:
1644
- auth_handler.clear_cache()
1645
- except Exception as e:
1646
- emit_structured_log("error", f"Error during signout: {e}", operation=Operation.AUTHENTICATE)
1647
- sys.exit(1)
1648
- sys.exit(0)
1649
-
1650
- try:
1651
- auth_result = auth_handler.acquire_token_interactive() or {}
1652
- access_token = auth_result.get("access_token") or ""
1653
- if not access_token:
1654
- raise RuntimeError("Failed to acquire access token from authentication result")
1655
-
1656
- id_token_claims = auth_result.get("id_token_claims")
1657
- if not isinstance(id_token_claims, dict):
1658
- emit_structured_log(
1659
- "warning", "id_token_claims is missing or invalid in authentication result",
1660
- operation=Operation.AUTHENTICATE,
1661
- )
1662
- else:
1663
- user_oid = id_token_claims.get("oid") or ""
1664
-
1665
- if not user_oid:
1666
- user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)
1667
-
1668
- except Exception as e:
1669
- emit_structured_log("error", f"Error during authentication: {e}", operation=Operation.AUTHENTICATE)
1670
- if effective_log_level == "debug":
1671
- import traceback
1672
- traceback.print_exc()
1673
- sys.exit(1)
1674
-
1675
- agent_client = SydneyClient(
1676
- copilot_api_endpoint=copilot_api_endpoint,
1677
- access_token=access_token,
1678
- user_oid=user_oid,
1679
- logger=CLI_LOGGER,
1680
- diagnostic_records=DIAGNOSTIC_RECORDS,
1681
- )
107
+ try:
108
+ a2a_auth_result = a2a_auth_handler.acquire_token_interactive() or {}
109
+ a2a_access_token = a2a_auth_result.get("access_token") or ""
110
+ if not a2a_access_token:
111
+ raise RuntimeError("Failed to acquire A2A access token")
112
+ except Exception as e:
113
+ emit_structured_log(
114
+ "error",
115
+ f"Error during A2A authentication: {e}",
116
+ operation=Operation.AUTHENTICATE,
117
+ )
118
+ if config.effective_log_level == "debug":
119
+ traceback.print_exc()
120
+ sys.exit(1)
121
+ try:
122
+ agent_client = A2AClient(
123
+ a2a_endpoint=a2a_endpoint,
124
+ access_token=a2a_access_token,
125
+ logger=CLI_LOGGER,
126
+ diagnostic_records=DIAGNOSTIC_RECORDS,
127
+ )
128
+ except Exception as e:
129
+ emit_structured_log(
130
+ "error",
131
+ f"Failed to initialize A2A client: {e}",
132
+ operation=Operation.SETUP,
133
+ )
134
+ sys.exit(1)
1682
135
 
1683
136
  # 1. Load evaluation datasets
1684
- eval_items, file_default_evaluators = get_prompt_datasets(args)
137
+ eval_items, file_default_evaluators = get_prompt_datasets(config)
1685
138
  default_evaluators = resolve_default_evaluators(file_default_evaluators)
1686
139
 
1687
- if effective_log_level in ("info", "debug"):
140
+ if config.effective_log_level in ("info", "debug"):
1688
141
  multi_turn_count = sum(1 for item in eval_items if "turns" in item)
1689
142
  single_turn_count = len(eval_items) - multi_turn_count
1690
143
  emit_structured_log(
@@ -1697,54 +150,53 @@ def main():
1697
150
  agent_name = None
1698
151
  try:
1699
152
  # 2. Agent selection - when no agent ID is provided, discover agents
1700
- # via the active client (A2A or REST) and prompt interactively.
1701
- if not args.m365_agent_id:
1702
- if effective_log_level in ("info", "debug"):
153
+ # via the active client (A2A) and prompt interactively.
154
+ if not config.m365_agent_id:
155
+ if config.effective_log_level in ("info", "debug"):
1703
156
  emit_structured_log("info", "No agent ID provided. Fetching available agents.", operation=Operation.FETCH_AGENTS)
1704
157
 
1705
158
  available_agents = agent_client.fetch_available_agents()
1706
159
  if not available_agents:
1707
160
  emit_structured_log(
1708
- "error",
1709
- "No agents are available for interactive selection. Re-run with "
1710
- "--m365-agent-id or set M365_AGENT_ID.",
1711
- operation=Operation.FETCH_AGENTS,
161
+ "error",
162
+ "No agents are available for interactive selection."
163
+ " Re-run with --m365-agent-id or set M365_AGENT_ID.",
164
+ operation=Operation.FETCH_AGENTS,
1712
165
  )
1713
166
  sys.exit(1)
1714
167
 
1715
168
  selected_agent_id, agent_name = select_agent_interactively(available_agents)
1716
169
  if selected_agent_id:
1717
- args.m365_agent_id = selected_agent_id
1718
- if effective_log_level in ("info", "debug"):
1719
- emit_structured_log("info", f"Selected agent: {args.m365_agent_id}", operation=Operation.FETCH_AGENTS)
170
+ config = replace(config, m365_agent_id=selected_agent_id)
171
+ if config.effective_log_level in ("info", "debug"):
172
+ emit_structured_log("info", f"Selected agent: {config.m365_agent_id}", operation=Operation.FETCH_AGENTS)
1720
173
  else:
1721
174
  emit_structured_log(
1722
- "error",
1723
- "No agent selected. Re-run with --m365-agent-id or set M365_AGENT_ID.",
1724
- operation=Operation.FETCH_AGENTS,
175
+ "error",
176
+ "No agent selected. Re-run with --m365-agent-id or set M365_AGENT_ID.",
177
+ operation=Operation.FETCH_AGENTS,
1725
178
  )
1726
179
  sys.exit(1)
1727
180
  except Exception as e:
1728
181
  emit_structured_log("error", f"Error during agent discovery: {e}", operation=Operation.FETCH_AGENTS)
1729
- if effective_log_level == "debug":
1730
- import traceback
182
+ if config.effective_log_level == "debug":
1731
183
  traceback.print_exc()
1732
184
  sys.exit(1)
1733
185
 
1734
- # Pre-resolve agent endpoint (A2A agent card lookup; no-op for REST)
1735
- if args.m365_agent_id:
1736
- agent_client.resolve_agent(args.m365_agent_id)
186
+ # Pre-resolve agent endpoint (A2A agent card lookup)
187
+ if config.m365_agent_id:
188
+ agent_client.resolve_agent(config.m365_agent_id)
1737
189
 
1738
190
  # 3. Build pipeline config and run evaluation pipeline
1739
191
  model_config = AzureOpenAIModelConfiguration(
1740
- azure_endpoint=os.environ.get("AZURE_AI_OPENAI_ENDPOINT"),
1741
- api_key=os.environ.get("AZURE_AI_API_KEY"),
1742
- api_version=os.environ.get("AZURE_AI_API_VERSION"),
1743
- azure_deployment=os.environ.get("AZURE_AI_MODEL_NAME"),
192
+ azure_endpoint=os.environ.get(ENV_AZURE_AI_OPENAI_ENDPOINT),
193
+ api_key=os.environ.get(ENV_AZURE_AI_API_KEY),
194
+ api_version=os.environ.get(ENV_AZURE_AI_API_VERSION),
195
+ azure_deployment=os.environ.get(ENV_AZURE_AI_MODEL_NAME),
1744
196
  )
1745
197
  has_azure_openai = bool(
1746
- os.environ.get("AZURE_AI_OPENAI_ENDPOINT")
1747
- and os.environ.get("AZURE_AI_API_KEY")
198
+ os.environ.get(ENV_AZURE_AI_OPENAI_ENDPOINT)
199
+ and os.environ.get(ENV_AZURE_AI_API_KEY)
1748
200
  )
1749
201
 
1750
202
  pipeline = PipelineConfig(
@@ -1754,13 +206,14 @@ def main():
1754
206
  default_evaluators=default_evaluators,
1755
207
  )
1756
208
 
1757
- results = run_pipeline(pipeline, eval_items, args)
1758
-
209
+ results = run_pipeline(pipeline, eval_items, config)
210
+
1759
211
  # 4. Output results
1760
- output_results(results, args, default_evaluators=default_evaluators,
1761
- agent_name=agent_name, cli_version=str(cli_version) if cli_version else None)
212
+ output_results(
213
+ results, config, default_evaluators=default_evaluators,
214
+ agent_name=agent_name, cli_version=str(cli_version) if cli_version else None)
1762
215
 
1763
- if effective_log_level in ("info", "debug"):
216
+ if config.effective_log_level in ("info", "debug"):
1764
217
  emit_structured_log(
1765
218
  "info",
1766
219
  f"Evaluation completed successfully. Processed {len(eval_items)} item(s).",
@@ -1768,5 +221,5 @@ def main():
1768
221
  )
1769
222
 
1770
223
  # Call the main function when script is run directly
1771
- if __name__ == "__main__":
224
+ if __name__ == "__main__": # pragma: no cover
1772
225
  main()