@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.4.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. package/README.md +129 -97
  2. package/package.json +7 -4
  3. package/schema/v1/eval-document.schema.json +140 -8
  4. package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
  5. package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
  6. package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
  7. package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
  8. package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
  9. package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
  10. package/schema/v1/examples/valid/multi-turn-output.json +59 -0
  11. package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
  12. package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
  13. package/schema/version.json +2 -2
  14. package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
  15. package/src/clients/cli/api_clients/A2A/a2a_client.py +456 -0
  16. package/src/clients/cli/api_clients/REST/__init__.py +3 -0
  17. package/src/clients/cli/api_clients/REST/sydney_client.py +204 -0
  18. package/src/clients/cli/api_clients/__init__.py +3 -0
  19. package/src/clients/cli/api_clients/base_agent_client.py +78 -0
  20. package/src/clients/cli/cli_logging/console_diagnostics.py +54 -2
  21. package/src/clients/cli/cli_logging/logging_utils.py +0 -1
  22. package/src/clients/cli/common.py +11 -0
  23. package/src/clients/cli/generate_report.py +272 -129
  24. package/src/clients/cli/main.py +1006 -476
  25. package/src/clients/cli/parallel_executor.py +57 -0
  26. package/src/clients/cli/requirements.txt +1 -1
  27. package/src/clients/cli/response_extractor.py +12 -14
  28. package/src/clients/cli/retry_policy.py +52 -0
  29. package/src/clients/cli/samples/multiturn_example.json +35 -0
  30. package/src/clients/cli/throttle_gate.py +82 -0
  31. package/src/clients/node-js/bin/runevals.js +79 -16
  32. package/src/clients/node-js/config/default.js +5 -1
  33. package/src/clients/node-js/lib/agent-id.js +12 -0
  34. package/src/clients/node-js/lib/env-loader.js +11 -16
  35. package/src/clients/node-js/lib/eula-manager.js +78 -0
  36. package/src/clients/node-js/lib/progress.js +13 -11
@@ -3,15 +3,19 @@ import os
3
3
  import argparse
4
4
  import sys
5
5
  import csv
6
- import functools
7
6
  import logging
7
+ import time
8
8
  import webbrowser
9
- import urllib.request
10
- import urllib.error
11
9
  import urllib.parse
12
10
  import questionary
11
+ from dataclasses import dataclass, field
13
12
  from enum import Enum
14
13
  from typing import List, Dict, Tuple, Optional, Any
14
+
15
+ from api_clients.A2A import A2AClient
16
+ from api_clients.REST import SydneyClient
17
+ from api_clients.base_agent_client import BaseAgentClient
18
+
15
19
  from azure.ai.evaluation import (
16
20
  AzureOpenAIModelConfiguration,
17
21
  RelevanceEvaluator,
@@ -25,7 +29,7 @@ from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFor
25
29
  from custom_evaluators.ExactMatchEvaluator import ExactMatchEvaluator
26
30
  from custom_evaluators.PartialMatchEvaluator import PartialMatchEvaluator
27
31
  from generate_report import generate_html_report, calculate_aggregate_statistics
28
- from response_extractor import extract_enhanced_responses, get_response_text_for_evaluation
32
+ from response_extractor import get_response_text_for_evaluation
29
33
  from schema_handler import DocumentUpgrader, SchemaVersionManager
30
34
  from common import (
31
35
  RELEVANCE,
@@ -38,10 +42,14 @@ from common import (
38
42
  REQUIRES_AZURE_OPENAI,
39
43
  REQUIRES_TOOL_DEFINITIONS,
40
44
  METRIC_IDS,
45
+ STATUS_PASS,
46
+ STATUS_FAIL,
47
+ STATUS_ERROR,
48
+ STATUS_PARTIAL,
49
+ STATUS_UNKNOWN,
41
50
  pascal_case_to_title,
42
51
  )
43
52
  from evaluator_resolver import (
44
- EVALUATOR_REGISTRY,
45
53
  validate_evaluator_names,
46
54
  check_prerequisites,
47
55
  resolve_default_evaluators,
@@ -51,20 +59,86 @@ from evaluator_resolver import (
51
59
  from version_check import check_min_version, get_cli_version
52
60
  from datetime import datetime, timezone
53
61
  from pathlib import Path
54
- import tzlocal
55
62
 
56
- from cli_logging.console_diagnostics import render_diagnostic, serialize_diagnostic_record
57
- from cli_logging.logging_utils import LOG_LEVEL_MAP, LogLevel, Operation, format_structured_log_entry, resolve_log_level
63
+
64
+ from cli_logging.console_diagnostics import emit_structured_log as _emit_structured_log
65
+ from cli_logging.logging_utils import LOG_LEVEL_MAP, Operation, resolve_log_level
66
+ from parallel_executor import execute_in_parallel
67
+ from throttle_gate import ThrottleGate
68
+ from retry_policy import (
69
+ is_retryable_status,
70
+ get_backoff_seconds,
71
+ get_retry_after_seconds,
72
+ )
58
73
 
59
74
  # Allowed endpoints for URL validation
60
75
  ALLOWED_ENDPOINTS = [
61
- 'substrate.office.com'
76
+ 'substrate.office.com',
77
+ 'graph.microsoft.com',
62
78
  ]
63
79
 
80
+ MAX_CONCURRENCY = 5
81
+ MAX_ATTEMPTS = 4 # Initial attempt + 3 retries
82
+ MAX_TURNS_PER_THREAD = 20
83
+ LONG_THREAD_WARNING_THRESHOLD = 10
84
+
85
+
86
+ @dataclass
87
+ class PipelineConfig:
88
+ """Runtime configuration for the evaluation pipeline."""
89
+ agent_client: BaseAgentClient
90
+ model_config: AzureOpenAIModelConfiguration
91
+ has_azure_openai: bool
92
+ default_evaluators: Dict[str, Any]
93
+ chat_gate: ThrottleGate = field(default_factory=lambda: ThrottleGate("chat_api"))
94
+ is_retryable_status: Any = field(default=is_retryable_status)
95
+ get_backoff_seconds: Any = field(default=get_backoff_seconds)
96
+ get_retry_after_seconds: Any = field(default=get_retry_after_seconds)
97
+
64
98
  class CallPath(Enum):
65
99
  """ Enum to indicate which call path to use. """
66
100
  ACCESS_TOKEN = "access_token"
67
101
  COPILOT_AUTH = "copilot_auth"
102
+ A2A = "a2a"
103
+
104
+
105
+ class ItemType(Enum):
106
+ SINGLE_TURN = "single_turn"
107
+ MULTI_TURN = "multi_turn"
108
+
109
+
110
+ def detect_item_type(item: dict) -> ItemType:
111
+ """Determine if an evaluation item is single-turn or multi-turn.
112
+
113
+ Returns ItemType.SINGLE_TURN if item has 'prompt' without 'turns',
114
+ ItemType.MULTI_TURN if item has 'turns' array.
115
+
116
+ Raises ValueError for invalid items (both, neither, or invalid turns).
117
+ """
118
+ has_turns = "turns" in item
119
+ has_prompt = "prompt" in item
120
+
121
+ if has_turns and has_prompt:
122
+ raise ValueError(
123
+ "Invalid evaluation item: cannot have both 'turns' and 'prompt'. "
124
+ "Use 'turns' for multi-turn threads or 'prompt' for single-turn."
125
+ )
126
+
127
+ if has_turns and not isinstance(item["turns"], list):
128
+ raise ValueError("Invalid evaluation item: 'turns' must be a list")
129
+
130
+ if has_turns:
131
+ if len(item["turns"]) == 0:
132
+ raise ValueError("Invalid multi-turn thread: 'turns' array cannot be empty")
133
+ return ItemType.MULTI_TURN
134
+
135
+ if has_prompt:
136
+ return ItemType.SINGLE_TURN
137
+
138
+ raise ValueError(
139
+ "Invalid evaluation item: must have either 'turns' array (multi-turn) "
140
+ "or 'prompt' field (single-turn)"
141
+ )
68
142
 
69
143
 
70
144
  # Flags that should bypass remote min-version enforcement.
@@ -78,40 +152,21 @@ CLI_LOGGER = logging.getLogger(CLI_LOGGER_NAME)
78
152
  DIAGNOSTIC_RECORDS: List[Dict[str, Any]] = []
79
153
 
80
154
 
81
- def _ensure_logger_handler() -> None:
82
- if CLI_LOGGER.handlers:
83
- return
84
- handler = logging.StreamHandler(sys.stdout)
85
- handler.setFormatter(logging.Formatter("%(message)s"))
86
- CLI_LOGGER.addHandler(handler)
87
- CLI_LOGGER.propagate = False
88
-
89
-
90
155
  def configure_cli_logging(effective_log_level: str) -> None:
91
- _ensure_logger_handler()
156
+ if not CLI_LOGGER.handlers:
157
+ handler = logging.StreamHandler(sys.stdout)
158
+ handler.setFormatter(logging.Formatter("%(message)s"))
159
+ CLI_LOGGER.addHandler(handler)
160
+ CLI_LOGGER.propagate = False
92
161
  CLI_LOGGER.setLevel(LOG_LEVEL_MAP[effective_log_level])
93
162
 
94
163
 
95
164
  def emit_structured_log(level: str, message: str, operation: str = Operation.EVALUATE) -> None:
96
- _ensure_logger_handler()
97
- context = {
98
- "request-id": None,
99
- "conversation-id": None,
100
- "message-id": None,
101
- "operation": operation,
102
- }
103
- entry = format_structured_log_entry(
104
- level=level,
105
- message=message,
106
- logger_name=CLI_LOGGER_NAME,
107
- run_context=context,
165
+ _emit_structured_log(
166
+ level, message, operation,
167
+ logger=CLI_LOGGER,
168
+ diagnostic_records=DIAGNOSTIC_RECORDS,
108
169
  )
109
- DIAGNOSTIC_RECORDS.append(entry)
110
-
111
- try:
112
- CLI_LOGGER.log(LOG_LEVEL_MAP.get(level, logging.INFO), render_diagnostic(entry))
113
- except Exception:
114
- pass
115
170
 
116
171
 
117
172
  def should_bypass_min_version_check(args: argparse.Namespace) -> bool:
@@ -237,118 +292,88 @@ def get_interactive_prompts() -> Tuple[List[str], List[str]]:
237
292
 
238
293
  return prompts, expected_responses
239
294
 
240
- def run_evaluations(args, responses: List[Dict[str, Any]], eval_items: List[Dict],
241
- default_evaluators: Dict[str, Any]) -> list:
242
- """Run evaluations against the responses using per-prompt evaluator resolution.
243
295
 
244
- Args:
245
- args: CLI arguments.
246
- responses: List of enhanced response dicts (one per prompt, aligned with eval_items by index).
247
- eval_items: List of item dicts (prompt, expected_response, evaluators, evaluators_mode).
248
- default_evaluators: Resolved default evaluators (from resolve_default_evaluators).
249
- """
250
- if len(responses) != len(eval_items):
251
- raise ValueError(
252
- f"Mismatch between number of responses ({len(responses)}) and evaluation items ({len(eval_items)})."
253
- )
254
-
255
- model_config = AzureOpenAIModelConfiguration(
256
- azure_endpoint=os.environ.get("AZURE_AI_OPENAI_ENDPOINT"),
257
- api_key=os.environ.get("AZURE_AI_API_KEY"),
258
- api_version=os.environ.get("AZURE_AI_API_VERSION"),
259
- azure_deployment=os.environ.get("AZURE_AI_MODEL_NAME"),
260
- )
261
-
262
- # Build available context for prerequisite checks
263
- has_azure_openai = bool(
264
- os.environ.get("AZURE_AI_OPENAI_ENDPOINT")
265
- and os.environ.get("AZURE_AI_API_KEY")
266
- )
296
+ _DEFAULT_PASS_THRESHOLD = 3
267
297
 
268
- DEFAULT_PASS_THRESHOLD = 3
269
298
 
270
- def decorate_metric(metric_id: str, data, threshold: Optional[int] = None):
271
- """Augment raw evaluator output with standardized threshold + pass/fail result."""
272
- pass_threshold = threshold if threshold is not None else DEFAULT_PASS_THRESHOLD
273
- payload = {}
274
- if isinstance(data, dict):
275
- payload.update(data)
276
- else:
277
- payload['raw'] = data
278
-
279
- score_val = None
280
- if isinstance(data, dict):
281
- if metric_id in data:
282
- score_val = data[metric_id]
283
- if isinstance(score_val, (int, float)):
284
- payload['threshold'] = pass_threshold
285
- payload['result'] = 'pass' if score_val >= pass_threshold else 'fail'
286
- else:
287
- payload['threshold'] = pass_threshold
288
- payload.setdefault('result', 'unknown')
289
- return json.dumps(payload, indent=4)
290
-
291
- # Validate all evaluator names upfront (across defaults and all items)
292
- all_evaluator_maps = [default_evaluators]
293
- for eval_item in eval_items:
294
- if "evaluators" in eval_item:
295
- all_evaluator_maps.append(eval_item["evaluators"])
296
- for emap in all_evaluator_maps:
297
- validate_evaluator_names(emap)
298
-
299
- evaluation_results = []
300
- for enhanced_response, eval_item in zip(responses, eval_items):
301
- actual_response_text = get_response_text_for_evaluation(enhanced_response)
302
- prompt = eval_item.get("prompt", "")
303
- expected_response = eval_item.get("expected_response", "")
304
- prompt_evaluators = eval_item.get("evaluators")
305
- evaluators_mode = eval_item.get("evaluators_mode", "extend")
306
-
307
- # Resolve evaluators for this prompt
308
- resolved = resolve_evaluators_for_prompt(
309
- prompt_evaluators, evaluators_mode, prompt, default_evaluators,
310
- )
299
+ def _decorate_metric(metric_id: str, data, threshold: Optional[int] = None) -> Dict[str, Any]:
300
+ """Augment raw evaluator output with standardized threshold + pass/fail result."""
301
+ pass_threshold = threshold if threshold is not None else _DEFAULT_PASS_THRESHOLD
302
+ payload = {}
303
+ if isinstance(data, dict):
304
+ payload.update(data)
305
+ else:
306
+ payload['raw'] = data
311
307
 
312
- # Build runtime context for prerequisite checks
313
- has_tool_defs = bool(
314
- args.m365_agent_id and enhanced_response.get("tool_definitions")
315
- )
316
- available_context = {
317
- REQUIRES_AZURE_OPENAI: has_azure_openai,
318
- REQUIRES_TOOL_DEFINITIONS: has_tool_defs,
319
- }
308
+ score_val = None
309
+ if isinstance(data, dict):
310
+ if metric_id in data:
311
+ score_val = data[metric_id]
312
+ if isinstance(score_val, (int, float)):
313
+ payload['threshold'] = pass_threshold
314
+ payload['result'] = STATUS_PASS if score_val >= pass_threshold else STATUS_FAIL
315
+ else:
316
+ payload['threshold'] = pass_threshold
317
+ payload.setdefault('result', STATUS_UNKNOWN)
318
+ return payload
319
+
320
+
321
+ def _run_evaluators_for_item(
322
+ prompt: str,
323
+ actual_response: str,
324
+ expected_response: str,
325
+ enhanced_response: Dict[str, Any],
326
+ resolved_evaluators: Dict[str, Any],
327
+ model_config: AzureOpenAIModelConfiguration,
328
+ has_azure_openai: bool,
329
+ args,
330
+ ) -> Tuple[Dict[str, Optional[str]], List[str]]:
331
+ """Run resolved evaluators against a single item/turn.
332
+
333
+ Returns (results_dict, evaluators_ran).
334
+ """
335
+ has_tool_defs = bool(
336
+ args.m365_agent_id and enhanced_response.get("tool_definitions")
337
+ )
338
+ available_context = {
339
+ REQUIRES_AZURE_OPENAI: has_azure_openai,
340
+ REQUIRES_TOOL_DEFINITIONS: has_tool_defs,
341
+ }
320
342
 
321
- results_dict: Dict[str, Optional[str]] = {}
322
- evaluators_ran: List[str] = []
343
+ results_dict: Dict[str, Optional[str]] = {}
344
+ evaluators_ran: List[str] = []
323
345
 
324
- for eval_name, eval_options in resolved.items():
325
- # Check prerequisites
326
- can_run, warn_msg = check_prerequisites(eval_name, available_context)
327
- if not can_run:
328
- if warn_msg:
329
- emit_structured_log("warning", f"Evaluator '{eval_name}' prerequisite check failed: {warn_msg}", operation=Operation.EVALUATE)
330
- results_dict[eval_name] = None
331
- continue
346
+ for eval_name, eval_options in resolved_evaluators.items():
347
+ can_run, warn_msg = check_prerequisites(eval_name, available_context)
348
+ if not can_run:
349
+ if warn_msg:
350
+ emit_structured_log(
351
+ "warning",
352
+ f"Evaluator '{eval_name}' prerequisite check failed: {warn_msg}",
353
+ operation=Operation.EVALUATE,
354
+ )
355
+ results_dict[eval_name] = None
356
+ continue
332
357
 
333
- evaluators_ran.append(eval_name)
334
- threshold = get_evaluator_threshold(eval_name, eval_options)
358
+ threshold = get_evaluator_threshold(eval_name, eval_options)
335
359
 
360
+ try:
336
361
  if eval_name == RELEVANCE:
337
- raw_score = RelevanceEvaluator(model_config=model_config)(query=prompt, response=actual_response_text)
338
- results_dict[RELEVANCE] = decorate_metric(METRIC_IDS[RELEVANCE], raw_score, threshold)
362
+ raw_score = RelevanceEvaluator(model_config=model_config)(query=prompt, response=actual_response)
363
+ results_dict[RELEVANCE] = _decorate_metric(METRIC_IDS[RELEVANCE], raw_score, threshold)
339
364
  elif eval_name == COHERENCE:
340
- raw_score = CoherenceEvaluator(model_config=model_config)(query=prompt, response=actual_response_text)
341
- results_dict[COHERENCE] = decorate_metric(METRIC_IDS[COHERENCE], raw_score, threshold)
365
+ raw_score = CoherenceEvaluator(model_config=model_config)(query=prompt, response=actual_response)
366
+ results_dict[COHERENCE] = _decorate_metric(METRIC_IDS[COHERENCE], raw_score, threshold)
342
367
  elif eval_name == GROUNDEDNESS:
343
- raw_score = GroundednessEvaluator(model_config=model_config)(response=actual_response_text, context=expected_response)
344
- results_dict[GROUNDEDNESS] = decorate_metric(METRIC_IDS[GROUNDEDNESS], raw_score, threshold)
368
+ raw_score = GroundednessEvaluator(model_config=model_config)(response=actual_response, context=expected_response)
369
+ results_dict[GROUNDEDNESS] = _decorate_metric(METRIC_IDS[GROUNDEDNESS], raw_score, threshold)
345
370
  elif eval_name == TOOL_CALL_ACCURACY:
346
371
  raw_score = ToolCallAccuracyEvaluator(model_config)(
347
372
  query=prompt,
348
- response=enhanced_response.get("response", actual_response_text),
349
- tool_definitions=enhanced_response["tool_definitions"],
373
+ response=enhanced_response.get("response", actual_response),
374
+ tool_definitions=enhanced_response.get("tool_definitions", []),
350
375
  )
351
- results_dict[TOOL_CALL_ACCURACY] = decorate_metric(METRIC_IDS[TOOL_CALL_ACCURACY], raw_score, threshold)
376
+ results_dict[TOOL_CALL_ACCURACY] = _decorate_metric(METRIC_IDS[TOOL_CALL_ACCURACY], raw_score, threshold)
352
377
  elif eval_name == CITATIONS:
353
378
  fmt_str = eval_options.get("citation_format", "oai_unicode")
354
379
  fmt_map = {
@@ -356,45 +381,456 @@ def run_evaluations(args, responses: List[Dict[str, Any]], eval_items: List[Dict
356
381
  "bracket": CitationFormat.LEGACY_BRACKET,
357
382
  "mixed": CitationFormat.AUTO,
358
383
  }
359
- raw_score = CitationsEvaluator(citation_format=fmt_map.get(fmt_str, CitationFormat.OAI_UNICODE))(response=actual_response_text)
360
- results_dict[CITATIONS] = decorate_metric(METRIC_IDS[CITATIONS], raw_score, threshold)
384
+ raw_score = CitationsEvaluator(citation_format=fmt_map.get(fmt_str, CitationFormat.OAI_UNICODE))(response=actual_response)
385
+ results_dict[CITATIONS] = _decorate_metric(METRIC_IDS[CITATIONS], raw_score, threshold)
361
386
  elif eval_name == EXACT_MATCH:
362
- # ExactMatch is binary (match/no-match) — it includes its own result
363
- # field, so we skip decorate_metric which assumes a numeric score.
364
387
  case_sensitive = eval_options.get("case_sensitive", False)
365
- raw_score = ExactMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response_text, expected_answer=expected_response)
366
- results_dict[EXACT_MATCH] = json.dumps(raw_score, indent=4)
388
+ raw_score = ExactMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response, expected_answer=expected_response)
389
+ # ExactMatch is binary — the evaluator already sets 'result'
390
+ # so _decorate_metric (which computes result from score vs threshold) is not needed.
391
+ results_dict[EXACT_MATCH] = raw_score
367
392
  elif eval_name == PARTIAL_MATCH:
368
393
  case_sensitive = eval_options.get("case_sensitive", False)
369
- raw_score = PartialMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response_text, expected_answer=expected_response)
370
- results_dict[PARTIAL_MATCH] = decorate_metric(METRIC_IDS[PARTIAL_MATCH], raw_score, threshold)
371
-
372
- evaluation_result = {
373
- "prompt": prompt,
374
- "response": actual_response_text,
375
- "expected_response": expected_response,
376
- "evaluators_ran": evaluators_ran,
377
- "results": results_dict,
394
+ raw_score = PartialMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response, expected_answer=expected_response)
395
+ results_dict[PARTIAL_MATCH] = _decorate_metric(METRIC_IDS[PARTIAL_MATCH], raw_score, threshold)
396
+
397
+ evaluators_ran.append(eval_name)
398
+ except Exception as e:
399
+ emit_structured_log(
400
+ "error",
401
+ f"Evaluator '{eval_name}' crashed and will be omitted from results: {e}",
402
+ operation=Operation.EVALUATE,
403
+ )
404
+ results_dict[eval_name] = None
405
+
406
+ return results_dict, evaluators_ran
407
+
408
+
409
+ def _evaluate_single_response(
410
+ enhanced_response: Dict[str, Any],
411
+ eval_item: Dict,
412
+ args,
413
+ model_config: AzureOpenAIModelConfiguration,
414
+ has_azure_openai: bool,
415
+ default_evaluators: Dict[str, Any],
416
+ ) -> Dict[str, Any]:
417
+ """Run all evaluators for a single prompt/response pair and return the result dict."""
418
+ actual_response_text = get_response_text_for_evaluation(enhanced_response)
419
+ prompt = eval_item.get("prompt", "")
420
+ expected_response = eval_item.get("expected_response", "")
421
+
422
+ resolved = resolve_evaluators_for_prompt(
423
+ eval_item.get("evaluators"), eval_item.get("evaluators_mode", "extend"),
424
+ prompt, default_evaluators,
425
+ )
426
+
427
+ results_dict, evaluators_ran = _run_evaluators_for_item(
428
+ prompt, actual_response_text, expected_response, enhanced_response,
429
+ resolved, model_config, has_azure_openai, args,
430
+ )
431
+
432
+ evaluation_result = {
433
+ "prompt": prompt,
434
+ "response": enhanced_response.get(
435
+ "display_response_text", actual_response_text
436
+ ),
437
+ "expected_response": expected_response,
438
+ "evaluators_ran": evaluators_ran,
439
+ "results": results_dict,
440
+ }
441
+
442
+ if "evaluators" in eval_item:
443
+ evaluation_result["evaluators"] = eval_item["evaluators"]
444
+ if "evaluators_mode" in eval_item:
445
+ evaluation_result["evaluators_mode"] = eval_item["evaluators_mode"]
446
+
447
+ if getattr(args, "effective_log_level", "info") == "debug":
448
+ emit_structured_log(
449
+ "debug",
450
+ f"Evaluation completed for prompt='{evaluation_result['prompt']}'. "
451
+ f"Evaluators: {', '.join(evaluators_ran)}. "
452
+ f"Scores: {evaluation_result['results']}",
453
+ operation=Operation.EVALUATE,
454
+ )
455
+
456
+ return evaluation_result
457
+
458
+
459
+ def _check_all_passed(results_dict: Dict[str, Optional[Dict[str, Any]]]) -> bool:
460
+ """Check if all evaluator results passed. Skipped evaluators (None) are ignored."""
461
+ for result_data in results_dict.values():
462
+ if result_data is None:
463
+ continue
464
+ if result_data.get("result") == STATUS_FAIL:
465
+ return False
466
+ return True
467
+
468
+ def _evaluate_multi_turn_responses(
469
+ turns: List[Dict],
470
+ args,
471
+ default_evaluators: Dict[str, Any],
472
+ model_config: AzureOpenAIModelConfiguration,
473
+ has_azure_openai: bool,
474
+ ) -> Tuple[List[Dict], Dict]:
475
+ """Run per-turn evaluations and build evaluated turn results with summary.
476
+
477
+ Returns:
478
+ Tuple of (evaluated_turns, summary). Each evaluated turn contains
479
+ prompt, response, expected_response, status, evaluators_ran, results,
480
+ and optionally error. Does not mutate the input turns.
481
+ """
482
+ evaluated_turns: List[Dict] = []
483
+ turns_passed = 0
484
+ turns_failed = 0
485
+
486
+ for i, turn in enumerate(turns):
487
+ evaluated_turn: Dict[str, Any] = {
488
+ "prompt": turn.get("prompt", ""),
378
489
  }
490
+ if "expected_response" in turn:
491
+ evaluated_turn["expected_response"] = turn["expected_response"]
492
+ if "response" in turn:
493
+ evaluated_turn["response"] = turn["response"]
494
+ if "evaluators" in turn:
495
+ evaluated_turn["evaluators"] = turn["evaluators"]
496
+ if "evaluators_mode" in turn:
497
+ evaluated_turn["evaluators_mode"] = turn["evaluators_mode"]
498
+
499
+ if turn.get("status") == STATUS_ERROR:
500
+ evaluated_turn["status"] = STATUS_ERROR
501
+ evaluated_turn["error"] = turn.get("error", "")
502
+ turns_failed += 1
503
+ evaluated_turns.append(evaluated_turn)
504
+ continue
379
505
 
380
- # Preserve evaluator config metadata for output
381
- if "evaluators" in eval_item:
382
- evaluation_result["evaluators"] = eval_item["evaluators"]
383
- if "evaluators_mode" in eval_item:
384
- evaluation_result["evaluators_mode"] = eval_item["evaluators_mode"]
506
+ enhanced_response = turn.get("_enhanced_response", {})
507
+ actual_response = get_response_text_for_evaluation(enhanced_response)
508
+
509
+ resolved = resolve_evaluators_for_prompt(
510
+ turn.get("evaluators"), turn.get("evaluators_mode", "extend"),
511
+ turn.get("prompt", ""), default_evaluators,
512
+ )
513
+
514
+ results_dict, evaluators_ran = _run_evaluators_for_item(
515
+ turn.get("prompt", ""), actual_response, turn.get("expected_response", ""),
516
+ enhanced_response, resolved, model_config, has_azure_openai, args,
517
+ )
518
+
519
+ all_passed = _check_all_passed(results_dict)
520
+
521
+ evaluated_turn["results"] = results_dict
522
+ evaluated_turn["evaluators_ran"] = evaluators_ran
523
+ evaluated_turn["status"] = STATUS_PASS if all_passed else STATUS_FAIL
385
524
 
386
525
  if getattr(args, "effective_log_level", "info") == "debug":
387
526
  emit_structured_log(
388
527
  "debug",
389
- f"Evaluation completed for prompt='{evaluation_result['prompt']}'. "
528
+ f"Evaluation completed for turn {i + 1} prompt='{turn.get('prompt', '')}'. "
390
529
  f"Evaluators: {', '.join(evaluators_ran)}. "
391
- f"Scores: {evaluation_result['results']}",
530
+ f"Scores: {results_dict}",
392
531
  operation=Operation.EVALUATE,
393
532
  )
394
533
 
395
- evaluation_results.append(evaluation_result)
534
+ if all_passed:
535
+ turns_passed += 1
536
+ else:
537
+ turns_failed += 1
538
+
539
+ evaluated_turns.append(evaluated_turn)
540
+
541
+ turns_total = len(turns)
542
+ if turns_passed == turns_total:
543
+ overall_status = STATUS_PASS
544
+ elif turns_failed == turns_total:
545
+ overall_status = STATUS_FAIL
546
+ else:
547
+ overall_status = STATUS_PARTIAL
548
+
549
+ summary = {
550
+ "turns_total": turns_total,
551
+ "turns_passed": turns_passed,
552
+ "turns_failed": turns_failed,
553
+ "overall_status": overall_status,
554
+ }
555
+
556
+ return evaluated_turns, summary
557
+
558
+
559
+ def get_effective_worker_count(prompt_count: int, args) -> int:
560
+ """Compute safe worker count for prompt processing."""
561
+ if prompt_count <= 0:
562
+ return 1
563
+
564
+ requested = getattr(args, "concurrency", 5)
565
+ try:
566
+ requested_int = int(requested)
567
+ except (TypeError, ValueError):
568
+ requested_int = 5
569
+
570
+ bounded = max(1, min(requested_int, MAX_CONCURRENCY))
571
+ return min(bounded, prompt_count)
572
+
573
+
574
+ def run_pipeline(
575
+ pipeline: PipelineConfig,
576
+ eval_items: List[Dict],
577
+ args,
578
+ ) -> List[Dict[str, Any]]:
579
+ """Run the full evaluation pipeline: send prompts and evaluate responses in parallel.
580
+
581
+ Each worker processes one prompt end-to-end: send → evaluate.
582
+ Results are returned in original prompt order (FR-006).
583
+ """
584
+ # Validate all evaluator names upfront before dispatching workers
585
+ all_evaluator_maps = [pipeline.default_evaluators]
586
+ for eval_item in eval_items:
587
+ if "evaluators" in eval_item:
588
+ all_evaluator_maps.append(eval_item["evaluators"])
589
+ for turn in eval_item.get("turns", []):
590
+ if "evaluators" in turn:
591
+ all_evaluator_maps.append(turn["evaluators"])
592
+ for emap in all_evaluator_maps:
593
+ validate_evaluator_names(emap)
594
+
595
+ # Validate all items upfront and classify types before dispatching workers
596
+ item_types: List[ItemType] = []
597
+ for idx, eval_item in enumerate(eval_items):
598
+ try:
599
+ item_type = detect_item_type(eval_item)
600
+ except ValueError as e:
601
+ raise ValueError(f"Invalid evaluation item at index {idx}: {e}") from e
602
+ if item_type == ItemType.MULTI_TURN:
603
+ turn_count = len(eval_item["turns"])
604
+ if turn_count > MAX_TURNS_PER_THREAD:
605
+ raise ValueError(
606
+ f"Invalid evaluation item at index {idx}: 'turns' array has "
607
+ f"{turn_count} items (max {MAX_TURNS_PER_THREAD})"
608
+ )
609
+ item_types.append(item_type)
610
+
611
+ total = len(eval_items)
612
+ worker_count = get_effective_worker_count(total, args)
613
+
614
+ multi_turn_count = sum(1 for t in item_types if t == ItemType.MULTI_TURN)
615
+ single_turn_count = total - multi_turn_count
616
+
617
+ emit_structured_log(
618
+ "info",
619
+ f"Running pipeline with {worker_count} worker(s) for {total} item(s) "
620
+ f"({single_turn_count} single-turn, {multi_turn_count} multi-turn).",
621
+ operation=Operation.EVALUATE,
622
+ )
623
+
624
+ def _process_item(eval_item: Dict, index: int) -> Dict[str, Any]:
625
+ if item_types[index] == ItemType.MULTI_TURN:
626
+ return _process_multi_turn(eval_item, index)
627
+ return _process_single_turn(eval_item, index)
628
+
629
+ def _process_single_turn(eval_item: Dict, index: int) -> Dict[str, Any]:
630
+ prompt = eval_item.get("prompt", "")
631
+ emit_structured_log(
632
+ "info",
633
+ f"Processing item {index + 1}/{total} (single-turn).",
634
+ operation=Operation.SEND_PROMPT,
635
+ )
636
+
637
+ # Phase A: Send prompt to agent (with retry + throttle gate)
638
+ response = None
639
+ for attempt in range(1, MAX_ATTEMPTS + 1):
640
+ pipeline.chat_gate.wait_if_blocked()
641
+ try:
642
+ response, _ = pipeline.agent_client.send_prompt(prompt, agent_id=args.m365_agent_id)
643
+ break
644
+ except Exception as exc:
645
+ cause = exc.__cause__
646
+ status = int(getattr(cause, "code", 0) or 0) or None if cause else None
647
+ retry_after = get_retry_after_seconds(
648
+ cause.headers.get("Retry-After") if cause and getattr(cause, "headers", None) else None
649
+ )
650
+
651
+ if retry_after is not None and pipeline.is_retryable_status(status):
652
+ pipeline.chat_gate.apply_retry_after(retry_after)
653
+
654
+ if not pipeline.is_retryable_status(status) or attempt >= MAX_ATTEMPTS:
655
+ emit_structured_log(
656
+ "error",
657
+ f"Item {index + 1}/{total} failed after {attempt} attempt(s): {exc}",
658
+ operation=Operation.SEND_PROMPT,
659
+ )
660
+ return {
661
+ "prompt": prompt,
662
+ "response": "",
663
+ "expected_response": eval_item.get("expected_response", ""),
664
+ "evaluators_ran": [],
665
+ "results": {},
666
+ "status": STATUS_ERROR,
667
+ "errorDetails": str(exc),
668
+ }
669
+
670
+ delay = retry_after if retry_after is not None else pipeline.get_backoff_seconds(attempt)
671
+ time.sleep(delay)
672
+
673
+ # Phase B: Evaluate response
674
+ return _evaluate_single_response(
675
+ response, eval_item, args,
676
+ pipeline.model_config, pipeline.has_azure_openai,
677
+ pipeline.default_evaluators,
678
+ )
679
+
680
+ def _process_multi_turn(eval_item: Dict, index: int) -> Dict[str, Any]:
681
+ turns = eval_item["turns"]
682
+ thread_name = eval_item.get("name", "Unnamed thread")
683
+ emit_structured_log(
684
+ "info",
685
+ f"Processing item {index + 1}/{total} (multi-turn: '{thread_name}').",
686
+ operation=Operation.SEND_PROMPT,
687
+ )
688
+
689
+ if len(turns) > LONG_THREAD_WARNING_THRESHOLD:
690
+ emit_structured_log(
691
+ "warning",
692
+ f"Thread '{thread_name}' has {len(turns)} turns (>{LONG_THREAD_WARNING_THRESHOLD}). This may take a while.",
693
+ operation=Operation.SEND_PROMPT,
694
+ )
695
+
696
+ # Phase A: Send each turn with throttle gate + 429-only retry
697
+ # Multi-turn only retries on 429 (server confirmed it didn't process
698
+ # the request). Other transient errors (503, 504) are ambiguous about
699
+ # whether the server processed the turn, risking duplicate turns in
700
+ # the conversation if retried.
701
+ conversation_context = None
702
+ conversation_id = None
703
+ enriched_turns: List[Dict[str, Any]] = []
704
+ failed = False
705
+
706
+ for i, turn in enumerate(turns):
707
+ prompt = turn["prompt"]
708
+ emit_structured_log(
709
+ "debug",
710
+ f"Sending turn {i + 1}/{len(turns)} of '{thread_name}'.",
711
+ operation=Operation.SEND_PROMPT,
712
+ )
713
+
714
+ response = None
715
+ for attempt in range(1, MAX_ATTEMPTS + 1):
716
+ pipeline.chat_gate.wait_if_blocked()
717
+ try:
718
+ response, conversation_context = pipeline.agent_client.send_prompt(
719
+ prompt, agent_id=args.m365_agent_id,
720
+ conversation_context=conversation_context,
721
+ )
722
+ break
723
+ except Exception as exc:
724
+ cause = exc.__cause__
725
+ status = int(getattr(cause, "code", 0) or 0) or None if cause else None
726
+ retry_after = get_retry_after_seconds(
727
+ cause.headers.get("Retry-After") if cause and getattr(cause, "headers", None) else None
728
+ )
729
+
730
+ # Only retry on 429 — server confirmed it didn't process the request
731
+ if status == 429 and attempt < MAX_ATTEMPTS:
732
+ if retry_after is not None:
733
+ pipeline.chat_gate.apply_retry_after(retry_after)
734
+ delay = retry_after if retry_after is not None else pipeline.get_backoff_seconds(attempt)
735
+ time.sleep(delay)
736
+ continue
737
+
738
+ # All other errors: stop the thread
739
+ emit_structured_log(
740
+ "error",
741
+ f"Turn {i + 1}/{len(turns)} of '{thread_name}' failed after {attempt} attempt(s): {exc}",
742
+ operation=Operation.SEND_PROMPT,
743
+ )
744
+ failed = True
745
+ break
746
+
747
+ if failed:
748
+ # Mark this turn and all remaining turns as error
749
+ enriched_turns.append({
750
+ **turn,
751
+ "response": "",
752
+ "status": STATUS_ERROR,
753
+ "error": "Failed to get response from agent",
754
+ })
755
+ for j in range(i + 1, len(turns)):
756
+ enriched_turns.append({
757
+ **turns[j],
758
+ "response": "",
759
+ "status": STATUS_ERROR,
760
+ "error": "Skipped: preceding turn failed",
761
+ })
762
+ break
763
+
764
+ # Enrich turn with response
765
+ response_text = get_response_text_for_evaluation(response)
766
+ enriched_turns.append({
767
+ **turn,
768
+ "response": response.get("display_response_text", response_text),
769
+ "_enhanced_response": response,
770
+ })
771
+
772
+ # Capture conversation_id from first response
773
+ if conversation_id is None:
774
+ conversation_id = response.get("metadata", {}).get("conversation_id")
775
+
776
+ # Phase B: Run per-turn evaluations
777
+ evaluated_turns, summary = _evaluate_multi_turn_responses(
778
+ enriched_turns, args, pipeline.default_evaluators,
779
+ model_config=pipeline.model_config,
780
+ has_azure_openai=pipeline.has_azure_openai,
781
+ )
782
+
783
+ return {
784
+ "type": "multi_turn",
785
+ "name": eval_item.get("name", ""),
786
+ "description": eval_item.get("description", ""),
787
+ "conversation_id": conversation_id or "",
788
+ "turns": evaluated_turns,
789
+ "summary": summary,
790
+ }
791
+
792
+ execution_results = execute_in_parallel(
793
+ eval_items, _process_item, max_workers=worker_count,
794
+ )
795
+
796
+ # Unwrap WorkerResult objects into plain dicts, with error fallback
797
+ ordered_results: List[Dict[str, Any]] = []
798
+ for wr in execution_results:
799
+ if wr.error:
800
+ idx = wr.index
801
+ item = eval_items[idx]
802
+ if item_types[idx] == ItemType.MULTI_TURN:
803
+ ordered_results.append({
804
+ "type": "multi_turn",
805
+ "name": item.get("name", ""),
806
+ "turns": [
807
+ {**t, "status": STATUS_ERROR, "error": str(wr.error), "response": "", "results": {}}
808
+ for t in item.get("turns", [])
809
+ ],
810
+ "summary": {
811
+ "turns_total": len(item.get("turns", [])),
812
+ "turns_passed": 0,
813
+ "turns_failed": len(item.get("turns", [])),
814
+ "overall_status": STATUS_FAIL,
815
+ },
816
+ "error": str(wr.error),
817
+ })
818
+ else:
819
+ ordered_results.append({
820
+ "prompt": item.get("prompt", ""),
821
+ "response": "",
822
+ "expected_response": item.get("expected_response", ""),
823
+ "evaluators_ran": [],
824
+ "results": {},
825
+ "status": STATUS_ERROR,
826
+ "errorDetails": str(wr.error),
827
+ })
828
+ else:
829
+ ordered_results.append(wr.value)
830
+
831
+ return ordered_results
832
+
396
833
 
397
- return evaluation_results
398
834
 
399
835
  def write_results_to_console(results, agent_name: Optional[str] = None,
400
836
  agent_id: Optional[str] = None,
@@ -411,6 +847,35 @@ def write_results_to_console(results, agent_name: Optional[str] = None,
411
847
  RED = '\033[91m'
412
848
  RESET = '\033[0m'
413
849
 
850
def _print_evaluated_item(
    response: str,
    expected_response: str,
    evaluators_ran: List[str],
    item_results: Dict[str, Any],
    error: Optional[str] = None,
) -> None:
    """Print everything below the header line of one evaluated item.

    Shared by single-turn prompts and individual turns of a multi-turn
    thread. The caller is responsible for the "Prompt X" / "Turn X" header;
    this helper emits, in order and only when non-empty: the evaluator list,
    the response, the expected response and the error text, followed by one
    line per metric whose value is not None.

    Colour names (BOLD, CYAN, ...) and the metric constants are taken from
    the surrounding scope.
    """
    if evaluators_ran:
        print(f"{BOLD}{CYAN}Evaluators:{RESET} {', '.join(evaluators_ran)}")
    if response:
        print(f"{BOLD}{CYAN}Response:{RESET} {response}")
    if expected_response:
        print(f"{BOLD}{YELLOW}Expected Response:{RESET} {expected_response}")
    if error:
        print(f"{BOLD}{RED}Error:{RESET} {error}")

    # None marks an evaluator that produced no score; those are not shown.
    reported = (
        (metric, payload)
        for metric, payload in item_results.items()
        if payload is not None
    )
    for eval_name, v in reported:
        display_name = pascal_case_to_title(eval_name)
        # Relevance and Coherence get their own colours; the rest share BLUE.
        color = (
            MAGENTA if eval_name == RELEVANCE
            else ORANGE if eval_name == COHERENCE
            else BLUE
        )
        print(f"{BOLD}{color}{display_name}:{RESET} {json.dumps(v, indent=4)}")
878
+
414
879
  # Show metadata
415
880
  metadata_parts = []
416
881
  if agent_name:
@@ -423,17 +888,17 @@ def write_results_to_console(results, agent_name: Optional[str] = None,
423
888
  print(f"{BOLD}{CYAN}{' | '.join(metadata_parts)}{RESET}")
424
889
  print()
425
890
 
426
- # Show aggregate statistics if multiple results
427
- if len(results) > 1:
428
- aggregates = calculate_aggregate_statistics(results)
429
- if aggregates:
430
- print(f"{BOLD}{BLUE}Aggregate Statistics ({len(results)} prompts):{RESET}")
891
+ aggregates = calculate_aggregate_statistics(results)
892
+ if aggregates:
893
+ total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
894
+ if total_items > 1:
895
+ print(f"{BOLD}{BLUE}Aggregate Statistics ({total_items} prompts):{RESET}")
431
896
  print(f"{BLUE}{'=' * 60}{RESET}")
432
897
 
433
898
  for metric_name, stats in aggregates.items():
434
899
  pass_color = GREEN if stats['pass_rate'] >= 80 else YELLOW if stats['pass_rate'] >= 60 else RED
435
900
  prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
436
- total_prompts = stats.get('total_prompts', len(results))
901
+ total_prompts = stats.get('total_prompts', total_items)
437
902
  print(f"{BOLD}{CYAN}{metric_name}:{RESET} ({prompts_evaluated}/{total_prompts} prompts)")
438
903
  print(f" Pass Rate: {pass_color}{stats['pass_rate']:.1f}%{RESET} ({stats['pass_count']}/{stats['total_evaluated']} passed)")
439
904
  print(f" Avg Score: {MAGENTA}{stats['avg_score']:.2f}{RESET}")
@@ -447,30 +912,39 @@ def write_results_to_console(results, agent_name: Optional[str] = None,
447
912
  print(f"{BOLD}{BLUE}Individual Results:{RESET}")
448
913
  print(f"{BLUE}{'=' * 50}{RESET}")
449
914
  for i, result in enumerate(results, 1):
450
- print(f"{BOLD}{GREEN}Prompt {i}:{RESET} {result['prompt']}")
451
-
452
- # Show which evaluators ran for this prompt
453
- evaluators_ran = result.get('evaluators_ran', [])
454
- if evaluators_ran:
455
- print(f"{BOLD}{CYAN}Evaluators:{RESET} {', '.join(evaluators_ran)}")
456
-
457
- print(f"{BOLD}{CYAN}Response:{RESET} {result['response']}")
458
- print(f"{BOLD}{YELLOW}Expected Response:{RESET} {result['expected_response']}")
459
-
460
- # Print metric scores from results
461
- metrics = result.get('results', {})
462
- for eval_name, v in metrics.items():
463
- if v is None:
464
- continue # Skip null/N/A scores from skipped evaluators
465
- display_name = pascal_case_to_title(eval_name)
466
- if eval_name == RELEVANCE:
467
- color = MAGENTA
468
- elif eval_name == COHERENCE:
469
- color = ORANGE
470
- else:
471
- color = BLUE
472
- print(f"{BOLD}{color}{display_name}:{RESET} {v}")
473
- print(f"{BLUE}{'-' * 30}{RESET}")
915
+ if result.get("type") == "multi_turn":
916
+ thread_name = result.get("name", "Unnamed Thread")
917
+ summary = result.get("summary", {})
918
+ status = summary.get("overall_status", STATUS_UNKNOWN)
919
+ status_color = GREEN if status == STATUS_PASS else YELLOW if status == STATUS_PARTIAL else RED
920
+
921
+ print(f"{BOLD}{MAGENTA}Thread {i}: {thread_name}{RESET}")
922
+ for t_idx, turn in enumerate(result.get("turns", []), 1):
923
+ turn_status = turn.get("status", STATUS_UNKNOWN)
924
+ turn_color = GREEN if turn_status == STATUS_PASS else RED if turn_status in (STATUS_FAIL, STATUS_ERROR) else YELLOW
925
+ print(f"{BOLD}{turn_color}Turn {t_idx}:{RESET} [{turn_status}] {turn.get('prompt', '')}")
926
+ _print_evaluated_item(
927
+ response=turn.get("response", ""),
928
+ expected_response=turn.get("expected_response", ""),
929
+ evaluators_ran=turn.get("evaluators_ran", []),
930
+ item_results=turn.get("results", {}),
931
+ error=turn.get("error"),
932
+ )
933
+ print()
934
+ print(f"{BOLD}{MAGENTA}Thread {i} Summary:{RESET}")
935
+ print(f" Status: {status_color}{status.upper()}{RESET}")
936
+ print(f" Turns passed: {status_color}{summary.get('turns_passed', 0)}/{summary.get('turns_total', 0)}{RESET}")
937
+ print(f"{BLUE}{'-' * 30}{RESET}")
938
+ else:
939
+ print(f"{BOLD}{GREEN}Prompt {i}:{RESET} {result['prompt']}")
940
+ _print_evaluated_item(
941
+ response=result.get('response', ''),
942
+ expected_response=result.get('expected_response', ''),
943
+ evaluators_ran=result.get('evaluators_ran', []),
944
+ item_results=result.get('results', {}),
945
+ error=result.get('errorDetails'),
946
+ )
947
+ print(f"{BLUE}{'-' * 30}{RESET}")
474
948
 
475
949
  def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
476
950
  """Extract an EvalScore object from a decorated metric dict.
@@ -487,8 +961,8 @@ def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
487
961
  return None
488
962
 
489
963
  result = data.get("result")
490
- if result not in ("pass", "fail"):
491
- result = "pass" if score_val >= data.get("threshold", DEFAULT_THRESHOLD) else "fail"
964
+ if result not in (STATUS_PASS, STATUS_FAIL):
965
+ result = STATUS_PASS if score_val >= data.get("threshold", DEFAULT_THRESHOLD) else STATUS_FAIL
492
966
 
493
967
  eval_score: Dict[str, Any] = {
494
968
  "score": score_val,
@@ -501,55 +975,33 @@ def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
501
975
  return eval_score
502
976
 
503
977
 
504
- def convert_result_to_eval_item(result: Dict) -> Dict:
505
- """Convert an internal evaluation result dict to a schema-compliant EvalItem.
506
-
507
- Internal format (from run_evaluations):
508
- {prompt, response, expected_response, results: {Relevance: "JSON", ...},
509
- evaluators_ran: [...], evaluators: {...}, evaluators_mode: "..."}
510
- Schema EvalItem format:
511
- {prompt, response, expected_response, scores: {relevance: EvalScore, ...},
512
- evaluators: {...}, evaluators_mode: "..."}
513
- """
514
- item: Dict[str, Any] = {
515
- "prompt": result["prompt"],
516
- "response": result["response"],
517
- "expected_response": result["expected_response"],
518
- }
519
-
520
- # Preserve evaluator config in output
521
- if "evaluators" in result:
522
- item["evaluators"] = result["evaluators"]
523
- if "evaluators_mode" in result:
524
- item["evaluators_mode"] = result["evaluators_mode"]
978
+ def _convert_scores_to_schema(results_dict: Dict[str, Any]) -> Dict[str, Any]:
979
+ """Convert raw evaluator results to schema-compliant score objects.
525
980
 
981
+ Evaluator results in results_dict are dicts (from _decorate_metric) or
982
+ None when skipped/crashed. None values are omitted from output.
983
+ """
526
984
  scores: Dict[str, Any] = {}
527
- results_dict = result.get("results", {})
528
985
 
529
- # EvalScore metrics (all share the same schema shape: {score, result, threshold})
530
986
  for eval_key, schema_key in [
531
987
  (RELEVANCE, "relevance"),
532
988
  (COHERENCE, "coherence"),
533
989
  (GROUNDEDNESS, "groundedness"),
534
990
  (TOOL_CALL_ACCURACY, "toolCallAccuracy"),
535
991
  ]:
536
- raw = results_dict.get(eval_key)
537
- if not raw:
992
+ data = results_dict.get(eval_key)
993
+ if data is None:
538
994
  continue
539
- data = json.loads(raw) if isinstance(raw, str) else raw
540
995
  eval_score = extract_eval_score(data, METRIC_IDS[eval_key])
541
996
  if eval_score:
542
997
  scores[schema_key] = eval_score
543
998
 
544
- # Citations → CitationScore
545
- raw_citations = results_dict.get(CITATIONS)
546
- if raw_citations:
547
- data = json.loads(raw_citations) if isinstance(raw_citations, str) else raw_citations
999
+ data = results_dict.get(CITATIONS)
1000
+ if data is not None:
548
1001
  count = data.get("citations", 0)
549
1002
  cit_result = data.get("result")
550
- if cit_result not in ("pass", "fail"):
551
- cit_result = "pass" if count >= data.get("threshold", 1) else "fail"
552
-
1003
+ if cit_result not in (STATUS_PASS, STATUS_FAIL):
1004
+ cit_result = STATUS_PASS if count >= data.get("threshold", 1) else STATUS_FAIL
553
1005
  citation_score: Dict[str, Any] = {
554
1006
  "count": count,
555
1007
  "result": cit_result,
@@ -559,34 +1011,92 @@ def convert_result_to_eval_item(result: Dict) -> Dict:
559
1011
  citation_score["format"] = data["citation_format"]
560
1012
  scores["citations"] = citation_score
561
1013
 
562
- # ExactMatch → ExactMatchScore
563
- raw_exact = results_dict.get(EXACT_MATCH)
564
- if raw_exact:
565
- data = json.loads(raw_exact) if isinstance(raw_exact, str) else raw_exact
1014
+ data = results_dict.get(EXACT_MATCH)
1015
+ if data is not None:
566
1016
  is_match = data.get("exact_match", 0.0) == 1.0
567
1017
  scores["exactMatch"] = {
568
1018
  "match": is_match,
569
- "result": data.get("result", "pass" if is_match else "fail"),
1019
+ "result": data.get("result", STATUS_PASS if is_match else STATUS_FAIL),
570
1020
  "reason": data.get("exact_match_reason", ""),
571
1021
  }
572
1022
 
573
- # PartialMatch → PartialMatchScore
574
- raw_partial = results_dict.get(PARTIAL_MATCH)
575
- if raw_partial:
576
- data = json.loads(raw_partial) if isinstance(raw_partial, str) else raw_partial
1023
+ data = results_dict.get(PARTIAL_MATCH)
1024
+ if data is not None:
577
1025
  scores["partialMatch"] = {
578
1026
  "score": data.get("partial_match", 0.0),
579
- "result": data.get("result", "fail"),
1027
+ "result": data.get("result", STATUS_FAIL),
580
1028
  "threshold": data.get("threshold", 0.5),
581
1029
  "reason": data.get("partial_match_reason", ""),
582
1030
  }
583
1031
 
1032
+ return scores
1033
+
1034
+
1035
def convert_result_to_eval_item(result: Dict) -> Dict:
    """Build a schema-style EvalItem dict from an internal single-turn result.

    "prompt", "response" and "expected_response" are required and copied
    as-is (a missing one raises KeyError). "evaluators" and
    "evaluators_mode" are carried over only when present. The raw
    "results" mapping is run through _convert_scores_to_schema and stored
    under "scores" when that conversion yields anything.
    """
    eval_item: Dict[str, Any] = {
        field: result[field]
        for field in ("prompt", "response", "expected_response")
    }

    # Evaluator configuration is optional; keep it only if the result has it.
    for optional_field in ("evaluators", "evaluators_mode"):
        if optional_field in result:
            eval_item[optional_field] = result[optional_field]

    schema_scores = _convert_scores_to_schema(result.get("results", {}))
    if schema_scores:
        eval_item["scores"] = schema_scores

    return eval_item
588
1053
 
589
1054
 
1055
def convert_thread_result_to_output(thread_result: Dict) -> Dict:
    """Build the output dict for one multi-turn thread result.

    Each turn becomes a dict that always has "prompt" (defaulting to "")
    and then, when present on the turn, "expected_response", "response",
    "status", "error", "evaluators" and "evaluators_mode". The turn's raw
    "results" are converted with _convert_scores_to_schema and stored under
    "scores" when non-empty.

    At thread level, "name", "description", "conversation_id" and "summary"
    are emitted only when truthy; "turns" is always emitted.
    """
    passthrough_keys = (
        "expected_response",
        "response",
        "status",
        "error",
        "evaluators",
        "evaluators_mode",
    )

    converted_turns = []
    for turn in thread_result.get("turns", []):
        entry: Dict[str, Any] = {"prompt": turn.get("prompt", "")}
        entry.update((key, turn[key]) for key in passthrough_keys if key in turn)

        turn_scores = _convert_scores_to_schema(turn.get("results", {}))
        if turn_scores:
            entry["scores"] = turn_scores

        converted_turns.append(entry)

    thread_output: Dict[str, Any] = {}
    # Empty strings / missing values are dropped rather than written out.
    for key in ("name", "description", "conversation_id"):
        if thread_result.get(key):
            thread_output[key] = thread_result[key]
    thread_output["turns"] = converted_turns
    if thread_result.get("summary"):
        thread_output["summary"] = thread_result["summary"]

    return thread_output
1091
+
1092
+
1093
def convert_result_to_output_item(result: Dict) -> Dict:
    """Convert one internal result to its output form, dispatching on "type".

    Results whose "type" is "multi_turn" go through
    convert_thread_result_to_output; everything else is treated as a
    single-turn result and goes through convert_result_to_eval_item.
    """
    is_thread = result.get("type") == "multi_turn"
    converter = convert_thread_result_to_output if is_thread else convert_result_to_eval_item
    return converter(result)
1098
+
1099
+
590
1100
  def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None,
591
1101
  default_evaluators: Optional[Dict[str, Any]] = None,
592
1102
  agent_name: Optional[str] = None,
@@ -602,7 +1112,7 @@ def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optio
602
1112
  except Exception:
603
1113
  current_version = "1.0.0"
604
1114
 
605
- items = [convert_result_to_eval_item(r) for r in results]
1115
+ items = [convert_result_to_output_item(r) for r in results]
606
1116
 
607
1117
  metadata: Dict[str, Any] = {
608
1118
  "evaluatedAt": datetime.now(timezone.utc).isoformat(),
@@ -631,6 +1141,18 @@ def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optio
631
1141
  emit_structured_log("error", f"Error writing to JSON file: {e}", operation=Operation.WRITE_OUTPUT)
632
1142
  sys.exit(1)
633
1143
 
1144
+ def _results_to_csv_json(results_dict: Dict) -> str:
1145
+ """Serialize evaluator results dict to a CSV-safe JSON string.
1146
+
1147
+ Skips None (crashed/skipped evaluators). Results are dicts produced
1148
+ by _decorate_metric.
1149
+ """
1150
+ if not results_dict:
1151
+ return ""
1152
+ non_null = {k: v for k, v in results_dict.items() if v is not None}
1153
+ return json.dumps(non_null) if non_null else ""
1154
+
1155
+
634
1156
  def write_results_to_csv(results: List[Dict], output_file: str,
635
1157
  agent_name: Optional[str] = None, agent_id: Optional[str] = None,
636
1158
  cli_version: Optional[str] = None):
@@ -638,7 +1160,6 @@ def write_results_to_csv(results: List[Dict], output_file: str,
638
1160
  try:
639
1161
  with open(output_file, 'w', newline='', encoding='utf-8') as f:
640
1162
  if results:
641
- # Write metadata header
642
1163
  metadata_parts = []
643
1164
  if agent_name:
644
1165
  metadata_parts.append(f"Agent Name: {agent_name}")
@@ -649,30 +1170,87 @@ def write_results_to_csv(results: List[Dict], output_file: str,
649
1170
  if metadata_parts:
650
1171
  f.write(f"# {' | '.join(metadata_parts)}\n")
651
1172
 
652
- # Write aggregate statistics first if multiple results
653
- if len(results) > 1:
654
- aggregates = calculate_aggregate_statistics(results)
655
- if aggregates:
1173
+ aggregates = calculate_aggregate_statistics(results)
1174
+ if aggregates:
1175
+ total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
1176
+ if total_items > 1:
656
1177
  f.write("# AGGREGATE STATISTICS\n")
657
1178
  f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
658
1179
  for metric_name, stats in aggregates.items():
659
1180
  threshold_str = str(stats.get('threshold', 'N/A'))
660
1181
  prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
661
- total_prompts = stats.get('total_prompts', len(results))
1182
+ total_prompts = stats.get('total_prompts', total_items)
662
1183
  f.write(f"{metric_name},{prompts_evaluated},{total_prompts},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
663
1184
  f.write("\n# INDIVIDUAL RESULTS\n")
664
1185
 
665
- # Write individual results (exclude internal fields)
666
- exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode'}
667
- fieldnames = [k for k in results[0].keys() if k not in exclude_keys]
668
- writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
669
- writer.writeheader()
670
- writer.writerows(results)
1186
+ single_turn_rows = []
1187
+ multi_turn_rows = []
1188
+ for result in results:
1189
+ if result.get("type") == "multi_turn":
1190
+ thread_name = result.get("name", "")
1191
+ for turn_idx, turn in enumerate(result.get("turns", [])):
1192
+ multi_turn_rows.append({
1193
+ "thread_name": thread_name,
1194
+ "turn_index": turn_idx + 1,
1195
+ "prompt": turn.get("prompt", ""),
1196
+ "response": turn.get("response", ""),
1197
+ "expected_response": turn.get("expected_response", ""),
1198
+ "status": turn.get("status", ""),
1199
+ "error": turn.get("error", ""),
1200
+ "scores": _results_to_csv_json(turn.get("results", {})),
1201
+ })
1202
+ summary = result.get("summary", {})
1203
+ multi_turn_rows.append({
1204
+ "thread_name": thread_name,
1205
+ "turn_index": "summary",
1206
+ "prompt": "",
1207
+ "response": "",
1208
+ "expected_response": "",
1209
+ "status": summary.get("overall_status", ""),
1210
+ "scores": f"{summary.get('turns_passed', 0)}/{summary.get('turns_total', 0)} turns passed",
1211
+ })
1212
+ else:
1213
+ exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode', '_enhanced_response', 'results'}
1214
+ row = {k: v for k, v in result.items() if k not in exclude_keys}
1215
+ if "results" in result:
1216
+ row["scores"] = _results_to_csv_json(result["results"])
1217
+ single_turn_rows.append(row)
1218
+
1219
+ if single_turn_rows:
1220
+ if multi_turn_rows:
1221
+ f.write("# SINGLE-TURN RESULTS\n")
1222
+ fieldnames = list(single_turn_rows[0].keys())
1223
+ for row in single_turn_rows:
1224
+ for k in row:
1225
+ if k not in fieldnames:
1226
+ fieldnames.append(k)
1227
+ writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
1228
+ writer.writeheader()
1229
+ writer.writerows(single_turn_rows)
1230
+
1231
+ if multi_turn_rows:
1232
+ if single_turn_rows:
1233
+ f.write("\n")
1234
+ f.write("# MULTI-TURN RESULTS\n")
1235
+ fieldnames = ["thread_name", "turn_index", "prompt", "response", "expected_response", "status", "error", "scores"]
1236
+ writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
1237
+ writer.writeheader()
1238
+ writer.writerows(multi_turn_rows)
671
1239
  emit_structured_log("info", f"Results saved to {output_file}", operation=Operation.WRITE_OUTPUT)
672
1240
  except Exception as e:
673
1241
  emit_structured_log("error", f"Error writing to CSV file: {e}", operation=Operation.WRITE_OUTPUT)
674
1242
  sys.exit(1)
675
1243
 
1244
def normalize_agent_id(agent_id):
    """Return agent_id with '.declarativeAgent' appended when it has no dot.

    None, empty, and already-dotted values are returned exactly as given.
    """
    needs_suffix = bool(agent_id) and '.' not in agent_id
    return f"{agent_id}.declarativeAgent" if needs_suffix else agent_id
1252
+
1253
+
676
1254
  def parse_arguments():
677
1255
  """Parse command line arguments."""
678
1256
  parser = argparse.ArgumentParser(
@@ -763,8 +1341,29 @@ Examples:
763
1341
  action='store_true',
764
1342
  help='Sign out and clear cached authentication tokens'
765
1343
  )
1344
+
1345
+ parser.add_argument(
1346
+ '--concurrency',
1347
+ type=int,
1348
+ default=5,
1349
+ help='Number of parallel workers for prompt processing (1-5, default: 5)'
1350
+ )
766
1351
 
767
- return parser.parse_args()
1352
+ args = parser.parse_args()
1353
+
1354
+ args.m365_agent_id = normalize_agent_id(args.m365_agent_id)
1355
+
1356
+ if args.concurrency < 1:
1357
+ parser.error('--concurrency must be an integer >= 1.')
1358
+ if args.concurrency > MAX_CONCURRENCY:
1359
+ emit_structured_log(
1360
+ "warning",
1361
+ f"--concurrency {args.concurrency} exceeds max {MAX_CONCURRENCY}; clamping to {MAX_CONCURRENCY}.",
1362
+ operation=Operation.SETUP,
1363
+ )
1364
+ args.concurrency = MAX_CONCURRENCY
1365
+
1366
+ return args
768
1367
 
769
1368
  def validate_environment() -> CallPath:
770
1369
  """Validate required environment variables."""
@@ -773,19 +1372,29 @@ def validate_environment() -> CallPath:
773
1372
  "AZURE_AI_API_KEY",
774
1373
  "AZURE_AI_API_VERSION",
775
1374
  "AZURE_AI_MODEL_NAME",
776
- # Chat API specific
777
- "COPILOT_API_ENDPOINT",
778
- "X_SCENARIO_HEADER"
779
1375
  ]
780
1376
 
781
1377
  if os.environ.get("COPILOT_API_ACCESS_TOKEN"):
782
1378
  call_path = CallPath.ACCESS_TOKEN
783
- required_env_vars.append("COPILOT_API_ACCESS_TOKEN")
1379
+ required_env_vars.extend([
1380
+ "COPILOT_API_ACCESS_TOKEN",
1381
+ "COPILOT_API_ENDPOINT",
1382
+ "X_SCENARIO_HEADER",
1383
+ ])
1384
+ elif os.environ.get("WORK_IQ_A2A_ENDPOINT"):
1385
+ call_path = CallPath.A2A
1386
+ required_env_vars.extend([
1387
+ "WORK_IQ_A2A_ENDPOINT",
1388
+ "WORK_IQ_A2A_CLIENT_ID",
1389
+ "TENANT_ID",
1390
+ ])
784
1391
  else:
785
1392
  call_path = CallPath.COPILOT_AUTH
786
1393
  required_env_vars.extend([
1394
+ "COPILOT_API_ENDPOINT",
1395
+ "X_SCENARIO_HEADER",
787
1396
  "M365_EVAL_CLIENT_ID",
788
- "TENANT_ID"
1397
+ "TENANT_ID",
789
1398
  ])
790
1399
 
791
1400
  missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
@@ -867,46 +1476,6 @@ def get_prompt_datasets(args) -> Tuple[List[Dict], Optional[Dict]]:
867
1476
  ]
868
1477
  return eval_items, None
869
1478
 
870
- def fetch_available_agents(copilot_api_endpoint: str, access_token: str, user_oid: str) -> List[Dict[str, Any]]:
871
- """
872
- Fetch available agents for the user from the Copilot API.
873
-
874
- Args:
875
- access_token: Bearer token for API authentication
876
- user_oid: User object ID for agent filtering
877
-
878
- Returns:
879
- List of agent dictionaries.
880
- """
881
- request_headers = {
882
- "Content-Type": "application/json",
883
- "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
884
- "Authorization": f"Bearer {access_token}"
885
- }
886
-
887
- try:
888
- # Build the query parameter with participant info
889
- request_data = json.dumps({"participant": {"id": user_oid}})
890
- query_param = urllib.parse.quote(request_data)
891
-
892
- # Try to fetch agents from /GetGptList endpoint
893
- req = urllib.request.Request(
894
- f"{copilot_api_endpoint}/GetGptList?request={query_param}",
895
- headers=request_headers,
896
- method="GET"
897
- )
898
- with urllib.request.urlopen(req, timeout=120) as resp:
899
- data = json.loads(resp.read().decode("utf-8"))
900
- agents = data.get("gptList", [])
901
- return agents
902
- except urllib.error.HTTPError as e:
903
- # If endpoint doesn't exist or returns error, return empty list
904
- emit_structured_log("warning", f"Unable to fetch agents list (HTTP {e.code}).", operation=Operation.FETCH_AGENTS)
905
- return []
906
- except Exception as e:
907
- emit_structured_log("warning", f"Error fetching agents: {e}", operation=Operation.FETCH_AGENTS)
908
- return []
909
-
910
1479
  def select_agent_interactively(agents: List[Dict[str, Any]]) -> Tuple[Optional[str], Optional[str]]:
911
1480
  """
912
1481
  Display an interactive agent selector using questionary.
@@ -946,127 +1515,6 @@ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Tuple[Optional[s
946
1515
 
947
1516
  return selected_agent, id_to_name.get(selected_agent) if selected_agent else None
948
1517
 
949
- @functools.lru_cache(maxsize=1)
950
- def _get_iana_timezone_name() -> str:
951
- """Get the IANA timezone name from the system using tzlocal.
952
-
953
- Tries get_localzone_name() first; falls back to str(get_localzone()) when the
954
- former raises (e.g. no zone configured on some Unix systems). Result is cached
955
- after the first call so tzlocal is only invoked once per session.
956
- """
957
- try:
958
- return tzlocal.get_localzone_name()
959
- except Exception:
960
- return str(tzlocal.get_localzone())
961
-
962
-
963
- @functools.lru_cache(maxsize=1)
964
- def _get_location_info() -> Dict[str, Any]:
965
- """Return a locationInfo dict containing the local UTC offset and IANA timezone name.
966
-
967
- Result is cached after the first call so the computation runs only once per session.
968
- """
969
- now = datetime.now().astimezone()
970
- utc_offset = now.utcoffset()
971
- offset_hours = int(utc_offset.total_seconds() // 3600) if utc_offset is not None else 0
972
- return {
973
- "timeZoneOffset": offset_hours,
974
- "timeZone": _get_iana_timezone_name(),
975
- }
976
-
977
-
978
- def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:
979
- message = {
980
- "message": {
981
- "text": prompt,
982
- "author": "user",
983
- "messageType": "chat",
984
- "timestamp": datetime.now(timezone.utc).isoformat(),
985
- "locationInfo": _get_location_info(),
986
- "from": {
987
- "id": user_oid,
988
- }
989
- },
990
- "verbosity": "verbose", # To enable detailed telemetry in response (to extract tool usage, etc.)
991
- }
992
-
993
- if agent_id:
994
- message["gpts"] = [
995
- {
996
- "id": agent_id.strip(),
997
- "source": "MOS3"
998
- }
999
- ]
1000
- message["optionsSets"] = [
1001
- "disable_action_confirmation" # Disable 3P action confirmation prompts for agents while scraping
1002
- ]
1003
-
1004
- return json.dumps(message).encode("utf-8")
1005
-
1006
- def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str, access_token: str, user_oid: str, args) -> List[Dict[str, Any]]:
1007
- """ Send prompts to the chat API and return enhanced responses. """
1008
-
1009
- request_headers = {
1010
- "Content-Type": "application/json",
1011
- "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
1012
- "Authorization": f"Bearer {access_token}"
1013
- }
1014
- raw_responses: List[Tuple[str, str]] = []
1015
- for i, prompt in enumerate(prompts, 1):
1016
- if getattr(args, "effective_log_level", "info") in ("info", "debug"):
1017
- emit_structured_log("info", f"Processing prompt {i}/{len(prompts)}.", operation=Operation.SEND_PROMPT)
1018
-
1019
- # Build the payload
1020
- payload = build_chat_payload(prompt, user_oid, args.m365_agent_id)
1021
- if getattr(args, "effective_log_level", "info") == "debug":
1022
- emit_structured_log("debug", f"[Sydney] Sending payload: {payload.decode('utf-8')}", operation=Operation.SEND_PROMPT)
1023
-
1024
- # Send the request to /chat
1025
- req = urllib.request.Request(f"{copilot_api_endpoint}/chat", data=payload, headers=request_headers, method="POST")
1026
- try:
1027
- with urllib.request.urlopen(req, timeout=120) as resp:
1028
- raw = resp.read().decode("utf-8", errors="replace")
1029
- except urllib.error.HTTPError as e:
1030
- error_body = None
1031
- try:
1032
- error_body = e.read().decode("utf-8", errors="replace")
1033
- except Exception:
1034
- pass
1035
- msg = f"Chat API request failed (HTTP {e.code} {e.reason})."
1036
- if error_body:
1037
- msg += f" Body: {error_body[:500]}"
1038
- raise RuntimeError(msg) from e
1039
- except urllib.error.URLError as e:
1040
- raise RuntimeError(f"Chat API connection error: {getattr(e, 'reason', str(e))}") from e
1041
-
1042
- if getattr(args, "effective_log_level", "info") == "debug":
1043
- emit_structured_log("debug", f"[Sydney] Raw response: {raw}", operation=Operation.SEND_PROMPT)
1044
-
1045
- # Store raw response for enhancement
1046
- raw_responses.append((prompt, raw.strip()))
1047
-
1048
- # Extract enhanced responses using the new extractor
1049
- enhanced_responses = extract_enhanced_responses(raw_responses, log_level=getattr(args, "effective_log_level", "info"))
1050
-
1051
- if getattr(args, "effective_log_level", "info") == "debug":
1052
- for idx, enhanced in enumerate(enhanced_responses, 1):
1053
- metadata = enhanced.get("metadata", {})
1054
- context = {
1055
- "request-id": metadata.get("request_id"),
1056
- "conversation-id": metadata.get("conversation_id"),
1057
- "message-id": metadata.get("message_id"),
1058
- "operation": Operation.SEND_PROMPT,
1059
- }
1060
- entry = format_structured_log_entry(
1061
- level="debug",
1062
- message=f"Response IDs for prompt {idx}/{len(enhanced_responses)}.",
1063
- logger_name=CLI_LOGGER_NAME,
1064
- run_context=context,
1065
- )
1066
- DIAGNOSTIC_RECORDS.append(entry)
1067
- CLI_LOGGER.log(logging.DEBUG, render_diagnostic(entry))
1068
-
1069
- return enhanced_responses
1070
1518
 
1071
1519
  def output_results(results: List[Dict], args, default_evaluators: Optional[Dict[str, Any]] = None,
1072
1520
  agent_name: Optional[str] = None, cli_version: Optional[str] = None):
@@ -1119,76 +1567,142 @@ def main():
1119
1567
 
1120
1568
  # Validate environment variables required for evaluation
1121
1569
  call_path = validate_environment()
1122
- copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
1123
- validate_endpoint_url(copilot_api_endpoint, ALLOWED_ENDPOINTS)
1124
1570
 
1125
1571
  user_oid = ""
1126
1572
 
1127
- if call_path == CallPath.ACCESS_TOKEN:
1128
- access_token = os.environ["COPILOT_API_ACCESS_TOKEN"]
1129
- else:
1130
- scopes_str = os.environ.get(
1131
- "COPILOT_SCOPES", "https://substrate.office.com/sydney/.default"
1132
- )
1133
-
1134
- auth_handler = AuthHandler(
1135
- client_id=os.environ["M365_EVAL_CLIENT_ID"],
1136
- tenant_id=os.environ["TENANT_ID"],
1137
- scopes_str=scopes_str
1138
- )
1139
-
1140
- # Signout user
1141
- if args.signout:
1142
- try:
1143
- auth_handler.clear_cache()
1144
- except Exception as e:
1145
- emit_structured_log("error", f"Error during signout: {e}", operation=Operation.AUTHENTICATE)
1146
- sys.exit(1)
1147
- sys.exit(0)
1148
-
1149
- # Authenticate before loading prompts
1150
- try:
1151
- auth_result = auth_handler.acquire_token_interactive() or {}
1152
- access_token = auth_result.get("access_token") or ""
1153
- if not access_token:
1154
- raise RuntimeError("Failed to acquire access token from authentication result")
1155
-
1156
- id_token_claims = auth_result.get("id_token_claims")
1157
- if not isinstance(id_token_claims, dict):
1573
+ match call_path:
1574
+ case CallPath.ACCESS_TOKEN:
1575
+ access_token = os.environ["COPILOT_API_ACCESS_TOKEN"]
1576
+ user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)
1577
+ copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
1578
+ validate_endpoint_url(copilot_api_endpoint, ALLOWED_ENDPOINTS)
1579
+ agent_client = SydneyClient(
1580
+ copilot_api_endpoint=copilot_api_endpoint,
1581
+ access_token=access_token,
1582
+ user_oid=user_oid,
1583
+ logger=CLI_LOGGER,
1584
+ diagnostic_records=DIAGNOSTIC_RECORDS,
1585
+ )
1586
+
1587
+ case CallPath.A2A:
1158
1588
  emit_structured_log(
1159
- "warning", "id_token_claims is missing or invalid in authentication result",
1160
- operation=Operation.AUTHENTICATE,
1589
+ "warning",
1590
+ "The A2A endpoint is experimental and may change without notice.",
1591
+ operation=Operation.SETUP,
1161
1592
  )
1162
- else:
1163
- user_oid = id_token_claims.get("oid") or ""
1164
-
1165
- except Exception as e:
1166
- emit_structured_log("error", f"Error during authentication: {e}", operation=Operation.AUTHENTICATE)
1167
- if effective_log_level == "debug":
1168
- import traceback
1169
- traceback.print_exc()
1170
- sys.exit(1)
1593
+ a2a_endpoint = os.environ["WORK_IQ_A2A_ENDPOINT"]
1594
+ validate_endpoint_url(a2a_endpoint, ALLOWED_ENDPOINTS)
1595
+
1596
+ a2a_scopes_str = os.environ.get("WORK_IQ_A2A_SCOPES", "")
1597
+ a2a_auth_handler = AuthHandler(
1598
+ client_id=os.environ["WORK_IQ_A2A_CLIENT_ID"],
1599
+ tenant_id=os.environ["TENANT_ID"],
1600
+ scopes_str=a2a_scopes_str,
1601
+ )
1602
+ try:
1603
+ a2a_auth_result = a2a_auth_handler.acquire_token_interactive() or {}
1604
+ a2a_access_token = a2a_auth_result.get("access_token") or ""
1605
+ if not a2a_access_token:
1606
+ raise RuntimeError("Failed to acquire A2A access token")
1607
+ except Exception as e:
1608
+ emit_structured_log(
1609
+ "error",
1610
+ f"Error during A2A authentication: {e}",
1611
+ operation=Operation.AUTHENTICATE,
1612
+ )
1613
+ if effective_log_level == "debug":
1614
+ import traceback
1615
+ traceback.print_exc()
1616
+ sys.exit(1)
1617
+ try:
1618
+ agent_client = A2AClient(
1619
+ a2a_endpoint=a2a_endpoint,
1620
+ access_token=a2a_access_token,
1621
+ logger=CLI_LOGGER,
1622
+ diagnostic_records=DIAGNOSTIC_RECORDS,
1623
+ )
1624
+ except Exception as e:
1625
+ emit_structured_log(
1626
+ "error",
1627
+ f"Failed to initialize A2A client: {e}",
1628
+ operation=Operation.SETUP,
1629
+ )
1630
+ sys.exit(1)
1171
1631
 
1172
- if not user_oid and access_token:
1173
- # Fallback: extract from access token.
1174
- user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)
1632
+ case CallPath.COPILOT_AUTH:
1633
+ copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
1634
+ validate_endpoint_url(copilot_api_endpoint, ALLOWED_ENDPOINTS)
1635
+ auth_handler = AuthHandler(
1636
+ client_id=os.environ["M365_EVAL_CLIENT_ID"],
1637
+ tenant_id=os.environ["TENANT_ID"],
1638
+ scopes_str=os.environ.get("COPILOT_SCOPES", ""),
1639
+ )
1640
+
1641
+ # Signout user
1642
+ if args.signout:
1643
+ try:
1644
+ auth_handler.clear_cache()
1645
+ except Exception as e:
1646
+ emit_structured_log("error", f"Error during signout: {e}", operation=Operation.AUTHENTICATE)
1647
+ sys.exit(1)
1648
+ sys.exit(0)
1649
+
1650
+ try:
1651
+ auth_result = auth_handler.acquire_token_interactive() or {}
1652
+ access_token = auth_result.get("access_token") or ""
1653
+ if not access_token:
1654
+ raise RuntimeError("Failed to acquire access token from authentication result")
1655
+
1656
+ id_token_claims = auth_result.get("id_token_claims")
1657
+ if not isinstance(id_token_claims, dict):
1658
+ emit_structured_log(
1659
+ "warning", "id_token_claims is missing or invalid in authentication result",
1660
+ operation=Operation.AUTHENTICATE,
1661
+ )
1662
+ else:
1663
+ user_oid = id_token_claims.get("oid") or ""
1664
+
1665
+ if not user_oid:
1666
+ user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)
1667
+
1668
+ except Exception as e:
1669
+ emit_structured_log("error", f"Error during authentication: {e}", operation=Operation.AUTHENTICATE)
1670
+ if effective_log_level == "debug":
1671
+ import traceback
1672
+ traceback.print_exc()
1673
+ sys.exit(1)
1674
+
1675
+ agent_client = SydneyClient(
1676
+ copilot_api_endpoint=copilot_api_endpoint,
1677
+ access_token=access_token,
1678
+ user_oid=user_oid,
1679
+ logger=CLI_LOGGER,
1680
+ diagnostic_records=DIAGNOSTIC_RECORDS,
1681
+ )
1175
1682
 
1176
1683
  # 1. Load evaluation datasets
1177
1684
  eval_items, file_default_evaluators = get_prompt_datasets(args)
1178
1685
  default_evaluators = resolve_default_evaluators(file_default_evaluators)
1179
- prompts = [eval_item.get("prompt", "") for eval_item in eval_items]
1180
1686
 
1181
1687
  if effective_log_level in ("info", "debug"):
1182
- emit_structured_log("info", f"Running evaluation on {len(prompts)} prompt(s).", operation=Operation.SETUP)
1688
+ multi_turn_count = sum(1 for item in eval_items if "turns" in item)
1689
+ single_turn_count = len(eval_items) - multi_turn_count
1690
+ emit_structured_log(
1691
+ "info",
1692
+ f"Running evaluation on {len(eval_items)} item(s) "
1693
+ f"({single_turn_count} single-turn, {multi_turn_count} multi-turn).",
1694
+ operation=Operation.SETUP,
1695
+ )
1183
1696
 
1184
1697
  agent_name = None
1185
1698
  try:
1186
- # 3. Agent selection - if no agent ID provided, prompt user to select
1699
+ # 2. Agent selection - when no agent ID is provided, discover agents
1700
+ # via the active client (A2A or REST) and prompt interactively.
1187
1701
  if not args.m365_agent_id:
1188
1702
  if effective_log_level in ("info", "debug"):
1189
1703
  emit_structured_log("info", "No agent ID provided. Fetching available agents.", operation=Operation.FETCH_AGENTS)
1190
-
1191
- available_agents = fetch_available_agents(copilot_api_endpoint, access_token, user_oid)
1704
+
1705
+ available_agents = agent_client.fetch_available_agents()
1192
1706
  if not available_agents:
1193
1707
  emit_structured_log(
1194
1708
  "error",
@@ -1210,30 +1724,46 @@ def main():
1210
1724
  operation=Operation.FETCH_AGENTS,
1211
1725
  )
1212
1726
  sys.exit(1)
1213
-
1214
- # 4. Send prompts to chat API
1215
- responses = send_prompt_to_agent_in_sydney(prompts, copilot_api_endpoint, access_token, user_oid, args)
1216
1727
  except Exception as e:
1217
- emit_structured_log("error", f"Error sending prompts to chat API: {e}", operation=Operation.SEND_PROMPT)
1728
+ emit_structured_log("error", f"Error during agent discovery: {e}", operation=Operation.FETCH_AGENTS)
1218
1729
  if effective_log_level == "debug":
1219
1730
  import traceback
1220
1731
  traceback.print_exc()
1221
1732
  sys.exit(1)
1222
1733
 
1734
+ # Pre-resolve agent endpoint (A2A agent card lookup; no-op for REST)
1735
+ if args.m365_agent_id:
1736
+ agent_client.resolve_agent(args.m365_agent_id)
1737
+
1738
+ # 3. Build pipeline config and run evaluation pipeline
1739
+ model_config = AzureOpenAIModelConfiguration(
1740
+ azure_endpoint=os.environ.get("AZURE_AI_OPENAI_ENDPOINT"),
1741
+ api_key=os.environ.get("AZURE_AI_API_KEY"),
1742
+ api_version=os.environ.get("AZURE_AI_API_VERSION"),
1743
+ azure_deployment=os.environ.get("AZURE_AI_MODEL_NAME"),
1744
+ )
1745
+ has_azure_openai = bool(
1746
+ os.environ.get("AZURE_AI_OPENAI_ENDPOINT")
1747
+ and os.environ.get("AZURE_AI_API_KEY")
1748
+ )
1749
+
1750
+ pipeline = PipelineConfig(
1751
+ agent_client=agent_client,
1752
+ model_config=model_config,
1753
+ has_azure_openai=has_azure_openai,
1754
+ default_evaluators=default_evaluators,
1755
+ )
1756
+
1757
+ results = run_pipeline(pipeline, eval_items, args)
1223
1758
 
1224
- # 5. Run evaluations
1225
- if effective_log_level in ("info", "debug"):
1226
- emit_structured_log("info", "Running evaluations.", operation=Operation.EVALUATE)
1227
- results = run_evaluations(args, responses, eval_items, default_evaluators)
1228
-
1229
- # 6. Output results
1759
+ # 4. Output results
1230
1760
  output_results(results, args, default_evaluators=default_evaluators,
1231
1761
  agent_name=agent_name, cli_version=str(cli_version) if cli_version else None)
1232
1762
 
1233
1763
  if effective_log_level in ("info", "debug"):
1234
1764
  emit_structured_log(
1235
1765
  "info",
1236
- f"Evaluation completed successfully. Processed {len(prompts)} prompt(s).",
1766
+ f"Evaluation completed successfully. Processed {len(eval_items)} item(s).",
1237
1767
  operation=Operation.EVALUATE,
1238
1768
  )
1239
1769