@microsoft/m365-copilot-eval 1.4.0-preview.1 → 1.5.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,653 @@
+"""Core evaluation pipeline — evaluator dispatch, retry, parallel execution."""
+
+import json
+import time
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any, Dict, List, Optional, Tuple
+
+from azure.ai.evaluation import (
+    AzureOpenAIModelConfiguration,
+    RelevanceEvaluator,
+    CoherenceEvaluator,
+    GroundednessEvaluator,
+    SimilarityEvaluator,
+    ToolCallAccuracyEvaluator,
+)
+
+from api_clients.base_agent_client import BaseAgentClient
+from cli_logging.cli_logger import emit_structured_log
+from cli_logging.logging_utils import Operation
+from common import (
+    RELEVANCE,
+    COHERENCE,
+    GROUNDEDNESS,
+    SIMILARITY,
+    TOOL_CALL_ACCURACY,
+    CITATIONS,
+    EXACT_MATCH,
+    PARTIAL_MATCH,
+    REQUIRES_AZURE_OPENAI,
+    REQUIRES_TOOL_DEFINITIONS,
+    METRIC_IDS,
+    MAX_ATTEMPTS,
+    MAX_CONCURRENCY,
+    DEFAULT_PASS_THRESHOLD,
+    STATUS_PASS,
+    STATUS_FAIL,
+    STATUS_ERROR,
+    STATUS_PARTIAL,
+    STATUS_UNKNOWN,
+    MAX_TURNS_PER_THREAD,
+    LONG_THREAD_WARNING_THRESHOLD,
+    RunConfig,
+)
+from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat
+from custom_evaluators.ExactMatchEvaluator import ExactMatchEvaluator
+from custom_evaluators.PartialMatchEvaluator import PartialMatchEvaluator
+from evaluator_resolver import (
+    validate_evaluator_names,
+    check_prerequisites,
+    resolve_evaluators_for_prompt,
+    get_evaluator_threshold,
+)
+from parallel_executor import execute_in_parallel
+from response_extractor import get_response_text_for_evaluation
+from retry_policy import (
+    is_retryable_status,
+    get_backoff_seconds,
+    get_retry_after_seconds,
+)
+from throttle_gate import ThrottleGate
+
+
+@dataclass
+class PipelineConfig:
+    """Runtime configuration for the evaluation pipeline."""
+    agent_client: BaseAgentClient
+    model_config: AzureOpenAIModelConfiguration
+    has_azure_openai: bool
+    default_evaluators: Dict[str, Any]
+    chat_gate: ThrottleGate = field(default_factory=lambda: ThrottleGate("chat_api"))
+    is_retryable_status: Any = field(default=is_retryable_status)
+    get_backoff_seconds: Any = field(default=get_backoff_seconds)
+
+
+class ItemType(Enum):
+    SINGLE_TURN = "single_turn"
+    MULTI_TURN = "multi_turn"
+
+
+def detect_item_type(item: dict) -> ItemType:
+    """Determine if an evaluation item is single-turn or multi-turn.
+
+    Returns ItemType.SINGLE_TURN if item has 'prompt' without 'turns',
+    ItemType.MULTI_TURN if item has 'turns' array.
+
+    Raises ValueError for invalid items (both, neither, or invalid turns).
+    """
+    has_turns = "turns" in item
+    has_prompt = "prompt" in item
+
+    if has_turns and has_prompt:
+        raise ValueError(
+            "Invalid evaluation item: cannot have both 'turns' and 'prompt'. "
+            "Use 'turns' for multi-turn threads or 'prompt' for single-turn."
+        )
+
+    if has_turns and not isinstance(item["turns"], list):
+        raise ValueError("Invalid evaluation item: 'turns' must be a list")
+
+    if has_turns:
+        if len(item["turns"]) == 0:
+            raise ValueError("Invalid multi-turn thread: 'turns' array cannot be empty")
+        return ItemType.MULTI_TURN
+
+    if has_prompt:
+        return ItemType.SINGLE_TURN
+
+    raise ValueError(
+        "Invalid evaluation item: must have either 'turns' array (multi-turn) "
+        "or 'prompt' field (single-turn)"
+    )
+
+
+def _decorate_metric(metric_id: str, data, threshold: Optional[float] = None) -> Dict[str, Any]:
+    """Augment raw evaluator output with standardized threshold + pass/fail result."""
+    pass_threshold = threshold if threshold is not None else DEFAULT_PASS_THRESHOLD
+    payload = {}
+    if isinstance(data, dict):
+        payload.update(data)
+    else:
+        payload['raw'] = data
+
+    score_val = None
+    if isinstance(data, dict):
+        if metric_id in data:
+            score_val = data[metric_id]
+    if isinstance(score_val, (int, float)):
+        payload['threshold'] = pass_threshold
+        payload['result'] = STATUS_PASS if score_val >= pass_threshold else STATUS_FAIL
+    else:
+        payload['threshold'] = pass_threshold
+        payload.setdefault('result', STATUS_UNKNOWN)
+    return payload
+
+
+def _run_evaluators_for_item(
+    prompt: str,
+    actual_response: str,
+    expected_response: str,
+    enhanced_response: Dict[str, Any],
+    resolved_evaluators: Dict[str, Any],
+    model_config: AzureOpenAIModelConfiguration,
+    has_azure_openai: bool,
+    m365_agent_id: Optional[str],
+) -> Tuple[Dict[str, Optional[str]], List[str]]:
+    """Run resolved evaluators against a single item/turn.
+
+    Returns (results_dict, evaluators_ran).
+    """
+    has_tool_defs = bool(
+        m365_agent_id and enhanced_response.get("tool_definitions")
+    )
+    available_context = {
+        REQUIRES_AZURE_OPENAI: has_azure_openai,
+        REQUIRES_TOOL_DEFINITIONS: has_tool_defs,
+    }
+
+    results_dict: Dict[str, Optional[str]] = {}
+    evaluators_ran: List[str] = []
+
+    for eval_name, eval_options in resolved_evaluators.items():
+        can_run, warn_msg = check_prerequisites(eval_name, available_context)
+        if not can_run:
+            if warn_msg:
+                emit_structured_log(
+                    "warning",
+                    f"Evaluator '{eval_name}' prerequisite check failed: {warn_msg}",
+                    operation=Operation.EVALUATE,
+                )
+            results_dict[eval_name] = None
+            continue
+
+        threshold = get_evaluator_threshold(eval_name, eval_options)
+
+        try:
+            if eval_name == RELEVANCE:
+                raw_score = RelevanceEvaluator(model_config=model_config)(query=prompt, response=actual_response)
+                results_dict[RELEVANCE] = _decorate_metric(METRIC_IDS[RELEVANCE], raw_score, threshold)
+            elif eval_name == COHERENCE:
+                raw_score = CoherenceEvaluator(model_config=model_config)(query=prompt, response=actual_response)
+                results_dict[COHERENCE] = _decorate_metric(METRIC_IDS[COHERENCE], raw_score, threshold)
+            elif eval_name == GROUNDEDNESS:
+                raw_score = GroundednessEvaluator(model_config=model_config)(response=actual_response, context=expected_response)
+                results_dict[GROUNDEDNESS] = _decorate_metric(METRIC_IDS[GROUNDEDNESS], raw_score, threshold)
+            elif eval_name == SIMILARITY:
+                raw_score = SimilarityEvaluator(model_config=model_config)(query=prompt, response=actual_response, ground_truth=expected_response)
+                results_dict[SIMILARITY] = _decorate_metric(METRIC_IDS[SIMILARITY], raw_score, threshold)
+            elif eval_name == TOOL_CALL_ACCURACY:
+                raw_score = ToolCallAccuracyEvaluator(model_config)(
+                    query=prompt,
+                    response=enhanced_response.get("response", actual_response),
+                    tool_definitions=enhanced_response.get("tool_definitions", []),
+                )
+                results_dict[TOOL_CALL_ACCURACY] = _decorate_metric(METRIC_IDS[TOOL_CALL_ACCURACY], raw_score, threshold)
+            elif eval_name == CITATIONS:
+                fmt_str = eval_options.get("citation_format", "oai_unicode")
+                fmt_map = {
+                    "oai_unicode": CitationFormat.OAI_UNICODE,
+                    "bracket": CitationFormat.LEGACY_BRACKET,
+                    "mixed": CitationFormat.AUTO,
+                }
+                raw_score = CitationsEvaluator(citation_format=fmt_map.get(fmt_str, CitationFormat.OAI_UNICODE))(response=actual_response)
+                results_dict[CITATIONS] = _decorate_metric(METRIC_IDS[CITATIONS], raw_score, threshold)
+            elif eval_name == EXACT_MATCH:
+                case_sensitive = eval_options.get("case_sensitive", False)
+                raw_score = ExactMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response, expected_answer=expected_response)
+                # ExactMatch is binary — the evaluator already sets 'result'
+                # so _decorate_metric (which computes result from score vs threshold) is not needed.
+                results_dict[EXACT_MATCH] = raw_score
+            elif eval_name == PARTIAL_MATCH:
+                case_sensitive = eval_options.get("case_sensitive", False)
+                raw_score = PartialMatchEvaluator(case_sensitive=case_sensitive)(response=actual_response, expected_answer=expected_response)
+                results_dict[PARTIAL_MATCH] = _decorate_metric(METRIC_IDS[PARTIAL_MATCH], raw_score, threshold)
+
+            evaluators_ran.append(eval_name)
+        except Exception as e:
+            emit_structured_log(
+                "error",
+                f"Evaluator '{eval_name}' crashed and will be omitted from results: {e}",
+                operation=Operation.EVALUATE,
+            )
+            results_dict[eval_name] = None
+
+    return results_dict, evaluators_ran
+
+
+def _check_all_passed(results_dict: Dict[str, Optional[Dict[str, Any]]]) -> bool:
+    """Check if all evaluator results passed. Skipped evaluators (None) are ignored."""
+    for result_data in results_dict.values():
+        if result_data is None:
+            continue
+        if result_data.get("result") == STATUS_FAIL:
+            return False
+    return True
+
+
+def _evaluate_multi_turn_responses(
+    turns: List[Dict],
+    m365_agent_id: Optional[str],
+    effective_log_level: str,
+    default_evaluators: Dict[str, Any],
+    model_config: AzureOpenAIModelConfiguration,
+    has_azure_openai: bool,
+) -> Tuple[List[Dict], Dict]:
+    """Run per-turn evaluations and build evaluated turn results with summary.
+
+    Returns:
+        Tuple of (evaluated_turns, summary). Each evaluated turn contains
+        prompt, response, expected_response, status, evaluators_ran, results,
+        and optionally error. Does not mutate the input turns.
+    """
+    evaluated_turns: List[Dict] = []
+    turns_passed = 0
+    turns_failed = 0
+
+    for i, turn in enumerate(turns):
+        evaluated_turn: Dict[str, Any] = {
+            "prompt": turn.get("prompt", ""),
+        }
+        if "expected_response" in turn:
+            evaluated_turn["expected_response"] = turn["expected_response"]
+        if "response" in turn:
+            evaluated_turn["response"] = turn["response"]
+        if "evaluators" in turn:
+            evaluated_turn["evaluators"] = turn["evaluators"]
+        if "evaluators_mode" in turn:
+            evaluated_turn["evaluators_mode"] = turn["evaluators_mode"]
+
+        if turn.get("status") == STATUS_ERROR:
+            evaluated_turn["status"] = STATUS_ERROR
+            evaluated_turn["error"] = turn.get("error", "")
+            turns_failed += 1
+            evaluated_turns.append(evaluated_turn)
+            continue
+
+        enhanced_response = turn.get("_enhanced_response", {})
+        actual_response = get_response_text_for_evaluation(enhanced_response)
+
+        resolved = resolve_evaluators_for_prompt(
+            turn.get("evaluators"), turn.get("evaluators_mode", "extend"),
+            turn.get("prompt", ""), default_evaluators,
+        )
+
+        results_dict, evaluators_ran = _run_evaluators_for_item(
+            turn.get("prompt", ""), actual_response, turn.get("expected_response", ""),
+            enhanced_response, resolved, model_config, has_azure_openai, m365_agent_id,
+        )
+
+        all_passed = _check_all_passed(results_dict)
+
+        evaluated_turn["results"] = results_dict
+        evaluated_turn["evaluators_ran"] = evaluators_ran
+        evaluated_turn["status"] = STATUS_PASS if all_passed else STATUS_FAIL
+
+        if effective_log_level == "debug":
+            emit_structured_log(
+                "debug",
+                f"Evaluation completed for turn {i + 1} prompt='{turn.get('prompt', '')}'. "
+                f"Evaluators: {', '.join(evaluators_ran)}. "
+                f"Scores: {results_dict}",
+                operation=Operation.EVALUATE,
+            )
+
+        if all_passed:
+            turns_passed += 1
+        else:
+            turns_failed += 1
+
+        evaluated_turns.append(evaluated_turn)
+
+    turns_total = len(turns)
+    if turns_passed == turns_total:
+        overall_status = STATUS_PASS
+    elif turns_failed == turns_total:
+        overall_status = STATUS_FAIL
+    else:
+        overall_status = STATUS_PARTIAL
+
+    summary = {
+        "turns_total": turns_total,
+        "turns_passed": turns_passed,
+        "turns_failed": turns_failed,
+        "overall_status": overall_status,
+    }
+
+    return evaluated_turns, summary
+
+
+def _evaluate_single_response(
+    enhanced_response: Dict[str, Any],
+    eval_item: Dict,
+    m365_agent_id: Optional[str],
+    effective_log_level: str,
+    model_config: AzureOpenAIModelConfiguration,
+    has_azure_openai: bool,
+    default_evaluators: Dict[str, Any],
+) -> Dict[str, Any]:
+    """Run all evaluators for a single prompt/response pair and return the result dict."""
+    actual_response_text = get_response_text_for_evaluation(enhanced_response)
+    prompt = eval_item.get("prompt", "")
+    expected_response = eval_item.get("expected_response", "")
+
+    resolved = resolve_evaluators_for_prompt(
+        eval_item.get("evaluators"), eval_item.get("evaluators_mode", "extend"),
+        prompt, default_evaluators,
+    )
+
+    results_dict, evaluators_ran = _run_evaluators_for_item(
+        prompt, actual_response_text, expected_response, enhanced_response,
+        resolved, model_config, has_azure_openai, m365_agent_id,
+    )
+
+    evaluation_result = {
+        "prompt": prompt,
+        "response": enhanced_response.get(
+            "display_response_text", actual_response_text
+        ),
+        "expected_response": expected_response,
+        "evaluators_ran": evaluators_ran,
+        "results": results_dict,
+    }
+
+    if "evaluators" in eval_item:
+        evaluation_result["evaluators"] = eval_item["evaluators"]
+    if "evaluators_mode" in eval_item:
+        evaluation_result["evaluators_mode"] = eval_item["evaluators_mode"]
+
+    if effective_log_level == "debug":
+        emit_structured_log(
+            "debug",
+            f"Evaluation completed for prompt='{evaluation_result['prompt']}'. "
+            f"Evaluators: {', '.join(evaluators_ran)}. "
+            f"Scores: {evaluation_result['results']}",
+            operation=Operation.EVALUATE,
+        )
+
+    return evaluation_result
+
+
+def get_effective_worker_count(prompt_count: int, concurrency: int) -> int:
+    """Compute safe worker count for prompt processing."""
+    if prompt_count <= 0:
+        return 1
+
+    try:
+        requested_int = int(concurrency)
+    except (TypeError, ValueError):
+        requested_int = MAX_CONCURRENCY
+
+    bounded = max(1, min(requested_int, MAX_CONCURRENCY))
+    return min(bounded, prompt_count)
+
+
+def run_pipeline(
+    pipeline: PipelineConfig,
+    eval_items: List[Dict],
+    config: RunConfig,
+) -> List[Dict[str, Any]]:
+    """Run the full evaluation pipeline: send prompts and evaluate responses in parallel.
+
+    Each worker processes one prompt end-to-end: send → evaluate.
+    Results are returned in original prompt order (FR-006).
+    """
+    # Validate all evaluator names upfront before dispatching workers
+    all_evaluator_maps = [pipeline.default_evaluators]
+    for eval_item in eval_items:
+        if "evaluators" in eval_item:
+            all_evaluator_maps.append(eval_item["evaluators"])
+        for turn in eval_item.get("turns", []):
+            if "evaluators" in turn:
+                all_evaluator_maps.append(turn["evaluators"])
+    for emap in all_evaluator_maps:
+        validate_evaluator_names(emap)
+
+    # Validate all items upfront and classify types before dispatching workers
+    item_types: List[ItemType] = []
+    for idx, eval_item in enumerate(eval_items):
+        try:
+            item_type = detect_item_type(eval_item)
+        except ValueError as e:
+            raise ValueError(f"Invalid evaluation item at index {idx}: {e}") from e
+        if item_type == ItemType.MULTI_TURN:
+            turn_count = len(eval_item["turns"])
+            if turn_count > MAX_TURNS_PER_THREAD:
+                raise ValueError(
+                    f"Invalid evaluation item at index {idx}: 'turns' array has "
+                    f"{turn_count} items (max {MAX_TURNS_PER_THREAD})"
+                )
+        item_types.append(item_type)
+
+    total = len(eval_items)
+    worker_count = get_effective_worker_count(total, config.concurrency)
+
+    multi_turn_count = sum(1 for t in item_types if t == ItemType.MULTI_TURN)
+    single_turn_count = total - multi_turn_count
+
+    emit_structured_log(
+        "info",
+        f"Running pipeline with {worker_count} worker(s) for {total} item(s) "
+        f"({single_turn_count} single-turn, {multi_turn_count} multi-turn).",
+        operation=Operation.EVALUATE,
+    )
+
+    def _process_item(eval_item: Dict, index: int) -> Dict[str, Any]:
+        if item_types[index] == ItemType.MULTI_TURN:
+            return _process_multi_turn(eval_item, index)
+        return _process_single_turn(eval_item, index)
+
+    def _process_single_turn(eval_item: Dict, index: int) -> Dict[str, Any]:
+        prompt = eval_item.get("prompt", "")
+        emit_structured_log(
+            "info",
+            f"Processing item {index + 1}/{total} (single-turn).",
+            operation=Operation.SEND_PROMPT,
+        )
+
+        # Phase A: Send prompt to agent (with retry + throttle gate)
+        response = None
+        for attempt in range(1, MAX_ATTEMPTS + 1):
+            pipeline.chat_gate.wait_if_blocked()
+            try:
+                response, _ = pipeline.agent_client.send_prompt(prompt, agent_id=config.m365_agent_id)
+                break
+            except Exception as exc:
+                cause = exc.__cause__
+                status = int(getattr(cause, "code", 0) or 0) or None if cause else None
+                retry_after = get_retry_after_seconds(
+                    cause.headers.get("Retry-After") if cause and getattr(cause, "headers", None) else None
+                )
+
+                if retry_after is not None and pipeline.is_retryable_status(status):
+                    pipeline.chat_gate.apply_retry_after(retry_after)
+
+                if not pipeline.is_retryable_status(status) or attempt >= MAX_ATTEMPTS:
+                    emit_structured_log(
+                        "error",
+                        f"Item {index + 1}/{total} failed after {attempt} attempt(s): {exc}",
+                        operation=Operation.SEND_PROMPT,
+                    )
+                    return {
+                        "prompt": prompt,
+                        "response": "",
+                        "expected_response": eval_item.get("expected_response", ""),
+                        "evaluators_ran": [],
+                        "results": {},
+                        "status": STATUS_ERROR,
+                        "errorDetails": str(exc),
+                    }
+
+                delay = retry_after if retry_after is not None else pipeline.get_backoff_seconds(attempt)
+                time.sleep(delay)
+
+        # Phase B: Evaluate response
+        return _evaluate_single_response(
+            response, eval_item, config.m365_agent_id, config.effective_log_level,
+            pipeline.model_config, pipeline.has_azure_openai,
+            pipeline.default_evaluators,
+        )
+
+    def _process_multi_turn(eval_item: Dict, index: int) -> Dict[str, Any]:
+        turns = eval_item["turns"]
+        thread_name = eval_item.get("name", "Unnamed thread")
+        emit_structured_log(
+            "info",
+            f"Processing item {index + 1}/{total} (multi-turn: '{thread_name}').",
+            operation=Operation.SEND_PROMPT,
+        )
+
+        if len(turns) > LONG_THREAD_WARNING_THRESHOLD:
+            emit_structured_log(
+                "warning",
+                f"Thread '{thread_name}' has {len(turns)} turns (>{LONG_THREAD_WARNING_THRESHOLD}). This may take a while.",
+                operation=Operation.SEND_PROMPT,
+            )
+
+        # Phase A: Send each turn with throttle gate + 429-only retry
+        # Multi-turn only retries on 429 (server confirmed it didn't process
+        # the request). Other transient errors (503, 504) are ambiguous about
+        # whether the server processed the turn, risking duplicate turns in
+        # the conversation if retried.
+        conversation_context = None
+        conversation_id = None
+        enriched_turns: List[Dict[str, Any]] = []
+        failed = False
+
+        for i, turn in enumerate(turns):
+            prompt = turn["prompt"]
+            emit_structured_log(
+                "debug",
+                f"Sending turn {i + 1}/{len(turns)} of '{thread_name}'.",
+                operation=Operation.SEND_PROMPT,
+            )
+
+            response = None
+            for attempt in range(1, MAX_ATTEMPTS + 1):
+                pipeline.chat_gate.wait_if_blocked()
+                try:
+                    response, conversation_context = pipeline.agent_client.send_prompt(
+                        prompt, agent_id=config.m365_agent_id,
+                        conversation_context=conversation_context,
+                    )
+                    break
+                except Exception as exc:
+                    cause = exc.__cause__
+                    status = int(getattr(cause, "code", 0) or 0) or None if cause else None
+                    retry_after = get_retry_after_seconds(
+                        cause.headers.get("Retry-After") if cause and getattr(cause, "headers", None) else None
+                    )
+
+                    # Only retry on 429 — server confirmed it didn't process the request
+                    if status == 429 and attempt < MAX_ATTEMPTS:
+                        if retry_after is not None:
+                            pipeline.chat_gate.apply_retry_after(retry_after)
+                        delay = retry_after if retry_after is not None else pipeline.get_backoff_seconds(attempt)
+                        time.sleep(delay)
+                        continue
+
+                    # All other errors: stop the thread
+                    emit_structured_log(
+                        "error",
+                        f"Turn {i + 1}/{len(turns)} of '{thread_name}' failed after {attempt} attempt(s): {exc}",
+                        operation=Operation.SEND_PROMPT,
+                    )
+                    failed = True
+                    break
+
+            if failed:
+                # Mark this turn and all remaining turns as error
+                enriched_turns.append({
+                    **turn,
+                    "response": "",
+                    "status": STATUS_ERROR,
+                    "error": "Failed to get response from agent",
+                })
+                for j in range(i + 1, len(turns)):
+                    enriched_turns.append({
+                        **turns[j],
+                        "response": "",
+                        "status": STATUS_ERROR,
+                        "error": "Skipped: preceding turn failed",
+                    })
+                break
+
+            # Enrich turn with response
+            response_text = get_response_text_for_evaluation(response)
+            enriched_turns.append({
+                **turn,
+                "response": response.get("display_response_text", response_text),
+                "_enhanced_response": response,
+            })
+
+            # Capture conversation_id from first response
+            if conversation_id is None:
+                conversation_id = response.get("metadata", {}).get("conversation_id")
+
+        # Phase B: Run per-turn evaluations
+        evaluated_turns, summary = _evaluate_multi_turn_responses(
+            enriched_turns, config.m365_agent_id, config.effective_log_level,
+            pipeline.default_evaluators,
+            model_config=pipeline.model_config,
+            has_azure_openai=pipeline.has_azure_openai,
+        )
+
+        return {
+            "type": "multi_turn",
+            "name": eval_item.get("name", ""),
+            "description": eval_item.get("description", ""),
+            "conversation_id": conversation_id or "",
+            "turns": evaluated_turns,
+            "summary": summary,
+        }
+
+    execution_results = execute_in_parallel(
+        eval_items, _process_item, max_workers=worker_count,
+    )
+
+    # Unwrap WorkerResult objects into plain dicts, with error fallback
+    ordered_results: List[Dict[str, Any]] = []
+    for wr in execution_results:
+        if wr.error:
+            idx = wr.index
+            item = eval_items[idx]
+            if item_types[idx] == ItemType.MULTI_TURN:
+                ordered_results.append({
+                    "type": "multi_turn",
+                    "name": item.get("name", ""),
+                    "turns": [
+                        {**t, "status": STATUS_ERROR, "error": str(wr.error), "response": "", "results": {}}
+                        for t in item.get("turns", [])
+                    ],
+                    "summary": {
+                        "turns_total": len(item.get("turns", [])),
+                        "turns_passed": 0,
+                        "turns_failed": len(item.get("turns", [])),
+                        "overall_status": STATUS_FAIL,
+                    },
+                    "error": str(wr.error),
+                })
+            else:
+                ordered_results.append({
+                    "prompt": item.get("prompt", ""),
+                    "response": "",
+                    "expected_response": item.get("expected_response", ""),
+                    "evaluators_ran": [],
+                    "results": {},
+                    "status": STATUS_ERROR,
+                    "errorDetails": str(wr.error),
+                })
+        else:
+            ordered_results.append(wr.value)
+
+    return ordered_results
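A minimal usage sketch of the new module. The agent client, model configuration, and RunConfig are assumed to be constructed elsewhere, and the default_evaluators shape (evaluator name mapped to an options dict) is inferred from how the pipeline consumes it, not from package documentation:

    pipeline = PipelineConfig(
        agent_client=my_agent_client,    # any BaseAgentClient implementation (assumed)
        model_config=my_model_config,    # an AzureOpenAIModelConfiguration (assumed)
        has_azure_openai=True,
        default_evaluators={RELEVANCE: {}, EXACT_MATCH: {"case_sensitive": False}},
    )
    eval_items = [single_turn_item, multi_turn_item]   # shapes as in the detect_item_type sketch above
    results = run_pipeline(pipeline, eval_items, config=my_run_config)  # my_run_config: a RunConfig (assumed)
    # results[i] corresponds to eval_items[i]; multi-turn entries carry "turns" and "summary",
    # single-turn entries carry "results" keyed by evaluator name.
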
@@ -12,12 +12,11 @@ from common import (
     RELEVANCE,
     COHERENCE,
     GROUNDEDNESS,
-    TOOL_CALL_ACCURACY,
+    SIMILARITY,
     CITATIONS,
     EXACT_MATCH,
     PARTIAL_MATCH,
     REQUIRES_AZURE_OPENAI,
-    REQUIRES_TOOL_DEFINITIONS,
     SYSTEM_DEFAULT_EVALUATORS,
     RegistryEntry,
 )
@@ -30,7 +29,7 @@ EVALUATOR_REGISTRY: Dict[str, RegistryEntry] = {
     RELEVANCE: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
     COHERENCE: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
     GROUNDEDNESS: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
-    TOOL_CALL_ACCURACY: RegistryEntry(type="tool", requires=[REQUIRES_AZURE_OPENAI, REQUIRES_TOOL_DEFINITIONS], default_threshold=3),
+    SIMILARITY: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
     CITATIONS: RegistryEntry(type="non-llm", requires=[], default_threshold=1),
     EXACT_MATCH: RegistryEntry(type="non-llm", requires=[], default_threshold=None),
     PARTIAL_MATCH: RegistryEntry(type="non-llm", requires=[], default_threshold=0.5),
@@ -61,9 +60,13 @@ def validate_evaluator_names(evaluator_map: Dict[str, Any]) -> None:
 
     lines.append("")
     lines.append("Valid evaluators are:")
-    lines.append(f" - {', '.join(llm_evals)} (LLM-based)")
-    lines.append(f" - {', '.join(tool_evals)} (tool evaluation)")
-    lines.append(f" - {', '.join(non_llm_evals)} (non-LLM)")
+    for category, label in [
+        (llm_evals, "LLM-based"),
+        (tool_evals, "tool evaluation"),
+        (non_llm_evals, "non-LLM"),
+    ]:
+        if category:
+            lines.append(f" - {', '.join(category)} ({label})")
 
     raise ValueError("\n".join(lines))
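
With the reworked listing in validate_evaluator_names, empty categories are skipped. Once TOOL_CALL_ACCURACY is gone from EVALUATOR_REGISTRY, the error text would read roughly as below (the evaluator names shown are placeholders for the registry constants' string values):

    # Valid evaluators are:
    #  - relevance, coherence, groundedness, similarity (LLM-based)
    #  - citations, exact_match, partial_match (non-LLM)
    # The "(tool evaluation)" line is omitted because tool_evals is now empty.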