@microsoft/m365-copilot-eval 1.5.0-preview.1 → 1.7.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,5 @@
 """Core evaluation pipeline — evaluator dispatch, retry, parallel execution."""
 
-import json
 import time
 from dataclasses import dataclass, field
 from enum import Enum
@@ -27,8 +26,6 @@ from common import (
     CITATIONS,
     EXACT_MATCH,
     PARTIAL_MATCH,
-    REQUIRES_AZURE_OPENAI,
-    REQUIRES_TOOL_DEFINITIONS,
     METRIC_IDS,
     MAX_ATTEMPTS,
     MAX_CONCURRENCY,
@@ -37,17 +34,17 @@ from common import (
     STATUS_FAIL,
     STATUS_ERROR,
     STATUS_PARTIAL,
-    STATUS_UNKNOWN,
     MAX_TURNS_PER_THREAD,
     LONG_THREAD_WARNING_THRESHOLD,
     RunConfig,
 )
+from error_messages import agent_request_failed, evaluator_failed, turn_skipped
+from status_derivation import rollup_thread_status, status_for_response
 from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat
 from custom_evaluators.ExactMatchEvaluator import ExactMatchEvaluator
 from custom_evaluators.PartialMatchEvaluator import PartialMatchEvaluator
 from evaluator_resolver import (
     validate_evaluator_names,
-    check_prerequisites,
     resolve_evaluators_for_prompt,
     get_evaluator_threshold,
 )
@@ -113,7 +110,12 @@ def detect_item_type(item: dict) -> ItemType:
 
 
 def _decorate_metric(metric_id: str, data, threshold: Optional[float] = None) -> Dict[str, Any]:
-    """Augment raw evaluator output with standardized threshold + pass/fail result."""
+    """Augment raw evaluator output with standardized threshold + pass/fail result.
+
+    Raises ValueError if the SDK returned a malformed result (no numeric score
+    under ``metric_id``). The outer try/except in :func:`_run_evaluators_for_item`
+    catches it and emits a standard ``evaluator_failed`` error entry.
+    """
     pass_threshold = threshold if threshold is not None else DEFAULT_PASS_THRESHOLD
     payload = {}
     if isinstance(data, dict):
@@ -125,12 +127,12 @@ def _decorate_metric(metric_id: str, data, threshold: Optional[float] = None) ->
     if isinstance(data, dict):
         if metric_id in data:
             score_val = data[metric_id]
-            if isinstance(score_val, (int, float)):
-                payload['threshold'] = pass_threshold
-                payload['result'] = STATUS_PASS if score_val >= pass_threshold else STATUS_FAIL
-            else:
-                payload['threshold'] = pass_threshold
-                payload.setdefault('result', STATUS_UNKNOWN)
+            if not isinstance(score_val, (int, float)):
+                raise ValueError(
+                    f"non-numeric score from evaluator (metric_id={metric_id!r}, score={score_val!r})"
+                )
+            payload['threshold'] = pass_threshold
+            payload['result'] = STATUS_PASS if score_val >= pass_threshold else STATUS_FAIL
     return payload
 
 
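To make the new contract concrete, a decorated metric now either carries a numeric score with a threshold verdict or never reaches the results at all. A small sketch (the metric id and score are made-up, and the elided lines that copy the raw evaluator fields into `payload` sit outside this hunk):

    # Hypothetical SDK output for a metric whose id happens to be "relevance".
    raw = {"relevance": 4.0}

    decorated = _decorate_metric("relevance", raw, threshold=3)
    # 4.0 >= 3, so the decorated payload carries:
    #   decorated["threshold"] == 3
    #   decorated["result"] == STATUS_PASS

    # A malformed result such as {"relevance": None} now raises ValueError instead of
    # being recorded with the removed STATUS_UNKNOWN; _run_evaluators_for_item catches
    # it and stores an errored entry for that evaluator.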
@@ -141,36 +143,20 @@ def _run_evaluators_for_item(
     enhanced_response: Dict[str, Any],
     resolved_evaluators: Dict[str, Any],
     model_config: AzureOpenAIModelConfiguration,
-    has_azure_openai: bool,
-    m365_agent_id: Optional[str],
-) -> Tuple[Dict[str, Optional[str]], List[str]]:
+    context_label: str = "",
+) -> Tuple[Dict[str, Dict[str, Any]], List[str]]:
     """Run resolved evaluators against a single item/turn.
 
-    Returns (results_dict, evaluators_ran).
+    Each value in results_dict is a decorated metric dict on success or an
+    errored entry ``{result: "error", error: "Evaluator failed: <exc.message>", threshold}``
+    on crash. The ``threshold`` is included on errored entries so the aggregate
+    report can still display it; the persisted ErroredScore shape strips it
+    out at write time (see ``_as_errored_score`` in result_writer).
     """
-    has_tool_defs = bool(
-        m365_agent_id and enhanced_response.get("tool_definitions")
-    )
-    available_context = {
-        REQUIRES_AZURE_OPENAI: has_azure_openai,
-        REQUIRES_TOOL_DEFINITIONS: has_tool_defs,
-    }
-
-    results_dict: Dict[str, Optional[str]] = {}
+    results_dict: Dict[str, Dict[str, Any]] = {}
     evaluators_ran: List[str] = []
 
     for eval_name, eval_options in resolved_evaluators.items():
-        can_run, warn_msg = check_prerequisites(eval_name, available_context)
-        if not can_run:
-            if warn_msg:
-                emit_structured_log(
-                    "warning",
-                    f"Evaluator '{eval_name}' prerequisite check failed: {warn_msg}",
-                    operation=Operation.EVALUATE,
-                )
-            results_dict[eval_name] = None
-            continue
-
         threshold = get_evaluator_threshold(eval_name, eval_options)
 
         try:
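So a skipped-evaluator `None` placeholder no longer appears in `results_dict`; every key now maps either to a decorated metric or to an errored entry. A sketch with made-up metric ids, scores and exception text:

    results_dict = {
        # Evaluator ran: decorated by _decorate_metric (score, threshold, pass/fail verdict).
        "relevance": {"relevance": 4.0, "threshold": 3, "result": "pass"},
        # Evaluator crashed: only the scrubbed message is persisted (evaluator_failed(exc_msg));
        # the full exception class and traceback go to the structured log stream only.
        "groundedness": {
            "result": "error",
            "error": "Evaluator failed: connection reset by peer",
            "threshold": 3,
        },
    }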
@@ -215,33 +201,39 @@
 
             evaluators_ran.append(eval_name)
         except Exception as e:
+            # Full exception detail goes to the log stream (FR-009). Persisted
+            # output gets the scrubbed text from error_messages.evaluator_failed
+            # — exception.message only, never repr / class name / traceback.
+            where = f" on response for {context_label}" if context_label else ""
             emit_structured_log(
                 "error",
-                f"Evaluator '{eval_name}' crashed and will be omitted from results: {e}",
+                f"Evaluator '{eval_name}' crashed{where}: {e}",
                 operation=Operation.EVALUATE,
             )
-            results_dict[eval_name] = None
+            exc_msg = getattr(e, "message", None) or str(e)
+            results_dict[eval_name] = {
+                "result": STATUS_ERROR,
+                "error": evaluator_failed(exc_msg),
+                "threshold": threshold,
+            }
 
     return results_dict, evaluators_ran
 
 
-def _check_all_passed(results_dict: Dict[str, Optional[Dict[str, Any]]]) -> bool:
-    """Check if all evaluator results passed. Skipped evaluators (None) are ignored."""
-    for result_data in results_dict.values():
-        if result_data is None:
-            continue
-        if result_data.get("result") == STATUS_FAIL:
-            return False
-    return True
+def _collect_evaluator_results(results_dict: Dict[str, Dict[str, Any]]) -> List[str]:
+    """Extract per-evaluator ``result`` values (one of pass/fail/error) for status derivation."""
+    return [
+        d["result"] for d in results_dict.values()
+        if d.get("result") in (STATUS_PASS, STATUS_FAIL, STATUS_ERROR)
+    ]
 
 
 def _evaluate_multi_turn_responses(
     turns: List[Dict],
-    m365_agent_id: Optional[str],
     effective_log_level: str,
     default_evaluators: Dict[str, Any],
     model_config: AzureOpenAIModelConfiguration,
-    has_azure_openai: bool,
+    thread_name: str = "",
 ) -> Tuple[List[Dict], Dict]:
     """Run per-turn evaluations and build evaluated turn results with summary.
 
@@ -251,8 +243,6 @@ def _evaluate_multi_turn_responses(
     and optionally error. Does not mutate the input turns.
     """
     evaluated_turns: List[Dict] = []
-    turns_passed = 0
-    turns_failed = 0
 
     for i, turn in enumerate(turns):
         evaluated_turn: Dict[str, Any] = {
@@ -268,9 +258,10 @@
             evaluated_turn["evaluators_mode"] = turn["evaluators_mode"]
 
         if turn.get("status") == STATUS_ERROR:
+            # Request-failure or downstream-skip turn — error already set upstream.
            evaluated_turn["status"] = STATUS_ERROR
-            evaluated_turn["error"] = turn.get("error", "")
-            turns_failed += 1
+            if "error" in turn:
+                evaluated_turn["error"] = turn["error"]
             evaluated_turns.append(evaluated_turn)
             continue
 
@@ -282,16 +273,22 @@
             turn.get("prompt", ""), default_evaluators,
         )
 
+        thread_part = f" of '{thread_name}'" if thread_name else ""
+        turn_label = f"turn {i + 1}/{len(turns)}{thread_part}"
         results_dict, evaluators_ran = _run_evaluators_for_item(
             turn.get("prompt", ""), actual_response, turn.get("expected_response", ""),
-            enhanced_response, resolved, model_config, has_azure_openai, m365_agent_id,
+            enhanced_response, resolved, model_config,
+            context_label=turn_label,
         )
 
-        all_passed = _check_all_passed(results_dict)
+        evaluator_result_values = _collect_evaluator_results(results_dict)
+        status, error_obj = status_for_response(evaluator_result_values)
 
         evaluated_turn["results"] = results_dict
         evaluated_turn["evaluators_ran"] = evaluators_ran
-        evaluated_turn["status"] = STATUS_PASS if all_passed else STATUS_FAIL
+        evaluated_turn["status"] = status
+        if error_obj is not None:
+            evaluated_turn["error"] = error_obj
 
         if effective_log_level == "debug":
             emit_structured_log(
@@ -302,26 +299,17 @@
                 operation=Operation.EVALUATE,
             )
 
-        if all_passed:
-            turns_passed += 1
-        else:
-            turns_failed += 1
-
         evaluated_turns.append(evaluated_turn)
 
-    turns_total = len(turns)
-    if turns_passed == turns_total:
-        overall_status = STATUS_PASS
-    elif turns_failed == turns_total:
-        overall_status = STATUS_FAIL
-    else:
-        overall_status = STATUS_PARTIAL
-
+    turn_statuses = [t.get("status", STATUS_ERROR) for t in evaluated_turns]
+    turns_total = len(evaluated_turns)
     summary = {
         "turns_total": turns_total,
-        "turns_passed": turns_passed,
-        "turns_failed": turns_failed,
-        "overall_status": overall_status,
+        "turns_passed": sum(1 for s in turn_statuses if s == STATUS_PASS),
+        "turns_failed": sum(1 for s in turn_statuses if s == STATUS_FAIL),
+        "turns_partial": sum(1 for s in turn_statuses if s == STATUS_PARTIAL),
+        "turns_errored": sum(1 for s in turn_statuses if s == STATUS_ERROR),
+        "overall_status": rollup_thread_status(turn_statuses),
     }
 
     return evaluated_turns, summary
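The thread summary grows matching per-status counters. For example, a four-turn thread with one turn in each state would now be summarized roughly like this (the overall value is an assumption; the exact precedence rules live in `status_derivation`, which is not part of this diff):

    summary = {
        "turns_total": 4,
        "turns_passed": 1,
        "turns_failed": 1,
        "turns_partial": 1,
        "turns_errored": 1,
        "overall_status": "partial",  # assumed result of rollup_thread_status for a mixed thread
    }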
@@ -330,10 +318,8 @@ def _evaluate_multi_turn_responses(
 def _evaluate_single_response(
     enhanced_response: Dict[str, Any],
     eval_item: Dict,
-    m365_agent_id: Optional[str],
     effective_log_level: str,
     model_config: AzureOpenAIModelConfiguration,
-    has_azure_openai: bool,
     default_evaluators: Dict[str, Any],
 ) -> Dict[str, Any]:
     """Run all evaluators for a single prompt/response pair and return the result dict."""
@@ -348,10 +334,14 @@ def _evaluate_single_response(
 
     results_dict, evaluators_ran = _run_evaluators_for_item(
         prompt, actual_response_text, expected_response, enhanced_response,
-        resolved, model_config, has_azure_openai, m365_agent_id,
+        resolved, model_config,
+        context_label=f"prompt '{prompt[:60]}'" if prompt else "",
     )
 
-    evaluation_result = {
+    evaluator_result_values = _collect_evaluator_results(results_dict)
+    status, error_obj = status_for_response(evaluator_result_values)
+
+    evaluation_result: Dict[str, Any] = {
         "prompt": prompt,
         "response": enhanced_response.get(
             "display_response_text", actual_response_text
@@ -359,7 +349,10 @@
         "expected_response": expected_response,
         "evaluators_ran": evaluators_ran,
         "results": results_dict,
+        "status": status,
     }
+    if error_obj is not None:
+        evaluation_result["error"] = error_obj
 
     if "evaluators" in eval_item:
         evaluation_result["evaluators"] = eval_item["evaluators"]
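A single-prompt result therefore now carries an explicit status, with an error object only when one is derived. A rough sketch with illustrative values (whether and what `error` contains depends on `status_for_response`, which is not shown in this diff):

    evaluation_result = {
        "prompt": "What is the travel reimbursement limit?",   # illustrative
        "response": "The limit is 150 USD per day.",           # illustrative
        "expected_response": "150 USD per day",                # illustrative
        "evaluators_ran": ["relevance", "similarity"],
        "results": {
            "relevance": {"relevance": 4.0, "threshold": 3, "result": "pass"},
            "similarity": {"similarity": 3.5, "threshold": 3, "result": "pass"},
        },
        "status": "pass",   # derived by status_for_response; "error" is added only when error_obj is not None
    }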
@@ -485,7 +478,7 @@ def run_pipeline(
                     "evaluators_ran": [],
                     "results": {},
                     "status": STATUS_ERROR,
-                    "errorDetails": str(exc),
+                    "error": agent_request_failed(getattr(exc, "message", None) or str(exc)),
                 }
 
             delay = retry_after if retry_after is not None else pipeline.get_backoff_seconds(attempt)
@@ -493,9 +486,8 @@
 
         # Phase B: Evaluate response
         return _evaluate_single_response(
-            response, eval_item, config.m365_agent_id, config.effective_log_level,
-            pipeline.model_config, pipeline.has_azure_openai,
-            pipeline.default_evaluators,
+            response, eval_item, config.effective_log_level,
+            pipeline.model_config, pipeline.default_evaluators,
         )
 
     def _process_multi_turn(eval_item: Dict, index: int) -> Dict[str, Any]:
@@ -522,7 +514,7 @@ def run_pipeline(
         conversation_context = None
         conversation_id = None
         enriched_turns: List[Dict[str, Any]] = []
-        failed = False
+        failure_exception: Optional[Exception] = None
 
         for i, turn in enumerate(turns):
             prompt = turn["prompt"]
@@ -557,28 +549,34 @@
                         continue
 
                     # All other errors: stop the thread
+                    if status == 429:
+                        note = ""  # 429 retries were exhausted; the attempt count is enough.
+                    else:
+                        status_part = f"HTTP {status}" if status else "this error"
+                        note = f" ({status_part} is not retried in multi-turn to avoid duplicate turns in the conversation)"
                     emit_structured_log(
                         "error",
-                        f"Turn {i + 1}/{len(turns)} of '{thread_name}' failed after {attempt} attempt(s): {exc}",
+                        f"Turn {i + 1}/{len(turns)} of '{thread_name}' failed after {attempt} attempt(s){note}: {exc}",
                         operation=Operation.SEND_PROMPT,
                     )
-                    failed = True
+                    failure_exception = exc
                     break
 
-            if failed:
-                # Mark this turn and all remaining turns as error
+            if failure_exception is not None:
+                # Failing turn carries the cause; downstream turns are skipped.
+                exc_msg = getattr(failure_exception, "message", None) or str(failure_exception)
                 enriched_turns.append({
                     **turn,
                     "response": "",
                     "status": STATUS_ERROR,
-                    "error": "Failed to get response from agent",
+                    "error": agent_request_failed(exc_msg),
                 })
                 for j in range(i + 1, len(turns)):
                     enriched_turns.append({
                         **turns[j],
                         "response": "",
                         "status": STATUS_ERROR,
-                        "error": "Skipped: preceding turn failed",
+                        "error": turn_skipped(),
                     })
                 break
 
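After a mid-thread request failure the enriched turn list now distinguishes the cause from the fallout: the failing turn carries `agent_request_failed(...)` and every later turn carries `turn_skipped()`. A sketch (prompts and the exception message are placeholders; the actual error texts are whatever `error_messages` produces):

    from error_messages import agent_request_failed, turn_skipped  # same import the pipeline adds above

    enriched_turns = [
        {"prompt": "turn 1 prompt", "response": "agent reply text"},         # succeeded before the failure
        {"prompt": "turn 2 prompt", "response": "", "status": "error",
         "error": agent_request_failed("HTTP 500 from the agent")},          # the turn that failed
        {"prompt": "turn 3 prompt", "response": "", "status": "error",
         "error": turn_skipped()},                                           # never sent
    ]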
@@ -596,10 +594,10 @@
 
         # Phase B: Run per-turn evaluations
         evaluated_turns, summary = _evaluate_multi_turn_responses(
-            enriched_turns, config.m365_agent_id, config.effective_log_level,
+            enriched_turns, config.effective_log_level,
             pipeline.default_evaluators,
+            thread_name=thread_name,
             model_config=pipeline.model_config,
-            has_azure_openai=pipeline.has_azure_openai,
         )
 
         return {
@@ -621,21 +619,34 @@
         if wr.error:
             idx = wr.index
             item = eval_items[idx]
+            exc_msg = getattr(wr.error, "message", None) or str(wr.error)
+            cause_error = agent_request_failed(exc_msg)
             if item_types[idx] == ItemType.MULTI_TURN:
+                # Worker raised before any turn ran. Turn 1 carries the cause;
+                # remaining turns are downstream-skipped. All turns errored →
+                # thread overall_status="error".
+                turns = item.get("turns", [])
+                turn_dicts = []
+                for j, t in enumerate(turns):
+                    turn_dicts.append({
+                        **t,
+                        "response": "",
+                        "results": {},
+                        "status": STATUS_ERROR,
+                        "error": cause_error if j == 0 else turn_skipped(),
+                    })
                 ordered_results.append({
                     "type": "multi_turn",
                     "name": item.get("name", ""),
-                    "turns": [
-                        {**t, "status": STATUS_ERROR, "error": str(wr.error), "response": "", "results": {}}
-                        for t in item.get("turns", [])
-                    ],
+                    "turns": turn_dicts,
                     "summary": {
-                        "turns_total": len(item.get("turns", [])),
+                        "turns_total": len(turns),
                         "turns_passed": 0,
-                        "turns_failed": len(item.get("turns", [])),
-                        "overall_status": STATUS_FAIL,
+                        "turns_failed": 0,
+                        "turns_partial": 0,
+                        "turns_errored": len(turns),
+                        "overall_status": STATUS_ERROR,
                     },
-                    "error": str(wr.error),
                 })
             else:
                 ordered_results.append({
@@ -645,7 +656,7 @@
                     "evaluators_ran": [],
                     "results": {},
                     "status": STATUS_ERROR,
-                    "errorDetails": str(wr.error),
+                    "error": cause_error,
                 })
         else:
             ordered_results.append(wr.value)
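When the worker itself raises before any turn runs, the ordered result for a multi-turn item now looks roughly like this (name, prompts and turn count are placeholders):

    from error_messages import agent_request_failed, turn_skipped  # mirrors the pipeline's import

    cause_error = agent_request_failed("worker exception message")  # placeholder message
    failed_item_result = {
        "type": "multi_turn",
        "name": "example-thread",
        "turns": [
            {"prompt": "turn 1", "response": "", "results": {}, "status": "error",
             "error": cause_error},       # first turn carries the cause
            {"prompt": "turn 2", "response": "", "results": {}, "status": "error",
             "error": turn_skipped()},    # later turns are marked as skipped
        ],
        "summary": {
            "turns_total": 2,
            "turns_passed": 0,
            "turns_failed": 0,
            "turns_partial": 0,
            "turns_errored": 2,
            "overall_status": "error",
        },
    }

Note that the former top-level "error" field and the blanket overall_status of "fail" are gone. The remaining hunks below touch the evaluator registry module imported above as `evaluator_resolver`, where the prerequisite machinery this release drops used to live.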
@@ -6,7 +6,7 @@ with file-level defaults and system defaults, following extend/replace modes.
 
 import difflib
 import logging
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Optional
 
 from common import (
     RELEVANCE,
@@ -16,7 +16,6 @@ from common import (
     CITATIONS,
     EXACT_MATCH,
     PARTIAL_MATCH,
-    REQUIRES_AZURE_OPENAI,
     SYSTEM_DEFAULT_EVALUATORS,
     RegistryEntry,
 )
@@ -26,13 +25,13 @@ logger = logging.getLogger(__name__)
 
 # Static registry of available evaluators per data-model.md
 EVALUATOR_REGISTRY: Dict[str, RegistryEntry] = {
-    RELEVANCE: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
-    COHERENCE: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
-    GROUNDEDNESS: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
-    SIMILARITY: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
-    CITATIONS: RegistryEntry(type="non-llm", requires=[], default_threshold=1),
-    EXACT_MATCH: RegistryEntry(type="non-llm", requires=[], default_threshold=None),
-    PARTIAL_MATCH: RegistryEntry(type="non-llm", requires=[], default_threshold=0.5),
+    RELEVANCE: RegistryEntry(type="llm", default_threshold=3),
+    COHERENCE: RegistryEntry(type="llm", default_threshold=3),
+    GROUNDEDNESS: RegistryEntry(type="llm", default_threshold=3),
+    SIMILARITY: RegistryEntry(type="llm", default_threshold=3),
+    CITATIONS: RegistryEntry(type="non-llm", default_threshold=1),
+    EXACT_MATCH: RegistryEntry(type="non-llm", default_threshold=None),
+    PARTIAL_MATCH: RegistryEntry(type="non-llm", default_threshold=0.5),
 }
 
 
@@ -71,30 +70,6 @@ def validate_evaluator_names(evaluator_map: Dict[str, Any]) -> None:
         raise ValueError("\n".join(lines))
 
 
-def check_prerequisites(
-    evaluator_name: str,
-    available_context: Dict[str, bool],
-) -> Tuple[bool, Optional[str]]:
-    """Check if prerequisites for an evaluator are available.
-
-    Returns (True, None) if all prerequisites are met, or
-    (False, warning_message) if a prerequisite is missing.
-    """
-    registry_entry = EVALUATOR_REGISTRY.get(evaluator_name)
-    if not registry_entry:
-        return False, f"Unknown evaluator: {evaluator_name}"
-
-    for req in registry_entry.requires:
-        if not available_context.get(req, False):
-            msg = (
-                f"Skipping evaluator '{evaluator_name}': "
-                f"missing prerequisite '{req}'"
-            )
-            return False, msg
-
-    return True, None
-
-
 def resolve_default_evaluators(file_defaults: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
     """Resolve effective default evaluators, falling back to system defaults.