@microsoft/m365-copilot-eval 1.6.0-preview.1 → 1.7.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -145,7 +145,7 @@ def write_results_to_console(results, agent_name: Optional[str] = None,
             expected_response=turn.get("expected_response", ""),
             evaluators_ran=turn.get("evaluators_ran", []),
             item_results=turn.get("results", {}),
-            error=turn.get("error"),
+            error=_format_error_object(turn.get("error")),
         )
         print()
         print(f"{BOLD}{MAGENTA}Thread {i} Summary:{RESET}")
@@ -159,140 +159,156 @@ def write_results_to_console(results, agent_name: Optional[str] = None,
             expected_response=result.get('expected_response', ''),
             evaluators_ran=result.get('evaluators_ran', []),
             item_results=result.get('results', {}),
-            error=result.get('errorDetails'),
+            error=_format_error_object(result.get('error')),
         )
         print(f"{BLUE}{'-' * 30}{RESET}")


-def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
-    """Extract an EvalScore object from a decorated metric dict.
+def _format_error_object(error_obj: Optional[Dict[str, str]]) -> str:
+    """Flatten an ErrorObject ``{code, message}`` to ``"code: message"`` for one-line
+    contexts (console summary, CSV cell). Empty string when absent."""
+    if not error_obj:
+        return ""
+    return f"{error_obj['code']}: {error_obj['message']}"
+
+
+def _as_errored_score(data: dict) -> Optional[Dict[str, Any]]:
+    """If ``data`` is an errored entry, return its ErroredScore dict; else None."""
+    if data.get("result") == STATUS_ERROR and isinstance(data.get("error"), str):
+        return {"result": STATUS_ERROR, "error": data["error"]}
+    return None

-    Maps internal decorated-metric format to schema EvalScore:
-    {score, result, threshold} (required) + reason, evaluator (optional).
-    """
-    score_val = None
-    if metric_id in data and isinstance(data[metric_id], (int, float)):
-        score_val = data[metric_id]
-    if score_val is None:
-        return None

+# ── Per-evaluator-type valid-shape builders ─────────────────────────
+# Each takes a decorated metric dict and returns the schema-compliant valid
+# variant. They never see errored entries — _convert_scores_to_schema's loop
+# handles ErroredScore dispatch before reaching these.
+
+
+def _build_eval_score(data: dict, metric_id: str) -> Optional[Dict[str, Any]]:
+    """Standard 1-5 score: {score, result, threshold, reason?}. None if no numeric score."""
+    score_val = data.get(metric_id)
+    if not isinstance(score_val, (int, float)):
+        return None
     result = data.get("result")
     if result not in (STATUS_PASS, STATUS_FAIL):
         result = STATUS_PASS if score_val >= data.get("threshold", DEFAULT_PASS_THRESHOLD) else STATUS_FAIL
-
-    eval_score: Dict[str, Any] = {
+    out: Dict[str, Any] = {
         "score": score_val,
         "result": result,
         "threshold": data.get("threshold", DEFAULT_PASS_THRESHOLD),
     }
     reason = data.get(f"{metric_id}_reason") or data.get("reason")
     if reason:
-        eval_score["reason"] = reason
-    return eval_score
+        out["reason"] = reason
+    return out
+
+
+def _build_citation_score(data: dict, _metric_id: str) -> Dict[str, Any]:
+    count = data.get("citations", 0)
+    result = data.get("result")
+    if result not in (STATUS_PASS, STATUS_FAIL):
+        result = STATUS_PASS if count >= data.get("threshold", 1) else STATUS_FAIL
+    out: Dict[str, Any] = {
+        "count": count,
+        "result": result,
+        "threshold": data.get("threshold", 1),
+    }
+    if "citation_format" in data:
+        out["format"] = data["citation_format"]
+    return out
+
+
+def _build_exact_match_score(data: dict, _metric_id: str) -> Dict[str, Any]:
+    is_match = data.get("exact_match", 0.0) == 1.0
+    return {
+        "match": is_match,
+        "result": data.get("result", STATUS_PASS if is_match else STATUS_FAIL),
+        "reason": data.get("exact_match_reason", ""),
+    }
+
+
+def _build_partial_match_score(data: dict, _metric_id: str) -> Dict[str, Any]:
+    return {
+        "score": data.get("partial_match", 0.0),
+        "result": data.get("result", STATUS_FAIL),
+        "threshold": data.get("threshold", 0.5),
+        "reason": data.get("partial_match_reason", ""),
+    }
+
+
+# Internal evaluator name → (schema-output key, valid-shape builder).
+_SCORE_CONVERTERS = (
+    (RELEVANCE, "relevance", _build_eval_score),
+    (COHERENCE, "coherence", _build_eval_score),
+    (GROUNDEDNESS, "groundedness", _build_eval_score),
+    (SIMILARITY, "similarity", _build_eval_score),
+    (TOOL_CALL_ACCURACY, "toolCallAccuracy", _build_eval_score),
+    (CITATIONS, "citations", _build_citation_score),
+    (EXACT_MATCH, "exactMatch", _build_exact_match_score),
+    (PARTIAL_MATCH, "partialMatch", _build_partial_match_score),
+)
+
+
+def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
+    """Extract a schema-compliant EvalScore from a decorated metric dict.
+
+    Returns ErroredScore for crashes, the standard 1-5 score shape on success,
+    or None if no usable numeric score.
+    """
+    errored = _as_errored_score(data)
+    if errored is not None:
+        return errored
+    return _build_eval_score(data, metric_id)


 def _convert_scores_to_schema(results_dict: Dict[str, Any]) -> Dict[str, Any]:
     """Convert raw evaluator results to schema-compliant score objects.

-    Evaluator results in results_dict are dicts (from _decorate_metric) or
-    None when skipped/crashed. None values are omitted from output.
+    Each value in results_dict is either a decorated metric dict (valid score)
+    or an errored entry ``{result: "error", error}``. Errored entries pass
+    through unchanged as ErroredScore. Evaluators not present in results_dict
+    are omitted from the output.
     """
     scores: Dict[str, Any] = {}
-
-    for eval_key, schema_key in [
-        (RELEVANCE, "relevance"),
-        (COHERENCE, "coherence"),
-        (GROUNDEDNESS, "groundedness"),
-        (SIMILARITY, "similarity"),
-        (TOOL_CALL_ACCURACY, "toolCallAccuracy"),
-    ]:
+    for eval_key, schema_key, build_valid_score in _SCORE_CONVERTERS:
         data = results_dict.get(eval_key)
         if data is None:
             continue
-        eval_score = extract_eval_score(data, METRIC_IDS[eval_key])
-        if eval_score:
-            scores[schema_key] = eval_score
-
-    data = results_dict.get(CITATIONS)
-    if data is not None:
-        count = data.get("citations", 0)
-        cit_result = data.get("result")
-        if cit_result not in (STATUS_PASS, STATUS_FAIL):
-            cit_result = STATUS_PASS if count >= data.get("threshold", 1) else STATUS_FAIL
-        citation_score: Dict[str, Any] = {
-            "count": count,
-            "result": cit_result,
-            "threshold": data.get("threshold", 1),
-        }
-        if "citation_format" in data:
-            citation_score["format"] = data["citation_format"]
-        scores["citations"] = citation_score
-
-    data = results_dict.get(EXACT_MATCH)
-    if data is not None:
-        is_match = data.get("exact_match", 0.0) == 1.0
-        scores["exactMatch"] = {
-            "match": is_match,
-            "result": data.get("result", STATUS_PASS if is_match else STATUS_FAIL),
-            "reason": data.get("exact_match_reason", ""),
-        }
-
-    data = results_dict.get(PARTIAL_MATCH)
-    if data is not None:
-        scores["partialMatch"] = {
-            "score": data.get("partial_match", 0.0),
-            "result": data.get("result", STATUS_FAIL),
-            "threshold": data.get("threshold", 0.5),
-            "reason": data.get("partial_match_reason", ""),
-        }
-
+        errored = _as_errored_score(data)
+        if errored is not None:
+            scores[schema_key] = errored
+            continue
+        valid = build_valid_score(data, METRIC_IDS.get(eval_key, eval_key.lower()))
+        if valid is not None:
+            scores[schema_key] = valid
     return scores


-def convert_result_to_eval_item(result: Dict) -> Dict:
-    """Convert an internal evaluation result dict to a schema-compliant EvalItem."""
-    item: Dict[str, Any] = {
-        "prompt": result["prompt"],
-        "response": result["response"],
-        "expected_response": result["expected_response"],
-    }
-
-    if "evaluators" in result:
-        item["evaluators"] = result["evaluators"]
-    if "evaluators_mode" in result:
-        item["evaluators_mode"] = result["evaluators_mode"]
+def convert_single_item_result_to_output(source: Dict) -> Dict[str, Any]:
+    """Convert a single item result (a single-turn item OR one turn inside
+    a multi-turn thread) to its schema-compliant output shape.

-    scores = _convert_scores_to_schema(result.get("results", {}))
+    Common shape: prompt, expected_response?, response?, evaluators?,
+    evaluators_mode?, scores?, status?, error?. Optional fields are emitted
+    only when present on the source.
+    """
+    out: Dict[str, Any] = {"prompt": source.get("prompt", "")}
+    for key in ("expected_response", "response", "evaluators", "evaluators_mode"):
+        if key in source:
+            out[key] = source[key]
+    scores = _convert_scores_to_schema(source.get("results", {}))
     if scores:
-        item["scores"] = scores
-
-    return item
+        out["scores"] = scores
+    if "status" in source:
+        out["status"] = source["status"]
+    if "error" in source:
+        out["error"] = source["error"]
+    return out


 def convert_thread_result_to_output(thread_result: Dict) -> Dict:
-    """Convert a multi-turn thread result to the output format."""
-    output_turns = []
-    for turn in thread_result.get("turns", []):
-        output_turn: Dict[str, Any] = {"prompt": turn.get("prompt", "")}
-        if "expected_response" in turn:
-            output_turn["expected_response"] = turn["expected_response"]
-        if "response" in turn:
-            output_turn["response"] = turn["response"]
-        if "status" in turn:
-            output_turn["status"] = turn["status"]
-        if "error" in turn:
-            output_turn["error"] = turn["error"]
-        if "evaluators" in turn:
-            output_turn["evaluators"] = turn["evaluators"]
-        if "evaluators_mode" in turn:
-            output_turn["evaluators_mode"] = turn["evaluators_mode"]
-
-        scores = _convert_scores_to_schema(turn.get("results", {}))
-        if scores:
-            output_turn["scores"] = scores
-
-        output_turns.append(output_turn)
-
+    """Convert a multi-turn thread result to a schema-compliant ThreadOutput."""
     output: Dict[str, Any] = {}
     if thread_result.get("name"):
         output["name"] = thread_result["name"]
@@ -300,18 +316,17 @@ def convert_thread_result_to_output(thread_result: Dict) -> Dict:
         output["description"] = thread_result["description"]
     if thread_result.get("conversation_id"):
         output["conversation_id"] = thread_result["conversation_id"]
-    output["turns"] = output_turns
+    output["turns"] = [convert_single_item_result_to_output(t) for t in thread_result.get("turns", [])]
     if thread_result.get("summary"):
         output["summary"] = thread_result["summary"]
-
     return output


 def convert_result_to_output_item(result: Dict) -> Dict:
-    """Convert an internal result dict to an output item. Routes by type."""
+    """Top-level dispatch: routes a result dict by item type to the right converter."""
     if result.get("type") == "multi_turn":
         return convert_thread_result_to_output(result)
-    return convert_result_to_eval_item(result)
+    return convert_single_item_result_to_output(result)


 def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None,
@@ -395,12 +410,14 @@ def write_results_to_csv(results: List[Dict], output_file: str,
         total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
         if total_items > 1:
             f.write("# AGGREGATE STATISTICS\n")
-            f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
+            f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Errored,Avg Score,Threshold\n")
             for metric_name, stats in aggregates.items():
-                threshold_str = str(stats.get('threshold', 'N/A'))
+                threshold_val = stats.get('threshold')
+                threshold_str = "N/A" if threshold_val is None else str(threshold_val)
                 prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
                 total_prompts = stats.get('total_prompts', total_items)
-                f.write(f"{metric_name},{prompts_evaluated},{total_prompts},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
+                error_count = stats.get('error_count', 0)
+                f.write(f"{metric_name},{prompts_evaluated},{total_prompts},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{error_count},{stats['avg_score']:.2f},{threshold_str}\n")
             f.write("\n# INDIVIDUAL RESULTS\n")

         single_turn_rows = []
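
With the new Errored column the aggregate block gains one field per row; a hypothetical rendering (the header is verbatim from the change above, the data values are invented):

    # AGGREGATE STATISTICS
    Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Errored,Avg Score,Threshold
    relevance,9,10,77.8,7,2,1,3.85,3
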
@@ -416,7 +433,7 @@ def write_results_to_csv(results: List[Dict], output_file: str,
                     "response": turn.get("response", ""),
                     "expected_response": turn.get("expected_response", ""),
                     "status": turn.get("status", ""),
-                    "error": turn.get("error", ""),
+                    "error": _format_error_object(turn.get("error")),
                     "scores": _results_to_csv_json(turn.get("results", {})),
                 })
             summary = result.get("summary", {})
@@ -432,6 +449,8 @@ def write_results_to_csv(results: List[Dict], output_file: str,
         else:
             exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode', '_enhanced_response', 'results'}
             row = {k: v for k, v in result.items() if k not in exclude_keys}
+            if "error" in row:
+                row["error"] = _format_error_object(row["error"])
             if "results" in result:
                 row["scores"] = _results_to_csv_json(result["results"])
             single_turn_rows.append(row)
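
End to end, the new converter table routes each entry either through its valid-shape builder or straight through as an ErroredScore; a self-contained sketch (the constant values are placeholders for the ones defined in the package's common module):

    # Placeholder constant values; the real ones come from common/METRIC_IDS.
    STATUS_PASS, STATUS_FAIL, STATUS_ERROR = "pass", "fail", "error"
    RELEVANCE, GROUNDEDNESS = "relevance", "groundedness"
    METRIC_IDS = {RELEVANCE: "relevance", GROUNDEDNESS: "groundedness"}

    results_dict = {
        RELEVANCE: {"relevance": 4, "result": STATUS_PASS, "threshold": 3},
        GROUNDEDNESS: {"result": STATUS_ERROR, "error": "LLM call timed out"},
    }
    # _convert_scores_to_schema(results_dict) would yield:
    # {"relevance": {"score": 4, "result": "pass", "threshold": 3},
    #  "groundedness": {"result": "error", "error": "LLM call timed out"}}
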
@@ -0,0 +1,91 @@
+"""Pure status-derivation and rollup helpers — the single source of truth for
+turn-level and thread-level status under the v1.4.0 unified-error-reporting design.
+
+Two functions, both pure:
+
+* :func:`status_for_response` — turn/item-level status + optional top-level
+  error summary, computed from per-evaluator results when the agent responded.
+* :func:`rollup_thread_status` — thread-level rollup over per-turn statuses.
+
+The "no response obtained" case (turn/item ``status="error"``) is handled
+inline at the agent-failure sites in :mod:`evaluation_runner`, which set
+``status=STATUS_ERROR`` and build the cause object directly via
+:func:`error_messages.agent_request_failed` or
+:func:`error_messages.turn_skipped`. This module only covers the
+response-obtained side.
+
+See research.md §R4 for the canonical pseudocode and quickstart.md §2.6 for the
+exhaustive test matrix these functions must satisfy.
+"""
+
+from __future__ import annotations
+
+from typing import Optional, Sequence, Tuple
+
+from common import STATUS_ERROR, STATUS_FAIL, STATUS_PARTIAL, STATUS_PASS
+from error_messages import ErrorObject, evaluators_failed_summary
+
+
+def status_for_response(
+    evaluator_results: Sequence[str],
+) -> Tuple[str, Optional[ErrorObject]]:
+    """Compute (status, optional summary error) for a turn/item where the agent responded.
+
+    Args:
+        evaluator_results: Per-evaluator 'result' values, each in {'pass', 'fail', 'error'}.
+
+    Returns:
+        A (status, error) tuple where status is one of 'pass', 'fail', or 'partial':
+
+        * 'pass' — every evaluator returned 'pass', OR no evaluators ran
+          (vacuous truth — items with no evaluators pass by default). error is None.
+        * 'partial' — at least one evaluator returned 'error'. Error takes
+          priority over pass/fail; a turn with one passing evaluator and one
+          errored evaluator is 'partial' regardless of the others. error is the
+          evaluatorsFailed summary
+          {code, message: 'Agent response obtained. N of M evaluators failed to run.'}.
+        * 'fail' — every evaluator ran successfully (no errors) AND at least
+          one returned 'fail'. Covers uniform-fail and pass+fail mixes. error is None.
+
+    Status 'error' is never returned — the caller handles the no-response
+    case directly.
+    """
+    uniques = set(evaluator_results)
+    if not uniques or uniques == {STATUS_PASS}:
+        return STATUS_PASS, None
+    error_count = sum(1 for r in evaluator_results if r == STATUS_ERROR)
+    if error_count > 0:
+        return STATUS_PARTIAL, evaluators_failed_summary(error_count, len(evaluator_results))
+    return STATUS_FAIL, None
+
+
+def rollup_thread_status(turn_statuses: Sequence[str]) -> str:
+    """Compute a thread-level overall_status from the per-turn statuses.
+
+    Priority rules:
+
+    1. Any errored turn → thread 'error' (the run didn't complete).
+    2. Else, any partial turn → thread 'partial'.
+    3. Else, all turns 'pass' → 'pass'.
+    4. Else, all turns 'fail' → 'fail'.
+    5. Else (mix of pass and fail at thread level) → 'partial'.
+
+    Note that rule 5 (pass+fail mix → 'partial') does not match
+    status_for_response at the per-turn level, where a pass+fail mix among
+    evaluators yields 'fail'. The thread-level rule preserves existing
+    behaviour; the mismatch is known and deferred for revisit.
+
+    As a defensive fallback, an empty sequence returns 'error'.
+    """
+    if not turn_statuses:
+        return STATUS_ERROR
+    if STATUS_ERROR in turn_statuses:
+        return STATUS_ERROR
+    if STATUS_PARTIAL in turn_statuses:
+        return STATUS_PARTIAL
+    uniques = set(turn_statuses)
+    if uniques == {STATUS_PASS}:
+        return STATUS_PASS
+    if uniques == {STATUS_FAIL}:
+        return STATUS_FAIL
+    return STATUS_PARTIAL
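
Worked calls for the rules above; the return values follow directly from the two docstrings, assuming the status constants are the plain strings 'pass'/'fail'/'partial'/'error':

    status_for_response(["pass", "pass"])   # -> ("pass", None)
    status_for_response([])                 # -> ("pass", None)   vacuous truth
    status_for_response(["pass", "fail"])   # -> ("fail", None)
    status_for_response(["pass", "error"])  # -> ("partial", <evaluatorsFailed summary>)

    rollup_thread_status(["pass", "fail"])   # -> "partial"  (rule 5)
    rollup_thread_status(["fail", "error"])  # -> "error"    (rule 1 wins)
    rollup_thread_status([])                 # -> "error"    (defensive fallback)
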
@@ -2,7 +2,7 @@
  * Build-time injected default values
  * DO NOT EDIT - This file is auto-generated during build.
  *
- * Generated: 2026-05-07T22:53:22.056Z
+ * Generated: 2026-05-14T18:32:53.816Z
  *
  * @copyright Microsoft Corporation. All rights reserved.
  * @license MIT
@@ -19,6 +19,23 @@ const AGENT_ID_ALIASES = [
   { key: 'M365_TITLE_ID', transform: (v) => `${v}.declarativeAgent` },
 ];

+// Aliases resolved into TENANT_ID (first match wins)
+const TENANT_ID_ALIASES = [{ key: 'TEAMS_APP_TENANT_ID' }];
+
+function _resolveAliases(targetKey, aliases, envVars) {
+  if (envVars[targetKey]) return;
+  for (const alias of aliases) {
+    if (envVars[alias.key]) {
+      const resolved = alias.transform
+        ? alias.transform(envVars[alias.key])
+        : envVars[alias.key];
+      envVars[targetKey] = resolved;
+      process.env[targetKey] = resolved;
+      break;
+    }
+  }
+}
+
 /**
  * Load environment variables from a .env-style file.
  * Uses dotenv.parse() for standards-compliant parsing (handles quoted values,
@@ -71,19 +88,9 @@ export function _loadEnvFile(envFilePath) {
     return null;
   }

-  // Resolve agent ID aliases into M365_AGENT_ID (first match wins)
-  if (!envVars['M365_AGENT_ID']) {
-    for (const alias of AGENT_ID_ALIASES) {
-      if (envVars[alias.key]) {
-        const agentId = alias.transform
-          ? alias.transform(envVars[alias.key])
-          : envVars[alias.key];
-        envVars['M365_AGENT_ID'] = agentId;
-        process.env['M365_AGENT_ID'] = agentId;
-        break;
-      }
-    }
-  }
+  // Resolve aliases into canonical keys (first match wins)
+  _resolveAliases('M365_AGENT_ID', AGENT_ID_ALIASES, envVars);
+  _resolveAliases('TENANT_ID', TENANT_ID_ALIASES, envVars);

   return envVars;
 }
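
For readers skimming the JS, the extracted helper's first-match-wins contract in a hypothetical Python mirror (the shipped implementation is the `_resolveAliases` function above; this sketch is for illustration only):

    # Hypothetical Python mirror of _resolveAliases.
    def resolve_aliases(target_key, aliases, env_vars):
        if env_vars.get(target_key):
            return  # canonical key already set; aliases never override it
        for alias in aliases:
            value = env_vars.get(alias["key"])
            if value:
                transform = alias.get("transform")
                env_vars[target_key] = transform(value) if transform else value
                break  # first match wins

    env = {"TEAMS_APP_TENANT_ID": "00000000-0000-0000-0000-000000000000"}
    resolve_aliases("TENANT_ID", [{"key": "TEAMS_APP_TENANT_ID"}], env)
    # env["TENANT_ID"] now mirrors TEAMS_APP_TENANT_ID; the process.env
    # mutation from the JS version is omitted as a Node-side concern.
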