@microsoft/m365-copilot-eval 1.6.0-preview.1 → 1.7.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -1
- package/package.json +2 -2
- package/schema/v1/eval-document.schema.json +144 -333
- package/schema/v1/examples/invalid/error-result-with-score.json +16 -0
- package/schema/v1/examples/invalid/missing-error-on-error.json +13 -0
- package/schema/v1/examples/valid/multi-turn-output.json +2 -0
- package/schema/v1/examples/valid/scenarios-with-mixed-errors.json +239 -0
- package/src/clients/cli/common.py +8 -14
- package/src/clients/cli/error_messages.py +91 -0
- package/src/clients/cli/evaluation_runner.py +108 -97
- package/src/clients/cli/evaluator_resolver.py +8 -33
- package/src/clients/cli/generate_report.py +125 -96
- package/src/clients/cli/readme.md +1 -1
- package/src/clients/cli/result_writer.py +129 -110
- package/src/clients/cli/status_derivation.py +91 -0
- package/src/clients/node-js/config/default.js +1 -1
- package/src/clients/node-js/lib/env-loader.js +20 -13
package/src/clients/cli/result_writer.py

@@ -145,7 +145,7 @@ def write_results_to_console(results, agent_name: Optional[str] = None,
                 expected_response=turn.get("expected_response", ""),
                 evaluators_ran=turn.get("evaluators_ran", []),
                 item_results=turn.get("results", {}),
-                error=turn.get("error"),
+                error=_format_error_object(turn.get("error")),
             )
             print()
             print(f"{BOLD}{MAGENTA}Thread {i} Summary:{RESET}")
@@ -159,140 +159,156 @@ def write_results_to_console(results, agent_name: Optional[str] = None,
             expected_response=result.get('expected_response', ''),
             evaluators_ran=result.get('evaluators_ran', []),
             item_results=result.get('results', {}),
-            error=result.get('error'),
+            error=_format_error_object(result.get('error')),
         )
         print(f"{BLUE}{'-' * 30}{RESET}")


-def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
-    """
+def _format_error_object(error_obj: Optional[Dict[str, str]]) -> str:
+    """Flatten an ErrorObject ``{code, message}`` to ``"code: message"`` for one-line
+    contexts (console summary, CSV cell). Empty string when absent."""
+    if not error_obj:
+        return ""
+    return f"{error_obj['code']}: {error_obj['message']}"
+
+
+def _as_errored_score(data: dict) -> Optional[Dict[str, Any]]:
+    """If ``data`` is an errored entry, return its ErroredScore dict; else None."""
+    if data.get("result") == STATUS_ERROR and isinstance(data.get("error"), str):
+        return {"result": STATUS_ERROR, "error": data["error"]}
+    return None

-    Maps internal decorated-metric format to schema EvalScore:
-    {score, result, threshold} (required) + reason, evaluator (optional).
-    """
-    score_val = None
-    if metric_id in data and isinstance(data[metric_id], (int, float)):
-        score_val = data[metric_id]
-    if score_val is None:
-        return None

+# ── Per-evaluator-type valid-shape builders ─────────────────────────
+# Each takes a decorated metric dict and returns the schema-compliant valid
+# variant. They never see errored entries — _convert_scores_to_schema's loop
+# handles ErroredScore dispatch before reaching these.
+
+
+def _build_eval_score(data: dict, metric_id: str) -> Optional[Dict[str, Any]]:
+    """Standard 1-5 score: {score, result, threshold, reason?}. None if no numeric score."""
+    score_val = data.get(metric_id)
+    if not isinstance(score_val, (int, float)):
+        return None
     result = data.get("result")
     if result not in (STATUS_PASS, STATUS_FAIL):
         result = STATUS_PASS if score_val >= data.get("threshold", DEFAULT_PASS_THRESHOLD) else STATUS_FAIL
-
-    eval_score: Dict[str, Any] = {
+    out: Dict[str, Any] = {
         "score": score_val,
         "result": result,
         "threshold": data.get("threshold", DEFAULT_PASS_THRESHOLD),
     }
     reason = data.get(f"{metric_id}_reason") or data.get("reason")
     if reason:
-        eval_score["reason"] = reason
-    return eval_score
+        out["reason"] = reason
+    return out
+
+
+def _build_citation_score(data: dict, _metric_id: str) -> Dict[str, Any]:
+    count = data.get("citations", 0)
+    result = data.get("result")
+    if result not in (STATUS_PASS, STATUS_FAIL):
+        result = STATUS_PASS if count >= data.get("threshold", 1) else STATUS_FAIL
+    out: Dict[str, Any] = {
+        "count": count,
+        "result": result,
+        "threshold": data.get("threshold", 1),
+    }
+    if "citation_format" in data:
+        out["format"] = data["citation_format"]
+    return out
+
+
+def _build_exact_match_score(data: dict, _metric_id: str) -> Dict[str, Any]:
+    is_match = data.get("exact_match", 0.0) == 1.0
+    return {
+        "match": is_match,
+        "result": data.get("result", STATUS_PASS if is_match else STATUS_FAIL),
+        "reason": data.get("exact_match_reason", ""),
+    }
+
+
+def _build_partial_match_score(data: dict, _metric_id: str) -> Dict[str, Any]:
+    return {
+        "score": data.get("partial_match", 0.0),
+        "result": data.get("result", STATUS_FAIL),
+        "threshold": data.get("threshold", 0.5),
+        "reason": data.get("partial_match_reason", ""),
+    }
+
+
+# Internal evaluator name → (schema-output key, valid-shape builder).
+_SCORE_CONVERTERS = (
+    (RELEVANCE, "relevance", _build_eval_score),
+    (COHERENCE, "coherence", _build_eval_score),
+    (GROUNDEDNESS, "groundedness", _build_eval_score),
+    (SIMILARITY, "similarity", _build_eval_score),
+    (TOOL_CALL_ACCURACY, "toolCallAccuracy", _build_eval_score),
+    (CITATIONS, "citations", _build_citation_score),
+    (EXACT_MATCH, "exactMatch", _build_exact_match_score),
+    (PARTIAL_MATCH, "partialMatch", _build_partial_match_score),
+)
+
+
+def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
+    """Extract a schema-compliant EvalScore from a decorated metric dict.
+
+    Returns ErroredScore for crashes, the standard 1-5 score shape on success,
+    or None if no usable numeric score.
+    """
+    errored = _as_errored_score(data)
+    if errored is not None:
+        return errored
+    return _build_eval_score(data, metric_id)


 def _convert_scores_to_schema(results_dict: Dict[str, Any]) -> Dict[str, Any]:
     """Convert raw evaluator results to schema-compliant score objects.

-
-
+    Each value in results_dict is either a decorated metric dict (valid score)
+    or an errored entry ``{result: "error", error}``. Errored entries pass
+    through unchanged as ErroredScore. Evaluators not present in results_dict
+    are omitted from the output.
     """
     scores: Dict[str, Any] = {}
-
-    for eval_key, schema_key in [
-        (RELEVANCE, "relevance"),
-        (COHERENCE, "coherence"),
-        (GROUNDEDNESS, "groundedness"),
-        (SIMILARITY, "similarity"),
-        (TOOL_CALL_ACCURACY, "toolCallAccuracy"),
-    ]:
+    for eval_key, schema_key, build_valid_score in _SCORE_CONVERTERS:
         data = results_dict.get(eval_key)
         if data is None:
             continue
-        eval_score = extract_eval_score(data, METRIC_IDS.get(eval_key, eval_key.lower()))
-        if eval_score is not None:
-            scores[schema_key] = eval_score
-
-    data = results_dict.get(CITATIONS)
-    if data is not None:
-        count = data.get("citations", 0)
-        cit_result = data.get("result")
-        if cit_result not in (STATUS_PASS, STATUS_FAIL):
-            cit_result = STATUS_PASS if count >= data.get("threshold", 1) else STATUS_FAIL
-        citation_score: Dict[str, Any] = {
-            "count": count,
-            "result": cit_result,
-            "threshold": data.get("threshold", 1),
-        }
-        if "citation_format" in data:
-            citation_score["format"] = data["citation_format"]
-        scores["citations"] = citation_score
-
-    data = results_dict.get(EXACT_MATCH)
-    if data is not None:
-        is_match = data.get("exact_match", 0.0) == 1.0
-        scores["exactMatch"] = {
-            "match": is_match,
-            "result": data.get("result", STATUS_PASS if is_match else STATUS_FAIL),
-            "reason": data.get("exact_match_reason", ""),
-        }
-
-    data = results_dict.get(PARTIAL_MATCH)
-    if data is not None:
-        scores["partialMatch"] = {
-            "score": data.get("partial_match", 0.0),
-            "result": data.get("result", STATUS_FAIL),
-            "threshold": data.get("threshold", 0.5),
-            "reason": data.get("partial_match_reason", ""),
-        }
-
+        errored = _as_errored_score(data)
+        if errored is not None:
+            scores[schema_key] = errored
+            continue
+        valid = build_valid_score(data, METRIC_IDS.get(eval_key, eval_key.lower()))
+        if valid is not None:
+            scores[schema_key] = valid
     return scores


-def
-    """Convert
-    item: Dict[str, Any] = {
-        "prompt": result["prompt"],
-        "response": result["response"],
-        "expected_response": result["expected_response"],
-    }
-
-    if "evaluators" in result:
-        item["evaluators"] = result["evaluators"]
-    if "evaluators_mode" in result:
-        item["evaluators_mode"] = result["evaluators_mode"]
+def convert_single_item_result_to_output(source: Dict) -> Dict[str, Any]:
+    """Convert a single item result — a single-turn item OR one turn inside
+    a multi-turn thread — to its schema-compliant output shape.

-
+    Common shape: prompt, expected_response?, response?, evaluators?,
+    evaluators_mode?, scores?, status?, error?. Optional fields are emitted
+    only when present on the source.
+    """
+    out: Dict[str, Any] = {"prompt": source.get("prompt", "")}
+    for key in ("expected_response", "response", "evaluators", "evaluators_mode"):
+        if key in source:
+            out[key] = source[key]
+    scores = _convert_scores_to_schema(source.get("results", {}))
     if scores:
-        item["scores"] = scores
-
-    return item
+        out["scores"] = scores
+    if "status" in source:
+        out["status"] = source["status"]
+    if "error" in source:
+        out["error"] = source["error"]
+    return out


 def convert_thread_result_to_output(thread_result: Dict) -> Dict:
-    """Convert a multi-turn thread result to
-    output_turns = []
-    for turn in thread_result.get("turns", []):
-        output_turn: Dict[str, Any] = {"prompt": turn.get("prompt", "")}
-        if "expected_response" in turn:
-            output_turn["expected_response"] = turn["expected_response"]
-        if "response" in turn:
-            output_turn["response"] = turn["response"]
-        if "status" in turn:
-            output_turn["status"] = turn["status"]
-        if "error" in turn:
-            output_turn["error"] = turn["error"]
-        if "evaluators" in turn:
-            output_turn["evaluators"] = turn["evaluators"]
-        if "evaluators_mode" in turn:
-            output_turn["evaluators_mode"] = turn["evaluators_mode"]
-
-        scores = _convert_scores_to_schema(turn.get("results", {}))
-        if scores:
-            output_turn["scores"] = scores
-
-        output_turns.append(output_turn)
-
+    """Convert a multi-turn thread result to a schema-compliant ThreadOutput."""
     output: Dict[str, Any] = {}
     if thread_result.get("name"):
         output["name"] = thread_result["name"]
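The net effect of the refactor is easiest to see end to end. Below is a minimal, self-contained sketch (not the package's module: the status constants, the `DEFAULT_PASS_THRESHOLD` value, and the sample error string are assumptions for illustration) of how the new loop routes an errored entry past the valid-shape builders while a decorated metric still becomes a standard EvalScore:

```python
# Sketch of the new dispatch; constants and builders trimmed to what the diff shows.
from typing import Any, Dict, Optional

STATUS_PASS, STATUS_FAIL, STATUS_ERROR = "pass", "fail", "error"
DEFAULT_PASS_THRESHOLD = 3  # assumed value, for illustration only

def _as_errored_score(data: dict) -> Optional[Dict[str, Any]]:
    # Errored entries pass through unchanged as ErroredScore.
    if data.get("result") == STATUS_ERROR and isinstance(data.get("error"), str):
        return {"result": STATUS_ERROR, "error": data["error"]}
    return None

def _build_eval_score(data: dict, metric_id: str) -> Optional[Dict[str, Any]]:
    # Standard 1-5 score shape; None when no numeric score is present.
    score_val = data.get(metric_id)
    if not isinstance(score_val, (int, float)):
        return None
    threshold = data.get("threshold", DEFAULT_PASS_THRESHOLD)
    result = data.get("result")
    if result not in (STATUS_PASS, STATUS_FAIL):
        result = STATUS_PASS if score_val >= threshold else STATUS_FAIL
    return {"score": score_val, "result": result, "threshold": threshold}

# One valid metric and one errored metric, as a runner might emit them:
results = {
    "relevance": {"relevance": 4, "threshold": 3},
    "coherence": {"result": STATUS_ERROR, "error": "evaluator call failed"},
}

scores: Dict[str, Any] = {}
for key, data in results.items():
    errored = _as_errored_score(data)
    scores[key] = errored if errored else _build_eval_score(data, key)

print(scores)
# {'relevance': {'score': 4, 'result': 'pass', 'threshold': 3},
#  'coherence': {'result': 'error', 'error': 'evaluator call failed'}}
```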
@@ -300,18 +316,17 @@ def convert_thread_result_to_output(thread_result: Dict) -> Dict:
         output["description"] = thread_result["description"]
     if thread_result.get("conversation_id"):
         output["conversation_id"] = thread_result["conversation_id"]
-    output["turns"] = output_turns
+    output["turns"] = [convert_single_item_result_to_output(t) for t in thread_result.get("turns", [])]
     if thread_result.get("summary"):
         output["summary"] = thread_result["summary"]
-
     return output


 def convert_result_to_output_item(result: Dict) -> Dict:
-    """
+    """Top-level dispatch: routes a result dict by item type to the right converter."""
     if result.get("type") == "multi_turn":
         return convert_thread_result_to_output(result)
-    return
+    return convert_single_item_result_to_output(result)


 def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None,
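With the dispatch unified, a single-turn item and each turn of a thread now share one output shape. The following simplified mimic (field values invented, scores handling elided, and the `agentRequestFailed` code string a hypothetical example rather than the exact value emitted by error_messages.py) shows the routing:

```python
from typing import Any, Dict

def convert_single_item(source: Dict) -> Dict[str, Any]:
    # Simplified mimic of convert_single_item_result_to_output:
    # prompt always present, optional fields only when on the source.
    out: Dict[str, Any] = {"prompt": source.get("prompt", "")}
    for key in ("expected_response", "response", "evaluators",
                "evaluators_mode", "status", "error"):
        if key in source:
            out[key] = source[key]
    return out

def convert_item(result: Dict) -> Dict:
    # Simplified mimic of convert_result_to_output_item's type dispatch.
    if result.get("type") == "multi_turn":
        return {"turns": [convert_single_item(t) for t in result.get("turns", [])]}
    return convert_single_item(result)

# A single-turn item and a one-turn thread yield the same per-item shape:
print(convert_item({"prompt": "q", "response": "a", "status": "pass"}))
print(convert_item({"type": "multi_turn", "turns": [
    {"prompt": "q1", "status": "error",
     "error": {"code": "agentRequestFailed",   # hypothetical code value
               "message": "No response obtained from the agent."}}]}))
```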
@@ -395,12 +410,14 @@ def write_results_to_csv(results: List[Dict], output_file: str,
         total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
         if total_items > 1:
             f.write("# AGGREGATE STATISTICS\n")
-            f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
+            f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Errored,Avg Score,Threshold\n")
             for metric_name, stats in aggregates.items():
-
+                threshold_val = stats.get('threshold')
+                threshold_str = "N/A" if threshold_val is None else str(threshold_val)
                 prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
                 total_prompts = stats.get('total_prompts', total_items)
-
+                error_count = stats.get('error_count', 0)
+                f.write(f"{metric_name},{prompts_evaluated},{total_prompts},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{error_count},{stats['avg_score']:.2f},{threshold_str}\n")
             f.write("\n# INDIVIDUAL RESULTS\n")

         single_turn_rows = []
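For instance, a stats dict with one errored prompt renders the widened row like this (a sketch with invented numbers; `relevance` as the metric name is also an assumption):

```python
# Reproducing one aggregate CSV row with the new Errored column.
stats = {"prompts_evaluated": 9, "total_prompts": 10, "pass_rate": 77.8,
         "pass_count": 7, "fail_count": 2, "error_count": 1, "avg_score": 3.90,
         "threshold": 3}
threshold_val = stats.get("threshold")
threshold_str = "N/A" if threshold_val is None else str(threshold_val)
row = (f"relevance,{stats['prompts_evaluated']},{stats['total_prompts']},"
       f"{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},"
       f"{stats.get('error_count', 0)},{stats['avg_score']:.2f},{threshold_str}")
print(row)  # relevance,9,10,77.8,7,2,1,3.90,3
```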
@@ -416,7 +433,7 @@ def write_results_to_csv(results: List[Dict], output_file: str,
                     "response": turn.get("response", ""),
                     "expected_response": turn.get("expected_response", ""),
                     "status": turn.get("status", ""),
-                    "error": turn.get("error", ""),
+                    "error": _format_error_object(turn.get("error")),
                     "scores": _results_to_csv_json(turn.get("results", {})),
                 })
             summary = result.get("summary", {})
@@ -432,6 +449,8 @@ def write_results_to_csv(results: List[Dict], output_file: str,
         else:
             exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode', '_enhanced_response', 'results'}
             row = {k: v for k, v in result.items() if k not in exclude_keys}
+            if "error" in row:
+                row["error"] = _format_error_object(row["error"])
             if "results" in result:
                 row["scores"] = _results_to_csv_json(result["results"])
             single_turn_rows.append(row)
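So the same ErrorObject stays structured in the JSON output but is flattened to one line for console and CSV sinks. The flattening rule is easy to verify in isolation (a standalone copy of the helper; the `turnSkipped` code string is a hypothetical example, not necessarily the exact value produced by error_messages.py):

```python
from typing import Dict, Optional

def format_error_object(error_obj: Optional[Dict[str, str]]) -> str:
    # Standalone copy of _format_error_object, for illustration.
    if not error_obj:
        return ""
    return f"{error_obj['code']}: {error_obj['message']}"

assert format_error_object(None) == ""
assert (format_error_object({"code": "turnSkipped",  # hypothetical code value
                             "message": "Skipped because an earlier turn errored."})
        == "turnSkipped: Skipped because an earlier turn errored.")
```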
package/src/clients/cli/status_derivation.py (new file)

@@ -0,0 +1,91 @@
+"""Pure status-derivation and rollup helpers — the single source of truth for
+turn-level and thread-level status under the v1.4.0 unified-error-reporting design.
+
+Two functions, both pure:
+
+* :func:`status_for_response` — turn/item-level status + optional top-level
+  error summary, computed from per-evaluator results when the agent responded.
+* :func:`rollup_thread_status` — thread-level rollup over per-turn statuses.
+
+The "no response obtained" case (turn/item ``status="error"``) is handled
+inline at the agent-failure sites in :mod:`evaluation_runner`, which set
+``status=STATUS_ERROR`` and build the cause object directly via
+:func:`error_messages.agent_request_failed` or
+:func:`error_messages.turn_skipped`. This module only covers the
+response-obtained side.
+
+See research.md §R4 for the canonical pseudocode and quickstart.md §2.6 for the
+exhaustive test matrix these functions must satisfy.
+"""
+
+from __future__ import annotations
+
+from typing import Optional, Sequence, Tuple
+
+from common import STATUS_ERROR, STATUS_FAIL, STATUS_PARTIAL, STATUS_PASS
+from error_messages import ErrorObject, evaluators_failed_summary
+
+
+def status_for_response(
+    evaluator_results: Sequence[str],
+) -> Tuple[str, Optional[ErrorObject]]:
+    """Compute (status, optional summary error) for a turn/item where the agent responded.
+
+    Args:
+        evaluator_results: Per-evaluator 'result' values, each in {'pass', 'fail', 'error'}.
+
+    Returns:
+        A (status, error) tuple where status is one of 'pass', 'fail', or 'partial':
+
+        * 'pass' — every evaluator returned 'pass', OR no evaluators ran
+          (vacuous truth — items with no evaluators pass by default). error is None.
+        * 'partial' — at least one evaluator returned 'error'. Error takes
+          priority over pass/fail; a turn with one passing evaluator and one
+          errored evaluator is 'partial' regardless of the others. error is the
+          evaluatorsFailed summary
+          {code, message: 'Agent response obtained. N of M evaluators failed to run.'}.
+        * 'fail' — every evaluator ran successfully (no errors) AND at least
+          one returned 'fail'. Covers uniform-fail and pass+fail mixes. error is None.
+
+    Status 'error' is never returned — the caller handles the no-response
+    case directly.
+    """
+    uniques = set(evaluator_results)
+    if not uniques or uniques == {STATUS_PASS}:
+        return STATUS_PASS, None
+    error_count = sum(1 for r in evaluator_results if r == STATUS_ERROR)
+    if error_count > 0:
+        return STATUS_PARTIAL, evaluators_failed_summary(error_count, len(evaluator_results))
+    return STATUS_FAIL, None
+
+
+def rollup_thread_status(turn_statuses: Sequence[str]) -> str:
+    """Compute a thread-level overall_status from the per-turn statuses.
+
+    Priority rules:
+
+    1. Any errored turn → thread 'error' (the run didn't complete).
+    2. Else, any partial turn → thread 'partial'.
+    3. Else, all turns 'pass' → 'pass'.
+    4. Else, all turns 'fail' → 'fail'.
+    5. Else (mix of pass and fail at thread level) → 'partial'.
+
+    Note rule 5 (pass+fail mix → 'partial') does not match status_for_response
+    at the per-turn level (where pass+fail among evaluators yields 'fail').
+    The thread-level rule preserves existing behaviour; the mismatch is known
+    and deferred for revisit.
+
+    As a defensive fallback an empty sequence returns 'error'.
+    """
+    if not turn_statuses:
+        return STATUS_ERROR
+    if STATUS_ERROR in turn_statuses:
+        return STATUS_ERROR
+    if STATUS_PARTIAL in turn_statuses:
+        return STATUS_PARTIAL
+    uniques = set(turn_statuses)
+    if uniques == {STATUS_PASS}:
+        return STATUS_PASS
+    if uniques == {STATUS_FAIL}:
+        return STATUS_FAIL
+    return STATUS_PARTIAL
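A quick sanity check of the documented matrix (a trimmed sketch: the constants are inlined, and `status_for_response` here returns only the status, dropping the summary-error half of the real tuple):

```python
STATUS_PASS, STATUS_FAIL, STATUS_PARTIAL, STATUS_ERROR = (
    "pass", "fail", "partial", "error")

def status_for_response(evaluator_results):
    # Turn-level rule: all-pass or no evaluators -> pass; any error -> partial;
    # otherwise (uniform fail or pass+fail mix) -> fail.
    uniques = set(evaluator_results)
    if not uniques or uniques == {STATUS_PASS}:
        return STATUS_PASS
    if STATUS_ERROR in uniques:
        return STATUS_PARTIAL          # errors take priority over pass/fail
    return STATUS_FAIL

def rollup_thread_status(turn_statuses):
    # Thread-level rule: error > partial > uniform pass/fail > partial mix.
    if not turn_statuses or STATUS_ERROR in turn_statuses:
        return STATUS_ERROR
    if STATUS_PARTIAL in turn_statuses:
        return STATUS_PARTIAL
    uniques = set(turn_statuses)
    if uniques == {STATUS_PASS}:
        return STATUS_PASS
    if uniques == {STATUS_FAIL}:
        return STATUS_FAIL
    return STATUS_PARTIAL              # pass+fail mix at thread level

assert status_for_response([]) == "pass"                    # vacuous truth
assert status_for_response(["pass", "fail"]) == "fail"      # mix -> fail, per turn
assert status_for_response(["pass", "error"]) == "partial"  # error wins
assert rollup_thread_status(["pass", "fail"]) == "partial"  # the known mismatch
assert rollup_thread_status(["pass", "error"]) == "error"
```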
package/src/clients/node-js/lib/env-loader.js

@@ -19,6 +19,23 @@ const AGENT_ID_ALIASES = [
   { key: 'M365_TITLE_ID', transform: (v) => `${v}.declarativeAgent` },
 ];

+// Aliases resolved into TENANT_ID (first match wins)
+const TENANT_ID_ALIASES = [{ key: 'TEAMS_APP_TENANT_ID' }];
+
+function _resolveAliases(targetKey, aliases, envVars) {
+  if (envVars[targetKey]) return;
+  for (const alias of aliases) {
+    if (envVars[alias.key]) {
+      const resolved = alias.transform
+        ? alias.transform(envVars[alias.key])
+        : envVars[alias.key];
+      envVars[targetKey] = resolved;
+      process.env[targetKey] = resolved;
+      break;
+    }
+  }
+}
+
 /**
  * Load environment variables from a .env-style file.
  * Uses dotenv.parse() for standards-compliant parsing (handles quoted values,
@@ -71,19 +88,9 @@ export function _loadEnvFile(envFilePath) {
     return null;
   }

-  // Resolve
-  if (!envVars['M365_AGENT_ID']) {
-    for (const alias of AGENT_ID_ALIASES) {
-      if (envVars[alias.key]) {
-        const agentId = alias.transform
-          ? alias.transform(envVars[alias.key])
-          : envVars[alias.key];
-        envVars['M365_AGENT_ID'] = agentId;
-        process.env['M365_AGENT_ID'] = agentId;
-        break;
-      }
-    }
-  }
+  // Resolve aliases into canonical keys (first match wins)
+  _resolveAliases('M365_AGENT_ID', AGENT_ID_ALIASES, envVars);
+  _resolveAliases('TENANT_ID', TENANT_ID_ALIASES, envVars);

   return envVars;
 }