@microsoft/m365-copilot-eval 1.6.0-preview.1 → 1.7.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -145,7 +145,7 @@ def write_results_to_console(results, agent_name: Optional[str] = None,
             expected_response=turn.get("expected_response", ""),
             evaluators_ran=turn.get("evaluators_ran", []),
             item_results=turn.get("results", {}),
-            error=turn.get("error"),
+            error=_format_error_object(turn.get("error")),
         )
         print()
         print(f"{BOLD}{MAGENTA}Thread {i} Summary:{RESET}")
@@ -159,140 +159,156 @@ def write_results_to_console(results, agent_name: Optional[str] = None,
             expected_response=result.get('expected_response', ''),
             evaluators_ran=result.get('evaluators_ran', []),
             item_results=result.get('results', {}),
-            error=result.get('errorDetails'),
+            error=_format_error_object(result.get('error')),
         )
         print(f"{BLUE}{'-' * 30}{RESET}")


-def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
-    """Extract an EvalScore object from a decorated metric dict.
+def _format_error_object(error_obj: Optional[Dict[str, str]]) -> str:
+    """Flatten an ErrorObject ``{code, message}`` to ``"code: message"`` for one-line
+    contexts (console summary, CSV cell). Empty string when absent."""
+    if not error_obj:
+        return ""
+    return f"{error_obj['code']}: {error_obj['message']}"
+
+
+def _as_errored_score(data: dict) -> Optional[Dict[str, Any]]:
+    """If ``data`` is an errored entry, return its ErroredScore dict; else None."""
+    if data.get("result") == STATUS_ERROR and isinstance(data.get("error"), str):
+        return {"result": STATUS_ERROR, "error": data["error"]}
+    return None

-    Maps internal decorated-metric format to schema EvalScore:
-    {score, result, threshold} (required) + reason, evaluator (optional).
-    """
-    score_val = None
-    if metric_id in data and isinstance(data[metric_id], (int, float)):
-        score_val = data[metric_id]
-    if score_val is None:
-        return None

+# ── Per-evaluator-type valid-shape builders ─────────────────────────
+# Each takes a decorated metric dict and returns the schema-compliant valid
+# variant. They never see errored entries — _convert_scores_to_schema's loop
+# handles ErroredScore dispatch before reaching these.
+
+
+def _build_eval_score(data: dict, metric_id: str) -> Optional[Dict[str, Any]]:
+    """Standard 1-5 score: {score, result, threshold, reason?}. None if no numeric score."""
+    score_val = data.get(metric_id)
+    if not isinstance(score_val, (int, float)):
+        return None
     result = data.get("result")
     if result not in (STATUS_PASS, STATUS_FAIL):
         result = STATUS_PASS if score_val >= data.get("threshold", DEFAULT_PASS_THRESHOLD) else STATUS_FAIL
-
-    eval_score: Dict[str, Any] = {
+    out: Dict[str, Any] = {
         "score": score_val,
         "result": result,
         "threshold": data.get("threshold", DEFAULT_PASS_THRESHOLD),
     }
     reason = data.get(f"{metric_id}_reason") or data.get("reason")
     if reason:
-        eval_score["reason"] = reason
-    return eval_score
+        out["reason"] = reason
+    return out
+
+
+def _build_citation_score(data: dict, _metric_id: str) -> Dict[str, Any]:
+    count = data.get("citations", 0)
+    result = data.get("result")
+    if result not in (STATUS_PASS, STATUS_FAIL):
+        result = STATUS_PASS if count >= data.get("threshold", 1) else STATUS_FAIL
+    out: Dict[str, Any] = {
+        "count": count,
+        "result": result,
+        "threshold": data.get("threshold", 1),
+    }
+    if "citation_format" in data:
+        out["format"] = data["citation_format"]
+    return out
+
+
+def _build_exact_match_score(data: dict, _metric_id: str) -> Dict[str, Any]:
+    is_match = data.get("exact_match", 0.0) == 1.0
+    return {
+        "match": is_match,
+        "result": data.get("result", STATUS_PASS if is_match else STATUS_FAIL),
+        "reason": data.get("exact_match_reason", ""),
+    }
+
+
+def _build_partial_match_score(data: dict, _metric_id: str) -> Dict[str, Any]:
+    return {
+        "score": data.get("partial_match", 0.0),
+        "result": data.get("result", STATUS_FAIL),
+        "threshold": data.get("threshold", 0.5),
+        "reason": data.get("partial_match_reason", ""),
+    }
+
+
+# Internal evaluator name → (schema-output key, valid-shape builder).
+_SCORE_CONVERTERS = (
+    (RELEVANCE, "relevance", _build_eval_score),
+    (COHERENCE, "coherence", _build_eval_score),
+    (GROUNDEDNESS, "groundedness", _build_eval_score),
+    (SIMILARITY, "similarity", _build_eval_score),
+    (TOOL_CALL_ACCURACY, "toolCallAccuracy", _build_eval_score),
+    (CITATIONS, "citations", _build_citation_score),
+    (EXACT_MATCH, "exactMatch", _build_exact_match_score),
+    (PARTIAL_MATCH, "partialMatch", _build_partial_match_score),
+)
+
+
+def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
+    """Extract a schema-compliant EvalScore from a decorated metric dict.
+
+    Returns ErroredScore for crashes, the standard 1-5 score shape on success,
+    or None if no usable numeric score.
+    """
+    errored = _as_errored_score(data)
+    if errored is not None:
+        return errored
+    return _build_eval_score(data, metric_id)


 def _convert_scores_to_schema(results_dict: Dict[str, Any]) -> Dict[str, Any]:
     """Convert raw evaluator results to schema-compliant score objects.

-    Evaluator results in results_dict are dicts (from _decorate_metric) or
-    None when skipped/crashed. None values are omitted from output.
+    Each value in results_dict is either a decorated metric dict (valid score)
+    or an errored entry ``{result: "error", error}``. Errored entries pass
+    through unchanged as ErroredScore. Evaluators not present in results_dict
+    are omitted from the output.
     """
     scores: Dict[str, Any] = {}
-
-    for eval_key, schema_key in [
-        (RELEVANCE, "relevance"),
-        (COHERENCE, "coherence"),
-        (GROUNDEDNESS, "groundedness"),
-        (SIMILARITY, "similarity"),
-        (TOOL_CALL_ACCURACY, "toolCallAccuracy"),
-    ]:
+    for eval_key, schema_key, build_valid_score in _SCORE_CONVERTERS:
         data = results_dict.get(eval_key)
         if data is None:
             continue
-        eval_score = extract_eval_score(data, METRIC_IDS[eval_key])
-        if eval_score:
-            scores[schema_key] = eval_score
-
-    data = results_dict.get(CITATIONS)
-    if data is not None:
-        count = data.get("citations", 0)
-        cit_result = data.get("result")
-        if cit_result not in (STATUS_PASS, STATUS_FAIL):
-            cit_result = STATUS_PASS if count >= data.get("threshold", 1) else STATUS_FAIL
-        citation_score: Dict[str, Any] = {
-            "count": count,
-            "result": cit_result,
-            "threshold": data.get("threshold", 1),
-        }
-        if "citation_format" in data:
-            citation_score["format"] = data["citation_format"]
-        scores["citations"] = citation_score
-
-    data = results_dict.get(EXACT_MATCH)
-    if data is not None:
-        is_match = data.get("exact_match", 0.0) == 1.0
-        scores["exactMatch"] = {
-            "match": is_match,
-            "result": data.get("result", STATUS_PASS if is_match else STATUS_FAIL),
-            "reason": data.get("exact_match_reason", ""),
-        }
-
-    data = results_dict.get(PARTIAL_MATCH)
-    if data is not None:
-        scores["partialMatch"] = {
-            "score": data.get("partial_match", 0.0),
-            "result": data.get("result", STATUS_FAIL),
-            "threshold": data.get("threshold", 0.5),
-            "reason": data.get("partial_match_reason", ""),
-        }
-
+        errored = _as_errored_score(data)
+        if errored is not None:
+            scores[schema_key] = errored
+            continue
+        valid = build_valid_score(data, METRIC_IDS.get(eval_key, eval_key.lower()))
+        if valid is not None:
+            scores[schema_key] = valid
     return scores


-def convert_result_to_eval_item(result: Dict) -> Dict:
-    """Convert an internal evaluation result dict to a schema-compliant EvalItem."""
-    item: Dict[str, Any] = {
-        "prompt": result["prompt"],
-        "response": result["response"],
-        "expected_response": result["expected_response"],
-    }
-
-    if "evaluators" in result:
-        item["evaluators"] = result["evaluators"]
-    if "evaluators_mode" in result:
-        item["evaluators_mode"] = result["evaluators_mode"]
+def convert_single_item_result_to_output(source: Dict) -> Dict[str, Any]:
+    """Convert a single item result (a single-turn item OR one turn inside
+    a multi-turn thread) to its schema-compliant output shape.

-    scores = _convert_scores_to_schema(result.get("results", {}))
+    Common shape: prompt, expected_response?, response?, evaluators?,
+    evaluators_mode?, scores?, status?, error?. Optional fields are emitted
+    only when present on the source.
+    """
+    out: Dict[str, Any] = {"prompt": source.get("prompt", "")}
+    for key in ("expected_response", "response", "evaluators", "evaluators_mode"):
+        if key in source:
+            out[key] = source[key]
+    scores = _convert_scores_to_schema(source.get("results", {}))
     if scores:
-        item["scores"] = scores
-
-    return item
+        out["scores"] = scores
+    if "status" in source:
+        out["status"] = source["status"]
+    if "error" in source:
+        out["error"] = source["error"]
+    return out


 def convert_thread_result_to_output(thread_result: Dict) -> Dict:
-    """Convert a multi-turn thread result to the output format."""
-    output_turns = []
-    for turn in thread_result.get("turns", []):
-        output_turn: Dict[str, Any] = {"prompt": turn.get("prompt", "")}
-        if "expected_response" in turn:
-            output_turn["expected_response"] = turn["expected_response"]
-        if "response" in turn:
-            output_turn["response"] = turn["response"]
-        if "status" in turn:
-            output_turn["status"] = turn["status"]
-        if "error" in turn:
-            output_turn["error"] = turn["error"]
-        if "evaluators" in turn:
-            output_turn["evaluators"] = turn["evaluators"]
-        if "evaluators_mode" in turn:
-            output_turn["evaluators_mode"] = turn["evaluators_mode"]
-
-        scores = _convert_scores_to_schema(turn.get("results", {}))
-        if scores:
-            output_turn["scores"] = scores
-
-        output_turns.append(output_turn)
-
+    """Convert a multi-turn thread result to a schema-compliant ThreadOutput."""
     output: Dict[str, Any] = {}
     if thread_result.get("name"):
         output["name"] = thread_result["name"]
@@ -300,18 +316,17 @@ def convert_thread_result_to_output(thread_result: Dict) -> Dict:
         output["description"] = thread_result["description"]
     if thread_result.get("conversation_id"):
         output["conversation_id"] = thread_result["conversation_id"]
-    output["turns"] = output_turns
+    output["turns"] = [convert_single_item_result_to_output(t) for t in thread_result.get("turns", [])]
     if thread_result.get("summary"):
         output["summary"] = thread_result["summary"]
-
     return output


 def convert_result_to_output_item(result: Dict) -> Dict:
-    """Convert an internal result dict to an output item. Routes by type."""
+    """Top-level dispatch: routes a result dict by item type to the right converter."""
     if result.get("type") == "multi_turn":
         return convert_thread_result_to_output(result)
-    return convert_result_to_eval_item(result)
+    return convert_single_item_result_to_output(result)


 def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None,
@@ -395,12 +410,14 @@ def write_results_to_csv(results: List[Dict], output_file: str,
         total_items = aggregates[next(iter(aggregates))].get('total_prompts', len(results))
         if total_items > 1:
             f.write("# AGGREGATE STATISTICS\n")
-            f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
+            f.write("Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Errored,Avg Score,Threshold\n")
             for metric_name, stats in aggregates.items():
-                threshold_str = str(stats.get('threshold', 'N/A'))
+                threshold_val = stats.get('threshold')
+                threshold_str = "N/A" if threshold_val is None else str(threshold_val)
                 prompts_evaluated = stats.get('prompts_evaluated', stats['total_evaluated'])
                 total_prompts = stats.get('total_prompts', total_items)
-                f.write(f"{metric_name},{prompts_evaluated},{total_prompts},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
+                error_count = stats.get('error_count', 0)
+                f.write(f"{metric_name},{prompts_evaluated},{total_prompts},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{error_count},{stats['avg_score']:.2f},{threshold_str}\n")
             f.write("\n# INDIVIDUAL RESULTS\n")

         single_turn_rows = []
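
With the new Errored column the aggregate block gains one field per row; a hypothetical rendering (the header is verbatim from the change above, the data values are invented):

    # AGGREGATE STATISTICS
    Metric,Prompts Evaluated,Total Prompts,Pass Rate (%),Passed,Failed,Errored,Avg Score,Threshold
    relevance,9,10,77.8,7,2,1,3.85,3
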
@@ -416,7 +433,7 @@ def write_results_to_csv(results: List[Dict], output_file: str,
                     "response": turn.get("response", ""),
                     "expected_response": turn.get("expected_response", ""),
                     "status": turn.get("status", ""),
-                    "error": turn.get("error", ""),
+                    "error": _format_error_object(turn.get("error")),
                     "scores": _results_to_csv_json(turn.get("results", {})),
                 })
             summary = result.get("summary", {})
@@ -432,6 +449,8 @@ def write_results_to_csv(results: List[Dict], output_file: str,
         else:
             exclude_keys = {'evaluators_ran', 'evaluators', 'evaluators_mode', '_enhanced_response', 'results'}
             row = {k: v for k, v in result.items() if k not in exclude_keys}
+            if "error" in row:
+                row["error"] = _format_error_object(row["error"])
             if "results" in result:
                 row["scores"] = _results_to_csv_json(result["results"])
             single_turn_rows.append(row)
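
End to end, the new converter table routes each entry either through its valid-shape builder or straight through as an ErroredScore; a self-contained sketch (the constant values are placeholders for the ones defined in the package's common module):

    # Placeholder constant values; the real ones come from common/METRIC_IDS.
    STATUS_PASS, STATUS_FAIL, STATUS_ERROR = "pass", "fail", "error"
    RELEVANCE, GROUNDEDNESS = "relevance", "groundedness"
    METRIC_IDS = {RELEVANCE: "relevance", GROUNDEDNESS: "groundedness"}

    results_dict = {
        RELEVANCE: {"relevance": 4, "result": STATUS_PASS, "threshold": 3},
        GROUNDEDNESS: {"result": STATUS_ERROR, "error": "LLM call timed out"},
    }
    # _convert_scores_to_schema(results_dict) would yield:
    # {"relevance": {"score": 4, "result": "pass", "threshold": 3},
    #  "groundedness": {"result": "error", "error": "LLM call timed out"}}
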
@@ -0,0 +1,91 @@
+"""Pure status-derivation and rollup helpers — the single source of truth for
+turn-level and thread-level status under the v1.4.0 unified-error-reporting design.
+
+Two functions, both pure:
+
+* :func:`status_for_response` — turn/item-level status + optional top-level
+  error summary, computed from per-evaluator results when the agent responded.
+* :func:`rollup_thread_status` — thread-level rollup over per-turn statuses.
+
+The "no response obtained" case (turn/item ``status="error"``) is handled
+inline at the agent-failure sites in :mod:`evaluation_runner`, which set
+``status=STATUS_ERROR`` and build the cause object directly via
+:func:`error_messages.agent_request_failed` or
+:func:`error_messages.turn_skipped`. This module only covers the
+response-obtained side.
+
+See research.md §R4 for the canonical pseudocode and quickstart.md §2.6 for the
+exhaustive test matrix these functions must satisfy.
+"""
+
+from __future__ import annotations
+
+from typing import Optional, Sequence, Tuple
+
+from common import STATUS_ERROR, STATUS_FAIL, STATUS_PARTIAL, STATUS_PASS
+from error_messages import ErrorObject, evaluators_failed_summary
+
+
+def status_for_response(
+    evaluator_results: Sequence[str],
+) -> Tuple[str, Optional[ErrorObject]]:
+    """Compute (status, optional summary error) for a turn/item where the agent responded.
+
+    Args:
+        evaluator_results: Per-evaluator 'result' values, each in {'pass', 'fail', 'error'}.
+
+    Returns:
+        A (status, error) tuple where status is one of 'pass', 'fail', or 'partial':
+
+        * 'pass' — every evaluator returned 'pass', OR no evaluators ran
+          (vacuous truth — items with no evaluators pass by default). error is None.
+        * 'partial' — at least one evaluator returned 'error'. Error takes
+          priority over pass/fail; a turn with one passing evaluator and one
+          errored evaluator is 'partial' regardless of the others. error is the
+          evaluatorsFailed summary
+          {code, message: 'Agent response obtained. N of M evaluators failed to run.'}.
+        * 'fail' — every evaluator ran successfully (no errors) AND at least
+          one returned 'fail'. Covers uniform-fail and pass+fail mixes. error is None.
+
+    Status 'error' is never returned — the caller handles the no-response
+    case directly.
+    """
+    uniques = set(evaluator_results)
+    if not uniques or uniques == {STATUS_PASS}:
+        return STATUS_PASS, None
+    error_count = sum(1 for r in evaluator_results if r == STATUS_ERROR)
+    if error_count > 0:
+        return STATUS_PARTIAL, evaluators_failed_summary(error_count, len(evaluator_results))
+    return STATUS_FAIL, None
+
+
+def rollup_thread_status(turn_statuses: Sequence[str]) -> str:
+    """Compute a thread-level overall_status from the per-turn statuses.
+
+    Priority rules:
+
+    1. Any errored turn → thread 'error' (the run didn't complete).
+    2. Else, any partial turn → thread 'partial'.
+    3. Else, all turns 'pass' → 'pass'.
+    4. Else, all turns 'fail' → 'fail'.
+    5. Else (mix of pass and fail at thread level) → 'partial'.
+
+    Note that rule 5 (pass+fail mix → 'partial') does not match
+    status_for_response at the per-turn level, where a pass+fail mix among
+    evaluators yields 'fail'. The thread-level rule preserves existing
+    behaviour; the mismatch is known and deferred for revisit.
+
+    As a defensive fallback, an empty sequence returns 'error'.
+    """
+    if not turn_statuses:
+        return STATUS_ERROR
+    if STATUS_ERROR in turn_statuses:
+        return STATUS_ERROR
+    if STATUS_PARTIAL in turn_statuses:
+        return STATUS_PARTIAL
+    uniques = set(turn_statuses)
+    if uniques == {STATUS_PASS}:
+        return STATUS_PASS
+    if uniques == {STATUS_FAIL}:
+        return STATUS_FAIL
+    return STATUS_PARTIAL
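
Worked calls for the rules above; the return values follow directly from the two docstrings, assuming the status constants are the plain strings 'pass'/'fail'/'partial'/'error':

    status_for_response(["pass", "pass"])   # -> ("pass", None)
    status_for_response([])                 # -> ("pass", None)   vacuous truth
    status_for_response(["pass", "fail"])   # -> ("fail", None)
    status_for_response(["pass", "error"])  # -> ("partial", <evaluatorsFailed summary>)

    rollup_thread_status(["pass", "fail"])   # -> "partial"  (rule 5)
    rollup_thread_status(["fail", "error"])  # -> "error"    (rule 1 wins)
    rollup_thread_status([])                 # -> "error"    (defensive fallback)
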
@@ -2,7 +2,7 @@
  * Build-time injected default values
  * DO NOT EDIT - This file is auto-generated during build.
  *
- * Generated: 2026-05-07T22:53:22.056Z
+ * Generated: 2026-05-14T18:32:53.816Z
  *
  * @copyright Microsoft Corporation. All rights reserved.
  * @license MIT
@@ -19,6 +19,23 @@ const AGENT_ID_ALIASES = [
   { key: 'M365_TITLE_ID', transform: (v) => `${v}.declarativeAgent` },
 ];

+// Aliases resolved into TENANT_ID (first match wins)
+const TENANT_ID_ALIASES = [{ key: 'TEAMS_APP_TENANT_ID' }];
+
+function _resolveAliases(targetKey, aliases, envVars) {
+  if (envVars[targetKey]) return;
+  for (const alias of aliases) {
+    if (envVars[alias.key]) {
+      const resolved = alias.transform
+        ? alias.transform(envVars[alias.key])
+        : envVars[alias.key];
+      envVars[targetKey] = resolved;
+      process.env[targetKey] = resolved;
+      break;
+    }
+  }
+}
+
 /**
  * Load environment variables from a .env-style file.
  * Uses dotenv.parse() for standards-compliant parsing (handles quoted values,
@@ -71,19 +88,9 @@ export function _loadEnvFile(envFilePath) {
     return null;
   }

-  // Resolve agent ID aliases into M365_AGENT_ID (first match wins)
-  if (!envVars['M365_AGENT_ID']) {
-    for (const alias of AGENT_ID_ALIASES) {
-      if (envVars[alias.key]) {
-        const agentId = alias.transform
-          ? alias.transform(envVars[alias.key])
-          : envVars[alias.key];
-        envVars['M365_AGENT_ID'] = agentId;
-        process.env['M365_AGENT_ID'] = agentId;
-        break;
-      }
-    }
-  }
+  // Resolve aliases into canonical keys (first match wins)
+  _resolveAliases('M365_AGENT_ID', AGENT_ID_ALIASES, envVars);
+  _resolveAliases('TENANT_ID', TENANT_ID_ALIASES, envVars);

   return envVars;
 }
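
For readers skimming the JS, the extracted helper's first-match-wins contract in a hypothetical Python mirror (the shipped implementation is the `_resolveAliases` function above; this sketch is for illustration only):

    # Hypothetical Python mirror of _resolveAliases.
    def resolve_aliases(target_key, aliases, env_vars):
        if env_vars.get(target_key):
            return  # canonical key already set; aliases never override it
        for alias in aliases:
            value = env_vars.get(alias["key"])
            if value:
                transform = alias.get("transform")
                env_vars[target_key] = transform(value) if transform else value
                break  # first match wins

    env = {"TEAMS_APP_TENANT_ID": "00000000-0000-0000-0000-000000000000"}
    resolve_aliases("TENANT_ID", [{"key": "TEAMS_APP_TENANT_ID"}], env)
    # env["TENANT_ID"] now mirrors TEAMS_APP_TENANT_ID; the process.env
    # mutation from the JS version is omitted as a Node-side concern.
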