npm - @microsoft/m365-copilot-eval - Versions diffs - 1.5.0-preview.1 → 1.7.0-preview.1 - Mend

@microsoft/m365-copilot-eval 1.5.0-preview.1 → 1.7.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

package/README.md +19 -1
package/package.json +4 -3
package/schema/CHANGELOG.md +7 -0
package/schema/v1/eval-document.schema.json +144 -333
package/schema/v1/examples/invalid/error-result-with-score.json +16 -0
package/schema/v1/examples/invalid/missing-error-on-error.json +13 -0
package/schema/v1/examples/valid/multi-turn-output.json +2 -0
package/schema/v1/examples/valid/scenarios-with-mixed-errors.json +239 -0
package/schema/version.json +1 -1
package/src/clients/cli/api_clients/A2A/a2a_client.py +57 -10
package/src/clients/cli/auth/auth_handler.py +21 -1
package/src/clients/cli/common.py +8 -14
package/src/clients/cli/error_messages.py +91 -0
package/src/clients/cli/evaluation_runner.py +108 -97
package/src/clients/cli/evaluator_resolver.py +8 -33
package/src/clients/cli/generate_report.py +125 -96
package/src/clients/cli/main.py +2 -1
package/src/clients/cli/readme.md +1 -1
package/src/clients/cli/result_writer.py +129 -110
package/src/clients/cli/status_derivation.py +91 -0
package/src/clients/node-js/bin/runevals.js +31 -9
package/src/clients/node-js/config/default.js +1 -1
package/src/clients/node-js/lib/env-loader.js +20 -13
package/src/clients/node-js/lib/python-runtime.js +137 -65
package/src/clients/node-js/lib/venv-manager.js +3 -2
package/src/clients/node-js/lib/version-check.js +268 -0

package/schema/v1/examples/valid/scenarios-with-mixed-errors.json ADDED Viewed

@@ -0,0 +1,239 @@
+{
+  "$schema": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
+  "schemaVersion": "1.4.0",
+  "metadata": {
+    "name": "All scenarios — comprehensive comparison fixture",
+    "description": "Single fixture exercising every output shape under v1.4.0. Items are structured to isolate one variable at a time so a reader can see exactly which combination drives which top-level `status` and whether the top-level `error` field is populated. Each item carries an `extensions.scenario` label for jq/grep.",
+    "evaluatedAt": "2026-05-01T11:00:00Z",
+    "agentName": "Test Agent",
+    "cliVersion": "1.4.0"
+  },
+  "items": [
+    {
+      "extensions": {
+        "scenario": "1-single-turn-uniform-pass",
+        "notes": "Baseline. All evaluators uniformly pass. status=pass. No top-level error field.",
+        "evaluators_in_scores": ["pass", "pass"],
+        "expected_status": "pass",
+        "expected_error_field_populated": false
+      },
+      "prompt": "What is Microsoft Graph API?",
+      "expected_response": "Microsoft Graph is a unified endpoint for M365 data.",
+      "response": "Microsoft Graph API is a gateway to data and intelligence in Microsoft 365.",
+      "scores": {
+        "relevance": { "score": 5.0, "result": "pass", "threshold": 3 },
+        "coherence": { "score": 5.0, "result": "pass", "threshold": 3 }
+      },
+      "status": "pass"
+    },
+    {
+      "extensions": {
+        "scenario": "2-single-turn-uniform-fail",
+        "notes": "All evaluators uniformly fail. status=fail. No top-level error field. Under v1.4.0, status=fail means every evaluator ran and at least one returned a fail verdict; uniform-fail is the simplest such shape (scenario 3 covers the pass+fail mix).",
+        "evaluators_in_scores": ["fail", "fail"],
+        "expected_status": "fail",
+        "expected_error_field_populated": false
+      },
+      "prompt": "What is the boiling point of water in Fahrenheit at sea level?",
+      "expected_response": "212°F.",
+      "response": "Water boils at 150°F at sea level.",
+      "scores": {
+        "relevance": { "score": 2.0, "result": "fail", "threshold": 3, "reason": "Response is factually incorrect." },
+        "coherence": { "score": 2.0, "result": "fail", "threshold": 3, "reason": "Response contains an internal contradiction." }
+      },
+      "status": "fail"
+    },
+    {
+      "extensions": {
+        "scenario": "3-single-turn-fail-pass-fail-mix",
+        "notes": "All evaluators ran successfully; one passed, one returned a fail verdict. status=fail (covers uniform-fail and pass+fail mixes). Top-level error field is ABSENT — no errored evaluator to summarize.",
+        "evaluators_in_scores": ["pass", "fail"],
+        "expected_status": "fail",
+        "expected_error_field_populated": false
+      },
+      "prompt": "Explain the difference between SharePoint and OneDrive.",
+      "expected_response": "Covers shared vs personal storage, permissions model.",
+      "response": "SharePoint is for team collaboration; OneDrive is for personal files. Both use the same permissions engine.",
+      "scores": {
+        "relevance": { "score": 4.0, "result": "pass", "threshold": 3, "reason": "Response addresses the key distinction." },
+        "coherence": { "score": 2.0, "result": "fail", "threshold": 3, "reason": "The claim about a 'same permissions engine' is inaccurate; the response is internally inconsistent with product reality." }
+      },
+      "status": "fail"
+    },
+    {
+      "extensions": {
+        "scenario": "4-single-turn-partial-fail-plus-evaluator-crash",
+        "notes": "One fail + one evaluator crash (judge raised an exception). status=partial — error takes priority over pass/fail when ≥1 evaluator errored. Top-level error field IS populated with the evaluatorsFailed summary. Per-evaluator error uses the 'Evaluator failed:' prefix and appends exception.message text.",
+        "evaluators_in_scores": ["fail", "error"],
+        "expected_status": "partial",
+        "expected_error_field_populated": true
+      },
+      "prompt": "How do I configure conditional access for guest accounts?",
+      "expected_response": "Covers Azure AD guest CA policies.",
+      "response": "Use the on-prem AD Users and Computers tool to block external logins.",
+      "scores": {
+        "relevance": { "score": 1.0, "result": "fail", "threshold": 3, "reason": "Response references on-prem AD instead of Azure AD." },
+        "coherence": { "result": "error", "error": "Evaluator failed: Connection timeout to Azure OpenAI endpoint after 30s" }
+      },
+      "status": "partial",
+      "error": {
+        "code": "evaluatorsFailed",
+        "message": "Agent response obtained. 1 of 2 evaluators failed to run."
+      }
+    },
+    {
+      "extensions": {
+        "scenario": "5-single-turn-partial-all-evaluators-errored",
+        "notes": "Response obtained, but every attempted evaluator errored (zero verdicts rendered). status=partial (not error) — the agent DID respond; we just couldn't fully evaluate it. Same unified message template as scenario 4, now reading '2 of 2' to indicate all evaluators errored. The 'Agent response obtained.' prefix distinguishes this case from scenario 6 (no response, status=error).",
+        "evaluators_in_scores": ["error", "error"],
+        "expected_status": "partial",
+        "expected_error_field_populated": true
+      },
+      "prompt": "List the top 5 security best practices for M365 tenants.",
+      "expected_response": "Covers MFA, conditional access, audit logging, DLP, least privilege.",
+      "response": "Here are five M365 security best practices: 1) MFA, 2) conditional access, 3) DLP, 4) audit logs, 5) least-privilege roles.",
+      "scores": {
+        "relevance": { "result": "error", "error": "Evaluator failed: Service rate limit exceeded after 3 retries" },
+        "coherence": { "result": "error", "error": "Evaluator failed: Connection timeout to Azure OpenAI endpoint after 30s" }
+      },
+      "status": "partial",
+      "error": {
+        "code": "evaluatorsFailed",
+        "message": "Agent response obtained. 2 of 2 evaluators failed to run."
+      }
+    },
+    {
+      "extensions": {
+        "scenario": "6-single-turn-error-no-response",
+        "notes": "Agent did not respond after retries. No evaluators attempted. status=error. Top-level error field IS populated with the request-failure cause template. This is the ONLY single-turn case producing status=error under v1.4.0.",
+        "evaluators_in_scores": [],
+        "expected_status": "error",
+        "expected_error_field_populated": true
+      },
+      "prompt": "What is Microsoft Graph API?",
+      "expected_response": "Microsoft Graph is a unified endpoint for M365 data.",
+      "response": "",
+      "scores": {},
+      "status": "error",
+      "error": {
+        "code": "agentRequestFailed",
+        "message": "Agent request failed: HTTP 503 Service Unavailable"
+      }
+    },
+    {
+      "extensions": {
+        "scenario": "7-multi-turn-partial-mixed-turn-outcomes",
+        "notes": "3-turn thread with no errored turns. Turn 1 uniformly passed; Turn 2 had a pass+error evaluator mix (partial); Turn 3 had a uniform-fail (status=fail). Per-turn statuses: [pass, partial, fail] — any partial turn drives thread to partial under FR-004's priority rules. Summary invariant: 1+1+1+0=3.",
+        "per_turn_statuses": ["pass", "partial", "fail"],
+        "expected_overall_status": "partial"
+      },
+      "name": "Seattle trip planning — mixed turn outcomes",
+      "conversation_id": "conv-abc-007",
+      "turns": [
+        {
+          "prompt": "I'm based in Seattle.",
+          "expected_response": "I can help with Seattle-related queries.",
+          "response": "Understood — I can help with Seattle questions.",
+          "scores": {
+            "relevance": { "score": 5.0, "result": "pass", "threshold": 3 },
+            "coherence": { "score": 5.0, "result": "pass", "threshold": 3 }
+          },
+          "status": "pass"
+        },
+        {
+          "prompt": "What's the weather like here?",
+          "expected_response": "Seattle has mild, rainy weather.",
+          "response": "Seattle tends to be rainy most of the year, especially in winter.",
+          "scores": {
+            "relevance": { "score": 5.0, "result": "pass", "threshold": 3, "reason": "Response addresses Seattle weather accurately." },
+            "coherence": { "result": "error", "error": "Evaluator failed: Connection timeout to Azure OpenAI endpoint after 30s" }
+          },
+          "status": "partial",
+          "error": {
+            "code": "evaluatorsFailed",
+            "message": "Agent response obtained. 1 of 2 evaluators failed to run."
+          }
+        },
+        {
+          "prompt": "What's the average temperature in Seattle in March?",
+          "expected_response": "Around 50°F (10°C).",
+          "response": "Seattle averages 80°F in March.",
+          "scores": {
+            "relevance": { "score": 1.0, "result": "fail", "threshold": 3, "reason": "Response is factually incorrect — Seattle's March averages are far below 80°F." },
+            "coherence": { "score": 2.0, "result": "fail", "threshold": 3, "reason": "Response contradicts well-known regional climate data." }
+          },
+          "status": "fail"
+        }
+      ],
+      "summary": {
+        "turns_total": 3,
+        "turns_passed": 1,
+        "turns_failed": 1,
+        "turns_partial": 1,
+        "turns_errored": 0,
+        "overall_status": "partial"
+      }
+    },
+    {
+      "extensions": {
+        "scenario": "8-multi-turn-error-any-errored-turn",
+        "notes": "3-turn thread with a mid-conversation request failure. Turn 1 uniformly passed; Turn 2's request failed; Turn 3 was downstream-skipped. Per-turn statuses: [pass, error, error] — under FR-004's priority rules, any errored turn drives the thread to error (the run didn't complete). The two error turns carry distinct error codes (agentRequestFailed vs turnSkipped) demonstrating the cascade. Summary invariant: 1+0+0+2=3.",
+        "per_turn_statuses": ["pass", "error", "error"],
+        "expected_overall_status": "error"
+      },
+      "name": "Conversation that aborted mid-thread",
+      "conversation_id": "conv-abc-008",
+      "turns": [
+        {
+          "prompt": "I'd like to plan a trip from SFO.",
+          "expected_response": "I can help with travel planning from SFO.",
+          "response": "Sure — I can help plan a trip from SFO.",
+          "scores": {
+            "relevance": { "score": 5.0, "result": "pass", "threshold": 3 },
+            "coherence": { "score": 5.0, "result": "pass", "threshold": 3 }
+          },
+          "status": "pass"
+        },
+        {
+          "prompt": "Book me a flight from SFO to SEA next Tuesday.",
+          "expected_response": "I can help with flight queries.",
+          "response": "",
+          "scores": {},
+          "status": "error",
+          "error": {
+            "code": "agentRequestFailed",
+            "message": "Agent request failed: DNS resolution failed for agent endpoint"
+          }
+        },
+        {
+          "prompt": "Prefer morning departure, aisle seat.",
+          "expected_response": "Noted.",
+          "response": "",
+          "scores": {},
+          "status": "error",
+          "error": {
+            "code": "turnSkipped",
+            "message": "Turn not attempted: preceding turn failed"
+          }
+        }
+      ],
+      "summary": {
+        "turns_total": 3,
+        "turns_passed": 1,
+        "turns_failed": 0,
+        "turns_partial": 0,
+        "turns_errored": 2,
+        "overall_status": "error"
+      }
+    }
+  ]
+}

package/schema/version.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "version": "1.2.0",
+  "version": "1.3.0",
   "releaseDate": "2026-04-02",
   "schemaId": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
   "description": "M365 Copilot Eval Document Schema"

package/src/clients/cli/api_clients/A2A/a2a_client.py CHANGED Viewed

@@ -8,7 +8,7 @@ import re
 import urllib.error
 import urllib.request
 import uuid
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple
 from api_clients.base_agent_client import BaseAgentClient
 from cli_logging.console_diagnostics import emit_structured_log
@@ -35,6 +35,7 @@ class A2AClient(BaseAgentClient):
         access_token: str,
         logger: Optional[logging.Logger] = None,
         diagnostic_records: Optional[List[Dict[str, Any]]] = None,
+        token_refresh_fn: Optional[Callable[[], str]] = None,
     ) -> None:
         """
         Args:
@@ -42,11 +43,15 @@ class A2AClient(BaseAgentClient):
             access_token: Bearer token for A2A authentication.
             logger: Logger to use. Defaults to a module-level logger if not provided.
             diagnostic_records: List to accumulate structured log entries.
+            token_refresh_fn: Optional callable that returns a fresh access token string.
+                When provided, a single HTTP 401 response will trigger a token refresh
+                and one automatic retry, making the refresh invisible to the caller.
         """
         self._endpoint = a2a_endpoint.rstrip("/")
         self._access_token = access_token
         self._logger = logger or logging.getLogger(__name__)
         self._diagnostic_records = diagnostic_records
+        self._token_refresh_fn = token_refresh_fn
         self._resolved_agent_url: Optional[str] = None
     # ------------------------------------------------------------------ #
@@ -261,6 +266,12 @@ class A2AClient(BaseAgentClient):
     ) -> tuple[Dict[str, Any], Dict[str, Any]]:
         """Send a JSON-RPC message to the agent and parse the response.
+        When a ``token_refresh_fn`` was supplied at construction time and the
+        server responds with HTTP 401 (Unauthorized), the token is refreshed
+        automatically and the request is retried exactly once.  This keeps
+        long-running eval sessions alive beyond the initial token lifetime
+        without requiring any user interaction.
         Returns:
             A tuple of (result_dict, raw_result) where result_dict is the
             normalized response dict (raw_response_text, display_response_text,
@@ -275,15 +286,51 @@ class A2AClient(BaseAgentClient):
             with urllib.request.urlopen(req, timeout=_REQUEST_TIMEOUT_SECS) as resp:
                 raw = resp.read().decode("utf-8", errors="replace")
         except urllib.error.HTTPError as e:
-            body = ""
-            try:
-                body = e.read().decode("utf-8", errors="replace")
-            except Exception:
-                pass
-            raise RuntimeError(
-                f"A2A request failed (HTTP {e.code} {e.reason})."
-                + (f" Body: {body[:500]}" if body else "")
-            ) from e
+            if e.code == 401 and self._token_refresh_fn is not None:
+                emit_structured_log(
+                    "info",
+                    "[A2A] Access token expired (HTTP 401); refreshing token and retrying.",
+                    Operation.AUTHENTICATE,
+                    logger=self._logger,
+                    diagnostic_records=self._diagnostic_records,
+                )
+                new_token = self._token_refresh_fn()
+                if not new_token:
+                    raise RuntimeError(
+                        "A2A request failed (HTTP 401 Unauthorized) and token refresh returned no token."
+                    ) from e
+                self._access_token = new_token
+                headers["Authorization"] = f"Bearer {self._access_token}"
+                retry_req = urllib.request.Request(
+                    agent_url, data=payload, headers=headers, method="POST"
+                )
+                try:
+                    with urllib.request.urlopen(retry_req, timeout=_REQUEST_TIMEOUT_SECS) as resp:
+                        raw = resp.read().decode("utf-8", errors="replace")
+                except urllib.error.HTTPError as retry_e:
+                    body = ""
+                    try:
+                        body = retry_e.read().decode("utf-8", errors="replace")
+                    except Exception:
+                        pass
+                    raise RuntimeError(
+                        f"A2A request failed (HTTP {retry_e.code} {retry_e.reason}) after token refresh."
+                        + (f" Body: {body[:500]}" if body else "")
+                    ) from retry_e
+                except urllib.error.URLError as retry_e:
+                    raise RuntimeError(
+                        f"A2A connection error after token refresh: {getattr(retry_e, 'reason', str(retry_e))}"
+                    ) from retry_e
+            else:
+                body = ""
+                try:
+                    body = e.read().decode("utf-8", errors="replace")
+                except Exception:
+                    pass
+                raise RuntimeError(
+                    f"A2A request failed (HTTP {e.code} {e.reason})."
+                    + (f" Body: {body[:500]}" if body else "")
+                ) from e
         except urllib.error.URLError as e:
             raise RuntimeError(
                 f"A2A connection error: {getattr(e, 'reason', str(e))}"

package/src/clients/cli/auth/auth_handler.py CHANGED Viewed

@@ -13,7 +13,7 @@ https://github.com/AzureAD/microsoft-authentication-extensions-for-python
 import os
 import platform
 import logging
-from typing import Optional
+from typing import Callable, Optional
 from pathlib import Path
 import jwt
 from msal import PublicClientApplication
@@ -260,3 +260,23 @@ class AuthHandler:
             return oid
         except jwt.DecodeError as e:
             raise ValueError(f"Failed to decode token: {e}")
+def make_token_refresh_fn(auth_handler: "AuthHandler") -> Callable[[], str]:
+    """Return a callable that obtains a fresh A2A access token.
+    On a 401 response the caller invokes this function.  It delegates to
+    ``auth_handler.acquire_token_interactive()``; whether a silent refresh
+    (using the MSAL refresh token) is attempted before prompting the user
+    depends on that method's implementation.  The returned string is the
+    new access token; an empty string signals failure.
+    Args:
+        auth_handler: An initialized AuthHandler instance to use for token acquisition.
+    Returns:
+        A zero-argument callable that returns a fresh access token string.
+    """
+    def _refresh() -> str:
+        result = auth_handler.acquire_token_interactive() or {}
+        return result.get("access_token") or ""
+    return _refresh

package/src/clients/cli/common.py CHANGED Viewed

@@ -39,19 +39,14 @@ CITATIONS = "Citations"
 EXACT_MATCH = "ExactMatch"
 PARTIAL_MATCH = "PartialMatch"
-# Prerequisite constants
-REQUIRES_AZURE_OPENAI = "azure_openai"
-REQUIRES_TOOL_DEFINITIONS = "tool_definitions"
-# Evaluation status constants
-# Outcome statuses (agent responded, evaluators ran):
-STATUS_PASS = "pass"  # All evaluators scored above threshold
-STATUS_FAIL = "fail"  # At least one evaluator scored below threshold
-# Error state (evaluation couldn't complete):
-STATUS_ERROR = "error"  # API call failed / response couldn't be obtained
-# Thread-level aggregate status (multi-turn only):
-STATUS_PARTIAL = "partial"  # Some turns passed, some did not
-# Fallback for missing status:
+# Evaluation status constants — four-value enum used at the turn/item level
+# AND the thread-level overall_status. See status_derivation.py for the
+# canonical derivation and rollup rules.
+STATUS_PASS = "pass"
+STATUS_FAIL = "fail"
+STATUS_PARTIAL = "partial"
+STATUS_ERROR = "error"
+# Internal-only sentinel — never appears in emitted output.
 STATUS_UNKNOWN = "unknown"
 # System defaults when no file-level or env-level defaults are configured
@@ -77,7 +72,6 @@ METRIC_IDS = {
 @dataclass
 class RegistryEntry:
     type: str  # "llm", "tool", or "non-llm"
-    requires: List[str]
     default_threshold: Optional[float]

package/src/clients/cli/error_messages.py ADDED Viewed

@@ -0,0 +1,91 @@
+"""Canonical error-message templates for persisted evaluation output.
+Every error string written into a JSON/CSV/HTML output file MUST be produced
+by a builder in this module. The builders accept only plain values (message
+strings or integer counts) — never exception objects — which keeps
+``repr(exc)``, ``traceback.format_exc()``, and SDK class names out of
+persisted output by construction.
+Two flavours:
+* **Turn/item-level `ErrorObject` builders** return a structured ``{code, message}``
+  dict for the top-level ``error`` field on a turn or single-turn item. Used
+  when ``status == "error"`` (cause) or ``status == "partial"`` with at least
+  one errored evaluator (summary).
+* **Per-evaluator string builders** return a flat string formatted as
+  ``"<category prefix>: <detail>"`` for the ``error`` field inside an
+  ``ErroredScore`` entry. Evaluator identity is encoded by the ``scores`` map's
+  parent property key, so no ``code`` is needed at this level.
+"""
+from __future__ import annotations
+from typing import TypedDict
+class ErrorObject(TypedDict):
+    """Turn/item-level top-level error shape — `{code, message}` per ErrorObject $def."""
+    code: str
+    message: str
+# ── Error code constants ──────────────────────────────────────────────
+# These are the canonical machine-readable codes emitted in the top-level
+# `error.code` field. They appear in persisted output and consumer-facing
+# documentation; treat the string values as part of the public contract
+# (do not rename without a schema-version bump).
+ERROR_CODE_AGENT_REQUEST_FAILED = "agentRequestFailed"
+ERROR_CODE_TURN_SKIPPED = "turnSkipped"
+ERROR_CODE_EVALUATORS_FAILED = "evaluatorsFailed"
+# ── Turn/item-level ErrorObject builders ──────────────────────────────
+def agent_request_failed(exc_message: str) -> ErrorObject:
+    """`status == "error"` cause when the agent client raised — no response obtained."""
+    return {
+        "code": ERROR_CODE_AGENT_REQUEST_FAILED,
+        "message": f"Agent request failed: {exc_message}",
+    }
+def turn_skipped() -> ErrorObject:
+    """`status == "error"` cause for downstream turns after a preceding turn failed.
+    Synthesized cause — no exception text appended (FR-013).
+    """
+    return {
+        "code": ERROR_CODE_TURN_SKIPPED,
+        "message": "Turn not attempted: preceding turn failed",
+    }
+def evaluators_failed_summary(error_count: int, total: int) -> ErrorObject:
+    """`status == "partial"` summary when at least one evaluator returned `result: "error"`.
+    Unified template regardless of error_count vs total — per-evaluator detail
+    (crash vs missing-prereq, with optional exception text) lives in `scores`.
+    """
+    return {
+        "code": ERROR_CODE_EVALUATORS_FAILED,
+        "message": f"Agent response obtained. {error_count} of {total} evaluators failed to run.",
+    }
+# ── Per-evaluator string builders (inside `scores`) ──────────────────
+def evaluator_failed(exc_message: str) -> str:
+    """Per-evaluator `error` string when the evaluator raised during run."""
+    return f"Evaluator failed: {exc_message}"
+# Per-evaluator prerequisite-miss builders are not present today because no
+# reachable prereq-fail exists: `validate_environment()` exits the process if
+# Azure OpenAI config is missing, and no registered evaluator has a
+# data-dependent prereq. When that changes, add builders following the
+# convention `"Evaluator missing prerequisites: <description>"` and wire a
+# prereq check in evaluation_runner. See specs/236-unified-error-output for
+# the deferred sub-cases.