@microsoft/m365-copilot-eval 1.6.0-preview.1 → 1.7.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ {
2
+ "schemaVersion": "1.4.0",
3
+ "items": [
4
+ {
5
+ "prompt": "What is Microsoft Graph?",
6
+ "scores": {
7
+ "relevance": {
8
+ "result": "error"
9
+ }
10
+ }
11
+ }
12
+ ]
13
+ }
@@ -52,6 +52,8 @@
52
52
  "turns_total": 2,
53
53
  "turns_passed": 2,
54
54
  "turns_failed": 0,
55
+ "turns_partial": 0,
56
+ "turns_errored": 0,
55
57
  "overall_status": "pass"
56
58
  }
57
59
  }
@@ -0,0 +1,239 @@
1
+ {
2
+ "$schema": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
3
+ "schemaVersion": "1.4.0",
4
+ "metadata": {
5
+ "name": "All scenarios — comprehensive comparison fixture",
6
+ "description": "Single fixture exercising every output shape under v1.4.0. Items are structured to isolate one variable at a time so a reader can see exactly which combination drives which top-level `status` and whether the top-level `error` field is populated. Each item carries an `extensions.scenario` label for jq/grep.",
7
+ "evaluatedAt": "2026-05-01T11:00:00Z",
8
+ "agentName": "Test Agent",
9
+ "cliVersion": "1.4.0"
10
+ },
11
+ "items": [
12
+
13
+ {
14
+ "extensions": {
15
+ "scenario": "1-single-turn-uniform-pass",
16
+ "notes": "Baseline. All evaluators uniformly pass. status=pass. No top-level error field.",
17
+ "evaluators_in_scores": ["pass", "pass"],
18
+ "expected_status": "pass",
19
+ "expected_error_field_populated": false
20
+ },
21
+ "prompt": "What is Microsoft Graph API?",
22
+ "expected_response": "Microsoft Graph is a unified endpoint for M365 data.",
23
+ "response": "Microsoft Graph API is a gateway to data and intelligence in Microsoft 365.",
24
+ "scores": {
25
+ "relevance": { "score": 5.0, "result": "pass", "threshold": 3 },
26
+ "coherence": { "score": 5.0, "result": "pass", "threshold": 3 }
27
+ },
28
+ "status": "pass"
29
+ },
30
+
31
+ {
32
+ "extensions": {
33
+ "scenario": "2-single-turn-uniform-fail",
34
+ "notes": "All evaluators uniformly fail. status=fail. No top-level error field. Under v1.4.0 this is the ONLY shape producing status=fail — uniform-fail is strict.",
35
+ "evaluators_in_scores": ["fail", "fail"],
36
+ "expected_status": "fail",
37
+ "expected_error_field_populated": false
38
+ },
39
+ "prompt": "What is the boiling point of water in Fahrenheit at sea level?",
40
+ "expected_response": "212°F.",
41
+ "response": "Water boils at 150°F at sea level.",
42
+ "scores": {
43
+ "relevance": { "score": 2.0, "result": "fail", "threshold": 3, "reason": "Response is factually incorrect." },
44
+ "coherence": { "score": 2.0, "result": "fail", "threshold": 3, "reason": "Response contains an internal contradiction." }
45
+ },
46
+ "status": "fail"
47
+ },
48
+
49
+ {
50
+ "extensions": {
51
+ "scenario": "3-single-turn-fail-pass-fail-mix",
52
+ "notes": "All evaluators ran successfully; one passed, one returned a fail verdict. status=fail (covers uniform-fail and pass+fail mixes). Top-level error field is ABSENT — no errored evaluator to summarize.",
53
+ "evaluators_in_scores": ["pass", "fail"],
54
+ "expected_status": "fail",
55
+ "expected_error_field_populated": false
56
+ },
57
+ "prompt": "Explain the difference between SharePoint and OneDrive.",
58
+ "expected_response": "Covers shared vs personal storage, permissions model.",
59
+ "response": "SharePoint is for team collaboration; OneDrive is for personal files. Both use the same permissions engine.",
60
+ "scores": {
61
+ "relevance": { "score": 4.0, "result": "pass", "threshold": 3, "reason": "Response addresses the key distinction." },
62
+ "coherence": { "score": 2.0, "result": "fail", "threshold": 3, "reason": "The claim about a 'same permissions engine' is inaccurate; the response is internally inconsistent with product reality." }
63
+ },
64
+ "status": "fail"
65
+ },
66
+
67
+ {
68
+ "extensions": {
69
+ "scenario": "4-single-turn-partial-fail-plus-evaluator-crash",
70
+ "notes": "One fail + one evaluator crash (judge raised an exception). status=partial — error takes priority over pass/fail when ≥1 evaluator errored. Top-level error field IS populated with the evaluatorsFailed summary. Per-evaluator error uses the 'Evaluator failed:' prefix and appends exception.message text.",
71
+ "evaluators_in_scores": ["fail", "error"],
72
+ "expected_status": "partial",
73
+ "expected_error_field_populated": true
74
+ },
75
+ "prompt": "How do I configure conditional access for guest accounts?",
76
+ "expected_response": "Covers Azure AD guest CA policies.",
77
+ "response": "Use the on-prem AD Users and Computers tool to block external logins.",
78
+ "scores": {
79
+ "relevance": { "score": 1.0, "result": "fail", "threshold": 3, "reason": "Response references on-prem AD instead of Azure AD." },
80
+ "coherence": { "result": "error", "error": "Evaluator failed: Connection timeout to Azure OpenAI endpoint after 30s" }
81
+ },
82
+ "status": "partial",
83
+ "error": {
84
+ "code": "evaluatorsFailed",
85
+ "message": "Agent response obtained. 1 of 2 evaluators failed to run."
86
+ }
87
+ },
88
+
89
+ {
90
+ "extensions": {
91
+ "scenario": "5-single-turn-partial-all-evaluators-errored",
92
+ "notes": "Response obtained, but every attempted evaluator errored (zero verdicts rendered). status=partial (not error) — the agent DID respond; we just couldn't fully evaluate it. Same unified message template as scenario 4, now reading '2 of 2' to indicate all evaluators errored. The 'Agent response obtained.' prefix distinguishes this case from scenario 6 (no response, status=error).",
93
+ "evaluators_in_scores": ["error", "error"],
94
+ "expected_status": "partial",
95
+ "expected_error_field_populated": true
96
+ },
97
+ "prompt": "List the top 5 security best practices for M365 tenants.",
98
+ "expected_response": "Covers MFA, conditional access, audit logging, DLP, least privilege.",
99
+ "response": "Here are five M365 security best practices: 1) MFA, 2) conditional access, 3) DLP, 4) audit logs, 5) least-privilege roles.",
100
+ "scores": {
101
+ "relevance": { "result": "error", "error": "Evaluator failed: Service rate limit exceeded after 3 retries" },
102
+ "coherence": { "result": "error", "error": "Evaluator failed: Connection timeout to Azure OpenAI endpoint after 30s" }
103
+ },
104
+ "status": "partial",
105
+ "error": {
106
+ "code": "evaluatorsFailed",
107
+ "message": "Agent response obtained. 2 of 2 evaluators failed to run."
108
+ }
109
+ },
110
+
111
+ {
112
+ "extensions": {
113
+ "scenario": "6-single-turn-error-no-response",
114
+ "notes": "Agent did not respond after retries. No evaluators attempted. status=error. Top-level error field IS populated with the request-failure cause template. This is the ONLY single-turn case producing status=error under v1.4.0.",
115
+ "evaluators_in_scores": [],
116
+ "expected_status": "error",
117
+ "expected_error_field_populated": true
118
+ },
119
+ "prompt": "What is Microsoft Graph API?",
120
+ "expected_response": "Microsoft Graph is a unified endpoint for M365 data.",
121
+ "response": "",
122
+ "scores": {},
123
+ "status": "error",
124
+ "error": {
125
+ "code": "agentRequestFailed",
126
+ "message": "Agent request failed: HTTP 503 Service Unavailable"
127
+ }
128
+ },
129
+
130
+ {
131
+ "extensions": {
132
+ "scenario": "7-multi-turn-partial-mixed-turn-outcomes",
133
+ "notes": "3-turn thread with no errored turns. Turn 1 uniformly passed; Turn 2 had a pass+error evaluator mix (partial); Turn 3 had a uniform-fail (status=fail). Per-turn statuses: [pass, partial, fail] — any partial turn drives thread to partial under FR-004's priority rules. Summary invariant: 1+1+1+0=3.",
134
+ "per_turn_statuses": ["pass", "partial", "fail"],
135
+ "expected_overall_status": "partial"
136
+ },
137
+ "name": "Seattle trip planning — mixed turn outcomes",
138
+ "conversation_id": "conv-abc-007",
139
+ "turns": [
140
+ {
141
+ "prompt": "I'm based in Seattle.",
142
+ "expected_response": "I can help with Seattle-related queries.",
143
+ "response": "Understood — I can help with Seattle questions.",
144
+ "scores": {
145
+ "relevance": { "score": 5.0, "result": "pass", "threshold": 3 },
146
+ "coherence": { "score": 5.0, "result": "pass", "threshold": 3 }
147
+ },
148
+ "status": "pass"
149
+ },
150
+ {
151
+ "prompt": "What's the weather like here?",
152
+ "expected_response": "Seattle has mild, rainy weather.",
153
+ "response": "Seattle tends to be rainy most of the year, especially in winter.",
154
+ "scores": {
155
+ "relevance": { "score": 5.0, "result": "pass", "threshold": 3, "reason": "Response addresses Seattle weather accurately." },
156
+ "coherence": { "result": "error", "error": "Evaluator failed: Connection timeout to Azure OpenAI endpoint after 30s" }
157
+ },
158
+ "status": "partial",
159
+ "error": {
160
+ "code": "evaluatorsFailed",
161
+ "message": "Agent response obtained. 1 of 2 evaluators failed to run."
162
+ }
163
+ },
164
+ {
165
+ "prompt": "What's the average temperature in Seattle in March?",
166
+ "expected_response": "Around 50°F (10°C).",
167
+ "response": "Seattle averages 80°F in March.",
168
+ "scores": {
169
+ "relevance": { "score": 1.0, "result": "fail", "threshold": 3, "reason": "Response is factually incorrect — Seattle's March averages are far below 80°F." },
170
+ "coherence": { "score": 2.0, "result": "fail", "threshold": 3, "reason": "Response contradicts well-known regional climate data." }
171
+ },
172
+ "status": "fail"
173
+ }
174
+ ],
175
+ "summary": {
176
+ "turns_total": 3,
177
+ "turns_passed": 1,
178
+ "turns_failed": 1,
179
+ "turns_partial": 1,
180
+ "turns_errored": 0,
181
+ "overall_status": "partial"
182
+ }
183
+ },
184
+
185
+ {
186
+ "extensions": {
187
+ "scenario": "8-multi-turn-error-any-errored-turn",
188
+ "notes": "3-turn thread with a mid-conversation request failure. Turn 1 uniformly passed; Turn 2's request failed; Turn 3 was downstream-skipped. Per-turn statuses: [pass, error, error] — under FR-004's priority rules, any errored turn drives the thread to error (the run didn't complete). The two error turns carry distinct error codes (agentRequestFailed vs turnSkipped) demonstrating the cascade. Summary invariant: 1+0+0+2=3.",
189
+ "per_turn_statuses": ["pass", "error", "error"],
190
+ "expected_overall_status": "error"
191
+ },
192
+ "name": "Conversation that aborted mid-thread",
193
+ "conversation_id": "conv-abc-008",
194
+ "turns": [
195
+ {
196
+ "prompt": "I'd like to plan a trip from SFO.",
197
+ "expected_response": "I can help with travel planning from SFO.",
198
+ "response": "Sure — I can help plan a trip from SFO.",
199
+ "scores": {
200
+ "relevance": { "score": 5.0, "result": "pass", "threshold": 3 },
201
+ "coherence": { "score": 5.0, "result": "pass", "threshold": 3 }
202
+ },
203
+ "status": "pass"
204
+ },
205
+ {
206
+ "prompt": "Book me a flight from SFO to SEA next Tuesday.",
207
+ "expected_response": "I can help with flight queries.",
208
+ "response": "",
209
+ "scores": {},
210
+ "status": "error",
211
+ "error": {
212
+ "code": "agentRequestFailed",
213
+ "message": "Agent request failed: DNS resolution failed for agent endpoint"
214
+ }
215
+ },
216
+ {
217
+ "prompt": "Prefer morning departure, aisle seat.",
218
+ "expected_response": "Noted.",
219
+ "response": "",
220
+ "scores": {},
221
+ "status": "error",
222
+ "error": {
223
+ "code": "turnSkipped",
224
+ "message": "Turn not attempted: preceding turn failed"
225
+ }
226
+ }
227
+ ],
228
+ "summary": {
229
+ "turns_total": 3,
230
+ "turns_passed": 1,
231
+ "turns_failed": 0,
232
+ "turns_partial": 0,
233
+ "turns_errored": 2,
234
+ "overall_status": "error"
235
+ }
236
+ }
237
+
238
+ ]
239
+ }
@@ -39,19 +39,14 @@ CITATIONS = "Citations"
39
39
  EXACT_MATCH = "ExactMatch"
40
40
  PARTIAL_MATCH = "PartialMatch"
41
41
 
42
- # Prerequisite constants
43
- REQUIRES_AZURE_OPENAI = "azure_openai"
44
- REQUIRES_TOOL_DEFINITIONS = "tool_definitions"
45
-
46
- # Evaluation status constants
47
- # Outcome statuses (agent responded, evaluators ran):
48
- STATUS_PASS = "pass" # All evaluators scored above threshold
49
- STATUS_FAIL = "fail" # At least one evaluator scored below threshold
50
- # Error state (evaluation couldn't complete):
51
- STATUS_ERROR = "error" # API call failed / response couldn't be obtained
52
- # Thread-level aggregate status (multi-turn only):
53
- STATUS_PARTIAL = "partial" # Some turns passed, some did not
54
- # Fallback for missing status:
42
+ # Evaluation status constants — four-value enum used at the turn/item level
43
+ # AND the thread-level overall_status. See status_derivation.py for the
44
+ # canonical derivation and rollup rules.
45
+ STATUS_PASS = "pass"
46
+ STATUS_FAIL = "fail"
47
+ STATUS_PARTIAL = "partial"
48
+ STATUS_ERROR = "error"
49
+ # Internal-only sentinel; never appears in emitted output.
55
50
  STATUS_UNKNOWN = "unknown"
56
51
 
57
52
  # System defaults when no file-level or env-level defaults are configured
@@ -77,7 +72,6 @@ METRIC_IDS = {
77
72
  @dataclass
78
73
  class RegistryEntry:
79
74
  type: str # "llm", "tool", or "non-llm"
80
- requires: List[str]
81
75
  default_threshold: Optional[float]
82
76
 
83
77
 
@@ -0,0 +1,91 @@
1
+ """Canonical error-message templates for persisted evaluation output.
2
+
3
+ Every error string written into a JSON/CSV/HTML output file MUST be produced
4
+ by a builder in this module. The builders accept only string arguments — never
5
+ exception objects — which keeps ``repr(exc)``, ``traceback.format_exc()``, and
6
+ SDK class names out of persisted output by construction.
7
+
8
+ Two flavours:
9
+
10
+ * **Turn/item-level `ErrorObject` builders** return a structured ``{code, message}``
11
+ dict for the top-level ``error`` field on a turn or single-turn item. Used
12
+ when ``status == "error"`` (cause) or ``status == "partial"`` with at least
13
+ one errored evaluator (summary).
14
+
15
+ * **Per-evaluator string builders** return a flat string formatted as
16
+ ``"<category prefix>: <detail>"`` for the ``error`` field inside an
17
+ ``ErroredScore`` entry. Evaluator identity is encoded by the ``scores`` map's
18
+ parent property key, so no ``code`` is needed at this level.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ from typing import TypedDict
24
+
25
+
26
+ class ErrorObject(TypedDict):
27
+ """Turn/item-level top-level error shape — `{code, message}` per ErrorObject $def."""
28
+ code: str
29
+ message: str
30
+
31
+
32
+ # ── Error code constants ──────────────────────────────────────────────
33
+ # These are the canonical machine-readable codes emitted in the top-level
34
+ # `error.code` field. They appear in persisted output and consumer-facing
35
+ # documentation; treat the string values as part of the public contract
36
+ # (do not rename without a schema-version bump).
37
+
38
+ ERROR_CODE_AGENT_REQUEST_FAILED = "agentRequestFailed"
39
+ ERROR_CODE_TURN_SKIPPED = "turnSkipped"
40
+ ERROR_CODE_EVALUATORS_FAILED = "evaluatorsFailed"
41
+
42
+
43
+ # ── Turn/item-level ErrorObject builders ──────────────────────────────
44
+
45
+
46
+ def agent_request_failed(exc_message: str) -> ErrorObject:
47
+ """`status == "error"` cause when the agent client raised — no response obtained."""
48
+ return {
49
+ "code": ERROR_CODE_AGENT_REQUEST_FAILED,
50
+ "message": f"Agent request failed: {exc_message}",
51
+ }
52
+
53
+
54
+ def turn_skipped() -> ErrorObject:
55
+ """`status == "error"` cause for downstream turns after a preceding turn failed.
56
+
57
+ Synthesized cause — no exception text appended (FR-013).
58
+ """
59
+ return {
60
+ "code": ERROR_CODE_TURN_SKIPPED,
61
+ "message": "Turn not attempted: preceding turn failed",
62
+ }
63
+
64
+
65
+ def evaluators_failed_summary(error_count: int, total: int) -> ErrorObject:
66
+ """`status == "partial"` summary when at least one evaluator returned `result: "error"`.
67
+
68
+ Unified template regardless of error_count vs total — per-evaluator detail
69
+ (crash vs missing-prereq, with optional exception text) lives in `scores`.
70
+ """
71
+ return {
72
+ "code": ERROR_CODE_EVALUATORS_FAILED,
73
+ "message": f"Agent response obtained. {error_count} of {total} evaluators failed to run.",
74
+ }
75
+
76
+
77
+ # ── Per-evaluator string builders (inside `scores`) ──────────────────
78
+
79
+
80
+ def evaluator_failed(exc_message: str) -> str:
81
+ """Per-evaluator `error` string when the evaluator raised during run."""
82
+ return f"Evaluator failed: {exc_message}"
83
+
84
+
85
+ # Per-evaluator prerequisite-miss builders are not present today because no
86
+ # reachable prereq-fail exists: `validate_environment()` exits the process if
87
+ # Azure OpenAI config is missing, and no registered evaluator has a
88
+ # data-dependent prereq. When that changes, add builders following the
89
+ # convention `"Evaluator missing prerequisites: <description>"` and wire a
90
+ # prereq check in evaluation_runner. See specs/236-unified-error-output for
91
+ # the deferred sub-cases.