@microsoft/m365-copilot-eval 1.5.0-preview.1 → 1.7.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,5 @@
 """Core evaluation pipeline — evaluator dispatch, retry, parallel execution."""
 
-import json
 import time
 from dataclasses import dataclass, field
 from enum import Enum
@@ -27,8 +26,6 @@ from common import (
     CITATIONS,
     EXACT_MATCH,
     PARTIAL_MATCH,
-    REQUIRES_AZURE_OPENAI,
-    REQUIRES_TOOL_DEFINITIONS,
     METRIC_IDS,
     MAX_ATTEMPTS,
     MAX_CONCURRENCY,
@@ -37,17 +34,17 @@ from common import (
     STATUS_FAIL,
     STATUS_ERROR,
     STATUS_PARTIAL,
-    STATUS_UNKNOWN,
     MAX_TURNS_PER_THREAD,
     LONG_THREAD_WARNING_THRESHOLD,
     RunConfig,
 )
+from error_messages import agent_request_failed, evaluator_failed, turn_skipped
+from status_derivation import rollup_thread_status, status_for_response
 from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat
 from custom_evaluators.ExactMatchEvaluator import ExactMatchEvaluator
 from custom_evaluators.PartialMatchEvaluator import PartialMatchEvaluator
 from evaluator_resolver import (
     validate_evaluator_names,
-    check_prerequisites,
     resolve_evaluators_for_prompt,
     get_evaluator_threshold,
 )
@@ -113,7 +110,12 @@ def detect_item_type(item: dict) -> ItemType:
 
 
 def _decorate_metric(metric_id: str, data, threshold: Optional[float] = None) -> Dict[str, Any]:
-    """Augment raw evaluator output with standardized threshold + pass/fail result."""
+    """Augment raw evaluator output with standardized threshold + pass/fail result.
+
+    Raises ValueError if the SDK returned a malformed result (no numeric score
+    under ``metric_id``). The outer try/except in :func:`_run_evaluators_for_item`
+    catches it and emits a standard ``evaluator_failed`` error entry.
+    """
     pass_threshold = threshold if threshold is not None else DEFAULT_PASS_THRESHOLD
     payload = {}
     if isinstance(data, dict):
@@ -125,12 +127,12 @@ def _decorate_metric(metric_id: str, data, threshold: Optional[float] = None) ->
     if isinstance(data, dict):
         if metric_id in data:
             score_val = data[metric_id]
-            if isinstance(score_val, (int, float)):
-                payload['threshold'] = pass_threshold
-                payload['result'] = STATUS_PASS if score_val >= pass_threshold else STATUS_FAIL
-            else:
-                payload['threshold'] = pass_threshold
-                payload.setdefault('result', STATUS_UNKNOWN)
+            if not isinstance(score_val, (int, float)):
+                raise ValueError(
+                    f"non-numeric score from evaluator (metric_id={metric_id!r}, score={score_val!r})"
+                )
+            payload['threshold'] = pass_threshold
+            payload['result'] = STATUS_PASS if score_val >= pass_threshold else STATUS_FAIL
     return payload
 
 
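To make the new contract concrete, a decorated metric now either carries a numeric score with a threshold verdict or never reaches the results at all. A small sketch (the metric id and score are made-up, and the elided lines that copy the raw evaluator fields into `payload` sit outside this hunk):

    # Hypothetical SDK output for a metric whose id happens to be "relevance".
    raw = {"relevance": 4.0}

    decorated = _decorate_metric("relevance", raw, threshold=3)
    # 4.0 >= 3, so the decorated payload carries:
    #   decorated["threshold"] == 3
    #   decorated["result"] == STATUS_PASS

    # A malformed result such as {"relevance": None} now raises ValueError instead of
    # being recorded with the removed STATUS_UNKNOWN; _run_evaluators_for_item catches
    # it and stores an errored entry for that evaluator.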
@@ -141,36 +143,20 @@ def _run_evaluators_for_item(
     enhanced_response: Dict[str, Any],
     resolved_evaluators: Dict[str, Any],
     model_config: AzureOpenAIModelConfiguration,
-    has_azure_openai: bool,
-    m365_agent_id: Optional[str],
-) -> Tuple[Dict[str, Optional[str]], List[str]]:
+    context_label: str = "",
+) -> Tuple[Dict[str, Dict[str, Any]], List[str]]:
     """Run resolved evaluators against a single item/turn.
 
-    Returns (results_dict, evaluators_ran).
+    Each value in results_dict is a decorated metric dict on success or an
+    errored entry ``{result: "error", error: "Evaluator failed: <exc.message>", threshold}``
+    on crash. The ``threshold`` is included on errored entries so the aggregate
+    report can still display it; the persisted ErroredScore shape strips it
+    out at write time (see ``_as_errored_score`` in result_writer).
     """
-    has_tool_defs = bool(
-        m365_agent_id and enhanced_response.get("tool_definitions")
-    )
-    available_context = {
-        REQUIRES_AZURE_OPENAI: has_azure_openai,
-        REQUIRES_TOOL_DEFINITIONS: has_tool_defs,
-    }
-
-    results_dict: Dict[str, Optional[str]] = {}
+    results_dict: Dict[str, Dict[str, Any]] = {}
     evaluators_ran: List[str] = []
 
     for eval_name, eval_options in resolved_evaluators.items():
-        can_run, warn_msg = check_prerequisites(eval_name, available_context)
-        if not can_run:
-            if warn_msg:
-                emit_structured_log(
-                    "warning",
-                    f"Evaluator '{eval_name}' prerequisite check failed: {warn_msg}",
-                    operation=Operation.EVALUATE,
-                )
-            results_dict[eval_name] = None
-            continue
-
         threshold = get_evaluator_threshold(eval_name, eval_options)
 
         try:
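So a skipped-evaluator `None` placeholder no longer appears in `results_dict`; every key now maps either to a decorated metric or to an errored entry. A sketch with made-up metric ids, scores and exception text:

    results_dict = {
        # Evaluator ran: decorated by _decorate_metric (score, threshold, pass/fail verdict).
        "relevance": {"relevance": 4.0, "threshold": 3, "result": "pass"},
        # Evaluator crashed: only the scrubbed message is persisted (evaluator_failed(exc_msg));
        # the full exception class and traceback go to the structured log stream only.
        "groundedness": {
            "result": "error",
            "error": "Evaluator failed: connection reset by peer",
            "threshold": 3,
        },
    }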
@@ -215,33 +201,39 @@
 
             evaluators_ran.append(eval_name)
         except Exception as e:
+            # Full exception detail goes to the log stream (FR-009). Persisted
+            # output gets the scrubbed text from error_messages.evaluator_failed
+            # — exception.message only, never repr / class name / traceback.
+            where = f" on response for {context_label}" if context_label else ""
             emit_structured_log(
                 "error",
-                f"Evaluator '{eval_name}' crashed and will be omitted from results: {e}",
+                f"Evaluator '{eval_name}' crashed{where}: {e}",
                 operation=Operation.EVALUATE,
             )
-            results_dict[eval_name] = None
+            exc_msg = getattr(e, "message", None) or str(e)
+            results_dict[eval_name] = {
+                "result": STATUS_ERROR,
+                "error": evaluator_failed(exc_msg),
+                "threshold": threshold,
+            }
 
     return results_dict, evaluators_ran
 
 
-def _check_all_passed(results_dict: Dict[str, Optional[Dict[str, Any]]]) -> bool:
-    """Check if all evaluator results passed. Skipped evaluators (None) are ignored."""
-    for result_data in results_dict.values():
-        if result_data is None:
-            continue
-        if result_data.get("result") == STATUS_FAIL:
-            return False
-    return True
+def _collect_evaluator_results(results_dict: Dict[str, Dict[str, Any]]) -> List[str]:
+    """Extract per-evaluator ``result`` values (one of pass/fail/error) for status derivation."""
+    return [
+        d["result"] for d in results_dict.values()
+        if d.get("result") in (STATUS_PASS, STATUS_FAIL, STATUS_ERROR)
+    ]
 
 
 def _evaluate_multi_turn_responses(
     turns: List[Dict],
-    m365_agent_id: Optional[str],
     effective_log_level: str,
     default_evaluators: Dict[str, Any],
     model_config: AzureOpenAIModelConfiguration,
-    has_azure_openai: bool,
+    thread_name: str = "",
 ) -> Tuple[List[Dict], Dict]:
     """Run per-turn evaluations and build evaluated turn results with summary.
 
@@ -251,8 +243,6 @@ def _evaluate_multi_turn_responses(
     and optionally error. Does not mutate the input turns.
     """
     evaluated_turns: List[Dict] = []
-    turns_passed = 0
-    turns_failed = 0
 
     for i, turn in enumerate(turns):
         evaluated_turn: Dict[str, Any] = {
@@ -268,9 +258,10 @@
             evaluated_turn["evaluators_mode"] = turn["evaluators_mode"]
 
         if turn.get("status") == STATUS_ERROR:
+            # Request-failure or downstream-skip turn — error already set upstream.
            evaluated_turn["status"] = STATUS_ERROR
-            evaluated_turn["error"] = turn.get("error", "")
-            turns_failed += 1
+            if "error" in turn:
+                evaluated_turn["error"] = turn["error"]
             evaluated_turns.append(evaluated_turn)
             continue
 
@@ -282,16 +273,22 @@
             turn.get("prompt", ""), default_evaluators,
         )
 
+        thread_part = f" of '{thread_name}'" if thread_name else ""
+        turn_label = f"turn {i + 1}/{len(turns)}{thread_part}"
         results_dict, evaluators_ran = _run_evaluators_for_item(
             turn.get("prompt", ""), actual_response, turn.get("expected_response", ""),
-            enhanced_response, resolved, model_config, has_azure_openai, m365_agent_id,
+            enhanced_response, resolved, model_config,
+            context_label=turn_label,
         )
 
-        all_passed = _check_all_passed(results_dict)
+        evaluator_result_values = _collect_evaluator_results(results_dict)
+        status, error_obj = status_for_response(evaluator_result_values)
 
         evaluated_turn["results"] = results_dict
         evaluated_turn["evaluators_ran"] = evaluators_ran
-        evaluated_turn["status"] = STATUS_PASS if all_passed else STATUS_FAIL
+        evaluated_turn["status"] = status
+        if error_obj is not None:
+            evaluated_turn["error"] = error_obj
 
         if effective_log_level == "debug":
             emit_structured_log(
@@ -302,26 +299,17 @@
                 operation=Operation.EVALUATE,
             )
 
-        if all_passed:
-            turns_passed += 1
-        else:
-            turns_failed += 1
-
         evaluated_turns.append(evaluated_turn)
 
-    turns_total = len(turns)
-    if turns_passed == turns_total:
-        overall_status = STATUS_PASS
-    elif turns_failed == turns_total:
-        overall_status = STATUS_FAIL
-    else:
-        overall_status = STATUS_PARTIAL
-
+    turn_statuses = [t.get("status", STATUS_ERROR) for t in evaluated_turns]
+    turns_total = len(evaluated_turns)
     summary = {
         "turns_total": turns_total,
-        "turns_passed": turns_passed,
-        "turns_failed": turns_failed,
-        "overall_status": overall_status,
+        "turns_passed": sum(1 for s in turn_statuses if s == STATUS_PASS),
+        "turns_failed": sum(1 for s in turn_statuses if s == STATUS_FAIL),
+        "turns_partial": sum(1 for s in turn_statuses if s == STATUS_PARTIAL),
+        "turns_errored": sum(1 for s in turn_statuses if s == STATUS_ERROR),
+        "overall_status": rollup_thread_status(turn_statuses),
     }
 
     return evaluated_turns, summary
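The thread summary grows matching per-status counters. For example, a four-turn thread with one turn in each state would now be summarized roughly like this (the overall value is an assumption; the exact precedence rules live in `status_derivation`, which is not part of this diff):

    summary = {
        "turns_total": 4,
        "turns_passed": 1,
        "turns_failed": 1,
        "turns_partial": 1,
        "turns_errored": 1,
        "overall_status": "partial",  # assumed result of rollup_thread_status for a mixed thread
    }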
@@ -330,10 +318,8 @@ def _evaluate_multi_turn_responses(
 def _evaluate_single_response(
     enhanced_response: Dict[str, Any],
     eval_item: Dict,
-    m365_agent_id: Optional[str],
     effective_log_level: str,
     model_config: AzureOpenAIModelConfiguration,
-    has_azure_openai: bool,
     default_evaluators: Dict[str, Any],
 ) -> Dict[str, Any]:
     """Run all evaluators for a single prompt/response pair and return the result dict."""
@@ -348,10 +334,14 @@ def _evaluate_single_response(
 
     results_dict, evaluators_ran = _run_evaluators_for_item(
         prompt, actual_response_text, expected_response, enhanced_response,
-        resolved, model_config, has_azure_openai, m365_agent_id,
+        resolved, model_config,
+        context_label=f"prompt '{prompt[:60]}'" if prompt else "",
     )
 
-    evaluation_result = {
+    evaluator_result_values = _collect_evaluator_results(results_dict)
+    status, error_obj = status_for_response(evaluator_result_values)
+
+    evaluation_result: Dict[str, Any] = {
         "prompt": prompt,
         "response": enhanced_response.get(
             "display_response_text", actual_response_text
@@ -359,7 +349,10 @@
         "expected_response": expected_response,
         "evaluators_ran": evaluators_ran,
         "results": results_dict,
+        "status": status,
     }
+    if error_obj is not None:
+        evaluation_result["error"] = error_obj
 
     if "evaluators" in eval_item:
         evaluation_result["evaluators"] = eval_item["evaluators"]
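A single-prompt result therefore now carries an explicit status, with an error object only when one is derived. A rough sketch with illustrative values (whether and what `error` contains depends on `status_for_response`, which is not shown in this diff):

    evaluation_result = {
        "prompt": "What is the travel reimbursement limit?",   # illustrative
        "response": "The limit is 150 USD per day.",           # illustrative
        "expected_response": "150 USD per day",                # illustrative
        "evaluators_ran": ["relevance", "similarity"],
        "results": {
            "relevance": {"relevance": 4.0, "threshold": 3, "result": "pass"},
            "similarity": {"similarity": 3.5, "threshold": 3, "result": "pass"},
        },
        "status": "pass",   # derived by status_for_response; "error" is added only when error_obj is not None
    }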
@@ -485,7 +478,7 @@ def run_pipeline(
                     "evaluators_ran": [],
                     "results": {},
                     "status": STATUS_ERROR,
-                    "errorDetails": str(exc),
+                    "error": agent_request_failed(getattr(exc, "message", None) or str(exc)),
                 }
 
             delay = retry_after if retry_after is not None else pipeline.get_backoff_seconds(attempt)
@@ -493,9 +486,8 @@
 
         # Phase B: Evaluate response
         return _evaluate_single_response(
-            response, eval_item, config.m365_agent_id, config.effective_log_level,
-            pipeline.model_config, pipeline.has_azure_openai,
-            pipeline.default_evaluators,
+            response, eval_item, config.effective_log_level,
+            pipeline.model_config, pipeline.default_evaluators,
         )
 
     def _process_multi_turn(eval_item: Dict, index: int) -> Dict[str, Any]:
@@ -522,7 +514,7 @@ def run_pipeline(
         conversation_context = None
         conversation_id = None
         enriched_turns: List[Dict[str, Any]] = []
-        failed = False
+        failure_exception: Optional[Exception] = None
 
         for i, turn in enumerate(turns):
             prompt = turn["prompt"]
@@ -557,28 +549,34 @@
                         continue
 
                     # All other errors: stop the thread
+                    if status == 429:
+                        note = ""  # 429 retries were exhausted; the attempt count is enough.
+                    else:
+                        status_part = f"HTTP {status}" if status else "this error"
+                        note = f" ({status_part} is not retried in multi-turn to avoid duplicate turns in the conversation)"
                     emit_structured_log(
                         "error",
-                        f"Turn {i + 1}/{len(turns)} of '{thread_name}' failed after {attempt} attempt(s): {exc}",
+                        f"Turn {i + 1}/{len(turns)} of '{thread_name}' failed after {attempt} attempt(s){note}: {exc}",
                         operation=Operation.SEND_PROMPT,
                     )
-                    failed = True
+                    failure_exception = exc
                     break
 
-            if failed:
-                # Mark this turn and all remaining turns as error
+            if failure_exception is not None:
+                # Failing turn carries the cause; downstream turns are skipped.
+                exc_msg = getattr(failure_exception, "message", None) or str(failure_exception)
                 enriched_turns.append({
                     **turn,
                     "response": "",
                     "status": STATUS_ERROR,
-                    "error": "Failed to get response from agent",
+                    "error": agent_request_failed(exc_msg),
                 })
                 for j in range(i + 1, len(turns)):
                     enriched_turns.append({
                         **turns[j],
                         "response": "",
                         "status": STATUS_ERROR,
-                        "error": "Skipped: preceding turn failed",
+                        "error": turn_skipped(),
                     })
                 break
 
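After a mid-thread request failure the enriched turn list now distinguishes the cause from the fallout: the failing turn carries `agent_request_failed(...)` and every later turn carries `turn_skipped()`. A sketch (prompts and the exception message are placeholders; the actual error texts are whatever `error_messages` produces):

    from error_messages import agent_request_failed, turn_skipped  # same import the pipeline adds above

    enriched_turns = [
        {"prompt": "turn 1 prompt", "response": "agent reply text"},         # succeeded before the failure
        {"prompt": "turn 2 prompt", "response": "", "status": "error",
         "error": agent_request_failed("HTTP 500 from the agent")},          # the turn that failed
        {"prompt": "turn 3 prompt", "response": "", "status": "error",
         "error": turn_skipped()},                                           # never sent
    ]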
@@ -596,10 +594,10 @@
 
         # Phase B: Run per-turn evaluations
         evaluated_turns, summary = _evaluate_multi_turn_responses(
-            enriched_turns, config.m365_agent_id, config.effective_log_level,
+            enriched_turns, config.effective_log_level,
             pipeline.default_evaluators,
+            thread_name=thread_name,
             model_config=pipeline.model_config,
-            has_azure_openai=pipeline.has_azure_openai,
         )
 
         return {
@@ -621,21 +619,34 @@
         if wr.error:
             idx = wr.index
             item = eval_items[idx]
+            exc_msg = getattr(wr.error, "message", None) or str(wr.error)
+            cause_error = agent_request_failed(exc_msg)
             if item_types[idx] == ItemType.MULTI_TURN:
+                # Worker raised before any turn ran. Turn 1 carries the cause;
+                # remaining turns are downstream-skipped. All turns errored →
+                # thread overall_status="error".
+                turns = item.get("turns", [])
+                turn_dicts = []
+                for j, t in enumerate(turns):
+                    turn_dicts.append({
+                        **t,
+                        "response": "",
+                        "results": {},
+                        "status": STATUS_ERROR,
+                        "error": cause_error if j == 0 else turn_skipped(),
+                    })
                 ordered_results.append({
                     "type": "multi_turn",
                     "name": item.get("name", ""),
-                    "turns": [
-                        {**t, "status": STATUS_ERROR, "error": str(wr.error), "response": "", "results": {}}
-                        for t in item.get("turns", [])
-                    ],
+                    "turns": turn_dicts,
                     "summary": {
-                        "turns_total": len(item.get("turns", [])),
+                        "turns_total": len(turns),
                         "turns_passed": 0,
-                        "turns_failed": len(item.get("turns", [])),
-                        "overall_status": STATUS_FAIL,
+                        "turns_failed": 0,
+                        "turns_partial": 0,
+                        "turns_errored": len(turns),
+                        "overall_status": STATUS_ERROR,
                     },
-                    "error": str(wr.error),
                 })
             else:
                 ordered_results.append({
@@ -645,7 +656,7 @@
                     "evaluators_ran": [],
                     "results": {},
                     "status": STATUS_ERROR,
-                    "errorDetails": str(wr.error),
+                    "error": cause_error,
                 })
         else:
             ordered_results.append(wr.value)
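When the worker itself raises before any turn runs, the ordered result for a multi-turn item now looks roughly like this (name, prompts and turn count are placeholders):

    from error_messages import agent_request_failed, turn_skipped  # mirrors the pipeline's import

    cause_error = agent_request_failed("worker exception message")  # placeholder message
    failed_item_result = {
        "type": "multi_turn",
        "name": "example-thread",
        "turns": [
            {"prompt": "turn 1", "response": "", "results": {}, "status": "error",
             "error": cause_error},       # first turn carries the cause
            {"prompt": "turn 2", "response": "", "results": {}, "status": "error",
             "error": turn_skipped()},    # later turns are marked as skipped
        ],
        "summary": {
            "turns_total": 2,
            "turns_passed": 0,
            "turns_failed": 0,
            "turns_partial": 0,
            "turns_errored": 2,
            "overall_status": "error",
        },
    }

Note that the former top-level "error" field and the blanket overall_status of "fail" are gone. The remaining hunks below touch the evaluator registry module imported above as `evaluator_resolver`, where the prerequisite machinery this release drops used to live.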
@@ -6,7 +6,7 @@ with file-level defaults and system defaults, following extend/replace modes.
 
 import difflib
 import logging
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Optional
 
 from common import (
     RELEVANCE,
@@ -16,7 +16,6 @@ from common import (
     CITATIONS,
     EXACT_MATCH,
     PARTIAL_MATCH,
-    REQUIRES_AZURE_OPENAI,
     SYSTEM_DEFAULT_EVALUATORS,
     RegistryEntry,
 )
@@ -26,13 +25,13 @@ logger = logging.getLogger(__name__)
 
 # Static registry of available evaluators per data-model.md
 EVALUATOR_REGISTRY: Dict[str, RegistryEntry] = {
-    RELEVANCE: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
-    COHERENCE: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
-    GROUNDEDNESS: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
-    SIMILARITY: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3),
-    CITATIONS: RegistryEntry(type="non-llm", requires=[], default_threshold=1),
-    EXACT_MATCH: RegistryEntry(type="non-llm", requires=[], default_threshold=None),
-    PARTIAL_MATCH: RegistryEntry(type="non-llm", requires=[], default_threshold=0.5),
+    RELEVANCE: RegistryEntry(type="llm", default_threshold=3),
+    COHERENCE: RegistryEntry(type="llm", default_threshold=3),
+    GROUNDEDNESS: RegistryEntry(type="llm", default_threshold=3),
+    SIMILARITY: RegistryEntry(type="llm", default_threshold=3),
+    CITATIONS: RegistryEntry(type="non-llm", default_threshold=1),
+    EXACT_MATCH: RegistryEntry(type="non-llm", default_threshold=None),
+    PARTIAL_MATCH: RegistryEntry(type="non-llm", default_threshold=0.5),
 }
 
 
@@ -71,30 +70,6 @@ def validate_evaluator_names(evaluator_map: Dict[str, Any]) -> None:
         raise ValueError("\n".join(lines))
 
 
-def check_prerequisites(
-    evaluator_name: str,
-    available_context: Dict[str, bool],
-) -> Tuple[bool, Optional[str]]:
-    """Check if prerequisites for an evaluator are available.
-
-    Returns (True, None) if all prerequisites are met, or
-    (False, warning_message) if a prerequisite is missing.
-    """
-    registry_entry = EVALUATOR_REGISTRY.get(evaluator_name)
-    if not registry_entry:
-        return False, f"Unknown evaluator: {evaluator_name}"
-
-    for req in registry_entry.requires:
-        if not available_context.get(req, False):
-            msg = (
-                f"Skipping evaluator '{evaluator_name}': "
-                f"missing prerequisite '{req}'"
-            )
-            return False, msg
-
-    return True, None
-
-
 def resolve_default_evaluators(file_defaults: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
     """Resolve effective default evaluators, falling back to system defaults.