npm - @microsoft/m365-copilot-eval - Versions diffs - 1.2.1-preview.1 → 1.3.0-preview.1 - Mend

@microsoft/m365-copilot-eval 1.2.1-preview.1 → 1.3.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

package/README.md +11 -4
package/package.json +2 -2
package/schema/CHANGELOG.md +8 -0
package/schema/v1/eval-document.schema.json +117 -1
package/schema/v1/examples/valid/comprehensive.json +27 -2
package/schema/version.json +2 -2
package/src/clients/cli/cli_logging/__init__.py +0 -0
package/src/clients/cli/cli_logging/console_diagnostics.py +55 -0
package/src/clients/cli/cli_logging/logging_utils.py +145 -0
package/src/clients/cli/common.py +51 -0
package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +3 -3
package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +11 -11
package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +1 -11
package/src/clients/cli/evaluator_resolver.py +150 -0
package/src/clients/cli/generate_report.py +130 -110
package/src/clients/cli/main.py +513 -236
package/src/clients/cli/readme.md +14 -7
package/src/clients/cli/response_extractor.py +32 -14
package/src/clients/node-js/bin/runevals.js +58 -28
package/src/clients/node-js/config/default.js +1 -1

package/README.md CHANGED Viewed

@@ -265,10 +265,18 @@ runevals --prompts "What is Microsoft Graph?" --expected "Gateway to M365 data"
 # Interactive mode (enter prompts interactively)
 runevals --interactive
+# Canonical logging verbosity
+runevals --log-level debug
+runevals --log-level info
+runevals --log-level warning
+runevals --log-level error
 # Custom output location in your project
 runevals --output ./reports/results.html
 ```
+> **⚠️ Debug log safety notice:** The `--log-level debug` option is opt-in and may include raw API payloads and response data in console output. Redaction is pattern-based (API keys, tokens, bearer authorization values, passwords, and strings of 32 or more characters that mix upper case, lower case, and digits) and **will not catch arbitrary PII or custom credentials** embedded in prompts or responses. Do not share debug-level output publicly without manual review.
 ### Optional: Add Shortcuts to package.json
 You can add shortcuts (npm scripts) to your agent project's `package.json`:
@@ -320,8 +328,7 @@ runevals --output results.csv
 ```bash
 Options:
   -V, --version                 output version number
-  -v, --verbose                 show detailed processing steps
-  -q, --quiet                   minimal output
+  --log-level [level]           log level: debug|info|warning|error (bare flag -> info)
   --prompts <prompts...>        inline prompts to evaluate
   --expected <responses...>     expected responses (with --prompts)
   --prompts-file <file>         JSON file with prompts
@@ -360,7 +367,7 @@ runevals cache-info
 # Clear and rebuild
 runevals cache-clear
-runevals --init-only --verbose
+runevals --init-only --log-level debug
 ```
 ### Network/Proxy Issues
@@ -369,7 +376,7 @@ runevals --init-only --verbose
 export HTTPS_PROXY=http://proxy:8080
 # Retry with debug-level output
-runevals --init-only --verbose
+runevals --init-only --log-level debug
 ```
 ### Permission Issues

package/package.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "name": "@microsoft/m365-copilot-eval",
-  "version": "1.2.1-preview.1",
+  "version": "1.3.0-preview.1",
   "minCliVersion": "1.0.1-preview.1",
   "description": "Zero-config Node.js wrapper for M365 Copilot Agent Evaluations CLI (Python-based Azure AI Evaluation SDK)",
-  "publishDate": "2026-03-23",
+  "publishDate": "2026-04-01",
   "main": "src/clients/node-js/lib/index.js",
   "type": "module",
   "bin": {

package/schema/CHANGELOG.md CHANGED Viewed

@@ -5,6 +5,14 @@ All notable changes to the eval document schema will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [1.1.0](https://github.com/microsoft/M365-Copilot-Agent-Evals/compare/schema-v1.0.0...schema-v1.1.0) (2026-03-30)
+### Features
+* **WI-6855059:** add agentName/cliVersion to schema, fix duplicate prompt loss, include default_evaluators in output ([#181](https://github.com/microsoft/M365-Copilot-Agent-Evals/issues/181)) ([9321474](https://github.com/microsoft/M365-Copilot-Agent-Evals/commit/93214746144e9d11f507433eff185aefac4a858a))
+* **WI-6855059:** implement per-prompt evaluator configuration ([#168](https://github.com/microsoft/M365-Copilot-Agent-Evals/issues/168)) ([eface7e](https://github.com/microsoft/M365-Copilot-Agent-Evals/commit/eface7e7041b118681cd4c68582fe903640bf6c0))
 ## [1.0.0] - 2026-02-19
 ### Added

package/schema/v1/eval-document.schema.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "$schema": "https://json-schema.org/draft/2020-12/schema",
   "$id": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
   "title": "M365 Copilot Eval Document",
-  "description": "Schema for evaluation documents used by M365 Copilot Agent Evals CLI. Version 1.0.0.",
+  "description": "Schema for evaluation documents used by M365 Copilot Agent Evals CLI. Version 1.1.0.",
   "type": "object",
   "required": ["schemaVersion", "items"],
   "additionalProperties": true,
@@ -21,6 +21,10 @@
     "metadata": {
       "$ref": "#/$defs/DocumentMetadata"
     },
+    "default_evaluators": {
+      "$ref": "#/$defs/EvaluatorMap",
+      "description": "File-level default evaluators (overrides system defaults)"
+    },
     "items": {
       "type": "array",
       "minItems": 1,
@@ -69,6 +73,14 @@
           "type": "string",
           "description": "M365 Agent ID this evaluation targets"
         },
+        "agentName": {
+          "type": "string",
+          "description": "Name of the M365 agent this evaluation targets"
+        },
+        "cliVersion": {
+          "type": "string",
+          "description": "Version of the M365 Copilot Agent Evals CLI that produced this document"
+        },
         "extensions": {
           "type": "object",
           "additionalProperties": true,
@@ -99,6 +111,16 @@
           "type": "string",
           "description": "Additional context for grounding evaluation"
         },
+        "evaluators": {
+          "$ref": "#/$defs/EvaluatorMap",
+          "description": "Per-prompt evaluator overrides"
+        },
+        "evaluators_mode": {
+          "type": "string",
+          "enum": ["extend", "replace"],
+          "default": "extend",
+          "description": "How per-prompt evaluators combine with defaults"
+        },
         "citations": {
           "type": "array",
           "items": {
@@ -140,6 +162,14 @@
         "citations": {
           "$ref": "#/$defs/CitationScore",
           "description": "Citation evaluation results"
+        },
+        "exactMatch": {
+          "$ref": "#/$defs/ExactMatchScore",
+          "description": "Exact match evaluation result"
+        },
+        "partialMatch": {
+          "$ref": "#/$defs/PartialMatchScore",
+          "description": "Partial match evaluation result"
         }
       }
     },
@@ -211,6 +241,92 @@
         }
       }
     },
+    "ExactMatchScore": {
+      "type": "object",
+      "description": "Exact match evaluation result",
+      "required": ["match", "result"],
+      "additionalProperties": true,
+      "properties": {
+        "match": {
+          "type": "boolean",
+          "description": "Whether response exactly matches expected_response (trimmed; case-insensitive by default)"
+        },
+        "result": {
+          "type": "string",
+          "enum": ["pass", "fail"],
+          "description": "Pass when match is true, fail otherwise"
+        },
+        "reason": {
+          "type": "string",
+          "description": "Explanation of the match result"
+        }
+      }
+    },
+    "PartialMatchScore": {
+      "type": "object",
+      "description": "Partial match evaluation result",
+      "required": ["score", "result", "threshold"],
+      "additionalProperties": true,
+      "properties": {
+        "score": {
+          "type": "number",
+          "minimum": 0,
+          "maximum": 1,
+          "description": "Match score from 0.0 (no match) to 1.0 (full match)"
+        },
+        "result": {
+          "type": "string",
+          "enum": ["pass", "fail"],
+          "description": "Pass/fail based on score vs threshold"
+        },
+        "threshold": {
+          "type": "number",
+          "minimum": 0,
+          "maximum": 1,
+          "description": "Minimum score required for pass (default: 0.5)"
+        },
+        "reason": {
+          "type": "string",
+          "description": "Explanation of the match result"
+        }
+      }
+    },
+    "EvaluatorMap": {
+      "type": "object",
+      "description": "Map of evaluator names to their configuration options",
+      "propertyNames": {
+        "enum": ["Relevance", "Coherence", "Groundedness", "ToolCallAccuracy", "Citations", "ExactMatch", "PartialMatch"]
+      },
+      "additionalProperties": {
+        "$ref": "#/$defs/EvaluatorOptions"
+      }
+    },
+    "EvaluatorOptions": {
+      "type": "object",
+      "description": "Evaluator configuration options. Use empty object {} for defaults.",
+      "additionalProperties": false,
+      "properties": {
+        "threshold": {
+          "type": "number",
+          "description": "Pass/fail threshold. Range depends on evaluator type: 1-5 for LLM evaluators (default: 3), >= 1 integer for Citations (min citation count, default: 1), 0.0-1.0 for PartialMatch (min match ratio, default: 0.5). Validated per-evaluator at runtime."
+        },
+        "citation_format": {
+          "type": "string",
+          "examples": ["oai_unicode", "bracket", "mixed"],
+          "description": "Citation format for detection. 'oai_unicode': new OAI unicode format, 'bracket': legacy [^i^] bracket format, 'mixed': auto-detect both formats. Default: oai_unicode."
+        },
+        "case_sensitive": {
+          "type": "boolean",
+          "default": false,
+          "description": "Case-sensitive matching for ExactMatch/PartialMatch"
+        },
+        "options": {
+          "type": "object",
+          "additionalProperties": true,
+          "description": "Evaluator-specific configuration"
+        }
+      }
+    },
     "Citation": {
       "type": "object",
       "description": "A single citation reference",

package/schema/v1/examples/valid/comprehensive.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "$schema": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
-  "schemaVersion": "1.0.0",
+  "schemaVersion": "1.1.0",
   "metadata": {
     "name": "Graph API Evaluation Set",
     "description": "Test prompts for Microsoft Graph API knowledge",
@@ -9,11 +9,17 @@
     "evaluatedAt": "2026-01-20T10:30:00Z",
     "tags": ["graph", "api", "authentication"],
     "agentId": "12345678-1234-1234-1234-123456789abc",
+    "agentName": "Graph Knowledge Agent",
+    "cliVersion": "1.2.0",
     "extensions": {
       "com.contoso.department": "engineering",
       "com.contoso.priority": "high"
     }
   },
+  "default_evaluators": {
+    "Relevance": {},
+    "Coherence": {}
+  },
   "items": [
     {
       "prompt": "What is Microsoft Graph API?",
@@ -86,7 +92,26 @@
     },
     {
       "prompt": "How do I authenticate with Microsoft Graph?",
-      "expected_response": "You can authenticate using OAuth 2.0 or client credentials flow."
+      "expected_response": "You can authenticate using OAuth 2.0 or client credentials flow.",
+      "evaluators": {
+        "ExactMatch": { "case_sensitive": false },
+        "PartialMatch": { "threshold": 0.5 }
+      },
+      "evaluators_mode": "replace",
+      "response": "You can authenticate using OAuth 2.0 or client credentials flow.",
+      "scores": {
+        "exactMatch": {
+          "match": true,
+          "result": "pass",
+          "reason": "Exact match found"
+        },
+        "partialMatch": {
+          "score": 1.0,
+          "result": "pass",
+          "threshold": 0.5,
+          "reason": "Match score: 1.000"
+        }
+      }
     }
   ]
 }

package/schema/version.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
-  "version": "1.0.0",
-  "releaseDate": "2026-02-19",
+  "version": "1.1.0",
+  "releaseDate": "2026-03-17",
   "schemaId": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
   "description": "M365 Copilot Eval Document Schema"
 }

package/src/clients/cli/cli_logging/__init__.py ADDED Viewed

File without changes

package/src/clients/cli/cli_logging/console_diagnostics.py ADDED Viewed

@@ -0,0 +1,55 @@
+import json
+import sys
+from collections import OrderedDict
+from typing import Any, Dict
+from cli_logging.logging_utils import STRUCTURED_LOG_FIELDS
+_ANSI_COLORS = {
+    "debug": "\033[2m",     # dim
+    "info": "",             # default
+    "warning": "\033[33m",  # yellow
+    "error": "\033[31m",    # red
+}
+_ANSI_RESET = "\033[0m"
+def format_diagnostic_record(record: Dict[str, Any]) -> OrderedDict:
+    ordered = OrderedDict()
+    for field in STRUCTURED_LOG_FIELDS:
+        default = False if field == "is-redacted" else None
+        ordered[field] = record.get(field, default)
+    return ordered
+def serialize_diagnostic_record(record: Dict[str, Any]) -> str:
+    return json.dumps(format_diagnostic_record(record), ensure_ascii=False)
+def format_console_record(record: Dict[str, Any], max_message_length: int = 250) -> str:
+    """Format a diagnostic record for human-readable TTY output with ANSI colors."""
+    ts = record.get("timestamp", "")
+    # Extract HH:MM:SS from ISO timestamp
+    time_part = ts[11:19] if len(ts) >= 19 else ts
+    level = (record.get("level") or "info").upper()
+    message = record.get("message", "")
+    if len(message) > max_message_length:
+        message = message[:max_message_length] + "…"
+    ids = []
+    for key in ("request-id", "conversation-id", "message-id"):
+        val = record.get(key)
+        if val:
+            ids.append(f"{key}={val}")
+    id_suffix = f" ({' | '.join(ids)})" if ids else ""
+    color = _ANSI_COLORS.get((record.get("level") or "info").lower(), "")
+    reset = _ANSI_RESET if color else ""
+    return f"{color}[{time_part}] {level} {message}{id_suffix}{reset}"
+def render_diagnostic(record: Dict[str, Any]) -> str:
+    """Return TTY-friendly or JSON output depending on whether stdout is a terminal."""
+    if sys.stdout.isatty():
+        return format_console_record(record)
+    return serialize_diagnostic_record(record)

package/src/clients/cli/cli_logging/logging_utils.py ADDED Viewed

@@ -0,0 +1,145 @@
+import logging
+import re
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Any, Dict, List, Optional, Tuple
+class LogLevel(str, Enum):
+    """Log level enum. Inherits from str so comparisons like level == "debug" work."""
+    DEBUG = "debug"
+    INFO = "info"
+    WARNING = "warning"
+    ERROR = "error"
+class Operation(str, Enum):
+    """CLI operation steps for structured log entries."""
+    SETUP = "setup"
+    AUTHENTICATE = "authenticate"
+    VALIDATE_ENV = "validate-env"
+    LOAD_PROMPTS = "load-prompts"
+    FETCH_AGENTS = "fetch-agents"
+    SEND_PROMPT = "send-prompt"
+    EVALUATE = "evaluate"
+    WRITE_OUTPUT = "write-output"
+ALLOWED_LOG_LEVELS = tuple(level.value for level in LogLevel)
+LOG_LEVEL_MAP = {
+    LogLevel.DEBUG: logging.DEBUG,
+    LogLevel.INFO: logging.INFO,
+    LogLevel.WARNING: logging.WARNING,
+    LogLevel.ERROR: logging.ERROR,
+}
+STRUCTURED_LOG_FIELDS = (
+    "timestamp",
+    "level",
+    "operation",
+    "request-id",
+    "conversation-id",
+    "message-id",
+    "logger",
+    "message",
+    "is-redacted",
+)
+def normalize_log_level(value: Optional[str]) -> Optional[str]:
+    if value is None:
+        return None
+    return value.strip().lower()
+def resolve_log_level(
+    log_level_values: Optional[List[str]],
+) -> Tuple[Optional[str], Optional[str]]:
+    values = log_level_values or []
+    if not values:
+        return "info", None
+    # Use the last value provided (aligns with Node.js wrapper behavior).
+    last = normalize_log_level(values[-1])
+    if last not in ALLOWED_LOG_LEVELS:
+        return (
+            None,
+            "Invalid value for --log-level. Supported values are: "
+            "debug, info, warning, error.",
+        )
+    return last, None
+def utc_iso_timestamp() -> str:
+    return datetime.now(timezone.utc).isoformat()
+def build_run_context(
+    operation: str = "evaluate",
+    request_id: Optional[str] = None,
+    conversation_id: Optional[str] = None,
+    message_id: Optional[str] = None,
+) -> Dict[str, Optional[str]]:
+    return {
+        "request-id": request_id,
+        "conversation-id": conversation_id,
+        "message-id": message_id,
+        "operation": operation,
+    }
+_SECRET_PATTERNS = [
+    re.compile(r"(?i)(api[_-]?key\s*[:=]\s*)([^\s,;]+)"),
+    re.compile(r"(?i)(token\s*[:=]\s*)([^\s,;]+)"),
+    re.compile(r"(?i)(authorization\s*[:=]\s*bearer\s+)([^\s,;]+)"),
+    re.compile(r"(?i)(password\s*[:=]\s*)([^\s,;]+)"),
+]
+def redact_sensitive_content(message: Optional[str]) -> Tuple[str, bool]:
+    if message is None:
+        return "", False
+    redacted = message
+    changed = False
+    for pattern in _SECRET_PATTERNS:
+        updated = pattern.sub(r"\1***REDACTED***", redacted)
+        if updated != redacted:
+            changed = True
+            redacted = updated
+    # Fallback: match strings 32+ chars containing mixed case and digits
+    # (likely a credential/token) that weren't already caught above.
+    if (
+        "***REDACTED***" not in redacted
+        and re.search(
+            r"(?=[A-Za-z0-9_\-]*[A-Z])(?=[A-Za-z0-9_\-]*[a-z])"
+            r"(?=[A-Za-z0-9_\-]*[0-9])[A-Za-z0-9_\-]{32,}",
+            redacted,
+        )
+    ):
+        return "[REDACTED]", True
+    return redacted, changed
+def format_structured_log_entry(
+    level: str,
+    message: str,
+    logger_name: str,
+    run_context: Dict[str, Optional[str]],
+) -> Dict[str, Any]:
+    safe_message, is_redacted = redact_sensitive_content(message)
+    return {
+        "level": normalize_log_level(level) or "info",
+        "message": safe_message,
+        "logger": logger_name,
+        "timestamp": utc_iso_timestamp(),
+        "request-id": run_context.get("request-id"),
+        "conversation-id": run_context.get("conversation-id"),
+        "message-id": run_context.get("message-id"),
+        "operation": run_context.get("operation"),
+        "is-redacted": is_redacted,
+    }

package/src/clients/cli/common.py ADDED Viewed

@@ -0,0 +1,51 @@
+"""Shared types and constants for the CLI."""
+import re
+from dataclasses import dataclass
+from typing import List, Optional
+def pascal_case_to_title(eval_name: str) -> str:
+    """Convert PascalCase evaluator name to space-separated display name.
+    e.g., "ToolCallAccuracy" → "Tool Call Accuracy"
+    """
+    return re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', eval_name)
+# Canonical evaluator name constants
+RELEVANCE = "Relevance"
+COHERENCE = "Coherence"
+GROUNDEDNESS = "Groundedness"
+TOOL_CALL_ACCURACY = "ToolCallAccuracy"
+CITATIONS = "Citations"
+EXACT_MATCH = "ExactMatch"
+PARTIAL_MATCH = "PartialMatch"
+# Prerequisite constants
+REQUIRES_AZURE_OPENAI = "azure_openai"
+REQUIRES_TOOL_DEFINITIONS = "tool_definitions"
+# System defaults when no file-level or env-level defaults are configured
+SYSTEM_DEFAULT_EVALUATORS = [
+    RELEVANCE,
+    COHERENCE,
+]
+# Mapping from evaluator name to the key used in evaluator output dicts
+METRIC_IDS = {
+    RELEVANCE: "relevance",
+    COHERENCE: "coherence",
+    GROUNDEDNESS: "groundedness",
+    TOOL_CALL_ACCURACY: "tool_call_accuracy",
+    CITATIONS: "citations",
+    EXACT_MATCH: "exact_match",
+    PARTIAL_MATCH: "partial_match",
+}
+@dataclass
+class RegistryEntry:
+    type: str  # "llm", "tool", or "non-llm"
+    requires: List[str]
+    default_threshold: Optional[float]

package/src/clients/cli/custom_evaluators/CitationsEvaluator.py CHANGED Viewed

@@ -17,8 +17,8 @@ from typing import Dict, Any, Optional
 class CitationFormat(Enum):
     """Enum for different citation formats supported by the evaluator."""
     OAI_UNICODE = "oai_unicode"  # New format: \ue200cite\ue202turn{X}search{Y}\ue201
-    LEGACY_BRACKET = "legacy_bracket"  # Old format: [^i^]
-    AUTO = "auto"  # Automatically detect both formats
+    LEGACY_BRACKET = "bracket"  # Old format: [^i^]
+    AUTO = "mixed"  # Automatically detect both formats
 class CitationsEvaluator:
@@ -141,7 +141,7 @@ class CitationsEvaluator:
         results = {
             "citation_format": self.citation_format.value,
-            "score": total_citations,
+            "citations": total_citations,
             "result": "pass" if total_citations > 0 else "fail",
             "threshold": 1,
             "reason": " ".join(reason_parts)

package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py CHANGED Viewed

@@ -1,8 +1,6 @@
-from azure.ai.evaluation import evaluate
 class ExactMatchEvaluator:
-    def __init__(self):
-        pass
+    def __init__(self, case_sensitive=False):
+        self.case_sensitive = case_sensitive
     def __call__(self, *, response: str, expected_answer: str, **kwargs):
         if response is None or response.strip() == "":
@@ -11,15 +9,17 @@ class ExactMatchEvaluator:
         if expected_answer is None:
             raise ValueError("Expected answer cannot be None.")
-        # Case-sensitive exact match (mimics C# StringComparison.InvariantCulture)
-        is_match = response.strip() == expected_answer.strip()
+        resp = response.strip()
+        exp = expected_answer.strip()
+        if not self.case_sensitive:
+            resp = resp.lower()
+            exp = exp.lower()
+        is_match = resp == exp
         return {
             "exact_match": 1.0 if is_match else 0.0,
-            "exact_match_result": "pass" if is_match else "fail",
-            "exact_match_threshold": 1.0,
+            "result": "pass" if is_match else "fail",
             "exact_match_reason": "Exact match found" if is_match else "No exact match found"
         }
-exact_match_evaluator = ExactMatchEvaluator()

package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py CHANGED Viewed

@@ -1,5 +1,3 @@
-from azure.ai.evaluation import evaluate
 class PartialMatchEvaluator:
     def __init__(self, case_sensitive=False):
         self.case_sensitive = case_sensitive
@@ -25,15 +23,7 @@ class PartialMatchEvaluator:
         else:
             score = 0.0
-        threshold = 0.5  # 50% match threshold
-        is_pass = score >= threshold
         return {
             "partial_match": score,
-            "partial_match_result": "pass" if is_pass else "fail",
-            "partial_match_threshold": threshold,
-            "partial_match_reason": f"Match score: {score:.3f} ({'above' if is_pass else 'below'} threshold {threshold})"
+            "partial_match_reason": f"Match score: {score:.3f}"
         }
-partial_match_evaluator = PartialMatchEvaluator(case_sensitive=False)