npm - @microsoft/m365-copilot-eval - Versions diffs - 1.2.1-preview.1 → 1.4.0-preview.1 - Mend

@microsoft/m365-copilot-eval 1.2.1-preview.1 → 1.4.0-preview.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (44)

package/README.md +140 -101
package/package.json +7 -4
package/schema/CHANGELOG.md +8 -0
package/schema/v1/eval-document.schema.json +256 -8
package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
package/schema/v1/examples/valid/comprehensive.json +27 -2
package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
package/schema/v1/examples/valid/multi-turn-output.json +59 -0
package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
package/schema/version.json +2 -2
package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
package/src/clients/cli/api_clients/A2A/a2a_client.py +456 -0
package/src/clients/cli/api_clients/REST/__init__.py +3 -0
package/src/clients/cli/api_clients/REST/sydney_client.py +204 -0
package/src/clients/cli/api_clients/__init__.py +3 -0
package/src/clients/cli/api_clients/base_agent_client.py +78 -0
package/src/clients/cli/cli_logging/__init__.py +0 -0
package/src/clients/cli/cli_logging/console_diagnostics.py +107 -0
package/src/clients/cli/cli_logging/logging_utils.py +144 -0
package/src/clients/cli/common.py +62 -0
package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +3 -3
package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +11 -11
package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +1 -11
package/src/clients/cli/evaluator_resolver.py +150 -0
package/src/clients/cli/generate_report.py +347 -184
package/src/clients/cli/main.py +1288 -481
package/src/clients/cli/parallel_executor.py +57 -0
package/src/clients/cli/readme.md +14 -7
package/src/clients/cli/requirements.txt +1 -1
package/src/clients/cli/response_extractor.py +30 -14
package/src/clients/cli/retry_policy.py +52 -0
package/src/clients/cli/samples/multiturn_example.json +35 -0
package/src/clients/cli/throttle_gate.py +82 -0
package/src/clients/node-js/bin/runevals.js +134 -41
package/src/clients/node-js/config/default.js +5 -1
package/src/clients/node-js/lib/agent-id.js +12 -0
package/src/clients/node-js/lib/env-loader.js +11 -16
package/src/clients/node-js/lib/eula-manager.js +78 -0
package/src/clients/node-js/lib/progress.js +13 -11

package/schema/v1/eval-document.schema.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "$schema": "https://json-schema.org/draft/2020-12/schema",
   "$id": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
   "title": "M365 Copilot Eval Document",
-  "description": "Schema for evaluation documents used by M365 Copilot Agent Evals CLI. Version 1.0.0.",
+  "description": "Schema for evaluation documents used by M365 Copilot Agent Evals CLI. Supports single-turn and multi-turn evaluations.",
   "type": "object",
   "required": ["schemaVersion", "items"],
   "additionalProperties": true,
@@ -21,12 +21,19 @@
     "metadata": {
       "$ref": "#/$defs/DocumentMetadata"
     },
+    "default_evaluators": {
+      "$ref": "#/$defs/EvaluatorMap",
+      "description": "File-level default evaluators (overrides system defaults)"
+    },
     "items": {
       "type": "array",
       "minItems": 1,
-      "description": "Array of evaluation items (prompts and optionally responses with scores)",
+      "description": "Array of evaluation items: single-turn evaluations or multi-turn threads",
       "items": {
-        "$ref": "#/$defs/EvalItem"
+        "oneOf": [
+          { "$ref": "#/$defs/SingleTurnEvaluation" },
+          { "$ref": "#/$defs/MultiTurnThread" }
+        ]
       }
     }
   },
@@ -56,7 +63,7 @@
         "evaluatedAt": {
           "type": "string",
           "format": "date-time",
-          "description": "ISO 8601 timestamp when evaluation was performed (output documents)"
+          "description": "ISO 8601 timestamp when evaluation was performed"
         },
         "tags": {
           "type": "array",
@@ -69,6 +76,14 @@
           "type": "string",
           "description": "M365 Agent ID this evaluation targets"
         },
+        "agentName": {
+          "type": "string",
+          "description": "Name of the M365 agent this evaluation targets"
+        },
+        "cliVersion": {
+          "type": "string",
+          "description": "Version of the M365 Copilot Agent Evals CLI that produced this document"
+        },
         "extensions": {
           "type": "object",
           "additionalProperties": true,
@@ -76,11 +91,11 @@
         }
       }
     },
-    "EvalItem": {
+    "SingleTurnEvaluation": {
       "type": "object",
-      "description": "A single evaluation item containing a prompt and optionally a response with scores",
+      "description": "A standalone single-turn prompt-response evaluation",
       "required": ["prompt"],
-      "additionalProperties": true,
+      "additionalProperties": false,
       "properties": {
         "prompt": {
           "type": "string",
@@ -93,12 +108,22 @@
         },
         "response": {
           "type": "string",
-          "description": "Actual response from the agent (present in output documents)"
+          "description": "Actual response from the agent"
         },
         "context": {
           "type": "string",
           "description": "Additional context for grounding evaluation"
         },
+        "evaluators": {
+          "$ref": "#/$defs/EvaluatorMap",
+          "description": "Per-prompt evaluator overrides"
+        },
+        "evaluators_mode": {
+          "type": "string",
+          "enum": ["extend", "replace"],
+          "default": "extend",
+          "description": "How per-prompt evaluators combine with defaults"
+        },
         "citations": {
           "type": "array",
           "items": {
@@ -114,6 +139,135 @@
           "additionalProperties": true,
           "description": "Extension point for custom item-level fields"
         }
+      },
+      "not": {
+        "required": ["turns"]
+      }
+    },
+    "MultiTurnThread": {
+      "type": "object",
+      "description": "A multi-turn conversation thread with ordered turns sharing conversation context",
+      "required": ["turns"],
+      "additionalProperties": false,
+      "properties": {
+        "name": {
+          "type": "string",
+          "description": "Human-readable name for the thread"
+        },
+        "description": {
+          "type": "string",
+          "description": "Description of what this thread tests"
+        },
+        "turns": {
+          "type": "array",
+          "minItems": 1,
+          "maxItems": 20,
+          "items": { "$ref": "#/$defs/Turn" },
+          "description": "Ordered array of conversation turns"
+        },
+        "conversation_id": {
+          "type": "string",
+          "description": "Unique identifier for this conversation thread"
+        },
+        "summary": {
+          "$ref": "#/$defs/ThreadSummary",
+          "description": "Aggregate statistics for the thread"
+        },
+        "extensions": {
+          "type": "object",
+          "additionalProperties": true,
+          "description": "Extension point for custom thread-level fields"
+        }
+      },
+      "not": {
+        "required": ["prompt"]
+      }
+    },
+    "Turn": {
+      "type": "object",
+      "description": "A single turn within a multi-turn thread",
+      "required": ["prompt"],
+      "additionalProperties": false,
+      "properties": {
+        "prompt": {
+          "type": "string",
+          "minLength": 1,
+          "description": "The user message for this turn"
+        },
+        "expected_response": {
+          "type": "string",
+          "description": "Expected agent response for this turn"
+        },
+        "response": {
+          "type": "string",
+          "description": "Actual agent response"
+        },
+        "context": {
+          "type": "string",
+          "description": "Additional context for grounding evaluation"
+        },
+        "evaluators": {
+          "$ref": "#/$defs/EvaluatorMap",
+          "description": "Per-turn evaluator overrides"
+        },
+        "evaluators_mode": {
+          "type": "string",
+          "enum": ["extend", "replace"],
+          "default": "extend",
+          "description": "How per-turn evaluators combine with defaults"
+        },
+        "citations": {
+          "type": "array",
+          "items": {
+            "$ref": "#/$defs/Citation"
+          },
+          "description": "Citations included in the response"
+        },
+        "scores": {
+          "$ref": "#/$defs/ScoreCollection"
+        },
+        "status": {
+          "type": "string",
+          "enum": ["pass", "fail", "error"],
+          "description": "Overall status of this turn"
+        },
+        "error": {
+          "type": "string",
+          "description": "Error message if status is 'error'"
+        },
+        "extensions": {
+          "type": "object",
+          "additionalProperties": true,
+          "description": "Extension point for custom turn-level fields"
+        }
+      }
+    },
+    "ThreadSummary": {
+      "type": "object",
+      "description": "Aggregate statistics for a thread",
+      "required": ["turns_total", "turns_passed", "turns_failed", "overall_status"],
+      "additionalProperties": false,
+      "properties": {
+        "turns_total": {
+          "type": "integer",
+          "minimum": 1,
+          "description": "Total number of turns executed"
+        },
+        "turns_passed": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Number of turns where all evaluators passed"
+        },
+        "turns_failed": {
+          "type": "integer",
+          "minimum": 0,
+          "description": "Number of turns where any evaluator failed"
+        },
+        "overall_status": {
+          "type": "string",
+          "enum": ["pass", "partial", "fail"],
+          "description": "pass: all turns passed, partial: some failed, fail: all failed or error"
+        }
       }
     },
     "ScoreCollection": {
@@ -140,6 +294,14 @@
         "citations": {
           "$ref": "#/$defs/CitationScore",
           "description": "Citation evaluation results"
+        },
+        "exactMatch": {
+          "$ref": "#/$defs/ExactMatchScore",
+          "description": "Exact match evaluation result"
+        },
+        "partialMatch": {
+          "$ref": "#/$defs/PartialMatchScore",
+          "description": "Partial match evaluation result"
         }
       }
     },
@@ -211,6 +373,92 @@
         }
       }
     },
+    "ExactMatchScore": {
+      "type": "object",
+      "description": "Exact match evaluation result",
+      "required": ["match", "result"],
+      "additionalProperties": true,
+      "properties": {
+        "match": {
+          "type": "boolean",
+          "description": "Whether response exactly matches expected_response (trimmed; case-insensitive by default)"
+        },
+        "result": {
+          "type": "string",
+          "enum": ["pass", "fail"],
+          "description": "Pass when match is true, fail otherwise"
+        },
+        "reason": {
+          "type": "string",
+          "description": "Explanation of the match result"
+        }
+      }
+    },
+    "PartialMatchScore": {
+      "type": "object",
+      "description": "Partial match evaluation result",
+      "required": ["score", "result", "threshold"],
+      "additionalProperties": true,
+      "properties": {
+        "score": {
+          "type": "number",
+          "minimum": 0,
+          "maximum": 1,
+          "description": "Match score from 0.0 (no match) to 1.0 (full match)"
+        },
+        "result": {
+          "type": "string",
+          "enum": ["pass", "fail"],
+          "description": "Pass/fail based on score vs threshold"
+        },
+        "threshold": {
+          "type": "number",
+          "minimum": 0,
+          "maximum": 1,
+          "description": "Minimum score required for pass (default: 0.5)"
+        },
+        "reason": {
+          "type": "string",
+          "description": "Explanation of the match result"
+        }
+      }
+    },
+    "EvaluatorMap": {
+      "type": "object",
+      "description": "Map of evaluator names to their configuration options",
+      "propertyNames": {
+        "enum": ["Relevance", "Coherence", "Groundedness", "ToolCallAccuracy", "Citations", "ExactMatch", "PartialMatch"]
+      },
+      "additionalProperties": {
+        "$ref": "#/$defs/EvaluatorOptions"
+      }
+    },
+    "EvaluatorOptions": {
+      "type": "object",
+      "description": "Evaluator configuration options. Use empty object {} for defaults.",
+      "additionalProperties": false,
+      "properties": {
+        "threshold": {
+          "type": "number",
+          "description": "Pass/fail threshold. Range depends on evaluator type: 1-5 for LLM evaluators (default: 3), an integer >= 1 for Citations (min citation count, default: 1), 0.0-1.0 for PartialMatch (min match ratio, default: 0.5). Validated per-evaluator at runtime."
+        },
+        "citation_format": {
+          "type": "string",
+          "examples": ["oai_unicode", "bracket", "mixed"],
+          "description": "Citation format for detection. 'oai_unicode': new OAI unicode format, 'bracket': legacy [^i^] bracket format, 'mixed': auto-detect both formats. Default: oai_unicode."
+        },
+        "case_sensitive": {
+          "type": "boolean",
+          "default": false,
+          "description": "Case-sensitive matching for ExactMatch/PartialMatch"
+        },
+        "options": {
+          "type": "object",
+          "additionalProperties": true,
+          "description": "Evaluator-specific configuration"
+        }
+      }
+    },
     "Citation": {
       "type": "object",
       "description": "A single citation reference",

package/schema/v1/examples/invalid/multi-turn-empty-turns.json ADDED Viewed

@@ -0,0 +1,8 @@
+{
+  "schemaVersion": "1.2.0",
+  "items": [
+    {
+      "turns": []
+    }
+  ]
+}

package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json ADDED Viewed

@@ -0,0 +1,13 @@
+{
+  "schemaVersion": "1.2.0",
+  "items": [
+    {
+      "prompt": "This item has both prompt and turns",
+      "turns": [
+        {
+          "prompt": "Turn 1"
+        }
+      ]
+    }
+  ]
+}

package/schema/v1/examples/invalid/multi-turn-missing-prompt.json ADDED Viewed

@@ -0,0 +1,12 @@
+{
+  "schemaVersion": "1.2.0",
+  "items": [
+    {
+      "turns": [
+        {
+          "expected_response": "This turn is missing a prompt."
+        }
+      ]
+    }
+  ]
+}

package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json ADDED Viewed

@@ -0,0 +1,13 @@
+{
+  "schemaVersion": "1.2.0",
+  "items": [
+    {
+      "turns": [
+        {
+          "prompt": "Hello",
+          "expeceted_response": "Typo in field name"
+        }
+      ]
+    }
+  ]
+}

package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json ADDED Viewed

@@ -0,0 +1,15 @@
+{
+  "schemaVersion": "1.2.0",
+  "items": [
+    {
+      "turns": [
+        {
+          "prompt": "Hello",
+          "evaluators": {
+            "TaskCompletionEvaluator": {}
+          }
+        }
+      ]
+    }
+  ]
+}

package/schema/v1/examples/valid/comprehensive.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "$schema": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
-  "schemaVersion": "1.0.0",
+  "schemaVersion": "1.1.0",
   "metadata": {
     "name": "Graph API Evaluation Set",
     "description": "Test prompts for Microsoft Graph API knowledge",
@@ -9,11 +9,17 @@
     "evaluatedAt": "2026-01-20T10:30:00Z",
     "tags": ["graph", "api", "authentication"],
     "agentId": "12345678-1234-1234-1234-123456789abc",
+    "agentName": "Graph Knowledge Agent",
+    "cliVersion": "1.2.0",
     "extensions": {
       "com.contoso.department": "engineering",
       "com.contoso.priority": "high"
     }
   },
+  "default_evaluators": {
+    "Relevance": {},
+    "Coherence": {}
+  },
   "items": [
     {
       "prompt": "What is Microsoft Graph API?",
@@ -86,7 +92,26 @@
     },
     {
       "prompt": "How do I authenticate with Microsoft Graph?",
-      "expected_response": "You can authenticate using OAuth 2.0 or client credentials flow."
+      "expected_response": "You can authenticate using OAuth 2.0 or client credentials flow.",
+      "evaluators": {
+        "ExactMatch": { "case_sensitive": false },
+        "PartialMatch": { "threshold": 0.5 }
+      },
+      "evaluators_mode": "replace",
+      "response": "You can authenticate using OAuth 2.0 or client credentials flow.",
+      "scores": {
+        "exactMatch": {
+          "match": true,
+          "result": "pass",
+          "reason": "Exact match found"
+        },
+        "partialMatch": {
+          "score": 1.0,
+          "result": "pass",
+          "threshold": 0.5,
+          "reason": "Match score: 1.000"
+        }
+      }
     }
   ]
 }

package/schema/v1/examples/valid/mixed-single-and-multi-turn.json ADDED Viewed

@@ -0,0 +1,30 @@
+{
+  "schemaVersion": "1.2.0",
+  "default_evaluators": {
+    "Relevance": {},
+    "Coherence": {}
+  },
+  "items": [
+    {
+      "prompt": "What is Microsoft Graph API?",
+      "expected_response": "Microsoft Graph API is a unified endpoint for accessing Microsoft services."
+    },
+    {
+      "name": "Canadian Employee HR Inquiry",
+      "turns": [
+        {
+          "prompt": "I'm a Canadian employee based in Toronto.",
+          "expected_response": "Got it! I can help with Canada-specific HR questions."
+        },
+        {
+          "prompt": "Is July 4th a holiday for me?",
+          "expected_response": "July 4th is not a statutory holiday in Canada. However, July 1st (Canada Day) is."
+        }
+      ]
+    },
+    {
+      "prompt": "How do I authenticate with Microsoft Graph?",
+      "expected_response": "You can authenticate using OAuth 2.0 or client credentials flow."
+    }
+  ]
+}

package/schema/v1/examples/valid/multi-turn-output.json ADDED Viewed

@@ -0,0 +1,59 @@
+{
+  "schemaVersion": "1.2.0",
+  "metadata": {
+    "evaluatedAt": "2026-03-30T10:00:00Z",
+    "agentName": "Travel Assistant",
+    "cliVersion": "1.3.0"
+  },
+  "items": [
+    {
+      "name": "Context Persistence Test",
+      "turns": [
+        {
+          "prompt": "I'm based in Seattle.",
+          "expected_response": "Got it! I can help with Seattle-specific questions.",
+          "response": "Understood! I can assist with Seattle-related queries.",
+          "scores": {
+            "relevance": {
+              "score": 4.0,
+              "result": "pass",
+              "threshold": 3,
+              "reason": "Response acknowledges Seattle context."
+            },
+            "coherence": {
+              "score": 5.0,
+              "result": "pass",
+              "threshold": 3
+            }
+          },
+          "status": "pass"
+        },
+        {
+          "prompt": "What's the weather like here?",
+          "expected_response": "Seattle weather is typically mild with rain.",
+          "response": "Seattle generally has mild temperatures with frequent rain, especially in fall and winter.",
+          "scores": {
+            "relevance": {
+              "score": 5.0,
+              "result": "pass",
+              "threshold": 3
+            },
+            "coherence": {
+              "score": 4.0,
+              "result": "pass",
+              "threshold": 3
+            }
+          },
+          "status": "pass"
+        }
+      ],
+      "conversation_id": "conv-abc-123",
+      "summary": {
+        "turns_total": 2,
+        "turns_passed": 2,
+        "turns_failed": 0,
+        "overall_status": "pass"
+      }
+    }
+  ]
+}

package/schema/v1/examples/valid/multi-turn-simple.json ADDED Viewed

@@ -0,0 +1,21 @@
+{
+  "schemaVersion": "1.2.0",
+  "items": [
+    {
+      "turns": [
+        {
+          "prompt": "I'm traveling to Seattle next week for a conference.",
+          "expected_response": "I can help with travel-related questions."
+        },
+        {
+          "prompt": "What's the weather usually like?",
+          "expected_response": "Seattle weather is typically mild with some rain."
+        },
+        {
+          "prompt": "Should I bring a rain jacket?",
+          "expected_response": "Yes, Seattle is known for rain. A rain jacket is recommended."
+        }
+      ]
+    }
+  ]
+}

package/schema/v1/examples/valid/multi-turn-with-evaluators.json ADDED Viewed

@@ -0,0 +1,34 @@
+{
+  "schemaVersion": "1.2.0",
+  "default_evaluators": {
+    "Relevance": {},
+    "Coherence": {}
+  },
+  "items": [
+    {
+      "name": "Expense Policy Flow",
+      "description": "Test that agent handles expense policy questions across turns",
+      "turns": [
+        {
+          "prompt": "I'm traveling to Seattle next week for a conference.",
+          "expected_response": "I can help with travel-related questions."
+        },
+        {
+          "prompt": "My dinner last night was $250. Is that okay?",
+          "expected_response": "The per-diem meal allowance is a maximum of $200.",
+          "evaluators": {
+            "Groundedness": { "threshold": 4 }
+          }
+        },
+        {
+          "prompt": "What should I do about the overage?",
+          "expected_response": "For expenses exceeding the policy limit, you'll need manager approval.",
+          "evaluators": {
+            "ExactMatch": { "case_sensitive": false }
+          },
+          "evaluators_mode": "replace"
+        }
+      ]
+    }
+  ]
+}

package/schema/version.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
-  "version": "1.0.0",
-  "releaseDate": "2026-02-19",
+  "version": "1.2.0",
+  "releaseDate": "2026-04-02",
   "schemaId": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
   "description": "M365 Copilot Eval Document Schema"
 }

package/src/clients/cli/api_clients/A2A/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .a2a_client import A2AClient
+__all__ = ["A2AClient"]