npm - @microsoft/m365-copilot-eval - Versions diffs - 1.5.0-preview.1 → 1.7.0-preview.1 - Mend

@microsoft/m365-copilot-eval 1.5.0-preview.1 → 1.7.0-preview.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (26)

package/README.md +19 -1
package/package.json +4 -3
package/schema/CHANGELOG.md +7 -0
package/schema/v1/eval-document.schema.json +144 -333
package/schema/v1/examples/invalid/error-result-with-score.json +16 -0
package/schema/v1/examples/invalid/missing-error-on-error.json +13 -0
package/schema/v1/examples/valid/multi-turn-output.json +2 -0
package/schema/v1/examples/valid/scenarios-with-mixed-errors.json +239 -0
package/schema/version.json +1 -1
package/src/clients/cli/api_clients/A2A/a2a_client.py +57 -10
package/src/clients/cli/auth/auth_handler.py +21 -1
package/src/clients/cli/common.py +8 -14
package/src/clients/cli/error_messages.py +91 -0
package/src/clients/cli/evaluation_runner.py +108 -97
package/src/clients/cli/evaluator_resolver.py +8 -33
package/src/clients/cli/generate_report.py +125 -96
package/src/clients/cli/main.py +2 -1
package/src/clients/cli/readme.md +1 -1
package/src/clients/cli/result_writer.py +129 -110
package/src/clients/cli/status_derivation.py +91 -0
package/src/clients/node-js/bin/runevals.js +31 -9
package/src/clients/node-js/config/default.js +1 -1
package/src/clients/node-js/lib/env-loader.js +20 -13
package/src/clients/node-js/lib/python-runtime.js +137 -65
package/src/clients/node-js/lib/venv-manager.js +3 -2
package/src/clients/node-js/lib/version-check.js +268 -0

package/schema/v1/eval-document.schema.json CHANGED

@@ -15,8 +15,8 @@
     "schemaVersion": {
       "type": "string",
       "pattern": "^1\\.\\d+\\.\\d+$",
-      "description": "SemVer string identifying the schema version this document conforms to (e.g., '1.0.0')",
-      "examples": ["1.0.0", "1.1.0", "1.2.0"]
+      "description": "SemVer string identifying the schema version this document conforms to (e.g., '1.4.0')",
+      "examples": ["1.0.0", "1.1.0", "1.2.0", "1.3.0", "1.4.0"]
     },
     "metadata": {
       "$ref": "#/$defs/DocumentMetadata"
@@ -43,52 +43,16 @@
       "description": "Optional metadata about the evaluation document",
       "additionalProperties": true,
       "properties": {
-        "name": {
-          "type": "string",
-          "description": "Human-readable name for the evaluation set"
-        },
-        "description": {
-          "type": "string",
-          "description": "Description of what this evaluation set tests"
-        },
-        "createdAt": {
-          "type": "string",
-          "format": "date-time",
-          "description": "ISO 8601 timestamp when the document was created"
-        },
-        "createdBy": {
-          "type": "string",
-          "description": "Author or system that created the document"
-        },
-        "evaluatedAt": {
-          "type": "string",
-          "format": "date-time",
-          "description": "ISO 8601 timestamp when evaluation was performed"
-        },
-        "tags": {
-          "type": "array",
-          "items": {
-            "type": "string"
-          },
-          "description": "Tags for categorization and filtering"
-        },
-        "agentId": {
-          "type": "string",
-          "description": "M365 Agent ID this evaluation targets"
-        },
-        "agentName": {
-          "type": "string",
-          "description": "Name of the M365 agent this evaluation targets"
-        },
-        "cliVersion": {
-          "type": "string",
-          "description": "Version of the M365 Copilot Agent Evals CLI that produced this document"
-        },
-        "extensions": {
-          "type": "object",
-          "additionalProperties": true,
-          "description": "Extension point for custom metadata. Use reverse-domain notation for field names."
-        }
+        "name":        { "type": "string", "description": "Human-readable name for the evaluation set" },
+        "description": { "type": "string", "description": "Description of what this evaluation set tests" },
+        "createdAt":   { "type": "string", "format": "date-time", "description": "ISO 8601 timestamp when the document was created" },
+        "createdBy":   { "type": "string", "description": "Author or system that created the document" },
+        "evaluatedAt": { "type": "string", "format": "date-time", "description": "ISO 8601 timestamp when evaluation was performed" },
+        "tags":        { "type": "array", "items": { "type": "string" }, "description": "Tags for categorization and filtering" },
+        "agentId":     { "type": "string", "description": "M365 Agent ID this evaluation targets" },
+        "agentName":   { "type": "string", "description": "Name of the M365 agent this evaluation targets" },
+        "cliVersion":  { "type": "string", "description": "Version of the M365 Copilot Agent Evals CLI that produced this document" },
+        "extensions":  { "type": "object", "additionalProperties": true, "description": "Extension point for custom metadata. Use reverse-domain notation for field names." }
       }
     },
     "SingleTurnEvaluation": {
@@ -97,52 +61,26 @@
       "required": ["prompt"],
       "additionalProperties": false,
       "properties": {
-        "prompt": {
-          "type": "string",
-          "minLength": 1,
-          "description": "The input prompt to evaluate"
-        },
-        "expected_response": {
-          "type": "string",
-          "description": "Expected or ideal response for comparison during evaluation"
-        },
-        "response": {
-          "type": "string",
-          "description": "Actual response from the agent"
-        },
-        "context": {
-          "type": "string",
-          "description": "Additional context for grounding evaluation"
-        },
-        "evaluators": {
-          "$ref": "#/$defs/EvaluatorMap",
-          "description": "Per-prompt evaluator overrides"
-        },
-        "evaluators_mode": {
+        "prompt":            { "type": "string", "minLength": 1, "description": "The input prompt to evaluate" },
+        "expected_response": { "type": "string", "description": "Expected or ideal response for comparison during evaluation" },
+        "response":          { "type": "string", "description": "Actual response from the agent" },
+        "context":           { "type": "string", "description": "Additional context for grounding evaluation" },
+        "evaluators":        { "$ref": "#/$defs/EvaluatorMap", "description": "Per-prompt evaluator overrides" },
+        "evaluators_mode":   { "type": "string", "enum": ["extend", "replace"], "default": "extend", "description": "How per-prompt evaluators combine with defaults" },
+        "citations":         { "type": "array", "items": { "$ref": "#/$defs/Citation" }, "description": "Citations included in the response" },
+        "scores":            { "$ref": "#/$defs/ScoreCollection" },
+        "status": {
           "type": "string",
-          "enum": ["extend", "replace"],
-          "default": "extend",
-          "description": "How per-prompt evaluators combine with defaults"
-        },
-        "citations": {
-          "type": "array",
-          "items": {
-            "$ref": "#/$defs/Citation"
-          },
-          "description": "Citations included in the response"
+          "enum": ["pass", "fail", "partial", "error"],
+          "description": "Overall status of this item."
         },
-        "scores": {
-          "$ref": "#/$defs/ScoreCollection"
+        "error": {
+          "$ref": "#/$defs/ErrorObject",
+          "description": "Error details for this item, if any."
         },
-        "extensions": {
-          "type": "object",
-          "additionalProperties": true,
-          "description": "Extension point for custom item-level fields"
-        }
+        "extensions":        { "type": "object", "additionalProperties": true, "description": "Extension point for custom item-level fields" }
       },
-      "not": {
-        "required": ["turns"]
-      }
+      "not": { "required": ["turns"] }
     },
     "MultiTurnThread": {
       "type": "object",
@@ -150,38 +88,14 @@
       "required": ["turns"],
       "additionalProperties": false,
       "properties": {
-        "name": {
-          "type": "string",
-          "description": "Human-readable name for the thread"
-        },
-        "description": {
-          "type": "string",
-          "description": "Description of what this thread tests"
-        },
-        "turns": {
-          "type": "array",
-          "minItems": 1,
-          "maxItems": 20,
-          "items": { "$ref": "#/$defs/Turn" },
-          "description": "Ordered array of conversation turns"
-        },
-        "conversation_id": {
-          "type": "string",
-          "description": "Unique identifier for this conversation thread"
-        },
-        "summary": {
-          "$ref": "#/$defs/ThreadSummary",
-          "description": "Aggregate statistics for the thread"
-        },
-        "extensions": {
-          "type": "object",
-          "additionalProperties": true,
-          "description": "Extension point for custom thread-level fields"
-        }
+        "name":            { "type": "string", "description": "Human-readable name for the thread" },
+        "description":     { "type": "string", "description": "Description of what this thread tests" },
+        "turns":           { "type": "array", "minItems": 1, "maxItems": 20, "items": { "$ref": "#/$defs/Turn" }, "description": "Ordered array of conversation turns" },
+        "conversation_id": { "type": "string", "description": "Unique identifier for this conversation thread" },
+        "summary":         { "$ref": "#/$defs/ThreadSummary", "description": "Aggregate statistics for the thread" },
+        "extensions":      { "type": "object", "additionalProperties": true, "description": "Extension point for custom thread-level fields" }
       },
-      "not": {
-        "required": ["prompt"]
-      }
+      "not": { "required": ["prompt"] }
     },
     "Turn": {
       "type": "object",
@@ -189,237 +103,159 @@
       "required": ["prompt"],
       "additionalProperties": false,
       "properties": {
-        "prompt": {
-          "type": "string",
-          "minLength": 1,
-          "description": "The user message for this turn"
-        },
-        "expected_response": {
-          "type": "string",
-          "description": "Expected agent response for this turn"
-        },
-        "response": {
-          "type": "string",
-          "description": "Actual agent response"
-        },
-        "context": {
-          "type": "string",
-          "description": "Additional context for grounding evaluation"
-        },
-        "evaluators": {
-          "$ref": "#/$defs/EvaluatorMap",
-          "description": "Per-turn evaluator overrides"
-        },
-        "evaluators_mode": {
-          "type": "string",
-          "enum": ["extend", "replace"],
-          "default": "extend",
-          "description": "How per-turn evaluators combine with defaults"
-        },
-        "citations": {
-          "type": "array",
-          "items": {
-            "$ref": "#/$defs/Citation"
-          },
-          "description": "Citations included in the response"
-        },
-        "scores": {
-          "$ref": "#/$defs/ScoreCollection"
-        },
+        "prompt":            { "type": "string", "minLength": 1, "description": "The user message for this turn" },
+        "expected_response": { "type": "string", "description": "Expected agent response for this turn" },
+        "response":          { "type": "string", "description": "Actual agent response" },
+        "context":           { "type": "string", "description": "Additional context for grounding evaluation" },
+        "evaluators":        { "$ref": "#/$defs/EvaluatorMap", "description": "Per-turn evaluator overrides" },
+        "evaluators_mode":   { "type": "string", "enum": ["extend", "replace"], "default": "extend", "description": "How per-turn evaluators combine with defaults" },
+        "citations":         { "type": "array", "items": { "$ref": "#/$defs/Citation" }, "description": "Citations included in the response" },
+        "scores":            { "$ref": "#/$defs/ScoreCollection" },
         "status": {
           "type": "string",
-          "enum": ["pass", "fail", "error"],
-          "description": "Overall status of this turn"
+          "enum": ["pass", "fail", "partial", "error"],
+          "description": "Overall status of this turn."
         },
         "error": {
-          "type": "string",
-          "description": "Error message if status is 'error'"
+          "$ref": "#/$defs/ErrorObject",
+          "description": "Error details for this turn, if any."
         },
-        "extensions": {
-          "type": "object",
-          "additionalProperties": true,
-          "description": "Extension point for custom turn-level fields"
-        }
+        "extensions":        { "type": "object", "additionalProperties": true, "description": "Extension point for custom turn-level fields" }
       }
     },
     "ThreadSummary": {
       "type": "object",
-      "description": "Aggregate statistics for a thread",
-      "required": ["turns_total", "turns_passed", "turns_failed", "overall_status"],
+      "description": "Aggregate statistics for a thread.",
+      "required": ["turns_total", "turns_passed", "turns_failed", "turns_partial", "turns_errored", "overall_status"],
       "additionalProperties": false,
       "properties": {
-        "turns_total": {
-          "type": "integer",
-          "minimum": 1,
-          "description": "Total number of turns executed"
-        },
-        "turns_passed": {
-          "type": "integer",
-          "minimum": 0,
-          "description": "Number of turns where all evaluators passed"
-        },
-        "turns_failed": {
-          "type": "integer",
-          "minimum": 0,
-          "description": "Number of turns where any evaluator failed"
-        },
+        "turns_total":   { "type": "integer", "minimum": 1, "description": "Total number of turns executed" },
+        "turns_passed":  { "type": "integer", "minimum": 0, "description": "Count of turns with status='pass'" },
+        "turns_failed":  { "type": "integer", "minimum": 0, "description": "Count of turns with status='fail'" },
+        "turns_partial": { "type": "integer", "minimum": 0, "description": "Count of turns with status='partial'" },
+        "turns_errored": { "type": "integer", "minimum": 0, "description": "Count of turns with status='error'" },
         "overall_status": {
           "type": "string",
-          "enum": ["pass", "partial", "fail"],
-          "description": "pass: all turns passed, partial: some failed, fail: all failed or error"
+          "enum": ["pass", "fail", "partial", "error"],
+          "description": "Overall status of the thread."
         }
       }
     },
     "ScoreCollection": {
       "type": "object",
-      "description": "Collection of evaluation scores for an item",
+      "description": "Collection of evaluation scores for an item. Each entry is either a valid result (ValidScore variants) or an errored record (ErroredScore) under the discriminated oneOf.",
       "additionalProperties": true,
       "properties": {
-        "relevance": {
-          "$ref": "#/$defs/EvalScore",
-          "description": "Relevance score (1-5)"
-        },
-        "coherence": {
-          "$ref": "#/$defs/EvalScore",
-          "description": "Coherence score (1-5)"
-        },
-        "groundedness": {
-          "$ref": "#/$defs/EvalScore",
-          "description": "Groundedness score (1-5)"
-        },
-        "similarity": {
-          "$ref": "#/$defs/EvalScore",
-          "description": "Similarity score (1-5)"
-        },
-        "citations": {
-          "$ref": "#/$defs/CitationScore",
-          "description": "Citation evaluation results"
-        },
-        "exactMatch": {
-          "$ref": "#/$defs/ExactMatchScore",
-          "description": "Exact match evaluation result"
-        },
-        "partialMatch": {
-          "$ref": "#/$defs/PartialMatchScore",
-          "description": "Partial match evaluation result"
-        }
+        "relevance":    { "$ref": "#/$defs/EvalScore",         "description": "Relevance score (1-5) or errored entry" },
+        "coherence":    { "$ref": "#/$defs/EvalScore",         "description": "Coherence score (1-5) or errored entry" },
+        "groundedness": { "$ref": "#/$defs/EvalScore",         "description": "Groundedness score (1-5) or errored entry" },
+        "similarity":   { "$ref": "#/$defs/EvalScore",         "description": "Similarity score (1-5) or errored entry" },
+        "citations":    { "$ref": "#/$defs/CitationScore",     "description": "Citation evaluation result or errored entry" },
+        "exactMatch":   { "$ref": "#/$defs/ExactMatchScore",   "description": "Exact match evaluation result or errored entry" },
+        "partialMatch": { "$ref": "#/$defs/PartialMatchScore", "description": "Partial match evaluation result or errored entry" }
       }
     },
     "EvalScore": {
+      "description": "Standard evaluation score (1-5 scale) — valid result OR errored-evaluator record.",
+      "oneOf": [
+        { "$ref": "#/$defs/EvalScoreValid" },
+        { "$ref": "#/$defs/ErroredScore" }
+      ]
+    },
+    "EvalScoreValid": {
       "type": "object",
-      "description": "Standard evaluation score (1-5 scale)",
+      "description": "Valid 1-5 score result. Required when result is pass or fail.",
       "required": ["score", "result", "threshold"],
       "additionalProperties": true,
       "properties": {
-        "score": {
-          "type": "number",
-          "minimum": 1,
-          "maximum": 5,
-          "description": "Numeric score from 1.0 (worst) to 5.0 (best)"
-        },
-        "result": {
-          "type": "string",
-          "enum": ["pass", "fail"],
-          "description": "Pass/fail result based on threshold comparison"
-        },
-        "threshold": {
-          "type": "number",
-          "minimum": 1,
-          "maximum": 5,
-          "description": "Threshold used for pass/fail determination"
-        },
-        "reason": {
-          "type": "string",
-          "description": "Explanation of why this score was assigned"
-        },
-        "evaluator": {
-          "type": "string",
-          "description": "Name or identifier of the evaluator that produced this score"
-        }
+        "score":     { "type": "number", "minimum": 1, "maximum": 5, "description": "Numeric score from 1.0 (worst) to 5.0 (best)" },
+        "result":    { "type": "string", "enum": ["pass", "fail"], "description": "Pass/fail result based on threshold comparison" },
+        "threshold": { "type": "number", "minimum": 1, "maximum": 5, "description": "Threshold used for pass/fail determination" },
+        "reason":    { "type": "string", "description": "Explanation of why this score was assigned" }
       }
     },
     "CitationScore": {
+      "description": "Citation-specific evaluation result — valid result OR errored-evaluator record.",
+      "oneOf": [
+        { "$ref": "#/$defs/CitationScoreValid" },
+        { "$ref": "#/$defs/ErroredScore" }
+      ]
+    },
+    "CitationScoreValid": {
       "type": "object",
-      "description": "Citation-specific evaluation score",
+      "description": "Valid citation-count result. Required when result is pass or fail.",
       "required": ["count", "result", "threshold"],
       "additionalProperties": true,
       "properties": {
-        "count": {
-          "type": "integer",
-          "minimum": 0,
-          "description": "Number of citations found in the response"
-        },
-        "result": {
-          "type": "string",
-          "enum": ["pass", "fail"],
-          "description": "Pass/fail result based on citation count vs threshold"
-        },
-        "threshold": {
-          "type": "integer",
-          "minimum": 0,
-          "description": "Minimum required number of citations for pass"
-        },
-        "format": {
-          "type": "string",
-          "description": "Citation format detected. Known values: 'oai_unicode', 'bracket', 'mixed'. Additional formats may be added.",
-          "examples": ["oai_unicode", "bracket", "mixed"]
-        },
-        "citations": {
-          "type": "array",
-          "items": {
-            "$ref": "#/$defs/Citation"
-          },
-          "description": "Parsed citation objects"
-        }
+        "count":     { "type": "integer", "minimum": 0, "description": "Number of citations found in the response" },
+        "result":    { "type": "string", "enum": ["pass", "fail"], "description": "Pass/fail result based on citation count vs threshold" },
+        "threshold": { "type": "integer", "minimum": 0, "description": "Minimum required number of citations for pass" },
+        "format":    { "type": "string", "description": "Citation format detected. Known values: 'oai_unicode', 'bracket', 'mixed'.", "examples": ["oai_unicode", "bracket", "mixed"] },
+        "citations": { "type": "array", "items": { "$ref": "#/$defs/Citation" }, "description": "Parsed citation objects" }
       }
     },
     "ExactMatchScore": {
+      "description": "Exact match evaluation result — valid result OR errored-evaluator record.",
+      "oneOf": [
+        { "$ref": "#/$defs/ExactMatchScoreValid" },
+        { "$ref": "#/$defs/ErroredScore" }
+      ]
+    },
+    "ExactMatchScoreValid": {
       "type": "object",
-      "description": "Exact match evaluation result",
+      "description": "Valid exact-match result. Required when result is pass or fail.",
       "required": ["match", "result"],
       "additionalProperties": true,
       "properties": {
-        "match": {
-          "type": "boolean",
-          "description": "Whether response exactly matches expected_response (trimmed; case-insensitive by default)"
-        },
-        "result": {
-          "type": "string",
-          "enum": ["pass", "fail"],
-          "description": "Pass when match is true, fail otherwise"
-        },
-        "reason": {
-          "type": "string",
-          "description": "Explanation of the match result"
-        }
+        "match":  { "type": "boolean", "description": "Whether response exactly matches expected_response (trimmed; case-insensitive by default)" },
+        "result": { "type": "string", "enum": ["pass", "fail"], "description": "Pass when match is true, fail otherwise" },
+        "reason": { "type": "string", "description": "Explanation of the match result" }
       }
     },
     "PartialMatchScore": {
+      "description": "Partial match evaluation result — valid result OR errored-evaluator record.",
+      "oneOf": [
+        { "$ref": "#/$defs/PartialMatchScoreValid" },
+        { "$ref": "#/$defs/ErroredScore" }
+      ]
+    },
+    "PartialMatchScoreValid": {
       "type": "object",
-      "description": "Partial match evaluation result",
+      "description": "Valid partial-match result (0.0-1.0 score). Required when result is pass or fail.",
       "required": ["score", "result", "threshold"],
       "additionalProperties": true,
       "properties": {
-        "score": {
-          "type": "number",
-          "minimum": 0,
-          "maximum": 1,
-          "description": "Match score from 0.0 (no match) to 1.0 (full match)"
-        },
-        "result": {
+        "score":     { "type": "number", "minimum": 0, "maximum": 1, "description": "Match score from 0.0 (no match) to 1.0 (full match)" },
+        "result":    { "type": "string", "enum": ["pass", "fail"], "description": "Pass/fail based on score vs threshold" },
+        "threshold": { "type": "number", "minimum": 0, "maximum": 1, "description": "Minimum score required for pass (default: 0.5)" },
+        "reason":    { "type": "string", "description": "Explanation of the match result" }
+      }
+    },
+    "ErroredScore": {
+      "type": "object",
+      "description": "Per-evaluator entry for an evaluator that did not produce a result.",
+      "required": ["result", "error"],
+      "additionalProperties": false,
+      "properties": {
+        "result": { "type": "string", "const": "error", "description": "Always 'error' for this variant." },
+        "error":  { "type": "string", "minLength": 1, "description": "Error message describing why the evaluator did not produce a result." }
+      }
+    },
+    "ErrorObject": {
+      "type": "object",
+      "description": "Structured turn/item-level error with a machine-readable code and a human-readable message.",
+      "required": ["code", "message"],
+      "additionalProperties": false,
+      "properties": {
+        "code": {
           "type": "string",
-          "enum": ["pass", "fail"],
-          "description": "Pass/fail based on score vs threshold"
-        },
-        "threshold": {
-          "type": "number",
-          "minimum": 0,
-          "maximum": 1,
-          "description": "Minimum score required for pass (default: 0.5)"
+          "minLength": 1,
+          "description": "Machine-readable error category. One of: 'agentRequestFailed', 'turnSkipped', 'evaluatorsFailed'."
         },
-        "reason": {
+        "message": {
           "type": "string",
-          "description": "Explanation of the match result"
+          "minLength": 1,
+          "description": "Human-readable message paired with the code."
         }
       }
     },
@@ -438,25 +274,10 @@
       "description": "Evaluator configuration options. Use empty object {} for defaults.",
       "additionalProperties": false,
       "properties": {
-        "threshold": {
-          "type": "number",
-          "description": "Pass/fail threshold. Range depends on evaluator type: 1-5 for LLM evaluators (default: 3), >= 1 integer for Citations (min citation count, default: 1), 0.0-1.0 for PartialMatch (min match ratio, default: 0.5). Validated per-evaluator at runtime."
-        },
-        "citation_format": {
-          "type": "string",
-          "examples": ["oai_unicode", "bracket", "mixed"],
-          "description": "Citation format for detection. 'oai_unicode': new OAI unicode format, 'bracket': legacy [^i^] bracket format, 'mixed': auto-detect both formats. Default: oai_unicode."
-        },
-        "case_sensitive": {
-          "type": "boolean",
-          "default": false,
-          "description": "Case-sensitive matching for ExactMatch/PartialMatch"
-        },
-        "options": {
-          "type": "object",
-          "additionalProperties": true,
-          "description": "Evaluator-specific configuration"
-        }
+        "threshold":       { "type": "number", "description": "Pass/fail threshold. Range depends on evaluator type: 1-5 for LLM evaluators (default: 3), >= 1 integer for Citations (default: 1), 0.0-1.0 for PartialMatch (default: 0.5)." },
+        "citation_format": { "type": "string", "examples": ["oai_unicode", "bracket", "mixed"], "description": "Citation format for detection. Default: oai_unicode." },
+        "case_sensitive":  { "type": "boolean", "default": false, "description": "Case-sensitive matching for ExactMatch/PartialMatch" },
+        "options":         { "type": "object", "additionalProperties": true, "description": "Evaluator-specific configuration" }
       }
     },
     "Citation": {
@@ -465,19 +286,9 @@
       "required": ["index"],
       "additionalProperties": true,
       "properties": {
-        "index": {
-          "type": "integer",
-          "minimum": 1,
-          "description": "Citation index (1-based)"
-        },
-        "text": {
-          "type": "string",
-          "description": "The cited text"
-        },
-        "source": {
-          "type": "string",
-          "description": "Source reference (URL, document name, etc.)"
-        }
+        "index":  { "type": "integer", "minimum": 1, "description": "Citation index (1-based)" },
+        "text":   { "type": "string", "description": "The cited text" },
+        "source": { "type": "string", "description": "Source reference (URL, document name, etc.)" }
       }
     }
   }

package/schema/v1/examples/invalid/error-result-with-score.json ADDED

@@ -0,0 +1,16 @@
+{
+  "schemaVersion": "1.4.0",
+  "items": [
+    {
+      "prompt": "What is Microsoft Graph?",
+      "scores": {
+        "relevance": {
+          "result": "error",
+          "error": "Evaluator failed: Connection timeout to Azure OpenAI endpoint",
+          "score": 0,
+          "threshold": 3
+        }
+      }
+    }
+  ]
+}

package/schema/v1/examples/invalid/missing-error-on-error.json ADDED

@@ -0,0 +1,13 @@
+{
+  "schemaVersion": "1.4.0",
+  "items": [
+    {
+      "prompt": "What is Microsoft Graph?",
+      "scores": {
+        "relevance": {
+          "result": "error"
+        }
+      }
+    }
+  ]
+}

package/schema/v1/examples/valid/multi-turn-output.json CHANGED

@@ -52,6 +52,8 @@
         "turns_total": 2,
         "turns_passed": 2,
         "turns_failed": 0,
+        "turns_partial": 0,
+        "turns_errored": 0,
         "overall_status": "pass"
       }
     }