@microsoft/m365-copilot-eval 1.2.1-preview.1 → 1.3.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -265,10 +265,18 @@ runevals --prompts "What is Microsoft Graph?" --expected "Gateway to M365 data"
  # Interactive mode (enter prompts interactively)
  runevals --interactive

+ # Canonical logging verbosity
+ runevals --log-level debug
+ runevals --log-level info
+ runevals --log-level warning
+ runevals --log-level error
+
  # Custom output location in your project
  runevals --output ./reports/results.html
  ```

+ > **⚠️ Debug log safety notice:** The `--log-level debug` option is opt-in and may include raw API payloads and response data in console output. Redaction is pattern-based (API keys, tokens, passwords, long mixed-case strings) and **will not catch arbitrary PII or custom credentials** embedded in prompts or responses. Do not share debug-level output publicly without manual review.
+
  ### Optional: Add Shortcuts to package.json

  You can add shortcuts (npm scripts) to your agent project's `package.json`:
@@ -320,8 +328,7 @@ runevals --output results.csv
  ```bash
  Options:
  -V, --version output version number
- -v, --verbose show detailed processing steps
- -q, --quiet minimal output
+ --log-level [level] log level: debug|info|warning|error (bare flag -> info)
  --prompts <prompts...> inline prompts to evaluate
  --expected <responses...> expected responses (with --prompts)
  --prompts-file <file> JSON file with prompts
@@ -360,7 +367,7 @@ runevals cache-info

  # Clear and rebuild
  runevals cache-clear
- runevals --init-only --verbose
+ runevals --init-only --log-level debug
  ```

  ### Network/Proxy Issues
@@ -369,7 +376,7 @@ runevals --init-only --verbose
  export HTTPS_PROXY=http://proxy:8080

  # Retry with verbose output
- runevals --init-only --verbose
+ runevals --init-only --log-level debug
  ```

  ### Permission Issues
package/package.json CHANGED
@@ -1,9 +1,9 @@
  {
    "name": "@microsoft/m365-copilot-eval",
-   "version": "1.2.1-preview.1",
+   "version": "1.3.0-preview.1",
    "minCliVersion": "1.0.1-preview.1",
    "description": "Zero-config Node.js wrapper for M365 Copilot Agent Evaluations CLI (Python-based Azure AI Evaluation SDK)",
-   "publishDate": "2026-03-23",
+   "publishDate": "2026-04-01",
    "main": "src/clients/node-js/lib/index.js",
    "type": "module",
    "bin": {
@@ -5,6 +5,14 @@ All notable changes to the eval document schema will be documented in this file.
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+ ## [1.1.0](https://github.com/microsoft/M365-Copilot-Agent-Evals/compare/schema-v1.0.0...schema-v1.1.0) (2026-03-30)
+
+ ### Features
+
+ * **WI-6855059:** add agentName/cliVersion to schema, fix duplicate prompt loss, include default_evaluators in output ([#181](https://github.com/microsoft/M365-Copilot-Agent-Evals/issues/181)) ([9321474](https://github.com/microsoft/M365-Copilot-Agent-Evals/commit/93214746144e9d11f507433eff185aefac4a858a))
+ * **WI-6855059:** implement per-prompt evaluator configuration ([#168](https://github.com/microsoft/M365-Copilot-Agent-Evals/issues/168)) ([eface7e](https://github.com/microsoft/M365-Copilot-Agent-Evals/commit/eface7e7041b118681cd4c68582fe903640bf6c0))
+
  ## [1.0.0] - 2026-02-19

  ### Added
@@ -2,7 +2,7 @@
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "$id": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
    "title": "M365 Copilot Eval Document",
-   "description": "Schema for evaluation documents used by M365 Copilot Agent Evals CLI. Version 1.0.0.",
+   "description": "Schema for evaluation documents used by M365 Copilot Agent Evals CLI. Version 1.1.0.",
    "type": "object",
    "required": ["schemaVersion", "items"],
    "additionalProperties": true,
@@ -21,6 +21,10 @@
      "metadata": {
        "$ref": "#/$defs/DocumentMetadata"
      },
+     "default_evaluators": {
+       "$ref": "#/$defs/EvaluatorMap",
+       "description": "File-level default evaluators (overrides system defaults)"
+     },
      "items": {
        "type": "array",
        "minItems": 1,
@@ -69,6 +73,14 @@
          "type": "string",
          "description": "M365 Agent ID this evaluation targets"
        },
+       "agentName": {
+         "type": "string",
+         "description": "Name of the M365 agent this evaluation targets"
+       },
+       "cliVersion": {
+         "type": "string",
+         "description": "Version of the M365 Copilot Agent Evals CLI that produced this document"
+       },
        "extensions": {
          "type": "object",
          "additionalProperties": true,
@@ -99,6 +111,16 @@
          "type": "string",
          "description": "Additional context for grounding evaluation"
        },
+       "evaluators": {
+         "$ref": "#/$defs/EvaluatorMap",
+         "description": "Per-prompt evaluator overrides"
+       },
+       "evaluators_mode": {
+         "type": "string",
+         "enum": ["extend", "replace"],
+         "default": "extend",
+         "description": "How per-prompt evaluators combine with defaults"
+       },
        "citations": {
          "type": "array",
          "items": {
@@ -140,6 +162,14 @@
        "citations": {
          "$ref": "#/$defs/CitationScore",
          "description": "Citation evaluation results"
+       },
+       "exactMatch": {
+         "$ref": "#/$defs/ExactMatchScore",
+         "description": "Exact match evaluation result"
+       },
+       "partialMatch": {
+         "$ref": "#/$defs/PartialMatchScore",
+         "description": "Partial match evaluation result"
        }
      }
    },
@@ -211,6 +241,92 @@
          }
        }
      },
+     "ExactMatchScore": {
+       "type": "object",
+       "description": "Exact match evaluation result",
+       "required": ["match", "result"],
+       "additionalProperties": true,
+       "properties": {
+         "match": {
+           "type": "boolean",
+           "description": "Whether response exactly matches expected_response (trimmed; case-insensitive by default)"
+         },
+         "result": {
+           "type": "string",
+           "enum": ["pass", "fail"],
+           "description": "Pass when match is true, fail otherwise"
+         },
+         "reason": {
+           "type": "string",
+           "description": "Explanation of the match result"
+         }
+       }
+     },
+     "PartialMatchScore": {
+       "type": "object",
+       "description": "Partial match evaluation result",
+       "required": ["score", "result", "threshold"],
+       "additionalProperties": true,
+       "properties": {
+         "score": {
+           "type": "number",
+           "minimum": 0,
+           "maximum": 1,
+           "description": "Match score from 0.0 (no match) to 1.0 (full match)"
+         },
+         "result": {
+           "type": "string",
+           "enum": ["pass", "fail"],
+           "description": "Pass/fail based on score vs threshold"
+         },
+         "threshold": {
+           "type": "number",
+           "minimum": 0,
+           "maximum": 1,
+           "description": "Minimum score required for pass (default: 0.5)"
+         },
+         "reason": {
+           "type": "string",
+           "description": "Explanation of the match result"
+         }
+       }
+     },
+     "EvaluatorMap": {
+       "type": "object",
+       "description": "Map of evaluator names to their configuration options",
+       "propertyNames": {
+         "enum": ["Relevance", "Coherence", "Groundedness", "ToolCallAccuracy", "Citations", "ExactMatch", "PartialMatch"]
+       },
+       "additionalProperties": {
+         "$ref": "#/$defs/EvaluatorOptions"
+       }
+     },
+     "EvaluatorOptions": {
+       "type": "object",
+       "description": "Evaluator configuration options. Use empty object {} for defaults.",
+       "additionalProperties": false,
+       "properties": {
+         "threshold": {
+           "type": "number",
+           "description": "Pass/fail threshold. Range depends on evaluator type: 1-5 for LLM evaluators (default: 3), >= 1 integer for Citations (min citation count, default: 1), 0.0-1.0 for PartialMatch (min match ratio, default: 0.5). Validated per-evaluator at runtime."
+         },
+         "citation_format": {
+           "type": "string",
+           "examples": ["oai_unicode", "bracket", "mixed"],
+           "description": "Citation format for detection. 'oai_unicode': new OAI unicode format, 'bracket': legacy [^i^] bracket format, 'mixed': auto-detect both formats. Default: oai_unicode."
+         },
+         "case_sensitive": {
+           "type": "boolean",
+           "default": false,
+           "description": "Case-sensitive matching for ExactMatch/PartialMatch"
+         },
+         "options": {
+           "type": "object",
+           "additionalProperties": true,
+           "description": "Evaluator-specific configuration"
+         }
+       }
+     },
      "Citation": {
        "type": "object",
        "description": "A single citation reference",
@@ -1,6 +1,6 @@
  {
    "$schema": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
-   "schemaVersion": "1.0.0",
+   "schemaVersion": "1.1.0",
    "metadata": {
      "name": "Graph API Evaluation Set",
      "description": "Test prompts for Microsoft Graph API knowledge",
@@ -9,11 +9,17 @@
      "evaluatedAt": "2026-01-20T10:30:00Z",
      "tags": ["graph", "api", "authentication"],
      "agentId": "12345678-1234-1234-1234-123456789abc",
+     "agentName": "Graph Knowledge Agent",
+     "cliVersion": "1.2.0",
      "extensions": {
        "com.contoso.department": "engineering",
        "com.contoso.priority": "high"
      }
    },
+   "default_evaluators": {
+     "Relevance": {},
+     "Coherence": {}
+   },
    "items": [
      {
        "prompt": "What is Microsoft Graph API?",
@@ -86,7 +92,26 @@
      },
      {
        "prompt": "How do I authenticate with Microsoft Graph?",
-       "expected_response": "You can authenticate using OAuth 2.0 or client credentials flow."
+       "expected_response": "You can authenticate using OAuth 2.0 or client credentials flow.",
+       "evaluators": {
+         "ExactMatch": { "case_sensitive": false },
+         "PartialMatch": { "threshold": 0.5 }
+       },
+       "evaluators_mode": "replace",
+       "response": "You can authenticate using OAuth 2.0 or client credentials flow.",
+       "scores": {
+         "exactMatch": {
+           "match": true,
+           "result": "pass",
+           "reason": "Exact match found"
+         },
+         "partialMatch": {
+           "score": 1.0,
+           "result": "pass",
+           "threshold": 0.5,
+           "reason": "Match score: 1.000"
+         }
+       }
      }
    ]
  }
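Since the example document above now declares `schemaVersion: 1.1.0`, a quick way to sanity-check a document against the published schema is the third-party `jsonschema` package. A sketch — the file paths are placeholders, and the CLI's own validation pipeline is not shown in this diff:

```python
import json

from jsonschema import Draft202012Validator  # pip install jsonschema

# Placeholder paths; substitute your own document and the schema file from this package.
with open("eval-document.json") as f:
    document = json.load(f)
with open("eval-document.schema.json") as f:
    schema = json.load(f)

validator = Draft202012Validator(schema)
errors = list(validator.iter_errors(document))
for error in errors:
    print(f"{list(error.absolute_path)}: {error.message}")
if not errors:
    print("Document is valid against schema 1.1.0")
```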
@@ -1,6 +1,6 @@
  {
-   "version": "1.0.0",
-   "releaseDate": "2026-02-19",
+   "version": "1.1.0",
+   "releaseDate": "2026-03-17",
    "schemaId": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
    "description": "M365 Copilot Eval Document Schema"
  }
@@ -0,0 +1,55 @@
+ import json
+ import sys
+ from collections import OrderedDict
+ from typing import Any, Dict
+
+ from cli_logging.logging_utils import STRUCTURED_LOG_FIELDS
+
+ _ANSI_COLORS = {
+     "debug": "\033[2m",     # dim
+     "info": "",             # default
+     "warning": "\033[33m",  # yellow
+     "error": "\033[31m",    # red
+ }
+ _ANSI_RESET = "\033[0m"
+
+
+ def format_diagnostic_record(record: Dict[str, Any]) -> OrderedDict:
+     ordered = OrderedDict()
+     for field in STRUCTURED_LOG_FIELDS:
+         default = False if field == "is-redacted" else None
+         ordered[field] = record.get(field, default)
+     return ordered
+
+
+ def serialize_diagnostic_record(record: Dict[str, Any]) -> str:
+     return json.dumps(format_diagnostic_record(record), ensure_ascii=False)
+
+
+ def format_console_record(record: Dict[str, Any], max_message_length: int = 250) -> str:
+     """Format a diagnostic record for human-readable TTY output with ANSI colors."""
+     ts = record.get("timestamp", "")
+     # Extract HH:MM:SS from ISO timestamp
+     time_part = ts[11:19] if len(ts) >= 19 else ts
+     level = (record.get("level") or "info").upper()
+     message = record.get("message", "")
+     if len(message) > max_message_length:
+         message = message[:max_message_length] + "…"
+
+     ids = []
+     for key in ("request-id", "conversation-id", "message-id"):
+         val = record.get(key)
+         if val:
+             ids.append(f"{key}={val}")
+     id_suffix = f" ({' | '.join(ids)})" if ids else ""
+
+     color = _ANSI_COLORS.get((record.get("level") or "info").lower(), "")
+     reset = _ANSI_RESET if color else ""
+     return f"{color}[{time_part}] {level} {message}{id_suffix}{reset}"
+
+
+ def render_diagnostic(record: Dict[str, Any]) -> str:
+     """Return TTY-friendly or JSON output depending on whether stdout is a terminal."""
+     if sys.stdout.isatty():
+         return format_console_record(record)
+     return serialize_diagnostic_record(record)
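A usage sketch for the formatter above. The record shape follows `STRUCTURED_LOG_FIELDS`; the import path is an assumption (the diff shows the new file's contents but not its location, beyond it living alongside `cli_logging.logging_utils`):

```python
# Assumed module path; the diff does not name the new file.
from cli_logging.console_formatter import format_console_record, serialize_diagnostic_record

record = {
    "timestamp": "2026-04-01T10:30:00+00:00",
    "level": "warning",
    "operation": "send-prompt",
    "request-id": "req-123",
    "conversation-id": None,
    "message-id": None,
    "logger": "cli",
    "message": "Retrying after transient 429 response",
    "is-redacted": False,
}

# TTY path (ANSI color codes omitted here):
# [10:30:00] WARNING Retrying after transient 429 response (request-id=req-123)
print(format_console_record(record))

# Non-TTY path: one JSON object per line, fields ordered per STRUCTURED_LOG_FIELDS.
print(serialize_diagnostic_record(record))
```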
@@ -0,0 +1,145 @@
+ import logging
+ import re
+ from datetime import datetime, timezone
+ from enum import Enum
+ from typing import Any, Dict, List, Optional, Tuple
+
+
+ class LogLevel(str, Enum):
+     """Log level enum. Inherits from str so comparisons like level == "debug" work."""
+     DEBUG = "debug"
+     INFO = "info"
+     WARNING = "warning"
+     ERROR = "error"
+
+
+ class Operation(str, Enum):
+     """CLI operation steps for structured log entries."""
+     SETUP = "setup"
+     AUTHENTICATE = "authenticate"
+     VALIDATE_ENV = "validate-env"
+     LOAD_PROMPTS = "load-prompts"
+     FETCH_AGENTS = "fetch-agents"
+     SEND_PROMPT = "send-prompt"
+     EVALUATE = "evaluate"
+     WRITE_OUTPUT = "write-output"
+
+
+ ALLOWED_LOG_LEVELS = tuple(level.value for level in LogLevel)
+ LOG_LEVEL_MAP = {
+     LogLevel.DEBUG: logging.DEBUG,
+     LogLevel.INFO: logging.INFO,
+     LogLevel.WARNING: logging.WARNING,
+     LogLevel.ERROR: logging.ERROR,
+ }
+
+ STRUCTURED_LOG_FIELDS = (
+     "timestamp",
+     "level",
+     "operation",
+     "request-id",
+     "conversation-id",
+     "message-id",
+     "logger",
+     "message",
+     "is-redacted",
+ )
+
+
+ def normalize_log_level(value: Optional[str]) -> Optional[str]:
+     if value is None:
+         return None
+     return value.strip().lower()
+
+
+ def resolve_log_level(
+     log_level_values: Optional[List[str]],
+ ) -> Tuple[Optional[str], Optional[str]]:
+     values = log_level_values or []
+     if not values:
+         return "info", None
+
+     # Use the last value provided (aligns with Node.js wrapper behavior).
+     last = normalize_log_level(values[-1])
+     if last not in ALLOWED_LOG_LEVELS:
+         return (
+             None,
+             "Invalid value for --log-level. Supported values are: "
+             "debug, info, warning, error.",
+         )
+
+     return last, None
+
+
+ def utc_iso_timestamp() -> str:
+     return datetime.now(timezone.utc).isoformat()
+
+
+ def build_run_context(
+     operation: str = "evaluate",
+     request_id: Optional[str] = None,
+     conversation_id: Optional[str] = None,
+     message_id: Optional[str] = None,
+ ) -> Dict[str, Optional[str]]:
+     return {
+         "request-id": request_id,
+         "conversation-id": conversation_id,
+         "message-id": message_id,
+         "operation": operation,
+     }
+
+
+ _SECRET_PATTERNS = [
+     re.compile(r"(?i)(api[_-]?key\s*[:=]\s*)([^\s,;]+)"),
+     re.compile(r"(?i)(token\s*[:=]\s*)([^\s,;]+)"),
+     re.compile(r"(?i)(authorization\s*[:=]\s*bearer\s+)([^\s,;]+)"),
+     re.compile(r"(?i)(password\s*[:=]\s*)([^\s,;]+)"),
+ ]
+
+
+ def redact_sensitive_content(message: Optional[str]) -> Tuple[str, bool]:
+     if message is None:
+         return "", False
+
+     redacted = message
+     changed = False
+     for pattern in _SECRET_PATTERNS:
+         updated = pattern.sub(r"\1***REDACTED***", redacted)
+         if updated != redacted:
+             changed = True
+             redacted = updated
+
+     # Fallback: match strings 32+ chars containing mixed case and digits
+     # (likely a credential/token) that weren't already caught above.
+     if (
+         "***REDACTED***" not in redacted
+         and re.search(
+             r"(?=[A-Za-z0-9_\-]*[A-Z])(?=[A-Za-z0-9_\-]*[a-z])"
+             r"(?=[A-Za-z0-9_\-]*[0-9])[A-Za-z0-9_\-]{32,}",
+             redacted,
+         )
+     ):
+         return "[REDACTED]", True
+
+     return redacted, changed
+
+
+ def format_structured_log_entry(
+     level: str,
+     message: str,
+     logger_name: str,
+     run_context: Dict[str, Optional[str]],
+ ) -> Dict[str, Any]:
+     safe_message, is_redacted = redact_sensitive_content(message)
+     return {
+         "level": normalize_log_level(level) or "info",
+         "message": safe_message,
+         "logger": logger_name,
+         "timestamp": utc_iso_timestamp(),
+         "request-id": run_context.get("request-id"),
+         "conversation-id": run_context.get("conversation-id"),
+         "message-id": run_context.get("message-id"),
+         "operation": run_context.get("operation"),
+         "is-redacted": is_redacted,
+     }
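A sketch of what this redaction pass does and does not catch, consistent with the README's safety notice: the `api_key=` value hits a secret pattern, while ordinary PII passes through untouched (the email address is a hypothetical value):

```python
from cli_logging.logging_utils import (
    build_run_context,
    format_structured_log_entry,
    redact_sensitive_content,
)

ctx = build_run_context(operation="send-prompt", request_id="req-123")
entry = format_structured_log_entry(
    level="DEBUG",
    message="calling endpoint with api_key=sk-abc123",
    logger_name="http",
    run_context=ctx,
)
print(entry["message"])       # calling endpoint with api_key=***REDACTED***
print(entry["is-redacted"])   # True
print(entry["level"])         # debug (normalized to lowercase)

# Ordinary PII matches no secret pattern and is NOT redacted.
message, changed = redact_sensitive_content("user email: alice@contoso.com")
print(message, changed)       # user email: alice@contoso.com False
```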
@@ -0,0 +1,51 @@
+ """Shared types and constants for the CLI."""
+
+ import re
+ from dataclasses import dataclass
+ from typing import List, Optional
+
+
+ def pascal_case_to_title(eval_name: str) -> str:
+     """Convert PascalCase evaluator name to space-separated display name.
+
+     e.g., "ToolCallAccuracy" → "Tool Call Accuracy"
+     """
+     return re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', eval_name)
+
+
+ # Canonical evaluator name constants
+ RELEVANCE = "Relevance"
+ COHERENCE = "Coherence"
+ GROUNDEDNESS = "Groundedness"
+ TOOL_CALL_ACCURACY = "ToolCallAccuracy"
+ CITATIONS = "Citations"
+ EXACT_MATCH = "ExactMatch"
+ PARTIAL_MATCH = "PartialMatch"
+
+ # Prerequisite constants
+ REQUIRES_AZURE_OPENAI = "azure_openai"
+ REQUIRES_TOOL_DEFINITIONS = "tool_definitions"
+
+ # System defaults when no file-level or env-level defaults are configured
+ SYSTEM_DEFAULT_EVALUATORS = [
+     RELEVANCE,
+     COHERENCE,
+ ]
+
+ # Mapping from evaluator name to the key used in evaluator output dicts
+ METRIC_IDS = {
+     RELEVANCE: "relevance",
+     COHERENCE: "coherence",
+     GROUNDEDNESS: "groundedness",
+     TOOL_CALL_ACCURACY: "tool_call_accuracy",
+     CITATIONS: "citations",
+     EXACT_MATCH: "exact_match",
+     PARTIAL_MATCH: "partial_match",
+ }
+
+
+ @dataclass
+ class RegistryEntry:
+     type: str  # "llm", "tool", or "non-llm"
+     requires: List[str]
+     default_threshold: Optional[float]
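`RegistryEntry` implies a name-to-metadata registry somewhere in the CLI. A hypothetical example of how these constants might be wired together — the actual registry contents are not part of this diff; the thresholds here follow the defaults documented in `EvaluatorOptions`:

```python
# Hypothetical registry; run alongside the shared-types module above.
EVALUATOR_REGISTRY = {
    RELEVANCE: RegistryEntry(type="llm", requires=[REQUIRES_AZURE_OPENAI], default_threshold=3.0),
    CITATIONS: RegistryEntry(type="non-llm", requires=[], default_threshold=1.0),
    PARTIAL_MATCH: RegistryEntry(type="non-llm", requires=[], default_threshold=0.5),
}

for name, entry in EVALUATOR_REGISTRY.items():
    # pascal_case_to_title turns "PartialMatch" into "Partial Match" for display.
    print(f"{pascal_case_to_title(name)}: type={entry.type}, threshold={entry.default_threshold}")
```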
@@ -17,8 +17,8 @@ from typing import Dict, Any, Optional
  class CitationFormat(Enum):
      """Enum for different citation formats supported by the evaluator."""
      OAI_UNICODE = "oai_unicode"  # New format: \ue200cite\ue202turn{X}search{Y}\ue201
-     LEGACY_BRACKET = "legacy_bracket"  # Old format: [^i^]
-     AUTO = "auto"  # Automatically detect both formats
+     LEGACY_BRACKET = "bracket"  # Old format: [^i^]
+     AUTO = "mixed"  # Automatically detect both formats


  class CitationsEvaluator:
@@ -141,7 +141,7 @@ class CitationsEvaluator:

          results = {
              "citation_format": self.citation_format.value,
-             "score": total_citations,
+             "citations": total_citations,
              "result": "pass" if total_citations > 0 else "fail",
              "threshold": 1,
              "reason": " ".join(reason_parts)
@@ -1,8 +1,6 @@
- from azure.ai.evaluation import evaluate
-
  class ExactMatchEvaluator:
-     def __init__(self):
-         pass
+     def __init__(self, case_sensitive=False):
+         self.case_sensitive = case_sensitive

      def __call__(self, *, response: str, expected_answer: str, **kwargs):
          if response is None or response.strip() == "":
@@ -11,15 +9,17 @@ class ExactMatchEvaluator:
          if expected_answer is None:
              raise ValueError("Expected answer cannot be None.")

-         # Case-sensitive exact match (mimics C# StringComparison.InvariantCulture)
-         is_match = response.strip() == expected_answer.strip()
+         resp = response.strip()
+         exp = expected_answer.strip()
+
+         if not self.case_sensitive:
+             resp = resp.lower()
+             exp = exp.lower()
+
+         is_match = resp == exp

          return {
              "exact_match": 1.0 if is_match else 0.0,
-             "exact_match_result": "pass" if is_match else "fail",
-             "exact_match_threshold": 1.0,
+             "result": "pass" if is_match else "fail",
              "exact_match_reason": "Exact match found" if is_match else "No exact match found"
          }
-
-
- exact_match_evaluator = ExactMatchEvaluator()
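With the default now case-insensitive (matching the updated `ExactMatchScore` description, "trimmed; case-insensitive by default"), a quick usage sketch of the evaluator above:

```python
evaluator = ExactMatchEvaluator()              # case_sensitive defaults to False
result = evaluator(response="  OAuth 2.0  ", expected_answer="oauth 2.0")
print(result["exact_match"], result["result"])  # 1.0 pass  (trimmed + lowercased)

strict = ExactMatchEvaluator(case_sensitive=True)
print(strict(response="OAuth 2.0", expected_answer="oauth 2.0")["result"])  # fail
```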
@@ -1,5 +1,3 @@
- from azure.ai.evaluation import evaluate
-
  class PartialMatchEvaluator:
      def __init__(self, case_sensitive=False):
          self.case_sensitive = case_sensitive
@@ -25,15 +23,7 @@ class PartialMatchEvaluator:
          else:
              score = 0.0

-         threshold = 0.5  # 50% match threshold
-         is_pass = score >= threshold
-
          return {
              "partial_match": score,
-             "partial_match_result": "pass" if is_pass else "fail",
-             "partial_match_threshold": threshold,
-             "partial_match_reason": f"Match score: {score:.3f} ({'above' if is_pass else 'below'} threshold {threshold})"
+             "partial_match_reason": f"Match score: {score:.3f}"
          }
-
-
- partial_match_evaluator = PartialMatchEvaluator(case_sensitive=False)
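With thresholding removed from the evaluator itself, pass/fail is now decided by the caller against `PartialMatchScore.threshold` (default 0.5). A sketch, assuming `__call__` takes the same keyword arguments as `ExactMatchEvaluator` (its signature falls outside this hunk):

```python
evaluator = PartialMatchEvaluator(case_sensitive=False)
result = evaluator(response="You can authenticate using OAuth 2.0.",
                   expected_answer="OAuth 2.0")

score = result["partial_match"]
threshold = 0.5  # EvaluatorOptions default for PartialMatch
print("pass" if score >= threshold else "fail")
```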