@microsoft/m365-copilot-eval 1.2.1-preview.1 → 1.3.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -4
- package/package.json +2 -2
- package/schema/CHANGELOG.md +8 -0
- package/schema/v1/eval-document.schema.json +117 -1
- package/schema/v1/examples/valid/comprehensive.json +27 -2
- package/schema/version.json +2 -2
- package/src/clients/cli/cli_logging/__init__.py +0 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +55 -0
- package/src/clients/cli/cli_logging/logging_utils.py +145 -0
- package/src/clients/cli/common.py +51 -0
- package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +3 -3
- package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +11 -11
- package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +1 -11
- package/src/clients/cli/evaluator_resolver.py +150 -0
- package/src/clients/cli/generate_report.py +130 -110
- package/src/clients/cli/main.py +513 -236
- package/src/clients/cli/readme.md +14 -7
- package/src/clients/cli/response_extractor.py +32 -14
- package/src/clients/node-js/bin/runevals.js +58 -28
- package/src/clients/node-js/config/default.js +1 -1
package/README.md
CHANGED
|
@@ -265,10 +265,18 @@ runevals --prompts "What is Microsoft Graph?" --expected "Gateway to M365 data"
|
|
|
265
265
|
# Interactive mode (enter prompts interactively)
|
|
266
266
|
runevals --interactive
|
|
267
267
|
|
|
268
|
+
# Canonical logging verbosity
|
|
269
|
+
runevals --log-level debug
|
|
270
|
+
runevals --log-level info
|
|
271
|
+
runevals --log-level warning
|
|
272
|
+
runevals --log-level error
|
|
273
|
+
|
|
268
274
|
# Custom output location in your project
|
|
269
275
|
runevals --output ./reports/results.html
|
|
270
276
|
```
|
|
271
277
|
|
|
278
|
+
> **⚠️ Debug log safety notice:** The `--log-level debug` option is opt-in and may include raw API payloads and response data in console output. Redaction is pattern-based (API keys, tokens, passwords, long mixed-case strings) and **will not catch arbitrary PII or custom credentials** embedded in prompts or responses. Do not share debug-level output publicly without manual review.
|
|
279
|
+
|
|
272
280
|
### Optional: Add Shortcuts to package.json
|
|
273
281
|
|
|
274
282
|
You can add shortcuts (npm scripts) to your agent project's `package.json`:
|
|
@@ -320,8 +328,7 @@ runevals --output results.csv
|
|
|
320
328
|
```bash
|
|
321
329
|
Options:
|
|
322
330
|
-V, --version output version number
|
|
323
|
-
-
|
|
324
|
-
-q, --quiet minimal output
|
|
331
|
+
--log-level [level] log level: debug|info|warning|error (bare flag -> info)
|
|
325
332
|
--prompts <prompts...> inline prompts to evaluate
|
|
326
333
|
--expected <responses...> expected responses (with --prompts)
|
|
327
334
|
--prompts-file <file> JSON file with prompts
|
|
@@ -360,7 +367,7 @@ runevals cache-info
|
|
|
360
367
|
|
|
361
368
|
# Clear and rebuild
|
|
362
369
|
runevals cache-clear
|
|
363
|
-
runevals --init-only --verbose
|
|
370
|
+
runevals --init-only --log-level debug
|
|
364
371
|
```
|
|
365
372
|
|
|
366
373
|
### Network/Proxy Issues
|
|
@@ -369,7 +376,7 @@ runevals --init-only --verbose
|
|
|
369
376
|
export HTTPS_PROXY=http://proxy:8080
|
|
370
377
|
|
|
371
378
|
# Retry with verbose output
|
|
372
|
-
runevals --init-only --verbose
|
|
379
|
+
runevals --init-only --log-level debug
|
|
373
380
|
```
|
|
374
381
|
|
|
375
382
|
### Permission Issues
|
package/package.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@microsoft/m365-copilot-eval",
|
|
3
|
-
"version": "1.2.1-preview.1",
|
|
3
|
+
"version": "1.3.0-preview.1",
|
|
4
4
|
"minCliVersion": "1.0.1-preview.1",
|
|
5
5
|
"description": "Zero-config Node.js wrapper for M365 Copilot Agent Evaluations CLI (Python-based Azure AI Evaluation SDK)",
|
|
6
|
-
"publishDate": "2026-
|
|
6
|
+
"publishDate": "2026-04-01",
|
|
7
7
|
"main": "src/clients/node-js/lib/index.js",
|
|
8
8
|
"type": "module",
|
|
9
9
|
"bin": {
|
package/schema/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,14 @@ All notable changes to the eval document schema will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.1.0](https://github.com/microsoft/M365-Copilot-Agent-Evals/compare/schema-v1.0.0...schema-v1.1.0) (2026-03-30)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Features
|
|
12
|
+
|
|
13
|
+
* **WI-6855059:** add agentName/cliVersion to schema, fix duplicate prompt loss, include default_evaluators in output ([#181](https://github.com/microsoft/M365-Copilot-Agent-Evals/issues/181)) ([9321474](https://github.com/microsoft/M365-Copilot-Agent-Evals/commit/93214746144e9d11f507433eff185aefac4a858a))
|
|
14
|
+
* **WI-6855059:** implement per-prompt evaluator configuration ([#168](https://github.com/microsoft/M365-Copilot-Agent-Evals/issues/168)) ([eface7e](https://github.com/microsoft/M365-Copilot-Agent-Evals/commit/eface7e7041b118681cd4c68582fe903640bf6c0))
|
|
15
|
+
|
|
8
16
|
## [1.0.0] - 2026-02-19
|
|
9
17
|
|
|
10
18
|
### Added
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
3
3
|
"$id": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
|
|
4
4
|
"title": "M365 Copilot Eval Document",
|
|
5
|
-
"description": "Schema for evaluation documents used by M365 Copilot Agent Evals CLI. Version 1.0.0.",
|
|
5
|
+
"description": "Schema for evaluation documents used by M365 Copilot Agent Evals CLI. Version 1.1.0.",
|
|
6
6
|
"type": "object",
|
|
7
7
|
"required": ["schemaVersion", "items"],
|
|
8
8
|
"additionalProperties": true,
|
|
@@ -21,6 +21,10 @@
|
|
|
21
21
|
"metadata": {
|
|
22
22
|
"$ref": "#/$defs/DocumentMetadata"
|
|
23
23
|
},
|
|
24
|
+
"default_evaluators": {
|
|
25
|
+
"$ref": "#/$defs/EvaluatorMap",
|
|
26
|
+
"description": "File-level default evaluators (overrides system defaults)"
|
|
27
|
+
},
|
|
24
28
|
"items": {
|
|
25
29
|
"type": "array",
|
|
26
30
|
"minItems": 1,
|
|
@@ -69,6 +73,14 @@
|
|
|
69
73
|
"type": "string",
|
|
70
74
|
"description": "M365 Agent ID this evaluation targets"
|
|
71
75
|
},
|
|
76
|
+
"agentName": {
|
|
77
|
+
"type": "string",
|
|
78
|
+
"description": "Name of the M365 agent this evaluation targets"
|
|
79
|
+
},
|
|
80
|
+
"cliVersion": {
|
|
81
|
+
"type": "string",
|
|
82
|
+
"description": "Version of the M365 Copilot Agent Evals CLI that produced this document"
|
|
83
|
+
},
|
|
72
84
|
"extensions": {
|
|
73
85
|
"type": "object",
|
|
74
86
|
"additionalProperties": true,
|
|
@@ -99,6 +111,16 @@
|
|
|
99
111
|
"type": "string",
|
|
100
112
|
"description": "Additional context for grounding evaluation"
|
|
101
113
|
},
|
|
114
|
+
"evaluators": {
|
|
115
|
+
"$ref": "#/$defs/EvaluatorMap",
|
|
116
|
+
"description": "Per-prompt evaluator overrides"
|
|
117
|
+
},
|
|
118
|
+
"evaluators_mode": {
|
|
119
|
+
"type": "string",
|
|
120
|
+
"enum": ["extend", "replace"],
|
|
121
|
+
"default": "extend",
|
|
122
|
+
"description": "How per-prompt evaluators combine with defaults"
|
|
123
|
+
},
|
|
102
124
|
"citations": {
|
|
103
125
|
"type": "array",
|
|
104
126
|
"items": {
|
|
@@ -140,6 +162,14 @@
|
|
|
140
162
|
"citations": {
|
|
141
163
|
"$ref": "#/$defs/CitationScore",
|
|
142
164
|
"description": "Citation evaluation results"
|
|
165
|
+
},
|
|
166
|
+
"exactMatch": {
|
|
167
|
+
"$ref": "#/$defs/ExactMatchScore",
|
|
168
|
+
"description": "Exact match evaluation result"
|
|
169
|
+
},
|
|
170
|
+
"partialMatch": {
|
|
171
|
+
"$ref": "#/$defs/PartialMatchScore",
|
|
172
|
+
"description": "Partial match evaluation result"
|
|
143
173
|
}
|
|
144
174
|
}
|
|
145
175
|
},
|
|
@@ -211,6 +241,92 @@
|
|
|
211
241
|
}
|
|
212
242
|
}
|
|
213
243
|
},
|
|
244
|
+
"ExactMatchScore": {
|
|
245
|
+
"type": "object",
|
|
246
|
+
"description": "Exact match evaluation result",
|
|
247
|
+
"required": ["match", "result"],
|
|
248
|
+
"additionalProperties": true,
|
|
249
|
+
"properties": {
|
|
250
|
+
"match": {
|
|
251
|
+
"type": "boolean",
|
|
252
|
+
"description": "Whether response exactly matches expected_response (trimmed; case-insensitive by default)"
|
|
253
|
+
},
|
|
254
|
+
"result": {
|
|
255
|
+
"type": "string",
|
|
256
|
+
"enum": ["pass", "fail"],
|
|
257
|
+
"description": "Pass when match is true, fail otherwise"
|
|
258
|
+
},
|
|
259
|
+
"reason": {
|
|
260
|
+
"type": "string",
|
|
261
|
+
"description": "Explanation of the match result"
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
},
|
|
265
|
+
"PartialMatchScore": {
|
|
266
|
+
"type": "object",
|
|
267
|
+
"description": "Partial match evaluation result",
|
|
268
|
+
"required": ["score", "result", "threshold"],
|
|
269
|
+
"additionalProperties": true,
|
|
270
|
+
"properties": {
|
|
271
|
+
"score": {
|
|
272
|
+
"type": "number",
|
|
273
|
+
"minimum": 0,
|
|
274
|
+
"maximum": 1,
|
|
275
|
+
"description": "Match score from 0.0 (no match) to 1.0 (full match)"
|
|
276
|
+
},
|
|
277
|
+
"result": {
|
|
278
|
+
"type": "string",
|
|
279
|
+
"enum": ["pass", "fail"],
|
|
280
|
+
"description": "Pass/fail based on score vs threshold"
|
|
281
|
+
},
|
|
282
|
+
"threshold": {
|
|
283
|
+
"type": "number",
|
|
284
|
+
"minimum": 0,
|
|
285
|
+
"maximum": 1,
|
|
286
|
+
"description": "Minimum score required for pass (default: 0.5)"
|
|
287
|
+
},
|
|
288
|
+
"reason": {
|
|
289
|
+
"type": "string",
|
|
290
|
+
"description": "Explanation of the match result"
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
},
|
|
294
|
+
"EvaluatorMap": {
|
|
295
|
+
"type": "object",
|
|
296
|
+
"description": "Map of evaluator names to their configuration options",
|
|
297
|
+
"propertyNames": {
|
|
298
|
+
"enum": ["Relevance", "Coherence", "Groundedness", "ToolCallAccuracy", "Citations", "ExactMatch", "PartialMatch"]
|
|
299
|
+
},
|
|
300
|
+
"additionalProperties": {
|
|
301
|
+
"$ref": "#/$defs/EvaluatorOptions"
|
|
302
|
+
}
|
|
303
|
+
},
|
|
304
|
+
"EvaluatorOptions": {
|
|
305
|
+
"type": "object",
|
|
306
|
+
"description": "Evaluator configuration options. Use empty object {} for defaults.",
|
|
307
|
+
"additionalProperties": false,
|
|
308
|
+
"properties": {
|
|
309
|
+
"threshold": {
|
|
310
|
+
"type": "number",
|
|
311
|
+
"description": "Pass/fail threshold. Range depends on evaluator type: 1-5 for LLM evaluators (default: 3), >= 1 integer for Citations (min citation count, default: 1), 0.0-1.0 for PartialMatch (min match ratio, default: 0.5). Validated per-evaluator at runtime."
|
|
312
|
+
},
|
|
313
|
+
"citation_format": {
|
|
314
|
+
"type": "string",
|
|
315
|
+
"examples": ["oai_unicode", "bracket", "mixed"],
|
|
316
|
+
"description": "Citation format for detection. 'oai_unicode': new OAI unicode format, 'bracket': legacy [^i^] bracket format, 'mixed': auto-detect both formats. Default: oai_unicode."
|
|
317
|
+
},
|
|
318
|
+
"case_sensitive": {
|
|
319
|
+
"type": "boolean",
|
|
320
|
+
"default": false,
|
|
321
|
+
"description": "Case-sensitive matching for ExactMatch/PartialMatch"
|
|
322
|
+
},
|
|
323
|
+
"options": {
|
|
324
|
+
"type": "object",
|
|
325
|
+
"additionalProperties": true,
|
|
326
|
+
"description": "Evaluator-specific configuration"
|
|
327
|
+
}
|
|
328
|
+
}
|
|
329
|
+
},
|
|
214
330
|
"Citation": {
|
|
215
331
|
"type": "object",
|
|
216
332
|
"description": "A single citation reference",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"$schema": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
|
|
3
|
-
"schemaVersion": "1.0.0",
|
|
3
|
+
"schemaVersion": "1.1.0",
|
|
4
4
|
"metadata": {
|
|
5
5
|
"name": "Graph API Evaluation Set",
|
|
6
6
|
"description": "Test prompts for Microsoft Graph API knowledge",
|
|
@@ -9,11 +9,17 @@
|
|
|
9
9
|
"evaluatedAt": "2026-01-20T10:30:00Z",
|
|
10
10
|
"tags": ["graph", "api", "authentication"],
|
|
11
11
|
"agentId": "12345678-1234-1234-1234-123456789abc",
|
|
12
|
+
"agentName": "Graph Knowledge Agent",
|
|
13
|
+
"cliVersion": "1.2.0",
|
|
12
14
|
"extensions": {
|
|
13
15
|
"com.contoso.department": "engineering",
|
|
14
16
|
"com.contoso.priority": "high"
|
|
15
17
|
}
|
|
16
18
|
},
|
|
19
|
+
"default_evaluators": {
|
|
20
|
+
"Relevance": {},
|
|
21
|
+
"Coherence": {}
|
|
22
|
+
},
|
|
17
23
|
"items": [
|
|
18
24
|
{
|
|
19
25
|
"prompt": "What is Microsoft Graph API?",
|
|
@@ -86,7 +92,26 @@
|
|
|
86
92
|
},
|
|
87
93
|
{
|
|
88
94
|
"prompt": "How do I authenticate with Microsoft Graph?",
|
|
89
|
-
"expected_response": "You can authenticate using OAuth 2.0 or client credentials flow."
|
|
95
|
+
"expected_response": "You can authenticate using OAuth 2.0 or client credentials flow.",
|
|
96
|
+
"evaluators": {
|
|
97
|
+
"ExactMatch": { "case_sensitive": false },
|
|
98
|
+
"PartialMatch": { "threshold": 0.5 }
|
|
99
|
+
},
|
|
100
|
+
"evaluators_mode": "replace",
|
|
101
|
+
"response": "You can authenticate using OAuth 2.0 or client credentials flow.",
|
|
102
|
+
"scores": {
|
|
103
|
+
"exactMatch": {
|
|
104
|
+
"match": true,
|
|
105
|
+
"result": "pass",
|
|
106
|
+
"reason": "Exact match found"
|
|
107
|
+
},
|
|
108
|
+
"partialMatch": {
|
|
109
|
+
"score": 1.0,
|
|
110
|
+
"result": "pass",
|
|
111
|
+
"threshold": 0.5,
|
|
112
|
+
"reason": "Match score: 1.000"
|
|
113
|
+
}
|
|
114
|
+
}
|
|
90
115
|
}
|
|
91
116
|
]
|
|
92
117
|
}
|
package/schema/version.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "1.0.0",
|
|
3
|
-
"releaseDate": "2026-
|
|
2
|
+
"version": "1.1.0",
|
|
3
|
+
"releaseDate": "2026-03-17",
|
|
4
4
|
"schemaId": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
|
|
5
5
|
"description": "M365 Copilot Eval Document Schema"
|
|
6
6
|
}
|
|
File without changes
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import sys
|
|
3
|
+
from collections import OrderedDict
|
|
4
|
+
from typing import Any, Dict
|
|
5
|
+
|
|
6
|
+
from cli_logging.logging_utils import STRUCTURED_LOG_FIELDS
|
|
7
|
+
|
|
8
|
+
# ANSI escape prefix for each lowercase log level. An empty string means the
# record is printed without any colour prefix.
_ANSI_COLORS = dict(
    debug="\033[2m",  # dim
    info="",  # default
    warning="\033[33m",  # yellow
    error="\033[31m",  # red
)

# Escape sequence appended after a coloured record to reset the terminal.
_ANSI_RESET = "\033[0m"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def format_diagnostic_record(record: Dict[str, Any]) -> OrderedDict:
    """Project *record* onto the canonical structured-log field order.

    Every name in ``STRUCTURED_LOG_FIELDS`` appears in the result, in that
    order. A field missing from *record* is filled with ``None``, except
    ``"is-redacted"``, which falls back to ``False``. Keys of *record* that
    are not canonical fields are not copied.
    """
    return OrderedDict(
        (name, record.get(name, False if name == "is-redacted" else None))
        for name in STRUCTURED_LOG_FIELDS
    )
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def serialize_diagnostic_record(record: Dict[str, Any]) -> str:
    """Return *record* as a JSON object string in canonical field order.

    Non-ASCII characters are written as-is rather than ``\\u`` escaped.
    """
    canonical = format_diagnostic_record(record)
    return json.dumps(canonical, ensure_ascii=False)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def format_console_record(record: Dict[str, Any], max_message_length: int = 250) -> str:
    """Format a diagnostic record for human-readable TTY output with ANSI colors.

    The rendered line has the shape
    ``[HH:MM:SS] LEVEL message (request-id=... | conversation-id=... | ...)``.

    Args:
        record: Structured log entry. Reads ``"timestamp"``, ``"level"``,
            ``"message"`` and the ``"request-id"`` / ``"conversation-id"`` /
            ``"message-id"`` correlation ids. Any of them may be absent or
            ``None``.
        max_message_length: Messages longer than this many characters are cut
            to that length and suffixed with ``"…"``.

    Returns:
        The formatted line, wrapped in an ANSI colour prefix and reset
        sequence when the level has a non-empty colour.
    """
    # Use ``or`` instead of a ``.get`` default: a key that is present with a
    # None value must behave like a missing key, otherwise ``len(None)`` below
    # raises TypeError.
    ts = record.get("timestamp") or ""
    # Extract HH:MM:SS from ISO timestamp
    time_part = ts[11:19] if len(ts) >= 19 else ts

    level_name = record.get("level") or "info"
    level = level_name.upper()

    message = record.get("message") or ""
    if len(message) > max_message_length:
        message = message[:max_message_length] + "…"

    # Only correlation ids that carry a truthy value are shown.
    ids = [
        f"{key}={record[key]}"
        for key in ("request-id", "conversation-id", "message-id")
        if record.get(key)
    ]
    id_suffix = f" ({' | '.join(ids)})" if ids else ""

    color = _ANSI_COLORS.get(level_name.lower(), "")
    # No reset sequence is needed when no colour prefix was written.
    reset = _ANSI_RESET if color else ""
    return f"{color}[{time_part}] {level} {message}{id_suffix}{reset}"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def render_diagnostic(record: Dict[str, Any]) -> str:
    """Render *record* for the current stdout.

    Uses the coloured console format when ``sys.stdout`` is a terminal and
    the JSON serialization otherwise.
    """
    renderer = (
        format_console_record if sys.stdout.isatty() else serialize_diagnostic_record
    )
    return renderer(record)
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class LogLevel(str, Enum):
    """Canonical CLI verbosity levels.

    The ``str`` mix-in makes every member compare equal to its plain string
    value, so ``LogLevel.DEBUG == "debug"`` is true.
    """

    DEBUG = "debug"
    INFO = "info"
    WARNING = "warning"
    ERROR = "error"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Operation(str, Enum):
    """Names of the CLI steps recorded in structured log entries.

    Members are also ``str`` instances, so they compare equal to their
    hyphenated string values.
    """

    SETUP = "setup"
    AUTHENTICATE = "authenticate"
    VALIDATE_ENV = "validate-env"
    LOAD_PROMPTS = "load-prompts"
    FETCH_AGENTS = "fetch-agents"
    SEND_PROMPT = "send-prompt"
    EVALUATE = "evaluate"
    WRITE_OUTPUT = "write-output"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Accepted --log-level strings, in LogLevel declaration order.
ALLOWED_LOG_LEVELS = tuple(member.value for member in LogLevel)

# Translation from LogLevel members to the stdlib ``logging`` constants.
LOG_LEVEL_MAP = {
    LogLevel.DEBUG: logging.DEBUG,
    LogLevel.INFO: logging.INFO,
    LogLevel.WARNING: logging.WARNING,
    LogLevel.ERROR: logging.ERROR,
}

# Canonical field names of a structured log record, in order.
STRUCTURED_LOG_FIELDS = (
    "timestamp",
    "level",
    "operation",
    "request-id",
    "conversation-id",
    "message-id",
    "logger",
    "message",
    "is-redacted",
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def normalize_log_level(value: Optional[str]) -> Optional[str]:
    """Return *value* stripped and lower-cased; ``None`` passes through unchanged."""
    return None if value is None else value.strip().lower()
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def resolve_log_level(
    log_level_values: Optional[List[str]],
) -> Tuple[Optional[str], Optional[str]]:
    """Resolve the effective log level from the collected ``--log-level`` values.

    Returns:
        A ``(level, error)`` pair. With no values the result is
        ``("info", None)``. Otherwise the last value is normalized; if it is
        an allowed level the result is ``(level, None)``, else
        ``(None, message)`` where *message* describes the supported values.
    """
    if not log_level_values:
        return "info", None

    # Use the last value provided (aligns with Node.js wrapper behavior).
    candidate = normalize_log_level(log_level_values[-1])
    if candidate in ALLOWED_LOG_LEVELS:
        return candidate, None

    return (
        None,
        "Invalid value for --log-level. Supported values are: "
        "debug, info, warning, error.",
    )
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def utc_iso_timestamp() -> str:
    """Return the current time as a timezone-aware UTC ISO 8601 string."""
    now = datetime.now(tz=timezone.utc)
    return now.isoformat()
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def build_run_context(
    operation: str = "evaluate",
    request_id: Optional[str] = None,
    conversation_id: Optional[str] = None,
    message_id: Optional[str] = None,
) -> Dict[str, Optional[str]]:
    """Bundle the correlation ids and operation name into a run-context dict.

    The keys use the hyphenated spelling of the structured log fields:
    ``"request-id"``, ``"conversation-id"``, ``"message-id"``, ``"operation"``.
    """
    context: Dict[str, Optional[str]] = {}
    context["request-id"] = request_id
    context["conversation-id"] = conversation_id
    context["message-id"] = message_id
    context["operation"] = operation
    return context
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# Labelled secrets: group 1 is the label plus separator (kept), group 2 is
# the secret value (replaced).
_SECRET_PATTERNS = [
    re.compile(r"(?i)(api[_-]?key\s*[:=]\s*)([^\s,;]+)"),
    re.compile(r"(?i)(token\s*[:=]\s*)([^\s,;]+)"),
    re.compile(r"(?i)(authorization\s*[:=]\s*bearer\s+)([^\s,;]+)"),
    re.compile(r"(?i)(password\s*[:=]\s*)([^\s,;]+)"),
]

# Unlabelled run of 32+ characters from [A-Za-z0-9_-] that contains an
# uppercase letter, a lowercase letter and a digit (likely a credential).
_LONG_TOKEN_PATTERN = re.compile(
    r"(?=[A-Za-z0-9_\-]*[A-Z])(?=[A-Za-z0-9_\-]*[a-z])"
    r"(?=[A-Za-z0-9_\-]*[0-9])[A-Za-z0-9_\-]{32,}"
)


def redact_sensitive_content(message: Optional[str]) -> Tuple[str, bool]:
    """Mask likely secrets in *message*.

    Labelled secrets (``api_key=...``, ``token: ...``,
    ``authorization: bearer ...``, ``password=...``) have their value
    replaced by ``***REDACTED***``. If a long mixed-case alphanumeric token
    still remains afterwards, the whole message is replaced by
    ``[REDACTED]``.

    Args:
        message: Text to sanitize, or ``None``.

    Returns:
        ``(text, was_redacted)``. ``None`` input yields ``("", False)``.
    """
    if message is None:
        return "", False

    redacted = message
    changed = False
    for pattern in _SECRET_PATTERNS:
        updated = pattern.sub(r"\1***REDACTED***", redacted)
        if updated != redacted:
            changed = True
            redacted = updated

    # Fallback for tokens not caught above. It runs on the already-masked
    # text, so secrets replaced above cannot trigger it. It is deliberately
    # NOT skipped when the text already contains "***REDACTED***": that
    # marker may come from an unrelated labelled secret, or from the input
    # itself, and skipping would let a separate bare token through.
    if _LONG_TOKEN_PATTERN.search(redacted):
        return "[REDACTED]", True

    return redacted, changed
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def format_structured_log_entry(
    level: str,
    message: str,
    logger_name: str,
    run_context: Dict[str, Optional[str]],
) -> Dict[str, Any]:
    """Assemble one structured log record.

    The message is passed through ``redact_sensitive_content`` and the level
    is normalized (falling back to ``"info"`` when empty). Correlation ids
    and the operation name are copied from *run_context*, with ``None`` for
    any key it lacks.
    """
    safe_message, is_redacted = redact_sensitive_content(message)

    entry: Dict[str, Any] = {
        "level": normalize_log_level(level) or "info",
        "message": safe_message,
        "logger": logger_name,
        "timestamp": utc_iso_timestamp(),
    }
    for key in ("request-id", "conversation-id", "message-id", "operation"):
        entry[key] = run_context.get(key)
    entry["is-redacted"] = is_redacted
    return entry
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Shared types and constants for the CLI."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import List, Optional
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def pascal_case_to_title(eval_name: str) -> str:
    """Split a PascalCase evaluator name into space-separated words.

    A single space is inserted wherever an ASCII lowercase letter is
    directly followed by an ASCII uppercase letter,
    e.g. "ToolCallAccuracy" → "Tool Call Accuracy".
    """
    word_boundary = r'(?<=[a-z])(?=[A-Z])'
    return re.sub(word_boundary, ' ', eval_name)
|
|
14
|
+
|
|
15
|
+
# Canonical (PascalCase) evaluator names.
RELEVANCE = "Relevance"
COHERENCE = "Coherence"
GROUNDEDNESS = "Groundedness"
TOOL_CALL_ACCURACY = "ToolCallAccuracy"
CITATIONS = "Citations"
EXACT_MATCH = "ExactMatch"
PARTIAL_MATCH = "PartialMatch"

# Identifiers for prerequisites an evaluator may declare.
REQUIRES_AZURE_OPENAI = "azure_openai"
REQUIRES_TOOL_DEFINITIONS = "tool_definitions"

# System defaults when no file-level or env-level defaults are configured.
SYSTEM_DEFAULT_EVALUATORS = [RELEVANCE, COHERENCE]

# Evaluator name -> key used in evaluator output dicts.
METRIC_IDS = {
    RELEVANCE: "relevance",
    COHERENCE: "coherence",
    GROUNDEDNESS: "groundedness",
    TOOL_CALL_ACCURACY: "tool_call_accuracy",
    CITATIONS: "citations",
    EXACT_MATCH: "exact_match",
    PARTIAL_MATCH: "partial_match",
}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class RegistryEntry:
    """Record describing one evaluator: its kind, prerequisites and default threshold."""

    # Evaluator kind: "llm", "tool", or "non-llm".
    type: str
    # Prerequisite identifiers; presumably the REQUIRES_* constants (confirm against the registry).
    requires: List[str]
    # Default threshold, or None when the entry has none.
    default_threshold: Optional[float]
|
|
@@ -17,8 +17,8 @@ from typing import Dict, Any, Optional
|
|
|
17
17
|
class CitationFormat(Enum):
|
|
18
18
|
"""Enum for different citation formats supported by the evaluator."""
|
|
19
19
|
OAI_UNICODE = "oai_unicode" # New format: \ue200cite\ue202turn{X}search{Y}\ue201
|
|
20
|
-
LEGACY_BRACKET = "
|
|
21
|
-
AUTO = "
|
|
20
|
+
LEGACY_BRACKET = "bracket" # Old format: [^i^]
|
|
21
|
+
AUTO = "mixed" # Automatically detect both formats
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
class CitationsEvaluator:
|
|
@@ -141,7 +141,7 @@ class CitationsEvaluator:
|
|
|
141
141
|
|
|
142
142
|
results = {
|
|
143
143
|
"citation_format": self.citation_format.value,
|
|
144
|
-
"
|
|
144
|
+
"citations": total_citations,
|
|
145
145
|
"result": "pass" if total_citations > 0 else "fail",
|
|
146
146
|
"threshold": 1,
|
|
147
147
|
"reason": " ".join(reason_parts)
|
|
@@ -1,8 +1,6 @@
|
|
|
1
|
-
from azure.ai.evaluation import evaluate
|
|
2
|
-
|
|
3
1
|
class ExactMatchEvaluator:
|
|
4
|
-
def __init__(self):
|
|
5
|
-
|
|
2
|
+
def __init__(self, case_sensitive=False):
|
|
3
|
+
self.case_sensitive = case_sensitive
|
|
6
4
|
|
|
7
5
|
def __call__(self, *, response: str, expected_answer: str, **kwargs):
|
|
8
6
|
if response is None or response.strip() == "":
|
|
@@ -11,15 +9,17 @@ class ExactMatchEvaluator:
|
|
|
11
9
|
if expected_answer is None:
|
|
12
10
|
raise ValueError("Expected answer cannot be None.")
|
|
13
11
|
|
|
14
|
-
|
|
15
|
-
|
|
12
|
+
resp = response.strip()
|
|
13
|
+
exp = expected_answer.strip()
|
|
14
|
+
|
|
15
|
+
if not self.case_sensitive:
|
|
16
|
+
resp = resp.lower()
|
|
17
|
+
exp = exp.lower()
|
|
18
|
+
|
|
19
|
+
is_match = resp == exp
|
|
16
20
|
|
|
17
21
|
return {
|
|
18
22
|
"exact_match": 1.0 if is_match else 0.0,
|
|
19
|
-
"
|
|
20
|
-
"exact_match_threshold": 1.0,
|
|
23
|
+
"result": "pass" if is_match else "fail",
|
|
21
24
|
"exact_match_reason": "Exact match found" if is_match else "No exact match found"
|
|
22
25
|
}
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
exact_match_evaluator = ExactMatchEvaluator()
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
from azure.ai.evaluation import evaluate
|
|
2
|
-
|
|
3
1
|
class PartialMatchEvaluator:
|
|
4
2
|
def __init__(self, case_sensitive=False):
|
|
5
3
|
self.case_sensitive = case_sensitive
|
|
@@ -25,15 +23,7 @@ class PartialMatchEvaluator:
|
|
|
25
23
|
else:
|
|
26
24
|
score = 0.0
|
|
27
25
|
|
|
28
|
-
threshold = 0.5 # 50% match threshold
|
|
29
|
-
is_pass = score >= threshold
|
|
30
|
-
|
|
31
26
|
return {
|
|
32
27
|
"partial_match": score,
|
|
33
|
-
"
|
|
34
|
-
"partial_match_threshold": threshold,
|
|
35
|
-
"partial_match_reason": f"Match score: {score:.3f} ({'above' if is_pass else 'below'} threshold {threshold})"
|
|
28
|
+
"partial_match_reason": f"Match score: {score:.3f}"
|
|
36
29
|
}
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
partial_match_evaluator = PartialMatchEvaluator(case_sensitive=False)
|