@microsoft/m365-copilot-eval 1.2.1-preview.1 → 1.3.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -4
- package/package.json +2 -2
- package/schema/CHANGELOG.md +8 -0
- package/schema/v1/eval-document.schema.json +117 -1
- package/schema/v1/examples/valid/comprehensive.json +27 -2
- package/schema/version.json +2 -2
- package/src/clients/cli/cli_logging/__init__.py +0 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +55 -0
- package/src/clients/cli/cli_logging/logging_utils.py +145 -0
- package/src/clients/cli/common.py +51 -0
- package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +3 -3
- package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +11 -11
- package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +1 -11
- package/src/clients/cli/evaluator_resolver.py +150 -0
- package/src/clients/cli/generate_report.py +130 -110
- package/src/clients/cli/main.py +513 -236
- package/src/clients/cli/readme.md +14 -7
- package/src/clients/cli/response_extractor.py +32 -14
- package/src/clients/node-js/bin/runevals.js +58 -28
- package/src/clients/node-js/config/default.js +1 -1
|
@@ -97,21 +97,28 @@ python main.py --interactive
|
|
|
97
97
|
|
|
98
98
|
#### Additional Options
|
|
99
99
|
```bash
|
|
100
|
-
#
|
|
101
|
-
python main.py --
|
|
100
|
+
# Logging verbosity (canonical control surface)
|
|
101
|
+
python main.py --log-level debug
|
|
102
|
+
python main.py --log-level info
|
|
103
|
+
python main.py --log-level warning
|
|
104
|
+
python main.py --log-level error
|
|
105
|
+
|
|
106
|
+
# Bare flag resolves to info
|
|
107
|
+
python main.py --log-level
|
|
102
108
|
|
|
103
|
-
#
|
|
109
|
+
# Legacy flags (no longer supported; use --log-level instead)
|
|
110
|
+
# The following will fail with "unrecognized arguments" errors:
|
|
111
|
+
python main.py --verbose
|
|
104
112
|
python main.py --quiet
|
|
105
113
|
|
|
114
|
+
# Share diagnostics with support (console-based, no archive artifacts)
|
|
115
|
+
python main.py --log-level debug --prompts-file samples/example_prompts.json
|
|
116
|
+
|
|
106
117
|
# Get help and see all options
|
|
107
118
|
python main.py --help
|
|
108
119
|
|
|
109
120
|
# Specify / override the Agent ID (takes precedence over M365_AGENT_ID env var)
|
|
110
121
|
python main.py --m365-agent-id "00000000-0000-0000-0000-000000000000"
|
|
111
|
-
|
|
112
|
-
# Citation format options
|
|
113
|
-
python main.py --citation-format oai_unicode # Default: New OAI format
|
|
114
|
-
python main.py --citation-format legacy_bracket # Old [^i^] format
|
|
115
122
|
```
|
|
116
123
|
|
|
117
124
|
#### File Format Examples
|
|
@@ -27,14 +27,20 @@ Date: September 21, 2025
|
|
|
27
27
|
|
|
28
28
|
import json
|
|
29
29
|
import logging
|
|
30
|
-
from typing import Dict, List, Any, Optional
|
|
30
|
+
from typing import Dict, List, Any, Optional, Tuple
|
|
31
31
|
from datetime import datetime
|
|
32
32
|
from enum import Enum
|
|
33
|
+
from cli_logging.logging_utils import LOG_LEVEL_MAP, LogLevel
|
|
33
34
|
|
|
34
35
|
# Configure logging
|
|
35
|
-
logging.
|
|
36
|
+
if not logging.getLogger().handlers:
|
|
37
|
+
logging.basicConfig(level=logging.INFO)
|
|
36
38
|
logger = logging.getLogger(__name__)
|
|
37
39
|
|
|
40
|
+
def _log_level_to_python_level(log_level: str) -> int:
|
|
41
|
+
normalized = (log_level or "info").strip().lower()
|
|
42
|
+
return LOG_LEVEL_MAP.get(normalized, logging.INFO)
|
|
43
|
+
|
|
38
44
|
class MessageRole(Enum):
|
|
39
45
|
"""Enumeration for message roles."""
|
|
40
46
|
USER = "user"
|
|
@@ -71,8 +77,10 @@ class EnhancedResponseExtractor:
|
|
|
71
77
|
"generate_response"
|
|
72
78
|
}
|
|
73
79
|
|
|
74
|
-
def __init__(self):
|
|
80
|
+
def __init__(self, log_level: str = "info"):
|
|
75
81
|
self.tool_call_counter = 0
|
|
82
|
+
self.log_level = (log_level or "info").strip().lower()
|
|
83
|
+
logger.setLevel(_log_level_to_python_level(self.log_level))
|
|
76
84
|
|
|
77
85
|
def _generate_tool_call_id(self, tool_name: str) -> str:
|
|
78
86
|
"""Generate a unique tool call ID."""
|
|
@@ -461,6 +469,7 @@ class EnhancedResponseExtractor:
|
|
|
461
469
|
"metadata": {
|
|
462
470
|
"conversation_id": response_data.get("conversationId"),
|
|
463
471
|
"request_id": response_data.get("requestId"),
|
|
472
|
+
"message_id": None,
|
|
464
473
|
"telemetry_available": False
|
|
465
474
|
}
|
|
466
475
|
}
|
|
@@ -470,6 +479,11 @@ class EnhancedResponseExtractor:
|
|
|
470
479
|
if isinstance(response_data, dict):
|
|
471
480
|
# Messages are directly in the response_data object
|
|
472
481
|
messages = response_data.get("messages", [])
|
|
482
|
+
|
|
483
|
+
# Extract message_id from the last bot message in this response
|
|
484
|
+
bot_messages = [m for m in messages if m.get("author") != "user"]
|
|
485
|
+
if bot_messages and bot_messages[-1].get("messageId"):
|
|
486
|
+
enhanced_response["metadata"]["message_id"] = bot_messages[-1]["messageId"]
|
|
473
487
|
|
|
474
488
|
# Extract telemetry tools if available
|
|
475
489
|
telemetry_tools = []
|
|
@@ -526,6 +540,7 @@ class EnhancedResponseExtractor:
|
|
|
526
540
|
"metadata": {
|
|
527
541
|
"conversation_id": None,
|
|
528
542
|
"request_id": None,
|
|
543
|
+
"message_id": None,
|
|
529
544
|
"telemetry_available": False
|
|
530
545
|
}
|
|
531
546
|
}
|
|
@@ -552,27 +567,30 @@ class EnhancedResponseExtractor:
|
|
|
552
567
|
"metadata": {
|
|
553
568
|
"conversation_id": None,
|
|
554
569
|
"request_id": None,
|
|
570
|
+
"message_id": None,
|
|
555
571
|
"telemetry_available": False,
|
|
556
572
|
"error": str(e)
|
|
557
573
|
}
|
|
558
574
|
}
|
|
559
575
|
|
|
560
|
-
def extract_enhanced_responses(responses:
|
|
576
|
+
def extract_enhanced_responses(responses: List[Tuple[str, str]], log_level: str = "info") -> List[Dict[str, Any]]:
|
|
561
577
|
"""
|
|
562
578
|
Extract enhanced response information for multiple responses.
|
|
563
|
-
|
|
579
|
+
|
|
564
580
|
Args:
|
|
565
|
-
responses:
|
|
566
|
-
|
|
581
|
+
responses: List of (prompt_text, raw_response_string) tuples, one per prompt
|
|
582
|
+
sent to the chat API. Order and duplicates are preserved.
|
|
583
|
+
|
|
567
584
|
Returns:
|
|
568
|
-
|
|
585
|
+
List of enhanced response dicts (one per prompt, same order as input).
|
|
569
586
|
"""
|
|
570
|
-
extractor = EnhancedResponseExtractor()
|
|
571
|
-
enhanced_responses =
|
|
572
|
-
|
|
573
|
-
for prompt, raw_response in responses
|
|
574
|
-
|
|
575
|
-
|
|
587
|
+
extractor = EnhancedResponseExtractor(log_level=log_level)
|
|
588
|
+
enhanced_responses = []
|
|
589
|
+
|
|
590
|
+
for prompt, raw_response in responses:
|
|
591
|
+
enhanced = extractor.extract_enhanced_response(raw_response)
|
|
592
|
+
enhanced_responses.append(enhanced)
|
|
593
|
+
|
|
576
594
|
return enhanced_responses
|
|
577
595
|
|
|
578
596
|
def get_response_text_for_evaluation(enhanced_response: Dict[str, Any]) -> str:
|
|
@@ -29,8 +29,8 @@ const REQUIREMENTS_FILE = path.join(PYTHON_CLI_DIR, 'requirements.txt');
|
|
|
29
29
|
|
|
30
30
|
/**
|
|
31
31
|
* Display usage terms notice
|
|
32
|
-
* Called before running evaluations (but not for --init-only, cache commands, or --signout)
|
|
33
|
-
* This notice
|
|
32
|
+
* Called before running evaluations (but not for --init-only, cache commands, or --signout).
|
|
33
|
+
* This notice is always displayed per legal requirements (FR-006).
|
|
34
34
|
*/
|
|
35
35
|
function displayUsageTerms() {
|
|
36
36
|
console.log('By using this tool, you agree to the Terms of Use: https://aka.ms/evaltoolterms\n');
|
|
@@ -52,10 +52,10 @@ async function setDefaultEnvironmentConstants() {
|
|
|
52
52
|
/**
|
|
53
53
|
* Check for required environment variables and provide helpful guidance.
|
|
54
54
|
* @param {string} envName - Environment name (e.g. 'dev')
|
|
55
|
-
* @param {boolean} [
|
|
55
|
+
* @param {boolean} [suppressOutput=false] - Suppress guidance output
|
|
56
56
|
* @returns {boolean} true if all required vars are present
|
|
57
57
|
*/
|
|
58
|
-
function validateEnvironmentVariables(envName,
|
|
58
|
+
function validateEnvironmentVariables(envName, suppressOutput = false) {
|
|
59
59
|
const required = [
|
|
60
60
|
{ key: 'TENANT_ID', description: 'Your Tenant ID' },
|
|
61
61
|
{ key: 'AZURE_AI_OPENAI_ENDPOINT', description: 'Azure OpenAI endpoint URL' },
|
|
@@ -72,8 +72,8 @@ function validateEnvironmentVariables(envName, quiet = false) {
|
|
|
72
72
|
return true; // All required vars present
|
|
73
73
|
}
|
|
74
74
|
|
|
75
|
-
// Show error with helpful guidance
|
|
76
|
-
if (!
|
|
75
|
+
// Show error with helpful guidance unless output is suppressed.
|
|
76
|
+
if (!suppressOutput) {
|
|
77
77
|
console.error('\nā Missing required environment variables:\n');
|
|
78
78
|
|
|
79
79
|
const envFile = envName ? `env/.env.${envName}` : '.env.local or env/.env.local';
|
|
@@ -96,6 +96,34 @@ function validateEnvironmentVariables(envName, quiet = false) {
|
|
|
96
96
|
return false;
|
|
97
97
|
}
|
|
98
98
|
|
|
99
|
+
/**
|
|
100
|
+
* Resolve canonical log level for Python CLI forwarding.
|
|
101
|
+
*/
|
|
102
|
+
function resolveLogLevel(options) {
|
|
103
|
+
const allowedLevels = new Set(['debug', 'info', 'warning', 'error']);
|
|
104
|
+
const rawLogLevel =
|
|
105
|
+
options.logLevel === true || options.logLevel === undefined
|
|
106
|
+
? undefined
|
|
107
|
+
: String(options.logLevel).toLowerCase();
|
|
108
|
+
const effectiveLogLevel = rawLogLevel || 'info';
|
|
109
|
+
|
|
110
|
+
if (!allowedLevels.has(effectiveLogLevel)) {
|
|
111
|
+
console.error(
|
|
112
|
+
`ā Invalid --log-level value: ${effectiveLogLevel}. Supported values: debug, info, warning, error.`
|
|
113
|
+
);
|
|
114
|
+
process.exit(2);
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
return effectiveLogLevel;
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
function deriveWrapperOutputMode(effectiveLogLevel) {
|
|
121
|
+
return {
|
|
122
|
+
verbose: effectiveLogLevel === 'debug',
|
|
123
|
+
quiet: effectiveLogLevel === 'warning' || effectiveLogLevel === 'error',
|
|
124
|
+
};
|
|
125
|
+
}
|
|
126
|
+
|
|
99
127
|
/**
|
|
100
128
|
* Initialize the Python environment (download, venv, pip install)
|
|
101
129
|
* @param {boolean} [verbose=false] - Enable verbose output
|
|
@@ -126,7 +154,7 @@ async function initializePythonEnvironment(verbose = false, quiet = false) {
|
|
|
126
154
|
console.error(' - Check your internet connection');
|
|
127
155
|
console.error(' - If behind a proxy, set HTTP_PROXY/HTTPS_PROXY environment variables');
|
|
128
156
|
console.error(' - For SSL issues, set NODE_EXTRA_CA_CERTS or PIP_CERT');
|
|
129
|
-
console.error(' - Run with --
|
|
157
|
+
console.error(' - Run with --log-level debug for detailed output');
|
|
130
158
|
|
|
131
159
|
process.exit(1);
|
|
132
160
|
}
|
|
@@ -142,8 +170,7 @@ async function main() {
|
|
|
142
170
|
.name('runevals')
|
|
143
171
|
.description('M365 Copilot Agent Evaluations CLI - Zero-config Python evaluation tool')
|
|
144
172
|
.version(VERSION)
|
|
145
|
-
.option('-
|
|
146
|
-
.option('-q, --quiet', 'quiet mode (minimal output)')
|
|
173
|
+
.option('--log-level [level]', 'log level for diagnostics: debug, info, warning, error (bare flag resolves to info)')
|
|
147
174
|
.option('--prompts <prompts...>', 'prompts to evaluate')
|
|
148
175
|
.option('--expected <responses...>', 'expected responses')
|
|
149
176
|
.option('--prompts-file <file>', 'JSON file with prompts and expected responses')
|
|
@@ -159,6 +186,10 @@ async function main() {
|
|
|
159
186
|
|
|
160
187
|
program.parse(process.argv);
|
|
161
188
|
const options = program.opts();
|
|
189
|
+
const effectiveLogLevel = resolveLogLevel(options);
|
|
190
|
+
const outputMode = deriveWrapperOutputMode(effectiveLogLevel);
|
|
191
|
+
const wrapperVerbose = outputMode.verbose;
|
|
192
|
+
const wrapperQuiet = outputMode.quiet;
|
|
162
193
|
|
|
163
194
|
// Handle cache commands first (they don't need environment validation or config)
|
|
164
195
|
if (options.cacheInfo) {
|
|
@@ -183,7 +214,7 @@ async function main() {
|
|
|
183
214
|
|
|
184
215
|
if (options.cacheClear) {
|
|
185
216
|
console.log('🗑️ Clearing cache...\n');
|
|
186
|
-
const success = await clearCache(
|
|
217
|
+
const success = await clearCache(wrapperVerbose);
|
|
187
218
|
|
|
188
219
|
if (success) {
|
|
189
220
|
console.log('✅ Cache cleared successfully!');
|
|
@@ -203,14 +234,14 @@ async function main() {
|
|
|
203
234
|
|
|
204
235
|
// Initialize Python environment (do this early for --init-only)
|
|
205
236
|
// Skip env file loading for --init-only since it's not needed
|
|
206
|
-
if (!
|
|
237
|
+
if (!wrapperQuiet) {
|
|
207
238
|
console.log('š M365 Copilot Agent Evaluations CLI\n');
|
|
208
239
|
}
|
|
209
|
-
await initializePythonEnvironment(
|
|
240
|
+
await initializePythonEnvironment(wrapperVerbose, wrapperQuiet);
|
|
210
241
|
|
|
211
242
|
// If --init-only, stop here (no config or env files needed)
|
|
212
243
|
if (options.initOnly) {
|
|
213
|
-
if (!
|
|
244
|
+
if (!wrapperQuiet) {
|
|
214
245
|
console.log('\n✅ Python environment initialized successfully!\n');
|
|
215
246
|
console.log('⚠️ Note: Configure environment variables before running evaluations.');
|
|
216
247
|
console.log('š See README.md for complete setup guide.\n');
|
|
@@ -233,7 +264,7 @@ async function main() {
|
|
|
233
264
|
let localEnvFound = false;
|
|
234
265
|
|
|
235
266
|
if (fs.existsSync(localEnvPath)) {
|
|
236
|
-
if (!
|
|
267
|
+
if (!wrapperQuiet && wrapperVerbose) {
|
|
237
268
|
console.log(`š Loading .env.local from current directory`);
|
|
238
269
|
}
|
|
239
270
|
const localEnvVars = loadEnvFile(localEnvPath) || {};
|
|
@@ -245,7 +276,7 @@ async function main() {
|
|
|
245
276
|
if (!localEnvFound) {
|
|
246
277
|
localEnvPath = path.join(process.cwd(), 'env', '.env.local');
|
|
247
278
|
if (fs.existsSync(localEnvPath)) {
|
|
248
|
-
if (!
|
|
279
|
+
if (!wrapperQuiet && wrapperVerbose) {
|
|
249
280
|
console.log(`š Loading .env.local from current directory env folder`);
|
|
250
281
|
}
|
|
251
282
|
const localEnvVars = loadEnvFile(localEnvPath) || {};
|
|
@@ -263,7 +294,7 @@ async function main() {
|
|
|
263
294
|
let envFileFound = false;
|
|
264
295
|
|
|
265
296
|
if (fs.existsSync(envFilePath)) {
|
|
266
|
-
if (!
|
|
297
|
+
if (!wrapperQuiet) {
|
|
267
298
|
console.log(`š Loading environment: ${options.env} from current directory env folder`);
|
|
268
299
|
}
|
|
269
300
|
const fileEnvVars = loadEnvFile(envFilePath) || {};
|
|
@@ -274,7 +305,7 @@ async function main() {
|
|
|
274
305
|
envFilePath = path.join(__dirname, '..', 'env', `.env.${options.env}`);
|
|
275
306
|
|
|
276
307
|
if (fs.existsSync(envFilePath)) {
|
|
277
|
-
if (!
|
|
308
|
+
if (!wrapperQuiet) {
|
|
278
309
|
console.log(`š Loading environment: ${options.env} from package env folder`);
|
|
279
310
|
}
|
|
280
311
|
const fileEnvVars = loadEnvFile(envFilePath) || {};
|
|
@@ -295,14 +326,14 @@ async function main() {
|
|
|
295
326
|
// loadEnvFile already resolved aliases (e.g. M365_TITLE_ID) into M365_AGENT_ID
|
|
296
327
|
if (!resolvedAgentId) {
|
|
297
328
|
resolvedAgentId = envVars['M365_AGENT_ID'] || process.env.M365_AGENT_ID;
|
|
298
|
-
if (resolvedAgentId && !
|
|
329
|
+
if (resolvedAgentId && !wrapperQuiet) {
|
|
299
330
|
console.log(`🤖 Agent ID: ${resolvedAgentId}`);
|
|
300
331
|
}
|
|
301
332
|
}
|
|
302
333
|
|
|
303
334
|
// Validate required environment variables (always validate, quiet just suppresses output)
|
|
304
|
-
if (!validateEnvironmentVariables(options.env,
|
|
305
|
-
if (
|
|
335
|
+
if (!validateEnvironmentVariables(options.env, wrapperQuiet)) {
|
|
336
|
+
if (wrapperQuiet) {
|
|
306
337
|
console.error('š Setup guide: https://www.npmjs.com/package/@microsoft/m365-copilot-eval?activeTab=readme\n');
|
|
307
338
|
}
|
|
308
339
|
process.exit(1);
|
|
@@ -311,8 +342,7 @@ async function main() {
|
|
|
311
342
|
// Build arguments to pass to Python CLI
|
|
312
343
|
const pythonArgs = [];
|
|
313
344
|
|
|
314
|
-
|
|
315
|
-
if (options.quiet) pythonArgs.push('--quiet');
|
|
345
|
+
pythonArgs.push('--log-level', effectiveLogLevel);
|
|
316
346
|
if (options.interactive) pythonArgs.push('--interactive');
|
|
317
347
|
if (resolvedAgentId) pythonArgs.push('--m365-agent-id', resolvedAgentId);
|
|
318
348
|
|
|
@@ -327,7 +357,7 @@ async function main() {
|
|
|
327
357
|
console.log('✅ Successfully signed out and cleared cached authentication tokens!');
|
|
328
358
|
} catch (error) {
|
|
329
359
|
console.error('ā Failed to sign out:', error.message);
|
|
330
|
-
if (
|
|
360
|
+
if (wrapperVerbose) {
|
|
331
361
|
console.error('\nFull error:', error);
|
|
332
362
|
}
|
|
333
363
|
process.exit(1);
|
|
@@ -350,7 +380,7 @@ async function main() {
|
|
|
350
380
|
const candidatePath = path.join(location.dir, filename);
|
|
351
381
|
if (fs.existsSync(candidatePath)) {
|
|
352
382
|
promptsFile = candidatePath;
|
|
353
|
-
if (!
|
|
383
|
+
if (!wrapperQuiet) {
|
|
354
384
|
const displayPath = candidatePath.startsWith(process.cwd())
|
|
355
385
|
? './' + path.relative(process.cwd(), candidatePath)
|
|
356
386
|
: candidatePath;
|
|
@@ -363,7 +393,7 @@ async function main() {
|
|
|
363
393
|
}
|
|
364
394
|
|
|
365
395
|
// If no prompts file found, offer to create starter file
|
|
366
|
-
if (!promptsFile && !
|
|
396
|
+
if (!promptsFile && !wrapperQuiet) {
|
|
367
397
|
console.log('\n⚠️ No prompts file found in current directory or ./evals/\n');
|
|
368
398
|
|
|
369
399
|
// Use readline for interactive prompt
|
|
@@ -413,7 +443,7 @@ async function main() {
|
|
|
413
443
|
const evalsDir = path.join(process.cwd(), '.evals');
|
|
414
444
|
if (!fs.existsSync(evalsDir)) {
|
|
415
445
|
fs.mkdirSync(evalsDir, { recursive: true });
|
|
416
|
-
if (
|
|
446
|
+
if (wrapperVerbose) {
|
|
417
447
|
console.log(`Created .evals directory`);
|
|
418
448
|
}
|
|
419
449
|
}
|
|
@@ -434,7 +464,7 @@ async function main() {
|
|
|
434
464
|
}
|
|
435
465
|
|
|
436
466
|
// Execute Python CLI with working directory set to Python CLI directory
|
|
437
|
-
if (!
|
|
467
|
+
if (!wrapperQuiet) {
|
|
438
468
|
console.log('\nš Running evaluations...\n');
|
|
439
469
|
console.log('āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā\n');
|
|
440
470
|
}
|
|
@@ -442,7 +472,7 @@ async function main() {
|
|
|
442
472
|
try {
|
|
443
473
|
await executePythonCli(MAIN_SCRIPT, pythonArgs, { cwd: PYTHON_CLI_DIR });
|
|
444
474
|
|
|
445
|
-
if (!
|
|
475
|
+
if (!wrapperQuiet) {
|
|
446
476
|
console.log('\nāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā\n');
|
|
447
477
|
console.log('ā Evals completed successfully!');
|
|
448
478
|
if (outputFile) {
|