@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.4.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +129 -97
- package/package.json +7 -4
- package/schema/v1/eval-document.schema.json +140 -8
- package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
- package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
- package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
- package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
- package/schema/v1/examples/valid/multi-turn-output.json +59 -0
- package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
- package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
- package/schema/version.json +2 -2
- package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
- package/src/clients/cli/api_clients/A2A/a2a_client.py +456 -0
- package/src/clients/cli/api_clients/REST/__init__.py +3 -0
- package/src/clients/cli/api_clients/REST/sydney_client.py +204 -0
- package/src/clients/cli/api_clients/__init__.py +3 -0
- package/src/clients/cli/api_clients/base_agent_client.py +78 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +54 -2
- package/src/clients/cli/cli_logging/logging_utils.py +0 -1
- package/src/clients/cli/common.py +11 -0
- package/src/clients/cli/generate_report.py +272 -129
- package/src/clients/cli/main.py +1006 -476
- package/src/clients/cli/parallel_executor.py +57 -0
- package/src/clients/cli/requirements.txt +1 -1
- package/src/clients/cli/response_extractor.py +12 -14
- package/src/clients/cli/retry_policy.py +52 -0
- package/src/clients/cli/samples/multiturn_example.json +35 -0
- package/src/clients/cli/throttle_gate.py +82 -0
- package/src/clients/node-js/bin/runevals.js +79 -16
- package/src/clients/node-js/config/default.js +5 -1
- package/src/clients/node-js/lib/agent-id.js +12 -0
- package/src/clients/node-js/lib/env-loader.js +11 -16
- package/src/clients/node-js/lib/eula-manager.js +78 -0
- package/src/clients/node-js/lib/progress.js +13 -11
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Parallel prompt execution utilities.
|
|
2
|
+
|
|
3
|
+
This module provides a minimal reusable executor that preserves input order.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from typing import Any, Callable, Generic, Iterable, List, Optional, TypeVar
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
T = TypeVar("T")
|
|
14
|
+
R = TypeVar("R")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class WorkerResult(Generic[R]):
    """Outcome of one worker invocation, tagged with its input position.

    ``execute_in_parallel`` returns one instance per input item. A slot
    holds either the worker's return value or the exception it raised.
    Check ``error`` (not ``value``) to detect failure, because a worker may
    legitimately return ``None``.
    """

    # Zero-based position of the item in the original input sequence.
    index: int
    # Return value of the worker; stays None if the worker raised.
    value: Optional[R] = None
    # Exception raised by the worker; stays None on success.
    error: Optional[Exception] = None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def execute_in_parallel(
    items: Iterable[T],
    worker: Callable[[T, int], R],
    max_workers: int,
) -> List[WorkerResult[R]]:
    """Run ``worker`` over ``items`` on a thread pool, keeping input order.

    Args:
        items: Work items; the iterable is fully consumed up front.
        worker: Callable invoked as ``worker(item, index)`` where ``index``
            is the item's zero-based position in ``items``.
        max_workers: Requested pool size. It is clamped to the range
            ``1..len(items)``.

    Returns:
        One ``WorkerResult`` per item, positioned at the item's input index.
        An exception raised by a worker is stored in that slot's ``error``
        instead of propagating. An empty input yields an empty list.

    Raises:
        KeyboardInterrupt, SystemExit: Re-raised as-is rather than captured.
    """
    work = list(items)
    total = len(work)
    if total == 0:
        return []

    pool_size = max(1, min(max_workers, total))
    # Pre-fill every slot so the list is addressable by index as futures finish.
    ordered: List[WorkerResult[R]] = [WorkerResult(index=pos) for pos in range(total)]

    with ThreadPoolExecutor(max_workers=pool_size) as pool:
        positions = {}
        for pos, item in enumerate(work):
            positions[pool.submit(worker, item, pos)] = pos

        # Futures complete in arbitrary order; the map restores the position.
        for finished in as_completed(positions):
            pos = positions[finished]
            try:
                outcome = finished.result()
            except (KeyboardInterrupt, SystemExit):
                raise
            except Exception as exc:
                ordered[pos] = WorkerResult(index=pos, error=exc)
            else:
                ordered[pos] = WorkerResult(index=pos, value=outcome)

    return ordered
|
|
@@ -573,25 +573,23 @@ class EnhancedResponseExtractor:
|
|
|
573
573
|
}
|
|
574
574
|
}
|
|
575
575
|
|
|
576
|
-
def
|
|
576
|
+
def extract_enhanced_response(raw_response: str, log_level: str = "info") -> Dict[str, Any]:
|
|
577
577
|
"""
|
|
578
|
-
Extract enhanced response information
|
|
579
|
-
|
|
578
|
+
Extract enhanced response information from a raw response string.
|
|
579
|
+
|
|
580
580
|
Args:
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
581
|
+
raw_response: Raw response string from the agent
|
|
582
|
+
log_level: Logging level for the extraction process (default: "info")
|
|
583
|
+
|
|
584
584
|
Returns:
|
|
585
|
-
|
|
585
|
+
A dictionary containing the enhanced response information, including:
|
|
586
|
+
- "response": Reconstructed message flow with tool calls and results
|
|
587
|
+
- "tool_definitions": List of tool definitions extracted from telemetry
|
|
588
|
+
- "raw_response_text": Original response text for backward compatibility
|
|
589
|
+
- "metadata": Additional metadata such as conversation ID, request ID, etc.
|
|
586
590
|
"""
|
|
587
591
|
extractor = EnhancedResponseExtractor(log_level=log_level)
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
for prompt, raw_response in responses:
|
|
591
|
-
enhanced = extractor.extract_enhanced_response(raw_response)
|
|
592
|
-
enhanced_responses.append(enhanced)
|
|
593
|
-
|
|
594
|
-
return enhanced_responses
|
|
592
|
+
return extractor.extract_enhanced_response(raw_response)
|
|
595
593
|
|
|
596
594
|
def get_response_text_for_evaluation(enhanced_response: Dict[str, Any]) -> str:
|
|
597
595
|
"""
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Retry utilities for transient HTTP failures in evaluation flows."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
from email.utils import parsedate_to_datetime
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
# HTTP status codes that is_retryable_status() reports as retryable.
RETRYABLE_HTTP_STATUS_CODES = {429, 503, 504}
# Ceiling, in seconds, applied to the delay computed by get_backoff_seconds().
MAX_BACKOFF_SECONDS = 60
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def is_retryable_status(status_code: Optional[int]) -> bool:
    """Tell whether an HTTP status code is one of the retryable ones.

    Args:
        status_code: Status code to classify, or ``None`` when unknown.

    Returns:
        ``False`` for ``None``; otherwise whether ``int(status_code)`` is a
        member of ``RETRYABLE_HTTP_STATUS_CODES``.
    """
    return (
        status_code is not None
        and int(status_code) in RETRYABLE_HTTP_STATUS_CODES
    )
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def get_backoff_seconds(attempt: int) -> int:
    """Compute the exponential backoff delay for a retry attempt.

    The delay is ``2 ** attempt`` seconds (2, 4, 8 for attempts 1..3),
    limited to at most ``MAX_BACKOFF_SECONDS``.

    Args:
        attempt: One-based retry attempt number.

    Returns:
        The delay in seconds.

    Raises:
        ValueError: If ``attempt`` is less than 1.
    """
    if attempt < 1:
        raise ValueError("attempt must be >= 1")

    uncapped = 2 ** attempt
    return min(uncapped, MAX_BACKOFF_SECONDS)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def get_retry_after_seconds(retry_after_header: Optional[str]) -> Optional[int]:
    """Parse a Retry-After header value into a non-negative delay in seconds.

    The value may be either delay-seconds (an integer) or an HTTP-date, as
    allowed by RFC 7231.

    Args:
        retry_after_header: Raw header value, or ``None`` when absent.

    Returns:
        The delay in whole seconds, never negative: an integer value is
        clamped at 0, and a date in the past yields 0. ``None`` is returned
        when the header is missing, blank, or cannot be parsed in either form.
    """
    if retry_after_header is None:
        return None

    value = retry_after_header.strip()
    if not value:
        return None

    # Try delay-seconds (integer) first
    try:
        return max(0, int(value))
    except ValueError:
        pass

    # Try HTTP-date format (RFC 7231 §7.1.3)
    try:
        retry_date = parsedate_to_datetime(value)
        # parsedate_to_datetime returns a naive datetime when the zone is
        # "-0000" (UTC with unknown source zone). Subtracting the aware "now"
        # from it would raise TypeError and drop a usable header, so pin
        # naive results to UTC first.
        if retry_date.tzinfo is None:
            retry_date = retry_date.replace(tzinfo=timezone.utc)
        now = datetime.now(timezone.utc)
        delta = int((retry_date - now).total_seconds())
        return max(0, delta)
    except (ValueError, TypeError):
        return None
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"schemaVersion": "1.2.0",
|
|
3
|
+
"default_evaluators": {
|
|
4
|
+
"Relevance": {},
|
|
5
|
+
"Coherence": {}
|
|
6
|
+
},
|
|
7
|
+
"items": [
|
|
8
|
+
{
|
|
9
|
+
"prompt": "What is Microsoft Graph?",
|
|
10
|
+
"expected_response": "Microsoft Graph is a gateway to data and intelligence in Microsoft 365."
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"name": "Travel planning conversation",
|
|
14
|
+
"description": "Multi-turn thread testing context retention across turns",
|
|
15
|
+
"turns": [
|
|
16
|
+
{
|
|
17
|
+
"prompt": "I'm planning a trip to Seattle next week.",
|
|
18
|
+
"expected_response": "I can help you plan your Seattle trip."
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"prompt": "What's the weather going to be like?",
|
|
22
|
+
"expected_response": "Seattle weather is typically mild with possible rain."
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"prompt": "Should I bring a rain jacket?",
|
|
26
|
+
"expected_response": "Yes, it's always a good idea to bring rain gear to Seattle.",
|
|
27
|
+
"evaluators": {
|
|
28
|
+
"Groundedness": { "threshold": 4 }
|
|
29
|
+
},
|
|
30
|
+
"evaluators_mode": "extend"
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""Per-API throttle gate support for transient HTTP 429 handling."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import threading
|
|
6
|
+
import time
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class GateState:
    """Point-in-time view of a ``ThrottleGate`` for diagnostics and tests.

    Instances are built by ``ThrottleGate.state()``.
    """

    # Name of the API the gate belongs to.
    api_name: str
    # ``time.time()``-based timestamp until which the gate blocks (0.0 when clear).
    blocked_until_epoch: float
    # Whether ``blocked_until_epoch`` was still in the future at snapshot time.
    is_blocked: bool
    # Retry-after value that last extended the block window; None if none is recorded.
    last_retry_after_seconds: Optional[int]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ThrottleGate:
    """Lock-guarded gate that makes callers wait out a shared block window.

    One gate is created per API name. ``apply_retry_after`` pushes the
    blocked-until timestamp forward, and ``wait_if_blocked`` sleeps the
    calling thread until that timestamp has passed. All timestamps are based
    on ``time.time()``.
    """

    # Cap on the cumulative time a single wait_if_blocked() call may sleep.
    MAX_GATE_WAIT_SECONDS = 300.0

    def __init__(self, api_name: str) -> None:
        """Create an unblocked gate for ``api_name``."""
        self.api_name = api_name
        self._lock = threading.Lock()
        self._blocked_until_epoch = 0.0
        self._last_retry_after_seconds: Optional[int] = None

    def apply_retry_after(self, retry_after_seconds: int) -> float:
        """Extend the block window by a retry-after duration, if it is longer.

        Negative durations are treated as 0. The window only ever moves
        forward: a request that would end earlier than the current window
        leaves the gate unchanged.

        Returns:
            The blocked-until timestamp in effect after the call.
        """
        seconds = max(0, int(retry_after_seconds))
        proposed_deadline = time.time() + seconds

        with self._lock:
            extends_window = proposed_deadline > self._blocked_until_epoch
            if extends_window:
                self._last_retry_after_seconds = seconds
                self._blocked_until_epoch = proposed_deadline
            return self._blocked_until_epoch

    def wait_if_blocked(self) -> float:
        """Sleep until the block window has passed.

        The window is re-read after every sleep, so an ``apply_retry_after``
        call made by another thread in the meantime is honoured as well.

        Returns:
            Total number of seconds this call spent sleeping (0.0 if the
            gate was already open).

        Raises:
            TimeoutError: If the next sleep would push the total wait above
                ``MAX_GATE_WAIT_SECONDS``.
        """
        slept_total = 0.0
        while True:
            with self._lock:
                remaining = self._blocked_until_epoch - time.time()

            if remaining <= 0:
                return slept_total

            # Refuse before sleeping, so an over-long window fails fast.
            if slept_total + remaining > self.MAX_GATE_WAIT_SECONDS:
                raise TimeoutError(
                    f"ThrottleGate '{self.api_name}' exceeded maximum wait of "
                    f"{self.MAX_GATE_WAIT_SECONDS}s (slept {slept_total:.1f}s so far)."
                )

            time.sleep(remaining)
            slept_total += remaining

    def clear(self) -> None:
        """Drop any block window and forget the recorded retry-after value."""
        with self._lock:
            self._last_retry_after_seconds = None
            self._blocked_until_epoch = 0.0

    def state(self) -> GateState:
        """Return a ``GateState`` snapshot taken while holding the lock."""
        with self._lock:
            current_time = time.time()
            deadline = self._blocked_until_epoch
            return GateState(
                api_name=self.api_name,
                blocked_until_epoch=deadline,
                is_blocked=deadline > current_time,
                last_retry_after_seconds=self._last_retry_after_seconds,
            )
|
|
@@ -8,8 +8,10 @@ import { ensurePythonRuntime, getCacheDir } from '../lib/python-runtime.js';
|
|
|
8
8
|
import { ensureVenv, executePythonCli } from '../lib/venv-manager.js';
|
|
9
9
|
import { getCacheStats, clearCache, formatBytes } from '../lib/cache-utils.js';
|
|
10
10
|
import { checkPackageExpiry } from '../lib/expiry-check.js';
|
|
11
|
+
import { recordAcceptance, checkAcceptance } from '../lib/eula-manager.js';
|
|
11
12
|
import { ProgressReporter } from '../lib/progress.js';
|
|
12
13
|
import { _loadEnvFile as loadEnvFile, _loadUserEnvOverride } from '../lib/env-loader.js';
|
|
14
|
+
import { normalizeAgentId } from '../lib/agent-id.js';
|
|
13
15
|
|
|
14
16
|
// Check package expiry (exits if expired, warns if close to expiry)
|
|
15
17
|
checkPackageExpiry();
|
|
@@ -22,20 +24,13 @@ const packageJsonPath = path.join(__dirname, '..', '..', '..', '..', 'package.js
|
|
|
22
24
|
const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, 'utf8'));
|
|
23
25
|
const VERSION = packageJson.version;
|
|
24
26
|
|
|
27
|
+
const EULA_URL = 'https://aka.ms/evaltoolterms';
|
|
28
|
+
|
|
25
29
|
// Path to Python CLI and requirements
|
|
26
30
|
const PYTHON_CLI_DIR = path.join(__dirname, '..', '..', 'cli');
|
|
27
31
|
const MAIN_SCRIPT = path.join(PYTHON_CLI_DIR, 'main.py');
|
|
28
32
|
const REQUIREMENTS_FILE = path.join(PYTHON_CLI_DIR, 'requirements.txt');
|
|
29
33
|
|
|
30
|
-
/**
|
|
31
|
-
* Display usage terms notice
|
|
32
|
-
* Called before running evaluations (but not for --init-only, cache commands, or --signout).
|
|
33
|
-
* This notice is always displayed per legal requirements (FR-006).
|
|
34
|
-
*/
|
|
35
|
-
function displayUsageTerms() {
|
|
36
|
-
console.log('By using this tool, you agree to the Terms of Use: https://aka.ms/evaltoolterms\n');
|
|
37
|
-
}
|
|
38
|
-
|
|
39
34
|
/**
|
|
40
35
|
* Set default environment constants that cannot be overridden
|
|
41
36
|
* This ensures these values are always set regardless of .env files
|
|
@@ -176,21 +171,80 @@ async function main() {
|
|
|
176
171
|
.option('--prompts-file <file>', 'JSON file with prompts and expected responses')
|
|
177
172
|
.option('-o, --output <file>', 'output file (JSON, CSV, or HTML)')
|
|
178
173
|
.option('-i, --interactive', 'interactive mode (enter prompts interactively)')
|
|
174
|
+
.option('--concurrency <number>', 'max prompts to process in parallel (1-5)')
|
|
179
175
|
.option('--m365-agent-id <id>', 'agent ID (overrides env vars and auto-construction)')
|
|
180
176
|
.option('--env <environment>', 'environment name (loads env/.env.<environment>)', 'local')
|
|
181
177
|
.option('--init-only', 'only initialize Python environment, don\'t run evaluations')
|
|
182
178
|
.option('--cache-info', 'show cache information and statistics')
|
|
183
179
|
.option('--cache-clear', 'clear the cache (removes Python runtime and venv)')
|
|
184
180
|
.option('--cache-dir', 'print the cache directory path')
|
|
185
|
-
.option('--signout', 'sign out and clear cached authentication tokens')
|
|
181
|
+
.option('--signout', 'sign out and clear cached authentication tokens')
|
|
182
|
+
.action(() => {
|
|
183
|
+
// Default command — handled by the main flow below parseAsync()
|
|
184
|
+
});
|
|
185
|
+
|
|
186
|
+
program
|
|
187
|
+
.command('accept-eula')
|
|
188
|
+
.description('Accept the End User License Agreement (EULA)')
|
|
189
|
+
.action(async () => {
|
|
190
|
+
const config = (await import('../config/default.js')).default;
|
|
191
|
+
try {
|
|
192
|
+
await recordAcceptance(config.eula.version);
|
|
193
|
+
console.log('EULA has been accepted');
|
|
194
|
+
process.exit(0);
|
|
195
|
+
} catch (err) {
|
|
196
|
+
console.error(
|
|
197
|
+
`⚠️ Unable to persist EULA acceptance: ${err.message}`,
|
|
198
|
+
);
|
|
199
|
+
console.error(
|
|
200
|
+
'Please ensure the directory ~/.m365-copilot-agent-evals/ is writable.',
|
|
201
|
+
);
|
|
202
|
+
process.exit(1);
|
|
203
|
+
}
|
|
204
|
+
});
|
|
186
205
|
|
|
187
|
-
program.
|
|
206
|
+
await program.parseAsync(process.argv);
|
|
188
207
|
const options = program.opts();
|
|
189
208
|
const effectiveLogLevel = resolveLogLevel(options);
|
|
190
209
|
const outputMode = deriveWrapperOutputMode(effectiveLogLevel);
|
|
191
210
|
const wrapperVerbose = outputMode.verbose;
|
|
192
211
|
const wrapperQuiet = outputMode.quiet;
|
|
193
212
|
|
|
213
|
+
// === EULA Enforcement Gate ===
|
|
214
|
+
// Block all commands until EULA is accepted (FR-010, FR-011).
|
|
215
|
+
// accept-eula subcommand, --help, and --version are already handled
|
|
216
|
+
// by Commander during program.parseAsync() and exit before reaching here.
|
|
217
|
+
const config = (await import('../config/default.js')).default;
|
|
218
|
+
const { accepted, stale } = await checkAcceptance(config.eula.version);
|
|
219
|
+
if (!accepted) {
|
|
220
|
+
if (stale) {
|
|
221
|
+
console.error(
|
|
222
|
+
`==============================================================
|
|
223
|
+
The End User License Agreement (EULA) has been updated.
|
|
224
|
+
Please review the updated terms at:
|
|
225
|
+
${EULA_URL}
|
|
226
|
+
|
|
227
|
+
To accept the updated EULA, please execute the following command:
|
|
228
|
+
|
|
229
|
+
runevals accept-eula
|
|
230
|
+
|
|
231
|
+
==============================================================`);
|
|
232
|
+
} else {
|
|
233
|
+
console.error(
|
|
234
|
+
`==============================================================
|
|
235
|
+
In order to use this tool you must accept the End User License
|
|
236
|
+
Agreement (EULA) found at:
|
|
237
|
+
${EULA_URL}
|
|
238
|
+
|
|
239
|
+
To accept the EULA, please execute the following command:
|
|
240
|
+
|
|
241
|
+
runevals accept-eula
|
|
242
|
+
|
|
243
|
+
==============================================================`);
|
|
244
|
+
}
|
|
245
|
+
process.exit(2);
|
|
246
|
+
}
|
|
247
|
+
|
|
194
248
|
// Handle cache commands first (they don't need environment validation or config)
|
|
195
249
|
if (options.cacheInfo) {
|
|
196
250
|
console.log('🗂️ Cache Information\n');
|
|
@@ -251,8 +305,7 @@ async function main() {
|
|
|
251
305
|
|
|
252
306
|
// === From here on, we're running actual evals - load config and env files ===
|
|
253
307
|
|
|
254
|
-
|
|
255
|
-
// Load build-time config
|
|
308
|
+
// Load build-time config (already loaded above for EULA check)
|
|
256
309
|
await setDefaultEnvironmentConstants();
|
|
257
310
|
|
|
258
311
|
// Load environment files
|
|
@@ -322,11 +375,17 @@ async function main() {
|
|
|
322
375
|
}
|
|
323
376
|
}
|
|
324
377
|
|
|
325
|
-
// Resolve agent ID from environment if not explicitly provided via CLI flag
|
|
326
|
-
// loadEnvFile already resolved aliases (e.g. M365_TITLE_ID) into M365_AGENT_ID
|
|
378
|
+
// Resolve agent ID from environment if not explicitly provided via CLI flag.
|
|
379
|
+
// loadEnvFile already resolved aliases (e.g. M365_TITLE_ID) into M365_AGENT_ID.
|
|
380
|
+
// Then normalize via shared helper and sync to process.env so downstream
|
|
381
|
+
// readers (and the python CLI) see the canonical form.
|
|
327
382
|
if (!resolvedAgentId) {
|
|
328
383
|
resolvedAgentId = envVars['M365_AGENT_ID'] || process.env.M365_AGENT_ID;
|
|
329
|
-
|
|
384
|
+
}
|
|
385
|
+
resolvedAgentId = normalizeAgentId(resolvedAgentId);
|
|
386
|
+
if (resolvedAgentId) {
|
|
387
|
+
process.env.M365_AGENT_ID = resolvedAgentId;
|
|
388
|
+
if (!options.m365AgentId && !wrapperQuiet) {
|
|
330
389
|
console.log(`🤖 Agent ID: ${resolvedAgentId}`);
|
|
331
390
|
}
|
|
332
391
|
}
|
|
@@ -458,6 +517,10 @@ async function main() {
|
|
|
458
517
|
if (options.prompts && options.prompts.length > 0) {
|
|
459
518
|
pythonArgs.push('--prompts', ...options.prompts);
|
|
460
519
|
}
|
|
520
|
+
|
|
521
|
+
if (options.concurrency !== undefined) {
|
|
522
|
+
pythonArgs.push('--concurrency', String(options.concurrency));
|
|
523
|
+
}
|
|
461
524
|
|
|
462
525
|
if (options.expected && options.expected.length > 0) {
|
|
463
526
|
pythonArgs.push('--expected', ...options.expected);
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* Build-time injected default values
|
|
3
3
|
* DO NOT EDIT - This file is auto-generated during build.
|
|
4
4
|
*
|
|
5
|
-
* Generated: 2026-04-
|
|
5
|
+
* Generated: 2026-04-22T20:44:41.713Z
|
|
6
6
|
*
|
|
7
7
|
* @copyright Microsoft Corporation. All rights reserved.
|
|
8
8
|
* @license MIT
|
|
@@ -21,5 +21,9 @@ export default {
|
|
|
21
21
|
|
|
22
22
|
/** Scenario Header for Copilot API */
|
|
23
23
|
scenarioHeader: "agenticevaluation"
|
|
24
|
+
},
|
|
25
|
+
eula: {
|
|
26
|
+
/** EULA version string for acceptance tracking */
|
|
27
|
+
version: "2026-04-01"
|
|
24
28
|
}
|
|
25
29
|
};
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
 * Bring an M365 agent ID into its dotted form.
 *
 * An ID without any '.' gets the '.declarativeAgent' suffix appended.
 * Falsy input (null, undefined, empty string) and IDs that already
 * contain a '.' are handed back untouched.
 *
 * @param {string|null|undefined} id - The raw agent ID value.
 * @returns {string|null|undefined} The normalized agent ID.
 */
export function normalizeAgentId(id) {
  if (id && !id.includes('.')) {
    return `${id}.declarativeAgent`;
  }
  return id;
}
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
* Handles .env.local, .env.local.user, and other env file formats.
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
|
+
import { parse as dotenvParse } from 'dotenv';
|
|
6
7
|
import fs from 'fs';
|
|
7
8
|
import path from 'path';
|
|
8
9
|
|
|
@@ -21,7 +22,8 @@ const AGENT_ID_ALIASES = [
|
|
|
21
22
|
|
|
22
23
|
/**
|
|
23
24
|
* Load environment variables from a .env-style file.
|
|
24
|
-
*
|
|
25
|
+
* Uses dotenv.parse() for standards-compliant parsing (handles quoted values,
|
|
26
|
+
* inline comments, escape sequences). Protected keys are ignored with a warning.
|
|
25
27
|
* Malformed lines (no '=' separator) are skipped with a warning.
|
|
26
28
|
* @param {string} envFilePath - Absolute path to the env file
|
|
27
29
|
* @returns {Object|null} Parsed key-value pairs, or null if file cannot be read
|
|
@@ -34,32 +36,25 @@ export function _loadEnvFile(envFilePath) {
|
|
|
34
36
|
const envVars = {};
|
|
35
37
|
try {
|
|
36
38
|
const content = fs.readFileSync(envFilePath, 'utf-8');
|
|
37
|
-
const lines = content.split('\n');
|
|
38
39
|
|
|
39
|
-
for
|
|
40
|
+
// Pre-scan for malformed lines (no '=') and emit warnings
|
|
41
|
+
for (const line of content.split('\n')) {
|
|
40
42
|
const trimmedLine = line.trim();
|
|
41
43
|
if (!trimmedLine || trimmedLine.startsWith('#')) {
|
|
42
44
|
continue;
|
|
43
45
|
}
|
|
44
|
-
|
|
45
|
-
const eqIndex = trimmedLine.indexOf('=');
|
|
46
|
-
if (eqIndex === -1) {
|
|
46
|
+
if (trimmedLine.indexOf('=') === -1) {
|
|
47
47
|
console.warn(
|
|
48
48
|
`⚠️ Ignoring malformed line in env file (missing '='): ${trimmedLine}`
|
|
49
49
|
);
|
|
50
|
-
continue;
|
|
51
50
|
}
|
|
51
|
+
}
|
|
52
52
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
.trim()
|
|
57
|
-
.replace(/^(['"])(.*)\1$/, '$2');
|
|
58
|
-
|
|
59
|
-
if (!keyName) {
|
|
60
|
-
continue;
|
|
61
|
-
}
|
|
53
|
+
// Use dotenv.parse() for standards-compliant .env parsing
|
|
54
|
+
// (handles quoted values, inline comments, escape sequences, export prefix)
|
|
55
|
+
const parsed = dotenvParse(content);
|
|
62
56
|
|
|
57
|
+
for (const [keyName, value] of Object.entries(parsed)) {
|
|
63
58
|
if (PROTECTED_KEYS.includes(keyName)) {
|
|
64
59
|
console.warn(
|
|
65
60
|
`⚠️ Ignoring ${keyName} from .env file (using built-in value)`
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* EULA acceptance manager
|
|
3
|
+
*
|
|
4
|
+
* Manages reading and writing the EULA acceptance marker file at
|
|
5
|
+
* ~/.m365-copilot-agent-evals/eula-acceptance.json.
|
|
6
|
+
* This location is independent of the cache directory so acceptance
|
|
7
|
+
* survives --cache-clear operations.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import fs from 'node:fs/promises';
|
|
11
|
+
import path from 'node:path';
|
|
12
|
+
import os from 'node:os';
|
|
13
|
+
|
|
14
|
+
const EULA_DIR_NAME = '.m365-copilot-agent-evals';
|
|
15
|
+
const EULA_FILE_NAME = 'eula-acceptance.json';
|
|
16
|
+
|
|
17
|
+
/**
 * Resolve the directory that holds the EULA acceptance marker:
 * the EULA_DIR_NAME folder inside the current user's home directory.
 * @returns {string}
 */
export function getEulaDir() {
  const homeDir = os.homedir();
  return path.join(homeDir, EULA_DIR_NAME);
}
|
|
24
|
+
|
|
25
|
+
/**
 * Resolve the full path of the acceptance marker file, i.e. EULA_FILE_NAME
 * inside the directory returned by getEulaDir().
 * @returns {string}
 */
export function getEulaFilePath() {
  const eulaDir = getEulaDir();
  return path.join(eulaDir, EULA_FILE_NAME);
}
|
|
32
|
+
|
|
33
|
+
/**
 * Persist an acceptance marker for the given EULA version.
 *
 * The EULA directory is created first (including missing parents), then the
 * marker is written as pretty-printed JSON containing the version and the
 * current time as an ISO-8601 string. Any filesystem error rejects the
 * returned promise.
 *
 * @param {string} version - EULA version string
 * @returns {Promise<void>}
 */
export async function recordAcceptance(version) {
  await fs.mkdir(getEulaDir(), { recursive: true });

  const acceptedAt = new Date().toISOString();
  const payload = JSON.stringify({ version, acceptedAt }, null, 2);

  await fs.writeFile(getEulaFilePath(), payload, 'utf-8');
}
|
|
49
|
+
|
|
50
|
+
/**
 * Compare the stored acceptance marker with the required EULA version.
 *
 * - No usable marker: `{ accepted: false, stale: false, marker: null }`.
 * - Marker for a different version: `{ accepted: false, stale: true, marker }`.
 * - Marker for the required version: `{ accepted: true, stale: false, marker }`.
 *
 * @param {string} requiredVersion - The version to check against
 * @returns {Promise<{accepted: boolean, stale: boolean, marker: object|null}>}
 */
export async function checkAcceptance(requiredVersion) {
  const marker = await _readMarker();

  if (marker) {
    const versionMatches = marker.version === requiredVersion;
    return { accepted: versionMatches, stale: !versionMatches, marker };
  }

  return { accepted: false, stale: false, marker: null };
}
|
|
62
|
+
|
|
63
|
+
/**
 * Load the acceptance marker from disk.
 *
 * Resolves to null when the file cannot be read, is not valid JSON, or
 * lacks a truthy `version` or `acceptedAt` field; it never rejects.
 * Exported with _ prefix for unit testing.
 *
 * @returns {Promise<object|null>}
 */
export async function _readMarker() {
  try {
    const marker = JSON.parse(await fs.readFile(getEulaFilePath(), 'utf-8'));
    // Property access stays inside the try: a JSON `null` payload throws here.
    const isComplete = marker.version && marker.acceptedAt;
    return isComplete ? marker : null;
  } catch {
    return null;
  }
}
|
|
@@ -605,18 +605,20 @@ export class ProgressReporter {
|
|
|
605
605
|
|
|
606
606
|
this.phaseStatuses.set(phaseId, 'failed');
|
|
607
607
|
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
608
|
+
if (!this.options.quiet) {
|
|
609
|
+
// Clear current line and display error
|
|
610
|
+
if (this.isInteractive) {
|
|
611
|
+
readline.clearLine(process.stdout, 0);
|
|
612
|
+
readline.cursorTo(process.stdout, 0);
|
|
613
|
+
}
|
|
613
614
|
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
615
|
+
console.error(`\n❌ Failed: ${phase.name}`);
|
|
616
|
+
console.error(`\nError: ${error.message}`);
|
|
617
|
+
console.error(`\nSuggested actions:`);
|
|
618
|
+
console.error(` • Check your internet connection`);
|
|
619
|
+
console.error(` • If behind a proxy, set HTTP_PROXY/HTTPS_PROXY`);
|
|
620
|
+
console.error(` • Run with --verbose for detailed output`);
|
|
621
|
+
}
|
|
620
622
|
|
|
621
623
|
this.currentPhase = null;
|
|
622
624
|
this.phaseStartTime = null;
|