@microsoft/m365-copilot-eval 1.2.1-preview.1 → 1.4.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -101
- package/package.json +7 -4
- package/schema/CHANGELOG.md +8 -0
- package/schema/v1/eval-document.schema.json +256 -8
- package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
- package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
- package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
- package/schema/v1/examples/valid/comprehensive.json +27 -2
- package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
- package/schema/v1/examples/valid/multi-turn-output.json +59 -0
- package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
- package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
- package/schema/version.json +2 -2
- package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
- package/src/clients/cli/api_clients/A2A/a2a_client.py +456 -0
- package/src/clients/cli/api_clients/REST/__init__.py +3 -0
- package/src/clients/cli/api_clients/REST/sydney_client.py +204 -0
- package/src/clients/cli/api_clients/__init__.py +3 -0
- package/src/clients/cli/api_clients/base_agent_client.py +78 -0
- package/src/clients/cli/cli_logging/__init__.py +0 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +107 -0
- package/src/clients/cli/cli_logging/logging_utils.py +144 -0
- package/src/clients/cli/common.py +62 -0
- package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +3 -3
- package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +11 -11
- package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +1 -11
- package/src/clients/cli/evaluator_resolver.py +150 -0
- package/src/clients/cli/generate_report.py +347 -184
- package/src/clients/cli/main.py +1288 -481
- package/src/clients/cli/parallel_executor.py +57 -0
- package/src/clients/cli/readme.md +14 -7
- package/src/clients/cli/requirements.txt +1 -1
- package/src/clients/cli/response_extractor.py +30 -14
- package/src/clients/cli/retry_policy.py +52 -0
- package/src/clients/cli/samples/multiturn_example.json +35 -0
- package/src/clients/cli/throttle_gate.py +82 -0
- package/src/clients/node-js/bin/runevals.js +134 -41
- package/src/clients/node-js/config/default.js +5 -1
- package/src/clients/node-js/lib/agent-id.js +12 -0
- package/src/clients/node-js/lib/env-loader.js +11 -16
- package/src/clients/node-js/lib/eula-manager.js +78 -0
- package/src/clients/node-js/lib/progress.js +13 -11
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Parallel prompt execution utilities.
|
|
2
|
+
|
|
3
|
+
This module provides a minimal reusable executor that preserves input order.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from typing import Any, Callable, Generic, Iterable, List, Optional, TypeVar
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Type of each input item passed to a worker.
T = TypeVar("T")
# Type of the value a worker returns.
R = TypeVar("R")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class WorkerResult(Generic[R]):
    """Result model used to preserve input ordering and capture failures.

    One instance is produced per input item by ``execute_in_parallel``. On
    success ``value`` is populated and ``error`` is left as ``None``; when the
    worker raised an ``Exception``, ``error`` is populated and ``value`` is
    left as ``None``.
    """

    # Zero-based position of the originating item in the input iterable.
    index: int
    # Value returned by the worker; remains None when the worker raised.
    value: Optional[R] = None
    # Exception raised by the worker; remains None when the worker succeeded.
    error: Optional[Exception] = None
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def execute_in_parallel(
    items: Iterable[T],
    worker: Callable[[T, int], R],
    max_workers: int,
) -> List[WorkerResult[R]]:
    """Execute worker function in parallel while preserving input order.

    `worker` receives `(item, index)` and returns a value.

    Args:
        items: Inputs to process. The iterable is fully materialized up front.
        worker: Callable invoked once per item as ``worker(item, index)``.
        max_workers: Requested thread count. It is clamped to the range
            ``1..len(items)`` before the pool is created.

    Returns:
        One ``WorkerResult`` per input item, positioned at the item's input
        index. An ``Exception`` raised by a worker is stored in that entry's
        ``error`` field instead of being propagated. An empty input yields an
        empty list without creating a thread pool.

    Raises:
        KeyboardInterrupt, SystemExit: Propagated to the caller. Before
            re-raising, every future that has not started yet is cancelled so
            that leaving the executor does not first run the whole backlog.
    """
    indexed_items = list(enumerate(items))
    if not indexed_items:
        return []

    normalized_workers = max(1, min(max_workers, len(indexed_items)))
    results: List[WorkerResult[R]] = [WorkerResult(index=i) for i, _ in indexed_items]

    with ThreadPoolExecutor(max_workers=normalized_workers) as executor:
        # Maps each submitted future back to the index of its input item.
        future_map = {}
        try:
            for index, item in indexed_items:
                future_map[executor.submit(worker, item, index)] = index

            for future in as_completed(future_map):
                index = future_map[future]
                try:
                    results[index] = WorkerResult(index=index, value=future.result())
                except (KeyboardInterrupt, SystemExit):
                    raise
                except Exception as exc:
                    results[index] = WorkerResult(index=index, error=exc)
        except BaseException:
            # The executor's __exit__ waits for every queued task. Cancel the
            # ones that have not started so an interrupt is honoured promptly;
            # cancel() is a no-op for futures already running or finished.
            for pending in future_map:
                pending.cancel()
            raise

    return results
|
|
@@ -97,21 +97,28 @@ python main.py --interactive
|
|
|
97
97
|
|
|
98
98
|
#### Additional Options
|
|
99
99
|
```bash
|
|
100
|
-
#
|
|
101
|
-
python main.py --
|
|
100
|
+
# Logging verbosity (canonical control surface)
|
|
101
|
+
python main.py --log-level debug
|
|
102
|
+
python main.py --log-level info
|
|
103
|
+
python main.py --log-level warning
|
|
104
|
+
python main.py --log-level error
|
|
105
|
+
|
|
106
|
+
# Bare flag resolves to info
|
|
107
|
+
python main.py --log-level
|
|
102
108
|
|
|
103
|
-
#
|
|
109
|
+
# Legacy flags (no longer supported; use --log-level instead)
|
|
110
|
+
# The following will fail with "unrecognized arguments" errors:
|
|
111
|
+
python main.py --verbose
|
|
104
112
|
python main.py --quiet
|
|
105
113
|
|
|
114
|
+
# Share diagnostics with support (console-based, no archive artifacts)
|
|
115
|
+
python main.py --log-level debug --prompts-file samples/example_prompts.json
|
|
116
|
+
|
|
106
117
|
# Get help and see all options
|
|
107
118
|
python main.py --help
|
|
108
119
|
|
|
109
120
|
# Specify / override the Agent ID (takes precedence over M365_AGENT_ID env var)
|
|
110
121
|
python main.py --m365-agent-id "00000000-0000-0000-0000-000000000000"
|
|
111
|
-
|
|
112
|
-
# Citation format options
|
|
113
|
-
python main.py --citation-format oai_unicode # Default: New OAI format
|
|
114
|
-
python main.py --citation-format legacy_bracket # Old [^i^] format
|
|
115
122
|
```
|
|
116
123
|
|
|
117
124
|
#### File Format Examples
|
|
@@ -27,14 +27,20 @@ Date: September 21, 2025
|
|
|
27
27
|
|
|
28
28
|
import json
|
|
29
29
|
import logging
|
|
30
|
-
from typing import Dict, List, Any, Optional
|
|
30
|
+
from typing import Dict, List, Any, Optional, Tuple
|
|
31
31
|
from datetime import datetime
|
|
32
32
|
from enum import Enum
|
|
33
|
+
from cli_logging.logging_utils import LOG_LEVEL_MAP, LogLevel
|
|
33
34
|
|
|
34
35
|
# Configure logging
|
|
35
|
-
logging.
|
|
36
|
+
if not logging.getLogger().handlers:
|
|
37
|
+
logging.basicConfig(level=logging.INFO)
|
|
36
38
|
logger = logging.getLogger(__name__)
|
|
37
39
|
|
|
40
|
+
def _log_level_to_python_level(log_level: str) -> int:
    """Translate a textual log level into a ``logging`` level number.

    The name is trimmed and lower-cased before it is looked up in
    ``LOG_LEVEL_MAP``. A falsy ``log_level`` is looked up as ``"info"``, and
    any name missing from the map falls back to ``logging.INFO``.
    """
    normalized = (log_level or "info").strip().lower()
    return LOG_LEVEL_MAP.get(normalized, logging.INFO)
|
|
43
|
+
|
|
38
44
|
class MessageRole(Enum):
|
|
39
45
|
"""Enumeration for message roles."""
|
|
40
46
|
USER = "user"
|
|
@@ -71,8 +77,10 @@ class EnhancedResponseExtractor:
|
|
|
71
77
|
"generate_response"
|
|
72
78
|
}
|
|
73
79
|
|
|
74
|
-
def __init__(self):
|
|
80
|
+
def __init__(self, log_level: str = "info"):
    """Initialise the extractor and apply the requested log verbosity.

    Args:
        log_level: Textual log level. It is trimmed and lower-cased before
            being stored; a falsy value is stored as ``"info"``.
    """
    self.tool_call_counter = 0
    self.log_level = (log_level or "info").strip().lower()
    # NOTE(review): this sets the level on the shared module-level logger, so
    # each new instance overrides the level chosen by earlier instances.
    logger.setLevel(_log_level_to_python_level(self.log_level))
|
|
76
84
|
|
|
77
85
|
def _generate_tool_call_id(self, tool_name: str) -> str:
|
|
78
86
|
"""Generate a unique tool call ID."""
|
|
@@ -461,6 +469,7 @@ class EnhancedResponseExtractor:
|
|
|
461
469
|
"metadata": {
|
|
462
470
|
"conversation_id": response_data.get("conversationId"),
|
|
463
471
|
"request_id": response_data.get("requestId"),
|
|
472
|
+
"message_id": None,
|
|
464
473
|
"telemetry_available": False
|
|
465
474
|
}
|
|
466
475
|
}
|
|
@@ -470,6 +479,11 @@ class EnhancedResponseExtractor:
|
|
|
470
479
|
if isinstance(response_data, dict):
|
|
471
480
|
# Messages are directly in the response_data object
|
|
472
481
|
messages = response_data.get("messages", [])
|
|
482
|
+
|
|
483
|
+
# Extract message_id from the last bot message in this response
|
|
484
|
+
bot_messages = [m for m in messages if m.get("author") != "user"]
|
|
485
|
+
if bot_messages and bot_messages[-1].get("messageId"):
|
|
486
|
+
enhanced_response["metadata"]["message_id"] = bot_messages[-1]["messageId"]
|
|
473
487
|
|
|
474
488
|
# Extract telemetry tools if available
|
|
475
489
|
telemetry_tools = []
|
|
@@ -526,6 +540,7 @@ class EnhancedResponseExtractor:
|
|
|
526
540
|
"metadata": {
|
|
527
541
|
"conversation_id": None,
|
|
528
542
|
"request_id": None,
|
|
543
|
+
"message_id": None,
|
|
529
544
|
"telemetry_available": False
|
|
530
545
|
}
|
|
531
546
|
}
|
|
@@ -552,28 +567,29 @@ class EnhancedResponseExtractor:
|
|
|
552
567
|
"metadata": {
|
|
553
568
|
"conversation_id": None,
|
|
554
569
|
"request_id": None,
|
|
570
|
+
"message_id": None,
|
|
555
571
|
"telemetry_available": False,
|
|
556
572
|
"error": str(e)
|
|
557
573
|
}
|
|
558
574
|
}
|
|
559
575
|
|
|
560
|
-
def
|
|
576
|
+
def extract_enhanced_response(raw_response: str, log_level: str = "info") -> Dict[str, Any]:
|
|
561
577
|
"""
|
|
562
|
-
Extract enhanced response information
|
|
578
|
+
Extract enhanced response information from a raw response string.
|
|
563
579
|
|
|
564
580
|
Args:
|
|
565
|
-
|
|
581
|
+
raw_response: Raw response string from the agent
|
|
582
|
+
log_level: Logging level for the extraction process (default: "info")
|
|
566
583
|
|
|
567
584
|
Returns:
|
|
568
|
-
|
|
585
|
+
A dictionary containing the enhanced response information, including:
|
|
586
|
+
- "response": Reconstructed message flow with tool calls and results
|
|
587
|
+
- "tool_definitions": List of tool definitions extracted from telemetry
|
|
588
|
+
- "raw_response_text": Original response text for backward compatibility
|
|
589
|
+
- "metadata": Additional metadata such as conversation ID, request ID, etc.
|
|
569
590
|
"""
|
|
570
|
-
extractor = EnhancedResponseExtractor()
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
for prompt, raw_response in responses.items():
|
|
574
|
-
enhanced_responses[prompt] = extractor.extract_enhanced_response(raw_response)
|
|
575
|
-
|
|
576
|
-
return enhanced_responses
|
|
591
|
+
extractor = EnhancedResponseExtractor(log_level=log_level)
|
|
592
|
+
return extractor.extract_enhanced_response(raw_response)
|
|
577
593
|
|
|
578
594
|
def get_response_text_for_evaluation(enhanced_response: Dict[str, Any]) -> str:
|
|
579
595
|
"""
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Retry utilities for transient HTTP failures in evaluation flows."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
from email.utils import parsedate_to_datetime
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
# HTTP status codes that is_retryable_status() reports as transient.
RETRYABLE_HTTP_STATUS_CODES = {429, 503, 504}
# Upper bound, in seconds, applied by get_backoff_seconds().
MAX_BACKOFF_SECONDS = 60
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def is_retryable_status(status_code: Optional[int]) -> bool:
    """Return True for transient HTTP status codes covered by the spec.

    Args:
        status_code: HTTP status of a response, or ``None`` when no status is
            available. The value is coerced with ``int()`` before the
            membership test against ``RETRYABLE_HTTP_STATUS_CODES``.

    Returns:
        ``False`` for ``None``; otherwise whether the code is in the
        retryable set.
    """
    if status_code is None:
        return False
    return int(status_code) in RETRYABLE_HTTP_STATUS_CODES
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def get_backoff_seconds(attempt: int) -> int:
    """Return exponential backoff delay capped at MAX_BACKOFF_SECONDS.

    Examples: 2, 4, 8 for attempts 1..3.

    Args:
        attempt: 1-based retry attempt number.

    Returns:
        ``2 ** attempt`` seconds, never more than ``MAX_BACKOFF_SECONDS``.

    Raises:
        ValueError: If ``attempt`` is less than 1.
    """
    if attempt < 1:
        raise ValueError("attempt must be >= 1")
    # 2 ** bit_length() is already larger than the cap, so clamping the
    # exponent leaves the result unchanged while avoiding an enormous
    # intermediate integer when a caller passes a very large attempt number.
    exponent = min(attempt, MAX_BACKOFF_SECONDS.bit_length())
    return min(2 ** exponent, MAX_BACKOFF_SECONDS)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def get_retry_after_seconds(retry_after_header: Optional[str]) -> Optional[int]:
    """Parse Retry-After header value (delay-seconds or HTTP-date per RFC 7231).

    Args:
        retry_after_header: Raw header value, or ``None`` when the header is
            absent.

    Returns:
        A non-negative whole number of seconds to wait, or ``None`` when the
        header is missing, blank, or matches neither supported form. Negative
        delays and dates in the past are reported as ``0``.
    """
    if retry_after_header is None:
        return None

    value = retry_after_header.strip()
    if not value:
        return None

    # Try delay-seconds (integer) first
    try:
        return max(0, int(value))
    except ValueError:
        pass

    # Try HTTP-date format (RFC 7231 §7.1.3)
    try:
        retry_date = parsedate_to_datetime(value)
    except (ValueError, TypeError):
        return None

    if retry_date.tzinfo is None:
        # parsedate_to_datetime returns a naive datetime for a "-0000" zone,
        # which by convention denotes UTC. Attach UTC so the subtraction below
        # is aware-vs-aware instead of raising TypeError.
        retry_date = retry_date.replace(tzinfo=timezone.utc)

    now = datetime.now(timezone.utc)
    delta = int((retry_date - now).total_seconds())
    return max(0, delta)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"schemaVersion": "1.2.0",
|
|
3
|
+
"default_evaluators": {
|
|
4
|
+
"Relevance": {},
|
|
5
|
+
"Coherence": {}
|
|
6
|
+
},
|
|
7
|
+
"items": [
|
|
8
|
+
{
|
|
9
|
+
"prompt": "What is Microsoft Graph?",
|
|
10
|
+
"expected_response": "Microsoft Graph is a gateway to data and intelligence in Microsoft 365."
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"name": "Travel planning conversation",
|
|
14
|
+
"description": "Multi-turn thread testing context retention across turns",
|
|
15
|
+
"turns": [
|
|
16
|
+
{
|
|
17
|
+
"prompt": "I'm planning a trip to Seattle next week.",
|
|
18
|
+
"expected_response": "I can help you plan your Seattle trip."
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"prompt": "What's the weather going to be like?",
|
|
22
|
+
"expected_response": "Seattle weather is typically mild with possible rain."
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"prompt": "Should I bring a rain jacket?",
|
|
26
|
+
"expected_response": "Yes, it's always a good idea to bring rain gear to Seattle.",
|
|
27
|
+
"evaluators": {
|
|
28
|
+
"Groundedness": { "threshold": 4 }
|
|
29
|
+
},
|
|
30
|
+
"evaluators_mode": "extend"
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""Per-API throttle gate support for transient HTTP 429 handling."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import threading
|
|
6
|
+
import time
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class GateState:
    """Snapshot state for diagnostics and tests."""

    # Name of the API the gate belongs to.
    api_name: str
    # Wall-clock time (``time.time()`` scale) until which the gate is blocked.
    blocked_until_epoch: float
    # Whether the block window was still open when the snapshot was taken.
    is_blocked: bool
    # Retry-after value that set the current window; None if never set/cleared.
    last_retry_after_seconds: Optional[int]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ThrottleGate:
    """Thread-safe per-API gate that pauses workers until the block window elapses.

    All reads and writes of the block window happen under an internal lock.
    """

    # Upper bound on the cumulative time a single wait_if_blocked() call may
    # sleep before it gives up with TimeoutError.
    MAX_GATE_WAIT_SECONDS = 300.0

    def __init__(self, api_name: str) -> None:
        """Create an unblocked gate labelled with ``api_name``."""
        self.api_name = api_name
        self._lock = threading.Lock()
        self._blocked_until_epoch = 0.0
        self._last_retry_after_seconds: Optional[int] = None

    def apply_retry_after(self, retry_after_seconds: int) -> float:
        """Apply retry-after duration and keep the maximum active block window.

        The duration is coerced to a non-negative integer. The window is only
        ever extended: a candidate that ends earlier than the current window
        leaves the gate unchanged.

        Returns the current effective blocked-until epoch.
        """
        retry_after_seconds = max(0, int(retry_after_seconds))
        candidate = time.time() + retry_after_seconds

        with self._lock:
            if candidate > self._blocked_until_epoch:
                self._blocked_until_epoch = candidate
                self._last_retry_after_seconds = retry_after_seconds
            return self._blocked_until_epoch

    def wait_if_blocked(self) -> float:
        """Sleep until the gate opens. Returns the total slept duration in seconds.

        Re-checks the block window after each sleep to handle concurrent
        ``apply_retry_after`` calls that extend the window (avoids TOCTOU).
        Raises ``TimeoutError`` if the total wait exceeds ``MAX_GATE_WAIT_SECONDS``.
        """
        total_slept = 0.0
        while True:
            # Hold the lock only while reading the window, never while sleeping.
            with self._lock:
                delay = max(0.0, self._blocked_until_epoch - time.time())
            if delay <= 0:
                return total_slept
            if total_slept + delay > self.MAX_GATE_WAIT_SECONDS:
                raise TimeoutError(
                    f"ThrottleGate '{self.api_name}' exceeded maximum wait of "
                    f"{self.MAX_GATE_WAIT_SECONDS}s (slept {total_slept:.1f}s so far)."
                )
            time.sleep(delay)
            total_slept += delay

    def clear(self) -> None:
        """Reset the gate to unblocked state."""
        with self._lock:
            self._blocked_until_epoch = 0.0
            self._last_retry_after_seconds = None

    def state(self) -> GateState:
        """Return a point-in-time snapshot of the gate.

        The returned ``GateState`` is a detached copy of the current values;
        later changes to the gate are not reflected in it.
        """
        with self._lock:
            now = time.time()
            return GateState(
                api_name=self.api_name,
                blocked_until_epoch=self._blocked_until_epoch,
                is_blocked=self._blocked_until_epoch > now,
                last_retry_after_seconds=self._last_retry_after_seconds,
            )
|