@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.4.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +129 -97
- package/package.json +7 -4
- package/schema/v1/eval-document.schema.json +140 -8
- package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
- package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
- package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
- package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
- package/schema/v1/examples/valid/multi-turn-output.json +59 -0
- package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
- package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
- package/schema/version.json +2 -2
- package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
- package/src/clients/cli/api_clients/A2A/a2a_client.py +456 -0
- package/src/clients/cli/api_clients/REST/__init__.py +3 -0
- package/src/clients/cli/api_clients/REST/sydney_client.py +204 -0
- package/src/clients/cli/api_clients/__init__.py +3 -0
- package/src/clients/cli/api_clients/base_agent_client.py +78 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +54 -2
- package/src/clients/cli/cli_logging/logging_utils.py +0 -1
- package/src/clients/cli/common.py +11 -0
- package/src/clients/cli/generate_report.py +272 -129
- package/src/clients/cli/main.py +1006 -476
- package/src/clients/cli/parallel_executor.py +57 -0
- package/src/clients/cli/requirements.txt +1 -1
- package/src/clients/cli/response_extractor.py +12 -14
- package/src/clients/cli/retry_policy.py +52 -0
- package/src/clients/cli/samples/multiturn_example.json +35 -0
- package/src/clients/cli/throttle_gate.py +82 -0
- package/src/clients/node-js/bin/runevals.js +79 -16
- package/src/clients/node-js/config/default.js +5 -1
- package/src/clients/node-js/lib/agent-id.js +12 -0
- package/src/clients/node-js/lib/env-loader.js +11 -16
- package/src/clients/node-js/lib/eula-manager.js +78 -0
- package/src/clients/node-js/lib/progress.js +13 -11
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
import urllib.error
|
|
7
|
+
import urllib.parse
|
|
8
|
+
import urllib.request
|
|
9
|
+
from datetime import datetime, timezone
|
|
10
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
11
|
+
|
|
12
|
+
from api_clients.base_agent_client import BaseAgentClient
|
|
13
|
+
from cli_logging.console_diagnostics import emit_structured_log
|
|
14
|
+
from cli_logging.logging_utils import Operation
|
|
15
|
+
from response_extractor import extract_enhanced_response
|
|
16
|
+
|
|
17
|
+
_REQUEST_TIMEOUT_SECS = 120
|
|
18
|
+
|
|
19
|
+
# int → str used when passing log level to extract_enhanced_response
|
|
20
|
+
_LEVEL_INT_TO_STR: Dict[int, str] = {
|
|
21
|
+
logging.DEBUG: "debug",
|
|
22
|
+
logging.INFO: "info",
|
|
23
|
+
logging.WARNING: "warning",
|
|
24
|
+
logging.ERROR: "error",
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class SydneyClient(BaseAgentClient):
    """REST client for the Microsoft Copilot Sydney chat API.

    Issues two kinds of requests below the configured base URL:
    ``GET /GetGptList`` to enumerate agents and ``POST /chat`` to submit a
    prompt. Every request carries a bearer token and uses the module-level
    ``_REQUEST_TIMEOUT_SECS`` timeout.
    """

    def __init__(
        self,
        *,
        copilot_api_endpoint: str,
        access_token: str,
        user_oid: str,
        logger: Optional[logging.Logger] = None,
        diagnostic_records: Optional[List[Dict[str, Any]]] = None,
    ) -> None:
        """
        Args:
            copilot_api_endpoint: Base URL for the Copilot API. Trailing
                slashes are removed so path joins never produce ``//``.
            access_token: Bearer token for API authentication.
            user_oid: User object ID included in request payloads.
            logger: Logger to use for all client logging. Defaults to a module-level
                logger if not provided.
            diagnostic_records: List to accumulate structured log entries.
        """
        # Normalise once here; both request URLs are built as f"{endpoint}/<path>".
        self._endpoint = copilot_api_endpoint.rstrip("/")
        self._access_token = access_token
        self._user_oid = user_oid
        self._logger = logger or logging.getLogger(__name__)
        self._diagnostic_records = diagnostic_records
        # Captured once at construction; levels missing from the table map to "info".
        self._log_level = _LEVEL_INT_TO_STR.get(self._logger.getEffectiveLevel(), "info")

    # ------------------------------------------------------------------ #
    # BaseAgentClient implementation                                       #
    # ------------------------------------------------------------------ #

    def fetch_available_agents(self) -> List[Dict[str, Any]]:
        """Fetch agents available to the user from the Copilot API.

        The user OID is sent as a URL-quoted JSON document in the ``request``
        query parameter of ``/GetGptList``.

        Returns:
            The ``gptList`` value of the JSON response. An empty list is
            returned when the key is missing or null, when the server answers
            with an HTTP error, or when any other exception occurs (each
            failure is logged as a warning rather than raised).
        """
        try:
            request_data = json.dumps({"participant": {"id": self._user_oid}})
            query_param = urllib.parse.quote(request_data)
            agents_url = f"{self._endpoint}/GetGptList?request={query_param}"
            self._log(
                "debug",
                f"[REST] Fetching available agents from: {agents_url}",
                Operation.FETCH_AGENTS,
            )
            req = urllib.request.Request(
                agents_url,
                headers=self._build_request_headers(),
                method="GET",
            )
            with urllib.request.urlopen(req, timeout=_REQUEST_TIMEOUT_SECS) as resp:
                data = json.loads(resp.read().decode("utf-8"))
            # `"gptList": null` would otherwise leak None to callers that iterate the result.
            return data.get("gptList") or []
        except urllib.error.HTTPError as e:
            self._log(
                "warning",
                f"[REST] Unable to fetch agents list (HTTP {e.code}).",
                Operation.FETCH_AGENTS,
            )
            return []
        except Exception as e:
            # Deliberately broad: agent enumeration is best-effort and must not abort the run.
            self._log("warning", f"[REST] Error fetching agents: {e}", Operation.FETCH_AGENTS)
            return []

    def send_prompt(
        self,
        prompt: str,
        agent_id: str | None = None,
        conversation_context: Optional[Dict[str, Any]] = None,
    ) -> Tuple[Dict[str, Any], Optional[Dict[str, Any]]]:
        """Send a prompt to the Sydney /chat endpoint and return the response with context.

        Args:
            prompt: Prompt string to send to the agent.
            agent_id: Optional agent ID to target a specific Copilot agent.
            conversation_context: Context from a previous turn (contains conversation_id),
                or None for the first turn / single-turn usage.

        Returns:
            Tuple of (enhanced_response_dict, conversation_context). The
            context is ``{"conversation_id": ...}`` built from the response
            metadata, falling back to the incoming conversation id, or None
            when neither is available.

        Raises:
            RuntimeError: If the request fails with ``urllib.error.HTTPError``
                (message includes status, reason and up to 500 characters of
                the body) or ``urllib.error.URLError``. Other exceptions
                propagate unchanged.
        """
        request_headers = self._build_request_headers()
        conversation_id = conversation_context.get("conversation_id") if conversation_context else None

        self._log("debug", "[REST] Sending prompt to agent.", Operation.SEND_PROMPT)

        payload = self._build_chat_payload(prompt, agent_id, conversation_id)
        self._log(
            "debug",
            f"[REST] Sending payload: {payload.decode('utf-8')[:500]}",
            Operation.SEND_PROMPT,
        )

        req = urllib.request.Request(
            f"{self._endpoint}/chat",
            data=payload,
            headers=request_headers,
            method="POST",
        )
        try:
            with urllib.request.urlopen(req, timeout=_REQUEST_TIMEOUT_SECS) as resp:
                raw = resp.read().decode("utf-8", errors="replace")
        except urllib.error.HTTPError as e:
            error_body = None
            try:
                error_body = e.read().decode("utf-8", errors="replace")
            except Exception:
                # The body is only extra diagnostic detail; failing to read it must
                # not mask the HTTP error reported below.
                pass
            msg = f"[REST] Chat API request failed (HTTP {e.code} {e.reason})."
            if error_body:
                msg += f" Body: {error_body[:500]}"
            raise RuntimeError(msg) from e
        except urllib.error.URLError as e:
            raise RuntimeError(
                f"[REST] Chat API connection error: {getattr(e, 'reason', str(e))}"
            ) from e

        self._log("debug", f"[REST] Raw response: {raw[:500]}", Operation.SEND_PROMPT)

        enhanced_response = extract_enhanced_response(raw.strip(), self._log_level)

        # Guard against an explicit `"metadata": None` as well as a missing key.
        metadata = enhanced_response.get("metadata") or {}
        self._log(
            "debug",
            "Response IDs for prompt.",
            Operation.SEND_PROMPT,
            run_context={
                "operation": Operation.SEND_PROMPT,
                "request-id": metadata.get("request_id"),
                "conversation-id": metadata.get("conversation_id"),
                "message-id": metadata.get("message_id"),
            },
        )

        # Build updated context for subsequent turns
        new_conversation_id = metadata.get("conversation_id") or conversation_id
        updated_context = {"conversation_id": new_conversation_id} if new_conversation_id else None

        return enhanced_response, updated_context

    # ------------------------------------------------------------------ #
    # Private helpers                                                      #
    # ------------------------------------------------------------------ #

    def _log(
        self,
        level: str,
        message: str,
        operation: str,
        run_context: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Forward to emit_structured_log with this client's logger and record list."""
        emit_structured_log(
            level,
            message,
            operation,
            logger=self._logger,
            diagnostic_records=self._diagnostic_records,
            run_context=run_context,
        )

    def _build_request_headers(self) -> Dict[str, str]:
        """Return the HTTP headers shared by all requests.

        ``X-Scenario`` is read from the ``X_SCENARIO_HEADER`` environment
        variable and left out entirely when that variable is unset.
        """
        headers = {
            "Content-Type": "application/json",
            "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
            "Authorization": f"Bearer {self._access_token}",
        }
        return {k: v for k, v in headers.items() if v is not None}

    def _build_chat_payload(
        self,
        prompt: str,
        agent_id: str | None,
        conversation_id: str | None = None,
    ) -> bytes:
        """Serialise the /chat request body to UTF-8 encoded JSON.

        Args:
            prompt: Text of the user message.
            agent_id: Agent to target. None, empty and whitespace-only values
                all mean "no specific agent"; otherwise the stripped id is
                sent in ``gpts`` together with the ``optionsSets`` entry.
            conversation_id: When truthy, sent as ``conversationId``.

        Returns:
            The JSON document as bytes.
        """
        message: Dict[str, Any] = {
            "message": {
                "text": prompt,
                "author": "user",
                "messageType": "chat",
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "locationInfo": self._get_location_info(),
                "from": {
                    "id": self._user_oid,
                },
            },
            "verbosity": "verbose",
        }

        # Strip before testing so an id such as "  " is not sent as an empty string.
        normalized_agent_id = agent_id.strip() if agent_id else ""
        if normalized_agent_id:
            message["gpts"] = [{"id": normalized_agent_id, "source": "MOS3"}]
            message["optionsSets"] = ["disable_action_confirmation"]

        if conversation_id:
            message["conversationId"] = conversation_id

        return json.dumps(message).encode("utf-8")
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import functools
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
7
|
+
|
|
8
|
+
import tzlocal
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class BaseAgentClient(ABC):
    """Interface that every agent API client implements.

    Subclasses must provide ``fetch_available_agents`` and ``send_prompt``;
    ``resolve_agent`` is an optional hook. Two cached static helpers supply
    the local time-zone details used in request payloads.
    """

    @abstractmethod
    def fetch_available_agents(self) -> List[Dict[str, Any]]:
        """List the agents the configured user can reach.

        An implementation without agent enumeration support should return
        an empty list.
        """

    @abstractmethod
    def send_prompt(
        self,
        prompt: str,
        agent_id: str | None = None,
        conversation_context: Optional[Dict[str, Any]] = None,
    ) -> Tuple[Dict[str, Any], Optional[Dict[str, Any]]]:
        """Submit one prompt and hand back the response plus conversation context.

        Single-turn callers pass ``conversation_context=None``; multi-turn
        callers pass whatever context the previous turn returned.

        Args:
            prompt: The prompt string to send.
            agent_id: Optional agent ID to target.
            conversation_context: Opaque context dict from a previous turn,
                or None for the first turn / single-turn usage.

        Returns:
            Tuple of (enhanced_response_dict, conversation_context).
            Feed the context into the next turn of a multi-turn
            conversation, or drop it for single-turn usage. Its layout
            depends on the implementation:
              - Sydney/REST: {"conversation_id": str}
              - A2A: {"context_id": str}
            The context is None when no conversation state is established.
        """

    def resolve_agent(self, agent_id: str) -> None:
        """Hook for resolving the agent endpoint once, before the pipeline starts.

        Does nothing by default; a subclass may override it to cache agent
        discovery.
        """
        return None

    @staticmethod
    @functools.lru_cache(maxsize=1)
    def _get_iana_timezone_name() -> str:
        """Return the local time-zone name reported by ``tzlocal`` (cached).

        Uses ``tzlocal.get_localzone_name()`` and, if that raises, the string
        form of ``tzlocal.get_localzone()``.
        """
        try:
            zone_name = tzlocal.get_localzone_name()
        except Exception:
            zone_name = str(tzlocal.get_localzone())
        return zone_name

    @staticmethod
    @functools.lru_cache(maxsize=1)
    def _get_location_info() -> Dict[str, Any]:
        """Return the cached ``timeZoneOffset`` / ``timeZone`` mapping.

        The offset is the local UTC offset floored to whole hours (so an
        offset of -3:30 becomes -4), or 0 when no offset is available. The
        result is computed on first use and the same dict object is
        returned on every later call.
        """
        local_offset = datetime.now().astimezone().utcoffset()
        if local_offset is None:
            whole_hours = 0
        else:
            whole_hours = int(local_offset.total_seconds() // 3600)
        return {
            "timeZoneOffset": whole_hours,
            "timeZone": BaseAgentClient._get_iana_timezone_name(),
        }
|
|
78
|
+
|
|
@@ -1,9 +1,14 @@
|
|
|
1
1
|
import json
|
|
2
|
+
import logging
|
|
2
3
|
import sys
|
|
3
4
|
from collections import OrderedDict
|
|
4
|
-
from typing import Any, Dict
|
|
5
|
+
from typing import Any, Dict, List, Optional
|
|
5
6
|
|
|
6
|
-
from cli_logging.logging_utils import
|
|
7
|
+
from cli_logging.logging_utils import (
|
|
8
|
+
STRUCTURED_LOG_FIELDS,
|
|
9
|
+
Operation,
|
|
10
|
+
format_structured_log_entry,
|
|
11
|
+
)
|
|
7
12
|
|
|
8
13
|
_ANSI_COLORS = {
|
|
9
14
|
"debug": "\033[2m", # dim
|
|
@@ -53,3 +58,50 @@ def render_diagnostic(record: Dict[str, Any]) -> str:
|
|
|
53
58
|
if sys.stdout.isatty():
|
|
54
59
|
return format_console_record(record)
|
|
55
60
|
return serialize_diagnostic_record(record)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def emit_structured_log(
    level: str,
    message: str,
    operation: str = Operation.EVALUATE,
    *,
    logger: logging.Logger,
    diagnostic_records: Optional[List[Dict[str, Any]]] = None,
    run_context: Optional[Dict[str, Any]] = None,
) -> None:
    """Emit a structured log entry.

    Formats via format_structured_log_entry, optionally appends to
    diagnostic_records, then logs via render_diagnostic (TTY-friendly or JSON).
    Nothing is formatted when there is no record list and the logger is
    disabled for the level; the entry is rendered only when the logger will
    actually emit it.

    Args:
        level: One of "debug", "info", "warning", "error". Names that do not
            resolve to an integer level on the ``logging`` module are
            logged at INFO.
        message: Human-readable log message.
        operation: The CLI operation step (e.g. Operation.SEND_PROMPT). Only
            used when ``run_context`` is not supplied.
        logger: Logger to emit through.
        diagnostic_records: If provided, the structured entry is appended here
            regardless of the logger's level.
        run_context: Full run context override (request-id, conversation-id,
            message-id). Defaults to nulls with the given operation. When a
            non-empty dict is given it replaces the default context entirely.
    """
    # Resolve the numeric level once; reject non-integer attributes that happen
    # to share a name with `level.upper()`.
    log_level_int = getattr(logging, level.upper(), logging.INFO)
    if not isinstance(log_level_int, int):
        log_level_int = logging.INFO

    logger_enabled = logger.isEnabledFor(log_level_int)
    if diagnostic_records is None and not logger_enabled:
        return

    context = run_context or {
        "request-id": None,
        "conversation-id": None,
        "message-id": None,
        "operation": operation,
    }
    entry = format_structured_log_entry(
        level=level,
        message=message,
        logger_name=logger.name,
        run_context=context,
    )
    if diagnostic_records is not None:
        diagnostic_records.append(entry)

    # The logger would discard the record anyway, so skip the rendering work.
    if not logger_enabled:
        return
    try:
        logger.log(log_level_int, render_diagnostic(entry))
    except Exception:
        # Best-effort: a rendering or handler failure must never break the caller.
        pass
|
|
@@ -25,6 +25,17 @@ PARTIAL_MATCH = "PartialMatch"
|
|
|
25
25
|
REQUIRES_AZURE_OPENAI = "azure_openai"
|
|
26
26
|
REQUIRES_TOOL_DEFINITIONS = "tool_definitions"
|
|
27
27
|
|
|
28
|
+
# Evaluation status constants.

# --- Outcome statuses: the agent responded and the evaluators ran ---
# Every evaluator scored above its threshold.
STATUS_PASS = "pass"
# One or more evaluators scored below threshold.
STATUS_FAIL = "fail"

# --- Error state: the evaluation could not complete ---
# The API call failed or no response could be obtained.
STATUS_ERROR = "error"

# --- Thread-level aggregate status (multi-turn only) ---
# Some turns passed while others did not.
STATUS_PARTIAL = "partial"

# --- Fallback used when a status is missing ---
STATUS_UNKNOWN = "unknown"
|
|
38
|
+
|
|
28
39
|
# System defaults when no file-level or env-level defaults are configured
|
|
29
40
|
SYSTEM_DEFAULT_EVALUATORS = [
|
|
30
41
|
RELEVANCE,
|