@microsoft/m365-copilot-eval 1.4.0-preview.1 → 1.5.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -4
- package/package.json +2 -2
- package/schema/CHANGELOG.md +7 -0
- package/schema/v1/eval-document.schema.json +3 -3
- package/src/clients/cli/agent_selector.py +74 -0
- package/src/clients/cli/api_clients/A2A/a2a_client.py +39 -20
- package/src/clients/cli/api_clients/base_agent_client.py +0 -1
- package/src/clients/cli/cli_args.py +136 -0
- package/src/clients/cli/cli_logging/cli_logger.py +33 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +3 -1
- package/src/clients/cli/common.py +53 -0
- package/src/clients/cli/env_validator.py +73 -0
- package/src/clients/cli/evaluation_runner.py +653 -0
- package/src/clients/cli/evaluator_resolver.py +9 -6
- package/src/clients/cli/main.py +128 -1675
- package/src/clients/cli/prompt_loader.py +148 -0
- package/src/clients/cli/readme.md +9 -53
- package/src/clients/cli/response_extractor.py +4 -601
- package/src/clients/cli/result_writer.py +488 -0
- package/src/clients/node-js/bin/runevals.js +3 -4
- package/src/clients/node-js/config/default.js +8 -11
- package/src/clients/node-js/lib/env-loader.js +3 -4
- package/src/clients/cli/api_clients/REST/__init__.py +0 -3
- package/src/clients/cli/api_clients/REST/sydney_client.py +0 -204
|
@@ -1,204 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import json
|
|
4
|
-
import logging
|
|
5
|
-
import os
|
|
6
|
-
import urllib.error
|
|
7
|
-
import urllib.parse
|
|
8
|
-
import urllib.request
|
|
9
|
-
from datetime import datetime, timezone
|
|
10
|
-
from typing import Any, Dict, List, Optional, Tuple
|
|
11
|
-
|
|
12
|
-
from api_clients.base_agent_client import BaseAgentClient
|
|
13
|
-
from cli_logging.console_diagnostics import emit_structured_log
|
|
14
|
-
from cli_logging.logging_utils import Operation
|
|
15
|
-
from response_extractor import extract_enhanced_response
|
|
16
|
-
|
|
17
|
-
# Timeout, in seconds, handed to urllib.request.urlopen for each HTTP call.
_REQUEST_TIMEOUT_SECS = 120

# Numeric stdlib logging level -> lowercase level name, used when passing the
# log level to extract_enhanced_response.
_LEVEL_INT_TO_STR: Dict[int, str] = {
    getattr(logging, level_name.upper()): level_name
    for level_name in ("debug", "info", "warning", "error")
}
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class SydneyClient(BaseAgentClient):
    """REST client for the Microsoft Copilot Sydney chat API.

    Issues requests to two paths under the configured base URL:
    ``/GetGptList`` (agent discovery) and ``/chat`` (prompt submission).
    All HTTP traffic goes through ``urllib.request`` with a fixed timeout.
    """

    def __init__(
        self,
        *,
        copilot_api_endpoint: str,
        access_token: str,
        user_oid: str,
        logger: Optional[logging.Logger] = None,
        diagnostic_records: Optional[List[Dict[str, Any]]] = None,
    ) -> None:
        """
        Args:
            copilot_api_endpoint: Base URL for the Copilot API. Trailing
                slashes are removed so path joins never produce ``//``.
            access_token: Bearer token for API authentication.
            user_oid: User object ID included in request payloads.
            logger: Logger to use for all client logging. Defaults to a module-level
                logger if not provided.
            diagnostic_records: List to accumulate structured log entries.
        """
        # Normalise once here so every f"{self._endpoint}/..." join is clean.
        self._endpoint = copilot_api_endpoint.rstrip("/")
        self._access_token = access_token
        self._user_oid = user_oid
        self._logger = logger or logging.getLogger(__name__)
        self._diagnostic_records = diagnostic_records
        # Captured once at construction; levels outside the map (e.g. CRITICAL
        # or NOTSET) fall back to "info".
        self._log_level = _LEVEL_INT_TO_STR.get(self._logger.getEffectiveLevel(), "info")

    # ------------------------------------------------------------------ #
    #  BaseAgentClient implementation                                      #
    # ------------------------------------------------------------------ #

    def fetch_available_agents(self) -> List[Dict[str, Any]]:
        """Fetch agents available to the user from the Copilot API.

        Best-effort: every failure is logged as a warning and swallowed.

        Returns:
            The ``gptList`` array from the response. An empty list is returned
            when the request fails, the body cannot be parsed, or ``gptList``
            is missing, ``null`` or not a list.
        """
        try:
            request_data = json.dumps({"participant": {"id": self._user_oid}})
            query_param = urllib.parse.quote(request_data)
            agents_url = f"{self._endpoint}/GetGptList?request={query_param}"
            emit_structured_log(
                "debug",
                f"[REST] Fetching available agents from: {agents_url}",
                Operation.FETCH_AGENTS,
                logger=self._logger,
                diagnostic_records=self._diagnostic_records,
            )
            req = urllib.request.Request(
                agents_url,
                headers=self._build_request_headers(),
                method="GET",
            )
            with urllib.request.urlopen(req, timeout=_REQUEST_TIMEOUT_SECS) as resp:
                data = json.loads(resp.read().decode("utf-8"))
            # A non-dict body raises AttributeError here and is reported by the
            # generic handler below. An explicit JSON null (or any non-list
            # value) must not leak out through the List[...] return type.
            agents = data.get("gptList")
            return agents if isinstance(agents, list) else []
        except urllib.error.HTTPError as e:
            emit_structured_log(
                "warning",
                f"[REST] Unable to fetch agents list (HTTP {e.code}).",
                Operation.FETCH_AGENTS,
                logger=self._logger,
                diagnostic_records=self._diagnostic_records,
            )
            return []
        except Exception as e:
            # Deliberately broad: agent discovery is optional and must never
            # abort the caller.
            emit_structured_log(
                "warning",
                f"[REST] Error fetching agents: {e}",
                Operation.FETCH_AGENTS,
                logger=self._logger,
                diagnostic_records=self._diagnostic_records,
            )
            return []

    def send_prompt(
        self,
        prompt: str,
        agent_id: str | None = None,
        conversation_context: Optional[Dict[str, Any]] = None,
    ) -> Tuple[Dict[str, Any], Optional[Dict[str, Any]]]:
        """Send a prompt to the Sydney /chat endpoint and return the response with context.

        Args:
            prompt: Prompt string to send to the agent.
            agent_id: Optional agent ID to target a specific Copilot agent.
            conversation_context: Context from a previous turn (contains conversation_id),
                or None for the first turn / single-turn usage.

        Returns:
            Tuple of (enhanced_response_dict, conversation_context). The
            context is ``{"conversation_id": ...}`` when an id is known from
            the response metadata or the incoming context, otherwise None.

        Raises:
            RuntimeError: On an HTTP error status (message includes up to 500
                characters of the error body when readable) or on a
                connection-level ``URLError``.
        """
        # NOTE(review): a socket timeout raised while reading the response body
        # is not a URLError and would propagate unchanged - confirm callers
        # expect that.
        request_headers = self._build_request_headers()
        conversation_id = conversation_context.get("conversation_id") if conversation_context else None

        emit_structured_log(
            "debug",
            "[REST] Sending prompt to agent.",
            Operation.SEND_PROMPT,
            logger=self._logger,
            diagnostic_records=self._diagnostic_records,
        )

        payload = self._build_chat_payload(prompt, agent_id, conversation_id)
        emit_structured_log(
            "debug",
            f"[REST] Sending payload: {payload.decode('utf-8')[:500]}",
            Operation.SEND_PROMPT,
            logger=self._logger,
            diagnostic_records=self._diagnostic_records,
        )

        req = urllib.request.Request(
            f"{self._endpoint}/chat",
            data=payload,
            headers=request_headers,
            method="POST",
        )
        try:
            with urllib.request.urlopen(req, timeout=_REQUEST_TIMEOUT_SECS) as resp:
                raw = resp.read().decode("utf-8", errors="replace")
        except urllib.error.HTTPError as e:
            # HTTPError must be handled before URLError (it is a subclass).
            error_body = None
            try:
                error_body = e.read().decode("utf-8", errors="replace")
            except Exception:
                # Best-effort only: the body merely enriches the error message,
                # so a failure to read it must not mask the HTTP error itself.
                pass
            msg = f"[REST] Chat API request failed (HTTP {e.code} {e.reason})."
            if error_body:
                msg += f" Body: {error_body[:500]}"
            raise RuntimeError(msg) from e
        except urllib.error.URLError as e:
            raise RuntimeError(
                f"[REST] Chat API connection error: {getattr(e, 'reason', str(e))}"
            ) from e

        emit_structured_log(
            "debug",
            f"[REST] Raw response: {raw[:500]}",
            Operation.SEND_PROMPT,
            logger=self._logger,
            diagnostic_records=self._diagnostic_records,
        )

        enhanced_response = extract_enhanced_response(raw.strip(), self._log_level)

        metadata = enhanced_response.get("metadata", {})
        emit_structured_log(
            "debug",
            "Response IDs for prompt.",
            Operation.SEND_PROMPT,
            logger=self._logger,
            diagnostic_records=self._diagnostic_records,
            run_context={
                "operation": Operation.SEND_PROMPT,
                "request-id": metadata.get("request_id"),
                "conversation-id": metadata.get("conversation_id"),
                "message-id": metadata.get("message_id"),
            },
        )

        # Build updated context for subsequent turns; prefer the id reported by
        # the service and fall back to the one the caller supplied.
        new_conversation_id = metadata.get("conversation_id") or conversation_id
        updated_context = {"conversation_id": new_conversation_id} if new_conversation_id else None

        return enhanced_response, updated_context

    # ------------------------------------------------------------------ #
    #  Private helpers                                                     #
    # ------------------------------------------------------------------ #

    def _build_request_headers(self) -> Dict[str, str]:
        """Return the HTTP headers sent with every request.

        ``X-Scenario`` is taken from the ``X_SCENARIO_HEADER`` environment
        variable and omitted entirely when that variable is unset.
        """
        headers = {
            "Content-Type": "application/json",
            "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
            "Authorization": f"Bearer {self._access_token}",
        }
        # Drop unset values so no header is sent with a None value.
        return {k: v for k, v in headers.items() if v is not None}

    def _build_chat_payload(
        self,
        prompt: str,
        agent_id: str | None,
        conversation_id: str | None = None,
    ) -> bytes:
        """Serialise a /chat request body to UTF-8 encoded JSON.

        Args:
            prompt: Text of the user message.
            agent_id: Agent to target. None, empty and whitespace-only values
                all mean "no agent" and leave ``gpts``/``optionsSets`` out.
            conversation_id: Existing conversation to continue, if any.

        Returns:
            The JSON document as bytes, ready to pass as the request body.
        """
        message: Dict[str, Any] = {
            "message": {
                "text": prompt,
                "author": "user",
                "messageType": "chat",
                "timestamp": datetime.now(timezone.utc).isoformat(),
                # _get_location_info is not defined in this class; presumably
                # inherited from BaseAgentClient - verify.
                "locationInfo": self._get_location_info(),
                "from": {
                    "id": self._user_oid,
                },
            },
            "verbosity": "verbose",
        }

        # Strip before testing so a whitespace-only id is not sent as an
        # empty "id" in the gpts entry.
        target_agent_id = agent_id.strip() if agent_id else ""
        if target_agent_id:
            message["gpts"] = [{"id": target_agent_id, "source": "MOS3"}]
            message["optionsSets"] = ["disable_action_confirmation"]

        if conversation_id:
            message["conversationId"] = conversation_id

        return json.dumps(message).encode("utf-8")
|