@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.4.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. package/README.md +129 -97
  2. package/package.json +7 -4
  3. package/schema/v1/eval-document.schema.json +140 -8
  4. package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
  5. package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
  6. package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
  7. package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
  8. package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
  9. package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
  10. package/schema/v1/examples/valid/multi-turn-output.json +59 -0
  11. package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
  12. package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
  13. package/schema/version.json +2 -2
  14. package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
  15. package/src/clients/cli/api_clients/A2A/a2a_client.py +456 -0
  16. package/src/clients/cli/api_clients/REST/__init__.py +3 -0
  17. package/src/clients/cli/api_clients/REST/sydney_client.py +204 -0
  18. package/src/clients/cli/api_clients/__init__.py +3 -0
  19. package/src/clients/cli/api_clients/base_agent_client.py +78 -0
  20. package/src/clients/cli/cli_logging/console_diagnostics.py +54 -2
  21. package/src/clients/cli/cli_logging/logging_utils.py +0 -1
  22. package/src/clients/cli/common.py +11 -0
  23. package/src/clients/cli/generate_report.py +272 -129
  24. package/src/clients/cli/main.py +1006 -476
  25. package/src/clients/cli/parallel_executor.py +57 -0
  26. package/src/clients/cli/requirements.txt +1 -1
  27. package/src/clients/cli/response_extractor.py +12 -14
  28. package/src/clients/cli/retry_policy.py +52 -0
  29. package/src/clients/cli/samples/multiturn_example.json +35 -0
  30. package/src/clients/cli/throttle_gate.py +82 -0
  31. package/src/clients/node-js/bin/runevals.js +79 -16
  32. package/src/clients/node-js/config/default.js +5 -1
  33. package/src/clients/node-js/lib/agent-id.js +12 -0
  34. package/src/clients/node-js/lib/env-loader.js +11 -16
  35. package/src/clients/node-js/lib/eula-manager.js +78 -0
  36. package/src/clients/node-js/lib/progress.js +13 -11
@@ -0,0 +1,204 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ import os
6
+ import urllib.error
7
+ import urllib.parse
8
+ import urllib.request
9
+ from datetime import datetime, timezone
10
+ from typing import Any, Dict, List, Optional, Tuple
11
+
12
+ from api_clients.base_agent_client import BaseAgentClient
13
+ from cli_logging.console_diagnostics import emit_structured_log
14
+ from cli_logging.logging_utils import Operation
15
+ from response_extractor import extract_enhanced_response
16
+
17
# Timeout, in seconds, passed as ``timeout=`` to every ``urlopen`` call below.
_REQUEST_TIMEOUT_SECS = 120

# Numeric logging level -> lowercase level name; used when passing the log
# level to extract_enhanced_response.
_LEVEL_INT_TO_STR: Dict[int, str] = {
    getattr(logging, _level_name.upper()): _level_name
    for _level_name in ("debug", "info", "warning", "error")
}
26
+
27
+
28
class SydneyClient(BaseAgentClient):
    """REST client for the Microsoft Copilot Sydney chat API.

    Uses two endpoints relative to the configured base URL:
    ``GET {endpoint}/GetGptList`` to enumerate agents and
    ``POST {endpoint}/chat`` to send a prompt.
    """

    def __init__(
        self,
        *,
        copilot_api_endpoint: str,
        access_token: str,
        user_oid: str,
        logger: Optional[logging.Logger] = None,
        diagnostic_records: Optional[List[Dict[str, Any]]] = None,
    ) -> None:
        """
        Args:
            copilot_api_endpoint: Base URL for the Copilot API. Trailing slashes
                are ignored.
            access_token: Bearer token for API authentication.
            user_oid: User object ID included in request payloads.
            logger: Logger to use for all client logging. Defaults to a module-level
                logger if not provided.
            diagnostic_records: List to accumulate structured log entries.
        """
        # Strip trailing slashes so the f"{endpoint}/chat" style joins below never
        # produce a double slash in the request path.
        self._endpoint = copilot_api_endpoint.rstrip("/")
        self._access_token = access_token
        self._user_oid = user_oid
        self._logger = logger or logging.getLogger(__name__)
        self._diagnostic_records = diagnostic_records
        # Captured once at construction time; levels missing from the table
        # fall back to "info".
        self._log_level = _LEVEL_INT_TO_STR.get(self._logger.getEffectiveLevel(), "info")

    # ------------------------------------------------------------------ #
    # BaseAgentClient implementation                                       #
    # ------------------------------------------------------------------ #

    def fetch_available_agents(self) -> List[Dict[str, Any]]:
        """Fetch agents available to the user from the Copilot API.

        Returns:
            The ``gptList`` array from the response. An empty list is returned
            if the key is missing or null, if the endpoint answers with an HTTP
            error, or if any other error occurs (each failure is logged as a
            warning rather than raised).
        """
        try:
            request_data = json.dumps({"participant": {"id": self._user_oid}})
            query_param = urllib.parse.quote(request_data)
            agents_url = f"{self._endpoint}/GetGptList?request={query_param}"
            self._log(
                "debug",
                f"[REST] Fetching available agents from: {agents_url}",
                Operation.FETCH_AGENTS,
            )
            req = urllib.request.Request(
                agents_url,
                headers=self._build_request_headers(),
                method="GET",
            )
            with urllib.request.urlopen(req, timeout=_REQUEST_TIMEOUT_SECS) as resp:
                data = json.loads(resp.read().decode("utf-8"))
            # "gptList" may be absent or explicitly null; honour the list
            # return contract in both cases.
            return data.get("gptList") or []
        except urllib.error.HTTPError as e:
            self._log(
                "warning",
                f"[REST] Unable to fetch agents list (HTTP {e.code}).",
                Operation.FETCH_AGENTS,
            )
            return []
        except Exception as e:
            # Best effort by design: agent enumeration must not abort the run.
            self._log(
                "warning",
                f"[REST] Error fetching agents: {e}",
                Operation.FETCH_AGENTS,
            )
            return []

    def send_prompt(
        self,
        prompt: str,
        agent_id: str | None = None,
        conversation_context: Optional[Dict[str, Any]] = None,
    ) -> Tuple[Dict[str, Any], Optional[Dict[str, Any]]]:
        """Send a prompt to the Sydney /chat endpoint and return the response with context.

        Args:
            prompt: Prompt string to send to the agent.
            agent_id: Optional agent ID to target a specific Copilot agent.
            conversation_context: Context from a previous turn (contains conversation_id),
                or None for the first turn / single-turn usage.

        Returns:
            Tuple of (enhanced_response_dict, conversation_context). The context is
            ``{"conversation_id": ...}`` using the ID reported in the response
            metadata, falling back to the ID that was passed in; it is None when
            neither is available.

        Raises:
            RuntimeError: If the request fails with an HTTP error status or a
                connection-level ``URLError``. The original exception is chained.
        """
        request_headers = self._build_request_headers()
        conversation_id = conversation_context.get("conversation_id") if conversation_context else None

        self._log("debug", "[REST] Sending prompt to agent.", Operation.SEND_PROMPT)

        payload = self._build_chat_payload(prompt, agent_id, conversation_id)
        self._log(
            "debug",
            f"[REST] Sending payload: {payload.decode('utf-8')[:500]}",
            Operation.SEND_PROMPT,
        )

        req = urllib.request.Request(
            f"{self._endpoint}/chat",
            data=payload,
            headers=request_headers,
            method="POST",
        )
        try:
            with urllib.request.urlopen(req, timeout=_REQUEST_TIMEOUT_SECS) as resp:
                raw = resp.read().decode("utf-8", errors="replace")
        except urllib.error.HTTPError as e:
            # HTTPError must be handled before URLError (it is a subclass).
            error_body = None
            try:
                error_body = e.read().decode("utf-8", errors="replace")
            except Exception:
                # Reading the error body is best effort; the status line alone
                # is still reported below.
                pass
            msg = f"[REST] Chat API request failed (HTTP {e.code} {e.reason})."
            if error_body:
                msg += f" Body: {error_body[:500]}"
            raise RuntimeError(msg) from e
        except urllib.error.URLError as e:
            raise RuntimeError(
                f"[REST] Chat API connection error: {getattr(e, 'reason', str(e))}"
            ) from e

        self._log("debug", f"[REST] Raw response: {raw[:500]}", Operation.SEND_PROMPT)

        enhanced_response = extract_enhanced_response(raw.strip(), self._log_level)

        # Tolerate a response whose "metadata" is missing or explicitly null.
        metadata = enhanced_response.get("metadata") or {}
        self._log(
            "debug",
            "Response IDs for prompt.",
            Operation.SEND_PROMPT,
            run_context={
                "operation": Operation.SEND_PROMPT,
                "request-id": metadata.get("request_id"),
                "conversation-id": metadata.get("conversation_id"),
                "message-id": metadata.get("message_id"),
            },
        )

        # Build updated context for subsequent turns
        new_conversation_id = metadata.get("conversation_id") or conversation_id
        updated_context = {"conversation_id": new_conversation_id} if new_conversation_id else None

        return enhanced_response, updated_context

    # ------------------------------------------------------------------ #
    # Private helpers                                                      #
    # ------------------------------------------------------------------ #

    def _log(
        self,
        level: str,
        message: str,
        operation: str,
        run_context: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Emit a structured log entry through this client's logger and record list."""
        emit_structured_log(
            level,
            message,
            operation,
            logger=self._logger,
            diagnostic_records=self._diagnostic_records,
            run_context=run_context,
        )

    def _build_request_headers(self) -> Dict[str, str]:
        """Return the HTTP headers sent with every request.

        ``X-Scenario`` is taken from the ``X_SCENARIO_HEADER`` environment
        variable and omitted entirely when that variable is unset.
        """
        headers = {
            "Content-Type": "application/json",
            "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
            "Authorization": f"Bearer {self._access_token}",
        }
        return {k: v for k, v in headers.items() if v is not None}

    def _build_chat_payload(
        self,
        prompt: str,
        agent_id: str | None,
        conversation_id: str | None = None,
    ) -> bytes:
        """Serialize the /chat request body as UTF-8 encoded JSON.

        Args:
            prompt: User message text.
            agent_id: When truthy, adds the ``gpts`` and ``optionsSets`` fields
                that target the given agent (the ID is whitespace-stripped).
            conversation_id: When truthy, added as ``conversationId`` so the
                request continues an existing conversation.
        """
        message: Dict[str, Any] = {
            "message": {
                "text": prompt,
                "author": "user",
                "messageType": "chat",
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "locationInfo": self._get_location_info(),
                "from": {
                    "id": self._user_oid,
                },
            },
            "verbosity": "verbose",
        }

        if agent_id:
            message["gpts"] = [{"id": agent_id.strip(), "source": "MOS3"}]
            message["optionsSets"] = ["disable_action_confirmation"]

        if conversation_id:
            message["conversationId"] = conversation_id

        return json.dumps(message).encode("utf-8")
@@ -0,0 +1,3 @@
1
+ from .base_agent_client import BaseAgentClient
2
+
3
+ __all__ = ["BaseAgentClient"]
@@ -0,0 +1,78 @@
1
+ from __future__ import annotations
2
+
3
+ import functools
4
+ from abc import ABC, abstractmethod
5
+ from datetime import datetime
6
+ from typing import Any, Dict, List, Optional, Tuple
7
+
8
+ import tzlocal
9
+
10
+
11
class BaseAgentClient(ABC):
    """Abstract base class for agent API clients.

    Subclasses must implement ``fetch_available_agents`` and ``send_prompt``;
    ``resolve_agent`` is an optional hook. The static helpers provide the
    local time-zone information that clients embed in request payloads.
    """

    @abstractmethod
    def fetch_available_agents(self) -> List[Dict[str, Any]]:
        """Return the list of agents accessible to the configured user.

        Implementations that do not support agent enumeration should
        return an empty list.
        """

    @abstractmethod
    def send_prompt(
        self,
        prompt: str,
        agent_id: str | None = None,
        conversation_context: Optional[Dict[str, Any]] = None,
    ) -> Tuple[Dict[str, Any], Optional[Dict[str, Any]]]:
        """Send a single prompt and return the response with conversation context.

        For single-turn usage, pass conversation_context=None.
        For multi-turn usage, pass the context returned from the previous turn.

        Args:
            prompt: The prompt string to send.
            agent_id: Optional agent ID to target.
            conversation_context: Opaque context dict from a previous turn,
                or None for the first turn / single-turn usage.

        Returns:
            Tuple of (enhanced_response_dict, conversation_context).
            The conversation_context should be passed to the next turn
            in a multi-turn conversation, or discarded for single-turn.
            The context structure is implementation-specific:
              - Sydney/REST: {"conversation_id": str}
              - A2A: {"context_id": str}
            Returns None as context when no conversation state is established.
        """

    def resolve_agent(self, agent_id: str) -> None:
        """Pre-resolve agent endpoint. Called once before pipeline starts.

        Default is no-op. Subclasses may override to cache agent discovery.
        """

    @staticmethod
    @functools.lru_cache(maxsize=1)
    def _get_iana_timezone_name() -> str:
        """Return the local time-zone name as reported by ``tzlocal`` (cached)."""
        try:
            return tzlocal.get_localzone_name()
        except Exception:
            # NOTE(review): presumably covers tzlocal versions/platforms where
            # get_localzone_name() is unavailable or fails - confirm.
            return str(tzlocal.get_localzone())

    @staticmethod
    @functools.lru_cache(maxsize=1)
    def _cached_location_fields() -> Tuple[int, str]:
        """Compute (UTC offset in whole hours, time-zone name) once per process.

        An immutable tuple is cached instead of the payload dict, so a caller
        mutating the dict it received cannot corrupt later results.
        """
        now = datetime.now().astimezone()
        utc_offset = now.utcoffset()
        # Floor division: fractional-hour offsets are rounded down to whole hours.
        offset_hours = int(utc_offset.total_seconds() // 3600) if utc_offset is not None else 0
        return offset_hours, BaseAgentClient._get_iana_timezone_name()

    @staticmethod
    def _get_location_info() -> Dict[str, Any]:
        """Return ``{"timeZoneOffset": int, "timeZone": str}`` for the local zone.

        The underlying values are computed once and cached; a new dict is
        built on every call so callers never share mutable state.
        """
        offset_hours, timezone_name = BaseAgentClient._cached_location_fields()
        return {
            "timeZoneOffset": offset_hours,
            "timeZone": timezone_name,
        }
78
+
@@ -1,9 +1,14 @@
1
1
  import json
2
+ import logging
2
3
  import sys
3
4
  from collections import OrderedDict
4
- from typing import Any, Dict
5
+ from typing import Any, Dict, List, Optional
5
6
 
6
- from cli_logging.logging_utils import STRUCTURED_LOG_FIELDS
7
+ from cli_logging.logging_utils import (
8
+ STRUCTURED_LOG_FIELDS,
9
+ Operation,
10
+ format_structured_log_entry,
11
+ )
7
12
 
8
13
  _ANSI_COLORS = {
9
14
  "debug": "\033[2m", # dim
@@ -53,3 +58,50 @@ def render_diagnostic(record: Dict[str, Any]) -> str:
53
58
  if sys.stdout.isatty():
54
59
  return format_console_record(record)
55
60
  return serialize_diagnostic_record(record)
61
+
62
+
63
def emit_structured_log(
    level: str,
    message: str,
    operation: str = Operation.EVALUATE,
    *,
    logger: logging.Logger,
    diagnostic_records: Optional[List[Dict[str, Any]]] = None,
    run_context: Optional[Dict[str, Any]] = None,
) -> None:
    """Build one structured log entry, record it, and send it to ``logger``.

    The entry is produced by format_structured_log_entry. It is appended to
    ``diagnostic_records`` when that list is supplied, and its
    render_diagnostic form is logged at the requested level. When no record
    list is supplied and the logger is disabled for the level, nothing is
    built at all.

    Args:
        level: One of "debug", "info", "warning", "error". Names that do not
            match an attribute of the logging module are logged at INFO.
        message: Human-readable log message.
        operation: The CLI operation step (e.g. Operation.SEND_PROMPT). Only
            used when ``run_context`` is not supplied (or is empty).
        logger: Logger to emit through.
        diagnostic_records: If provided, the structured entry is appended here.
        run_context: Full run context override (request-id, conversation-id,
            message-id). Defaults to nulls with the given operation.
    """
    numeric_level = getattr(logging, level.upper(), logging.INFO)
    collecting = diagnostic_records is not None

    # Nothing would consume the entry: skip the formatting work entirely.
    if not collecting and not logger.isEnabledFor(numeric_level):
        return

    if run_context:
        context = run_context
    else:
        context = {
            "request-id": None,
            "conversation-id": None,
            "message-id": None,
            "operation": operation,
        }

    entry = format_structured_log_entry(
        level=level,
        message=message,
        logger_name=logger.name,
        run_context=context,
    )

    if collecting:
        diagnostic_records.append(entry)

    try:
        logger.log(numeric_level, render_diagnostic(entry))
    except Exception:
        # Rendering or emitting a diagnostic must never break the caller.
        pass
@@ -52,7 +52,6 @@ def normalize_log_level(value: Optional[str]) -> Optional[str]:
52
52
  return value.strip().lower()
53
53
 
54
54
 
55
-
56
55
  def resolve_log_level(
57
56
  log_level_values: Optional[List[str]],
58
57
  ) -> Tuple[Optional[str], Optional[str]]:
@@ -25,6 +25,17 @@ PARTIAL_MATCH = "PartialMatch"
25
25
  REQUIRES_AZURE_OPENAI = "azure_openai"
26
26
  REQUIRES_TOOL_DEFINITIONS = "tool_definitions"
27
27
 
28
+ # Evaluation status constants
29
+ # Outcome statuses (agent responded, evaluators ran):
30
+ STATUS_PASS = "pass" # All evaluators scored above threshold
31
+ STATUS_FAIL = "fail" # At least one evaluator scored below threshold
32
+ # Error state (evaluation couldn't complete):
33
+ STATUS_ERROR = "error" # API call failed / response couldn't be obtained
34
+ # Thread-level aggregate status (multi-turn only):
35
+ STATUS_PARTIAL = "partial" # Some turns passed, some did not
36
+ # Fallback for missing status:
37
+ STATUS_UNKNOWN = "unknown"
38
+
28
39
  # System defaults when no file-level or env-level defaults are configured
29
40
  SYSTEM_DEFAULT_EVALUATORS = [
30
41
  RELEVANCE,