@microsoft/m365-copilot-eval 1.4.0-preview.1 → 1.6.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -4
- package/package.json +4 -3
- package/schema/CHANGELOG.md +14 -0
- package/schema/v1/eval-document.schema.json +3 -3
- package/schema/version.json +1 -1
- package/src/clients/cli/agent_selector.py +74 -0
- package/src/clients/cli/api_clients/A2A/a2a_client.py +96 -30
- package/src/clients/cli/api_clients/base_agent_client.py +0 -1
- package/src/clients/cli/auth/auth_handler.py +21 -1
- package/src/clients/cli/cli_args.py +136 -0
- package/src/clients/cli/cli_logging/cli_logger.py +33 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +3 -1
- package/src/clients/cli/common.py +53 -0
- package/src/clients/cli/env_validator.py +73 -0
- package/src/clients/cli/evaluation_runner.py +653 -0
- package/src/clients/cli/evaluator_resolver.py +9 -6
- package/src/clients/cli/main.py +130 -1676
- package/src/clients/cli/prompt_loader.py +148 -0
- package/src/clients/cli/readme.md +9 -53
- package/src/clients/cli/response_extractor.py +4 -601
- package/src/clients/cli/result_writer.py +488 -0
- package/src/clients/node-js/bin/runevals.js +34 -13
- package/src/clients/node-js/config/default.js +8 -11
- package/src/clients/node-js/lib/env-loader.js +3 -4
- package/src/clients/node-js/lib/python-runtime.js +137 -65
- package/src/clients/node-js/lib/venv-manager.js +3 -2
- package/src/clients/node-js/lib/version-check.js +268 -0
- package/src/clients/cli/api_clients/REST/__init__.py +0 -3
- package/src/clients/cli/api_clients/REST/sydney_client.py +0 -204
package/README.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# M365 Copilot Agent Evaluations
|
|
2
2
|
|
|
3
|
-
>
|
|
3
|
+
> **PUBLIC PREVIEW:** This tool is currently in public preview; refer to the instructions below to get started.
|
|
4
4
|
|
|
5
|
-
A
|
|
5
|
+
A CLI for evaluating M365 Copilot agents. Send prompts to your agent, get responses, and automatically score them with Azure AI Evaluation metrics (relevance, coherence, groundedness).
|
|
6
6
|
- Send a batch (or interactive set) of prompts to a configured chat API endpoint.
|
|
7
7
|
- Collect agent responses and evaluate them locally using Azure AI Evaluation SDK.
|
|
8
8
|
- The CLI supports 7 evaluator types. Evaluators marked with ⭐ are **enabled by default**.
|
|
@@ -12,7 +12,7 @@ A **zero-configuration** CLI for evaluating M365 Copilot agents. Send prompts to
|
|
|
12
12
|
| **Relevance** ⭐ | LLM-based | 1-5 | 3 | Yes |
|
|
13
13
|
| **Coherence** ⭐ | LLM-based | 1-5 | 3 | Yes |
|
|
14
14
|
| **Groundedness** | LLM-based | 1-5 | 3 | No |
|
|
15
|
-
| **
|
|
15
|
+
| **Similarity** | LLM-based | 1-5 | 3 | No |
|
|
16
16
|
| **Citations** | Count-based | >= 0 | 1 | No |
|
|
17
17
|
| **ExactMatch** | String match | boolean | N/A | No |
|
|
18
18
|
| **PartialMatch** | String match | 0.0-1.0 | 0.5 | No |
|
|
@@ -24,8 +24,10 @@ A **zero-configuration** CLI for evaluating M365 Copilot agents. Send prompts to
|
|
|
24
24
|
- **M365 Copilot License** for your tenant
|
|
25
25
|
- **M365 Copilot Agent** deployed to your tenant (can be created with [M365 Agents Toolkit](https://learn.microsoft.com/en-us/microsoft-365/developer/overview-m365-agents-toolkit) or any other method)
|
|
26
26
|
- **Node.js 24.12.0+** (check: `node --version`)
|
|
27
|
+
- **Python 3.13.x** is downloaded automatically. If the download fails (e.g., network restrictions), set `PYTHON_PATH` to a local Python 3.13.x installation (see [Troubleshooting](#-troubleshooting))
|
|
27
28
|
- **Environment file** with your credentials and agent ID (see [Environment Setup](#-environment-setup) below)
|
|
28
29
|
- **Your Tenant ID** - get your tenant id using the instructions [here](https://learn.microsoft.com/en-us/azure/azure-portal/get-subscription-tenant-id)
|
|
30
|
+
- **Admin approval** to run the WORKIQ Client App in your tenant (see the [admin instructions](https://github.com/microsoft/work-iq/blob/main/ADMIN-INSTRUCTIONS.md))
|
|
29
31
|
- **Azure OpenAI endpoint, and API key** (see [Getting Variables](#-getting-variables) below)
|
|
30
32
|
|
|
31
33
|
> Note: Authentication is currently supported on Windows only. Support for other operating systems is coming soon.
|
|
@@ -66,6 +68,8 @@ M365_TITLE_ID="T_your-title-id-here" # Auto-generated by ATK
|
|
|
66
68
|
# .env.local.user (NOT checked in — secrets go here)
|
|
67
69
|
AZURE_AI_OPENAI_ENDPOINT="<your-azure-openai-endpoint>"
|
|
68
70
|
AZURE_AI_API_KEY="<your-api-key-from-azure-portal>"
|
|
71
|
+
AZURE_AI_API_VERSION="2024-12-01-preview" # default
|
|
72
|
+
AZURE_AI_MODEL_NAME="gpt-4o-mini" # recommended
|
|
69
73
|
TENANT_ID="<your-tenant-id>"
|
|
70
74
|
```
|
|
71
75
|
|
|
@@ -90,7 +94,7 @@ M365_AGENT_ID="your-agent-id" # e.g., U_0dc4a8a2-b95f-edac-91c8-d802023ec2d4
|
|
|
90
94
|
AZURE_AI_OPENAI_ENDPOINT="<your-azure-openai-endpoint>"
|
|
91
95
|
AZURE_AI_API_KEY="<your-api-key-from-azure-portal>"
|
|
92
96
|
AZURE_AI_API_VERSION="2024-12-01-preview" # default
|
|
93
|
-
AZURE_AI_MODEL_NAME="gpt-4o-mini" #
|
|
97
|
+
AZURE_AI_MODEL_NAME="gpt-4o-mini" # recommended
|
|
94
98
|
TENANT_ID="<your-tenant-id>"
|
|
95
99
|
```
|
|
96
100
|
|
|
@@ -457,6 +461,20 @@ runevals cache-dir
|
|
|
457
461
|
chmod -R u+w $(runevals cache-dir)
|
|
458
462
|
```
|
|
459
463
|
|
|
464
|
+
### Custom Python Runtime (PYTHON_PATH)
|
|
465
|
+
|
|
466
|
+
If the automatic Python download fails (e.g., network restrictions, unsupported platform), provide your own Python installation:
|
|
467
|
+
|
|
468
|
+
```bash
|
|
469
|
+
# Windows (Command Prompt)
|
|
470
|
+
set PYTHON_PATH=C:\Python313\python.exe
|
|
471
|
+
|
|
472
|
+
# macOS/Linux
|
|
473
|
+
export PYTHON_PATH=/usr/local/bin/python3.13
|
|
474
|
+
```
|
|
475
|
+
|
|
476
|
+
Python 3.13.x is the tested version. If a different version is found, you'll be prompted to confirm before proceeding. In CI/CD, a version mismatch fails automatically.
|
|
477
|
+
|
|
460
478
|
## 📚 Advanced Documentation
|
|
461
479
|
|
|
462
480
|
- **[CI/CD Integration](./CICD_CACHE_GUIDE.md)** - GitHub Actions, Azure DevOps caching
|
package/package.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@microsoft/m365-copilot-eval",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.6.0-preview.1",
|
|
4
4
|
"minCliVersion": "1.0.1-preview.1",
|
|
5
5
|
"description": "Zero-config Node.js wrapper for M365 Copilot Agent Evaluations CLI (Python-based Azure AI Evaluation SDK)",
|
|
6
|
-
"publishDate": "2026-
|
|
6
|
+
"publishDate": "2026-05-07",
|
|
7
7
|
"main": "src/clients/node-js/lib/index.js",
|
|
8
8
|
"type": "module",
|
|
9
9
|
"bin": {
|
|
@@ -80,8 +80,9 @@
|
|
|
80
80
|
"README.md",
|
|
81
81
|
"LICENSE"
|
|
82
82
|
],
|
|
83
|
+
"homepage": "https://github.com/microsoft/m365-copilot-eval",
|
|
83
84
|
"repository": {
|
|
84
85
|
"type": "git",
|
|
85
|
-
"url": "https://github.com/microsoft/
|
|
86
|
+
"url": "https://github.com/microsoft/m365-copilot-eval.git"
|
|
86
87
|
}
|
|
87
88
|
}
|
package/schema/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,20 @@ All notable changes to the eval document schema will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.3.0](https://github.com/microsoft/M365-Copilot-Agent-Evals/compare/schema-v1.2.0...schema-v1.3.0) (2026-04-30)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Features
|
|
12
|
+
|
|
13
|
+
* Added similarity evaluator for compatibility with MCS Evals. ([#228](https://github.com/microsoft/M365-Copilot-Agent-Evals/issues/228)) ([0fe8315](https://github.com/microsoft/M365-Copilot-Agent-Evals/commit/0fe8315abc8e0422d1ac9117fe9f29195f29044f))
|
|
14
|
+
|
|
15
|
+
## [1.2.0](https://github.com/microsoft/M365-Copilot-Agent-Evals/compare/schema-v1.1.0...schema-v1.2.0) (2026-04-22)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
### Features
|
|
19
|
+
|
|
20
|
+
* **schema:** add multi-turn evaluation support (v1.2.0) ([#208](https://github.com/microsoft/M365-Copilot-Agent-Evals/issues/208)) ([a5ad22b](https://github.com/microsoft/M365-Copilot-Agent-Evals/commit/a5ad22bb4f6ac8ba548dc7f431ace073fa5970ce))
|
|
21
|
+
|
|
8
22
|
## [1.1.0](https://github.com/microsoft/M365-Copilot-Agent-Evals/compare/schema-v1.0.0...schema-v1.1.0) (2026-03-30)
|
|
9
23
|
|
|
10
24
|
|
|
@@ -287,9 +287,9 @@
|
|
|
287
287
|
"$ref": "#/$defs/EvalScore",
|
|
288
288
|
"description": "Groundedness score (1-5)"
|
|
289
289
|
},
|
|
290
|
-
"
|
|
290
|
+
"similarity": {
|
|
291
291
|
"$ref": "#/$defs/EvalScore",
|
|
292
|
-
"description": "
|
|
292
|
+
"description": "Similarity score (1-5)"
|
|
293
293
|
},
|
|
294
294
|
"citations": {
|
|
295
295
|
"$ref": "#/$defs/CitationScore",
|
|
@@ -427,7 +427,7 @@
|
|
|
427
427
|
"type": "object",
|
|
428
428
|
"description": "Map of evaluator names to their configuration options",
|
|
429
429
|
"propertyNames": {
|
|
430
|
-
"enum": ["Relevance", "Coherence", "Groundedness", "
|
|
430
|
+
"enum": ["Relevance", "Coherence", "Groundedness", "Similarity", "Citations", "ExactMatch", "PartialMatch"]
|
|
431
431
|
},
|
|
432
432
|
"additionalProperties": {
|
|
433
433
|
"$ref": "#/$defs/EvaluatorOptions"
|
package/schema/version.json
CHANGED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""Interactive agent selection and agent-id utilities."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
|
+
|
|
5
|
+
import questionary
|
|
6
|
+
|
|
7
|
+
from cli_logging.cli_logger import emit_structured_log
|
|
8
|
+
from cli_logging.logging_utils import Operation
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def normalize_agent_id(agent_id):
    """Ensure a non-empty agent id carries a dotted suffix.

    Falsy input (``None`` or ``""``) and ids that already contain a ``.`` are
    returned as-is; any other id gets ``.declarativeAgent`` appended.
    """
    if agent_id and '.' not in agent_id:
        return f"{agent_id}.declarativeAgent"
    return agent_id
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def select_agent_interactively(agents: List[Dict[str, Any]]) -> Tuple[Optional[str], Optional[str]]:
    """Display an interactive agent selector using questionary.

    Agents are listed sorted by name. An agent whose ``gptId`` is missing or
    blank is skipped and a warning is logged. Each remaining agent becomes one
    choice whose value is its (stripped) ``gptId``.

    Args:
        agents: List of agent dictionaries. The keys read here are ``name``,
            ``gptId``, ``description``, ``isOwner`` and ``provider``; every
            key is optional.

    Returns:
        Tuple of (agent_id, agent_name), or (None, None) when ``agents`` is
        empty, no agent has a usable ``gptId``, or the prompt returns no
        selection.
    """
    if not agents:
        return None, None

    # Build id→name lookup and choices
    id_to_name: Dict[str, str] = {}
    choices = []
    # A card may carry ``"name": None`` (key present, value missing). Coerce to
    # "" so sorting never compares None with str, which raises TypeError.
    sorted_agents = sorted(agents, key=lambda a: a.get("name") or "")
    for agent in sorted_agents:
        # ``or`` (not a .get default) so a present-but-None name also falls back.
        agent_name = agent.get("name") or "Unknown"
        agent_id = (agent.get("gptId") or "").strip()
        if not agent_id:
            emit_structured_log("warning", f"Skipping agent '{agent_name}': missing or empty gptId.", operation=Operation.FETCH_AGENTS)
            continue
        agent_description = agent.get("description")
        agent_is_owner = agent.get('isOwner')
        agent_provider = agent.get("provider")
        id_to_name[agent_id] = agent_name

        # Format the display text
        if agent_provider:
            title = f"{agent_name} - {agent_provider} ({agent_id})"
        else:
            title = f"{agent_name} ({agent_id})"
        segments = [title]
        if agent_is_owner:
            segments.append(f"IsOwner: {agent_is_owner}")
        if agent_description:
            segments.append(agent_description)
        display_text = " - ".join(segments)

        choices.append(questionary.Choice(title=display_text, value=agent_id))

    if not choices:
        return None, None

    # Display the selection prompt
    selected_agent = questionary.select(
        "Select an agent to evaluate:",
        choices=choices,
        # NOTE(review): shortcuts are disabled for long lists, presumably
        # because questionary has a limited pool of shortcut keys; confirm
        # the threshold against the installed questionary version.
        use_shortcuts=len(choices) <= 35,
        use_arrow_keys=True
    ).ask()

    # No selection (e.g. the prompt was cancelled): there is no name to report.
    if not selected_agent:
        return selected_agent, None
    return selected_agent, id_to_name.get(selected_agent)
|
|
@@ -8,7 +8,7 @@ import re
|
|
|
8
8
|
import urllib.error
|
|
9
9
|
import urllib.request
|
|
10
10
|
import uuid
|
|
11
|
-
from typing import Any, Dict, List, Optional, Tuple
|
|
11
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
12
12
|
|
|
13
13
|
from api_clients.base_agent_client import BaseAgentClient
|
|
14
14
|
from cli_logging.console_diagnostics import emit_structured_log
|
|
@@ -35,6 +35,7 @@ class A2AClient(BaseAgentClient):
|
|
|
35
35
|
access_token: str,
|
|
36
36
|
logger: Optional[logging.Logger] = None,
|
|
37
37
|
diagnostic_records: Optional[List[Dict[str, Any]]] = None,
|
|
38
|
+
token_refresh_fn: Optional[Callable[[], str]] = None,
|
|
38
39
|
) -> None:
|
|
39
40
|
"""
|
|
40
41
|
Args:
|
|
@@ -42,11 +43,15 @@ class A2AClient(BaseAgentClient):
|
|
|
42
43
|
access_token: Bearer token for A2A authentication.
|
|
43
44
|
logger: Logger to use. Defaults to a module-level logger if not provided.
|
|
44
45
|
diagnostic_records: List to accumulate structured log entries.
|
|
46
|
+
token_refresh_fn: Optional callable that returns a fresh access token string.
|
|
47
|
+
When provided, a single HTTP 401 response will trigger a token refresh
|
|
48
|
+
and one automatic retry, making the refresh invisible to the caller.
|
|
45
49
|
"""
|
|
46
50
|
self._endpoint = a2a_endpoint.rstrip("/")
|
|
47
51
|
self._access_token = access_token
|
|
48
52
|
self._logger = logger or logging.getLogger(__name__)
|
|
49
53
|
self._diagnostic_records = diagnostic_records
|
|
54
|
+
self._token_refresh_fn = token_refresh_fn
|
|
50
55
|
self._resolved_agent_url: Optional[str] = None
|
|
51
56
|
|
|
52
57
|
# ------------------------------------------------------------------ #
|
|
@@ -61,7 +66,7 @@ class A2AClient(BaseAgentClient):
|
|
|
61
66
|
"""Fetch agents from the A2A discovery endpoint.
|
|
62
67
|
|
|
63
68
|
Calls GET {endpoint}/.agents. Each A2A agent card is normalized to
|
|
64
|
-
include 'gptId', 'name', and '
|
|
69
|
+
include 'gptId', 'name', and 'provider' so it is compatible with
|
|
65
70
|
the shared select_agent_interactively selector.
|
|
66
71
|
|
|
67
72
|
Returns an empty list if the endpoint is unreachable or returns an
|
|
@@ -79,8 +84,14 @@ class A2AClient(BaseAgentClient):
|
|
|
79
84
|
)
|
|
80
85
|
req = urllib.request.Request(agents_url, headers=headers, method="GET")
|
|
81
86
|
with urllib.request.urlopen(req, timeout=_REQUEST_TIMEOUT_SECS) as resp:
|
|
82
|
-
|
|
83
|
-
|
|
87
|
+
agents = json.loads(resp.read().decode("utf-8"))
|
|
88
|
+
emit_structured_log(
|
|
89
|
+
"debug",
|
|
90
|
+
f"[A2A] Available agents response: {json.dumps(agents)}",
|
|
91
|
+
Operation.FETCH_AGENTS,
|
|
92
|
+
logger=self._logger,
|
|
93
|
+
diagnostic_records=self._diagnostic_records
|
|
94
|
+
)
|
|
84
95
|
return [self._normalize_agent_card(a) for a in agents]
|
|
85
96
|
except urllib.error.HTTPError as e:
|
|
86
97
|
emit_structured_log(
|
|
@@ -104,22 +115,11 @@ class A2AClient(BaseAgentClient):
|
|
|
104
115
|
@staticmethod
|
|
105
116
|
def _normalize_agent_card(agent: Dict[str, Any]) -> Dict[str, Any]:
|
|
106
117
|
"""Normalize an A2A agent card to the shape expected by the selector.
|
|
107
|
-
|
|
108
|
-
A2A agent cards use a 'url' field rather than a discrete ID. The
|
|
109
|
-
agent ID is extracted as the last path segment of that URL, falling
|
|
110
|
-
back to the agent name when the URL is absent.
|
|
111
118
|
"""
|
|
112
|
-
agent_url = agent.get("url", "")
|
|
113
|
-
agent_id = (
|
|
114
|
-
agent_url.rstrip("/").rsplit("/", 1)[-1]
|
|
115
|
-
if agent_url
|
|
116
|
-
else agent.get("name", "")
|
|
117
|
-
)
|
|
118
119
|
return {
|
|
119
|
-
"gptId":
|
|
120
|
-
"name": agent.get("name"
|
|
121
|
-
"
|
|
122
|
-
"isOwner": False,
|
|
120
|
+
"gptId": agent.get("agentId"),
|
|
121
|
+
"name": agent.get("name"),
|
|
122
|
+
"provider": agent.get("provider")
|
|
123
123
|
}
|
|
124
124
|
|
|
125
125
|
def send_prompt(
|
|
@@ -266,6 +266,12 @@ class A2AClient(BaseAgentClient):
|
|
|
266
266
|
) -> tuple[Dict[str, Any], Dict[str, Any]]:
|
|
267
267
|
"""Send a JSON-RPC message to the agent and parse the response.
|
|
268
268
|
|
|
269
|
+
When a ``token_refresh_fn`` was supplied at construction time and the
|
|
270
|
+
server responds with HTTP 401 (Unauthorized), the token is refreshed
|
|
271
|
+
automatically and the request is retried exactly once. This keeps
|
|
272
|
+
long-running eval sessions alive beyond the initial token lifetime
|
|
273
|
+
without requiring any user interaction.
|
|
274
|
+
|
|
269
275
|
Returns:
|
|
270
276
|
A tuple of (result_dict, raw_result) where result_dict is the
|
|
271
277
|
normalized response dict (raw_response_text, display_response_text,
|
|
@@ -280,15 +286,51 @@ class A2AClient(BaseAgentClient):
|
|
|
280
286
|
with urllib.request.urlopen(req, timeout=_REQUEST_TIMEOUT_SECS) as resp:
|
|
281
287
|
raw = resp.read().decode("utf-8", errors="replace")
|
|
282
288
|
except urllib.error.HTTPError as e:
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
289
|
+
if e.code == 401 and self._token_refresh_fn is not None:
|
|
290
|
+
emit_structured_log(
|
|
291
|
+
"info",
|
|
292
|
+
"[A2A] Access token expired (HTTP 401); refreshing token and retrying.",
|
|
293
|
+
Operation.AUTHENTICATE,
|
|
294
|
+
logger=self._logger,
|
|
295
|
+
diagnostic_records=self._diagnostic_records,
|
|
296
|
+
)
|
|
297
|
+
new_token = self._token_refresh_fn()
|
|
298
|
+
if not new_token:
|
|
299
|
+
raise RuntimeError(
|
|
300
|
+
"A2A request failed (HTTP 401 Unauthorized) and token refresh returned no token."
|
|
301
|
+
) from e
|
|
302
|
+
self._access_token = new_token
|
|
303
|
+
headers["Authorization"] = f"Bearer {self._access_token}"
|
|
304
|
+
retry_req = urllib.request.Request(
|
|
305
|
+
agent_url, data=payload, headers=headers, method="POST"
|
|
306
|
+
)
|
|
307
|
+
try:
|
|
308
|
+
with urllib.request.urlopen(retry_req, timeout=_REQUEST_TIMEOUT_SECS) as resp:
|
|
309
|
+
raw = resp.read().decode("utf-8", errors="replace")
|
|
310
|
+
except urllib.error.HTTPError as retry_e:
|
|
311
|
+
body = ""
|
|
312
|
+
try:
|
|
313
|
+
body = retry_e.read().decode("utf-8", errors="replace")
|
|
314
|
+
except Exception:
|
|
315
|
+
pass
|
|
316
|
+
raise RuntimeError(
|
|
317
|
+
f"A2A request failed (HTTP {retry_e.code} {retry_e.reason}) after token refresh."
|
|
318
|
+
+ (f" Body: {body[:500]}" if body else "")
|
|
319
|
+
) from retry_e
|
|
320
|
+
except urllib.error.URLError as retry_e:
|
|
321
|
+
raise RuntimeError(
|
|
322
|
+
f"A2A connection error after token refresh: {getattr(retry_e, 'reason', str(retry_e))}"
|
|
323
|
+
) from retry_e
|
|
324
|
+
else:
|
|
325
|
+
body = ""
|
|
326
|
+
try:
|
|
327
|
+
body = e.read().decode("utf-8", errors="replace")
|
|
328
|
+
except Exception:
|
|
329
|
+
pass
|
|
330
|
+
raise RuntimeError(
|
|
331
|
+
f"A2A request failed (HTTP {e.code} {e.reason})."
|
|
332
|
+
+ (f" Body: {body[:500]}" if body else "")
|
|
333
|
+
) from e
|
|
292
334
|
except urllib.error.URLError as e:
|
|
293
335
|
raise RuntimeError(
|
|
294
336
|
f"A2A connection error: {getattr(e, 'reason', str(e))}"
|
|
@@ -334,15 +376,39 @@ class A2AClient(BaseAgentClient):
|
|
|
334
376
|
state = result.get("status", {}).get("state")
|
|
335
377
|
if state == "completed":
|
|
336
378
|
msg = result.get("status", {}).get("message") or {}
|
|
379
|
+
all_parts = list(msg.get("parts", []))
|
|
380
|
+
for artifact in result.get("artifacts", []):
|
|
381
|
+
all_parts.extend(artifact.get("parts", []))
|
|
337
382
|
text = "\n".join(
|
|
338
383
|
p.get("text", "")
|
|
339
|
-
for p in
|
|
384
|
+
for p in all_parts
|
|
340
385
|
if p.get("kind") == "text"
|
|
341
386
|
)
|
|
342
387
|
attributions = msg.get("metadata", {}).get("attributions", [])
|
|
343
|
-
elif state in ("failed", "canceled"):
|
|
388
|
+
elif state in ("failed", "canceled", "rejected"):
|
|
389
|
+
status_msg = result.get("status", {}).get("message") or {}
|
|
390
|
+
detail = "\n".join(
|
|
391
|
+
p.get("text", "")
|
|
392
|
+
for p in status_msg.get("parts", [])
|
|
393
|
+
if p.get("kind") == "text"
|
|
394
|
+
).strip()
|
|
395
|
+
suffix = f" Detail: {detail}" if detail else ""
|
|
396
|
+
raise RuntimeError(
|
|
397
|
+
f"A2A task {state}. Task id: {result.get('id')}{suffix}"
|
|
398
|
+
)
|
|
399
|
+
elif state in ("input_required", "auth_required"):
|
|
400
|
+
requirement = {
|
|
401
|
+
"input_required": "user input",
|
|
402
|
+
"auth_required": "authentication",
|
|
403
|
+
}.get(state, state.replace("_", " "))
|
|
404
|
+
raise RuntimeError(
|
|
405
|
+
f"A2A task requires {requirement} and cannot proceed automatically."
|
|
406
|
+
f" Task id: {result.get('id')}"
|
|
407
|
+
)
|
|
408
|
+
elif state in ("submitted", "working"):
|
|
344
409
|
raise RuntimeError(
|
|
345
|
-
f"A2A task {state}
|
|
410
|
+
f"A2A task is still {state}; synchronous send returned before completion."
|
|
411
|
+
f" Task id: {result.get('id')}"
|
|
346
412
|
)
|
|
347
413
|
else:
|
|
348
414
|
raise RuntimeError(
|
|
@@ -44,7 +44,6 @@ class BaseAgentClient(ABC):
|
|
|
44
44
|
The conversation_context should be passed to the next turn
|
|
45
45
|
in a multi-turn conversation, or discarded for single-turn.
|
|
46
46
|
The context structure is implementation-specific:
|
|
47
|
-
- Sydney/REST: {"conversation_id": str}
|
|
48
47
|
- A2A: {"context_id": str}
|
|
49
48
|
Returns None as context when no conversation state is established.
|
|
50
49
|
"""
|
|
@@ -13,7 +13,7 @@ https://github.com/AzureAD/microsoft-authentication-extensions-for-python
|
|
|
13
13
|
import os
|
|
14
14
|
import platform
|
|
15
15
|
import logging
|
|
16
|
-
from typing import Optional
|
|
16
|
+
from typing import Callable, Optional
|
|
17
17
|
from pathlib import Path
|
|
18
18
|
import jwt
|
|
19
19
|
from msal import PublicClientApplication
|
|
@@ -260,3 +260,23 @@ class AuthHandler:
|
|
|
260
260
|
return oid
|
|
261
261
|
except jwt.DecodeError as e:
|
|
262
262
|
raise ValueError(f"Failed to decode token: {e}")
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def make_token_refresh_fn(auth_handler: "AuthHandler") -> Callable[[], str]:
    """Build a zero-argument callback that fetches a new A2A access token.

    The callback is meant to be invoked by a client after an HTTP 401. Each
    call delegates to ``auth_handler.acquire_token_interactive()`` and pulls
    the ``access_token`` entry out of whatever it returns.

    NOTE(review): this wrapper only calls ``acquire_token_interactive``; any
    silent (refresh-token) attempt before falling back to an interactive
    prompt would have to happen inside that method — confirm against
    AuthHandler.

    Args:
        auth_handler: An initialized AuthHandler instance to use for token acquisition.

    Returns:
        A callable returning the new access token string, or an empty string
        when no token could be obtained.
    """
    def _refresh() -> str:
        token_response = auth_handler.acquire_token_interactive()
        if not token_response:
            # Nothing came back (None / empty result): signal failure.
            return ""
        return token_response.get("access_token") or ""

    return _refresh
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""CLI argument parsing and version-check bypass logic."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
from cli_logging.cli_logger import emit_structured_log
|
|
7
|
+
from cli_logging.logging_utils import Operation
|
|
8
|
+
from common import MAX_CONCURRENCY, RunConfig
|
|
9
|
+
from agent_selector import normalize_agent_id
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Flags that should bypass remote min-version enforcement.
|
|
13
|
+
# --help is not needed here because argparse exits before runtime checks.
|
|
14
|
+
VERSION_CHECK_BYPASS_FLAGS = (
|
|
15
|
+
"signout",
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def should_bypass_min_version_check(config: RunConfig) -> bool:
    """Report whether this invocation is exempt from min-version checks.

    The answer is True as soon as any attribute named in
    ``VERSION_CHECK_BYPASS_FLAGS`` is truthy on ``config``; an attribute that
    is absent from ``config`` counts as not set.
    """
    for flag_name in VERSION_CHECK_BYPASS_FLAGS:
        if getattr(config, flag_name, False):
            return True
    return False
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def parse_arguments() -> argparse.Namespace:
    """Parse command line arguments.

    Builds the CLI parser, parses the process arguments, then post-processes
    two values: the agent id is passed through ``normalize_agent_id`` and
    ``--concurrency`` is validated and clamped.

    Returns:
        The parsed namespace, with ``m365_agent_id`` normalized and
        ``concurrency`` guaranteed to be within ``1..MAX_CONCURRENCY``.

    Exits:
        Via ``parser.error`` when ``--concurrency`` is below 1, and via
        argparse itself on invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description="M365 Copilot Agent Evaluation CLI",
        # Raw formatter keeps the line breaks of the epilog examples below.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Run with default prompts
python main.py

# Run with custom prompts
python main.py --prompts "What is Microsoft Graph?" --expected "Microsoft Graph is a gateway..."

# Run with prompts from file
python main.py --prompts-file prompts.json

# Interactive mode
python main.py --interactive

# Save results to JSON
python main.py --output results.json

# Save results to CSV
python main.py --output results.csv

# Save results to HTML and open in browser
python main.py --output report.html

# Debug-level diagnostics
python main.py --log-level debug

# Sign out and clear cached authentication tokens
python main.py --signout
"""
    )

    # Input options (mutually exclusive)
    input_group = parser.add_mutually_exclusive_group()
    input_group.add_argument(
        '--prompts',
        nargs='+',
        help='List of prompts to evaluate'
    )
    input_group.add_argument(
        '--prompts-file',
        type=str,
        help='JSON file containing prompts and expected responses'
    )
    input_group.add_argument(
        '--interactive',
        action='store_true',
        help='Interactive mode to enter prompts'
    )

    # Expected responses (only used with --prompts)
    # NOTE(review): the count match against --prompts is not enforced in this
    # function — presumably validated by the caller; confirm.
    parser.add_argument(
        '--expected',
        nargs='+',
        help='List of expected responses (must match number of prompts)'
    )

    # Agent ID (--m365-agent-id is primary, --agent-id kept for backward compatibility)
    # The default is read from the environment when the parser is built:
    # M365_AGENT_ID first, then AGENT_ID.
    parser.add_argument(
        '--m365-agent-id', '--agent-id',
        type=str,
        default=os.environ.get("M365_AGENT_ID") or os.environ.get("AGENT_ID"),
        help='Agent ID (default from M365_AGENT_ID environment variable)'
    )

    # Output options
    parser.add_argument(
        '--output',
        type=str,
        help='Output file path. Format is determined by file extension: .json, .csv, .html. If not provided, results are printed to console.'
    )

    # Behavior options
    # action='append' collects every occurrence into a list (None when the flag
    # is never given); nargs='?' + const makes a bare --log-level add 'info'.
    parser.add_argument(
        '--log-level',
        nargs='?',
        const='info',
        action='append',
        help='Set log verbosity: debug, info, warning, error. Bare --log-level resolves to info.'
    )

    parser.add_argument(
        '--signout',
        action='store_true',
        help='Sign out and clear cached authentication tokens'
    )

    parser.add_argument(
        '--concurrency',
        type=int,
        default=MAX_CONCURRENCY,
        help=f'Number of parallel workers for prompt processing (1-{MAX_CONCURRENCY}, default: {MAX_CONCURRENCY})'
    )

    args = parser.parse_args()

    # Applies to both the CLI value and the environment-derived default.
    args.m365_agent_id = normalize_agent_id(args.m365_agent_id)

    # Too-low values are a hard error; too-high values are clamped with a warning.
    if args.concurrency < 1:
        parser.error('--concurrency must be an integer >= 1.')
    if args.concurrency > MAX_CONCURRENCY:
        emit_structured_log(
            "warning",
            f"--concurrency {args.concurrency} exceeds max {MAX_CONCURRENCY}; clamping to {MAX_CONCURRENCY}.",
            operation=Operation.SETUP,
        )
        args.concurrency = MAX_CONCURRENCY

    return args
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Shared CLI logger instance and structured-log convenience wrapper.
|
|
2
|
+
|
|
3
|
+
Every module in the CLI layer that needs to emit diagnostics imports from here
|
|
4
|
+
instead of main.py, which avoids circular-import issues.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
import sys
|
|
9
|
+
from typing import Any, Dict, List
|
|
10
|
+
|
|
11
|
+
from cli_logging.console_diagnostics import emit_structured_log as _emit_structured_log
|
|
12
|
+
from cli_logging.logging_utils import LOG_LEVEL_MAP, Operation
|
|
13
|
+
|
|
14
|
+
CLI_LOGGER_NAME = "m365.eval.cli"
|
|
15
|
+
CLI_LOGGER = logging.getLogger(CLI_LOGGER_NAME)
|
|
16
|
+
DIAGNOSTIC_RECORDS: List[Dict[str, Any]] = []
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def configure_cli_logging(effective_log_level: str) -> None:
    """Attach a stdout handler to the shared CLI logger and set its level.

    The handler (message-only format) is added only when the logger has no
    handlers yet, so repeated calls do not duplicate output. Propagation to
    ancestor loggers is switched off, and the level is looked up in
    ``LOG_LEVEL_MAP`` by ``effective_log_level``.

    Raises:
        KeyError: If ``effective_log_level`` is not a key of ``LOG_LEVEL_MAP``.
    """
    already_has_handler = bool(CLI_LOGGER.handlers)
    if not already_has_handler:
        stdout_handler = logging.StreamHandler(sys.stdout)
        plain_formatter = logging.Formatter("%(message)s")
        stdout_handler.setFormatter(plain_formatter)
        CLI_LOGGER.addHandler(stdout_handler)
    CLI_LOGGER.propagate = False
    requested_level = LOG_LEVEL_MAP[effective_log_level]
    CLI_LOGGER.setLevel(requested_level)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def emit_structured_log(level: str, message: str, operation: str = Operation.EVALUATE) -> None:
    """Emit a structured log entry through the shared CLI logger.

    Thin wrapper over ``console_diagnostics.emit_structured_log`` that binds
    this module's ``CLI_LOGGER`` and ``DIAGNOSTIC_RECORDS``, so callers only
    supply the level, the message and (optionally) the operation.
    """
    shared_sinks = {
        "logger": CLI_LOGGER,
        "diagnostic_records": DIAGNOSTIC_RECORDS,
    }
    _emit_structured_log(level, message, operation, **shared_sinks)
|
|
@@ -8,6 +8,7 @@ from cli_logging.logging_utils import (
|
|
|
8
8
|
STRUCTURED_LOG_FIELDS,
|
|
9
9
|
Operation,
|
|
10
10
|
format_structured_log_entry,
|
|
11
|
+
redact_sensitive_content,
|
|
11
12
|
)
|
|
12
13
|
|
|
13
14
|
_ANSI_COLORS = {
|
|
@@ -102,6 +103,7 @@ def emit_structured_log(
|
|
|
102
103
|
if diagnostic_records is not None:
|
|
103
104
|
diagnostic_records.append(entry)
|
|
104
105
|
try:
|
|
105
|
-
|
|
106
|
+
rendered, _ = redact_sensitive_content(render_diagnostic(entry))
|
|
107
|
+
logger.log(getattr(logging, level.upper(), logging.INFO), rendered)
|
|
106
108
|
except Exception:
|
|
107
109
|
pass
|