@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.5.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +135 -100
- package/package.json +7 -4
- package/schema/CHANGELOG.md +7 -0
- package/schema/v1/eval-document.schema.json +143 -11
- package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
- package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
- package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
- package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
- package/schema/v1/examples/valid/multi-turn-output.json +59 -0
- package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
- package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
- package/schema/version.json +2 -2
- package/src/clients/cli/agent_selector.py +74 -0
- package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
- package/src/clients/cli/api_clients/A2A/a2a_client.py +475 -0
- package/src/clients/cli/api_clients/__init__.py +3 -0
- package/src/clients/cli/api_clients/base_agent_client.py +77 -0
- package/src/clients/cli/cli_args.py +136 -0
- package/src/clients/cli/cli_logging/cli_logger.py +33 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +56 -2
- package/src/clients/cli/cli_logging/logging_utils.py +0 -1
- package/src/clients/cli/common.py +64 -0
- package/src/clients/cli/env_validator.py +73 -0
- package/src/clients/cli/evaluation_runner.py +653 -0
- package/src/clients/cli/evaluator_resolver.py +9 -6
- package/src/clients/cli/generate_report.py +272 -129
- package/src/clients/cli/main.py +157 -1174
- package/src/clients/cli/parallel_executor.py +57 -0
- package/src/clients/cli/prompt_loader.py +148 -0
- package/src/clients/cli/readme.md +9 -53
- package/src/clients/cli/requirements.txt +1 -1
- package/src/clients/cli/response_extractor.py +4 -603
- package/src/clients/cli/result_writer.py +488 -0
- package/src/clients/cli/retry_policy.py +52 -0
- package/src/clients/cli/samples/multiturn_example.json +35 -0
- package/src/clients/cli/throttle_gate.py +82 -0
- package/src/clients/node-js/bin/runevals.js +82 -20
- package/src/clients/node-js/config/default.js +12 -11
- package/src/clients/node-js/lib/agent-id.js +12 -0
- package/src/clients/node-js/lib/env-loader.js +14 -20
- package/src/clients/node-js/lib/eula-manager.js +78 -0
- package/src/clients/node-js/lib/progress.js +13 -11
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"schemaVersion": "1.2.0",
|
|
3
|
+
"default_evaluators": {
|
|
4
|
+
"Relevance": {},
|
|
5
|
+
"Coherence": {}
|
|
6
|
+
},
|
|
7
|
+
"items": [
|
|
8
|
+
{
|
|
9
|
+
"prompt": "What is Microsoft Graph API?",
|
|
10
|
+
"expected_response": "Microsoft Graph API is a unified endpoint for accessing Microsoft services."
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"name": "Canadian Employee HR Inquiry",
|
|
14
|
+
"turns": [
|
|
15
|
+
{
|
|
16
|
+
"prompt": "I'm a Canadian employee based in Toronto.",
|
|
17
|
+
"expected_response": "Got it! I can help with Canada-specific HR questions."
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"prompt": "Is July 4th a holiday for me?",
|
|
21
|
+
"expected_response": "July 4th is not a statutory holiday in Canada. However, July 1st (Canada Day) is."
|
|
22
|
+
}
|
|
23
|
+
]
|
|
24
|
+
},
|
|
25
|
+
{
|
|
26
|
+
"prompt": "How do I authenticate with Microsoft Graph?",
|
|
27
|
+
"expected_response": "You can authenticate using OAuth 2.0 or client credentials flow."
|
|
28
|
+
}
|
|
29
|
+
]
|
|
30
|
+
}
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
{
|
|
2
|
+
"schemaVersion": "1.2.0",
|
|
3
|
+
"metadata": {
|
|
4
|
+
"evaluatedAt": "2026-03-30T10:00:00Z",
|
|
5
|
+
"agentName": "Travel Assistant",
|
|
6
|
+
"cliVersion": "1.3.0"
|
|
7
|
+
},
|
|
8
|
+
"items": [
|
|
9
|
+
{
|
|
10
|
+
"name": "Context Persistence Test",
|
|
11
|
+
"turns": [
|
|
12
|
+
{
|
|
13
|
+
"prompt": "I'm based in Seattle.",
|
|
14
|
+
"expected_response": "Got it! I can help with Seattle-specific questions.",
|
|
15
|
+
"response": "Understood! I can assist with Seattle-related queries.",
|
|
16
|
+
"scores": {
|
|
17
|
+
"relevance": {
|
|
18
|
+
"score": 4.0,
|
|
19
|
+
"result": "pass",
|
|
20
|
+
"threshold": 3,
|
|
21
|
+
"reason": "Response acknowledges Seattle context."
|
|
22
|
+
},
|
|
23
|
+
"coherence": {
|
|
24
|
+
"score": 5.0,
|
|
25
|
+
"result": "pass",
|
|
26
|
+
"threshold": 3
|
|
27
|
+
}
|
|
28
|
+
},
|
|
29
|
+
"status": "pass"
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"prompt": "What's the weather like here?",
|
|
33
|
+
"expected_response": "Seattle weather is typically mild with rain.",
|
|
34
|
+
"response": "Seattle generally has mild temperatures with frequent rain, especially in fall and winter.",
|
|
35
|
+
"scores": {
|
|
36
|
+
"relevance": {
|
|
37
|
+
"score": 5.0,
|
|
38
|
+
"result": "pass",
|
|
39
|
+
"threshold": 3
|
|
40
|
+
},
|
|
41
|
+
"coherence": {
|
|
42
|
+
"score": 4.0,
|
|
43
|
+
"result": "pass",
|
|
44
|
+
"threshold": 3
|
|
45
|
+
}
|
|
46
|
+
},
|
|
47
|
+
"status": "pass"
|
|
48
|
+
}
|
|
49
|
+
],
|
|
50
|
+
"conversation_id": "conv-abc-123",
|
|
51
|
+
"summary": {
|
|
52
|
+
"turns_total": 2,
|
|
53
|
+
"turns_passed": 2,
|
|
54
|
+
"turns_failed": 0,
|
|
55
|
+
"overall_status": "pass"
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
]
|
|
59
|
+
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
{
|
|
2
|
+
"schemaVersion": "1.2.0",
|
|
3
|
+
"items": [
|
|
4
|
+
{
|
|
5
|
+
"turns": [
|
|
6
|
+
{
|
|
7
|
+
"prompt": "I'm traveling to Seattle next week for a conference.",
|
|
8
|
+
"expected_response": "I can help with travel-related questions."
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"prompt": "What's the weather usually like?",
|
|
12
|
+
"expected_response": "Seattle weather is typically mild with some rain."
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"prompt": "Should I bring a rain jacket?",
|
|
16
|
+
"expected_response": "Yes, Seattle is known for rain. A rain jacket is recommended."
|
|
17
|
+
}
|
|
18
|
+
]
|
|
19
|
+
}
|
|
20
|
+
]
|
|
21
|
+
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
{
|
|
2
|
+
"schemaVersion": "1.2.0",
|
|
3
|
+
"default_evaluators": {
|
|
4
|
+
"Relevance": {},
|
|
5
|
+
"Coherence": {}
|
|
6
|
+
},
|
|
7
|
+
"items": [
|
|
8
|
+
{
|
|
9
|
+
"name": "Expense Policy Flow",
|
|
10
|
+
"description": "Test that agent handles expense policy questions across turns",
|
|
11
|
+
"turns": [
|
|
12
|
+
{
|
|
13
|
+
"prompt": "I'm traveling to Seattle next week for a conference.",
|
|
14
|
+
"expected_response": "I can help with travel-related questions."
|
|
15
|
+
},
|
|
16
|
+
{
|
|
17
|
+
"prompt": "My dinner last night was $250. Is that okay?",
|
|
18
|
+
"expected_response": "The per-diem meal allowance is a maximum of $200.",
|
|
19
|
+
"evaluators": {
|
|
20
|
+
"Groundedness": { "threshold": 4 }
|
|
21
|
+
}
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"prompt": "What should I do about the overage?",
|
|
25
|
+
"expected_response": "For expenses exceeding the policy limit, you'll need manager approval.",
|
|
26
|
+
"evaluators": {
|
|
27
|
+
"ExactMatch": { "case_sensitive": false }
|
|
28
|
+
},
|
|
29
|
+
"evaluators_mode": "replace"
|
|
30
|
+
}
|
|
31
|
+
]
|
|
32
|
+
}
|
|
33
|
+
]
|
|
34
|
+
}
|
package/schema/version.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "1.
|
|
3
|
-
"releaseDate": "2026-
|
|
2
|
+
"version": "1.2.0",
|
|
3
|
+
"releaseDate": "2026-04-02",
|
|
4
4
|
"schemaId": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
|
|
5
5
|
"description": "M365 Copilot Eval Document Schema"
|
|
6
6
|
}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""Interactive agent selection and agent-id utilities."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
|
+
|
|
5
|
+
import questionary
|
|
6
|
+
|
|
7
|
+
from cli_logging.cli_logger import emit_structured_log
|
|
8
|
+
from cli_logging.logging_utils import Operation
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def normalize_agent_id(agent_id):
    """Return *agent_id* with a '.declarativeAgent' suffix when it has no dot.

    Falsy input (None or an empty string) and ids that already contain a
    '.' are handed back untouched.
    """
    # Only a non-empty id without any dot needs the suffix appended.
    if agent_id and '.' not in agent_id:
        return f"{agent_id}.declarativeAgent"
    return agent_id
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def select_agent_interactively(agents: List[Dict[str, Any]]) -> Tuple[Optional[str], Optional[str]]:
    """
    Display an interactive agent selector using questionary.

    Agents are listed sorted by name. Entries whose ``gptId`` is missing or
    blank are skipped with a structured warning. A missing, null or empty
    ``name`` is shown as "Unknown".

    Args:
        agents: List of agent dictionaries. The keys read are ``name``,
            ``gptId``, ``description``, ``isOwner`` and ``provider``.

    Returns:
        Tuple of (agent_id, agent_name) or (None, None) if cancelled/skipped
        or if no agent has a usable ``gptId``.
    """
    if not agents:
        return None, None

    # ``dict.get(key, default)`` still returns None when the key is present
    # with a null value, and sorting a mix of None and str raises TypeError.
    # Coerce with ``or`` instead, the same way ``gptId`` is handled below.
    sorted_agents = sorted(agents, key=lambda a: a.get("name") or "")

    # Build id→name lookup and choices
    id_to_name: Dict[str, str] = {}
    choices = []
    for agent in sorted_agents:
        agent_name = agent.get("name") or "Unknown"
        agent_id = (agent.get("gptId") or "").strip()
        if not agent_id:
            emit_structured_log(
                "warning",
                f"Skipping agent '{agent_name}': missing or empty gptId.",
                operation=Operation.FETCH_AGENTS,
            )
            continue
        agent_description = agent.get("description")
        agent_is_owner = agent.get('isOwner')
        agent_provider = agent.get("provider")
        id_to_name[agent_id] = agent_name

        # Format the display text: title first, then optional owner flag
        # and description, all joined with " - ".
        if agent_provider:
            title = f"{agent_name} - {agent_provider} ({agent_id})"
        else:
            title = f"{agent_name} ({agent_id})"
        segments = [title]
        if agent_is_owner:
            segments.append(f"IsOwner: {agent_is_owner}")
        if agent_description:
            segments.append(agent_description)
        display_text = " - ".join(segments)

        choices.append(questionary.Choice(title=display_text, value=agent_id))

    # Every agent was skipped, so there is nothing to prompt for.
    if not choices:
        return None, None

    # Display the selection prompt; shortcuts are only enabled for lists of
    # at most 35 choices.
    selected_agent = questionary.select(
        "Select an agent to evaluate:",
        choices=choices,
        use_shortcuts=len(choices) <= 35,
        use_arrow_keys=True,
    ).ask()

    # No selection came back from the prompt.
    if not selected_agent:
        return None, None
    return selected_agent, id_to_name.get(selected_agent)
|