@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.5.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/README.md +135 -100
  2. package/package.json +7 -4
  3. package/schema/CHANGELOG.md +7 -0
  4. package/schema/v1/eval-document.schema.json +143 -11
  5. package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
  6. package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
  7. package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
  8. package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
  9. package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
  10. package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
  11. package/schema/v1/examples/valid/multi-turn-output.json +59 -0
  12. package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
  13. package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
  14. package/schema/version.json +2 -2
  15. package/src/clients/cli/agent_selector.py +74 -0
  16. package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
  17. package/src/clients/cli/api_clients/A2A/a2a_client.py +475 -0
  18. package/src/clients/cli/api_clients/__init__.py +3 -0
  19. package/src/clients/cli/api_clients/base_agent_client.py +77 -0
  20. package/src/clients/cli/cli_args.py +136 -0
  21. package/src/clients/cli/cli_logging/cli_logger.py +33 -0
  22. package/src/clients/cli/cli_logging/console_diagnostics.py +56 -2
  23. package/src/clients/cli/cli_logging/logging_utils.py +0 -1
  24. package/src/clients/cli/common.py +64 -0
  25. package/src/clients/cli/env_validator.py +73 -0
  26. package/src/clients/cli/evaluation_runner.py +653 -0
  27. package/src/clients/cli/evaluator_resolver.py +9 -6
  28. package/src/clients/cli/generate_report.py +272 -129
  29. package/src/clients/cli/main.py +157 -1174
  30. package/src/clients/cli/parallel_executor.py +57 -0
  31. package/src/clients/cli/prompt_loader.py +148 -0
  32. package/src/clients/cli/readme.md +9 -53
  33. package/src/clients/cli/requirements.txt +1 -1
  34. package/src/clients/cli/response_extractor.py +4 -603
  35. package/src/clients/cli/result_writer.py +488 -0
  36. package/src/clients/cli/retry_policy.py +52 -0
  37. package/src/clients/cli/samples/multiturn_example.json +35 -0
  38. package/src/clients/cli/throttle_gate.py +82 -0
  39. package/src/clients/node-js/bin/runevals.js +82 -20
  40. package/src/clients/node-js/config/default.js +12 -11
  41. package/src/clients/node-js/lib/agent-id.js +12 -0
  42. package/src/clients/node-js/lib/env-loader.js +14 -20
  43. package/src/clients/node-js/lib/eula-manager.js +78 -0
  44. package/src/clients/node-js/lib/progress.js +13 -11
@@ -0,0 +1,30 @@
1
+ {
2
+ "schemaVersion": "1.2.0",
3
+ "default_evaluators": {
4
+ "Relevance": {},
5
+ "Coherence": {}
6
+ },
7
+ "items": [
8
+ {
9
+ "prompt": "What is Microsoft Graph API?",
10
+ "expected_response": "Microsoft Graph API is a unified endpoint for accessing Microsoft services."
11
+ },
12
+ {
13
+ "name": "Canadian Employee HR Inquiry",
14
+ "turns": [
15
+ {
16
+ "prompt": "I'm a Canadian employee based in Toronto.",
17
+ "expected_response": "Got it! I can help with Canada-specific HR questions."
18
+ },
19
+ {
20
+ "prompt": "Is July 4th a holiday for me?",
21
+ "expected_response": "July 4th is not a statutory holiday in Canada. However, July 1st (Canada Day) is."
22
+ }
23
+ ]
24
+ },
25
+ {
26
+ "prompt": "How do I authenticate with Microsoft Graph?",
27
+ "expected_response": "You can authenticate using OAuth 2.0 or client credentials flow."
28
+ }
29
+ ]
30
+ }
@@ -0,0 +1,59 @@
1
+ {
2
+ "schemaVersion": "1.2.0",
3
+ "metadata": {
4
+ "evaluatedAt": "2026-03-30T10:00:00Z",
5
+ "agentName": "Travel Assistant",
6
+ "cliVersion": "1.3.0"
7
+ },
8
+ "items": [
9
+ {
10
+ "name": "Context Persistence Test",
11
+ "turns": [
12
+ {
13
+ "prompt": "I'm based in Seattle.",
14
+ "expected_response": "Got it! I can help with Seattle-specific questions.",
15
+ "response": "Understood! I can assist with Seattle-related queries.",
16
+ "scores": {
17
+ "relevance": {
18
+ "score": 4.0,
19
+ "result": "pass",
20
+ "threshold": 3,
21
+ "reason": "Response acknowledges Seattle context."
22
+ },
23
+ "coherence": {
24
+ "score": 5.0,
25
+ "result": "pass",
26
+ "threshold": 3
27
+ }
28
+ },
29
+ "status": "pass"
30
+ },
31
+ {
32
+ "prompt": "What's the weather like here?",
33
+ "expected_response": "Seattle weather is typically mild with rain.",
34
+ "response": "Seattle generally has mild temperatures with frequent rain, especially in fall and winter.",
35
+ "scores": {
36
+ "relevance": {
37
+ "score": 5.0,
38
+ "result": "pass",
39
+ "threshold": 3
40
+ },
41
+ "coherence": {
42
+ "score": 4.0,
43
+ "result": "pass",
44
+ "threshold": 3
45
+ }
46
+ },
47
+ "status": "pass"
48
+ }
49
+ ],
50
+ "conversation_id": "conv-abc-123",
51
+ "summary": {
52
+ "turns_total": 2,
53
+ "turns_passed": 2,
54
+ "turns_failed": 0,
55
+ "overall_status": "pass"
56
+ }
57
+ }
58
+ ]
59
+ }
@@ -0,0 +1,21 @@
1
+ {
2
+ "schemaVersion": "1.2.0",
3
+ "items": [
4
+ {
5
+ "turns": [
6
+ {
7
+ "prompt": "I'm traveling to Seattle next week for a conference.",
8
+ "expected_response": "I can help with travel-related questions."
9
+ },
10
+ {
11
+ "prompt": "What's the weather usually like?",
12
+ "expected_response": "Seattle weather is typically mild with some rain."
13
+ },
14
+ {
15
+ "prompt": "Should I bring a rain jacket?",
16
+ "expected_response": "Yes, Seattle is known for rain. A rain jacket is recommended."
17
+ }
18
+ ]
19
+ }
20
+ ]
21
+ }
@@ -0,0 +1,34 @@
1
+ {
2
+ "schemaVersion": "1.2.0",
3
+ "default_evaluators": {
4
+ "Relevance": {},
5
+ "Coherence": {}
6
+ },
7
+ "items": [
8
+ {
9
+ "name": "Expense Policy Flow",
10
+ "description": "Test that agent handles expense policy questions across turns",
11
+ "turns": [
12
+ {
13
+ "prompt": "I'm traveling to Seattle next week for a conference.",
14
+ "expected_response": "I can help with travel-related questions."
15
+ },
16
+ {
17
+ "prompt": "My dinner last night was $250. Is that okay?",
18
+ "expected_response": "The per-diem meal allowance is a maximum of $200.",
19
+ "evaluators": {
20
+ "Groundedness": { "threshold": 4 }
21
+ }
22
+ },
23
+ {
24
+ "prompt": "What should I do about the overage?",
25
+ "expected_response": "For expenses exceeding the policy limit, you'll need manager approval.",
26
+ "evaluators": {
27
+ "ExactMatch": { "case_sensitive": false }
28
+ },
29
+ "evaluators_mode": "replace"
30
+ }
31
+ ]
32
+ }
33
+ ]
34
+ }
@@ -1,6 +1,6 @@
1
1
  {
2
- "version": "1.1.0",
3
- "releaseDate": "2026-03-17",
2
+ "version": "1.2.0",
3
+ "releaseDate": "2026-04-02",
4
4
  "schemaId": "https://raw.githubusercontent.com/microsoft/M365-Copilot-Agent-Evals/refs/heads/main/schema/v1/eval-document.schema.json",
5
5
  "description": "M365 Copilot Eval Document Schema"
6
6
  }
@@ -0,0 +1,74 @@
1
+ """Interactive agent selection and agent-id utilities."""
2
+
3
+ from typing import Any, Dict, List, Optional, Tuple
4
+
5
+ import questionary
6
+
7
+ from cli_logging.cli_logger import emit_structured_log
8
+ from cli_logging.logging_utils import Operation
9
+
10
+
11
def normalize_agent_id(agent_id):
    """Ensure an agent id carries a '.'-separated suffix.

    A non-empty id that contains no '.' gets '.declarativeAgent' appended.
    Falsy values (None, '') and ids that already contain a '.' are handed
    back exactly as received.
    """
    if agent_id and '.' not in agent_id:
        return f"{agent_id}.declarativeAgent"
    return agent_id
19
+
20
+
21
def select_agent_interactively(agents: List[Dict[str, Any]]) -> Tuple[Optional[str], Optional[str]]:
    """
    Display an interactive agent selector using questionary.

    Agents are listed sorted by name. Entries whose ``gptId`` is missing or
    blank are skipped, and a structured warning is logged for each one.

    Args:
        agents: List of agent dictionaries. Keys read here are ``name``,
            ``gptId``, ``description``, ``isOwner`` and ``provider``.

    Returns:
        Tuple of (agent_id, agent_name) or (None, None) if cancelled/skipped
    """
    if not agents:
        return None, None

    # Use `or ""` instead of a .get() default: a default only applies when the
    # key is absent, so an explicit null name would otherwise reach sorted()
    # as None and raise TypeError when compared with a str.
    sorted_agents = sorted(agents, key=lambda a: a.get("name") or "")

    # Build id→name lookup and choices
    id_to_name: Dict[str, str] = {}
    choices = []
    for agent in sorted_agents:
        # Same reasoning as the sort key: fall back for null/empty names too.
        agent_name = agent.get("name") or "Unknown"
        agent_id = (agent.get("gptId") or "").strip()
        if not agent_id:
            emit_structured_log(
                "warning",
                f"Skipping agent '{agent_name}': missing or empty gptId.",
                operation=Operation.FETCH_AGENTS,
            )
            continue
        agent_description = agent.get("description")
        agent_is_owner = agent.get('isOwner')
        agent_provider = agent.get("provider")
        id_to_name[agent_id] = agent_name

        # Format the display text
        if agent_provider:
            title = f"{agent_name} - {agent_provider} ({agent_id})"
        else:
            title = f"{agent_name} ({agent_id})"
        segments = [title]
        if agent_is_owner:
            segments.append(f"IsOwner: {agent_is_owner}")
        if agent_description:
            segments.append(agent_description)
        display_text = " - ".join(segments)

        choices.append(questionary.Choice(title=display_text, value=agent_id))

    # Every agent was skipped for lacking a usable id.
    if not choices:
        return None, None

    # Display the selection prompt
    selected_agent = questionary.select(
        "Select an agent to evaluate:",
        choices=choices,
        use_shortcuts=len(choices) <= 35,
        use_arrow_keys=True
    ).ask()

    # No selection came back from the prompt (e.g. the user cancelled).
    if not selected_agent:
        return None, None
    return selected_agent, id_to_name.get(selected_agent)
@@ -0,0 +1,3 @@
1
+ from .a2a_client import A2AClient
2
+
3
+ __all__ = ["A2AClient"]