@microsoft/m365-copilot-eval 1.3.0-preview.1 → 1.5.0-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +135 -100
- package/package.json +7 -4
- package/schema/CHANGELOG.md +7 -0
- package/schema/v1/eval-document.schema.json +143 -11
- package/schema/v1/examples/invalid/multi-turn-empty-turns.json +8 -0
- package/schema/v1/examples/invalid/multi-turn-has-both-prompt-and-turns.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-missing-prompt.json +12 -0
- package/schema/v1/examples/invalid/multi-turn-typo-in-turn.json +13 -0
- package/schema/v1/examples/invalid/multi-turn-unknown-evaluator.json +15 -0
- package/schema/v1/examples/valid/mixed-single-and-multi-turn.json +30 -0
- package/schema/v1/examples/valid/multi-turn-output.json +59 -0
- package/schema/v1/examples/valid/multi-turn-simple.json +21 -0
- package/schema/v1/examples/valid/multi-turn-with-evaluators.json +34 -0
- package/schema/version.json +2 -2
- package/src/clients/cli/agent_selector.py +74 -0
- package/src/clients/cli/api_clients/A2A/__init__.py +3 -0
- package/src/clients/cli/api_clients/A2A/a2a_client.py +475 -0
- package/src/clients/cli/api_clients/__init__.py +3 -0
- package/src/clients/cli/api_clients/base_agent_client.py +77 -0
- package/src/clients/cli/cli_args.py +136 -0
- package/src/clients/cli/cli_logging/cli_logger.py +33 -0
- package/src/clients/cli/cli_logging/console_diagnostics.py +56 -2
- package/src/clients/cli/cli_logging/logging_utils.py +0 -1
- package/src/clients/cli/common.py +64 -0
- package/src/clients/cli/env_validator.py +73 -0
- package/src/clients/cli/evaluation_runner.py +653 -0
- package/src/clients/cli/evaluator_resolver.py +9 -6
- package/src/clients/cli/generate_report.py +272 -129
- package/src/clients/cli/main.py +157 -1174
- package/src/clients/cli/parallel_executor.py +57 -0
- package/src/clients/cli/prompt_loader.py +148 -0
- package/src/clients/cli/readme.md +9 -53
- package/src/clients/cli/requirements.txt +1 -1
- package/src/clients/cli/response_extractor.py +4 -603
- package/src/clients/cli/result_writer.py +488 -0
- package/src/clients/cli/retry_policy.py +52 -0
- package/src/clients/cli/samples/multiturn_example.json +35 -0
- package/src/clients/cli/throttle_gate.py +82 -0
- package/src/clients/node-js/bin/runevals.js +82 -20
- package/src/clients/node-js/config/default.js +12 -11
- package/src/clients/node-js/lib/agent-id.js +12 -0
- package/src/clients/node-js/lib/env-loader.js +14 -20
- package/src/clients/node-js/lib/eula-manager.js +78 -0
- package/src/clients/node-js/lib/progress.js +13 -11
package/src/clients/cli/parallel_executor.py
@@ -0,0 +1,57 @@
+"""Parallel prompt execution utilities.
+
+This module provides a minimal reusable executor that preserves input order.
+"""
+
+from __future__ import annotations
+
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
+from typing import Any, Callable, Generic, Iterable, List, Optional, TypeVar
+
+
+T = TypeVar("T")
+R = TypeVar("R")
+
+
+@dataclass
+class WorkerResult(Generic[R]):
+    """Result model used to preserve input ordering and capture failures."""
+
+    index: int
+    value: Optional[R] = None
+    error: Optional[Exception] = None
+
+
+def execute_in_parallel(
+    items: Iterable[T],
+    worker: Callable[[T, int], R],
+    max_workers: int,
+) -> List[WorkerResult[R]]:
+    """Execute worker function in parallel while preserving input order.
+
+    `worker` receives `(item, index)` and returns a value.
+    """
+    indexed_items = list(enumerate(items))
+    if not indexed_items:
+        return []
+
+    normalized_workers = max(1, min(max_workers, len(indexed_items)))
+    results: List[WorkerResult[R]] = [WorkerResult(index=i) for i, _ in indexed_items]
+
+    with ThreadPoolExecutor(max_workers=normalized_workers) as executor:
+        future_map = {
+            executor.submit(worker, item, index): index
+            for index, item in indexed_items
+        }
+
+        for future in as_completed(future_map):
+            index = future_map[future]
+            try:
+                results[index] = WorkerResult(index=index, value=future.result())
+            except (KeyboardInterrupt, SystemExit):
+                raise
+            except Exception as exc:
+                results[index] = WorkerResult(index=index, error=exc)
+
+    return results
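The executor above surfaces per-item failures through `WorkerResult.error` and keeps results in input order even though futures complete out of order. A minimal usage sketch, assuming `parallel_executor.py` is on the import path; the `fetch_response` worker and the prompts are illustrative stand-ins, not part of the package:

```python
# Usage sketch for execute_in_parallel; fetch_response is hypothetical.
from parallel_executor import execute_in_parallel

def fetch_response(prompt: str, index: int) -> str:
    # A real worker would call the agent API here; any exception it raises
    # is captured per item instead of aborting the whole batch.
    return f"response {index}: {prompt}"

results = execute_in_parallel(
    items=["What is M365 Copilot?", "Summarize my inbox."],
    worker=fetch_response,
    max_workers=4,  # clamped internally to max(1, min(max_workers, len(items)))
)
for r in results:  # ordered by input index, not by completion time
    print(f"{r.index}: error={r.error!r} value={r.value!r}")
```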
package/src/clients/cli/prompt_loader.py
@@ -0,0 +1,148 @@
+"""Dataset loading (file, interactive)."""
+
+import json
+import sys
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import questionary
+
+from cli_logging.cli_logger import emit_structured_log
+from cli_logging.logging_utils import Operation
+from common import RunConfig
+from schema_handler import DocumentUpgrader
+
+
+def load_prompts_from_file(file_path: str) -> Tuple[List[Dict], Optional[Dict]]:
+    """Load prompts and expected responses from a JSON file.
+
+    Supports three formats:
+    1. Eval document: {"schemaVersion": "1.0.0", "items": [{"prompt": "..."}]}
+    2. Array format: [{"prompt": "...", "expected_response": "..."}]
+    3. Dict format: {"prompts": [...], "expected_responses": [...]}
+
+    For eval documents (format 1) and array format (format 2), schema validation
+    and auto-upgrade are applied via DocumentUpgrader.
+
+    Returns:
+        Tuple of (eval_items, default_evaluators). Items are dicts with prompt,
+        expected_response, and optional evaluators/evaluators_mode fields.
+    """
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+
+        # Detect if this is an eval document (has "items" key) or could be upgraded
+        is_eval_document = (
+            isinstance(data, dict) and "items" in data
+        ) or isinstance(data, list)
+
+        # Run schema validation and auto-upgrade for eval documents
+        if is_eval_document:
+            try:
+                upgrader = DocumentUpgrader()
+            except Exception as e:
+                # Schema infrastructure not available (missing files, etc.) — skip
+                emit_structured_log("warning", f"Unable to initialize document upgrader: {e}", operation=Operation.LOAD_PROMPTS)
+                upgrader = None
+
+            if upgrader is not None:
+                result = upgrader.upgrade(Path(file_path))
+
+                if result.error:
+                    emit_structured_log("error", f"Schema validation error: {result.error}", operation=Operation.LOAD_PROMPTS)
+                    sys.exit(1)
+
+                if result.upgraded and result.message:
+                    emit_structured_log("info", result.message, operation=Operation.LOAD_PROMPTS)
+
+                # Use the parsed document from the upgrade result
+                if result.document is not None:
+                    data = result.document
+
+        if isinstance(data, list):
+            # Format: [{"prompt": "...", "expected_response": "..."}, ...]
+            return data, None
+        elif isinstance(data, dict):
+            if "items" in data:
+                # Eval document format: {"schemaVersion": "...", "items": [...]}
+                return data["items"], data.get("default_evaluators")
+            else:
+                # Format: {"prompts": [...], "expected_responses": [...]}
+                prompts = data.get("prompts", [])
+                expected_responses = data.get("expected_responses", [])
+                eval_items = [
+                    {"prompt": p, "expected_response": e}
+                    for p, e in zip(prompts, expected_responses)
+                ]
+                return eval_items, None
+        else:
+            raise ValueError("Invalid file format")
+    except SystemExit:
+        raise
+    except Exception as e:
+        emit_structured_log("error", f"Error loading prompts from file: {e}", operation=Operation.LOAD_PROMPTS)
+        sys.exit(1)
+
+
+def get_interactive_prompts() -> Tuple[List[str], List[str]]:
+    """Get prompts and expected responses interactively."""
+    prompts = []
+    expected_responses = []
+
+    print("Interactive mode: Enter your prompts and expected responses.")
+    print("Press Enter with empty prompt to finish.")
+
+    while True:
+        prompt = input(f"\nPrompt {len(prompts) + 1}: ").strip()
+        if not prompt:
+            break
+
+        expected = input(f"Expected response {len(expected_responses) + 1}: ").strip()
+
+        prompts.append(prompt)
+        expected_responses.append(expected)
+
+    if not prompts:
+        print("No prompts entered. Exiting.")
+        sys.exit(1)
+
+    return prompts, expected_responses
+
+
+def get_prompt_datasets(config: RunConfig) -> Tuple[List[Dict], Optional[Dict]]:
+    """Get prompts and expected responses based on command line arguments.
+
+    Returns:
+        Tuple of (eval_items, default_evaluators).
+    """
+    if config.prompts:
+        if config.expected and len(config.prompts) != len(config.expected):
+            emit_structured_log(
+                "error",
+                "Number of prompts must match number of expected responses. "
+                "Update --expected values to match the prompt count.",
+            )
+            sys.exit(1)
+        expected_responses = config.expected or [""] * len(config.prompts)
+        eval_items = [
+            {"prompt": p, "expected_response": e}
+            for p, e in zip(config.prompts, expected_responses)
+        ]
+        return eval_items, None
+    elif config.prompts_file:
+        return load_prompts_from_file(config.prompts_file)
+    elif config.interactive:
+        prompts, expected_responses = get_interactive_prompts()
+        eval_items = [
+            {"prompt": p, "expected_response": e}
+            for p, e in zip(prompts, expected_responses)
+        ]
+        return eval_items, None
+    else:
+        emit_structured_log(
+            "error",
+            "No prompts provided. Use --prompts, --prompts-file, or --interactive.",
+            operation=Operation.SETUP,
+        )
+        sys.exit(1)
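The loader's docstring enumerates the three on-disk shapes it accepts. A sketch of sample inputs for each, with hypothetical file names; the commented-out call assumes the CLI source directory is the working directory so `prompt_loader`'s own imports (`schema_handler`, `cli_logging`) resolve:

```python
# Illustrative datasets for the three formats load_prompts_from_file accepts.
import json

samples = {
    "eval_document.json": {  # format 1: eval document with "items"
        "schemaVersion": "1.0.0",
        "items": [{"prompt": "Summarize Q3 status", "expected_response": "A short summary"}],
    },
    "array.json": [  # format 2: bare array of items
        {"prompt": "List open tasks", "expected_response": "A task list"},
    ],
    "parallel.json": {  # format 3: parallel arrays, zipped into items on load
        "prompts": ["Draft a status mail"],
        "expected_responses": ["A short status email"],
    },
}

for name, doc in samples.items():
    with open(name, "w", encoding="utf-8") as f:
        json.dump(doc, f, indent=2)

# from prompt_loader import load_prompts_from_file
# items, default_evaluators = load_prompts_from_file("eval_document.json")
```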
package/src/clients/cli/readme.md
@@ -7,7 +7,6 @@ Current evaluation metrics:
 - Coherence (1–5)
 - Groundedness (1–5)
 - Citations (count with pass/fail based on presence)
-- Tool Call Accuracy (1-5, evaluates correct tool usage when tools are invoked)
 
 ## 📋 Prerequisites
 
@@ -28,7 +27,7 @@ pip install -r requirements.txt
 
 ### 2. Set Up Environment Variables
 
-Create a `.env` file in the `src/clients/cli` directory (or export them).
+Create a `.env` file in the `src/clients/cli` directory (or export them). Use interactive WAM auth (Windows) to authenticate with the Copilot API.
 
 ```bash
 # Azure OpenAI (evaluation models)
@@ -37,17 +36,8 @@ AZURE_AI_API_KEY="<azure-openai-key>"
 AZURE_AI_API_VERSION="2024-12-01-preview"
 AZURE_AI_MODEL_NAME="gpt-4o-mini"
 
-#
-COPILOT_API_ENDPOINT="https://substrate.office.com/m365Copilot"  # CLI appends /chat
-X_SCENARIO_HEADER="<scenario-header>"  # e.g, officeweb
-
-# Auth option A: static access token (no prompt)
-COPILOT_API_ACCESS_TOKEN="<access-token>"
-
-# Auth option B: interactive WAM auth (used if COPILOT_API_ACCESS_TOKEN is empty)
-M365_EVAL_CLIENT_ID="<app-registration-client-id>"
+# Your Tenant Id
 TENANT_ID="<aad-tenant-id>"
-COPILOT_SCOPES="https://substrate.office.com/sydney/.default"
 
 # Optional: default agent id (overridable via --m365-agent-id)
 M365_AGENT_ID="00000000-0000-0000-0000-000000000000"
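The slimmed-down variable set pairs with the new `env_validator.py` in the file list above. A minimal sketch of that kind of fail-fast check, using the variable names from this hunk; it is illustrative only and does not reproduce the package's actual validator:

```python
# Hypothetical fail-fast check for the env vars named in the README diff.
import os
import sys

REQUIRED = ["AZURE_AI_API_KEY", "AZURE_AI_API_VERSION", "AZURE_AI_MODEL_NAME", "TENANT_ID"]
OPTIONAL = ["M365_AGENT_ID"]  # overridable via --m365-agent-id

def validate_env() -> list[str]:
    """Return the names of required variables that are missing or empty."""
    return [name for name in REQUIRED if not os.environ.get(name)]

if __name__ == "__main__":
    missing = validate_env()
    if missing:
        print(f"Missing required environment variables: {', '.join(missing)}", file=sys.stderr)
        sys.exit(1)
```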
@@ -151,39 +141,6 @@ python main.py --m365-agent-id "00000000-0000-0000-0000-000000000000"
 }
 ```
 
-## 🔧 Tool Call Accuracy Evaluation
-
-The CLI now includes advanced tool call accuracy evaluation that analyzes how effectively the agent uses available tools:
-
-### What It Evaluates
-- **Tool Selection**: Whether the agent chooses appropriate tools for the given task
-- **Parameter Accuracy**: Correctness of arguments passed to tool functions
-- **Tool Usage Patterns**: Overall effectiveness of tool invocation strategies
-
-### How It Works
-1. **Response Analysis**: Extracts tool calls and results from conversation telemetry
-2. **Tool Definitions**: Captures available tools from conversation metadata
-3. **Accuracy Assessment**: Uses Azure AI Evaluation SDK's ToolCallAccuracyEvaluator
-4. **Score Calculation**: Returns 1-5 score with pass/fail threshold (default: 3)
-
-### Enhanced Data Extraction
-The tool now extracts detailed information from agent responses:
-- **Message Flow**: Complete chronological sequence of tool calls and results
-- **Tool Definitions**: Available tools and their schemas
-- **Internal Filtering**: Removes framework-internal tools for cleaner analysis
-
-### Example Output
-```bash
-📊 Aggregate Statistics (3 prompts):
-════════════════════════════════════════════════════════════
-Tool Call Accuracy:
-  Pass Rate: 66.7% (2/3 passed)
-  Avg Score: 0.75
-  Threshold: 0.5
-```
-
-**Note**: Tool Call Accuracy evaluation only applies when the agent response includes tool invocations. For text-only responses, this metric will not be computed.
-
 ## 🔧 Configuration
 
 ### Getting Azure AI Foundry Configuration Values
@@ -224,7 +181,7 @@ M365-Copilot-Agent-Evals/
 │   └── clients/
 │       └── cli/
 │           ├── main.py                 # Main evaluation script
-│           ├── response_extractor.py  # Enhanced response parsing
+│           ├── response_extractor.py   # Enhanced response parsing
 │           ├── generate_report.py      # HTML report generation with aggregates
 │           ├── requirements.txt        # Python dependencies
 │           ├── readme.md               # This file
@@ -247,11 +204,10 @@ M365-Copilot-Agent-Evals/
 
 ## 📊 Features
 
-- **Chat Invocation**: Sends prompts
-- **Evaluation Metrics**: Relevance, Coherence, Groundedness, Citations
+- **Chat Invocation**: Sends prompts via the WorkIQ API
+- **Evaluation Metrics**: Relevance, Coherence, Groundedness, Citations
 - **Multi-format Citation Detection**: Supports both new OAI Unicode and legacy bracket formats
-
-- **Enhanced Response Extraction**: Detailed parsing of tool calls, results, and message flow
+- **Enhanced Response Extraction**: Detailed parsing of results, and message flow
 - **Aggregate Statistics**: Summary metrics across multiple prompts with pass/fail rates
 - **Colorized Console Output**
 - **Multiple Output Formats**: JSON, CSV, HTML with aggregate dashboards
@@ -265,7 +221,7 @@ M365-Copilot-Agent-Evals/
 
 ## 🔑 Authentication Requirements
 
--
+- WorkIQ A2A API: WAM interactive auth via `WORK_IQ_A2A_CLIENT_ID`, `TENANT_ID`, and `WORK_IQ_A2A_SCOPES`
 - Evaluators: Azure OpenAI key (`AZURE_AI_API_KEY`)
 
 ## 📚 Useful Resources
@@ -282,9 +238,9 @@ M365-Copilot-Agent-Evals/
 
 1. **Authentication Errors**: Ensure your Azure credentials are properly configured and you have access to the resources.
 
-2. **HTTP 401 / 403**:
+2. **HTTP 401 / 403**: Confirm `WORK_IQ_A2A_CLIENT_ID`, `TENANT_ID`, and `WORK_IQ_A2A_SCOPES` are correct. Re-run `--signout` to clear cached tokens and re-authenticate.
 
-3. **Endpoint Issues**: Confirm `
+3. **Endpoint Issues**: Confirm `WORK_IQ_A2A_ENDPOINT` is reachable (try `curl` / `Invoke-WebRequest`).
 
 ### Getting Help
 
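For troubleshooting item 3, a rough Python equivalent of the suggested `curl` probe, assuming `WORK_IQ_A2A_ENDPOINT` holds a full URL (sketch only; an HTTP error such as 401 still demonstrates the host is reachable):

```python
# Rough Python equivalent of `curl <endpoint>` for troubleshooting step 3.
import os
import urllib.error
import urllib.request

endpoint = os.environ.get("WORK_IQ_A2A_ENDPOINT", "")
if not endpoint:
    print("WORK_IQ_A2A_ENDPOINT is not set")
else:
    try:
        with urllib.request.urlopen(endpoint, timeout=10) as resp:
            print(f"Reachable: HTTP {resp.status}")
    except urllib.error.HTTPError as e:
        # A status code came back, so the endpoint is reachable (e.g., 401 without auth).
        print(f"Reachable: HTTP {e.code}")
    except (urllib.error.URLError, OSError) as e:
        print(f"Not reachable: {e}")
```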