@microsoft/m365-copilot-eval 1.4.0-preview.1 → 1.5.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,148 @@
+ """Dataset loading (file, interactive)."""
+
+ import json
+ import sys
+ from pathlib import Path
+ from typing import Dict, List, Optional, Tuple
+
+ import questionary
+
+ from cli_logging.cli_logger import emit_structured_log
+ from cli_logging.logging_utils import Operation
+ from common import RunConfig
+ from schema_handler import DocumentUpgrader
+
+
+ def load_prompts_from_file(file_path: str) -> Tuple[List[Dict], Optional[Dict]]:
+     """Load prompts and expected responses from a JSON file.
+
+     Supports three formats:
+     1. Eval document: {"schemaVersion": "1.0.0", "items": [{"prompt": "..."}]}
+     2. Array format: [{"prompt": "...", "expected_response": "..."}]
+     3. Dict format: {"prompts": [...], "expected_responses": [...]}
+
+     For eval documents (format 1) and array format (format 2), schema validation
+     and auto-upgrade are applied via DocumentUpgrader.
+
+     Returns:
+         Tuple of (eval_items, default_evaluators). Items are dicts with prompt,
+         expected_response, and optional evaluators/evaluators_mode fields.
+     """
+     try:
+         with open(file_path, 'r', encoding='utf-8') as f:
+             data = json.load(f)
+
+         # Detect if this is an eval document (has "items" key) or could be upgraded
+         is_eval_document = (
+             isinstance(data, dict) and "items" in data
+         ) or isinstance(data, list)
+
+         # Run schema validation and auto-upgrade for eval documents
+         if is_eval_document:
+             try:
+                 upgrader = DocumentUpgrader()
+             except Exception as e:
+                 # Schema infrastructure not available (missing files, etc.) — skip
+                 emit_structured_log("warning", f"Unable to initialize document upgrader: {e}", operation=Operation.LOAD_PROMPTS)
+                 upgrader = None
+
+             if upgrader is not None:
+                 result = upgrader.upgrade(Path(file_path))
+
+                 if result.error:
+                     emit_structured_log("error", f"Schema validation error: {result.error}", operation=Operation.LOAD_PROMPTS)
+                     sys.exit(1)
+
+                 if result.upgraded and result.message:
+                     emit_structured_log("info", result.message, operation=Operation.LOAD_PROMPTS)
+
+                 # Use the parsed document from the upgrade result
+                 if result.document is not None:
+                     data = result.document
+
+         if isinstance(data, list):
+             # Format: [{"prompt": "...", "expected_response": "..."}, ...]
+             return data, None
+         elif isinstance(data, dict):
+             if "items" in data:
+                 # Eval document format: {"schemaVersion": "...", "items": [...]}
+                 return data["items"], data.get("default_evaluators")
+             else:
+                 # Format: {"prompts": [...], "expected_responses": [...]}
+                 prompts = data.get("prompts", [])
+                 expected_responses = data.get("expected_responses", [])
+                 eval_items = [
+                     {"prompt": p, "expected_response": e}
+                     for p, e in zip(prompts, expected_responses)
+                 ]
+                 return eval_items, None
+         else:
+             raise ValueError("Invalid file format")
+     except SystemExit:
+         raise
+     except Exception as e:
+         emit_structured_log("error", f"Error loading prompts from file: {e}", operation=Operation.LOAD_PROMPTS)
+         sys.exit(1)
+
+
+ def get_interactive_prompts() -> Tuple[List[str], List[str]]:
+     """Get prompts and expected responses interactively."""
+     prompts = []
+     expected_responses = []
+
+     print("Interactive mode: Enter your prompts and expected responses.")
+     print("Press Enter with empty prompt to finish.")
+
+     while True:
+         prompt = input(f"\nPrompt {len(prompts) + 1}: ").strip()
+         if not prompt:
+             break
+
+         expected = input(f"Expected response {len(expected_responses) + 1}: ").strip()
+
+         prompts.append(prompt)
+         expected_responses.append(expected)
+
+     if not prompts:
+         print("No prompts entered. Exiting.")
+         sys.exit(1)
+
+     return prompts, expected_responses
+
+
+ def get_prompt_datasets(config: RunConfig) -> Tuple[List[Dict], Optional[Dict]]:
+     """Get prompts and expected responses based on command line arguments.
+
+     Returns:
+         Tuple of (eval_items, default_evaluators).
+     """
+     if config.prompts:
+         if config.expected and len(config.prompts) != len(config.expected):
+             emit_structured_log(
+                 "error",
+                 "Number of prompts must match number of expected responses. "
+                 "Update --expected values to match the prompt count.",
+             )
+             sys.exit(1)
+         expected_responses = config.expected or [""] * len(config.prompts)
+         eval_items = [
+             {"prompt": p, "expected_response": e}
+             for p, e in zip(config.prompts, expected_responses)
+         ]
+         return eval_items, None
+     elif config.prompts_file:
+         return load_prompts_from_file(config.prompts_file)
+     elif config.interactive:
+         prompts, expected_responses = get_interactive_prompts()
+         eval_items = [
+             {"prompt": p, "expected_response": e}
+             for p, e in zip(prompts, expected_responses)
+         ]
+         return eval_items, None
+     else:
+         emit_structured_log(
+             "error",
+             "No prompts provided. Use --prompts, --prompts-file, or --interactive.",
+             operation=Operation.SETUP,
+         )
+         sys.exit(1)
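The three accepted formats are easiest to see in use. Below is a minimal sketch exercising the new loader with the array format (format 2); the module name `dataset_loader` is hypothetical, since the diff does not show the new file's path, and it assumes the package's `cli_logging`/`schema_handler` imports resolve:

```python
# Sketch only: feed load_prompts_from_file an array-format (format 2) file.
# The module name `dataset_loader` is hypothetical -- the diff does not show
# the new file's name -- and this assumes the package's imports resolve.
import json
import tempfile

from dataset_loader import load_prompts_from_file  # hypothetical module name

items = [
    {"prompt": "Summarize my unread mail.", "expected_response": "A short summary"},
    {"prompt": "List today's meetings.", "expected_response": "A list of meetings"},
]

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(items, f)
    path = f.name

# Array format: items come back as-is, with no default evaluators.
eval_items, default_evaluators = load_prompts_from_file(path)
assert default_evaluators is None
print(eval_items[0]["prompt"])  # Summarize my unread mail.
```

An eval document (format 1) would instead return `data["items"]` plus any `default_evaluators` key, as the branches in the function above show.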
@@ -7,7 +7,6 @@ Current evaluation metrics:
  - Coherence (1–5)
  - Groundedness (1–5)
  - Citations (count with pass/fail based on presence)
- - Tool Call Accuracy (1-5, evaluates correct tool usage when tools are invoked)

  ## 📋 Prerequisites

@@ -28,7 +27,7 @@ pip install -r requirements.txt

  ### 2. Set Up Environment Variables

- Create a `.env` file in the `src/clients/cli` directory (or export them). Choose **one** auth path for the Copilot API: either provide a pre-issued token or use interactive WAM auth (Windows).
+ Create a `.env` file in the `src/clients/cli` directory (or export the variables). Use interactive WAM auth (Windows) to authenticate with the Copilot API.

  ```bash
  # Azure OpenAI (evaluation models)
@@ -37,17 +36,8 @@ AZURE_AI_API_KEY="<azure-openai-key>"
  AZURE_AI_API_VERSION="2024-12-01-preview"
  AZURE_AI_MODEL_NAME="gpt-4o-mini"

- # Copilot Chat API (response generation)
- COPILOT_API_ENDPOINT="https://substrate.office.com/m365Copilot" # CLI appends /chat
- X_SCENARIO_HEADER="<scenario-header>" # e.g, officeweb
-
- # Auth option A: static access token (no prompt)
- COPILOT_API_ACCESS_TOKEN="<access-token>"
-
- # Auth option B: interactive WAM auth (used if COPILOT_API_ACCESS_TOKEN is empty)
- M365_EVAL_CLIENT_ID="<app-registration-client-id>"
+ # Your Tenant Id
  TENANT_ID="<aad-tenant-id>"
- COPILOT_SCOPES="https://substrate.office.com/sydney/.default"

  # Optional: default agent id (overridable via --m365-agent-id)
  M365_AGENT_ID="00000000-0000-0000-0000-000000000000"
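The diff doesn't show how the CLI reads these values, but before a run it can be worth checking that everything the `.env` example defines is actually set. A minimal sketch, assuming the variables were exported or otherwise loaded into the process environment:

```python
# Sketch only, not part of the package: fail fast if any variable from the
# .env example above is missing from the environment.
import os
import sys

REQUIRED = [
    "AZURE_AI_API_KEY",
    "AZURE_AI_API_VERSION",
    "AZURE_AI_MODEL_NAME",
    "TENANT_ID",
]

missing = [name for name in REQUIRED if not os.environ.get(name)]
if missing:
    sys.exit(f"Missing environment variables: {', '.join(missing)}")
print("Environment looks complete.")
```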
@@ -151,39 +141,6 @@ python main.py --m365-agent-id "00000000-0000-0000-0000-000000000000"
  }
  ```

- ## 🔧 Tool Call Accuracy Evaluation
-
- The CLI now includes advanced tool call accuracy evaluation that analyzes how effectively the agent uses available tools:
-
- ### What It Evaluates
- - **Tool Selection**: Whether the agent chooses appropriate tools for the given task
- - **Parameter Accuracy**: Correctness of arguments passed to tool functions
- - **Tool Usage Patterns**: Overall effectiveness of tool invocation strategies
-
- ### How It Works
- 1. **Response Analysis**: Extracts tool calls and results from conversation telemetry
- 2. **Tool Definitions**: Captures available tools from conversation metadata
- 3. **Accuracy Assessment**: Uses Azure AI Evaluation SDK's ToolCallAccuracyEvaluator
- 4. **Score Calculation**: Returns 1-5 score with pass/fail threshold (default: 3)
-
- ### Enhanced Data Extraction
- The tool now extracts detailed information from agent responses:
- - **Message Flow**: Complete chronological sequence of tool calls and results
- - **Tool Definitions**: Available tools and their schemas
- - **Internal Filtering**: Removes framework-internal tools for cleaner analysis
-
- ### Example Output
- ```bash
- 📊 Aggregate Statistics (3 prompts):
- ════════════════════════════════════════════════════════════
- Tool Call Accuracy:
-   Pass Rate: 66.7% (2/3 passed)
-   Avg Score: 0.75
-   Threshold: 0.5
- ```
-
- **Note**: Tool Call Accuracy evaluation only applies when the agent response includes tool invocations. For text-only responses, this metric will not be computed.
-
  ## 🔧 Configuration

  ### Getting Azure AI Foundry Configuration Values
@@ -224,7 +181,7 @@ M365-Copilot-Agent-Evals/
  │   └── clients/
  │       └── cli/
  │           ├── main.py                # Main evaluation script
- │           ├── response_extractor.py  # Enhanced response parsing and tool extraction
+ │           ├── response_extractor.py  # Enhanced response parsing
  │           ├── generate_report.py     # HTML report generation with aggregates
  │           ├── requirements.txt       # Python dependencies
  │           ├── readme.md              # This file
@@ -247,11 +204,10 @@ M365-Copilot-Agent-Evals/

  ## 📊 Features

- - **Chat Invocation**: Sends prompts to the Sydney chat API
- - **Evaluation Metrics**: Relevance, Coherence, Groundedness, Citations, Tool Call Accuracy
+ - **Chat Invocation**: Sends prompts via the WorkIQ API
+ - **Evaluation Metrics**: Relevance, Coherence, Groundedness, Citations
  - **Multi-format Citation Detection**: Supports both new OAI Unicode and legacy bracket formats
- - **Tool Call Analysis**: Extracts and evaluates tool invocations from conversation telemetry
- - **Enhanced Response Extraction**: Detailed parsing of tool calls, results, and message flow
+ - **Enhanced Response Extraction**: Detailed parsing of results and message flow
  - **Aggregate Statistics**: Summary metrics across multiple prompts with pass/fail rates
  - **Colorized Console Output**
  - **Multiple Output Formats**: JSON, CSV, HTML with aggregate dashboards
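The README doesn't include the patterns behind the multi-format citation detection listed above. A rough sketch of what detecting both formats can look like, assuming "OAI Unicode" means the 【...†...】 marker characters and the legacy format means `[1]`-style brackets (the package's actual patterns may differ):

```python
# Illustrative only -- not the package's implementation. Assumes "OAI
# Unicode" citations look like 【4:0†source】 and legacy ones like [1].
import re

OAI_UNICODE = re.compile(r"\u3010[^\u3011]*\u2020[^\u3011]*\u3011")  # 【...†...】
LEGACY_BRACKET = re.compile(r"\[\d+\]")                              # [1]

def count_citations(text: str) -> int:
    """Count citations in either format; presence implies a citation pass."""
    return len(OAI_UNICODE.findall(text)) + len(LEGACY_BRACKET.findall(text))

print(count_citations("See 【4:0†report.docx】 and also [2]."))  # 2
```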
@@ -265,7 +221,7 @@ M365-Copilot-Agent-Evals/

  ## 🔑 Authentication Requirements

- - Copilot API: Either a static `COPILOT_API_ACCESS_TOKEN` **or** WAM interactive auth via `M365_EVAL_CLIENT_ID` and `TENANT_ID` (optional `COPILOT_SCOPES`)
+ - WorkIQ A2A API: WAM interactive auth via `WORK_IQ_A2A_CLIENT_ID`, `TENANT_ID`, and `WORK_IQ_A2A_SCOPES`
  - Evaluators: Azure OpenAI key (`AZURE_AI_API_KEY`)

  ## 📚 Useful Resources
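The auth code itself isn't part of this diff; one common way to do WAM interactive auth from Python is MSAL's Windows broker support. A sketch under that assumption, reusing the variable names above (requires `pip install msal[broker]` and Windows):

```python
# Sketch of WAM interactive auth via MSAL's Windows broker; the package's
# actual auth implementation is not shown in this diff and may differ.
import os

import msal

app = msal.PublicClientApplication(
    os.environ["WORK_IQ_A2A_CLIENT_ID"],
    authority=f"https://login.microsoftonline.com/{os.environ['TENANT_ID']}",
    enable_broker_on_windows=True,  # route sign-in through WAM
)

result = app.acquire_token_interactive(
    scopes=os.environ["WORK_IQ_A2A_SCOPES"].split(),
    # Broker flows need a parent window; MSAL provides a console handle.
    parent_window_handle=msal.PublicClientApplication.CONSOLE_WINDOW_HANDLE,
)
print("Token acquired" if "access_token" in result else result.get("error_description"))
```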
@@ -282,9 +238,9 @@ M365-Copilot-Agent-Evals/

  1. **Authentication Errors**: Ensure your Azure credentials are properly configured and you have access to the resources.

- 2. **HTTP 401 / 403**: If using a static token, verify `COPILOT_API_ACCESS_TOKEN` and required headers. If using WAM, confirm `M365_EVAL_CLIENT_ID`, `X_SCENARIO_HEADER`, `TENANT_ID`, and `COPILOT_SCOPES` are correct.
+ 2. **HTTP 401 / 403**: Confirm `WORK_IQ_A2A_CLIENT_ID`, `TENANT_ID`, and `WORK_IQ_A2A_SCOPES` are correct. Re-run with `--signout` to clear cached tokens and re-authenticate.

- 3. **Endpoint Issues**: Confirm `COPILOT_API_ENDPOINT` (base URL without `/chat`) is reachable (try `curl` / `Invoke-WebRequest`).
+ 3. **Endpoint Issues**: Confirm `WORK_IQ_A2A_ENDPOINT` is reachable (try `curl` / `Invoke-WebRequest`).

  ### Getting Help