@microsoft/m365-copilot-eval 1.0.1-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +415 -0
- package/TERMS.txt +65 -0
- package/package.json +82 -0
- package/src/clients/cli/auth/__init__.py +1 -0
- package/src/clients/cli/auth/auth_handler.py +262 -0
- package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +136 -0
- package/src/clients/cli/custom_evaluators/ConcisenessNonLLMEvaluator.py +18 -0
- package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +25 -0
- package/src/clients/cli/custom_evaluators/PII/PII.py +45 -0
- package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +39 -0
- package/src/clients/cli/custom_evaluators/__init__.py +1 -0
- package/src/clients/cli/demo_usage.py +83 -0
- package/src/clients/cli/generate_report.py +251 -0
- package/src/clients/cli/main.py +766 -0
- package/src/clients/cli/readme.md +301 -0
- package/src/clients/cli/requirements.txt +10 -0
- package/src/clients/cli/response_extractor.py +589 -0
- package/src/clients/cli/samples/PartnerSuccess.json +122 -0
- package/src/clients/cli/samples/example_prompts.json +14 -0
- package/src/clients/cli/samples/example_prompts_alt.json +12 -0
- package/src/clients/cli/samples/prompts_ambiguity.json +22 -0
- package/src/clients/cli/samples/prompts_rag_grounding.json +22 -0
- package/src/clients/cli/samples/prompts_security_injection.json +22 -0
- package/src/clients/cli/samples/prompts_tool_use_negatives.json +22 -0
- package/src/clients/cli/samples/psaSample.json +18 -0
- package/src/clients/cli/samples/starter.json +10 -0
- package/src/clients/node-js/bin/runevals.js +505 -0
- package/src/clients/node-js/config/default.js +25 -0
- package/src/clients/node-js/lib/cache-utils.js +119 -0
- package/src/clients/node-js/lib/expiry-check.js +164 -0
- package/src/clients/node-js/lib/index.js +25 -0
- package/src/clients/node-js/lib/python-runtime.js +253 -0
- package/src/clients/node-js/lib/venv-manager.js +242 -0
package/src/clients/cli/main.py
@@ -0,0 +1,766 @@
import json
import os
import argparse
import sys
import csv
import webbrowser
import urllib.request
import urllib.error
import urllib.parse
import questionary
from enum import Enum
from typing import List, Dict, Tuple, Optional, Any
from azure.ai.evaluation import (
    AzureOpenAIModelConfiguration,
    RelevanceEvaluator,
    CoherenceEvaluator,
    GroundednessEvaluator,
    ToolCallAccuracyEvaluator
)
from dotenv import load_dotenv
from auth.auth_handler import AuthHandler
from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat
#from custom_evaluators.ConcisenessNonLLMEvaluator import ConcisenessNonLLMEvaluator
#from custom_evaluators.PII.PII import PIIEvaluator
from generate_report import generate_html_report, calculate_aggregate_statistics
from response_extractor import extract_enhanced_responses, get_response_text_for_evaluation
from datetime import datetime, timezone

# Allowed endpoints for URL validation
ALLOWED_ENDPOINTS = [
    'substrate.office.com'
]

class CallPath(Enum):
    """ Enum to indicate which call path to use. """
    ACCESS_TOKEN = "access_token"
    COPILOT_AUTH = "copilot_auth"

def write_results_to_html(results: List[Dict], output_file: str):
    """Write results to HTML file using generate_html_report from generate_report.py."""
    try:
        html = generate_html_report(results)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(html)
        print(f"HTML report saved to {output_file}")
    except Exception as e:
        print(f"Error writing to HTML file: {e}")
        sys.exit(1)

def get_default_prompts_and_responses():
    """Get a list of prompts and responses."""
    prompts = [
        "What is Microsoft Graph?"
    ]
    expected_responses = [
        "Microsoft Graph is a gateway to data and intelligence in Microsoft 365."
    ]
    return prompts, expected_responses

def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
    """Load prompts and expected responses from a JSON file."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if isinstance(data, list):
            # Format: [{"prompt": "...", "expected_response": "..."}, ...]
            prompts = [item.get("prompt", "") for item in data]
            expected_responses = [item.get("expected_response", "") for item in data]
        elif isinstance(data, dict):
            # Format: {"prompts": [...], "expected_responses": [...]}
            prompts = data.get("prompts", [])
            expected_responses = data.get("expected_responses", [])
        else:
            raise ValueError("Invalid file format")

        return prompts, expected_responses
    except Exception as e:
        print(f"Error loading prompts from file: {e}")
        sys.exit(1)

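# Illustrative example only: a prompts file accepted by load_prompts_from_file can use
# either shape parsed above (field names are taken from the code; values are placeholders).
#
#   [{"prompt": "What is Microsoft Graph?",
#     "expected_response": "Microsoft Graph is a gateway to data and intelligence in Microsoft 365."}]
#
#   {"prompts": ["What is Microsoft Graph?"],
#    "expected_responses": ["Microsoft Graph is a gateway to data and intelligence in Microsoft 365."]}
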
def get_interactive_prompts() -> Tuple[List[str], List[str]]:
    """Get prompts and expected responses interactively."""
    prompts = []
    expected_responses = []

    print("Interactive mode: Enter your prompts and expected responses.")
    print("Press Enter with empty prompt to finish.")

    while True:
        prompt = input(f"\nPrompt {len(prompts) + 1}: ").strip()
        if not prompt:
            break

        expected = input(f"Expected response {len(expected_responses) + 1}: ").strip()

        prompts.append(prompt)
        expected_responses.append(expected)

    if not prompts:
        print("No prompts entered. Exiting.")
        sys.exit(1)

    return prompts, expected_responses

def run_evaluations(args, responses: dict, expected_responses: list) -> list:
    """Run evaluations against the responses."""
    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint=os.environ.get("AZURE_AI_OPENAI_ENDPOINT"),
        api_key=os.environ.get("AZURE_AI_API_KEY"),
        api_version=os.environ.get("AZURE_AI_API_VERSION"),
        azure_deployment=os.environ.get("AZURE_AI_MODEL_NAME"),
    )

    # Initialize evaluators
    relevance_evaluator = RelevanceEvaluator(model_config=model_config)  # Evaluate relevance for a given response. Range is 1 - 5.
    coherence_evaluator = CoherenceEvaluator(model_config=model_config)  # Measures the coherence (human-like quality) of the response. Range is 1 - 5.
    groundedness_evaluator = GroundednessEvaluator(model_config=model_config)  # Evaluates the response for factuality and groundedness against provided context. Range is 1 - 5.
    #concisenessnonllm_evaluator = ConcisenessNonLLMEvaluator()  # Evaluates the response for conciseness. Range is 1 - 5.
    #pii_evaluator = PIIEvaluator(model_config=model_config)  # Evaluates the response for presence of PII. Range
    # Parse citation format from args
    citation_format = CitationFormat.OAI_UNICODE if args.citation_format == 'oai_unicode' else CitationFormat.LEGACY_BRACKET
    citations_evaluator = CitationsEvaluator(citation_format=citation_format)  # Evaluates citations present in the response using regex pattern matching

    tool_call_accuracy_evaluator = ToolCallAccuracyEvaluator(model_config)  # Evaluate tool call accuracy if tool definitions are present in response


    PASS_THRESHOLD = 3  # All evaluators must meet or exceed this value (out of 5) to pass

    def decorate_metric(metric_id: str, data):
        """Augment raw evaluator output with standardized threshold + pass/fail result."""
        payload = {}
        # Preserve original structure if dict
        if isinstance(data, dict):
            payload.update(data)
        else:
            payload['raw'] = data

        # Try to extract a numeric score
        score_val = None
        if isinstance(data, dict):
            for k in (metric_id, f"{metric_id}_score", 'score', 'value'):
                if k in data:
                    score_val = data[k]
                    break
        if isinstance(score_val, (int, float)):
            payload['threshold'] = PASS_THRESHOLD
            payload['result'] = 'pass' if score_val >= PASS_THRESHOLD else 'fail'
        else:
            # If we cannot determine score, mark unknown (no pass/fail)
            payload['threshold'] = PASS_THRESHOLD
            payload.setdefault('result', 'unknown')
        return json.dumps(payload, indent=4)

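    # For example (illustrative): decorate_metric("relevance", {"relevance": 4.0}) returns a
    # JSON string equivalent to {"relevance": 4.0, "threshold": 3, "result": "pass"}; if no
    # numeric score can be found, "result" is set to "unknown" instead.
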
    evaluation_results = []
    for prompt, expected_response in zip(responses.keys(), expected_responses):
        # Extract text response for evaluation (backward compatibility)
        enhanced_response = responses[prompt]
        actual_response_text = get_response_text_for_evaluation(enhanced_response)

        # Run evaluations using text response
        relevance_score = relevance_evaluator(
            query=prompt,
            response=actual_response_text
        )
        coherence_score = coherence_evaluator(
            query=prompt,
            response=actual_response_text
        )

        groundedness_score = groundedness_evaluator(
            response=actual_response_text,
            context=expected_response
        )

        #PII_score = pii_evaluator(response=actual_response_text)
        #concisenessNonLLM_score = concisenessnonllm_evaluator(response=actual_response_text)

        citations_score = citations_evaluator(
            response=actual_response_text
        )

        tool_call_accuracy = None
        if args.agent_id and enhanced_response.get("tool_definitions"):
            tool_call_accuracy = tool_call_accuracy_evaluator(
                query=prompt,
                response=enhanced_response.get("response", actual_response_text),
                tool_definitions=enhanced_response["tool_definitions"]
            )

        evaluation_result = {
            "prompt": prompt,
            "response": actual_response_text,  # Keep simple text for backward compatibility
            "expected_response": expected_response,
            "results": {
                "relevance_score": decorate_metric("relevance", relevance_score),
                "coherence_score": decorate_metric("coherence", coherence_score),
                "groundedness_score": decorate_metric("groundedness", groundedness_score),
                #"concisenessnonllm_score": decorate_metric("concisenessnonllm", concisenessNonLLM_score),
                #"pii_score": decorate_metric("pii", PII_score),
                "citations_score": json.dumps(citations_score, indent=4),
                "tool_call_accuracy_score": json.dumps(tool_call_accuracy, indent=4) if tool_call_accuracy else None
            }
        }

        if args.verbose:
            print(f".................................. Evaluation for prompt: {evaluation_result['prompt']} ..................................")
            print(f"Scores: {evaluation_result['results']}")
            print("...........................................................................................................................")

        evaluation_results.append(evaluation_result)

    return evaluation_results

def write_results_to_console(results):
    """Write the results to console."""
    # ANSI color codes
    BOLD = '\033[1m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    CYAN = '\033[96m'
    MAGENTA = '\033[95m'
    ORANGE = '\033[38;5;208m'
    RED = '\033[91m'
    RESET = '\033[0m'

    # Show aggregate statistics if multiple results
    if len(results) > 1:
        aggregates = calculate_aggregate_statistics(results)
        if aggregates:
            print(f"{BOLD}{BLUE}📊 Aggregate Statistics ({len(results)} prompts):{RESET}")
            print(f"{BLUE}{'=' * 60}{RESET}")

            for metric_name, stats in aggregates.items():
                pass_color = GREEN if stats['pass_rate'] >= 80 else YELLOW if stats['pass_rate'] >= 60 else RED
                print(f"{BOLD}{CYAN}{metric_name}:{RESET}")
                print(f"  Pass Rate: {pass_color}{stats['pass_rate']:.1f}%{RESET} ({stats['pass_count']}/{stats['total_evaluated']} passed)")
                print(f"  Avg Score: {MAGENTA}{stats['avg_score']:.2f}{RESET}")
                if stats.get('threshold') is not None:
                    print(f"  Threshold: {YELLOW}{stats['threshold']}{RESET}")
                print()

            print(f"{BLUE}{'=' * 60}{RESET}")
            print()

    print(f"{BOLD}{BLUE}📝 Individual Results:{RESET}")
    print(f"{BLUE}{'=' * 50}{RESET}")
    for i, result in enumerate(results, 1):
        print(f"{BOLD}{GREEN}Prompt {i}:{RESET} {result['prompt']}")
        print(f"{BOLD}{CYAN}Response:{RESET} {result['response']}")
        print(f"{BOLD}{YELLOW}Expected Response:{RESET} {result['expected_response']}")

        # Print metric scores generically from nested results (fallback to flat keys for back-compat)
        metrics = result.get('results') or {k: v for k, v in result.items() if isinstance(k, str) and k.endswith('_score')}
        if metrics:
            for k, v in metrics.items():
                name = k.replace('_', ' ')
                if 'relevance' in k:
                    color = MAGENTA
                elif 'coherence' in k:
                    color = ORANGE
                elif 'fluency' in k:
                    color = GREEN
                else:
                    color = BLUE
                print(f"{BOLD}{color}{name}:{RESET} {v}")
        print(f"{BLUE}{'-' * 30}{RESET}")

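# Illustrative shape only: the aggregate statistics consumed above come from
# calculate_aggregate_statistics in generate_report.py; the keys shown below are the ones
# this file reads, and actual output may contain more.
#
#   {"relevance_score": {"pass_rate": 100.0, "pass_count": 2, "fail_count": 0,
#                        "total_evaluated": 2, "avg_score": 4.5, "threshold": 3}}
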
def write_results_to_json(results: List[Dict], output_file: str):
    """Write results to JSON file."""
    try:
        output_data = {
            "individual_results": results
        }

        # Add aggregate statistics if multiple results
        if len(results) > 1:
            aggregates = calculate_aggregate_statistics(results)
            output_data["aggregate_statistics"] = aggregates

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)
        print(f"Results saved to {output_file}")
    except Exception as e:
        print(f"Error writing to JSON file: {e}")
        sys.exit(1)

def write_results_to_csv(results: List[Dict], output_file: str):
    """Write results to CSV file."""
    try:
        with open(output_file, 'w', newline='', encoding='utf-8') as f:
            if results:
                # Write aggregate statistics first if multiple results
                if len(results) > 1:
                    aggregates = calculate_aggregate_statistics(results)
                    if aggregates:
                        f.write("# AGGREGATE STATISTICS\n")
                        f.write("Metric,Pass Rate (%),Passed,Failed,Avg Score,Threshold\n")
                        for metric_name, stats in aggregates.items():
                            threshold_str = str(stats.get('threshold', 'N/A'))
                            f.write(f"{metric_name},{stats['pass_rate']:.1f},{stats['pass_count']},{stats['fail_count']},{stats['avg_score']:.2f},{threshold_str}\n")
                        f.write("\n# INDIVIDUAL RESULTS\n")

                # Write individual results
                writer = csv.DictWriter(f, fieldnames=results[0].keys())
                writer.writeheader()
                writer.writerows(results)
        print(f"Results saved to {output_file}")
    except Exception as e:
        print(f"Error writing to CSV file: {e}")
        sys.exit(1)

def parse_arguments():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(
        description="M365 Copilot Agent Evaluation CLI",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run with default prompts
  python main.py

  # Run with custom prompts
  python main.py --prompts "What is Microsoft Graph?" --expected "Microsoft Graph is a gateway..."

  # Run with prompts from file
  python main.py --prompts-file prompts.json

  # Interactive mode
  python main.py --interactive

  # Save results to JSON
  python main.py --output results.json

  # Save results to CSV
  python main.py --output results.csv

  # Save results to HTML and open in browser
  python main.py --output report.html

  # Verbose output
  python main.py --verbose

  # Sign out and clear cached authentication tokens
  python main.py --signout
        """
    )

    # Input options (mutually exclusive)
    input_group = parser.add_mutually_exclusive_group()
    input_group.add_argument(
        '--prompts',
        nargs='+',
        help='List of prompts to evaluate'
    )
    input_group.add_argument(
        '--prompts-file',
        type=str,
        help='JSON file containing prompts and expected responses'
    )
    input_group.add_argument(
        '--interactive',
        action='store_true',
        help='Interactive mode to enter prompts'
    )

    # Expected responses (only used with --prompts)
    parser.add_argument(
        '--expected',
        nargs='+',
        help='List of expected responses (must match number of prompts)'
    )

    # Agent ID
    parser.add_argument(
        '--agent-id',
        type=str,
        default=os.environ.get("AGENT_ID"),
        help='Azure AI Agent ID (default from environment variable)'
    )

    # Output options
    parser.add_argument(
        '--output',
        type=str,
        help='Output file path. Format is determined by file extension: .json, .csv, .html. If not provided, results are printed to console.'
    )

    # Behavior options
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Enable verbose output'
    )
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Suppress non-essential output'
    )
    parser.add_argument(
        '--citation-format',
        choices=['oai_unicode', 'legacy_bracket'],
        default='oai_unicode',
        help='Citation format to detect. "oai_unicode" for new OAI format (default), "legacy_bracket" for old [^i^] format'
    )
    parser.add_argument(
        '--signout',
        action='store_true',
        help='Sign out and clear cached authentication tokens'
    )

    return parser.parse_args()

def validate_environment() -> CallPath:
    """Validate required environment variables."""
    required_env_vars = [
        "AZURE_AI_OPENAI_ENDPOINT",
        "AZURE_AI_API_KEY",
        "AZURE_AI_API_VERSION",
        "AZURE_AI_MODEL_NAME",
        # Chat API specific
        "COPILOT_API_ENDPOINT",
        "X_SCENARIO_HEADER"
    ]

    if os.environ.get("COPILOT_API_ACCESS_TOKEN"):
        call_path = CallPath.ACCESS_TOKEN
        required_env_vars.append("COPILOT_API_ACCESS_TOKEN")
    else:
        call_path = CallPath.COPILOT_AUTH
        required_env_vars.extend([
            "M365_EVAL_CLIENT_ID",
            "TENANT_ID"
        ])

    missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
    if missing_vars:
        print(f"Error: Missing required environment variables: {', '.join(missing_vars)}")
        print("Please ensure your .env file contains all required Azure configuration.")
        sys.exit(1)
    return call_path

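# Illustrative .env sketch covering the variables checked above (variable names come from
# the code; values are placeholders. COPILOT_SCOPES and AGENT_ID are optional and read
# elsewhere in this file.)
#
#   AZURE_AI_OPENAI_ENDPOINT=https://<your-resource>.openai.azure.com/
#   AZURE_AI_API_KEY=<key>
#   AZURE_AI_API_VERSION=<api-version>
#   AZURE_AI_MODEL_NAME=<deployment-name>
#   COPILOT_API_ENDPOINT=https://substrate.office.com/<path>
#   X_SCENARIO_HEADER=<scenario>
#   # Either a pre-acquired token ...
#   COPILOT_API_ACCESS_TOKEN=<token>
#   # ... or interactive sign-in:
#   M365_EVAL_CLIENT_ID=<client-id>
#   TENANT_ID=<tenant-id>
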
def validate_endpoint_url(url: str, allowed_domains: List[str]) -> bool:
    """Validate URL against security requirements."""
    try:
        parsed = urllib.parse.urlparse(url)

        # Check for dangerous schemes
        if parsed.scheme in ['javascript', 'data']:
            raise ValueError(f"Dangerous URL scheme detected: {parsed.scheme}")

        # Check for HTTPS requirement
        if parsed.scheme != 'https':
            raise ValueError(f"Only HTTPS URLs are allowed, got: {parsed.scheme}")

        # Check if domain is in allowed list
        if parsed.netloc not in allowed_domains:
            raise ValueError(f"Domain not in allowed list: {parsed.netloc}")

        # Reject fragment URLs
        if parsed.fragment:
            raise ValueError("Fragment URLs are not allowed")

        return True

    except ValueError:
        # Re-raise ValueError exceptions
        raise
    except Exception as e:
        # Convert other parsing errors to ValueError
        raise ValueError(f"Invalid URL format: {url}") from e

def get_prompt_datasets(args) -> Tuple[List[str], List[str]]:
    """Get prompts and expected responses based on command line arguments."""
    if args.prompts:
        if args.expected and len(args.prompts) != len(args.expected):
            print("Error: Number of prompts must match number of expected responses")
            sys.exit(1)
        prompts = args.prompts
        expected_responses = args.expected or [""] * len(prompts)
    elif args.prompts_file:
        prompts, expected_responses = load_prompts_from_file(args.prompts_file)
    elif args.interactive:
        prompts, expected_responses = get_interactive_prompts()
    else:
        # Use default prompts
        prompts, expected_responses = get_default_prompts_and_responses()

    return prompts, expected_responses

def fetch_available_agents(copilot_api_endpoint: str, access_token: str, user_oid: str) -> List[Dict[str, Any]]:
    """
    Fetch available agents for the user from the Copilot API.

    Args:
        copilot_api_endpoint: Base URL of the Copilot API
        access_token: Bearer token for API authentication
        user_oid: User object ID for agent filtering

    Returns:
        List of agent dictionaries.
    """
    request_headers = {
        "Content-Type": "application/json",
        "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
        "Authorization": f"Bearer {access_token}"
    }

    try:
        # Build the query parameter with participant info
        request_data = json.dumps({"participant": {"id": user_oid}})
        query_param = urllib.parse.quote(request_data)

        # Try to fetch agents from /GetGptList endpoint
        req = urllib.request.Request(
            f"{copilot_api_endpoint}/GetGptList?request={query_param}",
            headers=request_headers,
            method="GET"
        )
        with urllib.request.urlopen(req, timeout=120) as resp:
            data = json.loads(resp.read().decode("utf-8"))
            agents = data.get("gptList", [])
            return agents
    except urllib.error.HTTPError as e:
        # If endpoint doesn't exist or returns error, return empty list
        print(f"Warning: Unable to fetch agents list (HTTP {e.code}).")
        return []
    except Exception as e:
        print(f"Warning: Error fetching agents: {e}")
        return []

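# Illustrative shape only: each entry in the returned agent list is read with the keys
# below (see select_agent_interactively); real entries may carry additional fields.
#
#   {"gptId": "<agent-id>", "name": "<agent name>", "description": "<description>", "isOwner": True}
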
def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
    """
    Display an interactive agent selector using questionary.

    Args:
        agents: List of agent dictionaries.

    Returns:
        Selected agent ID or None if cancelled/skipped
    """
    if not agents:
        return None

    # Create choices for questionary
    choices = []
    sorted_agents = sorted(agents, key=lambda x: (not x.get('isOwner', False), x.get('name', '')))
    for agent in sorted_agents:
        agent_id = agent.get("gptId", "Unknown")
        agent_name = agent.get("name", "Unknown")
        agent_description = agent.get("description", "Unknown")
        agent_is_owner = agent.get('isOwner')

        # Format the display text
        display_text = f"{agent_name} ({agent_id}, IsOwner: {agent_is_owner}) - {agent_description}"

        choices.append(questionary.Choice(title=display_text, value=agent_id))

    # Display the selection prompt
    selected_agent = questionary.select(
        "Select an agent to evaluate:",
        choices=choices,
        use_shortcuts=True,
        use_arrow_keys=True
    ).ask()

    return selected_agent

def build_chat_payload(prompt: str, user_oid: str, agent_id: str | None) -> bytes:
    message = {
        "message": {
            "text": prompt,
            "author": "user",
            "messageType": "chat",
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "from": {
                "id": user_oid,
            }
        },
        "verbosity": "verbose",  # To enable detailed telemetry in response (to extract tool usage, etc.)
    }

    if agent_id:
        message["gpts"] = [
            {
                "id": agent_id.strip(),
                "source": "MOS3"
            }
        ]
        message["optionsSets"] = [
            "disable_action_confirmation"  # Disable 3P action confirmation prompts for agents while scraping
        ]

    return json.dumps(message).encode("utf-8")

def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str, access_token: str, user_oid: str, args) -> Dict[str, Dict[str, Any]]:
    """ Send prompts to the chat API and return enhanced responses. """

    request_headers = {
        "Content-Type": "application/json",
        "X-Scenario": os.environ.get("X_SCENARIO_HEADER"),
        "Authorization": f"Bearer {access_token}"
    }
    raw_responses: Dict[str, str] = {}
    for i, prompt in enumerate(prompts, 1):
        if not args.quiet:
            print(f"Processing prompt {i}/{len(prompts)}...")

        # Build the payload
        payload = build_chat_payload(prompt, user_oid, args.agent_id)
        if args.verbose:
            print(f"[Sydney] Sending payload: {payload.decode('utf-8')}")

        # Send the request to /chat
        req = urllib.request.Request(f"{copilot_api_endpoint}/chat", data=payload, headers=request_headers, method="POST")
        try:
            with urllib.request.urlopen(req, timeout=120) as resp:
                raw = resp.read().decode("utf-8", errors="replace")
        except urllib.error.HTTPError as e:
            error_body = None
            try:
                error_body = e.read().decode("utf-8", errors="replace")
            except Exception:
                pass
            msg = f"Chat API request failed (HTTP {e.code} {e.reason})."
            if error_body:
                msg += f" Body: {error_body[:500]}"
            raise RuntimeError(msg) from e
        except urllib.error.URLError as e:
            raise RuntimeError(f"Chat API connection error: {getattr(e, 'reason', str(e))}") from e

        if args.verbose:
            print(f"[Sydney] Raw response: {raw}")

        # Store raw response for enhancement
        raw_responses[prompt] = raw.strip()

    # Extract enhanced responses using the new extractor
    enhanced_responses = extract_enhanced_responses(raw_responses)
    return enhanced_responses

def output_results(results: List[Dict], args):
    """Output results based on specified format."""
    if args.output:
        output_lower = args.output.lower()
        if output_lower.endswith('.json'):
            write_results_to_json(results, args.output)
        elif output_lower.endswith('.csv'):
            write_results_to_csv(results, args.output)
        elif output_lower.endswith('.html'):
            write_results_to_html(results, args.output)
            abs_path = os.path.abspath(args.output)
            webbrowser.open(f'file://{abs_path}')
        else:
            write_results_to_json(results, args.output)
    else:
        write_results_to_console(results)

def main():
    """Main function to orchestrate the evaluation process."""
    load_dotenv()
    args = parse_arguments()

    # Validate environment variables required for evaluation
    call_path = validate_environment()
    copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
    validate_endpoint_url(copilot_api_endpoint, ALLOWED_ENDPOINTS)

    user_oid = ""

    if call_path == CallPath.ACCESS_TOKEN:
        access_token = os.environ["COPILOT_API_ACCESS_TOKEN"]
    else:
        scopes_str = os.environ.get(
            "COPILOT_SCOPES", "https://substrate.office.com/sydney/.default"
        )

        auth_handler = AuthHandler(
            client_id=os.environ["M365_EVAL_CLIENT_ID"],
            tenant_id=os.environ["TENANT_ID"],
            scopes_str=scopes_str
        )

        # Signout user
        if args.signout:
            try:
                auth_handler.clear_cache()
            except Exception as e:
                print(f"Error during signout: {e}")
                sys.exit(1)
            sys.exit(0)

        # Authenticate before loading prompts
        try:
            auth_result = auth_handler.acquire_token_interactive() or {}
            access_token = auth_result.get("access_token") or ""
            if not access_token:
                raise RuntimeError("Failed to acquire access token from authentication result")

            id_token_claims = auth_result.get("id_token_claims")
            if not isinstance(id_token_claims, dict):
                print("id_token_claims is missing or invalid in authentication result")
            else:
                user_oid = id_token_claims.get("oid") or ""

        except Exception as e:
            print(f"\033[91mError during authentication: {e}\033[0m")
            if args.verbose:
                import traceback
                traceback.print_exc()
            sys.exit(1)

    if not user_oid and access_token:
        # Fallback: extract from access token.
        user_oid = AuthHandler.extract_user_oid_from_access_token(access_token)

    # 1. Load evaluation datasets (prompts and expected_responses)
    prompts, expected_responses = get_prompt_datasets(args)

    if not args.quiet:
        print(f"Running evaluation on {len(prompts)} prompt(s)...")

    try:
        # 3. Agent selection - if no agent ID provided, prompt user to select
        if not args.agent_id:
            if not args.quiet:
                print("No agent ID provided. Fetching available agents...")

            available_agents = fetch_available_agents(copilot_api_endpoint, access_token, user_oid)
            if not available_agents:
                print("No agents are available for interactive selection. Please re-run with --agent-id or set the AGENT_ID environment variable.")
                sys.exit(1)

            if available_agents:
                selected_agent_id = select_agent_interactively(available_agents)
                if selected_agent_id:
                    args.agent_id = selected_agent_id
                    if not args.quiet:
                        print(f"Selected agent: {args.agent_id}")
                else:
                    print("No agent selected. Please re-run with --agent-id or set the AGENT_ID environment variable.")
                    sys.exit(1)

        # 4. Send prompts to chat API
        responses = send_prompt_to_agent_in_sydney(prompts, copilot_api_endpoint, access_token, user_oid, args)
    except Exception as e:
        print(f"\033[91mError sending prompts to chat API: {e}\033[0m")
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)

    # 5. Run evaluations
    if not args.quiet:
        print("Running evaluations...")
    results = run_evaluations(args, responses, expected_responses)

    # 6. Output results
    output_results(results, args)

    if not args.quiet:
        print(f"\nEvaluation completed successfully! Processed {len(prompts)} prompt(s).")

# Call the main function when script is run directly
if __name__ == "__main__":
    main()