@microsoft/m365-copilot-eval 1.1.1-preview.1 → 1.2.1-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +65 -18
- package/package.json +5 -2
- package/schema/CHANGELOG.md +21 -0
- package/schema/v1/eval-document.schema.json +236 -0
- package/schema/v1/examples/invalid/empty-items.json +4 -0
- package/schema/v1/examples/invalid/invalid-semver.json +8 -0
- package/schema/v1/examples/invalid/missing-schema-version.json +7 -0
- package/schema/v1/examples/invalid/wrong-type.json +6 -0
- package/schema/v1/examples/valid/comprehensive.json +92 -0
- package/schema/v1/examples/valid/minimal.json +8 -0
- package/schema/version.json +6 -0
- package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +77 -33
- package/src/clients/cli/main.py +229 -30
- package/src/clients/cli/readme.md +5 -5
- package/src/clients/cli/requirements.txt +3 -0
- package/src/clients/cli/samples/starter.json +13 -10
- package/src/clients/cli/schema_handler.py +349 -0
- package/src/clients/cli/version_check.py +139 -0
- package/src/clients/node-js/bin/runevals.js +34 -103
- package/src/clients/node-js/config/default.js +1 -1
- package/src/clients/node-js/lib/env-loader.js +126 -0
- package/src/clients/node-js/lib/progress.js +36 -36
- package/src/clients/node-js/lib/python-runtime.js +4 -6
- package/src/clients/node-js/lib/venv-manager.js +60 -18
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
"""
|
|
2
2
|
CitationsEvaluator - A custom evaluator for analyzing citations in M365 Copilot responses.
|
|
3
3
|
|
|
4
|
-
This evaluator uses regex-based pattern matching to detect citations in
|
|
5
|
-
1. New OAI format: \ue200cite\ue202turn{X}search{Y}\ue201
|
|
6
|
-
2. Old format: [^i^] where i is the citation index
|
|
4
|
+
This evaluator uses regex-based pattern matching to detect citations in three modes:
|
|
5
|
+
1. OAI_UNICODE: New OAI format: \ue200cite\ue202turn{X}search{Y}\ue201
|
|
6
|
+
2. LEGACY_BRACKET: Old format: [^i^] where i is the citation index
|
|
7
|
+
3. AUTO: Automatically detects both formats simultaneously
|
|
7
8
|
|
|
8
9
|
Where X, Y, and i are natural numbers representing conversation turn, search result index, or citation index.
|
|
9
10
|
"""
|
|
@@ -17,48 +18,60 @@ class CitationFormat(Enum):
|
|
|
17
18
|
"""Enum for different citation formats supported by the evaluator."""
|
|
18
19
|
OAI_UNICODE = "oai_unicode" # New format: \ue200cite\ue202turn{X}search{Y}\ue201
|
|
19
20
|
LEGACY_BRACKET = "legacy_bracket" # Old format: [^i^]
|
|
21
|
+
AUTO = "auto" # Automatically detect both formats
|
|
20
22
|
|
|
21
23
|
|
|
22
24
|
class CitationsEvaluator:
|
|
23
25
|
"""
|
|
24
26
|
A custom evaluator that analyzes citations in response text without using an LLM.
|
|
25
|
-
|
|
27
|
+
|
|
26
28
|
This evaluator detects citation patterns and returns:
|
|
27
29
|
- Whether at least one citation is present
|
|
28
30
|
- The number of unique citations found
|
|
29
|
-
|
|
30
|
-
Supports
|
|
31
|
+
|
|
32
|
+
Supports three modes:
|
|
33
|
+
- OAI_UNICODE: Detects only OAI unicode format citations
|
|
34
|
+
- LEGACY_BRACKET: Detects only legacy bracket format citations
|
|
35
|
+
- AUTO: Automatically detects both formats simultaneously
|
|
31
36
|
"""
|
|
32
37
|
|
|
33
38
|
def __init__(self, citation_format: CitationFormat = CitationFormat.OAI_UNICODE):
|
|
34
39
|
"""
|
|
35
40
|
Initialize the CitationsEvaluator with the specified citation format.
|
|
36
|
-
|
|
41
|
+
|
|
37
42
|
Args:
|
|
38
43
|
citation_format (CitationFormat): The format of citations to detect.
|
|
39
44
|
Defaults to OAI_UNICODE format.
|
|
40
45
|
"""
|
|
41
46
|
self.citation_format = citation_format
|
|
42
|
-
|
|
47
|
+
|
|
48
|
+
oai_pattern = r'\ue200cite\ue202turn\d+search\d+\ue201'
|
|
49
|
+
legacy_pattern = r'\[\^\d+\^\]'
|
|
50
|
+
|
|
43
51
|
if citation_format == CitationFormat.OAI_UNICODE:
|
|
44
52
|
# Pattern to match citations: \ue200cite\ue202turn{number}search{number}\ue201
|
|
45
|
-
self.citation_pattern =
|
|
53
|
+
self.citation_pattern = oai_pattern
|
|
46
54
|
elif citation_format == CitationFormat.LEGACY_BRACKET:
|
|
47
55
|
# Pattern to match citations: [^number^]
|
|
48
|
-
self.citation_pattern =
|
|
56
|
+
self.citation_pattern = legacy_pattern
|
|
57
|
+
elif citation_format == CitationFormat.AUTO:
|
|
58
|
+
# Auto-detect both formats using alternation (|)
|
|
59
|
+
# Matches either OAI unicode OR legacy bracket format
|
|
60
|
+
self.citation_pattern = rf'(?:{oai_pattern})|(?:{legacy_pattern})'
|
|
49
61
|
else:
|
|
50
62
|
raise ValueError(f"Unsupported citation format: {citation_format}")
|
|
51
|
-
|
|
63
|
+
|
|
64
|
+
# Compile the pattern once after determining which format to use
|
|
52
65
|
self.compiled_pattern = re.compile(self.citation_pattern)
|
|
53
66
|
|
|
54
67
|
def __call__(self, *, response: str, **kwargs) -> Dict[str, Any]:
|
|
55
68
|
"""
|
|
56
69
|
Evaluate the response text for citations.
|
|
57
|
-
|
|
70
|
+
|
|
58
71
|
Args:
|
|
59
72
|
response (str): The response text from the M365 Copilot agent
|
|
60
73
|
**kwargs: Additional keyword arguments (not used but kept for compatibility)
|
|
61
|
-
|
|
74
|
+
|
|
62
75
|
Returns:
|
|
63
76
|
Dict[str, Any]: Evaluation results containing:
|
|
64
77
|
- citation_format (str): The format used for detection
|
|
@@ -69,40 +82,71 @@ class CitationsEvaluator:
|
|
|
69
82
|
"""
|
|
70
83
|
if not isinstance(response, str):
|
|
71
84
|
response = str(response) if response is not None else ""
|
|
72
|
-
|
|
73
|
-
# Find all
|
|
85
|
+
|
|
86
|
+
# Find all citations and get unique ones (same for all modes)
|
|
74
87
|
citation_matches = self.compiled_pattern.findall(response)
|
|
75
|
-
|
|
76
|
-
# Get unique citations (remove duplicates)
|
|
77
88
|
unique_citations = list(set(citation_matches))
|
|
78
|
-
|
|
79
|
-
#
|
|
89
|
+
|
|
90
|
+
# Initialize citation details list (used by all modes)
|
|
80
91
|
citation_details = []
|
|
92
|
+
|
|
93
|
+
# Initialize counters only for AUTO mode
|
|
94
|
+
if self.citation_format == CitationFormat.AUTO:
|
|
95
|
+
oai_count = 0
|
|
96
|
+
legacy_count = 0
|
|
97
|
+
|
|
98
|
+
# Process all citations (unified extraction logic)
|
|
81
99
|
for citation in unique_citations:
|
|
82
|
-
|
|
83
|
-
|
|
100
|
+
# Determine citation type and extract details
|
|
101
|
+
if '\ue200' in citation:
|
|
102
|
+
# OAI format (contains start marker)
|
|
84
103
|
turn_search_match = re.search(r'turn(\d+)search(\d+)', citation)
|
|
85
104
|
if turn_search_match:
|
|
86
105
|
turn_num = turn_search_match.group(1)
|
|
87
106
|
search_num = turn_search_match.group(2)
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
107
|
+
|
|
108
|
+
# Add appropriate prefix based on mode
|
|
109
|
+
if self.citation_format == CitationFormat.AUTO:
|
|
110
|
+
citation_details.append(f"oai:turn{turn_num}search{search_num}")
|
|
111
|
+
oai_count += 1
|
|
112
|
+
else: # OAI_UNICODE mode
|
|
113
|
+
citation_details.append(f"turn{turn_num}search{search_num}")
|
|
114
|
+
else:
|
|
115
|
+
# Legacy bracket format
|
|
91
116
|
bracket_match = re.search(r'\[\^(\d+)\^\]', citation)
|
|
92
117
|
if bracket_match:
|
|
93
118
|
citation_num = bracket_match.group(1)
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
119
|
+
|
|
120
|
+
# Add appropriate prefix based on mode
|
|
121
|
+
if self.citation_format == CitationFormat.AUTO:
|
|
122
|
+
citation_details.append(f"legacy:citation{citation_num}")
|
|
123
|
+
legacy_count += 1
|
|
124
|
+
else: # LEGACY_BRACKET mode
|
|
125
|
+
citation_details.append(f"citation{citation_num}")
|
|
126
|
+
|
|
127
|
+
format_info = None
|
|
128
|
+
if self.citation_format == CitationFormat.AUTO:
|
|
129
|
+
format_info = f"OAI: {oai_count}, Legacy: {legacy_count}"
|
|
130
|
+
|
|
131
|
+
# Build results (common for all modes)
|
|
132
|
+
total_citations = len(unique_citations)
|
|
133
|
+
|
|
134
|
+
# Construct reason string with optional format info
|
|
135
|
+
reason_parts = [f"Found {total_citations} unique citation(s)"]
|
|
136
|
+
if format_info:
|
|
137
|
+
reason_parts.append(f"[{format_info}]:")
|
|
138
|
+
else:
|
|
139
|
+
reason_parts.append(":")
|
|
140
|
+
reason_parts.append(', '.join(citation_details) if citation_details else 'None')
|
|
141
|
+
|
|
97
142
|
results = {
|
|
98
143
|
"citation_format": self.citation_format.value,
|
|
99
|
-
|
|
100
|
-
"
|
|
101
|
-
"
|
|
102
|
-
"
|
|
103
|
-
"reason": f"Found {len(unique_citations)} unique citation(s): {', '.join(citation_details) if citation_details else 'None'}"
|
|
144
|
+
"score": total_citations,
|
|
145
|
+
"result": "pass" if total_citations > 0 else "fail",
|
|
146
|
+
"threshold": 1,
|
|
147
|
+
"reason": " ".join(reason_parts)
|
|
104
148
|
}
|
|
105
|
-
|
|
149
|
+
|
|
106
150
|
return results
|
|
107
151
|
|
|
108
152
|
def get_name(self) -> str:
|
package/src/clients/cli/main.py
CHANGED
|
@@ -3,6 +3,7 @@ import os
|
|
|
3
3
|
import argparse
|
|
4
4
|
import sys
|
|
5
5
|
import csv
|
|
6
|
+
import functools
|
|
6
7
|
import webbrowser
|
|
7
8
|
import urllib.request
|
|
8
9
|
import urllib.error
|
|
@@ -24,7 +25,11 @@ from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFor
|
|
|
24
25
|
#from custom_evaluators.PII.PII import PIIEvaluator
|
|
25
26
|
from generate_report import generate_html_report, calculate_aggregate_statistics
|
|
26
27
|
from response_extractor import extract_enhanced_responses, get_response_text_for_evaluation
|
|
28
|
+
from schema_handler import DocumentUpgrader, SchemaVersionManager
|
|
29
|
+
from version_check import check_min_version, get_cli_version
|
|
27
30
|
from datetime import datetime, timezone
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
import tzlocal
|
|
28
33
|
|
|
29
34
|
# Allowed endpoints for URL validation
|
|
30
35
|
ALLOWED_ENDPOINTS = [
|
|
@@ -36,6 +41,18 @@ class CallPath(Enum):
|
|
|
36
41
|
ACCESS_TOKEN = "access_token"
|
|
37
42
|
COPILOT_AUTH = "copilot_auth"
|
|
38
43
|
|
|
44
|
+
|
|
45
|
+
# Flags that should bypass remote min-version enforcement.
|
|
46
|
+
# --help is not needed here because argparse exits before runtime checks.
|
|
47
|
+
VERSION_CHECK_BYPASS_FLAGS = (
|
|
48
|
+
"signout",
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def should_bypass_min_version_check(args: argparse.Namespace) -> bool:
|
|
53
|
+
"""Return True if the current invocation should skip min-version checks."""
|
|
54
|
+
return any(getattr(args, flag, False) for flag in VERSION_CHECK_BYPASS_FLAGS)
|
|
55
|
+
|
|
39
56
|
def write_results_to_html(results: List[Dict], output_file: str):
|
|
40
57
|
"""Write results to HTML file using generate_html_report from generate_report.py."""
|
|
41
58
|
try:
|
|
@@ -58,23 +75,68 @@ def get_default_prompts_and_responses():
|
|
|
58
75
|
return prompts, expected_responses
|
|
59
76
|
|
|
60
77
|
def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
|
|
61
|
-
"""Load prompts and expected responses from a JSON file.
|
|
78
|
+
"""Load prompts and expected responses from a JSON file.
|
|
79
|
+
|
|
80
|
+
Supports three formats:
|
|
81
|
+
1. Eval document: {"schemaVersion": "1.0.0", "items": [{"prompt": "..."}]}
|
|
82
|
+
2. Array format: [{"prompt": "...", "expected_response": "..."}]
|
|
83
|
+
3. Dict format: {"prompts": [...], "expected_responses": [...]}
|
|
84
|
+
|
|
85
|
+
For eval documents (format 1) and array format (format 2), schema validation
|
|
86
|
+
and auto-upgrade are applied via DocumentUpgrader.
|
|
87
|
+
"""
|
|
62
88
|
try:
|
|
63
89
|
with open(file_path, 'r', encoding='utf-8') as f:
|
|
64
90
|
data = json.load(f)
|
|
65
|
-
|
|
91
|
+
|
|
92
|
+
# Detect if this is an eval document (has "items" key) or could be upgraded
|
|
93
|
+
is_eval_document = (
|
|
94
|
+
isinstance(data, dict) and "items" in data
|
|
95
|
+
) or isinstance(data, list)
|
|
96
|
+
|
|
97
|
+
# Run schema validation and auto-upgrade for eval documents
|
|
98
|
+
if is_eval_document:
|
|
99
|
+
try:
|
|
100
|
+
upgrader = DocumentUpgrader()
|
|
101
|
+
except Exception as e:
|
|
102
|
+
# Schema infrastructure not available (missing files, etc.) — skip
|
|
103
|
+
print(f"Warning: Unable to initialize document upgrader: {e}")
|
|
104
|
+
upgrader = None
|
|
105
|
+
|
|
106
|
+
if upgrader is not None:
|
|
107
|
+
result = upgrader.upgrade(Path(file_path))
|
|
108
|
+
|
|
109
|
+
if result.error:
|
|
110
|
+
print(f"Schema validation error: {result.error}")
|
|
111
|
+
sys.exit(1)
|
|
112
|
+
|
|
113
|
+
if result.upgraded and result.message:
|
|
114
|
+
print(result.message)
|
|
115
|
+
|
|
116
|
+
# Use the parsed document from the upgrade result
|
|
117
|
+
if result.document is not None:
|
|
118
|
+
data = result.document
|
|
119
|
+
|
|
66
120
|
if isinstance(data, list):
|
|
67
121
|
# Format: [{"prompt": "...", "expected_response": "..."}, ...]
|
|
68
122
|
prompts = [item.get("prompt", "") for item in data]
|
|
69
123
|
expected_responses = [item.get("expected_response", "") for item in data]
|
|
70
124
|
elif isinstance(data, dict):
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
125
|
+
if "items" in data:
|
|
126
|
+
# Eval document format: {"schemaVersion": "...", "items": [...]}
|
|
127
|
+
items = data["items"]
|
|
128
|
+
prompts = [item.get("prompt", "") for item in items]
|
|
129
|
+
expected_responses = [item.get("expected_response", "") for item in items]
|
|
130
|
+
else:
|
|
131
|
+
# Format: {"prompts": [...], "expected_responses": [...]}
|
|
132
|
+
prompts = data.get("prompts", [])
|
|
133
|
+
expected_responses = data.get("expected_responses", [])
|
|
74
134
|
else:
|
|
75
135
|
raise ValueError("Invalid file format")
|
|
76
|
-
|
|
136
|
+
|
|
77
137
|
return prompts, expected_responses
|
|
138
|
+
except SystemExit:
|
|
139
|
+
raise
|
|
78
140
|
except Exception as e:
|
|
79
141
|
print(f"Error loading prompts from file: {e}")
|
|
80
142
|
sys.exit(1)
|
|
@@ -181,7 +243,7 @@ def run_evaluations(args, responses: dict, expected_responses: list) -> list:
|
|
|
181
243
|
)
|
|
182
244
|
|
|
183
245
|
tool_call_accuracy = None
|
|
184
|
-
if args.
|
|
246
|
+
if args.m365_agent_id and enhanced_response.get("tool_definitions"):
|
|
185
247
|
tool_call_accuracy = tool_call_accuracy_evaluator(
|
|
186
248
|
query=prompt,
|
|
187
249
|
response=enhanced_response.get("response", actual_response_text),
|
|
@@ -267,18 +329,120 @@ def write_results_to_console(results):
|
|
|
267
329
|
print(f"{BOLD}{color}{name}:{RESET} {v}")
|
|
268
330
|
print(f"{BLUE}{'-' * 30}{RESET}")
|
|
269
331
|
|
|
270
|
-
def
|
|
271
|
-
"""
|
|
332
|
+
def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
|
|
333
|
+
"""Extract an EvalScore object from a decorated metric dict.
|
|
334
|
+
|
|
335
|
+
Maps internal decorated-metric format to schema EvalScore:
|
|
336
|
+
{score, result, threshold} (required) + reason, evaluator (optional).
|
|
337
|
+
"""
|
|
338
|
+
DEFAULT_THRESHOLD = 3 # fallback; decorate_metric should always set this
|
|
339
|
+
|
|
340
|
+
score_val = None
|
|
341
|
+
for k in (metric_id, f"{metric_id}_score", "score", "value"):
|
|
342
|
+
if k in data and isinstance(data[k], (int, float)):
|
|
343
|
+
score_val = data[k]
|
|
344
|
+
break
|
|
345
|
+
if score_val is None:
|
|
346
|
+
return None
|
|
347
|
+
|
|
348
|
+
result = data.get("result")
|
|
349
|
+
if result not in ("pass", "fail"):
|
|
350
|
+
result = "pass" if score_val >= data.get("threshold", DEFAULT_THRESHOLD) else "fail"
|
|
351
|
+
|
|
352
|
+
eval_score: Dict[str, Any] = {
|
|
353
|
+
"score": score_val,
|
|
354
|
+
"result": result,
|
|
355
|
+
"threshold": data.get("threshold", DEFAULT_THRESHOLD),
|
|
356
|
+
}
|
|
357
|
+
reason = data.get(f"{metric_id}_reason") or data.get("reason")
|
|
358
|
+
if reason:
|
|
359
|
+
eval_score["reason"] = reason
|
|
360
|
+
return eval_score
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def convert_result_to_eval_item(result: Dict) -> Dict:
|
|
364
|
+
"""Convert an internal evaluation result dict to a schema-compliant EvalItem.
|
|
365
|
+
|
|
366
|
+
Internal format (from run_evaluations):
|
|
367
|
+
{prompt, response, expected_response, results: {relevance_score: "JSON", ...}}
|
|
368
|
+
Schema EvalItem format:
|
|
369
|
+
{prompt, response, expected_response, scores: {relevance: EvalScore, ...}}
|
|
370
|
+
"""
|
|
371
|
+
item: Dict[str, Any] = {
|
|
372
|
+
"prompt": result["prompt"],
|
|
373
|
+
"response": result["response"],
|
|
374
|
+
"expected_response": result["expected_response"],
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
scores: Dict[str, Any] = {}
|
|
378
|
+
results_dict = result.get("results", {})
|
|
379
|
+
|
|
380
|
+
# EvalScore metrics (all share the same schema shape: {score, result, threshold})
|
|
381
|
+
# Tuple: (internal results key, metric ID for score lookup, schema output key)
|
|
382
|
+
for internal_key, metric_id, schema_key in [
|
|
383
|
+
("relevance_score", "relevance", "relevance"),
|
|
384
|
+
("coherence_score", "coherence", "coherence"),
|
|
385
|
+
("groundedness_score", "groundedness", "groundedness"),
|
|
386
|
+
("tool_call_accuracy_score", "tool_call_accuracy", "toolCallAccuracy"),
|
|
387
|
+
]:
|
|
388
|
+
raw = results_dict.get(internal_key)
|
|
389
|
+
if not raw:
|
|
390
|
+
continue
|
|
391
|
+
data = json.loads(raw) if isinstance(raw, str) else raw
|
|
392
|
+
eval_score = extract_eval_score(data, metric_id)
|
|
393
|
+
if eval_score:
|
|
394
|
+
scores[schema_key] = eval_score
|
|
395
|
+
|
|
396
|
+
# Citations → CitationScore (different schema shape: {count, result, threshold} + format)
|
|
397
|
+
raw_citations = results_dict.get("citations_score")
|
|
398
|
+
if raw_citations:
|
|
399
|
+
data = json.loads(raw_citations) if isinstance(raw_citations, str) else raw_citations
|
|
400
|
+
count = data.get("score", 0)
|
|
401
|
+
cit_result = data.get("result")
|
|
402
|
+
if cit_result not in ("pass", "fail"):
|
|
403
|
+
cit_result = "pass" if count >= data.get("threshold", 1) else "fail"
|
|
404
|
+
|
|
405
|
+
citation_score: Dict[str, Any] = {
|
|
406
|
+
"count": count,
|
|
407
|
+
"result": cit_result,
|
|
408
|
+
"threshold": data.get("threshold", 1),
|
|
409
|
+
}
|
|
410
|
+
if "citation_format" in data:
|
|
411
|
+
citation_score["format"] = data["citation_format"]
|
|
412
|
+
scores["citations"] = citation_score
|
|
413
|
+
|
|
414
|
+
if scores:
|
|
415
|
+
item["scores"] = scores
|
|
416
|
+
|
|
417
|
+
return item
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None):
|
|
421
|
+
"""Write results to a schema-compliant eval document JSON file.
|
|
422
|
+
|
|
423
|
+
Output follows the eval-document.schema.json format:
|
|
424
|
+
{schemaVersion, metadata, items: [EvalItem]}
|
|
425
|
+
"""
|
|
272
426
|
try:
|
|
273
|
-
|
|
274
|
-
|
|
427
|
+
try:
|
|
428
|
+
current_version = SchemaVersionManager().get_current_version()
|
|
429
|
+
except Exception:
|
|
430
|
+
current_version = "1.0.0"
|
|
431
|
+
|
|
432
|
+
items = [convert_result_to_eval_item(r) for r in results]
|
|
433
|
+
|
|
434
|
+
metadata: Dict[str, Any] = {
|
|
435
|
+
"evaluatedAt": datetime.now(timezone.utc).isoformat(),
|
|
275
436
|
}
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
437
|
+
if agent_id:
|
|
438
|
+
metadata["agentId"] = agent_id
|
|
439
|
+
|
|
440
|
+
output_data: Dict[str, Any] = {
|
|
441
|
+
"schemaVersion": current_version,
|
|
442
|
+
"metadata": metadata,
|
|
443
|
+
"items": items,
|
|
444
|
+
}
|
|
445
|
+
|
|
282
446
|
with open(output_file, 'w', encoding='utf-8') as f:
|
|
283
447
|
json.dump(output_data, f, indent=2, ensure_ascii=False)
|
|
284
448
|
print(f"Results saved to {output_file}")
|
|
@@ -372,12 +536,12 @@ Examples:
|
|
|
372
536
|
help='List of expected responses (must match number of prompts)'
|
|
373
537
|
)
|
|
374
538
|
|
|
375
|
-
# Agent ID
|
|
539
|
+
# Agent ID (--m365-agent-id is primary, --agent-id kept for backward compatibility)
|
|
376
540
|
parser.add_argument(
|
|
377
|
-
'--agent-id',
|
|
378
|
-
type=str,
|
|
379
|
-
default=os.environ.get("AGENT_ID"),
|
|
380
|
-
help='
|
|
541
|
+
'--m365-agent-id', '--agent-id',
|
|
542
|
+
type=str,
|
|
543
|
+
default=os.environ.get("M365_AGENT_ID") or os.environ.get("AGENT_ID"),
|
|
544
|
+
help='Agent ID (default from M365_AGENT_ID environment variable)'
|
|
381
545
|
)
|
|
382
546
|
|
|
383
547
|
# Output options
|
|
@@ -566,6 +730,35 @@ def select_agent_interactively(agents: List[Dict[str, Any]]) -> Optional[str]:
|
|
|
566
730
|
|
|
567
731
|
return selected_agent
|
|
568
732
|
|
|
733
|
+
@functools.lru_cache(maxsize=1)
|
|
734
|
+
def _get_iana_timezone_name() -> str:
|
|
735
|
+
"""Get the IANA timezone name from the system using tzlocal.
|
|
736
|
+
|
|
737
|
+
Tries get_localzone_name() first; falls back to str(get_localzone()) when the
|
|
738
|
+
former raises (e.g. no zone configured on some Unix systems). Result is cached
|
|
739
|
+
after the first call so tzlocal is only invoked once per session.
|
|
740
|
+
"""
|
|
741
|
+
try:
|
|
742
|
+
return tzlocal.get_localzone_name()
|
|
743
|
+
except Exception:
|
|
744
|
+
return str(tzlocal.get_localzone())
|
|
745
|
+
|
|
746
|
+
|
|
747
|
+
@functools.lru_cache(maxsize=1)
|
|
748
|
+
def _get_location_info() -> Dict[str, Any]:
|
|
749
|
+
"""Return a locationInfo dict containing the local UTC offset and IANA timezone name.
|
|
750
|
+
|
|
751
|
+
Result is cached after the first call so the computation runs only once per session.
|
|
752
|
+
"""
|
|
753
|
+
now = datetime.now().astimezone()
|
|
754
|
+
utc_offset = now.utcoffset()
|
|
755
|
+
offset_hours = int(utc_offset.total_seconds() // 3600) if utc_offset is not None else 0
|
|
756
|
+
return {
|
|
757
|
+
"timeZoneOffset": offset_hours,
|
|
758
|
+
"timeZone": _get_iana_timezone_name(),
|
|
759
|
+
}
|
|
760
|
+
|
|
761
|
+
|
|
569
762
|
def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:
|
|
570
763
|
message = {
|
|
571
764
|
"message": {
|
|
@@ -573,6 +766,7 @@ def build_chat_payload(prompt: str, user_oid: str, agent_id: str|None) -> bytes:
|
|
|
573
766
|
"author": "user",
|
|
574
767
|
"messageType": "chat",
|
|
575
768
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
769
|
+
"locationInfo": _get_location_info(),
|
|
576
770
|
"from": {
|
|
577
771
|
"id": user_oid,
|
|
578
772
|
}
|
|
@@ -607,7 +801,7 @@ def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str
|
|
|
607
801
|
print(f"Processing prompt {i}/{len(prompts)}...")
|
|
608
802
|
|
|
609
803
|
# Build the payload
|
|
610
|
-
payload = build_chat_payload(prompt, user_oid, args.
|
|
804
|
+
payload = build_chat_payload(prompt, user_oid, args.m365_agent_id)
|
|
611
805
|
if args.verbose:
|
|
612
806
|
print(f"[Sydney] Sending payload: {payload.decode('utf-8')}")
|
|
613
807
|
|
|
@@ -644,7 +838,7 @@ def output_results(results: List[Dict], args):
|
|
|
644
838
|
if args.output:
|
|
645
839
|
output_lower = args.output.lower()
|
|
646
840
|
if output_lower.endswith('.json'):
|
|
647
|
-
write_results_to_json(results, args.output)
|
|
841
|
+
write_results_to_json(results, args.output, agent_id=getattr(args, 'm365_agent_id', None))
|
|
648
842
|
elif output_lower.endswith('.csv'):
|
|
649
843
|
write_results_to_csv(results, args.output)
|
|
650
844
|
elif output_lower.endswith('.html'):
|
|
@@ -652,7 +846,7 @@ def output_results(results: List[Dict], args):
|
|
|
652
846
|
abs_path = os.path.abspath(args.output)
|
|
653
847
|
webbrowser.open(f'file://{abs_path}')
|
|
654
848
|
else:
|
|
655
|
-
write_results_to_json(results, args.output)
|
|
849
|
+
write_results_to_json(results, args.output, agent_id=getattr(args, 'm365_agent_id', None))
|
|
656
850
|
else:
|
|
657
851
|
write_results_to_console(results)
|
|
658
852
|
|
|
@@ -661,6 +855,11 @@ def main():
|
|
|
661
855
|
load_dotenv()
|
|
662
856
|
args = parse_arguments()
|
|
663
857
|
|
|
858
|
+
# Check minimum version before proceeding
|
|
859
|
+
cli_version = get_cli_version(quiet=args.quiet)
|
|
860
|
+
if not should_bypass_min_version_check(args) and not check_min_version(cli_version, quiet=args.quiet):
|
|
861
|
+
sys.exit(1)
|
|
862
|
+
|
|
664
863
|
# Validate environment variables required for evaluation
|
|
665
864
|
call_path = validate_environment()
|
|
666
865
|
copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
|
|
@@ -722,23 +921,23 @@ def main():
|
|
|
722
921
|
|
|
723
922
|
try:
|
|
724
923
|
# 3. Agent selection - if no agent ID provided, prompt user to select
|
|
725
|
-
if not args.
|
|
924
|
+
if not args.m365_agent_id:
|
|
726
925
|
if not args.quiet:
|
|
727
926
|
print("No agent ID provided. Fetching available agents...")
|
|
728
927
|
|
|
729
928
|
available_agents = fetch_available_agents(copilot_api_endpoint, access_token, user_oid)
|
|
730
929
|
if not available_agents:
|
|
731
|
-
print("No agents are available for interactive selection. Please re-run with --agent-id or set the
|
|
930
|
+
print("No agents are available for interactive selection. Please re-run with --m365-agent-id or set the M365_AGENT_ID environment variable.")
|
|
732
931
|
sys.exit(1)
|
|
733
932
|
|
|
734
933
|
if available_agents:
|
|
735
934
|
selected_agent_id = select_agent_interactively(available_agents)
|
|
736
935
|
if selected_agent_id:
|
|
737
|
-
args.
|
|
936
|
+
args.m365_agent_id = selected_agent_id
|
|
738
937
|
if not args.quiet:
|
|
739
|
-
print(f"Selected agent: {args.
|
|
938
|
+
print(f"Selected agent: {args.m365_agent_id}")
|
|
740
939
|
else:
|
|
741
|
-
print("No agent selected. Please re-run with --agent-id or set the
|
|
940
|
+
print("No agent selected. Please re-run with --m365-agent-id or set the M365_AGENT_ID environment variable.")
|
|
742
941
|
sys.exit(1)
|
|
743
942
|
|
|
744
943
|
# 4. Send prompts to chat API
|
|
@@ -49,8 +49,8 @@ M365_EVAL_CLIENT_ID="<app-registration-client-id>"
|
|
|
49
49
|
TENANT_ID="<aad-tenant-id>"
|
|
50
50
|
COPILOT_SCOPES="https://substrate.office.com/sydney/.default"
|
|
51
51
|
|
|
52
|
-
# Optional: default agent id (overridable via --agent-id)
|
|
53
|
-
|
|
52
|
+
# Optional: default agent id (overridable via --m365-agent-id)
|
|
53
|
+
M365_AGENT_ID="00000000-0000-0000-0000-000000000000"
|
|
54
54
|
```
|
|
55
55
|
|
|
56
56
|
### 3. Run the Agent Evaluation
|
|
@@ -71,7 +71,7 @@ python main.py --prompts "What is Microsoft Graph?" --expected "Microsoft Graph
|
|
|
71
71
|
python main.py --prompts "What is Microsoft Graph?" "How does authentication work?" --expected "Microsoft Graph is a gateway..." "Authentication works by..."
|
|
72
72
|
|
|
73
73
|
# Override the agent configured in environment variables
|
|
74
|
-
python main.py --agent-id "00000000-0000-0000-0000-000000000000" --prompts "What is Microsoft Graph?"
|
|
74
|
+
python main.py --m365-agent-id "00000000-0000-0000-0000-000000000000" --prompts "What is Microsoft Graph?"
|
|
75
75
|
```
|
|
76
76
|
|
|
77
77
|
#### Using Prompts from File
|
|
@@ -106,8 +106,8 @@ python main.py --quiet
|
|
|
106
106
|
# Get help and see all options
|
|
107
107
|
python main.py --help
|
|
108
108
|
|
|
109
|
-
# Specify / override the Agent ID (takes precedence over
|
|
110
|
-
python main.py --agent-id "00000000-0000-0000-0000-000000000000"
|
|
109
|
+
# Specify / override the Agent ID (takes precedence over M365_AGENT_ID env var)
|
|
110
|
+
python main.py --m365-agent-id "00000000-0000-0000-0000-000000000000"
|
|
111
111
|
|
|
112
112
|
# Citation format options
|
|
113
113
|
python main.py --citation-format oai_unicode # Default: New OAI format
|
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
ansible-core==2.19.0
|
|
2
|
+
tzlocal>=5.0
|
|
2
3
|
azure-ai-evaluation==1.10.0
|
|
3
4
|
azure-ai-projects==1.0.0
|
|
4
5
|
msal[broker]>=1.34,<2
|
|
5
6
|
msal-extensions>=1.3.1
|
|
7
|
+
packaging>=20.0
|
|
6
8
|
PyJWT>=2.11.0
|
|
7
9
|
python-dotenv==1.1.1
|
|
8
10
|
markdown==3.8.2
|
|
9
11
|
promptflow>=1.18.1
|
|
10
12
|
questionary>=2.1.1
|
|
13
|
+
jsonschema>=4.26.0,<5
|
|
@@ -1,10 +1,13 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
1
|
+
{
|
|
2
|
+
"schemaVersion": "1.0.0",
|
|
3
|
+
"items": [
|
|
4
|
+
{
|
|
5
|
+
"prompt": "What is Microsoft 365?",
|
|
6
|
+
"expected_response": "Microsoft 365 is a cloud-based productivity suite that includes applications like Word, Excel, PowerPoint, Teams, and other collaboration tools."
|
|
7
|
+
},
|
|
8
|
+
{
|
|
9
|
+
"prompt": "How can I share a file in Teams?",
|
|
10
|
+
"expected_response": "You can share a file in Teams by uploading it to a channel or chat, or by sharing a link from OneDrive or SharePoint."
|
|
11
|
+
}
|
|
12
|
+
]
|
|
13
|
+
}
|