@microsoft/m365-copilot-eval 1.1.1-preview.1 → 1.2.0-preview.1
- package/README.md +64 -18
- package/package.json +4 -2
- package/schema/CHANGELOG.md +21 -0
- package/schema/v1/eval-document.schema.json +236 -0
- package/schema/v1/examples/invalid/empty-items.json +4 -0
- package/schema/v1/examples/invalid/invalid-semver.json +8 -0
- package/schema/v1/examples/invalid/missing-schema-version.json +7 -0
- package/schema/v1/examples/invalid/wrong-type.json +6 -0
- package/schema/v1/examples/valid/comprehensive.json +92 -0
- package/schema/v1/examples/valid/minimal.json +8 -0
- package/schema/version.json +6 -0
- package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +77 -33
- package/src/clients/cli/main.py +197 -30
- package/src/clients/cli/readme.md +5 -5
- package/src/clients/cli/requirements.txt +2 -0
- package/src/clients/cli/samples/starter.json +13 -10
- package/src/clients/cli/schema_handler.py +349 -0
- package/src/clients/cli/version_check.py +139 -0
- package/src/clients/node-js/bin/runevals.js +34 -103
- package/src/clients/node-js/config/default.js +1 -1
- package/src/clients/node-js/lib/env-loader.js +126 -0
- package/src/clients/node-js/lib/progress.js +36 -36
- package/src/clients/node-js/lib/python-runtime.js +4 -6
- package/src/clients/node-js/lib/venv-manager.js +60 -18
package/src/clients/cli/custom_evaluators/CitationsEvaluator.py CHANGED

@@ -1,9 +1,10 @@
 """
 CitationsEvaluator - A custom evaluator for analyzing citations in M365 Copilot responses.

-This evaluator uses regex-based pattern matching to detect citations in
-1. New OAI format: \ue200cite\ue202turn{X}search{Y}\ue201
-2. Old format: [^i^] where i is the citation index
+This evaluator uses regex-based pattern matching to detect citations in three modes:
+1. OAI_UNICODE: New OAI format: \ue200cite\ue202turn{X}search{Y}\ue201
+2. LEGACY_BRACKET: Old format: [^i^] where i is the citation index
+3. AUTO: Automatically detects both formats simultaneously

 Where X, Y, and i are natural numbers representing conversation turn, search result index, or citation index.
 """
@@ -17,48 +18,60 @@ class CitationFormat(Enum):
     """Enum for different citation formats supported by the evaluator."""
     OAI_UNICODE = "oai_unicode"  # New format: \ue200cite\ue202turn{X}search{Y}\ue201
     LEGACY_BRACKET = "legacy_bracket"  # Old format: [^i^]
+    AUTO = "auto"  # Automatically detect both formats


 class CitationsEvaluator:
     """
     A custom evaluator that analyzes citations in response text without using an LLM.
-
+
     This evaluator detects citation patterns and returns:
     - Whether at least one citation is present
     - The number of unique citations found
-
-    Supports
+
+    Supports three modes:
+    - OAI_UNICODE: Detects only OAI unicode format citations
+    - LEGACY_BRACKET: Detects only legacy bracket format citations
+    - AUTO: Automatically detects both formats simultaneously
     """

     def __init__(self, citation_format: CitationFormat = CitationFormat.OAI_UNICODE):
         """
         Initialize the CitationsEvaluator with the specified citation format.
-
+
         Args:
             citation_format (CitationFormat): The format of citations to detect.
                 Defaults to OAI_UNICODE format.
         """
         self.citation_format = citation_format
-
+
+        oai_pattern = r'\ue200cite\ue202turn\d+search\d+\ue201'
+        legacy_pattern = r'\[\^\d+\^\]'
+
         if citation_format == CitationFormat.OAI_UNICODE:
             # Pattern to match citations: \ue200cite\ue202turn{number}search{number}\ue201
-            self.citation_pattern =
+            self.citation_pattern = oai_pattern
         elif citation_format == CitationFormat.LEGACY_BRACKET:
             # Pattern to match citations: [^number^]
-            self.citation_pattern =
+            self.citation_pattern = legacy_pattern
+        elif citation_format == CitationFormat.AUTO:
+            # Auto-detect both formats using alternation (|)
+            # Matches either OAI unicode OR legacy bracket format
+            self.citation_pattern = rf'(?:{oai_pattern})|(?:{legacy_pattern})'
         else:
             raise ValueError(f"Unsupported citation format: {citation_format}")
-
+
+        # Compile the pattern once after determining which format to use
         self.compiled_pattern = re.compile(self.citation_pattern)

     def __call__(self, *, response: str, **kwargs) -> Dict[str, Any]:
         """
         Evaluate the response text for citations.
-
+
         Args:
             response (str): The response text from the M365 Copilot agent
             **kwargs: Additional keyword arguments (not used but kept for compatibility)
-
+
         Returns:
             Dict[str, Any]: Evaluation results containing:
                 - citation_format (str): The format used for detection
@@ -69,40 +82,71 @@ class CitationsEvaluator:
         """
         if not isinstance(response, str):
             response = str(response) if response is not None else ""
-
-        # Find all
+
+        # Find all citations and get unique ones (same for all modes)
         citation_matches = self.compiled_pattern.findall(response)
-
-        # Get unique citations (remove duplicates)
         unique_citations = list(set(citation_matches))
-
-        #
+
+        # Initialize citation details list (used by all modes)
         citation_details = []
+
+        # Initialize counters only for AUTO mode
+        if self.citation_format == CitationFormat.AUTO:
+            oai_count = 0
+            legacy_count = 0
+
+        # Process all citations (unified extraction logic)
         for citation in unique_citations:
-
-
+            # Determine citation type and extract details
+            if '\ue200' in citation:
+                # OAI format (contains start marker)
                 turn_search_match = re.search(r'turn(\d+)search(\d+)', citation)
                 if turn_search_match:
                     turn_num = turn_search_match.group(1)
                     search_num = turn_search_match.group(2)
-
-
-
+
+                    # Add appropriate prefix based on mode
+                    if self.citation_format == CitationFormat.AUTO:
+                        citation_details.append(f"oai:turn{turn_num}search{search_num}")
+                        oai_count += 1
+                    else:  # OAI_UNICODE mode
+                        citation_details.append(f"turn{turn_num}search{search_num}")
+            else:
+                # Legacy bracket format
                 bracket_match = re.search(r'\[\^(\d+)\^\]', citation)
                 if bracket_match:
                     citation_num = bracket_match.group(1)
-
-
-
+
+                    # Add appropriate prefix based on mode
+                    if self.citation_format == CitationFormat.AUTO:
+                        citation_details.append(f"legacy:citation{citation_num}")
+                        legacy_count += 1
+                    else:  # LEGACY_BRACKET mode
+                        citation_details.append(f"citation{citation_num}")
+
+        format_info = None
+        if self.citation_format == CitationFormat.AUTO:
+            format_info = f"OAI: {oai_count}, Legacy: {legacy_count}"
+
+        # Build results (common for all modes)
+        total_citations = len(unique_citations)
+
+        # Construct reason string with optional format info
+        reason_parts = [f"Found {total_citations} unique citation(s)"]
+        if format_info:
+            reason_parts.append(f"[{format_info}]:")
+        else:
+            reason_parts.append(":")
+        reason_parts.append(', '.join(citation_details) if citation_details else 'None')
+
         results = {
             "citation_format": self.citation_format.value,
-
-            "
-            "
-            "
-            "reason": f"Found {len(unique_citations)} unique citation(s): {', '.join(citation_details) if citation_details else 'None'}"
+            "score": total_citations,
+            "result": "pass" if total_citations > 0 else "fail",
+            "threshold": 1,
+            "reason": " ".join(reason_parts)
         }
-
+
         return results

     def get_name(self) -> str:
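The new AUTO mode is the notable behavioral addition in this file. A minimal sketch of how the updated evaluator might be invoked (the response string is invented for illustration; the import path matches the one used in main.py):

```python
from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat

evaluator = CitationsEvaluator(citation_format=CitationFormat.AUTO)
result = evaluator(
    response="See the admin guide [^1^] and \ue200cite\ue202turn0search2\ue201 for details."
)
# Expected shape (ordering of the reason details can vary, since unique
# citations come from a set):
# {"citation_format": "auto", "score": 2, "result": "pass", "threshold": 1,
#  "reason": "Found 2 unique citation(s) [OAI: 1, Legacy: 1]: ..."}
```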
package/src/clients/cli/main.py CHANGED

@@ -24,7 +24,10 @@ from custom_evaluators.CitationsEvaluator import CitationsEvaluator, CitationFormat
 #from custom_evaluators.PII.PII import PIIEvaluator
 from generate_report import generate_html_report, calculate_aggregate_statistics
 from response_extractor import extract_enhanced_responses, get_response_text_for_evaluation
+from schema_handler import DocumentUpgrader, SchemaVersionManager
+from version_check import check_min_version, get_cli_version
 from datetime import datetime, timezone
+from pathlib import Path

 # Allowed endpoints for URL validation
 ALLOWED_ENDPOINTS = [
@@ -36,6 +39,18 @@ class CallPath(Enum):
     ACCESS_TOKEN = "access_token"
     COPILOT_AUTH = "copilot_auth"

+
+# Flags that should bypass remote min-version enforcement.
+# --help is not needed here because argparse exits before runtime checks.
+VERSION_CHECK_BYPASS_FLAGS = (
+    "signout",
+)
+
+
+def should_bypass_min_version_check(args: argparse.Namespace) -> bool:
+    """Return True if the current invocation should skip min-version checks."""
+    return any(getattr(args, flag, False) for flag in VERSION_CHECK_BYPASS_FLAGS)
+
 def write_results_to_html(results: List[Dict], output_file: str):
     """Write results to HTML file using generate_html_report from generate_report.py."""
     try:
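The bypass tuple keeps sign-out usable even when the installed CLI is older than the published minimum version. A small sketch of how the new helper behaves; the Namespace fields here are illustrative, not part of the package:

```python
import argparse

# Hypothetical invocations: --signout skips the remote min-version check,
# while a normal evaluation run does not.
signout_args = argparse.Namespace(signout=True, quiet=False)
eval_args = argparse.Namespace(signout=False, quiet=False)

print(should_bypass_min_version_check(signout_args))  # True
print(should_bypass_min_version_check(eval_args))     # False
```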
@@ -58,23 +73,68 @@ def get_default_prompts_and_responses():
     return prompts, expected_responses

 def load_prompts_from_file(file_path: str) -> Tuple[List[str], List[str]]:
-    """Load prompts and expected responses from a JSON file.
+    """Load prompts and expected responses from a JSON file.
+
+    Supports three formats:
+    1. Eval document: {"schemaVersion": "1.0.0", "items": [{"prompt": "..."}]}
+    2. Array format: [{"prompt": "...", "expected_response": "..."}]
+    3. Dict format: {"prompts": [...], "expected_responses": [...]}
+
+    For eval documents (format 1) and array format (format 2), schema validation
+    and auto-upgrade are applied via DocumentUpgrader.
+    """
     try:
         with open(file_path, 'r', encoding='utf-8') as f:
             data = json.load(f)
-
+
+        # Detect if this is an eval document (has "items" key) or could be upgraded
+        is_eval_document = (
+            isinstance(data, dict) and "items" in data
+        ) or isinstance(data, list)
+
+        # Run schema validation and auto-upgrade for eval documents
+        if is_eval_document:
+            try:
+                upgrader = DocumentUpgrader()
+            except Exception as e:
+                # Schema infrastructure not available (missing files, etc.) — skip
+                print(f"Warning: Unable to initialize document upgrader: {e}")
+                upgrader = None
+
+            if upgrader is not None:
+                result = upgrader.upgrade(Path(file_path))
+
+                if result.error:
+                    print(f"Schema validation error: {result.error}")
+                    sys.exit(1)
+
+                if result.upgraded and result.message:
+                    print(result.message)
+
+                # Use the parsed document from the upgrade result
+                if result.document is not None:
+                    data = result.document
+
         if isinstance(data, list):
             # Format: [{"prompt": "...", "expected_response": "..."}, ...]
             prompts = [item.get("prompt", "") for item in data]
             expected_responses = [item.get("expected_response", "") for item in data]
         elif isinstance(data, dict):
-
-
-
+            if "items" in data:
+                # Eval document format: {"schemaVersion": "...", "items": [...]}
+                items = data["items"]
+                prompts = [item.get("prompt", "") for item in items]
+                expected_responses = [item.get("expected_response", "") for item in items]
+            else:
+                # Format: {"prompts": [...], "expected_responses": [...]}
+                prompts = data.get("prompts", [])
+                expected_responses = data.get("expected_responses", [])
         else:
             raise ValueError("Invalid file format")
-
+
         return prompts, expected_responses
+    except SystemExit:
+        raise
     except Exception as e:
         print(f"Error loading prompts from file: {e}")
         sys.exit(1)
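As a quick illustration of the new input shape, the loader now accepts a schema-versioned eval document alongside the two legacy shapes. The file contents and prompt text below are made up for the example:

```python
import json
import tempfile

# A minimal eval document in the new schema shape (illustrative values).
doc = {
    "schemaVersion": "1.0.0",
    "items": [
        {"prompt": "What is Microsoft Graph?",
         "expected_response": "Microsoft Graph is a gateway to data in Microsoft 365..."},
    ],
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(doc, f)

prompts, expected_responses = load_prompts_from_file(f.name)
# prompts == ["What is Microsoft Graph?"]
```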
@@ -181,7 +241,7 @@ def run_evaluations(args, responses: dict, expected_responses: list) -> list:
         )

         tool_call_accuracy = None
-        if args.
+        if args.m365_agent_id and enhanced_response.get("tool_definitions"):
             tool_call_accuracy = tool_call_accuracy_evaluator(
                 query=prompt,
                 response=enhanced_response.get("response", actual_response_text),
@@ -267,18 +327,120 @@ def write_results_to_console(results):
         print(f"{BOLD}{color}{name}:{RESET} {v}")
     print(f"{BLUE}{'-' * 30}{RESET}")

-def
-    """
+def extract_eval_score(data: dict, metric_id: str) -> Optional[Dict]:
+    """Extract an EvalScore object from a decorated metric dict.
+
+    Maps internal decorated-metric format to schema EvalScore:
+    {score, result, threshold} (required) + reason, evaluator (optional).
+    """
+    DEFAULT_THRESHOLD = 3  # fallback; decorate_metric should always set this
+
+    score_val = None
+    for k in (metric_id, f"{metric_id}_score", "score", "value"):
+        if k in data and isinstance(data[k], (int, float)):
+            score_val = data[k]
+            break
+    if score_val is None:
+        return None
+
+    result = data.get("result")
+    if result not in ("pass", "fail"):
+        result = "pass" if score_val >= data.get("threshold", DEFAULT_THRESHOLD) else "fail"
+
+    eval_score: Dict[str, Any] = {
+        "score": score_val,
+        "result": result,
+        "threshold": data.get("threshold", DEFAULT_THRESHOLD),
+    }
+    reason = data.get(f"{metric_id}_reason") or data.get("reason")
+    if reason:
+        eval_score["reason"] = reason
+    return eval_score
+
+
+def convert_result_to_eval_item(result: Dict) -> Dict:
+    """Convert an internal evaluation result dict to a schema-compliant EvalItem.
+
+    Internal format (from run_evaluations):
+        {prompt, response, expected_response, results: {relevance_score: "JSON", ...}}
+    Schema EvalItem format:
+        {prompt, response, expected_response, scores: {relevance: EvalScore, ...}}
+    """
+    item: Dict[str, Any] = {
+        "prompt": result["prompt"],
+        "response": result["response"],
+        "expected_response": result["expected_response"],
+    }
+
+    scores: Dict[str, Any] = {}
+    results_dict = result.get("results", {})
+
+    # EvalScore metrics (all share the same schema shape: {score, result, threshold})
+    # Tuple: (internal results key, metric ID for score lookup, schema output key)
+    for internal_key, metric_id, schema_key in [
+        ("relevance_score", "relevance", "relevance"),
+        ("coherence_score", "coherence", "coherence"),
+        ("groundedness_score", "groundedness", "groundedness"),
+        ("tool_call_accuracy_score", "tool_call_accuracy", "toolCallAccuracy"),
+    ]:
+        raw = results_dict.get(internal_key)
+        if not raw:
+            continue
+        data = json.loads(raw) if isinstance(raw, str) else raw
+        eval_score = extract_eval_score(data, metric_id)
+        if eval_score:
+            scores[schema_key] = eval_score
+
+    # Citations → CitationScore (different schema shape: {count, result, threshold} + format)
+    raw_citations = results_dict.get("citations_score")
+    if raw_citations:
+        data = json.loads(raw_citations) if isinstance(raw_citations, str) else raw_citations
+        count = data.get("score", 0)
+        cit_result = data.get("result")
+        if cit_result not in ("pass", "fail"):
+            cit_result = "pass" if count >= data.get("threshold", 1) else "fail"
+
+        citation_score: Dict[str, Any] = {
+            "count": count,
+            "result": cit_result,
+            "threshold": data.get("threshold", 1),
+        }
+        if "citation_format" in data:
+            citation_score["format"] = data["citation_format"]
+        scores["citations"] = citation_score
+
+    if scores:
+        item["scores"] = scores
+
+    return item
+
+
+def write_results_to_json(results: List[Dict], output_file: str, agent_id: Optional[str] = None):
+    """Write results to a schema-compliant eval document JSON file.
+
+    Output follows the eval-document.schema.json format:
+        {schemaVersion, metadata, items: [EvalItem]}
+    """
     try:
-
-
+        try:
+            current_version = SchemaVersionManager().get_current_version()
+        except Exception:
+            current_version = "1.0.0"
+
+        items = [convert_result_to_eval_item(r) for r in results]
+
+        metadata: Dict[str, Any] = {
+            "evaluatedAt": datetime.now(timezone.utc).isoformat(),
         }
-
-
-
-
-
-
+        if agent_id:
+            metadata["agentId"] = agent_id
+
+        output_data: Dict[str, Any] = {
+            "schemaVersion": current_version,
+            "metadata": metadata,
+            "items": items,
+        }
+
         with open(output_file, 'w', encoding='utf-8') as f:
             json.dump(output_data, f, indent=2, ensure_ascii=False)
             print(f"Results saved to {output_file}")
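To make the metric mapping concrete, here is a small illustration of how a decorated relevance metric would be converted; the values are invented, only the key names come from the code above:

```python
decorated_relevance = {
    "relevance": 4.0,
    "threshold": 3,
    "relevance_reason": "The response directly addresses the question.",
}

extract_eval_score(decorated_relevance, "relevance")
# -> {"score": 4.0, "result": "pass", "threshold": 3,
#     "reason": "The response directly addresses the question."}
```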
@@ -372,12 +534,12 @@ Examples:
         help='List of expected responses (must match number of prompts)'
     )

-    # Agent ID
+    # Agent ID (--m365-agent-id is primary, --agent-id kept for backward compatibility)
     parser.add_argument(
-        '--agent-id',
-        type=str,
-        default=os.environ.get("AGENT_ID"),
-        help='
+        '--m365-agent-id', '--agent-id',
+        type=str,
+        default=os.environ.get("M365_AGENT_ID") or os.environ.get("AGENT_ID"),
+        help='Agent ID (default from M365_AGENT_ID environment variable)'
     )

     # Output options
@@ -607,7 +769,7 @@ def send_prompt_to_agent_in_sydney(prompts: List[str], copilot_api_endpoint: str
         print(f"Processing prompt {i}/{len(prompts)}...")

         # Build the payload
-        payload = build_chat_payload(prompt, user_oid, args.
+        payload = build_chat_payload(prompt, user_oid, args.m365_agent_id)
         if args.verbose:
             print(f"[Sydney] Sending payload: {payload.decode('utf-8')}")

@@ -644,7 +806,7 @@ def output_results(results: List[Dict], args):
     if args.output:
         output_lower = args.output.lower()
         if output_lower.endswith('.json'):
-            write_results_to_json(results, args.output)
+            write_results_to_json(results, args.output, agent_id=getattr(args, 'm365_agent_id', None))
         elif output_lower.endswith('.csv'):
             write_results_to_csv(results, args.output)
         elif output_lower.endswith('.html'):
@@ -652,7 +814,7 @@ def output_results(results: List[Dict], args):
             abs_path = os.path.abspath(args.output)
             webbrowser.open(f'file://{abs_path}')
         else:
-            write_results_to_json(results, args.output)
+            write_results_to_json(results, args.output, agent_id=getattr(args, 'm365_agent_id', None))
     else:
         write_results_to_console(results)

@@ -661,6 +823,11 @@ def main():
     load_dotenv()
     args = parse_arguments()

+    # Check minimum version before proceeding
+    cli_version = get_cli_version(quiet=args.quiet)
+    if not should_bypass_min_version_check(args) and not check_min_version(cli_version, quiet=args.quiet):
+        sys.exit(1)
+
     # Validate environment variables required for evaluation
     call_path = validate_environment()
     copilot_api_endpoint = os.environ["COPILOT_API_ENDPOINT"]
@@ -722,23 +889,23 @@ def main():

     try:
         # 3. Agent selection - if no agent ID provided, prompt user to select
-        if not args.
+        if not args.m365_agent_id:
             if not args.quiet:
                 print("No agent ID provided. Fetching available agents...")

             available_agents = fetch_available_agents(copilot_api_endpoint, access_token, user_oid)
             if not available_agents:
-                print("No agents are available for interactive selection. Please re-run with --agent-id or set the
+                print("No agents are available for interactive selection. Please re-run with --m365-agent-id or set the M365_AGENT_ID environment variable.")
                 sys.exit(1)

             if available_agents:
                 selected_agent_id = select_agent_interactively(available_agents)
                 if selected_agent_id:
-                    args.
+                    args.m365_agent_id = selected_agent_id
                     if not args.quiet:
-                        print(f"Selected agent: {args.
+                        print(f"Selected agent: {args.m365_agent_id}")
                 else:
-                    print("No agent selected. Please re-run with --agent-id or set the
+                    print("No agent selected. Please re-run with --m365-agent-id or set the M365_AGENT_ID environment variable.")
                     sys.exit(1)

     # 4. Send prompts to chat API
@@ -49,8 +49,8 @@ M365_EVAL_CLIENT_ID="<app-registration-client-id>"
|
|
|
49
49
|
TENANT_ID="<aad-tenant-id>"
|
|
50
50
|
COPILOT_SCOPES="https://substrate.office.com/sydney/.default"
|
|
51
51
|
|
|
52
|
-
# Optional: default agent id (overridable via --agent-id)
|
|
53
|
-
|
|
52
|
+
# Optional: default agent id (overridable via --m365-agent-id)
|
|
53
|
+
M365_AGENT_ID="00000000-0000-0000-0000-000000000000"
|
|
54
54
|
```
|
|
55
55
|
|
|
56
56
|
### 3. Run the Agent Evaluation
|
|
@@ -71,7 +71,7 @@ python main.py --prompts "What is Microsoft Graph?" --expected "Microsoft Graph
|
|
|
71
71
|
python main.py --prompts "What is Microsoft Graph?" "How does authentication work?" --expected "Microsoft Graph is a gateway..." "Authentication works by..."
|
|
72
72
|
|
|
73
73
|
# Override the agent configured in environment variables
|
|
74
|
-
python main.py --agent-id "00000000-0000-0000-0000-000000000000" --prompts "What is Microsoft Graph?"
|
|
74
|
+
python main.py --m365-agent-id "00000000-0000-0000-0000-000000000000" --prompts "What is Microsoft Graph?"
|
|
75
75
|
```
|
|
76
76
|
|
|
77
77
|
#### Using Prompts from File
|
|
@@ -106,8 +106,8 @@ python main.py --quiet
|
|
|
106
106
|
# Get help and see all options
|
|
107
107
|
python main.py --help
|
|
108
108
|
|
|
109
|
-
# Specify / override the Agent ID (takes precedence over
|
|
110
|
-
python main.py --agent-id "00000000-0000-0000-0000-000000000000"
|
|
109
|
+
# Specify / override the Agent ID (takes precedence over M365_AGENT_ID env var)
|
|
110
|
+
python main.py --m365-agent-id "00000000-0000-0000-0000-000000000000"
|
|
111
111
|
|
|
112
112
|
# Citation format options
|
|
113
113
|
python main.py --citation-format oai_unicode # Default: New OAI format
|
|
package/src/clients/cli/samples/starter.json CHANGED

@@ -1,10 +1,13 @@
-
-
-
-
-
-
-
-
-
-
+{
+  "schemaVersion": "1.0.0",
+  "items": [
+    {
+      "prompt": "What is Microsoft 365?",
+      "expected_response": "Microsoft 365 is a cloud-based productivity suite that includes applications like Word, Excel, PowerPoint, Teams, and other collaboration tools."
+    },
+    {
+      "prompt": "How can I share a file in Teams?",
+      "expected_response": "You can share a file in Teams by uploading it to a channel or chat, or by sharing a link from OneDrive or SharePoint."
+    }
+  ]
+}