@microsoft/m365-copilot-eval 1.0.1-preview.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +415 -0
- package/TERMS.txt +65 -0
- package/package.json +82 -0
- package/src/clients/cli/auth/__init__.py +1 -0
- package/src/clients/cli/auth/auth_handler.py +262 -0
- package/src/clients/cli/custom_evaluators/CitationsEvaluator.py +136 -0
- package/src/clients/cli/custom_evaluators/ConcisenessNonLLMEvaluator.py +18 -0
- package/src/clients/cli/custom_evaluators/ExactMatchEvaluator.py +25 -0
- package/src/clients/cli/custom_evaluators/PII/PII.py +45 -0
- package/src/clients/cli/custom_evaluators/PartialMatchEvaluator.py +39 -0
- package/src/clients/cli/custom_evaluators/__init__.py +1 -0
- package/src/clients/cli/demo_usage.py +83 -0
- package/src/clients/cli/generate_report.py +251 -0
- package/src/clients/cli/main.py +766 -0
- package/src/clients/cli/readme.md +301 -0
- package/src/clients/cli/requirements.txt +10 -0
- package/src/clients/cli/response_extractor.py +589 -0
- package/src/clients/cli/samples/PartnerSuccess.json +122 -0
- package/src/clients/cli/samples/example_prompts.json +14 -0
- package/src/clients/cli/samples/example_prompts_alt.json +12 -0
- package/src/clients/cli/samples/prompts_ambiguity.json +22 -0
- package/src/clients/cli/samples/prompts_rag_grounding.json +22 -0
- package/src/clients/cli/samples/prompts_security_injection.json +22 -0
- package/src/clients/cli/samples/prompts_tool_use_negatives.json +22 -0
- package/src/clients/cli/samples/psaSample.json +18 -0
- package/src/clients/cli/samples/starter.json +10 -0
- package/src/clients/node-js/bin/runevals.js +505 -0
- package/src/clients/node-js/config/default.js +25 -0
- package/src/clients/node-js/lib/cache-utils.js +119 -0
- package/src/clients/node-js/lib/expiry-check.js +164 -0
- package/src/clients/node-js/lib/index.js +25 -0
- package/src/clients/node-js/lib/python-runtime.js +253 -0
- package/src/clients/node-js/lib/venv-manager.js +242 -0
@@ -0,0 +1,251 @@
+import json
+import markdown
+import pandas as pd
+from pathlib import Path
+
+def calculate_aggregate_statistics(results):
+    """Calculate aggregate statistics across all evaluation results."""
+    if not results:
+        return {}
+
+    # Extract all metrics from the first result to know what metrics we have
+    first_result = results[0]
+    metrics = first_result.get('results', {})
+
+    aggregates = {}
+
+    for metric_key in metrics.keys():
+        if not metric_key.endswith('_score'):
+            continue
+
+        metric_name = metric_key[:-6]  # Remove '_score' suffix
+        metric_display_name = metric_name.replace('_', ' ').title()
+
+        scores = []
+        pass_count = 0
+        fail_count = 0
+        threshold_value = None
+
+        for result in results:
+            metric_data = result.get('results', {}).get(metric_key)
+            if metric_data:
+                try:
+                    # Parse the JSON string to get the actual data
+                    parsed_data = json.loads(metric_data) if isinstance(metric_data, str) else metric_data
+
+                    # Extract score, result, and threshold
+                    score = parsed_data.get('score')
+                    if score is None:
+                        score = parsed_data.get(metric_name)
+                    if score is None:
+                        score = parsed_data.get(f'{metric_name}_score')
+
+                    result_status = parsed_data.get('result')
+                    if result_status is None:
+                        result_status = parsed_data.get(f'{metric_name}_result')
+
+                    threshold = parsed_data.get('threshold')
+                    if threshold is None:
+                        threshold = parsed_data.get(f'{metric_name}_threshold')
+
+                    if score is not None:
+                        scores.append(float(score))
+
+                    if result_status:
+                        if str(result_status).lower() == 'pass':
+                            pass_count += 1
+                        elif str(result_status).lower() == 'fail':
+                            fail_count += 1
+
+                    # Capture threshold (should be consistent across all results)
+                    if threshold is not None and threshold_value is None:
+                        threshold_value = threshold
+
+                except (json.JSONDecodeError, ValueError, TypeError):
+                    continue
+
+        if scores:
+            avg_score = sum(scores) / len(scores)
+            total_evaluated = pass_count + fail_count
+            pass_rate = (pass_count / total_evaluated * 100) if total_evaluated > 0 else 0
+
+            aggregates[metric_display_name] = {
+                'total_prompts': len(results),
+                'total_evaluated': total_evaluated,
+                'pass_count': pass_count,
+                'fail_count': fail_count,
+                'pass_rate': pass_rate,
+                'avg_score': avg_score,
+                'threshold': threshold_value,
+                'scores': scores
+            }
+
+    return aggregates
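
For orientation, here is a minimal sketch of the result shape this aggregation consumes, inferred from the parsing logic above; the prompt text, the metric name "fluency", and the payload values are all hypothetical:

    results = [{
        "prompt": "What is the PTO policy?",
        "response": "Employees accrue 15 days per year.",
        "expected_response": "15 days of PTO per year.",
        "results": {
            # Evaluator payloads may arrive as JSON strings or plain dicts.
            "fluency_score": '{"score": 4.0, "result": "pass", "threshold": 3.0}'
        }
    }]

    calculate_aggregate_statistics(results)
    # {'Fluency': {'total_prompts': 1, 'total_evaluated': 1, 'pass_count': 1,
    #              'fail_count': 0, 'pass_rate': 100.0, 'avg_score': 4.0,
    #              'threshold': 3.0, 'scores': [4.0]}}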
+
+def parse_score(score_str):
+    """Parse a JSON-encoded evaluator payload, returning {} on any failure."""
+    try:
+        return json.loads(score_str)
+    except Exception:
+        return {}
+
+def format_score(score):
+    """Format a numeric score compactly: drop the decimal point for whole
+    numbers, otherwise trim to at most three decimal places. Non-numeric
+    values are returned unchanged."""
+    try:
+        val = float(score)
+    except (TypeError, ValueError):
+        return score
+    if val.is_integer():
+        return str(int(val))
+    s = f"{val:.3f}".rstrip('0').rstrip('.')
+    return s or "0"
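
A few concrete cases of the trimming behavior above, for reference:

    format_score(3.0)     # -> "3"
    format_score(0.7500)  # -> "0.75"
    format_score("n/a")   # -> "n/a" (non-numeric input passes through)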
+
+def extract_metric_rows(entry):
+    """
+    Build generic metric rows from any `*_score` keys on an entry.
+    Each row has: Metric, Result, Score, Threshold, Reason.
+    Supports metrics under entry['results'] and falls back to top-level for backward compatibility.
+    """
+    rows = []
+
+    def pick(d, candidates):
+        for k in candidates:
+            if k in d and d[k] not in (None, ''):
+                return d[k]
+        return ''
+
+    def iter_score_fields(e):
+        container = e.get('results') if isinstance(e, dict) else None
+        if isinstance(container, dict):
+            for k, v in container.items():
+                if isinstance(k, str) and k.endswith('_score'):
+                    yield k, v
+            return
+        # fallback to top-level flat structure
+        for k, v in e.items():
+            if isinstance(k, str) and k.endswith('_score'):
+                yield k, v
+
+    for key, raw in iter_score_fields(entry):
+        metric_id = key[:-6]  # strip "_score"
+        metric_obj = parse_score(raw) if isinstance(raw, (str, bytes)) else (raw or {})
+
+        display_name = metric_id.replace('_', ' ').title()
+
+        # Candidate key patterns inside the parsed metric object
+        score_val = pick(metric_obj, [metric_id, f'{metric_id}_score', 'score', 'value'])
+        result_val = pick(metric_obj, [f'{metric_id}_result', 'result', 'status'])
+        threshold_val = pick(metric_obj, [f'{metric_id}_threshold', 'threshold', 'min_threshold', 'expected'])
+        reason_val = pick(metric_obj, [f'{metric_id}_reason', 'reason', 'rationale', 'explanation'])
+
+        rows.append({
+            'Metric': display_name,
+            'Result': str(result_val).lower() if isinstance(result_val, str) else result_val,
+            'Score': format_score(score_val),
+            'Threshold': format_score(threshold_val),
+            'Reason': reason_val
+        })
+    return rows
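
To make the key-pattern fallbacks concrete, here is a hypothetical evaluator payload in the prefixed-key style and the row it yields:

    entry = {
        "results": {
            "groundedness_score": '{"groundedness": 4.0, "groundedness_result": "Pass", '
                                  '"groundedness_threshold": 3.0, "groundedness_reason": "All claims are cited."}'
        }
    }

    extract_metric_rows(entry)
    # [{'Metric': 'Groundedness', 'Result': 'pass', 'Score': '4',
    #   'Threshold': '3', 'Reason': 'All claims are cited.'}]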
+
+def generate_html_report(results):
+    """Render the evaluation results as a standalone HTML document, with an
+    aggregate summary section when more than one prompt was evaluated."""
+    html = [
+        '<!DOCTYPE html>',
+        '<html lang="en">',
+        '<head>',
+        '  <meta charset="UTF-8">',
+        '  <title>M365 Copilot Agents Evaluation Scores Report</title>',
+        '  <style>',
+        '    body { font-family: Arial, sans-serif; margin: 2em; padding: 1.5em; background: #fafafa; }',
+        '    h1 { margin-top: 0; color: #2c3e50; }',
+        '    h2 { color: #34495e; margin-top: 2em; }',
+        '    table { border-collapse: collapse; width: 100%; margin: 1.5em 0; }',
+        '    th, td { border: 1px solid #ccc; padding: 10px 12px; text-align: left; vertical-align: top; }',
+        '    th { background: #f4f4f4; font-weight: 600; }',
+        '    details { padding: 0.75em 1em; border: 1px solid #ddd; border-radius: 6px; background: #fff; }',
+        '    details summary { cursor: pointer; font-weight: 600; margin: -0.75em -1em 0.75em; padding: 0.75em 1em; background: #eef2f5; border-bottom: 1px solid #ddd; }',
+        '    .pass { background: #d4edda; color: #155724; }',
+        '    .fail { background: #f8d7da; color: #721c24; }',
+        '    .score-details { font-size: 0.95em; color: #333; background: #f9f9f9; }',
+        '    .score-details table { margin-top: 0.5em !important; }',
+        '    .score-details th { background: #e9ecef; }',
+        '    .aggregate-section { margin: 2em 0; }',
+        '    .aggregate-section h2 { color: #34495e; margin-top: 0; }',
+        '    .pass-rate-excellent { background: #d1f2eb; color: #186a3b; font-weight: bold; }',
+        '    .pass-rate-good { background: #fef9e7; color: #7d6608; font-weight: bold; }',
+        '    .pass-rate-poor { background: #fadbd8; color: #b03a2e; font-weight: bold; }',
+        '    .individual-results { margin-top: 3em; }',
+        '  </style>',
+        '</head>',
+        '<body>',
+        '  <h1>M365 Copilot Agents Evaluation Report</h1>',
+    ]
+
+    # Add aggregate statistics if multiple results
+    if len(results) > 1:
+        aggregates = calculate_aggregate_statistics(results)
+        if aggregates:
+            html.append('<div class="aggregate-section">')
+            html.append(f'<h2>Aggregate Statistics ({len(results)} prompts evaluated)</h2>')
+
+            # Create aggregate table with same style as individual results
+            html.append('<table>')
+            html.append('<tr><th>Metric</th><th>Pass Rate</th><th>Passed</th><th>Failed</th><th>Avg Score</th><th>Threshold</th></tr>')
+
+            for metric_name, stats in aggregates.items():
+                pass_rate_class = 'pass-rate-excellent' if stats['pass_rate'] >= 80 else 'pass-rate-good' if stats['pass_rate'] >= 60 else 'pass-rate-poor'
+                threshold_display = stats.get('threshold', 'N/A')
+                html.append(f'''
+                <tr>
+                    <td><strong>{metric_name}</strong></td>
+                    <td class="{pass_rate_class}">{stats['pass_rate']:.1f}%</td>
+                    <td class="pass">{stats['pass_count']}</td>
+                    <td class="fail">{stats['fail_count']}</td>
+                    <td>{stats['avg_score']:.2f}</td>
+                    <td>{threshold_display}</td>
+                </tr>
+                ''')
+
+            html.append('</table>')
+            html.append('</div>')
+
+    # Individual results section
+    html.append('<div class="individual-results">')
+    html.append('<h2>Individual Results</h2>')
+
+    for idx, entry in enumerate(results, 1):
+        html.append(f'<h3>Prompt {idx}: {entry.get("prompt", "")}</h3>')
+        html.append('<table>')
+        html.append('<tr><th>Response</th><td>{}</td></tr>'.format(markdown.markdown(entry.get("response", ""))))
+        html.append('<tr><th>Expected Response</th><td>{}</td></tr>'.format(markdown.markdown(entry.get("expected_response", ""))))
+        html.append('</table>')
+
+        score_rows = extract_metric_rows(entry)
+        score_df = pd.DataFrame(score_rows)
+
+        def highlight_result(val):
+            lv = str(val).lower()
+            if lv == 'pass':
+                return 'background-color: #d4edda; color: #155724;'
+            elif lv == 'fail':
+                return 'background-color: #f8d7da; color: #721c24;'
+            return ''
+
+        # Styler.map requires pandas >= 2.1 (formerly Styler.applymap)
+        score_html = (
+            score_df.style
+            .map(highlight_result, subset=['Result'])
+            .set_table_attributes('style="margin-top:1em;"')
+            .hide(axis="index")
+            .to_html()
+        )
+
+        html.append('<details open class="score-details"><summary>Show/Hide Evaluation Metric Scores</summary>')
+        html.append(score_html)
+        html.append('</details>')
+
+        html.append(f'<p><em>Entry {idx} of {len(results)}</em></p>')
+        if idx < len(results):  # Don't add HR after last item
+            html.append('<hr>')
+
+    html.append('</div>')  # Close individual-results
+    html.append('<p><small>Generated by M365 Copilot Agents Evaluation CLI.</small></p>')
+    html.append('</body></html>')
+
+    return '\n'.join(html)
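
Taken together, a typical way to drive this module might look like the sketch below; the file names are hypothetical, and the package's actual entry points (src/clients/cli/main.py and the Node wrapper in bin/runevals.js) are not shown in this hunk:

    results = json.loads(Path("eval_results.json").read_text(encoding="utf-8"))
    report = generate_html_report(results)
    Path("eval_report.html").write_text(report, encoding="utf-8")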